//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//

#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64ExpandImm.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "Utils/AArch64SMEAttributes.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SmallVectorExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ComplexDeinterleavingPass.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SipHash.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/TargetParser/Triple.h"
#include <algorithm>
#include <bitset>
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <optional>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64-lower"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");

// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
    "aarch64-elf-ldtls-generation", cl::Hidden,
    cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
    cl::init(false));

static cl::opt<bool>
    EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
                             cl::desc("Enable AArch64 logical imm instruction "
                                      "optimization"),
                             cl::init(true));

// Temporary option added for the purpose of testing functionality added
// to DAGCombiner.cpp in D92230. It is expected that this can be removed
// in the future once both implementations are based on MGATHER rather
// than the GLD1 nodes added for the SVE gather load intrinsics.
static cl::opt<bool>
    EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
                                   cl::desc("Combine extends of AArch64 masked "
                                            "gather intrinsics"),
                                   cl::init(true));

static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
                                    cl::desc("Combine ext and trunc to TBL"),
                                    cl::init(true));

// The XOR, OR and CMP operations all use ALU ports, so data dependencies
// become the bottleneck after this transform on high-end CPUs. This maximum
// leaf-node limit therefore guards against cases where the cmp+ccmp chain
// would not be profitable.
static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
                                 cl::desc("Maximum of xors"));

// When this is enabled, we do not fall back to DAG ISel when encountering
// scalable vector types for any instruction, even if SVE is not yet
// supported for some instructions.
// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
cl::opt<bool> EnableSVEGISel(
    "aarch64-enable-gisel-sve", cl::Hidden,
    cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
    cl::init(false));

// TODO: This option should be removed once we switch to always using PTRADD in
// the SelectionDAG.
static cl::opt<bool> UseFEATCPACodegen(
    "aarch64-use-featcpa-codegen", cl::Hidden,
    cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in "
             "SelectionDAG for FEAT_CPA"),
    cl::init(false));

/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;

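// Registers used to pass the first eight integer and FP/vector arguments,
// respectively, under the AAPCS64 calling convention.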
static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
                                       AArch64::X3, AArch64::X4, AArch64::X5,
                                       AArch64::X6, AArch64::X7};
static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
                                       AArch64::Q3, AArch64::Q4, AArch64::Q5,
                                       AArch64::Q6, AArch64::Q7};

ArrayRef<MCPhysReg> llvm::AArch64::getGPRArgRegs() { return GPRArgRegs; }

ArrayRef<MCPhysReg> llvm::AArch64::getFPRArgRegs() { return FPRArgRegs; }

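/// Returns the packed SVE vector type with the same element type as VT,
/// e.g. i32 -> nxv4i32 and f16 -> nxv8f16.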
static inline EVT getPackedSVEVectorVT(EVT VT) {
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unexpected element type for vector");
  case MVT::i8:
    return MVT::nxv16i8;
  case MVT::i16:
    return MVT::nxv8i16;
  case MVT::i32:
    return MVT::nxv4i32;
  case MVT::i64:
    return MVT::nxv2i64;
  case MVT::f16:
    return MVT::nxv8f16;
  case MVT::f32:
    return MVT::nxv4f32;
  case MVT::f64:
    return MVT::nxv2f64;
  case MVT::bf16:
    return MVT::nxv8bf16;
  }
}

// NOTE: Currently there's only a need to return integer vector types. If this
// changes then just add an extra "type" parameter.
static inline EVT getPackedSVEVectorVT(ElementCount EC) {
  switch (EC.getKnownMinValue()) {
  default:
    llvm_unreachable("unexpected element count for vector");
  case 16:
    return MVT::nxv16i8;
  case 8:
    return MVT::nxv8i16;
  case 4:
    return MVT::nxv4i32;
  case 2:
    return MVT::nxv2i64;
  }
}

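/// Returns the integer vector type that a scalable i1 predicate vector is
/// promoted to, keeping the element count fixed, e.g. nxv4i1 -> nxv4i32.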
static inline EVT getPromotedVTForPredicate(EVT VT) {
  assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
         "Expected scalable predicate vector type!");
  switch (VT.getVectorMinNumElements()) {
  default:
    llvm_unreachable("unexpected element count for vector");
  case 2:
    return MVT::nxv2i64;
  case 4:
    return MVT::nxv4i32;
  case 8:
    return MVT::nxv8i16;
  case 16:
    return MVT::nxv16i8;
  }
}

/// Returns true if VT's elements occupy the lowest bit positions of its
/// associated register class without any intervening space.
///
/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
/// same register class, but only nxv8f16 can be treated as a packed vector.
static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
  assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
         "Expected legal vector type!");
  return VT.isFixedLengthVector() ||
         VT.getSizeInBits().getKnownMinValue() == AArch64::SVEBitsPerBlock;
}

// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
// predicate and end with a passthru value matching the result type.
static bool isMergePassthruOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
  case AArch64ISD::BSWAP_MERGE_PASSTHRU:
  case AArch64ISD::REVH_MERGE_PASSTHRU:
  case AArch64ISD::REVW_MERGE_PASSTHRU:
  case AArch64ISD::REVD_MERGE_PASSTHRU:
  case AArch64ISD::CTLZ_MERGE_PASSTHRU:
  case AArch64ISD::CTPOP_MERGE_PASSTHRU:
  case AArch64ISD::DUP_MERGE_PASSTHRU:
  case AArch64ISD::ABS_MERGE_PASSTHRU:
  case AArch64ISD::NEG_MERGE_PASSTHRU:
  case AArch64ISD::FNEG_MERGE_PASSTHRU:
  case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
  case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
  case AArch64ISD::FCEIL_MERGE_PASSTHRU:
  case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
  case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
  case AArch64ISD::FRINT_MERGE_PASSTHRU:
  case AArch64ISD::FROUND_MERGE_PASSTHRU:
  case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
  case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
  case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
  case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
  case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::FCVTX_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
  case AArch64ISD::FSQRT_MERGE_PASSTHRU:
  case AArch64ISD::FRECPX_MERGE_PASSTHRU:
  case AArch64ISD::FABS_MERGE_PASSTHRU:
    return true;
  }
}

// Returns true if inactive lanes are known to be zeroed by construction.
static bool isZeroingInactiveLanes(SDValue Op) {
  switch (Op.getOpcode()) {
  default:
    return false;
  // We guarantee i1 splat_vectors to zero the other lanes
  case ISD::SPLAT_VECTOR:
  case ISD::GET_ACTIVE_LANE_MASK:
  case AArch64ISD::PTRUE:
  case AArch64ISD::SETCC_MERGE_ZERO:
    return true;
  case ISD::INTRINSIC_WO_CHAIN:
    switch (Op.getConstantOperandVal(0)) {
    default:
      return false;
    case Intrinsic::aarch64_sve_ptrue:
    case Intrinsic::aarch64_sve_pnext:
    case Intrinsic::aarch64_sve_cmpeq:
    case Intrinsic::aarch64_sve_cmpne:
    case Intrinsic::aarch64_sve_cmpge:
    case Intrinsic::aarch64_sve_cmpgt:
    case Intrinsic::aarch64_sve_cmphs:
    case Intrinsic::aarch64_sve_cmphi:
    case Intrinsic::aarch64_sve_cmpeq_wide:
    case Intrinsic::aarch64_sve_cmpne_wide:
    case Intrinsic::aarch64_sve_cmpge_wide:
    case Intrinsic::aarch64_sve_cmpgt_wide:
    case Intrinsic::aarch64_sve_cmplt_wide:
    case Intrinsic::aarch64_sve_cmple_wide:
    case Intrinsic::aarch64_sve_cmphs_wide:
    case Intrinsic::aarch64_sve_cmphi_wide:
    case Intrinsic::aarch64_sve_cmplo_wide:
    case Intrinsic::aarch64_sve_cmpls_wide:
    case Intrinsic::aarch64_sve_fcmpeq:
    case Intrinsic::aarch64_sve_fcmpne:
    case Intrinsic::aarch64_sve_fcmpge:
    case Intrinsic::aarch64_sve_fcmpgt:
    case Intrinsic::aarch64_sve_fcmpuo:
    case Intrinsic::aarch64_sve_facgt:
    case Intrinsic::aarch64_sve_facge:
    case Intrinsic::aarch64_sve_whilege:
    case Intrinsic::aarch64_sve_whilegt:
    case Intrinsic::aarch64_sve_whilehi:
    case Intrinsic::aarch64_sve_whilehs:
    case Intrinsic::aarch64_sve_whilele:
    case Intrinsic::aarch64_sve_whilelo:
    case Intrinsic::aarch64_sve_whilels:
    case Intrinsic::aarch64_sve_whilelt:
    case Intrinsic::aarch64_sve_match:
    case Intrinsic::aarch64_sve_nmatch:
    case Intrinsic::aarch64_sve_whilege_x2:
    case Intrinsic::aarch64_sve_whilegt_x2:
    case Intrinsic::aarch64_sve_whilehi_x2:
    case Intrinsic::aarch64_sve_whilehs_x2:
    case Intrinsic::aarch64_sve_whilele_x2:
    case Intrinsic::aarch64_sve_whilelo_x2:
    case Intrinsic::aarch64_sve_whilels_x2:
    case Intrinsic::aarch64_sve_whilelt_x2:
      return true;
    }
  }
}

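// Splits a pointer-authentication discriminator into its constant and address
// components. Returns a (constant discriminator, address discriminator) pair;
// if no 16-bit constant component can be isolated, the constant returned is 0
// and the original discriminator is returned unchanged as the address part.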
static std::tuple<SDValue, SDValue>
extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG) {
  SDLoc DL(Disc);
  SDValue AddrDisc;
  SDValue ConstDisc;

  // If this is a blend, remember the constant and address discriminators.
  // Otherwise, it's either a constant discriminator, or a non-blended
  // address discriminator.
  if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
      Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
    AddrDisc = Disc->getOperand(1);
    ConstDisc = Disc->getOperand(2);
  } else {
    ConstDisc = Disc;
  }

  // If the constant discriminator (either the blend RHS, or the entire
  // discriminator value) isn't a 16-bit constant, bail out, and let the
  // discriminator be computed separately.
  const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
  if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
    return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);

  // If there's no address discriminator, use NoRegister, which we'll later
  // replace with XZR, or directly use a Z variant of the inst. when available.
  if (!AddrDisc)
    AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);

  return std::make_tuple(
      DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
      AddrDisc);
}

AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                                             const AArch64Subtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
  // we have to make something up. Arbitrarily, choose ZeroOrOne.
  setBooleanContents(ZeroOrOneBooleanContent);
  // When comparing vectors the result sets the different elements in the
  // vector to all-one or all-zero.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Set up the register classes.
  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);

  if (Subtarget->hasLS64()) {
    addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
    setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
    setOperationAction(ISD::STORE, MVT::i64x8, Custom);
  }

  if (Subtarget->hasFPARMv8()) {
    addRegisterClass(MVT::aarch64mfp8, &AArch64::FPR8RegClass);
    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
  }

  if (Subtarget->hasNEON()) {
    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);

    addDRType(MVT::v2f32);
    addDRType(MVT::v8i8);
    addDRType(MVT::v4i16);
    addDRType(MVT::v2i32);
    addDRType(MVT::v1i64);
    addDRType(MVT::v1f64);
    addDRType(MVT::v4f16);
    addDRType(MVT::v4bf16);

    addQRType(MVT::v4f32);
    addQRType(MVT::v2f64);
    addQRType(MVT::v16i8);
    addQRType(MVT::v8i16);
    addQRType(MVT::v4i32);
    addQRType(MVT::v2i64);
    addQRType(MVT::v8f16);
    addQRType(MVT::v8bf16);
  }

  if (Subtarget->isSVEorStreamingSVEAvailable()) {
    // Add legal sve predicate types
    addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);

    // Add legal sve data types
    addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);

    addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);

    addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);

    if (Subtarget->useSVEForFixedLengthVectors()) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);

      for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);
    }
  }

  if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
    addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
    setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
    setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);

    setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
    setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
  }

  // Compute derived properties from the register classes
  computeRegisterProperties(Subtarget->getRegisterInfo());

  // Provide all sorts of operation actions
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::bf16, Custom);
  setOperationAction(ISD::SETCC, MVT::f16, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::bf16, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
  setOperationAction(ISD::BR_CC, MVT::f16, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  if (Subtarget->hasFPARMv8()) {
    setOperationAction(ISD::SELECT, MVT::f16, Custom);
    setOperationAction(ISD::SELECT, MVT::bf16, Custom);
  }
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::bf16, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Custom);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);
  setOperationAction(ISD::BRIND, MVT::Other, Custom);
  setOperationAction(ISD::SETCCCARRY, MVT::i64, Custom);

  setOperationAction(ISD::PtrAuthGlobalAddress, MVT::i64, Custom);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);

  setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);

  // Custom lowering hooks are needed for XOR
  // to fold it into CSINC/CSINV.
  setOperationAction(ISD::XOR, MVT::i32, Custom);
  setOperationAction(ISD::XOR, MVT::i64, Custom);

  setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
  setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);

  // Virtually no operation on f128 is legal, but LLVM can't expand such
  // operations when there's a valid register class, so we need custom
  // operations in most cases.
  setOperationAction(ISD::FABS, MVT::f128, Expand);
  setOperationAction(ISD::FADD, MVT::f128, LibCall);
  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
  setOperationAction(ISD::FCOS, MVT::f128, Expand);
  setOperationAction(ISD::FDIV, MVT::f128, LibCall);
  setOperationAction(ISD::FMA, MVT::f128, Expand);
  setOperationAction(ISD::FMUL, MVT::f128, LibCall);
  setOperationAction(ISD::FNEG, MVT::f128, Expand);
  setOperationAction(ISD::FPOW, MVT::f128, Expand);
  setOperationAction(ISD::FREM, MVT::f128, Expand);
  setOperationAction(ISD::FRINT, MVT::f128, Expand);
  setOperationAction(ISD::FSIN, MVT::f128, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
  setOperationAction(ISD::FSUB, MVT::f128, LibCall);
  setOperationAction(ISD::FTAN, MVT::f128, Expand);
  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
  setOperationAction(ISD::SETCC, MVT::f128, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
  setOperationAction(ISD::SELECT, MVT::f128, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
  // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
  // aren't handled.

  // Lowering for many of the conversions is actually specified by the non-f128
  // type. The LowerXXX function will be trivial when f128 isn't involved.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
  if (Subtarget->hasFPARMv8()) {
    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::bf16, Custom);
  }
  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
  if (Subtarget->hasFPARMv8()) {
    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
    setOperationAction(ISD::STRICT_FP_ROUND, MVT::bf16, Custom);
  }
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);

  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);

  // Variable arguments.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // Variable-sized objects.
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  // Lowering Funnel Shifts to EXTR
  setOperationAction(ISD::FSHR, MVT::i32, Custom);
  setOperationAction(ISD::FSHR, MVT::i64, Custom);
  setOperationAction(ISD::FSHL, MVT::i32, Custom);
  setOperationAction(ISD::FSHL, MVT::i64, Custom);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);

  // Constant pool entries
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);

  // BlockAddress
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);

  // AArch64 lacks both left-rotate and popcount instructions.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }

  // AArch64 doesn't have i32 MULH{S|U}.
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);

  // AArch64 doesn't have {U|S}MUL_LOHI.
  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

  if (Subtarget->hasCSSC()) {
    setOperationAction(ISD::CTPOP, MVT::i32, Legal);
    setOperationAction(ISD::CTPOP, MVT::i64, Legal);
    setOperationAction(ISD::CTPOP, MVT::i128, Expand);

    setOperationAction(ISD::PARITY, MVT::i128, Expand);

    setOperationAction(ISD::CTTZ, MVT::i32, Legal);
    setOperationAction(ISD::CTTZ, MVT::i64, Legal);
    setOperationAction(ISD::CTTZ, MVT::i128, Expand);

    setOperationAction(ISD::ABS, MVT::i32, Legal);
    setOperationAction(ISD::ABS, MVT::i64, Legal);

    setOperationAction(ISD::SMAX, MVT::i32, Legal);
    setOperationAction(ISD::SMAX, MVT::i64, Legal);
    setOperationAction(ISD::UMAX, MVT::i32, Legal);
    setOperationAction(ISD::UMAX, MVT::i64, Legal);

    setOperationAction(ISD::SMIN, MVT::i32, Legal);
    setOperationAction(ISD::SMIN, MVT::i64, Legal);
    setOperationAction(ISD::UMIN, MVT::i32, Legal);
    setOperationAction(ISD::UMIN, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::i64, Custom);
    setOperationAction(ISD::CTPOP, MVT::i128, Custom);

    setOperationAction(ISD::PARITY, MVT::i64, Custom);
    setOperationAction(ISD::PARITY, MVT::i128, Custom);

    setOperationAction(ISD::ABS, MVT::i32, Custom);
    setOperationAction(ISD::ABS, MVT::i64, Custom);
  }

  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
  }
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  // Custom lower Add/Sub/Mul with overflow.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);
  setOperationAction(ISD::UMULO, MVT::i32, Custom);
  setOperationAction(ISD::UMULO, MVT::i64, Custom);

  setOperationAction(ISD::UADDO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::UADDO_CARRY, MVT::i64, Custom);
  setOperationAction(ISD::USUBO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::USUBO_CARRY, MVT::i64, Custom);
  setOperationAction(ISD::SADDO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::SADDO_CARRY, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO_CARRY, MVT::i64, Custom);

  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  if (Subtarget->hasFullFP16()) {
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Custom);
  } else {
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
    setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Promote);
  }

  for (auto Op : {ISD::FREM,          ISD::FPOW,         ISD::FPOWI,
                  ISD::FCOS,          ISD::FSIN,         ISD::FSINCOS,
                  ISD::FSINCOSPI,     ISD::FMODF,        ISD::FACOS,
                  ISD::FASIN,         ISD::FATAN,        ISD::FATAN2,
                  ISD::FCOSH,         ISD::FSINH,        ISD::FTANH,
                  ISD::FTAN,          ISD::FEXP,         ISD::FEXP2,
                  ISD::FEXP10,        ISD::FLOG,         ISD::FLOG2,
                  ISD::FLOG10,        ISD::STRICT_FREM,  ISD::STRICT_FPOW,
                  ISD::STRICT_FPOWI,  ISD::STRICT_FCOS,  ISD::STRICT_FSIN,
                  ISD::STRICT_FACOS,  ISD::STRICT_FASIN, ISD::STRICT_FATAN,
                  ISD::STRICT_FATAN2, ISD::STRICT_FCOSH, ISD::STRICT_FSINH,
                  ISD::STRICT_FTANH,  ISD::STRICT_FEXP,  ISD::STRICT_FEXP2,
                  ISD::STRICT_FLOG,   ISD::STRICT_FLOG2, ISD::STRICT_FLOG10,
                  ISD::STRICT_FTAN}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::v4f16, Expand);
    setOperationAction(Op, MVT::v8f16, Expand);
    setOperationAction(Op, MVT::bf16, Promote);
    setOperationAction(Op, MVT::v4bf16, Expand);
    setOperationAction(Op, MVT::v8bf16, Expand);
  }

  // fpextend from f16 or bf16 to f32 is legal
  setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Legal);
  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f32, Legal);
  // fpextend from bf16 to f64 needs to be split into two fpextends
  setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);

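  // Helper that sets the operation actions (Promote/Expand/Custom/Legal) for a
  // narrow FP scalar type (f16 when FullFP16 is unavailable, and bf16) and its
  // 4- and 8-element vector types.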
  auto LegalizeNarrowFP = [this](MVT ScalarVT) {
    for (auto Op : {
             ISD::SETCC,
             ISD::SELECT_CC,
             ISD::BR_CC,
             ISD::FADD,
             ISD::FSUB,
             ISD::FMUL,
             ISD::FDIV,
             ISD::FMA,
             ISD::FCEIL,
             ISD::FSQRT,
             ISD::FFLOOR,
             ISD::FNEARBYINT,
             ISD::FRINT,
             ISD::FROUND,
             ISD::FROUNDEVEN,
             ISD::FTRUNC,
             ISD::FMINNUM,
             ISD::FMAXNUM,
             ISD::FMINIMUM,
             ISD::FMAXIMUM,
             ISD::FMINIMUMNUM,
             ISD::FMAXIMUMNUM,
             ISD::FCANONICALIZE,
             ISD::STRICT_FADD,
             ISD::STRICT_FSUB,
             ISD::STRICT_FMUL,
             ISD::STRICT_FDIV,
             ISD::STRICT_FMA,
             ISD::STRICT_FCEIL,
             ISD::STRICT_FFLOOR,
             ISD::STRICT_FSQRT,
             ISD::STRICT_FRINT,
             ISD::STRICT_FNEARBYINT,
             ISD::STRICT_FROUND,
             ISD::STRICT_FTRUNC,
             ISD::STRICT_FROUNDEVEN,
             ISD::STRICT_FMINNUM,
             ISD::STRICT_FMAXNUM,
             ISD::STRICT_FMINIMUM,
             ISD::STRICT_FMAXIMUM,
         })
      setOperationAction(Op, ScalarVT, Promote);

    for (auto Op : {ISD::FNEG, ISD::FABS})
      setOperationAction(Op, ScalarVT, Legal);

    // Round-to-integer operations need custom lowering for fp16, as Promote
    // doesn't work because the result type is integer.
    for (auto Op : {ISD::LROUND, ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
                    ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT,
                    ISD::STRICT_LLRINT})
      setOperationAction(Op, ScalarVT, Custom);

    // promote v4f16 to v4f32 when that is known to be safe.
    auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
    setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::SETCC, V4Narrow, MVT::v4f32);

    setOperationAction(ISD::FABS, V4Narrow, Legal);
    setOperationAction(ISD::FNEG, V4Narrow, Legal);
    setOperationAction(ISD::FMA, V4Narrow, Expand);
    setOperationAction(ISD::BR_CC, V4Narrow, Expand);
    setOperationAction(ISD::SELECT, V4Narrow, Expand);
    setOperationAction(ISD::SELECT_CC, V4Narrow, Expand);
    setOperationAction(ISD::FCOPYSIGN, V4Narrow, Custom);
    setOperationAction(ISD::FSQRT, V4Narrow, Expand);

    auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
    setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
    setOperationPromotedToType(ISD::SETCC, V8Narrow, MVT::v8f32);

    setOperationAction(ISD::FABS, V8Narrow, Legal);
    setOperationAction(ISD::FADD, V8Narrow, Legal);
    setOperationAction(ISD::FCEIL, V8Narrow, Legal);
    setOperationAction(ISD::FCOPYSIGN, V8Narrow, Custom);
    setOperationAction(ISD::FDIV, V8Narrow, Legal);
    setOperationAction(ISD::FFLOOR, V8Narrow, Legal);
    setOperationAction(ISD::FMA, V8Narrow, Expand);
    setOperationAction(ISD::FMUL, V8Narrow, Legal);
    setOperationAction(ISD::FNEARBYINT, V8Narrow, Legal);
    setOperationAction(ISD::FNEG, V8Narrow, Legal);
    setOperationAction(ISD::FROUND, V8Narrow, Legal);
    setOperationAction(ISD::FROUNDEVEN, V8Narrow, Legal);
    setOperationAction(ISD::FRINT, V8Narrow, Legal);
    setOperationAction(ISD::FSQRT, V8Narrow, Expand);
    setOperationAction(ISD::FSUB, V8Narrow, Legal);
    setOperationAction(ISD::FTRUNC, V8Narrow, Legal);
    setOperationAction(ISD::BR_CC, V8Narrow, Expand);
    setOperationAction(ISD::SELECT, V8Narrow, Expand);
    setOperationAction(ISD::SELECT_CC, V8Narrow, Expand);
    setOperationAction(ISD::FP_EXTEND, V8Narrow, Expand);
  };

  if (!Subtarget->hasFullFP16()) {
    LegalizeNarrowFP(MVT::f16);
  }
  LegalizeNarrowFP(MVT::bf16);
  setOperationAction(ISD::FP_ROUND, MVT::v4f32, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::v4bf16, Custom);

  // AArch64 has implementations of a lot of rounding-like FP operations.
  // clang-format off
  for (auto Op :
       {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
        ISD::FRINT, ISD::FTRUNC, ISD::FROUND,
        ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM,
        ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND,
        ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
        ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE,
        ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL, ISD::STRICT_FNEARBYINT,
        ISD::STRICT_FRINT, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
        ISD::STRICT_FROUND, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM,
        ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND,
        ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT}) {
    for (MVT Ty : {MVT::f32, MVT::f64})
      setOperationAction(Op, Ty, Legal);
    if (Subtarget->hasFullFP16())
      setOperationAction(Op, MVT::f16, Legal);
  }
  // clang-format on

  // Basic strict FP operations are legal
  for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
                  ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) {
    for (MVT Ty : {MVT::f32, MVT::f64})
      setOperationAction(Op, Ty, Legal);
    if (Subtarget->hasFullFP16())
      setOperationAction(Op, MVT::f16, Legal);
  }

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
  setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
  setOperationAction(ISD::GET_FPMODE, MVT::i32, Custom);
  setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);
  setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);

  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
  if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, LibCall);
  } else {
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand);
  }
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);

  // Generate outline atomics library calls only if LSE was not specified for
  // the subtarget.
  if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
  }

  if (Subtarget->outlineAtomics() && !Subtarget->hasLSFE()) {
    setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::bf16, LibCall);

    setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::bf16, LibCall);

    setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::bf16, LibCall);

    setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::bf16, LibCall);

    setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::bf16, LibCall);
  }

  if (Subtarget->hasLSE128()) {
    // Custom lowering because i128 is not legal. Must be replaced by 2x64
    // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom);
  }

  // 128-bit loads and stores can be done without expanding
  setOperationAction(ISD::LOAD, MVT::i128, Custom);
  setOperationAction(ISD::STORE, MVT::i128, Custom);

  // Aligned 128-bit loads and stores are single-copy atomic according to the
  // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
  if (Subtarget->hasLSE2()) {
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
    setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
  }

  // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
  // custom lowering, as there are no un-paired non-temporal stores and
  // legalization will break up 256 bit inputs.
  setOperationAction(ISD::STORE, MVT::v32i8, Custom);
  setOperationAction(ISD::STORE, MVT::v16i16, Custom);
  setOperationAction(ISD::STORE, MVT::v16f16, Custom);
  setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8f32, Custom);
  setOperationAction(ISD::STORE, MVT::v4f64, Custom);
  setOperationAction(ISD::STORE, MVT::v4i64, Custom);

  // 256 bit non-temporal loads can be lowered to LDNP. This is done using
  // custom lowering, as there are no un-paired non-temporal loads and
  // legalization will break up 256 bit inputs.
  setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
  setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
  setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i64, Custom);

  // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
    // Issue __sincos_stret if available.
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  } else {
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  }

  // Make floating-point constants legal for the large code model, so they don't
  // become loads from the constant pool.
  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  }

  // AArch64 does not have floating-point extending loads, i1 sign-extending
  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
  }
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

  for (MVT WideVT : MVT::fp_valuetypes()) {
    for (MVT NarrowVT : MVT::fp_valuetypes()) {
      if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
        setTruncStoreAction(WideVT, NarrowVT, Expand);
      }
    }
  }

  if (Subtarget->hasFPARMv8()) {
    setOperationAction(ISD::BITCAST, MVT::i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::f16, Custom);
    setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
  }

  // Indexed loads and stores are supported.
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    setIndexedLoadAction(im, MVT::i8, Legal);
    setIndexedLoadAction(im, MVT::i16, Legal);
    setIndexedLoadAction(im, MVT::i32, Legal);
    setIndexedLoadAction(im, MVT::i64, Legal);
    setIndexedLoadAction(im, MVT::f64, Legal);
    setIndexedLoadAction(im, MVT::f32, Legal);
    setIndexedLoadAction(im, MVT::f16, Legal);
    setIndexedLoadAction(im, MVT::bf16, Legal);
    setIndexedStoreAction(im, MVT::i8, Legal);
    setIndexedStoreAction(im, MVT::i16, Legal);
    setIndexedStoreAction(im, MVT::i32, Legal);
    setIndexedStoreAction(im, MVT::i64, Legal);
    setIndexedStoreAction(im, MVT::f64, Legal);
    setIndexedStoreAction(im, MVT::f32, Legal);
    setIndexedStoreAction(im, MVT::f16, Legal);
    setIndexedStoreAction(im, MVT::bf16, Legal);
  }

  // Trap.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
  setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);

  // We combine OR nodes for bitfield operations.
  setTargetDAGCombine(ISD::OR);
  // Try to create BICs for vector ANDs.
  setTargetDAGCombine(ISD::AND);

  // llvm.init.trampoline and llvm.adjust.trampoline
  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  // Vector add and sub nodes may conceal a high-half opportunity.
  // Also, try to fold ADD into CSINC/CSINV.
1125 setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP,
1126 ISD::UINT_TO_FP});
1127
1128 setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
1129 ISD::FP_TO_UINT_SAT, ISD::FADD});
1130
1131 // Try and combine setcc with csel
1132 setTargetDAGCombine(ISD::SETCC);
1133
1134 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1135
1136 setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND,
1137 ISD::SIGN_EXTEND_INREG, ISD::CONCAT_VECTORS,
1138 ISD::EXTRACT_SUBVECTOR, ISD::INSERT_SUBVECTOR,
1139 ISD::STORE, ISD::BUILD_VECTOR});
1140 setTargetDAGCombine(ISD::TRUNCATE);
1141 setTargetDAGCombine(ISD::LOAD);
1142
1143 setTargetDAGCombine(ISD::MSTORE);
1144
1145 setTargetDAGCombine(ISD::MUL);
1146
1147 setTargetDAGCombine({ISD::SELECT, ISD::VSELECT});
1148
1149 setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN,
1150 ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
1151 ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});
1152
1153 setTargetDAGCombine(
1154 {ISD::MGATHER, ISD::MSCATTER, ISD::EXPERIMENTAL_VECTOR_HISTOGRAM});
1155
1156 setTargetDAGCombine(ISD::FP_EXTEND);
1157
1158 setTargetDAGCombine(ISD::GlobalAddress);
1159
1160 setTargetDAGCombine(ISD::CTLZ);
1161
1162 setTargetDAGCombine(ISD::GET_ACTIVE_LANE_MASK);
1163
1164 setTargetDAGCombine(ISD::VECREDUCE_AND);
1165 setTargetDAGCombine(ISD::VECREDUCE_OR);
1166 setTargetDAGCombine(ISD::VECREDUCE_XOR);
1167
1168 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
1169
1170 setTargetDAGCombine(ISD::SHL);
1171
1172 // In case of strict alignment, avoid an excessive number of byte wide stores.
1173 MaxStoresPerMemsetOptSize = 8;
1174 MaxStoresPerMemset =
1175 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1176
1177 MaxGluedStoresPerMemcpy = 4;
1178 MaxStoresPerMemcpyOptSize = 4;
1179 MaxStoresPerMemcpy =
1180 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1181
1182 MaxStoresPerMemmoveOptSize = 4;
1183 MaxStoresPerMemmove =
1184 Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1185
1186 MaxLoadsPerMemcmpOptSize = 4;
1187 MaxLoadsPerMemcmp =
1188 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
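// These limits bound how many inline loads/stores SelectionDAG will emit when
// expanding memset/memcpy/memmove/memcmp before falling back to the library
// call; strict-alignment targets get lower limits because the expansion
// cannot use wide unaligned accesses.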
1189
1190 setStackPointerRegisterToSaveRestore(AArch64::SP);
1191
1192 setSchedulingPreference(Sched::Hybrid);
1193
1194 EnableExtLdPromotion = true;
1195
1196 // Set required alignment.
1197 setMinFunctionAlignment(Align(4));
1198 // Set preferred alignments.
1199
1200 // Don't align loops on Windows. The SEH unwind info generation needs to
1201 // know the exact length of functions before the alignments have been
1202 // expanded.
1203 if (!Subtarget->isTargetWindows())
1204 setPrefLoopAlignment(STI.getPrefLoopAlignment());
1205 setMaxBytesForAlignment(STI.getMaxBytesForLoopAlignment());
1206 setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
1207
1208 // Only change the limit for entries in a jump table if specified by
1209 // the subtarget and not overridden on the command line.
1210 unsigned MaxJT = STI.getMaximumJumpTableSize();
1211 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1212 setMaximumJumpTableSize(MaxJT);
1213
1214 setHasExtractBitsInsn(true);
1215
1216 setMaxDivRemBitWidthSupported(128);
1217
1218 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::Other, Action: Custom);
1219 if (Subtarget->hasSME())
1220 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::i1, Action: Custom);
1221
1222 if (Subtarget->isNeonAvailable()) {
1223 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1224 // silliness like this:
1225 // clang-format off
1226 for (auto Op :
1227 {ISD::SELECT, ISD::SELECT_CC, ISD::FATAN2,
1228 ISD::BR_CC, ISD::FADD, ISD::FSUB,
1229 ISD::FMUL, ISD::FDIV, ISD::FMA,
1230 ISD::FNEG, ISD::FABS, ISD::FCEIL,
1231 ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
1232 ISD::FSIN, ISD::FCOS, ISD::FTAN,
1233 ISD::FASIN, ISD::FACOS, ISD::FATAN,
1234 ISD::FSINH, ISD::FCOSH, ISD::FTANH,
1235 ISD::FPOW, ISD::FLOG, ISD::FLOG2,
1236 ISD::FLOG10, ISD::FEXP, ISD::FEXP2,
1237 ISD::FEXP10, ISD::FRINT, ISD::FROUND,
1238 ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM,
1239 ISD::FMAXNUM, ISD::FMINIMUM, ISD::FMAXIMUM,
1240 ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
1241 ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
1242 ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FCEIL,
1243 ISD::STRICT_FFLOOR, ISD::STRICT_FSQRT, ISD::STRICT_FRINT,
1244 ISD::STRICT_FNEARBYINT, ISD::STRICT_FROUND, ISD::STRICT_FTRUNC,
1245 ISD::STRICT_FROUNDEVEN, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM,
1246 ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM})
1247 setOperationAction(Op, VT: MVT::v1f64, Action: Expand);
1248 // clang-format on
1249
1250 for (auto Op :
1251 {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP,
1252 ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL,
1253 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT,
1254 ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND})
1255 setOperationAction(Op, VT: MVT::v1i64, Action: Expand);
1256
1257 // AArch64 doesn't have direct vector -> f32 conversion instructions for
1258 // elements smaller than i32, so promote the input to i32 first.
1259 setOperationPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::v4i8, DestVT: MVT::v4i32);
1260 setOperationPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::v4i8, DestVT: MVT::v4i32);
1261
1262 // Similarly, there is no direct i32 -> f64 vector conversion instruction,
1263 // nor a direct i32 -> f16 vector conversion. Set these to Custom so the
1264 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16.
1265 for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
1266 ISD::STRICT_UINT_TO_FP})
1267 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1268 setOperationAction(Op, VT, Action: Custom);
1269
1270 if (Subtarget->hasFullFP16()) {
1271 setOperationAction(Op: ISD::ConstantFP, VT: MVT::f16, Action: Legal);
1272 setOperationAction(Op: ISD::ConstantFP, VT: MVT::bf16, Action: Legal);
1273
1274 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v8i8, Action: Custom);
1275 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v8i8, Action: Custom);
1276 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v16i8, Action: Custom);
1277 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v16i8, Action: Custom);
1278 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v4i16, Action: Custom);
1279 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v4i16, Action: Custom);
1280 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v8i16, Action: Custom);
1281 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v8i16, Action: Custom);
1282 } else {
1283 // When AArch64 doesn't have full FP16 support, promote the input
1284 // to i32 first.
1285 setOperationPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::v8i8, DestVT: MVT::v8i32);
1286 setOperationPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::v8i8, DestVT: MVT::v8i32);
1287 setOperationPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::v16i8, DestVT: MVT::v16i32);
1288 setOperationPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::v16i8, DestVT: MVT::v16i32);
1289 setOperationPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::v4i16, DestVT: MVT::v4i32);
1290 setOperationPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::v4i16, DestVT: MVT::v4i32);
1291 setOperationPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::v8i16, DestVT: MVT::v8i32);
1292 setOperationPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::v8i16, DestVT: MVT::v8i32);
1293 }
1294
1295 setOperationAction(Op: ISD::CTLZ, VT: MVT::v1i64, Action: Expand);
1296 setOperationAction(Op: ISD::CTLZ, VT: MVT::v2i64, Action: Expand);
1297 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v8i8, Action: Legal);
1298 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v16i8, Action: Legal);
1299 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v2i32, Action: Custom);
1300 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v4i32, Action: Custom);
1301 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v1i64, Action: Custom);
1302 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v2i64, Action: Custom);
1303 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1304 setOperationAction(Op: ISD::UMAX, VT, Action: Custom);
1305 setOperationAction(Op: ISD::SMAX, VT, Action: Custom);
1306 setOperationAction(Op: ISD::UMIN, VT, Action: Custom);
1307 setOperationAction(Op: ISD::SMIN, VT, Action: Custom);
1308 }
1309
1310 // Custom handling for some quad-vector types to detect MULL.
1311 setOperationAction(Op: ISD::MUL, VT: MVT::v8i16, Action: Custom);
1312 setOperationAction(Op: ISD::MUL, VT: MVT::v4i32, Action: Custom);
1313 setOperationAction(Op: ISD::MUL, VT: MVT::v2i64, Action: Custom);
1314 setOperationAction(Op: ISD::MUL, VT: MVT::v4i16, Action: Custom);
1315 setOperationAction(Op: ISD::MUL, VT: MVT::v2i32, Action: Custom);
1316 setOperationAction(Op: ISD::MUL, VT: MVT::v1i64, Action: Custom);
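// The custom MUL lowering looks through sign/zero-extended operands so that,
// e.g., (mul (sext v4i16), (sext v4i16)) can be selected as SMULL rather than
// a full-width multiply.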
1317
1318 // Saturates
1319 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64,
1320 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1321 setOperationAction(Op: ISD::SADDSAT, VT, Action: Legal);
1322 setOperationAction(Op: ISD::UADDSAT, VT, Action: Legal);
1323 setOperationAction(Op: ISD::SSUBSAT, VT, Action: Legal);
1324 setOperationAction(Op: ISD::USUBSAT, VT, Action: Legal);
1325 }
1326
1327 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1328 MVT::v4i32}) {
1329 setOperationAction(Op: ISD::AVGFLOORS, VT, Action: Legal);
1330 setOperationAction(Op: ISD::AVGFLOORU, VT, Action: Legal);
1331 setOperationAction(Op: ISD::AVGCEILS, VT, Action: Legal);
1332 setOperationAction(Op: ISD::AVGCEILU, VT, Action: Legal);
1333 setOperationAction(Op: ISD::ABDS, VT, Action: Legal);
1334 setOperationAction(Op: ISD::ABDU, VT, Action: Legal);
1335 }
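// These correspond directly to the NEON halving adds ([SU]HADD), rounding
// halving adds ([SU]RHADD) and absolute differences ([SU]ABD).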
1336
1337 // Vector reductions
1338 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1339 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1340 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1341 setOperationAction(Op: ISD::VECREDUCE_FMAX, VT, Action: Legal);
1342 setOperationAction(Op: ISD::VECREDUCE_FMIN, VT, Action: Legal);
1343 setOperationAction(Op: ISD::VECREDUCE_FMAXIMUM, VT, Action: Legal);
1344 setOperationAction(Op: ISD::VECREDUCE_FMINIMUM, VT, Action: Legal);
1345
1346 setOperationAction(Op: ISD::VECREDUCE_FADD, VT, Action: Legal);
1347 }
1348 }
1349 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1350 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1351 setOperationAction(Op: ISD::VECREDUCE_ADD, VT, Action: Custom);
1352 setOperationAction(Op: ISD::VECREDUCE_SMAX, VT, Action: Custom);
1353 setOperationAction(Op: ISD::VECREDUCE_SMIN, VT, Action: Custom);
1354 setOperationAction(Op: ISD::VECREDUCE_UMAX, VT, Action: Custom);
1355 setOperationAction(Op: ISD::VECREDUCE_UMIN, VT, Action: Custom);
1356 setOperationAction(Op: ISD::VECREDUCE_AND, VT, Action: Custom);
1357 setOperationAction(Op: ISD::VECREDUCE_OR, VT, Action: Custom);
1358 setOperationAction(Op: ISD::VECREDUCE_XOR, VT, Action: Custom);
1359 }
1360 setOperationAction(Op: ISD::VECREDUCE_ADD, VT: MVT::v2i64, Action: Custom);
1361 setOperationAction(Op: ISD::VECREDUCE_AND, VT: MVT::v2i64, Action: Custom);
1362 setOperationAction(Op: ISD::VECREDUCE_OR, VT: MVT::v2i64, Action: Custom);
1363 setOperationAction(Op: ISD::VECREDUCE_XOR, VT: MVT::v2i64, Action: Custom);
1364
1365 setOperationAction(Op: ISD::ANY_EXTEND, VT: MVT::v4i32, Action: Legal);
1366 setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i16, Action: Expand);
1367 // Likewise, narrowing and extending vector loads/stores aren't handled
1368 // directly.
1369 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1370 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT, Action: Expand);
1371
1372 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1373 setOperationAction(Op: ISD::MULHS, VT, Action: Legal);
1374 setOperationAction(Op: ISD::MULHU, VT, Action: Legal);
1375 } else {
1376 setOperationAction(Op: ISD::MULHS, VT, Action: Expand);
1377 setOperationAction(Op: ISD::MULHU, VT, Action: Expand);
1378 }
1379 setOperationAction(Op: ISD::SMUL_LOHI, VT, Action: Expand);
1380 setOperationAction(Op: ISD::UMUL_LOHI, VT, Action: Expand);
1381
1382 setOperationAction(Op: ISD::BSWAP, VT, Action: Expand);
1383 setOperationAction(Op: ISD::CTTZ, VT, Action: Expand);
1384
1385 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1386 setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Expand);
1387 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1388 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1389 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1390 }
1391 }
1392
1393 for (auto Op :
1394 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
1395 ISD::FROUND, ISD::FROUNDEVEN, ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
1396 ISD::STRICT_FFLOOR, ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL,
1397 ISD::STRICT_FRINT, ISD::STRICT_FTRUNC, ISD::STRICT_FROUND,
1398 ISD::STRICT_FROUNDEVEN}) {
1399 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1400 setOperationAction(Op, VT: Ty, Action: Legal);
1401 if (Subtarget->hasFullFP16())
1402 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1403 setOperationAction(Op, VT: Ty, Action: Legal);
1404 }
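// The rounding operations above correspond to the vector FRINT* family
// (FRINTM/FRINTP/FRINTA/FRINTN/FRINTZ/FRINTX/FRINTI), and the IEEE min/max
// forms to FMINNM/FMAXNM.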
1405
1406 // LRINT and LLRINT.
1407 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1408 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1409 setOperationAction(Op, VT: Ty, Action: Custom);
1410 if (Subtarget->hasFullFP16())
1411 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1412 setOperationAction(Op, VT: Ty, Action: Custom);
1413 }
1414
1415 setTruncStoreAction(ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Custom);
1416
1417 setOperationAction(Op: ISD::BITCAST, VT: MVT::i2, Action: Custom);
1418 setOperationAction(Op: ISD::BITCAST, VT: MVT::i4, Action: Custom);
1419 setOperationAction(Op: ISD::BITCAST, VT: MVT::i8, Action: Custom);
1420 setOperationAction(Op: ISD::BITCAST, VT: MVT::i16, Action: Custom);
1421
1422 setOperationAction(Op: ISD::BITCAST, VT: MVT::v2i8, Action: Custom);
1423 setOperationAction(Op: ISD::BITCAST, VT: MVT::v2i16, Action: Custom);
1424 setOperationAction(Op: ISD::BITCAST, VT: MVT::v4i8, Action: Custom);
1425
1426 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Custom);
1427 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Custom);
1428 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Custom);
1429 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4i32, MemVT: MVT::v4i8, Action: Custom);
1430 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: MVT::v4i32, MemVT: MVT::v4i8, Action: Custom);
1431 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: MVT::v4i32, MemVT: MVT::v4i8, Action: Custom);
1432
1433 // ADDP custom lowering
1434 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1435 setOperationAction(Op: ISD::ADD, VT, Action: Custom);
1436 // FADDP custom lowering
1437 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1438 setOperationAction(Op: ISD::FADD, VT, Action: Custom);
1439
1440 if (Subtarget->hasDotProd()) {
1441 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1442 ISD::PARTIAL_REDUCE_UMLA};
1443
1444 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::v4i32, InputVT: MVT::v16i8, Action: Legal);
1445 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::v2i32, InputVT: MVT::v8i8, Action: Legal);
1446 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::v2i64, InputVT: MVT::v16i8, Action: Custom);
1447
1448 if (Subtarget->hasMatMulInt8()) {
1449 setPartialReduceMLAAction(Opc: ISD::PARTIAL_REDUCE_SUMLA, AccVT: MVT::v4i32,
1450 InputVT: MVT::v16i8, Action: Legal);
1451 setPartialReduceMLAAction(Opc: ISD::PARTIAL_REDUCE_SUMLA, AccVT: MVT::v2i64,
1452 InputVT: MVT::v16i8, Action: Custom);
1453
1454 setPartialReduceMLAAction(Opc: ISD::PARTIAL_REDUCE_SUMLA, AccVT: MVT::v2i32,
1455 InputVT: MVT::v8i8, Action: Legal);
1456 }
1457 }
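// With +dotprod the v16i8 -> v4i32 and v8i8 -> v2i32 cases can use UDOT/SDOT
// directly; the mixed-signedness SUMLA form additionally needs USDOT from
// +i8mm.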
1458
1459 } else /* !isNeonAvailable */ {
1460 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1461 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1462 setOperationAction(Op, VT, Action: Expand);
1463
1464 if (VT.is128BitVector() || VT.is64BitVector()) {
1465 setOperationAction(Op: ISD::LOAD, VT, Action: Legal);
1466 setOperationAction(Op: ISD::STORE, VT, Action: Legal);
1467 setOperationAction(Op: ISD::BITCAST, VT,
1468 Action: Subtarget->isLittleEndian() ? Legal : Expand);
1469 }
1470 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1471 setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Expand);
1472 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1473 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1474 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1475 }
1476 }
1477 }
1478
1479 for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1480 setOperationAction(Op: ISD::TRUNCATE_SSAT_S, VT, Action: Legal);
1481 setOperationAction(Op: ISD::TRUNCATE_SSAT_U, VT, Action: Legal);
1482 setOperationAction(Op: ISD::TRUNCATE_USAT_U, VT, Action: Legal);
1483 }
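// These saturating truncations roughly correspond to the SQXTN (signed ->
// signed), SQXTUN (signed -> unsigned) and UQXTN (unsigned -> unsigned)
// narrowing instructions.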
1484
1485 if (Subtarget->hasSME()) {
1486 setOperationAction(Op: ISD::INTRINSIC_W_CHAIN, VT: MVT::Other, Action: Custom);
1487 }
1488
1489 // FIXME: Move lowering for more nodes here if those are common between
1490 // SVE and SME.
1491 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1492 for (auto VT :
1493 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1494 setOperationAction(Op: ISD::SPLAT_VECTOR, VT, Action: Custom);
1495 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT, Action: Custom);
1496 setOperationAction(Op: ISD::VECTOR_DEINTERLEAVE, VT, Action: Custom);
1497 setOperationAction(Op: ISD::VECTOR_INTERLEAVE, VT, Action: Custom);
1498 }
1499 for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1500 setOperationAction(Op: ISD::VECTOR_FIND_LAST_ACTIVE, VT, Action: Legal);
1501 setOperationAction(Op: ISD::GET_ACTIVE_LANE_MASK, VT, Action: Legal);
1502 }
1503
1504 if (Subtarget->hasSVE2p1() ||
1505 (Subtarget->hasSME2() && Subtarget->isStreaming()))
1506 setOperationAction(Op: ISD::GET_ACTIVE_LANE_MASK, VT: MVT::nxv32i1, Action: Custom);
1507
1508 for (auto VT : {MVT::v16i8, MVT::v8i8, MVT::v4i16, MVT::v2i32})
1509 setOperationAction(Op: ISD::GET_ACTIVE_LANE_MASK, VT, Action: Custom);
1510 }
1511
1512 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1513 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1514 setOperationAction(Op: ISD::BITREVERSE, VT, Action: Custom);
1515 setOperationAction(Op: ISD::BSWAP, VT, Action: Custom);
1516 setOperationAction(Op: ISD::CTLZ, VT, Action: Custom);
1517 setOperationAction(Op: ISD::CTPOP, VT, Action: Custom);
1518 setOperationAction(Op: ISD::CTTZ, VT, Action: Custom);
1519 setOperationAction(Op: ISD::INSERT_SUBVECTOR, VT, Action: Custom);
1520 setOperationAction(Op: ISD::UINT_TO_FP, VT, Action: Custom);
1521 setOperationAction(Op: ISD::SINT_TO_FP, VT, Action: Custom);
1522 setOperationAction(Op: ISD::FP_TO_UINT, VT, Action: Custom);
1523 setOperationAction(Op: ISD::FP_TO_SINT, VT, Action: Custom);
1524 setOperationAction(Op: ISD::MLOAD, VT, Action: Custom);
1525 setOperationAction(Op: ISD::MUL, VT, Action: Custom);
1526 setOperationAction(Op: ISD::MULHS, VT, Action: Custom);
1527 setOperationAction(Op: ISD::MULHU, VT, Action: Custom);
1528 setOperationAction(Op: ISD::SPLAT_VECTOR, VT, Action: Legal);
1529 setOperationAction(Op: ISD::VECTOR_SPLICE, VT, Action: Custom);
1530 setOperationAction(Op: ISD::SELECT, VT, Action: Custom);
1531 setOperationAction(Op: ISD::SETCC, VT, Action: Custom);
1532 setOperationAction(Op: ISD::SDIV, VT, Action: Custom);
1533 setOperationAction(Op: ISD::UDIV, VT, Action: Custom);
1534 setOperationAction(Op: ISD::SMIN, VT, Action: Custom);
1535 setOperationAction(Op: ISD::UMIN, VT, Action: Custom);
1536 setOperationAction(Op: ISD::SMAX, VT, Action: Custom);
1537 setOperationAction(Op: ISD::UMAX, VT, Action: Custom);
1538 setOperationAction(Op: ISD::SHL, VT, Action: Custom);
1539 setOperationAction(Op: ISD::SRL, VT, Action: Custom);
1540 setOperationAction(Op: ISD::SRA, VT, Action: Custom);
1541 setOperationAction(Op: ISD::ABS, VT, Action: Custom);
1542 setOperationAction(Op: ISD::ABDS, VT, Action: Custom);
1543 setOperationAction(Op: ISD::ABDU, VT, Action: Custom);
1544 setOperationAction(Op: ISD::VECREDUCE_ADD, VT, Action: Custom);
1545 setOperationAction(Op: ISD::VECREDUCE_AND, VT, Action: Custom);
1546 setOperationAction(Op: ISD::VECREDUCE_OR, VT, Action: Custom);
1547 setOperationAction(Op: ISD::VECREDUCE_XOR, VT, Action: Custom);
1548 setOperationAction(Op: ISD::VECREDUCE_UMIN, VT, Action: Custom);
1549 setOperationAction(Op: ISD::VECREDUCE_UMAX, VT, Action: Custom);
1550 setOperationAction(Op: ISD::VECREDUCE_SMIN, VT, Action: Custom);
1551 setOperationAction(Op: ISD::VECREDUCE_SMAX, VT, Action: Custom);
1552 setOperationAction(Op: ISD::VECTOR_DEINTERLEAVE, VT, Action: Custom);
1553 setOperationAction(Op: ISD::VECTOR_INTERLEAVE, VT, Action: Custom);
1554
1555 setOperationAction(Op: ISD::UMUL_LOHI, VT, Action: Expand);
1556 setOperationAction(Op: ISD::SMUL_LOHI, VT, Action: Expand);
1557 setOperationAction(Op: ISD::SELECT_CC, VT, Action: Expand);
1558 setOperationAction(Op: ISD::ROTL, VT, Action: Expand);
1559 setOperationAction(Op: ISD::ROTR, VT, Action: Expand);
1560
1561 setOperationAction(Op: ISD::SADDSAT, VT, Action: Legal);
1562 setOperationAction(Op: ISD::UADDSAT, VT, Action: Legal);
1563 setOperationAction(Op: ISD::SSUBSAT, VT, Action: Legal);
1564 setOperationAction(Op: ISD::USUBSAT, VT, Action: Legal);
1565 setOperationAction(Op: ISD::UREM, VT, Action: Expand);
1566 setOperationAction(Op: ISD::SREM, VT, Action: Expand);
1567 setOperationAction(Op: ISD::SDIVREM, VT, Action: Expand);
1568 setOperationAction(Op: ISD::UDIVREM, VT, Action: Expand);
1569
1570 setOperationAction(Op: ISD::AVGFLOORS, VT, Action: Custom);
1571 setOperationAction(Op: ISD::AVGFLOORU, VT, Action: Custom);
1572 setOperationAction(Op: ISD::AVGCEILS, VT, Action: Custom);
1573 setOperationAction(Op: ISD::AVGCEILU, VT, Action: Custom);
1574
1575 if (!Subtarget->isLittleEndian())
1576 setOperationAction(Op: ISD::BITCAST, VT, Action: Custom);
1577
1578 if (Subtarget->hasSVE2() ||
1579 (Subtarget->hasSME() && Subtarget->isStreaming()))
1580 // For SLI/SRI.
1581 setOperationAction(Op: ISD::OR, VT, Action: Custom);
1582 }
1583
1584 // Illegal unpacked integer vector types.
1585 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1586 setOperationAction(Op: ISD::EXTRACT_SUBVECTOR, VT, Action: Custom);
1587 setOperationAction(Op: ISD::INSERT_SUBVECTOR, VT, Action: Custom);
1588 }
1589
1590 // Type legalize unpacked bitcasts.
1591 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1592 setOperationAction(Op: ISD::BITCAST, VT, Action: Custom);
1593
1594 for (auto VT :
1595 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1596 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1597 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT, Action: Legal);
1598
1599 for (auto VT :
1600 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1601 setOperationAction(Op: ISD::CONCAT_VECTORS, VT, Action: Custom);
1602 setOperationAction(Op: ISD::SELECT, VT, Action: Custom);
1603 setOperationAction(Op: ISD::SETCC, VT, Action: Custom);
1604 setOperationAction(Op: ISD::TRUNCATE, VT, Action: Custom);
1605 setOperationAction(Op: ISD::VECREDUCE_AND, VT, Action: Custom);
1606 setOperationAction(Op: ISD::VECREDUCE_OR, VT, Action: Custom);
1607 setOperationAction(Op: ISD::VECREDUCE_XOR, VT, Action: Custom);
1608
1609 setOperationAction(Op: ISD::SELECT_CC, VT, Action: Expand);
1610 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT, Action: Custom);
1611 setOperationAction(Op: ISD::INSERT_SUBVECTOR, VT, Action: Custom);
1612
1613 // There are no legal MVT::nxv16f## based types.
1614 if (VT != MVT::nxv16i1) {
1615 setOperationAction(Op: ISD::FP_TO_SINT, VT, Action: Custom);
1616 setOperationAction(Op: ISD::FP_TO_UINT, VT, Action: Custom);
1617 setOperationAction(Op: ISD::SINT_TO_FP, VT, Action: Custom);
1618 setOperationAction(Op: ISD::UINT_TO_FP, VT, Action: Custom);
1619 }
1620 }
1621
1622 // NEON doesn't support masked loads/stores, but SME and SVE do.
1623 for (auto VT :
1624 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1625 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1626 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1627 setOperationAction(Op: ISD::MLOAD, VT, Action: Custom);
1628 setOperationAction(Op: ISD::MSTORE, VT, Action: Custom);
1629 }
1630
1631 // First, mark all scalable-vector extending loads and truncating stores as
1632 // Expand, covering both integer and floating-point scalable vectors.
1633 for (MVT VT : MVT::scalable_vector_valuetypes()) {
1634 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1635 setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Expand);
1636 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1637 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1638 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1639 }
1640 }
1641
1642 // Then, selectively enable those which we directly support.
1643 setTruncStoreAction(ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i8, Action: Legal);
1644 setTruncStoreAction(ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i16, Action: Legal);
1645 setTruncStoreAction(ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i32, Action: Legal);
1646 setTruncStoreAction(ValVT: MVT::nxv4i32, MemVT: MVT::nxv4i8, Action: Legal);
1647 setTruncStoreAction(ValVT: MVT::nxv4i32, MemVT: MVT::nxv4i16, Action: Legal);
1648 setTruncStoreAction(ValVT: MVT::nxv8i16, MemVT: MVT::nxv8i8, Action: Legal);
1649 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1650 setLoadExtAction(ExtType: Op, ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i8, Action: Legal);
1651 setLoadExtAction(ExtType: Op, ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i16, Action: Legal);
1652 setLoadExtAction(ExtType: Op, ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i32, Action: Legal);
1653 setLoadExtAction(ExtType: Op, ValVT: MVT::nxv4i32, MemVT: MVT::nxv4i8, Action: Legal);
1654 setLoadExtAction(ExtType: Op, ValVT: MVT::nxv4i32, MemVT: MVT::nxv4i16, Action: Legal);
1655 setLoadExtAction(ExtType: Op, ValVT: MVT::nxv8i16, MemVT: MVT::nxv8i8, Action: Legal);
1656 }
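// The combinations above are the ones the SVE widening loads (LD1B/LD1H/LD1W
// and their sign-extending forms) and narrowing stores (ST1B/ST1H/ST1W) can
// handle directly.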
1657
1658 // SVE supports truncating stores of 64-bit and 128-bit vectors.
1659 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i8, Action: Custom);
1660 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i16, Action: Custom);
1661 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i32, Action: Custom);
1662 setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i8, Action: Custom);
1663 setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i16, Action: Custom);
1664
1665 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1666 MVT::nxv4f32, MVT::nxv2f64}) {
1667 setOperationAction(Op: ISD::BITCAST, VT, Action: Custom);
1668 setOperationAction(Op: ISD::CONCAT_VECTORS, VT, Action: Custom);
1669 setOperationAction(Op: ISD::INSERT_SUBVECTOR, VT, Action: Custom);
1670 setOperationAction(Op: ISD::MLOAD, VT, Action: Custom);
1671 setOperationAction(Op: ISD::SPLAT_VECTOR, VT, Action: Legal);
1672 setOperationAction(Op: ISD::SELECT, VT, Action: Custom);
1673 setOperationAction(Op: ISD::SETCC, VT, Action: Custom);
1674 setOperationAction(Op: ISD::FADD, VT, Action: Custom);
1675 setOperationAction(Op: ISD::FCOPYSIGN, VT, Action: Custom);
1676 setOperationAction(Op: ISD::FDIV, VT, Action: Custom);
1677 setOperationAction(Op: ISD::FMA, VT, Action: Custom);
1678 setOperationAction(Op: ISD::FMAXIMUM, VT, Action: Custom);
1679 setOperationAction(Op: ISD::FMAXNUM, VT, Action: Custom);
1680 setOperationAction(Op: ISD::FMINIMUM, VT, Action: Custom);
1681 setOperationAction(Op: ISD::FMINNUM, VT, Action: Custom);
1682 setOperationAction(Op: ISD::FMUL, VT, Action: Custom);
1683 setOperationAction(Op: ISD::FNEG, VT, Action: Custom);
1684 setOperationAction(Op: ISD::FSUB, VT, Action: Custom);
1685 setOperationAction(Op: ISD::FCEIL, VT, Action: Custom);
1686 setOperationAction(Op: ISD::FFLOOR, VT, Action: Custom);
1687 setOperationAction(Op: ISD::FNEARBYINT, VT, Action: Custom);
1688 setOperationAction(Op: ISD::FRINT, VT, Action: Custom);
1689 setOperationAction(Op: ISD::LRINT, VT, Action: Custom);
1690 setOperationAction(Op: ISD::LLRINT, VT, Action: Custom);
1691 setOperationAction(Op: ISD::FROUND, VT, Action: Custom);
1692 setOperationAction(Op: ISD::FROUNDEVEN, VT, Action: Custom);
1693 setOperationAction(Op: ISD::FTRUNC, VT, Action: Custom);
1694 setOperationAction(Op: ISD::FSQRT, VT, Action: Custom);
1695 setOperationAction(Op: ISD::FABS, VT, Action: Custom);
1696 setOperationAction(Op: ISD::FP_EXTEND, VT, Action: Custom);
1697 setOperationAction(Op: ISD::FP_ROUND, VT, Action: Custom);
1698 setOperationAction(Op: ISD::VECREDUCE_FADD, VT, Action: Custom);
1699 setOperationAction(Op: ISD::VECREDUCE_FMAX, VT, Action: Custom);
1700 setOperationAction(Op: ISD::VECREDUCE_FMIN, VT, Action: Custom);
1701 setOperationAction(Op: ISD::VECREDUCE_FMAXIMUM, VT, Action: Custom);
1702 setOperationAction(Op: ISD::VECREDUCE_FMINIMUM, VT, Action: Custom);
1703 setOperationAction(Op: ISD::VECTOR_SPLICE, VT, Action: Custom);
1704 setOperationAction(Op: ISD::VECTOR_DEINTERLEAVE, VT, Action: Custom);
1705 setOperationAction(Op: ISD::VECTOR_INTERLEAVE, VT, Action: Custom);
1706
1707 setOperationAction(Op: ISD::SELECT_CC, VT, Action: Expand);
1708 setOperationAction(Op: ISD::FREM, VT, Action: Expand);
1709 setOperationAction(Op: ISD::FPOW, VT, Action: Expand);
1710 setOperationAction(Op: ISD::FPOWI, VT, Action: Expand);
1711 setOperationAction(Op: ISD::FCOS, VT, Action: Expand);
1712 setOperationAction(Op: ISD::FSIN, VT, Action: Expand);
1713 setOperationAction(Op: ISD::FSINCOS, VT, Action: Expand);
1714 setOperationAction(Op: ISD::FTAN, VT, Action: Expand);
1715 setOperationAction(Op: ISD::FACOS, VT, Action: Expand);
1716 setOperationAction(Op: ISD::FASIN, VT, Action: Expand);
1717 setOperationAction(Op: ISD::FATAN, VT, Action: Expand);
1718 setOperationAction(Op: ISD::FATAN2, VT, Action: Expand);
1719 setOperationAction(Op: ISD::FCOSH, VT, Action: Expand);
1720 setOperationAction(Op: ISD::FSINH, VT, Action: Expand);
1721 setOperationAction(Op: ISD::FTANH, VT, Action: Expand);
1722 setOperationAction(Op: ISD::FEXP, VT, Action: Expand);
1723 setOperationAction(Op: ISD::FEXP2, VT, Action: Expand);
1724 setOperationAction(Op: ISD::FEXP10, VT, Action: Expand);
1725 setOperationAction(Op: ISD::FLOG, VT, Action: Expand);
1726 setOperationAction(Op: ISD::FLOG2, VT, Action: Expand);
1727 setOperationAction(Op: ISD::FLOG10, VT, Action: Expand);
1728
1729 setCondCodeAction(CCs: ISD::SETO, VT, Action: Expand);
1730 setCondCodeAction(CCs: ISD::SETOLT, VT, Action: Expand);
1731 setCondCodeAction(CCs: ISD::SETLT, VT, Action: Expand);
1732 setCondCodeAction(CCs: ISD::SETOLE, VT, Action: Expand);
1733 setCondCodeAction(CCs: ISD::SETLE, VT, Action: Expand);
1734 setCondCodeAction(CCs: ISD::SETULT, VT, Action: Expand);
1735 setCondCodeAction(CCs: ISD::SETULE, VT, Action: Expand);
1736 setCondCodeAction(CCs: ISD::SETUGE, VT, Action: Expand);
1737 setCondCodeAction(CCs: ISD::SETUGT, VT, Action: Expand);
1738 setCondCodeAction(CCs: ISD::SETUEQ, VT, Action: Expand);
1739 setCondCodeAction(CCs: ISD::SETONE, VT, Action: Expand);
1740 }
1741
1742 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1743 setOperationAction(Op: ISD::BITCAST, VT, Action: Custom);
1744 setOperationAction(Op: ISD::CONCAT_VECTORS, VT, Action: Custom);
1745 setOperationAction(Op: ISD::FABS, VT, Action: Legal);
1746 setOperationAction(Op: ISD::FCOPYSIGN, VT, Action: Custom);
1747 setOperationAction(Op: ISD::FNEG, VT, Action: Legal);
1748 setOperationAction(Op: ISD::FP_EXTEND, VT, Action: Custom);
1749 setOperationAction(Op: ISD::FP_ROUND, VT, Action: Custom);
1750 setOperationAction(Op: ISD::MLOAD, VT, Action: Custom);
1751 setOperationAction(Op: ISD::INSERT_SUBVECTOR, VT, Action: Custom);
1752 setOperationAction(Op: ISD::SELECT, VT, Action: Custom);
1753 setOperationAction(Op: ISD::SELECT_CC, VT, Action: Expand);
1754 setOperationAction(Op: ISD::SPLAT_VECTOR, VT, Action: Legal);
1755 setOperationAction(Op: ISD::VECTOR_DEINTERLEAVE, VT, Action: Custom);
1756 setOperationAction(Op: ISD::VECTOR_INTERLEAVE, VT, Action: Custom);
1757 setOperationAction(Op: ISD::VECTOR_SPLICE, VT, Action: Custom);
1758
1759 if (Subtarget->hasSVEB16B16()) {
1760 setOperationAction(Op: ISD::FADD, VT, Action: Legal);
1761 setOperationAction(Op: ISD::FMA, VT, Action: Custom);
1762 setOperationAction(Op: ISD::FMAXIMUM, VT, Action: Custom);
1763 setOperationAction(Op: ISD::FMAXNUM, VT, Action: Custom);
1764 setOperationAction(Op: ISD::FMINIMUM, VT, Action: Custom);
1765 setOperationAction(Op: ISD::FMINNUM, VT, Action: Custom);
1766 setOperationAction(Op: ISD::FMUL, VT, Action: Legal);
1767 setOperationAction(Op: ISD::FSUB, VT, Action: Legal);
1768 }
1769 }
1770
1771 for (auto Opcode :
1772 {ISD::FCEIL, ISD::FDIV, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
1773 ISD::FROUND, ISD::FROUNDEVEN, ISD::FSQRT, ISD::FTRUNC, ISD::SETCC,
1774 ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMAXIMUM,
1775 ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMINIMUM}) {
1776 setOperationPromotedToType(Opc: Opcode, OrigVT: MVT::nxv2bf16, DestVT: MVT::nxv2f32);
1777 setOperationPromotedToType(Opc: Opcode, OrigVT: MVT::nxv4bf16, DestVT: MVT::nxv4f32);
1778 setOperationPromotedToType(Opc: Opcode, OrigVT: MVT::nxv8bf16, DestVT: MVT::nxv8f32);
1779 }
1780
1781 if (!Subtarget->hasSVEB16B16()) {
1782 for (auto Opcode : {ISD::FADD, ISD::FMA, ISD::FMAXIMUM, ISD::FMAXNUM,
1783 ISD::FMINIMUM, ISD::FMINNUM, ISD::FMUL, ISD::FSUB}) {
1784 setOperationPromotedToType(Opc: Opcode, OrigVT: MVT::nxv2bf16, DestVT: MVT::nxv2f32);
1785 setOperationPromotedToType(Opc: Opcode, OrigVT: MVT::nxv4bf16, DestVT: MVT::nxv4f32);
1786 setOperationPromotedToType(Opc: Opcode, OrigVT: MVT::nxv8bf16, DestVT: MVT::nxv8f32);
1787 }
1788 }
1789
1790 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::i8, Action: Custom);
1791 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::i16, Action: Custom);
1792
1793 // NEON doesn't support integer divides, but SVE does
1794 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1795 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1796 setOperationAction(Op: ISD::SDIV, VT, Action: Custom);
1797 setOperationAction(Op: ISD::UDIV, VT, Action: Custom);
1798 }
1799
1800 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1801 setOperationAction(Op: ISD::MUL, VT: MVT::v1i64, Action: Custom);
1802 setOperationAction(Op: ISD::MUL, VT: MVT::v2i64, Action: Custom);
1803
1804 // NOTE: Currently this has to happen after computeRegisterProperties rather
1805 // than the preferred option of combining it with the addRegisterClass call.
1806 if (Subtarget->useSVEForFixedLengthVectors()) {
1807 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
1808 if (useSVEForFixedLengthVectorVT(
1809 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1810 addTypeForFixedLengthSVE(VT);
1811 }
1812 for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) {
1813 if (useSVEForFixedLengthVectorVT(
1814 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1815 addTypeForFixedLengthSVE(VT);
1816 }
1817
1818 // 64-bit results can mean a bigger-than-NEON input.
1819 for (auto VT : {MVT::v8i8, MVT::v4i16})
1820 setOperationAction(Op: ISD::TRUNCATE, VT, Action: Custom);
1821 setOperationAction(Op: ISD::FP_ROUND, VT: MVT::v4f16, Action: Custom);
1822
1823 // 128-bit results imply a bigger-than-NEON input.
1824 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1825 setOperationAction(Op: ISD::TRUNCATE, VT, Action: Custom);
1826 for (auto VT : {MVT::v8f16, MVT::v4f32})
1827 setOperationAction(Op: ISD::FP_ROUND, VT, Action: Custom);
1828
1829 // These operations are not supported on NEON but SVE can do them.
1830 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v1i64, Action: Custom);
1831 setOperationAction(Op: ISD::CTLZ, VT: MVT::v1i64, Action: Custom);
1832 setOperationAction(Op: ISD::CTLZ, VT: MVT::v2i64, Action: Custom);
1833 setOperationAction(Op: ISD::CTTZ, VT: MVT::v1i64, Action: Custom);
1834 setOperationAction(Op: ISD::MULHS, VT: MVT::v1i64, Action: Custom);
1835 setOperationAction(Op: ISD::MULHS, VT: MVT::v2i64, Action: Custom);
1836 setOperationAction(Op: ISD::MULHU, VT: MVT::v1i64, Action: Custom);
1837 setOperationAction(Op: ISD::MULHU, VT: MVT::v2i64, Action: Custom);
1838 setOperationAction(Op: ISD::SMAX, VT: MVT::v1i64, Action: Custom);
1839 setOperationAction(Op: ISD::SMAX, VT: MVT::v2i64, Action: Custom);
1840 setOperationAction(Op: ISD::SMIN, VT: MVT::v1i64, Action: Custom);
1841 setOperationAction(Op: ISD::SMIN, VT: MVT::v2i64, Action: Custom);
1842 setOperationAction(Op: ISD::UMAX, VT: MVT::v1i64, Action: Custom);
1843 setOperationAction(Op: ISD::UMAX, VT: MVT::v2i64, Action: Custom);
1844 setOperationAction(Op: ISD::UMIN, VT: MVT::v1i64, Action: Custom);
1845 setOperationAction(Op: ISD::UMIN, VT: MVT::v2i64, Action: Custom);
1846 setOperationAction(Op: ISD::VECREDUCE_SMAX, VT: MVT::v2i64, Action: Custom);
1847 setOperationAction(Op: ISD::VECREDUCE_SMIN, VT: MVT::v2i64, Action: Custom);
1848 setOperationAction(Op: ISD::VECREDUCE_UMAX, VT: MVT::v2i64, Action: Custom);
1849 setOperationAction(Op: ISD::VECREDUCE_UMIN, VT: MVT::v2i64, Action: Custom);
1850
1851 // Int operations with no NEON support.
1852 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1853 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1854 setOperationAction(Op: ISD::BITREVERSE, VT, Action: Custom);
1855 setOperationAction(Op: ISD::CTTZ, VT, Action: Custom);
1856 setOperationAction(Op: ISD::VECREDUCE_AND, VT, Action: Custom);
1857 setOperationAction(Op: ISD::VECREDUCE_OR, VT, Action: Custom);
1858 setOperationAction(Op: ISD::VECREDUCE_XOR, VT, Action: Custom);
1859 setOperationAction(Op: ISD::MULHS, VT, Action: Custom);
1860 setOperationAction(Op: ISD::MULHU, VT, Action: Custom);
1861 }
1862
1863 // Use SVE for vectors with more than 2 elements.
1864 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1865 setOperationAction(Op: ISD::VECREDUCE_FADD, VT, Action: Custom);
1866 }
1867
1868 setOperationPromotedToType(Opc: ISD::VECTOR_SPLICE, OrigVT: MVT::nxv2i1, DestVT: MVT::nxv2i64);
1869 setOperationPromotedToType(Opc: ISD::VECTOR_SPLICE, OrigVT: MVT::nxv4i1, DestVT: MVT::nxv4i32);
1870 setOperationPromotedToType(Opc: ISD::VECTOR_SPLICE, OrigVT: MVT::nxv8i1, DestVT: MVT::nxv8i16);
1871 setOperationPromotedToType(Opc: ISD::VECTOR_SPLICE, OrigVT: MVT::nxv16i1, DestVT: MVT::nxv16i8);
1872
1873 setOperationAction(Op: ISD::VSCALE, VT: MVT::i32, Action: Custom);
1874
1875 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1876 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT, Action: Custom);
1877 }
1878
1879 // Handle partial reduction operations
1880 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1881 // Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
1882 // Other pairs will default to 'Expand'.
1883 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1884 ISD::PARTIAL_REDUCE_UMLA};
1885 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::nxv2i64, InputVT: MVT::nxv8i16, Action: Legal);
1886 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::nxv4i32, InputVT: MVT::nxv16i8, Action: Legal);
1887
1888 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::nxv2i64, InputVT: MVT::nxv16i8, Action: Custom);
1889
1890 if (Subtarget->hasMatMulInt8()) {
1891 setPartialReduceMLAAction(Opc: ISD::PARTIAL_REDUCE_SUMLA, AccVT: MVT::nxv4i32,
1892 InputVT: MVT::nxv16i8, Action: Legal);
1893 setPartialReduceMLAAction(Opc: ISD::PARTIAL_REDUCE_SUMLA, AccVT: MVT::nxv2i64,
1894 InputVT: MVT::nxv16i8, Action: Custom);
1895 }
1896
1897 // Wide add types
1898 if (Subtarget->hasSVE2() || Subtarget->hasSME()) {
1899 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::nxv2i64, InputVT: MVT::nxv4i32, Action: Legal);
1900 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::nxv4i32, InputVT: MVT::nxv8i16, Action: Legal);
1901 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::nxv8i16, InputVT: MVT::nxv16i8, Action: Legal);
1902 }
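// These 2x-widening accumulations can typically be selected as the SVE2
// wide-add bottom/top pairs ([SU]ADDWB/[SU]ADDWT) instead of a dot product.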
1903 }
1904
1905 // Handle operations that are only available in non-streaming SVE mode.
1906 if (Subtarget->isSVEAvailable()) {
1907 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
1908 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1909 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1910 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
1911 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
1912 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1913 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1914 setOperationAction(Op: ISD::MGATHER, VT, Action: Custom);
1915 setOperationAction(Op: ISD::MSCATTER, VT, Action: Custom);
1916 }
1917
1918 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1919 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1920 MVT::v2f32, MVT::v4f32, MVT::v2f64})
1921 setOperationAction(Op: ISD::VECREDUCE_SEQ_FADD, VT, Action: Custom);
1922
1923 // We can lower types that have <vscale x {2|4}> elements to the SVE COMPACT instruction.
1924 for (auto VT :
1925 {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
1926 MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
1927 setOperationAction(Op: ISD::VECTOR_COMPRESS, VT, Action: Custom);
1928
1929 // If we have SVE, we can use SVE logic for legal (or smaller than legal)
1930 // NEON vectors in the lowest bits of the SVE register.
1931 for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32,
1932 MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32})
1933 setOperationAction(Op: ISD::VECTOR_COMPRESS, VT, Action: Custom);
1934
1935 // Histcnt is SVE2 only
1936 if (Subtarget->hasSVE2()) {
1937 setOperationAction(Op: ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, VT: MVT::nxv4i32,
1938 Action: Custom);
1939 setOperationAction(Op: ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, VT: MVT::nxv2i64,
1940 Action: Custom);
1941
1942 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1943 ISD::PARTIAL_REDUCE_UMLA};
1944 // Must be lowered to SVE instructions.
1945 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::v2i64, InputVT: MVT::v4i32, Action: Custom);
1946 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::v2i64, InputVT: MVT::v8i16, Action: Custom);
1947 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::v2i64, InputVT: MVT::v16i8, Action: Custom);
1948 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::v4i32, InputVT: MVT::v8i16, Action: Custom);
1949 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::v4i32, InputVT: MVT::v16i8, Action: Custom);
1950 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::v8i16, InputVT: MVT::v16i8, Action: Custom);
1951 }
1952 }
1953
1954 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1955 // Only required for llvm.aarch64.mops.memset.tag
1956 setOperationAction(Op: ISD::INTRINSIC_W_CHAIN, VT: MVT::i8, Action: Custom);
1957 }
1958
1959 setOperationAction(Op: ISD::INTRINSIC_VOID, VT: MVT::Other, Action: Custom);
1960
1961 if (Subtarget->hasSVE()) {
1962 setOperationAction(Op: ISD::FLDEXP, VT: MVT::f64, Action: Custom);
1963 setOperationAction(Op: ISD::FLDEXP, VT: MVT::f32, Action: Custom);
1964 setOperationAction(Op: ISD::FLDEXP, VT: MVT::f16, Action: Custom);
1965 setOperationAction(Op: ISD::FLDEXP, VT: MVT::bf16, Action: Custom);
1966 }
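// With SVE available, scalar FLDEXP can be lowered using the SVE FSCALE
// instruction rather than a libcall.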
1967
1968 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1969
1970 IsStrictFPEnabled = true;
1971 setMaxAtomicSizeInBitsSupported(128);
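// Atomic operations up to 128 bits are expanded inline (e.g. via CASP with
// LSE or an LDXP/STXP loop); anything larger becomes an atomic library call.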
1972
1973 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1974 // it, but it's just a wrapper around ldexp.
1975 if (Subtarget->isTargetWindows()) {
1976 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1977 if (isOperationExpand(Op, VT: MVT::f32))
1978 setOperationAction(Op, VT: MVT::f32, Action: Promote);
1979 }
1980
1981 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1982 // isn't legal.
1983 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1984 if (isOperationExpand(Op, VT: MVT::f16))
1985 setOperationAction(Op, VT: MVT::f16, Action: Promote);
1986}
1987
1988void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1989 assert(VT.isVector() && "VT should be a vector type");
1990
1991 if (VT.isFloatingPoint()) {
1992 MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
1993 setOperationPromotedToType(Opc: ISD::LOAD, OrigVT: VT, DestVT: PromoteTo);
1994 setOperationPromotedToType(Opc: ISD::STORE, OrigVT: VT, DestVT: PromoteTo);
1995 }
1996
1997 // Mark vector float intrinsics as expand.
1998 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1999 setOperationAction(Op: ISD::FSIN, VT, Action: Expand);
2000 setOperationAction(Op: ISD::FCOS, VT, Action: Expand);
2001 setOperationAction(Op: ISD::FTAN, VT, Action: Expand);
2002 setOperationAction(Op: ISD::FASIN, VT, Action: Expand);
2003 setOperationAction(Op: ISD::FACOS, VT, Action: Expand);
2004 setOperationAction(Op: ISD::FATAN, VT, Action: Expand);
2005 setOperationAction(Op: ISD::FATAN2, VT, Action: Expand);
2006 setOperationAction(Op: ISD::FSINH, VT, Action: Expand);
2007 setOperationAction(Op: ISD::FCOSH, VT, Action: Expand);
2008 setOperationAction(Op: ISD::FTANH, VT, Action: Expand);
2009 setOperationAction(Op: ISD::FPOW, VT, Action: Expand);
2010 setOperationAction(Op: ISD::FLOG, VT, Action: Expand);
2011 setOperationAction(Op: ISD::FLOG2, VT, Action: Expand);
2012 setOperationAction(Op: ISD::FLOG10, VT, Action: Expand);
2013 setOperationAction(Op: ISD::FEXP, VT, Action: Expand);
2014 setOperationAction(Op: ISD::FEXP2, VT, Action: Expand);
2015 setOperationAction(Op: ISD::FEXP10, VT, Action: Expand);
2016 }
2017
2018 // But we do support custom-lowering for FCOPYSIGN.
2019 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
2020 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
2021 VT == MVT::v8f16) &&
2022 Subtarget->hasFullFP16()))
2023 setOperationAction(Op: ISD::FCOPYSIGN, VT, Action: Custom);
2024
2025 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT, Action: Custom);
2026 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT, Action: Custom);
2027 setOperationAction(Op: ISD::BUILD_VECTOR, VT, Action: Custom);
2028 setOperationAction(Op: ISD::ZERO_EXTEND_VECTOR_INREG, VT, Action: Custom);
2029 setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT, Action: Custom);
2030 setOperationAction(Op: ISD::EXTRACT_SUBVECTOR, VT, Action: Custom);
2031 setOperationAction(Op: ISD::SRA, VT, Action: Custom);
2032 setOperationAction(Op: ISD::SRL, VT, Action: Custom);
2033 setOperationAction(Op: ISD::SHL, VT, Action: Custom);
2034 setOperationAction(Op: ISD::OR, VT, Action: Custom);
2035 setOperationAction(Op: ISD::SETCC, VT, Action: Custom);
2036 setOperationAction(Op: ISD::CONCAT_VECTORS, VT, Action: Legal);
2037
2038 setOperationAction(Op: ISD::SELECT, VT, Action: Expand);
2039 setOperationAction(Op: ISD::SELECT_CC, VT, Action: Expand);
2040 setOperationAction(Op: ISD::VSELECT, VT, Action: Expand);
2041 for (MVT InnerVT : MVT::all_valuetypes())
2042 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: InnerVT, MemVT: VT, Action: Expand);
2043
2044 // CNT supports only B element sizes; for wider elements, count bytes and then widen with UADDLP.
2045 if (VT != MVT::v8i8 && VT != MVT::v16i8)
2046 setOperationAction(Op: ISD::CTPOP, VT, Action: Custom);
2047
2048 setOperationAction(Op: ISD::UDIV, VT, Action: Expand);
2049 setOperationAction(Op: ISD::SDIV, VT, Action: Expand);
2050 setOperationAction(Op: ISD::UREM, VT, Action: Expand);
2051 setOperationAction(Op: ISD::SREM, VT, Action: Expand);
2052 setOperationAction(Op: ISD::FREM, VT, Action: Expand);
2053
2054 for (unsigned Opcode :
2055 {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
2056 ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
2057 setOperationAction(Op: Opcode, VT, Action: Custom);
2058
2059 if (!VT.isFloatingPoint())
2060 setOperationAction(Op: ISD::ABS, VT, Action: Legal);
2061
2062 // [SU][MIN|MAX] are available for all NEON types apart from i64.
2063 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
2064 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
2065 setOperationAction(Op: Opcode, VT, Action: Legal);
2066
2067 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
2068 // NEON types.
2069 if (VT.isFloatingPoint() &&
2070 VT.getVectorElementType() != MVT::bf16 &&
2071 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
2072 for (unsigned Opcode :
2073 {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
2074 ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::STRICT_FMINIMUM,
2075 ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM,
2076 ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
2077 ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT})
2078 setOperationAction(Op: Opcode, VT, Action: Legal);
2079
2080 // Strict fp extend and trunc are legal
2081 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
2082 setOperationAction(Op: ISD::STRICT_FP_EXTEND, VT, Action: Legal);
2083 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
2084 setOperationAction(Op: ISD::STRICT_FP_ROUND, VT, Action: Legal);
2085
2086 // FIXME: We could potentially make use of the vector comparison instructions
2087 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
2088 // complications:
2089 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
2090 // so we would need to expand when the condition code doesn't match the
2091 // kind of comparison.
2092 // * Some kinds of comparison require more than one FCMXY instruction so
2093 // would need to be expanded instead.
2094 // * The lowering of the non-strict versions involves target-specific ISD
2095 // nodes so we would likely need to add strict versions of all of them and
2096 // handle them appropriately.
2097 setOperationAction(Op: ISD::STRICT_FSETCC, VT, Action: Expand);
2098 setOperationAction(Op: ISD::STRICT_FSETCCS, VT, Action: Expand);
2099
2100 // When little-endian we can use ordinary d and q register loads/stores for
2101 // vector types, but when big-endian we need to use structure loads/stores,
2102 // which only allow post-index addressing.
2103 if (Subtarget->isLittleEndian()) {
2104 for (unsigned im = (unsigned)ISD::PRE_INC;
2105 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
2106 setIndexedLoadAction(IdxModes: im, VT, Action: Legal);
2107 setIndexedStoreAction(IdxModes: im, VT, Action: Legal);
2108 }
2109 } else {
2110 setIndexedLoadAction(IdxModes: ISD::POST_INC, VT, Action: Legal);
2111 setIndexedStoreAction(IdxModes: ISD::POST_INC, VT, Action: Legal);
2112 }
2113
2114 if (Subtarget->hasD128()) {
2115 setOperationAction(Op: ISD::READ_REGISTER, VT: MVT::i128, Action: Custom);
2116 setOperationAction(Op: ISD::WRITE_REGISTER, VT: MVT::i128, Action: Custom);
2117 }
2118
2119 if (VT.isInteger()) {
2120 // Let common code emit inverted variants of compares we do support.
2121 setCondCodeAction(CCs: ISD::SETNE, VT, Action: Expand);
2122 setCondCodeAction(CCs: ISD::SETLE, VT, Action: Expand);
2123 setCondCodeAction(CCs: ISD::SETLT, VT, Action: Expand);
2124 setCondCodeAction(CCs: ISD::SETULE, VT, Action: Expand);
2125 setCondCodeAction(CCs: ISD::SETULT, VT, Action: Expand);
2126 }
2127}
2128
2129bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
2130 EVT OpVT) const {
2131 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2132 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
2133 ResVT.getVectorElementType() != MVT::i1)
2134 return true;
2135
2136 // Only support illegal types if the result is scalable and min elements > 1.
2137 if (ResVT.getVectorMinNumElements() == 1 ||
2138 (ResVT.isFixedLengthVector() && (ResVT.getVectorNumElements() > 16 ||
2139 (OpVT != MVT::i32 && OpVT != MVT::i64))))
2140 return true;
2141
2142 // 32- and 64-bit operands are supported. We can promote anything < 64 bits,
2143 // but anything larger should be expanded.
2144 if (OpVT.getFixedSizeInBits() > 64)
2145 return true;
2146
2147 return false;
2148}
2149
2150bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
2151 const IntrinsicInst *I) const {
2152 assert(I->getIntrinsicID() ==
2153 Intrinsic::experimental_vector_partial_reduce_add &&
2154 "Unexpected intrinsic!");
2155 return true;
2156}
2157
2158bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
2159 if (!Subtarget->isSVEorStreamingSVEAvailable())
2160 return true;
2161
2162 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2163 // also support fixed-width predicates.
2164 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2165 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2166 VT != MVT::v4i1 && VT != MVT::v2i1;
2167}
2168
2169bool AArch64TargetLowering::shouldExpandVectorMatch(EVT VT,
2170 unsigned SearchSize) const {
2171 // MATCH is SVE2 and only available in non-streaming mode.
2172 if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2173 return true;
2174 // Furthermore, we can only use it for 8-bit or 16-bit elements.
2175 if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2176 return SearchSize != 8;
2177 if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2178 return SearchSize != 8 && SearchSize != 16;
2179 return true;
2180}
2181
2182void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2183 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2184
2185 // By default everything must be expanded.
2186 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2187 setOperationAction(Op, VT, Action: Expand);
2188
2189 if (VT.isFloatingPoint()) {
2190 setCondCodeAction(CCs: ISD::SETO, VT, Action: Expand);
2191 setCondCodeAction(CCs: ISD::SETOLT, VT, Action: Expand);
2192 setCondCodeAction(CCs: ISD::SETOLE, VT, Action: Expand);
2193 setCondCodeAction(CCs: ISD::SETULT, VT, Action: Expand);
2194 setCondCodeAction(CCs: ISD::SETULE, VT, Action: Expand);
2195 setCondCodeAction(CCs: ISD::SETUGE, VT, Action: Expand);
2196 setCondCodeAction(CCs: ISD::SETUGT, VT, Action: Expand);
2197 setCondCodeAction(CCs: ISD::SETUEQ, VT, Action: Expand);
2198 setCondCodeAction(CCs: ISD::SETONE, VT, Action: Expand);
2199 }
2200
2201 TargetLoweringBase::LegalizeAction Default =
2202 VT == MVT::v1f64 ? Expand : Custom;
2203
2204 // Mark integer truncating stores/extending loads as having custom lowering
2205 if (VT.isInteger()) {
2206 MVT InnerVT = VT.changeVectorElementType(EltVT: MVT::i8);
2207 while (InnerVT != VT) {
2208 setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Default);
2209 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Default);
2210 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Default);
2211 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Default);
2212 InnerVT = InnerVT.changeVectorElementType(
2213 EltVT: MVT::getIntegerVT(BitWidth: 2 * InnerVT.getScalarSizeInBits()));
2214 }
2215 }
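// For example, for v8i32 the loop above marks v8i8 and v8i16 truncating
// stores and extending loads with the Default action (Custom here, since the
// v1f64 special case only applies to floating-point types).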
2216
2217 // Mark floating-point truncating stores/extending loads as having custom
2218 // lowering
2219 if (VT.isFloatingPoint()) {
2220 MVT InnerVT = VT.changeVectorElementType(EltVT: MVT::f16);
2221 while (InnerVT != VT) {
2222 setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Custom);
2223 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Default);
2224 InnerVT = InnerVT.changeVectorElementType(
2225 EltVT: MVT::getFloatingPointVT(BitWidth: 2 * InnerVT.getScalarSizeInBits()));
2226 }
2227 }
2228
2229 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2230 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2231
2232 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2233 ISD::PARTIAL_REDUCE_UMLA};
2234 unsigned NumElts = VT.getVectorNumElements();
2235 if (VT.getVectorElementType() == MVT::i64) {
2236 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: VT,
2237 InputVT: MVT::getVectorVT(VT: MVT::i8, NumElements: NumElts * 8), Action: Custom);
2238 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: VT,
2239 InputVT: MVT::getVectorVT(VT: MVT::i16, NumElements: NumElts * 4), Action: Custom);
2240 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: VT,
2241 InputVT: MVT::getVectorVT(VT: MVT::i32, NumElements: NumElts * 2), Action: Custom);
2242 } else if (VT.getVectorElementType() == MVT::i32) {
2243 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: VT,
2244 InputVT: MVT::getVectorVT(VT: MVT::i8, NumElements: NumElts * 4), Action: Custom);
2245 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: VT,
2246 InputVT: MVT::getVectorVT(VT: MVT::i16, NumElements: NumElts * 2), Action: Custom);
2247 } else if (VT.getVectorElementType() == MVT::i16) {
2248 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: VT,
2249 InputVT: MVT::getVectorVT(VT: MVT::i8, NumElements: NumElts * 2), Action: Custom);
2250 }
2251 if (Subtarget->hasMatMulInt8()) {
2252 if (VT.getVectorElementType() == MVT::i32)
2253 setPartialReduceMLAAction(Opc: ISD::PARTIAL_REDUCE_SUMLA, AccVT: VT,
2254 InputVT: MVT::getVectorVT(VT: MVT::i8, NumElements: NumElts * 4), Action: Custom);
2255 else if (VT.getVectorElementType() == MVT::i64)
2256 setPartialReduceMLAAction(Opc: ISD::PARTIAL_REDUCE_SUMLA, AccVT: VT,
2257 InputVT: MVT::getVectorVT(VT: MVT::i8, NumElements: NumElts * 8), Action: Custom);
2258 }
2259
2260 // Lower fixed length vector operations to scalable equivalents.
2261 setOperationAction(Op: ISD::ABDS, VT, Action: Default);
2262 setOperationAction(Op: ISD::ABDU, VT, Action: Default);
2263 setOperationAction(Op: ISD::ABS, VT, Action: Default);
2264 setOperationAction(Op: ISD::ADD, VT, Action: Default);
2265 setOperationAction(Op: ISD::AND, VT, Action: Default);
2266 setOperationAction(Op: ISD::ANY_EXTEND, VT, Action: Default);
2267 setOperationAction(Op: ISD::BITCAST, VT, Action: PreferNEON ? Legal : Default);
2268 setOperationAction(Op: ISD::BITREVERSE, VT, Action: Default);
2269 setOperationAction(Op: ISD::BSWAP, VT, Action: Default);
2270 setOperationAction(Op: ISD::BUILD_VECTOR, VT, Action: Custom);
2271 setOperationAction(Op: ISD::CONCAT_VECTORS, VT, Action: Default);
2272 setOperationAction(Op: ISD::CTLZ, VT, Action: Default);
2273 setOperationAction(Op: ISD::CTPOP, VT, Action: Default);
2274 setOperationAction(Op: ISD::CTTZ, VT, Action: Default);
2275 setOperationAction(Op: ISD::EXTRACT_SUBVECTOR, VT, Action: Default);
2276 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT, Action: Default);
2277 setOperationAction(Op: ISD::FABS, VT, Action: Default);
2278 setOperationAction(Op: ISD::FADD, VT, Action: Default);
2279 setOperationAction(Op: ISD::FCEIL, VT, Action: Default);
2280 setOperationAction(Op: ISD::FCOPYSIGN, VT, Action: Default);
2281 setOperationAction(Op: ISD::FDIV, VT, Action: Default);
2282 setOperationAction(Op: ISD::FFLOOR, VT, Action: Default);
2283 setOperationAction(Op: ISD::FMA, VT, Action: Default);
2284 setOperationAction(Op: ISD::FMAXIMUM, VT, Action: Default);
2285 setOperationAction(Op: ISD::FMAXNUM, VT, Action: Default);
2286 setOperationAction(Op: ISD::FMINIMUM, VT, Action: Default);
2287 setOperationAction(Op: ISD::FMINNUM, VT, Action: Default);
2288 setOperationAction(Op: ISD::FMUL, VT, Action: Default);
2289 setOperationAction(Op: ISD::FNEARBYINT, VT, Action: Default);
2290 setOperationAction(Op: ISD::FNEG, VT, Action: Default);
2291 setOperationAction(Op: ISD::FP_EXTEND, VT, Action: Default);
2292 setOperationAction(Op: ISD::FP_ROUND, VT, Action: Default);
2293 setOperationAction(Op: ISD::FP_TO_SINT, VT, Action: Default);
2294 setOperationAction(Op: ISD::FP_TO_UINT, VT, Action: Default);
2295 setOperationAction(Op: ISD::FRINT, VT, Action: Default);
2296 setOperationAction(Op: ISD::LRINT, VT, Action: Default);
2297 setOperationAction(Op: ISD::LLRINT, VT, Action: Default);
2298 setOperationAction(Op: ISD::FROUND, VT, Action: Default);
2299 setOperationAction(Op: ISD::FROUNDEVEN, VT, Action: Default);
2300 setOperationAction(Op: ISD::FSQRT, VT, Action: Default);
2301 setOperationAction(Op: ISD::FSUB, VT, Action: Default);
2302 setOperationAction(Op: ISD::FTRUNC, VT, Action: Default);
2303 setOperationAction(Op: ISD::GET_ACTIVE_LANE_MASK, VT, Action: Default);
2304 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT, Action: Default);
2305 setOperationAction(Op: ISD::LOAD, VT, Action: PreferNEON ? Legal : Default);
2306 setOperationAction(Op: ISD::MGATHER, VT, Action: PreferSVE ? Default : Expand);
2307 setOperationAction(Op: ISD::MLOAD, VT, Action: Default);
2308 setOperationAction(Op: ISD::MSCATTER, VT, Action: PreferSVE ? Default : Expand);
2309 setOperationAction(Op: ISD::MSTORE, VT, Action: Default);
2310 setOperationAction(Op: ISD::MUL, VT, Action: Default);
2311 setOperationAction(Op: ISD::MULHS, VT, Action: Default);
2312 setOperationAction(Op: ISD::MULHU, VT, Action: Default);
2313 setOperationAction(Op: ISD::OR, VT, Action: Default);
2314 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT, Action: PreferNEON ? Legal : Expand);
2315 setOperationAction(Op: ISD::SDIV, VT, Action: Default);
2316 setOperationAction(Op: ISD::SELECT, VT, Action: Default);
2317 setOperationAction(Op: ISD::SETCC, VT, Action: Default);
2318 setOperationAction(Op: ISD::SHL, VT, Action: Default);
2319 setOperationAction(Op: ISD::SIGN_EXTEND, VT, Action: Default);
2320 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT, Action: Default);
2321 setOperationAction(Op: ISD::SINT_TO_FP, VT, Action: Default);
2322 setOperationAction(Op: ISD::SMAX, VT, Action: Default);
2323 setOperationAction(Op: ISD::SMIN, VT, Action: Default);
2324 setOperationAction(Op: ISD::SPLAT_VECTOR, VT, Action: Default);
2325 setOperationAction(Op: ISD::SRA, VT, Action: Default);
2326 setOperationAction(Op: ISD::SRL, VT, Action: Default);
2327 setOperationAction(Op: ISD::STORE, VT, Action: PreferNEON ? Legal : Default);
2328 setOperationAction(Op: ISD::SUB, VT, Action: Default);
2329 setOperationAction(Op: ISD::TRUNCATE, VT, Action: Default);
2330 setOperationAction(Op: ISD::UDIV, VT, Action: Default);
2331 setOperationAction(Op: ISD::UINT_TO_FP, VT, Action: Default);
2332 setOperationAction(Op: ISD::UMAX, VT, Action: Default);
2333 setOperationAction(Op: ISD::UMIN, VT, Action: Default);
2334 setOperationAction(Op: ISD::VECREDUCE_ADD, VT, Action: Default);
2335 setOperationAction(Op: ISD::VECREDUCE_AND, VT, Action: Default);
2336 setOperationAction(Op: ISD::VECREDUCE_FADD, VT, Action: Default);
2337 setOperationAction(Op: ISD::VECREDUCE_FMAX, VT, Action: Default);
2338 setOperationAction(Op: ISD::VECREDUCE_FMIN, VT, Action: Default);
2339 setOperationAction(Op: ISD::VECREDUCE_FMAXIMUM, VT, Action: Default);
2340 setOperationAction(Op: ISD::VECREDUCE_FMINIMUM, VT, Action: Default);
2341 setOperationAction(Op: ISD::VECREDUCE_OR, VT, Action: Default);
2342 setOperationAction(Op: ISD::VECREDUCE_SEQ_FADD, VT, Action: PreferSVE ? Default : Expand);
2343 setOperationAction(Op: ISD::VECREDUCE_SMAX, VT, Action: Default);
2344 setOperationAction(Op: ISD::VECREDUCE_SMIN, VT, Action: Default);
2345 setOperationAction(Op: ISD::VECREDUCE_UMAX, VT, Action: Default);
2346 setOperationAction(Op: ISD::VECREDUCE_UMIN, VT, Action: Default);
2347 setOperationAction(Op: ISD::VECREDUCE_XOR, VT, Action: Default);
2348 setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT, Action: Default);
2349 setOperationAction(Op: ISD::VECTOR_SPLICE, VT, Action: Default);
2350 setOperationAction(Op: ISD::VSELECT, VT, Action: Default);
2351 setOperationAction(Op: ISD::XOR, VT, Action: Default);
2352 setOperationAction(Op: ISD::ZERO_EXTEND, VT, Action: Default);
2353}
2354
2355void AArch64TargetLowering::addDRType(MVT VT) {
2356 addRegisterClass(VT, RC: &AArch64::FPR64RegClass);
2357 if (Subtarget->isNeonAvailable())
2358 addTypeForNEON(VT);
2359}
2360
2361void AArch64TargetLowering::addQRType(MVT VT) {
2362 addRegisterClass(VT, RC: &AArch64::FPR128RegClass);
2363 if (Subtarget->isNeonAvailable())
2364 addTypeForNEON(VT);
2365}
2366
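// Scalar SETCC results are i32. For scalable vectors the result is a
// same-length vector of i1 predicate elements; for fixed-width vectors it is
// an integer vector with the same element width as the compared type.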
2367EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
2368 LLVMContext &C, EVT VT) const {
2369 if (!VT.isVector())
2370 return MVT::i32;
2371 if (VT.isScalableVector())
2372 return EVT::getVectorVT(Context&: C, VT: MVT::i1, EC: VT.getVectorElementCount());
2373 return VT.changeVectorElementTypeToInteger();
2374}
2375
2376// isIntImmediate - This method tests to see if the node is a constant
2377 // operand. If so, Imm will receive the value.
2378static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2379 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(Val: N)) {
2380 Imm = C->getZExtValue();
2381 return true;
2382 }
2383 return false;
2384}
2385
2386// isOpcWithIntImmediate - This method tests to see if the node is a specific
2387 // opcode and that it has an immediate integer right operand.
2388 // If so, Imm will receive the value.
2389static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2390 uint64_t &Imm) {
2391 return N->getOpcode() == Opc &&
2392 isIntImmediate(N: N->getOperand(Num: 1).getNode(), Imm);
2393}
2394
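// optimizeLogicalImm - Attempt to replace the immediate of a logical
// operation (AND/OR/XOR) that is not a valid bitmask immediate. Bits that are
// not demanded are filled in so that, after replicating the resulting element
// across the register width, the constant becomes encodable and the node can
// be rewritten as the machine instruction NewOpc (ANDWri/ORRWri/EORWri or
// their 64-bit equivalents). Returns true and commits the combine on success.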
2395static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2396 const APInt &Demanded,
2397 TargetLowering::TargetLoweringOpt &TLO,
2398 unsigned NewOpc) {
2399 uint64_t OldImm = Imm, NewImm, Enc;
2400 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2401
2402 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2403 // bimm64.
2404 if (Imm == 0 || Imm == Mask ||
2405 AArch64_AM::isLogicalImmediate(imm: Imm & Mask, regSize: Size))
2406 return false;
2407
2408 unsigned EltSize = Size;
2409 uint64_t DemandedBits = Demanded.getZExtValue();
2410
2411 // Clear bits that are not demanded.
2412 Imm &= DemandedBits;
2413
2414 while (true) {
2415 // The goal here is to set the non-demanded bits in a way that minimizes
2416 // the number of switching between 0 and 1. In order to achieve this goal,
2417 // we set the non-demanded bits to the value of the preceding demanded bits.
2418 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2419 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2420 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2421 // The final result is 0b11000011.
2422 uint64_t NonDemandedBits = ~DemandedBits;
2423 uint64_t InvertedImm = ~Imm & DemandedBits;
2424 uint64_t RotatedImm =
2425 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2426 NonDemandedBits;
2427 uint64_t Sum = RotatedImm + NonDemandedBits;
2428 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2429 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2430 NewImm = (Imm | Ones) & Mask;
2431
2432 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2433 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2434 // we halve the element size and continue the search.
2435 if (isShiftedMask_64(Value: NewImm) || isShiftedMask_64(Value: ~(NewImm | ~Mask)))
2436 break;
2437
2438 // We cannot shrink the element size any further if it is 2-bits.
2439 if (EltSize == 2)
2440 return false;
2441
2442 EltSize /= 2;
2443 Mask >>= EltSize;
2444 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2445
2446 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2447 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2448 return false;
2449
2450 // Merge the upper and lower halves of Imm and DemandedBits.
2451 Imm |= Hi;
2452 DemandedBits |= DemandedBitsHi;
2453 }
2454
2455 ++NumOptimizedImms;
2456
2457 // Replicate the element across the register width.
2458 while (EltSize < Size) {
2459 NewImm |= NewImm << EltSize;
2460 EltSize *= 2;
2461 }
2462
2463 (void)OldImm;
2464 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2465 "demanded bits should never be altered");
2466 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2467
2468 // Create the new constant immediate node.
2469 EVT VT = Op.getValueType();
2470 SDLoc DL(Op);
2471 SDValue New;
2472
2473 // If the new constant immediate is all-zeros or all-ones, let the target
2474 // independent DAG combine optimize this node.
2475 if (NewImm == 0 || NewImm == OrigMask) {
2476 New = TLO.DAG.getNode(Opcode: Op.getOpcode(), DL, VT, N1: Op.getOperand(i: 0),
2477 N2: TLO.DAG.getConstant(Val: NewImm, DL, VT));
2478 // Otherwise, create a machine node so that target independent DAG combine
2479 // doesn't undo this optimization.
2480 } else {
2481 Enc = AArch64_AM::encodeLogicalImmediate(imm: NewImm, regSize: Size);
2482 SDValue EncConst = TLO.DAG.getTargetConstant(Val: Enc, DL, VT);
2483 New = SDValue(
2484 TLO.DAG.getMachineNode(Opcode: NewOpc, dl: DL, VT, Op1: Op.getOperand(i: 0), Op2: EncConst), 0);
2485 }
2486
2487 return TLO.CombineTo(O: Op, N: New);
2488}
2489
2490bool AArch64TargetLowering::targetShrinkDemandedConstant(
2491 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2492 TargetLoweringOpt &TLO) const {
2493 // Delay this optimization to as late as possible.
2494 if (!TLO.LegalOps)
2495 return false;
2496
2497 if (!EnableOptimizeLogicalImm)
2498 return false;
2499
2500 EVT VT = Op.getValueType();
2501 if (VT.isVector())
2502 return false;
2503
2504 unsigned Size = VT.getSizeInBits();
2505
2506 if (Size != 32 && Size != 64)
2507 return false;
2508
2509 // Exit early if we demand all bits.
2510 if (DemandedBits.popcount() == Size)
2511 return false;
2512
2513 unsigned NewOpc;
2514 switch (Op.getOpcode()) {
2515 default:
2516 return false;
2517 case ISD::AND:
2518 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2519 break;
2520 case ISD::OR:
2521 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2522 break;
2523 case ISD::XOR:
2524 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2525 break;
2526 }
2527 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1));
2528 if (!C)
2529 return false;
2530 uint64_t Imm = C->getZExtValue();
2531 return optimizeLogicalImm(Op, Size, Imm, Demanded: DemandedBits, TLO, NewOpc);
2532}
2533
2534/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2535 /// Mask are known to be either zero or one and return them in Known.
2536void AArch64TargetLowering::computeKnownBitsForTargetNode(
2537 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2538 const SelectionDAG &DAG, unsigned Depth) const {
2539 switch (Op.getOpcode()) {
2540 default:
2541 break;
2542 case AArch64ISD::DUP: {
2543 SDValue SrcOp = Op.getOperand(i: 0);
2544 Known = DAG.computeKnownBits(Op: SrcOp, Depth: Depth + 1);
2545 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2546 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2547 "Expected DUP implicit truncation");
2548 Known = Known.trunc(BitWidth: Op.getScalarValueSizeInBits());
2549 }
2550 break;
2551 }
2552 case AArch64ISD::CSEL: {
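// CSEL returns one of its first two operands, so only the bits known in both
// operands are known in the result.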
2553 KnownBits Known2;
2554 Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1);
2555 Known2 = DAG.computeKnownBits(Op: Op->getOperand(Num: 1), Depth: Depth + 1);
2556 Known = Known.intersectWith(RHS: Known2);
2557 break;
2558 }
2559 case AArch64ISD::BICi: {
2560 // Compute the bit cleared value.
2561 APInt Mask =
2562 ~(Op->getConstantOperandAPInt(Num: 1) << Op->getConstantOperandAPInt(Num: 2))
2563 .trunc(width: Known.getBitWidth());
2564 Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1);
2565 Known &= KnownBits::makeConstant(C: Mask);
2566 break;
2567 }
2568 case AArch64ISD::VLSHR: {
2569 KnownBits Known2;
2570 Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1);
2571 Known2 = DAG.computeKnownBits(Op: Op->getOperand(Num: 1), Depth: Depth + 1);
2572 Known = KnownBits::lshr(LHS: Known, RHS: Known2);
2573 break;
2574 }
2575 case AArch64ISD::VASHR: {
2576 KnownBits Known2;
2577 Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1);
2578 Known2 = DAG.computeKnownBits(Op: Op->getOperand(Num: 1), Depth: Depth + 1);
2579 Known = KnownBits::ashr(LHS: Known, RHS: Known2);
2580 break;
2581 }
2582 case AArch64ISD::VSHL: {
2583 KnownBits Known2;
2584 Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1);
2585 Known2 = DAG.computeKnownBits(Op: Op->getOperand(Num: 1), Depth: Depth + 1);
2586 Known = KnownBits::shl(LHS: Known, RHS: Known2);
2587 break;
2588 }
2589 case AArch64ISD::MOVI: {
2590 Known = KnownBits::makeConstant(
2591 C: APInt(Known.getBitWidth(), Op->getConstantOperandVal(Num: 0)));
2592 break;
2593 }
2594 case AArch64ISD::LOADgot:
2595 case AArch64ISD::ADDlow: {
2596 if (!Subtarget->isTargetILP32())
2597 break;
2598 // In ILP32 mode all valid pointers are in the low 4GB of the address space.
2599 Known.Zero = APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32);
2600 break;
2601 }
2602 case AArch64ISD::ASSERT_ZEXT_BOOL: {
2603 Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1);
2604 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2605 break;
2606 }
2607 case ISD::INTRINSIC_W_CHAIN: {
2608 Intrinsic::ID IntID =
2609 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(Num: 1));
2610 switch (IntID) {
2611 default: return;
2612 case Intrinsic::aarch64_ldaxr:
2613 case Intrinsic::aarch64_ldxr: {
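// LDXR/LDAXR of a type narrower than the result register zero-extend the
// loaded value, so all bits above the memory width are known to be zero.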
2614 unsigned BitWidth = Known.getBitWidth();
2615 EVT VT = cast<MemIntrinsicSDNode>(Val: Op)->getMemoryVT();
2616 unsigned MemBits = VT.getScalarSizeInBits();
2617 Known.Zero |= APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - MemBits);
2618 return;
2619 }
2620 }
2621 break;
2622 }
2623 case ISD::INTRINSIC_WO_CHAIN:
2624 case ISD::INTRINSIC_VOID: {
2625 unsigned IntNo = Op.getConstantOperandVal(i: 0);
2626 switch (IntNo) {
2627 default:
2628 break;
2629 case Intrinsic::aarch64_neon_uaddlv: {
2630 MVT VT = Op.getOperand(i: 1).getValueType().getSimpleVT();
2631 unsigned BitWidth = Known.getBitWidth();
2632 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2633 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
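// The unsigned sum of 8 bytes is at most 8 * 255 = 2040 (< 2^11) and of 16
// bytes at most 16 * 255 = 4080 (< 2^12), so the bits above Bound are zero.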
2634 assert(BitWidth >= Bound && "Unexpected width!");
2635 APInt Mask = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - Bound);
2636 Known.Zero |= Mask;
2637 }
2638 break;
2639 }
2640 case Intrinsic::aarch64_neon_umaxv:
2641 case Intrinsic::aarch64_neon_uminv: {
2642 // Figure out the datatype of the vector operand. The UMINV instruction
2643 // will zero extend the result, so we can mark as known zero all the
2644 // bits larger than the element datatype. 32-bit or larger doesn't need
2645 // this as those are legal types and will be handled by isel directly.
2646 MVT VT = Op.getOperand(i: 1).getValueType().getSimpleVT();
2647 unsigned BitWidth = Known.getBitWidth();
2648 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2649 assert(BitWidth >= 8 && "Unexpected width!");
2650 APInt Mask = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - 8);
2651 Known.Zero |= Mask;
2652 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2653 assert(BitWidth >= 16 && "Unexpected width!");
2654 APInt Mask = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - 16);
2655 Known.Zero |= Mask;
2656 }
2657 break;
2658 }
2659 }
2660 }
2661 }
2662}
2663
2664unsigned AArch64TargetLowering::ComputeNumSignBitsForTargetNode(
2665 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2666 unsigned Depth) const {
2667 EVT VT = Op.getValueType();
2668 unsigned VTBits = VT.getScalarSizeInBits();
2669 unsigned Opcode = Op.getOpcode();
2670 switch (Opcode) {
2671 case AArch64ISD::FCMEQ:
2672 case AArch64ISD::FCMGE:
2673 case AArch64ISD::FCMGT:
2674 // Compares return either 0 or all-ones
2675 return VTBits;
2676 case AArch64ISD::VASHR: {
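// An arithmetic shift right duplicates the sign bit into every vacated
// position, adding the shift amount to the known sign bits (capped at the
// element width).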
2677 unsigned Tmp =
2678 DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), DemandedElts, Depth: Depth + 1);
2679 return std::min<uint64_t>(a: Tmp + Op.getConstantOperandVal(i: 1), b: VTBits);
2680 }
2681 }
2682
2683 return 1;
2684}
2685
2686MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
2687 EVT) const {
2688 return MVT::i64;
2689}
2690
2691bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2692 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2693 unsigned *Fast) const {
2694
2695 // Allow SVE loads/stores where the alignment >= the size of the element type,
2696 // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2697 // for stores that come from IR, only require element-size alignment (even if
2698 // unaligned accesses are disabled). Without this, these will be forced to
2699 // have 16-byte alignment with +strict-align (and fail to lower as we don't
2700 // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2701 if (VT.isScalableVector()) {
2702 unsigned ElementSizeBits = VT.getScalarSizeInBits();
2703 if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2704 return true;
2705 }
2706
2707 if (Subtarget->requiresStrictAlign())
2708 return false;
2709
2710 if (Fast) {
2711 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2712 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2713 // See comments in performSTORECombine() for more details about
2714 // these conditions.
2715
2716 // Code that uses clang vector extensions can mark that it
2717 // wants unaligned accesses to be treated as fast by
2718 // underspecifying alignment to be 1 or 2.
2719 Alignment <= 2 ||
2720
2721 // Disregard v2i64. Memcpy lowering produces those and splitting
2722 // them regresses performance on micro-benchmarks and olden/bh.
2723 VT == MVT::v2i64;
2724 }
2725 return true;
2726}
2727
2728// Same as above but handling LLTs instead.
2729bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2730 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2731 unsigned *Fast) const {
2732 if (Subtarget->requiresStrictAlign())
2733 return false;
2734
2735 if (Fast) {
2736 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2737 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2738 Ty.getSizeInBytes() != 16 ||
2739 // See comments in performSTORECombine() for more details about
2740 // these conditions.
2741
2742 // Code that uses clang vector extensions can mark that it
2743 // wants unaligned accesses to be treated as fast by
2744 // underspecifying alignment to be 1 or 2.
2745 Alignment <= 2 ||
2746
2747 // Disregard v2i64. Memcpy lowering produces those and splitting
2748 // them regresses performance on micro-benchmarks and olden/bh.
2749 Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
2750 }
2751 return true;
2752}
2753
2754FastISel *
2755AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2756 const TargetLibraryInfo *libInfo) const {
2757 return AArch64::createFastISel(funcInfo, libInfo);
2758}
2759
2760MachineBasicBlock *
2761AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2762 MachineBasicBlock *MBB) const {
2763 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2764 // phi node:
2765
2766 // OrigBB:
2767 // [... previous instrs leading to comparison ...]
2768 // b.ne TrueBB
2769 // b EndBB
2770 // TrueBB:
2771 // ; Fallthrough
2772 // EndBB:
2773 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2774
2775 MachineFunction *MF = MBB->getParent();
2776 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2777 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2778 DebugLoc DL = MI.getDebugLoc();
2779 MachineFunction::iterator It = ++MBB->getIterator();
2780
2781 Register DestReg = MI.getOperand(i: 0).getReg();
2782 Register IfTrueReg = MI.getOperand(i: 1).getReg();
2783 Register IfFalseReg = MI.getOperand(i: 2).getReg();
2784 unsigned CondCode = MI.getOperand(i: 3).getImm();
2785 bool NZCVKilled = MI.getOperand(i: 4).isKill();
2786
2787 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(BB: LLVM_BB);
2788 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(BB: LLVM_BB);
2789 MF->insert(MBBI: It, MBB: TrueBB);
2790 MF->insert(MBBI: It, MBB: EndBB);
2791
2792 // Transfer rest of current basic-block to EndBB
2793 EndBB->splice(Where: EndBB->begin(), Other: MBB, From: std::next(x: MachineBasicBlock::iterator(MI)),
2794 To: MBB->end());
2795 EndBB->transferSuccessorsAndUpdatePHIs(FromMBB: MBB);
2796
2797 BuildMI(BB: MBB, MIMD: DL, MCID: TII->get(Opcode: AArch64::Bcc)).addImm(Val: CondCode).addMBB(MBB: TrueBB);
2798 BuildMI(BB: MBB, MIMD: DL, MCID: TII->get(Opcode: AArch64::B)).addMBB(MBB: EndBB);
2799 MBB->addSuccessor(Succ: TrueBB);
2800 MBB->addSuccessor(Succ: EndBB);
2801
2802 // TrueBB falls through to the end.
2803 TrueBB->addSuccessor(Succ: EndBB);
2804
2805 if (!NZCVKilled) {
2806 TrueBB->addLiveIn(PhysReg: AArch64::NZCV);
2807 EndBB->addLiveIn(PhysReg: AArch64::NZCV);
2808 }
2809
2810 BuildMI(BB&: *EndBB, I: EndBB->begin(), MIMD: DL, MCID: TII->get(Opcode: AArch64::PHI), DestReg)
2811 .addReg(RegNo: IfTrueReg)
2812 .addMBB(MBB: TrueBB)
2813 .addReg(RegNo: IfFalseReg)
2814 .addMBB(MBB);
2815
2816 MI.eraseFromParent();
2817 return EndBB;
2818}
2819
2820MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
2821 MachineInstr &MI, MachineBasicBlock *BB) const {
2822 assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2823 BB->getParent()->getFunction().getPersonalityFn())) &&
2824 "SEH does not use catchret!");
2825 return BB;
2826}
2827
2828MachineBasicBlock *
2829AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
2830 MachineBasicBlock *MBB) const {
2831 MachineFunction &MF = *MBB->getParent();
2832 MachineBasicBlock::iterator MBBI = MI.getIterator();
2833 const AArch64InstrInfo &TII =
2834 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2835 Register TargetReg = MI.getOperand(i: 0).getReg();
2836 MachineBasicBlock::iterator NextInst =
2837 TII.probedStackAlloc(MBBI, TargetReg, FrameSetup: false);
2838
2839 MI.eraseFromParent();
2840 return NextInst->getParent();
2841}
2842
2843MachineBasicBlock *
2844AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2845 MachineInstr &MI,
2846 MachineBasicBlock *BB) const {
2847 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2848 MachineInstrBuilder MIB = BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: Opc));
2849
2850 MIB.addReg(RegNo: BaseReg + MI.getOperand(i: 0).getImm(), flags: RegState::Define);
2851 MIB.add(MO: MI.getOperand(i: 1)); // slice index register
2852 MIB.add(MO: MI.getOperand(i: 2)); // slice index offset
2853 MIB.add(MO: MI.getOperand(i: 3)); // pg
2854 MIB.add(MO: MI.getOperand(i: 4)); // base
2855 MIB.add(MO: MI.getOperand(i: 5)); // offset
2856
2857 MI.eraseFromParent(); // The pseudo is gone now.
2858 return BB;
2859}
2860
2861MachineBasicBlock *
2862AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
2863 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2864 MachineInstrBuilder MIB =
2865 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::LDR_ZA));
2866
2867 MIB.addReg(RegNo: AArch64::ZA, flags: RegState::Define);
2868 MIB.add(MO: MI.getOperand(i: 0)); // Vector select register
2869 MIB.add(MO: MI.getOperand(i: 1)); // Vector select offset
2870 MIB.add(MO: MI.getOperand(i: 2)); // Base
2871 MIB.add(MO: MI.getOperand(i: 1)); // Offset, same as vector select offset
2872
2873 MI.eraseFromParent(); // The pseudo is gone now.
2874 return BB;
2875}
2876
2877MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI,
2878 MachineBasicBlock *BB,
2879 unsigned Opcode,
2880 bool Op0IsDef) const {
2881 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2882 MachineInstrBuilder MIB;
2883
2884 MIB = BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode))
2885 .addReg(RegNo: MI.getOperand(i: 0).getReg(), flags: Op0IsDef ? RegState::Define : 0);
2886 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2887 MIB.add(MO: MI.getOperand(i: I));
2888
2889 MI.eraseFromParent(); // The pseudo is gone now.
2890 return BB;
2891}
2892
2893MachineBasicBlock *
2894AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2895 MachineInstr &MI,
2896 MachineBasicBlock *BB) const {
2897 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2898 MachineInstrBuilder MIB = BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: Opc));
2899 unsigned StartIdx = 0;
2900
2901 bool HasTile = BaseReg != AArch64::ZA;
2902 bool HasZPROut = HasTile && MI.getOperand(i: 0).isReg();
2903 if (HasZPROut) {
2904 MIB.add(MO: MI.getOperand(i: StartIdx)); // Output ZPR
2905 ++StartIdx;
2906 }
2907 if (HasTile) {
2908 MIB.addReg(RegNo: BaseReg + MI.getOperand(i: StartIdx).getImm(),
2909 flags: RegState::Define); // Output ZA Tile
2910 MIB.addReg(RegNo: BaseReg + MI.getOperand(i: StartIdx).getImm()); // Input Za Tile
2911 StartIdx++;
2912 } else {
2913 // Avoid the ZA-array forms written as za.<sz>[Reg, Imm, ...] (a slice register followed by an immediate); only add an output ZPR when the destination operand is a register.
2914 if (MI.getOperand(i: 0).isReg() && !MI.getOperand(i: 1).isImm()) {
2915 MIB.add(MO: MI.getOperand(i: StartIdx)); // Output ZPR
2916 ++StartIdx;
2917 }
2918 MIB.addReg(RegNo: BaseReg, flags: RegState::Define).addReg(RegNo: BaseReg);
2919 }
2920 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
2921 MIB.add(MO: MI.getOperand(i: I));
2922
2923 MI.eraseFromParent(); // The pseudo is gone now.
2924 return BB;
2925}
2926
2927MachineBasicBlock *
2928AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
2929 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2930 MachineInstrBuilder MIB =
2931 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::ZERO_M));
2932 MIB.add(MO: MI.getOperand(i: 0)); // Mask
2933
2934 unsigned Mask = MI.getOperand(i: 0).getImm();
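// Each bit of the 8-bit mask selects one 64-bit ZA tile (ZAD0..ZAD7); record
// every selected tile as implicitly defined by the zeroing instruction.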
2935 for (unsigned I = 0; I < 8; I++) {
2936 if (Mask & (1 << I))
2937 MIB.addDef(RegNo: AArch64::ZAD0 + I, Flags: RegState::ImplicitDefine);
2938 }
2939
2940 MI.eraseFromParent(); // The pseudo is gone now.
2941 return BB;
2942}
2943
2944MachineBasicBlock *
2945AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI,
2946 MachineBasicBlock *BB) const {
2947 MachineFunction *MF = BB->getParent();
2948 MachineFrameInfo &MFI = MF->getFrameInfo();
2949 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
2950 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
2951 if (TPIDR2.Uses > 0) {
2952 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2953 // Store the buffer pointer to the TPIDR2 stack object.
2954 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::STRXui))
2955 .addReg(RegNo: MI.getOperand(i: 0).getReg())
2956 .addFrameIndex(Idx: TPIDR2.FrameIndex)
2957 .addImm(Val: 0);
2958 // Set the reserved bytes (10-15) to zero
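// (The halfword store at unsigned offset 5 covers bytes 10-11 and the word
// store at unsigned offset 3 covers bytes 12-15, as the offsets are scaled by
// the access size.)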
2959 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::STRHHui))
2960 .addReg(RegNo: AArch64::WZR)
2961 .addFrameIndex(Idx: TPIDR2.FrameIndex)
2962 .addImm(Val: 5);
2963 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::STRWui))
2964 .addReg(RegNo: AArch64::WZR)
2965 .addFrameIndex(Idx: TPIDR2.FrameIndex)
2966 .addImm(Val: 3);
2967 } else
2968 MFI.RemoveStackObject(ObjectIdx: TPIDR2.FrameIndex);
2969
2970 BB->remove_instr(I: &MI);
2971 return BB;
2972}
2973
2974MachineBasicBlock *
2975AArch64TargetLowering::EmitAllocateZABuffer(MachineInstr &MI,
2976 MachineBasicBlock *BB) const {
2977 MachineFunction *MF = BB->getParent();
2978 MachineFrameInfo &MFI = MF->getFrameInfo();
2979 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
2980 // TODO This function grows the stack with a subtraction, which doesn't work
2981 // on Windows. Some refactoring to share the functionality in
2982 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
2983 // supports SME
2984 assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() &&
2985 "Lazy ZA save is not yet supported on Windows");
2986
2987 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
2988
2989 if (TPIDR2.Uses > 0) {
2990 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2991 MachineRegisterInfo &MRI = MF->getRegInfo();
2992
2993 // The SUBXrs below won't always be emitted in a form that accepts SP
2994 // directly
2995 Register SP = MRI.createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
2996 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: SP)
2997 .addReg(RegNo: AArch64::SP);
2998
2999 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
3000 auto Size = MI.getOperand(i: 1).getReg();
3001 auto Dest = MI.getOperand(i: 0).getReg();
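// MSUB computes Dest = SP - Size * Size, reserving the Size * Size byte
// buffer noted above below the current stack pointer.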
3002 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::MSUBXrrr), DestReg: Dest)
3003 .addReg(RegNo: Size)
3004 .addReg(RegNo: Size)
3005 .addReg(RegNo: SP);
3006 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY),
3007 DestReg: AArch64::SP)
3008 .addReg(RegNo: Dest);
3009
3010 // We have just allocated a variable sized object, tell this to PEI.
3011 MFI.CreateVariableSizedObject(Alignment: Align(16), Alloca: nullptr);
3012 }
3013
3014 BB->remove_instr(I: &MI);
3015 return BB;
3016}
3017
3018// TODO: Find a way to merge this with EmitAllocateZABuffer.
3019MachineBasicBlock *
3020AArch64TargetLowering::EmitAllocateSMESaveBuffer(MachineInstr &MI,
3021 MachineBasicBlock *BB) const {
3022 MachineFunction *MF = BB->getParent();
3023 MachineFrameInfo &MFI = MF->getFrameInfo();
3024 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3025 assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() &&
3026 "Lazy ZA save is not yet supported on Windows");
3027
3028 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3029 if (FuncInfo->isSMESaveBufferUsed()) {
3030 // Allocate a buffer object of the size given by MI.getOperand(1).
3031 auto Size = MI.getOperand(i: 1).getReg();
3032 auto Dest = MI.getOperand(i: 0).getReg();
3033 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::SUBXrx64), DestReg: AArch64::SP)
3034 .addReg(RegNo: AArch64::SP)
3035 .addReg(RegNo: Size)
3036 .addImm(Val: AArch64_AM::getArithExtendImm(ET: AArch64_AM::UXTX, Imm: 0));
3037 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: Dest)
3038 .addReg(RegNo: AArch64::SP);
3039
3040 // We have just allocated a variable sized object, tell this to PEI.
3041 MFI.CreateVariableSizedObject(Alignment: Align(16), Alloca: nullptr);
3042 } else
3043 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF),
3044 DestReg: MI.getOperand(i: 0).getReg());
3045
3046 BB->remove_instr(I: &MI);
3047 return BB;
3048}
3049
3050MachineBasicBlock *
3051AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI,
3052 MachineBasicBlock *BB) const {
3053 // If the buffer is used, emit a call to __arm_sme_state_size(), which returns the required save-buffer size in X0; otherwise the size is simply zero.
3054 MachineFunction *MF = BB->getParent();
3055 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3056 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3057 if (FuncInfo->isSMESaveBufferUsed()) {
3058 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3059 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::BL))
3060 .addExternalSymbol(FnName: "__arm_sme_state_size")
3061 .addReg(RegNo: AArch64::X0, flags: RegState::ImplicitDefine)
3062 .addRegMask(Mask: TRI->getCallPreservedMask(
3063 MF: *MF, CallingConv::
3064 AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));
3065 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY),
3066 DestReg: MI.getOperand(i: 0).getReg())
3067 .addReg(RegNo: AArch64::X0);
3068 } else
3069 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY),
3070 DestReg: MI.getOperand(i: 0).getReg())
3071 .addReg(RegNo: AArch64::XZR);
3072 BB->remove_instr(I: &MI);
3073 return BB;
3074}
3075
3076MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
3077 MachineInstr &MI, MachineBasicBlock *BB) const {
3078
3079 int SMEOrigInstr = AArch64::getSMEPseudoMap(Opcode: MI.getOpcode());
3080 if (SMEOrigInstr != -1) {
3081 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3082 uint64_t SMEMatrixType =
3083 TII->get(Opcode: MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3084 switch (SMEMatrixType) {
3085 case (AArch64::SMEMatrixArray):
3086 return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZA, MI, BB);
3087 case (AArch64::SMEMatrixTileB):
3088 return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZAB0, MI, BB);
3089 case (AArch64::SMEMatrixTileH):
3090 return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZAH0, MI, BB);
3091 case (AArch64::SMEMatrixTileS):
3092 return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZAS0, MI, BB);
3093 case (AArch64::SMEMatrixTileD):
3094 return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZAD0, MI, BB);
3095 case (AArch64::SMEMatrixTileQ):
3096 return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZAQ0, MI, BB);
3097 }
3098 }
3099
3100 switch (MI.getOpcode()) {
3101 default:
3102#ifndef NDEBUG
3103 MI.dump();
3104#endif
3105 llvm_unreachable("Unexpected instruction for custom inserter!");
3106 case AArch64::InitTPIDR2Obj:
3107 return EmitInitTPIDR2Object(MI, BB);
3108 case AArch64::AllocateZABuffer:
3109 return EmitAllocateZABuffer(MI, BB);
3110 case AArch64::AllocateSMESaveBuffer:
3111 return EmitAllocateSMESaveBuffer(MI, BB);
3112 case AArch64::GetSMESaveSize:
3113 return EmitGetSMESaveSize(MI, BB);
3114 case AArch64::F128CSEL:
3115 return EmitF128CSEL(MI, MBB: BB);
3116 case TargetOpcode::STATEPOINT:
3117 // STATEPOINT is a pseudo instruction which has no implicit defs/uses,
3118 // while the BL call instruction (to which the statepoint is lowered at
3119 // the end) has an implicit def of LR. This def is early-clobber as it is
3120 // set at the moment of the call, earlier than any use is read.
3121 // Add this implicit dead def here as a workaround.
3122 MI.addOperand(MF&: *MI.getMF(),
3123 Op: MachineOperand::CreateReg(
3124 Reg: AArch64::LR, /*isDef*/ true,
3125 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3126 /*isUndef*/ false, /*isEarlyClobber*/ true));
3127 [[fallthrough]];
3128 case TargetOpcode::STACKMAP:
3129 case TargetOpcode::PATCHPOINT:
3130 return emitPatchPoint(MI, MBB: BB);
3131
3132 case TargetOpcode::PATCHABLE_EVENT_CALL:
3133 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3134 return BB;
3135
3136 case AArch64::CATCHRET:
3137 return EmitLoweredCatchRet(MI, BB);
3138
3139 case AArch64::PROBED_STACKALLOC_DYN:
3140 return EmitDynamicProbedAlloc(MI, MBB: BB);
3141
3142 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3143 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_H_B, BaseReg: AArch64::ZAB0, MI, BB);
3144 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3145 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_H_H, BaseReg: AArch64::ZAH0, MI, BB);
3146 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3147 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_H_S, BaseReg: AArch64::ZAS0, MI, BB);
3148 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3149 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_H_D, BaseReg: AArch64::ZAD0, MI, BB);
3150 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3151 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_H_Q, BaseReg: AArch64::ZAQ0, MI, BB);
3152 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3153 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_V_B, BaseReg: AArch64::ZAB0, MI, BB);
3154 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3155 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_V_H, BaseReg: AArch64::ZAH0, MI, BB);
3156 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3157 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_V_S, BaseReg: AArch64::ZAS0, MI, BB);
3158 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3159 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_V_D, BaseReg: AArch64::ZAD0, MI, BB);
3160 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3161 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_V_Q, BaseReg: AArch64::ZAQ0, MI, BB);
3162 case AArch64::LDR_ZA_PSEUDO:
3163 return EmitFill(MI, BB);
3164 case AArch64::LDR_TX_PSEUDO:
3165 return EmitZTInstr(MI, BB, Opcode: AArch64::LDR_TX, /*Op0IsDef=*/true);
3166 case AArch64::STR_TX_PSEUDO:
3167 return EmitZTInstr(MI, BB, Opcode: AArch64::STR_TX, /*Op0IsDef=*/false);
3168 case AArch64::ZERO_M_PSEUDO:
3169 return EmitZero(MI, BB);
3170 case AArch64::ZERO_T_PSEUDO:
3171 return EmitZTInstr(MI, BB, Opcode: AArch64::ZERO_T, /*Op0IsDef=*/true);
3172 case AArch64::MOVT_TIZ_PSEUDO:
3173 return EmitZTInstr(MI, BB, Opcode: AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
3174 }
3175}
3176
3177//===----------------------------------------------------------------------===//
3178// AArch64 Lowering private implementation.
3179//===----------------------------------------------------------------------===//
3180
3181//===----------------------------------------------------------------------===//
3182// Lowering Code
3183//===----------------------------------------------------------------------===//
3184
3185// Forward declarations of SVE fixed length lowering helpers
3186static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT);
3187static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
3188static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
3189static SDValue convertFixedMaskToScalableVector(SDValue Mask,
3190 SelectionDAG &DAG);
3191static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT);
3192static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
3193 EVT VT);
3194
3195/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3196static bool isZerosVector(const SDNode *N) {
3197 // Look through a bit convert.
3198 while (N->getOpcode() == ISD::BITCAST)
3199 N = N->getOperand(Num: 0).getNode();
3200
3201 if (ISD::isConstantSplatVectorAllZeros(N))
3202 return true;
3203
3204 if (N->getOpcode() != AArch64ISD::DUP)
3205 return false;
3206
3207 auto Opnd0 = N->getOperand(Num: 0);
3208 return isNullConstant(V: Opnd0) || isNullFPConstant(V: Opnd0);
3209}
3210
3211/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3212/// CC
3213static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
3214 switch (CC) {
3215 default:
3216 llvm_unreachable("Unknown condition code!");
3217 case ISD::SETNE:
3218 return AArch64CC::NE;
3219 case ISD::SETEQ:
3220 return AArch64CC::EQ;
3221 case ISD::SETGT:
3222 return AArch64CC::GT;
3223 case ISD::SETGE:
3224 return AArch64CC::GE;
3225 case ISD::SETLT:
3226 return AArch64CC::LT;
3227 case ISD::SETLE:
3228 return AArch64CC::LE;
3229 case ISD::SETUGT:
3230 return AArch64CC::HI;
3231 case ISD::SETUGE:
3232 return AArch64CC::HS;
3233 case ISD::SETULT:
3234 return AArch64CC::LO;
3235 case ISD::SETULE:
3236 return AArch64CC::LS;
3237 }
3238}
3239
3240/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
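/// If a single condition code cannot express the predicate, CondCode2 is set
/// to a second code that must be OR'ed with the first (e.g. SETONE maps to
/// MI or GT); otherwise CondCode2 is AL.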
3241static void changeFPCCToAArch64CC(ISD::CondCode CC,
3242 AArch64CC::CondCode &CondCode,
3243 AArch64CC::CondCode &CondCode2) {
3244 CondCode2 = AArch64CC::AL;
3245 switch (CC) {
3246 default:
3247 llvm_unreachable("Unknown FP condition!");
3248 case ISD::SETEQ:
3249 case ISD::SETOEQ:
3250 CondCode = AArch64CC::EQ;
3251 break;
3252 case ISD::SETGT:
3253 case ISD::SETOGT:
3254 CondCode = AArch64CC::GT;
3255 break;
3256 case ISD::SETGE:
3257 case ISD::SETOGE:
3258 CondCode = AArch64CC::GE;
3259 break;
3260 case ISD::SETOLT:
3261 CondCode = AArch64CC::MI;
3262 break;
3263 case ISD::SETOLE:
3264 CondCode = AArch64CC::LS;
3265 break;
3266 case ISD::SETONE:
3267 CondCode = AArch64CC::MI;
3268 CondCode2 = AArch64CC::GT;
3269 break;
3270 case ISD::SETO:
3271 CondCode = AArch64CC::VC;
3272 break;
3273 case ISD::SETUO:
3274 CondCode = AArch64CC::VS;
3275 break;
3276 case ISD::SETUEQ:
3277 CondCode = AArch64CC::EQ;
3278 CondCode2 = AArch64CC::VS;
3279 break;
3280 case ISD::SETUGT:
3281 CondCode = AArch64CC::HI;
3282 break;
3283 case ISD::SETUGE:
3284 CondCode = AArch64CC::PL;
3285 break;
3286 case ISD::SETLT:
3287 case ISD::SETULT:
3288 CondCode = AArch64CC::LT;
3289 break;
3290 case ISD::SETLE:
3291 case ISD::SETULE:
3292 CondCode = AArch64CC::LE;
3293 break;
3294 case ISD::SETNE:
3295 case ISD::SETUNE:
3296 CondCode = AArch64CC::NE;
3297 break;
3298 }
3299}
3300
3301/// Convert a DAG fp condition code to an AArch64 CC.
3302/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3303/// should be AND'ed instead of OR'ed.
3304static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
3305 AArch64CC::CondCode &CondCode,
3306 AArch64CC::CondCode &CondCode2) {
3307 CondCode2 = AArch64CC::AL;
3308 switch (CC) {
3309 default:
3310 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3311 assert(CondCode2 == AArch64CC::AL);
3312 break;
3313 case ISD::SETONE:
3314 // (a one b)
3315 // == ((a olt b) || (a ogt b))
3316 // == ((a ord b) && (a une b))
3317 CondCode = AArch64CC::VC;
3318 CondCode2 = AArch64CC::NE;
3319 break;
3320 case ISD::SETUEQ:
3321 // (a ueq b)
3322 // == ((a uno b) || (a oeq b))
3323 // == ((a ule b) && (a uge b))
3324 CondCode = AArch64CC::PL;
3325 CondCode2 = AArch64CC::LE;
3326 break;
3327 }
3328}
3329
3330/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3331/// CC usable with the vector instructions. Fewer operations are available
3332/// without a real NZCV register, so we have to use less efficient combinations
3333/// to get the same effect.
3334static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
3335 AArch64CC::CondCode &CondCode,
3336 AArch64CC::CondCode &CondCode2,
3337 bool &Invert) {
3338 Invert = false;
3339 switch (CC) {
3340 default:
3341 // Mostly the scalar mappings work fine.
3342 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3343 break;
3344 case ISD::SETUO:
3345 Invert = true;
3346 [[fallthrough]];
3347 case ISD::SETO:
3348 CondCode = AArch64CC::MI;
3349 CondCode2 = AArch64CC::GE;
3350 break;
3351 case ISD::SETUEQ:
3352 case ISD::SETULT:
3353 case ISD::SETULE:
3354 case ISD::SETUGT:
3355 case ISD::SETUGE:
3356 // All of the compare-mask comparisons are ordered, but we can switch
3357 // between the two by a double inversion. E.g. ULE == !OGT.
3358 Invert = true;
3359 changeFPCCToAArch64CC(CC: getSetCCInverse(Operation: CC, /* FP inverse */ Type: MVT::f32),
3360 CondCode, CondCode2);
3361 break;
3362 }
3363}
3364
3365static bool isLegalArithImmed(uint64_t C) {
3366 // Matches AArch64DAGToDAGISel::SelectArithImmed().
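// i.e. an unsigned 12-bit immediate, optionally shifted left by 12 bits
// (e.g. 0xFFF and 0xABC000 are legal arithmetic immediates, 0x1001 is not).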
3367 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3368 LLVM_DEBUG(dbgs() << "Is imm " << C
3369 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3370 return IsLegal;
3371}
3372
3373bool isLegalCmpImmed(APInt C) {
3374 // Works for negative immediates too, as it can be written as an ADDS
3375 // instruction with a negated immediate.
3376 return isLegalArithImmed(C: C.abs().getZExtValue());
3377}
3378
3379static bool isSafeSignedCMN(SDValue Op, SelectionDAG &DAG) {
3380 // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
3381 if (Op->getFlags().hasNoSignedWrap())
3382 return true;
3383
3384 // We can still figure out if the second operand is safe to use
3385 // in a CMN instruction by checking if it is known to be not the minimum
3386 // signed value. If it is not, then we can safely use CMN.
3387 // Note: We can eventually remove this check and simply rely on
3388 // Op->getFlags().hasNoSignedWrap() once SelectionDAG/ISelLowering
3389 // consistently sets them appropriately when making said nodes.
3390
3391 KnownBits KnownSrc = DAG.computeKnownBits(Op: Op.getOperand(i: 1));
3392 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3393}
3394
3395 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3396// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3397// can be set differently by this operation. It comes down to whether
3398// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3399// everything is fine. If not then the optimization is wrong. Thus general
3400// comparisons are only valid if op2 != 0 and op2 != INT_MIN.
3401//
3402// So, finally, the only LLVM-native comparisons that don't mention C or V
3403// are the ones that aren't unsigned comparisons. They're the only ones we can
3404// safely use CMN for in the absence of information about op2.
3405static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG) {
3406 return Op.getOpcode() == ISD::SUB && isNullConstant(V: Op.getOperand(i: 0)) &&
3407 (isIntEqualitySetCC(Code: CC) ||
3408 (isUnsignedIntSetCC(Code: CC) && DAG.isKnownNeverZero(Op: Op.getOperand(i: 1))) ||
3409 (isSignedIntSetCC(Code: CC) && isSafeSignedCMN(Op, DAG)));
3410}
3411
3412static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &DL,
3413 SelectionDAG &DAG, SDValue Chain,
3414 bool IsSignaling) {
3415 EVT VT = LHS.getValueType();
3416 assert(VT != MVT::f128);
3417
3418 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3419
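// There is no bf16 FCMP, and no f16 FCMP without full FP16 support, so
// promote such operands to f32 before emitting the strict comparison.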
3420 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3421 LHS = DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL, ResultTys: {MVT::f32, MVT::Other},
3422 Ops: {Chain, LHS});
3423 RHS = DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL, ResultTys: {MVT::f32, MVT::Other},
3424 Ops: {LHS.getValue(R: 1), RHS});
3425 Chain = RHS.getValue(R: 1);
3426 }
3427 unsigned Opcode =
3428 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3429 return DAG.getNode(Opcode, DL, ResultTys: {MVT::i32, MVT::Other}, Ops: {Chain, LHS, RHS});
3430}
3431
3432static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3433 const SDLoc &DL, SelectionDAG &DAG) {
3434 EVT VT = LHS.getValueType();
3435 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3436
3437 if (VT.isFloatingPoint()) {
3438 assert(VT != MVT::f128);
3439 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3440 LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: LHS);
3441 RHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: RHS);
3442 }
3443 return DAG.getNode(Opcode: AArch64ISD::FCMP, DL, VT: MVT::i32, N1: LHS, N2: RHS);
3444 }
3445
3446 // The CMP instruction is just an alias for SUBS, and representing it as
3447 // SUBS means that it's possible to get CSE with subtract operations.
3448 // A later phase can perform the optimization of setting the destination
3449 // register to WZR/XZR if it ends up being unused.
3450 unsigned Opcode = AArch64ISD::SUBS;
3451
3452 if (isCMN(Op: RHS, CC, DAG)) {
3453 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3454 Opcode = AArch64ISD::ADDS;
3455 RHS = RHS.getOperand(i: 1);
3456 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(V: LHS.getOperand(i: 0)) &&
3457 isIntEqualitySetCC(Code: CC)) {
3458 // As we are looking for EQ/NE compares, the operands can be commuted; can
3459 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3460 Opcode = AArch64ISD::ADDS;
3461 LHS = LHS.getOperand(i: 1);
3462 } else if (isNullConstant(V: RHS) && !isUnsignedIntSetCC(Code: CC)) {
3463 if (LHS.getOpcode() == ISD::AND) {
3464 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3465 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3466 // of the signed comparisons.
3467 const SDValue ANDSNode =
3468 DAG.getNode(Opcode: AArch64ISD::ANDS, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT_CC),
3469 N1: LHS.getOperand(i: 0), N2: LHS.getOperand(i: 1));
3470 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3471 DAG.ReplaceAllUsesWith(From: LHS, To: ANDSNode);
3472 return ANDSNode.getValue(R: 1);
3473 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3474 // Use result of ANDS
3475 return LHS.getValue(R: 1);
3476 }
3477 }
3478
3479 return DAG.getNode(Opcode, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT_CC), N1: LHS, N2: RHS)
3480 .getValue(R: 1);
3481}
3482
3483/// \defgroup AArch64CCMP CMP;CCMP matching
3484///
3485/// These functions deal with the formation of CMP;CCMP;... sequences.
3486/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3487/// a comparison. They set the NZCV flags to a predefined value if their
3488 /// predicate is false. This makes it possible to express arbitrary conjunctions, for
3489/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3490/// expressed as:
3491/// cmp A
3492/// ccmp B, inv(CB), CA
3493/// check for CB flags
3494///
3495/// This naturally lets us implement chains of AND operations with SETCC
3496/// operands. And we can even implement some other situations by transforming
3497/// them:
3498/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3499 /// negating the flags used in the CCMP/FCCMP operation.
3500/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3501/// by negating the flags we test for afterwards. i.e.
3502/// NEG (CMP CCMP CCCMP ...) can be implemented.
3503/// - Note that we can only ever negate all previously processed results.
3504 /// What we cannot implement by flipping the flags to test is a negation
3505/// of two sub-trees (because the negation affects all sub-trees emitted so
3506/// far, so the 2nd sub-tree we emit would also affect the first).
3507/// With those tools we can implement some OR operations:
3508/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3509/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3510/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3511/// elimination rules from earlier to implement the whole thing as a
3512/// CCMP/FCCMP chain.
3513///
3514/// As complete example:
3515/// or (or (setCA (cmp A)) (setCB (cmp B)))
3516/// (and (setCC (cmp C)) (setCD (cmp D)))"
3517/// can be reassociated to:
3518 /// or (and (setCC (cmp C)) (setCD (cmp D)))
3519 /// (or (setCA (cmp A)) (setCB (cmp B)))
3520/// can be transformed to:
3521/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3522/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3523/// which can be implemented as:
3524/// cmp C
3525/// ccmp D, inv(CD), CC
3526/// ccmp A, CA, inv(CD)
3527/// ccmp B, CB, inv(CA)
3528/// check for CB flags
3529///
3530/// A counterexample is "or (and A B) (and C D)" which translates to
3531 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); here we
3532 /// can implement only one of the inner (not) operations, not both!
3533/// @{
3534
3535/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3536static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3537 ISD::CondCode CC, SDValue CCOp,
3538 AArch64CC::CondCode Predicate,
3539 AArch64CC::CondCode OutCC,
3540 const SDLoc &DL, SelectionDAG &DAG) {
3541 unsigned Opcode = 0;
3542 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3543
3544 if (LHS.getValueType().isFloatingPoint()) {
3545 assert(LHS.getValueType() != MVT::f128);
3546 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3547 LHS.getValueType() == MVT::bf16) {
3548 LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: LHS);
3549 RHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: RHS);
3550 }
3551 Opcode = AArch64ISD::FCCMP;
3552 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Val&: RHS)) {
3553 APInt Imm = Const->getAPIntValue();
3554 if (Imm.isNegative() && Imm.sgt(RHS: -32)) {
3555 Opcode = AArch64ISD::CCMN;
3556 RHS = DAG.getConstant(Val: Imm.abs(), DL, VT: Const->getValueType(ResNo: 0));
3557 }
3558 } else if (isCMN(Op: RHS, CC, DAG)) {
3559 Opcode = AArch64ISD::CCMN;
3560 RHS = RHS.getOperand(i: 1);
3561 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(V: LHS.getOperand(i: 0)) &&
3562 isIntEqualitySetCC(Code: CC)) {
3563 // As we are looking for EQ/NE compares, the operands can be commuted; can
3564 // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction?
3565 Opcode = AArch64ISD::CCMN;
3566 LHS = LHS.getOperand(i: 1);
3567 }
3568 if (Opcode == 0)
3569 Opcode = AArch64ISD::CCMP;
3570
3571 SDValue Condition = DAG.getConstant(Val: Predicate, DL, VT: MVT_CC);
3572 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(Code: OutCC);
3573 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: InvOutCC);
3574 SDValue NZCVOp = DAG.getConstant(Val: NZCV, DL, VT: MVT::i32);
3575 return DAG.getNode(Opcode, DL, VT: MVT_CC, N1: LHS, N2: RHS, N3: NZCVOp, N4: Condition, N5: CCOp);
3576}
3577
3578/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3579/// expressed as a conjunction. See \ref AArch64CCMP.
3580/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3581/// changing the conditions on the SETCC tests.
3582/// (this means we can call emitConjunctionRec() with
3583/// Negate==true on this sub-tree)
3584/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3585/// cannot do the negation naturally. We are required to
3586/// emit the subtree first in this case.
3587 /// \param WillNegate Is true if we are called when the result of this
3588/// subexpression must be negated. This happens when the
3589/// outer expression is an OR. We can use this fact to know
3590/// that we have a double negation (or (or ...) ...) that
3591/// can be implemented for free.
3592static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3593 bool &MustBeFirst, bool WillNegate,
3594 unsigned Depth = 0) {
3595 if (!Val.hasOneUse())
3596 return false;
3597 unsigned Opcode = Val->getOpcode();
3598 if (Opcode == ISD::SETCC) {
3599 if (Val->getOperand(Num: 0).getValueType() == MVT::f128)
3600 return false;
3601 CanNegate = true;
3602 MustBeFirst = false;
3603 return true;
3604 }
3605 // Protect against exponential runtime and stack overflow.
3606 if (Depth > 6)
3607 return false;
3608 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3609 bool IsOR = Opcode == ISD::OR;
3610 SDValue O0 = Val->getOperand(Num: 0);
3611 SDValue O1 = Val->getOperand(Num: 1);
3612 bool CanNegateL;
3613 bool MustBeFirstL;
3614 if (!canEmitConjunction(Val: O0, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR, Depth: Depth+1))
3615 return false;
3616 bool CanNegateR;
3617 bool MustBeFirstR;
3618 if (!canEmitConjunction(Val: O1, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR, Depth: Depth+1))
3619 return false;
3620
3621 if (MustBeFirstL && MustBeFirstR)
3622 return false;
3623
3624 if (IsOR) {
3625 // For an OR expression we need to be able to naturally negate at least
3626 // one side or we cannot do the transformation at all.
3627 if (!CanNegateL && !CanNegateR)
3628 return false;
3629 // If the result of the OR will be negated and we can naturally negate
3630 // the leaves, then this sub-tree as a whole negates naturally.
3631 CanNegate = WillNegate && CanNegateL && CanNegateR;
3632 // If we cannot naturally negate the whole sub-tree, then this must be
3633 // emitted first.
3634 MustBeFirst = !CanNegate;
3635 } else {
3636 assert(Opcode == ISD::AND && "Must be OR or AND");
3637 // We cannot naturally negate an AND operation.
3638 CanNegate = false;
3639 MustBeFirst = MustBeFirstL || MustBeFirstR;
3640 }
3641 return true;
3642 }
3643 return false;
3644}
3645
3646 /// Emit a conjunction or disjunction tree with the CMP/FCMP followed by a
3647 /// chain of CCMP/CFCMP ops. See \ref AArch64CCMP.
3648 /// Tries to transform the given i1 producing node @p Val to a series of
3649 /// compare and conditional compare operations. @returns an NZCV flags
3650 /// producing node and sets @p OutCC to the flags that should be tested, or
3651 /// returns SDValue() if the transformation was not possible.
3652 /// \p Negate is true if we want this sub-tree to be negated just by changing
3653 /// SETCC conditions.
3654static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3655 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3656 AArch64CC::CondCode Predicate) {
3657 // If we're at a tree leaf, produce a compare or conditional compare operation.
3658 unsigned Opcode = Val->getOpcode();
3659 if (Opcode == ISD::SETCC) {
3660 SDValue LHS = Val->getOperand(Num: 0);
3661 SDValue RHS = Val->getOperand(Num: 1);
3662 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Val->getOperand(Num: 2))->get();
3663 bool isInteger = LHS.getValueType().isInteger();
3664 if (Negate)
3665 CC = getSetCCInverse(Operation: CC, Type: LHS.getValueType());
3666 SDLoc DL(Val);
3667 // Determine OutCC and handle FP special case.
3668 if (isInteger) {
3669 OutCC = changeIntCCToAArch64CC(CC);
3670 } else {
3671 assert(LHS.getValueType().isFloatingPoint());
3672 AArch64CC::CondCode ExtraCC;
3673 changeFPCCToANDAArch64CC(CC, CondCode&: OutCC, CondCode2&: ExtraCC);
3674 // Some floating point conditions can't be tested with a single condition
3675 // code. Construct an additional comparison in this case.
3676 if (ExtraCC != AArch64CC::AL) {
3677 SDValue ExtraCmp;
3678 if (!CCOp.getNode())
3679 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3680 else
3681 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3682 OutCC: ExtraCC, DL, DAG);
3683 CCOp = ExtraCmp;
3684 Predicate = ExtraCC;
3685 }
3686 }
3687
3688 // Produce a normal comparison if we are first in the chain
3689 if (!CCOp)
3690 return emitComparison(LHS, RHS, CC, DL, DAG);
3691 // Otherwise produce a ccmp.
3692 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3693 DAG);
3694 }
3695 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3696
3697 bool IsOR = Opcode == ISD::OR;
3698
3699 SDValue LHS = Val->getOperand(Num: 0);
3700 bool CanNegateL;
3701 bool MustBeFirstL;
3702 bool ValidL = canEmitConjunction(Val: LHS, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR);
3703 assert(ValidL && "Valid conjunction/disjunction tree");
3704 (void)ValidL;
3705
3706 SDValue RHS = Val->getOperand(Num: 1);
3707 bool CanNegateR;
3708 bool MustBeFirstR;
3709 bool ValidR = canEmitConjunction(Val: RHS, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR);
3710 assert(ValidR && "Valid conjunction/disjunction tree");
3711 (void)ValidR;
3712
3713 // Swap sub-tree that must come first to the right side.
3714 if (MustBeFirstL) {
3715 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3716 std::swap(a&: LHS, b&: RHS);
3717 std::swap(a&: CanNegateL, b&: CanNegateR);
3718 std::swap(a&: MustBeFirstL, b&: MustBeFirstR);
3719 }
3720
3721 bool NegateR;
3722 bool NegateAfterR;
3723 bool NegateL;
3724 bool NegateAfterAll;
3725 if (Opcode == ISD::OR) {
3726 // Swap the sub-tree that we can negate naturally to the left.
3727 if (!CanNegateL) {
3728 assert(CanNegateR && "at least one side must be negatable");
3729 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3730 assert(!Negate);
3731 std::swap(a&: LHS, b&: RHS);
3732 NegateR = false;
3733 NegateAfterR = true;
3734 } else {
3735 // Negate the right sub-tree if possible, otherwise negate its result afterwards.
3736 NegateR = CanNegateR;
3737 NegateAfterR = !CanNegateR;
3738 }
3739 NegateL = true;
3740 NegateAfterAll = !Negate;
3741 } else {
3742 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3743 assert(!Negate && "Valid conjunction/disjunction tree");
3744
3745 NegateL = false;
3746 NegateR = false;
3747 NegateAfterR = false;
3748 NegateAfterAll = false;
3749 }
3750
3751 // Emit sub-trees.
3752 AArch64CC::CondCode RHSCC;
3753 SDValue CmpR = emitConjunctionRec(DAG, Val: RHS, OutCC&: RHSCC, Negate: NegateR, CCOp, Predicate);
3754 if (NegateAfterR)
3755 RHSCC = AArch64CC::getInvertedCondCode(Code: RHSCC);
3756 SDValue CmpL = emitConjunctionRec(DAG, Val: LHS, OutCC, Negate: NegateL, CCOp: CmpR, Predicate: RHSCC);
3757 if (NegateAfterAll)
3758 OutCC = AArch64CC::getInvertedCondCode(Code: OutCC);
3759 return CmpL;
3760}
3761
3762/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
3763/// In some cases this is even possible with OR operations in the expression.
3764/// See \ref AArch64CCMP.
3765/// \see emitConjunctionRec().
3766static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3767 AArch64CC::CondCode &OutCC) {
3768 bool DummyCanNegate;
3769 bool DummyMustBeFirst;
3770 if (!canEmitConjunction(Val, CanNegate&: DummyCanNegate, MustBeFirst&: DummyMustBeFirst, WillNegate: false))
3771 return SDValue();
3772
3773 return emitConjunctionRec(DAG, Val, OutCC, Negate: false, CCOp: SDValue(), Predicate: AArch64CC::AL);
3774}
3775
3776/// @}
3777
3778/// Returns how profitable it is to fold a comparison's operand's shift and/or
3779/// extension operations.
3780static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3781 auto isSupportedExtend = [&](SDValue V) {
3782 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3783 return true;
3784
3785 if (V.getOpcode() == ISD::AND)
3786 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(Val: V.getOperand(i: 1))) {
3787 uint64_t Mask = MaskCst->getZExtValue();
3788 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3789 }
3790
3791 return false;
3792 };
3793
3794 if (!Op.hasOneUse())
3795 return 0;
3796
3797 if (isSupportedExtend(Op))
3798 return 1;
3799
3800 unsigned Opc = Op.getOpcode();
3801 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3802 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
3803 uint64_t Shift = ShiftCst->getZExtValue();
3804 if (isSupportedExtend(Op.getOperand(i: 0)))
3805 return (Shift <= 4) ? 2 : 1;
3806 EVT VT = Op.getValueType();
3807 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3808 return 1;
3809 }
3810
3811 return 0;
3812}
3813
3814 // Convert a comparison against one or negative one into a comparison against
3815 // zero, so that it folds better with emitComparison(). Note that this only
3816 // works for signed comparisons because of how ANDS works.
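// For example, ((x & m) slt 1) is rewritten as ((x & m) sle 0), and
// ((x & m) sgt -1) as ((x & m) sge 0), so the AND can later be selected as an
// ANDS whose flags are compared against zero.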
3817static bool shouldBeAdjustedToZero(SDValue LHS, APInt C, ISD::CondCode &CC) {
3818 // Only works for ANDS and AND.
3819 if (LHS.getOpcode() != ISD::AND && LHS.getOpcode() != AArch64ISD::ANDS)
3820 return false;
3821
3822 if (C.isOne() && (CC == ISD::SETLT || CC == ISD::SETGE)) {
3823 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3824 return true;
3825 }
3826
3827 if (C.isAllOnes() && (CC == ISD::SETLE || CC == ISD::SETGT)) {
3828 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3829 return true;
3830 }
3831
3832 return false;
3833}
3834
3835static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3836 SDValue &AArch64cc, SelectionDAG &DAG,
3837 const SDLoc &DL) {
3838 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Val: RHS.getNode())) {
3839 EVT VT = RHS.getValueType();
3840 APInt C = RHSC->getAPIntValue();
3841 // shouldBeAdjustedToZero is a special case to better fold with
3842 // emitComparison().
3843 if (shouldBeAdjustedToZero(LHS, C, CC)) {
3844 // Adjust the constant to zero.
3845 // CC has already been adjusted.
3846 RHS = DAG.getConstant(Val: 0, DL, VT);
3847 } else if (!isLegalCmpImmed(C)) {
3848 // Constant does not fit; try adjusting it by one.
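// For example (illustrative), 4097 is not a legal compare immediate (it is
// neither a 12-bit value nor a 12-bit value shifted left by 12), but
// "w0 slt 4097" is equivalent to "w0 sle 4096", and 4096 (0x1 << 12) is
// encodable, so the SETLT case below retries with C - 1.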
3849 switch (CC) {
3850 default:
3851 break;
3852 case ISD::SETLT:
3853 case ISD::SETGE:
3854 if (!C.isMinSignedValue()) {
3855 APInt CMinusOne = C - 1;
3856 if (isLegalCmpImmed(C: CMinusOne)) {
3857 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3858 RHS = DAG.getConstant(Val: CMinusOne, DL, VT);
3859 }
3860 }
3861 break;
3862 case ISD::SETULT:
3863 case ISD::SETUGE:
3864 if (!C.isZero()) {
3865 APInt CMinusOne = C - 1;
3866 if (isLegalCmpImmed(C: CMinusOne)) {
3867 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3868 RHS = DAG.getConstant(Val: CMinusOne, DL, VT);
3869 }
3870 }
3871 break;
3872 case ISD::SETLE:
3873 case ISD::SETGT:
3874 if (!C.isMaxSignedValue()) {
3875 APInt CPlusOne = C + 1;
3876 if (isLegalCmpImmed(C: CPlusOne)) {
3877 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3878 RHS = DAG.getConstant(Val: CPlusOne, DL, VT);
3879 }
3880 }
3881 break;
3882 case ISD::SETULE:
3883 case ISD::SETUGT:
3884 if (!C.isAllOnes()) {
3885 APInt CPlusOne = C + 1;
3886 if (isLegalCmpImmed(C: CPlusOne)) {
3887 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3888 RHS = DAG.getConstant(Val: CPlusOne, DL, VT);
3889 }
3890 }
3891 break;
3892 }
3893 }
3894 }
3895
3896 // Comparisons are canonicalized so that the RHS operand is simpler than the
3897 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3898 // can fold some shift+extend operations on the RHS operand, so swap the
3899 // operands if that can be done.
3900 //
3901 // For example:
3902 // lsl w13, w11, #1
3903 // cmp w13, w12
3904 // can be turned into:
3905 // cmp w12, w11, lsl #1
3906 if (!isa<ConstantSDNode>(Val: RHS) || !isLegalCmpImmed(C: RHS->getAsAPIntVal())) {
3907 bool LHSIsCMN = isCMN(Op: LHS, CC, DAG);
3908 bool RHSIsCMN = isCMN(Op: RHS, CC, DAG);
3909 SDValue TheLHS = LHSIsCMN ? LHS.getOperand(i: 1) : LHS;
3910 SDValue TheRHS = RHSIsCMN ? RHS.getOperand(i: 1) : RHS;
3911
3912 if (getCmpOperandFoldingProfit(Op: TheLHS) + (LHSIsCMN ? 1 : 0) >
3913 getCmpOperandFoldingProfit(Op: TheRHS) + (RHSIsCMN ? 1 : 0)) {
3914 std::swap(a&: LHS, b&: RHS);
3915 CC = ISD::getSetCCSwappedOperands(Operation: CC);
3916 }
3917 }
3918
3919 SDValue Cmp;
3920 AArch64CC::CondCode AArch64CC;
3921 if (isIntEqualitySetCC(Code: CC) && isa<ConstantSDNode>(Val: RHS)) {
3922 const ConstantSDNode *RHSC = cast<ConstantSDNode>(Val&: RHS);
3923
3924 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3925 // For the i8 operand, the largest immediate is 255, so this can be easily
3926 // encoded in the compare instruction. For the i16 operand, however, the
3927 // largest immediate cannot be encoded in the compare.
3928 // Therefore, use a sign extending load and cmn to avoid materializing the
3929 // -1 constant. For example,
3930 // movz w1, #65535
3931 // ldrh w0, [x0, #0]
3932 // cmp w0, w1
3933 // >
3934 // ldrsh w0, [x0, #0]
3935 // cmn w0, #1
3936 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3937 // if and only if (sext LHS) == (sext RHS). The checks are in place to
3938 // ensure both the LHS and RHS are truly zero extended and to make sure the
3939 // transformation is profitable.
3940 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(Val: LHS) &&
3941 cast<LoadSDNode>(Val&: LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3942 cast<LoadSDNode>(Val&: LHS)->getMemoryVT() == MVT::i16 &&
3943 LHS.getNode()->hasNUsesOfValue(NUses: 1, Value: 0)) {
3944 int16_t ValueofRHS = RHS->getAsZExtVal();
3945 if (ValueofRHS < 0 && isLegalArithImmed(C: -ValueofRHS)) {
3946 SDValue SExt =
3947 DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: LHS.getValueType(), N1: LHS,
3948 N2: DAG.getValueType(MVT::i16));
3949 Cmp = emitComparison(
3950 LHS: SExt, RHS: DAG.getSignedConstant(Val: ValueofRHS, DL, VT: RHS.getValueType()), CC,
3951 DL, DAG);
3952 AArch64CC = changeIntCCToAArch64CC(CC);
3953 }
3954 }
3955
3956 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3957 if ((Cmp = emitConjunction(DAG, Val: LHS, OutCC&: AArch64CC))) {
3958 if ((CC == ISD::SETNE) ^ RHSC->isZero())
3959 AArch64CC = AArch64CC::getInvertedCondCode(Code: AArch64CC);
3960 }
3961 }
3962 }
3963
3964 if (!Cmp) {
3965 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
3966 AArch64CC = changeIntCCToAArch64CC(CC);
3967 }
3968 AArch64cc = DAG.getConstant(Val: AArch64CC, DL, VT: MVT_CC);
3969 return Cmp;
3970}
3971
3972static std::pair<SDValue, SDValue>
3973getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3974 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3975 "Unsupported value type");
3976 SDValue Value, Overflow;
3977 SDLoc DL(Op);
3978 SDValue LHS = Op.getOperand(i: 0);
3979 SDValue RHS = Op.getOperand(i: 1);
3980 unsigned Opc = 0;
3981 switch (Op.getOpcode()) {
3982 default:
3983 llvm_unreachable("Unknown overflow instruction!");
3984 case ISD::SADDO:
3985 Opc = AArch64ISD::ADDS;
3986 CC = AArch64CC::VS;
3987 break;
3988 case ISD::UADDO:
3989 Opc = AArch64ISD::ADDS;
3990 CC = AArch64CC::HS;
3991 break;
3992 case ISD::SSUBO:
3993 Opc = AArch64ISD::SUBS;
3994 CC = AArch64CC::VS;
3995 break;
3996 case ISD::USUBO:
3997 Opc = AArch64ISD::SUBS;
3998 CC = AArch64CC::LO;
3999 break;
4000 // Multiply needs a little bit of extra work.
4001 case ISD::SMULO:
4002 case ISD::UMULO: {
4003 CC = AArch64CC::NE;
4004 bool IsSigned = Op.getOpcode() == ISD::SMULO;
4005 if (Op.getValueType() == MVT::i32) {
4006 // Extend to 64-bits, then perform a 64-bit multiply.
4007 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4008 LHS = DAG.getNode(Opcode: ExtendOpc, DL, VT: MVT::i64, Operand: LHS);
4009 RHS = DAG.getNode(Opcode: ExtendOpc, DL, VT: MVT::i64, Operand: RHS);
4010 SDValue Mul = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i64, N1: LHS, N2: RHS);
4011 Value = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i32, Operand: Mul);
4012
4013 // Check that the result fits into a 32-bit integer.
4014 SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: MVT_CC);
4015 if (IsSigned) {
4016 // cmp xreg, wreg, sxtw
4017 SDValue SExtMul = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: MVT::i64, Operand: Value);
4018 Overflow =
4019 DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs, N1: Mul, N2: SExtMul).getValue(R: 1);
4020 } else {
4021 // tst xreg, #0xffffffff00000000
4022 SDValue UpperBits = DAG.getConstant(Val: 0xFFFFFFFF00000000, DL, VT: MVT::i64);
4023 Overflow =
4024 DAG.getNode(Opcode: AArch64ISD::ANDS, DL, VTList: VTs, N1: Mul, N2: UpperBits).getValue(R: 1);
4025 }
4026 break;
4027 }
4028 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4029 // For the 64-bit multiply:
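// Overflow is detected from the high half of the 128-bit product: for the
// signed case the product overflows iff the high 64 bits differ from the
// sign-extension of the low 64 bits, and for the unsigned case iff the high
// 64 bits are non-zero (e.g. 2^32 * 2^32 has a low half of 0 and a high half
// of 1, so it overflows).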
4030 Value = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i64, N1: LHS, N2: RHS);
4031 if (IsSigned) {
4032 SDValue UpperBits = DAG.getNode(Opcode: ISD::MULHS, DL, VT: MVT::i64, N1: LHS, N2: RHS);
4033 SDValue LowerBits = DAG.getNode(Opcode: ISD::SRA, DL, VT: MVT::i64, N1: Value,
4034 N2: DAG.getConstant(Val: 63, DL, VT: MVT::i64));
4035 // It is important that LowerBits is last, otherwise the arithmetic
4036 // shift will not be folded into the compare (SUBS).
4037 SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: MVT::i32);
4038 Overflow = DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs, N1: UpperBits, N2: LowerBits)
4039 .getValue(R: 1);
4040 } else {
4041 SDValue UpperBits = DAG.getNode(Opcode: ISD::MULHU, DL, VT: MVT::i64, N1: LHS, N2: RHS);
4042 SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: MVT::i32);
4043 Overflow =
4044 DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs,
4045 N1: DAG.getConstant(Val: 0, DL, VT: MVT::i64),
4046 N2: UpperBits).getValue(R: 1);
4047 }
4048 break;
4049 }
4050 } // switch (...)
4051
4052 if (Opc) {
4053 SDVTList VTs = DAG.getVTList(VT1: Op->getValueType(ResNo: 0), VT2: MVT::i32);
4054
4055 // Emit the AArch64 operation with overflow check.
4056 Value = DAG.getNode(Opcode: Opc, DL, VTList: VTs, N1: LHS, N2: RHS);
4057 Overflow = Value.getValue(R: 1);
4058 }
4059 return std::make_pair(x&: Value, y&: Overflow);
4060}
4061
4062SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4063 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
4064 OverrideNEON: !Subtarget->isNeonAvailable()))
4065 return LowerToScalableOp(Op, DAG);
4066
4067 SDValue Sel = Op.getOperand(i: 0);
4068 SDValue Other = Op.getOperand(i: 1);
4069 SDLoc DL(Sel);
4070
4071 // If the operand is an overflow checking operation, invert the condition
4072 // code and kill the Not operation. I.e., transform:
4073 // (xor overflow_op_bool, 1)
4074 // -->
4075 // (csel 1, 0, invert(cc), overflow_op_bool)
4076 // ... which later gets transformed to just a cset instruction with an
4077 // inverted condition code, rather than a cset + eor sequence.
4078 if (isOneConstant(V: Other) && ISD::isOverflowIntrOpRes(Op: Sel)) {
4079 // Only lower legal XALUO ops.
4080 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: Sel->getValueType(ResNo: 0)))
4081 return SDValue();
4082
4083 SDValue TVal = DAG.getConstant(Val: 1, DL, VT: MVT::i32);
4084 SDValue FVal = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
4085 AArch64CC::CondCode CC;
4086 SDValue Value, Overflow;
4087 std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC, Op: Sel.getValue(R: 0), DAG);
4088 SDValue CCVal = DAG.getConstant(Val: getInvertedCondCode(Code: CC), DL, VT: MVT::i32);
4089 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: Op.getValueType(), N1: TVal, N2: FVal,
4090 N3: CCVal, N4: Overflow);
4091 }
4092 // If neither operand is a SELECT_CC, give up.
4093 if (Sel.getOpcode() != ISD::SELECT_CC)
4094 std::swap(a&: Sel, b&: Other);
4095 if (Sel.getOpcode() != ISD::SELECT_CC)
4096 return Op;
4097
4098 // The folding we want to perform is:
4099 // (xor x, (select_cc a, b, cc, 0, -1) )
4100 // -->
4101 // (csel x, (xor x, -1), cc ...)
4102 //
4103 // The latter will get matched to a CSINV instruction.
4104
4105 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Sel.getOperand(i: 4))->get();
4106 SDValue LHS = Sel.getOperand(i: 0);
4107 SDValue RHS = Sel.getOperand(i: 1);
4108 SDValue TVal = Sel.getOperand(i: 2);
4109 SDValue FVal = Sel.getOperand(i: 3);
4110
4111 // FIXME: This could be generalized to non-integer comparisons.
4112 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4113 return Op;
4114
4115 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(Val&: FVal);
4116 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(Val&: TVal);
4117
4118 // The values aren't constants, this isn't the pattern we're looking for.
4119 if (!CFVal || !CTVal)
4120 return Op;
4121
4122 // We can commute the SELECT_CC by inverting the condition. This
4123 // might be needed to make this fit into a CSINV pattern.
4124 if (CTVal->isAllOnes() && CFVal->isZero()) {
4125 std::swap(a&: TVal, b&: FVal);
4126 std::swap(a&: CTVal, b&: CFVal);
4127 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
4128 }
4129
4130 // If the constants line up, perform the transform!
4131 if (CTVal->isZero() && CFVal->isAllOnes()) {
4132 SDValue CCVal;
4133 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, AArch64cc&: CCVal, DAG, DL);
4134
4135 FVal = Other;
4136 TVal = DAG.getNode(Opcode: ISD::XOR, DL, VT: Other.getValueType(), N1: Other,
4137 N2: DAG.getAllOnesConstant(DL, VT: Other.getValueType()));
4138
4139 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: Sel.getValueType(), N1: FVal, N2: TVal,
4140 N3: CCVal, N4: Cmp);
4141 }
4142
4143 return Op;
4144}
4145
4146// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4147// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4148// sets 'C' bit to 0.
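// This relies on how SUBS computes the carry flag: C is set iff the
// subtraction does not borrow (unsigned Op0 >= Op1). With Invert == false we
// emit SUBS Value, #1, so C == 1 exactly when Value != 0; with Invert == true
// we emit SUBS 0, Value, so C == 1 exactly when Value == 0.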
4149static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
4150 SDLoc DL(Value);
4151 EVT VT = Value.getValueType();
4152 SDValue Op0 = Invert ? DAG.getConstant(Val: 0, DL, VT) : Value;
4153 SDValue Op1 = Invert ? Value : DAG.getConstant(Val: 1, DL, VT);
4154 SDValue Cmp =
4155 DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT::Glue), N1: Op0, N2: Op1);
4156 return Cmp.getValue(R: 1);
4157}
4158
4159// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4160// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4161static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
4162 bool Invert) {
4163 assert(Glue.getResNo() == 1);
4164 SDLoc DL(Glue);
4165 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
4166 SDValue One = DAG.getConstant(Val: 1, DL, VT);
4167 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
4168 SDValue CC = DAG.getConstant(Val: Cond, DL, VT: MVT::i32);
4169 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: One, N2: Zero, N3: CC, N4: Glue);
4170}
4171
4172// Value is 1 if 'V' bit of NZCV is 1, else 0
4173static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
4174 assert(Glue.getResNo() == 1);
4175 SDLoc DL(Glue);
4176 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
4177 SDValue One = DAG.getConstant(Val: 1, DL, VT);
4178 SDValue CC = DAG.getConstant(Val: AArch64CC::VS, DL, VT: MVT::i32);
4179 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: One, N2: Zero, N3: CC, N4: Glue);
4180}
4181
4182// This lowering is inefficient, but it will get cleaned up by
4183// `foldOverflowCheck`
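// The shape produced here is: a SUBS to turn the incoming carry value back
// into the C flag, the ADCS/SBCS itself, and a CSEL to turn the resulting
// carry/overflow flag back into a value. When the outgoing flag only feeds
// another carry-using operation, foldOverflowCheck is expected to remove the
// value<->flag round trips again.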
4184static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
4185 unsigned Opcode, bool IsSigned) {
4186 EVT VT0 = Op.getValue(R: 0).getValueType();
4187 EVT VT1 = Op.getValue(R: 1).getValueType();
4188
4189 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4190 return SDValue();
4191
4192 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4193 SDValue OpLHS = Op.getOperand(i: 0);
4194 SDValue OpRHS = Op.getOperand(i: 1);
4195 SDValue OpCarryIn = valueToCarryFlag(Value: Op.getOperand(i: 2), DAG, Invert: InvertCarry);
4196
4197 SDLoc DL(Op);
4198 SDVTList VTs = DAG.getVTList(VT1: VT0, VT2: VT1);
4199
4200 SDValue Sum = DAG.getNode(Opcode, DL, VTList: DAG.getVTList(VT1: VT0, VT2: MVT::Glue), N1: OpLHS,
4201 N2: OpRHS, N3: OpCarryIn);
4202
4203 SDValue OutFlag =
4204 IsSigned ? overflowFlagToValue(Glue: Sum.getValue(R: 1), VT: VT1, DAG)
4205 : carryFlagToValue(Glue: Sum.getValue(R: 1), VT: VT1, DAG, Invert: InvertCarry);
4206
4207 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL, VTList: VTs, N1: Sum, N2: OutFlag);
4208}
4209
4210static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
4211 // Let legalize expand this if it isn't a legal type yet.
4212 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: Op.getValueType()))
4213 return SDValue();
4214
4215 SDLoc DL(Op);
4216 AArch64CC::CondCode CC;
4217 // The actual operation that sets the overflow or carry flag.
4218 SDValue Value, Overflow;
4219 std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4220
4221 // We use 0 and 1 as false and true values.
4222 SDValue TVal = DAG.getConstant(Val: 1, DL, VT: MVT::i32);
4223 SDValue FVal = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
4224
4225 // We use an inverted condition, because the conditional select is inverted
4226 // too. This will allow it to be selected to a single instruction:
4227 // CSINC Wd, WZR, WZR, invert(cond).
4228 SDValue CCVal = DAG.getConstant(Val: getInvertedCondCode(Code: CC), DL, VT: MVT::i32);
4229 Overflow =
4230 DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: MVT::i32, N1: FVal, N2: TVal, N3: CCVal, N4: Overflow);
4231
4232 SDVTList VTs = DAG.getVTList(VT1: Op.getValueType(), VT2: MVT::i32);
4233 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL, VTList: VTs, N1: Value, N2: Overflow);
4234}
4235
4236// Prefetch operands are:
4237// 1: Address to prefetch
4238// 2: bool isWrite
4239// 3: int locality (0 = no locality ... 3 = extreme locality)
4240// 4: bool isDataCache
4241static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4242 SDLoc DL(Op);
4243 unsigned IsWrite = Op.getConstantOperandVal(i: 2);
4244 unsigned Locality = Op.getConstantOperandVal(i: 3);
4245 unsigned IsData = Op.getConstantOperandVal(i: 4);
4246
4247 bool IsStream = !Locality;
4248 // When the locality number is non-zero, remap it to the target cache level.
4249 if (Locality) {
4250 // The front-end should have filtered out the out-of-range values
4251 assert(Locality <= 3 && "Prefetch locality out-of-range");
4252 // The locality degree is the opposite of the cache level: higher locality
4253 // means a closer (faster) cache. Invert the number, since the PRFM
4254 // encoding starts at 0 for the L1 cache.
4255 Locality = 3 - Locality;
4256 }
4257
4258 // Build the mask value encoding the expected behavior.
4259 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4260 (!IsData << 3) | // IsDataCache bit
4261 (Locality << 1) | // Cache level bits
4262 (unsigned)IsStream; // Stream bit
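// For example (illustrative), __builtin_prefetch(p, /*rw=*/0, /*locality=*/3)
// on the data cache arrives here as IsWrite=0, Locality=3, IsData=1; Locality
// is remapped to 0 and IsStream is false, giving PrfOp = 0b00000, i.e. the
// PLDL1KEEP hint. A locality of 0 instead sets the stream bit and selects
// PLDL1STRM.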
4263 return DAG.getNode(Opcode: AArch64ISD::PREFETCH, DL, VT: MVT::Other, N1: Op.getOperand(i: 0),
4264 N2: DAG.getTargetConstant(Val: PrfOp, DL, VT: MVT::i32),
4265 N3: Op.getOperand(i: 1));
4266}
4267
4268 // Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1))) 0 EQ when Z
4269 // is a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of
4270 // SUBS (AND X Y) Z, which works better with emitComparison().
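// For example, "(x & 0xff) u< 16" becomes "(x & 0xf0) == 0": bits of x below
// bit 4 cannot push the masked value to 16 or above, so only the remaining
// mask bits need to be tested.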
4271static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS,
4272 SelectionDAG &DAG, const SDLoc DL) {
4273 if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
4274 ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
4275 ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(Val&: RHS);
4276 if (LHSConstOp && RHSConst) {
4277 uint64_t LHSConstValue = LHSConstOp->getZExtValue();
4278 uint64_t RHSConstant = RHSConst->getZExtValue();
4279 if (isPowerOf2_64(Value: RHSConstant)) {
4280 uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
4281 LHS =
4282 DAG.getNode(Opcode: ISD::AND, DL, VT: LHS.getValueType(), N1: LHS.getOperand(i: 0),
4283 N2: DAG.getConstant(Val: NewMaskValue, DL, VT: LHS.getValueType()));
4284 RHS = DAG.getConstant(Val: 0, DL, VT: RHS.getValueType());
4285 CC = ISD::SETEQ;
4286 }
4287 }
4288 }
4289}
4290
4291SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4292 SelectionDAG &DAG) const {
4293 EVT VT = Op.getValueType();
4294 if (VT.isScalableVector()) {
4295 SDValue SrcVal = Op.getOperand(i: 0);
4296
4297 if (VT == MVT::nxv2f64 && SrcVal.getValueType() == MVT::nxv2bf16) {
4298 // Break conversion in two with the first part converting to f32 and the
4299 // second using native f32->VT instructions.
4300 SDLoc DL(Op);
4301 return DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT,
4302 Operand: DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::nxv2f32, Operand: SrcVal));
4303 }
4304
4305 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4306 }
4307
4308 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
4309 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4310
4311 bool IsStrict = Op->isStrictFPOpcode();
4312 SDValue Op0 = Op.getOperand(i: IsStrict ? 1 : 0);
4313 EVT Op0VT = Op0.getValueType();
4314 if (VT == MVT::f64) {
4315 // Extensions from f32 or f16 to f64 are legal.
4316 if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
4317 return Op;
4318 // Split bf16->f64 extends into two fpextends.
4319 if (Op0VT == MVT::bf16 && IsStrict) {
4320 SDValue Ext1 =
4321 DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL: SDLoc(Op), ResultTys: {MVT::f32, MVT::Other},
4322 Ops: {Op0, Op.getOperand(i: 0)});
4323 return DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL: SDLoc(Op), ResultTys: {VT, MVT::Other},
4324 Ops: {Ext1, Ext1.getValue(R: 1)});
4325 }
4326 if (Op0VT == MVT::bf16)
4327 return DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SDLoc(Op), VT,
4328 Operand: DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SDLoc(Op), VT: MVT::f32, Operand: Op0));
4329 return SDValue();
4330 }
4331
4332 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4333 return SDValue();
4334}
4335
4336SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4337 SelectionDAG &DAG) const {
4338 EVT VT = Op.getValueType();
4339 bool IsStrict = Op->isStrictFPOpcode();
4340 SDValue SrcVal = Op.getOperand(i: IsStrict ? 1 : 0);
4341 EVT SrcVT = SrcVal.getValueType();
4342 bool Trunc = Op.getConstantOperandVal(i: IsStrict ? 2 : 1) == 1;
4343
4344 if (VT.isScalableVector()) {
4345 // Let common code split the operation.
4346 if (SrcVT == MVT::nxv8f32)
4347 return Op;
4348
4349 if (VT.getScalarType() != MVT::bf16)
4350 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4351
4352 SDLoc DL(Op);
4353 constexpr EVT I32 = MVT::nxv4i32;
4354 auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(Val: I, DL, VT: I32); };
4355
4356 SDValue NaN;
4357 SDValue Narrow;
4358
4359 if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) {
4360 if (Subtarget->hasBF16())
4361 return LowerToPredicatedOp(Op, DAG,
4362 NewOp: AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4363
4364 Narrow = getSVESafeBitCast(VT: I32, Op: SrcVal, DAG);
4365
4366 // Set the quiet bit (bit 22) so a signalling NaN input yields a quiet NaN.
4367 if (!DAG.isKnownNeverSNaN(Op: SrcVal))
4368 NaN = DAG.getNode(Opcode: ISD::OR, DL, VT: I32, N1: Narrow, N2: ImmV(0x400000));
4369 } else if (SrcVT == MVT::nxv2f64 &&
4370 (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
4371 // Narrow to f32 with FCVTX (round to odd) to avoid double rounding, then try again.
4372 SDValue Pg = getPredicateForVector(DAG, DL, VT: MVT::nxv2f32);
4373 Narrow = DAG.getNode(Opcode: AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, VT: MVT::nxv2f32,
4374 N1: Pg, N2: SrcVal, N3: DAG.getUNDEF(VT: MVT::nxv2f32));
4375
4376 SmallVector<SDValue, 3> NewOps;
4377 if (IsStrict)
4378 NewOps.push_back(Elt: Op.getOperand(i: 0));
4379 NewOps.push_back(Elt: Narrow);
4380 NewOps.push_back(Elt: Op.getOperand(i: IsStrict ? 2 : 1));
4381 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT, Ops: NewOps, Flags: Op->getFlags());
4382 } else
4383 return SDValue();
4384
4385 if (!Trunc) {
4386 SDValue Lsb = DAG.getNode(Opcode: ISD::SRL, DL, VT: I32, N1: Narrow, N2: ImmV(16));
4387 Lsb = DAG.getNode(Opcode: ISD::AND, DL, VT: I32, N1: Lsb, N2: ImmV(1));
4388 SDValue RoundingBias = DAG.getNode(Opcode: ISD::ADD, DL, VT: I32, N1: Lsb, N2: ImmV(0x7fff));
4389 Narrow = DAG.getNode(Opcode: ISD::ADD, DL, VT: I32, N1: Narrow, N2: RoundingBias);
4390 }
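// The bias above implements round-to-nearest-even on the f32 bit pattern:
// RoundingBias is 0x7fff plus the would-be bf16 lsb, so discarded low halves
// above 0x8000 always carry into the kept bits, halves below 0x8000 never do,
// and an exact 0x8000 tie carries only when the kept lsb is already 1
// (rounding to even).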
4391
4392 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4393 // 0x80000000.
4394 if (NaN) {
4395 EVT I1 = I32.changeElementType(EltVT: MVT::i1);
4396 EVT CondVT = VT.changeElementType(EltVT: MVT::i1);
4397 SDValue IsNaN = DAG.getSetCC(DL, VT: CondVT, LHS: SrcVal, RHS: SrcVal, Cond: ISD::SETUO);
4398 IsNaN = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: I1, Operand: IsNaN);
4399 Narrow = DAG.getSelect(DL, VT: I32, Cond: IsNaN, LHS: NaN, RHS: Narrow);
4400 }
4401
4402 // Now that we have rounded, shift the bits into position.
4403 Narrow = DAG.getNode(Opcode: ISD::SRL, DL, VT: I32, N1: Narrow, N2: ImmV(16));
4404 return getSVESafeBitCast(VT, Op: Narrow, DAG);
4405 }
4406
4407 if (useSVEForFixedLengthVectorVT(VT: SrcVT, OverrideNEON: !Subtarget->isNeonAvailable()))
4408 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4409
4410 // Expand cases where the result type is BF16 but we don't have hardware
4411 // instructions to lower it.
4412 if (VT.getScalarType() == MVT::bf16 &&
4413 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4414 Subtarget->hasBF16())) {
4415 SDLoc DL(Op);
4416 SDValue Narrow = SrcVal;
4417 SDValue NaN;
4418 EVT I32 = SrcVT.changeElementType(EltVT: MVT::i32);
4419 EVT F32 = SrcVT.changeElementType(EltVT: MVT::f32);
4420 if (SrcVT.getScalarType() == MVT::f32) {
4421 bool NeverSNaN = DAG.isKnownNeverSNaN(Op: Narrow);
4422 Narrow = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: I32, Operand: Narrow);
4423 if (!NeverSNaN) {
4424 // Set the quiet bit (bit 22) so a signalling NaN input yields a quiet NaN.
4425 NaN = DAG.getNode(Opcode: ISD::OR, DL, VT: I32, N1: Narrow,
4426 N2: DAG.getConstant(Val: 0x400000, DL, VT: I32));
4427 }
4428 } else if (SrcVT.getScalarType() == MVT::f64) {
4429 Narrow = DAG.getNode(Opcode: AArch64ISD::FCVTXN, DL, VT: F32, Operand: Narrow);
4430 Narrow = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: I32, Operand: Narrow);
4431 } else {
4432 return SDValue();
4433 }
4434 if (!Trunc) {
4435 SDValue One = DAG.getConstant(Val: 1, DL, VT: I32);
4436 SDValue Lsb = DAG.getNode(Opcode: ISD::SRL, DL, VT: I32, N1: Narrow,
4437 N2: DAG.getShiftAmountConstant(Val: 16, VT: I32, DL));
4438 Lsb = DAG.getNode(Opcode: ISD::AND, DL, VT: I32, N1: Lsb, N2: One);
4439 SDValue RoundingBias =
4440 DAG.getNode(Opcode: ISD::ADD, DL, VT: I32, N1: DAG.getConstant(Val: 0x7fff, DL, VT: I32), N2: Lsb);
4441 Narrow = DAG.getNode(Opcode: ISD::ADD, DL, VT: I32, N1: Narrow, N2: RoundingBias);
4442 }
4443
4444 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4445 // 0x80000000.
4446 if (NaN) {
4447 SDValue IsNaN = DAG.getSetCC(
4448 DL, VT: getSetCCResultType(DAG.getDataLayout(), C&: *DAG.getContext(), VT: SrcVT),
4449 LHS: SrcVal, RHS: SrcVal, Cond: ISD::SETUO);
4450 Narrow = DAG.getSelect(DL, VT: I32, Cond: IsNaN, LHS: NaN, RHS: Narrow);
4451 }
4452
4453 // Now that we have rounded, shift the bits into position.
4454 Narrow = DAG.getNode(Opcode: ISD::SRL, DL, VT: I32, N1: Narrow,
4455 N2: DAG.getShiftAmountConstant(Val: 16, VT: I32, DL));
4456 if (VT.isVector()) {
4457 EVT I16 = I32.changeVectorElementType(EltVT: MVT::i16);
4458 Narrow = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: I16, Operand: Narrow);
4459 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Narrow);
4460 }
4461 Narrow = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: F32, Operand: Narrow);
4462 SDValue Result = DAG.getTargetExtractSubreg(SRIdx: AArch64::hsub, DL, VT, Operand: Narrow);
4463 return IsStrict ? DAG.getMergeValues(Ops: {Result, Op.getOperand(i: 0)}, dl: DL)
4464 : Result;
4465 }
4466
4467 if (SrcVT != MVT::f128) {
4468 // Expand cases where the input is a vector bigger than NEON.
4469 if (useSVEForFixedLengthVectorVT(VT: SrcVT))
4470 return SDValue();
4471
4472 // It's legal except when f128 is involved
4473 return Op;
4474 }
4475
4476 return SDValue();
4477}
4478
4479SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4480 SelectionDAG &DAG) const {
4481 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4482 // Any additional optimization in this function should be recorded
4483 // in the cost tables.
4484 bool IsStrict = Op->isStrictFPOpcode();
4485 EVT InVT = Op.getOperand(i: IsStrict ? 1 : 0).getValueType();
4486 EVT VT = Op.getValueType();
4487
4488 assert(!(IsStrict && VT.isScalableVector()) &&
4489 "Unimplemented SVE support for STRICT_FP_to_INT!");
4490
4491 // f16 conversions are promoted to f32 when full fp16 is not supported.
4492 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4493 InVT.getVectorElementType() == MVT::bf16) {
4494 EVT NewVT = VT.changeElementType(EltVT: MVT::f32);
4495 SDLoc DL(Op);
4496 if (IsStrict) {
4497 SDValue Ext = DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL, ResultTys: {NewVT, MVT::Other},
4498 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1)});
4499 return DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {VT, MVT::Other},
4500 Ops: {Ext.getValue(R: 1), Ext.getValue(R: 0)});
4501 }
4502 return DAG.getNode(
4503 Opcode: Op.getOpcode(), DL, VT: Op.getValueType(),
4504 Operand: DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: NewVT, Operand: Op.getOperand(i: 0)));
4505 }
4506
4507 if (VT.isScalableVector()) {
4508 if (VT.getVectorElementType() == MVT::i1) {
4509 SDLoc DL(Op);
4510 EVT CvtVT = getPromotedVTForPredicate(VT);
4511 SDValue Cvt = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: CvtVT, Operand: Op.getOperand(i: 0));
4512 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: CvtVT);
4513 return DAG.getSetCC(DL, VT, LHS: Cvt, RHS: Zero, Cond: ISD::SETNE);
4514 }
4515
4516 // Let common code split the operation.
4517 if (InVT == MVT::nxv8f32)
4518 return Op;
4519
4520 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4521 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4522 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4523 return LowerToPredicatedOp(Op, DAG, NewOp: Opcode);
4524 }
4525
4526 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()) ||
4527 useSVEForFixedLengthVectorVT(VT: InVT, OverrideNEON: !Subtarget->isNeonAvailable()))
4528 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4529
4530 uint64_t VTSize = VT.getFixedSizeInBits();
4531 uint64_t InVTSize = InVT.getFixedSizeInBits();
4532 if (VTSize < InVTSize) {
4533 SDLoc DL(Op);
4534 if (IsStrict) {
4535 InVT = InVT.changeVectorElementTypeToInteger();
4536 SDValue Cv = DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {InVT, MVT::Other},
4537 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1)});
4538 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Cv);
4539 return DAG.getMergeValues(Ops: {Trunc, Cv.getValue(R: 1)}, dl: DL);
4540 }
4541 SDValue Cv =
4542 DAG.getNode(Opcode: Op.getOpcode(), DL, VT: InVT.changeVectorElementTypeToInteger(),
4543 Operand: Op.getOperand(i: 0));
4544 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Cv);
4545 }
4546
4547 if (VTSize > InVTSize) {
4548 SDLoc DL(Op);
4549 MVT ExtVT =
4550 MVT::getVectorVT(VT: MVT::getFloatingPointVT(BitWidth: VT.getScalarSizeInBits()),
4551 NumElements: VT.getVectorNumElements());
4552 if (IsStrict) {
4553 SDValue Ext = DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL, ResultTys: {ExtVT, MVT::Other},
4554 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1)});
4555 return DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {VT, MVT::Other},
4556 Ops: {Ext.getValue(R: 1), Ext.getValue(R: 0)});
4557 }
4558 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: ExtVT, Operand: Op.getOperand(i: 0));
4559 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT, Operand: Ext);
4560 }
4561
4562 // Use a scalar operation for conversions between single-element vectors of
4563 // the same size.
4564 if (InVT.getVectorNumElements() == 1) {
4565 SDLoc DL(Op);
4566 SDValue Extract = DAG.getNode(
4567 Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: InVT.getScalarType(),
4568 N1: Op.getOperand(i: IsStrict ? 1 : 0), N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
4569 EVT ScalarVT = VT.getScalarType();
4570 if (IsStrict)
4571 return DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {ScalarVT, MVT::Other},
4572 Ops: {Op.getOperand(i: 0), Extract});
4573 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: ScalarVT, Operand: Extract);
4574 }
4575
4576 // Type changing conversions are illegal.
4577 return Op;
4578}
4579
4580SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4581 SelectionDAG &DAG) const {
4582 bool IsStrict = Op->isStrictFPOpcode();
4583 SDValue SrcVal = Op.getOperand(i: IsStrict ? 1 : 0);
4584
4585 if (SrcVal.getValueType().isVector())
4586 return LowerVectorFP_TO_INT(Op, DAG);
4587
4588 // f16 conversions are promoted to f32 when full fp16 is not supported.
4589 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4590 SrcVal.getValueType() == MVT::bf16) {
4591 SDLoc DL(Op);
4592 if (IsStrict) {
4593 SDValue Ext =
4594 DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL, ResultTys: {MVT::f32, MVT::Other},
4595 Ops: {Op.getOperand(i: 0), SrcVal});
4596 return DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {Op.getValueType(), MVT::Other},
4597 Ops: {Ext.getValue(R: 1), Ext.getValue(R: 0)});
4598 }
4599 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: Op.getValueType(),
4600 Operand: DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: SrcVal));
4601 }
4602
4603 if (SrcVal.getValueType() != MVT::f128) {
4604 // It's legal except when f128 is involved
4605 return Op;
4606 }
4607
4608 return SDValue();
4609}
4610
4611SDValue
4612AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4613 SelectionDAG &DAG) const {
4614 // AArch64 FP-to-int conversions saturate to the destination element size, so
4615 // we can lower common saturating conversions to simple instructions.
4616 SDValue SrcVal = Op.getOperand(i: 0);
4617 EVT SrcVT = SrcVal.getValueType();
4618 EVT DstVT = Op.getValueType();
4619 EVT SatVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
4620
4621 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4622 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4623 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4624 assert(SatWidth <= DstElementWidth &&
4625 "Saturation width cannot exceed result width");
4626
4627 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4628 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4629 // types, so this is hard to reach.
4630 if (DstVT.isScalableVector())
4631 return SDValue();
4632
4633 EVT SrcElementVT = SrcVT.getVectorElementType();
4634
4635 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4636 SDLoc DL(Op);
4637 SDValue SrcVal2;
4638 if ((SrcElementVT == MVT::f16 &&
4639 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4640 SrcElementVT == MVT::bf16) {
4641 MVT F32VT = MVT::getVectorVT(VT: MVT::f32, NumElements: SrcVT.getVectorNumElements());
4642 SrcVal = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: F32VT, Operand: SrcVal);
4643 // If we are extending to a v8f32, split into two v4f32 to produce legal
4644 // types.
4645 if (F32VT.getSizeInBits() > 128) {
4646 std::tie(args&: SrcVal, args&: SrcVal2) = DAG.SplitVector(N: SrcVal, DL);
4647 F32VT = F32VT.getHalfNumVectorElementsVT();
4648 }
4649 SrcVT = F32VT;
4650 SrcElementVT = MVT::f32;
4651 SrcElementWidth = 32;
4652 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4653 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4654 return SDValue();
4655
4656 // Expand to f64 if we are saturating to i64, to help keep the lanes the same
4657 // width and produce a fcvtzu.
4658 if (SatWidth == 64 && SrcElementWidth < 64) {
4659 MVT F64VT = MVT::getVectorVT(VT: MVT::f64, NumElements: SrcVT.getVectorNumElements());
4660 SrcVal = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: F64VT, Operand: SrcVal);
4661 SrcVT = F64VT;
4662 SrcElementVT = MVT::f64;
4663 SrcElementWidth = 64;
4664 }
4665 // Cases that we can emit directly.
4666 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
4667 SDValue Res = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: SrcVal,
4668 N2: DAG.getValueType(DstVT.getScalarType()));
4669 if (SrcVal2) {
4670 SDValue Res2 = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: SrcVal2,
4671 N2: DAG.getValueType(DstVT.getScalarType()));
4672 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: DstVT, N1: Res, N2: Res2);
4673 }
4674 return Res;
4675 }
4676
4677 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4678 // result. This is only valid if the legal cvt is larger than the saturate
4679 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4680 // (at least until sqxtn is selected).
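// For example (illustrative), a v4f32 fptosi_sat with an i8 saturation width
// is emitted as an fcvtzs to v4i32 (which already saturates at the i32
// bounds), clamped with an ISD::SMIN against 127 and ISD::SMAX against -128,
// and then truncated to the destination type.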
4681 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4682 return SDValue();
4683
4684 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4685 SDValue NativeCvt = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: IntVT, N1: SrcVal,
4686 N2: DAG.getValueType(IntVT.getScalarType()));
4687 SDValue NativeCvt2 =
4688 SrcVal2 ? DAG.getNode(Opcode: Op.getOpcode(), DL, VT: IntVT, N1: SrcVal2,
4689 N2: DAG.getValueType(IntVT.getScalarType()))
4690 : SDValue();
4691 SDValue Sat, Sat2;
4692 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4693 SDValue MinC = DAG.getConstant(
4694 Val: APInt::getSignedMaxValue(numBits: SatWidth).sext(width: SrcElementWidth), DL, VT: IntVT);
4695 SDValue Min = DAG.getNode(Opcode: ISD::SMIN, DL, VT: IntVT, N1: NativeCvt, N2: MinC);
4696 SDValue Min2 = SrcVal2 ? DAG.getNode(Opcode: ISD::SMIN, DL, VT: IntVT, N1: NativeCvt2, N2: MinC) : SDValue();
4697 SDValue MaxC = DAG.getConstant(
4698 Val: APInt::getSignedMinValue(numBits: SatWidth).sext(width: SrcElementWidth), DL, VT: IntVT);
4699 Sat = DAG.getNode(Opcode: ISD::SMAX, DL, VT: IntVT, N1: Min, N2: MaxC);
4700 Sat2 = SrcVal2 ? DAG.getNode(Opcode: ISD::SMAX, DL, VT: IntVT, N1: Min2, N2: MaxC) : SDValue();
4701 } else {
4702 SDValue MinC = DAG.getConstant(
4703 Val: APInt::getAllOnes(numBits: SatWidth).zext(width: SrcElementWidth), DL, VT: IntVT);
4704 Sat = DAG.getNode(Opcode: ISD::UMIN, DL, VT: IntVT, N1: NativeCvt, N2: MinC);
4705 Sat2 = SrcVal2 ? DAG.getNode(Opcode: ISD::UMIN, DL, VT: IntVT, N1: NativeCvt2, N2: MinC) : SDValue();
4706 }
4707
4708 if (SrcVal2)
4709 Sat = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL,
4710 VT: IntVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext()),
4711 N1: Sat, N2: Sat2);
4712
4713 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: Sat);
4714}
4715
4716SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4717 SelectionDAG &DAG) const {
4718 // AArch64 FP-to-int conversions saturate to the destination register size, so
4719 // we can lower common saturating conversions to simple instructions.
4720 SDValue SrcVal = Op.getOperand(i: 0);
4721 EVT SrcVT = SrcVal.getValueType();
4722
4723 if (SrcVT.isVector())
4724 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4725
4726 EVT DstVT = Op.getValueType();
4727 EVT SatVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
4728 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4729 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4730 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4731
4732 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4733 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4734 SrcVal = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SDLoc(Op), VT: MVT::f32, Operand: SrcVal);
4735 SrcVT = MVT::f32;
4736 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4737 SrcVT != MVT::bf16)
4738 return SDValue();
4739
4740 SDLoc DL(Op);
4741 // Cases that we can emit directly.
4742 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4743 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4744 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4745 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: SrcVal,
4746 N2: DAG.getValueType(DstVT));
4747
4748 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4749 // result. This is only valid if the legal cvt is larger than the saturate
4750 // width.
4751 if (DstWidth < SatWidth)
4752 return SDValue();
4753
4754 SDValue NativeCvt =
4755 DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: SrcVal, N2: DAG.getValueType(DstVT));
4756 SDValue Sat;
4757 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4758 SDValue MinC = DAG.getConstant(
4759 Val: APInt::getSignedMaxValue(numBits: SatWidth).sext(width: DstWidth), DL, VT: DstVT);
4760 SDValue Min = DAG.getNode(Opcode: ISD::SMIN, DL, VT: DstVT, N1: NativeCvt, N2: MinC);
4761 SDValue MaxC = DAG.getConstant(
4762 Val: APInt::getSignedMinValue(numBits: SatWidth).sext(width: DstWidth), DL, VT: DstVT);
4763 Sat = DAG.getNode(Opcode: ISD::SMAX, DL, VT: DstVT, N1: Min, N2: MaxC);
4764 } else {
4765 SDValue MinC = DAG.getConstant(
4766 Val: APInt::getAllOnes(numBits: SatWidth).zext(width: DstWidth), DL, VT: DstVT);
4767 Sat = DAG.getNode(Opcode: ISD::UMIN, DL, VT: DstVT, N1: NativeCvt, N2: MinC);
4768 }
4769
4770 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: Sat);
4771}
4772
4773SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
4774 SelectionDAG &DAG) const {
4775 EVT VT = Op.getValueType();
4776 SDValue Src = Op.getOperand(i: 0);
4777 SDLoc DL(Op);
4778
4779 assert(VT.isVector() && "Expected vector type");
4780
4781 EVT CastVT =
4782 VT.changeVectorElementType(EltVT: Src.getValueType().getVectorElementType());
4783
4784 // Round the floating-point value into a floating-point register with the
4785 // current rounding mode.
4786 SDValue FOp = DAG.getNode(Opcode: ISD::FRINT, DL, VT: CastVT, Operand: Src);
4787
4788 // Truncate the rounded floating point to an integer.
4789 return DAG.getNode(Opcode: ISD::FP_TO_SINT_SAT, DL, VT, N1: FOp,
4790 N2: DAG.getValueType(VT.getVectorElementType()));
4791}
4792
4793SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4794 SelectionDAG &DAG) const {
4795 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4796 // Any additional optimization in this function should be recorded
4797 // in the cost tables.
4798 bool IsStrict = Op->isStrictFPOpcode();
4799 EVT VT = Op.getValueType();
4800 SDLoc DL(Op);
4801 SDValue In = Op.getOperand(i: IsStrict ? 1 : 0);
4802 EVT InVT = In.getValueType();
4803 unsigned Opc = Op.getOpcode();
4804 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
4805
4806 assert(!(IsStrict && VT.isScalableVector()) &&
4807 "Unimplemented SVE support for ISD:::STRICT_INT_TO_FP!");
4808
4809 // NOTE: i1->bf16 does not require promotion to f32.
4810 if (VT.isScalableVector() && InVT.getVectorElementType() == MVT::i1) {
4811 SDValue FalseVal = DAG.getConstantFP(Val: 0.0, DL, VT);
4812 SDValue TrueVal = IsSigned ? DAG.getConstantFP(Val: -1.0, DL, VT)
4813 : DAG.getConstantFP(Val: 1.0, DL, VT);
4814 return DAG.getNode(Opcode: ISD::VSELECT, DL, VT, N1: In, N2: TrueVal, N3: FalseVal);
4815 }
4816
4817 // Promote bf16 conversions to f32.
4818 if (VT.getVectorElementType() == MVT::bf16) {
4819 EVT F32 = VT.changeElementType(EltVT: MVT::f32);
4820 if (IsStrict) {
4821 SDValue Val = DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {F32, MVT::Other},
4822 Ops: {Op.getOperand(i: 0), In});
4823 return DAG.getNode(Opcode: ISD::STRICT_FP_ROUND, DL,
4824 ResultTys: {Op.getValueType(), MVT::Other},
4825 Ops: {Val.getValue(R: 1), Val.getValue(R: 0),
4826 DAG.getIntPtrConstant(Val: 0, DL, /*isTarget=*/true)});
4827 }
4828 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: Op.getValueType(),
4829 N1: DAG.getNode(Opcode: Op.getOpcode(), DL, VT: F32, Operand: In),
4830 N2: DAG.getIntPtrConstant(Val: 0, DL, /*isTarget=*/true));
4831 }
4832
4833 if (VT.isScalableVector()) {
4834 // Let common code split the operation.
4835 if (VT == MVT::nxv8f32)
4836 return Op;
4837
4838 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
4839 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
4840 return LowerToPredicatedOp(Op, DAG, NewOp: Opcode);
4841 }
4842
4843 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()) ||
4844 useSVEForFixedLengthVectorVT(VT: InVT, OverrideNEON: !Subtarget->isNeonAvailable()))
4845 return LowerFixedLengthIntToFPToSVE(Op, DAG);
4846
4847 uint64_t VTSize = VT.getFixedSizeInBits();
4848 uint64_t InVTSize = InVT.getFixedSizeInBits();
4849 if (VTSize < InVTSize) {
4850 // AArch64 doesn't have a direct vector instruction to convert
4851 // fixed point to floating point AND narrow it at the same time.
4852 // Additional rounding when the target is f32/f64 causes double
4853 // rounding issues. Conversion to f16 is fine due to narrow width.
4854 bool IsTargetf32 = VT.getVectorElementType() == MVT::f32;
4855 bool IsTargetf16 = false;
4856 if (Op.hasOneUse() &&
4857 Op->user_begin()->getOpcode() == ISD::CONCAT_VECTORS) {
4858 // Some vector types are split in half during legalization, then concatenated
4859 // and rounded back to the original vector type. If we end up resolving to an
4860 // f16 type, we don't need to worry about double rounding errors.
4861 SDNode *U = *Op->user_begin();
4862 if (U->hasOneUse() && U->user_begin()->getOpcode() == ISD::FP_ROUND) {
4863 EVT TmpVT = U->user_begin()->getValueType(ResNo: 0);
4864 if (TmpVT.getScalarType() == MVT::f16)
4865 IsTargetf16 = true;
4866 }
4867 }
4868
4869 if (IsTargetf32 && !IsTargetf16) {
4870 return !IsStrict ? DAG.UnrollVectorOp(N: Op.getNode()) : SDValue();
4871 }
4872
4873 MVT CastVT =
4874 MVT::getVectorVT(VT: MVT::getFloatingPointVT(BitWidth: InVT.getScalarSizeInBits()),
4875 NumElements: InVT.getVectorNumElements());
4876 if (IsStrict) {
4877 In = DAG.getNode(Opcode: Opc, DL, ResultTys: {CastVT, MVT::Other}, Ops: {Op.getOperand(i: 0), In});
4878 return DAG.getNode(Opcode: ISD::STRICT_FP_ROUND, DL, ResultTys: {VT, MVT::Other},
4879 Ops: {In.getValue(R: 1), In.getValue(R: 0),
4880 DAG.getIntPtrConstant(Val: 0, DL, /*isTarget=*/true)});
4881 }
4882 In = DAG.getNode(Opcode: Opc, DL, VT: CastVT, Operand: In);
4883 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: In,
4884 N2: DAG.getIntPtrConstant(Val: 0, DL, /*isTarget=*/true));
4885 }
4886
4887 if (VTSize > InVTSize) {
4888 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4889 EVT CastVT = VT.changeVectorElementTypeToInteger();
4890 In = DAG.getNode(Opcode: CastOpc, DL, VT: CastVT, Operand: In);
4891 if (IsStrict)
4892 return DAG.getNode(Opcode: Opc, DL, ResultTys: {VT, MVT::Other}, Ops: {Op.getOperand(i: 0), In});
4893 return DAG.getNode(Opcode: Opc, DL, VT, Operand: In);
4894 }
4895
4896 // Use a scalar operation for conversions between single-element vectors of
4897 // the same size.
4898 if (VT.getVectorNumElements() == 1) {
4899 SDValue Extract =
4900 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: InVT.getScalarType(), N1: In,
4901 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
4902 EVT ScalarVT = VT.getScalarType();
4903 if (IsStrict)
4904 return DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {ScalarVT, MVT::Other},
4905 Ops: {Op.getOperand(i: 0), Extract});
4906 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: ScalarVT, Operand: Extract);
4907 }
4908
4909 return Op;
4910}
4911
4912SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4913 SelectionDAG &DAG) const {
4914 if (Op.getValueType().isVector())
4915 return LowerVectorINT_TO_FP(Op, DAG);
4916
4917 bool IsStrict = Op->isStrictFPOpcode();
4918 SDValue SrcVal = Op.getOperand(i: IsStrict ? 1 : 0);
4919
4920 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
4921 Op->getOpcode() == ISD::SINT_TO_FP;
4922
4923 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
4924 SDLoc DL(Op);
4925 if (IsStrict) {
4926 SDValue Val = DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {PromoteVT, MVT::Other},
4927 Ops: {Op.getOperand(i: 0), SrcVal});
4928 return DAG.getNode(Opcode: ISD::STRICT_FP_ROUND, DL,
4929 ResultTys: {Op.getValueType(), MVT::Other},
4930 Ops: {Val.getValue(R: 1), Val.getValue(R: 0),
4931 DAG.getIntPtrConstant(Val: 0, DL, /*isTarget=*/true)});
4932 }
4933 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: Op.getValueType(),
4934 N1: DAG.getNode(Opcode: Op.getOpcode(), DL, VT: PromoteVT, Operand: SrcVal),
4935 N2: DAG.getIntPtrConstant(Val: 0, DL, /*isTarget=*/true));
4936 };
4937
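// For bf16 results we pick the promotion type from the number of significant
// input bits: an f32 has a 24-bit significand and an f64 a 53-bit one, so an
// integer with at most that many significant bits converts exactly and the
// only rounding happens in the final FP_ROUND to bf16.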
4938 if (Op.getValueType() == MVT::bf16) {
4939 unsigned MaxWidth = IsSigned
4940 ? DAG.ComputeMaxSignificantBits(Op: SrcVal)
4941 : DAG.computeKnownBits(Op: SrcVal).countMaxActiveBits();
4942 // bf16 conversions are promoted to f32 when converting from i16.
4943 if (MaxWidth <= 24) {
4944 return IntToFpViaPromotion(MVT::f32);
4945 }
4946
4947 // bf16 conversions are promoted to f64 when converting from i32.
4948 if (MaxWidth <= 53) {
4949 return IntToFpViaPromotion(MVT::f64);
4950 }
4951
4952 // We need to be careful about i64 -> bf16.
4953 // Consider an i32 22216703.
4954 // This number cannot be represented exactly as an f32, so an itofp will
4955 // turn it into 22216704.0, and an fptrunc to bf16 will turn this into
4956 // 22282240.0. However, the correct bf16 result was supposed to be 22151168.0.
4957 // We need to use sticky rounding to get this correct.
4958 if (SrcVal.getValueType() == MVT::i64) {
4959 SDLoc DL(Op);
4960 // This algorithm is equivalent to the following:
4961 // uint64_t SrcHi = SrcVal & ~0xfffull;
4962 // uint64_t SrcLo = SrcVal & 0xfffull;
4963 // uint64_t Highest = SrcVal >> 53;
4964 // bool HasHighest = Highest != 0;
4965 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
4966 // double Rounded = static_cast<double>(ToRound);
4967 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
4968 // uint64_t HasLo = SrcLo != 0;
4969 // bool NeedsAdjustment = HasHighest & HasLo;
4970 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
4971 // double Adjusted = std::bit_cast<double>(AdjustedBits);
4972 // return static_cast<__bf16>(Adjusted);
4973 //
4974 // Essentially, what happens is that SrcVal either fits perfectly in a
4975 // double-precision value or it is too big. If it is sufficiently small,
4976 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
4977 // ensure that u64 -> double has no rounding error by only using the 52
4978 // most significant bits of the input. The low-order bits will get merged
4979 // into a sticky bit, which avoids the issues incurred by double rounding.
4980
4981 // Signed conversion is more or less like so:
4982 // copysign((__bf16)abs(SrcVal), SrcVal)
4983 SDValue SignBit;
4984 if (IsSigned) {
4985 SignBit = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i64, N1: SrcVal,
4986 N2: DAG.getConstant(Val: 1ull << 63, DL, VT: MVT::i64));
4987 SrcVal = DAG.getNode(Opcode: ISD::ABS, DL, VT: MVT::i64, Operand: SrcVal);
4988 }
4989 SDValue SrcHi = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i64, N1: SrcVal,
4990 N2: DAG.getConstant(Val: ~0xfffull, DL, VT: MVT::i64));
4991 SDValue SrcLo = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i64, N1: SrcVal,
4992 N2: DAG.getConstant(Val: 0xfffull, DL, VT: MVT::i64));
4993 SDValue Highest =
4994 DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i64, N1: SrcVal,
4995 N2: DAG.getShiftAmountConstant(Val: 53, VT: MVT::i64, DL));
4996 SDValue Zero64 = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
4997 SDValue ToRound =
4998 DAG.getSelectCC(DL, LHS: Highest, RHS: Zero64, True: SrcHi, False: SrcVal, Cond: ISD::SETNE);
4999 SDValue Rounded =
5000 IsStrict ? DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {MVT::f64, MVT::Other},
5001 Ops: {Op.getOperand(i: 0), ToRound})
5002 : DAG.getNode(Opcode: Op.getOpcode(), DL, VT: MVT::f64, Operand: ToRound);
5003
5004 SDValue RoundedBits = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: Rounded);
5005 if (SignBit) {
5006 RoundedBits = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i64, N1: RoundedBits, N2: SignBit);
5007 }
5008
5009 SDValue HasHighest = DAG.getSetCC(
5010 DL,
5011 VT: getSetCCResultType(DAG.getDataLayout(), C&: *DAG.getContext(), VT: MVT::i64),
5012 LHS: Highest, RHS: Zero64, Cond: ISD::SETNE);
5013
5014 SDValue HasLo = DAG.getSetCC(
5015 DL,
5016 VT: getSetCCResultType(DAG.getDataLayout(), C&: *DAG.getContext(), VT: MVT::i64),
5017 LHS: SrcLo, RHS: Zero64, Cond: ISD::SETNE);
5018
5019 SDValue NeedsAdjustment =
5020 DAG.getNode(Opcode: ISD::AND, DL, VT: HasLo.getValueType(), N1: HasHighest, N2: HasLo);
5021 NeedsAdjustment = DAG.getZExtOrTrunc(Op: NeedsAdjustment, DL, VT: MVT::i64);
5022
5023 SDValue AdjustedBits =
5024 DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i64, N1: RoundedBits, N2: NeedsAdjustment);
5025 SDValue Adjusted = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f64, Operand: AdjustedBits);
5026 return IsStrict
5027 ? DAG.getNode(
5028 Opcode: ISD::STRICT_FP_ROUND, DL,
5029 ResultTys: {Op.getValueType(), MVT::Other},
5030 Ops: {Rounded.getValue(R: 1), Adjusted,
5031 DAG.getIntPtrConstant(Val: 0, DL, /*isTarget=*/true)})
5032 : DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: Op.getValueType(), N1: Adjusted,
5033 N2: DAG.getIntPtrConstant(Val: 0, DL, /*isTarget=*/true));
5034 }
5035 }
5036
5037 // f16 conversions are promoted to f32 when full fp16 is not supported.
5038 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
5039 return IntToFpViaPromotion(MVT::f32);
5040 }
5041
5042 // i128 conversions are libcalls.
5043 if (SrcVal.getValueType() == MVT::i128)
5044 return SDValue();
5045
5046 // Other conversions are legal, unless it's to the completely software-based
5047 // fp128.
5048 if (Op.getValueType() != MVT::f128)
5049 return Op;
5050 return SDValue();
5051}
5052
5053SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
5054 SelectionDAG &DAG) const {
5055 // For iOS, we want to call an alternative entry point: __sincos_stret,
5056 // which returns the values in two S / D registers.
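  // Roughly, the call is lowered as if it were declared as
  //   { float, float } __sincos_stret(float);   // or the f64/double variant
  // with both results coming back in floating-point registers.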
5057 SDLoc DL(Op);
5058 SDValue Arg = Op.getOperand(i: 0);
5059 EVT ArgVT = Arg.getValueType();
5060 Type *ArgTy = ArgVT.getTypeForEVT(Context&: *DAG.getContext());
5061
5062 ArgListTy Args;
5063 ArgListEntry Entry;
5064
5065 Entry.Node = Arg;
5066 Entry.Ty = ArgTy;
5067 Entry.IsSExt = false;
5068 Entry.IsZExt = false;
5069 Args.push_back(x: Entry);
5070
5071 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
5072 : RTLIB::SINCOS_STRET_F32;
5073 const char *LibcallName = getLibcallName(Call: LC);
5074 SDValue Callee =
5075 DAG.getExternalSymbol(Sym: LibcallName, VT: getPointerTy(DL: DAG.getDataLayout()));
5076
5077 StructType *RetTy = StructType::get(elt1: ArgTy, elts: ArgTy);
5078 TargetLowering::CallLoweringInfo CLI(DAG);
5079 CallingConv::ID CC = getLibcallCallingConv(Call: LC);
5080 CLI.setDebugLoc(DL)
5081 .setChain(DAG.getEntryNode())
5082 .setLibCallee(CC, ResultType: RetTy, Target: Callee, ArgsList: std::move(Args));
5083
5084 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5085 return CallResult.first;
5086}
5087
5088static MVT getSVEContainerType(EVT ContentTy);
5089
5090SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
5091 SelectionDAG &DAG) const {
5092 EVT OpVT = Op.getValueType();
5093 EVT ArgVT = Op.getOperand(i: 0).getValueType();
5094
5095 if (useSVEForFixedLengthVectorVT(VT: OpVT))
5096 return LowerFixedLengthBitcastToSVE(Op, DAG);
5097
5098 if (OpVT.isScalableVector()) {
5099 assert(isTypeLegal(OpVT) && "Unexpected result type!");
5100
5101 // Handle type legalisation first.
5102 if (!isTypeLegal(VT: ArgVT)) {
5103 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
5104 "Expected int->fp bitcast!");
5105
5106 // Bitcasting between unpacked vector types of different element counts is
5107 // not a NOP because the live elements are laid out differently.
5108 // 01234567
5109 // e.g. nxv2i32 = XX??XX??
5110 // nxv4f16 = X?X?X?X?
5111 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
5112 return SDValue();
5113
5114 SDValue ExtResult =
5115 DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SDLoc(Op), VT: getSVEContainerType(ContentTy: ArgVT),
5116 Operand: Op.getOperand(i: 0));
5117 return getSVESafeBitCast(VT: OpVT, Op: ExtResult, DAG);
5118 }
5119
5120 // Bitcasts between legal types with the same element count are legal.
5121 if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount())
5122 return Op;
5123
5124 // getSVESafeBitCast does not support casting between unpacked types.
5125 if (!isPackedVectorType(VT: OpVT, DAG))
5126 return SDValue();
5127
5128 return getSVESafeBitCast(VT: OpVT, Op: Op.getOperand(i: 0), DAG);
5129 }
5130
5131 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
5132 return SDValue();
5133
5134 // Bitcasts between f16 and bf16 are legal.
5135 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
5136 return Op;
5137
5138 assert(ArgVT == MVT::i16);
5139 SDLoc DL(Op);
5140
5141 Op = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Op.getOperand(i: 0));
5142 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: Op);
5143 return DAG.getTargetExtractSubreg(SRIdx: AArch64::hsub, DL, VT: OpVT, Operand: Op);
5144}
5145
5146// Returns lane if Op extracts from a two-element vector and lane is constant
5147// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
5148static std::optional<uint64_t>
5149getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
5150 SDNode *OpNode = Op.getNode();
5151 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5152 return std::nullopt;
5153
5154 EVT VT = OpNode->getOperand(Num: 0).getValueType();
5155 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: OpNode->getOperand(Num: 1));
5156 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
5157 return std::nullopt;
5158
5159 return C->getZExtValue();
5160}
5161
5162static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
5163 bool isSigned) {
5164 EVT VT = N.getValueType();
5165
5166 if (N.getOpcode() != ISD::BUILD_VECTOR)
5167 return false;
5168
5169 for (const SDValue &Elt : N->op_values()) {
5170 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Elt)) {
5171 unsigned EltSize = VT.getScalarSizeInBits();
5172 unsigned HalfSize = EltSize / 2;
5173 if (isSigned) {
5174 if (!isIntN(N: HalfSize, x: C->getSExtValue()))
5175 return false;
5176 } else {
5177 if (!isUIntN(N: HalfSize, x: C->getZExtValue()))
5178 return false;
5179 }
5180 continue;
5181 }
5182 return false;
5183 }
5184
5185 return true;
5186}
5187
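// Drop the extension (or known-to-be-narrow bits) from a MULL operand by
// truncating each lane to half width, e.g. v4i32 -> v4i16. If the operand was
// an extend the truncate folds away; otherwise the callers have already
// established that the lanes fit in the narrower type.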
5188static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
5189 EVT VT = N.getValueType();
5190 assert(VT.is128BitVector() && "Unexpected vector MULL size");
5191 EVT HalfVT = EVT::getVectorVT(
5192 Context&: *DAG.getContext(),
5193 VT: VT.getScalarType().getHalfSizedIntegerVT(Context&: *DAG.getContext()),
5194 EC: VT.getVectorElementCount());
5195 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SDLoc(N), VT: HalfVT, Operand: N);
5196}
5197
5198static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
5199 return N.getOpcode() == ISD::SIGN_EXTEND ||
5200 N.getOpcode() == ISD::ANY_EXTEND ||
5201 isExtendedBUILD_VECTOR(N, DAG, isSigned: true);
5202}
5203
5204static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
5205 return N.getOpcode() == ISD::ZERO_EXTEND ||
5206 N.getOpcode() == ISD::ANY_EXTEND ||
5207 isExtendedBUILD_VECTOR(N, DAG, isSigned: false);
5208}
5209
5210static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
5211 unsigned Opcode = N.getOpcode();
5212 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5213 SDValue N0 = N.getOperand(i: 0);
5214 SDValue N1 = N.getOperand(i: 1);
5215 return N0->hasOneUse() && N1->hasOneUse() &&
5216 isSignExtended(N: N0, DAG) && isSignExtended(N: N1, DAG);
5217 }
5218 return false;
5219}
5220
5221static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
5222 unsigned Opcode = N.getOpcode();
5223 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5224 SDValue N0 = N.getOperand(i: 0);
5225 SDValue N1 = N.getOperand(i: 1);
5226 return N0->hasOneUse() && N1->hasOneUse() &&
5227 isZeroExtended(N: N0, DAG) && isZeroExtended(N: N1, DAG);
5228 }
5229 return false;
5230}
5231
5232SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5233 SelectionDAG &DAG) const {
5234  // The rounding mode is in bits 23:22 of the FPCR.
5235  // The AArch64 rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3,
5236  // 3->0. The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
5237  // so that the shift and the AND get folded into a bitfield extract.
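  //
  // For reference, the full mapping is:
  //   FPCR[23:22] (RMode)        FLT_ROUNDS
  //   0 (round to nearest)           1
  //   1 (round towards +Inf)         2
  //   2 (round towards -Inf)         3
  //   3 (round towards zero)         0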
5238 SDLoc DL(Op);
5239
5240 SDValue Chain = Op.getOperand(i: 0);
5241 SDValue FPCR_64 = DAG.getNode(
5242 Opcode: ISD::INTRINSIC_W_CHAIN, DL, ResultTys: {MVT::i64, MVT::Other},
5243 Ops: {Chain, DAG.getConstant(Val: Intrinsic::aarch64_get_fpcr, DL, VT: MVT::i64)});
5244 Chain = FPCR_64.getValue(R: 1);
5245 SDValue FPCR_32 = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i32, Operand: FPCR_64);
5246 SDValue FltRounds = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: FPCR_32,
5247 N2: DAG.getConstant(Val: 1U << 22, DL, VT: MVT::i32));
5248 SDValue RMODE = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: FltRounds,
5249 N2: DAG.getConstant(Val: 22, DL, VT: MVT::i32));
5250 SDValue AND = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: RMODE,
5251 N2: DAG.getConstant(Val: 3, DL, VT: MVT::i32));
5252 return DAG.getMergeValues(Ops: {AND, Chain}, dl: DL);
5253}
5254
5255SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5256 SelectionDAG &DAG) const {
5257 SDLoc DL(Op);
5258 SDValue Chain = Op->getOperand(Num: 0);
5259 SDValue RMValue = Op->getOperand(Num: 1);
5260
5261 // The rounding mode is in bits 23:22 of the FPCR.
5262  // The mapping from the llvm.set.rounding argument value to the rounding mode
5263  // in FPCR is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
5264  // ((arg - 1) & 3) << 22.
5265  //
5266  // The argument of llvm.set.rounding must be within the range [0, 3], so
5267  // NearestTiesToAway (4) is not handled here. It is the responsibility of the
5268  // code that generates llvm.set.rounding to ensure this condition.
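  //
  // For example, llvm.set.rounding(0) (round towards zero) computes
  // ((0 - 1) & 3) = 3, which is the FPCR encoding of round-towards-zero.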
5269
5270 // Calculate new value of FPCR[23:22].
5271 RMValue = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i32, N1: RMValue,
5272 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
5273 RMValue = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: RMValue,
5274 N2: DAG.getConstant(Val: 0x3, DL, VT: MVT::i32));
5275 RMValue =
5276 DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: RMValue,
5277 N2: DAG.getConstant(Val: AArch64::RoundingBitsPos, DL, VT: MVT::i32));
5278 RMValue = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: RMValue);
5279
5280 // Get current value of FPCR.
5281 SDValue Ops[] = {
5282 Chain, DAG.getTargetConstant(Val: Intrinsic::aarch64_get_fpcr, DL, VT: MVT::i64)};
5283 SDValue FPCR =
5284 DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL, ResultTys: {MVT::i64, MVT::Other}, Ops);
5285 Chain = FPCR.getValue(R: 1);
5286 FPCR = FPCR.getValue(R: 0);
5287
5288  // Put the new rounding mode into FPCR[23:22].
5289 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5290 FPCR = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i64, N1: FPCR,
5291 N2: DAG.getConstant(Val: RMMask, DL, VT: MVT::i64));
5292 FPCR = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i64, N1: FPCR, N2: RMValue);
5293 SDValue Ops2[] = {
5294 Chain, DAG.getTargetConstant(Val: Intrinsic::aarch64_set_fpcr, DL, VT: MVT::i64),
5295 FPCR};
5296 return DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL, VT: MVT::Other, Ops: Ops2);
5297}
5298
5299SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5300 SelectionDAG &DAG) const {
5301 SDLoc DL(Op);
5302 SDValue Chain = Op->getOperand(Num: 0);
5303
5304 // Get current value of FPCR.
5305 SDValue Ops[] = {
5306 Chain, DAG.getTargetConstant(Val: Intrinsic::aarch64_get_fpcr, DL, VT: MVT::i64)};
5307 SDValue FPCR =
5308 DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL, ResultTys: {MVT::i64, MVT::Other}, Ops);
5309 Chain = FPCR.getValue(R: 1);
5310 FPCR = FPCR.getValue(R: 0);
5311
5312 // Truncate FPCR to 32 bits.
5313 SDValue Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i32, Operand: FPCR);
5314
5315 return DAG.getMergeValues(Ops: {Result, Chain}, dl: DL);
5316}
5317
5318SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5319 SelectionDAG &DAG) const {
5320 SDLoc DL(Op);
5321 SDValue Chain = Op->getOperand(Num: 0);
5322 SDValue Mode = Op->getOperand(Num: 1);
5323
5324 // Extend the specified value to 64 bits.
5325 SDValue FPCR = DAG.getZExtOrTrunc(Op: Mode, DL, VT: MVT::i64);
5326
5327 // Set new value of FPCR.
5328 SDValue Ops2[] = {
5329 Chain, DAG.getConstant(Val: Intrinsic::aarch64_set_fpcr, DL, VT: MVT::i64), FPCR};
5330 return DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL, VT: MVT::Other, Ops: Ops2);
5331}
5332
5333SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5334 SelectionDAG &DAG) const {
5335 SDLoc DL(Op);
5336 SDValue Chain = Op->getOperand(Num: 0);
5337
5338 // Get current value of FPCR.
5339 SDValue Ops[] = {
5340 Chain, DAG.getTargetConstant(Val: Intrinsic::aarch64_get_fpcr, DL, VT: MVT::i64)};
5341 SDValue FPCR =
5342 DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL, ResultTys: {MVT::i64, MVT::Other}, Ops);
5343 Chain = FPCR.getValue(R: 1);
5344 FPCR = FPCR.getValue(R: 0);
5345
5346 // Clear bits that are not reserved.
5347 SDValue FPSCRMasked = DAG.getNode(
5348 Opcode: ISD::AND, DL, VT: MVT::i64, N1: FPCR,
5349 N2: DAG.getConstant(Val: AArch64::ReservedFPControlBits, DL, VT: MVT::i64));
5350
5351 // Set new value of FPCR.
5352 SDValue Ops2[] = {Chain,
5353 DAG.getConstant(Val: Intrinsic::aarch64_set_fpcr, DL, VT: MVT::i64),
5354 FPSCRMasked};
5355 return DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL, VT: MVT::Other, Ops: Ops2);
5356}
5357
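// Decide whether a vector multiply can be lowered to a widening SMULL/UMULL,
// based on how its operands are extended or on their known bits. If the
// multiply is of the form (ext A +/- ext B) * (ext C), IsMLA is set so the
// caller can emit two MULLs feeding the add/sub instead. Returns 0 when no
// widening multiply applies.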
5358static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5359 SDLoc DL, bool &IsMLA) {
5360 bool IsN0SExt = isSignExtended(N: N0, DAG);
5361 bool IsN1SExt = isSignExtended(N: N1, DAG);
5362 if (IsN0SExt && IsN1SExt)
5363 return AArch64ISD::SMULL;
5364
5365 bool IsN0ZExt = isZeroExtended(N: N0, DAG);
5366 bool IsN1ZExt = isZeroExtended(N: N1, DAG);
5367
5368 if (IsN0ZExt && IsN1ZExt)
5369 return AArch64ISD::UMULL;
5370
5371 // Select UMULL if we can replace the other operand with an extend.
5372 EVT VT = N0.getValueType();
5373 unsigned EltSize = VT.getScalarSizeInBits();
5374 APInt Mask = APInt::getHighBitsSet(numBits: EltSize, hiBitsSet: EltSize / 2);
5375 if (IsN0ZExt || IsN1ZExt) {
5376 if (DAG.MaskedValueIsZero(Op: IsN0ZExt ? N1 : N0, Mask))
5377 return AArch64ISD::UMULL;
5378 } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(Op: N0, Mask) &&
5379 DAG.MaskedValueIsZero(Op: N1, Mask)) {
5380 // For v2i64 we look more aggressively at both operands being zero, to avoid
5381 // scalarization.
5382 return AArch64ISD::UMULL;
5383 }
5384
5385 if (IsN0SExt || IsN1SExt) {
5386 if (DAG.ComputeNumSignBits(Op: IsN0SExt ? N1 : N0) > EltSize / 2)
5387 return AArch64ISD::SMULL;
5388 } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(Op: N0) > EltSize / 2 &&
5389 DAG.ComputeNumSignBits(Op: N1) > EltSize / 2) {
5390 return AArch64ISD::SMULL;
5391 }
5392
5393 if (!IsN1SExt && !IsN1ZExt)
5394 return 0;
5395
5396 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5397 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5398 if (IsN1SExt && isAddSubSExt(N: N0, DAG)) {
5399 IsMLA = true;
5400 return AArch64ISD::SMULL;
5401 }
5402 if (IsN1ZExt && isAddSubZExt(N: N0, DAG)) {
5403 IsMLA = true;
5404 return AArch64ISD::UMULL;
5405 }
5406 if (IsN0ZExt && isAddSubZExt(N: N1, DAG)) {
5407 std::swap(a&: N0, b&: N1);
5408 IsMLA = true;
5409 return AArch64ISD::UMULL;
5410 }
5411 return 0;
5412}
5413
5414SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5415 EVT VT = Op.getValueType();
5416
5417 bool OverrideNEON = !Subtarget->isNeonAvailable();
5418 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5419 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MUL_PRED);
5420
5421 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5422 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5423 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5424 "unexpected type for custom-lowering ISD::MUL");
5425 SDValue N0 = Op.getOperand(i: 0);
5426 SDValue N1 = Op.getOperand(i: 1);
5427 bool isMLA = false;
5428 EVT OVT = VT;
5429 if (VT.is64BitVector()) {
5430 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5431 isNullConstant(V: N0.getOperand(i: 1)) &&
5432 N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5433 isNullConstant(V: N1.getOperand(i: 1))) {
5434 N0 = N0.getOperand(i: 0);
5435 N1 = N1.getOperand(i: 0);
5436 VT = N0.getValueType();
5437 } else {
5438 if (VT == MVT::v1i64) {
5439 if (Subtarget->hasSVE())
5440 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MUL_PRED);
5441 // Fall through to expand this. It is not legal.
5442 return SDValue();
5443 } else
5444 // Other vector multiplications are legal.
5445 return Op;
5446 }
5447 }
5448
5449 SDLoc DL(Op);
5450 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, IsMLA&: isMLA);
5451
5452 if (!NewOpc) {
5453 if (VT.getVectorElementType() == MVT::i64) {
5454 // If SVE is available then i64 vector multiplications can also be made
5455 // legal.
5456 if (Subtarget->hasSVE())
5457 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MUL_PRED);
5458 // Fall through to expand this. It is not legal.
5459 return SDValue();
5460 } else
5461 // Other vector multiplications are legal.
5462 return Op;
5463 }
5464
5465 // Legalize to a S/UMULL instruction
5466 SDValue Op0;
5467 SDValue Op1 = skipExtensionForVectorMULL(N: N1, DAG);
5468 if (!isMLA) {
5469 Op0 = skipExtensionForVectorMULL(N: N0, DAG);
5470 assert(Op0.getValueType().is64BitVector() &&
5471 Op1.getValueType().is64BitVector() &&
5472 "unexpected types for extended operands to VMULL");
5473 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: OVT,
5474 N1: DAG.getNode(Opcode: NewOpc, DL, VT, N1: Op0, N2: Op1),
5475 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
5476 }
5477  // Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
5478  // isel lowering to take advantage of no-stall back-to-back s/umull + s/umlal.
5479  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57.
5480 SDValue N00 = skipExtensionForVectorMULL(N: N0.getOperand(i: 0), DAG);
5481 SDValue N01 = skipExtensionForVectorMULL(N: N0.getOperand(i: 1), DAG);
5482 EVT Op1VT = Op1.getValueType();
5483 return DAG.getNode(
5484 Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: OVT,
5485 N1: DAG.getNode(Opcode: N0.getOpcode(), DL, VT,
5486 N1: DAG.getNode(Opcode: NewOpc, DL, VT,
5487 N1: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op1VT, Operand: N00), N2: Op1),
5488 N2: DAG.getNode(Opcode: NewOpc, DL, VT,
5489 N1: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op1VT, Operand: N01), N2: Op1)),
5490 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
5491}
5492
5493static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5494 int Pattern) {
5495 if (Pattern == AArch64SVEPredPattern::all)
5496 return DAG.getConstant(Val: 1, DL, VT);
5497 return DAG.getNode(Opcode: AArch64ISD::PTRUE, DL, VT,
5498 Operand: DAG.getTargetConstant(Val: Pattern, DL, VT: MVT::i32));
5499}
5500
5501static SDValue optimizeIncrementingWhile(SDNode *N, SelectionDAG &DAG,
5502 bool IsSigned, bool IsEqual) {
5503 unsigned Op0 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 1 : 0;
5504 unsigned Op1 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 2 : 1;
5505
5506 if (!isa<ConstantSDNode>(Val: N->getOperand(Num: Op1)))
5507 return SDValue();
5508
5509 SDLoc DL(N);
5510 APInt Y = N->getConstantOperandAPInt(Num: Op1);
5511
5512  // When the second operand is the maximum value, comparisons that include
5513  // equality can never fail, and thus we can return an all-active predicate.
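  // For example, whilele(x, INT64_MAX) and whilels(x, UINT64_MAX) hold for
  // every element regardless of x, so they lower to an all-active predicate.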
5514 if (IsEqual)
5515 if (IsSigned ? Y.isMaxSignedValue() : Y.isMaxValue())
5516 return DAG.getConstant(Val: 1, DL, VT: N->getValueType(ResNo: 0));
5517
5518 if (!isa<ConstantSDNode>(Val: N->getOperand(Num: Op0)))
5519 return SDValue();
5520
5521 APInt X = N->getConstantOperandAPInt(Num: Op0);
5522
5523 bool Overflow;
5524 APInt NumActiveElems =
5525 IsSigned ? Y.ssub_ov(RHS: X, Overflow) : Y.usub_ov(RHS: X, Overflow);
5526
5527 if (Overflow)
5528 return SDValue();
5529
5530 if (IsEqual) {
5531 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5532 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(RHS: One, Overflow)
5533 : NumActiveElems.uadd_ov(RHS: One, Overflow);
5534 if (Overflow)
5535 return SDValue();
5536 }
5537
5538 std::optional<unsigned> PredPattern =
5539 getSVEPredPatternFromNumElements(MinNumElts: NumActiveElems.getZExtValue());
5540 unsigned MinSVEVectorSize = std::max(
5541 a: DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), b: 128u);
5542 unsigned ElementSize = 128 / N->getValueType(ResNo: 0).getVectorMinNumElements();
5543 if (PredPattern != std::nullopt &&
5544 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5545 return getPTrue(DAG, DL, VT: N->getValueType(ResNo: 0), Pattern: *PredPattern);
5546
5547 return SDValue();
5548}
5549
5550// Returns a safe bitcast between two scalable vector predicates, where
5551// any newly created lanes from a widening bitcast are defined as zero.
5552static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5553 SDLoc DL(Op);
5554 EVT InVT = Op.getValueType();
5555
5556 assert(InVT.getVectorElementType() == MVT::i1 &&
5557 VT.getVectorElementType() == MVT::i1 &&
5558 "Expected a predicate-to-predicate bitcast");
5559 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5560 InVT.isScalableVector() &&
5561 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5562 "Only expect to cast between legal scalable predicate types!");
5563
5564  // Return the operand if the cast isn't changing type.
5565 if (InVT == VT)
5566 return Op;
5567
5568 // Look through casts to <vscale x 16 x i1> when their input has more lanes
5569 // than VT. This will increase the chances of removing casts that introduce
5570 // new lanes, which have to be explicitly zero'd.
5571 if (Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
5572 Op.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_convert_to_svbool &&
5573 Op.getOperand(i: 1).getValueType().bitsGT(VT))
5574 Op = Op.getOperand(i: 1);
5575
5576 SDValue Reinterpret = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT, Operand: Op);
5577
5578 // We only have to zero the lanes if new lanes are being defined, e.g. when
5579 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5580 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5581 // we can return here.
5582 if (InVT.bitsGT(VT))
5583 return Reinterpret;
5584
5585 // Check if the other lanes are already known to be zeroed by
5586 // construction.
5587 if (isZeroingInactiveLanes(Op))
5588 return Reinterpret;
5589
5590 // Zero the newly introduced lanes.
5591 SDValue Mask = DAG.getConstant(Val: 1, DL, VT: InVT);
5592 Mask = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT, Operand: Mask);
5593 return DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Reinterpret, N2: Mask);
5594}
5595
5596SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5597 SDValue Chain, SDLoc DL,
5598 EVT VT) const {
5599 SDValue Callee = DAG.getExternalSymbol(Sym: "__arm_sme_state",
5600 VT: getPointerTy(DL: DAG.getDataLayout()));
5601 Type *Int64Ty = Type::getInt64Ty(C&: *DAG.getContext());
5602 Type *RetTy = StructType::get(elt1: Int64Ty, elts: Int64Ty);
5603 TargetLowering::CallLoweringInfo CLI(DAG);
5604 ArgListTy Args;
5605 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5606 CC: CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
5607 ResultType: RetTy, Target: Callee, ArgsList: std::move(Args));
5608 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5609 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ Val: 1, DL, VT: MVT::i64);
5610 return DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i64, N1: CallResult.first.getOperand(i: 0),
5611 N2: Mask);
5612}
5613
5614// Lower an SME LDR/STR ZA intrinsic
5615// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5616// folded into the instruction
5617// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5618// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5619// and tile slice registers
5620// ldr(%tileslice, %ptr, %vecnum)
5621// ->
5622// %svl = rdsvl
5623// %ptr2 = %ptr + %svl * %vecnum
5624// %tileslice2 = %tileslice + %vecnum
5625// ldr [%tileslice2, 0], [%ptr2, 0]
5626 // Case 3: If the vecnum is an immediate out of range, then the same is done
5627 // as case 2, but the base and slice registers are modified by the vecnum
5628 // rounded towards zero to a multiple of 16 and the remainder is folded into
5629 // the instruction. This means that successive loads and stores that are
5630 // offset from each other can share the same base and slice register updates.
5631 // ldr(%tileslice, %ptr, 22)
5632 // ldr(%tileslice, %ptr, 23)
5633 // ->
5634 // %svl = rdsvl
5635 // %ptr2 = %ptr + %svl * 16
5636 // %tileslice2 = %tileslice + 16
5637 // ldr [%tileslice2, 6], [%ptr2, 6]
5638 // ldr [%tileslice2, 7], [%ptr2, 7]
5639// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5640// operand and the immediate can be folded into the instruction, like case 2.
5641// ldr(%tileslice, %ptr, %vecnum + 7)
5642// ldr(%tileslice, %ptr, %vecnum + 8)
5643// ->
5644// %svl = rdsvl
5645// %ptr2 = %ptr + %svl * %vecnum
5646// %tileslice2 = %tileslice + %vecnum
5647// ldr [%tileslice2, 7], [%ptr2, 7]
5648// ldr [%tileslice2, 8], [%ptr2, 8]
5649// Case 5: The vecnum being an add of an immediate out of range is also handled,
5650// in which case the same remainder logic as case 3 is used.
5651SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5652 SDLoc DL(N);
5653
5654 SDValue TileSlice = N->getOperand(Num: 2);
5655 SDValue Base = N->getOperand(Num: 3);
5656 SDValue VecNum = N->getOperand(Num: 4);
5657 int32_t ConstAddend = 0;
5658 SDValue VarAddend = VecNum;
5659
5660 // If the vnum is an add of an immediate, we can fold it into the instruction
5661 if (VecNum.getOpcode() == ISD::ADD &&
5662 isa<ConstantSDNode>(Val: VecNum.getOperand(i: 1))) {
5663 ConstAddend = cast<ConstantSDNode>(Val: VecNum.getOperand(i: 1))->getSExtValue();
5664 VarAddend = VecNum.getOperand(i: 0);
5665 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(Val&: VecNum)) {
5666 ConstAddend = ImmNode->getSExtValue();
5667 VarAddend = SDValue();
5668 }
5669
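  // Split the constant part into a multiple of 16, which is applied to the
  // base and tile slice below, and a remainder that fits the instruction's
  // immediate. For example, ConstAddend = 22 gives C = 16 and ImmAddend = 6.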
5670 int32_t ImmAddend = ConstAddend % 16;
5671 if (int32_t C = (ConstAddend - ImmAddend)) {
5672 SDValue CVal = DAG.getTargetConstant(Val: C, DL, VT: MVT::i32);
5673 VarAddend = VarAddend
5674 ? DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, Ops: {VarAddend, CVal})
5675 : CVal;
5676 }
5677
5678 if (VarAddend) {
5679 // Get the vector length that will be multiplied by vnum
5680 auto SVL = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL, VT: MVT::i64,
5681 Operand: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
5682
5683 // Multiply SVL and vnum then add it to the base
5684 SDValue Mul = DAG.getNode(
5685 Opcode: ISD::MUL, DL, VT: MVT::i64,
5686 Ops: {SVL, DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: MVT::i64, Operand: VarAddend)});
5687 Base = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, Ops: {Base, Mul});
5688 // Just add vnum to the tileslice
5689 TileSlice = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, Ops: {TileSlice, VarAddend});
5690 }
5691
5692 return DAG.getNode(Opcode: IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
5693 DL, VT: MVT::Other,
5694 Ops: {/*Chain=*/N.getOperand(i: 0), TileSlice, Base,
5695 DAG.getTargetConstant(Val: ImmAddend, DL, VT: MVT::i32)});
5696}
5697
5698SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG) {
5699 SDLoc DL(Op);
5700 SDValue ID =
5701 DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_match, DL, VT: MVT::i64);
5702
5703 auto Op1 = Op.getOperand(i: 1);
5704 auto Op2 = Op.getOperand(i: 2);
5705 auto Mask = Op.getOperand(i: 3);
5706
5707 EVT Op1VT = Op1.getValueType();
5708 EVT Op2VT = Op2.getValueType();
5709 EVT ResVT = Op.getValueType();
5710
5711 assert((Op1VT.getVectorElementType() == MVT::i8 ||
5712 Op1VT.getVectorElementType() == MVT::i16) &&
5713 "Expected 8-bit or 16-bit characters.");
5714
5715 // Scalable vector type used to wrap operands.
5716 // A single container is enough for both operands because ultimately the
5717 // operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
5718 EVT OpContainerVT = Op1VT.isScalableVector()
5719 ? Op1VT
5720 : getContainerForFixedLengthVector(DAG, VT: Op1VT);
5721
5722 if (Op2VT.is128BitVector()) {
5723 // If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
5724 Op2 = convertToScalableVector(DAG, VT: OpContainerVT, V: Op2);
5725 // Further, if the result is scalable, broadcast Op2 to a full SVE register.
5726 if (ResVT.isScalableVector())
5727 Op2 = DAG.getNode(Opcode: AArch64ISD::DUPLANE128, DL, VT: OpContainerVT, N1: Op2,
5728 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i64));
5729 } else {
5730 // If Op2 is not a full 128-bit vector, we always need to broadcast it.
5731 unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
5732 MVT Op2IntVT = MVT::getIntegerVT(BitWidth: Op2BitWidth);
5733 EVT Op2PromotedVT = getPackedSVEVectorVT(VT: Op2IntVT);
5734 Op2 = DAG.getBitcast(VT: MVT::getVectorVT(VT: Op2IntVT, NumElements: 1), V: Op2);
5735 Op2 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: Op2IntVT, N1: Op2,
5736 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
5737 Op2 = DAG.getSplatVector(VT: Op2PromotedVT, DL, Op: Op2);
5738 Op2 = DAG.getBitcast(VT: OpContainerVT, V: Op2);
5739 }
5740
5741 // If the result is scalable, we just need to carry out the MATCH.
5742 if (ResVT.isScalableVector())
5743 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: ResVT, N1: ID, N2: Mask, N3: Op1, N4: Op2);
5744
5745 // If the result is fixed, we can still use MATCH but we need to wrap the
5746 // first operand and the mask in scalable vectors before doing so.
5747
5748 // Wrap the operands.
5749 Op1 = convertToScalableVector(DAG, VT: OpContainerVT, V: Op1);
5750 Mask = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: Op1VT, Operand: Mask);
5751 Mask = convertFixedMaskToScalableVector(Mask, DAG);
5752
5753 // Carry out the match.
5754 SDValue Match = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Mask.getValueType(),
5755 N1: ID, N2: Mask, N3: Op1, N4: Op2);
5756
5757 // Extract and promote the match result (nxv16i1/nxv8i1) to ResVT
5758 // (v16i8/v8i8).
5759 Match = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: OpContainerVT, Operand: Match);
5760 Match = convertFromScalableVector(DAG, VT: Op1VT, V: Match);
5761 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ResVT, Operand: Match);
5762}
5763
5764SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5765 SelectionDAG &DAG) const {
5766 unsigned IntNo = Op.getConstantOperandVal(i: 1);
5767 SDLoc DL(Op);
5768 switch (IntNo) {
5769 default:
5770 return SDValue(); // Don't custom lower most intrinsics.
5771 case Intrinsic::aarch64_prefetch: {
5772 SDValue Chain = Op.getOperand(i: 0);
5773 SDValue Addr = Op.getOperand(i: 2);
5774
5775 unsigned IsWrite = Op.getConstantOperandVal(i: 3);
5776 unsigned Locality = Op.getConstantOperandVal(i: 4);
5777 unsigned IsStream = Op.getConstantOperandVal(i: 5);
5778 unsigned IsData = Op.getConstantOperandVal(i: 6);
5779 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5780 (!IsData << 3) | // IsDataCache bit
5781 (Locality << 1) | // Cache level bits
5782 (unsigned)IsStream; // Stream bit
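    // For example, a read prefetch of data into L1 with the keep policy
    // (IsWrite=0, IsData=1, Locality=0, IsStream=0) encodes as 0b00000,
    // i.e. PLDL1KEEP.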
5783
5784 return DAG.getNode(Opcode: AArch64ISD::PREFETCH, DL, VT: MVT::Other, N1: Chain,
5785 N2: DAG.getTargetConstant(Val: PrfOp, DL, VT: MVT::i32), N3: Addr);
5786 }
5787 case Intrinsic::aarch64_sme_str:
5788 case Intrinsic::aarch64_sme_ldr: {
5789 return LowerSMELdrStr(N: Op, DAG, IsLoad: IntNo == Intrinsic::aarch64_sme_ldr);
5790 }
5791 case Intrinsic::aarch64_sme_za_enable:
5792 return DAG.getNode(
5793 Opcode: AArch64ISD::SMSTART, DL, VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue),
5794 N1: Op->getOperand(Num: 0), // Chain
5795 N2: DAG.getTargetConstant(Val: (int32_t)(AArch64SVCR::SVCRZA), DL, VT: MVT::i32));
5796 case Intrinsic::aarch64_sme_za_disable:
5797 return DAG.getNode(
5798 Opcode: AArch64ISD::SMSTOP, DL, VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue),
5799 N1: Op->getOperand(Num: 0), // Chain
5800 N2: DAG.getTargetConstant(Val: (int32_t)(AArch64SVCR::SVCRZA), DL, VT: MVT::i32));
5801 }
5802}
5803
5804SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5805 SelectionDAG &DAG) const {
5806 unsigned IntNo = Op.getConstantOperandVal(i: 1);
5807 SDLoc DL(Op);
5808 switch (IntNo) {
5809 default:
5810 return SDValue(); // Don't custom lower most intrinsics.
5811 case Intrinsic::aarch64_mops_memset_tag: {
5812 auto Node = cast<MemIntrinsicSDNode>(Val: Op.getNode());
5813 SDValue Chain = Node->getChain();
5814 SDValue Dst = Op.getOperand(i: 2);
5815 SDValue Val = Op.getOperand(i: 3);
5816 Val = DAG.getAnyExtOrTrunc(Op: Val, DL, VT: MVT::i64);
5817 SDValue Size = Op.getOperand(i: 4);
5818 auto Alignment = Node->getMemOperand()->getAlign();
5819 bool IsVol = Node->isVolatile();
5820 auto DstPtrInfo = Node->getPointerInfo();
5821
5822 const auto &SDI =
5823 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5824 SDValue MS = SDI.EmitMOPS(Opcode: AArch64::MOPSMemorySetTaggingPseudo, DAG, DL,
5825 Chain, Dst, SrcOrValue: Val, Size, Alignment, isVolatile: IsVol,
5826 DstPtrInfo, SrcPtrInfo: MachinePointerInfo{});
5827
5828 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
5829 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
5830 // LowerOperationWrapper will complain that the number of results has
5831 // changed.
5832 return DAG.getMergeValues(Ops: {MS.getValue(R: 0), MS.getValue(R: 2)}, dl: DL);
5833 }
5834 }
5835}
5836
5837SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5838 SelectionDAG &DAG) const {
5839 unsigned IntNo = Op.getConstantOperandVal(i: 0);
5840 SDLoc DL(Op);
5841 switch (IntNo) {
5842 default: return SDValue(); // Don't custom lower most intrinsics.
5843 case Intrinsic::thread_pointer: {
5844 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
5845 return DAG.getNode(Opcode: AArch64ISD::THREAD_POINTER, DL, VT: PtrVT);
5846 }
5847 case Intrinsic::aarch64_neon_abs: {
5848 EVT Ty = Op.getValueType();
5849 if (Ty == MVT::i64) {
5850 SDValue Result =
5851 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v1i64, Operand: Op.getOperand(i: 1));
5852 Result = DAG.getNode(Opcode: ISD::ABS, DL, VT: MVT::v1i64, Operand: Result);
5853 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: Result);
5854 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(VT: Ty)) {
5855 return DAG.getNode(Opcode: ISD::ABS, DL, VT: Ty, Operand: Op.getOperand(i: 1));
5856 } else {
5857 report_fatal_error(reason: "Unexpected type for AArch64 NEON intrinsic");
5858 }
5859 }
5860 case Intrinsic::aarch64_neon_pmull64: {
5861 SDValue LHS = Op.getOperand(i: 1);
5862 SDValue RHS = Op.getOperand(i: 2);
5863
5864 std::optional<uint64_t> LHSLane =
5865 getConstantLaneNumOfExtractHalfOperand(Op&: LHS);
5866 std::optional<uint64_t> RHSLane =
5867 getConstantLaneNumOfExtractHalfOperand(Op&: RHS);
5868
5869 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
5870 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
5871
5872    // 'aarch64_neon_pmull64' takes i64 parameters, while the pmull/pmull2
5873    // instructions execute on SIMD registers, so canonicalize i64 to v1i64,
5874    // which ISel recognizes better. For example, prefer an ldr straight into a
5875    // d* register over a GPR load followed by an fmov:
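    //   ldr  d0, [x0]
    // rather than
    //   ldr  x8, [x0]
    //   fmov d0, x8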
5876 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
5877 std::optional<uint64_t> OtherLane,
5878 const SDLoc &DL,
5879 SelectionDAG &DAG) -> SDValue {
5880      // If the operand is a higher half itself, rewrite it to
5881      // extract_high_v2i64; this way aarch64_neon_pmull64 can reuse
5882      // the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
5883 if (NLane == 1)
5884 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v1i64,
5885 N1: N.getOperand(i: 0), N2: DAG.getConstant(Val: 1, DL, VT: MVT::i64));
5886
5887 // Operand N is not a higher half but the other operand is.
5888 if (OtherLane == 1) {
5889 // If this operand is a lower half, rewrite it to
5890 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
5891 // align lanes of two operands. A roundtrip sequence (to move from lane
5892 // 1 to lane 0) is like this:
5893 // mov x8, v0.d[1]
5894 // fmov d0, x8
5895 if (NLane == 0)
5896 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v1i64,
5897 N1: DAG.getNode(Opcode: AArch64ISD::DUPLANE64, DL, VT: MVT::v2i64,
5898 N1: N.getOperand(i: 0),
5899 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64)),
5900 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i64));
5901
5902 // Otherwise just dup from main to all lanes.
5903 return DAG.getNode(Opcode: AArch64ISD::DUP, DL, VT: MVT::v1i64, Operand: N);
5904 }
5905
5906      // Neither operand is an extract of the higher half, so codegen may just
5907      // use the non-high version of the PMULL instruction; use v1i64 for i64.
5908 assert(N.getValueType() == MVT::i64 &&
5909 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
5910 return DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: MVT::v1i64, Operand: N);
5911 };
5912
5913 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, DL, DAG);
5914 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, DL, DAG);
5915
5916 return DAG.getNode(Opcode: AArch64ISD::PMULL, DL, VT: Op.getValueType(), N1: LHS, N2: RHS);
5917 }
5918 case Intrinsic::aarch64_neon_smax:
5919 return DAG.getNode(Opcode: ISD::SMAX, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
5920 N2: Op.getOperand(i: 2));
5921 case Intrinsic::aarch64_neon_umax:
5922 return DAG.getNode(Opcode: ISD::UMAX, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
5923 N2: Op.getOperand(i: 2));
5924 case Intrinsic::aarch64_neon_smin:
5925 return DAG.getNode(Opcode: ISD::SMIN, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
5926 N2: Op.getOperand(i: 2));
5927 case Intrinsic::aarch64_neon_umin:
5928 return DAG.getNode(Opcode: ISD::UMIN, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
5929 N2: Op.getOperand(i: 2));
5930 case Intrinsic::aarch64_neon_scalar_sqxtn:
5931 case Intrinsic::aarch64_neon_scalar_sqxtun:
5932 case Intrinsic::aarch64_neon_scalar_uqxtn: {
5933 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
5934 if (Op.getValueType() == MVT::i32)
5935 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32,
5936 Operand: DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::f32,
5937 N1: Op.getOperand(i: 0),
5938 N2: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f64,
5939 Operand: Op.getOperand(i: 1))));
5940 return SDValue();
5941 }
5942 case Intrinsic::aarch64_neon_sqxtn:
5943 return DAG.getNode(Opcode: ISD::TRUNCATE_SSAT_S, DL, VT: Op.getValueType(),
5944 Operand: Op.getOperand(i: 1));
5945 case Intrinsic::aarch64_neon_sqxtun:
5946 return DAG.getNode(Opcode: ISD::TRUNCATE_SSAT_U, DL, VT: Op.getValueType(),
5947 Operand: Op.getOperand(i: 1));
5948 case Intrinsic::aarch64_neon_uqxtn:
5949 return DAG.getNode(Opcode: ISD::TRUNCATE_USAT_U, DL, VT: Op.getValueType(),
5950 Operand: Op.getOperand(i: 1));
5951 case Intrinsic::aarch64_neon_sqshrn:
5952 if (Op.getValueType().isVector())
5953 return DAG.getNode(Opcode: ISD::TRUNCATE_SSAT_S, DL, VT: Op.getValueType(),
5954 Operand: DAG.getNode(Opcode: AArch64ISD::VASHR, DL,
5955 VT: Op.getOperand(i: 1).getValueType(),
5956 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)));
5957 return SDValue();
5958 case Intrinsic::aarch64_neon_sqshrun:
5959 if (Op.getValueType().isVector())
5960 return DAG.getNode(Opcode: ISD::TRUNCATE_SSAT_U, DL, VT: Op.getValueType(),
5961 Operand: DAG.getNode(Opcode: AArch64ISD::VASHR, DL,
5962 VT: Op.getOperand(i: 1).getValueType(),
5963 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)));
5964 return SDValue();
5965 case Intrinsic::aarch64_neon_uqshrn:
5966 if (Op.getValueType().isVector())
5967 return DAG.getNode(Opcode: ISD::TRUNCATE_USAT_U, DL, VT: Op.getValueType(),
5968 Operand: DAG.getNode(Opcode: AArch64ISD::VLSHR, DL,
5969 VT: Op.getOperand(i: 1).getValueType(),
5970 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)));
5971 return SDValue();
5972 case Intrinsic::aarch64_neon_sqrshrn:
5973 if (Op.getValueType().isVector())
5974 return DAG.getNode(Opcode: ISD::TRUNCATE_SSAT_S, DL, VT: Op.getValueType(),
5975 Operand: DAG.getNode(Opcode: AArch64ISD::SRSHR_I, DL,
5976 VT: Op.getOperand(i: 1).getValueType(),
5977 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)));
5978 return SDValue();
5979 case Intrinsic::aarch64_neon_sqrshrun:
5980 if (Op.getValueType().isVector())
5981 return DAG.getNode(Opcode: ISD::TRUNCATE_SSAT_U, DL, VT: Op.getValueType(),
5982 Operand: DAG.getNode(Opcode: AArch64ISD::SRSHR_I, DL,
5983 VT: Op.getOperand(i: 1).getValueType(),
5984 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)));
5985 return SDValue();
5986 case Intrinsic::aarch64_neon_uqrshrn:
5987 if (Op.getValueType().isVector())
5988 return DAG.getNode(Opcode: ISD::TRUNCATE_USAT_U, DL, VT: Op.getValueType(),
5989 Operand: DAG.getNode(Opcode: AArch64ISD::URSHR_I, DL,
5990 VT: Op.getOperand(i: 1).getValueType(),
5991 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)));
5992 return SDValue();
5993 case Intrinsic::aarch64_neon_sqadd:
5994 if (Op.getValueType().isVector())
5995 return DAG.getNode(Opcode: ISD::SADDSAT, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
5996 N2: Op.getOperand(i: 2));
5997 return SDValue();
5998 case Intrinsic::aarch64_neon_sqsub:
5999 if (Op.getValueType().isVector())
6000 return DAG.getNode(Opcode: ISD::SSUBSAT, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
6001 N2: Op.getOperand(i: 2));
6002 return SDValue();
6003 case Intrinsic::aarch64_neon_uqadd:
6004 if (Op.getValueType().isVector())
6005 return DAG.getNode(Opcode: ISD::UADDSAT, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
6006 N2: Op.getOperand(i: 2));
6007 return SDValue();
6008 case Intrinsic::aarch64_neon_uqsub:
6009 if (Op.getValueType().isVector())
6010 return DAG.getNode(Opcode: ISD::USUBSAT, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
6011 N2: Op.getOperand(i: 2));
6012 return SDValue();
6013 case Intrinsic::aarch64_sve_whilelt:
6014 return optimizeIncrementingWhile(N: Op.getNode(), DAG, /*IsSigned=*/true,
6015 /*IsEqual=*/false);
6016 case Intrinsic::aarch64_sve_whilels:
6017 return optimizeIncrementingWhile(N: Op.getNode(), DAG, /*IsSigned=*/false,
6018 /*IsEqual=*/true);
6019 case Intrinsic::aarch64_sve_whilele:
6020 return optimizeIncrementingWhile(N: Op.getNode(), DAG, /*IsSigned=*/true,
6021 /*IsEqual=*/true);
6022 case Intrinsic::aarch64_sve_sunpkhi:
6023 return DAG.getNode(Opcode: AArch64ISD::SUNPKHI, DL, VT: Op.getValueType(),
6024 Operand: Op.getOperand(i: 1));
6025 case Intrinsic::aarch64_sve_sunpklo:
6026 return DAG.getNode(Opcode: AArch64ISD::SUNPKLO, DL, VT: Op.getValueType(),
6027 Operand: Op.getOperand(i: 1));
6028 case Intrinsic::aarch64_sve_uunpkhi:
6029 return DAG.getNode(Opcode: AArch64ISD::UUNPKHI, DL, VT: Op.getValueType(),
6030 Operand: Op.getOperand(i: 1));
6031 case Intrinsic::aarch64_sve_uunpklo:
6032 return DAG.getNode(Opcode: AArch64ISD::UUNPKLO, DL, VT: Op.getValueType(),
6033 Operand: Op.getOperand(i: 1));
6034 case Intrinsic::aarch64_sve_clasta_n:
6035 return DAG.getNode(Opcode: AArch64ISD::CLASTA_N, DL, VT: Op.getValueType(),
6036 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
6037 case Intrinsic::aarch64_sve_clastb_n:
6038 return DAG.getNode(Opcode: AArch64ISD::CLASTB_N, DL, VT: Op.getValueType(),
6039 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
6040 case Intrinsic::aarch64_sve_lasta:
6041 return DAG.getNode(Opcode: AArch64ISD::LASTA, DL, VT: Op.getValueType(),
6042 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
6043 case Intrinsic::aarch64_sve_lastb:
6044 return DAG.getNode(Opcode: AArch64ISD::LASTB, DL, VT: Op.getValueType(),
6045 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
6046 case Intrinsic::aarch64_sve_rev:
6047 return DAG.getNode(Opcode: ISD::VECTOR_REVERSE, DL, VT: Op.getValueType(),
6048 Operand: Op.getOperand(i: 1));
6049 case Intrinsic::aarch64_sve_tbl:
6050 return DAG.getNode(Opcode: AArch64ISD::TBL, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
6051 N2: Op.getOperand(i: 2));
6052 case Intrinsic::aarch64_sve_trn1:
6053 return DAG.getNode(Opcode: AArch64ISD::TRN1, DL, VT: Op.getValueType(),
6054 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
6055 case Intrinsic::aarch64_sve_trn2:
6056 return DAG.getNode(Opcode: AArch64ISD::TRN2, DL, VT: Op.getValueType(),
6057 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
6058 case Intrinsic::aarch64_sve_uzp1:
6059 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: Op.getValueType(),
6060 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
6061 case Intrinsic::aarch64_sve_uzp2:
6062 return DAG.getNode(Opcode: AArch64ISD::UZP2, DL, VT: Op.getValueType(),
6063 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
6064 case Intrinsic::aarch64_sve_zip1:
6065 return DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: Op.getValueType(),
6066 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
6067 case Intrinsic::aarch64_sve_zip2:
6068 return DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT: Op.getValueType(),
6069 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
6070 case Intrinsic::aarch64_sve_splice:
6071 return DAG.getNode(Opcode: AArch64ISD::SPLICE, DL, VT: Op.getValueType(),
6072 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
6073 case Intrinsic::aarch64_sve_ptrue:
6074 return getPTrue(DAG, DL, VT: Op.getValueType(), Pattern: Op.getConstantOperandVal(i: 1));
6075 case Intrinsic::aarch64_sve_clz:
6076 return DAG.getNode(Opcode: AArch64ISD::CTLZ_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6077 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6078 case Intrinsic::aarch64_sme_cntsb:
6079 return DAG.getNode(Opcode: AArch64ISD::RDSVL, DL, VT: Op.getValueType(),
6080 Operand: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
6081 case Intrinsic::aarch64_sme_cntsh: {
6082 SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i32);
6083 SDValue Bytes = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL, VT: Op.getValueType(), Operand: One);
6084 return DAG.getNode(Opcode: ISD::SRL, DL, VT: Op.getValueType(), N1: Bytes, N2: One);
6085 }
6086 case Intrinsic::aarch64_sme_cntsw: {
6087 SDValue Bytes = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL, VT: Op.getValueType(),
6088 Operand: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
6089 return DAG.getNode(Opcode: ISD::SRL, DL, VT: Op.getValueType(), N1: Bytes,
6090 N2: DAG.getConstant(Val: 2, DL, VT: MVT::i32));
6091 }
6092 case Intrinsic::aarch64_sme_cntsd: {
6093 SDValue Bytes = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL, VT: Op.getValueType(),
6094 Operand: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
6095 return DAG.getNode(Opcode: ISD::SRL, DL, VT: Op.getValueType(), N1: Bytes,
6096 N2: DAG.getConstant(Val: 3, DL, VT: MVT::i32));
6097 }
6098 case Intrinsic::aarch64_sve_cnt: {
6099 SDValue Data = Op.getOperand(i: 3);
6100 // CTPOP only supports integer operands.
6101 if (Data.getValueType().isFloatingPoint())
6102 Data = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op.getValueType(), Operand: Data);
6103 return DAG.getNode(Opcode: AArch64ISD::CTPOP_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6104 N1: Op.getOperand(i: 2), N2: Data, N3: Op.getOperand(i: 1));
6105 }
6106 case Intrinsic::aarch64_sve_dupq_lane:
6107 return LowerDUPQLane(Op, DAG);
6108 case Intrinsic::aarch64_sve_convert_from_svbool:
6109 if (Op.getValueType() == MVT::aarch64svcount)
6110 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op.getValueType(), Operand: Op.getOperand(i: 1));
6111 return getSVEPredicateBitCast(VT: Op.getValueType(), Op: Op.getOperand(i: 1), DAG);
6112 case Intrinsic::aarch64_sve_convert_to_svbool:
6113 if (Op.getOperand(i: 1).getValueType() == MVT::aarch64svcount)
6114 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv16i1, Operand: Op.getOperand(i: 1));
6115 return getSVEPredicateBitCast(VT: MVT::nxv16i1, Op: Op.getOperand(i: 1), DAG);
6116 case Intrinsic::aarch64_sve_fneg:
6117 return DAG.getNode(Opcode: AArch64ISD::FNEG_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6118 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6119 case Intrinsic::aarch64_sve_frintp:
6120 return DAG.getNode(Opcode: AArch64ISD::FCEIL_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6121 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6122 case Intrinsic::aarch64_sve_frintm:
6123 return DAG.getNode(Opcode: AArch64ISD::FFLOOR_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6124 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6125 case Intrinsic::aarch64_sve_frinti:
6126 return DAG.getNode(Opcode: AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, DL,
6127 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6128 N3: Op.getOperand(i: 1));
6129 case Intrinsic::aarch64_sve_frintx:
6130 return DAG.getNode(Opcode: AArch64ISD::FRINT_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6131 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6132 case Intrinsic::aarch64_sve_frinta:
6133 return DAG.getNode(Opcode: AArch64ISD::FROUND_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6134 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6135 case Intrinsic::aarch64_sve_frintn:
6136 return DAG.getNode(Opcode: AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, DL,
6137 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6138 N3: Op.getOperand(i: 1));
6139 case Intrinsic::aarch64_sve_frintz:
6140 return DAG.getNode(Opcode: AArch64ISD::FTRUNC_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6141 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6142 case Intrinsic::aarch64_sve_ucvtf:
6143 return DAG.getNode(Opcode: AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, DL,
6144 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6145 N3: Op.getOperand(i: 1));
6146 case Intrinsic::aarch64_sve_scvtf:
6147 return DAG.getNode(Opcode: AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, DL,
6148 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6149 N3: Op.getOperand(i: 1));
6150 case Intrinsic::aarch64_sve_fcvtzu:
6151 return DAG.getNode(Opcode: AArch64ISD::FCVTZU_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6152 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6153 case Intrinsic::aarch64_sve_fcvtzs:
6154 return DAG.getNode(Opcode: AArch64ISD::FCVTZS_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6155 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6156 case Intrinsic::aarch64_sve_fsqrt:
6157 return DAG.getNode(Opcode: AArch64ISD::FSQRT_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6158 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6159 case Intrinsic::aarch64_sve_frecpx:
6160 return DAG.getNode(Opcode: AArch64ISD::FRECPX_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6161 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6162 case Intrinsic::aarch64_sve_frecpe_x:
6163 return DAG.getNode(Opcode: AArch64ISD::FRECPE, DL, VT: Op.getValueType(),
6164 Operand: Op.getOperand(i: 1));
6165 case Intrinsic::aarch64_sve_frecps_x:
6166 return DAG.getNode(Opcode: AArch64ISD::FRECPS, DL, VT: Op.getValueType(),
6167 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
6168 case Intrinsic::aarch64_sve_frsqrte_x:
6169 return DAG.getNode(Opcode: AArch64ISD::FRSQRTE, DL, VT: Op.getValueType(),
6170 Operand: Op.getOperand(i: 1));
6171 case Intrinsic::aarch64_sve_frsqrts_x:
6172 return DAG.getNode(Opcode: AArch64ISD::FRSQRTS, DL, VT: Op.getValueType(),
6173 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
6174 case Intrinsic::aarch64_sve_fabs:
6175 return DAG.getNode(Opcode: AArch64ISD::FABS_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6176 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6177 case Intrinsic::aarch64_sve_abs:
6178 return DAG.getNode(Opcode: AArch64ISD::ABS_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6179 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6180 case Intrinsic::aarch64_sve_neg:
6181 return DAG.getNode(Opcode: AArch64ISD::NEG_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6182 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6183 case Intrinsic::aarch64_sve_insr: {
6184 SDValue Scalar = Op.getOperand(i: 2);
6185 EVT ScalarTy = Scalar.getValueType();
6186 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
6187 Scalar = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Scalar);
6188
6189 return DAG.getNode(Opcode: AArch64ISD::INSR, DL, VT: Op.getValueType(),
6190 N1: Op.getOperand(i: 1), N2: Scalar);
6191 }
6192 case Intrinsic::aarch64_sve_rbit:
6193 return DAG.getNode(Opcode: AArch64ISD::BITREVERSE_MERGE_PASSTHRU, DL,
6194 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6195 N3: Op.getOperand(i: 1));
6196 case Intrinsic::aarch64_sve_revb:
6197 return DAG.getNode(Opcode: AArch64ISD::BSWAP_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6198 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6199 case Intrinsic::aarch64_sve_revh:
6200 return DAG.getNode(Opcode: AArch64ISD::REVH_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6201 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6202 case Intrinsic::aarch64_sve_revw:
6203 return DAG.getNode(Opcode: AArch64ISD::REVW_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6204 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6205 case Intrinsic::aarch64_sve_revd:
6206 return DAG.getNode(Opcode: AArch64ISD::REVD_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6207 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6208 case Intrinsic::aarch64_sve_sxtb:
6209 return DAG.getNode(
6210 Opcode: AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6211 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6212 N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i8)),
6213 N4: Op.getOperand(i: 1));
6214 case Intrinsic::aarch64_sve_sxth:
6215 return DAG.getNode(
6216 Opcode: AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6217 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6218 N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i16)),
6219 N4: Op.getOperand(i: 1));
6220 case Intrinsic::aarch64_sve_sxtw:
6221 return DAG.getNode(
6222 Opcode: AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6223 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6224 N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i32)),
6225 N4: Op.getOperand(i: 1));
6226 case Intrinsic::aarch64_sve_uxtb:
6227 return DAG.getNode(
6228 Opcode: AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6229 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6230 N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i8)),
6231 N4: Op.getOperand(i: 1));
6232 case Intrinsic::aarch64_sve_uxth:
6233 return DAG.getNode(
6234 Opcode: AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6235 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6236 N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i16)),
6237 N4: Op.getOperand(i: 1));
6238 case Intrinsic::aarch64_sve_uxtw:
6239 return DAG.getNode(
6240 Opcode: AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6241 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6242 N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i32)),
6243 N4: Op.getOperand(i: 1));
6244 case Intrinsic::localaddress: {
6245 const auto &MF = DAG.getMachineFunction();
6246 const auto *RegInfo = Subtarget->getRegisterInfo();
6247 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
6248 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg,
6249 VT: Op.getSimpleValueType());
6250 }
6251
6252 case Intrinsic::eh_recoverfp: {
6253 // FIXME: This needs to be implemented to correctly handle highly aligned
    // stack objects. For now we simply return the incoming FP. Refer to
    // D53541 for more details.
6256 SDValue FnOp = Op.getOperand(i: 1);
6257 SDValue IncomingFPOp = Op.getOperand(i: 2);
6258 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Val&: FnOp);
6259 auto *Fn = dyn_cast_or_null<Function>(Val: GSD ? GSD->getGlobal() : nullptr);
6260 if (!Fn)
6261 report_fatal_error(
6262 reason: "llvm.eh.recoverfp must take a function as the first argument");
6263 return IncomingFPOp;
6264 }
6265
6266 case Intrinsic::aarch64_neon_vsri:
6267 case Intrinsic::aarch64_neon_vsli:
6268 case Intrinsic::aarch64_sve_sri:
6269 case Intrinsic::aarch64_sve_sli: {
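    // The NEON and SVE shift-and-insert intrinsics share one lowering: pick
    // VSRI or VSLI based on the shift direction and forward the destination,
    // source and immediate shift amount unchanged.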
6270 EVT Ty = Op.getValueType();
6271
6272 if (!Ty.isVector())
6273 report_fatal_error(reason: "Unexpected type for aarch64_neon_vsli");
6274
6275 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
6276
6277 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
6278 IntNo == Intrinsic::aarch64_sve_sri;
6279 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
6280 return DAG.getNode(Opcode, DL, VT: Ty, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2),
6281 N3: Op.getOperand(i: 3));
6282 }
6283
6284 case Intrinsic::aarch64_neon_srhadd:
6285 case Intrinsic::aarch64_neon_urhadd:
6286 case Intrinsic::aarch64_neon_shadd:
6287 case Intrinsic::aarch64_neon_uhadd: {
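    // Map the NEON (rounding) halving adds onto the generic averaging nodes:
    // s/u selects the signedness, and the rounding variants round up
    // (AVGCEIL) while the plain halving variants truncate (AVGFLOOR).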
6288 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6289 IntNo == Intrinsic::aarch64_neon_shadd);
6290 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6291 IntNo == Intrinsic::aarch64_neon_urhadd);
6292 unsigned Opcode = IsSignedAdd
6293 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
6294 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
6295 return DAG.getNode(Opcode, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
6296 N2: Op.getOperand(i: 2));
6297 }
6298 case Intrinsic::aarch64_neon_saddlp:
6299 case Intrinsic::aarch64_neon_uaddlp: {
6300 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
6301 ? AArch64ISD::UADDLP
6302 : AArch64ISD::SADDLP;
6303 return DAG.getNode(Opcode, DL, VT: Op.getValueType(), Operand: Op.getOperand(i: 1));
6304 }
6305 case Intrinsic::aarch64_neon_sdot:
6306 case Intrinsic::aarch64_neon_udot:
6307 case Intrinsic::aarch64_sve_sdot:
6308 case Intrinsic::aarch64_sve_udot: {
6309 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
6310 IntNo == Intrinsic::aarch64_sve_udot)
6311 ? AArch64ISD::UDOT
6312 : AArch64ISD::SDOT;
6313 return DAG.getNode(Opcode, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
6314 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
6315 }
6316 case Intrinsic::aarch64_neon_usdot:
6317 case Intrinsic::aarch64_sve_usdot: {
6318 return DAG.getNode(Opcode: AArch64ISD::USDOT, DL, VT: Op.getValueType(),
6319 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
6320 }
6321 case Intrinsic::aarch64_neon_saddlv:
6322 case Intrinsic::aarch64_neon_uaddlv: {
6323 EVT OpVT = Op.getOperand(i: 1).getValueType();
6324 EVT ResVT = Op.getValueType();
6325 assert(
6326 ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6327 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
6328 (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
6329 "Unexpected aarch64_neon_u/saddlv type");
6330 (void)OpVT;
6331 // In order to avoid insert_subvector, use v4i32 rather than v2i32.
6332 SDValue ADDLV = DAG.getNode(
6333 Opcode: IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
6334 : AArch64ISD::SADDLV,
6335 DL, VT: ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Operand: Op.getOperand(i: 1));
6336 SDValue EXTRACT_VEC_ELT = DAG.getNode(
6337 Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
6338 N1: ADDLV, N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
6339 return EXTRACT_VEC_ELT;
6340 }
6341 case Intrinsic::experimental_cttz_elts: {
6342 SDValue CttzOp = Op.getOperand(i: 1);
6343 EVT VT = CttzOp.getValueType();
6344 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
6345
6346 if (VT.isFixedLengthVector()) {
6347 // We can use SVE instructions to lower this intrinsic by first creating
6348 // an SVE predicate register mask from the fixed-width vector.
6349 EVT NewVT = getTypeToTransformTo(Context&: *DAG.getContext(), VT);
6350 SDValue Mask = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: NewVT, Operand: CttzOp);
6351 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
6352 }
6353
6354 SDValue NewCttzElts =
6355 DAG.getNode(Opcode: AArch64ISD::CTTZ_ELTS, DL, VT: MVT::i64, Operand: CttzOp);
6356 return DAG.getZExtOrTrunc(Op: NewCttzElts, DL, VT: Op.getValueType());
6357 }
6358 case Intrinsic::experimental_vector_match: {
6359 return LowerVectorMatch(Op, DAG);
6360 }
6361 }
6362}
6363
6364bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6365 if (VT.getVectorElementType() == MVT::i8 ||
6366 VT.getVectorElementType() == MVT::i16) {
6367 EltTy = MVT::i32;
6368 return true;
6369 }
6370 return false;
6371}
6372
6373bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6374 EVT DataVT) const {
6375 const EVT IndexVT = Extend.getOperand(i: 0).getValueType();
6376 // SVE only supports implicit extension of 32-bit indices.
6377 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6378 return false;
6379
6380 // Indices cannot be smaller than the main data type.
6381 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6382 return false;
6383
6384 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6385 // element container type, which would violate the previous clause.
6386 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6387}
6388
6389bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6390 EVT ExtVT = ExtVal.getValueType();
6391 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6392 return false;
6393
6394 // It may be worth creating extending masked loads if there are multiple
6395 // masked loads using the same predicate. That way we'll end up creating
6396 // extending masked loads that may then get split by the legaliser. This
6397 // results in just one set of predicate unpacks at the start, instead of
6398 // multiple sets of vector unpacks after each load.
6399 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(Val: ExtVal->getOperand(Num: 0))) {
6400 if (!isLoadExtLegalOrCustom(ExtType: ISD::ZEXTLOAD, ValVT: ExtVT, MemVT: Ld->getValueType(ResNo: 0))) {
6401 // Disable extending masked loads for fixed-width for now, since the code
6402 // quality doesn't look great.
6403 if (!ExtVT.isScalableVector())
6404 return false;
6405
6406 unsigned NumExtMaskedLoads = 0;
6407 for (auto *U : Ld->getMask()->users())
6408 if (isa<MaskedLoadSDNode>(Val: U))
6409 NumExtMaskedLoads++;
6410
6411 if (NumExtMaskedLoads <= 1)
6412 return false;
6413 }
6414 }
6415
6416 return true;
6417}
6418
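// Returns the GLD1* opcode that implements an unsigned gather load for the
// given (scaled, signed index, needs extend) combination. The signedness of
// the index only matters when an extension is needed, which is why the
// signed and unsigned non-extending entries map to the same opcode.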
6419unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
6420 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
6421 {std::make_tuple(/*Scaled*/ args: false, /*Signed*/ args: false, /*Extend*/ args: false),
6422 AArch64ISD::GLD1_MERGE_ZERO},
6423 {std::make_tuple(/*Scaled*/ args: false, /*Signed*/ args: false, /*Extend*/ args: true),
6424 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
6425 {std::make_tuple(/*Scaled*/ args: false, /*Signed*/ args: true, /*Extend*/ args: false),
6426 AArch64ISD::GLD1_MERGE_ZERO},
6427 {std::make_tuple(/*Scaled*/ args: false, /*Signed*/ args: true, /*Extend*/ args: true),
6428 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
6429 {std::make_tuple(/*Scaled*/ args: true, /*Signed*/ args: false, /*Extend*/ args: false),
6430 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6431 {std::make_tuple(/*Scaled*/ args: true, /*Signed*/ args: false, /*Extend*/ args: true),
6432 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
6433 {std::make_tuple(/*Scaled*/ args: true, /*Signed*/ args: true, /*Extend*/ args: false),
6434 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6435 {std::make_tuple(/*Scaled*/ args: true, /*Signed*/ args: true, /*Extend*/ args: true),
6436 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
6437 };
6438 auto Key = std::make_tuple(args&: IsScaled, args&: IsSigned, args&: NeedsExtend);
6439 return AddrModes.find(x: Key)->second;
6440}
6441
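// Convert a zero-extending gather load opcode into its sign-extending
// (GLD1S*) equivalent.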
6442unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
6443 switch (Opcode) {
6444 default:
6445 llvm_unreachable("unimplemented opcode");
6446 return Opcode;
6447 case AArch64ISD::GLD1_MERGE_ZERO:
6448 return AArch64ISD::GLD1S_MERGE_ZERO;
6449 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
6450 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
6451 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
6452 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
6453 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
6454 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
6455 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
6456 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
6457 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
6458 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
6459 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
6460 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
6461 }
6462}
6463
6464SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
6465 SelectionDAG &DAG) const {
6466 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Val&: Op);
6467
6468 SDLoc DL(Op);
6469 SDValue Chain = MGT->getChain();
6470 SDValue PassThru = MGT->getPassThru();
6471 SDValue Mask = MGT->getMask();
6472 SDValue BasePtr = MGT->getBasePtr();
6473 SDValue Index = MGT->getIndex();
6474 SDValue Scale = MGT->getScale();
6475 EVT VT = Op.getValueType();
6476 EVT MemVT = MGT->getMemoryVT();
6477 ISD::LoadExtType ExtType = MGT->getExtensionType();
6478 ISD::MemIndexType IndexType = MGT->getIndexType();
6479
  // SVE supports zero (and so undef) passthrough values only; everything else
  // must be handled manually by an explicit select on the load's output.
6482 if (!PassThru->isUndef() && !isZerosVector(N: PassThru.getNode())) {
6483 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
6484 SDValue Load =
6485 DAG.getMaskedGather(VTs: MGT->getVTList(), MemVT, dl: DL, Ops,
6486 MMO: MGT->getMemOperand(), IndexType, ExtTy: ExtType);
6487 SDValue Select = DAG.getSelect(DL, VT, Cond: Mask, LHS: Load, RHS: PassThru);
6488 return DAG.getMergeValues(Ops: {Select, Load.getValue(R: 1)}, dl: DL);
6489 }
6490
6491 bool IsScaled = MGT->isIndexScaled();
6492 bool IsSigned = MGT->isIndexSigned();
6493
  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
  // must be calculated beforehand.
6496 uint64_t ScaleVal = Scale->getAsZExtVal();
6497 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6498 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6499 EVT IndexVT = Index.getValueType();
6500 Index = DAG.getNode(Opcode: ISD::SHL, DL, VT: IndexVT, N1: Index,
6501 N2: DAG.getConstant(Val: Log2_32(Value: ScaleVal), DL, VT: IndexVT));
6502 Scale = DAG.getTargetConstant(Val: 1, DL, VT: Scale.getValueType());
6503
6504 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6505 return DAG.getMaskedGather(VTs: MGT->getVTList(), MemVT, dl: DL, Ops,
6506 MMO: MGT->getMemOperand(), IndexType, ExtTy: ExtType);
6507 }
6508
6509 // Lower fixed length gather to a scalable equivalent.
6510 if (VT.isFixedLengthVector()) {
6511 assert(Subtarget->useSVEForFixedLengthVectors() &&
6512 "Cannot lower when not using SVE for fixed vectors!");
6513
6514 // NOTE: Handle floating-point as if integer then bitcast the result.
6515 EVT DataVT = VT.changeVectorElementTypeToInteger();
6516 MemVT = MemVT.changeVectorElementTypeToInteger();
6517
6518 // Find the smallest integer fixed length vector we can use for the gather.
6519 EVT PromotedVT = VT.changeVectorElementType(EltVT: MVT::i32);
6520 if (DataVT.getVectorElementType() == MVT::i64 ||
6521 Index.getValueType().getVectorElementType() == MVT::i64 ||
6522 Mask.getValueType().getVectorElementType() == MVT::i64)
6523 PromotedVT = VT.changeVectorElementType(EltVT: MVT::i64);
6524
6525 // Promote vector operands except for passthrough, which we know is either
6526 // undef or zero, and thus best constructed directly.
6527 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6528 Index = DAG.getNode(Opcode: ExtOpcode, DL, VT: PromotedVT, Operand: Index);
6529 Mask = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: PromotedVT, Operand: Mask);
6530
6531 // A promoted result type forces the need for an extending load.
6532 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
6533 ExtType = ISD::EXTLOAD;
6534
6535 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: PromotedVT);
6536
6537 // Convert fixed length vector operands to scalable.
6538 MemVT = ContainerVT.changeVectorElementType(EltVT: MemVT.getVectorElementType());
6539 Index = convertToScalableVector(DAG, VT: ContainerVT, V: Index);
6540 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6541 PassThru = PassThru->isUndef() ? DAG.getUNDEF(VT: ContainerVT)
6542 : DAG.getConstant(Val: 0, DL, VT: ContainerVT);
6543
6544 // Emit equivalent scalable vector gather.
6545 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6546 SDValue Load =
6547 DAG.getMaskedGather(VTs: DAG.getVTList(VT1: ContainerVT, VT2: MVT::Other), MemVT, dl: DL,
6548 Ops, MMO: MGT->getMemOperand(), IndexType, ExtTy: ExtType);
6549
6550 // Extract fixed length data then convert to the required result type.
6551 SDValue Result = convertFromScalableVector(DAG, VT: PromotedVT, V: Load);
6552 Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DataVT, Operand: Result);
6553 if (VT.isFloatingPoint())
6554 Result = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Result);
6555
6556 return DAG.getMergeValues(Ops: {Result, Load.getValue(R: 1)}, dl: DL);
6557 }
6558
6559 // Everything else is legal.
6560 return Op;
6561}
6562
6563SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
6564 SelectionDAG &DAG) const {
6565 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Val&: Op);
6566
6567 SDLoc DL(Op);
6568 SDValue Chain = MSC->getChain();
6569 SDValue StoreVal = MSC->getValue();
6570 SDValue Mask = MSC->getMask();
6571 SDValue BasePtr = MSC->getBasePtr();
6572 SDValue Index = MSC->getIndex();
6573 SDValue Scale = MSC->getScale();
6574 EVT VT = StoreVal.getValueType();
6575 EVT MemVT = MSC->getMemoryVT();
6576 ISD::MemIndexType IndexType = MSC->getIndexType();
6577 bool Truncating = MSC->isTruncatingStore();
6578
6579 bool IsScaled = MSC->isIndexScaled();
6580 bool IsSigned = MSC->isIndexSigned();
6581
  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
  // must be calculated beforehand.
6584 uint64_t ScaleVal = Scale->getAsZExtVal();
6585 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6586 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6587 EVT IndexVT = Index.getValueType();
6588 Index = DAG.getNode(Opcode: ISD::SHL, DL, VT: IndexVT, N1: Index,
6589 N2: DAG.getConstant(Val: Log2_32(Value: ScaleVal), DL, VT: IndexVT));
6590 Scale = DAG.getTargetConstant(Val: 1, DL, VT: Scale.getValueType());
6591
6592 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6593 return DAG.getMaskedScatter(VTs: MSC->getVTList(), MemVT, dl: DL, Ops,
6594 MMO: MSC->getMemOperand(), IndexType, IsTruncating: Truncating);
6595 }
6596
6597 // Lower fixed length scatter to a scalable equivalent.
6598 if (VT.isFixedLengthVector()) {
6599 assert(Subtarget->useSVEForFixedLengthVectors() &&
6600 "Cannot lower when not using SVE for fixed vectors!");
6601
    // Once bitcast, we treat floating-point scatters as if they were integer.
6603 if (VT.isFloatingPoint()) {
6604 VT = VT.changeVectorElementTypeToInteger();
6605 MemVT = MemVT.changeVectorElementTypeToInteger();
6606 StoreVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: StoreVal);
6607 }
6608
6609 // Find the smallest integer fixed length vector we can use for the scatter.
6610 EVT PromotedVT = VT.changeVectorElementType(EltVT: MVT::i32);
6611 if (VT.getVectorElementType() == MVT::i64 ||
6612 Index.getValueType().getVectorElementType() == MVT::i64 ||
6613 Mask.getValueType().getVectorElementType() == MVT::i64)
6614 PromotedVT = VT.changeVectorElementType(EltVT: MVT::i64);
6615
6616 // Promote vector operands.
6617 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6618 Index = DAG.getNode(Opcode: ExtOpcode, DL, VT: PromotedVT, Operand: Index);
6619 Mask = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: PromotedVT, Operand: Mask);
6620 StoreVal = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: PromotedVT, Operand: StoreVal);
6621
6622 // A promoted value type forces the need for a truncating store.
6623 if (PromotedVT != VT)
6624 Truncating = true;
6625
6626 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: PromotedVT);
6627
6628 // Convert fixed length vector operands to scalable.
6629 MemVT = ContainerVT.changeVectorElementType(EltVT: MemVT.getVectorElementType());
6630 Index = convertToScalableVector(DAG, VT: ContainerVT, V: Index);
6631 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6632 StoreVal = convertToScalableVector(DAG, VT: ContainerVT, V: StoreVal);
6633
6634 // Emit equivalent scalable vector scatter.
6635 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6636 return DAG.getMaskedScatter(VTs: MSC->getVTList(), MemVT, dl: DL, Ops,
6637 MMO: MSC->getMemOperand(), IndexType, IsTruncating: Truncating);
6638 }
6639
6640 // Everything else is legal.
6641 return Op;
6642}
6643
6644SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6645 SDLoc DL(Op);
6646 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Val&: Op);
6647 assert(LoadNode && "Expected custom lowering of a masked load node");
6648 EVT VT = Op->getValueType(ResNo: 0);
6649
6650 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6651 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6652
6653 SDValue PassThru = LoadNode->getPassThru();
6654 SDValue Mask = LoadNode->getMask();
6655
6656 if (PassThru->isUndef() || isZerosVector(N: PassThru.getNode()))
6657 return Op;
6658
6659 SDValue Load = DAG.getMaskedLoad(
6660 VT, dl: DL, Chain: LoadNode->getChain(), Base: LoadNode->getBasePtr(),
6661 Offset: LoadNode->getOffset(), Mask, Src0: DAG.getUNDEF(VT), MemVT: LoadNode->getMemoryVT(),
6662 MMO: LoadNode->getMemOperand(), AM: LoadNode->getAddressingMode(),
6663 LoadNode->getExtensionType());
6664
6665 SDValue Result = DAG.getSelect(DL, VT, Cond: Mask, LHS: Load, RHS: PassThru);
6666
6667 return DAG.getMergeValues(Ops: {Result, Load.getValue(R: 1)}, dl: DL);
6668}
6669
6670// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
6671static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
6672 EVT VT, EVT MemVT,
6673 SelectionDAG &DAG) {
6674 assert(VT.isVector() && "VT should be a vector type");
6675 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6676
6677 SDValue Value = ST->getValue();
6678
  // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
  // extracts the word lane which represents the v4i8 subvector. This
  // optimizes the store to:
6682 //
6683 // xtn v0.8b, v0.8h
6684 // str s0, [x0]
6685
6686 SDValue Undef = DAG.getUNDEF(VT: MVT::i16);
6687 SDValue UndefVec = DAG.getBuildVector(VT: MVT::v4i16, DL,
6688 Ops: {Undef, Undef, Undef, Undef});
6689
6690 SDValue TruncExt = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v8i16,
6691 N1: Value, N2: UndefVec);
6692 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::v8i8, Operand: TruncExt);
6693
6694 Trunc = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Trunc);
6695 SDValue ExtractTrunc = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32,
6696 N1: Trunc, N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
6697
6698 return DAG.getStore(Chain: ST->getChain(), dl: DL, Val: ExtractTrunc,
6699 Ptr: ST->getBasePtr(), MMO: ST->getMemOperand());
6700}
6701
6702static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
6703 SDLoc DL(Op);
6704 SDValue Src = Op.getOperand(i: 0);
6705 MVT DestVT = Op.getSimpleValueType();
6706 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6707 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Val: Op.getNode());
6708
6709 unsigned SrcAS = N->getSrcAddressSpace();
6710 unsigned DestAS = N->getDestAddressSpace();
6711 assert(SrcAS != DestAS &&
6712 "addrspacecast must be between different address spaces");
6713 assert(TLI.getTargetMachine().getPointerSize(SrcAS) !=
6714 TLI.getTargetMachine().getPointerSize(DestAS) &&
6715 "addrspacecast must be between different ptr sizes");
6716 (void)TLI;
6717
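  // Casts out of a 32-bit pointer address space widen the value, using a
  // sign-extend for PTR32_SPTR and a zero-extend for PTR32_UPTR. Casts into
  // one of those address spaces narrow the value and clear any bits above
  // the 32-bit pointer.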
6718 if (SrcAS == ARM64AS::PTR32_SPTR) {
6719 return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: DestVT, N1: Src,
6720 N2: DAG.getTargetConstant(Val: 0, DL, VT: DestVT));
6721 } else if (SrcAS == ARM64AS::PTR32_UPTR) {
6722 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: DestVT, N1: Src,
6723 N2: DAG.getTargetConstant(Val: 0, DL, VT: DestVT));
6724 } else if ((DestAS == ARM64AS::PTR32_SPTR) ||
6725 (DestAS == ARM64AS::PTR32_UPTR)) {
6726 SDValue Ext = DAG.getAnyExtOrTrunc(Op: Src, DL, VT: DestVT);
6727 SDValue Trunc = DAG.getZeroExtendInReg(Op: Ext, DL, VT: DestVT);
6728 return Trunc;
6729 } else {
6730 return Src;
6731 }
6732}
6733
// Custom lowering for any store, vector or scalar, plain or truncating.
// Currently we only custom lower truncating stores from v4i16 to v4i8 and
// volatile stores of i128.
6737SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6738 SelectionDAG &DAG) const {
6739 SDLoc Dl(Op);
6740 StoreSDNode *StoreNode = cast<StoreSDNode>(Val&: Op);
  assert(StoreNode && "Can only custom lower store nodes");
6742
6743 SDValue Value = StoreNode->getValue();
6744
6745 EVT VT = Value.getValueType();
6746 EVT MemVT = StoreNode->getMemoryVT();
6747
6748 if (VT.isVector()) {
6749 if (useSVEForFixedLengthVectorVT(
6750 VT,
6751 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6752 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6753
6754 unsigned AS = StoreNode->getAddressSpace();
6755 Align Alignment = StoreNode->getAlign();
6756 if (Alignment < MemVT.getStoreSize() &&
6757 !allowsMisalignedMemoryAccesses(VT: MemVT, AddrSpace: AS, Alignment,
6758 Flags: StoreNode->getMemOperand()->getFlags(),
6759 Fast: nullptr)) {
6760 return scalarizeVectorStore(ST: StoreNode, DAG);
6761 }
6762
6763 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6764 MemVT == MVT::v4i8) {
6765 return LowerTruncateVectorStore(DL: Dl, ST: StoreNode, VT, MemVT, DAG);
6766 }
6767 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
6768 // the custom lowering, as there are no un-paired non-temporal stores and
6769 // legalization will break up 256 bit inputs.
6770 ElementCount EC = MemVT.getVectorElementCount();
6771 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6772 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6773 (MemVT.getScalarSizeInBits() == 8u ||
6774 MemVT.getScalarSizeInBits() == 16u ||
6775 MemVT.getScalarSizeInBits() == 32u ||
6776 MemVT.getScalarSizeInBits() == 64u)) {
6777 SDValue Lo =
6778 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: Dl,
6779 VT: MemVT.getHalfNumVectorElementsVT(Context&: *DAG.getContext()),
6780 N1: StoreNode->getValue(), N2: DAG.getConstant(Val: 0, DL: Dl, VT: MVT::i64));
6781 SDValue Hi =
6782 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: Dl,
6783 VT: MemVT.getHalfNumVectorElementsVT(Context&: *DAG.getContext()),
6784 N1: StoreNode->getValue(),
6785 N2: DAG.getConstant(Val: EC.getKnownMinValue() / 2, DL: Dl, VT: MVT::i64));
6786 SDValue Result = DAG.getMemIntrinsicNode(
6787 Opcode: AArch64ISD::STNP, dl: Dl, VTList: DAG.getVTList(VT: MVT::Other),
6788 Ops: {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
6789 MemVT: StoreNode->getMemoryVT(), MMO: StoreNode->getMemOperand());
6790 return Result;
6791 }
6792 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6793 return LowerStore128(Op, DAG);
6794 } else if (MemVT == MVT::i64x8) {
6795 SDValue Value = StoreNode->getValue();
6796 assert(Value->getValueType(0) == MVT::i64x8);
6797 SDValue Chain = StoreNode->getChain();
6798 SDValue Base = StoreNode->getBasePtr();
6799 EVT PtrVT = Base.getValueType();
6800 for (unsigned i = 0; i < 8; i++) {
6801 SDValue Part = DAG.getNode(Opcode: AArch64ISD::LS64_EXTRACT, DL: Dl, VT: MVT::i64,
6802 N1: Value, N2: DAG.getConstant(Val: i, DL: Dl, VT: MVT::i32));
6803 SDValue Ptr = DAG.getNode(Opcode: ISD::ADD, DL: Dl, VT: PtrVT, N1: Base,
6804 N2: DAG.getConstant(Val: i * 8, DL: Dl, VT: PtrVT));
6805 Chain = DAG.getStore(Chain, dl: Dl, Val: Part, Ptr, PtrInfo: StoreNode->getPointerInfo(),
6806 Alignment: StoreNode->getBaseAlign());
6807 }
6808 return Chain;
6809 }
6810
6811 return SDValue();
6812}
6813
6814/// Lower atomic or volatile 128-bit stores to a single STP instruction.
6815SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
6816 SelectionDAG &DAG) const {
6817 MemSDNode *StoreNode = cast<MemSDNode>(Val&: Op);
6818 assert(StoreNode->getMemoryVT() == MVT::i128);
6819 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
6820
6821 bool IsStoreRelease =
6822 StoreNode->getMergedOrdering() == AtomicOrdering::Release;
6823 if (StoreNode->isAtomic())
6824 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
6825 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
6826 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
6827 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
6828
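  // Plain and atomic stores keep the value at operand 1; other memory nodes
  // routed through here carry it at operand 2. The 128-bit value is split
  // into two 64-bit halves (swapped on big-endian targets) and written with
  // a single STP, or STILP when release semantics are required.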
6829 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
6830 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
6831 ? StoreNode->getOperand(Num: 1)
6832 : StoreNode->getOperand(Num: 2);
6833 SDLoc DL(Op);
6834 auto StoreValue = DAG.SplitScalar(N: Value, DL, LoVT: MVT::i64, HiVT: MVT::i64);
6835 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
6836 if (DAG.getDataLayout().isBigEndian())
6837 std::swap(a&: StoreValue.first, b&: StoreValue.second);
6838 SDValue Result = DAG.getMemIntrinsicNode(
6839 Opcode, dl: DL, VTList: DAG.getVTList(VT: MVT::Other),
6840 Ops: {StoreNode->getChain(), StoreValue.first, StoreValue.second,
6841 StoreNode->getBasePtr()},
6842 MemVT: StoreNode->getMemoryVT(), MMO: StoreNode->getMemOperand());
6843 return Result;
6844}
6845
6846SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
6847 SelectionDAG &DAG) const {
6848 SDLoc DL(Op);
6849 LoadSDNode *LoadNode = cast<LoadSDNode>(Val&: Op);
6850 assert(LoadNode && "Expected custom lowering of a load node");
6851
6852 if (LoadNode->getMemoryVT() == MVT::i64x8) {
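    // i64x8 is the MVT used for the 64-byte LS64 accesses. Expand the load
    // into eight consecutive i64 loads and combine the parts with
    // LS64_BUILD.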
6853 SmallVector<SDValue, 8> Ops;
6854 SDValue Base = LoadNode->getBasePtr();
6855 SDValue Chain = LoadNode->getChain();
6856 EVT PtrVT = Base.getValueType();
6857 for (unsigned i = 0; i < 8; i++) {
6858 SDValue Ptr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: Base,
6859 N2: DAG.getConstant(Val: i * 8, DL, VT: PtrVT));
6860 SDValue Part =
6861 DAG.getLoad(VT: MVT::i64, dl: DL, Chain, Ptr, PtrInfo: LoadNode->getPointerInfo(),
6862 Alignment: LoadNode->getBaseAlign());
6863 Ops.push_back(Elt: Part);
6864 Chain = SDValue(Part.getNode(), 1);
6865 }
6866 SDValue Loaded = DAG.getNode(Opcode: AArch64ISD::LS64_BUILD, DL, VT: MVT::i64x8, Ops);
6867 return DAG.getMergeValues(Ops: {Loaded, Chain}, dl: DL);
6868 }
6869
6870 // Custom lowering for extending v4i8 vector loads.
6871 EVT VT = Op->getValueType(ResNo: 0);
6872 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
6873
6874 if (LoadNode->getMemoryVT() != MVT::v4i8)
6875 return SDValue();
6876
6877 // Avoid generating unaligned loads.
6878 if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
6879 return SDValue();
6880
6881 unsigned ExtType;
6882 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
6883 ExtType = ISD::SIGN_EXTEND;
6884 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
6885 LoadNode->getExtensionType() == ISD::EXTLOAD)
6886 ExtType = ISD::ZERO_EXTEND;
6887 else
6888 return SDValue();
6889
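  // Load the four bytes with a single 32-bit (s-register) load, move the
  // value into a vector lane and widen it with NEON extends, rather than
  // issuing four separate byte loads.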
6890 SDValue Load = DAG.getLoad(VT: MVT::f32, dl: DL, Chain: LoadNode->getChain(),
6891 Ptr: LoadNode->getBasePtr(), PtrInfo: MachinePointerInfo());
6892 SDValue Chain = Load.getValue(R: 1);
6893 SDValue Vec = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: MVT::v2f32, Operand: Load);
6894 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v8i8, Operand: Vec);
6895 SDValue Ext = DAG.getNode(Opcode: ExtType, DL, VT: MVT::v8i16, Operand: BC);
6896 Ext = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v4i16, N1: Ext,
6897 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
6898 if (VT == MVT::v4i32)
6899 Ext = DAG.getNode(Opcode: ExtType, DL, VT: MVT::v4i32, Operand: Ext);
6900 return DAG.getMergeValues(Ops: {Ext, Chain}, dl: DL);
6901}
6902
6903SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
6904 SelectionDAG &DAG) const {
6905 SDLoc DL(Op);
6906 SDValue Vec = Op.getOperand(i: 0);
6907 SDValue Mask = Op.getOperand(i: 1);
6908 SDValue Passthru = Op.getOperand(i: 2);
6909 EVT VecVT = Vec.getValueType();
6910 EVT MaskVT = Mask.getValueType();
6911 EVT ElmtVT = VecVT.getVectorElementType();
6912 const bool IsFixedLength = VecVT.isFixedLengthVector();
6913 const bool HasPassthru = !Passthru.isUndef();
6914 unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue();
6915 EVT FixedVecVT = MVT::getVectorVT(VT: ElmtVT.getSimpleVT(), NumElements: MinElmts);
6916
6917 assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector.");
6918
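  // The lowering is built around the SVE COMPACT instruction, which only
  // operates on 32-bit and 64-bit elements: NEON-sized fixed vectors are
  // first placed in the low bits of an SVE register, narrower elements are
  // widened, and a non-zero passthru is reinstated afterwards with a select
  // over the lanes COMPACT did not fill.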
6919 if (!Subtarget->isSVEAvailable())
6920 return SDValue();
6921
6922 if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128)
6923 return SDValue();
6924
6925 // Only <vscale x {4|2} x {i32|i64}> supported for compact.
6926 if (MinElmts != 2 && MinElmts != 4)
6927 return SDValue();
6928
6929 // We can use the SVE register containing the NEON vector in its lowest bits.
6930 if (IsFixedLength) {
6931 EVT ScalableVecVT =
6932 MVT::getScalableVectorVT(VT: ElmtVT.getSimpleVT(), NumElements: MinElmts);
6933 EVT ScalableMaskVT = MVT::getScalableVectorVT(
6934 VT: MaskVT.getVectorElementType().getSimpleVT(), NumElements: MinElmts);
6935
6936 Vec = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: ScalableVecVT,
6937 N1: DAG.getUNDEF(VT: ScalableVecVT), N2: Vec,
6938 N3: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
6939 Mask = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: ScalableMaskVT,
6940 N1: DAG.getUNDEF(VT: ScalableMaskVT), N2: Mask,
6941 N3: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
6942 Mask = DAG.getNode(Opcode: ISD::TRUNCATE, DL,
6943 VT: ScalableMaskVT.changeVectorElementType(EltVT: MVT::i1), Operand: Mask);
6944 Passthru = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: ScalableVecVT,
6945 N1: DAG.getUNDEF(VT: ScalableVecVT), N2: Passthru,
6946 N3: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
6947
6948 VecVT = Vec.getValueType();
6949 MaskVT = Mask.getValueType();
6950 }
6951
  // Get a legal type for the compact instruction.
6953 EVT ContainerVT = getSVEContainerType(ContentTy: VecVT);
6954 EVT CastVT = VecVT.changeVectorElementTypeToInteger();
6955
6956 // Convert to i32 or i64 for smaller types, as these are the only supported
6957 // sizes for compact.
6958 if (ContainerVT != VecVT) {
6959 Vec = DAG.getBitcast(VT: CastVT, V: Vec);
6960 Vec = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: ContainerVT, Operand: Vec);
6961 }
6962
6963 SDValue Compressed = DAG.getNode(
6964 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Vec.getValueType(),
6965 N1: DAG.getConstant(Val: Intrinsic::aarch64_sve_compact, DL, VT: MVT::i64), N2: Mask, N3: Vec);
6966
  // COMPACT fills the remaining lanes with 0s, so if our passthru is all 0s,
  // there is nothing to do here.
6968 if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(N: Passthru.getNode())) {
6969 SDValue Offset = DAG.getNode(
6970 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::i64,
6971 N1: DAG.getConstant(Val: Intrinsic::aarch64_sve_cntp, DL, VT: MVT::i64), N2: Mask, N3: Mask);
6972
6973 SDValue IndexMask = DAG.getNode(
6974 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MaskVT,
6975 N1: DAG.getConstant(Val: Intrinsic::aarch64_sve_whilelo, DL, VT: MVT::i64),
6976 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64), N3: Offset);
6977
6978 Compressed =
6979 DAG.getNode(Opcode: ISD::VSELECT, DL, VT: VecVT, N1: IndexMask, N2: Compressed, N3: Passthru);
6980 }
6981
6982 // Extracting from a legal SVE type before truncating produces better code.
6983 if (IsFixedLength) {
6984 Compressed = DAG.getNode(
6985 Opcode: ISD::EXTRACT_SUBVECTOR, DL,
6986 VT: FixedVecVT.changeVectorElementType(EltVT: ContainerVT.getVectorElementType()),
6987 N1: Compressed, N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
6988 CastVT = FixedVecVT.changeVectorElementTypeToInteger();
6989 VecVT = FixedVecVT;
6990 }
6991
6992 // If we changed the element type before, we need to convert it back.
6993 if (ContainerVT != VecVT) {
6994 Compressed = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: CastVT, Operand: Compressed);
6995 Compressed = DAG.getBitcast(VT: VecVT, V: Compressed);
6996 }
6997
6998 return Compressed;
6999}
7000
7001// Generate SUBS and CSEL for integer abs.
7002SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
7003 MVT VT = Op.getSimpleValueType();
7004
7005 if (VT.isVector())
7006 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::ABS_MERGE_PASSTHRU);
7007
7008 SDLoc DL(Op);
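  // Compute Neg = 0 - X, compare X against zero, and select X when the PL
  // (positive or zero) condition holds, otherwise Neg; this typically
  // selects down to a cmp + cneg pair.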
7009 SDValue Neg = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT),
7010 N2: Op.getOperand(i: 0));
7011 // Generate SUBS & CSEL.
7012 SDValue Cmp =
7013 DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT::i32),
7014 N1: Op.getOperand(i: 0), N2: DAG.getConstant(Val: 0, DL, VT));
7015 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: Op.getOperand(i: 0), N2: Neg,
7016 N3: DAG.getConstant(Val: AArch64CC::PL, DL, VT: MVT::i32),
7017 N4: Cmp.getValue(R: 1));
7018}
7019
7020static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
7021 SDValue Chain = Op.getOperand(i: 0);
7022 SDValue Cond = Op.getOperand(i: 1);
7023 SDValue Dest = Op.getOperand(i: 2);
7024
7025 AArch64CC::CondCode CC;
7026 if (SDValue Cmp = emitConjunction(DAG, Val: Cond, OutCC&: CC)) {
7027 SDLoc DL(Op);
7028 SDValue CCVal = DAG.getConstant(Val: CC, DL, VT: MVT::i32);
7029 return DAG.getNode(Opcode: AArch64ISD::BRCOND, DL, VT: MVT::Other, N1: Chain, N2: Dest, N3: CCVal,
7030 N4: Cmp);
7031 }
7032
7033 return SDValue();
7034}
7035
// Treat FSHR with constant shifts as a legal operation, otherwise it is
// expanded. FSHL is converted to FSHR before deciding what to do with it.
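// For example, fshl(x, y, 3) on i32 becomes fshr(x, y, 29), which can then be
// matched as a single EXTR instruction for scalar types.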
7038static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
7039 SDValue Shifts = Op.getOperand(i: 2);
  // Check if the shift amount is a constant and normalise it to
  // [0, SrcBitLen). If the opcode is FSHL, convert it to FSHR.
7042 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Val&: Shifts)) {
7043 SDLoc DL(Op);
7044 MVT VT = Op.getSimpleValueType();
7045 unsigned int NewShiftNo = ShiftNo->getZExtValue() % VT.getFixedSizeInBits();
7046
7047 if (Op.getOpcode() == ISD::FSHL) {
7048 if (NewShiftNo == 0)
7049 return Op.getOperand(i: 0);
7050
7051 NewShiftNo = VT.getFixedSizeInBits() - NewShiftNo;
7052 return DAG.getNode(
7053 Opcode: ISD::FSHR, DL, VT, N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1),
7054 N3: DAG.getConstant(Val: NewShiftNo, DL, VT: Shifts.getValueType()));
7055 }
7056
7057 if (Op.getOpcode() == ISD::FSHR) {
7058 if (NewShiftNo == 0)
7059 return Op.getOperand(i: 1);
7060
7061 if (ShiftNo->getZExtValue() == NewShiftNo)
7062 return Op;
7063
7064 // Rewrite using the normalised shift amount.
7065 return DAG.getNode(
7066 Opcode: ISD::FSHR, DL, VT, N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1),
7067 N3: DAG.getConstant(Val: NewShiftNo, DL, VT: Shifts.getValueType()));
7068 }
7069 }
7070
7071 return SDValue();
7072}
7073
7074static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
7075 SDValue X = Op.getOperand(i: 0);
7076 EVT XScalarTy = X.getValueType();
7077 SDValue Exp = Op.getOperand(i: 1);
7078
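  // Lower FLDEXP via the SVE fscale intrinsic: insert X and Exp into lane 0
  // of scalable vectors, apply fscale under an all-true predicate, and
  // extract the scalar result again. Half-precision inputs are widened to
  // f32 first and rounded back at the end.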
7079 SDLoc DL(Op);
7080 EVT XVT, ExpVT;
7081 switch (Op.getSimpleValueType().SimpleTy) {
7082 default:
7083 return SDValue();
7084 case MVT::bf16:
7085 case MVT::f16:
7086 X = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: X);
7087 [[fallthrough]];
7088 case MVT::f32:
7089 XVT = MVT::nxv4f32;
7090 ExpVT = MVT::nxv4i32;
7091 break;
7092 case MVT::f64:
7093 XVT = MVT::nxv2f64;
7094 ExpVT = MVT::nxv2i64;
7095 Exp = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: MVT::i64, Operand: Exp);
7096 break;
7097 }
7098
7099 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
7100 SDValue VX =
7101 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: XVT, N1: DAG.getUNDEF(VT: XVT), N2: X, N3: Zero);
7102 SDValue VExp = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ExpVT,
7103 N1: DAG.getUNDEF(VT: ExpVT), N2: Exp, N3: Zero);
7104 SDValue VPg = getPTrue(DAG, DL, VT: XVT.changeVectorElementType(EltVT: MVT::i1),
7105 Pattern: AArch64SVEPredPattern::all);
7106 SDValue FScale =
7107 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: XVT,
7108 N1: DAG.getConstant(Val: Intrinsic::aarch64_sve_fscale, DL, VT: MVT::i64),
7109 N2: VPg, N3: VX, N4: VExp);
7110 SDValue Final =
7111 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: X.getValueType(), N1: FScale, N2: Zero);
7112 if (X.getValueType() != XScalarTy)
7113 Final = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: XScalarTy, N1: Final,
7114 N2: DAG.getIntPtrConstant(Val: 1, DL: SDLoc(Op), /*isTarget=*/true));
7115 return Final;
7116}
7117
7118SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
7119 SelectionDAG &DAG) const {
7120 return Op.getOperand(i: 0);
7121}
7122
7123SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
7124 SelectionDAG &DAG) const {
7125 SDValue Chain = Op.getOperand(i: 0);
7126 SDValue Trmp = Op.getOperand(i: 1); // trampoline, >=32 bytes
7127 SDValue FPtr = Op.getOperand(i: 2); // nested function
7128 SDValue Nest = Op.getOperand(i: 3); // 'nest' parameter value
7129
7130 const Value *TrmpAddr = cast<SrcValueSDNode>(Val: Op.getOperand(i: 4))->getValue();
7131
7132 // ldr NestReg, .+16
7133 // ldr x17, .+20
7134 // br x17
7135 // .word 0
7136 // .nest: .qword nest
7137 // .fptr: .qword fptr
7138 SDValue OutChains[5];
7139
7140 const Function *Func =
7141 cast<Function>(Val: cast<SrcValueSDNode>(Val: Op.getOperand(i: 5))->getValue());
7142 CallingConv::ID CC = Func->getCallingConv();
7143 unsigned NestReg;
7144
7145 switch (CC) {
7146 default:
7147 NestReg = 0x0f; // X15
7148 break;
7149 case CallingConv::ARM64EC_Thunk_X64:
7150 // Must be kept in sync with AArch64CallingConv.td
7151 NestReg = 0x04; // X4
7152 break;
7153 }
7154
7155 const char FptrReg = 0x11; // X17
7156
7157 SDValue Addr = Trmp;
7158
7159 SDLoc DL(Op);
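  // The 32-bit constants below are fixed instruction encodings: an LDR
  // (literal) loading the nest value into NestReg, an LDR (literal) loading
  // the function pointer into x17, and "br x17" (0xd61f0220). The two data
  // quadwords written afterwards live at offsets 16 and 24 from the start of
  // the trampoline.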
7160 OutChains[0] = DAG.getStore(
7161 Chain, dl: DL, Val: DAG.getConstant(Val: 0x58000080u | NestReg, DL, VT: MVT::i32), Ptr: Addr,
7162 PtrInfo: MachinePointerInfo(TrmpAddr));
7163
7164 Addr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Trmp,
7165 N2: DAG.getConstant(Val: 4, DL, VT: MVT::i64));
7166 OutChains[1] = DAG.getStore(
7167 Chain, dl: DL, Val: DAG.getConstant(Val: 0x580000b0u | FptrReg, DL, VT: MVT::i32), Ptr: Addr,
7168 PtrInfo: MachinePointerInfo(TrmpAddr, 4));
7169
7170 Addr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Trmp,
7171 N2: DAG.getConstant(Val: 8, DL, VT: MVT::i64));
7172 OutChains[2] =
7173 DAG.getStore(Chain, dl: DL, Val: DAG.getConstant(Val: 0xd61f0220u, DL, VT: MVT::i32), Ptr: Addr,
7174 PtrInfo: MachinePointerInfo(TrmpAddr, 8));
7175
7176 Addr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Trmp,
7177 N2: DAG.getConstant(Val: 16, DL, VT: MVT::i64));
7178 OutChains[3] =
7179 DAG.getStore(Chain, dl: DL, Val: Nest, Ptr: Addr, PtrInfo: MachinePointerInfo(TrmpAddr, 16));
7180
7181 Addr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Trmp,
7182 N2: DAG.getConstant(Val: 24, DL, VT: MVT::i64));
7183 OutChains[4] =
7184 DAG.getStore(Chain, dl: DL, Val: FPtr, Ptr: Addr, PtrInfo: MachinePointerInfo(TrmpAddr, 24));
7185
7186 SDValue StoreToken = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: OutChains);
7187
7188 SDValue EndOfTrmp = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Trmp,
7189 N2: DAG.getConstant(Val: 12, DL, VT: MVT::i64));
7190
7191 // Call clear cache on the trampoline instructions.
7192 return DAG.getNode(Opcode: ISD::CLEAR_CACHE, DL, VT: MVT::Other, N1: StoreToken, N2: Trmp,
7193 N3: EndOfTrmp);
7194}
7195
7196SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
7197 SelectionDAG &DAG) const {
7198 LLVM_DEBUG(dbgs() << "Custom lowering: ");
7199 LLVM_DEBUG(Op.dump());
7200
7201 switch (Op.getOpcode()) {
7202 default:
7203 llvm_unreachable("unimplemented operand");
7204 return SDValue();
7205 case ISD::BITCAST:
7206 return LowerBITCAST(Op, DAG);
7207 case ISD::GlobalAddress:
7208 return LowerGlobalAddress(Op, DAG);
7209 case ISD::GlobalTLSAddress:
7210 return LowerGlobalTLSAddress(Op, DAG);
7211 case ISD::PtrAuthGlobalAddress:
7212 return LowerPtrAuthGlobalAddress(Op, DAG);
7213 case ISD::ADJUST_TRAMPOLINE:
7214 return LowerADJUST_TRAMPOLINE(Op, DAG);
7215 case ISD::INIT_TRAMPOLINE:
7216 return LowerINIT_TRAMPOLINE(Op, DAG);
7217 case ISD::SETCC:
7218 case ISD::STRICT_FSETCC:
7219 case ISD::STRICT_FSETCCS:
7220 return LowerSETCC(Op, DAG);
7221 case ISD::SETCCCARRY:
7222 return LowerSETCCCARRY(Op, DAG);
7223 case ISD::BRCOND:
7224 return LowerBRCOND(Op, DAG);
7225 case ISD::BR_CC:
7226 return LowerBR_CC(Op, DAG);
7227 case ISD::SELECT:
7228 return LowerSELECT(Op, DAG);
7229 case ISD::SELECT_CC:
7230 return LowerSELECT_CC(Op, DAG);
7231 case ISD::JumpTable:
7232 return LowerJumpTable(Op, DAG);
7233 case ISD::BR_JT:
7234 return LowerBR_JT(Op, DAG);
7235 case ISD::BRIND:
7236 return LowerBRIND(Op, DAG);
7237 case ISD::ConstantPool:
7238 return LowerConstantPool(Op, DAG);
7239 case ISD::BlockAddress:
7240 return LowerBlockAddress(Op, DAG);
7241 case ISD::VASTART:
7242 return LowerVASTART(Op, DAG);
7243 case ISD::VACOPY:
7244 return LowerVACOPY(Op, DAG);
7245 case ISD::VAARG:
7246 return LowerVAARG(Op, DAG);
7247 case ISD::UADDO_CARRY:
7248 return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::ADCS, IsSigned: false /*unsigned*/);
7249 case ISD::USUBO_CARRY:
7250 return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::SBCS, IsSigned: false /*unsigned*/);
7251 case ISD::SADDO_CARRY:
7252 return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::ADCS, IsSigned: true /*signed*/);
7253 case ISD::SSUBO_CARRY:
7254 return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::SBCS, IsSigned: true /*signed*/);
7255 case ISD::SADDO:
7256 case ISD::UADDO:
7257 case ISD::SSUBO:
7258 case ISD::USUBO:
7259 case ISD::SMULO:
7260 case ISD::UMULO:
7261 return LowerXALUO(Op, DAG);
7262 case ISD::FADD:
7263 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FADD_PRED);
7264 case ISD::FSUB:
7265 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FSUB_PRED);
7266 case ISD::FMUL:
7267 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMUL_PRED);
7268 case ISD::FMA:
7269 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMA_PRED);
7270 case ISD::FDIV:
7271 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FDIV_PRED);
7272 case ISD::FNEG:
7273 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FNEG_MERGE_PASSTHRU);
7274 case ISD::FCEIL:
7275 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FCEIL_MERGE_PASSTHRU);
7276 case ISD::FFLOOR:
7277 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FFLOOR_MERGE_PASSTHRU);
7278 case ISD::FNEARBYINT:
7279 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
7280 case ISD::FRINT:
7281 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FRINT_MERGE_PASSTHRU);
7282 case ISD::FROUND:
7283 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FROUND_MERGE_PASSTHRU);
7284 case ISD::FROUNDEVEN:
7285 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
7286 case ISD::FTRUNC:
7287 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FTRUNC_MERGE_PASSTHRU);
7288 case ISD::FSQRT:
7289 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FSQRT_MERGE_PASSTHRU);
7290 case ISD::FABS:
7291 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FABS_MERGE_PASSTHRU);
7292 case ISD::FP_ROUND:
7293 case ISD::STRICT_FP_ROUND:
7294 return LowerFP_ROUND(Op, DAG);
7295 case ISD::FP_EXTEND:
7296 case ISD::STRICT_FP_EXTEND:
7297 return LowerFP_EXTEND(Op, DAG);
7298 case ISD::FRAMEADDR:
7299 return LowerFRAMEADDR(Op, DAG);
7300 case ISD::SPONENTRY:
7301 return LowerSPONENTRY(Op, DAG);
7302 case ISD::RETURNADDR:
7303 return LowerRETURNADDR(Op, DAG);
7304 case ISD::ADDROFRETURNADDR:
7305 return LowerADDROFRETURNADDR(Op, DAG);
7306 case ISD::CONCAT_VECTORS:
7307 return LowerCONCAT_VECTORS(Op, DAG);
7308 case ISD::INSERT_VECTOR_ELT:
7309 return LowerINSERT_VECTOR_ELT(Op, DAG);
7310 case ISD::EXTRACT_VECTOR_ELT:
7311 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7312 case ISD::BUILD_VECTOR:
7313 return LowerBUILD_VECTOR(Op, DAG);
7314 case ISD::ZERO_EXTEND_VECTOR_INREG:
7315 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
7316 case ISD::VECTOR_SHUFFLE:
7317 return LowerVECTOR_SHUFFLE(Op, DAG);
7318 case ISD::SPLAT_VECTOR:
7319 return LowerSPLAT_VECTOR(Op, DAG);
7320 case ISD::EXTRACT_SUBVECTOR:
7321 return LowerEXTRACT_SUBVECTOR(Op, DAG);
7322 case ISD::INSERT_SUBVECTOR:
7323 return LowerINSERT_SUBVECTOR(Op, DAG);
7324 case ISD::SDIV:
7325 case ISD::UDIV:
7326 return LowerDIV(Op, DAG);
7327 case ISD::SMIN:
7328 case ISD::UMIN:
7329 case ISD::SMAX:
7330 case ISD::UMAX:
7331 return LowerMinMax(Op, DAG);
7332 case ISD::SRA:
7333 case ISD::SRL:
7334 case ISD::SHL:
7335 return LowerVectorSRA_SRL_SHL(Op, DAG);
7336 case ISD::SHL_PARTS:
7337 case ISD::SRL_PARTS:
7338 case ISD::SRA_PARTS:
7339 return LowerShiftParts(Op, DAG);
7340 case ISD::CTPOP:
7341 case ISD::PARITY:
7342 return LowerCTPOP_PARITY(Op, DAG);
7343 case ISD::FCOPYSIGN:
7344 return LowerFCOPYSIGN(Op, DAG);
7345 case ISD::OR:
7346 return LowerVectorOR(Op, DAG);
7347 case ISD::XOR:
7348 return LowerXOR(Op, DAG);
7349 case ISD::PREFETCH:
7350 return LowerPREFETCH(Op, DAG);
7351 case ISD::SINT_TO_FP:
7352 case ISD::UINT_TO_FP:
7353 case ISD::STRICT_SINT_TO_FP:
7354 case ISD::STRICT_UINT_TO_FP:
7355 return LowerINT_TO_FP(Op, DAG);
7356 case ISD::FP_TO_SINT:
7357 case ISD::FP_TO_UINT:
7358 case ISD::STRICT_FP_TO_SINT:
7359 case ISD::STRICT_FP_TO_UINT:
7360 return LowerFP_TO_INT(Op, DAG);
7361 case ISD::FP_TO_SINT_SAT:
7362 case ISD::FP_TO_UINT_SAT:
7363 return LowerFP_TO_INT_SAT(Op, DAG);
7364 case ISD::FSINCOS:
7365 return LowerFSINCOS(Op, DAG);
7366 case ISD::GET_ROUNDING:
7367 return LowerGET_ROUNDING(Op, DAG);
7368 case ISD::SET_ROUNDING:
7369 return LowerSET_ROUNDING(Op, DAG);
7370 case ISD::GET_FPMODE:
7371 return LowerGET_FPMODE(Op, DAG);
7372 case ISD::SET_FPMODE:
7373 return LowerSET_FPMODE(Op, DAG);
7374 case ISD::RESET_FPMODE:
7375 return LowerRESET_FPMODE(Op, DAG);
7376 case ISD::MUL:
7377 return LowerMUL(Op, DAG);
7378 case ISD::MULHS:
7379 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MULHS_PRED);
7380 case ISD::MULHU:
7381 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MULHU_PRED);
7382 case ISD::INTRINSIC_W_CHAIN:
7383 return LowerINTRINSIC_W_CHAIN(Op, DAG);
7384 case ISD::INTRINSIC_WO_CHAIN:
7385 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7386 case ISD::INTRINSIC_VOID:
7387 return LowerINTRINSIC_VOID(Op, DAG);
7388 case ISD::ATOMIC_STORE:
7389 if (cast<MemSDNode>(Val&: Op)->getMemoryVT() == MVT::i128) {
7390 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
7391 return LowerStore128(Op, DAG);
7392 }
7393 return SDValue();
7394 case ISD::STORE:
7395 return LowerSTORE(Op, DAG);
7396 case ISD::MSTORE:
7397 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
7398 case ISD::MGATHER:
7399 return LowerMGATHER(Op, DAG);
7400 case ISD::MSCATTER:
7401 return LowerMSCATTER(Op, DAG);
7402 case ISD::VECREDUCE_SEQ_FADD:
7403 return LowerVECREDUCE_SEQ_FADD(ScalarOp: Op, DAG);
7404 case ISD::VECREDUCE_ADD:
7405 case ISD::VECREDUCE_AND:
7406 case ISD::VECREDUCE_OR:
7407 case ISD::VECREDUCE_XOR:
7408 case ISD::VECREDUCE_SMAX:
7409 case ISD::VECREDUCE_SMIN:
7410 case ISD::VECREDUCE_UMAX:
7411 case ISD::VECREDUCE_UMIN:
7412 case ISD::VECREDUCE_FADD:
7413 case ISD::VECREDUCE_FMAX:
7414 case ISD::VECREDUCE_FMIN:
7415 case ISD::VECREDUCE_FMAXIMUM:
7416 case ISD::VECREDUCE_FMINIMUM:
7417 return LowerVECREDUCE(Op, DAG);
7418 case ISD::ATOMIC_LOAD_AND:
7419 return LowerATOMIC_LOAD_AND(Op, DAG);
7420 case ISD::DYNAMIC_STACKALLOC:
7421 return LowerDYNAMIC_STACKALLOC(Op, DAG);
7422 case ISD::VSCALE:
7423 return LowerVSCALE(Op, DAG);
7424 case ISD::VECTOR_COMPRESS:
7425 return LowerVECTOR_COMPRESS(Op, DAG);
7426 case ISD::ANY_EXTEND:
7427 case ISD::SIGN_EXTEND:
7428 case ISD::ZERO_EXTEND:
7429 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
7430 case ISD::ADDRSPACECAST:
7431 return LowerADDRSPACECAST(Op, DAG);
7432 case ISD::SIGN_EXTEND_INREG: {
7433 // Only custom lower when ExtraVT has a legal byte based element type.
7434 EVT ExtraVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
7435 EVT ExtraEltVT = ExtraVT.getVectorElementType();
7436 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
7437 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
7438 return SDValue();
7439
7440 return LowerToPredicatedOp(Op, DAG,
7441 NewOp: AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
7442 }
7443 case ISD::TRUNCATE:
7444 return LowerTRUNCATE(Op, DAG);
7445 case ISD::MLOAD:
7446 return LowerMLOAD(Op, DAG);
7447 case ISD::LOAD:
7448 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
7449 OverrideNEON: !Subtarget->isNeonAvailable()))
7450 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
7451 return LowerLOAD(Op, DAG);
7452 case ISD::ADD:
7453 case ISD::AND:
7454 case ISD::SUB:
7455 return LowerToScalableOp(Op, DAG);
7456 case ISD::FMAXIMUM:
7457 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMAX_PRED);
7458 case ISD::FMAXNUM:
7459 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMAXNM_PRED);
7460 case ISD::FMINIMUM:
7461 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMIN_PRED);
7462 case ISD::FMINNUM:
7463 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMINNM_PRED);
7464 case ISD::VSELECT:
7465 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
7466 case ISD::ABS:
7467 return LowerABS(Op, DAG);
7468 case ISD::ABDS:
7469 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::ABDS_PRED);
7470 case ISD::ABDU:
7471 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::ABDU_PRED);
7472 case ISD::AVGFLOORS:
7473 return LowerAVG(Op, DAG, NewOp: AArch64ISD::HADDS_PRED);
7474 case ISD::AVGFLOORU:
7475 return LowerAVG(Op, DAG, NewOp: AArch64ISD::HADDU_PRED);
7476 case ISD::AVGCEILS:
7477 return LowerAVG(Op, DAG, NewOp: AArch64ISD::RHADDS_PRED);
7478 case ISD::AVGCEILU:
7479 return LowerAVG(Op, DAG, NewOp: AArch64ISD::RHADDU_PRED);
7480 case ISD::BITREVERSE:
7481 return LowerBitreverse(Op, DAG);
7482 case ISD::BSWAP:
7483 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::BSWAP_MERGE_PASSTHRU);
7484 case ISD::CTLZ:
7485 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::CTLZ_MERGE_PASSTHRU);
7486 case ISD::CTTZ:
7487 return LowerCTTZ(Op, DAG);
7488 case ISD::VECTOR_SPLICE:
7489 return LowerVECTOR_SPLICE(Op, DAG);
7490 case ISD::VECTOR_DEINTERLEAVE:
7491 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
7492 case ISD::VECTOR_INTERLEAVE:
7493 return LowerVECTOR_INTERLEAVE(Op, DAG);
7494 case ISD::GET_ACTIVE_LANE_MASK:
7495 return LowerGET_ACTIVE_LANE_MASK(Op, DAG);
7496 case ISD::LRINT:
7497 case ISD::LLRINT:
7498 if (Op.getValueType().isVector())
7499 return LowerVectorXRINT(Op, DAG);
7500 [[fallthrough]];
7501 case ISD::LROUND:
7502 case ISD::LLROUND: {
7503 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
7504 Op.getOperand(0).getValueType() == MVT::bf16) &&
7505 "Expected custom lowering of rounding operations only for f16");
7506 SDLoc DL(Op);
7507 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: Op.getOperand(i: 0));
7508 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: Op.getValueType(), Operand: Ext);
7509 }
7510 case ISD::STRICT_LROUND:
7511 case ISD::STRICT_LLROUND:
7512 case ISD::STRICT_LRINT:
7513 case ISD::STRICT_LLRINT: {
7514 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
7515 Op.getOperand(1).getValueType() == MVT::bf16) &&
7516 "Expected custom lowering of rounding operations only for f16");
7517 SDLoc DL(Op);
7518 SDValue Ext = DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL, ResultTys: {MVT::f32, MVT::Other},
7519 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1)});
7520 return DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {Op.getValueType(), MVT::Other},
7521 Ops: {Ext.getValue(R: 1), Ext.getValue(R: 0)});
7522 }
7523 case ISD::WRITE_REGISTER: {
7524 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
7525 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
7526 SDLoc DL(Op);
7527
7528 SDValue Chain = Op.getOperand(i: 0);
7529 SDValue SysRegName = Op.getOperand(i: 1);
7530 std::pair<SDValue, SDValue> Pair =
7531 DAG.SplitScalar(N: Op.getOperand(i: 2), DL, LoVT: MVT::i64, HiVT: MVT::i64);
7532
7533 // chain = MSRR(chain, sysregname, lo, hi)
7534 SDValue Result = DAG.getNode(Opcode: AArch64ISD::MSRR, DL, VT: MVT::Other, N1: Chain,
7535 N2: SysRegName, N3: Pair.first, N4: Pair.second);
7536
7537 return Result;
7538 }
7539 case ISD::FSHL:
7540 case ISD::FSHR:
7541 return LowerFunnelShift(Op, DAG);
7542 case ISD::FLDEXP:
7543 return LowerFLDEXP(Op, DAG);
7544 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
7545 return LowerVECTOR_HISTOGRAM(Op, DAG);
7546 case ISD::PARTIAL_REDUCE_SMLA:
7547 case ISD::PARTIAL_REDUCE_UMLA:
7548 case ISD::PARTIAL_REDUCE_SUMLA:
7549 return LowerPARTIAL_REDUCE_MLA(Op, DAG);
7550 }
7551}
7552
7553bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
7554 return !Subtarget->useSVEForFixedLengthVectors();
7555}
7556
7557bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
7558 EVT VT, bool OverrideNEON) const {
7559 if (!VT.isFixedLengthVector() || !VT.isSimple())
7560 return false;
7561
7562 // Don't use SVE for vectors we cannot scalarize if required.
7563 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
7564 // Fixed length predicates should be promoted to i8.
7565 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
7566 case MVT::i1:
7567 default:
7568 return false;
7569 case MVT::i8:
7570 case MVT::i16:
7571 case MVT::i32:
7572 case MVT::i64:
7573 case MVT::f16:
7574 case MVT::f32:
7575 case MVT::f64:
7576 break;
7577 }
7578
7579 // NEON-sized vectors can be emulated using SVE instructions.
7580 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
7581 return Subtarget->isSVEorStreamingSVEAvailable();
7582
7583 // Ensure NEON MVTs only belong to a single register class.
7584 if (VT.getFixedSizeInBits() <= 128)
7585 return false;
7586
7587 // Ensure wider than NEON code generation is enabled.
7588 if (!Subtarget->useSVEForFixedLengthVectors())
7589 return false;
7590
7591 // Don't use SVE for types that don't fit.
7592 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
7593 return false;
7594
7595 // TODO: Perhaps an artificial restriction, but worth having whilst getting
7596 // the base fixed length SVE support in place.
7597 if (!VT.isPow2VectorType())
7598 return false;
7599
7600 return true;
7601}
7602
7603//===----------------------------------------------------------------------===//
7604// Calling Convention Implementation
7605//===----------------------------------------------------------------------===//
7606
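// Return the intrinsic ID for an ISD::INTRINSIC_WO_CHAIN node, or
// Intrinsic::not_intrinsic if the node is not a recognized intrinsic call.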
7607static unsigned getIntrinsicID(const SDNode *N) {
7608 unsigned Opcode = N->getOpcode();
7609 switch (Opcode) {
7610 default:
7611 return Intrinsic::not_intrinsic;
7612 case ISD::INTRINSIC_WO_CHAIN: {
7613 unsigned IID = N->getConstantOperandVal(Num: 0);
7614 if (IID < Intrinsic::num_intrinsics)
7615 return IID;
7616 return Intrinsic::not_intrinsic;
7617 }
7618 }
7619}
7620
7621bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
7622 SDValue N1) const {
7623 if (!N0.hasOneUse())
7624 return false;
7625
7626 unsigned IID = getIntrinsicID(N: N1.getNode());
7627 // Avoid reassociating expressions that can be lowered to smlal/umlal.
7628 if (IID == Intrinsic::aarch64_neon_umull ||
7629 N1.getOpcode() == AArch64ISD::UMULL ||
7630 IID == Intrinsic::aarch64_neon_smull ||
7631 N1.getOpcode() == AArch64ISD::SMULL)
7632 return N0.getOpcode() != ISD::ADD;
7633
7634 return true;
7635}
7636
7637/// Selects the correct CCAssignFn for a given CallingConvention value.
7638CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
7639 bool IsVarArg) const {
7640 switch (CC) {
7641 default:
7642 reportFatalUsageError(reason: "unsupported calling convention");
7643 case CallingConv::GHC:
7644 return CC_AArch64_GHC;
7645 case CallingConv::PreserveNone:
7646 // The VarArg implementation makes assumptions about register
7647 // argument passing that do not hold for preserve_none, so we
7648 // instead fall back to C argument passing.
7649 // The non-vararg case is handled in the CC function itself.
7650 if (!IsVarArg)
7651 return CC_AArch64_Preserve_None;
7652 [[fallthrough]];
7653 case CallingConv::C:
7654 case CallingConv::Fast:
7655 case CallingConv::PreserveMost:
7656 case CallingConv::PreserveAll:
7657 case CallingConv::CXX_FAST_TLS:
7658 case CallingConv::Swift:
7659 case CallingConv::SwiftTail:
7660 case CallingConv::Tail:
7661 case CallingConv::GRAAL:
7662 if (Subtarget->isTargetWindows()) {
7663 if (IsVarArg) {
7664 if (Subtarget->isWindowsArm64EC())
7665 return CC_AArch64_Arm64EC_VarArg;
7666 return CC_AArch64_Win64_VarArg;
7667 }
7668 return CC_AArch64_Win64PCS;
7669 }
7670 if (!Subtarget->isTargetDarwin())
7671 return CC_AArch64_AAPCS;
7672 if (!IsVarArg)
7673 return CC_AArch64_DarwinPCS;
7674 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
7675 : CC_AArch64_DarwinPCS_VarArg;
7676 case CallingConv::Win64:
7677 if (IsVarArg) {
7678 if (Subtarget->isWindowsArm64EC())
7679 return CC_AArch64_Arm64EC_VarArg;
7680 return CC_AArch64_Win64_VarArg;
7681 }
7682 return CC_AArch64_Win64PCS;
7683 case CallingConv::CFGuard_Check:
7684 if (Subtarget->isWindowsArm64EC())
7685 return CC_AArch64_Arm64EC_CFGuard_Check;
7686 return CC_AArch64_Win64_CFGuard_Check;
7687 case CallingConv::AArch64_VectorCall:
7688 case CallingConv::AArch64_SVE_VectorCall:
7689 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
7690 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1:
7691 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
7692 return CC_AArch64_AAPCS;
7693 case CallingConv::ARM64EC_Thunk_X64:
7694 return CC_AArch64_Arm64EC_Thunk;
7695 case CallingConv::ARM64EC_Thunk_Native:
7696 return CC_AArch64_Arm64EC_Thunk_Native;
7697 }
7698}
7699
7700CCAssignFn *
7701AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
7702 switch (CC) {
7703 default:
7704 return RetCC_AArch64_AAPCS;
7705 case CallingConv::ARM64EC_Thunk_X64:
7706 return RetCC_AArch64_Arm64EC_Thunk;
7707 case CallingConv::CFGuard_Check:
7708 if (Subtarget->isWindowsArm64EC())
7709 return RetCC_AArch64_Arm64EC_CFGuard_Check;
7710 return RetCC_AArch64_AAPCS;
7711 }
7712}
7713
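// Returns true if a value of this type is passed in a floating-point/SIMD
// register: fixed-length vectors and non-scalable floating-point types.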
7714static bool isPassedInFPR(EVT VT) {
7715 return VT.isFixedLengthVector() ||
7716 (VT.isFloatingPoint() && !VT.isScalableVector());
7717}
7718
7719SDValue AArch64TargetLowering::LowerFormalArguments(
7720 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
7721 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
7722 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
7723 MachineFunction &MF = DAG.getMachineFunction();
7724 const Function &F = MF.getFunction();
7725 MachineFrameInfo &MFI = MF.getFrameInfo();
7726 bool IsWin64 =
7727 Subtarget->isCallingConvWin64(CC: F.getCallingConv(), IsVarArg: F.isVarArg());
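// Arm64EC entry thunks and Arm64EC variadic calls locate their stack
// arguments relative to x4 rather than sp (see the stack-argument handling
// below).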
7728 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
7729 (isVarArg && Subtarget->isWindowsArm64EC());
7730 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7731
7732 SmallVector<ISD::OutputArg, 4> Outs;
7733 GetReturnInfo(CC: CallConv, ReturnType: F.getReturnType(), attr: F.getAttributes(), Outs,
7734 TLI: DAG.getTargetLoweringInfo(), DL: MF.getDataLayout());
7735 if (any_of(Range&: Outs, P: [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
7736 FuncInfo->setIsSVECC(true);
7737
7738 // Assign locations to all of the incoming arguments.
7739 SmallVector<CCValAssign, 16> ArgLocs;
7740 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
7741
7742 // At this point, Ins[].VT may already be promoted to i32. To correctly
7743 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
7744 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
7745 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
7746 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
7747 // LocVT.
7748 unsigned NumArgs = Ins.size();
7749 Function::const_arg_iterator CurOrigArg = F.arg_begin();
7750 unsigned CurArgIdx = 0;
7751 bool UseVarArgCC = false;
7752 if (IsWin64)
7753 UseVarArgCC = isVarArg;
7754
7755 CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg: UseVarArgCC);
7756
7757 for (unsigned i = 0; i != NumArgs; ++i) {
7758 MVT ValVT = Ins[i].VT;
7759 if (Ins[i].isOrigArg()) {
7760 std::advance(i&: CurOrigArg, n: Ins[i].getOrigArgIndex() - CurArgIdx);
7761 CurArgIdx = Ins[i].getOrigArgIndex();
7762
7763 // Get type of the original argument.
7764 EVT ActualVT = getValueType(DL: DAG.getDataLayout(), Ty: CurOrigArg->getType(),
7765 /*AllowUnknown*/ true);
7766 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
7767 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7768 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7769 ValVT = MVT::i8;
7770 else if (ActualMVT == MVT::i16)
7771 ValVT = MVT::i16;
7772 }
7773 bool Res =
7774 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
7775 assert(!Res && "Call operand has unhandled type");
7776 (void)Res;
7777 }
7778
7779 SMEAttrs Attrs = FuncInfo->getSMEFnAttrs();
7780 bool IsLocallyStreaming =
7781 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
7782 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
7783 SDValue Glue = Chain.getValue(R: 1);
7784
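// An indirectly passed argument with multiple parts (e.g. an SVE tuple) uses
// a single CCValAssign for all of its Ins entries; ExtraArgLocs tracks the
// difference so that ArgLocs[i - ExtraArgLocs] stays in sync with Ins[i].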
7785 unsigned ExtraArgLocs = 0;
7786 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
7787 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
7788
7789 if (Ins[i].Flags.isByVal()) {
7790 // Byval is used for HFAs in the PCS, but the system should work in a
7791 // non-compliant manner for larger structs.
7792 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
7793 int Size = Ins[i].Flags.getByValSize();
7794 unsigned NumRegs = (Size + 7) / 8;
7795
7796 // FIXME: This works on big-endian for composite byvals, which are the common
// case. It should also work for fundamental types.
7798 unsigned FrameIdx =
7799 MFI.CreateFixedObject(Size: 8 * NumRegs, SPOffset: VA.getLocMemOffset(), IsImmutable: false);
7800 SDValue FrameIdxN = DAG.getFrameIndex(FI: FrameIdx, VT: PtrVT);
7801 InVals.push_back(Elt: FrameIdxN);
7802
7803 continue;
7804 }
7805
7806 if (Ins[i].Flags.isSwiftAsync())
7807 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
7808
7809 SDValue ArgValue;
7810 if (VA.isRegLoc()) {
7811 // Arguments stored in registers.
7812 EVT RegVT = VA.getLocVT();
7813 const TargetRegisterClass *RC;
7814
7815 if (RegVT == MVT::i32)
7816 RC = &AArch64::GPR32RegClass;
7817 else if (RegVT == MVT::i64)
7818 RC = &AArch64::GPR64RegClass;
7819 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
7820 RC = &AArch64::FPR16RegClass;
7821 else if (RegVT == MVT::f32)
7822 RC = &AArch64::FPR32RegClass;
7823 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
7824 RC = &AArch64::FPR64RegClass;
7825 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
7826 RC = &AArch64::FPR128RegClass;
7827 else if (RegVT.isScalableVector() &&
7828 RegVT.getVectorElementType() == MVT::i1) {
7829 FuncInfo->setIsSVECC(true);
7830 RC = &AArch64::PPRRegClass;
7831 } else if (RegVT == MVT::aarch64svcount) {
7832 FuncInfo->setIsSVECC(true);
7833 RC = &AArch64::PPRRegClass;
7834 } else if (RegVT.isScalableVector()) {
7835 FuncInfo->setIsSVECC(true);
7836 RC = &AArch64::ZPRRegClass;
7837 } else
7838 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
7839
7840 // Transform the arguments in physical registers into virtual ones.
7841 Register Reg = MF.addLiveIn(PReg: VA.getLocReg(), RC);
7842
7843 if (IsLocallyStreaming) {
7844 // LocallyStreamingFunctions must insert the SMSTART in the correct
7845 // position, so we use Glue to ensure no instructions can be scheduled
7846 // between the chain of:
7847 // t0: ch,glue = EntryNode
7848 // t1: res,ch,glue = CopyFromReg
7849 // ...
7850 // tn: res,ch,glue = CopyFromReg t(n-1), ..
7851 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
7852 // ^^^^^^
7853 // This will be the new Chain/Root node.
7854 ArgValue = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT: RegVT, Glue);
7855 Glue = ArgValue.getValue(R: 2);
7856 if (isPassedInFPR(VT: ArgValue.getValueType())) {
7857 ArgValue =
7858 DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL,
7859 VTList: DAG.getVTList(VT1: ArgValue.getValueType(), VT2: MVT::Glue),
7860 Ops: {ArgValue, Glue});
7861 Glue = ArgValue.getValue(R: 1);
7862 }
7863 } else
7864 ArgValue = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT: RegVT);
7865
7866 // If this is an 8, 16 or 32-bit value, it is really passed promoted
7867 // to 64 bits. Insert an assert[sz]ext to capture this, then
7868 // truncate to the right size.
7869 switch (VA.getLocInfo()) {
7870 default:
7871 llvm_unreachable("Unknown loc info!");
7872 case CCValAssign::Full:
7873 break;
7874 case CCValAssign::Indirect:
7875 assert(
7876 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
7877 "Indirect arguments should be scalable on most subtargets");
7878 break;
7879 case CCValAssign::BCvt:
7880 ArgValue = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: ArgValue);
7881 break;
7882 case CCValAssign::AExt:
7883 case CCValAssign::SExt:
7884 case CCValAssign::ZExt:
7885 break;
7886 case CCValAssign::AExtUpper:
7887 ArgValue = DAG.getNode(Opcode: ISD::SRL, DL, VT: RegVT, N1: ArgValue,
7888 N2: DAG.getConstant(Val: 32, DL, VT: RegVT));
7889 ArgValue = DAG.getZExtOrTrunc(Op: ArgValue, DL, VT: VA.getValVT());
7890 break;
7891 }
7892 } else { // VA.isRegLoc()
7893 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
7894 unsigned ArgOffset = VA.getLocMemOffset();
7895 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
7896 ? VA.getLocVT().getSizeInBits()
7897 : VA.getValVT().getSizeInBits()) / 8;
7898
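// On big-endian targets, arguments smaller than 8 bytes are passed in the
// high-order bytes of their stack slot, so bias the offset accordingly.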
7899 uint32_t BEAlign = 0;
7900 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
7901 !Ins[i].Flags.isInConsecutiveRegs())
7902 BEAlign = 8 - ArgSize;
7903
7904 SDValue FIN;
7905 MachinePointerInfo PtrInfo;
7906 if (StackViaX4) {
7907 // In both the ARM64EC varargs convention and the thunk convention,
7908 // arguments on the stack are accessed relative to x4, not sp. In
7909 // the thunk convention, there's an additional offset of 32 bytes
7910 // to account for the shadow store.
7911 unsigned ObjOffset = ArgOffset + BEAlign;
7912 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
7913 ObjOffset += 32;
7914 Register VReg = MF.addLiveIn(PReg: AArch64::X4, RC: &AArch64::GPR64RegClass);
7915 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i64);
7916 FIN = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Val,
7917 N2: DAG.getConstant(Val: ObjOffset, DL, VT: MVT::i64));
7918 PtrInfo = MachinePointerInfo::getUnknownStack(MF);
7919 } else {
7920 int FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset + BEAlign, IsImmutable: true);
7921
7922 // Create load nodes to retrieve arguments from the stack.
7923 FIN = DAG.getFrameIndex(FI, VT: getPointerTy(DL: DAG.getDataLayout()));
7924 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
7925 }
7926
// For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
7928 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
7929 MVT MemVT = VA.getValVT();
7930
7931 switch (VA.getLocInfo()) {
7932 default:
7933 break;
7934 case CCValAssign::Trunc:
7935 case CCValAssign::BCvt:
7936 MemVT = VA.getLocVT();
7937 break;
7938 case CCValAssign::Indirect:
7939 assert((VA.getValVT().isScalableVector() ||
7940 Subtarget->isWindowsArm64EC()) &&
7941 "Indirect arguments should be scalable on most subtargets");
7942 MemVT = VA.getLocVT();
7943 break;
7944 case CCValAssign::SExt:
7945 ExtType = ISD::SEXTLOAD;
7946 break;
7947 case CCValAssign::ZExt:
7948 ExtType = ISD::ZEXTLOAD;
7949 break;
7950 case CCValAssign::AExt:
7951 ExtType = ISD::EXTLOAD;
7952 break;
7953 }
7954
7955 ArgValue = DAG.getExtLoad(ExtType, dl: DL, VT: VA.getLocVT(), Chain, Ptr: FIN, PtrInfo,
7956 MemVT);
7957 }
7958
7959 if (VA.getLocInfo() == CCValAssign::Indirect) {
7960 assert((VA.getValVT().isScalableVT() ||
7961 Subtarget->isWindowsArm64EC()) &&
7962 "Indirect arguments should be scalable on most subtargets");
7963
7964 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
7965 unsigned NumParts = 1;
7966 if (Ins[i].Flags.isInConsecutiveRegs()) {
7967 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
7968 ++NumParts;
7969 }
7970
7971 MVT PartLoad = VA.getValVT();
7972 SDValue Ptr = ArgValue;
7973
7974 // Ensure we generate all loads for each tuple part, whilst updating the
7975 // pointer after each load correctly using vscale.
7976 while (NumParts > 0) {
7977 ArgValue = DAG.getLoad(VT: PartLoad, dl: DL, Chain, Ptr, PtrInfo: MachinePointerInfo());
7978 InVals.push_back(Elt: ArgValue);
7979 NumParts--;
7980 if (NumParts > 0) {
7981 SDValue BytesIncrement;
7982 if (PartLoad.isScalableVector()) {
7983 BytesIncrement = DAG.getVScale(
7984 DL, VT: Ptr.getValueType(),
7985 MulImm: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
7986 } else {
7987 BytesIncrement = DAG.getConstant(
7988 Val: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
7989 VT: Ptr.getValueType());
7990 }
7991 Ptr = DAG.getNode(Opcode: ISD::ADD, DL, VT: Ptr.getValueType(), N1: Ptr,
7992 N2: BytesIncrement, Flags: SDNodeFlags::NoUnsignedWrap);
7993 ExtraArgLocs++;
7994 i++;
7995 }
7996 }
7997 } else {
7998 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
7999 ArgValue = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: ArgValue.getValueType(),
8000 N1: ArgValue, N2: DAG.getValueType(MVT::i32));
8001
8002 // i1 arguments are zero-extended to i8 by the caller. Emit a
8003 // hint to reflect this.
8004 if (Ins[i].isOrigArg()) {
8005 Argument *OrigArg = F.getArg(i: Ins[i].getOrigArgIndex());
8006 if (OrigArg->getType()->isIntegerTy(Bitwidth: 1)) {
8007 if (!Ins[i].Flags.isZExt()) {
8008 ArgValue = DAG.getNode(Opcode: AArch64ISD::ASSERT_ZEXT_BOOL, DL,
8009 VT: ArgValue.getValueType(), Operand: ArgValue);
8010 }
8011 }
8012 }
8013
8014 InVals.push_back(Elt: ArgValue);
8015 }
8016 }
8017 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
8018
8019 // Insert the SMSTART if this is a locally streaming function and
8020 // make sure it is Glued to the last CopyFromReg value.
8021 if (IsLocallyStreaming) {
8022 SDValue PStateSM;
8023 if (Attrs.hasStreamingCompatibleInterface()) {
8024 PStateSM = getRuntimePStateSM(DAG, Chain, DL, VT: MVT::i64);
8025 Register Reg = MF.getRegInfo().createVirtualRegister(
8026 RegClass: getRegClassFor(VT: PStateSM.getValueType().getSimpleVT()));
8027 FuncInfo->setPStateSMReg(Reg);
8028 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg, N: PStateSM);
8029 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, InGlue: Glue,
8030 Condition: AArch64SME::IfCallerIsNonStreaming, PStateSM);
8031 } else
8032 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, InGlue: Glue,
8033 Condition: AArch64SME::Always);
8034
// Thread each argument value through a CopyToReg/CopyFromReg pair on the new
// chain, so that uses of the arguments are ordered after the SMSTART (i.e.
// its chain result is actually used).
8037 for (unsigned I=0; I<InVals.size(); ++I) {
8038 Register Reg = MF.getRegInfo().createVirtualRegister(
8039 RegClass: getRegClassFor(VT: InVals[I].getValueType().getSimpleVT()));
8040 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg, N: InVals[I]);
8041 InVals[I] = DAG.getCopyFromReg(Chain, dl: DL, Reg,
8042 VT: InVals[I].getValueType());
8043 }
8044 }
8045
8046 // varargs
8047 if (isVarArg) {
8048 if (DAG.getMachineFunction().getFrameInfo().hasVAStart()) {
8049 if (!Subtarget->isTargetDarwin() || IsWin64) {
8050 // The AAPCS variadic function ABI is identical to the non-variadic
8051 // one. As a result there may be more arguments in registers and we
8052 // should save them for future reference.
8053 // Win64 variadic functions also pass arguments in registers, but all
8054 // float arguments are passed in integer registers.
8055 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
8056 }
8057
8058 // This will point to the next argument passed via stack.
8059 unsigned VarArgsOffset = CCInfo.getStackSize();
8060 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
8061 VarArgsOffset =
8062 alignTo(Value: VarArgsOffset, Align: Subtarget->isTargetILP32() ? 4 : 8);
8063 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
8064 FuncInfo->setVarArgsStackIndex(
8065 MFI.CreateFixedObject(Size: 4, SPOffset: VarArgsOffset, IsImmutable: true));
8066 }
8067
8068 if (MFI.hasMustTailInVarArgFunc()) {
8069 SmallVector<MVT, 2> RegParmTypes;
8070 RegParmTypes.push_back(Elt: MVT::i64);
8071 RegParmTypes.push_back(Elt: MVT::f128);
8072 // Compute the set of forwarded registers. The rest are scratch.
8073 SmallVectorImpl<ForwardedRegister> &Forwards =
8074 FuncInfo->getForwardedMustTailRegParms();
8075 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
8076 Fn: CC_AArch64_AAPCS);
8077
// Conservatively forward X8, since it might be used for an aggregate return.
8079 if (!CCInfo.isAllocated(Reg: AArch64::X8)) {
8080 Register X8VReg = MF.addLiveIn(PReg: AArch64::X8, RC: &AArch64::GPR64RegClass);
8081 Forwards.push_back(Elt: ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
8082 }
8083 }
8084 }
8085
8086 // On Windows, InReg pointers must be returned, so record the pointer in a
8087 // virtual register at the start of the function so it can be returned in the
8088 // epilogue.
8089 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
8090 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
8091 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
8092 Ins[I].Flags.isInReg()) &&
8093 Ins[I].Flags.isSRet()) {
8094 assert(!FuncInfo->getSRetReturnReg());
8095
8096 MVT PtrTy = getPointerTy(DL: DAG.getDataLayout());
8097 Register Reg =
8098 MF.getRegInfo().createVirtualRegister(RegClass: getRegClassFor(VT: PtrTy));
8099 FuncInfo->setSRetReturnReg(Reg);
8100
8101 SDValue Copy = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: DL, Reg, N: InVals[I]);
8102 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, N1: Copy, N2: Chain);
8103 break;
8104 }
8105 }
8106 }
8107
8108 unsigned StackArgSize = CCInfo.getStackSize();
8109 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8110 if (DoesCalleeRestoreStack(CallCC: CallConv, TailCallOpt)) {
8111 // This is a non-standard ABI so by fiat I say we're allowed to make full
8112 // use of the stack area to be popped, which must be aligned to 16 bytes in
8113 // any case:
8114 StackArgSize = alignTo(Value: StackArgSize, Align: 16);
8115
8116 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
8117 // a multiple of 16.
8118 FuncInfo->setArgumentStackToRestore(StackArgSize);
8119
8120 // This realignment carries over to the available bytes below. Our own
8121 // callers will guarantee the space is free by giving an aligned value to
8122 // CALLSEQ_START.
8123 }
8124 // Even if we're not expected to free up the space, it's useful to know how
8125 // much is there while considering tail calls (because we can reuse it).
8126 FuncInfo->setBytesInStackArgArea(StackArgSize);
8127
8128 if (Subtarget->hasCustomCallingConv())
8129 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
8130
// Create a 16-byte TPIDR2 object. The dynamic buffer
// will be expanded and stored in the static object later using a pseudo node.
8133 if (Attrs.hasZAState()) {
8134 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8135 TPIDR2.FrameIndex = MFI.CreateStackObject(Size: 16, Alignment: Align(16), isSpillSlot: false);
8136 SDValue SVL = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL, VT: MVT::i64,
8137 Operand: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
8138
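// Allocate the lazy-save buffer for ZA (SVL x SVL bytes): use the
// ALLOCATE_ZA_BUFFER pseudo where possible, and fall back to a dynamic stack
// allocation when stack probing is required (e.g. on Windows).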
8139 SDValue Buffer;
8140 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8141 Buffer = DAG.getNode(Opcode: AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
8142 VTList: DAG.getVTList(VT1: MVT::i64, VT2: MVT::Other), Ops: {Chain, SVL});
8143 } else {
8144 SDValue Size = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i64, N1: SVL, N2: SVL);
8145 Buffer = DAG.getNode(Opcode: ISD::DYNAMIC_STACKALLOC, DL,
8146 VTList: DAG.getVTList(VT1: MVT::i64, VT2: MVT::Other),
8147 Ops: {Chain, Size, DAG.getConstant(Val: 1, DL, VT: MVT::i64)});
8148 MFI.CreateVariableSizedObject(Alignment: Align(16), Alloca: nullptr);
8149 }
8150 Chain = DAG.getNode(
8151 Opcode: AArch64ISD::INIT_TPIDR2OBJ, DL, VTList: DAG.getVTList(VT: MVT::Other),
8152 Ops: {/*Chain*/ Buffer.getValue(R: 1), /*Buffer ptr*/ Buffer.getValue(R: 0)});
8153 } else if (Attrs.hasAgnosticZAInterface()) {
8154 // Call __arm_sme_state_size().
8155 SDValue BufferSize =
8156 DAG.getNode(Opcode: AArch64ISD::GET_SME_SAVE_SIZE, DL,
8157 VTList: DAG.getVTList(VT1: MVT::i64, VT2: MVT::Other), N: Chain);
8158 Chain = BufferSize.getValue(R: 1);
8159
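// Reserve a buffer of the returned size for __arm_sme_save/__arm_sme_restore,
// preferring the ALLOC_SME_SAVE_BUFFER pseudo over a dynamic allocation.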
8160 SDValue Buffer;
8161 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8162 Buffer =
8163 DAG.getNode(Opcode: AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL,
8164 VTList: DAG.getVTList(VT1: MVT::i64, VT2: MVT::Other), Ops: {Chain, BufferSize});
8165 } else {
8166 // Allocate space dynamically.
8167 Buffer = DAG.getNode(
8168 Opcode: ISD::DYNAMIC_STACKALLOC, DL, VTList: DAG.getVTList(VT1: MVT::i64, VT2: MVT::Other),
8169 Ops: {Chain, BufferSize, DAG.getConstant(Val: 1, DL, VT: MVT::i64)});
8170 MFI.CreateVariableSizedObject(Alignment: Align(16), Alloca: nullptr);
8171 }
8172
8173 // Copy the value to a virtual register, and save that in FuncInfo.
8174 Register BufferPtr =
8175 MF.getRegInfo().createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
8176 FuncInfo->setSMESaveBufferAddr(BufferPtr);
8177 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: BufferPtr, N: Buffer);
8178 }
8179
8180 if (CallConv == CallingConv::PreserveNone) {
8181 for (const ISD::InputArg &I : Ins) {
8182 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
8183 I.Flags.isSwiftAsync()) {
8184 MachineFunction &MF = DAG.getMachineFunction();
8185 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
8186 MF.getFunction(),
8187 "Swift attributes can't be used with preserve_none",
8188 DL.getDebugLoc()));
8189 break;
8190 }
8191 }
8192 }
8193
8194 return Chain;
8195}
8196
8197void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
8198 SelectionDAG &DAG,
8199 const SDLoc &DL,
8200 SDValue &Chain) const {
8201 MachineFunction &MF = DAG.getMachineFunction();
8202 MachineFrameInfo &MFI = MF.getFrameInfo();
8203 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8204 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
8205 Function &F = MF.getFunction();
8206 bool IsWin64 =
8207 Subtarget->isCallingConvWin64(CC: F.getCallingConv(), IsVarArg: F.isVarArg());
8208
8209 SmallVector<SDValue, 8> MemOps;
8210
8211 auto GPRArgRegs = AArch64::getGPRArgRegs();
8212 unsigned NumGPRArgRegs = GPRArgRegs.size();
8213 if (Subtarget->isWindowsArm64EC()) {
8214 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
8215 // functions.
8216 NumGPRArgRegs = 4;
8217 }
8218 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(Regs: GPRArgRegs);
8219
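// Spill the remaining (unallocated) GPR argument registers so va_arg can find
// them: into a fixed object just below the incoming SP on Win64, otherwise
// into a plain stack object.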
8220 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
8221 int GPRIdx = 0;
8222 if (GPRSaveSize != 0) {
8223 if (IsWin64) {
8224 GPRIdx = MFI.CreateFixedObject(Size: GPRSaveSize, SPOffset: -(int)GPRSaveSize, IsImmutable: false);
8225 if (GPRSaveSize & 15)
8226 // The extra size here, if triggered, will always be 8.
8227 MFI.CreateFixedObject(Size: 16 - (GPRSaveSize & 15), SPOffset: -(int)alignTo(Value: GPRSaveSize, Align: 16), IsImmutable: false);
8228 } else
8229 GPRIdx = MFI.CreateStackObject(Size: GPRSaveSize, Alignment: Align(8), isSpillSlot: false);
8230
8231 SDValue FIN;
8232 if (Subtarget->isWindowsArm64EC()) {
8233 // With the Arm64EC ABI, we reserve the save area as usual, but we
8234 // compute its address relative to x4. For a normal AArch64->AArch64
8235 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
8236 // different address.
8237 Register VReg = MF.addLiveIn(PReg: AArch64::X4, RC: &AArch64::GPR64RegClass);
8238 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i64);
8239 FIN = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i64, N1: Val,
8240 N2: DAG.getConstant(Val: GPRSaveSize, DL, VT: MVT::i64));
8241 } else {
8242 FIN = DAG.getFrameIndex(FI: GPRIdx, VT: PtrVT);
8243 }
8244
8245 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
8246 Register VReg = MF.addLiveIn(PReg: GPRArgRegs[i], RC: &AArch64::GPR64RegClass);
8247 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i64);
8248 SDValue Store =
8249 DAG.getStore(Chain: Val.getValue(R: 1), dl: DL, Val, Ptr: FIN,
8250 PtrInfo: IsWin64 ? MachinePointerInfo::getFixedStack(
8251 MF, FI: GPRIdx, Offset: (i - FirstVariadicGPR) * 8)
8252 : MachinePointerInfo::getStack(MF, Offset: i * 8));
8253 MemOps.push_back(Elt: Store);
8254 FIN =
8255 DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: FIN, N2: DAG.getConstant(Val: 8, DL, VT: PtrVT));
8256 }
8257 }
8258 FuncInfo->setVarArgsGPRIndex(GPRIdx);
8259 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
8260
8261 if (Subtarget->hasFPARMv8() && !IsWin64) {
8262 auto FPRArgRegs = AArch64::getFPRArgRegs();
8263 const unsigned NumFPRArgRegs = FPRArgRegs.size();
8264 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(Regs: FPRArgRegs);
8265
8266 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
8267 int FPRIdx = 0;
8268 if (FPRSaveSize != 0) {
8269 FPRIdx = MFI.CreateStackObject(Size: FPRSaveSize, Alignment: Align(16), isSpillSlot: false);
8270
8271 SDValue FIN = DAG.getFrameIndex(FI: FPRIdx, VT: PtrVT);
8272
8273 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
8274 Register VReg = MF.addLiveIn(PReg: FPRArgRegs[i], RC: &AArch64::FPR128RegClass);
8275 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::f128);
8276
8277 SDValue Store = DAG.getStore(Chain: Val.getValue(R: 1), dl: DL, Val, Ptr: FIN,
8278 PtrInfo: MachinePointerInfo::getStack(MF, Offset: i * 16));
8279 MemOps.push_back(Elt: Store);
8280 FIN = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: FIN,
8281 N2: DAG.getConstant(Val: 16, DL, VT: PtrVT));
8282 }
8283 }
8284 FuncInfo->setVarArgsFPRIndex(FPRIdx);
8285 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
8286 }
8287
8288 if (!MemOps.empty()) {
8289 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOps);
8290 }
8291}
8292
8293/// LowerCallResult - Lower the result values of a call into the
8294/// appropriate copies out of appropriate physical registers.
8295SDValue AArch64TargetLowering::LowerCallResult(
8296 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
8297 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
8298 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
8299 SDValue ThisVal, bool RequiresSMChange) const {
8300 DenseMap<unsigned, SDValue> CopiedRegs;
8301 // Copy all of the result registers out of their specified physreg.
8302 for (unsigned i = 0; i != RVLocs.size(); ++i) {
8303 CCValAssign VA = RVLocs[i];
8304
8305 // Pass 'this' value directly from the argument to return value, to avoid
8306 // reg unit interference
8307 if (i == 0 && isThisReturn) {
8308 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
8309 "unexpected return calling convention register assignment");
8310 InVals.push_back(Elt: ThisVal);
8311 continue;
8312 }
8313
8314 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
8315 // allows one use of a physreg per block.
8316 SDValue Val = CopiedRegs.lookup(Val: VA.getLocReg());
8317 if (!Val) {
8318 Val =
8319 DAG.getCopyFromReg(Chain, dl: DL, Reg: VA.getLocReg(), VT: VA.getLocVT(), Glue: InGlue);
8320 Chain = Val.getValue(R: 1);
8321 InGlue = Val.getValue(R: 2);
8322 CopiedRegs[VA.getLocReg()] = Val;
8323 }
8324
8325 switch (VA.getLocInfo()) {
8326 default:
8327 llvm_unreachable("Unknown loc info!");
8328 case CCValAssign::Full:
8329 break;
8330 case CCValAssign::BCvt:
8331 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: Val);
8332 break;
8333 case CCValAssign::AExtUpper:
8334 Val = DAG.getNode(Opcode: ISD::SRL, DL, VT: VA.getLocVT(), N1: Val,
8335 N2: DAG.getConstant(Val: 32, DL, VT: VA.getLocVT()));
8336 [[fallthrough]];
8337 case CCValAssign::AExt:
8338 [[fallthrough]];
8339 case CCValAssign::ZExt:
8340 Val = DAG.getZExtOrTrunc(Op: Val, DL, VT: VA.getValVT());
8341 break;
8342 }
8343
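// If the call required a streaming-mode change, results returned in FPRs must
// not be coalesced across the SMSTART/SMSTOP, so wrap them in a barrier node.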
8344 if (RequiresSMChange && isPassedInFPR(VT: VA.getValVT()))
8345 Val = DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL,
8346 VTList: DAG.getVTList(VT1: Val.getValueType(), VT2: MVT::Glue), N: Val);
8347
8348 InVals.push_back(Elt: Val);
8349 }
8350
8351 return Chain;
8352}
8353
8354/// Return true if the calling convention is one that we can guarantee TCO for.
8355static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
8356 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
8357 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
8358}
8359
8360/// Return true if we might ever do TCO for calls with this calling convention.
8361static bool mayTailCallThisCC(CallingConv::ID CC) {
8362 switch (CC) {
8363 case CallingConv::C:
8364 case CallingConv::AArch64_SVE_VectorCall:
8365 case CallingConv::PreserveMost:
8366 case CallingConv::PreserveAll:
8367 case CallingConv::PreserveNone:
8368 case CallingConv::Swift:
8369 case CallingConv::SwiftTail:
8370 case CallingConv::Tail:
8371 case CallingConv::Fast:
8372 return true;
8373 default:
8374 return false;
8375 }
8376}
8377
/// Return true if the calling convention supports varargs.
/// Currently only conventions that pass varargs the way the C calling
/// convention does are eligible.
/// Calling conventions listed in this function must also be properly handled
/// in AArch64Subtarget::isCallingConvWin64.
8383static bool callConvSupportsVarArgs(CallingConv::ID CC) {
8384 switch (CC) {
8385 case CallingConv::C:
8386 case CallingConv::PreserveNone:
// The SVE vector calling convention is only partially supported, but it
// should support named arguments being passed. Any arguments passed as
// varargs are still unsupported.
8390 case CallingConv::AArch64_SVE_VectorCall:
8391 return true;
8392 default:
8393 return false;
8394 }
8395}
8396
8397static void analyzeCallOperands(const AArch64TargetLowering &TLI,
8398 const AArch64Subtarget *Subtarget,
8399 const TargetLowering::CallLoweringInfo &CLI,
8400 CCState &CCInfo) {
8401 const SelectionDAG &DAG = CLI.DAG;
8402 CallingConv::ID CalleeCC = CLI.CallConv;
8403 bool IsVarArg = CLI.IsVarArg;
8404 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8405 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CC: CalleeCC, IsVarArg);
8406
8407 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
8408 // for the shadow store.
8409 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
8410 CCInfo.AllocateStack(Size: 32, Alignment: Align(16));
8411
8412 unsigned NumArgs = Outs.size();
8413 for (unsigned i = 0; i != NumArgs; ++i) {
8414 MVT ArgVT = Outs[i].VT;
8415 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
8416
8417 bool UseVarArgCC = false;
8418 if (IsVarArg) {
8419 // On Windows, the fixed arguments in a vararg call are passed in GPRs
8420 // too, so use the vararg CC to force them to integer registers.
8421 if (IsCalleeWin64) {
8422 UseVarArgCC = true;
8423 } else {
8424 UseVarArgCC = !Outs[i].IsFixed;
8425 }
8426 }
8427
8428 if (!UseVarArgCC) {
8429 // Get type of the original argument.
8430 EVT ActualVT =
8431 TLI.getValueType(DL: DAG.getDataLayout(), Ty: CLI.Args[Outs[i].OrigArgIndex].Ty,
8432 /*AllowUnknown*/ true);
8433 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
8434 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8435 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8436 ArgVT = MVT::i8;
8437 else if (ActualMVT == MVT::i16)
8438 ArgVT = MVT::i16;
8439 }
8440
8441 // FIXME: CCAssignFnForCall should be called once, for the call and not per
8442 // argument. This logic should exactly mirror LowerFormalArguments.
8443 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC: CalleeCC, IsVarArg: UseVarArgCC);
8444 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
8445 assert(!Res && "Call operand has unhandled type");
8446 (void)Res;
8447 }
8448}
8449
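// Compute the SME attributes of the caller and callee for a call site. If the
// callee cannot be identified, assume it has no SME attributes.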
8450static SMECallAttrs
8451getSMECallAttrs(const Function &Caller,
8452 const TargetLowering::CallLoweringInfo &CLI) {
8453 if (CLI.CB)
8454 return SMECallAttrs(*CLI.CB);
8455 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Val: CLI.Callee))
8456 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol()));
8457 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(SMEAttrs::Normal));
8458}
8459
8460bool AArch64TargetLowering::isEligibleForTailCallOptimization(
8461 const CallLoweringInfo &CLI) const {
8462 CallingConv::ID CalleeCC = CLI.CallConv;
8463 if (!mayTailCallThisCC(CC: CalleeCC))
8464 return false;
8465
8466 SDValue Callee = CLI.Callee;
8467 bool IsVarArg = CLI.IsVarArg;
8468 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8469 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8470 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
8471 const SelectionDAG &DAG = CLI.DAG;
8472 MachineFunction &MF = DAG.getMachineFunction();
8473 const Function &CallerF = MF.getFunction();
8474 CallingConv::ID CallerCC = CallerF.getCallingConv();
8475
8476 // SME Streaming functions are not eligible for TCO as they may require
8477 // the streaming mode or ZA to be restored after returning from the call.
8478 SMECallAttrs CallAttrs = getSMECallAttrs(Caller: CallerF, CLI);
8479 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
8480 CallAttrs.requiresPreservingAllZAState() ||
8481 CallAttrs.caller().hasStreamingBody())
8482 return false;
8483
8484 // Functions using the C or Fast calling convention that have an SVE signature
8485 // preserve more registers and should assume the SVE_VectorCall CC.
8486 // The check for matching callee-saved regs will determine whether it is
8487 // eligible for TCO.
8488 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
8489 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
8490 CallerCC = CallingConv::AArch64_SVE_VectorCall;
8491
8492 bool CCMatch = CallerCC == CalleeCC;
8493
8494 // When using the Windows calling convention on a non-windows OS, we want
8495 // to back up and restore X18 in such functions; we can't do a tail call
8496 // from those functions.
8497 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
8498 CalleeCC != CallingConv::Win64)
8499 return false;
8500
8501 // Byval parameters hand the function a pointer directly into the stack area
8502 // we want to reuse during a tail call. Working around this *is* possible (see
8503 // X86) but less efficient and uglier in LowerCall.
8504 for (Function::const_arg_iterator i = CallerF.arg_begin(),
8505 e = CallerF.arg_end();
8506 i != e; ++i) {
8507 if (i->hasByValAttr())
8508 return false;
8509
8510 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
8511 // In this case, it is necessary to save X0/X1 in the callee and return it
8512 // in X0. Tail call opt may interfere with this, so we disable tail call
8513 // opt when the caller has an "inreg" attribute -- except if the callee
8514 // also has that attribute on the same argument, and the same value is
8515 // passed.
8516 if (i->hasInRegAttr()) {
8517 unsigned ArgIdx = i - CallerF.arg_begin();
8518 if (!CLI.CB || CLI.CB->arg_size() <= ArgIdx)
8519 return false;
8520 AttributeSet Attrs = CLI.CB->getParamAttributes(ArgNo: ArgIdx);
8521 if (!Attrs.hasAttribute(Kind: Attribute::InReg) ||
8522 !Attrs.hasAttribute(Kind: Attribute::StructRet) || !i->hasStructRetAttr() ||
8523 CLI.CB->getArgOperand(i: ArgIdx) != i) {
8524 return false;
8525 }
8526 }
8527 }
8528
8529 if (canGuaranteeTCO(CC: CalleeCC, GuaranteeTailCalls: getTargetMachine().Options.GuaranteedTailCallOpt))
8530 return CCMatch;
8531
8532 // Externally-defined functions with weak linkage should not be
8533 // tail-called on AArch64 when the OS does not support dynamic
8534 // pre-emption of symbols, as the AAELF spec requires normal calls
8535 // to undefined weak functions to be replaced with a NOP or jump to the
8536 // next instruction. The behaviour of branch instructions in this
8537 // situation (as used for tail calls) is implementation-defined, so we
8538 // cannot rely on the linker replacing the tail call with a return.
8539 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
8540 const GlobalValue *GV = G->getGlobal();
8541 const Triple &TT = getTargetMachine().getTargetTriple();
8542 if (GV->hasExternalWeakLinkage() &&
8543 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
8544 return false;
8545 }
8546
8547 // Now we search for cases where we can use a tail call without changing the
8548 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
8549 // concept.
8550
8551 // I want anyone implementing a new calling convention to think long and hard
// about this check.
8553 if (IsVarArg && !callConvSupportsVarArgs(CC: CalleeCC))
8554 report_fatal_error(reason: "Unsupported variadic calling convention");
8555
8556 LLVMContext &C = *DAG.getContext();
8557 // Check that the call results are passed in the same way.
8558 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
8559 CalleeFn: CCAssignFnForCall(CC: CalleeCC, IsVarArg),
8560 CallerFn: CCAssignFnForCall(CC: CallerCC, IsVarArg)))
8561 return false;
8562 // The callee has to preserve all registers the caller needs to preserve.
8563 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8564 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
8565 if (!CCMatch) {
8566 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
8567 if (Subtarget->hasCustomCallingConv()) {
8568 TRI->UpdateCustomCallPreservedMask(MF, Mask: &CallerPreserved);
8569 TRI->UpdateCustomCallPreservedMask(MF, Mask: &CalleePreserved);
8570 }
8571 if (!TRI->regmaskSubsetEqual(mask0: CallerPreserved, mask1: CalleePreserved))
8572 return false;
8573 }
8574
8575 // Nothing more to check if the callee is taking no arguments
8576 if (Outs.empty())
8577 return true;
8578
8579 SmallVector<CCValAssign, 16> ArgLocs;
8580 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
8581
8582 analyzeCallOperands(TLI: *this, Subtarget, CLI, CCInfo);
8583
8584 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
// When the call is musttail, additional checks have already been done, so we
// can safely skip this check.
8586 // At least two cases here: if caller is fastcc then we can't have any
8587 // memory arguments (we'd be expected to clean up the stack afterwards). If
8588 // caller is C then we could potentially use its argument area.
8589
8590 // FIXME: for now we take the most conservative of these in both cases:
8591 // disallow all variadic memory operands.
8592 for (const CCValAssign &ArgLoc : ArgLocs)
8593 if (!ArgLoc.isRegLoc())
8594 return false;
8595 }
8596
8597 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8598
8599 // If any of the arguments is passed indirectly, it must be SVE, so the
8600 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
// allocate space on the stack. That is why we check for this explicitly here:
// if so, the call cannot be a tail call.
8603 if (llvm::any_of(Range&: ArgLocs, P: [&](CCValAssign &A) {
8604 assert((A.getLocInfo() != CCValAssign::Indirect ||
8605 A.getValVT().isScalableVector() ||
8606 Subtarget->isWindowsArm64EC()) &&
8607 "Expected value to be scalable");
8608 return A.getLocInfo() == CCValAssign::Indirect;
8609 }))
8610 return false;
8611
8612 // If the stack arguments for this call do not fit into our own save area then
// the call cannot be made a tail call.
8614 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
8615 return false;
8616
8617 const MachineRegisterInfo &MRI = MF.getRegInfo();
8618 if (!parametersInCSRMatch(MRI, CallerPreservedMask: CallerPreserved, ArgLocs, OutVals))
8619 return false;
8620
8621 return true;
8622}
8623
8624SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
8625 SelectionDAG &DAG,
8626 MachineFrameInfo &MFI,
8627 int ClobberedFI) const {
8628 SmallVector<SDValue, 8> ArgChains;
8629 int64_t FirstByte = MFI.getObjectOffset(ObjectIdx: ClobberedFI);
8630 int64_t LastByte = FirstByte + MFI.getObjectSize(ObjectIdx: ClobberedFI) - 1;
8631
8632 // Include the original chain at the beginning of the list. When this is
8633 // used by target LowerCall hooks, this helps legalize find the
8634 // CALLSEQ_BEGIN node.
8635 ArgChains.push_back(Elt: Chain);
8636
// Add a chain value for each stack argument load that overlaps the clobbered
// frame object.
8638 for (SDNode *U : DAG.getEntryNode().getNode()->users())
8639 if (LoadSDNode *L = dyn_cast<LoadSDNode>(Val: U))
8640 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: L->getBasePtr()))
8641 if (FI->getIndex() < 0) {
8642 int64_t InFirstByte = MFI.getObjectOffset(ObjectIdx: FI->getIndex());
8643 int64_t InLastByte = InFirstByte;
8644 InLastByte += MFI.getObjectSize(ObjectIdx: FI->getIndex()) - 1;
8645
8646 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
8647 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
8648 ArgChains.push_back(Elt: SDValue(L, 1));
8649 }
8650
8651 // Build a tokenfactor for all the chains.
8652 return DAG.getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ArgChains);
8653}
8654
8655bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
8656 bool TailCallOpt) const {
8657 return (CallCC == CallingConv::Fast && TailCallOpt) ||
8658 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
8659}
8660
8661// Check if the value is zero-extended from i1 to i8
8662static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
8663 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
8664 if (SizeInBits < 8)
8665 return false;
8666
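// Bits 1-7 of the value must be known zero for it to be a zero-extension of
// an i1 into an i8.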
8667 APInt RequiredZero(SizeInBits, 0xFE);
8668 KnownBits Bits = DAG.computeKnownBits(Op: Arg, Depth: 4);
8669 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
8670 return ZExtBool;
8671}
8672
8673void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
8674 SDNode *Node) const {
8675 // Live-in physreg copies that are glued to SMSTART are applied as
// implicit-defs in the InstrEmitter. Here we remove them, allowing the
// register allocator to pass call arguments in callee-saved registers without
// inserting extra copies to work around these fake clobbers of
// actually-preserved GPRs.
8679 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
8680 MI.getOpcode() == AArch64::MSRpstatePseudo) {
8681 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
8682 if (MachineOperand &MO = MI.getOperand(i: I);
8683 MO.isReg() && MO.isImplicit() && MO.isDef() &&
8684 (AArch64::GPR32RegClass.contains(Reg: MO.getReg()) ||
8685 AArch64::GPR64RegClass.contains(Reg: MO.getReg())))
8686 MI.removeOperand(OpNo: I);
8687
8688 // The SVE vector length can change when entering/leaving streaming mode.
8689 // FPMR is set to 0 when entering/leaving streaming mode.
8690 if (MI.getOperand(i: 0).getImm() == AArch64SVCR::SVCRSM ||
8691 MI.getOperand(i: 0).getImm() == AArch64SVCR::SVCRSMZA) {
8692 MI.addOperand(Op: MachineOperand::CreateReg(Reg: AArch64::VG, /*IsDef=*/isDef: false,
8693 /*IsImplicit=*/isImp: true));
8694 MI.addOperand(Op: MachineOperand::CreateReg(Reg: AArch64::VG, /*IsDef=*/isDef: true,
8695 /*IsImplicit=*/isImp: true));
8696 MI.addOperand(Op: MachineOperand::CreateReg(Reg: AArch64::FPMR, /*IsDef=*/isDef: true,
8697 /*IsImplicit=*/isImp: true));
8698 }
8699 }
8700
8701 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
8702 // have nothing to do with VG, were it not that they are used to materialise a
8703 // frame-address. If they contain a frame-index to a scalable vector, this
8704 // will likely require an ADDVL instruction to materialise the address, thus
8705 // reading VG.
8706 const MachineFunction &MF = *MI.getMF();
8707 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
8708 (MI.getOpcode() == AArch64::ADDXri ||
8709 MI.getOpcode() == AArch64::SUBXri)) {
8710 const MachineOperand &MO = MI.getOperand(i: 1);
8711 if (MO.isFI() && MF.getFrameInfo().getStackID(ObjectIdx: MO.getIndex()) ==
8712 TargetStackID::ScalableVector)
8713 MI.addOperand(Op: MachineOperand::CreateReg(Reg: AArch64::VG, /*IsDef=*/isDef: false,
8714 /*IsImplicit=*/isImp: true));
8715 }
8716}
8717
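// Emit an SMSTART or SMSTOP node to switch the streaming mode. If the switch
// is conditional (Condition != Always), the runtime PSTATE.SM value passed in
// PStateSM decides whether the switch actually takes place.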
8718SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
8719 bool Enable, SDValue Chain,
8720 SDValue InGlue,
8721 unsigned Condition,
8722 SDValue PStateSM) const {
8723 MachineFunction &MF = DAG.getMachineFunction();
8724 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8725 FuncInfo->setHasStreamingModeChanges(true);
8726
8727 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8728 SDValue RegMask = DAG.getRegisterMask(RegMask: TRI->getSMStartStopCallPreservedMask());
8729 SDValue MSROp =
8730 DAG.getTargetConstant(Val: (int32_t)AArch64SVCR::SVCRSM, DL, VT: MVT::i32);
8731 SmallVector<SDValue> Ops = {Chain, MSROp};
8732 unsigned Opcode;
8733 if (Condition != AArch64SME::Always) {
8734 SDValue ConditionOp = DAG.getTargetConstant(Val: Condition, DL, VT: MVT::i64);
8735 Opcode = Enable ? AArch64ISD::COND_SMSTART : AArch64ISD::COND_SMSTOP;
8736 assert(PStateSM && "PStateSM should be defined");
8737 Ops.push_back(Elt: ConditionOp);
8738 Ops.push_back(Elt: PStateSM);
8739 } else {
8740 Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
8741 }
8742 Ops.push_back(Elt: RegMask);
8743
8744 if (InGlue)
8745 Ops.push_back(Elt: InGlue);
8746
8747 return DAG.getNode(Opcode, DL, VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), Ops);
8748}
8749
8750// Emit a call to __arm_sme_save or __arm_sme_restore.
8751static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI,
8752 SelectionDAG &DAG,
8753 AArch64FunctionInfo *Info, SDLoc DL,
8754 SDValue Chain, bool IsSave) {
8755 MachineFunction &MF = DAG.getMachineFunction();
8756 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8757 FuncInfo->setSMESaveBufferUsed();
8758
8759 TargetLowering::ArgListTy Args;
8760 TargetLowering::ArgListEntry Entry;
8761 Entry.Ty = PointerType::getUnqual(C&: *DAG.getContext());
8762 Entry.Node =
8763 DAG.getCopyFromReg(Chain, dl: DL, Reg: Info->getSMESaveBufferAddr(), VT: MVT::i64);
8764 Args.push_back(x: Entry);
8765
8766 SDValue Callee =
8767 DAG.getExternalSymbol(Sym: IsSave ? "__arm_sme_save" : "__arm_sme_restore",
8768 VT: TLI.getPointerTy(DL: DAG.getDataLayout()));
8769 auto *RetTy = Type::getVoidTy(C&: *DAG.getContext());
8770 TargetLowering::CallLoweringInfo CLI(DAG);
8771 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8772 CC: CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1, ResultType: RetTy,
8773 Target: Callee, ArgsList: std::move(Args));
8774 return TLI.LowerCallTo(CLI).second;
8775}
8776
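// Determine under which condition the SMSTART/SMSTOP emitted around a call
// must actually execute, based on the streaming interfaces of the caller and
// callee.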
8777static AArch64SME::ToggleCondition
8778getSMToggleCondition(const SMECallAttrs &CallAttrs) {
8779 if (!CallAttrs.caller().hasStreamingCompatibleInterface() ||
8780 CallAttrs.caller().hasStreamingBody())
8781 return AArch64SME::Always;
8782 if (CallAttrs.callee().hasNonStreamingInterface())
8783 return AArch64SME::IfCallerIsStreaming;
8784 if (CallAttrs.callee().hasStreamingInterface())
8785 return AArch64SME::IfCallerIsNonStreaming;
8786
8787 llvm_unreachable("Unsupported attributes");
8788}
8789
8790/// Check whether a stack argument requires lowering in a tail call.
8791static bool shouldLowerTailCallStackArg(const MachineFunction &MF,
8792 const CCValAssign &VA, SDValue Arg,
8793 ISD::ArgFlagsTy Flags, int CallOffset) {
8794 // FIXME: We should be able to handle this case, but it's not clear how to.
8795 if (Flags.isZExt() || Flags.isSExt())
8796 return true;
8797
8798 for (;;) {
8799 // Look through nodes that don't alter the bits of the incoming value.
8800 unsigned Op = Arg.getOpcode();
8801 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
8802 Arg->isAssert() || Op == AArch64ISD::ASSERT_ZEXT_BOOL) {
8803 Arg = Arg.getOperand(i: 0);
8804 continue;
8805 }
8806 break;
8807 }
8808
8809 // If the argument is a load from the same immutable stack slot, we can reuse
8810 // it.
8811 if (auto *LoadNode = dyn_cast<LoadSDNode>(Val&: Arg)) {
8812 if (auto *FINode = dyn_cast<FrameIndexSDNode>(Val: LoadNode->getBasePtr())) {
8813 const MachineFrameInfo &MFI = MF.getFrameInfo();
8814 int FI = FINode->getIndex();
8815 if (!MFI.isImmutableObjectIndex(ObjectIdx: FI))
8816 return true;
8817 if (CallOffset != MFI.getObjectOffset(ObjectIdx: FI))
8818 return true;
8819 uint64_t SizeInBits = LoadNode->getMemoryVT().getFixedSizeInBits();
8820 if (SizeInBits / 8 != MFI.getObjectSize(ObjectIdx: FI))
8821 return true;
8822 return false;
8823 }
8824 }
8825
8826 return true;
8827}
8828
8829/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
8830/// and add input and output parameter nodes.
8831SDValue
8832AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
8833 SmallVectorImpl<SDValue> &InVals) const {
8834 SelectionDAG &DAG = CLI.DAG;
8835 SDLoc &DL = CLI.DL;
8836 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8837 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8838 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
8839 SDValue Chain = CLI.Chain;
8840 SDValue Callee = CLI.Callee;
8841 bool &IsTailCall = CLI.IsTailCall;
8842 CallingConv::ID &CallConv = CLI.CallConv;
8843 bool IsVarArg = CLI.IsVarArg;
8844
8845 MachineFunction &MF = DAG.getMachineFunction();
8846 MachineFunction::CallSiteInfo CSInfo;
8847 bool IsThisReturn = false;
8848
8849 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8850 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8851 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
8852 bool IsSibCall = false;
8853 bool GuardWithBTI = false;
8854
8855 if (CLI.CB && CLI.CB->hasFnAttr(Kind: Attribute::ReturnsTwice) &&
8856 !Subtarget->noBTIAtReturnTwice()) {
8857 GuardWithBTI = FuncInfo->branchTargetEnforcement();
8858 }
8859
8860 // Analyze operands of the call, assigning locations to each operand.
8861 SmallVector<CCValAssign, 16> ArgLocs;
8862 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
8863
8864 if (IsVarArg) {
8865 unsigned NumArgs = Outs.size();
8866
8867 for (unsigned i = 0; i != NumArgs; ++i) {
8868 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
8869 report_fatal_error(reason: "Passing SVE types to variadic functions is "
8870 "currently not supported");
8871 }
8872 }
8873
8874 analyzeCallOperands(TLI: *this, Subtarget, CLI, CCInfo);
8875
8876 CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv);
8877 // Assign locations to each value returned by this call.
8878 SmallVector<CCValAssign, 16> RVLocs;
8879 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
8880 *DAG.getContext());
8881 RetCCInfo.AnalyzeCallResult(Ins, Fn: RetCC);
8882
8883 // Check callee args/returns for SVE registers and set calling convention
8884 // accordingly.
8885 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
8886 auto HasSVERegLoc = [](CCValAssign &Loc) {
8887 if (!Loc.isRegLoc())
8888 return false;
8889 return AArch64::ZPRRegClass.contains(Reg: Loc.getLocReg()) ||
8890 AArch64::PPRRegClass.contains(Reg: Loc.getLocReg());
8891 };
8892 if (any_of(Range&: RVLocs, P: HasSVERegLoc) || any_of(Range&: ArgLocs, P: HasSVERegLoc))
8893 CallConv = CallingConv::AArch64_SVE_VectorCall;
8894 }
8895
8896 if (IsTailCall) {
8897 // Check if it's really possible to do a tail call.
8898 IsTailCall = isEligibleForTailCallOptimization(CLI);
8899
8900 // A sibling call is one where we're under the usual C ABI and not planning
8901 // to change that but can still do a tail call:
8902 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
8903 CallConv != CallingConv::SwiftTail)
8904 IsSibCall = true;
8905
8906 if (IsTailCall)
8907 ++NumTailCalls;
8908 }
8909
8910 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
8911 report_fatal_error(reason: "failed to perform tail call elimination on a call "
8912 "site marked musttail");
8913
8914 // Get a count of how many bytes are to be pushed on the stack.
8915 unsigned NumBytes = CCInfo.getStackSize();
8916
8917 if (IsSibCall) {
8918 // Since we're not changing the ABI to make this a tail call, the memory
8919 // operands are already available in the caller's incoming argument space.
8920 NumBytes = 0;
8921 }
8922
8923 // FPDiff is the byte offset of the call's argument area from the callee's.
8924 // Stores to callee stack arguments will be placed in FixedStackSlots offset
8925 // by this amount for a tail call. In a sibling call it must be 0 because the
8926 // caller will deallocate the entire stack and the callee still expects its
8927 // arguments to begin at SP+0. Completely unused for non-tail calls.
8928 int FPDiff = 0;
8929
8930 if (IsTailCall && !IsSibCall) {
8931 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
8932
8933 // Since callee will pop argument stack as a tail call, we must keep the
8934 // popped size 16-byte aligned.
8935 NumBytes = alignTo(Value: NumBytes, Align: 16);
8936
8937 // FPDiff will be negative if this tail call requires more space than we
8938 // would automatically have in our incoming argument space. Positive if we
8939 // can actually shrink the stack.
8940 FPDiff = NumReusableBytes - NumBytes;
8941
8942 // Update the required reserved area if this is the tail call requiring the
8943 // most argument stack space.
8944 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
8945 FuncInfo->setTailCallReservedStack(-FPDiff);
8946
8947 // The stack pointer must be 16-byte aligned at all times it's used for a
8948 // memory operation, which in practice means at *all* times and in
8949 // particular across call boundaries. Therefore our own arguments started at
8950 // a 16-byte aligned SP and the delta applied for the tail call should
8951 // satisfy the same constraint.
8952 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
8953 }
8954
8955 // Determine whether we need any streaming mode changes.
8956 SMECallAttrs CallAttrs = getSMECallAttrs(Caller: MF.getFunction(), CLI);
8957
8958 auto DescribeCallsite =
8959 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
8960 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
8961 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Val&: CLI.Callee))
8962 R << ore::NV("Callee", ES->getSymbol());
8963 else if (CLI.CB && CLI.CB->getCalledFunction())
8964 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
8965 else
8966 R << "unknown callee";
8967 R << "'";
8968 return R;
8969 };
8970
8971 bool RequiresLazySave = CallAttrs.requiresLazySave();
8972 bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState();
8973 if (RequiresLazySave) {
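    // Per the SME lazy-save ABI (illustrative summary): the first 8 bytes of
    // the TPIDR2 block hold the ZA save-buffer pointer (filled in when the
    // object is allocated), and the 16-bit field at offset 8 holds the number
    // of save slices. The code below writes that field from RDSVL #1 and then
    // points TPIDR2_EL0 at the block, so a callee that needs ZA can spill our
    // contents via __arm_tpidr2_save.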
8974 const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8975 MachinePointerInfo MPI =
8976 MachinePointerInfo::getStack(MF, Offset: TPIDR2.FrameIndex);
8977 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
8978 FI: TPIDR2.FrameIndex,
8979 VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
8980 SDValue NumZaSaveSlicesAddr =
8981 DAG.getNode(Opcode: ISD::ADD, DL, VT: TPIDR2ObjAddr.getValueType(), N1: TPIDR2ObjAddr,
8982 N2: DAG.getConstant(Val: 8, DL, VT: TPIDR2ObjAddr.getValueType()));
8983 SDValue NumZaSaveSlices = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL, VT: MVT::i64,
8984 Operand: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
8985 Chain = DAG.getTruncStore(Chain, dl: DL, Val: NumZaSaveSlices, Ptr: NumZaSaveSlicesAddr,
8986 PtrInfo: MPI, SVT: MVT::i16);
8987 Chain = DAG.getNode(
8988 Opcode: ISD::INTRINSIC_VOID, DL, VT: MVT::Other, N1: Chain,
8989 N2: DAG.getConstant(Val: Intrinsic::aarch64_sme_set_tpidr2, DL, VT: MVT::i32),
8990 N3: TPIDR2ObjAddr);
8991 OptimizationRemarkEmitter ORE(&MF.getFunction());
8992 ORE.emit(RemarkBuilder: [&]() {
8993 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8994 CLI.CB)
8995 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8996 &MF.getFunction());
8997 return DescribeCallsite(R) << " sets up a lazy save for ZA";
8998 });
8999 } else if (RequiresSaveAllZA) {
9000 assert(!CallAttrs.callee().hasSharedZAInterface() &&
9001 "Cannot share state that may not exist");
9002 Chain = emitSMEStateSaveRestore(TLI: *this, DAG, Info: FuncInfo, DL, Chain,
9003 /*IsSave=*/true);
9004 }
9005
9006 SDValue PStateSM;
9007 bool RequiresSMChange = CallAttrs.requiresSMChange();
9008 if (RequiresSMChange) {
9009 if (CallAttrs.caller().hasStreamingInterfaceOrBody())
9010 PStateSM = DAG.getConstant(Val: 1, DL, VT: MVT::i64);
9011 else if (CallAttrs.caller().hasNonStreamingInterface())
9012 PStateSM = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
9013 else
9014 PStateSM = getRuntimePStateSM(DAG, Chain, DL, VT: MVT::i64);
9015 OptimizationRemarkEmitter ORE(&MF.getFunction());
9016 ORE.emit(RemarkBuilder: [&]() {
9017 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
9018 CLI.CB)
9019 : OptimizationRemarkAnalysis("sme", "SMETransition",
9020 &MF.getFunction());
9021 DescribeCallsite(R) << " requires a streaming mode transition";
9022 return R;
9023 });
9024 }
9025
9026 SDValue ZTFrameIdx;
9027 MachineFrameInfo &MFI = MF.getFrameInfo();
9028 bool ShouldPreserveZT0 = CallAttrs.requiresPreservingZT0();
9029
9030 // If the caller has ZT0 state which will not be preserved by the callee,
9031 // spill ZT0 before the call.
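  // ZT0 is 512 bits wide, hence the 64-byte spill slot created below; the
  // SAVE_ZT/RESTORE_ZT pseudos are expected to expand to "str zt0"/"ldr zt0"
  // against that slot (illustrative; the exact expansion lives elsewhere).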
9032 if (ShouldPreserveZT0) {
9033 unsigned ZTObj = MFI.CreateSpillStackObject(Size: 64, Alignment: Align(16));
9034 ZTFrameIdx = DAG.getFrameIndex(
9035 FI: ZTObj,
9036 VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
9037
9038 Chain = DAG.getNode(Opcode: AArch64ISD::SAVE_ZT, DL, VTList: DAG.getVTList(VT: MVT::Other),
9039 Ops: {Chain, DAG.getConstant(Val: 0, DL, VT: MVT::i32), ZTFrameIdx});
9040 }
9041
9042  // If the caller shares ZT0 but the callee does not share ZA, we need to
9043  // stop PSTATE.ZA before the call if there is no lazy-save active.
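  // Illustrative: the SMSTOP node built below becomes a plain "smstop za",
  // leaving PSTATE.SM untouched.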
9044 bool DisableZA = CallAttrs.requiresDisablingZABeforeCall();
9045 assert((!DisableZA || !RequiresLazySave) &&
9046 "Lazy-save should have PSTATE.SM=1 on entry to the function");
9047
9048 if (DisableZA)
9049 Chain = DAG.getNode(
9050 Opcode: AArch64ISD::SMSTOP, DL, VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), N1: Chain,
9051 N2: DAG.getTargetConstant(Val: (int32_t)(AArch64SVCR::SVCRZA), DL, VT: MVT::i32));
9052
9053 // Adjust the stack pointer for the new arguments...
9054 // These operations are automatically eliminated by the prolog/epilog pass
9055 if (!IsSibCall)
9056 Chain = DAG.getCALLSEQ_START(Chain, InSize: IsTailCall ? 0 : NumBytes, OutSize: 0, DL);
9057
9058 SDValue StackPtr = DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::SP,
9059 VT: getPointerTy(DL: DAG.getDataLayout()));
9060
9061 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
9062 SmallSet<unsigned, 8> RegsUsed;
9063 SmallVector<SDValue, 8> MemOpChains;
9064 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
9065
9066 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
9067 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
9068 for (const auto &F : Forwards) {
9069 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: F.VReg, VT: F.VT);
9070 RegsToPass.emplace_back(Args: F.PReg, Args&: Val);
9071 }
9072 }
9073
9074 // Walk the register/memloc assignments, inserting copies/loads.
9075 unsigned ExtraArgLocs = 0;
9076 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
9077 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
9078 SDValue Arg = OutVals[i];
9079 ISD::ArgFlagsTy Flags = Outs[i].Flags;
9080
9081 // Promote the value if needed.
9082 switch (VA.getLocInfo()) {
9083 default:
9084 llvm_unreachable("Unknown loc info!");
9085 case CCValAssign::Full:
9086 break;
9087 case CCValAssign::SExt:
9088 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
9089 break;
9090 case CCValAssign::ZExt:
9091 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
9092 break;
9093 case CCValAssign::AExt:
9094 if (Outs[i].ArgVT == MVT::i1) {
9095        // AAPCS requires i1 to be zero-extended to 8 bits by the caller.
9096 //
9097 // Check if we actually have to do this, because the value may
9098 // already be zero-extended.
9099 //
9100 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
9101 // and rely on DAGCombiner to fold this, because the following
9102 // (anyext i32) is combined with (zext i8) in DAG.getNode:
9103 //
9104 // (ext (zext x)) -> (zext x)
9105 //
9106 // This will give us (zext i32), which we cannot remove, so
9107 // try to check this beforehand.
9108 if (!checkZExtBool(Arg, DAG)) {
9109 Arg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Arg);
9110 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i8, Operand: Arg);
9111 }
9112 }
9113 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
9114 break;
9115 case CCValAssign::AExtUpper:
9116 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9117 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
9118 Arg = DAG.getNode(Opcode: ISD::SHL, DL, VT: VA.getLocVT(), N1: Arg,
9119 N2: DAG.getConstant(Val: 32, DL, VT: VA.getLocVT()));
9120 break;
9121 case CCValAssign::BCvt:
9122 Arg = DAG.getBitcast(VT: VA.getLocVT(), V: Arg);
9123 break;
9124 case CCValAssign::Trunc:
9125 Arg = DAG.getZExtOrTrunc(Op: Arg, DL, VT: VA.getLocVT());
9126 break;
9127 case CCValAssign::FPExt:
9128 Arg = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
9129 break;
9130 case CCValAssign::Indirect:
9131 bool isScalable = VA.getValVT().isScalableVT();
9132 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
9133 "Indirect arguments should be scalable on most subtargets");
9134
9135 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
9136 uint64_t PartSize = StoreSize;
9137 unsigned NumParts = 1;
9138 if (Outs[i].Flags.isInConsecutiveRegs()) {
9139 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
9140 ++NumParts;
9141 StoreSize *= NumParts;
9142 }
9143
9144 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(Context&: *DAG.getContext());
9145 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
9146 MachineFrameInfo &MFI = MF.getFrameInfo();
9147 int FI = MFI.CreateStackObject(Size: StoreSize, Alignment, isSpillSlot: false);
9148 if (isScalable)
9149 MFI.setStackID(ObjectIdx: FI, ID: TargetStackID::ScalableVector);
9150
9151 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
9152 SDValue Ptr = DAG.getFrameIndex(
9153 FI, VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
9154 SDValue SpillSlot = Ptr;
9155
9156 // Ensure we generate all stores for each tuple part, whilst updating the
9157 // pointer after each store correctly using vscale.
9158 while (NumParts) {
9159 SDValue Store = DAG.getStore(Chain, dl: DL, Val: OutVals[i], Ptr, PtrInfo: MPI);
9160 MemOpChains.push_back(Elt: Store);
9161
9162 NumParts--;
9163 if (NumParts > 0) {
9164 SDValue BytesIncrement;
9165 if (isScalable) {
9166 BytesIncrement = DAG.getVScale(
9167 DL, VT: Ptr.getValueType(),
9168 MulImm: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
9169 } else {
9170 BytesIncrement = DAG.getConstant(
9171 Val: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
9172 VT: Ptr.getValueType());
9173 }
9174 MPI = MachinePointerInfo(MPI.getAddrSpace());
9175 Ptr = DAG.getNode(Opcode: ISD::ADD, DL, VT: Ptr.getValueType(), N1: Ptr,
9176 N2: BytesIncrement, Flags: SDNodeFlags::NoUnsignedWrap);
9177 ExtraArgLocs++;
9178 i++;
9179 }
9180 }
9181
9182 Arg = SpillSlot;
9183 break;
9184 }
9185
9186 if (VA.isRegLoc()) {
9187 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
9188 Outs[0].VT == MVT::i64) {
9189 assert(VA.getLocVT() == MVT::i64 &&
9190 "unexpected calling convention register assignment");
9191 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
9192 "unexpected use of 'returned'");
9193 IsThisReturn = true;
9194 }
9195 if (RegsUsed.count(V: VA.getLocReg())) {
9196 // If this register has already been used then we're trying to pack
9197 // parts of an [N x i32] into an X-register. The extension type will
9198 // take care of putting the two halves in the right place but we have to
9199 // combine them.
9200 SDValue &Bits =
9201 llvm::find_if(Range&: RegsToPass,
9202 P: [=](const std::pair<unsigned, SDValue> &Elt) {
9203 return Elt.first == VA.getLocReg();
9204 })
9205 ->second;
9206 Bits = DAG.getNode(Opcode: ISD::OR, DL, VT: Bits.getValueType(), N1: Bits, N2: Arg);
9207 // Call site info is used for function's parameter entry value
9208 // tracking. For now we track only simple cases when parameter
9209 // is transferred through whole register.
9210 llvm::erase_if(C&: CSInfo.ArgRegPairs,
9211 P: [&VA](MachineFunction::ArgRegPair ArgReg) {
9212 return ArgReg.Reg == VA.getLocReg();
9213 });
9214 } else {
9215 // Add an extra level of indirection for streaming mode changes by
9216 // using a pseudo copy node that cannot be rematerialised between a
9217 // smstart/smstop and the call by the simple register coalescer.
9218 if (RequiresSMChange && isPassedInFPR(VT: Arg.getValueType()))
9219 Arg = DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL,
9220 VTList: DAG.getVTList(VT1: Arg.getValueType(), VT2: MVT::Glue), N: Arg);
9221 RegsToPass.emplace_back(Args: VA.getLocReg(), Args&: Arg);
9222 RegsUsed.insert(V: VA.getLocReg());
9223 const TargetOptions &Options = DAG.getTarget().Options;
9224 if (Options.EmitCallSiteInfo)
9225 CSInfo.ArgRegPairs.emplace_back(Args: VA.getLocReg(), Args&: i);
9226 }
9227 } else {
9228 assert(VA.isMemLoc());
9229
9230 SDValue DstAddr;
9231 MachinePointerInfo DstInfo;
9232
9233 // FIXME: This works on big-endian for composite byvals, which are the
9234      // common case. It should also work for fundamental types.
9235 uint32_t BEAlign = 0;
9236 unsigned OpSize;
9237 if (VA.getLocInfo() == CCValAssign::Indirect ||
9238 VA.getLocInfo() == CCValAssign::Trunc)
9239 OpSize = VA.getLocVT().getFixedSizeInBits();
9240 else
9241 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
9242 : VA.getValVT().getSizeInBits();
9243 OpSize = (OpSize + 7) / 8;
9244 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
9245 !Flags.isInConsecutiveRegs()) {
9246 if (OpSize < 8)
9247 BEAlign = 8 - OpSize;
9248 }
9249 unsigned LocMemOffset = VA.getLocMemOffset();
9250 int32_t Offset = LocMemOffset + BEAlign;
9251
9252 if (IsTailCall) {
9253        // When the outgoing argument area lines up exactly with the caller's
9254        // (FPDiff == 0) and a stack argument is passed down intact, reuse it.
9255 if (!FPDiff && !shouldLowerTailCallStackArg(MF, VA, Arg, Flags, CallOffset: Offset))
9256 continue;
9257
9258 Offset = Offset + FPDiff;
9259 int FI = MF.getFrameInfo().CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true);
9260
9261 DstAddr = DAG.getFrameIndex(FI, VT: PtrVT);
9262 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
9263
9264 // Make sure any stack arguments overlapping with where we're storing
9265 // are loaded before this eventual operation. Otherwise they'll be
9266 // clobbered.
9267 Chain = addTokenForArgument(Chain, DAG, MFI&: MF.getFrameInfo(), ClobberedFI: FI);
9268 } else {
9269 SDValue PtrOff = DAG.getIntPtrConstant(Val: Offset, DL);
9270
9271 DstAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: StackPtr, N2: PtrOff);
9272 DstInfo = MachinePointerInfo::getStack(MF, Offset: LocMemOffset);
9273 }
9274
9275 if (Outs[i].Flags.isByVal()) {
9276 SDValue SizeNode =
9277 DAG.getConstant(Val: Outs[i].Flags.getByValSize(), DL, VT: MVT::i64);
9278 SDValue Cpy = DAG.getMemcpy(
9279 Chain, dl: DL, Dst: DstAddr, Src: Arg, Size: SizeNode,
9280 Alignment: Outs[i].Flags.getNonZeroByValAlign(),
9281 /*isVol = */ false, /*AlwaysInline = */ false,
9282 /*CI=*/nullptr, OverrideTailCall: std::nullopt, DstPtrInfo: DstInfo, SrcPtrInfo: MachinePointerInfo());
9283
9284 MemOpChains.push_back(Elt: Cpy);
9285 } else {
9286 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
9287 // promoted to a legal register type i32, we should truncate Arg back to
9288 // i1/i8/i16.
9289 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
9290 VA.getValVT() == MVT::i16)
9291 Arg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Arg);
9292
9293 SDValue Store = DAG.getStore(Chain, dl: DL, Val: Arg, Ptr: DstAddr, PtrInfo: DstInfo);
9294 MemOpChains.push_back(Elt: Store);
9295 }
9296 }
9297 }
9298
9299 if (IsVarArg && Subtarget->isWindowsArm64EC() &&
9300 !(CLI.CB && CLI.CB->isMustTailCall())) {
9301 SDValue ParamPtr = StackPtr;
9302 if (IsTailCall) {
9303 // Create a dummy object at the top of the stack that can be used to get
9304 // the SP after the epilogue
9305 int FI = MF.getFrameInfo().CreateFixedObject(Size: 1, SPOffset: FPDiff, IsImmutable: true);
9306 ParamPtr = DAG.getFrameIndex(FI, VT: PtrVT);
9307 }
9308
9309 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
9310 // describing the argument list. x4 contains the address of the
9311 // first stack parameter. x5 contains the size in bytes of all parameters
9312 // passed on the stack.
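    // Illustrative: even when every argument fits in registers, x4 still
    // points at (what would be) the first stack slot and x5 carries 0.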
9313 RegsToPass.emplace_back(Args: AArch64::X4, Args&: ParamPtr);
9314 RegsToPass.emplace_back(Args: AArch64::X5,
9315 Args: DAG.getConstant(Val: NumBytes, DL, VT: MVT::i64));
9316 }
9317
9318 if (!MemOpChains.empty())
9319 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOpChains);
9320
9321 SDValue InGlue;
9322 if (RequiresSMChange) {
9323 if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) {
9324 Chain = DAG.getNode(Opcode: AArch64ISD::VG_SAVE, DL,
9325 VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), N: Chain);
9326 InGlue = Chain.getValue(R: 1);
9327 }
9328
9329 SDValue NewChain = changeStreamingMode(
9330 DAG, DL, Enable: CallAttrs.callee().hasStreamingInterface(), Chain, InGlue,
9331 Condition: getSMToggleCondition(CallAttrs), PStateSM);
9332 Chain = NewChain.getValue(R: 0);
9333 InGlue = NewChain.getValue(R: 1);
9334 }
9335
9336 // Build a sequence of copy-to-reg nodes chained together with token chain
9337 // and flag operands which copy the outgoing args into the appropriate regs.
9338 for (auto &RegToPass : RegsToPass) {
9339 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RegToPass.first,
9340 N: RegToPass.second, Glue: InGlue);
9341 InGlue = Chain.getValue(R: 1);
9342 }
9343
9344 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
9345 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
9346 // node so that legalize doesn't hack it.
9347 const GlobalValue *CalledGlobal = nullptr;
9348 unsigned OpFlags = 0;
9349 if (auto *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
9350 CalledGlobal = G->getGlobal();
9351 OpFlags = Subtarget->classifyGlobalFunctionReference(GV: CalledGlobal,
9352 TM: getTargetMachine());
9353 if (OpFlags & AArch64II::MO_GOT) {
9354 Callee = DAG.getTargetGlobalAddress(GV: CalledGlobal, DL, VT: PtrVT, offset: 0, TargetFlags: OpFlags);
9355 Callee = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: Callee);
9356 } else {
9357 const GlobalValue *GV = G->getGlobal();
9358 Callee = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: OpFlags);
9359 }
9360 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Val&: Callee)) {
9361 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
9362 Subtarget->isTargetMachO()) ||
9363 MF.getFunction().getParent()->getRtLibUseGOT();
9364 const char *Sym = S->getSymbol();
9365 if (UseGot) {
9366 Callee = DAG.getTargetExternalSymbol(Sym, VT: PtrVT, TargetFlags: AArch64II::MO_GOT);
9367 Callee = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: Callee);
9368 } else {
9369 Callee = DAG.getTargetExternalSymbol(Sym, VT: PtrVT, TargetFlags: 0);
9370 }
9371 }
9372
9373 // We don't usually want to end the call-sequence here because we would tidy
9374  // the frame up *after* the call; however, in the ABI-changing tail-call case
9375 // we've carefully laid out the parameters so that when sp is reset they'll be
9376 // in the correct location.
9377 if (IsTailCall && !IsSibCall) {
9378 Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: InGlue, DL);
9379 InGlue = Chain.getValue(R: 1);
9380 }
9381
9382 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
9383
9384 std::vector<SDValue> Ops;
9385 Ops.push_back(x: Chain);
9386 Ops.push_back(x: Callee);
9387
9388 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
9389 // be expanded to the call, directly followed by a special marker sequence and
9390 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
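  // The expansion is roughly (illustrative):
  //   bl   <callee>
  //   mov  x29, x29        ; marker recognised by the ObjC runtime
  //   bl   <retainRV/claimRV runtime function>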
9391 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CB: CLI.CB)) {
9392 assert(!IsTailCall &&
9393 "tail calls cannot be marked with clang.arc.attachedcall");
9394 Opc = AArch64ISD::CALL_RVMARKER;
9395
9396 // Add a target global address for the retainRV/claimRV runtime function
9397 // just before the call target.
9398 Function *ARCFn = *objcarc::getAttachedARCFunction(CB: CLI.CB);
9399 auto GA = DAG.getTargetGlobalAddress(GV: ARCFn, DL, VT: PtrVT);
9400 Ops.insert(position: Ops.begin() + 1, x: GA);
9401
9402 // We may or may not need to emit both the marker and the retain/claim call.
9403 // Tell the pseudo expansion using an additional boolean op.
9404 bool ShouldEmitMarker = objcarc::attachedCallOpBundleNeedsMarker(CB: CLI.CB);
9405 SDValue DoEmitMarker =
9406 DAG.getTargetConstant(Val: ShouldEmitMarker, DL, VT: MVT::i32);
9407 Ops.insert(position: Ops.begin() + 2, x: DoEmitMarker);
9408 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9409 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
9410 } else if (GuardWithBTI) {
9411 Opc = AArch64ISD::CALL_BTI;
9412 }
9413
9414 if (IsTailCall) {
9415 // Each tail call may have to adjust the stack by a different amount, so
9416 // this information must travel along with the operation for eventual
9417 // consumption by emitEpilogue.
9418 Ops.push_back(x: DAG.getSignedTargetConstant(Val: FPDiff, DL, VT: MVT::i32));
9419 }
9420
9421 if (CLI.PAI) {
9422 const uint64_t Key = CLI.PAI->Key;
9423 assert((Key == AArch64PACKey::IA || Key == AArch64PACKey::IB) &&
9424 "Invalid auth call key");
9425
9426 // Split the discriminator into address/integer components.
9427 SDValue AddrDisc, IntDisc;
9428 std::tie(args&: IntDisc, args&: AddrDisc) =
9429 extractPtrauthBlendDiscriminators(Disc: CLI.PAI->Discriminator, DAG: &DAG);
9430
9431 if (Opc == AArch64ISD::CALL_RVMARKER)
9432 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
9433 else
9434 Opc = IsTailCall ? AArch64ISD::AUTH_TC_RETURN : AArch64ISD::AUTH_CALL;
9435 Ops.push_back(x: DAG.getTargetConstant(Val: Key, DL, VT: MVT::i32));
9436 Ops.push_back(x: IntDisc);
9437 Ops.push_back(x: AddrDisc);
9438 }
9439
9440 // Add argument registers to the end of the list so that they are known live
9441 // into the call.
9442 for (auto &RegToPass : RegsToPass)
9443 Ops.push_back(x: DAG.getRegister(Reg: RegToPass.first,
9444 VT: RegToPass.second.getValueType()));
9445
9446 // Add a register mask operand representing the call-preserved registers.
9447 const uint32_t *Mask;
9448 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9449 if (IsThisReturn) {
9450 // For 'this' returns, use the X0-preserving mask if applicable
9451 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
9452 if (!Mask) {
9453 IsThisReturn = false;
9454 Mask = TRI->getCallPreservedMask(MF, CallConv);
9455 }
9456 } else
9457 Mask = TRI->getCallPreservedMask(MF, CallConv);
9458
9459 if (Subtarget->hasCustomCallingConv())
9460 TRI->UpdateCustomCallPreservedMask(MF, Mask: &Mask);
9461
9462 if (TRI->isAnyArgRegReserved(MF))
9463 TRI->emitReservedArgRegCallError(MF);
9464
9465 assert(Mask && "Missing call preserved mask for calling convention");
9466 Ops.push_back(x: DAG.getRegisterMask(RegMask: Mask));
9467
9468 if (InGlue.getNode())
9469 Ops.push_back(x: InGlue);
9470
9471  // If we're doing a tail call, use a TC_RETURN here rather than an
9472 // actual call instruction.
9473 if (IsTailCall) {
9474 MF.getFrameInfo().setHasTailCall();
9475 SDValue Ret = DAG.getNode(Opcode: Opc, DL, VT: MVT::Other, Ops);
9476 if (IsCFICall)
9477 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9478
9479 DAG.addNoMergeSiteInfo(Node: Ret.getNode(), NoMerge: CLI.NoMerge);
9480 DAG.addCallSiteInfo(Node: Ret.getNode(), CallInfo: std::move(CSInfo));
9481 if (CalledGlobal &&
9482 MF.getFunction().getParent()->getModuleFlag(Key: "import-call-optimization"))
9483 DAG.addCalledGlobal(Node: Ret.getNode(), GV: CalledGlobal, OpFlags);
9484 return Ret;
9485 }
9486
9487 // Returns a chain and a flag for retval copy to use.
9488 Chain = DAG.getNode(Opcode: Opc, DL, ResultTys: {MVT::Other, MVT::Glue}, Ops);
9489 if (IsCFICall)
9490 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9491
9492 DAG.addNoMergeSiteInfo(Node: Chain.getNode(), NoMerge: CLI.NoMerge);
9493 InGlue = Chain.getValue(R: 1);
9494 DAG.addCallSiteInfo(Node: Chain.getNode(), CallInfo: std::move(CSInfo));
9495 if (CalledGlobal &&
9496 MF.getFunction().getParent()->getModuleFlag(Key: "import-call-optimization"))
9497 DAG.addCalledGlobal(Node: Chain.getNode(), GV: CalledGlobal, OpFlags);
9498
9499 uint64_t CalleePopBytes =
9500 DoesCalleeRestoreStack(CallCC: CallConv, TailCallOpt) ? alignTo(Value: NumBytes, Align: 16) : 0;
9501
9502 Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: CalleePopBytes, Glue: InGlue, DL);
9503 InGlue = Chain.getValue(R: 1);
9504
9505 // Handle result values, copying them out of physregs into vregs that we
9506 // return.
9507 SDValue Result = LowerCallResult(
9508 Chain, InGlue, CallConv, isVarArg: IsVarArg, RVLocs, DL, DAG, InVals, isThisReturn: IsThisReturn,
9509 ThisVal: IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
9510
9511 if (!Ins.empty())
9512 InGlue = Result.getValue(R: Result->getNumValues() - 1);
9513
9514 if (RequiresSMChange) {
9515 assert(PStateSM && "Expected a PStateSM to be set");
9516 Result = changeStreamingMode(
9517 DAG, DL, Enable: !CallAttrs.callee().hasStreamingInterface(), Chain: Result, InGlue,
9518 Condition: getSMToggleCondition(CallAttrs), PStateSM);
9519
9520 if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) {
9521 InGlue = Result.getValue(R: 1);
9522 Result =
9523 DAG.getNode(Opcode: AArch64ISD::VG_RESTORE, DL,
9524 VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), Ops: {Result, InGlue});
9525 }
9526 }
9527
9528 if (CallAttrs.requiresEnablingZAAfterCall())
9529 // Unconditionally resume ZA.
9530 Result = DAG.getNode(
9531 Opcode: AArch64ISD::SMSTART, DL, VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), N1: Result,
9532 N2: DAG.getTargetConstant(Val: (int32_t)(AArch64SVCR::SVCRZA), DL, VT: MVT::i32));
9533
9534 if (ShouldPreserveZT0)
9535 Result =
9536 DAG.getNode(Opcode: AArch64ISD::RESTORE_ZT, DL, VTList: DAG.getVTList(VT: MVT::Other),
9537 Ops: {Result, DAG.getConstant(Val: 0, DL, VT: MVT::i32), ZTFrameIdx});
9538
9539 if (RequiresLazySave) {
9540 // Conditionally restore the lazy save using a pseudo node.
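    // Per the SME lazy-save ABI: if the callee (or anything it called) needed
    // ZA, it saved our contents through the TPIDR2 block and zeroed
    // TPIDR2_EL0, in which case RESTORE_ZA calls __arm_tpidr2_restore;
    // otherwise ZA is still live and the restore call is skipped.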
9541 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9542 SDValue RegMask = DAG.getRegisterMask(
9543 RegMask: TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
9544 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
9545 Sym: "__arm_tpidr2_restore", VT: getPointerTy(DL: DAG.getDataLayout()));
9546 SDValue TPIDR2_EL0 = DAG.getNode(
9547 Opcode: ISD::INTRINSIC_W_CHAIN, DL, VT: MVT::i64, N1: Result,
9548 N2: DAG.getConstant(Val: Intrinsic::aarch64_sme_get_tpidr2, DL, VT: MVT::i32));
9549
9550 // Copy the address of the TPIDR2 block into X0 before 'calling' the
9551 // RESTORE_ZA pseudo.
9552 SDValue Glue;
9553 SDValue TPIDR2Block = DAG.getFrameIndex(
9554 FI: TPIDR2.FrameIndex,
9555 VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
9556 Result = DAG.getCopyToReg(Chain: Result, dl: DL, Reg: AArch64::X0, N: TPIDR2Block, Glue);
9557 Result =
9558 DAG.getNode(Opcode: AArch64ISD::RESTORE_ZA, DL, VT: MVT::Other,
9559 Ops: {Result, TPIDR2_EL0, DAG.getRegister(Reg: AArch64::X0, VT: MVT::i64),
9560 RestoreRoutine, RegMask, Result.getValue(R: 1)});
9561
9562 // Finally reset the TPIDR2_EL0 register to 0.
9563 Result = DAG.getNode(
9564 Opcode: ISD::INTRINSIC_VOID, DL, VT: MVT::Other, N1: Result,
9565 N2: DAG.getConstant(Val: Intrinsic::aarch64_sme_set_tpidr2, DL, VT: MVT::i32),
9566 N3: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
9567 TPIDR2.Uses++;
9568 } else if (RequiresSaveAllZA) {
9569 Result = emitSMEStateSaveRestore(TLI: *this, DAG, Info: FuncInfo, DL, Chain: Result,
9570 /*IsSave=*/false);
9571 }
9572
9573 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0 ||
9574 RequiresSaveAllZA) {
9575 for (unsigned I = 0; I < InVals.size(); ++I) {
9576 // The smstart/smstop is chained as part of the call, but when the
9577 // resulting chain is discarded (which happens when the call is not part
9578 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
9579 // smstart/smstop is chained to the result value. We can do that by doing
9580 // a vreg -> vreg copy.
9581 Register Reg = MF.getRegInfo().createVirtualRegister(
9582 RegClass: getRegClassFor(VT: InVals[I].getValueType().getSimpleVT()));
9583 SDValue X = DAG.getCopyToReg(Chain: Result, dl: DL, Reg, N: InVals[I]);
9584 InVals[I] = DAG.getCopyFromReg(Chain: X, dl: DL, Reg,
9585 VT: InVals[I].getValueType());
9586 }
9587 }
9588
9589 if (CallConv == CallingConv::PreserveNone) {
9590 for (const ISD::OutputArg &O : Outs) {
9591 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
9592 O.Flags.isSwiftAsync()) {
9593 MachineFunction &MF = DAG.getMachineFunction();
9594 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9595 MF.getFunction(),
9596 "Swift attributes can't be used with preserve_none",
9597 DL.getDebugLoc()));
9598 break;
9599 }
9600 }
9601 }
9602
9603 return Result;
9604}
9605
9606bool AArch64TargetLowering::CanLowerReturn(
9607 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
9608 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
9609 const Type *RetTy) const {
9610 CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv);
9611 SmallVector<CCValAssign, 16> RVLocs;
9612 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
9613 return CCInfo.CheckReturn(Outs, Fn: RetCC);
9614}
9615
9616SDValue
9617AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
9618 bool isVarArg,
9619 const SmallVectorImpl<ISD::OutputArg> &Outs,
9620 const SmallVectorImpl<SDValue> &OutVals,
9621 const SDLoc &DL, SelectionDAG &DAG) const {
9622 auto &MF = DAG.getMachineFunction();
9623 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9624
9625 CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv);
9626 SmallVector<CCValAssign, 16> RVLocs;
9627 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
9628 CCInfo.AnalyzeReturn(Outs, Fn: RetCC);
9629
9630 // Copy the result values into the output registers.
9631 SDValue Glue;
9632 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
9633 SmallSet<unsigned, 4> RegsUsed;
9634 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
9635 ++i, ++realRVLocIdx) {
9636 CCValAssign &VA = RVLocs[i];
9637 assert(VA.isRegLoc() && "Can only return in registers!");
9638 SDValue Arg = OutVals[realRVLocIdx];
9639
9640 switch (VA.getLocInfo()) {
9641 default:
9642 llvm_unreachable("Unknown loc info!");
9643 case CCValAssign::Full:
9644 if (Outs[i].ArgVT == MVT::i1) {
9645 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
9646 // value. This is strictly redundant on Darwin (which uses "zeroext
9647 // i1"), but will be optimised out before ISel.
9648 Arg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Arg);
9649 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
9650 }
9651 break;
9652 case CCValAssign::BCvt:
9653 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
9654 break;
9655 case CCValAssign::AExt:
9656 case CCValAssign::ZExt:
9657 Arg = DAG.getZExtOrTrunc(Op: Arg, DL, VT: VA.getLocVT());
9658 break;
9659 case CCValAssign::AExtUpper:
9660 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9661 Arg = DAG.getZExtOrTrunc(Op: Arg, DL, VT: VA.getLocVT());
9662 Arg = DAG.getNode(Opcode: ISD::SHL, DL, VT: VA.getLocVT(), N1: Arg,
9663 N2: DAG.getConstant(Val: 32, DL, VT: VA.getLocVT()));
9664 break;
9665 }
9666
9667 if (RegsUsed.count(V: VA.getLocReg())) {
9668 SDValue &Bits =
9669 llvm::find_if(Range&: RetVals, P: [=](const std::pair<unsigned, SDValue> &Elt) {
9670 return Elt.first == VA.getLocReg();
9671 })->second;
9672 Bits = DAG.getNode(Opcode: ISD::OR, DL, VT: Bits.getValueType(), N1: Bits, N2: Arg);
9673 } else {
9674 RetVals.emplace_back(Args: VA.getLocReg(), Args&: Arg);
9675 RegsUsed.insert(V: VA.getLocReg());
9676 }
9677 }
9678
9679 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9680
9681 // Emit SMSTOP before returning from a locally streaming function
9682 SMEAttrs FuncAttrs = FuncInfo->getSMEFnAttrs();
9683 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
9684 if (FuncAttrs.hasStreamingCompatibleInterface()) {
9685 Register Reg = FuncInfo->getPStateSMReg();
9686 assert(Reg.isValid() && "PStateSM Register is invalid");
9687 SDValue PStateSM = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT: MVT::i64);
9688 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
9689 /*Glue*/ InGlue: SDValue(),
9690 Condition: AArch64SME::IfCallerIsNonStreaming, PStateSM);
9691 } else
9692 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
9693 /*Glue*/ InGlue: SDValue(), Condition: AArch64SME::Always);
9694 Glue = Chain.getValue(R: 1);
9695 }
9696
9697 SmallVector<SDValue, 4> RetOps(1, Chain);
9698 for (auto &RetVal : RetVals) {
9699 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
9700 isPassedInFPR(VT: RetVal.second.getValueType()))
9701 RetVal.second =
9702 DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL,
9703 VTList: DAG.getVTList(VT1: RetVal.second.getValueType(), VT2: MVT::Glue),
9704 N: RetVal.second);
9705 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RetVal.first, N: RetVal.second, Glue);
9706 Glue = Chain.getValue(R: 1);
9707 RetOps.push_back(
9708 Elt: DAG.getRegister(Reg: RetVal.first, VT: RetVal.second.getValueType()));
9709 }
9710
9711 // Windows AArch64 ABIs require that for returning structs by value we copy
9712 // the sret argument into X0 for the return.
9713 // We saved the argument into a virtual register in the entry block,
9714 // so now we copy the value out and into X0.
9715 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
9716 SDValue Val = DAG.getCopyFromReg(Chain: RetOps[0], dl: DL, Reg: SRetReg,
9717 VT: getPointerTy(DL: MF.getDataLayout()));
9718
9719 unsigned RetValReg = AArch64::X0;
9720 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
9721 RetValReg = AArch64::X8;
9722 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RetValReg, N: Val, Glue);
9723 Glue = Chain.getValue(R: 1);
9724
9725 RetOps.push_back(
9726 Elt: DAG.getRegister(Reg: RetValReg, VT: getPointerTy(DL: DAG.getDataLayout())));
9727 }
9728
9729 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(MF: &MF);
9730 if (I) {
9731 for (; *I; ++I) {
9732 if (AArch64::GPR64RegClass.contains(Reg: *I))
9733 RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i64));
9734 else if (AArch64::FPR64RegClass.contains(Reg: *I))
9735 RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::getFloatingPointVT(BitWidth: 64)));
9736 else
9737 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
9738 }
9739 }
9740
9741 RetOps[0] = Chain; // Update chain.
9742
9743 // Add the glue if we have it.
9744 if (Glue.getNode())
9745 RetOps.push_back(Elt: Glue);
9746
9747 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9748 // ARM64EC entry thunks use a special return sequence: instead of a regular
9749 // "ret" instruction, they need to explicitly call the emulator.
9750 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
9751 SDValue Arm64ECRetDest =
9752 DAG.getExternalSymbol(Sym: "__os_arm64x_dispatch_ret", VT: PtrVT);
9753 Arm64ECRetDest =
9754 getAddr(N: cast<ExternalSymbolSDNode>(Val&: Arm64ECRetDest), DAG, Flags: 0);
9755 Arm64ECRetDest = DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: Arm64ECRetDest,
9756 PtrInfo: MachinePointerInfo());
9757 RetOps.insert(I: RetOps.begin() + 1, Elt: Arm64ECRetDest);
9758 RetOps.insert(I: RetOps.begin() + 2, Elt: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
9759 return DAG.getNode(Opcode: AArch64ISD::TC_RETURN, DL, VT: MVT::Other, Ops: RetOps);
9760 }
9761
9762 return DAG.getNode(Opcode: AArch64ISD::RET_GLUE, DL, VT: MVT::Other, Ops: RetOps);
9763}
9764
9765//===----------------------------------------------------------------------===//
9766// Other Lowering Code
9767//===----------------------------------------------------------------------===//
9768
9769SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
9770 SelectionDAG &DAG,
9771 unsigned Flag) const {
9772 return DAG.getTargetGlobalAddress(GV: N->getGlobal(), DL: SDLoc(N), VT: Ty,
9773 offset: N->getOffset(), TargetFlags: Flag);
9774}
9775
9776SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
9777 SelectionDAG &DAG,
9778 unsigned Flag) const {
9779 return DAG.getTargetJumpTable(JTI: N->getIndex(), VT: Ty, TargetFlags: Flag);
9780}
9781
9782SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
9783 SelectionDAG &DAG,
9784 unsigned Flag) const {
9785 return DAG.getTargetConstantPool(C: N->getConstVal(), VT: Ty, Align: N->getAlign(),
9786 Offset: N->getOffset(), TargetFlags: Flag);
9787}
9788
9789SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
9790 SelectionDAG &DAG,
9791 unsigned Flag) const {
9792 return DAG.getTargetBlockAddress(BA: N->getBlockAddress(), VT: Ty, Offset: 0, TargetFlags: Flag);
9793}
9794
9795SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
9796 SelectionDAG &DAG,
9797 unsigned Flag) const {
9798 return DAG.getTargetExternalSymbol(Sym: N->getSymbol(), VT: Ty, TargetFlags: Flag);
9799}
9800
9801// (loadGOT sym)
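// e.g. on ELF (illustrative):
//   adrp x0, :got:sym
//   ldr  x0, [x0, :got_lo12:sym]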
9802template <class NodeTy>
9803SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
9804 unsigned Flags) const {
9805 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
9806 SDLoc DL(N);
9807 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
9808 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
9809 // FIXME: Once remat is capable of dealing with instructions with register
9810 // operands, expand this into two nodes instead of using a wrapper node.
9811 if (DAG.getMachineFunction()
9812 .getInfo<AArch64FunctionInfo>()
9813 ->hasELFSignedGOT())
9814 return SDValue(DAG.getMachineNode(Opcode: AArch64::LOADgotAUTH, dl: DL, VT: Ty, Op1: GotAddr),
9815 0);
9816 return DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: Ty, Operand: GotAddr);
9817}
9818
9819// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
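// Materializes an absolute 64-bit address, roughly (illustrative):
//   movz x0, #:abs_g3:sym
//   movk x0, #:abs_g2_nc:sym
//   movk x0, #:abs_g1_nc:sym
//   movk x0, #:abs_g0_nc:sym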
9820template <class NodeTy>
9821SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
9822 unsigned Flags) const {
9823 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
9824 SDLoc DL(N);
9825 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
9826 const unsigned char MO_NC = AArch64II::MO_NC;
9827 return DAG.getNode(
9828 AArch64ISD::WrapperLarge, DL, Ty,
9829 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
9830 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
9831 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
9832 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
9833}
9834
9835// (addlow (adrp %hi(sym)) %lo(sym))
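// Small code model materialization, e.g. (illustrative):
//   adrp x0, sym
//   add  x0, x0, :lo12:sym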
9836template <class NodeTy>
9837SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
9838 unsigned Flags) const {
9839 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
9840 SDLoc DL(N);
9841 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
9842 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
9843 SDValue Lo = getTargetNode(N, Ty, DAG,
9844 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
9845 SDValue ADRP = DAG.getNode(Opcode: AArch64ISD::ADRP, DL, VT: Ty, Operand: Hi);
9846 return DAG.getNode(Opcode: AArch64ISD::ADDlow, DL, VT: Ty, N1: ADRP, N2: Lo);
9847}
9848
9849// (adr sym)
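// Tiny code model materialization: a single "adr x0, sym"; the symbol must be
// within +/-1MiB of the code for this to be valid (illustrative).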
9850template <class NodeTy>
9851SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
9852 unsigned Flags) const {
9853 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
9854 SDLoc DL(N);
9855 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
9856 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
9857 return DAG.getNode(Opcode: AArch64ISD::ADR, DL, VT: Ty, Operand: Sym);
9858}
9859
9860SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
9861 SelectionDAG &DAG) const {
9862 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Val&: Op);
9863 const GlobalValue *GV = GN->getGlobal();
9864 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, TM: getTargetMachine());
9865
9866 if (OpFlags != AArch64II::MO_NO_FLAG)
9867 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
9868 "unexpected offset in global node");
9869
9870 // This also catches the large code model case for Darwin, and tiny code
9871 // model with got relocations.
9872 if ((OpFlags & AArch64II::MO_GOT) != 0) {
9873 return getGOT(N: GN, DAG, Flags: OpFlags);
9874 }
9875
9876 SDValue Result;
9877 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
9878 !getTargetMachine().isPositionIndependent()) {
9879 Result = getAddrLarge(N: GN, DAG, Flags: OpFlags);
9880 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
9881 Result = getAddrTiny(N: GN, DAG, Flags: OpFlags);
9882 } else {
9883 Result = getAddr(N: GN, DAG, Flags: OpFlags);
9884 }
9885 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
9886 SDLoc DL(GN);
9887 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
9888 Result = DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: Result,
9889 PtrInfo: MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction()));
9890 return Result;
9891}
9892
9893/// Convert a TLS address reference into the correct sequence of loads
9894/// and calls to compute the variable's address (for Darwin, currently) and
9895/// return an SDValue containing the final node.
9896///
9897/// Darwin only has one TLS scheme which must be capable of dealing with the
9898/// fully general situation, in the worst case. This means:
9899/// + "extern __thread" declaration.
9900/// + Defined in a possibly unknown dynamic library.
9901///
9902/// The general system is that each __thread variable has a [3 x i64] descriptor
9903/// which contains information used by the runtime to calculate the address. The
9904/// only part of this the compiler needs to know about is the first xword, which
9905/// contains a function pointer that must be called with the address of the
9906/// entire descriptor in "x0".
9907///
9908/// Since this descriptor may be in a different unit, in general even the
9909/// descriptor must be accessed via an indirect load. The "ideal" code sequence
9910/// is:
9911/// adrp x0, _var@TLVPPAGE
9912/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
9913/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
9914/// ; the function pointer
9915/// blr x1 ; Uses descriptor address in x0
9916/// ; Address of _var is now in x0.
9917///
9918/// If the address of _var's descriptor *is* known to the linker, then it can
9919/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
9920/// a slight efficiency gain.
9921SDValue
9922AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
9923 SelectionDAG &DAG) const {
9924 assert(Subtarget->isTargetDarwin() &&
9925 "This function expects a Darwin target");
9926
9927 SDLoc DL(Op);
9928 MVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
9929 MVT PtrMemVT = getPointerMemTy(DL: DAG.getDataLayout());
9930 const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: Op)->getGlobal();
9931
9932 SDValue TLVPAddr =
9933 DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS);
9934 SDValue DescAddr = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: TLVPAddr);
9935
9936 // The first entry in the descriptor is a function pointer that we must call
9937 // to obtain the address of the variable.
9938 SDValue Chain = DAG.getEntryNode();
9939 SDValue FuncTLVGet = DAG.getLoad(
9940 VT: PtrMemVT, dl: DL, Chain, Ptr: DescAddr,
9941 PtrInfo: MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction()),
9942 Alignment: Align(PtrMemVT.getSizeInBits() / 8),
9943 MMOFlags: MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
9944 Chain = FuncTLVGet.getValue(R: 1);
9945
9946 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
9947 FuncTLVGet = DAG.getZExtOrTrunc(Op: FuncTLVGet, DL, VT: PtrVT);
9948
9949 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9950 MFI.setAdjustsStack(true);
9951
9952 // TLS calls preserve all registers except those that absolutely must be
9953 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
9954 // silly).
9955 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9956 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
9957 if (Subtarget->hasCustomCallingConv())
9958 TRI->UpdateCustomCallPreservedMask(MF&: DAG.getMachineFunction(), Mask: &Mask);
9959
9960 // Finally, we can make the call. This is just a degenerate version of a
9961 // normal AArch64 call node: x0 takes the address of the descriptor, and
9962 // returns the address of the variable in this thread.
9963 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: AArch64::X0, N: DescAddr, Glue: SDValue());
9964
9965 unsigned Opcode = AArch64ISD::CALL;
9966 SmallVector<SDValue, 8> Ops;
9967 Ops.push_back(Elt: Chain);
9968 Ops.push_back(Elt: FuncTLVGet);
9969
9970 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
9971 if (DAG.getMachineFunction().getFunction().hasFnAttribute(Kind: "ptrauth-calls")) {
9972 Opcode = AArch64ISD::AUTH_CALL;
9973 Ops.push_back(Elt: DAG.getTargetConstant(Val: AArch64PACKey::IA, DL, VT: MVT::i32));
9974 Ops.push_back(Elt: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i64)); // Integer Disc.
9975 Ops.push_back(Elt: DAG.getRegister(Reg: AArch64::NoRegister, VT: MVT::i64)); // Addr Disc.
9976 }
9977
9978 Ops.push_back(Elt: DAG.getRegister(Reg: AArch64::X0, VT: MVT::i64));
9979 Ops.push_back(Elt: DAG.getRegisterMask(RegMask: Mask));
9980 Ops.push_back(Elt: Chain.getValue(R: 1));
9981 Chain = DAG.getNode(Opcode, DL, VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), Ops);
9982 return DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::X0, VT: PtrVT, Glue: Chain.getValue(R: 1));
9983}
9984
9985/// Convert a thread-local variable reference into a sequence of instructions to
9986/// compute the variable's address for the local exec TLS model of ELF targets.
9987/// The sequence depends on the maximum TLS area size.
9988SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
9989 SDValue ThreadBase,
9990 const SDLoc &DL,
9991 SelectionDAG &DAG) const {
9992 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
9993 SDValue TPOff, Addr;
9994
9995 switch (DAG.getTarget().Options.TLSSize) {
9996 default:
9997 llvm_unreachable("Unexpected TLS size");
9998
9999 case 12: {
10000 // mrs x0, TPIDR_EL0
10001 // add x0, x0, :tprel_lo12:a
10002 SDValue Var = DAG.getTargetGlobalAddress(
10003 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
10004 return SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: ThreadBase,
10005 Op2: Var,
10006 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
10007 0);
10008 }
10009
10010 case 24: {
10011 // mrs x0, TPIDR_EL0
10012 // add x0, x0, :tprel_hi12:a
10013 // add x0, x0, :tprel_lo12_nc:a
10014 SDValue HiVar = DAG.getTargetGlobalAddress(
10015 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_HI12);
10016 SDValue LoVar = DAG.getTargetGlobalAddress(
10017 GV, DL, VT: PtrVT, offset: 0,
10018 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10019 Addr = SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: ThreadBase,
10020 Op2: HiVar,
10021 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
10022 0);
10023 return SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: Addr,
10024 Op2: LoVar,
10025 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
10026 0);
10027 }
10028
10029 case 32: {
10030 // mrs x1, TPIDR_EL0
10031 // movz x0, #:tprel_g1:a
10032 // movk x0, #:tprel_g0_nc:a
10033 // add x0, x1, x0
10034 SDValue HiVar = DAG.getTargetGlobalAddress(
10035 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G1);
10036 SDValue LoVar = DAG.getTargetGlobalAddress(
10037 GV, DL, VT: PtrVT, offset: 0,
10038 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10039 TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::MOVZXi, dl: DL, VT: PtrVT, Op1: HiVar,
10040 Op2: DAG.getTargetConstant(Val: 16, DL, VT: MVT::i32)),
10041 0);
10042 TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::MOVKXi, dl: DL, VT: PtrVT, Op1: TPOff, Op2: LoVar,
10043 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
10044 0);
10045 return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: ThreadBase, N2: TPOff);
10046 }
10047
10048 case 48: {
10049 // mrs x1, TPIDR_EL0
10050 // movz x0, #:tprel_g2:a
10051 // movk x0, #:tprel_g1_nc:a
10052 // movk x0, #:tprel_g0_nc:a
10053 // add x0, x1, x0
10054 SDValue HiVar = DAG.getTargetGlobalAddress(
10055 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G2);
10056 SDValue MiVar = DAG.getTargetGlobalAddress(
10057 GV, DL, VT: PtrVT, offset: 0,
10058 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
10059 SDValue LoVar = DAG.getTargetGlobalAddress(
10060 GV, DL, VT: PtrVT, offset: 0,
10061 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10062 TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::MOVZXi, dl: DL, VT: PtrVT, Op1: HiVar,
10063 Op2: DAG.getTargetConstant(Val: 32, DL, VT: MVT::i32)),
10064 0);
10065 TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::MOVKXi, dl: DL, VT: PtrVT, Op1: TPOff, Op2: MiVar,
10066 Op3: DAG.getTargetConstant(Val: 16, DL, VT: MVT::i32)),
10067 0);
10068 TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::MOVKXi, dl: DL, VT: PtrVT, Op1: TPOff, Op2: LoVar,
10069 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
10070 0);
10071 return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: ThreadBase, N2: TPOff);
10072 }
10073 }
10074}
10075
10076/// When accessing thread-local variables under either the general-dynamic or
10077/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
10078/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
10079/// is a function pointer to carry out the resolution.
10080///
10081/// The sequence is:
10082/// adrp x0, :tlsdesc:var
10083/// ldr x1, [x0, #:tlsdesc_lo12:var]
10084/// add x0, x0, #:tlsdesc_lo12:var
10085/// .tlsdesccall var
10086/// blr x1
10087/// (TPIDR_EL0 offset now in x0)
10088///
10089/// The above sequence must be produced unscheduled, to enable the linker to
10090/// optimize/relax this sequence.
10091/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
10092/// above sequence; it is expanded very late in the compilation flow, to ensure
10093/// the sequence is emitted exactly as shown above.
10094SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
10095 const SDLoc &DL,
10096 SelectionDAG &DAG) const {
10097 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
10098
10099 SDValue Chain = DAG.getEntryNode();
10100 SDVTList NodeTys = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
10101
10102 unsigned Opcode =
10103 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
10104 ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
10105 : AArch64ISD::TLSDESC_CALLSEQ;
10106 Chain = DAG.getNode(Opcode, DL, VTList: NodeTys, Ops: {Chain, SymAddr});
10107 SDValue Glue = Chain.getValue(R: 1);
10108
10109 return DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::X0, VT: PtrVT, Glue);
10110}
10111
10112SDValue
10113AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
10114 SelectionDAG &DAG) const {
10115 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
10116
10117 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
10118 AArch64FunctionInfo *MFI =
10119 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10120
10121 TLSModel::Model Model = MFI->hasELFSignedGOT()
10122 ? TLSModel::GeneralDynamic
10123 : getTargetMachine().getTLSModel(GV: GA->getGlobal());
10124
10125 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
10126 if (Model == TLSModel::LocalDynamic)
10127 Model = TLSModel::GeneralDynamic;
10128 }
10129
10130 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
10131 Model != TLSModel::LocalExec)
10132 report_fatal_error(reason: "ELF TLS only supported in small memory model or "
10133 "in local exec TLS model");
10134 // Different choices can be made for the maximum size of the TLS area for a
10135 // module. For the small address model, the default TLS size is 16MiB and the
10136 // maximum TLS size is 4GiB.
10137 // FIXME: add tiny and large code model support for TLS access models other
10138 // than local exec. We currently generate the same code as small for tiny,
10139 // which may be larger than needed.
10140
10141 SDValue TPOff;
10142 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
10143 SDLoc DL(Op);
10144 const GlobalValue *GV = GA->getGlobal();
10145
10146 SDValue ThreadBase = DAG.getNode(Opcode: AArch64ISD::THREAD_POINTER, DL, VT: PtrVT);
10147
10148 if (Model == TLSModel::LocalExec) {
10149 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
10150 } else if (Model == TLSModel::InitialExec) {
10151 TPOff = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS);
10152 TPOff = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: TPOff);
10153 } else if (Model == TLSModel::LocalDynamic) {
10154 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
10155 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
10156 // the beginning of the module's TLS region, followed by a DTPREL offset
10157 // calculation.
10158
10159 // These accesses will need deduplicating if there's more than one.
10160 MFI->incNumLocalDynamicTLSAccesses();
10161
10162 // The call needs a relocation too for linker relaxation. It doesn't make
10163 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10164 // the address.
10165 SDValue SymAddr = DAG.getTargetExternalSymbol(Sym: "_TLS_MODULE_BASE_", VT: PtrVT,
10166 TargetFlags: AArch64II::MO_TLS);
10167
10168 // Now we can calculate the offset from TPIDR_EL0 to this module's
10169 // thread-local area.
10170 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10171
10172 // Now use :dtprel_whatever: operations to calculate this variable's offset
10173 // in its thread-storage area.
10174 SDValue HiVar = DAG.getTargetGlobalAddress(
10175 GV, DL, VT: MVT::i64, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_HI12);
10176 SDValue LoVar = DAG.getTargetGlobalAddress(
10177 GV, DL, VT: MVT::i64, offset: 0,
10178 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10179
10180 TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: TPOff, Op2: HiVar,
10181 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
10182 0);
10183 TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: TPOff, Op2: LoVar,
10184 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
10185 0);
10186 } else if (Model == TLSModel::GeneralDynamic) {
10187 // The call needs a relocation too for linker relaxation. It doesn't make
10188 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10189 // the address.
10190 SDValue SymAddr =
10191 DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS);
10192
10193 // Finally we can make a call to calculate the offset from tpidr_el0.
10194 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10195 } else
10196 llvm_unreachable("Unsupported ELF TLS access model");
10197
10198 return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: ThreadBase, N2: TPOff);
10199}
10200
10201SDValue
10202AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
10203 SelectionDAG &DAG) const {
10204 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
10205
10206 SDValue Chain = DAG.getEntryNode();
10207 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
10208 SDLoc DL(Op);
10209
10210 SDValue TEB = DAG.getRegister(Reg: AArch64::X18, VT: MVT::i64);
10211
10212 // Load the ThreadLocalStoragePointer from the TEB
10213 // A pointer to the TLS array is located at offset 0x58 from the TEB.
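  // The overall shape of the access is roughly (illustrative, relocation
  // spellings omitted):
  //   TLSArray = *(TEB + 0x58)
  //   TLS      = TLSArray[_tls_index]
  //   Addr     = TLS + <offset of the variable from the .tls section base>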
10214 SDValue TLSArray =
10215 DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: TEB, N2: DAG.getIntPtrConstant(Val: 0x58, DL));
10216 TLSArray = DAG.getLoad(VT: PtrVT, dl: DL, Chain, Ptr: TLSArray, PtrInfo: MachinePointerInfo());
10217 Chain = TLSArray.getValue(R: 1);
10218
10219 // Load the TLS index from the C runtime;
10220 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
10221 // This also does the same as LOADgot, but using a generic i32 load,
10222 // while LOADgot only loads i64.
10223 SDValue TLSIndexHi =
10224 DAG.getTargetExternalSymbol(Sym: "_tls_index", VT: PtrVT, TargetFlags: AArch64II::MO_PAGE);
10225 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
10226 Sym: "_tls_index", VT: PtrVT, TargetFlags: AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10227 SDValue ADRP = DAG.getNode(Opcode: AArch64ISD::ADRP, DL, VT: PtrVT, Operand: TLSIndexHi);
10228 SDValue TLSIndex =
10229 DAG.getNode(Opcode: AArch64ISD::ADDlow, DL, VT: PtrVT, N1: ADRP, N2: TLSIndexLo);
10230 TLSIndex = DAG.getLoad(VT: MVT::i32, dl: DL, Chain, Ptr: TLSIndex, PtrInfo: MachinePointerInfo());
10231 Chain = TLSIndex.getValue(R: 1);
10232
10233  // The pointer to the thread's TLS data area is found by indexing the TLS
10234  // array with the TLS index scaled by 8 (one pointer per loaded module).
10235 TLSIndex = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: PtrVT, Operand: TLSIndex);
10236 SDValue Slot = DAG.getNode(Opcode: ISD::SHL, DL, VT: PtrVT, N1: TLSIndex,
10237 N2: DAG.getConstant(Val: 3, DL, VT: PtrVT));
10238 SDValue TLS = DAG.getLoad(VT: PtrVT, dl: DL, Chain,
10239 Ptr: DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: TLSArray, N2: Slot),
10240 PtrInfo: MachinePointerInfo());
10241 Chain = TLS.getValue(R: 1);
10242
10243 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
10244 const GlobalValue *GV = GA->getGlobal();
10245 SDValue TGAHi = DAG.getTargetGlobalAddress(
10246 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_HI12);
10247 SDValue TGALo = DAG.getTargetGlobalAddress(
10248 GV, DL, VT: PtrVT, offset: 0,
10249 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10250
10251 // Add the offset from the start of the .tls section (section base).
10252 SDValue Addr =
10253 SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: TLS, Op2: TGAHi,
10254 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
10255 0);
10256 Addr = DAG.getNode(Opcode: AArch64ISD::ADDlow, DL, VT: PtrVT, N1: Addr, N2: TGALo);
10257 return Addr;
10258}
10259
10260SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
10261 SelectionDAG &DAG) const {
10262 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
10263 if (DAG.getTarget().useEmulatedTLS())
10264 return LowerToTLSEmulatedModel(GA, DAG);
10265
10266 if (Subtarget->isTargetDarwin())
10267 return LowerDarwinGlobalTLSAddress(Op, DAG);
10268 if (Subtarget->isTargetELF())
10269 return LowerELFGlobalTLSAddress(Op, DAG);
10270 if (Subtarget->isTargetWindows())
10271 return LowerWindowsGlobalTLSAddress(Op, DAG);
10272
10273 llvm_unreachable("Unexpected platform trying to use TLS");
10274}
10275
10276//===----------------------------------------------------------------------===//
10277// PtrAuthGlobalAddress lowering
10278//
10279// We have 3 lowering alternatives to choose from:
10280// - MOVaddrPAC: similar to MOVaddr, with added PAC.
10281// If the GV doesn't need a GOT load (i.e., is locally defined)
10282// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
10283//
10284// - LOADgotPAC: similar to LOADgot, with added PAC.
10285// If the GV needs a GOT load, materialize the pointer using the usual
10286 // GOT adrp+ldr, +pac. Pointers in GOT are assumed not to be signed; the GOT
10287// section is assumed to be read-only (for example, via relro mechanism). See
10288// LowerMOVaddrPAC.
10289//
10290// - LOADauthptrstatic: similar to LOADgot, but use a
10291// special stub slot instead of a GOT slot.
10292// Load a signed pointer for symbol 'sym' from a stub slot named
10293// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
10294// resolving. This usually lowers to adrp+ldr, but also emits an entry into
10295// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
10296//
10297 // All 3 are pseudos that are expanded late to longer sequences: this lets us
10298// provide integrity guarantees on the to-be-signed intermediate values.
10299//
10300// LOADauthptrstatic is undesirable because it requires a large section filled
10301// with often similarly-signed pointers, making it a good harvesting target.
10302// Thus, it's only used for ptrauth references to extern_weak to avoid null
10303// checks.
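//
// As a rough sketch (the pseudos are expanded late), MOVaddrPAC for a locally
// defined symbol becomes an adrp+add to materialize the raw address, an
// optional movk to blend in the constant discriminator, and a PAC* signing
// instruction with the requested key; LOADgotPAC instead loads the raw
// pointer from the GOT before signing it in the same way.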
10304
10305static SDValue LowerPtrAuthGlobalAddressStatically(
10306 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
10307 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
10308 const auto *TGN = cast<GlobalAddressSDNode>(Val: TGA.getNode());
10309 assert(TGN->getGlobal()->hasExternalWeakLinkage());
10310
10311 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
10312 // offset alone as a pointer if the symbol wasn't available, which would
10313 // probably break null checks in users. Ptrauth complicates things further:
10314 // error out.
10315 if (TGN->getOffset() != 0)
10316 report_fatal_error(
10317 reason: "unsupported non-zero offset in weak ptrauth global reference");
10318
10319 if (!isNullConstant(V: AddrDiscriminator))
10320 report_fatal_error(reason: "unsupported weak addr-div ptrauth global");
10321
10322 SDValue Key = DAG.getTargetConstant(Val: KeyC, DL, VT: MVT::i32);
10323 return SDValue(DAG.getMachineNode(Opcode: AArch64::LOADauthptrstatic, dl: DL, VT: MVT::i64,
10324 Ops: {TGA, Key, Discriminator}),
10325 0);
10326}
10327
10328SDValue
10329AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
10330 SelectionDAG &DAG) const {
10331 SDValue Ptr = Op.getOperand(i: 0);
10332 uint64_t KeyC = Op.getConstantOperandVal(i: 1);
10333 SDValue AddrDiscriminator = Op.getOperand(i: 2);
10334 uint64_t DiscriminatorC = Op.getConstantOperandVal(i: 3);
10335 EVT VT = Op.getValueType();
10336 SDLoc DL(Op);
10337
10338 if (KeyC > AArch64PACKey::LAST)
10339 report_fatal_error(reason: "key in ptrauth global out of range [0, " +
10340 Twine((int)AArch64PACKey::LAST) + "]");
10341
10342 // Blend only works if the integer discriminator is 16-bit wide.
10343 if (!isUInt<16>(x: DiscriminatorC))
10344 report_fatal_error(
10345 reason: "constant discriminator in ptrauth global out of range [0, 0xffff]");
10346
10347 // Choosing between 3 lowering alternatives is target-specific.
10348 if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
10349 report_fatal_error(reason: "ptrauth global lowering only supported on MachO/ELF");
10350
10351 int64_t PtrOffsetC = 0;
10352 if (Ptr.getOpcode() == ISD::ADD) {
10353 PtrOffsetC = Ptr.getConstantOperandVal(i: 1);
10354 Ptr = Ptr.getOperand(i: 0);
10355 }
10356 const auto *PtrN = cast<GlobalAddressSDNode>(Val: Ptr.getNode());
10357 const GlobalValue *PtrGV = PtrN->getGlobal();
10358
10359 // Classify the reference to determine whether it needs a GOT load.
10360 const unsigned OpFlags =
10361 Subtarget->ClassifyGlobalReference(GV: PtrGV, TM: getTargetMachine());
10362 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
10363 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
10364 "unsupported non-GOT op flags on ptrauth global reference");
10365
10366 // Fold any offset into the GV; our pseudos expect it there.
10367 PtrOffsetC += PtrN->getOffset();
10368 SDValue TPtr = DAG.getTargetGlobalAddress(GV: PtrGV, DL, VT, offset: PtrOffsetC,
10369 /*TargetFlags=*/0);
10370 assert(PtrN->getTargetFlags() == 0 &&
10371 "unsupported target flags on ptrauth global");
10372
10373 SDValue Key = DAG.getTargetConstant(Val: KeyC, DL, VT: MVT::i32);
10374 SDValue Discriminator = DAG.getTargetConstant(Val: DiscriminatorC, DL, VT: MVT::i64);
10375 SDValue TAddrDiscriminator = !isNullConstant(V: AddrDiscriminator)
10376 ? AddrDiscriminator
10377 : DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64);
10378
10379 // No GOT load needed -> MOVaddrPAC
10380 if (!NeedsGOTLoad) {
10381 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
10382 return SDValue(
10383 DAG.getMachineNode(Opcode: AArch64::MOVaddrPAC, dl: DL, VT: MVT::i64,
10384 Ops: {TPtr, Key, TAddrDiscriminator, Discriminator}),
10385 0);
10386 }
10387
10388 // GOT load -> LOADgotPAC
10389 // Note that we disallow extern_weak refs to avoid null checks later.
10390 if (!PtrGV->hasExternalWeakLinkage())
10391 return SDValue(
10392 DAG.getMachineNode(Opcode: AArch64::LOADgotPAC, dl: DL, VT: MVT::i64,
10393 Ops: {TPtr, Key, TAddrDiscriminator, Discriminator}),
10394 0);
10395
10396 // extern_weak ref -> LOADauthptrstatic
10397 return LowerPtrAuthGlobalAddressStatically(
10398 TGA: TPtr, DL, VT, KeyC: (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
10399 DAG);
10400}
10401
10402// Looks through \param Val to determine the bit that can be used to
10403// check the sign of the value. It returns the unextended value and
10404// the sign bit position.
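// For example, (sign_extend_inreg x, i8) yields {x, 7}, while a plain i64
// value yields {value, 63}.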
10405std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
10406 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
10407 return {Val.getOperand(i: 0),
10408 cast<VTSDNode>(Val: Val.getOperand(i: 1))->getVT().getFixedSizeInBits() -
10409 1};
10410
10411 if (Val.getOpcode() == ISD::SIGN_EXTEND)
10412 return {Val.getOperand(i: 0),
10413 Val.getOperand(i: 0)->getValueType(ResNo: 0).getFixedSizeInBits() - 1};
10414
10415 return {Val, Val.getValueSizeInBits() - 1};
10416}
10417
10418SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
10419 SDValue Chain = Op.getOperand(i: 0);
10420 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 1))->get();
10421 SDValue LHS = Op.getOperand(i: 2);
10422 SDValue RHS = Op.getOperand(i: 3);
10423 SDValue Dest = Op.getOperand(i: 4);
10424 SDLoc DL(Op);
10425
10426 MachineFunction &MF = DAG.getMachineFunction();
10427 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
10428 // will not be produced, as they are conditional branch instructions that do
10429 // not set flags.
10430 bool ProduceNonFlagSettingCondBr =
10431 !MF.getFunction().hasFnAttribute(Kind: Attribute::SpeculativeLoadHardening);
10432
10433 // Handle f128 first, since lowering it will result in comparing the return
10434 // value of a libcall against zero, which is just what the rest of LowerBR_CC
10435 // is expecting to deal with.
10436 if (LHS.getValueType() == MVT::f128) {
10437 softenSetCCOperands(DAG, VT: MVT::f128, NewLHS&: LHS, NewRHS&: RHS, CCCode&: CC, DL, OldLHS: LHS, OldRHS: RHS);
10438
10439 // If softenSetCCOperands returned a scalar, we need to compare the result
10440 // against zero to select between true and false values.
10441 if (!RHS.getNode()) {
10442 RHS = DAG.getConstant(Val: 0, DL, VT: LHS.getValueType());
10443 CC = ISD::SETNE;
10444 }
10445 }
10446
10447 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
10448 // instruction.
10449 if (ISD::isOverflowIntrOpRes(Op: LHS) && isOneConstant(V: RHS) &&
10450 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
10451 // Only lower legal XALUO ops.
10452 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: LHS->getValueType(ResNo: 0)))
10453 return SDValue();
10454
10455 // The actual operation with overflow check.
10456 AArch64CC::CondCode OFCC;
10457 SDValue Value, Overflow;
10458 std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC&: OFCC, Op: LHS.getValue(R: 0), DAG);
10459
10460 if (CC == ISD::SETNE)
10461 OFCC = getInvertedCondCode(Code: OFCC);
10462 SDValue CCVal = DAG.getConstant(Val: OFCC, DL, VT: MVT::i32);
10463
10464 return DAG.getNode(Opcode: AArch64ISD::BRCOND, DL, VT: MVT::Other, N1: Chain, N2: Dest, N3: CCVal,
10465 N4: Overflow);
10466 }
10467
10468 if (LHS.getValueType().isInteger()) {
10469 assert((LHS.getValueType() == RHS.getValueType()) &&
10470 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
10471
10472 // If the RHS of the comparison is zero, we can potentially fold this
10473 // to a specialized branch.
10474 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Val&: RHS);
10475 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
10476 if (CC == ISD::SETEQ) {
10477 // See if we can use a TBZ to fold in an AND as well.
10478 // TBZ has a smaller branch displacement than CBZ. If the offset is
10479 // out of bounds, a late MI-layer pass rewrites branches.
10480 // 403.gcc is an example that hits this case.
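// For example, (brcond (seteq (and x, 4), 0), dest) becomes "tbz x, #2, dest".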
10481 if (LHS.getOpcode() == ISD::AND &&
10482 isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) &&
10483 isPowerOf2_64(Value: LHS.getConstantOperandVal(i: 1))) {
10484 SDValue Test = LHS.getOperand(i: 0);
10485 uint64_t Mask = LHS.getConstantOperandVal(i: 1);
10486 return DAG.getNode(Opcode: AArch64ISD::TBZ, DL, VT: MVT::Other, N1: Chain, N2: Test,
10487 N3: DAG.getConstant(Val: Log2_64(Value: Mask), DL, VT: MVT::i64),
10488 N4: Dest);
10489 }
10490
10491 return DAG.getNode(Opcode: AArch64ISD::CBZ, DL, VT: MVT::Other, N1: Chain, N2: LHS, N3: Dest);
10492 } else if (CC == ISD::SETNE) {
10493 // See if we can use a TBZ to fold in an AND as well.
10494 // TBZ has a smaller branch displacement than CBZ. If the offset is
10495 // out of bounds, a late MI-layer pass rewrites branches.
10496 // 403.gcc is an example that hits this case.
10497 if (LHS.getOpcode() == ISD::AND &&
10498 isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) &&
10499 isPowerOf2_64(Value: LHS.getConstantOperandVal(i: 1))) {
10500 SDValue Test = LHS.getOperand(i: 0);
10501 uint64_t Mask = LHS.getConstantOperandVal(i: 1);
10502 return DAG.getNode(Opcode: AArch64ISD::TBNZ, DL, VT: MVT::Other, N1: Chain, N2: Test,
10503 N3: DAG.getConstant(Val: Log2_64(Value: Mask), DL, VT: MVT::i64),
10504 N4: Dest);
10505 }
10506
10507 return DAG.getNode(Opcode: AArch64ISD::CBNZ, DL, VT: MVT::Other, N1: Chain, N2: LHS, N3: Dest);
10508 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
10509 // Don't combine AND since emitComparison converts the AND to an ANDS
10510 // (a.k.a. TST) and the test in the test bit and branch instruction
10511 // becomes redundant. This would also increase register pressure.
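// For example, (brcond (setlt x, 0), dest) with i64 x becomes
// "tbnz x, #63, dest".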
10512 uint64_t SignBitPos;
10513 std::tie(args&: LHS, args&: SignBitPos) = lookThroughSignExtension(Val: LHS);
10514 return DAG.getNode(Opcode: AArch64ISD::TBNZ, DL, VT: MVT::Other, N1: Chain, N2: LHS,
10515 N3: DAG.getConstant(Val: SignBitPos, DL, VT: MVT::i64), N4: Dest);
10516 }
10517 }
10518 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
10519 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
10520 // Don't combine AND since emitComparison converts the AND to an ANDS
10521 // (a.k.a. TST) and the test in the test bit and branch instruction
10522 // becomes redundant. This would also increase register pressure.
10523 uint64_t SignBitPos;
10524 std::tie(args&: LHS, args&: SignBitPos) = lookThroughSignExtension(Val: LHS);
10525 return DAG.getNode(Opcode: AArch64ISD::TBZ, DL, VT: MVT::Other, N1: Chain, N2: LHS,
10526 N3: DAG.getConstant(Val: SignBitPos, DL, VT: MVT::i64), N4: Dest);
10527 }
10528
10529 // Try to emit Armv9.6 CB instructions. We prefer tb{n}z/cb{n}z due to their
10530 // larger branch displacement but do prefer CB over cmp + br.
10531 if (Subtarget->hasCMPBR() &&
10532 AArch64CC::isValidCBCond(Code: changeIntCCToAArch64CC(CC)) &&
10533 ProduceNonFlagSettingCondBr) {
10534 SDValue Cond =
10535 DAG.getTargetConstant(Val: changeIntCCToAArch64CC(CC), DL, VT: MVT::i32);
10536 return DAG.getNode(Opcode: AArch64ISD::CB, DL, VT: MVT::Other, N1: Chain, N2: Cond, N3: LHS, N4: RHS,
10537 N5: Dest);
10538 }
10539
10540 SDValue CCVal;
10541 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, AArch64cc&: CCVal, DAG, DL);
10542 return DAG.getNode(Opcode: AArch64ISD::BRCOND, DL, VT: MVT::Other, N1: Chain, N2: Dest, N3: CCVal,
10543 N4: Cmp);
10544 }
10545
10546 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
10547 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
10548
10549 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10550 // clean. Some of them require two branches to implement.
10551 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
10552 AArch64CC::CondCode CC1, CC2;
10553 changeFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2);
10554 SDValue CC1Val = DAG.getConstant(Val: CC1, DL, VT: MVT::i32);
10555 SDValue BR1 =
10556 DAG.getNode(Opcode: AArch64ISD::BRCOND, DL, VT: MVT::Other, N1: Chain, N2: Dest, N3: CC1Val, N4: Cmp);
10557 if (CC2 != AArch64CC::AL) {
10558 SDValue CC2Val = DAG.getConstant(Val: CC2, DL, VT: MVT::i32);
10559 return DAG.getNode(Opcode: AArch64ISD::BRCOND, DL, VT: MVT::Other, N1: BR1, N2: Dest, N3: CC2Val,
10560 N4: Cmp);
10561 }
10562
10563 return BR1;
10564}
10565
10566SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
10567 SelectionDAG &DAG) const {
10568 if (!Subtarget->isNeonAvailable() &&
10569 !Subtarget->useSVEForFixedLengthVectors())
10570 return SDValue();
10571
10572 EVT VT = Op.getValueType();
10573 EVT IntVT = VT.changeTypeToInteger();
10574 SDLoc DL(Op);
10575
10576 SDValue In1 = Op.getOperand(i: 0);
10577 SDValue In2 = Op.getOperand(i: 1);
10578 EVT SrcVT = In2.getValueType();
10579
10580 if (!SrcVT.bitsEq(VT))
10581 In2 = DAG.getFPExtendOrRound(Op: In2, DL, VT);
10582
10583 if (VT.isScalableVector())
10584 IntVT =
10585 getPackedSVEVectorVT(VT: VT.getVectorElementType().changeTypeToInteger());
10586
10587 if (VT.isFixedLengthVector() &&
10588 useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) {
10589 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
10590
10591 In1 = convertToScalableVector(DAG, VT: ContainerVT, V: In1);
10592 In2 = convertToScalableVector(DAG, VT: ContainerVT, V: In2);
10593
10594 SDValue Res = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: ContainerVT, N1: In1, N2: In2);
10595 return convertFromScalableVector(DAG, VT, V: Res);
10596 }
10597
10598 // With SVE, but without Neon, insert the scalars into scalable vectors and use
10599 // an SVE FCOPYSIGN.
10600 if (!VT.isVector() && !Subtarget->isNeonAvailable() &&
10601 Subtarget->isSVEorStreamingSVEAvailable()) {
10602 if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64 && VT != MVT::bf16)
10603 return SDValue();
10604 EVT SVT = getPackedSVEVectorVT(VT);
10605
10606 SDValue Ins1 =
10607 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: SVT, N1: DAG.getUNDEF(VT: SVT), N2: In1,
10608 N3: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
10609 SDValue Ins2 =
10610 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: SVT, N1: DAG.getUNDEF(VT: SVT), N2: In2,
10611 N3: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
10612 SDValue FCS = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: SVT, N1: Ins1, N2: Ins2);
10613 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: FCS,
10614 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
10615 }
10616
10617 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
10618 if (VT.isScalableVector())
10619 return getSVESafeBitCast(VT, Op, DAG);
10620
10621 return DAG.getBitcast(VT, V: Op);
10622 };
10623
10624 SDValue VecVal1, VecVal2;
10625 EVT VecVT;
10626 auto SetVecVal = [&](int Idx = -1) {
10627 if (!VT.isVector()) {
10628 VecVal1 =
10629 DAG.getTargetInsertSubreg(SRIdx: Idx, DL, VT: VecVT, Operand: DAG.getUNDEF(VT: VecVT), Subreg: In1);
10630 VecVal2 =
10631 DAG.getTargetInsertSubreg(SRIdx: Idx, DL, VT: VecVT, Operand: DAG.getUNDEF(VT: VecVT), Subreg: In2);
10632 } else {
10633 VecVal1 = BitCast(VecVT, In1, DAG);
10634 VecVal2 = BitCast(VecVT, In2, DAG);
10635 }
10636 };
10637 if (VT.isVector()) {
10638 VecVT = IntVT;
10639 SetVecVal();
10640 } else if (VT == MVT::f64) {
10641 VecVT = MVT::v2i64;
10642 SetVecVal(AArch64::dsub);
10643 } else if (VT == MVT::f32) {
10644 VecVT = MVT::v4i32;
10645 SetVecVal(AArch64::ssub);
10646 } else if (VT == MVT::f16 || VT == MVT::bf16) {
10647 VecVT = MVT::v8i16;
10648 SetVecVal(AArch64::hsub);
10649 } else {
10650 llvm_unreachable("Invalid type for copysign!");
10651 }
10652
10653 unsigned BitWidth = In1.getScalarValueSizeInBits();
10654 SDValue SignMaskV = DAG.getConstant(Val: ~APInt::getSignMask(BitWidth), DL, VT: VecVT);
10655
10656 // We want to materialize a mask with every bit but the high bit set, but the
10657 // AdvSIMD immediate moves cannot materialize that in a single instruction for
10658 // 64-bit elements. Instead, materialize all bits set and then negate that.
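// For example, for v2i64 the per-lane 0x7fffffffffffffff mask is produced by
// applying FNEG (which only flips the sign bit) to an all-ones vector.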
10659 if (VT == MVT::f64 || VT == MVT::v2f64) {
10660 SignMaskV = DAG.getConstant(Val: APInt::getAllOnes(numBits: BitWidth), DL, VT: VecVT);
10661 SignMaskV = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2f64, Operand: SignMaskV);
10662 SignMaskV = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::v2f64, Operand: SignMaskV);
10663 SignMaskV = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i64, Operand: SignMaskV);
10664 }
10665
10666 SDValue BSP =
10667 DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT: VecVT, N1: SignMaskV, N2: VecVal1, N3: VecVal2);
10668 if (VT == MVT::f16 || VT == MVT::bf16)
10669 return DAG.getTargetExtractSubreg(SRIdx: AArch64::hsub, DL, VT, Operand: BSP);
10670 if (VT == MVT::f32)
10671 return DAG.getTargetExtractSubreg(SRIdx: AArch64::ssub, DL, VT, Operand: BSP);
10672 if (VT == MVT::f64)
10673 return DAG.getTargetExtractSubreg(SRIdx: AArch64::dsub, DL, VT, Operand: BSP);
10674
10675 return BitCast(VT, BSP, DAG);
10676}
10677
10678SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
10679 SelectionDAG &DAG) const {
10680 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
10681 Kind: Attribute::NoImplicitFloat))
10682 return SDValue();
10683
10684 EVT VT = Op.getValueType();
10685 if (VT.isScalableVector() ||
10686 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
10687 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::CTPOP_MERGE_PASSTHRU);
10688
10689 bool IsParity = Op.getOpcode() == ISD::PARITY;
10690 SDValue Val = Op.getOperand(i: 0);
10691 SDLoc DL(Op);
10692
10693 // For i32, the generic parity expansion using EORs is more efficient than
10694 // using floating point.
10695 if (VT == MVT::i32 && IsParity)
10696 return SDValue();
10697
10698 if (Subtarget->isSVEorStreamingSVEAvailable()) {
10699 if (VT == MVT::i32 || VT == MVT::i64) {
10700 EVT ContainerVT = VT == MVT::i32 ? MVT::nxv4i32 : MVT::nxv2i64;
10701 Val = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ContainerVT,
10702 N1: DAG.getUNDEF(VT: ContainerVT), N2: Val,
10703 N3: DAG.getVectorIdxConstant(Val: 0, DL));
10704 Val = DAG.getNode(Opcode: ISD::CTPOP, DL, VT: ContainerVT, Operand: Val);
10705 Val = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: Val,
10706 N2: DAG.getVectorIdxConstant(Val: 0, DL));
10707 if (IsParity)
10708 Val = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Val, N2: DAG.getConstant(Val: 1, DL, VT));
10709 return Val;
10710 }
10711
10712 if (VT == MVT::i128) {
10713 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i64, Operand: Val);
10714 Val = convertToScalableVector(DAG, VT: MVT::nxv2i64, V: Val);
10715 Val = DAG.getNode(Opcode: ISD::CTPOP, DL, VT: MVT::nxv2i64, Operand: Val);
10716 Val = convertFromScalableVector(DAG, VT: MVT::v2i64, V: Val);
10717 Val = DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: MVT::i64, Operand: Val);
10718 Val = DAG.getZExtOrTrunc(Op: Val, DL, VT);
10719 if (IsParity)
10720 Val = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Val, N2: DAG.getConstant(Val: 1, DL, VT));
10721 return Val;
10722 }
10723 }
10724
10725 if (!Subtarget->isNeonAvailable())
10726 return SDValue();
10727
10728 // While there is no scalar popcount instruction, GPR popcount can
10729 // be more efficiently lowered to the following sequence that uses
10730 // AdvSIMD registers/instructions as long as the copies to/from
10731 // the AdvSIMD registers are cheap.
10732 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
10733 // CNT V0.8B, V0.8B // 8xbyte pop-counts
10734 // ADDV B0, V0.8B // sum 8xbyte pop-counts
10735 // FMOV X0, D0 // copy result back to integer reg
10736 if (VT == MVT::i32 || VT == MVT::i64) {
10737 if (VT == MVT::i32)
10738 Val = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: Val);
10739 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v8i8, Operand: Val);
10740
10741 SDValue CtPop = DAG.getNode(Opcode: ISD::CTPOP, DL, VT: MVT::v8i8, Operand: Val);
10742 SDValue AddV = DAG.getNode(Opcode: AArch64ISD::UADDV, DL, VT: MVT::v8i8, Operand: CtPop);
10743 AddV = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL,
10744 VT: VT == MVT::i32 ? MVT::v2i32 : MVT::v1i64, Operand: AddV);
10745 AddV = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: AddV,
10746 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
10747 if (IsParity)
10748 AddV = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: AddV, N2: DAG.getConstant(Val: 1, DL, VT));
10749 return AddV;
10750 } else if (VT == MVT::i128) {
10751 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v16i8, Operand: Val);
10752
10753 SDValue CtPop = DAG.getNode(Opcode: ISD::CTPOP, DL, VT: MVT::v16i8, Operand: Val);
10754 SDValue AddV = DAG.getNode(Opcode: AArch64ISD::UADDV, DL, VT: MVT::v16i8, Operand: CtPop);
10755 AddV = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i64,
10756 N1: DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: MVT::v2i64, Operand: AddV),
10757 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
10758 AddV = DAG.getZExtOrTrunc(Op: AddV, DL, VT);
10759 if (IsParity)
10760 AddV = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: AddV, N2: DAG.getConstant(Val: 1, DL, VT));
10761 return AddV;
10762 }
10763
10764 assert(!IsParity && "ISD::PARITY of vector types not supported");
10765
10766 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
10767 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
10768 "Unexpected type for custom ctpop lowering");
10769
10770 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
10771 Val = DAG.getBitcast(VT: VT8Bit, V: Val);
10772 Val = DAG.getNode(Opcode: ISD::CTPOP, DL, VT: VT8Bit, Operand: Val);
10773
10774 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
10775 VT.getVectorNumElements() >= 2) {
10776 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
10777 SDValue Zeros = DAG.getConstant(Val: 0, DL, VT: DT);
10778 SDValue Ones = DAG.getConstant(Val: 1, DL, VT: VT8Bit);
10779
10780 if (VT == MVT::v2i64) {
10781 Val = DAG.getNode(Opcode: AArch64ISD::UDOT, DL, VT: DT, N1: Zeros, N2: Ones, N3: Val);
10782 Val = DAG.getNode(Opcode: AArch64ISD::UADDLP, DL, VT, Operand: Val);
10783 } else if (VT == MVT::v2i32) {
10784 Val = DAG.getNode(Opcode: AArch64ISD::UDOT, DL, VT: DT, N1: Zeros, N2: Ones, N3: Val);
10785 } else if (VT == MVT::v4i32) {
10786 Val = DAG.getNode(Opcode: AArch64ISD::UDOT, DL, VT: DT, N1: Zeros, N2: Ones, N3: Val);
10787 } else {
10788 llvm_unreachable("Unexpected type for custom ctpop lowering");
10789 }
10790
10791 return Val;
10792 }
10793
10794 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
10795 unsigned EltSize = 8;
10796 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
10797 while (EltSize != VT.getScalarSizeInBits()) {
10798 EltSize *= 2;
10799 NumElts /= 2;
10800 MVT WidenVT = MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: EltSize), NumElements: NumElts);
10801 Val = DAG.getNode(Opcode: AArch64ISD::UADDLP, DL, VT: WidenVT, Operand: Val);
10802 }
10803
10804 return Val;
10805}
10806
10807SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
10808 EVT VT = Op.getValueType();
10809 assert(VT.isScalableVector() ||
10810 useSVEForFixedLengthVectorVT(
10811 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
10812
10813 SDLoc DL(Op);
10814 SDValue RBIT = DAG.getNode(Opcode: ISD::BITREVERSE, DL, VT, Operand: Op.getOperand(i: 0));
10815 return DAG.getNode(Opcode: ISD::CTLZ, DL, VT, Operand: RBIT);
10816}
10817
10818SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
10819 SelectionDAG &DAG) const {
10820
10821 EVT VT = Op.getValueType();
10822 SDLoc DL(Op);
10823 unsigned Opcode = Op.getOpcode();
10824 ISD::CondCode CC;
10825 switch (Opcode) {
10826 default:
10827 llvm_unreachable("Wrong instruction");
10828 case ISD::SMAX:
10829 CC = ISD::SETGT;
10830 break;
10831 case ISD::SMIN:
10832 CC = ISD::SETLT;
10833 break;
10834 case ISD::UMAX:
10835 CC = ISD::SETUGT;
10836 break;
10837 case ISD::UMIN:
10838 CC = ISD::SETULT;
10839 break;
10840 }
10841
10842 if (VT.isScalableVector() ||
10843 useSVEForFixedLengthVectorVT(
10844 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
10845 switch (Opcode) {
10846 default:
10847 llvm_unreachable("Wrong instruction");
10848 case ISD::SMAX:
10849 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SMAX_PRED);
10850 case ISD::SMIN:
10851 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SMIN_PRED);
10852 case ISD::UMAX:
10853 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::UMAX_PRED);
10854 case ISD::UMIN:
10855 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::UMIN_PRED);
10856 }
10857 }
10858
10859 SDValue Op0 = Op.getOperand(i: 0);
10860 SDValue Op1 = Op.getOperand(i: 1);
10861 SDValue Cond = DAG.getSetCC(DL, VT, LHS: Op0, RHS: Op1, Cond: CC);
10862 return DAG.getSelect(DL, VT, Cond, LHS: Op0, RHS: Op1);
10863}
10864
10865SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
10866 SelectionDAG &DAG) const {
10867 EVT VT = Op.getValueType();
10868
10869 if (VT.isScalableVector() ||
10870 useSVEForFixedLengthVectorVT(
10871 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
10872 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
10873
10874 SDLoc DL(Op);
10875 SDValue REVB;
10876 MVT VST;
10877
10878 switch (VT.getSimpleVT().SimpleTy) {
10879 default:
10880 llvm_unreachable("Invalid type for bitreverse!");
10881
10882 case MVT::v2i32: {
10883 VST = MVT::v8i8;
10884 REVB = DAG.getNode(Opcode: AArch64ISD::REV32, DL, VT: VST, Operand: Op.getOperand(i: 0));
10885
10886 break;
10887 }
10888
10889 case MVT::v4i32: {
10890 VST = MVT::v16i8;
10891 REVB = DAG.getNode(Opcode: AArch64ISD::REV32, DL, VT: VST, Operand: Op.getOperand(i: 0));
10892
10893 break;
10894 }
10895
10896 case MVT::v1i64: {
10897 VST = MVT::v8i8;
10898 REVB = DAG.getNode(Opcode: AArch64ISD::REV64, DL, VT: VST, Operand: Op.getOperand(i: 0));
10899
10900 break;
10901 }
10902
10903 case MVT::v2i64: {
10904 VST = MVT::v16i8;
10905 REVB = DAG.getNode(Opcode: AArch64ISD::REV64, DL, VT: VST, Operand: Op.getOperand(i: 0));
10906
10907 break;
10908 }
10909 }
10910
10911 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT,
10912 Operand: DAG.getNode(Opcode: ISD::BITREVERSE, DL, VT: VST, Operand: REVB));
10913}
10914
10915 // Check whether N forms a chain of ORs over XOR comparisons.
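// For example, (or (or (xor a0, a1), (xor b0, b1)), (xor c0, c1)) collects
// the pairs {a0,a1}, {b0,b1} and {c0,c1} into WorkList.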
10916static bool
10917isOrXorChain(SDValue N, unsigned &Num,
10918 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
10919 if (Num == MaxXors)
10920 return false;
10921
10922 // Skip the one-use zext
10923 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
10924 N = N->getOperand(Num: 0);
10925
10926 // The leaf node must be XOR
10927 if (N->getOpcode() == ISD::XOR) {
10928 WorkList.push_back(Elt: std::make_pair(x: N->getOperand(Num: 0), y: N->getOperand(Num: 1)));
10929 Num++;
10930 return true;
10931 }
10932
10933 // All the non-leaf nodes must be OR.
10934 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
10935 return false;
10936
10937 if (isOrXorChain(N: N->getOperand(Num: 0), Num, WorkList) &&
10938 isOrXorChain(N: N->getOperand(Num: 1), Num, WorkList))
10939 return true;
10940 return false;
10941}
10942
10943 // Transform chains of ORs and XORs, which are usually outlined from memcmp/bcmp.
10944static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
10945 SDValue LHS = N->getOperand(Num: 0);
10946 SDValue RHS = N->getOperand(Num: 1);
10947 SDLoc DL(N);
10948 EVT VT = N->getValueType(ResNo: 0);
10949 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
10950
10951 // Only handle integer compares.
10952 if (N->getOpcode() != ISD::SETCC)
10953 return SDValue();
10954
10955 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
10956 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
10957 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
10958 unsigned NumXors = 0;
10959 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(V: RHS) &&
10960 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
10961 isOrXorChain(N: LHS, Num&: NumXors, WorkList)) {
10962 SDValue XOR0, XOR1;
10963 std::tie(args&: XOR0, args&: XOR1) = WorkList[0];
10964 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
10965 SDValue Cmp = DAG.getSetCC(DL, VT, LHS: XOR0, RHS: XOR1, Cond);
10966 for (unsigned I = 1; I < WorkList.size(); I++) {
10967 std::tie(args&: XOR0, args&: XOR1) = WorkList[I];
10968 SDValue CmpChain = DAG.getSetCC(DL, VT, LHS: XOR0, RHS: XOR1, Cond);
10969 Cmp = DAG.getNode(Opcode: LogicOp, DL, VT, N1: Cmp, N2: CmpChain);
10970 }
10971
10972 // Exit early with the combined comparison; this helps reduce indentation.
10973 return Cmp;
10974 }
10975
10976 return SDValue();
10977}
10978
10979SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
10980
10981 if (Op.getValueType().isVector())
10982 return LowerVSETCC(Op, DAG);
10983
10984 bool IsStrict = Op->isStrictFPOpcode();
10985 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10986 unsigned OpNo = IsStrict ? 1 : 0;
10987 SDValue Chain;
10988 if (IsStrict)
10989 Chain = Op.getOperand(i: 0);
10990 SDValue LHS = Op.getOperand(i: OpNo + 0);
10991 SDValue RHS = Op.getOperand(i: OpNo + 1);
10992 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: OpNo + 2))->get();
10993 SDLoc DL(Op);
10994
10995 // We chose ZeroOrOneBooleanContents, so use zero and one.
10996 EVT VT = Op.getValueType();
10997 SDValue TVal = DAG.getConstant(Val: 1, DL, VT);
10998 SDValue FVal = DAG.getConstant(Val: 0, DL, VT);
10999
11000 // Handle f128 first, since one possible outcome is a normal integer
11001 // comparison which gets picked up by the next if statement.
11002 if (LHS.getValueType() == MVT::f128) {
11003 softenSetCCOperands(DAG, VT: MVT::f128, NewLHS&: LHS, NewRHS&: RHS, CCCode&: CC, DL, OldLHS: LHS, OldRHS: RHS, Chain,
11004 IsSignaling);
11005
11006 // If softenSetCCOperands returned a scalar, use it.
11007 if (!RHS.getNode()) {
11008 assert(LHS.getValueType() == Op.getValueType() &&
11009 "Unexpected setcc expansion!");
11010 return IsStrict ? DAG.getMergeValues(Ops: {LHS, Chain}, dl: DL) : LHS;
11011 }
11012 }
11013
11014 if (LHS.getValueType().isInteger()) {
11015
11016 simplifySetCCIntoEq(CC, LHS, RHS, DAG, DL);
11017
11018 SDValue CCVal;
11019 SDValue Cmp = getAArch64Cmp(
11020 LHS, RHS, CC: ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType()), AArch64cc&: CCVal, DAG, DL);
11021
11022 // Note that we inverted the condition above, so we reverse the order of
11023 // the true and false operands here. This will allow the setcc to be
11024 // matched to a single CSINC instruction.
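// For example, an i32 (setcc x, y, eq) then becomes "cmp x, y" followed by
// "cset w0, eq", which is a CSINC of wzr under the inverted condition.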
11025 SDValue Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: FVal, N2: TVal, N3: CCVal, N4: Cmp);
11026 return IsStrict ? DAG.getMergeValues(Ops: {Res, Chain}, dl: DL) : Res;
11027 }
11028
11029 // Now we know we're dealing with FP values.
11030 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
11031 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11032
11033 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
11034 // and do the comparison.
11035 SDValue Cmp;
11036 if (IsStrict)
11037 Cmp = emitStrictFPComparison(LHS, RHS, DL, DAG, Chain, IsSignaling);
11038 else
11039 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11040
11041 AArch64CC::CondCode CC1, CC2;
11042 changeFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2);
11043 SDValue Res;
11044 if (CC2 == AArch64CC::AL) {
11045 changeFPCCToAArch64CC(CC: ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType()), CondCode&: CC1,
11046 CondCode2&: CC2);
11047 SDValue CC1Val = DAG.getConstant(Val: CC1, DL, VT: MVT::i32);
11048
11049 // Note that we inverted the condition above, so we reverse the order of
11050 // the true and false operands here. This will allow the setcc to be
11051 // matched to a single CSINC instruction.
11052 Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: FVal, N2: TVal, N3: CC1Val, N4: Cmp);
11053 } else {
11054 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
11055 // totally clean. Some of them require two CSELs to implement. As is in
11056 // this case, we emit the first CSEL and then emit a second using the output
11057 // of the first as the RHS. We're effectively OR'ing the two CC's together.
11058
11059 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
11060 SDValue CC1Val = DAG.getConstant(Val: CC1, DL, VT: MVT::i32);
11061 SDValue CS1 =
11062 DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: TVal, N2: FVal, N3: CC1Val, N4: Cmp);
11063
11064 SDValue CC2Val = DAG.getConstant(Val: CC2, DL, VT: MVT::i32);
11065 Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: TVal, N2: CS1, N3: CC2Val, N4: Cmp);
11066 }
11067 return IsStrict ? DAG.getMergeValues(Ops: {Res, Cmp.getValue(R: 1)}, dl: DL) : Res;
11068}
11069
11070SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
11071 SelectionDAG &DAG) const {
11072
11073 SDValue LHS = Op.getOperand(i: 0);
11074 SDValue RHS = Op.getOperand(i: 1);
11075 EVT VT = LHS.getValueType();
11076 if (VT != MVT::i32 && VT != MVT::i64)
11077 return SDValue();
11078
11079 SDLoc DL(Op);
11080 SDValue Carry = Op.getOperand(i: 2);
11081 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
11082 SDValue InvCarry = valueToCarryFlag(Value: Carry, DAG, Invert: true);
11083 SDValue Cmp = DAG.getNode(Opcode: AArch64ISD::SBCS, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT::Glue),
11084 N1: LHS, N2: RHS, N3: InvCarry);
11085
11086 EVT OpVT = Op.getValueType();
11087 SDValue TVal = DAG.getConstant(Val: 1, DL, VT: OpVT);
11088 SDValue FVal = DAG.getConstant(Val: 0, DL, VT: OpVT);
11089
11090 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: Op.getOperand(i: 3))->get();
11091 ISD::CondCode CondInv = ISD::getSetCCInverse(Operation: Cond, Type: VT);
11092 SDValue CCVal =
11093 DAG.getConstant(Val: changeIntCCToAArch64CC(CC: CondInv), DL, VT: MVT::i32);
11094 // Inputs are swapped because the condition is inverted. This will allow
11095 // matching with a single CSINC instruction.
11096 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: OpVT, N1: FVal, N2: TVal, N3: CCVal,
11097 N4: Cmp.getValue(R: 1));
11098}
11099
11100/// Emit vector comparison for floating-point values, producing a mask.
11101static SDValue emitVectorComparison(SDValue LHS, SDValue RHS,
11102 AArch64CC::CondCode CC, bool NoNans, EVT VT,
11103 const SDLoc &DL, SelectionDAG &DAG) {
11104 assert(VT.getSizeInBits() == LHS.getValueType().getSizeInBits() &&
11105 "function only supposed to emit natural comparisons");
11106
11107 switch (CC) {
11108 default:
11109 return SDValue();
11110 case AArch64CC::NE: {
11111 SDValue Fcmeq = DAG.getNode(Opcode: AArch64ISD::FCMEQ, DL, VT, N1: LHS, N2: RHS);
11112 // Use vector semantics for the inversion to potentially save a copy between
11113 // SIMD and regular registers.
11114 if (!LHS.getValueType().isVector()) {
11115 EVT VecVT =
11116 EVT::getVectorVT(Context&: *DAG.getContext(), VT, NumElements: 128 / VT.getSizeInBits());
11117 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
11118 SDValue MaskVec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: VecVT,
11119 N1: DAG.getUNDEF(VT: VecVT), N2: Fcmeq, N3: Zero);
11120 SDValue InvertedMask = DAG.getNOT(DL, Val: MaskVec, VT: VecVT);
11121 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: InvertedMask, N2: Zero);
11122 }
11123 return DAG.getNOT(DL, Val: Fcmeq, VT);
11124 }
11125 case AArch64CC::EQ:
11126 return DAG.getNode(Opcode: AArch64ISD::FCMEQ, DL, VT, N1: LHS, N2: RHS);
11127 case AArch64CC::GE:
11128 return DAG.getNode(Opcode: AArch64ISD::FCMGE, DL, VT, N1: LHS, N2: RHS);
11129 case AArch64CC::GT:
11130 return DAG.getNode(Opcode: AArch64ISD::FCMGT, DL, VT, N1: LHS, N2: RHS);
11131 case AArch64CC::LE:
11132 if (!NoNans)
11133 return SDValue();
11134 // If we ignore NaNs then we can use the LS implementation.
11135 [[fallthrough]];
11136 case AArch64CC::LS:
11137 return DAG.getNode(Opcode: AArch64ISD::FCMGE, DL, VT, N1: RHS, N2: LHS);
11138 case AArch64CC::LT:
11139 if (!NoNans)
11140 return SDValue();
11141 // If we ignore NaNs then we can use the MI implementation.
11142 [[fallthrough]];
11143 case AArch64CC::MI:
11144 return DAG.getNode(Opcode: AArch64ISD::FCMGT, DL, VT, N1: RHS, N2: LHS);
11145 }
11146}
11147
11148/// For SELECT_CC, when the true/false values are (-1, 0) and the compared
11149/// values are scalars, try to emit a mask generating vector instruction.
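/// For example, (select_cc (setogt x, y), i64 -1, i64 0) with f64 operands can
/// be emitted as a single "fcmgt d0, dx, dy", whose result is already the
/// desired all-ones/all-zeros mask.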
11150static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal,
11151 SDValue FVal, ISD::CondCode CC, bool NoNaNs,
11152 const SDLoc &DL, SelectionDAG &DAG) {
11153 assert(!LHS.getValueType().isVector());
11154 assert(!RHS.getValueType().isVector());
11155
11156 auto *CTVal = dyn_cast<ConstantSDNode>(Val&: TVal);
11157 auto *CFVal = dyn_cast<ConstantSDNode>(Val&: FVal);
11158 if (!CTVal || !CFVal)
11159 return {};
11160 if (!(CTVal->isAllOnes() && CFVal->isZero()) &&
11161 !(CTVal->isZero() && CFVal->isAllOnes()))
11162 return {};
11163
11164 if (CTVal->isZero())
11165 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
11166
11167 EVT VT = TVal.getValueType();
11168 if (VT.getSizeInBits() != LHS.getValueType().getSizeInBits())
11169 return {};
11170
11171 if (!NoNaNs && (CC == ISD::SETUO || CC == ISD::SETO)) {
11172 bool OneNaN = false;
11173 if (LHS == RHS) {
11174 OneNaN = true;
11175 } else if (DAG.isKnownNeverNaN(Op: RHS)) {
11176 OneNaN = true;
11177 RHS = LHS;
11178 } else if (DAG.isKnownNeverNaN(Op: LHS)) {
11179 OneNaN = true;
11180 LHS = RHS;
11181 }
11182 if (OneNaN)
11183 CC = (CC == ISD::SETUO) ? ISD::SETUNE : ISD::SETOEQ;
11184 }
11185
11186 AArch64CC::CondCode CC1;
11187 AArch64CC::CondCode CC2;
11188 bool ShouldInvert = false;
11189 changeVectorFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2, Invert&: ShouldInvert);
11190 SDValue Cmp = emitVectorComparison(LHS, RHS, CC: CC1, NoNans: NoNaNs, VT, DL, DAG);
11191 SDValue Cmp2;
11192 if (CC2 != AArch64CC::AL) {
11193 Cmp2 = emitVectorComparison(LHS, RHS, CC: CC2, NoNans: NoNaNs, VT, DL, DAG);
11194 if (!Cmp2)
11195 return {};
11196 }
11197 if (!Cmp2 && !ShouldInvert)
11198 return Cmp;
11199
11200 EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT, NumElements: 128 / VT.getSizeInBits());
11201 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
11202 Cmp = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: VecVT, N1: DAG.getUNDEF(VT: VecVT), N2: Cmp,
11203 N3: Zero);
11204 if (Cmp2) {
11205 Cmp2 = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: VecVT, N1: DAG.getUNDEF(VT: VecVT),
11206 N2: Cmp2, N3: Zero);
11207 Cmp = DAG.getNode(Opcode: ISD::OR, DL, VT: VecVT, N1: Cmp, N2: Cmp2);
11208 }
11209 if (ShouldInvert)
11210 Cmp = DAG.getNOT(DL, Val: Cmp, VT: VecVT);
11211 Cmp = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: Cmp, N2: Zero);
11212 return Cmp;
11213}
11214
11215SDValue AArch64TargetLowering::LowerSELECT_CC(
11216 ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal,
11217 iterator_range<SDNode::user_iterator> Users, bool HasNoNaNs,
11218 const SDLoc &DL, SelectionDAG &DAG) const {
11219 // Handle f128 first, because it will result in a comparison of some RTLIB
11220 // call result against zero.
11221 if (LHS.getValueType() == MVT::f128) {
11222 softenSetCCOperands(DAG, VT: MVT::f128, NewLHS&: LHS, NewRHS&: RHS, CCCode&: CC, DL, OldLHS: LHS, OldRHS: RHS);
11223
11224 // If softenSetCCOperands returned a scalar, we need to compare the result
11225 // against zero to select between true and false values.
11226 if (!RHS.getNode()) {
11227 RHS = DAG.getConstant(Val: 0, DL, VT: LHS.getValueType());
11228 CC = ISD::SETNE;
11229 }
11230 }
11231
11232 // Also handle f16 (without full fp16) and bf16, for which we need an f32
11233 // comparison.
11233 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
11234 LHS.getValueType() == MVT::bf16) {
11235 LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: LHS);
11236 RHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: RHS);
11237 }
11238
11239 // Next, handle integers.
11240 if (LHS.getValueType().isInteger()) {
11241 assert((LHS.getValueType() == RHS.getValueType()) &&
11242 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11243
11244 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(Val&: FVal);
11245 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(Val&: TVal);
11246 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Val&: RHS);
11247 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
11248 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
11249 // supported types.
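// For example, for i32 "x > -1 ? 1 : -1" becomes
//   asr w8, w0, #31
//   orr w0, w8, #1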
11250 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
11251 CTVal->isOne() && CFVal->isAllOnes() &&
11252 LHS.getValueType() == TVal.getValueType()) {
11253 EVT VT = LHS.getValueType();
11254 SDValue Shift =
11255 DAG.getNode(Opcode: ISD::SRA, DL, VT, N1: LHS,
11256 N2: DAG.getConstant(Val: VT.getSizeInBits() - 1, DL, VT));
11257 return DAG.getNode(Opcode: ISD::OR, DL, VT, N1: Shift, N2: DAG.getConstant(Val: 1, DL, VT));
11258 }
11259
11260 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
11261 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
11262 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
11263 // Both require fewer instructions than a compare and conditional select.
11264 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
11265 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
11266 LHS.getValueType() == RHS.getValueType()) {
11267 EVT VT = LHS.getValueType();
11268 SDValue Shift =
11269 DAG.getNode(Opcode: ISD::SRA, DL, VT, N1: LHS,
11270 N2: DAG.getConstant(Val: VT.getSizeInBits() - 1, DL, VT));
11271
11272 if (CC == ISD::SETGT)
11273 Shift = DAG.getNOT(DL, Val: Shift, VT);
11274
11275 return DAG.getNode(Opcode: ISD::AND, DL, VT, N1: LHS, N2: Shift);
11276 }
11277
11278 unsigned Opcode = AArch64ISD::CSEL;
11279
11280 // If both the TVal and the FVal are constants, see if we can swap them in
11281 // order to form a CSINV or CSINC out of them.
11282 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
11283 std::swap(a&: TVal, b&: FVal);
11284 std::swap(a&: CTVal, b&: CFVal);
11285 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
11286 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
11287 std::swap(a&: TVal, b&: FVal);
11288 std::swap(a&: CTVal, b&: CFVal);
11289 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
11290 } else if (TVal.getOpcode() == ISD::XOR) {
11291 // If TVal is a NOT we want to swap TVal and FVal so that we can match
11292 // with a CSINV rather than a CSEL.
11293 if (isAllOnesConstant(V: TVal.getOperand(i: 1))) {
11294 std::swap(a&: TVal, b&: FVal);
11295 std::swap(a&: CTVal, b&: CFVal);
11296 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
11297 }
11298 } else if (TVal.getOpcode() == ISD::SUB) {
11299 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
11300 // that we can match with a CSNEG rather than a CSEL.
11301 if (isNullConstant(V: TVal.getOperand(i: 0))) {
11302 std::swap(a&: TVal, b&: FVal);
11303 std::swap(a&: CTVal, b&: CFVal);
11304 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
11305 }
11306 } else if (CTVal && CFVal) {
11307 const int64_t TrueVal = CTVal->getSExtValue();
11308 const int64_t FalseVal = CFVal->getSExtValue();
11309 bool Swap = false;
11310
11311 // If both TVal and FVal are constants, see if FVal is the
11312 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
11313 // instead of a CSEL in that case.
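// For example, constants (7, -8) allow CSINV (7 == ~(-8)), (5, -5) allow
// CSNEG, and (4, 3) or (3, 4) allow CSINC.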
11314 if (TrueVal == ~FalseVal) {
11315 Opcode = AArch64ISD::CSINV;
11316 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
11317 TrueVal == -FalseVal) {
11318 Opcode = AArch64ISD::CSNEG;
11319 } else if (TVal.getValueType() == MVT::i32) {
11320 // If our operands are only 32-bit wide, make sure we use 32-bit
11321 // arithmetic for the check whether we can use CSINC. This ensures that
11322 // the addition in the check will wrap around properly in case there is
11323 // an overflow (which would not be the case if we do the check with
11324 // 64-bit arithmetic).
11325 const uint32_t TrueVal32 = CTVal->getZExtValue();
11326 const uint32_t FalseVal32 = CFVal->getZExtValue();
11327
11328 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
11329 Opcode = AArch64ISD::CSINC;
11330
11331 if (TrueVal32 > FalseVal32) {
11332 Swap = true;
11333 }
11334 }
11335 } else {
11336 // 64-bit check whether we can use CSINC.
11337 const uint64_t TrueVal64 = TrueVal;
11338 const uint64_t FalseVal64 = FalseVal;
11339
11340 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
11341 Opcode = AArch64ISD::CSINC;
11342
11343 if (TrueVal > FalseVal) {
11344 Swap = true;
11345 }
11346 }
11347 }
11348
11349 // Swap TVal and FVal if necessary.
11350 if (Swap) {
11351 std::swap(a&: TVal, b&: FVal);
11352 std::swap(a&: CTVal, b&: CFVal);
11353 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
11354 }
11355
11356 if (Opcode != AArch64ISD::CSEL) {
11357 // Drop FVal since we can get its value by simply inverting/negating
11358 // TVal.
11359 FVal = TVal;
11360 }
11361 }
11362
11363 // Avoid materializing a constant when possible by reusing a known value in
11364 // a register. However, don't perform this optimization if the known value
11365 // is one, zero or negative one in the case of a CSEL. We can always
11366 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
11367 // FVal, respectively.
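// For example, "a == 7 ? 7 : x" can be selected as "cmp a, #7" followed by a
// CSEL that reuses the register holding a instead of materializing 7 again.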
11368 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(Val&: RHS);
11369 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
11370 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
11371 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11372 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
11373 // "a != C ? x : a" to avoid materializing C.
11374 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
11375 TVal = LHS;
11376 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
11377 FVal = LHS;
11378 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
11379 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
11380 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
11381 // avoid materializing C.
11382 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11383 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
11384 Opcode = AArch64ISD::CSINV;
11385 TVal = LHS;
11386 FVal = DAG.getConstant(Val: 0, DL, VT: FVal.getValueType());
11387 }
11388 }
11389
11390 SDValue CCVal;
11391 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, AArch64cc&: CCVal, DAG, DL);
11392 EVT VT = TVal.getValueType();
11393 return DAG.getNode(Opcode, DL, VT, N1: TVal, N2: FVal, N3: CCVal, N4: Cmp);
11394 }
11395
11396 // Now we know we're dealing with FP values.
11397 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
11398 LHS.getValueType() == MVT::f64);
11399 assert(LHS.getValueType() == RHS.getValueType());
11400 EVT VT = TVal.getValueType();
11401
11402 // If the purpose of the comparison is to select between all ones
11403 // or all zeros, try to use a vector comparison because the operands are
11404 // already stored in SIMD registers.
11405 if (Subtarget->isNeonAvailable() && all_of(Range&: Users, P: [](const SDNode *U) {
11406 switch (U->getOpcode()) {
11407 default:
11408 return false;
11409 case ISD::INSERT_VECTOR_ELT:
11410 case ISD::SCALAR_TO_VECTOR:
11411 case AArch64ISD::DUP:
11412 return true;
11413 }
11414 })) {
11415 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || HasNoNaNs;
11416 SDValue VectorCmp =
11417 emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG);
11418 if (VectorCmp)
11419 return VectorCmp;
11420 }
11421
11422 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11423
11424 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11425 // clean. Some of them require two CSELs to implement.
11426 AArch64CC::CondCode CC1, CC2;
11427 changeFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2);
11428
11429 if (DAG.getTarget().Options.UnsafeFPMath) {
11430 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
11431 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
11432 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(Val&: RHS);
11433 if (RHSVal && RHSVal->isZero()) {
11434 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(Val&: FVal);
11435 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(Val&: TVal);
11436
11437 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
11438 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
11439 TVal = LHS;
11440 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
11441 CFVal && CFVal->isZero() &&
11442 FVal.getValueType() == LHS.getValueType())
11443 FVal = LHS;
11444 }
11445 }
11446
11447 // Emit first, and possibly only, CSEL.
11448 SDValue CC1Val = DAG.getConstant(Val: CC1, DL, VT: MVT::i32);
11449 SDValue CS1 = DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: TVal, N2: FVal, N3: CC1Val, N4: Cmp);
11450
11451 // If we need a second CSEL, emit it, using the output of the first as the
11452 // RHS. We're effectively OR'ing the two CC's together.
11453 if (CC2 != AArch64CC::AL) {
11454 SDValue CC2Val = DAG.getConstant(Val: CC2, DL, VT: MVT::i32);
11455 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: TVal, N2: CS1, N3: CC2Val, N4: Cmp);
11456 }
11457
11458 // Otherwise, return the output of the first CSEL.
11459 return CS1;
11460}
11461
11462SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
11463 SelectionDAG &DAG) const {
11464 EVT Ty = Op.getValueType();
11465 auto Idx = Op.getConstantOperandAPInt(i: 2);
11466 int64_t IdxVal = Idx.getSExtValue();
11467 assert(Ty.isScalableVector() &&
11468 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
11469
11470 // We can use the splice instruction for certain index values where we are
11471 // able to efficiently generate the correct predicate. The index will be
11472 // inverted and used directly as the input to the ptrue instruction, i.e.
11473 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
11474 // splice predicate. However, we can only do this if we can guarantee that
11475 // there are enough elements in the vector, hence we check the index <= min
11476 // number of elements.
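// For example, for Idx == -2 this emits a "ptrue pN.<T>, vl2", reverses that
// predicate, and feeds it to an SVE SPLICE of the two operands.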
11477 std::optional<unsigned> PredPattern;
11478 if (Ty.isScalableVector() && IdxVal < 0 &&
11479 (PredPattern = getSVEPredPatternFromNumElements(MinNumElts: std::abs(i: IdxVal))) !=
11480 std::nullopt) {
11481 SDLoc DL(Op);
11482
11483 // Create a predicate where all but the last -IdxVal elements are false.
11484 EVT PredVT = Ty.changeVectorElementType(EltVT: MVT::i1);
11485 SDValue Pred = getPTrue(DAG, DL, VT: PredVT, Pattern: *PredPattern);
11486 Pred = DAG.getNode(Opcode: ISD::VECTOR_REVERSE, DL, VT: PredVT, Operand: Pred);
11487
11488 // Now splice the two inputs together using the predicate.
11489 return DAG.getNode(Opcode: AArch64ISD::SPLICE, DL, VT: Ty, N1: Pred, N2: Op.getOperand(i: 0),
11490 N3: Op.getOperand(i: 1));
11491 }
11492
11493 // We can select to an EXT instruction when indexing the first 256 bytes.
11494 unsigned BlockSize = AArch64::SVEBitsPerBlock / Ty.getVectorMinNumElements();
11495 if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
11496 return Op;
11497
11498 return SDValue();
11499}
11500
11501SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
11502 SelectionDAG &DAG) const {
11503 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 4))->get();
11504 SDValue LHS = Op.getOperand(i: 0);
11505 SDValue RHS = Op.getOperand(i: 1);
11506 SDValue TVal = Op.getOperand(i: 2);
11507 SDValue FVal = Op.getOperand(i: 3);
11508 bool HasNoNans = Op->getFlags().hasNoNaNs();
11509 SDLoc DL(Op);
11510 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Users: Op->users(), HasNoNaNs: HasNoNans, DL,
11511 DAG);
11512}
11513
11514SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
11515 SelectionDAG &DAG) const {
11516 SDValue CCVal = Op->getOperand(Num: 0);
11517 SDValue TVal = Op->getOperand(Num: 1);
11518 SDValue FVal = Op->getOperand(Num: 2);
11519 bool HasNoNans = Op->getFlags().hasNoNaNs();
11520 SDLoc DL(Op);
11521
11522 EVT Ty = Op.getValueType();
11523 if (Ty == MVT::aarch64svcount) {
11524 TVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv16i1, Operand: TVal);
11525 FVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv16i1, Operand: FVal);
11526 SDValue Sel =
11527 DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::nxv16i1, N1: CCVal, N2: TVal, N3: FVal);
11528 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Ty, Operand: Sel);
11529 }
11530
11531 if (Ty.isScalableVector()) {
11532 MVT PredVT = MVT::getVectorVT(VT: MVT::i1, EC: Ty.getVectorElementCount());
11533 SDValue SplatPred = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: PredVT, Operand: CCVal);
11534 return DAG.getNode(Opcode: ISD::VSELECT, DL, VT: Ty, N1: SplatPred, N2: TVal, N3: FVal);
11535 }
11536
11537 if (useSVEForFixedLengthVectorVT(VT: Ty, OverrideNEON: !Subtarget->isNeonAvailable())) {
11538 // FIXME: Ideally this would be the same as above using i1 types, however
11539 // for the moment we can't deal with fixed i1 vector types properly, so
11540 // instead extend the predicate to a result type sized integer vector.
11541 MVT SplatValVT = MVT::getIntegerVT(BitWidth: Ty.getScalarSizeInBits());
11542 MVT PredVT = MVT::getVectorVT(VT: SplatValVT, EC: Ty.getVectorElementCount());
11543 SDValue SplatVal = DAG.getSExtOrTrunc(Op: CCVal, DL, VT: SplatValVT);
11544 SDValue SplatPred = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: PredVT, Operand: SplatVal);
11545 return DAG.getNode(Opcode: ISD::VSELECT, DL, VT: Ty, N1: SplatPred, N2: TVal, N3: FVal);
11546 }
11547
11548 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
11549 // instruction.
11550 if (ISD::isOverflowIntrOpRes(Op: CCVal)) {
11551 // Only lower legal XALUO ops.
11552 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: CCVal->getValueType(ResNo: 0)))
11553 return SDValue();
11554
11555 AArch64CC::CondCode OFCC;
11556 SDValue Value, Overflow;
11557 std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC&: OFCC, Op: CCVal.getValue(R: 0), DAG);
11558 SDValue CCVal = DAG.getConstant(Val: OFCC, DL, VT: MVT::i32);
11559
11560 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: Op.getValueType(), N1: TVal, N2: FVal,
11561 N3: CCVal, N4: Overflow);
11562 }
11563
11564 // Lower it the same way as we would lower a SELECT_CC node.
11565 ISD::CondCode CC;
11566 SDValue LHS, RHS;
11567 if (CCVal.getOpcode() == ISD::SETCC) {
11568 LHS = CCVal.getOperand(i: 0);
11569 RHS = CCVal.getOperand(i: 1);
11570 CC = cast<CondCodeSDNode>(Val: CCVal.getOperand(i: 2))->get();
11571 } else {
11572 LHS = CCVal;
11573 RHS = DAG.getConstant(Val: 0, DL, VT: CCVal.getValueType());
11574 CC = ISD::SETNE;
11575 }
11576
11577 // If we are lowering an f16/bf16 and we do not have full FP16 support,
11578 // convert to an f32 in order to use FCSELSrrr.
11579 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
11580 TVal = DAG.getTargetInsertSubreg(SRIdx: AArch64::hsub, DL, VT: MVT::f32,
11581 Operand: DAG.getUNDEF(VT: MVT::f32), Subreg: TVal);
11582 FVal = DAG.getTargetInsertSubreg(SRIdx: AArch64::hsub, DL, VT: MVT::f32,
11583 Operand: DAG.getUNDEF(VT: MVT::f32), Subreg: FVal);
11584 }
11585
11586 SDValue Res =
11587 LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Users: Op->users(), HasNoNaNs: HasNoNans, DL, DAG);
11588
11589 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
11590 return DAG.getTargetExtractSubreg(SRIdx: AArch64::hsub, DL, VT: Ty, Operand: Res);
11591 }
11592
11593 return Res;
11594}
11595
11596SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
11597 SelectionDAG &DAG) const {
11598 // Jump table entries are PC-relative offsets. No additional tweaking
11599 // is necessary here; just get the address of the jump table.
11600 JumpTableSDNode *JT = cast<JumpTableSDNode>(Val&: Op);
11601
11602 CodeModel::Model CM = getTargetMachine().getCodeModel();
11603 if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
11604 !Subtarget->isTargetMachO())
11605 return getAddrLarge(N: JT, DAG);
11606 if (CM == CodeModel::Tiny)
11607 return getAddrTiny(N: JT, DAG);
11608 return getAddr(N: JT, DAG);
11609}
11610
11611SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
11612 SelectionDAG &DAG) const {
11613 // Jump table entries are PC-relative offsets. No additional tweaking
11614 // is necessary here; just get the address of the jump table.
11615 SDLoc DL(Op);
11616 SDValue JT = Op.getOperand(i: 1);
11617 SDValue Entry = Op.getOperand(i: 2);
11618 int JTI = cast<JumpTableSDNode>(Val: JT.getNode())->getIndex();
11619
11620 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
11621 AFI->setJumpTableEntryInfo(Idx: JTI, Size: 4, PCRelSym: nullptr);
11622
11623 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
11624 // sequence later, to guarantee the integrity of the intermediate values.
11625 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
11626 Kind: "aarch64-jump-table-hardening")) {
11627 CodeModel::Model CM = getTargetMachine().getCodeModel();
11628 if (Subtarget->isTargetMachO()) {
11629 if (CM != CodeModel::Small && CM != CodeModel::Large)
11630 report_fatal_error(reason: "Unsupported code-model for hardened jump-table");
11631 } else {
11632 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
11633 assert(Subtarget->isTargetELF() &&
11634 "jump table hardening only supported on MachO/ELF");
11635 if (CM != CodeModel::Small)
11636 report_fatal_error(reason: "Unsupported code-model for hardened jump-table");
11637 }
11638
11639 SDValue X16Copy = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: DL, Reg: AArch64::X16,
11640 N: Entry, Glue: SDValue());
11641 SDNode *B = DAG.getMachineNode(Opcode: AArch64::BR_JumpTable, dl: DL, VT: MVT::Other,
11642 Op1: DAG.getTargetJumpTable(JTI, VT: MVT::i32),
11643 Op2: X16Copy.getValue(R: 0), Op3: X16Copy.getValue(R: 1));
11644 return SDValue(B, 0);
11645 }
11646
11647 SDNode *Dest =
11648 DAG.getMachineNode(Opcode: AArch64::JumpTableDest32, dl: DL, VT1: MVT::i64, VT2: MVT::i64, Op1: JT,
11649 Op2: Entry, Op3: DAG.getTargetJumpTable(JTI, VT: MVT::i32));
11650 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Chain: Op.getOperand(i: 0), DL);
11651 return DAG.getNode(Opcode: ISD::BRIND, DL, VT: MVT::Other, N1: JTInfo, N2: SDValue(Dest, 0));
11652}
11653
11654SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
11655 SDValue Chain = Op.getOperand(i: 0);
11656 SDValue Dest = Op.getOperand(i: 1);
11657
11658 // BR_JT is lowered to BRIND, but the latter lowering is specific to indirectbr.
11659 // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
11660 if (Dest->isMachineOpcode() &&
11661 Dest->getMachineOpcode() == AArch64::JumpTableDest32)
11662 return SDValue();
11663
11664 const MachineFunction &MF = DAG.getMachineFunction();
11665 std::optional<uint16_t> BADisc =
11666 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(ParentFn: MF.getFunction());
11667 if (!BADisc)
11668 return SDValue();
11669
11670 SDLoc DL(Op);
11671
11672 SDValue Disc = DAG.getTargetConstant(Val: *BADisc, DL, VT: MVT::i64);
11673 SDValue Key = DAG.getTargetConstant(Val: AArch64PACKey::IA, DL, VT: MVT::i32);
11674 SDValue AddrDisc = DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64);
11675
11676 SDNode *BrA = DAG.getMachineNode(Opcode: AArch64::BRA, dl: DL, VT: MVT::Other,
11677 Ops: {Dest, Key, Disc, AddrDisc, Chain});
11678 return SDValue(BrA, 0);
11679}
11680
11681SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
11682 SelectionDAG &DAG) const {
11683 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Val&: Op);
11684 CodeModel::Model CM = getTargetMachine().getCodeModel();
11685 if (CM == CodeModel::Large) {
11686 // Use the GOT for the large code model on iOS.
11687 if (Subtarget->isTargetMachO()) {
11688 return getGOT(N: CP, DAG);
11689 }
11690 if (!getTargetMachine().isPositionIndependent())
11691 return getAddrLarge(N: CP, DAG);
11692 } else if (CM == CodeModel::Tiny) {
11693 return getAddrTiny(N: CP, DAG);
11694 }
11695 return getAddr(N: CP, DAG);
11696}
11697
11698SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
11699 SelectionDAG &DAG) const {
11700 BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Val&: Op);
11701 const BlockAddress *BA = BAN->getBlockAddress();
11702
11703 if (std::optional<uint16_t> BADisc =
11704 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(
11705 ParentFn: *BA->getFunction())) {
11706 SDLoc DL(Op);
11707
11708 // This isn't cheap, but BRIND is rare.
11709 SDValue TargetBA = DAG.getTargetBlockAddress(BA, VT: BAN->getValueType(ResNo: 0));
11710
11711 SDValue Disc = DAG.getTargetConstant(Val: *BADisc, DL, VT: MVT::i64);
11712
11713 SDValue Key = DAG.getTargetConstant(Val: AArch64PACKey::IA, DL, VT: MVT::i32);
11714 SDValue AddrDisc = DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64);
11715
11716 SDNode *MOV =
11717 DAG.getMachineNode(Opcode: AArch64::MOVaddrPAC, dl: DL, ResultTys: {MVT::Other, MVT::Glue},
11718 Ops: {TargetBA, Key, AddrDisc, Disc});
11719 return DAG.getCopyFromReg(Chain: SDValue(MOV, 0), dl: DL, Reg: AArch64::X16, VT: MVT::i64,
11720 Glue: SDValue(MOV, 1));
11721 }
11722
11723 CodeModel::Model CM = getTargetMachine().getCodeModel();
11724 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
11725 if (!getTargetMachine().isPositionIndependent())
11726 return getAddrLarge(N: BAN, DAG);
11727 } else if (CM == CodeModel::Tiny) {
11728 return getAddrTiny(N: BAN, DAG);
11729 }
11730 return getAddr(N: BAN, DAG);
11731}
11732
11733SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
11734 SelectionDAG &DAG) const {
11735 AArch64FunctionInfo *FuncInfo =
11736 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
11737
11738 SDLoc DL(Op);
11739 SDValue FR = DAG.getFrameIndex(FI: FuncInfo->getVarArgsStackIndex(),
11740 VT: getPointerTy(DL: DAG.getDataLayout()));
11741 FR = DAG.getZExtOrTrunc(Op: FR, DL, VT: getPointerMemTy(DL: DAG.getDataLayout()));
11742 const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
11743 return DAG.getStore(Chain: Op.getOperand(i: 0), dl: DL, Val: FR, Ptr: Op.getOperand(i: 1),
11744 PtrInfo: MachinePointerInfo(SV));
11745}
11746
11747SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
11748 SelectionDAG &DAG) const {
11749 MachineFunction &MF = DAG.getMachineFunction();
11750 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
11751
11752 SDLoc DL(Op);
11753 SDValue FR;
11754 if (Subtarget->isWindowsArm64EC()) {
11755 // With the Arm64EC ABI, we compute the address of the varargs save area
11756 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
11757 // but calls from an entry thunk can pass in a different address.
11758 Register VReg = MF.addLiveIn(PReg: AArch64::X4, RC: &AArch64::GPR64RegClass);
11759 SDValue Val = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: VReg, VT: MVT::i64);
11760 uint64_t StackOffset;
11761 if (FuncInfo->getVarArgsGPRSize() > 0)
11762 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
11763 else
11764 StackOffset = FuncInfo->getVarArgsStackOffset();
11765 FR = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Val,
11766 N2: DAG.getConstant(Val: StackOffset, DL, VT: MVT::i64));
11767 } else {
11768 FR = DAG.getFrameIndex(FI: FuncInfo->getVarArgsGPRSize() > 0
11769 ? FuncInfo->getVarArgsGPRIndex()
11770 : FuncInfo->getVarArgsStackIndex(),
11771 VT: getPointerTy(DL: DAG.getDataLayout()));
11772 }
11773 const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
11774 return DAG.getStore(Chain: Op.getOperand(i: 0), dl: DL, Val: FR, Ptr: Op.getOperand(i: 1),
11775 PtrInfo: MachinePointerInfo(SV));
11776}
11777
11778SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
11779 SelectionDAG &DAG) const {
11780 // The layout of the va_list struct is specified in the AArch64 Procedure Call
11781 // Standard, section B.3.
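// Roughly:
//   struct va_list {
//     void *__stack;   // next stack argument
//     void *__gr_top;  // end of the GP register save area
//     void *__vr_top;  // end of the FP/SIMD register save area
//     int   __gr_offs; // negative offset from __gr_top to the next GP arg
//     int   __vr_offs; // negative offset from __vr_top to the next FP/SIMD arg
//   };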
11782 MachineFunction &MF = DAG.getMachineFunction();
11783 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
11784 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
11785 auto PtrMemVT = getPointerMemTy(DL: DAG.getDataLayout());
11786 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
11787 SDLoc DL(Op);
11788
11789 SDValue Chain = Op.getOperand(i: 0);
11790 SDValue VAList = Op.getOperand(i: 1);
11791 const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
11792 SmallVector<SDValue, 4> MemOps;
11793
11794 // void *__stack at offset 0
11795 unsigned Offset = 0;
11796 SDValue Stack = DAG.getFrameIndex(FI: FuncInfo->getVarArgsStackIndex(), VT: PtrVT);
11797 Stack = DAG.getZExtOrTrunc(Op: Stack, DL, VT: PtrMemVT);
11798 MemOps.push_back(Elt: DAG.getStore(Chain, dl: DL, Val: Stack, Ptr: VAList,
11799 PtrInfo: MachinePointerInfo(SV), Alignment: Align(PtrSize)));
11800
11801 // void *__gr_top at offset 8 (4 on ILP32)
11802 Offset += PtrSize;
11803 int GPRSize = FuncInfo->getVarArgsGPRSize();
11804 if (GPRSize > 0) {
11805 SDValue GRTop, GRTopAddr;
11806
11807 GRTopAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
11808 N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT));
11809
11810 GRTop = DAG.getFrameIndex(FI: FuncInfo->getVarArgsGPRIndex(), VT: PtrVT);
11811 GRTop = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: GRTop,
11812 N2: DAG.getSignedConstant(Val: GPRSize, DL, VT: PtrVT));
11813 GRTop = DAG.getZExtOrTrunc(Op: GRTop, DL, VT: PtrMemVT);
11814
11815 MemOps.push_back(Elt: DAG.getStore(Chain, dl: DL, Val: GRTop, Ptr: GRTopAddr,
11816 PtrInfo: MachinePointerInfo(SV, Offset),
11817 Alignment: Align(PtrSize)));
11818 }
11819
11820 // void *__vr_top at offset 16 (8 on ILP32)
11821 Offset += PtrSize;
11822 int FPRSize = FuncInfo->getVarArgsFPRSize();
11823 if (FPRSize > 0) {
11824 SDValue VRTop, VRTopAddr;
11825 VRTopAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
11826 N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT));
11827
11828 VRTop = DAG.getFrameIndex(FI: FuncInfo->getVarArgsFPRIndex(), VT: PtrVT);
11829 VRTop = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VRTop,
11830 N2: DAG.getSignedConstant(Val: FPRSize, DL, VT: PtrVT));
11831 VRTop = DAG.getZExtOrTrunc(Op: VRTop, DL, VT: PtrMemVT);
11832
11833 MemOps.push_back(Elt: DAG.getStore(Chain, dl: DL, Val: VRTop, Ptr: VRTopAddr,
11834 PtrInfo: MachinePointerInfo(SV, Offset),
11835 Alignment: Align(PtrSize)));
11836 }
11837
11838 // int __gr_offs at offset 24 (12 on ILP32)
11839 Offset += PtrSize;
11840 SDValue GROffsAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
11841 N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT));
11842 MemOps.push_back(
11843 Elt: DAG.getStore(Chain, dl: DL, Val: DAG.getSignedConstant(Val: -GPRSize, DL, VT: MVT::i32),
11844 Ptr: GROffsAddr, PtrInfo: MachinePointerInfo(SV, Offset), Alignment: Align(4)));
11845
11846 // int __vr_offs at offset 28 (16 on ILP32)
11847 Offset += 4;
11848 SDValue VROffsAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
11849 N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT));
11850 MemOps.push_back(
11851 Elt: DAG.getStore(Chain, dl: DL, Val: DAG.getSignedConstant(Val: -FPRSize, DL, VT: MVT::i32),
11852 Ptr: VROffsAddr, PtrInfo: MachinePointerInfo(SV, Offset), Alignment: Align(4)));
11853
11854 return DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOps);
11855}
11856
11857SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
11858 SelectionDAG &DAG) const {
11859 MachineFunction &MF = DAG.getMachineFunction();
11860 Function &F = MF.getFunction();
11861
11862 if (Subtarget->isCallingConvWin64(CC: F.getCallingConv(), IsVarArg: F.isVarArg()))
11863 return LowerWin64_VASTART(Op, DAG);
11864 else if (Subtarget->isTargetDarwin())
11865 return LowerDarwin_VASTART(Op, DAG);
11866 else
11867 return LowerAAPCS_VASTART(Op, DAG);
11868}
11869
11870SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
11871 SelectionDAG &DAG) const {
11872 // AAPCS has three pointers and two ints (= 32 bytes); Darwin has a single
11873 // pointer.
11874 SDLoc DL(Op);
11875 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
11876 unsigned VaListSize =
11877 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
11878 ? PtrSize
11879 : Subtarget->isTargetILP32() ? 20 : 32;
11880 const Value *DestSV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 3))->getValue();
11881 const Value *SrcSV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 4))->getValue();
11882
11883 return DAG.getMemcpy(Chain: Op.getOperand(i: 0), dl: DL, Dst: Op.getOperand(i: 1), Src: Op.getOperand(i: 2),
11884 Size: DAG.getConstant(Val: VaListSize, DL, VT: MVT::i32),
11885 Alignment: Align(PtrSize), isVol: false, AlwaysInline: false, /*CI=*/nullptr,
11886 OverrideTailCall: std::nullopt, DstPtrInfo: MachinePointerInfo(DestSV),
11887 SrcPtrInfo: MachinePointerInfo(SrcSV));
11888}
11889
11890SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
11891 assert(Subtarget->isTargetDarwin() &&
11892 "automatic va_arg instruction only works on Darwin");
11893
11894 const Value *V = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
11895 EVT VT = Op.getValueType();
11896 SDLoc DL(Op);
11897 SDValue Chain = Op.getOperand(i: 0);
11898 SDValue Addr = Op.getOperand(i: 1);
11899 MaybeAlign Align(Op.getConstantOperandVal(i: 3));
11900 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
11901 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
11902 auto PtrMemVT = getPointerMemTy(DL: DAG.getDataLayout());
11903 SDValue VAList =
11904 DAG.getLoad(VT: PtrMemVT, dl: DL, Chain, Ptr: Addr, PtrInfo: MachinePointerInfo(V));
11905 Chain = VAList.getValue(R: 1);
11906 VAList = DAG.getZExtOrTrunc(Op: VAList, DL, VT: PtrVT);
11907
11908 if (VT.isScalableVector())
11909 report_fatal_error(reason: "Passing SVE types to variadic functions is "
11910 "currently not supported");
11911
11912 if (Align && *Align > MinSlotSize) {
11913 VAList = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
11914 N2: DAG.getConstant(Val: Align->value() - 1, DL, VT: PtrVT));
11915 VAList =
11916 DAG.getNode(Opcode: ISD::AND, DL, VT: PtrVT, N1: VAList,
11917 N2: DAG.getSignedConstant(Val: -(int64_t)Align->value(), DL, VT: PtrVT));
11918 }
11919
11920 Type *ArgTy = VT.getTypeForEVT(Context&: *DAG.getContext());
11921 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(Ty: ArgTy);
11922
11923 // Scalar integer and FP values smaller than 64 bits are implicitly extended
11924 // up to 64 bits. At the very least, we have to increase the striding of the
11925 // vaargs list to match this, and for FP values we need to introduce
11926 // FP_ROUND nodes as well.
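// For example, an i32 argument still advances the list by a full 8-byte slot
// (4 bytes on ILP32), and a float is loaded as an f64 and then narrowed with
// FP_ROUND.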
11927 if (VT.isInteger() && !VT.isVector())
11928 ArgSize = std::max(a: ArgSize, b: MinSlotSize);
11929 bool NeedFPTrunc = false;
11930 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
11931 ArgSize = 8;
11932 NeedFPTrunc = true;
11933 }
11934
11935 // Increment the pointer, VAList, to the next vaarg
11936 SDValue VANext = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
11937 N2: DAG.getConstant(Val: ArgSize, DL, VT: PtrVT));
11938 VANext = DAG.getZExtOrTrunc(Op: VANext, DL, VT: PtrMemVT);
11939
11940 // Store the incremented VAList to the legalized pointer
11941 SDValue APStore =
11942 DAG.getStore(Chain, dl: DL, Val: VANext, Ptr: Addr, PtrInfo: MachinePointerInfo(V));
11943
11944 // Load the actual argument out of the pointer VAList
11945 if (NeedFPTrunc) {
11946 // Load the value as an f64.
11947 SDValue WideFP =
11948 DAG.getLoad(VT: MVT::f64, dl: DL, Chain: APStore, Ptr: VAList, PtrInfo: MachinePointerInfo());
11949 // Round the value down to an f32.
11950 SDValue NarrowFP =
11951 DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: WideFP.getValue(R: 0),
11952 N2: DAG.getIntPtrConstant(Val: 1, DL, /*isTarget=*/true));
11953 SDValue Ops[] = { NarrowFP, WideFP.getValue(R: 1) };
11954 // Merge the rounded value with the chain output of the load.
11955 return DAG.getMergeValues(Ops, dl: DL);
11956 }
11957
11958 return DAG.getLoad(VT, dl: DL, Chain: APStore, Ptr: VAList, PtrInfo: MachinePointerInfo());
11959}
11960
11961SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
11962 SelectionDAG &DAG) const {
11963 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
11964 MFI.setFrameAddressIsTaken(true);
11965
11966 EVT VT = Op.getValueType();
11967 SDLoc DL(Op);
11968 unsigned Depth = Op.getConstantOperandVal(i: 0);
11969 SDValue FrameAddr =
11970 DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: AArch64::FP, VT: MVT::i64);
11971 while (Depth--)
11972 FrameAddr = DAG.getLoad(VT, dl: DL, Chain: DAG.getEntryNode(), Ptr: FrameAddr,
11973 PtrInfo: MachinePointerInfo());
11974
11975 if (Subtarget->isTargetILP32())
11976 FrameAddr = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: MVT::i64, N1: FrameAddr,
11977 N2: DAG.getValueType(VT));
11978
11979 return FrameAddr;
11980}
11981
11982SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
11983 SelectionDAG &DAG) const {
11984 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
11985
11986 EVT VT = getPointerTy(DL: DAG.getDataLayout());
11987 int FI = MFI.CreateFixedObject(Size: 4, SPOffset: 0, IsImmutable: false);
11988 return DAG.getFrameIndex(FI, VT);
11989}
11990
11991#define GET_REGISTER_MATCHER
11992#include "AArch64GenAsmMatcher.inc"
11993
11994// FIXME? Maybe this could be a TableGen attribute on some registers and
11995// this table could be generated automatically from RegInfo.
11996Register AArch64TargetLowering::
11997getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
11998 Register Reg = MatchRegisterName(Name: RegName);
11999 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
12000 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
12001 unsigned DwarfRegNum = MRI->getDwarfRegNum(RegNum: Reg, isEH: false);
12002 if (!Subtarget->isXRegisterReserved(i: DwarfRegNum) &&
12003 !MRI->isReservedReg(MF, Reg))
12004 Reg = Register();
12005 }
12006 return Reg;
12007}
12008
12009SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
12010 SelectionDAG &DAG) const {
12011 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
12012
12013 EVT VT = Op.getValueType();
12014 SDLoc DL(Op);
12015
12016 SDValue FrameAddr =
12017 DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: AArch64::FP, VT);
12018 SDValue Offset = DAG.getConstant(Val: 8, DL, VT: getPointerTy(DL: DAG.getDataLayout()));
12019
12020 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: FrameAddr, N2: Offset);
12021}
12022
12023SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
12024 SelectionDAG &DAG) const {
12025 MachineFunction &MF = DAG.getMachineFunction();
12026 MachineFrameInfo &MFI = MF.getFrameInfo();
12027 MFI.setReturnAddressIsTaken(true);
12028
12029 EVT VT = Op.getValueType();
12030 SDLoc DL(Op);
12031 unsigned Depth = Op.getConstantOperandVal(i: 0);
12032 SDValue ReturnAddress;
12033 if (Depth) {
12034 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
12035 SDValue Offset = DAG.getConstant(Val: 8, DL, VT: getPointerTy(DL: DAG.getDataLayout()));
12036 ReturnAddress = DAG.getLoad(
12037 VT, dl: DL, Chain: DAG.getEntryNode(),
12038 Ptr: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: FrameAddr, N2: Offset), PtrInfo: MachinePointerInfo());
12039 } else {
12040 // Return LR, which contains the return address. Mark it an implicit
12041 // live-in.
12042 Register Reg = MF.addLiveIn(PReg: AArch64::LR, RC: &AArch64::GPR64RegClass);
12043 ReturnAddress = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg, VT);
12044 }
12045
12046 // The XPACLRI instruction assembles to a hint-space instruction before
12047 // Armv8.3-A, and can therefore be used safely on any pre-Armv8.3-A
12048 // architecture. On Armv8.3-A and onwards XPACI is available, so use that
12049 // instead.
12050 SDNode *St;
12051 if (Subtarget->hasPAuth()) {
12052 St = DAG.getMachineNode(Opcode: AArch64::XPACI, dl: DL, VT, Op1: ReturnAddress);
12053 } else {
12054 // XPACLRI operates on LR therefore we must move the operand accordingly.
12055 SDValue Chain =
12056 DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: DL, Reg: AArch64::LR, N: ReturnAddress);
12057 St = DAG.getMachineNode(Opcode: AArch64::XPACLRI, dl: DL, VT, Op1: Chain);
12058 }
12059 return SDValue(St, 0);
12060}
12061
12062 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
12063 /// i64 values and take a 2 x i64 value to shift plus a shift amount.
12064SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
12065 SelectionDAG &DAG) const {
12066 SDValue Lo, Hi;
12067 expandShiftParts(N: Op.getNode(), Lo, Hi, DAG);
12068 return DAG.getMergeValues(Ops: {Lo, Hi}, dl: SDLoc(Op));
12069}
12070
12071bool AArch64TargetLowering::isOffsetFoldingLegal(
12072 const GlobalAddressSDNode *GA) const {
12073 // Offsets are folded in the DAG combine rather than here so that we can
12074 // intelligently choose an offset based on the uses.
12075 return false;
12076}
12077
12078bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
12079 bool OptForSize) const {
12080 bool IsLegal = false;
12081 // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
12082 // and for the 16-bit case when the target has full fp16 support.
12083 // We encode bf16 bit patterns as if they were fp16. This results in very
12084 // strange looking assembly but should populate the register with appropriate
12085 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
12086 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
12087 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
12088 // FIXME: We should be able to handle f128 as well with a clever lowering.
12089 const APInt ImmInt = Imm.bitcastToAPInt();
12090 if (VT == MVT::f64)
12091 IsLegal = AArch64_AM::getFP64Imm(Imm: ImmInt) != -1 || Imm.isPosZero();
12092 else if (VT == MVT::f32)
12093 IsLegal = AArch64_AM::getFP32Imm(Imm: ImmInt) != -1 || Imm.isPosZero();
12094 else if (VT == MVT::f16 || VT == MVT::bf16)
12095 IsLegal =
12096 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(Imm: ImmInt) != -1) ||
12097 Imm.isPosZero();
12098
12099 // If we cannot materialize the value in the immediate field for fmov, check
12100 // if it can be encoded as the immediate operand of a logical instruction.
12101 // The immediate value will be created with either MOVZ, MOVN, or ORR.
12102 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
12103 // generate that fmov.
12104 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
12105 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
12106 // however the mov+fmov sequence is always better because of the reduced
12107 // cache pressure. The timings are still the same if you consider
12108 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
12109 // movw+movk is fused). So we limit ourselves to at most 2 instructions.
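// For example, single-precision pi (bit pattern 0x40490FDB) is not a valid
// fmov immediate, but its bits expand to MOVZ+MOVK, which fits the default
// 2-instruction budget, so it is still considered legal here.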
12110 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
12111 AArch64_IMM::expandMOVImm(Imm: ImmInt.getZExtValue(), BitSize: VT.getSizeInBits(), Insn);
12112 assert(Insn.size() <= 4 &&
12113 "Should be able to build any value with at most 4 moves");
12114 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
12115 IsLegal = Insn.size() <= Limit;
12116 }
12117
12118 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
12119 << " imm value: "; Imm.dump(););
12120 return IsLegal;
12121}
12122
12123//===----------------------------------------------------------------------===//
12124// AArch64 Optimization Hooks
12125//===----------------------------------------------------------------------===//
12126
12127static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
12128 SDValue Operand, SelectionDAG &DAG,
12129 int &ExtraSteps) {
12130 EVT VT = Operand.getValueType();
12131 if ((ST->hasNEON() &&
12132 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
12133 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
12134 VT == MVT::v4f32)) ||
12135 (ST->hasSVE() &&
12136 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
12137 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
12138 // For the reciprocal estimates, convergence is quadratic, so the number
12139 // of digits is doubled after each iteration. In ARMv8, the accuracy of
12140 // the initial estimate is 2^-8. Thus the number of extra steps to refine
12141 // the result for float (23 mantissa bits) is 2 and for double (52
12142 // mantissa bits) is 3.
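// For example, float has 24 bits of precision (including the implicit bit):
// ceil(log2(24)) - ceil(log2(8)) = 5 - 3 = 2 extra steps; double has 53 bits:
// 6 - 3 = 3 extra steps.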
12143 constexpr unsigned AccurateBits = 8;
12144 unsigned DesiredBits = APFloat::semanticsPrecision(VT.getFltSemantics());
12145 ExtraSteps = DesiredBits <= AccurateBits
12146 ? 0
12147 : Log2_64_Ceil(Value: DesiredBits) - Log2_64_Ceil(Value: AccurateBits);
12148 }
12149
12150 return DAG.getNode(Opcode, DL: SDLoc(Operand), VT, Operand);
12151 }
12152
12153 return SDValue();
12154}
12155
12156SDValue
12157AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
12158 const DenormalMode &Mode) const {
12159 SDLoc DL(Op);
12160 EVT VT = Op.getValueType();
12161 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), C&: *DAG.getContext(), VT);
12162 SDValue FPZero = DAG.getConstantFP(Val: 0.0, DL, VT);
12163 return DAG.getSetCC(DL, VT: CCVT, LHS: Op, RHS: FPZero, Cond: ISD::SETEQ);
12164}
12165
12166SDValue
12167AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
12168 SelectionDAG &DAG) const {
12169 return Op;
12170}
12171
12172SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
12173 SelectionDAG &DAG, int Enabled,
12174 int &ExtraSteps,
12175 bool &UseOneConst,
12176 bool Reciprocal) const {
12177 if (Enabled == ReciprocalEstimate::Enabled ||
12178 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
12179 if (SDValue Estimate = getEstimate(ST: Subtarget, Opcode: AArch64ISD::FRSQRTE, Operand,
12180 DAG, ExtraSteps)) {
12181 SDLoc DL(Operand);
12182 EVT VT = Operand.getValueType();
12183
12184 SDNodeFlags Flags = SDNodeFlags::AllowReassociation;
12185
12186 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
12187 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
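// Below, each trip computes Step = FRSQRTS(X, E*E) = 0.5 * (3 - X*E*E) and
// then refines the estimate as E = E * Step.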
12188 for (int i = ExtraSteps; i > 0; --i) {
12189 SDValue Step = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Estimate, N2: Estimate,
12190 Flags);
12191 Step = DAG.getNode(Opcode: AArch64ISD::FRSQRTS, DL, VT, N1: Operand, N2: Step, Flags);
12192 Estimate = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Estimate, N2: Step, Flags);
12193 }
12194 if (!Reciprocal)
12195 Estimate = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Operand, N2: Estimate, Flags);
12196
12197 ExtraSteps = 0;
12198 return Estimate;
12199 }
12200
12201 return SDValue();
12202}
12203
12204SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
12205 SelectionDAG &DAG, int Enabled,
12206 int &ExtraSteps) const {
12207 if (Enabled == ReciprocalEstimate::Enabled)
12208 if (SDValue Estimate = getEstimate(ST: Subtarget, Opcode: AArch64ISD::FRECPE, Operand,
12209 DAG, ExtraSteps)) {
12210 SDLoc DL(Operand);
12211 EVT VT = Operand.getValueType();
12212
12213 SDNodeFlags Flags = SDNodeFlags::AllowReassociation;
12214
12215 // Newton reciprocal iteration: E * (2 - X * E)
12216 // AArch64 reciprocal iteration instruction: (2 - M * N)
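// Below, each trip computes Step = FRECPS(X, E) = 2 - X*E and then refines
// the estimate as E = E * Step.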
12217 for (int i = ExtraSteps; i > 0; --i) {
12218 SDValue Step = DAG.getNode(Opcode: AArch64ISD::FRECPS, DL, VT, N1: Operand,
12219 N2: Estimate, Flags);
12220 Estimate = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Estimate, N2: Step, Flags);
12221 }
12222
12223 ExtraSteps = 0;
12224 return Estimate;
12225 }
12226
12227 return SDValue();
12228}
12229
12230//===----------------------------------------------------------------------===//
12231// AArch64 Inline Assembly Support
12232//===----------------------------------------------------------------------===//
12233
12234// Table of Constraints
12235// TODO: This is the current set of constraints supported by ARM for the
12236 // compiler; not all of them may make sense.
12237//
12238// r - A general register
12239// w - An FP/SIMD register of some size in the range v0-v31
12240// x - An FP/SIMD register of some size in the range v0-v15
12241// I - Constant that can be used with an ADD instruction
12242// J - Constant that can be used with a SUB instruction
12243// K - Constant that can be used with a 32-bit logical instruction
12244// L - Constant that can be used with a 64-bit logical instruction
12245// M - Constant that can be used as a 32-bit MOV immediate
12246// N - Constant that can be used as a 64-bit MOV immediate
12247// Q - A memory reference with base register and no offset
12248// S - A symbolic address
12249// Y - Floating point constant zero
12250// Z - Integer constant zero
12251//
12252// Note that general register operands will be output using their 64-bit x
12253// register name, whatever the size of the variable, unless the asm operand
12254// is prefixed by the %w modifier. Floating-point and SIMD register operands
12255// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
12256// %q modifier.
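// For example:
//   asm("add %w0, %w1, %2" : "=r"(Res) : "r"(In), "I"(42));
// uses the 'I' ADD-immediate constraint and the %w modifier to get 32-bit
// register names for the 'r' operands (Res and In being arbitrary locals).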
12257const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
12258 // At this point, we have to lower this constraint to something else, so we
12259 // lower it to an "r" or "w". However, by doing this we will force the result
12260 // to be in a register, while the X constraint is much more permissive.
12261 //
12262 // Although we are correct (we are free to emit anything, without
12263 // constraints), we might break use cases that would expect us to be more
12264 // efficient and emit something else.
12265 if (!Subtarget->hasFPARMv8())
12266 return "r";
12267
12268 if (ConstraintVT.isFloatingPoint())
12269 return "w";
12270
12271 if (ConstraintVT.isVector() &&
12272 (ConstraintVT.getSizeInBits() == 64 ||
12273 ConstraintVT.getSizeInBits() == 128))
12274 return "w";
12275
12276 return "r";
12277}
12278
12279enum class PredicateConstraint { Uph, Upl, Upa };
12280
12281 // Returns a {Reg, RegisterClass} tuple if the constraint names a specific
12282 // SVE predicate, predicate-as-counter, or data register.
12283//
12284 // For a constraint like "{pn3}", the default path in
12285// TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
12286// suitable register class for this register is "PPRorPNR", after which it
12287// determines that nxv16i1 is an appropriate type for the constraint, which is
12288// not what we want. The code here pre-empts this by matching the register
12289// explicitly.
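// For example, "{z3}" maps to Z3 in ZPRRegClass, "{p0}" to P0 in PPRRegClass,
// and "{pn8}" to PN8 in PNRRegClass.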
12290static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
12291parseSVERegAsConstraint(StringRef Constraint) {
12292 if (!Constraint.starts_with(Prefix: '{') || !Constraint.ends_with(Suffix: '}') ||
12293 (Constraint[1] != 'p' && Constraint[1] != 'z'))
12294 return std::nullopt;
12295
12296 bool IsPredicate = Constraint[1] == 'p';
12297 Constraint = Constraint.substr(Start: 2, N: Constraint.size() - 3);
12298 bool IsPredicateAsCount = IsPredicate && Constraint.starts_with(Prefix: "n");
12299 if (IsPredicateAsCount)
12300 Constraint = Constraint.drop_front(N: 1);
12301
12302 unsigned V;
12303 if (Constraint.getAsInteger(Radix: 10, Result&: V) || V > 31)
12304 return std::nullopt;
12305
12306 if (IsPredicateAsCount)
12307 return std::make_pair(x: AArch64::PN0 + V, y: &AArch64::PNRRegClass);
12308 if (IsPredicate)
12309 return std::make_pair(x: AArch64::P0 + V, y: &AArch64::PPRRegClass);
12310 return std::make_pair(x: AArch64::Z0 + V, y: &AArch64::ZPRRegClass);
12311}
12312
12313static std::optional<PredicateConstraint>
12314parsePredicateConstraint(StringRef Constraint) {
12315 return StringSwitch<std::optional<PredicateConstraint>>(Constraint)
12316 .Case(S: "Uph", Value: PredicateConstraint::Uph)
12317 .Case(S: "Upl", Value: PredicateConstraint::Upl)
12318 .Case(S: "Upa", Value: PredicateConstraint::Upa)
12319 .Default(Value: std::nullopt);
12320}
12321
12322static const TargetRegisterClass *
12323getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
12324 if (VT != MVT::aarch64svcount &&
12325 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
12326 return nullptr;
12327
12328 switch (Constraint) {
12329 case PredicateConstraint::Uph:
12330 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
12331 : &AArch64::PPR_p8to15RegClass;
12332 case PredicateConstraint::Upl:
12333 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
12334 : &AArch64::PPR_3bRegClass;
12335 case PredicateConstraint::Upa:
12336 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
12337 : &AArch64::PPRRegClass;
12338 }
12339
12340 llvm_unreachable("Missing PredicateConstraint!");
12341}
12342
12343enum class ReducedGprConstraint { Uci, Ucj };
12344
12345static std::optional<ReducedGprConstraint>
12346parseReducedGprConstraint(StringRef Constraint) {
12347 return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint)
12348 .Case(S: "Uci", Value: ReducedGprConstraint::Uci)
12349 .Case(S: "Ucj", Value: ReducedGprConstraint::Ucj)
12350 .Default(Value: std::nullopt);
12351}
12352
12353static const TargetRegisterClass *
12354getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
12355 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
12356 return nullptr;
12357
12358 switch (Constraint) {
12359 case ReducedGprConstraint::Uci:
12360 return &AArch64::MatrixIndexGPR32_8_11RegClass;
12361 case ReducedGprConstraint::Ucj:
12362 return &AArch64::MatrixIndexGPR32_12_15RegClass;
12363 }
12364
12365 llvm_unreachable("Missing ReducedGprConstraint!");
12366}
12367
12368 // The set of cc codes supported is from
12369// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
12370static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) {
12371 AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint)
12372 .Case(S: "{@cchi}", Value: AArch64CC::HI)
12373 .Case(S: "{@cccs}", Value: AArch64CC::HS)
12374 .Case(S: "{@cclo}", Value: AArch64CC::LO)
12375 .Case(S: "{@ccls}", Value: AArch64CC::LS)
12376 .Case(S: "{@cccc}", Value: AArch64CC::LO)
12377 .Case(S: "{@cceq}", Value: AArch64CC::EQ)
12378 .Case(S: "{@ccgt}", Value: AArch64CC::GT)
12379 .Case(S: "{@ccge}", Value: AArch64CC::GE)
12380 .Case(S: "{@cclt}", Value: AArch64CC::LT)
12381 .Case(S: "{@ccle}", Value: AArch64CC::LE)
12382 .Case(S: "{@cchs}", Value: AArch64CC::HS)
12383 .Case(S: "{@ccne}", Value: AArch64CC::NE)
12384 .Case(S: "{@ccvc}", Value: AArch64CC::VC)
12385 .Case(S: "{@ccpl}", Value: AArch64CC::PL)
12386 .Case(S: "{@ccvs}", Value: AArch64CC::VS)
12387 .Case(S: "{@ccmi}", Value: AArch64CC::MI)
12388 .Default(Value: AArch64CC::Invalid);
12389 return Cond;
12390}
12391
12392/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
12393/// WZR, invert(<cond>)'.
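/// For example, a CC of EQ yields 'CSINC <Wd>, WZR, WZR, NE', which is the
/// 'CSET <Wd>, EQ' alias.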
12394static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
12395 SelectionDAG &DAG) {
12396 return DAG.getNode(
12397 Opcode: AArch64ISD::CSINC, DL, VT: MVT::i32, N1: DAG.getConstant(Val: 0, DL, VT: MVT::i32),
12398 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i32),
12399 N3: DAG.getConstant(Val: getInvertedCondCode(Code: CC), DL, VT: MVT::i32), N4: NZCV);
12400}
12401
12402// Lower @cc flag output via getSETCC.
12403SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
12404 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
12405 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
12406 AArch64CC::CondCode Cond = parseConstraintCode(Constraint: OpInfo.ConstraintCode);
12407 if (Cond == AArch64CC::Invalid)
12408 return SDValue();
12409 // The output variable should be a scalar integer.
12410 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
12411 OpInfo.ConstraintVT.getSizeInBits() < 8)
12412 report_fatal_error(reason: "Flag output operand is of invalid type");
12413
12414 // Get NZCV register. Only update chain when copyfrom is glued.
12415 if (Glue.getNode()) {
12416 Glue = DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::NZCV, VT: MVT::i32, Glue);
12417 Chain = Glue.getValue(R: 1);
12418 } else
12419 Glue = DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::NZCV, VT: MVT::i32);
12420 // Extract CC code.
12421 SDValue CC = getSETCC(CC: Cond, NZCV: Glue, DL, DAG);
12422
12423 SDValue Result;
12424
12425 // Truncate or ZERO_EXTEND based on value types.
12426 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
12427 Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: OpInfo.ConstraintVT, Operand: CC);
12428 else
12429 Result = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: OpInfo.ConstraintVT, Operand: CC);
12430
12431 return Result;
12432}
12433
12434/// getConstraintType - Given a constraint letter, return the type of
12435/// constraint it is for this target.
12436AArch64TargetLowering::ConstraintType
12437AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
12438 if (Constraint.size() == 1) {
12439 switch (Constraint[0]) {
12440 default:
12441 break;
12442 case 'x':
12443 case 'w':
12444 case 'y':
12445 return C_RegisterClass;
12446 // An address with a single base register. Due to the way we
12447 // currently handle addresses it is the same as 'r'.
12448 case 'Q':
12449 return C_Memory;
12450 case 'I':
12451 case 'J':
12452 case 'K':
12453 case 'L':
12454 case 'M':
12455 case 'N':
12456 case 'Y':
12457 case 'Z':
12458 return C_Immediate;
12459 case 'z':
12460 case 'S': // A symbol or label reference with a constant offset
12461 return C_Other;
12462 }
12463 } else if (parsePredicateConstraint(Constraint))
12464 return C_RegisterClass;
12465 else if (parseReducedGprConstraint(Constraint))
12466 return C_RegisterClass;
12467 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
12468 return C_Other;
12469 return TargetLowering::getConstraintType(Constraint);
12470}
12471
12472/// Examine constraint type and operand type and determine a weight value.
12473/// This object must already have been set up with the operand type
12474/// and the current alternative constraint selected.
12475TargetLowering::ConstraintWeight
12476AArch64TargetLowering::getSingleConstraintMatchWeight(
12477 AsmOperandInfo &info, const char *constraint) const {
12478 ConstraintWeight weight = CW_Invalid;
12479 Value *CallOperandVal = info.CallOperandVal;
12480 // If we don't have a value, we can't do a match,
12481 // but allow it at the lowest weight.
12482 if (!CallOperandVal)
12483 return CW_Default;
12484 Type *type = CallOperandVal->getType();
12485 // Look at the constraint type.
12486 switch (*constraint) {
12487 default:
12488 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
12489 break;
12490 case 'x':
12491 case 'w':
12492 case 'y':
12493 if (type->isFloatingPointTy() || type->isVectorTy())
12494 weight = CW_Register;
12495 break;
12496 case 'z':
12497 weight = CW_Constant;
12498 break;
12499 case 'U':
12500 if (parsePredicateConstraint(Constraint: constraint) ||
12501 parseReducedGprConstraint(Constraint: constraint))
12502 weight = CW_Register;
12503 break;
12504 }
12505 return weight;
12506}
12507
12508std::pair<unsigned, const TargetRegisterClass *>
12509AArch64TargetLowering::getRegForInlineAsmConstraint(
12510 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
12511 if (Constraint.size() == 1) {
12512 switch (Constraint[0]) {
12513 case 'r':
12514 if (VT.isScalableVector())
12515 return std::make_pair(x: 0U, y: nullptr);
12516 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
12517 return std::make_pair(x: 0U, y: &AArch64::GPR64x8ClassRegClass);
12518 if (VT.getFixedSizeInBits() == 64)
12519 return std::make_pair(x: 0U, y: &AArch64::GPR64commonRegClass);
12520 return std::make_pair(x: 0U, y: &AArch64::GPR32commonRegClass);
12521 case 'w': {
12522 if (!Subtarget->hasFPARMv8())
12523 break;
12524 if (VT.isScalableVector()) {
12525 if (VT.getVectorElementType() != MVT::i1)
12526 return std::make_pair(x: 0U, y: &AArch64::ZPRRegClass);
12527 return std::make_pair(x: 0U, y: nullptr);
12528 }
12529 if (VT == MVT::Other)
12530 break;
12531 uint64_t VTSize = VT.getFixedSizeInBits();
12532 if (VTSize == 16)
12533 return std::make_pair(x: 0U, y: &AArch64::FPR16RegClass);
12534 if (VTSize == 32)
12535 return std::make_pair(x: 0U, y: &AArch64::FPR32RegClass);
12536 if (VTSize == 64)
12537 return std::make_pair(x: 0U, y: &AArch64::FPR64RegClass);
12538 if (VTSize == 128)
12539 return std::make_pair(x: 0U, y: &AArch64::FPR128RegClass);
12540 break;
12541 }
12542 // The instructions that this constraint is designed for can
12543 // only take 128-bit registers so just use that regclass.
12544 case 'x':
12545 if (!Subtarget->hasFPARMv8())
12546 break;
12547 if (VT.isScalableVector())
12548 return std::make_pair(x: 0U, y: &AArch64::ZPR_4bRegClass);
12549 if (VT.getSizeInBits() == 128)
12550 return std::make_pair(x: 0U, y: &AArch64::FPR128_loRegClass);
12551 break;
12552 case 'y':
12553 if (!Subtarget->hasFPARMv8())
12554 break;
12555 if (VT.isScalableVector())
12556 return std::make_pair(x: 0U, y: &AArch64::ZPR_3bRegClass);
12557 break;
12558 }
12559 } else {
12560 if (const auto P = parseSVERegAsConstraint(Constraint)) {
12561 // SME functions that are not in streaming mode should
12562 // still observe clobbers of Z-registers by clobbering
12563 // the lower 128 bits of those registers.
12564 if (AArch64::ZPRRegClass.hasSubClassEq(RC: P->second) &&
12565 !Subtarget->isSVEorStreamingSVEAvailable())
12566 return std::make_pair(x: TRI->getSubReg(Reg: P->first, Idx: AArch64::zsub),
12567 y: &AArch64::FPR128RegClass);
12568 return *P;
12569 }
12570 if (const auto PC = parsePredicateConstraint(Constraint))
12571 if (const auto *RegClass = getPredicateRegisterClass(Constraint: *PC, VT))
12572 return std::make_pair(x: 0U, y&: RegClass);
12573
12574 if (const auto RGC = parseReducedGprConstraint(Constraint))
12575 if (const auto *RegClass = getReducedGprRegisterClass(Constraint: *RGC, VT))
12576 return std::make_pair(x: 0U, y&: RegClass);
12577 }
12578 if (StringRef("{cc}").equals_insensitive(RHS: Constraint) ||
12579 parseConstraintCode(Constraint) != AArch64CC::Invalid)
12580 return std::make_pair(x: unsigned(AArch64::NZCV), y: &AArch64::CCRRegClass);
12581
12582 if (Constraint == "{za}") {
12583 return std::make_pair(x: unsigned(AArch64::ZA), y: &AArch64::MPRRegClass);
12584 }
12585
12586 if (Constraint == "{zt0}") {
12587 return std::make_pair(x: unsigned(AArch64::ZT0), y: &AArch64::ZTRRegClass);
12588 }
12589
12590 // Use the default implementation in TargetLowering to convert the register
12591 // constraint into a member of a register class.
12592 std::pair<unsigned, const TargetRegisterClass *> Res;
12593 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
12594
12595 // Not found as a standard register?
12596 if (!Res.second) {
12597 unsigned Size = Constraint.size();
12598 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
12599 tolower(c: Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
12600 int RegNo;
12601 bool Failed = Constraint.slice(Start: 2, End: Size - 1).getAsInteger(Radix: 10, Result&: RegNo);
12602 if (!Failed && RegNo >= 0 && RegNo <= 31) {
12603 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
12604 // By default we'll emit v0-v31 for this unless there's a modifier, in which
12605 // case we'll emit the correct register instead.
12606 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
12607 Res.first = AArch64::FPR64RegClass.getRegister(i: RegNo);
12608 Res.second = &AArch64::FPR64RegClass;
12609 } else {
12610 Res.first = AArch64::FPR128RegClass.getRegister(i: RegNo);
12611 Res.second = &AArch64::FPR128RegClass;
12612 }
12613 }
12614 }
12615 }
12616
12617 if (Res.second && !Subtarget->hasFPARMv8() &&
12618 !AArch64::GPR32allRegClass.hasSubClassEq(RC: Res.second) &&
12619 !AArch64::GPR64allRegClass.hasSubClassEq(RC: Res.second))
12620 return std::make_pair(x: 0U, y: nullptr);
12621
12622 return Res;
12623}
12624
12625EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
12626 llvm::Type *Ty,
12627 bool AllowUnknown) const {
12628 if (Subtarget->hasLS64() && Ty->isIntegerTy(Bitwidth: 512))
12629 return EVT(MVT::i64x8);
12630
12631 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
12632}
12633
12634/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
12635/// vector. If it is invalid, don't add anything to Ops.
12636void AArch64TargetLowering::LowerAsmOperandForConstraint(
12637 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
12638 SelectionDAG &DAG) const {
12639 SDValue Result;
12640
12641 // Currently only support length 1 constraints.
12642 if (Constraint.size() != 1)
12643 return;
12644
12645 char ConstraintLetter = Constraint[0];
12646 switch (ConstraintLetter) {
12647 default:
12648 break;
12649
12650 // This set of constraints deals with valid constants for various instructions.
12651 // Validate and return a target constant for them if we can.
12652 case 'z': {
12653 // 'z' maps to xzr or wzr so it needs an input of 0.
12654 if (!isNullConstant(V: Op))
12655 return;
12656
12657 if (Op.getValueType() == MVT::i64)
12658 Result = DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64);
12659 else
12660 Result = DAG.getRegister(Reg: AArch64::WZR, VT: MVT::i32);
12661 break;
12662 }
12663 case 'S':
12664 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
12665 // supported for PIC while "s" isn't, making "s" less useful. We implement
12666 // "S" but not "s".
12667 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint: "s", Ops, DAG);
12668 break;
12669
12670 case 'I':
12671 case 'J':
12672 case 'K':
12673 case 'L':
12674 case 'M':
12675 case 'N':
12676 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Op);
12677 if (!C)
12678 return;
12679
12680 // Grab the value and do some validation.
12681 uint64_t CVal = C->getZExtValue();
12682 switch (ConstraintLetter) {
12683 // The I constraint applies only to simple ADD or SUB immediate operands:
12684 // i.e. 0 to 4095 with optional shift by 12
12685 // The J constraint applies only to ADD or SUB immediates that would be
12686 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
12687 // instruction [or vice versa], in other words -1 to -4095 with optional
12688 // left shift by 12.
12689 case 'I':
12690 if (isUInt<12>(x: CVal) || isShiftedUInt<12, 12>(x: CVal))
12691 break;
12692 return;
12693 case 'J': {
12694 uint64_t NVal = -C->getSExtValue();
12695 if (isUInt<12>(x: NVal) || isShiftedUInt<12, 12>(x: NVal)) {
12696 CVal = C->getSExtValue();
12697 break;
12698 }
12699 return;
12700 }
12701 // The K and L constraints apply *only* to logical immediates, including
12702 // what used to be the MOVI alias for ORR (though the MOVI alias has now
12703 // been removed and MOV should be used). So these constraints have to
12704 // distinguish between bit patterns that are valid 32-bit or 64-bit
12705 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
12706 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
12707 // versa.
12708 case 'K':
12709 if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 32))
12710 break;
12711 return;
12712 case 'L':
12713 if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 64))
12714 break;
12715 return;
12716 // The M and N constraints are a superset of K and L respectively, for use
12717 // with the MOV (immediate) alias. As well as the logical immediates they
12718 // also match 32 or 64-bit immediates that can be loaded either using a
12719 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
12720 // (M) or 64-bit 0x1234000000000000 (N) etc.
12721 // As a note, some of this code is liberally stolen from the asm parser.
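// For example, 0xffffedca passes the 'M' check because its 32-bit complement
// 0x00001235 fits in a single MOVN.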
12722 case 'M': {
12723 if (!isUInt<32>(x: CVal))
12724 return;
12725 if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 32))
12726 break;
12727 if ((CVal & 0xFFFF) == CVal)
12728 break;
12729 if ((CVal & 0xFFFF0000ULL) == CVal)
12730 break;
12731 uint64_t NCVal = ~(uint32_t)CVal;
12732 if ((NCVal & 0xFFFFULL) == NCVal)
12733 break;
12734 if ((NCVal & 0xFFFF0000ULL) == NCVal)
12735 break;
12736 return;
12737 }
12738 case 'N': {
12739 if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 64))
12740 break;
12741 if ((CVal & 0xFFFFULL) == CVal)
12742 break;
12743 if ((CVal & 0xFFFF0000ULL) == CVal)
12744 break;
12745 if ((CVal & 0xFFFF00000000ULL) == CVal)
12746 break;
12747 if ((CVal & 0xFFFF000000000000ULL) == CVal)
12748 break;
12749 uint64_t NCVal = ~CVal;
12750 if ((NCVal & 0xFFFFULL) == NCVal)
12751 break;
12752 if ((NCVal & 0xFFFF0000ULL) == NCVal)
12753 break;
12754 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
12755 break;
12756 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
12757 break;
12758 return;
12759 }
12760 default:
12761 return;
12762 }
12763
12764 // All assembler immediates are 64-bit integers.
12765 Result = DAG.getTargetConstant(Val: CVal, DL: SDLoc(Op), VT: MVT::i64);
12766 break;
12767 }
12768
12769 if (Result.getNode()) {
12770 Ops.push_back(x: Result);
12771 return;
12772 }
12773
12774 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
12775}
12776
12777//===----------------------------------------------------------------------===//
12778// AArch64 Advanced SIMD Support
12779//===----------------------------------------------------------------------===//
12780
12781/// WidenVector - Given a value in the V64 register class, produce the
12782/// equivalent value in the V128 register class.
12783static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
12784 EVT VT = V64Reg.getValueType();
12785 unsigned NarrowSize = VT.getVectorNumElements();
12786 MVT EltTy = VT.getVectorElementType().getSimpleVT();
12787 MVT WideTy = MVT::getVectorVT(VT: EltTy, NumElements: 2 * NarrowSize);
12788 SDLoc DL(V64Reg);
12789
12790 return DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: WideTy, N1: DAG.getUNDEF(VT: WideTy),
12791 N2: V64Reg, N3: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
12792}
12793
12794/// getExtFactor - Determine the adjustment factor for the position when
12795/// generating an "extract from vector registers" instruction.
12796static unsigned getExtFactor(SDValue &V) {
12797 EVT EltType = V.getValueType().getVectorElementType();
12798 return EltType.getSizeInBits() / 8;
12799}
12800
12801// Check if a vector is built from one vector via extracted elements of
12802// another together with an AND mask, ensuring that all elements fit
12803// within range. This can be reconstructed using AND and NEON's TBL1.
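// For example, a v16i8 BUILD_VECTOR whose lanes are src[mask[i] & 0xf] can be
// rebuilt as an AND of the mask vector with 0xf followed by a single tbl1
// using src as the table.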
12804SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
12805 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
12806 SDLoc DL(Op);
12807 EVT VT = Op.getValueType();
12808 assert(!VT.isScalableVector() &&
12809 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
12810
12811 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
12812 // directly to TBL1.
12813 if (VT != MVT::v16i8 && VT != MVT::v8i8)
12814 return SDValue();
12815
12816 unsigned NumElts = VT.getVectorNumElements();
12817 assert((NumElts == 8 || NumElts == 16) &&
12818 "Need to have exactly 8 or 16 elements in vector.");
12819
12820 SDValue SourceVec;
12821 SDValue MaskSourceVec;
12822 SmallVector<SDValue, 16> AndMaskConstants;
12823
12824 for (unsigned i = 0; i < NumElts; ++i) {
12825 SDValue V = Op.getOperand(i);
12826 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12827 return SDValue();
12828
12829 SDValue OperandSourceVec = V.getOperand(i: 0);
12830 if (!SourceVec)
12831 SourceVec = OperandSourceVec;
12832 else if (SourceVec != OperandSourceVec)
12833 return SDValue();
12834
12835 // This only looks at shuffles with elements that are
12836 // a) truncated by a constant AND mask extracted from a mask vector, or
12837 // b) extracted directly from a mask vector.
12838 SDValue MaskSource = V.getOperand(i: 1);
12839 if (MaskSource.getOpcode() == ISD::AND) {
12840 if (!isa<ConstantSDNode>(Val: MaskSource.getOperand(i: 1)))
12841 return SDValue();
12842
12843 AndMaskConstants.push_back(Elt: MaskSource.getOperand(i: 1));
12844 MaskSource = MaskSource->getOperand(Num: 0);
12845 } else if (!AndMaskConstants.empty()) {
12846 // Either all or no operands should have an AND mask.
12847 return SDValue();
12848 }
12849
12850 // An ANY_EXTEND may be inserted between the AND and the source vector
12851 // extraction. We don't care about that, so we can just skip it.
12852 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
12853 MaskSource = MaskSource.getOperand(i: 0);
12854
12855 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12856 return SDValue();
12857
12858 SDValue MaskIdx = MaskSource.getOperand(i: 1);
12859 if (!isa<ConstantSDNode>(Val: MaskIdx) ||
12860 !cast<ConstantSDNode>(Val&: MaskIdx)->getConstantIntValue()->equalsInt(V: i))
12861 return SDValue();
12862
12863 // We only apply this if all elements come from the same vector with the
12864 // same vector type.
12865 if (!MaskSourceVec) {
12866 MaskSourceVec = MaskSource->getOperand(Num: 0);
12867 if (MaskSourceVec.getValueType() != VT)
12868 return SDValue();
12869 } else if (MaskSourceVec != MaskSource->getOperand(Num: 0)) {
12870 return SDValue();
12871 }
12872 }
12873
12874 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
12875 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
12876 // insert, we know that the index in the mask must be smaller than the number
12877 // of elements in the source, or we would have an out-of-bounds access.
12878 if (NumElts == 8)
12879 SourceVec = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v16i8, N1: SourceVec,
12880 N2: DAG.getUNDEF(VT));
12881
12882 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
12883 if (!AndMaskConstants.empty())
12884 MaskSourceVec = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: MaskSourceVec,
12885 N2: DAG.getBuildVector(VT, DL, Ops: AndMaskConstants));
12886
12887 return DAG.getNode(
12888 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT,
12889 N1: DAG.getConstant(Val: Intrinsic::aarch64_neon_tbl1, DL, VT: MVT::i32), N2: SourceVec,
12890 N3: MaskSourceVec);
12891}
12892
12893// Gather data to see if the operation can be modelled as a
12894// shuffle in combination with VEXTs.
12895SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
12896 SelectionDAG &DAG) const {
12897 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
12898 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
12899 SDLoc DL(Op);
12900 EVT VT = Op.getValueType();
12901 assert(!VT.isScalableVector() &&
12902 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
12903 unsigned NumElts = VT.getVectorNumElements();
12904
12905 struct ShuffleSourceInfo {
12906 SDValue Vec;
12907 unsigned MinElt;
12908 unsigned MaxElt;
12909
12910 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
12911 // be compatible with the shuffle we intend to construct. As a result
12912 // ShuffleVec will be some sliding window into the original Vec.
12913 SDValue ShuffleVec;
12914
    // Code should guarantee that element i in Vec starts at element
    // "WindowBase + i * WindowScale" in ShuffleVec.
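    // For example, if ShuffleVec is the high half of Vec (extracted at index
    // N), WindowBase is -N; if Vec's i32 elements are later viewed as i16
    // shuffle lanes, WindowScale becomes 2 and WindowBase is scaled to match.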
12917 int WindowBase;
12918 int WindowScale;
12919
12920 ShuffleSourceInfo(SDValue Vec)
12921 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
12922 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
12923
12924 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
12925 };
12926
12927 // First gather all vectors used as an immediate source for this BUILD_VECTOR
12928 // node.
12929 SmallVector<ShuffleSourceInfo, 2> Sources;
12930 for (unsigned i = 0; i < NumElts; ++i) {
12931 SDValue V = Op.getOperand(i);
12932 if (V.isUndef())
12933 continue;
12934 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
12935 !isa<ConstantSDNode>(Val: V.getOperand(i: 1)) ||
12936 V.getOperand(i: 0).getValueType().isScalableVector()) {
12937 LLVM_DEBUG(
12938 dbgs() << "Reshuffle failed: "
12939 "a shuffle can only come from building a vector from "
12940 "various elements of other fixed-width vectors, provided "
12941 "their indices are constant\n");
12942 return SDValue();
12943 }
12944
12945 // Add this element source to the list if it's not already there.
12946 SDValue SourceVec = V.getOperand(i: 0);
12947 auto Source = find(Range&: Sources, Val: SourceVec);
12948 if (Source == Sources.end())
12949 Source = Sources.insert(I: Sources.end(), Elt: ShuffleSourceInfo(SourceVec));
12950
12951 // Update the minimum and maximum lane number seen.
12952 unsigned EltNo = V.getConstantOperandVal(i: 1);
12953 Source->MinElt = std::min(a: Source->MinElt, b: EltNo);
12954 Source->MaxElt = std::max(a: Source->MaxElt, b: EltNo);
12955 }
12956
12957 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
12958 // better than moving to/from gpr registers for larger vectors.
12959 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
12960 // Construct a mask for the tbl. We may need to adjust the index for types
12961 // larger than i8.
12962 SmallVector<unsigned, 16> Mask;
12963 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
12964 for (unsigned I = 0; I < NumElts; ++I) {
12965 SDValue V = Op.getOperand(i: I);
12966 if (V.isUndef()) {
12967 for (unsigned OF = 0; OF < OutputFactor; OF++)
12968 Mask.push_back(Elt: -1);
12969 continue;
12970 }
12971 // Set the Mask lanes adjusted for the size of the input and output
12972 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
12973 // output element, adjusted in their positions per input and output types.
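      // For example, with i16 elements on both sides, the element taken from
      // lane L of source S contributes byte indices 16 * S + 2 * L and
      // 16 * S + 2 * L + 1 to the mask.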
12974 unsigned Lane = V.getConstantOperandVal(i: 1);
12975 for (unsigned S = 0; S < Sources.size(); S++) {
12976 if (V.getOperand(i: 0) == Sources[S].Vec) {
12977 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
12978 unsigned InputBase = 16 * S + Lane * InputSize / 8;
12979 for (unsigned OF = 0; OF < OutputFactor; OF++)
12980 Mask.push_back(Elt: InputBase + OF);
12981 break;
12982 }
12983 }
12984 }
12985
12986 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
12987 // v16i8, and the TBLMask
12988 SmallVector<SDValue, 16> TBLOperands;
12989 TBLOperands.push_back(Elt: DAG.getConstant(Val: Sources.size() == 3
12990 ? Intrinsic::aarch64_neon_tbl3
12991 : Intrinsic::aarch64_neon_tbl4,
12992 DL, VT: MVT::i32));
12993 for (unsigned i = 0; i < Sources.size(); i++) {
12994 SDValue Src = Sources[i].Vec;
12995 EVT SrcVT = Src.getValueType();
12996 Src = DAG.getBitcast(VT: SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, V: Src);
12997 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
12998 "Expected a legally typed vector");
12999 if (SrcVT.is64BitVector())
13000 Src = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v16i8, N1: Src,
13001 N2: DAG.getUNDEF(VT: MVT::v8i8));
13002 TBLOperands.push_back(Elt: Src);
13003 }
13004
13005 SmallVector<SDValue, 16> TBLMask;
13006 for (unsigned i = 0; i < Mask.size(); i++)
13007 TBLMask.push_back(Elt: DAG.getConstant(Val: Mask[i], DL, VT: MVT::i32));
13008 assert((Mask.size() == 8 || Mask.size() == 16) &&
13009 "Expected a v8i8 or v16i8 Mask");
13010 TBLOperands.push_back(Elt: DAG.getBuildVector(
13011 VT: Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, DL, Ops: TBLMask));
13012
13013 SDValue Shuffle =
13014 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL,
13015 VT: Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, Ops: TBLOperands);
13016 return DAG.getBitcast(VT, V: Shuffle);
13017 }
13018
13019 if (Sources.size() > 2) {
13020 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
13021 << "sensible when at most two source vectors are "
13022 << "involved\n");
13023 return SDValue();
13024 }
13025
13026 // Find out the smallest element size among result and two sources, and use
13027 // it as element size to build the shuffle_vector.
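  // For example, a v4i32 result built from a v8i16 source and a v4i32 source
  // uses i16 shuffle lanes, so ResMultiplier is 2 and ShuffleVT is v8i16.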
13028 EVT SmallestEltTy = VT.getVectorElementType();
13029 for (auto &Source : Sources) {
13030 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
13031 if (SrcEltTy.bitsLT(VT: SmallestEltTy)) {
13032 SmallestEltTy = SrcEltTy;
13033 }
13034 }
13035 unsigned ResMultiplier =
13036 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13037 uint64_t VTSize = VT.getFixedSizeInBits();
13038 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
13039 EVT ShuffleVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: SmallestEltTy, NumElements: NumElts);
13040
13041 // If the source vector is too wide or too narrow, we may nevertheless be able
13042 // to construct a compatible shuffle either by concatenating it with UNDEF or
13043 // extracting a suitable range of elements.
13044 for (auto &Src : Sources) {
13045 EVT SrcVT = Src.ShuffleVec.getValueType();
13046
13047 TypeSize SrcVTSize = SrcVT.getSizeInBits();
13048 if (SrcVTSize == TypeSize::getFixed(ExactSize: VTSize))
13049 continue;
13050
13051 // This stage of the search produces a source with the same element type as
13052 // the original, but with a total width matching the BUILD_VECTOR output.
13053 EVT EltVT = SrcVT.getVectorElementType();
13054 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
13055 EVT DestVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NumSrcElts);
13056
13057 if (SrcVTSize.getFixedValue() < VTSize) {
13058 assert(2 * SrcVTSize == VTSize);
13059 // We can pad out the smaller vector for free, so if it's part of a
13060 // shuffle...
13061 Src.ShuffleVec =
13062 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: DestVT, N1: Src.ShuffleVec,
13063 N2: DAG.getUNDEF(VT: Src.ShuffleVec.getValueType()));
13064 continue;
13065 }
13066
13067 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
13068 LLVM_DEBUG(
13069 dbgs() << "Reshuffle failed: result vector too small to extract\n");
13070 return SDValue();
13071 }
13072
13073 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
13074 LLVM_DEBUG(
13075 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
13076 return SDValue();
13077 }
13078
13079 if (Src.MinElt >= NumSrcElts) {
13080 // The extraction can just take the second half
13081 Src.ShuffleVec =
13082 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: DestVT, N1: Src.ShuffleVec,
13083 N2: DAG.getConstant(Val: NumSrcElts, DL, VT: MVT::i64));
13084 Src.WindowBase = -NumSrcElts;
13085 } else if (Src.MaxElt < NumSrcElts) {
13086 // The extraction can just take the first half
13087 Src.ShuffleVec =
13088 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: DestVT, N1: Src.ShuffleVec,
13089 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
13090 } else {
13091 // An actual VEXT is needed
13092 SDValue VEXTSrc1 =
13093 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: DestVT, N1: Src.ShuffleVec,
13094 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
13095 SDValue VEXTSrc2 =
13096 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: DestVT, N1: Src.ShuffleVec,
13097 N2: DAG.getConstant(Val: NumSrcElts, DL, VT: MVT::i64));
13098 unsigned Imm = Src.MinElt * getExtFactor(V&: VEXTSrc1);
13099
13100 if (!SrcVT.is64BitVector()) {
13101 LLVM_DEBUG(
13102 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
13103 "for SVE vectors.");
13104 return SDValue();
13105 }
13106
13107 Src.ShuffleVec =
13108 DAG.getNode(Opcode: AArch64ISD::EXT, DL, VT: DestVT, N1: VEXTSrc1, N2: VEXTSrc2,
13109 N3: DAG.getConstant(Val: Imm, DL, VT: MVT::i32));
13110 Src.WindowBase = -Src.MinElt;
13111 }
13112 }
13113
13114 // Another possible incompatibility occurs from the vector element types. We
13115 // can fix this by bitcasting the source vectors to the same type we intend
13116 // for the shuffle.
13117 for (auto &Src : Sources) {
13118 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
13119 if (SrcEltTy == SmallestEltTy)
13120 continue;
13121 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
13122 if (DAG.getDataLayout().isBigEndian()) {
13123 Src.ShuffleVec =
13124 DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: ShuffleVT, Operand: Src.ShuffleVec);
13125 } else {
13126 Src.ShuffleVec = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ShuffleVT, Operand: Src.ShuffleVec);
13127 }
13128 Src.WindowScale =
13129 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13130 Src.WindowBase *= Src.WindowScale;
13131 }
13132
13133 // Final check before we try to actually produce a shuffle.
13134 LLVM_DEBUG({
13135 for (auto Src : Sources)
13136 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
13137 });
13138
13139 // The stars all align, our next step is to produce the mask for the shuffle.
13140 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
13141 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
13142 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
13143 SDValue Entry = Op.getOperand(i);
13144 if (Entry.isUndef())
13145 continue;
13146
13147 auto Src = find(Range&: Sources, Val: Entry.getOperand(i: 0));
13148 int EltNo = cast<ConstantSDNode>(Val: Entry.getOperand(i: 1))->getSExtValue();
13149
13150 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
13151 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
13152 // segment.
13153 EVT OrigEltTy = Entry.getOperand(i: 0).getValueType().getVectorElementType();
13154 int BitsDefined = std::min(a: OrigEltTy.getScalarSizeInBits(),
13155 b: VT.getScalarSizeInBits());
13156 int LanesDefined = BitsDefined / BitsPerShuffleLane;
13157
13158 // This source is expected to fill ResMultiplier lanes of the final shuffle,
13159 // starting at the appropriate offset.
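    // Lanes taken from the second source are offset by NumElts here, matching
    // the operand order passed to getVectorShuffle below.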
13160 int *LaneMask = &Mask[i * ResMultiplier];
13161
13162 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
13163 ExtractBase += NumElts * (Src - Sources.begin());
13164 for (int j = 0; j < LanesDefined; ++j)
13165 LaneMask[j] = ExtractBase + j;
13166 }
13167
13168 // Final check before we try to produce nonsense...
13169 if (!isShuffleMaskLegal(M: Mask, VT: ShuffleVT)) {
13170 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
13171 return SDValue();
13172 }
13173
13174 SDValue ShuffleOps[] = { DAG.getUNDEF(VT: ShuffleVT), DAG.getUNDEF(VT: ShuffleVT) };
13175 for (unsigned i = 0; i < Sources.size(); ++i)
13176 ShuffleOps[i] = Sources[i].ShuffleVec;
13177
13178 SDValue Shuffle =
13179 DAG.getVectorShuffle(VT: ShuffleVT, dl: DL, N1: ShuffleOps[0], N2: ShuffleOps[1], Mask);
13180 SDValue V;
13181 if (DAG.getDataLayout().isBigEndian()) {
13182 V = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: Shuffle);
13183 } else {
13184 V = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Shuffle);
13185 }
13186
13187 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
13188 dbgs() << "Reshuffle, creating node: "; V.dump(););
13189
13190 return V;
13191}
13192
// Check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are the same.
13195static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
13196 unsigned NumElts = VT.getVectorNumElements();
13197
13198 // Assume that the first shuffle index is not UNDEF. Fail if it is.
13199 if (M[0] < 0)
13200 return false;
13201
13202 Imm = M[0];
13203
13204 // If this is a VEXT shuffle, the immediate value is the index of the first
13205 // element. The other shuffle indices must be the successive elements after
13206 // the first one.
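  // For example, for a single-source v8i8 shuffle the mask
  // <3, 4, 5, 6, 7, 0, 1, 2> is an EXT with Imm == 3.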
13207 unsigned ExpectedElt = Imm;
13208 for (unsigned i = 1; i < NumElts; ++i) {
13209 // Increment the expected index. If it wraps around, just follow it
13210 // back to index zero and keep going.
13211 ++ExpectedElt;
13212 if (ExpectedElt == NumElts)
13213 ExpectedElt = 0;
13214
13215 if (M[i] < 0)
13216 continue; // ignore UNDEF indices
13217 if (ExpectedElt != static_cast<unsigned>(M[i]))
13218 return false;
13219 }
13220
13221 return true;
13222}
13223
// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
// v4i16 or v4i32 sources. This is really a truncate, which we can construct
// out of (legal) concats and truncate nodes.
static SDValue ReconstructTruncateFromBuildVector(SDValue V,
                                                  SelectionDAG &DAG) {
13228 if (V.getValueType() != MVT::v16i8)
13229 return SDValue();
13230 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
13231
13232 for (unsigned X = 0; X < 4; X++) {
13233 // Check the first item in each group is an extract from lane 0 of a v4i32
13234 // or v4i16.
13235 SDValue BaseExt = V.getOperand(i: X * 4);
13236 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13237 (BaseExt.getOperand(i: 0).getValueType() != MVT::v4i16 &&
13238 BaseExt.getOperand(i: 0).getValueType() != MVT::v4i32) ||
13239 !isa<ConstantSDNode>(Val: BaseExt.getOperand(i: 1)) ||
13240 BaseExt.getConstantOperandVal(i: 1) != 0)
13241 return SDValue();
13242 SDValue Base = BaseExt.getOperand(i: 0);
13243 // And check the other items are extracts from the same vector.
13244 for (unsigned Y = 1; Y < 4; Y++) {
13245 SDValue Ext = V.getOperand(i: X * 4 + Y);
13246 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13247 Ext.getOperand(i: 0) != Base ||
13248 !isa<ConstantSDNode>(Val: Ext.getOperand(i: 1)) ||
13249 Ext.getConstantOperandVal(i: 1) != Y)
13250 return SDValue();
13251 }
13252 }
13253
  // Turn the buildvector into a series of truncates and concats, which will
  // become uzip1s. Any v4i32s we found get truncated to v4i16, then pairs are
  // concatenated to produce two v8i16s. Both are truncated to v8i8 and
  // concatenated together into the final v16i8.
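  // I.e. the result is built as
  //   concat(trunc(concat(a, b)), trunc(concat(c, d)))
  // where a..d are the (possibly pre-truncated) v4i16 sources.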
13258 SDLoc DL(V);
13259 SDValue Trunc[4] = {
13260 V.getOperand(i: 0).getOperand(i: 0), V.getOperand(i: 4).getOperand(i: 0),
13261 V.getOperand(i: 8).getOperand(i: 0), V.getOperand(i: 12).getOperand(i: 0)};
13262 for (SDValue &V : Trunc)
13263 if (V.getValueType() == MVT::v4i32)
13264 V = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::v4i16, Operand: V);
13265 SDValue Concat0 =
13266 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v8i16, N1: Trunc[0], N2: Trunc[1]);
13267 SDValue Concat1 =
13268 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v8i16, N1: Trunc[2], N2: Trunc[3]);
13269 SDValue Trunc0 = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::v8i8, Operand: Concat0);
13270 SDValue Trunc1 = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::v8i8, Operand: Concat1);
13271 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v16i8, N1: Trunc0, N2: Trunc1);
13272}
13273
/// Check if a vector shuffle corresponds to a DUP instruction with a larger
/// element width than the vector lane type. If that is the case, the function
/// returns true and writes the value of the DUP instruction lane operand into
/// DupLaneOp.
13278static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
13279 unsigned &DupLaneOp) {
13280 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
13281 "Only possible block sizes for wide DUP are: 16, 32, 64");
13282
13283 if (BlockSize <= VT.getScalarSizeInBits())
13284 return false;
13285 if (BlockSize % VT.getScalarSizeInBits() != 0)
13286 return false;
13287 if (VT.getSizeInBits() % BlockSize != 0)
13288 return false;
13289
13290 size_t SingleVecNumElements = VT.getVectorNumElements();
13291 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
13292 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
13293
13294 // We are looking for masks like
13295 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
13296 // might be replaced by 'undefined'. BlockIndices will eventually contain
13297 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
13298 // for the above examples)
13299 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
13300 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
13301 for (size_t I = 0; I < NumEltsPerBlock; I++) {
13302 int Elt = M[BlockIndex * NumEltsPerBlock + I];
13303 if (Elt < 0)
13304 continue;
13305 // For now we don't support shuffles that use the second operand
13306 if ((unsigned)Elt >= SingleVecNumElements)
13307 return false;
13308 if (BlockElts[I] < 0)
13309 BlockElts[I] = Elt;
13310 else if (BlockElts[I] != Elt)
13311 return false;
13312 }
13313
13314 // We found a candidate block (possibly with some undefs). It must be a
13315 // sequence of consecutive integers starting with a value divisible by
13316 // NumEltsPerBlock with some values possibly replaced by undef-s.
13317
13318 // Find first non-undef element
13319 auto FirstRealEltIter = find_if(Range&: BlockElts, P: [](int Elt) { return Elt >= 0; });
13320 assert(FirstRealEltIter != BlockElts.end() &&
13321 "Shuffle with all-undefs must have been caught by previous cases, "
13322 "e.g. isSplat()");
13323 if (FirstRealEltIter == BlockElts.end()) {
13324 DupLaneOp = 0;
13325 return true;
13326 }
13327
13328 // Index of FirstRealElt in BlockElts
13329 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
13330
13331 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
13332 return false;
13333 // BlockElts[0] must have the following value if it isn't undef:
13334 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
13335
13336 // Check the first element
13337 if (Elt0 % NumEltsPerBlock != 0)
13338 return false;
13339 // Check that the sequence indeed consists of consecutive integers (modulo
13340 // undefs)
13341 for (size_t I = 0; I < NumEltsPerBlock; I++)
13342 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
13343 return false;
13344
13345 DupLaneOp = Elt0 / NumEltsPerBlock;
13346 return true;
13347}
13348
// Check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are different.
13351static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
13352 unsigned &Imm) {
13353 // Look for the first non-undef element.
13354 const int *FirstRealElt = find_if(Range&: M, P: [](int Elt) { return Elt >= 0; });
13355
  // Benefit from APInt to handle overflow when calculating expected element.
13357 unsigned NumElts = VT.getVectorNumElements();
13358 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
13359 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
13360 /*implicitTrunc=*/true);
13361 // The following shuffle indices must be the successive elements after the
13362 // first real element.
13363 bool FoundWrongElt = std::any_of(first: FirstRealElt + 1, last: M.end(), pred: [&](int Elt) {
13364 return Elt != ExpectedElt++ && Elt != -1;
13365 });
13366 if (FoundWrongElt)
13367 return false;
13368
13369 // The index of an EXT is the first element if it is not UNDEF.
13370 // Watch out for the beginning UNDEFs. The EXT index should be the expected
13371 // value of the first element. E.g.
13372 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
13373 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
13374 // ExpectedElt is the last mask index plus 1.
13375 Imm = ExpectedElt.getZExtValue();
13376
  // There are two different cases that require reversing the input vectors.
  // For example, for vector <4 x i32> we have the following cases:
  // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
  // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
  // For both cases, we finally use mask <5, 6, 7, 0>, which requires
  // reversing the two input vectors.
13383 if (Imm < NumElts)
13384 ReverseEXT = true;
13385 else
13386 Imm -= NumElts;
13387
13388 return true;
13389}
13390
13391/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
13392/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13393/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
13394static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13395 unsigned NumElts = VT.getVectorNumElements();
13396 if (NumElts % 2 != 0)
13397 return false;
13398 WhichResult = (M[0] == 0 ? 0 : 1);
13399 unsigned Idx = WhichResult * NumElts / 2;
13400 for (unsigned i = 0; i != NumElts; i += 2) {
13401 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
13402 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
13403 return false;
13404 Idx += 1;
13405 }
13406
13407 return true;
13408}
13409
13410/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
13411/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
13413static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13414 unsigned Half = VT.getVectorNumElements() / 2;
13415 WhichResult = (M[0] == 0 ? 0 : 1);
13416 for (unsigned j = 0; j != 2; ++j) {
13417 unsigned Idx = WhichResult;
13418 for (unsigned i = 0; i != Half; ++i) {
13419 int MIdx = M[i + j * Half];
13420 if (MIdx >= 0 && (unsigned)MIdx != Idx)
13421 return false;
13422 Idx += 2;
13423 }
13424 }
13425
13426 return true;
13427}
13428
13429/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
13430/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13431/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
13432static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13433 unsigned NumElts = VT.getVectorNumElements();
13434 if (NumElts % 2 != 0)
13435 return false;
13436 WhichResult = (M[0] == 0 ? 0 : 1);
13437 for (unsigned i = 0; i < NumElts; i += 2) {
13438 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
13439 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
13440 return false;
13441 }
13442 return true;
13443}
13444
13445static bool isINSMask(ArrayRef<int> M, int NumInputElements,
13446 bool &DstIsLeft, int &Anomaly) {
13447 if (M.size() != static_cast<size_t>(NumInputElements))
13448 return false;
13449
13450 int NumLHSMatch = 0, NumRHSMatch = 0;
13451 int LastLHSMismatch = -1, LastRHSMismatch = -1;
13452
13453 for (int i = 0; i < NumInputElements; ++i) {
13454 if (M[i] == -1) {
13455 ++NumLHSMatch;
13456 ++NumRHSMatch;
13457 continue;
13458 }
13459
13460 if (M[i] == i)
13461 ++NumLHSMatch;
13462 else
13463 LastLHSMismatch = i;
13464
13465 if (M[i] == i + NumInputElements)
13466 ++NumRHSMatch;
13467 else
13468 LastRHSMismatch = i;
13469 }
13470
13471 if (NumLHSMatch == NumInputElements - 1) {
13472 DstIsLeft = true;
13473 Anomaly = LastLHSMismatch;
13474 return true;
13475 } else if (NumRHSMatch == NumInputElements - 1) {
13476 DstIsLeft = false;
13477 Anomaly = LastRHSMismatch;
13478 return true;
13479 }
13480
13481 return false;
13482}
13483
13484static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
13485 if (VT.getSizeInBits() != 128)
13486 return false;
13487
13488 unsigned NumElts = VT.getVectorNumElements();
13489
13490 for (int I = 0, E = NumElts / 2; I != E; I++) {
13491 if (Mask[I] != I)
13492 return false;
13493 }
13494
13495 int Offset = NumElts / 2;
13496 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
13497 if (Mask[I] != I + SplitLHS * Offset)
13498 return false;
13499 }
13500
13501 return true;
13502}
13503
13504static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
13505 SDLoc DL(Op);
13506 EVT VT = Op.getValueType();
13507 SDValue V0 = Op.getOperand(i: 0);
13508 SDValue V1 = Op.getOperand(i: 1);
13509 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Val&: Op)->getMask();
13510
13511 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
13512 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
13513 return SDValue();
13514
13515 bool SplitV0 = V0.getValueSizeInBits() == 128;
13516
13517 if (!isConcatMask(Mask, VT, SplitLHS: SplitV0))
13518 return SDValue();
13519
13520 EVT CastVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
13521 if (SplitV0) {
13522 V0 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: CastVT, N1: V0,
13523 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
13524 }
13525 if (V1.getValueSizeInBits() == 128) {
13526 V1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: CastVT, N1: V1,
13527 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
13528 }
13529 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: V0, N2: V1);
13530}
13531
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle. ID is the perfect-shuffle
/// ID, V1 and V2 are the original shuffle inputs, PFEntry is the perfect
/// shuffle table entry, and LHS/RHS are the immediate inputs for this stage
/// of the shuffle.
13537static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2,
13538 unsigned PFEntry, SDValue LHS,
13539 SDValue RHS, SelectionDAG &DAG,
13540 const SDLoc &DL) {
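  // Each perfect-shuffle table entry packs a 4-bit opcode in bits [29:26] and
  // two 13-bit operand IDs in bits [25:13] and [12:0]. The operand IDs are
  // themselves perfect-shuffle IDs, except that for OP_MOVLANE the RHS ID
  // encodes the lane to move into.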
13541 unsigned OpNum = (PFEntry >> 26) & 0x0F;
13542 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
13543 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
13544
13545 enum {
13546 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
13547 OP_VREV,
13548 OP_VDUP0,
13549 OP_VDUP1,
13550 OP_VDUP2,
13551 OP_VDUP3,
13552 OP_VEXT1,
13553 OP_VEXT2,
13554 OP_VEXT3,
13555 OP_VUZPL, // VUZP, left result
13556 OP_VUZPR, // VUZP, right result
13557 OP_VZIPL, // VZIP, left result
13558 OP_VZIPR, // VZIP, right result
13559 OP_VTRNL, // VTRN, left result
13560 OP_VTRNR, // VTRN, right result
13561 OP_MOVLANE // Move lane. RHSID is the lane to move into
13562 };
13563
13564 if (OpNum == OP_COPY) {
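    // Lane IDs are encoded base 9, one digit per result lane (0-7 for source
    // lanes, 8 for undef); 0123 is the identity copy of LHS and 4567 of RHS.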
13565 if (LHSID == (1 * 9 + 2) * 9 + 3)
13566 return LHS;
13567 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
13568 return RHS;
13569 }
13570
13571 if (OpNum == OP_MOVLANE) {
13572 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
13573 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
13574 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
13575 Elt = 3 - Elt;
13576 while (Elt > 0) {
13577 ID /= 9;
13578 Elt--;
13579 }
13580 return (ID % 9 == 8) ? -1 : ID % 9;
13581 };
13582
13583 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
13584 // get the lane to move from the PFID, which is always from the
13585 // original vectors (V1 or V2).
13586 SDValue OpLHS = GeneratePerfectShuffle(
13587 ID: LHSID, V1, V2, PFEntry: PerfectShuffleTable[LHSID], LHS, RHS, DAG, DL);
13588 EVT VT = OpLHS.getValueType();
13589 assert(RHSID < 8 && "Expected a lane index for RHSID!");
13590 unsigned ExtLane = 0;
13591 SDValue Input;
13592
    // OP_MOVLANE shuffles are either D movs (if bit 0x4 is set) or S movs.
    // D movs operate at twice the element width, so the inputs are bitcast to
    // the wider type below.
13595 if (RHSID & 0x4) {
13596 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
13597 if (MaskElt == -1)
13598 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
13599 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
13600 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
13601 Input = MaskElt < 2 ? V1 : V2;
13602 if (VT.getScalarSizeInBits() == 16) {
13603 Input = DAG.getBitcast(VT: MVT::v2f32, V: Input);
13604 OpLHS = DAG.getBitcast(VT: MVT::v2f32, V: OpLHS);
13605 } else {
13606 assert(VT.getScalarSizeInBits() == 32 &&
13607 "Expected 16 or 32 bit shuffle elements");
13608 Input = DAG.getBitcast(VT: MVT::v2f64, V: Input);
13609 OpLHS = DAG.getBitcast(VT: MVT::v2f64, V: OpLHS);
13610 }
13611 } else {
13612 int MaskElt = getPFIDLane(ID, RHSID);
13613 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
13614 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
13615 Input = MaskElt < 4 ? V1 : V2;
13616 // Be careful about creating illegal types. Use f16 instead of i16.
13617 if (VT == MVT::v4i16) {
13618 Input = DAG.getBitcast(VT: MVT::v4f16, V: Input);
13619 OpLHS = DAG.getBitcast(VT: MVT::v4f16, V: OpLHS);
13620 }
13621 }
13622 SDValue Ext = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL,
13623 VT: Input.getValueType().getVectorElementType(),
13624 N1: Input, N2: DAG.getVectorIdxConstant(Val: ExtLane, DL));
13625 SDValue Ins =
13626 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: Input.getValueType(), N1: OpLHS,
13627 N2: Ext, N3: DAG.getVectorIdxConstant(Val: RHSID & 0x3, DL));
13628 return DAG.getBitcast(VT, V: Ins);
13629 }
13630
13631 SDValue OpLHS, OpRHS;
13632 OpLHS = GeneratePerfectShuffle(ID: LHSID, V1, V2, PFEntry: PerfectShuffleTable[LHSID], LHS,
13633 RHS, DAG, DL);
13634 OpRHS = GeneratePerfectShuffle(ID: RHSID, V1, V2, PFEntry: PerfectShuffleTable[RHSID], LHS,
13635 RHS, DAG, DL);
13636 EVT VT = OpLHS.getValueType();
13637
13638 switch (OpNum) {
13639 default:
13640 llvm_unreachable("Unknown shuffle opcode!");
13641 case OP_VREV:
13642 // VREV divides the vector in half and swaps within the half.
13643 if (VT.getVectorElementType() == MVT::i32 ||
13644 VT.getVectorElementType() == MVT::f32)
13645 return DAG.getNode(Opcode: AArch64ISD::REV64, DL, VT, Operand: OpLHS);
13646 // vrev <4 x i16> -> REV32
13647 if (VT.getVectorElementType() == MVT::i16 ||
13648 VT.getVectorElementType() == MVT::f16 ||
13649 VT.getVectorElementType() == MVT::bf16)
13650 return DAG.getNode(Opcode: AArch64ISD::REV32, DL, VT, Operand: OpLHS);
13651 // vrev <4 x i8> -> REV16
13652 assert(VT.getVectorElementType() == MVT::i8);
13653 return DAG.getNode(Opcode: AArch64ISD::REV16, DL, VT, Operand: OpLHS);
13654 case OP_VDUP0:
13655 case OP_VDUP1:
13656 case OP_VDUP2:
13657 case OP_VDUP3: {
13658 EVT EltTy = VT.getVectorElementType();
13659 unsigned Opcode;
13660 if (EltTy == MVT::i8)
13661 Opcode = AArch64ISD::DUPLANE8;
13662 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
13663 Opcode = AArch64ISD::DUPLANE16;
13664 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
13665 Opcode = AArch64ISD::DUPLANE32;
13666 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
13667 Opcode = AArch64ISD::DUPLANE64;
13668 else
13669 llvm_unreachable("Invalid vector element type?");
13670
13671 if (VT.getSizeInBits() == 64)
13672 OpLHS = WidenVector(V64Reg: OpLHS, DAG);
13673 SDValue Lane = DAG.getConstant(Val: OpNum - OP_VDUP0, DL, VT: MVT::i64);
13674 return DAG.getNode(Opcode, DL, VT, N1: OpLHS, N2: Lane);
13675 }
13676 case OP_VEXT1:
13677 case OP_VEXT2:
13678 case OP_VEXT3: {
13679 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(V&: OpLHS);
13680 return DAG.getNode(Opcode: AArch64ISD::EXT, DL, VT, N1: OpLHS, N2: OpRHS,
13681 N3: DAG.getConstant(Val: Imm, DL, VT: MVT::i32));
13682 }
13683 case OP_VUZPL:
13684 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT, N1: OpLHS, N2: OpRHS);
13685 case OP_VUZPR:
13686 return DAG.getNode(Opcode: AArch64ISD::UZP2, DL, VT, N1: OpLHS, N2: OpRHS);
13687 case OP_VZIPL:
13688 return DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT, N1: OpLHS, N2: OpRHS);
13689 case OP_VZIPR:
13690 return DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT, N1: OpLHS, N2: OpRHS);
13691 case OP_VTRNL:
13692 return DAG.getNode(Opcode: AArch64ISD::TRN1, DL, VT, N1: OpLHS, N2: OpRHS);
13693 case OP_VTRNR:
13694 return DAG.getNode(Opcode: AArch64ISD::TRN2, DL, VT, N1: OpLHS, N2: OpRHS);
13695 }
13696}
13697
13698static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
13699 SelectionDAG &DAG) {
13700 // Check to see if we can use the TBL instruction.
13701 SDValue V1 = Op.getOperand(i: 0);
13702 SDValue V2 = Op.getOperand(i: 1);
13703 SDLoc DL(Op);
13704
13705 EVT EltVT = Op.getValueType().getVectorElementType();
13706 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
13707
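  // If V1 is undef or zero, swap the operands so that the (potentially) live
  // vector becomes the TBL table; the byte indices are rotated by IndexLen
  // below to compensate.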
13708 bool Swap = false;
13709 if (V1.isUndef() || isZerosVector(N: V1.getNode())) {
13710 std::swap(a&: V1, b&: V2);
13711 Swap = true;
13712 }
13713
13714 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
13715 // out of range values with 0s. We do need to make sure that any out-of-range
13716 // values are really out-of-range for a v16i8 vector.
13717 bool IsUndefOrZero = V2.isUndef() || isZerosVector(N: V2.getNode());
13718 MVT IndexVT = MVT::v8i8;
13719 unsigned IndexLen = 8;
13720 if (Op.getValueSizeInBits() == 128) {
13721 IndexVT = MVT::v16i8;
13722 IndexLen = 16;
13723 }
13724
13725 SmallVector<SDValue, 8> TBLMask;
13726 for (int Val : ShuffleMask) {
13727 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
13728 unsigned Offset = Byte + Val * BytesPerElt;
13729 if (Swap)
13730 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
13731 if (IsUndefOrZero && Offset >= IndexLen)
13732 Offset = 255;
13733 TBLMask.push_back(Elt: DAG.getConstant(Val: Offset, DL, VT: MVT::i32));
13734 }
13735 }
13736
13737 SDValue V1Cst = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IndexVT, Operand: V1);
13738 SDValue V2Cst = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IndexVT, Operand: V2);
13739
13740 SDValue Shuffle;
13741 if (IsUndefOrZero) {
13742 if (IndexLen == 8)
13743 V1Cst = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v16i8, N1: V1Cst, N2: V1Cst);
13744 Shuffle = DAG.getNode(
13745 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: IndexVT,
13746 N1: DAG.getConstant(Val: Intrinsic::aarch64_neon_tbl1, DL, VT: MVT::i32), N2: V1Cst,
13747 N3: DAG.getBuildVector(VT: IndexVT, DL, Ops: ArrayRef(TBLMask.data(), IndexLen)));
13748 } else {
13749 if (IndexLen == 8) {
13750 V1Cst = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v16i8, N1: V1Cst, N2: V2Cst);
13751 Shuffle = DAG.getNode(
13752 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: IndexVT,
13753 N1: DAG.getConstant(Val: Intrinsic::aarch64_neon_tbl1, DL, VT: MVT::i32), N2: V1Cst,
13754 N3: DAG.getBuildVector(VT: IndexVT, DL, Ops: ArrayRef(TBLMask.data(), IndexLen)));
13755 } else {
13756 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
13757 // cannot currently represent the register constraints on the input
13758 // table registers.
13759 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
13760 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
13761 // IndexLen));
13762 Shuffle = DAG.getNode(
13763 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: IndexVT,
13764 N1: DAG.getConstant(Val: Intrinsic::aarch64_neon_tbl2, DL, VT: MVT::i32), N2: V1Cst,
13765 N3: V2Cst,
13766 N4: DAG.getBuildVector(VT: IndexVT, DL, Ops: ArrayRef(TBLMask.data(), IndexLen)));
13767 }
13768 }
13769 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op.getValueType(), Operand: Shuffle);
13770}
13771
13772static unsigned getDUPLANEOp(EVT EltType) {
13773 if (EltType == MVT::i8)
13774 return AArch64ISD::DUPLANE8;
13775 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
13776 return AArch64ISD::DUPLANE16;
13777 if (EltType == MVT::i32 || EltType == MVT::f32)
13778 return AArch64ISD::DUPLANE32;
13779 if (EltType == MVT::i64 || EltType == MVT::f64)
13780 return AArch64ISD::DUPLANE64;
13781
13782 llvm_unreachable("Invalid vector element type?");
13783}
13784
13785static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT,
13786 unsigned Opcode, SelectionDAG &DAG) {
13787 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
13788 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
13789 // Match: dup (bitcast (extract_subv X, C)), LaneC
13790 if (BitCast.getOpcode() != ISD::BITCAST ||
13791 BitCast.getOperand(i: 0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
13792 return false;
13793
13794 // The extract index must align in the destination type. That may not
13795 // happen if the bitcast is from narrow to wide type.
13796 SDValue Extract = BitCast.getOperand(i: 0);
13797 unsigned ExtIdx = Extract.getConstantOperandVal(i: 1);
13798 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
13799 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
13800 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
13801 if (ExtIdxInBits % CastedEltBitWidth != 0)
13802 return false;
13803
13804 // Can't handle cases where vector size is not 128-bit
13805 if (!Extract.getOperand(i: 0).getValueType().is128BitVector())
13806 return false;
13807
13808 // Update the lane value by offsetting with the scaled extract index.
13809 LaneC += ExtIdxInBits / CastedEltBitWidth;
13810
13811 // Determine the casted vector type of the wide vector input.
13812 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
13813 // Examples:
13814 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
13815 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
13816 unsigned SrcVecNumElts =
13817 Extract.getOperand(i: 0).getValueSizeInBits() / CastedEltBitWidth;
13818 CastVT = MVT::getVectorVT(VT: BitCast.getSimpleValueType().getScalarType(),
13819 NumElements: SrcVecNumElts);
13820 return true;
13821 };
13822 MVT CastVT;
13823 if (getScaledOffsetDup(V, Lane, CastVT)) {
13824 V = DAG.getBitcast(VT: CastVT, V: V.getOperand(i: 0).getOperand(i: 0));
13825 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13826 V.getOperand(i: 0).getValueType().is128BitVector()) {
13827 // The lane is incremented by the index of the extract.
13828 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
13829 Lane += V.getConstantOperandVal(i: 1);
13830 V = V.getOperand(i: 0);
13831 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
13832 // The lane is decremented if we are splatting from the 2nd operand.
13833 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
13834 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
13835 Lane -= Idx * VT.getVectorNumElements() / 2;
13836 V = WidenVector(V64Reg: V.getOperand(i: Idx), DAG);
13837 } else if (VT.getSizeInBits() == 64) {
13838 // Widen the operand to 128-bit register with undef.
13839 V = WidenVector(V64Reg: V, DAG);
13840 }
13841 return DAG.getNode(Opcode, DL, VT, N1: V, N2: DAG.getConstant(Val: Lane, DL, VT: MVT::i64));
13842}
13843
13844// Try to widen element type to get a new mask value for a better permutation
13845// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
13846// UZP1/2, TRN1/2, REV, INS, etc.
13847// For example:
13848// shufflevector <4 x i32> %a, <4 x i32> %b,
13849// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
13850// is equivalent to:
13851// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
13852// Finally, we can get:
13853// mov v0.d[0], v1.d[1]
13854static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
13855 SDLoc DL(Op);
13856 EVT VT = Op.getValueType();
13857 EVT ScalarVT = VT.getVectorElementType();
13858 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
13859 SDValue V0 = Op.getOperand(i: 0);
13860 SDValue V1 = Op.getOperand(i: 1);
13861 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Val&: Op)->getMask();
13862
  // We combine adjacent elements, like two i16's -> i32 or two i32's -> i64.
  // We need to make sure the wider element type is legal: ElementSize should
  // not be larger than 32 bits, and the i1 type should also be excluded.
13866 if (ElementSize > 32 || ElementSize == 1)
13867 return SDValue();
13868
13869 SmallVector<int, 8> NewMask;
13870 if (widenShuffleMaskElts(M: Mask, NewMask)) {
13871 MVT NewEltVT = VT.isFloatingPoint()
13872 ? MVT::getFloatingPointVT(BitWidth: ElementSize * 2)
13873 : MVT::getIntegerVT(BitWidth: ElementSize * 2);
13874 MVT NewVT = MVT::getVectorVT(VT: NewEltVT, NumElements: VT.getVectorNumElements() / 2);
13875 if (DAG.getTargetLoweringInfo().isTypeLegal(VT: NewVT)) {
13876 V0 = DAG.getBitcast(VT: NewVT, V: V0);
13877 V1 = DAG.getBitcast(VT: NewVT, V: V1);
13878 return DAG.getBitcast(VT,
13879 V: DAG.getVectorShuffle(VT: NewVT, dl: DL, N1: V0, N2: V1, Mask: NewMask));
13880 }
13881 }
13882
13883 return SDValue();
13884}
13885
13886// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
13887static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
13888 ArrayRef<int> ShuffleMask,
13889 SelectionDAG &DAG) {
13890 SDValue Tbl1 = Op->getOperand(Num: 0);
13891 SDValue Tbl2 = Op->getOperand(Num: 1);
13892 SDLoc DL(Op);
13893 SDValue Tbl2ID =
13894 DAG.getTargetConstant(Val: Intrinsic::aarch64_neon_tbl2, DL, VT: MVT::i64);
13895
13896 EVT VT = Op.getValueType();
13897 if (Tbl1.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
13898 Tbl1.getOperand(i: 0) != Tbl2ID ||
13899 Tbl2.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
13900 Tbl2.getOperand(i: 0) != Tbl2ID)
13901 return SDValue();
13902
13903 if (Tbl1.getValueType() != MVT::v16i8 || Tbl2.getValueType() != MVT::v16i8)
13904 return SDValue();
13905
13906 SDValue Mask1 = Tbl1.getOperand(i: 3);
13907 SDValue Mask2 = Tbl2.getOperand(i: 3);
13908 if (Mask1.getOpcode() != ISD::BUILD_VECTOR ||
13909 Mask2.getOpcode() != ISD::BUILD_VECTOR)
13910 return SDValue();
13911
13912 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
13913 for (unsigned I = 0; I < 16; I++) {
13914 if (ShuffleMask[I] < 16)
13915 TBLMaskParts[I] = Mask1.getOperand(i: ShuffleMask[I]);
13916 else {
13917 auto *C = dyn_cast<ConstantSDNode>(Val: Mask2.getOperand(i: ShuffleMask[I] - 16));
13918 if (!C)
13919 return SDValue();
13920 TBLMaskParts[I] = DAG.getConstant(Val: C->getSExtValue() + 32, DL, VT: MVT::i32);
13921 }
13922 }
13923
13924 SDValue TBLMask = DAG.getBuildVector(VT, DL, Ops: TBLMaskParts);
13925 SDValue ID =
13926 DAG.getTargetConstant(Val: Intrinsic::aarch64_neon_tbl4, DL, VT: MVT::i64);
13927
13928 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::v16i8,
13929 Ops: {ID, Tbl1->getOperand(Num: 1), Tbl1->getOperand(Num: 2),
13930 Tbl2->getOperand(Num: 1), Tbl2->getOperand(Num: 2), TBLMask});
13931}
13932
13933// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
13934// but we don't have an appropriate instruction,
13935// so custom-lower it as ZIP1-with-zeros.
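// For example, (zero_extend_vector_inreg v8i16 (v16i8 X)) becomes
// (bitcast v8i16 (ZIP1 v16i8 X, zeroinitializer)), interleaving each source
// byte with a zero byte.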
13936SDValue
13937AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
13938 SelectionDAG &DAG) const {
13939 SDLoc DL(Op);
13940 EVT VT = Op.getValueType();
13941 SDValue SrcOp = Op.getOperand(i: 0);
13942 EVT SrcVT = SrcOp.getValueType();
13943 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
13944 "Unexpected extension factor.");
13945 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
13946 // FIXME: support multi-step zipping?
13947 if (Scale != 2)
13948 return SDValue();
13949 SDValue Zeros = DAG.getConstant(Val: 0, DL, VT: SrcVT);
13950 return DAG.getBitcast(VT,
13951 V: DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: SrcVT, N1: SrcOp, N2: Zeros));
13952}
13953
13954SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
13955 SelectionDAG &DAG) const {
13956 SDLoc DL(Op);
13957 EVT VT = Op.getValueType();
13958
13959 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val: Op.getNode());
13960
13961 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
13962 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
13963
13964 // Convert shuffles that are directly supported on NEON to target-specific
13965 // DAG nodes, instead of keeping them as shuffles and matching them again
13966 // during code selection. This is more efficient and avoids the possibility
13967 // of inconsistencies between legalization and selection.
13968 ArrayRef<int> ShuffleMask = SVN->getMask();
13969
13970 SDValue V1 = Op.getOperand(i: 0);
13971 SDValue V2 = Op.getOperand(i: 1);
13972
13973 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
13974 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
13975 "Unexpected VECTOR_SHUFFLE mask size!");
13976
13977 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
13978 return Res;
13979
13980 if (SVN->isSplat()) {
13981 int Lane = SVN->getSplatIndex();
    // If this is an undef splat, treat it as a splat of lane 0 so that it can
    // still be generated via a plain DUP, if possible.
13983 if (Lane == -1)
13984 Lane = 0;
13985
13986 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
13987 return DAG.getNode(Opcode: AArch64ISD::DUP, DL, VT: V1.getValueType(),
13988 Operand: V1.getOperand(i: 0));
13989 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
13990 // constant. If so, we can just reference the lane's definition directly.
13991 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
13992 !isa<ConstantSDNode>(Val: V1.getOperand(i: Lane)))
13993 return DAG.getNode(Opcode: AArch64ISD::DUP, DL, VT, Operand: V1.getOperand(i: Lane));
13994
13995 // Otherwise, duplicate from the lane of the input vector.
13996 unsigned Opcode = getDUPLANEOp(EltType: V1.getValueType().getVectorElementType());
13997 return constructDup(V: V1, Lane, DL, VT, Opcode, DAG);
13998 }
13999
14000 // Check if the mask matches a DUP for a wider element
14001 for (unsigned LaneSize : {64U, 32U, 16U}) {
14002 unsigned Lane = 0;
14003 if (isWideDUPMask(M: ShuffleMask, VT, BlockSize: LaneSize, DupLaneOp&: Lane)) {
14004 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
14005 : LaneSize == 32 ? AArch64ISD::DUPLANE32
14006 : AArch64ISD::DUPLANE16;
14007 // Cast V1 to an integer vector with required lane size
14008 MVT NewEltTy = MVT::getIntegerVT(BitWidth: LaneSize);
14009 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
14010 MVT NewVecTy = MVT::getVectorVT(VT: NewEltTy, NumElements: NewEltCount);
14011 V1 = DAG.getBitcast(VT: NewVecTy, V: V1);
14012 // Construct the DUP instruction
14013 V1 = constructDup(V: V1, Lane, DL, VT: NewVecTy, Opcode, DAG);
14014 // Cast back to the original type
14015 return DAG.getBitcast(VT, V: V1);
14016 }
14017 }
14018
14019 unsigned NumElts = VT.getVectorNumElements();
14020 unsigned EltSize = VT.getScalarSizeInBits();
14021 if (isREVMask(M: ShuffleMask, EltSize, NumElts, BlockSize: 64))
14022 return DAG.getNode(Opcode: AArch64ISD::REV64, DL, VT: V1.getValueType(), Operand: V1);
14023 if (isREVMask(M: ShuffleMask, EltSize, NumElts, BlockSize: 32))
14024 return DAG.getNode(Opcode: AArch64ISD::REV32, DL, VT: V1.getValueType(), Operand: V1);
14025 if (isREVMask(M: ShuffleMask, EltSize, NumElts, BlockSize: 16))
14026 return DAG.getNode(Opcode: AArch64ISD::REV16, DL, VT: V1.getValueType(), Operand: V1);
14027
14028 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
14029 ShuffleVectorInst::isReverseMask(Mask: ShuffleMask, NumSrcElts: ShuffleMask.size())) {
14030 SDValue Rev = DAG.getNode(Opcode: AArch64ISD::REV64, DL, VT, Operand: V1);
14031 return DAG.getNode(Opcode: AArch64ISD::EXT, DL, VT, N1: Rev, N2: Rev,
14032 N3: DAG.getConstant(Val: 8, DL, VT: MVT::i32));
14033 }
14034
14035 bool ReverseEXT = false;
14036 unsigned Imm;
14037 if (isEXTMask(M: ShuffleMask, VT, ReverseEXT, Imm)) {
14038 if (ReverseEXT)
14039 std::swap(a&: V1, b&: V2);
14040 Imm *= getExtFactor(V&: V1);
14041 return DAG.getNode(Opcode: AArch64ISD::EXT, DL, VT: V1.getValueType(), N1: V1, N2: V2,
14042 N3: DAG.getConstant(Val: Imm, DL, VT: MVT::i32));
14043 } else if (V2->isUndef() && isSingletonEXTMask(M: ShuffleMask, VT, Imm)) {
14044 Imm *= getExtFactor(V&: V1);
14045 return DAG.getNode(Opcode: AArch64ISD::EXT, DL, VT: V1.getValueType(), N1: V1, N2: V1,
14046 N3: DAG.getConstant(Val: Imm, DL, VT: MVT::i32));
14047 }
14048
14049 unsigned WhichResult;
14050 if (isZIPMask(M: ShuffleMask, NumElts, WhichResultOut&: WhichResult)) {
14051 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14052 return DAG.getNode(Opcode: Opc, DL, VT: V1.getValueType(), N1: V1, N2: V2);
14053 }
14054 if (isUZPMask(M: ShuffleMask, NumElts, WhichResultOut&: WhichResult)) {
14055 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14056 return DAG.getNode(Opcode: Opc, DL, VT: V1.getValueType(), N1: V1, N2: V2);
14057 }
14058 if (isTRNMask(M: ShuffleMask, NumElts, WhichResult)) {
14059 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14060 return DAG.getNode(Opcode: Opc, DL, VT: V1.getValueType(), N1: V1, N2: V2);
14061 }
14062
14063 if (isZIP_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
14064 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14065 return DAG.getNode(Opcode: Opc, DL, VT: V1.getValueType(), N1: V1, N2: V1);
14066 }
14067 if (isUZP_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
14068 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14069 return DAG.getNode(Opcode: Opc, DL, VT: V1.getValueType(), N1: V1, N2: V1);
14070 }
14071 if (isTRN_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
14072 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14073 return DAG.getNode(Opcode: Opc, DL, VT: V1.getValueType(), N1: V1, N2: V1);
14074 }
14075
14076 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
14077 return Concat;
14078
14079 bool DstIsLeft;
14080 int Anomaly;
14081 int NumInputElements = V1.getValueType().getVectorNumElements();
14082 if (isINSMask(M: ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
14083 SDValue DstVec = DstIsLeft ? V1 : V2;
14084 SDValue DstLaneV = DAG.getConstant(Val: Anomaly, DL, VT: MVT::i64);
14085
14086 SDValue SrcVec = V1;
14087 int SrcLane = ShuffleMask[Anomaly];
14088 if (SrcLane >= NumInputElements) {
14089 SrcVec = V2;
14090 SrcLane -= NumElts;
14091 }
14092 SDValue SrcLaneV = DAG.getConstant(Val: SrcLane, DL, VT: MVT::i64);
14093
14094 EVT ScalarVT = VT.getVectorElementType();
14095
14096 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
14097 ScalarVT = MVT::i32;
14098
14099 return DAG.getNode(
14100 Opcode: ISD::INSERT_VECTOR_ELT, DL, VT, N1: DstVec,
14101 N2: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ScalarVT, N1: SrcVec, N2: SrcLaneV),
14102 N3: DstLaneV);
14103 }
14104
14105 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
14106 return NewSD;
14107
14108 // If the shuffle is not directly supported and it has 4 elements, use
14109 // the PerfectShuffle-generated table to synthesize it from other shuffles.
14110 if (NumElts == 4) {
14111 unsigned PFIndexes[4];
14112 for (unsigned i = 0; i != 4; ++i) {
14113 if (ShuffleMask[i] < 0)
14114 PFIndexes[i] = 8;
14115 else
14116 PFIndexes[i] = ShuffleMask[i];
14117 }
14118
14119 // Compute the index in the perfect shuffle table.
14120 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
14121 PFIndexes[2] * 9 + PFIndexes[3];
14122 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
14123 return GeneratePerfectShuffle(ID: PFTableIndex, V1, V2, PFEntry, LHS: V1, RHS: V2, DAG,
14124 DL);
14125 }
14126
14127 // Check for a "select shuffle", generating a BSL to pick between lanes in
14128 // V1/V2.
14129 if (ShuffleVectorInst::isSelectMask(Mask: ShuffleMask, NumSrcElts: NumElts)) {
14130 assert(VT.getScalarSizeInBits() <= 32 &&
14131 "Expected larger vector element sizes to be handled already");
14132 SmallVector<SDValue> MaskElts;
14133 for (int M : ShuffleMask)
14134 MaskElts.push_back(Elt: DAG.getConstant(
14135 Val: M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, DL, VT: MVT::i32));
14136 EVT IVT = VT.changeVectorElementTypeToInteger();
14137 SDValue MaskConst = DAG.getBuildVector(VT: IVT, DL, Ops: MaskElts);
14138 return DAG.getBitcast(VT, V: DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT: IVT, N1: MaskConst,
14139 N2: DAG.getBitcast(VT: IVT, V: V1),
14140 N3: DAG.getBitcast(VT: IVT, V: V2)));
14141 }
14142
14143 // Fall back to generating a TBL
14144 return GenerateTBL(Op, ShuffleMask, DAG);
14145}
14146
14147SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
14148 SelectionDAG &DAG) const {
14149 EVT VT = Op.getValueType();
14150
14151 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
14152 return LowerToScalableOp(Op, DAG);
14153
14154 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
14155 "Unexpected vector type!");
14156
14157 // We can handle the constant cases during isel.
14158 if (isa<ConstantSDNode>(Val: Op.getOperand(i: 0)))
14159 return Op;
14160
14161 // There isn't a natural way to handle the general i1 case, so we use some
14162 // trickery with whilelo.
14163 SDLoc DL(Op);
14164 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 0), DL, VT: MVT::i64);
14165 SplatVal = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: MVT::i64, N1: SplatVal,
14166 N2: DAG.getValueType(MVT::i1));
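  // After the sign-extension the splat value is either 0 or all-ones, so
  // whilelo(0, SplatVal) produces either an all-false or an all-true
  // predicate.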
14167 SDValue ID =
14168 DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_whilelo, DL, VT: MVT::i64);
14169 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
14170 if (VT == MVT::nxv1i1)
14171 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::nxv1i1,
14172 N1: DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::nxv2i1, N1: ID,
14173 N2: Zero, N3: SplatVal),
14174 N2: Zero);
14175 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: ID, N2: Zero, N3: SplatVal);
14176}
14177
14178SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
14179 SelectionDAG &DAG) const {
14180 SDLoc DL(Op);
14181
14182 EVT VT = Op.getValueType();
14183 if (!isTypeLegal(VT) || !VT.isScalableVector())
14184 return SDValue();
14185
14186 // Current lowering only supports the SVE-ACLE types.
14187 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
14188 return SDValue();
14189
14190 // The DUPQ operation is independent of element type so normalise to i64s.
14191 SDValue Idx128 = Op.getOperand(i: 2);
14192
14193 // DUPQ can be used when idx is in range.
14194 auto *CIdx = dyn_cast<ConstantSDNode>(Val&: Idx128);
14195 if (CIdx && (CIdx->getZExtValue() <= 3)) {
14196 SDValue CI = DAG.getTargetConstant(Val: CIdx->getZExtValue(), DL, VT: MVT::i64);
14197 return DAG.getNode(Opcode: AArch64ISD::DUPLANE128, DL, VT, N1: Op.getOperand(i: 1), N2: CI);
14198 }
14199
14200 SDValue V = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv2i64, Operand: Op.getOperand(i: 1));
14201
14202 // The ACLE says this must produce the same result as:
14203 // svtbl(data, svadd_x(svptrue_b64(),
14204 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
14205 // index * 2))
14206 SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i64);
14207 SDValue SplatOne = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: MVT::nxv2i64, Operand: One);
14208
14209 // create the vector 0,1,0,1,...
14210 SDValue SV = DAG.getStepVector(DL, ResVT: MVT::nxv2i64);
14211 SV = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::nxv2i64, N1: SV, N2: SplatOne);
14212
14213 // create the vector idx64,idx64+1,idx64,idx64+1,...
14214 SDValue Idx64 = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Idx128, N2: Idx128);
14215 SDValue SplatIdx64 = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: MVT::nxv2i64, Operand: Idx64);
14216 SDValue ShuffleMask = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::nxv2i64, N1: SV, N2: SplatIdx64);
14217
14218 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
14219 SDValue TBL = DAG.getNode(Opcode: AArch64ISD::TBL, DL, VT: MVT::nxv2i64, N1: V, N2: ShuffleMask);
14220 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: TBL);
14221}
14222
14223
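// Extract the splatted constant bits of BVN, replicated across the full vector
// width, into CnstBits; UndefBits is the same pattern with the bits that come
// from undef lanes flipped. Returns false if BVN is not a constant splat.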
14224static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
14225 APInt &UndefBits) {
14226 EVT VT = BVN->getValueType(ResNo: 0);
14227 APInt SplatBits, SplatUndef;
14228 unsigned SplatBitSize;
14229 bool HasAnyUndefs;
14230 if (BVN->isConstantSplat(SplatValue&: SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14231 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
14232
14233 for (unsigned i = 0; i < NumSplats; ++i) {
14234 CnstBits <<= SplatBitSize;
14235 UndefBits <<= SplatBitSize;
14236 CnstBits |= SplatBits.zextOrTrunc(width: VT.getSizeInBits());
14237 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(width: VT.getSizeInBits());
14238 }
14239
14240 return true;
14241 }
14242
14243 return false;
14244}
14245
14246// Try 64-bit splatted SIMD immediate.
14247static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14248 const APInt &Bits) {
14249 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
14250 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
14251 EVT VT = Op.getValueType();
14252 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
14253
14254 if (AArch64_AM::isAdvSIMDModImmType10(Imm: Value)) {
14255 Value = AArch64_AM::encodeAdvSIMDModImmType10(Imm: Value);
14256
14257 SDLoc DL(Op);
14258 SDValue Mov =
14259 DAG.getNode(Opcode: NewOp, DL, VT: MovTy, Operand: DAG.getConstant(Val: Value, DL, VT: MVT::i32));
14260 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: Mov);
14261 }
14262 }
14263
14264 return SDValue();
14265}
14266
14267// Try 32-bit splatted SIMD immediate.
14268static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14269 const APInt &Bits,
14270 const SDValue *LHS = nullptr) {
14271 EVT VT = Op.getValueType();
14272 if (VT.isFixedLengthVector() &&
14273 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14274 return SDValue();
14275
14276 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
14277 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
14278 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14279 bool isAdvSIMDModImm = false;
14280 uint64_t Shift;
14281
14282 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Imm: Value))) {
14283 Value = AArch64_AM::encodeAdvSIMDModImmType1(Imm: Value);
14284 Shift = 0;
14285 }
14286 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Imm: Value))) {
14287 Value = AArch64_AM::encodeAdvSIMDModImmType2(Imm: Value);
14288 Shift = 8;
14289 }
14290 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Imm: Value))) {
14291 Value = AArch64_AM::encodeAdvSIMDModImmType3(Imm: Value);
14292 Shift = 16;
14293 }
14294 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Imm: Value))) {
14295 Value = AArch64_AM::encodeAdvSIMDModImmType4(Imm: Value);
14296 Shift = 24;
14297 }
14298
14299 if (isAdvSIMDModImm) {
14300 SDLoc DL(Op);
14301 SDValue Mov;
14302
14303 if (LHS)
14304 Mov = DAG.getNode(Opcode: NewOp, DL, VT: MovTy,
14305 N1: DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: MovTy, Operand: *LHS),
14306 N2: DAG.getConstant(Val: Value, DL, VT: MVT::i32),
14307 N3: DAG.getConstant(Val: Shift, DL, VT: MVT::i32));
14308 else
14309 Mov =
14310 DAG.getNode(Opcode: NewOp, DL, VT: MovTy, N1: DAG.getConstant(Val: Value, DL, VT: MVT::i32),
14311 N2: DAG.getConstant(Val: Shift, DL, VT: MVT::i32));
14312
14313 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: Mov);
14314 }
14315 }
14316
14317 return SDValue();
14318}
14319
14320// Try 16-bit splatted SIMD immediate.
14321static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14322 const APInt &Bits,
14323 const SDValue *LHS = nullptr) {
14324 EVT VT = Op.getValueType();
14325 if (VT.isFixedLengthVector() &&
14326 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14327 return SDValue();
14328
14329 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
14330 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
14331 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
14332 bool isAdvSIMDModImm = false;
14333 uint64_t Shift;
14334
14335 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Imm: Value))) {
14336 Value = AArch64_AM::encodeAdvSIMDModImmType5(Imm: Value);
14337 Shift = 0;
14338 }
14339 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Imm: Value))) {
14340 Value = AArch64_AM::encodeAdvSIMDModImmType6(Imm: Value);
14341 Shift = 8;
14342 }
14343
14344 if (isAdvSIMDModImm) {
14345 SDLoc DL(Op);
14346 SDValue Mov;
14347
14348 if (LHS)
14349 Mov = DAG.getNode(Opcode: NewOp, DL, VT: MovTy,
14350 N1: DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: MovTy, Operand: *LHS),
14351 N2: DAG.getConstant(Val: Value, DL, VT: MVT::i32),
14352 N3: DAG.getConstant(Val: Shift, DL, VT: MVT::i32));
14353 else
14354 Mov =
14355 DAG.getNode(Opcode: NewOp, DL, VT: MovTy, N1: DAG.getConstant(Val: Value, DL, VT: MVT::i32),
14356 N2: DAG.getConstant(Val: Shift, DL, VT: MVT::i32));
14357
14358 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: Mov);
14359 }
14360 }
14361
14362 return SDValue();
14363}
14364
14365// Try 32-bit splatted SIMD immediate with shifted ones.
14366static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
14367 SelectionDAG &DAG, const APInt &Bits) {
14368 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
14369 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
14370 EVT VT = Op.getValueType();
14371 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14372 bool isAdvSIMDModImm = false;
14373 uint64_t Shift;
14374
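    // Note: the shift values below are not plain LSL amounts; 264 and 272
    // correspond to the MSL #8 and MSL #16 "shifted ones" immediates
    // (AdvSIMDModImmType7/8).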
14375 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Imm: Value))) {
14376 Value = AArch64_AM::encodeAdvSIMDModImmType7(Imm: Value);
14377 Shift = 264;
14378 }
14379 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Imm: Value))) {
14380 Value = AArch64_AM::encodeAdvSIMDModImmType8(Imm: Value);
14381 Shift = 272;
14382 }
14383
14384 if (isAdvSIMDModImm) {
14385 SDLoc DL(Op);
14386 SDValue Mov =
14387 DAG.getNode(Opcode: NewOp, DL, VT: MovTy, N1: DAG.getConstant(Val: Value, DL, VT: MVT::i32),
14388 N2: DAG.getConstant(Val: Shift, DL, VT: MVT::i32));
14389 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: Mov);
14390 }
14391 }
14392
14393 return SDValue();
14394}
14395
14396// Try 8-bit splatted SIMD immediate.
14397static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14398 const APInt &Bits) {
14399 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
14400 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
14401 EVT VT = Op.getValueType();
14402 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
14403
14404 if (AArch64_AM::isAdvSIMDModImmType9(Imm: Value)) {
14405 Value = AArch64_AM::encodeAdvSIMDModImmType9(Imm: Value);
14406
14407 SDLoc DL(Op);
14408 SDValue Mov =
14409 DAG.getNode(Opcode: NewOp, DL, VT: MovTy, Operand: DAG.getConstant(Val: Value, DL, VT: MVT::i32));
14410 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: Mov);
14411 }
14412 }
14413
14414 return SDValue();
14415}
14416
14417// Try FP splatted SIMD immediate.
14418static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14419 const APInt &Bits) {
14420 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
14421 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
14422 EVT VT = Op.getValueType();
14423 bool isWide = (VT.getSizeInBits() == 128);
14424 MVT MovTy;
14425 bool isAdvSIMDModImm = false;
14426
14427 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Imm: Value))) {
14428 Value = AArch64_AM::encodeAdvSIMDModImmType11(Imm: Value);
14429 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
14430 }
14431 else if (isWide &&
14432 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Imm: Value))) {
14433 Value = AArch64_AM::encodeAdvSIMDModImmType12(Imm: Value);
14434 MovTy = MVT::v2f64;
14435 }
14436
14437 if (isAdvSIMDModImm) {
14438 SDLoc DL(Op);
14439 SDValue Mov =
14440 DAG.getNode(Opcode: NewOp, DL, VT: MovTy, Operand: DAG.getConstant(Val: Value, DL, VT: MVT::i32));
14441 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: Mov);
14442 }
14443 }
14444
14445 return SDValue();
14446}
14447
14448// Specialized code to quickly find if PotentialBVec is a BuildVector that
14449// consists only of the same constant int value, which is returned in the
14450// reference arg ConstVal.
14451static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
14452 uint64_t &ConstVal) {
14453 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(Val: PotentialBVec);
14454 if (!Bvec)
14455 return false;
14456 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Val: Bvec->getOperand(Num: 0));
14457 if (!FirstElt)
14458 return false;
14459 EVT VT = Bvec->getValueType(ResNo: 0);
14460 unsigned NumElts = VT.getVectorNumElements();
14461 for (unsigned i = 1; i < NumElts; ++i)
14462 if (dyn_cast<ConstantSDNode>(Val: Bvec->getOperand(Num: i)) != FirstElt)
14463 return false;
14464 ConstVal = FirstElt->getZExtValue();
14465 return true;
14466}
14467
14468static bool isAllInactivePredicate(SDValue N) {
14469 // Look through cast.
14470 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
14471 N = N.getOperand(i: 0);
14472
14473 return ISD::isConstantSplatVectorAllZeros(N: N.getNode());
14474}
14475
14476static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
14477 unsigned NumElts = N.getValueType().getVectorMinNumElements();
14478
14479 // Look through cast.
14480 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
14481 N = N.getOperand(i: 0);
14482 // When reinterpreting from a type with fewer elements the "new" elements
14483 // are not active, so bail if they're likely to be used.
14484 if (N.getValueType().getVectorMinNumElements() < NumElts)
14485 return false;
14486 }
14487
14488 if (ISD::isConstantSplatVectorAllOnes(N: N.getNode()))
14489 return true;
14490
14491 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
14492 // or smaller than the implicit element type represented by N.
14493 // NOTE: A larger element count implies a smaller element type.
14494 if (N.getOpcode() == AArch64ISD::PTRUE &&
14495 N.getConstantOperandVal(i: 0) == AArch64SVEPredPattern::all)
14496 return N.getValueType().getVectorMinNumElements() >= NumElts;
14497
14498 // If we're compiling for a specific vector-length, we can check if the
14499 // pattern's VL equals that of the scalable vector at runtime.
14500 if (N.getOpcode() == AArch64ISD::PTRUE) {
14501 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14502 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
14503 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
14504 if (MaxSVESize && MinSVESize == MaxSVESize) {
14505 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
14506 unsigned PatNumElts =
14507 getNumElementsFromSVEPredPattern(Pattern: N.getConstantOperandVal(i: 0));
14508 return PatNumElts == (NumElts * VScale);
14509 }
14510 }
14511
14512 return false;
14513}
14514
14515// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
14516// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
14517// BUILD_VECTORs with constant element C1, C2 is a constant, and:
14518// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
14519// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
14520// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
14521static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
14522 EVT VT = N->getValueType(ResNo: 0);
14523
14524 if (!VT.isVector())
14525 return SDValue();
14526
14527 SDLoc DL(N);
14528
14529 SDValue And;
14530 SDValue Shift;
14531
14532 SDValue FirstOp = N->getOperand(Num: 0);
14533 unsigned FirstOpc = FirstOp.getOpcode();
14534 SDValue SecondOp = N->getOperand(Num: 1);
14535 unsigned SecondOpc = SecondOp.getOpcode();
14536
14537 // Is one of the operands an AND or a BICi? The AND may have been optimised to
14538 // a BICi in order to use an immediate instead of a register.
14539  // Is the other operand a shl or lshr? This will have been turned into:
14540 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
14541 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
14542 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
14543 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
14544 SecondOpc == AArch64ISD::SHL_PRED ||
14545 SecondOpc == AArch64ISD::SRL_PRED)) {
14546 And = FirstOp;
14547 Shift = SecondOp;
14548
14549 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
14550 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
14551 FirstOpc == AArch64ISD::SHL_PRED ||
14552 FirstOpc == AArch64ISD::SRL_PRED)) {
14553 And = SecondOp;
14554 Shift = FirstOp;
14555 } else
14556 return SDValue();
14557
14558 bool IsAnd = And.getOpcode() == ISD::AND;
14559 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
14560 Shift.getOpcode() == AArch64ISD::SRL_PRED;
14561 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
14562 Shift.getOpcode() == AArch64ISD::SRL_PRED;
14563
14564 // Is the shift amount constant and are all lanes active?
14565 uint64_t C2;
14566 if (ShiftHasPredOp) {
14567 if (!isAllActivePredicate(DAG, N: Shift.getOperand(i: 0)))
14568 return SDValue();
14569 APInt C;
14570 if (!ISD::isConstantSplatVector(N: Shift.getOperand(i: 2).getNode(), SplatValue&: C))
14571 return SDValue();
14572 C2 = C.getZExtValue();
14573 } else if (ConstantSDNode *C2node =
14574 dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1)))
14575 C2 = C2node->getZExtValue();
14576 else
14577 return SDValue();
14578
14579 APInt C1AsAPInt;
14580 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
14581 if (IsAnd) {
14582 // Is the and mask vector all constant?
14583 if (!ISD::isConstantSplatVector(N: And.getOperand(i: 1).getNode(), SplatValue&: C1AsAPInt))
14584 return SDValue();
14585 } else {
14586 // Reconstruct the corresponding AND immediate from the two BICi immediates.
14587 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(Val: And.getOperand(i: 1));
14588 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(Val: And.getOperand(i: 2));
14589 assert(C1nodeImm && C1nodeShift);
14590 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
14591 C1AsAPInt = C1AsAPInt.zextOrTrunc(width: ElemSizeInBits);
14592 }
14593
14594 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
14595 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
14596 // how much one can shift elements of a particular size?
14597 if (C2 > ElemSizeInBits)
14598 return SDValue();
14599
14600 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(numBits: ElemSizeInBits, hiBitsSet: C2)
14601 : APInt::getLowBitsSet(numBits: ElemSizeInBits, loBitsSet: C2);
14602 if (C1AsAPInt != RequiredC1)
14603 return SDValue();
14604
14605 SDValue X = And.getOperand(i: 0);
14606 SDValue Y = ShiftHasPredOp ? Shift.getOperand(i: 1) : Shift.getOperand(i: 0);
14607 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(Val: C2, DL, VT: MVT::i32)
14608 : Shift.getOperand(i: 1);
14609
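  // SLI keeps the low C2 bits of each element of X and inserts (Y << C2) into
  // the rest; SRI keeps the high C2 bits of X and inserts (Y >> C2) below them.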
14610 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
14611 SDValue ResultSLI = DAG.getNode(Opcode: Inst, DL, VT, N1: X, N2: Y, N3: Imm);
14612
14613 return ResultSLI;
14614}
14615
14616SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
14617 SelectionDAG &DAG) const {
14618 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
14619 OverrideNEON: !Subtarget->isNeonAvailable()))
14620 return LowerToScalableOp(Op, DAG);
14621
14622 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
14623 if (SDValue Res = tryLowerToSLI(N: Op.getNode(), DAG))
14624 return Res;
14625
14626 EVT VT = Op.getValueType();
14627 if (VT.isScalableVector())
14628 return Op;
14629
14630 SDValue LHS = Op.getOperand(i: 0);
14631 BuildVectorSDNode *BVN =
14632 dyn_cast<BuildVectorSDNode>(Val: Op.getOperand(i: 1).getNode());
14633 if (!BVN) {
14634 // OR commutes, so try swapping the operands.
14635 LHS = Op.getOperand(i: 1);
14636 BVN = dyn_cast<BuildVectorSDNode>(Val: Op.getOperand(i: 0).getNode());
14637 }
14638 if (!BVN)
14639 return Op;
14640
14641 APInt DefBits(VT.getSizeInBits(), 0);
14642 APInt UndefBits(VT.getSizeInBits(), 0);
14643 if (resolveBuildVector(BVN, CnstBits&: DefBits, UndefBits)) {
14644 SDValue NewOp;
14645
14646 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::ORRi, Op, DAG,
14647 Bits: DefBits, LHS: &LHS)) ||
14648 (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::ORRi, Op, DAG,
14649 Bits: DefBits, LHS: &LHS)))
14650 return NewOp;
14651
14652 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::ORRi, Op, DAG,
14653 Bits: UndefBits, LHS: &LHS)) ||
14654 (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::ORRi, Op, DAG,
14655 Bits: UndefBits, LHS: &LHS)))
14656 return NewOp;
14657 }
14658
14659 // We can always fall back to a non-immediate OR.
14660 return Op;
14661}
14662
14663// Normalize the operands of BUILD_VECTOR. The value of constant operands will
14664// be truncated to fit element width.
14665static SDValue NormalizeBuildVector(SDValue Op,
14666 SelectionDAG &DAG) {
14667 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
14668 SDLoc DL(Op);
14669 EVT VT = Op.getValueType();
14670 EVT EltTy= VT.getVectorElementType();
14671
14672 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
14673 return Op;
14674
14675 SmallVector<SDValue, 16> Ops;
14676 for (SDValue Lane : Op->ops()) {
14677 // For integer vectors, type legalization would have promoted the
14678 // operands already. Otherwise, if Op is a floating-point splat
14679 // (with operands cast to integers), then the only possibilities
14680 // are constants and UNDEFs.
14681 if (auto *CstLane = dyn_cast<ConstantSDNode>(Val&: Lane)) {
14682 Lane = DAG.getConstant(
14683 Val: CstLane->getAPIntValue().trunc(width: EltTy.getSizeInBits()).getZExtValue(),
14684 DL, VT: MVT::i32);
14685 } else if (Lane.getNode()->isUndef()) {
14686 Lane = DAG.getUNDEF(VT: MVT::i32);
14687 } else {
14688 assert(Lane.getValueType() == MVT::i32 &&
14689 "Unexpected BUILD_VECTOR operand type");
14690 }
14691 Ops.push_back(Elt: Lane);
14692 }
14693 return DAG.getBuildVector(VT, DL, Ops);
14694}
14695
14696static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
14697 const AArch64Subtarget *ST) {
14698 EVT VT = Op.getValueType();
14699 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
14700 "Expected a legal NEON vector");
14701
14702 APInt DefBits(VT.getSizeInBits(), 0);
14703 APInt UndefBits(VT.getSizeInBits(), 0);
14704 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Val: Op.getNode());
14705 if (resolveBuildVector(BVN, CnstBits&: DefBits, UndefBits)) {
14706 auto TryMOVIWithBits = [&](APInt DefBits) {
14707 SDValue NewOp;
14708 if ((NewOp =
14709 tryAdvSIMDModImm64(NewOp: AArch64ISD::MOVIedit, Op, DAG, Bits: DefBits)) ||
14710 (NewOp =
14711 tryAdvSIMDModImm32(NewOp: AArch64ISD::MOVIshift, Op, DAG, Bits: DefBits)) ||
14712 (NewOp =
14713 tryAdvSIMDModImm321s(NewOp: AArch64ISD::MOVImsl, Op, DAG, Bits: DefBits)) ||
14714 (NewOp =
14715 tryAdvSIMDModImm16(NewOp: AArch64ISD::MOVIshift, Op, DAG, Bits: DefBits)) ||
14716 (NewOp = tryAdvSIMDModImm8(NewOp: AArch64ISD::MOVI, Op, DAG, Bits: DefBits)) ||
14717 (NewOp = tryAdvSIMDModImmFP(NewOp: AArch64ISD::FMOV, Op, DAG, Bits: DefBits)))
14718 return NewOp;
14719
14720 APInt NotDefBits = ~DefBits;
14721 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::MVNIshift, Op, DAG,
14722 Bits: NotDefBits)) ||
14723 (NewOp = tryAdvSIMDModImm321s(NewOp: AArch64ISD::MVNImsl, Op, DAG,
14724 Bits: NotDefBits)) ||
14725 (NewOp =
14726 tryAdvSIMDModImm16(NewOp: AArch64ISD::MVNIshift, Op, DAG, Bits: NotDefBits)))
14727 return NewOp;
14728 return SDValue();
14729 };
14730 if (SDValue R = TryMOVIWithBits(DefBits))
14731 return R;
14732 if (SDValue R = TryMOVIWithBits(UndefBits))
14733 return R;
14734
14735 // See if a fneg of the constant can be materialized with a MOVI, etc
14736 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
14737 // FNegate each sub-element of the constant
14738 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
14739 APInt Neg = APInt::getHighBitsSet(numBits: FVT.getSizeInBits(), hiBitsSet: 1)
14740 .zext(width: VT.getSizeInBits());
14741 APInt NegBits(VT.getSizeInBits(), 0);
14742 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
14743 for (unsigned i = 0; i < NumElts; i++)
14744 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
14745 NegBits = DefBits ^ NegBits;
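      // NegBits is DefBits with the sign bit of each FVT-sized element flipped.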
14746
14747 // Try to create the new constants with MOVI, and if so generate a fneg
14748 // for it.
14749 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
14750 SDLoc DL(Op);
14751 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(VT: FVT, NumElements: NumElts);
14752 return DAG.getNode(
14753 Opcode: AArch64ISD::NVCAST, DL, VT,
14754 Operand: DAG.getNode(Opcode: ISD::FNEG, DL, VT: VFVT,
14755 Operand: DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: VFVT, Operand: NewOp)));
14756 }
14757 return SDValue();
14758 };
14759 SDValue R;
14760 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
14761 (R = TryWithFNeg(DefBits, MVT::f64)) ||
14762 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
14763 return R;
14764 }
14765
14766 return SDValue();
14767}
14768
14769SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
14770 SDValue Op, SelectionDAG &DAG) const {
14771 EVT VT = Op.getValueType();
14772 SDLoc DL(Op);
14773 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
14774 auto *BVN = cast<BuildVectorSDNode>(Val&: Op);
14775
14776 if (auto SeqInfo = BVN->isConstantSequence()) {
14777 SDValue Start = DAG.getConstant(Val: SeqInfo->first, DL, VT: ContainerVT);
14778 SDValue Steps = DAG.getStepVector(DL, ResVT: ContainerVT, StepVal: SeqInfo->second);
14779 SDValue Seq = DAG.getNode(Opcode: ISD::ADD, DL, VT: ContainerVT, N1: Start, N2: Steps);
14780 return convertFromScalableVector(DAG, VT, V: Seq);
14781 }
14782
14783 unsigned NumElems = VT.getVectorNumElements();
14784 if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
14785 NumElems <= 1 || BVN->isConstant())
14786 return SDValue();
14787
14788 auto IsExtractElt = [](SDValue Op) {
14789 return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
14790 };
14791
14792  // For integer types that are not already in vectors, limit to at most four
14793 // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
14794 if (VT.getScalarType().isInteger() &&
14795 NumElems - count_if(Range: Op->op_values(), P: IsExtractElt) > 4)
14796 return SDValue();
14797
14798 // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
14799 SDValue ZeroI64 = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
14800 SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
14801 C: Op->op_values(), F: [&, Undef = DAG.getUNDEF(VT: ContainerVT)](SDValue Op) {
14802 return Op.isUndef() ? Undef
14803 : DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL,
14804 VT: ContainerVT, N1: Undef, N2: Op, N3: ZeroI64);
14805 });
14806
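  // Each value now occupies lane 0 of its own vector. Repeatedly ZIP1 adjacent
  // pairs, doubling the element granularity each round, until a single vector
  // holds all of the elements in order.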
14807 ElementCount ZipEC = ContainerVT.getVectorElementCount();
14808 while (Intermediates.size() > 1) {
14809 EVT ZipVT = getPackedSVEVectorVT(EC: ZipEC);
14810
14811 for (unsigned I = 0; I < Intermediates.size(); I += 2) {
14812 SDValue Op0 = DAG.getBitcast(VT: ZipVT, V: Intermediates[I + 0]);
14813 SDValue Op1 = DAG.getBitcast(VT: ZipVT, V: Intermediates[I + 1]);
14814 Intermediates[I / 2] =
14815 Op1.isUndef() ? Op0
14816 : DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: ZipVT, N1: Op0, N2: Op1);
14817 }
14818
14819 Intermediates.resize(N: Intermediates.size() / 2);
14820 ZipEC = ZipEC.divideCoefficientBy(RHS: 2);
14821 }
14822
14823 assert(Intermediates.size() == 1);
14824 SDValue Vec = DAG.getBitcast(VT: ContainerVT, V: Intermediates[0]);
14825 return convertFromScalableVector(DAG, VT, V: Vec);
14826}
14827
14828SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
14829 SelectionDAG &DAG) const {
14830 EVT VT = Op.getValueType();
14831
14832 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
14833 cast<BuildVectorSDNode>(Val&: Op)->isConstantSequence();
14834 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON))
14835 return LowerFixedLengthBuildVectorToSVE(Op, DAG);
14836
14837 // Try to build a simple constant vector.
14838 Op = NormalizeBuildVector(Op, DAG);
14839  // This might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
14840  // abort.
14841 if (Op.getOpcode() != ISD::BUILD_VECTOR)
14842 return SDValue();
14843
14844 // Certain vector constants, used to express things like logical NOT and
14845 // arithmetic NEG, are passed through unmodified. This allows special
14846 // patterns for these operations to match, which will lower these constants
14847 // to whatever is proven necessary.
14848 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Val: Op.getNode());
14849 if (BVN->isConstant()) {
14850 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
14851 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
14852 APInt Val(BitSize,
14853 Const->getAPIntValue().zextOrTrunc(width: BitSize).getZExtValue());
14854 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
14855 return Op;
14856 }
14857 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
14858 if (Const->isZero() && !Const->isNegative())
14859 return Op;
14860 }
14861
14862 if (SDValue V = ConstantBuildVector(Op, DAG, ST: Subtarget))
14863 return V;
14864
14865 // Scan through the operands to find some interesting properties we can
14866 // exploit:
14867 // 1) If only one value is used, we can use a DUP, or
14868 // 2) if only the low element is not undef, we can just insert that, or
14869 // 3) if only one constant value is used (w/ some non-constant lanes),
14870 // we can splat the constant value into the whole vector then fill
14871 // in the non-constant lanes.
14872 // 4) FIXME: If different constant values are used, but we can intelligently
14873 // select the values we'll be overwriting for the non-constant
14874 // lanes such that we can directly materialize the vector
14875 // some other way (MOVI, e.g.), we can be sneaky.
14876 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
14877 SDLoc DL(Op);
14878 unsigned NumElts = VT.getVectorNumElements();
14879 bool isOnlyLowElement = true;
14880 bool usesOnlyOneValue = true;
14881 bool usesOnlyOneConstantValue = true;
14882 bool isConstant = true;
14883 bool AllLanesExtractElt = true;
14884 unsigned NumConstantLanes = 0;
14885 unsigned NumDifferentLanes = 0;
14886 unsigned NumUndefLanes = 0;
14887 SDValue Value;
14888 SDValue ConstantValue;
14889 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
14890 unsigned ConsecutiveValCount = 0;
14891 SDValue PrevVal;
14892 for (unsigned i = 0; i < NumElts; ++i) {
14893 SDValue V = Op.getOperand(i);
14894 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14895 AllLanesExtractElt = false;
14896 if (V.isUndef()) {
14897 ++NumUndefLanes;
14898 continue;
14899 }
14900 if (i > 0)
14901 isOnlyLowElement = false;
14902 if (!isIntOrFPConstant(V))
14903 isConstant = false;
14904
14905 if (isIntOrFPConstant(V)) {
14906 ++NumConstantLanes;
14907 if (!ConstantValue.getNode())
14908 ConstantValue = V;
14909 else if (ConstantValue != V)
14910 usesOnlyOneConstantValue = false;
14911 }
14912
14913 if (!Value.getNode())
14914 Value = V;
14915 else if (V != Value) {
14916 usesOnlyOneValue = false;
14917 ++NumDifferentLanes;
14918 }
14919
14920 if (PrevVal != V) {
14921 ConsecutiveValCount = 0;
14922 PrevVal = V;
14923 }
14924
14925    // Keep the different values and their last consecutive counts. For example,
14926 //
14927 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
14928 // t24, t24, t24, t24, t24, t24, t24, t24
14929 // t23 = consecutive count 8
14930 // t24 = consecutive count 8
14931 // ------------------------------------------------------------------
14932 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
14933 // t24, t24, t24, t24, t24, t24, t24, t24
14934 // t23 = consecutive count 5
14935 // t24 = consecutive count 9
14936 DifferentValueMap[V] = ++ConsecutiveValCount;
14937 }
14938
14939 if (!Value.getNode()) {
14940 LLVM_DEBUG(
14941 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
14942 return DAG.getUNDEF(VT);
14943 }
14944
14945 // Convert BUILD_VECTOR where all elements but the lowest are undef into
14946  // SCALAR_TO_VECTOR, except when we have a single-element constant vector,
14947  // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
14948 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(V: Value))) {
14949 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
14950 "SCALAR_TO_VECTOR node\n");
14951 return DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT, Operand: Value);
14952 }
14953
14954 if (AllLanesExtractElt) {
14955 SDNode *Vector = nullptr;
14956 bool Even = false;
14957 bool Odd = false;
14958 // Check whether the extract elements match the Even pattern <0,2,4,...> or
14959 // the Odd pattern <1,3,5,...>.
14960 for (unsigned i = 0; i < NumElts; ++i) {
14961 SDValue V = Op.getOperand(i);
14962 const SDNode *N = V.getNode();
14963 if (!isa<ConstantSDNode>(Val: N->getOperand(Num: 1))) {
14964 Even = false;
14965 Odd = false;
14966 break;
14967 }
14968 SDValue N0 = N->getOperand(Num: 0);
14969
14970 // All elements are extracted from the same vector.
14971 if (!Vector) {
14972 Vector = N0.getNode();
14973 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
14974 // BUILD_VECTOR.
14975 if (VT.getVectorElementType() !=
14976 N0.getValueType().getVectorElementType())
14977 break;
14978 } else if (Vector != N0.getNode()) {
14979 Odd = false;
14980 Even = false;
14981 break;
14982 }
14983
14984 // Extracted values are either at Even indices <0,2,4,...> or at Odd
14985 // indices <1,3,5,...>.
14986 uint64_t Val = N->getConstantOperandVal(Num: 1);
14987 if (Val == 2 * i) {
14988 Even = true;
14989 continue;
14990 }
14991 if (Val - 1 == 2 * i) {
14992 Odd = true;
14993 continue;
14994 }
14995
14996 // Something does not match: abort.
14997 Odd = false;
14998 Even = false;
14999 break;
15000 }
15001 if (Even || Odd) {
15002 SDValue LHS =
15003 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: SDValue(Vector, 0),
15004 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
15005 SDValue RHS =
15006 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: SDValue(Vector, 0),
15007 N2: DAG.getConstant(Val: NumElts, DL, VT: MVT::i64));
15008
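      // LHS and RHS are the low and high halves of the source vector; UZP1
      // then selects the even lanes of the original vector and UZP2 the odd
      // lanes.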
15009 if (Even && !Odd)
15010 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT, N1: LHS, N2: RHS);
15011 if (Odd && !Even)
15012 return DAG.getNode(Opcode: AArch64ISD::UZP2, DL, VT, N1: LHS, N2: RHS);
15013 }
15014 }
15015
15016 // Use DUP for non-constant splats. For f32 constant splats, reduce to
15017 // i32 and try again.
15018 if (usesOnlyOneValue) {
15019 if (!isConstant) {
15020 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15021 Value.getValueType() != VT) {
15022 LLVM_DEBUG(
15023 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
15024 return DAG.getNode(Opcode: AArch64ISD::DUP, DL, VT, Operand: Value);
15025 }
15026
15027      // This is actually a DUPLANExx operation, which keeps everything in vectors.
15028
15029 SDValue Lane = Value.getOperand(i: 1);
15030 Value = Value.getOperand(i: 0);
15031 if (Value.getValueSizeInBits() == 64) {
15032 LLVM_DEBUG(
15033 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
15034 "widening it\n");
15035 Value = WidenVector(V64Reg: Value, DAG);
15036 }
15037
15038 unsigned Opcode = getDUPLANEOp(EltType: VT.getVectorElementType());
15039 return DAG.getNode(Opcode, DL, VT, N1: Value, N2: Lane);
15040 }
15041
15042 if (VT.getVectorElementType().isFloatingPoint()) {
15043 SmallVector<SDValue, 8> Ops;
15044 EVT EltTy = VT.getVectorElementType();
15045 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
15046 EltTy == MVT::f64) && "Unsupported floating-point vector type");
15047 LLVM_DEBUG(
15048 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
15049 "BITCASTS, and try again\n");
15050 MVT NewType = MVT::getIntegerVT(BitWidth: EltTy.getSizeInBits());
15051 for (unsigned i = 0; i < NumElts; ++i)
15052 Ops.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: NewType, Operand: Op.getOperand(i)));
15053 EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: NewType, NumElements: NumElts);
15054 SDValue Val = DAG.getBuildVector(VT: VecVT, DL, Ops);
15055 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
15056 Val.dump(););
15057 Val = LowerBUILD_VECTOR(Op: Val, DAG);
15058 if (Val.getNode())
15059 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Val);
15060 }
15061 }
15062
15063 // If we need to insert a small number of different non-constant elements and
15064 // the vector width is sufficiently large, prefer using DUP with the common
15065 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
15066 // skip the constant lane handling below.
15067 bool PreferDUPAndInsert =
15068 !isConstant && NumDifferentLanes >= 1 &&
15069 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
15070 NumDifferentLanes >= NumConstantLanes;
15071
15072 // If there was only one constant value used and for more than one lane,
15073 // start by splatting that value, then replace the non-constant lanes. This
15074 // is better than the default, which will perform a separate initialization
15075 // for each lane.
15076 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
15077 // Firstly, try to materialize the splat constant.
15078 SDValue Val = DAG.getSplatBuildVector(VT, DL, Op: ConstantValue);
15079 unsigned BitSize = VT.getScalarSizeInBits();
15080 APInt ConstantValueAPInt(1, 0);
15081 if (auto *C = dyn_cast<ConstantSDNode>(Val&: ConstantValue))
15082 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(width: BitSize);
15083 if (!isNullConstant(V: ConstantValue) && !isNullFPConstant(V: ConstantValue) &&
15084 !ConstantValueAPInt.isAllOnes()) {
15085 Val = ConstantBuildVector(Op: Val, DAG, ST: Subtarget);
15086 if (!Val)
15087 // Otherwise, materialize the constant and splat it.
15088 Val = DAG.getNode(Opcode: AArch64ISD::DUP, DL, VT, Operand: ConstantValue);
15089 }
15090
15091 // Now insert the non-constant lanes.
15092 for (unsigned i = 0; i < NumElts; ++i) {
15093 SDValue V = Op.getOperand(i);
15094 SDValue LaneIdx = DAG.getConstant(Val: i, DL, VT: MVT::i64);
15095 if (!isIntOrFPConstant(V))
15096 // Note that type legalization likely mucked about with the VT of the
15097 // source operand, so we may have to convert it here before inserting.
15098 Val = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT, N1: Val, N2: V, N3: LaneIdx);
15099 }
15100 return Val;
15101 }
15102
15103 // This will generate a load from the constant pool.
15104 if (isConstant) {
15105 LLVM_DEBUG(
15106 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
15107 "expansion\n");
15108 return SDValue();
15109 }
15110
15111 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
15112 // v4i32s. This is really a truncate, which we can construct out of (legal)
15113 // concats and truncate nodes.
15114 if (SDValue M = ReconstructTruncateFromBuildVector(V: Op, DAG))
15115 return M;
15116
15117 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
15118 if (NumElts >= 4) {
15119 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
15120 return Shuffle;
15121
15122 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
15123 return Shuffle;
15124 }
15125
15126 if (PreferDUPAndInsert) {
15127 // First, build a constant vector with the common element.
15128 SmallVector<SDValue, 8> Ops(NumElts, Value);
15129 SDValue NewVector = LowerBUILD_VECTOR(Op: DAG.getBuildVector(VT, DL, Ops), DAG);
15130 // Next, insert the elements that do not match the common value.
15131 for (unsigned I = 0; I < NumElts; ++I)
15132 if (Op.getOperand(i: I) != Value)
15133 NewVector =
15134 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT, N1: NewVector,
15135 N2: Op.getOperand(i: I), N3: DAG.getConstant(Val: I, DL, VT: MVT::i64));
15136
15137 return NewVector;
15138 }
15139
15140 // If vector consists of two different values, try to generate two DUPs and
15141 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
15142 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
15143 SmallVector<SDValue, 2> Vals;
15144    // Check whether the consecutive count of each value is half the number of
15145    // vector elements. In that case, we can use CONCAT_VECTORS. For example,
15146 //
15147 // canUseVECTOR_CONCAT = true;
15148 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15149 // t24, t24, t24, t24, t24, t24, t24, t24
15150 //
15151 // canUseVECTOR_CONCAT = false;
15152 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
15153 // t24, t24, t24, t24, t24, t24, t24, t24
15154 bool canUseVECTOR_CONCAT = true;
15155 for (auto Pair : DifferentValueMap) {
15156      // Check that each value's consecutive count is NumElts / 2.
15157 if (Pair.second != NumElts / 2)
15158 canUseVECTOR_CONCAT = false;
15159 Vals.push_back(Elt: Pair.first);
15160 }
15161
15162 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
15163 // CONCAT_VECTORs. For example,
15164 //
15165 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
15166 // t24, t24, t24, t24, t24, t24, t24, t24
15167 // ==>
15168 // t26: v8i8 = AArch64ISD::DUP t23
15169 // t28: v8i8 = AArch64ISD::DUP t24
15170 // t29: v16i8 = concat_vectors t26, t28
15171 if (canUseVECTOR_CONCAT) {
15172 EVT SubVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
15173 if (isTypeLegal(VT: SubVT) && SubVT.isVector() &&
15174 SubVT.getVectorNumElements() >= 2) {
15175 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
15176 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
15177 SDValue DUP1 =
15178 LowerBUILD_VECTOR(Op: DAG.getBuildVector(VT: SubVT, DL, Ops: Ops1), DAG);
15179 SDValue DUP2 =
15180 LowerBUILD_VECTOR(Op: DAG.getBuildVector(VT: SubVT, DL, Ops: Ops2), DAG);
15181 SDValue CONCAT_VECTORS =
15182 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: DUP1, N2: DUP2);
15183 return CONCAT_VECTORS;
15184 }
15185 }
15186
15187 // Let's try to generate VECTOR_SHUFFLE. For example,
15188 //
15189 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
15190 // ==>
15191 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
15192 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
15193 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
15194 if (NumElts >= 8) {
15195 SmallVector<int, 16> MaskVec;
15196      // Build the mask for VECTOR_SHUFFLE.
15197 SDValue FirstLaneVal = Op.getOperand(i: 0);
15198 for (unsigned i = 0; i < NumElts; ++i) {
15199 SDValue Val = Op.getOperand(i);
15200 if (FirstLaneVal == Val)
15201 MaskVec.push_back(Elt: i);
15202 else
15203 MaskVec.push_back(Elt: i + NumElts);
15204 }
15205
15206 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
15207 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
15208 SDValue VEC1 = DAG.getBuildVector(VT, DL, Ops: Ops1);
15209 SDValue VEC2 = DAG.getBuildVector(VT, DL, Ops: Ops2);
15210 SDValue VECTOR_SHUFFLE =
15211 DAG.getVectorShuffle(VT, dl: DL, N1: VEC1, N2: VEC2, Mask: MaskVec);
15212 return VECTOR_SHUFFLE;
15213 }
15214 }
15215
15216 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
15217 // know the default expansion would otherwise fall back on something even
15218  // worse. For a vector with one or two non-undef values, that's
15219  // scalar_to_vector for the elements followed by a shuffle (provided the
15220  // shuffle is valid for the target); for everything else it's
15221  // materialization element by element on the stack followed by a load.
15222 if (!isConstant && !usesOnlyOneValue) {
15223 LLVM_DEBUG(
15224 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
15225 "of INSERT_VECTOR_ELT\n");
15226
15227 SDValue Vec = DAG.getUNDEF(VT);
15228 SDValue Op0 = Op.getOperand(i: 0);
15229 unsigned i = 0;
15230
15231 // Use SCALAR_TO_VECTOR for lane zero to
15232 // a) Avoid a RMW dependency on the full vector register, and
15233 // b) Allow the register coalescer to fold away the copy if the
15234 // value is already in an S or D register, and we're forced to emit an
15235 // INSERT_SUBREG that we can't fold anywhere.
15236 //
15237 // We also allow types like i8 and i16 which are illegal scalar but legal
15238 // vector element types. After type-legalization the inserted value is
15239  // extended (i32) and it is safe to cast it to the vector type by ignoring
15240 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
15241 if (!Op0.isUndef()) {
15242 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
15243 Vec = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT, Operand: Op0);
15244 ++i;
15245 }
15246 LLVM_DEBUG({
15247 if (i < NumElts)
15248 dbgs() << "Creating nodes for the other vector elements:\n";
15249 });
15250 for (; i < NumElts; ++i) {
15251 SDValue V = Op.getOperand(i);
15252 if (V.isUndef())
15253 continue;
15254 SDValue LaneIdx = DAG.getConstant(Val: i, DL, VT: MVT::i64);
15255 Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT, N1: Vec, N2: V, N3: LaneIdx);
15256 }
15257 return Vec;
15258 }
15259
15260 LLVM_DEBUG(
15261 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
15262 "better alternative\n");
15263 return SDValue();
15264}
15265
15266SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
15267 SelectionDAG &DAG) const {
15268 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
15269 OverrideNEON: !Subtarget->isNeonAvailable()))
15270 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
15271
15272 assert(Op.getValueType().isScalableVector() &&
15273 isTypeLegal(Op.getValueType()) &&
15274 "Expected legal scalable vector type!");
15275
15276 if (isTypeLegal(VT: Op.getOperand(i: 0).getValueType())) {
15277 unsigned NumOperands = Op->getNumOperands();
15278 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
15279 "Unexpected number of operands in CONCAT_VECTORS");
15280
15281 if (NumOperands == 2)
15282 return Op;
15283
15284 // Concat each pair of subvectors and pack into the lower half of the array.
15285 SmallVector<SDValue> ConcatOps(Op->ops());
15286 while (ConcatOps.size() > 1) {
15287 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
15288 SDValue V1 = ConcatOps[I];
15289 SDValue V2 = ConcatOps[I + 1];
15290 EVT SubVT = V1.getValueType();
15291 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
15292 ConcatOps[I / 2] =
15293 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT: PairVT, N1: V1, N2: V2);
15294 }
15295 ConcatOps.resize(N: ConcatOps.size() / 2);
15296 }
15297 return ConcatOps[0];
15298 }
15299
15300 return SDValue();
15301}
15302
15303SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15304 SelectionDAG &DAG) const {
15305 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
15306
15307 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
15308 OverrideNEON: !Subtarget->isNeonAvailable()))
15309 return LowerFixedLengthInsertVectorElt(Op, DAG);
15310
15311 EVT VT = Op.getOperand(i: 0).getValueType();
15312
15313 if (VT.getScalarType() == MVT::i1) {
15314 EVT VectorVT = getPromotedVTForPredicate(VT);
15315 SDLoc DL(Op);
15316 SDValue ExtendedVector =
15317 DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 0), DL, VT: VectorVT);
15318 SDValue ExtendedValue =
15319 DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 1), DL,
15320 VT: VectorVT.getScalarType().getSizeInBits() < 32
15321 ? MVT::i32
15322 : VectorVT.getScalarType());
15323 ExtendedVector =
15324 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: VectorVT, N1: ExtendedVector,
15325 N2: ExtendedValue, N3: Op.getOperand(i: 2));
15326 return DAG.getAnyExtOrTrunc(Op: ExtendedVector, DL, VT);
15327 }
15328
15329 // Check for non-constant or out of range lane.
15330 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
15331 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15332 return SDValue();
15333
15334 return Op;
15335}
15336
15337SDValue
15338AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
15339 SelectionDAG &DAG) const {
15340 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
15341 EVT VT = Op.getOperand(i: 0).getValueType();
15342
15343 if (VT.getScalarType() == MVT::i1) {
15344 // We can't directly extract from an SVE predicate; extend it first.
15345 // (This isn't the only possible lowering, but it's straightforward.)
15346 EVT VectorVT = getPromotedVTForPredicate(VT);
15347 SDLoc DL(Op);
15348 SDValue Extend =
15349 DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VectorVT, Operand: Op.getOperand(i: 0));
15350 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
15351 SDValue Extract = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ExtractTy,
15352 N1: Extend, N2: Op.getOperand(i: 1));
15353 return DAG.getAnyExtOrTrunc(Op: Extract, DL, VT: Op.getValueType());
15354 }
15355
15356 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
15357 return LowerFixedLengthExtractVectorElt(Op, DAG);
15358
15359 // Check for non-constant or out of range lane.
15360 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1));
15361 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15362 return SDValue();
15363
15364 // Insertion/extraction are legal for V128 types.
15365 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
15366 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
15367 VT == MVT::v8f16 || VT == MVT::v8bf16)
15368 return Op;
15369
15370 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
15371 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
15372 VT != MVT::v4bf16)
15373 return SDValue();
15374
15375  // For V64 types, we perform extraction by expanding the value
15376  // to a V128 type and performing the extraction on that.
15377 SDLoc DL(Op);
15378 SDValue WideVec = WidenVector(V64Reg: Op.getOperand(i: 0), DAG);
15379 EVT WideTy = WideVec.getValueType();
15380
15381 EVT ExtrTy = WideTy.getVectorElementType();
15382 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
15383 ExtrTy = MVT::i32;
15384
15385 // For extractions, we just return the result directly.
15386 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ExtrTy, N1: WideVec,
15387 N2: Op.getOperand(i: 1));
15388}
15389
15390SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
15391 SelectionDAG &DAG) const {
15392 EVT VT = Op.getValueType();
15393 assert(VT.isFixedLengthVector() &&
15394 "Only cases that extract a fixed length vector are supported!");
15395 EVT InVT = Op.getOperand(i: 0).getValueType();
15396
15397  // If we don't have legal types yet, do nothing.
15398 if (!isTypeLegal(VT: InVT))
15399 return SDValue();
15400
15401 if (InVT.is128BitVector()) {
15402 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
15403 unsigned Idx = Op.getConstantOperandVal(i: 1);
15404
15405 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
15406 if (Idx == 0)
15407 return Op;
15408
15409 // If this is extracting the upper 64-bits of a 128-bit vector, we match
15410 // that directly.
15411 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
15412 return Op;
15413 }
15414
15415 if (InVT.isScalableVector() ||
15416 useSVEForFixedLengthVectorVT(VT: InVT, OverrideNEON: !Subtarget->isNeonAvailable())) {
15417 SDLoc DL(Op);
15418 SDValue Vec = Op.getOperand(i: 0);
15419 SDValue Idx = Op.getOperand(i: 1);
15420
15421 EVT PackedVT = getPackedSVEVectorVT(VT: InVT.getVectorElementType());
15422 if (PackedVT != InVT) {
15423 // Pack input into the bottom part of an SVE register and try again.
15424 SDValue Container = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: PackedVT,
15425 N1: DAG.getUNDEF(VT: PackedVT), N2: Vec,
15426 N3: DAG.getVectorIdxConstant(Val: 0, DL));
15427 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Container, N2: Idx);
15428 }
15429
15430 // This will get matched by custom code during ISelDAGToDAG.
15431 if (isNullConstant(V: Idx))
15432 return Op;
15433
15434 assert(InVT.isScalableVector() && "Unexpected vector type!");
15435 // Move requested subvector to the start of the vector and try again.
15436 SDValue Splice = DAG.getNode(Opcode: ISD::VECTOR_SPLICE, DL, VT: InVT, N1: Vec, N2: Vec, N3: Idx);
15437 return convertFromScalableVector(DAG, VT, V: Splice);
15438 }
15439
15440 return SDValue();
15441}
15442
15443SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
15444 SelectionDAG &DAG) const {
15445 assert(Op.getValueType().isScalableVector() &&
15446 "Only expect to lower inserts into scalable vectors!");
15447
15448 EVT InVT = Op.getOperand(i: 1).getValueType();
15449 unsigned Idx = Op.getConstantOperandVal(i: 2);
15450
15451 SDValue Vec0 = Op.getOperand(i: 0);
15452 SDValue Vec1 = Op.getOperand(i: 1);
15453 SDLoc DL(Op);
15454 EVT VT = Op.getValueType();
15455
15456 if (InVT.isScalableVector()) {
15457 if (!isTypeLegal(VT))
15458 return SDValue();
15459
15460 // Break down insert_subvector into simpler parts.
15461 if (VT.getVectorElementType() == MVT::i1) {
15462 unsigned NumElts = VT.getVectorMinNumElements();
15463 EVT HalfVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
15464
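      // Split the predicate into two halves, insert Vec1 into whichever half
      // contains the index, then concatenate the halves back together.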
15465 SDValue Lo, Hi;
15466 Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: Vec0,
15467 N2: DAG.getVectorIdxConstant(Val: 0, DL));
15468 Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: Vec0,
15469 N2: DAG.getVectorIdxConstant(Val: NumElts / 2, DL));
15470 if (Idx < (NumElts / 2))
15471 Lo = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: HalfVT, N1: Lo, N2: Vec1,
15472 N3: DAG.getVectorIdxConstant(Val: Idx, DL));
15473 else
15474 Hi = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: HalfVT, N1: Hi, N2: Vec1,
15475 N3: DAG.getVectorIdxConstant(Val: Idx - (NumElts / 2), DL));
15476
15477 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: Lo, N2: Hi);
15478 }
15479
15480 // We can select these directly.
15481 if (isTypeLegal(VT: InVT) && Vec0.isUndef())
15482 return Op;
15483
15484 // Ensure the subvector is half the size of the main vector.
15485 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
15486 return SDValue();
15487
15488    // Here narrow and wide refer to the vector element types. After "casting",
15489    // both vectors must have the same bit length, so because the subvector has
15490    // fewer elements, those elements need to be bigger.
15491 EVT NarrowVT = getPackedSVEVectorVT(EC: VT.getVectorElementCount());
15492 EVT WideVT = getPackedSVEVectorVT(EC: InVT.getVectorElementCount());
15493
15494 // NOP cast operands to the largest legal vector of the same element count.
15495 if (VT.isFloatingPoint()) {
15496 Vec0 = getSVESafeBitCast(VT: NarrowVT, Op: Vec0, DAG);
15497 Vec1 = getSVESafeBitCast(VT: NarrowVT, Op: Vec1, DAG);
15498 } else {
15499      // Legal integer vectors are already their largest, so Vec0 is fine as is.
15500 Vec1 = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: WideVT, Operand: Vec1);
15501 Vec1 = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: NarrowVT, Operand: Vec1);
15502 }
15503
15504 // To replace the top/bottom half of vector V with vector SubV we widen the
15505 // preserved half of V, concatenate this to SubV (the order depending on the
15506 // half being replaced) and then narrow the result.
15507 SDValue Narrow;
15508 if (Idx == 0) {
15509 SDValue HiVec0 = DAG.getNode(Opcode: AArch64ISD::UUNPKHI, DL, VT: WideVT, Operand: Vec0);
15510 HiVec0 = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: NarrowVT, Operand: HiVec0);
15511 Narrow = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: NarrowVT, N1: Vec1, N2: HiVec0);
15512 } else {
15513 assert(Idx == InVT.getVectorMinNumElements() &&
15514 "Invalid subvector index!");
15515 SDValue LoVec0 = DAG.getNode(Opcode: AArch64ISD::UUNPKLO, DL, VT: WideVT, Operand: Vec0);
15516 LoVec0 = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: NarrowVT, Operand: LoVec0);
15517 Narrow = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: NarrowVT, N1: LoVec0, N2: Vec1);
15518 }
15519
15520 return getSVESafeBitCast(VT, Op: Narrow, DAG);
15521 }
15522
15523 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
15524 // This will be matched by custom code during ISelDAGToDAG.
15525 if (Vec0.isUndef())
15526 return Op;
15527
15528 std::optional<unsigned> PredPattern =
15529 getSVEPredPatternFromNumElements(MinNumElts: InVT.getVectorNumElements());
15530 auto PredTy = VT.changeVectorElementType(EltVT: MVT::i1);
15531 SDValue PTrue = getPTrue(DAG, DL, VT: PredTy, Pattern: *PredPattern);
15532 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, V: Vec1);
15533 return DAG.getNode(Opcode: ISD::VSELECT, DL, VT, N1: PTrue, N2: ScalableVec1, N3: Vec0);
15534 }
15535
15536 return SDValue();
15537}
15538
15539static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
15540 if (Op.getOpcode() != AArch64ISD::DUP &&
15541 Op.getOpcode() != ISD::SPLAT_VECTOR &&
15542 Op.getOpcode() != ISD::BUILD_VECTOR)
15543 return false;
15544
15545 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
15546 !isAllConstantBuildVector(PotentialBVec: Op, ConstVal&: SplatVal))
15547 return false;
15548
15549 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
15550 !isa<ConstantSDNode>(Val: Op->getOperand(Num: 0)))
15551 return false;
15552
15553 SplatVal = Op->getConstantOperandVal(Num: 0);
15554 if (Op.getValueType().getVectorElementType() != MVT::i64)
15555 SplatVal = (int32_t)SplatVal;
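  // getConstantOperandVal zero-extends; for non-i64 element types, sign-extend
  // the constant from 32 bits so negative powers of two are recognised below.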
15556
15557 Negated = false;
15558 if (isPowerOf2_64(Value: SplatVal))
15559 return true;
15560
15561 Negated = true;
15562 if (isPowerOf2_64(Value: -SplatVal)) {
15563 SplatVal = -SplatVal;
15564 return true;
15565 }
15566
15567 return false;
15568}
15569
15570SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
15571 EVT VT = Op.getValueType();
15572 SDLoc DL(Op);
15573
15574 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
15575 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
15576
15577 assert(VT.isScalableVector() && "Expected a scalable vector.");
15578
15579 bool Signed = Op.getOpcode() == ISD::SDIV;
15580 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
15581
15582 bool Negated;
15583 uint64_t SplatVal;
15584 if (Signed && isPow2Splat(Op: Op.getOperand(i: 1), SplatVal, Negated)) {
15585 SDValue Pg = getPredicateForScalableVector(DAG, DL, VT);
15586 SDValue Res =
15587 DAG.getNode(Opcode: AArch64ISD::SRAD_MERGE_OP1, DL, VT, N1: Pg, N2: Op->getOperand(Num: 0),
15588 N3: DAG.getTargetConstant(Val: Log2_64(Value: SplatVal), DL, VT: MVT::i32));
15589 if (Negated)
15590 Res = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT), N2: Res);
15591
15592 return Res;
15593 }
15594
15595 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
15596 return LowerToPredicatedOp(Op, DAG, NewOp: PredOpcode);
15597
15598 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
15599 // operations, and truncate the result.
15600 EVT WidenedVT;
15601 if (VT == MVT::nxv16i8)
15602 WidenedVT = MVT::nxv8i16;
15603 else if (VT == MVT::nxv8i16)
15604 WidenedVT = MVT::nxv4i32;
15605 else
15606 llvm_unreachable("Unexpected Custom DIV operation");
15607
15608 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
15609 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
15610 SDValue Op0Lo = DAG.getNode(Opcode: UnpkLo, DL, VT: WidenedVT, Operand: Op.getOperand(i: 0));
15611 SDValue Op1Lo = DAG.getNode(Opcode: UnpkLo, DL, VT: WidenedVT, Operand: Op.getOperand(i: 1));
15612 SDValue Op0Hi = DAG.getNode(Opcode: UnpkHi, DL, VT: WidenedVT, Operand: Op.getOperand(i: 0));
15613 SDValue Op1Hi = DAG.getNode(Opcode: UnpkHi, DL, VT: WidenedVT, Operand: Op.getOperand(i: 1));
15614 SDValue ResultLo = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: WidenedVT, N1: Op0Lo, N2: Op1Lo);
15615 SDValue ResultHi = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: WidenedVT, N1: Op0Hi, N2: Op1Hi);
15616 SDValue ResultLoCast = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: ResultLo);
15617 SDValue ResultHiCast = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: ResultHi);
15618 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT, N1: ResultLoCast, N2: ResultHiCast);
15619}
15620
15621bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
15622 EVT VT, unsigned DefinedValues) const {
15623 if (!Subtarget->isNeonAvailable())
15624 return false;
15625 return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
15626}
15627
15628bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
15629 // Currently no fixed length shuffles that require SVE are legal.
15630 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
15631 return false;
15632
15633 if (VT.getVectorNumElements() == 4 &&
15634 (VT.is128BitVector() || VT.is64BitVector())) {
15635 unsigned Cost = getPerfectShuffleCost(M);
15636 if (Cost <= 1)
15637 return true;
15638 }
15639
15640 bool DummyBool;
15641 int DummyInt;
15642 unsigned DummyUnsigned;
15643
15644 unsigned EltSize = VT.getScalarSizeInBits();
15645 unsigned NumElts = VT.getVectorNumElements();
15646 return (ShuffleVectorSDNode::isSplatMask(Mask: M) ||
15647 isREVMask(M, EltSize, NumElts, BlockSize: 64) ||
15648 isREVMask(M, EltSize, NumElts, BlockSize: 32) ||
15649 isREVMask(M, EltSize, NumElts, BlockSize: 16) ||
15650 isEXTMask(M, VT, ReverseEXT&: DummyBool, Imm&: DummyUnsigned) ||
15651 isTRNMask(M, NumElts, WhichResult&: DummyUnsigned) ||
15652 isUZPMask(M, NumElts, WhichResultOut&: DummyUnsigned) ||
15653 isZIPMask(M, NumElts, WhichResultOut&: DummyUnsigned) ||
15654 isTRN_v_undef_Mask(M, VT, WhichResult&: DummyUnsigned) ||
15655 isUZP_v_undef_Mask(M, VT, WhichResult&: DummyUnsigned) ||
15656 isZIP_v_undef_Mask(M, VT, WhichResult&: DummyUnsigned) ||
15657 isINSMask(M, NumInputElements: NumElts, DstIsLeft&: DummyBool, Anomaly&: DummyInt) ||
15658 isConcatMask(Mask: M, VT, SplitLHS: VT.getSizeInBits() == 128));
15659}
15660
15661bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
15662 EVT VT) const {
15663 // Just delegate to the generic legality, clear masks aren't special.
15664 return isShuffleMaskLegal(M, VT);
15665}
15666
15667/// getVShiftImm - Check if this is a valid build_vector for the immediate
15668/// operand of a vector shift operation, where all the elements of the
15669/// build_vector must have the same constant integer value.
15670static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
15671 // Ignore bit_converts.
15672 while (Op.getOpcode() == ISD::BITCAST)
15673 Op = Op.getOperand(i: 0);
15674 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: Op.getNode());
15675 APInt SplatBits, SplatUndef;
15676 unsigned SplatBitSize;
15677 bool HasAnyUndefs;
15678 if (!BVN || !BVN->isConstantSplat(SplatValue&: SplatBits, SplatUndef, SplatBitSize,
15679 HasAnyUndefs, MinSplatBits: ElementBits) ||
15680 SplatBitSize > ElementBits)
15681 return false;
15682 Cnt = SplatBits.getSExtValue();
15683 return true;
15684}
15685
15686/// isVShiftLImm - Check if this is a valid build_vector for the immediate
15687/// operand of a vector shift left operation. That value must be in the range:
15688/// 0 <= Value < ElementBits for a left shift; or
15689/// 0 <= Value <= ElementBits for a long left shift.
15690static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
15691 assert(VT.isVector() && "vector shift count is not a vector type");
15692 int64_t ElementBits = VT.getScalarSizeInBits();
15693 if (!getVShiftImm(Op, ElementBits, Cnt))
15694 return false;
15695 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
15696}
15697
15698/// isVShiftRImm - Check if this is a valid build_vector for the immediate
15699/// operand of a vector shift right operation. The value must be in the range:
15700/// 1 <= Value <= ElementBits for a right shift; or
/// 1 <= Value <= ElementBits/2 for a narrow right shift.
15701static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
15702 assert(VT.isVector() && "vector shift count is not a vector type");
15703 int64_t ElementBits = VT.getScalarSizeInBits();
15704 if (!getVShiftImm(Op, ElementBits, Cnt))
15705 return false;
15706 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
15707}
15708
15709SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
15710 SelectionDAG &DAG) const {
15711 EVT VT = Op.getValueType();
15712
15713 if (VT.getScalarType() == MVT::i1) {
15714 // Lower i1 truncate to `(x & 1) != 0`.
15715 SDLoc DL(Op);
15716 EVT OpVT = Op.getOperand(i: 0).getValueType();
15717 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: OpVT);
15718 SDValue One = DAG.getConstant(Val: 1, DL, VT: OpVT);
15719 SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT: OpVT, N1: Op.getOperand(i: 0), N2: One);
15720 return DAG.getSetCC(DL, VT, LHS: And, RHS: Zero, Cond: ISD::SETNE);
15721 }
15722
15723 if (!VT.isVector() || VT.isScalableVector())
15724 return SDValue();
15725
15726 if (useSVEForFixedLengthVectorVT(VT: Op.getOperand(i: 0).getValueType(),
15727 OverrideNEON: !Subtarget->isNeonAvailable()))
15728 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
15729
15730 return SDValue();
15731}
15732
15733// Check if we can lower this SRL to a rounding shift instruction. ResVT is
15734// possibly a truncated type; it tells how many bits of the value are to be
15735// used.
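// The pattern matched is:
//   srl (add X, (1 << (ShiftValue - 1))), ShiftValue
// i.e. X plus half of 2^ShiftValue followed by the shift, which is exactly
// what a rounding shift right (e.g. SVE2's URSHR) computes in one operation.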
15736static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
15737 SelectionDAG &DAG,
15738 unsigned &ShiftValue,
15739 SDValue &RShOperand) {
15740 if (Shift->getOpcode() != ISD::SRL)
15741 return false;
15742
15743 EVT VT = Shift.getValueType();
15744 assert(VT.isScalableVT());
15745
15746 auto ShiftOp1 =
15747 dyn_cast_or_null<ConstantSDNode>(Val: DAG.getSplatValue(V: Shift->getOperand(Num: 1)));
15748 if (!ShiftOp1)
15749 return false;
15750
15751 ShiftValue = ShiftOp1->getZExtValue();
15752 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
15753 return false;
15754
15755 SDValue Add = Shift->getOperand(Num: 0);
15756 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
15757 return false;
15758
15759 assert(ResVT.getScalarSizeInBits() <= VT.getScalarSizeInBits() &&
15760 "ResVT must be truncated or same type as the shift.");
15761 // Check if an overflow can lead to incorrect results.
15762 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
15763 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
15764 return false;
15765
15766 auto AddOp1 =
15767 dyn_cast_or_null<ConstantSDNode>(Val: DAG.getSplatValue(V: Add->getOperand(Num: 1)));
15768 if (!AddOp1)
15769 return false;
15770 uint64_t AddValue = AddOp1->getZExtValue();
15771 if (AddValue != 1ULL << (ShiftValue - 1))
15772 return false;
15773
15774 RShOperand = Add->getOperand(Num: 0);
15775 return true;
15776}
15777
15778SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
15779 SelectionDAG &DAG) const {
15780 EVT VT = Op.getValueType();
15781 SDLoc DL(Op);
15782 int64_t Cnt;
15783
15784 if (!Op.getOperand(i: 1).getValueType().isVector())
15785 return Op;
15786 unsigned EltSize = VT.getScalarSizeInBits();
15787
15788 switch (Op.getOpcode()) {
15789 case ISD::SHL:
15790 if (VT.isScalableVector() ||
15791 useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
15792 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SHL_PRED);
15793
15794 if (isVShiftLImm(Op: Op.getOperand(i: 1), VT, isLong: false, Cnt) && Cnt < EltSize)
15795 return DAG.getNode(Opcode: AArch64ISD::VSHL, DL, VT, N1: Op.getOperand(i: 0),
15796 N2: DAG.getConstant(Val: Cnt, DL, VT: MVT::i32));
15797 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT,
15798 N1: DAG.getConstant(Val: Intrinsic::aarch64_neon_ushl, DL,
15799 VT: MVT::i32),
15800 N2: Op.getOperand(i: 0), N3: Op.getOperand(i: 1));
15801 case ISD::SRA:
15802 case ISD::SRL:
15803 if (VT.isScalableVector() &&
15804 (Subtarget->hasSVE2() ||
15805 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
15806 SDValue RShOperand;
15807 unsigned ShiftValue;
15808 if (canLowerSRLToRoundingShiftForVT(Shift: Op, ResVT: VT, DAG, ShiftValue, RShOperand))
15809 return DAG.getNode(Opcode: AArch64ISD::URSHR_I_PRED, DL, VT,
15810 N1: getPredicateForVector(DAG, DL, VT), N2: RShOperand,
15811 N3: DAG.getTargetConstant(Val: ShiftValue, DL, VT: MVT::i32));
15812 }
15813
15814 if (VT.isScalableVector() ||
15815 useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) {
15816 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
15817 : AArch64ISD::SRL_PRED;
15818 return LowerToPredicatedOp(Op, DAG, NewOp: Opc);
15819 }
15820
15821 // Right shift immediate
15822 if (isVShiftRImm(Op: Op.getOperand(i: 1), VT, isNarrow: false, Cnt) && Cnt < EltSize) {
15823 unsigned Opc =
15824 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
15825 return DAG.getNode(Opcode: Opc, DL, VT, N1: Op.getOperand(i: 0),
15826 N2: DAG.getConstant(Val: Cnt, DL, VT: MVT::i32), Flags: Op->getFlags());
15827 }
15828
15829    // Right shift register. Note that there is no shift-right-by-register
15830    // instruction; instead, the shift-left-by-register instruction takes a
15831    // signed value, where negative amounts specify a right shift.
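    // For example, 'x >> y' becomes '[s|u]shl x, (sub 0, y)'.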
15832 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
15833 : Intrinsic::aarch64_neon_ushl;
15834    // Negate the shift amount.
15835 SDValue NegShift = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT),
15836 N2: Op.getOperand(i: 1));
15837 SDValue NegShiftLeft =
15838 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT,
15839 N1: DAG.getConstant(Val: Opc, DL, VT: MVT::i32), N2: Op.getOperand(i: 0),
15840 N3: NegShift);
15841 return NegShiftLeft;
15842 }
15843
15844 llvm_unreachable("unexpected shift opcode");
15845}
15846
15847SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
15848 SelectionDAG &DAG) const {
15849 if (Op.getValueType().isScalableVector())
15850 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SETCC_MERGE_ZERO);
15851
15852 if (useSVEForFixedLengthVectorVT(VT: Op.getOperand(i: 0).getValueType(),
15853 OverrideNEON: !Subtarget->isNeonAvailable()))
15854 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
15855
15856 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
15857 SDValue LHS = Op.getOperand(i: 0);
15858 SDValue RHS = Op.getOperand(i: 1);
15859 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
15860 SDLoc DL(Op);
15861
15862 if (LHS.getValueType().getVectorElementType().isInteger())
15863 return Op;
15864
15865 assert(((!Subtarget->hasFullFP16() &&
15866 LHS.getValueType().getVectorElementType() != MVT::f16) ||
15867 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
15868 LHS.getValueType().getVectorElementType() != MVT::f128) &&
15869 "Unexpected type!");
15870
15871 // Lower isnan(x) | isnan(never-nan) to x != x.
15872 // Lower !isnan(x) & !isnan(never-nan) to x == x.
15873 if (CC == ISD::SETUO || CC == ISD::SETO) {
15874 bool OneNaN = false;
15875 if (LHS == RHS) {
15876 OneNaN = true;
15877 } else if (DAG.isKnownNeverNaN(Op: RHS)) {
15878 OneNaN = true;
15879 RHS = LHS;
15880 } else if (DAG.isKnownNeverNaN(Op: LHS)) {
15881 OneNaN = true;
15882 LHS = RHS;
15883 }
15884 if (OneNaN) {
15885 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
15886 }
15887 }
15888
15889 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
15890 // clean. Some of them require two branches to implement.
15891 AArch64CC::CondCode CC1, CC2;
15892 bool ShouldInvert;
15893 changeVectorFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2, Invert&: ShouldInvert);
15894
15895 bool NoNaNs =
15896 getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
15897 SDValue Cmp = emitVectorComparison(LHS, RHS, CC: CC1, NoNans: NoNaNs, VT: CmpVT, DL, DAG);
15898 if (!Cmp.getNode())
15899 return SDValue();
15900
15901 if (CC2 != AArch64CC::AL) {
15902 SDValue Cmp2 = emitVectorComparison(LHS, RHS, CC: CC2, NoNans: NoNaNs, VT: CmpVT, DL, DAG);
15903 if (!Cmp2.getNode())
15904 return SDValue();
15905
15906 Cmp = DAG.getNode(Opcode: ISD::OR, DL, VT: CmpVT, N1: Cmp, N2: Cmp2);
15907 }
15908
15909 Cmp = DAG.getSExtOrTrunc(Op: Cmp, DL, VT: Op.getValueType());
15910
15911 if (ShouldInvert)
15912 Cmp = DAG.getNOT(DL, Val: Cmp, VT: Cmp.getValueType());
15913
15914 return Cmp;
15915}
15916
15917static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
15918 SelectionDAG &DAG) {
15919 SDValue VecOp = ScalarOp.getOperand(i: 0);
15920 auto Rdx = DAG.getNode(Opcode: Op, DL, VT: VecOp.getSimpleValueType(), Operand: VecOp);
15921 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ScalarOp.getValueType(), N1: Rdx,
15922 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
15923}
15924
15925static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
15926 SDLoc DL, SelectionDAG &DAG) {
15927 unsigned ScalarOpcode;
15928 switch (Opcode) {
15929 case ISD::VECREDUCE_AND:
15930 ScalarOpcode = ISD::AND;
15931 break;
15932 case ISD::VECREDUCE_OR:
15933 ScalarOpcode = ISD::OR;
15934 break;
15935 case ISD::VECREDUCE_XOR:
15936 ScalarOpcode = ISD::XOR;
15937 break;
15938 default:
15939 llvm_unreachable("Expected bitwise vector reduction");
15940 return SDValue();
15941 }
15942
15943 EVT VecVT = Vec.getValueType();
15944 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
15945 "Expected power-of-2 length vector");
15946
15947 EVT ElemVT = VecVT.getVectorElementType();
15948
15949 SDValue Result;
15950 unsigned NumElems = VecVT.getVectorNumElements();
15951
15952 // Special case for boolean reductions
15953 if (ElemVT == MVT::i1) {
15954 // Split large vectors into smaller ones
15955 if (NumElems > 16) {
15956 SDValue Lo, Hi;
15957 std::tie(args&: Lo, args&: Hi) = DAG.SplitVector(N: Vec, DL);
15958 EVT HalfVT = Lo.getValueType();
15959 SDValue HalfVec = DAG.getNode(Opcode: ScalarOpcode, DL, VT: HalfVT, N1: Lo, N2: Hi);
15960 return getVectorBitwiseReduce(Opcode, Vec: HalfVec, VT, DL, DAG);
15961 }
15962
15963 // Results of setcc operations get widened to 128 bits if their input
15964 // operands are 128 bits wide, otherwise vectors that are less than 64 bits
15965 // get widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets
15966 // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
15967 // size leads to the best codegen, since e.g. setcc results might need to be
15968 // truncated otherwise.
15969 unsigned ExtendedWidth = 64;
15970 if (Vec.getOpcode() == ISD::SETCC &&
15971 Vec.getOperand(i: 0).getValueSizeInBits() >= 128) {
15972 ExtendedWidth = 128;
15973 }
15974 EVT ExtendedVT = MVT::getIntegerVT(BitWidth: std::max(a: ExtendedWidth / NumElems, b: 8u));
15975
15976 // any_ext doesn't work with umin/umax, so only use it for uadd.
15977 unsigned ExtendOp =
15978 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
15979 SDValue Extended = DAG.getNode(
15980 Opcode: ExtendOp, DL, VT: VecVT.changeVectorElementType(EltVT: ExtendedVT), Operand: Vec);
15981 // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
15982 // in that case we bitcast the sign extended values from v2i64 to v4i32
15983 // before reduction for optimal code generation.
15984 if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
15985 NumElems == 2 && ExtendedWidth == 128) {
15986 Extended = DAG.getBitcast(VT: MVT::v4i32, V: Extended);
15987 ExtendedVT = MVT::i32;
15988 }
15989 switch (ScalarOpcode) {
15990 case ISD::AND:
15991 Result = DAG.getNode(Opcode: ISD::VECREDUCE_UMIN, DL, VT: ExtendedVT, Operand: Extended);
15992 break;
15993 case ISD::OR:
15994 Result = DAG.getNode(Opcode: ISD::VECREDUCE_UMAX, DL, VT: ExtendedVT, Operand: Extended);
15995 break;
15996 case ISD::XOR:
15997 Result = DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: ExtendedVT, Operand: Extended);
15998 break;
15999 default:
16000 llvm_unreachable("Unexpected Opcode");
16001 }
16002
16003 Result = DAG.getAnyExtOrTrunc(Op: Result, DL, VT: MVT::i1);
16004 } else {
16005 // Iteratively split the vector in half and combine using the bitwise
16006 // operation until it fits in a 64 bit register.
16007 while (VecVT.getSizeInBits() > 64) {
16008 SDValue Lo, Hi;
16009 std::tie(args&: Lo, args&: Hi) = DAG.SplitVector(N: Vec, DL);
16010 VecVT = Lo.getValueType();
16011 NumElems = VecVT.getVectorNumElements();
16012 Vec = DAG.getNode(Opcode: ScalarOpcode, DL, VT: VecVT, N1: Lo, N2: Hi);
16013 }
16014
16015 EVT ScalarVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: VecVT.getSizeInBits());
16016
16017 // Do the remaining work on a scalar since it allows the code generator to
16018 // combine the shift and bitwise operation into one instruction and since
16019 // integer instructions can have higher throughput than vector instructions.
16020 SDValue Scalar = DAG.getBitcast(VT: ScalarVT, V: Vec);
16021
16022 // Iteratively combine the lower and upper halves of the scalar using the
16023 // bitwise operation, halving the relevant region of the scalar in each
16024 // iteration, until the relevant region is just one element of the original
16025 // vector.
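    // For example, reducing v8i8 with AND over a 64-bit scalar X performs
    //   X &= (X >> 32); X &= (X >> 16); X &= (X >> 8);
    // leaving the reduced value in the low 8 bits.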
16026 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
16027 SDValue ShiftAmount =
16028 DAG.getConstant(Val: Shift * ElemVT.getSizeInBits(), DL, VT: MVT::i64);
16029 SDValue Shifted =
16030 DAG.getNode(Opcode: ISD::SRL, DL, VT: ScalarVT, N1: Scalar, N2: ShiftAmount);
16031 Scalar = DAG.getNode(Opcode: ScalarOpcode, DL, VT: ScalarVT, N1: Scalar, N2: Shifted);
16032 }
16033
16034 Result = DAG.getAnyExtOrTrunc(Op: Scalar, DL, VT: ElemVT);
16035 }
16036
16037 return DAG.getAnyExtOrTrunc(Op: Result, DL, VT);
16038}
16039
16040SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
16041 SelectionDAG &DAG) const {
16042 SDValue Src = Op.getOperand(i: 0);
16043
16044 // Try to lower fixed length reductions to SVE.
16045 EVT SrcVT = Src.getValueType();
16046 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
16047 Op.getOpcode() == ISD::VECREDUCE_AND ||
16048 Op.getOpcode() == ISD::VECREDUCE_OR ||
16049 Op.getOpcode() == ISD::VECREDUCE_XOR ||
16050 Op.getOpcode() == ISD::VECREDUCE_FADD ||
16051 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
16052 SrcVT.getVectorElementType() == MVT::i64);
16053 if (SrcVT.isScalableVector() ||
16054 useSVEForFixedLengthVectorVT(
16055 VT: SrcVT, OverrideNEON: OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
16056
16057 if (SrcVT.getVectorElementType() == MVT::i1)
16058 return LowerPredReductionToSVE(ScalarOp: Op, DAG);
16059
16060 switch (Op.getOpcode()) {
16061 case ISD::VECREDUCE_ADD:
16062 return LowerReductionToSVE(Opcode: AArch64ISD::UADDV_PRED, ScalarOp: Op, DAG);
16063 case ISD::VECREDUCE_AND:
16064 return LowerReductionToSVE(Opcode: AArch64ISD::ANDV_PRED, ScalarOp: Op, DAG);
16065 case ISD::VECREDUCE_OR:
16066 return LowerReductionToSVE(Opcode: AArch64ISD::ORV_PRED, ScalarOp: Op, DAG);
16067 case ISD::VECREDUCE_SMAX:
16068 return LowerReductionToSVE(Opcode: AArch64ISD::SMAXV_PRED, ScalarOp: Op, DAG);
16069 case ISD::VECREDUCE_SMIN:
16070 return LowerReductionToSVE(Opcode: AArch64ISD::SMINV_PRED, ScalarOp: Op, DAG);
16071 case ISD::VECREDUCE_UMAX:
16072 return LowerReductionToSVE(Opcode: AArch64ISD::UMAXV_PRED, ScalarOp: Op, DAG);
16073 case ISD::VECREDUCE_UMIN:
16074 return LowerReductionToSVE(Opcode: AArch64ISD::UMINV_PRED, ScalarOp: Op, DAG);
16075 case ISD::VECREDUCE_XOR:
16076 return LowerReductionToSVE(Opcode: AArch64ISD::EORV_PRED, ScalarOp: Op, DAG);
16077 case ISD::VECREDUCE_FADD:
16078 return LowerReductionToSVE(Opcode: AArch64ISD::FADDV_PRED, ScalarOp: Op, DAG);
16079 case ISD::VECREDUCE_FMAX:
16080 return LowerReductionToSVE(Opcode: AArch64ISD::FMAXNMV_PRED, ScalarOp: Op, DAG);
16081 case ISD::VECREDUCE_FMIN:
16082 return LowerReductionToSVE(Opcode: AArch64ISD::FMINNMV_PRED, ScalarOp: Op, DAG);
16083 case ISD::VECREDUCE_FMAXIMUM:
16084 return LowerReductionToSVE(Opcode: AArch64ISD::FMAXV_PRED, ScalarOp: Op, DAG);
16085 case ISD::VECREDUCE_FMINIMUM:
16086 return LowerReductionToSVE(Opcode: AArch64ISD::FMINV_PRED, ScalarOp: Op, DAG);
16087 default:
16088 llvm_unreachable("Unhandled fixed length reduction");
16089 }
16090 }
16091
16092 // Lower NEON reductions.
16093 SDLoc DL(Op);
16094 switch (Op.getOpcode()) {
16095 case ISD::VECREDUCE_AND:
16096 case ISD::VECREDUCE_OR:
16097 case ISD::VECREDUCE_XOR:
16098 return getVectorBitwiseReduce(Opcode: Op.getOpcode(), Vec: Op.getOperand(i: 0),
16099 VT: Op.getValueType(), DL, DAG);
16100 case ISD::VECREDUCE_ADD:
16101 return getReductionSDNode(Op: AArch64ISD::UADDV, DL, ScalarOp: Op, DAG);
16102 case ISD::VECREDUCE_SMAX:
16103 return getReductionSDNode(Op: AArch64ISD::SMAXV, DL, ScalarOp: Op, DAG);
16104 case ISD::VECREDUCE_SMIN:
16105 return getReductionSDNode(Op: AArch64ISD::SMINV, DL, ScalarOp: Op, DAG);
16106 case ISD::VECREDUCE_UMAX:
16107 return getReductionSDNode(Op: AArch64ISD::UMAXV, DL, ScalarOp: Op, DAG);
16108 case ISD::VECREDUCE_UMIN:
16109 return getReductionSDNode(Op: AArch64ISD::UMINV, DL, ScalarOp: Op, DAG);
16110 default:
16111 llvm_unreachable("Unhandled reduction");
16112 }
16113}
16114
16115SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
16116 SelectionDAG &DAG) const {
16117 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
16118 // No point replacing if we don't have the relevant instruction/libcall anyway
16119 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
16120 return SDValue();
16121
16122 // LSE has an atomic load-clear instruction, but not a load-and.
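  // Rewrite 'atomicrmw and addr, M' as an atomic load-clear of ~M (LDCLR).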
16123 SDLoc DL(Op);
16124 MVT VT = Op.getSimpleValueType();
16125 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
16126 SDValue RHS = Op.getOperand(i: 2);
16127 AtomicSDNode *AN = cast<AtomicSDNode>(Val: Op.getNode());
16128 RHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: DAG.getAllOnesConstant(DL, VT), N2: RHS);
16129 return DAG.getAtomic(Opcode: ISD::ATOMIC_LOAD_CLR, dl: DL, MemVT: AN->getMemoryVT(),
16130 Chain: Op.getOperand(i: 0), Ptr: Op.getOperand(i: 1), Val: RHS,
16131 MMO: AN->getMemOperand());
16132}
16133
16134SDValue
16135AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
16136 SelectionDAG &DAG) const {
16137
16138 SDLoc DL(Op);
16139 // Get the inputs.
16140 SDNode *Node = Op.getNode();
16141 SDValue Chain = Op.getOperand(i: 0);
16142 SDValue Size = Op.getOperand(i: 1);
16143 MaybeAlign Align =
16144 cast<ConstantSDNode>(Val: Op.getOperand(i: 2))->getMaybeAlignValue();
16145 EVT VT = Node->getValueType(ResNo: 0);
16146
16147 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
16148 Kind: "no-stack-arg-probe")) {
16149 SDValue SP = DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::SP, VT: MVT::i64);
16150 Chain = SP.getValue(R: 1);
16151 SP = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i64, N1: SP, N2: Size);
16152 if (Align)
16153 SP =
16154 DAG.getNode(Opcode: ISD::AND, DL, VT, N1: SP.getValue(R: 0),
16155 N2: DAG.getSignedConstant(Val: -(uint64_t)Align->value(), DL, VT));
16156 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: AArch64::SP, N: SP);
16157 SDValue Ops[2] = {SP, Chain};
16158 return DAG.getMergeValues(Ops, dl: DL);
16159 }
16160
16161 Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL);
16162
16163 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
16164 SDValue Callee = DAG.getTargetExternalSymbol(Sym: Subtarget->getChkStkName(),
16165 VT: PtrVT, TargetFlags: 0);
16166
16167 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
16168 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
16169 if (Subtarget->hasCustomCallingConv())
16170 TRI->UpdateCustomCallPreservedMask(MF&: DAG.getMachineFunction(), Mask: &Mask);
16171
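  // The Windows stack probe helper expects the allocation size in 16-byte
  // units, passed in X15.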
16172 Size = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i64, N1: Size,
16173 N2: DAG.getConstant(Val: 4, DL, VT: MVT::i64));
16174 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: AArch64::X15, N: Size, Glue: SDValue());
16175 Chain =
16176 DAG.getNode(Opcode: AArch64ISD::CALL, DL, VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue),
16177 N1: Chain, N2: Callee, N3: DAG.getRegister(Reg: AArch64::X15, VT: MVT::i64),
16178 N4: DAG.getRegisterMask(RegMask: Mask), N5: Chain.getValue(R: 1));
16179 // To match the actual intent better, we should read the output from X15 here
16180 // again (instead of potentially spilling it to the stack), but rereading Size
16181 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
16182 // here.
16183
16184 Size = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i64, N1: Size,
16185 N2: DAG.getConstant(Val: 4, DL, VT: MVT::i64));
16186
16187 SDValue SP = DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::SP, VT: MVT::i64);
16188 Chain = SP.getValue(R: 1);
16189 SP = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i64, N1: SP, N2: Size);
16190 if (Align)
16191 SP = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: SP.getValue(R: 0),
16192 N2: DAG.getSignedConstant(Val: -(uint64_t)Align->value(), DL, VT));
16193 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: AArch64::SP, N: SP);
16194
16195 Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: SDValue(), DL);
16196
16197 SDValue Ops[2] = {SP, Chain};
16198 return DAG.getMergeValues(Ops, dl: DL);
16199}
16200
16201SDValue
16202AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
16203 SelectionDAG &DAG) const {
16204 // Get the inputs.
16205 SDNode *Node = Op.getNode();
16206 SDValue Chain = Op.getOperand(i: 0);
16207 SDValue Size = Op.getOperand(i: 1);
16208
16209 MaybeAlign Align =
16210 cast<ConstantSDNode>(Val: Op.getOperand(i: 2))->getMaybeAlignValue();
16211 SDLoc DL(Op);
16212 EVT VT = Node->getValueType(ResNo: 0);
16213
16214 // Construct the new SP value in a GPR.
16215 SDValue SP = DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::SP, VT: MVT::i64);
16216 Chain = SP.getValue(R: 1);
16217 SP = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i64, N1: SP, N2: Size);
16218 if (Align)
16219 SP = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: SP.getValue(R: 0),
16220 N2: DAG.getSignedConstant(Val: -(uint64_t)Align->value(), DL, VT));
16221
16222 // Set the real SP to the new value with a probing loop.
16223 Chain = DAG.getNode(Opcode: AArch64ISD::PROBED_ALLOCA, DL, VT: MVT::Other, N1: Chain, N2: SP);
16224 SDValue Ops[2] = {SP, Chain};
16225 return DAG.getMergeValues(Ops, dl: DL);
16226}
16227
16228SDValue
16229AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16230 SelectionDAG &DAG) const {
16231 MachineFunction &MF = DAG.getMachineFunction();
16232
16233 if (Subtarget->isTargetWindows())
16234 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
16235 else if (hasInlineStackProbe(MF))
16236 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
16237 else
16238 return SDValue();
16239}
16240
16241SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
16242 unsigned NewOp) const {
16243 if (Subtarget->hasSVE2())
16244 return LowerToPredicatedOp(Op, DAG, NewOp);
16245
16246 // Default to expand.
16247 return SDValue();
16248}
16249
16250SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
16251 SelectionDAG &DAG) const {
16252 EVT VT = Op.getValueType();
16253 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
16254
16255 SDLoc DL(Op);
16256 APInt MulImm = Op.getConstantOperandAPInt(i: 0);
16257 return DAG.getZExtOrTrunc(Op: DAG.getVScale(DL, VT: MVT::i64, MulImm: MulImm.sext(width: 64)), DL,
16258 VT);
16259}
16260
16261/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
16262template <unsigned NumVecs>
16263static bool
16264setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
16265 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
16266 Info.opc = ISD::INTRINSIC_VOID;
16267 // Retrieve EC from first vector argument.
16268 const EVT VT = TLI.getMemValueType(DL, Ty: CI.getArgOperand(i: 0)->getType());
16269 ElementCount EC = VT.getVectorElementCount();
16270#ifndef NDEBUG
16271 // Check the assumption that all input vectors are the same type.
16272 for (unsigned I = 0; I < NumVecs; ++I)
16273 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
16274 "Invalid type.");
16275#endif
16276 // memVT is `NumVecs * VT`.
16277 Info.memVT = EVT::getVectorVT(Context&: CI.getType()->getContext(), VT: VT.getScalarType(),
16278 EC: EC * NumVecs);
16279 Info.ptrVal = CI.getArgOperand(i: CI.arg_size() - 1);
16280 Info.offset = 0;
16281 Info.align.reset();
16282 Info.flags = MachineMemOperand::MOStore;
16283 return true;
16284}
16285
16286/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
16287/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
16288/// specified in the intrinsic calls.
16289bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
16290 const CallInst &I,
16291 MachineFunction &MF,
16292 unsigned Intrinsic) const {
16293 auto &DL = I.getDataLayout();
16294 switch (Intrinsic) {
16295 case Intrinsic::aarch64_sve_st2:
16296 return setInfoSVEStN<2>(TLI: *this, DL, Info, CI: I);
16297 case Intrinsic::aarch64_sve_st3:
16298 return setInfoSVEStN<3>(TLI: *this, DL, Info, CI: I);
16299 case Intrinsic::aarch64_sve_st4:
16300 return setInfoSVEStN<4>(TLI: *this, DL, Info, CI: I);
16301 case Intrinsic::aarch64_neon_ld2:
16302 case Intrinsic::aarch64_neon_ld3:
16303 case Intrinsic::aarch64_neon_ld4:
16304 case Intrinsic::aarch64_neon_ld1x2:
16305 case Intrinsic::aarch64_neon_ld1x3:
16306 case Intrinsic::aarch64_neon_ld1x4: {
16307 Info.opc = ISD::INTRINSIC_W_CHAIN;
16308 uint64_t NumElts = DL.getTypeSizeInBits(Ty: I.getType()) / 64;
16309 Info.memVT = EVT::getVectorVT(Context&: I.getType()->getContext(), VT: MVT::i64, NumElements: NumElts);
16310 Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1);
16311 Info.offset = 0;
16312 Info.align.reset();
16313    // Volatile loads with NEON intrinsics are not supported.
16314 Info.flags = MachineMemOperand::MOLoad;
16315 return true;
16316 }
16317 case Intrinsic::aarch64_neon_ld2lane:
16318 case Intrinsic::aarch64_neon_ld3lane:
16319 case Intrinsic::aarch64_neon_ld4lane:
16320 case Intrinsic::aarch64_neon_ld2r:
16321 case Intrinsic::aarch64_neon_ld3r:
16322 case Intrinsic::aarch64_neon_ld4r: {
16323 Info.opc = ISD::INTRINSIC_W_CHAIN;
16324    // These intrinsics return a struct of vectors with a common vector type.
16325 Type *RetTy = I.getType();
16326 auto *StructTy = cast<StructType>(Val: RetTy);
16327 unsigned NumElts = StructTy->getNumElements();
16328 Type *VecTy = StructTy->getElementType(N: 0);
16329 MVT EleVT = MVT::getVT(Ty: VecTy).getVectorElementType();
16330 Info.memVT = EVT::getVectorVT(Context&: I.getType()->getContext(), VT: EleVT, NumElements: NumElts);
16331 Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1);
16332 Info.offset = 0;
16333 Info.align.reset();
16334    // Volatile loads with NEON intrinsics are not supported.
16335 Info.flags = MachineMemOperand::MOLoad;
16336 return true;
16337 }
16338 case Intrinsic::aarch64_neon_st2:
16339 case Intrinsic::aarch64_neon_st3:
16340 case Intrinsic::aarch64_neon_st4:
16341 case Intrinsic::aarch64_neon_st1x2:
16342 case Intrinsic::aarch64_neon_st1x3:
16343 case Intrinsic::aarch64_neon_st1x4: {
16344 Info.opc = ISD::INTRINSIC_VOID;
16345 unsigned NumElts = 0;
16346 for (const Value *Arg : I.args()) {
16347 Type *ArgTy = Arg->getType();
16348 if (!ArgTy->isVectorTy())
16349 break;
16350 NumElts += DL.getTypeSizeInBits(Ty: ArgTy) / 64;
16351 }
16352 Info.memVT = EVT::getVectorVT(Context&: I.getType()->getContext(), VT: MVT::i64, NumElements: NumElts);
16353 Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1);
16354 Info.offset = 0;
16355 Info.align.reset();
16356    // Volatile stores with NEON intrinsics are not supported.
16357 Info.flags = MachineMemOperand::MOStore;
16358 return true;
16359 }
16360 case Intrinsic::aarch64_neon_st2lane:
16361 case Intrinsic::aarch64_neon_st3lane:
16362 case Intrinsic::aarch64_neon_st4lane: {
16363 Info.opc = ISD::INTRINSIC_VOID;
16364 unsigned NumElts = 0;
16365    // All the vector arguments have the same type.
16366 Type *VecTy = I.getArgOperand(i: 0)->getType();
16367 MVT EleVT = MVT::getVT(Ty: VecTy).getVectorElementType();
16368
16369 for (const Value *Arg : I.args()) {
16370 Type *ArgTy = Arg->getType();
16371 if (!ArgTy->isVectorTy())
16372 break;
16373 NumElts += 1;
16374 }
16375
16376 Info.memVT = EVT::getVectorVT(Context&: I.getType()->getContext(), VT: EleVT, NumElements: NumElts);
16377 Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1);
16378 Info.offset = 0;
16379 Info.align.reset();
16380    // Volatile stores with NEON intrinsics are not supported.
16381 Info.flags = MachineMemOperand::MOStore;
16382 return true;
16383 }
16384 case Intrinsic::aarch64_ldaxr:
16385 case Intrinsic::aarch64_ldxr: {
16386 Type *ValTy = I.getParamElementType(ArgNo: 0);
16387 Info.opc = ISD::INTRINSIC_W_CHAIN;
16388 Info.memVT = MVT::getVT(Ty: ValTy);
16389 Info.ptrVal = I.getArgOperand(i: 0);
16390 Info.offset = 0;
16391 Info.align = DL.getABITypeAlign(Ty: ValTy);
16392 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
16393 return true;
16394 }
16395 case Intrinsic::aarch64_stlxr:
16396 case Intrinsic::aarch64_stxr: {
16397 Type *ValTy = I.getParamElementType(ArgNo: 1);
16398 Info.opc = ISD::INTRINSIC_W_CHAIN;
16399 Info.memVT = MVT::getVT(Ty: ValTy);
16400 Info.ptrVal = I.getArgOperand(i: 1);
16401 Info.offset = 0;
16402 Info.align = DL.getABITypeAlign(Ty: ValTy);
16403 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
16404 return true;
16405 }
16406 case Intrinsic::aarch64_ldaxp:
16407 case Intrinsic::aarch64_ldxp:
16408 Info.opc = ISD::INTRINSIC_W_CHAIN;
16409 Info.memVT = MVT::i128;
16410 Info.ptrVal = I.getArgOperand(i: 0);
16411 Info.offset = 0;
16412 Info.align = Align(16);
16413 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
16414 return true;
16415 case Intrinsic::aarch64_stlxp:
16416 case Intrinsic::aarch64_stxp:
16417 Info.opc = ISD::INTRINSIC_W_CHAIN;
16418 Info.memVT = MVT::i128;
16419 Info.ptrVal = I.getArgOperand(i: 2);
16420 Info.offset = 0;
16421 Info.align = Align(16);
16422 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
16423 return true;
16424 case Intrinsic::aarch64_sve_ldnt1: {
16425 Type *ElTy = cast<VectorType>(Val: I.getType())->getElementType();
16426 Info.opc = ISD::INTRINSIC_W_CHAIN;
16427 Info.memVT = MVT::getVT(Ty: I.getType());
16428 Info.ptrVal = I.getArgOperand(i: 1);
16429 Info.offset = 0;
16430 Info.align = DL.getABITypeAlign(Ty: ElTy);
16431 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
16432 return true;
16433 }
16434 case Intrinsic::aarch64_sve_stnt1: {
16435 Type *ElTy =
16436 cast<VectorType>(Val: I.getArgOperand(i: 0)->getType())->getElementType();
16437 Info.opc = ISD::INTRINSIC_W_CHAIN;
16438 Info.memVT = MVT::getVT(Ty: I.getOperand(i_nocapture: 0)->getType());
16439 Info.ptrVal = I.getArgOperand(i: 2);
16440 Info.offset = 0;
16441 Info.align = DL.getABITypeAlign(Ty: ElTy);
16442 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
16443 return true;
16444 }
16445 case Intrinsic::aarch64_mops_memset_tag: {
16446 Value *Dst = I.getArgOperand(i: 0);
16447 Value *Val = I.getArgOperand(i: 1);
16448 Info.opc = ISD::INTRINSIC_W_CHAIN;
16449 Info.memVT = MVT::getVT(Ty: Val->getType());
16450 Info.ptrVal = Dst;
16451 Info.offset = 0;
16452 Info.align = I.getParamAlign(ArgNo: 0).valueOrOne();
16453 Info.flags = MachineMemOperand::MOStore;
16454 // The size of the memory being operated on is unknown at this point
16455 Info.size = MemoryLocation::UnknownSize;
16456 return true;
16457 }
16458 default:
16459 break;
16460 }
16461
16462 return false;
16463}
16464
16465bool AArch64TargetLowering::shouldReduceLoadWidth(
16466 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
16467 std::optional<unsigned> ByteOffset) const {
16468 // TODO: This may be worth removing. Check regression tests for diffs.
16469 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
16470 ByteOffset))
16471 return false;
16472
16473 // If we're reducing the load width in order to avoid having to use an extra
16474 // instruction to do extension then it's probably a good idea.
16475 if (ExtTy != ISD::NON_EXTLOAD)
16476 return true;
16477 // Don't reduce load width if it would prevent us from combining a shift into
16478 // the offset.
16479 MemSDNode *Mem = dyn_cast<MemSDNode>(Val: Load);
16480 assert(Mem);
16481 const SDValue &Base = Mem->getBasePtr();
16482 if (Base.getOpcode() == ISD::ADD &&
16483 Base.getOperand(i: 1).getOpcode() == ISD::SHL &&
16484 Base.getOperand(i: 1).hasOneUse() &&
16485 Base.getOperand(i: 1).getOperand(i: 1).getOpcode() == ISD::Constant) {
16486 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
16487 if (Mem->getMemoryVT().isScalableVector())
16488 return false;
16489 // The shift can be combined if it matches the size of the value being
16490 // loaded (and so reducing the width would make it not match).
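    // For example, an i64 load from (add base, (shl idx, 3)) folds the shift
    // into the addressing mode ('ldr xN, [base, idx, lsl #3]'); narrowing the
    // load would leave the shift as a separate instruction.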
16491 uint64_t ShiftAmount = Base.getOperand(i: 1).getConstantOperandVal(i: 1);
16492 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
16493 if (ShiftAmount == Log2_32(Value: LoadBytes))
16494 return false;
16495 }
16496 // We have no reason to disallow reducing the load width, so allow it.
16497 return true;
16498}
16499
16500// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
16501bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
16502 EVT VT = Extend.getValueType();
16503 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
16504 SDValue Extract = Extend.getOperand(i: 0);
16505 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
16506 Extract = Extract.getOperand(i: 0);
16507 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
16508 EVT VecVT = Extract.getOperand(i: 0).getValueType();
16509 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
16510 return false;
16511 }
16512 }
16513 return true;
16514}
16515
16516// Truncations from 64-bit GPR to 32-bit GPR are free.
16517bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
16518 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
16519 return false;
16520 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
16521 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
16522 return NumBits1 > NumBits2;
16523}
16524bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
16525 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
16526 return false;
16527 uint64_t NumBits1 = VT1.getFixedSizeInBits();
16528 uint64_t NumBits2 = VT2.getFixedSizeInBits();
16529 return NumBits1 > NumBits2;
16530}
16531
16532/// Check if it is profitable to hoist an instruction in then/else to if.
16533/// Not profitable if I and its user can form an FMA instruction
16534/// because we prefer FMSUB/FMADD.
16535bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
16536 if (I->getOpcode() != Instruction::FMul)
16537 return true;
16538
16539 if (!I->hasOneUse())
16540 return true;
16541
16542 Instruction *User = I->user_back();
16543
16544 if (!(User->getOpcode() == Instruction::FSub ||
16545 User->getOpcode() == Instruction::FAdd))
16546 return true;
16547
16548 const TargetOptions &Options = getTargetMachine().Options;
16549 const Function *F = I->getFunction();
16550 const DataLayout &DL = F->getDataLayout();
16551 Type *Ty = User->getOperand(i: 0)->getType();
16552
16553 return !(isFMAFasterThanFMulAndFAdd(F: *F, Ty) &&
16554 isOperationLegalOrCustom(Op: ISD::FMA, VT: getValueType(DL, Ty)) &&
16555 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16556 Options.UnsafeFPMath));
16557}
16558
16559// All 32-bit GPR operations implicitly zero the high-half of the corresponding
16560// 64-bit GPR.
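// For example, 'mov w0, w1' also clears bits [63:32] of x0, so a zext from
// i32 to i64 requires no extra instruction.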
16561bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
16562 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
16563 return false;
16564 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
16565 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
16566 return NumBits1 == 32 && NumBits2 == 64;
16567}
16568bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
16569 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
16570 return false;
16571 unsigned NumBits1 = VT1.getSizeInBits();
16572 unsigned NumBits2 = VT2.getSizeInBits();
16573 return NumBits1 == 32 && NumBits2 == 64;
16574}
16575
16576bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
16577 EVT VT1 = Val.getValueType();
16578 if (isZExtFree(VT1, VT2)) {
16579 return true;
16580 }
16581
16582 if (Val.getOpcode() != ISD::LOAD)
16583 return false;
16584
16585 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
16586 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
16587 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
16588 VT1.getSizeInBits() <= 32);
16589}
16590
16591bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
16592 if (isa<FPExtInst>(Val: Ext))
16593 return false;
16594
16595 // Vector types are not free.
16596 if (Ext->getType()->isVectorTy())
16597 return false;
16598
16599 for (const Use &U : Ext->uses()) {
16600 // The extension is free if we can fold it with a left shift in an
16601 // addressing mode or an arithmetic operation: add, sub, and cmp.
16602
16603 // Is there a shift?
16604 const Instruction *Instr = cast<Instruction>(Val: U.getUser());
16605
16606 // Is this a constant shift?
16607 switch (Instr->getOpcode()) {
16608 case Instruction::Shl:
16609 if (!isa<ConstantInt>(Val: Instr->getOperand(i: 1)))
16610 return false;
16611 break;
16612 case Instruction::GetElementPtr: {
16613 gep_type_iterator GTI = gep_type_begin(GEP: Instr);
16614 auto &DL = Ext->getDataLayout();
16615 std::advance(i&: GTI, n: U.getOperandNo()-1);
16616 Type *IdxTy = GTI.getIndexedType();
16617 // This extension will end up with a shift because of the scaling factor.
16618 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
16619 // Get the shift amount based on the scaling factor:
16620 // log2(sizeof(IdxTy)) - log2(8).
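      // For example, an i32 index has a store size of 4 bytes, giving
      // ShiftAmt = log2(32) - log2(8) = 2.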
16621 if (IdxTy->isScalableTy())
16622 return false;
16623 uint64_t ShiftAmt =
16624 llvm::countr_zero(Val: DL.getTypeStoreSizeInBits(Ty: IdxTy).getFixedValue()) -
16625 3;
16626 // Is the constant foldable in the shift of the addressing mode?
16627 // I.e., shift amount is between 1 and 4 inclusive.
16628 if (ShiftAmt == 0 || ShiftAmt > 4)
16629 return false;
16630 break;
16631 }
16632 case Instruction::Trunc:
16633 // Check if this is a noop.
16634 // trunc(sext ty1 to ty2) to ty1.
16635 if (Instr->getType() == Ext->getOperand(i: 0)->getType())
16636 continue;
16637 [[fallthrough]];
16638 default:
16639 return false;
16640 }
16641
16642 // At this point we can use the bfm family, so this extension is free
16643 // for that use.
16644 }
16645 return true;
16646}
16647
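// Build a shuffle mask that places each source element in the low (little
// endian) or high (big endian) part of a DstWidth-bit lane, with all other
// mask slots selecting index NumElts, i.e. the first (known zero) element of
// the second shuffle operand. For example, SrcWidth = 8, DstWidth = 32,
// NumElts = 4 on little endian gives:
//   [0, 4, 4, 4, 1, 4, 4, 4, 2, 4, 4, 4, 3, 4, 4, 4]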
16648static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
16649 unsigned NumElts, bool IsLittleEndian,
16650 SmallVectorImpl<int> &Mask) {
16651 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
16652 return false;
16653
16654 assert(DstWidth % SrcWidth == 0 &&
16655 "TBL lowering is not supported for a conversion instruction with this "
16656 "source and destination element type.");
16657
16658 unsigned Factor = DstWidth / SrcWidth;
16659 unsigned MaskLen = NumElts * Factor;
16660
16661 Mask.clear();
16662 Mask.resize(N: MaskLen, NV: NumElts);
16663
16664 unsigned SrcIndex = 0;
16665 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
16666 Mask[I] = SrcIndex++;
16667
16668 return true;
16669}
16670
16671static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,
16672 FixedVectorType *ZExtTy,
16673 FixedVectorType *DstTy,
16674 bool IsLittleEndian) {
16675 auto *SrcTy = cast<FixedVectorType>(Val: Op->getType());
16676 unsigned NumElts = SrcTy->getNumElements();
16677 auto SrcWidth = cast<IntegerType>(Val: SrcTy->getElementType())->getBitWidth();
16678 auto DstWidth = cast<IntegerType>(Val: DstTy->getElementType())->getBitWidth();
16679
16680 SmallVector<int> Mask;
16681 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
16682 return nullptr;
16683
16684 auto *FirstEltZero = Builder.CreateInsertElement(
16685 Vec: PoisonValue::get(T: SrcTy), NewElt: Builder.getIntN(N: SrcWidth, C: 0), Idx: uint64_t(0));
16686 Value *Result = Builder.CreateShuffleVector(V1: Op, V2: FirstEltZero, Mask);
16687 Result = Builder.CreateBitCast(V: Result, DestTy: DstTy);
16688 if (DstTy != ZExtTy)
16689 Result = Builder.CreateZExt(V: Result, DestTy: ZExtTy);
16690 return Result;
16691}
16692
16693static Value *createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op,
16694 FixedVectorType *DstTy,
16695 bool IsLittleEndian) {
16696 auto *SrcTy = cast<FixedVectorType>(Val: Op->getType());
16697 auto SrcWidth = cast<IntegerType>(Val: SrcTy->getElementType())->getBitWidth();
16698 auto DstWidth = cast<IntegerType>(Val: DstTy->getElementType())->getBitWidth();
16699
16700 SmallVector<int> Mask;
16701 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts: SrcTy->getNumElements(),
16702 IsLittleEndian: !IsLittleEndian, Mask))
16703 return nullptr;
16704
16705 auto *FirstEltZero = Builder.CreateInsertElement(
16706 Vec: PoisonValue::get(T: SrcTy), NewElt: Builder.getIntN(N: SrcWidth, C: 0), Idx: uint64_t(0));
16707
16708 return Builder.CreateShuffleVector(V1: Op, V2: FirstEltZero, Mask);
16709}
16710
16711static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
16712 IRBuilder<> Builder(TI);
16713 SmallVector<Value *> Parts;
16714 int NumElements = cast<FixedVectorType>(Val: TI->getType())->getNumElements();
16715 auto *SrcTy = cast<FixedVectorType>(Val: TI->getOperand(i_nocapture: 0)->getType());
16716 auto *DstTy = cast<FixedVectorType>(Val: TI->getType());
16717 assert(SrcTy->getElementType()->isIntegerTy() &&
16718 "Non-integer type source vector element is not supported");
16719 assert(DstTy->getElementType()->isIntegerTy(8) &&
16720 "Unsupported destination vector element type");
16721 unsigned SrcElemTySz =
16722 cast<IntegerType>(Val: SrcTy->getElementType())->getBitWidth();
16723 unsigned DstElemTySz =
16724 cast<IntegerType>(Val: DstTy->getElementType())->getBitWidth();
16725 assert((SrcElemTySz % DstElemTySz == 0) &&
16726 "Cannot lower truncate to tbl instructions for a source element size "
16727 "that is not divisible by the destination element size");
16728 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
16729 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
16730 "Unsupported source vector element type size");
16731 Type *VecTy = FixedVectorType::get(ElementType: Builder.getInt8Ty(), NumElts: 16);
16732
16733 // Create a mask to choose every nth byte from the source vector table of
16734 // bytes to create the truncated destination vector, where 'n' is the truncate
16735 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
16736 // 0,8,16,..Y*8th bytes for the little-endian format
16737 SmallVector<Constant *, 16> MaskConst;
16738 for (int Itr = 0; Itr < 16; Itr++) {
16739 if (Itr < NumElements)
16740 MaskConst.push_back(Elt: Builder.getInt8(
16741 C: IsLittleEndian ? Itr * TruncFactor
16742 : Itr * TruncFactor + (TruncFactor - 1)));
16743 else
16744 MaskConst.push_back(Elt: Builder.getInt8(C: 255));
16745 }
16746
16747 int MaxTblSz = 128 * 4;
16748 int MaxSrcSz = SrcElemTySz * NumElements;
16749 int ElemsPerTbl =
16750 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
16751 assert(ElemsPerTbl <= 16 &&
16752 "Maximum elements selected using TBL instruction cannot exceed 16!");
16753
16754 int ShuffleCount = 128 / SrcElemTySz;
16755 SmallVector<int> ShuffleLanes;
16756 for (int i = 0; i < ShuffleCount; ++i)
16757 ShuffleLanes.push_back(Elt: i);
16758
16759 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
16760 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
16761 // call TBL & save the result in a vector of TBL results for combining later.
16762 SmallVector<Value *> Results;
16763 while (ShuffleLanes.back() < NumElements) {
16764 Parts.push_back(Elt: Builder.CreateBitCast(
16765 V: Builder.CreateShuffleVector(V: TI->getOperand(i_nocapture: 0), Mask: ShuffleLanes), DestTy: VecTy));
16766
16767 if (Parts.size() == 4) {
16768 Parts.push_back(Elt: ConstantVector::get(V: MaskConst));
16769 Results.push_back(
16770 Elt: Builder.CreateIntrinsic(ID: Intrinsic::aarch64_neon_tbl4, Types: VecTy, Args: Parts));
16771 Parts.clear();
16772 }
16773
16774 for (int i = 0; i < ShuffleCount; ++i)
16775 ShuffleLanes[i] += ShuffleCount;
16776 }
16777
16778 assert((Parts.empty() || Results.empty()) &&
16779 "Lowering trunc for vectors requiring different TBL instructions is "
16780 "not supported!");
16781 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
16782 // registers
16783 if (!Parts.empty()) {
16784 Intrinsic::ID TblID;
16785 switch (Parts.size()) {
16786 case 1:
16787 TblID = Intrinsic::aarch64_neon_tbl1;
16788 break;
16789 case 2:
16790 TblID = Intrinsic::aarch64_neon_tbl2;
16791 break;
16792 case 3:
16793 TblID = Intrinsic::aarch64_neon_tbl3;
16794 break;
16795 }
16796
16797 Parts.push_back(Elt: ConstantVector::get(V: MaskConst));
16798 Results.push_back(Elt: Builder.CreateIntrinsic(ID: TblID, Types: VecTy, Args: Parts));
16799 }
16800
16801 // Extract the destination vector from TBL result(s) after combining them
16802 // where applicable. Currently, at most two TBLs are supported.
16803 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
16804 "more than 2 tbl instructions!");
16805 Value *FinalResult = Results[0];
16806 if (Results.size() == 1) {
16807 if (ElemsPerTbl < 16) {
16808 SmallVector<int> FinalMask(ElemsPerTbl);
16809 std::iota(first: FinalMask.begin(), last: FinalMask.end(), value: 0);
16810 FinalResult = Builder.CreateShuffleVector(V: Results[0], Mask: FinalMask);
16811 }
16812 } else {
16813 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
16814 if (ElemsPerTbl < 16) {
16815 std::iota(first: FinalMask.begin(), last: FinalMask.begin() + ElemsPerTbl, value: 0);
16816 std::iota(first: FinalMask.begin() + ElemsPerTbl, last: FinalMask.end(), value: 16);
16817 } else {
16818 std::iota(first: FinalMask.begin(), last: FinalMask.end(), value: 0);
16819 }
16820 FinalResult =
16821 Builder.CreateShuffleVector(V1: Results[0], V2: Results[1], Mask: FinalMask);
16822 }
16823
16824 TI->replaceAllUsesWith(V: FinalResult);
16825 TI->eraseFromParent();
16826}
16827
16828bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
16829 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
16830 // shuffle_vector instructions are serialized when targeting SVE,
16831 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
16832 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
16833 return false;
16834
16835 // Try to optimize conversions using tbl. This requires materializing constant
16836 // index vectors, which can increase code size and add loads. Skip the
16837 // transform unless the conversion is in a loop block guaranteed to execute
16838 // and we are not optimizing for size.
16839 Function *F = I->getParent()->getParent();
16840 if (!L || L->getHeader() != I->getParent() || F->hasOptSize())
16841 return false;
16842
16843 auto *SrcTy = dyn_cast<FixedVectorType>(Val: I->getOperand(i: 0)->getType());
16844 auto *DstTy = dyn_cast<FixedVectorType>(Val: I->getType());
16845 if (!SrcTy || !DstTy)
16846 return false;
16847
16848 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
16849 // lowered to tbl instructions to insert the original i8 elements
16850 // into i8x lanes. This is enabled for cases where it is beneficial.
16851 auto *ZExt = dyn_cast<ZExtInst>(Val: I);
16852 if (ZExt && SrcTy->getElementType()->isIntegerTy(Bitwidth: 8)) {
16853 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
16854 if (DstWidth % 8 != 0)
16855 return false;
16856
16857 auto *TruncDstType =
16858 cast<FixedVectorType>(Val: VectorType::getTruncatedElementVectorType(VTy: DstTy));
16859 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
16860 // the remaining ZExt folded into the user, don't use tbl lowering.
16861 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
16862 if (TTI.getCastInstrCost(Opcode: I->getOpcode(), Dst: DstTy, Src: TruncDstType,
16863 CCH: TargetTransformInfo::getCastContextHint(I),
16864 CostKind: TTI::TCK_SizeAndLatency, I) == TTI::TCC_Free) {
16865 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
16866 return false;
16867
16868 DstTy = TruncDstType;
16869 }
16870
16871 // mul(zext(i8), sext) can be transformed into smull(zext, sext) which
16872 // performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
16873 // most one extra extend step is needed and using tbl is not profitable.
16874 // Similarly, bail out if partial_reduce(acc, zext(i8)) can be lowered to a
16875 // udot instruction.
16876 if (SrcWidth * 4 <= DstWidth) {
16877 if (all_of(Range: I->users(), P: [&](auto *U) {
16878 auto *SingleUser = cast<Instruction>(&*U);
16879 if (match(SingleUser, m_c_Mul(L: m_Specific(V: I), R: m_SExt(Op: m_Value()))))
16880 return true;
16881 if (match(SingleUser,
16882 m_Intrinsic<
16883 Intrinsic::experimental_vector_partial_reduce_add>(
16884 Op0: m_Value(), Op1: m_Specific(V: I))))
16885 return true;
16886 return false;
16887 }))
16888 return false;
16889 }
16890
16891 if (DstTy->getScalarSizeInBits() >= 64)
16892 return false;
16893
16894 IRBuilder<> Builder(ZExt);
16895 Value *Result = createTblShuffleForZExt(
16896 Builder, Op: ZExt->getOperand(i_nocapture: 0), ZExtTy: cast<FixedVectorType>(Val: ZExt->getType()),
16897 DstTy, IsLittleEndian: Subtarget->isLittleEndian());
16898 if (!Result)
16899 return false;
16900 ZExt->replaceAllUsesWith(V: Result);
16901 ZExt->eraseFromParent();
16902 return true;
16903 }
16904
16905 auto *UIToFP = dyn_cast<UIToFPInst>(Val: I);
16906 if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(Bitwidth: 8) &&
16907 DstTy->getElementType()->isFloatTy()) ||
16908 (SrcTy->getElementType()->isIntegerTy(Bitwidth: 16) &&
16909 DstTy->getElementType()->isDoubleTy()))) {
16910 IRBuilder<> Builder(I);
16911 Value *ZExt = createTblShuffleForZExt(
16912 Builder, Op: I->getOperand(i: 0), ZExtTy: FixedVectorType::getInteger(VTy: DstTy),
16913 DstTy: FixedVectorType::getInteger(VTy: DstTy), IsLittleEndian: Subtarget->isLittleEndian());
16914 assert(ZExt && "Cannot fail for the i8 to float conversion");
16915 auto *UI = Builder.CreateUIToFP(V: ZExt, DestTy: DstTy);
16916 I->replaceAllUsesWith(V: UI);
16917 I->eraseFromParent();
16918 return true;
16919 }
16920
16921 auto *SIToFP = dyn_cast<SIToFPInst>(Val: I);
16922 if (SIToFP && SrcTy->getElementType()->isIntegerTy(Bitwidth: 8) &&
16923 DstTy->getElementType()->isFloatTy()) {
16924 IRBuilder<> Builder(I);
16925 auto *Shuffle = createTblShuffleForSExt(Builder, Op: I->getOperand(i: 0),
16926 DstTy: FixedVectorType::getInteger(VTy: DstTy),
16927 IsLittleEndian: Subtarget->isLittleEndian());
16928 assert(Shuffle && "Cannot fail for the i8 to float conversion");
16929 auto *Cast = Builder.CreateBitCast(V: Shuffle, DestTy: VectorType::getInteger(VTy: DstTy));
16930 auto *AShr = Builder.CreateAShr(LHS: Cast, RHS: 24, Name: "", isExact: true);
16931 auto *SI = Builder.CreateSIToFP(V: AShr, DestTy: DstTy);
16932 I->replaceAllUsesWith(V: SI);
16933 I->eraseFromParent();
16934 return true;
16935 }
16936
16937 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
16938 // followed by a truncate lowered to using tbl.4.
16939 auto *FPToUI = dyn_cast<FPToUIInst>(Val: I);
16940 if (FPToUI &&
16941 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
16942 SrcTy->getElementType()->isFloatTy() &&
16943 DstTy->getElementType()->isIntegerTy(Bitwidth: 8)) {
16944 IRBuilder<> Builder(I);
16945 auto *WideConv = Builder.CreateFPToUI(V: FPToUI->getOperand(i_nocapture: 0),
16946 DestTy: VectorType::getInteger(VTy: SrcTy));
16947 auto *TruncI = Builder.CreateTrunc(V: WideConv, DestTy: DstTy);
16948 I->replaceAllUsesWith(V: TruncI);
16949 I->eraseFromParent();
16950 createTblForTrunc(TI: cast<TruncInst>(Val: TruncI), IsLittleEndian: Subtarget->isLittleEndian());
16951 return true;
16952 }
16953
16954 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
16955 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
16956 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
16957 // registers
16958 auto *TI = dyn_cast<TruncInst>(Val: I);
16959 if (TI && DstTy->getElementType()->isIntegerTy(Bitwidth: 8) &&
16960 ((SrcTy->getElementType()->isIntegerTy(Bitwidth: 32) ||
16961 SrcTy->getElementType()->isIntegerTy(Bitwidth: 64)) &&
16962 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
16963 createTblForTrunc(TI, IsLittleEndian: Subtarget->isLittleEndian());
16964 return true;
16965 }
16966
16967 return false;
16968}
16969
16970bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
16971 Align &RequiredAlignment) const {
16972 if (!LoadedType.isSimple() ||
16973 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
16974 return false;
16975 // Cyclone supports unaligned accesses.
16976 RequiredAlignment = Align(1);
16977 unsigned NumBits = LoadedType.getSizeInBits();
16978 return NumBits == 32 || NumBits == 64;
16979}
16980
16981/// A helper function for determining the number of interleaved accesses we
16982/// will generate when lowering accesses of the given type.
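///
/// For example (illustrative arithmetic only): a fixed <16 x i32> access with
/// 128-bit NEON vectors needs (16 * 32 + 127) / 128 = 4 accesses, whereas with
/// SVE fixed-length lowering and a 256-bit minimum vector size it needs 2.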
16983unsigned AArch64TargetLowering::getNumInterleavedAccesses(
16984 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
16985 unsigned VecSize = 128;
16986 unsigned ElSize = DL.getTypeSizeInBits(Ty: VecTy->getElementType());
16987 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
16988 if (UseScalable && isa<FixedVectorType>(Val: VecTy))
16989 VecSize = std::max(a: Subtarget->getMinSVEVectorSizeInBits(), b: 128u);
16990 return std::max<unsigned>(a: 1, b: (MinElts * ElSize + 127) / VecSize);
16991}
16992
16993MachineMemOperand::Flags
16994AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
16995 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
16996 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
16997 return MOStridedAccess;
16998 return MachineMemOperand::MONone;
16999}
17000
17001bool AArch64TargetLowering::isLegalInterleavedAccessType(
17002 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
17003 unsigned ElSize = DL.getTypeSizeInBits(Ty: VecTy->getElementType());
17004 auto EC = VecTy->getElementCount();
17005 unsigned MinElts = EC.getKnownMinValue();
17006
17007 UseScalable = false;
17008
17009 if (isa<FixedVectorType>(Val: VecTy) && !Subtarget->isNeonAvailable() &&
17010 (!Subtarget->useSVEForFixedLengthVectors() ||
17011 !getSVEPredPatternFromNumElements(MinNumElts: MinElts)))
17012 return false;
17013
17014 if (isa<ScalableVectorType>(Val: VecTy) &&
17015 !Subtarget->isSVEorStreamingSVEAvailable())
17016 return false;
17017
17018 // Ensure the number of vector elements is greater than 1.
17019 if (MinElts < 2)
17020 return false;
17021
17022 // Ensure the element type is legal.
17023 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
17024 return false;
17025
17026 if (EC.isScalable()) {
17027 UseScalable = true;
17028 return isPowerOf2_32(Value: MinElts) && (MinElts * ElSize) % 128 == 0;
17029 }
17030
17031 unsigned VecSize = DL.getTypeSizeInBits(Ty: VecTy);
17032 if (Subtarget->useSVEForFixedLengthVectors()) {
17033 unsigned MinSVEVectorSize =
17034 std::max(a: Subtarget->getMinSVEVectorSizeInBits(), b: 128u);
17035 if (VecSize % MinSVEVectorSize == 0 ||
17036 (VecSize < MinSVEVectorSize && isPowerOf2_32(Value: MinElts) &&
17037 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
17038 UseScalable = true;
17039 return true;
17040 }
17041 }
17042
17043 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
17044 // 128 will be split into multiple interleaved accesses.
17045 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
17046}
17047
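// Map the element type of a fixed-length vector to the 128-bit-granule
// scalable container type used by the SVE ldN/stN intrinsics. Illustrative
// examples (one 128-bit granule each): float -> <vscale x 4 x float>,
// i16 -> <vscale x 8 x i16>, i8 -> <vscale x 16 x i8>.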
17048static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
17049 if (VTy->getElementType() == Type::getDoubleTy(C&: VTy->getContext()))
17050 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 2);
17051
17052 if (VTy->getElementType() == Type::getFloatTy(C&: VTy->getContext()))
17053 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 4);
17054
17055 if (VTy->getElementType() == Type::getBFloatTy(C&: VTy->getContext()))
17056 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 8);
17057
17058 if (VTy->getElementType() == Type::getHalfTy(C&: VTy->getContext()))
17059 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 8);
17060
17061 if (VTy->getElementType() == Type::getInt64Ty(C&: VTy->getContext()))
17062 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 2);
17063
17064 if (VTy->getElementType() == Type::getInt32Ty(C&: VTy->getContext()))
17065 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 4);
17066
17067 if (VTy->getElementType() == Type::getInt16Ty(C&: VTy->getContext()))
17068 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 8);
17069
17070 if (VTy->getElementType() == Type::getInt8Ty(C&: VTy->getContext()))
17071 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 16);
17072
17073 llvm_unreachable("Cannot handle input vector type");
17074}
17075
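// Return the declaration of the ldN intrinsic to use for a structured load.
// As an illustrative sketch (mangled names approximate): Factor = 2 over
// <vscale x 4 x i32> selects @llvm.aarch64.sve.ld2.sret.nxv4i32, while
// Factor = 2 over a fixed <4 x i32> with NEON selects
// @llvm.aarch64.neon.ld2.v4i32.p0.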
17076static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
17077 bool Scalable, Type *LDVTy,
17078 Type *PtrTy) {
17079 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17080 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
17081 Intrinsic::aarch64_sve_ld3_sret,
17082 Intrinsic::aarch64_sve_ld4_sret};
17083 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
17084 Intrinsic::aarch64_neon_ld3,
17085 Intrinsic::aarch64_neon_ld4};
17086 if (Scalable)
17087 return Intrinsic::getOrInsertDeclaration(M, id: SVELoads[Factor - 2], Tys: {LDVTy});
17088
17089 return Intrinsic::getOrInsertDeclaration(M, id: NEONLoads[Factor - 2],
17090 Tys: {LDVTy, PtrTy});
17091}
17092
17093static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
17094 bool Scalable, Type *STVTy,
17095 Type *PtrTy) {
17096 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17097 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
17098 Intrinsic::aarch64_sve_st3,
17099 Intrinsic::aarch64_sve_st4};
17100 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
17101 Intrinsic::aarch64_neon_st3,
17102 Intrinsic::aarch64_neon_st4};
17103 if (Scalable)
17104 return Intrinsic::getOrInsertDeclaration(M, id: SVEStores[Factor - 2], Tys: {STVTy});
17105
17106 return Intrinsic::getOrInsertDeclaration(M, id: NEONStores[Factor - 2],
17107 Tys: {STVTy, PtrTy});
17108}
17109
17110/// Lower an interleaved load into a ldN intrinsic.
17111///
17112/// E.g. Lower an interleaved load (Factor = 2):
17113/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
17114/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
17115/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
17116///
17117/// Into:
17118/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
17119/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
17120/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
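///
/// Illegally wide types are split first; e.g. (an illustrative sketch, not
/// verbatim output) extracting the even/odd <8 x i32> halves of a <16 x i32>
/// load becomes two @llvm.aarch64.neon.ld2.v4i32.p0 calls, the second offset
/// by 8 i32 elements, whose results are concatenated back into <8 x i32>
/// values that replace the shufflevectors.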
17121bool AArch64TargetLowering::lowerInterleavedLoad(
17122 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
17123 ArrayRef<unsigned> Indices, unsigned Factor) const {
17124 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17125 "Invalid interleave factor");
17126 assert(!Shuffles.empty() && "Empty shufflevector input");
17127 assert(Shuffles.size() == Indices.size() &&
17128 "Unmatched number of shufflevectors and indices");
17129
17130 const DataLayout &DL = LI->getDataLayout();
17131
17132 VectorType *VTy = Shuffles[0]->getType();
17133
17134 // Skip if we do not have NEON and skip illegal vector types. We can
17135 // "legalize" wide vector types into multiple interleaved accesses as long as
17136 // the vector types are divisible by 128.
17137 bool UseScalable;
17138 if (!isLegalInterleavedAccessType(VecTy: VTy, DL, UseScalable))
17139 return false;
17140
17141 // Check if the interleave is a zext(shuffle), that can be better optimized
17142 // into shift / and masks. For the moment we do this just for uitofp (not
17143 // zext) to avoid issues with widening instructions.
17144 if (Shuffles.size() == 4 && all_of(Range&: Shuffles, P: [](ShuffleVectorInst *SI) {
17145 return SI->hasOneUse() && match(V: SI->user_back(), P: m_UIToFP(Op: m_Value())) &&
17146 SI->getType()->getScalarSizeInBits() * 4 ==
17147 SI->user_back()->getType()->getScalarSizeInBits();
17148 }))
17149 return false;
17150
17151 unsigned NumLoads = getNumInterleavedAccesses(VecTy: VTy, DL, UseScalable);
17152
17153 auto *FVTy = cast<FixedVectorType>(Val: VTy);
17154
  // A pointer vector cannot be the return type of the ldN intrinsics, so load
  // integer vectors first and then convert them to pointer vectors.
17157 Type *EltTy = FVTy->getElementType();
17158 if (EltTy->isPointerTy())
17159 FVTy =
17160 FixedVectorType::get(ElementType: DL.getIntPtrType(EltTy), NumElts: FVTy->getNumElements());
17161
17162 // If we're going to generate more than one load, reset the sub-vector type
17163 // to something legal.
17164 FVTy = FixedVectorType::get(ElementType: FVTy->getElementType(),
17165 NumElts: FVTy->getNumElements() / NumLoads);
17166
17167 auto *LDVTy =
17168 UseScalable ? cast<VectorType>(Val: getSVEContainerIRType(VTy: FVTy)) : FVTy;
17169
17170 IRBuilder<> Builder(LI);
17171
17172 // The base address of the load.
17173 Value *BaseAddr = LI->getPointerOperand();
17174
17175 Type *PtrTy = LI->getPointerOperandType();
17176 Type *PredTy = VectorType::get(ElementType: Type::getInt1Ty(C&: LDVTy->getContext()),
17177 EC: LDVTy->getElementCount());
17178
17179 Function *LdNFunc = getStructuredLoadFunction(M: LI->getModule(), Factor,
17180 Scalable: UseScalable, LDVTy, PtrTy);
17181
17182 // Holds sub-vectors extracted from the load intrinsic return values. The
17183 // sub-vectors are associated with the shufflevector instructions they will
17184 // replace.
17185 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
17186
17187 Value *PTrue = nullptr;
17188 if (UseScalable) {
17189 std::optional<unsigned> PgPattern =
17190 getSVEPredPatternFromNumElements(MinNumElts: FVTy->getNumElements());
17191 if (Subtarget->getMinSVEVectorSizeInBits() ==
17192 Subtarget->getMaxSVEVectorSizeInBits() &&
17193 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(Ty: FVTy))
17194 PgPattern = AArch64SVEPredPattern::all;
17195
17196 auto *PTruePat =
17197 ConstantInt::get(Ty: Type::getInt32Ty(C&: LDVTy->getContext()), V: *PgPattern);
17198 PTrue = Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue, Types: {PredTy},
17199 Args: {PTruePat});
17200 }
17201
17202 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
17203
17204 // If we're generating more than one load, compute the base address of
17205 // subsequent loads as an offset from the previous.
17206 if (LoadCount > 0)
17207 BaseAddr = Builder.CreateConstGEP1_32(Ty: LDVTy->getElementType(), Ptr: BaseAddr,
17208 Idx0: FVTy->getNumElements() * Factor);
17209
17210 CallInst *LdN;
17211 if (UseScalable)
17212 LdN = Builder.CreateCall(Callee: LdNFunc, Args: {PTrue, BaseAddr}, Name: "ldN");
17213 else
17214 LdN = Builder.CreateCall(Callee: LdNFunc, Args: BaseAddr, Name: "ldN");
17215
17216 // Extract and store the sub-vectors returned by the load intrinsic.
17217 for (unsigned i = 0; i < Shuffles.size(); i++) {
17218 ShuffleVectorInst *SVI = Shuffles[i];
17219 unsigned Index = Indices[i];
17220
17221 Value *SubVec = Builder.CreateExtractValue(Agg: LdN, Idxs: Index);
17222
17223 if (UseScalable)
17224 SubVec = Builder.CreateExtractVector(DstType: FVTy, SrcVec: SubVec, Idx: uint64_t(0));
17225
17226 // Convert the integer vector to pointer vector if the element is pointer.
17227 if (EltTy->isPointerTy())
17228 SubVec = Builder.CreateIntToPtr(
17229 V: SubVec, DestTy: FixedVectorType::get(ElementType: SVI->getType()->getElementType(),
17230 NumElts: FVTy->getNumElements()));
17231
17232 SubVecs[SVI].push_back(Elt: SubVec);
17233 }
17234 }
17235
17236 // Replace uses of the shufflevector instructions with the sub-vectors
17237 // returned by the load intrinsic. If a shufflevector instruction is
17238 // associated with more than one sub-vector, those sub-vectors will be
17239 // concatenated into a single wide vector.
17240 for (ShuffleVectorInst *SVI : Shuffles) {
17241 auto &SubVec = SubVecs[SVI];
17242 auto *WideVec =
17243 SubVec.size() > 1 ? concatenateVectors(Builder, Vecs: SubVec) : SubVec[0];
17244 SVI->replaceAllUsesWith(V: WideVec);
17245 }
17246
17247 return true;
17248}
17249
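// Return true if, within a small window of instructions around \p It, there is
// a store whose pointer resolves to the same base as \p Ptr at a constant
// offset of exactly +/-16 bytes. A minimal IR sketch (hypothetical, not taken
// from a test):
//
//   store <2 x i32> %a, ptr %p
//   %q = getelementptr inbounds i8, ptr %p, i64 16
//   store <2 x i32> %b, ptr %q   ; paired store 16 bytes away from %p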
17250template <typename Iter>
17251bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
17252 int MaxLookupDist = 20;
17253 unsigned IdxWidth = DL.getIndexSizeInBits(AS: 0);
17254 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
17255 const Value *PtrA1 =
17256 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset&: OffsetA);
17257
17258 while (++It != End) {
17259 if (It->isDebugOrPseudoInst())
17260 continue;
17261 if (MaxLookupDist-- == 0)
17262 break;
17263 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
17264 const Value *PtrB1 =
17265 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
17266 DL, OffsetB);
17267 if (PtrA1 == PtrB1 &&
17268 (OffsetA.sextOrTrunc(width: IdxWidth) - OffsetB.sextOrTrunc(width: IdxWidth))
17269 .abs() == 16)
17270 return true;
17271 }
17272 }
17273
17274 return false;
17275}
17276
17277/// Lower an interleaved store into a stN intrinsic.
17278///
17279/// E.g. Lower an interleaved store (Factor = 3):
17280/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
17281/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
17282/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17283///
17284/// Into:
17285/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
17286/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
17287/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
17288/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17289///
17290/// Note that the new shufflevectors will be removed and we'll only generate one
17291/// st3 instruction in CodeGen.
17292///
17293/// Example for a more general valid mask (Factor 3). Lower:
17294/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
17295/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
17296/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17297///
17298/// Into:
17299/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
17300/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
17301/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
17302/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17303bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
17304 ShuffleVectorInst *SVI,
17305 unsigned Factor) const {
17306
17307 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17308 "Invalid interleave factor");
17309
17310 auto *VecTy = cast<FixedVectorType>(Val: SVI->getType());
17311 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
17312
17313 unsigned LaneLen = VecTy->getNumElements() / Factor;
17314 Type *EltTy = VecTy->getElementType();
17315 auto *SubVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: LaneLen);
17316
17317 const DataLayout &DL = SI->getDataLayout();
17318 bool UseScalable;
17319
17320 // Skip if we do not have NEON and skip illegal vector types. We can
17321 // "legalize" wide vector types into multiple interleaved accesses as long as
17322 // the vector types are divisible by 128.
17323 if (!isLegalInterleavedAccessType(VecTy: SubVecTy, DL, UseScalable))
17324 return false;
17325
17326 unsigned NumStores = getNumInterleavedAccesses(VecTy: SubVecTy, DL, UseScalable);
17327
17328 Value *Op0 = SVI->getOperand(i_nocapture: 0);
17329 Value *Op1 = SVI->getOperand(i_nocapture: 1);
17330 IRBuilder<> Builder(SI);
17331
17332 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
17333 // vectors to integer vectors.
17334 if (EltTy->isPointerTy()) {
17335 Type *IntTy = DL.getIntPtrType(EltTy);
17336 unsigned NumOpElts =
17337 cast<FixedVectorType>(Val: Op0->getType())->getNumElements();
17338
17339 // Convert to the corresponding integer vector.
17340 auto *IntVecTy = FixedVectorType::get(ElementType: IntTy, NumElts: NumOpElts);
17341 Op0 = Builder.CreatePtrToInt(V: Op0, DestTy: IntVecTy);
17342 Op1 = Builder.CreatePtrToInt(V: Op1, DestTy: IntVecTy);
17343
17344 SubVecTy = FixedVectorType::get(ElementType: IntTy, NumElts: LaneLen);
17345 }
17346
17347 // If we're going to generate more than one store, reset the lane length
17348 // and sub-vector type to something legal.
17349 LaneLen /= NumStores;
17350 SubVecTy = FixedVectorType::get(ElementType: SubVecTy->getElementType(), NumElts: LaneLen);
17351
17352 auto *STVTy = UseScalable ? cast<VectorType>(Val: getSVEContainerIRType(VTy: SubVecTy))
17353 : SubVecTy;
17354
17355 // The base address of the store.
17356 Value *BaseAddr = SI->getPointerOperand();
17357
17358 auto Mask = SVI->getShuffleMask();
17359
  // Bail out if none of the mask indices are in range: if the whole mask is
  // poison, `Mask` is a vector of -1s, and continuing would lead to an
  // out-of-bounds read later.
  if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; }))
    return false;
  // A 64-bit st2 that does not start at element 0 will involve adding extra
  // ext instructions, making the st2 unprofitable. If there is a nearby store
  // that points to BaseAddr+16 or BaseAddr-16, it can be better left as a
  // zip;ldp pair, which has higher throughput.
17370 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
17371 (Mask[0] != 0 ||
17372 hasNearbyPairedStore(It: SI->getIterator(), End: SI->getParent()->end(), Ptr: BaseAddr,
17373 DL) ||
17374 hasNearbyPairedStore(It: SI->getReverseIterator(), End: SI->getParent()->rend(),
17375 Ptr: BaseAddr, DL)))
17376 return false;
17377
17378 Type *PtrTy = SI->getPointerOperandType();
17379 Type *PredTy = VectorType::get(ElementType: Type::getInt1Ty(C&: STVTy->getContext()),
17380 EC: STVTy->getElementCount());
17381
17382 Function *StNFunc = getStructuredStoreFunction(M: SI->getModule(), Factor,
17383 Scalable: UseScalable, STVTy, PtrTy);
17384
17385 Value *PTrue = nullptr;
17386 if (UseScalable) {
17387 std::optional<unsigned> PgPattern =
17388 getSVEPredPatternFromNumElements(MinNumElts: SubVecTy->getNumElements());
17389 if (Subtarget->getMinSVEVectorSizeInBits() ==
17390 Subtarget->getMaxSVEVectorSizeInBits() &&
17391 Subtarget->getMinSVEVectorSizeInBits() ==
17392 DL.getTypeSizeInBits(Ty: SubVecTy))
17393 PgPattern = AArch64SVEPredPattern::all;
17394
17395 auto *PTruePat =
17396 ConstantInt::get(Ty: Type::getInt32Ty(C&: STVTy->getContext()), V: *PgPattern);
17397 PTrue = Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue, Types: {PredTy},
17398 Args: {PTruePat});
17399 }
17400
17401 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
17402
17403 SmallVector<Value *, 5> Ops;
17404
17405 // Split the shufflevector operands into sub vectors for the new stN call.
17406 for (unsigned i = 0; i < Factor; i++) {
17407 Value *Shuffle;
17408 unsigned IdxI = StoreCount * LaneLen * Factor + i;
17409 if (Mask[IdxI] >= 0) {
17410 Shuffle = Builder.CreateShuffleVector(
17411 V1: Op0, V2: Op1, Mask: createSequentialMask(Start: Mask[IdxI], NumInts: LaneLen, NumUndefs: 0));
17412 } else {
17413 unsigned StartMask = 0;
17414 for (unsigned j = 1; j < LaneLen; j++) {
17415 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
17416 if (Mask[IdxJ] >= 0) {
17417 StartMask = Mask[IdxJ] - j;
17418 break;
17419 }
17420 }
        // Note: Filling undef gaps with random elements is ok, since those
        // elements were being written anyway (with undefs). In the case of all
        // undefs we default to using elements starting at 0.
        // Note: StartMask cannot be negative; it is checked in
        // isReInterleaveMask.
17426 Shuffle = Builder.CreateShuffleVector(
17427 V1: Op0, V2: Op1, Mask: createSequentialMask(Start: StartMask, NumInts: LaneLen, NumUndefs: 0));
17428 }
17429
17430 if (UseScalable)
17431 Shuffle = Builder.CreateInsertVector(DstType: STVTy, SrcVec: PoisonValue::get(T: STVTy),
17432 SubVec: Shuffle, Idx: uint64_t(0));
17433
17434 Ops.push_back(Elt: Shuffle);
17435 }
17436
17437 if (UseScalable)
17438 Ops.push_back(Elt: PTrue);
17439
    // If we're generating more than one store, compute the base address of
    // subsequent stores as an offset from the previous one.
17442 if (StoreCount > 0)
17443 BaseAddr = Builder.CreateConstGEP1_32(Ty: SubVecTy->getElementType(),
17444 Ptr: BaseAddr, Idx0: LaneLen * Factor);
17445
17446 Ops.push_back(Elt: BaseAddr);
17447 Builder.CreateCall(Callee: StNFunc, Args: Ops);
17448 }
17449 return true;
17450}
17451
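// Lower the results of a vector deinterleave intrinsic, fed by a single wide
// load, into an ldN structured load. Illustrative sketch (Factor = 2,
// scalable types, names hypothetical):
//
//   %wide = load <vscale x 8 x i32>, ptr %ptr
//   ; %even, %odd : <vscale x 4 x i32> deinterleaved from %wide
// becomes
//   %ldN = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
//              @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> %ptrue,
//                                                 ptr %ptr)
// and %even/%odd are replaced by the two extracted results.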
17452bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
17453 LoadInst *LI, ArrayRef<Value *> DeinterleavedValues) const {
17454 unsigned Factor = DeinterleavedValues.size();
17455 if (Factor != 2 && Factor != 4) {
17456 LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
17457 return false;
17458 }
17459
17460 VectorType *VTy = cast<VectorType>(Val: DeinterleavedValues[0]->getType());
17461
17462 const DataLayout &DL = LI->getModule()->getDataLayout();
17463 bool UseScalable;
17464 if (!isLegalInterleavedAccessType(VecTy: VTy, DL, UseScalable))
17465 return false;
17466
17467 // TODO: Add support for using SVE instructions with fixed types later, using
17468 // the code from lowerInterleavedLoad to obtain the correct container type.
17469 if (UseScalable && !VTy->isScalableTy())
17470 return false;
17471
17472 unsigned NumLoads = getNumInterleavedAccesses(VecTy: VTy, DL, UseScalable);
17473 VectorType *LdTy =
17474 VectorType::get(ElementType: VTy->getElementType(),
17475 EC: VTy->getElementCount().divideCoefficientBy(RHS: NumLoads));
17476
17477 Type *PtrTy = LI->getPointerOperandType();
17478 Function *LdNFunc = getStructuredLoadFunction(M: LI->getModule(), Factor,
17479 Scalable: UseScalable, LDVTy: LdTy, PtrTy);
17480
17481 IRBuilder<> Builder(LI);
17482 Value *Pred = nullptr;
17483 if (UseScalable)
17484 Pred =
17485 Builder.CreateVectorSplat(EC: LdTy->getElementCount(), V: Builder.getTrue());
17486
17487 Value *BaseAddr = LI->getPointerOperand();
17488 if (NumLoads > 1) {
17489 // Create multiple legal small ldN.
17490 SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(T: VTy));
17491 for (unsigned I = 0; I < NumLoads; ++I) {
17492 Value *Offset = Builder.getInt64(C: I * Factor);
17493
17494 Value *Address = Builder.CreateGEP(Ty: LdTy, Ptr: BaseAddr, IdxList: {Offset});
17495 Value *LdN = nullptr;
17496 if (UseScalable)
17497 LdN = Builder.CreateCall(Callee: LdNFunc, Args: {Pred, Address}, Name: "ldN");
17498 else
17499 LdN = Builder.CreateCall(Callee: LdNFunc, Args: Address, Name: "ldN");
17500 Value *Idx =
17501 Builder.getInt64(C: I * LdTy->getElementCount().getKnownMinValue());
17502 for (unsigned J = 0; J < Factor; ++J) {
17503 ExtractedLdValues[J] = Builder.CreateInsertVector(
17504 DstType: VTy, SrcVec: ExtractedLdValues[J], SubVec: Builder.CreateExtractValue(Agg: LdN, Idxs: J), Idx);
17505 }
17506 LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
17507 }
    // Replace the outputs of the deinterleave2 intrinsic with the outputs of
    // ldN2/ldN4.
17509 for (unsigned J = 0; J < Factor; ++J)
17510 DeinterleavedValues[J]->replaceAllUsesWith(V: ExtractedLdValues[J]);
17511 } else {
17512 Value *Result;
17513 if (UseScalable)
17514 Result = Builder.CreateCall(Callee: LdNFunc, Args: {Pred, BaseAddr}, Name: "ldN");
17515 else
17516 Result = Builder.CreateCall(Callee: LdNFunc, Args: BaseAddr, Name: "ldN");
    // Replace the outputs of the deinterleave2 intrinsic with the outputs of
    // ldN2/ldN4.
17518 for (unsigned I = 0; I < Factor; I++) {
17519 Value *NewExtract = Builder.CreateExtractValue(Agg: Result, Idxs: I);
17520 DeinterleavedValues[I]->replaceAllUsesWith(V: NewExtract);
17521 }
17522 }
17523 return true;
17524}
17525
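// Lower a store of interleaved values into an stN structured store.
// Illustrative sketch (Factor = 2, scalable types, names hypothetical):
//
//   ; %a, %b : <vscale x 4 x i32>, interleaved and stored as <vscale x 8 x i32>
// becomes
//   call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> %a,
//                                           <vscale x 4 x i32> %b,
//                                           <vscale x 4 x i1> %ptrue, ptr %ptr)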
17526bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
17527 StoreInst *SI, ArrayRef<Value *> InterleavedValues) const {
17528 unsigned Factor = InterleavedValues.size();
17529 if (Factor != 2 && Factor != 4) {
17530 LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
17531 return false;
17532 }
17533
17534 VectorType *VTy = cast<VectorType>(Val: InterleavedValues[0]->getType());
17535 const DataLayout &DL = SI->getModule()->getDataLayout();
17536
17537 bool UseScalable;
17538 if (!isLegalInterleavedAccessType(VecTy: VTy, DL, UseScalable))
17539 return false;
17540
17541 // TODO: Add support for using SVE instructions with fixed types later, using
17542 // the code from lowerInterleavedStore to obtain the correct container type.
17543 if (UseScalable && !VTy->isScalableTy())
17544 return false;
17545
17546 unsigned NumStores = getNumInterleavedAccesses(VecTy: VTy, DL, UseScalable);
17547
17548 VectorType *StTy =
17549 VectorType::get(ElementType: VTy->getElementType(),
17550 EC: VTy->getElementCount().divideCoefficientBy(RHS: NumStores));
17551
17552 Type *PtrTy = SI->getPointerOperandType();
17553 Function *StNFunc = getStructuredStoreFunction(M: SI->getModule(), Factor,
17554 Scalable: UseScalable, STVTy: StTy, PtrTy);
17555
17556 IRBuilder<> Builder(SI);
17557
17558 Value *BaseAddr = SI->getPointerOperand();
17559 Value *Pred = nullptr;
17560
17561 if (UseScalable)
17562 Pred =
17563 Builder.CreateVectorSplat(EC: StTy->getElementCount(), V: Builder.getTrue());
17564
17565 auto ExtractedValues = InterleavedValues;
17566 SmallVector<Value *, 4> StoreOperands(InterleavedValues);
17567 if (UseScalable)
17568 StoreOperands.push_back(Elt: Pred);
17569 StoreOperands.push_back(Elt: BaseAddr);
17570 for (unsigned I = 0; I < NumStores; ++I) {
17571 Value *Address = BaseAddr;
17572 if (NumStores > 1) {
17573 Value *Offset = Builder.getInt64(C: I * Factor);
17574 Address = Builder.CreateGEP(Ty: StTy, Ptr: BaseAddr, IdxList: {Offset});
17575 Value *Idx =
17576 Builder.getInt64(C: I * StTy->getElementCount().getKnownMinValue());
17577 for (unsigned J = 0; J < Factor; J++) {
17578 StoreOperands[J] =
17579 Builder.CreateExtractVector(DstType: StTy, SrcVec: ExtractedValues[J], Idx);
17580 }
      // Update the address operand for this store.
17582 StoreOperands[StoreOperands.size() - 1] = Address;
17583 }
17584 Builder.CreateCall(Callee: StNFunc, Args: StoreOperands);
17585 }
17586 return true;
17587}
17588
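// For example (illustrative): a 64-byte memset with NEON available and 16-byte
// alignment is lowered with MVT::v16i8 stores, while a 16-byte memset falls
// under the 32-byte AdvSIMD threshold below and is done with MVT::i64.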
17589EVT AArch64TargetLowering::getOptimalMemOpType(
17590 const MemOp &Op, const AttributeList &FuncAttributes) const {
17591 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Kind: Attribute::NoImplicitFloat);
17592 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
17593 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
  // Only use AdvSIMD to implement memsets of 32 bytes and above: below that it
  // would take one instruction to materialize the v2i64 zero and one store
  // (with a restrictive addressing mode), so plain i64 stores are used instead.
17597 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
17598 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
17599 if (Op.isAligned(AlignCheck))
17600 return true;
17601 unsigned Fast;
17602 return allowsMisalignedMemoryAccesses(VT, AddrSpace: 0, Alignment: Align(1),
17603 Flags: MachineMemOperand::MONone, Fast: &Fast) &&
17604 Fast;
17605 };
17606
17607 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
17608 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
17609 return MVT::v16i8;
17610 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
17611 return MVT::f128;
17612 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
17613 return MVT::i64;
17614 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
17615 return MVT::i32;
17616 return MVT::Other;
17617}
17618
17619LLT AArch64TargetLowering::getOptimalMemOpLLT(
17620 const MemOp &Op, const AttributeList &FuncAttributes) const {
17621 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Kind: Attribute::NoImplicitFloat);
17622 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
17623 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
  // Only use AdvSIMD to implement memsets of 32 bytes and above: below that it
  // would take one instruction to materialize the v2i64 zero and one store
  // (with a restrictive addressing mode), so plain i64 stores are used instead.
17627 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
17628 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
17629 if (Op.isAligned(AlignCheck))
17630 return true;
17631 unsigned Fast;
17632 return allowsMisalignedMemoryAccesses(VT, AddrSpace: 0, Alignment: Align(1),
17633 Flags: MachineMemOperand::MONone, Fast: &Fast) &&
17634 Fast;
17635 };
17636
17637 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
17638 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
17639 return LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
17640 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
17641 return LLT::scalar(SizeInBits: 128);
17642 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
17643 return LLT::scalar(SizeInBits: 64);
17644 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
17645 return LLT::scalar(SizeInBits: 32);
17646 return LLT();
17647}
17648
17649// 12-bit optionally shifted immediates are legal for adds.
17650bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
17651 if (Immed == std::numeric_limits<int64_t>::min()) {
17652 return false;
17653 }
17654 // Same encoding for add/sub, just flip the sign.
17655 return isLegalArithImmed(C: (uint64_t)std::abs(i: Immed));
17656}
17657
17658bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
17659 // We will only emit addvl/inc* instructions for SVE2
17660 if (!Subtarget->hasSVE2())
17661 return false;
17662
17663 // addvl's immediates are in terms of the number of bytes in a register.
17664 // Since there are 16 in the base supported size (128bits), we need to
17665 // divide the immediate by that much to give us a useful immediate to
17666 // multiply by vscale. We can't have a remainder as a result of this.
17667 if (Imm % 16 == 0)
17668 return isInt<6>(x: Imm / 16);
17669
17670 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
17671 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
17672 // of addvl as a result, so only take h|w|d into account.
17673 // Dec[h|w|d] will cover subtractions.
17674 // Immediates are in the range [1,16], so we can't do a 2's complement check.
17675 // FIXME: Can we make use of other patterns to cover other immediates?
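  // As a sketch of the encoding checks below (illustrative values): Imm == 64
  // is 64 / 16 == 4 and is encodable as addvl #4; Imm == 40 is not a multiple
  // of 16 but is 8 * 5, so it can be materialized as inch with the 'all'
  // pattern and multiplier 5.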
17676
17677 // inch|dech
17678 if (Imm % 8 == 0)
17679 return std::abs(i: Imm / 8) <= 16;
17680 // incw|decw
17681 if (Imm % 4 == 0)
17682 return std::abs(i: Imm / 4) <= 16;
17683 // incd|decd
17684 if (Imm % 2 == 0)
17685 return std::abs(i: Imm / 2) <= 16;
17686
17687 return false;
17688}
17689
17690// Return false to prevent folding
17691// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
17692// if the folding leads to worse code.
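// For example (illustrative): with c1 == 1 (a legal add immediate) and
// c2 == 0x12345678, the product c1*c2 needs a MOVZ plus a MOVK to materialize,
// so the fold is rejected and the add is kept outside the multiply.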
17693bool AArch64TargetLowering::isMulAddWithConstProfitable(
17694 SDValue AddNode, SDValue ConstNode) const {
17695 // Let the DAGCombiner decide for vector types and large types.
17696 const EVT VT = AddNode.getValueType();
17697 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
17698 return true;
17699
  // It is worse if c1 is a legal add immediate while c1*c2 is not, and c1*c2
  // has to be composed of at least two instructions.
17702 const ConstantSDNode *C1Node = cast<ConstantSDNode>(Val: AddNode.getOperand(i: 1));
17703 const ConstantSDNode *C2Node = cast<ConstantSDNode>(Val&: ConstNode);
17704 const int64_t C1 = C1Node->getSExtValue();
17705 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
17706 if (!isLegalAddImmediate(Immed: C1) || isLegalAddImmediate(Immed: C1C2.getSExtValue()))
17707 return true;
17708 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
17709 // Adapt to the width of a register.
17710 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
17711 AArch64_IMM::expandMOVImm(Imm: C1C2.getZExtValue(), BitSize, Insn);
17712 if (Insn.size() > 1)
17713 return false;
17714
17715 // Default to true and let the DAGCombiner decide.
17716 return true;
17717}
17718
17719// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
17720// immediates is the same as for an add or a sub.
17721bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
17722 return isLegalAddImmediate(Immed);
17723}
17724
17725/// isLegalAddressingMode - Return true if the addressing mode represented
17726/// by AM is legal for this target, for a load/store of the specified type.
17727bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
17728 const AddrMode &AMode, Type *Ty,
17729 unsigned AS, Instruction *I) const {
17730 // AArch64 has five basic addressing modes:
17731 // reg
17732 // reg + 9-bit signed offset
17733 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
17734 // reg1 + reg2
17735 // reg + SIZE_IN_BYTES * reg
17736
17737 // No global is ever allowed as a base.
17738 if (AMode.BaseGV)
17739 return false;
17740
17741 // No reg+reg+imm addressing.
17742 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
17743 return false;
17744
17745 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
17746 // `2*ScaledReg` into `BaseReg + ScaledReg`
17747 AddrMode AM = AMode;
17748 if (AM.Scale && !AM.HasBaseReg) {
17749 if (AM.Scale == 1) {
17750 AM.HasBaseReg = true;
17751 AM.Scale = 0;
17752 } else if (AM.Scale == 2) {
17753 AM.HasBaseReg = true;
17754 AM.Scale = 1;
17755 } else {
17756 return false;
17757 }
17758 }
17759
17760 // A base register is required in all addressing modes.
17761 if (!AM.HasBaseReg)
17762 return false;
17763
17764 if (Ty->isScalableTy()) {
17765 if (isa<ScalableVectorType>(Val: Ty)) {
17766 // See if we have a foldable vscale-based offset, for vector types which
17767 // are either legal or smaller than the minimum; more work will be
17768 // required if we need to consider addressing for types which need
17769 // legalization by splitting.
17770 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
17771 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
17772 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
17773 isPowerOf2_64(Value: VecNumBytes))
17774 return isInt<4>(x: AM.ScalableOffset / (int64_t)VecNumBytes);
17775
17776 uint64_t VecElemNumBytes =
17777 DL.getTypeSizeInBits(Ty: cast<VectorType>(Val: Ty)->getElementType()) / 8;
17778 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
17779 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
17780 }
17781
17782 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
17783 }
17784
17785 // No scalable offsets allowed for non-scalable types.
17786 if (AM.ScalableOffset)
17787 return false;
17788
17789 // check reg + imm case:
17790 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
17791 uint64_t NumBytes = 0;
17792 if (Ty->isSized()) {
17793 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
17794 NumBytes = NumBits / 8;
17795 if (!isPowerOf2_64(Value: NumBits))
17796 NumBytes = 0;
17797 }
17798
17799 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, Offset: AM.BaseOffs,
17800 Scale: AM.Scale);
17801}
17802
// Check whether the two offsets belong to the same imm24 range and share the
// same high 12 bits; if so, their common high part can be materialized with
// the immediate of an add.
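// For example (illustrative): MinOffset == 0x1234 and MaxOffset == 0x1470 have
// the same high 12 bits, so 0x1000 is returned; the accesses are then rebased
// to [base + 0x1000] with 12-bit immediates 0x234 and 0x470.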
17805int64_t
17806AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
17807 int64_t MaxOffset) const {
17808 int64_t HighPart = MinOffset & ~0xfffULL;
17809 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(Immed: HighPart)) {
17810 // Rebase the value to an integer multiple of imm12.
17811 return HighPart;
17812 }
17813
17814 return 0;
17815}
17816
17817bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
17818 // Consider splitting large offset of struct or array.
17819 return true;
17820}
17821
17822bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
17823 const MachineFunction &MF, EVT VT) const {
17824 VT = VT.getScalarType();
17825
17826 if (!VT.isSimple())
17827 return false;
17828
17829 switch (VT.getSimpleVT().SimpleTy) {
17830 case MVT::f16:
17831 return Subtarget->hasFullFP16();
17832 case MVT::f32:
17833 case MVT::f64:
17834 return true;
17835 default:
17836 break;
17837 }
17838
17839 return false;
17840}
17841
17842bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
17843 Type *Ty) const {
17844 switch (Ty->getScalarType()->getTypeID()) {
17845 case Type::FloatTyID:
17846 case Type::DoubleTyID:
17847 return true;
17848 default:
17849 return false;
17850 }
17851}
17852
17853bool AArch64TargetLowering::generateFMAsInMachineCombiner(
17854 EVT VT, CodeGenOptLevel OptLevel) const {
17855 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
17856 !useSVEForFixedLengthVectorVT(VT);
17857}
17858
17859const MCPhysReg *
17860AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
17861 // LR is a callee-save register, but we must treat it as clobbered by any call
17862 // site. Hence we include LR in the scratch registers, which are in turn added
17863 // as implicit-defs for stackmaps and patchpoints.
17864 static const MCPhysReg ScratchRegs[] = {
17865 AArch64::X16, AArch64::X17, AArch64::LR, 0
17866 };
17867 return ScratchRegs;
17868}
17869
17870ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
17871 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
17872 return RCRegs;
17873}
17874
17875bool
17876AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
17877 CombineLevel Level) const {
17878 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
17879 N->getOpcode() == ISD::SRL) &&
17880 "Expected shift op");
17881
17882 SDValue ShiftLHS = N->getOperand(Num: 0);
17883 EVT VT = N->getValueType(ResNo: 0);
17884
17885 if (!ShiftLHS->hasOneUse())
17886 return false;
17887
17888 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
17889 !ShiftLHS.getOperand(i: 0)->hasOneUse())
17890 return false;
17891
  // If ShiftLHS is an unsigned bit extraction ((x >> C) & mask), do not
  // combine it with shift 'N', so that it can be lowered to UBFX, except for
  // ((x >> C) & mask) << C.
17895 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
17896 isa<ConstantSDNode>(Val: ShiftLHS.getOperand(i: 1))) {
17897 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(i: 1);
17898 if (isMask_64(Value: TruncMask)) {
17899 SDValue AndLHS = ShiftLHS.getOperand(i: 0);
17900 if (AndLHS.getOpcode() == ISD::SRL) {
17901 if (auto *SRLC = dyn_cast<ConstantSDNode>(Val: AndLHS.getOperand(i: 1))) {
17902 if (N->getOpcode() == ISD::SHL)
17903 if (auto *SHLC = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1)))
17904 return SRLC->getZExtValue() == SHLC->getZExtValue();
17905 return false;
17906 }
17907 }
17908 }
17909 }
17910 return true;
17911}
17912
17913bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
17914 const SDNode *N) const {
17915 assert(N->getOpcode() == ISD::XOR &&
17916 (N->getOperand(0).getOpcode() == ISD::SHL ||
17917 N->getOperand(0).getOpcode() == ISD::SRL) &&
17918 "Expected XOR(SHIFT) pattern");
17919
17920 // Only commute if the entire NOT mask is a hidden shifted mask.
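  // For example (illustrative, i32): xor (shl X, 8), 0xffffff00 has a shifted
  // mask with MaskIdx == 8 and MaskLen == 24 == BitWidth - ShiftAmt, so the
  // xor is commuted with the shift; xor (shl X, 8), 0x00ffff00 (MaskLen == 16)
  // is not.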
17921 auto *XorC = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
17922 auto *ShiftC = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0).getOperand(i: 1));
17923 if (XorC && ShiftC) {
17924 unsigned MaskIdx, MaskLen;
17925 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
17926 unsigned ShiftAmt = ShiftC->getZExtValue();
17927 unsigned BitWidth = N->getValueType(ResNo: 0).getScalarSizeInBits();
17928 if (N->getOperand(Num: 0).getOpcode() == ISD::SHL)
17929 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
17930 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
17931 }
17932 }
17933
17934 return false;
17935}
17936
17937bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
17938 const SDNode *N, CombineLevel Level) const {
17939 assert(((N->getOpcode() == ISD::SHL &&
17940 N->getOperand(0).getOpcode() == ISD::SRL) ||
17941 (N->getOpcode() == ISD::SRL &&
17942 N->getOperand(0).getOpcode() == ISD::SHL)) &&
17943 "Expected shift-shift mask");
17944 // Don't allow multiuse shift folding with the same shift amount.
17945 if (!N->getOperand(Num: 0)->hasOneUse())
17946 return false;
17947
17948 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
17949 EVT VT = N->getValueType(ResNo: 0);
17950 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
17951 auto *C1 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0).getOperand(i: 1));
17952 auto *C2 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
17953 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
17954 }
17955
  // We do not need to fold when this shift is used in the specific load case:
  // (ldr x, (add x, (shl (srl x, c1) 2)))
17958 if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
17959 if (auto C2 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1))) {
17960 unsigned ShlAmt = C2->getZExtValue();
17961 if (auto ShouldADD = *N->user_begin();
17962 ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
17963 if (auto ShouldLOAD = dyn_cast<LoadSDNode>(Val: *ShouldADD->user_begin())) {
17964 unsigned ByteVT = ShouldLOAD->getMemoryVT().getSizeInBits() / 8;
17965 if ((1ULL << ShlAmt) == ByteVT &&
17966 isIndexedLoadLegal(IdxMode: ISD::PRE_INC, VT: ShouldLOAD->getMemoryVT()))
17967 return false;
17968 }
17969 }
17970 }
17971 }
17972
17973 return true;
17974}
17975
17976bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
17977 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
17978 SDValue Y) const {
17979 return VT.isScalableVector() && isTypeLegal(VT) &&
17980 SelectOpcode == ISD::VSELECT;
17981}
17982
17983bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
17984 Type *Ty) const {
17985 assert(Ty->isIntegerTy());
17986
17987 unsigned BitSize = Ty->getPrimitiveSizeInBits();
17988 if (BitSize == 0)
17989 return false;
17990
17991 int64_t Val = Imm.getSExtValue();
17992 if (Val == 0 || AArch64_AM::isLogicalImmediate(imm: Val, regSize: BitSize))
17993 return true;
17994
17995 if ((int64_t)Val < 0)
17996 Val = ~Val;
17997 if (BitSize == 32)
17998 Val &= (1LL << 32) - 1;
17999
18000 unsigned Shift = llvm::Log2_64(Value: (uint64_t)Val) / 16;
18001 // MOVZ is free so return true for one or fewer MOVK.
18002 return Shift < 3;
18003}
18004
18005bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
18006 unsigned Index) const {
18007 if (!isOperationLegalOrCustom(Op: ISD::EXTRACT_SUBVECTOR, VT: ResVT))
18008 return false;
18009
18010 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
18011}
18012
18013/// Turn vector tests of the signbit in the form of:
18014/// xor (sra X, elt_size(X)-1), -1
18015/// into:
18016/// cmge X, X, #0
18017static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
18018 const AArch64Subtarget *Subtarget) {
18019 EVT VT = N->getValueType(ResNo: 0);
18020 if (!Subtarget->hasNEON() || !VT.isVector())
18021 return SDValue();
18022
18023 // There must be a shift right algebraic before the xor, and the xor must be a
18024 // 'not' operation.
18025 SDValue Shift = N->getOperand(Num: 0);
18026 SDValue Ones = N->getOperand(Num: 1);
18027 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
18028 !ISD::isBuildVectorAllOnes(N: Ones.getNode()))
18029 return SDValue();
18030
18031 // The shift should be smearing the sign bit across each vector element.
18032 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1));
18033 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
18034 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
18035 return SDValue();
18036
18037 SDLoc DL(N);
18038 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: Shift.getValueType());
18039 return DAG.getSetCC(DL, VT, LHS: Shift.getOperand(i: 0), RHS: Zero, Cond: ISD::SETGE);
18040}
18041
// Given a vecreduce_add node, detect the below pattern and convert it to a
// node sequence with UABDL, [S|U]ABD and UADDLP.
18044//
18045// i32 vecreduce_add(
18046// v16i32 abs(
18047// v16i32 sub(
18048// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
18049// =================>
18050// i32 vecreduce_add(
18051// v4i32 UADDLP(
18052// v8i16 add(
18053// v8i16 zext(
18054// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
18055// v8i16 zext(
18056// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
18057static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
18058 SelectionDAG &DAG) {
18059 // Assumed i32 vecreduce_add
18060 if (N->getValueType(ResNo: 0) != MVT::i32)
18061 return SDValue();
18062
18063 SDValue VecReduceOp0 = N->getOperand(Num: 0);
18064 unsigned Opcode = VecReduceOp0.getOpcode();
18065 // Assumed v16i32 abs
18066 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(ResNo: 0) != MVT::v16i32)
18067 return SDValue();
18068
18069 SDValue ABS = VecReduceOp0;
18070 // Assumed v16i32 sub
18071 if (ABS->getOperand(Num: 0)->getOpcode() != ISD::SUB ||
18072 ABS->getOperand(Num: 0)->getValueType(ResNo: 0) != MVT::v16i32)
18073 return SDValue();
18074
18075 SDValue SUB = ABS->getOperand(Num: 0);
18076 unsigned Opcode0 = SUB->getOperand(Num: 0).getOpcode();
18077 unsigned Opcode1 = SUB->getOperand(Num: 1).getOpcode();
18078 // Assumed v16i32 type
18079 if (SUB->getOperand(Num: 0)->getValueType(ResNo: 0) != MVT::v16i32 ||
18080 SUB->getOperand(Num: 1)->getValueType(ResNo: 0) != MVT::v16i32)
18081 return SDValue();
18082
18083 // Assumed zext or sext
18084 bool IsZExt = false;
18085 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
18086 IsZExt = true;
18087 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
18088 IsZExt = false;
18089 } else
18090 return SDValue();
18091
18092 SDValue EXT0 = SUB->getOperand(Num: 0);
18093 SDValue EXT1 = SUB->getOperand(Num: 1);
18094 // Assumed zext's operand has v16i8 type
18095 if (EXT0->getOperand(Num: 0)->getValueType(ResNo: 0) != MVT::v16i8 ||
18096 EXT1->getOperand(Num: 0)->getValueType(ResNo: 0) != MVT::v16i8)
18097 return SDValue();
18098
18099 // Pattern is detected. Let's convert it to sequence of nodes.
18100 SDLoc DL(N);
18101
18102 // First, create the node pattern of UABD/SABD.
18103 SDValue UABDHigh8Op0 =
18104 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: EXT0->getOperand(Num: 0),
18105 N2: DAG.getConstant(Val: 8, DL, VT: MVT::i64));
18106 SDValue UABDHigh8Op1 =
18107 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: EXT1->getOperand(Num: 0),
18108 N2: DAG.getConstant(Val: 8, DL, VT: MVT::i64));
18109 SDValue UABDHigh8 = DAG.getNode(Opcode: IsZExt ? ISD::ABDU : ISD::ABDS, DL, VT: MVT::v8i8,
18110 N1: UABDHigh8Op0, N2: UABDHigh8Op1);
18111 SDValue UABDL = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::v8i16, Operand: UABDHigh8);
18112
18113 // Second, create the node pattern of UABAL.
18114 SDValue UABDLo8Op0 =
18115 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: EXT0->getOperand(Num: 0),
18116 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
18117 SDValue UABDLo8Op1 =
18118 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: EXT1->getOperand(Num: 0),
18119 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
18120 SDValue UABDLo8 = DAG.getNode(Opcode: IsZExt ? ISD::ABDU : ISD::ABDS, DL, VT: MVT::v8i8,
18121 N1: UABDLo8Op0, N2: UABDLo8Op1);
18122 SDValue ZExtUABD = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::v8i16, Operand: UABDLo8);
18123 SDValue UABAL = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::v8i16, N1: UABDL, N2: ZExtUABD);
18124
18125 // Third, create the node of UADDLP.
18126 SDValue UADDLP = DAG.getNode(Opcode: AArch64ISD::UADDLP, DL, VT: MVT::v4i32, Operand: UABAL);
18127
18128 // Fourth, create the node of VECREDUCE_ADD.
18129 return DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: MVT::i32, Operand: UADDLP);
18130}
18131
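// Combine an active-lane-mask node whose only two uses extract its low and
// high halves into a single two-result whilelo intrinsic (SVE2p1, or SME2 in
// streaming mode). A sketch of the shape (illustrative): a <vscale x 16 x i1>
// mask extracted at indices 0 and 8 becomes the two <vscale x 8 x i1> results
// of llvm.aarch64.sve.whilelo.x2.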
18132static SDValue
18133performActiveLaneMaskCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18134 const AArch64Subtarget *ST) {
18135 if (DCI.isBeforeLegalize())
18136 return SDValue();
18137
18138 if (SDValue While = optimizeIncrementingWhile(N, DAG&: DCI.DAG, /*IsSigned=*/false,
18139 /*IsEqual=*/false))
18140 return While;
18141
18142 if (!N->getValueType(ResNo: 0).isScalableVector() ||
18143 (!ST->hasSVE2p1() && !(ST->hasSME2() && ST->isStreaming())))
18144 return SDValue();
18145
18146 if (!N->hasNUsesOfValue(NUses: 2, Value: 0))
18147 return SDValue();
18148
18149 const uint64_t HalfSize = N->getValueType(ResNo: 0).getVectorMinNumElements() / 2;
18150 if (HalfSize < 2)
18151 return SDValue();
18152
18153 auto It = N->user_begin();
18154 SDNode *Lo = *It++;
18155 SDNode *Hi = *It;
18156
18157 if (Lo->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18158 Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR)
18159 return SDValue();
18160
18161 uint64_t OffLo = Lo->getConstantOperandVal(Num: 1);
18162 uint64_t OffHi = Hi->getConstantOperandVal(Num: 1);
18163
18164 if (OffLo > OffHi) {
18165 std::swap(a&: Lo, b&: Hi);
18166 std::swap(a&: OffLo, b&: OffHi);
18167 }
18168
18169 if (OffLo != 0 || OffHi != HalfSize)
18170 return SDValue();
18171
18172 EVT HalfVec = Lo->getValueType(ResNo: 0);
18173 if (HalfVec != Hi->getValueType(ResNo: 0) ||
18174 HalfVec.getVectorElementCount() != ElementCount::getScalable(MinVal: HalfSize))
18175 return SDValue();
18176
18177 SelectionDAG &DAG = DCI.DAG;
18178 SDLoc DL(N);
18179 SDValue ID =
18180 DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_whilelo_x2, DL, VT: MVT::i64);
18181 SDValue Idx = N->getOperand(Num: 0);
18182 SDValue TC = N->getOperand(Num: 1);
18183 if (Idx.getValueType() != MVT::i64) {
18184 Idx = DAG.getZExtOrTrunc(Op: Idx, DL, VT: MVT::i64);
18185 TC = DAG.getZExtOrTrunc(Op: TC, DL, VT: MVT::i64);
18186 }
18187 auto R =
18188 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL,
18189 ResultTys: {Lo->getValueType(ResNo: 0), Hi->getValueType(ResNo: 0)}, Ops: {ID, Idx, TC});
18190
18191 DCI.CombineTo(N: Lo, Res: R.getValue(R: 0));
18192 DCI.CombineTo(N: Hi, Res: R.getValue(R: 1));
18193
18194 return SDValue(N, 0);
18195}
18196
// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce:
//   vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
//   vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
// If we have vectors larger than v16i8 we extract v16i8 subvectors, follow the
// same steps as above to get DOT instructions, concatenate them, and generate
// vecreduce.add(concat_vector(DOT, DOT2, ..)).
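// For example (illustrative): with <32 x i8> inputs, two v16i8 DOTs are built,
// their v4i32 accumulators are concatenated into a v8i32, and a single
// vecreduce.add of the concatenation produces the i32 result.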
18203static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
18204 const AArch64Subtarget *ST) {
18205 if (!ST->isNeonAvailable())
18206 return SDValue();
18207
18208 if (!ST->hasDotProd())
18209 return performVecReduceAddCombineWithUADDLP(N, DAG);
18210
18211 SDValue Op0 = N->getOperand(Num: 0);
18212 if (N->getValueType(ResNo: 0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
18213 Op0.getValueType().getVectorElementType() != MVT::i32)
18214 return SDValue();
18215
18216 unsigned ExtOpcode = Op0.getOpcode();
18217 SDValue A = Op0;
18218 SDValue B;
18219 unsigned DotOpcode;
18220 if (ExtOpcode == ISD::MUL) {
18221 A = Op0.getOperand(i: 0);
18222 B = Op0.getOperand(i: 1);
18223 if (A.getOperand(i: 0).getValueType() != B.getOperand(i: 0).getValueType())
18224 return SDValue();
18225 auto OpCodeA = A.getOpcode();
18226 if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
18227 return SDValue();
18228
18229 auto OpCodeB = B.getOpcode();
18230 if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
18231 return SDValue();
18232
18233 if (OpCodeA == OpCodeB) {
18234 DotOpcode =
18235 OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
18236 } else {
      // Check USDOT support.
18238 if (!ST->hasMatMulInt8())
18239 return SDValue();
18240 DotOpcode = AArch64ISD::USDOT;
18241 if (OpCodeA == ISD::SIGN_EXTEND)
18242 std::swap(a&: A, b&: B);
18243 }
18244 } else if (ExtOpcode == ISD::ZERO_EXTEND) {
18245 DotOpcode = AArch64ISD::UDOT;
18246 } else if (ExtOpcode == ISD::SIGN_EXTEND) {
18247 DotOpcode = AArch64ISD::SDOT;
18248 } else {
18249 return SDValue();
18250 }
18251
18252 EVT Op0VT = A.getOperand(i: 0).getValueType();
18253 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
18254 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
18255 if (!IsValidElementCount || !IsValidSize)
18256 return SDValue();
18257
18258 SDLoc DL(Op0);
18259 // For non-mla reductions B can be set to 1. For MLA we take the operand of
18260 // the extend B.
18261 if (!B)
18262 B = DAG.getConstant(Val: 1, DL, VT: Op0VT);
18263 else
18264 B = B.getOperand(i: 0);
18265
18266 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
18267 unsigned NumOfVecReduce;
18268 EVT TargetType;
18269 if (IsMultipleOf16) {
18270 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
18271 TargetType = MVT::v4i32;
18272 } else {
18273 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
18274 TargetType = MVT::v2i32;
18275 }
18276 // Handle the case where we need to generate only one Dot operation.
18277 if (NumOfVecReduce == 1) {
18278 SDValue Zeros = DAG.getConstant(Val: 0, DL, VT: TargetType);
18279 SDValue Dot = DAG.getNode(Opcode: DotOpcode, DL, VT: Zeros.getValueType(), N1: Zeros,
18280 N2: A.getOperand(i: 0), N3: B);
18281 return DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: N->getValueType(ResNo: 0), Operand: Dot);
18282 }
18283 // Generate Dot instructions that are multiple of 16.
18284 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
18285 SmallVector<SDValue, 4> SDotVec16;
18286 unsigned I = 0;
18287 for (; I < VecReduce16Num; I += 1) {
18288 SDValue Zeros = DAG.getConstant(Val: 0, DL, VT: MVT::v4i32);
18289 SDValue Op0 =
18290 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v16i8, N1: A.getOperand(i: 0),
18291 N2: DAG.getConstant(Val: I * 16, DL, VT: MVT::i64));
18292 SDValue Op1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v16i8, N1: B,
18293 N2: DAG.getConstant(Val: I * 16, DL, VT: MVT::i64));
18294 SDValue Dot =
18295 DAG.getNode(Opcode: DotOpcode, DL, VT: Zeros.getValueType(), N1: Zeros, N2: Op0, N3: Op1);
18296 SDotVec16.push_back(Elt: Dot);
18297 }
18298 // Concatenate dot operations.
18299 EVT SDot16EVT =
18300 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: 4 * VecReduce16Num);
18301 SDValue ConcatSDot16 =
18302 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: SDot16EVT, Ops: SDotVec16);
18303 SDValue VecReduceAdd16 =
18304 DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: N->getValueType(ResNo: 0), Operand: ConcatSDot16);
18305 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
18306 if (VecReduce8Num == 0)
18307 return VecReduceAdd16;
18308
18309 // Generate the remainder Dot operation that is multiple of 8.
18310 SDValue Zeros = DAG.getConstant(Val: 0, DL, VT: MVT::v2i32);
18311 SDValue Vec8Op0 =
18312 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: A.getOperand(i: 0),
18313 N2: DAG.getConstant(Val: I * 16, DL, VT: MVT::i64));
18314 SDValue Vec8Op1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: B,
18315 N2: DAG.getConstant(Val: I * 16, DL, VT: MVT::i64));
18316 SDValue Dot =
18317 DAG.getNode(Opcode: DotOpcode, DL, VT: Zeros.getValueType(), N1: Zeros, N2: Vec8Op0, N3: Vec8Op1);
18318 SDValue VecReduceAdd8 =
18319 DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: N->getValueType(ResNo: 0), Operand: Dot);
18320 return DAG.getNode(Opcode: ISD::ADD, DL, VT: N->getValueType(ResNo: 0), N1: VecReduceAdd16,
18321 N2: VecReduceAdd8);
18322}
18323
18324// Given an (integer) vecreduce, we know the order of the inputs does not
18325// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
18326// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
18327// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
18328static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
18329 auto DetectAddExtract = [&](SDValue A) {
18330 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
18331 // UADDLP(x) if found.
18332 assert(A.getOpcode() == ISD::ADD);
18333 EVT VT = A.getValueType();
18334 SDValue Op0 = A.getOperand(i: 0);
18335 SDValue Op1 = A.getOperand(i: 1);
18336 if (Op0.getOpcode() != Op1.getOpcode() ||
18337 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
18338 Op0.getOpcode() != ISD::SIGN_EXTEND))
18339 return SDValue();
18340 SDValue Ext0 = Op0.getOperand(i: 0);
18341 SDValue Ext1 = Op1.getOperand(i: 0);
18342 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18343 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18344 Ext0.getOperand(i: 0) != Ext1.getOperand(i: 0))
18345 return SDValue();
    // Check that the type is twice the add's type, and that the extracts are
    // from the upper/lower halves of the same source.
18348 if (Ext0.getOperand(i: 0).getValueType().getVectorNumElements() !=
18349 VT.getVectorNumElements() * 2)
18350 return SDValue();
18351 if ((Ext0.getConstantOperandVal(i: 1) != 0 ||
18352 Ext1.getConstantOperandVal(i: 1) != VT.getVectorNumElements()) &&
18353 (Ext1.getConstantOperandVal(i: 1) != 0 ||
18354 Ext0.getConstantOperandVal(i: 1) != VT.getVectorNumElements()))
18355 return SDValue();
18356 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
18357 : AArch64ISD::SADDLP;
18358 return DAG.getNode(Opcode, DL: SDLoc(A), VT, Operand: Ext0.getOperand(i: 0));
18359 };
18360
18361 if (SDValue R = DetectAddExtract(A))
18362 return R;
18363
18364 if (A.getOperand(i: 0).getOpcode() == ISD::ADD && A.getOperand(i: 0).hasOneUse())
18365 if (SDValue R = performUADDVAddCombine(A: A.getOperand(i: 0), DAG))
18366 return DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(A), VT: A.getValueType(), N1: R,
18367 N2: A.getOperand(i: 1));
18368 if (A.getOperand(i: 1).getOpcode() == ISD::ADD && A.getOperand(i: 1).hasOneUse())
18369 if (SDValue R = performUADDVAddCombine(A: A.getOperand(i: 1), DAG))
18370 return DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(A), VT: A.getValueType(), N1: R,
18371 N2: A.getOperand(i: 0));
18372 return SDValue();
18373}
18374
18375// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
18376// UADDLV(concat), where the concat represents the 64-bit zext sources.
18377static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
18378 // Look for add(zext(64-bit source), zext(64-bit source)), returning
18379 // UADDLV(concat(zext, zext)) if found.
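  // For example, UADDV(add(zext(a : v8i8 to v8i16), zext(b : v8i8 to v8i16)))
  // becomes NVCAST(UADDLV(concat(a, b) : v16i8)), with the widened sum of all
  // sixteen bytes left in the low lane.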
18380 assert(A.getOpcode() == ISD::ADD);
18381 EVT VT = A.getValueType();
18382 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
18383 return SDValue();
18384 SDValue Op0 = A.getOperand(i: 0);
18385 SDValue Op1 = A.getOperand(i: 1);
18386 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
18387 return SDValue();
18388 SDValue Ext0 = Op0.getOperand(i: 0);
18389 SDValue Ext1 = Op1.getOperand(i: 0);
18390 EVT ExtVT0 = Ext0.getValueType();
18391 EVT ExtVT1 = Ext1.getValueType();
18392 // Check zext VTs are the same and 64-bit length.
18393 if (ExtVT0 != ExtVT1 ||
18394 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
18395 return SDValue();
18396 // Get VT for concat of zext sources.
18397 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
18398 SDValue Concat =
18399 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(A), VT: PairVT, N1: Ext0, N2: Ext1);
18400
18401 switch (VT.getSimpleVT().SimpleTy) {
18402 case MVT::v2i64:
18403 case MVT::v4i32:
18404 return DAG.getNode(Opcode: AArch64ISD::UADDLV, DL: SDLoc(A), VT, Operand: Concat);
18405 case MVT::v8i16: {
18406 SDValue Uaddlv =
18407 DAG.getNode(Opcode: AArch64ISD::UADDLV, DL: SDLoc(A), VT: MVT::v4i32, Operand: Concat);
18408 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: SDLoc(A), VT: MVT::v8i16, Operand: Uaddlv);
18409 }
18410 default:
18411 llvm_unreachable("Unhandled vector type");
18412 }
18413}
18414
18415static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
18416 SDValue A = N->getOperand(Num: 0);
18417 if (A.getOpcode() == ISD::ADD) {
18418 if (SDValue R = performUADDVAddCombine(A, DAG))
18419 return DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: R);
18420 else if (SDValue R = performUADDVZextCombine(A, DAG))
18421 return R;
18422 }
18423 return SDValue();
18424}
18425
18426static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
18427 TargetLowering::DAGCombinerInfo &DCI,
18428 const AArch64Subtarget *Subtarget) {
18429 if (DCI.isBeforeLegalizeOps())
18430 return SDValue();
18431
18432 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
18433}
18434
18435SDValue
18436AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
18437 SelectionDAG &DAG,
18438 SmallVectorImpl<SDNode *> &Created) const {
18439 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
18440 if (isIntDivCheap(VT: N->getValueType(ResNo: 0), Attr))
18441 return SDValue(N, 0); // Lower SDIV as SDIV
18442
18443 EVT VT = N->getValueType(ResNo: 0);
18444
18445 // If SVE is available, we can generate
  // sdiv(x, y) -> ptrue + asrd,        where 'y' is a positive pow-2 divisor.
  // sdiv(x, y) -> ptrue + asrd + subr, where 'y' is a negative pow-2 divisor.
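  // For example, sdiv(x, 8) becomes an ASRD by 3 under an all-true predicate,
  // and sdiv(x, -8) additionally negates that result with a subr.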
18448 if (VT.isVector() && Subtarget->isSVEorStreamingSVEAvailable())
18449 return SDValue(N, 0);
18450
18451 // fold (sdiv X, pow2)
18452 if ((VT != MVT::i32 && VT != MVT::i64) ||
18453 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
18454 return SDValue();
18455
  // If the divisor is 2 or -2, the default expansion is better. It will add
  // (N->getOperand(0) >> (BitWidth - 1)) to it before shifting right.
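  // For example, for an i32 divide by 2 the generic expansion is roughly
  //   add w8, w0, w0, lsr #31
  //   asr w0, w8, #1
  // (illustrative registers), which is already optimal.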
18458 if (Divisor == 2 ||
18459 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
18460 return SDValue();
18461
18462 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
18463}
18464
18465SDValue
18466AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
18467 SelectionDAG &DAG,
18468 SmallVectorImpl<SDNode *> &Created) const {
18469 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
18470 if (isIntDivCheap(VT: N->getValueType(ResNo: 0), Attr))
18471 return SDValue(N, 0); // Lower SREM as SREM
18472
18473 EVT VT = N->getValueType(ResNo: 0);
18474
  // For scalable and fixed types, mark them as cheap so we can handle them
  // much later. This allows us to handle larger-than-legal types.
18477 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
18478 return SDValue(N, 0);
18479
18480 // fold (srem X, pow2)
18481 if ((VT != MVT::i32 && VT != MVT::i64) ||
18482 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
18483 return SDValue();
18484
18485 unsigned Lg2 = Divisor.countr_zero();
18486 if (Lg2 == 0)
18487 return SDValue();
18488
18489 SDLoc DL(N);
18490 SDValue N0 = N->getOperand(Num: 0);
18491 SDValue Pow2MinusOne = DAG.getConstant(Val: (1ULL << Lg2) - 1, DL, VT);
18492 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
18493 SDValue CCVal, CSNeg;
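  // Rough sketch of what is built below (conditions/registers illustrative):
  //   Lg2 == 1: srem(x, 2) -> csneg(x & 1, x & 1, cc), where cc tests x >= 0.
  //   Lg2  > 1: srem(x, 8) -> csneg(x & 7, (0 - x) & 7, mi), reusing the flags
  //             from the subs that computes (0 - x).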
18494 if (Lg2 == 1) {
18495 SDValue Cmp = getAArch64Cmp(LHS: N0, RHS: Zero, CC: ISD::SETGE, AArch64cc&: CCVal, DAG, DL);
18496 SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: N0, N2: Pow2MinusOne);
18497 CSNeg = DAG.getNode(Opcode: AArch64ISD::CSNEG, DL, VT, N1: And, N2: And, N3: CCVal, N4: Cmp);
18498
18499 Created.push_back(Elt: Cmp.getNode());
18500 Created.push_back(Elt: And.getNode());
18501 } else {
18502 SDValue CCVal = DAG.getConstant(Val: AArch64CC::MI, DL, VT: MVT_CC);
18503 SDVTList VTs = DAG.getVTList(VT1: VT, VT2: MVT::i32);
18504
18505 SDValue Negs = DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs, N1: Zero, N2: N0);
18506 SDValue AndPos = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: N0, N2: Pow2MinusOne);
18507 SDValue AndNeg = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Negs, N2: Pow2MinusOne);
18508 CSNeg = DAG.getNode(Opcode: AArch64ISD::CSNEG, DL, VT, N1: AndPos, N2: AndNeg, N3: CCVal,
18509 N4: Negs.getValue(R: 1));
18510
18511 Created.push_back(Elt: Negs.getNode());
18512 Created.push_back(Elt: AndPos.getNode());
18513 Created.push_back(Elt: AndNeg.getNode());
18514 }
18515
18516 return CSNeg;
18517}
18518
18519static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
18520 switch(getIntrinsicID(N: S.getNode())) {
18521 default:
18522 break;
18523 case Intrinsic::aarch64_sve_cntb:
18524 return 8;
18525 case Intrinsic::aarch64_sve_cnth:
18526 return 16;
18527 case Intrinsic::aarch64_sve_cntw:
18528 return 32;
18529 case Intrinsic::aarch64_sve_cntd:
18530 return 64;
18531 }
18532 return {};
18533}
18534
18535/// Calculates what the pre-extend type is, based on the extension
18536/// operation node provided by \p Extend.
18537///
18538/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
18539/// pre-extend type is pulled directly from the operand, while other extend
18540/// operations need a bit more inspection to get this information.
18541///
18542/// \param Extend The SDNode from the DAG that represents the extend operation
18543///
18544/// \returns The type representing the \p Extend source type, or \p MVT::Other
18545/// if no valid type can be determined
18546static EVT calculatePreExtendType(SDValue Extend) {
18547 switch (Extend.getOpcode()) {
18548 case ISD::SIGN_EXTEND:
18549 case ISD::ZERO_EXTEND:
18550 case ISD::ANY_EXTEND:
18551 return Extend.getOperand(i: 0).getValueType();
18552 case ISD::AssertSext:
18553 case ISD::AssertZext:
18554 case ISD::SIGN_EXTEND_INREG: {
18555 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Val: Extend.getOperand(i: 1));
18556 if (!TypeNode)
18557 return MVT::Other;
18558 return TypeNode->getVT();
18559 }
18560 case ISD::AND: {
18561 ConstantSDNode *Constant =
18562 dyn_cast<ConstantSDNode>(Val: Extend.getOperand(i: 1).getNode());
18563 if (!Constant)
18564 return MVT::Other;
18565
18566 uint32_t Mask = Constant->getZExtValue();
18567
18568 if (Mask == UCHAR_MAX)
18569 return MVT::i8;
18570 else if (Mask == USHRT_MAX)
18571 return MVT::i16;
18572 else if (Mask == UINT_MAX)
18573 return MVT::i32;
18574
18575 return MVT::Other;
18576 }
18577 default:
18578 return MVT::Other;
18579 }
18580}
18581
18582/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
18583/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
18584/// SExt/ZExt rather than the scalar SExt/ZExt
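/// For example, a v4i32 build_vector whose operands are all sext's from i16,
///   build_vector(sext(a), sext(b), sext(c), sext(d)),
/// is rewritten as sext(build_vector(a, b, c, d) : v4i16) to v4i32.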
18585static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
18586 EVT VT = BV.getValueType();
18587 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
18588 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
18589 return SDValue();
18590
18591 // Use the first item in the buildvector/shuffle to get the size of the
18592 // extend, and make sure it looks valid.
18593 SDValue Extend = BV->getOperand(Num: 0);
18594 unsigned ExtendOpcode = Extend.getOpcode();
18595 bool IsAnyExt = ExtendOpcode == ISD::ANY_EXTEND;
18596 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
18597 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
18598 ExtendOpcode == ISD::AssertSext;
18599 if (!IsAnyExt && !IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
18600 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
18601 return SDValue();
  // Shuffle inputs are vectors, limit to SIGN_EXTEND and ZERO_EXTEND to ensure
  // calculatePreExtendType will work without issue.
18604 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
18605 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
18606 return SDValue();
18607
18608 // Restrict valid pre-extend data type
18609 EVT PreExtendType = calculatePreExtendType(Extend);
18610 if (PreExtendType == MVT::Other ||
18611 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
18612 return SDValue();
18613
18614 // Make sure all other operands are equally extended.
18615 bool SeenZExtOrSExt = !IsAnyExt;
18616 for (SDValue Op : drop_begin(RangeOrContainer: BV->ops())) {
18617 if (Op.isUndef())
18618 continue;
18619
18620 if (calculatePreExtendType(Extend: Op) != PreExtendType)
18621 return SDValue();
18622
18623 unsigned Opc = Op.getOpcode();
18624 if (Opc == ISD::ANY_EXTEND)
18625 continue;
18626
18627 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
18628 Opc == ISD::AssertSext;
18629
18630 if (SeenZExtOrSExt && OpcIsSExt != IsSExt)
18631 return SDValue();
18632
18633 IsSExt = OpcIsSExt;
18634 SeenZExtOrSExt = true;
18635 }
18636
18637 SDValue NBV;
18638 SDLoc DL(BV);
18639 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
18640 EVT PreExtendVT = VT.changeVectorElementType(EltVT: PreExtendType);
18641 EVT PreExtendLegalType =
18642 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
18643 SmallVector<SDValue, 8> NewOps;
18644 for (SDValue Op : BV->ops())
18645 NewOps.push_back(Elt: Op.isUndef() ? DAG.getUNDEF(VT: PreExtendLegalType)
18646 : DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 0), DL,
18647 VT: PreExtendLegalType));
18648 NBV = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: PreExtendVT, Ops: NewOps);
18649 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
18650 EVT PreExtendVT = VT.changeVectorElementType(EltVT: PreExtendType.getScalarType());
18651 NBV = DAG.getVectorShuffle(VT: PreExtendVT, dl: DL, N1: BV.getOperand(i: 0).getOperand(i: 0),
18652 N2: BV.getOperand(i: 1).isUndef()
18653 ? DAG.getUNDEF(VT: PreExtendVT)
18654 : BV.getOperand(i: 1).getOperand(i: 0),
18655 Mask: cast<ShuffleVectorSDNode>(Val&: BV)->getMask());
18656 }
18657 unsigned ExtOpc = !SeenZExtOrSExt
18658 ? ISD::ANY_EXTEND
18659 : (IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND);
18660 return DAG.getNode(Opcode: ExtOpc, DL, VT, Operand: NBV);
18661}
18662
18663/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
18664/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
18665static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
18666 // If the value type isn't a vector, none of the operands are going to be dups
18667 EVT VT = Mul->getValueType(ResNo: 0);
18668 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
18669 return SDValue();
18670
18671 SDValue Op0 = performBuildShuffleExtendCombine(BV: Mul->getOperand(Num: 0), DAG);
18672 SDValue Op1 = performBuildShuffleExtendCombine(BV: Mul->getOperand(Num: 1), DAG);
18673
  // Neither operand has been changed; don't make any further changes.
18675 if (!Op0 && !Op1)
18676 return SDValue();
18677
18678 SDLoc DL(Mul);
18679 return DAG.getNode(Opcode: Mul->getOpcode(), DL, VT, N1: Op0 ? Op0 : Mul->getOperand(Num: 0),
18680 N2: Op1 ? Op1 : Mul->getOperand(Num: 1));
18681}
18682
18683// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
18684// Same for other types with equivalent constants.
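// Each wide lane holds two narrow lanes; the srl/and isolates the sign bit of
// every narrow lane, and multiplying by the all-ones half-mask broadcasts that
// bit across its half, which is exactly a compare-less-than-zero (CMLTz) on
// the narrower element type.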
18685static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
18686 EVT VT = N->getValueType(ResNo: 0);
18687 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
18688 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
18689 return SDValue();
18690 if (N->getOperand(Num: 0).getOpcode() != ISD::AND ||
18691 N->getOperand(Num: 0).getOperand(i: 0).getOpcode() != ISD::SRL)
18692 return SDValue();
18693
18694 SDValue And = N->getOperand(Num: 0);
18695 SDValue Srl = And.getOperand(i: 0);
18696
18697 APInt V1, V2, V3;
18698 if (!ISD::isConstantSplatVector(N: N->getOperand(Num: 1).getNode(), SplatValue&: V1) ||
18699 !ISD::isConstantSplatVector(N: And.getOperand(i: 1).getNode(), SplatValue&: V2) ||
18700 !ISD::isConstantSplatVector(N: Srl.getOperand(i: 1).getNode(), SplatValue&: V3))
18701 return SDValue();
18702
18703 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
18704 if (!V1.isMask(numBits: HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
18705 V3 != (HalfSize - 1))
18706 return SDValue();
18707
18708 EVT HalfVT = EVT::getVectorVT(Context&: *DAG.getContext(),
18709 VT: EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: HalfSize),
18710 EC: VT.getVectorElementCount() * 2);
18711
18712 SDLoc DL(N);
18713 SDValue In = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: HalfVT, Operand: Srl.getOperand(i: 0));
18714 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: In.getValueType());
18715 SDValue CM = DAG.getSetCC(DL, VT: HalfVT, LHS: Zero, RHS: In, Cond: ISD::SETGT);
18716 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: CM);
18717}
18718
18719// Transform vector add(zext i8 to i32, zext i8 to i32)
18720// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
// This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
// extends.
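// Doing the add at the narrower width is safe: the sum of two values extended
// from i8 always fits in i16 (likewise i16 sums fit in i32), so sign-extending
// the narrow result reproduces the wide result. For mul, the operands' own
// extend opcode is reused for the final extend instead.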
18723static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) {
18724 EVT VT = N->getValueType(ResNo: 0);
18725 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
18726 (N->getOperand(Num: 0).getOpcode() != ISD::ZERO_EXTEND &&
18727 N->getOperand(Num: 0).getOpcode() != ISD::SIGN_EXTEND) ||
18728 (N->getOperand(Num: 1).getOpcode() != ISD::ZERO_EXTEND &&
18729 N->getOperand(Num: 1).getOpcode() != ISD::SIGN_EXTEND) ||
18730 N->getOperand(Num: 0).getOperand(i: 0).getValueType() !=
18731 N->getOperand(Num: 1).getOperand(i: 0).getValueType())
18732 return SDValue();
18733
18734 if (N->getOpcode() == ISD::MUL &&
18735 N->getOperand(Num: 0).getOpcode() != N->getOperand(Num: 1).getOpcode())
18736 return SDValue();
18737
18738 SDValue N0 = N->getOperand(Num: 0).getOperand(i: 0);
18739 SDValue N1 = N->getOperand(Num: 1).getOperand(i: 0);
18740 EVT InVT = N0.getValueType();
18741
18742 EVT S1 = InVT.getScalarType();
18743 EVT S2 = VT.getScalarType();
18744 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
18745 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
18746 SDLoc DL(N);
18747 EVT HalfVT = EVT::getVectorVT(Context&: *DAG.getContext(),
18748 VT: S2.getHalfSizedIntegerVT(Context&: *DAG.getContext()),
18749 EC: VT.getVectorElementCount());
18750 SDValue NewN0 = DAG.getNode(Opcode: N->getOperand(Num: 0).getOpcode(), DL, VT: HalfVT, Operand: N0);
18751 SDValue NewN1 = DAG.getNode(Opcode: N->getOperand(Num: 1).getOpcode(), DL, VT: HalfVT, Operand: N1);
18752 SDValue NewOp = DAG.getNode(Opcode: N->getOpcode(), DL, VT: HalfVT, N1: NewN0, N2: NewN1);
18753 return DAG.getNode(Opcode: N->getOpcode() == ISD::MUL ? N->getOperand(Num: 0).getOpcode()
18754 : (unsigned)ISD::SIGN_EXTEND,
18755 DL, VT, Operand: NewOp);
18756 }
18757 return SDValue();
18758}
18759
18760static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
18761 TargetLowering::DAGCombinerInfo &DCI,
18762 const AArch64Subtarget *Subtarget) {
18763
18764 if (SDValue Ext = performMulVectorExtendCombine(Mul: N, DAG))
18765 return Ext;
18766 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
18767 return Ext;
18768 if (SDValue Ext = performVectorExtCombine(N, DAG))
18769 return Ext;
18770
18771 if (DCI.isBeforeLegalizeOps())
18772 return SDValue();
18773
18774 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
18775 // and in MachineCombiner pass, add+mul will be combined into madd.
18776 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
18777 SDLoc DL(N);
18778 EVT VT = N->getValueType(ResNo: 0);
18779 SDValue N0 = N->getOperand(Num: 0);
18780 SDValue N1 = N->getOperand(Num: 1);
18781 SDValue MulOper;
18782 unsigned AddSubOpc;
18783
18784 auto IsAddSubWith1 = [&](SDValue V) -> bool {
18785 AddSubOpc = V->getOpcode();
18786 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
18787 SDValue Opnd = V->getOperand(Num: 1);
18788 MulOper = V->getOperand(Num: 0);
18789 if (AddSubOpc == ISD::SUB)
18790 std::swap(a&: Opnd, b&: MulOper);
18791 if (auto C = dyn_cast<ConstantSDNode>(Val&: Opnd))
18792 return C->isOne();
18793 }
18794 return false;
18795 };
18796
18797 if (IsAddSubWith1(N0)) {
18798 SDValue MulVal = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1, N2: MulOper);
18799 return DAG.getNode(Opcode: AddSubOpc, DL, VT, N1, N2: MulVal);
18800 }
18801
18802 if (IsAddSubWith1(N1)) {
18803 SDValue MulVal = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: N0, N2: MulOper);
18804 return DAG.getNode(Opcode: AddSubOpc, DL, VT, N1: N0, N2: MulVal);
18805 }
18806
18807 // The below optimizations require a constant RHS.
18808 if (!isa<ConstantSDNode>(Val: N1))
18809 return SDValue();
18810
18811 ConstantSDNode *C = cast<ConstantSDNode>(Val&: N1);
18812 const APInt &ConstValue = C->getAPIntValue();
18813
  // Allow the scaling to be folded into the `cnt` instruction by preventing
  // the scaling from being obscured here. This makes it easier to pattern
  // match.
18816 if (IsSVECntIntrinsic(S: N0) ||
18817 (N0->getOpcode() == ISD::TRUNCATE &&
18818 (IsSVECntIntrinsic(S: N0->getOperand(Num: 0)))))
18819 if (ConstValue.sge(RHS: 1) && ConstValue.sle(RHS: 16))
18820 return SDValue();
18821
18822 // Multiplication of a power of two plus/minus one can be done more
  // cheaply as shift+add/sub. For now, this is done unconditionally. If
18824 // future CPUs have a cheaper MADD instruction, this may need to be
18825 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
18826 // 64-bit is 5 cycles, so this is always a win.
18827 // More aggressively, some multiplications N0 * C can be lowered to
18828 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
18829 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
18830 // TODO: lower more cases.
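  // For example, when shifted operands are cheap (ALULSLFast), x * 45 with
  // 45 = (1+4)*(1+8) becomes
  //   MV = (x << 2) + x;  result = (MV << 3) + MV
  // i.e. two shifted adds instead of materializing 45 and using mul/madd.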
18831
18832 // TrailingZeroes is used to test if the mul can be lowered to
18833 // shift+add+shift.
18834 unsigned TrailingZeroes = ConstValue.countr_zero();
18835 if (TrailingZeroes) {
    // Conservatively do not lower to shift+add+shift if the mul might be
    // folded into smull or umull.
18838 if (N0->hasOneUse() && (isSignExtended(N: N0, DAG) ||
18839 isZeroExtended(N: N0, DAG)))
18840 return SDValue();
18841 // Conservatively do not lower to shift+add+shift if the mul might be
18842 // folded into madd or msub.
18843 if (N->hasOneUse() && (N->user_begin()->getOpcode() == ISD::ADD ||
18844 N->user_begin()->getOpcode() == ISD::SUB))
18845 return SDValue();
18846 }
18847 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
18848 // and shift+add+shift.
18849 APInt ShiftedConstValue = ConstValue.ashr(ShiftAmt: TrailingZeroes);
18850 unsigned ShiftAmt;
18851
18852 auto Shl = [&](SDValue N0, unsigned N1) {
18853 if (!N0.getNode())
18854 return SDValue();
18855 // If shift causes overflow, ignore this combine.
18856 if (N1 >= N0.getValueSizeInBits())
18857 return SDValue();
18858 SDValue RHS = DAG.getConstant(Val: N1, DL, VT: MVT::i64);
18859 return DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: N0, N2: RHS);
18860 };
18861 auto Add = [&](SDValue N0, SDValue N1) {
18862 if (!N0.getNode() || !N1.getNode())
18863 return SDValue();
18864 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: N0, N2: N1);
18865 };
18866 auto Sub = [&](SDValue N0, SDValue N1) {
18867 if (!N0.getNode() || !N1.getNode())
18868 return SDValue();
18869 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: N0, N2: N1);
18870 };
18871 auto Negate = [&](SDValue N) {
18872 if (!N0.getNode())
18873 return SDValue();
18874 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
18875 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero, N2: N);
18876 };
18877
  // Can the const C be decomposed into (1 + 2^M) * (1 + 2^N)? E.g.,
  // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1)
  // because the (2^N - 1) can't be executed via a single instruction.
18881 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
18882 unsigned BitWidth = C.getBitWidth();
18883 for (unsigned i = 1; i < BitWidth / 2; i++) {
18884 APInt Rem;
18885 APInt X(BitWidth, (1 << i) + 1);
18886 APInt::sdivrem(LHS: C, RHS: X, Quotient&: N, Remainder&: Rem);
18887 APInt NVMinus1 = N - 1;
18888 if (Rem == 0 && NVMinus1.isPowerOf2()) {
18889 M = X;
18890 return true;
18891 }
18892 }
18893 return false;
18894 };
18895
  // Can the const C be decomposed into (2^M + 1) * 2^N + 1? E.g.,
  // C = 11 is equal to (1+4)*2+1; we don't decompose it into (1+2)*4-1
  // because the (2^N - 1) can't be executed via a single instruction.
18899 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
18900 APInt CVMinus1 = C - 1;
18901 if (CVMinus1.isNegative())
18902 return false;
18903 unsigned TrailingZeroes = CVMinus1.countr_zero();
18904 APInt SCVMinus1 = CVMinus1.ashr(ShiftAmt: TrailingZeroes) - 1;
18905 if (SCVMinus1.isPowerOf2()) {
18906 unsigned BitWidth = SCVMinus1.getBitWidth();
18907 M = APInt(BitWidth, SCVMinus1.logBase2());
18908 N = APInt(BitWidth, TrailingZeroes);
18909 return true;
18910 }
18911 return false;
18912 };
18913
18914 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
18915 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
18916 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
18917 APInt CVMinus1 = C - 1;
18918 if (CVMinus1.isNegative())
18919 return false;
18920 unsigned TrailingZeroes = CVMinus1.countr_zero();
18921 APInt CVPlus1 = CVMinus1.ashr(ShiftAmt: TrailingZeroes) + 1;
18922 if (CVPlus1.isPowerOf2()) {
18923 unsigned BitWidth = CVPlus1.getBitWidth();
18924 M = APInt(BitWidth, CVPlus1.logBase2());
18925 N = APInt(BitWidth, TrailingZeroes);
18926 return true;
18927 }
18928 return false;
18929 };
18930
18931 if (ConstValue.isNonNegative()) {
    // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
    // (mul x, 2^N - 1) => (sub (shl x, N), x)
    // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
    // (mul x, (2^M + 1) * (2^N + 1))
    //     => MV = (add (shl x, M), x); (add (shl MV, N), MV)
    // (mul x, (2^M + 1) * 2^N + 1)
    //     => MV = (add (shl x, M), x); (add (shl MV, N), x)
    // (mul x, 1 - (1 - 2^M) * 2^N)
    //     => MV = (sub x, (shl x, M)); (sub x, (shl MV, N))
18941 APInt SCVMinus1 = ShiftedConstValue - 1;
18942 APInt SCVPlus1 = ShiftedConstValue + 1;
18943 APInt CVPlus1 = ConstValue + 1;
18944 APInt CVM, CVN;
18945 if (SCVMinus1.isPowerOf2()) {
18946 ShiftAmt = SCVMinus1.logBase2();
18947 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
18948 } else if (CVPlus1.isPowerOf2()) {
18949 ShiftAmt = CVPlus1.logBase2();
18950 return Sub(Shl(N0, ShiftAmt), N0);
18951 } else if (SCVPlus1.isPowerOf2()) {
18952 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
18953 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
18954 }
18955 if (Subtarget->hasALULSLFast() &&
18956 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
18957 APInt CVMMinus1 = CVM - 1;
18958 APInt CVNMinus1 = CVN - 1;
18959 unsigned ShiftM1 = CVMMinus1.logBase2();
18960 unsigned ShiftN1 = CVNMinus1.logBase2();
      // ALULSLFast implies that shifts of up to 4 places are fast.
18962 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
18963 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
18964 return Add(Shl(MVal, ShiftN1), MVal);
18965 }
18966 }
18967 if (Subtarget->hasALULSLFast() &&
18968 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
18969 unsigned ShiftM = CVM.getZExtValue();
18970 unsigned ShiftN = CVN.getZExtValue();
      // ALULSLFast implies that shifts of up to 4 places are fast.
18972 if (ShiftM <= 4 && ShiftN <= 4) {
        SDValue MVal = Add(Shl(N0, ShiftM), N0);
        return Add(Shl(MVal, ShiftN), N0);
18975 }
18976 }
18977
18978 if (Subtarget->hasALULSLFast() &&
18979 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
18980 unsigned ShiftM = CVM.getZExtValue();
18981 unsigned ShiftN = CVN.getZExtValue();
      // ALULSLFast implies that shifts of up to 4 places are fast.
18983 if (ShiftM <= 4 && ShiftN <= 4) {
        SDValue MVal = Sub(N0, Shl(N0, ShiftM));
        return Sub(N0, Shl(MVal, ShiftN));
18986 }
18987 }
18988 } else {
18989 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
18990 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
18991 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
18992 APInt SCVPlus1 = -ShiftedConstValue + 1;
18993 APInt CVNegPlus1 = -ConstValue + 1;
18994 APInt CVNegMinus1 = -ConstValue - 1;
18995 if (CVNegPlus1.isPowerOf2()) {
18996 ShiftAmt = CVNegPlus1.logBase2();
18997 return Sub(N0, Shl(N0, ShiftAmt));
18998 } else if (CVNegMinus1.isPowerOf2()) {
18999 ShiftAmt = CVNegMinus1.logBase2();
19000 return Negate(Add(Shl(N0, ShiftAmt), N0));
19001 } else if (SCVPlus1.isPowerOf2()) {
19002 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19003 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
19004 }
19005 }
19006
19007 return SDValue();
19008}
19009
19010static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
19011 SelectionDAG &DAG) {
19012 // Take advantage of vector comparisons producing 0 or -1 in each lane to
19013 // optimize away operation when it's from a constant.
19014 //
19015 // The general transformation is:
19016 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
19017 // AND(VECTOR_CMP(x,y), constant2)
19018 // constant2 = UNARYOP(constant)
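  // For example, sint_to_fp(and(setcc(x, y), splat(1))) can be rewritten as
  // bitcast(and(setcc(x, y), bitcast(splat(1.0)))), because each lane of the
  // compare is either all-zeros or all-ones.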
19019
19020 // Early exit if this isn't a vector operation, the operand of the
19021 // unary operation isn't a bitwise AND, or if the sizes of the operations
19022 // aren't the same.
19023 EVT VT = N->getValueType(ResNo: 0);
19024 if (!VT.isVector() || N->getOperand(Num: 0)->getOpcode() != ISD::AND ||
19025 N->getOperand(Num: 0)->getOperand(Num: 0)->getOpcode() != ISD::SETCC ||
19026 VT.getSizeInBits() != N->getOperand(Num: 0)->getValueType(ResNo: 0).getSizeInBits())
19027 return SDValue();
19028
19029 // Now check that the other operand of the AND is a constant. We could
19030 // make the transformation for non-constant splats as well, but it's unclear
19031 // that would be a benefit as it would not eliminate any operations, just
19032 // perform one more step in scalar code before moving to the vector unit.
19033 if (BuildVectorSDNode *BV =
19034 dyn_cast<BuildVectorSDNode>(Val: N->getOperand(Num: 0)->getOperand(Num: 1))) {
19035 // Bail out if the vector isn't a constant.
19036 if (!BV->isConstant())
19037 return SDValue();
19038
19039 // Everything checks out. Build up the new and improved node.
19040 SDLoc DL(N);
19041 EVT IntVT = BV->getValueType(ResNo: 0);
19042 // Create a new constant of the appropriate type for the transformed
19043 // DAG.
19044 SDValue SourceConst = DAG.getNode(Opcode: N->getOpcode(), DL, VT, Operand: SDValue(BV, 0));
19045 // The AND node needs bitcasts to/from an integer vector type around it.
19046 SDValue MaskConst = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntVT, Operand: SourceConst);
19047 SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT: IntVT,
19048 N1: N->getOperand(Num: 0)->getOperand(Num: 0), N2: MaskConst);
19049 SDValue Res = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewAnd);
19050 return Res;
19051 }
19052
19053 return SDValue();
19054}
19055
/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
/// functions; this can help to reduce the number of fmovs to/from GPRs.
19058static SDValue
19059tryToReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
19060 TargetLowering::DAGCombinerInfo &DCI,
19061 const AArch64Subtarget *Subtarget) {
19062 if (N->isStrictFPOpcode())
19063 return SDValue();
19064
19065 if (DCI.isBeforeLegalizeOps())
19066 return SDValue();
19067
19068 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
19069 (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
19070 return SDValue();
19071
19072 auto isSupportedType = [](EVT VT) {
19073 return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
19074 };
19075
19076 SDValue SrcVal = N->getOperand(Num: 0);
19077 EVT SrcTy = SrcVal.getValueType();
19078 EVT DestTy = N->getValueType(ResNo: 0);
19079
19080 if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
19081 return SDValue();
19082
19083 EVT SrcVecTy;
19084 EVT DestVecTy;
19085 if (DestTy.bitsGT(VT: SrcTy)) {
19086 DestVecTy = getPackedSVEVectorVT(VT: DestTy);
19087 SrcVecTy = DestVecTy.changeVectorElementType(EltVT: SrcTy);
19088 } else {
19089 SrcVecTy = getPackedSVEVectorVT(VT: SrcTy);
19090 DestVecTy = SrcVecTy.changeVectorElementType(EltVT: DestTy);
19091 }
19092
19093 // Ensure the resulting src/dest vector type is legal.
19094 if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
19095 return SDValue();
19096
19097 SDLoc DL(N);
19098 SDValue ZeroIdx = DAG.getVectorIdxConstant(Val: 0, DL);
19099 SDValue Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: SrcVecTy,
19100 N1: DAG.getUNDEF(VT: SrcVecTy), N2: SrcVal, N3: ZeroIdx);
19101 SDValue Convert = DAG.getNode(Opcode: N->getOpcode(), DL, VT: DestVecTy, Operand: Vec);
19102 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: DestTy, N1: Convert, N2: ZeroIdx);
19103}
19104
19105static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
19106 TargetLowering::DAGCombinerInfo &DCI,
19107 const AArch64Subtarget *Subtarget) {
19108 // First try to optimize away the conversion when it's conditionally from
19109 // a constant. Vectors only.
19110 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
19111 return Res;
19112
19113 if (SDValue Res =
19114 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19115 return Res;
19116
19117 EVT VT = N->getValueType(ResNo: 0);
19118 if (VT != MVT::f32 && VT != MVT::f64)
19119 return SDValue();
19120
19121 // Only optimize when the source and destination types have the same width.
19122 if (VT.getSizeInBits() != N->getOperand(Num: 0).getValueSizeInBits())
19123 return SDValue();
19124
  // If the result of an integer load is only used by an integer-to-float
  // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
  // This eliminates an "integer-to-vector-move" UOP and improves throughput.
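  // For example (illustrative registers):
  //   ldr w8, [x0]; scvtf s0, w8    // int load + GPR->FPR convert
  // becomes
  //   ldr s0, [x0]; scvtf s0, s0    // fp load + in-register convert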
19128 SDValue N0 = N->getOperand(Num: 0);
19129 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N: N0.getNode()) &&
19130 N0.hasOneUse() &&
19131 // Do not change the width of a volatile load.
19132 !cast<LoadSDNode>(Val&: N0)->isVolatile()) {
19133 LoadSDNode *LN0 = cast<LoadSDNode>(Val&: N0);
19134 SDValue Load = DAG.getLoad(VT, dl: SDLoc(N), Chain: LN0->getChain(), Ptr: LN0->getBasePtr(),
19135 PtrInfo: LN0->getPointerInfo(), Alignment: LN0->getAlign(),
19136 MMOFlags: LN0->getMemOperand()->getFlags());
19137
19138 // Make sure successors of the original load stay after it by updating them
19139 // to use the new Chain.
19140 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LN0, 1), To: Load.getValue(R: 1));
19141
    unsigned Opcode = (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF
                                                          : AArch64ISD::UITOF;
19144 return DAG.getNode(Opcode, DL: SDLoc(N), VT, Operand: Load);
19145 }
19146
19147 return SDValue();
19148}
19149
19150/// Fold a floating-point multiply by power of two into floating-point to
19151/// fixed-point conversion.
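/// For example, fp_to_sint(fmul(x, splat(8.0))) on v4f32 can be lowered to the
/// aarch64_neon_vcvtfp2fxs intrinsic with 3 fractional bits (roughly an
/// "fcvtzs ..., #3"), since scaling by 2^3 before truncation is exactly a
/// fixed-point conversion with 3 fractional bits.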
19152static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
19153 TargetLowering::DAGCombinerInfo &DCI,
19154 const AArch64Subtarget *Subtarget) {
19155 if (SDValue Res =
19156 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19157 return Res;
19158
19159 if (!Subtarget->isNeonAvailable())
19160 return SDValue();
19161
19162 if (!N->getValueType(ResNo: 0).isSimple())
19163 return SDValue();
19164
19165 SDValue Op = N->getOperand(Num: 0);
19166 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
19167 return SDValue();
19168
19169 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
19170 return SDValue();
19171
19172 SDValue ConstVec = Op->getOperand(Num: 1);
19173 if (!isa<BuildVectorSDNode>(Val: ConstVec))
19174 return SDValue();
19175
19176 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
19177 uint32_t FloatBits = FloatTy.getSizeInBits();
19178 if (FloatBits != 32 && FloatBits != 64 &&
19179 (FloatBits != 16 || !Subtarget->hasFullFP16()))
19180 return SDValue();
19181
19182 MVT IntTy = N->getSimpleValueType(ResNo: 0).getVectorElementType();
19183 uint32_t IntBits = IntTy.getSizeInBits();
19184 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
19185 return SDValue();
19186
19187 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
19188 if (IntBits > FloatBits)
19189 return SDValue();
19190
19191 BitVector UndefElements;
19192 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Val&: ConstVec);
19193 int32_t Bits = IntBits == 64 ? 64 : 32;
19194 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(UndefElements: &UndefElements, BitWidth: Bits + 1);
19195 if (C == -1 || C == 0 || C > Bits)
19196 return SDValue();
19197
19198 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
19199 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: ResTy))
19200 return SDValue();
19201
19202 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
19203 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
19204 EVT SatVT = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT();
19205 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
19206 return SDValue();
19207 }
19208
19209 SDLoc DL(N);
19210 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
19211 N->getOpcode() == ISD::FP_TO_SINT_SAT);
19212 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
19213 : Intrinsic::aarch64_neon_vcvtfp2fxu;
19214 SDValue FixConv =
19215 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: ResTy,
19216 N1: DAG.getConstant(Val: IntrinsicOpcode, DL, VT: MVT::i32),
19217 N2: Op->getOperand(Num: 0), N3: DAG.getConstant(Val: C, DL, VT: MVT::i32));
19218 // We can handle smaller integers by generating an extra trunc.
19219 if (IntBits < FloatBits)
19220 FixConv = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: N->getValueType(ResNo: 0), Operand: FixConv);
19221
19222 return FixConv;
19223}
19224
19225static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19226 const AArch64TargetLowering &TLI) {
19227 EVT VT = N->getValueType(ResNo: 0);
19228 SelectionDAG &DAG = DCI.DAG;
19229 SDLoc DL(N);
19230 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
19231
19232 if (!VT.isVector())
19233 return SDValue();
19234
19235 if (VT.isScalableVector() && !Subtarget.hasSVE2())
19236 return SDValue();
19237
19238 if (VT.isFixedLengthVector() &&
19239 (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT)))
19240 return SDValue();
19241
19242 SDValue N0 = N->getOperand(Num: 0);
19243 if (N0.getOpcode() != ISD::AND)
19244 return SDValue();
19245
19246 SDValue N1 = N->getOperand(Num: 1);
19247 if (N1.getOpcode() != ISD::AND)
19248 return SDValue();
19249
19250 // InstCombine does (not (neg a)) => (add a -1).
19251 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
19252 // Loop over all combinations of AND operands.
19253 for (int i = 1; i >= 0; --i) {
19254 for (int j = 1; j >= 0; --j) {
19255 SDValue O0 = N0->getOperand(Num: i);
19256 SDValue O1 = N1->getOperand(Num: j);
19257 SDValue Sub, Add, SubSibling, AddSibling;
19258
19259 // Find a SUB and an ADD operand, one from each AND.
19260 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
19261 Sub = O0;
19262 Add = O1;
19263 SubSibling = N0->getOperand(Num: 1 - i);
19264 AddSibling = N1->getOperand(Num: 1 - j);
19265 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
19266 Add = O0;
19267 Sub = O1;
19268 AddSibling = N0->getOperand(Num: 1 - i);
19269 SubSibling = N1->getOperand(Num: 1 - j);
19270 } else
19271 continue;
19272
19273 if (!ISD::isConstantSplatVectorAllZeros(N: Sub.getOperand(i: 0).getNode()))
19274 continue;
19275
      // The all-ones constant is always the right-hand operand of the Add.
19277 if (!ISD::isConstantSplatVectorAllOnes(N: Add.getOperand(i: 1).getNode()))
19278 continue;
19279
19280 if (Sub.getOperand(i: 1) != Add.getOperand(i: 0))
19281 continue;
19282
19283 return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: Sub, N2: SubSibling, N3: AddSibling);
19284 }
19285 }
19286
19287 // (or (and a b) (and (not a) c)) => (bsl a b c)
19288 // We only have to look for constant vectors here since the general, variable
19289 // case can be handled in TableGen.
19290 unsigned Bits = VT.getScalarSizeInBits();
19291 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
19292 for (int i = 1; i >= 0; --i)
19293 for (int j = 1; j >= 0; --j) {
19294 APInt Val1, Val2;
19295
19296 if (ISD::isConstantSplatVector(N: N0->getOperand(Num: i).getNode(), SplatValue&: Val1) &&
19297 ISD::isConstantSplatVector(N: N1->getOperand(Num: j).getNode(), SplatValue&: Val2) &&
19298 (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
19299 return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: N0->getOperand(Num: i),
19300 N2: N0->getOperand(Num: 1 - i), N3: N1->getOperand(Num: 1 - j));
19301 }
19302 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(Val: N0->getOperand(Num: i));
19303 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(Val: N1->getOperand(Num: j));
19304 if (!BVN0 || !BVN1)
19305 continue;
19306
19307 bool FoundMatch = true;
19308 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
19309 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(Val: BVN0->getOperand(Num: k));
19310 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Val: BVN1->getOperand(Num: k));
19311 if (!CN0 || !CN1 ||
19312 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
19313 FoundMatch = false;
19314 break;
19315 }
19316 }
19317 if (FoundMatch)
19318 return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: N0->getOperand(Num: i),
19319 N2: N0->getOperand(Num: 1 - i), N3: N1->getOperand(Num: 1 - j));
19320 }
19321
19322 return SDValue();
19323}
19324
19325// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
19326// convert to csel(ccmp(.., cc0)), depending on cc1:
19327
19328// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19329// =>
19330// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
19331//
19332// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19333// =>
19334// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
19335static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
19336 EVT VT = N->getValueType(ResNo: 0);
19337 SDValue CSel0 = N->getOperand(Num: 0);
19338 SDValue CSel1 = N->getOperand(Num: 1);
19339
19340 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
19341 CSel1.getOpcode() != AArch64ISD::CSEL)
19342 return SDValue();
19343
19344 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
19345 return SDValue();
19346
19347 if (!isNullConstant(V: CSel0.getOperand(i: 0)) ||
19348 !isOneConstant(V: CSel0.getOperand(i: 1)) ||
19349 !isNullConstant(V: CSel1.getOperand(i: 0)) ||
19350 !isOneConstant(V: CSel1.getOperand(i: 1)))
19351 return SDValue();
19352
19353 SDValue Cmp0 = CSel0.getOperand(i: 3);
19354 SDValue Cmp1 = CSel1.getOperand(i: 3);
19355 AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(i: 2);
19356 AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(i: 2);
19357 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
19358 return SDValue();
19359 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
19360 Cmp0.getOpcode() == AArch64ISD::SUBS) {
19361 std::swap(a&: Cmp0, b&: Cmp1);
19362 std::swap(a&: CC0, b&: CC1);
19363 }
19364
19365 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
19366 return SDValue();
19367
19368 SDLoc DL(N);
19369 SDValue CCmp, Condition;
19370 unsigned NZCV;
19371
19372 if (N->getOpcode() == ISD::AND) {
19373 AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(Code: CC0);
19374 Condition = DAG.getConstant(Val: InvCC0, DL, VT: MVT_CC);
19375 NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: CC1);
19376 } else {
19377 AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(Code: CC1);
19378 Condition = DAG.getConstant(Val: CC0, DL, VT: MVT_CC);
19379 NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: InvCC1);
19380 }
19381
19382 SDValue NZCVOp = DAG.getConstant(Val: NZCV, DL, VT: MVT::i32);
19383
19384 auto *Op1 = dyn_cast<ConstantSDNode>(Val: Cmp1.getOperand(i: 1));
19385 if (Op1 && Op1->getAPIntValue().isNegative() &&
19386 Op1->getAPIntValue().sgt(RHS: -32)) {
    // CCMP only accepts an immediate in the range [0, 31], so if Op1 is a
    // constant in the range [-31, -1] we can select a CCMN with its absolute
    // value instead and avoid the extra mov.
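    // For example, a compare of x against -5 becomes "ccmn x, #5, ..." rather
    // than materializing -5 in a register just to feed a ccmp.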
19390 SDValue AbsOp1 =
19391 DAG.getConstant(Val: Op1->getAPIntValue().abs(), DL, VT: Op1->getValueType(ResNo: 0));
19392 CCmp = DAG.getNode(Opcode: AArch64ISD::CCMN, DL, VT: MVT_CC, N1: Cmp1.getOperand(i: 0), N2: AbsOp1,
19393 N3: NZCVOp, N4: Condition, N5: Cmp0);
19394 } else {
19395 CCmp = DAG.getNode(Opcode: AArch64ISD::CCMP, DL, VT: MVT_CC, N1: Cmp1.getOperand(i: 0),
19396 N2: Cmp1.getOperand(i: 1), N3: NZCVOp, N4: Condition, N5: Cmp0);
19397 }
19398 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: CSel0.getOperand(i: 0),
19399 N2: CSel0.getOperand(i: 1), N3: DAG.getConstant(Val: CC1, DL, VT: MVT::i32),
19400 N4: CCmp);
19401}
19402
19403static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19404 const AArch64Subtarget *Subtarget,
19405 const AArch64TargetLowering &TLI) {
19406 SelectionDAG &DAG = DCI.DAG;
19407 EVT VT = N->getValueType(ResNo: 0);
19408
19409 if (SDValue R = performANDORCSELCombine(N, DAG))
19410 return R;
19411
19412 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19413 return SDValue();
19414
19415 if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
19416 return Res;
19417
19418 return SDValue();
19419}
19420
19421static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
19422 if (!MemVT.getVectorElementType().isSimple())
19423 return false;
19424
19425 uint64_t MaskForTy = 0ull;
19426 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
19427 case MVT::i8:
19428 MaskForTy = 0xffull;
19429 break;
19430 case MVT::i16:
19431 MaskForTy = 0xffffull;
19432 break;
19433 case MVT::i32:
19434 MaskForTy = 0xffffffffull;
19435 break;
19436 default:
19437 return false;
19438 break;
19439 }
19440
19441 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
19442 if (auto *Op0 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0)))
19443 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
19444
19445 return false;
19446}
19447
19448static SDValue performReinterpretCastCombine(SDNode *N) {
19449 SDValue LeafOp = SDValue(N, 0);
19450 SDValue Op = N->getOperand(Num: 0);
19451 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
19452 LeafOp.getValueType() != Op.getValueType())
19453 Op = Op->getOperand(Num: 0);
19454 if (LeafOp.getValueType() == Op.getValueType())
19455 return Op;
19456 return SDValue();
19457}
19458
19459static SDValue performSVEAndCombine(SDNode *N,
19460 TargetLowering::DAGCombinerInfo &DCI) {
19461 SelectionDAG &DAG = DCI.DAG;
19462 SDValue Src = N->getOperand(Num: 0);
19463 unsigned Opc = Src->getOpcode();
19464
19465 // Zero/any extend of an unsigned unpack
19466 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
19467 SDValue UnpkOp = Src->getOperand(Num: 0);
19468 SDValue Dup = N->getOperand(Num: 1);
19469
19470 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
19471 return SDValue();
19472
19473 SDLoc DL(N);
19474 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Dup->getOperand(Num: 0));
19475 if (!C)
19476 return SDValue();
19477
19478 uint64_t ExtVal = C->getZExtValue();
19479
19480 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
19481 return ((ExtVal == 0xFF && VT == MVT::i8) ||
19482 (ExtVal == 0xFFFF && VT == MVT::i16) ||
19483 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
19484 };
19485
19486 // If the mask is fully covered by the unpack, we don't need to push
19487 // a new AND onto the operand
19488 EVT EltTy = UnpkOp->getValueType(ResNo: 0).getVectorElementType();
19489 if (MaskAndTypeMatch(EltTy))
19490 return Src;
19491
19492 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
19493 // to see if the mask is all-ones of size MemTy.
19494 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(Val&: UnpkOp);
19495 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
19496 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
19497 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
19498 if (MaskAndTypeMatch(EltTy))
19499 return Src;
19500 }
19501
    // Truncate to prevent a DUP with an over-wide constant.
19503 APInt Mask = C->getAPIntValue().trunc(width: EltTy.getSizeInBits());
19504
19505 // Otherwise, make sure we propagate the AND to the operand
19506 // of the unpack
19507 Dup = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: UnpkOp->getValueType(ResNo: 0),
19508 Operand: DAG.getConstant(Val: Mask.zextOrTrunc(width: 32), DL, VT: MVT::i32));
19509
19510 SDValue And = DAG.getNode(Opcode: ISD::AND, DL,
19511 VT: UnpkOp->getValueType(ResNo: 0), N1: UnpkOp, N2: Dup);
19512
19513 return DAG.getNode(Opcode: Opc, DL, VT: N->getValueType(ResNo: 0), Operand: And);
19514 }
19515
19516 if (DCI.isBeforeLegalizeOps())
19517 return SDValue();
19518
19519 // If both sides of AND operations are i1 splat_vectors then
19520 // we can produce just i1 splat_vector as the result.
19521 if (isAllActivePredicate(DAG, N: N->getOperand(Num: 0)))
19522 return N->getOperand(Num: 1);
19523 if (isAllActivePredicate(DAG, N: N->getOperand(Num: 1)))
19524 return N->getOperand(Num: 0);
19525
19526 if (!EnableCombineMGatherIntrinsics)
19527 return SDValue();
19528
19529 SDValue Mask = N->getOperand(Num: 1);
19530
19531 if (!Src.hasOneUse())
19532 return SDValue();
19533
19534 EVT MemVT;
19535
19536 // SVE load instructions perform an implicit zero-extend, which makes them
19537 // perfect candidates for combining.
19538 switch (Opc) {
19539 case AArch64ISD::LD1_MERGE_ZERO:
19540 case AArch64ISD::LDNF1_MERGE_ZERO:
19541 case AArch64ISD::LDFF1_MERGE_ZERO:
19542 MemVT = cast<VTSDNode>(Val: Src->getOperand(Num: 3))->getVT();
19543 break;
19544 case AArch64ISD::GLD1_MERGE_ZERO:
19545 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
19546 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
19547 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
19548 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
19549 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
19550 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
19551 case AArch64ISD::GLDFF1_MERGE_ZERO:
19552 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
19553 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
19554 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
19555 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
19556 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
19557 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
19558 case AArch64ISD::GLDNT1_MERGE_ZERO:
19559 MemVT = cast<VTSDNode>(Val: Src->getOperand(Num: 4))->getVT();
19560 break;
19561 default:
19562 return SDValue();
19563 }
19564
19565 if (isConstantSplatVectorMaskForType(N: Mask.getNode(), MemVT))
19566 return Src;
19567
19568 return SDValue();
19569}
19570
19571// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
19572static SDValue performANDSETCCCombine(SDNode *N,
19573 TargetLowering::DAGCombinerInfo &DCI) {
19574
19575 // This function performs an optimization on a specific pattern involving
19576 // an AND operation and SETCC (Set Condition Code) node.
19577
19578 SDValue SetCC = N->getOperand(Num: 0);
19579 EVT VT = N->getValueType(ResNo: 0);
19580 SelectionDAG &DAG = DCI.DAG;
19581
  // If the current node (N) is used by any SELECT instruction, bail out
  // (return an empty SDValue); applying the optimization there could produce
  // incorrect results.
19585 for (auto U : N->users())
19586 if (U->getOpcode() == ISD::SELECT)
19587 return SDValue();
19588
19589 // Check if the operand is a SETCC node with floating-point comparison
19590 if (SetCC.getOpcode() == ISD::SETCC &&
19591 SetCC.getOperand(i: 0).getValueType() == MVT::f32) {
19592
19593 SDValue Cmp;
19594 AArch64CC::CondCode CC;
19595
19596 // Check if the DAG is after legalization and if we can emit the conjunction
19597 if (!DCI.isBeforeLegalize() &&
19598 (Cmp = emitConjunction(DAG, Val: SDValue(N, 0), OutCC&: CC))) {
19599
19600 AArch64CC::CondCode InvertedCC = AArch64CC::getInvertedCondCode(Code: CC);
19601
19602 SDLoc DL(N);
19603 return DAG.getNode(Opcode: AArch64ISD::CSINC, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT),
19604 N2: DAG.getConstant(Val: 0, DL, VT),
19605 N3: DAG.getConstant(Val: InvertedCC, DL, VT: MVT::i32), N4: Cmp);
19606 }
19607 }
19608 return SDValue();
19609}
19610
19611static SDValue performANDCombine(SDNode *N,
19612 TargetLowering::DAGCombinerInfo &DCI) {
19613 SelectionDAG &DAG = DCI.DAG;
19614 SDValue LHS = N->getOperand(Num: 0);
19615 SDValue RHS = N->getOperand(Num: 1);
19616 EVT VT = N->getValueType(ResNo: 0);
19617
19618 if (SDValue R = performANDORCSELCombine(N, DAG))
19619 return R;
19620
  if (SDValue R = performANDSETCCCombine(N, DCI))
19622 return R;
19623
19624 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19625 return SDValue();
19626
19627 if (VT.isScalableVector())
19628 return performSVEAndCombine(N, DCI);
19629
19630 // The combining code below works only for NEON vectors. In particular, it
19631 // does not work for SVE when dealing with vectors wider than 128 bits.
19632 if (!VT.is64BitVector() && !VT.is128BitVector())
19633 return SDValue();
19634
19635 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: RHS.getNode());
19636 if (!BVN)
19637 return SDValue();
19638
19639 // AND does not accept an immediate, so check if we can use a BIC immediate
19640 // instruction instead. We do this here instead of using a (and x, (mvni imm))
19641 // pattern in isel, because some immediates may be lowered to the preferred
19642 // (and x, (movi imm)) form, even though an mvni representation also exists.
19643 APInt DefBits(VT.getSizeInBits(), 0);
19644 APInt UndefBits(VT.getSizeInBits(), 0);
19645 if (resolveBuildVector(BVN, CnstBits&: DefBits, UndefBits)) {
19646 SDValue NewOp;
19647
19648 // Any bits known to already be 0 need not be cleared again, which can help
19649 // reduce the size of the immediate to one supported by the instruction.
19650 KnownBits Known = DAG.computeKnownBits(Op: LHS);
19651 APInt ZeroSplat(VT.getSizeInBits(), 0);
19652 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
19653 ZeroSplat |= Known.Zero.zext(width: VT.getSizeInBits())
19654 << (Known.Zero.getBitWidth() * I);
19655
19656 DefBits = ~(DefBits | ZeroSplat);
19657 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG,
19658 Bits: DefBits, LHS: &LHS)) ||
19659 (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG,
19660 Bits: DefBits, LHS: &LHS)))
19661 return NewOp;
19662
19663 UndefBits = ~(UndefBits | ZeroSplat);
19664 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG,
19665 Bits: UndefBits, LHS: &LHS)) ||
19666 (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG,
19667 Bits: UndefBits, LHS: &LHS)))
19668 return NewOp;
19669 }
19670
19671 return SDValue();
19672}
19673
19674static SDValue performFADDCombine(SDNode *N,
19675 TargetLowering::DAGCombinerInfo &DCI) {
19676 SelectionDAG &DAG = DCI.DAG;
19677 SDValue LHS = N->getOperand(Num: 0);
19678 SDValue RHS = N->getOperand(Num: 1);
19679 EVT VT = N->getValueType(ResNo: 0);
19680 SDLoc DL(N);
19681
19682 if (!N->getFlags().hasAllowReassociation())
19683 return SDValue();
19684
  // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
19686 auto ReassocComplex = [&](SDValue A, SDValue B) {
19687 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
19688 return SDValue();
19689 unsigned Opc = A.getConstantOperandVal(i: 0);
19690 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
19691 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
19692 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
19693 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
19694 return SDValue();
19695 SDValue VCMLA = DAG.getNode(
19696 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: A.getOperand(i: 0),
19697 N2: DAG.getNode(Opcode: ISD::FADD, DL, VT, N1: A.getOperand(i: 1), N2: B, Flags: N->getFlags()),
19698 N3: A.getOperand(i: 2), N4: A.getOperand(i: 3));
19699 VCMLA->setFlags(A->getFlags());
19700 return VCMLA;
19701 };
19702 if (SDValue R = ReassocComplex(LHS, RHS))
19703 return R;
19704 if (SDValue R = ReassocComplex(RHS, LHS))
19705 return R;
19706
19707 return SDValue();
19708}
19709
19710static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
19711 switch (Opcode) {
19712 case ISD::STRICT_FADD:
19713 case ISD::FADD:
19714 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
19715 case ISD::ADD:
19716 return VT == MVT::i64;
19717 default:
19718 return false;
19719 }
19720}
19721
19722static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
19723 AArch64CC::CondCode Cond);
19724
19725static bool isPredicateCCSettingOp(SDValue N) {
19726 if ((N.getOpcode() == ISD::SETCC) ||
19727 // get_active_lane_mask is lowered to a whilelo instruction.
19728 (N.getOpcode() == ISD::GET_ACTIVE_LANE_MASK) ||
19729 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
19730 (N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilege ||
19731 N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilegt ||
19732 N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilehi ||
19733 N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilehs ||
19734 N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilele ||
19735 N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilelo ||
19736 N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilels ||
19737 N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilelt)))
19738 return true;
19739
19740 return false;
19741}
19742
19743// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
19744// ... into: "ptrue p, all" + PTEST
19745static SDValue
19746performFirstTrueTestVectorCombine(SDNode *N,
19747 TargetLowering::DAGCombinerInfo &DCI,
19748 const AArch64Subtarget *Subtarget) {
19749 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19750 // Make sure PTEST can be legalised with illegal types.
19751 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
19752 return SDValue();
19753
19754 SDValue N0 = N->getOperand(Num: 0);
19755 EVT VT = N0.getValueType();
19756
19757 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
19758 !isNullConstant(V: N->getOperand(Num: 1)))
19759 return SDValue();
19760
19761 // Restrict the DAG combine to only cases where we're extracting from a
19762 // flag-setting operation.
19763 if (!isPredicateCCSettingOp(N: N0))
19764 return SDValue();
19765
19766 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
19767 SelectionDAG &DAG = DCI.DAG;
19768 SDValue Pg = getPTrue(DAG, DL: SDLoc(N), VT, Pattern: AArch64SVEPredPattern::all);
19769 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg, Op: N0, Cond: AArch64CC::FIRST_ACTIVE);
19770}
19771
19772// Materialize : Idx = (add (mul vscale, NumEls), -1)
19773// i1 = extract_vector_elt t37, Constant:i64<Idx>
19774// ... into: "ptrue p, all" + PTEST
19775static SDValue
19776performLastTrueTestVectorCombine(SDNode *N,
19777 TargetLowering::DAGCombinerInfo &DCI,
19778 const AArch64Subtarget *Subtarget) {
19779 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19780 // Make sure PTEST can be legalised with illegal types.
19781 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
19782 return SDValue();
19783
19784 SDValue N0 = N->getOperand(Num: 0);
19785 EVT OpVT = N0.getValueType();
19786
19787 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
19788 return SDValue();
19789
19790 // Idx == (add (mul vscale, NumEls), -1)
19791 SDValue Idx = N->getOperand(Num: 1);
19792 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(V: Idx.getOperand(i: 1)))
19793 return SDValue();
19794
19795 SDValue VS = Idx.getOperand(i: 0);
19796 if (VS.getOpcode() != ISD::VSCALE)
19797 return SDValue();
19798
19799 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
19800 if (VS.getConstantOperandVal(i: 0) != NumEls)
19801 return SDValue();
19802
19803 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
19804 SelectionDAG &DAG = DCI.DAG;
19805 SDValue Pg = getPTrue(DAG, DL: SDLoc(N), VT: OpVT, Pattern: AArch64SVEPredPattern::all);
19806 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg, Op: N0, Cond: AArch64CC::LAST_ACTIVE);
19807}
19808
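// Fold extract_vector_elt(Vec, vector_find_last_active(Mask)) into
// LASTB(Mask, Vec), which extracts the last active element directly.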
19809static SDValue
19810performExtractLastActiveCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19811 const AArch64Subtarget *Subtarget) {
19812 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19813 SelectionDAG &DAG = DCI.DAG;
19814 SDValue Vec = N->getOperand(Num: 0);
19815 SDValue Idx = N->getOperand(Num: 1);
19816
19817 if (DCI.isBeforeLegalize() || Idx.getOpcode() != ISD::VECTOR_FIND_LAST_ACTIVE)
19818 return SDValue();
19819
19820 // Only legal for 8, 16, 32, and 64 bit element types.
19821 EVT EltVT = Vec.getValueType().getVectorElementType();
19822 if (!is_contained(Range: ArrayRef({MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::f16,
19823 MVT::bf16, MVT::f32, MVT::f64}),
19824 Element: EltVT.getSimpleVT().SimpleTy))
19825 return SDValue();
19826
19827 SDValue Mask = Idx.getOperand(i: 0);
19828 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19829 if (!TLI.isOperationLegal(Op: ISD::VECTOR_FIND_LAST_ACTIVE, VT: Mask.getValueType()))
19830 return SDValue();
19831
19832 return DAG.getNode(Opcode: AArch64ISD::LASTB, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: Mask,
19833 N2: Vec);
19834}
19835
19836static SDValue
19837performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19838 const AArch64Subtarget *Subtarget) {
19839 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19840 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
19841 return Res;
19842 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
19843 return Res;
19844 if (SDValue Res = performExtractLastActiveCombine(N, DCI, Subtarget))
19845 return Res;
19846
19847 SelectionDAG &DAG = DCI.DAG;
19848 SDValue N0 = N->getOperand(Num: 0), N1 = N->getOperand(Num: 1);
19849
19850 EVT VT = N->getValueType(ResNo: 0);
19851 const bool FullFP16 = Subtarget->hasFullFP16();
19852 bool IsStrict = N0->isStrictFPOpcode();
19853
19854 // extract(dup x) -> x
19855 if (N0.getOpcode() == AArch64ISD::DUP)
19856 return VT.isInteger() ? DAG.getZExtOrTrunc(Op: N0.getOperand(i: 0), DL: SDLoc(N), VT)
19857 : N0.getOperand(i: 0);
19858
19859 // Rewrite for pairwise fadd pattern
19860 // (f32 (extract_vector_elt
19861 // (fadd (vXf32 Other)
19862 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
19863 // ->
19864 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
19865 // (extract_vector_elt (vXf32 Other) 1))
19866 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
19867 // we can only do this when it's used only by the extract_vector_elt.
19868 if (isNullConstant(V: N1) && hasPairwiseAdd(Opcode: N0->getOpcode(), VT, FullFP16) &&
19869 (!IsStrict || N0.hasOneUse())) {
19870 SDLoc DL(N0);
19871 SDValue N00 = N0->getOperand(Num: IsStrict ? 1 : 0);
19872 SDValue N01 = N0->getOperand(Num: IsStrict ? 2 : 1);
19873
19874 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(Val&: N01);
19875 SDValue Other = N00;
19876
19877 // And handle the commutative case.
19878 if (!Shuffle) {
19879 Shuffle = dyn_cast<ShuffleVectorSDNode>(Val&: N00);
19880 Other = N01;
19881 }
19882
19883 if (Shuffle && Shuffle->getMaskElt(Idx: 0) == 1 &&
19884 Other == Shuffle->getOperand(Num: 0)) {
19885 SDValue Extract1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: Other,
19886 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
19887 SDValue Extract2 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: Other,
19888 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i64));
19889 if (!IsStrict)
19890 return DAG.getNode(Opcode: N0->getOpcode(), DL, VT, N1: Extract1, N2: Extract2);
19891
19892 // For strict_fadd we need uses of the final extract_vector to be replaced
19893 // with the strict_fadd, but we also need uses of the chain output of the
19894 // original strict_fadd to use the chain output of the new strict_fadd as
19895 // otherwise it may not be deleted.
19896 SDValue Ret = DAG.getNode(Opcode: N0->getOpcode(), DL,
19897 ResultTys: {VT, MVT::Other},
19898 Ops: {N0->getOperand(Num: 0), Extract1, Extract2});
19899 DAG.ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Ret);
19900 DAG.ReplaceAllUsesOfValueWith(From: N0.getValue(R: 1), To: Ret.getValue(R: 1));
19901 return SDValue(N, 0);
19902 }
19903 }
19904
19905 return SDValue();
19906}
19907
19908static SDValue performConcatVectorsCombine(SDNode *N,
19909 TargetLowering::DAGCombinerInfo &DCI,
19910 SelectionDAG &DAG) {
19911 SDLoc DL(N);
19912 EVT VT = N->getValueType(ResNo: 0);
19913 SDValue N0 = N->getOperand(Num: 0), N1 = N->getOperand(Num: 1);
19914 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
19915
19916 if (VT.isScalableVector())
19917 return SDValue();
19918
19919 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
19920 N1Opc == ISD::TRUNCATE) {
19921 SDValue N00 = N0->getOperand(Num: 0);
19922 SDValue N10 = N1->getOperand(Num: 0);
19923 EVT N00VT = N00.getValueType();
19924 unsigned N00Opc = N00.getOpcode(), N10Opc = N10.getOpcode();
19925
19926 // Optimize concat_vectors of truncated vectors, where the intermediate
19927 // type is illegal, to avoid said illegality, e.g.,
19928 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
19929 // (v2i16 (truncate (v2i64)))))
19930 // ->
19931 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
19932 // (v4i32 (bitcast (v2i64))),
19933 // <0, 2, 4, 6>)))
19934 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
19935 // on both input and result type, so we might generate worse code.
19936 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
19937 if (N00VT == N10.getValueType() &&
19938 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
19939 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
19940 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
19941 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
19942 for (size_t i = 0; i < Mask.size(); ++i)
19943 Mask[i] = i * 2;
19944 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT,
19945 Operand: DAG.getVectorShuffle(
19946 VT: MidVT, dl: DL,
19947 N1: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MidVT, Operand: N00),
19948 N2: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MidVT, Operand: N10), Mask));
19949 }
19950
19951 // Optimize two large shifts and a combine (uzp1) into a single combine and a
19952 // smaller shift. On AArch64, sequences like the following:
19953 //
19954 // ushr v0.4s, v0.4s, #20
19955 // ushr v1.4s, v1.4s, #20
19956 // uzp1 v0.8h, v0.8h, v1.8h
19957 //
19958 // Can be optimized to:
19959 //
19960 // uzp2 v0.8h, v0.8h, v1.8h
19961 // ushr v0.8h, v0.8h, #4
19962 //
19963 // This optimization reduces instruction count.
19964 if (N00Opc == AArch64ISD::VLSHR && N10Opc == AArch64ISD::VLSHR &&
19965 N00->getOperand(Num: 1) == N10->getOperand(Num: 1)) {
19966 SDValue N000 = N00->getOperand(Num: 0);
19967 SDValue N100 = N10->getOperand(Num: 0);
19968 uint64_t N001ConstVal = N00->getConstantOperandVal(Num: 1),
19969 N101ConstVal = N10->getConstantOperandVal(Num: 1),
19970 NScalarSize = N->getValueType(ResNo: 0).getScalarSizeInBits();
19971
19972 if (N001ConstVal == N101ConstVal && N001ConstVal > NScalarSize) {
19973 N000 = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: N000);
19974 N100 = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: N100);
19975 SDValue Uzp = DAG.getNode(Opcode: AArch64ISD::UZP2, DL, VT, N1: N000, N2: N100);
19976 SDValue NewShiftConstant =
19977 DAG.getConstant(Val: N001ConstVal - NScalarSize, DL, VT: MVT::i32);
19978
19979 return DAG.getNode(Opcode: AArch64ISD::VLSHR, DL, VT, N1: Uzp, N2: NewShiftConstant);
19980 }
19981 }
19982 }
19983
19984 if (N->getOperand(Num: 0).getValueType() == MVT::v4i8 ||
19985 N->getOperand(Num: 0).getValueType() == MVT::v2i16 ||
19986 N->getOperand(Num: 0).getValueType() == MVT::v2i8) {
19987 EVT SrcVT = N->getOperand(Num: 0).getValueType();
19988 // If we have a concat of small (v4i8/v2i16/v2i8) loads, convert them to a
19989 // build_vector of f32/f16 loads to avoid the small-type load legalization
19990 // that needs to extend each element into a larger type.
19991 if (N->getNumOperands() % 2 == 0 &&
19992 all_of(Range: N->op_values(), P: [SrcVT](SDValue V) {
19993 if (V.getValueType() != SrcVT)
19994 return false;
19995 if (V.isUndef())
19996 return true;
19997 LoadSDNode *LD = dyn_cast<LoadSDNode>(Val&: V);
19998 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
19999 LD->getExtensionType() == ISD::NON_EXTLOAD;
20000 })) {
20001 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
20002 EVT NVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: FVT, NumElements: N->getNumOperands());
20003 SmallVector<SDValue> Ops;
20004
20005 for (unsigned i = 0; i < N->getNumOperands(); i++) {
20006 SDValue V = N->getOperand(Num: i);
20007 if (V.isUndef())
20008 Ops.push_back(Elt: DAG.getUNDEF(VT: FVT));
20009 else {
20010 LoadSDNode *LD = cast<LoadSDNode>(Val&: V);
20011 SDValue NewLoad = DAG.getLoad(VT: FVT, dl: DL, Chain: LD->getChain(),
20012 Ptr: LD->getBasePtr(), MMO: LD->getMemOperand());
20013 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LD, 1), To: NewLoad.getValue(R: 1));
20014 Ops.push_back(Elt: NewLoad);
20015 }
20016 }
20017 return DAG.getBitcast(VT: N->getValueType(ResNo: 0),
20018 V: DAG.getBuildVector(VT: NVT, DL, Ops));
20019 }
20020 }
20021
20022 // Canonicalise concat_vectors to replace concatenations of truncated nots
20023 // with nots of concatenated truncates. This in some cases allows for multiple
20024 // redundant negations to be eliminated.
20025 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
20026 // (v4i16 (truncate (not (v4i32)))))
20027 // ->
20028 // (not (concat_vectors (v4i16 (truncate (v4i32))),
20029 // (v4i16 (truncate (v4i32)))))
20030 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20031 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N: N0.getNode()) &&
20032 N->isOnlyUserOf(N: N1.getNode())) {
20033 auto isBitwiseVectorNegate = [](SDValue V) {
20034 return V->getOpcode() == ISD::XOR &&
20035 ISD::isConstantSplatVectorAllOnes(N: V.getOperand(i: 1).getNode());
20036 };
20037 SDValue N00 = N0->getOperand(Num: 0);
20038 SDValue N10 = N1->getOperand(Num: 0);
20039 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N: N00.getNode()) &&
20040 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N: N10.getNode())) {
20041 return DAG.getNOT(
20042 DL,
20043 Val: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT,
20044 N1: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: N0.getValueType(),
20045 Operand: N00->getOperand(Num: 0)),
20046 N2: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: N1.getValueType(),
20047 Operand: N10->getOperand(Num: 0))),
20048 VT);
20049 }
20050 }
20051
20052 // Wait till after everything is legalized to try this. That way we have
20053 // legal vector types and such.
20054 if (DCI.isBeforeLegalizeOps())
20055 return SDValue();
20056
20057 // Optimise a concat_vectors of two binops with the same opcode and a 128-bit
20058 // destination size into a binop of two concats of the source vectors. eg:
20059 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
20060 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
20061 DAG.getTargetLoweringInfo().isBinOp(Opcode: N0Opc) && N0->hasOneUse() &&
20062 N1->hasOneUse()) {
20063 SDValue N00 = N0->getOperand(Num: 0);
20064 SDValue N01 = N0->getOperand(Num: 1);
20065 SDValue N10 = N1->getOperand(Num: 0);
20066 SDValue N11 = N1->getOperand(Num: 1);
20067
20068 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
20069 SDValue Concat0 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: N00, N2: N10);
20070 SDValue Concat1 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: N01, N2: N11);
20071 return DAG.getNode(Opcode: N0Opc, DL, VT, N1: Concat0, N2: Concat1);
20072 }
20073 }
20074
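 // Matches the expanded form of a rounding shift right as used by rshrn:
 // (VLSHR (ADD X, 1 << (ShtAmt - 1)), ShtAmt), where the rounding constant is
 // either a MOVIshift or a constant DUP splat.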
20075 auto IsRSHRN = [](SDValue Shr) {
20076 if (Shr.getOpcode() != AArch64ISD::VLSHR)
20077 return false;
20078 SDValue Op = Shr.getOperand(i: 0);
20079 EVT VT = Op.getValueType();
20080 unsigned ShtAmt = Shr.getConstantOperandVal(i: 1);
20081 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
20082 return false;
20083
20084 APInt Imm;
20085 if (Op.getOperand(i: 1).getOpcode() == AArch64ISD::MOVIshift)
20086 Imm = APInt(VT.getScalarSizeInBits(),
20087 Op.getOperand(i: 1).getConstantOperandVal(i: 0)
20088 << Op.getOperand(i: 1).getConstantOperandVal(i: 1));
20089 else if (Op.getOperand(i: 1).getOpcode() == AArch64ISD::DUP &&
20090 isa<ConstantSDNode>(Val: Op.getOperand(i: 1).getOperand(i: 0)))
20091 Imm = APInt(VT.getScalarSizeInBits(),
20092 Op.getOperand(i: 1).getConstantOperandVal(i: 0));
20093 else
20094 return false;
20095
20096 if (Imm != 1ULL << (ShtAmt - 1))
20097 return false;
20098 return true;
20099 };
20100
20101 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
20102 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
20103 ((IsRSHRN(N1) &&
20104 N0.getConstantOperandVal(i: 1) == N1.getConstantOperandVal(i: 1)) ||
20105 N1.isUndef())) {
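 // Redo the rounding add and shift at double the vector width: concat the
 // unshifted inputs, add back the rounding constant, then shift.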
20106 SDValue X = N0.getOperand(i: 0).getOperand(i: 0);
20107 SDValue Y = N1.isUndef() ? DAG.getUNDEF(VT: X.getValueType())
20108 : N1.getOperand(i: 0).getOperand(i: 0);
20109 EVT BVT =
20110 X.getValueType().getDoubleNumVectorElementsVT(Context&: *DCI.DAG.getContext());
20111 SDValue CC = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: BVT, N1: X, N2: Y);
20112 SDValue Add = DAG.getNode(
20113 Opcode: ISD::ADD, DL, VT: BVT, N1: CC,
20114 N2: DAG.getConstant(Val: 1ULL << (N0.getConstantOperandVal(i: 1) - 1), DL, VT: BVT));
20115 SDValue Shr =
20116 DAG.getNode(Opcode: AArch64ISD::VLSHR, DL, VT: BVT, N1: Add, N2: N0.getOperand(i: 1));
20117 return Shr;
20118 }
20119
20120 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
20121 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
20122 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(i: 0) == N1.getOperand(i: 0) &&
20123 N0.getOperand(i: 1) == N1.getOperand(i: 1)) {
20124 SDValue E0 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: N0.getOperand(i: 0),
20125 N2: DAG.getUNDEF(VT: N0.getValueType()));
20126 SDValue E1 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: N0.getOperand(i: 1),
20127 N2: DAG.getUNDEF(VT: N0.getValueType()));
20128 return DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT, N1: E0, N2: E1);
20129 }
20130
20131 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
20132 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
20133 // canonicalise to that.
20134 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
20135 assert(VT.getScalarSizeInBits() == 64);
20136 return DAG.getNode(Opcode: AArch64ISD::DUPLANE64, DL, VT, N1: WidenVector(V64Reg: N0, DAG),
20137 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
20138 }
20139
20140 // Canonicalise concat_vectors so that the right-hand vector has as few
20141 // bit-casts as possible before its real operation. The primary matching
20142 // destination for these operations will be the narrowing "2" instructions,
20143 // which depend on the operation being performed on this right-hand vector.
20144 // For example,
20145 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
20146 // becomes
20147 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
20148
20149 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
20150 return SDValue();
20151 SDValue RHS = N1->getOperand(Num: 0);
20152 MVT RHSTy = RHS.getValueType().getSimpleVT();
20153 // If the RHS is not a vector, this is not the pattern we're looking for.
20154 if (!RHSTy.isVector())
20155 return SDValue();
20156
20157 LLVM_DEBUG(
20158 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
20159
20160 MVT ConcatTy = MVT::getVectorVT(VT: RHSTy.getVectorElementType(),
20161 NumElements: RHSTy.getVectorNumElements() * 2);
20162 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT,
20163 Operand: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: ConcatTy,
20164 N1: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: RHSTy, Operand: N0),
20165 N2: RHS));
20166}
20167
20168static SDValue
20169performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20170 SelectionDAG &DAG) {
20171 if (DCI.isBeforeLegalizeOps())
20172 return SDValue();
20173
20174 EVT VT = N->getValueType(ResNo: 0);
20175 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
20176 return SDValue();
20177
20178 SDValue V = N->getOperand(Num: 0);
20179
20180 // NOTE: This combine exists in DAGCombiner, but that version's legality check
20181 // blocks this combine because the non-const case requires custom lowering.
20182 //
20183 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
20184 if (V.getOpcode() == ISD::SPLAT_VECTOR)
20185 if (isa<ConstantSDNode>(Val: V.getOperand(i: 0)))
20186 return DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL: SDLoc(N), VT, Operand: V.getOperand(i: 0));
20187
20188 return SDValue();
20189}
20190
20191static SDValue
20192performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20193 SelectionDAG &DAG) {
20194 SDLoc DL(N);
20195 SDValue Vec = N->getOperand(Num: 0);
20196 SDValue SubVec = N->getOperand(Num: 1);
20197 uint64_t IdxVal = N->getConstantOperandVal(Num: 2);
20198 EVT VecVT = Vec.getValueType();
20199 EVT SubVT = SubVec.getValueType();
20200
20201 // Promote fixed length vector zeros.
20202 if (VecVT.isScalableVector() && SubVT.isFixedLengthVector() &&
20203 Vec.isUndef() && isZerosVector(N: SubVec.getNode()))
20204 return VecVT.isInteger() ? DAG.getConstant(Val: 0, DL, VT: VecVT)
20205 : DAG.getConstantFP(Val: 0, DL, VT: VecVT);
20206
20207 // Only do this for legal fixed vector types.
20208 if (!VecVT.isFixedLengthVector() ||
20209 !DAG.getTargetLoweringInfo().isTypeLegal(VT: VecVT) ||
20210 !DAG.getTargetLoweringInfo().isTypeLegal(VT: SubVT))
20211 return SDValue();
20212
20213 // Ignore widening patterns.
20214 if (IdxVal == 0 && Vec.isUndef())
20215 return SDValue();
20216
20217 // Subvector must be half the width and an "aligned" insertion.
20218 unsigned NumSubElts = SubVT.getVectorNumElements();
20219 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
20220 (IdxVal != 0 && IdxVal != NumSubElts))
20221 return SDValue();
20222
20223 // Fold insert_subvector -> concat_vectors
20224 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
20225 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
20226 SDValue Lo, Hi;
20227 if (IdxVal == 0) {
20228 Lo = SubVec;
20229 Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: SubVT, N1: Vec,
20230 N2: DAG.getVectorIdxConstant(Val: NumSubElts, DL));
20231 } else {
20232 Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: SubVT, N1: Vec,
20233 N2: DAG.getVectorIdxConstant(Val: 0, DL));
20234 Hi = SubVec;
20235 }
20236 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: VecVT, N1: Lo, N2: Hi);
20237}
20238
20239static SDValue tryCombineFixedPointConvert(SDNode *N,
20240 TargetLowering::DAGCombinerInfo &DCI,
20241 SelectionDAG &DAG) {
20242 // Wait until after everything is legalized to try this. That way we have
20243 // legal vector types and such.
20244 if (DCI.isBeforeLegalizeOps())
20245 return SDValue();
20246 // Transform a scalar conversion of a value from a lane extract into a
20247 // lane extract of a vector conversion. E.g., from foo1 to foo2:
20248 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
20249 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
20250 //
20251 // The second form interacts better with instruction selection and the
20252 // register allocator to avoid cross-class register copies that aren't
20253 // coalescable due to a lane reference.
20254
20255 // Check the operand and see if it originates from a lane extract.
20256 SDValue Op1 = N->getOperand(Num: 1);
20257 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
20258 return SDValue();
20259
20260 // Yep, no additional predication needed. Perform the transform.
20261 SDValue IID = N->getOperand(Num: 0);
20262 SDValue Shift = N->getOperand(Num: 2);
20263 SDValue Vec = Op1.getOperand(i: 0);
20264 SDValue Lane = Op1.getOperand(i: 1);
20265 EVT ResTy = N->getValueType(ResNo: 0);
20266 EVT VecResTy;
20267 SDLoc DL(N);
20268
20269 // The vector width should be 128 bits by the time we get here, even
20270 // if it started as 64 bits (the extract_vector handling will have
20271 // done so). Bail if it is not.
20272 if (Vec.getValueSizeInBits() != 128)
20273 return SDValue();
20274
20275 if (Vec.getValueType() == MVT::v4i32)
20276 VecResTy = MVT::v4f32;
20277 else if (Vec.getValueType() == MVT::v2i64)
20278 VecResTy = MVT::v2f64;
20279 else
20280 return SDValue();
20281
20282 SDValue Convert =
20283 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: VecResTy, N1: IID, N2: Vec, N3: Shift);
20284 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ResTy, N1: Convert, N2: Lane);
20285}
20286
20287// AArch64 high-vector "long" operations are formed by performing the non-high
20288// version on an extract_subvector of each operand which gets the high half:
20289//
20290// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
20291//
20292// However, there are cases which don't have an extract_high explicitly, but
20293// have another operation that can be made compatible with one for free. For
20294// example:
20295//
20296// (dupv64 scalar) --> (extract_high (dup128 scalar))
20297//
20298// This routine does the actual conversion of such DUPs, once outer routines
20299// have determined that everything else is in order.
20300// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
20301// similarly here.
20302static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
20303 MVT VT = N.getSimpleValueType();
20304 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
20305 N.getConstantOperandVal(i: 1) == 0)
20306 N = N.getOperand(i: 0);
20307
20308 switch (N.getOpcode()) {
20309 case AArch64ISD::DUP:
20310 case AArch64ISD::DUPLANE8:
20311 case AArch64ISD::DUPLANE16:
20312 case AArch64ISD::DUPLANE32:
20313 case AArch64ISD::DUPLANE64:
20314 case AArch64ISD::MOVI:
20315 case AArch64ISD::MOVIshift:
20316 case AArch64ISD::MOVIedit:
20317 case AArch64ISD::MOVImsl:
20318 case AArch64ISD::MVNIshift:
20319 case AArch64ISD::MVNImsl:
20320 break;
20321 default:
20322 // FMOV could be supported, but isn't very useful, as it would only occur
20323 // if you passed a bitcast'd floating point immediate to an eligible long
20324 // integer op (addl, smull, ...).
20325 return SDValue();
20326 }
20327
20328 if (!VT.is64BitVector())
20329 return SDValue();
20330
20331 SDLoc DL(N);
20332 unsigned NumElems = VT.getVectorNumElements();
20333 if (N.getValueType().is64BitVector()) {
20334 MVT ElementTy = VT.getVectorElementType();
20335 MVT NewVT = MVT::getVectorVT(VT: ElementTy, NumElements: NumElems * 2);
20336 N = DAG.getNode(Opcode: N->getOpcode(), DL, VT: NewVT, Ops: N->ops());
20337 }
20338
20339 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: N,
20340 N2: DAG.getConstant(Val: NumElems, DL, VT: MVT::i64));
20341}
20342
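// Returns true if N is (possibly looking through a bitcast) an
// extract_subvector of the high half of a fixed-length vector.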
20343static bool isEssentiallyExtractHighSubvector(SDValue N) {
20344 if (N.getOpcode() == ISD::BITCAST)
20345 N = N.getOperand(i: 0);
20346 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
20347 return false;
20348 if (N.getOperand(i: 0).getValueType().isScalableVector())
20349 return false;
20350 return N.getConstantOperandAPInt(i: 1) ==
20351 N.getOperand(i: 0).getValueType().getVectorNumElements() / 2;
20352}
20353
20354/// Helper structure to keep track of ISD::SET_CC operands.
20355struct GenericSetCCInfo {
20356 const SDValue *Opnd0;
20357 const SDValue *Opnd1;
20358 ISD::CondCode CC;
20359};
20360
20361/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
20362struct AArch64SetCCInfo {
20363 const SDValue *Cmp;
20364 AArch64CC::CondCode CC;
20365};
20366
20367/// Helper structure to keep track of SetCC information.
20368union SetCCInfo {
20369 GenericSetCCInfo Generic;
20370 AArch64SetCCInfo AArch64;
20371};
20372
20373 /// Helper structure to be able to read SetCC information. If the IsAArch64
20374 /// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
20375 /// GenericSetCCInfo.
20376struct SetCCInfoAndKind {
20377 SetCCInfo Info;
20378 bool IsAArch64;
20379};
20380
20381 /// Check whether or not \p Op is a SET_CC operation, either a generic or an
20382 /// AArch64 lowered one.
20383 /// \p SetCCInfo is filled accordingly.
20384 /// \post SetCCInfo is meaningful only when this function returns true.
20385 /// \return True when Op is a kind of SET_CC operation.
20387static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
20388 // If this is a setcc, this is straightforward.
20389 if (Op.getOpcode() == ISD::SETCC) {
20390 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(i: 0);
20391 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(i: 1);
20392 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
20393 SetCCInfo.IsAArch64 = false;
20394 return true;
20395 }
20396 // Otherwise, check if this is a matching csel instruction.
20397 // In other words:
20398 // - csel 1, 0, cc
20399 // - csel 0, 1, !cc
20400 if (Op.getOpcode() != AArch64ISD::CSEL)
20401 return false;
20402 // Set the information about the operands.
20403 // TODO: we want the operands of the Cmp, not the csel.
20404 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(i: 3);
20405 SetCCInfo.IsAArch64 = true;
20406 SetCCInfo.Info.AArch64.CC =
20407 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(i: 2));
20408
20409 // Check that the operands match the constraints:
20410 // (1) Both operands must be constants.
20411 // (2) One must be 1 and the other must be 0.
20412 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 0));
20413 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1));
20414
20415 // Check (1).
20416 if (!TValue || !FValue)
20417 return false;
20418
20419 // Check (2).
20420 if (!TValue->isOne()) {
20421 // Update the comparison when we are interested in !cc.
20422 std::swap(a&: TValue, b&: FValue);
20423 SetCCInfo.Info.AArch64.CC =
20424 AArch64CC::getInvertedCondCode(Code: SetCCInfo.Info.AArch64.CC);
20425 }
20426 return TValue->isOne() && FValue->isZero();
20427}
20428
20429// Returns true if Op is setcc or zext of setcc.
20430static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
20431 if (isSetCC(Op, SetCCInfo&: Info))
20432 return true;
20433 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
20434 isSetCC(Op: Op->getOperand(Num: 0), SetCCInfo&: Info));
20435}
20436
20437// The folding we want to perform is:
20438// (add x, [zext] (setcc cc ...) )
20439// -->
20440// (csel x, (add x, 1), !cc ...)
20441//
20442// The latter will get matched to a CSINC instruction.
20443static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
20444 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
20445 SDValue LHS = Op->getOperand(Num: 0);
20446 SDValue RHS = Op->getOperand(Num: 1);
20447 SetCCInfoAndKind InfoAndKind;
20448
20449 // If both operands are a SET_CC, then we don't want to perform this
20450 // folding and create another csel as this results in more instructions
20451 // (and higher register usage).
20452 if (isSetCCOrZExtSetCC(Op: LHS, Info&: InfoAndKind) &&
20453 isSetCCOrZExtSetCC(Op: RHS, Info&: InfoAndKind))
20454 return SDValue();
20455
20456 // If neither operand is a SET_CC, give up.
20457 if (!isSetCCOrZExtSetCC(Op: LHS, Info&: InfoAndKind)) {
20458 std::swap(a&: LHS, b&: RHS);
20459 if (!isSetCCOrZExtSetCC(Op: LHS, Info&: InfoAndKind))
20460 return SDValue();
20461 }
20462
20463 // FIXME: This could be generalized to work for FP comparisons.
20464 EVT CmpVT = InfoAndKind.IsAArch64
20465 ? InfoAndKind.Info.AArch64.Cmp->getOperand(i: 0).getValueType()
20466 : InfoAndKind.Info.Generic.Opnd0->getValueType();
20467 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
20468 return SDValue();
20469
20470 SDValue CCVal;
20471 SDValue Cmp;
20472 SDLoc DL(Op);
20473 if (InfoAndKind.IsAArch64) {
20474 CCVal = DAG.getConstant(
20475 Val: AArch64CC::getInvertedCondCode(Code: InfoAndKind.Info.AArch64.CC), DL,
20476 VT: MVT::i32);
20477 Cmp = *InfoAndKind.Info.AArch64.Cmp;
20478 } else
20479 Cmp = getAArch64Cmp(
20480 LHS: *InfoAndKind.Info.Generic.Opnd0, RHS: *InfoAndKind.Info.Generic.Opnd1,
20481 CC: ISD::getSetCCInverse(Operation: InfoAndKind.Info.Generic.CC, Type: CmpVT), AArch64cc&: CCVal, DAG,
20482 DL);
20483
20484 EVT VT = Op->getValueType(ResNo: 0);
20485 LHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: DAG.getConstant(Val: 1, DL, VT));
20486 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: RHS, N2: LHS, N3: CCVal, N4: Cmp);
20487}
20488
20489// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
20490static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
20491 EVT VT = N->getValueType(ResNo: 0);
20492 // Only handle scalar integer result types.
20493 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
20494 return SDValue();
20495
20496 SDValue LHS = N->getOperand(Num: 0);
20497 SDValue RHS = N->getOperand(Num: 1);
20498 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20499 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
20500 return SDValue();
20501
20502 auto *LHSN1 = dyn_cast<ConstantSDNode>(Val: LHS->getOperand(Num: 1));
20503 auto *RHSN1 = dyn_cast<ConstantSDNode>(Val: RHS->getOperand(Num: 1));
20504 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
20505 return SDValue();
20506
20507 SDValue Op1 = LHS->getOperand(Num: 0);
20508 SDValue Op2 = RHS->getOperand(Num: 0);
20509 EVT OpVT1 = Op1.getValueType();
20510 EVT OpVT2 = Op2.getValueType();
20511 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
20512 Op2.getOpcode() != AArch64ISD::UADDV ||
20513 OpVT1.getVectorElementType() != VT)
20514 return SDValue();
20515
20516 SDValue Val1 = Op1.getOperand(i: 0);
20517 SDValue Val2 = Op2.getOperand(i: 0);
20518 EVT ValVT = Val1->getValueType(ResNo: 0);
20519 SDLoc DL(N);
20520 SDValue AddVal = DAG.getNode(Opcode: ISD::ADD, DL, VT: ValVT, N1: Val1, N2: Val2);
20521 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT,
20522 N1: DAG.getNode(Opcode: AArch64ISD::UADDV, DL, VT: ValVT, Operand: AddVal),
20523 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
20524}
20525
20526/// Perform the scalar expression combine in the form of:
20527/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
20528/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
20529static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
20530 EVT VT = N->getValueType(ResNo: 0);
20531 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
20532 return SDValue();
20533
20534 SDValue LHS = N->getOperand(Num: 0);
20535 SDValue RHS = N->getOperand(Num: 1);
20536
20537 // Handle commutativity.
20538 if (LHS.getOpcode() != AArch64ISD::CSEL &&
20539 LHS.getOpcode() != AArch64ISD::CSNEG) {
20540 std::swap(a&: LHS, b&: RHS);
20541 if (LHS.getOpcode() != AArch64ISD::CSEL &&
20542 LHS.getOpcode() != AArch64ISD::CSNEG) {
20543 return SDValue();
20544 }
20545 }
20546
20547 if (!LHS.hasOneUse())
20548 return SDValue();
20549
20550 AArch64CC::CondCode AArch64CC =
20551 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(i: 2));
20552
20553 // The CSEL should have a constant one operand, and the CSNEG should have a
20554 // constant one or negative-one operand.
20555 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 0));
20556 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
20557 if (!CTVal || !CFVal)
20558 return SDValue();
20559
20560 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
20561 (CTVal->isOne() || CFVal->isOne())) &&
20562 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
20563 (CTVal->isOne() || CFVal->isAllOnes())))
20564 return SDValue();
20565
20566 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
20567 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
20568 !CFVal->isOne()) {
20569 std::swap(a&: CTVal, b&: CFVal);
20570 AArch64CC = AArch64CC::getInvertedCondCode(Code: AArch64CC);
20571 }
20572
20573 SDLoc DL(N);
20574 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
20575 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
20576 !CFVal->isAllOnes()) {
20577 APInt C = -1 * CFVal->getAPIntValue();
20578 CTVal = cast<ConstantSDNode>(Val: DAG.getConstant(Val: C, DL, VT));
20579 CFVal = cast<ConstantSDNode>(Val: DAG.getAllOnesConstant(DL, VT));
20580 AArch64CC = AArch64CC::getInvertedCondCode(Code: AArch64CC);
20581 }
20582
20583 // It might be neutral for larger constants, as the immediate needs to be
20584 // materialized in a register.
20585 APInt ADDC = CTVal->getAPIntValue();
20586 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20587 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
20588 return SDValue();
20589
20590 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
20591 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
20592 "Unexpected constant value");
20593
20594 SDValue NewNode = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: SDValue(CTVal, 0));
20595 SDValue CCVal = DAG.getConstant(Val: AArch64CC, DL, VT: MVT::i32);
20596 SDValue Cmp = LHS.getOperand(i: 3);
20597
20598 return DAG.getNode(Opcode: AArch64ISD::CSINC, DL, VT, N1: NewNode, N2: RHS, N3: CCVal, N4: Cmp);
20599}
20600
20601// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
20602static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
20603 EVT VT = N->getValueType(ResNo: 0);
20604 if (N->getOpcode() != ISD::ADD)
20605 return SDValue();
20606
20607 SDValue Dot = N->getOperand(Num: 0);
20608 SDValue A = N->getOperand(Num: 1);
20609 // Handle commutativity.
20610 auto isZeroDot = [](SDValue Dot) {
20611 return (Dot.getOpcode() == AArch64ISD::UDOT ||
20612 Dot.getOpcode() == AArch64ISD::SDOT) &&
20613 isZerosVector(N: Dot.getOperand(i: 0).getNode());
20614 };
20615 if (!isZeroDot(Dot))
20616 std::swap(a&: Dot, b&: A);
20617 if (!isZeroDot(Dot))
20618 return SDValue();
20619
20620 return DAG.getNode(Opcode: Dot.getOpcode(), DL: SDLoc(N), VT, N1: A, N2: Dot.getOperand(i: 1),
20621 N3: Dot.getOperand(i: 2));
20622}
20623
20624static bool isNegatedInteger(SDValue Op) {
20625 return Op.getOpcode() == ISD::SUB && isNullConstant(V: Op.getOperand(i: 0));
20626}
20627
20628static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
20629 SDLoc DL(Op);
20630 EVT VT = Op.getValueType();
20631 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
20632 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero, N2: Op);
20633}
20634
20635// Try to fold
20636//
20637// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
20638//
20639// The folding helps csel to be matched with csneg without generating
20640// redundant neg instruction, which includes negation of the csel expansion
20641// of abs node lowered by lowerABS.
20642static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
20643 if (!isNegatedInteger(Op: SDValue(N, 0)))
20644 return SDValue();
20645
20646 SDValue CSel = N->getOperand(Num: 1);
20647 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
20648 return SDValue();
20649
20650 SDValue N0 = CSel.getOperand(i: 0);
20651 SDValue N1 = CSel.getOperand(i: 1);
20652
20653 // If neither of them is a negation, the folding isn't worthwhile as it
20654 // introduces two additional negations while removing only one.
20655 if (!isNegatedInteger(Op: N0) && !isNegatedInteger(Op: N1))
20656 return SDValue();
20657
20658 SDValue N0N = getNegatedInteger(Op: N0, DAG);
20659 SDValue N1N = getNegatedInteger(Op: N1, DAG);
20660
20661 SDLoc DL(N);
20662 EVT VT = CSel.getValueType();
20663 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: N0N, N2: N1N, N3: CSel.getOperand(i: 2),
20664 N4: CSel.getOperand(i: 3));
20665}
20666
20667// The basic add/sub long vector instructions have variants with "2" on the end
20668// which act on the high-half of their inputs. They are normally matched by
20669// patterns like:
20670//
20671// (add (zeroext (extract_high LHS)),
20672// (zeroext (extract_high RHS)))
20673// -> uaddl2 vD, vN, vM
20674//
20675// However, if one of the extracts is something like a duplicate, this
20676// instruction can still be used profitably. This function puts the DAG into a
20677// more appropriate form for those patterns to trigger.
20678static SDValue performAddSubLongCombine(SDNode *N,
20679 TargetLowering::DAGCombinerInfo &DCI) {
20680 SelectionDAG &DAG = DCI.DAG;
20681 if (DCI.isBeforeLegalizeOps())
20682 return SDValue();
20683
20684 MVT VT = N->getSimpleValueType(ResNo: 0);
20685 if (!VT.is128BitVector()) {
20686 if (N->getOpcode() == ISD::ADD)
20687 return performSetccAddFolding(Op: N, DAG);
20688 return SDValue();
20689 }
20690
20691 // Make sure both branches are extended in the same way.
20692 SDValue LHS = N->getOperand(Num: 0);
20693 SDValue RHS = N->getOperand(Num: 1);
20694 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
20695 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
20696 LHS.getOpcode() != RHS.getOpcode())
20697 return SDValue();
20698
20699 unsigned ExtType = LHS.getOpcode();
20700
20701 // It's not worth doing if at least one of the inputs isn't already an
20702 // extract, but we don't know which it'll be so we have to try both.
20703 if (isEssentiallyExtractHighSubvector(N: LHS.getOperand(i: 0))) {
20704 RHS = tryExtendDUPToExtractHigh(N: RHS.getOperand(i: 0), DAG);
20705 if (!RHS.getNode())
20706 return SDValue();
20707
20708 RHS = DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT, Operand: RHS);
20709 } else if (isEssentiallyExtractHighSubvector(N: RHS.getOperand(i: 0))) {
20710 LHS = tryExtendDUPToExtractHigh(N: LHS.getOperand(i: 0), DAG);
20711 if (!LHS.getNode())
20712 return SDValue();
20713
20714 LHS = DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT, Operand: LHS);
20715 }
20716
20717 return DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT, N1: LHS, N2: RHS);
20718}
20719
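// A SUBS node whose integer result is unused acts as a compare: only the flags
// are consumed.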
20720static bool isCMP(SDValue Op) {
20721 return Op.getOpcode() == AArch64ISD::SUBS &&
20722 !Op.getNode()->hasAnyUseOfValue(Value: 0);
20723}
20724
20725// (CSEL 1 0 CC Cond) => CC
20726// (CSEL 0 1 CC Cond) => !CC
20727static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
20728 if (Op.getOpcode() != AArch64ISD::CSEL)
20729 return std::nullopt;
20730 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(i: 2));
20731 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
20732 return std::nullopt;
20733 SDValue OpLHS = Op.getOperand(i: 0);
20734 SDValue OpRHS = Op.getOperand(i: 1);
20735 if (isOneConstant(V: OpLHS) && isNullConstant(V: OpRHS))
20736 return CC;
20737 if (isNullConstant(V: OpLHS) && isOneConstant(V: OpRHS))
20738 return getInvertedCondCode(Code: CC);
20739
20740 return std::nullopt;
20741}
20742
20743// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
20744// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
20745static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
20746 SDValue CmpOp = Op->getOperand(Num: 2);
20747 if (!isCMP(Op: CmpOp))
20748 return SDValue();
20749
20750 if (IsAdd) {
20751 if (!isOneConstant(V: CmpOp.getOperand(i: 1)))
20752 return SDValue();
20753 } else {
20754 if (!isNullConstant(V: CmpOp.getOperand(i: 0)))
20755 return SDValue();
20756 }
20757
20758 SDValue CsetOp = CmpOp->getOperand(Num: IsAdd ? 0 : 1);
20759 auto CC = getCSETCondCode(Op: CsetOp);
20760 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
20761 return SDValue();
20762
20763 return DAG.getNode(Opcode: Op->getOpcode(), DL: SDLoc(Op), VTList: Op->getVTList(),
20764 N1: Op->getOperand(Num: 0), N2: Op->getOperand(Num: 1),
20765 N3: CsetOp.getOperand(i: 3));
20766}
20767
20768// (ADC x 0 cond) => (CINC x HS cond)
20769static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
20770 SDValue LHS = N->getOperand(Num: 0);
20771 SDValue RHS = N->getOperand(Num: 1);
20772 SDValue Cond = N->getOperand(Num: 2);
20773
20774 if (!isNullConstant(V: RHS))
20775 return SDValue();
20776
20777 EVT VT = N->getValueType(ResNo: 0);
20778 SDLoc DL(N);
20779
20780 // (CINC x cc cond) <=> (CSINC x x !cc cond)
20781 SDValue CC = DAG.getConstant(Val: AArch64CC::LO, DL, VT: MVT::i32);
20782 return DAG.getNode(Opcode: AArch64ISD::CSINC, DL, VT, N1: LHS, N2: LHS, N3: CC, N4: Cond);
20783}
20784
20785static SDValue performBuildVectorCombine(SDNode *N,
20786 TargetLowering::DAGCombinerInfo &DCI,
20787 SelectionDAG &DAG) {
20788 SDLoc DL(N);
20789 EVT VT = N->getValueType(ResNo: 0);
20790
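 // Look for a v4f16/v4bf16 build_vector whose elements are fp_rounds of the
 // lanes of one or two v2f64 sources. This can instead be done as FCVTXN to
 // v2f32, a concat to v4f32 and a single fp_round of the whole vector.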
20791 if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
20792 (VT == MVT::v4f16 || VT == MVT::v4bf16)) {
20793 SDValue Elt0 = N->getOperand(Num: 0), Elt1 = N->getOperand(Num: 1),
20794 Elt2 = N->getOperand(Num: 2), Elt3 = N->getOperand(Num: 3);
20795 if (Elt0->getOpcode() == ISD::FP_ROUND &&
20796 Elt1->getOpcode() == ISD::FP_ROUND &&
20797 isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 1)) &&
20798 isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 1)) &&
20799 Elt0->getConstantOperandVal(Num: 1) == Elt1->getConstantOperandVal(Num: 1) &&
20800 Elt0->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20801 Elt1->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20802 // Constant index.
20803 isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 0)->getOperand(Num: 1)) &&
20804 isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 0)->getOperand(Num: 1)) &&
20805 Elt0->getOperand(Num: 0)->getOperand(Num: 0) ==
20806 Elt1->getOperand(Num: 0)->getOperand(Num: 0) &&
20807 Elt0->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 0 &&
20808 Elt1->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 1) {
20809 SDValue LowLanesSrcVec = Elt0->getOperand(Num: 0)->getOperand(Num: 0);
20810 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
20811 SDValue HighLanes;
20812 if (Elt2->getOpcode() == ISD::UNDEF &&
20813 Elt3->getOpcode() == ISD::UNDEF) {
20814 HighLanes = DAG.getUNDEF(VT: MVT::v2f32);
20815 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
20816 Elt3->getOpcode() == ISD::FP_ROUND &&
20817 isa<ConstantSDNode>(Val: Elt2->getOperand(Num: 1)) &&
20818 isa<ConstantSDNode>(Val: Elt3->getOperand(Num: 1)) &&
20819 Elt2->getConstantOperandVal(Num: 1) ==
20820 Elt3->getConstantOperandVal(Num: 1) &&
20821 Elt2->getOperand(Num: 0)->getOpcode() ==
20822 ISD::EXTRACT_VECTOR_ELT &&
20823 Elt3->getOperand(Num: 0)->getOpcode() ==
20824 ISD::EXTRACT_VECTOR_ELT &&
20825 // Constant index.
20826 isa<ConstantSDNode>(Val: Elt2->getOperand(Num: 0)->getOperand(Num: 1)) &&
20827 isa<ConstantSDNode>(Val: Elt3->getOperand(Num: 0)->getOperand(Num: 1)) &&
20828 Elt2->getOperand(Num: 0)->getOperand(Num: 0) ==
20829 Elt3->getOperand(Num: 0)->getOperand(Num: 0) &&
20830 Elt2->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 0 &&
20831 Elt3->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 1) {
20832 SDValue HighLanesSrcVec = Elt2->getOperand(Num: 0)->getOperand(Num: 0);
20833 HighLanes =
20834 DAG.getNode(Opcode: AArch64ISD::FCVTXN, DL, VT: MVT::v2f32, Operand: HighLanesSrcVec);
20835 }
20836 if (HighLanes) {
20837 SDValue DoubleToSingleSticky =
20838 DAG.getNode(Opcode: AArch64ISD::FCVTXN, DL, VT: MVT::v2f32, Operand: LowLanesSrcVec);
20839 SDValue Concat = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v4f32,
20840 N1: DoubleToSingleSticky, N2: HighLanes);
20841 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Concat,
20842 N2: Elt0->getOperand(Num: 1));
20843 }
20844 }
20845 }
20846 }
20847
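 // Look for a v2f64 build_vector whose elements are fp_extends of two adjacent
 // lanes of a v4f16/v4bf16 source. This can be done by extending the whole
 // source to v4f32 and then fp_extending the relevant half to v2f64.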
20848 if (VT == MVT::v2f64) {
20849 SDValue Elt0 = N->getOperand(Num: 0), Elt1 = N->getOperand(Num: 1);
20850 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
20851 Elt1->getOpcode() == ISD::FP_EXTEND &&
20852 Elt0->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20853 Elt1->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20854 Elt0->getOperand(Num: 0)->getOperand(Num: 0) ==
20855 Elt1->getOperand(Num: 0)->getOperand(Num: 0) &&
20856 // Constant index.
20857 isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 0)->getOperand(Num: 1)) &&
20858 isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 0)->getOperand(Num: 1)) &&
20859 Elt0->getOperand(Num: 0)->getConstantOperandVal(Num: 1) + 1 ==
20860 Elt1->getOperand(Num: 0)->getConstantOperandVal(Num: 1) &&
20861 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
20862 // ResultType's known minimum vector length.
20863 Elt0->getOperand(Num: 0)->getConstantOperandVal(Num: 1) %
20864 VT.getVectorMinNumElements() ==
20865 0) {
20866 SDValue SrcVec = Elt0->getOperand(Num: 0)->getOperand(Num: 0);
20867 if (SrcVec.getValueType() == MVT::v4f16 ||
20868 SrcVec.getValueType() == MVT::v4bf16) {
20869 SDValue HalfToSingle =
20870 DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::v4f32, Operand: SrcVec);
20871 SDValue SubvectorIdx = Elt0->getOperand(Num: 0)->getOperand(Num: 1);
20872 SDValue Extract = DAG.getNode(
20873 Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: VT.changeVectorElementType(EltVT: MVT::f32),
20874 N1: HalfToSingle, N2: SubvectorIdx);
20875 return DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT, Operand: Extract);
20876 }
20877 }
20878 }
20879
20880 // A build vector of two extracted elements is equivalent to an
20881 // extract subvector where the inner vector is any-extended to the
20882 // extract_vector_elt VT.
20883 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
20884 // (extract_elt_iXX_to_i32 vec Idx+1))
20885 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
20886
20887 // For now, only consider the v2i32 case, which arises as a result of
20888 // legalization.
20889 if (VT != MVT::v2i32)
20890 return SDValue();
20891
20892 SDValue Elt0 = N->getOperand(Num: 0), Elt1 = N->getOperand(Num: 1);
20893 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
20894 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20895 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20896 // Constant index.
20897 isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 1)) &&
20898 isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 1)) &&
20899 // Both EXTRACT_VECTOR_ELT from same vector...
20900 Elt0->getOperand(Num: 0) == Elt1->getOperand(Num: 0) &&
20901 // ... and contiguous. First element's index +1 == second element's index.
20902 Elt0->getConstantOperandVal(Num: 1) + 1 == Elt1->getConstantOperandVal(Num: 1) &&
20903 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
20904 // ResultType's known minimum vector length.
20905 Elt0->getConstantOperandVal(Num: 1) % VT.getVectorMinNumElements() == 0) {
20906 SDValue VecToExtend = Elt0->getOperand(Num: 0);
20907 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(EltVT: MVT::i32);
20908 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: ExtVT))
20909 return SDValue();
20910
20911 SDValue SubvectorIdx =
        DAG.getVectorIdxConstant(Val: Elt0->getConstantOperandVal(Num: 1), DL);
20912
20913 SDValue Ext = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: ExtVT, Operand: VecToExtend);
20914 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v2i32, N1: Ext,
20915 N2: SubvectorIdx);
20916 }
20917
20918 return SDValue();
20919}
20920
20921static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
20922 TargetLowering::DAGCombinerInfo &DCI) {
20923 SDLoc DL(N);
20924 EVT VT = N->getValueType(ResNo: 0);
20925 SDValue N0 = N->getOperand(Num: 0);
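 // Fold trunc(dup(x)) into a dup of the narrower type, truncating the scalar
 // operand when necessary.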
20926 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
20927 N0.getOpcode() == AArch64ISD::DUP) {
20928 SDValue Op = N0.getOperand(i: 0);
20929 if (VT.getScalarType() == MVT::i32 &&
20930 N0.getOperand(i: 0).getValueType().getScalarType() == MVT::i64)
20931 Op = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i32, Operand: Op);
20932 return DAG.getNode(Opcode: N0.getOpcode(), DL, VT, Operand: Op);
20933 }
20934
20935 // Performing the following combine produces a preferable form for ISEL.
20936 // i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2)
20937 if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20938 N0.hasOneUse()) {
20939 SDValue Op = N0.getOperand(i: 0);
20940 SDValue ExtractIndexNode = N0.getOperand(i: 1);
20941 if (!isa<ConstantSDNode>(Val: ExtractIndexNode))
20942 return SDValue();
20943
20944 // For a legal DAG, EXTRACT_VECTOR_ELT can only have produced an i32 or i64.
20945 // So we can only expect: i32 (trunc (i64 (extract Vi64, idx))).
20946 assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
20947 "Unexpected legalisation result!");
20948
20949 EVT SrcVectorType = Op.getValueType();
20950 // We also assume that SrcVectorType cannot be a V64 (see
20951 // LowerEXTRACT_VECTOR_ELT).
20952 assert((SrcVectorType == MVT::v2i64 || SrcVectorType == MVT::nxv2i64) &&
20953 "Unexpected legalisation result!");
20954
20955 unsigned ExtractIndex =
20956 cast<ConstantSDNode>(Val&: ExtractIndexNode)->getZExtValue();
20957 MVT CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
20958
20959 Op = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: CastVT, Operand: Op);
20960 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: Op,
20961 N2: DAG.getVectorIdxConstant(Val: ExtractIndex * 2, DL));
20962 }
20963
20964 return SDValue();
20965}
20966
20967 // Check whether a node is an extend or shift operand.
20968static bool isExtendOrShiftOperand(SDValue N) {
20969 unsigned Opcode = N.getOpcode();
20970 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
20971 EVT SrcVT;
20972 if (Opcode == ISD::SIGN_EXTEND_INREG)
20973 SrcVT = cast<VTSDNode>(Val: N.getOperand(i: 1))->getVT();
20974 else
20975 SrcVT = N.getOperand(i: 0).getValueType();
20976
20977 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
20978 } else if (Opcode == ISD::AND) {
20979 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1));
20980 if (!CSD)
20981 return false;
20982 uint64_t AndMask = CSD->getZExtValue();
20983 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
20984 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
20985 return isa<ConstantSDNode>(Val: N.getOperand(i: 1));
20986 }
20987
20988 return false;
20989}
20990
20991// (N - Y) + Z --> (Z - Y) + N
20992// when N is an extend or shift operand
20993static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
20994 SelectionDAG &DAG) {
20995 auto IsOneUseExtend = [](SDValue N) {
20996 return N.hasOneUse() && isExtendOrShiftOperand(N);
20997 };
20998
20999 // DAGCombiner will revert the combination when Z is a constant, causing an
21000 // infinite loop, so don't enable the combination when Z is constant.
21001 // Likewise, if Z is a one-use extend or shift we can't do the optimization;
21002 // it would also fall into an infinite loop.
21003 if (isa<ConstantSDNode>(Val: Z) || IsOneUseExtend(Z))
21004 return SDValue();
21005
21006 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
21007 return SDValue();
21008
21009 SDValue Shift = SUB.getOperand(i: 0);
21010 if (!IsOneUseExtend(Shift))
21011 return SDValue();
21012
21013 SDLoc DL(N);
21014 EVT VT = N->getValueType(ResNo: 0);
21015
21016 SDValue Y = SUB.getOperand(i: 1);
21017 SDValue NewSub = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Z, N2: Y);
21018 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: NewSub, N2: Shift);
21019}
21020
21021static SDValue performAddCombineForShiftedOperands(SDNode *N,
21022 SelectionDAG &DAG) {
21023 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
21024 // commutative.
21025 if (N->getOpcode() != ISD::ADD)
21026 return SDValue();
21027
21028 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
21029 // shifted register is only available for i32 and i64.
21030 EVT VT = N->getValueType(ResNo: 0);
21031 if (VT != MVT::i32 && VT != MVT::i64)
21032 return SDValue();
21033
21034 SDLoc DL(N);
21035 SDValue LHS = N->getOperand(Num: 0);
21036 SDValue RHS = N->getOperand(Num: 1);
21037
21038 if (SDValue Val = performAddCombineSubShift(N, SUB: LHS, Z: RHS, DAG))
21039 return Val;
21040 if (SDValue Val = performAddCombineSubShift(N, SUB: RHS, Z: LHS, DAG))
21041 return Val;
21042
21043 uint64_t LHSImm = 0, RHSImm = 0;
21044 // If both operands are shifted by an immediate and the shift amount is not
21045 // greater than 4 for one operand, swap LHS and RHS to put the operand with
21046 // the smaller shift amount on the RHS.
21047 //
21048 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
21049 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
21050 // with LSL (shift > 4). For other processors, this is a no-op with respect to
21051 // both performance and correctness.
21052 if (isOpcWithIntImmediate(N: LHS.getNode(), Opc: ISD::SHL, Imm&: LHSImm) &&
21053 isOpcWithIntImmediate(N: RHS.getNode(), Opc: ISD::SHL, Imm&: RHSImm) && LHSImm <= 4 &&
21054 RHSImm > 4 && LHS.hasOneUse())
21055 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: LHS);
21056
21057 return SDValue();
21058}
21059
21060 // The middle end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2)).
21061 // This reassociates it back to allow the creation of more mls instructions.
21062static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
21063 if (N->getOpcode() != ISD::SUB)
21064 return SDValue();
21065
21066 SDValue Add = N->getOperand(Num: 1);
21067 SDValue X = N->getOperand(Num: 0);
21068 if (Add.getOpcode() != ISD::ADD)
21069 return SDValue();
21070
21071 if (!Add.hasOneUse())
21072 return SDValue();
21073 if (DAG.isConstantIntBuildVectorOrConstantInt(N: X))
21074 return SDValue();
21075
21076 SDValue M1 = Add.getOperand(i: 0);
21077 SDValue M2 = Add.getOperand(i: 1);
21078 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
21079 M1.getOpcode() != AArch64ISD::UMULL)
21080 return SDValue();
21081 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
21082 M2.getOpcode() != AArch64ISD::UMULL)
21083 return SDValue();
21084
21085 EVT VT = N->getValueType(ResNo: 0);
21086 SDValue Sub = DAG.getNode(Opcode: ISD::SUB, DL: SDLoc(N), VT, N1: X, N2: M1);
21087 return DAG.getNode(Opcode: ISD::SUB, DL: SDLoc(N), VT, N1: Sub, N2: M2);
21088}
21089
21090// Combine into mla/mls.
21091// This works on the patterns of:
21092// add v1, (mul v2, v3)
21093// sub v1, (mul v2, v3)
21094// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
21095// It will transform the add/sub to a scalable version, so that we can
21096 // make use of SVE's MLA/MLS that will be generated for that pattern.
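// A rough sketch of the rewrite (operand names are made up):
//   add(v2i64 x, extract_subvector(MUL_PRED(pg, a, b), 0))
//     --> extract_subvector(add(insert_subvector(undef, x, 0),
//                               MUL_PRED(pg, a, b)), 0)
// so that isel can match the scalable add/sub of a predicated multiply as
// MLA/MLS.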
21097static SDValue
21098performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
21099 SelectionDAG &DAG = DCI.DAG;
21100 // Make sure that the types are legal
21101 if (!DCI.isAfterLegalizeDAG())
21102 return SDValue();
21103 // Before using SVE's features, check first if it's available.
21104 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
21105 return SDValue();
21106
21107 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
21108 return SDValue();
21109
21110 if (!N->getValueType(ResNo: 0).isFixedLengthVector())
21111 return SDValue();
21112
21113 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
21114 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
21115 return SDValue();
21116
21117 if (!cast<ConstantSDNode>(Val: Op1->getOperand(Num: 1))->isZero())
21118 return SDValue();
21119
21120 SDValue MulValue = Op1->getOperand(Num: 0);
21121 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
21122 return SDValue();
21123
21124 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
21125 return SDValue();
21126
21127 EVT ScalableVT = MulValue.getValueType();
21128 if (!ScalableVT.isScalableVector())
21129 return SDValue();
21130
21131 SDValue ScaledOp = convertToScalableVector(DAG, VT: ScalableVT, V: Op0);
21132 SDValue NewValue =
21133 DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT: ScalableVT, Ops: {ScaledOp, MulValue});
21134 return convertFromScalableVector(DAG, VT: N->getValueType(ResNo: 0), V: NewValue);
21135 };
21136
21137 if (SDValue res = performOpt(N->getOperand(Num: 0), N->getOperand(Num: 1)))
21138 return res;
21139 else if (N->getOpcode() == ISD::ADD)
21140 return performOpt(N->getOperand(Num: 1), N->getOperand(Num: 0));
21141
21142 return SDValue();
21143}
21144
21145 // Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
21146// help, for example, to produce ssra from sshr+add.
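// For example (illustrative):
//   add(i64 extract_elt(v1i64 sshr(v, #c), 0), i64 load p)
//     --> extract_elt(add(v1i64 sshr(v, #c), scalar_to_vector(load p)), 0)
// keeping the computation in the vector domain so that SSRA can be formed.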
21147static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
21148 EVT VT = N->getValueType(ResNo: 0);
21149 if (VT != MVT::i64 ||
21150 DAG.getTargetLoweringInfo().isOperationExpand(Op: N->getOpcode(), VT: MVT::v1i64))
21151 return SDValue();
21152 SDValue Op0 = N->getOperand(Num: 0);
21153 SDValue Op1 = N->getOperand(Num: 1);
21154
21155 // At least one of the operands should be an extract, and the other should be
21156 // something that is easy to convert to v1i64 type (in this case a load).
21157 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21158 Op0.getOpcode() != ISD::LOAD)
21159 return SDValue();
21160 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21161 Op1.getOpcode() != ISD::LOAD)
21162 return SDValue();
21163
21164 SDLoc DL(N);
21165 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21166 Op0.getOperand(i: 0).getValueType() == MVT::v1i64) {
21167 Op0 = Op0.getOperand(i: 0);
21168 Op1 = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: MVT::v1i64, Operand: Op1);
21169 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21170 Op1.getOperand(i: 0).getValueType() == MVT::v1i64) {
21171 Op0 = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: MVT::v1i64, Operand: Op0);
21172 Op1 = Op1.getOperand(i: 0);
21173 } else
21174 return SDValue();
21175
21176 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i64,
21177 N1: DAG.getNode(Opcode: N->getOpcode(), DL, VT: MVT::v1i64, N1: Op0, N2: Op1),
21178 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
21179}
21180
21181static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
21182 SDValue BV = peekThroughOneUseBitcasts(V: B);
21183 if (!BV->hasOneUse())
21184 return false;
21185 if (auto *Ld = dyn_cast<LoadSDNode>(Val&: BV)) {
21186 if (!Ld || !Ld->isSimple())
21187 return false;
21188 Loads.push_back(Elt: Ld);
21189 return true;
21190 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
21191 BV.getOpcode() == ISD::CONCAT_VECTORS) {
21192 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
21193 auto *Ld = dyn_cast<LoadSDNode>(Val: BV.getOperand(i: Op));
21194 if (!Ld || !Ld->isSimple() || !BV.getOperand(i: Op).hasOneUse())
21195 return false;
21196 Loads.push_back(Elt: Ld);
21197 }
21198 return true;
21199 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
21200 // Try to find a tree of shuffles and concats from how IR shuffles of loads
21201 // are lowered. Note that this only comes up because we do not always visit
21202 // operands before uses. After that is fixed this can be removed and in the
21203 // meantime this is fairly specific to the lowering we expect from IR.
21204 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
21205 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
21206 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
21207 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
21208 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
21209 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
21210 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
21211 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
21212 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
21213 if (B.getOperand(i: 0).getOpcode() != ISD::VECTOR_SHUFFLE ||
21214 B.getOperand(i: 0).getOperand(i: 0).getOpcode() != ISD::CONCAT_VECTORS ||
21215 B.getOperand(i: 0).getOperand(i: 1).getOpcode() != ISD::CONCAT_VECTORS ||
21216 B.getOperand(i: 1).getOpcode() != ISD::CONCAT_VECTORS ||
21217 B.getOperand(i: 1).getNumOperands() != 4)
21218 return false;
21219 auto SV1 = cast<ShuffleVectorSDNode>(Val&: B);
21220 auto SV2 = cast<ShuffleVectorSDNode>(Val: B.getOperand(i: 0));
21221 int NumElts = B.getValueType().getVectorNumElements();
21222 int NumSubElts = NumElts / 4;
21223 for (int I = 0; I < NumSubElts; I++) {
21224 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
21225 if (SV1->getMaskElt(Idx: I) != I ||
21226 SV1->getMaskElt(Idx: I + NumSubElts) != I + NumSubElts ||
21227 SV1->getMaskElt(Idx: I + NumSubElts * 2) != I + NumSubElts * 2 ||
21228 SV1->getMaskElt(Idx: I + NumSubElts * 3) != I + NumElts)
21229 return false;
21230 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
21231 if (SV2->getMaskElt(Idx: I) != I ||
21232 SV2->getMaskElt(Idx: I + NumSubElts) != I + NumSubElts ||
21233 SV2->getMaskElt(Idx: I + NumSubElts * 2) != I + NumElts)
21234 return false;
21235 }
21236 auto *Ld0 = dyn_cast<LoadSDNode>(Val: SV2->getOperand(Num: 0).getOperand(i: 0));
21237 auto *Ld1 = dyn_cast<LoadSDNode>(Val: SV2->getOperand(Num: 0).getOperand(i: 1));
21238 auto *Ld2 = dyn_cast<LoadSDNode>(Val: SV2->getOperand(Num: 1).getOperand(i: 0));
21239 auto *Ld3 = dyn_cast<LoadSDNode>(Val: B.getOperand(i: 1).getOperand(i: 0));
21240 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
21241 !Ld2->isSimple() || !Ld3->isSimple())
21242 return false;
21243 Loads.push_back(Elt: Ld0);
21244 Loads.push_back(Elt: Ld1);
21245 Loads.push_back(Elt: Ld2);
21246 Loads.push_back(Elt: Ld3);
21247 return true;
21248 }
21249 return false;
21250}
21251
21252static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
21253 SelectionDAG &DAG,
21254 unsigned &NumSubLoads) {
21255 if (!Op0.hasOneUse() || !Op1.hasOneUse())
21256 return false;
21257
21258 SmallVector<LoadSDNode *> Loads0, Loads1;
21259 if (isLoadOrMultipleLoads(B: Op0, Loads&: Loads0) &&
21260 isLoadOrMultipleLoads(B: Op1, Loads&: Loads1)) {
21261 if (NumSubLoads && Loads0.size() != NumSubLoads)
21262 return false;
21263 NumSubLoads = Loads0.size();
21264 return Loads0.size() == Loads1.size() &&
21265 all_of(Range: zip(t&: Loads0, u&: Loads1), P: [&DAG](auto L) {
21266 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
21267 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
21268 DAG.areNonVolatileConsecutiveLoads(LD: get<1>(L), Base: get<0>(L),
21269 Bytes: Size / 8, Dist: 1);
21270 });
21271 }
21272
21273 if (Op0.getOpcode() != Op1.getOpcode())
21274 return false;
21275
21276 switch (Op0.getOpcode()) {
21277 case ISD::ADD:
21278 case ISD::SUB:
21279 return areLoadedOffsetButOtherwiseSame(Op0: Op0.getOperand(i: 0), Op1: Op1.getOperand(i: 0),
21280 DAG, NumSubLoads) &&
21281 areLoadedOffsetButOtherwiseSame(Op0: Op0.getOperand(i: 1), Op1: Op1.getOperand(i: 1),
21282 DAG, NumSubLoads);
21283 case ISD::SIGN_EXTEND:
21284 case ISD::ANY_EXTEND:
21285 case ISD::ZERO_EXTEND:
21286 EVT XVT = Op0.getOperand(i: 0).getValueType();
21287 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
21288 XVT.getScalarSizeInBits() != 32)
21289 return false;
21290 return areLoadedOffsetButOtherwiseSame(Op0: Op0.getOperand(i: 0), Op1: Op1.getOperand(i: 0),
21291 DAG, NumSubLoads);
21292 }
21293 return false;
21294}
21295
21296 // This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
21297 // into a single load of twice the size, from which we extract the bottom and
21298 // top parts so that the shl can use a shll2 instruction. The two loads in that
21299 // example can also be larger trees of instructions, which are identical except
21300 // for the leaves, which are all loads offset from the LHS, including
21301 // buildvectors of multiple loads. For example, the RHS tree could be
21302 // sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4))).
21303 // Whilst it can be common for the larger loads to replace LDP instructions
21304 // (which doesn't gain anything on its own), the larger loads can help create
21305 // more efficient code, and in buildvectors prevent the need for ld1 lane
21306 // inserts, which can be slower than normal loads.
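// An illustrative sketch, assuming v8i8 loads feeding a v8i16 add:
//   add(zext(v8i8 load p), shl(zext(v8i8 load p+8), splat(C)))
//     --> t = v16i8 load p
//         add(zext(low half of t), shl(zext(high half of t), splat(C)))
// so only one wide load is issued and the high-half extend can become ushll2.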
21307static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
21308 EVT VT = N->getValueType(ResNo: 0);
21309 if (!VT.isFixedLengthVector() ||
21310 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
21311 VT.getScalarSizeInBits() != 64))
21312 return SDValue();
21313
21314 SDValue Other = N->getOperand(Num: 0);
21315 SDValue Shift = N->getOperand(Num: 1);
21316 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
21317 std::swap(a&: Shift, b&: Other);
21318 APInt ShiftAmt;
21319 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
21320 !ISD::isConstantSplatVector(N: Shift.getOperand(i: 1).getNode(), SplatValue&: ShiftAmt))
21321 return SDValue();
21322
21323 if (!ISD::isExtOpcode(Opcode: Shift.getOperand(i: 0).getOpcode()) ||
21324 !ISD::isExtOpcode(Opcode: Other.getOpcode()) ||
21325 Shift.getOperand(i: 0).getOperand(i: 0).getValueType() !=
21326 Other.getOperand(i: 0).getValueType() ||
21327 !Other.hasOneUse() || !Shift.getOperand(i: 0).hasOneUse())
21328 return SDValue();
21329
21330 SDValue Op0 = Other.getOperand(i: 0);
21331 SDValue Op1 = Shift.getOperand(i: 0).getOperand(i: 0);
21332
21333 unsigned NumSubLoads = 0;
21334 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
21335 return SDValue();
21336
21337 // Attempt to rule out some unprofitable cases using heuristics (some working
21338 // around suboptimal code generation), notably if the extend would not be able
21339 // to use ushll2 instructions because the types are not large enough. Otherwise
21340 // zips will need to be created, which can increase the instruction count.
21341 unsigned NumElts = Op0.getValueType().getVectorNumElements();
21342 unsigned NumSubElts = NumElts / NumSubLoads;
21343 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
21344 (Other.getOpcode() != Shift.getOperand(i: 0).getOpcode() &&
21345 Op0.getValueType().getSizeInBits() < 128 &&
21346 !DAG.getTargetLoweringInfo().isTypeLegal(VT: Op0.getValueType())))
21347 return SDValue();
21348
21349 // Recreate the tree with the new combined loads.
21350 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
21351 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
21352 EVT DVT =
21353 Op0.getValueType().getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
21354
21355 SmallVector<LoadSDNode *> Loads0, Loads1;
21356 if (isLoadOrMultipleLoads(B: Op0, Loads&: Loads0) &&
21357 isLoadOrMultipleLoads(B: Op1, Loads&: Loads1)) {
21358 EVT LoadVT = EVT::getVectorVT(
21359 Context&: *DAG.getContext(), VT: Op0.getValueType().getScalarType(),
21360 NumElements: Op0.getValueType().getVectorNumElements() / Loads0.size());
21361 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
21362
21363 SmallVector<SDValue> NewLoads;
21364 for (const auto &[L0, L1] : zip(t&: Loads0, u&: Loads1)) {
21365 SDValue Load = DAG.getLoad(VT: DLoadVT, dl: SDLoc(L0), Chain: L0->getChain(),
21366 Ptr: L0->getBasePtr(), PtrInfo: L0->getPointerInfo(),
21367 Alignment: L0->getBaseAlign());
21368 DAG.makeEquivalentMemoryOrdering(OldLoad: L0, NewMemOp: Load.getValue(R: 1));
21369 DAG.makeEquivalentMemoryOrdering(OldLoad: L1, NewMemOp: Load.getValue(R: 1));
21370 NewLoads.push_back(Elt: Load);
21371 }
21372 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op0), VT: DVT, Ops: NewLoads);
21373 }
21374
21375 SmallVector<SDValue> Ops;
21376 for (const auto &[O0, O1] : zip(t: Op0->op_values(), u: Op1->op_values()))
21377 Ops.push_back(Elt: GenCombinedTree(O0, O1, DAG));
21378 return DAG.getNode(Opcode: Op0.getOpcode(), DL: SDLoc(Op0), VT: DVT, Ops);
21379 };
21380 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
21381
21382 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
21383 int Hi = NumSubElts, Lo = 0;
21384 for (unsigned i = 0; i < NumSubLoads; i++) {
21385 for (unsigned j = 0; j < NumSubElts; j++) {
21386 LowMask[i * NumSubElts + j] = Lo++;
21387 HighMask[i * NumSubElts + j] = Hi++;
21388 }
21389 Lo += NumSubElts;
21390 Hi += NumSubElts;
21391 }
21392 SDLoc DL(N);
21393 SDValue Ext0, Ext1;
21394 // Extract the top and bottom lanes, then extend the result. If the two extend
21395 // opcodes match, instead extend the result first and then extract the lanes,
21396 // as that produces slightly smaller code.
21397 if (Other.getOpcode() != Shift.getOperand(i: 0).getOpcode()) {
21398 SDValue SubL = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: Op0.getValueType(),
21399 N1: NewOp, N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
21400 SDValue SubH =
21401 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: Op0.getValueType(), N1: NewOp,
21402 N2: DAG.getConstant(Val: NumSubElts * NumSubLoads, DL, VT: MVT::i64));
21403 SDValue Extr0 =
21404 DAG.getVectorShuffle(VT: Op0.getValueType(), dl: DL, N1: SubL, N2: SubH, Mask: LowMask);
21405 SDValue Extr1 =
21406 DAG.getVectorShuffle(VT: Op0.getValueType(), dl: DL, N1: SubL, N2: SubH, Mask: HighMask);
21407 Ext0 = DAG.getNode(Opcode: Other.getOpcode(), DL, VT, Operand: Extr0);
21408 Ext1 = DAG.getNode(Opcode: Shift.getOperand(i: 0).getOpcode(), DL, VT, Operand: Extr1);
21409 } else {
21410 EVT DVT = VT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
21411 SDValue Ext = DAG.getNode(Opcode: Other.getOpcode(), DL, VT: DVT, Operand: NewOp);
21412 SDValue SubL = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Ext,
21413 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
21414 SDValue SubH =
21415 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Ext,
21416 N2: DAG.getConstant(Val: NumSubElts * NumSubLoads, DL, VT: MVT::i64));
21417 Ext0 = DAG.getVectorShuffle(VT, dl: DL, N1: SubL, N2: SubH, Mask: LowMask);
21418 Ext1 = DAG.getVectorShuffle(VT, dl: DL, N1: SubL, N2: SubH, Mask: HighMask);
21419 }
21420 SDValue NShift =
21421 DAG.getNode(Opcode: Shift.getOpcode(), DL, VT, N1: Ext1, N2: Shift.getOperand(i: 1));
21422 return DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1: Ext0, N2: NShift);
21423}
21424
21425static SDValue performAddSubCombine(SDNode *N,
21426 TargetLowering::DAGCombinerInfo &DCI) {
21427 // Try to change sum of two reductions.
21428 if (SDValue Val = performAddUADDVCombine(N, DAG&: DCI.DAG))
21429 return Val;
21430 if (SDValue Val = performAddDotCombine(N, DAG&: DCI.DAG))
21431 return Val;
21432 if (SDValue Val = performAddCSelIntoCSinc(N, DAG&: DCI.DAG))
21433 return Val;
21434 if (SDValue Val = performNegCSelCombine(N, DAG&: DCI.DAG))
21435 return Val;
21436 if (SDValue Val = performVectorExtCombine(N, DAG&: DCI.DAG))
21437 return Val;
21438 if (SDValue Val = performAddCombineForShiftedOperands(N, DAG&: DCI.DAG))
21439 return Val;
21440 if (SDValue Val = performSubAddMULCombine(N, DAG&: DCI.DAG))
21441 return Val;
21442 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
21443 return Val;
21444 if (SDValue Val = performAddSubIntoVectorOp(N, DAG&: DCI.DAG))
21445 return Val;
21446
21447 if (SDValue Val = performExtBinopLoadFold(N, DAG&: DCI.DAG))
21448 return Val;
21449
21450 return performAddSubLongCombine(N, DCI);
21451}
21452
21453// Massage DAGs which we can use the high-half "long" operations on into
21454// something isel will recognize better. E.g.
21455//
21456// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
21457 //   (aarch64_neon_umull (extract_high (v2i64 vec))
21458 //                       (extract_high (v2i64 (dup128 scalar))))
21459//
21460static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
21461 TargetLowering::DAGCombinerInfo &DCI,
21462 SelectionDAG &DAG) {
21463 if (DCI.isBeforeLegalizeOps())
21464 return SDValue();
21465
21466 SDValue LHS = N->getOperand(Num: (IID == Intrinsic::not_intrinsic) ? 0 : 1);
21467 SDValue RHS = N->getOperand(Num: (IID == Intrinsic::not_intrinsic) ? 1 : 2);
21468 assert(LHS.getValueType().is64BitVector() &&
21469 RHS.getValueType().is64BitVector() &&
21470 "unexpected shape for long operation");
21471
21472 // Either node could be a DUP, but it's not worth doing both of them (you'd
21473 // just as well use the non-high version) so look for a corresponding extract
21474 // operation on the other "wing".
21475 if (isEssentiallyExtractHighSubvector(N: LHS)) {
21476 RHS = tryExtendDUPToExtractHigh(N: RHS, DAG);
21477 if (!RHS.getNode())
21478 return SDValue();
21479 } else if (isEssentiallyExtractHighSubvector(N: RHS)) {
21480 LHS = tryExtendDUPToExtractHigh(N: LHS, DAG);
21481 if (!LHS.getNode())
21482 return SDValue();
21483 } else
21484 return SDValue();
21485
21486 if (IID == Intrinsic::not_intrinsic)
21487 return DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: LHS, N2: RHS);
21488
21489 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21490 N1: N->getOperand(Num: 0), N2: LHS, N3: RHS);
21491}
21492
21493static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
21494 MVT ElemTy = N->getSimpleValueType(ResNo: 0).getScalarType();
21495 unsigned ElemBits = ElemTy.getSizeInBits();
21496
21497 int64_t ShiftAmount;
21498 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: N->getOperand(Num: 2))) {
21499 APInt SplatValue, SplatUndef;
21500 unsigned SplatBitSize;
21501 bool HasAnyUndefs;
21502 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
21503 HasAnyUndefs, MinSplatBits: ElemBits) ||
21504 SplatBitSize != ElemBits)
21505 return SDValue();
21506
21507 ShiftAmount = SplatValue.getSExtValue();
21508 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 2))) {
21509 ShiftAmount = CVN->getSExtValue();
21510 } else
21511 return SDValue();
21512
21513 // If the shift amount is zero, remove the shift intrinsic.
21514 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
21515 return N->getOperand(Num: 1);
21516
21517 unsigned Opcode;
21518 bool IsRightShift;
21519 switch (IID) {
21520 default:
21521 llvm_unreachable("Unknown shift intrinsic");
21522 case Intrinsic::aarch64_neon_sqshl:
21523 Opcode = AArch64ISD::SQSHL_I;
21524 IsRightShift = false;
21525 break;
21526 case Intrinsic::aarch64_neon_uqshl:
21527 Opcode = AArch64ISD::UQSHL_I;
21528 IsRightShift = false;
21529 break;
21530 case Intrinsic::aarch64_neon_srshl:
21531 Opcode = AArch64ISD::SRSHR_I;
21532 IsRightShift = true;
21533 break;
21534 case Intrinsic::aarch64_neon_urshl:
21535 Opcode = AArch64ISD::URSHR_I;
21536 IsRightShift = true;
21537 break;
21538 case Intrinsic::aarch64_neon_sqshlu:
21539 Opcode = AArch64ISD::SQSHLU_I;
21540 IsRightShift = false;
21541 break;
21542 case Intrinsic::aarch64_neon_sshl:
21543 case Intrinsic::aarch64_neon_ushl:
21544 // ushl/sshl perform a regular left shift for positive shift amounts, so for
21545 // those we can use SHL directly. For negative shift amounts we can use
21546 // VASHR/VLSHR as appropriate.
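// For example (illustrative):
//   ushl(v, splat(-3)) --> VLSHR(v, #3) and sshl(v, splat(2)) --> VSHL(v, #2).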
21547 if (ShiftAmount < 0) {
21548 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
21549 : AArch64ISD::VLSHR;
21550 ShiftAmount = -ShiftAmount;
21551 } else
21552 Opcode = AArch64ISD::VSHL;
21553 IsRightShift = false;
21554 break;
21555 }
21556
21557 EVT VT = N->getValueType(ResNo: 0);
21558 SDValue Op = N->getOperand(Num: 1);
21559 SDLoc DL(N);
21560 if (VT == MVT::i64) {
21561 Op = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: MVT::v1i64, Operand: Op);
21562 VT = MVT::v1i64;
21563 }
21564
21565 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
21566 Op = DAG.getNode(Opcode, DL, VT, N1: Op,
21567 N2: DAG.getSignedConstant(Val: -ShiftAmount, DL, VT: MVT::i32));
21568 if (N->getValueType(ResNo: 0) == MVT::i64)
21569 Op = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i64, N1: Op,
21570 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
21571 return Op;
21572 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
21573 Op = DAG.getNode(Opcode, DL, VT, N1: Op,
21574 N2: DAG.getConstant(Val: ShiftAmount, DL, VT: MVT::i32));
21575 if (N->getValueType(ResNo: 0) == MVT::i64)
21576 Op = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i64, N1: Op,
21577 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
21578 return Op;
21579 }
21580
21581 return SDValue();
21582}
21583
21584// The CRC32[BH] instructions ignore the high bits of their data operand. Since
21585// the intrinsics must be legal and take an i32, this means there's almost
21586// certainly going to be a zext in the DAG which we can eliminate.
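// For example (illustrative):
//   crc32b(crc, and(data, 0xff)) --> crc32b(crc, data)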
21587static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
21588 SDValue AndN = N->getOperand(Num: 2);
21589 if (AndN.getOpcode() != ISD::AND)
21590 return SDValue();
21591
21592 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Val: AndN.getOperand(i: 1));
21593 if (!CMask || CMask->getZExtValue() != Mask)
21594 return SDValue();
21595
21596 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SDLoc(N), VT: MVT::i32,
21597 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1), N3: AndN.getOperand(i: 0));
21598}
21599
21600static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
21601 SelectionDAG &DAG) {
21602 SDLoc DL(N);
21603 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: N->getValueType(ResNo: 0),
21604 N1: DAG.getNode(Opcode: Opc, DL, VT: N->getOperand(Num: 1).getSimpleValueType(),
21605 Operand: N->getOperand(Num: 1)),
21606 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
21607}
21608
21609static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
21610 SDLoc DL(N);
21611 SDValue Op1 = N->getOperand(Num: 1);
21612 SDValue Op2 = N->getOperand(Num: 2);
21613 EVT ScalarTy = Op2.getValueType();
21614 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
21615 ScalarTy = MVT::i32;
21616
21617 // Lower index_vector(base, step) to mul(step_vector(1), splat(step)) + splat(base).
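// For example (illustrative), index(base = 2, step = 3) becomes
//   add(mul(step_vector <0,1,2,...>, splat(3)), splat(2)) == <2, 5, 8, ...>.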
21618 SDValue StepVector = DAG.getStepVector(DL, ResVT: N->getValueType(ResNo: 0));
21619 SDValue Step = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: N->getValueType(ResNo: 0), Operand: Op2);
21620 SDValue Mul = DAG.getNode(Opcode: ISD::MUL, DL, VT: N->getValueType(ResNo: 0), N1: StepVector, N2: Step);
21621 SDValue Base = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: N->getValueType(ResNo: 0), Operand: Op1);
21622 return DAG.getNode(Opcode: ISD::ADD, DL, VT: N->getValueType(ResNo: 0), N1: Mul, N2: Base);
21623}
21624
21625static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
21626 SDLoc DL(N);
21627 SDValue Scalar = N->getOperand(Num: 3);
21628 EVT ScalarTy = Scalar.getValueType();
21629
21630 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
21631 Scalar = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Scalar);
21632
21633 SDValue Passthru = N->getOperand(Num: 1);
21634 SDValue Pred = N->getOperand(Num: 2);
21635 return DAG.getNode(Opcode: AArch64ISD::DUP_MERGE_PASSTHRU, DL, VT: N->getValueType(ResNo: 0),
21636 N1: Pred, N2: Scalar, N3: Passthru);
21637}
21638
21639static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
21640 SDLoc DL(N);
21641 LLVMContext &Ctx = *DAG.getContext();
21642 EVT VT = N->getValueType(ResNo: 0);
21643
21644 assert(VT.isScalableVector() && "Expected a scalable vector.");
21645
21646 // Current lowering only supports the SVE-ACLE types.
21647 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
21648 return SDValue();
21649
21650 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
21651 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
21652 EVT ByteVT =
21653 EVT::getVectorVT(Context&: Ctx, VT: MVT::i8, EC: ElementCount::getScalable(MinVal: ByteSize));
21654
21655 // Convert everything to the domain of EXT (i.e. bytes).
21656 SDValue Op0 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ByteVT, Operand: N->getOperand(Num: 1));
21657 SDValue Op1 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ByteVT, Operand: N->getOperand(Num: 2));
21658 SDValue Op2 = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i32, N1: N->getOperand(Num: 3),
21659 N2: DAG.getConstant(Val: ElemSize, DL, VT: MVT::i32));
21660
21661 SDValue EXT = DAG.getNode(Opcode: AArch64ISD::EXT, DL, VT: ByteVT, N1: Op0, N2: Op1, N3: Op2);
21662 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: EXT);
21663}
21664
21665static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
21666 TargetLowering::DAGCombinerInfo &DCI,
21667 SelectionDAG &DAG) {
21668 if (DCI.isBeforeLegalize())
21669 return SDValue();
21670
21671 SDValue Comparator = N->getOperand(Num: 3);
21672 if (Comparator.getOpcode() == AArch64ISD::DUP ||
21673 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
21674 unsigned IID = getIntrinsicID(N);
21675 EVT VT = N->getValueType(ResNo: 0);
21676 EVT CmpVT = N->getOperand(Num: 2).getValueType();
21677 SDValue Pred = N->getOperand(Num: 1);
21678 SDValue Imm;
21679 SDLoc DL(N);
21680
21681 switch (IID) {
21682 default:
21683 llvm_unreachable("Called with wrong intrinsic!");
21684 break;
21685
21686 // Signed comparisons
21687 case Intrinsic::aarch64_sve_cmpeq_wide:
21688 case Intrinsic::aarch64_sve_cmpne_wide:
21689 case Intrinsic::aarch64_sve_cmpge_wide:
21690 case Intrinsic::aarch64_sve_cmpgt_wide:
21691 case Intrinsic::aarch64_sve_cmplt_wide:
21692 case Intrinsic::aarch64_sve_cmple_wide: {
21693 if (auto *CN = dyn_cast<ConstantSDNode>(Val: Comparator.getOperand(i: 0))) {
21694 int64_t ImmVal = CN->getSExtValue();
21695 if (ImmVal >= -16 && ImmVal <= 15)
21696 Imm = DAG.getSignedConstant(Val: ImmVal, DL, VT: MVT::i32);
21697 else
21698 return SDValue();
21699 }
21700 break;
21701 }
21702 // Unsigned comparisons
21703 case Intrinsic::aarch64_sve_cmphs_wide:
21704 case Intrinsic::aarch64_sve_cmphi_wide:
21705 case Intrinsic::aarch64_sve_cmplo_wide:
21706 case Intrinsic::aarch64_sve_cmpls_wide: {
21707 if (auto *CN = dyn_cast<ConstantSDNode>(Val: Comparator.getOperand(i: 0))) {
21708 uint64_t ImmVal = CN->getZExtValue();
21709 if (ImmVal <= 127)
21710 Imm = DAG.getConstant(Val: ImmVal, DL, VT: MVT::i32);
21711 else
21712 return SDValue();
21713 }
21714 break;
21715 }
21716 }
21717
21718 if (!Imm)
21719 return SDValue();
21720
21721 SDValue Splat = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: CmpVT, Operand: Imm);
21722 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL, VT, N1: Pred,
21723 N2: N->getOperand(Num: 2), N3: Splat, N4: DAG.getCondCode(Cond: CC));
21724 }
21725
21726 return SDValue();
21727}
21728
21729static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
21730 AArch64CC::CondCode Cond) {
21731 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21732
21733 SDLoc DL(Op);
21734 assert(Op.getValueType().isScalableVector() &&
21735 TLI.isTypeLegal(Op.getValueType()) &&
21736 "Expected legal scalable vector type!");
21737 assert(Op.getValueType() == Pg.getValueType() &&
21738 "Expected same type for PTEST operands");
21739
21740 // Ensure target-specific opcodes are using a legal type.
21741 EVT OutVT = TLI.getTypeToTransformTo(Context&: *DAG.getContext(), VT);
21742 SDValue TVal = DAG.getConstant(Val: 1, DL, VT: OutVT);
21743 SDValue FVal = DAG.getConstant(Val: 0, DL, VT: OutVT);
21744
21745 // Ensure operands have type nxv16i1.
21746 if (Op.getValueType() != MVT::nxv16i1) {
21747 if ((Cond == AArch64CC::ANY_ACTIVE || Cond == AArch64CC::NONE_ACTIVE) &&
21748 isZeroingInactiveLanes(Op))
21749 Pg = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: MVT::nxv16i1, Operand: Pg);
21750 else
21751 Pg = getSVEPredicateBitCast(VT: MVT::nxv16i1, Op: Pg, DAG);
21752 Op = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: MVT::nxv16i1, Operand: Op);
21753 }
21754
21755 // Set condition code (CC) flags.
21756 SDValue Test = DAG.getNode(
21757 Opcode: Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
21758 DL, VT: MVT::i32, N1: Pg, N2: Op);
21759
21760 // Convert CC to integer based on requested condition.
21761 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
21762 SDValue CC = DAG.getConstant(Val: getInvertedCondCode(Code: Cond), DL, VT: MVT::i32);
21763 SDValue Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: OutVT, N1: FVal, N2: TVal, N3: CC, N4: Test);
21764 return DAG.getZExtOrTrunc(Op: Res, DL, VT);
21765}
21766
21767static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
21768 SelectionDAG &DAG) {
21769 SDLoc DL(N);
21770
21771 SDValue Pred = N->getOperand(Num: 1);
21772 SDValue VecToReduce = N->getOperand(Num: 2);
21773
21774 // NOTE: The integer reduction's result type is not always linked to the
21775 // operand's element type so we construct it from the intrinsic's result type.
21776 EVT ReduceVT = getPackedSVEVectorVT(VT: N->getValueType(ResNo: 0));
21777 SDValue Reduce = DAG.getNode(Opcode: Opc, DL, VT: ReduceVT, N1: Pred, N2: VecToReduce);
21778
21779 // SVE reductions set the whole vector register with the first element
21780 // containing the reduction result, which we'll now extract.
21781 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
21782 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: N->getValueType(ResNo: 0), N1: Reduce,
21783 N2: Zero);
21784}
21785
21786static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
21787 SelectionDAG &DAG) {
21788 SDLoc DL(N);
21789
21790 SDValue Pred = N->getOperand(Num: 1);
21791 SDValue VecToReduce = N->getOperand(Num: 2);
21792
21793 EVT ReduceVT = VecToReduce.getValueType();
21794 SDValue Reduce = DAG.getNode(Opcode: Opc, DL, VT: ReduceVT, N1: Pred, N2: VecToReduce);
21795
21796 // SVE reductions set the whole vector register with the first element
21797 // containing the reduction result, which we'll now extract.
21798 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
21799 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: N->getValueType(ResNo: 0), N1: Reduce,
21800 N2: Zero);
21801}
21802
21803static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
21804 SelectionDAG &DAG) {
21805 SDLoc DL(N);
21806
21807 SDValue Pred = N->getOperand(Num: 1);
21808 SDValue InitVal = N->getOperand(Num: 2);
21809 SDValue VecToReduce = N->getOperand(Num: 3);
21810 EVT ReduceVT = VecToReduce.getValueType();
21811
21812 // Ordered reductions use the first lane of the result vector as the
21813 // reduction's initial value.
21814 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
21815 InitVal = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ReduceVT,
21816 N1: DAG.getUNDEF(VT: ReduceVT), N2: InitVal, N3: Zero);
21817
21818 SDValue Reduce = DAG.getNode(Opcode: Opc, DL, VT: ReduceVT, N1: Pred, N2: InitVal, N3: VecToReduce);
21819
21820 // SVE reductions set the whole vector register with the first element
21821 // containing the reduction result, which we'll now extract.
21822 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: N->getValueType(ResNo: 0), N1: Reduce,
21823 N2: Zero);
21824}
21825
21826// If a merged operation has no inactive lanes we can relax it to a predicated
21827// or unpredicated operation, which potentially allows better isel (perhaps
21828// using immediate forms) or relaxing register reuse requirements.
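// For example (illustrative), with an all-active governing predicate:
//   aarch64_sve_sqadd(ptrue, a, b) --> saddsat(a, b)
// and similarly for the other callers of this helper below.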
21829static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
21830 SelectionDAG &DAG, bool UnpredOp = false,
21831 bool SwapOperands = false) {
21832 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
21833 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
21834 SDValue Pg = N->getOperand(Num: 1);
21835 SDValue Op1 = N->getOperand(Num: SwapOperands ? 3 : 2);
21836 SDValue Op2 = N->getOperand(Num: SwapOperands ? 2 : 3);
21837
21838 // ISD way to specify an all active predicate.
21839 if (isAllActivePredicate(DAG, N: Pg)) {
21840 if (UnpredOp)
21841 return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: Op1, N2: Op2);
21842
21843 return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: Pg, N2: Op1, N3: Op2);
21844 }
21845
21846 // FUTURE: SplatVector(true)
21847 return SDValue();
21848}
21849
21850SDValue tryLowerPartialReductionToDot(SDNode *N,
21851 const AArch64Subtarget *Subtarget,
21852 SelectionDAG &DAG) {
21853
21854 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
21855 getIntrinsicID(N) ==
21856 Intrinsic::experimental_vector_partial_reduce_add &&
21857 "Expected a partial reduction node");
21858
21859 bool Scalable = N->getValueType(ResNo: 0).isScalableVector();
21860 if (Scalable && !Subtarget->isSVEorStreamingSVEAvailable())
21861 return SDValue();
21862 if (!Scalable && (!Subtarget->isNeonAvailable() || !Subtarget->hasDotProd()))
21863 return SDValue();
21864
21865 SDLoc DL(N);
21866
21867 SDValue Op2 = N->getOperand(Num: 2);
21868 unsigned Op2Opcode = Op2->getOpcode();
21869 SDValue MulOpLHS, MulOpRHS;
21870 bool MulOpLHSIsSigned, MulOpRHSIsSigned;
21871 if (ISD::isExtOpcode(Opcode: Op2Opcode)) {
21872 MulOpLHSIsSigned = MulOpRHSIsSigned = (Op2Opcode == ISD::SIGN_EXTEND);
21873 MulOpLHS = Op2->getOperand(Num: 0);
21874 MulOpRHS = DAG.getConstant(Val: 1, DL, VT: MulOpLHS.getValueType());
21875 } else if (Op2Opcode == ISD::MUL) {
21876 SDValue ExtMulOpLHS = Op2->getOperand(Num: 0);
21877 SDValue ExtMulOpRHS = Op2->getOperand(Num: 1);
21878
21879 unsigned ExtMulOpLHSOpcode = ExtMulOpLHS->getOpcode();
21880 unsigned ExtMulOpRHSOpcode = ExtMulOpRHS->getOpcode();
21881 if (!ISD::isExtOpcode(Opcode: ExtMulOpLHSOpcode) ||
21882 !ISD::isExtOpcode(Opcode: ExtMulOpRHSOpcode))
21883 return SDValue();
21884
21885 MulOpLHSIsSigned = ExtMulOpLHSOpcode == ISD::SIGN_EXTEND;
21886 MulOpRHSIsSigned = ExtMulOpRHSOpcode == ISD::SIGN_EXTEND;
21887
21888 MulOpLHS = ExtMulOpLHS->getOperand(Num: 0);
21889 MulOpRHS = ExtMulOpRHS->getOperand(Num: 0);
21890
21891 if (MulOpLHS.getValueType() != MulOpRHS.getValueType())
21892 return SDValue();
21893 } else
21894 return SDValue();
21895
21896 SDValue Acc = N->getOperand(Num: 1);
21897 EVT ReducedVT = N->getValueType(ResNo: 0);
21898 EVT MulSrcVT = MulOpLHS.getValueType();
21899
21900 // Dot products operate on chunks of four elements, so there must be four times
21901 // as many elements in the wide type.
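// For example, a v16i8 multiply feeding a v4i32 accumulator can use a single
// [US]DOT; the checks below only admit such 4-to-1 element-count pairings.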
21902 if (!(ReducedVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) &&
21903 !(ReducedVT == MVT::nxv4i32 && MulSrcVT == MVT::nxv16i8) &&
21904 !(ReducedVT == MVT::nxv2i64 && MulSrcVT == MVT::nxv8i16) &&
21905 !(ReducedVT == MVT::v4i64 && MulSrcVT == MVT::v16i8) &&
21906 !(ReducedVT == MVT::v4i32 && MulSrcVT == MVT::v16i8) &&
21907 !(ReducedVT == MVT::v2i32 && MulSrcVT == MVT::v8i8))
21908 return SDValue();
21909
21910 // If the extensions are mixed, we should lower it to a usdot instead
21911 unsigned Opcode = 0;
21912 if (MulOpLHSIsSigned != MulOpRHSIsSigned) {
21913 if (!Subtarget->hasMatMulInt8())
21914 return SDValue();
21915
21916 bool Scalable = N->getValueType(ResNo: 0).isScalableVT();
21917 // There's no nxv2i64 version of usdot
21918 if (Scalable && ReducedVT != MVT::nxv4i32 && ReducedVT != MVT::nxv4i64)
21919 return SDValue();
21920
21921 Opcode = AArch64ISD::USDOT;
21922 // USDOT expects the signed operand to be last
21923 if (!MulOpRHSIsSigned)
21924 std::swap(a&: MulOpLHS, b&: MulOpRHS);
21925 } else
21926 Opcode = MulOpLHSIsSigned ? AArch64ISD::SDOT : AArch64ISD::UDOT;
21927
21928 // Partial reduction lowering for (nx)v16i8 to (nx)v4i64 requires an i32 dot
21929 // product followed by a zero/sign extension.
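// An illustrative sketch for nxv16i8 inputs and an nxv4i64 accumulator:
//   dot = [us]dot(splat(0) : nxv4i32, lhs, rhs)
//   res = add(acc, extend(dot) to nxv4i64)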
21930 if ((ReducedVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) ||
21931 (ReducedVT == MVT::v4i64 && MulSrcVT == MVT::v16i8)) {
21932 EVT ReducedVTI32 =
21933 (ReducedVT.isScalableVector()) ? MVT::nxv4i32 : MVT::v4i32;
21934
21935 SDValue DotI32 =
21936 DAG.getNode(Opcode, DL, VT: ReducedVTI32,
21937 N1: DAG.getConstant(Val: 0, DL, VT: ReducedVTI32), N2: MulOpLHS, N3: MulOpRHS);
21938 SDValue Extended = DAG.getSExtOrTrunc(Op: DotI32, DL, VT: ReducedVT);
21939 return DAG.getNode(Opcode: ISD::ADD, DL, VT: ReducedVT, N1: Acc, N2: Extended);
21940 }
21941
21942 return DAG.getNode(Opcode, DL, VT: ReducedVT, N1: Acc, N2: MulOpLHS, N3: MulOpRHS);
21943}
21944
21945SDValue tryLowerPartialReductionToWideAdd(SDNode *N,
21946 const AArch64Subtarget *Subtarget,
21947 SelectionDAG &DAG) {
21948
21949 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
21950 getIntrinsicID(N) ==
21951 Intrinsic::experimental_vector_partial_reduce_add &&
21952 "Expected a partial reduction node");
21953
21954 if (!Subtarget->hasSVE2() && !Subtarget->isStreamingSVEAvailable())
21955 return SDValue();
21956
21957 SDLoc DL(N);
21958
21959 if (!ISD::isExtOpcode(Opcode: N->getOperand(Num: 2).getOpcode()))
21960 return SDValue();
21961 SDValue Acc = N->getOperand(Num: 1);
21962 SDValue Ext = N->getOperand(Num: 2);
21963 EVT AccVT = Acc.getValueType();
21964 EVT ExtVT = Ext.getValueType();
21965 if (ExtVT.getVectorElementType() != AccVT.getVectorElementType())
21966 return SDValue();
21967
21968 SDValue ExtOp = Ext->getOperand(Num: 0);
21969 EVT ExtOpVT = ExtOp.getValueType();
21970
21971 if (!(ExtOpVT == MVT::nxv4i32 && AccVT == MVT::nxv2i64) &&
21972 !(ExtOpVT == MVT::nxv8i16 && AccVT == MVT::nxv4i32) &&
21973 !(ExtOpVT == MVT::nxv16i8 && AccVT == MVT::nxv8i16))
21974 return SDValue();
21975
21976 bool ExtOpIsSigned = Ext.getOpcode() == ISD::SIGN_EXTEND;
21977 unsigned BottomOpcode =
21978 ExtOpIsSigned ? AArch64ISD::SADDWB : AArch64ISD::UADDWB;
21979 unsigned TopOpcode = ExtOpIsSigned ? AArch64ISD::SADDWT : AArch64ISD::UADDWT;
21980 SDValue BottomNode = DAG.getNode(Opcode: BottomOpcode, DL, VT: AccVT, N1: Acc, N2: ExtOp);
21981 return DAG.getNode(Opcode: TopOpcode, DL, VT: AccVT, N1: BottomNode, N2: ExtOp);
21982}
21983
21984static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG) {
21985 SDLoc DL(N);
21986 EVT VT = N->getValueType(ResNo: 0);
21987 SDValue Op1 = N->getOperand(Num: 1);
21988 SDValue Op2 = N->getOperand(Num: 2);
21989 SDValue Op3 = N->getOperand(Num: 3);
21990
21991 switch (IID) {
21992 default:
21993 llvm_unreachable("Called with wrong intrinsic!");
21994 case Intrinsic::aarch64_sve_bsl:
21995 return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: Op3, N2: Op1, N3: Op2);
21996 case Intrinsic::aarch64_sve_bsl1n:
21997 return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: Op3, N2: DAG.getNOT(DL, Val: Op1, VT),
21998 N3: Op2);
21999 case Intrinsic::aarch64_sve_bsl2n:
22000 return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: Op3, N2: Op1,
22001 N3: DAG.getNOT(DL, Val: Op2, VT));
22002 case Intrinsic::aarch64_sve_nbsl:
22003 return DAG.getNOT(DL, Val: DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: Op3, N2: Op1, N3: Op2),
22004 VT);
22005 }
22006}
22007
22008static SDValue performIntrinsicCombine(SDNode *N,
22009 TargetLowering::DAGCombinerInfo &DCI,
22010 const AArch64Subtarget *Subtarget) {
22011 SelectionDAG &DAG = DCI.DAG;
22012 unsigned IID = getIntrinsicID(N);
22013 switch (IID) {
22014 default:
22015 break;
22016 case Intrinsic::experimental_vector_partial_reduce_add: {
22017 if (SDValue Dot = tryLowerPartialReductionToDot(N, Subtarget, DAG))
22018 return Dot;
22019 if (SDValue WideAdd = tryLowerPartialReductionToWideAdd(N, Subtarget, DAG))
22020 return WideAdd;
22021 SDLoc DL(N);
22022 SDValue Input = N->getOperand(Num: 2);
22023 return DAG.getNode(Opcode: ISD::PARTIAL_REDUCE_UMLA, DL, VT: N->getValueType(ResNo: 0),
22024 N1: N->getOperand(Num: 1), N2: Input,
22025 N3: DAG.getConstant(Val: 1, DL, VT: Input.getValueType()));
22026 }
22027 case Intrinsic::aarch64_neon_vcvtfxs2fp:
22028 case Intrinsic::aarch64_neon_vcvtfxu2fp:
22029 return tryCombineFixedPointConvert(N, DCI, DAG);
22030 case Intrinsic::aarch64_neon_saddv:
22031 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::SADDV, N, DAG);
22032 case Intrinsic::aarch64_neon_uaddv:
22033 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::UADDV, N, DAG);
22034 case Intrinsic::aarch64_neon_sminv:
22035 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::SMINV, N, DAG);
22036 case Intrinsic::aarch64_neon_uminv:
22037 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::UMINV, N, DAG);
22038 case Intrinsic::aarch64_neon_smaxv:
22039 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::SMAXV, N, DAG);
22040 case Intrinsic::aarch64_neon_umaxv:
22041 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::UMAXV, N, DAG);
22042 case Intrinsic::aarch64_neon_fmax:
22043 return DAG.getNode(Opcode: ISD::FMAXIMUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22044 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22045 case Intrinsic::aarch64_neon_fmin:
22046 return DAG.getNode(Opcode: ISD::FMINIMUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22047 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22048 case Intrinsic::aarch64_neon_fmaxnm:
22049 return DAG.getNode(Opcode: ISD::FMAXNUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22050 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22051 case Intrinsic::aarch64_neon_fminnm:
22052 return DAG.getNode(Opcode: ISD::FMINNUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22053 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22054 case Intrinsic::aarch64_neon_smull:
22055 return DAG.getNode(Opcode: AArch64ISD::SMULL, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22056 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22057 case Intrinsic::aarch64_neon_umull:
22058 return DAG.getNode(Opcode: AArch64ISD::UMULL, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22059 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22060 case Intrinsic::aarch64_neon_pmull:
22061 return DAG.getNode(Opcode: AArch64ISD::PMULL, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22062 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22063 case Intrinsic::aarch64_neon_sqdmull:
22064 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
22065 case Intrinsic::aarch64_neon_sqshl:
22066 case Intrinsic::aarch64_neon_uqshl:
22067 case Intrinsic::aarch64_neon_sqshlu:
22068 case Intrinsic::aarch64_neon_srshl:
22069 case Intrinsic::aarch64_neon_urshl:
22070 case Intrinsic::aarch64_neon_sshl:
22071 case Intrinsic::aarch64_neon_ushl:
22072 return tryCombineShiftImm(IID, N, DAG);
22073 case Intrinsic::aarch64_neon_sabd:
22074 return DAG.getNode(Opcode: ISD::ABDS, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22075 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22076 case Intrinsic::aarch64_neon_uabd:
22077 return DAG.getNode(Opcode: ISD::ABDU, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22078 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22079 case Intrinsic::aarch64_crc32b:
22080 case Intrinsic::aarch64_crc32cb:
22081 return tryCombineCRC32(Mask: 0xff, N, DAG);
22082 case Intrinsic::aarch64_crc32h:
22083 case Intrinsic::aarch64_crc32ch:
22084 return tryCombineCRC32(Mask: 0xffff, N, DAG);
22085 case Intrinsic::aarch64_sve_saddv:
22086 // There is no i64 version of SADDV because the sign is irrelevant.
22087 if (N->getOperand(Num: 2)->getValueType(ResNo: 0).getVectorElementType() == MVT::i64)
22088 return combineSVEReductionInt(N, Opc: AArch64ISD::UADDV_PRED, DAG);
22089 else
22090 return combineSVEReductionInt(N, Opc: AArch64ISD::SADDV_PRED, DAG);
22091 case Intrinsic::aarch64_sve_uaddv:
22092 return combineSVEReductionInt(N, Opc: AArch64ISD::UADDV_PRED, DAG);
22093 case Intrinsic::aarch64_sve_smaxv:
22094 return combineSVEReductionInt(N, Opc: AArch64ISD::SMAXV_PRED, DAG);
22095 case Intrinsic::aarch64_sve_umaxv:
22096 return combineSVEReductionInt(N, Opc: AArch64ISD::UMAXV_PRED, DAG);
22097 case Intrinsic::aarch64_sve_sminv:
22098 return combineSVEReductionInt(N, Opc: AArch64ISD::SMINV_PRED, DAG);
22099 case Intrinsic::aarch64_sve_uminv:
22100 return combineSVEReductionInt(N, Opc: AArch64ISD::UMINV_PRED, DAG);
22101 case Intrinsic::aarch64_sve_orv:
22102 return combineSVEReductionInt(N, Opc: AArch64ISD::ORV_PRED, DAG);
22103 case Intrinsic::aarch64_sve_eorv:
22104 return combineSVEReductionInt(N, Opc: AArch64ISD::EORV_PRED, DAG);
22105 case Intrinsic::aarch64_sve_andv:
22106 return combineSVEReductionInt(N, Opc: AArch64ISD::ANDV_PRED, DAG);
22107 case Intrinsic::aarch64_sve_index:
22108 return LowerSVEIntrinsicIndex(N, DAG);
22109 case Intrinsic::aarch64_sve_dup:
22110 return LowerSVEIntrinsicDUP(N, DAG);
22111 case Intrinsic::aarch64_sve_dup_x:
22112 return DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22113 Operand: N->getOperand(Num: 1));
22114 case Intrinsic::aarch64_sve_ext:
22115 return LowerSVEIntrinsicEXT(N, DAG);
22116 case Intrinsic::aarch64_sve_mul_u:
22117 return DAG.getNode(Opcode: AArch64ISD::MUL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22118 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22119 case Intrinsic::aarch64_sve_smulh_u:
22120 return DAG.getNode(Opcode: AArch64ISD::MULHS_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22121 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22122 case Intrinsic::aarch64_sve_umulh_u:
22123 return DAG.getNode(Opcode: AArch64ISD::MULHU_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22124 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22125 case Intrinsic::aarch64_sve_smin_u:
22126 return DAG.getNode(Opcode: AArch64ISD::SMIN_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22127 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22128 case Intrinsic::aarch64_sve_umin_u:
22129 return DAG.getNode(Opcode: AArch64ISD::UMIN_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22130 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22131 case Intrinsic::aarch64_sve_smax_u:
22132 return DAG.getNode(Opcode: AArch64ISD::SMAX_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22133 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22134 case Intrinsic::aarch64_sve_umax_u:
22135 return DAG.getNode(Opcode: AArch64ISD::UMAX_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22136 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22137 case Intrinsic::aarch64_sve_lsl_u:
22138 return DAG.getNode(Opcode: AArch64ISD::SHL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22139 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22140 case Intrinsic::aarch64_sve_lsr_u:
22141 return DAG.getNode(Opcode: AArch64ISD::SRL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22142 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22143 case Intrinsic::aarch64_sve_asr_u:
22144 return DAG.getNode(Opcode: AArch64ISD::SRA_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22145 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22146 case Intrinsic::aarch64_sve_fadd_u:
22147 return DAG.getNode(Opcode: AArch64ISD::FADD_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22148 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22149 case Intrinsic::aarch64_sve_fdiv_u:
22150 return DAG.getNode(Opcode: AArch64ISD::FDIV_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22151 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22152 case Intrinsic::aarch64_sve_fmax_u:
22153 return DAG.getNode(Opcode: AArch64ISD::FMAX_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22154 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22155 case Intrinsic::aarch64_sve_fmaxnm_u:
22156 return DAG.getNode(Opcode: AArch64ISD::FMAXNM_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22157 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22158 case Intrinsic::aarch64_sve_fmla_u:
22159 return DAG.getNode(Opcode: AArch64ISD::FMA_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22160 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 3), N3: N->getOperand(Num: 4),
22161 N4: N->getOperand(Num: 2));
22162 case Intrinsic::aarch64_sve_fmin_u:
22163 return DAG.getNode(Opcode: AArch64ISD::FMIN_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22164 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22165 case Intrinsic::aarch64_sve_fminnm_u:
22166 return DAG.getNode(Opcode: AArch64ISD::FMINNM_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22167 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22168 case Intrinsic::aarch64_sve_fmul_u:
22169 return DAG.getNode(Opcode: AArch64ISD::FMUL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22170 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22171 case Intrinsic::aarch64_sve_fsub_u:
22172 return DAG.getNode(Opcode: AArch64ISD::FSUB_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22173 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22174 case Intrinsic::aarch64_sve_add_u:
22175 return DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
22176 N2: N->getOperand(Num: 3));
22177 case Intrinsic::aarch64_sve_sub_u:
22178 return DAG.getNode(Opcode: ISD::SUB, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
22179 N2: N->getOperand(Num: 3));
22180 case Intrinsic::aarch64_sve_subr:
22181 return convertMergedOpToPredOp(N, Opc: ISD::SUB, DAG, UnpredOp: true, SwapOperands: true);
22182 case Intrinsic::aarch64_sve_and_u:
22183 return DAG.getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
22184 N2: N->getOperand(Num: 3));
22185 case Intrinsic::aarch64_sve_bic_u:
22186 return DAG.getNode(Opcode: AArch64ISD::BIC, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22187 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
22188 case Intrinsic::aarch64_sve_saddwb:
22189 return DAG.getNode(Opcode: AArch64ISD::SADDWB, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22190 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22191 case Intrinsic::aarch64_sve_saddwt:
22192 return DAG.getNode(Opcode: AArch64ISD::SADDWT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22193 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22194 case Intrinsic::aarch64_sve_uaddwb:
22195 return DAG.getNode(Opcode: AArch64ISD::UADDWB, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22196 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22197 case Intrinsic::aarch64_sve_uaddwt:
22198 return DAG.getNode(Opcode: AArch64ISD::UADDWT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22199 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22200 case Intrinsic::aarch64_sve_eor_u:
22201 return DAG.getNode(Opcode: ISD::XOR, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
22202 N2: N->getOperand(Num: 3));
22203 case Intrinsic::aarch64_sve_orr_u:
22204 return DAG.getNode(Opcode: ISD::OR, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
22205 N2: N->getOperand(Num: 3));
22206 case Intrinsic::aarch64_sve_sabd_u:
22207 return DAG.getNode(Opcode: ISD::ABDS, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22208 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
22209 case Intrinsic::aarch64_sve_uabd_u:
22210 return DAG.getNode(Opcode: ISD::ABDU, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22211 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
22212 case Intrinsic::aarch64_sve_sdiv_u:
22213 return DAG.getNode(Opcode: AArch64ISD::SDIV_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22214 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22215 case Intrinsic::aarch64_sve_udiv_u:
22216 return DAG.getNode(Opcode: AArch64ISD::UDIV_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22217 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22218 case Intrinsic::aarch64_sve_sqadd:
22219 return convertMergedOpToPredOp(N, Opc: ISD::SADDSAT, DAG, UnpredOp: true);
22220 case Intrinsic::aarch64_sve_sqsub_u:
22221 return DAG.getNode(Opcode: ISD::SSUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22222 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
22223 case Intrinsic::aarch64_sve_uqadd:
22224 return convertMergedOpToPredOp(N, Opc: ISD::UADDSAT, DAG, UnpredOp: true);
22225 case Intrinsic::aarch64_sve_uqsub_u:
22226 return DAG.getNode(Opcode: ISD::USUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22227 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
22228 case Intrinsic::aarch64_sve_sqadd_x:
22229 return DAG.getNode(Opcode: ISD::SADDSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22230 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22231 case Intrinsic::aarch64_sve_sqsub_x:
22232 return DAG.getNode(Opcode: ISD::SSUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22233 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22234 case Intrinsic::aarch64_sve_uqadd_x:
22235 return DAG.getNode(Opcode: ISD::UADDSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22236 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22237 case Intrinsic::aarch64_sve_uqsub_x:
22238 return DAG.getNode(Opcode: ISD::USUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22239 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22240 case Intrinsic::aarch64_sve_asrd:
22241 return DAG.getNode(Opcode: AArch64ISD::SRAD_MERGE_OP1, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22242 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22243 case Intrinsic::aarch64_sve_cmphs:
22244 if (!N->getOperand(Num: 2).getValueType().isFloatingPoint())
22245 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
22246 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
22247 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETUGE));
22248 break;
22249 case Intrinsic::aarch64_sve_cmphi:
22250 if (!N->getOperand(Num: 2).getValueType().isFloatingPoint())
22251 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
22252 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
22253 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETUGT));
22254 break;
22255 case Intrinsic::aarch64_sve_fcmpge:
22256 case Intrinsic::aarch64_sve_cmpge:
22257 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
22258 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
22259 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETGE));
22260 break;
22261 case Intrinsic::aarch64_sve_fcmpgt:
22262 case Intrinsic::aarch64_sve_cmpgt:
22263 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
22264 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
22265 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETGT));
22266 break;
22267 case Intrinsic::aarch64_sve_fcmpeq:
22268 case Intrinsic::aarch64_sve_cmpeq:
22269 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
22270 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
22271 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETEQ));
22272 break;
22273 case Intrinsic::aarch64_sve_fcmpne:
22274 case Intrinsic::aarch64_sve_cmpne:
22275 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
22276 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
22277 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETNE));
22278 break;
22279 case Intrinsic::aarch64_sve_fcmpuo:
22280 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
22281 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
22282 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETUO));
22283 break;
22284 case Intrinsic::aarch64_sve_fadda:
22285 return combineSVEReductionOrderedFP(N, Opc: AArch64ISD::FADDA_PRED, DAG);
22286 case Intrinsic::aarch64_sve_faddv:
22287 return combineSVEReductionFP(N, Opc: AArch64ISD::FADDV_PRED, DAG);
22288 case Intrinsic::aarch64_sve_fmaxnmv:
22289 return combineSVEReductionFP(N, Opc: AArch64ISD::FMAXNMV_PRED, DAG);
22290 case Intrinsic::aarch64_sve_fmaxv:
22291 return combineSVEReductionFP(N, Opc: AArch64ISD::FMAXV_PRED, DAG);
22292 case Intrinsic::aarch64_sve_fminnmv:
22293 return combineSVEReductionFP(N, Opc: AArch64ISD::FMINNMV_PRED, DAG);
22294 case Intrinsic::aarch64_sve_fminv:
22295 return combineSVEReductionFP(N, Opc: AArch64ISD::FMINV_PRED, DAG);
22296 case Intrinsic::aarch64_sve_sel:
22297 return DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22298 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22299 case Intrinsic::aarch64_sve_cmpeq_wide:
22300 return tryConvertSVEWideCompare(N, CC: ISD::SETEQ, DCI, DAG);
22301 case Intrinsic::aarch64_sve_cmpne_wide:
22302 return tryConvertSVEWideCompare(N, CC: ISD::SETNE, DCI, DAG);
22303 case Intrinsic::aarch64_sve_cmpge_wide:
22304 return tryConvertSVEWideCompare(N, CC: ISD::SETGE, DCI, DAG);
22305 case Intrinsic::aarch64_sve_cmpgt_wide:
22306 return tryConvertSVEWideCompare(N, CC: ISD::SETGT, DCI, DAG);
22307 case Intrinsic::aarch64_sve_cmplt_wide:
22308 return tryConvertSVEWideCompare(N, CC: ISD::SETLT, DCI, DAG);
22309 case Intrinsic::aarch64_sve_cmple_wide:
22310 return tryConvertSVEWideCompare(N, CC: ISD::SETLE, DCI, DAG);
22311 case Intrinsic::aarch64_sve_cmphs_wide:
22312 return tryConvertSVEWideCompare(N, CC: ISD::SETUGE, DCI, DAG);
22313 case Intrinsic::aarch64_sve_cmphi_wide:
22314 return tryConvertSVEWideCompare(N, CC: ISD::SETUGT, DCI, DAG);
22315 case Intrinsic::aarch64_sve_cmplo_wide:
22316 return tryConvertSVEWideCompare(N, CC: ISD::SETULT, DCI, DAG);
22317 case Intrinsic::aarch64_sve_cmpls_wide:
22318 return tryConvertSVEWideCompare(N, CC: ISD::SETULE, DCI, DAG);
22319 case Intrinsic::aarch64_sve_ptest_any:
22320 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg: N->getOperand(Num: 1), Op: N->getOperand(Num: 2),
22321 Cond: AArch64CC::ANY_ACTIVE);
22322 case Intrinsic::aarch64_sve_ptest_first:
22323 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg: N->getOperand(Num: 1), Op: N->getOperand(Num: 2),
22324 Cond: AArch64CC::FIRST_ACTIVE);
22325 case Intrinsic::aarch64_sve_ptest_last:
22326 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg: N->getOperand(Num: 1), Op: N->getOperand(Num: 2),
22327 Cond: AArch64CC::LAST_ACTIVE);
22328 case Intrinsic::aarch64_sve_whilelo:
22329 return DAG.getNode(Opcode: ISD::GET_ACTIVE_LANE_MASK, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22330 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22331 case Intrinsic::aarch64_sve_bsl:
22332 case Intrinsic::aarch64_sve_bsl1n:
22333 case Intrinsic::aarch64_sve_bsl2n:
22334 case Intrinsic::aarch64_sve_nbsl:
22335 return combineSVEBitSel(IID, N, DAG);
22336 }
22337 return SDValue();
22338}
22339
22340static bool isCheapToExtend(const SDValue &N) {
22341 unsigned OC = N->getOpcode();
22342 return OC == ISD::LOAD || OC == ISD::MLOAD ||
22343 ISD::isConstantSplatVectorAllZeros(N: N.getNode());
22344}
22345
22346static SDValue
22347performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
22348 SelectionDAG &DAG) {
22349 // If we have (sext (setcc A B)) and A and B are cheap to extend,
22350 // we can move the sext into the arguments and have the same result. For
22351 // example, if A and B are both loads, we can make those extending loads and
22352 // avoid an extra instruction. This pattern appears often in VLS code
22353 // generation where the inputs to the setcc have a different size to the
22354 // instruction that wants to use the result of the setcc.
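  // A rough illustration (assuming both setcc operands are simple vector
  // loads): sext(setcc(load A, load B)) : v8i16 can be rewritten as
  // setcc(ext(load A) : v8i16, ext(load B) : v8i16), where ext is a sign or
  // zero extend chosen from the signedness of the condition code and is
  // expected to later fold into an extending load.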
22355 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
22356 N->getOperand(0)->getOpcode() == ISD::SETCC);
22357 const SDValue SetCC = N->getOperand(Num: 0);
22358
22359 const SDValue CCOp0 = SetCC.getOperand(i: 0);
22360 const SDValue CCOp1 = SetCC.getOperand(i: 1);
22361 if (!CCOp0->getValueType(ResNo: 0).isInteger() ||
22362 !CCOp1->getValueType(ResNo: 0).isInteger())
22363 return SDValue();
22364
22365 ISD::CondCode Code =
22366 cast<CondCodeSDNode>(Val: SetCC->getOperand(Num: 2).getNode())->get();
22367
22368 ISD::NodeType ExtType =
22369 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22370
22371 if (isCheapToExtend(N: SetCC.getOperand(i: 0)) &&
22372 isCheapToExtend(N: SetCC.getOperand(i: 1))) {
22373 const SDValue Ext1 =
22374 DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: CCOp0);
22375 const SDValue Ext2 =
22376 DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: CCOp1);
22377
22378 return DAG.getSetCC(
22379 DL: SDLoc(SetCC), VT: N->getValueType(ResNo: 0), LHS: Ext1, RHS: Ext2,
22380 Cond: cast<CondCodeSDNode>(Val: SetCC->getOperand(Num: 2).getNode())->get());
22381 }
22382
22383 return SDValue();
22384}
22385
22386// Convert zext(extract(shuffle a, b, [0,4,8,12])) -> and(uzp1(a, b), 255)
22387// This comes from interleaved vectorization. It is performed late to capture
22388// uitofp converts too.
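// A sketch of the rewrite, assuming a little-endian lane layout:
//   zext(v4i16 extract_subvector(shuffle(v8i16 a, v8i16 b, [0,4,8,12,...]), 0))
// becomes
//   and(uzp1(nvcast(a), nvcast(b)) : v4i32, 0xffff)
// with an additional logical right shift by the narrow element width when the
// deinterleave index selects the odd 16-bit halves, and uzp2 when it selects
// the upper pair of indices.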
22389static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N,
22390 SelectionDAG &DAG) {
22391 EVT VT = N->getValueType(ResNo: 0);
22392 if ((VT != MVT::v4i32 && VT != MVT::v8i16) ||
22393 N->getOpcode() != ISD::ZERO_EXTEND ||
22394 N->getOperand(Num: 0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
22395 return SDValue();
22396
22397 unsigned ExtOffset = N->getOperand(Num: 0).getConstantOperandVal(i: 1);
22398 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
22399 return SDValue();
22400
22401 EVT InVT = N->getOperand(Num: 0).getOperand(i: 0).getValueType();
22402 auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: 0).getOperand(i: 0));
22403 if (!Shuffle ||
22404 InVT.getVectorNumElements() != VT.getVectorNumElements() * 2 ||
22405 InVT.getScalarSizeInBits() * 2 != VT.getScalarSizeInBits())
22406 return SDValue();
22407
22408 unsigned Idx;
22409 bool IsDeInterleave = ShuffleVectorInst::isDeInterleaveMaskOfFactor(
22410 Mask: Shuffle->getMask().slice(N: ExtOffset, M: VT.getVectorNumElements()), Factor: 4, Index&: Idx);
22411 // An undef interleave shuffle can come up after other canonicalizations,
22412 // where the shuffle has been converted to
22413 // zext(extract(shuffle b, undef, [u,u,0,4]))
22414 bool IsUndefDeInterleave = false;
22415 if (!IsDeInterleave)
22416 IsUndefDeInterleave =
22417 Shuffle->getOperand(Num: 1).isUndef() &&
22418 all_of(
22419 Range: Shuffle->getMask().slice(N: ExtOffset, M: VT.getVectorNumElements() / 2),
22420 P: [](int M) { return M < 0; }) &&
22421 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
22422 Mask: Shuffle->getMask().slice(N: ExtOffset + VT.getVectorNumElements() / 2,
22423 M: VT.getVectorNumElements() / 2),
22424 Factor: 4, Index&: Idx);
22425 if ((!IsDeInterleave && !IsUndefDeInterleave) || Idx >= 4)
22426 return SDValue();
22427 SDLoc DL(N);
22428 SDValue BC1 = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT,
22429 Operand: Shuffle->getOperand(Num: IsUndefDeInterleave ? 1 : 0));
22430 SDValue BC2 = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT,
22431 Operand: Shuffle->getOperand(Num: IsUndefDeInterleave ? 0 : 1));
22432 SDValue UZP = DAG.getNode(Opcode: Idx < 2 ? AArch64ISD::UZP1 : AArch64ISD::UZP2, DL,
22433 VT, N1: BC1, N2: BC2);
22434 if ((Idx & 1) == 1)
22435 UZP = DAG.getNode(Opcode: ISD::SRL, DL, VT, N1: UZP,
22436 N2: DAG.getConstant(Val: InVT.getScalarSizeInBits(), DL, VT));
22437 return DAG.getNode(
22438 Opcode: ISD::AND, DL, VT, N1: UZP,
22439 N2: DAG.getConstant(Val: (1 << InVT.getScalarSizeInBits()) - 1, DL, VT));
22440}
22441
22442// This comes up similarly to the above when lowering deinterleaving shuffles
22443// from zexts. We have legalized the operations in the general case to
22444// zext(extract_subvector(uzp(a, b))), which can be converted to and(a, mask) if
22445// the extract is to the low half and the uzp is uzp1. There would be an extra
22446// shift if the uzp was uzp2 to grab the upper half. Due to the combine above
22447// there could also be an existing and / shift that can be combined in, either
22448// before or after the extract.
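// One accepted shape, shown here only as an illustration:
//   zext(v4i16 extract_subvector(uzp1(v8i16 x, v8i16 y), 0)) : v4i32
// becomes
//   and(nvcast(x) : v4i32, 0xffff)
// while a uzp2 source needs an extra right shift by the narrow element width
// to move the selected high halves down first.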
22449static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) {
22450 EVT VT = N->getValueType(ResNo: 0);
22451 if (N->getOpcode() != ISD::ZERO_EXTEND ||
22452 (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16))
22453 return SDValue();
22454
22455 SDValue Op = N->getOperand(Num: 0);
22456 unsigned ExtOffset = (unsigned)-1;
22457 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22458 ExtOffset = Op.getConstantOperandVal(i: 1);
22459 Op = Op.getOperand(i: 0);
22460 }
22461
22462 unsigned Shift = 0;
22463 APInt Mask = APInt::getLowBitsSet(numBits: VT.getScalarSizeInBits(),
22464 loBitsSet: Op.getValueType().getScalarSizeInBits());
22465
22466 if (Op.getOpcode() == AArch64ISD::VLSHR) {
22467 Shift = Op.getConstantOperandVal(i: 1);
22468 Op = Op.getOperand(i: 0);
22469 Mask = Mask.lshr(shiftAmt: Shift);
22470 }
22471 if (Op.getOpcode() == ISD::AND &&
22472 ISD::isConstantSplatVector(N: Op.getOperand(i: 1).getNode(), SplatValue&: Mask)) {
22473 Op = Op.getOperand(i: 0);
22474 Mask = Mask.zext(width: VT.getScalarSizeInBits());
22475 } else if (Op.getOpcode() == AArch64ISD::BICi) {
22476 Mask = ~APInt(Op.getValueType().getScalarSizeInBits(),
22477 Op.getConstantOperandVal(i: 1) << Op.getConstantOperandVal(i: 2));
22478 Mask = Mask.zext(width: VT.getScalarSizeInBits());
22479 Op = Op.getOperand(i: 0);
22480 }
22481
22482 if (ExtOffset == (unsigned)-1) {
22483 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22484 ExtOffset = Op.getConstantOperandVal(i: 1);
22485 Op = Op.getOperand(i: 0);
22486 } else
22487 return SDValue();
22488 }
22489 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
22490 return SDValue();
22491
22492 if (Op.getOpcode() != AArch64ISD::UZP1 && Op.getOpcode() != AArch64ISD::UZP2)
22493 return SDValue();
22494 if (Op.getOpcode() == AArch64ISD::UZP2)
22495 Shift += VT.getScalarSizeInBits() / 2;
22496
22497 SDLoc DL(N);
22498 SDValue BC = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT,
22499 Operand: Op.getOperand(i: ExtOffset == 0 ? 0 : 1));
22500 if (Shift != 0)
22501 BC = DAG.getNode(Opcode: AArch64ISD::VLSHR, DL, VT, N1: BC,
22502 N2: DAG.getConstant(Val: Shift, DL, VT: MVT::i32));
22503 return DAG.getNode(Opcode: ISD::AND, DL, VT, N1: BC, N2: DAG.getConstant(Val: Mask, DL, VT));
22504}
22505
22506static SDValue performExtendCombine(SDNode *N,
22507 TargetLowering::DAGCombinerInfo &DCI,
22508 SelectionDAG &DAG) {
22509 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
22510 // we can convert that DUP into another extract_high (of a bigger DUP), which
22511 // helps the backend to decide that an sabdl2 would be useful, saving a real
22512 // extract_high operation.
22513 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
22514 N->getOperand(Num: 0).getValueType().is64BitVector() &&
22515 (N->getOperand(Num: 0).getOpcode() == ISD::ABDU ||
22516 N->getOperand(Num: 0).getOpcode() == ISD::ABDS)) {
22517 SDNode *ABDNode = N->getOperand(Num: 0).getNode();
22518 SDValue NewABD =
22519 tryCombineLongOpWithDup(IID: Intrinsic::not_intrinsic, N: ABDNode, DCI, DAG);
22520 if (!NewABD.getNode())
22521 return SDValue();
22522
22523 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: NewABD);
22524 }
22525
22526 if (SDValue R = performZExtDeinterleaveShuffleCombine(N, DAG))
22527 return R;
22528 if (SDValue R = performZExtUZPCombine(N, DAG))
22529 return R;
22530
22531 if (N->getValueType(ResNo: 0).isFixedLengthVector() &&
22532 N->getOpcode() == ISD::SIGN_EXTEND &&
22533 N->getOperand(Num: 0)->getOpcode() == ISD::SETCC)
22534 return performSignExtendSetCCCombine(N, DCI, DAG);
22535
22536 // If we see (any_extend (bswap ...)) with bswap returning an i16, we know
22537 // that the top half of the result register must be unused, due to the
22538 // any_extend. This means that we can replace this pattern with (rev16
22539 // (any_extend ...)). This saves a machine instruction compared to (lsr (rev
22540 // ...)), which is what this pattern would otherwise be lowered to.
22541  // Only apply this optimisation if the any_extend in the original pattern
22542  // extends to i32 or i64, because this type will become the input type to
22543  // REV16 in the new pattern, so it must be a legitimate REV16 input type.
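  // Hand-written illustration of the difference (not compiler output): the
  // original pattern tends to lower to "rev w8, w8; lsr w8, w8, #16", whereas
  // "rev16 w8, w8" alone produces the same low 16 bits and leaves the dead top
  // half unspecified.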
22544 SDValue Bswap = N->getOperand(Num: 0);
22545 if (N->getOpcode() == ISD::ANY_EXTEND && Bswap.getOpcode() == ISD::BSWAP &&
22546 Bswap.getValueType() == MVT::i16 &&
22547 (N->getValueType(ResNo: 0) == MVT::i32 || N->getValueType(ResNo: 0) == MVT::i64)) {
22548 SDLoc DL(N);
22549 SDValue NewAnyExtend = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: N->getValueType(ResNo: 0),
22550 Operand: Bswap->getOperand(Num: 0));
22551 return DAG.getNode(Opcode: AArch64ISD::REV16, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22552 Operand: NewAnyExtend);
22553 }
22554
22555 return SDValue();
22556}
22557
22558static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
22559 SDValue SplatVal, unsigned NumVecElts) {
22560 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
22561 Align OrigAlignment = St.getAlign();
22562 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
22563
22564 // Create scalar stores. This is at least as good as the code sequence for a
22565 // split unaligned store which is a dup.s, ext.b, and two stores.
22566 // Most of the time the three stores should be replaced by store pair
22567 // instructions (stp).
22568 SDLoc DL(&St);
22569 SDValue BasePtr = St.getBasePtr();
22570 uint64_t BaseOffset = 0;
22571
22572 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
22573 SDValue NewST1 =
22574 DAG.getStore(Chain: St.getChain(), dl: DL, Val: SplatVal, Ptr: BasePtr, PtrInfo,
22575 Alignment: OrigAlignment, MMOFlags: St.getMemOperand()->getFlags());
22576
22577  // As this is in ISel, we will not merge this add, which may degrade results.
22578 if (BasePtr->getOpcode() == ISD::ADD &&
22579 isa<ConstantSDNode>(Val: BasePtr->getOperand(Num: 1))) {
22580 BaseOffset = cast<ConstantSDNode>(Val: BasePtr->getOperand(Num: 1))->getSExtValue();
22581 BasePtr = BasePtr->getOperand(Num: 0);
22582 }
22583
22584 unsigned Offset = EltOffset;
22585 while (--NumVecElts) {
22586 Align Alignment = commonAlignment(A: OrigAlignment, Offset);
22587 SDValue OffsetPtr =
22588 DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: BasePtr,
22589 N2: DAG.getConstant(Val: BaseOffset + Offset, DL, VT: MVT::i64));
22590 NewST1 = DAG.getStore(Chain: NewST1.getValue(R: 0), dl: DL, Val: SplatVal, Ptr: OffsetPtr,
22591 PtrInfo: PtrInfo.getWithOffset(O: Offset), Alignment,
22592 MMOFlags: St.getMemOperand()->getFlags());
22593 Offset += EltOffset;
22594 }
22595 return NewST1;
22596}
22597
22598// Returns an SVE type that ContentTy can be trivially sign or zero extended
22599// into.
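// For example, nxv2i16 and nxv2f32 both map to the nxv2i64 container, and
// nxv8i8 maps to nxv8i16: the element count is preserved and only the element
// width grows to fill a full SVE register.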
22600static MVT getSVEContainerType(EVT ContentTy) {
22601 assert(ContentTy.isSimple() && "No SVE containers for extended types");
22602
22603 switch (ContentTy.getSimpleVT().SimpleTy) {
22604 default:
22605 llvm_unreachable("No known SVE container for this MVT type");
22606 case MVT::nxv2i8:
22607 case MVT::nxv2i16:
22608 case MVT::nxv2i32:
22609 case MVT::nxv2i64:
22610 case MVT::nxv2f32:
22611 case MVT::nxv2f64:
22612 return MVT::nxv2i64;
22613 case MVT::nxv4i8:
22614 case MVT::nxv4i16:
22615 case MVT::nxv4i32:
22616 case MVT::nxv4f32:
22617 return MVT::nxv4i32;
22618 case MVT::nxv8i8:
22619 case MVT::nxv8i16:
22620 case MVT::nxv8f16:
22621 case MVT::nxv8bf16:
22622 return MVT::nxv8i16;
22623 case MVT::nxv16i8:
22624 return MVT::nxv16i8;
22625 }
22626}
22627
22628static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
22629 SDLoc DL(N);
22630 EVT VT = N->getValueType(ResNo: 0);
22631
22632 if (VT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
22633 return SDValue();
22634
22635 EVT ContainerVT = VT;
22636 if (ContainerVT.isInteger())
22637 ContainerVT = getSVEContainerType(ContentTy: ContainerVT);
22638
22639 SDVTList VTs = DAG.getVTList(VT1: ContainerVT, VT2: MVT::Other);
22640 SDValue Ops[] = { N->getOperand(Num: 0), // Chain
22641 N->getOperand(Num: 2), // Pg
22642 N->getOperand(Num: 3), // Base
22643 DAG.getValueType(VT) };
22644
22645 SDValue Load = DAG.getNode(Opcode: Opc, DL, VTList: VTs, Ops);
22646 SDValue LoadChain = SDValue(Load.getNode(), 1);
22647
22648 if (ContainerVT.isInteger() && (VT != ContainerVT))
22649 Load = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Load.getValue(R: 0));
22650
22651 return DAG.getMergeValues(Ops: { Load, LoadChain }, dl: DL);
22652}
22653
22654static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
22655 SDLoc DL(N);
22656 EVT VT = N->getValueType(ResNo: 0);
22657 EVT PtrTy = N->getOperand(Num: 3).getValueType();
22658
22659 EVT LoadVT = VT;
22660 if (VT.isFloatingPoint())
22661 LoadVT = VT.changeTypeToInteger();
22662
22663 auto *MINode = cast<MemIntrinsicSDNode>(Val: N);
22664 SDValue PassThru = DAG.getConstant(Val: 0, DL, VT: LoadVT);
22665 SDValue L = DAG.getMaskedLoad(VT: LoadVT, dl: DL, Chain: MINode->getChain(),
22666 Base: MINode->getOperand(Num: 3), Offset: DAG.getUNDEF(VT: PtrTy),
22667 Mask: MINode->getOperand(Num: 2), Src0: PassThru,
22668 MemVT: MINode->getMemoryVT(), MMO: MINode->getMemOperand(),
22669 AM: ISD::UNINDEXED, ISD::NON_EXTLOAD, IsExpanding: false);
22670
22671 if (VT.isFloatingPoint()) {
22672 SDValue Ops[] = { DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: L), L.getValue(R: 1) };
22673 return DAG.getMergeValues(Ops, dl: DL);
22674 }
22675
22676 return L;
22677}
22678
22679template <unsigned Opcode>
22680static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
22681 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
22682 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
22683 "Unsupported opcode.");
22684 SDLoc DL(N);
22685 EVT VT = N->getValueType(ResNo: 0);
22686
22687 EVT LoadVT = VT;
22688 if (VT.isFloatingPoint())
22689 LoadVT = VT.changeTypeToInteger();
22690
22691 SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 2), N->getOperand(Num: 3)};
22692 SDValue Load = DAG.getNode(Opcode, DL, ResultTys: {LoadVT, MVT::Other}, Ops);
22693 SDValue LoadChain = SDValue(Load.getNode(), 1);
22694
22695 if (VT.isFloatingPoint())
22696 Load = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Load.getValue(R: 0));
22697
22698 return DAG.getMergeValues(Ops: {Load, LoadChain}, dl: DL);
22699}
22700
22701static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
22702 SDLoc DL(N);
22703 SDValue Data = N->getOperand(Num: 2);
22704 EVT DataVT = Data.getValueType();
22705 EVT HwSrcVt = getSVEContainerType(ContentTy: DataVT);
22706 SDValue InputVT = DAG.getValueType(DataVT);
22707
22708 if (DataVT.isFloatingPoint())
22709 InputVT = DAG.getValueType(HwSrcVt);
22710
22711 SDValue SrcNew;
22712 if (Data.getValueType().isFloatingPoint())
22713 SrcNew = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: HwSrcVt, Operand: Data);
22714 else
22715 SrcNew = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: HwSrcVt, Operand: Data);
22716
22717 SDValue Ops[] = { N->getOperand(Num: 0), // Chain
22718 SrcNew,
22719 N->getOperand(Num: 4), // Base
22720 N->getOperand(Num: 3), // Pg
22721 InputVT
22722 };
22723
22724 return DAG.getNode(Opcode: AArch64ISD::ST1_PRED, DL, VT: N->getValueType(ResNo: 0), Ops);
22725}
22726
22727static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
22728 SDLoc DL(N);
22729
22730 SDValue Data = N->getOperand(Num: 2);
22731 EVT DataVT = Data.getValueType();
22732 EVT PtrTy = N->getOperand(Num: 4).getValueType();
22733
22734 if (DataVT.isFloatingPoint())
22735 Data = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: DataVT.changeTypeToInteger(), Operand: Data);
22736
22737 auto *MINode = cast<MemIntrinsicSDNode>(Val: N);
22738 return DAG.getMaskedStore(Chain: MINode->getChain(), dl: DL, Val: Data, Base: MINode->getOperand(Num: 4),
22739 Offset: DAG.getUNDEF(VT: PtrTy), Mask: MINode->getOperand(Num: 3),
22740 MemVT: MINode->getMemoryVT(), MMO: MINode->getMemOperand(),
22741 AM: ISD::UNINDEXED, IsTruncating: false, IsCompressing: false);
22742}
22743
22744/// Replace a vector store of a zero splat by scalar stores of WZR/XZR. The
22745/// load/store optimizer pass will merge them into store pair instructions. This
22746/// should be better than a movi to create the vector zero followed by a vector
22747/// store if the zero constant is not re-used, since one instruction and one
22748/// register live range will be removed.
22749///
22750/// For example, the final generated code should be:
22751///
22752/// stp xzr, xzr, [x0]
22753///
22754/// instead of:
22755///
22756/// movi v0.2d, #0
22757/// str q0, [x0]
22758///
22759static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
22760 SDValue StVal = St.getValue();
22761 EVT VT = StVal.getValueType();
22762
22763 // Avoid scalarizing zero splat stores for scalable vectors.
22764 if (VT.isScalableVector())
22765 return SDValue();
22766
22767 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
22768 // 2, 3 or 4 i32 elements.
22769 int NumVecElts = VT.getVectorNumElements();
22770 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
22771 VT.getVectorElementType().getSizeInBits() == 64) ||
22772 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
22773 VT.getVectorElementType().getSizeInBits() == 32)))
22774 return SDValue();
22775
22776 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
22777 return SDValue();
22778
22779 // If the zero constant has more than one use then the vector store could be
22780 // better since the constant mov will be amortized and stp q instructions
22781 // should be able to be formed.
22782 if (!StVal.hasOneUse())
22783 return SDValue();
22784
22785 // If the store is truncating then it's going down to i16 or smaller, which
22786 // means it can be implemented in a single store anyway.
22787 if (St.isTruncatingStore())
22788 return SDValue();
22789
22790 // If the immediate offset of the address operand is too large for the stp
22791 // instruction, then bail out.
22792 if (DAG.isBaseWithConstantOffset(Op: St.getBasePtr())) {
22793 int64_t Offset = St.getBasePtr()->getConstantOperandVal(Num: 1);
22794 if (Offset < -512 || Offset > 504)
22795 return SDValue();
22796 }
22797
22798 for (int I = 0; I < NumVecElts; ++I) {
22799 SDValue EltVal = StVal.getOperand(i: I);
22800 if (!isNullConstant(V: EltVal) && !isNullFPConstant(V: EltVal))
22801 return SDValue();
22802 }
22803
22804 // Use a CopyFromReg WZR/XZR here to prevent
22805 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
22806 SDLoc DL(&St);
22807 unsigned ZeroReg;
22808 EVT ZeroVT;
22809 if (VT.getVectorElementType().getSizeInBits() == 32) {
22810 ZeroReg = AArch64::WZR;
22811 ZeroVT = MVT::i32;
22812 } else {
22813 ZeroReg = AArch64::XZR;
22814 ZeroVT = MVT::i64;
22815 }
22816 SDValue SplatVal =
22817 DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: ZeroReg, VT: ZeroVT);
22818 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
22819}
22820
22821/// Replace a vector store of a splatted scalar by scalar stores of the scalar
22822/// value. The load/store optimizer pass will merge them into store pair
22823/// instructions. This has better performance than a splat of the scalar
22824/// followed by a split vector store. Even if the stores are not merged it is
22825/// four stores vs. a dup, followed by an ext.b and two stores.
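/// For example (illustrative only), storing a v4i32 splat of w0 becomes four
/// "str w0" stores at consecutive offsets, which the load/store optimizer
/// normally merges into two "stp w0, w0" instructions.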
22826static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
22827 SDValue StVal = St.getValue();
22828 EVT VT = StVal.getValueType();
22829
22830  // Don't replace floating point stores; they possibly won't be transformed to
22831  // stp because of the store pair suppress pass.
22832 if (VT.isFloatingPoint())
22833 return SDValue();
22834
22835 // We can express a splat as store pair(s) for 2 or 4 elements.
22836 unsigned NumVecElts = VT.getVectorNumElements();
22837 if (NumVecElts != 4 && NumVecElts != 2)
22838 return SDValue();
22839
22840 // If the store is truncating then it's going down to i16 or smaller, which
22841 // means it can be implemented in a single store anyway.
22842 if (St.isTruncatingStore())
22843 return SDValue();
22844
22845 // Check that this is a splat.
22846  // Make sure that each of the relevant vector element locations is inserted
22847  // to, i.e. 0 and 1 for v2i64 and 0, 1, 2 and 3 for v4i32.
22848 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
22849 SDValue SplatVal;
22850 for (unsigned I = 0; I < NumVecElts; ++I) {
22851 // Check for insert vector elements.
22852 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
22853 return SDValue();
22854
22855 // Check that same value is inserted at each vector element.
22856 if (I == 0)
22857 SplatVal = StVal.getOperand(i: 1);
22858 else if (StVal.getOperand(i: 1) != SplatVal)
22859 return SDValue();
22860
22861 // Check insert element index.
22862 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(Val: StVal.getOperand(i: 2));
22863 if (!CIndex)
22864 return SDValue();
22865 uint64_t IndexVal = CIndex->getZExtValue();
22866 if (IndexVal >= NumVecElts)
22867 return SDValue();
22868 IndexNotInserted.reset(position: IndexVal);
22869
22870 StVal = StVal.getOperand(i: 0);
22871 }
22872 // Check that all vector element locations were inserted to.
22873 if (IndexNotInserted.any())
22874 return SDValue();
22875
22876 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
22877}
22878
22879static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
22880 SelectionDAG &DAG,
22881 const AArch64Subtarget *Subtarget) {
22882
22883 StoreSDNode *S = cast<StoreSDNode>(Val: N);
22884 if (S->isVolatile() || S->isIndexed())
22885 return SDValue();
22886
22887 SDValue StVal = S->getValue();
22888 EVT VT = StVal.getValueType();
22889
22890 if (!VT.isFixedLengthVector())
22891 return SDValue();
22892
22893 // If we get a splat of zeros, convert this vector store to a store of
22894 // scalars. They will be merged into store pairs of xzr thereby removing one
22895 // instruction and one register.
22896 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, St&: *S))
22897 return ReplacedZeroSplat;
22898
22899 // FIXME: The logic for deciding if an unaligned store should be split should
22900 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
22901 // a call to that function here.
22902
22903 if (!Subtarget->isMisaligned128StoreSlow())
22904 return SDValue();
22905
22906 // Don't split at -Oz.
22907 if (DAG.getMachineFunction().getFunction().hasMinSize())
22908 return SDValue();
22909
22910 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
22911 // those up regresses performance on micro-benchmarks and olden/bh.
22912 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
22913 return SDValue();
22914
22915 // Split unaligned 16B stores. They are terrible for performance.
22916 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
22917 // extensions can use this to mark that it does not want splitting to happen
22918 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
22919 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
22920 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
22921 S->getAlign() <= Align(2))
22922 return SDValue();
22923
22924 // If we get a splat of a scalar convert this vector store to a store of
22925 // scalars. They will be merged into store pairs thereby removing two
22926 // instructions.
22927 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, St&: *S))
22928 return ReplacedSplat;
22929
22930 SDLoc DL(S);
22931
22932 // Split VT into two.
22933 EVT HalfVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
22934 unsigned NumElts = HalfVT.getVectorNumElements();
22935 SDValue SubVector0 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: StVal,
22936 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
22937 SDValue SubVector1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: StVal,
22938 N2: DAG.getConstant(Val: NumElts, DL, VT: MVT::i64));
22939 SDValue BasePtr = S->getBasePtr();
22940 SDValue NewST1 =
22941 DAG.getStore(Chain: S->getChain(), dl: DL, Val: SubVector0, Ptr: BasePtr, PtrInfo: S->getPointerInfo(),
22942 Alignment: S->getAlign(), MMOFlags: S->getMemOperand()->getFlags());
22943 SDValue OffsetPtr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: BasePtr,
22944 N2: DAG.getConstant(Val: 8, DL, VT: MVT::i64));
22945 return DAG.getStore(Chain: NewST1.getValue(R: 0), dl: DL, Val: SubVector1, Ptr: OffsetPtr,
22946 PtrInfo: S->getPointerInfo(), Alignment: S->getAlign(),
22947 MMOFlags: S->getMemOperand()->getFlags());
22948}
22949
22950static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
22951 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
22952
22953 // splice(pg, op1, undef) -> op1
22954 if (N->getOperand(Num: 2).isUndef())
22955 return N->getOperand(Num: 1);
22956
22957 return SDValue();
22958}
22959
22960static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
22961 const AArch64Subtarget *Subtarget) {
22962 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
22963 N->getOpcode() == AArch64ISD::UUNPKLO) &&
22964 "Unexpected Opcode!");
22965
22966 // uunpklo/hi undef -> undef
22967 if (N->getOperand(Num: 0).isUndef())
22968 return DAG.getUNDEF(VT: N->getValueType(ResNo: 0));
22969
22970 // If this is a masked load followed by an UUNPKLO, fold this into a masked
22971 // extending load. We can do this even if this is already a masked
22972 // {z,}extload.
22973 if (N->getOperand(Num: 0).getOpcode() == ISD::MLOAD &&
22974 N->getOpcode() == AArch64ISD::UUNPKLO) {
22975 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(Val: N->getOperand(Num: 0));
22976 SDValue Mask = MLD->getMask();
22977 SDLoc DL(N);
22978
22979 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
22980 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
22981 (MLD->getPassThru()->isUndef() ||
22982 isZerosVector(N: MLD->getPassThru().getNode()))) {
22983 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22984 unsigned PgPattern = Mask->getConstantOperandVal(Num: 0);
22985 EVT VT = N->getValueType(ResNo: 0);
22986
22987 // Ensure we can double the size of the predicate pattern
22988 unsigned NumElts = getNumElementsFromSVEPredPattern(Pattern: PgPattern);
22989 if (NumElts &&
22990 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
22991 Mask =
22992 getPTrue(DAG, DL, VT: VT.changeVectorElementType(EltVT: MVT::i1), Pattern: PgPattern);
22993 SDValue PassThru = DAG.getConstant(Val: 0, DL, VT);
22994 SDValue NewLoad = DAG.getMaskedLoad(
22995 VT, dl: DL, Chain: MLD->getChain(), Base: MLD->getBasePtr(), Offset: MLD->getOffset(), Mask,
22996 Src0: PassThru, MemVT: MLD->getMemoryVT(), MMO: MLD->getMemOperand(),
22997 AM: MLD->getAddressingMode(), ISD::ZEXTLOAD);
22998
22999 DAG.ReplaceAllUsesOfValueWith(From: SDValue(MLD, 1), To: NewLoad.getValue(R: 1));
23000
23001 return NewLoad;
23002 }
23003 }
23004 }
23005
23006 return SDValue();
23007}
23008
23009static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N) {
23010 if (N->getOpcode() != AArch64ISD::UZP1)
23011 return false;
23012 SDValue Op0 = N->getOperand(Num: 0);
23013 EVT SrcVT = Op0->getValueType(ResNo: 0);
23014 EVT DstVT = N->getValueType(ResNo: 0);
23015 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
23016 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
23017 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
23018}
23019
23020// Try to combine rounding shifts where the operands come from an extend, and
23021// the result is truncated and combined into one vector.
23022// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
23023static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
23024 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
23025 SDValue Op0 = N->getOperand(Num: 0);
23026 SDValue Op1 = N->getOperand(Num: 1);
23027 EVT ResVT = N->getValueType(ResNo: 0);
23028
23029 unsigned RshOpc = Op0.getOpcode();
23030 if (RshOpc != AArch64ISD::RSHRNB_I)
23031 return SDValue();
23032
23033 // Same op code and imm value?
23034 SDValue ShiftValue = Op0.getOperand(i: 1);
23035 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(i: 1))
23036 return SDValue();
23037
23038 // Same unextended operand value?
23039 SDValue Lo = Op0.getOperand(i: 0);
23040 SDValue Hi = Op1.getOperand(i: 0);
23041 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
23042 Hi.getOpcode() != AArch64ISD::UUNPKHI)
23043 return SDValue();
23044 SDValue OrigArg = Lo.getOperand(i: 0);
23045 if (OrigArg != Hi.getOperand(i: 0))
23046 return SDValue();
23047
23048 SDLoc DL(N);
23049 return DAG.getNode(Opcode: AArch64ISD::URSHR_I_PRED, DL, VT: ResVT,
23050 N1: getPredicateForVector(DAG, DL, VT: ResVT), N2: OrigArg,
23051 N3: ShiftValue);
23052}
23053
23054// Try to simplify:
23055// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
23056// t2 = nxv8i16 srl(t1, ShiftValue)
23057// to
23058// t1 = nxv8i16 rshrnb(X, ShiftValue).
23059// rshrnb will zero the top half bits of each element. Therefore, this combine
23060// should only be performed when a following instruction with the rshrnb
23061// as an operand does not care about the top half of each element. For example,
23062// a uzp1 or a truncating store.
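// A concrete instance, assuming ShiftValue == 4 on nxv8i16:
//   srl(add(X, 8), 4)  ==>  nvcast(rshrnb(X, 4))
// which computes (X + 8) >> 4 into the bottom byte of each 16-bit element and
// zeroes the top byte, hence the restriction to users that ignore that half.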
23063static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
23064 const AArch64Subtarget *Subtarget) {
23065 EVT VT = Srl->getValueType(ResNo: 0);
23066 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
23067 return SDValue();
23068
23069 EVT ResVT;
23070 if (VT == MVT::nxv8i16)
23071 ResVT = MVT::nxv16i8;
23072 else if (VT == MVT::nxv4i32)
23073 ResVT = MVT::nxv8i16;
23074 else if (VT == MVT::nxv2i64)
23075 ResVT = MVT::nxv4i32;
23076 else
23077 return SDValue();
23078
23079 SDLoc DL(Srl);
23080 unsigned ShiftValue;
23081 SDValue RShOperand;
23082 if (!canLowerSRLToRoundingShiftForVT(Shift: Srl, ResVT, DAG, ShiftValue, RShOperand))
23083 return SDValue();
23084 SDValue Rshrnb = DAG.getNode(
23085 Opcode: AArch64ISD::RSHRNB_I, DL, VT: ResVT,
23086 Ops: {RShOperand, DAG.getTargetConstant(Val: ShiftValue, DL, VT: MVT::i32)});
23087 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: Rshrnb);
23088}
23089
23090static SDValue isNVCastToHalfWidthElements(SDValue V) {
23091 if (V.getOpcode() != AArch64ISD::NVCAST)
23092 return SDValue();
23093
23094 SDValue Op = V.getOperand(i: 0);
23095 if (!Op.getValueType().isVector() ||
23096 V.getValueType().getVectorElementCount() !=
23097 Op.getValueType().getVectorElementCount() * 2)
23098 return SDValue();
23099
23100 return Op;
23101}
23102
23103static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
23104 const AArch64Subtarget *Subtarget) {
23105 SDLoc DL(N);
23106 SDValue Op0 = N->getOperand(Num: 0);
23107 SDValue Op1 = N->getOperand(Num: 1);
23108 EVT ResVT = N->getValueType(ResNo: 0);
23109
23110 // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
23111 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23112 Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23113 Op0.getOperand(i: 0) == Op1.getOperand(i: 0)) {
23114
23115 SDValue SourceVec = Op0.getOperand(i: 0);
23116 uint64_t ExtIdx0 = Op0.getConstantOperandVal(i: 1);
23117 uint64_t ExtIdx1 = Op1.getConstantOperandVal(i: 1);
23118 uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
23119 if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
23120 EVT OpVT = Op0.getOperand(i: 1).getValueType();
23121 EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
23122 SDValue Uzp = DAG.getNode(Opcode: N->getOpcode(), DL, VT: WidenedResVT, N1: SourceVec,
23123 N2: DAG.getUNDEF(VT: WidenedResVT));
23124 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: ResVT, N1: Uzp,
23125 N2: DAG.getConstant(Val: 0, DL, VT: OpVT));
23126 }
23127 }
23128
23129 // Following optimizations only work with uzp1.
23130 if (N->getOpcode() == AArch64ISD::UZP2)
23131 return SDValue();
23132
23133 // uzp1(x, undef) -> concat(truncate(x), undef)
23134 if (Op1.getOpcode() == ISD::UNDEF) {
23135 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
23136 switch (ResVT.getSimpleVT().SimpleTy) {
23137 default:
23138 break;
23139 case MVT::v16i8:
23140 BCVT = MVT::v8i16;
23141 HalfVT = MVT::v8i8;
23142 break;
23143 case MVT::v8i16:
23144 BCVT = MVT::v4i32;
23145 HalfVT = MVT::v4i16;
23146 break;
23147 case MVT::v4i32:
23148 BCVT = MVT::v2i64;
23149 HalfVT = MVT::v2i32;
23150 break;
23151 }
23152 if (BCVT != MVT::Other) {
23153 SDValue BC = DAG.getBitcast(VT: BCVT, V: Op0);
23154 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: HalfVT, Operand: BC);
23155 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: ResVT, N1: Trunc,
23156 N2: DAG.getUNDEF(VT: HalfVT));
23157 }
23158 }
23159
23160 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
23161 return Urshr;
23162
23163 if (SDValue PreCast = isNVCastToHalfWidthElements(V: Op0)) {
23164 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Srl: PreCast, DAG, Subtarget)) {
23165 Rshrnb = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: ResVT, Operand: Rshrnb);
23166 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Rshrnb, N2: Op1);
23167 }
23168 }
23169
23170 if (SDValue PreCast = isNVCastToHalfWidthElements(V: Op1)) {
23171 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Srl: PreCast, DAG, Subtarget)) {
23172 Rshrnb = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: ResVT, Operand: Rshrnb);
23173 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Op0, N2: Rshrnb);
23174 }
23175 }
23176
23177 // uzp1<ty>(nvcast(unpklo(uzp1<ty>(x, y))), z) => uzp1<ty>(x, z)
23178 if (SDValue PreCast = isNVCastToHalfWidthElements(V: Op0)) {
23179 if (PreCast.getOpcode() == AArch64ISD::UUNPKLO) {
23180 if (PreCast.getOperand(i: 0).getOpcode() == AArch64ISD::UZP1) {
23181 SDValue X = PreCast.getOperand(i: 0).getOperand(i: 0);
23182 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: X, N2: Op1);
23183 }
23184 }
23185 }
23186
23187 // uzp1<ty>(x, nvcast(unpkhi(uzp1<ty>(y, z)))) => uzp1<ty>(x, z)
23188 if (SDValue PreCast = isNVCastToHalfWidthElements(V: Op1)) {
23189 if (PreCast.getOpcode() == AArch64ISD::UUNPKHI) {
23190 if (PreCast.getOperand(i: 0).getOpcode() == AArch64ISD::UZP1) {
23191 SDValue Z = PreCast.getOperand(i: 0).getOperand(i: 1);
23192 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Op0, N2: Z);
23193 }
23194 }
23195 }
23196
23197 // These optimizations only work on little endian.
23198 if (!DAG.getDataLayout().isLittleEndian())
23199 return SDValue();
23200
23201 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
23202 // Example:
23203 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
23204 // to
23205 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
23206 if (isHalvingTruncateAndConcatOfLegalIntScalableType(N) &&
23207 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
23208 if (Op0.getOperand(i: 0).getValueType() == Op1.getOperand(i: 0).getValueType()) {
23209 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Op0.getOperand(i: 0),
23210 N2: Op1.getOperand(i: 0));
23211 }
23212 }
23213
23214 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
23215 return SDValue();
23216
23217 SDValue SourceOp0 = peekThroughBitcasts(V: Op0);
23218 SDValue SourceOp1 = peekThroughBitcasts(V: Op1);
23219
23220 // truncating uzp1(x, y) -> xtn(concat (x, y))
23221 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
23222 EVT Op0Ty = SourceOp0.getValueType();
23223 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
23224 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
23225 SDValue Concat =
23226 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL,
23227 VT: Op0Ty.getDoubleNumVectorElementsVT(Context&: *DAG.getContext()),
23228 N1: SourceOp0, N2: SourceOp1);
23229 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ResVT, Operand: Concat);
23230 }
23231 }
23232
23233 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
23234 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
23235 SourceOp1.getOpcode() != ISD::TRUNCATE)
23236 return SDValue();
23237 SourceOp0 = SourceOp0.getOperand(i: 0);
23238 SourceOp1 = SourceOp1.getOperand(i: 0);
23239
23240 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
23241 !SourceOp0.getValueType().isSimple())
23242 return SDValue();
23243
23244 EVT ResultTy;
23245
23246 switch (SourceOp0.getSimpleValueType().SimpleTy) {
23247 case MVT::v2i64:
23248 ResultTy = MVT::v4i32;
23249 break;
23250 case MVT::v4i32:
23251 ResultTy = MVT::v8i16;
23252 break;
23253 case MVT::v8i16:
23254 ResultTy = MVT::v16i8;
23255 break;
23256 default:
23257 return SDValue();
23258 }
23259
23260 SDValue UzpOp0 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ResultTy, Operand: SourceOp0);
23261 SDValue UzpOp1 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ResultTy, Operand: SourceOp1);
23262 SDValue UzpResult =
23263 DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: UzpOp0.getValueType(), N1: UzpOp0, N2: UzpOp1);
23264
23265 EVT BitcastResultTy;
23266
23267 switch (ResVT.getSimpleVT().SimpleTy) {
23268 case MVT::v2i32:
23269 BitcastResultTy = MVT::v2i64;
23270 break;
23271 case MVT::v4i16:
23272 BitcastResultTy = MVT::v4i32;
23273 break;
23274 case MVT::v8i8:
23275 BitcastResultTy = MVT::v8i16;
23276 break;
23277 default:
23278 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
23279 }
23280
23281 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ResVT,
23282 Operand: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: BitcastResultTy, Operand: UzpResult));
23283}
23284
23285static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
23286 unsigned Opc = N->getOpcode();
23287
23288 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
23289 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23290 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
23291 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23292 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
23293 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
23294 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
23295 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
23296
23297 SDLoc DL(N);
23298 SDValue Chain = N->getOperand(Num: 0);
23299 SDValue Pg = N->getOperand(Num: 1);
23300 SDValue Base = N->getOperand(Num: 2);
23301 SDValue Offset = N->getOperand(Num: 3);
23302 SDValue Ty = N->getOperand(Num: 4);
23303
23304 EVT ResVT = N->getValueType(ResNo: 0);
23305
23306 const auto OffsetOpc = Offset.getOpcode();
23307 const bool OffsetIsZExt =
23308 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
23309 const bool OffsetIsSExt =
23310 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
23311
23312 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
23313 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
23314 SDValue ExtPg = Offset.getOperand(i: 0);
23315 VTSDNode *ExtFrom = cast<VTSDNode>(Val: Offset.getOperand(i: 2).getNode());
23316 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
23317
23318    // If the predicate for the sign- or zero-extended offset is the
23319    // same as the predicate used for this load and the sign-/zero-extension
23320    // was from a 32-bit integer...
23321 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
23322 SDValue UnextendedOffset = Offset.getOperand(i: 1);
23323
23324 unsigned NewOpc = getGatherVecOpcode(IsScaled: Scaled, IsSigned: OffsetIsSExt, NeedsExtend: true);
23325 if (Signed)
23326 NewOpc = getSignExtendedGatherOpcode(Opcode: NewOpc);
23327
23328 return DAG.getNode(Opcode: NewOpc, DL, ResultTys: {ResVT, MVT::Other},
23329 Ops: {Chain, Pg, Base, UnextendedOffset, Ty});
23330 }
23331 }
23332
23333 return SDValue();
23334}
23335
23336/// Optimize a vector shift instruction and its operand if shifted out
23337/// bits are not used.
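/// For example, (VASHR (VSHL x, #8), #8) on v8i16 is a sign_extend_inreg of the
/// low 8 bits and can be removed entirely when x already has more than 8 sign
/// bits. More generally, the low ShiftImm bits of a right-shift operand are not
/// demanded, which lets SimplifyDemandedBits clean up the producer.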
23338static SDValue performVectorShiftCombine(SDNode *N,
23339 const AArch64TargetLowering &TLI,
23340 TargetLowering::DAGCombinerInfo &DCI) {
23341 assert(N->getOpcode() == AArch64ISD::VASHR ||
23342 N->getOpcode() == AArch64ISD::VLSHR);
23343
23344 SDValue Op = N->getOperand(Num: 0);
23345 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
23346
23347 unsigned ShiftImm = N->getConstantOperandVal(Num: 1);
23348 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
23349
23350  // Remove sign_extend_inreg (ashr(shl(x))) based on the number of sign bits.
23351 if (N->getOpcode() == AArch64ISD::VASHR &&
23352 Op.getOpcode() == AArch64ISD::VSHL &&
23353 N->getOperand(Num: 1) == Op.getOperand(i: 1))
23354 if (DCI.DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0)) > ShiftImm)
23355 return Op.getOperand(i: 0);
23356
23357 // If the shift is exact, the shifted out bits matter.
23358 if (N->getFlags().hasExact())
23359 return SDValue();
23360
23361 APInt ShiftedOutBits = APInt::getLowBitsSet(numBits: OpScalarSize, loBitsSet: ShiftImm);
23362 APInt DemandedMask = ~ShiftedOutBits;
23363
23364 if (TLI.SimplifyDemandedBits(Op, DemandedBits: DemandedMask, DCI))
23365 return SDValue(N, 0);
23366
23367 return SDValue();
23368}
23369
23370static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
23371 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
23372 // This transform works in partnership with performSetCCPunpkCombine to
23373 // remove unnecessary transfer of predicates into standard registers and back
23374 if (N->getOperand(Num: 0).getOpcode() == ISD::SIGN_EXTEND &&
23375 N->getOperand(Num: 0)->getOperand(Num: 0)->getValueType(ResNo: 0).getScalarType() ==
23376 MVT::i1) {
23377 SDValue CC = N->getOperand(Num: 0)->getOperand(Num: 0);
23378 auto VT = CC->getValueType(ResNo: 0).getHalfNumVectorElementsVT(Context&: *DAG.getContext());
23379 SDValue Unpk = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SDLoc(N), VT, N1: CC,
23380 N2: DAG.getVectorIdxConstant(Val: 0, DL: SDLoc(N)));
23381 return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: Unpk);
23382 }
23383
23384 return SDValue();
23385}
23386
23387/// Target-specific DAG combine function for post-increment LD1 (lane) and
23388/// post-increment LD1R.
23389static SDValue performPostLD1Combine(SDNode *N,
23390 TargetLowering::DAGCombinerInfo &DCI,
23391 bool IsLaneOp) {
23392 if (DCI.isBeforeLegalizeOps())
23393 return SDValue();
23394
23395 SelectionDAG &DAG = DCI.DAG;
23396 EVT VT = N->getValueType(ResNo: 0);
23397
23398 if (!VT.is128BitVector() && !VT.is64BitVector())
23399 return SDValue();
23400
23401  // If it is not a LOAD, we cannot do such a combine.
23402 unsigned LoadIdx = IsLaneOp ? 1 : 0;
23403 LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N->getOperand(Num: LoadIdx).getNode());
23404 if (!LD)
23405 return SDValue();
23406
23407 // If the Generic combiner already helped form a pre- or post-indexed load,
23408 // skip forming one here.
23409 if (LD->isIndexed())
23410 return SDValue();
23411
23412 // The vector lane must be a constant in the LD1LANE opcode.
23413 SDValue Lane;
23414 if (IsLaneOp) {
23415 Lane = N->getOperand(Num: 2);
23416 auto *LaneC = dyn_cast<ConstantSDNode>(Val&: Lane);
23417 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
23418 return SDValue();
23419 if (LaneC->getZExtValue() == 0 && isNullOrNullSplat(V: N->getOperand(Num: 0)))
23420 return SDValue();
23421 }
23422
23423 LoadSDNode *LoadSDN = cast<LoadSDNode>(Val: LD);
23424 EVT MemVT = LoadSDN->getMemoryVT();
23425 // Check if memory operand is the same type as the vector element.
23426 if (MemVT != VT.getVectorElementType())
23427 return SDValue();
23428
23429 // Check if there are other uses. If so, do not combine as it will introduce
23430 // an extra load.
23431 for (SDUse &U : LD->uses()) {
23432 if (U.getResNo() == 1) // Ignore uses of the chain result.
23433 continue;
23434 if (U.getUser() != N)
23435 return SDValue();
23436 }
23437
23438 // If there is one use and it can splat the value, prefer that operation.
23439 // TODO: This could be expanded to more operations if they reliably use the
23440 // index variants.
23441 if (N->hasOneUse()) {
23442 unsigned UseOpc = N->user_begin()->getOpcode();
23443 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
23444 return SDValue();
23445 }
23446
23447 SDValue Addr = LD->getOperand(Num: 1);
23448 SDValue Vector = N->getOperand(Num: 0);
23449 // Search for a use of the address operand that is an increment.
23450 for (SDUse &Use : Addr->uses()) {
23451 SDNode *User = Use.getUser();
23452 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
23453 continue;
23454
23455 // If the increment is a constant, it must match the memory ref size.
23456 SDValue Inc = User->getOperand(Num: User->getOperand(Num: 0) == Addr ? 1 : 0);
23457 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Val: Inc.getNode())) {
23458 uint32_t IncVal = CInc->getZExtValue();
23459 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
23460 if (IncVal != NumBytes)
23461 continue;
23462 Inc = DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64);
23463 }
23464
23465    // To avoid cycle construction make sure that neither the load nor the add
23466    // is a predecessor of the other or of the Vector.
23467 SmallPtrSet<const SDNode *, 32> Visited;
23468 SmallVector<const SDNode *, 16> Worklist;
23469 Visited.insert(Ptr: Addr.getNode());
23470 Worklist.push_back(Elt: User);
23471 Worklist.push_back(Elt: LD);
23472 Worklist.push_back(Elt: Vector.getNode());
23473 if (SDNode::hasPredecessorHelper(N: LD, Visited, Worklist) ||
23474 SDNode::hasPredecessorHelper(N: User, Visited, Worklist))
23475 continue;
23476
23477 SmallVector<SDValue, 8> Ops;
23478 Ops.push_back(Elt: LD->getOperand(Num: 0)); // Chain
23479 if (IsLaneOp) {
23480 Ops.push_back(Elt: Vector); // The vector to be inserted
23481 Ops.push_back(Elt: Lane); // The lane to be inserted in the vector
23482 }
23483 Ops.push_back(Elt: Addr);
23484 Ops.push_back(Elt: Inc);
23485
23486 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
23487 SDVTList SDTys = DAG.getVTList(VTs: Tys);
23488 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
23489 SDValue UpdN = DAG.getMemIntrinsicNode(Opcode: NewOp, dl: SDLoc(N), VTList: SDTys, Ops,
23490 MemVT,
23491 MMO: LoadSDN->getMemOperand());
23492
23493 // Update the uses.
23494 SDValue NewResults[] = {
23495 SDValue(LD, 0), // The result of load
23496 SDValue(UpdN.getNode(), 2) // Chain
23497 };
23498 DCI.CombineTo(N: LD, To: NewResults);
23499 DCI.CombineTo(N, Res: SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
23500 DCI.CombineTo(N: User, Res: SDValue(UpdN.getNode(), 1)); // Write back register
23501
23502 break;
23503 }
23504 return SDValue();
23505}
23506
23507/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
23508/// address translation.
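/// For example, with top-byte-ignore only bits [55:0] of the address are
/// demanded, so an explicit mask such as "and x8, x8, #0x00ffffffffffffff"
/// feeding the address can usually be folded away.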
23509static bool performTBISimplification(SDValue Addr,
23510 TargetLowering::DAGCombinerInfo &DCI,
23511 SelectionDAG &DAG) {
23512 APInt DemandedMask = APInt::getLowBitsSet(numBits: 64, loBitsSet: 56);
23513 KnownBits Known;
23514 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
23515 !DCI.isBeforeLegalizeOps());
23516 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23517 if (TLI.SimplifyDemandedBits(Op: Addr, DemandedBits: DemandedMask, Known, TLO)) {
23518 DCI.CommitTargetLoweringOpt(TLO);
23519 return true;
23520 }
23521 return false;
23522}
23523
23524static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
23525 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
23526 "Expected STORE dag node in input!");
23527
23528 if (auto Store = dyn_cast<StoreSDNode>(Val: N)) {
23529 if (!Store->isTruncatingStore() || Store->isIndexed())
23530 return SDValue();
23531 SDValue Ext = Store->getValue();
23532 auto ExtOpCode = Ext.getOpcode();
23533 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
23534 ExtOpCode != ISD::ANY_EXTEND)
23535 return SDValue();
23536 SDValue Orig = Ext->getOperand(Num: 0);
23537 if (Store->getMemoryVT() != Orig.getValueType())
23538 return SDValue();
23539 return DAG.getStore(Chain: Store->getChain(), dl: SDLoc(Store), Val: Orig,
23540 Ptr: Store->getBasePtr(), MMO: Store->getMemOperand());
23541 }
23542
23543 return SDValue();
23544}
23545
23546// A custom combine to lower load <3 x i8> as the more efficient sequence
23547// below:
23548// ldrb wX, [x0, #2]
23549// ldrh wY, [x0]
23550// orr wX, wY, wX, lsl #16
23551// fmov s0, wX
23552//
23553// Note that an alternative sequence with even fewer (although usually more
23554// complex/expensive) instructions would be:
23555// ld1r.4h { v0 }, [x0], #2
23556// ld1.b { v0 }[2], [x0]
23557//
23558// Generating this sequence unfortunately results in noticeably worse codegen
23559// for code that extends the loaded v3i8, due to legalization breaking vector
23560// shuffle detection in a way that is very difficult to work around.
23561// TODO: Revisit once v3i8 legalization has been improved in general.
23562static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
23563 EVT MemVT = LD->getMemoryVT();
23564 if (MemVT != EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i8, NumElements: 3) ||
23565 LD->getBaseAlign() >= 4)
23566 return SDValue();
23567
23568 SDLoc DL(LD);
23569 MachineFunction &MF = DAG.getMachineFunction();
23570 SDValue Chain = LD->getChain();
23571 SDValue BasePtr = LD->getBasePtr();
23572 MachineMemOperand *MMO = LD->getMemOperand();
23573 assert(LD->getOffset().isUndef() && "undef offset expected");
23574
23575 // Load 2 x i8, then 1 x i8.
23576 SDValue L16 = DAG.getLoad(VT: MVT::i16, dl: DL, Chain, Ptr: BasePtr, MMO);
23577 TypeSize Offset2 = TypeSize::getFixed(ExactSize: 2);
23578 SDValue L8 = DAG.getLoad(VT: MVT::i8, dl: DL, Chain,
23579 Ptr: DAG.getMemBasePlusOffset(Base: BasePtr, Offset: Offset2, DL),
23580 MMO: MF.getMachineMemOperand(MMO, Offset: 2, Size: 1));
23581
23582 // Extend to i32.
23583 SDValue Ext16 = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: L16);
23584 SDValue Ext8 = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: L8);
23585
23586 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
23587 SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: Ext8,
23588 N2: DAG.getConstant(Val: 16, DL, VT: MVT::i32));
23589 SDValue Or = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Ext16, N2: Shl);
23590 SDValue Cast = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v4i8, Operand: Or);
23591
23592 // Extract v3i8 again.
23593 SDValue Extract = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MemVT, N1: Cast,
23594 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
23595 SDValue TokenFactor = DAG.getNode(
23596 Opcode: ISD::TokenFactor, DL, VT: MVT::Other,
23597 Ops: {SDValue(cast<SDNode>(Val&: L16), 1), SDValue(cast<SDNode>(Val&: L8), 1)});
23598 return DAG.getMergeValues(Ops: {Extract, TokenFactor}, dl: DL);
23599}
23600
23601// Perform TBI simplification if supported by the target and try to break up
23602// nontemporal loads larger than 256 bits for odd types, so that 256-bit LDNPQ
23603// load instructions can be selected.
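// For example (illustrative), a non-temporal load of <12 x i32> (384 bits) is
// split into a 256-bit <8 x i32> load at offset 0 plus a <4 x i32> load at
// offset 32 bytes; the remainder is inserted into an UNDEF <8 x i32>, the two
// pieces are concatenated, and the original <12 x i32> value is recovered via
// EXTRACT_SUBVECTOR at index 0.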
23604static SDValue performLOADCombine(SDNode *N,
23605 TargetLowering::DAGCombinerInfo &DCI,
23606 SelectionDAG &DAG,
23607 const AArch64Subtarget *Subtarget) {
23608 if (Subtarget->supportsAddressTopByteIgnored())
23609 performTBISimplification(Addr: N->getOperand(Num: 1), DCI, DAG);
23610
23611 LoadSDNode *LD = cast<LoadSDNode>(Val: N);
23612 EVT RegVT = LD->getValueType(ResNo: 0);
23613 EVT MemVT = LD->getMemoryVT();
23614 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23615 SDLoc DL(LD);
23616
23617 // Cast ptr32 and ptr64 pointers to the default address space before a load.
23618 unsigned AddrSpace = LD->getAddressSpace();
23619 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
23620 AddrSpace == ARM64AS::PTR32_UPTR) {
23621 MVT PtrVT = TLI.getPointerTy(DL: DAG.getDataLayout());
23622 if (PtrVT != LD->getBasePtr().getSimpleValueType()) {
23623 SDValue Cast =
23624 DAG.getAddrSpaceCast(dl: DL, VT: PtrVT, Ptr: LD->getBasePtr(), SrcAS: AddrSpace, DestAS: 0);
23625 return DAG.getExtLoad(ExtType: LD->getExtensionType(), dl: DL, VT: RegVT, Chain: LD->getChain(),
23626 Ptr: Cast, PtrInfo: LD->getPointerInfo(), MemVT,
23627 Alignment: LD->getBaseAlign(),
23628 MMOFlags: LD->getMemOperand()->getFlags());
23629 }
23630 }
23631
23632 if (LD->isVolatile() || !Subtarget->isLittleEndian())
23633 return SDValue(N, 0);
23634
23635 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
23636 return Res;
23637
23638 if (!LD->isNonTemporal())
23639 return SDValue(N, 0);
23640
23641 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
23642 MemVT.getSizeInBits() % 256 == 0 ||
23643 256 % MemVT.getScalarSizeInBits() != 0)
23644 return SDValue(N, 0);
23645
23646 SDValue Chain = LD->getChain();
23647 SDValue BasePtr = LD->getBasePtr();
23648 SDNodeFlags Flags = LD->getFlags();
23649 SmallVector<SDValue, 4> LoadOps;
23650 SmallVector<SDValue, 4> LoadOpsChain;
23651 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
23652 // and a scalar/vector load of less than 256 bits. This way we can utilize
23653 // 256-bit loads and reduce the number of load instructions generated.
23654 MVT NewVT =
23655 MVT::getVectorVT(VT: MemVT.getVectorElementType().getSimpleVT(),
23656 NumElements: 256 / MemVT.getVectorElementType().getSizeInBits());
23657 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
23658 // Create all 256-bit loads, starting from offset 0 up to (Num256Loads - 1) * 32.
23659 for (unsigned I = 0; I < Num256Loads; I++) {
23660 unsigned PtrOffset = I * 32;
23661 SDValue NewPtr = DAG.getMemBasePlusOffset(
23662 Base: BasePtr, Offset: TypeSize::getFixed(ExactSize: PtrOffset), DL, Flags);
23663 Align NewAlign = commonAlignment(A: LD->getAlign(), Offset: PtrOffset);
23664 SDValue NewLoad = DAG.getLoad(
23665 VT: NewVT, dl: DL, Chain, Ptr: NewPtr, PtrInfo: LD->getPointerInfo().getWithOffset(O: PtrOffset),
23666 Alignment: NewAlign, MMOFlags: LD->getMemOperand()->getFlags(), AAInfo: LD->getAAInfo());
23667 LoadOps.push_back(Elt: NewLoad);
23668 LoadOpsChain.push_back(Elt: SDValue(cast<SDNode>(Val&: NewLoad), 1));
23669 }
23670
23671 // Process the remaining bits of the load operation.
23672 // This is done by creating an UNDEF vector that matches the size of the
23673 // 256-bit loads and inserting the remaining load into it. We extract the
23674 // original load type at the end using an EXTRACT_SUBVECTOR.
23675 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
23676 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
23677 MVT RemainingVT = MVT::getVectorVT(
23678 VT: MemVT.getVectorElementType().getSimpleVT(),
23679 NumElements: BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
23680 SDValue NewPtr = DAG.getMemBasePlusOffset(
23681 Base: BasePtr, Offset: TypeSize::getFixed(ExactSize: PtrOffset), DL, Flags);
23682 Align NewAlign = commonAlignment(A: LD->getAlign(), Offset: PtrOffset);
23683 SDValue RemainingLoad =
23684 DAG.getLoad(VT: RemainingVT, dl: DL, Chain, Ptr: NewPtr,
23685 PtrInfo: LD->getPointerInfo().getWithOffset(O: PtrOffset), Alignment: NewAlign,
23686 MMOFlags: LD->getMemOperand()->getFlags(), AAInfo: LD->getAAInfo());
23687 SDValue UndefVector = DAG.getUNDEF(VT: NewVT);
23688 SDValue InsertIdx = DAG.getVectorIdxConstant(Val: 0, DL);
23689 SDValue ExtendedRemainingLoad =
23690 DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: NewVT,
23691 Ops: {UndefVector, RemainingLoad, InsertIdx});
23692 LoadOps.push_back(Elt: ExtendedRemainingLoad);
23693 LoadOpsChain.push_back(Elt: SDValue(cast<SDNode>(Val&: RemainingLoad), 1));
23694 EVT ConcatVT =
23695 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getScalarType(),
23696 NumElements: LoadOps.size() * NewVT.getVectorNumElements());
23697 SDValue ConcatVectors =
23698 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: ConcatVT, Ops: LoadOps);
23699 // Extract the original vector type size.
23700 SDValue ExtractSubVector =
23701 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MemVT,
23702 Ops: {ConcatVectors, DAG.getVectorIdxConstant(Val: 0, DL)});
23703 SDValue TokenFactor =
23704 DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: LoadOpsChain);
23705 return DAG.getMergeValues(Ops: {ExtractSubVector, TokenFactor}, dl: DL);
23706}
23707
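// Walk up the DAG (to a small depth) to find the vector type an all-i1 boolean
// vector was originally derived from, e.g. the operand type of a SETCC or
// TRUNCATE. Returns INVALID_SIMPLE_VALUE_TYPE if no single consistent type can
// be found.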
23708static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
23709 EVT VecVT = Op.getValueType();
23710 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
23711 "Need boolean vector type.");
23712
23713 if (Depth > 3)
23714 return MVT::INVALID_SIMPLE_VALUE_TYPE;
23715
23716 // We can get the base type from a vector compare or truncate.
23717 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
23718 return Op.getOperand(i: 0).getValueType();
23719
23720 // If an operand is a bool vector, continue looking.
23721 EVT BaseVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
23722 for (SDValue Operand : Op->op_values()) {
23723 if (Operand.getValueType() != VecVT)
23724 continue;
23725
23726 EVT OperandVT = tryGetOriginalBoolVectorType(Op: Operand, Depth: Depth + 1);
23727 if (!BaseVT.isSimple())
23728 BaseVT = OperandVT;
23729 else if (OperandVT != BaseVT)
23730 return MVT::INVALID_SIMPLE_VALUE_TYPE;
23731 }
23732
23733 return BaseVT;
23734}
23735
23736// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
23737// iN, we can use a trick that extracts the i^th bit from the i^th element and
23738// then performs a vector reduce-add to get a scalar bitmask. This requires that
23739// each element's bits are either all 1 or all 0.
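// For example (illustrative), for a v4i32 comparison result: sign-extend so
// that each lane is all-ones or all-zeros, AND with the mask <1, 2, 4, 8> so
// lane i keeps only bit i, and VECREDUCE_ADD the lanes to obtain a 4-bit
// scalar bitmask (a value in the range [0, 15]).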
23740static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
23741 SDLoc DL(N);
23742 SDValue ComparisonResult(N, 0);
23743 EVT VecVT = ComparisonResult.getValueType();
23744 assert(VecVT.isVector() && "Must be a vector type");
23745
23746 unsigned NumElts = VecVT.getVectorNumElements();
23747 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
23748 return SDValue();
23749
23750 if (VecVT.getVectorElementType() != MVT::i1 &&
23751 !DAG.getTargetLoweringInfo().isTypeLegal(VT: VecVT))
23752 return SDValue();
23753
23754 // If we can find the original types to work on instead of a vector of i1,
23755 // we can avoid extend/extract conversion instructions.
23756 if (VecVT.getVectorElementType() == MVT::i1) {
23757 VecVT = tryGetOriginalBoolVectorType(Op: ComparisonResult);
23758 if (!VecVT.isSimple()) {
23759 unsigned BitsPerElement = std::max(a: 64 / NumElts, b: 8u); // >= 64-bit vector
23760 VecVT = MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: BitsPerElement), NumElements: NumElts);
23761 }
23762 }
23763 VecVT = VecVT.changeVectorElementTypeToInteger();
23764
23765 // Large vectors don't map directly to this conversion, so to avoid too many
23766 // edge cases, we don't apply it here. The conversion will likely still be
23767 // applied later via multiple smaller vectors, whose results are concatenated.
23768 if (VecVT.getSizeInBits() > 128)
23769 return SDValue();
23770
23771 // Ensure that all elements' bits are either 0s or 1s.
23772 ComparisonResult = DAG.getSExtOrTrunc(Op: ComparisonResult, DL, VT: VecVT);
23773
23774 SmallVector<SDValue, 16> MaskConstants;
23775 if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
23776 VecVT == MVT::v16i8) {
23777 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
23778 // per entry. We split it into two halves, apply the mask, zip the halves to
23779 // create 8x 16-bit values, and then perform the vector reduce.
23780 for (unsigned Half = 0; Half < 2; ++Half) {
23781 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
23782 MaskConstants.push_back(Elt: DAG.getConstant(Val: MaskBit, DL, VT: MVT::i32));
23783 }
23784 }
23785 SDValue Mask = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: VecVT, Ops: MaskConstants);
23786 SDValue RepresentativeBits =
23787 DAG.getNode(Opcode: ISD::AND, DL, VT: VecVT, N1: ComparisonResult, N2: Mask);
23788
23789 SDValue UpperRepresentativeBits =
23790 DAG.getNode(Opcode: AArch64ISD::EXT, DL, VT: VecVT, N1: RepresentativeBits,
23791 N2: RepresentativeBits, N3: DAG.getConstant(Val: 8, DL, VT: MVT::i32));
23792 SDValue Zipped = DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: VecVT,
23793 N1: RepresentativeBits, N2: UpperRepresentativeBits);
23794 Zipped = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v8i16, Operand: Zipped);
23795 return DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: MVT::i16, Operand: Zipped);
23796 }
23797
23798 // All other vector sizes.
23799 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
23800 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
23801 MaskConstants.push_back(Elt: DAG.getConstant(Val: MaskBit, DL, VT: MVT::i64));
23802 }
23803
23804 SDValue Mask = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: VecVT, Ops: MaskConstants);
23805 SDValue RepresentativeBits =
23806 DAG.getNode(Opcode: ISD::AND, DL, VT: VecVT, N1: ComparisonResult, N2: Mask);
23807 EVT ResultVT = MVT::getIntegerVT(BitWidth: std::max<unsigned>(
23808 a: NumElts, b: VecVT.getVectorElementType().getSizeInBits()));
23809 return DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: ResultVT, Operand: RepresentativeBits);
23810}
23811
23812static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
23813 StoreSDNode *Store) {
23814 if (!Store->isTruncatingStore())
23815 return SDValue();
23816
23817 SDLoc DL(Store);
23818 SDValue VecOp = Store->getValue();
23819 EVT VT = VecOp.getValueType();
23820 EVT MemVT = Store->getMemoryVT();
23821
23822 if (!MemVT.isVector() || !VT.isVector() ||
23823 MemVT.getVectorElementType() != MVT::i1)
23824 return SDValue();
23825
23826 // If we are storing a vector that we are currently building, let
23827 // `scalarizeVectorStore()` handle this more efficiently.
23828 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
23829 return SDValue();
23830
23831 VecOp = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVT, Operand: VecOp);
23832 SDValue VectorBits = vectorToScalarBitmask(N: VecOp.getNode(), DAG);
23833 if (!VectorBits)
23834 return SDValue();
23835
23836 EVT StoreVT =
23837 EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getStoreSizeInBits());
23838 SDValue ExtendedBits = DAG.getZExtOrTrunc(Op: VectorBits, DL, VT: StoreVT);
23839 return DAG.getStore(Chain: Store->getChain(), dl: DL, Val: ExtendedBits, Ptr: Store->getBasePtr(),
23840 MMO: Store->getMemOperand());
23841}
23842
23843bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
23844 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
23845 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
23846 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
23847}
23848
23849// Combine store (trunc X to <3 x i8>) to a sequence of ST1.b stores.
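// For example (illustrative), store (trunc <3 x i16> X to <3 x i8>) is lowered
// by widening X to <4 x i16>, bitcasting to <8 x i8>, and emitting three
// single-byte stores of the low byte of each lane (byte indices 4, 2 and 0)
// to offsets +2, +1 and +0 respectively.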
23850static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
23851 const AArch64Subtarget *Subtarget) {
23852 SDValue Value = ST->getValue();
23853 EVT ValueVT = Value.getValueType();
23854
23855 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
23856 Value.getOpcode() != ISD::TRUNCATE ||
23857 ValueVT != EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i8, NumElements: 3))
23858 return SDValue();
23859
23860 assert(ST->getOffset().isUndef() && "undef offset expected");
23861 SDLoc DL(ST);
23862 auto WideVT = EVT::getVectorVT(
23863 Context&: *DAG.getContext(),
23864 VT: Value->getOperand(Num: 0).getValueType().getVectorElementType(), NumElements: 4);
23865 SDValue UndefVector = DAG.getUNDEF(VT: WideVT);
23866 SDValue WideTrunc = DAG.getNode(
23867 Opcode: ISD::INSERT_SUBVECTOR, DL, VT: WideVT,
23868 Ops: {UndefVector, Value->getOperand(Num: 0), DAG.getVectorIdxConstant(Val: 0, DL)});
23869 SDValue Cast = DAG.getNode(
23870 Opcode: ISD::BITCAST, DL, VT: WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
23871 Operand: WideTrunc);
23872
23873 MachineFunction &MF = DAG.getMachineFunction();
23874 SDValue Chain = ST->getChain();
23875 MachineMemOperand *MMO = ST->getMemOperand();
23876 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
23877 SDValue E2 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i8, N1: Cast,
23878 N2: DAG.getConstant(Val: 2 * IdxScale, DL, VT: MVT::i64));
23879 TypeSize Offset2 = TypeSize::getFixed(ExactSize: 2);
23880 SDValue Ptr2 = DAG.getMemBasePlusOffset(Base: ST->getBasePtr(), Offset: Offset2, DL);
23881 Chain = DAG.getStore(Chain, dl: DL, Val: E2, Ptr: Ptr2, MMO: MF.getMachineMemOperand(MMO, Offset: 2, Size: 1));
23882
23883 SDValue E1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i8, N1: Cast,
23884 N2: DAG.getConstant(Val: 1 * IdxScale, DL, VT: MVT::i64));
23885 TypeSize Offset1 = TypeSize::getFixed(ExactSize: 1);
23886 SDValue Ptr1 = DAG.getMemBasePlusOffset(Base: ST->getBasePtr(), Offset: Offset1, DL);
23887 Chain = DAG.getStore(Chain, dl: DL, Val: E1, Ptr: Ptr1, MMO: MF.getMachineMemOperand(MMO, Offset: 1, Size: 1));
23888
23889 SDValue E0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i8, N1: Cast,
23890 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
23891 Chain = DAG.getStore(Chain, dl: DL, Val: E0, Ptr: ST->getBasePtr(),
23892 MMO: MF.getMachineMemOperand(MMO, Offset: 0, Size: 1));
23893 return Chain;
23894}
23895
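// Return the FP/SIMD subregister index (bsub/hsub/ssub/dsub) used when storing
// a value of the given scalar type through an FP register.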
23896static unsigned getFPSubregForVT(EVT VT) {
23897 assert(VT.isSimple() && "Expected simple VT");
23898 switch (VT.getSimpleVT().SimpleTy) {
23899 case MVT::aarch64mfp8:
23900 return AArch64::bsub;
23901 case MVT::f16:
23902 return AArch64::hsub;
23903 case MVT::f32:
23904 return AArch64::ssub;
23905 case MVT::f64:
23906 return AArch64::dsub;
23907 default:
23908 llvm_unreachable("Unexpected VT!");
23909 }
23910}
23911
23912static SDValue performSTORECombine(SDNode *N,
23913 TargetLowering::DAGCombinerInfo &DCI,
23914 SelectionDAG &DAG,
23915 const AArch64Subtarget *Subtarget) {
23916 StoreSDNode *ST = cast<StoreSDNode>(Val: N);
23917 SDValue Chain = ST->getChain();
23918 SDValue Value = ST->getValue();
23919 SDValue Ptr = ST->getBasePtr();
23920 EVT ValueVT = Value.getValueType();
23921 EVT MemVT = ST->getMemoryVT();
23922 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23923 SDLoc DL(ST);
23924
23925 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
23926 EVT EltVT = VT.getVectorElementType();
23927 return EltVT == MVT::f32 || EltVT == MVT::f64;
23928 };
23929
23930 // Cast ptr32 and ptr64 pointers to the default address space before a store.
23931 unsigned AddrSpace = ST->getAddressSpace();
23932 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
23933 AddrSpace == ARM64AS::PTR32_UPTR) {
23934 MVT PtrVT = TLI.getPointerTy(DL: DAG.getDataLayout());
23935 if (PtrVT != Ptr.getSimpleValueType()) {
23936 SDValue Cast = DAG.getAddrSpaceCast(dl: DL, VT: PtrVT, Ptr, SrcAS: AddrSpace, DestAS: 0);
23937 return DAG.getStore(Chain, dl: DL, Val: Value, Ptr: Cast, PtrInfo: ST->getPointerInfo(),
23938 Alignment: ST->getBaseAlign(), MMOFlags: ST->getMemOperand()->getFlags(),
23939 AAInfo: ST->getAAInfo());
23940 }
23941 }
23942
23943 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
23944 return Res;
23945
23946 // If this is an FP_ROUND followed by a store, fold this into a truncating
23947 // store. We can do this even if this is already a truncstore.
23948 // We purposefully don't care about legality of the nodes here as we know
23949 // they can be split down into something legal.
23950 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
23951 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
23952 Subtarget->useSVEForFixedLengthVectors() &&
23953 ValueVT.isFixedLengthVector() &&
23954 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
23955 hasValidElementTypeForFPTruncStore(Value.getOperand(i: 0).getValueType()))
23956 return DAG.getTruncStore(Chain, dl: DL, Val: Value.getOperand(i: 0), Ptr, SVT: MemVT,
23957 MMO: ST->getMemOperand());
23958
23959 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
23960 return Split;
23961
23962 if (Subtarget->supportsAddressTopByteIgnored() &&
23963 performTBISimplification(Addr: N->getOperand(Num: 2), DCI, DAG))
23964 return SDValue(N, 0);
23965
23966 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
23967 return Store;
23968
23969 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, Store: ST))
23970 return Store;
23971
23972 if (ST->isTruncatingStore() &&
23973 isHalvingTruncateOfLegalScalableType(SrcVT: ValueVT, DstVT: MemVT)) {
23974 if (SDValue Rshrnb =
23975 trySimplifySrlAddToRshrnb(Srl: ST->getOperand(Num: 1), DAG, Subtarget)) {
23976 return DAG.getTruncStore(Chain: ST->getChain(), dl: ST, Val: Rshrnb, Ptr: ST->getBasePtr(),
23977 SVT: MemVT, MMO: ST->getMemOperand());
23978 }
23979 }
23980
23981 // This is an integer vector_extract_elt followed by a (possibly truncating)
23982 // store. We may be able to replace this with a store of an FP subregister.
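  // For example (illustrative), (store (extract_vector_elt v4i32 V, 0), ptr)
  // can instead store the ssub subregister of V, which selects to a plain
  // "str s0, [ptr]" and avoids moving the lane into a general-purpose register.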
23983 if (DCI.isAfterLegalizeDAG() && ST->isUnindexed() &&
23984 Value.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23985
23986 SDValue Vector = Value.getOperand(i: 0);
23987 SDValue ExtIdx = Value.getOperand(i: 1);
23988 EVT VectorVT = Vector.getValueType();
23989 EVT ElemVT = VectorVT.getVectorElementType();
23990
23991 if (!ValueVT.isInteger())
23992 return SDValue();
23993
23994 // Propagate zero constants (applying this fold may miss optimizations).
23995 if (ISD::isConstantSplatVectorAllZeros(N: Vector.getNode())) {
23996 SDValue ZeroElt = DAG.getConstant(Val: 0, DL, VT: ValueVT);
23997 DAG.ReplaceAllUsesWith(From: Value, To: ZeroElt);
23998 return SDValue();
23999 }
24000
24001 if (ValueVT != MemVT && !ST->isTruncatingStore())
24002 return SDValue();
24003
24004 // This could generate an additional extract if the index is non-zero and
24005 // the extracted value has multiple uses.
24006 auto *ExtCst = dyn_cast<ConstantSDNode>(Val&: ExtIdx);
24007 if ((!ExtCst || !ExtCst->isZero()) && !Value.hasOneUse())
24008 return SDValue();
24009
24010 // These can lower to st1, which is preferable if we're unlikely to fold the
24011 // addressing into the store.
24012 if (Subtarget->isNeonAvailable() && ElemVT == MemVT &&
24013 (VectorVT.is64BitVector() || VectorVT.is128BitVector()) && ExtCst &&
24014 !ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD)
24015 return SDValue();
24016
24017 if (MemVT == MVT::i64 || MemVT == MVT::i32) {
24018 // Heuristic: If there are other users of w/x integer scalars extracted
24019 // from this vector that won't fold into the store -- abandon folding.
24020 // Applying this fold may disrupt paired stores.
24021 for (const auto &Use : Vector->uses()) {
24022 if (Use.getResNo() != Vector.getResNo())
24023 continue;
24024 const SDNode *User = Use.getUser();
24025 if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
24026 (!User->hasOneUse() ||
24027 (*User->user_begin())->getOpcode() != ISD::STORE))
24028 return SDValue();
24029 }
24030 }
24031
24032 SDValue ExtVector = Vector;
24033 if (!ExtCst || !ExtCst->isZero()) {
24034 // Handle extracting from lanes != 0.
24035 SDValue Ext = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL,
24036 VT: Value.getValueType(), N1: Vector, N2: ExtIdx);
24037 SDValue Zero = DAG.getVectorIdxConstant(Val: 0, DL);
24038 ExtVector = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: VectorVT,
24039 N1: DAG.getUNDEF(VT: VectorVT), N2: Ext, N3: Zero);
24040 }
24041
24042 EVT FPMemVT = MemVT == MVT::i8
24043 ? MVT::aarch64mfp8
24044 : EVT::getFloatingPointVT(BitWidth: MemVT.getSizeInBits());
24045 SDValue FPSubreg = DAG.getTargetExtractSubreg(SRIdx: getFPSubregForVT(VT: FPMemVT), DL,
24046 VT: FPMemVT, Operand: ExtVector);
24047
24048 return DAG.getStore(Chain: ST->getChain(), dl: DL, Val: FPSubreg, Ptr: ST->getBasePtr(),
24049 MMO: ST->getMemOperand());
24050 }
24051
24052 return SDValue();
24053}
24054
24055static SDValue performMSTORECombine(SDNode *N,
24056 TargetLowering::DAGCombinerInfo &DCI,
24057 SelectionDAG &DAG,
24058 const AArch64Subtarget *Subtarget) {
24059 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(Val: N);
24060 SDValue Value = MST->getValue();
24061 SDValue Mask = MST->getMask();
24062 SDLoc DL(N);
24063
24064 // If this is a UZP1 followed by a masked store, fold this into a masked
24065 // truncating store. We can do this even if this is already a masked
24066 // truncstore.
24067 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
24068 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
24069 Value.getValueType().isInteger()) {
24070 Value = Value.getOperand(i: 0);
24071 if (Value.getOpcode() == ISD::BITCAST) {
24072 EVT HalfVT =
24073 Value.getValueType().getHalfNumVectorElementsVT(Context&: *DAG.getContext());
24074 EVT InVT = Value.getOperand(i: 0).getValueType();
24075
24076 if (HalfVT.widenIntegerVectorElementType(Context&: *DAG.getContext()) == InVT) {
24077 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
24078 unsigned PgPattern = Mask->getConstantOperandVal(Num: 0);
24079
24080 // Ensure we can double the size of the predicate pattern
24081 unsigned NumElts = getNumElementsFromSVEPredPattern(Pattern: PgPattern);
24082 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
24083 MinSVESize) {
24084 Mask = getPTrue(DAG, DL, VT: InVT.changeVectorElementType(EltVT: MVT::i1),
24085 Pattern: PgPattern);
24086 return DAG.getMaskedStore(Chain: MST->getChain(), dl: DL, Val: Value.getOperand(i: 0),
24087 Base: MST->getBasePtr(), Offset: MST->getOffset(), Mask,
24088 MemVT: MST->getMemoryVT(), MMO: MST->getMemOperand(),
24089 AM: MST->getAddressingMode(),
24090 /*IsTruncating=*/true);
24091 }
24092 }
24093 }
24094 }
24095
24096 if (MST->isTruncatingStore()) {
24097 EVT ValueVT = Value->getValueType(ResNo: 0);
24098 EVT MemVT = MST->getMemoryVT();
24099 if (!isHalvingTruncateOfLegalScalableType(SrcVT: ValueVT, DstVT: MemVT))
24100 return SDValue();
24101 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Srl: Value, DAG, Subtarget)) {
24102 return DAG.getMaskedStore(Chain: MST->getChain(), dl: DL, Val: Rshrnb, Base: MST->getBasePtr(),
24103 Offset: MST->getOffset(), Mask: MST->getMask(),
24104 MemVT: MST->getMemoryVT(), MMO: MST->getMemOperand(),
24105 AM: MST->getAddressingMode(), IsTruncating: true);
24106 }
24107 }
24108
24109 return SDValue();
24110}
24111
24112/// \return true if part of the index was folded into the Base.
24113static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
24114 SDLoc DL, SelectionDAG &DAG) {
24115 // This function assumes a vector of i64 indices.
24116 EVT IndexVT = Index.getValueType();
24117 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
24118 return false;
24119
24120 // Simplify:
24121 // BasePtr = Ptr
24122 // Index = X + splat(Offset)
24123 // ->
24124 // BasePtr = Ptr + Offset * scale.
24125 // Index = X
24126 if (Index.getOpcode() == ISD::ADD) {
24127 if (auto Offset = DAG.getSplatValue(V: Index.getOperand(i: 1))) {
24128 Offset = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i64, N1: Offset, N2: Scale);
24129 BasePtr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: BasePtr, N2: Offset);
24130 Index = Index.getOperand(i: 0);
24131 return true;
24132 }
24133 }
24134
24135 // Simplify:
24136 // BasePtr = Ptr
24137 // Index = (X + splat(Offset)) << splat(Shift)
24138 // ->
24139 // BasePtr = Ptr + (Offset << Shift) * scale
24140 // Index = X << splat(Shift)
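  // For example (illustrative), with Offset = 4, Shift = 2 and Scale = 8:
  //   BasePtr = Ptr + ((4 << 2) * 8) = Ptr + 128
  //   Index   = X << splat(2)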
24141 if (Index.getOpcode() == ISD::SHL &&
24142 Index.getOperand(i: 0).getOpcode() == ISD::ADD) {
24143 SDValue Add = Index.getOperand(i: 0);
24144 SDValue ShiftOp = Index.getOperand(i: 1);
24145 SDValue OffsetOp = Add.getOperand(i: 1);
24146 if (auto Shift = DAG.getSplatValue(V: ShiftOp))
24147 if (auto Offset = DAG.getSplatValue(V: OffsetOp)) {
24148 Offset = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i64, N1: Offset, N2: Shift);
24149 Offset = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i64, N1: Offset, N2: Scale);
24150 BasePtr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: BasePtr, N2: Offset);
24151 Index = DAG.getNode(Opcode: ISD::SHL, DL, VT: Index.getValueType(),
24152 N1: Add.getOperand(i: 0), N2: ShiftOp);
24153 return true;
24154 }
24155 }
24156
24157 return false;
24158}
24159
24160// Analyse the specified address returning true if a more optimal addressing
24161// mode is available. When returning true all parameters are updated to reflect
24162// their recommended values.
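// For example (illustrative), an nxv2i64 index of the form
// (step_vector 1) << splat(3) has a constant stride of 8 and, provided the
// offset of the last element still fits in an i32, can be replaced by the
// narrower nxv2i32 step_vector with step 8, which is cheaper to legalise.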
24163static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
24164 SDValue &BasePtr, SDValue &Index,
24165 SelectionDAG &DAG) {
24166 // Try to iteratively fold parts of the index into the base pointer to
24167 // simplify the index as much as possible.
24168 bool Changed = false;
24169 while (foldIndexIntoBase(BasePtr, Index, Scale: N->getScale(), DL: SDLoc(N), DAG))
24170 Changed = true;
24171
24172 // Only consider element types that are pointer sized as smaller types can
24173 // be easily promoted.
24174 EVT IndexVT = Index.getValueType();
24175 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
24176 return Changed;
24177
24178 // Can indices be trivially shrunk?
24179 EVT DataVT = N->getOperand(Num: 1).getValueType();
24180 // Don't attempt to shrink the index for fixed vectors of 64-bit data, since
24181 // it will later be re-extended to 64 bits during legalization.
24182 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
24183 return Changed;
24184 if (ISD::isVectorShrinkable(N: Index.getNode(), NewEltSize: 32, Signed: N->isIndexSigned())) {
24185 EVT NewIndexVT = IndexVT.changeVectorElementType(EltVT: MVT::i32);
24186 Index = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SDLoc(N), VT: NewIndexVT, Operand: Index);
24187 return true;
24188 }
24189
24190 // Match:
24191 // Index = step(const)
24192 int64_t Stride = 0;
24193 if (Index.getOpcode() == ISD::STEP_VECTOR) {
24194 Stride = cast<ConstantSDNode>(Val: Index.getOperand(i: 0))->getSExtValue();
24195 }
24196 // Match:
24197 // Index = step(const) << shift(const)
24198 else if (Index.getOpcode() == ISD::SHL &&
24199 Index.getOperand(i: 0).getOpcode() == ISD::STEP_VECTOR) {
24200 SDValue RHS = Index.getOperand(i: 1);
24201 if (auto *Shift =
24202 dyn_cast_or_null<ConstantSDNode>(Val: DAG.getSplatValue(V: RHS))) {
24203 int64_t Step = (int64_t)Index.getOperand(i: 0).getConstantOperandVal(i: 1);
24204 Stride = Step << Shift->getZExtValue();
24205 }
24206 }
24207
24208 // Return early because no supported pattern is found.
24209 if (Stride == 0)
24210 return Changed;
24211
24212 if (Stride < std::numeric_limits<int32_t>::min() ||
24213 Stride > std::numeric_limits<int32_t>::max())
24214 return Changed;
24215
24216 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
24217 unsigned MaxVScale =
24218 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
24219 int64_t LastElementOffset =
24220 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
24221
24222 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
24223 LastElementOffset > std::numeric_limits<int32_t>::max())
24224 return Changed;
24225
24226 EVT NewIndexVT = IndexVT.changeVectorElementType(EltVT: MVT::i32);
24227 // The Stride is not scaled by 'Scale' explicitly, because that scaling
24228 // happens as part of the gather/scatter addressing mode.
24229 Index = DAG.getStepVector(DL: SDLoc(N), ResVT: NewIndexVT, StepVal: APInt(32, Stride, true));
24230 return true;
24231}
24232
24233static SDValue performMaskedGatherScatterCombine(
24234 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
24235 if (!DCI.isBeforeLegalize())
24236 return SDValue();
24237 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(Val: N);
24238
24239 SDLoc DL(MGS);
24240 SDValue Chain = MGS->getChain();
24241 SDValue Scale = MGS->getScale();
24242 SDValue Index = MGS->getIndex();
24243 SDValue Mask = MGS->getMask();
24244 SDValue BasePtr = MGS->getBasePtr();
24245 ISD::MemIndexType IndexType = MGS->getIndexType();
24246
24247 if (!findMoreOptimalIndexType(N: MGS, BasePtr, Index, DAG))
24248 return SDValue();
24249
24250 // Here we catch such cases early and change MGATHER's IndexType to allow
24251 // the use of an Index that's more legalisation friendly.
24252 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(Val: MGS)) {
24253 SDValue PassThru = MGT->getPassThru();
24254 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
24255 return DAG.getMaskedGather(
24256 VTs: DAG.getVTList(VT1: N->getValueType(ResNo: 0), VT2: MVT::Other), MemVT: MGT->getMemoryVT(), dl: DL,
24257 Ops, MMO: MGT->getMemOperand(), IndexType, ExtTy: MGT->getExtensionType());
24258 }
24259 if (auto *MSC = dyn_cast<MaskedScatterSDNode>(Val: MGS)) {
24260 SDValue Data = MSC->getValue();
24261 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
24262 return DAG.getMaskedScatter(VTs: DAG.getVTList(VT: MVT::Other), MemVT: MSC->getMemoryVT(),
24263 dl: DL, Ops, MMO: MSC->getMemOperand(), IndexType,
24264 IsTruncating: MSC->isTruncatingStore());
24265 }
24266 auto *HG = cast<MaskedHistogramSDNode>(Val: MGS);
24267 SDValue Ops[] = {Chain, HG->getInc(), Mask, BasePtr,
24268 Index, Scale, HG->getIntID()};
24269 return DAG.getMaskedHistogram(VTs: DAG.getVTList(VT: MVT::Other), MemVT: HG->getMemoryVT(),
24270 dl: DL, Ops, MMO: HG->getMemOperand(), IndexType);
24271}
24272
24273/// Target-specific DAG combine function for NEON load/store intrinsics
24274/// to merge base address updates.
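// For example (illustrative), when the address used by an aarch64.neon.ld2 of
// two <4 x i32> vectors is also incremented by 32 (the number of bytes
// accessed), the load and the ADD are merged into a single post-incremented
// LD2post node, i.e. "ld2 { v0.4s, v1.4s }, [x0], #32".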
24275static SDValue performNEONPostLDSTCombine(SDNode *N,
24276 TargetLowering::DAGCombinerInfo &DCI,
24277 SelectionDAG &DAG) {
24278 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
24279 return SDValue();
24280
24281 unsigned AddrOpIdx = N->getNumOperands() - 1;
24282 SDValue Addr = N->getOperand(Num: AddrOpIdx);
24283
24284 // Search for a use of the address operand that is an increment.
24285 for (SDUse &Use : Addr->uses()) {
24286 SDNode *User = Use.getUser();
24287 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
24288 continue;
24289
24290 // Check that the add is independent of the load/store. Otherwise, folding
24291 // it would create a cycle.
24292 SmallPtrSet<const SDNode *, 32> Visited;
24293 SmallVector<const SDNode *, 16> Worklist;
24294 Visited.insert(Ptr: Addr.getNode());
24295 Worklist.push_back(Elt: N);
24296 Worklist.push_back(Elt: User);
24297 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
24298 SDNode::hasPredecessorHelper(N: User, Visited, Worklist))
24299 continue;
24300
24301 // Find the new opcode for the updating load/store.
24302 bool IsStore = false;
24303 bool IsLaneOp = false;
24304 bool IsDupOp = false;
24305 unsigned NewOpc = 0;
24306 unsigned NumVecs = 0;
24307 unsigned IntNo = N->getConstantOperandVal(Num: 1);
24308 switch (IntNo) {
24309 default: llvm_unreachable("unexpected intrinsic for Neon base update");
24310 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
24311 NumVecs = 2; break;
24312 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
24313 NumVecs = 3; break;
24314 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
24315 NumVecs = 4; break;
24316 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
24317 NumVecs = 2; IsStore = true; break;
24318 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
24319 NumVecs = 3; IsStore = true; break;
24320 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
24321 NumVecs = 4; IsStore = true; break;
24322 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
24323 NumVecs = 2; break;
24324 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
24325 NumVecs = 3; break;
24326 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
24327 NumVecs = 4; break;
24328 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
24329 NumVecs = 2; IsStore = true; break;
24330 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
24331 NumVecs = 3; IsStore = true; break;
24332 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
24333 NumVecs = 4; IsStore = true; break;
24334 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
24335 NumVecs = 2; IsDupOp = true; break;
24336 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
24337 NumVecs = 3; IsDupOp = true; break;
24338 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
24339 NumVecs = 4; IsDupOp = true; break;
24340 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
24341 NumVecs = 2; IsLaneOp = true; break;
24342 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
24343 NumVecs = 3; IsLaneOp = true; break;
24344 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
24345 NumVecs = 4; IsLaneOp = true; break;
24346 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
24347 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
24348 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
24349 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
24350 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
24351 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
24352 }
24353
24354 EVT VecTy;
24355 if (IsStore)
24356 VecTy = N->getOperand(Num: 2).getValueType();
24357 else
24358 VecTy = N->getValueType(ResNo: 0);
24359
24360 // If the increment is a constant, it must match the memory ref size.
24361 SDValue Inc = User->getOperand(Num: User->getOperand(Num: 0) == Addr ? 1 : 0);
24362 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Val: Inc.getNode())) {
24363 uint32_t IncVal = CInc->getZExtValue();
24364 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
24365 if (IsLaneOp || IsDupOp)
24366 NumBytes /= VecTy.getVectorNumElements();
24367 if (IncVal != NumBytes)
24368 continue;
24369 Inc = DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64);
24370 }
24371 SmallVector<SDValue, 8> Ops;
24372 Ops.push_back(Elt: N->getOperand(Num: 0)); // Incoming chain
24373 // Load lane and store have vector list as input.
24374 if (IsLaneOp || IsStore)
24375 for (unsigned i = 2; i < AddrOpIdx; ++i)
24376 Ops.push_back(Elt: N->getOperand(Num: i));
24377 Ops.push_back(Elt: Addr); // Base register
24378 Ops.push_back(Elt: Inc);
24379
24380 // Return Types.
24381 EVT Tys[6];
24382 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
24383 unsigned n;
24384 for (n = 0; n < NumResultVecs; ++n)
24385 Tys[n] = VecTy;
24386 Tys[n++] = MVT::i64; // Type of write back register
24387 Tys[n] = MVT::Other; // Type of the chain
24388 SDVTList SDTys = DAG.getVTList(VTs: ArrayRef(Tys, NumResultVecs + 2));
24389
24390 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(Val: N);
24391 SDValue UpdN = DAG.getMemIntrinsicNode(Opcode: NewOpc, dl: SDLoc(N), VTList: SDTys, Ops,
24392 MemVT: MemInt->getMemoryVT(),
24393 MMO: MemInt->getMemOperand());
24394
24395 // Update the uses.
24396 std::vector<SDValue> NewResults;
24397 for (unsigned i = 0; i < NumResultVecs; ++i) {
24398 NewResults.push_back(x: SDValue(UpdN.getNode(), i));
24399 }
24400 NewResults.push_back(x: SDValue(UpdN.getNode(), NumResultVecs + 1));
24401 DCI.CombineTo(N, To: NewResults);
24402 DCI.CombineTo(N: User, Res: SDValue(UpdN.getNode(), NumResultVecs));
24403
24404 break;
24405 }
24406 return SDValue();
24407}
24408
24409// Checks to see if the value is the prescribed width and returns information
24410// about its extension mode.
24411static
24412bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
24413 ExtType = ISD::NON_EXTLOAD;
24414 switch(V.getNode()->getOpcode()) {
24415 default:
24416 return false;
24417 case ISD::LOAD: {
24418 LoadSDNode *LoadNode = cast<LoadSDNode>(Val: V.getNode());
24419 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
24420 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
24421 ExtType = LoadNode->getExtensionType();
24422 return true;
24423 }
24424 return false;
24425 }
24426 case ISD::AssertSext: {
24427 VTSDNode *TypeNode = cast<VTSDNode>(Val: V.getNode()->getOperand(Num: 1));
24428 if ((TypeNode->getVT() == MVT::i8 && width == 8)
24429 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
24430 ExtType = ISD::SEXTLOAD;
24431 return true;
24432 }
24433 return false;
24434 }
24435 case ISD::AssertZext: {
24436 VTSDNode *TypeNode = cast<VTSDNode>(Val: V.getNode()->getOperand(Num: 1));
24437 if ((TypeNode->getVT() == MVT::i8 && width == 8)
24438 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
24439 ExtType = ISD::ZEXTLOAD;
24440 return true;
24441 }
24442 return false;
24443 }
24444 case ISD::Constant:
24445 case ISD::TargetConstant: {
24446 return std::abs(i: cast<ConstantSDNode>(Val: V.getNode())->getSExtValue()) <
24447 1LL << (width - 1);
24448 }
24449 }
24450
24451 return true;
24452}
24453
24454// This function does a whole lot of voodoo to determine if the tests are
24455// equivalent without and with a mask. Essentially what happens is that given a
24456// DAG resembling:
24457//
24458// +-------------+ +-------------+ +-------------+ +-------------+
24459// | Input | | AddConstant | | CompConstant| | CC |
24460// +-------------+ +-------------+ +-------------+ +-------------+
24461// | | | |
24462// V V | +----------+
24463// +-------------+ +----+ | |
24464// | ADD | |0xff| | |
24465// +-------------+ +----+ | |
24466// | | | |
24467// V V | |
24468// +-------------+ | |
24469// | AND | | |
24470// +-------------+ | |
24471// | | |
24472// +-----+ | |
24473// | | |
24474// V V V
24475// +-------------+
24476// | CMP |
24477// +-------------+
24478//
24479// The AND node may be safely removed for some combinations of inputs. In
24480// particular we need to take into account the extension type of the Input,
24481// the exact values of AddConstant, CompConstant, and CC, along with the nominal
24482// width of the input (this works for inputs of any width; the above graph is
24483// specific to 8 bits).
24484//
24485// The specific equations were worked out by generating output tables for each
24486// AArch64CC value in terms of the AddConstant (w1) and CompConstant (w2). The
24487// problem was simplified by working with 4-bit inputs, which means we only
24488// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
24489// extension (8 to 15), 8 patterns unique to sign extension (-8 to -1), and 8
24490// patterns present in both extensions (0 to 7). For every distinct set of
24491// AddConstant and CompConstant bit patterns we can consider the masked and
24492// unmasked versions to be equivalent if the result of this function is true for
24493// all 16 distinct bit patterns for the current extension type of Input (w0).
24494//
24495// sub w8, w0, w1
24496// and w10, w8, #0x0f
24497// cmp w8, w2
24498// cset w9, AArch64CC
24499// cmp w10, w2
24500// cset w11, AArch64CC
24501// cmp w9, w11
24502// cset w0, eq
24503// ret
24504//
24505// Since the above function shows when the outputs are equivalent it defines
24506// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
24507// would be expensive to run during compiles. The equations below were written
24508// in a test harness that confirmed they gave outputs equivalent to the above
24509// function for all inputs, so they can be used to determine whether the
24510// removal is legal instead.
24511//
24512// isEquivalentMaskless() is the code for testing if the AND can be removed,
24513// factored out of the DAG recognition since the DAG can take several forms.
24514
24515static bool isEquivalentMaskless(unsigned CC, unsigned width,
24516 ISD::LoadExtType ExtType, int AddConstant,
24517 int CompConstant) {
24518 // By being careful about our equations and only writing them in terms of
24519 // symbolic values and well-known constants (0, 1, -1, MaxUInt), we can
24520 // make them generally applicable to all bit widths.
24521 int MaxUInt = (1 << width);
24522
24523 // For the purposes of these comparisons sign extending the type is
24524 // equivalent to zero extending the add and displacing it by half the integer
24525 // width. Provided we are careful and make sure our equations are valid over
24526 // the whole range we can just adjust the input and avoid writing equations
24527 // for sign extended inputs.
24528 if (ExtType == ISD::SEXTLOAD)
24529 AddConstant -= (1 << (width-1));
24530
24531 switch(CC) {
24532 case AArch64CC::LE:
24533 case AArch64CC::GT:
24534 if ((AddConstant == 0) ||
24535 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
24536 (AddConstant >= 0 && CompConstant < 0) ||
24537 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
24538 return true;
24539 break;
24540 case AArch64CC::LT:
24541 case AArch64CC::GE:
24542 if ((AddConstant == 0) ||
24543 (AddConstant >= 0 && CompConstant <= 0) ||
24544 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
24545 return true;
24546 break;
24547 case AArch64CC::HI:
24548 case AArch64CC::LS:
24549 if ((AddConstant >= 0 && CompConstant < 0) ||
24550 (AddConstant <= 0 && CompConstant >= -1 &&
24551 CompConstant < AddConstant + MaxUInt))
24552 return true;
24553 break;
24554 case AArch64CC::PL:
24555 case AArch64CC::MI:
24556 if ((AddConstant == 0) ||
24557 (AddConstant > 0 && CompConstant <= 0) ||
24558 (AddConstant < 0 && CompConstant <= AddConstant))
24559 return true;
24560 break;
24561 case AArch64CC::LO:
24562 case AArch64CC::HS:
24563 if ((AddConstant >= 0 && CompConstant <= 0) ||
24564 (AddConstant <= 0 && CompConstant >= 0 &&
24565 CompConstant <= AddConstant + MaxUInt))
24566 return true;
24567 break;
24568 case AArch64CC::EQ:
24569 case AArch64CC::NE:
24570 if ((AddConstant > 0 && CompConstant < 0) ||
24571 (AddConstant < 0 && CompConstant >= 0 &&
24572 CompConstant < AddConstant + MaxUInt) ||
24573 (AddConstant >= 0 && CompConstant >= 0 &&
24574 CompConstant >= AddConstant) ||
24575 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
24576 return true;
24577 break;
24578 case AArch64CC::VS:
24579 case AArch64CC::VC:
24580 case AArch64CC::AL:
24581 case AArch64CC::NV:
24582 return true;
24583 case AArch64CC::Invalid:
24584 break;
24585 }
24586
24587 return false;
24588}
24589
24590// (X & C) >u Mask --> ((X & (C & ~Mask)) != 0)
24591// (X & C) <u Pow2 --> ((X & (C & ~(Pow2 - 1))) == 0)
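// For example (illustrative), with C = 0xff and Mask = 0x0f:
//   (X & 0xff) >u 0x0f  becomes  ((X & 0xf0) != 0),
// which is emitted as an ANDS with 0xf0 followed by a NE check on the flags,
// removing the separate SUBS-based compare.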
24592static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
24593 SDNode *AndNode, SelectionDAG &DAG,
24594 unsigned CCIndex, unsigned CmpIndex,
24595 unsigned CC) {
24596 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(Val: SubsNode->getOperand(Num: 1));
24597 if (!SubsC)
24598 return SDValue();
24599
24600 APInt SubsAP = SubsC->getAPIntValue();
24601 if (CC == AArch64CC::HI) {
24602 if (!SubsAP.isMask())
24603 return SDValue();
24604 } else if (CC == AArch64CC::LO) {
24605 if (!SubsAP.isPowerOf2())
24606 return SDValue();
24607 } else
24608 return SDValue();
24609
24610 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(Val: AndNode->getOperand(Num: 1));
24611 if (!AndC)
24612 return SDValue();
24613
24614 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
24615
24616 SDLoc DL(N);
24617 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
24618 SDValue ANDS = DAG.getNode(
24619 Opcode: AArch64ISD::ANDS, DL, VTList: SubsNode->getVTList(), N1: AndNode->getOperand(Num: 0),
24620 N2: DAG.getConstant(Val: AndSMask, DL, VT: SubsC->getValueType(ResNo: 0)));
24621 SDValue AArch64_CC =
24622 DAG.getConstant(Val: CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
24623 VT: N->getOperand(Num: CCIndex)->getValueType(ResNo: 0));
24624
24625 // For now, only performCSELCombine and performBRCONDCombine call this
24626 // function, and both of them pass 2 for CCIndex and 3 for CmpIndex on nodes
24627 // with 4 operands. So just initialize the ops directly to keep the code
24628 // simple. If we ever get a caller with a different CCIndex or CmpIndex, the
24629 // code here will need to be rewritten with a loop over the operands.
24630 // TODO: Do we need to assert that the number of operands is 4 here?
24631 assert((CCIndex == 2 && CmpIndex == 3) &&
24632 "Expected CCIndex to be 2 and CmpIndex to be 3.");
24633 SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 1), AArch64_CC,
24634 ANDS.getValue(R: 1)};
24635 return DAG.getNode(Opcode: N->getOpcode(), DL: N, VTList: N->getVTList(), Ops);
24636}
24637
24638static
24639SDValue performCONDCombine(SDNode *N,
24640 TargetLowering::DAGCombinerInfo &DCI,
24641 SelectionDAG &DAG, unsigned CCIndex,
24642 unsigned CmpIndex) {
24643 unsigned CC = cast<ConstantSDNode>(Val: N->getOperand(Num: CCIndex))->getSExtValue();
24644 SDNode *SubsNode = N->getOperand(Num: CmpIndex).getNode();
24645 unsigned CondOpcode = SubsNode->getOpcode();
24646
24647 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(Value: 0) ||
24648 !SubsNode->hasOneUse())
24649 return SDValue();
24650
24651 // There is a SUBS feeding this condition. Is it fed by a mask we can
24652 // use?
24653
24654 SDNode *AndNode = SubsNode->getOperand(Num: 0).getNode();
24655 unsigned MaskBits = 0;
24656
24657 if (AndNode->getOpcode() != ISD::AND)
24658 return SDValue();
24659
24660 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
24661 CmpIndex, CC))
24662 return Val;
24663
24664 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val: AndNode->getOperand(Num: 1))) {
24665 uint32_t CNV = CN->getZExtValue();
24666 if (CNV == 255)
24667 MaskBits = 8;
24668 else if (CNV == 65535)
24669 MaskBits = 16;
24670 }
24671
24672 if (!MaskBits)
24673 return SDValue();
24674
24675 SDValue AddValue = AndNode->getOperand(Num: 0);
24676
24677 if (AddValue.getOpcode() != ISD::ADD)
24678 return SDValue();
24679
24680 // The basic dag structure is correct, grab the inputs and validate them.
24681
24682 SDValue AddInputValue1 = AddValue.getNode()->getOperand(Num: 0);
24683 SDValue AddInputValue2 = AddValue.getNode()->getOperand(Num: 1);
24684 SDValue SubsInputValue = SubsNode->getOperand(Num: 1);
24685
24686 // The mask is present and the provenance of all the values is a smaller type;
24687 // let's see if the mask is superfluous.
24688
24689 if (!isa<ConstantSDNode>(Val: AddInputValue2.getNode()) ||
24690 !isa<ConstantSDNode>(Val: SubsInputValue.getNode()))
24691 return SDValue();
24692
24693 ISD::LoadExtType ExtType;
24694
24695 if (!checkValueWidth(V: SubsInputValue, width: MaskBits, ExtType) ||
24696 !checkValueWidth(V: AddInputValue2, width: MaskBits, ExtType) ||
24697 !checkValueWidth(V: AddInputValue1, width: MaskBits, ExtType) )
24698 return SDValue();
24699
24700 if (!isEquivalentMaskless(CC, width: MaskBits, ExtType,
24701 AddConstant: cast<ConstantSDNode>(Val: AddInputValue2.getNode())->getSExtValue(),
24702 CompConstant: cast<ConstantSDNode>(Val: SubsInputValue.getNode())->getSExtValue()))
24703 return SDValue();
24704
24705 // The AND is not necessary, remove it.
24706
24707 SDVTList VTs = DAG.getVTList(VT1: SubsNode->getValueType(ResNo: 0),
24708 VT2: SubsNode->getValueType(ResNo: 1));
24709 SDValue Ops[] = { AddValue, SubsNode->getOperand(Num: 1) };
24710
24711 SDValue NewValue = DAG.getNode(Opcode: CondOpcode, DL: SDLoc(SubsNode), VTList: VTs, Ops);
24712 DAG.ReplaceAllUsesWith(From: SubsNode, To: NewValue.getNode());
24713
24714 return SDValue(N, 0);
24715}
24716
24717// Optimize compare with zero and branch.
24718static SDValue performBRCONDCombine(SDNode *N,
24719 TargetLowering::DAGCombinerInfo &DCI,
24720 SelectionDAG &DAG) {
24721 MachineFunction &MF = DAG.getMachineFunction();
24722 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
24723 // will not be produced, as they are conditional branch instructions that do
24724 // not set flags.
24725 if (MF.getFunction().hasFnAttribute(Kind: Attribute::SpeculativeLoadHardening))
24726 return SDValue();
24727
24728 if (SDValue NV = performCONDCombine(N, DCI, DAG, CCIndex: 2, CmpIndex: 3))
24729 N = NV.getNode();
24730 SDValue Chain = N->getOperand(Num: 0);
24731 SDValue Dest = N->getOperand(Num: 1);
24732 SDValue CCVal = N->getOperand(Num: 2);
24733 SDValue Cmp = N->getOperand(Num: 3);
24734
24735 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
24736 unsigned CC = CCVal->getAsZExtVal();
24737 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
24738 return SDValue();
24739
24740 // Fold away brcond(NE, cmp(csel(1, 0, CC, Cmp), 1)) -> brcond(~CC, Cmp)
24741 if (isCMP(Op: Cmp) && CC == AArch64CC::NE && isOneConstant(V: Cmp.getOperand(i: 1))) {
24742 SDValue CSel = Cmp.getOperand(i: 0);
24743 auto CSelCC = getCSETCondCode(Op: CSel);
24744 if (CSelCC) {
24745 SDLoc DL(N);
24746 return DAG.getNode(
24747 Opcode: N->getOpcode(), DL, VTList: N->getVTList(), N1: Chain, N2: Dest,
24748 N3: DAG.getConstant(Val: getInvertedCondCode(Code: *CSelCC), DL, VT: MVT::i32),
24749 N4: CSel.getOperand(i: 3));
24750 }
24751 }
24752
24753 unsigned CmpOpc = Cmp.getOpcode();
24754 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
24755 return SDValue();
24756
24757 // Only attempt folding if there is only one use of the flag and no use of the
24758 // value.
24759 if (!Cmp->hasNUsesOfValue(NUses: 0, Value: 0) || !Cmp->hasNUsesOfValue(NUses: 1, Value: 1))
24760 return SDValue();
24761
24762 SDValue LHS = Cmp.getOperand(i: 0);
24763 SDValue RHS = Cmp.getOperand(i: 1);
24764
24765 assert(LHS.getValueType() == RHS.getValueType() &&
24766 "Expected the value type to be the same for both operands!");
24767 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
24768 return SDValue();
24769
24770 if (isNullConstant(V: LHS))
24771 std::swap(a&: LHS, b&: RHS);
24772
24773 if (!isNullConstant(V: RHS))
24774 return SDValue();
24775
24776 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
24777 LHS.getOpcode() == ISD::SRL)
24778 return SDValue();
24779
24780 // Fold the compare into the branch instruction.
24781 SDValue BR;
24782 if (CC == AArch64CC::EQ)
24783 BR = DAG.getNode(Opcode: AArch64ISD::CBZ, DL: SDLoc(N), VT: MVT::Other, N1: Chain, N2: LHS, N3: Dest);
24784 else
24785 BR = DAG.getNode(Opcode: AArch64ISD::CBNZ, DL: SDLoc(N), VT: MVT::Other, N1: Chain, N2: LHS, N3: Dest);
24786
24787 // Do not add new nodes to DAG combiner worklist.
24788 DCI.CombineTo(N, Res: BR, AddTo: false);
24789
24790 return SDValue();
24791}
24792
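// Fold CSEL(0, CTTZ(X), EQ, SUBS(X, 0)) and CSEL(CTTZ(X), 0, NE, SUBS(X, 0))
// (the CTTZ may also appear behind a TRUNCATE) into CTTZ(X) & (BitWidth - 1).
// This is valid because CTTZ returns BitWidth for a zero input, and
// BitWidth & (BitWidth - 1) == 0, so the select against zero is redundant.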
24793static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
24794 unsigned CC = N->getConstantOperandVal(Num: 2);
24795 SDValue SUBS = N->getOperand(Num: 3);
24796 SDValue Zero, CTTZ;
24797
24798 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
24799 Zero = N->getOperand(Num: 0);
24800 CTTZ = N->getOperand(Num: 1);
24801 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
24802 Zero = N->getOperand(Num: 1);
24803 CTTZ = N->getOperand(Num: 0);
24804 } else
24805 return SDValue();
24806
24807 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
24808 (CTTZ.getOpcode() == ISD::TRUNCATE &&
24809 CTTZ.getOperand(i: 0).getOpcode() != ISD::CTTZ))
24810 return SDValue();
24811
24812 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
24813 "Illegal type in CTTZ folding");
24814
24815 if (!isNullConstant(V: Zero) || !isNullConstant(V: SUBS.getOperand(i: 1)))
24816 return SDValue();
24817
24818 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
24819 ? CTTZ.getOperand(i: 0).getOperand(i: 0)
24820 : CTTZ.getOperand(i: 0);
24821
24822 if (X != SUBS.getOperand(i: 0))
24823 return SDValue();
24824
24825 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
24826 ? CTTZ.getOperand(i: 0).getValueSizeInBits()
24827 : CTTZ.getValueSizeInBits();
24828 SDValue BitWidthMinusOne =
24829 DAG.getConstant(Val: BitWidth - 1, DL: SDLoc(N), VT: CTTZ.getValueType());
24830 return DAG.getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: CTTZ.getValueType(), N1: CTTZ,
24831 N2: BitWidthMinusOne);
24832}
24833
24834// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
24835// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
24836// Where x and y are constants and x != y
24837
24838// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
24839// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
24840// Where x and y are constants and x != y
24841static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
24842 SDValue L = Op->getOperand(Num: 0);
24843 SDValue R = Op->getOperand(Num: 1);
24844 AArch64CC::CondCode OpCC =
24845 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(Num: 2));
24846
24847 SDValue OpCmp = Op->getOperand(Num: 3);
24848 if (!isCMP(Op: OpCmp))
24849 return SDValue();
24850
24851 SDValue CmpLHS = OpCmp.getOperand(i: 0);
24852 SDValue CmpRHS = OpCmp.getOperand(i: 1);
24853
24854 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
24855 std::swap(a&: CmpLHS, b&: CmpRHS);
24856 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
24857 return SDValue();
24858
24859 SDValue X = CmpLHS->getOperand(Num: 0);
24860 SDValue Y = CmpLHS->getOperand(Num: 1);
24861 if (!isa<ConstantSDNode>(Val: X) || !isa<ConstantSDNode>(Val: Y) || X == Y) {
24862 return SDValue();
24863 }
24864
24865 // If one of the constants is an opaque constant, the X and Y SDNodes may
24866 // still be different even though the real values are the same. So check the
24867 // APInt values here to make sure the code is correct.
24868 ConstantSDNode *CX = cast<ConstantSDNode>(Val&: X);
24869 ConstantSDNode *CY = cast<ConstantSDNode>(Val&: Y);
24870 if (CX->getAPIntValue() == CY->getAPIntValue())
24871 return SDValue();
24872
24873 AArch64CC::CondCode CC =
24874 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(Num: 2));
24875 SDValue Cond = CmpLHS->getOperand(Num: 3);
24876
24877 if (CmpRHS == Y)
24878 CC = AArch64CC::getInvertedCondCode(Code: CC);
24879 else if (CmpRHS != X)
24880 return SDValue();
24881
24882 if (OpCC == AArch64CC::NE)
24883 CC = AArch64CC::getInvertedCondCode(Code: CC);
24884 else if (OpCC != AArch64CC::EQ)
24885 return SDValue();
24886
24887 SDLoc DL(Op);
24888 EVT VT = Op->getValueType(ResNo: 0);
24889
24890 SDValue CCValue = DAG.getConstant(Val: CC, DL, VT: MVT::i32);
24891 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: L, N2: R, N3: CCValue, N4: Cond);
24892}
24893
24894// Reassociate the true/false expressions of a CSEL instruction to obtain a
24895// common subexpression with the comparison instruction. For example, change
24896// (CSEL (ADD (ADD x y) -c) f LO (SUBS x c)) to
24897// (CSEL (ADD (SUBS x c) y) f LO (SUBS x c)) such that (SUBS x c) is a common
24898// subexpression.
24899static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) {
24900 SDValue SubsNode = N->getOperand(Num: 3);
24901 if (SubsNode.getOpcode() != AArch64ISD::SUBS || !SubsNode.hasOneUse())
24902 return SDValue();
24903
24904 SDValue CmpOpToMatch = SubsNode.getOperand(i: 1);
24905 SDValue CmpOpOther = SubsNode.getOperand(i: 0);
24906 EVT VT = N->getValueType(ResNo: 0);
24907
24908 unsigned ExpectedOpcode;
24909 SDValue ExpectedOp;
24910 SDValue SubsOp;
24911 auto *CmpOpConst = dyn_cast<ConstantSDNode>(Val&: CmpOpToMatch);
24912 if (CmpOpConst) {
24913 ExpectedOpcode = ISD::ADD;
24914 ExpectedOp =
24915 DAG.getConstant(Val: -CmpOpConst->getAPIntValue(), DL: SDLoc(CmpOpConst),
24916 VT: CmpOpConst->getValueType(ResNo: 0));
24917 SubsOp = DAG.getConstant(Val: CmpOpConst->getAPIntValue(), DL: SDLoc(CmpOpConst),
24918 VT: CmpOpConst->getValueType(ResNo: 0));
24919 } else {
24920 ExpectedOpcode = ISD::SUB;
24921 ExpectedOp = CmpOpToMatch;
24922 SubsOp = CmpOpToMatch;
24923 }
24924
24925 // Get the operand that can be reassociated with the SUBS instruction.
24926 auto GetReassociationOp = [&](SDValue Op, SDValue ExpectedOp) {
24927 if (Op.getOpcode() != ExpectedOpcode)
24928 return SDValue();
24929 if (Op.getOperand(i: 0).getOpcode() != ISD::ADD ||
24930 !Op.getOperand(i: 0).hasOneUse())
24931 return SDValue();
24932 SDValue X = Op.getOperand(i: 0).getOperand(i: 0);
24933 SDValue Y = Op.getOperand(i: 0).getOperand(i: 1);
24934 if (X != CmpOpOther)
24935 std::swap(a&: X, b&: Y);
24936 if (X != CmpOpOther)
24937 return SDValue();
24938 if (ExpectedOp != Op.getOperand(i: 1))
24939 return SDValue();
24940 return Y;
24941 };
24942
24943 // Try the reassociation using the given constant and condition code.
24944 auto Fold = [&](AArch64CC::CondCode NewCC, SDValue ExpectedOp,
24945 SDValue SubsOp) {
24946 SDValue TReassocOp = GetReassociationOp(N->getOperand(Num: 0), ExpectedOp);
24947 SDValue FReassocOp = GetReassociationOp(N->getOperand(Num: 1), ExpectedOp);
24948 if (!TReassocOp && !FReassocOp)
24949 return SDValue();
24950
24951 SDValue NewCmp = DAG.getNode(Opcode: AArch64ISD::SUBS, DL: SDLoc(SubsNode),
24952 VTList: DAG.getVTList(VT1: VT, VT2: MVT_CC), N1: CmpOpOther, N2: SubsOp);
24953
24954 auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) {
24955 if (!ReassocOp)
24956 return N->getOperand(Num: OpNum);
24957 SDValue Res = DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(N->getOperand(Num: OpNum)), VT,
24958 N1: NewCmp.getValue(R: 0), N2: ReassocOp);
24959 DAG.ReplaceAllUsesWith(From: N->getOperand(Num: OpNum), To: Res);
24960 return Res;
24961 };
24962
24963 SDValue TValReassoc = Reassociate(TReassocOp, 0);
24964 SDValue FValReassoc = Reassociate(FReassocOp, 1);
24965 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL: SDLoc(N), VT, N1: TValReassoc, N2: FValReassoc,
24966 N3: DAG.getConstant(Val: NewCC, DL: SDLoc(N->getOperand(Num: 2)), VT: MVT_CC),
24967 N4: NewCmp.getValue(R: 1));
24968 };
24969
24970 auto CC = static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(Num: 2));
24971
24972 // First, try to eliminate the compare instruction by searching for a
24973 // subtraction with the same constant.
24974 if (SDValue R = Fold(CC, ExpectedOp, SubsOp))
24975 return R;
24976
24977 if (!CmpOpConst) {
24978 // Try again with the operands of the SUBS instruction and the condition
24979 // swapped. Due to canonicalization, this only helps for non-constant
24980 // operands of the SUBS instruction.
24981 std::swap(a&: CmpOpToMatch, b&: CmpOpOther);
24982 if (SDValue R = Fold(getSwappedCondition(CC), CmpOpToMatch, CmpOpToMatch))
24983 return R;
24984 return SDValue();
24985 }
24986
24987 if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && !CmpOpConst->isZero())
24988 return SDValue();
24989
24990 // Next, search for a subtraction with a slightly different constant. By
24991 // adjusting the condition code, we can still eliminate the compare
24992 // instruction. Adjusting the constant is only valid if it does not result
24993 // in signed/unsigned wrap for signed/unsigned comparisons, respectively.
24994 // Since such comparisons are trivially true/false, we should not encounter
24995 // them here but check for them nevertheless to be on the safe side.
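  // For example, an unsigned "x < c" (LO) test with c != 0 is equivalent to
  // "x <= c - 1" (LS), so flags produced by an existing SUBS of x and c - 1
  // can be reused by adjusting the condition code.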
24996 auto CheckedFold = [&](bool Check, APInt NewCmpConst,
24997 AArch64CC::CondCode NewCC) {
24998 auto ExpectedOp = DAG.getConstant(Val: -NewCmpConst, DL: SDLoc(CmpOpConst),
24999 VT: CmpOpConst->getValueType(ResNo: 0));
25000 auto SubsOp = DAG.getConstant(Val: NewCmpConst, DL: SDLoc(CmpOpConst),
25001 VT: CmpOpConst->getValueType(ResNo: 0));
25002 return Check ? Fold(NewCC, ExpectedOp, SubsOp) : SDValue();
25003 };
25004 switch (CC) {
25005 case AArch64CC::EQ:
25006 case AArch64CC::LS:
25007 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
25008 CmpOpConst->getAPIntValue() + 1, AArch64CC::LO);
25009 case AArch64CC::NE:
25010 case AArch64CC::HI:
25011 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
25012 CmpOpConst->getAPIntValue() + 1, AArch64CC::HS);
25013 case AArch64CC::LO:
25014 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
25015 CmpOpConst->getAPIntValue() - 1, AArch64CC::LS);
25016 case AArch64CC::HS:
25017 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
25018 CmpOpConst->getAPIntValue() - 1, AArch64CC::HI);
25019 case AArch64CC::LT:
25020 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
25021 CmpOpConst->getAPIntValue() - 1, AArch64CC::LE);
25022 case AArch64CC::LE:
25023 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
25024 CmpOpConst->getAPIntValue() + 1, AArch64CC::LT);
25025 case AArch64CC::GT:
25026 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
25027 CmpOpConst->getAPIntValue() + 1, AArch64CC::GE);
25028 case AArch64CC::GE:
25029 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
25030 CmpOpConst->getAPIntValue() - 1, AArch64CC::GT);
25031 default:
25032 return SDValue();
25033 }
25034}
25035
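// Fold a CSEL that selects between the last active element of a vector
// (LASTB) and a default value, where the condition checks that the same
// predicate has any active lane, into a single CLASTB_N node:
// (CSEL (LASTB P, Z), X, NE (PTEST_ANY TP, P)) -> (CLASTB_N P, X, Z)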
25036static SDValue foldCSELofLASTB(SDNode *Op, SelectionDAG &DAG) {
25037 AArch64CC::CondCode OpCC =
25038 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(Num: 2));
25039
25040 if (OpCC != AArch64CC::NE)
25041 return SDValue();
25042
25043 SDValue PTest = Op->getOperand(Num: 3);
25044 if (PTest.getOpcode() != AArch64ISD::PTEST_ANY)
25045 return SDValue();
25046
25047 SDValue TruePred = PTest.getOperand(i: 0);
25048 SDValue AnyPred = PTest.getOperand(i: 1);
25049
25050 if (TruePred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
25051 TruePred = TruePred.getOperand(i: 0);
25052
25053 if (AnyPred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
25054 AnyPred = AnyPred.getOperand(i: 0);
25055
25056 if (TruePred != AnyPred && !isAllActivePredicate(DAG, N: TruePred))
25057 return SDValue();
25058
25059 SDValue LastB = Op->getOperand(Num: 0);
25060 SDValue Default = Op->getOperand(Num: 1);
25061
25062 if (LastB.getOpcode() != AArch64ISD::LASTB || LastB.getOperand(i: 0) != AnyPred)
25063 return SDValue();
25064
25065 return DAG.getNode(Opcode: AArch64ISD::CLASTB_N, DL: SDLoc(Op), VT: Op->getValueType(ResNo: 0),
25066 N1: AnyPred, N2: Default, N3: LastB.getOperand(i: 1));
25067}
25068
25069// Optimize CSEL instructions
25070static SDValue performCSELCombine(SDNode *N,
25071 TargetLowering::DAGCombinerInfo &DCI,
25072 SelectionDAG &DAG) {
25073 // CSEL x, x, cc -> x
25074 if (N->getOperand(Num: 0) == N->getOperand(Num: 1))
25075 return N->getOperand(Num: 0);
25076
25077 if (SDValue R = foldCSELOfCSEL(Op: N, DAG))
25078 return R;
25079
25080 // Try to reassociate the true/false expressions so that we can do CSE with
25081 // a SUBS instruction used to perform the comparison.
25082 if (SDValue R = reassociateCSELOperandsForCSE(N, DAG))
25083 return R;
25084
25085 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
25086 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
25087 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
25088 return Folded;
25089
25090 // CSEL a, b, cc, SUBS(x, y) -> CSEL a, b, swapped(cc), SUBS(y, x)
25091 // if SUB(y, x) already exists and we can produce a swapped predicate for cc.
25092 SDValue Cond = N->getOperand(Num: 3);
25093 if (DCI.isAfterLegalizeDAG() && Cond.getOpcode() == AArch64ISD::SUBS &&
25094 Cond.hasOneUse() && Cond->hasNUsesOfValue(NUses: 0, Value: 0) &&
25095 DAG.doesNodeExist(Opcode: ISD::SUB, VTList: N->getVTList(),
25096 Ops: {Cond.getOperand(i: 1), Cond.getOperand(i: 0)}) &&
25097 !DAG.doesNodeExist(Opcode: ISD::SUB, VTList: N->getVTList(),
25098 Ops: {Cond.getOperand(i: 0), Cond.getOperand(i: 1)}) &&
25099 !isNullConstant(V: Cond.getOperand(i: 1))) {
25100 AArch64CC::CondCode OldCond =
25101 static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(Num: 2));
25102 AArch64CC::CondCode NewCond = getSwappedCondition(CC: OldCond);
25103 if (NewCond != AArch64CC::AL) {
25104 SDLoc DL(N);
25105 SDValue Sub = DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: Cond->getVTList(),
25106 N1: Cond.getOperand(i: 1), N2: Cond.getOperand(i: 0));
25107 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VTList: N->getVTList(), N1: N->getOperand(Num: 0),
25108 N2: N->getOperand(Num: 1),
25109 N3: DAG.getConstant(Val: NewCond, DL, VT: MVT::i32),
25110 N4: Sub.getValue(R: 1));
25111 }
25112 }
25113
25114 // CSEL (LASTB P, Z), X, NE(ANY P) -> CLASTB P, X, Z
25115 if (SDValue CondLast = foldCSELofLASTB(Op: N, DAG))
25116 return CondLast;
25117
25118 return performCONDCombine(N, DCI, DAG, CCIndex: 2, CmpIndex: 3);
25119}
25120
// Try to re-use an already extended operand of a vector SetCC feeding an
// extended select. Doing so avoids requiring another full extension of the
// SETCC result when lowering the select.
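// For example, if a setcc on vNi8 operands only feeds vNi16 vselects and a
// sign/zero extension of its first operand to vNi16 already exists in the DAG,
// the compare can instead be performed on the extended operands, so the i1
// result does not have to be widened again for the selects.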
25124static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
25125 EVT Op0MVT = Op->getOperand(Num: 0).getValueType();
25126 if (!Op0MVT.isVector() || Op->use_empty())
25127 return SDValue();
25128
  // Make sure that all uses of Op are VSELECTs with the same result type, and
  // that this result type has a larger element type than the SetCC operand.
25131 SDNode *FirstUse = *Op->user_begin();
25132 if (FirstUse->getOpcode() != ISD::VSELECT)
25133 return SDValue();
25134 EVT UseMVT = FirstUse->getValueType(ResNo: 0);
25135 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
25136 return SDValue();
25137 if (any_of(Range: Op->users(), P: [&UseMVT](const SDNode *N) {
25138 return N->getOpcode() != ISD::VSELECT || N->getValueType(ResNo: 0) != UseMVT;
25139 }))
25140 return SDValue();
25141
25142 APInt V;
25143 if (!ISD::isConstantSplatVector(N: Op->getOperand(Num: 1).getNode(), SplatValue&: V))
25144 return SDValue();
25145
25146 SDLoc DL(Op);
25147 SDValue Op0ExtV;
25148 SDValue Op1ExtV;
25149 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op->getOperand(Num: 2))->get();
25150 // Check if the first operand of the SET_CC is already extended. If it is,
25151 // split the SET_CC and re-use the extended version of the operand.
25152 SDNode *Op0SExt = DAG.getNodeIfExists(Opcode: ISD::SIGN_EXTEND, VTList: DAG.getVTList(VT: UseMVT),
25153 Ops: Op->getOperand(Num: 0));
25154 SDNode *Op0ZExt = DAG.getNodeIfExists(Opcode: ISD::ZERO_EXTEND, VTList: DAG.getVTList(VT: UseMVT),
25155 Ops: Op->getOperand(Num: 0));
25156 if (Op0SExt && (isSignedIntSetCC(Code: CC) || isIntEqualitySetCC(Code: CC))) {
25157 Op0ExtV = SDValue(Op0SExt, 0);
25158 Op1ExtV = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: UseMVT, Operand: Op->getOperand(Num: 1));
25159 } else if (Op0ZExt && (isUnsignedIntSetCC(Code: CC) || isIntEqualitySetCC(Code: CC))) {
25160 Op0ExtV = SDValue(Op0ZExt, 0);
25161 Op1ExtV = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: UseMVT, Operand: Op->getOperand(Num: 1));
25162 } else
25163 return SDValue();
25164
25165 return DAG.getNode(Opcode: ISD::SETCC, DL, VT: UseMVT.changeVectorElementType(EltVT: MVT::i1),
25166 N1: Op0ExtV, N2: Op1ExtV, N3: Op->getOperand(Num: 2));
25167}
25168
25169static SDValue
25170performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
25171 SelectionDAG &DAG) {
25172 SDValue Vec = N->getOperand(Num: 0);
25173 if (DCI.isBeforeLegalize() &&
25174 Vec.getValueType().getVectorElementType() == MVT::i1 &&
25175 Vec.getValueType().isFixedLengthVector() &&
25176 Vec.getValueType().isPow2VectorType()) {
25177 SDLoc DL(N);
25178 return getVectorBitwiseReduce(Opcode: N->getOpcode(), Vec, VT: N->getValueType(ResNo: 0), DL,
25179 DAG);
25180 }
25181
25182 return SDValue();
25183}
25184
25185static SDValue performSETCCCombine(SDNode *N,
25186 TargetLowering::DAGCombinerInfo &DCI,
25187 SelectionDAG &DAG) {
25188 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
25189 SDValue LHS = N->getOperand(Num: 0);
25190 SDValue RHS = N->getOperand(Num: 1);
25191 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
25192 SDLoc DL(N);
25193 EVT VT = N->getValueType(ResNo: 0);
25194
25195 if (SDValue V = tryToWidenSetCCOperands(Op: N, DAG))
25196 return V;
25197
25198 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
25199 if (Cond == ISD::SETNE && isOneConstant(V: RHS) &&
25200 LHS->getOpcode() == AArch64ISD::CSEL &&
25201 isNullConstant(V: LHS->getOperand(Num: 0)) && isOneConstant(V: LHS->getOperand(Num: 1)) &&
25202 LHS->hasOneUse()) {
25203 // Invert CSEL's condition.
25204 auto OldCond =
25205 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(i: 2));
25206 auto NewCond = getInvertedCondCode(Code: OldCond);
25207
25208 // csel 0, 1, !cond, X
25209 SDValue CSEL =
25210 DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: LHS.getValueType(), N1: LHS.getOperand(i: 0),
25211 N2: LHS.getOperand(i: 1), N3: DAG.getConstant(Val: NewCond, DL, VT: MVT::i32),
25212 N4: LHS.getOperand(i: 3));
25213 return DAG.getZExtOrTrunc(Op: CSEL, DL, VT);
25214 }
25215
25216 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
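  // (x >> imm) is non-zero exactly when some bit at or above position imm is
  // set, which is what the AND with the (-1 << imm) mask tests; the AND form
  // is typically matched as a single TST with a logical immediate.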
25217 if (Cond == ISD::SETNE && isNullConstant(V: RHS) &&
25218 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Val: LHS->getOperand(Num: 1)) &&
25219 LHS->hasOneUse()) {
25220 EVT TstVT = LHS->getValueType(ResNo: 0);
25221 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64 &&
25222 LHS->getConstantOperandVal(Num: 1) < TstVT.getFixedSizeInBits()) {
      // This pattern is optimized further by emitComparison.
25224 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(Num: 1);
25225 SDValue TST = DAG.getNode(Opcode: ISD::AND, DL, VT: TstVT, N1: LHS->getOperand(Num: 0),
25226 N2: DAG.getSignedConstant(Val: TstImm, DL, VT: TstVT));
25227 return DAG.getNode(Opcode: ISD::SETCC, DL, VT, N1: TST, N2: RHS, N3: N->getOperand(Num: 2));
25228 }
25229 }
25230
25231 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
25232 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
25233 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
25234 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
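  // The bitcast packs the i1 lanes into an integer: that integer is zero iff
  // no lane is set (vecreduce_or is false) and all-ones iff every lane is set
  // (vecreduce_and is true), so the comparisons are equivalent.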
25235 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
25236 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
25237 (isNullConstant(V: RHS) || isAllOnesConstant(V: RHS)) &&
25238 LHS->getOpcode() == ISD::BITCAST) {
25239 EVT ToVT = LHS->getValueType(ResNo: 0);
25240 EVT FromVT = LHS->getOperand(Num: 0).getValueType();
25241 if (FromVT.isFixedLengthVector() &&
25242 FromVT.getVectorElementType() == MVT::i1) {
25243 bool IsNull = isNullConstant(V: RHS);
25244 LHS = DAG.getNode(Opcode: IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
25245 DL, VT: MVT::i1, Operand: LHS->getOperand(Num: 0));
25246 LHS = DAG.getNode(Opcode: IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, VT: ToVT,
25247 Operand: LHS);
25248 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
25249 }
25250 }
25251
25252 // Try to perform the memcmp when the result is tested for [in]equality with 0
25253 if (SDValue V = performOrXorChainCombine(N, DAG))
25254 return V;
25255
25256 EVT CmpVT = LHS.getValueType();
25257
25258 // NOTE: This exists as a combine only because it proved too awkward to match
25259 // splat(1) across all the NEON types during isel.
25260 APInt SplatLHSVal;
25261 if (CmpVT.isInteger() && Cond == ISD::SETGT &&
25262 ISD::isConstantSplatVector(N: LHS.getNode(), SplatValue&: SplatLHSVal) &&
25263 SplatLHSVal.isOne())
25264 return DAG.getSetCC(DL, VT, LHS: DAG.getConstant(Val: 0, DL, VT: CmpVT), RHS, Cond: ISD::SETGE);
25265
25266 return SDValue();
25267}
25268
// Replace a flag-setting operator (e.g. ANDS) with the generic version
// (e.g. AND) if the flag result is unused.
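// For example, an ANDS whose NZCV result is unused becomes a plain AND (and
// can then CSE with an existing AND); conversely, an existing AND with the
// same operands is folded into a live ANDS so the logical result is computed
// only once.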
25271static SDValue performFlagSettingCombine(SDNode *N,
25272 TargetLowering::DAGCombinerInfo &DCI,
25273 unsigned GenericOpcode) {
25274 SDLoc DL(N);
25275 SDValue LHS = N->getOperand(Num: 0);
25276 SDValue RHS = N->getOperand(Num: 1);
25277 EVT VT = N->getValueType(ResNo: 0);
25278
25279 // If the flag result isn't used, convert back to a generic opcode.
25280 if (!N->hasAnyUseOfValue(Value: 1)) {
25281 SDValue Res = DCI.DAG.getNode(Opcode: GenericOpcode, DL, VT, Ops: N->ops());
25282 return DCI.DAG.getMergeValues(Ops: {Res, DCI.DAG.getConstant(Val: 0, DL, VT: MVT::i32)},
25283 dl: DL);
25284 }
25285
25286 // Combine identical generic nodes into this node, re-using the result.
25287 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
25288 Opcode: GenericOpcode, VTList: DCI.DAG.getVTList(VT), Ops: {LHS, RHS}))
25289 DCI.CombineTo(N: Generic, Res: SDValue(N, 0));
25290
25291 return SDValue();
25292}
25293
25294static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
25295 // setcc_merge_zero pred
25296 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
25297 // => extract_subvector (inner setcc_merge_zero)
25298 SDValue Pred = N->getOperand(Num: 0);
25299 SDValue LHS = N->getOperand(Num: 1);
25300 SDValue RHS = N->getOperand(Num: 2);
25301 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 3))->get();
25302
25303 if (Cond != ISD::SETNE || !isZerosVector(N: RHS.getNode()) ||
25304 LHS->getOpcode() != ISD::SIGN_EXTEND)
25305 return SDValue();
25306
25307 SDValue Extract = LHS->getOperand(Num: 0);
25308 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
25309 Extract->getValueType(ResNo: 0) != N->getValueType(ResNo: 0) ||
25310 Extract->getConstantOperandVal(Num: 1) != 0)
25311 return SDValue();
25312
25313 SDValue InnerSetCC = Extract->getOperand(Num: 0);
25314 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
25315 return SDValue();
25316
25317 // By this point we've effectively got
25318 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
25319 // lanes are already zero then the trunc(sext()) sequence is redundant and we
25320 // can operate on A directly.
25321 SDValue InnerPred = InnerSetCC.getOperand(i: 0);
25322 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
25323 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
25324 Pred.getConstantOperandVal(i: 0) == InnerPred.getConstantOperandVal(i: 0) &&
25325 Pred->getConstantOperandVal(Num: 0) >= AArch64SVEPredPattern::vl1 &&
25326 Pred->getConstantOperandVal(Num: 0) <= AArch64SVEPredPattern::vl256)
25327 return Extract;
25328
25329 return SDValue();
25330}
25331
25332static SDValue
25333performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
25334 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
25335 "Unexpected opcode!");
25336
25337 SelectionDAG &DAG = DCI.DAG;
25338 SDValue Pred = N->getOperand(Num: 0);
25339 SDValue LHS = N->getOperand(Num: 1);
25340 SDValue RHS = N->getOperand(Num: 2);
25341 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 3))->get();
25342
25343 if (SDValue V = performSetCCPunpkCombine(N, DAG))
25344 return V;
25345
25346 if (Cond == ISD::SETNE && isZerosVector(N: RHS.getNode()) &&
25347 LHS->getOpcode() == ISD::SIGN_EXTEND &&
25348 LHS->getOperand(Num: 0)->getValueType(ResNo: 0) == N->getValueType(ResNo: 0)) {
25349 // setcc_merge_zero(
25350 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
25351 // => setcc_merge_zero(pred, ...)
25352 if (LHS->getOperand(Num: 0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
25353 LHS->getOperand(Num: 0)->getOperand(Num: 0) == Pred)
25354 return LHS->getOperand(Num: 0);
25355
25356 // setcc_merge_zero(
25357 // all_active, extend(nxvNi1 ...), != splat(0))
25358 // -> nxvNi1 ...
25359 if (isAllActivePredicate(DAG, N: Pred))
25360 return LHS->getOperand(Num: 0);
25361
25362 // setcc_merge_zero(
25363 // pred, extend(nxvNi1 ...), != splat(0))
25364 // -> nxvNi1 and(pred, ...)
25365 if (DCI.isAfterLegalizeDAG())
25366 // Do this after legalization to allow more folds on setcc_merge_zero
25367 // to be recognized.
25368 return DAG.getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
25369 N1: LHS->getOperand(Num: 0), N2: Pred);
25370 }
25371
25372 return SDValue();
25373}
25374
25375// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
25376// as well as whether the test should be inverted. This code is required to
25377// catch these cases (as opposed to standard dag combines) because
25378// AArch64ISD::TBZ is matched during legalization.
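// For example, (tbz (and (srl x, 3), 1), 0) tests bit 3 of x directly, and
// (tbz (xor x, -1), b) becomes (tbnz x, b).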
25379static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
25380 SelectionDAG &DAG) {
25381
25382 if (!Op->hasOneUse())
25383 return Op;
25384
25385 // We don't handle undef/constant-fold cases below, as they should have
25386 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
25387 // etc.)
25388
25389 // (tbz (trunc x), b) -> (tbz x, b)
25390 // This case is just here to enable more of the below cases to be caught.
25391 if (Op->getOpcode() == ISD::TRUNCATE &&
25392 Bit < Op->getValueType(ResNo: 0).getSizeInBits()) {
25393 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
25394 }
25395
25396 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
25397 if (Op->getOpcode() == ISD::ANY_EXTEND &&
25398 Bit < Op->getOperand(Num: 0).getValueSizeInBits()) {
25399 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
25400 }
25401
25402 if (Op->getNumOperands() != 2)
25403 return Op;
25404
25405 auto *C = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
25406 if (!C)
25407 return Op;
25408
25409 switch (Op->getOpcode()) {
25410 default:
25411 return Op;
25412
25413 // (tbz (and x, m), b) -> (tbz x, b)
25414 case ISD::AND:
25415 if ((C->getZExtValue() >> Bit) & 1)
25416 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
25417 return Op;
25418
25419 // (tbz (shl x, c), b) -> (tbz x, b-c)
25420 case ISD::SHL:
25421 if (C->getZExtValue() <= Bit &&
25422 (Bit - C->getZExtValue()) < Op->getValueType(ResNo: 0).getSizeInBits()) {
25423 Bit = Bit - C->getZExtValue();
25424 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
25425 }
25426 return Op;
25427
25428 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
25429 case ISD::SRA:
25430 Bit = Bit + C->getZExtValue();
25431 if (Bit >= Op->getValueType(ResNo: 0).getSizeInBits())
25432 Bit = Op->getValueType(ResNo: 0).getSizeInBits() - 1;
25433 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
25434
25435 // (tbz (srl x, c), b) -> (tbz x, b+c)
25436 case ISD::SRL:
25437 if ((Bit + C->getZExtValue()) < Op->getValueType(ResNo: 0).getSizeInBits()) {
25438 Bit = Bit + C->getZExtValue();
25439 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
25440 }
25441 return Op;
25442
25443 // (tbz (xor x, -1), b) -> (tbnz x, b)
25444 case ISD::XOR:
25445 if ((C->getZExtValue() >> Bit) & 1)
25446 Invert = !Invert;
25447 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
25448 }
25449}
25450
25451// Optimize test single bit zero/non-zero and branch.
25452static SDValue performTBZCombine(SDNode *N,
25453 TargetLowering::DAGCombinerInfo &DCI,
25454 SelectionDAG &DAG) {
25455 unsigned Bit = N->getConstantOperandVal(Num: 2);
25456 bool Invert = false;
25457 SDValue TestSrc = N->getOperand(Num: 1);
25458 SDValue NewTestSrc = getTestBitOperand(Op: TestSrc, Bit, Invert, DAG);
25459
25460 if (TestSrc == NewTestSrc)
25461 return SDValue();
25462
25463 unsigned NewOpc = N->getOpcode();
25464 if (Invert) {
25465 if (NewOpc == AArch64ISD::TBZ)
25466 NewOpc = AArch64ISD::TBNZ;
25467 else {
25468 assert(NewOpc == AArch64ISD::TBNZ);
25469 NewOpc = AArch64ISD::TBZ;
25470 }
25471 }
25472
25473 SDLoc DL(N);
25474 return DAG.getNode(Opcode: NewOpc, DL, VT: MVT::Other, N1: N->getOperand(Num: 0), N2: NewTestSrc,
25475 N3: DAG.getConstant(Val: Bit, DL, VT: MVT::i64), N4: N->getOperand(Num: 3));
25476}
25477
// Swap the vselect operands where doing so may allow a predicated operation to
// also perform the `sel`.
25480//
25481// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
25482// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
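// Swapping can be profitable for scalable vectors because a merging predicated
// FP operation leaves its first source operand unchanged in the inactive
// lanes; with the operation in the "true" position and its first operand as
// the "false" value, the vselect can fold into that single predicated
// instruction.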
25483static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
25484 auto SelectA = N->getOperand(Num: 1);
25485 auto SelectB = N->getOperand(Num: 2);
25486 auto NTy = N->getValueType(ResNo: 0);
25487
25488 if (!NTy.isScalableVector())
25489 return SDValue();
25490 SDValue SetCC = N->getOperand(Num: 0);
25491 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
25492 return SDValue();
25493
25494 switch (SelectB.getOpcode()) {
25495 default:
25496 return SDValue();
25497 case ISD::FMUL:
25498 case ISD::FSUB:
25499 case ISD::FADD:
25500 break;
25501 }
25502 if (SelectA != SelectB.getOperand(i: 0))
25503 return SDValue();
25504
25505 ISD::CondCode CC = cast<CondCodeSDNode>(Val: SetCC.getOperand(i: 2))->get();
25506 ISD::CondCode InverseCC =
25507 ISD::getSetCCInverse(Operation: CC, Type: SetCC.getOperand(i: 0).getValueType());
25508 auto InverseSetCC =
25509 DAG.getSetCC(DL: SDLoc(SetCC), VT: SetCC.getValueType(), LHS: SetCC.getOperand(i: 0),
25510 RHS: SetCC.getOperand(i: 1), Cond: InverseCC);
25511
25512 return DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT: NTy,
25513 Ops: {InverseSetCC, SelectB, SelectA});
25514}
25515
25516// vselect (v1i1 setcc) ->
25517// vselect (v1iXX setcc) (XX is the size of the compared operand type)
// FIXME: Currently the type legalizer can't handle VSELECT with a v1i1
// condition. Once it can legalize "VSELECT v1i1" correctly, there will be no
// need to combine such VSELECTs.
25521static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
25522 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
25523 return SwapResult;
25524
25525 SDValue N0 = N->getOperand(Num: 0);
25526 EVT CCVT = N0.getValueType();
25527
25528 if (isAllActivePredicate(DAG, N: N0))
25529 return N->getOperand(Num: 1);
25530
25531 if (isAllInactivePredicate(N: N0))
25532 return N->getOperand(Num: 2);
25533
  // Check for the sign pattern (VSELECT (setgt iN lhs, -1), 1, -1) and
  // transform it into (OR (ASR lhs, N-1), 1), which requires fewer
  // instructions for the supported types.
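  // (ASR lhs, N-1) is 0 for non-negative lanes and all-ones for negative
  // lanes; OR-ing with splat(1) therefore yields 1 or -1 respectively, exactly
  // the values selected by (setgt lhs, -1).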
25537 SDValue SetCC = N->getOperand(Num: 0);
25538 if (SetCC.getOpcode() == ISD::SETCC &&
25539 SetCC.getOperand(i: 2) == DAG.getCondCode(Cond: ISD::SETGT)) {
25540 SDValue CmpLHS = SetCC.getOperand(i: 0);
25541 EVT VT = CmpLHS.getValueType();
25542 SDNode *CmpRHS = SetCC.getOperand(i: 1).getNode();
25543 SDNode *SplatLHS = N->getOperand(Num: 1).getNode();
25544 SDNode *SplatRHS = N->getOperand(Num: 2).getNode();
25545 APInt SplatLHSVal;
25546 if (CmpLHS.getValueType() == N->getOperand(Num: 1).getValueType() &&
25547 VT.isSimple() &&
25548 is_contained(Range: ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
25549 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
25550 Element: VT.getSimpleVT().SimpleTy) &&
25551 ISD::isConstantSplatVector(N: SplatLHS, SplatValue&: SplatLHSVal) &&
25552 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(N: CmpRHS) &&
25553 ISD::isConstantSplatVectorAllOnes(N: SplatRHS)) {
25554 unsigned NumElts = VT.getVectorNumElements();
25555 SmallVector<SDValue, 8> Ops(
25556 NumElts, DAG.getConstant(Val: VT.getScalarSizeInBits() - 1, DL: SDLoc(N),
25557 VT: VT.getScalarType()));
25558 SDValue Val = DAG.getBuildVector(VT, DL: SDLoc(N), Ops);
25559
25560 auto Shift = DAG.getNode(Opcode: ISD::SRA, DL: SDLoc(N), VT, N1: CmpLHS, N2: Val);
25561 auto Or = DAG.getNode(Opcode: ISD::OR, DL: SDLoc(N), VT, N1: Shift, N2: N->getOperand(Num: 1));
25562 return Or;
25563 }
25564 }
25565
25566 EVT CmpVT = N0.getOperand(i: 0).getValueType();
25567 if (N0.getOpcode() != ISD::SETCC ||
25568 CCVT.getVectorElementCount() != ElementCount::getFixed(MinVal: 1) ||
25569 CCVT.getVectorElementType() != MVT::i1 ||
25570 CmpVT.getVectorElementType().isFloatingPoint())
25571 return SDValue();
25572
25573 EVT ResVT = N->getValueType(ResNo: 0);
25574 // Only combine when the result type is of the same size as the compared
25575 // operands.
25576 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
25577 return SDValue();
25578
25579 SDValue IfTrue = N->getOperand(Num: 1);
25580 SDValue IfFalse = N->getOperand(Num: 2);
25581 SetCC = DAG.getSetCC(DL: SDLoc(N), VT: CmpVT.changeVectorElementTypeToInteger(),
25582 LHS: N0.getOperand(i: 0), RHS: N0.getOperand(i: 1),
25583 Cond: cast<CondCodeSDNode>(Val: N0.getOperand(i: 2))->get());
25584 return DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT: ResVT, N1: SetCC,
25585 N2: IfTrue, N3: IfFalse);
25586}
25587
/// A vector select: "(select (setcc LHS, RHS), vL, vR)" is best performed with
25589/// the compare-mask instructions rather than going via NZCV, even if LHS and
25590/// RHS are really scalar. This replaces any scalar setcc in the above pattern
25591/// with a vector one followed by a DUP shuffle on the result.
25592static SDValue performSelectCombine(SDNode *N,
25593 TargetLowering::DAGCombinerInfo &DCI) {
25594 SelectionDAG &DAG = DCI.DAG;
25595 SDValue N0 = N->getOperand(Num: 0);
25596 EVT ResVT = N->getValueType(ResNo: 0);
25597
25598 if (N0.getOpcode() != ISD::SETCC)
25599 return SDValue();
25600
25601 if (ResVT.isScalableVT())
25602 return SDValue();
25603
25604 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
25605 // scalar SetCCResultType. We also don't expect vectors, because we assume
25606 // that selects fed by vector SETCCs are canonicalized to VSELECT.
25607 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
25608 "Scalar-SETCC feeding SELECT has unexpected result type!");
25609
25610 // If NumMaskElts == 0, the comparison is larger than select result. The
25611 // largest real NEON comparison is 64-bits per lane, which means the result is
25612 // at most 32-bits and an illegal vector. Just bail out for now.
25613 EVT SrcVT = N0.getOperand(i: 0).getValueType();
25614
25615 // Don't try to do this optimization when the setcc itself has i1 operands.
25616 // There are no legal vectors of i1, so this would be pointless. v1f16 is
  // ruled out to prevent the creation of setccs that need to be scalarized.
25618 if (SrcVT == MVT::i1 ||
25619 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
25620 return SDValue();
25621
25622 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
25623 if (!ResVT.isVector() || NumMaskElts == 0)
25624 return SDValue();
25625
25626 SrcVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: SrcVT, NumElements: NumMaskElts);
25627 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
25628
25629 // Also bail out if the vector CCVT isn't the same size as ResVT.
25630 // This can happen if the SETCC operand size doesn't divide the ResVT size
25631 // (e.g., f64 vs v3f32).
25632 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
25633 return SDValue();
25634
25635 // Make sure we didn't create illegal types, if we're not supposed to.
25636 assert(DCI.isBeforeLegalize() ||
25637 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
25638
25639 // First perform a vector comparison, where lane 0 is the one we're interested
25640 // in.
25641 SDLoc DL(N0);
25642 SDValue LHS =
25643 DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: SrcVT, Operand: N0.getOperand(i: 0));
25644 SDValue RHS =
25645 DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: SrcVT, Operand: N0.getOperand(i: 1));
25646 SDValue SetCC = DAG.getNode(Opcode: ISD::SETCC, DL, VT: CCVT, N1: LHS, N2: RHS, N3: N0.getOperand(i: 2));
25647
25648 // Now duplicate the comparison mask we want across all other lanes.
25649 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
25650 SDValue Mask = DAG.getVectorShuffle(VT: CCVT, dl: DL, N1: SetCC, N2: SetCC, Mask: DUPMask);
25651 Mask = DAG.getNode(Opcode: ISD::BITCAST, DL,
25652 VT: ResVT.changeVectorElementTypeToInteger(), Operand: Mask);
25653
25654 return DAG.getSelect(DL, VT: ResVT, Cond: Mask, LHS: N->getOperand(Num: 1), RHS: N->getOperand(Num: 2));
25655}
25656
25657static SDValue performDUPCombine(SDNode *N,
25658 TargetLowering::DAGCombinerInfo &DCI) {
25659 EVT VT = N->getValueType(ResNo: 0);
25660 SDLoc DL(N);
25661 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
  // 128-bit vector version.
25663 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
25664 EVT LVT = VT.getDoubleNumVectorElementsVT(Context&: *DCI.DAG.getContext());
25665 SmallVector<SDValue> Ops(N->ops());
25666 if (SDNode *LN = DCI.DAG.getNodeIfExists(Opcode: N->getOpcode(),
25667 VTList: DCI.DAG.getVTList(VT: LVT), Ops)) {
25668 return DCI.DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: SDValue(LN, 0),
25669 N2: DCI.DAG.getConstant(Val: 0, DL, VT: MVT::i64));
25670 }
25671 }
25672
25673 if (N->getOpcode() == AArch64ISD::DUP) {
25674 // If the instruction is known to produce a scalar in SIMD registers, we can
25675 // duplicate it across the vector lanes using DUPLANE instead of moving it
25676 // to a GPR first. For example, this allows us to handle:
25677 // v4i32 = DUP (i32 (FCMGT (f32, f32)))
25678 SDValue Op = N->getOperand(Num: 0);
25679 // FIXME: Ideally, we should be able to handle all instructions that
25680 // produce a scalar value in FPRs.
25681 if (Op.getOpcode() == AArch64ISD::FCMEQ ||
25682 Op.getOpcode() == AArch64ISD::FCMGE ||
25683 Op.getOpcode() == AArch64ISD::FCMGT) {
25684 EVT ElemVT = VT.getVectorElementType();
25685 EVT ExpandedVT = VT;
25686 // Insert into a 128-bit vector to match DUPLANE's pattern.
25687 if (VT.getSizeInBits() != 128)
25688 ExpandedVT = EVT::getVectorVT(Context&: *DCI.DAG.getContext(), VT: ElemVT,
25689 NumElements: 128 / ElemVT.getSizeInBits());
25690 SDValue Zero = DCI.DAG.getConstant(Val: 0, DL, VT: MVT::i64);
25691 SDValue Vec = DCI.DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ExpandedVT,
25692 N1: DCI.DAG.getUNDEF(VT: ExpandedVT), N2: Op, N3: Zero);
25693 return DCI.DAG.getNode(Opcode: getDUPLANEOp(EltType: ElemVT), DL, VT, N1: Vec, N2: Zero);
25694 }
25695
25696 if (DCI.isAfterLegalizeDAG()) {
25697 // If scalar dup's operand is extract_vector_elt, try to combine them into
25698 // duplane. For example,
25699 //
25700 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
25701 // t18: v4i32 = AArch64ISD::DUP t21
25702 // ==>
25703 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
25704 SDValue EXTRACT_VEC_ELT = N->getOperand(Num: 0);
25705 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
25706 if (VT == EXTRACT_VEC_ELT.getOperand(i: 0).getValueType()) {
25707 unsigned Opcode = getDUPLANEOp(EltType: VT.getVectorElementType());
25708 return DCI.DAG.getNode(Opcode, DL, VT, N1: EXTRACT_VEC_ELT.getOperand(i: 0),
25709 N2: EXTRACT_VEC_ELT.getOperand(i: 1));
25710 }
25711 }
25712 }
25713
25714 return performPostLD1Combine(N, DCI, IsLaneOp: false);
25715 }
25716
25717 return SDValue();
25718}
25719
25720/// Get rid of unnecessary NVCASTs (that don't change the type).
25721static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
25722 if (N->getValueType(ResNo: 0) == N->getOperand(Num: 0).getValueType())
25723 return N->getOperand(Num: 0);
25724 if (N->getOperand(Num: 0).getOpcode() == AArch64ISD::NVCAST)
25725 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
25726 Operand: N->getOperand(Num: 0).getOperand(i: 0));
25727
25728 return SDValue();
25729}
25730
25731// If all users of the globaladdr are of the form (globaladdr + constant), find
25732// the smallest constant, fold it into the globaladdr's offset and rewrite the
25733// globaladdr as (globaladdr + constant) - constant.
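// For example, if the global is only used as (globaladdr + 8) and
// (globaladdr + 24), folding the minimum constant gives (globaladdr + 8) - 8,
// so the +8 can be materialised as part of the address calculation and the
// remaining adds become cheap offsets from it.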
25734static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
25735 const AArch64Subtarget *Subtarget,
25736 const TargetMachine &TM) {
25737 auto *GN = cast<GlobalAddressSDNode>(Val: N);
25738 if (Subtarget->ClassifyGlobalReference(GV: GN->getGlobal(), TM) !=
25739 AArch64II::MO_NO_FLAG)
25740 return SDValue();
25741
25742 uint64_t MinOffset = -1ull;
25743 for (SDNode *N : GN->users()) {
25744 if (N->getOpcode() != ISD::ADD)
25745 return SDValue();
25746 auto *C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0));
25747 if (!C)
25748 C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
25749 if (!C)
25750 return SDValue();
25751 MinOffset = std::min(a: MinOffset, b: C->getZExtValue());
25752 }
25753 uint64_t Offset = MinOffset + GN->getOffset();
25754
25755 // Require that the new offset is larger than the existing one. Otherwise, we
25756 // can end up oscillating between two possible DAGs, for example,
25757 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
25758 if (Offset <= uint64_t(GN->getOffset()))
25759 return SDValue();
25760
25761 // Check whether folding this offset is legal. It must not go out of bounds of
25762 // the referenced object to avoid violating the code model, and must be
25763 // smaller than 2^20 because this is the largest offset expressible in all
25764 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
25765 // stores an immediate signed 21 bit offset.)
25766 //
25767 // This check also prevents us from folding negative offsets, which will end
25768 // up being treated in the same way as large positive ones. They could also
25769 // cause code model violations, and aren't really common enough to matter.
25770 if (Offset >= (1 << 20))
25771 return SDValue();
25772
25773 const GlobalValue *GV = GN->getGlobal();
25774 Type *T = GV->getValueType();
25775 if (!T->isSized() ||
25776 Offset > GV->getDataLayout().getTypeAllocSize(Ty: T))
25777 return SDValue();
25778
25779 SDLoc DL(GN);
25780 SDValue Result = DAG.getGlobalAddress(GV, DL, VT: MVT::i64, offset: Offset);
25781 return DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i64, N1: Result,
25782 N2: DAG.getConstant(Val: MinOffset, DL, VT: MVT::i64));
25783}
25784
25785static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
25786 const AArch64Subtarget *Subtarget) {
25787 SDValue BR = N->getOperand(Num: 0);
25788 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
25789 !BR.getValueType().isScalarInteger())
25790 return SDValue();
25791
25792 SDLoc DL(N);
25793 return DAG.getNode(Opcode: ISD::CTTZ, DL, VT: BR.getValueType(), Operand: BR.getOperand(i: 0));
25794}
25795
// Turns the vector of indices into a vector of byte offsets by scaling Offset
// by (BitWidth / 8).
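// For example, with 32-bit elements each index is shifted left by two to turn
// it into a byte offset.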
25798static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
25799 SDLoc DL, unsigned BitWidth) {
25800 assert(Offset.getValueType().isScalableVector() &&
25801 "This method is only for scalable vectors of offsets");
25802
25803 SDValue Shift = DAG.getConstant(Val: Log2_32(Value: BitWidth / 8), DL, VT: MVT::i64);
25804 SDValue SplatShift = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: MVT::nxv2i64, Operand: Shift);
25805
25806 return DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::nxv2i64, N1: Offset, N2: SplatShift);
25807}
25808
25809/// Check if the value of \p OffsetInBytes can be used as an immediate for
25810/// the gather load/prefetch and scatter store instructions with vector base and
25811/// immediate offset addressing mode:
25812///
25813/// [<Zn>.[S|D]{, #<imm>}]
25814///
25815/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
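/// For example, for 32-bit elements (sizeof(<T>) == 4) the valid immediates
/// are 0, 4, 8, ..., 124.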
25816inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
25817 unsigned ScalarSizeInBytes) {
25818 // The immediate is not a multiple of the scalar size.
25819 if (OffsetInBytes % ScalarSizeInBytes)
25820 return false;
25821
25822 // The immediate is out of range.
25823 if (OffsetInBytes / ScalarSizeInBytes > 31)
25824 return false;
25825
25826 return true;
25827}
25828
25829/// Check if the value of \p Offset represents a valid immediate for the SVE
/// gather load/prefetch and scatter store instructions with vector base and
25831/// immediate offset addressing mode:
25832///
25833/// [<Zn>.[S|D]{, #<imm>}]
25834///
25835/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
25836static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
25837 unsigned ScalarSizeInBytes) {
25838 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Val: Offset.getNode());
25839 return OffsetConst && isValidImmForSVEVecImmAddrMode(
25840 OffsetInBytes: OffsetConst->getZExtValue(), ScalarSizeInBytes);
25841}
25842
25843static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
25844 unsigned Opcode,
25845 bool OnlyPackedOffsets = true) {
25846 const SDValue Src = N->getOperand(Num: 2);
25847 const EVT SrcVT = Src->getValueType(ResNo: 0);
25848 assert(SrcVT.isScalableVector() &&
25849 "Scatter stores are only possible for SVE vectors");
25850
25851 SDLoc DL(N);
25852 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
25853
25854 // Make sure that source data will fit into an SVE register
25855 if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
25856 return SDValue();
25857
25858 // For FPs, ACLE only supports _packed_ single and double precision types.
25859 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
25860 if (SrcElVT.isFloatingPoint())
25861 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
25862 ((Opcode != AArch64ISD::SST1Q_PRED &&
25863 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
25864 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
25865 return SDValue();
25866
25867 // Depending on the addressing mode, this is either a pointer or a vector of
25868 // pointers (that fits into one register)
25869 SDValue Base = N->getOperand(Num: 4);
25870 // Depending on the addressing mode, this is either a single offset or a
25871 // vector of offsets (that fits into one register)
25872 SDValue Offset = N->getOperand(Num: 5);
25873
25874 // For "scalar + vector of indices", just scale the indices. This only
25875 // applies to non-temporal scatters because there's no instruction that takes
25876 // indices.
25877 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
25878 Offset =
25879 getScaledOffsetForBitWidth(DAG, Offset, DL, BitWidth: SrcElVT.getSizeInBits());
25880 Opcode = AArch64ISD::SSTNT1_PRED;
25881 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
25882 Offset =
25883 getScaledOffsetForBitWidth(DAG, Offset, DL, BitWidth: SrcElVT.getSizeInBits());
25884 Opcode = AArch64ISD::SST1Q_PRED;
25885 }
25886
  // In the case of non-temporal scatter stores there's only one SVE instruction
  // per data-size, with "vector + scalar" addressing, i.e.
25889 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
25890 // Since we do have intrinsics that allow the arguments to be in a different
25891 // order, we may need to swap them to match the spec.
25892 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
25893 Offset.getValueType().isVector())
25894 std::swap(a&: Base, b&: Offset);
25895
25896 // SST1_IMM requires that the offset is an immediate that is:
25897 // * a multiple of #SizeInBytes,
25898 // * in the range [0, 31 x #SizeInBytes],
25899 // where #SizeInBytes is the size in bytes of the stored items. For
25900 // immediates outside that range and non-immediate scalar offsets use SST1 or
25901 // SST1_UXTW instead.
25902 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
25903 if (!isValidImmForSVEVecImmAddrMode(Offset,
25904 ScalarSizeInBytes: SrcVT.getScalarSizeInBits() / 8)) {
25905 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
25906 Opcode = AArch64ISD::SST1_UXTW_PRED;
25907 else
25908 Opcode = AArch64ISD::SST1_PRED;
25909
25910 std::swap(a&: Base, b&: Offset);
25911 }
25912 }
25913
25914 auto &TLI = DAG.getTargetLoweringInfo();
25915 if (!TLI.isTypeLegal(VT: Base.getValueType()))
25916 return SDValue();
25917
25918 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign- (sxtw) or zero-extended (uxtw) to
25920 // nxv2i64. Legalize accordingly.
25921 if (!OnlyPackedOffsets &&
25922 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
25923 Offset = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::nxv2i64, Operand: Offset).getValue(R: 0);
25924
25925 if (!TLI.isTypeLegal(VT: Offset.getValueType()))
25926 return SDValue();
25927
25928 // Source value type that is representable in hardware
25929 EVT HwSrcVt = getSVEContainerType(ContentTy: SrcVT);
25930
25931 // Keep the original type of the input data to store - this is needed to be
25932 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
25933 // FP values we want the integer equivalent, so just use HwSrcVt.
25934 SDValue InputVT = DAG.getValueType(SrcVT);
25935 if (SrcVT.isFloatingPoint())
25936 InputVT = DAG.getValueType(HwSrcVt);
25937
25938 SDVTList VTs = DAG.getVTList(VT: MVT::Other);
25939 SDValue SrcNew;
25940
25941 if (Src.getValueType().isFloatingPoint())
25942 SrcNew = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: HwSrcVt, Operand: Src);
25943 else
25944 SrcNew = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: HwSrcVt, Operand: Src);
25945
25946 SDValue Ops[] = {N->getOperand(Num: 0), // Chain
25947 SrcNew,
25948 N->getOperand(Num: 3), // Pg
25949 Base,
25950 Offset,
25951 InputVT};
25952
25953 return DAG.getNode(Opcode, DL, VTList: VTs, Ops);
25954}
25955
25956static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
25957 unsigned Opcode,
25958 bool OnlyPackedOffsets = true) {
25959 const EVT RetVT = N->getValueType(ResNo: 0);
25960 assert(RetVT.isScalableVector() &&
25961 "Gather loads are only possible for SVE vectors");
25962
25963 SDLoc DL(N);
25964
25965 // Make sure that the loaded data will fit into an SVE register
25966 if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
25967 return SDValue();
25968
25969 // Depending on the addressing mode, this is either a pointer or a vector of
25970 // pointers (that fits into one register)
25971 SDValue Base = N->getOperand(Num: 3);
25972 // Depending on the addressing mode, this is either a single offset or a
25973 // vector of offsets (that fits into one register)
25974 SDValue Offset = N->getOperand(Num: 4);
25975
25976 // For "scalar + vector of indices", scale the indices to obtain unscaled
25977 // offsets. This applies to non-temporal and quadword gathers, which do not
25978 // have an addressing mode with scaled offset.
25979 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
25980 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
25981 BitWidth: RetVT.getScalarSizeInBits());
25982 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
25983 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
25984 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
25985 BitWidth: RetVT.getScalarSizeInBits());
25986 Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
25987 }
25988
25989 // In the case of non-temporal gather loads and quadword gather loads there's
  // only one addressing mode: "vector + scalar", e.g.
25991 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
25992 // Since we do have intrinsics that allow the arguments to be in a different
25993 // order, we may need to swap them to match the spec.
25994 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
25995 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
25996 Offset.getValueType().isVector())
25997 std::swap(a&: Base, b&: Offset);
25998
25999 // GLD{FF}1_IMM requires that the offset is an immediate that is:
26000 // * a multiple of #SizeInBytes,
26001 // * in the range [0, 31 x #SizeInBytes],
26002 // where #SizeInBytes is the size in bytes of the loaded items. For
26003 // immediates outside that range and non-immediate scalar offsets use
26004 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
26005 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
26006 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
26007 if (!isValidImmForSVEVecImmAddrMode(Offset,
26008 ScalarSizeInBytes: RetVT.getScalarSizeInBits() / 8)) {
26009 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
26010 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
26011 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
26012 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
26013 else
26014 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
26015 ? AArch64ISD::GLD1_MERGE_ZERO
26016 : AArch64ISD::GLDFF1_MERGE_ZERO;
26017
26018 std::swap(a&: Base, b&: Offset);
26019 }
26020 }
26021
26022 auto &TLI = DAG.getTargetLoweringInfo();
26023 if (!TLI.isTypeLegal(VT: Base.getValueType()))
26024 return SDValue();
26025
26026 // Some gather load variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign- (sxtw) or zero-extended (uxtw) to
26028 // nxv2i64. Legalize accordingly.
26029 if (!OnlyPackedOffsets &&
26030 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
26031 Offset = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::nxv2i64, Operand: Offset).getValue(R: 0);
26032
26033 // Return value type that is representable in hardware
26034 EVT HwRetVt = getSVEContainerType(ContentTy: RetVT);
26035
26036 // Keep the original output value type around - this is needed to be able to
26037 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
26038 // values we want the integer equivalent, so just use HwRetVT.
26039 SDValue OutVT = DAG.getValueType(RetVT);
26040 if (RetVT.isFloatingPoint())
26041 OutVT = DAG.getValueType(HwRetVt);
26042
26043 SDVTList VTs = DAG.getVTList(VT1: HwRetVt, VT2: MVT::Other);
26044 SDValue Ops[] = {N->getOperand(Num: 0), // Chain
26045 N->getOperand(Num: 2), // Pg
26046 Base, Offset, OutVT};
26047
26048 SDValue Load = DAG.getNode(Opcode, DL, VTList: VTs, Ops);
26049 SDValue LoadChain = SDValue(Load.getNode(), 1);
26050
26051 if (RetVT.isInteger() && (RetVT != HwRetVt))
26052 Load = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: RetVT, Operand: Load.getValue(R: 0));
26053
26054 // If the original return value was FP, bitcast accordingly. Doing it here
26055 // means that we can avoid adding TableGen patterns for FPs.
26056 if (RetVT.isFloatingPoint())
26057 Load = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: RetVT, Operand: Load.getValue(R: 0));
26058
26059 return DAG.getMergeValues(Ops: {Load, LoadChain}, dl: DL);
26060}
26061
26062static SDValue
26063performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
26064 SelectionDAG &DAG) {
26065 SDLoc DL(N);
26066 SDValue Src = N->getOperand(Num: 0);
26067 unsigned Opc = Src->getOpcode();
26068
26069 // Sign extend of an unsigned unpack -> signed unpack
26070 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
26071
26072 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
26073 : AArch64ISD::SUNPKLO;
26074
26075 // Push the sign extend to the operand of the unpack
26076 // This is necessary where, for example, the operand of the unpack
26077 // is another unpack:
26078 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
26079 // ->
26080 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
26081 // ->
26082 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
26083 SDValue ExtOp = Src->getOperand(Num: 0);
26084 auto VT = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT();
26085 EVT EltTy = VT.getVectorElementType();
26086 (void)EltTy;
26087
26088 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
26089 "Sign extending from an invalid type");
26090
26091 EVT ExtVT = VT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
26092
26093 SDValue Ext = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: ExtOp.getValueType(),
26094 N1: ExtOp, N2: DAG.getValueType(ExtVT));
26095
26096 return DAG.getNode(Opcode: SOpc, DL, VT: N->getValueType(ResNo: 0), Operand: Ext);
26097 }
26098
26099 if (DCI.isBeforeLegalizeOps())
26100 return SDValue();
26101
26102 if (!EnableCombineMGatherIntrinsics)
26103 return SDValue();
26104
26105 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
26106 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
26107 unsigned NewOpc;
26108 unsigned MemVTOpNum = 4;
26109 switch (Opc) {
26110 case AArch64ISD::LD1_MERGE_ZERO:
26111 NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
26112 MemVTOpNum = 3;
26113 break;
26114 case AArch64ISD::LDNF1_MERGE_ZERO:
26115 NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
26116 MemVTOpNum = 3;
26117 break;
26118 case AArch64ISD::LDFF1_MERGE_ZERO:
26119 NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
26120 MemVTOpNum = 3;
26121 break;
26122 case AArch64ISD::GLD1_MERGE_ZERO:
26123 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
26124 break;
26125 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
26126 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
26127 break;
26128 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
26129 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
26130 break;
26131 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
26132 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
26133 break;
26134 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
26135 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
26136 break;
26137 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
26138 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
26139 break;
26140 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
26141 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
26142 break;
26143 case AArch64ISD::GLDFF1_MERGE_ZERO:
26144 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
26145 break;
26146 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
26147 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
26148 break;
26149 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
26150 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
26151 break;
26152 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
26153 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
26154 break;
26155 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
26156 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
26157 break;
26158 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
26159 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
26160 break;
26161 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
26162 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
26163 break;
26164 case AArch64ISD::GLDNT1_MERGE_ZERO:
26165 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
26166 break;
26167 default:
26168 return SDValue();
26169 }
26170
26171 EVT SignExtSrcVT = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT();
26172 EVT SrcMemVT = cast<VTSDNode>(Val: Src->getOperand(Num: MemVTOpNum))->getVT();
26173
26174 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
26175 return SDValue();
26176
26177 EVT DstVT = N->getValueType(ResNo: 0);
26178 SDVTList VTs = DAG.getVTList(VT1: DstVT, VT2: MVT::Other);
26179
26180 SmallVector<SDValue, 5> Ops;
26181 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
26182 Ops.push_back(Elt: Src->getOperand(Num: I));
26183
26184 SDValue ExtLoad = DAG.getNode(Opcode: NewOpc, DL: SDLoc(N), VTList: VTs, Ops);
26185 DCI.CombineTo(N, Res: ExtLoad);
26186 DCI.CombineTo(N: Src.getNode(), Res0: ExtLoad, Res1: ExtLoad.getValue(R: 1));
26187
26188 // Return N so it doesn't get rechecked
26189 return SDValue(N, 0);
26190}
26191
26192/// Legalize the gather prefetch (scalar + vector addressing mode) when the
26193/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
26194/// != nxv2i32) do not need legalization.
26195static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
26196 const unsigned OffsetPos = 4;
26197 SDValue Offset = N->getOperand(Num: OffsetPos);
26198
26199 // Not an unpacked vector, bail out.
26200 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
26201 return SDValue();
26202
26203 // Extend the unpacked offset vector to 64-bit lanes.
26204 SDLoc DL(N);
26205 Offset = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::nxv2i64, Operand: Offset);
26206 SmallVector<SDValue, 5> Ops(N->ops());
26207 // Replace the offset operand with the 64-bit one.
26208 Ops[OffsetPos] = Offset;
26209
26210 return DAG.getNode(Opcode: N->getOpcode(), DL, VTList: DAG.getVTList(VT: MVT::Other), Ops);
26211}
26212
26213/// Combines a node carrying the intrinsic
26214/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
26215/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
26216/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
/// SVE gather prefetch instruction with vector plus immediate addressing mode.
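/// In effect the "vector base + out-of-range immediate" form is re-expressed
/// as "scalar base + vector index" so that it can still be selected as a
/// gather prefetch.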
26218static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
26219 unsigned ScalarSizeInBytes) {
26220 const unsigned ImmPos = 4, OffsetPos = 3;
26221 // No need to combine the node if the immediate is valid...
26222 if (isValidImmForSVEVecImmAddrMode(Offset: N->getOperand(Num: ImmPos), ScalarSizeInBytes))
26223 return SDValue();
26224
26225 // ...otherwise swap the offset base with the offset...
26226 SmallVector<SDValue, 5> Ops(N->ops());
26227 std::swap(a&: Ops[ImmPos], b&: Ops[OffsetPos]);
26228 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
26229 // `aarch64_sve_prfb_gather_uxtw_index`.
26230 SDLoc DL(N);
26231 Ops[1] = DAG.getConstant(Val: Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
26232 VT: MVT::i64);
26233
26234 return DAG.getNode(Opcode: N->getOpcode(), DL, VTList: DAG.getVTList(VT: MVT::Other), Ops);
26235}
26236
// Return true if the vector operation can guarantee that only the first lane
// of its result contains data, with all bits in the other lanes set to zero.
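// This holds for the SVE reductions below, which produce their scalar result
// in lane 0 and zero the remaining lanes of the destination register.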
26239static bool isLanes1toNKnownZero(SDValue Op) {
26240 switch (Op.getOpcode()) {
26241 default:
26242 return false;
26243 case AArch64ISD::ANDV_PRED:
26244 case AArch64ISD::EORV_PRED:
26245 case AArch64ISD::FADDA_PRED:
26246 case AArch64ISD::FADDV_PRED:
26247 case AArch64ISD::FMAXNMV_PRED:
26248 case AArch64ISD::FMAXV_PRED:
26249 case AArch64ISD::FMINNMV_PRED:
26250 case AArch64ISD::FMINV_PRED:
26251 case AArch64ISD::ORV_PRED:
26252 case AArch64ISD::SADDV_PRED:
26253 case AArch64ISD::SMAXV_PRED:
26254 case AArch64ISD::SMINV_PRED:
26255 case AArch64ISD::UADDV_PRED:
26256 case AArch64ISD::UMAXV_PRED:
26257 case AArch64ISD::UMINV_PRED:
26258 return true;
26259 }
26260}
26261
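// An illustrative instance of the pattern removed below (node names and types
// are invented for clarity):
//   t1: nxv4i32 = AArch64ISD::UADDV_PRED pg, x   // lanes 1-N known zero
//   t2: i32 = extract_vector_elt t1, Constant:i64<0>
//   t3: nxv4i32 = insert_vector_elt (zero splat), t2, Constant:i64<0>
// Here t3 merely recreates t1, so it can be replaced by t1.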
26262static SDValue removeRedundantInsertVectorElt(SDNode *N) {
26263 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
26264 SDValue InsertVec = N->getOperand(Num: 0);
26265 SDValue InsertElt = N->getOperand(Num: 1);
26266 SDValue InsertIdx = N->getOperand(Num: 2);
26267
26268 // We only care about inserts into the first element...
26269 if (!isNullConstant(V: InsertIdx))
26270 return SDValue();
26271 // ...of a zero'd vector...
26272 if (!ISD::isConstantSplatVectorAllZeros(N: InsertVec.getNode()))
26273 return SDValue();
26274 // ...where the inserted data was previously extracted...
26275 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
26276 return SDValue();
26277
26278 SDValue ExtractVec = InsertElt.getOperand(i: 0);
26279 SDValue ExtractIdx = InsertElt.getOperand(i: 1);
26280
26281 // ...from the first element of a vector.
26282 if (!isNullConstant(V: ExtractIdx))
26283 return SDValue();
26284
26285 // If we get here we are effectively trying to zero lanes 1-N of a vector.
26286
26287 // Ensure there's no type conversion going on.
26288 if (N->getValueType(ResNo: 0) != ExtractVec.getValueType())
26289 return SDValue();
26290
26291 if (!isLanes1toNKnownZero(Op: ExtractVec))
26292 return SDValue();
26293
26294 // The explicit zeroing is redundant.
26295 return ExtractVec;
26296}
26297
26298static SDValue
26299performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
26300 if (SDValue Res = removeRedundantInsertVectorElt(N))
26301 return Res;
26302
26303 return performPostLD1Combine(N, DCI, IsLaneOp: true);
26304}
26305
26306static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
26307 TargetLowering::DAGCombinerInfo &DCI,
26308 const AArch64Subtarget *Subtarget) {
26309 SDValue N0 = N->getOperand(Num: 0);
26310 EVT VT = N->getValueType(ResNo: 0);
26311
  // If our only user is an fp_round, don't fold this fp_extend; allow the
  // fp_round(fp_extend) pair to be folded instead.
26313 if (N->hasOneUse() && N->user_begin()->getOpcode() == ISD::FP_ROUND)
26314 return SDValue();
26315
26316 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
26317 EVT EltVT = VT.getVectorElementType();
26318 return EltVT == MVT::f32 || EltVT == MVT::f64;
26319 };
26320
26321 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
26322 // We purposefully don't care about legality of the nodes here as we know
26323 // they can be split down into something legal.
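  // For example (illustrative types only): a fixed-length v8f32 load feeding an
  // fp_extend to v8f64 becomes a single f64-element extending load, while other
  // users of the original f32 value get an fp_round of the extending load.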
26324 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N: N0.getNode()) &&
26325 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
26326 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
26327 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
26328 LoadSDNode *LN0 = cast<LoadSDNode>(Val&: N0);
26329 SDValue ExtLoad = DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl: SDLoc(N), VT,
26330 Chain: LN0->getChain(), Ptr: LN0->getBasePtr(),
26331 MemVT: N0.getValueType(), MMO: LN0->getMemOperand());
26332 DCI.CombineTo(N, Res: ExtLoad);
26333 DCI.CombineTo(
26334 N: N0.getNode(),
26335 Res0: DAG.getNode(Opcode: ISD::FP_ROUND, DL: SDLoc(N0), VT: N0.getValueType(), N1: ExtLoad,
26336 N2: DAG.getIntPtrConstant(Val: 1, DL: SDLoc(N0), /*isTarget=*/true)),
26337 Res1: ExtLoad.getValue(R: 1));
26338 return SDValue(N, 0); // Return N so it doesn't get rechecked!
26339 }
26340
26341 return SDValue();
26342}
26343
26344static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
26345 const AArch64Subtarget *Subtarget) {
26346 EVT VT = N->getValueType(ResNo: 0);
26347
26348 // Don't expand for NEON, SVE2 or SME
26349 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
26350 return SDValue();
26351
26352 SDLoc DL(N);
26353
26354 SDValue Mask = N->getOperand(Num: 0);
26355 SDValue In1 = N->getOperand(Num: 1);
26356 SDValue In2 = N->getOperand(Num: 2);
26357
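  // Expand the bit select as (Mask & In1) | (~Mask & In2); e.g. for nxv16i8
  // operands this becomes a NOT, two ANDs and an OR (types illustrative).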
26358 SDValue InvMask = DAG.getNOT(DL, Val: Mask, VT);
26359 SDValue Sel = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Mask, N2: In1);
26360 SDValue SelInv = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: InvMask, N2: In2);
26361 return DAG.getNode(Opcode: ISD::OR, DL, VT, N1: Sel, N2: SelInv);
26362}
26363
26364static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
26365 EVT VT = N->getValueType(ResNo: 0);
26366
26367 SDValue Insert = N->getOperand(Num: 0);
26368 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
26369 return SDValue();
26370
26371 if (!Insert.getOperand(i: 0).isUndef())
26372 return SDValue();
26373
26374 uint64_t IdxInsert = Insert.getConstantOperandVal(i: 2);
26375 uint64_t IdxDupLane = N->getConstantOperandVal(Num: 1);
26376 if (IdxInsert != 0 || IdxDupLane != 0)
26377 return SDValue();
26378
26379 SDValue Bitcast = Insert.getOperand(i: 1);
26380 if (Bitcast.getOpcode() != ISD::BITCAST)
26381 return SDValue();
26382
26383 SDValue Subvec = Bitcast.getOperand(i: 0);
26384 EVT SubvecVT = Subvec.getValueType();
26385 if (!SubvecVT.is128BitVector())
26386 return SDValue();
26387 EVT NewSubvecVT =
26388 getPackedSVEVectorVT(VT: Subvec.getValueType().getVectorElementType());
26389
26390 SDLoc DL(N);
26391 SDValue NewInsert =
26392 DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: NewSubvecVT,
26393 N1: DAG.getUNDEF(VT: NewSubvecVT), N2: Subvec, N3: Insert->getOperand(Num: 2));
26394 SDValue NewDuplane128 = DAG.getNode(Opcode: AArch64ISD::DUPLANE128, DL, VT: NewSubvecVT,
26395 N1: NewInsert, N2: N->getOperand(Num: 1));
26396 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewDuplane128);
26397}
26398
26399// Try to combine mull with uzp1.
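// Roughly (an illustrative sketch): when a pair of S/UMULLs is fed by two
// separate truncates (one for the low mull, one for the high mull), both
// truncations can be done by a single UZP1 of the wide operands, with the two
// truncated halves recovered via extract_subvector.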
26400static SDValue tryCombineMULLWithUZP1(SDNode *N,
26401 TargetLowering::DAGCombinerInfo &DCI,
26402 SelectionDAG &DAG) {
26403 if (DCI.isBeforeLegalizeOps())
26404 return SDValue();
26405
26406 SDValue LHS = N->getOperand(Num: 0);
26407 SDValue RHS = N->getOperand(Num: 1);
26408
26409 SDValue ExtractHigh;
26410 SDValue ExtractLow;
26411 SDValue TruncHigh;
26412 SDValue TruncLow;
26413 SDLoc DL(N);
26414
26415 // Check the operands are trunc and extract_high.
26416 if (isEssentiallyExtractHighSubvector(N: LHS) &&
26417 RHS.getOpcode() == ISD::TRUNCATE) {
26418 TruncHigh = RHS;
26419 if (LHS.getOpcode() == ISD::BITCAST)
26420 ExtractHigh = LHS.getOperand(i: 0);
26421 else
26422 ExtractHigh = LHS;
26423 } else if (isEssentiallyExtractHighSubvector(N: RHS) &&
26424 LHS.getOpcode() == ISD::TRUNCATE) {
26425 TruncHigh = LHS;
26426 if (RHS.getOpcode() == ISD::BITCAST)
26427 ExtractHigh = RHS.getOperand(i: 0);
26428 else
26429 ExtractHigh = RHS;
26430 } else
26431 return SDValue();
26432
  // If the truncate's operand is a splat (DUP or an equivalent BUILD_VECTOR),
  // do not combine the op with uzp1; doing so regresses
  // test/CodeGen/AArch64/aarch64-smull.ll.
26436 SDValue TruncHighOp = TruncHigh.getOperand(i: 0);
26437 EVT TruncHighOpVT = TruncHighOp.getValueType();
26438 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
26439 DAG.isSplatValue(V: TruncHighOp, AllowUndefs: false))
26440 return SDValue();
26441
  // Check there is another extract_high with the same source vector.
26443 // For example,
26444 //
26445 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
26446 // t12: v4i16 = truncate t11
26447 // t31: v4i32 = AArch64ISD::SMULL t18, t12
26448 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
26449 // t16: v4i16 = truncate t15
  // t30: v4i32 = AArch64ISD::SMULL t23, t16
26451 //
  // This dagcombine assumes the two extract_high nodes use the same source
  // vector in order to detect the mull pair. If they use different source
  // vectors, this code will not work.
26455 // TODO: Should also try to look through a bitcast.
26456 bool HasFoundMULLow = true;
26457 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(i: 0);
26458 if (ExtractHighSrcVec->use_size() != 2)
26459 HasFoundMULLow = false;
26460
26461 // Find ExtractLow.
26462 for (SDNode *User : ExtractHighSrcVec.getNode()->users()) {
26463 if (User == ExtractHigh.getNode())
26464 continue;
26465
26466 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
26467 !isNullConstant(V: User->getOperand(Num: 1))) {
26468 HasFoundMULLow = false;
26469 break;
26470 }
26471
26472 ExtractLow.setNode(User);
26473 }
26474
26475 if (!ExtractLow || !ExtractLow->hasOneUse())
26476 HasFoundMULLow = false;
26477
26478 // Check ExtractLow's user.
26479 if (HasFoundMULLow) {
26480 SDNode *ExtractLowUser = *ExtractLow.getNode()->user_begin();
26481 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
26482 HasFoundMULLow = false;
26483 } else {
26484 if (ExtractLowUser->getOperand(Num: 0) == ExtractLow) {
26485 if (ExtractLowUser->getOperand(Num: 1).getOpcode() == ISD::TRUNCATE)
26486 TruncLow = ExtractLowUser->getOperand(Num: 1);
26487 else
26488 HasFoundMULLow = false;
26489 } else {
26490 if (ExtractLowUser->getOperand(Num: 0).getOpcode() == ISD::TRUNCATE)
26491 TruncLow = ExtractLowUser->getOperand(Num: 0);
26492 else
26493 HasFoundMULLow = false;
26494 }
26495 }
26496 }
26497
  // If the truncate's operand is a splat (DUP or an equivalent BUILD_VECTOR),
  // do not combine the op with uzp1; doing so regresses
  // test/CodeGen/AArch64/aarch64-smull.ll.
26501 EVT TruncHighVT = TruncHigh.getValueType();
26502 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
26503 SDValue TruncLowOp =
26504 HasFoundMULLow ? TruncLow.getOperand(i: 0) : DAG.getUNDEF(VT: UZP1VT);
26505 EVT TruncLowOpVT = TruncLowOp.getValueType();
26506 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
26507 DAG.isSplatValue(V: TruncLowOp, AllowUndefs: false)))
26508 return SDValue();
26509
26510 // Create uzp1, extract_high and extract_low.
26511 if (TruncHighOpVT != UZP1VT)
26512 TruncHighOp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: UZP1VT, Operand: TruncHighOp);
26513 if (TruncLowOpVT != UZP1VT)
26514 TruncLowOp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: UZP1VT, Operand: TruncLowOp);
26515
26516 SDValue UZP1 =
26517 DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: UZP1VT, N1: TruncLowOp, N2: TruncHighOp);
26518 SDValue HighIdxCst =
26519 DAG.getConstant(Val: TruncHighVT.getVectorNumElements(), DL, VT: MVT::i64);
26520 SDValue NewTruncHigh =
26521 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: TruncHighVT, N1: UZP1, N2: HighIdxCst);
26522 DAG.ReplaceAllUsesWith(From: TruncHigh, To: NewTruncHigh);
26523
26524 if (HasFoundMULLow) {
26525 EVT TruncLowVT = TruncLow.getValueType();
26526 SDValue NewTruncLow = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: TruncLowVT,
26527 N1: UZP1, N2: ExtractLow.getOperand(i: 1));
26528 DAG.ReplaceAllUsesWith(From: TruncLow, To: NewTruncLow);
26529 }
26530
26531 return SDValue(N, 0);
26532}
26533
26534static SDValue performMULLCombine(SDNode *N,
26535 TargetLowering::DAGCombinerInfo &DCI,
26536 SelectionDAG &DAG) {
26537 if (SDValue Val =
26538 tryCombineLongOpWithDup(IID: Intrinsic::not_intrinsic, N, DCI, DAG))
26539 return Val;
26540
26541 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
26542 return Val;
26543
26544 return SDValue();
26545}
26546
26547static SDValue
26548performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
26549 SelectionDAG &DAG) {
  // Let's do the transform below.
26551 //
26552 // t34: v4i32 = AArch64ISD::UADDLV t2
26553 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
26554 // t7: i64 = zero_extend t35
26555 // t20: v1i64 = scalar_to_vector t7
26556 // ==>
26557 // t34: v4i32 = AArch64ISD::UADDLV t2
26558 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
26559 // t40: v1i64 = AArch64ISD::NVCAST t39
26560 if (DCI.isBeforeLegalizeOps())
26561 return SDValue();
26562
26563 EVT VT = N->getValueType(ResNo: 0);
26564 if (VT != MVT::v1i64)
26565 return SDValue();
26566
26567 SDValue ZEXT = N->getOperand(Num: 0);
26568 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
26569 return SDValue();
26570
26571 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(i: 0);
26572 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
26573 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
26574 return SDValue();
26575
26576 if (!isNullConstant(V: EXTRACT_VEC_ELT.getOperand(i: 1)))
26577 return SDValue();
26578
26579 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(i: 0);
26580 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
26581 UADDLV.getValueType() != MVT::v4i32 ||
26582 UADDLV.getOperand(i: 0).getValueType() != MVT::v8i8)
26583 return SDValue();
26584
  // Let's generate the new sequence with AArch64ISD::NVCAST.
26586 SDLoc DL(N);
26587 SDValue EXTRACT_SUBVEC =
26588 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v2i32, N1: UADDLV,
26589 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
26590 SDValue NVCAST =
26591 DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: MVT::v1i64, Operand: EXTRACT_SUBVEC);
26592
26593 return NVCAST;
26594}
26595
/// If the shifted value is a bitwise AND with a constant RHS, the shift amount
/// is also a constant, and the shift is the AND's only user, we can pull the
/// AND out of the shift, i.e.
26598///
26599/// (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2))
26600///
26601/// We prefer this canonical form to match existing isel patterns.
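/// For example (illustrative constants): (shl (and X, 0xff), 8) becomes
/// (and (shl X, 8), 0xff00).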
26602static SDValue performSHLCombine(SDNode *N,
26603 TargetLowering::DAGCombinerInfo &DCI,
26604 SelectionDAG &DAG) {
26605 if (DCI.isBeforeLegalizeOps())
26606 return SDValue();
26607
26608 SDValue Op0 = N->getOperand(Num: 0);
26609 if (Op0.getOpcode() != ISD::AND || !Op0.hasOneUse())
26610 return SDValue();
26611
26612 SDValue C1 = Op0->getOperand(Num: 1);
26613 SDValue C2 = N->getOperand(Num: 1);
26614 if (!isa<ConstantSDNode>(Val: C1) || !isa<ConstantSDNode>(Val: C2))
26615 return SDValue();
26616
  // The shift might be folded into the user as a shifted operand, so do not
  // lower it here.
26618 if (N->hasOneUse()) {
26619 unsigned UseOpc = N->user_begin()->getOpcode();
26620 if (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC ||
26621 UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS)
26622 return SDValue();
26623 }
26624
26625 SDLoc DL(N);
26626 EVT VT = N->getValueType(ResNo: 0);
26627
26628 // Don't combine unless (shl C1, C2) can be constant folded. Otherwise,
26629 // DAGCombiner will simplify (and (op x...), (op y...)) -> (op (and x, y))
  // causing an infinite loop. The result may also be worse.
26631 SDValue NewRHS = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: C1, N2: C2);
26632 if (!isa<ConstantSDNode>(Val: NewRHS))
26633 return SDValue();
26634
26635 SDValue X = Op0->getOperand(Num: 0);
26636 SDValue NewShift = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: X, N2: C2);
26637 return DAG.getNode(Opcode: ISD::AND, DL, VT, N1: NewShift, N2: NewRHS);
26638}
26639
26640SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
26641 DAGCombinerInfo &DCI) const {
26642 SelectionDAG &DAG = DCI.DAG;
26643 switch (N->getOpcode()) {
26644 default:
26645 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
26646 break;
26647 case ISD::VECREDUCE_AND:
26648 case ISD::VECREDUCE_OR:
26649 case ISD::VECREDUCE_XOR:
26650 return performVecReduceBitwiseCombine(N, DCI, DAG);
26651 case ISD::ADD:
26652 case ISD::SUB:
26653 return performAddSubCombine(N, DCI);
26654 case ISD::BUILD_VECTOR:
26655 return performBuildVectorCombine(N, DCI, DAG);
26656 case ISD::TRUNCATE:
26657 return performTruncateCombine(N, DAG, DCI);
26658 case AArch64ISD::ANDS:
26659 return performFlagSettingCombine(N, DCI, GenericOpcode: ISD::AND);
26660 case AArch64ISD::ADC:
26661 if (auto R = foldOverflowCheck(Op: N, DAG, /* IsAdd */ true))
26662 return R;
26663 return foldADCToCINC(N, DAG);
26664 case AArch64ISD::SBC:
26665 return foldOverflowCheck(Op: N, DAG, /* IsAdd */ false);
26666 case AArch64ISD::ADCS:
26667 if (auto R = foldOverflowCheck(Op: N, DAG, /* IsAdd */ true))
26668 return R;
26669 return performFlagSettingCombine(N, DCI, GenericOpcode: AArch64ISD::ADC);
26670 case AArch64ISD::SBCS:
26671 if (auto R = foldOverflowCheck(Op: N, DAG, /* IsAdd */ false))
26672 return R;
26673 return performFlagSettingCombine(N, DCI, GenericOpcode: AArch64ISD::SBC);
26674 case AArch64ISD::BICi: {
26675 APInt DemandedBits =
26676 APInt::getAllOnes(numBits: N->getValueType(ResNo: 0).getScalarSizeInBits());
26677 APInt DemandedElts =
26678 APInt::getAllOnes(numBits: N->getValueType(ResNo: 0).getVectorNumElements());
26679
26680 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(
26681 Op: SDValue(N, 0), DemandedBits, DemandedElts, DCI))
26682 return SDValue();
26683
26684 break;
26685 }
26686 case ISD::XOR:
26687 return performXorCombine(N, DAG, DCI, Subtarget);
26688 case ISD::MUL:
26689 return performMulCombine(N, DAG, DCI, Subtarget);
26690 case ISD::SINT_TO_FP:
26691 case ISD::UINT_TO_FP:
26692 return performIntToFpCombine(N, DAG, DCI, Subtarget);
26693 case ISD::FP_TO_SINT:
26694 case ISD::FP_TO_UINT:
26695 case ISD::FP_TO_SINT_SAT:
26696 case ISD::FP_TO_UINT_SAT:
26697 return performFpToIntCombine(N, DAG, DCI, Subtarget);
26698 case ISD::OR:
26699 return performORCombine(N, DCI, Subtarget, TLI: *this);
26700 case ISD::AND:
26701 return performANDCombine(N, DCI);
26702 case ISD::FADD:
26703 return performFADDCombine(N, DCI);
26704 case ISD::INTRINSIC_WO_CHAIN:
26705 return performIntrinsicCombine(N, DCI, Subtarget);
26706 case ISD::ANY_EXTEND:
26707 case ISD::ZERO_EXTEND:
26708 case ISD::SIGN_EXTEND:
26709 return performExtendCombine(N, DCI, DAG);
26710 case ISD::SIGN_EXTEND_INREG:
26711 return performSignExtendInRegCombine(N, DCI, DAG);
26712 case ISD::CONCAT_VECTORS:
26713 return performConcatVectorsCombine(N, DCI, DAG);
26714 case ISD::EXTRACT_SUBVECTOR:
26715 return performExtractSubvectorCombine(N, DCI, DAG);
26716 case ISD::INSERT_SUBVECTOR:
26717 return performInsertSubvectorCombine(N, DCI, DAG);
26718 case ISD::SELECT:
26719 return performSelectCombine(N, DCI);
26720 case ISD::VSELECT:
26721 return performVSelectCombine(N, DAG&: DCI.DAG);
26722 case ISD::SETCC:
26723 return performSETCCCombine(N, DCI, DAG);
26724 case ISD::LOAD:
26725 return performLOADCombine(N, DCI, DAG, Subtarget);
26726 case ISD::STORE:
26727 return performSTORECombine(N, DCI, DAG, Subtarget);
26728 case ISD::MSTORE:
26729 return performMSTORECombine(N, DCI, DAG, Subtarget);
26730 case ISD::MGATHER:
26731 case ISD::MSCATTER:
26732 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
26733 return performMaskedGatherScatterCombine(N, DCI, DAG);
26734 case ISD::FP_EXTEND:
26735 return performFPExtendCombine(N, DAG, DCI, Subtarget);
26736 case AArch64ISD::BRCOND:
26737 return performBRCONDCombine(N, DCI, DAG);
26738 case AArch64ISD::TBNZ:
26739 case AArch64ISD::TBZ:
26740 return performTBZCombine(N, DCI, DAG);
26741 case AArch64ISD::CSEL:
26742 return performCSELCombine(N, DCI, DAG);
26743 case AArch64ISD::DUP:
26744 case AArch64ISD::DUPLANE8:
26745 case AArch64ISD::DUPLANE16:
26746 case AArch64ISD::DUPLANE32:
26747 case AArch64ISD::DUPLANE64:
26748 return performDUPCombine(N, DCI);
26749 case AArch64ISD::DUPLANE128:
26750 return performDupLane128Combine(N, DAG);
26751 case AArch64ISD::NVCAST:
26752 return performNVCASTCombine(N, DAG);
26753 case AArch64ISD::SPLICE:
26754 return performSpliceCombine(N, DAG);
26755 case AArch64ISD::UUNPKLO:
26756 case AArch64ISD::UUNPKHI:
26757 return performUnpackCombine(N, DAG, Subtarget);
26758 case AArch64ISD::UZP1:
26759 case AArch64ISD::UZP2:
26760 return performUzpCombine(N, DAG, Subtarget);
26761 case AArch64ISD::SETCC_MERGE_ZERO:
26762 return performSetccMergeZeroCombine(N, DCI);
26763 case AArch64ISD::REINTERPRET_CAST:
26764 return performReinterpretCastCombine(N);
26765 case AArch64ISD::GLD1_MERGE_ZERO:
26766 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
26767 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
26768 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
26769 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
26770 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
26771 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
26772 case AArch64ISD::GLD1S_MERGE_ZERO:
26773 case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
26774 case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
26775 case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
26776 case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
26777 case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
26778 case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
26779 return performGLD1Combine(N, DAG);
26780 case AArch64ISD::VASHR:
26781 case AArch64ISD::VLSHR:
26782 return performVectorShiftCombine(N, TLI: *this, DCI);
26783 case AArch64ISD::SUNPKLO:
26784 return performSunpkloCombine(N, DAG);
26785 case AArch64ISD::BSP:
26786 return performBSPExpandForSVE(N, DAG, Subtarget);
26787 case ISD::INSERT_VECTOR_ELT:
26788 return performInsertVectorEltCombine(N, DCI);
26789 case ISD::EXTRACT_VECTOR_ELT:
26790 return performExtractVectorEltCombine(N, DCI, Subtarget);
26791 case ISD::VECREDUCE_ADD:
26792 return performVecReduceAddCombine(N, DAG&: DCI.DAG, ST: Subtarget);
26793 case ISD::GET_ACTIVE_LANE_MASK:
26794 return performActiveLaneMaskCombine(N, DCI, ST: Subtarget);
26795 case AArch64ISD::UADDV:
26796 return performUADDVCombine(N, DAG);
26797 case AArch64ISD::SMULL:
26798 case AArch64ISD::UMULL:
26799 case AArch64ISD::PMULL:
26800 return performMULLCombine(N, DCI, DAG);
26801 case ISD::INTRINSIC_VOID:
26802 case ISD::INTRINSIC_W_CHAIN:
26803 switch (N->getConstantOperandVal(Num: 1)) {
26804 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
26805 return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 1 /*=ScalarSizeInBytes*/);
26806 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
26807 return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 2 /*=ScalarSizeInBytes*/);
26808 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
26809 return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 4 /*=ScalarSizeInBytes*/);
26810 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
26811 return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 8 /*=ScalarSizeInBytes*/);
26812 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
26813 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
26814 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
26815 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
26816 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
26817 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
26818 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
26819 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
26820 return legalizeSVEGatherPrefetchOffsVec(N, DAG);
26821 case Intrinsic::aarch64_neon_ld2:
26822 case Intrinsic::aarch64_neon_ld3:
26823 case Intrinsic::aarch64_neon_ld4:
26824 case Intrinsic::aarch64_neon_ld1x2:
26825 case Intrinsic::aarch64_neon_ld1x3:
26826 case Intrinsic::aarch64_neon_ld1x4:
26827 case Intrinsic::aarch64_neon_ld2lane:
26828 case Intrinsic::aarch64_neon_ld3lane:
26829 case Intrinsic::aarch64_neon_ld4lane:
26830 case Intrinsic::aarch64_neon_ld2r:
26831 case Intrinsic::aarch64_neon_ld3r:
26832 case Intrinsic::aarch64_neon_ld4r:
26833 case Intrinsic::aarch64_neon_st2:
26834 case Intrinsic::aarch64_neon_st3:
26835 case Intrinsic::aarch64_neon_st4:
26836 case Intrinsic::aarch64_neon_st1x2:
26837 case Intrinsic::aarch64_neon_st1x3:
26838 case Intrinsic::aarch64_neon_st1x4:
26839 case Intrinsic::aarch64_neon_st2lane:
26840 case Intrinsic::aarch64_neon_st3lane:
26841 case Intrinsic::aarch64_neon_st4lane:
26842 return performNEONPostLDSTCombine(N, DCI, DAG);
26843 case Intrinsic::aarch64_sve_ldnt1:
26844 return performLDNT1Combine(N, DAG);
26845 case Intrinsic::aarch64_sve_ld1rq:
26846 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
26847 case Intrinsic::aarch64_sve_ld1ro:
26848 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
26849 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
26850 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDNT1_MERGE_ZERO);
26851 case Intrinsic::aarch64_sve_ldnt1_gather:
26852 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDNT1_MERGE_ZERO);
26853 case Intrinsic::aarch64_sve_ldnt1_gather_index:
26854 return performGatherLoadCombine(N, DAG,
26855 Opcode: AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
26856 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
26857 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDNT1_MERGE_ZERO);
26858 case Intrinsic::aarch64_sve_ld1:
26859 return performLD1Combine(N, DAG, Opc: AArch64ISD::LD1_MERGE_ZERO);
26860 case Intrinsic::aarch64_sve_ldnf1:
26861 return performLD1Combine(N, DAG, Opc: AArch64ISD::LDNF1_MERGE_ZERO);
26862 case Intrinsic::aarch64_sve_ldff1:
26863 return performLD1Combine(N, DAG, Opc: AArch64ISD::LDFF1_MERGE_ZERO);
26864 case Intrinsic::aarch64_sve_st1:
26865 return performST1Combine(N, DAG);
26866 case Intrinsic::aarch64_sve_stnt1:
26867 return performSTNT1Combine(N, DAG);
26868 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
26869 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_PRED);
26870 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
26871 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_PRED);
26872 case Intrinsic::aarch64_sve_stnt1_scatter:
26873 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_PRED);
26874 case Intrinsic::aarch64_sve_stnt1_scatter_index:
26875 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_INDEX_PRED);
26876 case Intrinsic::aarch64_sve_ld1_gather:
26877 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_MERGE_ZERO);
26878 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
26879 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
26880 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1Q_MERGE_ZERO);
26881 case Intrinsic::aarch64_sve_ld1q_gather_index:
26882 return performGatherLoadCombine(N, DAG,
26883 Opcode: AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
26884 case Intrinsic::aarch64_sve_ld1_gather_index:
26885 return performGatherLoadCombine(N, DAG,
26886 Opcode: AArch64ISD::GLD1_SCALED_MERGE_ZERO);
26887 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
26888 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_SXTW_MERGE_ZERO,
26889 /*OnlyPackedOffsets=*/false);
26890 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
26891 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_UXTW_MERGE_ZERO,
26892 /*OnlyPackedOffsets=*/false);
26893 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
26894 return performGatherLoadCombine(N, DAG,
26895 Opcode: AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
26896 /*OnlyPackedOffsets=*/false);
26897 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
26898 return performGatherLoadCombine(N, DAG,
26899 Opcode: AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
26900 /*OnlyPackedOffsets=*/false);
26901 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
26902 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_IMM_MERGE_ZERO);
26903 case Intrinsic::aarch64_sve_ldff1_gather:
26904 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDFF1_MERGE_ZERO);
26905 case Intrinsic::aarch64_sve_ldff1_gather_index:
26906 return performGatherLoadCombine(N, DAG,
26907 Opcode: AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
26908 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
26909 return performGatherLoadCombine(N, DAG,
26910 Opcode: AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
26911 /*OnlyPackedOffsets=*/false);
26912 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
26913 return performGatherLoadCombine(N, DAG,
26914 Opcode: AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
26915 /*OnlyPackedOffsets=*/false);
26916 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
26917 return performGatherLoadCombine(N, DAG,
26918 Opcode: AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
26919 /*OnlyPackedOffsets=*/false);
26920 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
26921 return performGatherLoadCombine(N, DAG,
26922 Opcode: AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
26923 /*OnlyPackedOffsets=*/false);
26924 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
26925 return performGatherLoadCombine(N, DAG,
26926 Opcode: AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
26927 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
26928 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
26929 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1Q_PRED);
26930 case Intrinsic::aarch64_sve_st1q_scatter_index:
26931 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1Q_INDEX_PRED);
26932 case Intrinsic::aarch64_sve_st1_scatter:
26933 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_PRED);
26934 case Intrinsic::aarch64_sve_st1_scatter_index:
26935 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_SCALED_PRED);
26936 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
26937 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_SXTW_PRED,
26938 /*OnlyPackedOffsets=*/false);
26939 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
26940 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_UXTW_PRED,
26941 /*OnlyPackedOffsets=*/false);
26942 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
26943 return performScatterStoreCombine(N, DAG,
26944 Opcode: AArch64ISD::SST1_SXTW_SCALED_PRED,
26945 /*OnlyPackedOffsets=*/false);
26946 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
26947 return performScatterStoreCombine(N, DAG,
26948 Opcode: AArch64ISD::SST1_UXTW_SCALED_PRED,
26949 /*OnlyPackedOffsets=*/false);
26950 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
26951 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_IMM_PRED);
26952 case Intrinsic::aarch64_rndr:
26953 case Intrinsic::aarch64_rndrrs: {
26954 unsigned IntrinsicID = N->getConstantOperandVal(Num: 1);
26955 auto Register =
26956 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
26957 : AArch64SysReg::RNDRRS);
26958 SDLoc DL(N);
26959 SDValue A = DAG.getNode(
26960 Opcode: AArch64ISD::MRS, DL, VTList: DAG.getVTList(VT1: MVT::i64, VT2: MVT::i32, VT3: MVT::Other),
26961 N1: N->getOperand(Num: 0), N2: DAG.getConstant(Val: Register, DL, VT: MVT::i32));
26962 SDValue B = DAG.getNode(
26963 Opcode: AArch64ISD::CSINC, DL, VT: MVT::i32, N1: DAG.getConstant(Val: 0, DL, VT: MVT::i32),
26964 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i32),
26965 N3: DAG.getConstant(Val: AArch64CC::NE, DL, VT: MVT::i32), N4: A.getValue(R: 1));
26966 return DAG.getMergeValues(
26967 Ops: {A, DAG.getZExtOrTrunc(Op: B, DL, VT: MVT::i1), A.getValue(R: 2)}, dl: DL);
26968 }
26969 case Intrinsic::aarch64_sme_ldr_zt:
26970 return DAG.getNode(Opcode: AArch64ISD::RESTORE_ZT, DL: SDLoc(N),
26971 VTList: DAG.getVTList(VT: MVT::Other), N1: N->getOperand(Num: 0),
26972 N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
26973 case Intrinsic::aarch64_sme_str_zt:
26974 return DAG.getNode(Opcode: AArch64ISD::SAVE_ZT, DL: SDLoc(N),
26975 VTList: DAG.getVTList(VT: MVT::Other), N1: N->getOperand(Num: 0),
26976 N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
26977 default:
26978 break;
26979 }
26980 break;
26981 case ISD::GlobalAddress:
26982 return performGlobalAddressCombine(N, DAG, Subtarget, TM: getTargetMachine());
26983 case ISD::CTLZ:
26984 return performCTLZCombine(N, DAG, Subtarget);
26985 case ISD::SCALAR_TO_VECTOR:
26986 return performScalarToVectorCombine(N, DCI, DAG);
26987 case ISD::SHL:
26988 return performSHLCombine(N, DCI, DAG);
26989 }
26990 return SDValue();
26991}
26992
// Check if the return value is used only as a return value, as otherwise
26994// we can't perform a tail-call. In particular, we need to check for
26995// target ISD nodes that are returns and any other "odd" constructs
26996// that the generic analysis code won't necessarily catch.
26997bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
26998 SDValue &Chain) const {
26999 if (N->getNumValues() != 1)
27000 return false;
27001 if (!N->hasNUsesOfValue(NUses: 1, Value: 0))
27002 return false;
27003
27004 SDValue TCChain = Chain;
27005 SDNode *Copy = *N->user_begin();
27006 if (Copy->getOpcode() == ISD::CopyToReg) {
27007 // If the copy has a glue operand, we conservatively assume it isn't safe to
27008 // perform a tail call.
27009 if (Copy->getOperand(Num: Copy->getNumOperands() - 1).getValueType() ==
27010 MVT::Glue)
27011 return false;
27012 TCChain = Copy->getOperand(Num: 0);
27013 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
27014 return false;
27015
27016 bool HasRet = false;
27017 for (SDNode *Node : Copy->users()) {
27018 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
27019 return false;
27020 HasRet = true;
27021 }
27022
27023 if (!HasRet)
27024 return false;
27025
27026 Chain = TCChain;
27027 return true;
27028}
27029
// Return whether an instruction can potentially be optimized to a tail
27031// call. This will cause the optimizers to attempt to move, or duplicate,
27032// return instructions to help enable tail call optimizations for this
27033// instruction.
27034bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
27035 return CI->isTailCall();
27036}
27037
27038bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
27039 Register Offset, bool IsPre,
27040 MachineRegisterInfo &MRI) const {
27041 auto CstOffset = getIConstantVRegVal(VReg: Offset, MRI);
27042 if (!CstOffset || CstOffset->isZero())
27043 return false;
27044
27045 // All of the indexed addressing mode instructions take a signed 9 bit
27046 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
27047 // encodes the sign/indexing direction.
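  // For example, offsets in [-256, 255] are accepted, whereas 256 or -257 are
  // rejected (illustrative values).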
27048 return isInt<9>(x: CstOffset->getSExtValue());
27049}
27050
27051bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
27052 SDValue &Base,
27053 SDValue &Offset,
27054 SelectionDAG &DAG) const {
27055 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
27056 return false;
27057
27058 // Non-null if there is exactly one user of the loaded value (ignoring chain).
27059 SDNode *ValOnlyUser = nullptr;
27060 for (SDUse &U : N->uses()) {
27061 if (U.getResNo() == 1)
27062 continue; // Ignore chain.
27063 if (ValOnlyUser == nullptr)
27064 ValOnlyUser = U.getUser();
27065 else {
27066 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
27067 break;
27068 }
27069 }
27070
27071 auto IsUndefOrZero = [](SDValue V) {
27072 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
27073 };
27074
27075 // If the only user of the value is a scalable vector splat, it is
27076 // preferable to do a replicating load (ld1r*).
27077 if (ValOnlyUser && ValOnlyUser->getValueType(ResNo: 0).isScalableVector() &&
27078 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
27079 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
27080 IsUndefOrZero(ValOnlyUser->getOperand(Num: 2)))))
27081 return false;
27082
27083 Base = Op->getOperand(Num: 0);
27084 // All of the indexed addressing mode instructions take a signed
27085 // 9 bit immediate offset.
27086 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1))) {
27087 int64_t RHSC = RHS->getSExtValue();
27088 if (Op->getOpcode() == ISD::SUB)
27089 RHSC = -(uint64_t)RHSC;
27090 if (!isInt<9>(x: RHSC))
27091 return false;
    // When big-endian VLD1/VST1 are used for a vector load or store, they only
    // allow an offset that's equal to the store size.
27094 EVT MemType = cast<MemSDNode>(Val: N)->getMemoryVT();
27095 if (!Subtarget->isLittleEndian() && MemType.isVector() &&
27096 (uint64_t)RHSC != MemType.getStoreSize())
27097 return false;
27098 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
27099 // when dealing with subtraction.
27100 Offset = DAG.getConstant(Val: RHSC, DL: SDLoc(N), VT: RHS->getValueType(ResNo: 0));
27101 return true;
27102 }
27103 return false;
27104}
27105
27106bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
27107 SDValue &Offset,
27108 ISD::MemIndexedMode &AM,
27109 SelectionDAG &DAG) const {
27110 EVT VT;
27111 SDValue Ptr;
27112 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N)) {
27113 VT = LD->getMemoryVT();
27114 Ptr = LD->getBasePtr();
27115 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Val: N)) {
27116 VT = ST->getMemoryVT();
27117 Ptr = ST->getBasePtr();
27118 } else
27119 return false;
27120
27121 if (!getIndexedAddressParts(N, Op: Ptr.getNode(), Base, Offset, DAG))
27122 return false;
27123 AM = ISD::PRE_INC;
27124 return true;
27125}
27126
27127bool AArch64TargetLowering::getPostIndexedAddressParts(
27128 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
27129 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
27130 EVT VT;
27131 SDValue Ptr;
27132 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N)) {
27133 VT = LD->getMemoryVT();
27134 Ptr = LD->getBasePtr();
27135 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Val: N)) {
27136 VT = ST->getMemoryVT();
27137 Ptr = ST->getBasePtr();
27138 } else
27139 return false;
27140
27141 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
27142 return false;
27143 // Post-indexing updates the base, so it's not a valid transform
27144 // if that's not the same as the load's pointer.
27145 if (Ptr != Base)
27146 return false;
27147 AM = ISD::POST_INC;
27148 return true;
27149}
27150
27151static void replaceBoolVectorBitcast(SDNode *N,
27152 SmallVectorImpl<SDValue> &Results,
27153 SelectionDAG &DAG) {
27154 SDLoc DL(N);
27155 SDValue Op = N->getOperand(Num: 0);
27156 EVT VT = N->getValueType(ResNo: 0);
27157 [[maybe_unused]] EVT SrcVT = Op.getValueType();
27158 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
27159 "Must be bool vector.");
27160
27161 // Special handling for Clang's __builtin_convertvector. For vectors with <8
27162 // elements, it adds a vector concatenation with undef(s). If we encounter
27163 // this here, we can skip the concat.
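  // For example (illustrative): (concat_vectors x:v4i1, undef:v4i1) is treated
  // as just x when building the scalar bitmask.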
27164 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(i: 0).isUndef()) {
27165 bool AllUndef = true;
27166 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
27167 AllUndef &= Op.getOperand(i: I).isUndef();
27168
27169 if (AllUndef)
27170 Op = Op.getOperand(i: 0);
27171 }
27172
27173 SDValue VectorBits = vectorToScalarBitmask(N: Op.getNode(), DAG);
27174 if (VectorBits)
27175 Results.push_back(Elt: DAG.getZExtOrTrunc(Op: VectorBits, DL, VT));
27176}
27177
27178static void CustomNonLegalBITCASTResults(SDNode *N,
27179 SmallVectorImpl<SDValue> &Results,
27180 SelectionDAG &DAG, EVT ExtendVT,
27181 EVT CastVT) {
27182 SDLoc DL(N);
27183 SDValue Op = N->getOperand(Num: 0);
27184 EVT VT = N->getValueType(ResNo: 0);
27185
27186 // Use SCALAR_TO_VECTOR for lane zero
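  // For example, for the i32 -> v2i16 case handled by the caller (a sketch):
  //   v2i32 = scalar_to_vector i32 %src
  //   v4i16 = bitcast v2i32
  //   v2i16 = extract_subvector v4i16, Constant:i64<0>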
27187 SDValue Vec = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: ExtendVT, Operand: Op);
27188 SDValue CastVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: CastVT, Operand: Vec);
27189 SDValue IdxZero = DAG.getVectorIdxConstant(Val: 0, DL);
27190 Results.push_back(
27191 Elt: DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: CastVal, N2: IdxZero));
27192}
27193
27194void AArch64TargetLowering::ReplaceBITCASTResults(
27195 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
27196 SDLoc DL(N);
27197 SDValue Op = N->getOperand(Num: 0);
27198 EVT VT = N->getValueType(ResNo: 0);
27199 EVT SrcVT = Op.getValueType();
27200
27201 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
27202 CustomNonLegalBITCASTResults(N, Results, DAG, ExtendVT: MVT::v2i32, CastVT: MVT::v4i16);
27203 return;
27204 }
27205
27206 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
27207 CustomNonLegalBITCASTResults(N, Results, DAG, ExtendVT: MVT::v2i32, CastVT: MVT::v8i8);
27208 return;
27209 }
27210
27211 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
27212 CustomNonLegalBITCASTResults(N, Results, DAG, ExtendVT: MVT::v4i16, CastVT: MVT::v8i8);
27213 return;
27214 }
27215
27216 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(VT: SrcVT)) {
27217 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
27218 "Expected fp->int bitcast!");
27219
27220 // Bitcasting between unpacked vector types of different element counts is
27221 // not a NOP because the live elements are laid out differently.
27222 // 01234567
27223 // e.g. nxv2i32 = XX??XX??
27224 // nxv4f16 = X?X?X?X?
27225 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
27226 return;
27227
27228 SDValue CastResult = getSVESafeBitCast(VT: getSVEContainerType(ContentTy: VT), Op, DAG);
27229 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: CastResult));
27230 return;
27231 }
27232
27233 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
27234 !VT.isVector())
27235 return replaceBoolVectorBitcast(N, Results, DAG);
27236
27237 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
27238 return;
27239
27240 Op = DAG.getTargetInsertSubreg(SRIdx: AArch64::hsub, DL, VT: MVT::f32,
27241 Operand: DAG.getUNDEF(VT: MVT::i32), Subreg: Op);
27242 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Op);
27243 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Op));
27244}
27245
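// An illustrative v8f32 instance of the pattern handled below (FP cases also
// require the reassoc flag):
//   t1 = vector_shuffle t0, undef, <1,0,3,2,5,4,7,6>
//   t2 = fadd t0, t1
// Every lane of t2 holds the sum of an adjacent pair of t0's elements, so t2
// can be rebuilt from ADDP on the two halves of t0 followed by a shuffle that
// duplicates each pairwise sum into both lanes of its pair.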
27246static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
27247 SelectionDAG &DAG,
27248 const AArch64Subtarget *Subtarget) {
27249 EVT VT = N->getValueType(ResNo: 0);
27250 if (!VT.is256BitVector() ||
27251 (VT.getScalarType().isFloatingPoint() &&
27252 !N->getFlags().hasAllowReassociation()) ||
27253 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
27254 VT.getScalarType() == MVT::bf16)
27255 return;
27256
27257 SDValue X = N->getOperand(Num: 0);
27258 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: 1));
27259 if (!Shuf) {
27260 Shuf = dyn_cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: 0));
27261 X = N->getOperand(Num: 1);
27262 if (!Shuf)
27263 return;
27264 }
27265
27266 if (Shuf->getOperand(Num: 0) != X || !Shuf->getOperand(Num: 1)->isUndef())
27267 return;
27268
27269 // Check the mask is 1,0,3,2,5,4,...
27270 ArrayRef<int> Mask = Shuf->getMask();
27271 for (int I = 0, E = Mask.size(); I < E; I++)
27272 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
27273 return;
27274
27275 SDLoc DL(N);
27276 auto LoHi = DAG.SplitVector(N: X, DL);
27277 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
27278 SDValue Addp = DAG.getNode(Opcode: AArch64ISD::ADDP, DL: N, VT: LoHi.first.getValueType(),
27279 N1: LoHi.first, N2: LoHi.second);
27280
27281 // Shuffle the elements back into order.
27282 SmallVector<int> NMask;
27283 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
27284 NMask.push_back(Elt: I);
27285 NMask.push_back(Elt: I);
27286 }
27287 Results.push_back(
27288 Elt: DAG.getVectorShuffle(VT, dl: DL,
27289 N1: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: Addp,
27290 N2: DAG.getUNDEF(VT: LoHi.first.getValueType())),
27291 N2: DAG.getUNDEF(VT), Mask: NMask));
27292}
27293
27294static void ReplaceReductionResults(SDNode *N,
27295 SmallVectorImpl<SDValue> &Results,
27296 SelectionDAG &DAG, unsigned InterOp,
27297 unsigned AcrossOp) {
27298 EVT LoVT, HiVT;
27299 SDValue Lo, Hi;
27300 SDLoc DL(N);
27301 std::tie(args&: LoVT, args&: HiVT) = DAG.GetSplitDestVTs(VT: N->getValueType(ResNo: 0));
27302 std::tie(args&: Lo, args&: Hi) = DAG.SplitVectorOperand(N, OpNo: 0);
27303 SDValue InterVal = DAG.getNode(Opcode: InterOp, DL, VT: LoVT, N1: Lo, N2: Hi);
27304 SDValue SplitVal = DAG.getNode(Opcode: AcrossOp, DL, VT: LoVT, Operand: InterVal);
27305 Results.push_back(Elt: SplitVal);
27306}
27307
27308void AArch64TargetLowering::ReplaceExtractSubVectorResults(
27309 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
27310 SDValue In = N->getOperand(Num: 0);
27311 EVT InVT = In.getValueType();
27312
27313 // Common code will handle these just fine.
27314 if (!InVT.isScalableVector() || !InVT.isInteger())
27315 return;
27316
27317 SDLoc DL(N);
27318 EVT VT = N->getValueType(ResNo: 0);
27319
27320 // The following checks bail if this is not a halving operation.
27321
27322 ElementCount ResEC = VT.getVectorElementCount();
27323
27324 if (InVT.getVectorElementCount() != (ResEC * 2))
27325 return;
27326
27327 auto *CIndex = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
27328 if (!CIndex)
27329 return;
27330
27331 unsigned Index = CIndex->getZExtValue();
27332 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
27333 return;
27334
27335 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
27336 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(Context&: *DAG.getContext());
27337
27338 SDValue Half = DAG.getNode(Opcode, DL, VT: ExtendedHalfVT, Operand: N->getOperand(Num: 0));
27339 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Half));
27340}
27341
27342void AArch64TargetLowering::ReplaceGetActiveLaneMaskResults(
27343 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
27344 assert((Subtarget->hasSVE2p1() ||
27345 (Subtarget->hasSME2() && Subtarget->isStreaming())) &&
27346 "Custom lower of get.active.lane.mask missing required feature.");
27347
27348 assert(N->getValueType(0) == MVT::nxv32i1 &&
27349 "Unexpected result type for get.active.lane.mask");
27350
27351 SDLoc DL(N);
27352 SDValue Idx = N->getOperand(Num: 0);
27353 SDValue TC = N->getOperand(Num: 1);
27354
27355 assert(Idx.getValueType().getFixedSizeInBits() <= 64 &&
27356 "Unexpected operand type for get.active.lane.mask");
27357
27358 if (Idx.getValueType() != MVT::i64) {
27359 Idx = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: Idx);
27360 TC = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: TC);
27361 }
27362
27363 SDValue ID =
27364 DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_whilelo_x2, DL, VT: MVT::i64);
27365 EVT HalfVT = N->getValueType(ResNo: 0).getHalfNumVectorElementsVT(Context&: *DAG.getContext());
27366 auto WideMask =
27367 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, ResultTys: {HalfVT, HalfVT}, Ops: {ID, Idx, TC});
27368
27369 Results.push_back(Elt: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: N->getValueType(ResNo: 0),
27370 Ops: {WideMask.getValue(R: 0), WideMask.getValue(R: 1)}));
27371}
27372
27373// Create an even/odd pair of X registers holding integer value V.
27374static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
27375 SDLoc DL(V.getNode());
27376 auto [VLo, VHi] = DAG.SplitScalar(N: V, DL, LoVT: MVT::i64, HiVT: MVT::i64);
27377 if (DAG.getDataLayout().isBigEndian())
    std::swap(a&: VLo, b&: VHi);
27379 SDValue RegClass =
27380 DAG.getTargetConstant(Val: AArch64::XSeqPairsClassRegClassID, DL, VT: MVT::i32);
27381 SDValue SubReg0 = DAG.getTargetConstant(Val: AArch64::sube64, DL, VT: MVT::i32);
27382 SDValue SubReg1 = DAG.getTargetConstant(Val: AArch64::subo64, DL, VT: MVT::i32);
27383 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
27384 return SDValue(
27385 DAG.getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: DL, VT: MVT::Untyped, Ops), 0);
27386}
27387
27388static void ReplaceCMP_SWAP_128Results(SDNode *N,
27389 SmallVectorImpl<SDValue> &Results,
27390 SelectionDAG &DAG,
27391 const AArch64Subtarget *Subtarget) {
27392 assert(N->getValueType(0) == MVT::i128 &&
27393 "AtomicCmpSwap on types less than 128 should be legal");
27394
27395 MachineMemOperand *MemOp = cast<MemSDNode>(Val: N)->getMemOperand();
27396 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
27397 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
27398 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
27399 SDValue Ops[] = {
27400 createGPRPairNode(DAG, V: N->getOperand(Num: 2)), // Compare value
27401 createGPRPairNode(DAG, V: N->getOperand(Num: 3)), // Store value
27402 N->getOperand(Num: 1), // Ptr
27403 N->getOperand(Num: 0), // Chain in
27404 };
27405
27406 unsigned Opcode;
27407 switch (MemOp->getMergedOrdering()) {
27408 case AtomicOrdering::Monotonic:
27409 Opcode = AArch64::CASPX;
27410 break;
27411 case AtomicOrdering::Acquire:
27412 Opcode = AArch64::CASPAX;
27413 break;
27414 case AtomicOrdering::Release:
27415 Opcode = AArch64::CASPLX;
27416 break;
27417 case AtomicOrdering::AcquireRelease:
27418 case AtomicOrdering::SequentiallyConsistent:
27419 Opcode = AArch64::CASPALX;
27420 break;
27421 default:
27422 llvm_unreachable("Unexpected ordering!");
27423 }
27424
27425 MachineSDNode *CmpSwap = DAG.getMachineNode(
27426 Opcode, dl: SDLoc(N), VTs: DAG.getVTList(VT1: MVT::Untyped, VT2: MVT::Other), Ops);
27427 DAG.setNodeMemRefs(N: CmpSwap, NewMemRefs: {MemOp});
27428
27429 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
27430 if (DAG.getDataLayout().isBigEndian())
27431 std::swap(a&: SubReg1, b&: SubReg2);
27432 SDValue Lo = DAG.getTargetExtractSubreg(SRIdx: SubReg1, DL: SDLoc(N), VT: MVT::i64,
27433 Operand: SDValue(CmpSwap, 0));
27434 SDValue Hi = DAG.getTargetExtractSubreg(SRIdx: SubReg2, DL: SDLoc(N), VT: MVT::i64,
27435 Operand: SDValue(CmpSwap, 0));
27436 Results.push_back(
27437 Elt: DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SDLoc(N), VT: MVT::i128, N1: Lo, N2: Hi));
27438 Results.push_back(Elt: SDValue(CmpSwap, 1)); // Chain out
27439 return;
27440 }
27441
27442 unsigned Opcode;
27443 switch (MemOp->getMergedOrdering()) {
27444 case AtomicOrdering::Monotonic:
27445 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
27446 break;
27447 case AtomicOrdering::Acquire:
27448 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
27449 break;
27450 case AtomicOrdering::Release:
27451 Opcode = AArch64::CMP_SWAP_128_RELEASE;
27452 break;
27453 case AtomicOrdering::AcquireRelease:
27454 case AtomicOrdering::SequentiallyConsistent:
27455 Opcode = AArch64::CMP_SWAP_128;
27456 break;
27457 default:
27458 llvm_unreachable("Unexpected ordering!");
27459 }
27460
27461 SDLoc DL(N);
27462 auto Desired = DAG.SplitScalar(N: N->getOperand(Num: 2), DL, LoVT: MVT::i64, HiVT: MVT::i64);
27463 auto New = DAG.SplitScalar(N: N->getOperand(Num: 3), DL, LoVT: MVT::i64, HiVT: MVT::i64);
27464 SDValue Ops[] = {N->getOperand(Num: 1), Desired.first, Desired.second,
27465 New.first, New.second, N->getOperand(Num: 0)};
27466 SDNode *CmpSwap = DAG.getMachineNode(
27467 Opcode, dl: SDLoc(N), VTs: DAG.getVTList(VT1: MVT::i64, VT2: MVT::i64, VT3: MVT::i32, VT4: MVT::Other),
27468 Ops);
27469 DAG.setNodeMemRefs(N: cast<MachineSDNode>(Val: CmpSwap), NewMemRefs: {MemOp});
27470
27471 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BUILD_PAIR, DL, VT: MVT::i128,
27472 N1: SDValue(CmpSwap, 0), N2: SDValue(CmpSwap, 1)));
27473 Results.push_back(Elt: SDValue(CmpSwap, 3));
27474}
27475
27476static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
27477 AtomicOrdering Ordering) {
27478 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
27479 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
27480 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
27481 // ATOMIC_LOAD_CLR at any point.
27482 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
27483 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
27484 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
27485 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
27486
27487 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
27488 // The operand will need to be XORed in a separate step.
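    // (LDCLRP atomically clears the bits that are set in the operand, so
    // "x & v" is implemented as LDCLRP with ~v; the inversion is done by the
    // caller, see ReplaceATOMIC_LOAD_128Results.)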
    switch (Ordering) {
    case AtomicOrdering::Monotonic:
      return AArch64::LDCLRP;
    case AtomicOrdering::Acquire:
      return AArch64::LDCLRPA;
    case AtomicOrdering::Release:
      return AArch64::LDCLRPL;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      return AArch64::LDCLRPAL;
    default:
      llvm_unreachable("Unexpected ordering!");
    }
27506 }
27507
27508 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
    switch (Ordering) {
    case AtomicOrdering::Monotonic:
      return AArch64::LDSETP;
    case AtomicOrdering::Acquire:
      return AArch64::LDSETPA;
    case AtomicOrdering::Release:
      return AArch64::LDSETPL;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      return AArch64::LDSETPAL;
    default:
      llvm_unreachable("Unexpected ordering!");
    }
27526 }
27527
27528 if (ISDOpcode == ISD::ATOMIC_SWAP) {
    switch (Ordering) {
    case AtomicOrdering::Monotonic:
      return AArch64::SWPP;
    case AtomicOrdering::Acquire:
      return AArch64::SWPPA;
    case AtomicOrdering::Release:
      return AArch64::SWPPL;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      return AArch64::SWPPAL;
    default:
      llvm_unreachable("Unexpected ordering!");
    }
27546 }
27547
27548 llvm_unreachable("Unexpected ISDOpcode!");
27549}
27550
27551static void ReplaceATOMIC_LOAD_128Results(SDNode *N,
27552 SmallVectorImpl<SDValue> &Results,
27553 SelectionDAG &DAG,
27554 const AArch64Subtarget *Subtarget) {
  // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
27556 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
27557 // rather than the CASP instructions, because CASP has register classes for
27558 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
27559 // to present them as single operands. LSE128 instructions use the GPR64
27560 // register class (because the pair does not have to be sequential), like
27561 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
27562
27563 assert(N->getValueType(0) == MVT::i128 &&
27564 "AtomicLoadXXX on types less than 128 should be legal");
27565
27566 if (!Subtarget->hasLSE128())
27567 return;
27568
27569 MachineMemOperand *MemOp = cast<MemSDNode>(Val: N)->getMemOperand();
27570 const SDValue &Chain = N->getOperand(Num: 0);
27571 const SDValue &Ptr = N->getOperand(Num: 1);
27572 const SDValue &Val128 = N->getOperand(Num: 2);
27573 std::pair<SDValue, SDValue> Val2x64 =
27574 DAG.SplitScalar(N: Val128, DL: SDLoc(Val128), LoVT: MVT::i64, HiVT: MVT::i64);
27575
27576 const unsigned ISDOpcode = N->getOpcode();
27577 const unsigned MachineOpcode =
27578 getAtomicLoad128Opcode(ISDOpcode, Ordering: MemOp->getMergedOrdering());
27579
27580 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
27581 SDLoc DL(Val128);
27582 Val2x64.first =
27583 DAG.getNode(Opcode: ISD::XOR, DL, VT: MVT::i64,
27584 N1: DAG.getAllOnesConstant(DL, VT: MVT::i64), N2: Val2x64.first);
27585 Val2x64.second =
27586 DAG.getNode(Opcode: ISD::XOR, DL, VT: MVT::i64,
27587 N1: DAG.getAllOnesConstant(DL, VT: MVT::i64), N2: Val2x64.second);
27588 }
27589
27590 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
27591 if (DAG.getDataLayout().isBigEndian())
27592 std::swap(a&: Ops[0], b&: Ops[1]);
27593
27594 MachineSDNode *AtomicInst =
27595 DAG.getMachineNode(Opcode: MachineOpcode, dl: SDLoc(N),
27596 VTs: DAG.getVTList(VT1: MVT::i64, VT2: MVT::i64, VT3: MVT::Other), Ops);
27597
27598 DAG.setNodeMemRefs(N: AtomicInst, NewMemRefs: {MemOp});
27599
27600 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
27601 if (DAG.getDataLayout().isBigEndian())
27602 std::swap(a&: Lo, b&: Hi);
27603
27604 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SDLoc(N), VT: MVT::i128, N1: Lo, N2: Hi));
27605 Results.push_back(Elt: SDValue(AtomicInst, 2)); // Chain out
27606}
27607
27608void AArch64TargetLowering::ReplaceNodeResults(
27609 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
27610 switch (N->getOpcode()) {
27611 default:
27612 llvm_unreachable("Don't know how to custom expand this");
27613 case ISD::BITCAST:
27614 ReplaceBITCASTResults(N, Results, DAG);
27615 return;
27616 case ISD::VECREDUCE_ADD:
27617 case ISD::VECREDUCE_SMAX:
27618 case ISD::VECREDUCE_SMIN:
27619 case ISD::VECREDUCE_UMAX:
27620 case ISD::VECREDUCE_UMIN:
27621 Results.push_back(Elt: LowerVECREDUCE(Op: SDValue(N, 0), DAG));
27622 return;
27623 case ISD::VECTOR_COMPRESS:
27624 if (SDValue Res = LowerVECTOR_COMPRESS(Op: SDValue(N, 0), DAG))
27625 Results.push_back(Elt: Res);
27626 return;
27627 case ISD::ADD:
27628 case ISD::FADD:
27629 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
27630 return;
27631
27632 case ISD::CTPOP:
27633 case ISD::PARITY:
27634 if (SDValue Result = LowerCTPOP_PARITY(Op: SDValue(N, 0), DAG))
27635 Results.push_back(Elt: Result);
27636 return;
27637 case AArch64ISD::SADDV:
27638 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::ADD, AcrossOp: AArch64ISD::SADDV);
27639 return;
27640 case AArch64ISD::UADDV:
27641 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::ADD, AcrossOp: AArch64ISD::UADDV);
27642 return;
27643 case AArch64ISD::SMINV:
27644 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::SMIN, AcrossOp: AArch64ISD::SMINV);
27645 return;
27646 case AArch64ISD::UMINV:
27647 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::UMIN, AcrossOp: AArch64ISD::UMINV);
27648 return;
27649 case AArch64ISD::SMAXV:
27650 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::SMAX, AcrossOp: AArch64ISD::SMAXV);
27651 return;
27652 case AArch64ISD::UMAXV:
27653 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::UMAX, AcrossOp: AArch64ISD::UMAXV);
27654 return;
27655 case ISD::MULHS:
27656 if (useSVEForFixedLengthVectorVT(VT: SDValue(N, 0).getValueType()))
27657 Results.push_back(
27658 Elt: LowerToPredicatedOp(Op: SDValue(N, 0), DAG, NewOp: AArch64ISD::MULHS_PRED));
27659 return;
27660 case ISD::MULHU:
27661 if (useSVEForFixedLengthVectorVT(VT: SDValue(N, 0).getValueType()))
27662 Results.push_back(
27663 Elt: LowerToPredicatedOp(Op: SDValue(N, 0), DAG, NewOp: AArch64ISD::MULHU_PRED));
27664 return;
27665 case ISD::FP_TO_UINT:
27666 case ISD::FP_TO_SINT:
27667 case ISD::STRICT_FP_TO_SINT:
27668 case ISD::STRICT_FP_TO_UINT:
27669 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
27670 // Let normal code take care of it by not adding anything to Results.
27671 return;
27672 case ISD::ATOMIC_CMP_SWAP:
27673 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
27674 return;
27675 case ISD::ATOMIC_LOAD_CLR:
27676 assert(N->getValueType(0) != MVT::i128 &&
27677 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
27678 break;
27679 case ISD::ATOMIC_LOAD_AND:
27680 case ISD::ATOMIC_LOAD_OR:
27681 case ISD::ATOMIC_SWAP: {
27682 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
27683 "Expected 128-bit atomicrmw.");
    // These need custom type legalisation, so we lower directly to the machine
    // instruction.
27685 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
27686 return;
27687 }
27688 case ISD::ADDRSPACECAST: {
27689 SDValue V = LowerADDRSPACECAST(Op: SDValue(N, 0), DAG);
27690 Results.push_back(Elt: V);
27691 return;
27692 }
27693 case ISD::ATOMIC_LOAD:
27694 case ISD::LOAD: {
27695 MemSDNode *LoadNode = cast<MemSDNode>(Val: N);
27696 EVT MemVT = LoadNode->getMemoryVT();
    // Handle lowering 256-bit non-temporal loads into LDNP for little-endian
    // targets.
27699 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
27700 MemVT.getSizeInBits() == 256u &&
27701 (MemVT.getScalarSizeInBits() == 8u ||
27702 MemVT.getScalarSizeInBits() == 16u ||
27703 MemVT.getScalarSizeInBits() == 32u ||
27704 MemVT.getScalarSizeInBits() == 64u)) {
27705
27706 SDValue Result = DAG.getMemIntrinsicNode(
27707 Opcode: AArch64ISD::LDNP, dl: SDLoc(N),
27708 VTList: DAG.getVTList(VTs: {MemVT.getHalfNumVectorElementsVT(Context&: *DAG.getContext()),
27709 MemVT.getHalfNumVectorElementsVT(Context&: *DAG.getContext()),
27710 MVT::Other}),
27711 Ops: {LoadNode->getChain(), LoadNode->getBasePtr()},
27712 MemVT: LoadNode->getMemoryVT(), MMO: LoadNode->getMemOperand());
27713
27714 SDValue Pair = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(N), VT: MemVT,
27715 N1: Result.getValue(R: 0), N2: Result.getValue(R: 1));
27716 Results.append(IL: {Pair, Result.getValue(R: 2) /* Chain */});
27717 return;
27718 }
27719
27720 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
27721 LoadNode->getMemoryVT() != MVT::i128) {
      // Loads that are neither volatile nor atomic, or that are not 128 bits
      // wide, are handled by normal lowering and AArch64's load/store optimizer.
27724 return;
27725 }
27726
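    // Volatile or atomic i128 loads are lowered to a 64-bit register-pair load:
    // LDP, or LDIAPP when acquire semantics are required (RCPC3).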
27727 if (SDValue(N, 0).getValueType() == MVT::i128) {
27728 auto *AN = dyn_cast<AtomicSDNode>(Val: LoadNode);
27729 bool isLoadAcquire =
27730 AN && AN->getSuccessOrdering() == AtomicOrdering::Acquire;
27731 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
27732
27733 if (isLoadAcquire)
27734 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
27735
27736 SDValue Result = DAG.getMemIntrinsicNode(
27737 Opcode, dl: SDLoc(N), VTList: DAG.getVTList(VTs: {MVT::i64, MVT::i64, MVT::Other}),
27738 Ops: {LoadNode->getChain(), LoadNode->getBasePtr()},
27739 MemVT: LoadNode->getMemoryVT(), MMO: LoadNode->getMemOperand());
27740
27741 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
27742
27743 SDValue Pair =
27744 DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SDLoc(N), VT: MVT::i128,
27745 N1: Result.getValue(R: FirstRes), N2: Result.getValue(R: 1 - FirstRes));
27746 Results.append(IL: {Pair, Result.getValue(R: 2) /* Chain */});
27747 }
27748 return;
27749 }
27750 case ISD::EXTRACT_SUBVECTOR:
27751 ReplaceExtractSubVectorResults(N, Results, DAG);
27752 return;
27753 case ISD::INSERT_SUBVECTOR:
27754 case ISD::CONCAT_VECTORS:
27755 // Custom lowering has been requested for INSERT_SUBVECTOR and
27756 // CONCAT_VECTORS -- but delegate to common code for result type
27757 // legalisation
27758 return;
27759 case ISD::GET_ACTIVE_LANE_MASK:
27760 ReplaceGetActiveLaneMaskResults(N, Results, DAG);
27761 return;
27762 case ISD::INTRINSIC_WO_CHAIN: {
27763 EVT VT = N->getValueType(ResNo: 0);
27764
27765 Intrinsic::ID IntID =
27766 static_cast<Intrinsic::ID>(N->getConstantOperandVal(Num: 0));
27767 switch (IntID) {
27768 default:
27769 return;
27770 case Intrinsic::aarch64_sve_clasta_n: {
27771 assert((VT == MVT::i8 || VT == MVT::i16) &&
27772 "custom lowering for unexpected type");
27773 SDLoc DL(N);
27774 auto Op2 = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: N->getOperand(Num: 2));
27775 auto V = DAG.getNode(Opcode: AArch64ISD::CLASTA_N, DL, VT: MVT::i32,
27776 N1: N->getOperand(Num: 1), N2: Op2, N3: N->getOperand(Num: 3));
27777 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: V));
27778 return;
27779 }
27780 case Intrinsic::aarch64_sve_clastb_n: {
27781 assert((VT == MVT::i8 || VT == MVT::i16) &&
27782 "custom lowering for unexpected type");
27783 SDLoc DL(N);
27784 auto Op2 = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: N->getOperand(Num: 2));
27785 auto V = DAG.getNode(Opcode: AArch64ISD::CLASTB_N, DL, VT: MVT::i32,
27786 N1: N->getOperand(Num: 1), N2: Op2, N3: N->getOperand(Num: 3));
27787 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: V));
27788 return;
27789 }
27790 case Intrinsic::aarch64_sve_lasta: {
27791 assert((VT == MVT::i8 || VT == MVT::i16) &&
27792 "custom lowering for unexpected type");
27793 SDLoc DL(N);
27794 auto V = DAG.getNode(Opcode: AArch64ISD::LASTA, DL, VT: MVT::i32,
27795 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
27796 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: V));
27797 return;
27798 }
27799 case Intrinsic::aarch64_sve_lastb: {
27800 assert((VT == MVT::i8 || VT == MVT::i16) &&
27801 "custom lowering for unexpected type");
27802 SDLoc DL(N);
27803 auto V = DAG.getNode(Opcode: AArch64ISD::LASTB, DL, VT: MVT::i32,
27804 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
27805 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: V));
27806 return;
27807 }
27808 case Intrinsic::aarch64_sme_in_streaming_mode: {
27809 SDLoc DL(N);
27810 SDValue Chain = DAG.getEntryNode();
27811 SDValue RuntimePStateSM =
27812 getRuntimePStateSM(DAG, Chain, DL, VT: N->getValueType(ResNo: 0));
27813 Results.push_back(
27814 Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: RuntimePStateSM));
27815 return;
27816 }
27817 case Intrinsic::experimental_vector_match: {
27818 if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
27819 return;
27820
27821 // NOTE: Only trivial type promotion is supported.
27822 EVT NewVT = getTypeToTransformTo(Context&: *DAG.getContext(), VT);
27823 if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
27824 return;
27825
27826 SDLoc DL(N);
27827 auto V = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: NewVT, Ops: N->ops());
27828 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: V));
27829 return;
27830 }
27831 }
27832 }
27833 case ISD::READ_REGISTER: {
27834 SDLoc DL(N);
27835 assert(N->getValueType(0) == MVT::i128 &&
27836 "READ_REGISTER custom lowering is only for 128-bit sysregs");
27837 SDValue Chain = N->getOperand(Num: 0);
27838 SDValue SysRegName = N->getOperand(Num: 1);
27839
27840 SDValue Result = DAG.getNode(
27841 Opcode: AArch64ISD::MRRS, DL, VTList: DAG.getVTList(VTs: {MVT::i64, MVT::i64, MVT::Other}),
27842 N1: Chain, N2: SysRegName);
27843
    // System registers have no endianness; Result.getValue(0) always contains
    // the lower half of the 128-bit system register value.
27846 SDValue Pair = DAG.getNode(Opcode: ISD::BUILD_PAIR, DL, VT: MVT::i128,
27847 N1: Result.getValue(R: 0), N2: Result.getValue(R: 1));
27848 Results.push_back(Elt: Pair);
27849 Results.push_back(Elt: Result.getValue(R: 2)); // Chain
27850 return;
27851 }
27852 }
27853}
27854
27855bool AArch64TargetLowering::useLoadStackGuardNode(const Module &M) const {
27856 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
27857 return TargetLowering::useLoadStackGuardNode(M);
27858 return true;
27859}
27860
27861unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
27862 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
27863 // reciprocal if there are three or more FDIVs.
27864 return 3;
27865}
27866
27867TargetLoweringBase::LegalizeTypeAction
27868AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
  // During type legalization, we prefer to widen v1i8, v1i16 and v1i32 to v8i8,
  // v4i16 and v2i32 rather than promote them; v1f32 is widened for the same
  // reason.
27871 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
27872 VT == MVT::v1f32)
27873 return TypeWidenVector;
27874
27875 return TargetLoweringBase::getPreferredVectorAction(VT);
27876}
27877
27878// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
27879// provided the address is 16-byte aligned.
27880bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
27881 if (!Subtarget->hasLSE2())
27882 return false;
27883
27884 if (auto LI = dyn_cast<LoadInst>(Val: I))
27885 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
27886 LI->getAlign() >= Align(16);
27887
27888 if (auto SI = dyn_cast<StoreInst>(Val: I))
27889 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
27890 SI->getAlign() >= Align(16);
27891
27892 return false;
27893}
27894
27895bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const {
27896 if (!Subtarget->hasLSE128())
27897 return false;
27898
  // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
  // clobbers the two value registers (they receive the old memory contents).
27901 if (const auto *SI = dyn_cast<StoreInst>(Val: I))
27902 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
27903 SI->getAlign() >= Align(16) &&
27904 (SI->getOrdering() == AtomicOrdering::Release ||
27905 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
27906
27907 if (const auto *RMW = dyn_cast<AtomicRMWInst>(Val: I))
27908 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
27909 RMW->getAlign() >= Align(16) &&
27910 (RMW->getOperation() == AtomicRMWInst::Xchg ||
27911 RMW->getOperation() == AtomicRMWInst::And ||
27912 RMW->getOperation() == AtomicRMWInst::Or);
27913
27914 return false;
27915}
27916
27917bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
27918 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
27919 return false;
27920
27921 if (auto LI = dyn_cast<LoadInst>(Val: I))
27922 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
27923 LI->getAlign() >= Align(16) &&
27924 LI->getOrdering() == AtomicOrdering::Acquire;
27925
27926 if (auto SI = dyn_cast<StoreInst>(Val: I))
27927 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
27928 SI->getAlign() >= Align(16) &&
27929 SI->getOrdering() == AtomicOrdering::Release;
27930
27931 return false;
27932}
27933
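// LDP/STP give single-copy atomicity but carry no ordering, so 128-bit atomics
// implemented with them still need explicit fences. The RCPC3 and LSE128
// instructions have ordered forms and need no extra fences.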
27934bool AArch64TargetLowering::shouldInsertFencesForAtomic(
27935 const Instruction *I) const {
27936 if (isOpSuitableForRCPC3(I))
27937 return false;
27938 if (isOpSuitableForLSE128(I))
27939 return false;
27940 if (isOpSuitableForLDPSTP(I))
27941 return true;
27942 return false;
27943}
27944
27945bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
27946 const Instruction *I) const {
27947 // Store-Release instructions only provide seq_cst guarantees when paired with
27948 // Load-Acquire instructions. MSVC CRT does not use these instructions to
27949 // implement seq_cst loads and stores, so we need additional explicit fences
27950 // after memory writes.
27951 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
27952 return false;
27953
27954 switch (I->getOpcode()) {
27955 default:
27956 return false;
27957 case Instruction::AtomicCmpXchg:
27958 return cast<AtomicCmpXchgInst>(Val: I)->getSuccessOrdering() ==
27959 AtomicOrdering::SequentiallyConsistent;
27960 case Instruction::AtomicRMW:
27961 return cast<AtomicRMWInst>(Val: I)->getOrdering() ==
27962 AtomicOrdering::SequentiallyConsistent;
27963 case Instruction::Store:
27964 return cast<StoreInst>(Val: I)->getOrdering() ==
27965 AtomicOrdering::SequentiallyConsistent;
27966 }
27967}
27968
// Loads and stores narrower than 128 bits are already atomic; wider ones are
// doomed anyway, so defer to the default libcall and blame the OS when things
// go wrong.
27972TargetLoweringBase::AtomicExpansionKind
27973AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
27974 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
27975 if (Size != 128)
27976 return AtomicExpansionKind::None;
27977 if (isOpSuitableForRCPC3(I: SI))
27978 return AtomicExpansionKind::None;
27979 if (isOpSuitableForLSE128(I: SI))
27980 return AtomicExpansionKind::Expand;
27981 if (isOpSuitableForLDPSTP(I: SI))
27982 return AtomicExpansionKind::None;
27983 return AtomicExpansionKind::Expand;
27984}
27985
// Loads and stores narrower than 128 bits are already atomic; wider ones are
// doomed anyway, so defer to the default libcall and blame the OS when things
// go wrong.
27989TargetLowering::AtomicExpansionKind
27990AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
27991 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
27992
27993 if (Size != 128)
27994 return AtomicExpansionKind::None;
27995 if (isOpSuitableForRCPC3(I: LI))
27996 return AtomicExpansionKind::None;
27997 // No LSE128 loads
27998 if (isOpSuitableForLDPSTP(I: LI))
27999 return AtomicExpansionKind::None;
28000
28001 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
28002 // implement atomicrmw without spilling. If the target address is also on the
28003 // stack and close enough to the spill slot, this can lead to a situation
28004 // where the monitor always gets cleared and the atomic operation can never
28005 // succeed. So at -O0 lower this operation to a CAS loop.
28006 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
28007 return AtomicExpansionKind::CmpXChg;
28008
  // Using CAS for an atomic load has a better chance of succeeding in
  // high-contention situations, so use it if available.
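  // For illustration, AtomicExpand then rewrites such a load into IR along the
  // lines of:
  //   %pair = cmpxchg ptr %p, i128 0, i128 0 monotonic monotonic
  //   %val = extractvalue { i128, i1 } %pair, 0
  // which LSE can implement with CASP.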
28011 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
28012 : AtomicExpansionKind::LLSC;
28013}
28014
28015// Return true if the atomic operation expansion will lower to use a library
28016// call, and is thus ineligible to use an LLSC expansion.
28017static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget,
28018 const AtomicRMWInst *RMW) {
28019 if (!RMW->isFloatingPointOperation())
28020 return false;
28021 switch (RMW->getType()->getScalarType()->getTypeID()) {
28022 case Type::FloatTyID:
28023 case Type::DoubleTyID:
28024 case Type::HalfTyID:
28025 case Type::BFloatTyID:
28026 // Will use soft float
28027 return !Subtarget.hasFPARMv8();
28028 default:
28029 // fp128 will emit library calls.
28030 return true;
28031 }
28032
28033 llvm_unreachable("covered type switch");
28034}
28035
// The "default" for integer RMW operations is to expand to an LL/SC loop.
// However, with the LSE instructions (or outline-atomics mode, which provides
// library routines in place of the LSE instructions), we can directly emit many
// operations instead.
28040TargetLowering::AtomicExpansionKind
28041AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
28042 Type *Ty = AI->getType();
28043 unsigned Size = Ty->getPrimitiveSizeInBits();
28044 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
28045
28046 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
28047 (AI->getOperation() == AtomicRMWInst::Xchg ||
28048 AI->getOperation() == AtomicRMWInst::Or ||
28049 AI->getOperation() == AtomicRMWInst::And);
28050 if (CanUseLSE128)
28051 return AtomicExpansionKind::None;
28052
  // If LSFE is available, use atomic FP instructions in preference to expansion.
28054 if (Subtarget->hasLSFE() && (AI->getOperation() == AtomicRMWInst::FAdd ||
28055 AI->getOperation() == AtomicRMWInst::FMax ||
28056 AI->getOperation() == AtomicRMWInst::FMin ||
28057 AI->getOperation() == AtomicRMWInst::FMaximum ||
28058 AI->getOperation() == AtomicRMWInst::FMinimum))
28059 return AtomicExpansionKind::None;
28060
28061 // Nand is not supported in LSE.
28062 // Leave 128 bits to LLSC or CmpXChg.
28063 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 &&
28064 !AI->isFloatingPointOperation()) {
28065 if (Subtarget->hasLSE())
28066 return AtomicExpansionKind::None;
28067 if (Subtarget->outlineAtomics()) {
      // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
      // Don't outline them unless:
      // (1) high-level <atomic> support is approved:
      //   http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
      // (2) low-level libgcc and compiler-rt support is implemented by the
      //   min/max outline atomics helpers.
28074 if (AI->getOperation() != AtomicRMWInst::Min &&
28075 AI->getOperation() != AtomicRMWInst::Max &&
28076 AI->getOperation() != AtomicRMWInst::UMin &&
28077 AI->getOperation() != AtomicRMWInst::UMax) {
28078 return AtomicExpansionKind::None;
28079 }
28080 }
28081 }
28082
28083 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
28084 // implement atomicrmw without spilling. If the target address is also on the
28085 // stack and close enough to the spill slot, this can lead to a situation
28086 // where the monitor always gets cleared and the atomic operation can never
28087 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
28088 // we have a single CAS instruction that can replace the loop.
28089 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
28090 Subtarget->hasLSE() || rmwOpMayLowerToLibcall(Subtarget: *Subtarget, RMW: AI))
28091 return AtomicExpansionKind::CmpXChg;
28092
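  // As a sketch, the LL/SC expansion of, e.g., "atomicrmw add ptr %p, i64 %v
  // seq_cst" is eventually selected to a loop of roughly this shape:
  //   1:  ldaxr x8, [x0]
  //       add   x8, x8, x1
  //       stlxr w9, x8, [x0]
  //       cbnz  w9, 1b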
28093 return AtomicExpansionKind::LLSC;
28094}
28095
28096TargetLowering::AtomicExpansionKind
28097AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
28098 AtomicCmpXchgInst *AI) const {
28099 // If subtarget has LSE, leave cmpxchg intact for codegen.
28100 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
28101 return AtomicExpansionKind::None;
28102 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
28103 // implement cmpxchg without spilling. If the address being exchanged is also
28104 // on the stack and close enough to the spill slot, this can lead to a
28105 // situation where the monitor always gets cleared and the atomic operation
28106 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
28107 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
28108 return AtomicExpansionKind::None;
28109
28110 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
28111 // it.
28112 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
28113 if (Size > 64)
28114 return AtomicExpansionKind::None;
28115
28116 return AtomicExpansionKind::LLSC;
28117}
28118
28119Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
28120 Type *ValueTy, Value *Addr,
28121 AtomicOrdering Ord) const {
28122 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
28123 bool IsAcquire = isAcquireOrStronger(AO: Ord);
28124
  // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp and
  // ldaxp intrinsics must return {i64, i64} and we have to recombine the two
  // halves into a single i128 here.
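  // For example, an i128 acquire load becomes
  //   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(ptr %addr)
  // followed by the zext/shl/or sequence below to rebuild the i128 value.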
28128 if (ValueTy->getPrimitiveSizeInBits() == 128) {
28129 Intrinsic::ID Int =
28130 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
28131
28132 Value *LoHi =
28133 Builder.CreateIntrinsic(ID: Int, Args: Addr, /*FMFSource=*/nullptr, Name: "lohi");
28134
28135 Value *Lo = Builder.CreateExtractValue(Agg: LoHi, Idxs: 0, Name: "lo");
28136 Value *Hi = Builder.CreateExtractValue(Agg: LoHi, Idxs: 1, Name: "hi");
28137
28138 auto *Int128Ty = Type::getInt128Ty(C&: Builder.getContext());
28139 Lo = Builder.CreateZExt(V: Lo, DestTy: Int128Ty, Name: "lo64");
28140 Hi = Builder.CreateZExt(V: Hi, DestTy: Int128Ty, Name: "hi64");
28141
28142 Value *Or = Builder.CreateOr(
28143 LHS: Lo, RHS: Builder.CreateShl(LHS: Hi, RHS: ConstantInt::get(Ty: Int128Ty, V: 64)), Name: "val64");
28144 return Builder.CreateBitCast(V: Or, DestTy: ValueTy);
28145 }
28146
28147 Type *Tys[] = { Addr->getType() };
28148 Intrinsic::ID Int =
28149 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
28150
28151 const DataLayout &DL = M->getDataLayout();
28152 IntegerType *IntEltTy = Builder.getIntNTy(N: DL.getTypeSizeInBits(Ty: ValueTy));
28153 CallInst *CI = Builder.CreateIntrinsic(ID: Int, Types: Tys, Args: Addr);
28154 CI->addParamAttr(ArgNo: 0, Attr: Attribute::get(Context&: Builder.getContext(),
28155 Kind: Attribute::ElementType, Ty: IntEltTy));
28156 Value *Trunc = Builder.CreateTrunc(V: CI, DestTy: IntEltTy);
28157
28158 return Builder.CreateBitCast(V: Trunc, DestTy: ValueTy);
28159}
28160
28161void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
28162 IRBuilderBase &Builder) const {
28163 Builder.CreateIntrinsic(ID: Intrinsic::aarch64_clrex, Args: {});
28164}
28165
28166Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
28167 Value *Val, Value *Addr,
28168 AtomicOrdering Ord) const {
28169 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
28170 bool IsRelease = isReleaseOrStronger(AO: Ord);
28171
28172 // Since the intrinsics must have legal type, the i128 intrinsics take two
28173 // parameters: "i64, i64". We must marshal Val into the appropriate form
28174 // before the call.
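  // For example, an i128 release store becomes
  //   %res = call i32 @llvm.aarch64.stlxp(i64 %lo, i64 %hi, ptr %addr)
  // where a non-zero result means the store-exclusive failed.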
28175 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
28176 Intrinsic::ID Int =
28177 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
28178 Function *Stxr = Intrinsic::getOrInsertDeclaration(M, id: Int);
28179 Type *Int64Ty = Type::getInt64Ty(C&: M->getContext());
28180 Type *Int128Ty = Type::getInt128Ty(C&: M->getContext());
28181
28182 Value *CastVal = Builder.CreateBitCast(V: Val, DestTy: Int128Ty);
28183
28184 Value *Lo = Builder.CreateTrunc(V: CastVal, DestTy: Int64Ty, Name: "lo");
28185 Value *Hi =
28186 Builder.CreateTrunc(V: Builder.CreateLShr(LHS: CastVal, RHS: 64), DestTy: Int64Ty, Name: "hi");
28187 return Builder.CreateCall(Callee: Stxr, Args: {Lo, Hi, Addr});
28188 }
28189
28190 Intrinsic::ID Int =
28191 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
28192 Type *Tys[] = { Addr->getType() };
28193 Function *Stxr = Intrinsic::getOrInsertDeclaration(M, id: Int, Tys);
28194
28195 const DataLayout &DL = M->getDataLayout();
28196 IntegerType *IntValTy = Builder.getIntNTy(N: DL.getTypeSizeInBits(Ty: Val->getType()));
28197 Val = Builder.CreateBitCast(V: Val, DestTy: IntValTy);
28198
28199 CallInst *CI = Builder.CreateCall(
28200 Callee: Stxr, Args: {Builder.CreateZExtOrBitCast(
28201 V: Val, DestTy: Stxr->getFunctionType()->getParamType(i: 0)),
28202 Addr});
28203 CI->addParamAttr(ArgNo: 1, Attr: Attribute::get(Context&: Builder.getContext(),
28204 Kind: Attribute::ElementType, Ty: Val->getType()));
28205 return CI;
28206}
28207
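// SVE vector tuples (scalable types wider than 128 bits) and homogeneous
// aggregates (arrays whose flattened members all have the same type) need to
// be passed in consecutive registers.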
28208bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
28209 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
28210 const DataLayout &DL) const {
28211 if (!Ty->isArrayTy()) {
28212 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
28213 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
28214 }
28215
  // All non-aggregate members of the type must have the same type.
28217 SmallVector<EVT> ValueVTs;
28218 ComputeValueVTs(TLI: *this, DL, Ty, ValueVTs);
28219 return all_equal(Range&: ValueVTs);
28220}
28221
28222bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
28223 EVT) const {
28224 return false;
28225}
28226
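// Build a pointer at a fixed byte offset from the thread pointer; used below
// for the Android and Fuchsia stack-guard and SafeStack TLS slots.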
28227static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
28228 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
28229 Function *ThreadPointerFunc = Intrinsic::getOrInsertDeclaration(
28230 M, id: Intrinsic::thread_pointer, Tys: IRB.getPtrTy());
28231 return IRB.CreatePointerCast(
28232 V: IRB.CreateConstGEP1_32(Ty: IRB.getInt8Ty(), Ptr: IRB.CreateCall(Callee: ThreadPointerFunc),
28233 Idx0: Offset),
28234 DestTy: IRB.getPtrTy(AddrSpace: 0));
28235}
28236
28237Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
28238 // Android provides a fixed TLS slot for the stack cookie. See the definition
28239 // of TLS_SLOT_STACK_GUARD in
28240 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
28241 if (Subtarget->isTargetAndroid())
28242 return UseTlsOffset(IRB, Offset: 0x28);
28243
28244 // Fuchsia is similar.
28245 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
28246 if (Subtarget->isTargetFuchsia())
28247 return UseTlsOffset(IRB, Offset: -0x10);
28248
28249 return TargetLowering::getIRStackGuard(IRB);
28250}
28251
28252void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
  // MSVC CRT provides functionality for stack protection.
28254 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
28255 // MSVC CRT has a global variable holding security cookie.
28256 M.getOrInsertGlobal(Name: "__security_cookie",
28257 Ty: PointerType::getUnqual(C&: M.getContext()));
28258
28259 // MSVC CRT has a function to validate security cookie.
28260 FunctionCallee SecurityCheckCookie =
28261 M.getOrInsertFunction(Name: Subtarget->getSecurityCheckCookieName(),
28262 RetTy: Type::getVoidTy(C&: M.getContext()),
28263 Args: PointerType::getUnqual(C&: M.getContext()));
28264 if (Function *F = dyn_cast<Function>(Val: SecurityCheckCookie.getCallee())) {
28265 F->setCallingConv(CallingConv::Win64);
28266 F->addParamAttr(ArgNo: 0, Kind: Attribute::AttrKind::InReg);
28267 }
28268 return;
28269 }
28270 TargetLowering::insertSSPDeclarations(M);
28271}
28272
28273Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
28274 // MSVC CRT has a global variable holding security cookie.
28275 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
28276 return M.getGlobalVariable(Name: "__security_cookie");
28277 return TargetLowering::getSDagStackGuard(M);
28278}
28279
28280Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
28281 // MSVC CRT has a function to validate security cookie.
28282 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
28283 return M.getFunction(Name: Subtarget->getSecurityCheckCookieName());
28284 return TargetLowering::getSSPStackGuardCheck(M);
28285}
28286
28287Value *
28288AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
28289 // Android provides a fixed TLS slot for the SafeStack pointer. See the
28290 // definition of TLS_SLOT_SAFESTACK in
28291 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
28292 if (Subtarget->isTargetAndroid())
28293 return UseTlsOffset(IRB, Offset: 0x48);
28294
28295 // Fuchsia is similar.
28296 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
28297 if (Subtarget->isTargetFuchsia())
28298 return UseTlsOffset(IRB, Offset: -0x8);
28299
28300 return TargetLowering::getSafeStackPointerLocation(IRB);
28301}
28302
28303/// If a physical register, this returns the register that receives the
28304/// exception address on entry to an EH pad.
28305Register AArch64TargetLowering::getExceptionPointerRegister(
28306 const Constant *PersonalityFn) const {
28307 // FIXME: This is a guess. Has this been defined yet?
28308 return AArch64::X0;
28309}
28310
28311/// If a physical register, this returns the register that receives the
28312/// exception typeid on entry to a landing pad.
28313Register AArch64TargetLowering::getExceptionSelectorRegister(
28314 const Constant *PersonalityFn) const {
28315 // FIXME: This is a guess. Has this been defined yet?
28316 return AArch64::X1;
28317}
28318
28319bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
28320 const Instruction &AndI) const {
  // Only sink the 'and' mask to the cmp's use block if it masks a single bit,
  // since this likely allows the and/cmp/br to be folded into a single tbz
  // instruction. It may be beneficial to sink in other cases, but we would have
  // to check that the cmp would not get folded into the br to form a cbz for
  // these to be beneficial.
28326 ConstantInt* Mask = dyn_cast<ConstantInt>(Val: AndI.getOperand(i: 1));
28327 if (!Mask)
28328 return false;
28329 return Mask->getValue().isPowerOf2();
28330}
28331
28332bool AArch64TargetLowering::
28333 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
28334 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
28335 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
28336 SelectionDAG &DAG) const {
  // Does the baseline implementation recommend against performing the fold?
28338 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
28339 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
28340 return false;
28341 // Else, if this is a vector shift, prefer 'shl'.
28342 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
28343}
28344
28345TargetLowering::ShiftLegalizationStrategy
28346AArch64TargetLowering::preferredShiftLegalizationStrategy(
28347 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
28348 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
28349 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
28350 return ShiftLegalizationStrategy::LowerToLibcall;
28351 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
28352 ExpansionFactor);
28353}
28354
28355void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in AArch64FunctionInfo.
28357 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
28358 AFI->setIsSplitCSR(true);
28359}
28360
28361void AArch64TargetLowering::insertCopiesSplitCSR(
28362 MachineBasicBlock *Entry,
28363 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
28364 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
28365 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(MF: Entry->getParent());
28366 if (!IStart)
28367 return;
28368
28369 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
28370 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
28371 MachineBasicBlock::iterator MBBI = Entry->begin();
28372 for (const MCPhysReg *I = IStart; *I; ++I) {
28373 const TargetRegisterClass *RC = nullptr;
28374 if (AArch64::GPR64RegClass.contains(Reg: *I))
28375 RC = &AArch64::GPR64RegClass;
28376 else if (AArch64::FPR64RegClass.contains(Reg: *I))
28377 RC = &AArch64::FPR64RegClass;
28378 else
28379 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
28380
28381 Register NewVR = MRI->createVirtualRegister(RegClass: RC);
28382 // Create copy from CSR to a virtual register.
28383 // FIXME: this currently does not emit CFI pseudo-instructions, it works
28384 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
28385 // nounwind. If we want to generalize this later, we may need to emit
28386 // CFI pseudo-instructions.
28387 assert(Entry->getParent()->getFunction().hasFnAttribute(
28388 Attribute::NoUnwind) &&
28389 "Function should be nounwind in insertCopiesSplitCSR!");
28390 Entry->addLiveIn(PhysReg: *I);
28391 BuildMI(BB&: *Entry, I: MBBI, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: NewVR)
28392 .addReg(RegNo: *I);
28393
28394 // Insert the copy-back instructions right before the terminator.
28395 for (auto *Exit : Exits)
28396 BuildMI(BB&: *Exit, I: Exit->getFirstTerminator(), MIMD: DebugLoc(),
28397 MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: *I)
28398 .addReg(RegNo: NewVR);
28399 }
28400}
28401
28402bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
28403 // Integer division on AArch64 is expensive. However, when aggressively
28404 // optimizing for code size, we prefer to use a div instruction, as it is
28405 // usually smaller than the alternative sequence.
28406 // The exception to this is vector division. Since AArch64 doesn't have vector
28407 // integer division, leaving the division as-is is a loss even in terms of
28408 // size, because it will have to be scalarized, while the alternative code
28409 // sequence can be performed in vector form.
28410 bool OptSize = Attr.hasFnAttr(Kind: Attribute::MinSize);
28411 return OptSize && !VT.isVector();
28412}
28413
28414bool AArch64TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
28415 const MachineFunction &MF) const {
  // Avoid merging stores into fixed-length vectors when NEON is unavailable.
  // In the future, we could allow this when SVE is available, but currently,
  // the SVE lowerings for BUILD_VECTOR are limited to a few specific cases (and
  // the general lowering may introduce stack spills/reloads).
28420 if (MemVT.isFixedLengthVector() && !Subtarget->isNeonAvailable())
28421 return false;
28422
  // Do not merge up to the float value size (128 bits) if the no-implicit-float
  // attribute is set.
28425 bool NoFloat = MF.getFunction().hasFnAttribute(Kind: Attribute::NoImplicitFloat);
28426 return !NoFloat || MemVT.getSizeInBits() <= 64;
28427}
28428
28429bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
28430 // We want inc-of-add for scalars and sub-of-not for vectors.
28431 return VT.isScalarInteger();
28432}
28433
28434bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
28435 EVT VT) const {
  // v8f16 without fp16 needs to be extended to v8f32, which is more difficult
  // to legalize.
28438 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
28439 return false;
28440 if (FPVT == MVT::v8bf16)
28441 return false;
28442 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
28443}
28444
28445bool AArch64TargetLowering::shouldExpandCmpUsingSelects(EVT VT) const {
28446 // Expand scalar and SVE operations using selects. Neon vectors prefer sub to
28447 // avoid vselect becoming bsl / unrolling.
28448 return !VT.isFixedLengthVector();
28449}
28450
28451MachineInstr *
28452AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
28453 MachineBasicBlock::instr_iterator &MBBI,
28454 const TargetInstrInfo *TII) const {
28455 assert(MBBI->isCall() && MBBI->getCFIType() &&
28456 "Invalid call instruction for a KCFI check");
28457
28458 switch (MBBI->getOpcode()) {
28459 case AArch64::BLR:
28460 case AArch64::BLRNoIP:
28461 case AArch64::TCRETURNri:
28462 case AArch64::TCRETURNrix16x17:
28463 case AArch64::TCRETURNrix17:
28464 case AArch64::TCRETURNrinotx16:
28465 break;
28466 default:
28467 llvm_unreachable("Unexpected CFI call opcode");
28468 }
28469
28470 MachineOperand &Target = MBBI->getOperand(i: 0);
28471 assert(Target.isReg() && "Invalid target operand for an indirect call");
28472 Target.setIsRenamable(false);
28473
28474 return BuildMI(BB&: MBB, I: MBBI, MIMD: MBBI->getDebugLoc(), MCID: TII->get(Opcode: AArch64::KCFI_CHECK))
28475 .addReg(RegNo: Target.getReg())
28476 .addImm(Val: MBBI->getCFIType())
28477 .getInstr();
28478}
28479
28480bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
28481 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
28482}
28483
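// On AAPCS64 targets va_list is the struct
//   { ptr __stack, ptr __gr_top, ptr __vr_top, i32 __gr_offs, i32 __vr_offs },
// i.e. three pointers plus two 32-bit offsets. Darwin and Windows use a plain
// pointer instead.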
28484unsigned
28485AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
28486 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
28487 return getPointerTy(DL).getSizeInBits();
28488
28489 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
28490}
28491
28492void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
28493 MachineFrameInfo &MFI = MF.getFrameInfo();
28494 // If we have any vulnerable SVE stack objects then the stack protector
28495 // needs to be placed at the top of the SVE stack area, as the SVE locals
28496 // are placed above the other locals, so we allocate it as if it were a
28497 // scalable vector.
28498 // FIXME: It may be worthwhile having a specific interface for this rather
28499 // than doing it here in finalizeLowering.
28500 if (MFI.hasStackProtectorIndex()) {
28501 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
28502 if (MFI.getStackID(ObjectIdx: i) == TargetStackID::ScalableVector &&
28503 MFI.getObjectSSPLayout(ObjectIdx: i) != MachineFrameInfo::SSPLK_None) {
28504 MFI.setStackID(ObjectIdx: MFI.getStackProtectorIndex(),
28505 ID: TargetStackID::ScalableVector);
28506 MFI.setObjectAlignment(ObjectIdx: MFI.getStackProtectorIndex(), Alignment: Align(16));
28507 break;
28508 }
28509 }
28510 }
28511 MFI.computeMaxCallFrameSize(MF);
28512 TargetLoweringBase::finalizeLowering(MF);
28513}
28514
28515// Unlike X86, we let frame lowering assign offsets to all catch objects.
28516bool AArch64TargetLowering::needsFixedCatchObjects() const {
28517 return false;
28518}
28519
28520bool AArch64TargetLowering::shouldLocalize(
28521 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
28522 auto &MF = *MI.getMF();
28523 auto &MRI = MF.getRegInfo();
28524 auto maxUses = [](unsigned RematCost) {
28525 // A cost of 1 means remats are basically free.
28526 if (RematCost == 1)
28527 return std::numeric_limits<unsigned>::max();
28528 if (RematCost == 2)
28529 return 2U;
28530
28531 // Remat is too expensive, only sink if there's one user.
28532 if (RematCost > 2)
28533 return 1U;
28534 llvm_unreachable("Unexpected remat cost");
28535 };
28536
28537 unsigned Opc = MI.getOpcode();
28538 switch (Opc) {
28539 case TargetOpcode::G_GLOBAL_VALUE: {
    // On Darwin, TLS global vars get selected into function calls, which
    // we don't want localized, as they can get moved into the middle of
    // another call sequence.
28543 const GlobalValue &GV = *MI.getOperand(i: 1).getGlobal();
28544 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
28545 return false;
28546 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
28547 }
28548 case TargetOpcode::G_FCONSTANT:
28549 case TargetOpcode::G_CONSTANT: {
28550 const ConstantInt *CI;
28551 unsigned AdditionalCost = 0;
28552
28553 if (Opc == TargetOpcode::G_CONSTANT)
28554 CI = MI.getOperand(i: 1).getCImm();
28555 else {
28556 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
28557 // We try to estimate cost of 32/64b fpimms, as they'll likely be
28558 // materialized as integers.
28559 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
28560 break;
28561 auto APF = MI.getOperand(i: 1).getFPImm()->getValueAPF();
28562 bool OptForSize = MF.getFunction().hasOptSize();
28563 if (isFPImmLegal(Imm: APF, VT: EVT::getFloatingPointVT(BitWidth: Ty.getScalarSizeInBits()),
28564 OptForSize))
28565 return true; // Constant should be cheap.
28566 CI =
28567 ConstantInt::get(Context&: MF.getFunction().getContext(), V: APF.bitcastToAPInt());
28568 // FP materialization also costs an extra move, from gpr to fpr.
28569 AdditionalCost = 1;
28570 }
28571 APInt Imm = CI->getValue();
28572 InstructionCost Cost = TTI->getIntImmCost(
28573 Imm, Ty: CI->getType(), CostKind: TargetTransformInfo::TCK_CodeSize);
28574 assert(Cost.isValid() && "Expected a valid imm cost");
28575
28576 unsigned RematCost = Cost.getValue();
28577 RematCost += AdditionalCost;
28578 Register Reg = MI.getOperand(i: 0).getReg();
28579 unsigned MaxUses = maxUses(RematCost);
28580 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
28581 if (MaxUses == std::numeric_limits<unsigned>::max())
28582 --MaxUses;
28583 return MRI.hasAtMostUserInstrs(Reg, MaxUsers: MaxUses);
28584 }
28585 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
28586 // localizable.
28587 case AArch64::ADRP:
28588 case AArch64::G_ADD_LOW:
28589 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
28590 case TargetOpcode::G_PTR_ADD:
28591 return true;
28592 default:
28593 break;
28594 }
28595 return TargetLoweringBase::shouldLocalize(MI, TTI);
28596}
28597
28598bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
28599 // Fallback for scalable vectors.
28600 // Note that if EnableSVEGISel is true, we allow scalable vector types for
28601 // all instructions, regardless of whether they are actually supported.
28602 if (!EnableSVEGISel) {
28603 if (Inst.getType()->isScalableTy()) {
28604 return true;
28605 }
28606
28607 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
28608 if (Inst.getOperand(i)->getType()->isScalableTy())
28609 return true;
28610
28611 if (const AllocaInst *AI = dyn_cast<AllocaInst>(Val: &Inst)) {
28612 if (AI->getAllocatedType()->isScalableTy())
28613 return true;
28614 }
28615 }
28616
  // Fall back for calls that require SME-specific handling (streaming-mode
  // changes, lazy saves, or ZT0/ZA state preservation).
28618 if (auto *Base = dyn_cast<CallBase>(Val: &Inst)) {
28619 auto CallAttrs = SMECallAttrs(*Base);
28620 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
28621 CallAttrs.requiresPreservingZT0() ||
28622 CallAttrs.requiresPreservingAllZAState())
28623 return true;
28624 }
28625 return false;
28626}
28627
28628// Return the largest legal scalable vector type that matches VT's element type.
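// For example, any fixed-length vector of i32 maps to nxv4i32 and any vector
// of f16 maps to nxv8f16, i.e. one full SVE register's worth of the element
// type.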
28629static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
28630 assert(VT.isFixedLengthVector() &&
28631 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
28632 "Expected legal fixed length vector!");
28633 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
28634 default:
28635 llvm_unreachable("unexpected element type for SVE container");
28636 case MVT::i8:
28637 return EVT(MVT::nxv16i8);
28638 case MVT::i16:
28639 return EVT(MVT::nxv8i16);
28640 case MVT::i32:
28641 return EVT(MVT::nxv4i32);
28642 case MVT::i64:
28643 return EVT(MVT::nxv2i64);
28644 case MVT::bf16:
28645 return EVT(MVT::nxv8bf16);
28646 case MVT::f16:
28647 return EVT(MVT::nxv8f16);
28648 case MVT::f32:
28649 return EVT(MVT::nxv4f32);
28650 case MVT::f64:
28651 return EVT(MVT::nxv2f64);
28652 }
28653}
28654
28655// Return a predicate with active lanes corresponding to the extent of VT.
28656static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
28657 EVT VT) {
28658 assert(VT.isFixedLengthVector() &&
28659 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
28660 "Expected legal fixed length vector!");
28661
28662 std::optional<unsigned> PgPattern =
28663 getSVEPredPatternFromNumElements(MinNumElts: VT.getVectorNumElements());
28664 assert(PgPattern && "Unexpected element count for SVE predicate");
28665
28666 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
28667 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
28668 // variants of instructions when available.
28669 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
28670 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
28671 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
28672 if (MaxSVESize && MinSVESize == MaxSVESize &&
28673 MaxSVESize == VT.getSizeInBits())
28674 PgPattern = AArch64SVEPredPattern::all;
28675
28676 MVT MaskVT;
28677 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
28678 default:
28679 llvm_unreachable("unexpected element type for SVE predicate");
28680 case MVT::i8:
28681 MaskVT = MVT::nxv16i1;
28682 break;
28683 case MVT::i16:
28684 case MVT::f16:
28685 case MVT::bf16:
28686 MaskVT = MVT::nxv8i1;
28687 break;
28688 case MVT::i32:
28689 case MVT::f32:
28690 MaskVT = MVT::nxv4i1;
28691 break;
28692 case MVT::i64:
28693 case MVT::f64:
28694 MaskVT = MVT::nxv2i1;
28695 break;
28696 }
28697
28698 return getPTrue(DAG, DL, VT: MaskVT, Pattern: *PgPattern);
28699}
28700
28701static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
28702 EVT VT) {
28703 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
28704 "Expected legal scalable vector!");
28705 auto PredTy = VT.changeVectorElementType(EltVT: MVT::i1);
28706 return getPTrue(DAG, DL, VT: PredTy, Pattern: AArch64SVEPredPattern::all);
28707}
28708
28709static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
28710 if (VT.isFixedLengthVector())
28711 return getPredicateForFixedLengthVector(DAG, DL, VT);
28712
28713 return getPredicateForScalableVector(DAG, DL, VT);
28714}
28715
28716// Grow V to consume an entire SVE register.
28717static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
28718 assert(VT.isScalableVector() &&
28719 "Expected to convert into a scalable vector!");
28720 assert(V.getValueType().isFixedLengthVector() &&
28721 "Expected a fixed length vector operand!");
28722 SDLoc DL(V);
28723 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
28724 return DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT, N1: DAG.getUNDEF(VT), N2: V, N3: Zero);
28725}
28726
28727// Shrink V so it's just big enough to maintain a VT's worth of data.
28728static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
28729 assert(VT.isFixedLengthVector() &&
28730 "Expected to convert into a fixed length vector!");
28731 assert(V.getValueType().isScalableVector() &&
28732 "Expected a scalable vector operand!");
28733 SDLoc DL(V);
28734 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
28735 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: V, N2: Zero);
28736}
28737
28738// Convert all fixed length vector loads larger than NEON to masked_loads.
28739SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
28740 SDValue Op, SelectionDAG &DAG) const {
28741 auto Load = cast<LoadSDNode>(Val&: Op);
28742
28743 SDLoc DL(Op);
28744 EVT VT = Op.getValueType();
28745 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28746 EVT LoadVT = ContainerVT;
28747 EVT MemVT = Load->getMemoryVT();
28748
28749 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
28750
28751 if (VT.isFloatingPoint()) {
28752 LoadVT = ContainerVT.changeTypeToInteger();
28753 MemVT = MemVT.changeTypeToInteger();
28754 }
28755
28756 SDValue NewLoad = DAG.getMaskedLoad(
28757 VT: LoadVT, dl: DL, Chain: Load->getChain(), Base: Load->getBasePtr(), Offset: Load->getOffset(), Mask: Pg,
28758 Src0: DAG.getUNDEF(VT: LoadVT), MemVT, MMO: Load->getMemOperand(),
28759 AM: Load->getAddressingMode(), Load->getExtensionType());
28760
28761 SDValue Result = NewLoad;
28762 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
28763 EVT ExtendVT = ContainerVT.changeVectorElementType(
28764 EltVT: Load->getMemoryVT().getVectorElementType());
28765
28766 Result = getSVESafeBitCast(VT: ExtendVT, Op: Result, DAG);
28767 Result = DAG.getNode(Opcode: AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, VT: ContainerVT,
28768 N1: Pg, N2: Result, N3: DAG.getUNDEF(VT: ContainerVT));
28769 } else if (VT.isFloatingPoint()) {
28770 Result = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerVT, Operand: Result);
28771 }
28772
28773 Result = convertFromScalableVector(DAG, VT, V: Result);
28774 SDValue MergedValues[2] = {Result, NewLoad.getValue(R: 1)};
28775 return DAG.getMergeValues(Ops: MergedValues, dl: DL);
28776}
28777
28778static SDValue convertFixedMaskToScalableVector(SDValue Mask,
28779 SelectionDAG &DAG) {
28780 SDLoc DL(Mask);
28781 EVT InVT = Mask.getValueType();
28782 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
28783 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT: InVT);
28784
28785 if (ISD::isBuildVectorAllOnes(N: Mask.getNode()))
28786 return Pg;
28787
28788 bool InvertCond = false;
28789 if (isBitwiseNot(V: Mask)) {
28790 InvertCond = true;
28791 Mask = Mask.getOperand(i: 0);
28792 }
28793
28794 SDValue Op1, Op2;
28795 ISD::CondCode CC;
28796
28797 // When Mask is the result of a SETCC, it's better to regenerate the compare.
28798 if (Mask.getOpcode() == ISD::SETCC) {
28799 Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Mask.getOperand(i: 0));
28800 Op2 = convertToScalableVector(DAG, VT: ContainerVT, V: Mask.getOperand(i: 1));
28801 CC = cast<CondCodeSDNode>(Val: Mask.getOperand(i: 2))->get();
28802 } else {
28803 Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Mask);
28804 Op2 = DAG.getConstant(Val: 0, DL, VT: ContainerVT);
28805 CC = ISD::SETNE;
28806 }
28807
28808 if (InvertCond)
28809 CC = getSetCCInverse(Operation: CC, Type: Op1.getValueType());
28810
28811 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL, VT: Pg.getValueType(),
28812 Ops: {Pg, Op1, Op2, DAG.getCondCode(Cond: CC)});
28813}
28814
28815// Convert all fixed length vector loads larger than NEON to masked_loads.
28816SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
28817 SDValue Op, SelectionDAG &DAG) const {
28818 auto Load = cast<MaskedLoadSDNode>(Val&: Op);
28819
28820 SDLoc DL(Op);
28821 EVT VT = Op.getValueType();
28822 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28823
28824 SDValue Mask = Load->getMask();
  // If this is an extending load and the mask type is not the same as the
  // load's type, then we have to extend the mask type.
28827 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
28828 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
28829 "Incorrect mask type");
28830 Mask = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: Mask);
28831 }
28832 Mask = convertFixedMaskToScalableVector(Mask, DAG);
28833
28834 SDValue PassThru;
28835 bool IsPassThruZeroOrUndef = false;
28836
28837 if (Load->getPassThru()->isUndef()) {
28838 PassThru = DAG.getUNDEF(VT: ContainerVT);
28839 IsPassThruZeroOrUndef = true;
28840 } else {
28841 if (ContainerVT.isInteger())
28842 PassThru = DAG.getConstant(Val: 0, DL, VT: ContainerVT);
28843 else
28844 PassThru = DAG.getConstantFP(Val: 0, DL, VT: ContainerVT);
28845 if (isZerosVector(N: Load->getPassThru().getNode()))
28846 IsPassThruZeroOrUndef = true;
28847 }
28848
28849 SDValue NewLoad = DAG.getMaskedLoad(
28850 VT: ContainerVT, dl: DL, Chain: Load->getChain(), Base: Load->getBasePtr(), Offset: Load->getOffset(),
28851 Mask, Src0: PassThru, MemVT: Load->getMemoryVT(), MMO: Load->getMemOperand(),
28852 AM: Load->getAddressingMode(), Load->getExtensionType());
28853
28854 SDValue Result = NewLoad;
28855 if (!IsPassThruZeroOrUndef) {
28856 SDValue OldPassThru =
28857 convertToScalableVector(DAG, VT: ContainerVT, V: Load->getPassThru());
28858 Result = DAG.getSelect(DL, VT: ContainerVT, Cond: Mask, LHS: Result, RHS: OldPassThru);
28859 }
28860
28861 Result = convertFromScalableVector(DAG, VT, V: Result);
28862 SDValue MergedValues[2] = {Result, NewLoad.getValue(R: 1)};
28863 return DAG.getMergeValues(Ops: MergedValues, dl: DL);
28864}
28865
28866// Convert all fixed length vector stores larger than NEON to masked_stores.
28867SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
28868 SDValue Op, SelectionDAG &DAG) const {
28869 auto Store = cast<StoreSDNode>(Val&: Op);
28870
28871 SDLoc DL(Op);
28872 EVT VT = Store->getValue().getValueType();
28873 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28874 EVT MemVT = Store->getMemoryVT();
28875
28876 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
28877 auto NewValue = convertToScalableVector(DAG, VT: ContainerVT, V: Store->getValue());
28878
28879 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
28880 EVT TruncVT = ContainerVT.changeVectorElementType(
28881 EltVT: Store->getMemoryVT().getVectorElementType());
28882 MemVT = MemVT.changeTypeToInteger();
28883 NewValue = DAG.getNode(Opcode: AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, VT: TruncVT, N1: Pg,
28884 N2: NewValue, N3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i64),
28885 N4: DAG.getUNDEF(VT: TruncVT));
28886 NewValue =
28887 getSVESafeBitCast(VT: ContainerVT.changeTypeToInteger(), Op: NewValue, DAG);
28888 } else if (VT.isFloatingPoint()) {
28889 MemVT = MemVT.changeTypeToInteger();
28890 NewValue =
28891 getSVESafeBitCast(VT: ContainerVT.changeTypeToInteger(), Op: NewValue, DAG);
28892 }
28893
28894 return DAG.getMaskedStore(Chain: Store->getChain(), dl: DL, Val: NewValue,
28895 Base: Store->getBasePtr(), Offset: Store->getOffset(), Mask: Pg, MemVT,
28896 MMO: Store->getMemOperand(), AM: Store->getAddressingMode(),
28897 IsTruncating: Store->isTruncatingStore());
28898}
28899
28900SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
28901 SDValue Op, SelectionDAG &DAG) const {
28902 auto *Store = cast<MaskedStoreSDNode>(Val&: Op);
28903
28904 SDLoc DL(Op);
28905 EVT VT = Store->getValue().getValueType();
28906 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28907
28908 auto NewValue = convertToScalableVector(DAG, VT: ContainerVT, V: Store->getValue());
28909 SDValue Mask = convertFixedMaskToScalableVector(Mask: Store->getMask(), DAG);
28910
28911 return DAG.getMaskedStore(
28912 Chain: Store->getChain(), dl: DL, Val: NewValue, Base: Store->getBasePtr(), Offset: Store->getOffset(),
28913 Mask, MemVT: Store->getMemoryVT(), MMO: Store->getMemOperand(),
28914 AM: Store->getAddressingMode(), IsTruncating: Store->isTruncatingStore());
28915}
28916
28917SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
28918 SDValue Op, SelectionDAG &DAG) const {
28919 SDLoc DL(Op);
28920 EVT VT = Op.getValueType();
28921 EVT EltVT = VT.getVectorElementType();
28922
28923 bool Signed = Op.getOpcode() == ISD::SDIV;
28924 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
28925
28926 bool Negated;
28927 uint64_t SplatVal;
28928 if (Signed && isPow2Splat(Op: Op.getOperand(i: 1), SplatVal, Negated)) {
28929 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28930 SDValue Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op.getOperand(i: 0));
28931 SDValue Op2 = DAG.getTargetConstant(Val: Log2_64(Value: SplatVal), DL, VT: MVT::i32);
28932
28933 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
28934 SDValue Res =
28935 DAG.getNode(Opcode: AArch64ISD::SRAD_MERGE_OP1, DL, VT: ContainerVT, N1: Pg, N2: Op1, N3: Op2);
28936 if (Negated)
28937 Res = DAG.getNode(Opcode: ISD::SUB, DL, VT: ContainerVT,
28938 N1: DAG.getConstant(Val: 0, DL, VT: ContainerVT), N2: Res);
28939
28940 return convertFromScalableVector(DAG, VT, V: Res);
28941 }
28942
28943 // Scalable vector i32/i64 DIV is supported.
28944 if (EltVT == MVT::i32 || EltVT == MVT::i64)
28945 return LowerToPredicatedOp(Op, DAG, NewOp: PredOpcode);
28946
28947 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
28948 EVT HalfVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
28949 EVT PromVT = HalfVT.widenIntegerVectorElementType(Context&: *DAG.getContext());
28950 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28951
28952 // If the wider type is legal: extend, op, and truncate.
28953 EVT WideVT = VT.widenIntegerVectorElementType(Context&: *DAG.getContext());
28954 if (DAG.getTargetLoweringInfo().isTypeLegal(VT: WideVT)) {
28955 SDValue Op0 = DAG.getNode(Opcode: ExtendOpcode, DL, VT: WideVT, Operand: Op.getOperand(i: 0));
28956 SDValue Op1 = DAG.getNode(Opcode: ExtendOpcode, DL, VT: WideVT, Operand: Op.getOperand(i: 1));
28957 SDValue Div = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: WideVT, N1: Op0, N2: Op1);
28958 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Div);
28959 }
28960
28961 auto HalveAndExtendVector = [&DAG, &DL, &HalfVT, &PromVT,
28962 &ExtendOpcode](SDValue Op) {
28963 SDValue IdxZero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
28964 SDValue IdxHalf =
28965 DAG.getConstant(Val: HalfVT.getVectorNumElements(), DL, VT: MVT::i64);
28966 SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: Op, N2: IdxZero);
28967 SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: Op, N2: IdxHalf);
28968 return std::pair<SDValue, SDValue>(
28969 {DAG.getNode(Opcode: ExtendOpcode, DL, VT: PromVT, Operand: Lo),
28970 DAG.getNode(Opcode: ExtendOpcode, DL, VT: PromVT, Operand: Hi)});
28971 };
28972
  // If the wider type is not legal: split, extend, op, truncate and concatenate.
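  // For example, assuming a 256-bit SVE vector length where v32i16 is not
  // legal: a v32i8 sdiv is split into two v16i8 halves, each half is
  // sign-extended to v16i16, the two halves are divided (and lowered again
  // recursively), truncated back to v16i8, and concatenated into the v32i8
  // result.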
28974 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(i: 0));
28975 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(i: 1));
28976 SDValue Lo = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: PromVT, N1: Op0LoExt, N2: Op1LoExt);
28977 SDValue Hi = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: PromVT, N1: Op0HiExt, N2: Op1HiExt);
28978 SDValue LoTrunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: HalfVT, Operand: Lo);
28979 SDValue HiTrunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: HalfVT, Operand: Hi);
28980 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, Ops: {LoTrunc, HiTrunc});
28981}
28982
28983SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
28984 SDValue Op, SelectionDAG &DAG) const {
28985 EVT VT = Op.getValueType();
28986 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
28987
28988 SDLoc DL(Op);
28989 SDValue Val = Op.getOperand(i: 0);
28990 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: Val.getValueType());
28991 Val = convertToScalableVector(DAG, VT: ContainerVT, V: Val);
28992
28993 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
28994 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
28995
28996 // Repeatedly unpack Val until the result is of the desired element type.
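  // e.g. extending i8 elements all the way to i64 unpacks
  // nxv16i8 -> nxv8i16 -> nxv4i32 -> nxv2i64, using SUNPKLO for sign extension
  // and UUNPKLO for zero extension at each step.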
28997 switch (ContainerVT.getSimpleVT().SimpleTy) {
28998 default:
28999 llvm_unreachable("unimplemented container type");
29000 case MVT::nxv16i8:
29001 Val = DAG.getNode(Opcode: ExtendOpc, DL, VT: MVT::nxv8i16, Operand: Val);
29002 if (VT.getVectorElementType() == MVT::i16)
29003 break;
29004 [[fallthrough]];
29005 case MVT::nxv8i16:
29006 Val = DAG.getNode(Opcode: ExtendOpc, DL, VT: MVT::nxv4i32, Operand: Val);
29007 if (VT.getVectorElementType() == MVT::i32)
29008 break;
29009 [[fallthrough]];
29010 case MVT::nxv4i32:
29011 Val = DAG.getNode(Opcode: ExtendOpc, DL, VT: MVT::nxv2i64, Operand: Val);
29012 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
29013 break;
29014 }
29015
29016 return convertFromScalableVector(DAG, VT, V: Val);
29017}
29018
29019SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
29020 SDValue Op, SelectionDAG &DAG) const {
29021 EVT VT = Op.getValueType();
29022 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29023
29024 SDLoc DL(Op);
29025 SDValue Val = Op.getOperand(i: 0);
29026 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: Val.getValueType());
29027 Val = convertToScalableVector(DAG, VT: ContainerVT, V: Val);
29028
29029 // Repeatedly truncate Val until the result is of the desired element type.
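  // e.g. truncating i64 elements down to i8 steps through
  // nxv2i64 -> nxv4i32 -> nxv8i16 -> nxv16i8, with each step bitcasting to the
  // narrower packed type and using UZP1 to deinterleave the sub-elements that
  // form the truncated result.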
29030 switch (ContainerVT.getSimpleVT().SimpleTy) {
29031 default:
29032 llvm_unreachable("unimplemented container type");
29033 case MVT::nxv2i64:
29034 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv4i32, Operand: Val);
29035 Val = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: MVT::nxv4i32, N1: Val, N2: Val);
29036 if (VT.getVectorElementType() == MVT::i32)
29037 break;
29038 [[fallthrough]];
29039 case MVT::nxv4i32:
29040 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv8i16, Operand: Val);
29041 Val = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: MVT::nxv8i16, N1: Val, N2: Val);
29042 if (VT.getVectorElementType() == MVT::i16)
29043 break;
29044 [[fallthrough]];
29045 case MVT::nxv8i16:
29046 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv16i8, Operand: Val);
29047 Val = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: MVT::nxv16i8, N1: Val, N2: Val);
29048 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
29049 break;
29050 }
29051
29052 return convertFromScalableVector(DAG, VT, V: Val);
29053}
29054
29055SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
29056 SDValue Op, SelectionDAG &DAG) const {
29057 EVT VT = Op.getValueType();
29058 EVT InVT = Op.getOperand(i: 0).getValueType();
29059 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
29060
29061 SDLoc DL(Op);
29062 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
29063 SDValue Op0 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 0));
29064
29065 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: Op0, N2: Op.getOperand(i: 1));
29066}
29067
29068SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
29069 SDValue Op, SelectionDAG &DAG) const {
29070 EVT VT = Op.getValueType();
29071 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29072
29073 SDLoc DL(Op);
29074 EVT InVT = Op.getOperand(i: 0).getValueType();
29075 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
29076 SDValue Op0 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 0));
29077
29078 auto ScalableRes = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ContainerVT, N1: Op0,
29079 N2: Op.getOperand(i: 1), N3: Op.getOperand(i: 2));
29080
29081 return convertFromScalableVector(DAG, VT, V: ScalableRes);
29082}
29083
29084// Convert vector operation 'Op' to an equivalent predicated operation whereby
29085// the original operation's type is used to construct a suitable predicate.
29086// NOTE: The results for inactive lanes are undefined.
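// For example, a fixed-length (fadd x, y) is emitted as (NewOp pg, x', y'),
// where x' and y' are the operands widened to the scalable container type, pg
// is a predicate covering only the original type's lanes, and an undef
// passthru operand is appended for merge-passthru opcodes.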
29087SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
29088 SelectionDAG &DAG,
29089 unsigned NewOp) const {
29090 EVT VT = Op.getValueType();
29091 SDLoc DL(Op);
29092 auto Pg = getPredicateForVector(DAG, DL, VT);
29093
29094 if (VT.isFixedLengthVector()) {
29095 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
29096 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29097
29098 // Create list of operands by converting existing ones to scalable types.
29099 SmallVector<SDValue, 4> Operands = {Pg};
29100 for (const SDValue &V : Op->op_values()) {
29101 if (isa<CondCodeSDNode>(Val: V)) {
29102 Operands.push_back(Elt: V);
29103 continue;
29104 }
29105
29106 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(Val: V)) {
29107 EVT VTArg = VTNode->getVT().getVectorElementType();
29108 EVT NewVTArg = ContainerVT.changeVectorElementType(EltVT: VTArg);
29109 Operands.push_back(Elt: DAG.getValueType(NewVTArg));
29110 continue;
29111 }
29112
29113 assert(isTypeLegal(V.getValueType()) &&
29114 "Expected only legal fixed-width types");
29115 Operands.push_back(Elt: convertToScalableVector(DAG, VT: ContainerVT, V));
29116 }
29117
29118 if (isMergePassthruOpcode(Opc: NewOp))
29119 Operands.push_back(Elt: DAG.getUNDEF(VT: ContainerVT));
29120
29121 auto ScalableRes = DAG.getNode(Opcode: NewOp, DL, VT: ContainerVT, Ops: Operands);
29122 return convertFromScalableVector(DAG, VT, V: ScalableRes);
29123 }
29124
29125 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
29126
29127 SmallVector<SDValue, 4> Operands = {Pg};
29128 for (const SDValue &V : Op->op_values()) {
29129 assert((!V.getValueType().isVector() ||
29130 V.getValueType().isScalableVector()) &&
29131 "Only scalable vectors are supported!");
29132 Operands.push_back(Elt: V);
29133 }
29134
29135 if (isMergePassthruOpcode(Opc: NewOp))
29136 Operands.push_back(Elt: DAG.getUNDEF(VT));
29137
29138 return DAG.getNode(Opcode: NewOp, DL, VT, Ops: Operands, Flags: Op->getFlags());
29139}
29140
29141// If a fixed length vector operation has no side effects when applied to
29142// undefined elements, we can safely use scalable vectors to perform the same
29143// operation without needing to worry about predication.
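// For example, a fixed-length (xor x, y) can simply be widened to a scalable
// (xor x', y') on the container type; whatever the operation computes in the
// lanes beyond the original fixed-length vector is discarded when the result
// is converted back.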
29144SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
29145 SelectionDAG &DAG) const {
29146 EVT VT = Op.getValueType();
29147 assert(VT.isFixedLengthVector() && isTypeLegal(VT) &&
29148 "Only expected to lower fixed length vector operation!");
29149 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29150
29151 // Create list of operands by converting existing ones to scalable types.
29152 SmallVector<SDValue, 4> Ops;
29153 for (const SDValue &V : Op->op_values()) {
29154 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
29155
29156 // Pass through non-vector operands.
29157 if (!V.getValueType().isVector()) {
29158 Ops.push_back(Elt: V);
29159 continue;
29160 }
29161
29162 // "cast" fixed length vector to a scalable vector.
29163 assert(V.getValueType().isFixedLengthVector() &&
29164 isTypeLegal(V.getValueType()) &&
29165 "Only fixed length vectors are supported!");
29166 Ops.push_back(Elt: convertToScalableVector(DAG, VT: ContainerVT, V));
29167 }
29168
29169 auto ScalableRes = DAG.getNode(Opcode: Op.getOpcode(), DL: SDLoc(Op), VT: ContainerVT, Ops);
29170 return convertFromScalableVector(DAG, VT, V: ScalableRes);
29171}
29172
29173SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
29174 SelectionDAG &DAG) const {
29175 SDLoc DL(ScalarOp);
29176 SDValue AccOp = ScalarOp.getOperand(i: 0);
29177 SDValue VecOp = ScalarOp.getOperand(i: 1);
29178 EVT SrcVT = VecOp.getValueType();
29179 EVT ResVT = SrcVT.getVectorElementType();
29180
29181 EVT ContainerVT = SrcVT;
29182 if (SrcVT.isFixedLengthVector()) {
29183 ContainerVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
29184 VecOp = convertToScalableVector(DAG, VT: ContainerVT, V: VecOp);
29185 }
29186
29187 SDValue Pg = getPredicateForVector(DAG, DL, VT: SrcVT);
29188 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
29189
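  // FADDA performs an ordered floating-point accumulation across the active
  // lanes, so the scalar accumulator is first placed in lane 0 of an otherwise
  // undef vector and the scalar result is read back from lane 0 of the
  // reduction.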
29190 // Convert operands to Scalable.
29191 AccOp = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ContainerVT,
29192 N1: DAG.getUNDEF(VT: ContainerVT), N2: AccOp, N3: Zero);
29193
29194 // Perform reduction.
29195 SDValue Rdx = DAG.getNode(Opcode: AArch64ISD::FADDA_PRED, DL, VT: ContainerVT,
29196 N1: Pg, N2: AccOp, N3: VecOp);
29197
29198 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ResVT, N1: Rdx, N2: Zero);
29199}
29200
29201SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
29202 SelectionDAG &DAG) const {
29203 SDLoc DL(ReduceOp);
29204 SDValue Op = ReduceOp.getOperand(i: 0);
29205 EVT OpVT = Op.getValueType();
29206 EVT VT = ReduceOp.getValueType();
29207
29208 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
29209 return SDValue();
29210
29211 SDValue Pg = getPredicateForVector(DAG, DL, VT: OpVT);
29212
29213 switch (ReduceOp.getOpcode()) {
29214 default:
29215 return SDValue();
29216 case ISD::VECREDUCE_OR:
29217 if (isAllActivePredicate(DAG, N: Pg) && OpVT == MVT::nxv16i1)
29218 // The predicate can be 'Op' because
29219 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
29220 return getPTest(DAG, VT, Pg: Op, Op, Cond: AArch64CC::ANY_ACTIVE);
29221 else
29222 return getPTest(DAG, VT, Pg, Op, Cond: AArch64CC::ANY_ACTIVE);
29223 case ISD::VECREDUCE_AND: {
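    // vecreduce_and(Op) is true iff every lane of Op governed by Pg is set.
    // XORing Op with the all-active predicate Pg inverts those lanes, so a
    // NONE_ACTIVE test on the inverted value checks exactly that condition.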
29224 Op = DAG.getNode(Opcode: ISD::XOR, DL, VT: OpVT, N1: Op, N2: Pg);
29225 return getPTest(DAG, VT, Pg, Op, Cond: AArch64CC::NONE_ACTIVE);
29226 }
29227 case ISD::VECREDUCE_XOR: {
29228 SDValue ID =
29229 DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_cntp, DL, VT: MVT::i64);
29230 if (OpVT == MVT::nxv1i1) {
29231 // Emulate a CNTP on .Q using .D and a different governing predicate.
29232 Pg = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: MVT::nxv2i1, Operand: Pg);
29233 Op = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: MVT::nxv2i1, Operand: Op);
29234 }
29235 SDValue Cntp =
29236 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::i64, N1: ID, N2: Pg, N3: Op);
29237 return DAG.getAnyExtOrTrunc(Op: Cntp, DL, VT);
29238 }
29239 }
29240
29241 return SDValue();
29242}
29243
29244SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
29245 SDValue ScalarOp,
29246 SelectionDAG &DAG) const {
29247 SDLoc DL(ScalarOp);
29248 SDValue VecOp = ScalarOp.getOperand(i: 0);
29249 EVT SrcVT = VecOp.getValueType();
29250
29251 if (useSVEForFixedLengthVectorVT(
29252 VT: SrcVT,
29253 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
29254 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
29255 VecOp = convertToScalableVector(DAG, VT: ContainerVT, V: VecOp);
29256 }
29257
29258 // Lower VECREDUCE_ADD of nxv2i1-nxv16i1 to CNTP rather than UADDV.
29259 if (ScalarOp.getOpcode() == ISD::VECREDUCE_ADD &&
29260 VecOp.getOpcode() == ISD::ZERO_EXTEND) {
29261 SDValue BoolVec = VecOp.getOperand(i: 0);
29262 if (BoolVec.getValueType().getVectorElementType() == MVT::i1) {
29263 // CNTP(BoolVec & BoolVec) <=> CNTP(BoolVec & PTRUE)
29264 SDValue CntpOp = DAG.getNode(
29265 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::i64,
29266 N1: DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_cntp, DL, VT: MVT::i64),
29267 N2: BoolVec, N3: BoolVec);
29268 return DAG.getAnyExtOrTrunc(Op: CntpOp, DL, VT: ScalarOp.getValueType());
29269 }
29270 }
29271
29272 // UADDV always returns an i64 result.
29273 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
29274 SrcVT.getVectorElementType();
29275 EVT RdxVT = SrcVT;
29276 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
29277 RdxVT = getPackedSVEVectorVT(VT: ResVT);
29278
29279 SDValue Pg = getPredicateForVector(DAG, DL, VT: SrcVT);
29280 SDValue Rdx = DAG.getNode(Opcode, DL, VT: RdxVT, N1: Pg, N2: VecOp);
29281 SDValue Res = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ResVT,
29282 N1: Rdx, N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
29283
  // The VEC_REDUCE nodes expect an element-sized result.
29285 if (ResVT != ScalarOp.getValueType())
29286 Res = DAG.getAnyExtOrTrunc(Op: Res, DL, VT: ScalarOp.getValueType());
29287
29288 return Res;
29289}
29290
29291SDValue
29292AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
29293 SelectionDAG &DAG) const {
29294 EVT VT = Op.getValueType();
29295 SDLoc DL(Op);
29296
29297 EVT InVT = Op.getOperand(i: 1).getValueType();
29298 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
29299 SDValue Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 1));
29300 SDValue Op2 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 2));
29301
  // Convert the mask to a predicate (NOTE: We don't need to worry about
  // inactive lanes since VSELECT is safe when given undefined elements).
29304 EVT MaskVT = Op.getOperand(i: 0).getValueType();
29305 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, VT: MaskVT);
29306 auto Mask = convertToScalableVector(DAG, VT: MaskContainerVT, V: Op.getOperand(i: 0));
29307 Mask = DAG.getNode(Opcode: ISD::TRUNCATE, DL,
29308 VT: MaskContainerVT.changeVectorElementType(EltVT: MVT::i1), Operand: Mask);
29309
29310 auto ScalableRes = DAG.getNode(Opcode: ISD::VSELECT, DL, VT: ContainerVT,
29311 N1: Mask, N2: Op1, N3: Op2);
29312
29313 return convertFromScalableVector(DAG, VT, V: ScalableRes);
29314}
29315
29316SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
29317 SDValue Op, SelectionDAG &DAG) const {
29318 SDLoc DL(Op);
29319 EVT InVT = Op.getOperand(i: 0).getValueType();
29320 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
29321
29322 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
29323 "Only expected to lower fixed length vector operation!");
29324 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
29325 "Expected integer result of the same bit length as the inputs!");
29326
29327 auto Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op.getOperand(i: 0));
29328 auto Op2 = convertToScalableVector(DAG, VT: ContainerVT, V: Op.getOperand(i: 1));
29329 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT: InVT);
29330
29331 EVT CmpVT = Pg.getValueType();
29332 auto Cmp = DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL, VT: CmpVT,
29333 Ops: {Pg, Op1, Op2, Op.getOperand(i: 2)});
29334
29335 EVT PromoteVT = ContainerVT.changeTypeToInteger();
29336 auto Promote = DAG.getBoolExtOrTrunc(Op: Cmp, SL: DL, VT: PromoteVT, OpVT: InVT);
29337 return convertFromScalableVector(DAG, VT: Op.getValueType(), V: Promote);
29338}
29339
29340SDValue
29341AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
29342 SelectionDAG &DAG) const {
29343 SDLoc DL(Op);
29344 auto SrcOp = Op.getOperand(i: 0);
29345 EVT VT = Op.getValueType();
29346 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
29347 EVT ContainerSrcVT =
29348 getContainerForFixedLengthVector(DAG, VT: SrcOp.getValueType());
29349
29350 SrcOp = convertToScalableVector(DAG, VT: ContainerSrcVT, V: SrcOp);
29351 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerDstVT, Operand: SrcOp);
29352 return convertFromScalableVector(DAG, VT, V: Op);
29353}
29354
29355SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
29356 SDValue Op, SelectionDAG &DAG) const {
29357 SDLoc DL(Op);
29358 unsigned NumOperands = Op->getNumOperands();
29359
29360 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
29361 "Unexpected number of operands in CONCAT_VECTORS");
29362
29363 auto SrcOp1 = Op.getOperand(i: 0);
29364 auto SrcOp2 = Op.getOperand(i: 1);
29365 EVT VT = Op.getValueType();
29366 EVT SrcVT = SrcOp1.getValueType();
29367
29368 // Match a splat of 128b segments that fit in a single register.
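  // e.g. (concat_vectors x, x, x, x), where x is a v2i64, becomes a
  // DUPLANE128 of segment 0, splatting the 128-bit segment across the whole
  // SVE register.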
29369 if (SrcVT.is128BitVector() && all_equal(Range: Op.getNode()->op_values())) {
29370 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29371 SDValue Splat =
29372 DAG.getNode(Opcode: AArch64ISD::DUPLANE128, DL, VT: ContainerVT,
29373 N1: convertToScalableVector(DAG, VT: ContainerVT, V: SrcOp1),
29374 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64, /*isTarget=*/true));
29375 return convertFromScalableVector(DAG, VT, V: Splat);
29376 }
29377
29378 if (NumOperands > 2) {
29379 SmallVector<SDValue, 4> Ops;
29380 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
29381 for (unsigned I = 0; I < NumOperands; I += 2)
29382 Ops.push_back(Elt: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: PairVT,
29383 N1: Op->getOperand(Num: I), N2: Op->getOperand(Num: I + 1)));
29384
29385 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, Ops);
29386 }
29387
29388 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29389
29390 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT: SrcVT);
29391 SrcOp1 = convertToScalableVector(DAG, VT: ContainerVT, V: SrcOp1);
29392 SrcOp2 = convertToScalableVector(DAG, VT: ContainerVT, V: SrcOp2);
29393
29394 Op = DAG.getNode(Opcode: AArch64ISD::SPLICE, DL, VT: ContainerVT, N1: Pg, N2: SrcOp1, N3: SrcOp2);
29395
29396 return convertFromScalableVector(DAG, VT, V: Op);
29397}
29398
29399SDValue
29400AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
29401 SelectionDAG &DAG) const {
29402 EVT VT = Op.getValueType();
29403 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29404
29405 SDLoc DL(Op);
29406 SDValue Val = Op.getOperand(i: 0);
29407 SDValue Pg = getPredicateForVector(DAG, DL, VT);
29408 EVT SrcVT = Val.getValueType();
29409 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29410 EVT ExtendVT = ContainerVT.changeVectorElementType(
29411 EltVT: SrcVT.getVectorElementType());
29412
29413 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: SrcVT.changeTypeToInteger(), Operand: Val);
29414 Val = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VT.changeTypeToInteger(), Operand: Val);
29415
29416 Val = convertToScalableVector(DAG, VT: ContainerVT.changeTypeToInteger(), V: Val);
29417 Val = getSVESafeBitCast(VT: ExtendVT, Op: Val, DAG);
29418 Val = DAG.getNode(Opcode: AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, VT: ContainerVT,
29419 N1: Pg, N2: Val, N3: DAG.getUNDEF(VT: ContainerVT));
29420
29421 return convertFromScalableVector(DAG, VT, V: Val);
29422}
29423
29424SDValue
29425AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
29426 SelectionDAG &DAG) const {
29427 EVT VT = Op.getValueType();
29428 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29429
29430 SDLoc DL(Op);
29431 SDValue Val = Op.getOperand(i: 0);
29432 EVT SrcVT = Val.getValueType();
29433 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
29434 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
29435 EltVT: VT.getVectorElementType());
29436 SDValue Pg = getPredicateForVector(DAG, DL, VT: RoundVT);
29437
29438 Val = convertToScalableVector(DAG, VT: ContainerSrcVT, V: Val);
29439 Val = DAG.getNode(Opcode: AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, VT: RoundVT, N1: Pg, N2: Val,
29440 N3: Op.getOperand(i: 1), N4: DAG.getUNDEF(VT: RoundVT));
29441 Val = getSVESafeBitCast(VT: ContainerSrcVT.changeTypeToInteger(), Op: Val, DAG);
29442 Val = convertFromScalableVector(DAG, VT: SrcVT.changeTypeToInteger(), V: Val);
29443
29444 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VT.changeTypeToInteger(), Operand: Val);
29445 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Val);
29446}
29447
29448SDValue
29449AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
29450 SelectionDAG &DAG) const {
29451 EVT VT = Op.getValueType();
29452 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29453
29454 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
29455 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
29456 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
29457
29458 SDLoc DL(Op);
29459 SDValue Val = Op.getOperand(i: 0);
29460 EVT SrcVT = Val.getValueType();
29461 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
29462 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
29463
29464 if (VT.bitsGE(VT: SrcVT)) {
29465 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29466
29467 Val = DAG.getNode(Opcode: IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
29468 VT: VT.changeTypeToInteger(), Operand: Val);
29469
    // It is safe to use a larger than specified operand because promoting the
    // value changes nothing from an arithmetic point of view.
29472 Val =
29473 convertToScalableVector(DAG, VT: ContainerDstVT.changeTypeToInteger(), V: Val);
29474 Val = DAG.getNode(Opcode, DL, VT: ContainerDstVT, N1: Pg, N2: Val,
29475 N3: DAG.getUNDEF(VT: ContainerDstVT));
29476 return convertFromScalableVector(DAG, VT, V: Val);
29477 } else {
29478 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
29479 EltVT: ContainerDstVT.getVectorElementType());
29480 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT: SrcVT);
29481
29482 Val = convertToScalableVector(DAG, VT: ContainerSrcVT, V: Val);
29483 Val = DAG.getNode(Opcode, DL, VT: CvtVT, N1: Pg, N2: Val, N3: DAG.getUNDEF(VT: CvtVT));
29484 Val = getSVESafeBitCast(VT: ContainerSrcVT, Op: Val, DAG);
29485 Val = convertFromScalableVector(DAG, VT: SrcVT, V: Val);
29486
29487 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VT.changeTypeToInteger(), Operand: Val);
29488 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Val);
29489 }
29490}
29491
29492SDValue
29493AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
29494 SelectionDAG &DAG) const {
29495 SDLoc DL(Op);
29496 EVT OpVT = Op.getValueType();
29497 assert(OpVT.isScalableVector() &&
29498 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
29499
29500 // Are multi-register uzp instructions available?
29501 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
29502 OpVT.getVectorElementType() != MVT::i1) {
29503 Intrinsic::ID IntID;
29504 switch (Op->getNumOperands()) {
29505 default:
29506 return SDValue();
29507 case 2:
29508 IntID = Intrinsic::aarch64_sve_uzp_x2;
29509 break;
29510 case 4:
29511 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
29512 OpVT.getScalarSizeInBits() == 64)
29513 return SDValue();
29514 IntID = Intrinsic::aarch64_sve_uzp_x4;
29515 break;
29516 }
29517
29518 SmallVector<SDValue, 5> Ops;
29519 Ops.push_back(Elt: DAG.getTargetConstant(Val: IntID, DL, VT: MVT::i64));
29520 Ops.append(in_start: Op->op_values().begin(), in_end: Op->op_values().end());
29521 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VTList: Op->getVTList(), Ops);
29522 }
29523
29524 if (Op->getNumOperands() != 2)
29525 return SDValue();
29526
29527 SDValue Even = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: OpVT, N1: Op.getOperand(i: 0),
29528 N2: Op.getOperand(i: 1));
29529 SDValue Odd = DAG.getNode(Opcode: AArch64ISD::UZP2, DL, VT: OpVT, N1: Op.getOperand(i: 0),
29530 N2: Op.getOperand(i: 1));
29531 return DAG.getMergeValues(Ops: {Even, Odd}, dl: DL);
29532}
29533
29534SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
29535 SelectionDAG &DAG) const {
29536 SDLoc DL(Op);
29537 EVT OpVT = Op.getValueType();
29538 assert(OpVT.isScalableVector() &&
29539 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
29540
29541 // Are multi-register zip instructions available?
29542 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
29543 OpVT.getVectorElementType() != MVT::i1) {
29544 Intrinsic::ID IntID;
29545 switch (Op->getNumOperands()) {
29546 default:
29547 return SDValue();
29548 case 2:
29549 IntID = Intrinsic::aarch64_sve_zip_x2;
29550 break;
29551 case 4:
29552 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
29553 OpVT.getScalarSizeInBits() == 64)
29554 return SDValue();
29555 IntID = Intrinsic::aarch64_sve_zip_x4;
29556 break;
29557 }
29558
29559 SmallVector<SDValue, 5> Ops;
29560 Ops.push_back(Elt: DAG.getTargetConstant(Val: IntID, DL, VT: MVT::i64));
29561 Ops.append(in_start: Op->op_values().begin(), in_end: Op->op_values().end());
29562 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VTList: Op->getVTList(), Ops);
29563 }
29564
29565 if (Op->getNumOperands() != 2)
29566 return SDValue();
29567
29568 SDValue Lo = DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: OpVT, N1: Op.getOperand(i: 0),
29569 N2: Op.getOperand(i: 1));
29570 SDValue Hi = DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT: OpVT, N1: Op.getOperand(i: 0),
29571 N2: Op.getOperand(i: 1));
29572 return DAG.getMergeValues(Ops: {Lo, Hi}, dl: DL);
29573}
29574
29575SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
29576 SelectionDAG &DAG) const {
29577 // FIXME: Maybe share some code with LowerMGather/Scatter?
29578 MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Val&: Op);
29579 SDLoc DL(HG);
29580 SDValue Chain = HG->getChain();
29581 SDValue Inc = HG->getInc();
29582 SDValue Mask = HG->getMask();
29583 SDValue Ptr = HG->getBasePtr();
29584 SDValue Index = HG->getIndex();
29585 SDValue Scale = HG->getScale();
29586 SDValue IntID = HG->getIntID();
29587
29588 // The Intrinsic ID determines the type of update operation.
29589 [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(Val: IntID.getNode());
29590 // Right now, we only support 'add' as an update.
29591 assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
29592 "Unexpected histogram update operation");
29593
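  // The update is lowered as: gather the current bucket values, use the
  // sve_histcnt intrinsic to count matching indices per lane, multiply the
  // counts by the increment, add the products to the gathered values, and
  // scatter the sums back to the buckets.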
29594 EVT IndexVT = Index.getValueType();
29595 LLVMContext &Ctx = *DAG.getContext();
29596 ElementCount EC = IndexVT.getVectorElementCount();
29597 EVT MemVT = EVT::getVectorVT(Context&: Ctx, VT: HG->getMemoryVT(), EC);
29598 EVT IncExtVT =
29599 EVT::getIntegerVT(Context&: Ctx, BitWidth: AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
29600 EVT IncSplatVT = EVT::getVectorVT(Context&: Ctx, VT: IncExtVT, EC);
29601 bool ExtTrunc = IncSplatVT != MemVT;
29602
29603 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
29604 SDValue PassThru = DAG.getSplatVector(VT: IncSplatVT, DL, Op: Zero);
29605 SDValue IncSplat = DAG.getSplatVector(
29606 VT: IncSplatVT, DL, Op: DAG.getAnyExtOrTrunc(Op: Inc, DL, VT: IncExtVT));
29607 SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
29608
29609 MachineMemOperand *MMO = HG->getMemOperand();
29610 // Create an MMO for the gather, without load|store flags.
29611 MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
29612 PtrInfo: MMO->getPointerInfo(), F: MachineMemOperand::MOLoad, Size: MMO->getSize(),
29613 BaseAlignment: MMO->getAlign(), AAInfo: MMO->getAAInfo());
29614 ISD::MemIndexType IndexType = HG->getIndexType();
29615 SDValue Gather = DAG.getMaskedGather(
29616 VTs: DAG.getVTList(VT1: IncSplatVT, VT2: MVT::Other), MemVT, dl: DL, Ops, MMO: GMMO, IndexType,
29617 ExtTy: ExtTrunc ? ISD::EXTLOAD : ISD::NON_EXTLOAD);
29618
29619 SDValue GChain = Gather.getValue(R: 1);
29620
29621 // Perform the histcnt, multiply by inc, add to bucket data.
29622 SDValue ID =
29623 DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_histcnt, DL, VT: IncExtVT);
29624 SDValue HistCnt =
29625 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: IndexVT, N1: ID, N2: Mask, N3: Index, N4: Index);
29626 SDValue Mul = DAG.getNode(Opcode: ISD::MUL, DL, VT: IncSplatVT, N1: HistCnt, N2: IncSplat);
29627 SDValue Add = DAG.getNode(Opcode: ISD::ADD, DL, VT: IncSplatVT, N1: Gather, N2: Mul);
29628
29629 // Create an MMO for the scatter, without load|store flags.
29630 MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
29631 PtrInfo: MMO->getPointerInfo(), F: MachineMemOperand::MOStore, Size: MMO->getSize(),
29632 BaseAlignment: MMO->getAlign(), AAInfo: MMO->getAAInfo());
29633
29634 SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
29635 SDValue Scatter = DAG.getMaskedScatter(VTs: DAG.getVTList(VT: MVT::Other), MemVT, dl: DL,
29636 Ops: ScatterOps, MMO: SMMO, IndexType, IsTruncating: ExtTrunc);
29637 return Scatter;
29638}
29639
29640/// If a PARTIAL_REDUCE_MLA node comes in with an accumulator-input type pairing
29641/// of (nx)v2i64/(nx)v16i8, we cannot directly lower it to a (u|s)dot. We can
29642/// however still make use of the dot product instruction by instead
29643/// accumulating over two steps: (nx)v16i8 -> (nx)v4i32 -> (nx)v2i64.
29644/// If available, make use of the (U|S)ADDW(B|T) instructions, otherwise
29645/// the following pattern is emitted:
/// add(add(Acc, ext(EXTRACT_SUBVECTOR(N, 0))), ext(EXTRACT_SUBVECTOR(N, NTy/2)))
29648SDValue
29649AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
29650 SelectionDAG &DAG) const {
29651 SDLoc DL(Op);
29652
29653 SDValue Acc = Op.getOperand(i: 0);
29654 SDValue LHS = Op.getOperand(i: 1);
29655 SDValue RHS = Op.getOperand(i: 2);
29656 EVT ResultVT = Op.getValueType();
29657 EVT OrigResultVT = ResultVT;
29658 EVT OpVT = LHS.getValueType();
29659
29660 bool ConvertToScalable =
29661 ResultVT.isFixedLengthVector() &&
29662 useSVEForFixedLengthVectorVT(VT: ResultVT, /*OverrideNEON=*/true);
29663
29664 if (ConvertToScalable) {
29665 ResultVT = getContainerForFixedLengthVector(DAG, VT: ResultVT);
29666 OpVT = getContainerForFixedLengthVector(DAG, VT: LHS.getValueType());
29667 Acc = convertToScalableVector(DAG, VT: ResultVT, V: Acc);
29668 LHS = convertToScalableVector(DAG, VT: OpVT, V: LHS);
29669 RHS = convertToScalableVector(DAG, VT: OpVT, V: RHS);
29670 Op = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: ResultVT, Ops: {Acc, LHS, RHS});
29671 }
29672
29673 // Two-way and four-way partial reductions are supported by patterns.
29674 // We only need to handle the 8-way partial reduction.
29675 if (ResultVT.getScalarType() != MVT::i64 || OpVT.getScalarType() != MVT::i8)
29676 return ConvertToScalable ? convertFromScalableVector(DAG, VT: OrigResultVT, V: Op)
29677 : Op;
29678
29679 EVT DotVT = ResultVT.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
29680 SDValue DotNode = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DotVT,
29681 N1: DAG.getConstant(Val: 0, DL, VT: DotVT), N2: LHS, N3: RHS);
29682
29683 SDValue Res;
29684 bool IsUnsigned = Op.getOpcode() == ISD::PARTIAL_REDUCE_UMLA;
29685 if (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable()) {
29686 unsigned LoOpcode = IsUnsigned ? AArch64ISD::UADDWB : AArch64ISD::SADDWB;
29687 unsigned HiOpcode = IsUnsigned ? AArch64ISD::UADDWT : AArch64ISD::SADDWT;
29688 SDValue Lo = DAG.getNode(Opcode: LoOpcode, DL, VT: ResultVT, N1: Acc, N2: DotNode);
29689 Res = DAG.getNode(Opcode: HiOpcode, DL, VT: ResultVT, N1: Lo, N2: DotNode);
29690 } else {
29691 // Fold (nx)v4i32 into (nx)v2i64
29692 auto [DotNodeLo, DotNodeHi] = DAG.SplitVector(N: DotNode, DL);
29693 if (IsUnsigned) {
29694 DotNodeLo = DAG.getZExtOrTrunc(Op: DotNodeLo, DL, VT: ResultVT);
29695 DotNodeHi = DAG.getZExtOrTrunc(Op: DotNodeHi, DL, VT: ResultVT);
29696 } else {
29697 DotNodeLo = DAG.getSExtOrTrunc(Op: DotNodeLo, DL, VT: ResultVT);
29698 DotNodeHi = DAG.getSExtOrTrunc(Op: DotNodeHi, DL, VT: ResultVT);
29699 }
29700 auto Lo = DAG.getNode(Opcode: ISD::ADD, DL, VT: ResultVT, N1: Acc, N2: DotNodeLo);
29701 Res = DAG.getNode(Opcode: ISD::ADD, DL, VT: ResultVT, N1: Lo, N2: DotNodeHi);
29702 }
29703
29704 return ConvertToScalable ? convertFromScalableVector(DAG, VT: OrigResultVT, V: Res)
29705 : Res;
29706}
29707
29708SDValue
29709AArch64TargetLowering::LowerGET_ACTIVE_LANE_MASK(SDValue Op,
29710 SelectionDAG &DAG) const {
29711 EVT VT = Op.getValueType();
29712 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29713
29714 assert(Subtarget->isSVEorStreamingSVEAvailable() &&
29715 "Lowering fixed length get_active_lane_mask requires SVE!");
29716
29717 // There are no dedicated fixed-length instructions for GET_ACTIVE_LANE_MASK,
29718 // but we can use SVE when available.
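  // The mask is computed as a scalable i1 predicate, sign-extended to the
  // integer container type, and the fixed-length prefix is then extracted as
  // the result.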
29719
29720 SDLoc DL(Op);
29721 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29722 EVT WhileVT = ContainerVT.changeElementType(EltVT: MVT::i1);
29723
29724 SDValue Mask = DAG.getNode(Opcode: ISD::GET_ACTIVE_LANE_MASK, DL, VT: WhileVT,
29725 N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1));
29726 SDValue MaskAsInt = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: ContainerVT, Operand: Mask);
29727 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: MaskAsInt,
29728 N2: DAG.getVectorIdxConstant(Val: 0, DL));
29729}
29730
29731SDValue
29732AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
29733 SelectionDAG &DAG) const {
29734 EVT VT = Op.getValueType();
29735 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29736
29737 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
29738 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
29739 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
29740
29741 SDLoc DL(Op);
29742 SDValue Val = Op.getOperand(i: 0);
29743 EVT SrcVT = Val.getValueType();
29744 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
29745 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
29746
29747 if (VT.bitsGT(VT: SrcVT)) {
29748 EVT CvtVT = ContainerDstVT.changeVectorElementType(
29749 EltVT: ContainerSrcVT.getVectorElementType());
29750 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29751
29752 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: SrcVT.changeTypeToInteger(), Operand: Val);
29753 Val = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: Val);
29754
29755 Val = convertToScalableVector(DAG, VT: ContainerDstVT, V: Val);
29756 Val = getSVESafeBitCast(VT: CvtVT, Op: Val, DAG);
29757 Val = DAG.getNode(Opcode, DL, VT: ContainerDstVT, N1: Pg, N2: Val,
29758 N3: DAG.getUNDEF(VT: ContainerDstVT));
29759 return convertFromScalableVector(DAG, VT, V: Val);
29760 } else {
29761 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
29762 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT: SrcVT);
29763
29764 // Safe to use a larger than specified result since an fp_to_int where the
29765 // result doesn't fit into the destination is undefined.
29766 Val = convertToScalableVector(DAG, VT: ContainerSrcVT, V: Val);
29767 Val = DAG.getNode(Opcode, DL, VT: CvtVT, N1: Pg, N2: Val, N3: DAG.getUNDEF(VT: CvtVT));
29768 Val = convertFromScalableVector(DAG, VT: SrcVT.changeTypeToInteger(), V: Val);
29769
29770 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Val);
29771 }
29772}
29773
29774static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
29775 ArrayRef<int> ShuffleMask, EVT VT,
29776 EVT ContainerVT, SelectionDAG &DAG) {
29777 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
29778 SDLoc DL(Op);
29779 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
29780 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
29781 bool IsSingleOp =
29782 ShuffleVectorInst::isSingleSourceMask(Mask: ShuffleMask, NumSrcElts: ShuffleMask.size());
29783
29784 if (!Subtarget.isNeonAvailable() && !MinSVESize)
29785 MinSVESize = 128;
29786
  // Two-operand shuffles can only be lowered when SVE2's TBL2 is available;
  // indices that cannot be represented in the mask element type are rejected
  // below.
29789 if (!IsSingleOp && !Subtarget.hasSVE2())
29790 return SDValue();
29791
29792 EVT VTOp1 = Op.getOperand(i: 0).getValueType();
29793 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
29794 unsigned IndexLen = MinSVESize / BitsPerElt;
29795 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
29796 uint64_t MaxOffset = maxUIntN(N: BitsPerElt);
29797 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
29798 EVT MaskType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MaskEltType, NumElements: IndexLen);
29799 bool MinMaxEqual = (MinSVESize == MaxSVESize);
29800 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
29801 "Incorrectly legalised shuffle operation");
29802
29803 SmallVector<SDValue, 8> TBLMask;
29804 // If MinSVESize is not equal to MaxSVESize then we need to know which
29805 // TBL mask element needs adjustment.
29806 SmallVector<SDValue, 8> AddRuntimeVLMask;
29807
  // Bail out for 8-bit element types, because with a 2048-bit SVE register
  // size 8 bits are only sufficient to index into the first source vector.
29810 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
29811 return SDValue();
29812
29813 for (int Index : ShuffleMask) {
    // Treat a poison index as if it referenced element 0.
29815 if (Index < 0)
29816 Index = 0;
29817 // If the mask refers to elements in the second operand, then we have to
    // offset the index by the number of elements in a vector. If this number
    // is not known at compile-time, we need to maintain a mask with 'VL' values
29820 // to add at runtime.
29821 if ((unsigned)Index >= ElementsPerVectorReg) {
29822 if (MinMaxEqual) {
29823 Index += IndexLen - ElementsPerVectorReg;
29824 } else {
29825 Index = Index - ElementsPerVectorReg;
29826 AddRuntimeVLMask.push_back(Elt: DAG.getConstant(Val: 1, DL, VT: MVT::i64));
29827 }
29828 } else if (!MinMaxEqual)
29829 AddRuntimeVLMask.push_back(Elt: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
    // For 8-bit elements and 1024-bit SVE registers, where MaxOffset equals
    // 255, this might point to the last element of the second operand of the
    // shufflevector, so we reject this transform.
29833 if ((unsigned)Index >= MaxOffset)
29834 return SDValue();
29835 TBLMask.push_back(Elt: DAG.getConstant(Val: Index, DL, VT: MVT::i64));
29836 }
29837
  // Choosing an out-of-range index leads to the lane being zeroed, whereas a
  // zero index value would instead duplicate the first lane for these padding
  // elements. Note that for i8 elements an out-of-range index could still be
  // valid for a 2048-bit vector register size.
29842 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
29843 TBLMask.push_back(Elt: DAG.getConstant(Val: (int)MaxOffset, DL, VT: MVT::i64));
29844 if (!MinMaxEqual)
29845 AddRuntimeVLMask.push_back(Elt: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
29846 }
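  // For example (assuming MinSVESize == MaxSVESize == 128), shuffling two
  // v4i32 operands with mask <0, 4, 1, 5> keeps the TBL mask as <0, 4, 1, 5>:
  // IndexLen equals ElementsPerVectorReg, so indices 4-7 already select from
  // the second operand of TBL2 and no zeroing padding entries are needed.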
29847
29848 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, VT: MaskType);
29849 SDValue VecMask =
29850 DAG.getBuildVector(VT: MaskType, DL, Ops: ArrayRef(TBLMask.data(), IndexLen));
29851 SDValue SVEMask = convertToScalableVector(DAG, VT: MaskContainerVT, V: VecMask);
29852
29853 SDValue Shuffle;
29854 if (IsSingleOp)
29855 Shuffle =
29856 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: ContainerVT,
29857 N1: DAG.getConstant(Val: Intrinsic::aarch64_sve_tbl, DL, VT: MVT::i32),
29858 N2: Op1, N3: SVEMask);
29859 else if (Subtarget.hasSVE2()) {
29860 if (!MinMaxEqual) {
29861 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
29862 SDValue VScale = (BitsPerElt == 64)
29863 ? DAG.getVScale(DL, VT: MVT::i64, MulImm: APInt(64, MinNumElts))
29864 : DAG.getVScale(DL, VT: MVT::i32, MulImm: APInt(32, MinNumElts));
29865 SDValue VecMask =
29866 DAG.getBuildVector(VT: MaskType, DL, Ops: ArrayRef(TBLMask.data(), IndexLen));
29867 SDValue MulByMask = DAG.getNode(
29868 Opcode: ISD::MUL, DL, VT: MaskType,
29869 N1: DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: MaskType, Operand: VScale),
29870 N2: DAG.getBuildVector(VT: MaskType, DL,
29871 Ops: ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
29872 SDValue UpdatedVecMask =
29873 DAG.getNode(Opcode: ISD::ADD, DL, VT: MaskType, N1: VecMask, N2: MulByMask);
29874 SVEMask = convertToScalableVector(
29875 DAG, VT: getContainerForFixedLengthVector(DAG, VT: MaskType), V: UpdatedVecMask);
29876 }
29877 Shuffle =
29878 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: ContainerVT,
29879 N1: DAG.getConstant(Val: Intrinsic::aarch64_sve_tbl2, DL, VT: MVT::i32),
29880 N2: Op1, N3: Op2, N4: SVEMask);
29881 }
29882 Shuffle = convertFromScalableVector(DAG, VT, V: Shuffle);
29883 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op.getValueType(), Operand: Shuffle);
29884}
29885
29886SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
29887 SDValue Op, SelectionDAG &DAG) const {
29888 EVT VT = Op.getValueType();
29889 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29890
29891 auto *SVN = cast<ShuffleVectorSDNode>(Val: Op.getNode());
29892 auto ShuffleMask = SVN->getMask();
29893
29894 SDLoc DL(Op);
29895 SDValue Op1 = Op.getOperand(i: 0);
29896 SDValue Op2 = Op.getOperand(i: 1);
29897
29898 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29899 Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op1);
29900 Op2 = convertToScalableVector(DAG, VT: ContainerVT, V: Op2);
29901
29902 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
29903 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
29904 return MVT::i32;
29905 return ScalarTy;
29906 };
29907
29908 if (SVN->isSplat()) {
29909 unsigned Lane = std::max(a: 0, b: SVN->getSplatIndex());
29910 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
29911 SDValue SplatEl = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ScalarTy, N1: Op1,
29912 N2: DAG.getConstant(Val: Lane, DL, VT: MVT::i64));
29913 Op = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: ContainerVT, Operand: SplatEl);
29914 return convertFromScalableVector(DAG, VT, V: Op);
29915 }
29916
29917 bool ReverseEXT = false;
29918 unsigned Imm;
29919 if (isEXTMask(M: ShuffleMask, VT, ReverseEXT, Imm) &&
29920 Imm == VT.getVectorNumElements() - 1) {
29921 if (ReverseEXT)
29922 std::swap(a&: Op1, b&: Op2);
29923 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
29924 SDValue Scalar = DAG.getNode(
29925 Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ScalarTy, N1: Op1,
29926 N2: DAG.getConstant(Val: VT.getVectorNumElements() - 1, DL, VT: MVT::i64));
29927 Op = DAG.getNode(Opcode: AArch64ISD::INSR, DL, VT: ContainerVT, N1: Op2, N2: Scalar);
29928 return convertFromScalableVector(DAG, VT, V: Op);
29929 }
29930
29931 unsigned EltSize = VT.getScalarSizeInBits();
29932 for (unsigned BlockSize : {64U, 32U, 16U}) {
29933 if (isREVMask(M: ShuffleMask, EltSize, NumElts: VT.getVectorNumElements(), BlockSize)) {
29934 unsigned RevOp;
29935 if (EltSize == 8)
29936 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
29937 else if (EltSize == 16)
29938 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
29939 else
29940 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
29941 EVT BlockedVT =
29942 getPackedSVEVectorVT(VT: EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: BlockSize));
29943 SDValue Pg = getPredicateForVector(DAG, DL, VT: BlockedVT);
29944 SDValue BlockedOp1 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: BlockedVT, Operand: Op1);
29945 SDValue BlockedRev = DAG.getNode(Opcode: RevOp, DL, VT: BlockedVT, N1: Pg, N2: BlockedOp1,
29946 N3: DAG.getUNDEF(VT: BlockedVT));
29947 SDValue Container =
29948 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerVT, Operand: BlockedRev);
29949 return convertFromScalableVector(DAG, VT, V: Container);
29950 }
29951 }
29952
29953 if (Subtarget->hasSVE2p1() && EltSize == 64 &&
29954 isREVMask(M: ShuffleMask, EltSize, NumElts: VT.getVectorNumElements(), BlockSize: 128)) {
29955 SDValue Pg = getPredicateForVector(DAG, DL, VT);
29956 SDValue Revd = DAG.getNode(Opcode: AArch64ISD::REVD_MERGE_PASSTHRU, DL, VT: ContainerVT,
29957 N1: Pg, N2: Op1, N3: DAG.getUNDEF(VT: ContainerVT));
29958 return convertFromScalableVector(DAG, VT, V: Revd);
29959 }
29960
29961 unsigned WhichResult;
29962 if (isZIPMask(M: ShuffleMask, NumElts: VT.getVectorNumElements(), WhichResultOut&: WhichResult) &&
29963 WhichResult == 0)
29964 return convertFromScalableVector(
29965 DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: ContainerVT, N1: Op1, N2: Op2));
29966
29967 if (isTRNMask(M: ShuffleMask, NumElts: VT.getVectorNumElements(), WhichResult)) {
29968 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
29969 return convertFromScalableVector(
29970 DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op2));
29971 }
29972
29973 if (isZIP_v_undef_Mask(M: ShuffleMask, VT, WhichResult) && WhichResult == 0)
29974 return convertFromScalableVector(
29975 DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: ContainerVT, N1: Op1, N2: Op1));
29976
29977 if (isTRN_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
29978 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
29979 return convertFromScalableVector(
29980 DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op1));
29981 }
29982
29983 // Functions like isZIPMask return true when a ISD::VECTOR_SHUFFLE's mask
29984 // represents the same logical operation as performed by a ZIP instruction. In
29985 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
29986 // equivalent to an AArch64 instruction. There's the extra component of
29987 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
  // only operated on 64/128-bit vector types that have a direct mapping to a
29989 // target register and so an exact mapping is implied.
29990 // However, when using SVE for fixed length vectors, most legal vector types
29991 // are actually sub-vectors of a larger SVE register. When mapping
29992 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
29993 // how the mask's indices translate. Specifically, when the mapping requires
29994 // an exact meaning for a specific vector index (e.g. Index X is the last
29995 // vector element in the register) then such mappings are often only safe when
  // the exact SVE register size is known. The main exception to this is when
29997 // indices are logically relative to the first element of either
29998 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
29999 // when converting from fixed-length to scalable vector types (i.e. the start
30000 // of a fixed length vector is always the start of a scalable vector).
30001 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
30002 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
30003 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
30004 if (ShuffleVectorInst::isReverseMask(Mask: ShuffleMask, NumSrcElts: ShuffleMask.size()) &&
30005 Op2.isUndef()) {
30006 Op = DAG.getNode(Opcode: ISD::VECTOR_REVERSE, DL, VT: ContainerVT, Operand: Op1);
30007 return convertFromScalableVector(DAG, VT, V: Op);
30008 }
30009
30010 if (isZIPMask(M: ShuffleMask, NumElts: VT.getVectorNumElements(), WhichResultOut&: WhichResult) &&
30011 WhichResult != 0)
30012 return convertFromScalableVector(
30013 DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT: ContainerVT, N1: Op1, N2: Op2));
30014
30015 if (isUZPMask(M: ShuffleMask, NumElts: VT.getVectorNumElements(), WhichResultOut&: WhichResult)) {
30016 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
30017 return convertFromScalableVector(
30018 DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op2));
30019 }
30020
30021 if (isZIP_v_undef_Mask(M: ShuffleMask, VT, WhichResult) && WhichResult != 0)
30022 return convertFromScalableVector(
30023 DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT: ContainerVT, N1: Op1, N2: Op1));
30024
30025 if (isUZP_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
30026 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
30027 return convertFromScalableVector(
30028 DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op1));
30029 }
30030
30031 if ((Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) &&
30032 Subtarget->isSVEorStreamingSVEAvailable()) {
30033 assert(VT.getFixedSizeInBits() % AArch64::SVEBitsPerBlock == 0 &&
30034 "Unsupported SVE vector size");
30035
30036 unsigned Segments = VT.getFixedSizeInBits() / AArch64::SVEBitsPerBlock;
30037 unsigned SegmentElts = VT.getVectorNumElements() / Segments;
30038 if (std::optional<unsigned> Lane =
30039 isDUPQMask(Mask: ShuffleMask, Segments, SegmentSize: SegmentElts)) {
30040 SDValue IID =
30041 DAG.getConstant(Val: Intrinsic::aarch64_sve_dup_laneq, DL, VT: MVT::i64);
30042 return convertFromScalableVector(
30043 DAG, VT,
30044 V: DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: ContainerVT,
30045 Ops: {IID, Op1,
30046 DAG.getConstant(Val: *Lane, DL, VT: MVT::i64,
30047 /*isTarget=*/true)}));
30048 }
30049 }
30050 }
30051
30052 // Try to widen the shuffle before generating a possibly expensive SVE TBL.
30053 // This may allow the shuffle to be matched as something cheaper like ZIP1.
30054 if (SDValue WideOp = tryWidenMaskForShuffle(Op, DAG))
30055 return WideOp;
30056
  // Avoid producing a TBL instruction if we don't know the minimal SVE register
  // size, unless NEON is not available and we can assume the minimal SVE
  // register size is 128 bits.
30060 if (MinSVESize || !Subtarget->isNeonAvailable())
30061 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
30062 DAG);
30063
30064 return SDValue();
30065}
30066
30067SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
30068 SelectionDAG &DAG) const {
30069 SDLoc DL(Op);
30070 EVT InVT = Op.getValueType();
30071
30072 assert(VT.isScalableVector() && isTypeLegal(VT) &&
30073 InVT.isScalableVector() && isTypeLegal(InVT) &&
30074 "Only expect to cast between legal scalable vector types!");
30075 assert(VT.getVectorElementType() != MVT::i1 &&
30076 InVT.getVectorElementType() != MVT::i1 &&
30077 "For predicate bitcasts, use getSVEPredicateBitCast");
30078
30079 if (InVT == VT)
30080 return Op;
30081
30082 EVT PackedVT = getPackedSVEVectorVT(VT: VT.getVectorElementType());
30083 EVT PackedInVT = getPackedSVEVectorVT(VT: InVT.getVectorElementType());
30084
30085 // Safe bitcasting between unpacked vector types of different element counts
30086 // is currently unsupported because the following is missing the necessary
30087 // work to ensure the result's elements live where they're supposed to within
30088 // an SVE register.
30089 // 01234567
30090 // e.g. nxv2i32 = XX??XX??
30091 // nxv4f16 = X?X?X?X?
30092 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
30093 VT == PackedVT || InVT == PackedInVT) &&
30094 "Unexpected bitcast!");
30095
30096 // Pack input if required.
30097 if (InVT != PackedInVT)
30098 Op = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: PackedInVT, Operand: Op);
30099
30100 if (Subtarget->isLittleEndian() ||
30101 PackedVT.getScalarSizeInBits() == PackedInVT.getScalarSizeInBits())
30102 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: PackedVT, Operand: Op);
30103 else {
30104 EVT PackedVTAsInt = PackedVT.changeTypeToInteger();
30105 EVT PackedInVTAsInt = PackedInVT.changeTypeToInteger();
30106
30107 // Simulate the effect of casting through memory.
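    // e.g. a big-endian nxv2i64 -> nxv4f32 cast byte-swaps within the i64
    // elements, reinterprets the register as packed 32-bit elements, and
    // byte-swaps within the i32 elements, matching the layout a store/load
    // round trip through memory would produce.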
30108 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: PackedInVTAsInt, Operand: Op);
30109 if (PackedInVTAsInt.getScalarSizeInBits() != 8)
30110 Op = DAG.getNode(Opcode: ISD::BSWAP, DL, VT: PackedInVTAsInt, Operand: Op);
30111 Op = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: PackedVTAsInt, Operand: Op);
30112 if (PackedVTAsInt.getScalarSizeInBits() != 8)
30113 Op = DAG.getNode(Opcode: ISD::BSWAP, DL, VT: PackedVTAsInt, Operand: Op);
30114 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: PackedVT, Operand: Op);
30115 }
30116
30117 // Unpack result if required.
30118 if (VT != PackedVT)
30119 Op = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT, Operand: Op);
30120
30121 return Op;
30122}
30123
30124bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
30125 SDValue N) const {
30126 return ::isAllActivePredicate(DAG, N);
30127}
30128
30129EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
30130 return ::getPromotedVTForPredicate(VT);
30131}
30132
30133bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
30134 SDValue Op, const APInt &OriginalDemandedBits,
30135 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
30136 unsigned Depth) const {
30137
30138 unsigned Opc = Op.getOpcode();
30139 switch (Opc) {
30140 case AArch64ISD::VSHL: {
30141 // Match (VSHL (VLSHR Val X) X)
30142 SDValue ShiftL = Op;
30143 SDValue ShiftR = Op->getOperand(Num: 0);
30144 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
30145 return false;
30146
30147 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
30148 return false;
30149
30150 unsigned ShiftLBits = ShiftL->getConstantOperandVal(Num: 1);
30151 unsigned ShiftRBits = ShiftR->getConstantOperandVal(Num: 1);
30152
30153 // Other cases can be handled as well, but this is not
30154 // implemented.
30155 if (ShiftRBits != ShiftLBits)
30156 return false;
30157
30158 unsigned ScalarSize = Op.getScalarValueSizeInBits();
30159 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
30160
30161 APInt ZeroBits = APInt::getLowBitsSet(numBits: ScalarSize, loBitsSet: ShiftLBits);
30162 APInt UnusedBits = ~OriginalDemandedBits;
30163
30164 if ((ZeroBits & UnusedBits) != ZeroBits)
30165 return false;
30166
30167 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
30168 // used - simplify to just Val.
30169 return TLO.CombineTo(O: Op, N: ShiftR->getOperand(Num: 0));
30170 }
30171 case AArch64ISD::BICi: {
    // Fold BICi if all destination bits are already known to be zeroed.
30173 SDValue Op0 = Op.getOperand(i: 0);
30174 KnownBits KnownOp0 =
30175 TLO.DAG.computeKnownBits(Op: Op0, DemandedElts: OriginalDemandedElts, Depth: Depth + 1);
30176 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
30177 APInt BitsToClear =
30178 (Op->getConstantOperandAPInt(Num: 1) << Op->getConstantOperandAPInt(Num: 2))
30179 .trunc(width: KnownOp0.getBitWidth());
30180 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
30181 if (BitsToClear.isSubsetOf(RHS: AlreadyZeroedBitsToClear))
30182 return TLO.CombineTo(O: Op, N: Op0);
30183
30184 Known = KnownOp0 & KnownBits::makeConstant(C: ~BitsToClear);
30185 return false;
30186 }
30187 case ISD::INTRINSIC_WO_CHAIN: {
30188 if (auto ElementSize = IsSVECntIntrinsic(S: Op)) {
30189 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
30190 if (!MaxSVEVectorSizeInBits)
30191 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
30192 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
30193 // The SVE count intrinsics don't support the multiplier immediate so we
30194 // don't have to account for that here. The value returned may be slightly
30195 // over the true required bits, as this is based on the "ALL" pattern. The
30196 // other patterns are also exposed by these intrinsics, but they all
30197 // return a value that's strictly less than "ALL".
      unsigned RequiredBits = llvm::bit_width(MaxElements);
      unsigned BitWidth = Known.Zero.getBitWidth();
      if (RequiredBits < BitWidth)
        Known.Zero.setHighBits(BitWidth - RequiredBits);
      return false;
    }
  }
  }

  return TargetLowering::SimplifyDemandedBitsForTargetNode(
      Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
}

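// Splatted constants (DUP, MOVI, or an EXTRACT_SUBVECTOR of a DUP) are already
// in the backend's preferred form, so treat them as canonical constant nodes.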
bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
  return Op.getOpcode() == AArch64ISD::DUP ||
         Op.getOpcode() == AArch64ISD::MOVI ||
         (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
          Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
         TargetLowering::isTargetCanonicalConstantNode(Op);
}

bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
  return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
         Subtarget->hasComplxNum();
}

bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
    ComplexDeinterleavingOperation Operation, Type *Ty) const {
  auto *VTy = dyn_cast<VectorType>(Ty);
  if (!VTy)
    return false;

  // If the vector is scalable, SVE is enabled, implying support for complex
  // numbers. Otherwise, we need to ensure complex number support is available.
  if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
    return false;

  auto *ScalarTy = VTy->getScalarType();
  unsigned NumElements = VTy->getElementCount().getKnownMinValue();

  // We can only process vectors with a bit size of 128 or higher (NEON
  // additionally supports 64-bit vectors). These vectors must also have a
  // power-of-2 size, as we later split them into the smallest supported size
  // and merge them back together after applying the complex operation.
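  // For example, a 256-bit vector is split into two 128-bit halves, each half
  // is lowered with the native complex instructions, and the results are
  // concatenated back together.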
  unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
  if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
      !llvm::isPowerOf2_32(VTyWidth))
    return false;

  if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
    unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();

    if (Operation == ComplexDeinterleavingOperation::CDot)
      return ScalarWidth == 32 || ScalarWidth == 64;
    return 8 <= ScalarWidth && ScalarWidth <= 64;
  }

  // CDot is not supported outside of scalable/SVE contexts.
  if (Operation == ComplexDeinterleavingOperation::CDot)
    return false;

  return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
         ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
}

Value *AArch64TargetLowering::createComplexDeinterleavingIR(
    IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
    ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
    Value *Accumulator) const {
  VectorType *Ty = cast<VectorType>(InputA->getType());
  if (Accumulator == nullptr)
    Accumulator = Constant::getNullValue(Ty);
  bool IsScalable = Ty->isScalableTy();
  bool IsInt = Ty->getElementType()->isIntegerTy();

  unsigned TyWidth =
      Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();

  assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
         "Vector type must be either 64 or a power of 2 that is at least 128");

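  // Types wider than 128 bits are handled by splitting the operands (and the
  // accumulator) in half, recursing on each half, and re-joining the results.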
  if (TyWidth > 128) {
    int Stride = Ty->getElementCount().getKnownMinValue() / 2;
    int AccStride = cast<VectorType>(Accumulator->getType())
                        ->getElementCount()
                        .getKnownMinValue() /
                    2;
    auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
    auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, uint64_t(0));
    auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, uint64_t(0));
    auto *UpperSplitA = B.CreateExtractVector(HalfTy, InputA, Stride);
    auto *UpperSplitB = B.CreateExtractVector(HalfTy, InputB, Stride);
    Value *LowerSplitAcc = nullptr;
    Value *UpperSplitAcc = nullptr;
    Type *FullTy = Accumulator->getType();
    auto *HalfAccTy = VectorType::getHalfElementsVectorType(
        cast<VectorType>(Accumulator->getType()));
    LowerSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, uint64_t(0));
    UpperSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, AccStride);
    auto *LowerSplitInt = createComplexDeinterleavingIR(
        B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
    auto *UpperSplitInt = createComplexDeinterleavingIR(
        B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);

    auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy),
                                        LowerSplitInt, uint64_t(0));
    return B.CreateInsertVector(FullTy, Result, UpperSplitInt, AccStride);
  }

  if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
    if (IsScalable) {
      if (IsInt)
        return B.CreateIntrinsic(
            Intrinsic::aarch64_sve_cmla_x, Ty,
            {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});

      auto *Mask = B.getAllOnesMask(Ty->getElementCount());
      return B.CreateIntrinsic(
          Intrinsic::aarch64_sve_fcmla, Ty,
          {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
    }

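    // For fixed-width (NEON) types, pick the FCMLA intrinsic variant matching
    // the requested rotation (0, 90, 180 or 270 degrees).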
    Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
                              Intrinsic::aarch64_neon_vcmla_rot90,
                              Intrinsic::aarch64_neon_vcmla_rot180,
                              Intrinsic::aarch64_neon_vcmla_rot270};

    return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
                             {Accumulator, InputA, InputB});
  }

  if (OperationType == ComplexDeinterleavingOperation::CAdd) {
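    // Only the 90 and 270 degree rotations map to complex-add instructions;
    // any other rotation falls through and returns nullptr.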
    if (IsScalable) {
      if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
          Rotation == ComplexDeinterleavingRotation::Rotation_270) {
        if (IsInt)
          return B.CreateIntrinsic(
              Intrinsic::aarch64_sve_cadd_x, Ty,
              {InputA, InputB, B.getInt32((int)Rotation * 90)});

        auto *Mask = B.getAllOnesMask(Ty->getElementCount());
        return B.CreateIntrinsic(
            Intrinsic::aarch64_sve_fcadd, Ty,
            {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
      }
      return nullptr;
    }

    Intrinsic::ID IntId = Intrinsic::not_intrinsic;
    if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
      IntId = Intrinsic::aarch64_neon_vcadd_rot90;
    else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
      IntId = Intrinsic::aarch64_neon_vcadd_rot270;

    if (IntId == Intrinsic::not_intrinsic)
      return nullptr;

    return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
  }

  if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt &&
      IsScalable) {
    return B.CreateIntrinsic(
        Intrinsic::aarch64_sve_cdot, Accumulator->getType(),
        {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
  }

  return nullptr;
}

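// Avoid scalarizing a splatted extend when it is used by a multiply, so the
// extend can still be combined with the multiply as a vector operation.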
bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
  unsigned Opc = N->getOpcode();
  if (ISD::isExtOpcode(Opc)) {
    if (any_of(N->users(),
               [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
      return false;
  }
  return true;
}

unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
  return Subtarget->getMinimumJumpTableEntries();
}

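// When fixed-length vectors are lowered using SVE registers, derive the
// calling-convention register type from the vector-type breakdown below so
// that arguments and returns are still passed in NEON-sized pieces.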
MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {
  bool NonUnitFixedLengthVector =
      VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
  if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);

  EVT VT1;
  MVT RegisterVT;
  unsigned NumIntermediates;
  getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
                                       RegisterVT);
  return RegisterVT;
}

unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
  bool NonUnitFixedLengthVector =
      VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
  if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);

  EVT VT1;
  MVT VT2;
  unsigned NumIntermediates;
  return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
                                              NumIntermediates, VT2);
}

unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
  if (!RegisterVT.isFixedLengthVector() ||
      RegisterVT.getFixedSizeInBits() <= 128)
    return NumRegs;

  assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
  assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
  assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");

  // A size mismatch here implies either type promotion or widening and would
  // have resulted in scalarisation if larger vectors had not been available.
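  // In that case, fall back to passing one element per register, using a
  // single-element vector type when it is legal and the raw element type
  // otherwise.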
  if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
    EVT EltTy = VT.getVectorElementType();
    EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
    if (!isTypeLegal(NewVT))
      NewVT = EltTy;

    IntermediateVT = NewVT;
    NumIntermediates = VT.getVectorNumElements();
    RegisterVT = getRegisterType(Context, NewVT);
    return NumIntermediates;
  }

  // SVE VLS support does not introduce a new ABI, so we should use NEON-sized
  // types for vector arguments and returns.
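  // For example, with 256-bit SVE a fixed-length v8f32 would otherwise occupy
  // a single 256-bit register; it is instead split into two v4f32 registers.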

  unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
  NumIntermediates *= NumSubRegs;
  NumRegs *= NumSubRegs;

  switch (RegisterVT.getVectorElementType().SimpleTy) {
  default:
    llvm_unreachable("unexpected element type for vector");
  case MVT::i8:
    IntermediateVT = RegisterVT = MVT::v16i8;
    break;
  case MVT::i16:
    IntermediateVT = RegisterVT = MVT::v8i16;
    break;
  case MVT::i32:
    IntermediateVT = RegisterVT = MVT::v4i32;
    break;
  case MVT::i64:
    IntermediateVT = RegisterVT = MVT::v2i64;
    break;
  case MVT::f16:
    IntermediateVT = RegisterVT = MVT::v8f16;
    break;
  case MVT::f32:
    IntermediateVT = RegisterVT = MVT::v4f32;
    break;
  case MVT::f64:
    IntermediateVT = RegisterVT = MVT::v2f64;
    break;
  case MVT::bf16:
    IntermediateVT = RegisterVT = MVT::v8bf16;
    break;
  }

  return NumRegs;
}

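// Inline stack probes are used when the function requests stack probing and
// the target is not Windows, which handles stack probing separately.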
bool AArch64TargetLowering::hasInlineStackProbe(
    const MachineFunction &MF) const {
  return !Subtarget->isTargetWindows() &&
         MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
}

bool AArch64TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  switch (Opc) {
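  // Saturating truncates producing 64-bit NEON vectors (v8i8, v4i16, v2i32)
  // map directly onto the native narrowing instructions, so keep these types.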
  case ISD::TRUNCATE_SSAT_S:
  case ISD::TRUNCATE_SSAT_U:
  case ISD::TRUNCATE_USAT_U:
    if (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)
      return true;
  }

  return TargetLowering::isTypeDesirableForOp(Opc, VT);
}

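// Pointer arithmetic is preserved only when the subtarget implements CPA and
// the FEAT_CPA codegen option is enabled.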
bool AArch64TargetLowering::shouldPreservePtrArith(const Function &F,
                                                   EVT VT) const {
  return Subtarget->hasCPA() && UseFEATCPACodegen;
}
