//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//

#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64ExpandImm.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ComplexDeinterleavingPass.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcallUtil.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SipHash.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/TargetParser/Triple.h"
#include <algorithm>
#include <bitset>
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <optional>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64-lower"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");

// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
    "aarch64-elf-ldtls-generation", cl::Hidden,
    cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
    cl::init(false));

static cl::opt<bool>
    EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
                             cl::desc("Enable AArch64 logical imm instruction "
                                      "optimization"),
                             cl::init(true));

// Temporary option added for the purpose of testing functionality added
// to DAGCombiner.cpp in D92230. It is expected that this can be removed
// in the future once both implementations are based on MGATHER rather
// than the GLD1 nodes added for the SVE gather load intrinsics.
static cl::opt<bool>
    EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
                                   cl::desc("Combine extends of AArch64 masked "
                                            "gather intrinsics"),
                                   cl::init(true));

static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
                                    cl::desc("Combine ext and trunc to TBL"),
                                    cl::init(true));

// XOR, OR and CMP all use ALU ports, and the data dependency becomes the
// bottleneck after this transform on high-end CPUs. This limit on the number
// of leaf nodes ensures the cmp+ccmp transform remains profitable.
static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
                                 cl::desc("Maximum of xors"));

// When this option is enabled, we do not fall back to DAG ISel when
// encountering scalable vector types, even though SVE is not yet fully
// supported for some instructions.
// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
cl::opt<bool> EnableSVEGISel(
    "aarch64-enable-gisel-sve", cl::Hidden,
    cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
    cl::init(false));
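
// Illustrative usage only (not part of the build): as a hidden cl::opt, this
// can be toggled from a tool invocation, e.g.
//   llc -mtriple=aarch64 -global-isel -aarch64-enable-gisel-sve=1 foo.ll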

/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;

static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
                                       AArch64::X3, AArch64::X4, AArch64::X5,
                                       AArch64::X6, AArch64::X7};
static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
                                       AArch64::Q3, AArch64::Q4, AArch64::Q5,
                                       AArch64::Q6, AArch64::Q7};

ArrayRef<MCPhysReg> llvm::AArch64::getGPRArgRegs() { return GPRArgRegs; }

ArrayRef<MCPhysReg> llvm::AArch64::getFPRArgRegs() { return FPRArgRegs; }

static inline EVT getPackedSVEVectorVT(EVT VT) {
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unexpected element type for vector");
  case MVT::i8:
    return MVT::nxv16i8;
  case MVT::i16:
    return MVT::nxv8i16;
  case MVT::i32:
    return MVT::nxv4i32;
  case MVT::i64:
    return MVT::nxv2i64;
  case MVT::f16:
    return MVT::nxv8f16;
  case MVT::f32:
    return MVT::nxv4f32;
  case MVT::f64:
    return MVT::nxv2f64;
  case MVT::bf16:
    return MVT::nxv8bf16;
  }
}
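
// For example, getPackedSVEVectorVT(MVT::f32) returns MVT::nxv4f32: four
// 32-bit lanes fill each 128-bit SVE granule, so the result is a fully
// packed scalable vector.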

// NOTE: Currently there's only a need to return integer vector types. If this
// changes then just add an extra "type" parameter.
static inline EVT getPackedSVEVectorVT(ElementCount EC) {
  switch (EC.getKnownMinValue()) {
  default:
    llvm_unreachable("unexpected element count for vector");
  case 16:
    return MVT::nxv16i8;
  case 8:
    return MVT::nxv8i16;
  case 4:
    return MVT::nxv4i32;
  case 2:
    return MVT::nxv2i64;
  }
}

static inline EVT getPromotedVTForPredicate(EVT VT) {
  assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
         "Expected scalable predicate vector type!");
  switch (VT.getVectorMinNumElements()) {
  default:
    llvm_unreachable("unexpected element count for vector");
  case 2:
    return MVT::nxv2i64;
  case 4:
    return MVT::nxv4i32;
  case 8:
    return MVT::nxv8i16;
  case 16:
    return MVT::nxv16i8;
  }
}
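
// For example, an nxv4i1 predicate is promoted to nxv4i32: the lane count is
// preserved and each i1 lane widens to the packed element type for that
// element count.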

/// Returns true if VT's elements occupy the lowest bit positions of its
/// associated register class without any intervening space.
///
/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
/// same register class, but only nxv8f16 can be treated as a packed vector.
static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
  assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
         "Expected legal vector type!");
  return VT.isFixedLengthVector() ||
         VT.getSizeInBits().getKnownMinValue() == AArch64::SVEBitsPerBlock;
}
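
// Illustration: nxv4f16 carries only 64 bits of data per 128-bit granule (its
// f16 elements sit in the low half of 32-bit containers), so it is unpacked;
// fixed-length vectors are always treated as packed.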

// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
// predicate and end with a passthru value matching the result type.
static bool isMergePassthruOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
  case AArch64ISD::BSWAP_MERGE_PASSTHRU:
  case AArch64ISD::REVH_MERGE_PASSTHRU:
  case AArch64ISD::REVW_MERGE_PASSTHRU:
  case AArch64ISD::REVD_MERGE_PASSTHRU:
  case AArch64ISD::CTLZ_MERGE_PASSTHRU:
  case AArch64ISD::CTPOP_MERGE_PASSTHRU:
  case AArch64ISD::DUP_MERGE_PASSTHRU:
  case AArch64ISD::ABS_MERGE_PASSTHRU:
  case AArch64ISD::NEG_MERGE_PASSTHRU:
  case AArch64ISD::FNEG_MERGE_PASSTHRU:
  case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
  case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
  case AArch64ISD::FCEIL_MERGE_PASSTHRU:
  case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
  case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
  case AArch64ISD::FRINT_MERGE_PASSTHRU:
  case AArch64ISD::FROUND_MERGE_PASSTHRU:
  case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
  case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
  case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
  case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
  case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
  case AArch64ISD::FSQRT_MERGE_PASSTHRU:
  case AArch64ISD::FRECPX_MERGE_PASSTHRU:
  case AArch64ISD::FABS_MERGE_PASSTHRU:
    return true;
  }
}

// Returns true if inactive lanes are known to be zeroed by construction.
static bool isZeroingInactiveLanes(SDValue Op) {
  switch (Op.getOpcode()) {
  default:
    return false;
  // We guarantee i1 splat_vectors to zero the other lanes
  case ISD::SPLAT_VECTOR:
  case AArch64ISD::PTRUE:
  case AArch64ISD::SETCC_MERGE_ZERO:
    return true;
  case ISD::INTRINSIC_WO_CHAIN:
    switch (Op.getConstantOperandVal(0)) {
    default:
      return false;
    case Intrinsic::aarch64_sve_ptrue:
    case Intrinsic::aarch64_sve_pnext:
    case Intrinsic::aarch64_sve_cmpeq:
    case Intrinsic::aarch64_sve_cmpne:
    case Intrinsic::aarch64_sve_cmpge:
    case Intrinsic::aarch64_sve_cmpgt:
    case Intrinsic::aarch64_sve_cmphs:
    case Intrinsic::aarch64_sve_cmphi:
    case Intrinsic::aarch64_sve_cmpeq_wide:
    case Intrinsic::aarch64_sve_cmpne_wide:
    case Intrinsic::aarch64_sve_cmpge_wide:
    case Intrinsic::aarch64_sve_cmpgt_wide:
    case Intrinsic::aarch64_sve_cmplt_wide:
    case Intrinsic::aarch64_sve_cmple_wide:
    case Intrinsic::aarch64_sve_cmphs_wide:
    case Intrinsic::aarch64_sve_cmphi_wide:
    case Intrinsic::aarch64_sve_cmplo_wide:
    case Intrinsic::aarch64_sve_cmpls_wide:
    case Intrinsic::aarch64_sve_fcmpeq:
    case Intrinsic::aarch64_sve_fcmpne:
    case Intrinsic::aarch64_sve_fcmpge:
    case Intrinsic::aarch64_sve_fcmpgt:
    case Intrinsic::aarch64_sve_fcmpuo:
    case Intrinsic::aarch64_sve_facgt:
    case Intrinsic::aarch64_sve_facge:
    case Intrinsic::aarch64_sve_whilege:
    case Intrinsic::aarch64_sve_whilegt:
    case Intrinsic::aarch64_sve_whilehi:
    case Intrinsic::aarch64_sve_whilehs:
    case Intrinsic::aarch64_sve_whilele:
    case Intrinsic::aarch64_sve_whilelo:
    case Intrinsic::aarch64_sve_whilels:
    case Intrinsic::aarch64_sve_whilelt:
    case Intrinsic::aarch64_sve_match:
    case Intrinsic::aarch64_sve_nmatch:
    case Intrinsic::aarch64_sve_whilege_x2:
    case Intrinsic::aarch64_sve_whilegt_x2:
    case Intrinsic::aarch64_sve_whilehi_x2:
    case Intrinsic::aarch64_sve_whilehs_x2:
    case Intrinsic::aarch64_sve_whilele_x2:
    case Intrinsic::aarch64_sve_whilelo_x2:
    case Intrinsic::aarch64_sve_whilels_x2:
    case Intrinsic::aarch64_sve_whilelt_x2:
      return true;
    }
  }
}
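
// For example, a predicate produced by PTRUE or one of the WHILE intrinsics
// has all of its inactive lanes cleared by construction, so code that
// reinterprets such a predicate at a wider element size does not need to
// re-zero it.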

static std::tuple<SDValue, SDValue>
extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG) {
  SDLoc DL(Disc);
  SDValue AddrDisc;
  SDValue ConstDisc;

  // If this is a blend, remember the constant and address discriminators.
  // Otherwise, it's either a constant discriminator, or a non-blended
  // address discriminator.
  if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
      Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
    AddrDisc = Disc->getOperand(1);
    ConstDisc = Disc->getOperand(2);
  } else {
    ConstDisc = Disc;
  }

  // If the constant discriminator (either the blend RHS, or the entire
  // discriminator value) isn't a 16-bit constant, bail out, and let the
  // discriminator be computed separately.
  const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
  if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
    return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);

  // If there's no address discriminator, use NoRegister, which we'll later
  // replace with XZR, or directly use a Z variant of the inst. when available.
  if (!AddrDisc)
    AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);

  return std::make_tuple(
      DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
      AddrDisc);
}
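
// Worked example: a discriminator of the form
//   llvm.ptrauth.blend(%addr, 1234)
// splits into (constant 1234, address %addr); a bare constant 42 yields
// (42, NoRegister); any other (non-constant or wider than 16-bit)
// discriminator yields (0, original discriminator) and is computed separately.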

AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                                             const AArch64Subtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
  // we have to make something up. Arbitrarily, choose ZeroOrOne.
  setBooleanContents(ZeroOrOneBooleanContent);
  // When comparing vectors, each element of the result is set to all-ones or
  // all-zeros.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Set up the register classes.
  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);

  if (Subtarget->hasLS64()) {
    addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
    setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
    setOperationAction(ISD::STORE, MVT::i64x8, Custom);
  }

  if (Subtarget->hasFPARMv8()) {
    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
  }

  if (Subtarget->hasNEON()) {
    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);

    addDRType(MVT::v2f32);
    addDRType(MVT::v8i8);
    addDRType(MVT::v4i16);
    addDRType(MVT::v2i32);
    addDRType(MVT::v1i64);
    addDRType(MVT::v1f64);
    addDRType(MVT::v4f16);
    addDRType(MVT::v4bf16);

    addQRType(MVT::v4f32);
    addQRType(MVT::v2f64);
    addQRType(MVT::v16i8);
    addQRType(MVT::v8i16);
    addQRType(MVT::v4i32);
    addQRType(MVT::v2i64);
    addQRType(MVT::v8f16);
    addQRType(MVT::v8bf16);
  }

  if (Subtarget->isSVEorStreamingSVEAvailable()) {
    // Add legal SVE predicate types.
    addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);

    // Add legal SVE data types.
    addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);

    addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);

    addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);

    if (Subtarget->useSVEForFixedLengthVectors()) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);

      for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);
    }
  }

  if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
    addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
    setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
    setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);

    setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
    setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
  }
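
  // Note: aarch64svcount (the predicate-as-counter type) lives in the same
  // predicate register file as nxv16i1, which is why its loads and stores can
  // simply be promoted to the full-predicate type above.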

  // Compute derived properties from the register classes
  computeRegisterProperties(Subtarget->getRegisterInfo());

  // Provide all sorts of operation actions
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::bf16, Custom);
  setOperationAction(ISD::SETCC, MVT::f16, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::bf16, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
  setOperationAction(ISD::BR_CC, MVT::f16, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f16, Custom);
  setOperationAction(ISD::SELECT, MVT::bf16, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::bf16, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Custom);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);
  setOperationAction(ISD::BRIND, MVT::Other, Custom);
  setOperationAction(ISD::SETCCCARRY, MVT::i64, Custom);

  setOperationAction(ISD::PtrAuthGlobalAddress, MVT::i64, Custom);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);

  setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);

  // Custom lowering hooks are needed for XOR
  // to fold it into CSINC/CSINV.
  setOperationAction(ISD::XOR, MVT::i32, Custom);
  setOperationAction(ISD::XOR, MVT::i64, Custom);

  // Virtually no operation on f128 is legal, but LLVM can't expand them when
  // there's a valid register class, so we need custom operations in most cases.
  setOperationAction(ISD::FABS, MVT::f128, Expand);
  setOperationAction(ISD::FADD, MVT::f128, LibCall);
  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
  setOperationAction(ISD::FCOS, MVT::f128, Expand);
  setOperationAction(ISD::FDIV, MVT::f128, LibCall);
  setOperationAction(ISD::FMA, MVT::f128, Expand);
  setOperationAction(ISD::FMUL, MVT::f128, LibCall);
  setOperationAction(ISD::FNEG, MVT::f128, Expand);
  setOperationAction(ISD::FPOW, MVT::f128, Expand);
  setOperationAction(ISD::FREM, MVT::f128, Expand);
  setOperationAction(ISD::FRINT, MVT::f128, Expand);
  setOperationAction(ISD::FSIN, MVT::f128, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
  setOperationAction(ISD::FSUB, MVT::f128, LibCall);
  setOperationAction(ISD::FTAN, MVT::f128, Expand);
  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
  setOperationAction(ISD::SETCC, MVT::f128, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
  setOperationAction(ISD::SELECT, MVT::f128, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
  // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
  // aren't handled.

  // Lowering for many of the conversions is actually specified by the non-f128
  // type. The LowerXXX function will be trivial when f128 isn't involved.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
  if (Subtarget->hasFPARMv8()) {
    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::bf16, Custom);
  }
  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
  if (Subtarget->hasFPARMv8()) {
    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
    setOperationAction(ISD::STRICT_FP_ROUND, MVT::bf16, Custom);
  }
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);

  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);

  // Variable arguments.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // Variable-sized objects.
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  // Lowering Funnel Shifts to EXTR
  setOperationAction(ISD::FSHR, MVT::i32, Custom);
  setOperationAction(ISD::FSHR, MVT::i64, Custom);
  setOperationAction(ISD::FSHL, MVT::i32, Custom);
  setOperationAction(ISD::FSHL, MVT::i64, Custom);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);

  // Constant pool entries
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);

  // BlockAddress
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);

  // AArch64 lacks both left-rotate and popcount instructions.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }

  // AArch64 doesn't have i32 MULH{S|U}.
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);

  // AArch64 doesn't have {U|S}MUL_LOHI.
  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

  if (Subtarget->hasCSSC()) {
    setOperationAction(ISD::CTPOP, MVT::i32, Legal);
    setOperationAction(ISD::CTPOP, MVT::i64, Legal);
    setOperationAction(ISD::CTPOP, MVT::i128, Expand);

    setOperationAction(ISD::PARITY, MVT::i128, Expand);

    setOperationAction(ISD::CTTZ, MVT::i32, Legal);
    setOperationAction(ISD::CTTZ, MVT::i64, Legal);
    setOperationAction(ISD::CTTZ, MVT::i128, Expand);

    setOperationAction(ISD::ABS, MVT::i32, Legal);
    setOperationAction(ISD::ABS, MVT::i64, Legal);

    setOperationAction(ISD::SMAX, MVT::i32, Legal);
    setOperationAction(ISD::SMAX, MVT::i64, Legal);
    setOperationAction(ISD::UMAX, MVT::i32, Legal);
    setOperationAction(ISD::UMAX, MVT::i64, Legal);

    setOperationAction(ISD::SMIN, MVT::i32, Legal);
    setOperationAction(ISD::SMIN, MVT::i64, Legal);
    setOperationAction(ISD::UMIN, MVT::i32, Legal);
    setOperationAction(ISD::UMIN, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::i64, Custom);
    setOperationAction(ISD::CTPOP, MVT::i128, Custom);

    setOperationAction(ISD::PARITY, MVT::i64, Custom);
    setOperationAction(ISD::PARITY, MVT::i128, Custom);

    setOperationAction(ISD::ABS, MVT::i32, Custom);
    setOperationAction(ISD::ABS, MVT::i64, Custom);
  }

  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
  }
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  // Custom lower Add/Sub/Mul with overflow.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);
  setOperationAction(ISD::UMULO, MVT::i32, Custom);
  setOperationAction(ISD::UMULO, MVT::i64, Custom);

  setOperationAction(ISD::UADDO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::UADDO_CARRY, MVT::i64, Custom);
  setOperationAction(ISD::USUBO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::USUBO_CARRY, MVT::i64, Custom);
  setOperationAction(ISD::SADDO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::SADDO_CARRY, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO_CARRY, MVT::i64, Custom);

  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  if (Subtarget->hasFullFP16()) {
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Custom);
  } else {
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
    setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Promote);
  }

  for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
                  ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
                  ISD::FACOS, ISD::FASIN, ISD::FATAN,
                  ISD::FCOSH, ISD::FSINH, ISD::FTANH,
                  ISD::FTAN, ISD::FEXP, ISD::FEXP2,
                  ISD::FEXP10, ISD::FLOG, ISD::FLOG2,
                  ISD::FLOG10, ISD::STRICT_FREM, ISD::STRICT_FPOW,
                  ISD::STRICT_FPOWI, ISD::STRICT_FCOS, ISD::STRICT_FSIN,
                  ISD::STRICT_FACOS, ISD::STRICT_FASIN, ISD::STRICT_FATAN,
                  ISD::STRICT_FCOSH, ISD::STRICT_FSINH, ISD::STRICT_FTANH,
                  ISD::STRICT_FEXP, ISD::STRICT_FEXP2, ISD::STRICT_FLOG,
                  ISD::STRICT_FLOG2, ISD::STRICT_FLOG10, ISD::STRICT_FTAN}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::v4f16, Expand);
    setOperationAction(Op, MVT::v8f16, Expand);
    setOperationAction(Op, MVT::bf16, Promote);
    setOperationAction(Op, MVT::v4bf16, Expand);
    setOperationAction(Op, MVT::v8bf16, Expand);
  }

  auto LegalizeNarrowFP = [this](MVT ScalarVT) {
    for (auto Op : {
             ISD::SETCC,
             ISD::SELECT_CC,
             ISD::BR_CC,
             ISD::FADD,
             ISD::FSUB,
             ISD::FMUL,
             ISD::FDIV,
             ISD::FMA,
             ISD::FCEIL,
             ISD::FSQRT,
             ISD::FFLOOR,
             ISD::FNEARBYINT,
             ISD::FRINT,
             ISD::FROUND,
             ISD::FROUNDEVEN,
             ISD::FTRUNC,
             ISD::FMINNUM,
             ISD::FMAXNUM,
             ISD::FMINIMUM,
             ISD::FMAXIMUM,
             ISD::STRICT_FADD,
             ISD::STRICT_FSUB,
             ISD::STRICT_FMUL,
             ISD::STRICT_FDIV,
             ISD::STRICT_FMA,
             ISD::STRICT_FCEIL,
             ISD::STRICT_FFLOOR,
             ISD::STRICT_FSQRT,
             ISD::STRICT_FRINT,
             ISD::STRICT_FNEARBYINT,
             ISD::STRICT_FROUND,
             ISD::STRICT_FTRUNC,
             ISD::STRICT_FROUNDEVEN,
             ISD::STRICT_FMINNUM,
             ISD::STRICT_FMAXNUM,
             ISD::STRICT_FMINIMUM,
             ISD::STRICT_FMAXIMUM,
         })
      setOperationAction(Op, ScalarVT, Promote);

    for (auto Op : {ISD::FNEG, ISD::FABS})
      setOperationAction(Op, ScalarVT, Legal);

    // Round-to-integer operations need custom lowering for fp16, as Promote
    // doesn't work because the result type is integer.
    for (auto Op : {ISD::LROUND, ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
                    ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT,
                    ISD::STRICT_LLRINT})
      setOperationAction(Op, ScalarVT, Custom);

    // Promote v4f16 to v4f32 when that is known to be safe.
    auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
    setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);

    setOperationAction(ISD::FABS, V4Narrow, Legal);
    setOperationAction(ISD::FNEG, V4Narrow, Legal);
    setOperationAction(ISD::FMA, V4Narrow, Expand);
    setOperationAction(ISD::SETCC, V4Narrow, Custom);
    setOperationAction(ISD::BR_CC, V4Narrow, Expand);
    setOperationAction(ISD::SELECT, V4Narrow, Expand);
    setOperationAction(ISD::SELECT_CC, V4Narrow, Expand);
    setOperationAction(ISD::FCOPYSIGN, V4Narrow, Custom);
    setOperationAction(ISD::FSQRT, V4Narrow, Expand);

    auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
    setOperationAction(ISD::FABS, V8Narrow, Legal);
    setOperationAction(ISD::FADD, V8Narrow, Legal);
    setOperationAction(ISD::FCEIL, V8Narrow, Legal);
    setOperationAction(ISD::FCOPYSIGN, V8Narrow, Custom);
    setOperationAction(ISD::FDIV, V8Narrow, Legal);
    setOperationAction(ISD::FFLOOR, V8Narrow, Legal);
    setOperationAction(ISD::FMA, V8Narrow, Expand);
    setOperationAction(ISD::FMUL, V8Narrow, Legal);
    setOperationAction(ISD::FNEARBYINT, V8Narrow, Legal);
    setOperationAction(ISD::FNEG, V8Narrow, Legal);
    setOperationAction(ISD::FROUND, V8Narrow, Legal);
    setOperationAction(ISD::FROUNDEVEN, V8Narrow, Legal);
    setOperationAction(ISD::FRINT, V8Narrow, Legal);
    setOperationAction(ISD::FSQRT, V8Narrow, Expand);
    setOperationAction(ISD::FSUB, V8Narrow, Legal);
    setOperationAction(ISD::FTRUNC, V8Narrow, Legal);
    setOperationAction(ISD::SETCC, V8Narrow, Expand);
    setOperationAction(ISD::BR_CC, V8Narrow, Expand);
    setOperationAction(ISD::SELECT, V8Narrow, Expand);
    setOperationAction(ISD::SELECT_CC, V8Narrow, Expand);
    setOperationAction(ISD::FP_EXTEND, V8Narrow, Expand);
  };
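
  // As a rough illustration of the Promote action used above: without full
  // fp16 support, a scalar half-precision operation such as
  //   %r = fadd half %a, %b
  // is legalized as an fpext to float, a single-precision fadd, and an
  // fptrunc back to half.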

  if (!Subtarget->hasFullFP16()) {
    LegalizeNarrowFP(MVT::f16);
  }
  LegalizeNarrowFP(MVT::bf16);
  setOperationAction(ISD::FP_ROUND, MVT::v4f32, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::v4bf16, Custom);

  // AArch64 has implementations of a lot of rounding-like FP operations.
  for (auto Op :
       {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
        ISD::FRINT, ISD::FTRUNC, ISD::FROUND,
        ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM,
        ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND,
        ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
        ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL, ISD::STRICT_FNEARBYINT,
        ISD::STRICT_FRINT, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
        ISD::STRICT_FROUND, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM,
        ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND,
        ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT}) {
    for (MVT Ty : {MVT::f32, MVT::f64})
      setOperationAction(Op, Ty, Legal);
    if (Subtarget->hasFullFP16())
      setOperationAction(Op, MVT::f16, Legal);
  }

  // Basic strict FP operations are legal
  for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
                  ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) {
    for (MVT Ty : {MVT::f32, MVT::f64})
      setOperationAction(Op, Ty, Legal);
    if (Subtarget->hasFullFP16())
      setOperationAction(Op, MVT::f16, Legal);
  }

  // Strict conversion to a larger type is legal
  for (auto VT : {MVT::f32, MVT::f64})
    setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
  setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
  setOperationAction(ISD::GET_FPMODE, MVT::i32, Custom);
  setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);
  setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);

  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
  if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, LibCall);
  } else {
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand);
  }
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);

  // Generate outline atomics library calls only if LSE was not specified for
  // the subtarget.
  if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
#define LCALLNAMES(A, B, N)                                                    \
  setLibcallName(A##N##_RELAX, #B #N "_relax");                                \
  setLibcallName(A##N##_ACQ, #B #N "_acq");                                    \
  setLibcallName(A##N##_REL, #B #N "_rel");                                    \
  setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
#define LCALLNAME4(A, B)                                                       \
  LCALLNAMES(A, B, 1)                                                          \
  LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
#define LCALLNAME5(A, B)                                                       \
  LCALLNAMES(A, B, 1)                                                          \
  LCALLNAMES(A, B, 2)                                                          \
  LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
    LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
#undef LCALLNAMES
#undef LCALLNAME4
#undef LCALLNAME5
  }
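
  // For reference, the LCALLNAME macros above expand to names such as
  // "__aarch64_ldadd4_acq_rel", i.e. <operation><size in bytes><memory-order
  // suffix>, matching the outline-atomics helpers provided by compiler-rt and
  // libgcc.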

  if (Subtarget->hasLSE128()) {
    // Custom lowering because i128 is not legal. Must be replaced by 2x64
    // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom);
  }

  // 128-bit loads and stores can be done without expanding.
  setOperationAction(ISD::LOAD, MVT::i128, Custom);
  setOperationAction(ISD::STORE, MVT::i128, Custom);

  // Aligned 128-bit loads and stores are single-copy atomic according to the
  // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
  if (Subtarget->hasLSE2()) {
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
    setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
  }

  // 256-bit non-temporal stores can be lowered to STNP. Do this as part of the
  // custom lowering, as there are no un-paired non-temporal stores and
  // legalization will break up 256-bit inputs.
  setOperationAction(ISD::STORE, MVT::v32i8, Custom);
  setOperationAction(ISD::STORE, MVT::v16i16, Custom);
  setOperationAction(ISD::STORE, MVT::v16f16, Custom);
  setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8f32, Custom);
  setOperationAction(ISD::STORE, MVT::v4f64, Custom);
  setOperationAction(ISD::STORE, MVT::v4i64, Custom);

  // 256-bit non-temporal loads can be lowered to LDNP. This is done using
  // custom lowering, as there are no un-paired non-temporal loads and
  // legalization will break up 256-bit inputs.
  setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
  setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
  setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
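
  // For example, a non-temporal store of a v32i8 value is split into two
  // 128-bit halves and emitted as a single paired instruction, roughly
  //   stnp q0, q1, [xN]
  // rather than being broken up by generic legalization.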

  // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
    // Issue __sincos_stret if available.
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  } else {
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  }

  // Make floating-point constants legal for the large code model, so they don't
  // become loads from the constant pool.
  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  }

  // AArch64 does not have floating-point extending loads, i1 sign-extending
  // loads, floating-point truncating stores, or v2i32->v2i16 truncating stores.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
  }
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

  for (MVT WideVT : MVT::fp_valuetypes()) {
    for (MVT NarrowVT : MVT::fp_valuetypes()) {
      if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
        setTruncStoreAction(WideVT, NarrowVT, Expand);
      }
    }
  }

  if (Subtarget->hasFPARMv8()) {
    setOperationAction(ISD::BITCAST, MVT::i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::f16, Custom);
    setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
  }

  // Indexed loads and stores are supported.
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    setIndexedLoadAction(im, MVT::i8, Legal);
    setIndexedLoadAction(im, MVT::i16, Legal);
    setIndexedLoadAction(im, MVT::i32, Legal);
    setIndexedLoadAction(im, MVT::i64, Legal);
    setIndexedLoadAction(im, MVT::f64, Legal);
    setIndexedLoadAction(im, MVT::f32, Legal);
    setIndexedLoadAction(im, MVT::f16, Legal);
    setIndexedLoadAction(im, MVT::bf16, Legal);
    setIndexedStoreAction(im, MVT::i8, Legal);
    setIndexedStoreAction(im, MVT::i16, Legal);
    setIndexedStoreAction(im, MVT::i32, Legal);
    setIndexedStoreAction(im, MVT::i64, Legal);
    setIndexedStoreAction(im, MVT::f64, Legal);
    setIndexedStoreAction(im, MVT::f32, Legal);
    setIndexedStoreAction(im, MVT::f16, Legal);
    setIndexedStoreAction(im, MVT::bf16, Legal);
  }
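
  // These map onto the pre- and post-indexed addressing modes, e.g.
  //   ldr x0, [x1, #16]!   // pre-indexed: x1 is updated before the load
  //   ldr x0, [x1], #16    // post-indexed: x1 is updated after the load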
1072
1073 // Trap.
1074 setOperationAction(Op: ISD::TRAP, VT: MVT::Other, Action: Legal);
1075 setOperationAction(Op: ISD::DEBUGTRAP, VT: MVT::Other, Action: Legal);
1076 setOperationAction(Op: ISD::UBSANTRAP, VT: MVT::Other, Action: Legal);
1077
1078 // We combine OR nodes for bitfield operations.
1079 setTargetDAGCombine(ISD::OR);
1080 // Try to create BICs for vector ANDs.
1081 setTargetDAGCombine(ISD::AND);
1082
1083 // llvm.init.trampoline and llvm.adjust.trampoline
1084 setOperationAction(Op: ISD::INIT_TRAMPOLINE, VT: MVT::Other, Action: Custom);
1085 setOperationAction(Op: ISD::ADJUST_TRAMPOLINE, VT: MVT::Other, Action: Custom);
1086
1087 // Vector add and sub nodes may conceal a high-half opportunity.
1088 // Also, try to fold ADD into CSINC/CSINV..
1089 setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP,
1090 ISD::UINT_TO_FP});
1091
1092 setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
1093 ISD::FP_TO_UINT_SAT, ISD::FADD});
1094
1095 // Try and combine setcc with csel
1096 setTargetDAGCombine(ISD::SETCC);
1097
1098 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1099
1100 setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND,
1101 ISD::SIGN_EXTEND_INREG, ISD::CONCAT_VECTORS,
1102 ISD::EXTRACT_SUBVECTOR, ISD::INSERT_SUBVECTOR,
1103 ISD::STORE, ISD::BUILD_VECTOR});
1104 setTargetDAGCombine(ISD::TRUNCATE);
1105 setTargetDAGCombine(ISD::LOAD);
1106
1107 setTargetDAGCombine(ISD::MSTORE);
1108
1109 setTargetDAGCombine(ISD::MUL);
1110
1111 setTargetDAGCombine({ISD::SELECT, ISD::VSELECT});
1112
1113 setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN,
1114 ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
1115 ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});
1116
1117 setTargetDAGCombine({ISD::MGATHER, ISD::MSCATTER});
1118
1119 setTargetDAGCombine(ISD::FP_EXTEND);
1120
1121 setTargetDAGCombine(ISD::GlobalAddress);
1122
1123 setTargetDAGCombine(ISD::CTLZ);
1124
1125 setTargetDAGCombine(ISD::VECREDUCE_AND);
1126 setTargetDAGCombine(ISD::VECREDUCE_OR);
1127 setTargetDAGCombine(ISD::VECREDUCE_XOR);
1128
1129 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
1130
1131 // In case of strict alignment, avoid an excessive number of byte wide stores.
1132 MaxStoresPerMemsetOptSize = 8;
1133 MaxStoresPerMemset =
1134 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1135
1136 MaxGluedStoresPerMemcpy = 4;
1137 MaxStoresPerMemcpyOptSize = 4;
1138 MaxStoresPerMemcpy =
1139 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1140
1141 MaxStoresPerMemmoveOptSize = 4;
1142 MaxStoresPerMemmove = 4;
1143
1144 MaxLoadsPerMemcmpOptSize = 4;
1145 MaxLoadsPerMemcmp =
1146 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1147
1148 setStackPointerRegisterToSaveRestore(AArch64::SP);
1149
1150 setSchedulingPreference(Sched::Hybrid);
1151
1152 EnableExtLdPromotion = true;
1153
1154 // Set required alignment.
1155 setMinFunctionAlignment(Align(4));
1156 // Set preferred alignments.
1157
1158 // Don't align loops on Windows. The SEH unwind info generation needs to
1159 // know the exact length of functions before the alignments have been
1160 // expanded.
1161 if (!Subtarget->isTargetWindows())
1162 setPrefLoopAlignment(STI.getPrefLoopAlignment());
1163 setMaxBytesForAlignment(STI.getMaxBytesForLoopAlignment());
1164 setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
1165
1166 // Only change the limit for entries in a jump table if specified by
1167 // the sub target, but not at the command line.
1168 unsigned MaxJT = STI.getMaximumJumpTableSize();
1169 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1170 setMaximumJumpTableSize(MaxJT);
1171
1172 setHasExtractBitsInsn(true);
1173
1174 setMaxDivRemBitWidthSupported(128);
1175
1176 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::Other, Action: Custom);
1177
1178 if (Subtarget->isNeonAvailable()) {
1179 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1180 // silliness like this:
1181 // clang-format off
1182 for (auto Op :
1183 {ISD::SELECT, ISD::SELECT_CC,
1184 ISD::BR_CC, ISD::FADD, ISD::FSUB,
1185 ISD::FMUL, ISD::FDIV, ISD::FMA,
1186 ISD::FNEG, ISD::FABS, ISD::FCEIL,
1187 ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
1188 ISD::FSIN, ISD::FCOS, ISD::FTAN,
1189 ISD::FASIN, ISD::FACOS, ISD::FATAN,
1190 ISD::FSINH, ISD::FCOSH, ISD::FTANH,
1191 ISD::FPOW, ISD::FLOG, ISD::FLOG2,
1192 ISD::FLOG10, ISD::FEXP, ISD::FEXP2,
1193 ISD::FEXP10, ISD::FRINT, ISD::FROUND,
1194 ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM,
1195 ISD::FMAXNUM, ISD::FMINIMUM, ISD::FMAXIMUM,
1196 ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
1197 ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FCEIL,
1198 ISD::STRICT_FFLOOR, ISD::STRICT_FSQRT, ISD::STRICT_FRINT,
1199 ISD::STRICT_FNEARBYINT, ISD::STRICT_FROUND, ISD::STRICT_FTRUNC,
1200 ISD::STRICT_FROUNDEVEN, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM,
1201 ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM})
1202 setOperationAction(Op, VT: MVT::v1f64, Action: Expand);
1203 // clang-format on
1204 for (auto Op :
1205 {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP,
1206 ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL,
1207 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT,
1208 ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND})
1209 setOperationAction(Op, VT: MVT::v1i64, Action: Expand);
1210
1211 // AArch64 doesn't have direct vector -> f32 conversion instructions for
1212 // elements smaller than i32, so promote the input to i32 first.
1213 setOperationPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::v4i8, DestVT: MVT::v4i32);
1214 setOperationPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::v4i8, DestVT: MVT::v4i32);
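// For illustration, a v4i8 -> v4f32 conversion is therefore expected to be
// handled roughly as:
//   v4i8 --(sign/zero extend)--> v4i32 --(scvtf/ucvtf)--> v4f32
// with the extend coming from the promotion above (instruction names are
// indicative only).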
1215
1216 // Similarly, there is no direct i32 -> f64 vector conversion instruction,
1217 // nor a direct i32 -> f16 vector conversion. Set these to Custom so the
1218 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16.
1219 for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
1220 ISD::STRICT_UINT_TO_FP})
1221 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1222 setOperationAction(Op, VT, Action: Custom);
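// As a sketch of the intent, a v4i32 -> v4f16 uint_to_fp would become roughly:
//   v4i32 --(ucvtf)--> v4f32 --(fcvtn)--> v4f16
// (the exact node sequence is chosen by the custom lowering).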
1223
1224 if (Subtarget->hasFullFP16()) {
1225 setOperationAction(Op: ISD::ConstantFP, VT: MVT::f16, Action: Legal);
1226 setOperationAction(Op: ISD::ConstantFP, VT: MVT::bf16, Action: Legal);
1227
1228 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v8i8, Action: Custom);
1229 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v8i8, Action: Custom);
1230 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v16i8, Action: Custom);
1231 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v16i8, Action: Custom);
1232 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v4i16, Action: Custom);
1233 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v4i16, Action: Custom);
1234 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v8i16, Action: Custom);
1235 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v8i16, Action: Custom);
1236 } else {
1237 // When AArch64 doesn't have fullfp16 support, promote the input
1238 // to i32 first.
1239 setOperationPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::v8i8, DestVT: MVT::v8i32);
1240 setOperationPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::v8i8, DestVT: MVT::v8i32);
1241 setOperationPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::v16i8, DestVT: MVT::v16i32);
1242 setOperationPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::v16i8, DestVT: MVT::v16i32);
1243 setOperationPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::v4i16, DestVT: MVT::v4i32);
1244 setOperationPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::v4i16, DestVT: MVT::v4i32);
1245 setOperationPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::v8i16, DestVT: MVT::v8i32);
1246 setOperationPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::v8i16, DestVT: MVT::v8i32);
1247 }
1248
1249 setOperationAction(Op: ISD::CTLZ, VT: MVT::v1i64, Action: Expand);
1250 setOperationAction(Op: ISD::CTLZ, VT: MVT::v2i64, Action: Expand);
1251 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v8i8, Action: Legal);
1252 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v16i8, Action: Legal);
1253 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v2i32, Action: Custom);
1254 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v4i32, Action: Custom);
1255 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v1i64, Action: Custom);
1256 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v2i64, Action: Custom);
1257 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1258 setOperationAction(Op: ISD::UMAX, VT, Action: Custom);
1259 setOperationAction(Op: ISD::SMAX, VT, Action: Custom);
1260 setOperationAction(Op: ISD::UMIN, VT, Action: Custom);
1261 setOperationAction(Op: ISD::SMIN, VT, Action: Custom);
1262 }
1263
1264 // Custom handling for some quad-vector types to detect MULL.
1265 setOperationAction(Op: ISD::MUL, VT: MVT::v8i16, Action: Custom);
1266 setOperationAction(Op: ISD::MUL, VT: MVT::v4i32, Action: Custom);
1267 setOperationAction(Op: ISD::MUL, VT: MVT::v2i64, Action: Custom);
1268 setOperationAction(Op: ISD::MUL, VT: MVT::v4i16, Action: Custom);
1269 setOperationAction(Op: ISD::MUL, VT: MVT::v2i32, Action: Custom);
1270 setOperationAction(Op: ISD::MUL, VT: MVT::v1i64, Action: Custom);
1271
1272 // Saturates
1273 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1274 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1275 setOperationAction(Op: ISD::SADDSAT, VT, Action: Legal);
1276 setOperationAction(Op: ISD::UADDSAT, VT, Action: Legal);
1277 setOperationAction(Op: ISD::SSUBSAT, VT, Action: Legal);
1278 setOperationAction(Op: ISD::USUBSAT, VT, Action: Legal);
1279 }
1280
1281 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1282 MVT::v4i32}) {
1283 setOperationAction(Op: ISD::AVGFLOORS, VT, Action: Legal);
1284 setOperationAction(Op: ISD::AVGFLOORU, VT, Action: Legal);
1285 setOperationAction(Op: ISD::AVGCEILS, VT, Action: Legal);
1286 setOperationAction(Op: ISD::AVGCEILU, VT, Action: Legal);
1287 setOperationAction(Op: ISD::ABDS, VT, Action: Legal);
1288 setOperationAction(Op: ISD::ABDU, VT, Action: Legal);
1289 }
1290
1291 // Vector reductions
1292 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1293 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1294 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1295 setOperationAction(Op: ISD::VECREDUCE_FMAX, VT, Action: Legal);
1296 setOperationAction(Op: ISD::VECREDUCE_FMIN, VT, Action: Legal);
1297 setOperationAction(Op: ISD::VECREDUCE_FMAXIMUM, VT, Action: Legal);
1298 setOperationAction(Op: ISD::VECREDUCE_FMINIMUM, VT, Action: Legal);
1299
1300 setOperationAction(Op: ISD::VECREDUCE_FADD, VT, Action: Legal);
1301 }
1302 }
1303 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1304 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1305 setOperationAction(Op: ISD::VECREDUCE_ADD, VT, Action: Custom);
1306 setOperationAction(Op: ISD::VECREDUCE_SMAX, VT, Action: Custom);
1307 setOperationAction(Op: ISD::VECREDUCE_SMIN, VT, Action: Custom);
1308 setOperationAction(Op: ISD::VECREDUCE_UMAX, VT, Action: Custom);
1309 setOperationAction(Op: ISD::VECREDUCE_UMIN, VT, Action: Custom);
1310 setOperationAction(Op: ISD::VECREDUCE_AND, VT, Action: Custom);
1311 setOperationAction(Op: ISD::VECREDUCE_OR, VT, Action: Custom);
1312 setOperationAction(Op: ISD::VECREDUCE_XOR, VT, Action: Custom);
1313 }
1314 setOperationAction(Op: ISD::VECREDUCE_ADD, VT: MVT::v2i64, Action: Custom);
1315 setOperationAction(Op: ISD::VECREDUCE_AND, VT: MVT::v2i64, Action: Custom);
1316 setOperationAction(Op: ISD::VECREDUCE_OR, VT: MVT::v2i64, Action: Custom);
1317 setOperationAction(Op: ISD::VECREDUCE_XOR, VT: MVT::v2i64, Action: Custom);
1318
1319 setOperationAction(Op: ISD::ANY_EXTEND, VT: MVT::v4i32, Action: Legal);
1320 setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i16, Action: Expand);
1321 // Likewise, narrowing and extending vector loads/stores aren't handled
1322 // directly.
1323 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1324 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT, Action: Expand);
1325
1326 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1327 setOperationAction(Op: ISD::MULHS, VT, Action: Legal);
1328 setOperationAction(Op: ISD::MULHU, VT, Action: Legal);
1329 } else {
1330 setOperationAction(Op: ISD::MULHS, VT, Action: Expand);
1331 setOperationAction(Op: ISD::MULHU, VT, Action: Expand);
1332 }
1333 setOperationAction(Op: ISD::SMUL_LOHI, VT, Action: Expand);
1334 setOperationAction(Op: ISD::UMUL_LOHI, VT, Action: Expand);
1335
1336 setOperationAction(Op: ISD::BSWAP, VT, Action: Expand);
1337 setOperationAction(Op: ISD::CTTZ, VT, Action: Expand);
1338
1339 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1340 setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Expand);
1341 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1342 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1343 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1344 }
1345 }
1346
1347 // AArch64 has implementations of a lot of rounding-like FP operations.
1348 for (auto Op :
1349 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
1350 ISD::FROUND, ISD::FROUNDEVEN, ISD::STRICT_FFLOOR,
1351 ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL, ISD::STRICT_FRINT,
1352 ISD::STRICT_FTRUNC, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN}) {
1353 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1354 setOperationAction(Op, VT: Ty, Action: Legal);
1355 if (Subtarget->hasFullFP16())
1356 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1357 setOperationAction(Op, VT: Ty, Action: Legal);
1358 }
1359
1360 // LRINT and LLRINT.
1361 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1362 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1363 setOperationAction(Op, VT: Ty, Action: Custom);
1364 if (Subtarget->hasFullFP16())
1365 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1366 setOperationAction(Op, VT: Ty, Action: Custom);
1367 }
1368
1369 setTruncStoreAction(ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Custom);
1370
1371 setOperationAction(Op: ISD::BITCAST, VT: MVT::i2, Action: Custom);
1372 setOperationAction(Op: ISD::BITCAST, VT: MVT::i4, Action: Custom);
1373 setOperationAction(Op: ISD::BITCAST, VT: MVT::i8, Action: Custom);
1374 setOperationAction(Op: ISD::BITCAST, VT: MVT::i16, Action: Custom);
1375
1376 setOperationAction(Op: ISD::BITCAST, VT: MVT::v2i8, Action: Custom);
1377 setOperationAction(Op: ISD::BITCAST, VT: MVT::v2i16, Action: Custom);
1378 setOperationAction(Op: ISD::BITCAST, VT: MVT::v4i8, Action: Custom);
1379
1380 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Custom);
1381 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Custom);
1382 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Custom);
1383 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4i32, MemVT: MVT::v4i8, Action: Custom);
1384 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: MVT::v4i32, MemVT: MVT::v4i8, Action: Custom);
1385 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: MVT::v4i32, MemVT: MVT::v4i8, Action: Custom);
1386
1387 // ADDP custom lowering
1388 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1389 setOperationAction(Op: ISD::ADD, VT, Action: Custom);
1390 // FADDP custom lowering
1391 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1392 setOperationAction(Op: ISD::FADD, VT, Action: Custom);
1393 } else /* !isNeonAvailable */ {
1394 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1395 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1396 setOperationAction(Op, VT, Action: Expand);
1397
1398 if (VT.is128BitVector() || VT.is64BitVector()) {
1399 setOperationAction(Op: ISD::LOAD, VT, Action: Legal);
1400 setOperationAction(Op: ISD::STORE, VT, Action: Legal);
1401 setOperationAction(Op: ISD::BITCAST, VT,
1402 Action: Subtarget->isLittleEndian() ? Legal : Expand);
1403 }
1404 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1405 setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Expand);
1406 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1407 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1408 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1409 }
1410 }
1411 }
1412
1413 if (Subtarget->hasSME()) {
1414 setOperationAction(Op: ISD::INTRINSIC_W_CHAIN, VT: MVT::Other, Action: Custom);
1415 }
1416
1417 // FIXME: Move lowering for more nodes here if those are common between
1418 // SVE and SME.
1419 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1420 for (auto VT :
1421 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1422 setOperationAction(Op: ISD::SPLAT_VECTOR, VT, Action: Custom);
1423 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT, Action: Custom);
1424 setOperationAction(Op: ISD::VECTOR_DEINTERLEAVE, VT, Action: Custom);
1425 setOperationAction(Op: ISD::VECTOR_INTERLEAVE, VT, Action: Custom);
1426 }
1427 }
1428
1429 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1430 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1431 setOperationAction(Op: ISD::BITREVERSE, VT, Action: Custom);
1432 setOperationAction(Op: ISD::BSWAP, VT, Action: Custom);
1433 setOperationAction(Op: ISD::CTLZ, VT, Action: Custom);
1434 setOperationAction(Op: ISD::CTPOP, VT, Action: Custom);
1435 setOperationAction(Op: ISD::CTTZ, VT, Action: Custom);
1436 setOperationAction(Op: ISD::INSERT_SUBVECTOR, VT, Action: Custom);
1437 setOperationAction(Op: ISD::UINT_TO_FP, VT, Action: Custom);
1438 setOperationAction(Op: ISD::SINT_TO_FP, VT, Action: Custom);
1439 setOperationAction(Op: ISD::FP_TO_UINT, VT, Action: Custom);
1440 setOperationAction(Op: ISD::FP_TO_SINT, VT, Action: Custom);
1441 setOperationAction(Op: ISD::MLOAD, VT, Action: Custom);
1442 setOperationAction(Op: ISD::MUL, VT, Action: Custom);
1443 setOperationAction(Op: ISD::MULHS, VT, Action: Custom);
1444 setOperationAction(Op: ISD::MULHU, VT, Action: Custom);
1445 setOperationAction(Op: ISD::SPLAT_VECTOR, VT, Action: Legal);
1446 setOperationAction(Op: ISD::VECTOR_SPLICE, VT, Action: Custom);
1447 setOperationAction(Op: ISD::SELECT, VT, Action: Custom);
1448 setOperationAction(Op: ISD::SETCC, VT, Action: Custom);
1449 setOperationAction(Op: ISD::SDIV, VT, Action: Custom);
1450 setOperationAction(Op: ISD::UDIV, VT, Action: Custom);
1451 setOperationAction(Op: ISD::SMIN, VT, Action: Custom);
1452 setOperationAction(Op: ISD::UMIN, VT, Action: Custom);
1453 setOperationAction(Op: ISD::SMAX, VT, Action: Custom);
1454 setOperationAction(Op: ISD::UMAX, VT, Action: Custom);
1455 setOperationAction(Op: ISD::SHL, VT, Action: Custom);
1456 setOperationAction(Op: ISD::SRL, VT, Action: Custom);
1457 setOperationAction(Op: ISD::SRA, VT, Action: Custom);
1458 setOperationAction(Op: ISD::ABS, VT, Action: Custom);
1459 setOperationAction(Op: ISD::ABDS, VT, Action: Custom);
1460 setOperationAction(Op: ISD::ABDU, VT, Action: Custom);
1461 setOperationAction(Op: ISD::VECREDUCE_ADD, VT, Action: Custom);
1462 setOperationAction(Op: ISD::VECREDUCE_AND, VT, Action: Custom);
1463 setOperationAction(Op: ISD::VECREDUCE_OR, VT, Action: Custom);
1464 setOperationAction(Op: ISD::VECREDUCE_XOR, VT, Action: Custom);
1465 setOperationAction(Op: ISD::VECREDUCE_UMIN, VT, Action: Custom);
1466 setOperationAction(Op: ISD::VECREDUCE_UMAX, VT, Action: Custom);
1467 setOperationAction(Op: ISD::VECREDUCE_SMIN, VT, Action: Custom);
1468 setOperationAction(Op: ISD::VECREDUCE_SMAX, VT, Action: Custom);
1469 setOperationAction(Op: ISD::VECTOR_DEINTERLEAVE, VT, Action: Custom);
1470 setOperationAction(Op: ISD::VECTOR_INTERLEAVE, VT, Action: Custom);
1471
1472 setOperationAction(Op: ISD::UMUL_LOHI, VT, Action: Expand);
1473 setOperationAction(Op: ISD::SMUL_LOHI, VT, Action: Expand);
1474 setOperationAction(Op: ISD::SELECT_CC, VT, Action: Expand);
1475 setOperationAction(Op: ISD::ROTL, VT, Action: Expand);
1476 setOperationAction(Op: ISD::ROTR, VT, Action: Expand);
1477
1478 setOperationAction(Op: ISD::SADDSAT, VT, Action: Legal);
1479 setOperationAction(Op: ISD::UADDSAT, VT, Action: Legal);
1480 setOperationAction(Op: ISD::SSUBSAT, VT, Action: Legal);
1481 setOperationAction(Op: ISD::USUBSAT, VT, Action: Legal);
1482 setOperationAction(Op: ISD::UREM, VT, Action: Expand);
1483 setOperationAction(Op: ISD::SREM, VT, Action: Expand);
1484 setOperationAction(Op: ISD::SDIVREM, VT, Action: Expand);
1485 setOperationAction(Op: ISD::UDIVREM, VT, Action: Expand);
1486
1487 setOperationAction(Op: ISD::AVGFLOORS, VT, Action: Custom);
1488 setOperationAction(Op: ISD::AVGFLOORU, VT, Action: Custom);
1489 setOperationAction(Op: ISD::AVGCEILS, VT, Action: Custom);
1490 setOperationAction(Op: ISD::AVGCEILU, VT, Action: Custom);
1491
1492 if (!Subtarget->isLittleEndian())
1493 setOperationAction(Op: ISD::BITCAST, VT, Action: Expand);
1494
1495 if (Subtarget->hasSVE2() ||
1496 (Subtarget->hasSME() && Subtarget->isStreaming()))
1497 // For SLI/SRI.
1498 setOperationAction(Op: ISD::OR, VT, Action: Custom);
1499 }
1500
1501 // Illegal unpacked integer vector types.
1502 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1503 setOperationAction(Op: ISD::EXTRACT_SUBVECTOR, VT, Action: Custom);
1504 setOperationAction(Op: ISD::INSERT_SUBVECTOR, VT, Action: Custom);
1505 }
1506
1507 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1508 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1509 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1510 setOperationAction(Op: ISD::BITCAST, VT, Action: Custom);
1511
1512 for (auto VT :
1513 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1514 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1515 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT, Action: Legal);
1516
1517 for (auto VT :
1518 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1519 setOperationAction(Op: ISD::CONCAT_VECTORS, VT, Action: Custom);
1520 setOperationAction(Op: ISD::SELECT, VT, Action: Custom);
1521 setOperationAction(Op: ISD::SETCC, VT, Action: Custom);
1522 setOperationAction(Op: ISD::TRUNCATE, VT, Action: Custom);
1523 setOperationAction(Op: ISD::VECREDUCE_AND, VT, Action: Custom);
1524 setOperationAction(Op: ISD::VECREDUCE_OR, VT, Action: Custom);
1525 setOperationAction(Op: ISD::VECREDUCE_XOR, VT, Action: Custom);
1526
1527 setOperationAction(Op: ISD::SELECT_CC, VT, Action: Expand);
1528 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT, Action: Custom);
1529 setOperationAction(Op: ISD::INSERT_SUBVECTOR, VT, Action: Custom);
1530
1531 // There are no legal MVT::nxv16f## based types.
1532 if (VT != MVT::nxv16i1) {
1533 setOperationAction(Op: ISD::SINT_TO_FP, VT, Action: Custom);
1534 setOperationAction(Op: ISD::UINT_TO_FP, VT, Action: Custom);
1535 }
1536 }
1537
1538 // NEON doesn't support masked loads/stores, but SME and SVE do.
1539 for (auto VT :
1540 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1541 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1542 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1543 setOperationAction(Op: ISD::MLOAD, VT, Action: Custom);
1544 setOperationAction(Op: ISD::MSTORE, VT, Action: Custom);
1545 }
1546
1547 // Firstly, exclude all scalable vector extending loads/truncating stores,
1548 // covering both integer and floating-point scalable vectors.
1549 for (MVT VT : MVT::scalable_vector_valuetypes()) {
1550 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1551 setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Expand);
1552 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1553 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1554 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1555 }
1556 }
1557
1558 // Then, selectively enable those which we directly support.
1559 setTruncStoreAction(ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i8, Action: Legal);
1560 setTruncStoreAction(ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i16, Action: Legal);
1561 setTruncStoreAction(ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i32, Action: Legal);
1562 setTruncStoreAction(ValVT: MVT::nxv4i32, MemVT: MVT::nxv4i8, Action: Legal);
1563 setTruncStoreAction(ValVT: MVT::nxv4i32, MemVT: MVT::nxv4i16, Action: Legal);
1564 setTruncStoreAction(ValVT: MVT::nxv8i16, MemVT: MVT::nxv8i8, Action: Legal);
1565 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1566 setLoadExtAction(ExtType: Op, ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i8, Action: Legal);
1567 setLoadExtAction(ExtType: Op, ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i16, Action: Legal);
1568 setLoadExtAction(ExtType: Op, ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i32, Action: Legal);
1569 setLoadExtAction(ExtType: Op, ValVT: MVT::nxv4i32, MemVT: MVT::nxv4i8, Action: Legal);
1570 setLoadExtAction(ExtType: Op, ValVT: MVT::nxv4i32, MemVT: MVT::nxv4i16, Action: Legal);
1571 setLoadExtAction(ExtType: Op, ValVT: MVT::nxv8i16, MemVT: MVT::nxv8i8, Action: Legal);
1572 }
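// These legal combinations are meant to line up with SVE's native extending
// loads and truncating stores (LD1B/LD1H/LD1W- and ST1B/ST1H/ST1W-style
// accesses); the mapping noted here is descriptive rather than exhaustive.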
1573
1574 // SVE supports truncating stores of 64- and 128-bit vectors.
1575 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i8, Action: Custom);
1576 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i16, Action: Custom);
1577 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i32, Action: Custom);
1578 setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i8, Action: Custom);
1579 setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i16, Action: Custom);
1580
1581 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1582 MVT::nxv4f32, MVT::nxv2f64}) {
1583 setOperationAction(Op: ISD::CONCAT_VECTORS, VT, Action: Custom);
1584 setOperationAction(Op: ISD::INSERT_SUBVECTOR, VT, Action: Custom);
1585 setOperationAction(Op: ISD::MLOAD, VT, Action: Custom);
1586 setOperationAction(Op: ISD::SPLAT_VECTOR, VT, Action: Legal);
1587 setOperationAction(Op: ISD::SELECT, VT, Action: Custom);
1588 setOperationAction(Op: ISD::SETCC, VT, Action: Custom);
1589 setOperationAction(Op: ISD::FADD, VT, Action: Custom);
1590 setOperationAction(Op: ISD::FCOPYSIGN, VT, Action: Custom);
1591 setOperationAction(Op: ISD::FDIV, VT, Action: Custom);
1592 setOperationAction(Op: ISD::FMA, VT, Action: Custom);
1593 setOperationAction(Op: ISD::FMAXIMUM, VT, Action: Custom);
1594 setOperationAction(Op: ISD::FMAXNUM, VT, Action: Custom);
1595 setOperationAction(Op: ISD::FMINIMUM, VT, Action: Custom);
1596 setOperationAction(Op: ISD::FMINNUM, VT, Action: Custom);
1597 setOperationAction(Op: ISD::FMUL, VT, Action: Custom);
1598 setOperationAction(Op: ISD::FNEG, VT, Action: Custom);
1599 setOperationAction(Op: ISD::FSUB, VT, Action: Custom);
1600 setOperationAction(Op: ISD::FCEIL, VT, Action: Custom);
1601 setOperationAction(Op: ISD::FFLOOR, VT, Action: Custom);
1602 setOperationAction(Op: ISD::FNEARBYINT, VT, Action: Custom);
1603 setOperationAction(Op: ISD::FRINT, VT, Action: Custom);
1604 setOperationAction(Op: ISD::LRINT, VT, Action: Custom);
1605 setOperationAction(Op: ISD::LLRINT, VT, Action: Custom);
1606 setOperationAction(Op: ISD::FROUND, VT, Action: Custom);
1607 setOperationAction(Op: ISD::FROUNDEVEN, VT, Action: Custom);
1608 setOperationAction(Op: ISD::FTRUNC, VT, Action: Custom);
1609 setOperationAction(Op: ISD::FSQRT, VT, Action: Custom);
1610 setOperationAction(Op: ISD::FABS, VT, Action: Custom);
1611 setOperationAction(Op: ISD::FP_EXTEND, VT, Action: Custom);
1612 setOperationAction(Op: ISD::FP_ROUND, VT, Action: Custom);
1613 setOperationAction(Op: ISD::VECREDUCE_FADD, VT, Action: Custom);
1614 setOperationAction(Op: ISD::VECREDUCE_FMAX, VT, Action: Custom);
1615 setOperationAction(Op: ISD::VECREDUCE_FMIN, VT, Action: Custom);
1616 setOperationAction(Op: ISD::VECREDUCE_FMAXIMUM, VT, Action: Custom);
1617 setOperationAction(Op: ISD::VECREDUCE_FMINIMUM, VT, Action: Custom);
1618 setOperationAction(Op: ISD::VECTOR_SPLICE, VT, Action: Custom);
1619 setOperationAction(Op: ISD::VECTOR_DEINTERLEAVE, VT, Action: Custom);
1620 setOperationAction(Op: ISD::VECTOR_INTERLEAVE, VT, Action: Custom);
1621
1622 setOperationAction(Op: ISD::SELECT_CC, VT, Action: Expand);
1623 setOperationAction(Op: ISD::FREM, VT, Action: Expand);
1624 setOperationAction(Op: ISD::FPOW, VT, Action: Expand);
1625 setOperationAction(Op: ISD::FPOWI, VT, Action: Expand);
1626 setOperationAction(Op: ISD::FCOS, VT, Action: Expand);
1627 setOperationAction(Op: ISD::FSIN, VT, Action: Expand);
1628 setOperationAction(Op: ISD::FSINCOS, VT, Action: Expand);
1629 setOperationAction(Op: ISD::FTAN, VT, Action: Expand);
1630 setOperationAction(Op: ISD::FACOS, VT, Action: Expand);
1631 setOperationAction(Op: ISD::FASIN, VT, Action: Expand);
1632 setOperationAction(Op: ISD::FATAN, VT, Action: Expand);
1633 setOperationAction(Op: ISD::FCOSH, VT, Action: Expand);
1634 setOperationAction(Op: ISD::FSINH, VT, Action: Expand);
1635 setOperationAction(Op: ISD::FTANH, VT, Action: Expand);
1636 setOperationAction(Op: ISD::FEXP, VT, Action: Expand);
1637 setOperationAction(Op: ISD::FEXP2, VT, Action: Expand);
1638 setOperationAction(Op: ISD::FEXP10, VT, Action: Expand);
1639 setOperationAction(Op: ISD::FLOG, VT, Action: Expand);
1640 setOperationAction(Op: ISD::FLOG2, VT, Action: Expand);
1641 setOperationAction(Op: ISD::FLOG10, VT, Action: Expand);
1642
1643 setCondCodeAction(CCs: ISD::SETO, VT, Action: Expand);
1644 setCondCodeAction(CCs: ISD::SETOLT, VT, Action: Expand);
1645 setCondCodeAction(CCs: ISD::SETLT, VT, Action: Expand);
1646 setCondCodeAction(CCs: ISD::SETOLE, VT, Action: Expand);
1647 setCondCodeAction(CCs: ISD::SETLE, VT, Action: Expand);
1648 setCondCodeAction(CCs: ISD::SETULT, VT, Action: Expand);
1649 setCondCodeAction(CCs: ISD::SETULE, VT, Action: Expand);
1650 setCondCodeAction(CCs: ISD::SETUGE, VT, Action: Expand);
1651 setCondCodeAction(CCs: ISD::SETUGT, VT, Action: Expand);
1652 setCondCodeAction(CCs: ISD::SETUEQ, VT, Action: Expand);
1653 setCondCodeAction(CCs: ISD::SETONE, VT, Action: Expand);
1654
1655 if (!Subtarget->isLittleEndian())
1656 setOperationAction(Op: ISD::BITCAST, VT, Action: Expand);
1657 }
1658
1659 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1660 setOperationAction(Op: ISD::CONCAT_VECTORS, VT, Action: Custom);
1661 setOperationAction(Op: ISD::MLOAD, VT, Action: Custom);
1662 setOperationAction(Op: ISD::INSERT_SUBVECTOR, VT, Action: Custom);
1663 setOperationAction(Op: ISD::SPLAT_VECTOR, VT, Action: Legal);
1664 setOperationAction(Op: ISD::VECTOR_SPLICE, VT, Action: Custom);
1665
1666 if (!Subtarget->isLittleEndian())
1667 setOperationAction(Op: ISD::BITCAST, VT, Action: Expand);
1668 }
1669
1670 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::i8, Action: Custom);
1671 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::i16, Action: Custom);
1672
1673 // NEON doesn't support integer divides, but SVE does
1674 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1675 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1676 setOperationAction(Op: ISD::SDIV, VT, Action: Custom);
1677 setOperationAction(Op: ISD::UDIV, VT, Action: Custom);
1678 }
1679
1680 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1681 setOperationAction(Op: ISD::MUL, VT: MVT::v1i64, Action: Custom);
1682 setOperationAction(Op: ISD::MUL, VT: MVT::v2i64, Action: Custom);
1683
1684 // NOTE: Currently this has to happen after computeRegisterProperties rather
1685 // than the preferred option of combining it with the addRegisterClass call.
1686 if (Subtarget->useSVEForFixedLengthVectors()) {
1687 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
1688 if (useSVEForFixedLengthVectorVT(
1689 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1690 addTypeForFixedLengthSVE(VT);
1691 }
1692 for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) {
1693 if (useSVEForFixedLengthVectorVT(
1694 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1695 addTypeForFixedLengthSVE(VT);
1696 }
1697
1698 // 64-bit results can come from an input wider than a NEON register.
1699 for (auto VT : {MVT::v8i8, MVT::v4i16})
1700 setOperationAction(Op: ISD::TRUNCATE, VT, Action: Custom);
1701 setOperationAction(Op: ISD::FP_ROUND, VT: MVT::v4f16, Action: Custom);
1702
1703 // 128-bit results imply an input wider than a NEON register.
1704 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1705 setOperationAction(Op: ISD::TRUNCATE, VT, Action: Custom);
1706 for (auto VT : {MVT::v8f16, MVT::v4f32})
1707 setOperationAction(Op: ISD::FP_ROUND, VT, Action: Custom);
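// As a concrete illustration: with fixed-length SVE enabled, a v8i16 result
// can be the truncation of a v8i64 input, which is wider than any NEON
// register, so the operation is lowered through the SVE container types.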
1708
1709 // These operations are not supported on NEON but SVE can do them.
1710 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v1i64, Action: Custom);
1711 setOperationAction(Op: ISD::CTLZ, VT: MVT::v1i64, Action: Custom);
1712 setOperationAction(Op: ISD::CTLZ, VT: MVT::v2i64, Action: Custom);
1713 setOperationAction(Op: ISD::CTTZ, VT: MVT::v1i64, Action: Custom);
1714 setOperationAction(Op: ISD::MULHS, VT: MVT::v1i64, Action: Custom);
1715 setOperationAction(Op: ISD::MULHS, VT: MVT::v2i64, Action: Custom);
1716 setOperationAction(Op: ISD::MULHU, VT: MVT::v1i64, Action: Custom);
1717 setOperationAction(Op: ISD::MULHU, VT: MVT::v2i64, Action: Custom);
1718 setOperationAction(Op: ISD::SMAX, VT: MVT::v1i64, Action: Custom);
1719 setOperationAction(Op: ISD::SMAX, VT: MVT::v2i64, Action: Custom);
1720 setOperationAction(Op: ISD::SMIN, VT: MVT::v1i64, Action: Custom);
1721 setOperationAction(Op: ISD::SMIN, VT: MVT::v2i64, Action: Custom);
1722 setOperationAction(Op: ISD::UMAX, VT: MVT::v1i64, Action: Custom);
1723 setOperationAction(Op: ISD::UMAX, VT: MVT::v2i64, Action: Custom);
1724 setOperationAction(Op: ISD::UMIN, VT: MVT::v1i64, Action: Custom);
1725 setOperationAction(Op: ISD::UMIN, VT: MVT::v2i64, Action: Custom);
1726 setOperationAction(Op: ISD::VECREDUCE_SMAX, VT: MVT::v2i64, Action: Custom);
1727 setOperationAction(Op: ISD::VECREDUCE_SMIN, VT: MVT::v2i64, Action: Custom);
1728 setOperationAction(Op: ISD::VECREDUCE_UMAX, VT: MVT::v2i64, Action: Custom);
1729 setOperationAction(Op: ISD::VECREDUCE_UMIN, VT: MVT::v2i64, Action: Custom);
1730
1731 // Int operations with no NEON support.
1732 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1733 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1734 setOperationAction(Op: ISD::BITREVERSE, VT, Action: Custom);
1735 setOperationAction(Op: ISD::CTTZ, VT, Action: Custom);
1736 setOperationAction(Op: ISD::VECREDUCE_AND, VT, Action: Custom);
1737 setOperationAction(Op: ISD::VECREDUCE_OR, VT, Action: Custom);
1738 setOperationAction(Op: ISD::VECREDUCE_XOR, VT, Action: Custom);
1739 setOperationAction(Op: ISD::MULHS, VT, Action: Custom);
1740 setOperationAction(Op: ISD::MULHU, VT, Action: Custom);
1741 }
1742
1743 // Use SVE for vectors with more than 2 elements.
1744 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1745 setOperationAction(Op: ISD::VECREDUCE_FADD, VT, Action: Custom);
1746 }
1747
1748 setOperationPromotedToType(Opc: ISD::VECTOR_SPLICE, OrigVT: MVT::nxv2i1, DestVT: MVT::nxv2i64);
1749 setOperationPromotedToType(Opc: ISD::VECTOR_SPLICE, OrigVT: MVT::nxv4i1, DestVT: MVT::nxv4i32);
1750 setOperationPromotedToType(Opc: ISD::VECTOR_SPLICE, OrigVT: MVT::nxv8i1, DestVT: MVT::nxv8i16);
1751 setOperationPromotedToType(Opc: ISD::VECTOR_SPLICE, OrigVT: MVT::nxv16i1, DestVT: MVT::nxv16i8);
1752
1753 setOperationAction(Op: ISD::VSCALE, VT: MVT::i32, Action: Custom);
1754
1755 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1756 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT, Action: Custom);
1757 }
1758
1759 // Handle operations that are only available in non-streaming SVE mode.
1760 if (Subtarget->isSVEAvailable()) {
1761 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
1762 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1763 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1764 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
1765 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
1766 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1767 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1768 setOperationAction(Op: ISD::MGATHER, VT, Action: Custom);
1769 setOperationAction(Op: ISD::MSCATTER, VT, Action: Custom);
1770 }
1771
1772 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1773 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1774 MVT::v2f32, MVT::v4f32, MVT::v2f64})
1775 setOperationAction(Op: ISD::VECREDUCE_SEQ_FADD, VT, Action: Custom);
1776
1777 // Histcnt is SVE2 only
1778 if (Subtarget->hasSVE2())
1779 setOperationAction(Op: ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, VT: MVT::Other,
1780 Action: Custom);
1781 }
1782
1783
1784 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1785 // Only required for llvm.aarch64.mops.memset.tag
1786 setOperationAction(Op: ISD::INTRINSIC_W_CHAIN, VT: MVT::i8, Action: Custom);
1787 }
1788
1789 setOperationAction(Op: ISD::INTRINSIC_VOID, VT: MVT::Other, Action: Custom);
1790
1791 if (Subtarget->hasSVE()) {
1792 setOperationAction(Op: ISD::FLDEXP, VT: MVT::f64, Action: Custom);
1793 setOperationAction(Op: ISD::FLDEXP, VT: MVT::f32, Action: Custom);
1794 setOperationAction(Op: ISD::FLDEXP, VT: MVT::f16, Action: Custom);
1795 setOperationAction(Op: ISD::FLDEXP, VT: MVT::bf16, Action: Custom);
1796 }
1797
1798 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1799
1800 IsStrictFPEnabled = true;
1801 setMaxAtomicSizeInBitsSupported(128);
1802
1803 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1804 // it, but it's just a wrapper around ldexp.
1805 if (Subtarget->isTargetWindows()) {
1806 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1807 if (isOperationExpand(Op, VT: MVT::f32))
1808 setOperationAction(Op, VT: MVT::f32, Action: Promote);
1809 }
1810
1811 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1812 // isn't legal.
1813 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1814 if (isOperationExpand(Op, VT: MVT::f16))
1815 setOperationAction(Op, VT: MVT::f16, Action: Promote);
1816
1817 if (Subtarget->isWindowsArm64EC()) {
1818 // FIXME: are there intrinsics we need to exclude from this?
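// The loop below simply prefixes every libcall name with '#'; for example, a
// libcall named "memcpy" would be registered as "#memcpy" (name chosen purely
// for illustration).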
1819 for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
1820 auto code = static_cast<RTLIB::Libcall>(i);
1821 auto libcallName = getLibcallName(Call: code);
1822 if ((libcallName != nullptr) && (libcallName[0] != '#')) {
1823 setLibcallName(Call: code, Name: Saver.save(S: Twine("#") + libcallName).data());
1824 }
1825 }
1826 }
1827}
1828
1829void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1830 assert(VT.isVector() && "VT should be a vector type");
1831
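// For floating-point vector types, promote loads and stores to the
// equivalent integer vector type (e.g. a v4f32 load/store is performed as
// v4i32); the underlying data movement is identical either way.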
1832 if (VT.isFloatingPoint()) {
1833 MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
1834 setOperationPromotedToType(Opc: ISD::LOAD, OrigVT: VT, DestVT: PromoteTo);
1835 setOperationPromotedToType(Opc: ISD::STORE, OrigVT: VT, DestVT: PromoteTo);
1836 }
1837
1838 // Mark vector float intrinsics as expand.
1839 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1840 setOperationAction(Op: ISD::FSIN, VT, Action: Expand);
1841 setOperationAction(Op: ISD::FCOS, VT, Action: Expand);
1842 setOperationAction(Op: ISD::FTAN, VT, Action: Expand);
1843 setOperationAction(Op: ISD::FASIN, VT, Action: Expand);
1844 setOperationAction(Op: ISD::FACOS, VT, Action: Expand);
1845 setOperationAction(Op: ISD::FATAN, VT, Action: Expand);
1846 setOperationAction(Op: ISD::FSINH, VT, Action: Expand);
1847 setOperationAction(Op: ISD::FCOSH, VT, Action: Expand);
1848 setOperationAction(Op: ISD::FTANH, VT, Action: Expand);
1849 setOperationAction(Op: ISD::FPOW, VT, Action: Expand);
1850 setOperationAction(Op: ISD::FLOG, VT, Action: Expand);
1851 setOperationAction(Op: ISD::FLOG2, VT, Action: Expand);
1852 setOperationAction(Op: ISD::FLOG10, VT, Action: Expand);
1853 setOperationAction(Op: ISD::FEXP, VT, Action: Expand);
1854 setOperationAction(Op: ISD::FEXP2, VT, Action: Expand);
1855 setOperationAction(Op: ISD::FEXP10, VT, Action: Expand);
1856 }
1857
1858 // But we do support custom-lowering for FCOPYSIGN.
1859 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1860 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1861 VT == MVT::v8f16) &&
1862 Subtarget->hasFullFP16()))
1863 setOperationAction(Op: ISD::FCOPYSIGN, VT, Action: Custom);
1864
1865 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT, Action: Custom);
1866 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT, Action: Custom);
1867 setOperationAction(Op: ISD::BUILD_VECTOR, VT, Action: Custom);
1868 setOperationAction(Op: ISD::ZERO_EXTEND_VECTOR_INREG, VT, Action: Custom);
1869 setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT, Action: Custom);
1870 setOperationAction(Op: ISD::EXTRACT_SUBVECTOR, VT, Action: Custom);
1871 setOperationAction(Op: ISD::SRA, VT, Action: Custom);
1872 setOperationAction(Op: ISD::SRL, VT, Action: Custom);
1873 setOperationAction(Op: ISD::SHL, VT, Action: Custom);
1874 setOperationAction(Op: ISD::OR, VT, Action: Custom);
1875 setOperationAction(Op: ISD::SETCC, VT, Action: Custom);
1876 setOperationAction(Op: ISD::CONCAT_VECTORS, VT, Action: Legal);
1877
1878 setOperationAction(Op: ISD::SELECT, VT, Action: Expand);
1879 setOperationAction(Op: ISD::SELECT_CC, VT, Action: Expand);
1880 setOperationAction(Op: ISD::VSELECT, VT, Action: Expand);
1881 for (MVT InnerVT : MVT::all_valuetypes())
1882 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: InnerVT, MemVT: VT, Action: Expand);
1883
1884 // CNT supports only B element sizes; for wider elements, use UADDLP to widen.
1885 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1886 setOperationAction(Op: ISD::CTPOP, VT, Action: Custom);
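// Rough sketch of the intended lowering for wider elements: bitcast to bytes,
// count with CNT, then widen the per-byte counts with a chain of UADDLP, e.g.
//   ctpop v4i32 ~= CNT v16i8 -> UADDLP v8i16 -> UADDLP v4i32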
1887
1888 setOperationAction(Op: ISD::UDIV, VT, Action: Expand);
1889 setOperationAction(Op: ISD::SDIV, VT, Action: Expand);
1890 setOperationAction(Op: ISD::UREM, VT, Action: Expand);
1891 setOperationAction(Op: ISD::SREM, VT, Action: Expand);
1892 setOperationAction(Op: ISD::FREM, VT, Action: Expand);
1893
1894 for (unsigned Opcode :
1895 {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
1896 ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
1897 setOperationAction(Op: Opcode, VT, Action: Custom);
1898
1899 if (!VT.isFloatingPoint())
1900 setOperationAction(Op: ISD::ABS, VT, Action: Legal);
1901
1902 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1903 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1904 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1905 setOperationAction(Op: Opcode, VT, Action: Legal);
1906
1907 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1908 // NEON types.
1909 if (VT.isFloatingPoint() &&
1910 VT.getVectorElementType() != MVT::bf16 &&
1911 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1912 for (unsigned Opcode :
1913 {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
1914 ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM,
1915 ISD::STRICT_FMAXNUM, ISD::STRICT_FADD, ISD::STRICT_FSUB,
1916 ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA,
1917 ISD::STRICT_FSQRT})
1918 setOperationAction(Op: Opcode, VT, Action: Legal);
1919
1920 // Strict fp extend and trunc are legal
1921 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1922 setOperationAction(Op: ISD::STRICT_FP_EXTEND, VT, Action: Legal);
1923 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1924 setOperationAction(Op: ISD::STRICT_FP_ROUND, VT, Action: Legal);
1925
1926 // FIXME: We could potentially make use of the vector comparison instructions
1927 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1928 // complications:
1929 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1930 // so we would need to expand when the condition code doesn't match the
1931 // kind of comparison.
1932 // * Some kinds of comparison require more than one FCMXY instruction so
1933 // would need to be expanded instead.
1934 // * The lowering of the non-strict versions involves target-specific ISD
1935 // nodes so we would likely need to add strict versions of all of them and
1936 // handle them appropriately.
1937 setOperationAction(Op: ISD::STRICT_FSETCC, VT, Action: Expand);
1938 setOperationAction(Op: ISD::STRICT_FSETCCS, VT, Action: Expand);
1939
1940 if (Subtarget->isLittleEndian()) {
1941 for (unsigned im = (unsigned)ISD::PRE_INC;
1942 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1943 setIndexedLoadAction(IdxModes: im, VT, Action: Legal);
1944 setIndexedStoreAction(IdxModes: im, VT, Action: Legal);
1945 }
1946 }
1947
1948 if (Subtarget->hasD128()) {
1949 setOperationAction(Op: ISD::READ_REGISTER, VT: MVT::i128, Action: Custom);
1950 setOperationAction(Op: ISD::WRITE_REGISTER, VT: MVT::i128, Action: Custom);
1951 }
1952}
1953
1954bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
1955 EVT OpVT) const {
1956 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1957 if (!Subtarget->hasSVE())
1958 return true;
1959
1960 // We can only support legal predicate result types. We can use the SVE
1961 // whilelo instruction for generating fixed-width predicates too.
1962 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1963 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1964 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1965 return true;
1966
1967 // The whilelo instruction only works with i32 or i64 scalar inputs.
1968 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1969 return true;
1970
1971 return false;
1972}
1973
1974bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
1975 if (!Subtarget->isSVEorStreamingSVEAvailable())
1976 return true;
1977
1978 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
1979 // also support fixed-width predicates.
1980 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
1981 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
1982 VT != MVT::v4i1 && VT != MVT::v2i1;
1983}
1984
1985void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1986 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1987
1988 // By default everything must be expanded.
1989 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1990 setOperationAction(Op, VT, Action: Expand);
1991
1992 if (VT.isFloatingPoint()) {
1993 setCondCodeAction(CCs: ISD::SETO, VT, Action: Expand);
1994 setCondCodeAction(CCs: ISD::SETOLT, VT, Action: Expand);
1995 setCondCodeAction(CCs: ISD::SETOLE, VT, Action: Expand);
1996 setCondCodeAction(CCs: ISD::SETULT, VT, Action: Expand);
1997 setCondCodeAction(CCs: ISD::SETULE, VT, Action: Expand);
1998 setCondCodeAction(CCs: ISD::SETUGE, VT, Action: Expand);
1999 setCondCodeAction(CCs: ISD::SETUGT, VT, Action: Expand);
2000 setCondCodeAction(CCs: ISD::SETUEQ, VT, Action: Expand);
2001 setCondCodeAction(CCs: ISD::SETONE, VT, Action: Expand);
2002 }
2003
2004 TargetLoweringBase::LegalizeAction Default =
2005 VT == MVT::v1f64 ? Expand : Custom;
2006
2007 // Mark integer truncating stores/extending loads as having custom lowering
2008 if (VT.isInteger()) {
2009 MVT InnerVT = VT.changeVectorElementType(EltVT: MVT::i8);
2010 while (InnerVT != VT) {
2011 setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Default);
2012 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Default);
2013 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Default);
2014 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Default);
2015 InnerVT = InnerVT.changeVectorElementType(
2016 EltVT: MVT::getIntegerVT(BitWidth: 2 * InnerVT.getScalarSizeInBits()));
2017 }
2018 }
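// For example (a sketch), with VT == v8i32 the loop above marks the v8i8 and
// v8i16 memory types with the chosen Default action, stopping once InnerVT
// reaches VT itself.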
2019
2020 // Mark floating-point truncating stores/extending loads as having custom
2021 // lowering
2022 if (VT.isFloatingPoint()) {
2023 MVT InnerVT = VT.changeVectorElementType(EltVT: MVT::f16);
2024 while (InnerVT != VT) {
2025 setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Custom);
2026 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Default);
2027 InnerVT = InnerVT.changeVectorElementType(
2028 EltVT: MVT::getFloatingPointVT(BitWidth: 2 * InnerVT.getScalarSizeInBits()));
2029 }
2030 }
2031
2032 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2033 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2034
2035 // Lower fixed length vector operations to scalable equivalents.
2036 setOperationAction(Op: ISD::ABS, VT, Action: Default);
2037 setOperationAction(Op: ISD::ADD, VT, Action: Default);
2038 setOperationAction(Op: ISD::AND, VT, Action: Default);
2039 setOperationAction(Op: ISD::ANY_EXTEND, VT, Action: Default);
2040 setOperationAction(Op: ISD::BITCAST, VT, Action: PreferNEON ? Legal : Default);
2041 setOperationAction(Op: ISD::BITREVERSE, VT, Action: Default);
2042 setOperationAction(Op: ISD::BSWAP, VT, Action: Default);
2043 setOperationAction(Op: ISD::BUILD_VECTOR, VT, Action: Default);
2044 setOperationAction(Op: ISD::CONCAT_VECTORS, VT, Action: Default);
2045 setOperationAction(Op: ISD::CTLZ, VT, Action: Default);
2046 setOperationAction(Op: ISD::CTPOP, VT, Action: Default);
2047 setOperationAction(Op: ISD::CTTZ, VT, Action: Default);
2048 setOperationAction(Op: ISD::EXTRACT_SUBVECTOR, VT, Action: Default);
2049 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT, Action: Default);
2050 setOperationAction(Op: ISD::FABS, VT, Action: Default);
2051 setOperationAction(Op: ISD::FADD, VT, Action: Default);
2052 setOperationAction(Op: ISD::FCEIL, VT, Action: Default);
2053 setOperationAction(Op: ISD::FCOPYSIGN, VT, Action: Default);
2054 setOperationAction(Op: ISD::FDIV, VT, Action: Default);
2055 setOperationAction(Op: ISD::FFLOOR, VT, Action: Default);
2056 setOperationAction(Op: ISD::FMA, VT, Action: Default);
2057 setOperationAction(Op: ISD::FMAXIMUM, VT, Action: Default);
2058 setOperationAction(Op: ISD::FMAXNUM, VT, Action: Default);
2059 setOperationAction(Op: ISD::FMINIMUM, VT, Action: Default);
2060 setOperationAction(Op: ISD::FMINNUM, VT, Action: Default);
2061 setOperationAction(Op: ISD::FMUL, VT, Action: Default);
2062 setOperationAction(Op: ISD::FNEARBYINT, VT, Action: Default);
2063 setOperationAction(Op: ISD::FNEG, VT, Action: Default);
2064 setOperationAction(Op: ISD::FP_EXTEND, VT, Action: Default);
2065 setOperationAction(Op: ISD::FP_ROUND, VT, Action: Default);
2066 setOperationAction(Op: ISD::FP_TO_SINT, VT, Action: Default);
2067 setOperationAction(Op: ISD::FP_TO_UINT, VT, Action: Default);
2068 setOperationAction(Op: ISD::FRINT, VT, Action: Default);
2069 setOperationAction(Op: ISD::LRINT, VT, Action: Default);
2070 setOperationAction(Op: ISD::LLRINT, VT, Action: Default);
2071 setOperationAction(Op: ISD::FROUND, VT, Action: Default);
2072 setOperationAction(Op: ISD::FROUNDEVEN, VT, Action: Default);
2073 setOperationAction(Op: ISD::FSQRT, VT, Action: Default);
2074 setOperationAction(Op: ISD::FSUB, VT, Action: Default);
2075 setOperationAction(Op: ISD::FTRUNC, VT, Action: Default);
2076 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT, Action: Default);
2077 setOperationAction(Op: ISD::LOAD, VT, Action: PreferNEON ? Legal : Default);
2078 setOperationAction(Op: ISD::MGATHER, VT, Action: PreferSVE ? Default : Expand);
2079 setOperationAction(Op: ISD::MLOAD, VT, Action: Default);
2080 setOperationAction(Op: ISD::MSCATTER, VT, Action: PreferSVE ? Default : Expand);
2081 setOperationAction(Op: ISD::MSTORE, VT, Action: Default);
2082 setOperationAction(Op: ISD::MUL, VT, Action: Default);
2083 setOperationAction(Op: ISD::MULHS, VT, Action: Default);
2084 setOperationAction(Op: ISD::MULHU, VT, Action: Default);
2085 setOperationAction(Op: ISD::OR, VT, Action: Default);
2086 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT, Action: PreferNEON ? Legal : Expand);
2087 setOperationAction(Op: ISD::SDIV, VT, Action: Default);
2088 setOperationAction(Op: ISD::SELECT, VT, Action: Default);
2089 setOperationAction(Op: ISD::SETCC, VT, Action: Default);
2090 setOperationAction(Op: ISD::SHL, VT, Action: Default);
2091 setOperationAction(Op: ISD::SIGN_EXTEND, VT, Action: Default);
2092 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT, Action: Default);
2093 setOperationAction(Op: ISD::SINT_TO_FP, VT, Action: Default);
2094 setOperationAction(Op: ISD::SMAX, VT, Action: Default);
2095 setOperationAction(Op: ISD::SMIN, VT, Action: Default);
2096 setOperationAction(Op: ISD::SPLAT_VECTOR, VT, Action: Default);
2097 setOperationAction(Op: ISD::SRA, VT, Action: Default);
2098 setOperationAction(Op: ISD::SRL, VT, Action: Default);
2099 setOperationAction(Op: ISD::STORE, VT, Action: PreferNEON ? Legal : Default);
2100 setOperationAction(Op: ISD::SUB, VT, Action: Default);
2101 setOperationAction(Op: ISD::TRUNCATE, VT, Action: Default);
2102 setOperationAction(Op: ISD::UDIV, VT, Action: Default);
2103 setOperationAction(Op: ISD::UINT_TO_FP, VT, Action: Default);
2104 setOperationAction(Op: ISD::UMAX, VT, Action: Default);
2105 setOperationAction(Op: ISD::UMIN, VT, Action: Default);
2106 setOperationAction(Op: ISD::VECREDUCE_ADD, VT, Action: Default);
2107 setOperationAction(Op: ISD::VECREDUCE_AND, VT, Action: Default);
2108 setOperationAction(Op: ISD::VECREDUCE_FADD, VT, Action: Default);
2109 setOperationAction(Op: ISD::VECREDUCE_FMAX, VT, Action: Default);
2110 setOperationAction(Op: ISD::VECREDUCE_FMIN, VT, Action: Default);
2111 setOperationAction(Op: ISD::VECREDUCE_FMAXIMUM, VT, Action: Default);
2112 setOperationAction(Op: ISD::VECREDUCE_FMINIMUM, VT, Action: Default);
2113 setOperationAction(Op: ISD::VECREDUCE_OR, VT, Action: Default);
2114 setOperationAction(Op: ISD::VECREDUCE_SEQ_FADD, VT, Action: PreferSVE ? Default : Expand);
2115 setOperationAction(Op: ISD::VECREDUCE_SMAX, VT, Action: Default);
2116 setOperationAction(Op: ISD::VECREDUCE_SMIN, VT, Action: Default);
2117 setOperationAction(Op: ISD::VECREDUCE_UMAX, VT, Action: Default);
2118 setOperationAction(Op: ISD::VECREDUCE_UMIN, VT, Action: Default);
2119 setOperationAction(Op: ISD::VECREDUCE_XOR, VT, Action: Default);
2120 setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT, Action: Default);
2121 setOperationAction(Op: ISD::VECTOR_SPLICE, VT, Action: Default);
2122 setOperationAction(Op: ISD::VSELECT, VT, Action: Default);
2123 setOperationAction(Op: ISD::XOR, VT, Action: Default);
2124 setOperationAction(Op: ISD::ZERO_EXTEND, VT, Action: Default);
2125}
2126
2127void AArch64TargetLowering::addDRType(MVT VT) {
2128 addRegisterClass(VT, RC: &AArch64::FPR64RegClass);
2129 if (Subtarget->isNeonAvailable())
2130 addTypeForNEON(VT);
2131}
2132
2133void AArch64TargetLowering::addQRType(MVT VT) {
2134 addRegisterClass(VT, RC: &AArch64::FPR128RegClass);
2135 if (Subtarget->isNeonAvailable())
2136 addTypeForNEON(VT);
2137}
2138
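// The SETCC result type is i32 for scalars, an i1 predicate vector for
// scalable vectors (e.g. nxv4i32 -> nxv4i1), and the same-shaped integer
// vector for fixed-length vectors (e.g. v4f32 -> v4i32).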
2139EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
2140 LLVMContext &C, EVT VT) const {
2141 if (!VT.isVector())
2142 return MVT::i32;
2143 if (VT.isScalableVector())
2144 return EVT::getVectorVT(Context&: C, VT: MVT::i1, EC: VT.getVectorElementCount());
2145 return VT.changeVectorElementTypeToInteger();
2146}
2147
2148// isIntImmediate - This method tests to see if the node is a constant
2149 // operand. If so, Imm receives the value.
2150static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2151 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(Val: N)) {
2152 Imm = C->getZExtValue();
2153 return true;
2154 }
2155 return false;
2156}
2157
2158// isOpcWithIntImmediate - This method tests to see if the node is a specific
2159 // opcode and that it has an immediate integer right operand.
2160 // If so, Imm receives the value.
2161static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2162 uint64_t &Imm) {
2163 return N->getOpcode() == Opc &&
2164 isIntImmediate(N: N->getOperand(Num: 1).getNode(), Imm);
2165}
2166
2167static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2168 const APInt &Demanded,
2169 TargetLowering::TargetLoweringOpt &TLO,
2170 unsigned NewOpc) {
2171 uint64_t OldImm = Imm, NewImm, Enc;
2172 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2173
2174 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2175 // bimm64.
2176 if (Imm == 0 || Imm == Mask ||
2177 AArch64_AM::isLogicalImmediate(imm: Imm & Mask, regSize: Size))
2178 return false;
2179
2180 unsigned EltSize = Size;
2181 uint64_t DemandedBits = Demanded.getZExtValue();
2182
2183 // Clear bits that are not demanded.
2184 Imm &= DemandedBits;
2185
2186 while (true) {
2187 // The goal here is to set the non-demanded bits in a way that minimizes
2188 // the number of transitions between 0 and 1. In order to achieve this goal,
2189 // we set the non-demanded bits to the value of the preceding demanded bits.
2190 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2191 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2192 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2193 // The final result is 0b11000011.
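// Worked example for the pattern above (treating the element as 8 bits):
//   DemandedBits    = 0b01100101   Imm (after masking) = 0b01000001
//   NonDemandedBits = 0b10011010
//   InvertedImm     = ~Imm & DemandedBits = 0b00100100
//   RotatedImm      = 0b00001000   (left-rotate by 1, masked to NonDemandedBits)
//   Sum             = RotatedImm + NonDemandedBits = 0b10100010   (Carry = 0)
//   Ones            = Sum & NonDemandedBits = 0b10000010
//   NewImm          = Imm | Ones = 0b11000011
// ~NewImm within the 8-bit mask (0b00111100) is a shifted mask, so the search
// stops on the first iteration.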
2194 uint64_t NonDemandedBits = ~DemandedBits;
2195 uint64_t InvertedImm = ~Imm & DemandedBits;
2196 uint64_t RotatedImm =
2197 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2198 NonDemandedBits;
2199 uint64_t Sum = RotatedImm + NonDemandedBits;
2200 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2201 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2202 NewImm = (Imm | Ones) & Mask;
2203
2204 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2205 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2206 // we halve the element size and continue the search.
2207 if (isShiftedMask_64(Value: NewImm) || isShiftedMask_64(Value: ~(NewImm | ~Mask)))
2208 break;
2209
2210 // We cannot shrink the element size any further if it is 2 bits.
2211 if (EltSize == 2)
2212 return false;
2213
2214 EltSize /= 2;
2215 Mask >>= EltSize;
2216 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2217
2218 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2219 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2220 return false;
2221
2222 // Merge the upper and lower halves of Imm and DemandedBits.
2223 Imm |= Hi;
2224 DemandedBits |= DemandedBitsHi;
2225 }
2226
2227 ++NumOptimizedImms;
2228
2229 // Replicate the element across the register width.
2230 while (EltSize < Size) {
2231 NewImm |= NewImm << EltSize;
2232 EltSize *= 2;
2233 }
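// E.g. if the search settled on EltSize == 8 with NewImm == 0b11000011, this
// replication would yield 0xC3C3C3C3 for a 32-bit operation (illustrative
// values only).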
2234
2235 (void)OldImm;
2236 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2237 "demanded bits should never be altered");
2238 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2239
2240 // Create the new constant immediate node.
2241 EVT VT = Op.getValueType();
2242 SDLoc DL(Op);
2243 SDValue New;
2244
2245 // If the new constant immediate is all-zeros or all-ones, let the target
2246 // independent DAG combine optimize this node.
2247 if (NewImm == 0 || NewImm == OrigMask) {
2248 New = TLO.DAG.getNode(Opcode: Op.getOpcode(), DL, VT, N1: Op.getOperand(i: 0),
2249 N2: TLO.DAG.getConstant(Val: NewImm, DL, VT));
2250 // Otherwise, create a machine node so that target independent DAG combine
2251 // doesn't undo this optimization.
2252 } else {
2253 Enc = AArch64_AM::encodeLogicalImmediate(imm: NewImm, regSize: Size);
2254 SDValue EncConst = TLO.DAG.getTargetConstant(Val: Enc, DL, VT);
2255 New = SDValue(
2256 TLO.DAG.getMachineNode(Opcode: NewOpc, dl: DL, VT, Op1: Op.getOperand(i: 0), Op2: EncConst), 0);
2257 }
2258
2259 return TLO.CombineTo(O: Op, N: New);
2260}
2261
2262bool AArch64TargetLowering::targetShrinkDemandedConstant(
2263 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2264 TargetLoweringOpt &TLO) const {
2265 // Delay this optimization to as late as possible.
2266 if (!TLO.LegalOps)
2267 return false;
2268
2269 if (!EnableOptimizeLogicalImm)
2270 return false;
2271
2272 EVT VT = Op.getValueType();
2273 if (VT.isVector())
2274 return false;
2275
2276 unsigned Size = VT.getSizeInBits();
2277 assert((Size == 32 || Size == 64) &&
2278 "i32 or i64 is expected after legalization.");
2279
2280 // Exit early if we demand all bits.
2281 if (DemandedBits.popcount() == Size)
2282 return false;
2283
2284 unsigned NewOpc;
2285 switch (Op.getOpcode()) {
2286 default:
2287 return false;
2288 case ISD::AND:
2289 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2290 break;
2291 case ISD::OR:
2292 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2293 break;
2294 case ISD::XOR:
2295 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2296 break;
2297 }
2298 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1));
2299 if (!C)
2300 return false;
2301 uint64_t Imm = C->getZExtValue();
2302 return optimizeLogicalImm(Op, Size, Imm, Demanded: DemandedBits, TLO, NewOpc);
2303}
2304
2305/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2306 /// Mask are known to be either zero or one and return them in Known.
2307void AArch64TargetLowering::computeKnownBitsForTargetNode(
2308 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2309 const SelectionDAG &DAG, unsigned Depth) const {
2310 switch (Op.getOpcode()) {
2311 default:
2312 break;
2313 case AArch64ISD::DUP: {
2314 SDValue SrcOp = Op.getOperand(i: 0);
2315 Known = DAG.computeKnownBits(Op: SrcOp, Depth: Depth + 1);
2316 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2317 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2318 "Expected DUP implicit truncation");
2319 Known = Known.trunc(BitWidth: Op.getScalarValueSizeInBits());
2320 }
2321 break;
2322 }
2323 case AArch64ISD::CSEL: {
2324 KnownBits Known2;
2325 Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1);
2326 Known2 = DAG.computeKnownBits(Op: Op->getOperand(Num: 1), Depth: Depth + 1);
2327 Known = Known.intersectWith(RHS: Known2);
2328 break;
2329 }
2330 case AArch64ISD::BICi: {
2331 // Compute the bit cleared value.
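    // For example, an immediate byte of 0xff with a shift of 8 yields a mask
    // that clears bits 15..8 of each element.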
2332 uint64_t Mask =
2333 ~(Op->getConstantOperandVal(Num: 1) << Op->getConstantOperandVal(Num: 2));
2334 Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1);
2335 Known &= KnownBits::makeConstant(C: APInt(Known.getBitWidth(), Mask));
2336 break;
2337 }
2338 case AArch64ISD::VLSHR: {
2339 KnownBits Known2;
2340 Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1);
2341 Known2 = DAG.computeKnownBits(Op: Op->getOperand(Num: 1), Depth: Depth + 1);
2342 Known = KnownBits::lshr(LHS: Known, RHS: Known2);
2343 break;
2344 }
2345 case AArch64ISD::VASHR: {
2346 KnownBits Known2;
2347 Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1);
2348 Known2 = DAG.computeKnownBits(Op: Op->getOperand(Num: 1), Depth: Depth + 1);
2349 Known = KnownBits::ashr(LHS: Known, RHS: Known2);
2350 break;
2351 }
2352 case AArch64ISD::VSHL: {
2353 KnownBits Known2;
2354 Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1);
2355 Known2 = DAG.computeKnownBits(Op: Op->getOperand(Num: 1), Depth: Depth + 1);
2356 Known = KnownBits::shl(LHS: Known, RHS: Known2);
2357 break;
2358 }
2359 case AArch64ISD::MOVI: {
2360 Known = KnownBits::makeConstant(
2361 C: APInt(Known.getBitWidth(), Op->getConstantOperandVal(Num: 0)));
2362 break;
2363 }
2364 case AArch64ISD::LOADgot:
2365 case AArch64ISD::ADDlow: {
2366 if (!Subtarget->isTargetILP32())
2367 break;
    // In ILP32 mode all valid pointers are in the low 4GB of the address space.
2369 Known.Zero = APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32);
2370 break;
2371 }
2372 case AArch64ISD::ASSERT_ZEXT_BOOL: {
2373 Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1);
2374 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2375 break;
2376 }
2377 case ISD::INTRINSIC_W_CHAIN: {
2378 Intrinsic::ID IntID =
2379 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(Num: 1));
2380 switch (IntID) {
2381 default: return;
2382 case Intrinsic::aarch64_ldaxr:
2383 case Intrinsic::aarch64_ldxr: {
2384 unsigned BitWidth = Known.getBitWidth();
2385 EVT VT = cast<MemIntrinsicSDNode>(Val: Op)->getMemoryVT();
2386 unsigned MemBits = VT.getScalarSizeInBits();
2387 Known.Zero |= APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - MemBits);
2388 return;
2389 }
2390 }
2391 break;
2392 }
2393 case ISD::INTRINSIC_WO_CHAIN:
2394 case ISD::INTRINSIC_VOID: {
2395 unsigned IntNo = Op.getConstantOperandVal(i: 0);
2396 switch (IntNo) {
2397 default:
2398 break;
2399 case Intrinsic::aarch64_neon_uaddlv: {
2400 MVT VT = Op.getOperand(i: 1).getValueType().getSimpleVT();
2401 unsigned BitWidth = Known.getBitWidth();
2402 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
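        // The unsigned sum of 8 (or 16) i8 elements is at most 8 * 255 = 2040
        // (or 16 * 255 = 4080), which fits in 11 (or 12) bits.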
2403 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2404 assert(BitWidth >= Bound && "Unexpected width!");
2405 APInt Mask = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - Bound);
2406 Known.Zero |= Mask;
2407 }
2408 break;
2409 }
2410 case Intrinsic::aarch64_neon_umaxv:
2411 case Intrinsic::aarch64_neon_uminv: {
      // Figure out the datatype of the vector operand. The UMINV/UMAXV
      // instructions zero-extend their result, so we can mark all bits above
      // the element size as known zero. 32-bit or larger element types don't
      // need this, as those are legal types and will be handled by isel
      // directly.
2416 MVT VT = Op.getOperand(i: 1).getValueType().getSimpleVT();
2417 unsigned BitWidth = Known.getBitWidth();
2418 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2419 assert(BitWidth >= 8 && "Unexpected width!");
2420 APInt Mask = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - 8);
2421 Known.Zero |= Mask;
2422 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2423 assert(BitWidth >= 16 && "Unexpected width!");
2424 APInt Mask = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - 16);
2425 Known.Zero |= Mask;
2426 }
2427 break;
    }
2429 }
2430 }
2431 }
2432}
2433
2434unsigned AArch64TargetLowering::ComputeNumSignBitsForTargetNode(
2435 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2436 unsigned Depth) const {
2437 EVT VT = Op.getValueType();
2438 unsigned VTBits = VT.getScalarSizeInBits();
2439 unsigned Opcode = Op.getOpcode();
2440 switch (Opcode) {
2441 case AArch64ISD::CMEQ:
2442 case AArch64ISD::CMGE:
2443 case AArch64ISD::CMGT:
2444 case AArch64ISD::CMHI:
2445 case AArch64ISD::CMHS:
2446 case AArch64ISD::FCMEQ:
2447 case AArch64ISD::FCMGE:
2448 case AArch64ISD::FCMGT:
2449 case AArch64ISD::CMEQz:
2450 case AArch64ISD::CMGEz:
2451 case AArch64ISD::CMGTz:
2452 case AArch64ISD::CMLEz:
2453 case AArch64ISD::CMLTz:
2454 case AArch64ISD::FCMEQz:
2455 case AArch64ISD::FCMGEz:
2456 case AArch64ISD::FCMGTz:
2457 case AArch64ISD::FCMLEz:
2458 case AArch64ISD::FCMLTz:
2459 // Compares return either 0 or all-ones
2460 return VTBits;
2461 }
2462
2463 return 1;
2464}
2465
2466MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
2467 EVT) const {
2468 return MVT::i64;
2469}
2470
2471bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2472 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2473 unsigned *Fast) const {
2474 if (Subtarget->requiresStrictAlign())
2475 return false;
2476
2477 if (Fast) {
2478 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2479 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2480 // See comments in performSTORECombine() for more details about
2481 // these conditions.
2482
2483 // Code that uses clang vector extensions can mark that it
2484 // wants unaligned accesses to be treated as fast by
2485 // underspecifying alignment to be 1 or 2.
2486 Alignment <= 2 ||
2487
2488 // Disregard v2i64. Memcpy lowering produces those and splitting
2489 // them regresses performance on micro-benchmarks and olden/bh.
2490 VT == MVT::v2i64;
2491 }
2492 return true;
2493}
2494
2495// Same as above but handling LLTs instead.
2496bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2497 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2498 unsigned *Fast) const {
2499 if (Subtarget->requiresStrictAlign())
2500 return false;
2501
2502 if (Fast) {
2503 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2504 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2505 Ty.getSizeInBytes() != 16 ||
2506 // See comments in performSTORECombine() for more details about
2507 // these conditions.
2508
2509 // Code that uses clang vector extensions can mark that it
2510 // wants unaligned accesses to be treated as fast by
2511 // underspecifying alignment to be 1 or 2.
2512 Alignment <= 2 ||
2513
2514 // Disregard v2i64. Memcpy lowering produces those and splitting
2515 // them regresses performance on micro-benchmarks and olden/bh.
2516 Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
2517 }
2518 return true;
2519}
2520
2521FastISel *
2522AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2523 const TargetLibraryInfo *libInfo) const {
2524 return AArch64::createFastISel(funcInfo, libInfo);
2525}
2526
2527const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2528#define MAKE_CASE(V) \
2529 case V: \
2530 return #V;
2531 switch ((AArch64ISD::NodeType)Opcode) {
2532 case AArch64ISD::FIRST_NUMBER:
2533 break;
2534 MAKE_CASE(AArch64ISD::ALLOCATE_ZA_BUFFER)
2535 MAKE_CASE(AArch64ISD::INIT_TPIDR2OBJ)
2536 MAKE_CASE(AArch64ISD::COALESCER_BARRIER)
2537 MAKE_CASE(AArch64ISD::VG_SAVE)
2538 MAKE_CASE(AArch64ISD::VG_RESTORE)
2539 MAKE_CASE(AArch64ISD::SMSTART)
2540 MAKE_CASE(AArch64ISD::SMSTOP)
2541 MAKE_CASE(AArch64ISD::RESTORE_ZA)
2542 MAKE_CASE(AArch64ISD::RESTORE_ZT)
2543 MAKE_CASE(AArch64ISD::SAVE_ZT)
2544 MAKE_CASE(AArch64ISD::CALL)
2545 MAKE_CASE(AArch64ISD::ADRP)
2546 MAKE_CASE(AArch64ISD::ADR)
2547 MAKE_CASE(AArch64ISD::ADDlow)
2548 MAKE_CASE(AArch64ISD::AUTH_CALL)
2549 MAKE_CASE(AArch64ISD::AUTH_TC_RETURN)
2550 MAKE_CASE(AArch64ISD::AUTH_CALL_RVMARKER)
2551 MAKE_CASE(AArch64ISD::LOADgot)
2552 MAKE_CASE(AArch64ISD::RET_GLUE)
2553 MAKE_CASE(AArch64ISD::BRCOND)
2554 MAKE_CASE(AArch64ISD::CSEL)
2555 MAKE_CASE(AArch64ISD::CSINV)
2556 MAKE_CASE(AArch64ISD::CSNEG)
2557 MAKE_CASE(AArch64ISD::CSINC)
2558 MAKE_CASE(AArch64ISD::THREAD_POINTER)
2559 MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
2560 MAKE_CASE(AArch64ISD::PROBED_ALLOCA)
2561 MAKE_CASE(AArch64ISD::ABDS_PRED)
2562 MAKE_CASE(AArch64ISD::ABDU_PRED)
2563 MAKE_CASE(AArch64ISD::HADDS_PRED)
2564 MAKE_CASE(AArch64ISD::HADDU_PRED)
2565 MAKE_CASE(AArch64ISD::MUL_PRED)
2566 MAKE_CASE(AArch64ISD::MULHS_PRED)
2567 MAKE_CASE(AArch64ISD::MULHU_PRED)
2568 MAKE_CASE(AArch64ISD::RHADDS_PRED)
2569 MAKE_CASE(AArch64ISD::RHADDU_PRED)
2570 MAKE_CASE(AArch64ISD::SDIV_PRED)
2571 MAKE_CASE(AArch64ISD::SHL_PRED)
2572 MAKE_CASE(AArch64ISD::SMAX_PRED)
2573 MAKE_CASE(AArch64ISD::SMIN_PRED)
2574 MAKE_CASE(AArch64ISD::SRA_PRED)
2575 MAKE_CASE(AArch64ISD::SRL_PRED)
2576 MAKE_CASE(AArch64ISD::UDIV_PRED)
2577 MAKE_CASE(AArch64ISD::UMAX_PRED)
2578 MAKE_CASE(AArch64ISD::UMIN_PRED)
2579 MAKE_CASE(AArch64ISD::SRAD_MERGE_OP1)
2580 MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU)
2581 MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU)
2582 MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU)
2583 MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU)
2584 MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU)
2585 MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU)
2586 MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU)
2587 MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU)
2588 MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU)
2589 MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU)
2590 MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU)
2591 MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU)
2592 MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU)
2593 MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU)
2594 MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU)
2595 MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU)
2596 MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU)
2597 MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU)
2598 MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU)
2599 MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU)
2600 MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU)
2601 MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
2602 MAKE_CASE(AArch64ISD::ADC)
2603 MAKE_CASE(AArch64ISD::SBC)
2604 MAKE_CASE(AArch64ISD::ADDS)
2605 MAKE_CASE(AArch64ISD::SUBS)
2606 MAKE_CASE(AArch64ISD::ADCS)
2607 MAKE_CASE(AArch64ISD::SBCS)
2608 MAKE_CASE(AArch64ISD::ANDS)
2609 MAKE_CASE(AArch64ISD::CCMP)
2610 MAKE_CASE(AArch64ISD::CCMN)
2611 MAKE_CASE(AArch64ISD::FCCMP)
2612 MAKE_CASE(AArch64ISD::FCMP)
2613 MAKE_CASE(AArch64ISD::STRICT_FCMP)
2614 MAKE_CASE(AArch64ISD::STRICT_FCMPE)
2615 MAKE_CASE(AArch64ISD::FCVTXN)
2616 MAKE_CASE(AArch64ISD::SME_ZA_LDR)
2617 MAKE_CASE(AArch64ISD::SME_ZA_STR)
2618 MAKE_CASE(AArch64ISD::DUP)
2619 MAKE_CASE(AArch64ISD::DUPLANE8)
2620 MAKE_CASE(AArch64ISD::DUPLANE16)
2621 MAKE_CASE(AArch64ISD::DUPLANE32)
2622 MAKE_CASE(AArch64ISD::DUPLANE64)
2623 MAKE_CASE(AArch64ISD::DUPLANE128)
2624 MAKE_CASE(AArch64ISD::MOVI)
2625 MAKE_CASE(AArch64ISD::MOVIshift)
2626 MAKE_CASE(AArch64ISD::MOVIedit)
2627 MAKE_CASE(AArch64ISD::MOVImsl)
2628 MAKE_CASE(AArch64ISD::FMOV)
2629 MAKE_CASE(AArch64ISD::MVNIshift)
2630 MAKE_CASE(AArch64ISD::MVNImsl)
2631 MAKE_CASE(AArch64ISD::BICi)
2632 MAKE_CASE(AArch64ISD::ORRi)
2633 MAKE_CASE(AArch64ISD::BSP)
2634 MAKE_CASE(AArch64ISD::ZIP1)
2635 MAKE_CASE(AArch64ISD::ZIP2)
2636 MAKE_CASE(AArch64ISD::UZP1)
2637 MAKE_CASE(AArch64ISD::UZP2)
2638 MAKE_CASE(AArch64ISD::TRN1)
2639 MAKE_CASE(AArch64ISD::TRN2)
2640 MAKE_CASE(AArch64ISD::REV16)
2641 MAKE_CASE(AArch64ISD::REV32)
2642 MAKE_CASE(AArch64ISD::REV64)
2643 MAKE_CASE(AArch64ISD::EXT)
2644 MAKE_CASE(AArch64ISD::SPLICE)
2645 MAKE_CASE(AArch64ISD::VSHL)
2646 MAKE_CASE(AArch64ISD::VLSHR)
2647 MAKE_CASE(AArch64ISD::VASHR)
2648 MAKE_CASE(AArch64ISD::VSLI)
2649 MAKE_CASE(AArch64ISD::VSRI)
2650 MAKE_CASE(AArch64ISD::CMEQ)
2651 MAKE_CASE(AArch64ISD::CMGE)
2652 MAKE_CASE(AArch64ISD::CMGT)
2653 MAKE_CASE(AArch64ISD::CMHI)
2654 MAKE_CASE(AArch64ISD::CMHS)
2655 MAKE_CASE(AArch64ISD::FCMEQ)
2656 MAKE_CASE(AArch64ISD::FCMGE)
2657 MAKE_CASE(AArch64ISD::FCMGT)
2658 MAKE_CASE(AArch64ISD::CMEQz)
2659 MAKE_CASE(AArch64ISD::CMGEz)
2660 MAKE_CASE(AArch64ISD::CMGTz)
2661 MAKE_CASE(AArch64ISD::CMLEz)
2662 MAKE_CASE(AArch64ISD::CMLTz)
2663 MAKE_CASE(AArch64ISD::FCMEQz)
2664 MAKE_CASE(AArch64ISD::FCMGEz)
2665 MAKE_CASE(AArch64ISD::FCMGTz)
2666 MAKE_CASE(AArch64ISD::FCMLEz)
2667 MAKE_CASE(AArch64ISD::FCMLTz)
2668 MAKE_CASE(AArch64ISD::SADDV)
2669 MAKE_CASE(AArch64ISD::UADDV)
2670 MAKE_CASE(AArch64ISD::UADDLV)
2671 MAKE_CASE(AArch64ISD::SADDLV)
2672 MAKE_CASE(AArch64ISD::SDOT)
2673 MAKE_CASE(AArch64ISD::UDOT)
2674 MAKE_CASE(AArch64ISD::SMINV)
2675 MAKE_CASE(AArch64ISD::UMINV)
2676 MAKE_CASE(AArch64ISD::SMAXV)
2677 MAKE_CASE(AArch64ISD::UMAXV)
2678 MAKE_CASE(AArch64ISD::SADDV_PRED)
2679 MAKE_CASE(AArch64ISD::UADDV_PRED)
2680 MAKE_CASE(AArch64ISD::SMAXV_PRED)
2681 MAKE_CASE(AArch64ISD::UMAXV_PRED)
2682 MAKE_CASE(AArch64ISD::SMINV_PRED)
2683 MAKE_CASE(AArch64ISD::UMINV_PRED)
2684 MAKE_CASE(AArch64ISD::ORV_PRED)
2685 MAKE_CASE(AArch64ISD::EORV_PRED)
2686 MAKE_CASE(AArch64ISD::ANDV_PRED)
2687 MAKE_CASE(AArch64ISD::CLASTA_N)
2688 MAKE_CASE(AArch64ISD::CLASTB_N)
2689 MAKE_CASE(AArch64ISD::LASTA)
2690 MAKE_CASE(AArch64ISD::LASTB)
2691 MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
2692 MAKE_CASE(AArch64ISD::LS64_BUILD)
2693 MAKE_CASE(AArch64ISD::LS64_EXTRACT)
2694 MAKE_CASE(AArch64ISD::TBL)
2695 MAKE_CASE(AArch64ISD::FADD_PRED)
2696 MAKE_CASE(AArch64ISD::FADDA_PRED)
2697 MAKE_CASE(AArch64ISD::FADDV_PRED)
2698 MAKE_CASE(AArch64ISD::FDIV_PRED)
2699 MAKE_CASE(AArch64ISD::FMA_PRED)
2700 MAKE_CASE(AArch64ISD::FMAX_PRED)
2701 MAKE_CASE(AArch64ISD::FMAXV_PRED)
2702 MAKE_CASE(AArch64ISD::FMAXNM_PRED)
2703 MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
2704 MAKE_CASE(AArch64ISD::FMIN_PRED)
2705 MAKE_CASE(AArch64ISD::FMINV_PRED)
2706 MAKE_CASE(AArch64ISD::FMINNM_PRED)
2707 MAKE_CASE(AArch64ISD::FMINNMV_PRED)
2708 MAKE_CASE(AArch64ISD::FMUL_PRED)
2709 MAKE_CASE(AArch64ISD::FSUB_PRED)
2710 MAKE_CASE(AArch64ISD::RDSVL)
2711 MAKE_CASE(AArch64ISD::BIC)
2712 MAKE_CASE(AArch64ISD::CBZ)
2713 MAKE_CASE(AArch64ISD::CBNZ)
2714 MAKE_CASE(AArch64ISD::TBZ)
2715 MAKE_CASE(AArch64ISD::TBNZ)
2716 MAKE_CASE(AArch64ISD::TC_RETURN)
2717 MAKE_CASE(AArch64ISD::PREFETCH)
2718 MAKE_CASE(AArch64ISD::SITOF)
2719 MAKE_CASE(AArch64ISD::UITOF)
2720 MAKE_CASE(AArch64ISD::NVCAST)
2721 MAKE_CASE(AArch64ISD::MRS)
2722 MAKE_CASE(AArch64ISD::SQSHL_I)
2723 MAKE_CASE(AArch64ISD::UQSHL_I)
2724 MAKE_CASE(AArch64ISD::SRSHR_I)
2725 MAKE_CASE(AArch64ISD::URSHR_I)
2726 MAKE_CASE(AArch64ISD::SQSHLU_I)
2727 MAKE_CASE(AArch64ISD::WrapperLarge)
2728 MAKE_CASE(AArch64ISD::LD2post)
2729 MAKE_CASE(AArch64ISD::LD3post)
2730 MAKE_CASE(AArch64ISD::LD4post)
2731 MAKE_CASE(AArch64ISD::ST2post)
2732 MAKE_CASE(AArch64ISD::ST3post)
2733 MAKE_CASE(AArch64ISD::ST4post)
2734 MAKE_CASE(AArch64ISD::LD1x2post)
2735 MAKE_CASE(AArch64ISD::LD1x3post)
2736 MAKE_CASE(AArch64ISD::LD1x4post)
2737 MAKE_CASE(AArch64ISD::ST1x2post)
2738 MAKE_CASE(AArch64ISD::ST1x3post)
2739 MAKE_CASE(AArch64ISD::ST1x4post)
2740 MAKE_CASE(AArch64ISD::LD1DUPpost)
2741 MAKE_CASE(AArch64ISD::LD2DUPpost)
2742 MAKE_CASE(AArch64ISD::LD3DUPpost)
2743 MAKE_CASE(AArch64ISD::LD4DUPpost)
2744 MAKE_CASE(AArch64ISD::LD1LANEpost)
2745 MAKE_CASE(AArch64ISD::LD2LANEpost)
2746 MAKE_CASE(AArch64ISD::LD3LANEpost)
2747 MAKE_CASE(AArch64ISD::LD4LANEpost)
2748 MAKE_CASE(AArch64ISD::ST2LANEpost)
2749 MAKE_CASE(AArch64ISD::ST3LANEpost)
2750 MAKE_CASE(AArch64ISD::ST4LANEpost)
2751 MAKE_CASE(AArch64ISD::SMULL)
2752 MAKE_CASE(AArch64ISD::UMULL)
2753 MAKE_CASE(AArch64ISD::PMULL)
2754 MAKE_CASE(AArch64ISD::FRECPE)
2755 MAKE_CASE(AArch64ISD::FRECPS)
2756 MAKE_CASE(AArch64ISD::FRSQRTE)
2757 MAKE_CASE(AArch64ISD::FRSQRTS)
2758 MAKE_CASE(AArch64ISD::STG)
2759 MAKE_CASE(AArch64ISD::STZG)
2760 MAKE_CASE(AArch64ISD::ST2G)
2761 MAKE_CASE(AArch64ISD::STZ2G)
2762 MAKE_CASE(AArch64ISD::SUNPKHI)
2763 MAKE_CASE(AArch64ISD::SUNPKLO)
2764 MAKE_CASE(AArch64ISD::UUNPKHI)
2765 MAKE_CASE(AArch64ISD::UUNPKLO)
2766 MAKE_CASE(AArch64ISD::INSR)
2767 MAKE_CASE(AArch64ISD::PTEST)
2768 MAKE_CASE(AArch64ISD::PTEST_ANY)
2769 MAKE_CASE(AArch64ISD::PTRUE)
2770 MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
2771 MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
2772 MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
2773 MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO)
2774 MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO)
2775 MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO)
2776 MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO)
2777 MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO)
2778 MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO)
2779 MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO)
2780 MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO)
2781 MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO)
2782 MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO)
2783 MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO)
2784 MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO)
2785 MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO)
2786 MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
2787 MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
2788 MAKE_CASE(AArch64ISD::GLD1Q_MERGE_ZERO)
2789 MAKE_CASE(AArch64ISD::GLD1Q_INDEX_MERGE_ZERO)
2790 MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
2791 MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
2792 MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
2793 MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO)
2794 MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO)
2795 MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO)
2796 MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO)
2797 MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO)
2798 MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO)
2799 MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO)
2800 MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO)
2801 MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO)
2802 MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO)
2803 MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO)
2804 MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO)
2805 MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO)
2806 MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO)
2807 MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO)
2808 MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO)
2809 MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO)
2810 MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO)
2811 MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO)
2812 MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
2813 MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
2814 MAKE_CASE(AArch64ISD::SST1Q_PRED)
2815 MAKE_CASE(AArch64ISD::SST1Q_INDEX_PRED)
2816 MAKE_CASE(AArch64ISD::ST1_PRED)
2817 MAKE_CASE(AArch64ISD::SST1_PRED)
2818 MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
2819 MAKE_CASE(AArch64ISD::SST1_SXTW_PRED)
2820 MAKE_CASE(AArch64ISD::SST1_UXTW_PRED)
2821 MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED)
2822 MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED)
2823 MAKE_CASE(AArch64ISD::SST1_IMM_PRED)
2824 MAKE_CASE(AArch64ISD::SSTNT1_PRED)
2825 MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
2826 MAKE_CASE(AArch64ISD::LDP)
2827 MAKE_CASE(AArch64ISD::LDIAPP)
2828 MAKE_CASE(AArch64ISD::LDNP)
2829 MAKE_CASE(AArch64ISD::STP)
2830 MAKE_CASE(AArch64ISD::STILP)
2831 MAKE_CASE(AArch64ISD::STNP)
2832 MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
2833 MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
2834 MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU)
2835 MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU)
2836 MAKE_CASE(AArch64ISD::REVD_MERGE_PASSTHRU)
2837 MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU)
2838 MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
2839 MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
2840 MAKE_CASE(AArch64ISD::INDEX_VECTOR)
2841 MAKE_CASE(AArch64ISD::ADDP)
2842 MAKE_CASE(AArch64ISD::SADDLP)
2843 MAKE_CASE(AArch64ISD::UADDLP)
2844 MAKE_CASE(AArch64ISD::CALL_RVMARKER)
2845 MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL)
2846 MAKE_CASE(AArch64ISD::MOPS_MEMSET)
2847 MAKE_CASE(AArch64ISD::MOPS_MEMSET_TAGGING)
2848 MAKE_CASE(AArch64ISD::MOPS_MEMCOPY)
2849 MAKE_CASE(AArch64ISD::MOPS_MEMMOVE)
2850 MAKE_CASE(AArch64ISD::CALL_BTI)
2851 MAKE_CASE(AArch64ISD::MRRS)
2852 MAKE_CASE(AArch64ISD::MSRR)
2853 MAKE_CASE(AArch64ISD::RSHRNB_I)
2854 MAKE_CASE(AArch64ISD::CTTZ_ELTS)
2855 MAKE_CASE(AArch64ISD::CALL_ARM64EC_TO_X64)
2856 MAKE_CASE(AArch64ISD::URSHR_I_PRED)
2857 }
2858#undef MAKE_CASE
2859 return nullptr;
2860}
2861
2862MachineBasicBlock *
2863AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2864 MachineBasicBlock *MBB) const {
2865 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2866 // phi node:
2867
2868 // OrigBB:
2869 // [... previous instrs leading to comparison ...]
2870 // b.ne TrueBB
2871 // b EndBB
2872 // TrueBB:
2873 // ; Fallthrough
2874 // EndBB:
2875 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2876
2877 MachineFunction *MF = MBB->getParent();
2878 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2879 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2880 DebugLoc DL = MI.getDebugLoc();
2881 MachineFunction::iterator It = ++MBB->getIterator();
2882
2883 Register DestReg = MI.getOperand(i: 0).getReg();
2884 Register IfTrueReg = MI.getOperand(i: 1).getReg();
2885 Register IfFalseReg = MI.getOperand(i: 2).getReg();
2886 unsigned CondCode = MI.getOperand(i: 3).getImm();
2887 bool NZCVKilled = MI.getOperand(i: 4).isKill();
2888
2889 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(BB: LLVM_BB);
2890 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(BB: LLVM_BB);
2891 MF->insert(MBBI: It, MBB: TrueBB);
2892 MF->insert(MBBI: It, MBB: EndBB);
2893
  // Transfer the rest of the current basic block to EndBB.
2895 EndBB->splice(Where: EndBB->begin(), Other: MBB, From: std::next(x: MachineBasicBlock::iterator(MI)),
2896 To: MBB->end());
2897 EndBB->transferSuccessorsAndUpdatePHIs(FromMBB: MBB);
2898
2899 BuildMI(BB: MBB, MIMD: DL, MCID: TII->get(Opcode: AArch64::Bcc)).addImm(Val: CondCode).addMBB(MBB: TrueBB);
2900 BuildMI(BB: MBB, MIMD: DL, MCID: TII->get(Opcode: AArch64::B)).addMBB(MBB: EndBB);
2901 MBB->addSuccessor(Succ: TrueBB);
2902 MBB->addSuccessor(Succ: EndBB);
2903
2904 // TrueBB falls through to the end.
2905 TrueBB->addSuccessor(Succ: EndBB);
2906
2907 if (!NZCVKilled) {
2908 TrueBB->addLiveIn(PhysReg: AArch64::NZCV);
2909 EndBB->addLiveIn(PhysReg: AArch64::NZCV);
2910 }
2911
2912 BuildMI(BB&: *EndBB, I: EndBB->begin(), MIMD: DL, MCID: TII->get(Opcode: AArch64::PHI), DestReg)
2913 .addReg(RegNo: IfTrueReg)
2914 .addMBB(MBB: TrueBB)
2915 .addReg(RegNo: IfFalseReg)
2916 .addMBB(MBB);
2917
2918 MI.eraseFromParent();
2919 return EndBB;
2920}
2921
2922MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
2923 MachineInstr &MI, MachineBasicBlock *BB) const {
2924 assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2925 BB->getParent()->getFunction().getPersonalityFn())) &&
2926 "SEH does not use catchret!");
2927 return BB;
2928}
2929
2930MachineBasicBlock *
2931AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
2932 MachineBasicBlock *MBB) const {
2933 MachineFunction &MF = *MBB->getParent();
2934 MachineBasicBlock::iterator MBBI = MI.getIterator();
2935 DebugLoc DL = MBB->findDebugLoc(MBBI);
2936 const AArch64InstrInfo &TII =
2937 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2938 Register TargetReg = MI.getOperand(i: 0).getReg();
2939 MachineBasicBlock::iterator NextInst =
2940 TII.probedStackAlloc(MBBI, TargetReg, FrameSetup: false);
2941
2942 MI.eraseFromParent();
2943 return NextInst->getParent();
2944}
2945
2946MachineBasicBlock *
2947AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2948 MachineInstr &MI,
2949 MachineBasicBlock *BB) const {
2950 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2951 MachineInstrBuilder MIB = BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: Opc));
2952
2953 MIB.addReg(RegNo: BaseReg + MI.getOperand(i: 0).getImm(), flags: RegState::Define);
2954 MIB.add(MO: MI.getOperand(i: 1)); // slice index register
2955 MIB.add(MO: MI.getOperand(i: 2)); // slice index offset
2956 MIB.add(MO: MI.getOperand(i: 3)); // pg
2957 MIB.add(MO: MI.getOperand(i: 4)); // base
2958 MIB.add(MO: MI.getOperand(i: 5)); // offset
2959
2960 MI.eraseFromParent(); // The pseudo is gone now.
2961 return BB;
2962}
2963
2964MachineBasicBlock *
2965AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
2966 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2967 MachineInstrBuilder MIB =
2968 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::LDR_ZA));
2969
2970 MIB.addReg(RegNo: AArch64::ZA, flags: RegState::Define);
2971 MIB.add(MO: MI.getOperand(i: 0)); // Vector select register
2972 MIB.add(MO: MI.getOperand(i: 1)); // Vector select offset
2973 MIB.add(MO: MI.getOperand(i: 2)); // Base
2974 MIB.add(MO: MI.getOperand(i: 1)); // Offset, same as vector select offset
2975
2976 MI.eraseFromParent(); // The pseudo is gone now.
2977 return BB;
2978}
2979
2980MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI,
2981 MachineBasicBlock *BB,
2982 unsigned Opcode,
2983 bool Op0IsDef) const {
2984 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2985 MachineInstrBuilder MIB;
2986
2987 MIB = BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode))
2988 .addReg(RegNo: MI.getOperand(i: 0).getReg(), flags: Op0IsDef ? RegState::Define : 0);
2989 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2990 MIB.add(MO: MI.getOperand(i: I));
2991
2992 MI.eraseFromParent(); // The pseudo is gone now.
2993 return BB;
2994}
2995
2996MachineBasicBlock *
2997AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2998 MachineInstr &MI,
2999 MachineBasicBlock *BB) const {
3000 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3001 MachineInstrBuilder MIB = BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: Opc));
3002 unsigned StartIdx = 0;
3003
3004 bool HasTile = BaseReg != AArch64::ZA;
3005 bool HasZPROut = HasTile && MI.getOperand(i: 0).isReg();
3006 if (HasZPROut) {
3007 MIB.add(MO: MI.getOperand(i: StartIdx)); // Output ZPR
3008 ++StartIdx;
3009 }
3010 if (HasTile) {
3011 MIB.addReg(RegNo: BaseReg + MI.getOperand(i: StartIdx).getImm(),
3012 flags: RegState::Define); // Output ZA Tile
3013 MIB.addReg(RegNo: BaseReg + MI.getOperand(i: StartIdx).getImm()); // Input Za Tile
3014 StartIdx++;
3015 } else {
    // Instructions addressing the ZA array as za[Reg, Imm] have no ZPR result;
    // only treat operand 0 as an output ZPR when it is a register and operand
    // 1 is not an immediate.
3017 if (MI.getOperand(i: 0).isReg() && !MI.getOperand(i: 1).isImm()) {
3018 MIB.add(MO: MI.getOperand(i: StartIdx)); // Output ZPR
3019 ++StartIdx;
3020 }
3021 MIB.addReg(RegNo: BaseReg, flags: RegState::Define).addReg(RegNo: BaseReg);
3022 }
3023 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3024 MIB.add(MO: MI.getOperand(i: I));
3025
3026 MI.eraseFromParent(); // The pseudo is gone now.
3027 return BB;
3028}
3029
3030MachineBasicBlock *
3031AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
3032 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3033 MachineInstrBuilder MIB =
3034 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::ZERO_M));
3035 MIB.add(MO: MI.getOperand(i: 0)); // Mask
3036
3037 unsigned Mask = MI.getOperand(i: 0).getImm();
3038 for (unsigned I = 0; I < 8; I++) {
3039 if (Mask & (1 << I))
3040 MIB.addDef(RegNo: AArch64::ZAD0 + I, Flags: RegState::ImplicitDefine);
3041 }
3042
3043 MI.eraseFromParent(); // The pseudo is gone now.
3044 return BB;
3045}
3046
3047MachineBasicBlock *
3048AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI,
3049 MachineBasicBlock *BB) const {
3050 MachineFunction *MF = BB->getParent();
3051 MachineFrameInfo &MFI = MF->getFrameInfo();
3052 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3053 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3054 if (TPIDR2.Uses > 0) {
3055 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3056 // Store the buffer pointer to the TPIDR2 stack object.
3057 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::STRXui))
3058 .addReg(RegNo: MI.getOperand(i: 0).getReg())
3059 .addFrameIndex(Idx: TPIDR2.FrameIndex)
3060 .addImm(Val: 0);
3061 // Set the reserved bytes (10-15) to zero
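    // (The store immediates are scaled by the access size: the STRHHui with
    // immediate 5 covers bytes 10-11 and the STRWui with immediate 3 covers
    // bytes 12-15.)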
3062 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::STRHHui))
3063 .addReg(RegNo: AArch64::WZR)
3064 .addFrameIndex(Idx: TPIDR2.FrameIndex)
3065 .addImm(Val: 5);
3066 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::STRWui))
3067 .addReg(RegNo: AArch64::WZR)
3068 .addFrameIndex(Idx: TPIDR2.FrameIndex)
3069 .addImm(Val: 3);
3070 } else
3071 MFI.RemoveStackObject(ObjectIdx: TPIDR2.FrameIndex);
3072
3073 BB->remove_instr(I: &MI);
3074 return BB;
3075}
3076
3077MachineBasicBlock *
3078AArch64TargetLowering::EmitAllocateZABuffer(MachineInstr &MI,
3079 MachineBasicBlock *BB) const {
3080 MachineFunction *MF = BB->getParent();
3081 MachineFrameInfo &MFI = MF->getFrameInfo();
3082 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3083 // TODO This function grows the stack with a subtraction, which doesn't work
3084 // on Windows. Some refactoring to share the functionality in
3085 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3086 // supports SME
3087 assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() &&
3088 "Lazy ZA save is not yet supported on Windows");
3089
3090 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3091
3092 if (TPIDR2.Uses > 0) {
3093 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3094 MachineRegisterInfo &MRI = MF->getRegInfo();
3095
    // The MSUBXrrr below cannot take SP as an operand directly, so copy SP
    // into a virtual register first.
3098 Register SP = MRI.createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
3099 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: SP)
3100 .addReg(RegNo: AArch64::SP);
3101
    // Allocate a lazy-save buffer of Size * Size bytes (normally SVL * SVL
    // bytes) by subtracting it from the stack pointer.
3103 auto Size = MI.getOperand(i: 1).getReg();
3104 auto Dest = MI.getOperand(i: 0).getReg();
3105 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::MSUBXrrr), DestReg: Dest)
3106 .addReg(RegNo: Size)
3107 .addReg(RegNo: Size)
3108 .addReg(RegNo: SP);
3109 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY),
3110 DestReg: AArch64::SP)
3111 .addReg(RegNo: Dest);
3112
3113 // We have just allocated a variable sized object, tell this to PEI.
3114 MFI.CreateVariableSizedObject(Alignment: Align(16), Alloca: nullptr);
3115 }
3116
3117 BB->remove_instr(I: &MI);
3118 return BB;
3119}
3120
3121MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
3122 MachineInstr &MI, MachineBasicBlock *BB) const {
3123
3124 int SMEOrigInstr = AArch64::getSMEPseudoMap(Opcode: MI.getOpcode());
3125 if (SMEOrigInstr != -1) {
3126 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3127 uint64_t SMEMatrixType =
3128 TII->get(Opcode: MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3129 switch (SMEMatrixType) {
3130 case (AArch64::SMEMatrixArray):
3131 return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZA, MI, BB);
3132 case (AArch64::SMEMatrixTileB):
3133 return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZAB0, MI, BB);
3134 case (AArch64::SMEMatrixTileH):
3135 return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZAH0, MI, BB);
3136 case (AArch64::SMEMatrixTileS):
3137 return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZAS0, MI, BB);
3138 case (AArch64::SMEMatrixTileD):
3139 return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZAD0, MI, BB);
3140 case (AArch64::SMEMatrixTileQ):
3141 return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZAQ0, MI, BB);
3142 }
3143 }
3144
3145 switch (MI.getOpcode()) {
3146 default:
3147#ifndef NDEBUG
3148 MI.dump();
3149#endif
3150 llvm_unreachable("Unexpected instruction for custom inserter!");
3151 case AArch64::InitTPIDR2Obj:
3152 return EmitInitTPIDR2Object(MI, BB);
3153 case AArch64::AllocateZABuffer:
3154 return EmitAllocateZABuffer(MI, BB);
3155 case AArch64::F128CSEL:
3156 return EmitF128CSEL(MI, MBB: BB);
3157 case TargetOpcode::STATEPOINT:
    // STATEPOINT is a pseudo instruction which has no implicit defs/uses,
    // while the BL call instruction (to which the statepoint is lowered in
    // the end) has an implicit def of LR. This def is early-clobber as it is
    // set at the moment of the call, before any use is read.
    // Add this implicit dead def here as a workaround.
3163 MI.addOperand(MF&: *MI.getMF(),
3164 Op: MachineOperand::CreateReg(
3165 Reg: AArch64::LR, /*isDef*/ true,
3166 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3167 /*isUndef*/ false, /*isEarlyClobber*/ true));
3168 [[fallthrough]];
3169 case TargetOpcode::STACKMAP:
3170 case TargetOpcode::PATCHPOINT:
3171 return emitPatchPoint(MI, MBB: BB);
3172
3173 case TargetOpcode::PATCHABLE_EVENT_CALL:
3174 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3175 return BB;
3176
3177 case AArch64::CATCHRET:
3178 return EmitLoweredCatchRet(MI, BB);
3179
3180 case AArch64::PROBED_STACKALLOC_DYN:
3181 return EmitDynamicProbedAlloc(MI, MBB: BB);
3182
3183 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3184 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_H_B, BaseReg: AArch64::ZAB0, MI, BB);
3185 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3186 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_H_H, BaseReg: AArch64::ZAH0, MI, BB);
3187 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3188 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_H_S, BaseReg: AArch64::ZAS0, MI, BB);
3189 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3190 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_H_D, BaseReg: AArch64::ZAD0, MI, BB);
3191 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3192 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_H_Q, BaseReg: AArch64::ZAQ0, MI, BB);
3193 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3194 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_V_B, BaseReg: AArch64::ZAB0, MI, BB);
3195 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3196 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_V_H, BaseReg: AArch64::ZAH0, MI, BB);
3197 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3198 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_V_S, BaseReg: AArch64::ZAS0, MI, BB);
3199 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3200 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_V_D, BaseReg: AArch64::ZAD0, MI, BB);
3201 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3202 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_V_Q, BaseReg: AArch64::ZAQ0, MI, BB);
3203 case AArch64::LDR_ZA_PSEUDO:
3204 return EmitFill(MI, BB);
3205 case AArch64::LDR_TX_PSEUDO:
3206 return EmitZTInstr(MI, BB, Opcode: AArch64::LDR_TX, /*Op0IsDef=*/true);
3207 case AArch64::STR_TX_PSEUDO:
3208 return EmitZTInstr(MI, BB, Opcode: AArch64::STR_TX, /*Op0IsDef=*/false);
3209 case AArch64::ZERO_M_PSEUDO:
3210 return EmitZero(MI, BB);
3211 case AArch64::ZERO_T_PSEUDO:
3212 return EmitZTInstr(MI, BB, Opcode: AArch64::ZERO_T, /*Op0IsDef=*/true);
3213 }
3214}
3215
3216//===----------------------------------------------------------------------===//
3217// AArch64 Lowering private implementation.
3218//===----------------------------------------------------------------------===//
3219
3220//===----------------------------------------------------------------------===//
3221// Lowering Code
3222//===----------------------------------------------------------------------===//
3223
3224// Forward declarations of SVE fixed length lowering helpers
3225static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT);
3226static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
3227static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
3228static SDValue convertFixedMaskToScalableVector(SDValue Mask,
3229 SelectionDAG &DAG);
3230static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT);
3231static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
3232 EVT VT);
3233
3234/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3235static bool isZerosVector(const SDNode *N) {
3236 // Look through a bit convert.
3237 while (N->getOpcode() == ISD::BITCAST)
3238 N = N->getOperand(Num: 0).getNode();
3239
3240 if (ISD::isConstantSplatVectorAllZeros(N))
3241 return true;
3242
3243 if (N->getOpcode() != AArch64ISD::DUP)
3244 return false;
3245
3246 auto Opnd0 = N->getOperand(Num: 0);
3247 return isNullConstant(V: Opnd0) || isNullFPConstant(V: Opnd0);
3248}
3249
3250/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3251/// CC
3252static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
3253 switch (CC) {
3254 default:
3255 llvm_unreachable("Unknown condition code!");
3256 case ISD::SETNE:
3257 return AArch64CC::NE;
3258 case ISD::SETEQ:
3259 return AArch64CC::EQ;
3260 case ISD::SETGT:
3261 return AArch64CC::GT;
3262 case ISD::SETGE:
3263 return AArch64CC::GE;
3264 case ISD::SETLT:
3265 return AArch64CC::LT;
3266 case ISD::SETLE:
3267 return AArch64CC::LE;
3268 case ISD::SETUGT:
3269 return AArch64CC::HI;
3270 case ISD::SETUGE:
3271 return AArch64CC::HS;
3272 case ISD::SETULT:
3273 return AArch64CC::LO;
3274 case ISD::SETULE:
3275 return AArch64CC::LS;
3276 }
3277}
3278
3279/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3280static void changeFPCCToAArch64CC(ISD::CondCode CC,
3281 AArch64CC::CondCode &CondCode,
3282 AArch64CC::CondCode &CondCode2) {
3283 CondCode2 = AArch64CC::AL;
3284 switch (CC) {
3285 default:
3286 llvm_unreachable("Unknown FP condition!");
3287 case ISD::SETEQ:
3288 case ISD::SETOEQ:
3289 CondCode = AArch64CC::EQ;
3290 break;
3291 case ISD::SETGT:
3292 case ISD::SETOGT:
3293 CondCode = AArch64CC::GT;
3294 break;
3295 case ISD::SETGE:
3296 case ISD::SETOGE:
3297 CondCode = AArch64CC::GE;
3298 break;
3299 case ISD::SETOLT:
3300 CondCode = AArch64CC::MI;
3301 break;
3302 case ISD::SETOLE:
3303 CondCode = AArch64CC::LS;
3304 break;
3305 case ISD::SETONE:
3306 CondCode = AArch64CC::MI;
3307 CondCode2 = AArch64CC::GT;
3308 break;
3309 case ISD::SETO:
3310 CondCode = AArch64CC::VC;
3311 break;
3312 case ISD::SETUO:
3313 CondCode = AArch64CC::VS;
3314 break;
3315 case ISD::SETUEQ:
3316 CondCode = AArch64CC::EQ;
3317 CondCode2 = AArch64CC::VS;
3318 break;
3319 case ISD::SETUGT:
3320 CondCode = AArch64CC::HI;
3321 break;
3322 case ISD::SETUGE:
3323 CondCode = AArch64CC::PL;
3324 break;
3325 case ISD::SETLT:
3326 case ISD::SETULT:
3327 CondCode = AArch64CC::LT;
3328 break;
3329 case ISD::SETLE:
3330 case ISD::SETULE:
3331 CondCode = AArch64CC::LE;
3332 break;
3333 case ISD::SETNE:
3334 case ISD::SETUNE:
3335 CondCode = AArch64CC::NE;
3336 break;
3337 }
3338}
3339
3340/// Convert a DAG fp condition code to an AArch64 CC.
3341/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3342/// should be AND'ed instead of OR'ed.
3343static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
3344 AArch64CC::CondCode &CondCode,
3345 AArch64CC::CondCode &CondCode2) {
3346 CondCode2 = AArch64CC::AL;
3347 switch (CC) {
3348 default:
3349 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3350 assert(CondCode2 == AArch64CC::AL);
3351 break;
3352 case ISD::SETONE:
3353 // (a one b)
3354 // == ((a olt b) || (a ogt b))
3355 // == ((a ord b) && (a une b))
3356 CondCode = AArch64CC::VC;
3357 CondCode2 = AArch64CC::NE;
3358 break;
3359 case ISD::SETUEQ:
3360 // (a ueq b)
3361 // == ((a uno b) || (a oeq b))
3362 // == ((a ule b) && (a uge b))
3363 CondCode = AArch64CC::PL;
3364 CondCode2 = AArch64CC::LE;
3365 break;
3366 }
3367}
3368
3369/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3370/// CC usable with the vector instructions. Fewer operations are available
3371/// without a real NZCV register, so we have to use less efficient combinations
3372/// to get the same effect.
3373static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
3374 AArch64CC::CondCode &CondCode,
3375 AArch64CC::CondCode &CondCode2,
3376 bool &Invert) {
3377 Invert = false;
3378 switch (CC) {
3379 default:
3380 // Mostly the scalar mappings work fine.
3381 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3382 break;
3383 case ISD::SETUO:
3384 Invert = true;
3385 [[fallthrough]];
3386 case ISD::SETO:
3387 CondCode = AArch64CC::MI;
3388 CondCode2 = AArch64CC::GE;
3389 break;
3390 case ISD::SETUEQ:
3391 case ISD::SETULT:
3392 case ISD::SETULE:
3393 case ISD::SETUGT:
3394 case ISD::SETUGE:
3395 // All of the compare-mask comparisons are ordered, but we can switch
3396 // between the two by a double inversion. E.g. ULE == !OGT.
3397 Invert = true;
3398 changeFPCCToAArch64CC(CC: getSetCCInverse(Operation: CC, /* FP inverse */ Type: MVT::f32),
3399 CondCode, CondCode2);
3400 break;
3401 }
3402}
3403
3404static bool isLegalArithImmed(uint64_t C) {
3405 // Matches AArch64DAGToDAGISel::SelectArithImmed().
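  // A legal arithmetic immediate is an unsigned 12-bit value, optionally
  // shifted left by 12 bits: e.g. 0xfff and 0xabc000 are legal, 0x1001 is not.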
3406 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3407 LLVM_DEBUG(dbgs() << "Is imm " << C
3408 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3409 return IsLegal;
3410}
3411
3412static bool cannotBeIntMin(SDValue CheckedVal, SelectionDAG &DAG) {
3413 KnownBits KnownSrc = DAG.computeKnownBits(Op: CheckedVal);
3414 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3415}
3416
// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
// the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
// can be set differently by this operation. It comes down to whether
// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are equal
// then everything is fine. If not then the optimization is wrong. Thus general
// comparisons are only valid if op2 != 0.
3423//
3424// So, finally, the only LLVM-native comparisons that don't mention C or V
3425// are the ones that aren't unsigned comparisons. They're the only ones we can
3426// safely use CMN for in the absence of information about op2.
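//
// For example, with an equality comparison, (CMP op1, (sub 0, op2)) can always
// be turned into CMN op1, op2. For other conditions this is only done when op2
// is known to be non-zero (unsigned compares) or known not to be INT_MIN
// (signed compares), as checked below.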
3427static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG) {
3428 return Op.getOpcode() == ISD::SUB && isNullConstant(V: Op.getOperand(i: 0)) &&
3429 (isIntEqualitySetCC(Code: CC) ||
3430 (isUnsignedIntSetCC(Code: CC) && DAG.isKnownNeverZero(Op: Op.getOperand(i: 1))) ||
3431 (isSignedIntSetCC(Code: CC) && cannotBeIntMin(CheckedVal: Op.getOperand(i: 1), DAG)));
3432}
3433
3434static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
3435 SelectionDAG &DAG, SDValue Chain,
3436 bool IsSignaling) {
3437 EVT VT = LHS.getValueType();
3438 assert(VT != MVT::f128);
3439
3440 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3441
3442 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3443 LHS = DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL: dl, ResultTys: {MVT::f32, MVT::Other},
3444 Ops: {Chain, LHS});
3445 RHS = DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL: dl, ResultTys: {MVT::f32, MVT::Other},
3446 Ops: {LHS.getValue(R: 1), RHS});
3447 Chain = RHS.getValue(R: 1);
3448 VT = MVT::f32;
3449 }
3450 unsigned Opcode =
3451 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3452 return DAG.getNode(Opcode, DL: dl, ResultTys: {VT, MVT::Other}, Ops: {Chain, LHS, RHS});
3453}
3454
3455static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3456 const SDLoc &dl, SelectionDAG &DAG) {
3457 EVT VT = LHS.getValueType();
3458 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3459
3460 if (VT.isFloatingPoint()) {
3461 assert(VT != MVT::f128);
3462 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3463 LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f32, Operand: LHS);
3464 RHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f32, Operand: RHS);
3465 VT = MVT::f32;
3466 }
3467 return DAG.getNode(Opcode: AArch64ISD::FCMP, DL: dl, VT, N1: LHS, N2: RHS);
3468 }
3469
3470 // The CMP instruction is just an alias for SUBS, and representing it as
3471 // SUBS means that it's possible to get CSE with subtract operations.
3472 // A later phase can perform the optimization of setting the destination
3473 // register to WZR/XZR if it ends up being unused.
3474 unsigned Opcode = AArch64ISD::SUBS;
3475
3476 if (isCMN(Op: RHS, CC, DAG)) {
    // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3478 Opcode = AArch64ISD::ADDS;
3479 RHS = RHS.getOperand(i: 1);
3480 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(V: LHS.getOperand(i: 0)) &&
3481 isIntEqualitySetCC(Code: CC)) {
    // As we are looking for EQ/NE compares, the operands can be commuted; can
    // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3484 Opcode = AArch64ISD::ADDS;
3485 LHS = LHS.getOperand(i: 1);
3486 } else if (isNullConstant(V: RHS) && !isUnsignedIntSetCC(Code: CC)) {
3487 if (LHS.getOpcode() == ISD::AND) {
      // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
      // (a.k.a. ANDS), except that the flags it produces are only valid for
      // equality and signed comparisons against zero, not for the unsigned
      // ones (hence the !isUnsignedIntSetCC guard above).
3491 const SDValue ANDSNode = DAG.getNode(Opcode: AArch64ISD::ANDS, DL: dl,
3492 VTList: DAG.getVTList(VT1: VT, VT2: MVT_CC),
3493 N1: LHS.getOperand(i: 0),
3494 N2: LHS.getOperand(i: 1));
3495 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3496 DAG.ReplaceAllUsesWith(From: LHS, To: ANDSNode);
3497 return ANDSNode.getValue(R: 1);
3498 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3499 // Use result of ANDS
3500 return LHS.getValue(R: 1);
3501 }
3502 }
3503
3504 return DAG.getNode(Opcode, DL: dl, VTList: DAG.getVTList(VT1: VT, VT2: MVT_CC), N1: LHS, N2: RHS)
3505 .getValue(R: 1);
3506}
3507
3508/// \defgroup AArch64CCMP CMP;CCMP matching
3509///
3510/// These functions deal with the formation of CMP;CCMP;... sequences.
3511/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3512/// a comparison. They set the NZCV flags to a predefined value if their
/// predicate is false. This lets us express arbitrary conjunctions, for
3514/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3515/// expressed as:
3516/// cmp A
3517/// ccmp B, inv(CB), CA
3518/// check for CB flags
3519///
3520/// This naturally lets us implement chains of AND operations with SETCC
3521/// operands. And we can even implement some other situations by transforming
3522/// them:
/// - We can implement (NEG SETCC), i.e. negating a single comparison, by
///   negating the flags used in a CCMP/FCCMP operation.
3525/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3526/// by negating the flags we test for afterwards. i.e.
3527/// NEG (CMP CCMP CCCMP ...) can be implemented.
3528/// - Note that we can only ever negate all previously processed results.
3529/// What we can not implement by flipping the flags to test is a negation
3530/// of two sub-trees (because the negation affects all sub-trees emitted so
3531/// far, so the 2nd sub-tree we emit would also affect the first).
3532/// With those tools we can implement some OR operations:
3533/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3534/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3535/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3536/// elimination rules from earlier to implement the whole thing as a
3537/// CCMP/FCCMP chain.
3538///
3539/// As complete example:
3540/// or (or (setCA (cmp A)) (setCB (cmp B)))
///        (and (setCC (cmp C)) (setCD (cmp D)))
/// can be reassociated to:
///     or (and (setCC (cmp C)) (setCD (cmp D)))
///        (or (setCA (cmp A)) (setCB (cmp B)))
/// can be transformed to:
///     not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
///              (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
3548/// which can be implemented as:
3549/// cmp C
3550/// ccmp D, inv(CD), CC
3551/// ccmp A, CA, inv(CD)
3552/// ccmp B, CB, inv(CA)
3553/// check for CB flags
3554///
/// A counterexample is "or (and A B) (and C D)" which translates to
/// not (and (not (and A B)) (not (and C D))); we can only implement one of
/// the two inner (not) operations, but not both!
3558/// @{
3559
3560/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3561static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3562 ISD::CondCode CC, SDValue CCOp,
3563 AArch64CC::CondCode Predicate,
3564 AArch64CC::CondCode OutCC,
3565 const SDLoc &DL, SelectionDAG &DAG) {
3566 unsigned Opcode = 0;
3567 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3568
3569 if (LHS.getValueType().isFloatingPoint()) {
3570 assert(LHS.getValueType() != MVT::f128);
3571 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3572 LHS.getValueType() == MVT::bf16) {
3573 LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: LHS);
3574 RHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: RHS);
3575 }
3576 Opcode = AArch64ISD::FCCMP;
3577 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Val&: RHS)) {
3578 APInt Imm = Const->getAPIntValue();
3579 if (Imm.isNegative() && Imm.sgt(RHS: -32)) {
3580 Opcode = AArch64ISD::CCMN;
3581 RHS = DAG.getConstant(Val: Imm.abs(), DL, VT: Const->getValueType(ResNo: 0));
3582 }
3583 } else if (isCMN(Op: RHS, CC, DAG)) {
3584 Opcode = AArch64ISD::CCMN;
3585 RHS = RHS.getOperand(i: 1);
3586 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(V: LHS.getOperand(i: 0)) &&
3587 isIntEqualitySetCC(Code: CC)) {
    // As we are looking for EQ/NE compares, the operands can be commuted; can
    // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction?
3590 Opcode = AArch64ISD::CCMN;
3591 LHS = LHS.getOperand(i: 1);
3592 }
3593 if (Opcode == 0)
3594 Opcode = AArch64ISD::CCMP;
3595
3596 SDValue Condition = DAG.getConstant(Val: Predicate, DL, VT: MVT_CC);
3597 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(Code: OutCC);
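  // If Predicate is false, the conditional compare sets NZCV to this immediate
  // value, chosen to satisfy the inverted OutCC so that the overall condition
  // being tested fails.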
3598 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: InvOutCC);
3599 SDValue NZCVOp = DAG.getConstant(Val: NZCV, DL, VT: MVT::i32);
3600 return DAG.getNode(Opcode, DL, VT: MVT_CC, N1: LHS, N2: RHS, N3: NZCVOp, N4: Condition, N5: CCOp);
3601}
3602
3603/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3604/// expressed as a conjunction. See \ref AArch64CCMP.
3605/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3606/// changing the conditions on the SETCC tests.
3607/// (this means we can call emitConjunctionRec() with
3608/// Negate==true on this sub-tree)
3609/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3610/// cannot do the negation naturally. We are required to
3611/// emit the subtree first in this case.
/// \param WillNegate Is true if we are called when the result of this
3613/// subexpression must be negated. This happens when the
3614/// outer expression is an OR. We can use this fact to know
3615/// that we have a double negation (or (or ...) ...) that
3616/// can be implemented for free.
3617static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3618 bool &MustBeFirst, bool WillNegate,
3619 unsigned Depth = 0) {
3620 if (!Val.hasOneUse())
3621 return false;
3622 unsigned Opcode = Val->getOpcode();
3623 if (Opcode == ISD::SETCC) {
3624 if (Val->getOperand(Num: 0).getValueType() == MVT::f128)
3625 return false;
3626 CanNegate = true;
3627 MustBeFirst = false;
3628 return true;
3629 }
3630 // Protect against exponential runtime and stack overflow.
3631 if (Depth > 6)
3632 return false;
3633 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3634 bool IsOR = Opcode == ISD::OR;
3635 SDValue O0 = Val->getOperand(Num: 0);
3636 SDValue O1 = Val->getOperand(Num: 1);
3637 bool CanNegateL;
3638 bool MustBeFirstL;
3639 if (!canEmitConjunction(Val: O0, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR, Depth: Depth+1))
3640 return false;
3641 bool CanNegateR;
3642 bool MustBeFirstR;
3643 if (!canEmitConjunction(Val: O1, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR, Depth: Depth+1))
3644 return false;
3645
3646 if (MustBeFirstL && MustBeFirstR)
3647 return false;
3648
3649 if (IsOR) {
3650 // For an OR expression we need to be able to naturally negate at least
3651 // one side or we cannot do the transformation at all.
3652 if (!CanNegateL && !CanNegateR)
3653 return false;
      // If the result of the OR will be negated and we can naturally negate
      // the leaves, then this sub-tree as a whole negates naturally.
3656 CanNegate = WillNegate && CanNegateL && CanNegateR;
3657 // If we cannot naturally negate the whole sub-tree, then this must be
3658 // emitted first.
3659 MustBeFirst = !CanNegate;
3660 } else {
3661 assert(Opcode == ISD::AND && "Must be OR or AND");
3662 // We cannot naturally negate an AND operation.
3663 CanNegate = false;
3664 MustBeFirst = MustBeFirstL || MustBeFirstR;
3665 }
3666 return true;
3667 }
3668 return false;
3669}
3670
/// Emit a conjunction or disjunction tree as a CMP/FCMP followed by a chain
/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
/// Tries to transform the given i1-producing node @p Val into a series of
/// compare and conditional compare operations. @returns an NZCV-flags-producing
/// node and sets @p OutCC to the flags that should be tested, or returns
/// SDValue() if the transformation was not possible.
3677/// \p Negate is true if we want this sub-tree being negated just by changing
3678/// SETCC conditions.
3679static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3680 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3681 AArch64CC::CondCode Predicate) {
3682 // We're at a tree leaf, produce a conditional comparison operation.
3683 unsigned Opcode = Val->getOpcode();
3684 if (Opcode == ISD::SETCC) {
3685 SDValue LHS = Val->getOperand(Num: 0);
3686 SDValue RHS = Val->getOperand(Num: 1);
3687 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Val->getOperand(Num: 2))->get();
3688 bool isInteger = LHS.getValueType().isInteger();
3689 if (Negate)
3690 CC = getSetCCInverse(Operation: CC, Type: LHS.getValueType());
3691 SDLoc DL(Val);
3692 // Determine OutCC and handle FP special case.
3693 if (isInteger) {
3694 OutCC = changeIntCCToAArch64CC(CC);
3695 } else {
3696 assert(LHS.getValueType().isFloatingPoint());
3697 AArch64CC::CondCode ExtraCC;
3698 changeFPCCToANDAArch64CC(CC, CondCode&: OutCC, CondCode2&: ExtraCC);
3699 // Some floating point conditions can't be tested with a single condition
3700 // code. Construct an additional comparison in this case.
3701 if (ExtraCC != AArch64CC::AL) {
3702 SDValue ExtraCmp;
3703 if (!CCOp.getNode())
3704 ExtraCmp = emitComparison(LHS, RHS, CC, dl: DL, DAG);
3705 else
3706 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3707 OutCC: ExtraCC, DL, DAG);
3708 CCOp = ExtraCmp;
3709 Predicate = ExtraCC;
3710 }
3711 }
3712
3713 // Produce a normal comparison if we are first in the chain
3714 if (!CCOp)
3715 return emitComparison(LHS, RHS, CC, dl: DL, DAG);
3716 // Otherwise produce a ccmp.
3717 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3718 DAG);
3719 }
3720 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3721
3722 bool IsOR = Opcode == ISD::OR;
3723
3724 SDValue LHS = Val->getOperand(Num: 0);
3725 bool CanNegateL;
3726 bool MustBeFirstL;
3727 bool ValidL = canEmitConjunction(Val: LHS, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR);
3728 assert(ValidL && "Valid conjunction/disjunction tree");
3729 (void)ValidL;
3730
3731 SDValue RHS = Val->getOperand(Num: 1);
3732 bool CanNegateR;
3733 bool MustBeFirstR;
3734 bool ValidR = canEmitConjunction(Val: RHS, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR);
3735 assert(ValidR && "Valid conjunction/disjunction tree");
3736 (void)ValidR;
3737
3738 // Swap sub-tree that must come first to the right side.
3739 if (MustBeFirstL) {
3740 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3741 std::swap(a&: LHS, b&: RHS);
3742 std::swap(a&: CanNegateL, b&: CanNegateR);
3743 std::swap(a&: MustBeFirstL, b&: MustBeFirstR);
3744 }
3745
3746 bool NegateR;
3747 bool NegateAfterR;
3748 bool NegateL;
3749 bool NegateAfterAll;
3750 if (Opcode == ISD::OR) {
3751 // Swap the sub-tree that we can negate naturally to the left.
3752 if (!CanNegateL) {
3753 assert(CanNegateR && "at least one side must be negatable");
3754 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3755 assert(!Negate);
3756 std::swap(a&: LHS, b&: RHS);
3757 NegateR = false;
3758 NegateAfterR = true;
3759 } else {
3760 // Negate the left sub-tree if possible, otherwise negate the result.
3761 NegateR = CanNegateR;
3762 NegateAfterR = !CanNegateR;
3763 }
3764 NegateL = true;
3765 NegateAfterAll = !Negate;
3766 } else {
3767 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3768 assert(!Negate && "Valid conjunction/disjunction tree");
3769
3770 NegateL = false;
3771 NegateR = false;
3772 NegateAfterR = false;
3773 NegateAfterAll = false;
3774 }
3775
3776 // Emit sub-trees.
3777 AArch64CC::CondCode RHSCC;
3778 SDValue CmpR = emitConjunctionRec(DAG, Val: RHS, OutCC&: RHSCC, Negate: NegateR, CCOp, Predicate);
3779 if (NegateAfterR)
3780 RHSCC = AArch64CC::getInvertedCondCode(Code: RHSCC);
3781 SDValue CmpL = emitConjunctionRec(DAG, Val: LHS, OutCC, Negate: NegateL, CCOp: CmpR, Predicate: RHSCC);
3782 if (NegateAfterAll)
3783 OutCC = AArch64CC::getInvertedCondCode(Code: OutCC);
3784 return CmpL;
3785}
3786
3787/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
3788/// In some cases this is even possible with OR operations in the expression.
3789/// See \ref AArch64CCMP.
3790/// \see emitConjunctionRec().
3791static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3792 AArch64CC::CondCode &OutCC) {
3793 bool DummyCanNegate;
3794 bool DummyMustBeFirst;
3795 if (!canEmitConjunction(Val, CanNegate&: DummyCanNegate, MustBeFirst&: DummyMustBeFirst, WillNegate: false))
3796 return SDValue();
3797
3798 return emitConjunctionRec(DAG, Val, OutCC, Negate: false, CCOp: SDValue(), Predicate: AArch64CC::AL);
3799}
3800
3801/// @}
3802
3803/// Returns how profitable it is to fold the shift and/or extension operations
3804/// of a comparison's operand.
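/// For illustration (derived from the rules below): "(and x, #0xff)" scores 1,
/// "(shl (and x, #0xff), #2)" scores 2, an in-range plain "(shl x, #3)" scores 1,
/// and any multi-use operand scores 0.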
3805static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3806 auto isSupportedExtend = [&](SDValue V) {
3807 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3808 return true;
3809
3810 if (V.getOpcode() == ISD::AND)
3811 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(Val: V.getOperand(i: 1))) {
3812 uint64_t Mask = MaskCst->getZExtValue();
3813 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3814 }
3815
3816 return false;
3817 };
3818
3819 if (!Op.hasOneUse())
3820 return 0;
3821
3822 if (isSupportedExtend(Op))
3823 return 1;
3824
3825 unsigned Opc = Op.getOpcode();
3826 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3827 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
3828 uint64_t Shift = ShiftCst->getZExtValue();
3829 if (isSupportedExtend(Op.getOperand(i: 0)))
3830 return (Shift <= 4) ? 2 : 1;
3831 EVT VT = Op.getValueType();
3832 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3833 return 1;
3834 }
3835
3836 return 0;
3837}
3838
3839static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3840 SDValue &AArch64cc, SelectionDAG &DAG,
3841 const SDLoc &dl) {
3842 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Val: RHS.getNode())) {
3843 EVT VT = RHS.getValueType();
3844 uint64_t C = RHSC->getZExtValue();
3845 if (!isLegalArithImmed(C)) {
3846 // Constant does not fit, try adjusting it by one?
3847 switch (CC) {
3848 default:
3849 break;
3850 case ISD::SETLT:
3851 case ISD::SETGE:
3852 if ((VT == MVT::i32 && C != 0x80000000 &&
3853 isLegalArithImmed(C: (uint32_t)(C - 1))) ||
3854 (VT == MVT::i64 && C != 0x80000000ULL &&
3855 isLegalArithImmed(C: C - 1ULL))) {
3856 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3857 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3858 RHS = DAG.getConstant(Val: C, DL: dl, VT);
3859 }
3860 break;
3861 case ISD::SETULT:
3862 case ISD::SETUGE:
3863 if ((VT == MVT::i32 && C != 0 &&
3864 isLegalArithImmed(C: (uint32_t)(C - 1))) ||
3865 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C: C - 1ULL))) {
3866 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3867 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3868 RHS = DAG.getConstant(Val: C, DL: dl, VT);
3869 }
3870 break;
3871 case ISD::SETLE:
3872 case ISD::SETGT:
3873 if ((VT == MVT::i32 && C != INT32_MAX &&
3874 isLegalArithImmed(C: (uint32_t)(C + 1))) ||
3875 (VT == MVT::i64 && C != INT64_MAX &&
3876 isLegalArithImmed(C: C + 1ULL))) {
3877 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3878 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3879 RHS = DAG.getConstant(Val: C, DL: dl, VT);
3880 }
3881 break;
3882 case ISD::SETULE:
3883 case ISD::SETUGT:
3884 if ((VT == MVT::i32 && C != UINT32_MAX &&
3885 isLegalArithImmed(C: (uint32_t)(C + 1))) ||
3886 (VT == MVT::i64 && C != UINT64_MAX &&
3887 isLegalArithImmed(C: C + 1ULL))) {
3888 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3889 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3890 RHS = DAG.getConstant(Val: C, DL: dl, VT);
3891 }
3892 break;
3893 }
3894 }
3895 }
3896
3897 // Comparisons are canonicalized so that the RHS operand is simpler than the
3898 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3899 // can fold some shift+extend operations on the RHS operand, so swap the
3900 // operands if that can be done.
3901 //
3902 // For example:
3903 // lsl w13, w11, #1
3904 // cmp w13, w12
3905 // can be turned into:
3906 // cmp w12, w11, lsl #1
3907 if (!isa<ConstantSDNode>(Val: RHS) ||
3908 !isLegalArithImmed(C: RHS->getAsAPIntVal().abs().getZExtValue())) {
3909 bool LHSIsCMN = isCMN(Op: LHS, CC, DAG);
3910 bool RHSIsCMN = isCMN(Op: RHS, CC, DAG);
3911 SDValue TheLHS = LHSIsCMN ? LHS.getOperand(i: 1) : LHS;
3912 SDValue TheRHS = RHSIsCMN ? RHS.getOperand(i: 1) : RHS;
3913
3914 if (getCmpOperandFoldingProfit(Op: TheLHS) + (LHSIsCMN ? 1 : 0) >
3915 getCmpOperandFoldingProfit(Op: TheRHS) + (RHSIsCMN ? 1 : 0)) {
3916 std::swap(a&: LHS, b&: RHS);
3917 CC = ISD::getSetCCSwappedOperands(Operation: CC);
3918 }
3919 }
3920
3921 SDValue Cmp;
3922 AArch64CC::CondCode AArch64CC;
3923 if (isIntEqualitySetCC(Code: CC) && isa<ConstantSDNode>(Val: RHS)) {
3924 const ConstantSDNode *RHSC = cast<ConstantSDNode>(Val&: RHS);
3925
3926 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3927 // For the i8 operand, the largest immediate is 255, so this can be easily
3928 // encoded in the compare instruction. For the i16 operand, however, the
3929 // largest immediate cannot be encoded in the compare.
3930 // Therefore, use a sign extending load and cmn to avoid materializing the
3931 // -1 constant. For example,
3932 // movz w1, #65535
3933 // ldrh w0, [x0, #0]
3934 // cmp w0, w1
3935 // >
3936 // ldrsh w0, [x0, #0]
3937 // cmn w0, #1
3938 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3939 // if and only if (sext LHS) == (sext RHS). The checks are in place to
3940 // ensure both the LHS and RHS are truly zero extended and to make sure the
3941 // transformation is profitable.
3942 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(Val: LHS) &&
3943 cast<LoadSDNode>(Val&: LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3944 cast<LoadSDNode>(Val&: LHS)->getMemoryVT() == MVT::i16 &&
3945 LHS.getNode()->hasNUsesOfValue(NUses: 1, Value: 0)) {
3946 int16_t ValueofRHS = RHS->getAsZExtVal();
3947 if (ValueofRHS < 0 && isLegalArithImmed(C: -ValueofRHS)) {
3948 SDValue SExt =
3949 DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL: dl, VT: LHS.getValueType(), N1: LHS,
3950 N2: DAG.getValueType(MVT::i16));
3951 Cmp = emitComparison(LHS: SExt, RHS: DAG.getConstant(Val: ValueofRHS, DL: dl,
3952 VT: RHS.getValueType()),
3953 CC, dl, DAG);
3954 AArch64CC = changeIntCCToAArch64CC(CC);
3955 }
3956 }
3957
3958 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3959 if ((Cmp = emitConjunction(DAG, Val: LHS, OutCC&: AArch64CC))) {
3960 if ((CC == ISD::SETNE) ^ RHSC->isZero())
3961 AArch64CC = AArch64CC::getInvertedCondCode(Code: AArch64CC);
3962 }
3963 }
3964 }
3965
3966 if (!Cmp) {
3967 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3968 AArch64CC = changeIntCCToAArch64CC(CC);
3969 }
3970 AArch64cc = DAG.getConstant(Val: AArch64CC, DL: dl, VT: MVT_CC);
3971 return Cmp;
3972}
3973
3974static std::pair<SDValue, SDValue>
3975getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3976 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3977 "Unsupported value type");
3978 SDValue Value, Overflow;
3979 SDLoc DL(Op);
3980 SDValue LHS = Op.getOperand(i: 0);
3981 SDValue RHS = Op.getOperand(i: 1);
3982 unsigned Opc = 0;
3983 switch (Op.getOpcode()) {
3984 default:
3985 llvm_unreachable("Unknown overflow instruction!");
3986 case ISD::SADDO:
3987 Opc = AArch64ISD::ADDS;
3988 CC = AArch64CC::VS;
3989 break;
3990 case ISD::UADDO:
3991 Opc = AArch64ISD::ADDS;
3992 CC = AArch64CC::HS;
3993 break;
3994 case ISD::SSUBO:
3995 Opc = AArch64ISD::SUBS;
3996 CC = AArch64CC::VS;
3997 break;
3998 case ISD::USUBO:
3999 Opc = AArch64ISD::SUBS;
4000 CC = AArch64CC::LO;
4001 break;
4002 // Multiply needs a little bit of extra work.
4003 case ISD::SMULO:
4004 case ISD::UMULO: {
4005 CC = AArch64CC::NE;
4006 bool IsSigned = Op.getOpcode() == ISD::SMULO;
4007 if (Op.getValueType() == MVT::i32) {
4008 // Extend to 64-bits, then perform a 64-bit multiply.
4009 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4010 LHS = DAG.getNode(Opcode: ExtendOpc, DL, VT: MVT::i64, Operand: LHS);
4011 RHS = DAG.getNode(Opcode: ExtendOpc, DL, VT: MVT::i64, Operand: RHS);
4012 SDValue Mul = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i64, N1: LHS, N2: RHS);
4013 Value = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i32, Operand: Mul);
4014
4015 // Check that the result fits into a 32-bit integer.
4016 SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: MVT_CC);
4017 if (IsSigned) {
4018 // cmp xreg, wreg, sxtw
4019 SDValue SExtMul = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: MVT::i64, Operand: Value);
4020 Overflow =
4021 DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs, N1: Mul, N2: SExtMul).getValue(R: 1);
4022 } else {
4023 // tst xreg, #0xffffffff00000000
4024 SDValue UpperBits = DAG.getConstant(Val: 0xFFFFFFFF00000000, DL, VT: MVT::i64);
4025 Overflow =
4026 DAG.getNode(Opcode: AArch64ISD::ANDS, DL, VTList: VTs, N1: Mul, N2: UpperBits).getValue(R: 1);
4027 }
4028 break;
4029 }
4030 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4031 // For the 64-bit multiply, compute the full product and check its high half.
4032 Value = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i64, N1: LHS, N2: RHS);
4033 if (IsSigned) {
4034 SDValue UpperBits = DAG.getNode(Opcode: ISD::MULHS, DL, VT: MVT::i64, N1: LHS, N2: RHS);
4035 SDValue LowerBits = DAG.getNode(Opcode: ISD::SRA, DL, VT: MVT::i64, N1: Value,
4036 N2: DAG.getConstant(Val: 63, DL, VT: MVT::i64));
4037 // It is important that LowerBits is last, otherwise the arithmetic
4038 // shift will not be folded into the compare (SUBS).
4039 SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: MVT::i32);
4040 Overflow = DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs, N1: UpperBits, N2: LowerBits)
4041 .getValue(R: 1);
4042 } else {
4043 SDValue UpperBits = DAG.getNode(Opcode: ISD::MULHU, DL, VT: MVT::i64, N1: LHS, N2: RHS);
4044 SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: MVT::i32);
4045 Overflow =
4046 DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs,
4047 N1: DAG.getConstant(Val: 0, DL, VT: MVT::i64),
4048 N2: UpperBits).getValue(R: 1);
4049 }
4050 break;
4051 }
4052 } // switch (...)
4053
4054 if (Opc) {
4055 SDVTList VTs = DAG.getVTList(VT1: Op->getValueType(ResNo: 0), VT2: MVT::i32);
4056
4057 // Emit the AArch64 operation with overflow check.
4058 Value = DAG.getNode(Opcode: Opc, DL, VTList: VTs, N1: LHS, N2: RHS);
4059 Overflow = Value.getValue(R: 1);
4060 }
4061 return std::make_pair(x&: Value, y&: Overflow);
4062}
4063
4064SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4065 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
4066 OverrideNEON: !Subtarget->isNeonAvailable()))
4067 return LowerToScalableOp(Op, DAG);
4068
4069 SDValue Sel = Op.getOperand(i: 0);
4070 SDValue Other = Op.getOperand(i: 1);
4071 SDLoc dl(Sel);
4072
4073 // If the operand is an overflow checking operation, invert the condition
4074 // code and kill the Not operation. I.e., transform:
4075 // (xor (overflow_op_bool, 1))
4076 // -->
4077 // (csel 1, 0, invert(cc), overflow_op_bool)
4078 // ... which later gets transformed to just a cset instruction with an
4079 // inverted condition code, rather than a cset + eor sequence.
4080 if (isOneConstant(V: Other) && ISD::isOverflowIntrOpRes(Op: Sel)) {
4081 // Only lower legal XALUO ops.
4082 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: Sel->getValueType(ResNo: 0)))
4083 return SDValue();
4084
4085 SDValue TVal = DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32);
4086 SDValue FVal = DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32);
4087 AArch64CC::CondCode CC;
4088 SDValue Value, Overflow;
4089 std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC, Op: Sel.getValue(R: 0), DAG);
4090 SDValue CCVal = DAG.getConstant(Val: getInvertedCondCode(Code: CC), DL: dl, VT: MVT::i32);
4091 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT: Op.getValueType(), N1: TVal, N2: FVal,
4092 N3: CCVal, N4: Overflow);
4093 }
4094 // If neither operand is a SELECT_CC, give up.
4095 if (Sel.getOpcode() != ISD::SELECT_CC)
4096 std::swap(a&: Sel, b&: Other);
4097 if (Sel.getOpcode() != ISD::SELECT_CC)
4098 return Op;
4099
4100 // The folding we want to perform is:
4101 // (xor x, (select_cc a, b, cc, 0, -1) )
4102 // -->
4103 // (csel x, (xor x, -1), cc ...)
4104 //
4105 // The latter will get matched to a CSINV instruction.
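// For instance (hypothetical registers, with w8 holding x and w0/w1 holding a/b),
// "x ^ ((a < b) ? 0 : -1)" can become:
//   cmp   w0, w1
//   csinv w0, w8, w8, lt
// instead of materializing the -1 and using an explicit eor.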
4106
4107 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Sel.getOperand(i: 4))->get();
4108 SDValue LHS = Sel.getOperand(i: 0);
4109 SDValue RHS = Sel.getOperand(i: 1);
4110 SDValue TVal = Sel.getOperand(i: 2);
4111 SDValue FVal = Sel.getOperand(i: 3);
4112
4113 // FIXME: This could be generalized to non-integer comparisons.
4114 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4115 return Op;
4116
4117 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(Val&: FVal);
4118 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(Val&: TVal);
4119
4120 // The values aren't constants, this isn't the pattern we're looking for.
4121 if (!CFVal || !CTVal)
4122 return Op;
4123
4124 // We can commute the SELECT_CC by inverting the condition. This
4125 // might be needed to make this fit into a CSINV pattern.
4126 if (CTVal->isAllOnes() && CFVal->isZero()) {
4127 std::swap(a&: TVal, b&: FVal);
4128 std::swap(a&: CTVal, b&: CFVal);
4129 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
4130 }
4131
4132 // If the constants line up, perform the transform!
4133 if (CTVal->isZero() && CFVal->isAllOnes()) {
4134 SDValue CCVal;
4135 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, AArch64cc&: CCVal, DAG, dl);
4136
4137 FVal = Other;
4138 TVal = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: Other.getValueType(), N1: Other,
4139 N2: DAG.getConstant(Val: -1ULL, DL: dl, VT: Other.getValueType()));
4140
4141 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT: Sel.getValueType(), N1: FVal, N2: TVal,
4142 N3: CCVal, N4: Cmp);
4143 }
4144
4145 return Op;
4146}
4147
4148// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4149// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4150// sets 'C' bit to 0.
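// E.g. with Invert == false this emits SUBS(Value, 1); the carry is set exactly
// when Value >= 1 unsigned, i.e. when Value is non-zero.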
4151static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
4152 SDLoc DL(Value);
4153 EVT VT = Value.getValueType();
4154 SDValue Op0 = Invert ? DAG.getConstant(Val: 0, DL, VT) : Value;
4155 SDValue Op1 = Invert ? Value : DAG.getConstant(Val: 1, DL, VT);
4156 SDValue Cmp =
4157 DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT::Glue), N1: Op0, N2: Op1);
4158 return Cmp.getValue(R: 1);
4159}
4160
4161// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4162// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4163static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
4164 bool Invert) {
4165 assert(Glue.getResNo() == 1);
4166 SDLoc DL(Glue);
4167 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
4168 SDValue One = DAG.getConstant(Val: 1, DL, VT);
4169 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
4170 SDValue CC = DAG.getConstant(Val: Cond, DL, VT: MVT::i32);
4171 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: One, N2: Zero, N3: CC, N4: Glue);
4172}
4173
4174// Value is 1 if 'V' bit of NZCV is 1, else 0
4175static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
4176 assert(Glue.getResNo() == 1);
4177 SDLoc DL(Glue);
4178 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
4179 SDValue One = DAG.getConstant(Val: 1, DL, VT);
4180 SDValue CC = DAG.getConstant(Val: AArch64CC::VS, DL, VT: MVT::i32);
4181 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: One, N2: Zero, N3: CC, N4: Glue);
4182}
4183
4184// This lowering is inefficient, but it will get cleaned up by
4185// `foldOverflowCheck`
4186static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
4187 unsigned Opcode, bool IsSigned) {
4188 EVT VT0 = Op.getValue(R: 0).getValueType();
4189 EVT VT1 = Op.getValue(R: 1).getValueType();
4190
4191 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4192 return SDValue();
4193
4194 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4195 SDValue OpLHS = Op.getOperand(i: 0);
4196 SDValue OpRHS = Op.getOperand(i: 1);
4197 SDValue OpCarryIn = valueToCarryFlag(Value: Op.getOperand(i: 2), DAG, Invert: InvertCarry);
4198
4199 SDLoc DL(Op);
4200 SDVTList VTs = DAG.getVTList(VT1: VT0, VT2: VT1);
4201
4202 SDValue Sum = DAG.getNode(Opcode, DL, VTList: DAG.getVTList(VT1: VT0, VT2: MVT::Glue), N1: OpLHS,
4203 N2: OpRHS, N3: OpCarryIn);
4204
4205 SDValue OutFlag =
4206 IsSigned ? overflowFlagToValue(Glue: Sum.getValue(R: 1), VT: VT1, DAG)
4207 : carryFlagToValue(Glue: Sum.getValue(R: 1), VT: VT1, DAG, Invert: InvertCarry);
4208
4209 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL, VTList: VTs, N1: Sum, N2: OutFlag);
4210}
4211
4212static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
4213 // Let legalize expand this if it isn't a legal type yet.
4214 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: Op.getValueType()))
4215 return SDValue();
4216
4217 SDLoc dl(Op);
4218 AArch64CC::CondCode CC;
4219 // The actual operation that sets the overflow or carry flag.
4220 SDValue Value, Overflow;
4221 std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4222
4223 // We use 0 and 1 as false and true values.
4224 SDValue TVal = DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32);
4225 SDValue FVal = DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32);
4226
4227 // We use an inverted condition, because the conditional select is inverted
4228 // too. This will allow it to be selected to a single instruction:
4229 // CSINC Wd, WZR, WZR, invert(cond).
4230 SDValue CCVal = DAG.getConstant(Val: getInvertedCondCode(Code: CC), DL: dl, VT: MVT::i32);
4231 Overflow = DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT: MVT::i32, N1: FVal, N2: TVal,
4232 N3: CCVal, N4: Overflow);
4233
4234 SDVTList VTs = DAG.getVTList(VT1: Op.getValueType(), VT2: MVT::i32);
4235 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL: dl, VTList: VTs, N1: Value, N2: Overflow);
4236}
4237
4238// Prefetch operands are:
4239// 1: Address to prefetch
4240// 2: bool isWrite
4241// 3: int locality (0 = no locality ... 3 = extreme locality)
4242// 4: bool isDataCache
4243static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4244 SDLoc DL(Op);
4245 unsigned IsWrite = Op.getConstantOperandVal(i: 2);
4246 unsigned Locality = Op.getConstantOperandVal(i: 3);
4247 unsigned IsData = Op.getConstantOperandVal(i: 4);
4248
4249 bool IsStream = !Locality;
4250 // When the locality number is set
4251 if (Locality) {
4252 // The front-end should have filtered out the out-of-range values
4253 assert(Locality <= 3 && "Prefetch locality out-of-range");
4254 // The locality degree is the opposite of the cache speed.
4255 // Put the number the other way around.
4256 // The encoding starts at 0 for level 1
4257 Locality = 3 - Locality;
4258 }
4259
4260 // Build the mask value encoding the expected behavior.
4261 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4262 (!IsData << 3) | // IsDataCache bit
4263 (Locality << 1) | // Cache level bits
4264 (unsigned)IsStream; // Stream bit
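// For instance, __builtin_prefetch(p, /*rw=*/0, /*locality=*/3) arrives here with
// IsWrite = 0, IsData = 1 and Locality remapped to 0, giving PrfOp = 0b00000,
// i.e. a PLDL1KEEP prefetch.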
4265 return DAG.getNode(Opcode: AArch64ISD::PREFETCH, DL, VT: MVT::Other, N1: Op.getOperand(i: 0),
4266 N2: DAG.getTargetConstant(Val: PrfOp, DL, VT: MVT::i32),
4267 N3: Op.getOperand(i: 1));
4268}
4269
4270SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4271 SelectionDAG &DAG) const {
4272 EVT VT = Op.getValueType();
4273 if (VT.isScalableVector())
4274 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4275
4276 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
4277 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4278
4279 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4280 return SDValue();
4281}
4282
4283SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4284 SelectionDAG &DAG) const {
4285 EVT VT = Op.getValueType();
4286 if (VT.isScalableVector())
4287 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4288
4289 bool IsStrict = Op->isStrictFPOpcode();
4290 SDValue SrcVal = Op.getOperand(i: IsStrict ? 1 : 0);
4291 EVT SrcVT = SrcVal.getValueType();
4292 bool Trunc = Op.getConstantOperandVal(i: IsStrict ? 2 : 1) == 1;
4293
4294 if (useSVEForFixedLengthVectorVT(VT: SrcVT, OverrideNEON: !Subtarget->isNeonAvailable()))
4295 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4296
4297 // Expand cases where the result type is BF16 but we don't have hardware
4298 // instructions to lower it.
4299 if (VT.getScalarType() == MVT::bf16 &&
4300 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4301 Subtarget->hasBF16())) {
4302 SDLoc dl(Op);
4303 SDValue Narrow = SrcVal;
4304 SDValue NaN;
4305 EVT I32 = SrcVT.changeElementType(EltVT: MVT::i32);
4306 EVT F32 = SrcVT.changeElementType(EltVT: MVT::f32);
4307 if (SrcVT.getScalarType() == MVT::f32) {
4308 bool NeverSNaN = DAG.isKnownNeverSNaN(Op: Narrow);
4309 Narrow = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: I32, Operand: Narrow);
4310 if (!NeverSNaN) {
4311 // Set the quiet bit.
4312 NaN = DAG.getNode(Opcode: ISD::OR, DL: dl, VT: I32, N1: Narrow,
4313 N2: DAG.getConstant(Val: 0x400000, DL: dl, VT: I32));
4314 }
4315 } else if (SrcVT.getScalarType() == MVT::f64) {
4316 Narrow = DAG.getNode(Opcode: AArch64ISD::FCVTXN, DL: dl, VT: F32, Operand: Narrow);
4317 Narrow = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: I32, Operand: Narrow);
4318 } else {
4319 return SDValue();
4320 }
4321 if (!Trunc) {
4322 SDValue One = DAG.getConstant(Val: 1, DL: dl, VT: I32);
4323 SDValue Lsb = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: I32, N1: Narrow,
4324 N2: DAG.getShiftAmountConstant(Val: 16, VT: I32, DL: dl));
4325 Lsb = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: I32, N1: Lsb, N2: One);
4326 SDValue RoundingBias =
4327 DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: I32, N1: DAG.getConstant(Val: 0x7fff, DL: dl, VT: I32), N2: Lsb);
4328 Narrow = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: I32, N1: Narrow, N2: RoundingBias);
4329 }
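// The bias above implements round-to-nearest-even on the 16 bits being
// discarded: adding 0x7fff rounds up when the discarded fraction is more than
// half, and the extra Lsb breaks exact ties towards the even result.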
4330
4331 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4332 // 0x80000000.
4333 if (NaN) {
4334 SDValue IsNaN = DAG.getSetCC(
4335 DL: dl, VT: getSetCCResultType(DAG.getDataLayout(), C&: *DAG.getContext(), VT: SrcVT),
4336 LHS: SrcVal, RHS: SrcVal, Cond: ISD::SETUO);
4337 Narrow = DAG.getSelect(DL: dl, VT: I32, Cond: IsNaN, LHS: NaN, RHS: Narrow);
4338 }
4339
4340 // Now that we have rounded, shift the bits into position.
4341 Narrow = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: I32, N1: Narrow,
4342 N2: DAG.getShiftAmountConstant(Val: 16, VT: I32, DL: dl));
4343 if (VT.isVector()) {
4344 EVT I16 = I32.changeVectorElementType(EltVT: MVT::i16);
4345 Narrow = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: I16, Operand: Narrow);
4346 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Narrow);
4347 }
4348 Narrow = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: F32, Operand: Narrow);
4349 SDValue Result = DAG.getTargetExtractSubreg(SRIdx: AArch64::hsub, DL: dl, VT, Operand: Narrow);
4350 return IsStrict ? DAG.getMergeValues(Ops: {Result, Op.getOperand(i: 0)}, dl)
4351 : Result;
4352 }
4353
4354 if (SrcVT != MVT::f128) {
4355 // Expand cases where the input is a vector bigger than NEON.
4356 if (useSVEForFixedLengthVectorVT(VT: SrcVT))
4357 return SDValue();
4358
4359 // It's legal except when f128 is involved
4360 return Op;
4361 }
4362
4363 return SDValue();
4364}
4365
4366SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4367 SelectionDAG &DAG) const {
4368 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4369 // Any additional optimization in this function should be recorded
4370 // in the cost tables.
4371 bool IsStrict = Op->isStrictFPOpcode();
4372 EVT InVT = Op.getOperand(i: IsStrict ? 1 : 0).getValueType();
4373 EVT VT = Op.getValueType();
4374
4375 if (VT.isScalableVector()) {
4376 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4377 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4378 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4379 return LowerToPredicatedOp(Op, DAG, NewOp: Opcode);
4380 }
4381
4382 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()) ||
4383 useSVEForFixedLengthVectorVT(VT: InVT, OverrideNEON: !Subtarget->isNeonAvailable()))
4384 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4385
4386 unsigned NumElts = InVT.getVectorNumElements();
4387
4388 // f16 conversions are promoted to f32 when full fp16 is not supported.
4389 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4390 InVT.getVectorElementType() == MVT::bf16) {
4391 MVT NewVT = MVT::getVectorVT(VT: MVT::f32, NumElements: NumElts);
4392 SDLoc dl(Op);
4393 if (IsStrict) {
4394 SDValue Ext = DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL: dl, ResultTys: {NewVT, MVT::Other},
4395 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1)});
4396 return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, ResultTys: {VT, MVT::Other},
4397 Ops: {Ext.getValue(R: 1), Ext.getValue(R: 0)});
4398 }
4399 return DAG.getNode(
4400 Opcode: Op.getOpcode(), DL: dl, VT: Op.getValueType(),
4401 Operand: DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: NewVT, Operand: Op.getOperand(i: 0)));
4402 }
4403
4404 uint64_t VTSize = VT.getFixedSizeInBits();
4405 uint64_t InVTSize = InVT.getFixedSizeInBits();
4406 if (VTSize < InVTSize) {
4407 SDLoc dl(Op);
4408 if (IsStrict) {
4409 InVT = InVT.changeVectorElementTypeToInteger();
4410 SDValue Cv = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, ResultTys: {InVT, MVT::Other},
4411 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1)});
4412 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT, Operand: Cv);
4413 return DAG.getMergeValues(Ops: {Trunc, Cv.getValue(R: 1)}, dl);
4414 }
4415 SDValue Cv =
4416 DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: InVT.changeVectorElementTypeToInteger(),
4417 Operand: Op.getOperand(i: 0));
4418 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT, Operand: Cv);
4419 }
4420
4421 if (VTSize > InVTSize) {
4422 SDLoc dl(Op);
4423 MVT ExtVT =
4424 MVT::getVectorVT(VT: MVT::getFloatingPointVT(BitWidth: VT.getScalarSizeInBits()),
4425 NumElements: VT.getVectorNumElements());
4426 if (IsStrict) {
4427 SDValue Ext = DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL: dl, ResultTys: {ExtVT, MVT::Other},
4428 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1)});
4429 return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, ResultTys: {VT, MVT::Other},
4430 Ops: {Ext.getValue(R: 1), Ext.getValue(R: 0)});
4431 }
4432 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: ExtVT, Operand: Op.getOperand(i: 0));
4433 return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT, Operand: Ext);
4434 }
4435
4436 // Use a scalar operation for conversions between single-element vectors of
4437 // the same size.
4438 if (NumElts == 1) {
4439 SDLoc dl(Op);
4440 SDValue Extract = DAG.getNode(
4441 Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: InVT.getScalarType(),
4442 N1: Op.getOperand(i: IsStrict ? 1 : 0), N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64));
4443 EVT ScalarVT = VT.getScalarType();
4444 if (IsStrict)
4445 return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, ResultTys: {ScalarVT, MVT::Other},
4446 Ops: {Op.getOperand(i: 0), Extract});
4447 return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: ScalarVT, Operand: Extract);
4448 }
4449
4450 // Type changing conversions are illegal.
4451 return Op;
4452}
4453
4454SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4455 SelectionDAG &DAG) const {
4456 bool IsStrict = Op->isStrictFPOpcode();
4457 SDValue SrcVal = Op.getOperand(i: IsStrict ? 1 : 0);
4458
4459 if (SrcVal.getValueType().isVector())
4460 return LowerVectorFP_TO_INT(Op, DAG);
4461
4462 // f16 conversions are promoted to f32 when full fp16 is not supported.
4463 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4464 SrcVal.getValueType() == MVT::bf16) {
4465 SDLoc dl(Op);
4466 if (IsStrict) {
4467 SDValue Ext =
4468 DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL: dl, ResultTys: {MVT::f32, MVT::Other},
4469 Ops: {Op.getOperand(i: 0), SrcVal});
4470 return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, ResultTys: {Op.getValueType(), MVT::Other},
4471 Ops: {Ext.getValue(R: 1), Ext.getValue(R: 0)});
4472 }
4473 return DAG.getNode(
4474 Opcode: Op.getOpcode(), DL: dl, VT: Op.getValueType(),
4475 Operand: DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f32, Operand: SrcVal));
4476 }
4477
4478 if (SrcVal.getValueType() != MVT::f128) {
4479 // It's legal except when f128 is involved
4480 return Op;
4481 }
4482
4483 return SDValue();
4484}
4485
4486SDValue
4487AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4488 SelectionDAG &DAG) const {
4489 // AArch64 FP-to-int conversions saturate to the destination element size, so
4490 // we can lower common saturating conversions to simple instructions.
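// E.g. a v4f32 -> v4i32 fptosi.sat with a 32-bit saturation width maps directly
// onto a single "fcvtzs v0.4s, v0.4s"; narrower saturation widths are handled
// below by converting to the wider integer type and clamping with SMIN/SMAX or
// UMIN before truncating.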
4491 SDValue SrcVal = Op.getOperand(i: 0);
4492 EVT SrcVT = SrcVal.getValueType();
4493 EVT DstVT = Op.getValueType();
4494 EVT SatVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
4495
4496 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4497 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4498 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4499 assert(SatWidth <= DstElementWidth &&
4500 "Saturation width cannot exceed result width");
4501
4502 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4503 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4504 // types, so this is hard to reach.
4505 if (DstVT.isScalableVector())
4506 return SDValue();
4507
4508 EVT SrcElementVT = SrcVT.getVectorElementType();
4509
4510 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4511 if ((SrcElementVT == MVT::f16 &&
4512 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4513 SrcElementVT == MVT::bf16) {
4514 MVT F32VT = MVT::getVectorVT(VT: MVT::f32, NumElements: SrcVT.getVectorNumElements());
4515 SrcVal = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SDLoc(Op), VT: F32VT, Operand: SrcVal);
4516 SrcVT = F32VT;
4517 SrcElementVT = MVT::f32;
4518 SrcElementWidth = 32;
4519 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4520 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4521 return SDValue();
4522
4523 SDLoc DL(Op);
4524 // Expand to f64 if we are saturating to i64, to help keep the lanes the same
4525 // width and produce a fcvtzu.
4526 if (SatWidth == 64 && SrcElementWidth < 64) {
4527 MVT F64VT = MVT::getVectorVT(VT: MVT::f64, NumElements: SrcVT.getVectorNumElements());
4528 SrcVal = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: F64VT, Operand: SrcVal);
4529 SrcVT = F64VT;
4530 SrcElementVT = MVT::f64;
4531 SrcElementWidth = 64;
4532 }
4533 // Cases that we can emit directly.
4534 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
4535 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: SrcVal,
4536 N2: DAG.getValueType(DstVT.getScalarType()));
4537
4538 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4539 // result. This is only valid if the legal cvt is larger than the saturate
4540 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4541 // (at least until sqxtn is selected).
4542 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4543 return SDValue();
4544
4545 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4546 SDValue NativeCvt = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: IntVT, N1: SrcVal,
4547 N2: DAG.getValueType(IntVT.getScalarType()));
4548 SDValue Sat;
4549 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4550 SDValue MinC = DAG.getConstant(
4551 Val: APInt::getSignedMaxValue(numBits: SatWidth).sext(width: SrcElementWidth), DL, VT: IntVT);
4552 SDValue Min = DAG.getNode(Opcode: ISD::SMIN, DL, VT: IntVT, N1: NativeCvt, N2: MinC);
4553 SDValue MaxC = DAG.getConstant(
4554 Val: APInt::getSignedMinValue(numBits: SatWidth).sext(width: SrcElementWidth), DL, VT: IntVT);
4555 Sat = DAG.getNode(Opcode: ISD::SMAX, DL, VT: IntVT, N1: Min, N2: MaxC);
4556 } else {
4557 SDValue MinC = DAG.getConstant(
4558 Val: APInt::getAllOnes(numBits: SatWidth).zext(width: SrcElementWidth), DL, VT: IntVT);
4559 Sat = DAG.getNode(Opcode: ISD::UMIN, DL, VT: IntVT, N1: NativeCvt, N2: MinC);
4560 }
4561
4562 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: Sat);
4563}
4564
4565SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4566 SelectionDAG &DAG) const {
4567 // AArch64 FP-to-int conversions saturate to the destination register size, so
4568 // we can lower common saturating conversions to simple instructions.
4569 SDValue SrcVal = Op.getOperand(i: 0);
4570 EVT SrcVT = SrcVal.getValueType();
4571
4572 if (SrcVT.isVector())
4573 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4574
4575 EVT DstVT = Op.getValueType();
4576 EVT SatVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
4577 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4578 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4579 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4580
4581 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4582 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4583 SrcVal = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SDLoc(Op), VT: MVT::f32, Operand: SrcVal);
4584 SrcVT = MVT::f32;
4585 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4586 SrcVT != MVT::bf16)
4587 return SDValue();
4588
4589 SDLoc DL(Op);
4590 // Cases that we can emit directly.
4591 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4592 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4593 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4594 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: SrcVal,
4595 N2: DAG.getValueType(DstVT));
4596
4597 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4598 // result. This is only valid if the legal cvt is larger than the saturate
4599 // width.
4600 if (DstWidth < SatWidth)
4601 return SDValue();
4602
4603 SDValue NativeCvt =
4604 DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: SrcVal, N2: DAG.getValueType(DstVT));
4605 SDValue Sat;
4606 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4607 SDValue MinC = DAG.getConstant(
4608 Val: APInt::getSignedMaxValue(numBits: SatWidth).sext(width: DstWidth), DL, VT: DstVT);
4609 SDValue Min = DAG.getNode(Opcode: ISD::SMIN, DL, VT: DstVT, N1: NativeCvt, N2: MinC);
4610 SDValue MaxC = DAG.getConstant(
4611 Val: APInt::getSignedMinValue(numBits: SatWidth).sext(width: DstWidth), DL, VT: DstVT);
4612 Sat = DAG.getNode(Opcode: ISD::SMAX, DL, VT: DstVT, N1: Min, N2: MaxC);
4613 } else {
4614 SDValue MinC = DAG.getConstant(
4615 Val: APInt::getAllOnes(numBits: SatWidth).zext(width: DstWidth), DL, VT: DstVT);
4616 Sat = DAG.getNode(Opcode: ISD::UMIN, DL, VT: DstVT, N1: NativeCvt, N2: MinC);
4617 }
4618
4619 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: Sat);
4620}
4621
4622SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
4623 SelectionDAG &DAG) const {
4624 EVT VT = Op.getValueType();
4625 SDValue Src = Op.getOperand(i: 0);
4626 SDLoc DL(Op);
4627
4628 assert(VT.isVector() && "Expected vector type");
4629
4630 EVT CastVT =
4631 VT.changeVectorElementType(EltVT: Src.getValueType().getVectorElementType());
4632
4633 // Round the floating-point value into a floating-point register with the
4634 // current rounding mode.
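// E.g. a v2f64 -> v2i64 lrint-style conversion becomes, roughly, an
// "frintx v0.2d, v0.2d" followed by a saturating "fcvtzs v0.2d, v0.2d".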
4635 SDValue FOp = DAG.getNode(Opcode: ISD::FRINT, DL, VT: CastVT, Operand: Src);
4636
4637 // Truncate the rounded floating point to an integer.
4638 return DAG.getNode(Opcode: ISD::FP_TO_SINT_SAT, DL, VT, N1: FOp,
4639 N2: DAG.getValueType(VT.getVectorElementType()));
4640}
4641
4642SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4643 SelectionDAG &DAG) const {
4644 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4645 // Any additional optimization in this function should be recorded
4646 // in the cost tables.
4647 bool IsStrict = Op->isStrictFPOpcode();
4648 EVT VT = Op.getValueType();
4649 SDLoc dl(Op);
4650 SDValue In = Op.getOperand(i: IsStrict ? 1 : 0);
4651 EVT InVT = In.getValueType();
4652 unsigned Opc = Op.getOpcode();
4653 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
4654
4655 if (VT.isScalableVector()) {
4656 if (InVT.getVectorElementType() == MVT::i1) {
4657 // We can't directly extend an SVE predicate; extend it first.
4658 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4659 EVT CastVT = getPromotedVTForPredicate(VT: InVT);
4660 In = DAG.getNode(Opcode: CastOpc, DL: dl, VT: CastVT, Operand: In);
4661 return DAG.getNode(Opcode: Opc, DL: dl, VT, Operand: In);
4662 }
4663
4664 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
4665 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
4666 return LowerToPredicatedOp(Op, DAG, NewOp: Opcode);
4667 }
4668
4669 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()) ||
4670 useSVEForFixedLengthVectorVT(VT: InVT, OverrideNEON: !Subtarget->isNeonAvailable()))
4671 return LowerFixedLengthIntToFPToSVE(Op, DAG);
4672
4673 // Promote bf16 conversions to f32.
4674 if (VT.getVectorElementType() == MVT::bf16) {
4675 EVT F32 = VT.changeElementType(EltVT: MVT::f32);
4676 if (IsStrict) {
4677 SDValue Val = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, ResultTys: {F32, MVT::Other},
4678 Ops: {Op.getOperand(i: 0), In});
4679 return DAG.getNode(
4680 Opcode: ISD::STRICT_FP_ROUND, DL: dl, ResultTys: {Op.getValueType(), MVT::Other},
4681 Ops: {Val.getValue(R: 1), Val.getValue(R: 0), DAG.getIntPtrConstant(Val: 0, DL: dl)});
4682 }
4683 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: Op.getValueType(),
4684 N1: DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: F32, Operand: In),
4685 N2: DAG.getIntPtrConstant(Val: 0, DL: dl));
4686 }
4687
4688 uint64_t VTSize = VT.getFixedSizeInBits();
4689 uint64_t InVTSize = InVT.getFixedSizeInBits();
4690 if (VTSize < InVTSize) {
4691 MVT CastVT =
4692 MVT::getVectorVT(VT: MVT::getFloatingPointVT(BitWidth: InVT.getScalarSizeInBits()),
4693 NumElements: InVT.getVectorNumElements());
4694 if (IsStrict) {
4695 In = DAG.getNode(Opcode: Opc, DL: dl, ResultTys: {CastVT, MVT::Other},
4696 Ops: {Op.getOperand(i: 0), In});
4697 return DAG.getNode(
4698 Opcode: ISD::STRICT_FP_ROUND, DL: dl, ResultTys: {VT, MVT::Other},
4699 Ops: {In.getValue(R: 1), In.getValue(R: 0), DAG.getIntPtrConstant(Val: 0, DL: dl)});
4700 }
4701 In = DAG.getNode(Opcode: Opc, DL: dl, VT: CastVT, Operand: In);
4702 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT, N1: In,
4703 N2: DAG.getIntPtrConstant(Val: 0, DL: dl, /*isTarget=*/true));
4704 }
4705
4706 if (VTSize > InVTSize) {
4707 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4708 EVT CastVT = VT.changeVectorElementTypeToInteger();
4709 In = DAG.getNode(Opcode: CastOpc, DL: dl, VT: CastVT, Operand: In);
4710 if (IsStrict)
4711 return DAG.getNode(Opcode: Opc, DL: dl, ResultTys: {VT, MVT::Other}, Ops: {Op.getOperand(i: 0), In});
4712 return DAG.getNode(Opcode: Opc, DL: dl, VT, Operand: In);
4713 }
4714
4715 // Use a scalar operation for conversions between single-element vectors of
4716 // the same size.
4717 if (VT.getVectorNumElements() == 1) {
4718 SDValue Extract = DAG.getNode(
4719 Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: InVT.getScalarType(),
4720 N1: In, N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64));
4721 EVT ScalarVT = VT.getScalarType();
4722 if (IsStrict)
4723 return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, ResultTys: {ScalarVT, MVT::Other},
4724 Ops: {Op.getOperand(i: 0), Extract});
4725 return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: ScalarVT, Operand: Extract);
4726 }
4727
4728 return Op;
4729}
4730
4731SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4732 SelectionDAG &DAG) const {
4733 if (Op.getValueType().isVector())
4734 return LowerVectorINT_TO_FP(Op, DAG);
4735
4736 bool IsStrict = Op->isStrictFPOpcode();
4737 SDValue SrcVal = Op.getOperand(i: IsStrict ? 1 : 0);
4738
4739 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
4740 Op->getOpcode() == ISD::SINT_TO_FP;
4741
4742 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
4743 SDLoc dl(Op);
4744 if (IsStrict) {
4745 SDValue Val = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, ResultTys: {PromoteVT, MVT::Other},
4746 Ops: {Op.getOperand(i: 0), SrcVal});
4747 return DAG.getNode(
4748 Opcode: ISD::STRICT_FP_ROUND, DL: dl, ResultTys: {Op.getValueType(), MVT::Other},
4749 Ops: {Val.getValue(R: 1), Val.getValue(R: 0), DAG.getIntPtrConstant(Val: 0, DL: dl)});
4750 }
4751 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: Op.getValueType(),
4752 N1: DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: PromoteVT, Operand: SrcVal),
4753 N2: DAG.getIntPtrConstant(Val: 0, DL: dl));
4754 };
4755
4756 if (Op.getValueType() == MVT::bf16) {
4757 unsigned MaxWidth = IsSigned
4758 ? DAG.ComputeMaxSignificantBits(Op: SrcVal)
4759 : DAG.computeKnownBits(Op: SrcVal).countMaxActiveBits();
4760 // bf16 conversions are promoted to f32 when converting from i16.
4761 if (MaxWidth <= 24) {
4762 return IntToFpViaPromotion(MVT::f32);
4763 }
4764
4765 // bf16 conversions are promoted to f64 when converting from i32.
4766 if (MaxWidth <= 53) {
4767 return IntToFpViaPromotion(MVT::f64);
4768 }
4769
4770 // We need to be careful about i64 -> bf16.
4771 // Consider an i32 22216703.
4772 // This number cannot be represented exactly as an f32, so an itofp will
4773 // turn it into 22216704.0; an fptrunc to bf16 will then turn this into
4774 // 22282240.0. However, the correct bf16 result is 22151168.0.
4775 // We need to use sticky rounding to get this correct.
4776 if (SrcVal.getValueType() == MVT::i64) {
4777 SDLoc DL(Op);
4778 // This algorithm is equivalent to the following:
4779 // uint64_t SrcHi = SrcVal & ~0xfffull;
4780 // uint64_t SrcLo = SrcVal & 0xfffull;
4781 // uint64_t Highest = SrcVal >> 53;
4782 // bool HasHighest = Highest != 0;
4783 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
4784 // double Rounded = static_cast<double>(ToRound);
4785 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
4786 // uint64_t HasLo = SrcLo != 0;
4787 // bool NeedsAdjustment = HasHighest & HasLo;
4788 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
4789 // double Adjusted = std::bit_cast<double>(AdjustedBits);
4790 // return static_cast<__bf16>(Adjusted);
4791 //
4792 // Essentially, what happens is that SrcVal either fits perfectly in a
4793 // double-precision value or it is too big. If it is sufficiently small,
4794 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
4795 // ensure that u64 -> double has no rounding error by only using the 52
4796 // MSB of the input. The low order bits will get merged into a sticky bit
4797 // which will avoid issues incurred by double rounding.
4798
4799 // Signed conversion is more or less like so:
4800 // copysign((__bf16)abs(SrcVal), SrcVal)
4801 SDValue SignBit;
4802 if (IsSigned) {
4803 SignBit = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i64, N1: SrcVal,
4804 N2: DAG.getConstant(Val: 1ull << 63, DL, VT: MVT::i64));
4805 SrcVal = DAG.getNode(Opcode: ISD::ABS, DL, VT: MVT::i64, Operand: SrcVal);
4806 }
4807 SDValue SrcHi = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i64, N1: SrcVal,
4808 N2: DAG.getConstant(Val: ~0xfffull, DL, VT: MVT::i64));
4809 SDValue SrcLo = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i64, N1: SrcVal,
4810 N2: DAG.getConstant(Val: 0xfffull, DL, VT: MVT::i64));
4811 SDValue Highest =
4812 DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i64, N1: SrcVal,
4813 N2: DAG.getShiftAmountConstant(Val: 53, VT: MVT::i64, DL));
4814 SDValue Zero64 = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
4815 SDValue ToRound =
4816 DAG.getSelectCC(DL, LHS: Highest, RHS: Zero64, True: SrcHi, False: SrcVal, Cond: ISD::SETNE);
4817 SDValue Rounded =
4818 IsStrict ? DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {MVT::f64, MVT::Other},
4819 Ops: {Op.getOperand(i: 0), ToRound})
4820 : DAG.getNode(Opcode: Op.getOpcode(), DL, VT: MVT::f64, Operand: ToRound);
4821
4822 SDValue RoundedBits = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: Rounded);
4823 if (SignBit) {
4824 RoundedBits = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i64, N1: RoundedBits, N2: SignBit);
4825 }
4826
4827 SDValue HasHighest = DAG.getSetCC(
4828 DL,
4829 VT: getSetCCResultType(DAG.getDataLayout(), C&: *DAG.getContext(), VT: MVT::i64),
4830 LHS: Highest, RHS: Zero64, Cond: ISD::SETNE);
4831
4832 SDValue HasLo = DAG.getSetCC(
4833 DL,
4834 VT: getSetCCResultType(DAG.getDataLayout(), C&: *DAG.getContext(), VT: MVT::i64),
4835 LHS: SrcLo, RHS: Zero64, Cond: ISD::SETNE);
4836
4837 SDValue NeedsAdjustment =
4838 DAG.getNode(Opcode: ISD::AND, DL, VT: HasLo.getValueType(), N1: HasHighest, N2: HasLo);
4839 NeedsAdjustment = DAG.getZExtOrTrunc(Op: NeedsAdjustment, DL, VT: MVT::i64);
4840
4841 SDValue AdjustedBits =
4842 DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i64, N1: RoundedBits, N2: NeedsAdjustment);
4843 SDValue Adjusted = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f64, Operand: AdjustedBits);
4844 return IsStrict
4845 ? DAG.getNode(Opcode: ISD::STRICT_FP_ROUND, DL,
4846 ResultTys: {Op.getValueType(), MVT::Other},
4847 Ops: {Rounded.getValue(R: 1), Adjusted,
4848 DAG.getIntPtrConstant(Val: 0, DL)})
4849 : DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: Op.getValueType(), N1: Adjusted,
4850 N2: DAG.getIntPtrConstant(Val: 0, DL, isTarget: true));
4851 }
4852 }
4853
4854 // f16 conversions are promoted to f32 when full fp16 is not supported.
4855 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4856 return IntToFpViaPromotion(MVT::f32);
4857 }
4858
4859 // i128 conversions are libcalls.
4860 if (SrcVal.getValueType() == MVT::i128)
4861 return SDValue();
4862
4863 // Other conversions are legal, unless it's to the completely software-based
4864 // fp128.
4865 if (Op.getValueType() != MVT::f128)
4866 return Op;
4867 return SDValue();
4868}
4869
4870SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
4871 SelectionDAG &DAG) const {
4872 // For iOS, we want to call an alternative entry point: __sincos_stret,
4873 // which returns the values in two S / D registers.
4874 SDLoc dl(Op);
4875 SDValue Arg = Op.getOperand(i: 0);
4876 EVT ArgVT = Arg.getValueType();
4877 Type *ArgTy = ArgVT.getTypeForEVT(Context&: *DAG.getContext());
4878
4879 ArgListTy Args;
4880 ArgListEntry Entry;
4881
4882 Entry.Node = Arg;
4883 Entry.Ty = ArgTy;
4884 Entry.IsSExt = false;
4885 Entry.IsZExt = false;
4886 Args.push_back(x: Entry);
4887
4888 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
4889 : RTLIB::SINCOS_STRET_F32;
4890 const char *LibcallName = getLibcallName(Call: LC);
4891 SDValue Callee =
4892 DAG.getExternalSymbol(Sym: LibcallName, VT: getPointerTy(DL: DAG.getDataLayout()));
4893
4894 StructType *RetTy = StructType::get(elt1: ArgTy, elts: ArgTy);
4895 TargetLowering::CallLoweringInfo CLI(DAG);
4896 CLI.setDebugLoc(dl)
4897 .setChain(DAG.getEntryNode())
4898 .setLibCallee(CC: CallingConv::Fast, ResultType: RetTy, Target: Callee, ArgsList: std::move(Args));
4899
4900 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4901 return CallResult.first;
4902}
4903
4904static MVT getSVEContainerType(EVT ContentTy);
4905
4906SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
4907 SelectionDAG &DAG) const {
4908 EVT OpVT = Op.getValueType();
4909 EVT ArgVT = Op.getOperand(i: 0).getValueType();
4910
4911 if (useSVEForFixedLengthVectorVT(VT: OpVT))
4912 return LowerFixedLengthBitcastToSVE(Op, DAG);
4913
4914 if (OpVT.isScalableVector()) {
4915 // Bitcasting between unpacked vector types of different element counts is
4916 // not a NOP because the live elements are laid out differently.
4917 // 01234567
4918 // e.g. nxv2i32 = XX??XX??
4919 // nxv4f16 = X?X?X?X?
4920 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
4921 return SDValue();
4922
4923 if (isTypeLegal(VT: OpVT) && !isTypeLegal(VT: ArgVT)) {
4924 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
4925 "Expected int->fp bitcast!");
4926 SDValue ExtResult =
4927 DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SDLoc(Op), VT: getSVEContainerType(ContentTy: ArgVT),
4928 Operand: Op.getOperand(i: 0));
4929 return getSVESafeBitCast(VT: OpVT, Op: ExtResult, DAG);
4930 }
4931 return getSVESafeBitCast(VT: OpVT, Op: Op.getOperand(i: 0), DAG);
4932 }
4933
4934 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
4935 return SDValue();
4936
4937 // Bitcasts between f16 and bf16 are legal.
4938 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
4939 return Op;
4940
4941 assert(ArgVT == MVT::i16);
4942 SDLoc DL(Op);
4943
4944 Op = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Op.getOperand(i: 0));
4945 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: Op);
4946 return DAG.getTargetExtractSubreg(SRIdx: AArch64::hsub, DL, VT: OpVT, Operand: Op);
4947}
4948
4949static EVT getExtensionTo64Bits(const EVT &OrigVT) {
4950 if (OrigVT.getSizeInBits() >= 64)
4951 return OrigVT;
4952
4953 assert(OrigVT.isSimple() && "Expecting a simple value type");
4954
4955 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
4956 switch (OrigSimpleTy) {
4957 default: llvm_unreachable("Unexpected Vector Type");
4958 case MVT::v2i8:
4959 case MVT::v2i16:
4960 return MVT::v2i32;
4961 case MVT::v4i8:
4962 return MVT::v4i16;
4963 }
4964}
4965
4966static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
4967 const EVT &OrigTy,
4968 const EVT &ExtTy,
4969 unsigned ExtOpcode) {
4970 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
4971 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
4972 // 64-bits we need to insert a new extension so that it will be 64-bits.
4973 assert(ExtTy.is128BitVector() && "Unexpected extension size");
4974 if (OrigTy.getSizeInBits() >= 64)
4975 return N;
4976
4977 // Must extend size to at least 64 bits to be used as an operand for VMULL.
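// E.g. a v4i8 operand is widened to v4i16 here so that the eventual [us]mull
// sees a 64-bit wide input vector.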
4978 EVT NewVT = getExtensionTo64Bits(OrigVT: OrigTy);
4979
4980 return DAG.getNode(Opcode: ExtOpcode, DL: SDLoc(N), VT: NewVT, Operand: N);
4981}
4982
4983// Returns lane if Op extracts from a two-element vector and lane is constant
4984// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
4985static std::optional<uint64_t>
4986getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
4987 SDNode *OpNode = Op.getNode();
4988 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
4989 return std::nullopt;
4990
4991 EVT VT = OpNode->getOperand(Num: 0).getValueType();
4992 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: OpNode->getOperand(Num: 1));
4993 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
4994 return std::nullopt;
4995
4996 return C->getZExtValue();
4997}
4998
4999static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
5000 bool isSigned) {
5001 EVT VT = N.getValueType();
5002
5003 if (N.getOpcode() != ISD::BUILD_VECTOR)
5004 return false;
5005
5006 for (const SDValue &Elt : N->op_values()) {
5007 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Elt)) {
5008 unsigned EltSize = VT.getScalarSizeInBits();
5009 unsigned HalfSize = EltSize / 2;
5010 if (isSigned) {
5011 if (!isIntN(N: HalfSize, x: C->getSExtValue()))
5012 return false;
5013 } else {
5014 if (!isUIntN(N: HalfSize, x: C->getZExtValue()))
5015 return false;
5016 }
5017 continue;
5018 }
5019 return false;
5020 }
5021
5022 return true;
5023}
5024
5025static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
5026 EVT VT = N.getValueType();
5027 assert(VT.is128BitVector() && "Unexpected vector MULL size");
5028
5029 unsigned NumElts = VT.getVectorNumElements();
5030 unsigned OrigEltSize = VT.getScalarSizeInBits();
5031 unsigned EltSize = OrigEltSize / 2;
5032 MVT TruncVT = MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: EltSize), NumElements: NumElts);
5033
5034 APInt HiBits = APInt::getHighBitsSet(numBits: OrigEltSize, hiBitsSet: EltSize);
5035 if (DAG.MaskedValueIsZero(Op: N, Mask: HiBits))
5036 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SDLoc(N), VT: TruncVT, Operand: N);
5037
5038 if (ISD::isExtOpcode(Opcode: N.getOpcode()))
5039 return addRequiredExtensionForVectorMULL(N: N.getOperand(i: 0), DAG,
5040 OrigTy: N.getOperand(i: 0).getValueType(), ExtTy: VT,
5041 ExtOpcode: N.getOpcode());
5042
5043 assert(N.getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
5044 SDLoc dl(N);
5045 SmallVector<SDValue, 8> Ops;
5046 for (unsigned i = 0; i != NumElts; ++i) {
5047 const APInt &CInt = N.getConstantOperandAPInt(i);
5048 // Element types smaller than 32 bits are not legal, so use i32 elements.
5049 // The values are implicitly truncated so sext vs. zext doesn't matter.
5050 Ops.push_back(Elt: DAG.getConstant(Val: CInt.zextOrTrunc(width: 32), DL: dl, VT: MVT::i32));
5051 }
5052 return DAG.getBuildVector(VT: TruncVT, DL: dl, Ops);
5053}
5054
5055static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
5056 return N.getOpcode() == ISD::SIGN_EXTEND ||
5057 N.getOpcode() == ISD::ANY_EXTEND ||
5058 isExtendedBUILD_VECTOR(N, DAG, isSigned: true);
5059}
5060
5061static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
5062 return N.getOpcode() == ISD::ZERO_EXTEND ||
5063 N.getOpcode() == ISD::ANY_EXTEND ||
5064 isExtendedBUILD_VECTOR(N, DAG, isSigned: false);
5065}
5066
5067static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
5068 unsigned Opcode = N.getOpcode();
5069 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5070 SDValue N0 = N.getOperand(i: 0);
5071 SDValue N1 = N.getOperand(i: 1);
5072 return N0->hasOneUse() && N1->hasOneUse() &&
5073 isSignExtended(N: N0, DAG) && isSignExtended(N: N1, DAG);
5074 }
5075 return false;
5076}
5077
5078static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
5079 unsigned Opcode = N.getOpcode();
5080 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5081 SDValue N0 = N.getOperand(i: 0);
5082 SDValue N1 = N.getOperand(i: 1);
5083 return N0->hasOneUse() && N1->hasOneUse() &&
5084 isZeroExtended(N: N0, DAG) && isZeroExtended(N: N1, DAG);
5085 }
5086 return false;
5087}
5088
5089SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5090 SelectionDAG &DAG) const {
5091 // The rounding mode is in bits 23:22 of the FPCR.
5092 // The AArch64 rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3,
5093 // 3->0. The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3),
5094 // so that the shift and the AND get folded into a bitfield extract.
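// For example, with FPCR.RMode == 0b11 (round towards zero) this computes
// ((3 + 1) & 3) == 0, the FLT_ROUNDS value for "towards zero".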
5095 SDLoc dl(Op);
5096
5097 SDValue Chain = Op.getOperand(i: 0);
5098 SDValue FPCR_64 = DAG.getNode(
5099 Opcode: ISD::INTRINSIC_W_CHAIN, DL: dl, ResultTys: {MVT::i64, MVT::Other},
5100 Ops: {Chain, DAG.getConstant(Val: Intrinsic::aarch64_get_fpcr, DL: dl, VT: MVT::i64)});
5101 Chain = FPCR_64.getValue(R: 1);
5102 SDValue FPCR_32 = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i32, Operand: FPCR_64);
5103 SDValue FltRounds = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i32, N1: FPCR_32,
5104 N2: DAG.getConstant(Val: 1U << 22, DL: dl, VT: MVT::i32));
5105 SDValue RMODE = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i32, N1: FltRounds,
5106 N2: DAG.getConstant(Val: 22, DL: dl, VT: MVT::i32));
5107 SDValue AND = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i32, N1: RMODE,
5108 N2: DAG.getConstant(Val: 3, DL: dl, VT: MVT::i32));
5109 return DAG.getMergeValues(Ops: {AND, Chain}, dl);
5110}
5111
5112SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5113 SelectionDAG &DAG) const {
5114 SDLoc DL(Op);
5115 SDValue Chain = Op->getOperand(Num: 0);
5116 SDValue RMValue = Op->getOperand(Num: 1);
5117
5118 // The rounding mode is in bits 23:22 of the FPCR.
5119 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
5120 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
  // (((arg - 1) & 3) << 22).
  //
  // The argument of llvm.set.rounding must be within the range [0, 3], so
  // NearestTiesToAway (4) is not handled here. It is the responsibility of the
  // code that generates llvm.set.rounding to ensure this condition.
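  // A worked example (illustrative value only): llvm.set.rounding(2), i.e.
  // round toward positive infinity, gives ((2 - 1) & 3) == 1, the FPCR RMode
  // encoding for that mode, which is then shifted into bits 23:22 below.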
5126
5127 // Calculate new value of FPCR[23:22].
5128 RMValue = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i32, N1: RMValue,
5129 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
5130 RMValue = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: RMValue,
5131 N2: DAG.getConstant(Val: 0x3, DL, VT: MVT::i32));
5132 RMValue =
5133 DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: RMValue,
5134 N2: DAG.getConstant(Val: AArch64::RoundingBitsPos, DL, VT: MVT::i32));
5135 RMValue = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: RMValue);
5136
5137 // Get current value of FPCR.
5138 SDValue Ops[] = {
5139 Chain, DAG.getTargetConstant(Val: Intrinsic::aarch64_get_fpcr, DL, VT: MVT::i64)};
5140 SDValue FPCR =
5141 DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL, ResultTys: {MVT::i64, MVT::Other}, Ops);
5142 Chain = FPCR.getValue(R: 1);
5143 FPCR = FPCR.getValue(R: 0);
5144
  // Put the new rounding mode into FPCR[23:22].
5146 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5147 FPCR = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i64, N1: FPCR,
5148 N2: DAG.getConstant(Val: RMMask, DL, VT: MVT::i64));
5149 FPCR = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i64, N1: FPCR, N2: RMValue);
5150 SDValue Ops2[] = {
5151 Chain, DAG.getTargetConstant(Val: Intrinsic::aarch64_set_fpcr, DL, VT: MVT::i64),
5152 FPCR};
5153 return DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL, VT: MVT::Other, Ops: Ops2);
5154}
5155
5156SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5157 SelectionDAG &DAG) const {
5158 SDLoc DL(Op);
5159 SDValue Chain = Op->getOperand(Num: 0);
5160
5161 // Get current value of FPCR.
5162 SDValue Ops[] = {
5163 Chain, DAG.getTargetConstant(Val: Intrinsic::aarch64_get_fpcr, DL, VT: MVT::i64)};
5164 SDValue FPCR =
5165 DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL, ResultTys: {MVT::i64, MVT::Other}, Ops);
5166 Chain = FPCR.getValue(R: 1);
5167 FPCR = FPCR.getValue(R: 0);
5168
5169 // Truncate FPCR to 32 bits.
5170 SDValue Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i32, Operand: FPCR);
5171
5172 return DAG.getMergeValues(Ops: {Result, Chain}, dl: DL);
5173}
5174
5175SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5176 SelectionDAG &DAG) const {
5177 SDLoc DL(Op);
5178 SDValue Chain = Op->getOperand(Num: 0);
5179 SDValue Mode = Op->getOperand(Num: 1);
5180
5181 // Extend the specified value to 64 bits.
5182 SDValue FPCR = DAG.getZExtOrTrunc(Op: Mode, DL, VT: MVT::i64);
5183
5184 // Set new value of FPCR.
5185 SDValue Ops2[] = {
5186 Chain, DAG.getConstant(Val: Intrinsic::aarch64_set_fpcr, DL, VT: MVT::i64), FPCR};
5187 return DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL, VT: MVT::Other, Ops: Ops2);
5188}
5189
5190SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5191 SelectionDAG &DAG) const {
5192 SDLoc DL(Op);
5193 SDValue Chain = Op->getOperand(Num: 0);
5194
5195 // Get current value of FPCR.
5196 SDValue Ops[] = {
5197 Chain, DAG.getTargetConstant(Val: Intrinsic::aarch64_get_fpcr, DL, VT: MVT::i64)};
5198 SDValue FPCR =
5199 DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL, ResultTys: {MVT::i64, MVT::Other}, Ops);
5200 Chain = FPCR.getValue(R: 1);
5201 FPCR = FPCR.getValue(R: 0);
5202
5203 // Clear bits that are not reserved.
5204 SDValue FPSCRMasked = DAG.getNode(
5205 Opcode: ISD::AND, DL, VT: MVT::i64, N1: FPCR,
5206 N2: DAG.getConstant(Val: AArch64::ReservedFPControlBits, DL, VT: MVT::i64));
5207
5208 // Set new value of FPCR.
5209 SDValue Ops2[] = {Chain,
5210 DAG.getConstant(Val: Intrinsic::aarch64_set_fpcr, DL, VT: MVT::i64),
5211 FPSCRMasked};
5212 return DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL, VT: MVT::Other, Ops: Ops2);
5213}
5214
5215static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5216 SDLoc DL, bool &IsMLA) {
5217 bool IsN0SExt = isSignExtended(N: N0, DAG);
5218 bool IsN1SExt = isSignExtended(N: N1, DAG);
5219 if (IsN0SExt && IsN1SExt)
5220 return AArch64ISD::SMULL;
5221
5222 bool IsN0ZExt = isZeroExtended(N: N0, DAG);
5223 bool IsN1ZExt = isZeroExtended(N: N1, DAG);
5224
5225 if (IsN0ZExt && IsN1ZExt)
5226 return AArch64ISD::UMULL;
5227
5228 // Select SMULL if we can replace zext with sext.
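  // For example (a hypothetical shape, not tied to a particular caller): in
  // mul (sext v8i8 %a), (zext v8i8 %b), if the sign bit of %b is known to be
  // zero then zext and sext agree on %b, so the zext can be rewritten as a
  // sext and the multiply can still become SMULL.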
5229 if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) &&
5230 !isExtendedBUILD_VECTOR(N: N0, DAG, isSigned: false) &&
5231 !isExtendedBUILD_VECTOR(N: N1, DAG, isSigned: false)) {
5232 SDValue ZextOperand;
5233 if (IsN0ZExt)
5234 ZextOperand = N0.getOperand(i: 0);
5235 else
5236 ZextOperand = N1.getOperand(i: 0);
5237 if (DAG.SignBitIsZero(Op: ZextOperand)) {
5238 SDValue NewSext =
5239 DAG.getSExtOrTrunc(Op: ZextOperand, DL, VT: N0.getValueType());
5240 if (IsN0ZExt)
5241 N0 = NewSext;
5242 else
5243 N1 = NewSext;
5244 return AArch64ISD::SMULL;
5245 }
5246 }
5247
5248 // Select UMULL if we can replace the other operand with an extend.
5249 if (IsN0ZExt || IsN1ZExt) {
5250 EVT VT = N0.getValueType();
5251 APInt Mask = APInt::getHighBitsSet(numBits: VT.getScalarSizeInBits(),
5252 hiBitsSet: VT.getScalarSizeInBits() / 2);
5253 if (DAG.MaskedValueIsZero(Op: IsN0ZExt ? N1 : N0, Mask))
5254 return AArch64ISD::UMULL;
5255 }
5256
5257 if (!IsN1SExt && !IsN1ZExt)
5258 return 0;
5259
5260 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5261 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5262 if (IsN1SExt && isAddSubSExt(N: N0, DAG)) {
5263 IsMLA = true;
5264 return AArch64ISD::SMULL;
5265 }
5266 if (IsN1ZExt && isAddSubZExt(N: N0, DAG)) {
5267 IsMLA = true;
5268 return AArch64ISD::UMULL;
5269 }
5270 if (IsN0ZExt && isAddSubZExt(N: N1, DAG)) {
5271 std::swap(a&: N0, b&: N1);
5272 IsMLA = true;
5273 return AArch64ISD::UMULL;
5274 }
5275 return 0;
5276}
5277
5278SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5279 EVT VT = Op.getValueType();
5280
5281 bool OverrideNEON = !Subtarget->isNeonAvailable();
5282 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5283 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MUL_PRED);
5284
5285 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5286 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5287 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5288 "unexpected type for custom-lowering ISD::MUL");
5289 SDValue N0 = Op.getOperand(i: 0);
5290 SDValue N1 = Op.getOperand(i: 1);
5291 bool isMLA = false;
5292 EVT OVT = VT;
5293 if (VT.is64BitVector()) {
5294 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5295 isNullConstant(V: N0.getOperand(i: 1)) &&
5296 N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5297 isNullConstant(V: N1.getOperand(i: 1))) {
5298 N0 = N0.getOperand(i: 0);
5299 N1 = N1.getOperand(i: 0);
5300 VT = N0.getValueType();
5301 } else {
5302 if (VT == MVT::v1i64) {
5303 if (Subtarget->hasSVE())
5304 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MUL_PRED);
5305 // Fall through to expand this. It is not legal.
5306 return SDValue();
5307 } else
5308 // Other vector multiplications are legal.
5309 return Op;
5310 }
5311 }
5312
5313 SDLoc DL(Op);
5314 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, IsMLA&: isMLA);
5315
5316 if (!NewOpc) {
5317 if (VT.getVectorElementType() == MVT::i64) {
5318 // If SVE is available then i64 vector multiplications can also be made
5319 // legal.
5320 if (Subtarget->hasSVE())
5321 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MUL_PRED);
5322 // Fall through to expand this. It is not legal.
5323 return SDValue();
5324 } else
5325 // Other vector multiplications are legal.
5326 return Op;
5327 }
5328
5329 // Legalize to a S/UMULL instruction
5330 SDValue Op0;
5331 SDValue Op1 = skipExtensionForVectorMULL(N: N1, DAG);
5332 if (!isMLA) {
5333 Op0 = skipExtensionForVectorMULL(N: N0, DAG);
5334 assert(Op0.getValueType().is64BitVector() &&
5335 Op1.getValueType().is64BitVector() &&
5336 "unexpected types for extended operands to VMULL");
5337 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: OVT,
5338 N1: DAG.getNode(Opcode: NewOpc, DL, VT, N1: Op0, N2: Op1),
5339 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
5340 }
  // Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
  // isel lowering to take advantage of no-stall back-to-back s/umul + s/umla.
  // This is beneficial for CPUs with accumulate forwarding such as
  // Cortex-A53/A57.
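  // For example (an illustrative shape only): with N0 == add (zext %a),
  // (zext %b) and N1 == zext %c, the multiply below becomes
  // add (UMULL %a, %c), (UMULL %b, %c).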
5344 SDValue N00 = skipExtensionForVectorMULL(N: N0.getOperand(i: 0), DAG);
5345 SDValue N01 = skipExtensionForVectorMULL(N: N0.getOperand(i: 1), DAG);
5346 EVT Op1VT = Op1.getValueType();
5347 return DAG.getNode(
5348 Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: OVT,
5349 N1: DAG.getNode(Opcode: N0.getOpcode(), DL, VT,
5350 N1: DAG.getNode(Opcode: NewOpc, DL, VT,
5351 N1: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op1VT, Operand: N00), N2: Op1),
5352 N2: DAG.getNode(Opcode: NewOpc, DL, VT,
5353 N1: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op1VT, Operand: N01), N2: Op1)),
5354 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
5355}
5356
5357static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5358 int Pattern) {
5359 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
5360 return DAG.getConstant(Val: 1, DL, VT: MVT::nxv1i1);
5361 return DAG.getNode(Opcode: AArch64ISD::PTRUE, DL, VT,
5362 Operand: DAG.getTargetConstant(Val: Pattern, DL, VT: MVT::i32));
5363}
5364
5365static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG,
5366 bool IsSigned, bool IsEqual) {
5367 if (!isa<ConstantSDNode>(Val: Op.getOperand(i: 1)) ||
5368 !isa<ConstantSDNode>(Val: Op.getOperand(i: 2)))
5369 return SDValue();
5370
5371 SDLoc dl(Op);
5372 APInt X = Op.getConstantOperandAPInt(i: 1);
5373 APInt Y = Op.getConstantOperandAPInt(i: 2);
5374 bool Overflow;
5375 APInt NumActiveElems =
5376 IsSigned ? Y.ssub_ov(RHS: X, Overflow) : Y.usub_ov(RHS: X, Overflow);
5377
5378 if (Overflow)
5379 return SDValue();
5380
5381 if (IsEqual) {
5382 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5383 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(RHS: One, Overflow)
5384 : NumActiveElems.uadd_ov(RHS: One, Overflow);
5385 if (Overflow)
5386 return SDValue();
5387 }
5388
5389 std::optional<unsigned> PredPattern =
5390 getSVEPredPatternFromNumElements(MinNumElts: NumActiveElems.getZExtValue());
5391 unsigned MinSVEVectorSize = std::max(
5392 a: DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), b: 128u);
5393 unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
5394 if (PredPattern != std::nullopt &&
5395 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5396 return getPTrue(DAG, DL: dl, VT: Op.getValueType(), Pattern: *PredPattern);
5397
5398 return SDValue();
5399}
5400
5401// Returns a safe bitcast between two scalable vector predicates, where
5402// any newly created lanes from a widening bitcast are defined as zero.
5403static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5404 SDLoc DL(Op);
5405 EVT InVT = Op.getValueType();
5406
5407 assert(InVT.getVectorElementType() == MVT::i1 &&
5408 VT.getVectorElementType() == MVT::i1 &&
5409 "Expected a predicate-to-predicate bitcast");
5410 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5411 InVT.isScalableVector() &&
5412 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5413 "Only expect to cast between legal scalable predicate types!");
5414
5415 // Return the operand if the cast isn't changing type,
5416 // e.g. <n x 16 x i1> -> <n x 16 x i1>
5417 if (InVT == VT)
5418 return Op;
5419
5420 SDValue Reinterpret = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT, Operand: Op);
5421
5422 // We only have to zero the lanes if new lanes are being defined, e.g. when
5423 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5424 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5425 // we can return here.
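  // For example (illustrative only): widening <vscale x 2 x i1> to
  // <vscale x 16 x i1> leaves the lanes that have no counterpart in the
  // source predicate undefined after the reinterpret, so they are cleared
  // below by AND'ing with an all-true <vscale x 2 x i1> mask reinterpreted
  // the same way.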
5426 if (InVT.bitsGT(VT))
5427 return Reinterpret;
5428
5429 // Check if the other lanes are already known to be zeroed by
5430 // construction.
5431 if (isZeroingInactiveLanes(Op))
5432 return Reinterpret;
5433
5434 // Zero the newly introduced lanes.
5435 SDValue Mask = DAG.getConstant(Val: 1, DL, VT: InVT);
5436 Mask = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT, Operand: Mask);
5437 return DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Reinterpret, N2: Mask);
5438}
5439
5440SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5441 SDValue Chain, SDLoc DL,
5442 EVT VT) const {
5443 SDValue Callee = DAG.getExternalSymbol(Sym: "__arm_sme_state",
5444 VT: getPointerTy(DL: DAG.getDataLayout()));
5445 Type *Int64Ty = Type::getInt64Ty(C&: *DAG.getContext());
5446 Type *RetTy = StructType::get(elt1: Int64Ty, elts: Int64Ty);
5447 TargetLowering::CallLoweringInfo CLI(DAG);
5448 ArgListTy Args;
5449 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5450 CC: CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
5451 ResultType: RetTy, Target: Callee, ArgsList: std::move(Args));
5452 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5453 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ Val: 1, DL, VT: MVT::i64);
5454 return DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i64, N1: CallResult.first.getOperand(i: 0),
5455 N2: Mask);
5456}
5457
5458// Lower an SME LDR/STR ZA intrinsic
5459// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5460// folded into the instruction
5461// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5462// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5463// and tile slice registers
5464// ldr(%tileslice, %ptr, %vecnum)
5465// ->
5466// %svl = rdsvl
5467// %ptr2 = %ptr + %svl * %vecnum
5468// %tileslice2 = %tileslice + %vecnum
5469// ldr [%tileslice2, 0], [%ptr2, 0]
// Case 3: If the vecnum is an immediate out of range, then the same is done as
// case 2, but the base and slice registers are modified by the greatest
// multiple of 16 not exceeding the vecnum, and the remainder is folded into
// the instruction. This means that successive loads and stores that are offset
// from each other can share the same base and slice register updates.
// ldr(%tileslice, %ptr, 22)
// ldr(%tileslice, %ptr, 23)
// ->
// %svl = rdsvl
// %ptr2 = %ptr + %svl * 16
// %tileslice2 = %tileslice + 16
// ldr [%tileslice2, 6], [%ptr2, 6]
// ldr [%tileslice2, 7], [%ptr2, 7]
5483// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5484// operand and the immediate can be folded into the instruction, like case 2.
5485// ldr(%tileslice, %ptr, %vecnum + 7)
5486// ldr(%tileslice, %ptr, %vecnum + 8)
5487// ->
5488// %svl = rdsvl
5489// %ptr2 = %ptr + %svl * %vecnum
5490// %tileslice2 = %tileslice + %vecnum
5491// ldr [%tileslice2, 7], [%ptr2, 7]
5492// ldr [%tileslice2, 8], [%ptr2, 8]
5493// Case 5: The vecnum being an add of an immediate out of range is also handled,
5494// in which case the same remainder logic as case 3 is used.
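// Tying the cases above to the code below (an informal sketch): ConstAddend
// holds the known immediate part of the vecnum (22 in case 3), ImmAddend is
// ConstAddend % 16 (6) and is folded into the instruction, and the remaining
// multiple of 16 is moved into VarAddend, which is added to the tile slice
// and, scaled by the streaming vector length in bytes, to the base pointer.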
5495SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5496 SDLoc DL(N);
5497
5498 SDValue TileSlice = N->getOperand(Num: 2);
5499 SDValue Base = N->getOperand(Num: 3);
5500 SDValue VecNum = N->getOperand(Num: 4);
5501 int32_t ConstAddend = 0;
5502 SDValue VarAddend = VecNum;
5503
5504 // If the vnum is an add of an immediate, we can fold it into the instruction
5505 if (VecNum.getOpcode() == ISD::ADD &&
5506 isa<ConstantSDNode>(Val: VecNum.getOperand(i: 1))) {
5507 ConstAddend = cast<ConstantSDNode>(Val: VecNum.getOperand(i: 1))->getSExtValue();
5508 VarAddend = VecNum.getOperand(i: 0);
5509 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(Val&: VecNum)) {
5510 ConstAddend = ImmNode->getSExtValue();
5511 VarAddend = SDValue();
5512 }
5513
5514 int32_t ImmAddend = ConstAddend % 16;
5515 if (int32_t C = (ConstAddend - ImmAddend)) {
5516 SDValue CVal = DAG.getTargetConstant(Val: C, DL, VT: MVT::i32);
5517 VarAddend = VarAddend
5518 ? DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, Ops: {VarAddend, CVal})
5519 : CVal;
5520 }
5521
5522 if (VarAddend) {
5523 // Get the vector length that will be multiplied by vnum
5524 auto SVL = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL, VT: MVT::i64,
5525 Operand: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
5526
5527 // Multiply SVL and vnum then add it to the base
5528 SDValue Mul = DAG.getNode(
5529 Opcode: ISD::MUL, DL, VT: MVT::i64,
5530 Ops: {SVL, DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: MVT::i64, Operand: VarAddend)});
5531 Base = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, Ops: {Base, Mul});
5532 // Just add vnum to the tileslice
5533 TileSlice = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, Ops: {TileSlice, VarAddend});
5534 }
5535
5536 return DAG.getNode(Opcode: IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
5537 DL, VT: MVT::Other,
5538 Ops: {/*Chain=*/N.getOperand(i: 0), TileSlice, Base,
5539 DAG.getTargetConstant(Val: ImmAddend, DL, VT: MVT::i32)});
5540}
5541
5542SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5543 SelectionDAG &DAG) const {
5544 unsigned IntNo = Op.getConstantOperandVal(i: 1);
5545 SDLoc DL(Op);
5546 switch (IntNo) {
5547 default:
5548 return SDValue(); // Don't custom lower most intrinsics.
5549 case Intrinsic::aarch64_prefetch: {
5550 SDValue Chain = Op.getOperand(i: 0);
5551 SDValue Addr = Op.getOperand(i: 2);
5552
5553 unsigned IsWrite = Op.getConstantOperandVal(i: 3);
5554 unsigned Locality = Op.getConstantOperandVal(i: 4);
5555 unsigned IsStream = Op.getConstantOperandVal(i: 5);
5556 unsigned IsData = Op.getConstantOperandVal(i: 6);
5557 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5558 (!IsData << 3) | // IsDataCache bit
5559 (Locality << 1) | // Cache level bits
5560 (unsigned)IsStream; // Stream bit
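    // For example (assumed operand values, for illustration only): IsWrite=0,
    // IsData=1, Locality=0, IsStream=0 encodes as 0b00000 (PLDL1KEEP), while
    // IsWrite=1, IsData=1, Locality=2, IsStream=1 encodes as 0b10101
    // (PSTL3STRM).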
5561
5562 return DAG.getNode(Opcode: AArch64ISD::PREFETCH, DL, VT: MVT::Other, N1: Chain,
5563 N2: DAG.getTargetConstant(Val: PrfOp, DL, VT: MVT::i32), N3: Addr);
5564 }
5565 case Intrinsic::aarch64_sme_str:
5566 case Intrinsic::aarch64_sme_ldr: {
5567 return LowerSMELdrStr(N: Op, DAG, IsLoad: IntNo == Intrinsic::aarch64_sme_ldr);
5568 }
5569 case Intrinsic::aarch64_sme_za_enable:
5570 return DAG.getNode(
5571 Opcode: AArch64ISD::SMSTART, DL, VT: MVT::Other,
5572 N1: Op->getOperand(Num: 0), // Chain
5573 N2: DAG.getTargetConstant(Val: (int32_t)(AArch64SVCR::SVCRZA), DL, VT: MVT::i32),
5574 N3: DAG.getConstant(Val: AArch64SME::Always, DL, VT: MVT::i64));
5575 case Intrinsic::aarch64_sme_za_disable:
5576 return DAG.getNode(
5577 Opcode: AArch64ISD::SMSTOP, DL, VT: MVT::Other,
5578 N1: Op->getOperand(Num: 0), // Chain
5579 N2: DAG.getTargetConstant(Val: (int32_t)(AArch64SVCR::SVCRZA), DL, VT: MVT::i32),
5580 N3: DAG.getConstant(Val: AArch64SME::Always, DL, VT: MVT::i64));
5581 }
5582}
5583
5584SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5585 SelectionDAG &DAG) const {
5586 unsigned IntNo = Op.getConstantOperandVal(i: 1);
5587 SDLoc DL(Op);
5588 switch (IntNo) {
5589 default:
5590 return SDValue(); // Don't custom lower most intrinsics.
5591 case Intrinsic::aarch64_mops_memset_tag: {
5592 auto Node = cast<MemIntrinsicSDNode>(Val: Op.getNode());
5593 SDValue Chain = Node->getChain();
5594 SDValue Dst = Op.getOperand(i: 2);
5595 SDValue Val = Op.getOperand(i: 3);
5596 Val = DAG.getAnyExtOrTrunc(Op: Val, DL, VT: MVT::i64);
5597 SDValue Size = Op.getOperand(i: 4);
5598 auto Alignment = Node->getMemOperand()->getAlign();
5599 bool IsVol = Node->isVolatile();
5600 auto DstPtrInfo = Node->getPointerInfo();
5601
5602 const auto &SDI =
5603 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5604 SDValue MS =
5605 SDI.EmitMOPS(SDOpcode: AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, SrcOrValue: Val,
5606 Size, Alignment, isVolatile: IsVol, DstPtrInfo, SrcPtrInfo: MachinePointerInfo{});
5607
5608 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
5609 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
5610 // LowerOperationWrapper will complain that the number of results has
5611 // changed.
5612 return DAG.getMergeValues(Ops: {MS.getValue(R: 0), MS.getValue(R: 2)}, dl: DL);
5613 }
5614 }
5615}
5616
5617SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5618 SelectionDAG &DAG) const {
5619 unsigned IntNo = Op.getConstantOperandVal(i: 0);
5620 SDLoc dl(Op);
5621 switch (IntNo) {
5622 default: return SDValue(); // Don't custom lower most intrinsics.
5623 case Intrinsic::thread_pointer: {
5624 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
5625 return DAG.getNode(Opcode: AArch64ISD::THREAD_POINTER, DL: dl, VT: PtrVT);
5626 }
5627 case Intrinsic::aarch64_neon_abs: {
5628 EVT Ty = Op.getValueType();
5629 if (Ty == MVT::i64) {
5630 SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v1i64,
5631 Operand: Op.getOperand(i: 1));
5632 Result = DAG.getNode(Opcode: ISD::ABS, DL: dl, VT: MVT::v1i64, Operand: Result);
5633 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i64, Operand: Result);
5634 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(VT: Ty)) {
5635 return DAG.getNode(Opcode: ISD::ABS, DL: dl, VT: Ty, Operand: Op.getOperand(i: 1));
5636 } else {
      report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
5638 }
5639 }
5640 case Intrinsic::aarch64_neon_pmull64: {
5641 SDValue LHS = Op.getOperand(i: 1);
5642 SDValue RHS = Op.getOperand(i: 2);
5643
5644 std::optional<uint64_t> LHSLane =
5645 getConstantLaneNumOfExtractHalfOperand(Op&: LHS);
5646 std::optional<uint64_t> RHSLane =
5647 getConstantLaneNumOfExtractHalfOperand(Op&: RHS);
5648
5649 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
5650 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
5651
    // 'aarch64_neon_pmull64' takes i64 parameters, while the pmull/pmull2
    // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
    // which ISel recognizes better. For example, this generates a ldr into d*
    // registers as opposed to a GPR load followed by a fmov.
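    // A sketch of the difference (illustrative assembly; actual codegen may
    // vary):
    //   i64 operand:   ldr x8, [x0]; fmov d0, x8; pmull v0.1q, v0.1d, v1.1d
    //   v1i64 operand: ldr d0, [x0];              pmull v0.1q, v0.1d, v1.1d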
5656 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
5657 std::optional<uint64_t> OtherLane,
5658 const SDLoc &dl,
5659 SelectionDAG &DAG) -> SDValue {
      // If the operand is a higher half itself, rewrite it to
      // extract_high_v2i64; this way aarch64_neon_pmull64 can
      // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
5663 if (NLane && *NLane == 1)
5664 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: MVT::v1i64,
5665 N1: N.getOperand(i: 0), N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i64));
5666
5667 // Operand N is not a higher half but the other operand is.
5668 if (OtherLane && *OtherLane == 1) {
5669 // If this operand is a lower half, rewrite it to
5670 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
5671 // align lanes of two operands. A roundtrip sequence (to move from lane
5672 // 1 to lane 0) is like this:
5673 // mov x8, v0.d[1]
5674 // fmov d0, x8
5675 if (NLane && *NLane == 0)
5676 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: MVT::v1i64,
5677 N1: DAG.getNode(Opcode: AArch64ISD::DUPLANE64, DL: dl, VT: MVT::v2i64,
5678 N1: N.getOperand(i: 0),
5679 N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64)),
5680 N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i64));
5681
5682 // Otherwise just dup from main to all lanes.
5683 return DAG.getNode(Opcode: AArch64ISD::DUP, DL: dl, VT: MVT::v1i64, Operand: N);
5684 }
5685
5686 // Neither operand is an extract of higher half, so codegen may just use
5687 // the non-high version of PMULL instruction. Use v1i64 to represent i64.
5688 assert(N.getValueType() == MVT::i64 &&
5689 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
5690 return DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: MVT::v1i64, Operand: N);
5691 };
5692
5693 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
5694 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
5695
5696 return DAG.getNode(Opcode: AArch64ISD::PMULL, DL: dl, VT: Op.getValueType(), N1: LHS, N2: RHS);
5697 }
5698 case Intrinsic::aarch64_neon_smax:
5699 return DAG.getNode(Opcode: ISD::SMAX, DL: dl, VT: Op.getValueType(),
5700 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5701 case Intrinsic::aarch64_neon_umax:
5702 return DAG.getNode(Opcode: ISD::UMAX, DL: dl, VT: Op.getValueType(),
5703 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5704 case Intrinsic::aarch64_neon_smin:
5705 return DAG.getNode(Opcode: ISD::SMIN, DL: dl, VT: Op.getValueType(),
5706 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5707 case Intrinsic::aarch64_neon_umin:
5708 return DAG.getNode(Opcode: ISD::UMIN, DL: dl, VT: Op.getValueType(),
5709 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5710 case Intrinsic::aarch64_neon_scalar_sqxtn:
5711 case Intrinsic::aarch64_neon_scalar_sqxtun:
5712 case Intrinsic::aarch64_neon_scalar_uqxtn: {
5713 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
5714 if (Op.getValueType() == MVT::i32)
5715 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i32,
5716 Operand: DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: MVT::f32,
5717 N1: Op.getOperand(i: 0),
5718 N2: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::f64,
5719 Operand: Op.getOperand(i: 1))));
5720 return SDValue();
5721 }
5722 case Intrinsic::aarch64_sve_whilelo:
5723 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5724 /*IsEqual=*/false);
5725 case Intrinsic::aarch64_sve_whilelt:
5726 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5727 /*IsEqual=*/false);
5728 case Intrinsic::aarch64_sve_whilels:
5729 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5730 /*IsEqual=*/true);
5731 case Intrinsic::aarch64_sve_whilele:
5732 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5733 /*IsEqual=*/true);
5734 case Intrinsic::aarch64_sve_sunpkhi:
5735 return DAG.getNode(Opcode: AArch64ISD::SUNPKHI, DL: dl, VT: Op.getValueType(),
5736 Operand: Op.getOperand(i: 1));
5737 case Intrinsic::aarch64_sve_sunpklo:
5738 return DAG.getNode(Opcode: AArch64ISD::SUNPKLO, DL: dl, VT: Op.getValueType(),
5739 Operand: Op.getOperand(i: 1));
5740 case Intrinsic::aarch64_sve_uunpkhi:
5741 return DAG.getNode(Opcode: AArch64ISD::UUNPKHI, DL: dl, VT: Op.getValueType(),
5742 Operand: Op.getOperand(i: 1));
5743 case Intrinsic::aarch64_sve_uunpklo:
5744 return DAG.getNode(Opcode: AArch64ISD::UUNPKLO, DL: dl, VT: Op.getValueType(),
5745 Operand: Op.getOperand(i: 1));
5746 case Intrinsic::aarch64_sve_clasta_n:
5747 return DAG.getNode(Opcode: AArch64ISD::CLASTA_N, DL: dl, VT: Op.getValueType(),
5748 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
5749 case Intrinsic::aarch64_sve_clastb_n:
5750 return DAG.getNode(Opcode: AArch64ISD::CLASTB_N, DL: dl, VT: Op.getValueType(),
5751 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
5752 case Intrinsic::aarch64_sve_lasta:
5753 return DAG.getNode(Opcode: AArch64ISD::LASTA, DL: dl, VT: Op.getValueType(),
5754 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5755 case Intrinsic::aarch64_sve_lastb:
5756 return DAG.getNode(Opcode: AArch64ISD::LASTB, DL: dl, VT: Op.getValueType(),
5757 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5758 case Intrinsic::aarch64_sve_rev:
5759 return DAG.getNode(Opcode: ISD::VECTOR_REVERSE, DL: dl, VT: Op.getValueType(),
5760 Operand: Op.getOperand(i: 1));
5761 case Intrinsic::aarch64_sve_tbl:
5762 return DAG.getNode(Opcode: AArch64ISD::TBL, DL: dl, VT: Op.getValueType(),
5763 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5764 case Intrinsic::aarch64_sve_trn1:
5765 return DAG.getNode(Opcode: AArch64ISD::TRN1, DL: dl, VT: Op.getValueType(),
5766 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5767 case Intrinsic::aarch64_sve_trn2:
5768 return DAG.getNode(Opcode: AArch64ISD::TRN2, DL: dl, VT: Op.getValueType(),
5769 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5770 case Intrinsic::aarch64_sve_uzp1:
5771 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL: dl, VT: Op.getValueType(),
5772 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5773 case Intrinsic::aarch64_sve_uzp2:
5774 return DAG.getNode(Opcode: AArch64ISD::UZP2, DL: dl, VT: Op.getValueType(),
5775 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5776 case Intrinsic::aarch64_sve_zip1:
5777 return DAG.getNode(Opcode: AArch64ISD::ZIP1, DL: dl, VT: Op.getValueType(),
5778 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5779 case Intrinsic::aarch64_sve_zip2:
5780 return DAG.getNode(Opcode: AArch64ISD::ZIP2, DL: dl, VT: Op.getValueType(),
5781 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5782 case Intrinsic::aarch64_sve_splice:
5783 return DAG.getNode(Opcode: AArch64ISD::SPLICE, DL: dl, VT: Op.getValueType(),
5784 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
5785 case Intrinsic::aarch64_sve_ptrue:
5786 return getPTrue(DAG, DL: dl, VT: Op.getValueType(), Pattern: Op.getConstantOperandVal(i: 1));
5787 case Intrinsic::aarch64_sve_clz:
5788 return DAG.getNode(Opcode: AArch64ISD::CTLZ_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5789 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5790 case Intrinsic::aarch64_sme_cntsb:
5791 return DAG.getNode(Opcode: AArch64ISD::RDSVL, DL: dl, VT: Op.getValueType(),
5792 Operand: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32));
5793 case Intrinsic::aarch64_sme_cntsh: {
5794 SDValue One = DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32);
5795 SDValue Bytes = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL: dl, VT: Op.getValueType(), Operand: One);
5796 return DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: Op.getValueType(), N1: Bytes, N2: One);
5797 }
5798 case Intrinsic::aarch64_sme_cntsw: {
5799 SDValue Bytes = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL: dl, VT: Op.getValueType(),
5800 Operand: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32));
5801 return DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: Op.getValueType(), N1: Bytes,
5802 N2: DAG.getConstant(Val: 2, DL: dl, VT: MVT::i32));
5803 }
5804 case Intrinsic::aarch64_sme_cntsd: {
5805 SDValue Bytes = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL: dl, VT: Op.getValueType(),
5806 Operand: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32));
5807 return DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: Op.getValueType(), N1: Bytes,
5808 N2: DAG.getConstant(Val: 3, DL: dl, VT: MVT::i32));
5809 }
5810 case Intrinsic::aarch64_sve_cnt: {
5811 SDValue Data = Op.getOperand(i: 3);
5812 // CTPOP only supports integer operands.
5813 if (Data.getValueType().isFloatingPoint())
5814 Data = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Data);
5815 return DAG.getNode(Opcode: AArch64ISD::CTPOP_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5816 N1: Op.getOperand(i: 2), N2: Data, N3: Op.getOperand(i: 1));
5817 }
5818 case Intrinsic::aarch64_sve_dupq_lane:
5819 return LowerDUPQLane(Op, DAG);
5820 case Intrinsic::aarch64_sve_convert_from_svbool:
5821 if (Op.getValueType() == MVT::aarch64svcount)
5822 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Op.getOperand(i: 1));
5823 return getSVEPredicateBitCast(VT: Op.getValueType(), Op: Op.getOperand(i: 1), DAG);
5824 case Intrinsic::aarch64_sve_convert_to_svbool:
5825 if (Op.getOperand(i: 1).getValueType() == MVT::aarch64svcount)
5826 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::nxv16i1, Operand: Op.getOperand(i: 1));
5827 return getSVEPredicateBitCast(VT: MVT::nxv16i1, Op: Op.getOperand(i: 1), DAG);
5828 case Intrinsic::aarch64_sve_fneg:
5829 return DAG.getNode(Opcode: AArch64ISD::FNEG_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5830 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5831 case Intrinsic::aarch64_sve_frintp:
5832 return DAG.getNode(Opcode: AArch64ISD::FCEIL_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5833 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5834 case Intrinsic::aarch64_sve_frintm:
5835 return DAG.getNode(Opcode: AArch64ISD::FFLOOR_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5836 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5837 case Intrinsic::aarch64_sve_frinti:
5838 return DAG.getNode(Opcode: AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5839 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5840 case Intrinsic::aarch64_sve_frintx:
5841 return DAG.getNode(Opcode: AArch64ISD::FRINT_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5842 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5843 case Intrinsic::aarch64_sve_frinta:
5844 return DAG.getNode(Opcode: AArch64ISD::FROUND_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5845 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5846 case Intrinsic::aarch64_sve_frintn:
5847 return DAG.getNode(Opcode: AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5848 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5849 case Intrinsic::aarch64_sve_frintz:
5850 return DAG.getNode(Opcode: AArch64ISD::FTRUNC_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5851 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5852 case Intrinsic::aarch64_sve_ucvtf:
5853 return DAG.getNode(Opcode: AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, DL: dl,
5854 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
5855 N3: Op.getOperand(i: 1));
5856 case Intrinsic::aarch64_sve_scvtf:
5857 return DAG.getNode(Opcode: AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, DL: dl,
5858 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
5859 N3: Op.getOperand(i: 1));
5860 case Intrinsic::aarch64_sve_fcvtzu:
5861 return DAG.getNode(Opcode: AArch64ISD::FCVTZU_MERGE_PASSTHRU, DL: dl,
5862 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
5863 N3: Op.getOperand(i: 1));
5864 case Intrinsic::aarch64_sve_fcvtzs:
5865 return DAG.getNode(Opcode: AArch64ISD::FCVTZS_MERGE_PASSTHRU, DL: dl,
5866 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
5867 N3: Op.getOperand(i: 1));
5868 case Intrinsic::aarch64_sve_fsqrt:
5869 return DAG.getNode(Opcode: AArch64ISD::FSQRT_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5870 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5871 case Intrinsic::aarch64_sve_frecpx:
5872 return DAG.getNode(Opcode: AArch64ISD::FRECPX_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5873 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5874 case Intrinsic::aarch64_sve_frecpe_x:
5875 return DAG.getNode(Opcode: AArch64ISD::FRECPE, DL: dl, VT: Op.getValueType(),
5876 Operand: Op.getOperand(i: 1));
5877 case Intrinsic::aarch64_sve_frecps_x:
5878 return DAG.getNode(Opcode: AArch64ISD::FRECPS, DL: dl, VT: Op.getValueType(),
5879 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5880 case Intrinsic::aarch64_sve_frsqrte_x:
5881 return DAG.getNode(Opcode: AArch64ISD::FRSQRTE, DL: dl, VT: Op.getValueType(),
5882 Operand: Op.getOperand(i: 1));
5883 case Intrinsic::aarch64_sve_frsqrts_x:
5884 return DAG.getNode(Opcode: AArch64ISD::FRSQRTS, DL: dl, VT: Op.getValueType(),
5885 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5886 case Intrinsic::aarch64_sve_fabs:
5887 return DAG.getNode(Opcode: AArch64ISD::FABS_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5888 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5889 case Intrinsic::aarch64_sve_abs:
5890 return DAG.getNode(Opcode: AArch64ISD::ABS_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5891 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5892 case Intrinsic::aarch64_sve_neg:
5893 return DAG.getNode(Opcode: AArch64ISD::NEG_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5894 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5895 case Intrinsic::aarch64_sve_insr: {
5896 SDValue Scalar = Op.getOperand(i: 2);
5897 EVT ScalarTy = Scalar.getValueType();
5898 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
5899 Scalar = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: MVT::i32, Operand: Scalar);
5900
5901 return DAG.getNode(Opcode: AArch64ISD::INSR, DL: dl, VT: Op.getValueType(),
5902 N1: Op.getOperand(i: 1), N2: Scalar);
5903 }
5904 case Intrinsic::aarch64_sve_rbit:
5905 return DAG.getNode(Opcode: AArch64ISD::BITREVERSE_MERGE_PASSTHRU, DL: dl,
5906 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
5907 N3: Op.getOperand(i: 1));
5908 case Intrinsic::aarch64_sve_revb:
5909 return DAG.getNode(Opcode: AArch64ISD::BSWAP_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5910 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5911 case Intrinsic::aarch64_sve_revh:
5912 return DAG.getNode(Opcode: AArch64ISD::REVH_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5913 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5914 case Intrinsic::aarch64_sve_revw:
5915 return DAG.getNode(Opcode: AArch64ISD::REVW_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5916 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5917 case Intrinsic::aarch64_sve_revd:
5918 return DAG.getNode(Opcode: AArch64ISD::REVD_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5919 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5920 case Intrinsic::aarch64_sve_sxtb:
5921 return DAG.getNode(
5922 Opcode: AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5923 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
5924 N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i8)),
5925 N4: Op.getOperand(i: 1));
5926 case Intrinsic::aarch64_sve_sxth:
5927 return DAG.getNode(
5928 Opcode: AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5929 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
5930 N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i16)),
5931 N4: Op.getOperand(i: 1));
5932 case Intrinsic::aarch64_sve_sxtw:
5933 return DAG.getNode(
5934 Opcode: AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5935 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
5936 N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i32)),
5937 N4: Op.getOperand(i: 1));
5938 case Intrinsic::aarch64_sve_uxtb:
5939 return DAG.getNode(
5940 Opcode: AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5941 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
5942 N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i8)),
5943 N4: Op.getOperand(i: 1));
5944 case Intrinsic::aarch64_sve_uxth:
5945 return DAG.getNode(
5946 Opcode: AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5947 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
5948 N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i16)),
5949 N4: Op.getOperand(i: 1));
5950 case Intrinsic::aarch64_sve_uxtw:
5951 return DAG.getNode(
5952 Opcode: AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5953 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
5954 N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i32)),
5955 N4: Op.getOperand(i: 1));
5956 case Intrinsic::localaddress: {
5957 const auto &MF = DAG.getMachineFunction();
5958 const auto *RegInfo = Subtarget->getRegisterInfo();
5959 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
5960 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl, Reg,
5961 VT: Op.getSimpleValueType());
5962 }
5963
5964 case Intrinsic::eh_recoverfp: {
5965 // FIXME: This needs to be implemented to correctly handle highly aligned
5966 // stack objects. For now we simply return the incoming FP. Refer D53541
5967 // for more details.
5968 SDValue FnOp = Op.getOperand(i: 1);
5969 SDValue IncomingFPOp = Op.getOperand(i: 2);
5970 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Val&: FnOp);
5971 auto *Fn = dyn_cast_or_null<Function>(Val: GSD ? GSD->getGlobal() : nullptr);
5972 if (!Fn)
5973 report_fatal_error(
5974 reason: "llvm.eh.recoverfp must take a function as the first argument");
5975 return IncomingFPOp;
5976 }
5977
5978 case Intrinsic::aarch64_neon_vsri:
5979 case Intrinsic::aarch64_neon_vsli:
5980 case Intrinsic::aarch64_sve_sri:
5981 case Intrinsic::aarch64_sve_sli: {
5982 EVT Ty = Op.getValueType();
5983
5984 if (!Ty.isVector())
5985 report_fatal_error(reason: "Unexpected type for aarch64_neon_vsli");
5986
5987 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
5988
5989 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
5990 IntNo == Intrinsic::aarch64_sve_sri;
5991 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
5992 return DAG.getNode(Opcode, DL: dl, VT: Ty, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2),
5993 N3: Op.getOperand(i: 3));
5994 }
5995
5996 case Intrinsic::aarch64_neon_srhadd:
5997 case Intrinsic::aarch64_neon_urhadd:
5998 case Intrinsic::aarch64_neon_shadd:
5999 case Intrinsic::aarch64_neon_uhadd: {
6000 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6001 IntNo == Intrinsic::aarch64_neon_shadd);
6002 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6003 IntNo == Intrinsic::aarch64_neon_urhadd);
6004 unsigned Opcode = IsSignedAdd
6005 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
6006 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
6007 return DAG.getNode(Opcode, DL: dl, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
6008 N2: Op.getOperand(i: 2));
6009 }
6010 case Intrinsic::aarch64_neon_saddlp:
6011 case Intrinsic::aarch64_neon_uaddlp: {
6012 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
6013 ? AArch64ISD::UADDLP
6014 : AArch64ISD::SADDLP;
6015 return DAG.getNode(Opcode, DL: dl, VT: Op.getValueType(), Operand: Op.getOperand(i: 1));
6016 }
6017 case Intrinsic::aarch64_neon_sdot:
6018 case Intrinsic::aarch64_neon_udot:
6019 case Intrinsic::aarch64_sve_sdot:
6020 case Intrinsic::aarch64_sve_udot: {
6021 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
6022 IntNo == Intrinsic::aarch64_sve_udot)
6023 ? AArch64ISD::UDOT
6024 : AArch64ISD::SDOT;
6025 return DAG.getNode(Opcode, DL: dl, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
6026 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
6027 }
6028 case Intrinsic::get_active_lane_mask: {
6029 SDValue ID =
6030 DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_whilelo, DL: dl, VT: MVT::i64);
6031
6032 EVT VT = Op.getValueType();
6033 if (VT.isScalableVector())
6034 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT, N1: ID, N2: Op.getOperand(i: 1),
6035 N3: Op.getOperand(i: 2));
6036
6037 // We can use the SVE whilelo instruction to lower this intrinsic by
6038 // creating the appropriate sequence of scalable vector operations and
6039 // then extracting a fixed-width subvector from the scalable vector.
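    // The sign-extend below turns each predicate bit into an all-ones or
    // all-zero integer lane, so the extracted fixed-width subvector carries
    // the same mask semantics as the original i1 result (the concrete types
    // depend on how the fixed-length mask was legalised).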
6040
6041 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
6042 EVT WhileVT = ContainerVT.changeElementType(EltVT: MVT::i1);
6043
6044 SDValue Mask = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: WhileVT, N1: ID,
6045 N2: Op.getOperand(i: 1), N3: Op.getOperand(i: 2));
6046 SDValue MaskAsInt = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: dl, VT: ContainerVT, Operand: Mask);
6047 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT, N1: MaskAsInt,
6048 N2: DAG.getVectorIdxConstant(Val: 0, DL: dl));
6049 }
6050 case Intrinsic::aarch64_neon_uaddlv: {
6051 EVT OpVT = Op.getOperand(i: 1).getValueType();
6052 EVT ResVT = Op.getValueType();
6053 if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6054 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
      // In order to avoid an insert_subvector, use v4i32 rather than v2i32.
6056 SDValue UADDLV =
6057 DAG.getNode(Opcode: AArch64ISD::UADDLV, DL: dl, VT: MVT::v4i32, Operand: Op.getOperand(i: 1));
6058 SDValue EXTRACT_VEC_ELT =
6059 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: MVT::i32, N1: UADDLV,
6060 N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64));
6061 return EXTRACT_VEC_ELT;
6062 }
6063 return SDValue();
6064 }
6065 case Intrinsic::experimental_cttz_elts: {
6066 SDValue CttzOp = Op.getOperand(i: 1);
6067 EVT VT = CttzOp.getValueType();
6068 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
6069
6070 if (VT.isFixedLengthVector()) {
6071 // We can use SVE instructions to lower this intrinsic by first creating
6072 // an SVE predicate register mask from the fixed-width vector.
6073 EVT NewVT = getTypeToTransformTo(Context&: *DAG.getContext(), VT);
6074 SDValue Mask = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: dl, VT: NewVT, Operand: CttzOp);
6075 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
6076 }
6077
6078 SDValue NewCttzElts =
6079 DAG.getNode(Opcode: AArch64ISD::CTTZ_ELTS, DL: dl, VT: MVT::i64, Operand: CttzOp);
6080 return DAG.getZExtOrTrunc(Op: NewCttzElts, DL: dl, VT: Op.getValueType());
6081 }
6082 }
6083}
6084
6085bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6086 if (VT.getVectorElementType() == MVT::i8 ||
6087 VT.getVectorElementType() == MVT::i16) {
6088 EltTy = MVT::i32;
6089 return true;
6090 }
6091 return false;
6092}
6093
6094bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6095 EVT DataVT) const {
6096 const EVT IndexVT = Extend.getOperand(i: 0).getValueType();
6097 // SVE only supports implicit extension of 32-bit indices.
6098 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6099 return false;
6100
6101 // Indices cannot be smaller than the main data type.
6102 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6103 return false;
6104
6105 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6106 // element container type, which would violate the previous clause.
6107 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6108}
6109
6110bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6111 EVT ExtVT = ExtVal.getValueType();
6112 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6113 return false;
6114
6115 // It may be worth creating extending masked loads if there are multiple
6116 // masked loads using the same predicate. That way we'll end up creating
6117 // extending masked loads that may then get split by the legaliser. This
6118 // results in just one set of predicate unpacks at the start, instead of
6119 // multiple sets of vector unpacks after each load.
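  // For example (an illustrative IR shape): two llvm.masked.load calls that
  // share one nxv8i1 predicate and whose results are both zero-extended to
  // nxv8i32 can become two extending masked loads; the shared predicate is
  // then unpacked once up front instead of each loaded vector being unpacked
  // after its load.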
6120 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(Val: ExtVal->getOperand(Num: 0))) {
6121 if (!isLoadExtLegalOrCustom(ExtType: ISD::ZEXTLOAD, ValVT: ExtVT, MemVT: Ld->getValueType(ResNo: 0))) {
6122 // Disable extending masked loads for fixed-width for now, since the code
6123 // quality doesn't look great.
6124 if (!ExtVT.isScalableVector())
6125 return false;
6126
6127 unsigned NumExtMaskedLoads = 0;
6128 for (auto *U : Ld->getMask()->uses())
6129 if (isa<MaskedLoadSDNode>(Val: U))
6130 NumExtMaskedLoads++;
6131
6132 if (NumExtMaskedLoads <= 1)
6133 return false;
6134 }
6135 }
6136
6137 return true;
6138}
6139
6140unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
6141 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
6142 {std::make_tuple(/*Scaled*/ args: false, /*Signed*/ args: false, /*Extend*/ args: false),
6143 AArch64ISD::GLD1_MERGE_ZERO},
6144 {std::make_tuple(/*Scaled*/ args: false, /*Signed*/ args: false, /*Extend*/ args: true),
6145 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
6146 {std::make_tuple(/*Scaled*/ args: false, /*Signed*/ args: true, /*Extend*/ args: false),
6147 AArch64ISD::GLD1_MERGE_ZERO},
6148 {std::make_tuple(/*Scaled*/ args: false, /*Signed*/ args: true, /*Extend*/ args: true),
6149 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
6150 {std::make_tuple(/*Scaled*/ args: true, /*Signed*/ args: false, /*Extend*/ args: false),
6151 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6152 {std::make_tuple(/*Scaled*/ args: true, /*Signed*/ args: false, /*Extend*/ args: true),
6153 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
6154 {std::make_tuple(/*Scaled*/ args: true, /*Signed*/ args: true, /*Extend*/ args: false),
6155 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6156 {std::make_tuple(/*Scaled*/ args: true, /*Signed*/ args: true, /*Extend*/ args: true),
6157 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
6158 };
6159 auto Key = std::make_tuple(args&: IsScaled, args&: IsSigned, args&: NeedsExtend);
6160 return AddrModes.find(x: Key)->second;
6161}
6162
6163unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
6164 switch (Opcode) {
6165 default:
6166 llvm_unreachable("unimplemented opcode");
6167 return Opcode;
6168 case AArch64ISD::GLD1_MERGE_ZERO:
6169 return AArch64ISD::GLD1S_MERGE_ZERO;
6170 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
6171 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
6172 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
6173 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
6174 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
6175 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
6176 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
6177 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
6178 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
6179 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
6180 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
6181 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
6182 }
6183}
6184
6185SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
6186 SelectionDAG &DAG) const {
6187 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Val&: Op);
6188
6189 SDLoc DL(Op);
6190 SDValue Chain = MGT->getChain();
6191 SDValue PassThru = MGT->getPassThru();
6192 SDValue Mask = MGT->getMask();
6193 SDValue BasePtr = MGT->getBasePtr();
6194 SDValue Index = MGT->getIndex();
6195 SDValue Scale = MGT->getScale();
6196 EVT VT = Op.getValueType();
6197 EVT MemVT = MGT->getMemoryVT();
6198 ISD::LoadExtType ExtType = MGT->getExtensionType();
6199 ISD::MemIndexType IndexType = MGT->getIndexType();
6200
  // SVE supports zero (and so undef) passthrough values only; everything else
  // must be handled manually by an explicit select on the load's output.
6203 if (!PassThru->isUndef() && !isZerosVector(N: PassThru.getNode())) {
6204 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
6205 SDValue Load =
6206 DAG.getMaskedGather(VTs: MGT->getVTList(), MemVT, dl: DL, Ops,
6207 MMO: MGT->getMemOperand(), IndexType, ExtTy: ExtType);
6208 SDValue Select = DAG.getSelect(DL, VT, Cond: Mask, LHS: Load, RHS: PassThru);
6209 return DAG.getMergeValues(Ops: {Select, Load.getValue(R: 1)}, dl: DL);
6210 }
6211
6212 bool IsScaled = MGT->isIndexScaled();
6213 bool IsSigned = MGT->isIndexSigned();
6214
  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
  // must be calculated beforehand.
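  // For example (illustrative values): a gather of i16 elements with a scale
  // of 8 cannot use SVE's implicit scaling (which would be 2 here), so the
  // index is shifted left by log2(8) == 3 and the scale is rewritten as 1.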
6217 uint64_t ScaleVal = Scale->getAsZExtVal();
6218 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6219 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6220 EVT IndexVT = Index.getValueType();
6221 Index = DAG.getNode(Opcode: ISD::SHL, DL, VT: IndexVT, N1: Index,
6222 N2: DAG.getConstant(Val: Log2_32(Value: ScaleVal), DL, VT: IndexVT));
6223 Scale = DAG.getTargetConstant(Val: 1, DL, VT: Scale.getValueType());
6224
6225 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6226 return DAG.getMaskedGather(VTs: MGT->getVTList(), MemVT, dl: DL, Ops,
6227 MMO: MGT->getMemOperand(), IndexType, ExtTy: ExtType);
6228 }
6229
6230 // Lower fixed length gather to a scalable equivalent.
6231 if (VT.isFixedLengthVector()) {
6232 assert(Subtarget->useSVEForFixedLengthVectors() &&
6233 "Cannot lower when not using SVE for fixed vectors!");
6234
6235 // NOTE: Handle floating-point as if integer then bitcast the result.
6236 EVT DataVT = VT.changeVectorElementTypeToInteger();
6237 MemVT = MemVT.changeVectorElementTypeToInteger();
6238
6239 // Find the smallest integer fixed length vector we can use for the gather.
6240 EVT PromotedVT = VT.changeVectorElementType(EltVT: MVT::i32);
6241 if (DataVT.getVectorElementType() == MVT::i64 ||
6242 Index.getValueType().getVectorElementType() == MVT::i64 ||
6243 Mask.getValueType().getVectorElementType() == MVT::i64)
6244 PromotedVT = VT.changeVectorElementType(EltVT: MVT::i64);
6245
6246 // Promote vector operands except for passthrough, which we know is either
6247 // undef or zero, and thus best constructed directly.
6248 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6249 Index = DAG.getNode(Opcode: ExtOpcode, DL, VT: PromotedVT, Operand: Index);
6250 Mask = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: PromotedVT, Operand: Mask);
6251
6252 // A promoted result type forces the need for an extending load.
6253 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
6254 ExtType = ISD::EXTLOAD;
6255
6256 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: PromotedVT);
6257
6258 // Convert fixed length vector operands to scalable.
6259 MemVT = ContainerVT.changeVectorElementType(EltVT: MemVT.getVectorElementType());
6260 Index = convertToScalableVector(DAG, VT: ContainerVT, V: Index);
6261 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6262 PassThru = PassThru->isUndef() ? DAG.getUNDEF(VT: ContainerVT)
6263 : DAG.getConstant(Val: 0, DL, VT: ContainerVT);
6264
6265 // Emit equivalent scalable vector gather.
6266 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6267 SDValue Load =
6268 DAG.getMaskedGather(VTs: DAG.getVTList(VT1: ContainerVT, VT2: MVT::Other), MemVT, dl: DL,
6269 Ops, MMO: MGT->getMemOperand(), IndexType, ExtTy: ExtType);
6270
6271 // Extract fixed length data then convert to the required result type.
6272 SDValue Result = convertFromScalableVector(DAG, VT: PromotedVT, V: Load);
6273 Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DataVT, Operand: Result);
6274 if (VT.isFloatingPoint())
6275 Result = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Result);
6276
6277 return DAG.getMergeValues(Ops: {Result, Load.getValue(R: 1)}, dl: DL);
6278 }
6279
6280 // Everything else is legal.
6281 return Op;
6282}
6283
6284SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
6285 SelectionDAG &DAG) const {
6286 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Val&: Op);
6287
6288 SDLoc DL(Op);
6289 SDValue Chain = MSC->getChain();
6290 SDValue StoreVal = MSC->getValue();
6291 SDValue Mask = MSC->getMask();
6292 SDValue BasePtr = MSC->getBasePtr();
6293 SDValue Index = MSC->getIndex();
6294 SDValue Scale = MSC->getScale();
6295 EVT VT = StoreVal.getValueType();
6296 EVT MemVT = MSC->getMemoryVT();
6297 ISD::MemIndexType IndexType = MSC->getIndexType();
6298 bool Truncating = MSC->isTruncatingStore();
6299
6300 bool IsScaled = MSC->isIndexScaled();
6301 bool IsSigned = MSC->isIndexSigned();
6302
  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
  // must be calculated beforehand.
6305 uint64_t ScaleVal = Scale->getAsZExtVal();
6306 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6307 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6308 EVT IndexVT = Index.getValueType();
6309 Index = DAG.getNode(Opcode: ISD::SHL, DL, VT: IndexVT, N1: Index,
6310 N2: DAG.getConstant(Val: Log2_32(Value: ScaleVal), DL, VT: IndexVT));
6311 Scale = DAG.getTargetConstant(Val: 1, DL, VT: Scale.getValueType());
6312
6313 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6314 return DAG.getMaskedScatter(VTs: MSC->getVTList(), MemVT, dl: DL, Ops,
6315 MMO: MSC->getMemOperand(), IndexType, IsTruncating: Truncating);
6316 }
6317
6318 // Lower fixed length scatter to a scalable equivalent.
6319 if (VT.isFixedLengthVector()) {
6320 assert(Subtarget->useSVEForFixedLengthVectors() &&
6321 "Cannot lower when not using SVE for fixed vectors!");
6322
    // Once bitcast, floating-point scatters are treated as integer scatters.
6324 if (VT.isFloatingPoint()) {
6325 VT = VT.changeVectorElementTypeToInteger();
6326 MemVT = MemVT.changeVectorElementTypeToInteger();
6327 StoreVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: StoreVal);
6328 }
6329
6330 // Find the smallest integer fixed length vector we can use for the scatter.
6331 EVT PromotedVT = VT.changeVectorElementType(EltVT: MVT::i32);
6332 if (VT.getVectorElementType() == MVT::i64 ||
6333 Index.getValueType().getVectorElementType() == MVT::i64 ||
6334 Mask.getValueType().getVectorElementType() == MVT::i64)
6335 PromotedVT = VT.changeVectorElementType(EltVT: MVT::i64);
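    // For example, a v4i16 scatter with 32-bit indices any-extends the data
    // to v4i32 and emits a truncating nxv4i32 scatter.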
6336
6337 // Promote vector operands.
6338 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6339 Index = DAG.getNode(Opcode: ExtOpcode, DL, VT: PromotedVT, Operand: Index);
6340 Mask = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: PromotedVT, Operand: Mask);
6341 StoreVal = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: PromotedVT, Operand: StoreVal);
6342
6343 // A promoted value type forces the need for a truncating store.
6344 if (PromotedVT != VT)
6345 Truncating = true;
6346
6347 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: PromotedVT);
6348
6349 // Convert fixed length vector operands to scalable.
6350 MemVT = ContainerVT.changeVectorElementType(EltVT: MemVT.getVectorElementType());
6351 Index = convertToScalableVector(DAG, VT: ContainerVT, V: Index);
6352 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6353 StoreVal = convertToScalableVector(DAG, VT: ContainerVT, V: StoreVal);
6354
6355 // Emit equivalent scalable vector scatter.
6356 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6357 return DAG.getMaskedScatter(VTs: MSC->getVTList(), MemVT, dl: DL, Ops,
6358 MMO: MSC->getMemOperand(), IndexType, IsTruncating: Truncating);
6359 }
6360
6361 // Everything else is legal.
6362 return Op;
6363}
6364
6365SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6366 SDLoc DL(Op);
6367 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Val&: Op);
6368 assert(LoadNode && "Expected custom lowering of a masked load node");
6369 EVT VT = Op->getValueType(ResNo: 0);
6370
6371 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6372 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6373
6374 SDValue PassThru = LoadNode->getPassThru();
6375 SDValue Mask = LoadNode->getMask();
6376
6377 if (PassThru->isUndef() || isZerosVector(N: PassThru.getNode()))
6378 return Op;
6379
6380 SDValue Load = DAG.getMaskedLoad(
6381 VT, dl: DL, Chain: LoadNode->getChain(), Base: LoadNode->getBasePtr(),
6382 Offset: LoadNode->getOffset(), Mask, Src0: DAG.getUNDEF(VT), MemVT: LoadNode->getMemoryVT(),
6383 MMO: LoadNode->getMemOperand(), AM: LoadNode->getAddressingMode(),
6384 LoadNode->getExtensionType());
6385
6386 SDValue Result = DAG.getSelect(DL, VT, Cond: Mask, LHS: Load, RHS: PassThru);
6387
6388 return DAG.getMergeValues(Ops: {Result, Load.getValue(R: 1)}, dl: DL);
6389}
6390
6391// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
6392static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
6393 EVT VT, EVT MemVT,
6394 SelectionDAG &DAG) {
6395 assert(VT.isVector() && "VT should be a vector type");
6396 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6397
6398 SDValue Value = ST->getValue();
6399
  // First extend the promoted v4i16 to v8i16, truncate to v8i8, and extract
  // the word lane that represents the v4i8 subvector. This optimizes the
  // store to:
6403 //
6404 // xtn v0.8b, v0.8h
6405 // str s0, [x0]
6406
6407 SDValue Undef = DAG.getUNDEF(VT: MVT::i16);
6408 SDValue UndefVec = DAG.getBuildVector(VT: MVT::v4i16, DL,
6409 Ops: {Undef, Undef, Undef, Undef});
6410
6411 SDValue TruncExt = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v8i16,
6412 N1: Value, N2: UndefVec);
6413 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::v8i8, Operand: TruncExt);
6414
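  // Bitcast to v2i32 so the four truncated bytes (the v4i8 payload) can be
  // extracted as a single i32 and stored with a 32-bit scalar store.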
6415 Trunc = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Trunc);
6416 SDValue ExtractTrunc = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32,
6417 N1: Trunc, N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
6418
6419 return DAG.getStore(Chain: ST->getChain(), dl: DL, Val: ExtractTrunc,
6420 Ptr: ST->getBasePtr(), MMO: ST->getMemOperand());
6421}
6422
// Custom lowering for any store, vector or scalar, normal or truncating.
// Currently we only custom lower truncating stores from v4i16 to v4i8 and
// volatile stores of i128.
6426SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6427 SelectionDAG &DAG) const {
6428 SDLoc Dl(Op);
6429 StoreSDNode *StoreNode = cast<StoreSDNode>(Val&: Op);
  assert(StoreNode && "Can only custom lower store nodes");
6431
6432 SDValue Value = StoreNode->getValue();
6433
6434 EVT VT = Value.getValueType();
6435 EVT MemVT = StoreNode->getMemoryVT();
6436
6437 if (VT.isVector()) {
6438 if (useSVEForFixedLengthVectorVT(
6439 VT,
6440 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6441 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6442
6443 unsigned AS = StoreNode->getAddressSpace();
6444 Align Alignment = StoreNode->getAlign();
6445 if (Alignment < MemVT.getStoreSize() &&
6446 !allowsMisalignedMemoryAccesses(VT: MemVT, AddrSpace: AS, Alignment,
6447 Flags: StoreNode->getMemOperand()->getFlags(),
6448 Fast: nullptr)) {
6449 return scalarizeVectorStore(ST: StoreNode, DAG);
6450 }
6451
6452 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6453 MemVT == MVT::v4i8) {
6454 return LowerTruncateVectorStore(DL: Dl, ST: StoreNode, VT, MemVT, DAG);
6455 }
    // 256-bit non-temporal stores can be lowered to STNP. Do this as part
    // of the custom lowering, as there are no un-paired non-temporal stores
    // and legalization will break up 256-bit inputs.
6459 ElementCount EC = MemVT.getVectorElementCount();
6460 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6461 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6462 (MemVT.getScalarSizeInBits() == 8u ||
6463 MemVT.getScalarSizeInBits() == 16u ||
6464 MemVT.getScalarSizeInBits() == 32u ||
6465 MemVT.getScalarSizeInBits() == 64u)) {
6466 SDValue Lo =
6467 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: Dl,
6468 VT: MemVT.getHalfNumVectorElementsVT(Context&: *DAG.getContext()),
6469 N1: StoreNode->getValue(), N2: DAG.getConstant(Val: 0, DL: Dl, VT: MVT::i64));
6470 SDValue Hi =
6471 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: Dl,
6472 VT: MemVT.getHalfNumVectorElementsVT(Context&: *DAG.getContext()),
6473 N1: StoreNode->getValue(),
6474 N2: DAG.getConstant(Val: EC.getKnownMinValue() / 2, DL: Dl, VT: MVT::i64));
6475 SDValue Result = DAG.getMemIntrinsicNode(
6476 Opcode: AArch64ISD::STNP, dl: Dl, VTList: DAG.getVTList(VT: MVT::Other),
6477 Ops: {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
6478 MemVT: StoreNode->getMemoryVT(), MMO: StoreNode->getMemOperand());
6479 return Result;
6480 }
6481 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6482 return LowerStore128(Op, DAG);
6483 } else if (MemVT == MVT::i64x8) {
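    // LS64 (i64x8) stores are split into eight consecutive i64 stores of
    // the extracted parts at offsets 0, 8, ..., 56 from the base pointer.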
6484 SDValue Value = StoreNode->getValue();
6485 assert(Value->getValueType(0) == MVT::i64x8);
6486 SDValue Chain = StoreNode->getChain();
6487 SDValue Base = StoreNode->getBasePtr();
6488 EVT PtrVT = Base.getValueType();
6489 for (unsigned i = 0; i < 8; i++) {
6490 SDValue Part = DAG.getNode(Opcode: AArch64ISD::LS64_EXTRACT, DL: Dl, VT: MVT::i64,
6491 N1: Value, N2: DAG.getConstant(Val: i, DL: Dl, VT: MVT::i32));
6492 SDValue Ptr = DAG.getNode(Opcode: ISD::ADD, DL: Dl, VT: PtrVT, N1: Base,
6493 N2: DAG.getConstant(Val: i * 8, DL: Dl, VT: PtrVT));
6494 Chain = DAG.getStore(Chain, dl: Dl, Val: Part, Ptr, PtrInfo: StoreNode->getPointerInfo(),
6495 Alignment: StoreNode->getOriginalAlign());
6496 }
6497 return Chain;
6498 }
6499
6500 return SDValue();
6501}
6502
6503/// Lower atomic or volatile 128-bit stores to a single STP instruction.
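///
/// For example, a plain volatile i128 store roughly becomes a single
///   stp xLo, xHi, [xAddr]
/// while a release atomic store uses STILP instead when it is available.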
6504SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
6505 SelectionDAG &DAG) const {
6506 MemSDNode *StoreNode = cast<MemSDNode>(Val&: Op);
6507 assert(StoreNode->getMemoryVT() == MVT::i128);
6508 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
6509
6510 bool IsStoreRelease =
6511 StoreNode->getMergedOrdering() == AtomicOrdering::Release;
6512 if (StoreNode->isAtomic())
6513 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
6514 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
6515 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
6516 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
6517
6518 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
6519 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
6520 ? StoreNode->getOperand(Num: 1)
6521 : StoreNode->getOperand(Num: 2);
6522 SDLoc DL(Op);
6523 auto StoreValue = DAG.SplitScalar(N: Value, DL, LoVT: MVT::i64, HiVT: MVT::i64);
6524 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
6525 if (DAG.getDataLayout().isBigEndian())
6526 std::swap(a&: StoreValue.first, b&: StoreValue.second);
6527 SDValue Result = DAG.getMemIntrinsicNode(
6528 Opcode, dl: DL, VTList: DAG.getVTList(VT: MVT::Other),
6529 Ops: {StoreNode->getChain(), StoreValue.first, StoreValue.second,
6530 StoreNode->getBasePtr()},
6531 MemVT: StoreNode->getMemoryVT(), MMO: StoreNode->getMemOperand());
6532 return Result;
6533}
6534
6535SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
6536 SelectionDAG &DAG) const {
6537 SDLoc DL(Op);
6538 LoadSDNode *LoadNode = cast<LoadSDNode>(Val&: Op);
6539 assert(LoadNode && "Expected custom lowering of a load node");
6540
6541 if (LoadNode->getMemoryVT() == MVT::i64x8) {
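    // LS64 (i64x8) loads are reassembled from eight consecutive i64 loads
    // at offsets 0, 8, ..., 56, combined into a single LS64_BUILD node.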
6542 SmallVector<SDValue, 8> Ops;
6543 SDValue Base = LoadNode->getBasePtr();
6544 SDValue Chain = LoadNode->getChain();
6545 EVT PtrVT = Base.getValueType();
6546 for (unsigned i = 0; i < 8; i++) {
6547 SDValue Ptr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: Base,
6548 N2: DAG.getConstant(Val: i * 8, DL, VT: PtrVT));
6549 SDValue Part = DAG.getLoad(VT: MVT::i64, dl: DL, Chain, Ptr,
6550 PtrInfo: LoadNode->getPointerInfo(),
6551 Alignment: LoadNode->getOriginalAlign());
6552 Ops.push_back(Elt: Part);
6553 Chain = SDValue(Part.getNode(), 1);
6554 }
6555 SDValue Loaded = DAG.getNode(Opcode: AArch64ISD::LS64_BUILD, DL, VT: MVT::i64x8, Ops);
6556 return DAG.getMergeValues(Ops: {Loaded, Chain}, dl: DL);
6557 }
6558
6559 // Custom lowering for extending v4i8 vector loads.
6560 EVT VT = Op->getValueType(ResNo: 0);
6561 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
6562
6563 if (LoadNode->getMemoryVT() != MVT::v4i8)
6564 return SDValue();
6565
6566 // Avoid generating unaligned loads.
6567 if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
6568 return SDValue();
6569
6570 unsigned ExtType;
6571 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
6572 ExtType = ISD::SIGN_EXTEND;
6573 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
6574 LoadNode->getExtensionType() == ISD::EXTLOAD)
6575 ExtType = ISD::ZERO_EXTEND;
6576 else
6577 return SDValue();
6578
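  // Load the four bytes as a single 32-bit FP scalar and widen in vector
  // registers; roughly:
  //   ldr   s0, [x0]
  //   ushll v0.8h, v0.8b, #0   (sshll for sign-extending loads)
  // with a second extend step when the result type is v4i32.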
6579 SDValue Load = DAG.getLoad(VT: MVT::f32, dl: DL, Chain: LoadNode->getChain(),
6580 Ptr: LoadNode->getBasePtr(), PtrInfo: MachinePointerInfo());
6581 SDValue Chain = Load.getValue(R: 1);
6582 SDValue Vec = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: MVT::v2f32, Operand: Load);
6583 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v8i8, Operand: Vec);
6584 SDValue Ext = DAG.getNode(Opcode: ExtType, DL, VT: MVT::v8i16, Operand: BC);
6585 Ext = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v4i16, N1: Ext,
6586 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
6587 if (VT == MVT::v4i32)
6588 Ext = DAG.getNode(Opcode: ExtType, DL, VT: MVT::v4i32, Operand: Ext);
6589 return DAG.getMergeValues(Ops: {Ext, Chain}, dl: DL);
6590}
6591
6592// Generate SUBS and CSEL for integer abs.
6593SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
6594 MVT VT = Op.getSimpleValueType();
6595
6596 if (VT.isVector())
6597 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::ABS_MERGE_PASSTHRU);
6598
6599 SDLoc DL(Op);
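  // For scalar integers this expands to, e.g. for i32:
  //   %neg   = sub 0, %x
  //   %flags = subs %x, 0        ; sets NZCV
  //   %abs   = csel %x, %neg, pl ; pick %x when non-negative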
6600 SDValue Neg = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT),
6601 N2: Op.getOperand(i: 0));
6602 // Generate SUBS & CSEL.
6603 SDValue Cmp =
6604 DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT::i32),
6605 N1: Op.getOperand(i: 0), N2: DAG.getConstant(Val: 0, DL, VT));
6606 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: Op.getOperand(i: 0), N2: Neg,
6607 N3: DAG.getConstant(Val: AArch64CC::PL, DL, VT: MVT::i32),
6608 N4: Cmp.getValue(R: 1));
6609}
6610
6611static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
6612 SDValue Chain = Op.getOperand(i: 0);
6613 SDValue Cond = Op.getOperand(i: 1);
6614 SDValue Dest = Op.getOperand(i: 2);
6615
6616 AArch64CC::CondCode CC;
6617 if (SDValue Cmp = emitConjunction(DAG, Val: Cond, OutCC&: CC)) {
6618 SDLoc dl(Op);
6619 SDValue CCVal = DAG.getConstant(Val: CC, DL: dl, VT: MVT::i32);
6620 return DAG.getNode(Opcode: AArch64ISD::BRCOND, DL: dl, VT: MVT::Other, N1: Chain, N2: Dest, N3: CCVal,
6621 N4: Cmp);
6622 }
6623
6624 return SDValue();
6625}
6626
// Treat FSHR with a constant shift amount as a legal operation; otherwise it
// is expanded. FSHL is converted to FSHR before deciding what to do with it.
6629static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
6630 SDValue Shifts = Op.getOperand(i: 2);
  // Check if the shift amount is a constant.
  // If the opcode is FSHL, convert it to FSHR.
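  // e.g. for i32: fshl(a, b, 8) == fshr(a, b, 32 - 8) == fshr(a, b, 24).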
6633 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Val&: Shifts)) {
6634 SDLoc DL(Op);
6635 MVT VT = Op.getSimpleValueType();
6636
6637 if (Op.getOpcode() == ISD::FSHL) {
6638 unsigned int NewShiftNo =
6639 VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
6640 return DAG.getNode(
6641 Opcode: ISD::FSHR, DL, VT, N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1),
6642 N3: DAG.getConstant(Val: NewShiftNo, DL, VT: Shifts.getValueType()));
6643 } else if (Op.getOpcode() == ISD::FSHR) {
6644 return Op;
6645 }
6646 }
6647
6648 return SDValue();
6649}
6650
6651static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
6652 SDValue X = Op.getOperand(i: 0);
6653 EVT XScalarTy = X.getValueType();
6654 SDValue Exp = Op.getOperand(i: 1);
6655
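  // Lower ldexp via SVE's FSCALE: place the operands in lane 0 of SVE
  // vectors, apply the aarch64_sve_fscale intrinsic under an all-true
  // predicate, and extract lane 0 of the result. f16/bf16 inputs are
  // extended to f32 first.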
6656 SDLoc DL(Op);
6657 EVT XVT, ExpVT;
6658 switch (Op.getSimpleValueType().SimpleTy) {
6659 default:
6660 return SDValue();
6661 case MVT::bf16:
6662 case MVT::f16:
6663 X = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: X);
6664 [[fallthrough]];
6665 case MVT::f32:
6666 XVT = MVT::nxv4f32;
6667 ExpVT = MVT::nxv4i32;
6668 break;
6669 case MVT::f64:
6670 XVT = MVT::nxv2f64;
6671 ExpVT = MVT::nxv2i64;
6672 Exp = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: MVT::i64, Operand: Exp);
6673 break;
6674 }
6675
6676 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
6677 SDValue VX =
6678 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: XVT, N1: DAG.getUNDEF(VT: XVT), N2: X, N3: Zero);
6679 SDValue VExp = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ExpVT,
6680 N1: DAG.getUNDEF(VT: ExpVT), N2: Exp, N3: Zero);
6681 SDValue VPg = getPTrue(DAG, DL, VT: XVT.changeVectorElementType(EltVT: MVT::i1),
6682 Pattern: AArch64SVEPredPattern::all);
6683 SDValue FScale =
6684 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: XVT,
6685 N1: DAG.getConstant(Val: Intrinsic::aarch64_sve_fscale, DL, VT: MVT::i64),
6686 N2: VPg, N3: VX, N4: VExp);
6687 SDValue Final =
6688 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: X.getValueType(), N1: FScale, N2: Zero);
6689 if (X.getValueType() != XScalarTy)
6690 Final = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: XScalarTy, N1: Final,
6691 N2: DAG.getIntPtrConstant(Val: 1, DL: SDLoc(Op)));
6692 return Final;
6693}
6694
6695SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
6696 SelectionDAG &DAG) const {
6697 // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
6698 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
6699 report_fatal_error(
6700 reason: "ADJUST_TRAMPOLINE operation is only supported on Linux.");
6701
6702 return Op.getOperand(i: 0);
6703}
6704
6705SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
6706 SelectionDAG &DAG) const {
6707
6708 // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
6709 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
6710 report_fatal_error(reason: "INIT_TRAMPOLINE operation is only supported on Linux.");
6711
6712 SDValue Chain = Op.getOperand(i: 0);
6713 SDValue Trmp = Op.getOperand(i: 1); // trampoline
6714 SDValue FPtr = Op.getOperand(i: 2); // nested function
6715 SDValue Nest = Op.getOperand(i: 3); // 'nest' parameter value
6716 SDLoc dl(Op);
6717
6718 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
6719 Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(C&: *DAG.getContext());
6720
6721 TargetLowering::ArgListTy Args;
6722 TargetLowering::ArgListEntry Entry;
6723
6724 Entry.Ty = IntPtrTy;
6725 Entry.Node = Trmp;
6726 Args.push_back(x: Entry);
6727 Entry.Node = DAG.getConstant(Val: 20, DL: dl, VT: MVT::i64);
6728 Args.push_back(x: Entry);
6729
6730 Entry.Node = FPtr;
6731 Args.push_back(x: Entry);
6732 Entry.Node = Nest;
6733 Args.push_back(x: Entry);
6734
6735 // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
6736 TargetLowering::CallLoweringInfo CLI(DAG);
6737 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
6738 CC: CallingConv::C, ResultType: Type::getVoidTy(C&: *DAG.getContext()),
6739 Target: DAG.getExternalSymbol(Sym: "__trampoline_setup", VT: PtrVT), ArgsList: std::move(Args));
6740
6741 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
6742 return CallResult.second;
6743}
6744
6745SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
6746 SelectionDAG &DAG) const {
6747 LLVM_DEBUG(dbgs() << "Custom lowering: ");
6748 LLVM_DEBUG(Op.dump());
6749
6750 switch (Op.getOpcode()) {
6751 default:
6752 llvm_unreachable("unimplemented operand");
6753 return SDValue();
6754 case ISD::BITCAST:
6755 return LowerBITCAST(Op, DAG);
6756 case ISD::GlobalAddress:
6757 return LowerGlobalAddress(Op, DAG);
6758 case ISD::GlobalTLSAddress:
6759 return LowerGlobalTLSAddress(Op, DAG);
6760 case ISD::PtrAuthGlobalAddress:
6761 return LowerPtrAuthGlobalAddress(Op, DAG);
6762 case ISD::ADJUST_TRAMPOLINE:
6763 return LowerADJUST_TRAMPOLINE(Op, DAG);
6764 case ISD::INIT_TRAMPOLINE:
6765 return LowerINIT_TRAMPOLINE(Op, DAG);
6766 case ISD::SETCC:
6767 case ISD::STRICT_FSETCC:
6768 case ISD::STRICT_FSETCCS:
6769 return LowerSETCC(Op, DAG);
6770 case ISD::SETCCCARRY:
6771 return LowerSETCCCARRY(Op, DAG);
6772 case ISD::BRCOND:
6773 return LowerBRCOND(Op, DAG);
6774 case ISD::BR_CC:
6775 return LowerBR_CC(Op, DAG);
6776 case ISD::SELECT:
6777 return LowerSELECT(Op, DAG);
6778 case ISD::SELECT_CC:
6779 return LowerSELECT_CC(Op, DAG);
6780 case ISD::JumpTable:
6781 return LowerJumpTable(Op, DAG);
6782 case ISD::BR_JT:
6783 return LowerBR_JT(Op, DAG);
6784 case ISD::BRIND:
6785 return LowerBRIND(Op, DAG);
6786 case ISD::ConstantPool:
6787 return LowerConstantPool(Op, DAG);
6788 case ISD::BlockAddress:
6789 return LowerBlockAddress(Op, DAG);
6790 case ISD::VASTART:
6791 return LowerVASTART(Op, DAG);
6792 case ISD::VACOPY:
6793 return LowerVACOPY(Op, DAG);
6794 case ISD::VAARG:
6795 return LowerVAARG(Op, DAG);
6796 case ISD::UADDO_CARRY:
6797 return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::ADCS, IsSigned: false /*unsigned*/);
6798 case ISD::USUBO_CARRY:
6799 return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::SBCS, IsSigned: false /*unsigned*/);
6800 case ISD::SADDO_CARRY:
6801 return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::ADCS, IsSigned: true /*signed*/);
6802 case ISD::SSUBO_CARRY:
6803 return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::SBCS, IsSigned: true /*signed*/);
6804 case ISD::SADDO:
6805 case ISD::UADDO:
6806 case ISD::SSUBO:
6807 case ISD::USUBO:
6808 case ISD::SMULO:
6809 case ISD::UMULO:
6810 return LowerXALUO(Op, DAG);
6811 case ISD::FADD:
6812 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FADD_PRED);
6813 case ISD::FSUB:
6814 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FSUB_PRED);
6815 case ISD::FMUL:
6816 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMUL_PRED);
6817 case ISD::FMA:
6818 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMA_PRED);
6819 case ISD::FDIV:
6820 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FDIV_PRED);
6821 case ISD::FNEG:
6822 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FNEG_MERGE_PASSTHRU);
6823 case ISD::FCEIL:
6824 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FCEIL_MERGE_PASSTHRU);
6825 case ISD::FFLOOR:
6826 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FFLOOR_MERGE_PASSTHRU);
6827 case ISD::FNEARBYINT:
6828 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
6829 case ISD::FRINT:
6830 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FRINT_MERGE_PASSTHRU);
6831 case ISD::FROUND:
6832 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FROUND_MERGE_PASSTHRU);
6833 case ISD::FROUNDEVEN:
6834 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
6835 case ISD::FTRUNC:
6836 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FTRUNC_MERGE_PASSTHRU);
6837 case ISD::FSQRT:
6838 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FSQRT_MERGE_PASSTHRU);
6839 case ISD::FABS:
6840 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FABS_MERGE_PASSTHRU);
6841 case ISD::FP_ROUND:
6842 case ISD::STRICT_FP_ROUND:
6843 return LowerFP_ROUND(Op, DAG);
6844 case ISD::FP_EXTEND:
6845 return LowerFP_EXTEND(Op, DAG);
6846 case ISD::FRAMEADDR:
6847 return LowerFRAMEADDR(Op, DAG);
6848 case ISD::SPONENTRY:
6849 return LowerSPONENTRY(Op, DAG);
6850 case ISD::RETURNADDR:
6851 return LowerRETURNADDR(Op, DAG);
6852 case ISD::ADDROFRETURNADDR:
6853 return LowerADDROFRETURNADDR(Op, DAG);
6854 case ISD::CONCAT_VECTORS:
6855 return LowerCONCAT_VECTORS(Op, DAG);
6856 case ISD::INSERT_VECTOR_ELT:
6857 return LowerINSERT_VECTOR_ELT(Op, DAG);
6858 case ISD::EXTRACT_VECTOR_ELT:
6859 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6860 case ISD::BUILD_VECTOR:
6861 return LowerBUILD_VECTOR(Op, DAG);
6862 case ISD::ZERO_EXTEND_VECTOR_INREG:
6863 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
6864 case ISD::VECTOR_SHUFFLE:
6865 return LowerVECTOR_SHUFFLE(Op, DAG);
6866 case ISD::SPLAT_VECTOR:
6867 return LowerSPLAT_VECTOR(Op, DAG);
6868 case ISD::EXTRACT_SUBVECTOR:
6869 return LowerEXTRACT_SUBVECTOR(Op, DAG);
6870 case ISD::INSERT_SUBVECTOR:
6871 return LowerINSERT_SUBVECTOR(Op, DAG);
6872 case ISD::SDIV:
6873 case ISD::UDIV:
6874 return LowerDIV(Op, DAG);
6875 case ISD::SMIN:
6876 case ISD::UMIN:
6877 case ISD::SMAX:
6878 case ISD::UMAX:
6879 return LowerMinMax(Op, DAG);
6880 case ISD::SRA:
6881 case ISD::SRL:
6882 case ISD::SHL:
6883 return LowerVectorSRA_SRL_SHL(Op, DAG);
6884 case ISD::SHL_PARTS:
6885 case ISD::SRL_PARTS:
6886 case ISD::SRA_PARTS:
6887 return LowerShiftParts(Op, DAG);
6888 case ISD::CTPOP:
6889 case ISD::PARITY:
6890 return LowerCTPOP_PARITY(Op, DAG);
6891 case ISD::FCOPYSIGN:
6892 return LowerFCOPYSIGN(Op, DAG);
6893 case ISD::OR:
6894 return LowerVectorOR(Op, DAG);
6895 case ISD::XOR:
6896 return LowerXOR(Op, DAG);
6897 case ISD::PREFETCH:
6898 return LowerPREFETCH(Op, DAG);
6899 case ISD::SINT_TO_FP:
6900 case ISD::UINT_TO_FP:
6901 case ISD::STRICT_SINT_TO_FP:
6902 case ISD::STRICT_UINT_TO_FP:
6903 return LowerINT_TO_FP(Op, DAG);
6904 case ISD::FP_TO_SINT:
6905 case ISD::FP_TO_UINT:
6906 case ISD::STRICT_FP_TO_SINT:
6907 case ISD::STRICT_FP_TO_UINT:
6908 return LowerFP_TO_INT(Op, DAG);
6909 case ISD::FP_TO_SINT_SAT:
6910 case ISD::FP_TO_UINT_SAT:
6911 return LowerFP_TO_INT_SAT(Op, DAG);
6912 case ISD::FSINCOS:
6913 return LowerFSINCOS(Op, DAG);
6914 case ISD::GET_ROUNDING:
6915 return LowerGET_ROUNDING(Op, DAG);
6916 case ISD::SET_ROUNDING:
6917 return LowerSET_ROUNDING(Op, DAG);
6918 case ISD::GET_FPMODE:
6919 return LowerGET_FPMODE(Op, DAG);
6920 case ISD::SET_FPMODE:
6921 return LowerSET_FPMODE(Op, DAG);
6922 case ISD::RESET_FPMODE:
6923 return LowerRESET_FPMODE(Op, DAG);
6924 case ISD::MUL:
6925 return LowerMUL(Op, DAG);
6926 case ISD::MULHS:
6927 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MULHS_PRED);
6928 case ISD::MULHU:
6929 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MULHU_PRED);
6930 case ISD::INTRINSIC_W_CHAIN:
6931 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6932 case ISD::INTRINSIC_WO_CHAIN:
6933 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6934 case ISD::INTRINSIC_VOID:
6935 return LowerINTRINSIC_VOID(Op, DAG);
6936 case ISD::ATOMIC_STORE:
6937 if (cast<MemSDNode>(Val&: Op)->getMemoryVT() == MVT::i128) {
6938 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
6939 return LowerStore128(Op, DAG);
6940 }
6941 return SDValue();
6942 case ISD::STORE:
6943 return LowerSTORE(Op, DAG);
6944 case ISD::MSTORE:
6945 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
6946 case ISD::MGATHER:
6947 return LowerMGATHER(Op, DAG);
6948 case ISD::MSCATTER:
6949 return LowerMSCATTER(Op, DAG);
6950 case ISD::VECREDUCE_SEQ_FADD:
6951 return LowerVECREDUCE_SEQ_FADD(ScalarOp: Op, DAG);
6952 case ISD::VECREDUCE_ADD:
6953 case ISD::VECREDUCE_AND:
6954 case ISD::VECREDUCE_OR:
6955 case ISD::VECREDUCE_XOR:
6956 case ISD::VECREDUCE_SMAX:
6957 case ISD::VECREDUCE_SMIN:
6958 case ISD::VECREDUCE_UMAX:
6959 case ISD::VECREDUCE_UMIN:
6960 case ISD::VECREDUCE_FADD:
6961 case ISD::VECREDUCE_FMAX:
6962 case ISD::VECREDUCE_FMIN:
6963 case ISD::VECREDUCE_FMAXIMUM:
6964 case ISD::VECREDUCE_FMINIMUM:
6965 return LowerVECREDUCE(Op, DAG);
6966 case ISD::ATOMIC_LOAD_AND:
6967 return LowerATOMIC_LOAD_AND(Op, DAG);
6968 case ISD::DYNAMIC_STACKALLOC:
6969 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6970 case ISD::VSCALE:
6971 return LowerVSCALE(Op, DAG);
6972 case ISD::ANY_EXTEND:
6973 case ISD::SIGN_EXTEND:
6974 case ISD::ZERO_EXTEND:
6975 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
6976 case ISD::SIGN_EXTEND_INREG: {
6977 // Only custom lower when ExtraVT has a legal byte based element type.
6978 EVT ExtraVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
6979 EVT ExtraEltVT = ExtraVT.getVectorElementType();
6980 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
6981 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
6982 return SDValue();
6983
6984 return LowerToPredicatedOp(Op, DAG,
6985 NewOp: AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
6986 }
6987 case ISD::TRUNCATE:
6988 return LowerTRUNCATE(Op, DAG);
6989 case ISD::MLOAD:
6990 return LowerMLOAD(Op, DAG);
6991 case ISD::LOAD:
6992 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
6993 OverrideNEON: !Subtarget->isNeonAvailable()))
6994 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
6995 return LowerLOAD(Op, DAG);
6996 case ISD::ADD:
6997 case ISD::AND:
6998 case ISD::SUB:
6999 return LowerToScalableOp(Op, DAG);
7000 case ISD::FMAXIMUM:
7001 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMAX_PRED);
7002 case ISD::FMAXNUM:
7003 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMAXNM_PRED);
7004 case ISD::FMINIMUM:
7005 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMIN_PRED);
7006 case ISD::FMINNUM:
7007 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMINNM_PRED);
7008 case ISD::VSELECT:
7009 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
7010 case ISD::ABS:
7011 return LowerABS(Op, DAG);
7012 case ISD::ABDS:
7013 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::ABDS_PRED);
7014 case ISD::ABDU:
7015 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::ABDU_PRED);
7016 case ISD::AVGFLOORS:
7017 return LowerAVG(Op, DAG, NewOp: AArch64ISD::HADDS_PRED);
7018 case ISD::AVGFLOORU:
7019 return LowerAVG(Op, DAG, NewOp: AArch64ISD::HADDU_PRED);
7020 case ISD::AVGCEILS:
7021 return LowerAVG(Op, DAG, NewOp: AArch64ISD::RHADDS_PRED);
7022 case ISD::AVGCEILU:
7023 return LowerAVG(Op, DAG, NewOp: AArch64ISD::RHADDU_PRED);
7024 case ISD::BITREVERSE:
7025 return LowerBitreverse(Op, DAG);
7026 case ISD::BSWAP:
7027 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::BSWAP_MERGE_PASSTHRU);
7028 case ISD::CTLZ:
7029 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::CTLZ_MERGE_PASSTHRU);
7030 case ISD::CTTZ:
7031 return LowerCTTZ(Op, DAG);
7032 case ISD::VECTOR_SPLICE:
7033 return LowerVECTOR_SPLICE(Op, DAG);
7034 case ISD::VECTOR_DEINTERLEAVE:
7035 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
7036 case ISD::VECTOR_INTERLEAVE:
7037 return LowerVECTOR_INTERLEAVE(Op, DAG);
7038 case ISD::LRINT:
7039 case ISD::LLRINT:
7040 if (Op.getValueType().isVector())
7041 return LowerVectorXRINT(Op, DAG);
7042 [[fallthrough]];
7043 case ISD::LROUND:
7044 case ISD::LLROUND: {
7045 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
7046 Op.getOperand(0).getValueType() == MVT::bf16) &&
7047 "Expected custom lowering of rounding operations only for f16");
7048 SDLoc DL(Op);
7049 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: Op.getOperand(i: 0));
7050 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: Op.getValueType(), Operand: Ext);
7051 }
7052 case ISD::STRICT_LROUND:
7053 case ISD::STRICT_LLROUND:
7054 case ISD::STRICT_LRINT:
7055 case ISD::STRICT_LLRINT: {
7056 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
7057 Op.getOperand(1).getValueType() == MVT::bf16) &&
7058 "Expected custom lowering of rounding operations only for f16");
7059 SDLoc DL(Op);
7060 SDValue Ext = DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL, ResultTys: {MVT::f32, MVT::Other},
7061 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1)});
7062 return DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {Op.getValueType(), MVT::Other},
7063 Ops: {Ext.getValue(R: 1), Ext.getValue(R: 0)});
7064 }
7065 case ISD::WRITE_REGISTER: {
7066 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
7067 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
7068 SDLoc DL(Op);
7069
7070 SDValue Chain = Op.getOperand(i: 0);
7071 SDValue SysRegName = Op.getOperand(i: 1);
7072 std::pair<SDValue, SDValue> Pair =
7073 DAG.SplitScalar(N: Op.getOperand(i: 2), DL, LoVT: MVT::i64, HiVT: MVT::i64);
7074
7075 // chain = MSRR(chain, sysregname, lo, hi)
7076 SDValue Result = DAG.getNode(Opcode: AArch64ISD::MSRR, DL, VT: MVT::Other, N1: Chain,
7077 N2: SysRegName, N3: Pair.first, N4: Pair.second);
7078
7079 return Result;
7080 }
7081 case ISD::FSHL:
7082 case ISD::FSHR:
7083 return LowerFunnelShift(Op, DAG);
7084 case ISD::FLDEXP:
7085 return LowerFLDEXP(Op, DAG);
7086 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
7087 return LowerVECTOR_HISTOGRAM(Op, DAG);
7088 }
7089}
7090
7091bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
7092 return !Subtarget->useSVEForFixedLengthVectors();
7093}
7094
7095bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
7096 EVT VT, bool OverrideNEON) const {
7097 if (!VT.isFixedLengthVector() || !VT.isSimple())
7098 return false;
7099
7100 // Don't use SVE for vectors we cannot scalarize if required.
7101 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
7102 // Fixed length predicates should be promoted to i8.
  // NOTE: This is consistent with how NEON (and thus 64/128-bit vectors) work.
7104 case MVT::i1:
7105 default:
7106 return false;
7107 case MVT::i8:
7108 case MVT::i16:
7109 case MVT::i32:
7110 case MVT::i64:
7111 case MVT::f16:
7112 case MVT::f32:
7113 case MVT::f64:
7114 break;
7115 }
7116
7117 // NEON-sized vectors can be emulated using SVE instructions.
7118 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
7119 return Subtarget->isSVEorStreamingSVEAvailable();
7120
7121 // Ensure NEON MVTs only belong to a single register class.
7122 if (VT.getFixedSizeInBits() <= 128)
7123 return false;
7124
7125 // Ensure wider than NEON code generation is enabled.
7126 if (!Subtarget->useSVEForFixedLengthVectors())
7127 return false;
7128
7129 // Don't use SVE for types that don't fit.
7130 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
7131 return false;
7132
7133 // TODO: Perhaps an artificial restriction, but worth having whilst getting
7134 // the base fixed length SVE support in place.
7135 if (!VT.isPow2VectorType())
7136 return false;
7137
7138 return true;
7139}
7140
7141//===----------------------------------------------------------------------===//
7142// Calling Convention Implementation
7143//===----------------------------------------------------------------------===//
7144
7145static unsigned getIntrinsicID(const SDNode *N) {
7146 unsigned Opcode = N->getOpcode();
7147 switch (Opcode) {
7148 default:
7149 return Intrinsic::not_intrinsic;
7150 case ISD::INTRINSIC_WO_CHAIN: {
7151 unsigned IID = N->getConstantOperandVal(Num: 0);
7152 if (IID < Intrinsic::num_intrinsics)
7153 return IID;
7154 return Intrinsic::not_intrinsic;
7155 }
7156 }
7157}
7158
7159bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
7160 SDValue N1) const {
7161 if (!N0.hasOneUse())
7162 return false;
7163
7164 unsigned IID = getIntrinsicID(N: N1.getNode());
7165 // Avoid reassociating expressions that can be lowered to smlal/umlal.
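  // e.g. add(x, umull(a, b)) can become a single UMLAL; reassociating the
  // outer add would break that fusion.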
7166 if (IID == Intrinsic::aarch64_neon_umull ||
7167 N1.getOpcode() == AArch64ISD::UMULL ||
7168 IID == Intrinsic::aarch64_neon_smull ||
7169 N1.getOpcode() == AArch64ISD::SMULL)
7170 return N0.getOpcode() != ISD::ADD;
7171
7172 return true;
7173}
7174
7175/// Selects the correct CCAssignFn for a given CallingConvention value.
7176CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
7177 bool IsVarArg) const {
7178 switch (CC) {
7179 default:
7180 report_fatal_error(reason: "Unsupported calling convention.");
7181 case CallingConv::GHC:
7182 return CC_AArch64_GHC;
7183 case CallingConv::PreserveNone:
7184 // The VarArg implementation makes assumptions about register
7185 // argument passing that do not hold for preserve_none, so we
7186 // instead fall back to C argument passing.
7187 // The non-vararg case is handled in the CC function itself.
7188 if (!IsVarArg)
7189 return CC_AArch64_Preserve_None;
7190 [[fallthrough]];
7191 case CallingConv::C:
7192 case CallingConv::Fast:
7193 case CallingConv::PreserveMost:
7194 case CallingConv::PreserveAll:
7195 case CallingConv::CXX_FAST_TLS:
7196 case CallingConv::Swift:
7197 case CallingConv::SwiftTail:
7198 case CallingConv::Tail:
7199 case CallingConv::GRAAL:
7200 if (Subtarget->isTargetWindows()) {
7201 if (IsVarArg) {
7202 if (Subtarget->isWindowsArm64EC())
7203 return CC_AArch64_Arm64EC_VarArg;
7204 return CC_AArch64_Win64_VarArg;
7205 }
7206 return CC_AArch64_Win64PCS;
7207 }
7208 if (!Subtarget->isTargetDarwin())
7209 return CC_AArch64_AAPCS;
7210 if (!IsVarArg)
7211 return CC_AArch64_DarwinPCS;
7212 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
7213 : CC_AArch64_DarwinPCS_VarArg;
7214 case CallingConv::Win64:
7215 if (IsVarArg) {
7216 if (Subtarget->isWindowsArm64EC())
7217 return CC_AArch64_Arm64EC_VarArg;
7218 return CC_AArch64_Win64_VarArg;
7219 }
7220 return CC_AArch64_Win64PCS;
7221 case CallingConv::CFGuard_Check:
7222 if (Subtarget->isWindowsArm64EC())
7223 return CC_AArch64_Arm64EC_CFGuard_Check;
7224 return CC_AArch64_Win64_CFGuard_Check;
7225 case CallingConv::AArch64_VectorCall:
7226 case CallingConv::AArch64_SVE_VectorCall:
7227 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
7228 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
7229 return CC_AArch64_AAPCS;
7230 case CallingConv::ARM64EC_Thunk_X64:
7231 return CC_AArch64_Arm64EC_Thunk;
7232 case CallingConv::ARM64EC_Thunk_Native:
7233 return CC_AArch64_Arm64EC_Thunk_Native;
7234 }
7235}
7236
7237CCAssignFn *
7238AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
7239 switch (CC) {
7240 default:
7241 return RetCC_AArch64_AAPCS;
7242 case CallingConv::ARM64EC_Thunk_X64:
7243 return RetCC_AArch64_Arm64EC_Thunk;
7244 case CallingConv::CFGuard_Check:
7245 if (Subtarget->isWindowsArm64EC())
7246 return RetCC_AArch64_Arm64EC_CFGuard_Check;
7247 return RetCC_AArch64_AAPCS;
7248 }
7249}
7250
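/// Return true for values that are expected to live in FP/SIMD registers:
/// fixed-length vectors and non-scalable floating-point scalars.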
7251static bool isPassedInFPR(EVT VT) {
7252 return VT.isFixedLengthVector() ||
7253 (VT.isFloatingPoint() && !VT.isScalableVector());
7254}
7255
7256SDValue AArch64TargetLowering::LowerFormalArguments(
7257 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
7258 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
7259 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
7260 MachineFunction &MF = DAG.getMachineFunction();
7261 const Function &F = MF.getFunction();
7262 MachineFrameInfo &MFI = MF.getFrameInfo();
7263 bool IsWin64 =
7264 Subtarget->isCallingConvWin64(CC: F.getCallingConv(), IsVarArg: F.isVarArg());
7265 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
7266 (isVarArg && Subtarget->isWindowsArm64EC());
7267 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7268
7269 SmallVector<ISD::OutputArg, 4> Outs;
7270 GetReturnInfo(CC: CallConv, ReturnType: F.getReturnType(), attr: F.getAttributes(), Outs,
7271 TLI: DAG.getTargetLoweringInfo(), DL: MF.getDataLayout());
7272 if (any_of(Range&: Outs, P: [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
7273 FuncInfo->setIsSVECC(true);
7274
7275 // Assign locations to all of the incoming arguments.
7276 SmallVector<CCValAssign, 16> ArgLocs;
7277 DenseMap<unsigned, SDValue> CopiedRegs;
7278 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
7279
7280 // At this point, Ins[].VT may already be promoted to i32. To correctly
7281 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
7282 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
7283 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
7284 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
7285 // LocVT.
7286 unsigned NumArgs = Ins.size();
7287 Function::const_arg_iterator CurOrigArg = F.arg_begin();
7288 unsigned CurArgIdx = 0;
7289 for (unsigned i = 0; i != NumArgs; ++i) {
7290 MVT ValVT = Ins[i].VT;
7291 if (Ins[i].isOrigArg()) {
7292 std::advance(i&: CurOrigArg, n: Ins[i].getOrigArgIndex() - CurArgIdx);
7293 CurArgIdx = Ins[i].getOrigArgIndex();
7294
7295 // Get type of the original argument.
7296 EVT ActualVT = getValueType(DL: DAG.getDataLayout(), Ty: CurOrigArg->getType(),
7297 /*AllowUnknown*/ true);
7298 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
7299 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7300 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7301 ValVT = MVT::i8;
7302 else if (ActualMVT == MVT::i16)
7303 ValVT = MVT::i16;
7304 }
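    // On Win64, variadic functions assign all of their formal arguments
    // using the vararg calling convention.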
7305 bool UseVarArgCC = false;
7306 if (IsWin64)
7307 UseVarArgCC = isVarArg;
7308 CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg: UseVarArgCC);
7309 bool Res =
7310 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
7311 assert(!Res && "Call operand has unhandled type");
7312 (void)Res;
7313 }
7314
7315 SMEAttrs Attrs(MF.getFunction());
7316 bool IsLocallyStreaming =
7317 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
7318 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
7319 SDValue Glue = Chain.getValue(R: 1);
7320
7321 SmallVector<SDValue, 16> ArgValues;
7322 unsigned ExtraArgLocs = 0;
7323 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
7324 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
7325
7326 if (Ins[i].Flags.isByVal()) {
7327 // Byval is used for HFAs in the PCS, but the system should work in a
7328 // non-compliant manner for larger structs.
7329 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
7330 int Size = Ins[i].Flags.getByValSize();
7331 unsigned NumRegs = (Size + 7) / 8;
7332
      // FIXME: This works on big-endian for composite byvals, which are the
      // common case. It should also work for fundamental types.
7335 unsigned FrameIdx =
7336 MFI.CreateFixedObject(Size: 8 * NumRegs, SPOffset: VA.getLocMemOffset(), IsImmutable: false);
7337 SDValue FrameIdxN = DAG.getFrameIndex(FI: FrameIdx, VT: PtrVT);
7338 InVals.push_back(Elt: FrameIdxN);
7339
7340 continue;
7341 }
7342
7343 if (Ins[i].Flags.isSwiftAsync())
7344 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
7345
7346 SDValue ArgValue;
7347 if (VA.isRegLoc()) {
7348 // Arguments stored in registers.
7349 EVT RegVT = VA.getLocVT();
7350 const TargetRegisterClass *RC;
7351
7352 if (RegVT == MVT::i32)
7353 RC = &AArch64::GPR32RegClass;
7354 else if (RegVT == MVT::i64)
7355 RC = &AArch64::GPR64RegClass;
7356 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
7357 RC = &AArch64::FPR16RegClass;
7358 else if (RegVT == MVT::f32)
7359 RC = &AArch64::FPR32RegClass;
7360 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
7361 RC = &AArch64::FPR64RegClass;
7362 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
7363 RC = &AArch64::FPR128RegClass;
7364 else if (RegVT.isScalableVector() &&
7365 RegVT.getVectorElementType() == MVT::i1) {
7366 FuncInfo->setIsSVECC(true);
7367 RC = &AArch64::PPRRegClass;
7368 } else if (RegVT == MVT::aarch64svcount) {
7369 FuncInfo->setIsSVECC(true);
7370 RC = &AArch64::PPRRegClass;
7371 } else if (RegVT.isScalableVector()) {
7372 FuncInfo->setIsSVECC(true);
7373 RC = &AArch64::ZPRRegClass;
7374 } else
7375 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
7376
7377 // Transform the arguments in physical registers into virtual ones.
7378 Register Reg = MF.addLiveIn(PReg: VA.getLocReg(), RC);
7379
7380 if (IsLocallyStreaming) {
7381 // LocallyStreamingFunctions must insert the SMSTART in the correct
7382 // position, so we use Glue to ensure no instructions can be scheduled
7383 // between the chain of:
7384 // t0: ch,glue = EntryNode
7385 // t1: res,ch,glue = CopyFromReg
7386 // ...
7387 // tn: res,ch,glue = CopyFromReg t(n-1), ..
7388 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
7389 // ^^^^^^
7390 // This will be the new Chain/Root node.
7391 ArgValue = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT: RegVT, Glue);
7392 Glue = ArgValue.getValue(R: 2);
7393 if (isPassedInFPR(VT: ArgValue.getValueType())) {
7394 ArgValue =
7395 DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL,
7396 VTList: DAG.getVTList(VT1: ArgValue.getValueType(), VT2: MVT::Glue),
7397 Ops: {ArgValue, Glue});
7398 Glue = ArgValue.getValue(R: 1);
7399 }
7400 } else
7401 ArgValue = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT: RegVT);
7402
7403 // If this is an 8, 16 or 32-bit value, it is really passed promoted
7404 // to 64 bits. Insert an assert[sz]ext to capture this, then
7405 // truncate to the right size.
7406 switch (VA.getLocInfo()) {
7407 default:
7408 llvm_unreachable("Unknown loc info!");
7409 case CCValAssign::Full:
7410 break;
7411 case CCValAssign::Indirect:
7412 assert(
7413 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
7414 "Indirect arguments should be scalable on most subtargets");
7415 break;
7416 case CCValAssign::BCvt:
7417 ArgValue = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: ArgValue);
7418 break;
7419 case CCValAssign::AExt:
7420 case CCValAssign::SExt:
7421 case CCValAssign::ZExt:
7422 break;
7423 case CCValAssign::AExtUpper:
7424 ArgValue = DAG.getNode(Opcode: ISD::SRL, DL, VT: RegVT, N1: ArgValue,
7425 N2: DAG.getConstant(Val: 32, DL, VT: RegVT));
7426 ArgValue = DAG.getZExtOrTrunc(Op: ArgValue, DL, VT: VA.getValVT());
7427 break;
7428 }
7429 } else { // VA.isRegLoc()
7430 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
7431 unsigned ArgOffset = VA.getLocMemOffset();
7432 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
7433 ? VA.getLocVT().getSizeInBits()
7434 : VA.getValVT().getSizeInBits()) / 8;
7435
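      // On big-endian targets, arguments smaller than 8 bytes are passed in
      // the high-address part of their 8-byte stack slot; adjust the offset
      // to compensate.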
7436 uint32_t BEAlign = 0;
7437 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
7438 !Ins[i].Flags.isInConsecutiveRegs())
7439 BEAlign = 8 - ArgSize;
7440
7441 SDValue FIN;
7442 MachinePointerInfo PtrInfo;
7443 if (StackViaX4) {
7444 // In both the ARM64EC varargs convention and the thunk convention,
7445 // arguments on the stack are accessed relative to x4, not sp. In
7446 // the thunk convention, there's an additional offset of 32 bytes
7447 // to account for the shadow store.
7448 unsigned ObjOffset = ArgOffset + BEAlign;
7449 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
7450 ObjOffset += 32;
7451 Register VReg = MF.addLiveIn(PReg: AArch64::X4, RC: &AArch64::GPR64RegClass);
7452 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i64);
7453 FIN = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Val,
7454 N2: DAG.getConstant(Val: ObjOffset, DL, VT: MVT::i64));
7455 PtrInfo = MachinePointerInfo::getUnknownStack(MF);
7456 } else {
7457 int FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset + BEAlign, IsImmutable: true);
7458
7459 // Create load nodes to retrieve arguments from the stack.
7460 FIN = DAG.getFrameIndex(FI, VT: getPointerTy(DL: DAG.getDataLayout()));
7461 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
7462 }
7463
      // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
7465 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
7466 MVT MemVT = VA.getValVT();
7467
7468 switch (VA.getLocInfo()) {
7469 default:
7470 break;
7471 case CCValAssign::Trunc:
7472 case CCValAssign::BCvt:
7473 MemVT = VA.getLocVT();
7474 break;
7475 case CCValAssign::Indirect:
7476 assert((VA.getValVT().isScalableVector() ||
7477 Subtarget->isWindowsArm64EC()) &&
7478 "Indirect arguments should be scalable on most subtargets");
7479 MemVT = VA.getLocVT();
7480 break;
7481 case CCValAssign::SExt:
7482 ExtType = ISD::SEXTLOAD;
7483 break;
7484 case CCValAssign::ZExt:
7485 ExtType = ISD::ZEXTLOAD;
7486 break;
7487 case CCValAssign::AExt:
7488 ExtType = ISD::EXTLOAD;
7489 break;
7490 }
7491
7492 ArgValue = DAG.getExtLoad(ExtType, dl: DL, VT: VA.getLocVT(), Chain, Ptr: FIN, PtrInfo,
7493 MemVT);
7494 }
7495
7496 if (VA.getLocInfo() == CCValAssign::Indirect) {
7497 assert((VA.getValVT().isScalableVT() ||
7498 Subtarget->isWindowsArm64EC()) &&
7499 "Indirect arguments should be scalable on most subtargets");
7500
7501 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
7502 unsigned NumParts = 1;
7503 if (Ins[i].Flags.isInConsecutiveRegs()) {
7504 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
7505 ++NumParts;
7506 }
7507
7508 MVT PartLoad = VA.getValVT();
7509 SDValue Ptr = ArgValue;
7510
7511 // Ensure we generate all loads for each tuple part, whilst updating the
7512 // pointer after each load correctly using vscale.
7513 while (NumParts > 0) {
7514 ArgValue = DAG.getLoad(VT: PartLoad, dl: DL, Chain, Ptr, PtrInfo: MachinePointerInfo());
7515 InVals.push_back(Elt: ArgValue);
7516 NumParts--;
7517 if (NumParts > 0) {
7518 SDValue BytesIncrement;
7519 if (PartLoad.isScalableVector()) {
7520 BytesIncrement = DAG.getVScale(
7521 DL, VT: Ptr.getValueType(),
7522 MulImm: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
7523 } else {
7524 BytesIncrement = DAG.getConstant(
7525 Val: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
7526 VT: Ptr.getValueType());
7527 }
7528 SDNodeFlags Flags;
7529 Flags.setNoUnsignedWrap(true);
7530 Ptr = DAG.getNode(Opcode: ISD::ADD, DL, VT: Ptr.getValueType(), N1: Ptr,
7531 N2: BytesIncrement, Flags);
7532 ExtraArgLocs++;
7533 i++;
7534 }
7535 }
7536 } else {
7537 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
7538 ArgValue = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: ArgValue.getValueType(),
7539 N1: ArgValue, N2: DAG.getValueType(MVT::i32));
7540
7541 // i1 arguments are zero-extended to i8 by the caller. Emit a
7542 // hint to reflect this.
7543 if (Ins[i].isOrigArg()) {
7544 Argument *OrigArg = F.getArg(i: Ins[i].getOrigArgIndex());
7545 if (OrigArg->getType()->isIntegerTy(Bitwidth: 1)) {
7546 if (!Ins[i].Flags.isZExt()) {
7547 ArgValue = DAG.getNode(Opcode: AArch64ISD::ASSERT_ZEXT_BOOL, DL,
7548 VT: ArgValue.getValueType(), Operand: ArgValue);
7549 }
7550 }
7551 }
7552
7553 InVals.push_back(Elt: ArgValue);
7554 }
7555 }
7556 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
7557
7558 // Insert the SMSTART if this is a locally streaming function and
7559 // make sure it is Glued to the last CopyFromReg value.
7560 if (IsLocallyStreaming) {
7561 SDValue PStateSM;
7562 if (Attrs.hasStreamingCompatibleInterface()) {
7563 PStateSM = getRuntimePStateSM(DAG, Chain, DL, VT: MVT::i64);
7564 Register Reg = MF.getRegInfo().createVirtualRegister(
7565 RegClass: getRegClassFor(VT: PStateSM.getValueType().getSimpleVT()));
7566 FuncInfo->setPStateSMReg(Reg);
7567 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg, N: PStateSM);
7568 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, InGlue: Glue,
7569 Condition: AArch64SME::IfCallerIsNonStreaming, PStateSM);
7570 } else
7571 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, InGlue: Glue,
7572 Condition: AArch64SME::Always);
7573
7574 // Ensure that the SMSTART happens after the CopyWithChain such that its
7575 // chain result is used.
7576 for (unsigned I=0; I<InVals.size(); ++I) {
7577 Register Reg = MF.getRegInfo().createVirtualRegister(
7578 RegClass: getRegClassFor(VT: InVals[I].getValueType().getSimpleVT()));
7579 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg, N: InVals[I]);
7580 InVals[I] = DAG.getCopyFromReg(Chain, dl: DL, Reg,
7581 VT: InVals[I].getValueType());
7582 }
7583 }
7584
7585 // varargs
7586 if (isVarArg) {
7587 if (!Subtarget->isTargetDarwin() || IsWin64) {
7588 // The AAPCS variadic function ABI is identical to the non-variadic
7589 // one. As a result there may be more arguments in registers and we should
7590 // save them for future reference.
7591 // Win64 variadic functions also pass arguments in registers, but all float
7592 // arguments are passed in integer registers.
7593 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
7594 }
7595
7596 // This will point to the next argument passed via stack.
7597 unsigned VarArgsOffset = CCInfo.getStackSize();
7598 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
7599 VarArgsOffset = alignTo(Value: VarArgsOffset, Align: Subtarget->isTargetILP32() ? 4 : 8);
7600 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
7601 FuncInfo->setVarArgsStackIndex(
7602 MFI.CreateFixedObject(Size: 4, SPOffset: VarArgsOffset, IsImmutable: true));
7603
7604 if (MFI.hasMustTailInVarArgFunc()) {
7605 SmallVector<MVT, 2> RegParmTypes;
7606 RegParmTypes.push_back(Elt: MVT::i64);
7607 RegParmTypes.push_back(Elt: MVT::f128);
7608 // Compute the set of forwarded registers. The rest are scratch.
7609 SmallVectorImpl<ForwardedRegister> &Forwards =
7610 FuncInfo->getForwardedMustTailRegParms();
7611 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
7612 Fn: CC_AArch64_AAPCS);
7613
7614 // Conservatively forward X8, since it might be used for aggregate return.
7615 if (!CCInfo.isAllocated(Reg: AArch64::X8)) {
7616 Register X8VReg = MF.addLiveIn(PReg: AArch64::X8, RC: &AArch64::GPR64RegClass);
7617 Forwards.push_back(Elt: ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
7618 }
7619 }
7620 }
7621
7622 // On Windows, InReg pointers must be returned, so record the pointer in a
7623 // virtual register at the start of the function so it can be returned in the
7624 // epilogue.
7625 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
7626 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
7627 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
7628 Ins[I].Flags.isInReg()) &&
7629 Ins[I].Flags.isSRet()) {
7630 assert(!FuncInfo->getSRetReturnReg());
7631
7632 MVT PtrTy = getPointerTy(DL: DAG.getDataLayout());
7633 Register Reg =
7634 MF.getRegInfo().createVirtualRegister(RegClass: getRegClassFor(VT: PtrTy));
7635 FuncInfo->setSRetReturnReg(Reg);
7636
7637 SDValue Copy = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: DL, Reg, N: InVals[I]);
7638 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, N1: Copy, N2: Chain);
7639 break;
7640 }
7641 }
7642 }
7643
7644 unsigned StackArgSize = CCInfo.getStackSize();
7645 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7646 if (DoesCalleeRestoreStack(CallCC: CallConv, TailCallOpt)) {
7647 // This is a non-standard ABI so by fiat I say we're allowed to make full
7648 // use of the stack area to be popped, which must be aligned to 16 bytes in
7649 // any case:
7650 StackArgSize = alignTo(Value: StackArgSize, Align: 16);
7651
7652 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
7653 // a multiple of 16.
7654 FuncInfo->setArgumentStackToRestore(StackArgSize);
7655
7656 // This realignment carries over to the available bytes below. Our own
7657 // callers will guarantee the space is free by giving an aligned value to
7658 // CALLSEQ_START.
7659 }
7660 // Even if we're not expected to free up the space, it's useful to know how
7661 // much is there while considering tail calls (because we can reuse it).
7662 FuncInfo->setBytesInStackArgArea(StackArgSize);
7663
7664 if (Subtarget->hasCustomCallingConv())
7665 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
7666
  // Create a 16-byte TPIDR2 object. The dynamic buffer will be expanded and
  // stored in the static object later using a pseudonode.
7669 if (SMEAttrs(MF.getFunction()).hasZAState()) {
7670 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
7671 TPIDR2.FrameIndex = MFI.CreateStackObject(Size: 16, Alignment: Align(16), isSpillSlot: false);
7672 SDValue SVL = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL, VT: MVT::i64,
7673 Operand: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
7674
7675 SDValue Buffer;
7676 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
7677 Buffer = DAG.getNode(Opcode: AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
7678 VTList: DAG.getVTList(VT1: MVT::i64, VT2: MVT::Other), Ops: {Chain, SVL});
7679 } else {
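      // The ZA lazy-save buffer is SVL x SVL bytes. On Windows, or when
      // inline stack probes are required, it is allocated with a dynamic
      // stack allocation.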
7680 SDValue Size = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i64, N1: SVL, N2: SVL);
7681 Buffer = DAG.getNode(Opcode: ISD::DYNAMIC_STACKALLOC, DL,
7682 VTList: DAG.getVTList(VT1: MVT::i64, VT2: MVT::Other),
7683 Ops: {Chain, Size, DAG.getConstant(Val: 1, DL, VT: MVT::i64)});
7684 MFI.CreateVariableSizedObject(Alignment: Align(16), Alloca: nullptr);
7685 }
7686 Chain = DAG.getNode(
7687 Opcode: AArch64ISD::INIT_TPIDR2OBJ, DL, VTList: DAG.getVTList(VT: MVT::Other),
7688 Ops: {/*Chain*/ Buffer.getValue(R: 1), /*Buffer ptr*/ Buffer.getValue(R: 0)});
7689 }
7690
7691 if (CallConv == CallingConv::PreserveNone) {
7692 for (const ISD::InputArg &I : Ins) {
7693 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
7694 I.Flags.isSwiftAsync()) {
7695 MachineFunction &MF = DAG.getMachineFunction();
7696 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
7697 MF.getFunction(),
7698 "Swift attributes can't be used with preserve_none",
7699 DL.getDebugLoc()));
7700 break;
7701 }
7702 }
7703 }
7704
7705 return Chain;
7706}
7707
7708void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
7709 SelectionDAG &DAG,
7710 const SDLoc &DL,
7711 SDValue &Chain) const {
7712 MachineFunction &MF = DAG.getMachineFunction();
7713 MachineFrameInfo &MFI = MF.getFrameInfo();
7714 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7715 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
7716 Function &F = MF.getFunction();
7717 bool IsWin64 =
7718 Subtarget->isCallingConvWin64(CC: F.getCallingConv(), IsVarArg: F.isVarArg());
7719
7720 SmallVector<SDValue, 8> MemOps;
7721
7722 auto GPRArgRegs = AArch64::getGPRArgRegs();
7723 unsigned NumGPRArgRegs = GPRArgRegs.size();
7724 if (Subtarget->isWindowsArm64EC()) {
7725 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
7726 // functions.
7727 NumGPRArgRegs = 4;
7728 }
7729 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(Regs: GPRArgRegs);
7730
7731 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
7732 int GPRIdx = 0;
7733 if (GPRSaveSize != 0) {
7734 if (IsWin64) {
7735 GPRIdx = MFI.CreateFixedObject(Size: GPRSaveSize, SPOffset: -(int)GPRSaveSize, IsImmutable: false);
7736 if (GPRSaveSize & 15)
7737 // The extra size here, if triggered, will always be 8.
        MFI.CreateFixedObject(Size: 16 - (GPRSaveSize & 15),
                              SPOffset: -(int)alignTo(Value: GPRSaveSize, Align: 16),
                              IsImmutable: false);
7739 } else
7740 GPRIdx = MFI.CreateStackObject(Size: GPRSaveSize, Alignment: Align(8), isSpillSlot: false);
7741
7742 SDValue FIN;
7743 if (Subtarget->isWindowsArm64EC()) {
7744 // With the Arm64EC ABI, we reserve the save area as usual, but we
7745 // compute its address relative to x4. For a normal AArch64->AArch64
7746 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
7747 // different address.
7748 Register VReg = MF.addLiveIn(PReg: AArch64::X4, RC: &AArch64::GPR64RegClass);
7749 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i64);
7750 FIN = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i64, N1: Val,
7751 N2: DAG.getConstant(Val: GPRSaveSize, DL, VT: MVT::i64));
7752 } else {
7753 FIN = DAG.getFrameIndex(FI: GPRIdx, VT: PtrVT);
7754 }
7755
7756 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
7757 Register VReg = MF.addLiveIn(PReg: GPRArgRegs[i], RC: &AArch64::GPR64RegClass);
7758 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i64);
7759 SDValue Store =
7760 DAG.getStore(Chain: Val.getValue(R: 1), dl: DL, Val, Ptr: FIN,
7761 PtrInfo: IsWin64 ? MachinePointerInfo::getFixedStack(
7762 MF, FI: GPRIdx, Offset: (i - FirstVariadicGPR) * 8)
7763 : MachinePointerInfo::getStack(MF, Offset: i * 8));
7764 MemOps.push_back(Elt: Store);
7765 FIN =
7766 DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: FIN, N2: DAG.getConstant(Val: 8, DL, VT: PtrVT));
7767 }
7768 }
7769 FuncInfo->setVarArgsGPRIndex(GPRIdx);
7770 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
7771
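  // The Win64 varargs convention passes floating-point arguments in GPRs, so
  // an FPR save area is only needed for the AAPCS case.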
7772 if (Subtarget->hasFPARMv8() && !IsWin64) {
7773 auto FPRArgRegs = AArch64::getFPRArgRegs();
7774 const unsigned NumFPRArgRegs = FPRArgRegs.size();
7775 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(Regs: FPRArgRegs);
7776
7777 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
7778 int FPRIdx = 0;
7779 if (FPRSaveSize != 0) {
7780 FPRIdx = MFI.CreateStackObject(Size: FPRSaveSize, Alignment: Align(16), isSpillSlot: false);
7781
7782 SDValue FIN = DAG.getFrameIndex(FI: FPRIdx, VT: PtrVT);
7783
7784 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
7785 Register VReg = MF.addLiveIn(PReg: FPRArgRegs[i], RC: &AArch64::FPR128RegClass);
7786 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::f128);
7787
7788 SDValue Store = DAG.getStore(Chain: Val.getValue(R: 1), dl: DL, Val, Ptr: FIN,
7789 PtrInfo: MachinePointerInfo::getStack(MF, Offset: i * 16));
7790 MemOps.push_back(Elt: Store);
7791 FIN = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: FIN,
7792 N2: DAG.getConstant(Val: 16, DL, VT: PtrVT));
7793 }
7794 }
7795 FuncInfo->setVarArgsFPRIndex(FPRIdx);
7796 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
7797 }
7798
7799 if (!MemOps.empty()) {
7800 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOps);
7801 }
7802}
7803
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of their physical registers.
7806SDValue AArch64TargetLowering::LowerCallResult(
7807 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
7808 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
7809 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
7810 SDValue ThisVal, bool RequiresSMChange) const {
7811 DenseMap<unsigned, SDValue> CopiedRegs;
7812 // Copy all of the result registers out of their specified physreg.
7813 for (unsigned i = 0; i != RVLocs.size(); ++i) {
7814 CCValAssign VA = RVLocs[i];
7815
    // Pass the 'this' value directly from the argument to the return value, to
    // avoid register unit interference.
7818 if (i == 0 && isThisReturn) {
7819 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
7820 "unexpected return calling convention register assignment");
7821 InVals.push_back(Elt: ThisVal);
7822 continue;
7823 }
7824
7825 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
7826 // allows one use of a physreg per block.
7827 SDValue Val = CopiedRegs.lookup(Val: VA.getLocReg());
7828 if (!Val) {
7829 Val =
7830 DAG.getCopyFromReg(Chain, dl: DL, Reg: VA.getLocReg(), VT: VA.getLocVT(), Glue: InGlue);
7831 Chain = Val.getValue(R: 1);
7832 InGlue = Val.getValue(R: 2);
7833 CopiedRegs[VA.getLocReg()] = Val;
7834 }
7835
7836 switch (VA.getLocInfo()) {
7837 default:
7838 llvm_unreachable("Unknown loc info!");
7839 case CCValAssign::Full:
7840 break;
7841 case CCValAssign::BCvt:
7842 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: Val);
7843 break;
7844 case CCValAssign::AExtUpper:
7845 Val = DAG.getNode(Opcode: ISD::SRL, DL, VT: VA.getLocVT(), N1: Val,
7846 N2: DAG.getConstant(Val: 32, DL, VT: VA.getLocVT()));
7847 [[fallthrough]];
7848 case CCValAssign::AExt:
7849 [[fallthrough]];
7850 case CCValAssign::ZExt:
7851 Val = DAG.getZExtOrTrunc(Op: Val, DL, VT: VA.getValVT());
7852 break;
7853 }
7854
7855 if (RequiresSMChange && isPassedInFPR(VT: VA.getValVT()))
7856 Val = DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL, VT: Val.getValueType(),
7857 Operand: Val);
7858
7859 InVals.push_back(Elt: Val);
7860 }
7861
7862 return Chain;
7863}
7864
7865/// Return true if the calling convention is one that we can guarantee TCO for.
7866static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
7867 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
7868 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
7869}
7870
7871/// Return true if we might ever do TCO for calls with this calling convention.
7872static bool mayTailCallThisCC(CallingConv::ID CC) {
7873 switch (CC) {
7874 case CallingConv::C:
7875 case CallingConv::AArch64_SVE_VectorCall:
7876 case CallingConv::PreserveMost:
7877 case CallingConv::PreserveAll:
7878 case CallingConv::PreserveNone:
7879 case CallingConv::Swift:
7880 case CallingConv::SwiftTail:
7881 case CallingConv::Tail:
7882 case CallingConv::Fast:
7883 return true;
7884 default:
7885 return false;
7886 }
7887}
7888
/// Return true if the calling convention supports varargs.
/// Currently only conventions that pass varargs the same way the C calling
/// convention does are eligible.
/// Calling conventions listed in this function must also be properly handled
/// in AArch64Subtarget::isCallingConvWin64.
7894static bool callConvSupportsVarArgs(CallingConv::ID CC) {
7895 switch (CC) {
7896 case CallingConv::C:
7897 case CallingConv::PreserveNone:
7898 return true;
7899 default:
7900 return false;
7901 }
7902}
7903
7904static void analyzeCallOperands(const AArch64TargetLowering &TLI,
7905 const AArch64Subtarget *Subtarget,
7906 const TargetLowering::CallLoweringInfo &CLI,
7907 CCState &CCInfo) {
7908 const SelectionDAG &DAG = CLI.DAG;
7909 CallingConv::ID CalleeCC = CLI.CallConv;
7910 bool IsVarArg = CLI.IsVarArg;
7911 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7912 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CC: CalleeCC, IsVarArg);
7913
7914 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
7915 // for the shadow store.
7916 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
7917 CCInfo.AllocateStack(Size: 32, Alignment: Align(16));
7918
7919 unsigned NumArgs = Outs.size();
7920 for (unsigned i = 0; i != NumArgs; ++i) {
7921 MVT ArgVT = Outs[i].VT;
7922 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
7923
7924 bool UseVarArgCC = false;
7925 if (IsVarArg) {
7926 // On Windows, the fixed arguments in a vararg call are passed in GPRs
7927 // too, so use the vararg CC to force them to integer registers.
7928 if (IsCalleeWin64) {
7929 UseVarArgCC = true;
7930 } else {
7931 UseVarArgCC = !Outs[i].IsFixed;
7932 }
7933 }
7934
7935 if (!UseVarArgCC) {
7936 // Get type of the original argument.
7937 EVT ActualVT =
7938 TLI.getValueType(DL: DAG.getDataLayout(), Ty: CLI.Args[Outs[i].OrigArgIndex].Ty,
7939 /*AllowUnknown*/ true);
7940 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
7941 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7942 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7943 ArgVT = MVT::i8;
7944 else if (ActualMVT == MVT::i16)
7945 ArgVT = MVT::i16;
7946 }
7947
7948 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC: CalleeCC, IsVarArg: UseVarArgCC);
7949 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
7950 assert(!Res && "Call operand has unhandled type");
7951 (void)Res;
7952 }
7953}
7954
7955bool AArch64TargetLowering::isEligibleForTailCallOptimization(
7956 const CallLoweringInfo &CLI) const {
7957 CallingConv::ID CalleeCC = CLI.CallConv;
7958 if (!mayTailCallThisCC(CC: CalleeCC))
7959 return false;
7960
7961 SDValue Callee = CLI.Callee;
7962 bool IsVarArg = CLI.IsVarArg;
7963 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7964 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7965 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7966 const SelectionDAG &DAG = CLI.DAG;
7967 MachineFunction &MF = DAG.getMachineFunction();
7968 const Function &CallerF = MF.getFunction();
7969 CallingConv::ID CallerCC = CallerF.getCallingConv();
7970
7971 // SME Streaming functions are not eligible for TCO as they may require
7972 // the streaming mode or ZA to be restored after returning from the call.
7973 SMEAttrs CallerAttrs(MF.getFunction());
7974 auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
7975 if (CallerAttrs.requiresSMChange(Callee: CalleeAttrs) ||
7976 CallerAttrs.requiresLazySave(Callee: CalleeAttrs) ||
7977 CallerAttrs.hasStreamingBody())
7978 return false;
7979
7980 // Functions using the C or Fast calling convention that have an SVE signature
7981 // preserve more registers and should assume the SVE_VectorCall CC.
7982 // The check for matching callee-saved regs will determine whether it is
7983 // eligible for TCO.
7984 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
7985 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
7986 CallerCC = CallingConv::AArch64_SVE_VectorCall;
7987
7988 bool CCMatch = CallerCC == CalleeCC;
7989
7990 // When using the Windows calling convention on a non-windows OS, we want
7991 // to back up and restore X18 in such functions; we can't do a tail call
7992 // from those functions.
7993 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
7994 CalleeCC != CallingConv::Win64)
7995 return false;
7996
7997 // Byval parameters hand the function a pointer directly into the stack area
7998 // we want to reuse during a tail call. Working around this *is* possible (see
7999 // X86) but less efficient and uglier in LowerCall.
8000 for (Function::const_arg_iterator i = CallerF.arg_begin(),
8001 e = CallerF.arg_end();
8002 i != e; ++i) {
8003 if (i->hasByValAttr())
8004 return false;
8005
8006 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
8007 // In this case, it is necessary to save/restore X0 in the callee. Tail
8008 // call opt interferes with this. So we disable tail call opt when the
8009 // caller has an argument with "inreg" attribute.
8010
8011 // FIXME: Check whether the callee also has an "inreg" argument.
8012 if (i->hasInRegAttr())
8013 return false;
8014 }
8015
8016 if (canGuaranteeTCO(CC: CalleeCC, GuaranteeTailCalls: getTargetMachine().Options.GuaranteedTailCallOpt))
8017 return CCMatch;
8018
8019 // Externally-defined functions with weak linkage should not be
8020 // tail-called on AArch64 when the OS does not support dynamic
8021 // pre-emption of symbols, as the AAELF spec requires normal calls
8022 // to undefined weak functions to be replaced with a NOP or jump to the
8023 // next instruction. The behaviour of branch instructions in this
8024 // situation (as used for tail calls) is implementation-defined, so we
8025 // cannot rely on the linker replacing the tail call with a return.
8026 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
8027 const GlobalValue *GV = G->getGlobal();
8028 const Triple &TT = getTargetMachine().getTargetTriple();
8029 if (GV->hasExternalWeakLinkage() &&
8030 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
8031 return false;
8032 }
8033
8034 // Now we search for cases where we can use a tail call without changing the
8035 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
8036 // concept.
8037
  // I want anyone implementing a new calling convention to think long and hard
  // about this check.
8040 if (IsVarArg && !callConvSupportsVarArgs(CC: CalleeCC))
8041 report_fatal_error(reason: "Unsupported variadic calling convention");
8042
8043 LLVMContext &C = *DAG.getContext();
8044 // Check that the call results are passed in the same way.
8045 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
8046 CalleeFn: CCAssignFnForCall(CC: CalleeCC, IsVarArg),
8047 CallerFn: CCAssignFnForCall(CC: CallerCC, IsVarArg)))
8048 return false;
8049 // The callee has to preserve all registers the caller needs to preserve.
8050 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8051 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
8052 if (!CCMatch) {
8053 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
8054 if (Subtarget->hasCustomCallingConv()) {
8055 TRI->UpdateCustomCallPreservedMask(MF, Mask: &CallerPreserved);
8056 TRI->UpdateCustomCallPreservedMask(MF, Mask: &CalleePreserved);
8057 }
8058 if (!TRI->regmaskSubsetEqual(mask0: CallerPreserved, mask1: CalleePreserved))
8059 return false;
8060 }
8061
8062 // Nothing more to check if the callee is taking no arguments
8063 if (Outs.empty())
8064 return true;
8065
8066 SmallVector<CCValAssign, 16> ArgLocs;
8067 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
8068
8069 analyzeCallOperands(TLI: *this, Subtarget, CLI, CCInfo);
8070
8071 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
    // When we are musttail, additional checks have been done and we can safely
    // ignore this check. At least two cases here: if the caller is fastcc then
    // we can't have any memory arguments (we'd be expected to clean up the
    // stack afterwards). If the caller is C then we could potentially use its
    // argument area.
8076
8077 // FIXME: for now we take the most conservative of these in both cases:
8078 // disallow all variadic memory operands.
8079 for (const CCValAssign &ArgLoc : ArgLocs)
8080 if (!ArgLoc.isRegLoc())
8081 return false;
8082 }
8083
8084 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8085
  // If any of the arguments is passed indirectly, it must be SVE, so the
  // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
  // allocate space on the stack. That is why we explicitly reject such calls
  // as tail calls here.
8090 if (llvm::any_of(Range&: ArgLocs, P: [&](CCValAssign &A) {
8091 assert((A.getLocInfo() != CCValAssign::Indirect ||
8092 A.getValVT().isScalableVector() ||
8093 Subtarget->isWindowsArm64EC()) &&
8094 "Expected value to be scalable");
8095 return A.getLocInfo() == CCValAssign::Indirect;
8096 }))
8097 return false;
8098
  // If the stack arguments for this call do not fit into our own save area then
  // the call cannot be made a tail call.
8101 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
8102 return false;
8103
8104 const MachineRegisterInfo &MRI = MF.getRegInfo();
8105 if (!parametersInCSRMatch(MRI, CallerPreservedMask: CallerPreserved, ArgLocs, OutVals))
8106 return false;
8107
8108 return true;
8109}
8110
8111SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
8112 SelectionDAG &DAG,
8113 MachineFrameInfo &MFI,
8114 int ClobberedFI) const {
8115 SmallVector<SDValue, 8> ArgChains;
8116 int64_t FirstByte = MFI.getObjectOffset(ObjectIdx: ClobberedFI);
8117 int64_t LastByte = FirstByte + MFI.getObjectSize(ObjectIdx: ClobberedFI) - 1;
8118
8119 // Include the original chain at the beginning of the list. When this is
8120 // used by target LowerCall hooks, this helps legalize find the
8121 // CALLSEQ_BEGIN node.
8122 ArgChains.push_back(Elt: Chain);
8123
  // Add a chain value for each stack-argument load that overlaps the clobbered
  // frame index's byte range.
8125 for (SDNode *U : DAG.getEntryNode().getNode()->uses())
8126 if (LoadSDNode *L = dyn_cast<LoadSDNode>(Val: U))
8127 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: L->getBasePtr()))
8128 if (FI->getIndex() < 0) {
8129 int64_t InFirstByte = MFI.getObjectOffset(ObjectIdx: FI->getIndex());
8130 int64_t InLastByte = InFirstByte;
8131 InLastByte += MFI.getObjectSize(ObjectIdx: FI->getIndex()) - 1;
8132
8133 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
8134 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
8135 ArgChains.push_back(Elt: SDValue(L, 1));
8136 }
8137
8138 // Build a tokenfactor for all the chains.
8139 return DAG.getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ArgChains);
8140}
8141
8142bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
8143 bool TailCallOpt) const {
8144 return (CallCC == CallingConv::Fast && TailCallOpt) ||
8145 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
8146}
8147
8148// Check if the value is zero-extended from i1 to i8
8149static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
8150 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
8151 if (SizeInBits < 8)
8152 return false;
8153
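  // Bits 1-7 must be known zero for the value to already be zero-extended
  // from i1 to i8.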
  APInt RequiredZero(SizeInBits, 0xFE);
  KnownBits Bits = DAG.computeKnownBits(Op: Arg, Depth: 4);
  bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
8157 return ZExtBool;
8158}
8159
8160void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
8161 SDNode *Node) const {
8162 // Live-in physreg copies that are glued to SMSTART are applied as
8163 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
8164 // register allocator to pass call args in callee saved regs, without extra
8165 // copies to avoid these fake clobbers of actually-preserved GPRs.
8166 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
8167 MI.getOpcode() == AArch64::MSRpstatePseudo) {
8168 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
8169 if (MachineOperand &MO = MI.getOperand(i: I);
8170 MO.isReg() && MO.isImplicit() && MO.isDef() &&
8171 (AArch64::GPR32RegClass.contains(Reg: MO.getReg()) ||
8172 AArch64::GPR64RegClass.contains(Reg: MO.getReg())))
8173 MI.removeOperand(OpNo: I);
8174
8175 // The SVE vector length can change when entering/leaving streaming mode.
8176 if (MI.getOperand(i: 0).getImm() == AArch64SVCR::SVCRSM ||
8177 MI.getOperand(i: 0).getImm() == AArch64SVCR::SVCRSMZA) {
8178 MI.addOperand(Op: MachineOperand::CreateReg(Reg: AArch64::VG, /*IsDef=*/isDef: false,
8179 /*IsImplicit=*/isImp: true));
8180 MI.addOperand(Op: MachineOperand::CreateReg(Reg: AArch64::VG, /*IsDef=*/isDef: true,
8181 /*IsImplicit=*/isImp: true));
8182 }
8183 }
8184
  // Add an implicit use of 'VG' for ADDXri/SUBXri. These instructions would
  // have nothing to do with VG, were it not that they are used to materialise
  // a frame address: if they contain a frame index to a scalable vector, this
  // will likely require an ADDVL instruction to materialise the address, and
  // thus a read of VG.
8190 const MachineFunction &MF = *MI.getMF();
8191 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
8192 (MI.getOpcode() == AArch64::ADDXri ||
8193 MI.getOpcode() == AArch64::SUBXri)) {
8194 const MachineOperand &MO = MI.getOperand(i: 1);
8195 if (MO.isFI() && MF.getFrameInfo().getStackID(ObjectIdx: MO.getIndex()) ==
8196 TargetStackID::ScalableVector)
8197 MI.addOperand(Op: MachineOperand::CreateReg(Reg: AArch64::VG, /*IsDef=*/isDef: false,
8198 /*IsImplicit=*/isImp: true));
8199 }
8200}
8201
8202SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
8203 bool Enable, SDValue Chain,
8204 SDValue InGlue,
8205 unsigned Condition,
8206 SDValue PStateSM) const {
8207 MachineFunction &MF = DAG.getMachineFunction();
8208 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8209 FuncInfo->setHasStreamingModeChanges(true);
8210
8211 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8212 SDValue RegMask = DAG.getRegisterMask(RegMask: TRI->getSMStartStopCallPreservedMask());
8213 SDValue MSROp =
8214 DAG.getTargetConstant(Val: (int32_t)AArch64SVCR::SVCRSM, DL, VT: MVT::i32);
8215 SDValue ConditionOp = DAG.getTargetConstant(Val: Condition, DL, VT: MVT::i64);
8216 SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp};
8217 if (Condition != AArch64SME::Always) {
8218 assert(PStateSM && "PStateSM should be defined");
8219 Ops.push_back(Elt: PStateSM);
8220 }
8221 Ops.push_back(Elt: RegMask);
8222
8223 if (InGlue)
8224 Ops.push_back(Elt: InGlue);
8225
8226 unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
8227 return DAG.getNode(Opcode, DL, VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), Ops);
8228}
8229
8230static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
8231 const SMEAttrs &CalleeAttrs) {
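  // The mode switch is unconditional unless the caller has a
  // streaming-compatible interface (and no streaming body), in which case it
  // depends on the caller's current PSTATE.SM value.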
8232 if (!CallerAttrs.hasStreamingCompatibleInterface() ||
8233 CallerAttrs.hasStreamingBody())
8234 return AArch64SME::Always;
8235 if (CalleeAttrs.hasNonStreamingInterface())
8236 return AArch64SME::IfCallerIsStreaming;
8237 if (CalleeAttrs.hasStreamingInterface())
8238 return AArch64SME::IfCallerIsNonStreaming;
8239
8240 llvm_unreachable("Unsupported attributes");
8241}
8242
8243/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
8244/// and add input and output parameter nodes.
8245SDValue
8246AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
8247 SmallVectorImpl<SDValue> &InVals) const {
8248 SelectionDAG &DAG = CLI.DAG;
8249 SDLoc &DL = CLI.DL;
8250 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8251 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8252 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
8253 SDValue Chain = CLI.Chain;
8254 SDValue Callee = CLI.Callee;
8255 bool &IsTailCall = CLI.IsTailCall;
8256 CallingConv::ID &CallConv = CLI.CallConv;
8257 bool IsVarArg = CLI.IsVarArg;
8258
8259 MachineFunction &MF = DAG.getMachineFunction();
8260 MachineFunction::CallSiteInfo CSInfo;
8261 bool IsThisReturn = false;
8262
8263 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8264 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8265 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
8266 bool IsSibCall = false;
8267 bool GuardWithBTI = false;
8268
8269 if (CLI.CB && CLI.CB->hasFnAttr(Kind: Attribute::ReturnsTwice) &&
8270 !Subtarget->noBTIAtReturnTwice()) {
8271 GuardWithBTI = FuncInfo->branchTargetEnforcement();
8272 }
8273
8274 // Analyze operands of the call, assigning locations to each operand.
8275 SmallVector<CCValAssign, 16> ArgLocs;
8276 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
8277
8278 if (IsVarArg) {
8279 unsigned NumArgs = Outs.size();
8280
8281 for (unsigned i = 0; i != NumArgs; ++i) {
8282 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
8283 report_fatal_error(reason: "Passing SVE types to variadic functions is "
8284 "currently not supported");
8285 }
8286 }
8287
8288 analyzeCallOperands(TLI: *this, Subtarget, CLI, CCInfo);
8289
8290 CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv);
8291 // Assign locations to each value returned by this call.
8292 SmallVector<CCValAssign, 16> RVLocs;
8293 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
8294 *DAG.getContext());
8295 RetCCInfo.AnalyzeCallResult(Ins, Fn: RetCC);
8296
8297 // Check callee args/returns for SVE registers and set calling convention
8298 // accordingly.
8299 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
8300 auto HasSVERegLoc = [](CCValAssign &Loc) {
8301 if (!Loc.isRegLoc())
8302 return false;
8303 return AArch64::ZPRRegClass.contains(Reg: Loc.getLocReg()) ||
8304 AArch64::PPRRegClass.contains(Reg: Loc.getLocReg());
8305 };
8306 if (any_of(Range&: RVLocs, P: HasSVERegLoc) || any_of(Range&: ArgLocs, P: HasSVERegLoc))
8307 CallConv = CallingConv::AArch64_SVE_VectorCall;
8308 }
8309
8310 if (IsTailCall) {
8311 // Check if it's really possible to do a tail call.
8312 IsTailCall = isEligibleForTailCallOptimization(CLI);
8313
8314 // A sibling call is one where we're under the usual C ABI and not planning
8315 // to change that but can still do a tail call:
8316 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
8317 CallConv != CallingConv::SwiftTail)
8318 IsSibCall = true;
8319
8320 if (IsTailCall)
8321 ++NumTailCalls;
8322 }
8323
8324 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
8325 report_fatal_error(reason: "failed to perform tail call elimination on a call "
8326 "site marked musttail");
8327
8328 // Get a count of how many bytes are to be pushed on the stack.
8329 unsigned NumBytes = CCInfo.getStackSize();
8330
8331 if (IsSibCall) {
8332 // Since we're not changing the ABI to make this a tail call, the memory
8333 // operands are already available in the caller's incoming argument space.
8334 NumBytes = 0;
8335 }
8336
8337 // FPDiff is the byte offset of the call's argument area from the callee's.
8338 // Stores to callee stack arguments will be placed in FixedStackSlots offset
8339 // by this amount for a tail call. In a sibling call it must be 0 because the
8340 // caller will deallocate the entire stack and the callee still expects its
8341 // arguments to begin at SP+0. Completely unused for non-tail calls.
8342 int FPDiff = 0;
8343
8344 if (IsTailCall && !IsSibCall) {
8345 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
8346
8347 // Since callee will pop argument stack as a tail call, we must keep the
8348 // popped size 16-byte aligned.
8349 NumBytes = alignTo(Value: NumBytes, Align: 16);
8350
8351 // FPDiff will be negative if this tail call requires more space than we
8352 // would automatically have in our incoming argument space. Positive if we
8353 // can actually shrink the stack.
8354 FPDiff = NumReusableBytes - NumBytes;
8355
8356 // Update the required reserved area if this is the tail call requiring the
8357 // most argument stack space.
8358 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
8359 FuncInfo->setTailCallReservedStack(-FPDiff);
8360
8361 // The stack pointer must be 16-byte aligned at all times it's used for a
8362 // memory operation, which in practice means at *all* times and in
8363 // particular across call boundaries. Therefore our own arguments started at
8364 // a 16-byte aligned SP and the delta applied for the tail call should
8365 // satisfy the same constraint.
8366 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
8367 }
8368
8369 // Determine whether we need any streaming mode changes.
8370 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
8371 if (CLI.CB)
8372 CalleeAttrs = SMEAttrs(*CLI.CB);
8373 else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Val&: CLI.Callee))
8374 CalleeAttrs = SMEAttrs(ES->getSymbol());
8375
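  // Helper for the SME remarks below: build a readable description of the call
  // site (caller and callee names where known).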
8376 auto DescribeCallsite =
8377 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
8378 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
8379 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Val&: CLI.Callee))
8380 R << ore::NV("Callee", ES->getSymbol());
8381 else if (CLI.CB && CLI.CB->getCalledFunction())
8382 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
8383 else
8384 R << "unknown callee";
8385 R << "'";
8386 return R;
8387 };
8388
8389 bool RequiresLazySave = CallerAttrs.requiresLazySave(Callee: CalleeAttrs);
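  // Set up a lazy save of ZA: store the number of save slices (RDSVL #1) into
  // the TPIDR2 block and point TPIDR2_EL0 at that block before the call.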
8390 if (RequiresLazySave) {
8391 const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8392 MachinePointerInfo MPI =
8393 MachinePointerInfo::getStack(MF, Offset: TPIDR2.FrameIndex);
8394 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
8395 FI: TPIDR2.FrameIndex,
8396 VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
8397 SDValue NumZaSaveSlicesAddr =
8398 DAG.getNode(Opcode: ISD::ADD, DL, VT: TPIDR2ObjAddr.getValueType(), N1: TPIDR2ObjAddr,
8399 N2: DAG.getConstant(Val: 8, DL, VT: TPIDR2ObjAddr.getValueType()));
8400 SDValue NumZaSaveSlices = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL, VT: MVT::i64,
8401 Operand: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
8402 Chain = DAG.getTruncStore(Chain, dl: DL, Val: NumZaSaveSlices, Ptr: NumZaSaveSlicesAddr,
8403 PtrInfo: MPI, SVT: MVT::i16);
8404 Chain = DAG.getNode(
8405 Opcode: ISD::INTRINSIC_VOID, DL, VT: MVT::Other, N1: Chain,
8406 N2: DAG.getConstant(Val: Intrinsic::aarch64_sme_set_tpidr2, DL, VT: MVT::i32),
8407 N3: TPIDR2ObjAddr);
8408 OptimizationRemarkEmitter ORE(&MF.getFunction());
8409 ORE.emit(RemarkBuilder: [&]() {
8410 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8411 CLI.CB)
8412 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8413 &MF.getFunction());
8414 return DescribeCallsite(R) << " sets up a lazy save for ZA";
8415 });
8416 }
8417
8418 SDValue PStateSM;
8419 bool RequiresSMChange = CallerAttrs.requiresSMChange(Callee: CalleeAttrs);
8420 if (RequiresSMChange) {
8421 if (CallerAttrs.hasStreamingInterfaceOrBody())
8422 PStateSM = DAG.getConstant(Val: 1, DL, VT: MVT::i64);
8423 else if (CallerAttrs.hasNonStreamingInterface())
8424 PStateSM = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
8425 else
8426 PStateSM = getRuntimePStateSM(DAG, Chain, DL, VT: MVT::i64);
8427 OptimizationRemarkEmitter ORE(&MF.getFunction());
8428 ORE.emit(RemarkBuilder: [&]() {
8429 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
8430 CLI.CB)
8431 : OptimizationRemarkAnalysis("sme", "SMETransition",
8432 &MF.getFunction());
8433 DescribeCallsite(R) << " requires a streaming mode transition";
8434 return R;
8435 });
8436 }
8437
8438 SDValue ZTFrameIdx;
8439 MachineFrameInfo &MFI = MF.getFrameInfo();
8440 bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(Callee: CalleeAttrs);
8441
8442 // If the caller has ZT0 state which will not be preserved by the callee,
8443 // spill ZT0 before the call.
8444 if (ShouldPreserveZT0) {
8445 unsigned ZTObj = MFI.CreateSpillStackObject(Size: 64, Alignment: Align(16));
8446 ZTFrameIdx = DAG.getFrameIndex(
8447 FI: ZTObj,
8448 VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
8449
8450 Chain = DAG.getNode(Opcode: AArch64ISD::SAVE_ZT, DL, VTList: DAG.getVTList(VT: MVT::Other),
8451 Ops: {Chain, DAG.getConstant(Val: 0, DL, VT: MVT::i32), ZTFrameIdx});
8452 }
8453
  // If the caller shares ZT0 but the callee does not share ZA, we need to stop
  // PSTATE.ZA before the call if there is no lazy-save active.
8456 bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(Callee: CalleeAttrs);
8457 assert((!DisableZA || !RequiresLazySave) &&
8458 "Lazy-save should have PSTATE.SM=1 on entry to the function");
8459
8460 if (DisableZA)
8461 Chain = DAG.getNode(
8462 Opcode: AArch64ISD::SMSTOP, DL, VT: MVT::Other, N1: Chain,
8463 N2: DAG.getTargetConstant(Val: (int32_t)(AArch64SVCR::SVCRZA), DL, VT: MVT::i32),
8464 N3: DAG.getConstant(Val: AArch64SME::Always, DL, VT: MVT::i64));
8465
8466 // Adjust the stack pointer for the new arguments...
8467 // These operations are automatically eliminated by the prolog/epilog pass
8468 if (!IsSibCall)
8469 Chain = DAG.getCALLSEQ_START(Chain, InSize: IsTailCall ? 0 : NumBytes, OutSize: 0, DL);
8470
8471 SDValue StackPtr = DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::SP,
8472 VT: getPointerTy(DL: DAG.getDataLayout()));
8473
8474 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
8475 SmallSet<unsigned, 8> RegsUsed;
8476 SmallVector<SDValue, 8> MemOpChains;
8477 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
8478
8479 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
8480 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
8481 for (const auto &F : Forwards) {
8482 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: F.VReg, VT: F.VT);
8483 RegsToPass.emplace_back(Args: F.PReg, Args&: Val);
8484 }
8485 }
8486
8487 // Walk the register/memloc assignments, inserting copies/loads.
8488 unsigned ExtraArgLocs = 0;
8489 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
8490 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8491 SDValue Arg = OutVals[i];
8492 ISD::ArgFlagsTy Flags = Outs[i].Flags;
8493
8494 // Promote the value if needed.
8495 switch (VA.getLocInfo()) {
8496 default:
8497 llvm_unreachable("Unknown loc info!");
8498 case CCValAssign::Full:
8499 break;
8500 case CCValAssign::SExt:
8501 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
8502 break;
8503 case CCValAssign::ZExt:
8504 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
8505 break;
8506 case CCValAssign::AExt:
8507 if (Outs[i].ArgVT == MVT::i1) {
8508 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
8509 //
8510 // Check if we actually have to do this, because the value may
8511 // already be zero-extended.
8512 //
8513 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
8514 // and rely on DAGCombiner to fold this, because the following
8515 // (anyext i32) is combined with (zext i8) in DAG.getNode:
8516 //
8517 // (ext (zext x)) -> (zext x)
8518 //
8519 // This will give us (zext i32), which we cannot remove, so
8520 // try to check this beforehand.
8521 if (!checkZExtBool(Arg, DAG)) {
8522 Arg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Arg);
8523 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i8, Operand: Arg);
8524 }
8525 }
8526 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
8527 break;
8528 case CCValAssign::AExtUpper:
8529 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8530 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
8531 Arg = DAG.getNode(Opcode: ISD::SHL, DL, VT: VA.getLocVT(), N1: Arg,
8532 N2: DAG.getConstant(Val: 32, DL, VT: VA.getLocVT()));
8533 break;
8534 case CCValAssign::BCvt:
8535 Arg = DAG.getBitcast(VT: VA.getLocVT(), V: Arg);
8536 break;
8537 case CCValAssign::Trunc:
8538 Arg = DAG.getZExtOrTrunc(Op: Arg, DL, VT: VA.getLocVT());
8539 break;
8540 case CCValAssign::FPExt:
8541 Arg = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
8542 break;
8543 case CCValAssign::Indirect:
8544 bool isScalable = VA.getValVT().isScalableVT();
8545 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
8546 "Indirect arguments should be scalable on most subtargets");
8547
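      // Pass the argument indirectly: store the value (or tuple of values) to
      // a stack slot and pass the address of that slot instead.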
8548 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
8549 uint64_t PartSize = StoreSize;
8550 unsigned NumParts = 1;
8551 if (Outs[i].Flags.isInConsecutiveRegs()) {
8552 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8553 ++NumParts;
8554 StoreSize *= NumParts;
8555 }
8556
8557 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(Context&: *DAG.getContext());
8558 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
8559 MachineFrameInfo &MFI = MF.getFrameInfo();
8560 int FI = MFI.CreateStackObject(Size: StoreSize, Alignment, isSpillSlot: false);
8561 if (isScalable)
8562 MFI.setStackID(ObjectIdx: FI, ID: TargetStackID::ScalableVector);
8563
8564 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
8565 SDValue Ptr = DAG.getFrameIndex(
8566 FI, VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
8567 SDValue SpillSlot = Ptr;
8568
8569 // Ensure we generate all stores for each tuple part, whilst updating the
8570 // pointer after each store correctly using vscale.
8571 while (NumParts) {
8572 SDValue Store = DAG.getStore(Chain, dl: DL, Val: OutVals[i], Ptr, PtrInfo: MPI);
8573 MemOpChains.push_back(Elt: Store);
8574
8575 NumParts--;
8576 if (NumParts > 0) {
8577 SDValue BytesIncrement;
8578 if (isScalable) {
8579 BytesIncrement = DAG.getVScale(
8580 DL, VT: Ptr.getValueType(),
8581 MulImm: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8582 } else {
8583 BytesIncrement = DAG.getConstant(
8584 Val: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8585 VT: Ptr.getValueType());
8586 }
8587 SDNodeFlags Flags;
8588 Flags.setNoUnsignedWrap(true);
8589
8590 MPI = MachinePointerInfo(MPI.getAddrSpace());
8591 Ptr = DAG.getNode(Opcode: ISD::ADD, DL, VT: Ptr.getValueType(), N1: Ptr,
8592 N2: BytesIncrement, Flags);
8593 ExtraArgLocs++;
8594 i++;
8595 }
8596 }
8597
8598 Arg = SpillSlot;
8599 break;
8600 }
8601
8602 if (VA.isRegLoc()) {
8603 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
8604 Outs[0].VT == MVT::i64) {
8605 assert(VA.getLocVT() == MVT::i64 &&
8606 "unexpected calling convention register assignment");
8607 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
8608 "unexpected use of 'returned'");
8609 IsThisReturn = true;
8610 }
8611 if (RegsUsed.count(V: VA.getLocReg())) {
8612 // If this register has already been used then we're trying to pack
8613 // parts of an [N x i32] into an X-register. The extension type will
8614 // take care of putting the two halves in the right place but we have to
8615 // combine them.
8616 SDValue &Bits =
8617 llvm::find_if(Range&: RegsToPass,
8618 P: [=](const std::pair<unsigned, SDValue> &Elt) {
8619 return Elt.first == VA.getLocReg();
8620 })
8621 ->second;
8622 Bits = DAG.getNode(Opcode: ISD::OR, DL, VT: Bits.getValueType(), N1: Bits, N2: Arg);
8623 // Call site info is used for function's parameter entry value
8624 // tracking. For now we track only simple cases when parameter
8625 // is transferred through whole register.
8626 llvm::erase_if(C&: CSInfo.ArgRegPairs,
8627 P: [&VA](MachineFunction::ArgRegPair ArgReg) {
8628 return ArgReg.Reg == VA.getLocReg();
8629 });
8630 } else {
8631 // Add an extra level of indirection for streaming mode changes by
8632 // using a pseudo copy node that cannot be rematerialised between a
8633 // smstart/smstop and the call by the simple register coalescer.
8634 if (RequiresSMChange && isPassedInFPR(VT: Arg.getValueType()))
8635 Arg = DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL,
8636 VT: Arg.getValueType(), Operand: Arg);
8637 RegsToPass.emplace_back(Args: VA.getLocReg(), Args&: Arg);
8638 RegsUsed.insert(V: VA.getLocReg());
8639 const TargetOptions &Options = DAG.getTarget().Options;
8640 if (Options.EmitCallSiteInfo)
8641 CSInfo.ArgRegPairs.emplace_back(Args: VA.getLocReg(), Args&: i);
8642 }
8643 } else {
8644 assert(VA.isMemLoc());
8645
8646 SDValue DstAddr;
8647 MachinePointerInfo DstInfo;
8648
      // FIXME: This works on big-endian for composite byvals, which are the
      // common case. It should work for fundamental types too.
8651 uint32_t BEAlign = 0;
8652 unsigned OpSize;
8653 if (VA.getLocInfo() == CCValAssign::Indirect ||
8654 VA.getLocInfo() == CCValAssign::Trunc)
8655 OpSize = VA.getLocVT().getFixedSizeInBits();
8656 else
8657 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
8658 : VA.getValVT().getSizeInBits();
8659 OpSize = (OpSize + 7) / 8;
8660 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
8661 !Flags.isInConsecutiveRegs()) {
8662 if (OpSize < 8)
8663 BEAlign = 8 - OpSize;
8664 }
8665 unsigned LocMemOffset = VA.getLocMemOffset();
8666 int32_t Offset = LocMemOffset + BEAlign;
8667 SDValue PtrOff = DAG.getIntPtrConstant(Val: Offset, DL);
8668 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: StackPtr, N2: PtrOff);
8669
8670 if (IsTailCall) {
8671 Offset = Offset + FPDiff;
8672 int FI = MF.getFrameInfo().CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true);
8673
8674 DstAddr = DAG.getFrameIndex(FI, VT: PtrVT);
8675 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
8676
8677 // Make sure any stack arguments overlapping with where we're storing
8678 // are loaded before this eventual operation. Otherwise they'll be
8679 // clobbered.
8680 Chain = addTokenForArgument(Chain, DAG, MFI&: MF.getFrameInfo(), ClobberedFI: FI);
8681 } else {
8682 SDValue PtrOff = DAG.getIntPtrConstant(Val: Offset, DL);
8683
8684 DstAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: StackPtr, N2: PtrOff);
8685 DstInfo = MachinePointerInfo::getStack(MF, Offset: LocMemOffset);
8686 }
8687
8688 if (Outs[i].Flags.isByVal()) {
8689 SDValue SizeNode =
8690 DAG.getConstant(Val: Outs[i].Flags.getByValSize(), DL, VT: MVT::i64);
8691 SDValue Cpy = DAG.getMemcpy(
8692 Chain, dl: DL, Dst: DstAddr, Src: Arg, Size: SizeNode,
8693 Alignment: Outs[i].Flags.getNonZeroByValAlign(),
8694 /*isVol = */ false, /*AlwaysInline = */ false,
8695 /*CI=*/nullptr, OverrideTailCall: std::nullopt, DstPtrInfo: DstInfo, SrcPtrInfo: MachinePointerInfo());
8696
8697 MemOpChains.push_back(Elt: Cpy);
8698 } else {
8699 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
8700 // promoted to a legal register type i32, we should truncate Arg back to
8701 // i1/i8/i16.
8702 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
8703 VA.getValVT() == MVT::i16)
8704 Arg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Arg);
8705
8706 SDValue Store = DAG.getStore(Chain, dl: DL, Val: Arg, Ptr: DstAddr, PtrInfo: DstInfo);
8707 MemOpChains.push_back(Elt: Store);
8708 }
8709 }
8710 }
8711
8712 if (IsVarArg && Subtarget->isWindowsArm64EC()) {
8713 SDValue ParamPtr = StackPtr;
8714 if (IsTailCall) {
8715 // Create a dummy object at the top of the stack that can be used to get
8716 // the SP after the epilogue
8717 int FI = MF.getFrameInfo().CreateFixedObject(Size: 1, SPOffset: FPDiff, IsImmutable: true);
8718 ParamPtr = DAG.getFrameIndex(FI, VT: PtrVT);
8719 }
8720
8721 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
8722 // describing the argument list. x4 contains the address of the
8723 // first stack parameter. x5 contains the size in bytes of all parameters
8724 // passed on the stack.
8725 RegsToPass.emplace_back(Args: AArch64::X4, Args&: ParamPtr);
8726 RegsToPass.emplace_back(Args: AArch64::X5,
8727 Args: DAG.getConstant(Val: NumBytes, DL, VT: MVT::i64));
8728 }
8729
8730 if (!MemOpChains.empty())
8731 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOpChains);
8732
8733 SDValue InGlue;
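  // If the call requires a streaming-mode change, save VG and switch mode
  // before the call; the change back (and the VG restore) is emitted after the
  // call below.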
8734 if (RequiresSMChange) {
8735
8736 Chain = DAG.getNode(Opcode: AArch64ISD::VG_SAVE, DL,
8737 VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), N: Chain);
8738 InGlue = Chain.getValue(R: 1);
8739
8740 SDValue NewChain = changeStreamingMode(
8741 DAG, DL, Enable: CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
8742 Condition: getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8743 Chain = NewChain.getValue(R: 0);
8744 InGlue = NewChain.getValue(R: 1);
8745 }
8746
8747 // Build a sequence of copy-to-reg nodes chained together with token chain
8748 // and flag operands which copy the outgoing args into the appropriate regs.
8749 for (auto &RegToPass : RegsToPass) {
8750 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RegToPass.first,
8751 N: RegToPass.second, Glue: InGlue);
8752 InGlue = Chain.getValue(R: 1);
8753 }
8754
8755 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
8756 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
8757 // node so that legalize doesn't hack it.
8758 if (auto *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
8759 auto GV = G->getGlobal();
8760 unsigned OpFlags =
8761 Subtarget->classifyGlobalFunctionReference(GV, TM: getTargetMachine());
8762 if (OpFlags & AArch64II::MO_GOT) {
8763 Callee = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: OpFlags);
8764 Callee = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: Callee);
8765 } else {
8766 const GlobalValue *GV = G->getGlobal();
8767 Callee = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: OpFlags);
8768 }
8769 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Val&: Callee)) {
8770 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
8771 Subtarget->isTargetMachO()) ||
8772 MF.getFunction().getParent()->getRtLibUseGOT();
8773 const char *Sym = S->getSymbol();
8774 if (UseGot) {
8775 Callee = DAG.getTargetExternalSymbol(Sym, VT: PtrVT, TargetFlags: AArch64II::MO_GOT);
8776 Callee = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: Callee);
8777 } else {
8778 Callee = DAG.getTargetExternalSymbol(Sym, VT: PtrVT, TargetFlags: 0);
8779 }
8780 }
8781
8782 // We don't usually want to end the call-sequence here because we would tidy
8783 // the frame up *after* the call, however in the ABI-changing tail-call case
8784 // we've carefully laid out the parameters so that when sp is reset they'll be
8785 // in the correct location.
8786 if (IsTailCall && !IsSibCall) {
8787 Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: InGlue, DL);
8788 InGlue = Chain.getValue(R: 1);
8789 }
8790
8791 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
8792
8793 std::vector<SDValue> Ops;
8794 Ops.push_back(x: Chain);
8795 Ops.push_back(x: Callee);
8796
8797 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
8798 // be expanded to the call, directly followed by a special marker sequence and
8799 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
8800 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CB: CLI.CB)) {
8801 assert(!IsTailCall &&
8802 "tail calls cannot be marked with clang.arc.attachedcall");
8803 Opc = AArch64ISD::CALL_RVMARKER;
8804
8805 // Add a target global address for the retainRV/claimRV runtime function
8806 // just before the call target.
8807 Function *ARCFn = *objcarc::getAttachedARCFunction(CB: CLI.CB);
8808 auto GA = DAG.getTargetGlobalAddress(GV: ARCFn, DL, VT: PtrVT);
8809 Ops.insert(position: Ops.begin() + 1, x: GA);
8810 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8811 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
8812 } else if (GuardWithBTI) {
8813 Opc = AArch64ISD::CALL_BTI;
8814 }
8815
8816 if (IsTailCall) {
8817 // Each tail call may have to adjust the stack by a different amount, so
8818 // this information must travel along with the operation for eventual
8819 // consumption by emitEpilogue.
8820 Ops.push_back(x: DAG.getTargetConstant(Val: FPDiff, DL, VT: MVT::i32));
8821 }
8822
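  // For pointer-authenticated calls, pass the key and the discriminator (split
  // into integer and address components) as extra operands and switch to the
  // AUTH_* call opcodes.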
8823 if (CLI.PAI) {
8824 const uint64_t Key = CLI.PAI->Key;
8825 assert((Key == AArch64PACKey::IA || Key == AArch64PACKey::IB) &&
8826 "Invalid auth call key");
8827
8828 // Split the discriminator into address/integer components.
8829 SDValue AddrDisc, IntDisc;
8830 std::tie(args&: IntDisc, args&: AddrDisc) =
8831 extractPtrauthBlendDiscriminators(Disc: CLI.PAI->Discriminator, DAG: &DAG);
8832
8833 if (Opc == AArch64ISD::CALL_RVMARKER)
8834 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
8835 else
8836 Opc = IsTailCall ? AArch64ISD::AUTH_TC_RETURN : AArch64ISD::AUTH_CALL;
8837 Ops.push_back(x: DAG.getTargetConstant(Val: Key, DL, VT: MVT::i32));
8838 Ops.push_back(x: IntDisc);
8839 Ops.push_back(x: AddrDisc);
8840 }
8841
8842 // Add argument registers to the end of the list so that they are known live
8843 // into the call.
8844 for (auto &RegToPass : RegsToPass)
8845 Ops.push_back(x: DAG.getRegister(Reg: RegToPass.first,
8846 VT: RegToPass.second.getValueType()));
8847
8848 // Add a register mask operand representing the call-preserved registers.
8849 const uint32_t *Mask;
8850 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8851 if (IsThisReturn) {
8852 // For 'this' returns, use the X0-preserving mask if applicable
8853 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
8854 if (!Mask) {
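    // As with outgoing arguments, insert a barrier so that a value returned in
    // an FP/SIMD register is not coalesced across a streaming-mode change.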
8855 IsThisReturn = false;
8856 Mask = TRI->getCallPreservedMask(MF, CallConv);
8857 }
8858 } else
8859 Mask = TRI->getCallPreservedMask(MF, CallConv);
8860
8861 if (Subtarget->hasCustomCallingConv())
8862 TRI->UpdateCustomCallPreservedMask(MF, Mask: &Mask);
8863
8864 if (TRI->isAnyArgRegReserved(MF))
8865 TRI->emitReservedArgRegCallError(MF);
8866
8867 assert(Mask && "Missing call preserved mask for calling convention");
8868 Ops.push_back(x: DAG.getRegisterMask(RegMask: Mask));
8869
8870 if (InGlue.getNode())
8871 Ops.push_back(x: InGlue);
8872
8873 SDVTList NodeTys = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
8874
  // If we're doing a tail call, use a TC_RETURN here rather than an
  // actual call instruction.
8877 if (IsTailCall) {
8878 MF.getFrameInfo().setHasTailCall();
8879 SDValue Ret = DAG.getNode(Opcode: Opc, DL, VTList: NodeTys, Ops);
8880 if (IsCFICall)
8881 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8882
8883 DAG.addNoMergeSiteInfo(Node: Ret.getNode(), NoMerge: CLI.NoMerge);
8884 DAG.addCallSiteInfo(Node: Ret.getNode(), CallInfo: std::move(CSInfo));
8885 return Ret;
8886 }
8887
8888 // Returns a chain and a flag for retval copy to use.
8889 Chain = DAG.getNode(Opcode: Opc, DL, VTList: NodeTys, Ops);
8890 if (IsCFICall)
8891 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8892
8893 DAG.addNoMergeSiteInfo(Node: Chain.getNode(), NoMerge: CLI.NoMerge);
8894 InGlue = Chain.getValue(R: 1);
8895 DAG.addCallSiteInfo(Node: Chain.getNode(), CallInfo: std::move(CSInfo));
8896
8897 uint64_t CalleePopBytes =
8898 DoesCalleeRestoreStack(CallCC: CallConv, TailCallOpt) ? alignTo(Value: NumBytes, Align: 16) : 0;
8899
8900 Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: CalleePopBytes, Glue: InGlue, DL);
8901 InGlue = Chain.getValue(R: 1);
8902
8903 // Handle result values, copying them out of physregs into vregs that we
8904 // return.
8905 SDValue Result = LowerCallResult(
8906 Chain, InGlue, CallConv, isVarArg: IsVarArg, RVLocs, DL, DAG, InVals, isThisReturn: IsThisReturn,
8907 ThisVal: IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
8908
8909 if (!Ins.empty())
8910 InGlue = Result.getValue(R: Result->getNumValues() - 1);
8911
8912 if (RequiresSMChange) {
8913 assert(PStateSM && "Expected a PStateSM to be set");
8914 Result = changeStreamingMode(
8915 DAG, DL, Enable: !CalleeAttrs.hasStreamingInterface(), Chain: Result, InGlue,
8916 Condition: getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8917 InGlue = Result.getValue(R: 1);
8918
8919 Result =
8920 DAG.getNode(Opcode: AArch64ISD::VG_RESTORE, DL,
8921 VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), Ops: {Result, InGlue});
8922 }
8923
8924 if (CallerAttrs.requiresEnablingZAAfterCall(Callee: CalleeAttrs))
8925 // Unconditionally resume ZA.
8926 Result = DAG.getNode(
8927 Opcode: AArch64ISD::SMSTART, DL, VT: MVT::Other, N1: Result,
8928 N2: DAG.getTargetConstant(Val: (int32_t)(AArch64SVCR::SVCRZA), DL, VT: MVT::i32),
8929 N3: DAG.getConstant(Val: AArch64SME::Always, DL, VT: MVT::i64));
8930
8931 if (ShouldPreserveZT0)
8932 Result =
8933 DAG.getNode(Opcode: AArch64ISD::RESTORE_ZT, DL, VTList: DAG.getVTList(VT: MVT::Other),
8934 Ops: {Result, DAG.getConstant(Val: 0, DL, VT: MVT::i32), ZTFrameIdx});
8935
8936 if (RequiresLazySave) {
8937 // Conditionally restore the lazy save using a pseudo node.
8938 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8939 SDValue RegMask = DAG.getRegisterMask(
8940 RegMask: TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
8941 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8942 Sym: "__arm_tpidr2_restore", VT: getPointerTy(DL: DAG.getDataLayout()));
8943 SDValue TPIDR2_EL0 = DAG.getNode(
8944 Opcode: ISD::INTRINSIC_W_CHAIN, DL, VT: MVT::i64, N1: Result,
8945 N2: DAG.getConstant(Val: Intrinsic::aarch64_sme_get_tpidr2, DL, VT: MVT::i32));
8946
8947 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8948 // RESTORE_ZA pseudo.
8949 SDValue Glue;
8950 SDValue TPIDR2Block = DAG.getFrameIndex(
8951 FI: TPIDR2.FrameIndex,
8952 VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
8953 Result = DAG.getCopyToReg(Chain: Result, dl: DL, Reg: AArch64::X0, N: TPIDR2Block, Glue);
8954 Result =
8955 DAG.getNode(Opcode: AArch64ISD::RESTORE_ZA, DL, VT: MVT::Other,
8956 Ops: {Result, TPIDR2_EL0, DAG.getRegister(Reg: AArch64::X0, VT: MVT::i64),
8957 RestoreRoutine, RegMask, Result.getValue(R: 1)});
8958
8959 // Finally reset the TPIDR2_EL0 register to 0.
8960 Result = DAG.getNode(
8961 Opcode: ISD::INTRINSIC_VOID, DL, VT: MVT::Other, N1: Result,
8962 N2: DAG.getConstant(Val: Intrinsic::aarch64_sme_set_tpidr2, DL, VT: MVT::i32),
8963 N3: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
8964 TPIDR2.Uses++;
8965 }
8966
8967 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
8968 for (unsigned I = 0; I < InVals.size(); ++I) {
8969 // The smstart/smstop is chained as part of the call, but when the
8970 // resulting chain is discarded (which happens when the call is not part
8971 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
8972 // smstart/smstop is chained to the result value. We can do that by doing
8973 // a vreg -> vreg copy.
8974 Register Reg = MF.getRegInfo().createVirtualRegister(
8975 RegClass: getRegClassFor(VT: InVals[I].getValueType().getSimpleVT()));
8976 SDValue X = DAG.getCopyToReg(Chain: Result, dl: DL, Reg, N: InVals[I]);
8977 InVals[I] = DAG.getCopyFromReg(Chain: X, dl: DL, Reg,
8978 VT: InVals[I].getValueType());
8979 }
8980 }
8981
8982 if (CallConv == CallingConv::PreserveNone) {
8983 for (const ISD::OutputArg &O : Outs) {
8984 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
8985 O.Flags.isSwiftAsync()) {
8986 MachineFunction &MF = DAG.getMachineFunction();
8987 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
8988 MF.getFunction(),
8989 "Swift attributes can't be used with preserve_none",
8990 DL.getDebugLoc()));
8991 break;
8992 }
8993 }
8994 }
8995
8996 return Result;
8997}
8998
8999bool AArch64TargetLowering::CanLowerReturn(
9000 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
9001 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
9002 CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv);
9003 SmallVector<CCValAssign, 16> RVLocs;
9004 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
9005 return CCInfo.CheckReturn(Outs, Fn: RetCC);
9006}
9007
9008SDValue
9009AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
9010 bool isVarArg,
9011 const SmallVectorImpl<ISD::OutputArg> &Outs,
9012 const SmallVectorImpl<SDValue> &OutVals,
9013 const SDLoc &DL, SelectionDAG &DAG) const {
9014 auto &MF = DAG.getMachineFunction();
9015 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9016
9017 CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv);
9018 SmallVector<CCValAssign, 16> RVLocs;
9019 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
9020 CCInfo.AnalyzeReturn(Outs, Fn: RetCC);
9021
9022 // Copy the result values into the output registers.
9023 SDValue Glue;
9024 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
9025 SmallSet<unsigned, 4> RegsUsed;
9026 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
9027 ++i, ++realRVLocIdx) {
9028 CCValAssign &VA = RVLocs[i];
9029 assert(VA.isRegLoc() && "Can only return in registers!");
9030 SDValue Arg = OutVals[realRVLocIdx];
9031
9032 switch (VA.getLocInfo()) {
9033 default:
9034 llvm_unreachable("Unknown loc info!");
9035 case CCValAssign::Full:
9036 if (Outs[i].ArgVT == MVT::i1) {
9037 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
9038 // value. This is strictly redundant on Darwin (which uses "zeroext
9039 // i1"), but will be optimised out before ISel.
9040 Arg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Arg);
9041 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
9042 }
9043 break;
9044 case CCValAssign::BCvt:
9045 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
9046 break;
9047 case CCValAssign::AExt:
9048 case CCValAssign::ZExt:
9049 Arg = DAG.getZExtOrTrunc(Op: Arg, DL, VT: VA.getLocVT());
9050 break;
9051 case CCValAssign::AExtUpper:
9052 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9053 Arg = DAG.getZExtOrTrunc(Op: Arg, DL, VT: VA.getLocVT());
9054 Arg = DAG.getNode(Opcode: ISD::SHL, DL, VT: VA.getLocVT(), N1: Arg,
9055 N2: DAG.getConstant(Val: 32, DL, VT: VA.getLocVT()));
9056 break;
9057 }
9058
9059 if (RegsUsed.count(V: VA.getLocReg())) {
9060 SDValue &Bits =
9061 llvm::find_if(Range&: RetVals, P: [=](const std::pair<unsigned, SDValue> &Elt) {
9062 return Elt.first == VA.getLocReg();
9063 })->second;
9064 Bits = DAG.getNode(Opcode: ISD::OR, DL, VT: Bits.getValueType(), N1: Bits, N2: Arg);
9065 } else {
9066 RetVals.emplace_back(Args: VA.getLocReg(), Args&: Arg);
9067 RegsUsed.insert(V: VA.getLocReg());
9068 }
9069 }
9070
9071 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9072
9073 // Emit SMSTOP before returning from a locally streaming function
9074 SMEAttrs FuncAttrs(MF.getFunction());
9075 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
9076 if (FuncAttrs.hasStreamingCompatibleInterface()) {
9077 Register Reg = FuncInfo->getPStateSMReg();
9078 assert(Reg.isValid() && "PStateSM Register is invalid");
9079 SDValue PStateSM = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT: MVT::i64);
9080 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
9081 /*Glue*/ InGlue: SDValue(),
9082 Condition: AArch64SME::IfCallerIsNonStreaming, PStateSM);
9083 } else
9084 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
9085 /*Glue*/ InGlue: SDValue(), Condition: AArch64SME::Always);
9086 Glue = Chain.getValue(R: 1);
9087 }
9088
9089 SmallVector<SDValue, 4> RetOps(1, Chain);
9090 for (auto &RetVal : RetVals) {
9091 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
9092 isPassedInFPR(VT: RetVal.second.getValueType()))
9093 RetVal.second = DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL,
9094 VT: RetVal.second.getValueType(), Operand: RetVal.second);
9095 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RetVal.first, N: RetVal.second, Glue);
9096 Glue = Chain.getValue(R: 1);
9097 RetOps.push_back(
9098 Elt: DAG.getRegister(Reg: RetVal.first, VT: RetVal.second.getValueType()));
9099 }
9100
9101 // Windows AArch64 ABIs require that for returning structs by value we copy
9102 // the sret argument into X0 for the return.
9103 // We saved the argument into a virtual register in the entry block,
9104 // so now we copy the value out and into X0.
9105 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
9106 SDValue Val = DAG.getCopyFromReg(Chain: RetOps[0], dl: DL, Reg: SRetReg,
9107 VT: getPointerTy(DL: MF.getDataLayout()));
9108
9109 unsigned RetValReg = AArch64::X0;
9110 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
9111 RetValReg = AArch64::X8;
9112 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RetValReg, N: Val, Glue);
9113 Glue = Chain.getValue(R: 1);
9114
9115 RetOps.push_back(
9116 Elt: DAG.getRegister(Reg: RetValReg, VT: getPointerTy(DL: DAG.getDataLayout())));
9117 }
9118
9119 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(MF: &MF);
9120 if (I) {
9121 for (; *I; ++I) {
9122 if (AArch64::GPR64RegClass.contains(Reg: *I))
9123 RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i64));
9124 else if (AArch64::FPR64RegClass.contains(Reg: *I))
9125 RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::getFloatingPointVT(BitWidth: 64)));
9126 else
9127 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
9128 }
9129 }
9130
9131 RetOps[0] = Chain; // Update chain.
9132
9133 // Add the glue if we have it.
9134 if (Glue.getNode())
9135 RetOps.push_back(Elt: Glue);
9136
9137 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9138 // ARM64EC entry thunks use a special return sequence: instead of a regular
9139 // "ret" instruction, they need to explicitly call the emulator.
9140 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
9141 SDValue Arm64ECRetDest =
9142 DAG.getExternalSymbol(Sym: "__os_arm64x_dispatch_ret", VT: PtrVT);
9143 Arm64ECRetDest =
9144 getAddr(N: cast<ExternalSymbolSDNode>(Val&: Arm64ECRetDest), DAG, Flags: 0);
9145 Arm64ECRetDest = DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: Arm64ECRetDest,
9146 PtrInfo: MachinePointerInfo());
9147 RetOps.insert(I: RetOps.begin() + 1, Elt: Arm64ECRetDest);
9148 RetOps.insert(I: RetOps.begin() + 2, Elt: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
9149 return DAG.getNode(Opcode: AArch64ISD::TC_RETURN, DL, VT: MVT::Other, Ops: RetOps);
9150 }
9151
9152 return DAG.getNode(Opcode: AArch64ISD::RET_GLUE, DL, VT: MVT::Other, Ops: RetOps);
9153}
9154
9155//===----------------------------------------------------------------------===//
9156// Other Lowering Code
9157//===----------------------------------------------------------------------===//
9158
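// getTargetNode - Create the target-flavoured version of a symbolic operand
// (global address, jump table, constant pool, block address or external
// symbol) for use by the address-materialization helpers below (getGOT,
// getAddrLarge, getAddr, getAddrTiny).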
9159SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
9160 SelectionDAG &DAG,
9161 unsigned Flag) const {
9162 return DAG.getTargetGlobalAddress(GV: N->getGlobal(), DL: SDLoc(N), VT: Ty,
9163 offset: N->getOffset(), TargetFlags: Flag);
9164}
9165
9166SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
9167 SelectionDAG &DAG,
9168 unsigned Flag) const {
9169 return DAG.getTargetJumpTable(JTI: N->getIndex(), VT: Ty, TargetFlags: Flag);
9170}
9171
9172SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
9173 SelectionDAG &DAG,
9174 unsigned Flag) const {
9175 return DAG.getTargetConstantPool(C: N->getConstVal(), VT: Ty, Align: N->getAlign(),
9176 Offset: N->getOffset(), TargetFlags: Flag);
9177}
9178
9179SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
9180 SelectionDAG &DAG,
9181 unsigned Flag) const {
9182 return DAG.getTargetBlockAddress(BA: N->getBlockAddress(), VT: Ty, Offset: 0, TargetFlags: Flag);
9183}
9184
9185SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
9186 SelectionDAG &DAG,
9187 unsigned Flag) const {
9188 return DAG.getTargetExternalSymbol(Sym: N->getSymbol(), VT: Ty, TargetFlags: Flag);
9189}
9190
9191// (loadGOT sym)
9192template <class NodeTy>
9193SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
9194 unsigned Flags) const {
9195 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
9196 SDLoc DL(N);
9197 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
9198 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
9199 // FIXME: Once remat is capable of dealing with instructions with register
9200 // operands, expand this into two nodes instead of using a wrapper node.
9201 return DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: Ty, Operand: GotAddr);
9202}
9203
9204// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
9205template <class NodeTy>
9206SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
9207 unsigned Flags) const {
9208 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
9209 SDLoc DL(N);
9210 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
9211 const unsigned char MO_NC = AArch64II::MO_NC;
9212 return DAG.getNode(
9213 AArch64ISD::WrapperLarge, DL, Ty,
9214 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
9215 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
9216 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
9217 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
9218}
9219
9220// (addlow (adrp %hi(sym)) %lo(sym))
9221template <class NodeTy>
9222SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
9223 unsigned Flags) const {
9224 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
9225 SDLoc DL(N);
9226 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
9227 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
9228 SDValue Lo = getTargetNode(N, Ty, DAG,
9229 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
9230 SDValue ADRP = DAG.getNode(Opcode: AArch64ISD::ADRP, DL, VT: Ty, Operand: Hi);
9231 return DAG.getNode(Opcode: AArch64ISD::ADDlow, DL, VT: Ty, N1: ADRP, N2: Lo);
9232}
9233
9234// (adr sym)
9235template <class NodeTy>
9236SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
9237 unsigned Flags) const {
9238 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
9239 SDLoc DL(N);
9240 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
9241 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
9242 return DAG.getNode(Opcode: AArch64ISD::ADR, DL, VT: Ty, Operand: Sym);
9243}
9244
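/// Lower a global address with the sequence appropriate to the code model and
/// reference kind: a GOT load, a large-code-model wrapper, a tiny-code-model
/// ADR, or the usual ADRP+ADDlow pair, with an extra load for
/// dllimport/COFF-stub references.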
9245SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
9246 SelectionDAG &DAG) const {
9247 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Val&: Op);
9248 const GlobalValue *GV = GN->getGlobal();
9249 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, TM: getTargetMachine());
9250
9251 if (OpFlags != AArch64II::MO_NO_FLAG)
9252 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
9253 "unexpected offset in global node");
9254
  // This also catches the large code model case for Darwin, and the tiny code
  // model with GOT relocations.
9257 if ((OpFlags & AArch64II::MO_GOT) != 0) {
9258 return getGOT(N: GN, DAG, Flags: OpFlags);
9259 }
9260
9261 SDValue Result;
9262 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
9263 !getTargetMachine().isPositionIndependent()) {
9264 Result = getAddrLarge(N: GN, DAG, Flags: OpFlags);
9265 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
9266 Result = getAddrTiny(N: GN, DAG, Flags: OpFlags);
9267 } else {
9268 Result = getAddr(N: GN, DAG, Flags: OpFlags);
9269 }
9270 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
9271 SDLoc DL(GN);
9272 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
9273 Result = DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: Result,
9274 PtrInfo: MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction()));
9275 return Result;
9276}
9277
9278/// Convert a TLS address reference into the correct sequence of loads
9279/// and calls to compute the variable's address (for Darwin, currently) and
9280/// return an SDValue containing the final node.
///
9282/// Darwin only has one TLS scheme which must be capable of dealing with the
9283/// fully general situation, in the worst case. This means:
9284/// + "extern __thread" declaration.
9285/// + Defined in a possibly unknown dynamic library.
9286///
9287/// The general system is that each __thread variable has a [3 x i64] descriptor
9288/// which contains information used by the runtime to calculate the address. The
9289/// only part of this the compiler needs to know about is the first xword, which
9290/// contains a function pointer that must be called with the address of the
9291/// entire descriptor in "x0".
9292///
9293/// Since this descriptor may be in a different unit, in general even the
9294/// descriptor must be accessed via an indirect load. The "ideal" code sequence
9295/// is:
9296/// adrp x0, _var@TLVPPAGE
9297/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
9298/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
9299/// ; the function pointer
9300/// blr x1 ; Uses descriptor address in x0
9301/// ; Address of _var is now in x0.
9302///
9303/// If the address of _var's descriptor *is* known to the linker, then it can
9304/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
9305/// a slight efficiency gain.
9306SDValue
9307AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
9308 SelectionDAG &DAG) const {
9309 assert(Subtarget->isTargetDarwin() &&
9310 "This function expects a Darwin target");
9311
9312 SDLoc DL(Op);
9313 MVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
9314 MVT PtrMemVT = getPointerMemTy(DL: DAG.getDataLayout());
9315 const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: Op)->getGlobal();
9316
9317 SDValue TLVPAddr =
9318 DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS);
9319 SDValue DescAddr = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: TLVPAddr);
9320
9321 // The first entry in the descriptor is a function pointer that we must call
9322 // to obtain the address of the variable.
9323 SDValue Chain = DAG.getEntryNode();
9324 SDValue FuncTLVGet = DAG.getLoad(
9325 VT: PtrMemVT, dl: DL, Chain, Ptr: DescAddr,
9326 PtrInfo: MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction()),
9327 Alignment: Align(PtrMemVT.getSizeInBits() / 8),
9328 MMOFlags: MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
9329 Chain = FuncTLVGet.getValue(R: 1);
9330
9331 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
9332 FuncTLVGet = DAG.getZExtOrTrunc(Op: FuncTLVGet, DL, VT: PtrVT);
9333
9334 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9335 MFI.setAdjustsStack(true);
9336
9337 // TLS calls preserve all registers except those that absolutely must be
9338 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
9339 // silly).
9340 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9341 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
9342 if (Subtarget->hasCustomCallingConv())
9343 TRI->UpdateCustomCallPreservedMask(MF&: DAG.getMachineFunction(), Mask: &Mask);
9344
9345 // Finally, we can make the call. This is just a degenerate version of a
9346 // normal AArch64 call node: x0 takes the address of the descriptor, and
9347 // returns the address of the variable in this thread.
9348 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: AArch64::X0, N: DescAddr, Glue: SDValue());
9349
9350 unsigned Opcode = AArch64ISD::CALL;
9351 SmallVector<SDValue, 8> Ops;
9352 Ops.push_back(Elt: Chain);
9353 Ops.push_back(Elt: FuncTLVGet);
9354
9355 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
9356 if (DAG.getMachineFunction().getFunction().hasFnAttribute(Kind: "ptrauth-calls")) {
9357 Opcode = AArch64ISD::AUTH_CALL;
9358 Ops.push_back(Elt: DAG.getTargetConstant(Val: AArch64PACKey::IA, DL, VT: MVT::i32));
9359 Ops.push_back(Elt: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i64)); // Integer Disc.
9360 Ops.push_back(Elt: DAG.getRegister(Reg: AArch64::NoRegister, VT: MVT::i64)); // Addr Disc.
9361 }
9362
9363 Ops.push_back(Elt: DAG.getRegister(Reg: AArch64::X0, VT: MVT::i64));
9364 Ops.push_back(Elt: DAG.getRegisterMask(RegMask: Mask));
9365 Ops.push_back(Elt: Chain.getValue(R: 1));
9366 Chain = DAG.getNode(Opcode, DL, VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), Ops);
9367 return DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::X0, VT: PtrVT, Glue: Chain.getValue(R: 1));
9368}
9369
9370/// Convert a thread-local variable reference into a sequence of instructions to
9371/// compute the variable's address for the local exec TLS model of ELF targets.
9372/// The sequence depends on the maximum TLS area size.
9373SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
9374 SDValue ThreadBase,
9375 const SDLoc &DL,
9376 SelectionDAG &DAG) const {
9377 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
9378 SDValue TPOff, Addr;
9379
9380 switch (DAG.getTarget().Options.TLSSize) {
9381 default:
9382 llvm_unreachable("Unexpected TLS size");
9383
9384 case 12: {
9385 // mrs x0, TPIDR_EL0
9386 // add x0, x0, :tprel_lo12:a
9387 SDValue Var = DAG.getTargetGlobalAddress(
9388 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
9389 return SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: ThreadBase,
9390 Op2: Var,
9391 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
9392 0);
9393 }
9394
9395 case 24: {
9396 // mrs x0, TPIDR_EL0
9397 // add x0, x0, :tprel_hi12:a
9398 // add x0, x0, :tprel_lo12_nc:a
9399 SDValue HiVar = DAG.getTargetGlobalAddress(
9400 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_HI12);
9401 SDValue LoVar = DAG.getTargetGlobalAddress(
9402 GV, DL, VT: PtrVT, offset: 0,
9403 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9404 Addr = SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: ThreadBase,
9405 Op2: HiVar,
9406 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
9407 0);
9408 return SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: Addr,
9409 Op2: LoVar,
9410 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
9411 0);
9412 }
9413
9414 case 32: {
9415 // mrs x1, TPIDR_EL0
9416 // movz x0, #:tprel_g1:a
9417 // movk x0, #:tprel_g0_nc:a
9418 // add x0, x1, x0
9419 SDValue HiVar = DAG.getTargetGlobalAddress(
9420 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G1);
9421 SDValue LoVar = DAG.getTargetGlobalAddress(
9422 GV, DL, VT: PtrVT, offset: 0,
9423 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
9424 TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::MOVZXi, dl: DL, VT: PtrVT, Op1: HiVar,
9425 Op2: DAG.getTargetConstant(Val: 16, DL, VT: MVT::i32)),
9426 0);
9427 TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::MOVKXi, dl: DL, VT: PtrVT, Op1: TPOff, Op2: LoVar,
9428 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
9429 0);
9430 return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: ThreadBase, N2: TPOff);
9431 }
9432
9433 case 48: {
9434 // mrs x1, TPIDR_EL0
9435 // movz x0, #:tprel_g2:a
9436 // movk x0, #:tprel_g1_nc:a
9437 // movk x0, #:tprel_g0_nc:a
9438 // add x0, x1, x0
9439 SDValue HiVar = DAG.getTargetGlobalAddress(
9440 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G2);
9441 SDValue MiVar = DAG.getTargetGlobalAddress(
9442 GV, DL, VT: PtrVT, offset: 0,
9443 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
9444 SDValue LoVar = DAG.getTargetGlobalAddress(
9445 GV, DL, VT: PtrVT, offset: 0,
9446 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
9447 TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::MOVZXi, dl: DL, VT: PtrVT, Op1: HiVar,
9448 Op2: DAG.getTargetConstant(Val: 32, DL, VT: MVT::i32)),
9449 0);
9450 TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::MOVKXi, dl: DL, VT: PtrVT, Op1: TPOff, Op2: MiVar,
9451 Op3: DAG.getTargetConstant(Val: 16, DL, VT: MVT::i32)),
9452 0);
9453 TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::MOVKXi, dl: DL, VT: PtrVT, Op1: TPOff, Op2: LoVar,
9454 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
9455 0);
9456 return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: ThreadBase, N2: TPOff);
9457 }
9458 }
9459}
9460
9461/// When accessing thread-local variables under either the general-dynamic or
9462/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
9463/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
9464/// is a function pointer to carry out the resolution.
9465///
9466/// The sequence is:
9467/// adrp x0, :tlsdesc:var
9468/// ldr x1, [x0, #:tlsdesc_lo12:var]
9469/// add x0, x0, #:tlsdesc_lo12:var
9470/// .tlsdesccall var
9471/// blr x1
9472/// (TPIDR_EL0 offset now in x0)
9473///
/// This sequence must be produced unscheduled so that the linker can
/// optimize/relax it. A single pseudo-instruction (TLSDESC_CALLSEQ) therefore
/// represents the whole sequence and is expanded very late in the compilation
/// flow, to ensure the instructions are emitted exactly as shown above.
9479SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
9480 const SDLoc &DL,
9481 SelectionDAG &DAG) const {
9482 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
9483
9484 SDValue Chain = DAG.getEntryNode();
9485 SDVTList NodeTys = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
9486
9487 Chain =
9488 DAG.getNode(Opcode: AArch64ISD::TLSDESC_CALLSEQ, DL, VTList: NodeTys, Ops: {Chain, SymAddr});
9489 SDValue Glue = Chain.getValue(R: 1);
9490
9491 return DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::X0, VT: PtrVT, Glue);
9492}
9493
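/// Lower a thread-local global for ELF targets according to its TLS model:
/// local-exec offsets from TPIDR_EL0, initial-exec offsets loaded via the GOT,
/// and TLS-descriptor call sequences for the local-dynamic and general-dynamic
/// models.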
9494SDValue
9495AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
9496 SelectionDAG &DAG) const {
9497 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
9498
9499 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
9500
9501 TLSModel::Model Model = getTargetMachine().getTLSModel(GV: GA->getGlobal());
9502
9503 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
9504 if (Model == TLSModel::LocalDynamic)
9505 Model = TLSModel::GeneralDynamic;
9506 }
9507
9508 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
9509 Model != TLSModel::LocalExec)
9510 report_fatal_error(reason: "ELF TLS only supported in small memory model or "
9511 "in local exec TLS model");
9512 // Different choices can be made for the maximum size of the TLS area for a
9513 // module. For the small address model, the default TLS size is 16MiB and the
9514 // maximum TLS size is 4GiB.
9515 // FIXME: add tiny and large code model support for TLS access models other
9516 // than local exec. We currently generate the same code as small for tiny,
9517 // which may be larger than needed.
9518
9519 SDValue TPOff;
9520 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
9521 SDLoc DL(Op);
9522 const GlobalValue *GV = GA->getGlobal();
9523
9524 SDValue ThreadBase = DAG.getNode(Opcode: AArch64ISD::THREAD_POINTER, DL, VT: PtrVT);
9525
9526 if (Model == TLSModel::LocalExec) {
9527 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
9528 } else if (Model == TLSModel::InitialExec) {
9529 TPOff = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS);
9530 TPOff = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: TPOff);
9531 } else if (Model == TLSModel::LocalDynamic) {
    // Local-dynamic accesses proceed in two phases: first, a general-dynamic
    // TLS descriptor call against the special symbol _TLS_MODULE_BASE_
    // computes the start of the module's TLS region; then a DTPREL offset
    // calculation locates the variable within that region.
9536
9537 // These accesses will need deduplicating if there's more than one.
9538 AArch64FunctionInfo *MFI =
9539 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
9540 MFI->incNumLocalDynamicTLSAccesses();
9541
    // The call itself also needs a relocation so the linker can relax the
    // sequence, but that relocation is neither MO_PAGE nor MO_PAGEOFF, so we
    // need a separate copy of the symbol address.
9545 SDValue SymAddr = DAG.getTargetExternalSymbol(Sym: "_TLS_MODULE_BASE_", VT: PtrVT,
9546 TargetFlags: AArch64II::MO_TLS);
9547
9548 // Now we can calculate the offset from TPIDR_EL0 to this module's
9549 // thread-local area.
9550 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9551
9552 // Now use :dtprel_whatever: operations to calculate this variable's offset
9553 // in its thread-storage area.
9554 SDValue HiVar = DAG.getTargetGlobalAddress(
9555 GV, DL, VT: MVT::i64, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_HI12);
9556 SDValue LoVar = DAG.getTargetGlobalAddress(
9557 GV, DL, VT: MVT::i64, offset: 0,
9558 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9559
9560 TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: TPOff, Op2: HiVar,
9561 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
9562 0);
9563 TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: TPOff, Op2: LoVar,
9564 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
9565 0);
9566 } else if (Model == TLSModel::GeneralDynamic) {
    // The call itself also needs a relocation so the linker can relax the
    // sequence, but that relocation is neither MO_PAGE nor MO_PAGEOFF, so we
    // need a separate copy of the symbol address.
9570 SDValue SymAddr =
9571 DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS);
9572
9573 // Finally we can make a call to calculate the offset from tpidr_el0.
9574 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9575 } else
9576 llvm_unreachable("Unsupported ELF TLS access model");
9577
9578 return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: ThreadBase, N2: TPOff);
9579}
9580
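/// Lower a thread-local global for Windows: load the TLS array pointer from
/// the TEB (x18), index it with _tls_index to find this module's TLS block,
/// and add the variable's offset within the .tls section.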
9581SDValue
9582AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
9583 SelectionDAG &DAG) const {
9584 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
9585
9586 SDValue Chain = DAG.getEntryNode();
9587 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
9588 SDLoc DL(Op);
9589
9590 SDValue TEB = DAG.getRegister(Reg: AArch64::X18, VT: MVT::i64);
9591
9592 // Load the ThreadLocalStoragePointer from the TEB
9593 // A pointer to the TLS array is located at offset 0x58 from the TEB.
9594 SDValue TLSArray =
9595 DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: TEB, N2: DAG.getIntPtrConstant(Val: 0x58, DL));
9596 TLSArray = DAG.getLoad(VT: PtrVT, dl: DL, Chain, Ptr: TLSArray, PtrInfo: MachinePointerInfo());
9597 Chain = TLSArray.getValue(R: 1);
9598
  // Load the TLS index from the C runtime.
  // This does the same as getAddr(), but without having a GlobalAddressSDNode.
  // It also does the same as LOADgot, but uses a generic i32 load rather than
  // the i64 load that LOADgot performs.
9603 SDValue TLSIndexHi =
9604 DAG.getTargetExternalSymbol(Sym: "_tls_index", VT: PtrVT, TargetFlags: AArch64II::MO_PAGE);
9605 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
9606 Sym: "_tls_index", VT: PtrVT, TargetFlags: AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9607 SDValue ADRP = DAG.getNode(Opcode: AArch64ISD::ADRP, DL, VT: PtrVT, Operand: TLSIndexHi);
9608 SDValue TLSIndex =
9609 DAG.getNode(Opcode: AArch64ISD::ADDlow, DL, VT: PtrVT, N1: ADRP, N2: TLSIndexLo);
9610 TLSIndex = DAG.getLoad(VT: MVT::i32, dl: DL, Chain, Ptr: TLSIndex, PtrInfo: MachinePointerInfo());
9611 Chain = TLSIndex.getValue(R: 1);
9612
  // The pointer to the thread's TLS data area is found by indexing the TLS
  // array with the TLS index scaled by 8 (the size of a pointer slot).
9615 TLSIndex = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: PtrVT, Operand: TLSIndex);
9616 SDValue Slot = DAG.getNode(Opcode: ISD::SHL, DL, VT: PtrVT, N1: TLSIndex,
9617 N2: DAG.getConstant(Val: 3, DL, VT: PtrVT));
9618 SDValue TLS = DAG.getLoad(VT: PtrVT, dl: DL, Chain,
9619 Ptr: DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: TLSArray, N2: Slot),
9620 PtrInfo: MachinePointerInfo());
9621 Chain = TLS.getValue(R: 1);
9622
9623 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
9624 const GlobalValue *GV = GA->getGlobal();
9625 SDValue TGAHi = DAG.getTargetGlobalAddress(
9626 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_HI12);
9627 SDValue TGALo = DAG.getTargetGlobalAddress(
9628 GV, DL, VT: PtrVT, offset: 0,
9629 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9630
9631 // Add the offset from the start of the .tls section (section base).
9632 SDValue Addr =
9633 SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: TLS, Op2: TGAHi,
9634 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
9635 0);
9636 Addr = DAG.getNode(Opcode: AArch64ISD::ADDlow, DL, VT: PtrVT, N1: Addr, N2: TGALo);
9637 return Addr;
9638}
9639
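/// Dispatch TLS global-address lowering to the emulated-TLS, Darwin, ELF or
/// Windows implementation.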
9640SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
9641 SelectionDAG &DAG) const {
9642 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
9643 if (DAG.getTarget().useEmulatedTLS())
9644 return LowerToTLSEmulatedModel(GA, DAG);
9645
9646 if (Subtarget->isTargetDarwin())
9647 return LowerDarwinGlobalTLSAddress(Op, DAG);
9648 if (Subtarget->isTargetELF())
9649 return LowerELFGlobalTLSAddress(Op, DAG);
9650 if (Subtarget->isTargetWindows())
9651 return LowerWindowsGlobalTLSAddress(Op, DAG);
9652
9653 llvm_unreachable("Unexpected platform trying to use TLS");
9654}
9655
9656//===----------------------------------------------------------------------===//
9657// PtrAuthGlobalAddress lowering
9658//
9659// We have 3 lowering alternatives to choose from:
9660// - MOVaddrPAC: similar to MOVaddr, with added PAC.
9661// If the GV doesn't need a GOT load (i.e., is locally defined)
9662// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
9663//
9664// - LOADgotPAC: similar to LOADgot, with added PAC.
9665// If the GV needs a GOT load, materialize the pointer using the usual
9666// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
9667// section is assumed to be read-only (for example, via relro mechanism). See
9668// LowerMOVaddrPAC.
9669//
9670// - LOADauthptrstatic: similar to LOADgot, but use a
9671// special stub slot instead of a GOT slot.
9672// Load a signed pointer for symbol 'sym' from a stub slot named
9673// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
9674// resolving. This usually lowers to adrp+ldr, but also emits an entry into
9675// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
9676//
// All 3 are pseudos that are expanded late into longer sequences: this lets us
// provide integrity guarantees on the to-be-signed intermediate values.
9679//
9680// LOADauthptrstatic is undesirable because it requires a large section filled
9681// with often similarly-signed pointers, making it a good harvesting target.
// Thus, it's only used for ptrauth references to extern_weak symbols, where
// the other lowerings would otherwise need explicit null checks.
9684
9685SDValue AArch64TargetLowering::LowerPtrAuthGlobalAddressStatically(
9686 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
9687 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) const {
9688 const auto *TGN = cast<GlobalAddressSDNode>(Val: TGA.getNode());
9689 assert(TGN->getGlobal()->hasExternalWeakLinkage());
9690
9691 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
9692 // offset alone as a pointer if the symbol wasn't available, which would
9693 // probably break null checks in users. Ptrauth complicates things further:
9694 // error out.
9695 if (TGN->getOffset() != 0)
9696 report_fatal_error(
9697 reason: "unsupported non-zero offset in weak ptrauth global reference");
9698
9699 if (!isNullConstant(V: AddrDiscriminator))
9700 report_fatal_error(reason: "unsupported weak addr-div ptrauth global");
9701
9702 SDValue Key = DAG.getTargetConstant(Val: KeyC, DL, VT: MVT::i32);
9703 return SDValue(DAG.getMachineNode(Opcode: AArch64::LOADauthptrstatic, dl: DL, VT: MVT::i64,
9704 Ops: {TGA, Key, Discriminator}),
9705 0);
9706}
9707
9708SDValue
9709AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
9710 SelectionDAG &DAG) const {
9711 SDValue Ptr = Op.getOperand(i: 0);
9712 uint64_t KeyC = Op.getConstantOperandVal(i: 1);
9713 SDValue AddrDiscriminator = Op.getOperand(i: 2);
9714 uint64_t DiscriminatorC = Op.getConstantOperandVal(i: 3);
9715 EVT VT = Op.getValueType();
9716 SDLoc DL(Op);
9717
9718 if (KeyC > AArch64PACKey::LAST)
9719 report_fatal_error(reason: "key in ptrauth global out of range [0, " +
9720 Twine((int)AArch64PACKey::LAST) + "]");
9721
9722 // Blend only works if the integer discriminator is 16-bit wide.
9723 if (!isUInt<16>(x: DiscriminatorC))
9724 report_fatal_error(
9725 reason: "constant discriminator in ptrauth global out of range [0, 0xffff]");
9726
9727 // Choosing between 3 lowering alternatives is target-specific.
9728 if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
9729 report_fatal_error(reason: "ptrauth global lowering only supported on MachO/ELF");
9730
9731 int64_t PtrOffsetC = 0;
9732 if (Ptr.getOpcode() == ISD::ADD) {
9733 PtrOffsetC = Ptr.getConstantOperandVal(i: 1);
9734 Ptr = Ptr.getOperand(i: 0);
9735 }
9736 const auto *PtrN = cast<GlobalAddressSDNode>(Val: Ptr.getNode());
9737 const GlobalValue *PtrGV = PtrN->getGlobal();
9738
9739 // Classify the reference to determine whether it needs a GOT load.
9740 const unsigned OpFlags =
9741 Subtarget->ClassifyGlobalReference(GV: PtrGV, TM: getTargetMachine());
9742 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
9743 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
9744 "unsupported non-GOT op flags on ptrauth global reference");
9745
9746 // Fold any offset into the GV; our pseudos expect it there.
9747 PtrOffsetC += PtrN->getOffset();
9748 SDValue TPtr = DAG.getTargetGlobalAddress(GV: PtrGV, DL, VT, offset: PtrOffsetC,
9749 /*TargetFlags=*/0);
9750 assert(PtrN->getTargetFlags() == 0 &&
9751 "unsupported target flags on ptrauth global");
9752
9753 SDValue Key = DAG.getTargetConstant(Val: KeyC, DL, VT: MVT::i32);
9754 SDValue Discriminator = DAG.getTargetConstant(Val: DiscriminatorC, DL, VT: MVT::i64);
9755 SDValue TAddrDiscriminator = !isNullConstant(V: AddrDiscriminator)
9756 ? AddrDiscriminator
9757 : DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64);
9758
9759 // No GOT load needed -> MOVaddrPAC
9760 if (!NeedsGOTLoad) {
9761 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
9762 return SDValue(
9763 DAG.getMachineNode(Opcode: AArch64::MOVaddrPAC, dl: DL, VT: MVT::i64,
9764 Ops: {TPtr, Key, TAddrDiscriminator, Discriminator}),
9765 0);
9766 }
9767
9768 // GOT load -> LOADgotPAC
9769 // Note that we disallow extern_weak refs to avoid null checks later.
9770 if (!PtrGV->hasExternalWeakLinkage())
9771 return SDValue(
9772 DAG.getMachineNode(Opcode: AArch64::LOADgotPAC, dl: DL, VT: MVT::i64,
9773 Ops: {TPtr, Key, TAddrDiscriminator, Discriminator}),
9774 0);
9775
9776 // extern_weak ref -> LOADauthptrstatic
9777 return LowerPtrAuthGlobalAddressStatically(
9778 TGA: TPtr, DL, VT, KeyC: (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
9779 DAG);
9780}
9781
// Look through \p Val to determine the bit that can be used to check the sign
// of the value; returns the unextended value and the sign-bit position.
9785std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
9786 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
9787 return {Val.getOperand(i: 0),
9788 cast<VTSDNode>(Val: Val.getOperand(i: 1))->getVT().getFixedSizeInBits() -
9789 1};
9790
9791 if (Val.getOpcode() == ISD::SIGN_EXTEND)
9792 return {Val.getOperand(i: 0),
9793 Val.getOperand(i: 0)->getValueType(ResNo: 0).getFixedSizeInBits() - 1};
9794
9795 return {Val, Val.getValueSizeInBits() - 1};
9796}
9797
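/// Lower BR_CC: branches on overflow-intrinsic results use the overflow flag
/// directly, integer compares against 0 or -1 try CBZ/CBNZ/TBZ/TBNZ (when
/// speculative load hardening allows non-flag-setting branches), and FP
/// compares may need two conditional branches.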
9798SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
9799 SDValue Chain = Op.getOperand(i: 0);
9800 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 1))->get();
9801 SDValue LHS = Op.getOperand(i: 2);
9802 SDValue RHS = Op.getOperand(i: 3);
9803 SDValue Dest = Op.getOperand(i: 4);
9804 SDLoc dl(Op);
9805
9806 MachineFunction &MF = DAG.getMachineFunction();
9807 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
9808 // will not be produced, as they are conditional branch instructions that do
9809 // not set flags.
9810 bool ProduceNonFlagSettingCondBr =
9811 !MF.getFunction().hasFnAttribute(Kind: Attribute::SpeculativeLoadHardening);
9812
9813 // Handle f128 first, since lowering it will result in comparing the return
9814 // value of a libcall against zero, which is just what the rest of LowerBR_CC
9815 // is expecting to deal with.
9816 if (LHS.getValueType() == MVT::f128) {
9817 softenSetCCOperands(DAG, VT: MVT::f128, NewLHS&: LHS, NewRHS&: RHS, CCCode&: CC, DL: dl, OldLHS: LHS, OldRHS: RHS);
9818
9819 // If softenSetCCOperands returned a scalar, we need to compare the result
9820 // against zero to select between true and false values.
9821 if (!RHS.getNode()) {
9822 RHS = DAG.getConstant(Val: 0, DL: dl, VT: LHS.getValueType());
9823 CC = ISD::SETNE;
9824 }
9825 }
9826
9827 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
9828 // instruction.
9829 if (ISD::isOverflowIntrOpRes(Op: LHS) && isOneConstant(V: RHS) &&
9830 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
9831 // Only lower legal XALUO ops.
9832 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: LHS->getValueType(ResNo: 0)))
9833 return SDValue();
9834
9835 // The actual operation with overflow check.
9836 AArch64CC::CondCode OFCC;
9837 SDValue Value, Overflow;
9838 std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC&: OFCC, Op: LHS.getValue(R: 0), DAG);
9839
9840 if (CC == ISD::SETNE)
9841 OFCC = getInvertedCondCode(Code: OFCC);
9842 SDValue CCVal = DAG.getConstant(Val: OFCC, DL: dl, VT: MVT::i32);
9843
9844 return DAG.getNode(Opcode: AArch64ISD::BRCOND, DL: dl, VT: MVT::Other, N1: Chain, N2: Dest, N3: CCVal,
9845 N4: Overflow);
9846 }
9847
9848 if (LHS.getValueType().isInteger()) {
9849 assert((LHS.getValueType() == RHS.getValueType()) &&
9850 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9851
9852 // If the RHS of the comparison is zero, we can potentially fold this
9853 // to a specialized branch.
9854 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Val&: RHS);
9855 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
9856 if (CC == ISD::SETEQ) {
9857 // See if we can use a TBZ to fold in an AND as well.
9858 // TBZ has a smaller branch displacement than CBZ. If the offset is
9859 // out of bounds, a late MI-layer pass rewrites branches.
9860 // 403.gcc is an example that hits this case.
9861 if (LHS.getOpcode() == ISD::AND &&
9862 isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) &&
9863 isPowerOf2_64(Value: LHS.getConstantOperandVal(i: 1))) {
9864 SDValue Test = LHS.getOperand(i: 0);
9865 uint64_t Mask = LHS.getConstantOperandVal(i: 1);
9866 return DAG.getNode(Opcode: AArch64ISD::TBZ, DL: dl, VT: MVT::Other, N1: Chain, N2: Test,
9867 N3: DAG.getConstant(Val: Log2_64(Value: Mask), DL: dl, VT: MVT::i64),
9868 N4: Dest);
9869 }
9870
9871 return DAG.getNode(Opcode: AArch64ISD::CBZ, DL: dl, VT: MVT::Other, N1: Chain, N2: LHS, N3: Dest);
9872 } else if (CC == ISD::SETNE) {
9873 // See if we can use a TBZ to fold in an AND as well.
9874 // TBZ has a smaller branch displacement than CBZ. If the offset is
9875 // out of bounds, a late MI-layer pass rewrites branches.
9876 // 403.gcc is an example that hits this case.
9877 if (LHS.getOpcode() == ISD::AND &&
9878 isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) &&
9879 isPowerOf2_64(Value: LHS.getConstantOperandVal(i: 1))) {
9880 SDValue Test = LHS.getOperand(i: 0);
9881 uint64_t Mask = LHS.getConstantOperandVal(i: 1);
9882 return DAG.getNode(Opcode: AArch64ISD::TBNZ, DL: dl, VT: MVT::Other, N1: Chain, N2: Test,
9883 N3: DAG.getConstant(Val: Log2_64(Value: Mask), DL: dl, VT: MVT::i64),
9884 N4: Dest);
9885 }
9886
9887 return DAG.getNode(Opcode: AArch64ISD::CBNZ, DL: dl, VT: MVT::Other, N1: Chain, N2: LHS, N3: Dest);
9888 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
9889 // Don't combine AND since emitComparison converts the AND to an ANDS
9890 // (a.k.a. TST) and the test in the test bit and branch instruction
9891 // becomes redundant. This would also increase register pressure.
9892 uint64_t SignBitPos;
9893 std::tie(args&: LHS, args&: SignBitPos) = lookThroughSignExtension(Val: LHS);
9894 return DAG.getNode(Opcode: AArch64ISD::TBNZ, DL: dl, VT: MVT::Other, N1: Chain, N2: LHS,
9895 N3: DAG.getConstant(Val: SignBitPos, DL: dl, VT: MVT::i64), N4: Dest);
9896 }
9897 }
9898 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
9899 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
9900 // Don't combine AND since emitComparison converts the AND to an ANDS
9901 // (a.k.a. TST) and the test in the test bit and branch instruction
9902 // becomes redundant. This would also increase register pressure.
9903 uint64_t SignBitPos;
9904 std::tie(args&: LHS, args&: SignBitPos) = lookThroughSignExtension(Val: LHS);
9905 return DAG.getNode(Opcode: AArch64ISD::TBZ, DL: dl, VT: MVT::Other, N1: Chain, N2: LHS,
9906 N3: DAG.getConstant(Val: SignBitPos, DL: dl, VT: MVT::i64), N4: Dest);
9907 }
9908
9909 SDValue CCVal;
9910 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, AArch64cc&: CCVal, DAG, dl);
9911 return DAG.getNode(Opcode: AArch64ISD::BRCOND, DL: dl, VT: MVT::Other, N1: Chain, N2: Dest, N3: CCVal,
9912 N4: Cmp);
9913 }
9914
9915 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
9916 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9917
9918 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9919 // clean. Some of them require two branches to implement.
9920 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9921 AArch64CC::CondCode CC1, CC2;
9922 changeFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2);
9923 SDValue CC1Val = DAG.getConstant(Val: CC1, DL: dl, VT: MVT::i32);
9924 SDValue BR1 =
9925 DAG.getNode(Opcode: AArch64ISD::BRCOND, DL: dl, VT: MVT::Other, N1: Chain, N2: Dest, N3: CC1Val, N4: Cmp);
9926 if (CC2 != AArch64CC::AL) {
9927 SDValue CC2Val = DAG.getConstant(Val: CC2, DL: dl, VT: MVT::i32);
9928 return DAG.getNode(Opcode: AArch64ISD::BRCOND, DL: dl, VT: MVT::Other, N1: BR1, N2: Dest, N3: CC2Val,
9929 N4: Cmp);
9930 }
9931
9932 return BR1;
9933}
9934
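/// Lower FCOPYSIGN as a bitwise select (BSP) on the sign bit. Scalars are
/// moved into vector registers first, and SVE-lowered fixed-length vectors are
/// re-emitted on a scalable container type.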
9935SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
9936 SelectionDAG &DAG) const {
9937 if (!Subtarget->isNeonAvailable() &&
9938 !Subtarget->useSVEForFixedLengthVectors())
9939 return SDValue();
9940
9941 EVT VT = Op.getValueType();
9942 EVT IntVT = VT.changeTypeToInteger();
9943 SDLoc DL(Op);
9944
9945 SDValue In1 = Op.getOperand(i: 0);
9946 SDValue In2 = Op.getOperand(i: 1);
9947 EVT SrcVT = In2.getValueType();
9948
9949 if (!SrcVT.bitsEq(VT))
9950 In2 = DAG.getFPExtendOrRound(Op: In2, DL, VT);
9951
9952 if (VT.isScalableVector())
9953 IntVT =
9954 getPackedSVEVectorVT(VT: VT.getVectorElementType().changeTypeToInteger());
9955
9956 if (VT.isFixedLengthVector() &&
9957 useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) {
9958 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
9959
9960 In1 = convertToScalableVector(DAG, VT: ContainerVT, V: In1);
9961 In2 = convertToScalableVector(DAG, VT: ContainerVT, V: In2);
9962
9963 SDValue Res = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: ContainerVT, N1: In1, N2: In2);
9964 return convertFromScalableVector(DAG, VT, V: Res);
9965 }
9966
9967 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
9968 if (VT.isScalableVector())
9969 return getSVESafeBitCast(VT, Op, DAG);
9970
9971 return DAG.getBitcast(VT, V: Op);
9972 };
9973
9974 SDValue VecVal1, VecVal2;
9975 EVT VecVT;
9976 auto SetVecVal = [&](int Idx = -1) {
9977 if (!VT.isVector()) {
9978 VecVal1 =
9979 DAG.getTargetInsertSubreg(SRIdx: Idx, DL, VT: VecVT, Operand: DAG.getUNDEF(VT: VecVT), Subreg: In1);
9980 VecVal2 =
9981 DAG.getTargetInsertSubreg(SRIdx: Idx, DL, VT: VecVT, Operand: DAG.getUNDEF(VT: VecVT), Subreg: In2);
9982 } else {
9983 VecVal1 = BitCast(VecVT, In1, DAG);
9984 VecVal2 = BitCast(VecVT, In2, DAG);
9985 }
9986 };
9987 if (VT.isVector()) {
9988 VecVT = IntVT;
9989 SetVecVal();
9990 } else if (VT == MVT::f64) {
9991 VecVT = MVT::v2i64;
9992 SetVecVal(AArch64::dsub);
9993 } else if (VT == MVT::f32) {
9994 VecVT = MVT::v4i32;
9995 SetVecVal(AArch64::ssub);
9996 } else if (VT == MVT::f16 || VT == MVT::bf16) {
9997 VecVT = MVT::v8i16;
9998 SetVecVal(AArch64::hsub);
9999 } else {
10000 llvm_unreachable("Invalid type for copysign!");
10001 }
10002
10003 unsigned BitWidth = In1.getScalarValueSizeInBits();
10004 SDValue SignMaskV = DAG.getConstant(Val: ~APInt::getSignMask(BitWidth), DL, VT: VecVT);
10005
10006 // We want to materialize a mask with every bit but the high bit set, but the
10007 // AdvSIMD immediate moves cannot materialize that in a single instruction for
10008 // 64-bit elements. Instead, materialize all bits set and then negate that.
10009 if (VT == MVT::f64 || VT == MVT::v2f64) {
10010 SignMaskV = DAG.getConstant(Val: APInt::getAllOnes(numBits: BitWidth), DL, VT: VecVT);
10011 SignMaskV = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2f64, Operand: SignMaskV);
10012 SignMaskV = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::v2f64, Operand: SignMaskV);
10013 SignMaskV = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i64, Operand: SignMaskV);
10014 }
10015
10016 SDValue BSP =
10017 DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT: VecVT, N1: SignMaskV, N2: VecVal1, N3: VecVal2);
10018 if (VT == MVT::f16 || VT == MVT::bf16)
10019 return DAG.getTargetExtractSubreg(SRIdx: AArch64::hsub, DL, VT, Operand: BSP);
10020 if (VT == MVT::f32)
10021 return DAG.getTargetExtractSubreg(SRIdx: AArch64::ssub, DL, VT, Operand: BSP);
10022 if (VT == MVT::f64)
10023 return DAG.getTargetExtractSubreg(SRIdx: AArch64::dsub, DL, VT, Operand: BSP);
10024
10025 return BitCast(VT, BSP, DAG);
10026}
10027
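/// Lower CTPOP and PARITY. Scalar popcounts go through the AdvSIMD CNT+UADDLV
/// sequence, vector popcounts use byte-wise CNT widened with UDOT or pairwise
/// UADDLP, and PARITY takes the low bit of the popcount.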
10028SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
10029 SelectionDAG &DAG) const {
10030 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
10031 Kind: Attribute::NoImplicitFloat))
10032 return SDValue();
10033
10034 EVT VT = Op.getValueType();
10035 if (VT.isScalableVector() ||
10036 useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
10037 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::CTPOP_MERGE_PASSTHRU);
10038
10039 if (!Subtarget->isNeonAvailable())
10040 return SDValue();
10041
10042 bool IsParity = Op.getOpcode() == ISD::PARITY;
10043 SDValue Val = Op.getOperand(i: 0);
10044 SDLoc DL(Op);
10045
  // For i32 parity, the generic expansion using EORs is more efficient than
  // going through the floating-point/AdvSIMD registers.
10048 if (VT == MVT::i32 && IsParity)
10049 return SDValue();
10050
  // In the absence of a scalar CNT (popcount) instruction, a GPR popcount can
  // be more efficiently lowered to the following sequence that uses AdvSIMD
  // registers/instructions, as long as the copies to/from the AdvSIMD
  // registers are cheap.
10055 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
10056 // CNT V0.8B, V0.8B // 8xbyte pop-counts
10057 // ADDV B0, V0.8B // sum 8xbyte pop-counts
10058 // UMOV X0, V0.B[0] // copy byte result back to integer reg
10059 if (VT == MVT::i32 || VT == MVT::i64) {
10060 if (VT == MVT::i32)
10061 Val = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: Val);
10062 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v8i8, Operand: Val);
10063
10064 SDValue CtPop = DAG.getNode(Opcode: ISD::CTPOP, DL, VT: MVT::v8i8, Operand: Val);
10065 SDValue UaddLV = DAG.getNode(Opcode: AArch64ISD::UADDLV, DL, VT: MVT::v4i32, Operand: CtPop);
10066 UaddLV = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: UaddLV,
10067 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
10068
10069 if (IsParity)
10070 UaddLV = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: UaddLV,
10071 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
10072
10073 if (VT == MVT::i64)
10074 UaddLV = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: UaddLV);
10075 return UaddLV;
10076 } else if (VT == MVT::i128) {
10077 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v16i8, Operand: Val);
10078
10079 SDValue CtPop = DAG.getNode(Opcode: ISD::CTPOP, DL, VT: MVT::v16i8, Operand: Val);
10080 SDValue UaddLV = DAG.getNode(Opcode: AArch64ISD::UADDLV, DL, VT: MVT::v4i32, Operand: CtPop);
10081 UaddLV = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: UaddLV,
10082 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
10083
10084 if (IsParity)
10085 UaddLV = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: UaddLV,
10086 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
10087
10088 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i128, Operand: UaddLV);
10089 }
10090
10091 assert(!IsParity && "ISD::PARITY of vector types not supported");
10092
10093 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
10094 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
10095 "Unexpected type for custom ctpop lowering");
10096
10097 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
10098 Val = DAG.getBitcast(VT: VT8Bit, V: Val);
10099 Val = DAG.getNode(Opcode: ISD::CTPOP, DL, VT: VT8Bit, Operand: Val);
10100
10101 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
10102 VT.getVectorNumElements() >= 2) {
10103 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
10104 SDValue Zeros = DAG.getConstant(Val: 0, DL, VT: DT);
10105 SDValue Ones = DAG.getConstant(Val: 1, DL, VT: VT8Bit);
10106
10107 if (VT == MVT::v2i64) {
10108 Val = DAG.getNode(Opcode: AArch64ISD::UDOT, DL, VT: DT, N1: Zeros, N2: Ones, N3: Val);
10109 Val = DAG.getNode(Opcode: AArch64ISD::UADDLP, DL, VT, Operand: Val);
10110 } else if (VT == MVT::v2i32) {
10111 Val = DAG.getNode(Opcode: AArch64ISD::UDOT, DL, VT: DT, N1: Zeros, N2: Ones, N3: Val);
10112 } else if (VT == MVT::v4i32) {
10113 Val = DAG.getNode(Opcode: AArch64ISD::UDOT, DL, VT: DT, N1: Zeros, N2: Ones, N3: Val);
10114 } else {
10115 llvm_unreachable("Unexpected type for custom ctpop lowering");
10116 }
10117
10118 return Val;
10119 }
10120
10121 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
10122 unsigned EltSize = 8;
10123 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
10124 while (EltSize != VT.getScalarSizeInBits()) {
10125 EltSize *= 2;
10126 NumElts /= 2;
10127 MVT WidenVT = MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: EltSize), NumElements: NumElts);
10128 Val = DAG.getNode(Opcode: AArch64ISD::UADDLP, DL, VT: WidenVT, Operand: Val);
10129 }
10130
10131 return Val;
10132}
10133
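/// Lower CTTZ for SVE-lowered types as CTLZ(BITREVERSE(x)).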
10134SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
10135 EVT VT = Op.getValueType();
10136 assert(VT.isScalableVector() ||
10137 useSVEForFixedLengthVectorVT(
10138 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
10139
10140 SDLoc DL(Op);
10141 SDValue RBIT = DAG.getNode(Opcode: ISD::BITREVERSE, DL, VT, Operand: Op.getOperand(i: 0));
10142 return DAG.getNode(Opcode: ISD::CTLZ, DL, VT, Operand: RBIT);
10143}
10144
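/// Lower [SU]MIN/[SU]MAX: SVE-lowered types use the predicated MIN/MAX nodes;
/// everything else expands to SETCC + SELECT.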
10145SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
10146 SelectionDAG &DAG) const {
10147
10148 EVT VT = Op.getValueType();
10149 SDLoc DL(Op);
10150 unsigned Opcode = Op.getOpcode();
10151 ISD::CondCode CC;
10152 switch (Opcode) {
10153 default:
10154 llvm_unreachable("Wrong instruction");
10155 case ISD::SMAX:
10156 CC = ISD::SETGT;
10157 break;
10158 case ISD::SMIN:
10159 CC = ISD::SETLT;
10160 break;
10161 case ISD::UMAX:
10162 CC = ISD::SETUGT;
10163 break;
10164 case ISD::UMIN:
10165 CC = ISD::SETULT;
10166 break;
10167 }
10168
10169 if (VT.isScalableVector() ||
10170 useSVEForFixedLengthVectorVT(
10171 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
10172 switch (Opcode) {
10173 default:
10174 llvm_unreachable("Wrong instruction");
10175 case ISD::SMAX:
10176 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SMAX_PRED);
10177 case ISD::SMIN:
10178 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SMIN_PRED);
10179 case ISD::UMAX:
10180 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::UMAX_PRED);
10181 case ISD::UMIN:
10182 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::UMIN_PRED);
10183 }
10184 }
10185
10186 SDValue Op0 = Op.getOperand(i: 0);
10187 SDValue Op1 = Op.getOperand(i: 1);
10188 SDValue Cond = DAG.getSetCC(DL, VT, LHS: Op0, RHS: Op1, Cond: CC);
10189 return DAG.getSelect(DL, VT, Cond, LHS: Op0, RHS: Op1);
10190}
10191
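/// Lower BITREVERSE: SVE-lowered types use the merging predicated node, while
/// NEON vectors reverse the bytes within each element (REV32/REV64) and then
/// bit-reverse each byte.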
10192SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
10193 SelectionDAG &DAG) const {
10194 EVT VT = Op.getValueType();
10195
10196 if (VT.isScalableVector() ||
10197 useSVEForFixedLengthVectorVT(
10198 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
10199 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
10200
10201 SDLoc DL(Op);
10202 SDValue REVB;
10203 MVT VST;
10204
10205 switch (VT.getSimpleVT().SimpleTy) {
10206 default:
10207 llvm_unreachable("Invalid type for bitreverse!");
10208
10209 case MVT::v2i32: {
10210 VST = MVT::v8i8;
10211 REVB = DAG.getNode(Opcode: AArch64ISD::REV32, DL, VT: VST, Operand: Op.getOperand(i: 0));
10212
10213 break;
10214 }
10215
10216 case MVT::v4i32: {
10217 VST = MVT::v16i8;
10218 REVB = DAG.getNode(Opcode: AArch64ISD::REV32, DL, VT: VST, Operand: Op.getOperand(i: 0));
10219
10220 break;
10221 }
10222
10223 case MVT::v1i64: {
10224 VST = MVT::v8i8;
10225 REVB = DAG.getNode(Opcode: AArch64ISD::REV64, DL, VT: VST, Operand: Op.getOperand(i: 0));
10226
10227 break;
10228 }
10229
10230 case MVT::v2i64: {
10231 VST = MVT::v16i8;
10232 REVB = DAG.getNode(Opcode: AArch64ISD::REV64, DL, VT: VST, Operand: Op.getOperand(i: 0));
10233
10234 break;
10235 }
10236 }
10237
10238 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT,
10239 Operand: DAG.getNode(Opcode: ISD::BITREVERSE, DL, VT: VST, Operand: REVB));
10240}
10241
// Check whether N is a chain of ORs whose leaves are XOR compares, collecting
// the XOR operand pairs into WorkList.
10243static bool
10244isOrXorChain(SDValue N, unsigned &Num,
10245 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
10246 if (Num == MaxXors)
10247 return false;
10248
10249 // Skip the one-use zext
10250 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
10251 N = N->getOperand(Num: 0);
10252
10253 // The leaf node must be XOR
10254 if (N->getOpcode() == ISD::XOR) {
10255 WorkList.push_back(Elt: std::make_pair(x: N->getOperand(Num: 0), y: N->getOperand(Num: 1)));
10256 Num++;
10257 return true;
10258 }
10259
10260 // All the non-leaf nodes must be OR.
10261 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
10262 return false;
10263
10264 if (isOrXorChain(N: N->getOperand(Num: 0), Num, WorkList) &&
10265 isOrXorChain(N: N->getOperand(Num: 1), Num, WorkList))
10266 return true;
10267 return false;
10268}
10269
// Transform chains of ORs and XORs, as typically produced by expanded
// memcmp/bcmp, into a chained comparison.
10271static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
10272 SDValue LHS = N->getOperand(Num: 0);
10273 SDValue RHS = N->getOperand(Num: 1);
10274 SDLoc DL(N);
10275 EVT VT = N->getValueType(ResNo: 0);
10276 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
10277
10278 // Only handle integer compares.
10279 if (N->getOpcode() != ISD::SETCC)
10280 return SDValue();
10281
10282 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
10283 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
10284 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
10285 unsigned NumXors = 0;
10286 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(V: RHS) &&
10287 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
10288 isOrXorChain(N: LHS, Num&: NumXors, WorkList)) {
10289 SDValue XOR0, XOR1;
10290 std::tie(args&: XOR0, args&: XOR1) = WorkList[0];
10291 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
10292 SDValue Cmp = DAG.getSetCC(DL, VT, LHS: XOR0, RHS: XOR1, Cond);
10293 for (unsigned I = 1; I < WorkList.size(); I++) {
10294 std::tie(args&: XOR0, args&: XOR1) = WorkList[I];
10295 SDValue CmpChain = DAG.getSetCC(DL, VT, LHS: XOR0, RHS: XOR1, Cond);
10296 Cmp = DAG.getNode(Opcode: LogicOp, DL, VT, N1: Cmp, N2: CmpChain);
10297 }
10298
    // Exit early with the combined comparison, which helps reduce indentation.
10300 return Cmp;
10301 }
10302
10303 return SDValue();
10304}
10305
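/// Lower scalar SETCC (including the strict FP variants). Integer compares
/// become a CMP feeding an inverted-condition CSEL so they match a single
/// CSINC; FP compares may need a second CSEL when the LLVM condition maps to
/// two AArch64 conditions; f128 compares are softened to a libcall first.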
10306SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
10307
10308 if (Op.getValueType().isVector())
10309 return LowerVSETCC(Op, DAG);
10310
10311 bool IsStrict = Op->isStrictFPOpcode();
10312 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10313 unsigned OpNo = IsStrict ? 1 : 0;
10314 SDValue Chain;
10315 if (IsStrict)
10316 Chain = Op.getOperand(i: 0);
10317 SDValue LHS = Op.getOperand(i: OpNo + 0);
10318 SDValue RHS = Op.getOperand(i: OpNo + 1);
10319 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: OpNo + 2))->get();
10320 SDLoc dl(Op);
10321
10322 // We chose ZeroOrOneBooleanContents, so use zero and one.
10323 EVT VT = Op.getValueType();
10324 SDValue TVal = DAG.getConstant(Val: 1, DL: dl, VT);
10325 SDValue FVal = DAG.getConstant(Val: 0, DL: dl, VT);
10326
10327 // Handle f128 first, since one possible outcome is a normal integer
10328 // comparison which gets picked up by the next if statement.
10329 if (LHS.getValueType() == MVT::f128) {
10330 softenSetCCOperands(DAG, VT: MVT::f128, NewLHS&: LHS, NewRHS&: RHS, CCCode&: CC, DL: dl, OldLHS: LHS, OldRHS: RHS, Chain,
10331 IsSignaling);
10332
10333 // If softenSetCCOperands returned a scalar, use it.
10334 if (!RHS.getNode()) {
10335 assert(LHS.getValueType() == Op.getValueType() &&
10336 "Unexpected setcc expansion!");
10337 return IsStrict ? DAG.getMergeValues(Ops: {LHS, Chain}, dl) : LHS;
10338 }
10339 }
10340
10341 if (LHS.getValueType().isInteger()) {
10342 SDValue CCVal;
10343 SDValue Cmp = getAArch64Cmp(
10344 LHS, RHS, CC: ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType()), AArch64cc&: CCVal, DAG, dl);
10345
10346 // Note that we inverted the condition above, so we reverse the order of
10347 // the true and false operands here. This will allow the setcc to be
10348 // matched to a single CSINC instruction.
10349 SDValue Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: FVal, N2: TVal, N3: CCVal, N4: Cmp);
10350 return IsStrict ? DAG.getMergeValues(Ops: {Res, Chain}, dl) : Res;
10351 }
10352
10353 // Now we know we're dealing with FP values.
10354 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
10355 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
10356
10357 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
10358 // and do the comparison.
10359 SDValue Cmp;
10360 if (IsStrict)
10361 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
10362 else
10363 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
10364
10365 AArch64CC::CondCode CC1, CC2;
10366 changeFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2);
10367 SDValue Res;
10368 if (CC2 == AArch64CC::AL) {
10369 changeFPCCToAArch64CC(CC: ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType()), CondCode&: CC1,
10370 CondCode2&: CC2);
10371 SDValue CC1Val = DAG.getConstant(Val: CC1, DL: dl, VT: MVT::i32);
10372
10373 // Note that we inverted the condition above, so we reverse the order of
10374 // the true and false operands here. This will allow the setcc to be
10375 // matched to a single CSINC instruction.
10376 Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: FVal, N2: TVal, N3: CC1Val, N4: Cmp);
10377 } else {
    // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
    // totally clean. Some of them require two CSELs to implement. When that
    // happens, we emit the first CSEL and then emit a second using the output
    // of the first as the RHS. We're effectively OR'ing the two CC's together.
10382
10383 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
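    // For example (illustrative), SETUEQ maps to CC1 = EQ and CC2 = VS, giving
    // roughly:
    //   fcmp s_lhs, s_rhs
    //   csel w8, w_tval, w_fval, eq
    //   csel w0, w_tval, w8, vs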
10384 SDValue CC1Val = DAG.getConstant(Val: CC1, DL: dl, VT: MVT::i32);
10385 SDValue CS1 =
10386 DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: TVal, N2: FVal, N3: CC1Val, N4: Cmp);
10387
10388 SDValue CC2Val = DAG.getConstant(Val: CC2, DL: dl, VT: MVT::i32);
10389 Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: TVal, N2: CS1, N3: CC2Val, N4: Cmp);
10390 }
10391 return IsStrict ? DAG.getMergeValues(Ops: {Res, Cmp.getValue(R: 1)}, dl) : Res;
10392}
10393
10394SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
10395 SelectionDAG &DAG) const {
10396
10397 SDValue LHS = Op.getOperand(i: 0);
10398 SDValue RHS = Op.getOperand(i: 1);
10399 EVT VT = LHS.getValueType();
10400 if (VT != MVT::i32 && VT != MVT::i64)
10401 return SDValue();
10402
10403 SDLoc DL(Op);
10404 SDValue Carry = Op.getOperand(i: 2);
  // SBCS uses a carry, not a borrow, so the carry flag should be inverted first.
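  // Illustrative arithmetic: SBC computes Rn - Rm - (1 - C), i.e. C == 1 means
  // "no borrow", so an incoming borrow B has to be presented as a carry of
  // (1 - B), which is what the inversion below does.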
10406 SDValue InvCarry = valueToCarryFlag(Value: Carry, DAG, Invert: true);
10407 SDValue Cmp = DAG.getNode(Opcode: AArch64ISD::SBCS, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT::Glue),
10408 N1: LHS, N2: RHS, N3: InvCarry);
10409
10410 EVT OpVT = Op.getValueType();
10411 SDValue TVal = DAG.getConstant(Val: 1, DL, VT: OpVT);
10412 SDValue FVal = DAG.getConstant(Val: 0, DL, VT: OpVT);
10413
10414 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: Op.getOperand(i: 3))->get();
10415 ISD::CondCode CondInv = ISD::getSetCCInverse(Operation: Cond, Type: VT);
10416 SDValue CCVal =
10417 DAG.getConstant(Val: changeIntCCToAArch64CC(CC: CondInv), DL, VT: MVT::i32);
10418 // Inputs are swapped because the condition is inverted. This will allow
10419 // matching with a single CSINC instruction.
10420 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: OpVT, N1: FVal, N2: TVal, N3: CCVal,
10421 N4: Cmp.getValue(R: 1));
10422}
10423
10424SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
10425 SDValue RHS, SDValue TVal,
10426 SDValue FVal, const SDLoc &dl,
10427 SelectionDAG &DAG) const {
10428 // Handle f128 first, because it will result in a comparison of some RTLIB
10429 // call result against zero.
10430 if (LHS.getValueType() == MVT::f128) {
10431 softenSetCCOperands(DAG, VT: MVT::f128, NewLHS&: LHS, NewRHS&: RHS, CCCode&: CC, DL: dl, OldLHS: LHS, OldRHS: RHS);
10432
10433 // If softenSetCCOperands returned a scalar, we need to compare the result
10434 // against zero to select between true and false values.
10435 if (!RHS.getNode()) {
10436 RHS = DAG.getConstant(Val: 0, DL: dl, VT: LHS.getValueType());
10437 CC = ISD::SETNE;
10438 }
10439 }
10440
  // Also handle f16, for which we need to do an f32 comparison.
10442 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
10443 LHS.getValueType() == MVT::bf16) {
10444 LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f32, Operand: LHS);
10445 RHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f32, Operand: RHS);
10446 }
10447
10448 // Next, handle integers.
10449 if (LHS.getValueType().isInteger()) {
10450 assert((LHS.getValueType() == RHS.getValueType()) &&
10451 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
10452
10453 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(Val&: FVal);
10454 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(Val&: TVal);
10455 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Val&: RHS);
    // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
    // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
    // supported types.
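    // Worked example (illustrative) for i32, i.e. (select_cc setgt, x, -1, 1, -1):
    //   asr w8, w_x, #31        ; 0 if x >= 0, -1 otherwise
    //   orr w0, w8, #1          ; 1 if x >= 0, -1 otherwise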
10459 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
10460 CTVal->isOne() && CFVal->isAllOnes() &&
10461 LHS.getValueType() == TVal.getValueType()) {
10462 EVT VT = LHS.getValueType();
10463 SDValue Shift =
10464 DAG.getNode(Opcode: ISD::SRA, DL: dl, VT, N1: LHS,
10465 N2: DAG.getConstant(Val: VT.getSizeInBits() - 1, DL: dl, VT));
10466 return DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Shift, N2: DAG.getConstant(Val: 1, DL: dl, VT));
10467 }
10468
    // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
    // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
    // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
    // Both require fewer instructions than a compare and conditional select.
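    // For example (illustrative), smax(x, 0) on i32 becomes:
    //   asr w8, w_x, #31        ; all ones when x is negative, zero otherwise
    //   bic w0, w_x, w8         ; clears x when it is negative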
10473 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
10474 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
10475 LHS.getValueType() == RHS.getValueType()) {
10476 EVT VT = LHS.getValueType();
10477 SDValue Shift =
10478 DAG.getNode(Opcode: ISD::SRA, DL: dl, VT, N1: LHS,
10479 N2: DAG.getConstant(Val: VT.getSizeInBits() - 1, DL: dl, VT));
10480
10481 if (CC == ISD::SETGT)
10482 Shift = DAG.getNOT(DL: dl, Val: Shift, VT);
10483
10484 return DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: LHS, N2: Shift);
10485 }
10486
10487 unsigned Opcode = AArch64ISD::CSEL;
10488
    // If both the TVal and the FVal are constants, see if we can swap them in
    // order to form a CSINV or CSINC out of them.
10491 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
10492 std::swap(a&: TVal, b&: FVal);
10493 std::swap(a&: CTVal, b&: CFVal);
10494 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
10495 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
10496 std::swap(a&: TVal, b&: FVal);
10497 std::swap(a&: CTVal, b&: CFVal);
10498 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
10499 } else if (TVal.getOpcode() == ISD::XOR) {
10500 // If TVal is a NOT we want to swap TVal and FVal so that we can match
10501 // with a CSINV rather than a CSEL.
10502 if (isAllOnesConstant(V: TVal.getOperand(i: 1))) {
10503 std::swap(a&: TVal, b&: FVal);
10504 std::swap(a&: CTVal, b&: CFVal);
10505 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
10506 }
10507 } else if (TVal.getOpcode() == ISD::SUB) {
10508 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
10509 // that we can match with a CSNEG rather than a CSEL.
10510 if (isNullConstant(V: TVal.getOperand(i: 0))) {
10511 std::swap(a&: TVal, b&: FVal);
10512 std::swap(a&: CTVal, b&: CFVal);
10513 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
10514 }
10515 } else if (CTVal && CFVal) {
10516 const int64_t TrueVal = CTVal->getSExtValue();
10517 const int64_t FalseVal = CFVal->getSExtValue();
10518 bool Swap = false;
10519
10520 // If both TVal and FVal are constants, see if FVal is the
10521 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
10522 // instead of a CSEL in that case.
10523 if (TrueVal == ~FalseVal) {
10524 Opcode = AArch64ISD::CSINV;
10525 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
10526 TrueVal == -FalseVal) {
10527 Opcode = AArch64ISD::CSNEG;
10528 } else if (TVal.getValueType() == MVT::i32) {
10529 // If our operands are only 32-bit wide, make sure we use 32-bit
10530 // arithmetic for the check whether we can use CSINC. This ensures that
10531 // the addition in the check will wrap around properly in case there is
10532 // an overflow (which would not be the case if we do the check with
10533 // 64-bit arithmetic).
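        // Worked example (illustrative): TrueVal = INT32_MAX (0x7fffffff) and
        // FalseVal = INT32_MIN (0x80000000). In 32-bit arithmetic
        // TrueVal32 + 1 wraps to FalseVal32, so a CSINC is valid; the 64-bit
        // sign-extended values differ by more than one and would miss it.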
10534 const uint32_t TrueVal32 = CTVal->getZExtValue();
10535 const uint32_t FalseVal32 = CFVal->getZExtValue();
10536
10537 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
10538 Opcode = AArch64ISD::CSINC;
10539
10540 if (TrueVal32 > FalseVal32) {
10541 Swap = true;
10542 }
10543 }
10544 } else {
10545 // 64-bit check whether we can use CSINC.
10546 const uint64_t TrueVal64 = TrueVal;
10547 const uint64_t FalseVal64 = FalseVal;
10548
10549 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
10550 Opcode = AArch64ISD::CSINC;
10551
10552 if (TrueVal > FalseVal) {
10553 Swap = true;
10554 }
10555 }
10556 }
10557
10558 // Swap TVal and FVal if necessary.
10559 if (Swap) {
10560 std::swap(a&: TVal, b&: FVal);
10561 std::swap(a&: CTVal, b&: CFVal);
10562 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
10563 }
10564
10565 if (Opcode != AArch64ISD::CSEL) {
10566 // Drop FVal since we can get its value by simply inverting/negating
10567 // TVal.
10568 FVal = TVal;
10569 }
10570 }
10571
10572 // Avoid materializing a constant when possible by reusing a known value in
10573 // a register. However, don't perform this optimization if the known value
10574 // is one, zero or negative one in the case of a CSEL. We can always
10575 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
10576 // FVal, respectively.
10577 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(Val&: RHS);
10578 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
10579 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
10580 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
10581 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
10582 // "a != C ? x : a" to avoid materializing C.
10583 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
10584 TVal = LHS;
10585 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
10586 FVal = LHS;
10587 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
10588 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
10589 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
10590 // avoid materializing C.
10591 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
10592 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
10593 Opcode = AArch64ISD::CSINV;
10594 TVal = LHS;
10595 FVal = DAG.getConstant(Val: 0, DL: dl, VT: FVal.getValueType());
10596 }
10597 }
10598
10599 SDValue CCVal;
10600 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, AArch64cc&: CCVal, DAG, dl);
10601 EVT VT = TVal.getValueType();
10602 return DAG.getNode(Opcode, DL: dl, VT, N1: TVal, N2: FVal, N3: CCVal, N4: Cmp);
10603 }
10604
10605 // Now we know we're dealing with FP values.
10606 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
10607 LHS.getValueType() == MVT::f64);
10608 assert(LHS.getValueType() == RHS.getValueType());
10609 EVT VT = TVal.getValueType();
10610 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
10611
10612 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10613 // clean. Some of them require two CSELs to implement.
10614 AArch64CC::CondCode CC1, CC2;
10615 changeFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2);
10616
10617 if (DAG.getTarget().Options.UnsafeFPMath) {
10618 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
10619 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
10620 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(Val&: RHS);
10621 if (RHSVal && RHSVal->isZero()) {
10622 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(Val&: FVal);
10623 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(Val&: TVal);
10624
10625 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
10626 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
10627 TVal = LHS;
10628 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
10629 CFVal && CFVal->isZero() &&
10630 FVal.getValueType() == LHS.getValueType())
10631 FVal = LHS;
10632 }
10633 }
10634
10635 // Emit first, and possibly only, CSEL.
10636 SDValue CC1Val = DAG.getConstant(Val: CC1, DL: dl, VT: MVT::i32);
10637 SDValue CS1 = DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: TVal, N2: FVal, N3: CC1Val, N4: Cmp);
10638
10639 // If we need a second CSEL, emit it, using the output of the first as the
10640 // RHS. We're effectively OR'ing the two CC's together.
10641 if (CC2 != AArch64CC::AL) {
10642 SDValue CC2Val = DAG.getConstant(Val: CC2, DL: dl, VT: MVT::i32);
10643 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: TVal, N2: CS1, N3: CC2Val, N4: Cmp);
10644 }
10645
10646 // Otherwise, return the output of the first CSEL.
10647 return CS1;
10648}
10649
10650SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
10651 SelectionDAG &DAG) const {
10652 EVT Ty = Op.getValueType();
10653 auto Idx = Op.getConstantOperandAPInt(i: 2);
10654 int64_t IdxVal = Idx.getSExtValue();
10655 assert(Ty.isScalableVector() &&
10656 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
10657
10658 // We can use the splice instruction for certain index values where we are
10659 // able to efficiently generate the correct predicate. The index will be
10660 // inverted and used directly as the input to the ptrue instruction, i.e.
10661 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
10662 // splice predicate. However, we can only do this if we can guarantee that
10663 // there are enough elements in the vector, hence we check the index <= min
10664 // number of elements.
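  // Rough sketch for nxv2i64 with IdxVal == -2 (assembly is approximate):
  //   ptrue  p0.d, vl2
  //   rev    p0.d, p0.d           ; predicate covering the last two elements
  //   splice z0.d, p0, z0.d, z1.d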
10665 std::optional<unsigned> PredPattern;
10666 if (Ty.isScalableVector() && IdxVal < 0 &&
10667 (PredPattern = getSVEPredPatternFromNumElements(MinNumElts: std::abs(i: IdxVal))) !=
10668 std::nullopt) {
10669 SDLoc DL(Op);
10670
10671 // Create a predicate where all but the last -IdxVal elements are false.
10672 EVT PredVT = Ty.changeVectorElementType(EltVT: MVT::i1);
10673 SDValue Pred = getPTrue(DAG, DL, VT: PredVT, Pattern: *PredPattern);
10674 Pred = DAG.getNode(Opcode: ISD::VECTOR_REVERSE, DL, VT: PredVT, Operand: Pred);
10675
10676 // Now splice the two inputs together using the predicate.
10677 return DAG.getNode(Opcode: AArch64ISD::SPLICE, DL, VT: Ty, N1: Pred, N2: Op.getOperand(i: 0),
10678 N3: Op.getOperand(i: 1));
10679 }
10680
10681 // We can select to an EXT instruction when indexing the first 256 bytes.
10682 unsigned BlockSize = AArch64::SVEBitsPerBlock / Ty.getVectorMinNumElements();
10683 if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
10684 return Op;
10685
10686 return SDValue();
10687}
10688
10689SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
10690 SelectionDAG &DAG) const {
10691 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 4))->get();
10692 SDValue LHS = Op.getOperand(i: 0);
10693 SDValue RHS = Op.getOperand(i: 1);
10694 SDValue TVal = Op.getOperand(i: 2);
10695 SDValue FVal = Op.getOperand(i: 3);
10696 SDLoc DL(Op);
10697 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, dl: DL, DAG);
10698}
10699
10700SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
10701 SelectionDAG &DAG) const {
10702 SDValue CCVal = Op->getOperand(Num: 0);
10703 SDValue TVal = Op->getOperand(Num: 1);
10704 SDValue FVal = Op->getOperand(Num: 2);
10705 SDLoc DL(Op);
10706
10707 EVT Ty = Op.getValueType();
10708 if (Ty == MVT::aarch64svcount) {
10709 TVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv16i1, Operand: TVal);
10710 FVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv16i1, Operand: FVal);
10711 SDValue Sel =
10712 DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::nxv16i1, N1: CCVal, N2: TVal, N3: FVal);
10713 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Ty, Operand: Sel);
10714 }
10715
10716 if (Ty.isScalableVector()) {
10717 MVT PredVT = MVT::getVectorVT(VT: MVT::i1, EC: Ty.getVectorElementCount());
10718 SDValue SplatPred = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: PredVT, Operand: CCVal);
10719 return DAG.getNode(Opcode: ISD::VSELECT, DL, VT: Ty, N1: SplatPred, N2: TVal, N3: FVal);
10720 }
10721
10722 if (useSVEForFixedLengthVectorVT(VT: Ty, OverrideNEON: !Subtarget->isNeonAvailable())) {
10723 // FIXME: Ideally this would be the same as above using i1 types, however
10724 // for the moment we can't deal with fixed i1 vector types properly, so
10725 // instead extend the predicate to a result type sized integer vector.
10726 MVT SplatValVT = MVT::getIntegerVT(BitWidth: Ty.getScalarSizeInBits());
10727 MVT PredVT = MVT::getVectorVT(VT: SplatValVT, EC: Ty.getVectorElementCount());
10728 SDValue SplatVal = DAG.getSExtOrTrunc(Op: CCVal, DL, VT: SplatValVT);
10729 SDValue SplatPred = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: PredVT, Operand: SplatVal);
10730 return DAG.getNode(Opcode: ISD::VSELECT, DL, VT: Ty, N1: SplatPred, N2: TVal, N3: FVal);
10731 }
10732
10733 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
10734 // instruction.
10735 if (ISD::isOverflowIntrOpRes(Op: CCVal)) {
10736 // Only lower legal XALUO ops.
10737 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: CCVal->getValueType(ResNo: 0)))
10738 return SDValue();
10739
10740 AArch64CC::CondCode OFCC;
10741 SDValue Value, Overflow;
10742 std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC&: OFCC, Op: CCVal.getValue(R: 0), DAG);
10743 SDValue CCVal = DAG.getConstant(Val: OFCC, DL, VT: MVT::i32);
10744
10745 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: Op.getValueType(), N1: TVal, N2: FVal,
10746 N3: CCVal, N4: Overflow);
10747 }
10748
10749 // Lower it the same way as we would lower a SELECT_CC node.
10750 ISD::CondCode CC;
10751 SDValue LHS, RHS;
10752 if (CCVal.getOpcode() == ISD::SETCC) {
10753 LHS = CCVal.getOperand(i: 0);
10754 RHS = CCVal.getOperand(i: 1);
10755 CC = cast<CondCodeSDNode>(Val: CCVal.getOperand(i: 2))->get();
10756 } else {
10757 LHS = CCVal;
10758 RHS = DAG.getConstant(Val: 0, DL, VT: CCVal.getValueType());
10759 CC = ISD::SETNE;
10760 }
10761
  // If we are lowering an f16 or bf16 and we do not have full fp16 support,
  // convert to an f32 in order to use FCSELSrrr.
10764 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10765 TVal = DAG.getTargetInsertSubreg(SRIdx: AArch64::hsub, DL, VT: MVT::f32,
10766 Operand: DAG.getUNDEF(VT: MVT::f32), Subreg: TVal);
10767 FVal = DAG.getTargetInsertSubreg(SRIdx: AArch64::hsub, DL, VT: MVT::f32,
10768 Operand: DAG.getUNDEF(VT: MVT::f32), Subreg: FVal);
10769 }
10770
10771 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, dl: DL, DAG);
10772
10773 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10774 return DAG.getTargetExtractSubreg(SRIdx: AArch64::hsub, DL, VT: Ty, Operand: Res);
10775 }
10776
10777 return Res;
10778}
10779
10780SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
10781 SelectionDAG &DAG) const {
  // Jump table entries are PC-relative offsets. No additional tweaking is
  // necessary here. Just get the address of the jump table.
10784 JumpTableSDNode *JT = cast<JumpTableSDNode>(Val&: Op);
10785
10786 CodeModel::Model CM = getTargetMachine().getCodeModel();
10787 if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
10788 !Subtarget->isTargetMachO())
10789 return getAddrLarge(N: JT, DAG);
10790 if (CM == CodeModel::Tiny)
10791 return getAddrTiny(N: JT, DAG);
10792 return getAddr(N: JT, DAG);
10793}
10794
10795SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
10796 SelectionDAG &DAG) const {
  // Jump table entries are PC-relative offsets. No additional tweaking is
  // necessary here. Just get the address of the jump table.
10799 SDLoc DL(Op);
10800 SDValue JT = Op.getOperand(i: 1);
10801 SDValue Entry = Op.getOperand(i: 2);
10802 int JTI = cast<JumpTableSDNode>(Val: JT.getNode())->getIndex();
10803
10804 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10805 AFI->setJumpTableEntryInfo(Idx: JTI, Size: 4, PCRelSym: nullptr);
10806
10807 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
10808 // sequence later, to guarantee the integrity of the intermediate values.
10809 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
10810 Kind: "aarch64-jump-table-hardening")) {
10811 CodeModel::Model CM = getTargetMachine().getCodeModel();
10812 if (Subtarget->isTargetMachO()) {
10813 if (CM != CodeModel::Small && CM != CodeModel::Large)
10814 report_fatal_error(reason: "Unsupported code-model for hardened jump-table");
10815 } else {
10816 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
10817 assert(Subtarget->isTargetELF() &&
10818 "jump table hardening only supported on MachO/ELF");
10819 if (CM != CodeModel::Small)
10820 report_fatal_error(reason: "Unsupported code-model for hardened jump-table");
10821 }
10822
10823 SDValue X16Copy = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: DL, Reg: AArch64::X16,
10824 N: Entry, Glue: SDValue());
10825 SDNode *B = DAG.getMachineNode(Opcode: AArch64::BR_JumpTable, dl: DL, VT: MVT::Other,
10826 Op1: DAG.getTargetJumpTable(JTI, VT: MVT::i32),
10827 Op2: X16Copy.getValue(R: 0), Op3: X16Copy.getValue(R: 1));
10828 return SDValue(B, 0);
10829 }
10830
10831 SDNode *Dest =
10832 DAG.getMachineNode(Opcode: AArch64::JumpTableDest32, dl: DL, VT1: MVT::i64, VT2: MVT::i64, Op1: JT,
10833 Op2: Entry, Op3: DAG.getTargetJumpTable(JTI, VT: MVT::i32));
10834 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Chain: Op.getOperand(i: 0), DL);
10835 return DAG.getNode(Opcode: ISD::BRIND, DL, VT: MVT::Other, N1: JTInfo, N2: SDValue(Dest, 0));
10836}
10837
10838SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
10839 SDValue Chain = Op.getOperand(i: 0);
10840 SDValue Dest = Op.getOperand(i: 1);
10841
  // BR_JT is lowered to BRIND, but the later lowering is specific to
  // indirectbr. Skip over the jump-table BRINDs, where the destination is
  // JumpTableDest32.
10844 if (Dest->isMachineOpcode() &&
10845 Dest->getMachineOpcode() == AArch64::JumpTableDest32)
10846 return SDValue();
10847
10848 const MachineFunction &MF = DAG.getMachineFunction();
10849 std::optional<uint16_t> BADisc =
10850 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(ParentFn: MF.getFunction());
10851 if (!BADisc)
10852 return SDValue();
10853
10854 SDLoc DL(Op);
10855
10856 SDValue Disc = DAG.getTargetConstant(Val: *BADisc, DL, VT: MVT::i64);
10857 SDValue Key = DAG.getTargetConstant(Val: AArch64PACKey::IA, DL, VT: MVT::i32);
10858 SDValue AddrDisc = DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64);
10859
10860 SDNode *BrA = DAG.getMachineNode(Opcode: AArch64::BRA, dl: DL, VT: MVT::Other,
10861 Ops: {Dest, Key, Disc, AddrDisc, Chain});
10862 return SDValue(BrA, 0);
10863}
10864
10865SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
10866 SelectionDAG &DAG) const {
10867 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Val&: Op);
10868 CodeModel::Model CM = getTargetMachine().getCodeModel();
10869 if (CM == CodeModel::Large) {
10870 // Use the GOT for the large code model on iOS.
10871 if (Subtarget->isTargetMachO()) {
10872 return getGOT(N: CP, DAG);
10873 }
10874 if (!getTargetMachine().isPositionIndependent())
10875 return getAddrLarge(N: CP, DAG);
10876 } else if (CM == CodeModel::Tiny) {
10877 return getAddrTiny(N: CP, DAG);
10878 }
10879 return getAddr(N: CP, DAG);
10880}
10881
10882SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
10883 SelectionDAG &DAG) const {
10884 BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Val&: Op);
10885 const BlockAddress *BA = BAN->getBlockAddress();
10886
10887 if (std::optional<uint16_t> BADisc =
10888 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(
10889 ParentFn: *BA->getFunction())) {
10890 SDLoc DL(Op);
10891
10892 // This isn't cheap, but BRIND is rare.
10893 SDValue TargetBA = DAG.getTargetBlockAddress(BA, VT: BAN->getValueType(ResNo: 0));
10894
10895 SDValue Disc = DAG.getTargetConstant(Val: *BADisc, DL, VT: MVT::i64);
10896
10897 SDValue Key = DAG.getTargetConstant(Val: AArch64PACKey::IA, DL, VT: MVT::i32);
10898 SDValue AddrDisc = DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64);
10899
10900 SDNode *MOV =
10901 DAG.getMachineNode(Opcode: AArch64::MOVaddrPAC, dl: DL, ResultTys: {MVT::Other, MVT::Glue},
10902 Ops: {TargetBA, Key, AddrDisc, Disc});
10903 return DAG.getCopyFromReg(Chain: SDValue(MOV, 0), dl: DL, Reg: AArch64::X16, VT: MVT::i64,
10904 Glue: SDValue(MOV, 1));
10905 }
10906
10907 CodeModel::Model CM = getTargetMachine().getCodeModel();
10908 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
10909 if (!getTargetMachine().isPositionIndependent())
10910 return getAddrLarge(N: BAN, DAG);
10911 } else if (CM == CodeModel::Tiny) {
10912 return getAddrTiny(N: BAN, DAG);
10913 }
10914 return getAddr(N: BAN, DAG);
10915}
10916
10917SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
10918 SelectionDAG &DAG) const {
10919 AArch64FunctionInfo *FuncInfo =
10920 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10921
10922 SDLoc DL(Op);
10923 SDValue FR = DAG.getFrameIndex(FI: FuncInfo->getVarArgsStackIndex(),
10924 VT: getPointerTy(DL: DAG.getDataLayout()));
10925 FR = DAG.getZExtOrTrunc(Op: FR, DL, VT: getPointerMemTy(DL: DAG.getDataLayout()));
10926 const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
10927 return DAG.getStore(Chain: Op.getOperand(i: 0), dl: DL, Val: FR, Ptr: Op.getOperand(i: 1),
10928 PtrInfo: MachinePointerInfo(SV));
10929}
10930
10931SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
10932 SelectionDAG &DAG) const {
10933 MachineFunction &MF = DAG.getMachineFunction();
10934 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10935
10936 SDLoc DL(Op);
10937 SDValue FR;
10938 if (Subtarget->isWindowsArm64EC()) {
10939 // With the Arm64EC ABI, we compute the address of the varargs save area
10940 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
10941 // but calls from an entry thunk can pass in a different address.
10942 Register VReg = MF.addLiveIn(PReg: AArch64::X4, RC: &AArch64::GPR64RegClass);
10943 SDValue Val = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: VReg, VT: MVT::i64);
10944 uint64_t StackOffset;
10945 if (FuncInfo->getVarArgsGPRSize() > 0)
10946 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
10947 else
10948 StackOffset = FuncInfo->getVarArgsStackOffset();
10949 FR = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Val,
10950 N2: DAG.getConstant(Val: StackOffset, DL, VT: MVT::i64));
10951 } else {
10952 FR = DAG.getFrameIndex(FI: FuncInfo->getVarArgsGPRSize() > 0
10953 ? FuncInfo->getVarArgsGPRIndex()
10954 : FuncInfo->getVarArgsStackIndex(),
10955 VT: getPointerTy(DL: DAG.getDataLayout()));
10956 }
10957 const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
10958 return DAG.getStore(Chain: Op.getOperand(i: 0), dl: DL, Val: FR, Ptr: Op.getOperand(i: 1),
10959 PtrInfo: MachinePointerInfo(SV));
10960}
10961
10962SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
10963 SelectionDAG &DAG) const {
10964 // The layout of the va_list struct is specified in the AArch64 Procedure Call
10965 // Standard, section B.3.
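  // For reference, the va_list being populated below is laid out as (LP64
  // offsets shown; ILP32 uses 4-byte pointers):
  //   struct va_list {
  //     void *__stack;   // offset 0
  //     void *__gr_top;  // offset 8
  //     void *__vr_top;  // offset 16
  //     int   __gr_offs; // offset 24
  //     int   __vr_offs; // offset 28
  //   };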
10966 MachineFunction &MF = DAG.getMachineFunction();
10967 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10968 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10969 auto PtrMemVT = getPointerMemTy(DL: DAG.getDataLayout());
10970 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
10971 SDLoc DL(Op);
10972
10973 SDValue Chain = Op.getOperand(i: 0);
10974 SDValue VAList = Op.getOperand(i: 1);
10975 const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
10976 SmallVector<SDValue, 4> MemOps;
10977
10978 // void *__stack at offset 0
10979 unsigned Offset = 0;
10980 SDValue Stack = DAG.getFrameIndex(FI: FuncInfo->getVarArgsStackIndex(), VT: PtrVT);
10981 Stack = DAG.getZExtOrTrunc(Op: Stack, DL, VT: PtrMemVT);
10982 MemOps.push_back(Elt: DAG.getStore(Chain, dl: DL, Val: Stack, Ptr: VAList,
10983 PtrInfo: MachinePointerInfo(SV), Alignment: Align(PtrSize)));
10984
10985 // void *__gr_top at offset 8 (4 on ILP32)
10986 Offset += PtrSize;
10987 int GPRSize = FuncInfo->getVarArgsGPRSize();
10988 if (GPRSize > 0) {
10989 SDValue GRTop, GRTopAddr;
10990
10991 GRTopAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
10992 N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT));
10993
10994 GRTop = DAG.getFrameIndex(FI: FuncInfo->getVarArgsGPRIndex(), VT: PtrVT);
10995 GRTop = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: GRTop,
10996 N2: DAG.getConstant(Val: GPRSize, DL, VT: PtrVT));
10997 GRTop = DAG.getZExtOrTrunc(Op: GRTop, DL, VT: PtrMemVT);
10998
10999 MemOps.push_back(Elt: DAG.getStore(Chain, dl: DL, Val: GRTop, Ptr: GRTopAddr,
11000 PtrInfo: MachinePointerInfo(SV, Offset),
11001 Alignment: Align(PtrSize)));
11002 }
11003
11004 // void *__vr_top at offset 16 (8 on ILP32)
11005 Offset += PtrSize;
11006 int FPRSize = FuncInfo->getVarArgsFPRSize();
11007 if (FPRSize > 0) {
11008 SDValue VRTop, VRTopAddr;
11009 VRTopAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
11010 N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT));
11011
11012 VRTop = DAG.getFrameIndex(FI: FuncInfo->getVarArgsFPRIndex(), VT: PtrVT);
11013 VRTop = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VRTop,
11014 N2: DAG.getConstant(Val: FPRSize, DL, VT: PtrVT));
11015 VRTop = DAG.getZExtOrTrunc(Op: VRTop, DL, VT: PtrMemVT);
11016
11017 MemOps.push_back(Elt: DAG.getStore(Chain, dl: DL, Val: VRTop, Ptr: VRTopAddr,
11018 PtrInfo: MachinePointerInfo(SV, Offset),
11019 Alignment: Align(PtrSize)));
11020 }
11021
11022 // int __gr_offs at offset 24 (12 on ILP32)
11023 Offset += PtrSize;
11024 SDValue GROffsAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
11025 N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT));
11026 MemOps.push_back(
11027 Elt: DAG.getStore(Chain, dl: DL, Val: DAG.getConstant(Val: -GPRSize, DL, VT: MVT::i32),
11028 Ptr: GROffsAddr, PtrInfo: MachinePointerInfo(SV, Offset), Alignment: Align(4)));
11029
11030 // int __vr_offs at offset 28 (16 on ILP32)
11031 Offset += 4;
11032 SDValue VROffsAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
11033 N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT));
11034 MemOps.push_back(
11035 Elt: DAG.getStore(Chain, dl: DL, Val: DAG.getConstant(Val: -FPRSize, DL, VT: MVT::i32),
11036 Ptr: VROffsAddr, PtrInfo: MachinePointerInfo(SV, Offset), Alignment: Align(4)));
11037
11038 return DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOps);
11039}
11040
11041SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
11042 SelectionDAG &DAG) const {
11043 MachineFunction &MF = DAG.getMachineFunction();
11044 Function &F = MF.getFunction();
11045
11046 if (Subtarget->isCallingConvWin64(CC: F.getCallingConv(), IsVarArg: F.isVarArg()))
11047 return LowerWin64_VASTART(Op, DAG);
11048 else if (Subtarget->isTargetDarwin())
11049 return LowerDarwin_VASTART(Op, DAG);
11050 else
11051 return LowerAAPCS_VASTART(Op, DAG);
11052}
11053
11054SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
11055 SelectionDAG &DAG) const {
  // AAPCS has three pointers and two ints (= 32 bytes), Darwin has a single
  // pointer.
11058 SDLoc DL(Op);
11059 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
11060 unsigned VaListSize =
11061 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
11062 ? PtrSize
11063 : Subtarget->isTargetILP32() ? 20 : 32;
11064 const Value *DestSV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 3))->getValue();
11065 const Value *SrcSV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 4))->getValue();
11066
11067 return DAG.getMemcpy(Chain: Op.getOperand(i: 0), dl: DL, Dst: Op.getOperand(i: 1), Src: Op.getOperand(i: 2),
11068 Size: DAG.getConstant(Val: VaListSize, DL, VT: MVT::i32),
11069 Alignment: Align(PtrSize), isVol: false, AlwaysInline: false, /*CI=*/nullptr,
11070 OverrideTailCall: std::nullopt, DstPtrInfo: MachinePointerInfo(DestSV),
11071 SrcPtrInfo: MachinePointerInfo(SrcSV));
11072}
11073
11074SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
11075 assert(Subtarget->isTargetDarwin() &&
11076 "automatic va_arg instruction only works on Darwin");
11077
11078 const Value *V = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
11079 EVT VT = Op.getValueType();
11080 SDLoc DL(Op);
11081 SDValue Chain = Op.getOperand(i: 0);
11082 SDValue Addr = Op.getOperand(i: 1);
11083 MaybeAlign Align(Op.getConstantOperandVal(i: 3));
11084 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
11085 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
11086 auto PtrMemVT = getPointerMemTy(DL: DAG.getDataLayout());
11087 SDValue VAList =
11088 DAG.getLoad(VT: PtrMemVT, dl: DL, Chain, Ptr: Addr, PtrInfo: MachinePointerInfo(V));
11089 Chain = VAList.getValue(R: 1);
11090 VAList = DAG.getZExtOrTrunc(Op: VAList, DL, VT: PtrVT);
11091
11092 if (VT.isScalableVector())
11093 report_fatal_error(reason: "Passing SVE types to variadic functions is "
11094 "currently not supported");
11095
11096 if (Align && *Align > MinSlotSize) {
11097 VAList = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
11098 N2: DAG.getConstant(Val: Align->value() - 1, DL, VT: PtrVT));
11099 VAList = DAG.getNode(Opcode: ISD::AND, DL, VT: PtrVT, N1: VAList,
11100 N2: DAG.getConstant(Val: -(int64_t)Align->value(), DL, VT: PtrVT));
11101 }
11102
11103 Type *ArgTy = VT.getTypeForEVT(Context&: *DAG.getContext());
11104 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(Ty: ArgTy);
11105
11106 // Scalar integer and FP values smaller than 64 bits are implicitly extended
11107 // up to 64 bits. At the very least, we have to increase the striding of the
11108 // vaargs list to match this, and for FP values we need to introduce
11109 // FP_ROUND nodes as well.
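  // For example (illustrative), va_arg of a 'float' on Darwin loads the slot
  // as an f64 and rounds it to f32 via the FP_ROUND emitted below, while still
  // stepping the list by the full 8-byte slot.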
11110 if (VT.isInteger() && !VT.isVector())
11111 ArgSize = std::max(a: ArgSize, b: MinSlotSize);
11112 bool NeedFPTrunc = false;
11113 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
11114 ArgSize = 8;
11115 NeedFPTrunc = true;
11116 }
11117
11118 // Increment the pointer, VAList, to the next vaarg
11119 SDValue VANext = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
11120 N2: DAG.getConstant(Val: ArgSize, DL, VT: PtrVT));
11121 VANext = DAG.getZExtOrTrunc(Op: VANext, DL, VT: PtrMemVT);
11122
11123 // Store the incremented VAList to the legalized pointer
11124 SDValue APStore =
11125 DAG.getStore(Chain, dl: DL, Val: VANext, Ptr: Addr, PtrInfo: MachinePointerInfo(V));
11126
11127 // Load the actual argument out of the pointer VAList
11128 if (NeedFPTrunc) {
11129 // Load the value as an f64.
11130 SDValue WideFP =
11131 DAG.getLoad(VT: MVT::f64, dl: DL, Chain: APStore, Ptr: VAList, PtrInfo: MachinePointerInfo());
11132 // Round the value down to an f32.
11133 SDValue NarrowFP =
11134 DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: WideFP.getValue(R: 0),
11135 N2: DAG.getIntPtrConstant(Val: 1, DL, /*isTarget=*/true));
11136 SDValue Ops[] = { NarrowFP, WideFP.getValue(R: 1) };
11137 // Merge the rounded value with the chain output of the load.
11138 return DAG.getMergeValues(Ops, dl: DL);
11139 }
11140
11141 return DAG.getLoad(VT, dl: DL, Chain: APStore, Ptr: VAList, PtrInfo: MachinePointerInfo());
11142}
11143
11144SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
11145 SelectionDAG &DAG) const {
11146 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
11147 MFI.setFrameAddressIsTaken(true);
11148
11149 EVT VT = Op.getValueType();
11150 SDLoc DL(Op);
11151 unsigned Depth = Op.getConstantOperandVal(i: 0);
11152 SDValue FrameAddr =
11153 DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: AArch64::FP, VT: MVT::i64);
11154 while (Depth--)
11155 FrameAddr = DAG.getLoad(VT, dl: DL, Chain: DAG.getEntryNode(), Ptr: FrameAddr,
11156 PtrInfo: MachinePointerInfo());
11157
11158 if (Subtarget->isTargetILP32())
11159 FrameAddr = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: MVT::i64, N1: FrameAddr,
11160 N2: DAG.getValueType(VT));
11161
11162 return FrameAddr;
11163}
11164
11165SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
11166 SelectionDAG &DAG) const {
11167 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
11168
11169 EVT VT = getPointerTy(DL: DAG.getDataLayout());
11170 SDLoc DL(Op);
11171 int FI = MFI.CreateFixedObject(Size: 4, SPOffset: 0, IsImmutable: false);
11172 return DAG.getFrameIndex(FI, VT);
11173}
11174
11175#define GET_REGISTER_MATCHER
11176#include "AArch64GenAsmMatcher.inc"
11177
11178// FIXME? Maybe this could be a TableGen attribute on some registers and
11179// this table could be generated automatically from RegInfo.
11180Register AArch64TargetLowering::
11181getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
11182 Register Reg = MatchRegisterName(Name: RegName);
11183 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
11184 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
11185 unsigned DwarfRegNum = MRI->getDwarfRegNum(RegNum: Reg, isEH: false);
11186 if (!Subtarget->isXRegisterReserved(i: DwarfRegNum) &&
11187 !MRI->isReservedReg(MF, Reg))
11188 Reg = 0;
11189 }
11190 if (Reg)
11191 return Reg;
11192 report_fatal_error(reason: Twine("Invalid register name \""
11193 + StringRef(RegName) + "\"."));
11194}
11195
11196SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
11197 SelectionDAG &DAG) const {
11198 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
11199
11200 EVT VT = Op.getValueType();
11201 SDLoc DL(Op);
11202
11203 SDValue FrameAddr =
11204 DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: AArch64::FP, VT);
11205 SDValue Offset = DAG.getConstant(Val: 8, DL, VT: getPointerTy(DL: DAG.getDataLayout()));
11206
11207 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: FrameAddr, N2: Offset);
11208}
11209
11210SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
11211 SelectionDAG &DAG) const {
11212 MachineFunction &MF = DAG.getMachineFunction();
11213 MachineFrameInfo &MFI = MF.getFrameInfo();
11214 MFI.setReturnAddressIsTaken(true);
11215
11216 EVT VT = Op.getValueType();
11217 SDLoc DL(Op);
11218 unsigned Depth = Op.getConstantOperandVal(i: 0);
11219 SDValue ReturnAddress;
11220 if (Depth) {
11221 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
11222 SDValue Offset = DAG.getConstant(Val: 8, DL, VT: getPointerTy(DL: DAG.getDataLayout()));
11223 ReturnAddress = DAG.getLoad(
11224 VT, dl: DL, Chain: DAG.getEntryNode(),
11225 Ptr: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: FrameAddr, N2: Offset), PtrInfo: MachinePointerInfo());
11226 } else {
11227 // Return LR, which contains the return address. Mark it an implicit
11228 // live-in.
11229 Register Reg = MF.addLiveIn(PReg: AArch64::LR, RC: &AArch64::GPR64RegClass);
11230 ReturnAddress = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg, VT);
11231 }
11232
  // The XPACLRI instruction assembles to a hint-space instruction before
  // Armv8.3-A, so it can safely be used on any pre-Armv8.3-A architecture. On
  // Armv8.3-A and onwards XPACI is available, so use that instead.
11237 SDNode *St;
11238 if (Subtarget->hasPAuth()) {
11239 St = DAG.getMachineNode(Opcode: AArch64::XPACI, dl: DL, VT, Op1: ReturnAddress);
11240 } else {
11241 // XPACLRI operates on LR therefore we must move the operand accordingly.
11242 SDValue Chain =
11243 DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: DL, Reg: AArch64::LR, N: ReturnAddress);
11244 St = DAG.getMachineNode(Opcode: AArch64::XPACLRI, dl: DL, VT, Op1: Chain);
11245 }
11246 return SDValue(St, 0);
11247}
11248
/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
/// i64 values and take a 2 x i64 value to shift plus a shift amount.
11251SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
11252 SelectionDAG &DAG) const {
11253 SDValue Lo, Hi;
11254 expandShiftParts(N: Op.getNode(), Lo, Hi, DAG);
11255 return DAG.getMergeValues(Ops: {Lo, Hi}, dl: SDLoc(Op));
11256}
11257
11258bool AArch64TargetLowering::isOffsetFoldingLegal(
11259 const GlobalAddressSDNode *GA) const {
11260 // Offsets are folded in the DAG combine rather than here so that we can
11261 // intelligently choose an offset based on the uses.
11262 return false;
11263}
11264
11265bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
11266 bool OptForSize) const {
11267 bool IsLegal = false;
  // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
  // and for the 16-bit case when the target has full fp16 support.
11270 // We encode bf16 bit patterns as if they were fp16. This results in very
11271 // strange looking assembly but should populate the register with appropriate
11272 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
11273 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
11274 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
11275 // FIXME: We should be able to handle f128 as well with a clever lowering.
11276 const APInt ImmInt = Imm.bitcastToAPInt();
11277 if (VT == MVT::f64)
11278 IsLegal = AArch64_AM::getFP64Imm(Imm: ImmInt) != -1 || Imm.isPosZero();
11279 else if (VT == MVT::f32)
11280 IsLegal = AArch64_AM::getFP32Imm(Imm: ImmInt) != -1 || Imm.isPosZero();
11281 else if (VT == MVT::f16 || VT == MVT::bf16)
11282 IsLegal =
11283 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(Imm: ImmInt) != -1) ||
11284 Imm.isPosZero();
11285
  // If we cannot materialize the value in the immediate field for fmov, check
  // if it can be encoded as the immediate operand of a logical instruction.
11288 // The immediate value will be created with either MOVZ, MOVN, or ORR.
11289 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
11290 // generate that fmov.
11291 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
    // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
    // however the mov+fmov sequence is always better because of the reduced
    // cache pressure. The timings are still the same if you consider
    // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
    // movw+movk is fused). So we limit ourselves to at most 2 instructions.
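    // For example (illustrative), 2048.0 is not a valid fmov immediate, but
    // its bit pattern 0x40A0000000000000 needs only a single MOVZ, so it stays
    // within the limit:
    //   mov  x8, #0x40a0000000000000
    //   fmov d0, x8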
11297 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
11298 AArch64_IMM::expandMOVImm(Imm: ImmInt.getZExtValue(), BitSize: VT.getSizeInBits(), Insn);
11299 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
11300 IsLegal = Insn.size() <= Limit;
11301 }
11302
11303 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
11304 << " imm value: "; Imm.dump(););
11305 return IsLegal;
11306}
11307
11308//===----------------------------------------------------------------------===//
11309// AArch64 Optimization Hooks
11310//===----------------------------------------------------------------------===//
11311
11312static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
11313 SDValue Operand, SelectionDAG &DAG,
11314 int &ExtraSteps) {
11315 EVT VT = Operand.getValueType();
11316 if ((ST->hasNEON() &&
11317 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
11318 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
11319 VT == MVT::v4f32)) ||
11320 (ST->hasSVE() &&
11321 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
11322 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
11323 // For the reciprocal estimates, convergence is quadratic, so the number
11324 // of digits is doubled after each iteration. In ARMv8, the accuracy of
11325 // the initial estimate is 2^-8. Thus the number of extra steps to refine
11326 // the result for float (23 mantissa bits) is 2 and for double (52
11327 // mantissa bits) is 3.
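      // Worked numbers (illustrative): starting from 8 accurate bits each step
      // doubles them, so 8 -> 16 -> 32 reaches float's 24-bit precision in 2
      // steps and 8 -> 16 -> 32 -> 64 reaches double's 53-bit precision in 3,
      // matching Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits).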
11328 constexpr unsigned AccurateBits = 8;
11329 unsigned DesiredBits =
11330 APFloat::semanticsPrecision(DAG.EVTToAPFloatSemantics(VT));
11331 ExtraSteps = DesiredBits <= AccurateBits
11332 ? 0
11333 : Log2_64_Ceil(Value: DesiredBits) - Log2_64_Ceil(Value: AccurateBits);
11334 }
11335
11336 return DAG.getNode(Opcode, DL: SDLoc(Operand), VT, Operand);
11337 }
11338
11339 return SDValue();
11340}
11341
11342SDValue
11343AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
11344 const DenormalMode &Mode) const {
11345 SDLoc DL(Op);
11346 EVT VT = Op.getValueType();
11347 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), C&: *DAG.getContext(), VT);
11348 SDValue FPZero = DAG.getConstantFP(Val: 0.0, DL, VT);
11349 return DAG.getSetCC(DL, VT: CCVT, LHS: Op, RHS: FPZero, Cond: ISD::SETEQ);
11350}
11351
11352SDValue
11353AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
11354 SelectionDAG &DAG) const {
11355 return Op;
11356}
11357
11358SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
11359 SelectionDAG &DAG, int Enabled,
11360 int &ExtraSteps,
11361 bool &UseOneConst,
11362 bool Reciprocal) const {
11363 if (Enabled == ReciprocalEstimate::Enabled ||
11364 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
11365 if (SDValue Estimate = getEstimate(ST: Subtarget, Opcode: AArch64ISD::FRSQRTE, Operand,
11366 DAG, ExtraSteps)) {
11367 SDLoc DL(Operand);
11368 EVT VT = Operand.getValueType();
11369
11370 SDNodeFlags Flags;
11371 Flags.setAllowReassociation(true);
11372
11373 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
11374 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
11375 for (int i = ExtraSteps; i > 0; --i) {
11376 SDValue Step = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Estimate, N2: Estimate,
11377 Flags);
11378 Step = DAG.getNode(Opcode: AArch64ISD::FRSQRTS, DL, VT, N1: Operand, N2: Step, Flags);
11379 Estimate = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Estimate, N2: Step, Flags);
11380 }
11381 if (!Reciprocal)
11382 Estimate = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Operand, N2: Estimate, Flags);
11383
11384 ExtraSteps = 0;
11385 return Estimate;
11386 }
11387
11388 return SDValue();
11389}
11390
11391SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
11392 SelectionDAG &DAG, int Enabled,
11393 int &ExtraSteps) const {
11394 if (Enabled == ReciprocalEstimate::Enabled)
11395 if (SDValue Estimate = getEstimate(ST: Subtarget, Opcode: AArch64ISD::FRECPE, Operand,
11396 DAG, ExtraSteps)) {
11397 SDLoc DL(Operand);
11398 EVT VT = Operand.getValueType();
11399
11400 SDNodeFlags Flags;
11401 Flags.setAllowReassociation(true);
11402
11403 // Newton reciprocal iteration: E * (2 - X * E)
11404 // AArch64 reciprocal iteration instruction: (2 - M * N)
11405 for (int i = ExtraSteps; i > 0; --i) {
11406 SDValue Step = DAG.getNode(Opcode: AArch64ISD::FRECPS, DL, VT, N1: Operand,
11407 N2: Estimate, Flags);
11408 Estimate = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Estimate, N2: Step, Flags);
11409 }
11410
11411 ExtraSteps = 0;
11412 return Estimate;
11413 }
11414
11415 return SDValue();
11416}
11417
11418//===----------------------------------------------------------------------===//
11419// AArch64 Inline Assembly Support
11420//===----------------------------------------------------------------------===//
11421
11422// Table of Constraints
// TODO: This is the current set of constraints supported by ARM for the
// compiler; not all of them may make sense.
11425//
11426// r - A general register
11427// w - An FP/SIMD register of some size in the range v0-v31
11428// x - An FP/SIMD register of some size in the range v0-v15
11429// I - Constant that can be used with an ADD instruction
11430// J - Constant that can be used with a SUB instruction
11431// K - Constant that can be used with a 32-bit logical instruction
11432// L - Constant that can be used with a 64-bit logical instruction
11433// M - Constant that can be used as a 32-bit MOV immediate
11434// N - Constant that can be used as a 64-bit MOV immediate
11435// Q - A memory reference with base register and no offset
11436// S - A symbolic address
11437// Y - Floating point constant zero
11438// Z - Integer constant zero
11439//
11440// Note that general register operands will be output using their 64-bit x
11441// register name, whatever the size of the variable, unless the asm operand
11442// is prefixed by the %w modifier. Floating-point and SIMD register operands
11443// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
11444// %q modifier.
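// Illustrative example (not from this file; names are placeholders):
//   int32_t res;
//   asm("add %w0, %w1, %w2" : "=r"(res) : "r"(lhs), "r"(rhs));
// Without the %w modifier the operands would be printed with their 64-bit x
// register names.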
11445const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
  // At this point, we have to lower this constraint to something else, so we
  // lower it to an "r" or "w". However, by doing this we will force the result
  // to be in a register, while the X constraint is much more permissive.
11449 //
11450 // Although we are correct (we are free to emit anything, without
11451 // constraints), we might break use cases that would expect us to be more
11452 // efficient and emit something else.
11453 if (!Subtarget->hasFPARMv8())
11454 return "r";
11455
11456 if (ConstraintVT.isFloatingPoint())
11457 return "w";
11458
11459 if (ConstraintVT.isVector() &&
11460 (ConstraintVT.getSizeInBits() == 64 ||
11461 ConstraintVT.getSizeInBits() == 128))
11462 return "w";
11463
11464 return "r";
11465}
11466
11467enum class PredicateConstraint { Uph, Upl, Upa };
11468
11469static std::optional<PredicateConstraint>
11470parsePredicateConstraint(StringRef Constraint) {
11471 return StringSwitch<std::optional<PredicateConstraint>>(Constraint)
11472 .Case(S: "Uph", Value: PredicateConstraint::Uph)
11473 .Case(S: "Upl", Value: PredicateConstraint::Upl)
11474 .Case(S: "Upa", Value: PredicateConstraint::Upa)
11475 .Default(Value: std::nullopt);
11476}
11477
11478static const TargetRegisterClass *
11479getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
11480 if (VT != MVT::aarch64svcount &&
11481 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
11482 return nullptr;
11483
11484 switch (Constraint) {
11485 case PredicateConstraint::Uph:
11486 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
11487 : &AArch64::PPR_p8to15RegClass;
11488 case PredicateConstraint::Upl:
11489 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
11490 : &AArch64::PPR_3bRegClass;
11491 case PredicateConstraint::Upa:
11492 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
11493 : &AArch64::PPRRegClass;
11494 }
11495
11496 llvm_unreachable("Missing PredicateConstraint!");
11497}
11498
11499enum class ReducedGprConstraint { Uci, Ucj };
11500
11501static std::optional<ReducedGprConstraint>
11502parseReducedGprConstraint(StringRef Constraint) {
11503 return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint)
11504 .Case(S: "Uci", Value: ReducedGprConstraint::Uci)
11505 .Case(S: "Ucj", Value: ReducedGprConstraint::Ucj)
11506 .Default(Value: std::nullopt);
11507}
11508
11509static const TargetRegisterClass *
11510getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
11511 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
11512 return nullptr;
11513
11514 switch (Constraint) {
11515 case ReducedGprConstraint::Uci:
11516 return &AArch64::MatrixIndexGPR32_8_11RegClass;
11517 case ReducedGprConstraint::Ucj:
11518 return &AArch64::MatrixIndexGPR32_12_15RegClass;
11519 }
11520
11521 llvm_unreachable("Missing ReducedGprConstraint!");
11522}
11523
// The set of cc codes supported is from
// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
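// Illustrative usage (not from this file; names are placeholders):
//   bool eq;
//   asm("cmp %1, %2" : "=@cceq"(eq) : "r"(lhs), "r"(rhs));
// After canonicalization the constraint arrives here as "{@cceq}", which maps
// to AArch64CC::EQ.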
11526static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) {
11527 AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint)
11528 .Case(S: "{@cchi}", Value: AArch64CC::HI)
11529 .Case(S: "{@cccs}", Value: AArch64CC::HS)
11530 .Case(S: "{@cclo}", Value: AArch64CC::LO)
11531 .Case(S: "{@ccls}", Value: AArch64CC::LS)
11532 .Case(S: "{@cccc}", Value: AArch64CC::LO)
11533 .Case(S: "{@cceq}", Value: AArch64CC::EQ)
11534 .Case(S: "{@ccgt}", Value: AArch64CC::GT)
11535 .Case(S: "{@ccge}", Value: AArch64CC::GE)
11536 .Case(S: "{@cclt}", Value: AArch64CC::LT)
11537 .Case(S: "{@ccle}", Value: AArch64CC::LE)
11538 .Case(S: "{@cchs}", Value: AArch64CC::HS)
11539 .Case(S: "{@ccne}", Value: AArch64CC::NE)
11540 .Case(S: "{@ccvc}", Value: AArch64CC::VC)
11541 .Case(S: "{@ccpl}", Value: AArch64CC::PL)
11542 .Case(S: "{@ccvs}", Value: AArch64CC::VS)
11543 .Case(S: "{@ccmi}", Value: AArch64CC::MI)
11544 .Default(Value: AArch64CC::Invalid);
11545 return Cond;
11546}
11547
11548/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
11549/// WZR, invert(<cond>)'.
11550static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
11551 SelectionDAG &DAG) {
11552 return DAG.getNode(
11553 Opcode: AArch64ISD::CSINC, DL, VT: MVT::i32, N1: DAG.getConstant(Val: 0, DL, VT: MVT::i32),
11554 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i32),
11555 N3: DAG.getConstant(Val: getInvertedCondCode(Code: CC), DL, VT: MVT::i32), N4: NZCV);
11556}
11557
11558// Lower @cc flag output via getSETCC.
11559SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
11560 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
11561 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
11562 AArch64CC::CondCode Cond = parseConstraintCode(Constraint: OpInfo.ConstraintCode);
11563 if (Cond == AArch64CC::Invalid)
11564 return SDValue();
11565 // The output variable should be a scalar integer.
11566 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
11567 OpInfo.ConstraintVT.getSizeInBits() < 8)
11568 report_fatal_error(reason: "Flag output operand is of invalid type");
11569
11570 // Get NZCV register. Only update chain when copyfrom is glued.
11571 if (Glue.getNode()) {
11572 Glue = DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::NZCV, VT: MVT::i32, Glue);
11573 Chain = Glue.getValue(R: 1);
11574 } else
11575 Glue = DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::NZCV, VT: MVT::i32);
11576 // Extract CC code.
11577 SDValue CC = getSETCC(CC: Cond, NZCV: Glue, DL, DAG);
11578
11579 SDValue Result;
11580
11581 // Truncate or ZERO_EXTEND based on value types.
11582 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
11583 Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: OpInfo.ConstraintVT, Operand: CC);
11584 else
11585 Result = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: OpInfo.ConstraintVT, Operand: CC);
11586
11587 return Result;
11588}
11589
11590/// getConstraintType - Given a constraint letter, return the type of
11591/// constraint it is for this target.
11592AArch64TargetLowering::ConstraintType
11593AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
11594 if (Constraint.size() == 1) {
11595 switch (Constraint[0]) {
11596 default:
11597 break;
11598 case 'x':
11599 case 'w':
11600 case 'y':
11601 return C_RegisterClass;
11602 // An address with a single base register. Due to the way we
11603 // currently handle addresses it is the same as 'r'.
11604 case 'Q':
11605 return C_Memory;
11606 case 'I':
11607 case 'J':
11608 case 'K':
11609 case 'L':
11610 case 'M':
11611 case 'N':
11612 case 'Y':
11613 case 'Z':
11614 return C_Immediate;
11615 case 'z':
11616 case 'S': // A symbol or label reference with a constant offset
11617 return C_Other;
11618 }
11619 } else if (parsePredicateConstraint(Constraint))
11620 return C_RegisterClass;
11621 else if (parseReducedGprConstraint(Constraint))
11622 return C_RegisterClass;
11623 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
11624 return C_Other;
11625 return TargetLowering::getConstraintType(Constraint);
11626}
11627
11628/// Examine constraint type and operand type and determine a weight value.
11629/// This object must already have been set up with the operand type
11630/// and the current alternative constraint selected.
11631TargetLowering::ConstraintWeight
11632AArch64TargetLowering::getSingleConstraintMatchWeight(
11633 AsmOperandInfo &info, const char *constraint) const {
11634 ConstraintWeight weight = CW_Invalid;
11635 Value *CallOperandVal = info.CallOperandVal;
11636 // If we don't have a value, we can't do a match,
11637 // but allow it at the lowest weight.
11638 if (!CallOperandVal)
11639 return CW_Default;
11640 Type *type = CallOperandVal->getType();
11641 // Look at the constraint type.
11642 switch (*constraint) {
11643 default:
11644 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
11645 break;
11646 case 'x':
11647 case 'w':
11648 case 'y':
11649 if (type->isFloatingPointTy() || type->isVectorTy())
11650 weight = CW_Register;
11651 break;
11652 case 'z':
11653 weight = CW_Constant;
11654 break;
11655 case 'U':
11656 if (parsePredicateConstraint(Constraint: constraint) ||
11657 parseReducedGprConstraint(Constraint: constraint))
11658 weight = CW_Register;
11659 break;
11660 }
11661 return weight;
11662}
11663
11664std::pair<unsigned, const TargetRegisterClass *>
11665AArch64TargetLowering::getRegForInlineAsmConstraint(
11666 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
11667 if (Constraint.size() == 1) {
11668 switch (Constraint[0]) {
11669 case 'r':
11670 if (VT.isScalableVector())
11671 return std::make_pair(x: 0U, y: nullptr);
11672 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
11673 return std::make_pair(x: 0U, y: &AArch64::GPR64x8ClassRegClass);
11674 if (VT.getFixedSizeInBits() == 64)
11675 return std::make_pair(x: 0U, y: &AArch64::GPR64commonRegClass);
11676 return std::make_pair(x: 0U, y: &AArch64::GPR32commonRegClass);
11677 case 'w': {
11678 if (!Subtarget->hasFPARMv8())
11679 break;
11680 if (VT.isScalableVector()) {
11681 if (VT.getVectorElementType() != MVT::i1)
11682 return std::make_pair(x: 0U, y: &AArch64::ZPRRegClass);
11683 return std::make_pair(x: 0U, y: nullptr);
11684 }
11685 if (VT == MVT::Other)
11686 break;
11687 uint64_t VTSize = VT.getFixedSizeInBits();
11688 if (VTSize == 16)
11689 return std::make_pair(x: 0U, y: &AArch64::FPR16RegClass);
11690 if (VTSize == 32)
11691 return std::make_pair(x: 0U, y: &AArch64::FPR32RegClass);
11692 if (VTSize == 64)
11693 return std::make_pair(x: 0U, y: &AArch64::FPR64RegClass);
11694 if (VTSize == 128)
11695 return std::make_pair(x: 0U, y: &AArch64::FPR128RegClass);
11696 break;
11697 }
11698 // The instructions that this constraint is designed for can
11699 // only take 128-bit registers so just use that regclass.
11700 case 'x':
11701 if (!Subtarget->hasFPARMv8())
11702 break;
11703 if (VT.isScalableVector())
11704 return std::make_pair(x: 0U, y: &AArch64::ZPR_4bRegClass);
11705 if (VT.getSizeInBits() == 128)
11706 return std::make_pair(x: 0U, y: &AArch64::FPR128_loRegClass);
11707 break;
11708 case 'y':
11709 if (!Subtarget->hasFPARMv8())
11710 break;
11711 if (VT.isScalableVector())
11712 return std::make_pair(x: 0U, y: &AArch64::ZPR_3bRegClass);
11713 break;
11714 }
11715 } else {
11716 if (const auto PC = parsePredicateConstraint(Constraint))
11717 if (const auto *RegClass = getPredicateRegisterClass(Constraint: *PC, VT))
11718 return std::make_pair(x: 0U, y&: RegClass);
11719
11720 if (const auto RGC = parseReducedGprConstraint(Constraint))
11721 if (const auto *RegClass = getReducedGprRegisterClass(Constraint: *RGC, VT))
11722 return std::make_pair(x: 0U, y&: RegClass);
11723 }
11724 if (StringRef("{cc}").equals_insensitive(RHS: Constraint) ||
11725 parseConstraintCode(Constraint) != AArch64CC::Invalid)
11726 return std::make_pair(x: unsigned(AArch64::NZCV), y: &AArch64::CCRRegClass);
11727
11728 if (Constraint == "{za}") {
11729 return std::make_pair(x: unsigned(AArch64::ZA), y: &AArch64::MPRRegClass);
11730 }
11731
11732 if (Constraint == "{zt0}") {
11733 return std::make_pair(x: unsigned(AArch64::ZT0), y: &AArch64::ZTRRegClass);
11734 }
11735
11736 // Use the default implementation in TargetLowering to convert the register
11737 // constraint into a member of a register class.
11738 std::pair<unsigned, const TargetRegisterClass *> Res;
11739 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
11740
11741 // Not found as a standard register?
11742 if (!Res.second) {
11743 unsigned Size = Constraint.size();
11744 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
11745 tolower(c: Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
11746 int RegNo;
11747 bool Failed = Constraint.slice(Start: 2, End: Size - 1).getAsInteger(Radix: 10, Result&: RegNo);
11748 if (!Failed && RegNo >= 0 && RegNo <= 31) {
11749 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
11750 // By default we'll emit v0-v31 for this unless there's a modifier where
11751 // we'll emit the correct register as well.
11752 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
11753 Res.first = AArch64::FPR64RegClass.getRegister(i: RegNo);
11754 Res.second = &AArch64::FPR64RegClass;
11755 } else {
11756 Res.first = AArch64::FPR128RegClass.getRegister(i: RegNo);
11757 Res.second = &AArch64::FPR128RegClass;
11758 }
11759 }
11760 }
11761 }
11762
11763 if (Res.second && !Subtarget->hasFPARMv8() &&
11764 !AArch64::GPR32allRegClass.hasSubClassEq(RC: Res.second) &&
11765 !AArch64::GPR64allRegClass.hasSubClassEq(RC: Res.second))
11766 return std::make_pair(x: 0U, y: nullptr);
11767
11768 return Res;
11769}
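
// For reference, an illustrative (hypothetical) use of the register
// constraints resolved above:
//
//   float fmadd_f32(float a, float b, float c) {
//     float r;
//     asm("fmadd %s0, %s1, %s2, %s3" : "=w"(r) : "w"(a), "w"(b), "w"(c));
//     return r;
//   }
//
// "w" requests any floating-point/SIMD register (FPR32 here), while "x"
// restricts the operand to v0-v15 and "y" to z0-z7.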
11770
11771EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
11772 llvm::Type *Ty,
11773 bool AllowUnknown) const {
11774 if (Subtarget->hasLS64() && Ty->isIntegerTy(Bitwidth: 512))
11775 return EVT(MVT::i64x8);
11776
11777 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
11778}
11779
11780/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
11781/// vector. If it is invalid, don't add anything to Ops.
11782void AArch64TargetLowering::LowerAsmOperandForConstraint(
11783 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
11784 SelectionDAG &DAG) const {
11785 SDValue Result;
11786
11787 // Currently only support length 1 constraints.
11788 if (Constraint.size() != 1)
11789 return;
11790
11791 char ConstraintLetter = Constraint[0];
11792 switch (ConstraintLetter) {
11793 default:
11794 break;
11795
11796 // This set of constraints deals with valid constants for various instructions.
11797 // Validate and return a target constant for them if we can.
11798 case 'z': {
11799 // 'z' maps to xzr or wzr so it needs an input of 0.
11800 if (!isNullConstant(V: Op))
11801 return;
11802
11803 if (Op.getValueType() == MVT::i64)
11804 Result = DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64);
11805 else
11806 Result = DAG.getRegister(Reg: AArch64::WZR, VT: MVT::i32);
11807 break;
11808 }
11809 case 'S':
11810 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
11811 // supported for PIC while "s" isn't, making "s" less useful. We implement
11812 // "S" but not "s".
11813 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint: "s", Ops, DAG);
11814 break;
11815
11816 case 'I':
11817 case 'J':
11818 case 'K':
11819 case 'L':
11820 case 'M':
11821 case 'N':
11822 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Op);
11823 if (!C)
11824 return;
11825
11826 // Grab the value and do some validation.
11827 uint64_t CVal = C->getZExtValue();
11828 switch (ConstraintLetter) {
11829 // The I constraint applies only to simple ADD or SUB immediate operands:
11830 // i.e. 0 to 4095 with an optional left shift by 12.
11831 // The J constraint applies only to ADD or SUB immediates that would be
11832 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
11833 // instruction [or vice versa], in other words -1 to -4095 with optional
11834 // left shift by 12.
11835 case 'I':
11836 if (isUInt<12>(x: CVal) || isShiftedUInt<12, 12>(x: CVal))
11837 break;
11838 return;
11839 case 'J': {
11840 uint64_t NVal = -C->getSExtValue();
11841 if (isUInt<12>(x: NVal) || isShiftedUInt<12, 12>(x: NVal)) {
11842 CVal = C->getSExtValue();
11843 break;
11844 }
11845 return;
11846 }
11847 // The K and L constraints apply *only* to logical immediates, including
11848 // what used to be the MOVI alias for ORR (though the MOVI alias has now
11849 // been removed and MOV should be used). So these constraints have to
11850 // distinguish between bit patterns that are valid 32-bit or 64-bit
11851 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
11852 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
11853 // versa.
11854 case 'K':
11855 if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 32))
11856 break;
11857 return;
11858 case 'L':
11859 if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 64))
11860 break;
11861 return;
11862 // The M and N constraints are a superset of K and L respectively, for use
11863 // with the MOV (immediate) alias. As well as the logical immediates they
11864 // also match 32 or 64-bit immediates that can be loaded either using a
11865 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
11866 // (M) or 64-bit 0x1234000000000000 (N) etc.
11867 // As a note some of this code is liberally stolen from the asm parser.
11868 case 'M': {
11869 if (!isUInt<32>(x: CVal))
11870 return;
11871 if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 32))
11872 break;
11873 if ((CVal & 0xFFFF) == CVal)
11874 break;
11875 if ((CVal & 0xFFFF0000ULL) == CVal)
11876 break;
11877 uint64_t NCVal = ~(uint32_t)CVal;
11878 if ((NCVal & 0xFFFFULL) == NCVal)
11879 break;
11880 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11881 break;
11882 return;
11883 }
11884 case 'N': {
11885 if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 64))
11886 break;
11887 if ((CVal & 0xFFFFULL) == CVal)
11888 break;
11889 if ((CVal & 0xFFFF0000ULL) == CVal)
11890 break;
11891 if ((CVal & 0xFFFF00000000ULL) == CVal)
11892 break;
11893 if ((CVal & 0xFFFF000000000000ULL) == CVal)
11894 break;
11895 uint64_t NCVal = ~CVal;
11896 if ((NCVal & 0xFFFFULL) == NCVal)
11897 break;
11898 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11899 break;
11900 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
11901 break;
11902 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
11903 break;
11904 return;
11905 }
11906 default:
11907 return;
11908 }
11909
11910 // All assembler immediates are 64-bit integers.
11911 Result = DAG.getTargetConstant(Val: CVal, DL: SDLoc(Op), VT: MVT::i64);
11912 break;
11913 }
11914
11915 if (Result.getNode()) {
11916 Ops.push_back(x: Result);
11917 return;
11918 }
11919
11920 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
11921}
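
// For reference, an illustrative (hypothetical) use of the immediate
// constraints validated above:
//
//   long add4095(long a) {
//     long r;
//     asm("add %0, %1, %2" : "=r"(r) : "r"(a), "I"(4095L));
//     return r;
//   }
//
// "I" accepts ADD/SUB immediates (0..4095, optionally shifted left by 12),
// "K"/"L" accept 32/64-bit logical immediates, and "M"/"N" additionally
// accept values loadable with a single MOVZ or MOVN.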
11922
11923//===----------------------------------------------------------------------===//
11924// AArch64 Advanced SIMD Support
11925//===----------------------------------------------------------------------===//
11926
11927/// WidenVector - Given a value in the V64 register class, produce the
11928/// equivalent value in the V128 register class.
11929static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
11930 EVT VT = V64Reg.getValueType();
11931 unsigned NarrowSize = VT.getVectorNumElements();
11932 MVT EltTy = VT.getVectorElementType().getSimpleVT();
11933 MVT WideTy = MVT::getVectorVT(VT: EltTy, NumElements: 2 * NarrowSize);
11934 SDLoc DL(V64Reg);
11935
11936 return DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: WideTy, N1: DAG.getUNDEF(VT: WideTy),
11937 N2: V64Reg, N3: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
11938}
11939
11940/// getExtFactor - Determine the adjustment factor for the position when
11941/// generating an "extract from vector registers" instruction.
11942static unsigned getExtFactor(SDValue &V) {
11943 EVT EltType = V.getValueType().getVectorElementType();
11944 return EltType.getSizeInBits() / 8;
11945}
11946
11947// Check if a vector is built from one vector via extracted elements of
11948// another together with an AND mask, ensuring that all elements fit
11949// within range. This can be reconstructed using AND and NEON's TBL1.
11950SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
11951 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11952 SDLoc dl(Op);
11953 EVT VT = Op.getValueType();
11954 assert(!VT.isScalableVector() &&
11955 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11956
11957 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
11958 // directly to TBL1.
11959 if (VT != MVT::v16i8 && VT != MVT::v8i8)
11960 return SDValue();
11961
11962 unsigned NumElts = VT.getVectorNumElements();
11963 assert((NumElts == 8 || NumElts == 16) &&
11964 "Need to have exactly 8 or 16 elements in vector.");
11965
11966 SDValue SourceVec;
11967 SDValue MaskSourceVec;
11968 SmallVector<SDValue, 16> AndMaskConstants;
11969
11970 for (unsigned i = 0; i < NumElts; ++i) {
11971 SDValue V = Op.getOperand(i);
11972 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11973 return SDValue();
11974
11975 SDValue OperandSourceVec = V.getOperand(i: 0);
11976 if (!SourceVec)
11977 SourceVec = OperandSourceVec;
11978 else if (SourceVec != OperandSourceVec)
11979 return SDValue();
11980
11981 // This only looks at shuffles with elements that are
11982 // a) truncated by a constant AND mask extracted from a mask vector, or
11983 // b) extracted directly from a mask vector.
11984 SDValue MaskSource = V.getOperand(i: 1);
11985 if (MaskSource.getOpcode() == ISD::AND) {
11986 if (!isa<ConstantSDNode>(Val: MaskSource.getOperand(i: 1)))
11987 return SDValue();
11988
11989 AndMaskConstants.push_back(Elt: MaskSource.getOperand(i: 1));
11990 MaskSource = MaskSource->getOperand(Num: 0);
11991 } else if (!AndMaskConstants.empty()) {
11992 // Either all or no operands should have an AND mask.
11993 return SDValue();
11994 }
11995
11996 // An ANY_EXTEND may be inserted between the AND and the source vector
11997 // extraction. We don't care about that, so we can just skip it.
11998 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
11999 MaskSource = MaskSource.getOperand(i: 0);
12000
12001 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12002 return SDValue();
12003
12004 SDValue MaskIdx = MaskSource.getOperand(i: 1);
12005 if (!isa<ConstantSDNode>(Val: MaskIdx) ||
12006 !cast<ConstantSDNode>(Val&: MaskIdx)->getConstantIntValue()->equalsInt(V: i))
12007 return SDValue();
12008
12009 // We only apply this if all elements come from the same vector with the
12010 // same vector type.
12011 if (!MaskSourceVec) {
12012 MaskSourceVec = MaskSource->getOperand(Num: 0);
12013 if (MaskSourceVec.getValueType() != VT)
12014 return SDValue();
12015 } else if (MaskSourceVec != MaskSource->getOperand(Num: 0)) {
12016 return SDValue();
12017 }
12018 }
12019
12020 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
12021 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
12022 // insert, we know that the index in the mask must be smaller than the number
12023 // of elements in the source, or we would have an out-of-bounds access.
12024 if (NumElts == 8)
12025 SourceVec = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: MVT::v16i8, N1: SourceVec,
12026 N2: DAG.getUNDEF(VT));
12027
12028 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
12029 if (!AndMaskConstants.empty())
12030 MaskSourceVec = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: MaskSourceVec,
12031 N2: DAG.getBuildVector(VT, DL: dl, Ops: AndMaskConstants));
12032
12033 return DAG.getNode(
12034 Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT,
12035 N1: DAG.getConstant(Val: Intrinsic::aarch64_neon_tbl1, DL: dl, VT: MVT::i32), N2: SourceVec,
12036 N3: MaskSourceVec);
12037}
12038
12039// Gather data to see if the operation can be modelled as a
12040// shuffle in combination with VEXTs.
12041SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
12042 SelectionDAG &DAG) const {
12043 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
12044 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
12045 SDLoc dl(Op);
12046 EVT VT = Op.getValueType();
12047 assert(!VT.isScalableVector() &&
12048 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
12049 unsigned NumElts = VT.getVectorNumElements();
12050
12051 struct ShuffleSourceInfo {
12052 SDValue Vec;
12053 unsigned MinElt;
12054 unsigned MaxElt;
12055
12056 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
12057 // be compatible with the shuffle we intend to construct. As a result
12058 // ShuffleVec will be some sliding window into the original Vec.
12059 SDValue ShuffleVec;
12060
12061 // Code should guarantee that element i in Vec starts at element
12062 // "WindowBase + i * WindowScale" in ShuffleVec.
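 // For example (illustrative): if Vec is a v4i16 that is later bitcast to a
 // v8i8 ShuffleVec, WindowScale becomes 2 and element 1 of Vec starts at
 // element WindowBase + 2 of ShuffleVec.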
12063 int WindowBase;
12064 int WindowScale;
12065
12066 ShuffleSourceInfo(SDValue Vec)
12067 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
12068 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
12069
12070 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
12071 };
12072
12073 // First gather all vectors used as an immediate source for this BUILD_VECTOR
12074 // node.
12075 SmallVector<ShuffleSourceInfo, 2> Sources;
12076 for (unsigned i = 0; i < NumElts; ++i) {
12077 SDValue V = Op.getOperand(i);
12078 if (V.isUndef())
12079 continue;
12080 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
12081 !isa<ConstantSDNode>(Val: V.getOperand(i: 1)) ||
12082 V.getOperand(i: 0).getValueType().isScalableVector()) {
12083 LLVM_DEBUG(
12084 dbgs() << "Reshuffle failed: "
12085 "a shuffle can only come from building a vector from "
12086 "various elements of other fixed-width vectors, provided "
12087 "their indices are constant\n");
12088 return SDValue();
12089 }
12090
12091 // Add this element source to the list if it's not already there.
12092 SDValue SourceVec = V.getOperand(i: 0);
12093 auto Source = find(Range&: Sources, Val: SourceVec);
12094 if (Source == Sources.end())
12095 Source = Sources.insert(I: Sources.end(), Elt: ShuffleSourceInfo(SourceVec));
12096
12097 // Update the minimum and maximum lane number seen.
12098 unsigned EltNo = V.getConstantOperandVal(i: 1);
12099 Source->MinElt = std::min(a: Source->MinElt, b: EltNo);
12100 Source->MaxElt = std::max(a: Source->MaxElt, b: EltNo);
12101 }
12102
12103 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
12104 // better than moving to/from gpr registers for larger vectors.
12105 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
12106 // Construct a mask for the tbl. We may need to adjust the index for types
12107 // larger than i8.
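 // For example, assuming both the sources and the result are v8i16: the
 // OutputFactor is 2, so lane L taken from source S becomes byte indices
 // 16 * S + 2 * L and 16 * S + 2 * L + 1 in the TBL mask.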
12108 SmallVector<unsigned, 16> Mask;
12109 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
12110 for (unsigned I = 0; I < NumElts; ++I) {
12111 SDValue V = Op.getOperand(i: I);
12112 if (V.isUndef()) {
12113 for (unsigned OF = 0; OF < OutputFactor; OF++)
12114 Mask.push_back(Elt: -1);
12115 continue;
12116 }
12117 // Set the Mask lanes adjusted for the size of the input and output
12118 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
12119 // output element, adjusted in their positions per input and output types.
12120 unsigned Lane = V.getConstantOperandVal(i: 1);
12121 for (unsigned S = 0; S < Sources.size(); S++) {
12122 if (V.getOperand(i: 0) == Sources[S].Vec) {
12123 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
12124 unsigned InputBase = 16 * S + Lane * InputSize / 8;
12125 for (unsigned OF = 0; OF < OutputFactor; OF++)
12126 Mask.push_back(Elt: InputBase + OF);
12127 break;
12128 }
12129 }
12130 }
12131
12132 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
12133 // v16i8, and the TBLMask
12134 SmallVector<SDValue, 16> TBLOperands;
12135 TBLOperands.push_back(Elt: DAG.getConstant(Val: Sources.size() == 3
12136 ? Intrinsic::aarch64_neon_tbl3
12137 : Intrinsic::aarch64_neon_tbl4,
12138 DL: dl, VT: MVT::i32));
12139 for (unsigned i = 0; i < Sources.size(); i++) {
12140 SDValue Src = Sources[i].Vec;
12141 EVT SrcVT = Src.getValueType();
12142 Src = DAG.getBitcast(VT: SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, V: Src);
12143 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
12144 "Expected a legally typed vector");
12145 if (SrcVT.is64BitVector())
12146 Src = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: MVT::v16i8, N1: Src,
12147 N2: DAG.getUNDEF(VT: MVT::v8i8));
12148 TBLOperands.push_back(Elt: Src);
12149 }
12150
12151 SmallVector<SDValue, 16> TBLMask;
12152 for (unsigned i = 0; i < Mask.size(); i++)
12153 TBLMask.push_back(Elt: DAG.getConstant(Val: Mask[i], DL: dl, VT: MVT::i32));
12154 assert((Mask.size() == 8 || Mask.size() == 16) &&
12155 "Expected a v8i8 or v16i8 Mask");
12156 TBLOperands.push_back(
12157 Elt: DAG.getBuildVector(VT: Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, DL: dl, Ops: TBLMask));
12158
12159 SDValue Shuffle =
12160 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl,
12161 VT: Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, Ops: TBLOperands);
12162 return DAG.getBitcast(VT, V: Shuffle);
12163 }
12164
12165 if (Sources.size() > 2) {
12166 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
12167 << "sensible when at most two source vectors are "
12168 << "involved\n");
12169 return SDValue();
12170 }
12171
12172 // Find out the smallest element size among result and two sources, and use
12173 // it as element size to build the shuffle_vector.
12174 EVT SmallestEltTy = VT.getVectorElementType();
12175 for (auto &Source : Sources) {
12176 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
12177 if (SrcEltTy.bitsLT(VT: SmallestEltTy)) {
12178 SmallestEltTy = SrcEltTy;
12179 }
12180 }
12181 unsigned ResMultiplier =
12182 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
12183 uint64_t VTSize = VT.getFixedSizeInBits();
12184 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
12185 EVT ShuffleVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: SmallestEltTy, NumElements: NumElts);
12186
12187 // If the source vector is too wide or too narrow, we may nevertheless be able
12188 // to construct a compatible shuffle either by concatenating it with UNDEF or
12189 // extracting a suitable range of elements.
12190 for (auto &Src : Sources) {
12191 EVT SrcVT = Src.ShuffleVec.getValueType();
12192
12193 TypeSize SrcVTSize = SrcVT.getSizeInBits();
12194 if (SrcVTSize == TypeSize::getFixed(ExactSize: VTSize))
12195 continue;
12196
12197 // This stage of the search produces a source with the same element type as
12198 // the original, but with a total width matching the BUILD_VECTOR output.
12199 EVT EltVT = SrcVT.getVectorElementType();
12200 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
12201 EVT DestVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NumSrcElts);
12202
12203 if (SrcVTSize.getFixedValue() < VTSize) {
12204 assert(2 * SrcVTSize == VTSize);
12205 // We can pad out the smaller vector for free, so if it's part of a
12206 // shuffle we can simply concatenate it with UNDEF.
12207 Src.ShuffleVec =
12208 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: DestVT, N1: Src.ShuffleVec,
12209 N2: DAG.getUNDEF(VT: Src.ShuffleVec.getValueType()));
12210 continue;
12211 }
12212
12213 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
12214 LLVM_DEBUG(
12215 dbgs() << "Reshuffle failed: result vector too small to extract\n");
12216 return SDValue();
12217 }
12218
12219 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
12220 LLVM_DEBUG(
12221 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
12222 return SDValue();
12223 }
12224
12225 if (Src.MinElt >= NumSrcElts) {
12226 // The extraction can just take the second half
12227 Src.ShuffleVec =
12228 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: DestVT, N1: Src.ShuffleVec,
12229 N2: DAG.getConstant(Val: NumSrcElts, DL: dl, VT: MVT::i64));
12230 Src.WindowBase = -NumSrcElts;
12231 } else if (Src.MaxElt < NumSrcElts) {
12232 // The extraction can just take the first half
12233 Src.ShuffleVec =
12234 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: DestVT, N1: Src.ShuffleVec,
12235 N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64));
12236 } else {
12237 // An actual VEXT is needed
12238 SDValue VEXTSrc1 =
12239 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: DestVT, N1: Src.ShuffleVec,
12240 N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64));
12241 SDValue VEXTSrc2 =
12242 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: DestVT, N1: Src.ShuffleVec,
12243 N2: DAG.getConstant(Val: NumSrcElts, DL: dl, VT: MVT::i64));
12244 unsigned Imm = Src.MinElt * getExtFactor(V&: VEXTSrc1);
12245
12246 if (!SrcVT.is64BitVector()) {
12247 LLVM_DEBUG(
12248 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
12249 "for SVE vectors.");
12250 return SDValue();
12251 }
12252
12253 Src.ShuffleVec = DAG.getNode(Opcode: AArch64ISD::EXT, DL: dl, VT: DestVT, N1: VEXTSrc1,
12254 N2: VEXTSrc2,
12255 N3: DAG.getConstant(Val: Imm, DL: dl, VT: MVT::i32));
12256 Src.WindowBase = -Src.MinElt;
12257 }
12258 }
12259
12260 // Another possible incompatibility occurs from the vector element types. We
12261 // can fix this by bitcasting the source vectors to the same type we intend
12262 // for the shuffle.
12263 for (auto &Src : Sources) {
12264 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
12265 if (SrcEltTy == SmallestEltTy)
12266 continue;
12267 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
12268 if (DAG.getDataLayout().isBigEndian()) {
12269 Src.ShuffleVec =
12270 DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT: ShuffleVT, Operand: Src.ShuffleVec);
12271 } else {
12272 Src.ShuffleVec = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ShuffleVT, Operand: Src.ShuffleVec);
12273 }
12274 Src.WindowScale =
12275 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
12276 Src.WindowBase *= Src.WindowScale;
12277 }
12278
12279 // Final check before we try to actually produce a shuffle.
12280 LLVM_DEBUG(for (auto Src
12281 : Sources)
12282 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
12283
12284 // The stars all align, our next step is to produce the mask for the shuffle.
12285 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
12286 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
12287 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
12288 SDValue Entry = Op.getOperand(i);
12289 if (Entry.isUndef())
12290 continue;
12291
12292 auto Src = find(Range&: Sources, Val: Entry.getOperand(i: 0));
12293 int EltNo = cast<ConstantSDNode>(Val: Entry.getOperand(i: 1))->getSExtValue();
12294
12295 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
12296 // trunc. So only the low std::min(SrcBits, DestBits) bits actually get
12297 // defined in this segment.
12298 EVT OrigEltTy = Entry.getOperand(i: 0).getValueType().getVectorElementType();
12299 int BitsDefined = std::min(a: OrigEltTy.getScalarSizeInBits(),
12300 b: VT.getScalarSizeInBits());
12301 int LanesDefined = BitsDefined / BitsPerShuffleLane;
12302
12303 // This source is expected to fill ResMultiplier lanes of the final shuffle,
12304 // starting at the appropriate offset.
12305 int *LaneMask = &Mask[i * ResMultiplier];
12306
12307 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
12308 ExtractBase += NumElts * (Src - Sources.begin());
12309 for (int j = 0; j < LanesDefined; ++j)
12310 LaneMask[j] = ExtractBase + j;
12311 }
12312
12313 // Final check before we try to produce nonsense...
12314 if (!isShuffleMaskLegal(M: Mask, VT: ShuffleVT)) {
12315 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
12316 return SDValue();
12317 }
12318
12319 SDValue ShuffleOps[] = { DAG.getUNDEF(VT: ShuffleVT), DAG.getUNDEF(VT: ShuffleVT) };
12320 for (unsigned i = 0; i < Sources.size(); ++i)
12321 ShuffleOps[i] = Sources[i].ShuffleVec;
12322
12323 SDValue Shuffle = DAG.getVectorShuffle(VT: ShuffleVT, dl, N1: ShuffleOps[0],
12324 N2: ShuffleOps[1], Mask);
12325 SDValue V;
12326 if (DAG.getDataLayout().isBigEndian()) {
12327 V = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Shuffle);
12328 } else {
12329 V = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Shuffle);
12330 }
12331
12332 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
12333 dbgs() << "Reshuffle, creating node: "; V.dump(););
12334
12335 return V;
12336}
12337
12338// check if an EXT instruction can handle the shuffle mask when the
12339// vector sources of the shuffle are the same.
12340static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
12341 unsigned NumElts = VT.getVectorNumElements();
12342
12343 // Assume that the first shuffle index is not UNDEF. Fail if it is.
12344 if (M[0] < 0)
12345 return false;
12346
12347 Imm = M[0];
12348
12349 // If this is a VEXT shuffle, the immediate value is the index of the first
12350 // element. The other shuffle indices must be the successive elements after
12351 // the first one.
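 // For example, on v4i32 the mask <2, 3, 0, 1> (or <2, -1, 0, 1>) is a
 // singleton EXT with Imm == 2.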
12352 unsigned ExpectedElt = Imm;
12353 for (unsigned i = 1; i < NumElts; ++i) {
12354 // Increment the expected index. If it wraps around, just follow it
12355 // back to index zero and keep going.
12356 ++ExpectedElt;
12357 if (ExpectedElt == NumElts)
12358 ExpectedElt = 0;
12359
12360 if (M[i] < 0)
12361 continue; // ignore UNDEF indices
12362 if (ExpectedElt != static_cast<unsigned>(M[i]))
12363 return false;
12364 }
12365
12366 return true;
12367}
12368
12369// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
12370// v4i32s. This is really a truncate, which we can construct out of (legal)
12371// concats and truncate nodes.
12372static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
12373 if (V.getValueType() != MVT::v16i8)
12374 return SDValue();
12375 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
12376
12377 for (unsigned X = 0; X < 4; X++) {
12378 // Check the first item in each group is an extract from lane 0 of a v4i32
12379 // or v4i16.
12380 SDValue BaseExt = V.getOperand(i: X * 4);
12381 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
12382 (BaseExt.getOperand(i: 0).getValueType() != MVT::v4i16 &&
12383 BaseExt.getOperand(i: 0).getValueType() != MVT::v4i32) ||
12384 !isa<ConstantSDNode>(Val: BaseExt.getOperand(i: 1)) ||
12385 BaseExt.getConstantOperandVal(i: 1) != 0)
12386 return SDValue();
12387 SDValue Base = BaseExt.getOperand(i: 0);
12388 // And check the other items are extracts from the same vector.
12389 for (unsigned Y = 1; Y < 4; Y++) {
12390 SDValue Ext = V.getOperand(i: X * 4 + Y);
12391 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
12392 Ext.getOperand(i: 0) != Base ||
12393 !isa<ConstantSDNode>(Val: Ext.getOperand(i: 1)) ||
12394 Ext.getConstantOperandVal(i: 1) != Y)
12395 return SDValue();
12396 }
12397 }
12398
12399 // Turn the buildvector into a series of truncates and concats, which will
12400 // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
12401 // concatenated together to produce 2 v8i16. These are both truncated and
12402 // concatenated together.
12403 SDLoc DL(V);
12404 SDValue Trunc[4] = {
12405 V.getOperand(i: 0).getOperand(i: 0), V.getOperand(i: 4).getOperand(i: 0),
12406 V.getOperand(i: 8).getOperand(i: 0), V.getOperand(i: 12).getOperand(i: 0)};
12407 for (SDValue &V : Trunc)
12408 if (V.getValueType() == MVT::v4i32)
12409 V = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::v4i16, Operand: V);
12410 SDValue Concat0 =
12411 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v8i16, N1: Trunc[0], N2: Trunc[1]);
12412 SDValue Concat1 =
12413 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v8i16, N1: Trunc[2], N2: Trunc[3]);
12414 SDValue Trunc0 = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::v8i8, Operand: Concat0);
12415 SDValue Trunc1 = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::v8i8, Operand: Concat1);
12416 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v16i8, N1: Trunc0, N2: Trunc1);
12417}
12418
12419/// Check if a vector shuffle corresponds to a DUP instruction with a larger
12420/// element width than the vector lane type. If that is the case, the function
12421/// returns true and writes the value of the DUP instruction lane operand into
12422/// DupLaneOp.
12423static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
12424 unsigned &DupLaneOp) {
12425 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
12426 "Only possible block sizes for wide DUP are: 16, 32, 64");
12427
12428 if (BlockSize <= VT.getScalarSizeInBits())
12429 return false;
12430 if (BlockSize % VT.getScalarSizeInBits() != 0)
12431 return false;
12432 if (VT.getSizeInBits() % BlockSize != 0)
12433 return false;
12434
12435 size_t SingleVecNumElements = VT.getVectorNumElements();
12436 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
12437 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
12438
12439 // We are looking for masks like
12440 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
12441 // might be replaced by 'undefined'. BlockIndices will eventually contain
12442 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
12443 // for the above examples)
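 // For example, for a v8i8 shuffle with BlockSize == 32, the mask
 // <4, 5, 6, 7, 4, 5, 6, 7> duplicates the second 32-bit block and
 // DupLaneOp ends up as 1.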
12444 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
12445 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
12446 for (size_t I = 0; I < NumEltsPerBlock; I++) {
12447 int Elt = M[BlockIndex * NumEltsPerBlock + I];
12448 if (Elt < 0)
12449 continue;
12450 // For now we don't support shuffles that use the second operand
12451 if ((unsigned)Elt >= SingleVecNumElements)
12452 return false;
12453 if (BlockElts[I] < 0)
12454 BlockElts[I] = Elt;
12455 else if (BlockElts[I] != Elt)
12456 return false;
12457 }
12458
12459 // We found a candidate block (possibly with some undefs). It must be a
12460 // sequence of consecutive integers starting with a value divisible by
12461 // NumEltsPerBlock with some values possibly replaced by undef-s.
12462
12463 // Find first non-undef element
12464 auto FirstRealEltIter = find_if(Range&: BlockElts, P: [](int Elt) { return Elt >= 0; });
12465 assert(FirstRealEltIter != BlockElts.end() &&
12466 "Shuffle with all-undefs must have been caught by previous cases, "
12467 "e.g. isSplat()");
12468 if (FirstRealEltIter == BlockElts.end()) {
12469 DupLaneOp = 0;
12470 return true;
12471 }
12472
12473 // Index of FirstRealElt in BlockElts
12474 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
12475
12476 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
12477 return false;
12478 // BlockElts[0] must have the following value if it isn't undef:
12479 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
12480
12481 // Check the first element
12482 if (Elt0 % NumEltsPerBlock != 0)
12483 return false;
12484 // Check that the sequence indeed consists of consecutive integers (modulo
12485 // undefs)
12486 for (size_t I = 0; I < NumEltsPerBlock; I++)
12487 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
12488 return false;
12489
12490 DupLaneOp = Elt0 / NumEltsPerBlock;
12491 return true;
12492}
12493
12494// check if an EXT instruction can handle the shuffle mask when the
12495// vector sources of the shuffle are different.
12496static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
12497 unsigned &Imm) {
12498 // Look for the first non-undef element.
12499 const int *FirstRealElt = find_if(Range&: M, P: [](int Elt) { return Elt >= 0; });
12500
12501 // Benefit from APInt to handle overflow when calculating the expected element.
12502 unsigned NumElts = VT.getVectorNumElements();
12503 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
12504 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
12505 // The following shuffle indices must be the successive elements after the
12506 // first real element.
12507 bool FoundWrongElt = std::any_of(first: FirstRealElt + 1, last: M.end(), pred: [&](int Elt) {
12508 return Elt != ExpectedElt++ && Elt != -1;
12509 });
12510 if (FoundWrongElt)
12511 return false;
12512
12513 // The index of an EXT is the first element if it is not UNDEF.
12514 // Watch out for the beginning UNDEFs. The EXT index should be the expected
12515 // value of the first element. E.g.
12516 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
12517 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
12518 // ExpectedElt is the last mask index plus 1.
12519 Imm = ExpectedElt.getZExtValue();
12520
12521 // There are two different cases that require reversing the input vectors.
12522 // For example, for vector <4 x i32> we have the following cases,
12523 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
12524 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
12525 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
12526 // to reverse two input vectors.
12527 if (Imm < NumElts)
12528 ReverseEXT = true;
12529 else
12530 Imm -= NumElts;
12531
12532 return true;
12533}
12534
12535/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
12536/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
12537/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
12538static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
12539 unsigned NumElts = VT.getVectorNumElements();
12540 if (NumElts % 2 != 0)
12541 return false;
12542 WhichResult = (M[0] == 0 ? 0 : 1);
12543 unsigned Idx = WhichResult * NumElts / 2;
12544 for (unsigned i = 0; i != NumElts; i += 2) {
12545 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
12546 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
12547 return false;
12548 Idx += 1;
12549 }
12550
12551 return true;
12552}
12553
12554/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
12555/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
12556/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
12557static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
12558 unsigned Half = VT.getVectorNumElements() / 2;
12559 WhichResult = (M[0] == 0 ? 0 : 1);
12560 for (unsigned j = 0; j != 2; ++j) {
12561 unsigned Idx = WhichResult;
12562 for (unsigned i = 0; i != Half; ++i) {
12563 int MIdx = M[i + j * Half];
12564 if (MIdx >= 0 && (unsigned)MIdx != Idx)
12565 return false;
12566 Idx += 2;
12567 }
12568 }
12569
12570 return true;
12571}
12572
12573/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
12574/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
12575/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
12576static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
12577 unsigned NumElts = VT.getVectorNumElements();
12578 if (NumElts % 2 != 0)
12579 return false;
12580 WhichResult = (M[0] == 0 ? 0 : 1);
12581 for (unsigned i = 0; i < NumElts; i += 2) {
12582 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
12583 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
12584 return false;
12585 }
12586 return true;
12587}
12588
12589static bool isINSMask(ArrayRef<int> M, int NumInputElements,
12590 bool &DstIsLeft, int &Anomaly) {
12591 if (M.size() != static_cast<size_t>(NumInputElements))
12592 return false;
12593
12594 int NumLHSMatch = 0, NumRHSMatch = 0;
12595 int LastLHSMismatch = -1, LastRHSMismatch = -1;
12596
12597 for (int i = 0; i < NumInputElements; ++i) {
12598 if (M[i] == -1) {
12599 ++NumLHSMatch;
12600 ++NumRHSMatch;
12601 continue;
12602 }
12603
12604 if (M[i] == i)
12605 ++NumLHSMatch;
12606 else
12607 LastLHSMismatch = i;
12608
12609 if (M[i] == i + NumInputElements)
12610 ++NumRHSMatch;
12611 else
12612 LastRHSMismatch = i;
12613 }
12614
12615 if (NumLHSMatch == NumInputElements - 1) {
12616 DstIsLeft = true;
12617 Anomaly = LastLHSMismatch;
12618 return true;
12619 } else if (NumRHSMatch == NumInputElements - 1) {
12620 DstIsLeft = false;
12621 Anomaly = LastRHSMismatch;
12622 return true;
12623 }
12624
12625 return false;
12626}
12627
12628static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
12629 if (VT.getSizeInBits() != 128)
12630 return false;
12631
12632 unsigned NumElts = VT.getVectorNumElements();
12633
12634 for (int I = 0, E = NumElts / 2; I != E; I++) {
12635 if (Mask[I] != I)
12636 return false;
12637 }
12638
12639 int Offset = NumElts / 2;
12640 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
12641 if (Mask[I] != I + SplitLHS * Offset)
12642 return false;
12643 }
12644
12645 return true;
12646}
12647
12648static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
12649 SDLoc DL(Op);
12650 EVT VT = Op.getValueType();
12651 SDValue V0 = Op.getOperand(i: 0);
12652 SDValue V1 = Op.getOperand(i: 1);
12653 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Val&: Op)->getMask();
12654
12655 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
12656 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
12657 return SDValue();
12658
12659 bool SplitV0 = V0.getValueSizeInBits() == 128;
12660
12661 if (!isConcatMask(Mask, VT, SplitLHS: SplitV0))
12662 return SDValue();
12663
12664 EVT CastVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
12665 if (SplitV0) {
12666 V0 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: CastVT, N1: V0,
12667 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
12668 }
12669 if (V1.getValueSizeInBits() == 128) {
12670 V1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: CastVT, N1: V1,
12671 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
12672 }
12673 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: V0, N2: V1);
12674}
12675
12676/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
12677/// the specified operations to build the shuffle. ID is the perfect-shuffle
12678/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
12679/// table entry and LHS/RHS are the immediate inputs for this stage of the
12680/// shuffle.
12681static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
12682 SDValue V2, unsigned PFEntry, SDValue LHS,
12683 SDValue RHS, SelectionDAG &DAG,
12684 const SDLoc &dl) {
12685 unsigned OpNum = (PFEntry >> 26) & 0x0F;
12686 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
12687 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
12688
12689 enum {
12690 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
12691 OP_VREV,
12692 OP_VDUP0,
12693 OP_VDUP1,
12694 OP_VDUP2,
12695 OP_VDUP3,
12696 OP_VEXT1,
12697 OP_VEXT2,
12698 OP_VEXT3,
12699 OP_VUZPL, // VUZP, left result
12700 OP_VUZPR, // VUZP, right result
12701 OP_VZIPL, // VZIP, left result
12702 OP_VZIPR, // VZIP, right result
12703 OP_VTRNL, // VTRN, left result
12704 OP_VTRNR, // VTRN, right result
12705 OP_MOVLANE // Move lane. RHSID is the lane to move into
12706 };
12707
12708 if (OpNum == OP_COPY) {
12709 if (LHSID == (1 * 9 + 2) * 9 + 3)
12710 return LHS;
12711 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
12712 return RHS;
12713 }
12714
12715 if (OpNum == OP_MOVLANE) {
12716 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
12717 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
12718 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
12719 Elt = 3 - Elt;
12720 while (Elt > 0) {
12721 ID /= 9;
12722 Elt--;
12723 }
12724 return (ID % 9 == 8) ? -1 : ID % 9;
12725 };
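 // For example, the ID encoding the mask <0, 1, 2, 3> is
 // ((0 * 9 + 1) * 9 + 2) * 9 + 3, i.e. one base-9 digit per lane (a digit of
 // 8 stands for an undef lane), so getPFIDLane(ID, 2) returns 2.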
12726
12727 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
12728 // get the lane to move from the PFID, which is always from the
12729 // original vectors (V1 or V2).
12730 SDValue OpLHS = GeneratePerfectShuffle(
12731 ID: LHSID, V1, V2, PFEntry: PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
12732 EVT VT = OpLHS.getValueType();
12733 assert(RHSID < 8 && "Expected a lane index for RHSID!");
12734 unsigned ExtLane = 0;
12735 SDValue Input;
12736
12737 // OP_MOVLANE is either a D mov (if bit 0x4 is set) or an S mov. D movs
12738 // convert into a wider element type.
12739 if (RHSID & 0x4) {
12740 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
12741 if (MaskElt == -1)
12742 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
12743 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12744 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
12745 Input = MaskElt < 2 ? V1 : V2;
12746 if (VT.getScalarSizeInBits() == 16) {
12747 Input = DAG.getBitcast(VT: MVT::v2f32, V: Input);
12748 OpLHS = DAG.getBitcast(VT: MVT::v2f32, V: OpLHS);
12749 } else {
12750 assert(VT.getScalarSizeInBits() == 32 &&
12751 "Expected 16 or 32 bit shuffle elemements");
12752 Input = DAG.getBitcast(VT: MVT::v2f64, V: Input);
12753 OpLHS = DAG.getBitcast(VT: MVT::v2f64, V: OpLHS);
12754 }
12755 } else {
12756 int MaskElt = getPFIDLane(ID, RHSID);
12757 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12758 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
12759 Input = MaskElt < 4 ? V1 : V2;
12760 // Be careful about creating illegal types. Use f16 instead of i16.
12761 if (VT == MVT::v4i16) {
12762 Input = DAG.getBitcast(VT: MVT::v4f16, V: Input);
12763 OpLHS = DAG.getBitcast(VT: MVT::v4f16, V: OpLHS);
12764 }
12765 }
12766 SDValue Ext = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl,
12767 VT: Input.getValueType().getVectorElementType(),
12768 N1: Input, N2: DAG.getVectorIdxConstant(Val: ExtLane, DL: dl));
12769 SDValue Ins =
12770 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT: Input.getValueType(), N1: OpLHS,
12771 N2: Ext, N3: DAG.getVectorIdxConstant(Val: RHSID & 0x3, DL: dl));
12772 return DAG.getBitcast(VT, V: Ins);
12773 }
12774
12775 SDValue OpLHS, OpRHS;
12776 OpLHS = GeneratePerfectShuffle(ID: LHSID, V1, V2, PFEntry: PerfectShuffleTable[LHSID], LHS,
12777 RHS, DAG, dl);
12778 OpRHS = GeneratePerfectShuffle(ID: RHSID, V1, V2, PFEntry: PerfectShuffleTable[RHSID], LHS,
12779 RHS, DAG, dl);
12780 EVT VT = OpLHS.getValueType();
12781
12782 switch (OpNum) {
12783 default:
12784 llvm_unreachable("Unknown shuffle opcode!");
12785 case OP_VREV:
12786 // VREV divides the vector in half and swaps within the half.
12787 if (VT.getVectorElementType() == MVT::i32 ||
12788 VT.getVectorElementType() == MVT::f32)
12789 return DAG.getNode(Opcode: AArch64ISD::REV64, DL: dl, VT, Operand: OpLHS);
12790 // vrev <4 x i16> -> REV32
12791 if (VT.getVectorElementType() == MVT::i16 ||
12792 VT.getVectorElementType() == MVT::f16 ||
12793 VT.getVectorElementType() == MVT::bf16)
12794 return DAG.getNode(Opcode: AArch64ISD::REV32, DL: dl, VT, Operand: OpLHS);
12795 // vrev <4 x i8> -> REV16
12796 assert(VT.getVectorElementType() == MVT::i8);
12797 return DAG.getNode(Opcode: AArch64ISD::REV16, DL: dl, VT, Operand: OpLHS);
12798 case OP_VDUP0:
12799 case OP_VDUP1:
12800 case OP_VDUP2:
12801 case OP_VDUP3: {
12802 EVT EltTy = VT.getVectorElementType();
12803 unsigned Opcode;
12804 if (EltTy == MVT::i8)
12805 Opcode = AArch64ISD::DUPLANE8;
12806 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
12807 Opcode = AArch64ISD::DUPLANE16;
12808 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
12809 Opcode = AArch64ISD::DUPLANE32;
12810 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
12811 Opcode = AArch64ISD::DUPLANE64;
12812 else
12813 llvm_unreachable("Invalid vector element type?");
12814
12815 if (VT.getSizeInBits() == 64)
12816 OpLHS = WidenVector(V64Reg: OpLHS, DAG);
12817 SDValue Lane = DAG.getConstant(Val: OpNum - OP_VDUP0, DL: dl, VT: MVT::i64);
12818 return DAG.getNode(Opcode, DL: dl, VT, N1: OpLHS, N2: Lane);
12819 }
12820 case OP_VEXT1:
12821 case OP_VEXT2:
12822 case OP_VEXT3: {
12823 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(V&: OpLHS);
12824 return DAG.getNode(Opcode: AArch64ISD::EXT, DL: dl, VT, N1: OpLHS, N2: OpRHS,
12825 N3: DAG.getConstant(Val: Imm, DL: dl, VT: MVT::i32));
12826 }
12827 case OP_VUZPL:
12828 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL: dl, VT, N1: OpLHS, N2: OpRHS);
12829 case OP_VUZPR:
12830 return DAG.getNode(Opcode: AArch64ISD::UZP2, DL: dl, VT, N1: OpLHS, N2: OpRHS);
12831 case OP_VZIPL:
12832 return DAG.getNode(Opcode: AArch64ISD::ZIP1, DL: dl, VT, N1: OpLHS, N2: OpRHS);
12833 case OP_VZIPR:
12834 return DAG.getNode(Opcode: AArch64ISD::ZIP2, DL: dl, VT, N1: OpLHS, N2: OpRHS);
12835 case OP_VTRNL:
12836 return DAG.getNode(Opcode: AArch64ISD::TRN1, DL: dl, VT, N1: OpLHS, N2: OpRHS);
12837 case OP_VTRNR:
12838 return DAG.getNode(Opcode: AArch64ISD::TRN2, DL: dl, VT, N1: OpLHS, N2: OpRHS);
12839 }
12840}
12841
12842static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
12843 SelectionDAG &DAG) {
12844 // Check to see if we can use the TBL instruction.
12845 SDValue V1 = Op.getOperand(i: 0);
12846 SDValue V2 = Op.getOperand(i: 1);
12847 SDLoc DL(Op);
12848
12849 EVT EltVT = Op.getValueType().getVectorElementType();
12850 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
12851
12852 bool Swap = false;
12853 if (V1.isUndef() || isZerosVector(N: V1.getNode())) {
12854 std::swap(a&: V1, b&: V2);
12855 Swap = true;
12856 }
12857
12858 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
12859 // out of range values with 0s. We do need to make sure that any out-of-range
12860 // values are really out-of-range for a v16i8 vector.
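 // For example, a v8i8 shuffle <0, 8, 1, 9, ...> where V2 is known to be zero
 // becomes a single TBL1 whose mask uses 255 for the lanes that referenced V2,
 // so those bytes read back as zero.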
12861 bool IsUndefOrZero = V2.isUndef() || isZerosVector(N: V2.getNode());
12862 MVT IndexVT = MVT::v8i8;
12863 unsigned IndexLen = 8;
12864 if (Op.getValueSizeInBits() == 128) {
12865 IndexVT = MVT::v16i8;
12866 IndexLen = 16;
12867 }
12868
12869 SmallVector<SDValue, 8> TBLMask;
12870 for (int Val : ShuffleMask) {
12871 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
12872 unsigned Offset = Byte + Val * BytesPerElt;
12873 if (Swap)
12874 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
12875 if (IsUndefOrZero && Offset >= IndexLen)
12876 Offset = 255;
12877 TBLMask.push_back(Elt: DAG.getConstant(Val: Offset, DL, VT: MVT::i32));
12878 }
12879 }
12880
12881 SDValue V1Cst = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IndexVT, Operand: V1);
12882 SDValue V2Cst = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IndexVT, Operand: V2);
12883
12884 SDValue Shuffle;
12885 if (IsUndefOrZero) {
12886 if (IndexLen == 8)
12887 V1Cst = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v16i8, N1: V1Cst, N2: V1Cst);
12888 Shuffle = DAG.getNode(
12889 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: IndexVT,
12890 N1: DAG.getConstant(Val: Intrinsic::aarch64_neon_tbl1, DL, VT: MVT::i32), N2: V1Cst,
12891 N3: DAG.getBuildVector(VT: IndexVT, DL, Ops: ArrayRef(TBLMask.data(), IndexLen)));
12892 } else {
12893 if (IndexLen == 8) {
12894 V1Cst = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v16i8, N1: V1Cst, N2: V2Cst);
12895 Shuffle = DAG.getNode(
12896 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: IndexVT,
12897 N1: DAG.getConstant(Val: Intrinsic::aarch64_neon_tbl1, DL, VT: MVT::i32), N2: V1Cst,
12898 N3: DAG.getBuildVector(VT: IndexVT, DL, Ops: ArrayRef(TBLMask.data(), IndexLen)));
12899 } else {
12900 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
12901 // cannot currently represent the register constraints on the input
12902 // table registers.
12903 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
12904 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
12905 // IndexLen));
12906 Shuffle = DAG.getNode(
12907 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: IndexVT,
12908 N1: DAG.getConstant(Val: Intrinsic::aarch64_neon_tbl2, DL, VT: MVT::i32), N2: V1Cst,
12909 N3: V2Cst,
12910 N4: DAG.getBuildVector(VT: IndexVT, DL, Ops: ArrayRef(TBLMask.data(), IndexLen)));
12911 }
12912 }
12913 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op.getValueType(), Operand: Shuffle);
12914}
12915
12916static unsigned getDUPLANEOp(EVT EltType) {
12917 if (EltType == MVT::i8)
12918 return AArch64ISD::DUPLANE8;
12919 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
12920 return AArch64ISD::DUPLANE16;
12921 if (EltType == MVT::i32 || EltType == MVT::f32)
12922 return AArch64ISD::DUPLANE32;
12923 if (EltType == MVT::i64 || EltType == MVT::f64)
12924 return AArch64ISD::DUPLANE64;
12925
12926 llvm_unreachable("Invalid vector element type?");
12927}
12928
12929static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
12930 unsigned Opcode, SelectionDAG &DAG) {
12931 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
12932 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
12933 // Match: dup (bitcast (extract_subv X, C)), LaneC
12934 if (BitCast.getOpcode() != ISD::BITCAST ||
12935 BitCast.getOperand(i: 0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
12936 return false;
12937
12938 // The extract index must align in the destination type. That may not
12939 // happen if the bitcast is from narrow to wide type.
12940 SDValue Extract = BitCast.getOperand(i: 0);
12941 unsigned ExtIdx = Extract.getConstantOperandVal(i: 1);
12942 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
12943 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
12944 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
12945 if (ExtIdxInBits % CastedEltBitWidth != 0)
12946 return false;
12947
12948 // Can't handle cases where vector size is not 128-bit
12949 if (!Extract.getOperand(i: 0).getValueType().is128BitVector())
12950 return false;
12951
12952 // Update the lane value by offsetting with the scaled extract index.
12953 LaneC += ExtIdxInBits / CastedEltBitWidth;
12954
12955 // Determine the casted vector type of the wide vector input.
12956 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
12957 // Examples:
12958 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
12959 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
12960 unsigned SrcVecNumElts =
12961 Extract.getOperand(i: 0).getValueSizeInBits() / CastedEltBitWidth;
12962 CastVT = MVT::getVectorVT(VT: BitCast.getSimpleValueType().getScalarType(),
12963 NumElements: SrcVecNumElts);
12964 return true;
12965 };
12966 MVT CastVT;
12967 if (getScaledOffsetDup(V, Lane, CastVT)) {
12968 V = DAG.getBitcast(VT: CastVT, V: V.getOperand(i: 0).getOperand(i: 0));
12969 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12970 V.getOperand(i: 0).getValueType().is128BitVector()) {
12971 // The lane is incremented by the index of the extract.
12972 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
12973 Lane += V.getConstantOperandVal(i: 1);
12974 V = V.getOperand(i: 0);
12975 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
12976 // The lane is decremented if we are splatting from the 2nd operand.
12977 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
12978 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
12979 Lane -= Idx * VT.getVectorNumElements() / 2;
12980 V = WidenVector(V64Reg: V.getOperand(i: Idx), DAG);
12981 } else if (VT.getSizeInBits() == 64) {
12982 // Widen the operand to 128-bit register with undef.
12983 V = WidenVector(V64Reg: V, DAG);
12984 }
12985 return DAG.getNode(Opcode, DL: dl, VT, N1: V, N2: DAG.getConstant(Val: Lane, DL: dl, VT: MVT::i64));
12986}
12987
12988// Return true if we can get a new shuffle mask by checking the parameter mask
12989// array to test whether every two adjacent mask values are consecutive and
12990// start from an even number.
12991static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
12992 SmallVectorImpl<int> &NewMask) {
12993 unsigned NumElts = VT.getVectorNumElements();
12994 if (NumElts % 2 != 0)
12995 return false;
12996
12997 NewMask.clear();
12998 for (unsigned i = 0; i < NumElts; i += 2) {
12999 int M0 = M[i];
13000 int M1 = M[i + 1];
13001
13002 // If both elements are undef, new mask is undef too.
13003 if (M0 == -1 && M1 == -1) {
13004 NewMask.push_back(Elt: -1);
13005 continue;
13006 }
13007
13008 if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
13009 NewMask.push_back(Elt: M1 / 2);
13010 continue;
13011 }
13012
13013 if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
13014 NewMask.push_back(Elt: M0 / 2);
13015 continue;
13016 }
13017
13018 NewMask.clear();
13019 return false;
13020 }
13021
13022 assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
13023 return true;
13024}
13025
13026// Try to widen element type to get a new mask value for a better permutation
13027// sequence, so that we can use NEON shuffle instructions, such as ZIP1/2,
13028// UZP1/2, TRN1/2, REV, INS, etc.
13029// For example:
13030// shufflevector <4 x i32> %a, <4 x i32> %b,
13031// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
13032// is equivalent to:
13033// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
13034// Finally, we can get:
13035// mov v0.d[0], v1.d[1]
13036static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
13037 SDLoc DL(Op);
13038 EVT VT = Op.getValueType();
13039 EVT ScalarVT = VT.getVectorElementType();
13040 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
13041 SDValue V0 = Op.getOperand(i: 0);
13042 SDValue V1 = Op.getOperand(i: 1);
13043 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Val&: Op)->getMask();
13044
13045 // When combining adjacent elements, like two i16's -> i32 or two i32's -> i64,
13046 // we need to make sure the wider element type is legal. Thus, ElementSize
13047 // should not be larger than 32 bits, and the i1 type should also be excluded.
13048 if (ElementSize > 32 || ElementSize == 1)
13049 return SDValue();
13050
13051 SmallVector<int, 8> NewMask;
13052 if (isWideTypeMask(M: Mask, VT, NewMask)) {
13053 MVT NewEltVT = VT.isFloatingPoint()
13054 ? MVT::getFloatingPointVT(BitWidth: ElementSize * 2)
13055 : MVT::getIntegerVT(BitWidth: ElementSize * 2);
13056 MVT NewVT = MVT::getVectorVT(VT: NewEltVT, NumElements: VT.getVectorNumElements() / 2);
13057 if (DAG.getTargetLoweringInfo().isTypeLegal(VT: NewVT)) {
13058 V0 = DAG.getBitcast(VT: NewVT, V: V0);
13059 V1 = DAG.getBitcast(VT: NewVT, V: V1);
13060 return DAG.getBitcast(VT,
13061 V: DAG.getVectorShuffle(VT: NewVT, dl: DL, N1: V0, N2: V1, Mask: NewMask));
13062 }
13063 }
13064
13065 return SDValue();
13066}
13067
13068// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
13069static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
13070 ArrayRef<int> ShuffleMask,
13071 SelectionDAG &DAG) {
13072 SDValue Tbl1 = Op->getOperand(Num: 0);
13073 SDValue Tbl2 = Op->getOperand(Num: 1);
13074 SDLoc dl(Op);
13075 SDValue Tbl2ID =
13076 DAG.getTargetConstant(Val: Intrinsic::aarch64_neon_tbl2, DL: dl, VT: MVT::i64);
13077
13078 EVT VT = Op.getValueType();
13079 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
13080 Tbl1->getOperand(Num: 0) != Tbl2ID ||
13081 Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
13082 Tbl2->getOperand(Num: 0) != Tbl2ID)
13083 return SDValue();
13084
13085 if (Tbl1->getValueType(ResNo: 0) != MVT::v16i8 ||
13086 Tbl2->getValueType(ResNo: 0) != MVT::v16i8)
13087 return SDValue();
13088
13089 SDValue Mask1 = Tbl1->getOperand(Num: 3);
13090 SDValue Mask2 = Tbl2->getOperand(Num: 3);
13091 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
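  // A tbl4 indexes into four concatenated 16-byte tables. Mask entries taken
  // from the second tbl2, which only saw its own two tables, are rebased by 32
  // so that they point past the first tbl2's tables.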
13092 for (unsigned I = 0; I < 16; I++) {
13093 if (ShuffleMask[I] < 16)
13094 TBLMaskParts[I] = Mask1->getOperand(Num: ShuffleMask[I]);
13095 else {
13096 auto *C =
13097 dyn_cast<ConstantSDNode>(Val: Mask2->getOperand(Num: ShuffleMask[I] - 16));
13098 if (!C)
13099 return SDValue();
13100 TBLMaskParts[I] = DAG.getConstant(Val: C->getSExtValue() + 32, DL: dl, VT: MVT::i32);
13101 }
13102 }
13103
13104 SDValue TBLMask = DAG.getBuildVector(VT, DL: dl, Ops: TBLMaskParts);
13105 SDValue ID =
13106 DAG.getTargetConstant(Val: Intrinsic::aarch64_neon_tbl4, DL: dl, VT: MVT::i64);
13107
13108 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: MVT::v16i8,
13109 Ops: {ID, Tbl1->getOperand(Num: 1), Tbl1->getOperand(Num: 2),
13110 Tbl2->getOperand(Num: 1), Tbl2->getOperand(Num: 2), TBLMask});
13111}
13112
// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend in zeros, but
// we don't have an appropriate instruction for that, so custom-lower it as a
// ZIP1 with zeros.
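// For example, (v8i16 (zero_extend_vector_inreg (v16i8 X))) roughly becomes a
// bitcast to v8i16 of (zip1 (v16i8 X), (v16i8 zeros)), which interleaves each
// low source byte with a zero byte.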
13116SDValue
13117AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
13118 SelectionDAG &DAG) const {
13119 SDLoc dl(Op);
13120 EVT VT = Op.getValueType();
13121 SDValue SrcOp = Op.getOperand(i: 0);
13122 EVT SrcVT = SrcOp.getValueType();
13123 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
13124 "Unexpected extension factor.");
13125 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
13126 // FIXME: support multi-step zipping?
13127 if (Scale != 2)
13128 return SDValue();
13129 SDValue Zeros = DAG.getConstant(Val: 0, DL: dl, VT: SrcVT);
13130 return DAG.getBitcast(VT,
13131 V: DAG.getNode(Opcode: AArch64ISD::ZIP1, DL: dl, VT: SrcVT, N1: SrcOp, N2: Zeros));
13132}
13133
13134SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
13135 SelectionDAG &DAG) const {
13136 SDLoc dl(Op);
13137 EVT VT = Op.getValueType();
13138
13139 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val: Op.getNode());
13140
13141 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
13142 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
13143
13144 // Convert shuffles that are directly supported on NEON to target-specific
13145 // DAG nodes, instead of keeping them as shuffles and matching them again
13146 // during code selection. This is more efficient and avoids the possibility
13147 // of inconsistencies between legalization and selection.
13148 ArrayRef<int> ShuffleMask = SVN->getMask();
13149
13150 SDValue V1 = Op.getOperand(i: 0);
13151 SDValue V2 = Op.getOperand(i: 1);
13152
13153 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
13154 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
13155 "Unexpected VECTOR_SHUFFLE mask size!");
13156
13157 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
13158 return Res;
13159
13160 if (SVN->isSplat()) {
13161 int Lane = SVN->getSplatIndex();
    // If this is an undef splat, treat it as a splat of lane 0 and generate it
    // via a plain DUP, if possible.
13163 if (Lane == -1)
13164 Lane = 0;
13165
13166 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
13167 return DAG.getNode(Opcode: AArch64ISD::DUP, DL: dl, VT: V1.getValueType(),
13168 Operand: V1.getOperand(i: 0));
13169 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
13170 // constant. If so, we can just reference the lane's definition directly.
13171 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
13172 !isa<ConstantSDNode>(Val: V1.getOperand(i: Lane)))
13173 return DAG.getNode(Opcode: AArch64ISD::DUP, DL: dl, VT, Operand: V1.getOperand(i: Lane));
13174
13175 // Otherwise, duplicate from the lane of the input vector.
13176 unsigned Opcode = getDUPLANEOp(EltType: V1.getValueType().getVectorElementType());
13177 return constructDup(V: V1, Lane, dl, VT, Opcode, DAG);
13178 }
13179
13180 // Check if the mask matches a DUP for a wider element
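  // For example, a v8i16 mask of <2, 3, 2, 3, 2, 3, 2, 3> is in effect a
  // DUPLANE32 of lane 1 once the source is bitcast to v4i32.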
13181 for (unsigned LaneSize : {64U, 32U, 16U}) {
13182 unsigned Lane = 0;
13183 if (isWideDUPMask(M: ShuffleMask, VT, BlockSize: LaneSize, DupLaneOp&: Lane)) {
13184 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
13185 : LaneSize == 32 ? AArch64ISD::DUPLANE32
13186 : AArch64ISD::DUPLANE16;
13187 // Cast V1 to an integer vector with required lane size
13188 MVT NewEltTy = MVT::getIntegerVT(BitWidth: LaneSize);
13189 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
13190 MVT NewVecTy = MVT::getVectorVT(VT: NewEltTy, NumElements: NewEltCount);
13191 V1 = DAG.getBitcast(VT: NewVecTy, V: V1);
      // Construct the DUP instruction
13193 V1 = constructDup(V: V1, Lane, dl, VT: NewVecTy, Opcode, DAG);
13194 // Cast back to the original type
13195 return DAG.getBitcast(VT, V: V1);
13196 }
13197 }
13198
13199 unsigned NumElts = VT.getVectorNumElements();
13200 unsigned EltSize = VT.getScalarSizeInBits();
13201 if (isREVMask(M: ShuffleMask, EltSize, NumElts, BlockSize: 64))
13202 return DAG.getNode(Opcode: AArch64ISD::REV64, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2);
13203 if (isREVMask(M: ShuffleMask, EltSize, NumElts, BlockSize: 32))
13204 return DAG.getNode(Opcode: AArch64ISD::REV32, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2);
13205 if (isREVMask(M: ShuffleMask, EltSize, NumElts, BlockSize: 16))
13206 return DAG.getNode(Opcode: AArch64ISD::REV16, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2);
13207
13208 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
13209 ShuffleVectorInst::isReverseMask(Mask: ShuffleMask, NumSrcElts: ShuffleMask.size())) {
13210 SDValue Rev = DAG.getNode(Opcode: AArch64ISD::REV64, DL: dl, VT, Operand: V1);
13211 return DAG.getNode(Opcode: AArch64ISD::EXT, DL: dl, VT, N1: Rev, N2: Rev,
13212 N3: DAG.getConstant(Val: 8, DL: dl, VT: MVT::i32));
13213 }
13214
13215 bool ReverseEXT = false;
13216 unsigned Imm;
13217 if (isEXTMask(M: ShuffleMask, VT, ReverseEXT, Imm)) {
13218 if (ReverseEXT)
13219 std::swap(a&: V1, b&: V2);
13220 Imm *= getExtFactor(V&: V1);
13221 return DAG.getNode(Opcode: AArch64ISD::EXT, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2,
13222 N3: DAG.getConstant(Val: Imm, DL: dl, VT: MVT::i32));
13223 } else if (V2->isUndef() && isSingletonEXTMask(M: ShuffleMask, VT, Imm)) {
13224 Imm *= getExtFactor(V&: V1);
13225 return DAG.getNode(Opcode: AArch64ISD::EXT, DL: dl, VT: V1.getValueType(), N1: V1, N2: V1,
13226 N3: DAG.getConstant(Val: Imm, DL: dl, VT: MVT::i32));
13227 }
13228
13229 unsigned WhichResult;
13230 if (isZIPMask(M: ShuffleMask, NumElts, WhichResultOut&: WhichResult)) {
13231 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
13232 return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2);
13233 }
13234 if (isUZPMask(M: ShuffleMask, NumElts, WhichResultOut&: WhichResult)) {
13235 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
13236 return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2);
13237 }
13238 if (isTRNMask(M: ShuffleMask, NumElts, WhichResult)) {
13239 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
13240 return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2);
13241 }
13242
13243 if (isZIP_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
13244 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
13245 return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V1);
13246 }
13247 if (isUZP_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
13248 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
13249 return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V1);
13250 }
13251 if (isTRN_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
13252 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
13253 return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V1);
13254 }
13255
13256 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
13257 return Concat;
13258
13259 bool DstIsLeft;
13260 int Anomaly;
13261 int NumInputElements = V1.getValueType().getVectorNumElements();
13262 if (isINSMask(M: ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
13263 SDValue DstVec = DstIsLeft ? V1 : V2;
13264 SDValue DstLaneV = DAG.getConstant(Val: Anomaly, DL: dl, VT: MVT::i64);
13265
13266 SDValue SrcVec = V1;
13267 int SrcLane = ShuffleMask[Anomaly];
13268 if (SrcLane >= NumInputElements) {
13269 SrcVec = V2;
13270 SrcLane -= NumElts;
13271 }
13272 SDValue SrcLaneV = DAG.getConstant(Val: SrcLane, DL: dl, VT: MVT::i64);
13273
13274 EVT ScalarVT = VT.getVectorElementType();
13275
13276 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
13277 ScalarVT = MVT::i32;
13278
13279 return DAG.getNode(
13280 Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT, N1: DstVec,
13281 N2: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: ScalarVT, N1: SrcVec, N2: SrcLaneV),
13282 N3: DstLaneV);
13283 }
13284
13285 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
13286 return NewSD;
13287
13288 // If the shuffle is not directly supported and it has 4 elements, use
13289 // the PerfectShuffle-generated table to synthesize it from other shuffles.
13290 if (NumElts == 4) {
13291 unsigned PFIndexes[4];
13292 for (unsigned i = 0; i != 4; ++i) {
13293 if (ShuffleMask[i] < 0)
13294 PFIndexes[i] = 8;
13295 else
13296 PFIndexes[i] = ShuffleMask[i];
13297 }
13298
13299 // Compute the index in the perfect shuffle table.
13300 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
13301 PFIndexes[2] * 9 + PFIndexes[3];
13302 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
13303 return GeneratePerfectShuffle(ID: PFTableIndex, V1, V2, PFEntry, LHS: V1, RHS: V2, DAG,
13304 dl);
13305 }
13306
13307 return GenerateTBL(Op, ShuffleMask, DAG);
13308}
13309
13310SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
13311 SelectionDAG &DAG) const {
13312 EVT VT = Op.getValueType();
13313
13314 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
13315 return LowerToScalableOp(Op, DAG);
13316
13317 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
13318 "Unexpected vector type!");
13319
13320 // We can handle the constant cases during isel.
13321 if (isa<ConstantSDNode>(Val: Op.getOperand(i: 0)))
13322 return Op;
13323
13324 // There isn't a natural way to handle the general i1 case, so we use some
13325 // trickery with whilelo.
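  // SplatVal is sign-extended from i1, so as an i64 it is either 0 or -1 (all
  // ones). whilelo(0, 0) then gives an all-false predicate, while
  // whilelo(0, 0xffffffffffffffff) saturates and gives an all-true one.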
13326 SDLoc DL(Op);
13327 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 0), DL, VT: MVT::i64);
13328 SplatVal = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: MVT::i64, N1: SplatVal,
13329 N2: DAG.getValueType(MVT::i1));
13330 SDValue ID =
13331 DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_whilelo, DL, VT: MVT::i64);
13332 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
13333 if (VT == MVT::nxv1i1)
13334 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::nxv1i1,
13335 N1: DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::nxv2i1, N1: ID,
13336 N2: Zero, N3: SplatVal),
13337 N2: Zero);
13338 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: ID, N2: Zero, N3: SplatVal);
13339}
13340
13341SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
13342 SelectionDAG &DAG) const {
13343 SDLoc DL(Op);
13344
13345 EVT VT = Op.getValueType();
13346 if (!isTypeLegal(VT) || !VT.isScalableVector())
13347 return SDValue();
13348
13349 // Current lowering only supports the SVE-ACLE types.
13350 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
13351 return SDValue();
13352
  // The DUPQ operation is independent of element type, so normalise to i64s.
13354 SDValue Idx128 = Op.getOperand(i: 2);
13355
13356 // DUPQ can be used when idx is in range.
13357 auto *CIdx = dyn_cast<ConstantSDNode>(Val&: Idx128);
13358 if (CIdx && (CIdx->getZExtValue() <= 3)) {
13359 SDValue CI = DAG.getTargetConstant(Val: CIdx->getZExtValue(), DL, VT: MVT::i64);
13360 return DAG.getNode(Opcode: AArch64ISD::DUPLANE128, DL, VT, N1: Op.getOperand(i: 1), N2: CI);
13361 }
13362
13363 SDValue V = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv2i64, Operand: Op.getOperand(i: 1));
13364
13365 // The ACLE says this must produce the same result as:
13366 // svtbl(data, svadd_x(svptrue_b64(),
13367 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
13368 // index * 2))
13369 SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i64);
13370 SDValue SplatOne = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: MVT::nxv2i64, Operand: One);
13371
13372 // create the vector 0,1,0,1,...
13373 SDValue SV = DAG.getStepVector(DL, ResVT: MVT::nxv2i64);
13374 SV = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::nxv2i64, N1: SV, N2: SplatOne);
13375
13376 // create the vector idx64,idx64+1,idx64,idx64+1,...
13377 SDValue Idx64 = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Idx128, N2: Idx128);
13378 SDValue SplatIdx64 = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: MVT::nxv2i64, Operand: Idx64);
13379 SDValue ShuffleMask = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::nxv2i64, N1: SV, N2: SplatIdx64);
13380
13381 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
13382 SDValue TBL = DAG.getNode(Opcode: AArch64ISD::TBL, DL, VT: MVT::nxv2i64, N1: V, N2: ShuffleMask);
13383 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: TBL);
13384}
13385
13386
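// Expand a constant-splat BUILD_VECTOR into full-width bit patterns: CnstBits
// receives the splat value replicated across the vector width, and UndefBits
// a corresponding pattern derived from the splat's undef bits. Returns false
// if BVN is not a constant splat.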
13387static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
13388 APInt &UndefBits) {
13389 EVT VT = BVN->getValueType(ResNo: 0);
13390 APInt SplatBits, SplatUndef;
13391 unsigned SplatBitSize;
13392 bool HasAnyUndefs;
13393 if (BVN->isConstantSplat(SplatValue&: SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
13394 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
13395
13396 for (unsigned i = 0; i < NumSplats; ++i) {
13397 CnstBits <<= SplatBitSize;
13398 UndefBits <<= SplatBitSize;
13399 CnstBits |= SplatBits.zextOrTrunc(width: VT.getSizeInBits());
13400 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(width: VT.getSizeInBits());
13401 }
13402
13403 return true;
13404 }
13405
13406 return false;
13407}
13408
13409// Try 64-bit splatted SIMD immediate.
13410static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
13411 const APInt &Bits) {
13412 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
13413 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
13414 EVT VT = Op.getValueType();
13415 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
13416
13417 if (AArch64_AM::isAdvSIMDModImmType10(Imm: Value)) {
13418 Value = AArch64_AM::encodeAdvSIMDModImmType10(Imm: Value);
13419
13420 SDLoc dl(Op);
13421 SDValue Mov = DAG.getNode(Opcode: NewOp, DL: dl, VT: MovTy,
13422 Operand: DAG.getConstant(Val: Value, DL: dl, VT: MVT::i32));
13423 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov);
13424 }
13425 }
13426
13427 return SDValue();
13428}
13429
13430// Try 32-bit splatted SIMD immediate.
13431static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
13432 const APInt &Bits,
13433 const SDValue *LHS = nullptr) {
13434 EVT VT = Op.getValueType();
13435 if (VT.isFixedLengthVector() &&
13436 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
13437 return SDValue();
13438
13439 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
13440 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
13441 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
13442 bool isAdvSIMDModImm = false;
13443 uint64_t Shift;
13444
13445 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Imm: Value))) {
13446 Value = AArch64_AM::encodeAdvSIMDModImmType1(Imm: Value);
13447 Shift = 0;
13448 }
13449 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Imm: Value))) {
13450 Value = AArch64_AM::encodeAdvSIMDModImmType2(Imm: Value);
13451 Shift = 8;
13452 }
13453 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Imm: Value))) {
13454 Value = AArch64_AM::encodeAdvSIMDModImmType3(Imm: Value);
13455 Shift = 16;
13456 }
13457 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Imm: Value))) {
13458 Value = AArch64_AM::encodeAdvSIMDModImmType4(Imm: Value);
13459 Shift = 24;
13460 }
13461
13462 if (isAdvSIMDModImm) {
13463 SDLoc dl(Op);
13464 SDValue Mov;
13465
13466 if (LHS)
13467 Mov = DAG.getNode(Opcode: NewOp, DL: dl, VT: MovTy,
13468 N1: DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT: MovTy, Operand: *LHS),
13469 N2: DAG.getConstant(Val: Value, DL: dl, VT: MVT::i32),
13470 N3: DAG.getConstant(Val: Shift, DL: dl, VT: MVT::i32));
13471 else
13472 Mov = DAG.getNode(Opcode: NewOp, DL: dl, VT: MovTy,
13473 N1: DAG.getConstant(Val: Value, DL: dl, VT: MVT::i32),
13474 N2: DAG.getConstant(Val: Shift, DL: dl, VT: MVT::i32));
13475
13476 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov);
13477 }
13478 }
13479
13480 return SDValue();
13481}
13482
13483// Try 16-bit splatted SIMD immediate.
13484static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
13485 const APInt &Bits,
13486 const SDValue *LHS = nullptr) {
13487 EVT VT = Op.getValueType();
13488 if (VT.isFixedLengthVector() &&
13489 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
13490 return SDValue();
13491
13492 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
13493 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
13494 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
13495 bool isAdvSIMDModImm = false;
13496 uint64_t Shift;
13497
13498 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Imm: Value))) {
13499 Value = AArch64_AM::encodeAdvSIMDModImmType5(Imm: Value);
13500 Shift = 0;
13501 }
13502 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Imm: Value))) {
13503 Value = AArch64_AM::encodeAdvSIMDModImmType6(Imm: Value);
13504 Shift = 8;
13505 }
13506
13507 if (isAdvSIMDModImm) {
13508 SDLoc dl(Op);
13509 SDValue Mov;
13510
13511 if (LHS)
13512 Mov = DAG.getNode(Opcode: NewOp, DL: dl, VT: MovTy,
13513 N1: DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT: MovTy, Operand: *LHS),
13514 N2: DAG.getConstant(Val: Value, DL: dl, VT: MVT::i32),
13515 N3: DAG.getConstant(Val: Shift, DL: dl, VT: MVT::i32));
13516 else
13517 Mov = DAG.getNode(Opcode: NewOp, DL: dl, VT: MovTy,
13518 N1: DAG.getConstant(Val: Value, DL: dl, VT: MVT::i32),
13519 N2: DAG.getConstant(Val: Shift, DL: dl, VT: MVT::i32));
13520
13521 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov);
13522 }
13523 }
13524
13525 return SDValue();
13526}
13527
13528// Try 32-bit splatted SIMD immediate with shifted ones.
13529static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
13530 SelectionDAG &DAG, const APInt &Bits) {
13531 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
13532 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
13533 EVT VT = Op.getValueType();
13534 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
13535 bool isAdvSIMDModImm = false;
13536 uint64_t Shift;
13537
13538 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Imm: Value))) {
13539 Value = AArch64_AM::encodeAdvSIMDModImmType7(Imm: Value);
13540 Shift = 264;
13541 }
13542 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Imm: Value))) {
13543 Value = AArch64_AM::encodeAdvSIMDModImmType8(Imm: Value);
13544 Shift = 272;
13545 }
13546
13547 if (isAdvSIMDModImm) {
13548 SDLoc dl(Op);
13549 SDValue Mov = DAG.getNode(Opcode: NewOp, DL: dl, VT: MovTy,
13550 N1: DAG.getConstant(Val: Value, DL: dl, VT: MVT::i32),
13551 N2: DAG.getConstant(Val: Shift, DL: dl, VT: MVT::i32));
13552 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov);
13553 }
13554 }
13555
13556 return SDValue();
13557}
13558
13559// Try 8-bit splatted SIMD immediate.
13560static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
13561 const APInt &Bits) {
13562 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
13563 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
13564 EVT VT = Op.getValueType();
13565 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
13566
13567 if (AArch64_AM::isAdvSIMDModImmType9(Imm: Value)) {
13568 Value = AArch64_AM::encodeAdvSIMDModImmType9(Imm: Value);
13569
13570 SDLoc dl(Op);
13571 SDValue Mov = DAG.getNode(Opcode: NewOp, DL: dl, VT: MovTy,
13572 Operand: DAG.getConstant(Val: Value, DL: dl, VT: MVT::i32));
13573 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov);
13574 }
13575 }
13576
13577 return SDValue();
13578}
13579
13580// Try FP splatted SIMD immediate.
13581static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
13582 const APInt &Bits) {
13583 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
13584 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
13585 EVT VT = Op.getValueType();
13586 bool isWide = (VT.getSizeInBits() == 128);
13587 MVT MovTy;
13588 bool isAdvSIMDModImm = false;
13589
13590 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Imm: Value))) {
13591 Value = AArch64_AM::encodeAdvSIMDModImmType11(Imm: Value);
13592 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
13593 }
13594 else if (isWide &&
13595 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Imm: Value))) {
13596 Value = AArch64_AM::encodeAdvSIMDModImmType12(Imm: Value);
13597 MovTy = MVT::v2f64;
13598 }
13599
13600 if (isAdvSIMDModImm) {
13601 SDLoc dl(Op);
13602 SDValue Mov = DAG.getNode(Opcode: NewOp, DL: dl, VT: MovTy,
13603 Operand: DAG.getConstant(Val: Value, DL: dl, VT: MVT::i32));
13604 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov);
13605 }
13606 }
13607
13608 return SDValue();
13609}
13610
// Specialized code to quickly find if PotentialBVec is a BuildVector that
// consists of only the same constant int value, returned in the reference
// argument ConstVal.
13614static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
13615 uint64_t &ConstVal) {
13616 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(Val: PotentialBVec);
13617 if (!Bvec)
13618 return false;
13619 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Val: Bvec->getOperand(Num: 0));
13620 if (!FirstElt)
13621 return false;
13622 EVT VT = Bvec->getValueType(ResNo: 0);
13623 unsigned NumElts = VT.getVectorNumElements();
13624 for (unsigned i = 1; i < NumElts; ++i)
13625 if (dyn_cast<ConstantSDNode>(Val: Bvec->getOperand(Num: i)) != FirstElt)
13626 return false;
13627 ConstVal = FirstElt->getZExtValue();
13628 return true;
13629}
13630
13631static bool isAllInactivePredicate(SDValue N) {
13632 // Look through cast.
13633 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
13634 N = N.getOperand(i: 0);
13635
13636 return ISD::isConstantSplatVectorAllZeros(N: N.getNode());
13637}
13638
13639static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
13640 unsigned NumElts = N.getValueType().getVectorMinNumElements();
13641
13642 // Look through cast.
13643 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
13644 N = N.getOperand(i: 0);
13645 // When reinterpreting from a type with fewer elements the "new" elements
13646 // are not active, so bail if they're likely to be used.
13647 if (N.getValueType().getVectorMinNumElements() < NumElts)
13648 return false;
13649 }
13650
13651 if (ISD::isConstantSplatVectorAllOnes(N: N.getNode()))
13652 return true;
13653
13654 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
13655 // or smaller than the implicit element type represented by N.
13656 // NOTE: A larger element count implies a smaller element type.
13657 if (N.getOpcode() == AArch64ISD::PTRUE &&
13658 N.getConstantOperandVal(i: 0) == AArch64SVEPredPattern::all)
13659 return N.getValueType().getVectorMinNumElements() >= NumElts;
13660
13661 // If we're compiling for a specific vector-length, we can check if the
13662 // pattern's VL equals that of the scalable vector at runtime.
13663 if (N.getOpcode() == AArch64ISD::PTRUE) {
13664 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
13665 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
13666 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
13667 if (MaxSVESize && MinSVESize == MaxSVESize) {
13668 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
13669 unsigned PatNumElts =
13670 getNumElementsFromSVEPredPattern(Pattern: N.getConstantOperandVal(i: 0));
13671 return PatNumElts == (NumElts * VScale);
13672 }
13673 }
13674
13675 return false;
13676}
13677
// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
// i.e. (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
// BUILD_VECTOR with constant element C1, C2 is a constant, and:
//   - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
//   - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
// The commuted form (or (lsl Y, C2), (and X, BvecC1)) is also handled.
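// For example, with v8i16 operands, (or (and X, splat(0x00ff)), (VSHL Y, 8))
// roughly becomes (VSLI X, Y, 8): the low 8 bits of each X element are kept
// and the shifted-in Y bits are inserted above them.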
13684static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
13685 EVT VT = N->getValueType(ResNo: 0);
13686
13687 if (!VT.isVector())
13688 return SDValue();
13689
13690 SDLoc DL(N);
13691
13692 SDValue And;
13693 SDValue Shift;
13694
13695 SDValue FirstOp = N->getOperand(Num: 0);
13696 unsigned FirstOpc = FirstOp.getOpcode();
13697 SDValue SecondOp = N->getOperand(Num: 1);
13698 unsigned SecondOpc = SecondOp.getOpcode();
13699
13700 // Is one of the operands an AND or a BICi? The AND may have been optimised to
13701 // a BICi in order to use an immediate instead of a register.
  // Is the other operand a shl or lshr? This will have been turned into:
13703 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
13704 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
13705 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
13706 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
13707 SecondOpc == AArch64ISD::SHL_PRED ||
13708 SecondOpc == AArch64ISD::SRL_PRED)) {
13709 And = FirstOp;
13710 Shift = SecondOp;
13711
13712 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
13713 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
13714 FirstOpc == AArch64ISD::SHL_PRED ||
13715 FirstOpc == AArch64ISD::SRL_PRED)) {
13716 And = SecondOp;
13717 Shift = FirstOp;
13718 } else
13719 return SDValue();
13720
13721 bool IsAnd = And.getOpcode() == ISD::AND;
13722 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
13723 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13724 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
13725 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13726
13727 // Is the shift amount constant and are all lanes active?
13728 uint64_t C2;
13729 if (ShiftHasPredOp) {
13730 if (!isAllActivePredicate(DAG, N: Shift.getOperand(i: 0)))
13731 return SDValue();
13732 APInt C;
13733 if (!ISD::isConstantSplatVector(N: Shift.getOperand(i: 2).getNode(), SplatValue&: C))
13734 return SDValue();
13735 C2 = C.getZExtValue();
13736 } else if (ConstantSDNode *C2node =
13737 dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1)))
13738 C2 = C2node->getZExtValue();
13739 else
13740 return SDValue();
13741
13742 APInt C1AsAPInt;
13743 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
13744 if (IsAnd) {
13745 // Is the and mask vector all constant?
13746 if (!ISD::isConstantSplatVector(N: And.getOperand(i: 1).getNode(), SplatValue&: C1AsAPInt))
13747 return SDValue();
13748 } else {
13749 // Reconstruct the corresponding AND immediate from the two BICi immediates.
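    // A BICi clears the bits (Imm << Shift), so the AND mask it implements is
    // the complement of that shifted immediate.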
13750 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(Val: And.getOperand(i: 1));
13751 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(Val: And.getOperand(i: 2));
13752 assert(C1nodeImm && C1nodeShift);
13753 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
13754 C1AsAPInt = C1AsAPInt.zextOrTrunc(width: ElemSizeInBits);
13755 }
13756
13757 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
13758 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
13759 // how much one can shift elements of a particular size?
13760 if (C2 > ElemSizeInBits)
13761 return SDValue();
13762
13763 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(numBits: ElemSizeInBits, hiBitsSet: C2)
13764 : APInt::getLowBitsSet(numBits: ElemSizeInBits, loBitsSet: C2);
13765 if (C1AsAPInt != RequiredC1)
13766 return SDValue();
13767
13768 SDValue X = And.getOperand(i: 0);
13769 SDValue Y = ShiftHasPredOp ? Shift.getOperand(i: 1) : Shift.getOperand(i: 0);
13770 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(Val: C2, DL, VT: MVT::i32)
13771 : Shift.getOperand(i: 1);
13772
13773 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
13774 SDValue ResultSLI = DAG.getNode(Opcode: Inst, DL, VT, N1: X, N2: Y, N3: Imm);
13775
13776 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
13777 LLVM_DEBUG(N->dump(&DAG));
13778 LLVM_DEBUG(dbgs() << "into: \n");
13779 LLVM_DEBUG(ResultSLI->dump(&DAG));
13780
13781 ++NumShiftInserts;
13782 return ResultSLI;
13783}
13784
13785SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
13786 SelectionDAG &DAG) const {
13787 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
13788 OverrideNEON: !Subtarget->isNeonAvailable()))
13789 return LowerToScalableOp(Op, DAG);
13790
13791 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
13792 if (SDValue Res = tryLowerToSLI(N: Op.getNode(), DAG))
13793 return Res;
13794
13795 EVT VT = Op.getValueType();
13796 if (VT.isScalableVector())
13797 return Op;
13798
13799 SDValue LHS = Op.getOperand(i: 0);
13800 BuildVectorSDNode *BVN =
13801 dyn_cast<BuildVectorSDNode>(Val: Op.getOperand(i: 1).getNode());
13802 if (!BVN) {
13803 // OR commutes, so try swapping the operands.
13804 LHS = Op.getOperand(i: 1);
13805 BVN = dyn_cast<BuildVectorSDNode>(Val: Op.getOperand(i: 0).getNode());
13806 }
13807 if (!BVN)
13808 return Op;
13809
13810 APInt DefBits(VT.getSizeInBits(), 0);
13811 APInt UndefBits(VT.getSizeInBits(), 0);
13812 if (resolveBuildVector(BVN, CnstBits&: DefBits, UndefBits)) {
13813 SDValue NewOp;
13814
13815 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::ORRi, Op, DAG,
13816 Bits: DefBits, LHS: &LHS)) ||
13817 (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::ORRi, Op, DAG,
13818 Bits: DefBits, LHS: &LHS)))
13819 return NewOp;
13820
13821 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::ORRi, Op, DAG,
13822 Bits: UndefBits, LHS: &LHS)) ||
13823 (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::ORRi, Op, DAG,
13824 Bits: UndefBits, LHS: &LHS)))
13825 return NewOp;
13826 }
13827
13828 // We can always fall back to a non-immediate OR.
13829 return Op;
13830}
13831
13832// Normalize the operands of BUILD_VECTOR. The value of constant operands will
13833// be truncated to fit element width.
13834static SDValue NormalizeBuildVector(SDValue Op,
13835 SelectionDAG &DAG) {
13836 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13837 SDLoc dl(Op);
13838 EVT VT = Op.getValueType();
  EVT EltTy = VT.getVectorElementType();
13840
13841 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
13842 return Op;
13843
13844 SmallVector<SDValue, 16> Ops;
13845 for (SDValue Lane : Op->ops()) {
13846 // For integer vectors, type legalization would have promoted the
13847 // operands already. Otherwise, if Op is a floating-point splat
13848 // (with operands cast to integers), then the only possibilities
13849 // are constants and UNDEFs.
13850 if (auto *CstLane = dyn_cast<ConstantSDNode>(Val&: Lane)) {
13851 APInt LowBits(EltTy.getSizeInBits(),
13852 CstLane->getZExtValue());
13853 Lane = DAG.getConstant(Val: LowBits.getZExtValue(), DL: dl, VT: MVT::i32);
13854 } else if (Lane.getNode()->isUndef()) {
13855 Lane = DAG.getUNDEF(VT: MVT::i32);
13856 } else {
13857 assert(Lane.getValueType() == MVT::i32 &&
13858 "Unexpected BUILD_VECTOR operand type");
13859 }
13860 Ops.push_back(Elt: Lane);
13861 }
13862 return DAG.getBuildVector(VT, DL: dl, Ops);
13863}
13864
13865static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
13866 const AArch64Subtarget *ST) {
13867 EVT VT = Op.getValueType();
13868 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
13869 "Expected a legal NEON vector");
13870
13871 APInt DefBits(VT.getSizeInBits(), 0);
13872 APInt UndefBits(VT.getSizeInBits(), 0);
13873 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Val: Op.getNode());
13874 if (resolveBuildVector(BVN, CnstBits&: DefBits, UndefBits)) {
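    // Try each SIMD modified-immediate encoding on the given bit pattern, from
    // the 64-bit form down through the shifted 32/16-bit, MSL, 8-bit and FP
    // forms; if none match, retry the inverted pattern with the MVNI variants.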
13875 auto TryMOVIWithBits = [&](APInt DefBits) {
13876 SDValue NewOp;
13877 if ((NewOp =
13878 tryAdvSIMDModImm64(NewOp: AArch64ISD::MOVIedit, Op, DAG, Bits: DefBits)) ||
13879 (NewOp =
13880 tryAdvSIMDModImm32(NewOp: AArch64ISD::MOVIshift, Op, DAG, Bits: DefBits)) ||
13881 (NewOp =
13882 tryAdvSIMDModImm321s(NewOp: AArch64ISD::MOVImsl, Op, DAG, Bits: DefBits)) ||
13883 (NewOp =
13884 tryAdvSIMDModImm16(NewOp: AArch64ISD::MOVIshift, Op, DAG, Bits: DefBits)) ||
13885 (NewOp = tryAdvSIMDModImm8(NewOp: AArch64ISD::MOVI, Op, DAG, Bits: DefBits)) ||
13886 (NewOp = tryAdvSIMDModImmFP(NewOp: AArch64ISD::FMOV, Op, DAG, Bits: DefBits)))
13887 return NewOp;
13888
13889 APInt NotDefBits = ~DefBits;
13890 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::MVNIshift, Op, DAG,
13891 Bits: NotDefBits)) ||
13892 (NewOp = tryAdvSIMDModImm321s(NewOp: AArch64ISD::MVNImsl, Op, DAG,
13893 Bits: NotDefBits)) ||
13894 (NewOp =
13895 tryAdvSIMDModImm16(NewOp: AArch64ISD::MVNIshift, Op, DAG, Bits: NotDefBits)))
13896 return NewOp;
13897 return SDValue();
13898 };
13899 if (SDValue R = TryMOVIWithBits(DefBits))
13900 return R;
13901 if (SDValue R = TryMOVIWithBits(UndefBits))
13902 return R;
13903
    // See if an fneg of the constant can be materialized with a MOVI, etc.
13905 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
13906 // FNegate each sub-element of the constant
13907 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
13908 APInt Neg = APInt::getHighBitsSet(numBits: FVT.getSizeInBits(), hiBitsSet: 1)
13909 .zext(width: VT.getSizeInBits());
13910 APInt NegBits(VT.getSizeInBits(), 0);
13911 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
13912 for (unsigned i = 0; i < NumElts; i++)
13913 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
13914 NegBits = DefBits ^ NegBits;
13915
13916 // Try to create the new constants with MOVI, and if so generate a fneg
13917 // for it.
13918 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
13919 SDLoc DL(Op);
13920 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(VT: FVT, NumElements: NumElts);
13921 return DAG.getNode(
13922 Opcode: AArch64ISD::NVCAST, DL, VT,
13923 Operand: DAG.getNode(Opcode: ISD::FNEG, DL, VT: VFVT,
13924 Operand: DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: VFVT, Operand: NewOp)));
13925 }
13926 return SDValue();
13927 };
13928 SDValue R;
13929 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
13930 (R = TryWithFNeg(DefBits, MVT::f64)) ||
13931 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
13932 return R;
13933 }
13934
13935 return SDValue();
13936}
13937
13938SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
13939 SelectionDAG &DAG) const {
13940 EVT VT = Op.getValueType();
13941
13942 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) {
13943 if (auto SeqInfo = cast<BuildVectorSDNode>(Val&: Op)->isConstantSequence()) {
13944 SDLoc DL(Op);
13945 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
13946 SDValue Start = DAG.getConstant(Val: SeqInfo->first, DL, VT: ContainerVT);
13947 SDValue Steps = DAG.getStepVector(DL, ResVT: ContainerVT, StepVal: SeqInfo->second);
13948 SDValue Seq = DAG.getNode(Opcode: ISD::ADD, DL, VT: ContainerVT, N1: Start, N2: Steps);
13949 return convertFromScalableVector(DAG, VT: Op.getValueType(), V: Seq);
13950 }
13951
13952 // Revert to common legalisation for all other variants.
13953 return SDValue();
13954 }
13955
13956 // Try to build a simple constant vector.
13957 Op = NormalizeBuildVector(Op, DAG);
  // This might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so, abort.
13960 if (Op.getOpcode() != ISD::BUILD_VECTOR)
13961 return SDValue();
13962
13963 // Certain vector constants, used to express things like logical NOT and
13964 // arithmetic NEG, are passed through unmodified. This allows special
13965 // patterns for these operations to match, which will lower these constants
13966 // to whatever is proven necessary.
13967 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Val: Op.getNode());
13968 if (BVN->isConstant()) {
13969 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
13970 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
13971 APInt Val(BitSize,
13972 Const->getAPIntValue().zextOrTrunc(width: BitSize).getZExtValue());
13973 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
13974 return Op;
13975 }
13976 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
13977 if (Const->isZero() && !Const->isNegative())
13978 return Op;
13979 }
13980
13981 if (SDValue V = ConstantBuildVector(Op, DAG, ST: Subtarget))
13982 return V;
13983
13984 // Scan through the operands to find some interesting properties we can
13985 // exploit:
13986 // 1) If only one value is used, we can use a DUP, or
13987 // 2) if only the low element is not undef, we can just insert that, or
13988 // 3) if only one constant value is used (w/ some non-constant lanes),
13989 // we can splat the constant value into the whole vector then fill
13990 // in the non-constant lanes.
13991 // 4) FIXME: If different constant values are used, but we can intelligently
13992 // select the values we'll be overwriting for the non-constant
13993 // lanes such that we can directly materialize the vector
13994 // some other way (MOVI, e.g.), we can be sneaky.
13995 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
13996 SDLoc dl(Op);
13997 unsigned NumElts = VT.getVectorNumElements();
13998 bool isOnlyLowElement = true;
13999 bool usesOnlyOneValue = true;
14000 bool usesOnlyOneConstantValue = true;
14001 bool isConstant = true;
14002 bool AllLanesExtractElt = true;
14003 unsigned NumConstantLanes = 0;
14004 unsigned NumDifferentLanes = 0;
14005 unsigned NumUndefLanes = 0;
14006 SDValue Value;
14007 SDValue ConstantValue;
14008 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
14009 unsigned ConsecutiveValCount = 0;
14010 SDValue PrevVal;
14011 for (unsigned i = 0; i < NumElts; ++i) {
14012 SDValue V = Op.getOperand(i);
14013 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14014 AllLanesExtractElt = false;
14015 if (V.isUndef()) {
14016 ++NumUndefLanes;
14017 continue;
14018 }
14019 if (i > 0)
14020 isOnlyLowElement = false;
14021 if (!isIntOrFPConstant(V))
14022 isConstant = false;
14023
14024 if (isIntOrFPConstant(V)) {
14025 ++NumConstantLanes;
14026 if (!ConstantValue.getNode())
14027 ConstantValue = V;
14028 else if (ConstantValue != V)
14029 usesOnlyOneConstantValue = false;
14030 }
14031
14032 if (!Value.getNode())
14033 Value = V;
14034 else if (V != Value) {
14035 usesOnlyOneValue = false;
14036 ++NumDifferentLanes;
14037 }
14038
14039 if (PrevVal != V) {
14040 ConsecutiveValCount = 0;
14041 PrevVal = V;
14042 }
14043
    // Record each distinct value and its latest consecutive run length. For
    // example,
14045 //
14046 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
14047 // t24, t24, t24, t24, t24, t24, t24, t24
14048 // t23 = consecutive count 8
14049 // t24 = consecutive count 8
14050 // ------------------------------------------------------------------
14051 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
14052 // t24, t24, t24, t24, t24, t24, t24, t24
14053 // t23 = consecutive count 5
14054 // t24 = consecutive count 9
14055 DifferentValueMap[V] = ++ConsecutiveValCount;
14056 }
14057
14058 if (!Value.getNode()) {
14059 LLVM_DEBUG(
14060 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
14061 return DAG.getUNDEF(VT);
14062 }
14063
14064 // Convert BUILD_VECTOR where all elements but the lowest are undef into
14065 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
14066 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
14067 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(V: Value))) {
14068 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
14069 "SCALAR_TO_VECTOR node\n");
14070 return DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT, Operand: Value);
14071 }
14072
14073 if (AllLanesExtractElt) {
14074 SDNode *Vector = nullptr;
14075 bool Even = false;
14076 bool Odd = false;
14077 // Check whether the extract elements match the Even pattern <0,2,4,...> or
14078 // the Odd pattern <1,3,5,...>.
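    // For example, a v4i32 BUILD_VECTOR of elements 0, 2, 4 and 6 of a v8i32 X
    // lowers to (UZP1 (extract_subvector X, 0), (extract_subvector X, 4)).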
14079 for (unsigned i = 0; i < NumElts; ++i) {
14080 SDValue V = Op.getOperand(i);
14081 const SDNode *N = V.getNode();
14082 if (!isa<ConstantSDNode>(Val: N->getOperand(Num: 1))) {
14083 Even = false;
14084 Odd = false;
14085 break;
14086 }
14087 SDValue N0 = N->getOperand(Num: 0);
14088
14089 // All elements are extracted from the same vector.
14090 if (!Vector) {
14091 Vector = N0.getNode();
14092 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
14093 // BUILD_VECTOR.
14094 if (VT.getVectorElementType() !=
14095 N0.getValueType().getVectorElementType())
14096 break;
14097 } else if (Vector != N0.getNode()) {
14098 Odd = false;
14099 Even = false;
14100 break;
14101 }
14102
14103 // Extracted values are either at Even indices <0,2,4,...> or at Odd
14104 // indices <1,3,5,...>.
14105 uint64_t Val = N->getConstantOperandVal(Num: 1);
14106 if (Val == 2 * i) {
14107 Even = true;
14108 continue;
14109 }
14110 if (Val - 1 == 2 * i) {
14111 Odd = true;
14112 continue;
14113 }
14114
14115 // Something does not match: abort.
14116 Odd = false;
14117 Even = false;
14118 break;
14119 }
14120 if (Even || Odd) {
14121 SDValue LHS =
14122 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT, N1: SDValue(Vector, 0),
14123 N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64));
14124 SDValue RHS =
14125 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT, N1: SDValue(Vector, 0),
14126 N2: DAG.getConstant(Val: NumElts, DL: dl, VT: MVT::i64));
14127
14128 if (Even && !Odd)
14129 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL: dl, VT, N1: LHS, N2: RHS);
14130 if (Odd && !Even)
14131 return DAG.getNode(Opcode: AArch64ISD::UZP2, DL: dl, VT, N1: LHS, N2: RHS);
14132 }
14133 }
14134
14135 // Use DUP for non-constant splats. For f32 constant splats, reduce to
14136 // i32 and try again.
14137 if (usesOnlyOneValue) {
14138 if (!isConstant) {
14139 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14140 Value.getValueType() != VT) {
14141 LLVM_DEBUG(
14142 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
14143 return DAG.getNode(Opcode: AArch64ISD::DUP, DL: dl, VT, Operand: Value);
14144 }
14145
      // This is actually a DUPLANExx operation, which keeps everything in
      // vector registers.
14147
14148 SDValue Lane = Value.getOperand(i: 1);
14149 Value = Value.getOperand(i: 0);
14150 if (Value.getValueSizeInBits() == 64) {
14151 LLVM_DEBUG(
14152 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
14153 "widening it\n");
14154 Value = WidenVector(V64Reg: Value, DAG);
14155 }
14156
14157 unsigned Opcode = getDUPLANEOp(EltType: VT.getVectorElementType());
14158 return DAG.getNode(Opcode, DL: dl, VT, N1: Value, N2: Lane);
14159 }
14160
14161 if (VT.getVectorElementType().isFloatingPoint()) {
14162 SmallVector<SDValue, 8> Ops;
14163 EVT EltTy = VT.getVectorElementType();
14164 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
14165 EltTy == MVT::f64) && "Unsupported floating-point vector type");
14166 LLVM_DEBUG(
14167 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
14168 "BITCASTS, and try again\n");
14169 MVT NewType = MVT::getIntegerVT(BitWidth: EltTy.getSizeInBits());
14170 for (unsigned i = 0; i < NumElts; ++i)
14171 Ops.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: NewType, Operand: Op.getOperand(i)));
14172 EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: NewType, NumElements: NumElts);
14173 SDValue Val = DAG.getBuildVector(VT: VecVT, DL: dl, Ops);
14174 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
14175 Val.dump(););
14176 Val = LowerBUILD_VECTOR(Op: Val, DAG);
14177 if (Val.getNode())
14178 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Val);
14179 }
14180 }
14181
14182 // If we need to insert a small number of different non-constant elements and
14183 // the vector width is sufficiently large, prefer using DUP with the common
14184 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
14185 // skip the constant lane handling below.
14186 bool PreferDUPAndInsert =
14187 !isConstant && NumDifferentLanes >= 1 &&
14188 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
14189 NumDifferentLanes >= NumConstantLanes;
14190
  // If only one distinct constant value was used, possibly across several lanes,
14192 // start by splatting that value, then replace the non-constant lanes. This
14193 // is better than the default, which will perform a separate initialization
14194 // for each lane.
14195 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
14196 // Firstly, try to materialize the splat constant.
14197 SDValue Val = DAG.getSplatBuildVector(VT, DL: dl, Op: ConstantValue);
14198 unsigned BitSize = VT.getScalarSizeInBits();
14199 APInt ConstantValueAPInt(1, 0);
14200 if (auto *C = dyn_cast<ConstantSDNode>(Val&: ConstantValue))
14201 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(width: BitSize);
14202 if (!isNullConstant(V: ConstantValue) && !isNullFPConstant(V: ConstantValue) &&
14203 !ConstantValueAPInt.isAllOnes()) {
14204 Val = ConstantBuildVector(Op: Val, DAG, ST: Subtarget);
14205 if (!Val)
14206 // Otherwise, materialize the constant and splat it.
14207 Val = DAG.getNode(Opcode: AArch64ISD::DUP, DL: dl, VT, Operand: ConstantValue);
14208 }
14209
14210 // Now insert the non-constant lanes.
14211 for (unsigned i = 0; i < NumElts; ++i) {
14212 SDValue V = Op.getOperand(i);
14213 SDValue LaneIdx = DAG.getConstant(Val: i, DL: dl, VT: MVT::i64);
14214 if (!isIntOrFPConstant(V))
14215 // Note that type legalization likely mucked about with the VT of the
14216 // source operand, so we may have to convert it here before inserting.
14217 Val = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT, N1: Val, N2: V, N3: LaneIdx);
14218 }
14219 return Val;
14220 }
14221
14222 // This will generate a load from the constant pool.
14223 if (isConstant) {
14224 LLVM_DEBUG(
14225 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
14226 "expansion\n");
14227 return SDValue();
14228 }
14229
14230 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
14231 // v4i32s. This is really a truncate, which we can construct out of (legal)
14232 // concats and truncate nodes.
14233 if (SDValue M = ReconstructTruncateFromBuildVector(V: Op, DAG))
14234 return M;
14235
14236 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
14237 if (NumElts >= 4) {
14238 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
14239 return Shuffle;
14240
14241 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
14242 return Shuffle;
14243 }
14244
14245 if (PreferDUPAndInsert) {
14246 // First, build a constant vector with the common element.
14247 SmallVector<SDValue, 8> Ops(NumElts, Value);
14248 SDValue NewVector = LowerBUILD_VECTOR(Op: DAG.getBuildVector(VT, DL: dl, Ops), DAG);
14249 // Next, insert the elements that do not match the common value.
14250 for (unsigned I = 0; I < NumElts; ++I)
14251 if (Op.getOperand(i: I) != Value)
14252 NewVector =
14253 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT, N1: NewVector,
14254 N2: Op.getOperand(i: I), N3: DAG.getConstant(Val: I, DL: dl, VT: MVT::i64));
14255
14256 return NewVector;
14257 }
14258
14259 // If vector consists of two different values, try to generate two DUPs and
14260 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
14261 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
14262 SmallVector<SDValue, 2> Vals;
    // Check whether each value's consecutive count is half the number of
    // vector elements. In this case, we can use CONCAT_VECTORS. For example,
14265 //
14266 // canUseVECTOR_CONCAT = true;
14267 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
14268 // t24, t24, t24, t24, t24, t24, t24, t24
14269 //
14270 // canUseVECTOR_CONCAT = false;
14271 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
14272 // t24, t24, t24, t24, t24, t24, t24, t24
14273 bool canUseVECTOR_CONCAT = true;
14274 for (auto Pair : DifferentValueMap) {
14275 // Check different values have same length which is NumElts / 2.
14276 if (Pair.second != NumElts / 2)
14277 canUseVECTOR_CONCAT = false;
14278 Vals.push_back(Elt: Pair.first);
14279 }
14280
14281 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
14282 // CONCAT_VECTORs. For example,
14283 //
14284 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
14285 // t24, t24, t24, t24, t24, t24, t24, t24
14286 // ==>
14287 // t26: v8i8 = AArch64ISD::DUP t23
14288 // t28: v8i8 = AArch64ISD::DUP t24
14289 // t29: v16i8 = concat_vectors t26, t28
14290 if (canUseVECTOR_CONCAT) {
14291 EVT SubVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
14292 if (isTypeLegal(VT: SubVT) && SubVT.isVector() &&
14293 SubVT.getVectorNumElements() >= 2) {
14294 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
14295 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
14296 SDValue DUP1 =
14297 LowerBUILD_VECTOR(Op: DAG.getBuildVector(VT: SubVT, DL: dl, Ops: Ops1), DAG);
14298 SDValue DUP2 =
14299 LowerBUILD_VECTOR(Op: DAG.getBuildVector(VT: SubVT, DL: dl, Ops: Ops2), DAG);
14300 SDValue CONCAT_VECTORS =
14301 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: DUP1, N2: DUP2);
14302 return CONCAT_VECTORS;
14303 }
14304 }
14305
14306 // Let's try to generate VECTOR_SHUFFLE. For example,
14307 //
14308 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
14309 // ==>
14310 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
14311 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
14312 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
14313 if (NumElts >= 8) {
14314 SmallVector<int, 16> MaskVec;
      // Build the mask for the VECTOR_SHUFFLE.
14316 SDValue FirstLaneVal = Op.getOperand(i: 0);
14317 for (unsigned i = 0; i < NumElts; ++i) {
14318 SDValue Val = Op.getOperand(i);
14319 if (FirstLaneVal == Val)
14320 MaskVec.push_back(Elt: i);
14321 else
14322 MaskVec.push_back(Elt: i + NumElts);
14323 }
14324
14325 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
14326 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
14327 SDValue VEC1 = DAG.getBuildVector(VT, DL: dl, Ops: Ops1);
14328 SDValue VEC2 = DAG.getBuildVector(VT, DL: dl, Ops: Ops2);
14329 SDValue VECTOR_SHUFFLE =
14330 DAG.getVectorShuffle(VT, dl, N1: VEC1, N2: VEC2, Mask: MaskVec);
14331 return VECTOR_SHUFFLE;
14332 }
14333 }
14334
14335 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
14336 // know the default expansion would otherwise fall back on something even
14337 // worse. For a vector with one or two non-undef values, that's
14338 // scalar_to_vector for the elements followed by a shuffle (provided the
14339 // shuffle is valid for the target) and materialization element by element
14340 // on the stack followed by a load for everything else.
14341 if (!isConstant && !usesOnlyOneValue) {
14342 LLVM_DEBUG(
14343 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
14344 "of INSERT_VECTOR_ELT\n");
14345
14346 SDValue Vec = DAG.getUNDEF(VT);
14347 SDValue Op0 = Op.getOperand(i: 0);
14348 unsigned i = 0;
14349
14350 // Use SCALAR_TO_VECTOR for lane zero to
14351 // a) Avoid a RMW dependency on the full vector register, and
14352 // b) Allow the register coalescer to fold away the copy if the
14353 // value is already in an S or D register, and we're forced to emit an
14354 // INSERT_SUBREG that we can't fold anywhere.
14355 //
14356 // We also allow types like i8 and i16 which are illegal scalar but legal
14357 // vector element types. After type-legalization the inserted value is
14358 // extended (i32) and it is safe to cast them to the vector type by ignoring
14359 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
14360 if (!Op0.isUndef()) {
14361 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
14362 Vec = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT, Operand: Op0);
14363 ++i;
14364 }
14365 LLVM_DEBUG(if (i < NumElts) dbgs()
14366 << "Creating nodes for the other vector elements:\n";);
14367 for (; i < NumElts; ++i) {
14368 SDValue V = Op.getOperand(i);
14369 if (V.isUndef())
14370 continue;
14371 SDValue LaneIdx = DAG.getConstant(Val: i, DL: dl, VT: MVT::i64);
14372 Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT, N1: Vec, N2: V, N3: LaneIdx);
14373 }
14374 return Vec;
14375 }
14376
14377 LLVM_DEBUG(
14378 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
14379 "better alternative\n");
14380 return SDValue();
14381}
14382
14383SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
14384 SelectionDAG &DAG) const {
14385 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
14386 OverrideNEON: !Subtarget->isNeonAvailable()))
14387 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
14388
14389 assert(Op.getValueType().isScalableVector() &&
14390 isTypeLegal(Op.getValueType()) &&
14391 "Expected legal scalable vector type!");
14392
14393 if (isTypeLegal(VT: Op.getOperand(i: 0).getValueType())) {
14394 unsigned NumOperands = Op->getNumOperands();
14395 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
14396 "Unexpected number of operands in CONCAT_VECTORS");
14397
14398 if (NumOperands == 2)
14399 return Op;
14400
14401 // Concat each pair of subvectors and pack into the lower half of the array.
14402 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
14403 while (ConcatOps.size() > 1) {
14404 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
14405 SDValue V1 = ConcatOps[I];
14406 SDValue V2 = ConcatOps[I + 1];
14407 EVT SubVT = V1.getValueType();
14408 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
14409 ConcatOps[I / 2] =
14410 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT: PairVT, N1: V1, N2: V2);
14411 }
14412 ConcatOps.resize(N: ConcatOps.size() / 2);
14413 }
14414 return ConcatOps[0];
14415 }
14416
14417 return SDValue();
14418}
14419
14420SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
14421 SelectionDAG &DAG) const {
14422 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
14423
14424 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
14425 OverrideNEON: !Subtarget->isNeonAvailable()))
14426 return LowerFixedLengthInsertVectorElt(Op, DAG);
14427
14428 EVT VT = Op.getOperand(i: 0).getValueType();
14429
14430 if (VT.getScalarType() == MVT::i1) {
14431 EVT VectorVT = getPromotedVTForPredicate(VT);
14432 SDLoc DL(Op);
14433 SDValue ExtendedVector =
14434 DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 0), DL, VT: VectorVT);
14435 SDValue ExtendedValue =
14436 DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 1), DL,
14437 VT: VectorVT.getScalarType().getSizeInBits() < 32
14438 ? MVT::i32
14439 : VectorVT.getScalarType());
14440 ExtendedVector =
14441 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: VectorVT, N1: ExtendedVector,
14442 N2: ExtendedValue, N3: Op.getOperand(i: 2));
14443 return DAG.getAnyExtOrTrunc(Op: ExtendedVector, DL, VT);
14444 }
14445
14446 // Check for non-constant or out of range lane.
14447 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
14448 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
14449 return SDValue();
14450
14451 return Op;
14452}
14453
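// Lower EXTRACT_VECTOR_ELT. Predicate (i1) vectors are any-extended before
// extracting, extracts from 128-bit vectors are already legal, and extracts
// from 64-bit vectors are widened to operate on a 128-bit source.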
14454SDValue
14455AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
14456 SelectionDAG &DAG) const {
14457 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
14458 EVT VT = Op.getOperand(i: 0).getValueType();
14459
14460 if (VT.getScalarType() == MVT::i1) {
14461 // We can't directly extract from an SVE predicate; extend it first.
14462 // (This isn't the only possible lowering, but it's straightforward.)
14463 EVT VectorVT = getPromotedVTForPredicate(VT);
14464 SDLoc DL(Op);
14465 SDValue Extend =
14466 DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VectorVT, Operand: Op.getOperand(i: 0));
14467 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
14468 SDValue Extract = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ExtractTy,
14469 N1: Extend, N2: Op.getOperand(i: 1));
14470 return DAG.getAnyExtOrTrunc(Op: Extract, DL, VT: Op.getValueType());
14471 }
14472
14473 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
14474 return LowerFixedLengthExtractVectorElt(Op, DAG);
14475
14476 // Check for non-constant or out of range lane.
14477 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1));
14478 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
14479 return SDValue();
14480
14481 // Insertion/extraction are legal for V128 types.
14482 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
14483 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
14484 VT == MVT::v8f16 || VT == MVT::v8bf16)
14485 return Op;
14486
14487 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
14488 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
14489 VT != MVT::v4bf16)
14490 return SDValue();
14491
  // For V64 types, we perform extraction by expanding the value
  // to a V128 type and performing the extraction on that.
14494 SDLoc DL(Op);
14495 SDValue WideVec = WidenVector(V64Reg: Op.getOperand(i: 0), DAG);
14496 EVT WideTy = WideVec.getValueType();
14497
14498 EVT ExtrTy = WideTy.getVectorElementType();
14499 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
14500 ExtrTy = MVT::i32;
14501
14502 // For extractions, we just return the result directly.
14503 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ExtrTy, N1: WideVec,
14504 N2: Op.getOperand(i: 1));
14505}
14506
14507SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
14508 SelectionDAG &DAG) const {
14509 EVT VT = Op.getValueType();
14510 assert(VT.isFixedLengthVector() &&
14511 "Only cases that extract a fixed length vector are supported!");
14512 EVT InVT = Op.getOperand(i: 0).getValueType();
14513
14514 // If we don't have legal types yet, do nothing
14515 if (!isTypeLegal(VT: InVT))
14516 return SDValue();
14517
14518 if (InVT.is128BitVector()) {
14519 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
14520 unsigned Idx = Op.getConstantOperandVal(i: 1);
14521
14522 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
14523 if (Idx == 0)
14524 return Op;
14525
14526 // If this is extracting the upper 64-bits of a 128-bit vector, we match
14527 // that directly.
14528 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
14529 return Op;
14530 }
14531
14532 if (InVT.isScalableVector() ||
14533 useSVEForFixedLengthVectorVT(VT: InVT, OverrideNEON: !Subtarget->isNeonAvailable())) {
14534 SDLoc DL(Op);
14535 SDValue Vec = Op.getOperand(i: 0);
14536 SDValue Idx = Op.getOperand(i: 1);
14537
14538 EVT PackedVT = getPackedSVEVectorVT(VT: InVT.getVectorElementType());
14539 if (PackedVT != InVT) {
14540 // Pack input into the bottom part of an SVE register and try again.
14541 SDValue Container = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: PackedVT,
14542 N1: DAG.getUNDEF(VT: PackedVT), N2: Vec,
14543 N3: DAG.getVectorIdxConstant(Val: 0, DL));
14544 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Container, N2: Idx);
14545 }
14546
14547 // This will get matched by custom code during ISelDAGToDAG.
14548 if (isNullConstant(V: Idx))
14549 return Op;
14550
14551 assert(InVT.isScalableVector() && "Unexpected vector type!");
14552 // Move requested subvector to the start of the vector and try again.
14553 SDValue Splice = DAG.getNode(Opcode: ISD::VECTOR_SPLICE, DL, VT: InVT, N1: Vec, N2: Vec, N3: Idx);
14554 return convertFromScalableVector(DAG, VT, V: Splice);
14555 }
14556
14557 return SDValue();
14558}
14559
14560SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
14561 SelectionDAG &DAG) const {
14562 assert(Op.getValueType().isScalableVector() &&
14563 "Only expect to lower inserts into scalable vectors!");
14564
14565 EVT InVT = Op.getOperand(i: 1).getValueType();
14566 unsigned Idx = Op.getConstantOperandVal(i: 2);
14567
14568 SDValue Vec0 = Op.getOperand(i: 0);
14569 SDValue Vec1 = Op.getOperand(i: 1);
14570 SDLoc DL(Op);
14571 EVT VT = Op.getValueType();
14572
14573 if (InVT.isScalableVector()) {
14574 if (!isTypeLegal(VT))
14575 return SDValue();
14576
14577 // Break down insert_subvector into simpler parts.
14578 if (VT.getVectorElementType() == MVT::i1) {
14579 unsigned NumElts = VT.getVectorMinNumElements();
14580 EVT HalfVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
14581
14582 SDValue Lo, Hi;
14583 Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: Vec0,
14584 N2: DAG.getVectorIdxConstant(Val: 0, DL));
14585 Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: Vec0,
14586 N2: DAG.getVectorIdxConstant(Val: NumElts / 2, DL));
14587 if (Idx < (NumElts / 2))
14588 Lo = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: HalfVT, N1: Lo, N2: Vec1,
14589 N3: DAG.getVectorIdxConstant(Val: Idx, DL));
14590 else
14591 Hi = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: HalfVT, N1: Hi, N2: Vec1,
14592 N3: DAG.getVectorIdxConstant(Val: Idx - (NumElts / 2), DL));
14593
14594 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: Lo, N2: Hi);
14595 }
14596
14597 // Ensure the subvector is half the size of the main vector.
14598 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
14599 return SDValue();
14600
    // Here "narrow" and "wide" refer to the vector element types. After
    // "casting", both vectors must have the same bit length, so because the
    // subvector has fewer elements, those elements need to be bigger.
14604 EVT NarrowVT = getPackedSVEVectorVT(EC: VT.getVectorElementCount());
14605 EVT WideVT = getPackedSVEVectorVT(EC: InVT.getVectorElementCount());
14606
14607 // NOP cast operands to the largest legal vector of the same element count.
14608 if (VT.isFloatingPoint()) {
14609 Vec0 = getSVESafeBitCast(VT: NarrowVT, Op: Vec0, DAG);
14610 Vec1 = getSVESafeBitCast(VT: WideVT, Op: Vec1, DAG);
14611 } else {
14612 // Legal integer vectors are already their largest so Vec0 is fine as is.
14613 Vec1 = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: WideVT, Operand: Vec1);
14614 }
14615
14616 // To replace the top/bottom half of vector V with vector SubV we widen the
14617 // preserved half of V, concatenate this to SubV (the order depending on the
14618 // half being replaced) and then narrow the result.
14619 SDValue Narrow;
14620 if (Idx == 0) {
14621 SDValue HiVec0 = DAG.getNode(Opcode: AArch64ISD::UUNPKHI, DL, VT: WideVT, Operand: Vec0);
14622 Narrow = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: NarrowVT, N1: Vec1, N2: HiVec0);
14623 } else {
14624 assert(Idx == InVT.getVectorMinNumElements() &&
14625 "Invalid subvector index!");
14626 SDValue LoVec0 = DAG.getNode(Opcode: AArch64ISD::UUNPKLO, DL, VT: WideVT, Operand: Vec0);
14627 Narrow = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: NarrowVT, N1: LoVec0, N2: Vec1);
14628 }
14629
14630 return getSVESafeBitCast(VT, Op: Narrow, DAG);
14631 }
14632
14633 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
14634 // This will be matched by custom code during ISelDAGToDAG.
14635 if (Vec0.isUndef())
14636 return Op;
14637
14638 std::optional<unsigned> PredPattern =
14639 getSVEPredPatternFromNumElements(MinNumElts: InVT.getVectorNumElements());
14640 auto PredTy = VT.changeVectorElementType(EltVT: MVT::i1);
14641 SDValue PTrue = getPTrue(DAG, DL, VT: PredTy, Pattern: *PredPattern);
14642 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, V: Vec1);
14643 return DAG.getNode(Opcode: ISD::VSELECT, DL, VT, N1: PTrue, N2: ScalableVec1, N3: Vec0);
14644 }
14645
14646 return SDValue();
14647}
14648
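/// Return true if Op is a splat of a constant whose absolute value is a power
/// of two. On success SplatVal holds that (positive) value and Negated records
/// whether the original constant was negative.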
14649static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
14650 if (Op.getOpcode() != AArch64ISD::DUP &&
14651 Op.getOpcode() != ISD::SPLAT_VECTOR &&
14652 Op.getOpcode() != ISD::BUILD_VECTOR)
14653 return false;
14654
14655 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
14656 !isAllConstantBuildVector(PotentialBVec: Op, ConstVal&: SplatVal))
14657 return false;
14658
14659 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
14660 !isa<ConstantSDNode>(Val: Op->getOperand(Num: 0)))
14661 return false;
14662
14663 SplatVal = Op->getConstantOperandVal(Num: 0);
14664 if (Op.getValueType().getVectorElementType() != MVT::i64)
14665 SplatVal = (int32_t)SplatVal;
14666
14667 Negated = false;
14668 if (isPowerOf2_64(Value: SplatVal))
14669 return true;
14670
14671 Negated = true;
14672 if (isPowerOf2_64(Value: -SplatVal)) {
14673 SplatVal = -SplatVal;
14674 return true;
14675 }
14676
14677 return false;
14678}
14679
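// Lower vector SDIV/UDIV. Fixed-length types are handled via SVE, signed
// division by a power-of-two splat becomes a predicated arithmetic shift
// (negating the result when the divisor was negative), 32- and 64-bit element
// types map directly onto the predicated DIV nodes, and i8/i16 element types
// are widened because SVE has no byte or halfword divide.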
14680SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
14681 EVT VT = Op.getValueType();
14682 SDLoc dl(Op);
14683
14684 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
14685 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
14686
14687 assert(VT.isScalableVector() && "Expected a scalable vector.");
14688
14689 bool Signed = Op.getOpcode() == ISD::SDIV;
14690 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
14691
14692 bool Negated;
14693 uint64_t SplatVal;
14694 if (Signed && isPow2Splat(Op: Op.getOperand(i: 1), SplatVal, Negated)) {
14695 SDValue Pg = getPredicateForScalableVector(DAG, DL&: dl, VT);
14696 SDValue Res =
14697 DAG.getNode(Opcode: AArch64ISD::SRAD_MERGE_OP1, DL: dl, VT, N1: Pg, N2: Op->getOperand(Num: 0),
14698 N3: DAG.getTargetConstant(Val: Log2_64(Value: SplatVal), DL: dl, VT: MVT::i32));
14699 if (Negated)
14700 Res = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT, N1: DAG.getConstant(Val: 0, DL: dl, VT), N2: Res);
14701
14702 return Res;
14703 }
14704
14705 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
14706 return LowerToPredicatedOp(Op, DAG, NewOp: PredOpcode);
14707
14708 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
14709 // operations, and truncate the result.
14710 EVT WidenedVT;
14711 if (VT == MVT::nxv16i8)
14712 WidenedVT = MVT::nxv8i16;
14713 else if (VT == MVT::nxv8i16)
14714 WidenedVT = MVT::nxv4i32;
14715 else
14716 llvm_unreachable("Unexpected Custom DIV operation");
14717
14718 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
14719 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
14720 SDValue Op0Lo = DAG.getNode(Opcode: UnpkLo, DL: dl, VT: WidenedVT, Operand: Op.getOperand(i: 0));
14721 SDValue Op1Lo = DAG.getNode(Opcode: UnpkLo, DL: dl, VT: WidenedVT, Operand: Op.getOperand(i: 1));
14722 SDValue Op0Hi = DAG.getNode(Opcode: UnpkHi, DL: dl, VT: WidenedVT, Operand: Op.getOperand(i: 0));
14723 SDValue Op1Hi = DAG.getNode(Opcode: UnpkHi, DL: dl, VT: WidenedVT, Operand: Op.getOperand(i: 1));
14724 SDValue ResultLo = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: WidenedVT, N1: Op0Lo, N2: Op1Lo);
14725 SDValue ResultHi = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: WidenedVT, N1: Op0Hi, N2: Op1Hi);
14726 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL: dl, VT, N1: ResultLo, N2: ResultHi);
14727}
14728
14729bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
14730 EVT VT, unsigned DefinedValues) const {
14731 if (!Subtarget->isNeonAvailable())
14732 return false;
14733 return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
14734}
14735
14736bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
14737 // Currently no fixed length shuffles that require SVE are legal.
14738 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
14739 return false;
14740
14741 if (VT.getVectorNumElements() == 4 &&
14742 (VT.is128BitVector() || VT.is64BitVector())) {
14743 unsigned Cost = getPerfectShuffleCost(M);
14744 if (Cost <= 1)
14745 return true;
14746 }
14747
14748 bool DummyBool;
14749 int DummyInt;
14750 unsigned DummyUnsigned;
14751
14752 unsigned EltSize = VT.getScalarSizeInBits();
14753 unsigned NumElts = VT.getVectorNumElements();
14754 return (ShuffleVectorSDNode::isSplatMask(Mask: &M[0], VT) ||
14755 isREVMask(M, EltSize, NumElts, BlockSize: 64) ||
14756 isREVMask(M, EltSize, NumElts, BlockSize: 32) ||
14757 isREVMask(M, EltSize, NumElts, BlockSize: 16) ||
14758 isEXTMask(M, VT, ReverseEXT&: DummyBool, Imm&: DummyUnsigned) ||
14759 isTRNMask(M, NumElts, WhichResult&: DummyUnsigned) ||
14760 isUZPMask(M, NumElts, WhichResultOut&: DummyUnsigned) ||
14761 isZIPMask(M, NumElts, WhichResultOut&: DummyUnsigned) ||
14762 isTRN_v_undef_Mask(M, VT, WhichResult&: DummyUnsigned) ||
14763 isUZP_v_undef_Mask(M, VT, WhichResult&: DummyUnsigned) ||
14764 isZIP_v_undef_Mask(M, VT, WhichResult&: DummyUnsigned) ||
14765 isINSMask(M, NumInputElements: NumElts, DstIsLeft&: DummyBool, Anomaly&: DummyInt) ||
14766 isConcatMask(Mask: M, VT, SplitLHS: VT.getSizeInBits() == 128));
14767}
14768
14769bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
14770 EVT VT) const {
  // Just delegate to the generic legality check; clear masks aren't special.
14772 return isShuffleMaskLegal(M, VT);
14773}
14774
14775/// getVShiftImm - Check if this is a valid build_vector for the immediate
14776/// operand of a vector shift operation, where all the elements of the
14777/// build_vector must have the same constant integer value.
14778static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
14779 // Ignore bit_converts.
14780 while (Op.getOpcode() == ISD::BITCAST)
14781 Op = Op.getOperand(i: 0);
14782 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: Op.getNode());
14783 APInt SplatBits, SplatUndef;
14784 unsigned SplatBitSize;
14785 bool HasAnyUndefs;
14786 if (!BVN || !BVN->isConstantSplat(SplatValue&: SplatBits, SplatUndef, SplatBitSize,
14787 HasAnyUndefs, MinSplatBits: ElementBits) ||
14788 SplatBitSize > ElementBits)
14789 return false;
14790 Cnt = SplatBits.getSExtValue();
14791 return true;
14792}
14793
14794/// isVShiftLImm - Check if this is a valid build_vector for the immediate
14795/// operand of a vector shift left operation. That value must be in the range:
14796/// 0 <= Value < ElementBits for a left shift; or
14797/// 0 <= Value <= ElementBits for a long left shift.
14798static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
14799 assert(VT.isVector() && "vector shift count is not a vector type");
14800 int64_t ElementBits = VT.getScalarSizeInBits();
14801 if (!getVShiftImm(Op, ElementBits, Cnt))
14802 return false;
14803 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
14804}
14805
14806/// isVShiftRImm - Check if this is a valid build_vector for the immediate
14807/// operand of a vector shift right operation. The value must be in the range:
///   1 <= Value <= ElementBits for a right shift; or
///   1 <= Value <= ElementBits/2 for a narrowing right shift.
14809static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
14810 assert(VT.isVector() && "vector shift count is not a vector type");
14811 int64_t ElementBits = VT.getScalarSizeInBits();
14812 if (!getVShiftImm(Op, ElementBits, Cnt))
14813 return false;
14814 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
14815}
14816
14817SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
14818 SelectionDAG &DAG) const {
14819 EVT VT = Op.getValueType();
14820
14821 if (VT.getScalarType() == MVT::i1) {
14822 // Lower i1 truncate to `(x & 1) != 0`.
14823 SDLoc dl(Op);
14824 EVT OpVT = Op.getOperand(i: 0).getValueType();
14825 SDValue Zero = DAG.getConstant(Val: 0, DL: dl, VT: OpVT);
14826 SDValue One = DAG.getConstant(Val: 1, DL: dl, VT: OpVT);
14827 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: OpVT, N1: Op.getOperand(i: 0), N2: One);
14828 return DAG.getSetCC(DL: dl, VT, LHS: And, RHS: Zero, Cond: ISD::SETNE);
14829 }
14830
14831 if (!VT.isVector() || VT.isScalableVector())
14832 return SDValue();
14833
14834 if (useSVEForFixedLengthVectorVT(VT: Op.getOperand(i: 0).getValueType(),
14835 OverrideNEON: !Subtarget->isNeonAvailable()))
14836 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
14837
14838 return SDValue();
14839}
14840
// Check if we can lower this SRL to a rounding shift instruction. ResVT is
// possibly a truncated type; it tells how many bits of the value are to be
// used.
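// The pattern being matched is
//   srl (add X, (1 << (ShiftValue - 1))), ShiftValue
// i.e. a rounding shift right of X by ShiftValue.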
14844static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
14845 SelectionDAG &DAG,
14846 unsigned &ShiftValue,
14847 SDValue &RShOperand) {
14848 if (Shift->getOpcode() != ISD::SRL)
14849 return false;
14850
14851 EVT VT = Shift.getValueType();
14852 assert(VT.isScalableVT());
14853
14854 auto ShiftOp1 =
14855 dyn_cast_or_null<ConstantSDNode>(Val: DAG.getSplatValue(V: Shift->getOperand(Num: 1)));
14856 if (!ShiftOp1)
14857 return false;
14858
14859 ShiftValue = ShiftOp1->getZExtValue();
14860 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
14861 return false;
14862
14863 SDValue Add = Shift->getOperand(Num: 0);
14864 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
14865 return false;
14866
14867 assert(ResVT.getScalarSizeInBits() <= VT.getScalarSizeInBits() &&
14868 "ResVT must be truncated or same type as the shift.");
14869 // Check if an overflow can lead to incorrect results.
14870 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
14871 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
14872 return false;
14873
14874 auto AddOp1 =
14875 dyn_cast_or_null<ConstantSDNode>(Val: DAG.getSplatValue(V: Add->getOperand(Num: 1)));
14876 if (!AddOp1)
14877 return false;
14878 uint64_t AddValue = AddOp1->getZExtValue();
14879 if (AddValue != 1ULL << (ShiftValue - 1))
14880 return false;
14881
14882 RShOperand = Add->getOperand(Num: 0);
14883 return true;
14884}
14885
14886SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
14887 SelectionDAG &DAG) const {
14888 EVT VT = Op.getValueType();
14889 SDLoc DL(Op);
14890 int64_t Cnt;
14891
14892 if (!Op.getOperand(i: 1).getValueType().isVector())
14893 return Op;
14894 unsigned EltSize = VT.getScalarSizeInBits();
14895
14896 switch (Op.getOpcode()) {
14897 case ISD::SHL:
14898 if (VT.isScalableVector() ||
14899 useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
14900 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SHL_PRED);
14901
14902 if (isVShiftLImm(Op: Op.getOperand(i: 1), VT, isLong: false, Cnt) && Cnt < EltSize)
14903 return DAG.getNode(Opcode: AArch64ISD::VSHL, DL, VT, N1: Op.getOperand(i: 0),
14904 N2: DAG.getConstant(Val: Cnt, DL, VT: MVT::i32));
14905 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT,
14906 N1: DAG.getConstant(Val: Intrinsic::aarch64_neon_ushl, DL,
14907 VT: MVT::i32),
14908 N2: Op.getOperand(i: 0), N3: Op.getOperand(i: 1));
14909 case ISD::SRA:
14910 case ISD::SRL:
14911 if (VT.isScalableVector() &&
14912 (Subtarget->hasSVE2() ||
14913 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
14914 SDValue RShOperand;
14915 unsigned ShiftValue;
14916 if (canLowerSRLToRoundingShiftForVT(Shift: Op, ResVT: VT, DAG, ShiftValue, RShOperand))
14917 return DAG.getNode(Opcode: AArch64ISD::URSHR_I_PRED, DL, VT,
14918 N1: getPredicateForVector(DAG, DL, VT), N2: RShOperand,
14919 N3: DAG.getTargetConstant(Val: ShiftValue, DL, VT: MVT::i32));
14920 }
14921
14922 if (VT.isScalableVector() ||
14923 useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) {
14924 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
14925 : AArch64ISD::SRL_PRED;
14926 return LowerToPredicatedOp(Op, DAG, NewOp: Opc);
14927 }
14928
14929 // Right shift immediate
14930 if (isVShiftRImm(Op: Op.getOperand(i: 1), VT, isNarrow: false, Cnt) && Cnt < EltSize) {
14931 unsigned Opc =
14932 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
14933 return DAG.getNode(Opcode: Opc, DL, VT, N1: Op.getOperand(i: 0),
14934 N2: DAG.getConstant(Val: Cnt, DL, VT: MVT::i32), Flags: Op->getFlags());
14935 }
14936
    // Right shift register. Note that there is no shift-right-register
    // instruction, but the shift-left-register instruction takes a signed
    // value, where negative amounts specify a right shift.
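    // e.g. (srl X, Y) becomes (ushl X, (sub 0, Y)), and similarly
    // (sra X, Y) becomes (sshl X, (sub 0, Y)).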
14940 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
14941 : Intrinsic::aarch64_neon_ushl;
14942 // negate the shift amount
14943 SDValue NegShift = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT),
14944 N2: Op.getOperand(i: 1));
14945 SDValue NegShiftLeft =
14946 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT,
14947 N1: DAG.getConstant(Val: Opc, DL, VT: MVT::i32), N2: Op.getOperand(i: 0),
14948 N3: NegShift);
14949 return NegShiftLeft;
14950 }
14951
14952 llvm_unreachable("unexpected shift opcode");
14953}
14954
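/// Emit a NEON vector comparison of LHS against RHS for the AArch64 condition
/// code CC, preferring the compare-against-zero forms when RHS is a constant
/// zero splat. Returns an empty SDValue when CC cannot be handled here (for
/// example FP LE/LT when NaNs must be honoured).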
14955static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
14956 AArch64CC::CondCode CC, bool NoNans, EVT VT,
14957 const SDLoc &dl, SelectionDAG &DAG) {
14958 EVT SrcVT = LHS.getValueType();
14959 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
14960 "function only supposed to emit natural comparisons");
14961
14962 APInt SplatValue;
14963 APInt SplatUndef;
14964 unsigned SplatBitSize = 0;
14965 bool HasAnyUndefs;
14966
14967 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: RHS.getNode());
14968 bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
14969 SplatBitSize, HasAnyUndefs);
14970
14971 bool IsZero = IsCnst && SplatValue == 0;
14972 bool IsOne =
14973 IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
14974 bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
14975
14976 if (SrcVT.getVectorElementType().isFloatingPoint()) {
14977 switch (CC) {
14978 default:
14979 return SDValue();
14980 case AArch64CC::NE: {
14981 SDValue Fcmeq;
14982 if (IsZero)
14983 Fcmeq = DAG.getNode(Opcode: AArch64ISD::FCMEQz, DL: dl, VT, Operand: LHS);
14984 else
14985 Fcmeq = DAG.getNode(Opcode: AArch64ISD::FCMEQ, DL: dl, VT, N1: LHS, N2: RHS);
14986 return DAG.getNOT(DL: dl, Val: Fcmeq, VT);
14987 }
14988 case AArch64CC::EQ:
14989 if (IsZero)
14990 return DAG.getNode(Opcode: AArch64ISD::FCMEQz, DL: dl, VT, Operand: LHS);
14991 return DAG.getNode(Opcode: AArch64ISD::FCMEQ, DL: dl, VT, N1: LHS, N2: RHS);
14992 case AArch64CC::GE:
14993 if (IsZero)
14994 return DAG.getNode(Opcode: AArch64ISD::FCMGEz, DL: dl, VT, Operand: LHS);
14995 return DAG.getNode(Opcode: AArch64ISD::FCMGE, DL: dl, VT, N1: LHS, N2: RHS);
14996 case AArch64CC::GT:
14997 if (IsZero)
14998 return DAG.getNode(Opcode: AArch64ISD::FCMGTz, DL: dl, VT, Operand: LHS);
14999 return DAG.getNode(Opcode: AArch64ISD::FCMGT, DL: dl, VT, N1: LHS, N2: RHS);
15000 case AArch64CC::LE:
15001 if (!NoNans)
15002 return SDValue();
      // If we ignore NaNs then we can use the LS implementation.
15004 [[fallthrough]];
15005 case AArch64CC::LS:
15006 if (IsZero)
15007 return DAG.getNode(Opcode: AArch64ISD::FCMLEz, DL: dl, VT, Operand: LHS);
15008 return DAG.getNode(Opcode: AArch64ISD::FCMGE, DL: dl, VT, N1: RHS, N2: LHS);
15009 case AArch64CC::LT:
15010 if (!NoNans)
15011 return SDValue();
      // If we ignore NaNs then we can use the MI implementation.
15013 [[fallthrough]];
15014 case AArch64CC::MI:
15015 if (IsZero)
15016 return DAG.getNode(Opcode: AArch64ISD::FCMLTz, DL: dl, VT, Operand: LHS);
15017 return DAG.getNode(Opcode: AArch64ISD::FCMGT, DL: dl, VT, N1: RHS, N2: LHS);
15018 }
15019 }
15020
15021 switch (CC) {
15022 default:
15023 return SDValue();
15024 case AArch64CC::NE: {
15025 SDValue Cmeq;
15026 if (IsZero)
15027 Cmeq = DAG.getNode(Opcode: AArch64ISD::CMEQz, DL: dl, VT, Operand: LHS);
15028 else
15029 Cmeq = DAG.getNode(Opcode: AArch64ISD::CMEQ, DL: dl, VT, N1: LHS, N2: RHS);
15030 return DAG.getNOT(DL: dl, Val: Cmeq, VT);
15031 }
15032 case AArch64CC::EQ:
15033 if (IsZero)
15034 return DAG.getNode(Opcode: AArch64ISD::CMEQz, DL: dl, VT, Operand: LHS);
15035 return DAG.getNode(Opcode: AArch64ISD::CMEQ, DL: dl, VT, N1: LHS, N2: RHS);
15036 case AArch64CC::GE:
15037 if (IsZero)
15038 return DAG.getNode(Opcode: AArch64ISD::CMGEz, DL: dl, VT, Operand: LHS);
15039 return DAG.getNode(Opcode: AArch64ISD::CMGE, DL: dl, VT, N1: LHS, N2: RHS);
15040 case AArch64CC::GT:
15041 if (IsZero)
15042 return DAG.getNode(Opcode: AArch64ISD::CMGTz, DL: dl, VT, Operand: LHS);
15043 if (IsMinusOne)
      return DAG.getNode(Opcode: AArch64ISD::CMGEz, DL: dl, VT, Operand: LHS);
15045 return DAG.getNode(Opcode: AArch64ISD::CMGT, DL: dl, VT, N1: LHS, N2: RHS);
15046 case AArch64CC::LE:
15047 if (IsZero)
15048 return DAG.getNode(Opcode: AArch64ISD::CMLEz, DL: dl, VT, Operand: LHS);
15049 return DAG.getNode(Opcode: AArch64ISD::CMGE, DL: dl, VT, N1: RHS, N2: LHS);
15050 case AArch64CC::LS:
15051 return DAG.getNode(Opcode: AArch64ISD::CMHS, DL: dl, VT, N1: RHS, N2: LHS);
15052 case AArch64CC::LO:
15053 return DAG.getNode(Opcode: AArch64ISD::CMHI, DL: dl, VT, N1: RHS, N2: LHS);
15054 case AArch64CC::LT:
15055 if (IsZero)
15056 return DAG.getNode(Opcode: AArch64ISD::CMLTz, DL: dl, VT, Operand: LHS);
15057 if (IsOne)
15058 return DAG.getNode(Opcode: AArch64ISD::CMLEz, DL: dl, VT, Operand: LHS);
15059 return DAG.getNode(Opcode: AArch64ISD::CMGT, DL: dl, VT, N1: RHS, N2: LHS);
15060 case AArch64CC::HI:
15061 return DAG.getNode(Opcode: AArch64ISD::CMHI, DL: dl, VT, N1: LHS, N2: RHS);
15062 case AArch64CC::HS:
15063 return DAG.getNode(Opcode: AArch64ISD::CMHS, DL: dl, VT, N1: LHS, N2: RHS);
15064 }
15065}
15066
15067SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
15068 SelectionDAG &DAG) const {
15069 if (Op.getValueType().isScalableVector())
15070 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SETCC_MERGE_ZERO);
15071
15072 if (useSVEForFixedLengthVectorVT(VT: Op.getOperand(i: 0).getValueType(),
15073 OverrideNEON: !Subtarget->isNeonAvailable()))
15074 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
15075
15076 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
15077 SDValue LHS = Op.getOperand(i: 0);
15078 SDValue RHS = Op.getOperand(i: 1);
15079 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
15080 SDLoc dl(Op);
15081
15082 if (LHS.getValueType().getVectorElementType().isInteger()) {
15083 assert(LHS.getValueType() == RHS.getValueType());
15084 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
15085 SDValue Cmp =
15086 EmitVectorComparison(LHS, RHS, CC: AArch64CC, NoNans: false, VT: CmpVT, dl, DAG);
15087 return DAG.getSExtOrTrunc(Op: Cmp, DL: dl, VT: Op.getValueType());
15088 }
15089
15090 // Lower isnan(x) | isnan(never-nan) to x != x.
15091 // Lower !isnan(x) & !isnan(never-nan) to x == x.
15092 if (CC == ISD::SETUO || CC == ISD::SETO) {
15093 bool OneNaN = false;
15094 if (LHS == RHS) {
15095 OneNaN = true;
15096 } else if (DAG.isKnownNeverNaN(Op: RHS)) {
15097 OneNaN = true;
15098 RHS = LHS;
15099 } else if (DAG.isKnownNeverNaN(Op: LHS)) {
15100 OneNaN = true;
15101 LHS = RHS;
15102 }
15103 if (OneNaN) {
15104 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
15105 }
15106 }
15107
15108 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
15109
  // Make v4f16 (only) fcmp operations utilise vector instructions.
  // v8f16 support will be a little more complicated.
15112 if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) ||
15113 LHS.getValueType().getVectorElementType() == MVT::bf16) {
15114 if (LHS.getValueType().getVectorNumElements() == 4) {
15115 LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::v4f32, Operand: LHS);
15116 RHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::v4f32, Operand: RHS);
15117 SDValue NewSetcc = DAG.getSetCC(DL: dl, VT: MVT::v4i16, LHS, RHS, Cond: CC);
15118 DAG.ReplaceAllUsesWith(From: Op, To: NewSetcc);
15119 CmpVT = MVT::v4i32;
15120 } else
15121 return SDValue();
15122 }
15123
15124 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
15125 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
15126 LHS.getValueType().getVectorElementType() != MVT::f128);
15127
15128 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
15129 // clean. Some of them require two branches to implement.
15130 AArch64CC::CondCode CC1, CC2;
15131 bool ShouldInvert;
15132 changeVectorFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2, Invert&: ShouldInvert);
15133
  bool NoNaNs =
      getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
15135 SDValue Cmp =
15136 EmitVectorComparison(LHS, RHS, CC: CC1, NoNans: NoNaNs, VT: CmpVT, dl, DAG);
15137 if (!Cmp.getNode())
15138 return SDValue();
15139
15140 if (CC2 != AArch64CC::AL) {
15141 SDValue Cmp2 =
15142 EmitVectorComparison(LHS, RHS, CC: CC2, NoNans: NoNaNs, VT: CmpVT, dl, DAG);
15143 if (!Cmp2.getNode())
15144 return SDValue();
15145
15146 Cmp = DAG.getNode(Opcode: ISD::OR, DL: dl, VT: CmpVT, N1: Cmp, N2: Cmp2);
15147 }
15148
15149 Cmp = DAG.getSExtOrTrunc(Op: Cmp, DL: dl, VT: Op.getValueType());
15150
15151 if (ShouldInvert)
15152 Cmp = DAG.getNOT(DL: dl, Val: Cmp, VT: Cmp.getValueType());
15153
15154 return Cmp;
15155}
15156
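// Emit the across-lanes reduction node Op over ScalarOp's vector operand and
// extract lane 0 to produce the scalar result.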
15157static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
15158 SelectionDAG &DAG) {
15159 SDValue VecOp = ScalarOp.getOperand(i: 0);
15160 auto Rdx = DAG.getNode(Opcode: Op, DL, VT: VecOp.getSimpleValueType(), Operand: VecOp);
15161 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ScalarOp.getValueType(), N1: Rdx,
15162 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
15163}
15164
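// Lower VECREDUCE_AND/OR/XOR for NEON. Boolean (i1) vectors are sign- or
// any-extended and reduced with UMIN/UMAX/ADD; other vectors are split down
// to 64 bits, bitcast to a scalar, and folded together using shifts.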
15165static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
15166 SDLoc DL, SelectionDAG &DAG) {
15167 unsigned ScalarOpcode;
15168 switch (Opcode) {
15169 case ISD::VECREDUCE_AND:
15170 ScalarOpcode = ISD::AND;
15171 break;
15172 case ISD::VECREDUCE_OR:
15173 ScalarOpcode = ISD::OR;
15174 break;
15175 case ISD::VECREDUCE_XOR:
15176 ScalarOpcode = ISD::XOR;
15177 break;
15178 default:
15179 llvm_unreachable("Expected bitwise vector reduction");
15180 return SDValue();
15181 }
15182
15183 EVT VecVT = Vec.getValueType();
15184 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
15185 "Expected power-of-2 length vector");
15186
15187 EVT ElemVT = VecVT.getVectorElementType();
15188
15189 SDValue Result;
15190 unsigned NumElems = VecVT.getVectorNumElements();
15191
15192 // Special case for boolean reductions
15193 if (ElemVT == MVT::i1) {
15194 // Split large vectors into smaller ones
15195 if (NumElems > 16) {
15196 SDValue Lo, Hi;
15197 std::tie(args&: Lo, args&: Hi) = DAG.SplitVector(N: Vec, DL);
15198 EVT HalfVT = Lo.getValueType();
15199 SDValue HalfVec = DAG.getNode(Opcode: ScalarOpcode, DL, VT: HalfVT, N1: Lo, N2: Hi);
15200 return getVectorBitwiseReduce(Opcode, Vec: HalfVec, VT, DL, DAG);
15201 }
15202
15203 // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
15204 // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
15205 // this element size leads to the best codegen, since e.g. setcc results
15206 // might need to be truncated otherwise.
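    // For example, an AND reduction of <4 x i1> becomes a sign_extend to
    // <4 x i16>, a VECREDUCE_UMIN of that, and a truncate back to i1.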
15207 EVT ExtendedVT = MVT::getIntegerVT(BitWidth: std::max(a: 64u / NumElems, b: 8u));
15208
15209 // any_ext doesn't work with umin/umax, so only use it for uadd.
15210 unsigned ExtendOp =
15211 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
15212 SDValue Extended = DAG.getNode(
15213 Opcode: ExtendOp, DL, VT: VecVT.changeVectorElementType(EltVT: ExtendedVT), Operand: Vec);
15214 switch (ScalarOpcode) {
15215 case ISD::AND:
15216 Result = DAG.getNode(Opcode: ISD::VECREDUCE_UMIN, DL, VT: ExtendedVT, Operand: Extended);
15217 break;
15218 case ISD::OR:
15219 Result = DAG.getNode(Opcode: ISD::VECREDUCE_UMAX, DL, VT: ExtendedVT, Operand: Extended);
15220 break;
15221 case ISD::XOR:
15222 Result = DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: ExtendedVT, Operand: Extended);
15223 break;
15224 default:
15225 llvm_unreachable("Unexpected Opcode");
15226 }
15227
15228 Result = DAG.getAnyExtOrTrunc(Op: Result, DL, VT: MVT::i1);
15229 } else {
15230 // Iteratively split the vector in half and combine using the bitwise
15231 // operation until it fits in a 64 bit register.
15232 while (VecVT.getSizeInBits() > 64) {
15233 SDValue Lo, Hi;
15234 std::tie(args&: Lo, args&: Hi) = DAG.SplitVector(N: Vec, DL);
15235 VecVT = Lo.getValueType();
15236 NumElems = VecVT.getVectorNumElements();
15237 Vec = DAG.getNode(Opcode: ScalarOpcode, DL, VT: VecVT, N1: Lo, N2: Hi);
15238 }
15239
15240 EVT ScalarVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: VecVT.getSizeInBits());
15241
15242 // Do the remaining work on a scalar since it allows the code generator to
15243 // combine the shift and bitwise operation into one instruction and since
15244 // integer instructions can have higher throughput than vector instructions.
15245 SDValue Scalar = DAG.getBitcast(VT: ScalarVT, V: Vec);
15246
15247 // Iteratively combine the lower and upper halves of the scalar using the
15248 // bitwise operation, halving the relevant region of the scalar in each
15249 // iteration, until the relevant region is just one element of the original
15250 // vector.
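    // For example, a v8i8 reduction folds the i64 scalar with itself shifted
    // right by 32, then 16, then 8 bits.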
15251 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
15252 SDValue ShiftAmount =
15253 DAG.getConstant(Val: Shift * ElemVT.getSizeInBits(), DL, VT: MVT::i64);
15254 SDValue Shifted =
15255 DAG.getNode(Opcode: ISD::SRL, DL, VT: ScalarVT, N1: Scalar, N2: ShiftAmount);
15256 Scalar = DAG.getNode(Opcode: ScalarOpcode, DL, VT: ScalarVT, N1: Scalar, N2: Shifted);
15257 }
15258
15259 Result = DAG.getAnyExtOrTrunc(Op: Scalar, DL, VT: ElemVT);
15260 }
15261
15262 return DAG.getAnyExtOrTrunc(Op: Result, DL, VT);
15263}
15264
15265SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
15266 SelectionDAG &DAG) const {
15267 SDValue Src = Op.getOperand(i: 0);
15268
15269 // Try to lower fixed length reductions to SVE.
15270 EVT SrcVT = Src.getValueType();
15271 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
15272 Op.getOpcode() == ISD::VECREDUCE_AND ||
15273 Op.getOpcode() == ISD::VECREDUCE_OR ||
15274 Op.getOpcode() == ISD::VECREDUCE_XOR ||
15275 Op.getOpcode() == ISD::VECREDUCE_FADD ||
15276 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
15277 SrcVT.getVectorElementType() == MVT::i64);
15278 if (SrcVT.isScalableVector() ||
15279 useSVEForFixedLengthVectorVT(
15280 VT: SrcVT, OverrideNEON: OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
15281
15282 if (SrcVT.getVectorElementType() == MVT::i1)
15283 return LowerPredReductionToSVE(ScalarOp: Op, DAG);
15284
15285 switch (Op.getOpcode()) {
15286 case ISD::VECREDUCE_ADD:
15287 return LowerReductionToSVE(Opcode: AArch64ISD::UADDV_PRED, ScalarOp: Op, DAG);
15288 case ISD::VECREDUCE_AND:
15289 return LowerReductionToSVE(Opcode: AArch64ISD::ANDV_PRED, ScalarOp: Op, DAG);
15290 case ISD::VECREDUCE_OR:
15291 return LowerReductionToSVE(Opcode: AArch64ISD::ORV_PRED, ScalarOp: Op, DAG);
15292 case ISD::VECREDUCE_SMAX:
15293 return LowerReductionToSVE(Opcode: AArch64ISD::SMAXV_PRED, ScalarOp: Op, DAG);
15294 case ISD::VECREDUCE_SMIN:
15295 return LowerReductionToSVE(Opcode: AArch64ISD::SMINV_PRED, ScalarOp: Op, DAG);
15296 case ISD::VECREDUCE_UMAX:
15297 return LowerReductionToSVE(Opcode: AArch64ISD::UMAXV_PRED, ScalarOp: Op, DAG);
15298 case ISD::VECREDUCE_UMIN:
15299 return LowerReductionToSVE(Opcode: AArch64ISD::UMINV_PRED, ScalarOp: Op, DAG);
15300 case ISD::VECREDUCE_XOR:
15301 return LowerReductionToSVE(Opcode: AArch64ISD::EORV_PRED, ScalarOp: Op, DAG);
15302 case ISD::VECREDUCE_FADD:
15303 return LowerReductionToSVE(Opcode: AArch64ISD::FADDV_PRED, ScalarOp: Op, DAG);
15304 case ISD::VECREDUCE_FMAX:
15305 return LowerReductionToSVE(Opcode: AArch64ISD::FMAXNMV_PRED, ScalarOp: Op, DAG);
15306 case ISD::VECREDUCE_FMIN:
15307 return LowerReductionToSVE(Opcode: AArch64ISD::FMINNMV_PRED, ScalarOp: Op, DAG);
15308 case ISD::VECREDUCE_FMAXIMUM:
15309 return LowerReductionToSVE(Opcode: AArch64ISD::FMAXV_PRED, ScalarOp: Op, DAG);
15310 case ISD::VECREDUCE_FMINIMUM:
15311 return LowerReductionToSVE(Opcode: AArch64ISD::FMINV_PRED, ScalarOp: Op, DAG);
15312 default:
15313 llvm_unreachable("Unhandled fixed length reduction");
15314 }
15315 }
15316
15317 // Lower NEON reductions.
15318 SDLoc dl(Op);
15319 switch (Op.getOpcode()) {
15320 case ISD::VECREDUCE_AND:
15321 case ISD::VECREDUCE_OR:
15322 case ISD::VECREDUCE_XOR:
15323 return getVectorBitwiseReduce(Opcode: Op.getOpcode(), Vec: Op.getOperand(i: 0),
15324 VT: Op.getValueType(), DL: dl, DAG);
15325 case ISD::VECREDUCE_ADD:
15326 return getReductionSDNode(Op: AArch64ISD::UADDV, DL: dl, ScalarOp: Op, DAG);
15327 case ISD::VECREDUCE_SMAX:
15328 return getReductionSDNode(Op: AArch64ISD::SMAXV, DL: dl, ScalarOp: Op, DAG);
15329 case ISD::VECREDUCE_SMIN:
15330 return getReductionSDNode(Op: AArch64ISD::SMINV, DL: dl, ScalarOp: Op, DAG);
15331 case ISD::VECREDUCE_UMAX:
15332 return getReductionSDNode(Op: AArch64ISD::UMAXV, DL: dl, ScalarOp: Op, DAG);
15333 case ISD::VECREDUCE_UMIN:
15334 return getReductionSDNode(Op: AArch64ISD::UMINV, DL: dl, ScalarOp: Op, DAG);
15335 default:
15336 llvm_unreachable("Unhandled reduction");
15337 }
15338}
15339
15340SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
15341 SelectionDAG &DAG) const {
15342 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
15343 // No point replacing if we don't have the relevant instruction/libcall anyway
15344 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
15345 return SDValue();
15346
15347 // LSE has an atomic load-clear instruction, but not a load-and.
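  // Use the identity (x AND y) == (x CLR NOT(y)): invert the RHS and emit an
  // ATOMIC_LOAD_CLR instead.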
15348 SDLoc dl(Op);
15349 MVT VT = Op.getSimpleValueType();
15350 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
15351 SDValue RHS = Op.getOperand(i: 2);
15352 AtomicSDNode *AN = cast<AtomicSDNode>(Val: Op.getNode());
15353 RHS = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT, N1: DAG.getConstant(Val: -1ULL, DL: dl, VT), N2: RHS);
15354 return DAG.getAtomic(Opcode: ISD::ATOMIC_LOAD_CLR, dl, MemVT: AN->getMemoryVT(),
15355 Chain: Op.getOperand(i: 0), Ptr: Op.getOperand(i: 1), Val: RHS,
15356 MMO: AN->getMemOperand());
15357}
15358
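// Windows dynamic stack allocations must probe the newly allocated region.
// Unless the function opts out via the "no-stack-arg-probe" attribute, the
// allocation size is passed to the target's chkstk helper in X15, in units of
// 16 bytes.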
15359SDValue
15360AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
15361 SelectionDAG &DAG) const {
15362
15363 SDLoc dl(Op);
15364 // Get the inputs.
15365 SDNode *Node = Op.getNode();
15366 SDValue Chain = Op.getOperand(i: 0);
15367 SDValue Size = Op.getOperand(i: 1);
15368 MaybeAlign Align =
15369 cast<ConstantSDNode>(Val: Op.getOperand(i: 2))->getMaybeAlignValue();
15370 EVT VT = Node->getValueType(ResNo: 0);
15371
15372 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
15373 Kind: "no-stack-arg-probe")) {
15374 SDValue SP = DAG.getCopyFromReg(Chain, dl, Reg: AArch64::SP, VT: MVT::i64);
15375 Chain = SP.getValue(R: 1);
15376 SP = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: MVT::i64, N1: SP, N2: Size);
15377 if (Align)
15378 SP = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: SP.getValue(R: 0),
15379 N2: DAG.getConstant(Val: -(uint64_t)Align->value(), DL: dl, VT));
15380 Chain = DAG.getCopyToReg(Chain, dl, Reg: AArch64::SP, N: SP);
15381 SDValue Ops[2] = {SP, Chain};
15382 return DAG.getMergeValues(Ops, dl);
15383 }
15384
15385 Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL: dl);
15386
15387 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
15388 SDValue Callee = DAG.getTargetExternalSymbol(Sym: Subtarget->getChkStkName(),
15389 VT: PtrVT, TargetFlags: 0);
15390
15391 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
15392 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
15393 if (Subtarget->hasCustomCallingConv())
15394 TRI->UpdateCustomCallPreservedMask(MF&: DAG.getMachineFunction(), Mask: &Mask);
15395
15396 Size = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i64, N1: Size,
15397 N2: DAG.getConstant(Val: 4, DL: dl, VT: MVT::i64));
15398 Chain = DAG.getCopyToReg(Chain, dl, Reg: AArch64::X15, N: Size, Glue: SDValue());
15399 Chain =
15400 DAG.getNode(Opcode: AArch64ISD::CALL, DL: dl, VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue),
15401 N1: Chain, N2: Callee, N3: DAG.getRegister(Reg: AArch64::X15, VT: MVT::i64),
15402 N4: DAG.getRegisterMask(RegMask: Mask), N5: Chain.getValue(R: 1));
15403 // To match the actual intent better, we should read the output from X15 here
15404 // again (instead of potentially spilling it to the stack), but rereading Size
15405 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
15406 // here.
15407
15408 Size = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: MVT::i64, N1: Size,
15409 N2: DAG.getConstant(Val: 4, DL: dl, VT: MVT::i64));
15410
15411 SDValue SP = DAG.getCopyFromReg(Chain, dl, Reg: AArch64::SP, VT: MVT::i64);
15412 Chain = SP.getValue(R: 1);
15413 SP = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: MVT::i64, N1: SP, N2: Size);
15414 if (Align)
15415 SP = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: SP.getValue(R: 0),
15416 N2: DAG.getConstant(Val: -(uint64_t)Align->value(), DL: dl, VT));
15417 Chain = DAG.getCopyToReg(Chain, dl, Reg: AArch64::SP, N: SP);
15418
15419 Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: SDValue(), DL: dl);
15420
15421 SDValue Ops[2] = {SP, Chain};
15422 return DAG.getMergeValues(Ops, dl);
15423}
15424
15425SDValue
15426AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
15427 SelectionDAG &DAG) const {
15428 // Get the inputs.
15429 SDNode *Node = Op.getNode();
15430 SDValue Chain = Op.getOperand(i: 0);
15431 SDValue Size = Op.getOperand(i: 1);
15432
15433 MaybeAlign Align =
15434 cast<ConstantSDNode>(Val: Op.getOperand(i: 2))->getMaybeAlignValue();
15435 SDLoc dl(Op);
15436 EVT VT = Node->getValueType(ResNo: 0);
15437
15438 // Construct the new SP value in a GPR.
15439 SDValue SP = DAG.getCopyFromReg(Chain, dl, Reg: AArch64::SP, VT: MVT::i64);
15440 Chain = SP.getValue(R: 1);
15441 SP = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: MVT::i64, N1: SP, N2: Size);
15442 if (Align)
15443 SP = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: SP.getValue(R: 0),
15444 N2: DAG.getConstant(Val: -(uint64_t)Align->value(), DL: dl, VT));
15445
15446 // Set the real SP to the new value with a probing loop.
15447 Chain = DAG.getNode(Opcode: AArch64ISD::PROBED_ALLOCA, DL: dl, VT: MVT::Other, N1: Chain, N2: SP);
15448 SDValue Ops[2] = {SP, Chain};
15449 return DAG.getMergeValues(Ops, dl);
15450}
15451
15452SDValue
15453AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
15454 SelectionDAG &DAG) const {
15455 MachineFunction &MF = DAG.getMachineFunction();
15456
15457 if (Subtarget->isTargetWindows())
15458 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
15459 else if (hasInlineStackProbe(MF))
15460 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
15461 else
15462 return SDValue();
15463}
15464
15465SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
15466 unsigned NewOp) const {
15467 if (Subtarget->hasSVE2())
15468 return LowerToPredicatedOp(Op, DAG, NewOp);
15469
15470 // Default to expand.
15471 return SDValue();
15472}
15473
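// Lower VSCALE for illegal (non-i64) result types: compute the scaled vscale
// as an i64 and zero-extend or truncate it to the requested type.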
15474SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
15475 SelectionDAG &DAG) const {
15476 EVT VT = Op.getValueType();
15477 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
15478
15479 SDLoc DL(Op);
15480 APInt MulImm = Op.getConstantOperandAPInt(i: 0);
15481 return DAG.getZExtOrTrunc(Op: DAG.getVScale(DL, VT: MVT::i64, MulImm: MulImm.sext(width: 64)), DL,
15482 VT);
15483}
15484
15485/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
15486template <unsigned NumVecs>
15487static bool
15488setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
15489 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
15490 Info.opc = ISD::INTRINSIC_VOID;
15491 // Retrieve EC from first vector argument.
15492 const EVT VT = TLI.getMemValueType(DL, Ty: CI.getArgOperand(i: 0)->getType());
15493 ElementCount EC = VT.getVectorElementCount();
15494#ifndef NDEBUG
15495 // Check the assumption that all input vectors are the same type.
15496 for (unsigned I = 0; I < NumVecs; ++I)
15497 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
15498 "Invalid type.");
15499#endif
15500 // memVT is `NumVecs * VT`.
15501 Info.memVT = EVT::getVectorVT(Context&: CI.getType()->getContext(), VT: VT.getScalarType(),
15502 EC: EC * NumVecs);
15503 Info.ptrVal = CI.getArgOperand(i: CI.arg_size() - 1);
15504 Info.offset = 0;
15505 Info.align.reset();
15506 Info.flags = MachineMemOperand::MOStore;
15507 return true;
15508}
15509
15510/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
15511/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
15512/// specified in the intrinsic calls.
15513bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
15514 const CallInst &I,
15515 MachineFunction &MF,
15516 unsigned Intrinsic) const {
15517 auto &DL = I.getDataLayout();
15518 switch (Intrinsic) {
15519 case Intrinsic::aarch64_sve_st2:
15520 return setInfoSVEStN<2>(TLI: *this, DL, Info, CI: I);
15521 case Intrinsic::aarch64_sve_st3:
15522 return setInfoSVEStN<3>(TLI: *this, DL, Info, CI: I);
15523 case Intrinsic::aarch64_sve_st4:
15524 return setInfoSVEStN<4>(TLI: *this, DL, Info, CI: I);
15525 case Intrinsic::aarch64_neon_ld2:
15526 case Intrinsic::aarch64_neon_ld3:
15527 case Intrinsic::aarch64_neon_ld4:
15528 case Intrinsic::aarch64_neon_ld1x2:
15529 case Intrinsic::aarch64_neon_ld1x3:
15530 case Intrinsic::aarch64_neon_ld1x4: {
15531 Info.opc = ISD::INTRINSIC_W_CHAIN;
15532 uint64_t NumElts = DL.getTypeSizeInBits(Ty: I.getType()) / 64;
15533 Info.memVT = EVT::getVectorVT(Context&: I.getType()->getContext(), VT: MVT::i64, NumElements: NumElts);
15534 Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1);
15535 Info.offset = 0;
15536 Info.align.reset();
15537 // volatile loads with NEON intrinsics not supported
15538 Info.flags = MachineMemOperand::MOLoad;
15539 return true;
15540 }
15541 case Intrinsic::aarch64_neon_ld2lane:
15542 case Intrinsic::aarch64_neon_ld3lane:
15543 case Intrinsic::aarch64_neon_ld4lane:
15544 case Intrinsic::aarch64_neon_ld2r:
15545 case Intrinsic::aarch64_neon_ld3r:
15546 case Intrinsic::aarch64_neon_ld4r: {
15547 Info.opc = ISD::INTRINSIC_W_CHAIN;
    // These intrinsics return a struct whose members all have the same vector
    // type.
15549 Type *RetTy = I.getType();
15550 auto *StructTy = cast<StructType>(Val: RetTy);
15551 unsigned NumElts = StructTy->getNumElements();
15552 Type *VecTy = StructTy->getElementType(N: 0);
15553 MVT EleVT = MVT::getVT(Ty: VecTy).getVectorElementType();
15554 Info.memVT = EVT::getVectorVT(Context&: I.getType()->getContext(), VT: EleVT, NumElements: NumElts);
15555 Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1);
15556 Info.offset = 0;
15557 Info.align.reset();
15558 // volatile loads with NEON intrinsics not supported
15559 Info.flags = MachineMemOperand::MOLoad;
15560 return true;
15561 }
15562 case Intrinsic::aarch64_neon_st2:
15563 case Intrinsic::aarch64_neon_st3:
15564 case Intrinsic::aarch64_neon_st4:
15565 case Intrinsic::aarch64_neon_st1x2:
15566 case Intrinsic::aarch64_neon_st1x3:
15567 case Intrinsic::aarch64_neon_st1x4: {
15568 Info.opc = ISD::INTRINSIC_VOID;
15569 unsigned NumElts = 0;
15570 for (const Value *Arg : I.args()) {
15571 Type *ArgTy = Arg->getType();
15572 if (!ArgTy->isVectorTy())
15573 break;
15574 NumElts += DL.getTypeSizeInBits(Ty: ArgTy) / 64;
15575 }
15576 Info.memVT = EVT::getVectorVT(Context&: I.getType()->getContext(), VT: MVT::i64, NumElements: NumElts);
15577 Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1);
15578 Info.offset = 0;
15579 Info.align.reset();
15580 // volatile stores with NEON intrinsics not supported
15581 Info.flags = MachineMemOperand::MOStore;
15582 return true;
15583 }
15584 case Intrinsic::aarch64_neon_st2lane:
15585 case Intrinsic::aarch64_neon_st3lane:
15586 case Intrinsic::aarch64_neon_st4lane: {
15587 Info.opc = ISD::INTRINSIC_VOID;
15588 unsigned NumElts = 0;
    // All of the vector arguments have the same type.
15590 Type *VecTy = I.getArgOperand(i: 0)->getType();
15591 MVT EleVT = MVT::getVT(Ty: VecTy).getVectorElementType();
15592
15593 for (const Value *Arg : I.args()) {
15594 Type *ArgTy = Arg->getType();
15595 if (!ArgTy->isVectorTy())
15596 break;
15597 NumElts += 1;
15598 }
15599
15600 Info.memVT = EVT::getVectorVT(Context&: I.getType()->getContext(), VT: EleVT, NumElements: NumElts);
15601 Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1);
15602 Info.offset = 0;
15603 Info.align.reset();
15604 // volatile stores with NEON intrinsics not supported
15605 Info.flags = MachineMemOperand::MOStore;
15606 return true;
15607 }
15608 case Intrinsic::aarch64_ldaxr:
15609 case Intrinsic::aarch64_ldxr: {
15610 Type *ValTy = I.getParamElementType(ArgNo: 0);
15611 Info.opc = ISD::INTRINSIC_W_CHAIN;
15612 Info.memVT = MVT::getVT(Ty: ValTy);
15613 Info.ptrVal = I.getArgOperand(i: 0);
15614 Info.offset = 0;
15615 Info.align = DL.getABITypeAlign(Ty: ValTy);
15616 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
15617 return true;
15618 }
15619 case Intrinsic::aarch64_stlxr:
15620 case Intrinsic::aarch64_stxr: {
15621 Type *ValTy = I.getParamElementType(ArgNo: 1);
15622 Info.opc = ISD::INTRINSIC_W_CHAIN;
15623 Info.memVT = MVT::getVT(Ty: ValTy);
15624 Info.ptrVal = I.getArgOperand(i: 1);
15625 Info.offset = 0;
15626 Info.align = DL.getABITypeAlign(Ty: ValTy);
15627 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
15628 return true;
15629 }
15630 case Intrinsic::aarch64_ldaxp:
15631 case Intrinsic::aarch64_ldxp:
15632 Info.opc = ISD::INTRINSIC_W_CHAIN;
15633 Info.memVT = MVT::i128;
15634 Info.ptrVal = I.getArgOperand(i: 0);
15635 Info.offset = 0;
15636 Info.align = Align(16);
15637 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
15638 return true;
15639 case Intrinsic::aarch64_stlxp:
15640 case Intrinsic::aarch64_stxp:
15641 Info.opc = ISD::INTRINSIC_W_CHAIN;
15642 Info.memVT = MVT::i128;
15643 Info.ptrVal = I.getArgOperand(i: 2);
15644 Info.offset = 0;
15645 Info.align = Align(16);
15646 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
15647 return true;
15648 case Intrinsic::aarch64_sve_ldnt1: {
15649 Type *ElTy = cast<VectorType>(Val: I.getType())->getElementType();
15650 Info.opc = ISD::INTRINSIC_W_CHAIN;
15651 Info.memVT = MVT::getVT(Ty: I.getType());
15652 Info.ptrVal = I.getArgOperand(i: 1);
15653 Info.offset = 0;
15654 Info.align = DL.getABITypeAlign(Ty: ElTy);
15655 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
15656 return true;
15657 }
15658 case Intrinsic::aarch64_sve_stnt1: {
15659 Type *ElTy =
15660 cast<VectorType>(Val: I.getArgOperand(i: 0)->getType())->getElementType();
15661 Info.opc = ISD::INTRINSIC_W_CHAIN;
15662 Info.memVT = MVT::getVT(Ty: I.getOperand(i_nocapture: 0)->getType());
15663 Info.ptrVal = I.getArgOperand(i: 2);
15664 Info.offset = 0;
15665 Info.align = DL.getABITypeAlign(Ty: ElTy);
15666 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
15667 return true;
15668 }
15669 case Intrinsic::aarch64_mops_memset_tag: {
15670 Value *Dst = I.getArgOperand(i: 0);
15671 Value *Val = I.getArgOperand(i: 1);
15672 Info.opc = ISD::INTRINSIC_W_CHAIN;
15673 Info.memVT = MVT::getVT(Ty: Val->getType());
15674 Info.ptrVal = Dst;
15675 Info.offset = 0;
15676 Info.align = I.getParamAlign(ArgNo: 0).valueOrOne();
15677 Info.flags = MachineMemOperand::MOStore;
15678 // The size of the memory being operated on is unknown at this point
15679 Info.size = MemoryLocation::UnknownSize;
15680 return true;
15681 }
15682 default:
15683 break;
15684 }
15685
15686 return false;
15687}
15688
15689bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
15690 ISD::LoadExtType ExtTy,
15691 EVT NewVT) const {
15692 // TODO: This may be worth removing. Check regression tests for diffs.
15693 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
15694 return false;
15695
15696 // If we're reducing the load width in order to avoid having to use an extra
15697 // instruction to do extension then it's probably a good idea.
15698 if (ExtTy != ISD::NON_EXTLOAD)
15699 return true;
15700 // Don't reduce load width if it would prevent us from combining a shift into
15701 // the offset.
15702 MemSDNode *Mem = dyn_cast<MemSDNode>(Val: Load);
15703 assert(Mem);
15704 const SDValue &Base = Mem->getBasePtr();
15705 if (Base.getOpcode() == ISD::ADD &&
15706 Base.getOperand(i: 1).getOpcode() == ISD::SHL &&
15707 Base.getOperand(i: 1).hasOneUse() &&
15708 Base.getOperand(i: 1).getOperand(i: 1).getOpcode() == ISD::Constant) {
15709 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
15710 if (Mem->getMemoryVT().isScalableVector())
15711 return false;
15712 // The shift can be combined if it matches the size of the value being
15713 // loaded (and so reducing the width would make it not match).
15714 uint64_t ShiftAmount = Base.getOperand(i: 1).getConstantOperandVal(i: 1);
15715 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
15716 if (ShiftAmount == Log2_32(Value: LoadBytes))
15717 return false;
15718 }
15719 // We have no reason to disallow reducing the load width, so allow it.
15720 return true;
15721}
15722
15723// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
15724bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
15725 EVT VT = Extend.getValueType();
15726 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
15727 SDValue Extract = Extend.getOperand(i: 0);
15728 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
15729 Extract = Extract.getOperand(i: 0);
15730 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
15731 EVT VecVT = Extract.getOperand(i: 0).getValueType();
15732 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
15733 return false;
15734 }
15735 }
15736 return true;
15737}
15738
// Truncations from 64-bit GPR to 32-bit GPR are free.
15740bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
15741 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15742 return false;
15743 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
15744 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
15745 return NumBits1 > NumBits2;
15746}
15747bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
15748 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15749 return false;
15750 uint64_t NumBits1 = VT1.getFixedSizeInBits();
15751 uint64_t NumBits2 = VT2.getFixedSizeInBits();
15752 return NumBits1 > NumBits2;
15753}
15754
/// Check if it is profitable to hoist an instruction in then/else to if.
/// Not profitable if I and its user can form an FMA instruction
/// because we prefer FMSUB/FMADD.
15758bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
15759 if (I->getOpcode() != Instruction::FMul)
15760 return true;
15761
15762 if (!I->hasOneUse())
15763 return true;
15764
15765 Instruction *User = I->user_back();
15766
15767 if (!(User->getOpcode() == Instruction::FSub ||
15768 User->getOpcode() == Instruction::FAdd))
15769 return true;
15770
15771 const TargetOptions &Options = getTargetMachine().Options;
15772 const Function *F = I->getFunction();
15773 const DataLayout &DL = F->getDataLayout();
15774 Type *Ty = User->getOperand(i: 0)->getType();
15775
15776 return !(isFMAFasterThanFMulAndFAdd(F: *F, Ty) &&
15777 isOperationLegalOrCustom(Op: ISD::FMA, VT: getValueType(DL, Ty)) &&
15778 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
15779 Options.UnsafeFPMath));
15780}
15781
15782// All 32-bit GPR operations implicitly zero the high-half of the corresponding
15783// 64-bit GPR.
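// E.g. 'add w8, w9, w10' also zeroes bits [63:32] of x8, so a later zext of the
// result from i32 to i64 needs no extra instruction.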
15784bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
15785 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15786 return false;
15787 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
15788 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
15789 return NumBits1 == 32 && NumBits2 == 64;
15790}
15791bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
15792 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15793 return false;
15794 unsigned NumBits1 = VT1.getSizeInBits();
15795 unsigned NumBits2 = VT2.getSizeInBits();
15796 return NumBits1 == 32 && NumBits2 == 64;
15797}
15798
15799bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
15800 EVT VT1 = Val.getValueType();
15801 if (isZExtFree(VT1, VT2)) {
15802 return true;
15803 }
15804
15805 if (Val.getOpcode() != ISD::LOAD)
15806 return false;
15807
15808 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
15809 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
15810 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
15811 VT1.getSizeInBits() <= 32);
15812}
15813
15814bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
15815 if (isa<FPExtInst>(Val: Ext))
15816 return false;
15817
15818 // Vector types are not free.
15819 if (Ext->getType()->isVectorTy())
15820 return false;
15821
15822 for (const Use &U : Ext->uses()) {
15823 // The extension is free if we can fold it with a left shift in an
15824 // addressing mode or an arithmetic operation: add, sub, and cmp.
15825
15826 // Is there a shift?
15827 const Instruction *Instr = cast<Instruction>(Val: U.getUser());
15828
15829 // Is this a constant shift?
15830 switch (Instr->getOpcode()) {
15831 case Instruction::Shl:
15832 if (!isa<ConstantInt>(Val: Instr->getOperand(i: 1)))
15833 return false;
15834 break;
15835 case Instruction::GetElementPtr: {
15836 gep_type_iterator GTI = gep_type_begin(GEP: Instr);
15837 auto &DL = Ext->getDataLayout();
15838 std::advance(i&: GTI, n: U.getOperandNo()-1);
15839 Type *IdxTy = GTI.getIndexedType();
15840 // This extension will end up with a shift because of the scaling factor.
15841 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
15842 // Get the shift amount based on the scaling factor:
15843 // log2(sizeof(IdxTy)) - log2(8).
15844 if (IdxTy->isScalableTy())
15845 return false;
15846 uint64_t ShiftAmt =
15847 llvm::countr_zero(Val: DL.getTypeStoreSizeInBits(Ty: IdxTy).getFixedValue()) -
15848 3;
15849 // Is the constant foldable in the shift of the addressing mode?
15850 // I.e., shift amount is between 1 and 4 inclusive.
15851 if (ShiftAmt == 0 || ShiftAmt > 4)
15852 return false;
15853 break;
15854 }
15855 case Instruction::Trunc:
15856 // Check if this is a noop.
15857 // trunc(sext ty1 to ty2) to ty1.
15858 if (Instr->getType() == Ext->getOperand(i: 0)->getType())
15859 continue;
15860 [[fallthrough]];
15861 default:
15862 return false;
15863 }
15864
15865 // At this point we can use the bfm family, so this extension is free
15866 // for that use.
15867 }
15868 return true;
15869}
15870
15871static bool isSplatShuffle(Value *V) {
15872 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Val: V))
15873 return all_equal(Range: Shuf->getShuffleMask());
15874 return false;
15875}
15876
15877/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
15878/// or upper half of the vector elements.
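/// E.g. for <8 x i16> sources, a shuffle mask of <0, 1, 2, 3> extracts the lower
/// half and a mask of <4, 5, 6, 7> extracts the upper half.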
15879static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
15880 bool AllowSplat = false) {
15881 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
15882 auto *FullTy = FullV->getType();
15883 auto *HalfTy = HalfV->getType();
15884 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
15885 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
15886 };
15887
15888 auto extractHalf = [](Value *FullV, Value *HalfV) {
15889 auto *FullVT = cast<FixedVectorType>(Val: FullV->getType());
15890 auto *HalfVT = cast<FixedVectorType>(Val: HalfV->getType());
15891 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
15892 };
15893
15894 ArrayRef<int> M1, M2;
15895 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
15896 if (!match(V: Op1, P: m_Shuffle(v1: m_Value(V&: S1Op1), v2: m_Undef(), mask: m_Mask(M1))) ||
15897 !match(V: Op2, P: m_Shuffle(v1: m_Value(V&: S2Op1), v2: m_Undef(), mask: m_Mask(M2))))
15898 return false;
15899
15900  // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
15901 // it is not checked as an extract below.
15902 if (AllowSplat && isSplatShuffle(V: Op1))
15903 S1Op1 = nullptr;
15904 if (AllowSplat && isSplatShuffle(V: Op2))
15905 S2Op1 = nullptr;
15906
15907 // Check that the operands are half as wide as the result and we extract
15908 // half of the elements of the input vectors.
15909 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
15910 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
15911 return false;
15912
15913 // Check the mask extracts either the lower or upper half of vector
15914 // elements.
15915 int M1Start = 0;
15916 int M2Start = 0;
15917 int NumElements = cast<FixedVectorType>(Val: Op1->getType())->getNumElements() * 2;
15918 if ((S1Op1 &&
15919 !ShuffleVectorInst::isExtractSubvectorMask(Mask: M1, NumSrcElts: NumElements, Index&: M1Start)) ||
15920 (S2Op1 &&
15921 !ShuffleVectorInst::isExtractSubvectorMask(Mask: M2, NumSrcElts: NumElements, Index&: M2Start)))
15922 return false;
15923
15924 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
15925 (M2Start != 0 && M2Start != (NumElements / 2)))
15926 return false;
15927 if (S1Op1 && S2Op1 && M1Start != M2Start)
15928 return false;
15929
15930 return true;
15931}
15932
15933/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
15934/// of the vector elements.
15935static bool areExtractExts(Value *Ext1, Value *Ext2) {
15936 auto areExtDoubled = [](Instruction *Ext) {
15937 return Ext->getType()->getScalarSizeInBits() ==
15938 2 * Ext->getOperand(i: 0)->getType()->getScalarSizeInBits();
15939 };
15940
15941 if (!match(V: Ext1, P: m_ZExtOrSExt(Op: m_Value())) ||
15942 !match(V: Ext2, P: m_ZExtOrSExt(Op: m_Value())) ||
15943 !areExtDoubled(cast<Instruction>(Val: Ext1)) ||
15944 !areExtDoubled(cast<Instruction>(Val: Ext2)))
15945 return false;
15946
15947 return true;
15948}
15949
15950/// Check if Op could be used with vmull_high_p64 intrinsic.
15951static bool isOperandOfVmullHighP64(Value *Op) {
15952 Value *VectorOperand = nullptr;
15953 ConstantInt *ElementIndex = nullptr;
15954 return match(V: Op, P: m_ExtractElt(Val: m_Value(V&: VectorOperand),
15955 Idx: m_ConstantInt(CI&: ElementIndex))) &&
15956 ElementIndex->getValue() == 1 &&
15957 isa<FixedVectorType>(Val: VectorOperand->getType()) &&
15958 cast<FixedVectorType>(Val: VectorOperand->getType())->getNumElements() == 2;
15959}
15960
15961/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
15962static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
15963 return isOperandOfVmullHighP64(Op: Op1) && isOperandOfVmullHighP64(Op: Op2);
15964}
15965
15966static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
15967 // Restrict ourselves to the form CodeGenPrepare typically constructs.
15968 auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptrs);
15969 if (!GEP || GEP->getNumOperands() != 2)
15970 return false;
15971
15972 Value *Base = GEP->getOperand(i_nocapture: 0);
15973 Value *Offsets = GEP->getOperand(i_nocapture: 1);
15974
15975 // We only care about scalar_base+vector_offsets.
15976 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
15977 return false;
15978
15979 // Sink extends that would allow us to use 32-bit offset vectors.
15980 if (isa<SExtInst>(Val: Offsets) || isa<ZExtInst>(Val: Offsets)) {
15981 auto *OffsetsInst = cast<Instruction>(Val: Offsets);
15982 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
15983 OffsetsInst->getOperand(i: 0)->getType()->getScalarSizeInBits() <= 32)
15984 Ops.push_back(Elt: &GEP->getOperandUse(i: 1));
15985 }
15986
15987 // Sink the GEP.
15988 return true;
15989}
15990
15991/// We want to sink following cases:
15992/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
15993/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
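/// E.g. if '%n = shl i64 (vscale), 4' is defined in another block and used by a gep,
/// sinking %n (and the vscale itself) next to the gep lets isel see the whole
/// scaled-vscale expression at once.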
15994static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
15995 if (match(V: Op, P: m_VScale()))
15996 return true;
15997 if (match(V: Op, P: m_Shl(L: m_VScale(), R: m_ConstantInt())) ||
15998 match(V: Op, P: m_Mul(L: m_VScale(), R: m_ConstantInt()))) {
15999 Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: 0));
16000 return true;
16001 }
16002 if (match(V: Op, P: m_Shl(L: m_ZExt(Op: m_VScale()), R: m_ConstantInt())) ||
16003 match(V: Op, P: m_Mul(L: m_ZExt(Op: m_VScale()), R: m_ConstantInt()))) {
16004 Value *ZExtOp = cast<Instruction>(Val: Op)->getOperand(i: 0);
16005 Ops.push_back(Elt: &cast<Instruction>(Val: ZExtOp)->getOperandUse(i: 0));
16006 Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: 0));
16007 return true;
16008 }
16009 return false;
16010}
16011
16012/// Check if sinking \p I's operands to I's basic block is profitable, because
16013/// the operands can be folded into a target instruction, e.g.
16014/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
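/// E.g. for 'sub (zext <8 x i8> %ah to <8 x i16>), (zext <8 x i8> %bh to <8 x i16>)'
/// where %ah/%bh are shuffles extracting the high halves of <16 x i8> vectors,
/// sinking the shuffles and zexts next to the sub lets isel select a single usubl2.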
16015bool AArch64TargetLowering::shouldSinkOperands(
16016 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
16017 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: I)) {
16018 switch (II->getIntrinsicID()) {
16019 case Intrinsic::aarch64_neon_smull:
16020 case Intrinsic::aarch64_neon_umull:
16021 if (areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1),
16022 /*AllowSplat=*/true)) {
16023 Ops.push_back(Elt: &II->getOperandUse(i: 0));
16024 Ops.push_back(Elt: &II->getOperandUse(i: 1));
16025 return true;
16026 }
16027 [[fallthrough]];
16028
16029 case Intrinsic::fma:
16030 if (isa<VectorType>(Val: I->getType()) &&
16031 cast<VectorType>(Val: I->getType())->getElementType()->isHalfTy() &&
16032 !Subtarget->hasFullFP16())
16033 return false;
16034 [[fallthrough]];
16035 case Intrinsic::aarch64_neon_sqdmull:
16036 case Intrinsic::aarch64_neon_sqdmulh:
16037 case Intrinsic::aarch64_neon_sqrdmulh:
16038 // Sink splats for index lane variants
16039 if (isSplatShuffle(V: II->getOperand(i_nocapture: 0)))
16040 Ops.push_back(Elt: &II->getOperandUse(i: 0));
16041 if (isSplatShuffle(V: II->getOperand(i_nocapture: 1)))
16042 Ops.push_back(Elt: &II->getOperandUse(i: 1));
16043 return !Ops.empty();
16044 case Intrinsic::aarch64_neon_fmlal:
16045 case Intrinsic::aarch64_neon_fmlal2:
16046 case Intrinsic::aarch64_neon_fmlsl:
16047 case Intrinsic::aarch64_neon_fmlsl2:
16048 // Sink splats for index lane variants
16049 if (isSplatShuffle(V: II->getOperand(i_nocapture: 1)))
16050 Ops.push_back(Elt: &II->getOperandUse(i: 1));
16051 if (isSplatShuffle(V: II->getOperand(i_nocapture: 2)))
16052 Ops.push_back(Elt: &II->getOperandUse(i: 2));
16053 return !Ops.empty();
16054 case Intrinsic::aarch64_sve_ptest_first:
16055 case Intrinsic::aarch64_sve_ptest_last:
16056 if (auto *IIOp = dyn_cast<IntrinsicInst>(Val: II->getOperand(i_nocapture: 0)))
16057 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
16058 Ops.push_back(Elt: &II->getOperandUse(i: 0));
16059 return !Ops.empty();
16060 case Intrinsic::aarch64_sme_write_horiz:
16061 case Intrinsic::aarch64_sme_write_vert:
16062 case Intrinsic::aarch64_sme_writeq_horiz:
16063 case Intrinsic::aarch64_sme_writeq_vert: {
16064 auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 1));
16065 if (!Idx || Idx->getOpcode() != Instruction::Add)
16066 return false;
16067 Ops.push_back(Elt: &II->getOperandUse(i: 1));
16068 return true;
16069 }
16070 case Intrinsic::aarch64_sme_read_horiz:
16071 case Intrinsic::aarch64_sme_read_vert:
16072 case Intrinsic::aarch64_sme_readq_horiz:
16073 case Intrinsic::aarch64_sme_readq_vert:
16074 case Intrinsic::aarch64_sme_ld1b_vert:
16075 case Intrinsic::aarch64_sme_ld1h_vert:
16076 case Intrinsic::aarch64_sme_ld1w_vert:
16077 case Intrinsic::aarch64_sme_ld1d_vert:
16078 case Intrinsic::aarch64_sme_ld1q_vert:
16079 case Intrinsic::aarch64_sme_st1b_vert:
16080 case Intrinsic::aarch64_sme_st1h_vert:
16081 case Intrinsic::aarch64_sme_st1w_vert:
16082 case Intrinsic::aarch64_sme_st1d_vert:
16083 case Intrinsic::aarch64_sme_st1q_vert:
16084 case Intrinsic::aarch64_sme_ld1b_horiz:
16085 case Intrinsic::aarch64_sme_ld1h_horiz:
16086 case Intrinsic::aarch64_sme_ld1w_horiz:
16087 case Intrinsic::aarch64_sme_ld1d_horiz:
16088 case Intrinsic::aarch64_sme_ld1q_horiz:
16089 case Intrinsic::aarch64_sme_st1b_horiz:
16090 case Intrinsic::aarch64_sme_st1h_horiz:
16091 case Intrinsic::aarch64_sme_st1w_horiz:
16092 case Intrinsic::aarch64_sme_st1d_horiz:
16093 case Intrinsic::aarch64_sme_st1q_horiz: {
16094 auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 3));
16095 if (!Idx || Idx->getOpcode() != Instruction::Add)
16096 return false;
16097 Ops.push_back(Elt: &II->getOperandUse(i: 3));
16098 return true;
16099 }
16100 case Intrinsic::aarch64_neon_pmull:
16101 if (!areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1)))
16102 return false;
16103 Ops.push_back(Elt: &II->getOperandUse(i: 0));
16104 Ops.push_back(Elt: &II->getOperandUse(i: 1));
16105 return true;
16106 case Intrinsic::aarch64_neon_pmull64:
16107 if (!areOperandsOfVmullHighP64(Op1: II->getArgOperand(i: 0),
16108 Op2: II->getArgOperand(i: 1)))
16109 return false;
16110 Ops.push_back(Elt: &II->getArgOperandUse(i: 0));
16111 Ops.push_back(Elt: &II->getArgOperandUse(i: 1));
16112 return true;
16113 case Intrinsic::masked_gather:
16114 if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 0), Ops))
16115 return false;
16116 Ops.push_back(Elt: &II->getArgOperandUse(i: 0));
16117 return true;
16118 case Intrinsic::masked_scatter:
16119 if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 1), Ops))
16120 return false;
16121 Ops.push_back(Elt: &II->getArgOperandUse(i: 1));
16122 return true;
16123 default:
16124 return false;
16125 }
16126 }
16127
16128 // Sink vscales closer to uses for better isel
16129 switch (I->getOpcode()) {
16130 case Instruction::GetElementPtr:
16131 case Instruction::Add:
16132 case Instruction::Sub:
16133 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
16134 if (shouldSinkVScale(Op: I->getOperand(i: Op), Ops)) {
16135 Ops.push_back(Elt: &I->getOperandUse(i: Op));
16136 return true;
16137 }
16138 }
16139 break;
16140 default:
16141 break;
16142 }
16143
16144 if (!I->getType()->isVectorTy())
16145 return false;
16146
16147 switch (I->getOpcode()) {
16148 case Instruction::Sub:
16149 case Instruction::Add: {
16150 if (!areExtractExts(Ext1: I->getOperand(i: 0), Ext2: I->getOperand(i: 1)))
16151 return false;
16152
16153 // If the exts' operands extract either the lower or upper elements, we
16154 // can sink them too.
16155 auto Ext1 = cast<Instruction>(Val: I->getOperand(i: 0));
16156 auto Ext2 = cast<Instruction>(Val: I->getOperand(i: 1));
16157 if (areExtractShuffleVectors(Op1: Ext1->getOperand(i: 0), Op2: Ext2->getOperand(i: 0))) {
16158 Ops.push_back(Elt: &Ext1->getOperandUse(i: 0));
16159 Ops.push_back(Elt: &Ext2->getOperandUse(i: 0));
16160 }
16161
16162 Ops.push_back(Elt: &I->getOperandUse(i: 0));
16163 Ops.push_back(Elt: &I->getOperandUse(i: 1));
16164
16165 return true;
16166 }
16167 case Instruction::Or: {
16168 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
16169 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
16170 if (Subtarget->hasNEON()) {
16171 Instruction *OtherAnd, *IA, *IB;
16172 Value *MaskValue;
16173      // MainAnd refers to the And instruction that has 'Not' as one of its operands.
16174 if (match(V: I, P: m_c_Or(L: m_OneUse(SubPattern: m_Instruction(I&: OtherAnd)),
16175 R: m_OneUse(SubPattern: m_c_And(L: m_OneUse(SubPattern: m_Not(V: m_Value(V&: MaskValue))),
16176 R: m_Instruction(I&: IA)))))) {
16177 if (match(V: OtherAnd,
16178 P: m_c_And(L: m_Specific(V: MaskValue), R: m_Instruction(I&: IB)))) {
16179 Instruction *MainAnd = I->getOperand(i: 0) == OtherAnd
16180 ? cast<Instruction>(Val: I->getOperand(i: 1))
16181 : cast<Instruction>(Val: I->getOperand(i: 0));
16182
16183 // Both Ands should be in same basic block as Or
16184 if (I->getParent() != MainAnd->getParent() ||
16185 I->getParent() != OtherAnd->getParent())
16186 return false;
16187
16188 // Non-mask operands of both Ands should also be in same basic block
16189 if (I->getParent() != IA->getParent() ||
16190 I->getParent() != IB->getParent())
16191 return false;
16192
16193 Ops.push_back(Elt: &MainAnd->getOperandUse(i: MainAnd->getOperand(i: 0) == IA ? 1 : 0));
16194 Ops.push_back(Elt: &I->getOperandUse(i: 0));
16195 Ops.push_back(Elt: &I->getOperandUse(i: 1));
16196
16197 return true;
16198 }
16199 }
16200 }
16201
16202 return false;
16203 }
16204 case Instruction::Mul: {
16205 int NumZExts = 0, NumSExts = 0;
16206 for (auto &Op : I->operands()) {
16207 // Make sure we are not already sinking this operand
16208 if (any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op; }))
16209 continue;
16210
16211 if (match(V: &Op, P: m_SExt(Op: m_Value()))) {
16212 NumSExts++;
16213 continue;
16214 } else if (match(V: &Op, P: m_ZExt(Op: m_Value()))) {
16215 NumZExts++;
16216 continue;
16217 }
16218
16219 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Val&: Op);
16220
16221 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
16222 // operand and the s/zext can help create indexed s/umull. This is
16223 // especially useful to prevent i64 mul being scalarized.
16224 if (Shuffle && isSplatShuffle(V: Shuffle) &&
16225 match(V: Shuffle->getOperand(i_nocapture: 0), P: m_ZExtOrSExt(Op: m_Value()))) {
16226 Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0));
16227 Ops.push_back(Elt: &Op);
16228 if (match(V: Shuffle->getOperand(i_nocapture: 0), P: m_SExt(Op: m_Value())))
16229 NumSExts++;
16230 else
16231 NumZExts++;
16232 continue;
16233 }
16234
16235 if (!Shuffle)
16236 continue;
16237
16238 Value *ShuffleOperand = Shuffle->getOperand(i_nocapture: 0);
16239 InsertElementInst *Insert = dyn_cast<InsertElementInst>(Val: ShuffleOperand);
16240 if (!Insert)
16241 continue;
16242
16243 Instruction *OperandInstr = dyn_cast<Instruction>(Val: Insert->getOperand(i_nocapture: 1));
16244 if (!OperandInstr)
16245 continue;
16246
16247 ConstantInt *ElementConstant =
16248 dyn_cast<ConstantInt>(Val: Insert->getOperand(i_nocapture: 2));
16249 // Check that the insertelement is inserting into element 0
16250 if (!ElementConstant || !ElementConstant->isZero())
16251 continue;
16252
16253 unsigned Opcode = OperandInstr->getOpcode();
16254 if (Opcode == Instruction::SExt)
16255 NumSExts++;
16256 else if (Opcode == Instruction::ZExt)
16257 NumZExts++;
16258 else {
16259 // If we find that the top bits are known 0, then we can sink and allow
16260 // the backend to generate a umull.
16261 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
16262 APInt UpperMask = APInt::getHighBitsSet(numBits: Bitwidth, hiBitsSet: Bitwidth / 2);
16263 const DataLayout &DL = I->getDataLayout();
16264 if (!MaskedValueIsZero(V: OperandInstr, Mask: UpperMask, DL))
16265 continue;
16266 NumZExts++;
16267 }
16268
16269 Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0));
16270 Ops.push_back(Elt: &Op);
16271 }
16272
16273    // It is only profitable to sink if we found two extends of the same type.
16274 return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
16275 }
16276 default:
16277 return false;
16278 }
16279 return false;
16280}
16281
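// Build the shuffle mask used by the TBL-based extension lowerings below: every
// Factor'th lane (starting at lane Factor-1 for big-endian) selects a source
// element, and all other lanes select the zero element that the callers append as
// the second shuffle operand. E.g. SrcWidth=8, DstWidth=32, NumElts=4 on
// little-endian gives <0, 4, 4, 4, 1, 4, 4, 4, 2, 4, 4, 4, 3, 4, 4, 4>.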
16282static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
16283 unsigned NumElts, bool IsLittleEndian,
16284 SmallVectorImpl<int> &Mask) {
16285 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64)
16286 return false;
16287
16288 assert(DstWidth % SrcWidth == 0 &&
16289 "TBL lowering is not supported for a conversion instruction with this "
16290 "source and destination element type.");
16291
16292 unsigned Factor = DstWidth / SrcWidth;
16293 unsigned MaskLen = NumElts * Factor;
16294
16295 Mask.clear();
16296 Mask.resize(N: MaskLen, NV: NumElts);
16297
16298 unsigned SrcIndex = 0;
16299 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
16300 Mask[I] = SrcIndex++;
16301
16302 return true;
16303}
16304
16305static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,
16306 FixedVectorType *ZExtTy,
16307 FixedVectorType *DstTy,
16308 bool IsLittleEndian) {
16309 auto *SrcTy = cast<FixedVectorType>(Val: Op->getType());
16310 unsigned NumElts = SrcTy->getNumElements();
16311 auto SrcWidth = cast<IntegerType>(Val: SrcTy->getElementType())->getBitWidth();
16312 auto DstWidth = cast<IntegerType>(Val: DstTy->getElementType())->getBitWidth();
16313
16314 SmallVector<int> Mask;
16315 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
16316 return nullptr;
16317
16318 auto *FirstEltZero = Builder.CreateInsertElement(
16319 Vec: PoisonValue::get(T: SrcTy), NewElt: Builder.getInt8(C: 0), Idx: uint64_t(0));
16320 Value *Result = Builder.CreateShuffleVector(V1: Op, V2: FirstEltZero, Mask);
16321 Result = Builder.CreateBitCast(V: Result, DestTy: DstTy);
16322 if (DstTy != ZExtTy)
16323 Result = Builder.CreateZExt(V: Result, DestTy: ZExtTy);
16324 return Result;
16325}
16326
16327static Value *createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op,
16328 FixedVectorType *DstTy,
16329 bool IsLittleEndian) {
16330 auto *SrcTy = cast<FixedVectorType>(Val: Op->getType());
16331 auto SrcWidth = cast<IntegerType>(Val: SrcTy->getElementType())->getBitWidth();
16332 auto DstWidth = cast<IntegerType>(Val: DstTy->getElementType())->getBitWidth();
16333
16334 SmallVector<int> Mask;
16335 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts: SrcTy->getNumElements(),
16336 IsLittleEndian: !IsLittleEndian, Mask))
16337 return nullptr;
16338
16339 auto *FirstEltZero = Builder.CreateInsertElement(
16340 Vec: PoisonValue::get(T: SrcTy), NewElt: Builder.getInt8(C: 0), Idx: uint64_t(0));
16341
16342 return Builder.CreateShuffleVector(V1: Op, V2: FirstEltZero, Mask);
16343}
16344
16345static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
16346 IRBuilder<> Builder(TI);
16347 SmallVector<Value *> Parts;
16348 int NumElements = cast<FixedVectorType>(Val: TI->getType())->getNumElements();
16349 auto *SrcTy = cast<FixedVectorType>(Val: TI->getOperand(i_nocapture: 0)->getType());
16350 auto *DstTy = cast<FixedVectorType>(Val: TI->getType());
16351 assert(SrcTy->getElementType()->isIntegerTy() &&
16352 "Non-integer type source vector element is not supported");
16353 assert(DstTy->getElementType()->isIntegerTy(8) &&
16354 "Unsupported destination vector element type");
16355 unsigned SrcElemTySz =
16356 cast<IntegerType>(Val: SrcTy->getElementType())->getBitWidth();
16357 unsigned DstElemTySz =
16358 cast<IntegerType>(Val: DstTy->getElementType())->getBitWidth();
16359 assert((SrcElemTySz % DstElemTySz == 0) &&
16360 "Cannot lower truncate to tbl instructions for a source element size "
16361 "that is not divisible by the destination element size");
16362 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
16363 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
16364 "Unsupported source vector element type size");
16365 Type *VecTy = FixedVectorType::get(ElementType: Builder.getInt8Ty(), NumElts: 16);
16366
16367  // Create a mask to choose every nth byte from the source vector table of
16368  // bytes to create the truncated destination vector, where 'n' is the truncate
16369  // ratio. For example, for a truncate from Yxi64 to Yxi8, choose bytes
16370  // 0, 8, 16, ..., (Y-1)*8 for the little-endian format.
16371 SmallVector<Constant *, 16> MaskConst;
16372 for (int Itr = 0; Itr < 16; Itr++) {
16373 if (Itr < NumElements)
16374 MaskConst.push_back(Elt: Builder.getInt8(
16375 C: IsLittleEndian ? Itr * TruncFactor
16376 : Itr * TruncFactor + (TruncFactor - 1)));
16377 else
16378 MaskConst.push_back(Elt: Builder.getInt8(C: 255));
16379 }
16380
16381 int MaxTblSz = 128 * 4;
16382 int MaxSrcSz = SrcElemTySz * NumElements;
16383 int ElemsPerTbl =
16384 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
16385 assert(ElemsPerTbl <= 16 &&
16386 "Maximum elements selected using TBL instruction cannot exceed 16!");
16387
16388 int ShuffleCount = 128 / SrcElemTySz;
16389 SmallVector<int> ShuffleLanes;
16390 for (int i = 0; i < ShuffleCount; ++i)
16391 ShuffleLanes.push_back(Elt: i);
16392
16393 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
16394 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
16395 // call TBL & save the result in a vector of TBL results for combining later.
16396 SmallVector<Value *> Results;
16397 while (ShuffleLanes.back() < NumElements) {
16398 Parts.push_back(Elt: Builder.CreateBitCast(
16399 V: Builder.CreateShuffleVector(V: TI->getOperand(i_nocapture: 0), Mask: ShuffleLanes), DestTy: VecTy));
16400
16401 if (Parts.size() == 4) {
16402 auto *F = Intrinsic::getDeclaration(M: TI->getModule(),
16403 id: Intrinsic::aarch64_neon_tbl4, Tys: VecTy);
16404 Parts.push_back(Elt: ConstantVector::get(V: MaskConst));
16405 Results.push_back(Elt: Builder.CreateCall(Callee: F, Args: Parts));
16406 Parts.clear();
16407 }
16408
16409 for (int i = 0; i < ShuffleCount; ++i)
16410 ShuffleLanes[i] += ShuffleCount;
16411 }
16412
16413 assert((Parts.empty() || Results.empty()) &&
16414 "Lowering trunc for vectors requiring different TBL instructions is "
16415 "not supported!");
16416 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
16417 // registers
16418 if (!Parts.empty()) {
16419 Intrinsic::ID TblID;
16420 switch (Parts.size()) {
16421 case 1:
16422 TblID = Intrinsic::aarch64_neon_tbl1;
16423 break;
16424 case 2:
16425 TblID = Intrinsic::aarch64_neon_tbl2;
16426 break;
16427 case 3:
16428 TblID = Intrinsic::aarch64_neon_tbl3;
16429 break;
16430 }
16431
16432 auto *F = Intrinsic::getDeclaration(M: TI->getModule(), id: TblID, Tys: VecTy);
16433 Parts.push_back(Elt: ConstantVector::get(V: MaskConst));
16434 Results.push_back(Elt: Builder.CreateCall(Callee: F, Args: Parts));
16435 }
16436
16437 // Extract the destination vector from TBL result(s) after combining them
16438 // where applicable. Currently, at most two TBLs are supported.
16439 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
16440 "more than 2 tbl instructions!");
16441 Value *FinalResult = Results[0];
16442 if (Results.size() == 1) {
16443 if (ElemsPerTbl < 16) {
16444 SmallVector<int> FinalMask(ElemsPerTbl);
16445 std::iota(first: FinalMask.begin(), last: FinalMask.end(), value: 0);
16446 FinalResult = Builder.CreateShuffleVector(V: Results[0], Mask: FinalMask);
16447 }
16448 } else {
16449 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
16450 if (ElemsPerTbl < 16) {
16451 std::iota(first: FinalMask.begin(), last: FinalMask.begin() + ElemsPerTbl, value: 0);
16452 std::iota(first: FinalMask.begin() + ElemsPerTbl, last: FinalMask.end(), value: 16);
16453 } else {
16454 std::iota(first: FinalMask.begin(), last: FinalMask.end(), value: 0);
16455 }
16456 FinalResult =
16457 Builder.CreateShuffleVector(V1: Results[0], V2: Results[1], Mask: FinalMask);
16458 }
16459
16460 TI->replaceAllUsesWith(V: FinalResult);
16461 TI->eraseFromParent();
16462}
16463
16464bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
16465 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
16466 // shuffle_vector instructions are serialized when targeting SVE,
16467 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
16468 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
16469 return false;
16470
16471 // Try to optimize conversions using tbl. This requires materializing constant
16472 // index vectors, which can increase code size and add loads. Skip the
16473 // transform unless the conversion is in a loop block guaranteed to execute
16474 // and we are not optimizing for size.
16475 Function *F = I->getParent()->getParent();
16476 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
16477 F->hasOptSize())
16478 return false;
16479
16480 auto *SrcTy = dyn_cast<FixedVectorType>(Val: I->getOperand(i: 0)->getType());
16481 auto *DstTy = dyn_cast<FixedVectorType>(Val: I->getType());
16482 if (!SrcTy || !DstTy)
16483 return false;
16484
16485  // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
16486  // lowered to tbl instructions to insert the original i8 elements
16487  // into i8X lanes. This is enabled for cases where it is beneficial.
16488 auto *ZExt = dyn_cast<ZExtInst>(Val: I);
16489 if (ZExt && SrcTy->getElementType()->isIntegerTy(Bitwidth: 8)) {
16490 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
16491 if (DstWidth % 8 != 0)
16492 return false;
16493
16494 auto *TruncDstType =
16495 cast<FixedVectorType>(Val: VectorType::getTruncatedElementVectorType(VTy: DstTy));
16496 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
16497 // the remaining ZExt folded into the user, don't use tbl lowering.
16498 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
16499 if (TTI.getCastInstrCost(Opcode: I->getOpcode(), Dst: DstTy, Src: TruncDstType,
16500 CCH: TargetTransformInfo::getCastContextHint(I),
16501 CostKind: TTI::TCK_SizeAndLatency, I) == TTI::TCC_Free) {
16502 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
16503 return false;
16504
16505 DstTy = TruncDstType;
16506 }
16507 IRBuilder<> Builder(ZExt);
16508 Value *Result = createTblShuffleForZExt(
16509 Builder, Op: ZExt->getOperand(i_nocapture: 0), ZExtTy: cast<FixedVectorType>(Val: ZExt->getType()),
16510 DstTy, IsLittleEndian: Subtarget->isLittleEndian());
16511 if (!Result)
16512 return false;
16513 ZExt->replaceAllUsesWith(V: Result);
16514 ZExt->eraseFromParent();
16515 return true;
16516 }
16517
16518 auto *UIToFP = dyn_cast<UIToFPInst>(Val: I);
16519 if (UIToFP && SrcTy->getElementType()->isIntegerTy(Bitwidth: 8) &&
16520 DstTy->getElementType()->isFloatTy()) {
16521 IRBuilder<> Builder(I);
16522 Value *ZExt = createTblShuffleForZExt(
16523 Builder, Op: I->getOperand(i: 0), ZExtTy: FixedVectorType::getInteger(VTy: DstTy),
16524 DstTy: FixedVectorType::getInteger(VTy: DstTy), IsLittleEndian: Subtarget->isLittleEndian());
16525 assert(ZExt && "Cannot fail for the i8 to float conversion");
16526 auto *UI = Builder.CreateUIToFP(V: ZExt, DestTy: DstTy);
16527 I->replaceAllUsesWith(V: UI);
16528 I->eraseFromParent();
16529 return true;
16530 }
16531
16532 auto *SIToFP = dyn_cast<SIToFPInst>(Val: I);
16533 if (SIToFP && SrcTy->getElementType()->isIntegerTy(Bitwidth: 8) &&
16534 DstTy->getElementType()->isFloatTy()) {
16535 IRBuilder<> Builder(I);
16536 auto *Shuffle = createTblShuffleForSExt(Builder, Op: I->getOperand(i: 0),
16537 DstTy: FixedVectorType::getInteger(VTy: DstTy),
16538 IsLittleEndian: Subtarget->isLittleEndian());
16539 assert(Shuffle && "Cannot fail for the i8 to float conversion");
16540 auto *Cast = Builder.CreateBitCast(V: Shuffle, DestTy: VectorType::getInteger(VTy: DstTy));
16541 auto *AShr = Builder.CreateAShr(LHS: Cast, RHS: 24, Name: "", isExact: true);
16542 auto *SI = Builder.CreateSIToFP(V: AShr, DestTy: DstTy);
16543 I->replaceAllUsesWith(V: SI);
16544 I->eraseFromParent();
16545 return true;
16546 }
16547
16548 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
16549 // followed by a truncate lowered to using tbl.4.
16550 auto *FPToUI = dyn_cast<FPToUIInst>(Val: I);
16551 if (FPToUI &&
16552 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
16553 SrcTy->getElementType()->isFloatTy() &&
16554 DstTy->getElementType()->isIntegerTy(Bitwidth: 8)) {
16555 IRBuilder<> Builder(I);
16556 auto *WideConv = Builder.CreateFPToUI(V: FPToUI->getOperand(i_nocapture: 0),
16557 DestTy: VectorType::getInteger(VTy: SrcTy));
16558 auto *TruncI = Builder.CreateTrunc(V: WideConv, DestTy: DstTy);
16559 I->replaceAllUsesWith(V: TruncI);
16560 I->eraseFromParent();
16561 createTblForTrunc(TI: cast<TruncInst>(Val: TruncI), IsLittleEndian: Subtarget->isLittleEndian());
16562 return true;
16563 }
16564
16565 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
16566 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
16567 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
16568 // registers
16569 auto *TI = dyn_cast<TruncInst>(Val: I);
16570 if (TI && DstTy->getElementType()->isIntegerTy(Bitwidth: 8) &&
16571 ((SrcTy->getElementType()->isIntegerTy(Bitwidth: 32) ||
16572 SrcTy->getElementType()->isIntegerTy(Bitwidth: 64)) &&
16573 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
16574 createTblForTrunc(TI, IsLittleEndian: Subtarget->isLittleEndian());
16575 return true;
16576 }
16577
16578 return false;
16579}
16580
16581bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
16582 Align &RequiredAligment) const {
16583 if (!LoadedType.isSimple() ||
16584 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
16585 return false;
16586 // Cyclone supports unaligned accesses.
16587 RequiredAligment = Align(1);
16588 unsigned NumBits = LoadedType.getSizeInBits();
16589 return NumBits == 32 || NumBits == 64;
16590}
16591
16592/// A helper function for determining the number of interleaved accesses we
16593/// will generate when lowering accesses of the given type.
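/// E.g. a fixed-length <16 x i32> (512-bit) access with 128-bit NEON vectors is
/// lowered as (16 * 32 + 127) / 128 = 4 interleaved accesses.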
16594unsigned AArch64TargetLowering::getNumInterleavedAccesses(
16595 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
16596 unsigned VecSize = 128;
16597 unsigned ElSize = DL.getTypeSizeInBits(Ty: VecTy->getElementType());
16598 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
16599 if (UseScalable && isa<FixedVectorType>(Val: VecTy))
16600 VecSize = std::max(a: Subtarget->getMinSVEVectorSizeInBits(), b: 128u);
16601 return std::max<unsigned>(a: 1, b: (MinElts * ElSize + 127) / VecSize);
16602}
16603
16604MachineMemOperand::Flags
16605AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
16606 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
16607 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
16608 return MOStridedAccess;
16609 return MachineMemOperand::MONone;
16610}
16611
16612bool AArch64TargetLowering::isLegalInterleavedAccessType(
16613 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
16614 unsigned ElSize = DL.getTypeSizeInBits(Ty: VecTy->getElementType());
16615 auto EC = VecTy->getElementCount();
16616 unsigned MinElts = EC.getKnownMinValue();
16617
16618 UseScalable = false;
16619
16620 if (isa<FixedVectorType>(Val: VecTy) && !Subtarget->isNeonAvailable() &&
16621 (!Subtarget->useSVEForFixedLengthVectors() ||
16622 !getSVEPredPatternFromNumElements(MinNumElts: MinElts)))
16623 return false;
16624
16625 if (isa<ScalableVectorType>(Val: VecTy) &&
16626 !Subtarget->isSVEorStreamingSVEAvailable())
16627 return false;
16628
16629 // Ensure the number of vector elements is greater than 1.
16630 if (MinElts < 2)
16631 return false;
16632
16633 // Ensure the element type is legal.
16634 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
16635 return false;
16636
16637 if (EC.isScalable()) {
16638 UseScalable = true;
16639 return isPowerOf2_32(Value: MinElts) && (MinElts * ElSize) % 128 == 0;
16640 }
16641
16642 unsigned VecSize = DL.getTypeSizeInBits(Ty: VecTy);
16643 if (Subtarget->useSVEForFixedLengthVectors()) {
16644 unsigned MinSVEVectorSize =
16645 std::max(a: Subtarget->getMinSVEVectorSizeInBits(), b: 128u);
16646 if (VecSize % MinSVEVectorSize == 0 ||
16647 (VecSize < MinSVEVectorSize && isPowerOf2_32(Value: MinElts) &&
16648 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
16649 UseScalable = true;
16650 return true;
16651 }
16652 }
16653
16654 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
16655 // 128 will be split into multiple interleaved accesses.
16656 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
16657}
16658
16659static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
16660 if (VTy->getElementType() == Type::getDoubleTy(C&: VTy->getContext()))
16661 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 2);
16662
16663 if (VTy->getElementType() == Type::getFloatTy(C&: VTy->getContext()))
16664 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 4);
16665
16666 if (VTy->getElementType() == Type::getBFloatTy(C&: VTy->getContext()))
16667 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 8);
16668
16669 if (VTy->getElementType() == Type::getHalfTy(C&: VTy->getContext()))
16670 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 8);
16671
16672 if (VTy->getElementType() == Type::getInt64Ty(C&: VTy->getContext()))
16673 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 2);
16674
16675 if (VTy->getElementType() == Type::getInt32Ty(C&: VTy->getContext()))
16676 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 4);
16677
16678 if (VTy->getElementType() == Type::getInt16Ty(C&: VTy->getContext()))
16679 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 8);
16680
16681 if (VTy->getElementType() == Type::getInt8Ty(C&: VTy->getContext()))
16682 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 16);
16683
16684 llvm_unreachable("Cannot handle input vector type");
16685}
16686
16687static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
16688 bool Scalable, Type *LDVTy,
16689 Type *PtrTy) {
16690 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
16691 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
16692 Intrinsic::aarch64_sve_ld3_sret,
16693 Intrinsic::aarch64_sve_ld4_sret};
16694 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
16695 Intrinsic::aarch64_neon_ld3,
16696 Intrinsic::aarch64_neon_ld4};
16697 if (Scalable)
16698 return Intrinsic::getDeclaration(M, id: SVELoads[Factor - 2], Tys: {LDVTy});
16699
16700 return Intrinsic::getDeclaration(M, id: NEONLoads[Factor - 2], Tys: {LDVTy, PtrTy});
16701}
16702
16703static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
16704 bool Scalable, Type *STVTy,
16705 Type *PtrTy) {
16706 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
16707 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
16708 Intrinsic::aarch64_sve_st3,
16709 Intrinsic::aarch64_sve_st4};
16710 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
16711 Intrinsic::aarch64_neon_st3,
16712 Intrinsic::aarch64_neon_st4};
16713 if (Scalable)
16714 return Intrinsic::getDeclaration(M, id: SVEStores[Factor - 2], Tys: {STVTy});
16715
16716 return Intrinsic::getDeclaration(M, id: NEONStores[Factor - 2], Tys: {STVTy, PtrTy});
16717}
16718
16719/// Lower an interleaved load into a ldN intrinsic.
16720///
16721/// E.g. Lower an interleaved load (Factor = 2):
16722/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
16723/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
16724/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
16725///
16726/// Into:
16727/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
16728/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
16729/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
16730bool AArch64TargetLowering::lowerInterleavedLoad(
16731 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
16732 ArrayRef<unsigned> Indices, unsigned Factor) const {
16733 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16734 "Invalid interleave factor");
16735 assert(!Shuffles.empty() && "Empty shufflevector input");
16736 assert(Shuffles.size() == Indices.size() &&
16737 "Unmatched number of shufflevectors and indices");
16738
16739 const DataLayout &DL = LI->getDataLayout();
16740
16741 VectorType *VTy = Shuffles[0]->getType();
16742
16743 // Skip if we do not have NEON and skip illegal vector types. We can
16744 // "legalize" wide vector types into multiple interleaved accesses as long as
16745 // the vector types are divisible by 128.
16746 bool UseScalable;
16747 if (!isLegalInterleavedAccessType(VecTy: VTy, DL, UseScalable))
16748 return false;
16749
16750 unsigned NumLoads = getNumInterleavedAccesses(VecTy: VTy, DL, UseScalable);
16751
16752 auto *FVTy = cast<FixedVectorType>(Val: VTy);
16753
16754 // A pointer vector can not be the return type of the ldN intrinsics. Need to
16755 // load integer vectors first and then convert to pointer vectors.
16756 Type *EltTy = FVTy->getElementType();
16757 if (EltTy->isPointerTy())
16758 FVTy =
16759 FixedVectorType::get(ElementType: DL.getIntPtrType(EltTy), NumElts: FVTy->getNumElements());
16760
16761 // If we're going to generate more than one load, reset the sub-vector type
16762 // to something legal.
16763 FVTy = FixedVectorType::get(ElementType: FVTy->getElementType(),
16764 NumElts: FVTy->getNumElements() / NumLoads);
16765
16766 auto *LDVTy =
16767 UseScalable ? cast<VectorType>(Val: getSVEContainerIRType(VTy: FVTy)) : FVTy;
16768
16769 IRBuilder<> Builder(LI);
16770
16771 // The base address of the load.
16772 Value *BaseAddr = LI->getPointerOperand();
16773
16774 Type *PtrTy = LI->getPointerOperandType();
16775 Type *PredTy = VectorType::get(ElementType: Type::getInt1Ty(C&: LDVTy->getContext()),
16776 EC: LDVTy->getElementCount());
16777
16778 Function *LdNFunc = getStructuredLoadFunction(M: LI->getModule(), Factor,
16779 Scalable: UseScalable, LDVTy, PtrTy);
16780
16781 // Holds sub-vectors extracted from the load intrinsic return values. The
16782 // sub-vectors are associated with the shufflevector instructions they will
16783 // replace.
16784 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
16785
16786 Value *PTrue = nullptr;
16787 if (UseScalable) {
16788 std::optional<unsigned> PgPattern =
16789 getSVEPredPatternFromNumElements(MinNumElts: FVTy->getNumElements());
16790 if (Subtarget->getMinSVEVectorSizeInBits() ==
16791 Subtarget->getMaxSVEVectorSizeInBits() &&
16792 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(Ty: FVTy))
16793 PgPattern = AArch64SVEPredPattern::all;
16794
16795 auto *PTruePat =
16796 ConstantInt::get(Ty: Type::getInt32Ty(C&: LDVTy->getContext()), V: *PgPattern);
16797 PTrue = Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue, Types: {PredTy},
16798 Args: {PTruePat});
16799 }
16800
16801 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
16802
16803 // If we're generating more than one load, compute the base address of
16804 // subsequent loads as an offset from the previous.
16805 if (LoadCount > 0)
16806 BaseAddr = Builder.CreateConstGEP1_32(Ty: LDVTy->getElementType(), Ptr: BaseAddr,
16807 Idx0: FVTy->getNumElements() * Factor);
16808
16809 CallInst *LdN;
16810 if (UseScalable)
16811 LdN = Builder.CreateCall(Callee: LdNFunc, Args: {PTrue, BaseAddr}, Name: "ldN");
16812 else
16813 LdN = Builder.CreateCall(Callee: LdNFunc, Args: BaseAddr, Name: "ldN");
16814
16815 // Extract and store the sub-vectors returned by the load intrinsic.
16816 for (unsigned i = 0; i < Shuffles.size(); i++) {
16817 ShuffleVectorInst *SVI = Shuffles[i];
16818 unsigned Index = Indices[i];
16819
16820 Value *SubVec = Builder.CreateExtractValue(Agg: LdN, Idxs: Index);
16821
16822 if (UseScalable)
16823 SubVec = Builder.CreateExtractVector(
16824 DstType: FVTy, SrcVec: SubVec,
16825 Idx: ConstantInt::get(Ty: Type::getInt64Ty(C&: VTy->getContext()), V: 0));
16826
16827 // Convert the integer vector to pointer vector if the element is pointer.
16828 if (EltTy->isPointerTy())
16829 SubVec = Builder.CreateIntToPtr(
16830 V: SubVec, DestTy: FixedVectorType::get(ElementType: SVI->getType()->getElementType(),
16831 NumElts: FVTy->getNumElements()));
16832
16833 SubVecs[SVI].push_back(Elt: SubVec);
16834 }
16835 }
16836
16837 // Replace uses of the shufflevector instructions with the sub-vectors
16838 // returned by the load intrinsic. If a shufflevector instruction is
16839 // associated with more than one sub-vector, those sub-vectors will be
16840 // concatenated into a single wide vector.
16841 for (ShuffleVectorInst *SVI : Shuffles) {
16842 auto &SubVec = SubVecs[SVI];
16843 auto *WideVec =
16844 SubVec.size() > 1 ? concatenateVectors(Builder, Vecs: SubVec) : SubVec[0];
16845 SVI->replaceAllUsesWith(V: WideVec);
16846 }
16847
16848 return true;
16849}
16850
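// Scan up to ~20 non-debug instructions forwards (or backwards) from It for another
// store whose address has the same base as Ptr but differs by exactly 16 bytes,
// i.e. a neighbouring 128-bit slot that this store could be paired with.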
16851template <typename Iter>
16852bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
16853 int MaxLookupDist = 20;
16854 unsigned IdxWidth = DL.getIndexSizeInBits(AS: 0);
16855 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
16856 const Value *PtrA1 =
16857 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset&: OffsetA);
16858
16859 while (++It != End) {
16860 if (It->isDebugOrPseudoInst())
16861 continue;
16862 if (MaxLookupDist-- == 0)
16863 break;
16864 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
16865 const Value *PtrB1 =
16866 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
16867 DL, OffsetB);
16868 if (PtrA1 == PtrB1 &&
16869 (OffsetA.sextOrTrunc(width: IdxWidth) - OffsetB.sextOrTrunc(width: IdxWidth))
16870 .abs() == 16)
16871 return true;
16872 }
16873 }
16874
16875 return false;
16876}
16877
16878/// Lower an interleaved store into a stN intrinsic.
16879///
16880/// E.g. Lower an interleaved store (Factor = 3):
16881/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
16882/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
16883/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16884///
16885/// Into:
16886/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
16887/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
16888/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
16889/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16890///
16891/// Note that the new shufflevectors will be removed and we'll only generate one
16892/// st3 instruction in CodeGen.
16893///
16894/// Example for a more general valid mask (Factor 3). Lower:
16895/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
16896/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
16897/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16898///
16899/// Into:
16900/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
16901/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
16902/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
16903/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16904bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
16905 ShuffleVectorInst *SVI,
16906 unsigned Factor) const {
16907
16908 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16909 "Invalid interleave factor");
16910
16911 auto *VecTy = cast<FixedVectorType>(Val: SVI->getType());
16912 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
16913
16914 unsigned LaneLen = VecTy->getNumElements() / Factor;
16915 Type *EltTy = VecTy->getElementType();
16916 auto *SubVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: LaneLen);
16917
16918 const DataLayout &DL = SI->getDataLayout();
16919 bool UseScalable;
16920
16921 // Skip if we do not have NEON and skip illegal vector types. We can
16922 // "legalize" wide vector types into multiple interleaved accesses as long as
16923 // the vector types are divisible by 128.
16924 if (!isLegalInterleavedAccessType(VecTy: SubVecTy, DL, UseScalable))
16925 return false;
16926
16927 unsigned NumStores = getNumInterleavedAccesses(VecTy: SubVecTy, DL, UseScalable);
16928
16929 Value *Op0 = SVI->getOperand(i_nocapture: 0);
16930 Value *Op1 = SVI->getOperand(i_nocapture: 1);
16931 IRBuilder<> Builder(SI);
16932
16933 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
16934 // vectors to integer vectors.
16935 if (EltTy->isPointerTy()) {
16936 Type *IntTy = DL.getIntPtrType(EltTy);
16937 unsigned NumOpElts =
16938 cast<FixedVectorType>(Val: Op0->getType())->getNumElements();
16939
16940 // Convert to the corresponding integer vector.
16941 auto *IntVecTy = FixedVectorType::get(ElementType: IntTy, NumElts: NumOpElts);
16942 Op0 = Builder.CreatePtrToInt(V: Op0, DestTy: IntVecTy);
16943 Op1 = Builder.CreatePtrToInt(V: Op1, DestTy: IntVecTy);
16944
16945 SubVecTy = FixedVectorType::get(ElementType: IntTy, NumElts: LaneLen);
16946 }
16947
16948 // If we're going to generate more than one store, reset the lane length
16949 // and sub-vector type to something legal.
16950 LaneLen /= NumStores;
16951 SubVecTy = FixedVectorType::get(ElementType: SubVecTy->getElementType(), NumElts: LaneLen);
16952
16953 auto *STVTy = UseScalable ? cast<VectorType>(Val: getSVEContainerIRType(VTy: SubVecTy))
16954 : SubVecTy;
16955
16956 // The base address of the store.
16957 Value *BaseAddr = SI->getPointerOperand();
16958
16959 auto Mask = SVI->getShuffleMask();
16960
16961  // Bail out if all of the mask indices are out of range: if the mask is
16962  // `poison`, `Mask` may be a vector of -1s, and if all of them are `poison`
16963  // an out-of-bounds read would happen later.
16964 if (llvm::all_of(Range&: Mask, P: [](int Idx) { return Idx == PoisonMaskElem; })) {
16965 return false;
16966 }
16967  // A 64-bit st2 which does not start at element 0 will involve adding extra
16968  // ext elements, making the st2 unprofitable. Also, if there is a nearby store
16969  // that points to BaseAddr+16 or BaseAddr-16, it can be better left as a
16970  // zip;stp pair, which has higher throughput.
16971 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
16972 (Mask[0] != 0 ||
16973 hasNearbyPairedStore(It: SI->getIterator(), End: SI->getParent()->end(), Ptr: BaseAddr,
16974 DL) ||
16975 hasNearbyPairedStore(It: SI->getReverseIterator(), End: SI->getParent()->rend(),
16976 Ptr: BaseAddr, DL)))
16977 return false;
16978
16979 Type *PtrTy = SI->getPointerOperandType();
16980 Type *PredTy = VectorType::get(ElementType: Type::getInt1Ty(C&: STVTy->getContext()),
16981 EC: STVTy->getElementCount());
16982
16983 Function *StNFunc = getStructuredStoreFunction(M: SI->getModule(), Factor,
16984 Scalable: UseScalable, STVTy, PtrTy);
16985
16986 Value *PTrue = nullptr;
16987 if (UseScalable) {
16988 std::optional<unsigned> PgPattern =
16989 getSVEPredPatternFromNumElements(MinNumElts: SubVecTy->getNumElements());
16990 if (Subtarget->getMinSVEVectorSizeInBits() ==
16991 Subtarget->getMaxSVEVectorSizeInBits() &&
16992 Subtarget->getMinSVEVectorSizeInBits() ==
16993 DL.getTypeSizeInBits(Ty: SubVecTy))
16994 PgPattern = AArch64SVEPredPattern::all;
16995
16996 auto *PTruePat =
16997 ConstantInt::get(Ty: Type::getInt32Ty(C&: STVTy->getContext()), V: *PgPattern);
16998 PTrue = Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue, Types: {PredTy},
16999 Args: {PTruePat});
17000 }
17001
17002 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
17003
17004 SmallVector<Value *, 5> Ops;
17005
17006 // Split the shufflevector operands into sub vectors for the new stN call.
17007 for (unsigned i = 0; i < Factor; i++) {
17008 Value *Shuffle;
17009 unsigned IdxI = StoreCount * LaneLen * Factor + i;
17010 if (Mask[IdxI] >= 0) {
17011 Shuffle = Builder.CreateShuffleVector(
17012 V1: Op0, V2: Op1, Mask: createSequentialMask(Start: Mask[IdxI], NumInts: LaneLen, NumUndefs: 0));
17013 } else {
17014 unsigned StartMask = 0;
17015 for (unsigned j = 1; j < LaneLen; j++) {
17016 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
17017 if (Mask[IdxJ] >= 0) {
17018 StartMask = Mask[IdxJ] - j;
17019 break;
17020 }
17021 }
17022        // Note: Filling undef gaps with random elements is OK, since
17023        // those elements were being written anyway (with undefs).
17024        // In the case of all undefs we default to using elements from 0.
17025        // Note: StartMask cannot be negative; that is checked in
17026        // isReInterleaveMask.
17027 Shuffle = Builder.CreateShuffleVector(
17028 V1: Op0, V2: Op1, Mask: createSequentialMask(Start: StartMask, NumInts: LaneLen, NumUndefs: 0));
17029 }
17030
17031 if (UseScalable)
17032 Shuffle = Builder.CreateInsertVector(
17033 DstType: STVTy, SrcVec: UndefValue::get(T: STVTy), SubVec: Shuffle,
17034 Idx: ConstantInt::get(Ty: Type::getInt64Ty(C&: STVTy->getContext()), V: 0));
17035
17036 Ops.push_back(Elt: Shuffle);
17037 }
17038
17039 if (UseScalable)
17040 Ops.push_back(Elt: PTrue);
17041
17042    // If we're generating more than one store, compute the base address of
17043    // subsequent stores as an offset from the previous one.
17044 if (StoreCount > 0)
17045 BaseAddr = Builder.CreateConstGEP1_32(Ty: SubVecTy->getElementType(),
17046 Ptr: BaseAddr, Idx0: LaneLen * Factor);
17047
17048 Ops.push_back(Elt: BaseAddr);
17049 Builder.CreateCall(Callee: StNFunc, Args: Ops);
17050 }
17051 return true;
17052}
17053
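// Lower a load feeding a vector.deinterleave2 into an ldN intrinsic, e.g. (roughly):
//   %l = load <vscale x 4 x i32>, ptr %p
//   %d = call {<vscale x 2 x i32>, <vscale x 2 x i32>}
//            @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %l)
// becomes a single call to llvm.aarch64.sve.ld2.sret with an all-true predicate.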
17054bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
17055 IntrinsicInst *DI, LoadInst *LI) const {
17056 // Only deinterleave2 supported at present.
17057 if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
17058 return false;
17059
17060 // Only a factor of 2 supported at present.
17061 const unsigned Factor = 2;
17062
17063 VectorType *VTy = cast<VectorType>(Val: DI->getType()->getContainedType(i: 0));
17064 const DataLayout &DL = DI->getDataLayout();
17065 bool UseScalable;
17066 if (!isLegalInterleavedAccessType(VecTy: VTy, DL, UseScalable))
17067 return false;
17068
17069 // TODO: Add support for using SVE instructions with fixed types later, using
17070 // the code from lowerInterleavedLoad to obtain the correct container type.
17071 if (UseScalable && !VTy->isScalableTy())
17072 return false;
17073
17074 unsigned NumLoads = getNumInterleavedAccesses(VecTy: VTy, DL, UseScalable);
17075
17076 VectorType *LdTy =
17077 VectorType::get(ElementType: VTy->getElementType(),
17078 EC: VTy->getElementCount().divideCoefficientBy(RHS: NumLoads));
17079
17080 Type *PtrTy = LI->getPointerOperandType();
17081 Function *LdNFunc = getStructuredLoadFunction(M: DI->getModule(), Factor,
17082 Scalable: UseScalable, LDVTy: LdTy, PtrTy);
17083
17084 IRBuilder<> Builder(LI);
17085
17086 Value *Pred = nullptr;
17087 if (UseScalable)
17088 Pred =
17089 Builder.CreateVectorSplat(EC: LdTy->getElementCount(), V: Builder.getTrue());
17090
17091 Value *BaseAddr = LI->getPointerOperand();
17092 Value *Result;
17093 if (NumLoads > 1) {
17094 Value *Left = PoisonValue::get(T: VTy);
17095 Value *Right = PoisonValue::get(T: VTy);
17096
17097 for (unsigned I = 0; I < NumLoads; ++I) {
17098 Value *Offset = Builder.getInt64(C: I * Factor);
17099
17100 Value *Address = Builder.CreateGEP(Ty: LdTy, Ptr: BaseAddr, IdxList: {Offset});
17101 Value *LdN = nullptr;
17102 if (UseScalable)
17103 LdN = Builder.CreateCall(Callee: LdNFunc, Args: {Pred, Address}, Name: "ldN");
17104 else
17105 LdN = Builder.CreateCall(Callee: LdNFunc, Args: Address, Name: "ldN");
17106
17107 Value *Idx =
17108 Builder.getInt64(C: I * LdTy->getElementCount().getKnownMinValue());
17109 Left = Builder.CreateInsertVector(
17110 DstType: VTy, SrcVec: Left, SubVec: Builder.CreateExtractValue(Agg: LdN, Idxs: 0), Idx);
17111 Right = Builder.CreateInsertVector(
17112 DstType: VTy, SrcVec: Right, SubVec: Builder.CreateExtractValue(Agg: LdN, Idxs: 1), Idx);
17113 }
17114
17115 Result = PoisonValue::get(T: DI->getType());
17116 Result = Builder.CreateInsertValue(Agg: Result, Val: Left, Idxs: 0);
17117 Result = Builder.CreateInsertValue(Agg: Result, Val: Right, Idxs: 1);
17118 } else {
17119 if (UseScalable)
17120 Result = Builder.CreateCall(Callee: LdNFunc, Args: {Pred, BaseAddr}, Name: "ldN");
17121 else
17122 Result = Builder.CreateCall(Callee: LdNFunc, Args: BaseAddr, Name: "ldN");
17123 }
17124
17125 DI->replaceAllUsesWith(V: Result);
17126 return true;
17127}
17128
17129bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
17130 IntrinsicInst *II, StoreInst *SI) const {
17131 // Only interleave2 supported at present.
17132 if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
17133 return false;
17134
17135 // Only a factor of 2 supported at present.
17136 const unsigned Factor = 2;
17137
17138 VectorType *VTy = cast<VectorType>(Val: II->getOperand(i_nocapture: 0)->getType());
17139 const DataLayout &DL = II->getDataLayout();
17140 bool UseScalable;
17141 if (!isLegalInterleavedAccessType(VecTy: VTy, DL, UseScalable))
17142 return false;
17143
17144 // TODO: Add support for using SVE instructions with fixed types later, using
17145 // the code from lowerInterleavedStore to obtain the correct container type.
17146 if (UseScalable && !VTy->isScalableTy())
17147 return false;
17148
17149 unsigned NumStores = getNumInterleavedAccesses(VecTy: VTy, DL, UseScalable);
17150
17151 VectorType *StTy =
17152 VectorType::get(ElementType: VTy->getElementType(),
17153 EC: VTy->getElementCount().divideCoefficientBy(RHS: NumStores));
17154
17155 Type *PtrTy = SI->getPointerOperandType();
17156 Function *StNFunc = getStructuredStoreFunction(M: SI->getModule(), Factor,
17157 Scalable: UseScalable, STVTy: StTy, PtrTy);
17158
17159 IRBuilder<> Builder(SI);
17160
17161 Value *BaseAddr = SI->getPointerOperand();
17162 Value *Pred = nullptr;
17163
17164 if (UseScalable)
17165 Pred =
17166 Builder.CreateVectorSplat(EC: StTy->getElementCount(), V: Builder.getTrue());
17167
17168 Value *L = II->getOperand(i_nocapture: 0);
17169 Value *R = II->getOperand(i_nocapture: 1);
17170
17171 for (unsigned I = 0; I < NumStores; ++I) {
17172 Value *Address = BaseAddr;
17173 if (NumStores > 1) {
17174 Value *Offset = Builder.getInt64(C: I * Factor);
17175 Address = Builder.CreateGEP(Ty: StTy, Ptr: BaseAddr, IdxList: {Offset});
17176
17177 Value *Idx =
17178 Builder.getInt64(C: I * StTy->getElementCount().getKnownMinValue());
17179 L = Builder.CreateExtractVector(DstType: StTy, SrcVec: II->getOperand(i_nocapture: 0), Idx);
17180 R = Builder.CreateExtractVector(DstType: StTy, SrcVec: II->getOperand(i_nocapture: 1), Idx);
17181 }
17182
17183 if (UseScalable)
17184 Builder.CreateCall(Callee: StNFunc, Args: {L, R, Pred, Address});
17185 else
17186 Builder.CreateCall(Callee: StNFunc, Args: {L, R, Address});
17187 }
17188
17189 return true;
17190}
17191
17192EVT AArch64TargetLowering::getOptimalMemOpType(
17193 const MemOp &Op, const AttributeList &FuncAttributes) const {
17194 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Kind: Attribute::NoImplicitFloat);
17195 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
17196 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
17197 // Only use AdvSIMD to implement memsets of 32 bytes and above: below that it
17198 // would take one instruction to materialize the v2i64 zero plus one store
17199 // (with a restrictive addressing mode), so just do i64 stores.
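  // For example, with NEON available and no noimplicitfloat attribute, a
  // 16-byte-aligned 64-byte memset gets MVT::v16i8, while a 16-byte-aligned
  // 16-byte memset falls through to MVT::i64.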
17200 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
17201 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
17202 if (Op.isAligned(AlignCheck))
17203 return true;
17204 unsigned Fast;
17205 return allowsMisalignedMemoryAccesses(VT, AddrSpace: 0, Alignment: Align(1),
17206 Flags: MachineMemOperand::MONone, Fast: &Fast) &&
17207 Fast;
17208 };
17209
17210 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
17211 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
17212 return MVT::v16i8;
17213 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
17214 return MVT::f128;
17215 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
17216 return MVT::i64;
17217 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
17218 return MVT::i32;
17219 return MVT::Other;
17220}
17221
17222LLT AArch64TargetLowering::getOptimalMemOpLLT(
17223 const MemOp &Op, const AttributeList &FuncAttributes) const {
17224 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Kind: Attribute::NoImplicitFloat);
17225 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
17226 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
17227 // Only use AdvSIMD to implement memsets of 32 bytes and above: below that it
17228 // would take one instruction to materialize the v2i64 zero plus one store
17229 // (with a restrictive addressing mode), so just do i64 stores.
17230 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
17231 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
17232 if (Op.isAligned(AlignCheck))
17233 return true;
17234 unsigned Fast;
17235 return allowsMisalignedMemoryAccesses(VT, AddrSpace: 0, Alignment: Align(1),
17236 Flags: MachineMemOperand::MONone, Fast: &Fast) &&
17237 Fast;
17238 };
17239
17240 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
17241 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
17242 return LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
17243 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
17244 return LLT::scalar(SizeInBits: 128);
17245 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
17246 return LLT::scalar(SizeInBits: 64);
17247 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
17248 return LLT::scalar(SizeInBits: 32);
17249 return LLT();
17250}
17251
17252// 12-bit optionally shifted immediates are legal for adds.
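// For example, 0xabc (fits in 12 bits) and 0xabc000 (a 12-bit value shifted
// left by 12) are legal add immediates, while 0x1001 is not, since it has bits
// set in both halves.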
17253bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
17254 if (Immed == std::numeric_limits<int64_t>::min()) {
17255 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
17256 << ": avoid UB for INT64_MIN\n");
17257 return false;
17258 }
17259 // Same encoding for add/sub, just flip the sign.
17260 Immed = std::abs(i: Immed);
17261 bool IsLegal = ((Immed >> 12) == 0 ||
17262 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
17263 LLVM_DEBUG(dbgs() << "Is " << Immed
17264 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
17265 return IsLegal;
17266}
17267
17268bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
17269 // We will only emit addvl/inc* instructions for SVE2
17270 if (!Subtarget->hasSVE2())
17271 return false;
17272
17273 // addvl's immediates are in terms of the number of bytes in a register.
17274 // Since the base supported size (128 bits) holds 16 bytes, we need to divide
17275 // the immediate by 16 to get a useful immediate to multiply by vscale. The
17276 // division must leave no remainder.
17277 if (Imm % 16 == 0)
17278 return isInt<6>(x: Imm / 16);
17279
17280 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
17281 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
17282 // of addvl as a result, so only take h|w|d into account.
17283 // Dec[h|w|d] will cover subtractions.
17284 // Immediates are in the range [1,16], so we can't do a 2's complement check.
17285 // FIXME: Can we make use of other patterns to cover other immediates?
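  // For example, Imm == 32 (2 * 16 bytes) is handled by addvl #2, and
  // Imm == 40 (5 * 8 bytes) is handled by inch with an all-true pattern and a
  // multiplier of 5.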
17286
17287 // inch|dech
17288 if (Imm % 8 == 0)
17289 return std::abs(i: Imm / 8) <= 16;
17290 // incw|decw
17291 if (Imm % 4 == 0)
17292 return std::abs(i: Imm / 4) <= 16;
17293 // incd|decd
17294 if (Imm % 2 == 0)
17295 return std::abs(i: Imm / 2) <= 16;
17296
17297 return false;
17298}
17299
17300// Return false to prevent folding
17301// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
17302// if the folding leads to worse code.
17303bool AArch64TargetLowering::isMulAddWithConstProfitable(
17304 SDValue AddNode, SDValue ConstNode) const {
17305 // Let the DAGCombiner decide for vector types and large types.
17306 const EVT VT = AddNode.getValueType();
17307 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
17308 return true;
17309
17310 // The fold is worse if c1 is a legal add immediate while c1*c2 is not, and
17311 // c1*c2 has to be composed of at least two instructions.
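  // For example, with c1 == 1 and c2 == 0x111111, the product 0x111111 is not
  // a legal add immediate and needs a MOVZ plus a MOVK to materialize, so the
  // fold is rejected below.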
17312 const ConstantSDNode *C1Node = cast<ConstantSDNode>(Val: AddNode.getOperand(i: 1));
17313 const ConstantSDNode *C2Node = cast<ConstantSDNode>(Val&: ConstNode);
17314 const int64_t C1 = C1Node->getSExtValue();
17315 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
17316 if (!isLegalAddImmediate(Immed: C1) || isLegalAddImmediate(Immed: C1C2.getSExtValue()))
17317 return true;
17318 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
17319 // Adapt to the width of a register.
17320 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
17321 AArch64_IMM::expandMOVImm(Imm: C1C2.getZExtValue(), BitSize, Insn);
17322 if (Insn.size() > 1)
17323 return false;
17324
17325 // Default to true and let the DAGCombiner decide.
17326 return true;
17327}
17328
17329// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
17330// immediates is the same as for an add or a sub.
17331bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
17332 return isLegalAddImmediate(Immed);
17333}
17334
17335/// isLegalAddressingMode - Return true if the addressing mode represented
17336/// by AM is legal for this target, for a load/store of the specified type.
17337bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
17338 const AddrMode &AMode, Type *Ty,
17339 unsigned AS, Instruction *I) const {
17340 // AArch64 has five basic addressing modes:
17341 // reg
17342 // reg + 9-bit signed offset
17343 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
17344 // reg1 + reg2
17345 // reg + SIZE_IN_BYTES * reg
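  // For example (for an 8-byte access):
  //   ldr  x0, [x1]              // reg
  //   ldur x0, [x1, #-17]        // reg + 9-bit signed offset
  //   ldr  x0, [x1, #32]         // reg + SIZE_IN_BYTES * 12-bit unsigned offset
  //   ldr  x0, [x1, x2]          // reg1 + reg2
  //   ldr  x0, [x1, x2, lsl #3]  // reg + SIZE_IN_BYTES * reg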
17346
17347 // No global is ever allowed as a base.
17348 if (AMode.BaseGV)
17349 return false;
17350
17351 // No reg+reg+imm addressing.
17352 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
17353 return false;
17354
17355 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
17356 // `2*ScaledReg` into `BaseReg + ScaledReg`
17357 AddrMode AM = AMode;
17358 if (AM.Scale && !AM.HasBaseReg) {
17359 if (AM.Scale == 1) {
17360 AM.HasBaseReg = true;
17361 AM.Scale = 0;
17362 } else if (AM.Scale == 2) {
17363 AM.HasBaseReg = true;
17364 AM.Scale = 1;
17365 } else {
17366 return false;
17367 }
17368 }
17369
17370 // A base register is required in all addressing modes.
17371 if (!AM.HasBaseReg)
17372 return false;
17373
17374 if (Ty->isScalableTy()) {
17375 if (isa<ScalableVectorType>(Val: Ty)) {
17376 // See if we have a foldable vscale-based offset, for vector types which
17377 // are either legal or smaller than the minimum; more work will be
17378 // required if we need to consider addressing for types which need
17379 // legalization by splitting.
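      // For example, a packed 16-byte SVE vector addressed with a scalable
      // offset of 3 * 16 bytes folds into the signed 4-bit vector-indexed
      // immediate form (e.g. [x0, #3, mul vl]).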
17380 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
17381 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
17382 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
17383 isPowerOf2_64(Value: VecNumBytes))
17384 return isInt<4>(x: AM.ScalableOffset / (int64_t)VecNumBytes);
17385
17386 uint64_t VecElemNumBytes =
17387 DL.getTypeSizeInBits(Ty: cast<VectorType>(Val: Ty)->getElementType()) / 8;
17388 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
17389 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
17390 }
17391
17392 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
17393 }
17394
17395 // No scalable offsets allowed for non-scalable types.
17396 if (AM.ScalableOffset)
17397 return false;
17398
17399 // check reg + imm case:
17400 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
17401 uint64_t NumBytes = 0;
17402 if (Ty->isSized()) {
17403 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
17404 NumBytes = NumBits / 8;
17405 if (!isPowerOf2_64(Value: NumBits))
17406 NumBytes = 0;
17407 }
17408
17409 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, Offset: AM.BaseOffs,
17410 Scale: AM.Scale);
17411}
17412
17413 // Check whether the two offsets belong to the same imm24 range and share the
17414 // same high 12 bits; if so, the high part can be encoded in the immediate of an add.
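// For example, MinOffset == 0x1234 and MaxOffset == 0x1378 share the high part
// 0x1000, which is itself a legal add immediate, so 0x1000 is returned as the
// preferred base offset.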
17415int64_t
17416AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
17417 int64_t MaxOffset) const {
17418 int64_t HighPart = MinOffset & ~0xfffULL;
17419 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(Immed: HighPart)) {
17420 // Rebase the value to an integer multiple of imm12.
17421 return HighPart;
17422 }
17423
17424 return 0;
17425}
17426
17427bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
17428 // Consider splitting large offset of struct or array.
17429 return true;
17430}
17431
17432bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
17433 const MachineFunction &MF, EVT VT) const {
17434 VT = VT.getScalarType();
17435
17436 if (!VT.isSimple())
17437 return false;
17438
17439 switch (VT.getSimpleVT().SimpleTy) {
17440 case MVT::f16:
17441 return Subtarget->hasFullFP16();
17442 case MVT::f32:
17443 case MVT::f64:
17444 return true;
17445 default:
17446 break;
17447 }
17448
17449 return false;
17450}
17451
17452bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
17453 Type *Ty) const {
17454 switch (Ty->getScalarType()->getTypeID()) {
17455 case Type::FloatTyID:
17456 case Type::DoubleTyID:
17457 return true;
17458 default:
17459 return false;
17460 }
17461}
17462
17463bool AArch64TargetLowering::generateFMAsInMachineCombiner(
17464 EVT VT, CodeGenOptLevel OptLevel) const {
17465 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
17466 !useSVEForFixedLengthVectorVT(VT);
17467}
17468
17469const MCPhysReg *
17470AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
17471 // LR is a callee-save register, but we must treat it as clobbered by any call
17472 // site. Hence we include LR in the scratch registers, which are in turn added
17473 // as implicit-defs for stackmaps and patchpoints.
17474 static const MCPhysReg ScratchRegs[] = {
17475 AArch64::X16, AArch64::X17, AArch64::LR, 0
17476 };
17477 return ScratchRegs;
17478}
17479
17480ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
17481 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
17482 return RCRegs;
17483}
17484
17485bool
17486AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
17487 CombineLevel Level) const {
17488 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
17489 N->getOpcode() == ISD::SRL) &&
17490 "Expected shift op");
17491
17492 SDValue ShiftLHS = N->getOperand(Num: 0);
17493 EVT VT = N->getValueType(ResNo: 0);
17494
17495 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
17496 // combine it with shift 'N' to let it be lowered to UBFX except:
17497 // ((x >> C) & mask) << C.
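  // For example, with ((x >> 3) & 0xff) as the shift LHS, shifting left by 3
  // returns true (the shift amounts match), while shifting left by 6 returns
  // false so that the srl+and can still be selected as a UBFX.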
17498 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
17499 isa<ConstantSDNode>(Val: ShiftLHS.getOperand(i: 1))) {
17500 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(i: 1);
17501 if (isMask_64(Value: TruncMask)) {
17502 SDValue AndLHS = ShiftLHS.getOperand(i: 0);
17503 if (AndLHS.getOpcode() == ISD::SRL) {
17504 if (auto *SRLC = dyn_cast<ConstantSDNode>(Val: AndLHS.getOperand(i: 1))) {
17505 if (N->getOpcode() == ISD::SHL)
17506 if (auto *SHLC = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1)))
17507 return SRLC->getZExtValue() == SHLC->getZExtValue();
17508 return false;
17509 }
17510 }
17511 }
17512 }
17513 return true;
17514}
17515
17516bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
17517 const SDNode *N) const {
17518 assert(N->getOpcode() == ISD::XOR &&
17519 (N->getOperand(0).getOpcode() == ISD::SHL ||
17520 N->getOperand(0).getOpcode() == ISD::SRL) &&
17521 "Expected XOR(SHIFT) pattern");
17522
17523 // Only commute if the entire NOT mask is a hidden shifted mask.
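  // For example, (xor (shl x, 8), 0xffffff00) on i32 qualifies: the xor
  // constant is a 24-bit mask starting at bit 8, lining up with the shift
  // amount.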
17524 auto *XorC = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
17525 auto *ShiftC = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0).getOperand(i: 1));
17526 if (XorC && ShiftC) {
17527 unsigned MaskIdx, MaskLen;
17528 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
17529 unsigned ShiftAmt = ShiftC->getZExtValue();
17530 unsigned BitWidth = N->getValueType(ResNo: 0).getScalarSizeInBits();
17531 if (N->getOperand(Num: 0).getOpcode() == ISD::SHL)
17532 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
17533 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
17534 }
17535 }
17536
17537 return false;
17538}
17539
17540bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
17541 const SDNode *N, CombineLevel Level) const {
17542 assert(((N->getOpcode() == ISD::SHL &&
17543 N->getOperand(0).getOpcode() == ISD::SRL) ||
17544 (N->getOpcode() == ISD::SRL &&
17545 N->getOperand(0).getOpcode() == ISD::SHL)) &&
17546 "Expected shift-shift mask");
17547 // Don't allow multiuse shift folding with the same shift amount.
17548 if (!N->getOperand(Num: 0)->hasOneUse())
17549 return false;
17550
17551 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
17552 EVT VT = N->getValueType(ResNo: 0);
17553 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
17554 auto *C1 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0).getOperand(i: 1));
17555 auto *C2 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
17556 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
17557 }
17558
17559 return true;
17560}
17561
17562bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
17563 unsigned BinOpcode, EVT VT) const {
17564 return VT.isScalableVector() && isTypeLegal(VT);
17565}
17566
17567bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
17568 Type *Ty) const {
17569 assert(Ty->isIntegerTy());
17570
17571 unsigned BitSize = Ty->getPrimitiveSizeInBits();
17572 if (BitSize == 0)
17573 return false;
17574
17575 int64_t Val = Imm.getSExtValue();
17576 if (Val == 0 || AArch64_AM::isLogicalImmediate(imm: Val, regSize: BitSize))
17577 return true;
17578
17579 if ((int64_t)Val < 0)
17580 Val = ~Val;
17581 if (BitSize == 32)
17582 Val &= (1LL << 32) - 1;
17583
17584 unsigned Shift = llvm::Log2_64(Value: (uint64_t)Val) / 16;
17585 // MOVZ is free, so return true when at most two MOVKs are needed (Shift < 3).
17586 return Shift < 3;
17587}
17588
17589bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
17590 unsigned Index) const {
17591 if (!isOperationLegalOrCustom(Op: ISD::EXTRACT_SUBVECTOR, VT: ResVT))
17592 return false;
17593
17594 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
17595}
17596
17597/// Turn vector tests of the signbit in the form of:
17598/// xor (sra X, elt_size(X)-1), -1
17599/// into:
17600/// cmge X, X, #0
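/// The ashr smears the sign bit, producing all-ones for negative lanes and
/// zero otherwise; inverting that is exactly a compare greater-than-or-equal
/// to zero, which CMGEz computes directly.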
17601static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
17602 const AArch64Subtarget *Subtarget) {
17603 EVT VT = N->getValueType(ResNo: 0);
17604 if (!Subtarget->hasNEON() || !VT.isVector())
17605 return SDValue();
17606
17607 // There must be a shift right algebraic before the xor, and the xor must be a
17608 // 'not' operation.
17609 SDValue Shift = N->getOperand(Num: 0);
17610 SDValue Ones = N->getOperand(Num: 1);
17611 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
17612 !ISD::isBuildVectorAllOnes(N: Ones.getNode()))
17613 return SDValue();
17614
17615 // The shift should be smearing the sign bit across each vector element.
17616 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1));
17617 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
17618 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
17619 return SDValue();
17620
17621 return DAG.getNode(Opcode: AArch64ISD::CMGEz, DL: SDLoc(N), VT, Operand: Shift.getOperand(i: 0));
17622}
17623
17624// Given a vecreduce_add node, detect the below pattern and convert it to the
17625 // node sequence with UABDL, [S|U]ABD and UADDLP.
17626//
17627// i32 vecreduce_add(
17628// v16i32 abs(
17629// v16i32 sub(
17630// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
17631// =================>
17632// i32 vecreduce_add(
17633// v4i32 UADDLP(
17634// v8i16 add(
17635// v8i16 zext(
17636// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
17637// v8i16 zext(
17638// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
17639static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
17640 SelectionDAG &DAG) {
17641 // Assumed i32 vecreduce_add
17642 if (N->getValueType(ResNo: 0) != MVT::i32)
17643 return SDValue();
17644
17645 SDValue VecReduceOp0 = N->getOperand(Num: 0);
17646 unsigned Opcode = VecReduceOp0.getOpcode();
17647 // Assumed v16i32 abs
17648 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(ResNo: 0) != MVT::v16i32)
17649 return SDValue();
17650
17651 SDValue ABS = VecReduceOp0;
17652 // Assumed v16i32 sub
17653 if (ABS->getOperand(Num: 0)->getOpcode() != ISD::SUB ||
17654 ABS->getOperand(Num: 0)->getValueType(ResNo: 0) != MVT::v16i32)
17655 return SDValue();
17656
17657 SDValue SUB = ABS->getOperand(Num: 0);
17658 unsigned Opcode0 = SUB->getOperand(Num: 0).getOpcode();
17659 unsigned Opcode1 = SUB->getOperand(Num: 1).getOpcode();
17660 // Assumed v16i32 type
17661 if (SUB->getOperand(Num: 0)->getValueType(ResNo: 0) != MVT::v16i32 ||
17662 SUB->getOperand(Num: 1)->getValueType(ResNo: 0) != MVT::v16i32)
17663 return SDValue();
17664
17665 // Assumed zext or sext
17666 bool IsZExt = false;
17667 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
17668 IsZExt = true;
17669 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
17670 IsZExt = false;
17671 } else
17672 return SDValue();
17673
17674 SDValue EXT0 = SUB->getOperand(Num: 0);
17675 SDValue EXT1 = SUB->getOperand(Num: 1);
17676 // Assumed zext's operand has v16i8 type
17677 if (EXT0->getOperand(Num: 0)->getValueType(ResNo: 0) != MVT::v16i8 ||
17678 EXT1->getOperand(Num: 0)->getValueType(ResNo: 0) != MVT::v16i8)
17679 return SDValue();
17680
17681 // Pattern is detected. Convert it to a sequence of nodes.
17682 SDLoc DL(N);
17683
17684 // First, create the node pattern of UABD/SABD.
17685 SDValue UABDHigh8Op0 =
17686 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: EXT0->getOperand(Num: 0),
17687 N2: DAG.getConstant(Val: 8, DL, VT: MVT::i64));
17688 SDValue UABDHigh8Op1 =
17689 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: EXT1->getOperand(Num: 0),
17690 N2: DAG.getConstant(Val: 8, DL, VT: MVT::i64));
17691 SDValue UABDHigh8 = DAG.getNode(Opcode: IsZExt ? ISD::ABDU : ISD::ABDS, DL, VT: MVT::v8i8,
17692 N1: UABDHigh8Op0, N2: UABDHigh8Op1);
17693 SDValue UABDL = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::v8i16, Operand: UABDHigh8);
17694
17695 // Second, create the node pattern of UABAL.
17696 SDValue UABDLo8Op0 =
17697 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: EXT0->getOperand(Num: 0),
17698 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
17699 SDValue UABDLo8Op1 =
17700 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: EXT1->getOperand(Num: 0),
17701 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
17702 SDValue UABDLo8 = DAG.getNode(Opcode: IsZExt ? ISD::ABDU : ISD::ABDS, DL, VT: MVT::v8i8,
17703 N1: UABDLo8Op0, N2: UABDLo8Op1);
17704 SDValue ZExtUABD = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::v8i16, Operand: UABDLo8);
17705 SDValue UABAL = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::v8i16, N1: UABDL, N2: ZExtUABD);
17706
17707 // Third, create the node of UADDLP.
17708 SDValue UADDLP = DAG.getNode(Opcode: AArch64ISD::UADDLP, DL, VT: MVT::v4i32, Operand: UABAL);
17709
17710 // Fourth, create the node of VECREDUCE_ADD.
17711 return DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: MVT::i32, Operand: UADDLP);
17712}
17713
17714 // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce:
17715 //   vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
17716 //   vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
17717 // If we have vectors larger than v16i8 we extract v16i8 subvectors, follow
17718 // the same steps above to get DOT instructions, concatenate them, and
17719 // generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
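// For example, a vecreduce.add of a zext from v32i8 is split into two v16i8
// UDOTs whose v4i32 results are concatenated and reduced with a single
// vecreduce.add.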
17720static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
17721 const AArch64Subtarget *ST) {
17722 if (!ST->isNeonAvailable())
17723 return SDValue();
17724
17725 if (!ST->hasDotProd())
17726 return performVecReduceAddCombineWithUADDLP(N, DAG);
17727
17728 SDValue Op0 = N->getOperand(Num: 0);
17729 if (N->getValueType(ResNo: 0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
17730 Op0.getValueType().getVectorElementType() != MVT::i32)
17731 return SDValue();
17732
17733 unsigned ExtOpcode = Op0.getOpcode();
17734 SDValue A = Op0;
17735 SDValue B;
17736 if (ExtOpcode == ISD::MUL) {
17737 A = Op0.getOperand(i: 0);
17738 B = Op0.getOperand(i: 1);
17739 if (A.getOpcode() != B.getOpcode() ||
17740 A.getOperand(i: 0).getValueType() != B.getOperand(i: 0).getValueType())
17741 return SDValue();
17742 ExtOpcode = A.getOpcode();
17743 }
17744 if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
17745 return SDValue();
17746
17747 EVT Op0VT = A.getOperand(i: 0).getValueType();
17748 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
17749 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
17750 if (!IsValidElementCount || !IsValidSize)
17751 return SDValue();
17752
17753 SDLoc DL(Op0);
17754 // For non-mla reductions B can be set to 1. For MLA we take the operand of
17755 // the extend B.
17756 if (!B)
17757 B = DAG.getConstant(Val: 1, DL, VT: Op0VT);
17758 else
17759 B = B.getOperand(i: 0);
17760
17761 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
17762 unsigned NumOfVecReduce;
17763 EVT TargetType;
17764 if (IsMultipleOf16) {
17765 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
17766 TargetType = MVT::v4i32;
17767 } else {
17768 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
17769 TargetType = MVT::v2i32;
17770 }
17771 auto DotOpcode =
17772 (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
17773 // Handle the case where we need to generate only one Dot operation.
17774 if (NumOfVecReduce == 1) {
17775 SDValue Zeros = DAG.getConstant(Val: 0, DL, VT: TargetType);
17776 SDValue Dot = DAG.getNode(Opcode: DotOpcode, DL, VT: Zeros.getValueType(), N1: Zeros,
17777 N2: A.getOperand(i: 0), N3: B);
17778 return DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: N->getValueType(ResNo: 0), Operand: Dot);
17779 }
17780 // Generate Dot instructions that are multiple of 16.
17781 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
17782 SmallVector<SDValue, 4> SDotVec16;
17783 unsigned I = 0;
17784 for (; I < VecReduce16Num; I += 1) {
17785 SDValue Zeros = DAG.getConstant(Val: 0, DL, VT: MVT::v4i32);
17786 SDValue Op0 =
17787 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v16i8, N1: A.getOperand(i: 0),
17788 N2: DAG.getConstant(Val: I * 16, DL, VT: MVT::i64));
17789 SDValue Op1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v16i8, N1: B,
17790 N2: DAG.getConstant(Val: I * 16, DL, VT: MVT::i64));
17791 SDValue Dot =
17792 DAG.getNode(Opcode: DotOpcode, DL, VT: Zeros.getValueType(), N1: Zeros, N2: Op0, N3: Op1);
17793 SDotVec16.push_back(Elt: Dot);
17794 }
17795 // Concatenate dot operations.
17796 EVT SDot16EVT =
17797 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: 4 * VecReduce16Num);
17798 SDValue ConcatSDot16 =
17799 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: SDot16EVT, Ops: SDotVec16);
17800 SDValue VecReduceAdd16 =
17801 DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: N->getValueType(ResNo: 0), Operand: ConcatSDot16);
17802 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
17803 if (VecReduce8Num == 0)
17804 return VecReduceAdd16;
17805
17806 // Generate the remainder Dot operation that is multiple of 8.
17807 SmallVector<SDValue, 4> SDotVec8;
17808 SDValue Zeros = DAG.getConstant(Val: 0, DL, VT: MVT::v2i32);
17809 SDValue Vec8Op0 =
17810 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: A.getOperand(i: 0),
17811 N2: DAG.getConstant(Val: I * 16, DL, VT: MVT::i64));
17812 SDValue Vec8Op1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: B,
17813 N2: DAG.getConstant(Val: I * 16, DL, VT: MVT::i64));
17814 SDValue Dot =
17815 DAG.getNode(Opcode: DotOpcode, DL, VT: Zeros.getValueType(), N1: Zeros, N2: Vec8Op0, N3: Vec8Op1);
17816 SDValue VecReduceAdd8 =
17817 DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: N->getValueType(ResNo: 0), Operand: Dot);
17818 return DAG.getNode(Opcode: ISD::ADD, DL, VT: N->getValueType(ResNo: 0), N1: VecReduceAdd16,
17819 N2: VecReduceAdd8);
17820}
17821
17822// Given an (integer) vecreduce, we know the order of the inputs does not
17823// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
17824// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
17825// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
17826static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
17827 auto DetectAddExtract = [&](SDValue A) {
17828 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
17829 // UADDLP(x) if found.
17830 assert(A.getOpcode() == ISD::ADD);
17831 EVT VT = A.getValueType();
17832 SDValue Op0 = A.getOperand(i: 0);
17833 SDValue Op1 = A.getOperand(i: 1);
17834 if (Op0.getOpcode() != Op1.getOpcode() ||
17835 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
17836 Op0.getOpcode() != ISD::SIGN_EXTEND))
17837 return SDValue();
17838 SDValue Ext0 = Op0.getOperand(i: 0);
17839 SDValue Ext1 = Op1.getOperand(i: 0);
17840 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17841 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17842 Ext0.getOperand(i: 0) != Ext1.getOperand(i: 0))
17843 return SDValue();
17844 // Check that the type is twice the add type, and that the extracts are from
17845 // the upper/lower halves of the same source.
17846 if (Ext0.getOperand(i: 0).getValueType().getVectorNumElements() !=
17847 VT.getVectorNumElements() * 2)
17848 return SDValue();
17849 if ((Ext0.getConstantOperandVal(i: 1) != 0 ||
17850 Ext1.getConstantOperandVal(i: 1) != VT.getVectorNumElements()) &&
17851 (Ext1.getConstantOperandVal(i: 1) != 0 ||
17852 Ext0.getConstantOperandVal(i: 1) != VT.getVectorNumElements()))
17853 return SDValue();
17854 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
17855 : AArch64ISD::SADDLP;
17856 return DAG.getNode(Opcode, DL: SDLoc(A), VT, Operand: Ext0.getOperand(i: 0));
17857 };
17858
17859 if (SDValue R = DetectAddExtract(A))
17860 return R;
17861
17862 if (A.getOperand(i: 0).getOpcode() == ISD::ADD && A.getOperand(i: 0).hasOneUse())
17863 if (SDValue R = performUADDVAddCombine(A: A.getOperand(i: 0), DAG))
17864 return DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(A), VT: A.getValueType(), N1: R,
17865 N2: A.getOperand(i: 1));
17866 if (A.getOperand(i: 1).getOpcode() == ISD::ADD && A.getOperand(i: 1).hasOneUse())
17867 if (SDValue R = performUADDVAddCombine(A: A.getOperand(i: 1), DAG))
17868 return DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(A), VT: A.getValueType(), N1: R,
17869 N2: A.getOperand(i: 0));
17870 return SDValue();
17871}
17872
17873// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
17874// UADDLV(concat), where the concat represents the 64-bit zext sources.
17875static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
17876 // Look for add(zext(64-bit source), zext(64-bit source)), returning
17877 // UADDLV(concat(zext, zext)) if found.
17878 assert(A.getOpcode() == ISD::ADD);
17879 EVT VT = A.getValueType();
17880 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17881 return SDValue();
17882 SDValue Op0 = A.getOperand(i: 0);
17883 SDValue Op1 = A.getOperand(i: 1);
17884 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
17885 return SDValue();
17886 SDValue Ext0 = Op0.getOperand(i: 0);
17887 SDValue Ext1 = Op1.getOperand(i: 0);
17888 EVT ExtVT0 = Ext0.getValueType();
17889 EVT ExtVT1 = Ext1.getValueType();
17890 // Check zext VTs are the same and 64-bit length.
17891 if (ExtVT0 != ExtVT1 ||
17892 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
17893 return SDValue();
17894 // Get VT for concat of zext sources.
17895 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
17896 SDValue Concat =
17897 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(A), VT: PairVT, N1: Ext0, N2: Ext1);
17898
17899 switch (VT.getSimpleVT().SimpleTy) {
17900 case MVT::v2i64:
17901 case MVT::v4i32:
17902 return DAG.getNode(Opcode: AArch64ISD::UADDLV, DL: SDLoc(A), VT, Operand: Concat);
17903 case MVT::v8i16: {
17904 SDValue Uaddlv =
17905 DAG.getNode(Opcode: AArch64ISD::UADDLV, DL: SDLoc(A), VT: MVT::v4i32, Operand: Concat);
17906 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: SDLoc(A), VT: MVT::v8i16, Operand: Uaddlv);
17907 }
17908 default:
17909 llvm_unreachable("Unhandled vector type");
17910 }
17911}
17912
17913static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
17914 SDValue A = N->getOperand(Num: 0);
17915 if (A.getOpcode() == ISD::ADD) {
17916 if (SDValue R = performUADDVAddCombine(A, DAG))
17917 return DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: R);
17918 else if (SDValue R = performUADDVZextCombine(A, DAG))
17919 return R;
17920 }
17921 return SDValue();
17922}
17923
17924static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
17925 TargetLowering::DAGCombinerInfo &DCI,
17926 const AArch64Subtarget *Subtarget) {
17927 if (DCI.isBeforeLegalizeOps())
17928 return SDValue();
17929
17930 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
17931}
17932
17933SDValue
17934AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
17935 SelectionDAG &DAG,
17936 SmallVectorImpl<SDNode *> &Created) const {
17937 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17938 if (isIntDivCheap(VT: N->getValueType(ResNo: 0), Attr))
17939 return SDValue(N, 0); // Lower SDIV as SDIV
17940
17941 EVT VT = N->getValueType(ResNo: 0);
17942
17943 // For scalable and fixed types, mark them as cheap so we can handle them much
17944 // later. This allows us to handle larger-than-legal types.
17945 if (VT.isScalableVector() ||
17946 (VT.isFixedLengthVector() && Subtarget->useSVEForFixedLengthVectors()))
17947 return SDValue(N, 0);
17948
17949 // fold (sdiv X, pow2)
17950 if ((VT != MVT::i32 && VT != MVT::i64) ||
17951 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17952 return SDValue();
17953
17954 // If the divisor is 2 or -2, the default expansion is better. It will add
17955 // (N->getValueType(0) >> (BitWidth - 1)) to it before shifting right.
17956 if (Divisor == 2 ||
17957 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
17958 return SDValue();
17959
17960 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
17961}
17962
17963SDValue
17964AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
17965 SelectionDAG &DAG,
17966 SmallVectorImpl<SDNode *> &Created) const {
17967 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17968 if (isIntDivCheap(VT: N->getValueType(ResNo: 0), Attr))
17969 return SDValue(N, 0); // Lower SREM as SREM
17970
17971 EVT VT = N->getValueType(ResNo: 0);
17972
17973 // For scalable and fixed types, mark them as cheap so we can handle them much
17974 // later. This allows us to handle larger-than-legal types.
17975 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17976 return SDValue(N, 0);
17977
17978 // fold (srem X, pow2)
17979 if ((VT != MVT::i32 && VT != MVT::i64) ||
17980 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17981 return SDValue();
17982
17983 unsigned Lg2 = Divisor.countr_zero();
17984 if (Lg2 == 0)
17985 return SDValue();
17986
17987 SDLoc DL(N);
17988 SDValue N0 = N->getOperand(Num: 0);
17989 SDValue Pow2MinusOne = DAG.getConstant(Val: (1ULL << Lg2) - 1, DL, VT);
17990 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
17991 SDValue CCVal, CSNeg;
17992 if (Lg2 == 1) {
17993 SDValue Cmp = getAArch64Cmp(LHS: N0, RHS: Zero, CC: ISD::SETGE, AArch64cc&: CCVal, DAG, dl: DL);
17994 SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: N0, N2: Pow2MinusOne);
17995 CSNeg = DAG.getNode(Opcode: AArch64ISD::CSNEG, DL, VT, N1: And, N2: And, N3: CCVal, N4: Cmp);
17996
17997 Created.push_back(Elt: Cmp.getNode());
17998 Created.push_back(Elt: And.getNode());
17999 } else {
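    // For example, (srem x, 8) is built as the equivalent of (register names
    // are illustrative):
    //   negs  t, x           // t = -x, setting flags
    //   and   p, x, #7       // remainder if x is non-negative
    //   and   n, t, #7       // remainder magnitude if x is negative
    //   csneg r, p, n, mi    // select p, or negate n, based on the sign of x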
18000 SDValue CCVal = DAG.getConstant(Val: AArch64CC::MI, DL, VT: MVT_CC);
18001 SDVTList VTs = DAG.getVTList(VT1: VT, VT2: MVT::i32);
18002
18003 SDValue Negs = DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs, N1: Zero, N2: N0);
18004 SDValue AndPos = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: N0, N2: Pow2MinusOne);
18005 SDValue AndNeg = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Negs, N2: Pow2MinusOne);
18006 CSNeg = DAG.getNode(Opcode: AArch64ISD::CSNEG, DL, VT, N1: AndPos, N2: AndNeg, N3: CCVal,
18007 N4: Negs.getValue(R: 1));
18008
18009 Created.push_back(Elt: Negs.getNode());
18010 Created.push_back(Elt: AndPos.getNode());
18011 Created.push_back(Elt: AndNeg.getNode());
18012 }
18013
18014 return CSNeg;
18015}
18016
18017static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
18018 switch(getIntrinsicID(N: S.getNode())) {
18019 default:
18020 break;
18021 case Intrinsic::aarch64_sve_cntb:
18022 return 8;
18023 case Intrinsic::aarch64_sve_cnth:
18024 return 16;
18025 case Intrinsic::aarch64_sve_cntw:
18026 return 32;
18027 case Intrinsic::aarch64_sve_cntd:
18028 return 64;
18029 }
18030 return {};
18031}
18032
18033/// Calculates what the pre-extend type is, based on the extension
18034/// operation node provided by \p Extend.
18035///
18036/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
18037/// pre-extend type is pulled directly from the operand, while other extend
18038/// operations need a bit more inspection to get this information.
18039///
18040/// \param Extend The SDNode from the DAG that represents the extend operation
18041///
18042/// \returns The type representing the \p Extend source type, or \p MVT::Other
18043/// if no valid type can be determined
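/// For example, both (sign_extend_inreg x, i8) and (and x, 255) report a
/// pre-extend type of MVT::i8.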
18044static EVT calculatePreExtendType(SDValue Extend) {
18045 switch (Extend.getOpcode()) {
18046 case ISD::SIGN_EXTEND:
18047 case ISD::ZERO_EXTEND:
18048 return Extend.getOperand(i: 0).getValueType();
18049 case ISD::AssertSext:
18050 case ISD::AssertZext:
18051 case ISD::SIGN_EXTEND_INREG: {
18052 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Val: Extend.getOperand(i: 1));
18053 if (!TypeNode)
18054 return MVT::Other;
18055 return TypeNode->getVT();
18056 }
18057 case ISD::AND: {
18058 ConstantSDNode *Constant =
18059 dyn_cast<ConstantSDNode>(Val: Extend.getOperand(i: 1).getNode());
18060 if (!Constant)
18061 return MVT::Other;
18062
18063 uint32_t Mask = Constant->getZExtValue();
18064
18065 if (Mask == UCHAR_MAX)
18066 return MVT::i8;
18067 else if (Mask == USHRT_MAX)
18068 return MVT::i16;
18069 else if (Mask == UINT_MAX)
18070 return MVT::i32;
18071
18072 return MVT::Other;
18073 }
18074 default:
18075 return MVT::Other;
18076 }
18077}
18078
18079/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
18080/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
18081/// SExt/ZExt rather than the scalar SExt/ZExt
18082static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
18083 EVT VT = BV.getValueType();
18084 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
18085 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
18086 return SDValue();
18087
18088 // Use the first item in the buildvector/shuffle to get the size of the
18089 // extend, and make sure it looks valid.
18090 SDValue Extend = BV->getOperand(Num: 0);
18091 unsigned ExtendOpcode = Extend.getOpcode();
18092 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
18093 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
18094 ExtendOpcode == ISD::AssertSext;
18095 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
18096 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
18097 return SDValue();
18098 // Shuffle inputs are vectors; limit to SIGN_EXTEND and ZERO_EXTEND to ensure
18099 // calculatePreExtendType will work without issue.
18100 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
18101 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
18102 return SDValue();
18103
18104 // Restrict valid pre-extend data type
18105 EVT PreExtendType = calculatePreExtendType(Extend);
18106 if (PreExtendType == MVT::Other ||
18107 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
18108 return SDValue();
18109
18110 // Make sure all other operands are equally extended
18111 for (SDValue Op : drop_begin(RangeOrContainer: BV->ops())) {
18112 if (Op.isUndef())
18113 continue;
18114 unsigned Opc = Op.getOpcode();
18115 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
18116 Opc == ISD::AssertSext;
18117 if (OpcIsSExt != IsSExt || calculatePreExtendType(Extend: Op) != PreExtendType)
18118 return SDValue();
18119 }
18120
18121 SDValue NBV;
18122 SDLoc DL(BV);
18123 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
18124 EVT PreExtendVT = VT.changeVectorElementType(EltVT: PreExtendType);
18125 EVT PreExtendLegalType =
18126 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
18127 SmallVector<SDValue, 8> NewOps;
18128 for (SDValue Op : BV->ops())
18129 NewOps.push_back(Elt: Op.isUndef() ? DAG.getUNDEF(VT: PreExtendLegalType)
18130 : DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 0), DL,
18131 VT: PreExtendLegalType));
18132 NBV = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: PreExtendVT, Ops: NewOps);
18133 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
18134 EVT PreExtendVT = VT.changeVectorElementType(EltVT: PreExtendType.getScalarType());
18135 NBV = DAG.getVectorShuffle(VT: PreExtendVT, dl: DL, N1: BV.getOperand(i: 0).getOperand(i: 0),
18136 N2: BV.getOperand(i: 1).isUndef()
18137 ? DAG.getUNDEF(VT: PreExtendVT)
18138 : BV.getOperand(i: 1).getOperand(i: 0),
18139 Mask: cast<ShuffleVectorSDNode>(Val&: BV)->getMask());
18140 }
18141 return DAG.getNode(Opcode: IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, Operand: NBV);
18142}
18143
18144/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
18145/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
18146static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
18147 // If the value type isn't a vector, none of the operands are going to be dups
18148 EVT VT = Mul->getValueType(ResNo: 0);
18149 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
18150 return SDValue();
18151
18152 SDValue Op0 = performBuildShuffleExtendCombine(BV: Mul->getOperand(Num: 0), DAG);
18153 SDValue Op1 = performBuildShuffleExtendCombine(BV: Mul->getOperand(Num: 1), DAG);
18154
18155 // Neither operand has been changed; don't make any further changes.
18156 if (!Op0 && !Op1)
18157 return SDValue();
18158
18159 SDLoc DL(Mul);
18160 return DAG.getNode(Opcode: Mul->getOpcode(), DL, VT, N1: Op0 ? Op0 : Mul->getOperand(Num: 0),
18161 N2: Op1 ? Op1 : Mul->getOperand(Num: 1));
18162}
18163
18164// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
18165// Same for other types with equivalent constants.
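// This works because the srl/and pair isolates the sign bit of each narrow
// half and the multiply by the half-width mask smears it across that half,
// which is exactly what a signed compare-less-than-zero on the narrower
// element type produces.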
18166static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
18167 EVT VT = N->getValueType(ResNo: 0);
18168 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
18169 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
18170 return SDValue();
18171 if (N->getOperand(Num: 0).getOpcode() != ISD::AND ||
18172 N->getOperand(Num: 0).getOperand(i: 0).getOpcode() != ISD::SRL)
18173 return SDValue();
18174
18175 SDValue And = N->getOperand(Num: 0);
18176 SDValue Srl = And.getOperand(i: 0);
18177
18178 APInt V1, V2, V3;
18179 if (!ISD::isConstantSplatVector(N: N->getOperand(Num: 1).getNode(), SplatValue&: V1) ||
18180 !ISD::isConstantSplatVector(N: And.getOperand(i: 1).getNode(), SplatValue&: V2) ||
18181 !ISD::isConstantSplatVector(N: Srl.getOperand(i: 1).getNode(), SplatValue&: V3))
18182 return SDValue();
18183
18184 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
18185 if (!V1.isMask(numBits: HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
18186 V3 != (HalfSize - 1))
18187 return SDValue();
18188
18189 EVT HalfVT = EVT::getVectorVT(Context&: *DAG.getContext(),
18190 VT: EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: HalfSize),
18191 EC: VT.getVectorElementCount() * 2);
18192
18193 SDLoc DL(N);
18194 SDValue In = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: HalfVT, Operand: Srl.getOperand(i: 0));
18195 SDValue CM = DAG.getNode(Opcode: AArch64ISD::CMLTz, DL, VT: HalfVT, Operand: In);
18196 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: CM);
18197}
18198
18199// Transform vector add(zext i8 to i32, zext i8 to i32)
18200// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
18201 // This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
18202 // extends.
18203static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) {
18204 EVT VT = N->getValueType(ResNo: 0);
18205 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
18206 (N->getOperand(Num: 0).getOpcode() != ISD::ZERO_EXTEND &&
18207 N->getOperand(Num: 0).getOpcode() != ISD::SIGN_EXTEND) ||
18208 (N->getOperand(Num: 1).getOpcode() != ISD::ZERO_EXTEND &&
18209 N->getOperand(Num: 1).getOpcode() != ISD::SIGN_EXTEND) ||
18210 N->getOperand(Num: 0).getOperand(i: 0).getValueType() !=
18211 N->getOperand(Num: 1).getOperand(i: 0).getValueType())
18212 return SDValue();
18213
18214 if (N->getOpcode() == ISD::MUL &&
18215 N->getOperand(Num: 0).getOpcode() != N->getOperand(Num: 1).getOpcode())
18216 return SDValue();
18217
18218 SDValue N0 = N->getOperand(Num: 0).getOperand(i: 0);
18219 SDValue N1 = N->getOperand(Num: 1).getOperand(i: 0);
18220 EVT InVT = N0.getValueType();
18221
18222 EVT S1 = InVT.getScalarType();
18223 EVT S2 = VT.getScalarType();
18224 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
18225 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
18226 SDLoc DL(N);
18227 EVT HalfVT = EVT::getVectorVT(Context&: *DAG.getContext(),
18228 VT: S2.getHalfSizedIntegerVT(Context&: *DAG.getContext()),
18229 EC: VT.getVectorElementCount());
18230 SDValue NewN0 = DAG.getNode(Opcode: N->getOperand(Num: 0).getOpcode(), DL, VT: HalfVT, Operand: N0);
18231 SDValue NewN1 = DAG.getNode(Opcode: N->getOperand(Num: 1).getOpcode(), DL, VT: HalfVT, Operand: N1);
18232 SDValue NewOp = DAG.getNode(Opcode: N->getOpcode(), DL, VT: HalfVT, N1: NewN0, N2: NewN1);
18233 return DAG.getNode(Opcode: N->getOpcode() == ISD::MUL ? N->getOperand(Num: 0).getOpcode()
18234 : (unsigned)ISD::SIGN_EXTEND,
18235 DL, VT, Operand: NewOp);
18236 }
18237 return SDValue();
18238}
18239
18240static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
18241 TargetLowering::DAGCombinerInfo &DCI,
18242 const AArch64Subtarget *Subtarget) {
18243
18244 if (SDValue Ext = performMulVectorExtendCombine(Mul: N, DAG))
18245 return Ext;
18246 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
18247 return Ext;
18248 if (SDValue Ext = performVectorExtCombine(N, DAG))
18249 return Ext;
18250
18251 if (DCI.isBeforeLegalizeOps())
18252 return SDValue();
18253
18254 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
18255 // and in MachineCombiner pass, add+mul will be combined into madd.
18256 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
18257 SDLoc DL(N);
18258 EVT VT = N->getValueType(ResNo: 0);
18259 SDValue N0 = N->getOperand(Num: 0);
18260 SDValue N1 = N->getOperand(Num: 1);
18261 SDValue MulOper;
18262 unsigned AddSubOpc;
18263
18264 auto IsAddSubWith1 = [&](SDValue V) -> bool {
18265 AddSubOpc = V->getOpcode();
18266 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
18267 SDValue Opnd = V->getOperand(Num: 1);
18268 MulOper = V->getOperand(Num: 0);
18269 if (AddSubOpc == ISD::SUB)
18270 std::swap(a&: Opnd, b&: MulOper);
18271 if (auto C = dyn_cast<ConstantSDNode>(Val&: Opnd))
18272 return C->isOne();
18273 }
18274 return false;
18275 };
18276
18277 if (IsAddSubWith1(N0)) {
18278 SDValue MulVal = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1, N2: MulOper);
18279 return DAG.getNode(Opcode: AddSubOpc, DL, VT, N1, N2: MulVal);
18280 }
18281
18282 if (IsAddSubWith1(N1)) {
18283 SDValue MulVal = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: N0, N2: MulOper);
18284 return DAG.getNode(Opcode: AddSubOpc, DL, VT, N1: N0, N2: MulVal);
18285 }
18286
18287 // The below optimizations require a constant RHS.
18288 if (!isa<ConstantSDNode>(Val: N1))
18289 return SDValue();
18290
18291 ConstantSDNode *C = cast<ConstantSDNode>(Val&: N1);
18292 const APInt &ConstValue = C->getAPIntValue();
18293
18294 // Allow the scaling to be folded into the `cnt` instruction by preventing
18295 // the scaling from being obscured here. This makes it easier to pattern match.
18296 if (IsSVECntIntrinsic(S: N0) ||
18297 (N0->getOpcode() == ISD::TRUNCATE &&
18298 (IsSVECntIntrinsic(S: N0->getOperand(Num: 0)))))
18299 if (ConstValue.sge(RHS: 1) && ConstValue.sle(RHS: 16))
18300 return SDValue();
18301
18302 // Multiplication of a power of two plus/minus one can be done more
18303 // cheaply as shift+add/sub. For now, this is true unilaterally. If
18304 // future CPUs have a cheaper MADD instruction, this may need to be
18305 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
18306 // 64-bit is 5 cycles, so this is always a win.
18307 // More aggressively, some multiplications N0 * C can be lowered to
18308 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
18309 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
18310 // TODO: lower more cases.
18311
18312 // TrailingZeroes is used to test if the mul can be lowered to
18313 // shift+add+shift.
18314 unsigned TrailingZeroes = ConstValue.countr_zero();
18315 if (TrailingZeroes) {
18316 // Conservatively do not lower to shift+add+shift if the mul might be
18317 // folded into smul or umul.
18318 if (N0->hasOneUse() && (isSignExtended(N: N0, DAG) ||
18319 isZeroExtended(N: N0, DAG)))
18320 return SDValue();
18321 // Conservatively do not lower to shift+add+shift if the mul might be
18322 // folded into madd or msub.
18323 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
18324 N->use_begin()->getOpcode() == ISD::SUB))
18325 return SDValue();
18326 }
18327 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
18328 // and shift+add+shift.
18329 APInt ShiftedConstValue = ConstValue.ashr(ShiftAmt: TrailingZeroes);
18330 unsigned ShiftAmt;
18331
18332 auto Shl = [&](SDValue N0, unsigned N1) {
18333 if (!N0.getNode())
18334 return SDValue();
18335 // If shift causes overflow, ignore this combine.
18336 if (N1 >= N0.getValueSizeInBits())
18337 return SDValue();
18338 SDValue RHS = DAG.getConstant(Val: N1, DL, VT: MVT::i64);
18339 return DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: N0, N2: RHS);
18340 };
18341 auto Add = [&](SDValue N0, SDValue N1) {
18342 if (!N0.getNode() || !N1.getNode())
18343 return SDValue();
18344 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: N0, N2: N1);
18345 };
18346 auto Sub = [&](SDValue N0, SDValue N1) {
18347 if (!N0.getNode() || !N1.getNode())
18348 return SDValue();
18349 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: N0, N2: N1);
18350 };
18351 auto Negate = [&](SDValue N) {
18352 if (!N0.getNode())
18353 return SDValue();
18354 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
18355 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero, N2: N);
18356 };
18357
18358 // Can the constant C be decomposed into (1 + 2^M) * (1 + 2^N)? E.g.,
18359 // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1) as
18360 // the (2^N - 1) factor can't be executed with a single instruction.
18361 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
18362 unsigned BitWidth = C.getBitWidth();
18363 for (unsigned i = 1; i < BitWidth / 2; i++) {
18364 APInt Rem;
18365 APInt X(BitWidth, (1 << i) + 1);
18366 APInt::sdivrem(LHS: C, RHS: X, Quotient&: N, Remainder&: Rem);
18367 APInt NVMinus1 = N - 1;
18368 if (Rem == 0 && NVMinus1.isPowerOf2()) {
18369 M = X;
18370 return true;
18371 }
18372 }
18373 return false;
18374 };
18375
18376 // Can the constant C be decomposed into ((2^M + 1) * 2^N + 1)? E.g.,
18377 // C = 11 is equal to (1+4)*2+1; we don't decompose it into (1+2)*4-1 as
18378 // the (2^N - 1) factor can't be executed with a single instruction.
18379 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
18380 APInt CVMinus1 = C - 1;
18381 if (CVMinus1.isNegative())
18382 return false;
18383 unsigned TrailingZeroes = CVMinus1.countr_zero();
18384 APInt SCVMinus1 = CVMinus1.ashr(ShiftAmt: TrailingZeroes) - 1;
18385 if (SCVMinus1.isPowerOf2()) {
18386 unsigned BitWidth = SCVMinus1.getBitWidth();
18387 M = APInt(BitWidth, SCVMinus1.logBase2());
18388 N = APInt(BitWidth, TrailingZeroes);
18389 return true;
18390 }
18391 return false;
18392 };
18393
18394 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
18395 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
18396 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
18397 APInt CVMinus1 = C - 1;
18398 if (CVMinus1.isNegative())
18399 return false;
18400 unsigned TrailingZeroes = CVMinus1.countr_zero();
18401 APInt CVPlus1 = CVMinus1.ashr(ShiftAmt: TrailingZeroes) + 1;
18402 if (CVPlus1.isPowerOf2()) {
18403 unsigned BitWidth = CVPlus1.getBitWidth();
18404 M = APInt(BitWidth, CVPlus1.logBase2());
18405 N = APInt(BitWidth, TrailingZeroes);
18406 return true;
18407 }
18408 return false;
18409 };
18410
18411 if (ConstValue.isNonNegative()) {
18412 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
18413 // (mul x, 2^N - 1) => (sub (shl x, N), x)
18414 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
18415 // (mul x, (2^M + 1) * (2^N + 1))
18416 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
18417 // (mul x, (2^M + 1) * 2^N + 1)
18418 //     => MV = (add (shl x, M), x); (add (shl MV, N), x)
18419 // (mul x, 1 - (1 - 2^M) * 2^N)
18420 //     => MV = (sub x, (shl x, M)); (sub x, (shl MV, N))
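    // For example, on subtargets with ALULSLFast, x * 45 (M == 2, N == 3 in
    // the (2^M + 1) * (2^N + 1) form) lowers to a sequence equivalent to
    // (register names are illustrative):
    //   add t, x, x, lsl #2   // t = 5 * x
    //   add r, t, t, lsl #3   // r = 45 * x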
18421 APInt SCVMinus1 = ShiftedConstValue - 1;
18422 APInt SCVPlus1 = ShiftedConstValue + 1;
18423 APInt CVPlus1 = ConstValue + 1;
18424 APInt CVM, CVN;
18425 if (SCVMinus1.isPowerOf2()) {
18426 ShiftAmt = SCVMinus1.logBase2();
18427 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
18428 } else if (CVPlus1.isPowerOf2()) {
18429 ShiftAmt = CVPlus1.logBase2();
18430 return Sub(Shl(N0, ShiftAmt), N0);
18431 } else if (SCVPlus1.isPowerOf2()) {
18432 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
18433 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
18434 }
18435 if (Subtarget->hasALULSLFast() &&
18436 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
18437 APInt CVMMinus1 = CVM - 1;
18438 APInt CVNMinus1 = CVN - 1;
18439 unsigned ShiftM1 = CVMMinus1.logBase2();
18440 unsigned ShiftN1 = CVNMinus1.logBase2();
18441 // ALULSLFast implies that shifts of at most 4 places are fast
18442 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
18443 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
18444 return Add(Shl(MVal, ShiftN1), MVal);
18445 }
18446 }
18447 if (Subtarget->hasALULSLFast() &&
18448 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
18449 unsigned ShiftM = CVM.getZExtValue();
18450 unsigned ShiftN = CVN.getZExtValue();
18451 // ALULSLFast implies that shifts of at most 4 places are fast
18452 if (ShiftM <= 4 && ShiftN <= 4) {
18453 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
18454 return Add(Shl(MVal, CVN.getZExtValue()), N0);
18455 }
18456 }
18457
18458 if (Subtarget->hasALULSLFast() &&
18459 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
18460 unsigned ShiftM = CVM.getZExtValue();
18461 unsigned ShiftN = CVN.getZExtValue();
18462 // ALULSLFast implies that shifts of at most 4 places are fast
18463 if (ShiftM <= 4 && ShiftN <= 4) {
18464 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
18465 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
18466 }
18467 }
18468 } else {
18469 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
18470 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
18471 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
18472 APInt SCVPlus1 = -ShiftedConstValue + 1;
18473 APInt CVNegPlus1 = -ConstValue + 1;
18474 APInt CVNegMinus1 = -ConstValue - 1;
18475 if (CVNegPlus1.isPowerOf2()) {
18476 ShiftAmt = CVNegPlus1.logBase2();
18477 return Sub(N0, Shl(N0, ShiftAmt));
18478 } else if (CVNegMinus1.isPowerOf2()) {
18479 ShiftAmt = CVNegMinus1.logBase2();
18480 return Negate(Add(Shl(N0, ShiftAmt), N0));
18481 } else if (SCVPlus1.isPowerOf2()) {
18482 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
18483 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
18484 }
18485 }
18486
18487 return SDValue();
18488}
18489
18490static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
18491 SelectionDAG &DAG) {
18492 // Take advantage of vector comparisons producing 0 or -1 in each lane to
18493 // optimize away operation when it's from a constant.
18494 //
18495 // The general transformation is:
18496 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
18497 // AND(VECTOR_CMP(x,y), constant2)
18498 // constant2 = UNARYOP(constant)
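  // Illustrative example, for the int-to-FP conversions this is called for:
  //   (sint_to_fp (and (setcc x, y), (build_vector i32 1, 1, 1, 1)))
  // becomes
  //   (bitcast (and (setcc x, y), (bitcast (build_vector f32 1.0, ...))))
  // which is valid because each compare lane is 0 or all-ones and
  // sint_to_fp(0) has an all-zero bit pattern.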
18499
18500 // Early exit if this isn't a vector operation, the operand of the
18501 // unary operation isn't a bitwise AND, or if the sizes of the operations
18502 // aren't the same.
18503 EVT VT = N->getValueType(ResNo: 0);
18504 if (!VT.isVector() || N->getOperand(Num: 0)->getOpcode() != ISD::AND ||
18505 N->getOperand(Num: 0)->getOperand(Num: 0)->getOpcode() != ISD::SETCC ||
18506 VT.getSizeInBits() != N->getOperand(Num: 0)->getValueType(ResNo: 0).getSizeInBits())
18507 return SDValue();
18508
18509 // Now check that the other operand of the AND is a constant. We could
18510 // make the transformation for non-constant splats as well, but it's unclear
18511 // that would be a benefit as it would not eliminate any operations, just
18512 // perform one more step in scalar code before moving to the vector unit.
18513 if (BuildVectorSDNode *BV =
18514 dyn_cast<BuildVectorSDNode>(Val: N->getOperand(Num: 0)->getOperand(Num: 1))) {
18515 // Bail out if the vector isn't a constant.
18516 if (!BV->isConstant())
18517 return SDValue();
18518
18519 // Everything checks out. Build up the new and improved node.
18520 SDLoc DL(N);
18521 EVT IntVT = BV->getValueType(ResNo: 0);
18522 // Create a new constant of the appropriate type for the transformed
18523 // DAG.
18524 SDValue SourceConst = DAG.getNode(Opcode: N->getOpcode(), DL, VT, Operand: SDValue(BV, 0));
18525 // The AND node needs bitcasts to/from an integer vector type around it.
18526 SDValue MaskConst = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntVT, Operand: SourceConst);
18527 SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT: IntVT,
18528 N1: N->getOperand(Num: 0)->getOperand(Num: 0), N2: MaskConst);
18529 SDValue Res = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewAnd);
18530 return Res;
18531 }
18532
18533 return SDValue();
18534}
18535
18536static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
18537 const AArch64Subtarget *Subtarget) {
18538 // First try to optimize away the conversion when it's conditionally from
18539 // a constant. Vectors only.
18540 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
18541 return Res;
18542
18543 EVT VT = N->getValueType(ResNo: 0);
18544 if (VT != MVT::f32 && VT != MVT::f64)
18545 return SDValue();
18546
18547 // Only optimize when the source and destination types have the same width.
18548 if (VT.getSizeInBits() != N->getOperand(Num: 0).getValueSizeInBits())
18549 return SDValue();
18550
18551 // If the result of an integer load is only used by an integer-to-float
18552  // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead.
18553 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
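  // For example, (f32 (sint_to_fp (i32 (load p)))) becomes
  // (f32 (SITOF (f32 (load p)))), keeping the value in the FP/SIMD registers.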
18554 SDValue N0 = N->getOperand(Num: 0);
18555 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N: N0.getNode()) &&
18556 N0.hasOneUse() &&
18557 // Do not change the width of a volatile load.
18558 !cast<LoadSDNode>(Val&: N0)->isVolatile()) {
18559 LoadSDNode *LN0 = cast<LoadSDNode>(Val&: N0);
18560 SDValue Load = DAG.getLoad(VT, dl: SDLoc(N), Chain: LN0->getChain(), Ptr: LN0->getBasePtr(),
18561 PtrInfo: LN0->getPointerInfo(), Alignment: LN0->getAlign(),
18562 MMOFlags: LN0->getMemOperand()->getFlags());
18563
18564 // Make sure successors of the original load stay after it by updating them
18565 // to use the new Chain.
18566 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LN0, 1), To: Load.getValue(R: 1));
18567
18568 unsigned Opcode =
18569 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
18570 return DAG.getNode(Opcode, DL: SDLoc(N), VT, Operand: Load);
18571 }
18572
18573 return SDValue();
18574}
18575
18576/// Fold a floating-point multiply by power of two into floating-point to
18577/// fixed-point conversion.
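/// For example (illustrative), (fp_to_sint (fmul x, 8.0)) can become a
/// vcvtfp2fxs conversion with 3 fractional bits (an FCVTZS with #3), since
/// multiplying by 2^3 before converting is equivalent to a fixed-point
/// conversion with 3 fractional bits.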
18578static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
18579 TargetLowering::DAGCombinerInfo &DCI,
18580 const AArch64Subtarget *Subtarget) {
18581 if (!Subtarget->isNeonAvailable())
18582 return SDValue();
18583
18584 if (!N->getValueType(ResNo: 0).isSimple())
18585 return SDValue();
18586
18587 SDValue Op = N->getOperand(Num: 0);
18588 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
18589 return SDValue();
18590
18591 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
18592 return SDValue();
18593
18594 SDValue ConstVec = Op->getOperand(Num: 1);
18595 if (!isa<BuildVectorSDNode>(Val: ConstVec))
18596 return SDValue();
18597
18598 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
18599 uint32_t FloatBits = FloatTy.getSizeInBits();
18600 if (FloatBits != 32 && FloatBits != 64 &&
18601 (FloatBits != 16 || !Subtarget->hasFullFP16()))
18602 return SDValue();
18603
18604 MVT IntTy = N->getSimpleValueType(ResNo: 0).getVectorElementType();
18605 uint32_t IntBits = IntTy.getSizeInBits();
18606 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
18607 return SDValue();
18608
18609 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
18610 if (IntBits > FloatBits)
18611 return SDValue();
18612
18613 BitVector UndefElements;
18614 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Val&: ConstVec);
18615 int32_t Bits = IntBits == 64 ? 64 : 32;
18616 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(UndefElements: &UndefElements, BitWidth: Bits + 1);
18617 if (C == -1 || C == 0 || C > Bits)
18618 return SDValue();
18619
18620 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
18621 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: ResTy))
18622 return SDValue();
18623
18624 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
18625 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
18626 EVT SatVT = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT();
18627 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
18628 return SDValue();
18629 }
18630
18631 SDLoc DL(N);
18632 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
18633 N->getOpcode() == ISD::FP_TO_SINT_SAT);
18634 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
18635 : Intrinsic::aarch64_neon_vcvtfp2fxu;
18636 SDValue FixConv =
18637 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: ResTy,
18638 N1: DAG.getConstant(Val: IntrinsicOpcode, DL, VT: MVT::i32),
18639 N2: Op->getOperand(Num: 0), N3: DAG.getConstant(Val: C, DL, VT: MVT::i32));
18640 // We can handle smaller integers by generating an extra trunc.
18641 if (IntBits < FloatBits)
18642 FixConv = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: N->getValueType(ResNo: 0), Operand: FixConv);
18643
18644 return FixConv;
18645}
18646
18647static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18648 const AArch64TargetLowering &TLI) {
18649 EVT VT = N->getValueType(ResNo: 0);
18650 SelectionDAG &DAG = DCI.DAG;
18651 SDLoc DL(N);
18652 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
18653
18654 if (!VT.isVector())
18655 return SDValue();
18656
18657 if (VT.isScalableVector() && !Subtarget.hasSVE2())
18658 return SDValue();
18659
18660 if (VT.isFixedLengthVector() &&
18661 (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT)))
18662 return SDValue();
18663
18664 SDValue N0 = N->getOperand(Num: 0);
18665 if (N0.getOpcode() != ISD::AND)
18666 return SDValue();
18667
18668 SDValue N1 = N->getOperand(Num: 1);
18669 if (N1.getOpcode() != ISD::AND)
18670 return SDValue();
18671
18672 // InstCombine does (not (neg a)) => (add a -1).
18673 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
18674 // Loop over all combinations of AND operands.
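  // This works because, in two's complement, (0 - a) and (a + (-1)) are
  // bitwise inverses of each other (~(-a) == a - 1), so the two ANDs select
  // complementary lanes and the OR is exactly a bitwise select.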
18675 for (int i = 1; i >= 0; --i) {
18676 for (int j = 1; j >= 0; --j) {
18677 SDValue O0 = N0->getOperand(Num: i);
18678 SDValue O1 = N1->getOperand(Num: j);
18679 SDValue Sub, Add, SubSibling, AddSibling;
18680
18681 // Find a SUB and an ADD operand, one from each AND.
18682 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
18683 Sub = O0;
18684 Add = O1;
18685 SubSibling = N0->getOperand(Num: 1 - i);
18686 AddSibling = N1->getOperand(Num: 1 - j);
18687 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
18688 Add = O0;
18689 Sub = O1;
18690 AddSibling = N0->getOperand(Num: 1 - i);
18691 SubSibling = N1->getOperand(Num: 1 - j);
18692 } else
18693 continue;
18694
18695 if (!ISD::isConstantSplatVectorAllZeros(N: Sub.getOperand(i: 0).getNode()))
18696 continue;
18697
18698      // The constant all-ones splat is always the right-hand operand of the Add.
18699 if (!ISD::isConstantSplatVectorAllOnes(N: Add.getOperand(i: 1).getNode()))
18700 continue;
18701
18702 if (Sub.getOperand(i: 1) != Add.getOperand(i: 0))
18703 continue;
18704
18705 return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: Sub, N2: SubSibling, N3: AddSibling);
18706 }
18707 }
18708
18709 // (or (and a b) (and (not a) c)) => (bsl a b c)
18710 // We only have to look for constant vectors here since the general, variable
18711 // case can be handled in TableGen.
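  // Illustrative example: with a splat mask of 0x00ff00ff on one AND and the
  // complementary 0xff00ff00 on the other,
  //   (or (and a, 0x00ff00ff), (and b, 0xff00ff00))
  // becomes (BSP 0x00ff00ff, a, b).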
18712 unsigned Bits = VT.getScalarSizeInBits();
18713 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
18714 for (int i = 1; i >= 0; --i)
18715 for (int j = 1; j >= 0; --j) {
18716 APInt Val1, Val2;
18717
18718 if (ISD::isConstantSplatVector(N: N0->getOperand(Num: i).getNode(), SplatValue&: Val1) &&
18719 ISD::isConstantSplatVector(N: N1->getOperand(Num: j).getNode(), SplatValue&: Val2) &&
18720 (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
18721 return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: N0->getOperand(Num: i),
18722 N2: N0->getOperand(Num: 1 - i), N3: N1->getOperand(Num: 1 - j));
18723 }
18724 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(Val: N0->getOperand(Num: i));
18725 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(Val: N1->getOperand(Num: j));
18726 if (!BVN0 || !BVN1)
18727 continue;
18728
18729 bool FoundMatch = true;
18730 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
18731 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(Val: BVN0->getOperand(Num: k));
18732 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Val: BVN1->getOperand(Num: k));
18733 if (!CN0 || !CN1 ||
18734 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
18735 FoundMatch = false;
18736 break;
18737 }
18738 }
18739 if (FoundMatch)
18740 return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: N0->getOperand(Num: i),
18741 N2: N0->getOperand(Num: 1 - i), N3: N1->getOperand(Num: 1 - j));
18742 }
18743
18744 return SDValue();
18745}
18746
18747// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
18748// convert to csel(ccmp(.., cc0)), depending on cc1:
18749
18750// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
18751// =>
18752// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
18753//
18754// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
18755// =>
18756// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
18757static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
18758 EVT VT = N->getValueType(ResNo: 0);
18759 SDValue CSel0 = N->getOperand(Num: 0);
18760 SDValue CSel1 = N->getOperand(Num: 1);
18761
18762 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
18763 CSel1.getOpcode() != AArch64ISD::CSEL)
18764 return SDValue();
18765
18766 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
18767 return SDValue();
18768
18769 if (!isNullConstant(V: CSel0.getOperand(i: 0)) ||
18770 !isOneConstant(V: CSel0.getOperand(i: 1)) ||
18771 !isNullConstant(V: CSel1.getOperand(i: 0)) ||
18772 !isOneConstant(V: CSel1.getOperand(i: 1)))
18773 return SDValue();
18774
18775 SDValue Cmp0 = CSel0.getOperand(i: 3);
18776 SDValue Cmp1 = CSel1.getOperand(i: 3);
18777 AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(i: 2);
18778 AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(i: 2);
18779 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
18780 return SDValue();
18781 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
18782 Cmp0.getOpcode() == AArch64ISD::SUBS) {
18783 std::swap(a&: Cmp0, b&: Cmp1);
18784 std::swap(a&: CC0, b&: CC1);
18785 }
18786
18787 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
18788 return SDValue();
18789
18790 SDLoc DL(N);
18791 SDValue CCmp, Condition;
18792 unsigned NZCV;
18793
18794 if (N->getOpcode() == ISD::AND) {
18795 AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(Code: CC0);
18796 Condition = DAG.getConstant(Val: InvCC0, DL, VT: MVT_CC);
18797 NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: CC1);
18798 } else {
18799 AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(Code: CC1);
18800 Condition = DAG.getConstant(Val: CC0, DL, VT: MVT_CC);
18801 NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: InvCC1);
18802 }
18803
18804 SDValue NZCVOp = DAG.getConstant(Val: NZCV, DL, VT: MVT::i32);
18805
18806 auto *Op1 = dyn_cast<ConstantSDNode>(Val: Cmp1.getOperand(i: 1));
18807 if (Op1 && Op1->getAPIntValue().isNegative() &&
18808 Op1->getAPIntValue().sgt(RHS: -32)) {
18809    // CCMP accepts a constant in the range [0, 31], so if Op1 is a
18810    // constant in the range [-31, -1] we can instead select CCMN with the
18811    // negated constant and avoid the extra mov.
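    // For example, a conditional compare against -3 can be emitted as
    // "ccmn x, #3, ..." rather than materializing -3 in a register for ccmp.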
18812 SDValue AbsOp1 =
18813 DAG.getConstant(Val: Op1->getAPIntValue().abs(), DL, VT: Op1->getValueType(ResNo: 0));
18814 CCmp = DAG.getNode(Opcode: AArch64ISD::CCMN, DL, VT: MVT_CC, N1: Cmp1.getOperand(i: 0), N2: AbsOp1,
18815 N3: NZCVOp, N4: Condition, N5: Cmp0);
18816 } else {
18817 CCmp = DAG.getNode(Opcode: AArch64ISD::CCMP, DL, VT: MVT_CC, N1: Cmp1.getOperand(i: 0),
18818 N2: Cmp1.getOperand(i: 1), N3: NZCVOp, N4: Condition, N5: Cmp0);
18819 }
18820 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: CSel0.getOperand(i: 0),
18821 N2: CSel0.getOperand(i: 1), N3: DAG.getConstant(Val: CC1, DL, VT: MVT::i32),
18822 N4: CCmp);
18823}
18824
18825static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18826 const AArch64Subtarget *Subtarget,
18827 const AArch64TargetLowering &TLI) {
18828 SelectionDAG &DAG = DCI.DAG;
18829 EVT VT = N->getValueType(ResNo: 0);
18830
18831 if (SDValue R = performANDORCSELCombine(N, DAG))
18832 return R;
18833
18834 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18835 return SDValue();
18836
18837 if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
18838 return Res;
18839
18840 return SDValue();
18841}
18842
18843static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
18844 if (!MemVT.getVectorElementType().isSimple())
18845 return false;
18846
18847 uint64_t MaskForTy = 0ull;
18848 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
18849 case MVT::i8:
18850 MaskForTy = 0xffull;
18851 break;
18852 case MVT::i16:
18853 MaskForTy = 0xffffull;
18854 break;
18855 case MVT::i32:
18856 MaskForTy = 0xffffffffull;
18857 break;
18858 default:
18859 return false;
18860 break;
18861 }
18862
18863 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
18864 if (auto *Op0 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0)))
18865 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
18866
18867 return false;
18868}
18869
18870static SDValue performReinterpretCastCombine(SDNode *N) {
18871 SDValue LeafOp = SDValue(N, 0);
18872 SDValue Op = N->getOperand(Num: 0);
18873 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
18874 LeafOp.getValueType() != Op.getValueType())
18875 Op = Op->getOperand(Num: 0);
18876 if (LeafOp.getValueType() == Op.getValueType())
18877 return Op;
18878 return SDValue();
18879}
18880
18881static SDValue performSVEAndCombine(SDNode *N,
18882 TargetLowering::DAGCombinerInfo &DCI) {
18883 SelectionDAG &DAG = DCI.DAG;
18884 SDValue Src = N->getOperand(Num: 0);
18885 unsigned Opc = Src->getOpcode();
18886
18887 // Zero/any extend of an unsigned unpack
18888 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
18889 SDValue UnpkOp = Src->getOperand(Num: 0);
18890 SDValue Dup = N->getOperand(Num: 1);
18891
18892 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
18893 return SDValue();
18894
18895 SDLoc DL(N);
18896 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Dup->getOperand(Num: 0));
18897 if (!C)
18898 return SDValue();
18899
18900 uint64_t ExtVal = C->getZExtValue();
18901
18902 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
18903 return ((ExtVal == 0xFF && VT == MVT::i8) ||
18904 (ExtVal == 0xFFFF && VT == MVT::i16) ||
18905 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
18906 };
18907
18908 // If the mask is fully covered by the unpack, we don't need to push
18909 // a new AND onto the operand
18910 EVT EltTy = UnpkOp->getValueType(ResNo: 0).getVectorElementType();
18911 if (MaskAndTypeMatch(EltTy))
18912 return Src;
18913
18914 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
18915 // to see if the mask is all-ones of size MemTy.
18916 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(Val&: UnpkOp);
18917 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
18918 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
18919 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
18920 if (MaskAndTypeMatch(EltTy))
18921 return Src;
18922 }
18923
18924    // Truncate to prevent a DUP with an overly wide constant.
18925 APInt Mask = C->getAPIntValue().trunc(width: EltTy.getSizeInBits());
18926
18927 // Otherwise, make sure we propagate the AND to the operand
18928 // of the unpack
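    // For example (illustrative), (and (uunpklo (nxv16i8 x)), splat(0x0f))
    // becomes (uunpklo (and x, splat(0x0f))), which is equivalent because the
    // unpack zero-extends each element.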
18929 Dup = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: UnpkOp->getValueType(ResNo: 0),
18930 Operand: DAG.getConstant(Val: Mask.zextOrTrunc(width: 32), DL, VT: MVT::i32));
18931
18932 SDValue And = DAG.getNode(Opcode: ISD::AND, DL,
18933 VT: UnpkOp->getValueType(ResNo: 0), N1: UnpkOp, N2: Dup);
18934
18935 return DAG.getNode(Opcode: Opc, DL, VT: N->getValueType(ResNo: 0), Operand: And);
18936 }
18937
18938 if (DCI.isBeforeLegalizeOps())
18939 return SDValue();
18940
18941  // An AND with an all-active predicate is a no-op, so we can simply
18942  // return the other operand.
18943 if (isAllActivePredicate(DAG, N: N->getOperand(Num: 0)))
18944 return N->getOperand(Num: 1);
18945 if (isAllActivePredicate(DAG, N: N->getOperand(Num: 1)))
18946 return N->getOperand(Num: 0);
18947
18948 if (!EnableCombineMGatherIntrinsics)
18949 return SDValue();
18950
18951 SDValue Mask = N->getOperand(Num: 1);
18952
18953 if (!Src.hasOneUse())
18954 return SDValue();
18955
18956 EVT MemVT;
18957
18958 // SVE load instructions perform an implicit zero-extend, which makes them
18959 // perfect candidates for combining.
18960 switch (Opc) {
18961 case AArch64ISD::LD1_MERGE_ZERO:
18962 case AArch64ISD::LDNF1_MERGE_ZERO:
18963 case AArch64ISD::LDFF1_MERGE_ZERO:
18964 MemVT = cast<VTSDNode>(Val: Src->getOperand(Num: 3))->getVT();
18965 break;
18966 case AArch64ISD::GLD1_MERGE_ZERO:
18967 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
18968 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
18969 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
18970 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
18971 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
18972 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
18973 case AArch64ISD::GLDFF1_MERGE_ZERO:
18974 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
18975 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
18976 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
18977 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
18978 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
18979 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
18980 case AArch64ISD::GLDNT1_MERGE_ZERO:
18981 MemVT = cast<VTSDNode>(Val: Src->getOperand(Num: 4))->getVT();
18982 break;
18983 default:
18984 return SDValue();
18985 }
18986
18987 if (isConstantSplatVectorMaskForType(N: Mask.getNode(), MemVT))
18988 return Src;
18989
18990 return SDValue();
18991}
18992
18993// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
18994static SDValue performANDSETCCCombine(SDNode *N,
18995 TargetLowering::DAGCombinerInfo &DCI) {
18996
18997 // This function performs an optimization on a specific pattern involving
18998 // an AND operation and SETCC (Set Condition Code) node.
18999
19000 SDValue SetCC = N->getOperand(Num: 0);
19001 EVT VT = N->getValueType(ResNo: 0);
19002 SelectionDAG &DAG = DCI.DAG;
19003
19004  // If the current node (N) is used by any SELECT instruction, return an
19005  // empty SDValue; applying the optimization in that case would produce
19006  // incorrect results.
19007 for (auto U : N->uses())
19008 if (U->getOpcode() == ISD::SELECT)
19009 return SDValue();
19010
19011 // Check if the operand is a SETCC node with floating-point comparison
19012 if (SetCC.getOpcode() == ISD::SETCC &&
19013 SetCC.getOperand(i: 0).getValueType() == MVT::f32) {
19014
19015 SDValue Cmp;
19016 AArch64CC::CondCode CC;
19017
19018 // Check if the DAG is after legalization and if we can emit the conjunction
19019 if (!DCI.isBeforeLegalize() &&
19020 (Cmp = emitConjunction(DAG, Val: SDValue(N, 0), OutCC&: CC))) {
19021
19022 AArch64CC::CondCode InvertedCC = AArch64CC::getInvertedCondCode(Code: CC);
19023
19024 SDLoc DL(N);
19025 return DAG.getNode(Opcode: AArch64ISD::CSINC, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT),
19026 N2: DAG.getConstant(Val: 0, DL, VT),
19027 N3: DAG.getConstant(Val: InvertedCC, DL, VT: MVT::i32), N4: Cmp);
19028 }
19029 }
19030 return SDValue();
19031}
19032
19033static SDValue performANDCombine(SDNode *N,
19034 TargetLowering::DAGCombinerInfo &DCI) {
19035 SelectionDAG &DAG = DCI.DAG;
19036 SDValue LHS = N->getOperand(Num: 0);
19037 SDValue RHS = N->getOperand(Num: 1);
19038 EVT VT = N->getValueType(ResNo: 0);
19039
19040 if (SDValue R = performANDORCSELCombine(N, DAG))
19041 return R;
19042
19043  if (SDValue R = performANDSETCCCombine(N, DCI))
19044 return R;
19045
19046 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19047 return SDValue();
19048
19049 if (VT.isScalableVector())
19050 return performSVEAndCombine(N, DCI);
19051
19052 // The combining code below works only for NEON vectors. In particular, it
19053 // does not work for SVE when dealing with vectors wider than 128 bits.
19054 if (!VT.is64BitVector() && !VT.is128BitVector())
19055 return SDValue();
19056
19057 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: RHS.getNode());
19058 if (!BVN)
19059 return SDValue();
19060
19061 // AND does not accept an immediate, so check if we can use a BIC immediate
19062 // instruction instead. We do this here instead of using a (and x, (mvni imm))
19063 // pattern in isel, because some immediates may be lowered to the preferred
19064 // (and x, (movi imm)) form, even though an mvni representation also exists.
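  // For example (illustrative), a per-lane AND with 0xffffff00 can be
  // selected as "bic v.4s, #0xff", clearing just the low byte of each lane,
  // instead of materializing the mask in a register.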
19065 APInt DefBits(VT.getSizeInBits(), 0);
19066 APInt UndefBits(VT.getSizeInBits(), 0);
19067 if (resolveBuildVector(BVN, CnstBits&: DefBits, UndefBits)) {
19068 SDValue NewOp;
19069
19070 // Any bits known to already be 0 need not be cleared again, which can help
19071 // reduce the size of the immediate to one supported by the instruction.
19072 KnownBits Known = DAG.computeKnownBits(Op: LHS);
19073 APInt ZeroSplat(VT.getSizeInBits(), 0);
19074 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
19075 ZeroSplat |= Known.Zero.zext(width: VT.getSizeInBits())
19076 << (Known.Zero.getBitWidth() * I);
19077
19078 DefBits = ~(DefBits | ZeroSplat);
19079 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG,
19080 Bits: DefBits, LHS: &LHS)) ||
19081 (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG,
19082 Bits: DefBits, LHS: &LHS)))
19083 return NewOp;
19084
19085 UndefBits = ~(UndefBits | ZeroSplat);
19086 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG,
19087 Bits: UndefBits, LHS: &LHS)) ||
19088 (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG,
19089 Bits: UndefBits, LHS: &LHS)))
19090 return NewOp;
19091 }
19092
19093 return SDValue();
19094}
19095
19096static SDValue performFADDCombine(SDNode *N,
19097 TargetLowering::DAGCombinerInfo &DCI) {
19098 SelectionDAG &DAG = DCI.DAG;
19099 SDValue LHS = N->getOperand(Num: 0);
19100 SDValue RHS = N->getOperand(Num: 1);
19101 EVT VT = N->getValueType(ResNo: 0);
19102 SDLoc DL(N);
19103
19104 if (!N->getFlags().hasAllowReassociation())
19105 return SDValue();
19106
19107  // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
19108 auto ReassocComplex = [&](SDValue A, SDValue B) {
19109 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
19110 return SDValue();
19111 unsigned Opc = A.getConstantOperandVal(i: 0);
19112 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
19113 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
19114 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
19115 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
19116 return SDValue();
19117 SDValue VCMLA = DAG.getNode(
19118 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: A.getOperand(i: 0),
19119 N2: DAG.getNode(Opcode: ISD::FADD, DL, VT, N1: A.getOperand(i: 1), N2: B, Flags: N->getFlags()),
19120 N3: A.getOperand(i: 2), N4: A.getOperand(i: 3));
19121 VCMLA->setFlags(A->getFlags());
19122 return VCMLA;
19123 };
19124 if (SDValue R = ReassocComplex(LHS, RHS))
19125 return R;
19126 if (SDValue R = ReassocComplex(RHS, LHS))
19127 return R;
19128
19129 return SDValue();
19130}
19131
19132static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
19133 switch (Opcode) {
19134 case ISD::STRICT_FADD:
19135 case ISD::FADD:
19136 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
19137 case ISD::ADD:
19138 return VT == MVT::i64;
19139 default:
19140 return false;
19141 }
19142}
19143
19144static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
19145 AArch64CC::CondCode Cond);
19146
19147static bool isPredicateCCSettingOp(SDValue N) {
19148 if ((N.getOpcode() == ISD::SETCC) ||
19149 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
19150 (N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilege ||
19151 N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilegt ||
19152 N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilehi ||
19153 N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilehs ||
19154 N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilele ||
19155 N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilelo ||
19156 N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilels ||
19157 N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilelt ||
19158 // get_active_lane_mask is lowered to a whilelo instruction.
19159 N.getConstantOperandVal(i: 0) == Intrinsic::get_active_lane_mask)))
19160 return true;
19161
19162 return false;
19163}
19164
19165// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
19166// ... into: "ptrue p, all" + PTEST
19167static SDValue
19168performFirstTrueTestVectorCombine(SDNode *N,
19169 TargetLowering::DAGCombinerInfo &DCI,
19170 const AArch64Subtarget *Subtarget) {
19171 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19172 // Make sure PTEST can be legalised with illegal types.
19173 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
19174 return SDValue();
19175
19176 SDValue N0 = N->getOperand(Num: 0);
19177 EVT VT = N0.getValueType();
19178
19179 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
19180 !isNullConstant(V: N->getOperand(Num: 1)))
19181 return SDValue();
19182
19183  // Restrict the DAG combine to cases where we're extracting from a
19184  // flag-setting operation.
19185 if (!isPredicateCCSettingOp(N: N0))
19186 return SDValue();
19187
19188 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
19189 SelectionDAG &DAG = DCI.DAG;
19190 SDValue Pg = getPTrue(DAG, DL: SDLoc(N), VT, Pattern: AArch64SVEPredPattern::all);
19191 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg, Op: N0, Cond: AArch64CC::FIRST_ACTIVE);
19192}
19193
19194// Materialize : Idx = (add (mul vscale, NumEls), -1)
19195// i1 = extract_vector_elt t37, Constant:i64<Idx>
19196// ... into: "ptrue p, all" + PTEST
19197static SDValue
19198performLastTrueTestVectorCombine(SDNode *N,
19199 TargetLowering::DAGCombinerInfo &DCI,
19200 const AArch64Subtarget *Subtarget) {
19201 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19202  // Make sure PTEST can be legalised with illegal types.
19203 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
19204 return SDValue();
19205
19206 SDValue N0 = N->getOperand(Num: 0);
19207 EVT OpVT = N0.getValueType();
19208
19209 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
19210 return SDValue();
19211
19212 // Idx == (add (mul vscale, NumEls), -1)
19213 SDValue Idx = N->getOperand(Num: 1);
19214 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(V: Idx.getOperand(i: 1)))
19215 return SDValue();
19216
19217 SDValue VS = Idx.getOperand(i: 0);
19218 if (VS.getOpcode() != ISD::VSCALE)
19219 return SDValue();
19220
19221 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
19222 if (VS.getConstantOperandVal(i: 0) != NumEls)
19223 return SDValue();
19224
19225 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
19226 SelectionDAG &DAG = DCI.DAG;
19227 SDValue Pg = getPTrue(DAG, DL: SDLoc(N), VT: OpVT, Pattern: AArch64SVEPredPattern::all);
19228 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg, Op: N0, Cond: AArch64CC::LAST_ACTIVE);
19229}
19230
19231static SDValue
19232performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19233 const AArch64Subtarget *Subtarget) {
19234 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19235 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
19236 return Res;
19237 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
19238 return Res;
19239
19240 SelectionDAG &DAG = DCI.DAG;
19241 SDValue N0 = N->getOperand(Num: 0), N1 = N->getOperand(Num: 1);
19242
19243 EVT VT = N->getValueType(ResNo: 0);
19244 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
19245 bool IsStrict = N0->isStrictFPOpcode();
19246
19247 // extract(dup x) -> x
19248 if (N0.getOpcode() == AArch64ISD::DUP)
19249 return VT.isInteger() ? DAG.getZExtOrTrunc(Op: N0.getOperand(i: 0), DL: SDLoc(N), VT)
19250 : N0.getOperand(i: 0);
19251
19252 // Rewrite for pairwise fadd pattern
19253 // (f32 (extract_vector_elt
19254 // (fadd (vXf32 Other)
19255 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
19256 // ->
19257 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
19258 // (extract_vector_elt (vXf32 Other) 1))
19259 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
19260 // we can only do this when it's used only by the extract_vector_elt.
19261 if (isNullConstant(V: N1) && hasPairwiseAdd(Opcode: N0->getOpcode(), VT, FullFP16) &&
19262 (!IsStrict || N0.hasOneUse())) {
19263 SDLoc DL(N0);
19264 SDValue N00 = N0->getOperand(Num: IsStrict ? 1 : 0);
19265 SDValue N01 = N0->getOperand(Num: IsStrict ? 2 : 1);
19266
19267 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(Val&: N01);
19268 SDValue Other = N00;
19269
19270 // And handle the commutative case.
19271 if (!Shuffle) {
19272 Shuffle = dyn_cast<ShuffleVectorSDNode>(Val&: N00);
19273 Other = N01;
19274 }
19275
19276 if (Shuffle && Shuffle->getMaskElt(Idx: 0) == 1 &&
19277 Other == Shuffle->getOperand(Num: 0)) {
19278 SDValue Extract1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: Other,
19279 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
19280 SDValue Extract2 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: Other,
19281 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i64));
19282 if (!IsStrict)
19283 return DAG.getNode(Opcode: N0->getOpcode(), DL, VT, N1: Extract1, N2: Extract2);
19284
19285 // For strict_fadd we need uses of the final extract_vector to be replaced
19286 // with the strict_fadd, but we also need uses of the chain output of the
19287 // original strict_fadd to use the chain output of the new strict_fadd as
19288 // otherwise it may not be deleted.
19289 SDValue Ret = DAG.getNode(Opcode: N0->getOpcode(), DL,
19290 ResultTys: {VT, MVT::Other},
19291 Ops: {N0->getOperand(Num: 0), Extract1, Extract2});
19292 DAG.ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Ret);
19293 DAG.ReplaceAllUsesOfValueWith(From: N0.getValue(R: 1), To: Ret.getValue(R: 1));
19294 return SDValue(N, 0);
19295 }
19296 }
19297
19298 return SDValue();
19299}
19300
19301static SDValue performConcatVectorsCombine(SDNode *N,
19302 TargetLowering::DAGCombinerInfo &DCI,
19303 SelectionDAG &DAG) {
19304 SDLoc dl(N);
19305 EVT VT = N->getValueType(ResNo: 0);
19306 SDValue N0 = N->getOperand(Num: 0), N1 = N->getOperand(Num: 1);
19307 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
19308
19309 if (VT.isScalableVector())
19310 return SDValue();
19311
19312 // Optimize concat_vectors of truncated vectors, where the intermediate
19313 // type is illegal, to avoid said illegality, e.g.,
19314 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
19315 // (v2i16 (truncate (v2i64)))))
19316 // ->
19317 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
19318 // (v4i32 (bitcast (v2i64))),
19319 // <0, 2, 4, 6>)))
19320 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
19321 // on both input and result type, so we might generate worse code.
19322 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
19323 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
19324 N1Opc == ISD::TRUNCATE) {
19325 SDValue N00 = N0->getOperand(Num: 0);
19326 SDValue N10 = N1->getOperand(Num: 0);
19327 EVT N00VT = N00.getValueType();
19328
19329 if (N00VT == N10.getValueType() &&
19330 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
19331 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
19332 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
19333 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
19334 for (size_t i = 0; i < Mask.size(); ++i)
19335 Mask[i] = i * 2;
19336 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT,
19337 Operand: DAG.getVectorShuffle(
19338 VT: MidVT, dl,
19339 N1: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MidVT, Operand: N00),
19340 N2: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MidVT, Operand: N10), Mask));
19341 }
19342 }
19343
19344 if (N->getOperand(Num: 0).getValueType() == MVT::v4i8 ||
19345 N->getOperand(Num: 0).getValueType() == MVT::v2i16 ||
19346 N->getOperand(Num: 0).getValueType() == MVT::v2i8) {
19347 EVT SrcVT = N->getOperand(Num: 0).getValueType();
19348 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
19349 // loads to prevent having to go through the v4i8 load legalization that
19350 // needs to extend each element into a larger type.
19351 if (N->getNumOperands() % 2 == 0 &&
19352 all_of(Range: N->op_values(), P: [SrcVT](SDValue V) {
19353 if (V.getValueType() != SrcVT)
19354 return false;
19355 if (V.isUndef())
19356 return true;
19357 LoadSDNode *LD = dyn_cast<LoadSDNode>(Val&: V);
19358 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
19359 LD->getExtensionType() == ISD::NON_EXTLOAD;
19360 })) {
19361 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
19362 EVT NVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: FVT, NumElements: N->getNumOperands());
19363 SmallVector<SDValue> Ops;
19364
19365 for (unsigned i = 0; i < N->getNumOperands(); i++) {
19366 SDValue V = N->getOperand(Num: i);
19367 if (V.isUndef())
19368 Ops.push_back(Elt: DAG.getUNDEF(VT: FVT));
19369 else {
19370 LoadSDNode *LD = cast<LoadSDNode>(Val&: V);
19371 SDValue NewLoad = DAG.getLoad(VT: FVT, dl, Chain: LD->getChain(),
19372 Ptr: LD->getBasePtr(), MMO: LD->getMemOperand());
19373 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LD, 1), To: NewLoad.getValue(R: 1));
19374 Ops.push_back(Elt: NewLoad);
19375 }
19376 }
19377 return DAG.getBitcast(VT: N->getValueType(ResNo: 0),
19378 V: DAG.getBuildVector(VT: NVT, DL: dl, Ops));
19379 }
19380 }
19381
19382 // Canonicalise concat_vectors to replace concatenations of truncated nots
19383 // with nots of concatenated truncates. This in some cases allows for multiple
19384 // redundant negations to be eliminated.
19385 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
19386 // (v4i16 (truncate (not (v4i32)))))
19387 // ->
19388 // (not (concat_vectors (v4i16 (truncate (v4i32))),
19389 // (v4i16 (truncate (v4i32)))))
19390 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
19391 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N: N0.getNode()) &&
19392 N->isOnlyUserOf(N: N1.getNode())) {
19393 auto isBitwiseVectorNegate = [](SDValue V) {
19394 return V->getOpcode() == ISD::XOR &&
19395 ISD::isConstantSplatVectorAllOnes(N: V.getOperand(i: 1).getNode());
19396 };
19397 SDValue N00 = N0->getOperand(Num: 0);
19398 SDValue N10 = N1->getOperand(Num: 0);
19399 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N: N00.getNode()) &&
19400 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N: N10.getNode())) {
19401 return DAG.getNOT(
19402 DL: dl,
19403 Val: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT,
19404 N1: DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: N0.getValueType(),
19405 Operand: N00->getOperand(Num: 0)),
19406 N2: DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: N1.getValueType(),
19407 Operand: N10->getOperand(Num: 0))),
19408 VT);
19409 }
19410 }
19411
19412 // Wait till after everything is legalized to try this. That way we have
19413 // legal vector types and such.
19414 if (DCI.isBeforeLegalizeOps())
19415 return SDValue();
19416
19417 // Optimise concat_vectors of two identical binops with a 128-bit destination
19418  // size, combining into a binop of two concats of the source vectors, e.g.:
19419 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
19420 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
19421 DAG.getTargetLoweringInfo().isBinOp(Opcode: N0Opc) && N0->hasOneUse() &&
19422 N1->hasOneUse()) {
19423 SDValue N00 = N0->getOperand(Num: 0);
19424 SDValue N01 = N0->getOperand(Num: 1);
19425 SDValue N10 = N1->getOperand(Num: 0);
19426 SDValue N11 = N1->getOperand(Num: 1);
19427
19428 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
19429 SDValue Concat0 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: N00, N2: N10);
19430 SDValue Concat1 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: N01, N2: N11);
19431 return DAG.getNode(Opcode: N0Opc, DL: dl, VT, N1: Concat0, N2: Concat1);
19432 }
19433 }
19434
19435 auto IsRSHRN = [](SDValue Shr) {
19436 if (Shr.getOpcode() != AArch64ISD::VLSHR)
19437 return false;
19438 SDValue Op = Shr.getOperand(i: 0);
19439 EVT VT = Op.getValueType();
19440 unsigned ShtAmt = Shr.getConstantOperandVal(i: 1);
19441 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
19442 return false;
19443
19444 APInt Imm;
19445 if (Op.getOperand(i: 1).getOpcode() == AArch64ISD::MOVIshift)
19446 Imm = APInt(VT.getScalarSizeInBits(),
19447 Op.getOperand(i: 1).getConstantOperandVal(i: 0)
19448 << Op.getOperand(i: 1).getConstantOperandVal(i: 1));
19449 else if (Op.getOperand(i: 1).getOpcode() == AArch64ISD::DUP &&
19450 isa<ConstantSDNode>(Val: Op.getOperand(i: 1).getOperand(i: 0)))
19451 Imm = APInt(VT.getScalarSizeInBits(),
19452 Op.getOperand(i: 1).getConstantOperandVal(i: 0));
19453 else
19454 return false;
19455
19456 if (Imm != 1ULL << (ShtAmt - 1))
19457 return false;
19458 return true;
19459 };
19460
19461 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
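  // (An "rshrn" here is the rounding pattern matched by IsRSHRN above, i.e.
  // vlshr(add(x, 1 << (Shift - 1)), Shift); the combined node is rebuilt in
  // the same form at twice the vector width.)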
19462 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
19463 ((IsRSHRN(N1) &&
19464 N0.getConstantOperandVal(i: 1) == N1.getConstantOperandVal(i: 1)) ||
19465 N1.isUndef())) {
19466 SDValue X = N0.getOperand(i: 0).getOperand(i: 0);
19467 SDValue Y = N1.isUndef() ? DAG.getUNDEF(VT: X.getValueType())
19468 : N1.getOperand(i: 0).getOperand(i: 0);
19469 EVT BVT =
19470 X.getValueType().getDoubleNumVectorElementsVT(Context&: *DCI.DAG.getContext());
19471 SDValue CC = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: BVT, N1: X, N2: Y);
19472 SDValue Add = DAG.getNode(
19473 Opcode: ISD::ADD, DL: dl, VT: BVT, N1: CC,
19474 N2: DAG.getConstant(Val: 1ULL << (N0.getConstantOperandVal(i: 1) - 1), DL: dl, VT: BVT));
19475 SDValue Shr =
19476 DAG.getNode(Opcode: AArch64ISD::VLSHR, DL: dl, VT: BVT, N1: Add, N2: N0.getOperand(i: 1));
19477 return Shr;
19478 }
19479
19480 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
19481 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
19482 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(i: 0) == N1.getOperand(i: 0) &&
19483 N0.getOperand(i: 1) == N1.getOperand(i: 1)) {
19484 SDValue E0 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: N0.getOperand(i: 0),
19485 N2: DAG.getUNDEF(VT: N0.getValueType()));
19486 SDValue E1 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: N0.getOperand(i: 1),
19487 N2: DAG.getUNDEF(VT: N0.getValueType()));
19488 return DAG.getNode(Opcode: AArch64ISD::ZIP1, DL: dl, VT, N1: E0, N2: E1);
19489 }
19490
19491 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
19492 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
19493 // canonicalise to that.
19494 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
19495 assert(VT.getScalarSizeInBits() == 64);
19496 return DAG.getNode(Opcode: AArch64ISD::DUPLANE64, DL: dl, VT, N1: WidenVector(V64Reg: N0, DAG),
19497 N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64));
19498 }
19499
19500 // Canonicalise concat_vectors so that the right-hand vector has as few
19501 // bit-casts as possible before its real operation. The primary matching
19502 // destination for these operations will be the narrowing "2" instructions,
19503 // which depend on the operation being performed on this right-hand vector.
19504 // For example,
19505 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
19506 // becomes
19507 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
19508
19509 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
19510 return SDValue();
19511 SDValue RHS = N1->getOperand(Num: 0);
19512 MVT RHSTy = RHS.getValueType().getSimpleVT();
19513 // If the RHS is not a vector, this is not the pattern we're looking for.
19514 if (!RHSTy.isVector())
19515 return SDValue();
19516
19517 LLVM_DEBUG(
19518 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
19519
19520 MVT ConcatTy = MVT::getVectorVT(VT: RHSTy.getVectorElementType(),
19521 NumElements: RHSTy.getVectorNumElements() * 2);
19522 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT,
19523 Operand: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: ConcatTy,
19524 N1: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: RHSTy, Operand: N0),
19525 N2: RHS));
19526}
19527
19528static SDValue
19529performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19530 SelectionDAG &DAG) {
19531 if (DCI.isBeforeLegalizeOps())
19532 return SDValue();
19533
19534 EVT VT = N->getValueType(ResNo: 0);
19535 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
19536 return SDValue();
19537
19538 SDValue V = N->getOperand(Num: 0);
19539
19540 // NOTE: This combine exists in DAGCombiner, but that version's legality check
19541 // blocks this combine because the non-const case requires custom lowering.
19542 //
19543 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
19544 if (V.getOpcode() == ISD::SPLAT_VECTOR)
19545 if (isa<ConstantSDNode>(Val: V.getOperand(i: 0)))
19546 return DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL: SDLoc(N), VT, Operand: V.getOperand(i: 0));
19547
19548 return SDValue();
19549}
19550
19551static SDValue
19552performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19553 SelectionDAG &DAG) {
19554 SDLoc DL(N);
19555 SDValue Vec = N->getOperand(Num: 0);
19556 SDValue SubVec = N->getOperand(Num: 1);
19557 uint64_t IdxVal = N->getConstantOperandVal(Num: 2);
19558 EVT VecVT = Vec.getValueType();
19559 EVT SubVT = SubVec.getValueType();
19560
19561 // Only do this for legal fixed vector types.
19562 if (!VecVT.isFixedLengthVector() ||
19563 !DAG.getTargetLoweringInfo().isTypeLegal(VT: VecVT) ||
19564 !DAG.getTargetLoweringInfo().isTypeLegal(VT: SubVT))
19565 return SDValue();
19566
19567 // Ignore widening patterns.
19568 if (IdxVal == 0 && Vec.isUndef())
19569 return SDValue();
19570
19571 // Subvector must be half the width and an "aligned" insertion.
19572 unsigned NumSubElts = SubVT.getVectorNumElements();
19573 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
19574 (IdxVal != 0 && IdxVal != NumSubElts))
19575 return SDValue();
19576
19577 // Fold insert_subvector -> concat_vectors
19578 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
19579 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
19580 SDValue Lo, Hi;
19581 if (IdxVal == 0) {
19582 Lo = SubVec;
19583 Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: SubVT, N1: Vec,
19584 N2: DAG.getVectorIdxConstant(Val: NumSubElts, DL));
19585 } else {
19586 Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: SubVT, N1: Vec,
19587 N2: DAG.getVectorIdxConstant(Val: 0, DL));
19588 Hi = SubVec;
19589 }
19590 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: VecVT, N1: Lo, N2: Hi);
19591}
19592
19593static SDValue tryCombineFixedPointConvert(SDNode *N,
19594 TargetLowering::DAGCombinerInfo &DCI,
19595 SelectionDAG &DAG) {
19596 // Wait until after everything is legalized to try this. That way we have
19597 // legal vector types and such.
19598 if (DCI.isBeforeLegalizeOps())
19599 return SDValue();
19600 // Transform a scalar conversion of a value from a lane extract into a
19601 // lane extract of a vector conversion. E.g., from foo1 to foo2:
19602 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
19603 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
19604 //
19605 // The second form interacts better with instruction selection and the
19606 // register allocator to avoid cross-class register copies that aren't
19607 // coalescable due to a lane reference.
19608
19609 // Check the operand and see if it originates from a lane extract.
19610 SDValue Op1 = N->getOperand(Num: 1);
19611 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
19612 return SDValue();
19613
19614 // Yep, no additional predication needed. Perform the transform.
19615 SDValue IID = N->getOperand(Num: 0);
19616 SDValue Shift = N->getOperand(Num: 2);
19617 SDValue Vec = Op1.getOperand(i: 0);
19618 SDValue Lane = Op1.getOperand(i: 1);
19619 EVT ResTy = N->getValueType(ResNo: 0);
19620 EVT VecResTy;
19621 SDLoc DL(N);
19622
19623 // The vector width should be 128 bits by the time we get here, even
19624 // if it started as 64 bits (the extract_vector handling will have
19625 // done so). Bail if it is not.
19626 if (Vec.getValueSizeInBits() != 128)
19627 return SDValue();
19628
19629 if (Vec.getValueType() == MVT::v4i32)
19630 VecResTy = MVT::v4f32;
19631 else if (Vec.getValueType() == MVT::v2i64)
19632 VecResTy = MVT::v2f64;
19633 else
19634 return SDValue();
19635
19636 SDValue Convert =
19637 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: VecResTy, N1: IID, N2: Vec, N3: Shift);
19638 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ResTy, N1: Convert, N2: Lane);
19639}
19640
19641// AArch64 high-vector "long" operations are formed by performing the non-high
19642// version on an extract_subvector of each operand which gets the high half:
19643//
19644// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
19645//
19646// However, there are cases which don't have an extract_high explicitly, but
19647// have another operation that can be made compatible with one for free. For
19648// example:
19649//
19650// (dupv64 scalar) --> (extract_high (dup128 scalar))
19651//
19652// This routine does the actual conversion of such DUPs, once outer routines
19653// have determined that everything else is in order.
19654// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
19655// similarly here.
19656static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
19657 MVT VT = N.getSimpleValueType();
19658 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
19659 N.getConstantOperandVal(i: 1) == 0)
19660 N = N.getOperand(i: 0);
19661
19662 switch (N.getOpcode()) {
19663 case AArch64ISD::DUP:
19664 case AArch64ISD::DUPLANE8:
19665 case AArch64ISD::DUPLANE16:
19666 case AArch64ISD::DUPLANE32:
19667 case AArch64ISD::DUPLANE64:
19668 case AArch64ISD::MOVI:
19669 case AArch64ISD::MOVIshift:
19670 case AArch64ISD::MOVIedit:
19671 case AArch64ISD::MOVImsl:
19672 case AArch64ISD::MVNIshift:
19673 case AArch64ISD::MVNImsl:
19674 break;
19675 default:
19676 // FMOV could be supported, but isn't very useful, as it would only occur
19677    // if you passed a bitcast'd floating-point immediate to an eligible long
19678 // integer op (addl, smull, ...).
19679 return SDValue();
19680 }
19681
19682 if (!VT.is64BitVector())
19683 return SDValue();
19684
19685 SDLoc DL(N);
19686 unsigned NumElems = VT.getVectorNumElements();
19687 if (N.getValueType().is64BitVector()) {
19688 MVT ElementTy = VT.getVectorElementType();
19689 MVT NewVT = MVT::getVectorVT(VT: ElementTy, NumElements: NumElems * 2);
19690 N = DAG.getNode(Opcode: N->getOpcode(), DL, VT: NewVT, Ops: N->ops());
19691 }
19692
19693 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: N,
19694 N2: DAG.getConstant(Val: NumElems, DL, VT: MVT::i64));
19695}
19696
19697static bool isEssentiallyExtractHighSubvector(SDValue N) {
19698 if (N.getOpcode() == ISD::BITCAST)
19699 N = N.getOperand(i: 0);
19700 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
19701 return false;
19702 if (N.getOperand(i: 0).getValueType().isScalableVector())
19703 return false;
19704 return N.getConstantOperandAPInt(i: 1) ==
19705 N.getOperand(i: 0).getValueType().getVectorNumElements() / 2;
19706}
19707
19708/// Helper structure to keep track of ISD::SET_CC operands.
19709struct GenericSetCCInfo {
19710 const SDValue *Opnd0;
19711 const SDValue *Opnd1;
19712 ISD::CondCode CC;
19713};
19714
19715/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
19716struct AArch64SetCCInfo {
19717 const SDValue *Cmp;
19718 AArch64CC::CondCode CC;
19719};
19720
19721/// Helper structure to keep track of SetCC information.
19722union SetCCInfo {
19723 GenericSetCCInfo Generic;
19724 AArch64SetCCInfo AArch64;
19725};
19726
19727/// Helper structure used to read SetCC information. If the IsAArch64 field
19728/// is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
19729/// GenericSetCCInfo.
19730struct SetCCInfoAndKind {
19731 SetCCInfo Info;
19732 bool IsAArch64;
19733};
19734
19735/// Check whether or not \p Op is a SET_CC operation, either a generic one or
19736/// an AArch64-lowered one.
19737/// \p SetCCInfo is filled accordingly.
19738/// \post SetCCInfo is meaningful only when this function returns true.
19739/// \return True when Op is a kind of SET_CC operation.
19741static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
19742  // If this is a setcc, this is straightforward.
19743 if (Op.getOpcode() == ISD::SETCC) {
19744 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(i: 0);
19745 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(i: 1);
19746 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
19747 SetCCInfo.IsAArch64 = false;
19748 return true;
19749 }
19750 // Otherwise, check if this is a matching csel instruction.
19751 // In other words:
19752 // - csel 1, 0, cc
19753 // - csel 0, 1, !cc
19754 if (Op.getOpcode() != AArch64ISD::CSEL)
19755 return false;
19756 // Set the information about the operands.
19757 // TODO: we want the operands of the Cmp not the csel
19758 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(i: 3);
19759 SetCCInfo.IsAArch64 = true;
19760 SetCCInfo.Info.AArch64.CC =
19761 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(i: 2));
19762
19763  // Check that the operands match the constraints:
19764 // (1) Both operands must be constants.
19765 // (2) One must be 1 and the other must be 0.
19766 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 0));
19767 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1));
19768
19769 // Check (1).
19770 if (!TValue || !FValue)
19771 return false;
19772
19773 // Check (2).
19774 if (!TValue->isOne()) {
19775 // Update the comparison when we are interested in !cc.
19776 std::swap(a&: TValue, b&: FValue);
19777 SetCCInfo.Info.AArch64.CC =
19778 AArch64CC::getInvertedCondCode(Code: SetCCInfo.Info.AArch64.CC);
19779 }
19780 return TValue->isOne() && FValue->isZero();
19781}
19782
19783// Returns true if Op is setcc or zext of setcc.
19784static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
19785 if (isSetCC(Op, SetCCInfo&: Info))
19786 return true;
19787 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
19788 isSetCC(Op: Op->getOperand(Num: 0), SetCCInfo&: Info));
19789}
19790
19791// The folding we want to perform is:
19792// (add x, [zext] (setcc cc ...) )
19793// -->
19794// (csel x, (add x, 1), !cc ...)
19795//
19796// The latter will get matched to a CSINC instruction.
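// For example (illustrative), (add w0, (setcc eq (cmp w1, w2))) becomes
// (csel w0, (add w0, 1), ne, (cmp w1, w2)), which selection turns into
//   cmp   w1, w2
//   csinc w0, w0, w0, ne
// i.e. w0 + (w1 == w2) without materializing the boolean.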
19797static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
19798 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
19799 SDValue LHS = Op->getOperand(Num: 0);
19800 SDValue RHS = Op->getOperand(Num: 1);
19801 SetCCInfoAndKind InfoAndKind;
19802
19803 // If both operands are a SET_CC, then we don't want to perform this
19804 // folding and create another csel as this results in more instructions
19805 // (and higher register usage).
19806 if (isSetCCOrZExtSetCC(Op: LHS, Info&: InfoAndKind) &&
19807 isSetCCOrZExtSetCC(Op: RHS, Info&: InfoAndKind))
19808 return SDValue();
19809
19810 // If neither operand is a SET_CC, give up.
19811 if (!isSetCCOrZExtSetCC(Op: LHS, Info&: InfoAndKind)) {
19812 std::swap(a&: LHS, b&: RHS);
19813 if (!isSetCCOrZExtSetCC(Op: LHS, Info&: InfoAndKind))
19814 return SDValue();
19815 }
19816
19817  // FIXME: This could be generalized to work for FP comparisons.
19818 EVT CmpVT = InfoAndKind.IsAArch64
19819 ? InfoAndKind.Info.AArch64.Cmp->getOperand(i: 0).getValueType()
19820 : InfoAndKind.Info.Generic.Opnd0->getValueType();
19821 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
19822 return SDValue();
19823
19824 SDValue CCVal;
19825 SDValue Cmp;
19826 SDLoc dl(Op);
19827 if (InfoAndKind.IsAArch64) {
19828 CCVal = DAG.getConstant(
19829 Val: AArch64CC::getInvertedCondCode(Code: InfoAndKind.Info.AArch64.CC), DL: dl,
19830 VT: MVT::i32);
19831 Cmp = *InfoAndKind.Info.AArch64.Cmp;
19832 } else
19833 Cmp = getAArch64Cmp(
19834 LHS: *InfoAndKind.Info.Generic.Opnd0, RHS: *InfoAndKind.Info.Generic.Opnd1,
19835 CC: ISD::getSetCCInverse(Operation: InfoAndKind.Info.Generic.CC, Type: CmpVT), AArch64cc&: CCVal, DAG,
19836 dl);
19837
19838 EVT VT = Op->getValueType(ResNo: 0);
19839 LHS = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: RHS, N2: DAG.getConstant(Val: 1, DL: dl, VT));
19840 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: RHS, N2: LHS, N3: CCVal, N4: Cmp);
19841}
19842
19843// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
19844static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
19845 EVT VT = N->getValueType(ResNo: 0);
19846 // Only scalar integer and vector types.
19847 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
19848 return SDValue();
19849
19850 SDValue LHS = N->getOperand(Num: 0);
19851 SDValue RHS = N->getOperand(Num: 1);
19852 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19853 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
19854 return SDValue();
19855
19856 auto *LHSN1 = dyn_cast<ConstantSDNode>(Val: LHS->getOperand(Num: 1));
19857 auto *RHSN1 = dyn_cast<ConstantSDNode>(Val: RHS->getOperand(Num: 1));
19858 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
19859 return SDValue();
19860
19861 SDValue Op1 = LHS->getOperand(Num: 0);
19862 SDValue Op2 = RHS->getOperand(Num: 0);
19863 EVT OpVT1 = Op1.getValueType();
19864 EVT OpVT2 = Op2.getValueType();
19865 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
19866 Op2.getOpcode() != AArch64ISD::UADDV ||
19867 OpVT1.getVectorElementType() != VT)
19868 return SDValue();
19869
19870 SDValue Val1 = Op1.getOperand(i: 0);
19871 SDValue Val2 = Op2.getOperand(i: 0);
19872 EVT ValVT = Val1->getValueType(ResNo: 0);
19873 SDLoc DL(N);
19874 SDValue AddVal = DAG.getNode(Opcode: ISD::ADD, DL, VT: ValVT, N1: Val1, N2: Val2);
19875 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT,
19876 N1: DAG.getNode(Opcode: AArch64ISD::UADDV, DL, VT: ValVT, Operand: AddVal),
19877 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
19878}
19879
19880/// Perform the scalar expression combine in the form of:
19881/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
19882/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
19883static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
19884 EVT VT = N->getValueType(ResNo: 0);
19885 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
19886 return SDValue();
19887
19888 SDValue LHS = N->getOperand(Num: 0);
19889 SDValue RHS = N->getOperand(Num: 1);
19890
19891 // Handle commutativity.
19892 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19893 LHS.getOpcode() != AArch64ISD::CSNEG) {
19894 std::swap(a&: LHS, b&: RHS);
19895 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19896 LHS.getOpcode() != AArch64ISD::CSNEG) {
19897 return SDValue();
19898 }
19899 }
19900
19901 if (!LHS.hasOneUse())
19902 return SDValue();
19903
19904 AArch64CC::CondCode AArch64CC =
19905 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(i: 2));
19906
19907 // The CSEL should include a constant one operand, and the CSNEG should
19908 // include a one or negative-one operand.
19909 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 0));
19910 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
19911 if (!CTVal || !CFVal)
19912 return SDValue();
19913
19914 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
19915 (CTVal->isOne() || CFVal->isOne())) &&
19916 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
19917 (CTVal->isOne() || CFVal->isAllOnes())))
19918 return SDValue();
19919
19920 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
19921 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
19922 !CFVal->isOne()) {
19923 std::swap(a&: CTVal, b&: CFVal);
19924 AArch64CC = AArch64CC::getInvertedCondCode(Code: AArch64CC);
19925 }
19926
19927 SDLoc DL(N);
19928 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
19929 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
19930 !CFVal->isAllOnes()) {
19931 APInt C = -1 * CFVal->getAPIntValue();
19932 CTVal = cast<ConstantSDNode>(Val: DAG.getConstant(Val: C, DL, VT));
19933 CFVal = cast<ConstantSDNode>(Val: DAG.getAllOnesConstant(DL, VT));
19934 AArch64CC = AArch64CC::getInvertedCondCode(Code: AArch64CC);
19935 }
19936
19937 // It might be neutral for larger constants, as the immediate needs to be
19938 // materialized in a register.
19939 APInt ADDC = CTVal->getAPIntValue();
19940 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19941 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
19942 return SDValue();
19943
19944 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
19945 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
19946 "Unexpected constant value");
19947
19948 SDValue NewNode = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: SDValue(CTVal, 0));
19949 SDValue CCVal = DAG.getConstant(Val: AArch64CC, DL, VT: MVT::i32);
19950 SDValue Cmp = LHS.getOperand(i: 3);
19951
19952 return DAG.getNode(Opcode: AArch64ISD::CSINC, DL, VT, N1: NewNode, N2: RHS, N3: CCVal, N4: Cmp);
19953}
19954
19955// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
19956static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
19957 EVT VT = N->getValueType(ResNo: 0);
19958 if (N->getOpcode() != ISD::ADD)
19959 return SDValue();
19960
19961 SDValue Dot = N->getOperand(Num: 0);
19962 SDValue A = N->getOperand(Num: 1);
19963 // Handle commutativity
19964 auto isZeroDot = [](SDValue Dot) {
19965 return (Dot.getOpcode() == AArch64ISD::UDOT ||
19966 Dot.getOpcode() == AArch64ISD::SDOT) &&
19967 isZerosVector(N: Dot.getOperand(i: 0).getNode());
19968 };
19969 if (!isZeroDot(Dot))
19970 std::swap(a&: Dot, b&: A);
19971 if (!isZeroDot(Dot))
19972 return SDValue();
19973
19974 return DAG.getNode(Opcode: Dot.getOpcode(), DL: SDLoc(N), VT, N1: A, N2: Dot.getOperand(i: 1),
19975 N3: Dot.getOperand(i: 2));
19976}
19977
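// Returns true if Op is an integer negation, i.e. (sub 0, x).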
19978static bool isNegatedInteger(SDValue Op) {
19979 return Op.getOpcode() == ISD::SUB && isNullConstant(V: Op.getOperand(i: 0));
19980}
19981
19982static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
19983 SDLoc DL(Op);
19984 EVT VT = Op.getValueType();
19985 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
19986 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero, N2: Op);
19987}
19988
19989// Try to fold
19990//
19991// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
19992//
19993 // The folding helps csel to be matched with csneg without generating a
19994 // redundant neg instruction, which includes the negation of the csel expansion
19995 // of an abs node lowered by lowerABS.
19996static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
19997 if (!isNegatedInteger(Op: SDValue(N, 0)))
19998 return SDValue();
19999
20000 SDValue CSel = N->getOperand(Num: 1);
20001 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
20002 return SDValue();
20003
20004 SDValue N0 = CSel.getOperand(i: 0);
20005 SDValue N1 = CSel.getOperand(i: 1);
20006
20007 // If neither of them is a negation, the fold is not worth it, as it would
20008 // introduce two additional negations while removing only one.
20009 if (!isNegatedInteger(Op: N0) && !isNegatedInteger(Op: N1))
20010 return SDValue();
20011
20012 SDValue N0N = getNegatedInteger(Op: N0, DAG);
20013 SDValue N1N = getNegatedInteger(Op: N1, DAG);
20014
20015 SDLoc DL(N);
20016 EVT VT = CSel.getValueType();
20017 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: N0N, N2: N1N, N3: CSel.getOperand(i: 2),
20018 N4: CSel.getOperand(i: 3));
20019}
20020
20021// The basic add/sub long vector instructions have variants with "2" on the end
20022// which act on the high-half of their inputs. They are normally matched by
20023// patterns like:
20024//
20025// (add (zeroext (extract_high LHS)),
20026// (zeroext (extract_high RHS)))
20027// -> uaddl2 vD, vN, vM
20028//
20029// However, if one of the extracts is something like a duplicate, this
20030// instruction can still be used profitably. This function puts the DAG into a
20031// more appropriate form for those patterns to trigger.
20032static SDValue performAddSubLongCombine(SDNode *N,
20033 TargetLowering::DAGCombinerInfo &DCI) {
20034 SelectionDAG &DAG = DCI.DAG;
20035 if (DCI.isBeforeLegalizeOps())
20036 return SDValue();
20037
20038 MVT VT = N->getSimpleValueType(ResNo: 0);
20039 if (!VT.is128BitVector()) {
20040 if (N->getOpcode() == ISD::ADD)
20041 return performSetccAddFolding(Op: N, DAG);
20042 return SDValue();
20043 }
20044
20045 // Make sure both branches are extended in the same way.
20046 SDValue LHS = N->getOperand(Num: 0);
20047 SDValue RHS = N->getOperand(Num: 1);
20048 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
20049 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
20050 LHS.getOpcode() != RHS.getOpcode())
20051 return SDValue();
20052
20053 unsigned ExtType = LHS.getOpcode();
20054
20055 // It's not worth doing unless at least one of the inputs is already an
20056 // extract, but we don't know which it'll be so we have to try both.
20057 if (isEssentiallyExtractHighSubvector(N: LHS.getOperand(i: 0))) {
20058 RHS = tryExtendDUPToExtractHigh(N: RHS.getOperand(i: 0), DAG);
20059 if (!RHS.getNode())
20060 return SDValue();
20061
20062 RHS = DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT, Operand: RHS);
20063 } else if (isEssentiallyExtractHighSubvector(N: RHS.getOperand(i: 0))) {
20064 LHS = tryExtendDUPToExtractHigh(N: LHS.getOperand(i: 0), DAG);
20065 if (!LHS.getNode())
20066 return SDValue();
20067
20068 LHS = DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT, Operand: LHS);
20069 }
20070
20071 return DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT, N1: LHS, N2: RHS);
20072}
20073
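// Returns true if Op is an AArch64ISD::SUBS node whose value result is unused,
// i.e. the subtraction exists only to set the condition flags (a compare).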
20074static bool isCMP(SDValue Op) {
20075 return Op.getOpcode() == AArch64ISD::SUBS &&
20076 !Op.getNode()->hasAnyUseOfValue(Value: 0);
20077}
20078
20079// (CSEL 1 0 CC Cond) => CC
20080// (CSEL 0 1 CC Cond) => !CC
20081static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
20082 if (Op.getOpcode() != AArch64ISD::CSEL)
20083 return std::nullopt;
20084 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(i: 2));
20085 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
20086 return std::nullopt;
20087 SDValue OpLHS = Op.getOperand(i: 0);
20088 SDValue OpRHS = Op.getOperand(i: 1);
20089 if (isOneConstant(V: OpLHS) && isNullConstant(V: OpRHS))
20090 return CC;
20091 if (isNullConstant(V: OpLHS) && isOneConstant(V: OpRHS))
20092 return getInvertedCondCode(Code: CC);
20093
20094 return std::nullopt;
20095}
20096
20097// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
20098// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
20099static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
20100 SDValue CmpOp = Op->getOperand(Num: 2);
20101 if (!isCMP(Op: CmpOp))
20102 return SDValue();
20103
20104 if (IsAdd) {
20105 if (!isOneConstant(V: CmpOp.getOperand(i: 1)))
20106 return SDValue();
20107 } else {
20108 if (!isNullConstant(V: CmpOp.getOperand(i: 0)))
20109 return SDValue();
20110 }
20111
20112 SDValue CsetOp = CmpOp->getOperand(Num: IsAdd ? 0 : 1);
20113 auto CC = getCSETCondCode(Op: CsetOp);
20114 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
20115 return SDValue();
20116
20117 return DAG.getNode(Opcode: Op->getOpcode(), DL: SDLoc(Op), VTList: Op->getVTList(),
20118 N1: Op->getOperand(Num: 0), N2: Op->getOperand(Num: 1),
20119 N3: CsetOp.getOperand(i: 3));
20120}
20121
20122// (ADC x 0 cond) => (CINC x HS cond)
20123static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
20124 SDValue LHS = N->getOperand(Num: 0);
20125 SDValue RHS = N->getOperand(Num: 1);
20126 SDValue Cond = N->getOperand(Num: 2);
20127
20128 if (!isNullConstant(V: RHS))
20129 return SDValue();
20130
20131 EVT VT = N->getValueType(ResNo: 0);
20132 SDLoc DL(N);
20133
20134 // (CINC x cc cond) <=> (CSINC x x !cc cond)
20135 SDValue CC = DAG.getConstant(Val: AArch64CC::LO, DL, VT: MVT::i32);
20136 return DAG.getNode(Opcode: AArch64ISD::CSINC, DL, VT, N1: LHS, N2: LHS, N3: CC, N4: Cond);
20137}
20138
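// Attempt several BUILD_VECTOR simplifications: a v4f16/v4bf16 build vector of
// FP_ROUNDed v2f64 lanes is rewritten using FCVTXN and a single FP_ROUND, a
// v2f64 build vector of FP_EXTENDed f16/bf16 lanes is rewritten as a two-step
// extend through v4f32, and a v2i32 build vector of two contiguous extracts is
// rewritten as an any-extend followed by an extract_subvector.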
20139static SDValue performBuildVectorCombine(SDNode *N,
20140 TargetLowering::DAGCombinerInfo &DCI,
20141 SelectionDAG &DAG) {
20142 SDLoc DL(N);
20143 EVT VT = N->getValueType(ResNo: 0);
20144
20145 if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
20146 (VT == MVT::v4f16 || VT == MVT::v4bf16)) {
20147 SDValue Elt0 = N->getOperand(Num: 0), Elt1 = N->getOperand(Num: 1),
20148 Elt2 = N->getOperand(Num: 2), Elt3 = N->getOperand(Num: 3);
20149 if (Elt0->getOpcode() == ISD::FP_ROUND &&
20150 Elt1->getOpcode() == ISD::FP_ROUND &&
20151 isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 1)) &&
20152 isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 1)) &&
20153 Elt0->getConstantOperandVal(Num: 1) == Elt1->getConstantOperandVal(Num: 1) &&
20154 Elt0->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20155 Elt1->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20156 // Constant index.
20157 isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 0)->getOperand(Num: 1)) &&
20158 isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 0)->getOperand(Num: 1)) &&
20159 Elt0->getOperand(Num: 0)->getOperand(Num: 0) ==
20160 Elt1->getOperand(Num: 0)->getOperand(Num: 0) &&
20161 Elt0->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 0 &&
20162 Elt1->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 1) {
20163 SDValue LowLanesSrcVec = Elt0->getOperand(Num: 0)->getOperand(Num: 0);
20164 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
20165 SDValue HighLanes;
20166 if (Elt2->getOpcode() == ISD::UNDEF &&
20167 Elt3->getOpcode() == ISD::UNDEF) {
20168 HighLanes = DAG.getUNDEF(VT: MVT::v2f32);
20169 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
20170 Elt3->getOpcode() == ISD::FP_ROUND &&
20171 isa<ConstantSDNode>(Val: Elt2->getOperand(Num: 1)) &&
20172 isa<ConstantSDNode>(Val: Elt3->getOperand(Num: 1)) &&
20173 Elt2->getConstantOperandVal(Num: 1) ==
20174 Elt3->getConstantOperandVal(Num: 1) &&
20175 Elt2->getOperand(Num: 0)->getOpcode() ==
20176 ISD::EXTRACT_VECTOR_ELT &&
20177 Elt3->getOperand(Num: 0)->getOpcode() ==
20178 ISD::EXTRACT_VECTOR_ELT &&
20179 // Constant index.
20180 isa<ConstantSDNode>(Val: Elt2->getOperand(Num: 0)->getOperand(Num: 1)) &&
20181 isa<ConstantSDNode>(Val: Elt3->getOperand(Num: 0)->getOperand(Num: 1)) &&
20182 Elt2->getOperand(Num: 0)->getOperand(Num: 0) ==
20183 Elt3->getOperand(Num: 0)->getOperand(Num: 0) &&
20184 Elt2->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 0 &&
20185 Elt3->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 1) {
20186 SDValue HighLanesSrcVec = Elt2->getOperand(Num: 0)->getOperand(Num: 0);
20187 HighLanes =
20188 DAG.getNode(Opcode: AArch64ISD::FCVTXN, DL, VT: MVT::v2f32, Operand: HighLanesSrcVec);
20189 }
20190 if (HighLanes) {
20191 SDValue DoubleToSingleSticky =
20192 DAG.getNode(Opcode: AArch64ISD::FCVTXN, DL, VT: MVT::v2f32, Operand: LowLanesSrcVec);
20193 SDValue Concat = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v4f32,
20194 N1: DoubleToSingleSticky, N2: HighLanes);
20195 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Concat,
20196 N2: Elt0->getOperand(Num: 1));
20197 }
20198 }
20199 }
20200 }
20201
20202 if (VT == MVT::v2f64) {
20203 SDValue Elt0 = N->getOperand(Num: 0), Elt1 = N->getOperand(Num: 1);
20204 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
20205 Elt1->getOpcode() == ISD::FP_EXTEND &&
20206 Elt0->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20207 Elt1->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20208 Elt0->getOperand(Num: 0)->getOperand(Num: 0) ==
20209 Elt1->getOperand(Num: 0)->getOperand(Num: 0) &&
20210 // Constant index.
20211 isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 0)->getOperand(Num: 1)) &&
20212 isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 0)->getOperand(Num: 1)) &&
20213 Elt0->getOperand(Num: 0)->getConstantOperandVal(Num: 1) + 1 ==
20214 Elt1->getOperand(Num: 0)->getConstantOperandVal(Num: 1) &&
20215 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
20216 // ResultType's known minimum vector length.
20217 Elt0->getOperand(Num: 0)->getConstantOperandVal(Num: 1) %
20218 VT.getVectorMinNumElements() ==
20219 0) {
20220 SDValue SrcVec = Elt0->getOperand(Num: 0)->getOperand(Num: 0);
20221 if (SrcVec.getValueType() == MVT::v4f16 ||
20222 SrcVec.getValueType() == MVT::v4bf16) {
20223 SDValue HalfToSingle =
20224 DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::v4f32, Operand: SrcVec);
20225 SDValue SubvectorIdx = Elt0->getOperand(Num: 0)->getOperand(Num: 1);
20226 SDValue Extract = DAG.getNode(
20227 Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: VT.changeVectorElementType(EltVT: MVT::f32),
20228 N1: HalfToSingle, N2: SubvectorIdx);
20229 return DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT, Operand: Extract);
20230 }
20231 }
20232 }
20233
20234 // A build vector of two extracted elements is equivalent to an
20235 // extract subvector where the inner vector is any-extended to the
20236 // extract_vector_elt VT.
20237 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
20238 // (extract_elt_iXX_to_i32 vec Idx+1))
20239 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
20240
20241 // For now, only consider the v2i32 case, which arises as a result of
20242 // legalization.
20243 if (VT != MVT::v2i32)
20244 return SDValue();
20245
20246 SDValue Elt0 = N->getOperand(Num: 0), Elt1 = N->getOperand(Num: 1);
20247 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
20248 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20249 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20250 // Constant index.
20251 isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 1)) &&
20252 isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 1)) &&
20253 // Both EXTRACT_VECTOR_ELT from same vector...
20254 Elt0->getOperand(Num: 0) == Elt1->getOperand(Num: 0) &&
20255 // ... and contiguous. First element's index +1 == second element's index.
20256 Elt0->getConstantOperandVal(Num: 1) + 1 == Elt1->getConstantOperandVal(Num: 1) &&
20257 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
20258 // ResultType's known minimum vector length.
20259 Elt0->getConstantOperandVal(Num: 1) % VT.getVectorMinNumElements() == 0) {
20260 SDValue VecToExtend = Elt0->getOperand(Num: 0);
20261 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(EltVT: MVT::i32);
20262 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: ExtVT))
20263 return SDValue();
20264
20265 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Val: Elt0->getConstantOperandVal(Num: 1), DL);
20266
20267 SDValue Ext = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: ExtVT, Operand: VecToExtend);
20268 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v2i32, N1: Ext,
20269 N2: SubvectorIdx);
20270 }
20271
20272 return SDValue();
20273}
20274
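// Fold truncate(dup(x)) into a dup of the narrower vector type for 64-bit
// fixed-length results, truncating the scalar operand first when narrowing
// i64 elements to i32.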
20275static SDValue performTruncateCombine(SDNode *N,
20276 SelectionDAG &DAG) {
20277 EVT VT = N->getValueType(ResNo: 0);
20278 SDValue N0 = N->getOperand(Num: 0);
20279 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
20280 N0.getOpcode() == AArch64ISD::DUP) {
20281 SDValue Op = N0.getOperand(i: 0);
20282 if (VT.getScalarType() == MVT::i32 &&
20283 N0.getOperand(i: 0).getValueType().getScalarType() == MVT::i64)
20284 Op = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SDLoc(N), VT: MVT::i32, Operand: Op);
20285 return DAG.getNode(Opcode: N0.getOpcode(), DL: SDLoc(N), VT, Operand: Op);
20286 }
20287
20288 return SDValue();
20289}
20290
20291 // Check whether a node is an extend or shift operand.
20292static bool isExtendOrShiftOperand(SDValue N) {
20293 unsigned Opcode = N.getOpcode();
20294 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
20295 EVT SrcVT;
20296 if (Opcode == ISD::SIGN_EXTEND_INREG)
20297 SrcVT = cast<VTSDNode>(Val: N.getOperand(i: 1))->getVT();
20298 else
20299 SrcVT = N.getOperand(i: 0).getValueType();
20300
20301 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
20302 } else if (Opcode == ISD::AND) {
20303 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1));
20304 if (!CSD)
20305 return false;
20306 uint64_t AndMask = CSD->getZExtValue();
20307 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
20308 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
20309 return isa<ConstantSDNode>(Val: N.getOperand(i: 1));
20310 }
20311
20312 return false;
20313}
20314
20315// (N - Y) + Z --> (Z - Y) + N
20316// when N is an extend or shift operand
20317static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
20318 SelectionDAG &DAG) {
20319 auto IsOneUseExtend = [](SDValue N) {
20320 return N.hasOneUse() && isExtendOrShiftOperand(N);
20321 };
20322
20323 // DAGCombiner will revert the combination when Z is a constant, causing an
20324 // infinite loop, so don't enable the combination when Z is constant.
20325 // Likewise, if Z is a one-use extend or shift, we can't do the optimization
20326 // either, as it would keep re-applying to its own result.
20327 if (isa<ConstantSDNode>(Val: Z) || IsOneUseExtend(Z))
20328 return SDValue();
20329
20330 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
20331 return SDValue();
20332
20333 SDValue Shift = SUB.getOperand(i: 0);
20334 if (!IsOneUseExtend(Shift))
20335 return SDValue();
20336
20337 SDLoc DL(N);
20338 EVT VT = N->getValueType(ResNo: 0);
20339
20340 SDValue Y = SUB.getOperand(i: 1);
20341 SDValue NewSub = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Z, N2: Y);
20342 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: NewSub, N2: Shift);
20343}
20344
20345static SDValue performAddCombineForShiftedOperands(SDNode *N,
20346 SelectionDAG &DAG) {
20347 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
20348 // commutative.
20349 if (N->getOpcode() != ISD::ADD)
20350 return SDValue();
20351
20352 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
20353 // shifted register is only available for i32 and i64.
20354 EVT VT = N->getValueType(ResNo: 0);
20355 if (VT != MVT::i32 && VT != MVT::i64)
20356 return SDValue();
20357
20358 SDLoc DL(N);
20359 SDValue LHS = N->getOperand(Num: 0);
20360 SDValue RHS = N->getOperand(Num: 1);
20361
20362 if (SDValue Val = performAddCombineSubShift(N, SUB: LHS, Z: RHS, DAG))
20363 return Val;
20364 if (SDValue Val = performAddCombineSubShift(N, SUB: RHS, Z: LHS, DAG))
20365 return Val;
20366
20367 uint64_t LHSImm = 0, RHSImm = 0;
20368 // If both operands are shifted by an immediate and the shift amount of one
20369 // operand is not greater than 4, swap LHS and RHS to put the operand with the
20370 // smaller shift amount on the RHS.
20371 //
20372 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
20373 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
20374 // with LSL (shift > 4). For the remaining processors, this is a no-op for
20375 // both performance and correctness.
20376 if (isOpcWithIntImmediate(N: LHS.getNode(), Opc: ISD::SHL, Imm&: LHSImm) &&
20377 isOpcWithIntImmediate(N: RHS.getNode(), Opc: ISD::SHL, Imm&: RHSImm) && LHSImm <= 4 &&
20378 RHSImm > 4 && LHS.hasOneUse())
20379 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: LHS);
20380
20381 return SDValue();
20382}
20383
20384 // The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2)).
20385// This reassociates it back to allow the creation of more mls instructions.
20386static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
20387 if (N->getOpcode() != ISD::SUB)
20388 return SDValue();
20389
20390 SDValue Add = N->getOperand(Num: 1);
20391 SDValue X = N->getOperand(Num: 0);
20392 if (Add.getOpcode() != ISD::ADD)
20393 return SDValue();
20394
20395 if (!Add.hasOneUse())
20396 return SDValue();
20397 if (DAG.isConstantIntBuildVectorOrConstantInt(N: peekThroughBitcasts(V: X)))
20398 return SDValue();
20399
20400 SDValue M1 = Add.getOperand(i: 0);
20401 SDValue M2 = Add.getOperand(i: 1);
20402 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
20403 M1.getOpcode() != AArch64ISD::UMULL)
20404 return SDValue();
20405 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
20406 M2.getOpcode() != AArch64ISD::UMULL)
20407 return SDValue();
20408
20409 EVT VT = N->getValueType(ResNo: 0);
20410 SDValue Sub = DAG.getNode(Opcode: ISD::SUB, DL: SDLoc(N), VT, N1: X, N2: M1);
20411 return DAG.getNode(Opcode: ISD::SUB, DL: SDLoc(N), VT, N1: Sub, N2: M2);
20412}
20413
20414// Combine into mla/mls.
20415// This works on the patterns of:
20416// add v1, (mul v2, v3)
20417// sub v1, (mul v2, v3)
20418// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
20419// It will transform the add/sub to a scalable version, so that we can
20420 // make use of SVE's MLA/MLS that will be generated for that pattern.
20421static SDValue
20422performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
20423 SelectionDAG &DAG = DCI.DAG;
20424 // Make sure that the types are legal
20425 if (!DCI.isAfterLegalizeDAG())
20426 return SDValue();
20427 // Before using SVE's features, check first if it's available.
20428 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
20429 return SDValue();
20430
20431 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
20432 return SDValue();
20433
20434 if (!N->getValueType(ResNo: 0).isFixedLengthVector())
20435 return SDValue();
20436
20437 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
20438 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
20439 return SDValue();
20440
20441 if (!cast<ConstantSDNode>(Val: Op1->getOperand(Num: 1))->isZero())
20442 return SDValue();
20443
20444 SDValue MulValue = Op1->getOperand(Num: 0);
20445 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
20446 return SDValue();
20447
20448 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
20449 return SDValue();
20450
20451 EVT ScalableVT = MulValue.getValueType();
20452 if (!ScalableVT.isScalableVector())
20453 return SDValue();
20454
20455 SDValue ScaledOp = convertToScalableVector(DAG, VT: ScalableVT, V: Op0);
20456 SDValue NewValue =
20457 DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT: ScalableVT, Ops: {ScaledOp, MulValue});
20458 return convertFromScalableVector(DAG, VT: N->getValueType(ResNo: 0), V: NewValue);
20459 };
20460
20461 if (SDValue res = performOpt(N->getOperand(Num: 0), N->getOperand(Num: 1)))
20462 return res;
20463 else if (N->getOpcode() == ISD::ADD)
20464 return performOpt(N->getOperand(Num: 1), N->getOperand(Num: 0));
20465
20466 return SDValue();
20467}
20468
20469 // Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
20470// help, for example, to produce ssra from sshr+add.
20471static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
20472 EVT VT = N->getValueType(ResNo: 0);
20473 if (VT != MVT::i64 ||
20474 DAG.getTargetLoweringInfo().isOperationExpand(Op: N->getOpcode(), VT: MVT::v1i64))
20475 return SDValue();
20476 SDValue Op0 = N->getOperand(Num: 0);
20477 SDValue Op1 = N->getOperand(Num: 1);
20478
20479 // At least one of the operands should be an extract, and the other should be
20480 // something that is easy to convert to v1i64 type (in this case a load).
20481 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
20482 Op0.getOpcode() != ISD::LOAD)
20483 return SDValue();
20484 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
20485 Op1.getOpcode() != ISD::LOAD)
20486 return SDValue();
20487
20488 SDLoc DL(N);
20489 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20490 Op0.getOperand(i: 0).getValueType() == MVT::v1i64) {
20491 Op0 = Op0.getOperand(i: 0);
20492 Op1 = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: MVT::v1i64, Operand: Op1);
20493 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20494 Op1.getOperand(i: 0).getValueType() == MVT::v1i64) {
20495 Op0 = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: MVT::v1i64, Operand: Op0);
20496 Op1 = Op1.getOperand(i: 0);
20497 } else
20498 return SDValue();
20499
20500 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i64,
20501 N1: DAG.getNode(Opcode: N->getOpcode(), DL, VT: MVT::v1i64, N1: Op0, N2: Op1),
20502 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
20503}
20504
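// Populate Loads with the simple load(s) that make up B: either a single
// load, a BUILD_VECTOR/CONCAT_VECTORS whose operands are all loads, or a
// specific shuffle-of-concats tree produced when lowering IR shuffles of
// loads. Returns false if B does not match one of these forms.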
20505static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
20506 SDValue BV = peekThroughOneUseBitcasts(V: B);
20507 if (!BV->hasOneUse())
20508 return false;
20509 if (auto *Ld = dyn_cast<LoadSDNode>(Val&: BV)) {
20510 if (!Ld || !Ld->isSimple())
20511 return false;
20512 Loads.push_back(Elt: Ld);
20513 return true;
20514 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
20515 BV.getOpcode() == ISD::CONCAT_VECTORS) {
20516 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
20517 auto *Ld = dyn_cast<LoadSDNode>(Val: BV.getOperand(i: Op));
20518 if (!Ld || !Ld->isSimple() || !BV.getOperand(i: Op).hasOneUse())
20519 return false;
20520 Loads.push_back(Elt: Ld);
20521 }
20522 return true;
20523 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
20524 // Try to find a tree of shuffles and concats from how IR shuffles of loads
20525 // are lowered. Note that this only comes up because we do not always visit
20526 // operands before uses. After that is fixed this can be removed and in the
20527 // meantime this is fairly specific to the lowering we expect from IR.
20528 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
20529 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
20530 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
20531 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
20532 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
20533 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
20534 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
20535 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
20536 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
20537 if (B.getOperand(i: 0).getOpcode() != ISD::VECTOR_SHUFFLE ||
20538 B.getOperand(i: 0).getOperand(i: 0).getOpcode() != ISD::CONCAT_VECTORS ||
20539 B.getOperand(i: 0).getOperand(i: 1).getOpcode() != ISD::CONCAT_VECTORS ||
20540 B.getOperand(i: 1).getOpcode() != ISD::CONCAT_VECTORS ||
20541 B.getOperand(i: 1).getNumOperands() != 4)
20542 return false;
20543 auto SV1 = cast<ShuffleVectorSDNode>(Val&: B);
20544 auto SV2 = cast<ShuffleVectorSDNode>(Val: B.getOperand(i: 0));
20545 int NumElts = B.getValueType().getVectorNumElements();
20546 int NumSubElts = NumElts / 4;
20547 for (int I = 0; I < NumSubElts; I++) {
20548 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
20549 if (SV1->getMaskElt(Idx: I) != I ||
20550 SV1->getMaskElt(Idx: I + NumSubElts) != I + NumSubElts ||
20551 SV1->getMaskElt(Idx: I + NumSubElts * 2) != I + NumSubElts * 2 ||
20552 SV1->getMaskElt(Idx: I + NumSubElts * 3) != I + NumElts)
20553 return false;
20554 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
20555 if (SV2->getMaskElt(Idx: I) != I ||
20556 SV2->getMaskElt(Idx: I + NumSubElts) != I + NumSubElts ||
20557 SV2->getMaskElt(Idx: I + NumSubElts * 2) != I + NumElts)
20558 return false;
20559 }
20560 auto *Ld0 = dyn_cast<LoadSDNode>(Val: SV2->getOperand(Num: 0).getOperand(i: 0));
20561 auto *Ld1 = dyn_cast<LoadSDNode>(Val: SV2->getOperand(Num: 0).getOperand(i: 1));
20562 auto *Ld2 = dyn_cast<LoadSDNode>(Val: SV2->getOperand(Num: 1).getOperand(i: 0));
20563 auto *Ld3 = dyn_cast<LoadSDNode>(Val: B.getOperand(i: 1).getOperand(i: 0));
20564 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
20565 !Ld2->isSimple() || !Ld3->isSimple())
20566 return false;
20567 Loads.push_back(Elt: Ld0);
20568 Loads.push_back(Elt: Ld1);
20569 Loads.push_back(Elt: Ld2);
20570 Loads.push_back(Elt: Ld3);
20571 return true;
20572 }
20573 return false;
20574}
20575
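// Returns true if Op0 and Op1 are identical trees of (possibly extended)
// add/sub operations whose leaf loads pair up, with each load in Op1 directly
// following the corresponding load in Op0 in memory. NumSubLoads is set to the
// number of leaf loads per operand.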
20576static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
20577 SelectionDAG &DAG,
20578 unsigned &NumSubLoads) {
20579 if (!Op0.hasOneUse() || !Op1.hasOneUse())
20580 return false;
20581
20582 SmallVector<LoadSDNode *> Loads0, Loads1;
20583 if (isLoadOrMultipleLoads(B: Op0, Loads&: Loads0) &&
20584 isLoadOrMultipleLoads(B: Op1, Loads&: Loads1)) {
20585 if (NumSubLoads && Loads0.size() != NumSubLoads)
20586 return false;
20587 NumSubLoads = Loads0.size();
20588 return Loads0.size() == Loads1.size() &&
20589 all_of(Range: zip(t&: Loads0, u&: Loads1), P: [&DAG](auto L) {
20590 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
20591 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
20592 DAG.areNonVolatileConsecutiveLoads(LD: get<1>(L), Base: get<0>(L),
20593 Bytes: Size / 8, Dist: 1);
20594 });
20595 }
20596
20597 if (Op0.getOpcode() != Op1.getOpcode())
20598 return false;
20599
20600 switch (Op0.getOpcode()) {
20601 case ISD::ADD:
20602 case ISD::SUB:
20603 return areLoadedOffsetButOtherwiseSame(Op0: Op0.getOperand(i: 0), Op1: Op1.getOperand(i: 0),
20604 DAG, NumSubLoads) &&
20605 areLoadedOffsetButOtherwiseSame(Op0: Op0.getOperand(i: 1), Op1: Op1.getOperand(i: 1),
20606 DAG, NumSubLoads);
20607 case ISD::SIGN_EXTEND:
20608 case ISD::ANY_EXTEND:
20609 case ISD::ZERO_EXTEND:
20610 EVT XVT = Op0.getOperand(i: 0).getValueType();
20611 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
20612 XVT.getScalarSizeInBits() != 32)
20613 return false;
20614 return areLoadedOffsetButOtherwiseSame(Op0: Op0.getOperand(i: 0), Op1: Op1.getOperand(i: 0),
20615 DAG, NumSubLoads);
20616 }
20617 return false;
20618}
20619
20620 // This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
20621 // into a single load of twice the size, from which we extract the bottom and
20622 // top parts so that the shl can use a shll2 instruction. The two loads in that
20623// example can also be larger trees of instructions, which are identical except
20624// for the leaves which are all loads offset from the LHS, including
20625// buildvectors of multiple loads. For example the RHS tree could be
20626// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4)))
20627// Whilst it can be common for the larger loads to replace LDP instructions
20628 // (which doesn't gain anything on its own), the larger loads can help create
20629// more efficient code, and in buildvectors prevent the need for ld1 lane
20630// inserts which can be slower than normal loads.
20631static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
20632 EVT VT = N->getValueType(ResNo: 0);
20633 if (!VT.isFixedLengthVector() ||
20634 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
20635 VT.getScalarSizeInBits() != 64))
20636 return SDValue();
20637
20638 SDValue Other = N->getOperand(Num: 0);
20639 SDValue Shift = N->getOperand(Num: 1);
20640 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
20641 std::swap(a&: Shift, b&: Other);
20642 APInt ShiftAmt;
20643 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
20644 !ISD::isConstantSplatVector(N: Shift.getOperand(i: 1).getNode(), SplatValue&: ShiftAmt))
20645 return SDValue();
20646
20647 if (!ISD::isExtOpcode(Opcode: Shift.getOperand(i: 0).getOpcode()) ||
20648 !ISD::isExtOpcode(Opcode: Other.getOpcode()) ||
20649 Shift.getOperand(i: 0).getOperand(i: 0).getValueType() !=
20650 Other.getOperand(i: 0).getValueType() ||
20651 !Other.hasOneUse() || !Shift.getOperand(i: 0).hasOneUse())
20652 return SDValue();
20653
20654 SDValue Op0 = Other.getOperand(i: 0);
20655 SDValue Op1 = Shift.getOperand(i: 0).getOperand(i: 0);
20656
20657 unsigned NumSubLoads = 0;
20658 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
20659 return SDValue();
20660
20661 // Attempt to rule out some unprofitable cases using heuristics (some working
20662 // around suboptimal code generation), notably if the extend would not be able
20663 // to use ushll2 instructions because the types are not large enough. Otherwise
20664 // zips will need to be created, which can increase the instruction count.
20665 unsigned NumElts = Op0.getValueType().getVectorNumElements();
20666 unsigned NumSubElts = NumElts / NumSubLoads;
20667 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
20668 (Other.getOpcode() != Shift.getOperand(i: 0).getOpcode() &&
20669 Op0.getValueType().getSizeInBits() < 128 &&
20670 !DAG.getTargetLoweringInfo().isTypeLegal(VT: Op0.getValueType())))
20671 return SDValue();
20672
20673 // Recreate the tree with the new combined loads.
20674 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
20675 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
20676 EVT DVT =
20677 Op0.getValueType().getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
20678
20679 SmallVector<LoadSDNode *> Loads0, Loads1;
20680 if (isLoadOrMultipleLoads(B: Op0, Loads&: Loads0) &&
20681 isLoadOrMultipleLoads(B: Op1, Loads&: Loads1)) {
20682 EVT LoadVT = EVT::getVectorVT(
20683 Context&: *DAG.getContext(), VT: Op0.getValueType().getScalarType(),
20684 NumElements: Op0.getValueType().getVectorNumElements() / Loads0.size());
20685 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
20686
20687 SmallVector<SDValue> NewLoads;
20688 for (const auto &[L0, L1] : zip(t&: Loads0, u&: Loads1)) {
20689 SDValue Load = DAG.getLoad(VT: DLoadVT, dl: SDLoc(L0), Chain: L0->getChain(),
20690 Ptr: L0->getBasePtr(), PtrInfo: L0->getPointerInfo(),
20691 Alignment: L0->getOriginalAlign());
20692 DAG.makeEquivalentMemoryOrdering(OldLoad: L0, NewMemOp: Load.getValue(R: 1));
20693 DAG.makeEquivalentMemoryOrdering(OldLoad: L1, NewMemOp: Load.getValue(R: 1));
20694 NewLoads.push_back(Elt: Load);
20695 }
20696 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op0), VT: DVT, Ops: NewLoads);
20697 }
20698
20699 SmallVector<SDValue> Ops;
20700 for (const auto &[O0, O1] : zip(t: Op0->op_values(), u: Op1->op_values()))
20701 Ops.push_back(Elt: GenCombinedTree(O0, O1, DAG));
20702 return DAG.getNode(Opcode: Op0.getOpcode(), DL: SDLoc(Op0), VT: DVT, Ops);
20703 };
20704 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
20705
20706 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
20707 int Hi = NumSubElts, Lo = 0;
20708 for (unsigned i = 0; i < NumSubLoads; i++) {
20709 for (unsigned j = 0; j < NumSubElts; j++) {
20710 LowMask[i * NumSubElts + j] = Lo++;
20711 HighMask[i * NumSubElts + j] = Hi++;
20712 }
20713 Lo += NumSubElts;
20714 Hi += NumSubElts;
20715 }
20716 SDLoc DL(N);
20717 SDValue Ext0, Ext1;
20718 // Extract the top and bottom lanes, then extend the result. Alternatively,
20719 // extend the result first and then extract the lanes if the two operands'
20720 // extend opcodes match, as that produces slightly smaller code.
20721 if (Other.getOpcode() != Shift.getOperand(i: 0).getOpcode()) {
20722 SDValue SubL = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: Op0.getValueType(),
20723 N1: NewOp, N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
20724 SDValue SubH =
20725 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: Op0.getValueType(), N1: NewOp,
20726 N2: DAG.getConstant(Val: NumSubElts * NumSubLoads, DL, VT: MVT::i64));
20727 SDValue Extr0 =
20728 DAG.getVectorShuffle(VT: Op0.getValueType(), dl: DL, N1: SubL, N2: SubH, Mask: LowMask);
20729 SDValue Extr1 =
20730 DAG.getVectorShuffle(VT: Op0.getValueType(), dl: DL, N1: SubL, N2: SubH, Mask: HighMask);
20731 Ext0 = DAG.getNode(Opcode: Other.getOpcode(), DL, VT, Operand: Extr0);
20732 Ext1 = DAG.getNode(Opcode: Shift.getOperand(i: 0).getOpcode(), DL, VT, Operand: Extr1);
20733 } else {
20734 EVT DVT = VT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
20735 SDValue Ext = DAG.getNode(Opcode: Other.getOpcode(), DL, VT: DVT, Operand: NewOp);
20736 SDValue SubL = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Ext,
20737 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
20738 SDValue SubH =
20739 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Ext,
20740 N2: DAG.getConstant(Val: NumSubElts * NumSubLoads, DL, VT: MVT::i64));
20741 Ext0 = DAG.getVectorShuffle(VT, dl: DL, N1: SubL, N2: SubH, Mask: LowMask);
20742 Ext1 = DAG.getVectorShuffle(VT, dl: DL, N1: SubL, N2: SubH, Mask: HighMask);
20743 }
20744 SDValue NShift =
20745 DAG.getNode(Opcode: Shift.getOpcode(), DL, VT, N1: Ext1, N2: Shift.getOperand(i: 1));
20746 return DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1: Ext0, N2: NShift);
20747}
20748
20749static SDValue performAddSubCombine(SDNode *N,
20750 TargetLowering::DAGCombinerInfo &DCI) {
20751 // Try to change sum of two reductions.
20752 if (SDValue Val = performAddUADDVCombine(N, DAG&: DCI.DAG))
20753 return Val;
20754 if (SDValue Val = performAddDotCombine(N, DAG&: DCI.DAG))
20755 return Val;
20756 if (SDValue Val = performAddCSelIntoCSinc(N, DAG&: DCI.DAG))
20757 return Val;
20758 if (SDValue Val = performNegCSelCombine(N, DAG&: DCI.DAG))
20759 return Val;
20760 if (SDValue Val = performVectorExtCombine(N, DAG&: DCI.DAG))
20761 return Val;
20762 if (SDValue Val = performAddCombineForShiftedOperands(N, DAG&: DCI.DAG))
20763 return Val;
20764 if (SDValue Val = performSubAddMULCombine(N, DAG&: DCI.DAG))
20765 return Val;
20766 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
20767 return Val;
20768 if (SDValue Val = performAddSubIntoVectorOp(N, DAG&: DCI.DAG))
20769 return Val;
20770
20771 if (SDValue Val = performExtBinopLoadFold(N, DAG&: DCI.DAG))
20772 return Val;
20773
20774 return performAddSubLongCombine(N, DCI);
20775}
20776
20777// Massage DAGs which we can use the high-half "long" operations on into
20778// something isel will recognize better. E.g.
20779//
20780// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
20781// (aarch64_neon_umull (extract_high (v2i64 vec)))
20782// (extract_high (v2i64 (dup128 scalar)))))
20783//
20784static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
20785 TargetLowering::DAGCombinerInfo &DCI,
20786 SelectionDAG &DAG) {
20787 if (DCI.isBeforeLegalizeOps())
20788 return SDValue();
20789
20790 SDValue LHS = N->getOperand(Num: (IID == Intrinsic::not_intrinsic) ? 0 : 1);
20791 SDValue RHS = N->getOperand(Num: (IID == Intrinsic::not_intrinsic) ? 1 : 2);
20792 assert(LHS.getValueType().is64BitVector() &&
20793 RHS.getValueType().is64BitVector() &&
20794 "unexpected shape for long operation");
20795
20796 // Either node could be a DUP, but it's not worth doing both of them (you'd
20797 // just as well use the non-high version) so look for a corresponding extract
20798 // operation on the other "wing".
20799 if (isEssentiallyExtractHighSubvector(N: LHS)) {
20800 RHS = tryExtendDUPToExtractHigh(N: RHS, DAG);
20801 if (!RHS.getNode())
20802 return SDValue();
20803 } else if (isEssentiallyExtractHighSubvector(N: RHS)) {
20804 LHS = tryExtendDUPToExtractHigh(N: LHS, DAG);
20805 if (!LHS.getNode())
20806 return SDValue();
20807 } else
20808 return SDValue();
20809
20810 if (IID == Intrinsic::not_intrinsic)
20811 return DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: LHS, N2: RHS);
20812
20813 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20814 N1: N->getOperand(Num: 0), N2: LHS, N3: RHS);
20815}
20816
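// Convert NEON shift intrinsics with a constant (splat) shift amount into the
// corresponding target-specific immediate-shift nodes (SQSHL_I, UQSHL_I,
// SRSHR_I, URSHR_I, SQSHLU_I, VSHL, or VASHR/VLSHR for negative sshl/ushl
// amounts), returning SDValue() when the amount is not constant or out of
// range.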
20817static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
20818 MVT ElemTy = N->getSimpleValueType(ResNo: 0).getScalarType();
20819 unsigned ElemBits = ElemTy.getSizeInBits();
20820
20821 int64_t ShiftAmount;
20822 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: N->getOperand(Num: 2))) {
20823 APInt SplatValue, SplatUndef;
20824 unsigned SplatBitSize;
20825 bool HasAnyUndefs;
20826 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
20827 HasAnyUndefs, MinSplatBits: ElemBits) ||
20828 SplatBitSize != ElemBits)
20829 return SDValue();
20830
20831 ShiftAmount = SplatValue.getSExtValue();
20832 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 2))) {
20833 ShiftAmount = CVN->getSExtValue();
20834 } else
20835 return SDValue();
20836
20837 // If the shift amount is zero, remove the shift intrinsic.
20838 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
20839 return N->getOperand(Num: 1);
20840
20841 unsigned Opcode;
20842 bool IsRightShift;
20843 switch (IID) {
20844 default:
20845 llvm_unreachable("Unknown shift intrinsic");
20846 case Intrinsic::aarch64_neon_sqshl:
20847 Opcode = AArch64ISD::SQSHL_I;
20848 IsRightShift = false;
20849 break;
20850 case Intrinsic::aarch64_neon_uqshl:
20851 Opcode = AArch64ISD::UQSHL_I;
20852 IsRightShift = false;
20853 break;
20854 case Intrinsic::aarch64_neon_srshl:
20855 Opcode = AArch64ISD::SRSHR_I;
20856 IsRightShift = true;
20857 break;
20858 case Intrinsic::aarch64_neon_urshl:
20859 Opcode = AArch64ISD::URSHR_I;
20860 IsRightShift = true;
20861 break;
20862 case Intrinsic::aarch64_neon_sqshlu:
20863 Opcode = AArch64ISD::SQSHLU_I;
20864 IsRightShift = false;
20865 break;
20866 case Intrinsic::aarch64_neon_sshl:
20867 case Intrinsic::aarch64_neon_ushl:
20868 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
20869 // left shift for positive shift amounts. For negative shifts we can use a
20870 // VASHR/VLSHR as appropriate.
20871 if (ShiftAmount < 0) {
20872 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
20873 : AArch64ISD::VLSHR;
20874 ShiftAmount = -ShiftAmount;
20875 } else
20876 Opcode = AArch64ISD::VSHL;
20877 IsRightShift = false;
20878 break;
20879 }
20880
20881 EVT VT = N->getValueType(ResNo: 0);
20882 SDValue Op = N->getOperand(Num: 1);
20883 SDLoc dl(N);
20884 if (VT == MVT::i64) {
20885 Op = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: MVT::v1i64, Operand: Op);
20886 VT = MVT::v1i64;
20887 }
20888
20889 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
20890 Op = DAG.getNode(Opcode, DL: dl, VT, N1: Op,
20891 N2: DAG.getConstant(Val: -ShiftAmount, DL: dl, VT: MVT::i32));
20892 if (N->getValueType(ResNo: 0) == MVT::i64)
20893 Op = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: MVT::i64, N1: Op,
20894 N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64));
20895 return Op;
20896 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
20897 Op = DAG.getNode(Opcode, DL: dl, VT, N1: Op,
20898 N2: DAG.getConstant(Val: ShiftAmount, DL: dl, VT: MVT::i32));
20899 if (N->getValueType(ResNo: 0) == MVT::i64)
20900 Op = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: MVT::i64, N1: Op,
20901 N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64));
20902 return Op;
20903 }
20904
20905 return SDValue();
20906}
20907
20908// The CRC32[BH] instructions ignore the high bits of their data operand. Since
20909// the intrinsics must be legal and take an i32, this means there's almost
20910// certainly going to be a zext in the DAG which we can eliminate.
20911static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
20912 SDValue AndN = N->getOperand(Num: 2);
20913 if (AndN.getOpcode() != ISD::AND)
20914 return SDValue();
20915
20916 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Val: AndN.getOperand(i: 1));
20917 if (!CMask || CMask->getZExtValue() != Mask)
20918 return SDValue();
20919
20920 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SDLoc(N), VT: MVT::i32,
20921 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1), N3: AndN.getOperand(i: 0));
20922}
20923
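// Lower an across-lanes reduction intrinsic to the corresponding AArch64ISD
// node and extract lane 0 of its result.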
20924static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
20925 SelectionDAG &DAG) {
20926 SDLoc dl(N);
20927 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: N->getValueType(ResNo: 0),
20928 N1: DAG.getNode(Opcode: Opc, DL: dl,
20929 VT: N->getOperand(Num: 1).getSimpleValueType(),
20930 Operand: N->getOperand(Num: 1)),
20931 N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64));
20932}
20933
20934static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
20935 SDLoc DL(N);
20936 SDValue Op1 = N->getOperand(Num: 1);
20937 SDValue Op2 = N->getOperand(Num: 2);
20938 EVT ScalarTy = Op2.getValueType();
20939 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20940 ScalarTy = MVT::i32;
20941
20942 // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
20943 SDValue StepVector = DAG.getStepVector(DL, ResVT: N->getValueType(ResNo: 0));
20944 SDValue Step = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: N->getValueType(ResNo: 0), Operand: Op2);
20945 SDValue Mul = DAG.getNode(Opcode: ISD::MUL, DL, VT: N->getValueType(ResNo: 0), N1: StepVector, N2: Step);
20946 SDValue Base = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: N->getValueType(ResNo: 0), Operand: Op1);
20947 return DAG.getNode(Opcode: ISD::ADD, DL, VT: N->getValueType(ResNo: 0), N1: Mul, N2: Base);
20948}
20949
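// Lower a predicated SVE DUP intrinsic to DUP_MERGE_PASSTHRU, any-extending
// i8/i16 scalar operands to i32 first.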
20950static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
20951 SDLoc dl(N);
20952 SDValue Scalar = N->getOperand(Num: 3);
20953 EVT ScalarTy = Scalar.getValueType();
20954
20955 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20956 Scalar = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: MVT::i32, Operand: Scalar);
20957
20958 SDValue Passthru = N->getOperand(Num: 1);
20959 SDValue Pred = N->getOperand(Num: 2);
20960 return DAG.getNode(Opcode: AArch64ISD::DUP_MERGE_PASSTHRU, DL: dl, VT: N->getValueType(ResNo: 0),
20961 N1: Pred, N2: Scalar, N3: Passthru);
20962}
20963
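// Lower an SVE EXT intrinsic by bitcasting the vector operands to bytes,
// scaling the index by the element size, and emitting AArch64ISD::EXT in the
// byte domain before bitcasting back to the original type.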
20964static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
20965 SDLoc dl(N);
20966 LLVMContext &Ctx = *DAG.getContext();
20967 EVT VT = N->getValueType(ResNo: 0);
20968
20969 assert(VT.isScalableVector() && "Expected a scalable vector.");
20970
20971 // Current lowering only supports the SVE-ACLE types.
20972 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
20973 return SDValue();
20974
20975 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
20976 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
20977 EVT ByteVT =
20978 EVT::getVectorVT(Context&: Ctx, VT: MVT::i8, EC: ElementCount::getScalable(MinVal: ByteSize));
20979
20980 // Convert everything to the domain of EXT (i.e. bytes).
20981 SDValue Op0 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ByteVT, Operand: N->getOperand(Num: 1));
20982 SDValue Op1 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ByteVT, Operand: N->getOperand(Num: 2));
20983 SDValue Op2 = DAG.getNode(Opcode: ISD::MUL, DL: dl, VT: MVT::i32, N1: N->getOperand(Num: 3),
20984 N2: DAG.getConstant(Val: ElemSize, DL: dl, VT: MVT::i32));
20985
20986 SDValue EXT = DAG.getNode(Opcode: AArch64ISD::EXT, DL: dl, VT: ByteVT, N1: Op0, N2: Op1, N3: Op2);
20987 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: EXT);
20988}
20989
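// Convert an SVE wide-compare intrinsic whose splatted comparator fits in the
// instruction's immediate range into a SETCC_MERGE_ZERO against a splat of
// that immediate.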
20990static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
20991 TargetLowering::DAGCombinerInfo &DCI,
20992 SelectionDAG &DAG) {
20993 if (DCI.isBeforeLegalize())
20994 return SDValue();
20995
20996 SDValue Comparator = N->getOperand(Num: 3);
20997 if (Comparator.getOpcode() == AArch64ISD::DUP ||
20998 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
20999 unsigned IID = getIntrinsicID(N);
21000 EVT VT = N->getValueType(ResNo: 0);
21001 EVT CmpVT = N->getOperand(Num: 2).getValueType();
21002 SDValue Pred = N->getOperand(Num: 1);
21003 SDValue Imm;
21004 SDLoc DL(N);
21005
21006 switch (IID) {
21007 default:
21008 llvm_unreachable("Called with wrong intrinsic!");
21009 break;
21010
21011 // Signed comparisons
21012 case Intrinsic::aarch64_sve_cmpeq_wide:
21013 case Intrinsic::aarch64_sve_cmpne_wide:
21014 case Intrinsic::aarch64_sve_cmpge_wide:
21015 case Intrinsic::aarch64_sve_cmpgt_wide:
21016 case Intrinsic::aarch64_sve_cmplt_wide:
21017 case Intrinsic::aarch64_sve_cmple_wide: {
21018 if (auto *CN = dyn_cast<ConstantSDNode>(Val: Comparator.getOperand(i: 0))) {
21019 int64_t ImmVal = CN->getSExtValue();
21020 if (ImmVal >= -16 && ImmVal <= 15)
21021 Imm = DAG.getConstant(Val: ImmVal, DL, VT: MVT::i32);
21022 else
21023 return SDValue();
21024 }
21025 break;
21026 }
21027 // Unsigned comparisons
21028 case Intrinsic::aarch64_sve_cmphs_wide:
21029 case Intrinsic::aarch64_sve_cmphi_wide:
21030 case Intrinsic::aarch64_sve_cmplo_wide:
21031 case Intrinsic::aarch64_sve_cmpls_wide: {
21032 if (auto *CN = dyn_cast<ConstantSDNode>(Val: Comparator.getOperand(i: 0))) {
21033 uint64_t ImmVal = CN->getZExtValue();
21034 if (ImmVal <= 127)
21035 Imm = DAG.getConstant(Val: ImmVal, DL, VT: MVT::i32);
21036 else
21037 return SDValue();
21038 }
21039 break;
21040 }
21041 }
21042
21043 if (!Imm)
21044 return SDValue();
21045
21046 SDValue Splat = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: CmpVT, Operand: Imm);
21047 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL, VT, N1: Pred,
21048 N2: N->getOperand(Num: 2), N3: Splat, N4: DAG.getCondCode(Cond: CC));
21049 }
21050
21051 return SDValue();
21052}
21053
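// Emit a PTEST (or PTEST_ANY) of Op governed by Pg and materialize the
// requested condition as a zero/one value of type VT via CSEL.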
21054static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
21055 AArch64CC::CondCode Cond) {
21056 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21057
21058 SDLoc DL(Op);
21059 assert(Op.getValueType().isScalableVector() &&
21060 TLI.isTypeLegal(Op.getValueType()) &&
21061 "Expected legal scalable vector type!");
21062 assert(Op.getValueType() == Pg.getValueType() &&
21063 "Expected same type for PTEST operands");
21064
21065 // Ensure target specific opcodes are using legal type.
21066 EVT OutVT = TLI.getTypeToTransformTo(Context&: *DAG.getContext(), VT);
21067 SDValue TVal = DAG.getConstant(Val: 1, DL, VT: OutVT);
21068 SDValue FVal = DAG.getConstant(Val: 0, DL, VT: OutVT);
21069
21070 // Ensure operands have type nxv16i1.
21071 if (Op.getValueType() != MVT::nxv16i1) {
21072 if ((Cond == AArch64CC::ANY_ACTIVE || Cond == AArch64CC::NONE_ACTIVE) &&
21073 isZeroingInactiveLanes(Op))
21074 Pg = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: MVT::nxv16i1, Operand: Pg);
21075 else
21076 Pg = getSVEPredicateBitCast(VT: MVT::nxv16i1, Op: Pg, DAG);
21077 Op = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: MVT::nxv16i1, Operand: Op);
21078 }
21079
21080 // Set condition code (CC) flags.
21081 SDValue Test = DAG.getNode(
21082 Opcode: Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
21083 DL, VT: MVT::Other, N1: Pg, N2: Op);
21084
21085 // Convert CC to integer based on requested condition.
21086 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
21087 SDValue CC = DAG.getConstant(Val: getInvertedCondCode(Code: Cond), DL, VT: MVT::i32);
21088 SDValue Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: OutVT, N1: FVal, N2: TVal, N3: CC, N4: Test);
21089 return DAG.getZExtOrTrunc(Op: Res, DL, VT);
21090}
21091
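// Lower an SVE integer reduction intrinsic to the given opcode and extract the
// scalar result from lane 0 of the packed result vector.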
21092static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
21093 SelectionDAG &DAG) {
21094 SDLoc DL(N);
21095
21096 SDValue Pred = N->getOperand(Num: 1);
21097 SDValue VecToReduce = N->getOperand(Num: 2);
21098
21099 // NOTE: The integer reduction's result type is not always linked to the
21100 // operand's element type, so we construct it from the intrinsic's result type.
21101 EVT ReduceVT = getPackedSVEVectorVT(VT: N->getValueType(ResNo: 0));
21102 SDValue Reduce = DAG.getNode(Opcode: Opc, DL, VT: ReduceVT, N1: Pred, N2: VecToReduce);
21103
21104 // SVE reductions set the whole vector register with the first element
21105 // containing the reduction result, which we'll now extract.
21106 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
21107 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: N->getValueType(ResNo: 0), N1: Reduce,
21108 N2: Zero);
21109}
21110
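// Lower an SVE floating-point reduction intrinsic to the given opcode and
// extract the scalar result from lane 0 of the result vector.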
21111static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
21112 SelectionDAG &DAG) {
21113 SDLoc DL(N);
21114
21115 SDValue Pred = N->getOperand(Num: 1);
21116 SDValue VecToReduce = N->getOperand(Num: 2);
21117
21118 EVT ReduceVT = VecToReduce.getValueType();
21119 SDValue Reduce = DAG.getNode(Opcode: Opc, DL, VT: ReduceVT, N1: Pred, N2: VecToReduce);
21120
21121 // SVE reductions set the whole vector register with the first element
21122 // containing the reduction result, which we'll now extract.
21123 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
21124 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: N->getValueType(ResNo: 0), N1: Reduce,
21125 N2: Zero);
21126}
21127
21128static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
21129 SelectionDAG &DAG) {
21130 SDLoc DL(N);
21131
21132 SDValue Pred = N->getOperand(Num: 1);
21133 SDValue InitVal = N->getOperand(Num: 2);
21134 SDValue VecToReduce = N->getOperand(Num: 3);
21135 EVT ReduceVT = VecToReduce.getValueType();
21136
21137 // Ordered reductions use the first lane of the result vector as the
21138 // reduction's initial value.
21139 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
21140 InitVal = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ReduceVT,
21141 N1: DAG.getUNDEF(VT: ReduceVT), N2: InitVal, N3: Zero);
21142
21143 SDValue Reduce = DAG.getNode(Opcode: Opc, DL, VT: ReduceVT, N1: Pred, N2: InitVal, N3: VecToReduce);
21144
21145 // SVE reductions set the whole vector register with the first element
21146 // containing the reduction result, which we'll now extract.
21147 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: N->getValueType(ResNo: 0), N1: Reduce,
21148 N2: Zero);
21149}
21150
21151// If a merged operation has no inactive lanes we can relax it to a predicated
21152// or unpredicated operation, which potentially allows better isel (perhaps
21153// using immediate forms) or relaxing register reuse requirements.
21154static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
21155 SelectionDAG &DAG, bool UnpredOp = false,
21156 bool SwapOperands = false) {
21157 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
21158 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
21159 SDValue Pg = N->getOperand(Num: 1);
21160 SDValue Op1 = N->getOperand(Num: SwapOperands ? 3 : 2);
21161 SDValue Op2 = N->getOperand(Num: SwapOperands ? 2 : 3);
21162
21163 // ISD way to specify an all active predicate.
21164 if (isAllActivePredicate(DAG, N: Pg)) {
21165 if (UnpredOp)
21166 return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: Op1, N2: Op2);
21167
21168 return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: Pg, N2: Op1, N3: Op2);
21169 }
21170
21171 // FUTURE: SplatVector(true)
21172 return SDValue();
21173}
21174
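// Combine a whilelo whose only users are its low and high halves into the
// SVE2p1 whilelo_x2 pair form. Illustrative shape (not taken from a
// particular test): an nxv16i1 whilelo used only by extract_subvector at
// indices 0 and 8 is rewritten as aarch64_sve_whilelo_x2, and each half-width
// user is redirected (via CombineTo) to one of the pair's two results.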
21175static SDValue tryCombineWhileLo(SDNode *N,
21176 TargetLowering::DAGCombinerInfo &DCI,
21177 const AArch64Subtarget *Subtarget) {
21178 if (DCI.isBeforeLegalize())
21179 return SDValue();
21180
21181 if (!Subtarget->hasSVE2p1())
21182 return SDValue();
21183
21184 if (!N->hasNUsesOfValue(NUses: 2, Value: 0))
21185 return SDValue();
21186
21187 const uint64_t HalfSize = N->getValueType(ResNo: 0).getVectorMinNumElements() / 2;
21188 if (HalfSize < 2)
21189 return SDValue();
21190
21191 auto It = N->use_begin();
21192 SDNode *Lo = *It++;
21193 SDNode *Hi = *It;
21194
21195 if (Lo->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
21196 Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR)
21197 return SDValue();
21198
21199 uint64_t OffLo = Lo->getConstantOperandVal(Num: 1);
21200 uint64_t OffHi = Hi->getConstantOperandVal(Num: 1);
21201
21202 if (OffLo > OffHi) {
21203 std::swap(a&: Lo, b&: Hi);
21204 std::swap(a&: OffLo, b&: OffHi);
21205 }
21206
21207 if (OffLo != 0 || OffHi != HalfSize)
21208 return SDValue();
21209
21210 EVT HalfVec = Lo->getValueType(ResNo: 0);
21211 if (HalfVec != Hi->getValueType(ResNo: 0) ||
21212 HalfVec.getVectorElementCount() != ElementCount::getScalable(MinVal: HalfSize))
21213 return SDValue();
21214
21215 SelectionDAG &DAG = DCI.DAG;
21216 SDLoc DL(N);
21217 SDValue ID =
21218 DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_whilelo_x2, DL, VT: MVT::i64);
21219 SDValue Idx = N->getOperand(Num: 1);
21220 SDValue TC = N->getOperand(Num: 2);
21221 if (Idx.getValueType() != MVT::i64) {
21222 Idx = DAG.getZExtOrTrunc(Op: Idx, DL, VT: MVT::i64);
21223 TC = DAG.getZExtOrTrunc(Op: TC, DL, VT: MVT::i64);
21224 }
21225 auto R =
21226 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL,
21227 ResultTys: {Lo->getValueType(ResNo: 0), Hi->getValueType(ResNo: 0)}, Ops: {ID, Idx, TC});
21228
21229 DCI.CombineTo(N: Lo, Res: R.getValue(R: 0));
21230 DCI.CombineTo(N: Hi, Res: R.getValue(R: 1));
21231
21232 return SDValue(N, 0);
21233}
21234
21235static SDValue performIntrinsicCombine(SDNode *N,
21236 TargetLowering::DAGCombinerInfo &DCI,
21237 const AArch64Subtarget *Subtarget) {
21238 SelectionDAG &DAG = DCI.DAG;
21239 unsigned IID = getIntrinsicID(N);
21240 switch (IID) {
21241 default:
21242 break;
21243 case Intrinsic::aarch64_neon_vcvtfxs2fp:
21244 case Intrinsic::aarch64_neon_vcvtfxu2fp:
21245 return tryCombineFixedPointConvert(N, DCI, DAG);
21246 case Intrinsic::aarch64_neon_saddv:
21247 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::SADDV, N, DAG);
21248 case Intrinsic::aarch64_neon_uaddv:
21249 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::UADDV, N, DAG);
21250 case Intrinsic::aarch64_neon_sminv:
21251 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::SMINV, N, DAG);
21252 case Intrinsic::aarch64_neon_uminv:
21253 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::UMINV, N, DAG);
21254 case Intrinsic::aarch64_neon_smaxv:
21255 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::SMAXV, N, DAG);
21256 case Intrinsic::aarch64_neon_umaxv:
21257 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::UMAXV, N, DAG);
21258 case Intrinsic::aarch64_neon_fmax:
21259 return DAG.getNode(Opcode: ISD::FMAXIMUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21260 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
21261 case Intrinsic::aarch64_neon_fmin:
21262 return DAG.getNode(Opcode: ISD::FMINIMUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21263 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
21264 case Intrinsic::aarch64_neon_fmaxnm:
21265 return DAG.getNode(Opcode: ISD::FMAXNUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21266 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
21267 case Intrinsic::aarch64_neon_fminnm:
21268 return DAG.getNode(Opcode: ISD::FMINNUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21269 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
21270 case Intrinsic::aarch64_neon_smull:
21271 return DAG.getNode(Opcode: AArch64ISD::SMULL, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21272 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
21273 case Intrinsic::aarch64_neon_umull:
21274 return DAG.getNode(Opcode: AArch64ISD::UMULL, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21275 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
21276 case Intrinsic::aarch64_neon_pmull:
21277 return DAG.getNode(Opcode: AArch64ISD::PMULL, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21278 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
21279 case Intrinsic::aarch64_neon_sqdmull:
21280 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
21281 case Intrinsic::aarch64_neon_sqshl:
21282 case Intrinsic::aarch64_neon_uqshl:
21283 case Intrinsic::aarch64_neon_sqshlu:
21284 case Intrinsic::aarch64_neon_srshl:
21285 case Intrinsic::aarch64_neon_urshl:
21286 case Intrinsic::aarch64_neon_sshl:
21287 case Intrinsic::aarch64_neon_ushl:
21288 return tryCombineShiftImm(IID, N, DAG);
21289 case Intrinsic::aarch64_neon_sabd:
21290 return DAG.getNode(Opcode: ISD::ABDS, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21291 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
21292 case Intrinsic::aarch64_neon_uabd:
21293 return DAG.getNode(Opcode: ISD::ABDU, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21294 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
21295 case Intrinsic::aarch64_crc32b:
21296 case Intrinsic::aarch64_crc32cb:
21297 return tryCombineCRC32(Mask: 0xff, N, DAG);
21298 case Intrinsic::aarch64_crc32h:
21299 case Intrinsic::aarch64_crc32ch:
21300 return tryCombineCRC32(Mask: 0xffff, N, DAG);
21301 case Intrinsic::aarch64_sve_saddv:
21302 // There is no i64 version of SADDV because the sign is irrelevant.
21303 if (N->getOperand(Num: 2)->getValueType(ResNo: 0).getVectorElementType() == MVT::i64)
21304 return combineSVEReductionInt(N, Opc: AArch64ISD::UADDV_PRED, DAG);
21305 else
21306 return combineSVEReductionInt(N, Opc: AArch64ISD::SADDV_PRED, DAG);
21307 case Intrinsic::aarch64_sve_uaddv:
21308 return combineSVEReductionInt(N, Opc: AArch64ISD::UADDV_PRED, DAG);
21309 case Intrinsic::aarch64_sve_smaxv:
21310 return combineSVEReductionInt(N, Opc: AArch64ISD::SMAXV_PRED, DAG);
21311 case Intrinsic::aarch64_sve_umaxv:
21312 return combineSVEReductionInt(N, Opc: AArch64ISD::UMAXV_PRED, DAG);
21313 case Intrinsic::aarch64_sve_sminv:
21314 return combineSVEReductionInt(N, Opc: AArch64ISD::SMINV_PRED, DAG);
21315 case Intrinsic::aarch64_sve_uminv:
21316 return combineSVEReductionInt(N, Opc: AArch64ISD::UMINV_PRED, DAG);
21317 case Intrinsic::aarch64_sve_orv:
21318 return combineSVEReductionInt(N, Opc: AArch64ISD::ORV_PRED, DAG);
21319 case Intrinsic::aarch64_sve_eorv:
21320 return combineSVEReductionInt(N, Opc: AArch64ISD::EORV_PRED, DAG);
21321 case Intrinsic::aarch64_sve_andv:
21322 return combineSVEReductionInt(N, Opc: AArch64ISD::ANDV_PRED, DAG);
21323 case Intrinsic::aarch64_sve_index:
21324 return LowerSVEIntrinsicIndex(N, DAG);
21325 case Intrinsic::aarch64_sve_dup:
21326 return LowerSVEIntrinsicDUP(N, DAG);
21327 case Intrinsic::aarch64_sve_dup_x:
21328 return DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21329 Operand: N->getOperand(Num: 1));
21330 case Intrinsic::aarch64_sve_ext:
21331 return LowerSVEIntrinsicEXT(N, DAG);
21332 case Intrinsic::aarch64_sve_mul_u:
21333 return DAG.getNode(Opcode: AArch64ISD::MUL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21334 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21335 case Intrinsic::aarch64_sve_smulh_u:
21336 return DAG.getNode(Opcode: AArch64ISD::MULHS_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21337 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21338 case Intrinsic::aarch64_sve_umulh_u:
21339 return DAG.getNode(Opcode: AArch64ISD::MULHU_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21340 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21341 case Intrinsic::aarch64_sve_smin_u:
21342 return DAG.getNode(Opcode: AArch64ISD::SMIN_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21343 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21344 case Intrinsic::aarch64_sve_umin_u:
21345 return DAG.getNode(Opcode: AArch64ISD::UMIN_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21346 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21347 case Intrinsic::aarch64_sve_smax_u:
21348 return DAG.getNode(Opcode: AArch64ISD::SMAX_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21349 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21350 case Intrinsic::aarch64_sve_umax_u:
21351 return DAG.getNode(Opcode: AArch64ISD::UMAX_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21352 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21353 case Intrinsic::aarch64_sve_lsl_u:
21354 return DAG.getNode(Opcode: AArch64ISD::SHL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21355 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21356 case Intrinsic::aarch64_sve_lsr_u:
21357 return DAG.getNode(Opcode: AArch64ISD::SRL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21358 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21359 case Intrinsic::aarch64_sve_asr_u:
21360 return DAG.getNode(Opcode: AArch64ISD::SRA_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21361 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21362 case Intrinsic::aarch64_sve_fadd_u:
21363 return DAG.getNode(Opcode: AArch64ISD::FADD_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21364 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21365 case Intrinsic::aarch64_sve_fdiv_u:
21366 return DAG.getNode(Opcode: AArch64ISD::FDIV_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21367 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21368 case Intrinsic::aarch64_sve_fmax_u:
21369 return DAG.getNode(Opcode: AArch64ISD::FMAX_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21370 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21371 case Intrinsic::aarch64_sve_fmaxnm_u:
21372 return DAG.getNode(Opcode: AArch64ISD::FMAXNM_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21373 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21374 case Intrinsic::aarch64_sve_fmla_u:
21375 return DAG.getNode(Opcode: AArch64ISD::FMA_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21376 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 3), N3: N->getOperand(Num: 4),
21377 N4: N->getOperand(Num: 2));
21378 case Intrinsic::aarch64_sve_fmin_u:
21379 return DAG.getNode(Opcode: AArch64ISD::FMIN_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21380 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21381 case Intrinsic::aarch64_sve_fminnm_u:
21382 return DAG.getNode(Opcode: AArch64ISD::FMINNM_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21383 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21384 case Intrinsic::aarch64_sve_fmul_u:
21385 return DAG.getNode(Opcode: AArch64ISD::FMUL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21386 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21387 case Intrinsic::aarch64_sve_fsub_u:
21388 return DAG.getNode(Opcode: AArch64ISD::FSUB_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21389 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21390 case Intrinsic::aarch64_sve_add_u:
21391 return DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
21392 N2: N->getOperand(Num: 3));
21393 case Intrinsic::aarch64_sve_sub_u:
21394 return DAG.getNode(Opcode: ISD::SUB, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
21395 N2: N->getOperand(Num: 3));
21396 case Intrinsic::aarch64_sve_subr:
21397 return convertMergedOpToPredOp(N, Opc: ISD::SUB, DAG, UnpredOp: true, SwapOperands: true);
21398 case Intrinsic::aarch64_sve_and_u:
21399 return DAG.getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
21400 N2: N->getOperand(Num: 3));
21401 case Intrinsic::aarch64_sve_bic_u:
21402 return DAG.getNode(Opcode: AArch64ISD::BIC, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21403 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
21404 case Intrinsic::aarch64_sve_eor_u:
21405 return DAG.getNode(Opcode: ISD::XOR, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
21406 N2: N->getOperand(Num: 3));
21407 case Intrinsic::aarch64_sve_orr_u:
21408 return DAG.getNode(Opcode: ISD::OR, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
21409 N2: N->getOperand(Num: 3));
21410 case Intrinsic::aarch64_sve_sabd_u:
21411 return DAG.getNode(Opcode: ISD::ABDS, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21412 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
21413 case Intrinsic::aarch64_sve_uabd_u:
21414 return DAG.getNode(Opcode: ISD::ABDU, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21415 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
21416 case Intrinsic::aarch64_sve_sdiv_u:
21417 return DAG.getNode(Opcode: AArch64ISD::SDIV_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21418 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21419 case Intrinsic::aarch64_sve_udiv_u:
21420 return DAG.getNode(Opcode: AArch64ISD::UDIV_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21421 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21422 case Intrinsic::aarch64_sve_sqadd:
21423 return convertMergedOpToPredOp(N, Opc: ISD::SADDSAT, DAG, UnpredOp: true);
21424 case Intrinsic::aarch64_sve_sqsub_u:
21425 return DAG.getNode(Opcode: ISD::SSUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21426 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
21427 case Intrinsic::aarch64_sve_uqadd:
21428 return convertMergedOpToPredOp(N, Opc: ISD::UADDSAT, DAG, UnpredOp: true);
21429 case Intrinsic::aarch64_sve_uqsub_u:
21430 return DAG.getNode(Opcode: ISD::USUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21431 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
21432 case Intrinsic::aarch64_sve_sqadd_x:
21433 return DAG.getNode(Opcode: ISD::SADDSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21434 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
21435 case Intrinsic::aarch64_sve_sqsub_x:
21436 return DAG.getNode(Opcode: ISD::SSUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21437 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
21438 case Intrinsic::aarch64_sve_uqadd_x:
21439 return DAG.getNode(Opcode: ISD::UADDSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21440 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
21441 case Intrinsic::aarch64_sve_uqsub_x:
21442 return DAG.getNode(Opcode: ISD::USUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21443 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
21444 case Intrinsic::aarch64_sve_asrd:
21445 return DAG.getNode(Opcode: AArch64ISD::SRAD_MERGE_OP1, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21446 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21447 case Intrinsic::aarch64_sve_cmphs:
21448 if (!N->getOperand(Num: 2).getValueType().isFloatingPoint())
21449 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
21450 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
21451 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETUGE));
21452 break;
21453 case Intrinsic::aarch64_sve_cmphi:
21454 if (!N->getOperand(Num: 2).getValueType().isFloatingPoint())
21455 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
21456 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
21457 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETUGT));
21458 break;
21459 case Intrinsic::aarch64_sve_fcmpge:
21460 case Intrinsic::aarch64_sve_cmpge:
21461 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
21462 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
21463 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETGE));
21464 break;
21465 case Intrinsic::aarch64_sve_fcmpgt:
21466 case Intrinsic::aarch64_sve_cmpgt:
21467 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
21468 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
21469 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETGT));
21470 break;
21471 case Intrinsic::aarch64_sve_fcmpeq:
21472 case Intrinsic::aarch64_sve_cmpeq:
21473 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
21474 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
21475 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETEQ));
21476 break;
21477 case Intrinsic::aarch64_sve_fcmpne:
21478 case Intrinsic::aarch64_sve_cmpne:
21479 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
21480 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
21481 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETNE));
21482 break;
21483 case Intrinsic::aarch64_sve_fcmpuo:
21484 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
21485 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
21486 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETUO));
21487 break;
21488 case Intrinsic::aarch64_sve_fadda:
21489 return combineSVEReductionOrderedFP(N, Opc: AArch64ISD::FADDA_PRED, DAG);
21490 case Intrinsic::aarch64_sve_faddv:
21491 return combineSVEReductionFP(N, Opc: AArch64ISD::FADDV_PRED, DAG);
21492 case Intrinsic::aarch64_sve_fmaxnmv:
21493 return combineSVEReductionFP(N, Opc: AArch64ISD::FMAXNMV_PRED, DAG);
21494 case Intrinsic::aarch64_sve_fmaxv:
21495 return combineSVEReductionFP(N, Opc: AArch64ISD::FMAXV_PRED, DAG);
21496 case Intrinsic::aarch64_sve_fminnmv:
21497 return combineSVEReductionFP(N, Opc: AArch64ISD::FMINNMV_PRED, DAG);
21498 case Intrinsic::aarch64_sve_fminv:
21499 return combineSVEReductionFP(N, Opc: AArch64ISD::FMINV_PRED, DAG);
21500 case Intrinsic::aarch64_sve_sel:
21501 return DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21502 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
21503 case Intrinsic::aarch64_sve_cmpeq_wide:
21504 return tryConvertSVEWideCompare(N, CC: ISD::SETEQ, DCI, DAG);
21505 case Intrinsic::aarch64_sve_cmpne_wide:
21506 return tryConvertSVEWideCompare(N, CC: ISD::SETNE, DCI, DAG);
21507 case Intrinsic::aarch64_sve_cmpge_wide:
21508 return tryConvertSVEWideCompare(N, CC: ISD::SETGE, DCI, DAG);
21509 case Intrinsic::aarch64_sve_cmpgt_wide:
21510 return tryConvertSVEWideCompare(N, CC: ISD::SETGT, DCI, DAG);
21511 case Intrinsic::aarch64_sve_cmplt_wide:
21512 return tryConvertSVEWideCompare(N, CC: ISD::SETLT, DCI, DAG);
21513 case Intrinsic::aarch64_sve_cmple_wide:
21514 return tryConvertSVEWideCompare(N, CC: ISD::SETLE, DCI, DAG);
21515 case Intrinsic::aarch64_sve_cmphs_wide:
21516 return tryConvertSVEWideCompare(N, CC: ISD::SETUGE, DCI, DAG);
21517 case Intrinsic::aarch64_sve_cmphi_wide:
21518 return tryConvertSVEWideCompare(N, CC: ISD::SETUGT, DCI, DAG);
21519 case Intrinsic::aarch64_sve_cmplo_wide:
21520 return tryConvertSVEWideCompare(N, CC: ISD::SETULT, DCI, DAG);
21521 case Intrinsic::aarch64_sve_cmpls_wide:
21522 return tryConvertSVEWideCompare(N, CC: ISD::SETULE, DCI, DAG);
21523 case Intrinsic::aarch64_sve_ptest_any:
21524 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg: N->getOperand(Num: 1), Op: N->getOperand(Num: 2),
21525 Cond: AArch64CC::ANY_ACTIVE);
21526 case Intrinsic::aarch64_sve_ptest_first:
21527 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg: N->getOperand(Num: 1), Op: N->getOperand(Num: 2),
21528 Cond: AArch64CC::FIRST_ACTIVE);
21529 case Intrinsic::aarch64_sve_ptest_last:
21530 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg: N->getOperand(Num: 1), Op: N->getOperand(Num: 2),
21531 Cond: AArch64CC::LAST_ACTIVE);
21532 case Intrinsic::aarch64_sve_whilelo:
21533 return tryCombineWhileLo(N, DCI, Subtarget);
21534 }
21535 return SDValue();
21536}
21537
21538static bool isCheapToExtend(const SDValue &N) {
21539 unsigned OC = N->getOpcode();
21540 return OC == ISD::LOAD || OC == ISD::MLOAD ||
21541 ISD::isConstantSplatVectorAllZeros(N: N.getNode());
21542}
21543
21544static SDValue
21545performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
21546 SelectionDAG &DAG) {
21547 // If we have (sext (setcc A B)) and A and B are cheap to extend,
21548 // we can move the sext into the arguments and have the same result. For
21549 // example, if A and B are both loads, we can make those extending loads and
21550 // avoid an extra instruction. This pattern appears often in VLS code
21551 // generation where the inputs to the setcc have a different size to the
21552 // instruction that wants to use the result of the setcc.
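  // Illustrative example: sext(setcc slt (load A), (load B)) becomes
  // setcc slt (sext A), (sext B) at the sext's result type, letting isel form
  // extending loads; an unsigned condition would use zero-extends instead
  // (see ExtType below).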
21553 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
21554 N->getOperand(0)->getOpcode() == ISD::SETCC);
21555 const SDValue SetCC = N->getOperand(Num: 0);
21556
21557 const SDValue CCOp0 = SetCC.getOperand(i: 0);
21558 const SDValue CCOp1 = SetCC.getOperand(i: 1);
21559 if (!CCOp0->getValueType(ResNo: 0).isInteger() ||
21560 !CCOp1->getValueType(ResNo: 0).isInteger())
21561 return SDValue();
21562
21563 ISD::CondCode Code =
21564 cast<CondCodeSDNode>(Val: SetCC->getOperand(Num: 2).getNode())->get();
21565
21566 ISD::NodeType ExtType =
21567 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
21568
21569 if (isCheapToExtend(N: SetCC.getOperand(i: 0)) &&
21570 isCheapToExtend(N: SetCC.getOperand(i: 1))) {
21571 const SDValue Ext1 =
21572 DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: CCOp0);
21573 const SDValue Ext2 =
21574 DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: CCOp1);
21575
21576 return DAG.getSetCC(
21577 DL: SDLoc(SetCC), VT: N->getValueType(ResNo: 0), LHS: Ext1, RHS: Ext2,
21578 Cond: cast<CondCodeSDNode>(Val: SetCC->getOperand(Num: 2).getNode())->get());
21579 }
21580
21581 return SDValue();
21582}
21583
21584static SDValue performExtendCombine(SDNode *N,
21585 TargetLowering::DAGCombinerInfo &DCI,
21586 SelectionDAG &DAG) {
21587 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
21588 // we can convert that DUP into another extract_high (of a bigger DUP), which
21589 // helps the backend to decide that an sabdl2 would be useful, saving a real
21590 // extract_high operation.
21591 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
21592 (N->getOperand(Num: 0).getOpcode() == ISD::ABDU ||
21593 N->getOperand(Num: 0).getOpcode() == ISD::ABDS)) {
21594 SDNode *ABDNode = N->getOperand(Num: 0).getNode();
21595 SDValue NewABD =
21596 tryCombineLongOpWithDup(IID: Intrinsic::not_intrinsic, N: ABDNode, DCI, DAG);
21597 if (!NewABD.getNode())
21598 return SDValue();
21599
21600 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: NewABD);
21601 }
21602
21603 if (N->getValueType(ResNo: 0).isFixedLengthVector() &&
21604 N->getOpcode() == ISD::SIGN_EXTEND &&
21605 N->getOperand(Num: 0)->getOpcode() == ISD::SETCC)
21606 return performSignExtendSetCCCombine(N, DCI, DAG);
21607
21608 return SDValue();
21609}
21610
21611static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
21612 SDValue SplatVal, unsigned NumVecElts) {
21613 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
21614 Align OrigAlignment = St.getAlign();
21615 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
21616
21617 // Create scalar stores. This is at least as good as the code sequence for a
21618 // split unaligned store which is a dup.s, ext.b, and two stores.
21619 // Most of the time the three stores should be replaced by store pair
21620 // instructions (stp).
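  // Illustrative result for a v2i64 splat store: two scalar str of the same
  // value at offsets 0 and 8, which the load/store optimizer can then merge
  // into a single stp.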
21621 SDLoc DL(&St);
21622 SDValue BasePtr = St.getBasePtr();
21623 uint64_t BaseOffset = 0;
21624
21625 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
21626 SDValue NewST1 =
21627 DAG.getStore(Chain: St.getChain(), dl: DL, Val: SplatVal, Ptr: BasePtr, PtrInfo,
21628 Alignment: OrigAlignment, MMOFlags: St.getMemOperand()->getFlags());
21629
21630 // As this is in ISel, we will not merge this add, which may degrade results.
21631 if (BasePtr->getOpcode() == ISD::ADD &&
21632 isa<ConstantSDNode>(Val: BasePtr->getOperand(Num: 1))) {
21633 BaseOffset = cast<ConstantSDNode>(Val: BasePtr->getOperand(Num: 1))->getSExtValue();
21634 BasePtr = BasePtr->getOperand(Num: 0);
21635 }
21636
21637 unsigned Offset = EltOffset;
21638 while (--NumVecElts) {
21639 Align Alignment = commonAlignment(A: OrigAlignment, Offset);
21640 SDValue OffsetPtr =
21641 DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: BasePtr,
21642 N2: DAG.getConstant(Val: BaseOffset + Offset, DL, VT: MVT::i64));
21643 NewST1 = DAG.getStore(Chain: NewST1.getValue(R: 0), dl: DL, Val: SplatVal, Ptr: OffsetPtr,
21644 PtrInfo: PtrInfo.getWithOffset(O: Offset), Alignment,
21645 MMOFlags: St.getMemOperand()->getFlags());
21646 Offset += EltOffset;
21647 }
21648 return NewST1;
21649}
21650
21651// Returns an SVE type that ContentTy can be trivially sign or zero extended
21652// into.
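// For example, nxv4i16 is held in an nxv4i32 container: the element count is
// preserved and each element is widened so the vector fills a full SVE
// register.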
21653static MVT getSVEContainerType(EVT ContentTy) {
21654 assert(ContentTy.isSimple() && "No SVE containers for extended types");
21655
21656 switch (ContentTy.getSimpleVT().SimpleTy) {
21657 default:
21658 llvm_unreachable("No known SVE container for this MVT type");
21659 case MVT::nxv2i8:
21660 case MVT::nxv2i16:
21661 case MVT::nxv2i32:
21662 case MVT::nxv2i64:
21663 case MVT::nxv2f32:
21664 case MVT::nxv2f64:
21665 return MVT::nxv2i64;
21666 case MVT::nxv4i8:
21667 case MVT::nxv4i16:
21668 case MVT::nxv4i32:
21669 case MVT::nxv4f32:
21670 return MVT::nxv4i32;
21671 case MVT::nxv8i8:
21672 case MVT::nxv8i16:
21673 case MVT::nxv8f16:
21674 case MVT::nxv8bf16:
21675 return MVT::nxv8i16;
21676 case MVT::nxv16i8:
21677 return MVT::nxv16i8;
21678 }
21679}
21680
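// Lower a predicated SVE load intrinsic (chain, predicate, base) to the
// target load node Opc, producing the value in its SVE container type.
// Illustrative sketch (assumed element types): an nxv4i16 result is loaded as
// nxv4i32 with the original type recorded as the extension VT, then truncated
// back to nxv4i16 for the node's users.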
21681static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
21682 SDLoc DL(N);
21683 EVT VT = N->getValueType(ResNo: 0);
21684
21685 if (VT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
21686 return SDValue();
21687
21688 EVT ContainerVT = VT;
21689 if (ContainerVT.isInteger())
21690 ContainerVT = getSVEContainerType(ContentTy: ContainerVT);
21691
21692 SDVTList VTs = DAG.getVTList(VT1: ContainerVT, VT2: MVT::Other);
21693 SDValue Ops[] = { N->getOperand(Num: 0), // Chain
21694 N->getOperand(Num: 2), // Pg
21695 N->getOperand(Num: 3), // Base
21696 DAG.getValueType(VT) };
21697
21698 SDValue Load = DAG.getNode(Opcode: Opc, DL, VTList: VTs, Ops);
21699 SDValue LoadChain = SDValue(Load.getNode(), 1);
21700
21701 if (ContainerVT.isInteger() && (VT != ContainerVT))
21702 Load = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Load.getValue(R: 0));
21703
21704 return DAG.getMergeValues(Ops: { Load, LoadChain }, dl: DL);
21705}
21706
21707static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
21708 SDLoc DL(N);
21709 EVT VT = N->getValueType(ResNo: 0);
21710 EVT PtrTy = N->getOperand(Num: 3).getValueType();
21711
21712 EVT LoadVT = VT;
21713 if (VT.isFloatingPoint())
21714 LoadVT = VT.changeTypeToInteger();
21715
21716 auto *MINode = cast<MemIntrinsicSDNode>(Val: N);
21717 SDValue PassThru = DAG.getConstant(Val: 0, DL, VT: LoadVT);
21718 SDValue L = DAG.getMaskedLoad(VT: LoadVT, dl: DL, Chain: MINode->getChain(),
21719 Base: MINode->getOperand(Num: 3), Offset: DAG.getUNDEF(VT: PtrTy),
21720 Mask: MINode->getOperand(Num: 2), Src0: PassThru,
21721 MemVT: MINode->getMemoryVT(), MMO: MINode->getMemOperand(),
21722 AM: ISD::UNINDEXED, ISD::NON_EXTLOAD, IsExpanding: false);
21723
21724 if (VT.isFloatingPoint()) {
21725 SDValue Ops[] = { DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: L), L.getValue(R: 1) };
21726 return DAG.getMergeValues(Ops, dl: DL);
21727 }
21728
21729 return L;
21730}
21731
21732template <unsigned Opcode>
21733static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
21734 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
21735 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
21736 "Unsupported opcode.");
21737 SDLoc DL(N);
21738 EVT VT = N->getValueType(ResNo: 0);
21739
21740 EVT LoadVT = VT;
21741 if (VT.isFloatingPoint())
21742 LoadVT = VT.changeTypeToInteger();
21743
21744 SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 2), N->getOperand(Num: 3)};
21745 SDValue Load = DAG.getNode(Opcode, DL, ResultTys: {LoadVT, MVT::Other}, Ops);
21746 SDValue LoadChain = SDValue(Load.getNode(), 1);
21747
21748 if (VT.isFloatingPoint())
21749 Load = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Load.getValue(R: 0));
21750
21751 return DAG.getMergeValues(Ops: {Load, LoadChain}, dl: DL);
21752}
21753
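// Mirror of the load combines above: widen the data to its SVE container
// (any-extend for integers, bitcast for floating point) and emit ST1_PRED,
// attaching the narrow value type for integer data so the store still writes
// only the original element width. Illustrative case: nxv4i16 data is
// any-extended to nxv4i32 and stored as a truncating halfword store.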
21754static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
21755 SDLoc DL(N);
21756 SDValue Data = N->getOperand(Num: 2);
21757 EVT DataVT = Data.getValueType();
21758 EVT HwSrcVt = getSVEContainerType(ContentTy: DataVT);
21759 SDValue InputVT = DAG.getValueType(DataVT);
21760
21761 if (DataVT.isFloatingPoint())
21762 InputVT = DAG.getValueType(HwSrcVt);
21763
21764 SDValue SrcNew;
21765 if (Data.getValueType().isFloatingPoint())
21766 SrcNew = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: HwSrcVt, Operand: Data);
21767 else
21768 SrcNew = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: HwSrcVt, Operand: Data);
21769
21770 SDValue Ops[] = { N->getOperand(Num: 0), // Chain
21771 SrcNew,
21772 N->getOperand(Num: 4), // Base
21773 N->getOperand(Num: 3), // Pg
21774 InputVT
21775 };
21776
21777 return DAG.getNode(Opcode: AArch64ISD::ST1_PRED, DL, VT: N->getValueType(ResNo: 0), Ops);
21778}
21779
21780static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
21781 SDLoc DL(N);
21782
21783 SDValue Data = N->getOperand(Num: 2);
21784 EVT DataVT = Data.getValueType();
21785 EVT PtrTy = N->getOperand(Num: 4).getValueType();
21786
21787 if (DataVT.isFloatingPoint())
21788 Data = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: DataVT.changeTypeToInteger(), Operand: Data);
21789
21790 auto *MINode = cast<MemIntrinsicSDNode>(Val: N);
21791 return DAG.getMaskedStore(Chain: MINode->getChain(), dl: DL, Val: Data, Base: MINode->getOperand(Num: 4),
21792 Offset: DAG.getUNDEF(VT: PtrTy), Mask: MINode->getOperand(Num: 3),
21793 MemVT: MINode->getMemoryVT(), MMO: MINode->getMemOperand(),
21794 AM: ISD::UNINDEXED, IsTruncating: false, IsCompressing: false);
21795}
21796
21797/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
21798/// load store optimizer pass will merge them to store pair stores. This should
21799/// be better than a movi to create the vector zero followed by a vector store
21800 /// if the zero constant is not re-used, since one instruction and one register
21801/// live range will be removed.
21802///
21803/// For example, the final generated code should be:
21804///
21805/// stp xzr, xzr, [x0]
21806///
21807/// instead of:
21808///
21809/// movi v0.2d, #0
21810/// str q0, [x0]
21811///
21812static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21813 SDValue StVal = St.getValue();
21814 EVT VT = StVal.getValueType();
21815
21816 // Avoid scalarizing zero splat stores for scalable vectors.
21817 if (VT.isScalableVector())
21818 return SDValue();
21819
21820 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
21821 // 2, 3 or 4 i32 elements.
21822 int NumVecElts = VT.getVectorNumElements();
21823 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
21824 VT.getVectorElementType().getSizeInBits() == 64) ||
21825 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
21826 VT.getVectorElementType().getSizeInBits() == 32)))
21827 return SDValue();
21828
21829 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
21830 return SDValue();
21831
21832 // If the zero constant has more than one use then the vector store could be
21833 // better since the constant mov will be amortized and stp q instructions
21834 // should be able to be formed.
21835 if (!StVal.hasOneUse())
21836 return SDValue();
21837
21838 // If the store is truncating then it's going down to i16 or smaller, which
21839 // means it can be implemented in a single store anyway.
21840 if (St.isTruncatingStore())
21841 return SDValue();
21842
21843 // If the immediate offset of the address operand is too large for the stp
21844 // instruction, then bail out.
21845 if (DAG.isBaseWithConstantOffset(Op: St.getBasePtr())) {
21846 int64_t Offset = St.getBasePtr()->getConstantOperandVal(Num: 1);
21847 if (Offset < -512 || Offset > 504)
21848 return SDValue();
21849 }
21850
21851 for (int I = 0; I < NumVecElts; ++I) {
21852 SDValue EltVal = StVal.getOperand(i: I);
21853 if (!isNullConstant(V: EltVal) && !isNullFPConstant(V: EltVal))
21854 return SDValue();
21855 }
21856
21857 // Use a CopyFromReg WZR/XZR here to prevent
21858 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
21859 SDLoc DL(&St);
21860 unsigned ZeroReg;
21861 EVT ZeroVT;
21862 if (VT.getVectorElementType().getSizeInBits() == 32) {
21863 ZeroReg = AArch64::WZR;
21864 ZeroVT = MVT::i32;
21865 } else {
21866 ZeroReg = AArch64::XZR;
21867 ZeroVT = MVT::i64;
21868 }
21869 SDValue SplatVal =
21870 DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: ZeroReg, VT: ZeroVT);
21871 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21872}
21873
21874/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
21875/// value. The load store optimizer pass will merge them to store pair stores.
21876/// This has better performance than a splat of the scalar followed by a split
21877/// vector store. Even if the stores are not merged it is four stores vs a dup,
21878/// followed by an ext.b and two stores.
21879static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21880 SDValue StVal = St.getValue();
21881 EVT VT = StVal.getValueType();
21882
21883 // Don't replace floating point stores; they possibly won't be transformed to
21884 // stp because of the store pair suppress pass.
21885 if (VT.isFloatingPoint())
21886 return SDValue();
21887
21888 // We can express a splat as store pair(s) for 2 or 4 elements.
21889 unsigned NumVecElts = VT.getVectorNumElements();
21890 if (NumVecElts != 4 && NumVecElts != 2)
21891 return SDValue();
21892
21893 // If the store is truncating then it's going down to i16 or smaller, which
21894 // means it can be implemented in a single store anyway.
21895 if (St.isTruncatingStore())
21896 return SDValue();
21897
21898 // Check that this is a splat.
21899 // Make sure that each of the relevant vector element locations are inserted
21900 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
21901 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
21902 SDValue SplatVal;
21903 for (unsigned I = 0; I < NumVecElts; ++I) {
21904 // Check for insert vector elements.
21905 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
21906 return SDValue();
21907
21908 // Check that same value is inserted at each vector element.
21909 if (I == 0)
21910 SplatVal = StVal.getOperand(i: 1);
21911 else if (StVal.getOperand(i: 1) != SplatVal)
21912 return SDValue();
21913
21914 // Check insert element index.
21915 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(Val: StVal.getOperand(i: 2));
21916 if (!CIndex)
21917 return SDValue();
21918 uint64_t IndexVal = CIndex->getZExtValue();
21919 if (IndexVal >= NumVecElts)
21920 return SDValue();
21921 IndexNotInserted.reset(position: IndexVal);
21922
21923 StVal = StVal.getOperand(i: 0);
21924 }
21925 // Check that all vector element locations were inserted to.
21926 if (IndexNotInserted.any())
21927 return SDValue();
21928
21929 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21930}
21931
21932static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
21933 SelectionDAG &DAG,
21934 const AArch64Subtarget *Subtarget) {
21935
21936 StoreSDNode *S = cast<StoreSDNode>(Val: N);
21937 if (S->isVolatile() || S->isIndexed())
21938 return SDValue();
21939
21940 SDValue StVal = S->getValue();
21941 EVT VT = StVal.getValueType();
21942
21943 if (!VT.isFixedLengthVector())
21944 return SDValue();
21945
21946 // If we get a splat of zeros, convert this vector store to a store of
21947 // scalars. They will be merged into store pairs of xzr thereby removing one
21948 // instruction and one register.
21949 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, St&: *S))
21950 return ReplacedZeroSplat;
21951
21952 // FIXME: The logic for deciding if an unaligned store should be split should
21953 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
21954 // a call to that function here.
21955
21956 if (!Subtarget->isMisaligned128StoreSlow())
21957 return SDValue();
21958
21959 // Don't split at -Oz.
21960 if (DAG.getMachineFunction().getFunction().hasMinSize())
21961 return SDValue();
21962
21963 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
21964 // those up regresses performance on micro-benchmarks and olden/bh.
21965 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
21966 return SDValue();
21967
21968 // Split unaligned 16B stores. They are terrible for performance.
21969 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
21970 // extensions can use this to mark that it does not want splitting to happen
21971 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
21972 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
21973 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
21974 S->getAlign() <= Align(2))
21975 return SDValue();
21976
21977 // If we get a splat of a scalar convert this vector store to a store of
21978 // scalars. They will be merged into store pairs thereby removing two
21979 // instructions.
21980 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, St&: *S))
21981 return ReplacedSplat;
21982
21983 SDLoc DL(S);
21984
21985 // Split VT into two.
21986 EVT HalfVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
21987 unsigned NumElts = HalfVT.getVectorNumElements();
21988 SDValue SubVector0 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: StVal,
21989 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
21990 SDValue SubVector1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: StVal,
21991 N2: DAG.getConstant(Val: NumElts, DL, VT: MVT::i64));
21992 SDValue BasePtr = S->getBasePtr();
21993 SDValue NewST1 =
21994 DAG.getStore(Chain: S->getChain(), dl: DL, Val: SubVector0, Ptr: BasePtr, PtrInfo: S->getPointerInfo(),
21995 Alignment: S->getAlign(), MMOFlags: S->getMemOperand()->getFlags());
21996 SDValue OffsetPtr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: BasePtr,
21997 N2: DAG.getConstant(Val: 8, DL, VT: MVT::i64));
21998 return DAG.getStore(Chain: NewST1.getValue(R: 0), dl: DL, Val: SubVector1, Ptr: OffsetPtr,
21999 PtrInfo: S->getPointerInfo(), Alignment: S->getAlign(),
22000 MMOFlags: S->getMemOperand()->getFlags());
22001}
22002
22003static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
22004 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
22005
22006 // splice(pg, op1, undef) -> op1
22007 if (N->getOperand(Num: 2).isUndef())
22008 return N->getOperand(Num: 1);
22009
22010 return SDValue();
22011}
22012
22013static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
22014 const AArch64Subtarget *Subtarget) {
22015 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
22016 N->getOpcode() == AArch64ISD::UUNPKLO) &&
22017 "Unexpected Opcode!");
22018
22019 // uunpklo/hi undef -> undef
22020 if (N->getOperand(Num: 0).isUndef())
22021 return DAG.getUNDEF(VT: N->getValueType(ResNo: 0));
22022
22023 // If this is a masked load followed by an UUNPKLO, fold this into a masked
22024 // extending load. We can do this even if this is already a masked
22025 // {z,}extload.
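  // Illustrative shape: uunpklo(masked_load, ptrue<pattern>) becomes a masked
  // zero-extending load that directly produces the unpacked type, with a
  // ptrue of the same pattern rebuilt at the wider element size, provided
  // that many wider elements still fit in the minimum SVE vector length.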
22026 if (N->getOperand(Num: 0).getOpcode() == ISD::MLOAD &&
22027 N->getOpcode() == AArch64ISD::UUNPKLO) {
22028 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(Val: N->getOperand(Num: 0));
22029 SDValue Mask = MLD->getMask();
22030 SDLoc DL(N);
22031
22032 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
22033 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
22034 (MLD->getPassThru()->isUndef() ||
22035 isZerosVector(N: MLD->getPassThru().getNode()))) {
22036 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22037 unsigned PgPattern = Mask->getConstantOperandVal(Num: 0);
22038 EVT VT = N->getValueType(ResNo: 0);
22039
22040 // Ensure we can double the size of the predicate pattern
22041 unsigned NumElts = getNumElementsFromSVEPredPattern(Pattern: PgPattern);
22042 if (NumElts &&
22043 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
22044 Mask =
22045 getPTrue(DAG, DL, VT: VT.changeVectorElementType(EltVT: MVT::i1), Pattern: PgPattern);
22046 SDValue PassThru = DAG.getConstant(Val: 0, DL, VT);
22047 SDValue NewLoad = DAG.getMaskedLoad(
22048 VT, dl: DL, Chain: MLD->getChain(), Base: MLD->getBasePtr(), Offset: MLD->getOffset(), Mask,
22049 Src0: PassThru, MemVT: MLD->getMemoryVT(), MMO: MLD->getMemOperand(),
22050 AM: MLD->getAddressingMode(), ISD::ZEXTLOAD);
22051
22052 DAG.ReplaceAllUsesOfValueWith(From: SDValue(MLD, 1), To: NewLoad.getValue(R: 1));
22053
22054 return NewLoad;
22055 }
22056 }
22057 }
22058
22059 return SDValue();
22060}
22061
22062static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N) {
22063 if (N->getOpcode() != AArch64ISD::UZP1)
22064 return false;
22065 SDValue Op0 = N->getOperand(Num: 0);
22066 EVT SrcVT = Op0->getValueType(ResNo: 0);
22067 EVT DstVT = N->getValueType(ResNo: 0);
22068 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
22069 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
22070 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
22071}
22072
22073// Try to combine rounding shifts where the operands come from an extend, and
22074// the result is truncated and combined into one vector.
22075// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
22076static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
22077 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
22078 SDValue Op0 = N->getOperand(Num: 0);
22079 SDValue Op1 = N->getOperand(Num: 1);
22080 EVT ResVT = N->getValueType(ResNo: 0);
22081
22082 unsigned RshOpc = Op0.getOpcode();
22083 if (RshOpc != AArch64ISD::RSHRNB_I)
22084 return SDValue();
22085
22086 // Same op code and imm value?
22087 SDValue ShiftValue = Op0.getOperand(i: 1);
22088 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(i: 1))
22089 return SDValue();
22090
22091 // Same unextended operand value?
22092 SDValue Lo = Op0.getOperand(i: 0);
22093 SDValue Hi = Op1.getOperand(i: 0);
22094 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
22095 Hi.getOpcode() != AArch64ISD::UUNPKHI)
22096 return SDValue();
22097 SDValue OrigArg = Lo.getOperand(i: 0);
22098 if (OrigArg != Hi.getOperand(i: 0))
22099 return SDValue();
22100
22101 SDLoc DL(N);
22102 return DAG.getNode(Opcode: AArch64ISD::URSHR_I_PRED, DL, VT: ResVT,
22103 N1: getPredicateForVector(DAG, DL, VT: ResVT), N2: OrigArg,
22104 N3: ShiftValue);
22105}
22106
22107// Try to simplify:
22108// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
22109// t2 = nxv8i16 srl(t1, ShiftValue)
22110// to
22111// t1 = nxv8i16 rshrnb(X, shiftvalue).
22112// rshrnb will zero the top half bits of each element. Therefore, this combine
22113// should only be performed when a following instruction with the rshrnb
22114// as an operand does not care about the top half of each element. For example,
22115// a uzp1 or a truncating store.
22116static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
22117 const AArch64Subtarget *Subtarget) {
22118 EVT VT = Srl->getValueType(ResNo: 0);
22119 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
22120 return SDValue();
22121
22122 EVT ResVT;
22123 if (VT == MVT::nxv8i16)
22124 ResVT = MVT::nxv16i8;
22125 else if (VT == MVT::nxv4i32)
22126 ResVT = MVT::nxv8i16;
22127 else if (VT == MVT::nxv2i64)
22128 ResVT = MVT::nxv4i32;
22129 else
22130 return SDValue();
22131
22132 SDLoc DL(Srl);
22133 unsigned ShiftValue;
22134 SDValue RShOperand;
22135 if (!canLowerSRLToRoundingShiftForVT(Shift: Srl, ResVT, DAG, ShiftValue, RShOperand))
22136 return SDValue();
22137 SDValue Rshrnb = DAG.getNode(
22138 Opcode: AArch64ISD::RSHRNB_I, DL, VT: ResVT,
22139 Ops: {RShOperand, DAG.getTargetConstant(Val: ShiftValue, DL, VT: MVT::i32)});
22140 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Rshrnb);
22141}
22142
22143static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
22144 const AArch64Subtarget *Subtarget) {
22145 SDLoc DL(N);
22146 SDValue Op0 = N->getOperand(Num: 0);
22147 SDValue Op1 = N->getOperand(Num: 1);
22148 EVT ResVT = N->getValueType(ResNo: 0);
22149
22150 // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
22151 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
22152 Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
22153 Op0.getOperand(i: 0) == Op1.getOperand(i: 0)) {
22154
22155 SDValue SourceVec = Op0.getOperand(i: 0);
22156 uint64_t ExtIdx0 = Op0.getConstantOperandVal(i: 1);
22157 uint64_t ExtIdx1 = Op1.getConstantOperandVal(i: 1);
22158 uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
22159 if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
22160 EVT OpVT = Op0.getOperand(i: 1).getValueType();
22161 EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
22162 SDValue Uzp = DAG.getNode(Opcode: N->getOpcode(), DL, VT: WidenedResVT, N1: SourceVec,
22163 N2: DAG.getUNDEF(VT: WidenedResVT));
22164 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: ResVT, N1: Uzp,
22165 N2: DAG.getConstant(Val: 0, DL, VT: OpVT));
22166 }
22167 }
22168
22169 // Following optimizations only work with uzp1.
22170 if (N->getOpcode() == AArch64ISD::UZP2)
22171 return SDValue();
22172
22173 // uzp1(x, undef) -> concat(truncate(x), undef)
22174 if (Op1.getOpcode() == ISD::UNDEF) {
22175 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
22176 switch (ResVT.getSimpleVT().SimpleTy) {
22177 default:
22178 break;
22179 case MVT::v16i8:
22180 BCVT = MVT::v8i16;
22181 HalfVT = MVT::v8i8;
22182 break;
22183 case MVT::v8i16:
22184 BCVT = MVT::v4i32;
22185 HalfVT = MVT::v4i16;
22186 break;
22187 case MVT::v4i32:
22188 BCVT = MVT::v2i64;
22189 HalfVT = MVT::v2i32;
22190 break;
22191 }
22192 if (BCVT != MVT::Other) {
22193 SDValue BC = DAG.getBitcast(VT: BCVT, V: Op0);
22194 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: HalfVT, Operand: BC);
22195 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: ResVT, N1: Trunc,
22196 N2: DAG.getUNDEF(VT: HalfVT));
22197 }
22198 }
22199
22200 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
22201 return Urshr;
22202
22203 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Srl: Op0, DAG, Subtarget))
22204 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Rshrnb, N2: Op1);
22205
22206 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Srl: Op1, DAG, Subtarget))
22207 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Op0, N2: Rshrnb);
22208
22209 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
22210 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
22211 if (Op0.getOperand(i: 0).getOpcode() == AArch64ISD::UZP1) {
22212 SDValue X = Op0.getOperand(i: 0).getOperand(i: 0);
22213 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: X, N2: Op1);
22214 }
22215 }
22216
22217 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
22218 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
22219 if (Op1.getOperand(i: 0).getOpcode() == AArch64ISD::UZP1) {
22220 SDValue Z = Op1.getOperand(i: 0).getOperand(i: 1);
22221 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Op0, N2: Z);
22222 }
22223 }
22224
22225 // These optimizations only work on little endian.
22226 if (!DAG.getDataLayout().isLittleEndian())
22227 return SDValue();
22228
22229 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
22230 // Example:
22231 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
22232 // to
22233 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
22234 if (isHalvingTruncateAndConcatOfLegalIntScalableType(N) &&
22235 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
22236 if (Op0.getOperand(i: 0).getValueType() == Op1.getOperand(i: 0).getValueType()) {
22237 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Op0.getOperand(i: 0),
22238 N2: Op1.getOperand(i: 0));
22239 }
22240 }
22241
22242 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
22243 return SDValue();
22244
22245 SDValue SourceOp0 = peekThroughBitcasts(V: Op0);
22246 SDValue SourceOp1 = peekThroughBitcasts(V: Op1);
22247
22248 // truncating uzp1(x, y) -> xtn(concat (x, y))
22249 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
22250 EVT Op0Ty = SourceOp0.getValueType();
22251 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
22252 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
22253 SDValue Concat =
22254 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL,
22255 VT: Op0Ty.getDoubleNumVectorElementsVT(Context&: *DAG.getContext()),
22256 N1: SourceOp0, N2: SourceOp1);
22257 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ResVT, Operand: Concat);
22258 }
22259 }
22260
22261 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
22262 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
22263 SourceOp1.getOpcode() != ISD::TRUNCATE)
22264 return SDValue();
22265 SourceOp0 = SourceOp0.getOperand(i: 0);
22266 SourceOp1 = SourceOp1.getOperand(i: 0);
22267
22268 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
22269 !SourceOp0.getValueType().isSimple())
22270 return SDValue();
22271
22272 EVT ResultTy;
22273
22274 switch (SourceOp0.getSimpleValueType().SimpleTy) {
22275 case MVT::v2i64:
22276 ResultTy = MVT::v4i32;
22277 break;
22278 case MVT::v4i32:
22279 ResultTy = MVT::v8i16;
22280 break;
22281 case MVT::v8i16:
22282 ResultTy = MVT::v16i8;
22283 break;
22284 default:
22285 return SDValue();
22286 }
22287
22288 SDValue UzpOp0 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ResultTy, Operand: SourceOp0);
22289 SDValue UzpOp1 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ResultTy, Operand: SourceOp1);
22290 SDValue UzpResult =
22291 DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: UzpOp0.getValueType(), N1: UzpOp0, N2: UzpOp1);
22292
22293 EVT BitcastResultTy;
22294
22295 switch (ResVT.getSimpleVT().SimpleTy) {
22296 case MVT::v2i32:
22297 BitcastResultTy = MVT::v2i64;
22298 break;
22299 case MVT::v4i16:
22300 BitcastResultTy = MVT::v4i32;
22301 break;
22302 case MVT::v8i8:
22303 BitcastResultTy = MVT::v8i16;
22304 break;
22305 default:
22306 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
22307 }
22308
22309 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ResVT,
22310 Operand: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: BitcastResultTy, Operand: UzpResult));
22311}
22312
22313static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
22314 unsigned Opc = N->getOpcode();
22315
22316 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
22317 Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
22318 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
22319 Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
22320 "Invalid opcode.");
22321
22322 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
22323 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
22324 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
22325 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
22326 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
22327 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
22328 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
22329 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
22330
22331 SDLoc DL(N);
22332 SDValue Chain = N->getOperand(Num: 0);
22333 SDValue Pg = N->getOperand(Num: 1);
22334 SDValue Base = N->getOperand(Num: 2);
22335 SDValue Offset = N->getOperand(Num: 3);
22336 SDValue Ty = N->getOperand(Num: 4);
22337
22338 EVT ResVT = N->getValueType(ResNo: 0);
22339
22340 const auto OffsetOpc = Offset.getOpcode();
22341 const bool OffsetIsZExt =
22342 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
22343 const bool OffsetIsSExt =
22344 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
22345
22346 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
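  // Illustrative shape: a GLD1 whose offset is a predicated sign-extension
  // from i32 under the same predicate as the gather is rewritten to the
  // corresponding SXTW-addressed gather (UXTW for zero-extension), using the
  // unextended offsets directly and dropping the separate extend node.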
22347 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
22348 SDValue ExtPg = Offset.getOperand(i: 0);
22349 VTSDNode *ExtFrom = cast<VTSDNode>(Val: Offset.getOperand(i: 2).getNode());
22350 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
22351
22352 // If the predicate for the sign- or zero-extended offset is the
22353 // same as the predicate used for this load and the sign-/zero-extension
22354 // was from 32 bits, fold the extension into the gather's addressing mode...
22355 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
22356 SDValue UnextendedOffset = Offset.getOperand(i: 1);
22357
22358 unsigned NewOpc = getGatherVecOpcode(IsScaled: Scaled, IsSigned: OffsetIsSExt, NeedsExtend: true);
22359 if (Signed)
22360 NewOpc = getSignExtendedGatherOpcode(Opcode: NewOpc);
22361
22362 return DAG.getNode(Opcode: NewOpc, DL, ResultTys: {ResVT, MVT::Other},
22363 Ops: {Chain, Pg, Base, UnextendedOffset, Ty});
22364 }
22365 }
22366
22367 return SDValue();
22368}
22369
22370/// Optimize a vector shift instruction and its operand if shifted out
22371/// bits are not used.
22372static SDValue performVectorShiftCombine(SDNode *N,
22373 const AArch64TargetLowering &TLI,
22374 TargetLowering::DAGCombinerInfo &DCI) {
22375 assert(N->getOpcode() == AArch64ISD::VASHR ||
22376 N->getOpcode() == AArch64ISD::VLSHR);
22377
22378 SDValue Op = N->getOperand(Num: 0);
22379 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
22380
22381 unsigned ShiftImm = N->getConstantOperandVal(Num: 1);
22382 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
22383
22384 // Remove sign_extend_inreg (ashr(shl(x))) based on the number of sign bits.
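  // For example, vashr(vshl(x, 24), 24) on v4i32 is a no-op when x already
  // has at least 25 sign bits, so x is returned directly.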
22385 if (N->getOpcode() == AArch64ISD::VASHR &&
22386 Op.getOpcode() == AArch64ISD::VSHL &&
22387 N->getOperand(Num: 1) == Op.getOperand(i: 1))
22388 if (DCI.DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0)) > ShiftImm)
22389 return Op.getOperand(i: 0);
22390
22391 // If the shift is exact, the shifted out bits matter.
22392 if (N->getFlags().hasExact())
22393 return SDValue();
22394
22395 APInt ShiftedOutBits = APInt::getLowBitsSet(numBits: OpScalarSize, loBitsSet: ShiftImm);
22396 APInt DemandedMask = ~ShiftedOutBits;
22397
22398 if (TLI.SimplifyDemandedBits(Op, DemandedBits: DemandedMask, DCI))
22399 return SDValue(N, 0);
22400
22401 return SDValue();
22402}
22403
22404static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
22405 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
22406 // This transform works in partnership with performSetCCPunpkCombine to
22407 // remove unnecessary transfer of predicates into standard registers and back
22408 if (N->getOperand(Num: 0).getOpcode() == ISD::SIGN_EXTEND &&
22409 N->getOperand(Num: 0)->getOperand(Num: 0)->getValueType(ResNo: 0).getScalarType() ==
22410 MVT::i1) {
22411 SDValue CC = N->getOperand(Num: 0)->getOperand(Num: 0);
22412 auto VT = CC->getValueType(ResNo: 0).getHalfNumVectorElementsVT(Context&: *DAG.getContext());
22413 SDValue Unpk = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SDLoc(N), VT, N1: CC,
22414 N2: DAG.getVectorIdxConstant(Val: 0, DL: SDLoc(N)));
22415 return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: Unpk);
22416 }
22417
22418 return SDValue();
22419}
22420
22421/// Target-specific DAG combine function for post-increment LD1 (lane) and
22422/// post-increment LD1R.
22423static SDValue performPostLD1Combine(SDNode *N,
22424 TargetLowering::DAGCombinerInfo &DCI,
22425 bool IsLaneOp) {
22426 if (DCI.isBeforeLegalizeOps())
22427 return SDValue();
22428
22429 SelectionDAG &DAG = DCI.DAG;
22430 EVT VT = N->getValueType(ResNo: 0);
22431
22432 if (!VT.is128BitVector() && !VT.is64BitVector())
22433 return SDValue();
22434
22435 unsigned LoadIdx = IsLaneOp ? 1 : 0;
22436 SDNode *LD = N->getOperand(Num: LoadIdx).getNode();
// If it is not a LOAD, we cannot do this combine.
22438 if (LD->getOpcode() != ISD::LOAD)
22439 return SDValue();
22440
22441 // The vector lane must be a constant in the LD1LANE opcode.
22442 SDValue Lane;
22443 if (IsLaneOp) {
22444 Lane = N->getOperand(Num: 2);
22445 auto *LaneC = dyn_cast<ConstantSDNode>(Val&: Lane);
22446 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
22447 return SDValue();
22448 }
22449
22450 LoadSDNode *LoadSDN = cast<LoadSDNode>(Val: LD);
22451 EVT MemVT = LoadSDN->getMemoryVT();
22452 // Check if memory operand is the same type as the vector element.
22453 if (MemVT != VT.getVectorElementType())
22454 return SDValue();
22455
22456 // Check if there are other uses. If so, do not combine as it will introduce
22457 // an extra load.
22458 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
22459 ++UI) {
22460 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
22461 continue;
22462 if (*UI != N)
22463 return SDValue();
22464 }
22465
22466 // If there is one use and it can splat the value, prefer that operation.
22467 // TODO: This could be expanded to more operations if they reliably use the
22468 // index variants.
22469 if (N->hasOneUse()) {
22470 unsigned UseOpc = N->use_begin()->getOpcode();
22471 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
22472 return SDValue();
22473 }
22474
22475 SDValue Addr = LD->getOperand(Num: 1);
22476 SDValue Vector = N->getOperand(Num: 0);
22477 // Search for a use of the address operand that is an increment.
22478 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
22479 Addr.getNode()->use_end(); UI != UE; ++UI) {
22480 SDNode *User = *UI;
22481 if (User->getOpcode() != ISD::ADD
22482 || UI.getUse().getResNo() != Addr.getResNo())
22483 continue;
22484
22485 // If the increment is a constant, it must match the memory ref size.
22486 SDValue Inc = User->getOperand(Num: User->getOperand(Num: 0) == Addr ? 1 : 0);
22487 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Val: Inc.getNode())) {
22488 uint32_t IncVal = CInc->getZExtValue();
22489 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
22490 if (IncVal != NumBytes)
22491 continue;
22492 Inc = DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64);
22493 }
22494
// To avoid creating a cycle, make sure that neither the load nor the add
// is a predecessor of the other or of the Vector.
22497 SmallPtrSet<const SDNode *, 32> Visited;
22498 SmallVector<const SDNode *, 16> Worklist;
22499 Visited.insert(Ptr: Addr.getNode());
22500 Worklist.push_back(Elt: User);
22501 Worklist.push_back(Elt: LD);
22502 Worklist.push_back(Elt: Vector.getNode());
22503 if (SDNode::hasPredecessorHelper(N: LD, Visited, Worklist) ||
22504 SDNode::hasPredecessorHelper(N: User, Visited, Worklist))
22505 continue;
22506
22507 SmallVector<SDValue, 8> Ops;
22508 Ops.push_back(Elt: LD->getOperand(Num: 0)); // Chain
22509 if (IsLaneOp) {
22510 Ops.push_back(Elt: Vector); // The vector to be inserted
22511 Ops.push_back(Elt: Lane); // The lane to be inserted in the vector
22512 }
22513 Ops.push_back(Elt: Addr);
22514 Ops.push_back(Elt: Inc);
22515
22516 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
22517 SDVTList SDTys = DAG.getVTList(VTs: Tys);
22518 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
22519 SDValue UpdN = DAG.getMemIntrinsicNode(Opcode: NewOp, dl: SDLoc(N), VTList: SDTys, Ops,
22520 MemVT,
22521 MMO: LoadSDN->getMemOperand());
22522
22523 // Update the uses.
22524 SDValue NewResults[] = {
22525 SDValue(LD, 0), // The result of load
22526 SDValue(UpdN.getNode(), 2) // Chain
22527 };
22528 DCI.CombineTo(N: LD, To: NewResults);
22529 DCI.CombineTo(N, Res: SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
22530 DCI.CombineTo(N: User, Res: SDValue(UpdN.getNode(), 1)); // Write back register
22531
22532 break;
22533 }
22534 return SDValue();
22535}
22536
22537/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
22538/// address translation.
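///
/// For example (illustrative): with TBI, bits 63:56 of an address are never
/// demanded, so a tag-clearing mask that only feeds this address, such as
/// (load (and X, 0x00ffffffffffffff)), can be simplified to (load X).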
22539static bool performTBISimplification(SDValue Addr,
22540 TargetLowering::DAGCombinerInfo &DCI,
22541 SelectionDAG &DAG) {
22542 APInt DemandedMask = APInt::getLowBitsSet(numBits: 64, loBitsSet: 56);
22543 KnownBits Known;
22544 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
22545 !DCI.isBeforeLegalizeOps());
22546 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22547 if (TLI.SimplifyDemandedBits(Op: Addr, DemandedBits: DemandedMask, Known, TLO)) {
22548 DCI.CommitTargetLoweringOpt(TLO);
22549 return true;
22550 }
22551 return false;
22552}
22553
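// Fold a truncating store of an extended value back into a plain store of the
// original value when the memory type matches the pre-extension type, e.g.
// (truncstore i8 (zext X:i8 to i32), p) -> (store i8 X, p).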
22554static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
22555 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
22556 "Expected STORE dag node in input!");
22557
22558 if (auto Store = dyn_cast<StoreSDNode>(Val: N)) {
22559 if (!Store->isTruncatingStore() || Store->isIndexed())
22560 return SDValue();
22561 SDValue Ext = Store->getValue();
22562 auto ExtOpCode = Ext.getOpcode();
22563 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
22564 ExtOpCode != ISD::ANY_EXTEND)
22565 return SDValue();
22566 SDValue Orig = Ext->getOperand(Num: 0);
22567 if (Store->getMemoryVT() != Orig.getValueType())
22568 return SDValue();
22569 return DAG.getStore(Chain: Store->getChain(), dl: SDLoc(Store), Val: Orig,
22570 Ptr: Store->getBasePtr(), MMO: Store->getMemOperand());
22571 }
22572
22573 return SDValue();
22574}
22575
22576// A custom combine to lower load <3 x i8> as the more efficient sequence
22577// below:
22578// ldrb wX, [x0, #2]
22579// ldrh wY, [x0]
22580// orr wX, wY, wX, lsl #16
22581// fmov s0, wX
22582//
22583// Note that an alternative sequence with even fewer (although usually more
22584// complex/expensive) instructions would be:
22585// ld1r.4h { v0 }, [x0], #2
22586// ld1.b { v0 }[2], [x0]
22587//
22588// Generating this sequence unfortunately results in noticeably worse codegen
22589// for code that extends the loaded v3i8, due to legalization breaking vector
22590// shuffle detection in a way that is very difficult to work around.
22591// TODO: Revisit once v3i8 legalization has been improved in general.
22592static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
22593 EVT MemVT = LD->getMemoryVT();
22594 if (MemVT != EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i8, NumElements: 3) ||
22595 LD->getOriginalAlign() >= 4)
22596 return SDValue();
22597
22598 SDLoc DL(LD);
22599 MachineFunction &MF = DAG.getMachineFunction();
22600 SDValue Chain = LD->getChain();
22601 SDValue BasePtr = LD->getBasePtr();
22602 MachineMemOperand *MMO = LD->getMemOperand();
22603 assert(LD->getOffset().isUndef() && "undef offset expected");
22604
22605 // Load 2 x i8, then 1 x i8.
22606 SDValue L16 = DAG.getLoad(VT: MVT::i16, dl: DL, Chain, Ptr: BasePtr, MMO);
22607 TypeSize Offset2 = TypeSize::getFixed(ExactSize: 2);
22608 SDValue L8 = DAG.getLoad(VT: MVT::i8, dl: DL, Chain,
22609 Ptr: DAG.getMemBasePlusOffset(Base: BasePtr, Offset: Offset2, DL),
22610 MMO: MF.getMachineMemOperand(MMO, Offset: 2, Size: 1));
22611
22612 // Extend to i32.
22613 SDValue Ext16 = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: L16);
22614 SDValue Ext8 = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: L8);
22615
22616 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
22617 SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: Ext8,
22618 N2: DAG.getConstant(Val: 16, DL, VT: MVT::i32));
22619 SDValue Or = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Ext16, N2: Shl);
22620 SDValue Cast = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v4i8, Operand: Or);
22621
22622 // Extract v3i8 again.
22623 SDValue Extract = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MemVT, N1: Cast,
22624 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
22625 SDValue TokenFactor = DAG.getNode(
22626 Opcode: ISD::TokenFactor, DL, VT: MVT::Other,
22627 Ops: {SDValue(cast<SDNode>(Val&: L16), 1), SDValue(cast<SDNode>(Val&: L8), 1)});
22628 return DAG.getMergeValues(Ops: {Extract, TokenFactor}, dl: DL);
22629}
22630
// Perform TBI simplification if supported by the target and try to break up
// non-temporal loads larger than 256 bits for odd-sized types so LDNPQ 256-bit
// load instructions can be selected.
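//
// For example (illustrative), a non-temporal load of v12i32 (384 bits) is
// split into one 256-bit v8i32 load and one 128-bit v4i32 load; the remainder
// is widened via INSERT_SUBVECTOR into an undef v8i32, the pieces are
// concatenated, and the original v12i32 value is extracted back out with
// EXTRACT_SUBVECTOR.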
22634static SDValue performLOADCombine(SDNode *N,
22635 TargetLowering::DAGCombinerInfo &DCI,
22636 SelectionDAG &DAG,
22637 const AArch64Subtarget *Subtarget) {
22638 if (Subtarget->supportsAddressTopByteIgnored())
22639 performTBISimplification(Addr: N->getOperand(Num: 1), DCI, DAG);
22640
22641 LoadSDNode *LD = cast<LoadSDNode>(Val: N);
22642 if (LD->isVolatile() || !Subtarget->isLittleEndian())
22643 return SDValue(N, 0);
22644
22645 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
22646 return Res;
22647
22648 if (!LD->isNonTemporal())
22649 return SDValue(N, 0);
22650
22651 EVT MemVT = LD->getMemoryVT();
22652 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
22653 MemVT.getSizeInBits() % 256 == 0 ||
22654 256 % MemVT.getScalarSizeInBits() != 0)
22655 return SDValue(N, 0);
22656
22657 SDLoc DL(LD);
22658 SDValue Chain = LD->getChain();
22659 SDValue BasePtr = LD->getBasePtr();
22660 SDNodeFlags Flags = LD->getFlags();
22661 SmallVector<SDValue, 4> LoadOps;
22662 SmallVector<SDValue, 4> LoadOpsChain;
// Replace any non-temporal load wider than 256 bits with a series of 256-bit
// loads plus a final load narrower than 256 bits. This way we can utilize
// 256-bit loads and reduce the number of load instructions generated.
22666 MVT NewVT =
22667 MVT::getVectorVT(VT: MemVT.getVectorElementType().getSimpleVT(),
22668 NumElements: 256 / MemVT.getVectorElementType().getSizeInBits());
22669 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
// Create all 256-bit loads, starting at byte offset 0 and going up to
// (Num256Loads - 1) * 32.
22671 for (unsigned I = 0; I < Num256Loads; I++) {
22672 unsigned PtrOffset = I * 32;
22673 SDValue NewPtr = DAG.getMemBasePlusOffset(
22674 Base: BasePtr, Offset: TypeSize::getFixed(ExactSize: PtrOffset), DL, Flags);
22675 Align NewAlign = commonAlignment(A: LD->getAlign(), Offset: PtrOffset);
22676 SDValue NewLoad = DAG.getLoad(
22677 VT: NewVT, dl: DL, Chain, Ptr: NewPtr, PtrInfo: LD->getPointerInfo().getWithOffset(O: PtrOffset),
22678 Alignment: NewAlign, MMOFlags: LD->getMemOperand()->getFlags(), AAInfo: LD->getAAInfo());
22679 LoadOps.push_back(Elt: NewLoad);
22680 LoadOpsChain.push_back(Elt: SDValue(cast<SDNode>(Val&: NewLoad), 1));
22681 }
22682
// Process the remaining bits of the load operation.
// This is done by creating an UNDEF vector to match the size of the
// 256-bit loads and inserting the remaining load into it. We extract the
// original load type at the end using an EXTRACT_SUBVECTOR instruction.
22687 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
22688 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
22689 MVT RemainingVT = MVT::getVectorVT(
22690 VT: MemVT.getVectorElementType().getSimpleVT(),
22691 NumElements: BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
22692 SDValue NewPtr = DAG.getMemBasePlusOffset(
22693 Base: BasePtr, Offset: TypeSize::getFixed(ExactSize: PtrOffset), DL, Flags);
22694 Align NewAlign = commonAlignment(A: LD->getAlign(), Offset: PtrOffset);
22695 SDValue RemainingLoad =
22696 DAG.getLoad(VT: RemainingVT, dl: DL, Chain, Ptr: NewPtr,
22697 PtrInfo: LD->getPointerInfo().getWithOffset(O: PtrOffset), Alignment: NewAlign,
22698 MMOFlags: LD->getMemOperand()->getFlags(), AAInfo: LD->getAAInfo());
22699 SDValue UndefVector = DAG.getUNDEF(VT: NewVT);
22700 SDValue InsertIdx = DAG.getVectorIdxConstant(Val: 0, DL);
SDValue ExtendedRemainingLoad =
DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: NewVT,
Ops: {UndefVector, RemainingLoad, InsertIdx});
LoadOps.push_back(Elt: ExtendedRemainingLoad);
22705 LoadOpsChain.push_back(Elt: SDValue(cast<SDNode>(Val&: RemainingLoad), 1));
22706 EVT ConcatVT =
22707 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getScalarType(),
22708 NumElements: LoadOps.size() * NewVT.getVectorNumElements());
22709 SDValue ConcatVectors =
22710 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: ConcatVT, Ops: LoadOps);
22711 // Extract the original vector type size.
22712 SDValue ExtractSubVector =
22713 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MemVT,
22714 Ops: {ConcatVectors, DAG.getVectorIdxConstant(Val: 0, DL)});
22715 SDValue TokenFactor =
22716 DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: LoadOpsChain);
22717 return DAG.getMergeValues(Ops: {ExtractSubVector, TokenFactor}, dl: DL);
22718}
22719
22720static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
22721 EVT VecVT = Op.getValueType();
22722 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
22723 "Need boolean vector type.");
22724
22725 if (Depth > 3)
22726 return MVT::INVALID_SIMPLE_VALUE_TYPE;
22727
22728 // We can get the base type from a vector compare or truncate.
22729 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
22730 return Op.getOperand(i: 0).getValueType();
22731
22732 // If an operand is a bool vector, continue looking.
22733 EVT BaseVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
22734 for (SDValue Operand : Op->op_values()) {
22735 if (Operand.getValueType() != VecVT)
22736 continue;
22737
22738 EVT OperandVT = tryGetOriginalBoolVectorType(Op: Operand, Depth: Depth + 1);
22739 if (!BaseVT.isSimple())
22740 BaseVT = OperandVT;
22741 else if (OperandVT != BaseVT)
22742 return MVT::INVALID_SIMPLE_VALUE_TYPE;
22743 }
22744
22745 return BaseVT;
22746}
22747
// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
// iN, we can use a trick that extracts the i^th bit from the i^th element and
// then performs a vector add-reduction to get a scalar bitmask. This requires
// that each element's bits are either all 1 or all 0.
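//
// For example (illustrative), a v4i32 comparison result whose lanes are all
// ones or all zeros is ANDed with <1, 2, 4, 8> and then reduced with
// VECREDUCE_ADD, leaving a 4-bit lane bitmask in a scalar register.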
22752static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
22753 SDLoc DL(N);
22754 SDValue ComparisonResult(N, 0);
22755 EVT VecVT = ComparisonResult.getValueType();
22756 assert(VecVT.isVector() && "Must be a vector type");
22757
22758 unsigned NumElts = VecVT.getVectorNumElements();
22759 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
22760 return SDValue();
22761
22762 if (VecVT.getVectorElementType() != MVT::i1 &&
22763 !DAG.getTargetLoweringInfo().isTypeLegal(VT: VecVT))
22764 return SDValue();
22765
22766 // If we can find the original types to work on instead of a vector of i1,
22767 // we can avoid extend/extract conversion instructions.
22768 if (VecVT.getVectorElementType() == MVT::i1) {
22769 VecVT = tryGetOriginalBoolVectorType(Op: ComparisonResult);
22770 if (!VecVT.isSimple()) {
22771 unsigned BitsPerElement = std::max(a: 64 / NumElts, b: 8u); // >= 64-bit vector
22772 VecVT = MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: BitsPerElement), NumElements: NumElts);
22773 }
22774 }
22775 VecVT = VecVT.changeVectorElementTypeToInteger();
22776
22777 // Large vectors don't map directly to this conversion, so to avoid too many
22778 // edge cases, we don't apply it here. The conversion will likely still be
22779 // applied later via multiple smaller vectors, whose results are concatenated.
22780 if (VecVT.getSizeInBits() > 128)
22781 return SDValue();
22782
22783 // Ensure that all elements' bits are either 0s or 1s.
22784 ComparisonResult = DAG.getSExtOrTrunc(Op: ComparisonResult, DL, VT: VecVT);
22785
22786 SmallVector<SDValue, 16> MaskConstants;
22787 if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
22788 VecVT == MVT::v16i8) {
// v16i8 is a special case, as we have 16 entries but only 8 positional bits
// per entry. We split it into two halves, apply the mask, zip the halves to
// create 8 x 16-bit values, and then perform the vector reduce.
22792 for (unsigned Half = 0; Half < 2; ++Half) {
22793 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
22794 MaskConstants.push_back(Elt: DAG.getConstant(Val: MaskBit, DL, VT: MVT::i32));
22795 }
22796 }
22797 SDValue Mask = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: VecVT, Ops: MaskConstants);
22798 SDValue RepresentativeBits =
22799 DAG.getNode(Opcode: ISD::AND, DL, VT: VecVT, N1: ComparisonResult, N2: Mask);
22800
22801 SDValue UpperRepresentativeBits =
22802 DAG.getNode(Opcode: AArch64ISD::EXT, DL, VT: VecVT, N1: RepresentativeBits,
22803 N2: RepresentativeBits, N3: DAG.getConstant(Val: 8, DL, VT: MVT::i32));
22804 SDValue Zipped = DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: VecVT,
22805 N1: RepresentativeBits, N2: UpperRepresentativeBits);
22806 Zipped = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v8i16, Operand: Zipped);
22807 return DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: MVT::i16, Operand: Zipped);
22808 }
22809
22810 // All other vector sizes.
22811 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
22812 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
22813 MaskConstants.push_back(Elt: DAG.getConstant(Val: MaskBit, DL, VT: MVT::i64));
22814 }
22815
22816 SDValue Mask = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: VecVT, Ops: MaskConstants);
22817 SDValue RepresentativeBits =
22818 DAG.getNode(Opcode: ISD::AND, DL, VT: VecVT, N1: ComparisonResult, N2: Mask);
22819 EVT ResultVT = MVT::getIntegerVT(BitWidth: std::max<unsigned>(
22820 a: NumElts, b: VecVT.getVectorElementType().getSizeInBits()));
22821 return DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: ResultVT, Operand: RepresentativeBits);
22822}
22823
22824static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
22825 StoreSDNode *Store) {
22826 if (!Store->isTruncatingStore())
22827 return SDValue();
22828
22829 SDLoc DL(Store);
22830 SDValue VecOp = Store->getValue();
22831 EVT VT = VecOp.getValueType();
22832 EVT MemVT = Store->getMemoryVT();
22833
22834 if (!MemVT.isVector() || !VT.isVector() ||
22835 MemVT.getVectorElementType() != MVT::i1)
22836 return SDValue();
22837
22838 // If we are storing a vector that we are currently building, let
22839 // `scalarizeVectorStore()` handle this more efficiently.
22840 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
22841 return SDValue();
22842
22843 VecOp = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVT, Operand: VecOp);
22844 SDValue VectorBits = vectorToScalarBitmask(N: VecOp.getNode(), DAG);
22845 if (!VectorBits)
22846 return SDValue();
22847
22848 EVT StoreVT =
22849 EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getStoreSizeInBits());
22850 SDValue ExtendedBits = DAG.getZExtOrTrunc(Op: VectorBits, DL, VT: StoreVT);
22851 return DAG.getStore(Chain: Store->getChain(), dl: DL, Val: ExtendedBits, Ptr: Store->getBasePtr(),
22852 MMO: Store->getMemOperand());
22853}
22854
22855bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
22856 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
22857 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
22858 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
22859}
22860
22861// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
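// For example (roughly; exact selection may differ), the truncated value is
// widened to 4 elements, bitcast to bytes, and lanes 2, 1 and 0 are stored as
// single bytes at offsets #2, #1 and #0 from the base pointer.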
22862static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
22863 const AArch64Subtarget *Subtarget) {
22864 SDValue Value = ST->getValue();
22865 EVT ValueVT = Value.getValueType();
22866
22867 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
22868 Value.getOpcode() != ISD::TRUNCATE ||
22869 ValueVT != EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i8, NumElements: 3))
22870 return SDValue();
22871
22872 assert(ST->getOffset().isUndef() && "undef offset expected");
22873 SDLoc DL(ST);
22874 auto WideVT = EVT::getVectorVT(
22875 Context&: *DAG.getContext(),
22876 VT: Value->getOperand(Num: 0).getValueType().getVectorElementType(), NumElements: 4);
22877 SDValue UndefVector = DAG.getUNDEF(VT: WideVT);
22878 SDValue WideTrunc = DAG.getNode(
22879 Opcode: ISD::INSERT_SUBVECTOR, DL, VT: WideVT,
22880 Ops: {UndefVector, Value->getOperand(Num: 0), DAG.getVectorIdxConstant(Val: 0, DL)});
22881 SDValue Cast = DAG.getNode(
22882 Opcode: ISD::BITCAST, DL, VT: WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
22883 Operand: WideTrunc);
22884
22885 MachineFunction &MF = DAG.getMachineFunction();
22886 SDValue Chain = ST->getChain();
22887 MachineMemOperand *MMO = ST->getMemOperand();
22888 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
22889 SDValue E2 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i8, N1: Cast,
22890 N2: DAG.getConstant(Val: 2 * IdxScale, DL, VT: MVT::i64));
22891 TypeSize Offset2 = TypeSize::getFixed(ExactSize: 2);
22892 SDValue Ptr2 = DAG.getMemBasePlusOffset(Base: ST->getBasePtr(), Offset: Offset2, DL);
22893 Chain = DAG.getStore(Chain, dl: DL, Val: E2, Ptr: Ptr2, MMO: MF.getMachineMemOperand(MMO, Offset: 2, Size: 1));
22894
22895 SDValue E1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i8, N1: Cast,
22896 N2: DAG.getConstant(Val: 1 * IdxScale, DL, VT: MVT::i64));
22897 TypeSize Offset1 = TypeSize::getFixed(ExactSize: 1);
22898 SDValue Ptr1 = DAG.getMemBasePlusOffset(Base: ST->getBasePtr(), Offset: Offset1, DL);
22899 Chain = DAG.getStore(Chain, dl: DL, Val: E1, Ptr: Ptr1, MMO: MF.getMachineMemOperand(MMO, Offset: 1, Size: 1));
22900
22901 SDValue E0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i8, N1: Cast,
22902 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
22903 Chain = DAG.getStore(Chain, dl: DL, Val: E0, Ptr: ST->getBasePtr(),
22904 MMO: MF.getMachineMemOperand(MMO, Offset: 0, Size: 1));
22905 return Chain;
22906}
22907
22908static SDValue performSTORECombine(SDNode *N,
22909 TargetLowering::DAGCombinerInfo &DCI,
22910 SelectionDAG &DAG,
22911 const AArch64Subtarget *Subtarget) {
22912 StoreSDNode *ST = cast<StoreSDNode>(Val: N);
22913 SDValue Chain = ST->getChain();
22914 SDValue Value = ST->getValue();
22915 SDValue Ptr = ST->getBasePtr();
22916 EVT ValueVT = Value.getValueType();
22917
22918 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
22919 EVT EltVT = VT.getVectorElementType();
22920 return EltVT == MVT::f32 || EltVT == MVT::f64;
22921 };
22922
22923 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
22924 return Res;
22925
22926 // If this is an FP_ROUND followed by a store, fold this into a truncating
22927 // store. We can do this even if this is already a truncstore.
22928 // We purposefully don't care about legality of the nodes here as we know
22929 // they can be split down into something legal.
22930 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
22931 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
22932 Subtarget->useSVEForFixedLengthVectors() &&
22933 ValueVT.isFixedLengthVector() &&
22934 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
22935 hasValidElementTypeForFPTruncStore(Value.getOperand(i: 0).getValueType()))
22936 return DAG.getTruncStore(Chain, dl: SDLoc(N), Val: Value.getOperand(i: 0), Ptr,
22937 SVT: ST->getMemoryVT(), MMO: ST->getMemOperand());
22938
22939 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
22940 return Split;
22941
22942 if (Subtarget->supportsAddressTopByteIgnored() &&
22943 performTBISimplification(Addr: N->getOperand(Num: 2), DCI, DAG))
22944 return SDValue(N, 0);
22945
22946 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
22947 return Store;
22948
22949 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, Store: ST))
22950 return Store;
22951
22952 if (ST->isTruncatingStore()) {
22953 EVT StoreVT = ST->getMemoryVT();
22954 if (!isHalvingTruncateOfLegalScalableType(SrcVT: ValueVT, DstVT: StoreVT))
22955 return SDValue();
22956 if (SDValue Rshrnb =
22957 trySimplifySrlAddToRshrnb(Srl: ST->getOperand(Num: 1), DAG, Subtarget)) {
22958 return DAG.getTruncStore(Chain: ST->getChain(), dl: ST, Val: Rshrnb, Ptr: ST->getBasePtr(),
22959 SVT: StoreVT, MMO: ST->getMemOperand());
22960 }
22961 }
22962
22963 return SDValue();
22964}
22965
22966static SDValue performMSTORECombine(SDNode *N,
22967 TargetLowering::DAGCombinerInfo &DCI,
22968 SelectionDAG &DAG,
22969 const AArch64Subtarget *Subtarget) {
22970 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(Val: N);
22971 SDValue Value = MST->getValue();
22972 SDValue Mask = MST->getMask();
22973 SDLoc DL(N);
22974
22975 // If this is a UZP1 followed by a masked store, fold this into a masked
22976 // truncating store. We can do this even if this is already a masked
22977 // truncstore.
22978 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
22979 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
22980 Value.getValueType().isInteger()) {
22981 Value = Value.getOperand(i: 0);
22982 if (Value.getOpcode() == ISD::BITCAST) {
22983 EVT HalfVT =
22984 Value.getValueType().getHalfNumVectorElementsVT(Context&: *DAG.getContext());
22985 EVT InVT = Value.getOperand(i: 0).getValueType();
22986
22987 if (HalfVT.widenIntegerVectorElementType(Context&: *DAG.getContext()) == InVT) {
22988 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22989 unsigned PgPattern = Mask->getConstantOperandVal(Num: 0);
22990
22991 // Ensure we can double the size of the predicate pattern
22992 unsigned NumElts = getNumElementsFromSVEPredPattern(Pattern: PgPattern);
22993 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
22994 MinSVESize) {
22995 Mask = getPTrue(DAG, DL, VT: InVT.changeVectorElementType(EltVT: MVT::i1),
22996 Pattern: PgPattern);
22997 return DAG.getMaskedStore(Chain: MST->getChain(), dl: DL, Val: Value.getOperand(i: 0),
22998 Base: MST->getBasePtr(), Offset: MST->getOffset(), Mask,
22999 MemVT: MST->getMemoryVT(), MMO: MST->getMemOperand(),
23000 AM: MST->getAddressingMode(),
23001 /*IsTruncating=*/true);
23002 }
23003 }
23004 }
23005 }
23006
23007 if (MST->isTruncatingStore()) {
23008 EVT ValueVT = Value->getValueType(ResNo: 0);
23009 EVT MemVT = MST->getMemoryVT();
23010 if (!isHalvingTruncateOfLegalScalableType(SrcVT: ValueVT, DstVT: MemVT))
23011 return SDValue();
23012 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Srl: Value, DAG, Subtarget)) {
23013 return DAG.getMaskedStore(Chain: MST->getChain(), dl: DL, Val: Rshrnb, Base: MST->getBasePtr(),
23014 Offset: MST->getOffset(), Mask: MST->getMask(),
23015 MemVT: MST->getMemoryVT(), MMO: MST->getMemOperand(),
23016 AM: MST->getAddressingMode(), IsTruncating: true);
23017 }
23018 }
23019
23020 return SDValue();
23021}
23022
23023/// \return true if part of the index was folded into the Base.
23024static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
23025 SDLoc DL, SelectionDAG &DAG) {
23026 // This function assumes a vector of i64 indices.
23027 EVT IndexVT = Index.getValueType();
23028 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
23029 return false;
23030
23031 // Simplify:
23032 // BasePtr = Ptr
23033 // Index = X + splat(Offset)
23034 // ->
23035 // BasePtr = Ptr + Offset * scale.
23036 // Index = X
23037 if (Index.getOpcode() == ISD::ADD) {
23038 if (auto Offset = DAG.getSplatValue(V: Index.getOperand(i: 1))) {
23039 Offset = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i64, N1: Offset, N2: Scale);
23040 BasePtr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: BasePtr, N2: Offset);
23041 Index = Index.getOperand(i: 0);
23042 return true;
23043 }
23044 }
23045
23046 // Simplify:
23047 // BasePtr = Ptr
23048 // Index = (X + splat(Offset)) << splat(Shift)
23049 // ->
// BasePtr = Ptr + (Offset << Shift) * scale
23051 // Index = X << splat(shift)
23052 if (Index.getOpcode() == ISD::SHL &&
23053 Index.getOperand(i: 0).getOpcode() == ISD::ADD) {
23054 SDValue Add = Index.getOperand(i: 0);
23055 SDValue ShiftOp = Index.getOperand(i: 1);
23056 SDValue OffsetOp = Add.getOperand(i: 1);
23057 if (auto Shift = DAG.getSplatValue(V: ShiftOp))
23058 if (auto Offset = DAG.getSplatValue(V: OffsetOp)) {
23059 Offset = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i64, N1: Offset, N2: Shift);
23060 Offset = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i64, N1: Offset, N2: Scale);
23061 BasePtr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: BasePtr, N2: Offset);
23062 Index = DAG.getNode(Opcode: ISD::SHL, DL, VT: Index.getValueType(),
23063 N1: Add.getOperand(i: 0), N2: ShiftOp);
23064 return true;
23065 }
23066 }
23067
23068 return false;
23069}
23070
23071// Analyse the specified address returning true if a more optimal addressing
23072// mode is available. When returning true all parameters are updated to reflect
23073// their recommended values.
23074static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
23075 SDValue &BasePtr, SDValue &Index,
23076 SelectionDAG &DAG) {
23077 // Try to iteratively fold parts of the index into the base pointer to
23078 // simplify the index as much as possible.
23079 bool Changed = false;
23080 while (foldIndexIntoBase(BasePtr, Index, Scale: N->getScale(), DL: SDLoc(N), DAG))
23081 Changed = true;
23082
23083 // Only consider element types that are pointer sized as smaller types can
23084 // be easily promoted.
23085 EVT IndexVT = Index.getValueType();
23086 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
23087 return Changed;
23088
23089 // Can indices be trivially shrunk?
23090 EVT DataVT = N->getOperand(Num: 1).getValueType();
// Don't attempt to shrink the index for fixed vectors of 64-bit data since it
// will later be re-extended to 64 bits in legalization.
23093 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
23094 return Changed;
23095 if (ISD::isVectorShrinkable(N: Index.getNode(), NewEltSize: 32, Signed: N->isIndexSigned())) {
23096 EVT NewIndexVT = IndexVT.changeVectorElementType(EltVT: MVT::i32);
23097 Index = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SDLoc(N), VT: NewIndexVT, Operand: Index);
23098 return true;
23099 }
23100
23101 // Match:
23102 // Index = step(const)
23103 int64_t Stride = 0;
23104 if (Index.getOpcode() == ISD::STEP_VECTOR) {
23105 Stride = cast<ConstantSDNode>(Val: Index.getOperand(i: 0))->getSExtValue();
23106 }
23107 // Match:
23108 // Index = step(const) << shift(const)
23109 else if (Index.getOpcode() == ISD::SHL &&
23110 Index.getOperand(i: 0).getOpcode() == ISD::STEP_VECTOR) {
23111 SDValue RHS = Index.getOperand(i: 1);
23112 if (auto *Shift =
23113 dyn_cast_or_null<ConstantSDNode>(Val: DAG.getSplatValue(V: RHS))) {
23114 int64_t Step = (int64_t)Index.getOperand(i: 0).getConstantOperandVal(i: 1);
23115 Stride = Step << Shift->getZExtValue();
23116 }
23117 }
23118
23119 // Return early because no supported pattern is found.
23120 if (Stride == 0)
23121 return Changed;
23122
23123 if (Stride < std::numeric_limits<int32_t>::min() ||
23124 Stride > std::numeric_limits<int32_t>::max())
23125 return Changed;
23126
23127 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
23128 unsigned MaxVScale =
23129 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
23130 int64_t LastElementOffset =
23131 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
23132
23133 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
23134 LastElementOffset > std::numeric_limits<int32_t>::max())
23135 return Changed;
23136
23137 EVT NewIndexVT = IndexVT.changeVectorElementType(EltVT: MVT::i32);
// Stride is not scaled explicitly by 'Scale' because that scaling happens as
// part of the gather/scatter addressing mode.
23140 Index = DAG.getStepVector(DL: SDLoc(N), ResVT: NewIndexVT, StepVal: APInt(32, Stride));
23141 return true;
23142}
23143
23144static SDValue performMaskedGatherScatterCombine(
23145 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
23146 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(Val: N);
23147 assert(MGS && "Can only combine gather load or scatter store nodes");
23148
23149 if (!DCI.isBeforeLegalize())
23150 return SDValue();
23151
23152 SDLoc DL(MGS);
23153 SDValue Chain = MGS->getChain();
23154 SDValue Scale = MGS->getScale();
23155 SDValue Index = MGS->getIndex();
23156 SDValue Mask = MGS->getMask();
23157 SDValue BasePtr = MGS->getBasePtr();
23158 ISD::MemIndexType IndexType = MGS->getIndexType();
23159
23160 if (!findMoreOptimalIndexType(N: MGS, BasePtr, Index, DAG))
23161 return SDValue();
23162
// A more optimal addressing mode was found, so rebuild the gather/scatter
// with the updated base pointer and index, which are more legalisation
// friendly.
23165 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(Val: MGS)) {
23166 SDValue PassThru = MGT->getPassThru();
23167 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
23168 return DAG.getMaskedGather(
23169 VTs: DAG.getVTList(VT1: N->getValueType(ResNo: 0), VT2: MVT::Other), MemVT: MGT->getMemoryVT(), dl: DL,
23170 Ops, MMO: MGT->getMemOperand(), IndexType, ExtTy: MGT->getExtensionType());
23171 }
23172 auto *MSC = cast<MaskedScatterSDNode>(Val: MGS);
23173 SDValue Data = MSC->getValue();
23174 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
23175 return DAG.getMaskedScatter(VTs: DAG.getVTList(VT: MVT::Other), MemVT: MSC->getMemoryVT(), dl: DL,
23176 Ops, MMO: MSC->getMemOperand(), IndexType,
23177 IsTruncating: MSC->isTruncatingStore());
23178}
23179
23180/// Target-specific DAG combine function for NEON load/store intrinsics
23181/// to merge base address updates.
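///
/// For example (illustrative), a plain ld2 followed by a separate increment of
/// the accessed size,
///   ld2 { v0.4s, v1.4s }, [x0]
///   add x0, x0, #32
/// can be selected as the post-indexed form:
///   ld2 { v0.4s, v1.4s }, [x0], #32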
23182static SDValue performNEONPostLDSTCombine(SDNode *N,
23183 TargetLowering::DAGCombinerInfo &DCI,
23184 SelectionDAG &DAG) {
23185 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
23186 return SDValue();
23187
23188 unsigned AddrOpIdx = N->getNumOperands() - 1;
23189 SDValue Addr = N->getOperand(Num: AddrOpIdx);
23190
23191 // Search for a use of the address operand that is an increment.
23192 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
23193 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
23194 SDNode *User = *UI;
23195 if (User->getOpcode() != ISD::ADD ||
23196 UI.getUse().getResNo() != Addr.getResNo())
23197 continue;
23198
23199 // Check that the add is independent of the load/store. Otherwise, folding
23200 // it would create a cycle.
23201 SmallPtrSet<const SDNode *, 32> Visited;
23202 SmallVector<const SDNode *, 16> Worklist;
23203 Visited.insert(Ptr: Addr.getNode());
23204 Worklist.push_back(Elt: N);
23205 Worklist.push_back(Elt: User);
23206 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
23207 SDNode::hasPredecessorHelper(N: User, Visited, Worklist))
23208 continue;
23209
23210 // Find the new opcode for the updating load/store.
23211 bool IsStore = false;
23212 bool IsLaneOp = false;
23213 bool IsDupOp = false;
23214 unsigned NewOpc = 0;
23215 unsigned NumVecs = 0;
23216 unsigned IntNo = N->getConstantOperandVal(Num: 1);
23217 switch (IntNo) {
23218 default: llvm_unreachable("unexpected intrinsic for Neon base update");
23219 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
23220 NumVecs = 2; break;
23221 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
23222 NumVecs = 3; break;
23223 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
23224 NumVecs = 4; break;
23225 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
23226 NumVecs = 2; IsStore = true; break;
23227 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
23228 NumVecs = 3; IsStore = true; break;
23229 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
23230 NumVecs = 4; IsStore = true; break;
23231 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
23232 NumVecs = 2; break;
23233 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
23234 NumVecs = 3; break;
23235 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
23236 NumVecs = 4; break;
23237 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
23238 NumVecs = 2; IsStore = true; break;
23239 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
23240 NumVecs = 3; IsStore = true; break;
23241 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
23242 NumVecs = 4; IsStore = true; break;
23243 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
23244 NumVecs = 2; IsDupOp = true; break;
23245 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
23246 NumVecs = 3; IsDupOp = true; break;
23247 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
23248 NumVecs = 4; IsDupOp = true; break;
23249 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
23250 NumVecs = 2; IsLaneOp = true; break;
23251 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
23252 NumVecs = 3; IsLaneOp = true; break;
23253 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
23254 NumVecs = 4; IsLaneOp = true; break;
23255 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
23256 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
23257 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
23258 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
23259 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
23260 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
23261 }
23262
23263 EVT VecTy;
23264 if (IsStore)
23265 VecTy = N->getOperand(Num: 2).getValueType();
23266 else
23267 VecTy = N->getValueType(ResNo: 0);
23268
23269 // If the increment is a constant, it must match the memory ref size.
23270 SDValue Inc = User->getOperand(Num: User->getOperand(Num: 0) == Addr ? 1 : 0);
23271 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Val: Inc.getNode())) {
23272 uint32_t IncVal = CInc->getZExtValue();
23273 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
23274 if (IsLaneOp || IsDupOp)
23275 NumBytes /= VecTy.getVectorNumElements();
23276 if (IncVal != NumBytes)
23277 continue;
23278 Inc = DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64);
23279 }
23280 SmallVector<SDValue, 8> Ops;
23281 Ops.push_back(Elt: N->getOperand(Num: 0)); // Incoming chain
// Load-lane and store operations have a vector list as input.
23283 if (IsLaneOp || IsStore)
23284 for (unsigned i = 2; i < AddrOpIdx; ++i)
23285 Ops.push_back(Elt: N->getOperand(Num: i));
23286 Ops.push_back(Elt: Addr); // Base register
23287 Ops.push_back(Elt: Inc);
23288
23289 // Return Types.
23290 EVT Tys[6];
23291 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
23292 unsigned n;
23293 for (n = 0; n < NumResultVecs; ++n)
23294 Tys[n] = VecTy;
23295 Tys[n++] = MVT::i64; // Type of write back register
23296 Tys[n] = MVT::Other; // Type of the chain
23297 SDVTList SDTys = DAG.getVTList(VTs: ArrayRef(Tys, NumResultVecs + 2));
23298
23299 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(Val: N);
23300 SDValue UpdN = DAG.getMemIntrinsicNode(Opcode: NewOpc, dl: SDLoc(N), VTList: SDTys, Ops,
23301 MemVT: MemInt->getMemoryVT(),
23302 MMO: MemInt->getMemOperand());
23303
23304 // Update the uses.
23305 std::vector<SDValue> NewResults;
23306 for (unsigned i = 0; i < NumResultVecs; ++i) {
23307 NewResults.push_back(x: SDValue(UpdN.getNode(), i));
23308 }
23309 NewResults.push_back(x: SDValue(UpdN.getNode(), NumResultVecs + 1));
23310 DCI.CombineTo(N, To: NewResults);
23311 DCI.CombineTo(N: User, Res: SDValue(UpdN.getNode(), NumResultVecs));
23312
23313 break;
23314 }
23315 return SDValue();
23316}
23317
23318// Checks to see if the value is the prescribed width and returns information
23319// about its extension mode.
23320static
23321bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
23322 ExtType = ISD::NON_EXTLOAD;
23323 switch(V.getNode()->getOpcode()) {
23324 default:
23325 return false;
23326 case ISD::LOAD: {
23327 LoadSDNode *LoadNode = cast<LoadSDNode>(Val: V.getNode());
23328 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
23329 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
23330 ExtType = LoadNode->getExtensionType();
23331 return true;
23332 }
23333 return false;
23334 }
23335 case ISD::AssertSext: {
23336 VTSDNode *TypeNode = cast<VTSDNode>(Val: V.getNode()->getOperand(Num: 1));
23337 if ((TypeNode->getVT() == MVT::i8 && width == 8)
23338 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
23339 ExtType = ISD::SEXTLOAD;
23340 return true;
23341 }
23342 return false;
23343 }
23344 case ISD::AssertZext: {
23345 VTSDNode *TypeNode = cast<VTSDNode>(Val: V.getNode()->getOperand(Num: 1));
23346 if ((TypeNode->getVT() == MVT::i8 && width == 8)
23347 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
23348 ExtType = ISD::ZEXTLOAD;
23349 return true;
23350 }
23351 return false;
23352 }
23353 case ISD::Constant:
23354 case ISD::TargetConstant: {
23355 return std::abs(i: cast<ConstantSDNode>(Val: V.getNode())->getSExtValue()) <
23356 1LL << (width - 1);
23357 }
23358 }
23359
23360 return true;
23361}
23362
23363// This function does a whole lot of voodoo to determine if the tests are
23364// equivalent without and with a mask. Essentially what happens is that given a
23365// DAG resembling:
23366//
23367// +-------------+ +-------------+ +-------------+ +-------------+
23368// | Input | | AddConstant | | CompConstant| | CC |
23369// +-------------+ +-------------+ +-------------+ +-------------+
23370// | | | |
23371// V V | +----------+
23372// +-------------+ +----+ | |
23373// | ADD | |0xff| | |
23374// +-------------+ +----+ | |
23375// | | | |
23376// V V | |
23377// +-------------+ | |
23378// | AND | | |
23379// +-------------+ | |
23380// | | |
23381// +-----+ | |
23382// | | |
23383// V V V
23384// +-------------+
23385// | CMP |
23386// +-------------+
23387//
23388// The AND node may be safely removed for some combinations of inputs. In
23389// particular we need to take into account the extension type of the Input,
23390// the exact values of AddConstant, CompConstant, and CC, along with the nominal
// width of the input (this can work for any width of input; the above graph
// is specific to 8 bits).
23393//
// The specific equations were worked out by generating output tables for each
// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
// problem was simplified by working with 4-bit inputs, which means we only
// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
// extension (8,15), 8 patterns unique to sign extension (-8,-1), and 8
// patterns present in both extensions (0,7). For every distinct pair of
// AddConstant and CompConstant bit patterns we can consider the masked and
// unmasked versions to be equivalent if the result of this function is true
// for all 16 distinct bit patterns of the current extension type of Input (w0).
23403//
23404// sub w8, w0, w1
23405// and w10, w8, #0x0f
23406// cmp w8, w2
23407// cset w9, AArch64CC
23408// cmp w10, w2
23409// cset w11, AArch64CC
23410// cmp w9, w11
23411// cset w0, eq
23412// ret
23413//
// Since the above sequence shows when the outputs are equivalent, it defines
// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
// would be expensive to run during compiles. The equations below were written
// in a test harness that confirmed they gave outputs equivalent to the above
// sequence for all inputs, so they can be used to determine whether the
// removal is legal instead.
//
// isEquivalentMaskless() is the code for testing whether the AND can be
// removed, factored out of the DAG recognition because the DAG can take
// several forms.
23423
23424static bool isEquivalentMaskless(unsigned CC, unsigned width,
23425 ISD::LoadExtType ExtType, int AddConstant,
23426 int CompConstant) {
// By being careful about our equations and only writing them in terms of
// symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
// make them generally applicable to all bit widths.
23430 int MaxUInt = (1 << width);
23431
23432 // For the purposes of these comparisons sign extending the type is
23433 // equivalent to zero extending the add and displacing it by half the integer
23434 // width. Provided we are careful and make sure our equations are valid over
23435 // the whole range we can just adjust the input and avoid writing equations
23436 // for sign extended inputs.
23437 if (ExtType == ISD::SEXTLOAD)
23438 AddConstant -= (1 << (width-1));
23439
23440 switch(CC) {
23441 case AArch64CC::LE:
23442 case AArch64CC::GT:
23443 if ((AddConstant == 0) ||
23444 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
23445 (AddConstant >= 0 && CompConstant < 0) ||
23446 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
23447 return true;
23448 break;
23449 case AArch64CC::LT:
23450 case AArch64CC::GE:
23451 if ((AddConstant == 0) ||
23452 (AddConstant >= 0 && CompConstant <= 0) ||
23453 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
23454 return true;
23455 break;
23456 case AArch64CC::HI:
23457 case AArch64CC::LS:
23458 if ((AddConstant >= 0 && CompConstant < 0) ||
23459 (AddConstant <= 0 && CompConstant >= -1 &&
23460 CompConstant < AddConstant + MaxUInt))
23461 return true;
23462 break;
23463 case AArch64CC::PL:
23464 case AArch64CC::MI:
23465 if ((AddConstant == 0) ||
23466 (AddConstant > 0 && CompConstant <= 0) ||
23467 (AddConstant < 0 && CompConstant <= AddConstant))
23468 return true;
23469 break;
23470 case AArch64CC::LO:
23471 case AArch64CC::HS:
23472 if ((AddConstant >= 0 && CompConstant <= 0) ||
23473 (AddConstant <= 0 && CompConstant >= 0 &&
23474 CompConstant <= AddConstant + MaxUInt))
23475 return true;
23476 break;
23477 case AArch64CC::EQ:
23478 case AArch64CC::NE:
23479 if ((AddConstant > 0 && CompConstant < 0) ||
23480 (AddConstant < 0 && CompConstant >= 0 &&
23481 CompConstant < AddConstant + MaxUInt) ||
23482 (AddConstant >= 0 && CompConstant >= 0 &&
23483 CompConstant >= AddConstant) ||
23484 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
23485 return true;
23486 break;
23487 case AArch64CC::VS:
23488 case AArch64CC::VC:
23489 case AArch64CC::AL:
23490 case AArch64CC::NV:
23491 return true;
23492 case AArch64CC::Invalid:
23493 break;
23494 }
23495
23496 return false;
23497}
23498
// (X & C) >u Mask --> ((X & (C & ~Mask)) != 0)
// (X & C) <u Pow2 --> ((X & (C & ~(Pow2-1))) == 0)
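// For example (illustrative): with C = 0xff and Mask = 0x0f,
//   (and X, 0xff) >u 0x0f  becomes  ((and X, 0xf0) != 0),
// which is emitted as an ANDS whose flags are tested with the NE condition.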
23501static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
23502 SDNode *AndNode, SelectionDAG &DAG,
23503 unsigned CCIndex, unsigned CmpIndex,
23504 unsigned CC) {
23505 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(Val: SubsNode->getOperand(Num: 1));
23506 if (!SubsC)
23507 return SDValue();
23508
23509 APInt SubsAP = SubsC->getAPIntValue();
23510 if (CC == AArch64CC::HI) {
23511 if (!SubsAP.isMask())
23512 return SDValue();
23513 } else if (CC == AArch64CC::LO) {
23514 if (!SubsAP.isPowerOf2())
23515 return SDValue();
23516 } else
23517 return SDValue();
23518
23519 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(Val: AndNode->getOperand(Num: 1));
23520 if (!AndC)
23521 return SDValue();
23522
23523 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
23524
23525 SDLoc DL(N);
23526 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
23527 SDValue ANDS = DAG.getNode(
23528 Opcode: AArch64ISD::ANDS, DL, VTList: SubsNode->getVTList(), N1: AndNode->getOperand(Num: 0),
23529 N2: DAG.getConstant(Val: AndSMask, DL, VT: SubsC->getValueType(ResNo: 0)));
23530 SDValue AArch64_CC =
23531 DAG.getConstant(Val: CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
23532 VT: N->getOperand(Num: CCIndex)->getValueType(ResNo: 0));
23533
// For now, only performCSELCombine and performBRCONDCombine call this
// function, and both of them pass 2 for CCIndex and 3 for CmpIndex on nodes
// with 4 operands, so we simply initialize the operands directly. If another
// caller with a different CCIndex/CmpIndex is added, this will need to be
// rewritten to build the operand list with a loop.
// TODO: Do we need to assert that the number of operands is 4 here?
23540 assert((CCIndex == 2 && CmpIndex == 3) &&
23541 "Expected CCIndex to be 2 and CmpIndex to be 3.");
23542 SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 1), AArch64_CC,
23543 ANDS.getValue(R: 1)};
23544 return DAG.getNode(Opcode: N->getOpcode(), DL: N, VTList: N->getVTList(), Ops);
23545}
23546
23547static
23548SDValue performCONDCombine(SDNode *N,
23549 TargetLowering::DAGCombinerInfo &DCI,
23550 SelectionDAG &DAG, unsigned CCIndex,
23551 unsigned CmpIndex) {
23552 unsigned CC = cast<ConstantSDNode>(Val: N->getOperand(Num: CCIndex))->getSExtValue();
23553 SDNode *SubsNode = N->getOperand(Num: CmpIndex).getNode();
23554 unsigned CondOpcode = SubsNode->getOpcode();
23555
23556 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(Value: 0) ||
23557 !SubsNode->hasOneUse())
23558 return SDValue();
23559
23560 // There is a SUBS feeding this condition. Is it fed by a mask we can
23561 // use?
23562
23563 SDNode *AndNode = SubsNode->getOperand(Num: 0).getNode();
23564 unsigned MaskBits = 0;
23565
23566 if (AndNode->getOpcode() != ISD::AND)
23567 return SDValue();
23568
23569 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
23570 CmpIndex, CC))
23571 return Val;
23572
23573 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val: AndNode->getOperand(Num: 1))) {
23574 uint32_t CNV = CN->getZExtValue();
23575 if (CNV == 255)
23576 MaskBits = 8;
23577 else if (CNV == 65535)
23578 MaskBits = 16;
23579 }
23580
23581 if (!MaskBits)
23582 return SDValue();
23583
23584 SDValue AddValue = AndNode->getOperand(Num: 0);
23585
23586 if (AddValue.getOpcode() != ISD::ADD)
23587 return SDValue();
23588
23589 // The basic dag structure is correct, grab the inputs and validate them.
23590
23591 SDValue AddInputValue1 = AddValue.getNode()->getOperand(Num: 0);
23592 SDValue AddInputValue2 = AddValue.getNode()->getOperand(Num: 1);
23593 SDValue SubsInputValue = SubsNode->getOperand(Num: 1);
23594
// The mask is present and the provenance of all the values is a smaller type,
// so let's see if the mask is superfluous.
23597
23598 if (!isa<ConstantSDNode>(Val: AddInputValue2.getNode()) ||
23599 !isa<ConstantSDNode>(Val: SubsInputValue.getNode()))
23600 return SDValue();
23601
23602 ISD::LoadExtType ExtType;
23603
23604 if (!checkValueWidth(V: SubsInputValue, width: MaskBits, ExtType) ||
23605 !checkValueWidth(V: AddInputValue2, width: MaskBits, ExtType) ||
23606 !checkValueWidth(V: AddInputValue1, width: MaskBits, ExtType) )
23607 return SDValue();
23608
23609 if(!isEquivalentMaskless(CC, width: MaskBits, ExtType,
23610 AddConstant: cast<ConstantSDNode>(Val: AddInputValue2.getNode())->getSExtValue(),
23611 CompConstant: cast<ConstantSDNode>(Val: SubsInputValue.getNode())->getSExtValue()))
23612 return SDValue();
23613
23614 // The AND is not necessary, remove it.
23615
23616 SDVTList VTs = DAG.getVTList(VT1: SubsNode->getValueType(ResNo: 0),
23617 VT2: SubsNode->getValueType(ResNo: 1));
23618 SDValue Ops[] = { AddValue, SubsNode->getOperand(Num: 1) };
23619
23620 SDValue NewValue = DAG.getNode(Opcode: CondOpcode, DL: SDLoc(SubsNode), VTList: VTs, Ops);
23621 DAG.ReplaceAllUsesWith(From: SubsNode, To: NewValue.getNode());
23622
23623 return SDValue(N, 0);
23624}
23625
23626// Optimize compare with zero and branch.
23627static SDValue performBRCONDCombine(SDNode *N,
23628 TargetLowering::DAGCombinerInfo &DCI,
23629 SelectionDAG &DAG) {
23630 MachineFunction &MF = DAG.getMachineFunction();
23631 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
23632 // will not be produced, as they are conditional branch instructions that do
23633 // not set flags.
23634 if (MF.getFunction().hasFnAttribute(Kind: Attribute::SpeculativeLoadHardening))
23635 return SDValue();
23636
23637 if (SDValue NV = performCONDCombine(N, DCI, DAG, CCIndex: 2, CmpIndex: 3))
23638 N = NV.getNode();
23639 SDValue Chain = N->getOperand(Num: 0);
23640 SDValue Dest = N->getOperand(Num: 1);
23641 SDValue CCVal = N->getOperand(Num: 2);
23642 SDValue Cmp = N->getOperand(Num: 3);
23643
23644 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
23645 unsigned CC = CCVal->getAsZExtVal();
23646 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
23647 return SDValue();
23648
23649 unsigned CmpOpc = Cmp.getOpcode();
23650 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
23651 return SDValue();
23652
23653 // Only attempt folding if there is only one use of the flag and no use of the
23654 // value.
23655 if (!Cmp->hasNUsesOfValue(NUses: 0, Value: 0) || !Cmp->hasNUsesOfValue(NUses: 1, Value: 1))
23656 return SDValue();
23657
23658 SDValue LHS = Cmp.getOperand(i: 0);
23659 SDValue RHS = Cmp.getOperand(i: 1);
23660
23661 assert(LHS.getValueType() == RHS.getValueType() &&
23662 "Expected the value type to be the same for both operands!");
23663 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
23664 return SDValue();
23665
23666 if (isNullConstant(V: LHS))
23667 std::swap(a&: LHS, b&: RHS);
23668
23669 if (!isNullConstant(V: RHS))
23670 return SDValue();
23671
23672 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
23673 LHS.getOpcode() == ISD::SRL)
23674 return SDValue();
23675
23676 // Fold the compare into the branch instruction.
23677 SDValue BR;
23678 if (CC == AArch64CC::EQ)
23679 BR = DAG.getNode(Opcode: AArch64ISD::CBZ, DL: SDLoc(N), VT: MVT::Other, N1: Chain, N2: LHS, N3: Dest);
23680 else
23681 BR = DAG.getNode(Opcode: AArch64ISD::CBNZ, DL: SDLoc(N), VT: MVT::Other, N1: Chain, N2: LHS, N3: Dest);
23682
23683 // Do not add new nodes to DAG combiner worklist.
23684 DCI.CombineTo(N, Res: BR, AddTo: false);
23685
23686 return SDValue();
23687}
23688
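// Fold (CSEL 0, (CTTZ X), eq(X, 0)) and its NE variant into
// (AND (CTTZ X), BitWidth - 1). This works because ISD::CTTZ of zero is
// defined to produce BitWidth, and BitWidth & (BitWidth - 1) == 0, so the
// masked CTTZ reproduces the zero result that the CSEL would have selected.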
23689static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
23690 unsigned CC = N->getConstantOperandVal(Num: 2);
23691 SDValue SUBS = N->getOperand(Num: 3);
23692 SDValue Zero, CTTZ;
23693
23694 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
23695 Zero = N->getOperand(Num: 0);
23696 CTTZ = N->getOperand(Num: 1);
23697 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
23698 Zero = N->getOperand(Num: 1);
23699 CTTZ = N->getOperand(Num: 0);
23700 } else
23701 return SDValue();
23702
23703 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
23704 (CTTZ.getOpcode() == ISD::TRUNCATE &&
23705 CTTZ.getOperand(i: 0).getOpcode() != ISD::CTTZ))
23706 return SDValue();
23707
23708 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
23709 "Illegal type in CTTZ folding");
23710
23711 if (!isNullConstant(V: Zero) || !isNullConstant(V: SUBS.getOperand(i: 1)))
23712 return SDValue();
23713
23714 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
23715 ? CTTZ.getOperand(i: 0).getOperand(i: 0)
23716 : CTTZ.getOperand(i: 0);
23717
23718 if (X != SUBS.getOperand(i: 0))
23719 return SDValue();
23720
23721 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
23722 ? CTTZ.getOperand(i: 0).getValueSizeInBits()
23723 : CTTZ.getValueSizeInBits();
23724 SDValue BitWidthMinusOne =
23725 DAG.getConstant(Val: BitWidth - 1, DL: SDLoc(N), VT: CTTZ.getValueType());
23726 return DAG.getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: CTTZ.getValueType(), N1: CTTZ,
23727 N2: BitWidthMinusOne);
23728}
23729
23730// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
23731// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
23732// Where x and y are constants and x != y
23733
23734// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
23735// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
23736// Where x and y are constants and x != y
23737static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
23738 SDValue L = Op->getOperand(Num: 0);
23739 SDValue R = Op->getOperand(Num: 1);
23740 AArch64CC::CondCode OpCC =
23741 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(Num: 2));
23742
23743 SDValue OpCmp = Op->getOperand(Num: 3);
23744 if (!isCMP(Op: OpCmp))
23745 return SDValue();
23746
23747 SDValue CmpLHS = OpCmp.getOperand(i: 0);
23748 SDValue CmpRHS = OpCmp.getOperand(i: 1);
23749
23750 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
23751 std::swap(a&: CmpLHS, b&: CmpRHS);
23752 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
23753 return SDValue();
23754
23755 SDValue X = CmpLHS->getOperand(Num: 0);
23756 SDValue Y = CmpLHS->getOperand(Num: 1);
23757 if (!isa<ConstantSDNode>(Val: X) || !isa<ConstantSDNode>(Val: Y) || X == Y) {
23758 return SDValue();
23759 }
23760
  // If one of the constants is an opaque constant, the X and Y SDNodes can
  // still be distinct nodes even though they hold the same value. Compare the
  // APInt values here to make sure the fold is correct.
23764 ConstantSDNode *CX = cast<ConstantSDNode>(Val&: X);
23765 ConstantSDNode *CY = cast<ConstantSDNode>(Val&: Y);
23766 if (CX->getAPIntValue() == CY->getAPIntValue())
23767 return SDValue();
23768
23769 AArch64CC::CondCode CC =
23770 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(Num: 2));
23771 SDValue Cond = CmpLHS->getOperand(Num: 3);
23772
23773 if (CmpRHS == Y)
23774 CC = AArch64CC::getInvertedCondCode(Code: CC);
23775 else if (CmpRHS != X)
23776 return SDValue();
23777
23778 if (OpCC == AArch64CC::NE)
23779 CC = AArch64CC::getInvertedCondCode(Code: CC);
23780 else if (OpCC != AArch64CC::EQ)
23781 return SDValue();
23782
23783 SDLoc DL(Op);
23784 EVT VT = Op->getValueType(ResNo: 0);
23785
23786 SDValue CCValue = DAG.getConstant(Val: CC, DL, VT: MVT::i32);
23787 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: L, N2: R, N3: CCValue, N4: Cond);
23788}
23789
23790// Optimize CSEL instructions
23791static SDValue performCSELCombine(SDNode *N,
23792 TargetLowering::DAGCombinerInfo &DCI,
23793 SelectionDAG &DAG) {
23794 // CSEL x, x, cc -> x
23795 if (N->getOperand(Num: 0) == N->getOperand(Num: 1))
23796 return N->getOperand(Num: 0);
23797
23798 if (SDValue R = foldCSELOfCSEL(Op: N, DAG))
23799 return R;
23800
23801 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
23802 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
23803 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
23804 return Folded;
23805
23806 return performCONDCombine(N, DCI, DAG, CCIndex: 2, CmpIndex: 3);
23807}
23808
// Try to re-use an already extended operand of a vector SetCC feeding an
// extended select. Doing so avoids requiring another full extension of the
// SET_CC result when lowering the select.
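//
// For example (illustrative), if t0 : v8i8 already has a sign-extend user
//   t1: v8i16 = sign_extend t0
// and this SetCC is
//   setcc t0, splat(C), setlt
// feeding v8i16 vselects, the compare can be rebuilt on the extended values:
//   setcc t1, (sign_extend splat(C)), setlt
// so the selects no longer require an extra extension of the SetCC result.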
23812static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
23813 EVT Op0MVT = Op->getOperand(Num: 0).getValueType();
23814 if (!Op0MVT.isVector() || Op->use_empty())
23815 return SDValue();
23816
  // Make sure that all uses of Op are VSELECTs with matching result types,
  // where the result type has a larger element type than the SetCC operand.
23819 SDNode *FirstUse = *Op->use_begin();
23820 if (FirstUse->getOpcode() != ISD::VSELECT)
23821 return SDValue();
23822 EVT UseMVT = FirstUse->getValueType(ResNo: 0);
23823 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
23824 return SDValue();
23825 if (any_of(Range: Op->uses(), P: [&UseMVT](const SDNode *N) {
23826 return N->getOpcode() != ISD::VSELECT || N->getValueType(ResNo: 0) != UseMVT;
23827 }))
23828 return SDValue();
23829
23830 APInt V;
23831 if (!ISD::isConstantSplatVector(N: Op->getOperand(Num: 1).getNode(), SplatValue&: V))
23832 return SDValue();
23833
23834 SDLoc DL(Op);
23835 SDValue Op0ExtV;
23836 SDValue Op1ExtV;
23837 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op->getOperand(Num: 2))->get();
23838 // Check if the first operand of the SET_CC is already extended. If it is,
23839 // split the SET_CC and re-use the extended version of the operand.
23840 SDNode *Op0SExt = DAG.getNodeIfExists(Opcode: ISD::SIGN_EXTEND, VTList: DAG.getVTList(VT: UseMVT),
23841 Ops: Op->getOperand(Num: 0));
23842 SDNode *Op0ZExt = DAG.getNodeIfExists(Opcode: ISD::ZERO_EXTEND, VTList: DAG.getVTList(VT: UseMVT),
23843 Ops: Op->getOperand(Num: 0));
23844 if (Op0SExt && (isSignedIntSetCC(Code: CC) || isIntEqualitySetCC(Code: CC))) {
23845 Op0ExtV = SDValue(Op0SExt, 0);
23846 Op1ExtV = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: UseMVT, Operand: Op->getOperand(Num: 1));
23847 } else if (Op0ZExt && (isUnsignedIntSetCC(Code: CC) || isIntEqualitySetCC(Code: CC))) {
23848 Op0ExtV = SDValue(Op0ZExt, 0);
23849 Op1ExtV = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: UseMVT, Operand: Op->getOperand(Num: 1));
23850 } else
23851 return SDValue();
23852
23853 return DAG.getNode(Opcode: ISD::SETCC, DL, VT: UseMVT.changeVectorElementType(EltVT: MVT::i1),
23854 N1: Op0ExtV, N2: Op1ExtV, N3: Op->getOperand(Num: 2));
23855}
23856
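// Lower bitwise reductions (such as vecreduce_and/or/xor) of fixed-length,
// power-of-two vectors of i1 before legalization, using the target-specific
// getVectorBitwiseReduce helper.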
23857static SDValue
23858performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
23859 SelectionDAG &DAG) {
23860 SDValue Vec = N->getOperand(Num: 0);
23861 if (DCI.isBeforeLegalize() &&
23862 Vec.getValueType().getVectorElementType() == MVT::i1 &&
23863 Vec.getValueType().isFixedLengthVector() &&
23864 Vec.getValueType().isPow2VectorType()) {
23865 SDLoc DL(N);
23866 return getVectorBitwiseReduce(Opcode: N->getOpcode(), Vec, VT: N->getValueType(ResNo: 0), DL,
23867 DAG);
23868 }
23869
23870 return SDValue();
23871}
23872
23873static SDValue performSETCCCombine(SDNode *N,
23874 TargetLowering::DAGCombinerInfo &DCI,
23875 SelectionDAG &DAG) {
23876 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
23877 SDValue LHS = N->getOperand(Num: 0);
23878 SDValue RHS = N->getOperand(Num: 1);
23879 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
23880 SDLoc DL(N);
23881 EVT VT = N->getValueType(ResNo: 0);
23882
23883 if (SDValue V = tryToWidenSetCCOperands(Op: N, DAG))
23884 return V;
23885
23886 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
23887 if (Cond == ISD::SETNE && isOneConstant(V: RHS) &&
23888 LHS->getOpcode() == AArch64ISD::CSEL &&
23889 isNullConstant(V: LHS->getOperand(Num: 0)) && isOneConstant(V: LHS->getOperand(Num: 1)) &&
23890 LHS->hasOneUse()) {
23891 // Invert CSEL's condition.
23892 auto OldCond =
23893 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(i: 2));
23894 auto NewCond = getInvertedCondCode(Code: OldCond);
23895
23896 // csel 0, 1, !cond, X
23897 SDValue CSEL =
23898 DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: LHS.getValueType(), N1: LHS.getOperand(i: 0),
23899 N2: LHS.getOperand(i: 1), N3: DAG.getConstant(Val: NewCond, DL, VT: MVT::i32),
23900 N4: LHS.getOperand(i: 3));
23901 return DAG.getZExtOrTrunc(Op: CSEL, DL, VT);
23902 }
23903
23904 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
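  // For example, with an i64 x, (setcc (srl x, 3), 0, ne) becomes
  // (setcc (and x, 0xfffffffffffffff8), 0, ne): both test whether any of bits
  // [63:3] of x are set, and the AND form is handled better by emitComparison.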
23905 if (Cond == ISD::SETNE && isNullConstant(V: RHS) &&
23906 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Val: LHS->getOperand(Num: 1)) &&
23907 LHS->getConstantOperandVal(Num: 1) < VT.getScalarSizeInBits() &&
23908 LHS->hasOneUse()) {
23909 EVT TstVT = LHS->getValueType(ResNo: 0);
23910 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
      // This pattern is optimized better by emitComparison.
23912 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(Num: 1);
23913 SDValue TST = DAG.getNode(Opcode: ISD::AND, DL, VT: TstVT, N1: LHS->getOperand(Num: 0),
23914 N2: DAG.getConstant(Val: TstImm, DL, VT: TstVT));
23915 return DAG.getNode(Opcode: ISD::SETCC, DL, VT, N1: TST, N2: RHS, N3: N->getOperand(Num: 2));
23916 }
23917 }
23918
23919 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
23920 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
23921 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
23922 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
23923 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
23924 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
23925 (isNullConstant(V: RHS) || isAllOnesConstant(V: RHS)) &&
23926 LHS->getOpcode() == ISD::BITCAST) {
23927 EVT ToVT = LHS->getValueType(ResNo: 0);
23928 EVT FromVT = LHS->getOperand(Num: 0).getValueType();
23929 if (FromVT.isFixedLengthVector() &&
23930 FromVT.getVectorElementType() == MVT::i1) {
23931 bool IsNull = isNullConstant(V: RHS);
23932 LHS = DAG.getNode(Opcode: IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
23933 DL, VT: MVT::i1, Operand: LHS->getOperand(Num: 0));
23934 LHS = DAG.getNode(Opcode: IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, VT: ToVT,
23935 Operand: LHS);
23936 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
23937 }
23938 }
23939
23940 // Try to perform the memcmp when the result is tested for [in]equality with 0
23941 if (SDValue V = performOrXorChainCombine(N, DAG))
23942 return V;
23943
23944 return SDValue();
23945}
23946
// Replace a flag-setting operator (e.g. ANDS) with the generic version
// (e.g. AND) if the flag result is unused.
23949static SDValue performFlagSettingCombine(SDNode *N,
23950 TargetLowering::DAGCombinerInfo &DCI,
23951 unsigned GenericOpcode) {
23952 SDLoc DL(N);
23953 SDValue LHS = N->getOperand(Num: 0);
23954 SDValue RHS = N->getOperand(Num: 1);
23955 EVT VT = N->getValueType(ResNo: 0);
23956
23957 // If the flag result isn't used, convert back to a generic opcode.
23958 if (!N->hasAnyUseOfValue(Value: 1)) {
23959 SDValue Res = DCI.DAG.getNode(Opcode: GenericOpcode, DL, VT, Ops: N->ops());
23960 return DCI.DAG.getMergeValues(Ops: {Res, DCI.DAG.getConstant(Val: 0, DL, VT: MVT::i32)},
23961 dl: DL);
23962 }
23963
23964 // Combine identical generic nodes into this node, re-using the result.
23965 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
23966 Opcode: GenericOpcode, VTList: DCI.DAG.getVTList(VT), Ops: {LHS, RHS}))
23967 DCI.CombineTo(N: Generic, Res: SDValue(N, 0));
23968
23969 return SDValue();
23970}
23971
23972static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
23973 // setcc_merge_zero pred
23974 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
23975 // => extract_subvector (inner setcc_merge_zero)
23976 SDValue Pred = N->getOperand(Num: 0);
23977 SDValue LHS = N->getOperand(Num: 1);
23978 SDValue RHS = N->getOperand(Num: 2);
23979 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 3))->get();
23980
23981 if (Cond != ISD::SETNE || !isZerosVector(N: RHS.getNode()) ||
23982 LHS->getOpcode() != ISD::SIGN_EXTEND)
23983 return SDValue();
23984
23985 SDValue Extract = LHS->getOperand(Num: 0);
23986 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
23987 Extract->getValueType(ResNo: 0) != N->getValueType(ResNo: 0) ||
23988 Extract->getConstantOperandVal(Num: 1) != 0)
23989 return SDValue();
23990
23991 SDValue InnerSetCC = Extract->getOperand(Num: 0);
23992 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
23993 return SDValue();
23994
23995 // By this point we've effectively got
23996 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
23997 // lanes are already zero then the trunc(sext()) sequence is redundant and we
23998 // can operate on A directly.
23999 SDValue InnerPred = InnerSetCC.getOperand(i: 0);
24000 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
24001 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
24002 Pred.getConstantOperandVal(i: 0) == InnerPred.getConstantOperandVal(i: 0) &&
24003 Pred->getConstantOperandVal(Num: 0) >= AArch64SVEPredPattern::vl1 &&
24004 Pred->getConstantOperandVal(Num: 0) <= AArch64SVEPredPattern::vl256)
24005 return Extract;
24006
24007 return SDValue();
24008}
24009
24010static SDValue
24011performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
24012 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
24013 "Unexpected opcode!");
24014
24015 SelectionDAG &DAG = DCI.DAG;
24016 SDValue Pred = N->getOperand(Num: 0);
24017 SDValue LHS = N->getOperand(Num: 1);
24018 SDValue RHS = N->getOperand(Num: 2);
24019 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 3))->get();
24020
24021 if (SDValue V = performSetCCPunpkCombine(N, DAG))
24022 return V;
24023
24024 if (Cond == ISD::SETNE && isZerosVector(N: RHS.getNode()) &&
24025 LHS->getOpcode() == ISD::SIGN_EXTEND &&
24026 LHS->getOperand(Num: 0)->getValueType(ResNo: 0) == N->getValueType(ResNo: 0)) {
24027 // setcc_merge_zero(
24028 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
24029 // => setcc_merge_zero(pred, ...)
24030 if (LHS->getOperand(Num: 0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
24031 LHS->getOperand(Num: 0)->getOperand(Num: 0) == Pred)
24032 return LHS->getOperand(Num: 0);
24033
24034 // setcc_merge_zero(
24035 // all_active, extend(nxvNi1 ...), != splat(0))
24036 // -> nxvNi1 ...
24037 if (isAllActivePredicate(DAG, N: Pred))
24038 return LHS->getOperand(Num: 0);
24039
24040 // setcc_merge_zero(
24041 // pred, extend(nxvNi1 ...), != splat(0))
24042 // -> nxvNi1 and(pred, ...)
24043 if (DCI.isAfterLegalizeDAG())
24044 // Do this after legalization to allow more folds on setcc_merge_zero
24045 // to be recognized.
24046 return DAG.getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
24047 N1: LHS->getOperand(Num: 0), N2: Pred);
24048 }
24049
24050 return SDValue();
24051}
24052
24053// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
24054// as well as whether the test should be inverted. This code is required to
24055// catch these cases (as opposed to standard dag combines) because
24056// AArch64ISD::TBZ is matched during legalization.
24057static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
24058 SelectionDAG &DAG) {
24059
24060 if (!Op->hasOneUse())
24061 return Op;
24062
24063 // We don't handle undef/constant-fold cases below, as they should have
24064 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
24065 // etc.)
24066
24067 // (tbz (trunc x), b) -> (tbz x, b)
24068 // This case is just here to enable more of the below cases to be caught.
24069 if (Op->getOpcode() == ISD::TRUNCATE &&
24070 Bit < Op->getValueType(ResNo: 0).getSizeInBits()) {
24071 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
24072 }
24073
24074 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
24075 if (Op->getOpcode() == ISD::ANY_EXTEND &&
24076 Bit < Op->getOperand(Num: 0).getValueSizeInBits()) {
24077 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
24078 }
24079
24080 if (Op->getNumOperands() != 2)
24081 return Op;
24082
24083 auto *C = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
24084 if (!C)
24085 return Op;
24086
24087 switch (Op->getOpcode()) {
24088 default:
24089 return Op;
24090
24091 // (tbz (and x, m), b) -> (tbz x, b)
24092 case ISD::AND:
24093 if ((C->getZExtValue() >> Bit) & 1)
24094 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
24095 return Op;
24096
24097 // (tbz (shl x, c), b) -> (tbz x, b-c)
24098 case ISD::SHL:
24099 if (C->getZExtValue() <= Bit &&
24100 (Bit - C->getZExtValue()) < Op->getValueType(ResNo: 0).getSizeInBits()) {
24101 Bit = Bit - C->getZExtValue();
24102 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
24103 }
24104 return Op;
24105
  // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c >= # bits in x
24107 case ISD::SRA:
24108 Bit = Bit + C->getZExtValue();
24109 if (Bit >= Op->getValueType(ResNo: 0).getSizeInBits())
24110 Bit = Op->getValueType(ResNo: 0).getSizeInBits() - 1;
24111 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
24112
24113 // (tbz (srl x, c), b) -> (tbz x, b+c)
24114 case ISD::SRL:
24115 if ((Bit + C->getZExtValue()) < Op->getValueType(ResNo: 0).getSizeInBits()) {
24116 Bit = Bit + C->getZExtValue();
24117 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
24118 }
24119 return Op;
24120
24121 // (tbz (xor x, -1), b) -> (tbnz x, b)
24122 case ISD::XOR:
24123 if ((C->getZExtValue() >> Bit) & 1)
24124 Invert = !Invert;
24125 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
24126 }
24127}
24128
24129// Optimize test single bit zero/non-zero and branch.
24130static SDValue performTBZCombine(SDNode *N,
24131 TargetLowering::DAGCombinerInfo &DCI,
24132 SelectionDAG &DAG) {
24133 unsigned Bit = N->getConstantOperandVal(Num: 2);
24134 bool Invert = false;
24135 SDValue TestSrc = N->getOperand(Num: 1);
24136 SDValue NewTestSrc = getTestBitOperand(Op: TestSrc, Bit, Invert, DAG);
24137
24138 if (TestSrc == NewTestSrc)
24139 return SDValue();
24140
24141 unsigned NewOpc = N->getOpcode();
24142 if (Invert) {
24143 if (NewOpc == AArch64ISD::TBZ)
24144 NewOpc = AArch64ISD::TBNZ;
24145 else {
24146 assert(NewOpc == AArch64ISD::TBNZ);
24147 NewOpc = AArch64ISD::TBZ;
24148 }
24149 }
24150
24151 SDLoc DL(N);
24152 return DAG.getNode(Opcode: NewOpc, DL, VT: MVT::Other, N1: N->getOperand(Num: 0), N2: NewTestSrc,
24153 N3: DAG.getConstant(Val: Bit, DL, VT: MVT::i64), N4: N->getOperand(Num: 3));
24154}
24155
24156// Swap vselect operands where it may allow a predicated operation to achieve
24157// the `sel`.
24158//
24159// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
24160// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
24161static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
24162 auto SelectA = N->getOperand(Num: 1);
24163 auto SelectB = N->getOperand(Num: 2);
24164 auto NTy = N->getValueType(ResNo: 0);
24165
24166 if (!NTy.isScalableVector())
24167 return SDValue();
24168 SDValue SetCC = N->getOperand(Num: 0);
24169 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
24170 return SDValue();
24171
24172 switch (SelectB.getOpcode()) {
24173 default:
24174 return SDValue();
24175 case ISD::FMUL:
24176 case ISD::FSUB:
24177 case ISD::FADD:
24178 break;
24179 }
24180 if (SelectA != SelectB.getOperand(i: 0))
24181 return SDValue();
24182
24183 ISD::CondCode CC = cast<CondCodeSDNode>(Val: SetCC.getOperand(i: 2))->get();
24184 ISD::CondCode InverseCC =
24185 ISD::getSetCCInverse(Operation: CC, Type: SetCC.getOperand(i: 0).getValueType());
24186 auto InverseSetCC =
24187 DAG.getSetCC(DL: SDLoc(SetCC), VT: SetCC.getValueType(), LHS: SetCC.getOperand(i: 0),
24188 RHS: SetCC.getOperand(i: 1), Cond: InverseCC);
24189
24190 return DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT: NTy,
24191 Ops: {InverseSetCC, SelectB, SelectA});
24192}
24193
24194// vselect (v1i1 setcc) ->
24195// vselect (v1iXX setcc) (XX is the size of the compared operand type)
24196// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
24197// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
24198// such VSELECT.
24199static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
24200 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
24201 return SwapResult;
24202
24203 SDValue N0 = N->getOperand(Num: 0);
24204 EVT CCVT = N0.getValueType();
24205
24206 if (isAllActivePredicate(DAG, N: N0))
24207 return N->getOperand(Num: 1);
24208
24209 if (isAllInactivePredicate(N: N0))
24210 return N->getOperand(Num: 2);
24211
  // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
  // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
  // supported types.
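  // Per lane: if lhs > -1 (i.e. lhs is non-negative), the ASR by N-1 yields 0
  // and the OR with 1 yields 1; if lhs is negative, the ASR yields all ones,
  // which already contains the 1 bit, yielding -1. That matches selecting 1
  // or -1 based on the sign of lhs.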
24215 SDValue SetCC = N->getOperand(Num: 0);
24216 if (SetCC.getOpcode() == ISD::SETCC &&
24217 SetCC.getOperand(i: 2) == DAG.getCondCode(Cond: ISD::SETGT)) {
24218 SDValue CmpLHS = SetCC.getOperand(i: 0);
24219 EVT VT = CmpLHS.getValueType();
24220 SDNode *CmpRHS = SetCC.getOperand(i: 1).getNode();
24221 SDNode *SplatLHS = N->getOperand(Num: 1).getNode();
24222 SDNode *SplatRHS = N->getOperand(Num: 2).getNode();
24223 APInt SplatLHSVal;
24224 if (CmpLHS.getValueType() == N->getOperand(Num: 1).getValueType() &&
24225 VT.isSimple() &&
24226 is_contained(Range: ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
24227 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
24228 Element: VT.getSimpleVT().SimpleTy) &&
24229 ISD::isConstantSplatVector(N: SplatLHS, SplatValue&: SplatLHSVal) &&
24230 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(N: CmpRHS) &&
24231 ISD::isConstantSplatVectorAllOnes(N: SplatRHS)) {
24232 unsigned NumElts = VT.getVectorNumElements();
24233 SmallVector<SDValue, 8> Ops(
24234 NumElts, DAG.getConstant(Val: VT.getScalarSizeInBits() - 1, DL: SDLoc(N),
24235 VT: VT.getScalarType()));
24236 SDValue Val = DAG.getBuildVector(VT, DL: SDLoc(N), Ops);
24237
24238 auto Shift = DAG.getNode(Opcode: ISD::SRA, DL: SDLoc(N), VT, N1: CmpLHS, N2: Val);
24239 auto Or = DAG.getNode(Opcode: ISD::OR, DL: SDLoc(N), VT, N1: Shift, N2: N->getOperand(Num: 1));
24240 return Or;
24241 }
24242 }
24243
24244 EVT CmpVT = N0.getOperand(i: 0).getValueType();
24245 if (N0.getOpcode() != ISD::SETCC ||
24246 CCVT.getVectorElementCount() != ElementCount::getFixed(MinVal: 1) ||
24247 CCVT.getVectorElementType() != MVT::i1 ||
24248 CmpVT.getVectorElementType().isFloatingPoint())
24249 return SDValue();
24250
24251 EVT ResVT = N->getValueType(ResNo: 0);
24252 // Only combine when the result type is of the same size as the compared
24253 // operands.
24254 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
24255 return SDValue();
24256
24257 SDValue IfTrue = N->getOperand(Num: 1);
24258 SDValue IfFalse = N->getOperand(Num: 2);
24259 SetCC = DAG.getSetCC(DL: SDLoc(N), VT: CmpVT.changeVectorElementTypeToInteger(),
24260 LHS: N0.getOperand(i: 0), RHS: N0.getOperand(i: 1),
24261 Cond: cast<CondCodeSDNode>(Val: N0.getOperand(i: 2))->get());
24262 return DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT: ResVT, N1: SetCC,
24263 N2: IfTrue, N3: IfFalse);
24264}
24265
24266/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
24267/// the compare-mask instructions rather than going via NZCV, even if LHS and
24268/// RHS are really scalar. This replaces any scalar setcc in the above pattern
24269/// with a vector one followed by a DUP shuffle on the result.
24270static SDValue performSelectCombine(SDNode *N,
24271 TargetLowering::DAGCombinerInfo &DCI) {
24272 SelectionDAG &DAG = DCI.DAG;
24273 SDValue N0 = N->getOperand(Num: 0);
24274 EVT ResVT = N->getValueType(ResNo: 0);
24275
24276 if (N0.getOpcode() != ISD::SETCC)
24277 return SDValue();
24278
24279 if (ResVT.isScalableVT())
24280 return SDValue();
24281
24282 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
24283 // scalar SetCCResultType. We also don't expect vectors, because we assume
24284 // that selects fed by vector SETCCs are canonicalized to VSELECT.
24285 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
24286 "Scalar-SETCC feeding SELECT has unexpected result type!");
24287
  // If NumMaskElts == 0, the comparison is larger than the select result. The
  // largest real NEON comparison is 64 bits per lane, which means the result
  // is at most 32 bits and an illegal vector. Just bail out for now.
24291 EVT SrcVT = N0.getOperand(i: 0).getValueType();
24292
  // Don't try to do this optimization when the setcc itself has i1 operands.
  // There are no legal vectors of i1, so this would be pointless. v1f16 is
  // ruled out to prevent the creation of setcc nodes that need to be
  // scalarized.
24296 if (SrcVT == MVT::i1 ||
24297 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
24298 return SDValue();
24299
24300 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
24301 if (!ResVT.isVector() || NumMaskElts == 0)
24302 return SDValue();
24303
24304 SrcVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: SrcVT, NumElements: NumMaskElts);
24305 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
24306
24307 // Also bail out if the vector CCVT isn't the same size as ResVT.
24308 // This can happen if the SETCC operand size doesn't divide the ResVT size
24309 // (e.g., f64 vs v3f32).
24310 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
24311 return SDValue();
24312
24313 // Make sure we didn't create illegal types, if we're not supposed to.
24314 assert(DCI.isBeforeLegalize() ||
24315 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
24316
24317 // First perform a vector comparison, where lane 0 is the one we're interested
24318 // in.
24319 SDLoc DL(N0);
24320 SDValue LHS =
24321 DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: SrcVT, Operand: N0.getOperand(i: 0));
24322 SDValue RHS =
24323 DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: SrcVT, Operand: N0.getOperand(i: 1));
24324 SDValue SetCC = DAG.getNode(Opcode: ISD::SETCC, DL, VT: CCVT, N1: LHS, N2: RHS, N3: N0.getOperand(i: 2));
24325
24326 // Now duplicate the comparison mask we want across all other lanes.
24327 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
24328 SDValue Mask = DAG.getVectorShuffle(VT: CCVT, dl: DL, N1: SetCC, N2: SetCC, Mask: DUPMask);
24329 Mask = DAG.getNode(Opcode: ISD::BITCAST, DL,
24330 VT: ResVT.changeVectorElementTypeToInteger(), Operand: Mask);
24331
24332 return DAG.getSelect(DL, VT: ResVT, Cond: Mask, LHS: N->getOperand(Num: 1), RHS: N->getOperand(Num: 2));
24333}
24334
24335static SDValue performDUPCombine(SDNode *N,
24336 TargetLowering::DAGCombinerInfo &DCI) {
24337 EVT VT = N->getValueType(ResNo: 0);
24338 SDLoc DL(N);
  // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
  // 128-bit vector version.
24341 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
24342 EVT LVT = VT.getDoubleNumVectorElementsVT(Context&: *DCI.DAG.getContext());
24343 SmallVector<SDValue> Ops(N->ops());
24344 if (SDNode *LN = DCI.DAG.getNodeIfExists(Opcode: N->getOpcode(),
24345 VTList: DCI.DAG.getVTList(VT: LVT), Ops)) {
24346 return DCI.DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: SDValue(LN, 0),
24347 N2: DCI.DAG.getConstant(Val: 0, DL, VT: MVT::i64));
24348 }
24349 }
24350
24351 if (N->getOpcode() == AArch64ISD::DUP) {
24352 if (DCI.isAfterLegalizeDAG()) {
24353 // If scalar dup's operand is extract_vector_elt, try to combine them into
24354 // duplane. For example,
24355 //
24356 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
24357 // t18: v4i32 = AArch64ISD::DUP t21
24358 // ==>
24359 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
24360 SDValue EXTRACT_VEC_ELT = N->getOperand(Num: 0);
24361 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24362 if (VT == EXTRACT_VEC_ELT.getOperand(i: 0).getValueType()) {
24363 unsigned Opcode = getDUPLANEOp(EltType: VT.getVectorElementType());
24364 return DCI.DAG.getNode(Opcode, DL, VT, N1: EXTRACT_VEC_ELT.getOperand(i: 0),
24365 N2: EXTRACT_VEC_ELT.getOperand(i: 1));
24366 }
24367 }
24368 }
24369
24370 return performPostLD1Combine(N, DCI, IsLaneOp: false);
24371 }
24372
24373 return SDValue();
24374}
24375
24376/// Get rid of unnecessary NVCASTs (that don't change the type).
24377static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
24378 if (N->getValueType(ResNo: 0) == N->getOperand(Num: 0).getValueType())
24379 return N->getOperand(Num: 0);
24380 if (N->getOperand(Num: 0).getOpcode() == AArch64ISD::NVCAST)
24381 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
24382 Operand: N->getOperand(Num: 0).getOperand(i: 0));
24383
24384 return SDValue();
24385}
24386
24387// If all users of the globaladdr are of the form (globaladdr + constant), find
24388// the smallest constant, fold it into the globaladdr's offset and rewrite the
24389// globaladdr as (globaladdr + constant) - constant.
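// For example, if the uses are (globaladdr + 12) and (globaladdr + 36), the
// smallest constant is 12, so the globaladdr node is rewritten as
// ((globaladdr + 12) - 12); the +12 can then be encoded directly in the
// address computation while the overall expression stays equivalent.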
24390static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
24391 const AArch64Subtarget *Subtarget,
24392 const TargetMachine &TM) {
24393 auto *GN = cast<GlobalAddressSDNode>(Val: N);
24394 if (Subtarget->ClassifyGlobalReference(GV: GN->getGlobal(), TM) !=
24395 AArch64II::MO_NO_FLAG)
24396 return SDValue();
24397
24398 uint64_t MinOffset = -1ull;
24399 for (SDNode *N : GN->uses()) {
24400 if (N->getOpcode() != ISD::ADD)
24401 return SDValue();
24402 auto *C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0));
24403 if (!C)
24404 C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
24405 if (!C)
24406 return SDValue();
24407 MinOffset = std::min(a: MinOffset, b: C->getZExtValue());
24408 }
24409 uint64_t Offset = MinOffset + GN->getOffset();
24410
24411 // Require that the new offset is larger than the existing one. Otherwise, we
24412 // can end up oscillating between two possible DAGs, for example,
24413 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
24414 if (Offset <= uint64_t(GN->getOffset()))
24415 return SDValue();
24416
24417 // Check whether folding this offset is legal. It must not go out of bounds of
24418 // the referenced object to avoid violating the code model, and must be
24419 // smaller than 2^20 because this is the largest offset expressible in all
24420 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
24421 // stores an immediate signed 21 bit offset.)
24422 //
24423 // This check also prevents us from folding negative offsets, which will end
24424 // up being treated in the same way as large positive ones. They could also
24425 // cause code model violations, and aren't really common enough to matter.
24426 if (Offset >= (1 << 20))
24427 return SDValue();
24428
24429 const GlobalValue *GV = GN->getGlobal();
24430 Type *T = GV->getValueType();
24431 if (!T->isSized() ||
24432 Offset > GV->getDataLayout().getTypeAllocSize(Ty: T))
24433 return SDValue();
24434
24435 SDLoc DL(GN);
24436 SDValue Result = DAG.getGlobalAddress(GV, DL, VT: MVT::i64, offset: Offset);
24437 return DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i64, N1: Result,
24438 N2: DAG.getConstant(Val: MinOffset, DL, VT: MVT::i64));
24439}
24440
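// ctlz(bitreverse(x)) is equivalent to cttz(x), since reversing the bits turns
// trailing zeros into leading zeros. With the CSSC extension, which provides a
// native CTZ instruction, drop the BITREVERSE and use CTTZ directly.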
24441static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
24442 const AArch64Subtarget *Subtarget) {
24443 SDValue BR = N->getOperand(Num: 0);
24444 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
24445 !BR.getValueType().isScalarInteger())
24446 return SDValue();
24447
24448 SDLoc DL(N);
24449 return DAG.getNode(Opcode: ISD::CTTZ, DL, VT: BR.getValueType(), Operand: BR.getOperand(i: 0));
24450}
24451
// Turns the vector of indices into a vector of byte offsets by scaling Offset
// by (BitWidth / 8).
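// For example, with 32-bit elements (BitWidth == 32) each index is shifted
// left by log2(32 / 8) == 2, i.e. multiplied by 4.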
24454static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
24455 SDLoc DL, unsigned BitWidth) {
24456 assert(Offset.getValueType().isScalableVector() &&
24457 "This method is only for scalable vectors of offsets");
24458
24459 SDValue Shift = DAG.getConstant(Val: Log2_32(Value: BitWidth / 8), DL, VT: MVT::i64);
24460 SDValue SplatShift = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: MVT::nxv2i64, Operand: Shift);
24461
24462 return DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::nxv2i64, N1: Offset, N2: SplatShift);
24463}
24464
24465/// Check if the value of \p OffsetInBytes can be used as an immediate for
24466/// the gather load/prefetch and scatter store instructions with vector base and
24467/// immediate offset addressing mode:
24468///
24469/// [<Zn>.[S|D]{, #<imm>}]
24470///
24471/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
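///
/// For example, for 32-bit elements (sizeof(<T>) == 4) the valid immediates
/// are 0, 4, 8, ..., 124.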
24472inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
24473 unsigned ScalarSizeInBytes) {
24474 // The immediate is not a multiple of the scalar size.
24475 if (OffsetInBytes % ScalarSizeInBytes)
24476 return false;
24477
24478 // The immediate is out of range.
24479 if (OffsetInBytes / ScalarSizeInBytes > 31)
24480 return false;
24481
24482 return true;
24483}
24484
24485/// Check if the value of \p Offset represents a valid immediate for the SVE
/// gather load/prefetch and scatter store instructions with vector base and
24487/// immediate offset addressing mode:
24488///
24489/// [<Zn>.[S|D]{, #<imm>}]
24490///
24491/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
24492static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
24493 unsigned ScalarSizeInBytes) {
24494 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Val: Offset.getNode());
24495 return OffsetConst && isValidImmForSVEVecImmAddrMode(
24496 OffsetInBytes: OffsetConst->getZExtValue(), ScalarSizeInBytes);
24497}
24498
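// Lower an SVE scatter-store node to the most suitable AArch64ISD::SST1* form:
// scale indices where only unscaled addressing exists, swap Base and Offset to
// match the instruction's operand order, and legalize unpacked offsets and the
// stored value type.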
24499static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
24500 unsigned Opcode,
24501 bool OnlyPackedOffsets = true) {
24502 const SDValue Src = N->getOperand(Num: 2);
24503 const EVT SrcVT = Src->getValueType(ResNo: 0);
24504 assert(SrcVT.isScalableVector() &&
24505 "Scatter stores are only possible for SVE vectors");
24506
24507 SDLoc DL(N);
24508 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
24509
24510 // Make sure that source data will fit into an SVE register
24511 if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
24512 return SDValue();
24513
24514 // For FPs, ACLE only supports _packed_ single and double precision types.
24515 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
24516 if (SrcElVT.isFloatingPoint())
24517 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
24518 ((Opcode != AArch64ISD::SST1Q_PRED &&
24519 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
24520 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
24521 return SDValue();
24522
24523 // Depending on the addressing mode, this is either a pointer or a vector of
24524 // pointers (that fits into one register)
24525 SDValue Base = N->getOperand(Num: 4);
24526 // Depending on the addressing mode, this is either a single offset or a
24527 // vector of offsets (that fits into one register)
24528 SDValue Offset = N->getOperand(Num: 5);
24529
24530 // For "scalar + vector of indices", just scale the indices. This only
24531 // applies to non-temporal scatters because there's no instruction that takes
24532 // indices.
24533 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
24534 Offset =
24535 getScaledOffsetForBitWidth(DAG, Offset, DL, BitWidth: SrcElVT.getSizeInBits());
24536 Opcode = AArch64ISD::SSTNT1_PRED;
24537 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
24538 Offset =
24539 getScaledOffsetForBitWidth(DAG, Offset, DL, BitWidth: SrcElVT.getSizeInBits());
24540 Opcode = AArch64ISD::SST1Q_PRED;
24541 }
24542
  // In the case of non-temporal scatter stores there's only one SVE
  // instruction per data size, with "vector + scalar" addressing, i.e.
24545 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
24546 // Since we do have intrinsics that allow the arguments to be in a different
24547 // order, we may need to swap them to match the spec.
24548 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
24549 Offset.getValueType().isVector())
24550 std::swap(a&: Base, b&: Offset);
24551
24552 // SST1_IMM requires that the offset is an immediate that is:
24553 // * a multiple of #SizeInBytes,
24554 // * in the range [0, 31 x #SizeInBytes],
24555 // where #SizeInBytes is the size in bytes of the stored items. For
24556 // immediates outside that range and non-immediate scalar offsets use SST1 or
24557 // SST1_UXTW instead.
24558 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
24559 if (!isValidImmForSVEVecImmAddrMode(Offset,
24560 ScalarSizeInBytes: SrcVT.getScalarSizeInBits() / 8)) {
24561 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
24562 Opcode = AArch64ISD::SST1_UXTW_PRED;
24563 else
24564 Opcode = AArch64ISD::SST1_PRED;
24565
24566 std::swap(a&: Base, b&: Offset);
24567 }
24568 }
24569
24570 auto &TLI = DAG.getTargetLoweringInfo();
24571 if (!TLI.isTypeLegal(VT: Base.getValueType()))
24572 return SDValue();
24573
24574 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign- (sxtw) or zero- (uxtw) extended to
  // nxv2i64. Legalize accordingly.
24577 if (!OnlyPackedOffsets &&
24578 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
24579 Offset = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::nxv2i64, Operand: Offset).getValue(R: 0);
24580
24581 if (!TLI.isTypeLegal(VT: Offset.getValueType()))
24582 return SDValue();
24583
24584 // Source value type that is representable in hardware
24585 EVT HwSrcVt = getSVEContainerType(ContentTy: SrcVT);
24586
24587 // Keep the original type of the input data to store - this is needed to be
24588 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
24589 // FP values we want the integer equivalent, so just use HwSrcVt.
24590 SDValue InputVT = DAG.getValueType(SrcVT);
24591 if (SrcVT.isFloatingPoint())
24592 InputVT = DAG.getValueType(HwSrcVt);
24593
24594 SDVTList VTs = DAG.getVTList(VT: MVT::Other);
24595 SDValue SrcNew;
24596
24597 if (Src.getValueType().isFloatingPoint())
24598 SrcNew = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: HwSrcVt, Operand: Src);
24599 else
24600 SrcNew = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: HwSrcVt, Operand: Src);
24601
24602 SDValue Ops[] = {N->getOperand(Num: 0), // Chain
24603 SrcNew,
24604 N->getOperand(Num: 3), // Pg
24605 Base,
24606 Offset,
24607 InputVT};
24608
24609 return DAG.getNode(Opcode, DL, VTList: VTs, Ops);
24610}
24611
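// Lower an SVE gather-load node to the most suitable AArch64ISD::GLD1* /
// GLDFF1* / GLDNT1* form, mirroring the scatter-store combine above.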
24612static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
24613 unsigned Opcode,
24614 bool OnlyPackedOffsets = true) {
24615 const EVT RetVT = N->getValueType(ResNo: 0);
24616 assert(RetVT.isScalableVector() &&
24617 "Gather loads are only possible for SVE vectors");
24618
24619 SDLoc DL(N);
24620
24621 // Make sure that the loaded data will fit into an SVE register
24622 if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
24623 return SDValue();
24624
24625 // Depending on the addressing mode, this is either a pointer or a vector of
24626 // pointers (that fits into one register)
24627 SDValue Base = N->getOperand(Num: 3);
24628 // Depending on the addressing mode, this is either a single offset or a
24629 // vector of offsets (that fits into one register)
24630 SDValue Offset = N->getOperand(Num: 4);
24631
24632 // For "scalar + vector of indices", scale the indices to obtain unscaled
24633 // offsets. This applies to non-temporal and quadword gathers, which do not
24634 // have an addressing mode with scaled offset.
24635 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
24636 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
24637 BitWidth: RetVT.getScalarSizeInBits());
24638 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
24639 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
24640 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
24641 BitWidth: RetVT.getScalarSizeInBits());
24642 Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
24643 }
24644
24645 // In the case of non-temporal gather loads and quadword gather loads there's
  // only one addressing mode: "vector + scalar", e.g.
24647 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
24648 // Since we do have intrinsics that allow the arguments to be in a different
24649 // order, we may need to swap them to match the spec.
24650 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
24651 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
24652 Offset.getValueType().isVector())
24653 std::swap(a&: Base, b&: Offset);
24654
24655 // GLD{FF}1_IMM requires that the offset is an immediate that is:
24656 // * a multiple of #SizeInBytes,
24657 // * in the range [0, 31 x #SizeInBytes],
24658 // where #SizeInBytes is the size in bytes of the loaded items. For
24659 // immediates outside that range and non-immediate scalar offsets use
24660 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
24661 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
24662 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
24663 if (!isValidImmForSVEVecImmAddrMode(Offset,
24664 ScalarSizeInBytes: RetVT.getScalarSizeInBits() / 8)) {
24665 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
24666 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
24667 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
24668 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
24669 else
24670 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
24671 ? AArch64ISD::GLD1_MERGE_ZERO
24672 : AArch64ISD::GLDFF1_MERGE_ZERO;
24673
24674 std::swap(a&: Base, b&: Offset);
24675 }
24676 }
24677
24678 auto &TLI = DAG.getTargetLoweringInfo();
24679 if (!TLI.isTypeLegal(VT: Base.getValueType()))
24680 return SDValue();
24681
24682 // Some gather load variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign- (sxtw) or zero- (uxtw) extended to
  // nxv2i64. Legalize accordingly.
24685 if (!OnlyPackedOffsets &&
24686 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
24687 Offset = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::nxv2i64, Operand: Offset).getValue(R: 0);
24688
24689 // Return value type that is representable in hardware
24690 EVT HwRetVt = getSVEContainerType(ContentTy: RetVT);
24691
24692 // Keep the original output value type around - this is needed to be able to
24693 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
24694 // values we want the integer equivalent, so just use HwRetVT.
24695 SDValue OutVT = DAG.getValueType(RetVT);
24696 if (RetVT.isFloatingPoint())
24697 OutVT = DAG.getValueType(HwRetVt);
24698
24699 SDVTList VTs = DAG.getVTList(VT1: HwRetVt, VT2: MVT::Other);
24700 SDValue Ops[] = {N->getOperand(Num: 0), // Chain
24701 N->getOperand(Num: 2), // Pg
24702 Base, Offset, OutVT};
24703
24704 SDValue Load = DAG.getNode(Opcode, DL, VTList: VTs, Ops);
24705 SDValue LoadChain = SDValue(Load.getNode(), 1);
24706
24707 if (RetVT.isInteger() && (RetVT != HwRetVt))
24708 Load = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: RetVT, Operand: Load.getValue(R: 0));
24709
24710 // If the original return value was FP, bitcast accordingly. Doing it here
24711 // means that we can avoid adding TableGen patterns for FPs.
24712 if (RetVT.isFloatingPoint())
24713 Load = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: RetVT, Operand: Load.getValue(R: 0));
24714
24715 return DAG.getMergeValues(Ops: {Load, LoadChain}, dl: DL);
24716}
24717
24718static SDValue
24719performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
24720 SelectionDAG &DAG) {
24721 SDLoc DL(N);
24722 SDValue Src = N->getOperand(Num: 0);
24723 unsigned Opc = Src->getOpcode();
24724
24725 // Sign extend of an unsigned unpack -> signed unpack
24726 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
24727
24728 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
24729 : AArch64ISD::SUNPKLO;
24730
24731 // Push the sign extend to the operand of the unpack
24732 // This is necessary where, for example, the operand of the unpack
24733 // is another unpack:
24734 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
24735 // ->
24736 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
24737 // ->
24738 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
24739 SDValue ExtOp = Src->getOperand(Num: 0);
24740 auto VT = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT();
24741 EVT EltTy = VT.getVectorElementType();
24742 (void)EltTy;
24743
24744 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
24745 "Sign extending from an invalid type");
24746
24747 EVT ExtVT = VT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
24748
24749 SDValue Ext = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: ExtOp.getValueType(),
24750 N1: ExtOp, N2: DAG.getValueType(ExtVT));
24751
24752 return DAG.getNode(Opcode: SOpc, DL, VT: N->getValueType(ResNo: 0), Operand: Ext);
24753 }
24754
24755 if (DCI.isBeforeLegalizeOps())
24756 return SDValue();
24757
24758 if (!EnableCombineMGatherIntrinsics)
24759 return SDValue();
24760
24761 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
24762 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
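  // For example (illustrative operand order),
  //   (sign_extend_inreg (GLD1_MERGE_ZERO pg, base, offset, vt), vt)
  // becomes
  //   (GLD1S_MERGE_ZERO pg, base, offset, vt)
  // i.e. the sign extension is folded into the load by switching to the
  // signed-extending opcode, provided the extension type matches the memory
  // type.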
24763 unsigned NewOpc;
24764 unsigned MemVTOpNum = 4;
24765 switch (Opc) {
24766 case AArch64ISD::LD1_MERGE_ZERO:
24767 NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
24768 MemVTOpNum = 3;
24769 break;
24770 case AArch64ISD::LDNF1_MERGE_ZERO:
24771 NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
24772 MemVTOpNum = 3;
24773 break;
24774 case AArch64ISD::LDFF1_MERGE_ZERO:
24775 NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
24776 MemVTOpNum = 3;
24777 break;
24778 case AArch64ISD::GLD1_MERGE_ZERO:
24779 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
24780 break;
24781 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
24782 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
24783 break;
24784 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
24785 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
24786 break;
24787 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
24788 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
24789 break;
24790 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
24791 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
24792 break;
24793 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
24794 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
24795 break;
24796 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
24797 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
24798 break;
24799 case AArch64ISD::GLDFF1_MERGE_ZERO:
24800 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
24801 break;
24802 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
24803 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
24804 break;
24805 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
24806 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
24807 break;
24808 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
24809 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
24810 break;
24811 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
24812 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
24813 break;
24814 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
24815 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
24816 break;
24817 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
24818 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
24819 break;
24820 case AArch64ISD::GLDNT1_MERGE_ZERO:
24821 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
24822 break;
24823 default:
24824 return SDValue();
24825 }
24826
24827 EVT SignExtSrcVT = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT();
24828 EVT SrcMemVT = cast<VTSDNode>(Val: Src->getOperand(Num: MemVTOpNum))->getVT();
24829
24830 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
24831 return SDValue();
24832
24833 EVT DstVT = N->getValueType(ResNo: 0);
24834 SDVTList VTs = DAG.getVTList(VT1: DstVT, VT2: MVT::Other);
24835
24836 SmallVector<SDValue, 5> Ops;
24837 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
24838 Ops.push_back(Elt: Src->getOperand(Num: I));
24839
24840 SDValue ExtLoad = DAG.getNode(Opcode: NewOpc, DL: SDLoc(N), VTList: VTs, Ops);
24841 DCI.CombineTo(N, Res: ExtLoad);
24842 DCI.CombineTo(N: Src.getNode(), Res0: ExtLoad, Res1: ExtLoad.getValue(R: 1));
24843
24844 // Return N so it doesn't get rechecked
24845 return SDValue(N, 0);
24846}
24847
24848/// Legalize the gather prefetch (scalar + vector addressing mode) when the
24849/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
24850/// != nxv2i32) do not need legalization.
24851static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
24852 const unsigned OffsetPos = 4;
24853 SDValue Offset = N->getOperand(Num: OffsetPos);
24854
24855 // Not an unpacked vector, bail out.
24856 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
24857 return SDValue();
24858
24859 // Extend the unpacked offset vector to 64-bit lanes.
24860 SDLoc DL(N);
24861 Offset = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::nxv2i64, Operand: Offset);
24862 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24863 // Replace the offset operand with the 64-bit one.
24864 Ops[OffsetPos] = Offset;
24865
24866 return DAG.getNode(Opcode: N->getOpcode(), DL, VTList: DAG.getVTList(VT: MVT::Other), Ops);
24867}
24868
24869/// Combines a node carrying the intrinsic
24870/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
24871/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
24872/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
24873/// sve gather prefetch instruction with vector plus immediate addressing mode.
24874static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
24875 unsigned ScalarSizeInBytes) {
24876 const unsigned ImmPos = 4, OffsetPos = 3;
24877 // No need to combine the node if the immediate is valid...
24878 if (isValidImmForSVEVecImmAddrMode(Offset: N->getOperand(Num: ImmPos), ScalarSizeInBytes))
24879 return SDValue();
24880
24881 // ...otherwise swap the offset base with the offset...
24882 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24883 std::swap(a&: Ops[ImmPos], b&: Ops[OffsetPos]);
24884 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
24885 // `aarch64_sve_prfb_gather_uxtw_index`.
24886 SDLoc DL(N);
24887 Ops[1] = DAG.getConstant(Val: Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
24888 VT: MVT::i64);
24889
24890 return DAG.getNode(Opcode: N->getOpcode(), DL, VTList: DAG.getVTList(VT: MVT::Other), Ops);
24891}
24892
// Return true if the vector operation can guarantee that only the first lane
// of its result contains data, with all bits in other lanes set to zero.
24895static bool isLanes1toNKnownZero(SDValue Op) {
24896 switch (Op.getOpcode()) {
24897 default:
24898 return false;
24899 case AArch64ISD::ANDV_PRED:
24900 case AArch64ISD::EORV_PRED:
24901 case AArch64ISD::FADDA_PRED:
24902 case AArch64ISD::FADDV_PRED:
24903 case AArch64ISD::FMAXNMV_PRED:
24904 case AArch64ISD::FMAXV_PRED:
24905 case AArch64ISD::FMINNMV_PRED:
24906 case AArch64ISD::FMINV_PRED:
24907 case AArch64ISD::ORV_PRED:
24908 case AArch64ISD::SADDV_PRED:
24909 case AArch64ISD::SMAXV_PRED:
24910 case AArch64ISD::SMINV_PRED:
24911 case AArch64ISD::UADDV_PRED:
24912 case AArch64ISD::UMAXV_PRED:
24913 case AArch64ISD::UMINV_PRED:
24914 return true;
24915 }
24916}
24917
24918static SDValue removeRedundantInsertVectorElt(SDNode *N) {
24919 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
24920 SDValue InsertVec = N->getOperand(Num: 0);
24921 SDValue InsertElt = N->getOperand(Num: 1);
24922 SDValue InsertIdx = N->getOperand(Num: 2);
24923
24924 // We only care about inserts into the first element...
24925 if (!isNullConstant(V: InsertIdx))
24926 return SDValue();
24927 // ...of a zero'd vector...
24928 if (!ISD::isConstantSplatVectorAllZeros(N: InsertVec.getNode()))
24929 return SDValue();
24930 // ...where the inserted data was previously extracted...
24931 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24932 return SDValue();
24933
24934 SDValue ExtractVec = InsertElt.getOperand(i: 0);
24935 SDValue ExtractIdx = InsertElt.getOperand(i: 1);
24936
24937 // ...from the first element of a vector.
24938 if (!isNullConstant(V: ExtractIdx))
24939 return SDValue();
24940
24941 // If we get here we are effectively trying to zero lanes 1-N of a vector.
24942
24943 // Ensure there's no type conversion going on.
24944 if (N->getValueType(ResNo: 0) != ExtractVec.getValueType())
24945 return SDValue();
24946
24947 if (!isLanes1toNKnownZero(Op: ExtractVec))
24948 return SDValue();
24949
24950 // The explicit zeroing is redundant.
24951 return ExtractVec;
24952}
24953
24954static SDValue
24955performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
24956 if (SDValue Res = removeRedundantInsertVectorElt(N))
24957 return Res;
24958
24959 return performPostLD1Combine(N, DCI, IsLaneOp: true);
24960}
24961
24962static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
24963 TargetLowering::DAGCombinerInfo &DCI,
24964 const AArch64Subtarget *Subtarget) {
24965 SDValue N0 = N->getOperand(Num: 0);
24966 EVT VT = N->getValueType(ResNo: 0);
24967
24968 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
24969 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
24970 return SDValue();
24971
24972 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
24973 EVT EltVT = VT.getVectorElementType();
24974 return EltVT == MVT::f32 || EltVT == MVT::f64;
24975 };
24976
24977 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
24978 // We purposefully don't care about legality of the nodes here as we know
24979 // they can be split down into something legal.
24980 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N: N0.getNode()) &&
24981 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
24982 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
24983 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
24984 LoadSDNode *LN0 = cast<LoadSDNode>(Val&: N0);
24985 SDValue ExtLoad = DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl: SDLoc(N), VT,
24986 Chain: LN0->getChain(), Ptr: LN0->getBasePtr(),
24987 MemVT: N0.getValueType(), MMO: LN0->getMemOperand());
24988 DCI.CombineTo(N, Res: ExtLoad);
24989 DCI.CombineTo(
24990 N: N0.getNode(),
24991 Res0: DAG.getNode(Opcode: ISD::FP_ROUND, DL: SDLoc(N0), VT: N0.getValueType(), N1: ExtLoad,
24992 N2: DAG.getIntPtrConstant(Val: 1, DL: SDLoc(N0), /*isTarget=*/true)),
24993 Res1: ExtLoad.getValue(R: 1));
24994 return SDValue(N, 0); // Return N so it doesn't get rechecked!
24995 }
24996
24997 return SDValue();
24998}
24999
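// Expand AArch64ISD::BSP as (Mask & In1) | (~Mask & In2) for scalable vectors
// when neither SVE2 nor SME, which provide a native bitwise select (BSL), is
// available.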
25000static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
25001 const AArch64Subtarget *Subtarget) {
25002 EVT VT = N->getValueType(ResNo: 0);
25003
25004 // Don't expand for NEON, SVE2 or SME
25005 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
25006 return SDValue();
25007
25008 SDLoc DL(N);
25009
25010 SDValue Mask = N->getOperand(Num: 0);
25011 SDValue In1 = N->getOperand(Num: 1);
25012 SDValue In2 = N->getOperand(Num: 2);
25013
25014 SDValue InvMask = DAG.getNOT(DL, Val: Mask, VT);
25015 SDValue Sel = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Mask, N2: In1);
25016 SDValue SelInv = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: InvMask, N2: In2);
25017 return DAG.getNode(Opcode: ISD::OR, DL, VT, N1: Sel, N2: SelInv);
25018}
25019
25020static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
25021 EVT VT = N->getValueType(ResNo: 0);
25022
25023 SDValue Insert = N->getOperand(Num: 0);
25024 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
25025 return SDValue();
25026
25027 if (!Insert.getOperand(i: 0).isUndef())
25028 return SDValue();
25029
25030 uint64_t IdxInsert = Insert.getConstantOperandVal(i: 2);
25031 uint64_t IdxDupLane = N->getConstantOperandVal(Num: 1);
25032 if (IdxInsert != 0 || IdxDupLane != 0)
25033 return SDValue();
25034
25035 SDValue Bitcast = Insert.getOperand(i: 1);
25036 if (Bitcast.getOpcode() != ISD::BITCAST)
25037 return SDValue();
25038
25039 SDValue Subvec = Bitcast.getOperand(i: 0);
25040 EVT SubvecVT = Subvec.getValueType();
25041 if (!SubvecVT.is128BitVector())
25042 return SDValue();
25043 EVT NewSubvecVT =
25044 getPackedSVEVectorVT(VT: Subvec.getValueType().getVectorElementType());
25045
25046 SDLoc DL(N);
25047 SDValue NewInsert =
25048 DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: NewSubvecVT,
25049 N1: DAG.getUNDEF(VT: NewSubvecVT), N2: Subvec, N3: Insert->getOperand(Num: 2));
25050 SDValue NewDuplane128 = DAG.getNode(Opcode: AArch64ISD::DUPLANE128, DL, VT: NewSubvecVT,
25051 N1: NewInsert, N2: N->getOperand(Num: 1));
25052 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewDuplane128);
25053}
25054
25055// Try to combine mull with uzp1.
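// When one [SU]MULL multiplies (extract_high V) by (truncate X) and a second
// one multiplies (extract_low V) by (truncate Y), the two truncates can be
// replaced by a single UZP1 of the untruncated operands, with each half read
// back via extract_subvector. (If only the high half is found, the UZP1 is
// still formed, with an undef low half.)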
25056static SDValue tryCombineMULLWithUZP1(SDNode *N,
25057 TargetLowering::DAGCombinerInfo &DCI,
25058 SelectionDAG &DAG) {
25059 if (DCI.isBeforeLegalizeOps())
25060 return SDValue();
25061
25062 SDValue LHS = N->getOperand(Num: 0);
25063 SDValue RHS = N->getOperand(Num: 1);
25064
25065 SDValue ExtractHigh;
25066 SDValue ExtractLow;
25067 SDValue TruncHigh;
25068 SDValue TruncLow;
25069 SDLoc DL(N);
25070
25071 // Check the operands are trunc and extract_high.
25072 if (isEssentiallyExtractHighSubvector(N: LHS) &&
25073 RHS.getOpcode() == ISD::TRUNCATE) {
25074 TruncHigh = RHS;
25075 if (LHS.getOpcode() == ISD::BITCAST)
25076 ExtractHigh = LHS.getOperand(i: 0);
25077 else
25078 ExtractHigh = LHS;
25079 } else if (isEssentiallyExtractHighSubvector(N: RHS) &&
25080 LHS.getOpcode() == ISD::TRUNCATE) {
25081 TruncHigh = LHS;
25082 if (RHS.getOpcode() == ISD::BITCAST)
25083 ExtractHigh = RHS.getOperand(i: 0);
25084 else
25085 ExtractHigh = RHS;
25086 } else
25087 return SDValue();
25088
  // If the truncate's operand is a DUP or a splat value, do not combine the op
  // with uzp1.
  // You can see the regressions in test/CodeGen/AArch64/aarch64-smull.ll.
25092 SDValue TruncHighOp = TruncHigh.getOperand(i: 0);
25093 EVT TruncHighOpVT = TruncHighOp.getValueType();
25094 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
25095 DAG.isSplatValue(V: TruncHighOp, AllowUndefs: false))
25096 return SDValue();
25097
  // Check whether there is another extract_high using the same source vector.
25099 // For example,
25100 //
25101 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
25102 // t12: v4i16 = truncate t11
25103 // t31: v4i32 = AArch64ISD::SMULL t18, t12
25104 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
25105 // t16: v4i16 = truncate t15
25106 // t30: v4i32 = AArch64ISD::SMULL t23, t1
25107 //
  // This dagcombine assumes the two extract_high nodes use the same source
  // vector in order to detect the paired mull. If they use different source
  // vectors, this code will not work.
25111 // TODO: Should also try to look through a bitcast.
25112 bool HasFoundMULLow = true;
25113 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(i: 0);
25114 if (ExtractHighSrcVec->use_size() != 2)
25115 HasFoundMULLow = false;
25116
25117 // Find ExtractLow.
25118 for (SDNode *User : ExtractHighSrcVec.getNode()->uses()) {
25119 if (User == ExtractHigh.getNode())
25120 continue;
25121
25122 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
25123 !isNullConstant(V: User->getOperand(Num: 1))) {
25124 HasFoundMULLow = false;
25125 break;
25126 }
25127
25128 ExtractLow.setNode(User);
25129 }
25130
25131 if (!ExtractLow || !ExtractLow->hasOneUse())
25132 HasFoundMULLow = false;
25133
25134 // Check ExtractLow's user.
25135 if (HasFoundMULLow) {
25136 SDNode *ExtractLowUser = *ExtractLow.getNode()->use_begin();
25137 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
25138 HasFoundMULLow = false;
25139 } else {
25140 if (ExtractLowUser->getOperand(Num: 0) == ExtractLow) {
25141 if (ExtractLowUser->getOperand(Num: 1).getOpcode() == ISD::TRUNCATE)
25142 TruncLow = ExtractLowUser->getOperand(Num: 1);
25143 else
25144 HasFoundMULLow = false;
25145 } else {
25146 if (ExtractLowUser->getOperand(Num: 0).getOpcode() == ISD::TRUNCATE)
25147 TruncLow = ExtractLowUser->getOperand(Num: 0);
25148 else
25149 HasFoundMULLow = false;
25150 }
25151 }
25152 }
25153
25154 // If the truncate's operand is a DUP or another splat value, do not combine
25155 // the op with uzp1; doing so causes regressions in
25156 // test/CodeGen/AArch64/aarch64-smull.ll.
25157 EVT TruncHighVT = TruncHigh.getValueType();
25158 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
25159 SDValue TruncLowOp =
25160 HasFoundMULLow ? TruncLow.getOperand(i: 0) : DAG.getUNDEF(VT: UZP1VT);
25161 EVT TruncLowOpVT = TruncLowOp.getValueType();
25162 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
25163 DAG.isSplatValue(V: TruncLowOp, AllowUndefs: false)))
25164 return SDValue();
25165
25166 // Create uzp1, extract_high and extract_low.
25167 if (TruncHighOpVT != UZP1VT)
25168 TruncHighOp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: UZP1VT, Operand: TruncHighOp);
25169 if (TruncLowOpVT != UZP1VT)
25170 TruncLowOp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: UZP1VT, Operand: TruncLowOp);
25171
25172 SDValue UZP1 =
25173 DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: UZP1VT, N1: TruncLowOp, N2: TruncHighOp);
25174 SDValue HighIdxCst =
25175 DAG.getConstant(Val: TruncHighVT.getVectorNumElements(), DL, VT: MVT::i64);
25176 SDValue NewTruncHigh =
25177 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: TruncHighVT, N1: UZP1, N2: HighIdxCst);
25178 DAG.ReplaceAllUsesWith(From: TruncHigh, To: NewTruncHigh);
25179
25180 if (HasFoundMULLow) {
25181 EVT TruncLowVT = TruncLow.getValueType();
25182 SDValue NewTruncLow = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: TruncLowVT,
25183 N1: UZP1, N2: ExtractLow.getOperand(i: 1));
25184 DAG.ReplaceAllUsesWith(From: TruncLow, To: NewTruncLow);
25185 }
25186
25187 return SDValue(N, 0);
25188}
25189
25190static SDValue performMULLCombine(SDNode *N,
25191 TargetLowering::DAGCombinerInfo &DCI,
25192 SelectionDAG &DAG) {
25193 if (SDValue Val =
25194 tryCombineLongOpWithDup(IID: Intrinsic::not_intrinsic, N, DCI, DAG))
25195 return Val;
25196
25197 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
25198 return Val;
25199
25200 return SDValue();
25201}
25202
25203static SDValue
25204performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
25205 SelectionDAG &DAG) {
25206 // Perform the following transform:
25207 //
25208 // t34: v4i32 = AArch64ISD::UADDLV t2
25209 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
25210 // t7: i64 = zero_extend t35
25211 // t20: v1i64 = scalar_to_vector t7
25212 // ==>
25213 // t34: v4i32 = AArch64ISD::UADDLV t2
25214 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
25215 // t40: v1i64 = AArch64ISD::NVCAST t39
25216 if (DCI.isBeforeLegalizeOps())
25217 return SDValue();
25218
25219 EVT VT = N->getValueType(ResNo: 0);
25220 if (VT != MVT::v1i64)
25221 return SDValue();
25222
25223 SDValue ZEXT = N->getOperand(Num: 0);
25224 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
25225 return SDValue();
25226
25227 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(i: 0);
25228 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
25229 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
25230 return SDValue();
25231
25232 if (!isNullConstant(V: EXTRACT_VEC_ELT.getOperand(i: 1)))
25233 return SDValue();
25234
25235 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(i: 0);
25236 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
25237 UADDLV.getValueType() != MVT::v4i32 ||
25238 UADDLV.getOperand(i: 0).getValueType() != MVT::v8i8)
25239 return SDValue();
25240
25241 // Generate the new sequence using AArch64ISD::NVCAST.
25242 SDLoc DL(N);
25243 SDValue EXTRACT_SUBVEC =
25244 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v2i32, N1: UADDLV,
25245 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
25246 SDValue NVCAST =
25247 DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: MVT::v1i64, Operand: EXTRACT_SUBVEC);
25248
25249 return NVCAST;
25250}
25251
25252SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
25253 DAGCombinerInfo &DCI) const {
25254 SelectionDAG &DAG = DCI.DAG;
25255 switch (N->getOpcode()) {
25256 default:
25257 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
25258 break;
25259 case ISD::VECREDUCE_AND:
25260 case ISD::VECREDUCE_OR:
25261 case ISD::VECREDUCE_XOR:
25262 return performVecReduceBitwiseCombine(N, DCI, DAG);
25263 case ISD::ADD:
25264 case ISD::SUB:
25265 return performAddSubCombine(N, DCI);
25266 case ISD::BUILD_VECTOR:
25267 return performBuildVectorCombine(N, DCI, DAG);
25268 case ISD::TRUNCATE:
25269 return performTruncateCombine(N, DAG);
25270 case AArch64ISD::ANDS:
25271 return performFlagSettingCombine(N, DCI, GenericOpcode: ISD::AND);
25272 case AArch64ISD::ADC:
25273 if (auto R = foldOverflowCheck(Op: N, DAG, /* IsAdd */ true))
25274 return R;
25275 return foldADCToCINC(N, DAG);
25276 case AArch64ISD::SBC:
25277 return foldOverflowCheck(Op: N, DAG, /* IsAdd */ false);
25278 case AArch64ISD::ADCS:
25279 if (auto R = foldOverflowCheck(Op: N, DAG, /* IsAdd */ true))
25280 return R;
25281 return performFlagSettingCombine(N, DCI, GenericOpcode: AArch64ISD::ADC);
25282 case AArch64ISD::SBCS:
25283 if (auto R = foldOverflowCheck(Op: N, DAG, /* IsAdd */ false))
25284 return R;
25285 return performFlagSettingCombine(N, DCI, GenericOpcode: AArch64ISD::SBC);
25286 case AArch64ISD::BICi: {
25287 APInt DemandedBits =
25288 APInt::getAllOnes(numBits: N->getValueType(ResNo: 0).getScalarSizeInBits());
25289 APInt DemandedElts =
25290 APInt::getAllOnes(numBits: N->getValueType(ResNo: 0).getVectorNumElements());
25291
25292 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(
25293 Op: SDValue(N, 0), DemandedBits, DemandedElts, DCI))
25294 return SDValue();
25295
25296 break;
25297 }
25298 case ISD::XOR:
25299 return performXorCombine(N, DAG, DCI, Subtarget);
25300 case ISD::MUL:
25301 return performMulCombine(N, DAG, DCI, Subtarget);
25302 case ISD::SINT_TO_FP:
25303 case ISD::UINT_TO_FP:
25304 return performIntToFpCombine(N, DAG, Subtarget);
25305 case ISD::FP_TO_SINT:
25306 case ISD::FP_TO_UINT:
25307 case ISD::FP_TO_SINT_SAT:
25308 case ISD::FP_TO_UINT_SAT:
25309 return performFpToIntCombine(N, DAG, DCI, Subtarget);
25310 case ISD::OR:
25311 return performORCombine(N, DCI, Subtarget, TLI: *this);
25312 case ISD::AND:
25313 return performANDCombine(N, DCI);
25314 case ISD::FADD:
25315 return performFADDCombine(N, DCI);
25316 case ISD::INTRINSIC_WO_CHAIN:
25317 return performIntrinsicCombine(N, DCI, Subtarget);
25318 case ISD::ANY_EXTEND:
25319 case ISD::ZERO_EXTEND:
25320 case ISD::SIGN_EXTEND:
25321 return performExtendCombine(N, DCI, DAG);
25322 case ISD::SIGN_EXTEND_INREG:
25323 return performSignExtendInRegCombine(N, DCI, DAG);
25324 case ISD::CONCAT_VECTORS:
25325 return performConcatVectorsCombine(N, DCI, DAG);
25326 case ISD::EXTRACT_SUBVECTOR:
25327 return performExtractSubvectorCombine(N, DCI, DAG);
25328 case ISD::INSERT_SUBVECTOR:
25329 return performInsertSubvectorCombine(N, DCI, DAG);
25330 case ISD::SELECT:
25331 return performSelectCombine(N, DCI);
25332 case ISD::VSELECT:
25333 return performVSelectCombine(N, DAG&: DCI.DAG);
25334 case ISD::SETCC:
25335 return performSETCCCombine(N, DCI, DAG);
25336 case ISD::LOAD:
25337 return performLOADCombine(N, DCI, DAG, Subtarget);
25338 case ISD::STORE:
25339 return performSTORECombine(N, DCI, DAG, Subtarget);
25340 case ISD::MSTORE:
25341 return performMSTORECombine(N, DCI, DAG, Subtarget);
25342 case ISD::MGATHER:
25343 case ISD::MSCATTER:
25344 return performMaskedGatherScatterCombine(N, DCI, DAG);
25345 case ISD::FP_EXTEND:
25346 return performFPExtendCombine(N, DAG, DCI, Subtarget);
25347 case AArch64ISD::BRCOND:
25348 return performBRCONDCombine(N, DCI, DAG);
25349 case AArch64ISD::TBNZ:
25350 case AArch64ISD::TBZ:
25351 return performTBZCombine(N, DCI, DAG);
25352 case AArch64ISD::CSEL:
25353 return performCSELCombine(N, DCI, DAG);
25354 case AArch64ISD::DUP:
25355 case AArch64ISD::DUPLANE8:
25356 case AArch64ISD::DUPLANE16:
25357 case AArch64ISD::DUPLANE32:
25358 case AArch64ISD::DUPLANE64:
25359 return performDUPCombine(N, DCI);
25360 case AArch64ISD::DUPLANE128:
25361 return performDupLane128Combine(N, DAG);
25362 case AArch64ISD::NVCAST:
25363 return performNVCASTCombine(N, DAG);
25364 case AArch64ISD::SPLICE:
25365 return performSpliceCombine(N, DAG);
25366 case AArch64ISD::UUNPKLO:
25367 case AArch64ISD::UUNPKHI:
25368 return performUnpackCombine(N, DAG, Subtarget);
25369 case AArch64ISD::UZP1:
25370 case AArch64ISD::UZP2:
25371 return performUzpCombine(N, DAG, Subtarget);
25372 case AArch64ISD::SETCC_MERGE_ZERO:
25373 return performSetccMergeZeroCombine(N, DCI);
25374 case AArch64ISD::REINTERPRET_CAST:
25375 return performReinterpretCastCombine(N);
25376 case AArch64ISD::GLD1_MERGE_ZERO:
25377 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
25378 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
25379 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
25380 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
25381 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
25382 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
25383 case AArch64ISD::GLD1S_MERGE_ZERO:
25384 case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
25385 case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
25386 case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
25387 case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
25388 case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
25389 case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
25390 return performGLD1Combine(N, DAG);
25391 case AArch64ISD::VASHR:
25392 case AArch64ISD::VLSHR:
25393 return performVectorShiftCombine(N, TLI: *this, DCI);
25394 case AArch64ISD::SUNPKLO:
25395 return performSunpkloCombine(N, DAG);
25396 case AArch64ISD::BSP:
25397 return performBSPExpandForSVE(N, DAG, Subtarget);
25398 case ISD::INSERT_VECTOR_ELT:
25399 return performInsertVectorEltCombine(N, DCI);
25400 case ISD::EXTRACT_VECTOR_ELT:
25401 return performExtractVectorEltCombine(N, DCI, Subtarget);
25402 case ISD::VECREDUCE_ADD:
25403 return performVecReduceAddCombine(N, DAG&: DCI.DAG, ST: Subtarget);
25404 case AArch64ISD::UADDV:
25405 return performUADDVCombine(N, DAG);
25406 case AArch64ISD::SMULL:
25407 case AArch64ISD::UMULL:
25408 case AArch64ISD::PMULL:
25409 return performMULLCombine(N, DCI, DAG);
25410 case ISD::INTRINSIC_VOID:
25411 case ISD::INTRINSIC_W_CHAIN:
25412 switch (N->getConstantOperandVal(Num: 1)) {
25413 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
25414 return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 1 /*=ScalarSizeInBytes*/);
25415 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
25416 return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 2 /*=ScalarSizeInBytes*/);
25417 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
25418 return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 4 /*=ScalarSizeInBytes*/);
25419 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
25420 return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 8 /*=ScalarSizeInBytes*/);
25421 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
25422 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
25423 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
25424 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
25425 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
25426 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
25427 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
25428 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
25429 return legalizeSVEGatherPrefetchOffsVec(N, DAG);
25430 case Intrinsic::aarch64_neon_ld2:
25431 case Intrinsic::aarch64_neon_ld3:
25432 case Intrinsic::aarch64_neon_ld4:
25433 case Intrinsic::aarch64_neon_ld1x2:
25434 case Intrinsic::aarch64_neon_ld1x3:
25435 case Intrinsic::aarch64_neon_ld1x4:
25436 case Intrinsic::aarch64_neon_ld2lane:
25437 case Intrinsic::aarch64_neon_ld3lane:
25438 case Intrinsic::aarch64_neon_ld4lane:
25439 case Intrinsic::aarch64_neon_ld2r:
25440 case Intrinsic::aarch64_neon_ld3r:
25441 case Intrinsic::aarch64_neon_ld4r:
25442 case Intrinsic::aarch64_neon_st2:
25443 case Intrinsic::aarch64_neon_st3:
25444 case Intrinsic::aarch64_neon_st4:
25445 case Intrinsic::aarch64_neon_st1x2:
25446 case Intrinsic::aarch64_neon_st1x3:
25447 case Intrinsic::aarch64_neon_st1x4:
25448 case Intrinsic::aarch64_neon_st2lane:
25449 case Intrinsic::aarch64_neon_st3lane:
25450 case Intrinsic::aarch64_neon_st4lane:
25451 return performNEONPostLDSTCombine(N, DCI, DAG);
25452 case Intrinsic::aarch64_sve_ldnt1:
25453 return performLDNT1Combine(N, DAG);
25454 case Intrinsic::aarch64_sve_ld1rq:
25455 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
25456 case Intrinsic::aarch64_sve_ld1ro:
25457 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
25458 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
25459 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDNT1_MERGE_ZERO);
25460 case Intrinsic::aarch64_sve_ldnt1_gather:
25461 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDNT1_MERGE_ZERO);
25462 case Intrinsic::aarch64_sve_ldnt1_gather_index:
25463 return performGatherLoadCombine(N, DAG,
25464 Opcode: AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
25465 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
25466 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDNT1_MERGE_ZERO);
25467 case Intrinsic::aarch64_sve_ld1:
25468 return performLD1Combine(N, DAG, Opc: AArch64ISD::LD1_MERGE_ZERO);
25469 case Intrinsic::aarch64_sve_ldnf1:
25470 return performLD1Combine(N, DAG, Opc: AArch64ISD::LDNF1_MERGE_ZERO);
25471 case Intrinsic::aarch64_sve_ldff1:
25472 return performLD1Combine(N, DAG, Opc: AArch64ISD::LDFF1_MERGE_ZERO);
25473 case Intrinsic::aarch64_sve_st1:
25474 return performST1Combine(N, DAG);
25475 case Intrinsic::aarch64_sve_stnt1:
25476 return performSTNT1Combine(N, DAG);
25477 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
25478 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_PRED);
25479 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
25480 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_PRED);
25481 case Intrinsic::aarch64_sve_stnt1_scatter:
25482 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_PRED);
25483 case Intrinsic::aarch64_sve_stnt1_scatter_index:
25484 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_INDEX_PRED);
25485 case Intrinsic::aarch64_sve_ld1_gather:
25486 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_MERGE_ZERO);
25487 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
25488 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
25489 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1Q_MERGE_ZERO);
25490 case Intrinsic::aarch64_sve_ld1q_gather_index:
25491 return performGatherLoadCombine(N, DAG,
25492 Opcode: AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
25493 case Intrinsic::aarch64_sve_ld1_gather_index:
25494 return performGatherLoadCombine(N, DAG,
25495 Opcode: AArch64ISD::GLD1_SCALED_MERGE_ZERO);
25496 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
25497 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_SXTW_MERGE_ZERO,
25498 /*OnlyPackedOffsets=*/false);
25499 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
25500 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_UXTW_MERGE_ZERO,
25501 /*OnlyPackedOffsets=*/false);
25502 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
25503 return performGatherLoadCombine(N, DAG,
25504 Opcode: AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
25505 /*OnlyPackedOffsets=*/false);
25506 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
25507 return performGatherLoadCombine(N, DAG,
25508 Opcode: AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
25509 /*OnlyPackedOffsets=*/false);
25510 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
25511 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_IMM_MERGE_ZERO);
25512 case Intrinsic::aarch64_sve_ldff1_gather:
25513 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDFF1_MERGE_ZERO);
25514 case Intrinsic::aarch64_sve_ldff1_gather_index:
25515 return performGatherLoadCombine(N, DAG,
25516 Opcode: AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
25517 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
25518 return performGatherLoadCombine(N, DAG,
25519 Opcode: AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
25520 /*OnlyPackedOffsets=*/false);
25521 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
25522 return performGatherLoadCombine(N, DAG,
25523 Opcode: AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
25524 /*OnlyPackedOffsets=*/false);
25525 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
25526 return performGatherLoadCombine(N, DAG,
25527 Opcode: AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
25528 /*OnlyPackedOffsets=*/false);
25529 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
25530 return performGatherLoadCombine(N, DAG,
25531 Opcode: AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
25532 /*OnlyPackedOffsets=*/false);
25533 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
25534 return performGatherLoadCombine(N, DAG,
25535 Opcode: AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
25536 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
25537 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
25538 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1Q_PRED);
25539 case Intrinsic::aarch64_sve_st1q_scatter_index:
25540 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1Q_INDEX_PRED);
25541 case Intrinsic::aarch64_sve_st1_scatter:
25542 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_PRED);
25543 case Intrinsic::aarch64_sve_st1_scatter_index:
25544 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_SCALED_PRED);
25545 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
25546 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_SXTW_PRED,
25547 /*OnlyPackedOffsets=*/false);
25548 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
25549 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_UXTW_PRED,
25550 /*OnlyPackedOffsets=*/false);
25551 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
25552 return performScatterStoreCombine(N, DAG,
25553 Opcode: AArch64ISD::SST1_SXTW_SCALED_PRED,
25554 /*OnlyPackedOffsets=*/false);
25555 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
25556 return performScatterStoreCombine(N, DAG,
25557 Opcode: AArch64ISD::SST1_UXTW_SCALED_PRED,
25558 /*OnlyPackedOffsets=*/false);
25559 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
25560 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_IMM_PRED);
25561 case Intrinsic::aarch64_rndr:
25562 case Intrinsic::aarch64_rndrrs: {
25563 unsigned IntrinsicID = N->getConstantOperandVal(Num: 1);
25564 auto Register =
25565 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
25566 : AArch64SysReg::RNDRRS);
25567 SDLoc DL(N);
25568 SDValue A = DAG.getNode(
25569 Opcode: AArch64ISD::MRS, DL, VTList: DAG.getVTList(VT1: MVT::i64, VT2: MVT::Glue, VT3: MVT::Other),
25570 N1: N->getOperand(Num: 0), N2: DAG.getConstant(Val: Register, DL, VT: MVT::i64));
25571 SDValue B = DAG.getNode(
25572 Opcode: AArch64ISD::CSINC, DL, VT: MVT::i32, N1: DAG.getConstant(Val: 0, DL, VT: MVT::i32),
25573 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i32),
25574 N3: DAG.getConstant(Val: AArch64CC::NE, DL, VT: MVT::i32), N4: A.getValue(R: 1));
25575 return DAG.getMergeValues(
25576 Ops: {A, DAG.getZExtOrTrunc(Op: B, DL, VT: MVT::i1), A.getValue(R: 2)}, dl: DL);
25577 }
25578 case Intrinsic::aarch64_sme_ldr_zt:
25579 return DAG.getNode(Opcode: AArch64ISD::RESTORE_ZT, DL: SDLoc(N),
25580 VTList: DAG.getVTList(VT: MVT::Other), N1: N->getOperand(Num: 0),
25581 N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
25582 case Intrinsic::aarch64_sme_str_zt:
25583 return DAG.getNode(Opcode: AArch64ISD::SAVE_ZT, DL: SDLoc(N),
25584 VTList: DAG.getVTList(VT: MVT::Other), N1: N->getOperand(Num: 0),
25585 N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
25586 default:
25587 break;
25588 }
25589 break;
25590 case ISD::GlobalAddress:
25591 return performGlobalAddressCombine(N, DAG, Subtarget, TM: getTargetMachine());
25592 case ISD::CTLZ:
25593 return performCTLZCombine(N, DAG, Subtarget);
25594 case ISD::SCALAR_TO_VECTOR:
25595 return performScalarToVectorCombine(N, DCI, DAG);
25596 }
25597 return SDValue();
25598}
25599
25600// Check whether the value is used only as a return value, as otherwise we
25601// can't perform a tail call. In particular, we need to check for
25602// target ISD nodes that are returns and any other "odd" constructs
25603// that the generic analysis code won't necessarily catch.
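// Concretely: the only non-chain user of N must be a CopyToReg without a glue
// operand (or an FP_EXTEND), and every user of that node must be an
// AArch64ISD::RET_GLUE.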
25604bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
25605 SDValue &Chain) const {
25606 if (N->getNumValues() != 1)
25607 return false;
25608 if (!N->hasNUsesOfValue(NUses: 1, Value: 0))
25609 return false;
25610
25611 SDValue TCChain = Chain;
25612 SDNode *Copy = *N->use_begin();
25613 if (Copy->getOpcode() == ISD::CopyToReg) {
25614 // If the copy has a glue operand, we conservatively assume it isn't safe to
25615 // perform a tail call.
25616 if (Copy->getOperand(Num: Copy->getNumOperands() - 1).getValueType() ==
25617 MVT::Glue)
25618 return false;
25619 TCChain = Copy->getOperand(Num: 0);
25620 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
25621 return false;
25622
25623 bool HasRet = false;
25624 for (SDNode *Node : Copy->uses()) {
25625 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
25626 return false;
25627 HasRet = true;
25628 }
25629
25630 if (!HasRet)
25631 return false;
25632
25633 Chain = TCChain;
25634 return true;
25635}
25636
25637// Return whether an instruction can potentially be optimized to a tail
25638// call. This will cause the optimizers to attempt to move, or duplicate,
25639// return instructions to help enable tail call optimizations for this
25640// instruction.
25641bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
25642 return CI->isTailCall();
25643}
25644
25645bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
25646 Register Offset, bool IsPre,
25647 MachineRegisterInfo &MRI) const {
25648 auto CstOffset = getIConstantVRegVal(VReg: Offset, MRI);
25649 if (!CstOffset || CstOffset->isZero())
25650 return false;
25651
25652 // All of the indexed addressing mode instructions take a signed 9 bit
25653 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
25654 // encodes the sign/indexing direction.
25655 return isInt<9>(x: CstOffset->getSExtValue());
25656}
25657
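// Shared helper for the pre-/post-indexed addressing hooks below. Op must be
// an ADD/SUB of the pointer with a constant that fits the signed 9-bit
// immediate used by the writeback addressing modes, e.g. (illustrative):
//   ldr x0, [x1, #-16]!    (pre-indexed)
//   ldr x0, [x1], #16      (post-indexed)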
25658bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
25659 SDValue &Base,
25660 SDValue &Offset,
25661 SelectionDAG &DAG) const {
25662 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
25663 return false;
25664
25665 // Non-null if there is exactly one user of the loaded value (ignoring chain).
25666 SDNode *ValOnlyUser = nullptr;
25667 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
25668 ++UI) {
25669 if (UI.getUse().getResNo() == 1)
25670 continue; // Ignore chain.
25671 if (ValOnlyUser == nullptr)
25672 ValOnlyUser = *UI;
25673 else {
25674 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
25675 break;
25676 }
25677 }
25678
25679 auto IsUndefOrZero = [](SDValue V) {
25680 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
25681 };
25682
25683 // If the only user of the value is a scalable vector splat, it is
25684 // preferable to do a replicating load (ld1r*).
25685 if (ValOnlyUser && ValOnlyUser->getValueType(ResNo: 0).isScalableVector() &&
25686 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
25687 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
25688 IsUndefOrZero(ValOnlyUser->getOperand(Num: 2)))))
25689 return false;
25690
25691 Base = Op->getOperand(Num: 0);
25692 // All of the indexed addressing mode instructions take a signed
25693 // 9 bit immediate offset.
25694 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1))) {
25695 int64_t RHSC = RHS->getSExtValue();
25696 if (Op->getOpcode() == ISD::SUB)
25697 RHSC = -(uint64_t)RHSC;
25698 if (!isInt<9>(x: RHSC))
25699 return false;
25700 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
25701 // when dealing with subtraction.
25702 Offset = DAG.getConstant(Val: RHSC, DL: SDLoc(N), VT: RHS->getValueType(ResNo: 0));
25703 return true;
25704 }
25705 return false;
25706}
25707
25708bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
25709 SDValue &Offset,
25710 ISD::MemIndexedMode &AM,
25711 SelectionDAG &DAG) const {
25712 EVT VT;
25713 SDValue Ptr;
25714 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N)) {
25715 VT = LD->getMemoryVT();
25716 Ptr = LD->getBasePtr();
25717 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Val: N)) {
25718 VT = ST->getMemoryVT();
25719 Ptr = ST->getBasePtr();
25720 } else
25721 return false;
25722
25723 if (!getIndexedAddressParts(N, Op: Ptr.getNode(), Base, Offset, DAG))
25724 return false;
25725 AM = ISD::PRE_INC;
25726 return true;
25727}
25728
25729bool AArch64TargetLowering::getPostIndexedAddressParts(
25730 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
25731 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
25732 EVT VT;
25733 SDValue Ptr;
25734 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N)) {
25735 VT = LD->getMemoryVT();
25736 Ptr = LD->getBasePtr();
25737 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Val: N)) {
25738 VT = ST->getMemoryVT();
25739 Ptr = ST->getBasePtr();
25740 } else
25741 return false;
25742
25743 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
25744 return false;
25745 // Post-indexing updates the base, so it's not a valid transform
25746 // if that's not the same as the load's pointer.
25747 if (Ptr != Base)
25748 return false;
25749 AM = ISD::POST_INC;
25750 return true;
25751}
25752
25753static void replaceBoolVectorBitcast(SDNode *N,
25754 SmallVectorImpl<SDValue> &Results,
25755 SelectionDAG &DAG) {
25756 SDLoc DL(N);
25757 SDValue Op = N->getOperand(Num: 0);
25758 EVT VT = N->getValueType(ResNo: 0);
25759 [[maybe_unused]] EVT SrcVT = Op.getValueType();
25760 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25761 "Must be bool vector.");
25762
25763 // Special handling for Clang's __builtin_convertvector. For vectors with <8
25764 // elements, it adds a vector concatenation with undef(s). If we encounter
25765 // this here, we can skip the concat.
25766 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(i: 0).isUndef()) {
25767 bool AllUndef = true;
25768 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
25769 AllUndef &= Op.getOperand(i: I).isUndef();
25770
25771 if (AllUndef)
25772 Op = Op.getOperand(i: 0);
25773 }
25774
25775 SDValue VectorBits = vectorToScalarBitmask(N: Op.getNode(), DAG);
25776 if (VectorBits)
25777 Results.push_back(Elt: DAG.getZExtOrTrunc(Op: VectorBits, DL, VT));
25778}
25779
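// Expand a bitcast from a scalar to an illegal narrow vector type (e.g. i32 to
// v2i16) by moving the scalar into lane 0 of a wider legal vector (ExtendVT),
// bitcasting that to CastVT and extracting the low subvector of the requested
// type.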
25780static void CustomNonLegalBITCASTResults(SDNode *N,
25781 SmallVectorImpl<SDValue> &Results,
25782 SelectionDAG &DAG, EVT ExtendVT,
25783 EVT CastVT) {
25784 SDLoc DL(N);
25785 SDValue Op = N->getOperand(Num: 0);
25786 EVT VT = N->getValueType(ResNo: 0);
25787
25788 // Use SCALAR_TO_VECTOR for lane zero
25789 SDValue Vec = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: ExtendVT, Operand: Op);
25790 SDValue CastVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: CastVT, Operand: Vec);
25791 SDValue IdxZero = DAG.getVectorIdxConstant(Val: 0, DL);
25792 Results.push_back(
25793 Elt: DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: CastVal, N2: IdxZero));
25794}
25795
25796void AArch64TargetLowering::ReplaceBITCASTResults(
25797 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25798 SDLoc DL(N);
25799 SDValue Op = N->getOperand(Num: 0);
25800 EVT VT = N->getValueType(ResNo: 0);
25801 EVT SrcVT = Op.getValueType();
25802
25803 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
25804 CustomNonLegalBITCASTResults(N, Results, DAG, ExtendVT: MVT::v2i32, CastVT: MVT::v4i16);
25805 return;
25806 }
25807
25808 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
25809 CustomNonLegalBITCASTResults(N, Results, DAG, ExtendVT: MVT::v2i32, CastVT: MVT::v8i8);
25810 return;
25811 }
25812
25813 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
25814 CustomNonLegalBITCASTResults(N, Results, DAG, ExtendVT: MVT::v4i16, CastVT: MVT::v8i8);
25815 return;
25816 }
25817
25818 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(VT: SrcVT)) {
25819 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
25820 "Expected fp->int bitcast!");
25821
25822 // Bitcasting between unpacked vector types of different element counts is
25823 // not a NOP because the live elements are laid out differently.
25824 // 01234567
25825 // e.g. nxv2i32 = XX??XX??
25826 // nxv4f16 = X?X?X?X?
25827 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
25828 return;
25829
25830 SDValue CastResult = getSVESafeBitCast(VT: getSVEContainerType(ContentTy: VT), Op, DAG);
25831 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: CastResult));
25832 return;
25833 }
25834
25835 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25836 !VT.isVector())
25837 return replaceBoolVectorBitcast(N, Results, DAG);
25838
25839 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
25840 return;
25841
25842 Op = DAG.getTargetInsertSubreg(SRIdx: AArch64::hsub, DL, VT: MVT::f32,
25843 Operand: DAG.getUNDEF(VT: MVT::i32), Subreg: Op);
25844 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Op);
25845 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Op));
25846}
25847
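// Replace a 256-bit add/fadd of X with a <1,0,3,2,...> shuffle of X by
// splitting X in half, pairwise-adding the halves with ADDP and then
// duplicating each ADDP lane to recreate the original lane order.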
25848static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
25849 SelectionDAG &DAG,
25850 const AArch64Subtarget *Subtarget) {
25851 EVT VT = N->getValueType(ResNo: 0);
25852 if (!VT.is256BitVector() ||
25853 (VT.getScalarType().isFloatingPoint() &&
25854 !N->getFlags().hasAllowReassociation()) ||
25855 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
25856 VT.getScalarType() == MVT::bf16)
25857 return;
25858
25859 SDValue X = N->getOperand(Num: 0);
25860 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: 1));
25861 if (!Shuf) {
25862 Shuf = dyn_cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: 0));
25863 X = N->getOperand(Num: 1);
25864 if (!Shuf)
25865 return;
25866 }
25867
25868 if (Shuf->getOperand(Num: 0) != X || !Shuf->getOperand(Num: 1)->isUndef())
25869 return;
25870
25871 // Check the mask is 1,0,3,2,5,4,...
25872 ArrayRef<int> Mask = Shuf->getMask();
25873 for (int I = 0, E = Mask.size(); I < E; I++)
25874 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
25875 return;
25876
25877 SDLoc DL(N);
25878 auto LoHi = DAG.SplitVector(N: X, DL);
25879 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
25880 SDValue Addp = DAG.getNode(Opcode: AArch64ISD::ADDP, DL: N, VT: LoHi.first.getValueType(),
25881 N1: LoHi.first, N2: LoHi.second);
25882
25883 // Shuffle the elements back into order.
25884 SmallVector<int> NMask;
25885 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
25886 NMask.push_back(Elt: I);
25887 NMask.push_back(Elt: I);
25888 }
25889 Results.push_back(
25890 Elt: DAG.getVectorShuffle(VT, dl: DL,
25891 N1: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: Addp,
25892 N2: DAG.getUNDEF(VT: LoHi.first.getValueType())),
25893 N2: DAG.getUNDEF(VT), Mask: NMask));
25894}
25895
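// Legalize a wide AArch64 reduction by splitting the vector operand in half,
// combining the two halves with InterOp (e.g. ISD::ADD) and then applying the
// across-vector reduction AcrossOp (e.g. AArch64ISD::UADDV) to the result.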
25896static void ReplaceReductionResults(SDNode *N,
25897 SmallVectorImpl<SDValue> &Results,
25898 SelectionDAG &DAG, unsigned InterOp,
25899 unsigned AcrossOp) {
25900 EVT LoVT, HiVT;
25901 SDValue Lo, Hi;
25902 SDLoc dl(N);
25903 std::tie(args&: LoVT, args&: HiVT) = DAG.GetSplitDestVTs(VT: N->getValueType(ResNo: 0));
25904 std::tie(args&: Lo, args&: Hi) = DAG.SplitVectorOperand(N, OpNo: 0);
25905 SDValue InterVal = DAG.getNode(Opcode: InterOp, DL: dl, VT: LoVT, N1: Lo, N2: Hi);
25906 SDValue SplitVal = DAG.getNode(Opcode: AcrossOp, DL: dl, VT: LoVT, Operand: InterVal);
25907 Results.push_back(Elt: SplitVal);
25908}
25909
25910void AArch64TargetLowering::ReplaceExtractSubVectorResults(
25911 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25912 SDValue In = N->getOperand(Num: 0);
25913 EVT InVT = In.getValueType();
25914
25915 // Common code will handle these just fine.
25916 if (!InVT.isScalableVector() || !InVT.isInteger())
25917 return;
25918
25919 SDLoc DL(N);
25920 EVT VT = N->getValueType(ResNo: 0);
25921
25922 // The following checks bail if this is not a halving operation.
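 // For example (illustrative): extracting the low nxv2i32 half of an nxv4i32
 // value becomes UUNPKLO (giving nxv2i64) followed by a truncate back to
 // nxv2i32; the high half uses UUNPKHI instead.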
25923
25924 ElementCount ResEC = VT.getVectorElementCount();
25925
25926 if (InVT.getVectorElementCount() != (ResEC * 2))
25927 return;
25928
25929 auto *CIndex = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
25930 if (!CIndex)
25931 return;
25932
25933 unsigned Index = CIndex->getZExtValue();
25934 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
25935 return;
25936
25937 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
25938 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(Context&: *DAG.getContext());
25939
25940 SDValue Half = DAG.getNode(Opcode, DL, VT: ExtendedHalfVT, Operand: N->getOperand(Num: 0));
25941 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Half));
25942}
25943
25944// Create an even/odd pair of X registers holding integer value V.
25945static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
25946 SDLoc dl(V.getNode());
25947 auto [VLo, VHi] = DAG.SplitScalar(N: V, DL: dl, LoVT: MVT::i64, HiVT: MVT::i64);
25948 if (DAG.getDataLayout().isBigEndian())
25949 std::swap(a&: VLo, b&: VHi);
25950 SDValue RegClass =
25951 DAG.getTargetConstant(Val: AArch64::XSeqPairsClassRegClassID, DL: dl, VT: MVT::i32);
25952 SDValue SubReg0 = DAG.getTargetConstant(Val: AArch64::sube64, DL: dl, VT: MVT::i32);
25953 SDValue SubReg1 = DAG.getTargetConstant(Val: AArch64::subo64, DL: dl, VT: MVT::i32);
25954 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
25955 return SDValue(
25956 DAG.getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl, VT: MVT::Untyped, Ops), 0);
25957}
25958
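// Lower a 128-bit ATOMIC_CMP_SWAP. With LSE (or in outline-atomics mode) this
// selects a CASP* instruction on an even/odd register pair; otherwise one of
// the CMP_SWAP_128* pseudo instructions is used.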
25959static void ReplaceCMP_SWAP_128Results(SDNode *N,
25960 SmallVectorImpl<SDValue> &Results,
25961 SelectionDAG &DAG,
25962 const AArch64Subtarget *Subtarget) {
25963 assert(N->getValueType(0) == MVT::i128 &&
25964 "AtomicCmpSwap on types less than 128 should be legal");
25965
25966 MachineMemOperand *MemOp = cast<MemSDNode>(Val: N)->getMemOperand();
25967 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
25968 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
25969 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
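 // For example, an acquire-release exchange ultimately selects to something
 // like (register choice illustrative):
 //   caspal x0, x1, x2, x3, [x4]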
25970 SDValue Ops[] = {
25971 createGPRPairNode(DAG, V: N->getOperand(Num: 2)), // Compare value
25972 createGPRPairNode(DAG, V: N->getOperand(Num: 3)), // Store value
25973 N->getOperand(Num: 1), // Ptr
25974 N->getOperand(Num: 0), // Chain in
25975 };
25976
25977 unsigned Opcode;
25978 switch (MemOp->getMergedOrdering()) {
25979 case AtomicOrdering::Monotonic:
25980 Opcode = AArch64::CASPX;
25981 break;
25982 case AtomicOrdering::Acquire:
25983 Opcode = AArch64::CASPAX;
25984 break;
25985 case AtomicOrdering::Release:
25986 Opcode = AArch64::CASPLX;
25987 break;
25988 case AtomicOrdering::AcquireRelease:
25989 case AtomicOrdering::SequentiallyConsistent:
25990 Opcode = AArch64::CASPALX;
25991 break;
25992 default:
25993 llvm_unreachable("Unexpected ordering!");
25994 }
25995
25996 MachineSDNode *CmpSwap = DAG.getMachineNode(
25997 Opcode, dl: SDLoc(N), VTs: DAG.getVTList(VT1: MVT::Untyped, VT2: MVT::Other), Ops);
25998 DAG.setNodeMemRefs(N: CmpSwap, NewMemRefs: {MemOp});
25999
26000 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
26001 if (DAG.getDataLayout().isBigEndian())
26002 std::swap(a&: SubReg1, b&: SubReg2);
26003 SDValue Lo = DAG.getTargetExtractSubreg(SRIdx: SubReg1, DL: SDLoc(N), VT: MVT::i64,
26004 Operand: SDValue(CmpSwap, 0));
26005 SDValue Hi = DAG.getTargetExtractSubreg(SRIdx: SubReg2, DL: SDLoc(N), VT: MVT::i64,
26006 Operand: SDValue(CmpSwap, 0));
26007 Results.push_back(
26008 Elt: DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SDLoc(N), VT: MVT::i128, N1: Lo, N2: Hi));
26009 Results.push_back(Elt: SDValue(CmpSwap, 1)); // Chain out
26010 return;
26011 }
26012
26013 unsigned Opcode;
26014 switch (MemOp->getMergedOrdering()) {
26015 case AtomicOrdering::Monotonic:
26016 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
26017 break;
26018 case AtomicOrdering::Acquire:
26019 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
26020 break;
26021 case AtomicOrdering::Release:
26022 Opcode = AArch64::CMP_SWAP_128_RELEASE;
26023 break;
26024 case AtomicOrdering::AcquireRelease:
26025 case AtomicOrdering::SequentiallyConsistent:
26026 Opcode = AArch64::CMP_SWAP_128;
26027 break;
26028 default:
26029 llvm_unreachable("Unexpected ordering!");
26030 }
26031
26032 SDLoc DL(N);
26033 auto Desired = DAG.SplitScalar(N: N->getOperand(Num: 2), DL, LoVT: MVT::i64, HiVT: MVT::i64);
26034 auto New = DAG.SplitScalar(N: N->getOperand(Num: 3), DL, LoVT: MVT::i64, HiVT: MVT::i64);
26035 SDValue Ops[] = {N->getOperand(Num: 1), Desired.first, Desired.second,
26036 New.first, New.second, N->getOperand(Num: 0)};
26037 SDNode *CmpSwap = DAG.getMachineNode(
26038 Opcode, dl: SDLoc(N), VTs: DAG.getVTList(VT1: MVT::i64, VT2: MVT::i64, VT3: MVT::i32, VT4: MVT::Other),
26039 Ops);
26040 DAG.setNodeMemRefs(N: cast<MachineSDNode>(Val: CmpSwap), NewMemRefs: {MemOp});
26041
26042 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BUILD_PAIR, DL, VT: MVT::i128,
26043 N1: SDValue(CmpSwap, 0), N2: SDValue(CmpSwap, 1)));
26044 Results.push_back(Elt: SDValue(CmpSwap, 3));
26045}
26046
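// Map a 128-bit atomic RMW opcode and its memory ordering to the matching
// LSE128 instruction (LDCLRP, LDSETP or SWPP and their A, L and AL variants).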
26047static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
26048 AtomicOrdering Ordering) {
26049 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
26050 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
26051 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
26052 // ATOMIC_LOAD_CLR at any point.
26053 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
26054 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
26055 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
26056 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
26057
26058 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
26059 // The operand will need to be XORed in a separate step.
26060 switch (Ordering) {
26061 case AtomicOrdering::Monotonic:
26062 return AArch64::LDCLRP;
26063 break;
26064 case AtomicOrdering::Acquire:
26065 return AArch64::LDCLRPA;
26066 break;
26067 case AtomicOrdering::Release:
26068 return AArch64::LDCLRPL;
26069 break;
26070 case AtomicOrdering::AcquireRelease:
26071 case AtomicOrdering::SequentiallyConsistent:
26072 return AArch64::LDCLRPAL;
26073 break;
26074 default:
26075 llvm_unreachable("Unexpected ordering!");
26076 }
26077 }
26078
26079 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
26080 switch (Ordering) {
26081 case AtomicOrdering::Monotonic:
26082 return AArch64::LDSETP;
26083 break;
26084 case AtomicOrdering::Acquire:
26085 return AArch64::LDSETPA;
26086 break;
26087 case AtomicOrdering::Release:
26088 return AArch64::LDSETPL;
26089 break;
26090 case AtomicOrdering::AcquireRelease:
26091 case AtomicOrdering::SequentiallyConsistent:
26092 return AArch64::LDSETPAL;
26093 break;
26094 default:
26095 llvm_unreachable("Unexpected ordering!");
26096 }
26097 }
26098
26099 if (ISDOpcode == ISD::ATOMIC_SWAP) {
26100 switch (Ordering) {
26101 case AtomicOrdering::Monotonic:
26102 return AArch64::SWPP;
26103 break;
26104 case AtomicOrdering::Acquire:
26105 return AArch64::SWPPA;
26106 break;
26107 case AtomicOrdering::Release:
26108 return AArch64::SWPPL;
26109 break;
26110 case AtomicOrdering::AcquireRelease:
26111 case AtomicOrdering::SequentiallyConsistent:
26112 return AArch64::SWPPAL;
26113 break;
26114 default:
26115 llvm_unreachable("Unexpected ordering!");
26116 }
26117 }
26118
26119 llvm_unreachable("Unexpected ISDOpcode!");
26120}
26121
26122static void ReplaceATOMIC_LOAD_128Results(SDNode *N,
26123 SmallVectorImpl<SDValue> &Results,
26124 SelectionDAG &DAG,
26125 const AArch64Subtarget *Subtarget) {
26126 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
26127 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
26128 // rather than the CASP instructions, because CASP has register classes for
26129 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
26130 // to present them as single operands. LSE128 instructions use the GPR64
26131 // register class (because the pair does not have to be sequential), like
26132 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
26133
26134 assert(N->getValueType(0) == MVT::i128 &&
26135 "AtomicLoadXXX on types less than 128 should be legal");
26136
26137 if (!Subtarget->hasLSE128())
26138 return;
26139
26140 MachineMemOperand *MemOp = cast<MemSDNode>(Val: N)->getMemOperand();
26141 const SDValue &Chain = N->getOperand(Num: 0);
26142 const SDValue &Ptr = N->getOperand(Num: 1);
26143 const SDValue &Val128 = N->getOperand(Num: 2);
26144 std::pair<SDValue, SDValue> Val2x64 =
26145 DAG.SplitScalar(N: Val128, DL: SDLoc(Val128), LoVT: MVT::i64, HiVT: MVT::i64);
26146
26147 const unsigned ISDOpcode = N->getOpcode();
26148 const unsigned MachineOpcode =
26149 getAtomicLoad128Opcode(ISDOpcode, Ordering: MemOp->getMergedOrdering());
26150
26151 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
26152 SDLoc dl(Val128);
26153 Val2x64.first =
26154 DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: MVT::i64,
26155 N1: DAG.getConstant(Val: -1ULL, DL: dl, VT: MVT::i64), N2: Val2x64.first);
26156 Val2x64.second =
26157 DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: MVT::i64,
26158 N1: DAG.getConstant(Val: -1ULL, DL: dl, VT: MVT::i64), N2: Val2x64.second);
26159 }
26160
26161 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
26162 if (DAG.getDataLayout().isBigEndian())
26163 std::swap(a&: Ops[0], b&: Ops[1]);
26164
26165 MachineSDNode *AtomicInst =
26166 DAG.getMachineNode(Opcode: MachineOpcode, dl: SDLoc(N),
26167 VTs: DAG.getVTList(VT1: MVT::i64, VT2: MVT::i64, VT3: MVT::Other), Ops);
26168
26169 DAG.setNodeMemRefs(N: AtomicInst, NewMemRefs: {MemOp});
26170
26171 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
26172 if (DAG.getDataLayout().isBigEndian())
26173 std::swap(a&: Lo, b&: Hi);
26174
26175 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SDLoc(N), VT: MVT::i128, N1: Lo, N2: Hi));
26176 Results.push_back(Elt: SDValue(AtomicInst, 2)); // Chain out
26177}
26178
26179void AArch64TargetLowering::ReplaceNodeResults(
26180 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
26181 switch (N->getOpcode()) {
26182 default:
26183 llvm_unreachable("Don't know how to custom expand this");
26184 case ISD::BITCAST:
26185 ReplaceBITCASTResults(N, Results, DAG);
26186 return;
26187 case ISD::VECREDUCE_ADD:
26188 case ISD::VECREDUCE_SMAX:
26189 case ISD::VECREDUCE_SMIN:
26190 case ISD::VECREDUCE_UMAX:
26191 case ISD::VECREDUCE_UMIN:
26192 Results.push_back(Elt: LowerVECREDUCE(Op: SDValue(N, 0), DAG));
26193 return;
26194 case ISD::ADD:
26195 case ISD::FADD:
26196 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
26197 return;
26198
26199 case ISD::CTPOP:
26200 case ISD::PARITY:
26201 if (SDValue Result = LowerCTPOP_PARITY(Op: SDValue(N, 0), DAG))
26202 Results.push_back(Elt: Result);
26203 return;
26204 case AArch64ISD::SADDV:
26205 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::ADD, AcrossOp: AArch64ISD::SADDV);
26206 return;
26207 case AArch64ISD::UADDV:
26208 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::ADD, AcrossOp: AArch64ISD::UADDV);
26209 return;
26210 case AArch64ISD::SMINV:
26211 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::SMIN, AcrossOp: AArch64ISD::SMINV);
26212 return;
26213 case AArch64ISD::UMINV:
26214 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::UMIN, AcrossOp: AArch64ISD::UMINV);
26215 return;
26216 case AArch64ISD::SMAXV:
26217 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::SMAX, AcrossOp: AArch64ISD::SMAXV);
26218 return;
26219 case AArch64ISD::UMAXV:
26220 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::UMAX, AcrossOp: AArch64ISD::UMAXV);
26221 return;
26222 case ISD::MULHS:
26223 if (useSVEForFixedLengthVectorVT(VT: SDValue(N, 0).getValueType()))
26224 Results.push_back(
26225 Elt: LowerToPredicatedOp(Op: SDValue(N, 0), DAG, NewOp: AArch64ISD::MULHS_PRED));
26226 return;
26227 case ISD::MULHU:
26228 if (useSVEForFixedLengthVectorVT(VT: SDValue(N, 0).getValueType()))
26229 Results.push_back(
26230 Elt: LowerToPredicatedOp(Op: SDValue(N, 0), DAG, NewOp: AArch64ISD::MULHU_PRED));
26231 return;
26232 case ISD::FP_TO_UINT:
26233 case ISD::FP_TO_SINT:
26234 case ISD::STRICT_FP_TO_SINT:
26235 case ISD::STRICT_FP_TO_UINT:
26236 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
26237 // Let normal code take care of it by not adding anything to Results.
26238 return;
26239 case ISD::ATOMIC_CMP_SWAP:
26240 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
26241 return;
26242 case ISD::ATOMIC_LOAD_CLR:
26243 assert(N->getValueType(0) != MVT::i128 &&
26244 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
26245 break;
26246 case ISD::ATOMIC_LOAD_AND:
26247 case ISD::ATOMIC_LOAD_OR:
26248 case ISD::ATOMIC_SWAP: {
26249 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
26250 "Expected 128-bit atomicrmw.");
26251 // These need custom type legalisation so we go directly to instruction.
26252 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
26253 return;
26254 }
26255 case ISD::ATOMIC_LOAD:
26256 case ISD::LOAD: {
26257 MemSDNode *LoadNode = cast<MemSDNode>(Val: N);
26258 EVT MemVT = LoadNode->getMemoryVT();
26259 // Handle lowering 256-bit non-temporal loads into LDNP for little-endian
26260 // targets.
26261 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
26262 MemVT.getSizeInBits() == 256u &&
26263 (MemVT.getScalarSizeInBits() == 8u ||
26264 MemVT.getScalarSizeInBits() == 16u ||
26265 MemVT.getScalarSizeInBits() == 32u ||
26266 MemVT.getScalarSizeInBits() == 64u)) {
26267
26268 SDValue Result = DAG.getMemIntrinsicNode(
26269 Opcode: AArch64ISD::LDNP, dl: SDLoc(N),
26270 VTList: DAG.getVTList(VTs: {MemVT.getHalfNumVectorElementsVT(Context&: *DAG.getContext()),
26271 MemVT.getHalfNumVectorElementsVT(Context&: *DAG.getContext()),
26272 MVT::Other}),
26273 Ops: {LoadNode->getChain(), LoadNode->getBasePtr()},
26274 MemVT: LoadNode->getMemoryVT(), MMO: LoadNode->getMemOperand());
26275
26276 SDValue Pair = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(N), VT: MemVT,
26277 N1: Result.getValue(R: 0), N2: Result.getValue(R: 1));
26278 Results.append(IL: {Pair, Result.getValue(R: 2) /* Chain */});
26279 return;
26280 }
26281
26282 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
26283 LoadNode->getMemoryVT() != MVT::i128) {
26284 // Non-volatile, non-atomic loads are optimized later in AArch64's load/store
26285 // optimizer.
26286 return;
26287 }
26288
26289 if (SDValue(N, 0).getValueType() == MVT::i128) {
26290 auto *AN = dyn_cast<AtomicSDNode>(Val: LoadNode);
26291 bool isLoadAcquire =
26292 AN && AN->getSuccessOrdering() == AtomicOrdering::Acquire;
26293 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
26294
26295 if (isLoadAcquire)
26296 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
26297
26298 SDValue Result = DAG.getMemIntrinsicNode(
26299 Opcode, dl: SDLoc(N), VTList: DAG.getVTList(VTs: {MVT::i64, MVT::i64, MVT::Other}),
26300 Ops: {LoadNode->getChain(), LoadNode->getBasePtr()},
26301 MemVT: LoadNode->getMemoryVT(), MMO: LoadNode->getMemOperand());
26302
26303 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
26304
26305 SDValue Pair =
26306 DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SDLoc(N), VT: MVT::i128,
26307 N1: Result.getValue(R: FirstRes), N2: Result.getValue(R: 1 - FirstRes));
26308 Results.append(IL: {Pair, Result.getValue(R: 2) /* Chain */});
26309 }
26310 return;
26311 }
26312 case ISD::EXTRACT_SUBVECTOR:
26313 ReplaceExtractSubVectorResults(N, Results, DAG);
26314 return;
26315 case ISD::INSERT_SUBVECTOR:
26316 case ISD::CONCAT_VECTORS:
26317 // Custom lowering has been requested for INSERT_SUBVECTOR and
26318 // CONCAT_VECTORS -- but delegate to common code for result type
26319 // legalisation.
26320 return;
26321 case ISD::INTRINSIC_WO_CHAIN: {
26322 EVT VT = N->getValueType(ResNo: 0);
26323
26324 Intrinsic::ID IntID =
26325 static_cast<Intrinsic::ID>(N->getConstantOperandVal(Num: 0));
26326 switch (IntID) {
26327 default:
26328 return;
26329 case Intrinsic::aarch64_sve_clasta_n: {
26330 assert((VT == MVT::i8 || VT == MVT::i16) &&
26331 "custom lowering for unexpected type");
26332 SDLoc DL(N);
26333 auto Op2 = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: N->getOperand(Num: 2));
26334 auto V = DAG.getNode(Opcode: AArch64ISD::CLASTA_N, DL, VT: MVT::i32,
26335 N1: N->getOperand(Num: 1), N2: Op2, N3: N->getOperand(Num: 3));
26336 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: V));
26337 return;
26338 }
26339 case Intrinsic::aarch64_sve_clastb_n: {
26340 assert((VT == MVT::i8 || VT == MVT::i16) &&
26341 "custom lowering for unexpected type");
26342 SDLoc DL(N);
26343 auto Op2 = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: N->getOperand(Num: 2));
26344 auto V = DAG.getNode(Opcode: AArch64ISD::CLASTB_N, DL, VT: MVT::i32,
26345 N1: N->getOperand(Num: 1), N2: Op2, N3: N->getOperand(Num: 3));
26346 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: V));
26347 return;
26348 }
26349 case Intrinsic::aarch64_sve_lasta: {
26350 assert((VT == MVT::i8 || VT == MVT::i16) &&
26351 "custom lowering for unexpected type");
26352 SDLoc DL(N);
26353 auto V = DAG.getNode(Opcode: AArch64ISD::LASTA, DL, VT: MVT::i32,
26354 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
26355 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: V));
26356 return;
26357 }
26358 case Intrinsic::aarch64_sve_lastb: {
26359 assert((VT == MVT::i8 || VT == MVT::i16) &&
26360 "custom lowering for unexpected type");
26361 SDLoc DL(N);
26362 auto V = DAG.getNode(Opcode: AArch64ISD::LASTB, DL, VT: MVT::i32,
26363 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
26364 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: V));
26365 return;
26366 }
26367 case Intrinsic::get_active_lane_mask: {
26368 if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
26369 return;
26370
26371 // NOTE: Only trivial type promotion is supported.
26372 EVT NewVT = getTypeToTransformTo(Context&: *DAG.getContext(), VT);
26373 if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
26374 return;
26375
26376 SDLoc DL(N);
26377 auto V = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: NewVT, Ops: N->ops());
26378 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: V));
26379 return;
26380 }
26381 }
26382 }
26383 case ISD::READ_REGISTER: {
26384 SDLoc DL(N);
26385 assert(N->getValueType(0) == MVT::i128 &&
26386 "READ_REGISTER custom lowering is only for 128-bit sysregs");
26387 SDValue Chain = N->getOperand(Num: 0);
26388 SDValue SysRegName = N->getOperand(Num: 1);
26389
26390 SDValue Result = DAG.getNode(
26391 Opcode: AArch64ISD::MRRS, DL, VTList: DAG.getVTList(VTs: {MVT::i64, MVT::i64, MVT::Other}),
26392 N1: Chain, N2: SysRegName);
26393
26394 // Sysregs are not endian. Result.getValue(0) always contains the lower half
26395 // of the 128-bit System Register value.
26396 SDValue Pair = DAG.getNode(Opcode: ISD::BUILD_PAIR, DL, VT: MVT::i128,
26397 N1: Result.getValue(R: 0), N2: Result.getValue(R: 1));
26398 Results.push_back(Elt: Pair);
26399 Results.push_back(Elt: Result.getValue(R: 2)); // Chain
26400 return;
26401 }
26402 }
26403}
26404
26405bool AArch64TargetLowering::useLoadStackGuardNode() const {
26406 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
26407 return TargetLowering::useLoadStackGuardNode();
26408 return true;
26409}
26410
26411unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
26412 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
26413 // reciprocal if there are three or more FDIVs.
26414 return 3;
26415}
26416
26417TargetLoweringBase::LegalizeTypeAction
26418AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
26419 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
26420 // v4i16, v2i32 instead of promoting them.
26421 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
26422 VT == MVT::v1f32)
26423 return TypeWidenVector;
26424
26425 return TargetLoweringBase::getPreferredVectorAction(VT);
26426}
26427
26428// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
26429// provided the address is 16-byte aligned.
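// For example, with LSE2 a 16-byte aligned atomic i128 load can be selected to
// a plain ldp (illustrative):
//   ldp x0, x1, [x2]
// with any required fences inserted separately (see
// shouldInsertFencesForAtomic).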
26430bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
26431 if (!Subtarget->hasLSE2())
26432 return false;
26433
26434 if (auto LI = dyn_cast<LoadInst>(Val: I))
26435 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
26436 LI->getAlign() >= Align(16);
26437
26438 if (auto SI = dyn_cast<StoreInst>(Val: I))
26439 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
26440 SI->getAlign() >= Align(16);
26441
26442 return false;
26443}
26444
26445bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const {
26446 if (!Subtarget->hasLSE128())
26447 return false;
26448
26449 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
26450 // will clobber the two registers.
26451 if (const auto *SI = dyn_cast<StoreInst>(Val: I))
26452 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
26453 SI->getAlign() >= Align(16) &&
26454 (SI->getOrdering() == AtomicOrdering::Release ||
26455 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
26456
26457 if (const auto *RMW = dyn_cast<AtomicRMWInst>(Val: I))
26458 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
26459 RMW->getAlign() >= Align(16) &&
26460 (RMW->getOperation() == AtomicRMWInst::Xchg ||
26461 RMW->getOperation() == AtomicRMWInst::And ||
26462 RMW->getOperation() == AtomicRMWInst::Or);
26463
26464 return false;
26465}
26466
26467bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
26468 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
26469 return false;
26470
26471 if (auto LI = dyn_cast<LoadInst>(Val: I))
26472 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
26473 LI->getAlign() >= Align(16) &&
26474 LI->getOrdering() == AtomicOrdering::Acquire;
26475
26476 if (auto SI = dyn_cast<StoreInst>(Val: I))
26477 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
26478 SI->getAlign() >= Align(16) &&
26479 SI->getOrdering() == AtomicOrdering::Release;
26480
26481 return false;
26482}
26483
26484bool AArch64TargetLowering::shouldInsertFencesForAtomic(
26485 const Instruction *I) const {
26486 if (isOpSuitableForRCPC3(I))
26487 return false;
26488 if (isOpSuitableForLSE128(I))
26489 return false;
26490 if (isOpSuitableForLDPSTP(I))
26491 return true;
26492 return false;
26493}
26494
26495bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
26496 const Instruction *I) const {
26497 // Store-Release instructions only provide seq_cst guarantees when paired with
26498 // Load-Acquire instructions. MSVC CRT does not use these instructions to
26499 // implement seq_cst loads and stores, so we need additional explicit fences
26500 // after memory writes.
26501 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26502 return false;
26503
26504 switch (I->getOpcode()) {
26505 default:
26506 return false;
26507 case Instruction::AtomicCmpXchg:
26508 return cast<AtomicCmpXchgInst>(Val: I)->getSuccessOrdering() ==
26509 AtomicOrdering::SequentiallyConsistent;
26510 case Instruction::AtomicRMW:
26511 return cast<AtomicRMWInst>(Val: I)->getOrdering() ==
26512 AtomicOrdering::SequentiallyConsistent;
26513 case Instruction::Store:
26514 return cast<StoreInst>(Val: I)->getOrdering() ==
26515 AtomicOrdering::SequentiallyConsistent;
26516 }
26517}
26518
// Loads and stores smaller than 128 bits are already atomic; anything larger
// is doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
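//
// For illustration: without RCPC3, LSE128 or LSE2, a 128-bit atomic store
// returns Expand here, and AtomicExpand is then expected to rewrite it
// (typically as an atomic exchange whose result is ignored).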
26522TargetLoweringBase::AtomicExpansionKind
26523AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
26524 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
26525 if (Size != 128)
26526 return AtomicExpansionKind::None;
26527 if (isOpSuitableForRCPC3(I: SI))
26528 return AtomicExpansionKind::None;
26529 if (isOpSuitableForLSE128(I: SI))
26530 return AtomicExpansionKind::Expand;
26531 if (isOpSuitableForLDPSTP(I: SI))
26532 return AtomicExpansionKind::None;
26533 return AtomicExpansionKind::Expand;
26534}
26535
// Loads and stores smaller than 128 bits are already atomic; anything larger
// is doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
26539TargetLowering::AtomicExpansionKind
26540AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
26541 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
26542
26543 if (Size != 128)
26544 return AtomicExpansionKind::None;
26545 if (isOpSuitableForRCPC3(I: LI))
26546 return AtomicExpansionKind::None;
26547 // No LSE128 loads
26548 if (isOpSuitableForLDPSTP(I: LI))
26549 return AtomicExpansionKind::None;
26550
26551 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
26552 // implement atomicrmw without spilling. If the target address is also on the
26553 // stack and close enough to the spill slot, this can lead to a situation
26554 // where the monitor always gets cleared and the atomic operation can never
26555 // succeed. So at -O0 lower this operation to a CAS loop.
26556 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
26557 return AtomicExpansionKind::CmpXChg;
26558
  // Using CAS for an atomic load has a better chance of succeeding in
  // high-contention situations, so use it if available.
26561 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
26562 : AtomicExpansionKind::LLSC;
26563}
26564
// The "default" for integer RMW operations is to expand to an LL/SC loop.
// However, with the LSE instructions (or outline-atomics mode, which provides
// library routines in place of the LSE instructions), we can directly emit
// many operations instead.
//
// Floating-point operations are always expanded to a cmpxchg loop, because
// they may trigger a trap which aborts an LLSC sequence.
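//
// For illustration (assuming +lse): an operation such as
//   %old = atomicrmw add ptr %p, i32 1 seq_cst
// is left intact here and selected to a single LDADDAL, whereas without LSE
// (and without outlined atomics) it is expanded to an LL/SC loop built from
// LDAXR/STLXR.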
26572TargetLowering::AtomicExpansionKind
26573AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
26574 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
26575 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
26576
26577 if (AI->isFloatingPointOperation())
26578 return AtomicExpansionKind::CmpXChg;
26579
26580 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
26581 (AI->getOperation() == AtomicRMWInst::Xchg ||
26582 AI->getOperation() == AtomicRMWInst::Or ||
26583 AI->getOperation() == AtomicRMWInst::And);
26584 if (CanUseLSE128)
26585 return AtomicExpansionKind::None;
26586
26587 // Nand is not supported in LSE.
26588 // Leave 128 bits to LLSC or CmpXChg.
26589 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
26590 if (Subtarget->hasLSE())
26591 return AtomicExpansionKind::None;
26592 if (Subtarget->outlineAtomics()) {
      // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
26594 // Don't outline them unless
26595 // (1) high level <atomic> support approved:
26596 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
26597 // (2) low level libgcc and compiler-rt support implemented by:
26598 // min/max outline atomics helpers
26599 if (AI->getOperation() != AtomicRMWInst::Min &&
26600 AI->getOperation() != AtomicRMWInst::Max &&
26601 AI->getOperation() != AtomicRMWInst::UMin &&
26602 AI->getOperation() != AtomicRMWInst::UMax) {
26603 return AtomicExpansionKind::None;
26604 }
26605 }
26606 }
26607
26608 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
26609 // implement atomicrmw without spilling. If the target address is also on the
26610 // stack and close enough to the spill slot, this can lead to a situation
26611 // where the monitor always gets cleared and the atomic operation can never
26612 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
26613 // we have a single CAS instruction that can replace the loop.
26614 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
26615 Subtarget->hasLSE())
26616 return AtomicExpansionKind::CmpXChg;
26617
26618 return AtomicExpansionKind::LLSC;
26619}
26620
26621TargetLowering::AtomicExpansionKind
26622AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
26623 AtomicCmpXchgInst *AI) const {
26624 // If subtarget has LSE, leave cmpxchg intact for codegen.
26625 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
26626 return AtomicExpansionKind::None;
26627 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
26628 // implement cmpxchg without spilling. If the address being exchanged is also
26629 // on the stack and close enough to the spill slot, this can lead to a
26630 // situation where the monitor always gets cleared and the atomic operation
26631 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
26632 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
26633 return AtomicExpansionKind::None;
26634
  // 128-bit atomic cmpxchg is special: AtomicExpand doesn't know how to
  // expand it, so it is handled later during instruction selection.
26637 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
26638 if (Size > 64)
26639 return AtomicExpansionKind::None;
26640
26641 return AtomicExpansionKind::LLSC;
26642}
26643
26644Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
26645 Type *ValueTy, Value *Addr,
26646 AtomicOrdering Ord) const {
26647 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
26648 bool IsAcquire = isAcquireOrStronger(AO: Ord);
26649
  // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp
  // intrinsic must return {i64, i64} and we have to recombine them into a
  // single i128 here.
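  // Illustrative sketch of the IR built below for the 128-bit case (acquire
  // orderings use ldaxp, weaker orderings use ldxp):
  //   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(ptr %addr)
  //   %lo = extractvalue { i64, i64 } %lohi, 0
  //   %hi = extractvalue { i64, i64 } %lohi, 1
  //   ; zext both halves to i128, shift %hi left by 64 and OR them together.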
26653 if (ValueTy->getPrimitiveSizeInBits() == 128) {
26654 Intrinsic::ID Int =
26655 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
26656 Function *Ldxr = Intrinsic::getDeclaration(M, id: Int);
26657
26658 Value *LoHi = Builder.CreateCall(Callee: Ldxr, Args: Addr, Name: "lohi");
26659
26660 Value *Lo = Builder.CreateExtractValue(Agg: LoHi, Idxs: 0, Name: "lo");
26661 Value *Hi = Builder.CreateExtractValue(Agg: LoHi, Idxs: 1, Name: "hi");
26662 Lo = Builder.CreateZExt(V: Lo, DestTy: ValueTy, Name: "lo64");
26663 Hi = Builder.CreateZExt(V: Hi, DestTy: ValueTy, Name: "hi64");
26664 return Builder.CreateOr(
26665 LHS: Lo, RHS: Builder.CreateShl(LHS: Hi, RHS: ConstantInt::get(Ty: ValueTy, V: 64)), Name: "val64");
26666 }
26667
26668 Type *Tys[] = { Addr->getType() };
26669 Intrinsic::ID Int =
26670 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
26671 Function *Ldxr = Intrinsic::getDeclaration(M, id: Int, Tys);
26672
26673 const DataLayout &DL = M->getDataLayout();
26674 IntegerType *IntEltTy = Builder.getIntNTy(N: DL.getTypeSizeInBits(Ty: ValueTy));
26675 CallInst *CI = Builder.CreateCall(Callee: Ldxr, Args: Addr);
26676 CI->addParamAttr(
26677 ArgNo: 0, Attr: Attribute::get(Context&: Builder.getContext(), Kind: Attribute::ElementType, Ty: ValueTy));
26678 Value *Trunc = Builder.CreateTrunc(V: CI, DestTy: IntEltTy);
26679
26680 return Builder.CreateBitCast(V: Trunc, DestTy: ValueTy);
26681}
26682
26683void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
26684 IRBuilderBase &Builder) const {
26685 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
26686 Builder.CreateCall(Callee: Intrinsic::getDeclaration(M, id: Intrinsic::aarch64_clrex));
26687}
26688
26689Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
26690 Value *Val, Value *Addr,
26691 AtomicOrdering Ord) const {
26692 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
26693 bool IsRelease = isReleaseOrStronger(AO: Ord);
26694
26695 // Since the intrinsics must have legal type, the i128 intrinsics take two
26696 // parameters: "i64, i64". We must marshal Val into the appropriate form
26697 // before the call.
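  // Illustrative sketch (release orderings use stlxp, weaker ones stxp):
  //   %status = call i32 @llvm.aarch64.stlxp(i64 %lo, i64 %hi, ptr %addr)
  // where %lo and %hi are the two i64 halves of the original i128 value.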
26698 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
26699 Intrinsic::ID Int =
26700 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
26701 Function *Stxr = Intrinsic::getDeclaration(M, id: Int);
26702 Type *Int64Ty = Type::getInt64Ty(C&: M->getContext());
26703
26704 Value *Lo = Builder.CreateTrunc(V: Val, DestTy: Int64Ty, Name: "lo");
26705 Value *Hi = Builder.CreateTrunc(V: Builder.CreateLShr(LHS: Val, RHS: 64), DestTy: Int64Ty, Name: "hi");
26706 return Builder.CreateCall(Callee: Stxr, Args: {Lo, Hi, Addr});
26707 }
26708
26709 Intrinsic::ID Int =
26710 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
26711 Type *Tys[] = { Addr->getType() };
26712 Function *Stxr = Intrinsic::getDeclaration(M, id: Int, Tys);
26713
26714 const DataLayout &DL = M->getDataLayout();
26715 IntegerType *IntValTy = Builder.getIntNTy(N: DL.getTypeSizeInBits(Ty: Val->getType()));
26716 Val = Builder.CreateBitCast(V: Val, DestTy: IntValTy);
26717
26718 CallInst *CI = Builder.CreateCall(
26719 Callee: Stxr, Args: {Builder.CreateZExtOrBitCast(
26720 V: Val, DestTy: Stxr->getFunctionType()->getParamType(i: 0)),
26721 Addr});
26722 CI->addParamAttr(ArgNo: 1, Attr: Attribute::get(Context&: Builder.getContext(),
26723 Kind: Attribute::ElementType, Ty: Val->getType()));
26724 return CI;
26725}
26726
26727bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
26728 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
26729 const DataLayout &DL) const {
26730 if (!Ty->isArrayTy()) {
26731 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
26732 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
26733 }
26734
  // All non-aggregate members of the type must have the same type.
26736 SmallVector<EVT> ValueVTs;
26737 ComputeValueVTs(TLI: *this, DL, Ty, ValueVTs);
26738 return all_equal(Range&: ValueVTs);
26739}
26740
26741bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
26742 EVT) const {
26743 return false;
26744}
26745
26746static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
26747 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
26748 Function *ThreadPointerFunc =
26749 Intrinsic::getDeclaration(M, id: Intrinsic::thread_pointer);
26750 return IRB.CreatePointerCast(
26751 V: IRB.CreateConstGEP1_32(Ty: IRB.getInt8Ty(), Ptr: IRB.CreateCall(Callee: ThreadPointerFunc),
26752 Idx0: Offset),
26753 DestTy: IRB.getPtrTy(AddrSpace: 0));
26754}
26755
26756Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
26757 // Android provides a fixed TLS slot for the stack cookie. See the definition
26758 // of TLS_SLOT_STACK_GUARD in
26759 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
26760 if (Subtarget->isTargetAndroid())
26761 return UseTlsOffset(IRB, Offset: 0x28);
26762
26763 // Fuchsia is similar.
26764 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
26765 if (Subtarget->isTargetFuchsia())
26766 return UseTlsOffset(IRB, Offset: -0x10);
26767
26768 return TargetLowering::getIRStackGuard(IRB);
26769}
26770
26771void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
  // MSVC CRT provides functionality for stack protection.
26773 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
26774 // MSVC CRT has a global variable holding security cookie.
26775 M.getOrInsertGlobal(Name: "__security_cookie",
26776 Ty: PointerType::getUnqual(C&: M.getContext()));
26777
26778 // MSVC CRT has a function to validate security cookie.
26779 FunctionCallee SecurityCheckCookie =
26780 M.getOrInsertFunction(Name: Subtarget->getSecurityCheckCookieName(),
26781 RetTy: Type::getVoidTy(C&: M.getContext()),
26782 Args: PointerType::getUnqual(C&: M.getContext()));
26783 if (Function *F = dyn_cast<Function>(Val: SecurityCheckCookie.getCallee())) {
26784 F->setCallingConv(CallingConv::Win64);
26785 F->addParamAttr(ArgNo: 0, Kind: Attribute::AttrKind::InReg);
26786 }
26787 return;
26788 }
26789 TargetLowering::insertSSPDeclarations(M);
26790}
26791
26792Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
26793 // MSVC CRT has a global variable holding security cookie.
26794 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26795 return M.getGlobalVariable(Name: "__security_cookie");
26796 return TargetLowering::getSDagStackGuard(M);
26797}
26798
26799Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
26800 // MSVC CRT has a function to validate security cookie.
26801 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26802 return M.getFunction(Name: Subtarget->getSecurityCheckCookieName());
26803 return TargetLowering::getSSPStackGuardCheck(M);
26804}
26805
26806Value *
26807AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
26808 // Android provides a fixed TLS slot for the SafeStack pointer. See the
26809 // definition of TLS_SLOT_SAFESTACK in
26810 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
26811 if (Subtarget->isTargetAndroid())
26812 return UseTlsOffset(IRB, Offset: 0x48);
26813
26814 // Fuchsia is similar.
26815 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
26816 if (Subtarget->isTargetFuchsia())
26817 return UseTlsOffset(IRB, Offset: -0x8);
26818
26819 return TargetLowering::getSafeStackPointerLocation(IRB);
26820}
26821
26822bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
26823 const Instruction &AndI) const {
  // Only sink the 'and' mask to the cmp's use block if it masks a single bit,
  // since that is likely to let the and/cmp/br fold into a single tbz
  // instruction. It may be beneficial to sink in other cases, but we would
  // have to check that the cmp would not get folded into the br to form a cbz
  // for these to be beneficial.
26829 ConstantInt* Mask = dyn_cast<ConstantInt>(Val: AndI.getOperand(i: 1));
26830 if (!Mask)
26831 return false;
26832 return Mask->getValue().isPowerOf2();
26833}
26834
26835bool AArch64TargetLowering::
26836 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
26837 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
26838 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
26839 SelectionDAG &DAG) const {
26840 // Does baseline recommend not to perform the fold by default?
26841 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
26842 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
26843 return false;
26844 // Else, if this is a vector shift, prefer 'shl'.
26845 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
26846}
26847
26848TargetLowering::ShiftLegalizationStrategy
26849AArch64TargetLowering::preferredShiftLegalizationStrategy(
26850 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
26851 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
26852 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
26853 return ShiftLegalizationStrategy::LowerToLibcall;
26854 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
26855 ExpansionFactor);
26856}
26857
26858void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in AArch64FunctionInfo.
26860 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
26861 AFI->setIsSplitCSR(true);
26862}
26863
26864void AArch64TargetLowering::insertCopiesSplitCSR(
26865 MachineBasicBlock *Entry,
26866 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
26867 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
26868 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(MF: Entry->getParent());
26869 if (!IStart)
26870 return;
26871
26872 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
26873 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
26874 MachineBasicBlock::iterator MBBI = Entry->begin();
26875 for (const MCPhysReg *I = IStart; *I; ++I) {
26876 const TargetRegisterClass *RC = nullptr;
26877 if (AArch64::GPR64RegClass.contains(Reg: *I))
26878 RC = &AArch64::GPR64RegClass;
26879 else if (AArch64::FPR64RegClass.contains(Reg: *I))
26880 RC = &AArch64::FPR64RegClass;
26881 else
26882 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
26883
26884 Register NewVR = MRI->createVirtualRegister(RegClass: RC);
26885 // Create copy from CSR to a virtual register.
26886 // FIXME: this currently does not emit CFI pseudo-instructions, it works
26887 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
26888 // nounwind. If we want to generalize this later, we may need to emit
26889 // CFI pseudo-instructions.
26890 assert(Entry->getParent()->getFunction().hasFnAttribute(
26891 Attribute::NoUnwind) &&
26892 "Function should be nounwind in insertCopiesSplitCSR!");
26893 Entry->addLiveIn(PhysReg: *I);
26894 BuildMI(BB&: *Entry, I: MBBI, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: NewVR)
26895 .addReg(RegNo: *I);
26896
26897 // Insert the copy-back instructions right before the terminator.
26898 for (auto *Exit : Exits)
26899 BuildMI(BB&: *Exit, I: Exit->getFirstTerminator(), MIMD: DebugLoc(),
26900 MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: *I)
26901 .addReg(RegNo: NewVR);
26902 }
26903}
26904
26905bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
26906 // Integer division on AArch64 is expensive. However, when aggressively
26907 // optimizing for code size, we prefer to use a div instruction, as it is
26908 // usually smaller than the alternative sequence.
26909 // The exception to this is vector division. Since AArch64 doesn't have vector
26910 // integer division, leaving the division as-is is a loss even in terms of
26911 // size, because it will have to be scalarized, while the alternative code
26912 // sequence can be performed in vector form.
26913 bool OptSize = Attr.hasFnAttr(Kind: Attribute::MinSize);
26914 return OptSize && !VT.isVector();
26915}
26916
26917bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
26918 // We want inc-of-add for scalars and sub-of-not for vectors.
26919 return VT.isScalarInteger();
26920}
26921
26922bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
26923 EVT VT) const {
  // v8f16 without fp16 needs to be extended to v8f32, which is more difficult
  // to legalize.
26926 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
26927 return false;
26928 if (FPVT == MVT::v8bf16)
26929 return false;
26930 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
26931}
26932
26933MachineInstr *
26934AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
26935 MachineBasicBlock::instr_iterator &MBBI,
26936 const TargetInstrInfo *TII) const {
26937 assert(MBBI->isCall() && MBBI->getCFIType() &&
26938 "Invalid call instruction for a KCFI check");
26939
26940 switch (MBBI->getOpcode()) {
26941 case AArch64::BLR:
26942 case AArch64::BLRNoIP:
26943 case AArch64::TCRETURNri:
26944 case AArch64::TCRETURNrix16x17:
26945 case AArch64::TCRETURNrix17:
26946 case AArch64::TCRETURNrinotx16:
26947 break;
26948 default:
26949 llvm_unreachable("Unexpected CFI call opcode");
26950 }
26951
26952 MachineOperand &Target = MBBI->getOperand(i: 0);
26953 assert(Target.isReg() && "Invalid target operand for an indirect call");
26954 Target.setIsRenamable(false);
26955
26956 return BuildMI(BB&: MBB, I: MBBI, MIMD: MBBI->getDebugLoc(), MCID: TII->get(Opcode: AArch64::KCFI_CHECK))
26957 .addReg(RegNo: Target.getReg())
26958 .addImm(Val: MBBI->getCFIType())
26959 .getInstr();
26960}
26961
26962bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
26963 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
26964}
26965
26966unsigned
26967AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
26968 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
26969 return getPointerTy(DL).getSizeInBits();
26970
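  // AAPCS64 describes va_list as { void *__stack; void *__gr_top;
  // void *__vr_top; int __gr_offs; int __vr_offs; }: three pointers plus two
  // 32-bit offsets, which is what the computation below returns.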
26971 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
26972}
26973
26974void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
26975 MachineFrameInfo &MFI = MF.getFrameInfo();
26976 // If we have any vulnerable SVE stack objects then the stack protector
26977 // needs to be placed at the top of the SVE stack area, as the SVE locals
26978 // are placed above the other locals, so we allocate it as if it were a
26979 // scalable vector.
26980 // FIXME: It may be worthwhile having a specific interface for this rather
26981 // than doing it here in finalizeLowering.
26982 if (MFI.hasStackProtectorIndex()) {
26983 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
26984 if (MFI.getStackID(ObjectIdx: i) == TargetStackID::ScalableVector &&
26985 MFI.getObjectSSPLayout(ObjectIdx: i) != MachineFrameInfo::SSPLK_None) {
26986 MFI.setStackID(ObjectIdx: MFI.getStackProtectorIndex(),
26987 ID: TargetStackID::ScalableVector);
26988 MFI.setObjectAlignment(ObjectIdx: MFI.getStackProtectorIndex(), Alignment: Align(16));
26989 break;
26990 }
26991 }
26992 }
26993 MFI.computeMaxCallFrameSize(MF);
26994 TargetLoweringBase::finalizeLowering(MF);
26995}
26996
26997// Unlike X86, we let frame lowering assign offsets to all catch objects.
26998bool AArch64TargetLowering::needsFixedCatchObjects() const {
26999 return false;
27000}
27001
27002bool AArch64TargetLowering::shouldLocalize(
27003 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
27004 auto &MF = *MI.getMF();
27005 auto &MRI = MF.getRegInfo();
27006 auto maxUses = [](unsigned RematCost) {
27007 // A cost of 1 means remats are basically free.
27008 if (RematCost == 1)
27009 return std::numeric_limits<unsigned>::max();
27010 if (RematCost == 2)
27011 return 2U;
27012
27013 // Remat is too expensive, only sink if there's one user.
27014 if (RematCost > 2)
27015 return 1U;
27016 llvm_unreachable("Unexpected remat cost");
27017 };
27018
27019 unsigned Opc = MI.getOpcode();
27020 switch (Opc) {
27021 case TargetOpcode::G_GLOBAL_VALUE: {
    // On Darwin, TLS global vars get selected into function calls, which
    // we don't want localized, as they can get moved into the middle of
    // another call sequence.
27025 const GlobalValue &GV = *MI.getOperand(i: 1).getGlobal();
27026 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
27027 return false;
27028 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
27029 }
27030 case TargetOpcode::G_FCONSTANT:
27031 case TargetOpcode::G_CONSTANT: {
27032 const ConstantInt *CI;
27033 unsigned AdditionalCost = 0;
27034
27035 if (Opc == TargetOpcode::G_CONSTANT)
27036 CI = MI.getOperand(i: 1).getCImm();
27037 else {
27038 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
27039 // We try to estimate cost of 32/64b fpimms, as they'll likely be
27040 // materialized as integers.
27041 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
27042 break;
27043 auto APF = MI.getOperand(i: 1).getFPImm()->getValueAPF();
27044 bool OptForSize =
27045 MF.getFunction().hasOptSize() || MF.getFunction().hasMinSize();
27046 if (isFPImmLegal(Imm: APF, VT: EVT::getFloatingPointVT(BitWidth: Ty.getScalarSizeInBits()),
27047 OptForSize))
27048 return true; // Constant should be cheap.
27049 CI =
27050 ConstantInt::get(Context&: MF.getFunction().getContext(), V: APF.bitcastToAPInt());
27051 // FP materialization also costs an extra move, from gpr to fpr.
27052 AdditionalCost = 1;
27053 }
27054 APInt Imm = CI->getValue();
27055 InstructionCost Cost = TTI->getIntImmCost(
27056 Imm, Ty: CI->getType(), CostKind: TargetTransformInfo::TCK_CodeSize);
27057 assert(Cost.isValid() && "Expected a valid imm cost");
27058
27059 unsigned RematCost = *Cost.getValue();
27060 RematCost += AdditionalCost;
27061 Register Reg = MI.getOperand(i: 0).getReg();
27062 unsigned MaxUses = maxUses(RematCost);
27063 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
27064 if (MaxUses == std::numeric_limits<unsigned>::max())
27065 --MaxUses;
27066 return MRI.hasAtMostUserInstrs(Reg, MaxUsers: MaxUses);
27067 }
27068 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
27069 // localizable.
27070 case AArch64::ADRP:
27071 case AArch64::G_ADD_LOW:
27072 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
27073 case TargetOpcode::G_PTR_ADD:
27074 return true;
27075 default:
27076 break;
27077 }
27078 return TargetLoweringBase::shouldLocalize(MI, TTI);
27079}
27080
27081bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
27082 // Fallback for scalable vectors.
27083 // Note that if EnableSVEGISel is true, we allow scalable vector types for
27084 // all instructions, regardless of whether they are actually supported.
27085 if (!EnableSVEGISel) {
27086 if (Inst.getType()->isScalableTy()) {
27087 return true;
27088 }
27089
27090 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
27091 if (Inst.getOperand(i)->getType()->isScalableTy())
27092 return true;
27093
27094 if (const AllocaInst *AI = dyn_cast<AllocaInst>(Val: &Inst)) {
27095 if (AI->getAllocatedType()->isScalableTy())
27096 return true;
27097 }
27098 }
27099
27100 // Checks to allow the use of SME instructions
27101 if (auto *Base = dyn_cast<CallBase>(Val: &Inst)) {
27102 auto CallerAttrs = SMEAttrs(*Inst.getFunction());
27103 auto CalleeAttrs = SMEAttrs(*Base);
27104 if (CallerAttrs.requiresSMChange(Callee: CalleeAttrs) ||
27105 CallerAttrs.requiresLazySave(Callee: CalleeAttrs) ||
27106 CallerAttrs.requiresPreservingZT0(Callee: CalleeAttrs))
27107 return true;
27108 }
27109 return false;
27110}
27111
27112// Return the largest legal scalable vector type that matches VT's element type.
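// For example, a legal fixed-length vector with i16 elements maps to the
// nxv8i16 container below, regardless of how many i16 lanes it has.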
27113static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
27114 assert(VT.isFixedLengthVector() &&
27115 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
27116 "Expected legal fixed length vector!");
27117 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
27118 default:
27119 llvm_unreachable("unexpected element type for SVE container");
27120 case MVT::i8:
27121 return EVT(MVT::nxv16i8);
27122 case MVT::i16:
27123 return EVT(MVT::nxv8i16);
27124 case MVT::i32:
27125 return EVT(MVT::nxv4i32);
27126 case MVT::i64:
27127 return EVT(MVT::nxv2i64);
27128 case MVT::bf16:
27129 return EVT(MVT::nxv8bf16);
27130 case MVT::f16:
27131 return EVT(MVT::nxv8f16);
27132 case MVT::f32:
27133 return EVT(MVT::nxv4f32);
27134 case MVT::f64:
27135 return EVT(MVT::nxv2f64);
27136 }
27137}
27138
27139// Return a PTRUE with active lanes corresponding to the extent of VT.
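// For example (illustrative): a v8i32 operand yields an nxv4i1 PTRUE using
// the VL8 pattern, so exactly the first eight 32-bit lanes are active.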
27140static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
27141 EVT VT) {
27142 assert(VT.isFixedLengthVector() &&
27143 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
27144 "Expected legal fixed length vector!");
27145
27146 std::optional<unsigned> PgPattern =
27147 getSVEPredPatternFromNumElements(MinNumElts: VT.getVectorNumElements());
27148 assert(PgPattern && "Unexpected element count for SVE predicate");
27149
27150 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
27151 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
27152 // variants of instructions when available.
27153 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
27154 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
27155 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
27156 if (MaxSVESize && MinSVESize == MaxSVESize &&
27157 MaxSVESize == VT.getSizeInBits())
27158 PgPattern = AArch64SVEPredPattern::all;
27159
27160 MVT MaskVT;
27161 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
27162 default:
27163 llvm_unreachable("unexpected element type for SVE predicate");
27164 case MVT::i8:
27165 MaskVT = MVT::nxv16i1;
27166 break;
27167 case MVT::i16:
27168 case MVT::f16:
27169 case MVT::bf16:
27170 MaskVT = MVT::nxv8i1;
27171 break;
27172 case MVT::i32:
27173 case MVT::f32:
27174 MaskVT = MVT::nxv4i1;
27175 break;
27176 case MVT::i64:
27177 case MVT::f64:
27178 MaskVT = MVT::nxv2i1;
27179 break;
27180 }
27181
27182 return getPTrue(DAG, DL, VT: MaskVT, Pattern: *PgPattern);
27183}
27184
27185static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
27186 EVT VT) {
27187 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
27188 "Expected legal scalable vector!");
27189 auto PredTy = VT.changeVectorElementType(EltVT: MVT::i1);
27190 return getPTrue(DAG, DL, VT: PredTy, Pattern: AArch64SVEPredPattern::all);
27191}
27192
27193static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
27194 if (VT.isFixedLengthVector())
27195 return getPredicateForFixedLengthVector(DAG, DL, VT);
27196
27197 return getPredicateForScalableVector(DAG, DL, VT);
27198}
27199
27200// Grow V to consume an entire SVE register.
27201static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
27202 assert(VT.isScalableVector() &&
27203 "Expected to convert into a scalable vector!");
27204 assert(V.getValueType().isFixedLengthVector() &&
27205 "Expected a fixed length vector operand!");
27206 SDLoc DL(V);
27207 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
27208 return DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT, N1: DAG.getUNDEF(VT), N2: V, N3: Zero);
27209}
27210
27211// Shrink V so it's just big enough to maintain a VT's worth of data.
27212static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
27213 assert(VT.isFixedLengthVector() &&
27214 "Expected to convert into a fixed length vector!");
27215 assert(V.getValueType().isScalableVector() &&
27216 "Expected a scalable vector operand!");
27217 SDLoc DL(V);
27218 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
27219 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: V, N2: Zero);
27220}
27221
27222// Convert all fixed length vector loads larger than NEON to masked_loads.
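// For example (illustrative, assuming 256-bit SVE vectors): a legal v8i32
// load becomes an SVE masked load of nxv4i32 under the VL8 predicate, and the
// result is extracted back to v8i32.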
27223SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
27224 SDValue Op, SelectionDAG &DAG) const {
27225 auto Load = cast<LoadSDNode>(Val&: Op);
27226
27227 SDLoc DL(Op);
27228 EVT VT = Op.getValueType();
27229 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27230 EVT LoadVT = ContainerVT;
27231 EVT MemVT = Load->getMemoryVT();
27232
27233 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
27234
27235 if (VT.isFloatingPoint()) {
27236 LoadVT = ContainerVT.changeTypeToInteger();
27237 MemVT = MemVT.changeTypeToInteger();
27238 }
27239
27240 SDValue NewLoad = DAG.getMaskedLoad(
27241 VT: LoadVT, dl: DL, Chain: Load->getChain(), Base: Load->getBasePtr(), Offset: Load->getOffset(), Mask: Pg,
27242 Src0: DAG.getUNDEF(VT: LoadVT), MemVT, MMO: Load->getMemOperand(),
27243 AM: Load->getAddressingMode(), Load->getExtensionType());
27244
27245 SDValue Result = NewLoad;
27246 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
27247 EVT ExtendVT = ContainerVT.changeVectorElementType(
27248 EltVT: Load->getMemoryVT().getVectorElementType());
27249
27250 Result = getSVESafeBitCast(VT: ExtendVT, Op: Result, DAG);
27251 Result = DAG.getNode(Opcode: AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, VT: ContainerVT,
27252 N1: Pg, N2: Result, N3: DAG.getUNDEF(VT: ContainerVT));
27253 } else if (VT.isFloatingPoint()) {
27254 Result = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerVT, Operand: Result);
27255 }
27256
27257 Result = convertFromScalableVector(DAG, VT, V: Result);
27258 SDValue MergedValues[2] = {Result, NewLoad.getValue(R: 1)};
27259 return DAG.getMergeValues(Ops: MergedValues, dl: DL);
27260}
27261
27262static SDValue convertFixedMaskToScalableVector(SDValue Mask,
27263 SelectionDAG &DAG) {
27264 SDLoc DL(Mask);
27265 EVT InVT = Mask.getValueType();
27266 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
27267
27268 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT: InVT);
27269
27270 if (ISD::isBuildVectorAllOnes(N: Mask.getNode()))
27271 return Pg;
27272
27273 auto Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Mask);
27274 auto Op2 = DAG.getConstant(Val: 0, DL, VT: ContainerVT);
27275
27276 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL, VT: Pg.getValueType(),
27277 Ops: {Pg, Op1, Op2, DAG.getCondCode(Cond: ISD::SETNE)});
27278}
27279
// Convert all fixed length vector masked loads larger than NEON to SVE masked
// loads.
27281SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
27282 SDValue Op, SelectionDAG &DAG) const {
27283 auto Load = cast<MaskedLoadSDNode>(Val&: Op);
27284
27285 SDLoc DL(Op);
27286 EVT VT = Op.getValueType();
27287 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27288
27289 SDValue Mask = Load->getMask();
  // If this is an extending load and the mask type is not the same as the
  // load's type, then we have to extend the mask type.
27292 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
27293 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
27294 "Incorrect mask type");
27295 Mask = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: Mask);
27296 }
27297 Mask = convertFixedMaskToScalableVector(Mask, DAG);
27298
27299 SDValue PassThru;
27300 bool IsPassThruZeroOrUndef = false;
27301
27302 if (Load->getPassThru()->isUndef()) {
27303 PassThru = DAG.getUNDEF(VT: ContainerVT);
27304 IsPassThruZeroOrUndef = true;
27305 } else {
27306 if (ContainerVT.isInteger())
27307 PassThru = DAG.getConstant(Val: 0, DL, VT: ContainerVT);
27308 else
27309 PassThru = DAG.getConstantFP(Val: 0, DL, VT: ContainerVT);
27310 if (isZerosVector(N: Load->getPassThru().getNode()))
27311 IsPassThruZeroOrUndef = true;
27312 }
27313
27314 SDValue NewLoad = DAG.getMaskedLoad(
27315 VT: ContainerVT, dl: DL, Chain: Load->getChain(), Base: Load->getBasePtr(), Offset: Load->getOffset(),
27316 Mask, Src0: PassThru, MemVT: Load->getMemoryVT(), MMO: Load->getMemOperand(),
27317 AM: Load->getAddressingMode(), Load->getExtensionType());
27318
27319 SDValue Result = NewLoad;
27320 if (!IsPassThruZeroOrUndef) {
27321 SDValue OldPassThru =
27322 convertToScalableVector(DAG, VT: ContainerVT, V: Load->getPassThru());
27323 Result = DAG.getSelect(DL, VT: ContainerVT, Cond: Mask, LHS: Result, RHS: OldPassThru);
27324 }
27325
27326 Result = convertFromScalableVector(DAG, VT, V: Result);
27327 SDValue MergedValues[2] = {Result, NewLoad.getValue(R: 1)};
27328 return DAG.getMergeValues(Ops: MergedValues, dl: DL);
27329}
27330
27331// Convert all fixed length vector stores larger than NEON to masked_stores.
27332SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
27333 SDValue Op, SelectionDAG &DAG) const {
27334 auto Store = cast<StoreSDNode>(Val&: Op);
27335
27336 SDLoc DL(Op);
27337 EVT VT = Store->getValue().getValueType();
27338 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27339 EVT MemVT = Store->getMemoryVT();
27340
27341 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
27342 auto NewValue = convertToScalableVector(DAG, VT: ContainerVT, V: Store->getValue());
27343
27344 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
27345 EVT TruncVT = ContainerVT.changeVectorElementType(
27346 EltVT: Store->getMemoryVT().getVectorElementType());
27347 MemVT = MemVT.changeTypeToInteger();
27348 NewValue = DAG.getNode(Opcode: AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, VT: TruncVT, N1: Pg,
27349 N2: NewValue, N3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i64),
27350 N4: DAG.getUNDEF(VT: TruncVT));
27351 NewValue =
27352 getSVESafeBitCast(VT: ContainerVT.changeTypeToInteger(), Op: NewValue, DAG);
27353 } else if (VT.isFloatingPoint()) {
27354 MemVT = MemVT.changeTypeToInteger();
27355 NewValue =
27356 getSVESafeBitCast(VT: ContainerVT.changeTypeToInteger(), Op: NewValue, DAG);
27357 }
27358
27359 return DAG.getMaskedStore(Chain: Store->getChain(), dl: DL, Val: NewValue,
27360 Base: Store->getBasePtr(), Offset: Store->getOffset(), Mask: Pg, MemVT,
27361 MMO: Store->getMemOperand(), AM: Store->getAddressingMode(),
27362 IsTruncating: Store->isTruncatingStore());
27363}
27364
27365SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
27366 SDValue Op, SelectionDAG &DAG) const {
27367 auto *Store = cast<MaskedStoreSDNode>(Val&: Op);
27368
27369 SDLoc DL(Op);
27370 EVT VT = Store->getValue().getValueType();
27371 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27372
27373 auto NewValue = convertToScalableVector(DAG, VT: ContainerVT, V: Store->getValue());
27374 SDValue Mask = convertFixedMaskToScalableVector(Mask: Store->getMask(), DAG);
27375
27376 return DAG.getMaskedStore(
27377 Chain: Store->getChain(), dl: DL, Val: NewValue, Base: Store->getBasePtr(), Offset: Store->getOffset(),
27378 Mask, MemVT: Store->getMemoryVT(), MMO: Store->getMemOperand(),
27379 AM: Store->getAddressingMode(), IsTruncating: Store->isTruncatingStore());
27380}
27381
27382SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
27383 SDValue Op, SelectionDAG &DAG) const {
27384 SDLoc dl(Op);
27385 EVT VT = Op.getValueType();
27386 EVT EltVT = VT.getVectorElementType();
27387
27388 bool Signed = Op.getOpcode() == ISD::SDIV;
27389 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
27390
27391 bool Negated;
27392 uint64_t SplatVal;
27393 if (Signed && isPow2Splat(Op: Op.getOperand(i: 1), SplatVal, Negated)) {
27394 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27395 SDValue Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op.getOperand(i: 0));
27396 SDValue Op2 = DAG.getTargetConstant(Val: Log2_64(Value: SplatVal), DL: dl, VT: MVT::i32);
27397
27398 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL&: dl, VT);
27399 SDValue Res =
27400 DAG.getNode(Opcode: AArch64ISD::SRAD_MERGE_OP1, DL: dl, VT: ContainerVT, N1: Pg, N2: Op1, N3: Op2);
27401 if (Negated)
27402 Res = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: ContainerVT,
27403 N1: DAG.getConstant(Val: 0, DL: dl, VT: ContainerVT), N2: Res);
27404
27405 return convertFromScalableVector(DAG, VT, V: Res);
27406 }
27407
27408 // Scalable vector i32/i64 DIV is supported.
27409 if (EltVT == MVT::i32 || EltVT == MVT::i64)
27410 return LowerToPredicatedOp(Op, DAG, NewOp: PredOpcode);
27411
27412 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
27413 EVT HalfVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
27414 EVT PromVT = HalfVT.widenIntegerVectorElementType(Context&: *DAG.getContext());
27415 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
27416
27417 // If the wider type is legal: extend, op, and truncate.
27418 EVT WideVT = VT.widenIntegerVectorElementType(Context&: *DAG.getContext());
27419 if (DAG.getTargetLoweringInfo().isTypeLegal(VT: WideVT)) {
27420 SDValue Op0 = DAG.getNode(Opcode: ExtendOpcode, DL: dl, VT: WideVT, Operand: Op.getOperand(i: 0));
27421 SDValue Op1 = DAG.getNode(Opcode: ExtendOpcode, DL: dl, VT: WideVT, Operand: Op.getOperand(i: 1));
27422 SDValue Div = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: WideVT, N1: Op0, N2: Op1);
27423 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT, Operand: Div);
27424 }
27425
27426 auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
27427 &ExtendOpcode](SDValue Op) {
27428 SDValue IdxZero = DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64);
27429 SDValue IdxHalf =
27430 DAG.getConstant(Val: HalfVT.getVectorNumElements(), DL: dl, VT: MVT::i64);
27431 SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: HalfVT, N1: Op, N2: IdxZero);
27432 SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: HalfVT, N1: Op, N2: IdxHalf);
27433 return std::pair<SDValue, SDValue>(
27434 {DAG.getNode(Opcode: ExtendOpcode, DL: dl, VT: PromVT, Operand: Lo),
27435 DAG.getNode(Opcode: ExtendOpcode, DL: dl, VT: PromVT, Operand: Hi)});
27436 };
27437
  // If the wider type is not legal: split, extend, op, trunc and concat.
27439 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(i: 0));
27440 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(i: 1));
27441 SDValue Lo = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: PromVT, N1: Op0LoExt, N2: Op1LoExt);
27442 SDValue Hi = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: PromVT, N1: Op0HiExt, N2: Op1HiExt);
27443 SDValue LoTrunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: HalfVT, Operand: Lo);
27444 SDValue HiTrunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: HalfVT, Operand: Hi);
27445 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, Ops: {LoTrunc, HiTrunc});
27446}
27447
27448SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
27449 SDValue Op, SelectionDAG &DAG) const {
27450 EVT VT = Op.getValueType();
27451 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27452
27453 SDLoc DL(Op);
27454 SDValue Val = Op.getOperand(i: 0);
27455 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: Val.getValueType());
27456 Val = convertToScalableVector(DAG, VT: ContainerVT, V: Val);
27457
27458 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
27459 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
27460
27461 // Repeatedly unpack Val until the result is of the desired element type.
27462 switch (ContainerVT.getSimpleVT().SimpleTy) {
27463 default:
27464 llvm_unreachable("unimplemented container type");
27465 case MVT::nxv16i8:
27466 Val = DAG.getNode(Opcode: ExtendOpc, DL, VT: MVT::nxv8i16, Operand: Val);
27467 if (VT.getVectorElementType() == MVT::i16)
27468 break;
27469 [[fallthrough]];
27470 case MVT::nxv8i16:
27471 Val = DAG.getNode(Opcode: ExtendOpc, DL, VT: MVT::nxv4i32, Operand: Val);
27472 if (VT.getVectorElementType() == MVT::i32)
27473 break;
27474 [[fallthrough]];
27475 case MVT::nxv4i32:
27476 Val = DAG.getNode(Opcode: ExtendOpc, DL, VT: MVT::nxv2i64, Operand: Val);
27477 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
27478 break;
27479 }
27480
27481 return convertFromScalableVector(DAG, VT, V: Val);
27482}
27483
27484SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
27485 SDValue Op, SelectionDAG &DAG) const {
27486 EVT VT = Op.getValueType();
27487 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27488
27489 SDLoc DL(Op);
27490 SDValue Val = Op.getOperand(i: 0);
27491 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: Val.getValueType());
27492 Val = convertToScalableVector(DAG, VT: ContainerVT, V: Val);
27493
27494 // Repeatedly truncate Val until the result is of the desired element type.
27495 switch (ContainerVT.getSimpleVT().SimpleTy) {
27496 default:
27497 llvm_unreachable("unimplemented container type");
27498 case MVT::nxv2i64:
27499 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv4i32, Operand: Val);
27500 Val = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: MVT::nxv4i32, N1: Val, N2: Val);
27501 if (VT.getVectorElementType() == MVT::i32)
27502 break;
27503 [[fallthrough]];
27504 case MVT::nxv4i32:
27505 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv8i16, Operand: Val);
27506 Val = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: MVT::nxv8i16, N1: Val, N2: Val);
27507 if (VT.getVectorElementType() == MVT::i16)
27508 break;
27509 [[fallthrough]];
27510 case MVT::nxv8i16:
27511 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv16i8, Operand: Val);
27512 Val = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: MVT::nxv16i8, N1: Val, N2: Val);
27513 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
27514 break;
27515 }
27516
27517 return convertFromScalableVector(DAG, VT, V: Val);
27518}
27519
27520SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
27521 SDValue Op, SelectionDAG &DAG) const {
27522 EVT VT = Op.getValueType();
27523 EVT InVT = Op.getOperand(i: 0).getValueType();
27524 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
27525
27526 SDLoc DL(Op);
27527 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
27528 SDValue Op0 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 0));
27529
27530 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: Op0, N2: Op.getOperand(i: 1));
27531}
27532
27533SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
27534 SDValue Op, SelectionDAG &DAG) const {
27535 EVT VT = Op.getValueType();
27536 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27537
27538 SDLoc DL(Op);
27539 EVT InVT = Op.getOperand(i: 0).getValueType();
27540 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
27541 SDValue Op0 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 0));
27542
27543 auto ScalableRes = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ContainerVT, N1: Op0,
27544 N2: Op.getOperand(i: 1), N3: Op.getOperand(i: 2));
27545
27546 return convertFromScalableVector(DAG, VT, V: ScalableRes);
27547}
27548
27549// Convert vector operation 'Op' to an equivalent predicated operation whereby
27550// the original operation's type is used to construct a suitable predicate.
27551// NOTE: The results for inactive lanes are undefined.
27552SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
27553 SelectionDAG &DAG,
27554 unsigned NewOp) const {
27555 EVT VT = Op.getValueType();
27556 SDLoc DL(Op);
27557 auto Pg = getPredicateForVector(DAG, DL, VT);
27558
27559 if (VT.isFixedLengthVector()) {
27560 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
27561 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27562
27563 // Create list of operands by converting existing ones to scalable types.
27564 SmallVector<SDValue, 4> Operands = {Pg};
27565 for (const SDValue &V : Op->op_values()) {
27566 if (isa<CondCodeSDNode>(Val: V)) {
27567 Operands.push_back(Elt: V);
27568 continue;
27569 }
27570
27571 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(Val: V)) {
27572 EVT VTArg = VTNode->getVT().getVectorElementType();
27573 EVT NewVTArg = ContainerVT.changeVectorElementType(EltVT: VTArg);
27574 Operands.push_back(Elt: DAG.getValueType(NewVTArg));
27575 continue;
27576 }
27577
27578 assert(isTypeLegal(V.getValueType()) &&
27579 "Expected only legal fixed-width types");
27580 Operands.push_back(Elt: convertToScalableVector(DAG, VT: ContainerVT, V));
27581 }
27582
27583 if (isMergePassthruOpcode(Opc: NewOp))
27584 Operands.push_back(Elt: DAG.getUNDEF(VT: ContainerVT));
27585
27586 auto ScalableRes = DAG.getNode(Opcode: NewOp, DL, VT: ContainerVT, Ops: Operands);
27587 return convertFromScalableVector(DAG, VT, V: ScalableRes);
27588 }
27589
27590 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
27591
27592 SmallVector<SDValue, 4> Operands = {Pg};
27593 for (const SDValue &V : Op->op_values()) {
27594 assert((!V.getValueType().isVector() ||
27595 V.getValueType().isScalableVector()) &&
27596 "Only scalable vectors are supported!");
27597 Operands.push_back(Elt: V);
27598 }
27599
27600 if (isMergePassthruOpcode(Opc: NewOp))
27601 Operands.push_back(Elt: DAG.getUNDEF(VT));
27602
27603 return DAG.getNode(Opcode: NewOp, DL, VT, Ops: Operands, Flags: Op->getFlags());
27604}
27605
27606// If a fixed length vector operation has no side effects when applied to
27607// undefined elements, we can safely use scalable vectors to perform the same
27608// operation without needing to worry about predication.
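// For example (illustrative): a legal fixed-length v4i32 ADD can be performed
// as an unpredicated nxv4i32 ADD; any extra lanes compute values that are
// simply discarded when the result is converted back to v4i32.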
27609SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
27610 SelectionDAG &DAG) const {
27611 EVT VT = Op.getValueType();
27612 assert(VT.isFixedLengthVector() && isTypeLegal(VT) &&
27613 "Only expected to lower fixed length vector operation!");
27614 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27615
27616 // Create list of operands by converting existing ones to scalable types.
27617 SmallVector<SDValue, 4> Ops;
27618 for (const SDValue &V : Op->op_values()) {
27619 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
27620
27621 // Pass through non-vector operands.
27622 if (!V.getValueType().isVector()) {
27623 Ops.push_back(Elt: V);
27624 continue;
27625 }
27626
27627 // "cast" fixed length vector to a scalable vector.
27628 assert(V.getValueType().isFixedLengthVector() &&
27629 isTypeLegal(V.getValueType()) &&
27630 "Only fixed length vectors are supported!");
27631 Ops.push_back(Elt: convertToScalableVector(DAG, VT: ContainerVT, V));
27632 }
27633
27634 auto ScalableRes = DAG.getNode(Opcode: Op.getOpcode(), DL: SDLoc(Op), VT: ContainerVT, Ops);
27635 return convertFromScalableVector(DAG, VT, V: ScalableRes);
27636}
27637
27638SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
27639 SelectionDAG &DAG) const {
27640 SDLoc DL(ScalarOp);
27641 SDValue AccOp = ScalarOp.getOperand(i: 0);
27642 SDValue VecOp = ScalarOp.getOperand(i: 1);
27643 EVT SrcVT = VecOp.getValueType();
27644 EVT ResVT = SrcVT.getVectorElementType();
27645
27646 EVT ContainerVT = SrcVT;
27647 if (SrcVT.isFixedLengthVector()) {
27648 ContainerVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
27649 VecOp = convertToScalableVector(DAG, VT: ContainerVT, V: VecOp);
27650 }
27651
27652 SDValue Pg = getPredicateForVector(DAG, DL, VT: SrcVT);
27653 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
27654
27655 // Convert operands to Scalable.
27656 AccOp = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ContainerVT,
27657 N1: DAG.getUNDEF(VT: ContainerVT), N2: AccOp, N3: Zero);
27658
27659 // Perform reduction.
27660 SDValue Rdx = DAG.getNode(Opcode: AArch64ISD::FADDA_PRED, DL, VT: ContainerVT,
27661 N1: Pg, N2: AccOp, N3: VecOp);
27662
27663 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ResVT, N1: Rdx, N2: Zero);
27664}
27665
27666SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
27667 SelectionDAG &DAG) const {
27668 SDLoc DL(ReduceOp);
27669 SDValue Op = ReduceOp.getOperand(i: 0);
27670 EVT OpVT = Op.getValueType();
27671 EVT VT = ReduceOp.getValueType();
27672
27673 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
27674 return SDValue();
27675
27676 SDValue Pg = getPredicateForVector(DAG, DL, VT: OpVT);
27677
27678 switch (ReduceOp.getOpcode()) {
27679 default:
27680 return SDValue();
27681 case ISD::VECREDUCE_OR:
27682 if (isAllActivePredicate(DAG, N: Pg) && OpVT == MVT::nxv16i1)
27683 // The predicate can be 'Op' because
27684 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
27685 return getPTest(DAG, VT, Pg: Op, Op, Cond: AArch64CC::ANY_ACTIVE);
27686 else
27687 return getPTest(DAG, VT, Pg, Op, Cond: AArch64CC::ANY_ACTIVE);
27688 case ISD::VECREDUCE_AND: {
27689 Op = DAG.getNode(Opcode: ISD::XOR, DL, VT: OpVT, N1: Op, N2: Pg);
27690 return getPTest(DAG, VT, Pg, Op, Cond: AArch64CC::NONE_ACTIVE);
27691 }
27692 case ISD::VECREDUCE_XOR: {
27693 SDValue ID =
27694 DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_cntp, DL, VT: MVT::i64);
27695 if (OpVT == MVT::nxv1i1) {
27696 // Emulate a CNTP on .Q using .D and a different governing predicate.
27697 Pg = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: MVT::nxv2i1, Operand: Pg);
27698 Op = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: MVT::nxv2i1, Operand: Op);
27699 }
27700 SDValue Cntp =
27701 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::i64, N1: ID, N2: Pg, N3: Op);
27702 return DAG.getAnyExtOrTrunc(Op: Cntp, DL, VT);
27703 }
27704 }
27705
27706 return SDValue();
27707}
27708
27709SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
27710 SDValue ScalarOp,
27711 SelectionDAG &DAG) const {
27712 SDLoc DL(ScalarOp);
27713 SDValue VecOp = ScalarOp.getOperand(i: 0);
27714 EVT SrcVT = VecOp.getValueType();
27715
27716 if (useSVEForFixedLengthVectorVT(
27717 VT: SrcVT,
27718 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
27719 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
27720 VecOp = convertToScalableVector(DAG, VT: ContainerVT, V: VecOp);
27721 }
27722
27723 // UADDV always returns an i64 result.
27724 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
27725 SrcVT.getVectorElementType();
27726 EVT RdxVT = SrcVT;
27727 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
27728 RdxVT = getPackedSVEVectorVT(VT: ResVT);
27729
27730 SDValue Pg = getPredicateForVector(DAG, DL, VT: SrcVT);
27731 SDValue Rdx = DAG.getNode(Opcode, DL, VT: RdxVT, N1: Pg, N2: VecOp);
27732 SDValue Res = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ResVT,
27733 N1: Rdx, N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
27734
  // The VEC_REDUCE nodes expect an element-sized result.
27736 if (ResVT != ScalarOp.getValueType())
27737 Res = DAG.getAnyExtOrTrunc(Op: Res, DL, VT: ScalarOp.getValueType());
27738
27739 return Res;
27740}
27741
27742SDValue
27743AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
27744 SelectionDAG &DAG) const {
27745 EVT VT = Op.getValueType();
27746 SDLoc DL(Op);
27747
27748 EVT InVT = Op.getOperand(i: 1).getValueType();
27749 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
27750 SDValue Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 1));
27751 SDValue Op2 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 2));
27752
  // Convert the mask to a predicate (NOTE: We don't need to worry about
27754 // inactive lanes since VSELECT is safe when given undefined elements).
27755 EVT MaskVT = Op.getOperand(i: 0).getValueType();
27756 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, VT: MaskVT);
27757 auto Mask = convertToScalableVector(DAG, VT: MaskContainerVT, V: Op.getOperand(i: 0));
27758 Mask = DAG.getNode(Opcode: ISD::TRUNCATE, DL,
27759 VT: MaskContainerVT.changeVectorElementType(EltVT: MVT::i1), Operand: Mask);
27760
27761 auto ScalableRes = DAG.getNode(Opcode: ISD::VSELECT, DL, VT: ContainerVT,
27762 N1: Mask, N2: Op1, N3: Op2);
27763
27764 return convertFromScalableVector(DAG, VT, V: ScalableRes);
27765}
27766
27767SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
27768 SDValue Op, SelectionDAG &DAG) const {
27769 SDLoc DL(Op);
27770 EVT InVT = Op.getOperand(i: 0).getValueType();
27771 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
27772
27773 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
27774 "Only expected to lower fixed length vector operation!");
27775 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
27776 "Expected integer result of the same bit length as the inputs!");
27777
27778 auto Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op.getOperand(i: 0));
27779 auto Op2 = convertToScalableVector(DAG, VT: ContainerVT, V: Op.getOperand(i: 1));
27780 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT: InVT);
27781
27782 EVT CmpVT = Pg.getValueType();
27783 auto Cmp = DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL, VT: CmpVT,
27784 Ops: {Pg, Op1, Op2, Op.getOperand(i: 2)});
27785
27786 EVT PromoteVT = ContainerVT.changeTypeToInteger();
27787 auto Promote = DAG.getBoolExtOrTrunc(Op: Cmp, SL: DL, VT: PromoteVT, OpVT: InVT);
27788 return convertFromScalableVector(DAG, VT: Op.getValueType(), V: Promote);
27789}
27790
27791SDValue
27792AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
27793 SelectionDAG &DAG) const {
27794 SDLoc DL(Op);
27795 auto SrcOp = Op.getOperand(i: 0);
27796 EVT VT = Op.getValueType();
27797 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27798 EVT ContainerSrcVT =
27799 getContainerForFixedLengthVector(DAG, VT: SrcOp.getValueType());
27800
27801 SrcOp = convertToScalableVector(DAG, VT: ContainerSrcVT, V: SrcOp);
27802 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerDstVT, Operand: SrcOp);
27803 return convertFromScalableVector(DAG, VT, V: Op);
27804}
27805
27806SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
27807 SDValue Op, SelectionDAG &DAG) const {
27808 SDLoc DL(Op);
27809 unsigned NumOperands = Op->getNumOperands();
27810
27811 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
27812 "Unexpected number of operands in CONCAT_VECTORS");
27813
27814 auto SrcOp1 = Op.getOperand(i: 0);
27815 auto SrcOp2 = Op.getOperand(i: 1);
27816 EVT VT = Op.getValueType();
27817 EVT SrcVT = SrcOp1.getValueType();
27818
27819 if (NumOperands > 2) {
27820 SmallVector<SDValue, 4> Ops;
27821 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
27822 for (unsigned I = 0; I < NumOperands; I += 2)
27823 Ops.push_back(Elt: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: PairVT,
27824 N1: Op->getOperand(Num: I), N2: Op->getOperand(Num: I + 1)));
27825
27826 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, Ops);
27827 }
27828
27829 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27830
27831 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT: SrcVT);
27832 SrcOp1 = convertToScalableVector(DAG, VT: ContainerVT, V: SrcOp1);
27833 SrcOp2 = convertToScalableVector(DAG, VT: ContainerVT, V: SrcOp2);
27834
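  // With a predicate covering exactly the elements of the fixed-length source
  // type, SPLICE yields SrcOp1's elements followed by SrcOp2's elements, i.e.
  // the concatenation of the two operands.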
27835 Op = DAG.getNode(Opcode: AArch64ISD::SPLICE, DL, VT: ContainerVT, N1: Pg, N2: SrcOp1, N3: SrcOp2);
27836
27837 return convertFromScalableVector(DAG, VT, V: Op);
27838}
27839
27840SDValue
27841AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
27842 SelectionDAG &DAG) const {
27843 EVT VT = Op.getValueType();
27844 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27845
27846 SDLoc DL(Op);
27847 SDValue Val = Op.getOperand(i: 0);
27848 SDValue Pg = getPredicateForVector(DAG, DL, VT);
27849 EVT SrcVT = Val.getValueType();
27850 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27851 EVT ExtendVT = ContainerVT.changeVectorElementType(
27852 EltVT: SrcVT.getVectorElementType());
27853
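  // Extend the integer view of the source so each element occupies a full
  // result-sized lane, then reinterpret it as the unpacked source FP type and
  // perform a predicated FP extend into the destination container.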
27854 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: SrcVT.changeTypeToInteger(), Operand: Val);
27855 Val = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VT.changeTypeToInteger(), Operand: Val);
27856
27857 Val = convertToScalableVector(DAG, VT: ContainerVT.changeTypeToInteger(), V: Val);
27858 Val = getSVESafeBitCast(VT: ExtendVT, Op: Val, DAG);
27859 Val = DAG.getNode(Opcode: AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, VT: ContainerVT,
27860 N1: Pg, N2: Val, N3: DAG.getUNDEF(VT: ContainerVT));
27861
27862 return convertFromScalableVector(DAG, VT, V: Val);
27863}
27864
27865SDValue
27866AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
27867 SelectionDAG &DAG) const {
27868 EVT VT = Op.getValueType();
27869 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27870
27871 SDLoc DL(Op);
27872 SDValue Val = Op.getOperand(i: 0);
27873 EVT SrcVT = Val.getValueType();
27874 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
27875 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
27876 EltVT: VT.getVectorElementType());
27877 SDValue Pg = getPredicateForVector(DAG, DL, VT: RoundVT);
27878
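  // Perform a predicated FP round in the source container type, then extract
  // the narrowed elements with an integer truncate and bitcast to the result
  // type.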
27879 Val = convertToScalableVector(DAG, VT: ContainerSrcVT, V: Val);
27880 Val = DAG.getNode(Opcode: AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, VT: RoundVT, N1: Pg, N2: Val,
27881 N3: Op.getOperand(i: 1), N4: DAG.getUNDEF(VT: RoundVT));
27882 Val = getSVESafeBitCast(VT: ContainerSrcVT.changeTypeToInteger(), Op: Val, DAG);
27883 Val = convertFromScalableVector(DAG, VT: SrcVT.changeTypeToInteger(), V: Val);
27884
27885 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VT.changeTypeToInteger(), Operand: Val);
27886 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Val);
27887}
27888
27889SDValue
27890AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
27891 SelectionDAG &DAG) const {
27892 EVT VT = Op.getValueType();
27893 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27894
27895 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
27896 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
27897 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
27898
27899 SDLoc DL(Op);
27900 SDValue Val = Op.getOperand(i: 0);
27901 EVT SrcVT = Val.getValueType();
27902 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27903 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
27904
27905 if (VT.bitsGE(VT: SrcVT)) {
27906 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
27907
27908 Val = DAG.getNode(Opcode: IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
27909 VT: VT.changeTypeToInteger(), Operand: Val);
27910
    // It is safe to use a larger-than-specified operand because promoting the
    // value changes nothing from an arithmetic point of view.
27913 Val =
27914 convertToScalableVector(DAG, VT: ContainerDstVT.changeTypeToInteger(), V: Val);
27915 Val = DAG.getNode(Opcode, DL, VT: ContainerDstVT, N1: Pg, N2: Val,
27916 N3: DAG.getUNDEF(VT: ContainerDstVT));
27917 return convertFromScalableVector(DAG, VT, V: Val);
27918 } else {
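    // The result is narrower than the source, so perform the conversion in the
    // source container type and truncate to the requested type afterwards.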
27919 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
27920 EltVT: ContainerDstVT.getVectorElementType());
27921 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT: SrcVT);
27922
27923 Val = convertToScalableVector(DAG, VT: ContainerSrcVT, V: Val);
27924 Val = DAG.getNode(Opcode, DL, VT: CvtVT, N1: Pg, N2: Val, N3: DAG.getUNDEF(VT: CvtVT));
27925 Val = getSVESafeBitCast(VT: ContainerSrcVT, Op: Val, DAG);
27926 Val = convertFromScalableVector(DAG, VT: SrcVT, V: Val);
27927
27928 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VT.changeTypeToInteger(), Operand: Val);
27929 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Val);
27930 }
27931}
27932
27933SDValue
27934AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
27935 SelectionDAG &DAG) const {
27936 SDLoc DL(Op);
27937 EVT OpVT = Op.getValueType();
27938 assert(OpVT.isScalableVector() &&
27939 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
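  // Deinterleaving maps directly onto UZP1/UZP2: UZP1 gathers the even-indexed
  // elements of the concatenated operands and UZP2 the odd-indexed elements.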
27940 SDValue Even = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: OpVT, N1: Op.getOperand(i: 0),
27941 N2: Op.getOperand(i: 1));
27942 SDValue Odd = DAG.getNode(Opcode: AArch64ISD::UZP2, DL, VT: OpVT, N1: Op.getOperand(i: 0),
27943 N2: Op.getOperand(i: 1));
27944 return DAG.getMergeValues(Ops: {Even, Odd}, dl: DL);
27945}
27946
27947SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
27948 SelectionDAG &DAG) const {
27949 SDLoc DL(Op);
27950 EVT OpVT = Op.getValueType();
27951 assert(OpVT.isScalableVector() &&
27952 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
27953
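  // Interleaving maps directly onto ZIP1/ZIP2: ZIP1 interleaves the low halves
  // of the two operands and ZIP2 interleaves the high halves.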
27954 SDValue Lo = DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: OpVT, N1: Op.getOperand(i: 0),
27955 N2: Op.getOperand(i: 1));
27956 SDValue Hi = DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT: OpVT, N1: Op.getOperand(i: 0),
27957 N2: Op.getOperand(i: 1));
27958 return DAG.getMergeValues(Ops: {Lo, Hi}, dl: DL);
27959}
27960
27961SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
27962 SelectionDAG &DAG) const {
27963 // FIXME: Maybe share some code with LowerMGather/Scatter?
27964 MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Val&: Op);
27965 SDLoc DL(HG);
27966 SDValue Chain = HG->getChain();
27967 SDValue Inc = HG->getInc();
27968 SDValue Mask = HG->getMask();
27969 SDValue Ptr = HG->getBasePtr();
27970 SDValue Index = HG->getIndex();
27971 SDValue Scale = HG->getScale();
27972 SDValue IntID = HG->getIntID();
27973
27974 // The Intrinsic ID determines the type of update operation.
27975 [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(Val: IntID.getNode());
27976 // Right now, we only support 'add' as an update.
27977 assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
27978 "Unexpected histogram update operation");
27979
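  // Lower the histogram update as: gather the current bucket values, use the
  // sve_histcnt intrinsic to count matching bucket indices, multiply the counts
  // by the increment, add them to the gathered values, and scatter the results
  // back to the buckets.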
27980 EVT IncVT = Inc.getValueType();
27981 EVT IndexVT = Index.getValueType();
27982 EVT MemVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: IncVT,
27983 EC: IndexVT.getVectorElementCount());
27984 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
27985 SDValue PassThru = DAG.getSplatVector(VT: MemVT, DL, Op: Zero);
27986 SDValue IncSplat = DAG.getSplatVector(VT: MemVT, DL, Op: Inc);
27987 SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
27988
27989 MachineMemOperand *MMO = HG->getMemOperand();
27990 // Create an MMO for the gather, without load|store flags.
27991 MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
27992 PtrInfo: MMO->getPointerInfo(), F: MachineMemOperand::MOLoad, Size: MMO->getSize(),
27993 BaseAlignment: MMO->getAlign(), AAInfo: MMO->getAAInfo());
27994 ISD::MemIndexType IndexType = HG->getIndexType();
27995 SDValue Gather =
27996 DAG.getMaskedGather(VTs: DAG.getVTList(VT1: MemVT, VT2: MVT::Other), MemVT, dl: DL, Ops,
27997 MMO: GMMO, IndexType, ExtTy: ISD::NON_EXTLOAD);
27998
27999 SDValue GChain = Gather.getValue(R: 1);
28000
28001 // Perform the histcnt, multiply by inc, add to bucket data.
28002 SDValue ID = DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_histcnt, DL, VT: IncVT);
28003 SDValue HistCnt =
28004 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: IndexVT, N1: ID, N2: Mask, N3: Index, N4: Index);
28005 SDValue Mul = DAG.getNode(Opcode: ISD::MUL, DL, VT: MemVT, N1: HistCnt, N2: IncSplat);
28006 SDValue Add = DAG.getNode(Opcode: ISD::ADD, DL, VT: MemVT, N1: Gather, N2: Mul);
28007
28008 // Create an MMO for the scatter, without load|store flags.
28009 MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
28010 PtrInfo: MMO->getPointerInfo(), F: MachineMemOperand::MOStore, Size: MMO->getSize(),
28011 BaseAlignment: MMO->getAlign(), AAInfo: MMO->getAAInfo());
28012
28013 SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
28014 SDValue Scatter = DAG.getMaskedScatter(VTs: DAG.getVTList(VT: MVT::Other), MemVT, dl: DL,
28015 Ops: ScatterOps, MMO: SMMO, IndexType, IsTruncating: false);
28016 return Scatter;
28017}
28018
28019SDValue
28020AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
28021 SelectionDAG &DAG) const {
28022 EVT VT = Op.getValueType();
28023 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
28024
28025 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
28026 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
28027 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
28028
28029 SDLoc DL(Op);
28030 SDValue Val = Op.getOperand(i: 0);
28031 EVT SrcVT = Val.getValueType();
28032 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
28033 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
28034
28035 if (VT.bitsGT(VT: SrcVT)) {
28036 EVT CvtVT = ContainerDstVT.changeVectorElementType(
28037 EltVT: ContainerSrcVT.getVectorElementType());
28038 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
28039
28040 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: SrcVT.changeTypeToInteger(), Operand: Val);
28041 Val = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: Val);
28042
28043 Val = convertToScalableVector(DAG, VT: ContainerDstVT, V: Val);
28044 Val = getSVESafeBitCast(VT: CvtVT, Op: Val, DAG);
28045 Val = DAG.getNode(Opcode, DL, VT: ContainerDstVT, N1: Pg, N2: Val,
28046 N3: DAG.getUNDEF(VT: ContainerDstVT));
28047 return convertFromScalableVector(DAG, VT, V: Val);
28048 } else {
28049 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
28050 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT: SrcVT);
28051
    // It is safe to use a larger-than-specified result since an fp_to_int
    // whose result doesn't fit into the destination is undefined.
28054 Val = convertToScalableVector(DAG, VT: ContainerSrcVT, V: Val);
28055 Val = DAG.getNode(Opcode, DL, VT: CvtVT, N1: Pg, N2: Val, N3: DAG.getUNDEF(VT: CvtVT));
28056 Val = convertFromScalableVector(DAG, VT: SrcVT.changeTypeToInteger(), V: Val);
28057
28058 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Val);
28059 }
28060}
28061
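// Lower a fixed-length VECTOR_SHUFFLE by materialising the shuffle mask as a
// constant index vector and permuting with the SVE TBL intrinsic (or, for two
// source operands, the SVE2 TBL2 intrinsic). When the exact SVE register size
// is unknown, indices referring to the second operand are adjusted at runtime
// by a multiple of the vector length.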
28062static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
28063 ArrayRef<int> ShuffleMask, EVT VT,
28064 EVT ContainerVT, SelectionDAG &DAG) {
28065 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
28066 SDLoc DL(Op);
28067 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
28068 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
28069 bool IsSingleOp =
28070 ShuffleVectorInst::isSingleSourceMask(Mask: ShuffleMask, NumSrcElts: ShuffleMask.size());
28071
28072 if (!Subtarget.isNeonAvailable() && !MinSVESize)
28073 MinSVESize = 128;
28074
  // Bail out on two-operand shuffles if SVE2 (needed for TBL2) is unavailable,
  // or (below) if not all index values can be represented.
28077 if (!IsSingleOp && !Subtarget.hasSVE2())
28078 return SDValue();
28079
28080 EVT VTOp1 = Op.getOperand(i: 0).getValueType();
28081 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
28082 unsigned IndexLen = MinSVESize / BitsPerElt;
28083 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
28084 uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
28085 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
28086 EVT MaskType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MaskEltType, NumElements: IndexLen);
28087 bool MinMaxEqual = (MinSVESize == MaxSVESize);
28088 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
28089 "Incorrectly legalised shuffle operation");
28090
28091 SmallVector<SDValue, 8> TBLMask;
28092 // If MinSVESize is not equal to MaxSVESize then we need to know which
  // TBL mask elements need adjustment.
28094 SmallVector<SDValue, 8> AddRuntimeVLMask;
28095
  // Bail out for 8-bit element types, because with a 2048-bit SVE register
  // size, 8 bits are only sufficient to index into the first source vector.
28098 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
28099 return SDValue();
28100
28101 for (int Index : ShuffleMask) {
    // Handle poison index values.
28103 if (Index < 0)
28104 Index = 0;
    // If the mask refers to elements in the second operand, then we have to
    // offset the index by the number of elements in a vector. If this number
    // is not known at compile time, we need to maintain a mask with 'VL' values
    // to add at runtime.
28109 if ((unsigned)Index >= ElementsPerVectorReg) {
28110 if (MinMaxEqual) {
28111 Index += IndexLen - ElementsPerVectorReg;
28112 } else {
28113 Index = Index - ElementsPerVectorReg;
28114 AddRuntimeVLMask.push_back(Elt: DAG.getConstant(Val: 1, DL, VT: MVT::i64));
28115 }
28116 } else if (!MinMaxEqual)
28117 AddRuntimeVLMask.push_back(Elt: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
    // For 8-bit elements and 1024-bit SVE registers, MaxOffset equals 255 and
    // might point to the last element in the second operand of the
    // shufflevector, so we reject this transform.
28121 if ((unsigned)Index >= MaxOffset)
28122 return SDValue();
28123 TBLMask.push_back(Elt: DAG.getConstant(Val: Index, DL, VT: MVT::i64));
28124 }
28125
  // Choosing an out-of-range index leads to the lane being zeroed, whereas a
  // zero index value would duplicate the first lane for these out-of-range
  // elements. Note that for i8 elements an out-of-range index could still be
  // a valid index with a 2048-bit vector register size.
28130 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
28131 TBLMask.push_back(Elt: DAG.getConstant(Val: (int)MaxOffset, DL, VT: MVT::i64));
28132 if (!MinMaxEqual)
28133 AddRuntimeVLMask.push_back(Elt: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
28134 }
28135
28136 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, VT: MaskType);
28137 SDValue VecMask =
28138 DAG.getBuildVector(VT: MaskType, DL, Ops: ArrayRef(TBLMask.data(), IndexLen));
28139 SDValue SVEMask = convertToScalableVector(DAG, VT: MaskContainerVT, V: VecMask);
28140
28141 SDValue Shuffle;
28142 if (IsSingleOp)
28143 Shuffle =
28144 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: ContainerVT,
28145 N1: DAG.getConstant(Val: Intrinsic::aarch64_sve_tbl, DL, VT: MVT::i32),
28146 N2: Op1, N3: SVEMask);
28147 else if (Subtarget.hasSVE2()) {
28148 if (!MinMaxEqual) {
28149 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
28150 SDValue VScale = (BitsPerElt == 64)
28151 ? DAG.getVScale(DL, VT: MVT::i64, MulImm: APInt(64, MinNumElts))
28152 : DAG.getVScale(DL, VT: MVT::i32, MulImm: APInt(32, MinNumElts));
28153 SDValue VecMask =
28154 DAG.getBuildVector(VT: MaskType, DL, Ops: ArrayRef(TBLMask.data(), IndexLen));
28155 SDValue MulByMask = DAG.getNode(
28156 Opcode: ISD::MUL, DL, VT: MaskType,
28157 N1: DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: MaskType, Operand: VScale),
28158 N2: DAG.getBuildVector(VT: MaskType, DL,
28159 Ops: ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
28160 SDValue UpdatedVecMask =
28161 DAG.getNode(Opcode: ISD::ADD, DL, VT: MaskType, N1: VecMask, N2: MulByMask);
28162 SVEMask = convertToScalableVector(
28163 DAG, VT: getContainerForFixedLengthVector(DAG, VT: MaskType), V: UpdatedVecMask);
28164 }
28165 Shuffle =
28166 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: ContainerVT,
28167 N1: DAG.getConstant(Val: Intrinsic::aarch64_sve_tbl2, DL, VT: MVT::i32),
28168 N2: Op1, N3: Op2, N4: SVEMask);
28169 }
28170 Shuffle = convertFromScalableVector(DAG, VT, V: Shuffle);
28171 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op.getValueType(), Operand: Shuffle);
28172}
28173
28174SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
28175 SDValue Op, SelectionDAG &DAG) const {
28176 EVT VT = Op.getValueType();
28177 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
28178
28179 auto *SVN = cast<ShuffleVectorSDNode>(Val: Op.getNode());
28180 auto ShuffleMask = SVN->getMask();
28181
28182 SDLoc DL(Op);
28183 SDValue Op1 = Op.getOperand(i: 0);
28184 SDValue Op2 = Op.getOperand(i: 1);
28185
28186 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28187 Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op1);
28188 Op2 = convertToScalableVector(DAG, VT: ContainerVT, V: Op2);
28189
28190 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
28191 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
28192 return MVT::i32;
28193 return ScalarTy;
28194 };
28195
28196 if (SVN->isSplat()) {
28197 unsigned Lane = std::max(a: 0, b: SVN->getSplatIndex());
28198 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
28199 SDValue SplatEl = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ScalarTy, N1: Op1,
28200 N2: DAG.getConstant(Val: Lane, DL, VT: MVT::i64));
28201 Op = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: ContainerVT, Operand: SplatEl);
28202 return convertFromScalableVector(DAG, VT, V: Op);
28203 }
28204
28205 bool ReverseEXT = false;
28206 unsigned Imm;
28207 if (isEXTMask(M: ShuffleMask, VT, ReverseEXT, Imm) &&
28208 Imm == VT.getVectorNumElements() - 1) {
28209 if (ReverseEXT)
28210 std::swap(a&: Op1, b&: Op2);
28211 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
28212 SDValue Scalar = DAG.getNode(
28213 Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ScalarTy, N1: Op1,
28214 N2: DAG.getConstant(Val: VT.getVectorNumElements() - 1, DL, VT: MVT::i64));
28215 Op = DAG.getNode(Opcode: AArch64ISD::INSR, DL, VT: ContainerVT, N1: Op2, N2: Scalar);
28216 return convertFromScalableVector(DAG, VT, V: Op);
28217 }
28218
28219 unsigned EltSize = VT.getScalarSizeInBits();
28220 for (unsigned LaneSize : {64U, 32U, 16U}) {
28221 if (isREVMask(M: ShuffleMask, EltSize, NumElts: VT.getVectorNumElements(), BlockSize: LaneSize)) {
28222 EVT NewVT =
28223 getPackedSVEVectorVT(VT: EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: LaneSize));
28224 unsigned RevOp;
28225 if (EltSize == 8)
28226 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
28227 else if (EltSize == 16)
28228 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
28229 else
28230 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
28231
28232 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: NewVT, Operand: Op1);
28233 Op = LowerToPredicatedOp(Op, DAG, NewOp: RevOp);
28234 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerVT, Operand: Op);
28235 return convertFromScalableVector(DAG, VT, V: Op);
28236 }
28237 }
28238
28239 if (Subtarget->hasSVE2p1() && EltSize == 64 &&
28240 isREVMask(M: ShuffleMask, EltSize, NumElts: VT.getVectorNumElements(), BlockSize: 128)) {
28241 if (!VT.isFloatingPoint())
28242 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::REVD_MERGE_PASSTHRU);
28243
28244 EVT NewVT = getPackedSVEVectorVT(VT: EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: 64));
28245 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: NewVT, Operand: Op1);
28246 Op = LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::REVD_MERGE_PASSTHRU);
28247 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerVT, Operand: Op);
28248 return convertFromScalableVector(DAG, VT, V: Op);
28249 }
28250
28251 unsigned WhichResult;
28252 if (isZIPMask(M: ShuffleMask, NumElts: VT.getVectorNumElements(), WhichResultOut&: WhichResult) &&
28253 WhichResult == 0)
28254 return convertFromScalableVector(
28255 DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: ContainerVT, N1: Op1, N2: Op2));
28256
28257 if (isTRNMask(M: ShuffleMask, NumElts: VT.getVectorNumElements(), WhichResult)) {
28258 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
28259 return convertFromScalableVector(
28260 DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op2));
28261 }
28262
28263 if (isZIP_v_undef_Mask(M: ShuffleMask, VT, WhichResult) && WhichResult == 0)
28264 return convertFromScalableVector(
28265 DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: ContainerVT, N1: Op1, N2: Op1));
28266
28267 if (isTRN_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
28268 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
28269 return convertFromScalableVector(
28270 DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op1));
28271 }
28272
  // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
28274 // represents the same logical operation as performed by a ZIP instruction. In
28275 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
28276 // equivalent to an AArch64 instruction. There's the extra component of
28277 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
  // only operated on 64/128-bit vector types that have a direct mapping to a
28279 // target register and so an exact mapping is implied.
28280 // However, when using SVE for fixed length vectors, most legal vector types
28281 // are actually sub-vectors of a larger SVE register. When mapping
28282 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
28283 // how the mask's indices translate. Specifically, when the mapping requires
28284 // an exact meaning for a specific vector index (e.g. Index X is the last
28285 // vector element in the register) then such mappings are often only safe when
  // the exact SVE register size is known. The main exception to this is when
28287 // indices are logically relative to the first element of either
28288 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
28289 // when converting from fixed-length to scalable vector types (i.e. the start
28290 // of a fixed length vector is always the start of a scalable vector).
28291 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
28292 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
28293 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
28294 if (ShuffleVectorInst::isReverseMask(Mask: ShuffleMask, NumSrcElts: ShuffleMask.size()) &&
28295 Op2.isUndef()) {
28296 Op = DAG.getNode(Opcode: ISD::VECTOR_REVERSE, DL, VT: ContainerVT, Operand: Op1);
28297 return convertFromScalableVector(DAG, VT, V: Op);
28298 }
28299
28300 if (isZIPMask(M: ShuffleMask, NumElts: VT.getVectorNumElements(), WhichResultOut&: WhichResult) &&
28301 WhichResult != 0)
28302 return convertFromScalableVector(
28303 DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT: ContainerVT, N1: Op1, N2: Op2));
28304
28305 if (isUZPMask(M: ShuffleMask, NumElts: VT.getVectorNumElements(), WhichResultOut&: WhichResult)) {
28306 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
28307 return convertFromScalableVector(
28308 DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op2));
28309 }
28310
28311 if (isZIP_v_undef_Mask(M: ShuffleMask, VT, WhichResult) && WhichResult != 0)
28312 return convertFromScalableVector(
28313 DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT: ContainerVT, N1: Op1, N2: Op1));
28314
28315 if (isUZP_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
28316 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
28317 return convertFromScalableVector(
28318 DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op1));
28319 }
28320 }
28321
  // Avoid producing a TBL instruction if we don't know the minimal SVE register
  // size, unless NEON is unavailable, in which case we can assume the minimal
  // SVE register size is 128 bits.
28325 if (MinSVESize || !Subtarget->isNeonAvailable())
28326 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
28327 DAG);
28328
28329 return SDValue();
28330}
28331
28332SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
28333 SelectionDAG &DAG) const {
28334 SDLoc DL(Op);
28335 EVT InVT = Op.getValueType();
28336
28337 assert(VT.isScalableVector() && isTypeLegal(VT) &&
28338 InVT.isScalableVector() && isTypeLegal(InVT) &&
28339 "Only expect to cast between legal scalable vector types!");
28340 assert(VT.getVectorElementType() != MVT::i1 &&
28341 InVT.getVectorElementType() != MVT::i1 &&
28342 "For predicate bitcasts, use getSVEPredicateBitCast");
28343
28344 if (InVT == VT)
28345 return Op;
28346
28347 EVT PackedVT = getPackedSVEVectorVT(VT: VT.getVectorElementType());
28348 EVT PackedInVT = getPackedSVEVectorVT(VT: InVT.getVectorElementType());
28349
28350 // Safe bitcasting between unpacked vector types of different element counts
28351 // is currently unsupported because the following is missing the necessary
28352 // work to ensure the result's elements live where they're supposed to within
28353 // an SVE register.
28354 // 01234567
28355 // e.g. nxv2i32 = XX??XX??
28356 // nxv4f16 = X?X?X?X?
28357 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
28358 VT == PackedVT || InVT == PackedInVT) &&
28359 "Unexpected bitcast!");
28360
28361 // Pack input if required.
28362 if (InVT != PackedInVT)
28363 Op = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: PackedInVT, Operand: Op);
28364
28365 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: PackedVT, Operand: Op);
28366
28367 // Unpack result if required.
28368 if (VT != PackedVT)
28369 Op = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT, Operand: Op);
28370
28371 return Op;
28372}
28373
28374bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
28375 SDValue N) const {
28376 return ::isAllActivePredicate(DAG, N);
28377}
28378
28379EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
28380 return ::getPromotedVTForPredicate(VT);
28381}
28382
28383bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
28384 SDValue Op, const APInt &OriginalDemandedBits,
28385 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
28386 unsigned Depth) const {
28387
28388 unsigned Opc = Op.getOpcode();
28389 switch (Opc) {
28390 case AArch64ISD::VSHL: {
28391 // Match (VSHL (VLSHR Val X) X)
28392 SDValue ShiftL = Op;
28393 SDValue ShiftR = Op->getOperand(Num: 0);
28394 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
28395 return false;
28396
28397 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
28398 return false;
28399
28400 unsigned ShiftLBits = ShiftL->getConstantOperandVal(Num: 1);
28401 unsigned ShiftRBits = ShiftR->getConstantOperandVal(Num: 1);
28402
28403 // Other cases can be handled as well, but this is not
28404 // implemented.
28405 if (ShiftRBits != ShiftLBits)
28406 return false;
28407
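    // (VSHL (VLSHR Val N) N) zeroes the low N bits of Val and leaves the rest
    // unchanged. For example, with 32-bit lanes and N == 8, only the low 8 bits
    // can differ from Val, so if none of them are demanded the shift pair is a
    // no-op and Val can be used directly.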
28408 unsigned ScalarSize = Op.getScalarValueSizeInBits();
28409 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
28410
28411 APInt ZeroBits = APInt::getLowBitsSet(numBits: ScalarSize, loBitsSet: ShiftLBits);
28412 APInt UnusedBits = ~OriginalDemandedBits;
28413
28414 if ((ZeroBits & UnusedBits) != ZeroBits)
28415 return false;
28416
28417 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
28418 // used - simplify to just Val.
28419 return TLO.CombineTo(O: Op, N: ShiftR->getOperand(Num: 0));
28420 }
28421 case AArch64ISD::BICi: {
    // Fold BICi if all destination bits are already known to be zeroed.
28423 SDValue Op0 = Op.getOperand(i: 0);
28424 KnownBits KnownOp0 =
28425 TLO.DAG.computeKnownBits(Op: Op0, DemandedElts: OriginalDemandedElts, Depth: Depth + 1);
28426 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
28427 uint64_t BitsToClear = Op->getConstantOperandVal(Num: 1)
28428 << Op->getConstantOperandVal(Num: 2);
28429 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
28430 if (APInt(Known.getBitWidth(), BitsToClear)
28431 .isSubsetOf(RHS: AlreadyZeroedBitsToClear))
28432 return TLO.CombineTo(O: Op, N: Op0);
28433
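    // Otherwise the result is Op0 with the selected bits cleared, so those bits
    // are additionally known to be zero.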
28434 Known = KnownOp0 &
28435 KnownBits::makeConstant(C: APInt(Known.getBitWidth(), ~BitsToClear));
28436
28437 return false;
28438 }
28439 case ISD::INTRINSIC_WO_CHAIN: {
28440 if (auto ElementSize = IsSVECntIntrinsic(S: Op)) {
28441 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
28442 if (!MaxSVEVectorSizeInBits)
28443 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
28444 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
28445 // The SVE count intrinsics don't support the multiplier immediate so we
28446 // don't have to account for that here. The value returned may be slightly
28447 // over the true required bits, as this is based on the "ALL" pattern. The
28448 // other patterns are also exposed by these intrinsics, but they all
28449 // return a value that's strictly less than "ALL".
28450 unsigned RequiredBits = llvm::bit_width(Value: MaxElements);
28451 unsigned BitWidth = Known.Zero.getBitWidth();
28452 if (RequiredBits < BitWidth)
28453 Known.Zero.setHighBits(BitWidth - RequiredBits);
28454 return false;
28455 }
28456 }
28457 }
28458
28459 return TargetLowering::SimplifyDemandedBitsForTargetNode(
28460 Op, DemandedBits: OriginalDemandedBits, DemandedElts: OriginalDemandedElts, Known, TLO, Depth);
28461}
28462
28463bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
28464 return Op.getOpcode() == AArch64ISD::DUP ||
28465 Op.getOpcode() == AArch64ISD::MOVI ||
28466 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
28467 Op.getOperand(i: 0).getOpcode() == AArch64ISD::DUP) ||
28468 TargetLowering::isTargetCanonicalConstantNode(Op);
28469}
28470
28471bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
28472 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
28473 Subtarget->hasComplxNum();
28474}
28475
28476bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
28477 ComplexDeinterleavingOperation Operation, Type *Ty) const {
28478 auto *VTy = dyn_cast<VectorType>(Val: Ty);
28479 if (!VTy)
28480 return false;
28481
28482 // If the vector is scalable, SVE is enabled, implying support for complex
  // numbers. Otherwise, we need to ensure complex number support is available.
28484 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
28485 return false;
28486
28487 auto *ScalarTy = VTy->getScalarType();
28488 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
28489
28490 // We can only process vectors that have a bit size of 128 or higher (with an
28491 // additional 64 bits for Neon). Additionally, these vectors must have a
  // power-of-2 size, as we later split them into the smallest supported size
  // and merge them back together after applying the complex operation.
28494 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
28495 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
28496 !llvm::isPowerOf2_32(Value: VTyWidth))
28497 return false;
28498
28499 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
28500 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
28501 return 8 <= ScalarWidth && ScalarWidth <= 64;
28502 }
28503
28504 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
28505 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
28506}
28507
28508Value *AArch64TargetLowering::createComplexDeinterleavingIR(
28509 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
28510 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
28511 Value *Accumulator) const {
28512 VectorType *Ty = cast<VectorType>(Val: InputA->getType());
28513 bool IsScalable = Ty->isScalableTy();
28514 bool IsInt = Ty->getElementType()->isIntegerTy();
28515
28516 unsigned TyWidth =
28517 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
28518
28519 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
28520 "Vector type must be either 64 or a power of 2 that is at least 128");
28521
28522 if (TyWidth > 128) {
28523 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
28524 auto *HalfTy = VectorType::getHalfElementsVectorType(VTy: Ty);
28525 auto *LowerSplitA = B.CreateExtractVector(DstType: HalfTy, SrcVec: InputA, Idx: B.getInt64(C: 0));
28526 auto *LowerSplitB = B.CreateExtractVector(DstType: HalfTy, SrcVec: InputB, Idx: B.getInt64(C: 0));
28527 auto *UpperSplitA =
28528 B.CreateExtractVector(DstType: HalfTy, SrcVec: InputA, Idx: B.getInt64(C: Stride));
28529 auto *UpperSplitB =
28530 B.CreateExtractVector(DstType: HalfTy, SrcVec: InputB, Idx: B.getInt64(C: Stride));
28531 Value *LowerSplitAcc = nullptr;
28532 Value *UpperSplitAcc = nullptr;
28533 if (Accumulator) {
28534 LowerSplitAcc = B.CreateExtractVector(DstType: HalfTy, SrcVec: Accumulator, Idx: B.getInt64(C: 0));
28535 UpperSplitAcc =
28536 B.CreateExtractVector(DstType: HalfTy, SrcVec: Accumulator, Idx: B.getInt64(C: Stride));
28537 }
28538 auto *LowerSplitInt = createComplexDeinterleavingIR(
28539 B, OperationType, Rotation, InputA: LowerSplitA, InputB: LowerSplitB, Accumulator: LowerSplitAcc);
28540 auto *UpperSplitInt = createComplexDeinterleavingIR(
28541 B, OperationType, Rotation, InputA: UpperSplitA, InputB: UpperSplitB, Accumulator: UpperSplitAcc);
28542
28543 auto *Result = B.CreateInsertVector(DstType: Ty, SrcVec: PoisonValue::get(T: Ty), SubVec: LowerSplitInt,
28544 Idx: B.getInt64(C: 0));
28545 return B.CreateInsertVector(DstType: Ty, SrcVec: Result, SubVec: UpperSplitInt, Idx: B.getInt64(C: Stride));
28546 }
28547
28548 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
28549 if (Accumulator == nullptr)
28550 Accumulator = Constant::getNullValue(Ty);
28551
28552 if (IsScalable) {
28553 if (IsInt)
28554 return B.CreateIntrinsic(
28555 ID: Intrinsic::aarch64_sve_cmla_x, Types: Ty,
28556 Args: {Accumulator, InputA, InputB, B.getInt32(C: (int)Rotation * 90)});
28557
28558 auto *Mask = B.getAllOnesMask(NumElts: Ty->getElementCount());
28559 return B.CreateIntrinsic(
28560 ID: Intrinsic::aarch64_sve_fcmla, Types: Ty,
28561 Args: {Mask, Accumulator, InputA, InputB, B.getInt32(C: (int)Rotation * 90)});
28562 }
28563
28564 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
28565 Intrinsic::aarch64_neon_vcmla_rot90,
28566 Intrinsic::aarch64_neon_vcmla_rot180,
28567 Intrinsic::aarch64_neon_vcmla_rot270};
28568
28570 return B.CreateIntrinsic(ID: IdMap[(int)Rotation], Types: Ty,
28571 Args: {Accumulator, InputA, InputB});
28572 }
28573
28574 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
28575 if (IsScalable) {
28576 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
28577 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
28578 if (IsInt)
28579 return B.CreateIntrinsic(
28580 ID: Intrinsic::aarch64_sve_cadd_x, Types: Ty,
28581 Args: {InputA, InputB, B.getInt32(C: (int)Rotation * 90)});
28582
28583 auto *Mask = B.getAllOnesMask(NumElts: Ty->getElementCount());
28584 return B.CreateIntrinsic(
28585 ID: Intrinsic::aarch64_sve_fcadd, Types: Ty,
28586 Args: {Mask, InputA, InputB, B.getInt32(C: (int)Rotation * 90)});
28587 }
28588 return nullptr;
28589 }
28590
28591 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
28592 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
28593 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
28594 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
28595 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
28596
28597 if (IntId == Intrinsic::not_intrinsic)
28598 return nullptr;
28599
28600 return B.CreateIntrinsic(ID: IntId, Types: Ty, Args: {InputA, InputB});
28601 }
28602
28603 return nullptr;
28604}
28605
28606bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
28607 unsigned Opc = N->getOpcode();
28608 if (ISD::isExtOpcode(Opcode: Opc)) {
28609 if (any_of(Range: N->uses(),
28610 P: [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
28611 return false;
28612 }
28613 return true;
28614}
28615
28616unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
28617 return Subtarget->getMinimumJumpTableEntries();
28618}
28619
28620MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
28621 CallingConv::ID CC,
28622 EVT VT) const {
28623 bool NonUnitFixedLengthVector =
28624 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
28625 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
28626 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
28627
28628 EVT VT1;
28629 MVT RegisterVT;
28630 unsigned NumIntermediates;
28631 getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT&: VT1, NumIntermediates,
28632 RegisterVT);
28633 return RegisterVT;
28634}
28635
28636unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
28637 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
28638 bool NonUnitFixedLengthVector =
28639 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
28640 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
28641 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
28642
28643 EVT VT1;
28644 MVT VT2;
28645 unsigned NumIntermediates;
28646 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT&: VT1,
28647 NumIntermediates, RegisterVT&: VT2);
28648}
28649
28650unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
28651 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
28652 unsigned &NumIntermediates, MVT &RegisterVT) const {
28653 int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
28654 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
28655 if (!RegisterVT.isFixedLengthVector() ||
28656 RegisterVT.getFixedSizeInBits() <= 128)
28657 return NumRegs;
28658
28659 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
28660 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
28661 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
28662
28663 // A size mismatch here implies either type promotion or widening and would
  // have resulted in scalarisation if larger vectors had not been available.
28665 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
28666 EVT EltTy = VT.getVectorElementType();
28667 EVT NewVT = EVT::getVectorVT(Context, VT: EltTy, EC: ElementCount::getFixed(MinVal: 1));
28668 if (!isTypeLegal(VT: NewVT))
28669 NewVT = EltTy;
28670
28671 IntermediateVT = NewVT;
28672 NumIntermediates = VT.getVectorNumElements();
28673 RegisterVT = getRegisterType(Context, VT: NewVT);
28674 return NumIntermediates;
28675 }
28676
  // SVE VLS support does not introduce a new ABI so we should use NEON-sized
28678 // types for vector arguments and returns.
28679
28680 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
28681 NumIntermediates *= NumSubRegs;
28682 NumRegs *= NumSubRegs;
28683
28684 switch (RegisterVT.getVectorElementType().SimpleTy) {
28685 default:
28686 llvm_unreachable("unexpected element type for vector");
28687 case MVT::i8:
28688 IntermediateVT = RegisterVT = MVT::v16i8;
28689 break;
28690 case MVT::i16:
28691 IntermediateVT = RegisterVT = MVT::v8i16;
28692 break;
28693 case MVT::i32:
28694 IntermediateVT = RegisterVT = MVT::v4i32;
28695 break;
28696 case MVT::i64:
28697 IntermediateVT = RegisterVT = MVT::v2i64;
28698 break;
28699 case MVT::f16:
28700 IntermediateVT = RegisterVT = MVT::v8f16;
28701 break;
28702 case MVT::f32:
28703 IntermediateVT = RegisterVT = MVT::v4f32;
28704 break;
28705 case MVT::f64:
28706 IntermediateVT = RegisterVT = MVT::v2f64;
28707 break;
28708 case MVT::bf16:
28709 IntermediateVT = RegisterVT = MVT::v8bf16;
28710 break;
28711 }
28712
28713 return NumRegs;
28714}
28715
28716bool AArch64TargetLowering::hasInlineStackProbe(
28717 const MachineFunction &MF) const {
28718 return !Subtarget->isTargetWindows() &&
28719 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
28720}
28721
28722#ifndef NDEBUG
28723void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const {
28724 switch (N->getOpcode()) {
28725 default:
28726 break;
28727 case AArch64ISD::SUNPKLO:
28728 case AArch64ISD::SUNPKHI:
28729 case AArch64ISD::UUNPKLO:
28730 case AArch64ISD::UUNPKHI: {
28731 assert(N->getNumValues() == 1 && "Expected one result!");
28732 assert(N->getNumOperands() == 1 && "Expected one operand!");
28733 EVT VT = N->getValueType(0);
28734 EVT OpVT = N->getOperand(0).getValueType();
28735 assert(OpVT.isVector() && VT.isVector() && OpVT.isInteger() &&
28736 VT.isInteger() && "Expected integer vectors!");
28737 assert(OpVT.getSizeInBits() == VT.getSizeInBits() &&
28738 "Expected vectors of equal size!");
28739 // TODO: Enable assert once bogus creations have been fixed.
28740 // assert(OpVT.getVectorElementCount() == VT.getVectorElementCount()*2 &&
28741 // "Expected result vector with half the lanes of its input!");
28742 break;
28743 }
28744 case AArch64ISD::TRN1:
28745 case AArch64ISD::TRN2:
28746 case AArch64ISD::UZP1:
28747 case AArch64ISD::UZP2:
28748 case AArch64ISD::ZIP1:
28749 case AArch64ISD::ZIP2: {
28750 assert(N->getNumValues() == 1 && "Expected one result!");
28751 assert(N->getNumOperands() == 2 && "Expected two operands!");
28752 EVT VT = N->getValueType(0);
28753 EVT Op0VT = N->getOperand(0).getValueType();
28754 EVT Op1VT = N->getOperand(1).getValueType();
28755 assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
28756 "Expected vectors!");
28757 // TODO: Enable assert once bogus creations have been fixed.
28758 // assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!");
28759 break;
28760 }
28761 }
28762}
28763#endif
28764