1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "AArch64TargetTransformInfo.h"
10#include "AArch64ExpandImm.h"
11#include "AArch64PerfectShuffle.h"
12#include "MCTargetDesc/AArch64AddressingModes.h"
13#include "Utils/AArch64SMEAttributes.h"
14#include "llvm/ADT/DenseMap.h"
15#include "llvm/Analysis/LoopInfo.h"
16#include "llvm/Analysis/TargetTransformInfo.h"
17#include "llvm/CodeGen/BasicTTIImpl.h"
18#include "llvm/CodeGen/CostTable.h"
19#include "llvm/CodeGen/TargetLowering.h"
20#include "llvm/IR/DerivedTypes.h"
21#include "llvm/IR/IntrinsicInst.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
24#include "llvm/IR/PatternMatch.h"
25#include "llvm/Support/Debug.h"
26#include "llvm/TargetParser/AArch64TargetParser.h"
27#include "llvm/Transforms/InstCombine/InstCombiner.h"
28#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
29#include <algorithm>
30#include <optional>
31using namespace llvm;
32using namespace llvm::PatternMatch;
33
34#define DEBUG_TYPE "aarch64tti"
35
36static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
37 cl::init(Val: true), cl::Hidden);
38
39static cl::opt<bool> SVEPreferFixedOverScalableIfEqualCost(
40 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
41
42static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(Val: 10),
43 cl::Hidden);
44
45static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
46 cl::init(Val: 10), cl::Hidden);
47
48static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
49 cl::init(Val: 15), cl::Hidden);
50
51static cl::opt<unsigned>
52 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(Val: 10),
53 cl::Hidden);
54
55static cl::opt<unsigned> CallPenaltyChangeSM(
56 "call-penalty-sm-change", cl::init(Val: 5), cl::Hidden,
57 cl::desc(
58 "Penalty of calling a function that requires a change to PSTATE.SM"));
59
60static cl::opt<unsigned> InlineCallPenaltyChangeSM(
61 "inline-call-penalty-sm-change", cl::init(Val: 10), cl::Hidden,
62 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
63
64static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
65 cl::init(Val: true), cl::Hidden);
66
67static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
68 cl::init(Val: true), cl::Hidden);
69
70// A complete guess as to a reasonable cost.
71static cl::opt<unsigned>
72 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(Val: 8), cl::Hidden,
73 cl::desc("The cost of a histcnt instruction"));
74
75static cl::opt<unsigned> DMBLookaheadThreshold(
76 "dmb-lookahead-threshold", cl::init(Val: 10), cl::Hidden,
77 cl::desc("The number of instructions to search for a redundant dmb"));
78
79namespace {
80class TailFoldingOption {
  // These bitfields will only ever be set to something non-zero in operator=,
  // when setting the -sve-tail-folding option. This option should always be of
  // the form (disabled|all|simple|default)[+(Flag1|Flag2|etc)], where
  // InitialBits is one of (disabled|all|simple). EnableBits represents the
  // additional flags we're enabling, and DisableBits the flags we're
  // disabling. The default flag is tracked in the variable NeedsDefault, since
  // at the time of setting the option we may not know what the default value
  // for the CPU is.
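  // For example, "-sve-tail-folding=default+noreverse" leaves NeedsDefault set
  // and records Reverse in DisableBits, so getBits() below returns the CPU's
  // default flags with the reverse-predicate flag cleared.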
89 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
90 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
91 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
92
93 // This value needs to be initialised to true in case the user does not
94 // explicitly set the -sve-tail-folding option.
95 bool NeedsDefault = true;
96
97 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
98
99 void setNeedsDefault(bool V) { NeedsDefault = V; }
100
101 void setEnableBit(TailFoldingOpts Bit) {
102 EnableBits |= Bit;
103 DisableBits &= ~Bit;
104 }
105
106 void setDisableBit(TailFoldingOpts Bit) {
107 EnableBits &= ~Bit;
108 DisableBits |= Bit;
109 }
110
111 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
112 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
113
114 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
115 "Initial bits should only include one of "
116 "(disabled|all|simple|default)");
117 Bits = NeedsDefault ? DefaultBits : InitialBits;
118 Bits |= EnableBits;
119 Bits &= ~DisableBits;
120
121 return Bits;
122 }
123
124 void reportError(std::string Opt) {
125 errs() << "invalid argument '" << Opt
126 << "' to -sve-tail-folding=; the option should be of the form\n"
127 " (disabled|all|default|simple)[+(reductions|recurrences"
128 "|reverse|noreductions|norecurrences|noreverse)]\n";
129 report_fatal_error(reason: "Unrecognised tail-folding option");
130 }
131
132public:
133
134 void operator=(const std::string &Val) {
135 // If the user explicitly sets -sve-tail-folding= then treat as an error.
136 if (Val.empty()) {
137 reportError(Opt: "");
138 return;
139 }
140
141 // Since the user is explicitly setting the option we don't automatically
142 // need the default unless they require it.
143 setNeedsDefault(false);
144
145 SmallVector<StringRef, 4> TailFoldTypes;
146 StringRef(Val).split(A&: TailFoldTypes, Separator: '+', MaxSplit: -1, KeepEmpty: false);
147
148 unsigned StartIdx = 1;
149 if (TailFoldTypes[0] == "disabled")
150 setInitialBits(TailFoldingOpts::Disabled);
151 else if (TailFoldTypes[0] == "all")
152 setInitialBits(TailFoldingOpts::All);
153 else if (TailFoldTypes[0] == "default")
154 setNeedsDefault(true);
155 else if (TailFoldTypes[0] == "simple")
156 setInitialBits(TailFoldingOpts::Simple);
157 else {
158 StartIdx = 0;
159 setInitialBits(TailFoldingOpts::Disabled);
160 }
161
162 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
163 if (TailFoldTypes[I] == "reductions")
164 setEnableBit(TailFoldingOpts::Reductions);
165 else if (TailFoldTypes[I] == "recurrences")
166 setEnableBit(TailFoldingOpts::Recurrences);
167 else if (TailFoldTypes[I] == "reverse")
168 setEnableBit(TailFoldingOpts::Reverse);
169 else if (TailFoldTypes[I] == "noreductions")
170 setDisableBit(TailFoldingOpts::Reductions);
171 else if (TailFoldTypes[I] == "norecurrences")
172 setDisableBit(TailFoldingOpts::Recurrences);
173 else if (TailFoldTypes[I] == "noreverse")
174 setDisableBit(TailFoldingOpts::Reverse);
175 else
176 reportError(Opt: Val);
177 }
178 }
179
180 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
181 return (getBits(DefaultBits) & Required) == Required;
182 }
183};
184} // namespace
185
186TailFoldingOption TailFoldingOptionLoc;
187
188static cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
189 "sve-tail-folding",
190 cl::desc(
191 "Control the use of vectorisation using tail-folding for SVE where the"
192 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
193 "\ndisabled (Initial) No loop types will vectorize using "
194 "tail-folding"
195 "\ndefault (Initial) Uses the default tail-folding settings for "
196 "the target CPU"
197 "\nall (Initial) All legal loop types will vectorize using "
198 "tail-folding"
199 "\nsimple (Initial) Use tail-folding for simple loops (not "
200 "reductions or recurrences)"
201 "\nreductions Use tail-folding for loops containing reductions"
202 "\nnoreductions Inverse of above"
203 "\nrecurrences Use tail-folding for loops containing fixed order "
204 "recurrences"
205 "\nnorecurrences Inverse of above"
206 "\nreverse Use tail-folding for loops requiring reversed "
207 "predicates"
208 "\nnoreverse Inverse of above"),
209 cl::location(L&: TailFoldingOptionLoc));
210
211// Experimental option that will only be fully functional when the
212// code-generator is changed to use SVE instead of NEON for all fixed-width
213// operations.
214static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
215 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(Val: false), cl::Hidden);
216
217// Experimental option that will only be fully functional when the cost-model
218// and code-generator have been changed to avoid using scalable vector
219// instructions that are not legal in streaming SVE mode.
220static cl::opt<bool> EnableScalableAutovecInStreamingMode(
221 "enable-scalable-autovec-in-streaming-mode", cl::init(Val: false), cl::Hidden);
222
223static bool isSMEABIRoutineCall(const CallInst &CI) {
224 const auto *F = CI.getCalledFunction();
225 return F && StringSwitch<bool>(F->getName())
226 .Case(S: "__arm_sme_state", Value: true)
227 .Case(S: "__arm_tpidr2_save", Value: true)
228 .Case(S: "__arm_tpidr2_restore", Value: true)
229 .Case(S: "__arm_za_disable", Value: true)
230 .Default(Value: false);
231}
232
233/// Returns true if the function has explicit operations that can only be
234/// lowered using incompatible instructions for the selected mode. This also
235/// returns true if the function F may use or modify ZA state.
236static bool hasPossibleIncompatibleOps(const Function *F) {
237 for (const BasicBlock &BB : *F) {
238 for (const Instruction &I : BB) {
239 // Be conservative for now and assume that any call to inline asm or to
      // intrinsics could result in non-streaming ops (e.g. calls to
241 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
242 // all native LLVM instructions can be lowered to compatible instructions.
243 if (isa<CallInst>(Val: I) && !I.isDebugOrPseudoInst() &&
244 (cast<CallInst>(Val: I).isInlineAsm() || isa<IntrinsicInst>(Val: I) ||
245 isSMEABIRoutineCall(CI: cast<CallInst>(Val: I))))
246 return true;
247 }
248 }
249 return false;
250}
251
252uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const {
253 StringRef AttributeStr =
254 isMultiversionedFunction(F) ? "fmv-features" : "target-features";
255 StringRef FeatureStr = F.getFnAttribute(Kind: AttributeStr).getValueAsString();
256 SmallVector<StringRef, 8> Features;
257 FeatureStr.split(A&: Features, Separator: ",");
258 return AArch64::getFMVPriority(Features);
259}
260
261bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const {
262 return F.hasFnAttribute(Kind: "fmv-features");
263}
264
265const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
266 AArch64::FeatureExecuteOnly,
267};
268
269bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
270 const Function *Callee) const {
271 SMECallAttrs CallAttrs(*Caller, *Callee);
272
273 // When inlining, we should consider the body of the function, not the
274 // interface.
275 if (CallAttrs.callee().hasStreamingBody()) {
276 CallAttrs.callee().set(M: SMEAttrs::SM_Compatible, Enable: false);
277 CallAttrs.callee().set(M: SMEAttrs::SM_Enabled, Enable: true);
278 }
279
280 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
281 return false;
282
283 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
284 CallAttrs.requiresPreservingZT0() ||
285 CallAttrs.requiresPreservingAllZAState()) {
286 if (hasPossibleIncompatibleOps(F: Callee))
287 return false;
288 }
289
290 const TargetMachine &TM = getTLI()->getTargetMachine();
291 const FeatureBitset &CallerBits =
292 TM.getSubtargetImpl(*Caller)->getFeatureBits();
293 const FeatureBitset &CalleeBits =
294 TM.getSubtargetImpl(*Callee)->getFeatureBits();
295 // Adjust the feature bitsets by inverting some of the bits. This is needed
296 // for target features that represent restrictions rather than capabilities,
297 // for example a "+execute-only" callee can be inlined into a caller without
298 // "+execute-only", but not vice versa.
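  // After the flip, a callee without "+execute-only" appears to require a bit
  // that an execute-only caller no longer has, so the subset check below
  // rejects that direction while still allowing the opposite one.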
299 FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
300 FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
301
302 return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
303}
304
305bool AArch64TTIImpl::areTypesABICompatible(
306 const Function *Caller, const Function *Callee,
307 const ArrayRef<Type *> &Types) const {
308 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
309 return false;
310
311 // We need to ensure that argument promotion does not attempt to promote
312 // pointers to fixed-length vector types larger than 128 bits like
313 // <8 x float> (and pointers to aggregate types which have such fixed-length
314 // vector type members) into the values of the pointees. Such vector types
315 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
316 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
317 // types can be safely treated as 128-bit NEON types and they cannot be
318 // distinguished in IR.
319 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Range: Types, P: [](Type *Ty) {
320 auto FVTy = dyn_cast<FixedVectorType>(Val: Ty);
321 return FVTy &&
322 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
323 }))
324 return false;
325
326 return true;
327}
328
329unsigned
330AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
331 unsigned DefaultCallPenalty) const {
332 // This function calculates a penalty for executing Call in F.
333 //
334 // There are two ways this function can be called:
335 // (1) F:
336 // call from F -> G (the call here is Call)
337 //
338 // For (1), Call.getCaller() == F, so it will always return a high cost if
339 // a streaming-mode change is required (thus promoting the need to inline the
340 // function)
341 //
342 // (2) F:
343 // call from F -> G (the call here is not Call)
344 // G:
345 // call from G -> H (the call here is Call)
346 //
347 // For (2), if after inlining the body of G into F the call to H requires a
348 // streaming-mode change, and the call to G from F would also require a
349 // streaming-mode change, then there is benefit to do the streaming-mode
350 // change only once and avoid inlining of G into F.
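  // For example, with the default option values above, case (1) scales
  // DefaultCallPenalty by call-penalty-sm-change (5) and case (2) scales it by
  // inline-call-penalty-sm-change (10).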
351
352 SMEAttrs FAttrs(*F);
353 SMECallAttrs CallAttrs(Call);
354
355 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
356 if (F == Call.getCaller()) // (1)
357 return CallPenaltyChangeSM * DefaultCallPenalty;
358 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
359 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
360 }
361
362 return DefaultCallPenalty;
363}
364
365bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
366 TargetTransformInfo::RegisterKind K) const {
367 assert(K != TargetTransformInfo::RGK_Scalar);
368 return (K == TargetTransformInfo::RGK_FixedWidthVector &&
369 ST->isNeonAvailable());
370}
371
372/// Calculate the cost of materializing a 64-bit value. This helper
373/// method might only calculate a fraction of a larger immediate. Therefore it
374/// is valid to return a cost of ZERO.
375InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) const {
376 // Check if the immediate can be encoded within an instruction.
377 if (Val == 0 || AArch64_AM::isLogicalImmediate(imm: Val, regSize: 64))
378 return 0;
379
380 if (Val < 0)
381 Val = ~Val;
382
383 // Calculate how many moves we will need to materialize this constant.
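  // For example, a value such as 0x12345678 needs a MOVZ plus one MOVK (cost
  // 2), whereas a 64-bit logical immediate such as 0xFF00FF00FF00FF00 was
  // already matched above and costs 0.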
384 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
385 AArch64_IMM::expandMOVImm(Imm: Val, BitSize: 64, Insn);
386 return Insn.size();
387}
388
389/// Calculate the cost of materializing the given constant.
390InstructionCost
391AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
392 TTI::TargetCostKind CostKind) const {
393 assert(Ty->isIntegerTy());
394
395 unsigned BitSize = Ty->getPrimitiveSizeInBits();
396 if (BitSize == 0)
397 return ~0U;
398
399 // Sign-extend all constants to a multiple of 64-bit.
400 APInt ImmVal = Imm;
401 if (BitSize & 0x3f)
402 ImmVal = Imm.sext(width: (BitSize + 63) & ~0x3fU);
403
404 // Split the constant into 64-bit chunks and calculate the cost for each
405 // chunk.
406 InstructionCost Cost = 0;
407 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
408 APInt Tmp = ImmVal.ashr(ShiftAmt: ShiftVal).sextOrTrunc(width: 64);
409 int64_t Val = Tmp.getSExtValue();
410 Cost += getIntImmCost(Val);
411 }
  // We need at least one instruction to materialize the constant.
413 return std::max<InstructionCost>(a: 1, b: Cost);
414}
415
416InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
417 const APInt &Imm, Type *Ty,
418 TTI::TargetCostKind CostKind,
419 Instruction *Inst) const {
420 assert(Ty->isIntegerTy());
421
422 unsigned BitSize = Ty->getPrimitiveSizeInBits();
423 // There is no cost model for constants with a bit size of 0. Return TCC_Free
424 // here, so that constant hoisting will ignore this constant.
425 if (BitSize == 0)
426 return TTI::TCC_Free;
427
428 unsigned ImmIdx = ~0U;
429 switch (Opcode) {
430 default:
431 return TTI::TCC_Free;
432 case Instruction::GetElementPtr:
433 // Always hoist the base address of a GetElementPtr.
434 if (Idx == 0)
435 return 2 * TTI::TCC_Basic;
436 return TTI::TCC_Free;
437 case Instruction::Store:
438 ImmIdx = 0;
439 break;
440 case Instruction::Add:
441 case Instruction::Sub:
442 case Instruction::Mul:
443 case Instruction::UDiv:
444 case Instruction::SDiv:
445 case Instruction::URem:
446 case Instruction::SRem:
447 case Instruction::And:
448 case Instruction::Or:
449 case Instruction::Xor:
450 case Instruction::ICmp:
451 ImmIdx = 1;
452 break;
453 // Always return TCC_Free for the shift value of a shift instruction.
454 case Instruction::Shl:
455 case Instruction::LShr:
456 case Instruction::AShr:
457 if (Idx == 1)
458 return TTI::TCC_Free;
459 break;
460 case Instruction::Trunc:
461 case Instruction::ZExt:
462 case Instruction::SExt:
463 case Instruction::IntToPtr:
464 case Instruction::PtrToInt:
465 case Instruction::BitCast:
466 case Instruction::PHI:
467 case Instruction::Call:
468 case Instruction::Select:
469 case Instruction::Ret:
470 case Instruction::Load:
471 break;
472 }
473
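  // If the immediate is in the operand slot that the instruction can typically
  // encode, report cheap-to-materialize constants as free so that constant
  // hoisting leaves them in place; only expensive immediates report their full
  // materialization cost.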
474 if (Idx == ImmIdx) {
475 int NumConstants = (BitSize + 63) / 64;
476 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
477 return (Cost <= NumConstants * TTI::TCC_Basic)
478 ? static_cast<int>(TTI::TCC_Free)
479 : Cost;
480 }
481 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
482}
483
484InstructionCost
485AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
486 const APInt &Imm, Type *Ty,
487 TTI::TargetCostKind CostKind) const {
488 assert(Ty->isIntegerTy());
489
490 unsigned BitSize = Ty->getPrimitiveSizeInBits();
491 // There is no cost model for constants with a bit size of 0. Return TCC_Free
492 // here, so that constant hoisting will ignore this constant.
493 if (BitSize == 0)
494 return TTI::TCC_Free;
495
496 // Most (all?) AArch64 intrinsics do not support folding immediates into the
497 // selected instruction, so we compute the materialization cost for the
498 // immediate directly.
499 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
500 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
501
502 switch (IID) {
503 default:
504 return TTI::TCC_Free;
505 case Intrinsic::sadd_with_overflow:
506 case Intrinsic::uadd_with_overflow:
507 case Intrinsic::ssub_with_overflow:
508 case Intrinsic::usub_with_overflow:
509 case Intrinsic::smul_with_overflow:
510 case Intrinsic::umul_with_overflow:
511 if (Idx == 1) {
512 int NumConstants = (BitSize + 63) / 64;
513 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
514 return (Cost <= NumConstants * TTI::TCC_Basic)
515 ? static_cast<int>(TTI::TCC_Free)
516 : Cost;
517 }
518 break;
519 case Intrinsic::experimental_stackmap:
520 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue())))
521 return TTI::TCC_Free;
522 break;
523 case Intrinsic::experimental_patchpoint_void:
524 case Intrinsic::experimental_patchpoint:
525 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue())))
526 return TTI::TCC_Free;
527 break;
528 case Intrinsic::experimental_gc_statepoint:
529 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue())))
530 return TTI::TCC_Free;
531 break;
532 }
533 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
534}
535
536TargetTransformInfo::PopcntSupportKind
537AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
538 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
539 if (TyWidth == 32 || TyWidth == 64)
540 return TTI::PSK_FastHardware;
541 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
542 return TTI::PSK_Software;
543}
544
545static bool isUnpackedVectorVT(EVT VecVT) {
546 return VecVT.isScalableVector() &&
547 VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
548}
549
550static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
551 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
552 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
553 unsigned TotalHistCnts = 1;
554
555 unsigned EltSize = EltTy->getScalarSizeInBits();
556 // Only allow (up to 64b) integers or pointers
557 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
558 return InstructionCost::getInvalid();
559
560 // FIXME: We should be able to generate histcnt for fixed-length vectors
561 // using ptrue with a specific VL.
562 if (VectorType *VTy = dyn_cast<VectorType>(Val: BucketPtrsTy)) {
563 unsigned EC = VTy->getElementCount().getKnownMinValue();
564 if (!isPowerOf2_64(Value: EC) || !VTy->isScalableTy())
565 return InstructionCost::getInvalid();
566
567 // HistCnt only supports 32b and 64b element types
568 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
569
570 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
571 return InstructionCost(BaseHistCntCost);
572
573 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
574 TotalHistCnts = EC / NaturalVectorWidth;
575 }
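  // For example, <vscale x 8 x ptr> bucket pointers with i32 elements use a
  // legal element size of 32, giving a natural vector width of four, so the
  // operation is costed as two HISTCNTs (2 * BaseHistCntCost).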
576
577 return InstructionCost(BaseHistCntCost * TotalHistCnts);
578}
579
580InstructionCost
581AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
582 TTI::TargetCostKind CostKind) const {
583 // The code-generator is currently not able to handle scalable vectors
584 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
585 // it. This change will be removed when code-generation for these types is
586 // sufficiently reliable.
587 auto *RetTy = ICA.getReturnType();
588 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: RetTy))
589 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
590 return InstructionCost::getInvalid();
591
592 switch (ICA.getID()) {
593 case Intrinsic::experimental_vector_histogram_add:
594 if (!ST->hasSVE2())
595 return InstructionCost::getInvalid();
596 return getHistogramCost(ICA);
597 case Intrinsic::umin:
598 case Intrinsic::umax:
599 case Intrinsic::smin:
600 case Intrinsic::smax: {
601 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
602 MVT::v8i16, MVT::v2i32, MVT::v4i32,
603 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
604 MVT::nxv2i64};
605 auto LT = getTypeLegalizationCost(Ty: RetTy);
606 // v2i64 types get converted to cmp+bif hence the cost of 2
607 if (LT.second == MVT::v2i64)
608 return LT.first * 2;
609 if (any_of(Range: ValidMinMaxTys, P: [&LT](MVT M) { return M == LT.second; }))
610 return LT.first;
611 break;
612 }
613 case Intrinsic::sadd_sat:
614 case Intrinsic::ssub_sat:
615 case Intrinsic::uadd_sat:
616 case Intrinsic::usub_sat: {
617 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
618 MVT::v8i16, MVT::v2i32, MVT::v4i32,
619 MVT::v2i64};
620 auto LT = getTypeLegalizationCost(Ty: RetTy);
621 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
622 // need to extend the type, as it uses shr(qadd(shl, shl)).
623 unsigned Instrs =
624 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
625 if (any_of(Range: ValidSatTys, P: [&LT](MVT M) { return M == LT.second; }))
626 return LT.first * Instrs;
627 break;
628 }
629 case Intrinsic::abs: {
630 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
631 MVT::v8i16, MVT::v2i32, MVT::v4i32,
632 MVT::v2i64};
633 auto LT = getTypeLegalizationCost(Ty: RetTy);
634 if (any_of(Range: ValidAbsTys, P: [&LT](MVT M) { return M == LT.second; }))
635 return LT.first;
636 break;
637 }
638 case Intrinsic::bswap: {
639 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
640 MVT::v4i32, MVT::v2i64};
641 auto LT = getTypeLegalizationCost(Ty: RetTy);
642 if (any_of(Range: ValidAbsTys, P: [&LT](MVT M) { return M == LT.second; }) &&
643 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
644 return LT.first;
645 break;
646 }
647 case Intrinsic::stepvector: {
648 InstructionCost Cost = 1; // Cost of the `index' instruction
649 auto LT = getTypeLegalizationCost(Ty: RetTy);
650 // Legalisation of illegal vectors involves an `index' instruction plus
651 // (LT.first - 1) vector adds.
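    // For example, <vscale x 8 x i64> splits into four legal parts, so the
    // cost is one `index' plus three vector adds.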
652 if (LT.first > 1) {
653 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: RetTy->getContext());
654 InstructionCost AddCost =
655 getArithmeticInstrCost(Opcode: Instruction::Add, Ty: LegalVTy, CostKind);
656 Cost += AddCost * (LT.first - 1);
657 }
658 return Cost;
659 }
660 case Intrinsic::vector_extract:
661 case Intrinsic::vector_insert: {
662 // If both the vector and subvector types are legal types and the index
663 // is 0, then this should be a no-op or simple operation; return a
664 // relatively low cost.
665
666 // If arguments aren't actually supplied, then we cannot determine the
667 // value of the index. We also want to skip predicate types.
668 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
669 ICA.getReturnType()->getScalarType()->isIntegerTy(Bitwidth: 1))
670 break;
671
672 LLVMContext &C = RetTy->getContext();
673 EVT VecVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]);
674 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
675 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, Ty: RetTy)
676 : getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[1]);
677 // Skip this if either the vector or subvector types are unpacked
678 // SVE types; they may get lowered to stack stores and loads.
679 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(VecVT: SubVecVT))
680 break;
681
682 TargetLoweringBase::LegalizeKind SubVecLK =
683 getTLI()->getTypeConversion(Context&: C, VT: SubVecVT);
684 TargetLoweringBase::LegalizeKind VecLK =
685 getTLI()->getTypeConversion(Context&: C, VT: VecVT);
686 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
687 const ConstantInt *CIdx = cast<ConstantInt>(Val: Idx);
688 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
689 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
690 return TTI::TCC_Free;
691 break;
692 }
693 case Intrinsic::bitreverse: {
694 static const CostTblEntry BitreverseTbl[] = {
695 {.ISD: Intrinsic::bitreverse, .Type: MVT::i32, .Cost: 1},
696 {.ISD: Intrinsic::bitreverse, .Type: MVT::i64, .Cost: 1},
697 {.ISD: Intrinsic::bitreverse, .Type: MVT::v8i8, .Cost: 1},
698 {.ISD: Intrinsic::bitreverse, .Type: MVT::v16i8, .Cost: 1},
699 {.ISD: Intrinsic::bitreverse, .Type: MVT::v4i16, .Cost: 2},
700 {.ISD: Intrinsic::bitreverse, .Type: MVT::v8i16, .Cost: 2},
701 {.ISD: Intrinsic::bitreverse, .Type: MVT::v2i32, .Cost: 2},
702 {.ISD: Intrinsic::bitreverse, .Type: MVT::v4i32, .Cost: 2},
703 {.ISD: Intrinsic::bitreverse, .Type: MVT::v1i64, .Cost: 2},
704 {.ISD: Intrinsic::bitreverse, .Type: MVT::v2i64, .Cost: 2},
705 };
706 const auto LegalisationCost = getTypeLegalizationCost(Ty: RetTy);
707 const auto *Entry =
708 CostTableLookup(Table: BitreverseTbl, ISD: ICA.getID(), Ty: LegalisationCost.second);
709 if (Entry) {
      // The cost model uses the legal type (i32) that i8 and i16 are promoted
      // to, plus 1 so that we match the actual lowering cost.
712 if (TLI->getValueType(DL, Ty: RetTy, AllowUnknown: true) == MVT::i8 ||
713 TLI->getValueType(DL, Ty: RetTy, AllowUnknown: true) == MVT::i16)
714 return LegalisationCost.first * Entry->Cost + 1;
715
716 return LegalisationCost.first * Entry->Cost;
717 }
718 break;
719 }
720 case Intrinsic::ctpop: {
721 if (!ST->hasNEON()) {
722 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
723 return getTypeLegalizationCost(Ty: RetTy).first * 12;
724 }
725 static const CostTblEntry CtpopCostTbl[] = {
726 {.ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: 4},
727 {.ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: 3},
728 {.ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: 2},
729 {.ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: 1},
730 {.ISD: ISD::CTPOP, .Type: MVT::i64, .Cost: 4},
731 {.ISD: ISD::CTPOP, .Type: MVT::v2i32, .Cost: 3},
732 {.ISD: ISD::CTPOP, .Type: MVT::v4i16, .Cost: 2},
733 {.ISD: ISD::CTPOP, .Type: MVT::v8i8, .Cost: 1},
734 {.ISD: ISD::CTPOP, .Type: MVT::i32, .Cost: 5},
735 };
736 auto LT = getTypeLegalizationCost(Ty: RetTy);
737 MVT MTy = LT.second;
738 if (const auto *Entry = CostTableLookup(Table: CtpopCostTbl, ISD: ISD::CTPOP, Ty: MTy)) {
739 // Extra cost of +1 when illegal vector types are legalized by promoting
740 // the integer type.
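      // For example, <2 x i16> is promoted to v2i32, so its cost is the v2i32
      // table entry plus 1.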
741 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
742 RetTy->getScalarSizeInBits()
743 ? 1
744 : 0;
745 return LT.first * Entry->Cost + ExtraCost;
746 }
747 break;
748 }
749 case Intrinsic::sadd_with_overflow:
750 case Intrinsic::uadd_with_overflow:
751 case Intrinsic::ssub_with_overflow:
752 case Intrinsic::usub_with_overflow:
753 case Intrinsic::smul_with_overflow:
754 case Intrinsic::umul_with_overflow: {
755 static const CostTblEntry WithOverflowCostTbl[] = {
756 {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i8, .Cost: 3},
757 {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i8, .Cost: 3},
758 {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i16, .Cost: 3},
759 {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i16, .Cost: 3},
760 {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i32, .Cost: 1},
761 {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i32, .Cost: 1},
762 {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i64, .Cost: 1},
763 {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i64, .Cost: 1},
764 {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i8, .Cost: 3},
765 {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i8, .Cost: 3},
766 {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i16, .Cost: 3},
767 {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i16, .Cost: 3},
768 {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i32, .Cost: 1},
769 {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i32, .Cost: 1},
770 {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i64, .Cost: 1},
771 {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i64, .Cost: 1},
772 {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i8, .Cost: 5},
773 {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i8, .Cost: 4},
774 {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i16, .Cost: 5},
775 {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i16, .Cost: 4},
776 {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i32, .Cost: 2}, // eg umull;tst
777 {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i32, .Cost: 2}, // eg umull;cmp sxtw
778 {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i64, .Cost: 3}, // eg mul;smulh;cmp
779 {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i64, .Cost: 3}, // eg mul;umulh;cmp asr
780 };
781 EVT MTy = TLI->getValueType(DL, Ty: RetTy->getContainedType(i: 0), AllowUnknown: true);
782 if (MTy.isSimple())
783 if (const auto *Entry = CostTableLookup(Table: WithOverflowCostTbl, ISD: ICA.getID(),
784 Ty: MTy.getSimpleVT()))
785 return Entry->Cost;
786 break;
787 }
788 case Intrinsic::fptosi_sat:
789 case Intrinsic::fptoui_sat: {
790 if (ICA.getArgTypes().empty())
791 break;
792 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
793 auto LT = getTypeLegalizationCost(Ty: ICA.getArgTypes()[0]);
794 EVT MTy = TLI->getValueType(DL, Ty: RetTy);
795 // Check for the legal types, which are where the size of the input and the
796 // output are the same, or we are using cvt f64->i32 or f32->i64.
797 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
798 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
799 LT.second == MVT::v2f64)) {
800 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
801 (LT.second == MVT::f64 && MTy == MVT::i32) ||
802 (LT.second == MVT::f32 && MTy == MVT::i64)))
803 return LT.first;
804 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
805 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
806 MTy.getScalarSizeInBits() == 64)
807 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
808 }
809 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
810 // f32.
811 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
812 return LT.first + getIntrinsicInstrCost(
813 ICA: {ICA.getID(),
814 RetTy,
815 {ICA.getArgTypes()[0]->getWithNewType(
816 EltTy: Type::getFloatTy(C&: RetTy->getContext()))}},
817 CostKind);
818 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
819 (LT.second == MVT::f16 && MTy == MVT::i64) ||
820 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
821 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
822 return LT.first;
823 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
824 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
825 MTy.getScalarSizeInBits() == 32)
826 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
    // Extending vector types v8f16->v8i32. These currently scalarize but the
828 // codegen could be better.
829 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
830 MTy.getScalarSizeInBits() == 64)
831 return MTy.getVectorNumElements() * 3;
832
    // If we can, use a legal convert followed by a min+max.
834 if ((LT.second.getScalarType() == MVT::f32 ||
835 LT.second.getScalarType() == MVT::f64 ||
836 LT.second.getScalarType() == MVT::f16) &&
837 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
838 Type *LegalTy =
839 Type::getIntNTy(C&: RetTy->getContext(), N: LT.second.getScalarSizeInBits());
840 if (LT.second.isVector())
841 LegalTy = VectorType::get(ElementType: LegalTy, EC: LT.second.getVectorElementCount());
842 InstructionCost Cost = 1;
843 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
844 LegalTy, {LegalTy, LegalTy});
845 Cost += getIntrinsicInstrCost(ICA: Attrs1, CostKind);
846 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
847 LegalTy, {LegalTy, LegalTy});
848 Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
849 return LT.first * Cost +
850 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
851 : 1);
852 }
853 // Otherwise we need to follow the default expansion that clamps the value
    // using a float min/max, with an fcmp+sel for NaN handling when signed.
855 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
856 RetTy = RetTy->getScalarType();
857 if (LT.second.isVector()) {
858 FPTy = VectorType::get(ElementType: FPTy, EC: LT.second.getVectorElementCount());
859 RetTy = VectorType::get(ElementType: RetTy, EC: LT.second.getVectorElementCount());
860 }
861 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
862 InstructionCost Cost = getIntrinsicInstrCost(ICA: Attrs1, CostKind);
863 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
864 Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
865 Cost +=
866 getCastInstrCost(Opcode: IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
867 Dst: RetTy, Src: FPTy, CCH: TTI::CastContextHint::None, CostKind);
868 if (IsSigned) {
869 Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1);
870 Cost += getCmpSelInstrCost(Opcode: BinaryOperator::FCmp, ValTy: FPTy, CondTy,
871 VecPred: CmpInst::FCMP_UNO, CostKind);
872 Cost += getCmpSelInstrCost(Opcode: BinaryOperator::Select, ValTy: RetTy, CondTy,
873 VecPred: CmpInst::FCMP_UNO, CostKind);
874 }
875 return LT.first * Cost;
876 }
877 case Intrinsic::fshl:
878 case Intrinsic::fshr: {
879 if (ICA.getArgs().empty())
880 break;
881
882 // TODO: Add handling for fshl where third argument is not a constant.
883 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(V: ICA.getArgs()[2]);
884 if (!OpInfoZ.isConstant())
885 break;
886
887 const auto LegalisationCost = getTypeLegalizationCost(Ty: RetTy);
888 if (OpInfoZ.isUniform()) {
889 static const CostTblEntry FshlTbl[] = {
890 {.ISD: Intrinsic::fshl, .Type: MVT::v4i32, .Cost: 2}, // shl + usra
891 {.ISD: Intrinsic::fshl, .Type: MVT::v2i64, .Cost: 2}, {.ISD: Intrinsic::fshl, .Type: MVT::v16i8, .Cost: 2},
892 {.ISD: Intrinsic::fshl, .Type: MVT::v8i16, .Cost: 2}, {.ISD: Intrinsic::fshl, .Type: MVT::v2i32, .Cost: 2},
893 {.ISD: Intrinsic::fshl, .Type: MVT::v8i8, .Cost: 2}, {.ISD: Intrinsic::fshl, .Type: MVT::v4i16, .Cost: 2}};
894 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
895 // to avoid having to duplicate the costs.
896 const auto *Entry =
897 CostTableLookup(Table: FshlTbl, ISD: Intrinsic::fshl, Ty: LegalisationCost.second);
898 if (Entry)
899 return LegalisationCost.first * Entry->Cost;
900 }
901
902 auto TyL = getTypeLegalizationCost(Ty: RetTy);
903 if (!RetTy->isIntegerTy())
904 break;
905
906 // Estimate cost manually, as types like i8 and i16 will get promoted to
907 // i32 and CostTableLookup will ignore the extra conversion cost.
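    // For example, an i16 funnel shift by a constant is promoted to i32 and
    // pays one extra instruction on top of the base cost, whereas i32 and i64
    // lower to a single extr.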
908 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
909 RetTy->getScalarSizeInBits() < 64) ||
910 (RetTy->getScalarSizeInBits() % 64 != 0);
911 unsigned ExtraCost = HigherCost ? 1 : 0;
912 if (RetTy->getScalarSizeInBits() == 32 ||
913 RetTy->getScalarSizeInBits() == 64)
      ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
915 // extr instruction.
916 else if (HigherCost)
917 ExtraCost = 1;
918 else
919 break;
920 return TyL.first + ExtraCost;
921 }
922 case Intrinsic::get_active_lane_mask: {
923 auto *RetTy = dyn_cast<FixedVectorType>(Val: ICA.getReturnType());
924 if (RetTy) {
925 EVT RetVT = getTLI()->getValueType(DL, Ty: RetTy);
926 EVT OpVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]);
927 if (!getTLI()->shouldExpandGetActiveLaneMask(VT: RetVT, OpVT) &&
928 !getTLI()->isTypeLegal(VT: RetVT)) {
929 // We don't have enough context at this point to determine if the mask
930 // is going to be kept live after the block, which will force the vXi1
931 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
932 // For now, we just assume the vectorizer created this intrinsic and
933 // the result will be the input for a PHI. In this case the cost will
934 // be extremely high for fixed-width vectors.
        // NOTE: getScalarizationOverhead returns a cost that's far too
        // pessimistic compared to the code actually generated. In reality
        // there are two instructions generated per lane.
938 return RetTy->getNumElements() * 2;
939 }
940 }
941 break;
942 }
943 case Intrinsic::experimental_vector_match: {
944 auto *NeedleTy = cast<FixedVectorType>(Val: ICA.getArgTypes()[1]);
945 EVT SearchVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]);
946 unsigned SearchSize = NeedleTy->getNumElements();
947 if (!getTLI()->shouldExpandVectorMatch(VT: SearchVT, SearchSize)) {
948 // Base cost for MATCH instructions. At least on the Neoverse V2 and
949 // Neoverse V3, these are cheap operations with the same latency as a
950 // vector ADD. In most cases, however, we also need to do an extra DUP.
      // For fixed-length vectors we currently need an extra five to six
952 // instructions besides the MATCH.
953 InstructionCost Cost = 4;
954 if (isa<FixedVectorType>(Val: RetTy))
955 Cost += 10;
956 return Cost;
957 }
958 break;
959 }
960 case Intrinsic::experimental_cttz_elts: {
961 EVT ArgVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]);
962 if (!getTLI()->shouldExpandCttzElements(VT: ArgVT)) {
      // This will consist of an SVE brkb and a cntp instruction. These
964 // typically have the same latency and half the throughput as a vector
965 // add instruction.
966 return 4;
967 }
968 break;
969 }
970 default:
971 break;
972 }
973 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
974}
975
/// This function removes redundant reinterpret casts in the presence of
/// control flow.
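/// For example (illustrative), when a convert.from.svbool takes
///   %phi = phi <vscale x 16 x i1> [ %a.bool, %bb0 ], [ %b.bool, %bb1 ]
/// and every incoming value is a convert.to.svbool of the intrinsic's result
/// type, the phi is rebuilt over the original narrower predicates and the
/// intrinsic is replaced with that new phi.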
978static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
979 IntrinsicInst &II) {
980 SmallVector<Instruction *, 32> Worklist;
981 auto RequiredType = II.getType();
982
983 auto *PN = dyn_cast<PHINode>(Val: II.getArgOperand(i: 0));
984 assert(PN && "Expected Phi Node!");
985
986 // Don't create a new Phi unless we can remove the old one.
987 if (!PN->hasOneUse())
988 return std::nullopt;
989
990 for (Value *IncValPhi : PN->incoming_values()) {
991 auto *Reinterpret = dyn_cast<IntrinsicInst>(Val: IncValPhi);
992 if (!Reinterpret ||
993 Reinterpret->getIntrinsicID() !=
994 Intrinsic::aarch64_sve_convert_to_svbool ||
995 RequiredType != Reinterpret->getArgOperand(i: 0)->getType())
996 return std::nullopt;
997 }
998
999 // Create the new Phi
1000 IC.Builder.SetInsertPoint(PN);
1001 PHINode *NPN = IC.Builder.CreatePHI(Ty: RequiredType, NumReservedValues: PN->getNumIncomingValues());
1002 Worklist.push_back(Elt: PN);
1003
1004 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1005 auto *Reinterpret = cast<Instruction>(Val: PN->getIncomingValue(i: I));
1006 NPN->addIncoming(V: Reinterpret->getOperand(i: 0), BB: PN->getIncomingBlock(i: I));
1007 Worklist.push_back(Elt: Reinterpret);
1008 }
1009
1010 // Cleanup Phi Node and reinterprets
1011 return IC.replaceInstUsesWith(I&: II, V: NPN);
1012}
1013
1014// A collection of properties common to SVE intrinsics that allow for combines
1015// to be written without needing to know the specific intrinsic.
1016struct SVEIntrinsicInfo {
1017 //
1018 // Helper routines for common intrinsic definitions.
1019 //
1020
1021 // e.g. llvm.aarch64.sve.add pg, op1, op2
1022 // with IID ==> llvm.aarch64.sve.add_u
1023 static SVEIntrinsicInfo
1024 defaultMergingOp(Intrinsic::ID IID = Intrinsic::not_intrinsic) {
1025 return SVEIntrinsicInfo()
1026 .setGoverningPredicateOperandIdx(0)
1027 .setOperandIdxInactiveLanesTakenFrom(1)
1028 .setMatchingUndefIntrinsic(IID);
1029 }
1030
1031 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1032 static SVEIntrinsicInfo defaultMergingUnaryOp() {
1033 return SVEIntrinsicInfo()
1034 .setGoverningPredicateOperandIdx(1)
1035 .setOperandIdxInactiveLanesTakenFrom(0)
1036 .setOperandIdxWithNoActiveLanes(0);
1037 }
1038
1039 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1040 static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp() {
1041 return SVEIntrinsicInfo()
1042 .setGoverningPredicateOperandIdx(1)
1043 .setOperandIdxInactiveLanesTakenFrom(0);
1044 }
1045
1046 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1047 static SVEIntrinsicInfo defaultUndefOp() {
1048 return SVEIntrinsicInfo()
1049 .setGoverningPredicateOperandIdx(0)
1050 .setInactiveLanesAreNotDefined();
1051 }
1052
1053 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1054 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1055 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1056 return SVEIntrinsicInfo()
1057 .setGoverningPredicateOperandIdx(GPIndex)
1058 .setInactiveLanesAreUnused();
1059 }
1060
1061 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1062 // llvm.aarch64.sve.ld1 pg, ptr
1063 static SVEIntrinsicInfo defaultZeroingOp() {
1064 return SVEIntrinsicInfo()
1065 .setGoverningPredicateOperandIdx(0)
1066 .setInactiveLanesAreUnused()
1067 .setResultIsZeroInitialized();
1068 }
1069
  // All properties relate to predication and thus having a governing predicate
1071 // is the minimum requirement to say there is intrinsic info to act on.
1072 explicit operator bool() const { return hasGoverningPredicate(); }
1073
1074 //
1075 // Properties relating to the governing predicate.
1076 //
1077
1078 bool hasGoverningPredicate() const {
1079 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1080 }
1081
1082 unsigned getGoverningPredicateOperandIdx() const {
    assert(hasGoverningPredicate() && "Property not set!");
1084 return GoverningPredicateIdx;
1085 }
1086
1087 SVEIntrinsicInfo &setGoverningPredicateOperandIdx(unsigned Index) {
1088 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1089 GoverningPredicateIdx = Index;
1090 return *this;
1091 }
1092
1093 //
1094 // Properties relating to operations the intrinsic could be transformed into.
1095 // NOTE: This does not mean such a transformation is always possible, but the
1096 // knowledge makes it possible to reuse existing optimisations without needing
1097 // to embed specific handling for each intrinsic. For example, instruction
1098 // simplification can be used to optimise an intrinsic's active lanes.
1099 //
1100
1101 bool hasMatchingUndefIntrinsic() const {
1102 return UndefIntrinsic != Intrinsic::not_intrinsic;
1103 }
1104
1105 Intrinsic::ID getMatchingUndefIntrinsic() const {
    assert(hasMatchingUndefIntrinsic() && "Property not set!");
1107 return UndefIntrinsic;
1108 }
1109
1110 SVEIntrinsicInfo &setMatchingUndefIntrinsic(Intrinsic::ID IID) {
1111 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1112 UndefIntrinsic = IID;
1113 return *this;
1114 }
1115
1116 bool hasMatchingIROpode() const { return IROpcode != 0; }
1117
1118 unsigned getMatchingIROpode() const {
    assert(hasMatchingIROpode() && "Property not set!");
1120 return IROpcode;
1121 }
1122
1123 SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) {
1124 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1125 IROpcode = Opcode;
1126 return *this;
1127 }
1128
1129 //
1130 // Properties relating to the result of inactive lanes.
1131 //
1132
1133 bool inactiveLanesTakenFromOperand() const {
1134 return ResultLanes == InactiveLanesTakenFromOperand;
1135 }
1136
1137 unsigned getOperandIdxInactiveLanesTakenFrom() const {
    assert(inactiveLanesTakenFromOperand() && "Property not set!");
1139 return OperandIdxForInactiveLanes;
1140 }
1141
1142 SVEIntrinsicInfo &setOperandIdxInactiveLanesTakenFrom(unsigned Index) {
1143 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1144 ResultLanes = InactiveLanesTakenFromOperand;
1145 OperandIdxForInactiveLanes = Index;
1146 return *this;
1147 }
1148
1149 bool inactiveLanesAreNotDefined() const {
1150 return ResultLanes == InactiveLanesAreNotDefined;
1151 }
1152
1153 SVEIntrinsicInfo &setInactiveLanesAreNotDefined() {
1154 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1155 ResultLanes = InactiveLanesAreNotDefined;
1156 return *this;
1157 }
1158
1159 bool inactiveLanesAreUnused() const {
1160 return ResultLanes == InactiveLanesAreUnused;
1161 }
1162
1163 SVEIntrinsicInfo &setInactiveLanesAreUnused() {
1164 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1165 ResultLanes = InactiveLanesAreUnused;
1166 return *this;
1167 }
1168
1169 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1170 // inactiveLanesAreZeroed =
1171 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1172 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1173
1174 SVEIntrinsicInfo &setResultIsZeroInitialized() {
1175 ResultIsZeroInitialized = true;
1176 return *this;
1177 }
1178
1179 //
1180 // The first operand of unary merging operations is typically only used to
1181 // set the result for inactive lanes. Knowing this allows us to deadcode the
1182 // operand when we can prove there are no inactive lanes.
1183 //
1184
1185 bool hasOperandWithNoActiveLanes() const {
1186 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1187 }
1188
1189 unsigned getOperandIdxWithNoActiveLanes() const {
    assert(hasOperandWithNoActiveLanes() && "Property not set!");
1191 return OperandIdxWithNoActiveLanes;
1192 }
1193
1194 SVEIntrinsicInfo &setOperandIdxWithNoActiveLanes(unsigned Index) {
1195 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1196 OperandIdxWithNoActiveLanes = Index;
1197 return *this;
1198 }
1199
1200private:
1201 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1202
1203 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1204 unsigned IROpcode = 0;
1205
1206 enum PredicationStyle {
1207 Uninitialized,
1208 InactiveLanesTakenFromOperand,
1209 InactiveLanesAreNotDefined,
1210 InactiveLanesAreUnused
1211 } ResultLanes = Uninitialized;
1212
1213 bool ResultIsZeroInitialized = false;
1214 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1215 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1216};
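// constructSVEIntrinsicInfo below builds this description for each intrinsic.
// For example, aarch64_sve_add is modelled as
// defaultMergingOp(aarch64_sve_add_u) with matching IR opcode Add, which gives
// combines enough information to use the undef form or a plain add when the
// governing predicate allows it.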
1217
1218static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
1219 // Some SVE intrinsics do not use scalable vector types, but since they are
1220 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1221 if (!isa<ScalableVectorType>(Val: II.getType()) &&
1222 all_of(Range: II.args(), P: [&](const Value *V) {
1223 return !isa<ScalableVectorType>(Val: V->getType());
1224 }))
1225 return SVEIntrinsicInfo();
1226
1227 Intrinsic::ID IID = II.getIntrinsicID();
1228 switch (IID) {
1229 default:
1230 break;
1231 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1232 case Intrinsic::aarch64_sve_fcvt_f16f32:
1233 case Intrinsic::aarch64_sve_fcvt_f16f64:
1234 case Intrinsic::aarch64_sve_fcvt_f32f16:
1235 case Intrinsic::aarch64_sve_fcvt_f32f64:
1236 case Intrinsic::aarch64_sve_fcvt_f64f16:
1237 case Intrinsic::aarch64_sve_fcvt_f64f32:
1238 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1239 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1240 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1241 case Intrinsic::aarch64_sve_fcvtzs:
1242 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1243 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1244 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1245 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1246 case Intrinsic::aarch64_sve_fcvtzu:
1247 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1248 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1249 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1250 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1251 case Intrinsic::aarch64_sve_scvtf:
1252 case Intrinsic::aarch64_sve_scvtf_f16i32:
1253 case Intrinsic::aarch64_sve_scvtf_f16i64:
1254 case Intrinsic::aarch64_sve_scvtf_f32i64:
1255 case Intrinsic::aarch64_sve_scvtf_f64i32:
1256 case Intrinsic::aarch64_sve_ucvtf:
1257 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1258 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1259 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1260 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1261 return SVEIntrinsicInfo::defaultMergingUnaryOp();
1262
1263 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1264 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1265 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1266 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1267 return SVEIntrinsicInfo::defaultMergingUnaryNarrowingTopOp();
1268
1269 case Intrinsic::aarch64_sve_fabd:
1270 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fabd_u);
1271 case Intrinsic::aarch64_sve_fadd:
1272 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fadd_u)
1273 .setMatchingIROpcode(Instruction::FAdd);
1274 case Intrinsic::aarch64_sve_fdiv:
1275 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fdiv_u)
1276 .setMatchingIROpcode(Instruction::FDiv);
1277 case Intrinsic::aarch64_sve_fmax:
1278 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmax_u);
1279 case Intrinsic::aarch64_sve_fmaxnm:
1280 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmaxnm_u);
1281 case Intrinsic::aarch64_sve_fmin:
1282 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmin_u);
1283 case Intrinsic::aarch64_sve_fminnm:
1284 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fminnm_u);
1285 case Intrinsic::aarch64_sve_fmla:
1286 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmla_u);
1287 case Intrinsic::aarch64_sve_fmls:
1288 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmls_u);
1289 case Intrinsic::aarch64_sve_fmul:
1290 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmul_u)
1291 .setMatchingIROpcode(Instruction::FMul);
1292 case Intrinsic::aarch64_sve_fmulx:
1293 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmulx_u);
1294 case Intrinsic::aarch64_sve_fnmla:
1295 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fnmla_u);
1296 case Intrinsic::aarch64_sve_fnmls:
1297 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fnmls_u);
1298 case Intrinsic::aarch64_sve_fsub:
1299 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fsub_u)
1300 .setMatchingIROpcode(Instruction::FSub);
1301 case Intrinsic::aarch64_sve_add:
1302 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_add_u)
1303 .setMatchingIROpcode(Instruction::Add);
1304 case Intrinsic::aarch64_sve_mla:
1305 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mla_u);
1306 case Intrinsic::aarch64_sve_mls:
1307 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mls_u);
1308 case Intrinsic::aarch64_sve_mul:
1309 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mul_u)
1310 .setMatchingIROpcode(Instruction::Mul);
1311 case Intrinsic::aarch64_sve_sabd:
1312 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sabd_u);
1313 case Intrinsic::aarch64_sve_sdiv:
1314 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sdiv_u)
1315 .setMatchingIROpcode(Instruction::SDiv);
1316 case Intrinsic::aarch64_sve_smax:
1317 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smax_u);
1318 case Intrinsic::aarch64_sve_smin:
1319 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smin_u);
1320 case Intrinsic::aarch64_sve_smulh:
1321 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smulh_u);
1322 case Intrinsic::aarch64_sve_sub:
1323 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sub_u)
1324 .setMatchingIROpcode(Instruction::Sub);
1325 case Intrinsic::aarch64_sve_uabd:
1326 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uabd_u);
1327 case Intrinsic::aarch64_sve_udiv:
1328 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_udiv_u)
1329 .setMatchingIROpcode(Instruction::UDiv);
1330 case Intrinsic::aarch64_sve_umax:
1331 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umax_u);
1332 case Intrinsic::aarch64_sve_umin:
1333 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umin_u);
1334 case Intrinsic::aarch64_sve_umulh:
1335 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umulh_u);
1336 case Intrinsic::aarch64_sve_asr:
1337 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_asr_u)
1338 .setMatchingIROpcode(Instruction::AShr);
1339 case Intrinsic::aarch64_sve_lsl:
1340 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_lsl_u)
1341 .setMatchingIROpcode(Instruction::Shl);
1342 case Intrinsic::aarch64_sve_lsr:
1343 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_lsr_u)
1344 .setMatchingIROpcode(Instruction::LShr);
1345 case Intrinsic::aarch64_sve_and:
1346 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_and_u)
1347 .setMatchingIROpcode(Instruction::And);
1348 case Intrinsic::aarch64_sve_bic:
1349 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_bic_u);
1350 case Intrinsic::aarch64_sve_eor:
1351 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_eor_u)
1352 .setMatchingIROpcode(Instruction::Xor);
1353 case Intrinsic::aarch64_sve_orr:
1354 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_orr_u)
1355 .setMatchingIROpcode(Instruction::Or);
1356 case Intrinsic::aarch64_sve_sqsub:
1357 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sqsub_u);
1358 case Intrinsic::aarch64_sve_uqsub:
1359 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uqsub_u);
1360
1361 case Intrinsic::aarch64_sve_add_u:
1362 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1363 Instruction::Add);
1364 case Intrinsic::aarch64_sve_and_u:
1365 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1366 Instruction::And);
1367 case Intrinsic::aarch64_sve_asr_u:
1368 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1369 Instruction::AShr);
1370 case Intrinsic::aarch64_sve_eor_u:
1371 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1372 Instruction::Xor);
1373 case Intrinsic::aarch64_sve_fadd_u:
1374 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1375 Instruction::FAdd);
1376 case Intrinsic::aarch64_sve_fdiv_u:
1377 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1378 Instruction::FDiv);
1379 case Intrinsic::aarch64_sve_fmul_u:
1380 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1381 Instruction::FMul);
1382 case Intrinsic::aarch64_sve_fsub_u:
1383 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1384 Instruction::FSub);
1385 case Intrinsic::aarch64_sve_lsl_u:
1386 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1387 Instruction::Shl);
1388 case Intrinsic::aarch64_sve_lsr_u:
1389 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1390 Instruction::LShr);
1391 case Intrinsic::aarch64_sve_mul_u:
1392 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1393 Instruction::Mul);
1394 case Intrinsic::aarch64_sve_orr_u:
1395 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1396 Instruction::Or);
1397 case Intrinsic::aarch64_sve_sdiv_u:
1398 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1399 Instruction::SDiv);
1400 case Intrinsic::aarch64_sve_sub_u:
1401 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1402 Instruction::Sub);
1403 case Intrinsic::aarch64_sve_udiv_u:
1404 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1405 Instruction::UDiv);
1406
1407 case Intrinsic::aarch64_sve_addqv:
1408 case Intrinsic::aarch64_sve_and_z:
1409 case Intrinsic::aarch64_sve_bic_z:
1410 case Intrinsic::aarch64_sve_brka_z:
1411 case Intrinsic::aarch64_sve_brkb_z:
1412 case Intrinsic::aarch64_sve_brkn_z:
1413 case Intrinsic::aarch64_sve_brkpa_z:
1414 case Intrinsic::aarch64_sve_brkpb_z:
1415 case Intrinsic::aarch64_sve_cntp:
1416 case Intrinsic::aarch64_sve_compact:
1417 case Intrinsic::aarch64_sve_eor_z:
1418 case Intrinsic::aarch64_sve_eorv:
1419 case Intrinsic::aarch64_sve_eorqv:
1420 case Intrinsic::aarch64_sve_nand_z:
1421 case Intrinsic::aarch64_sve_nor_z:
1422 case Intrinsic::aarch64_sve_orn_z:
1423 case Intrinsic::aarch64_sve_orr_z:
1424 case Intrinsic::aarch64_sve_orv:
1425 case Intrinsic::aarch64_sve_orqv:
1426 case Intrinsic::aarch64_sve_pnext:
1427 case Intrinsic::aarch64_sve_rdffr_z:
1428 case Intrinsic::aarch64_sve_saddv:
1429 case Intrinsic::aarch64_sve_uaddv:
1430 case Intrinsic::aarch64_sve_umaxv:
1431 case Intrinsic::aarch64_sve_umaxqv:
1432 case Intrinsic::aarch64_sve_cmpeq:
1433 case Intrinsic::aarch64_sve_cmpeq_wide:
1434 case Intrinsic::aarch64_sve_cmpge:
1435 case Intrinsic::aarch64_sve_cmpge_wide:
1436 case Intrinsic::aarch64_sve_cmpgt:
1437 case Intrinsic::aarch64_sve_cmpgt_wide:
1438 case Intrinsic::aarch64_sve_cmphi:
1439 case Intrinsic::aarch64_sve_cmphi_wide:
1440 case Intrinsic::aarch64_sve_cmphs:
1441 case Intrinsic::aarch64_sve_cmphs_wide:
1442 case Intrinsic::aarch64_sve_cmple_wide:
1443 case Intrinsic::aarch64_sve_cmplo_wide:
1444 case Intrinsic::aarch64_sve_cmpls_wide:
1445 case Intrinsic::aarch64_sve_cmplt_wide:
1446 case Intrinsic::aarch64_sve_cmpne:
1447 case Intrinsic::aarch64_sve_cmpne_wide:
1448 case Intrinsic::aarch64_sve_facge:
1449 case Intrinsic::aarch64_sve_facgt:
1450 case Intrinsic::aarch64_sve_fcmpeq:
1451 case Intrinsic::aarch64_sve_fcmpge:
1452 case Intrinsic::aarch64_sve_fcmpgt:
1453 case Intrinsic::aarch64_sve_fcmpne:
1454 case Intrinsic::aarch64_sve_fcmpuo:
1455 case Intrinsic::aarch64_sve_ld1:
1456 case Intrinsic::aarch64_sve_ld1_gather:
1457 case Intrinsic::aarch64_sve_ld1_gather_index:
1458 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1459 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1460 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1461 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1462 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1463 case Intrinsic::aarch64_sve_ld1q_gather_index:
1464 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1465 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1466 case Intrinsic::aarch64_sve_ld1ro:
1467 case Intrinsic::aarch64_sve_ld1rq:
1468 case Intrinsic::aarch64_sve_ld1udq:
1469 case Intrinsic::aarch64_sve_ld1uwq:
1470 case Intrinsic::aarch64_sve_ld2_sret:
1471 case Intrinsic::aarch64_sve_ld2q_sret:
1472 case Intrinsic::aarch64_sve_ld3_sret:
1473 case Intrinsic::aarch64_sve_ld3q_sret:
1474 case Intrinsic::aarch64_sve_ld4_sret:
1475 case Intrinsic::aarch64_sve_ld4q_sret:
1476 case Intrinsic::aarch64_sve_ldff1:
1477 case Intrinsic::aarch64_sve_ldff1_gather:
1478 case Intrinsic::aarch64_sve_ldff1_gather_index:
1479 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1480 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1481 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1482 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1483 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1484 case Intrinsic::aarch64_sve_ldnf1:
1485 case Intrinsic::aarch64_sve_ldnt1:
1486 case Intrinsic::aarch64_sve_ldnt1_gather:
1487 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1488 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1489 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1490 return SVEIntrinsicInfo::defaultZeroingOp();
1491
1492 case Intrinsic::aarch64_sve_prf:
1493 case Intrinsic::aarch64_sve_prfb_gather_index:
1494 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1495 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1496 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1497 case Intrinsic::aarch64_sve_prfd_gather_index:
1498 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1499 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1500 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1501 case Intrinsic::aarch64_sve_prfh_gather_index:
1502 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1503 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1504 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1505 case Intrinsic::aarch64_sve_prfw_gather_index:
1506 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1507 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1508 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1509 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 0);
1510
1511 case Intrinsic::aarch64_sve_st1_scatter:
1512 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1513 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1514 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1515 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1516 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1517 case Intrinsic::aarch64_sve_st1dq:
1518 case Intrinsic::aarch64_sve_st1q_scatter_index:
1519 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1520 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1521 case Intrinsic::aarch64_sve_st1wq:
1522 case Intrinsic::aarch64_sve_stnt1:
1523 case Intrinsic::aarch64_sve_stnt1_scatter:
1524 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1525 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1526 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1527 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 1);
1528 case Intrinsic::aarch64_sve_st2:
1529 case Intrinsic::aarch64_sve_st2q:
1530 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 2);
1531 case Intrinsic::aarch64_sve_st3:
1532 case Intrinsic::aarch64_sve_st3q:
1533 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 3);
1534 case Intrinsic::aarch64_sve_st4:
1535 case Intrinsic::aarch64_sve_st4q:
1536 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 4);
1537 }
1538
1539 return SVEIntrinsicInfo();
1540}
1541
1542static bool isAllActivePredicate(Value *Pred) {
1543 // Look through a convert.from.svbool(convert.to.svbool(...)) chain.
1544 Value *UncastedPred;
1545 if (match(V: Pred, P: m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1546 Op0: m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1547 Op0: m_Value(V&: UncastedPred)))))
1548 // If the predicate has the same number of lanes as the uncasted
1549 // predicate, or fewer, the casting has no effect.
1550 if (cast<ScalableVectorType>(Val: Pred->getType())->getMinNumElements() <=
1551 cast<ScalableVectorType>(Val: UncastedPred->getType())->getMinNumElements())
1552 Pred = UncastedPred;
1553 auto *C = dyn_cast<Constant>(Val: Pred);
1554 return (C && C->isAllOnesValue());
1555}
1556
1557// Simplify `V` by only considering the operations that affect active lanes.
1558// This function should only return existing Values or newly created Constants.
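// For example (illustrative sketch of the transform below): given
//   %s = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(
//            <vscale x 4 x i32> %inactive, <vscale x 4 x i1> %pg, i32 4)
// only the lanes selected by %pg matter to a user predicated on %pg, so %s
// can be treated as splat(i32 4) during simplification.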
1559static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1560 auto *Dup = dyn_cast<IntrinsicInst>(Val: V);
1561 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1562 Dup->getOperand(i_nocapture: 1) == Pg && isa<Constant>(Val: Dup->getOperand(i_nocapture: 2)))
1563 return ConstantVector::getSplat(
1564 EC: cast<VectorType>(Val: V->getType())->getElementCount(),
1565 Elt: cast<Constant>(Val: Dup->getOperand(i_nocapture: 2)));
1566
1567 return V;
1568}
1569
1570static std::optional<Instruction *>
1571simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II,
1572 const SVEIntrinsicInfo &IInfo) {
1573 const unsigned Opc = IInfo.getMatchingIROpode();
1574 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1575
1576 Value *Pg = II.getOperand(i_nocapture: 0);
1577 Value *Op1 = II.getOperand(i_nocapture: 1);
1578 Value *Op2 = II.getOperand(i_nocapture: 2);
1579 const DataLayout &DL = II.getDataLayout();
1580
1581 // Canonicalise constants to the RHS.
1582 if (Instruction::isCommutative(Opcode: Opc) && IInfo.inactiveLanesAreNotDefined() &&
1583 isa<Constant>(Val: Op1) && !isa<Constant>(Val: Op2)) {
1584 IC.replaceOperand(I&: II, OpNum: 1, V: Op2);
1585 IC.replaceOperand(I&: II, OpNum: 2, V: Op1);
1586 return &II;
1587 }
1588
1589 // Only active lanes matter when simplifying the operation.
1590 Op1 = stripInactiveLanes(V: Op1, Pg);
1591 Op2 = stripInactiveLanes(V: Op2, Pg);
1592
1593 Value *SimpleII;
1594 if (auto FII = dyn_cast<FPMathOperator>(Val: &II))
1595 SimpleII = simplifyBinOp(Opcode: Opc, LHS: Op1, RHS: Op2, FMF: FII->getFastMathFlags(), Q: DL);
1596 else
1597 SimpleII = simplifyBinOp(Opcode: Opc, LHS: Op1, RHS: Op2, Q: DL);
1598
1599 // An SVE intrinsic's result is always defined. However, this is not the case
1600 // for its equivalent IR instruction (e.g. when shifting by an amount more
1601 // than the data's bitwidth). Simplifications to an undefined result must be
1602 // ignored to preserve the intrinsic's expected behaviour.
1603 if (!SimpleII || isa<UndefValue>(Val: SimpleII))
1604 return std::nullopt;
1605
1606 if (IInfo.inactiveLanesAreNotDefined())
1607 return IC.replaceInstUsesWith(I&: II, V: SimpleII);
1608
1609 Value *Inactive = II.getOperand(i_nocapture: IInfo.getOperandIdxInactiveLanesTakenFrom());
1610
1611 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1612 if (SimpleII == Inactive)
1613 return IC.replaceInstUsesWith(I&: II, V: SimpleII);
1614
1615 // Inactive lanes must be preserved.
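// For example (illustrative): sve.and(pg, A, splat(0)) folds to zero for the
// active lanes, but the inactive lanes must still come from A, so the result
// becomes select(pg, splat(0), A).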
1616 SimpleII = IC.Builder.CreateSelect(C: Pg, True: SimpleII, False: Inactive);
1617 return IC.replaceInstUsesWith(I&: II, V: SimpleII);
1618}
1619
1620// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1621// to operations with less strict inactive lane requirements.
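// For example (illustrative):
//   sve.add(pfalse, A, B) --> A           (no active lanes; the result takes
//                                          its value from the inactive operand)
//   sve.add(ptrue, A, B)  --> sve.add.u   (no inactive lanes; switch to the
//                                          less constrained "_u" form)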
1622static std::optional<Instruction *>
1623simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II,
1624 const SVEIntrinsicInfo &IInfo) {
1625 if (!IInfo.hasGoverningPredicate())
1626 return std::nullopt;
1627
1628 auto *OpPredicate = II.getOperand(i_nocapture: IInfo.getGoverningPredicateOperandIdx());
1629
1630 // If there are no active lanes.
1631 if (match(V: OpPredicate, P: m_ZeroInt())) {
1632 if (IInfo.inactiveLanesTakenFromOperand())
1633 return IC.replaceInstUsesWith(
1634 I&: II, V: II.getOperand(i_nocapture: IInfo.getOperandIdxInactiveLanesTakenFrom()));
1635
1636 if (IInfo.inactiveLanesAreUnused()) {
1637 if (IInfo.resultIsZeroInitialized())
1638 IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty: II.getType()));
1639
1640 return IC.eraseInstFromFunction(I&: II);
1641 }
1642 }
1643
1644 // If there are no inactive lanes.
1645 if (isAllActivePredicate(Pred: OpPredicate)) {
1646 if (IInfo.hasOperandWithNoActiveLanes()) {
1647 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1648 if (!isa<UndefValue>(Val: II.getOperand(i_nocapture: OpIdx)))
1649 return IC.replaceOperand(I&: II, OpNum: OpIdx, V: UndefValue::get(T: II.getType()));
1650 }
1651
1652 if (IInfo.hasMatchingUndefIntrinsic()) {
1653 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1654 M: II.getModule(), id: IInfo.getMatchingUndefIntrinsic(), Tys: {II.getType()});
1655 II.setCalledFunction(NewDecl);
1656 return &II;
1657 }
1658 }
1659
1660 // Operation specific simplifications.
1661 if (IInfo.hasMatchingIROpode() &&
1662 Instruction::isBinaryOp(Opcode: IInfo.getMatchingIROpode()))
1663 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1664
1665 return std::nullopt;
1666}
1667
1668 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
1669// => (binop (pred) (from_svbool _) (from_svbool _))
1670//
1671// The above transformation eliminates a `to_svbool` in the predicate
1672// operand of bitwise operation `binop` by narrowing the vector width of
1673// the operation. For example, it would convert a `<vscale x 16 x i1>
1674// and` into a `<vscale x 4 x i1> and`. This is profitable because
1675// to_svbool must zero the new lanes during widening, whereas
1676// from_svbool is free.
1677static std::optional<Instruction *>
1678tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
1679 auto BinOp = dyn_cast<IntrinsicInst>(Val: II.getOperand(i_nocapture: 0));
1680 if (!BinOp)
1681 return std::nullopt;
1682
1683 auto IntrinsicID = BinOp->getIntrinsicID();
1684 switch (IntrinsicID) {
1685 case Intrinsic::aarch64_sve_and_z:
1686 case Intrinsic::aarch64_sve_bic_z:
1687 case Intrinsic::aarch64_sve_eor_z:
1688 case Intrinsic::aarch64_sve_nand_z:
1689 case Intrinsic::aarch64_sve_nor_z:
1690 case Intrinsic::aarch64_sve_orn_z:
1691 case Intrinsic::aarch64_sve_orr_z:
1692 break;
1693 default:
1694 return std::nullopt;
1695 }
1696
1697 auto BinOpPred = BinOp->getOperand(i_nocapture: 0);
1698 auto BinOpOp1 = BinOp->getOperand(i_nocapture: 1);
1699 auto BinOpOp2 = BinOp->getOperand(i_nocapture: 2);
1700
1701 auto PredIntr = dyn_cast<IntrinsicInst>(Val: BinOpPred);
1702 if (!PredIntr ||
1703 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1704 return std::nullopt;
1705
1706 auto PredOp = PredIntr->getOperand(i_nocapture: 0);
1707 auto PredOpTy = cast<VectorType>(Val: PredOp->getType());
1708 if (PredOpTy != II.getType())
1709 return std::nullopt;
1710
1711 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1712 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1713 ID: Intrinsic::aarch64_sve_convert_from_svbool, Types: {PredOpTy}, Args: {BinOpOp1});
1714 NarrowedBinOpArgs.push_back(Elt: NarrowBinOpOp1);
1715 if (BinOpOp1 == BinOpOp2)
1716 NarrowedBinOpArgs.push_back(Elt: NarrowBinOpOp1);
1717 else
1718 NarrowedBinOpArgs.push_back(Elt: IC.Builder.CreateIntrinsic(
1719 ID: Intrinsic::aarch64_sve_convert_from_svbool, Types: {PredOpTy}, Args: {BinOpOp2}));
1720
1721 auto NarrowedBinOp =
1722 IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {PredOpTy}, Args: NarrowedBinOpArgs);
1723 return IC.replaceInstUsesWith(I&: II, V: NarrowedBinOp);
1724}
1725
1726static std::optional<Instruction *>
1727instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
1728 // If the reinterpret instruction operand is a PHI Node
1729 if (isa<PHINode>(Val: II.getArgOperand(i: 0)))
1730 return processPhiNode(IC, II);
1731
1732 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1733 return BinOpCombine;
1734
1735 // Ignore converts to/from svcount_t.
1736 if (isa<TargetExtType>(Val: II.getArgOperand(i: 0)->getType()) ||
1737 isa<TargetExtType>(Val: II.getType()))
1738 return std::nullopt;
1739
1740 SmallVector<Instruction *, 32> CandidatesForRemoval;
1741 Value *Cursor = II.getOperand(i_nocapture: 0), *EarliestReplacement = nullptr;
1742
1743 const auto *IVTy = cast<VectorType>(Val: II.getType());
1744
1745 // Walk the chain of conversions.
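// For example (illustrative), given the chain
//   %a = ... <vscale x 4 x i1> ...
//   %b = to_svbool(%a)     ; <vscale x 16 x i1>
//   %c = from_svbool(%b)   ; <vscale x 4 x i1>
//   %d = to_svbool(%c)     ; <vscale x 16 x i1>
//   %II = from_svbool(%d)  ; <vscale x 4 x i1>
// no step narrows below the result's lane count, so %II can be replaced by
// %a, the earliest value of the same type.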
1746 while (Cursor) {
1747 // If the type of the cursor has fewer lanes than the final result, zeroing
1748 // must take place, which breaks the equivalence chain.
1749 const auto *CursorVTy = cast<VectorType>(Val: Cursor->getType());
1750 if (CursorVTy->getElementCount().getKnownMinValue() <
1751 IVTy->getElementCount().getKnownMinValue())
1752 break;
1753
1754 // If the cursor has the same type as I, it is a viable replacement.
1755 if (Cursor->getType() == IVTy)
1756 EarliestReplacement = Cursor;
1757
1758 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Val: Cursor);
1759
1760 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1761 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1762 Intrinsic::aarch64_sve_convert_to_svbool ||
1763 IntrinsicCursor->getIntrinsicID() ==
1764 Intrinsic::aarch64_sve_convert_from_svbool))
1765 break;
1766
1767 CandidatesForRemoval.insert(I: CandidatesForRemoval.begin(), Elt: IntrinsicCursor);
1768 Cursor = IntrinsicCursor->getOperand(i_nocapture: 0);
1769 }
1770
1771 // If no viable replacement in the conversion chain was found, there is
1772 // nothing to do.
1773 if (!EarliestReplacement)
1774 return std::nullopt;
1775
1776 return IC.replaceInstUsesWith(I&: II, V: EarliestReplacement);
1777}
1778
1779static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1780 IntrinsicInst &II) {
1781 // svsel(ptrue, x, y) => x
1782 auto *OpPredicate = II.getOperand(i_nocapture: 0);
1783 if (isAllActivePredicate(Pred: OpPredicate))
1784 return IC.replaceInstUsesWith(I&: II, V: II.getOperand(i_nocapture: 1));
1785
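// Otherwise lower svsel to a plain IR select; the lane-wise semantics match.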
1786 auto Select =
1787 IC.Builder.CreateSelect(C: OpPredicate, True: II.getOperand(i_nocapture: 1), False: II.getOperand(i_nocapture: 2));
1788 return IC.replaceInstUsesWith(I&: II, V: Select);
1789}
1790
1791static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1792 IntrinsicInst &II) {
1793 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(Val: II.getArgOperand(i: 1));
1794 if (!Pg)
1795 return std::nullopt;
1796
1797 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1798 return std::nullopt;
1799
1800 const auto PTruePattern =
1801 cast<ConstantInt>(Val: Pg->getOperand(i_nocapture: 0))->getZExtValue();
1802 if (PTruePattern != AArch64SVEPredPattern::vl1)
1803 return std::nullopt;
1804
1805 // The intrinsic is inserting into lane zero so use an insert instead.
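// i.e. (illustrative) sve.dup(V, ptrue(vl1), X) --> insertelement V, X, i64 0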
1806 auto *IdxTy = Type::getInt64Ty(C&: II.getContext());
1807 auto *Insert = InsertElementInst::Create(
1808 Vec: II.getArgOperand(i: 0), NewElt: II.getArgOperand(i: 2), Idx: ConstantInt::get(Ty: IdxTy, V: 0));
1809 Insert->insertBefore(InsertPos: II.getIterator());
1810 Insert->takeName(V: &II);
1811
1812 return IC.replaceInstUsesWith(I&: II, V: Insert);
1813}
1814
1815static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1816 IntrinsicInst &II) {
1817 // Replace DupX with a regular IR splat.
1818 auto *RetTy = cast<ScalableVectorType>(Val: II.getType());
1819 Value *Splat = IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(),
1820 V: II.getArgOperand(i: 0));
1821 Splat->takeName(V: &II);
1822 return IC.replaceInstUsesWith(I&: II, V: Splat);
1823}
1824
1825static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1826 IntrinsicInst &II) {
1827 LLVMContext &Ctx = II.getContext();
1828
1829 if (!isAllActivePredicate(Pred: II.getArgOperand(i: 0)))
1830 return std::nullopt;
1831
1832 // Check that we have a compare of zero..
1833 auto *SplatValue =
1834 dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: II.getArgOperand(i: 2)));
1835 if (!SplatValue || !SplatValue->isZero())
1836 return std::nullopt;
1837
1838 // ..against a dupq
1839 auto *DupQLane = dyn_cast<IntrinsicInst>(Val: II.getArgOperand(i: 1));
1840 if (!DupQLane ||
1841 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1842 return std::nullopt;
1843
1844 // Where the dupq is a lane 0 replicate of a vector insert
1845 auto *DupQLaneIdx = dyn_cast<ConstantInt>(Val: DupQLane->getArgOperand(i: 1));
1846 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
1847 return std::nullopt;
1848
1849 auto *VecIns = dyn_cast<IntrinsicInst>(Val: DupQLane->getArgOperand(i: 0));
1850 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1851 return std::nullopt;
1852
1853 // Where the vector insert is a fixed constant vector insert into undef at
1854 // index zero
1855 if (!isa<UndefValue>(Val: VecIns->getArgOperand(i: 0)))
1856 return std::nullopt;
1857
1858 if (!cast<ConstantInt>(Val: VecIns->getArgOperand(i: 2))->isZero())
1859 return std::nullopt;
1860
1861 auto *ConstVec = dyn_cast<Constant>(Val: VecIns->getArgOperand(i: 1));
1862 if (!ConstVec)
1863 return std::nullopt;
1864
1865 auto *VecTy = dyn_cast<FixedVectorType>(Val: ConstVec->getType());
1866 auto *OutTy = dyn_cast<ScalableVectorType>(Val: II.getType());
1867 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1868 return std::nullopt;
1869
1870 unsigned NumElts = VecTy->getNumElements();
1871 unsigned PredicateBits = 0;
1872
1873 // Expand the intrinsic operands into a 16-bit, byte-level predicate mask.
1874 for (unsigned I = 0; I < NumElts; ++I) {
1875 auto *Arg = dyn_cast<ConstantInt>(Val: ConstVec->getAggregateElement(Elt: I));
1876 if (!Arg)
1877 return std::nullopt;
1878 if (!Arg->isZero())
1879 PredicateBits |= 1 << (I * (16 / NumElts));
1880 }
1881
1882 // If all bits are zero, bail out early with an all-false predicate.
1883 if (PredicateBits == 0) {
1884 auto *PFalse = Constant::getNullValue(Ty: II.getType());
1885 PFalse->takeName(V: &II);
1886 return IC.replaceInstUsesWith(I&: II, V: PFalse);
1887 }
1888
1889 // Calculate largest predicate type used (where byte predicate is largest)
1890 unsigned Mask = 8;
1891 for (unsigned I = 0; I < 16; ++I)
1892 if ((PredicateBits & (1 << I)) != 0)
1893 Mask |= (I % 8);
1894
1895 unsigned PredSize = Mask & -Mask;
1896 auto *PredType = ScalableVectorType::get(
1897 ElementType: Type::getInt1Ty(C&: Ctx), MinNumElts: AArch64::SVEBitsPerBlock / (PredSize * 8));
1898
1899 // Ensure all relevant bits are set
1900 for (unsigned I = 0; I < 16; I += PredSize)
1901 if ((PredicateBits & (1 << I)) == 0)
1902 return std::nullopt;
1903
1904 auto *PTruePat =
1905 ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: AArch64SVEPredPattern::all);
1906 auto *PTrue = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue,
1907 Types: {PredType}, Args: {PTruePat});
1908 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
1909 ID: Intrinsic::aarch64_sve_convert_to_svbool, Types: {PredType}, Args: {PTrue});
1910 auto *ConvertFromSVBool =
1911 IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_convert_from_svbool,
1912 Types: {II.getType()}, Args: {ConvertToSVBool});
1913
1914 ConvertFromSVBool->takeName(V: &II);
1915 return IC.replaceInstUsesWith(I&: II, V: ConvertFromSVBool);
1916}
1917
1918static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1919 IntrinsicInst &II) {
1920 Value *Pg = II.getArgOperand(i: 0);
1921 Value *Vec = II.getArgOperand(i: 1);
1922 auto IntrinsicID = II.getIntrinsicID();
1923 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
1924
1925 // lastX(splat(X)) --> X
1926 if (auto *SplatVal = getSplatValue(V: Vec))
1927 return IC.replaceInstUsesWith(I&: II, V: SplatVal);
1928
1929 // If x and/or y is a splat value then:
1930 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1931 Value *LHS, *RHS;
1932 if (match(V: Vec, P: m_OneUse(SubPattern: m_BinOp(L: m_Value(V&: LHS), R: m_Value(V&: RHS))))) {
1933 if (isSplatValue(V: LHS) || isSplatValue(V: RHS)) {
1934 auto *OldBinOp = cast<BinaryOperator>(Val: Vec);
1935 auto OpC = OldBinOp->getOpcode();
1936 auto *NewLHS =
1937 IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {Vec->getType()}, Args: {Pg, LHS});
1938 auto *NewRHS =
1939 IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {Vec->getType()}, Args: {Pg, RHS});
1940 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
1941 Opc: OpC, V1: NewLHS, V2: NewRHS, CopyO: OldBinOp, Name: OldBinOp->getName(), InsertBefore: II.getIterator());
1942 return IC.replaceInstUsesWith(I&: II, V: NewBinOp);
1943 }
1944 }
1945
1946 auto *C = dyn_cast<Constant>(Val: Pg);
1947 if (IsAfter && C && C->isNullValue()) {
1948 // The intrinsic is extracting lane 0 so use an extract instead.
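// e.g. (illustrative) lasta(pfalse, V) --> extractelement V, i64 0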
1949 auto *IdxTy = Type::getInt64Ty(C&: II.getContext());
1950 auto *Extract = ExtractElementInst::Create(Vec, Idx: ConstantInt::get(Ty: IdxTy, V: 0));
1951 Extract->insertBefore(InsertPos: II.getIterator());
1952 Extract->takeName(V: &II);
1953 return IC.replaceInstUsesWith(I&: II, V: Extract);
1954 }
1955
1956 auto *IntrPG = dyn_cast<IntrinsicInst>(Val: Pg);
1957 if (!IntrPG)
1958 return std::nullopt;
1959
1960 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1961 return std::nullopt;
1962
1963 const auto PTruePattern =
1964 cast<ConstantInt>(Val: IntrPG->getOperand(i_nocapture: 0))->getZExtValue();
1965
1966 // Can the intrinsic's predicate be converted to a known constant index?
1967 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern: PTruePattern);
1968 if (!MinNumElts)
1969 return std::nullopt;
1970
1971 unsigned Idx = MinNumElts - 1;
1972 // Increment the index if extracting the element after the last active
1973 // predicate element.
1974 if (IsAfter)
1975 ++Idx;
1976
1977 // Ignore extracts whose index is larger than the known minimum vector
1978 // length. NOTE: This is an artificial constraint where we prefer to
1979 // maintain what the user asked for until an alternative is proven faster.
1980 auto *PgVTy = cast<ScalableVectorType>(Val: Pg->getType());
1981 if (Idx >= PgVTy->getMinNumElements())
1982 return std::nullopt;
1983
1984 // The intrinsic is extracting a fixed lane so use an extract instead.
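// e.g. (illustrative) lastb(ptrue(vl4), V) --> extractelement V, i64 3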
1985 auto *IdxTy = Type::getInt64Ty(C&: II.getContext());
1986 auto *Extract = ExtractElementInst::Create(Vec, Idx: ConstantInt::get(Ty: IdxTy, V: Idx));
1987 Extract->insertBefore(InsertPos: II.getIterator());
1988 Extract->takeName(V: &II);
1989 return IC.replaceInstUsesWith(I&: II, V: Extract);
1990}
1991
1992static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
1993 IntrinsicInst &II) {
1994 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
1995 // integer variant across a variety of micro-architectures. Replace scalar
1996 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
1997 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1998 // depending on the micro-architecture, but has been observed as generally
1999 // being faster, particularly when the CLAST[AB] op is a loop-carried
2000 // dependency.
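// For example (illustrative, 32-bit integer case):
//   clasta.n(pg, i32 %fb, <vscale x 4 x i32> %v)
//     --> bitcast (clasta.n(pg, bitcast %fb to float,
//                           bitcast %v to <vscale x 4 x float>)) to i32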
2001 Value *Pg = II.getArgOperand(i: 0);
2002 Value *Fallback = II.getArgOperand(i: 1);
2003 Value *Vec = II.getArgOperand(i: 2);
2004 Type *Ty = II.getType();
2005
2006 if (!Ty->isIntegerTy())
2007 return std::nullopt;
2008
2009 Type *FPTy;
2010 switch (cast<IntegerType>(Val: Ty)->getBitWidth()) {
2011 default:
2012 return std::nullopt;
2013 case 16:
2014 FPTy = IC.Builder.getHalfTy();
2015 break;
2016 case 32:
2017 FPTy = IC.Builder.getFloatTy();
2018 break;
2019 case 64:
2020 FPTy = IC.Builder.getDoubleTy();
2021 break;
2022 }
2023
2024 Value *FPFallBack = IC.Builder.CreateBitCast(V: Fallback, DestTy: FPTy);
2025 auto *FPVTy = VectorType::get(
2026 ElementType: FPTy, EC: cast<VectorType>(Val: Vec->getType())->getElementCount());
2027 Value *FPVec = IC.Builder.CreateBitCast(V: Vec, DestTy: FPVTy);
2028 auto *FPII = IC.Builder.CreateIntrinsic(
2029 ID: II.getIntrinsicID(), Types: {FPVec->getType()}, Args: {Pg, FPFallBack, FPVec});
2030 Value *FPIItoInt = IC.Builder.CreateBitCast(V: FPII, DestTy: II.getType());
2031 return IC.replaceInstUsesWith(I&: II, V: FPIItoInt);
2032}
2033
2034static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2035 IntrinsicInst &II) {
2036 LLVMContext &Ctx = II.getContext();
2037 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2038 // can work with RDFFR_PP for ptest elimination.
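// i.e. (illustrative) rdffr() --> rdffr.z(ptrue(all))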
2039 auto *AllPat =
2040 ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: AArch64SVEPredPattern::all);
2041 auto *PTrue = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue,
2042 Types: {II.getType()}, Args: {AllPat});
2043 auto *RDFFR =
2044 IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_rdffr_z, Args: {PTrue});
2045 RDFFR->takeName(V: &II);
2046 return IC.replaceInstUsesWith(I&: II, V: RDFFR);
2047}
2048
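// For example (illustrative): cntw(all) --> vscale * 4, and cntw(vl4) --> 4,
// since four 32-bit elements always fit within the minimum vector length.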
2049static std::optional<Instruction *>
2050instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
2051 const auto Pattern = cast<ConstantInt>(Val: II.getArgOperand(i: 0))->getZExtValue();
2052
2053 if (Pattern == AArch64SVEPredPattern::all) {
2054 Value *Cnt = IC.Builder.CreateElementCount(
2055 Ty: II.getType(), EC: ElementCount::getScalable(MinVal: NumElts));
2056 Cnt->takeName(V: &II);
2057 return IC.replaceInstUsesWith(I&: II, V: Cnt);
2058 }
2059
2060 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2061
2062 return MinNumElts && NumElts >= MinNumElts
2063 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2064 I&: II, V: ConstantInt::get(Ty: II.getType(), V: MinNumElts)))
2065 : std::nullopt;
2066}
2067
2068static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2069 IntrinsicInst &II) {
2070 Value *PgVal = II.getArgOperand(i: 0);
2071 Value *OpVal = II.getArgOperand(i: 1);
2072
2073 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2074 // Later optimizations prefer this form.
2075 if (PgVal == OpVal &&
2076 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2077 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2078 Value *Ops[] = {PgVal, OpVal};
2079 Type *Tys[] = {PgVal->getType()};
2080
2081 auto *PTest =
2082 IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptest_any, Types: Tys, Args: Ops);
2083 PTest->takeName(V: &II);
2084
2085 return IC.replaceInstUsesWith(I&: II, V: PTest);
2086 }
2087
2088 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(Val: PgVal);
2089 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(Val: OpVal);
2090
2091 if (!Pg || !Op)
2092 return std::nullopt;
2093
2094 Intrinsic::ID OpIID = Op->getIntrinsicID();
2095
2096 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2097 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2098 Pg->getArgOperand(i: 0)->getType() == Op->getArgOperand(i: 0)->getType()) {
2099 Value *Ops[] = {Pg->getArgOperand(i: 0), Op->getArgOperand(i: 0)};
2100 Type *Tys[] = {Pg->getArgOperand(i: 0)->getType()};
2101
2102 auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: Tys, Args: Ops);
2103
2104 PTest->takeName(V: &II);
2105 return IC.replaceInstUsesWith(I&: II, V: PTest);
2106 }
2107
2108 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X).
2109 // Later optimizations may rewrite the sequence to use the flag-setting
2110 // variant of instruction X to remove the PTEST.
2111 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2112 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2113 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2114 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2115 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2116 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2117 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2118 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2119 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2120 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2121 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2122 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2123 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2124 Value *Ops[] = {Pg->getArgOperand(i: 0), Pg};
2125 Type *Tys[] = {Pg->getType()};
2126
2127 auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: Tys, Args: Ops);
2128 PTest->takeName(V: &II);
2129
2130 return IC.replaceInstUsesWith(I&: II, V: PTest);
2131 }
2132
2133 return std::nullopt;
2134}
2135
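// Fold a predicated multiply feeding a predicated add/sub into a single fused
// multiply-accumulate intrinsic (subject to the fast-math checks below), e.g.
// (illustrative):
//   fadd(p, a, fmul(p, b, c)) --> fmla(p, a, b, c)
//   fadd(p, fmul(p, b, c), a) --> fmad(p, b, c, a)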
2136template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
2137static std::optional<Instruction *>
2138instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
2139 bool MergeIntoAddendOp) {
2140 Value *P = II.getOperand(i_nocapture: 0);
2141 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2142 if (MergeIntoAddendOp) {
2143 AddendOp = II.getOperand(i_nocapture: 1);
2144 Mul = II.getOperand(i_nocapture: 2);
2145 } else {
2146 AddendOp = II.getOperand(i_nocapture: 2);
2147 Mul = II.getOperand(i_nocapture: 1);
2148 }
2149
2150 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(V: P), m_Value(V&: MulOp0),
2151 m_Value(V&: MulOp1))))
2152 return std::nullopt;
2153
2154 if (!Mul->hasOneUse())
2155 return std::nullopt;
2156
2157 Instruction *FMFSource = nullptr;
2158 if (II.getType()->isFPOrFPVectorTy()) {
2159 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2160 // Stop the combine when the flags on the inputs differ, in case dropping
2161 // flags would cause us to miss more beneficial optimizations.
2162 if (FAddFlags != cast<CallInst>(Val: Mul)->getFastMathFlags())
2163 return std::nullopt;
2164 if (!FAddFlags.allowContract())
2165 return std::nullopt;
2166 FMFSource = &II;
2167 }
2168
2169 CallInst *Res;
2170 if (MergeIntoAddendOp)
2171 Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, Types: {II.getType()},
2172 Args: {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2173 else
2174 Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, Types: {II.getType()},
2175 Args: {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2176
2177 return IC.replaceInstUsesWith(I&: II, V: Res);
2178}
2179
2180static std::optional<Instruction *>
2181instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2182 Value *Pred = II.getOperand(i_nocapture: 0);
2183 Value *PtrOp = II.getOperand(i_nocapture: 1);
2184 Type *VecTy = II.getType();
2185
2186 if (isAllActivePredicate(Pred)) {
2187 LoadInst *Load = IC.Builder.CreateLoad(Ty: VecTy, Ptr: PtrOp);
2188 Load->copyMetadata(SrcInst: II);
2189 return IC.replaceInstUsesWith(I&: II, V: Load);
2190 }
2191
2192 CallInst *MaskedLoad =
2193 IC.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL),
2194 Mask: Pred, PassThru: ConstantAggregateZero::get(Ty: VecTy));
2195 MaskedLoad->copyMetadata(SrcInst: II);
2196 return IC.replaceInstUsesWith(I&: II, V: MaskedLoad);
2197}
2198
2199static std::optional<Instruction *>
2200instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2201 Value *VecOp = II.getOperand(i_nocapture: 0);
2202 Value *Pred = II.getOperand(i_nocapture: 1);
2203 Value *PtrOp = II.getOperand(i_nocapture: 2);
2204
2205 if (isAllActivePredicate(Pred)) {
2206 StoreInst *Store = IC.Builder.CreateStore(Val: VecOp, Ptr: PtrOp);
2207 Store->copyMetadata(SrcInst: II);
2208 return IC.eraseInstFromFunction(I&: II);
2209 }
2210
2211 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2212 Val: VecOp, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL), Mask: Pred);
2213 MaskedStore->copyMetadata(SrcInst: II);
2214 return IC.eraseInstFromFunction(I&: II);
2215}
2216
2217static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
2218 switch (Intrinsic) {
2219 case Intrinsic::aarch64_sve_fmul_u:
2220 return Instruction::BinaryOps::FMul;
2221 case Intrinsic::aarch64_sve_fadd_u:
2222 return Instruction::BinaryOps::FAdd;
2223 case Intrinsic::aarch64_sve_fsub_u:
2224 return Instruction::BinaryOps::FSub;
2225 default:
2226 return Instruction::BinaryOpsEnd;
2227 }
2228}
2229
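// With an all-active predicate the unpredicated-form FP intrinsics behave like
// plain IR instructions, e.g. (illustrative)
//   fmul.u(ptrue(all), A, B) --> fmul A, B   (fast-math flags preserved)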
2230static std::optional<Instruction *>
2231instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
2232 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2233 if (II.isStrictFP())
2234 return std::nullopt;
2235
2236 auto *OpPredicate = II.getOperand(i_nocapture: 0);
2237 auto BinOpCode = intrinsicIDToBinOpCode(Intrinsic: II.getIntrinsicID());
2238 if (BinOpCode == Instruction::BinaryOpsEnd ||
2239 !isAllActivePredicate(Pred: OpPredicate))
2240 return std::nullopt;
2241 auto BinOp = IC.Builder.CreateBinOpFMF(
2242 Opc: BinOpCode, LHS: II.getOperand(i_nocapture: 1), RHS: II.getOperand(i_nocapture: 2), FMFSource: II.getFastMathFlags());
2243 return IC.replaceInstUsesWith(I&: II, V: BinOp);
2244}
2245
2246static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2247 IntrinsicInst &II) {
2248 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2249 Intrinsic::aarch64_sve_mla>(
2250 IC, II, MergeIntoAddendOp: true))
2251 return MLA;
2252 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2253 Intrinsic::aarch64_sve_mad>(
2254 IC, II, MergeIntoAddendOp: false))
2255 return MAD;
2256 return std::nullopt;
2257}
2258
2259static std::optional<Instruction *>
2260instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
2261 if (auto FMLA =
2262 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2263 Intrinsic::aarch64_sve_fmla>(IC, II,
2264 MergeIntoAddendOp: true))
2265 return FMLA;
2266 if (auto FMAD =
2267 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2268 Intrinsic::aarch64_sve_fmad>(IC, II,
2269 MergeIntoAddendOp: false))
2270 return FMAD;
2271 if (auto FMLA =
2272 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2273 Intrinsic::aarch64_sve_fmla>(IC, II,
2274 MergeIntoAddendOp: true))
2275 return FMLA;
2276 return std::nullopt;
2277}
2278
2279static std::optional<Instruction *>
2280instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
2281 if (auto FMLA =
2282 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2283 Intrinsic::aarch64_sve_fmla>(IC, II,
2284 MergeIntoAddendOp: true))
2285 return FMLA;
2286 if (auto FMAD =
2287 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2288 Intrinsic::aarch64_sve_fmad>(IC, II,
2289 MergeIntoAddendOp: false))
2290 return FMAD;
2291 if (auto FMLA_U =
2292 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2293 Intrinsic::aarch64_sve_fmla_u>(
2294 IC, II, MergeIntoAddendOp: true))
2295 return FMLA_U;
2296 return instCombineSVEVectorBinOp(IC, II);
2297}
2298
2299static std::optional<Instruction *>
2300instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
2301 if (auto FMLS =
2302 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2303 Intrinsic::aarch64_sve_fmls>(IC, II,
2304 MergeIntoAddendOp: true))
2305 return FMLS;
2306 if (auto FMSB =
2307 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2308 Intrinsic::aarch64_sve_fnmsb>(
2309 IC, II, MergeIntoAddendOp: false))
2310 return FMSB;
2311 if (auto FMLS =
2312 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2313 Intrinsic::aarch64_sve_fmls>(IC, II,
2314 MergeIntoAddendOp: true))
2315 return FMLS;
2316 return std::nullopt;
2317}
2318
2319static std::optional<Instruction *>
2320instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
2321 if (auto FMLS =
2322 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2323 Intrinsic::aarch64_sve_fmls>(IC, II,
2324 MergeIntoAddendOp: true))
2325 return FMLS;
2326 if (auto FMSB =
2327 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2328 Intrinsic::aarch64_sve_fnmsb>(
2329 IC, II, MergeIntoAddendOp: false))
2330 return FMSB;
2331 if (auto FMLS_U =
2332 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2333 Intrinsic::aarch64_sve_fmls_u>(
2334 IC, II, MergeIntoAddendOp: true))
2335 return FMLS_U;
2336 return instCombineSVEVectorBinOp(IC, II);
2337}
2338
2339static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2340 IntrinsicInst &II) {
2341 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2342 Intrinsic::aarch64_sve_mls>(
2343 IC, II, MergeIntoAddendOp: true))
2344 return MLS;
2345 return std::nullopt;
2346}
2347
2348static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2349 IntrinsicInst &II) {
2350 Value *UnpackArg = II.getArgOperand(i: 0);
2351 auto *RetTy = cast<ScalableVectorType>(Val: II.getType());
2352 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2353 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2354
2355 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2356 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2357 if (auto *ScalarArg = getSplatValue(V: UnpackArg)) {
2358 ScalarArg =
2359 IC.Builder.CreateIntCast(V: ScalarArg, DestTy: RetTy->getScalarType(), isSigned: IsSigned);
2360 Value *NewVal =
2361 IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(), V: ScalarArg);
2362 NewVal->takeName(V: &II);
2363 return IC.replaceInstUsesWith(I&: II, V: NewVal);
2364 }
2365
2366 return std::nullopt;
2367}
2368static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2369 IntrinsicInst &II) {
2370 auto *OpVal = II.getOperand(i_nocapture: 0);
2371 auto *OpIndices = II.getOperand(i_nocapture: 1);
2372 VectorType *VTy = cast<VectorType>(Val: II.getType());
2373
2374 // Check whether OpIndices is a constant splat value less than the minimum
2375 // element count of the result.
2376 auto *SplatValue = dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: OpIndices));
2377 if (!SplatValue ||
2378 SplatValue->getValue().uge(RHS: VTy->getElementCount().getKnownMinValue()))
2379 return std::nullopt;
2380
2381 // Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to
2382 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2383 auto *Extract = IC.Builder.CreateExtractElement(Vec: OpVal, Idx: SplatValue);
2384 auto *VectorSplat =
2385 IC.Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Extract);
2386
2387 VectorSplat->takeName(V: &II);
2388 return IC.replaceInstUsesWith(I&: II, V: VectorSplat);
2389}
2390
2391static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2392 IntrinsicInst &II) {
2393 Value *A, *B;
2394 Type *RetTy = II.getType();
2395 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2396 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2397
2398 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2399 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2400 if ((match(V: II.getArgOperand(i: 0),
2401 P: m_Intrinsic<FromSVB>(Op0: m_Intrinsic<ToSVB>(Op0: m_Value(V&: A)))) &&
2402 match(V: II.getArgOperand(i: 1),
2403 P: m_Intrinsic<FromSVB>(Op0: m_Intrinsic<ToSVB>(Op0: m_Value(V&: B))))) ||
2404 (match(V: II.getArgOperand(i: 0), P: m_Intrinsic<ToSVB>(Op0: m_Value(V&: A))) &&
2405 match(V: II.getArgOperand(i: 1), P: m_Intrinsic<ToSVB>(Op0: m_Value(V&: B))))) {
2406 auto *TyA = cast<ScalableVectorType>(Val: A->getType());
2407 if (TyA == B->getType() &&
2408 RetTy == ScalableVectorType::getDoubleElementsVectorType(VTy: TyA)) {
2409 auto *SubVec = IC.Builder.CreateInsertVector(
2410 DstType: RetTy, SrcVec: PoisonValue::get(T: RetTy), SubVec: A, Idx: uint64_t(0));
2411 auto *ConcatVec = IC.Builder.CreateInsertVector(DstType: RetTy, SrcVec: SubVec, SubVec: B,
2412 Idx: TyA->getMinNumElements());
2413 ConcatVec->takeName(V: &II);
2414 return IC.replaceInstUsesWith(I&: II, V: ConcatVec);
2415 }
2416 }
2417
2418 return std::nullopt;
2419}
2420
2421static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2422 IntrinsicInst &II) {
2423 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2424 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2425 Value *A, *B;
2426 if (match(V: II.getArgOperand(i: 0),
2427 P: m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(Op0: m_Value(V&: A), Op1: m_Value(V&: B))) &&
2428 match(V: II.getArgOperand(i: 1), P: m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
2429 Op0: m_Specific(V: A), Op1: m_Specific(V: B))))
2430 return IC.replaceInstUsesWith(
2431 I&: II, V: (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2432
2433 return std::nullopt;
2434}
2435
2436static std::optional<Instruction *>
2437instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
2438 Value *Mask = II.getOperand(i_nocapture: 0);
2439 Value *BasePtr = II.getOperand(i_nocapture: 1);
2440 Value *Index = II.getOperand(i_nocapture: 2);
2441 Type *Ty = II.getType();
2442 Value *PassThru = ConstantAggregateZero::get(Ty);
2443
2444 // Contiguous gather => masked load.
2445 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2446 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2447 Value *IndexBase;
2448 if (match(V: Index, P: m_Intrinsic<Intrinsic::aarch64_sve_index>(
2449 Op0: m_Value(V&: IndexBase), Op1: m_SpecificInt(V: 1)))) {
2450 Align Alignment =
2451 BasePtr->getPointerAlignment(DL: II.getDataLayout());
2452
2453 Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(),
2454 Ptr: BasePtr, IdxList: IndexBase);
2455 CallInst *MaskedLoad =
2456 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2457 MaskedLoad->takeName(V: &II);
2458 return IC.replaceInstUsesWith(I&: II, V: MaskedLoad);
2459 }
2460
2461 return std::nullopt;
2462}
2463
2464static std::optional<Instruction *>
2465instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
2466 Value *Val = II.getOperand(i_nocapture: 0);
2467 Value *Mask = II.getOperand(i_nocapture: 1);
2468 Value *BasePtr = II.getOperand(i_nocapture: 2);
2469 Value *Index = II.getOperand(i_nocapture: 3);
2470 Type *Ty = Val->getType();
2471
2472 // Contiguous scatter => masked store.
2473 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2474 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2475 Value *IndexBase;
2476 if (match(V: Index, P: m_Intrinsic<Intrinsic::aarch64_sve_index>(
2477 Op0: m_Value(V&: IndexBase), Op1: m_SpecificInt(V: 1)))) {
2478 Align Alignment =
2479 BasePtr->getPointerAlignment(DL: II.getDataLayout());
2480
2481 Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(),
2482 Ptr: BasePtr, IdxList: IndexBase);
2483 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2484
2485 return IC.eraseInstFromFunction(I&: II);
2486 }
2487
2488 return std::nullopt;
2489}
2490
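// Fold a signed divide by a power-of-two splat into ASRD (plus a NEG for
// negative divisors), e.g. (illustrative):
//   sdiv(pg, V, splat(8))  --> asrd(pg, V, 3)
//   sdiv(pg, V, splat(-8)) --> neg(pg, asrd(pg, V, 3))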
2491static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2492 IntrinsicInst &II) {
2493 Type *Int32Ty = IC.Builder.getInt32Ty();
2494 Value *Pred = II.getOperand(i_nocapture: 0);
2495 Value *Vec = II.getOperand(i_nocapture: 1);
2496 Value *DivVec = II.getOperand(i_nocapture: 2);
2497
2498 Value *SplatValue = getSplatValue(V: DivVec);
2499 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(Val: SplatValue);
2500 if (!SplatConstantInt)
2501 return std::nullopt;
2502
2503 APInt Divisor = SplatConstantInt->getValue();
2504 const int64_t DivisorValue = Divisor.getSExtValue();
2505 if (DivisorValue == -1)
2506 return std::nullopt;
2507 if (DivisorValue == 1)
2508 IC.replaceInstUsesWith(I&: II, V: Vec);
2509
2510 if (Divisor.isPowerOf2()) {
2511 Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2());
2512 auto ASRD = IC.Builder.CreateIntrinsic(
2513 ID: Intrinsic::aarch64_sve_asrd, Types: {II.getType()}, Args: {Pred, Vec, DivisorLog2});
2514 return IC.replaceInstUsesWith(I&: II, V: ASRD);
2515 }
2516 if (Divisor.isNegatedPowerOf2()) {
2517 Divisor.negate();
2518 Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2());
2519 auto ASRD = IC.Builder.CreateIntrinsic(
2520 ID: Intrinsic::aarch64_sve_asrd, Types: {II.getType()}, Args: {Pred, Vec, DivisorLog2});
2521 auto NEG = IC.Builder.CreateIntrinsic(
2522 ID: Intrinsic::aarch64_sve_neg, Types: {ASRD->getType()}, Args: {ASRD, Pred, ASRD});
2523 return IC.replaceInstUsesWith(I&: II, V: NEG);
2524 }
2525
2526 return std::nullopt;
2527}
2528
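// Reduce a repeating value pattern to its repeating prefix, e.g. (a, b, a, b)
// becomes (a, b). Null entries stand for poison lanes; when AllowPoison is
// set, a null lane in the kept half is filled from the matching lane in the
// other half.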
2529bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2530 size_t VecSize = Vec.size();
2531 if (VecSize == 1)
2532 return true;
2533 if (!isPowerOf2_64(Value: VecSize))
2534 return false;
2535 size_t HalfVecSize = VecSize / 2;
2536
2537 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2538 RHS != Vec.end(); LHS++, RHS++) {
2539 if (*LHS != nullptr && *RHS != nullptr) {
2540 if (*LHS == *RHS)
2541 continue;
2542 else
2543 return false;
2544 }
2545 if (!AllowPoison)
2546 return false;
2547 if (*LHS == nullptr && *RHS != nullptr)
2548 *LHS = *RHS;
2549 }
2550
2551 Vec.resize(N: HalfVecSize);
2552 SimplifyValuePattern(Vec, AllowPoison);
2553 return true;
2554}
2555
2556// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2557// to dupqlane(f64(C)) where C is A concatenated with B
2558static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2559 IntrinsicInst &II) {
2560 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2561 if (!match(V: II.getOperand(i_nocapture: 0),
2562 P: m_Intrinsic<Intrinsic::vector_insert>(
2563 Op0: m_Value(V&: Default), Op1: m_Value(V&: CurrentInsertElt), Op2: m_Value())) ||
2564 !isa<FixedVectorType>(Val: CurrentInsertElt->getType()))
2565 return std::nullopt;
2566 auto IIScalableTy = cast<ScalableVectorType>(Val: II.getType());
2567
2568 // Insert the scalars into a container ordered by InsertElement index
2569 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2570 while (auto InsertElt = dyn_cast<InsertElementInst>(Val: CurrentInsertElt)) {
2571 auto Idx = cast<ConstantInt>(Val: InsertElt->getOperand(i_nocapture: 2));
2572 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(i_nocapture: 1);
2573 CurrentInsertElt = InsertElt->getOperand(i_nocapture: 0);
2574 }
2575
2576 bool AllowPoison =
2577 isa<PoisonValue>(Val: CurrentInsertElt) && isa<PoisonValue>(Val: Default);
2578 if (!SimplifyValuePattern(Vec&: Elts, AllowPoison))
2579 return std::nullopt;
2580
2581 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2582 Value *InsertEltChain = PoisonValue::get(T: CurrentInsertElt->getType());
2583 for (size_t I = 0; I < Elts.size(); I++) {
2584 if (Elts[I] == nullptr)
2585 continue;
2586 InsertEltChain = IC.Builder.CreateInsertElement(Vec: InsertEltChain, NewElt: Elts[I],
2587 Idx: IC.Builder.getInt64(C: I));
2588 }
2589 if (InsertEltChain == nullptr)
2590 return std::nullopt;
2591
2592 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2593 // value or (f16 a, f16 b) as one i32 value. This requires the
2594 // InsertSubvector to be bitcast to a type wide enough to fit the sequence,
2595 // splatted, and then narrowed back to the original type.
2596 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2597 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2598 IIScalableTy->getMinNumElements() /
2599 PatternWidth;
2600
2601 IntegerType *WideTy = IC.Builder.getIntNTy(N: PatternWidth);
2602 auto *WideScalableTy = ScalableVectorType::get(ElementType: WideTy, MinNumElts: PatternElementCount);
2603 auto *WideShuffleMaskTy =
2604 ScalableVectorType::get(ElementType: IC.Builder.getInt32Ty(), MinNumElts: PatternElementCount);
2605
2606 auto InsertSubvector = IC.Builder.CreateInsertVector(
2607 DstType: II.getType(), SrcVec: PoisonValue::get(T: II.getType()), SubVec: InsertEltChain,
2608 Idx: uint64_t(0));
2609 auto WideBitcast =
2610 IC.Builder.CreateBitOrPointerCast(V: InsertSubvector, DestTy: WideScalableTy);
2611 auto WideShuffleMask = ConstantAggregateZero::get(Ty: WideShuffleMaskTy);
2612 auto WideShuffle = IC.Builder.CreateShuffleVector(
2613 V1: WideBitcast, V2: PoisonValue::get(T: WideScalableTy), Mask: WideShuffleMask);
2614 auto NarrowBitcast =
2615 IC.Builder.CreateBitOrPointerCast(V: WideShuffle, DestTy: II.getType());
2616
2617 return IC.replaceInstUsesWith(I&: II, V: NarrowBitcast);
2618}
2619
2620static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2621 IntrinsicInst &II) {
2622 Value *A = II.getArgOperand(i: 0);
2623 Value *B = II.getArgOperand(i: 1);
2624 if (A == B)
2625 return IC.replaceInstUsesWith(I&: II, V: A);
2626
2627 return std::nullopt;
2628}
2629
2630static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2631 IntrinsicInst &II) {
2632 Value *Pred = II.getOperand(i_nocapture: 0);
2633 Value *Vec = II.getOperand(i_nocapture: 1);
2634 Value *Shift = II.getOperand(i_nocapture: 2);
2635
2636 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2637 Value *AbsPred, *MergedValue;
2638 if (!match(V: Vec, P: m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
2639 Op0: m_Value(V&: MergedValue), Op1: m_Value(V&: AbsPred), Op2: m_Value())) &&
2640 !match(V: Vec, P: m_Intrinsic<Intrinsic::aarch64_sve_abs>(
2641 Op0: m_Value(V&: MergedValue), Op1: m_Value(V&: AbsPred), Op2: m_Value())))
2643 return std::nullopt;
2644
2645 // Transform is valid if any of the following are true:
2646 // * The ABS merge value is an undef or non-negative
2647 // * The ABS predicate is all active
2648 // * The ABS predicate and the SRSHL predicates are the same
2649 if (!isa<UndefValue>(Val: MergedValue) && !match(V: MergedValue, P: m_NonNegative()) &&
2650 AbsPred != Pred && !isAllActivePredicate(Pred: AbsPred))
2651 return std::nullopt;
2652
2653 // Only valid when the shift amount is non-negative, otherwise the rounding
2654 // behaviour of SRSHL cannot be ignored.
2655 if (!match(V: Shift, P: m_NonNegative()))
2656 return std::nullopt;
2657
2658 auto LSL = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_lsl,
2659 Types: {II.getType()}, Args: {Pred, Vec, Shift});
2660
2661 return IC.replaceInstUsesWith(I&: II, V: LSL);
2662}
2663
2664static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2665 IntrinsicInst &II) {
2666 Value *Vec = II.getOperand(i_nocapture: 0);
2667
2668 if (getSplatValue(V: Vec) == II.getOperand(i_nocapture: 1))
2669 return IC.replaceInstUsesWith(I&: II, V: Vec);
2670
2671 return std::nullopt;
2672}
2673
2674static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2675 IntrinsicInst &II) {
2676 // If this barrier is post-dominated by an identical barrier, we can remove it.
2677 auto *NI = II.getNextNonDebugInstruction();
2678 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2679 auto CanSkipOver = [](Instruction *I) {
2680 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2681 };
2682 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2683 auto *NIBB = NI->getParent();
2684 NI = NI->getNextNonDebugInstruction();
2685 if (!NI) {
2686 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2687 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
2688 else
2689 break;
2690 }
2691 }
2692 auto *NextII = dyn_cast_or_null<IntrinsicInst>(Val: NI);
2693 if (NextII && II.isIdenticalTo(I: NextII))
2694 return IC.eraseInstFromFunction(I&: II);
2695
2696 return std::nullopt;
2697}
2698
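// A ptrue with the `all` pattern is just an all-true predicate constant,
// e.g. (illustrative) ptrue(all) --> splat(i1 true).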
2699static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2700 IntrinsicInst &II) {
2701 if (match(V: II.getOperand(i_nocapture: 0), P: m_ConstantInt<AArch64SVEPredPattern::all>()))
2702 return IC.replaceInstUsesWith(I&: II, V: Constant::getAllOnesValue(Ty: II.getType()));
2703 return std::nullopt;
2704}
2705
2706static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2707 IntrinsicInst &II,
2708 unsigned NumBits) {
2709 Value *Passthru = II.getOperand(i_nocapture: 0);
2710 Value *Pg = II.getOperand(i_nocapture: 1);
2711 Value *Op = II.getOperand(i_nocapture: 2);
2712
2713 // Convert UXT[BHW] to AND.
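// e.g. (illustrative) uxtb(undef, pg, V) on i32 elements
//   --> and.u(pg, V, splat(i32 0xff))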
2714 if (isa<UndefValue>(Val: Passthru) || isAllActivePredicate(Pred: Pg)) {
2715 auto *Ty = cast<VectorType>(Val: II.getType());
2716 auto MaskValue = APInt::getLowBitsSet(numBits: Ty->getScalarSizeInBits(), loBitsSet: NumBits);
2717 auto *Mask = ConstantInt::get(Ty, V: MaskValue);
2718 auto *And = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_and_u, Types: {Ty},
2719 Args: {Pg, Op, Mask});
2720 return IC.replaceInstUsesWith(I&: II, V: And);
2721 }
2722
2723 return std::nullopt;
2724}
2725
2726std::optional<Instruction *>
2727AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
2728 IntrinsicInst &II) const {
2729 const SVEIntrinsicInfo &IInfo = constructSVEIntrinsicInfo(II);
2730 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
2731 return I;
2732
2733 Intrinsic::ID IID = II.getIntrinsicID();
2734 switch (IID) {
2735 default:
2736 break;
2737 case Intrinsic::aarch64_dmb:
2738 return instCombineDMB(IC, II);
2739 case Intrinsic::aarch64_neon_fmaxnm:
2740 case Intrinsic::aarch64_neon_fminnm:
2741 return instCombineMaxMinNM(IC, II);
2742 case Intrinsic::aarch64_sve_convert_from_svbool:
2743 return instCombineConvertFromSVBool(IC, II);
2744 case Intrinsic::aarch64_sve_dup:
2745 return instCombineSVEDup(IC, II);
2746 case Intrinsic::aarch64_sve_dup_x:
2747 return instCombineSVEDupX(IC, II);
2748 case Intrinsic::aarch64_sve_cmpne:
2749 case Intrinsic::aarch64_sve_cmpne_wide:
2750 return instCombineSVECmpNE(IC, II);
2751 case Intrinsic::aarch64_sve_rdffr:
2752 return instCombineRDFFR(IC, II);
2753 case Intrinsic::aarch64_sve_lasta:
2754 case Intrinsic::aarch64_sve_lastb:
2755 return instCombineSVELast(IC, II);
2756 case Intrinsic::aarch64_sve_clasta_n:
2757 case Intrinsic::aarch64_sve_clastb_n:
2758 return instCombineSVECondLast(IC, II);
2759 case Intrinsic::aarch64_sve_cntd:
2760 return instCombineSVECntElts(IC, II, NumElts: 2);
2761 case Intrinsic::aarch64_sve_cntw:
2762 return instCombineSVECntElts(IC, II, NumElts: 4);
2763 case Intrinsic::aarch64_sve_cnth:
2764 return instCombineSVECntElts(IC, II, NumElts: 8);
2765 case Intrinsic::aarch64_sve_cntb:
2766 return instCombineSVECntElts(IC, II, NumElts: 16);
2767 case Intrinsic::aarch64_sve_ptest_any:
2768 case Intrinsic::aarch64_sve_ptest_first:
2769 case Intrinsic::aarch64_sve_ptest_last:
2770 return instCombineSVEPTest(IC, II);
2771 case Intrinsic::aarch64_sve_fadd:
2772 return instCombineSVEVectorFAdd(IC, II);
2773 case Intrinsic::aarch64_sve_fadd_u:
2774 return instCombineSVEVectorFAddU(IC, II);
2775 case Intrinsic::aarch64_sve_fmul_u:
2776 return instCombineSVEVectorBinOp(IC, II);
2777 case Intrinsic::aarch64_sve_fsub:
2778 return instCombineSVEVectorFSub(IC, II);
2779 case Intrinsic::aarch64_sve_fsub_u:
2780 return instCombineSVEVectorFSubU(IC, II);
2781 case Intrinsic::aarch64_sve_add:
2782 return instCombineSVEVectorAdd(IC, II);
2783 case Intrinsic::aarch64_sve_add_u:
2784 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2785 Intrinsic::aarch64_sve_mla_u>(
2786 IC, II, MergeIntoAddendOp: true);
2787 case Intrinsic::aarch64_sve_sub:
2788 return instCombineSVEVectorSub(IC, II);
2789 case Intrinsic::aarch64_sve_sub_u:
2790 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2791 Intrinsic::aarch64_sve_mls_u>(
2792 IC, II, MergeIntoAddendOp: true);
2793 case Intrinsic::aarch64_sve_tbl:
2794 return instCombineSVETBL(IC, II);
2795 case Intrinsic::aarch64_sve_uunpkhi:
2796 case Intrinsic::aarch64_sve_uunpklo:
2797 case Intrinsic::aarch64_sve_sunpkhi:
2798 case Intrinsic::aarch64_sve_sunpklo:
2799 return instCombineSVEUnpack(IC, II);
2800 case Intrinsic::aarch64_sve_uzp1:
2801 return instCombineSVEUzp1(IC, II);
2802 case Intrinsic::aarch64_sve_zip1:
2803 case Intrinsic::aarch64_sve_zip2:
2804 return instCombineSVEZip(IC, II);
2805 case Intrinsic::aarch64_sve_ld1_gather_index:
2806 return instCombineLD1GatherIndex(IC, II);
2807 case Intrinsic::aarch64_sve_st1_scatter_index:
2808 return instCombineST1ScatterIndex(IC, II);
2809 case Intrinsic::aarch64_sve_ld1:
2810 return instCombineSVELD1(IC, II, DL);
2811 case Intrinsic::aarch64_sve_st1:
2812 return instCombineSVEST1(IC, II, DL);
2813 case Intrinsic::aarch64_sve_sdiv:
2814 return instCombineSVESDIV(IC, II);
2815 case Intrinsic::aarch64_sve_sel:
2816 return instCombineSVESel(IC, II);
2817 case Intrinsic::aarch64_sve_srshl:
2818 return instCombineSVESrshl(IC, II);
2819 case Intrinsic::aarch64_sve_dupq_lane:
2820 return instCombineSVEDupqLane(IC, II);
2821 case Intrinsic::aarch64_sve_insr:
2822 return instCombineSVEInsr(IC, II);
2823 case Intrinsic::aarch64_sve_ptrue:
2824 return instCombinePTrue(IC, II);
2825 case Intrinsic::aarch64_sve_uxtb:
2826 return instCombineSVEUxt(IC, II, NumBits: 8);
2827 case Intrinsic::aarch64_sve_uxth:
2828 return instCombineSVEUxt(IC, II, NumBits: 16);
2829 case Intrinsic::aarch64_sve_uxtw:
2830 return instCombineSVEUxt(IC, II, NumBits: 32);
2831 }
2832
2833 return std::nullopt;
2834}
2835
2836std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2837 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2838 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2839 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2840 SimplifyAndSetOp) const {
2841 switch (II.getIntrinsicID()) {
2842 default:
2843 break;
2844 case Intrinsic::aarch64_neon_fcvtxn:
2845 case Intrinsic::aarch64_neon_rshrn:
2846 case Intrinsic::aarch64_neon_sqrshrn:
2847 case Intrinsic::aarch64_neon_sqrshrun:
2848 case Intrinsic::aarch64_neon_sqshrn:
2849 case Intrinsic::aarch64_neon_sqshrun:
2850 case Intrinsic::aarch64_neon_sqxtn:
2851 case Intrinsic::aarch64_neon_sqxtun:
2852 case Intrinsic::aarch64_neon_uqrshrn:
2853 case Intrinsic::aarch64_neon_uqshrn:
2854 case Intrinsic::aarch64_neon_uqxtn:
2855 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2856 break;
2857 }
2858
2859 return std::nullopt;
2860}
2861
2862bool AArch64TTIImpl::enableScalableVectorization() const {
2863 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2864 EnableScalableAutovecInStreamingMode);
2865}
2866
2867TypeSize
2868AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
2869 switch (K) {
2870 case TargetTransformInfo::RGK_Scalar:
2871 return TypeSize::getFixed(ExactSize: 64);
2872 case TargetTransformInfo::RGK_FixedWidthVector:
2873 if (ST->useSVEForFixedLengthVectors() &&
2874 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
2875 return TypeSize::getFixed(
2876 ExactSize: std::max(a: ST->getMinSVEVectorSizeInBits(), b: 128u));
2877 else if (ST->isNeonAvailable())
2878 return TypeSize::getFixed(ExactSize: 128);
2879 else
2880 return TypeSize::getFixed(ExactSize: 0);
2881 case TargetTransformInfo::RGK_ScalableVector:
2882 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2883 EnableScalableAutovecInStreamingMode))
2884 return TypeSize::getScalable(MinimumSize: 128);
2885 else
2886 return TypeSize::getScalable(MinimumSize: 0);
2887 }
2888 llvm_unreachable("Unsupported register kind");
2889}
2890
2891bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
2892 ArrayRef<const Value *> Args,
2893 Type *SrcOverrideTy) const {
2894 // A helper that returns a vector type based on the given type: the scalar
2895 // type comes from ArgTy and the element count from DstTy.
2896 auto toVectorTy = [&](Type *ArgTy) {
2897 return VectorType::get(ElementType: ArgTy->getScalarType(),
2898 EC: cast<VectorType>(Val: DstTy)->getElementCount());
2899 };
2900
2901 // Exit early if DstTy is not a vector type whose elements are one of [i16,
2902 // i32, i64]. SVE doesn't generally have the same set of instructions to
2903 // perform an extend with the add/sub/mul. There are SMULLB style
2904 // instructions, but they operate on top/bottom, requiring some sort of lane
2905 // interleaving to be used with zext/sext.
2906 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2907 if (!useNeonVector(Ty: DstTy) || Args.size() != 2 ||
2908 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
2909 return false;
2910
2911 // Determine if the operation has a widening variant. We consider both the
2912 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
2913 // instructions.
2914 //
2915 // TODO: Add additional widening operations (e.g., shl, etc.) once we
2916 // verify that their extending operands are eliminated during code
2917 // generation.
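// As an illustrative sketch (types chosen only for illustration), the
// extends in
//   %a = zext <8 x i8> %x to <8 x i16>
//   %b = zext <8 x i8> %y to <8 x i16>
//   %s = add <8 x i16> %a, %b
// are folded into a single uaddl, so they are effectively free.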
2918 Type *SrcTy = SrcOverrideTy;
2919 switch (Opcode) {
2920 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
2921 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
2922 // The second operand needs to be an extend
2923 if (isa<SExtInst>(Val: Args[1]) || isa<ZExtInst>(Val: Args[1])) {
2924 if (!SrcTy)
2925 SrcTy =
2926 toVectorTy(cast<Instruction>(Val: Args[1])->getOperand(i: 0)->getType());
2927 } else
2928 return false;
2929 break;
2930 case Instruction::Mul: { // SMULL(2), UMULL(2)
2931 // Both operands need to be extends of the same type.
2932 if ((isa<SExtInst>(Val: Args[0]) && isa<SExtInst>(Val: Args[1])) ||
2933 (isa<ZExtInst>(Val: Args[0]) && isa<ZExtInst>(Val: Args[1]))) {
2934 if (!SrcTy)
2935 SrcTy =
2936 toVectorTy(cast<Instruction>(Val: Args[0])->getOperand(i: 0)->getType());
2937 } else if (isa<ZExtInst>(Val: Args[0]) || isa<ZExtInst>(Val: Args[1])) {
2938 // If one of the operands is a Zext and the other has enough zero bits to
2939 // be treated as unsigned, we can still generate a umull, meaning the zext
2940 // is free.
2941 KnownBits Known =
2942 computeKnownBits(V: isa<ZExtInst>(Val: Args[0]) ? Args[1] : Args[0], DL);
2943 if (Args[0]->getType()->getScalarSizeInBits() -
2944 Known.Zero.countLeadingOnes() >
2945 DstTy->getScalarSizeInBits() / 2)
2946 return false;
2947 if (!SrcTy)
2948 SrcTy = toVectorTy(Type::getIntNTy(C&: DstTy->getContext(),
2949 N: DstTy->getScalarSizeInBits() / 2));
2950 } else
2951 return false;
2952 break;
2953 }
2954 default:
2955 return false;
2956 }
2957
2958 // Legalize the destination type and ensure it can be used in a widening
2959 // operation.
2960 auto DstTyL = getTypeLegalizationCost(Ty: DstTy);
2961 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
2962 return false;
2963
2964 // Legalize the source type and ensure it can be used in a widening
2965 // operation.
2966 assert(SrcTy && "Expected some SrcTy");
2967 auto SrcTyL = getTypeLegalizationCost(Ty: SrcTy);
2968 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
2969 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2970 return false;
2971
2972 // Get the total number of vector elements in the legalized types.
2973 InstructionCost NumDstEls =
2974 DstTyL.first * DstTyL.second.getVectorMinNumElements();
2975 InstructionCost NumSrcEls =
2976 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
2977
2978 // Return true if the legalized types have the same number of vector elements
2979 // and the destination element type size is twice that of the source type.
2980 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
2981}
2982
2983// s/urhadd instructions implement the following pattern, making the
2984// extends free:
2985// %x = add ((zext i8 -> i16), 1)
2986// %y = (zext i8 -> i16)
2987// trunc i16 (lshr (add %x, %y), 1) -> i8
2988//
2989bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
2990 Type *Src) const {
2991 // The source should be a legal vector type.
2992 if (!Src->isVectorTy() || !TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: Src)) ||
2993 (Src->isScalableTy() && !ST->hasSVE2()))
2994 return false;
2995
2996 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2997 return false;
2998
2999 // Look for trunc/shl/add before trying to match the pattern.
3000 const Instruction *Add = ExtUser;
3001 auto *AddUser =
3002 dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser());
3003 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3004 Add = AddUser;
3005
3006 auto *Shr = dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser());
3007 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3008 return false;
3009
3010 auto *Trunc = dyn_cast_or_null<Instruction>(Val: Shr->getUniqueUndroppableUser());
3011 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3012 Src->getScalarSizeInBits() !=
3013 cast<CastInst>(Val: Trunc)->getDestTy()->getScalarSizeInBits())
3014 return false;
3015
3016 // Try to match the whole pattern. Ext could be either the first or second
3017 // m_ZExtOrSExt matched.
3018 Instruction *Ex1, *Ex2;
3019 if (!(match(V: Add, P: m_c_Add(L: m_Instruction(I&: Ex1),
3020 R: m_c_Add(L: m_Instruction(I&: Ex2), R: m_SpecificInt(V: 1))))))
3021 return false;
3022
3023 // Ensure both extends are of the same type
3024 if (match(V: Ex1, P: m_ZExtOrSExt(Op: m_Value())) &&
3025 Ex1->getOpcode() == Ex2->getOpcode())
3026 return true;
3027
3028 return false;
3029}
3030
3031InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
3032 Type *Src,
3033 TTI::CastContextHint CCH,
3034 TTI::TargetCostKind CostKind,
3035 const Instruction *I) const {
3036 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3037 assert(ISD && "Invalid opcode");
3038 // If the cast is observable, and it is used by a widening instruction (e.g.,
3039 // uaddl, saddw, etc.), it may be free.
3040 if (I && I->hasOneUser()) {
3041 auto *SingleUser = cast<Instruction>(Val: *I->user_begin());
3042 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3043 if (isWideningInstruction(DstTy: Dst, Opcode: SingleUser->getOpcode(), Args: Operands, SrcOverrideTy: Src)) {
3044 // For adds, the second operand's extend is always free; the first is
3045 // only free when both operands are extends of the same kind (i.e. in
3046 // add(sext, zext) only one of the two extends is free).
3047 if (SingleUser->getOpcode() == Instruction::Add) {
3048 if (I == SingleUser->getOperand(i: 1) ||
3049 (isa<CastInst>(Val: SingleUser->getOperand(i: 1)) &&
3050 cast<CastInst>(Val: SingleUser->getOperand(i: 1))->getOpcode() == Opcode))
3051 return 0;
3052 } else // Others are free so long as isWideningInstruction returned true.
3053 return 0;
3054 }
3055
3056 // The cast will be free for the s/urhadd instructions
3057 if ((isa<ZExtInst>(Val: I) || isa<SExtInst>(Val: I)) &&
3058 isExtPartOfAvgExpr(ExtUser: SingleUser, Dst, Src))
3059 return 0;
3060 }
3061
3062 // TODO: Allow non-throughput costs that aren't binary.
3063 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
3064 if (CostKind != TTI::TCK_RecipThroughput)
3065 return Cost == 0 ? 0 : 1;
3066 return Cost;
3067 };
3068
3069 EVT SrcTy = TLI->getValueType(DL, Ty: Src);
3070 EVT DstTy = TLI->getValueType(DL, Ty: Dst);
3071
3072 if (!SrcTy.isSimple() || !DstTy.isSimple())
3073 return AdjustCost(
3074 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3075
3076 static const TypeConversionCostTblEntry BF16Tbl[] = {
3077 {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f32, .Cost: 1}, // bfcvt
3078 {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f64, .Cost: 1}, // bfcvt
3079 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f32, .Cost: 1}, // bfcvtn
3080 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f32, .Cost: 2}, // bfcvtn+bfcvtn2
3081 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f64, .Cost: 2}, // bfcvtn+fcvtn
3082 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f64, .Cost: 3}, // fcvtn+fcvtl2+bfcvtn
3083 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f64, .Cost: 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3084 };
3085
3086 if (ST->hasBF16())
3087 if (const auto *Entry = ConvertCostTableLookup(
3088 Table: BF16Tbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
3089 return AdjustCost(Entry->Cost);
3090
3091 // Symbolic constants for the SVE sitofp/uitofp entries in the table below.
3092 // The cost of unpacking twice is artificially increased for now in order
3093 // to avoid regressions against NEON, which will use tbl instructions directly
3094 // instead of multiple layers of [s|u]unpk[lo|hi].
3095 // We use the unpacks in cases where the destination type is illegal and
3096 // requires splitting of the input, even if the input type itself is legal.
3097 const unsigned int SVE_EXT_COST = 1;
3098 const unsigned int SVE_FCVT_COST = 1;
3099 const unsigned int SVE_UNPACK_ONCE = 4;
3100 const unsigned int SVE_UNPACK_TWICE = 16;
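// For instance, the nxv16i8 -> nxv16f32 entries below are modelled as two
// rounds of [s|u]unpk[lo|hi] (SVE_UNPACK_TWICE) followed by four [s|u]cvtf
// instructions (4 * SVE_FCVT_COST); this is a costing sketch rather than a
// statement about the exact lowering.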
3101
3102 static const TypeConversionCostTblEntry ConversionTbl[] = {
3103 {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i64, .Cost: 1}, // xtn
3104 {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i16, .Src: MVT::v2i64, .Cost: 1}, // xtn
3105 {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i32, .Src: MVT::v2i64, .Cost: 1}, // xtn
3106 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i32, .Cost: 1}, // xtn
3107 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i64, .Cost: 3}, // 2 xtn + 1 uzp1
3108 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i32, .Cost: 1}, // xtn
3109 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i64, .Cost: 2}, // 1 uzp1 + 1 xtn
3110 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: 1}, // 1 uzp1
3111 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i16, .Cost: 1}, // 1 xtn
3112 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i32, .Cost: 2}, // 1 uzp1 + 1 xtn
3113 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i64, .Cost: 4}, // 3 x uzp1 + xtn
3114 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: 1}, // 1 uzp1
3115 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i64, .Cost: 3}, // 3 x uzp1
3116 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i32, .Src: MVT::v8i64, .Cost: 2}, // 2 x uzp1
3117 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: 1}, // uzp1
3118 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: 3}, // (2 + 1) x uzp1
3119 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i64, .Cost: 7}, // (4 + 2 + 1) x uzp1
3120 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: 2}, // 2 x uzp1
3121 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i64, .Cost: 6}, // (4 + 2) x uzp1
3122 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i32, .Src: MVT::v16i64, .Cost: 4}, // 4 x uzp1
3123
3124 // Truncations on nxvmiN
3125 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i8, .Cost: 2},
3126 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i16, .Cost: 2},
3127 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i32, .Cost: 2},
3128 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i64, .Cost: 2},
3129 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i8, .Cost: 2},
3130 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i16, .Cost: 2},
3131 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i32, .Cost: 2},
3132 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i64, .Cost: 5},
3133 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i8, .Cost: 2},
3134 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i16, .Cost: 2},
3135 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i32, .Cost: 5},
3136 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i64, .Cost: 11},
3137 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i1, .Src: MVT::nxv16i8, .Cost: 2},
3138 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i16, .Cost: 0},
3139 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i32, .Cost: 0},
3140 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i64, .Cost: 0},
3141 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i16, .Src: MVT::nxv2i32, .Cost: 0},
3142 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i16, .Src: MVT::nxv2i64, .Cost: 0},
3143 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i32, .Src: MVT::nxv2i64, .Cost: 0},
3144 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i16, .Cost: 0},
3145 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i32, .Cost: 0},
3146 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i64, .Cost: 1},
3147 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i16, .Src: MVT::nxv4i32, .Cost: 0},
3148 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i16, .Src: MVT::nxv4i64, .Cost: 1},
3149 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i32, .Src: MVT::nxv4i64, .Cost: 1},
3150 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i16, .Cost: 0},
3151 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i32, .Cost: 1},
3152 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i64, .Cost: 3},
3153 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i16, .Src: MVT::nxv8i32, .Cost: 1},
3154 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i16, .Src: MVT::nxv8i64, .Cost: 3},
3155 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i16, .Cost: 1},
3156 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i32, .Cost: 3},
3157 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i64, .Cost: 7},
3158
3159 // The number of shll instructions for the extension.
3160 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3},
3161 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3},
3162 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2},
3163 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2},
3164 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3},
3165 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3},
3166 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2},
3167 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2},
3168 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7},
3169 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7},
3170 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6},
3171 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6},
3172 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2},
3173 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2},
3174 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6},
3175 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6},
3176
3177 // FP Ext and trunc
3178 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::f32, .Cost: 1}, // fcvt
3179 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2f32, .Cost: 1}, // fcvtl
3180 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4f32, .Cost: 2}, // fcvtl+fcvtl2
3181 // FP16
3182 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f32, .Src: MVT::f16, .Cost: 1}, // fcvt
3183 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::f16, .Cost: 1}, // fcvt
3184 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f32, .Src: MVT::v4f16, .Cost: 1}, // fcvtl
3185 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f32, .Src: MVT::v8f16, .Cost: 2}, // fcvtl+fcvtl2
3186 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2f16, .Cost: 2}, // fcvtl+fcvtl
3187 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4f16, .Cost: 3}, // fcvtl+fcvtl2+fcvtl
3188 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8f16, .Cost: 6}, // 2 * fcvtl+fcvtl2+fcvtl
3189 // BF16 (uses shift)
3190 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f32, .Src: MVT::bf16, .Cost: 1}, // shl
3191 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::bf16, .Cost: 2}, // shl+fcvt
3192 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f32, .Src: MVT::v4bf16, .Cost: 1}, // shll
3193 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f32, .Src: MVT::v8bf16, .Cost: 2}, // shll+shll2
3194 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2bf16, .Cost: 2}, // shll+fcvtl
3195 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4bf16, .Cost: 3}, // shll+fcvtl+fcvtl2
3196 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8bf16, .Cost: 6}, // 2 * shll+fcvtl+fcvtl2
3197 // FP Ext and trunc
3198 {.ISD: ISD::FP_ROUND, .Dst: MVT::f32, .Src: MVT::f64, .Cost: 1}, // fcvt
3199 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2f32, .Src: MVT::v2f64, .Cost: 1}, // fcvtn
3200 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f32, .Src: MVT::v4f64, .Cost: 2}, // fcvtn+fcvtn2
3201 // FP16
3202 {.ISD: ISD::FP_ROUND, .Dst: MVT::f16, .Src: MVT::f32, .Cost: 1}, // fcvt
3203 {.ISD: ISD::FP_ROUND, .Dst: MVT::f16, .Src: MVT::f64, .Cost: 1}, // fcvt
3204 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f16, .Src: MVT::v4f32, .Cost: 1}, // fcvtn
3205 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8f16, .Src: MVT::v8f32, .Cost: 2}, // fcvtn+fcvtn2
3206 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2f16, .Src: MVT::v2f64, .Cost: 2}, // fcvtn+fcvtn
3207 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f16, .Src: MVT::v4f64, .Cost: 3}, // fcvtn+fcvtn2+fcvtn
3208 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8f16, .Src: MVT::v8f64, .Cost: 6}, // 2 * fcvtn+fcvtn2+fcvtn
3209 // BF16 (more complex; the +bf16 case is handled above)
3210 {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f32, .Cost: 8}, // Expansion is ~8 insns
3211 {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f64, .Cost: 9}, // fcvtn + above
3212 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f32, .Cost: 8},
3213 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f32, .Cost: 8},
3214 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f32, .Cost: 15},
3215 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f64, .Cost: 9},
3216 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f64, .Cost: 10},
3217 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f64, .Cost: 19},
3218
3219 // LowerVectorINT_TO_FP:
3220 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1},
3221 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1},
3222 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: 1},
3223 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1},
3224 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1},
3225 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: 1},
3226
3227 // SVE: to nxv2f16
3228 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i8,
3229 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3230 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3231 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3232 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3233 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i8,
3234 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3235 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3236 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3237 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3238
3239 // SVE: to nxv4f16
3240 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i8,
3241 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3242 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3243 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3244 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i8,
3245 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3246 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3247 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3248
3249 // SVE: to nxv8f16
3250 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i8,
3251 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3252 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i16, .Cost: SVE_FCVT_COST},
3253 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i8,
3254 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3255 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i16, .Cost: SVE_FCVT_COST},
3256
3257 // SVE: to nxv16f16
3258 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv16f16, .Src: MVT::nxv16i8,
3259 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3260 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv16f16, .Src: MVT::nxv16i8,
3261 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3262
3263 // Complex: to v2f32
3264 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3},
3265 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 3},
3266 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3},
3267 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 3},
3268
3269 // SVE: to nxv2f32
3270 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i8,
3271 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3272 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3273 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3274 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3275 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i8,
3276 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3277 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3278 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3279 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3280
3281 // Complex: to v4f32
3282 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 4},
3283 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2},
3284 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 3},
3285 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2},
3286
3287 // SVE: to nxv4f32
3288 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i8,
3289 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3290 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3291 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3292 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i8,
3293 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3294 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3295 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3296
3297 // Complex: to v8f32
3298 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10},
3299 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4},
3300 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10},
3301 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4},
3302
3303 // SVE: to nxv8f32
3304 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i8,
3305 .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3306 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i16,
3307 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3308 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i8,
3309 .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3310 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i16,
3311 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3312
3313 // SVE: to nxv16f32
3314 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv16f32, .Src: MVT::nxv16i8,
3315 .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3316 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv16f32, .Src: MVT::nxv16i8,
3317 .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3318
3319 // Complex: to v16f32
3320 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: 21},
3321 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: 21},
3322
3323 // Complex: to v2f64
3324 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4},
3325 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 4},
3326 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2},
3327 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4},
3328 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 4},
3329 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2},
3330
3331 // SVE: to nxv2f64
3332 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i8,
3333 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3334 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3335 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3336 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3337 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i8,
3338 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3339 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3340 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3341 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3342
3343 // Complex: to v4f64
3344 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: 4},
3345 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: 4},
3346
3347 // SVE: to nxv4f64
3348 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i8,
3349 .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3350 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i16,
3351 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3352 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i32,
3353 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3354 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i8,
3355 .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3356 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i16,
3357 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3358 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i32,
3359 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3360
3361 // SVE: to nxv8f64
3362 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i8,
3363 .Cost: SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3364 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i16,
3365 .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3366 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i8,
3367 .Cost: SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3368 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i16,
3369 .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3370
3371 // LowerVectorFP_TO_INT
3372 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f32, .Cost: 1},
3373 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1},
3374 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: 1},
3375 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f32, .Cost: 1},
3376 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1},
3377 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: 1},
3378
3379 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3380 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f32, .Cost: 2},
3381 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 1},
3382 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 1},
3383 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f32, .Cost: 2},
3384 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 1},
3385 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 1},
3386
3387 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3388 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2},
3389 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 2},
3390 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2},
3391 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 2},
3392
3393 // Complex, from nxv2f32.
3394 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f32, .Cost: 1},
3395 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 1},
3396 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f32, .Cost: 1},
3397 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f32, .Cost: 1},
3398 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f32, .Cost: 1},
3399 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 1},
3400 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f32, .Cost: 1},
3401 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f32, .Cost: 1},
3402
3403 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3404 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2},
3405 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f64, .Cost: 2},
3406 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f64, .Cost: 2},
3407 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2},
3408 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f64, .Cost: 2},
3409 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f64, .Cost: 2},
3410
3411 // Complex, from nxv2f64.
3412 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f64, .Cost: 1},
3413 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f64, .Cost: 1},
3414 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f64, .Cost: 1},
3415 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f64, .Cost: 1},
3416 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i1, .Src: MVT::nxv2f64, .Cost: 1},
3417 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f64, .Cost: 1},
3418 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f64, .Cost: 1},
3419 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f64, .Cost: 1},
3420 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f64, .Cost: 1},
3421 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i1, .Src: MVT::nxv2f64, .Cost: 1},
3422
3423 // Complex, from nxv4f32.
3424 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f32, .Cost: 4},
3425 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f32, .Cost: 1},
3426 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f32, .Cost: 1},
3427 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f32, .Cost: 1},
3428 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i1, .Src: MVT::nxv4f32, .Cost: 1},
3429 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f32, .Cost: 4},
3430 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f32, .Cost: 1},
3431 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f32, .Cost: 1},
3432 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f32, .Cost: 1},
3433 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i1, .Src: MVT::nxv4f32, .Cost: 1},
3434
3435 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3436 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f64, .Cost: 7},
3437 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f64, .Cost: 7},
3438 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f64, .Cost: 7},
3439 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f64, .Cost: 7},
3440
3441 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3442 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f64, .Cost: 3},
3443 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f64, .Cost: 3},
3444 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f64, .Cost: 3},
3445 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f64, .Cost: 3},
3446 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f64, .Cost: 3},
3447 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f64, .Cost: 3},
3448
3449 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3450 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f32, .Cost: 3},
3451 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f32, .Cost: 3},
3452 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f32, .Cost: 3},
3453 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f32, .Cost: 3},
3454
3455 // Complex, from nxv8f16.
3456 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i64, .Src: MVT::nxv8f16, .Cost: 10},
3457 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i32, .Src: MVT::nxv8f16, .Cost: 4},
3458 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f16, .Cost: 1},
3459 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f16, .Cost: 1},
3460 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i1, .Src: MVT::nxv8f16, .Cost: 1},
3461 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i64, .Src: MVT::nxv8f16, .Cost: 10},
3462 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i32, .Src: MVT::nxv8f16, .Cost: 4},
3463 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f16, .Cost: 1},
3464 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f16, .Cost: 1},
3465 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i1, .Src: MVT::nxv8f16, .Cost: 1},
3466
3467 // Complex, from nxv4f16.
3468 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f16, .Cost: 4},
3469 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f16, .Cost: 1},
3470 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 1},
3471 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f16, .Cost: 1},
3472 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f16, .Cost: 4},
3473 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f16, .Cost: 1},
3474 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 1},
3475 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f16, .Cost: 1},
3476
3477 // Complex, from nxv2f16.
3478 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f16, .Cost: 1},
3479 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f16, .Cost: 1},
3480 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 1},
3481 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f16, .Cost: 1},
3482 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f16, .Cost: 1},
3483 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f16, .Cost: 1},
3484 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 1},
3485 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f16, .Cost: 1},
3486
3487 // Truncate from nxvmf32 to nxvmf16.
3488 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f16, .Src: MVT::nxv2f32, .Cost: 1},
3489 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f16, .Src: MVT::nxv4f32, .Cost: 1},
3490 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f16, .Src: MVT::nxv8f32, .Cost: 3},
3491
3492 // Truncate from nxvmf64 to nxvmf16.
3493 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f16, .Src: MVT::nxv2f64, .Cost: 1},
3494 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f16, .Src: MVT::nxv4f64, .Cost: 3},
3495 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f16, .Src: MVT::nxv8f64, .Cost: 7},
3496
3497 // Truncate from nxvmf64 to nxvmf32.
3498 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2f64, .Cost: 1},
3499 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4f64, .Cost: 3},
3500 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8f64, .Cost: 6},
3501
3502 // Extend from nxvmf16 to nxvmf32.
3503 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2f16, .Cost: 1},
3504 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4f16, .Cost: 1},
3505 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8f16, .Cost: 2},
3506
3507 // Extend from nxvmf16 to nxvmf64.
3508 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2f16, .Cost: 1},
3509 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4f16, .Cost: 2},
3510 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8f16, .Cost: 4},
3511
3512 // Extend from nxvmf32 to nxvmf64.
3513 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2f32, .Cost: 1},
3514 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4f32, .Cost: 2},
3515 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8f32, .Cost: 6},
3516
3517 // Bitcasts from integer to float
3518 {.ISD: ISD::BITCAST, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: 0},
3519 {.ISD: ISD::BITCAST, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: 0},
3520 {.ISD: ISD::BITCAST, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: 0},
3521
3522 // Bitcasts from float to integer
3523 {.ISD: ISD::BITCAST, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 0},
3524 {.ISD: ISD::BITCAST, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 0},
3525 {.ISD: ISD::BITCAST, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 0},
3526
3527 // Add cost for extending to illegal (too wide) scalable vectors.
3528 // zero/sign extend are implemented by multiple unpack operations,
3529 // where each operation has a cost of 1.
3530 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i16, .Src: MVT::nxv16i8, .Cost: 2},
3531 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i32, .Src: MVT::nxv16i8, .Cost: 6},
3532 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i64, .Src: MVT::nxv16i8, .Cost: 14},
3533 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i16, .Cost: 2},
3534 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv8i64, .Src: MVT::nxv8i16, .Cost: 6},
3535 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv4i64, .Src: MVT::nxv4i32, .Cost: 2},
3536
3537 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i16, .Src: MVT::nxv16i8, .Cost: 2},
3538 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i32, .Src: MVT::nxv16i8, .Cost: 6},
3539 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i64, .Src: MVT::nxv16i8, .Cost: 14},
3540 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i16, .Cost: 2},
3541 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv8i64, .Src: MVT::nxv8i16, .Cost: 6},
3542 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv4i64, .Src: MVT::nxv4i32, .Cost: 2},
3543 };
3544
3545 // We have to estimate the cost of a fixed-length operation performed on
3546 // SVE registers by scaling the cost of the equivalent scalable operation
3547 // with the number of SVE registers required to hold the fixed-width type.
3548 EVT WiderTy = SrcTy.bitsGT(VT: DstTy) ? SrcTy : DstTy;
3549 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3550 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3551 ST->useSVEForFixedLengthVectors(VT: WiderTy)) {
3552 std::pair<InstructionCost, MVT> LT =
3553 getTypeLegalizationCost(Ty: WiderTy.getTypeForEVT(Context&: Dst->getContext()));
3554 unsigned NumElements =
3555 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3556 return AdjustCost(
3557 LT.first *
3558 getCastInstrCost(
3559 Opcode, Dst: ScalableVectorType::get(ElementType: Dst->getScalarType(), MinNumElts: NumElements),
3560 Src: ScalableVectorType::get(ElementType: Src->getScalarType(), MinNumElts: NumElements), CCH,
3561 CostKind, I));
3562 }
3563
3564 if (const auto *Entry = ConvertCostTableLookup(
3565 Table: ConversionTbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
3566 return AdjustCost(Entry->Cost);
3567
3568 static const TypeConversionCostTblEntry FP16Tbl[] = {
3569 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f16, .Cost: 1}, // fcvtzs
3570 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f16, .Cost: 1},
3571 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f16, .Cost: 1}, // fcvtzs
3572 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f16, .Cost: 1},
3573 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f16, .Cost: 2}, // fcvtl+fcvtzs
3574 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f16, .Cost: 2},
3575 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i8, .Src: MVT::v8f16, .Cost: 2}, // fcvtzs+xtn
3576 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i8, .Src: MVT::v8f16, .Cost: 2},
3577 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f16, .Cost: 1}, // fcvtzs
3578 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f16, .Cost: 1},
3579 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f16, .Cost: 4}, // 2*fcvtl+2*fcvtzs
3580 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f16, .Cost: 4},
3581 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v16f16, .Cost: 3}, // 2*fcvtzs+xtn
3582 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v16f16, .Cost: 3},
3583 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v16f16, .Cost: 2}, // 2*fcvtzs
3584 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v16f16, .Cost: 2},
3585 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i32, .Src: MVT::v16f16, .Cost: 8}, // 4*fcvtl+4*fcvtzs
3586 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i32, .Src: MVT::v16f16, .Cost: 8},
3587 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f16, .Src: MVT::v8i8, .Cost: 2}, // ushll + ucvtf
3588 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f16, .Src: MVT::v8i8, .Cost: 2}, // sshll + scvtf
3589 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f16, .Src: MVT::v16i8, .Cost: 4}, // 2 * ushl(2) + 2 * ucvtf
3590 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f16, .Src: MVT::v16i8, .Cost: 4}, // 2 * sshl(2) + 2 * scvtf
3591 };
3592
3593 if (ST->hasFullFP16())
3594 if (const auto *Entry = ConvertCostTableLookup(
3595 Table: FP16Tbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
3596 return AdjustCost(Entry->Cost);
3597
3598 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3599 // double-rounding issues.
3600 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3601 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3602 isa<FixedVectorType>(Val: Dst) && isa<FixedVectorType>(Val: Src))
3603 return AdjustCost(
3604 cast<FixedVectorType>(Val: Dst)->getNumElements() *
3605 getCastInstrCost(Opcode, Dst: Dst->getScalarType(), Src: Src->getScalarType(),
3606 CCH, CostKind) +
3607 BaseT::getScalarizationOverhead(InTy: cast<FixedVectorType>(Val: Src), Insert: false, Extract: true,
3608 CostKind) +
3609 BaseT::getScalarizationOverhead(InTy: cast<FixedVectorType>(Val: Dst), Insert: true, Extract: false,
3610 CostKind));
3611
3612 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3613 CCH == TTI::CastContextHint::Masked &&
3614 ST->isSVEorStreamingSVEAvailable() &&
3615 TLI->getTypeAction(Context&: Src->getContext(), VT: SrcTy) ==
3616 TargetLowering::TypePromoteInteger &&
3617 TLI->getTypeAction(Context&: Dst->getContext(), VT: DstTy) ==
3618 TargetLowering::TypeSplitVector) {
3619 // The standard behaviour in the backend for these cases is to split the
3620 // extend up into two parts:
3621 // 1. Perform an extending load or masked load up to the legal type.
3622 // 2. Extend the loaded data to the final type.
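    // For example, combining the two steps above (a sketch, not the exact
    // legalization): a masked load of nxv8i8 zero-extended to nxv8i64 would
    // be costed as an extending masked load up to the promoted legal type
    // (e.g. nxv8i16) plus a separate nxv8i16 -> nxv8i64 extend.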
3623 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: Src);
3624 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Context&: Src->getContext());
3625 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
3626 Opcode, Dst: LegalTy, Src, CCH, CostKind, I);
3627 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
3628 Opcode, Dst, Src: LegalTy, CCH: TTI::CastContextHint::None, CostKind, I);
3629 return Part1 + Part2;
3630 }
3631
3632 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3633 // but we want to include the TTI::CastContextHint::Masked case as well.
3634 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3635 CCH == TTI::CastContextHint::Masked &&
3636 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(VT: DstTy))
3637 CCH = TTI::CastContextHint::Normal;
3638
3639 return AdjustCost(
3640 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3641}
3642
3643InstructionCost
3644AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
3645 VectorType *VecTy, unsigned Index,
3646 TTI::TargetCostKind CostKind) const {
3647
3648 // Make sure we were given a valid extend opcode.
3649 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3650 "Invalid opcode");
3651
3652 // We are extending an element we extract from a vector, so the source type
3653 // of the extend is the element type of the vector.
3654 auto *Src = VecTy->getElementType();
3655
3656 // Sign- and zero-extends are for integer types only.
3657 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3658
3659 // Get the cost for the extract. We compute the cost (if any) for the extend
3660 // below.
3661 InstructionCost Cost = getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy,
3662 CostKind, Index, Op0: nullptr, Op1: nullptr);
3663
3664 // Legalize the types.
3665 auto VecLT = getTypeLegalizationCost(Ty: VecTy);
3666 auto DstVT = TLI->getValueType(DL, Ty: Dst);
3667 auto SrcVT = TLI->getValueType(DL, Ty: Src);
3668
3669 // If the resulting type is still a vector and the destination type is legal,
3670 // we may get the extension for free. If not, get the default cost for the
3671 // extend.
3672 if (!VecLT.second.isVector() || !TLI->isTypeLegal(VT: DstVT))
3673 return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
3674 CostKind);
3675
3676 // The destination type should be larger than the element type. If not, get
3677 // the default cost for the extend.
3678 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3679 return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
3680 CostKind);
3681
3682 switch (Opcode) {
3683 default:
3684 llvm_unreachable("Opcode should be either SExt or ZExt");
3685
3686 // For sign-extends, we only need a smov, which performs the extension
3687 // automatically.
3688 case Instruction::SExt:
3689 return Cost;
3690
3691 // For zero-extends, the extend is performed automatically by a umov unless
3692 // the destination type is i64 and the element type is i8 or i16.
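  // (Sketch: "umov w0, v0.h[1]" already yields a zero-extended 32-bit
  // result, so only the i8/i16 -> i64 case is charged an extra extend here.)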
3693 case Instruction::ZExt:
3694 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3695 return Cost;
3696 }
3697
3698 // If we are unable to perform the extend for free, get the default cost.
3699 return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
3700 CostKind);
3701}
3702
3703InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
3704 TTI::TargetCostKind CostKind,
3705 const Instruction *I) const {
3706 if (CostKind != TTI::TCK_RecipThroughput)
3707 return Opcode == Instruction::PHI ? 0 : 1;
3708 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
3709 // Branches are assumed to be predicted.
3710 return 0;
3711}
3712
3713InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3714 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3715 bool HasRealUse, const Instruction *I, Value *Scalar,
3716 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
3717 assert(Val->isVectorTy() && "This must be a vector type");
3718
3719 if (Index != -1U) {
3720 // Legalize the type.
3721 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Val);
3722
3723 // This type is legalized to a scalar type.
3724 if (!LT.second.isVector())
3725 return 0;
3726
3727 // The type may be split. For fixed-width vectors we can normalize the
3728 // index to the new type.
3729 if (LT.second.isFixedLengthVector()) {
3730 unsigned Width = LT.second.getVectorNumElements();
3731 Index = Index % Width;
3732 }
3733
3734 // The element at index zero is already inside the vector.
3735 // - For a physical (HasRealUse==true) insert-element or extract-element
3736 // instruction that extracts integers, an explicit FPR -> GPR move is
3737 // needed. So it has non-zero cost.
3738 // - For the remaining cases (a virtual instruction or a float element type),
3739 // consider the instruction free.
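    // For example (sketch): extracting lane 0 of a <4 x i32> that is really
    // used still needs an "fmov w0, s0" style FPR -> GPR move, whereas lane 0
    // of a float vector is already in the right register class.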
3740 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
3741 return 0;
3742
3743 // This recognises an LD1 "single structure to one lane" instruction.
3744 // I.e., if this is an `insertelement` instruction and its second
3745 // operand is a load, then we will generate an LD1, which is an
3746 // expensive instruction.
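    // e.g. (sketch) "%v2 = insertelement <4 x i32> %v, i32 %loaded, i64 1"
    // typically becomes "ld1 { v0.s }[1], [x0]".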
3747 if (I && dyn_cast<LoadInst>(Val: I->getOperand(i: 1)))
3748 return CostKind == TTI::TCK_CodeSize
3749 ? 0
3750 : ST->getVectorInsertExtractBaseCost() + 1;
3751
3752 // i1 inserts and extracts will include an extra cset or cmp of the vector
3753 // value. Increase the cost by 1 to account for this.
3754 if (Val->getScalarSizeInBits() == 1)
3755 return CostKind == TTI::TCK_CodeSize
3756 ? 2
3757 : ST->getVectorInsertExtractBaseCost() + 1;
3758
3759 // FIXME:
3760 // If the extract-element and insert-element instructions could be
3761 // simplified away (e.g., could be combined into users by looking at use-def
3762 // context), they have no cost. This is not done in the first place for
3763 // compile-time considerations.
3764 }
3765
3766 // In the case of Neon, if there exists an extractelement from a lane != 0
3767 // such that
3768 // 1. the extractelement does not necessitate a move from vector_reg -> GPR,
3769 // 2. the extractelement result feeds into an fmul, and
3770 // 3. the other operand of the fmul is an extractelement from lane 0 or a
3771 // lane equivalent to 0,
3772 // then the extractelement can be merged with the fmul and incurs no cost.
3773 // e.g.
3774 // define double @foo(<2 x double> %a) {
3775 // %1 = extractelement <2 x double> %a, i32 0
3776 // %2 = extractelement <2 x double> %a, i32 1
3777 // %res = fmul double %1, %2
3778 // ret double %res
3779 // }
3780 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
3781 auto ExtractCanFuseWithFmul = [&]() {
3782 // We bail out if the extract is from lane 0.
3783 if (Index == 0)
3784 return false;
3785
3786 // Check if the scalar element type of the vector operand of ExtractElement
3787 // instruction is one of the allowed types.
3788 auto IsAllowedScalarTy = [&](const Type *T) {
3789 return T->isFloatTy() || T->isDoubleTy() ||
3790 (T->isHalfTy() && ST->hasFullFP16());
3791 };
3792
3793 // Check if the extractelement user is scalar fmul.
3794 auto IsUserFMulScalarTy = [](const Value *EEUser) {
3795 // Check if the user is scalar fmul.
3796 const auto *BO = dyn_cast<BinaryOperator>(Val: EEUser);
3797 return BO && BO->getOpcode() == BinaryOperator::FMul &&
3798 !BO->getType()->isVectorTy();
3799 };
3800
3801 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
3802 // certain scalar type and a certain vector register width.
3803 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
3804 auto RegWidth =
3805 getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
3806 .getFixedValue();
3807 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
3808 };
3809
3810 // Check that the type constraints on the input vector type and the result
3811 // scalar type of the extractelement instruction are satisfied.
3812 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
3813 return false;
3814
3815 if (Scalar) {
3816 DenseMap<User *, unsigned> UserToExtractIdx;
3817 for (auto *U : Scalar->users()) {
3818 if (!IsUserFMulScalarTy(U))
3819 return false;
3820 // Creating an entry for the user is what matters here; the index value is
3821 // filled in by the loop below.
3822 UserToExtractIdx[U];
3823 }
3824 if (UserToExtractIdx.empty())
3825 return false;
3826 for (auto &[S, U, L] : ScalarUserAndIdx) {
3827 for (auto *U : S->users()) {
3828 if (UserToExtractIdx.contains(Val: U)) {
3829 auto *FMul = cast<BinaryOperator>(Val: U);
3830 auto *Op0 = FMul->getOperand(i_nocapture: 0);
3831 auto *Op1 = FMul->getOperand(i_nocapture: 1);
3832 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
3833 UserToExtractIdx[U] = L;
3834 break;
3835 }
3836 }
3837 }
3838 }
3839 for (auto &[U, L] : UserToExtractIdx) {
3840 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
3841 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
3842 return false;
3843 }
3844 } else {
3845 const auto *EE = cast<ExtractElementInst>(Val: I);
3846
3847 const auto *IdxOp = dyn_cast<ConstantInt>(Val: EE->getIndexOperand());
3848 if (!IdxOp)
3849 return false;
3850
3851 return !EE->users().empty() && all_of(Range: EE->users(), P: [&](const User *U) {
3852 if (!IsUserFMulScalarTy(U))
3853 return false;
3854
3855 // Check if the other operand of extractelement is also extractelement
3856 // from lane equivalent to 0.
3857 const auto *BO = cast<BinaryOperator>(Val: U);
3858 const auto *OtherEE = dyn_cast<ExtractElementInst>(
3859 Val: BO->getOperand(i_nocapture: 0) == EE ? BO->getOperand(i_nocapture: 1) : BO->getOperand(i_nocapture: 0));
3860 if (OtherEE) {
3861 const auto *IdxOp = dyn_cast<ConstantInt>(Val: OtherEE->getIndexOperand());
3862 if (!IdxOp)
3863 return false;
3864 return IsExtractLaneEquivalentToZero(
3865 cast<ConstantInt>(Val: OtherEE->getIndexOperand())
3866 ->getValue()
3867 .getZExtValue(),
3868 OtherEE->getType()->getScalarSizeInBits());
3869 }
3870 return true;
3871 });
3872 }
3873 return true;
3874 };
3875
3876 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
3877 ExtractCanFuseWithFmul())
3878 return 0;
3879
3880 // All other insert/extracts cost this much.
3881 return CostKind == TTI::TCK_CodeSize ? 1
3882 : ST->getVectorInsertExtractBaseCost();
3883}
3884
3885InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
3886 TTI::TargetCostKind CostKind,
3887 unsigned Index,
3888 const Value *Op0,
3889 const Value *Op1) const {
3890 bool HasRealUse =
3891 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Val: Op0);
3892 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse);
3893}
3894
3895InstructionCost AArch64TTIImpl::getVectorInstrCost(
3896 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3897 Value *Scalar,
3898 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
3899 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse: false, I: nullptr,
3900 Scalar, ScalarUserAndIdx);
3901}
3902
3903InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
3904 Type *Val,
3905 TTI::TargetCostKind CostKind,
3906 unsigned Index) const {
3907 return getVectorInstrCostHelper(Opcode: I.getOpcode(), Val, CostKind, Index,
3908 HasRealUse: true /* HasRealUse */, I: &I);
3909}
3910
3911InstructionCost AArch64TTIImpl::getScalarizationOverhead(
3912 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
3913 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
3914 ArrayRef<Value *> VL) const {
3915 if (isa<ScalableVectorType>(Val: Ty))
3916 return InstructionCost::getInvalid();
3917 if (Ty->getElementType()->isFloatingPointTy())
3918 return BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, Insert, Extract,
3919 CostKind);
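// Illustrative example of the formula below: inserting all four lanes of a
// <4 x i32> (Insert == true, Extract == false) costs
// 4 * VecInstCost, i.e. 4 * getVectorInsertExtractBaseCost() by default.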
3920 unsigned VecInstCost =
3921 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
3922 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
3923}
3924
3925InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
3926 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
3927 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
3928 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
3929
3930 // The code-generator is currently not able to handle scalable vectors
3931 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3932 // it. This change will be removed when code-generation for these types is
3933 // sufficiently reliable.
3934 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
3935 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
3936 return InstructionCost::getInvalid();
3937
3938 // TODO: Handle more cost kinds.
3939 if (CostKind != TTI::TCK_RecipThroughput)
3940 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
3941 Opd2Info: Op2Info, Args, CxtI);
3942
3943 // Legalize the type.
3944 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3945 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3946
3947 switch (ISD) {
3948 default:
3949 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
3950 Opd2Info: Op2Info);
3951 case ISD::SREM:
3952 case ISD::SDIV:
3953 /*
3954 Notes for sdiv/srem specific costs:
3955 1. This only considers the cases where the divisor is constant, uniform and
3956 (pow-of-2/non-pow-of-2). Other cases are not important since they either
3957 result in some form of (ldr + adrp), corresponding to constant vectors, or
3958 scalarization of the division operation.
3959 2. Constant divisors that are wholly or partially negative don't result in
3960 significantly different codegen compared to positive constant divisors,
3961 so we don't consider negative divisors separately.
3962 3. If the codegen is significantly different with SVE, it has been indicated
3963 using comments at appropriate places.
3964
3965 sdiv specific cases:
3966 -----------------------------------------------------------------------
3967 codegen | pow-of-2 | Type
3968 -----------------------------------------------------------------------
3969 add + cmp + csel + asr | Y | i64
3970 add + cmp + csel + asr | Y | i32
3971 -----------------------------------------------------------------------
3972
3973 srem specific cases:
3974 -----------------------------------------------------------------------
3975 codegen | pow-of-2 | Type
3976 -----------------------------------------------------------------------
3977 negs + and + and + csneg | Y | i64
3978 negs + and + and + csneg | Y | i32
3979 -----------------------------------------------------------------------
3980
3981 other sdiv/srem cases:
3982 -------------------------------------------------------------------------
3983 common codegen | + srem | + sdiv | pow-of-2 | Type
3984 -------------------------------------------------------------------------
3985 smulh + asr + add + add | - | - | N | i64
3986 smull + lsr + add + add | - | - | N | i32
3987 usra | and + sub | sshr | Y | <2 x i64>
3988 2 * (scalar code) | - | - | N | <2 x i64>
3989 usra | bic + sub | sshr + neg | Y | <4 x i32>
3990 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
3991 + sshr + usra | | | |
3992 -------------------------------------------------------------------------
3993 */
3994 if (Op2Info.isConstant() && Op2Info.isUniform()) {
3995 InstructionCost AddCost =
3996 getArithmeticInstrCost(Opcode: Instruction::Add, Ty, CostKind,
3997 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
3998 InstructionCost AsrCost =
3999 getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind,
4000 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
4001 InstructionCost MulCost =
4002 getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind,
4003 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
4004 // add/cmp/csel/csneg should have similar cost while asr/negs/and should
4005 // have similar cost.
4006 auto VT = TLI->getValueType(DL, Ty);
4007 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4008 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4009 // Neg can be folded into the asr instruction.
4010 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4011 : (3 * AsrCost + AddCost);
4012 } else {
4013 return MulCost + AsrCost + 2 * AddCost;
4014 }
4015 } else if (VT.isVector()) {
4016 InstructionCost UsraCost = 2 * AsrCost;
4017 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4018 // Division with scalable types corresponds to native 'asrd'
4019 // instruction when SVE is available.
4020 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4021
4022 // One more for the negation in SDIV
4023 InstructionCost Cost =
4024 (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4025 if (Ty->isScalableTy() && ST->hasSVE())
4026 Cost += 2 * AsrCost;
4027 else {
4028 Cost +=
4029 UsraCost +
4030 (ISD == ISD::SDIV
4031 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4032 : 2 * AddCost);
4033 }
4034 return Cost;
4035 } else if (LT.second == MVT::v2i64) {
4036 return VT.getVectorNumElements() *
4037 getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind,
4038 Op1Info: Op1Info.getNoProps(),
4039 Op2Info: Op2Info.getNoProps());
4040 } else {
4041 // When SVE is available, we get:
4042 // smulh + lsr + add/sub + asr + add/sub.
4043 if (Ty->isScalableTy() && ST->hasSVE())
4044 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4045 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4046 }
4047 }
4048 }
4049 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4050 LT.second.isFixedLengthVector()) {
4051 // FIXME: When the constant vector is non-uniform, this may result in
4052 // loading the vector from constant pool or in some cases, may also result
4053 // in scalarization. For now, we are approximating this with the
4054 // scalarization cost.
4055 auto ExtractCost = 2 * getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty,
4056 CostKind, Index: -1, Op0: nullptr, Op1: nullptr);
4057 auto InsertCost = getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty,
4058 CostKind, Index: -1, Op0: nullptr, Op1: nullptr);
4059 unsigned NElts = cast<FixedVectorType>(Val: Ty)->getNumElements();
4060 return ExtractCost + InsertCost +
4061 NElts * getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(),
4062 CostKind, Op1Info: Op1Info.getNoProps(),
4063 Op2Info: Op2Info.getNoProps());
4064 }
4065 [[fallthrough]];
4066 case ISD::UDIV:
4067 case ISD::UREM: {
4068 auto VT = TLI->getValueType(DL, Ty);
4069 if (Op2Info.isConstant()) {
4070 // If the operand is a power of 2 we can use the shift or and cost.
4071 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4072 return getArithmeticInstrCost(Opcode: Instruction::LShr, Ty, CostKind,
4073 Op1Info: Op1Info.getNoProps(),
4074 Op2Info: Op2Info.getNoProps());
4075 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4076 return getArithmeticInstrCost(Opcode: Instruction::And, Ty, CostKind,
4077 Op1Info: Op1Info.getNoProps(),
4078 Op2Info: Op2Info.getNoProps());
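// Illustrative example (assumed codegen): `udiv i32 %x, 16` lowers to a
// single `lsr w0, w0, #4`, and `urem i32 %x, 16` to `and w0, w0, #0xf`,
// which is why only the shift/and cost is returned above.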
4079
4080 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4081 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4082 // The MULHU will be expanded to UMULL for the types not listed below,
4083 // and will become a UMULL + UMULL2 pair for 128-bit vectors.
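// Illustrative example: `udiv i32 %x, 7` is expanded by the backend into a
// multiply-high by a precomputed "magic" constant followed by shift/add
// fix-ups; the MulCost/AddCost/ShrCost sum below approximates that sequence.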
4084 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4085 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4086 LT.second == MVT::nxv16i8;
4087 bool Is128bit = LT.second.is128BitVector();
4088
4089 InstructionCost MulCost =
4090 getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind,
4091 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
4092 InstructionCost AddCost =
4093 getArithmeticInstrCost(Opcode: Instruction::Add, Ty, CostKind,
4094 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
4095 InstructionCost ShrCost =
4096 getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind,
4097 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
4098 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4099 (HasMULH ? 0 : ShrCost) + // UMULL shift
4100 AddCost * 2 + ShrCost;
4101 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4102 }
4103 }
4104
4105 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4106 // emitted by the backend even when those functions are not declared in the
4107 // module.
4108 if (!VT.isVector() && VT.getSizeInBits() > 64)
4109 return getCallInstrCost(/*Function*/ F: nullptr, RetTy: Ty, Tys: {Ty, Ty}, CostKind);
4110
4111 InstructionCost Cost = BaseT::getArithmeticInstrCost(
4112 Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
4113 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4114 if (TLI->isOperationLegalOrCustom(Op: ISD, VT: LT.second) && ST->hasSVE()) {
4115 // SDIV/UDIV operations are lowered using SVE, so the cost is
4116 // lower.
4117 if (isa<FixedVectorType>(Val: Ty) && cast<FixedVectorType>(Val: Ty)
4118 ->getPrimitiveSizeInBits()
4119 .getFixedValue() < 128) {
4120 EVT VT = TLI->getValueType(DL, Ty);
4121 static const CostTblEntry DivTbl[]{
4122 {.ISD: ISD::SDIV, .Type: MVT::v2i8, .Cost: 5}, {.ISD: ISD::SDIV, .Type: MVT::v4i8, .Cost: 8},
4123 {.ISD: ISD::SDIV, .Type: MVT::v8i8, .Cost: 8}, {.ISD: ISD::SDIV, .Type: MVT::v2i16, .Cost: 5},
4124 {.ISD: ISD::SDIV, .Type: MVT::v4i16, .Cost: 5}, {.ISD: ISD::SDIV, .Type: MVT::v2i32, .Cost: 1},
4125 {.ISD: ISD::UDIV, .Type: MVT::v2i8, .Cost: 5}, {.ISD: ISD::UDIV, .Type: MVT::v4i8, .Cost: 8},
4126 {.ISD: ISD::UDIV, .Type: MVT::v8i8, .Cost: 8}, {.ISD: ISD::UDIV, .Type: MVT::v2i16, .Cost: 5},
4127 {.ISD: ISD::UDIV, .Type: MVT::v4i16, .Cost: 5}, {.ISD: ISD::UDIV, .Type: MVT::v2i32, .Cost: 1}};
4128
4129 const auto *Entry = CostTableLookup(Table: DivTbl, ISD, Ty: VT.getSimpleVT());
4130 if (nullptr != Entry)
4131 return Entry->Cost;
4132 }
4133 // For 8/16-bit elements, the cost is higher because the type
4134 // requires promotion and possibly splitting:
4135 if (LT.second.getScalarType() == MVT::i8)
4136 Cost *= 8;
4137 else if (LT.second.getScalarType() == MVT::i16)
4138 Cost *= 4;
4139 return Cost;
4140 } else {
4141 // If one of the operands is a uniform constant, the cost for each
4142 // element is the cost of insertion, extraction and division:
4143 // insertion cost = 2, extraction cost = 2, and division = the cost of
4144 // the operation on the scalar type.
4145 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4146 (Op2Info.isConstant() && Op2Info.isUniform())) {
4147 if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) {
4148 InstructionCost DivCost = BaseT::getArithmeticInstrCost(
4149 Opcode, Ty: Ty->getScalarType(), CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
4150 return (4 + DivCost) * VTy->getNumElements();
4151 }
4152 }
4153 // On AArch64, without SVE, vector divisions are expanded into a
4154 // scalar division for each pair of elements.
4155 Cost += getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, CostKind,
4156 Index: -1, Op0: nullptr, Op1: nullptr);
4157 Cost += getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty, CostKind, Index: -1,
4158 Op0: nullptr, Op1: nullptr);
4159 }
4160
4161 // TODO: if one of the arguments is scalar, then it's not necessary to
4162 // double the cost of handling the vector elements.
4163 Cost += Cost;
4164 }
4165 return Cost;
4166 }
4167 case ISD::MUL:
4168 // When SVE is available, we can lower the v2i64 operation using the
4169 // SVE mul instruction, which has a lower cost.
4170 if (LT.second == MVT::v2i64 && ST->hasSVE())
4171 return LT.first;
4172
4173 // When SVE is not available, there is no MUL.2d instruction,
4174 // which means mul <2 x i64> is expensive as elements are extracted
4175 // from the vectors and the muls scalarized.
4176 // As getScalarizationOverhead is a bit too pessimistic, we
4177 // estimate the cost for an i64 vector directly here, which is:
4178 // - four 2-cost i64 extracts,
4179 // - two 2-cost i64 inserts, and
4180 // - two 1-cost muls.
4181 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
4182 // LT.first = 2 the cost is 28. If both operands are extensions it will not
4183 // need to scalarize, so the cost can be cheaper (smull or umull).
4185 if (LT.second != MVT::v2i64 || isWideningInstruction(DstTy: Ty, Opcode, Args))
4186 return LT.first;
4187 return cast<VectorType>(Val: Ty)->getElementCount().getKnownMinValue() *
4188 (getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind) +
4189 getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, CostKind, Index: -1,
4190 Op0: nullptr, Op1: nullptr) *
4191 2 +
4192 getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty, CostKind, Index: -1,
4193 Op0: nullptr, Op1: nullptr));
4194 case ISD::ADD:
4195 case ISD::XOR:
4196 case ISD::OR:
4197 case ISD::AND:
4198 case ISD::SRL:
4199 case ISD::SRA:
4200 case ISD::SHL:
4201 // These nodes are marked as 'custom' for combining purposes only.
4202 // We know that they are legal. See LowerAdd in ISelLowering.
4203 return LT.first;
4204
4205 case ISD::FNEG:
4206 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
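// Illustrative example (assumed fold): for
//   %m = fmul double %a, %b
//   %n = fneg double %m
// the backend emits a single `fnmul d0, d0, d1`, so the fneg is free here.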
4207 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4208 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4209 CxtI &&
4210 ((CxtI->hasOneUse() &&
4211 match(V: *CxtI->user_begin(), P: m_FMul(L: m_Value(), R: m_Value()))) ||
4212 match(V: CxtI->getOperand(i: 0), P: m_FMul(L: m_Value(), R: m_Value()))))
4213 return 0;
4214 [[fallthrough]];
4215 case ISD::FADD:
4216 case ISD::FSUB:
4217 // Increase the cost for half and bfloat types if not architecturally
4218 // supported.
4219 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
4220 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
4221 return 2 * LT.first;
4222 if (!Ty->getScalarType()->isFP128Ty())
4223 return LT.first;
4224 [[fallthrough]];
4225 case ISD::FMUL:
4226 case ISD::FDIV:
4227 // These nodes are marked as 'custom' just to lower them to SVE.
4228 // We know said lowering will incur no additional cost.
4229 if (!Ty->getScalarType()->isFP128Ty())
4230 return 2 * LT.first;
4231
4232 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
4233 Opd2Info: Op2Info);
4234 case ISD::FREM:
4235 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4236 // those functions are not declared in the module.
4237 if (!Ty->isVectorTy())
4238 return getCallInstrCost(/*Function*/ F: nullptr, RetTy: Ty, Tys: {Ty, Ty}, CostKind);
4239 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
4240 Opd2Info: Op2Info);
4241 }
4242}
4243
4244InstructionCost
4245AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
4246 const SCEV *Ptr) const {
4247 // Address computations in vectorized code with non-consecutive addresses will
4248 // likely result in more instructions compared to scalar code where the
4249 // computation can more often be merged into the index mode. The resulting
4250 // extra micro-ops can significantly decrease throughput.
4251 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4252 int MaxMergeDistance = 64;
4253
4254 if (Ty->isVectorTy() && SE &&
4255 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MergeDistance: MaxMergeDistance + 1))
4256 return NumVectorInstToHideOverhead;
4257
4258 // In many cases the address computation is not merged into the instruction
4259 // addressing mode.
4260 return 1;
4261}
4262
4263InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
4264 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4265 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4266 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
4267 // TODO: Handle other cost kinds.
4268 if (CostKind != TTI::TCK_RecipThroughput)
4269 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4270 Op1Info, Op2Info, I);
4271
4272 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4273 // We don't lower some vector selects well that are wider than the register
4274 // width.
4275 if (isa<FixedVectorType>(Val: ValTy) && ISD == ISD::SELECT) {
4276 // We would need this many instructions to hide the scalarization happening.
4277 const int AmortizationCost = 20;
4278
4279 // If VecPred is not set, check if we can get a predicate from the context
4280 // instruction, if its type matches the requested ValTy.
4281 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4282 CmpPredicate CurrentPred;
4283 if (match(V: I, P: m_Select(C: m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value()), L: m_Value(),
4284 R: m_Value())))
4285 VecPred = CurrentPred;
4286 }
4287 // Check if we have a compare/select chain that can be lowered using
4288 // a (F)CMxx & BFI pair.
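// Illustrative example (assumed lowering): a vector `fcmp` feeding a vector
// `select` on <4 x float> can lower to one FCMxx mask plus one bitwise select
// (BSL/BIF), which is why legal types below simply return LT.first.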
4289 if (CmpInst::isIntPredicate(P: VecPred) || VecPred == CmpInst::FCMP_OLE ||
4290 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4291 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4292 VecPred == CmpInst::FCMP_UNE) {
4293 static const auto ValidMinMaxTys = {
4294 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4295 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4296 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4297
4298 auto LT = getTypeLegalizationCost(Ty: ValTy);
4299 if (any_of(Range: ValidMinMaxTys, P: [&LT](MVT M) { return M == LT.second; }) ||
4300 (ST->hasFullFP16() &&
4301 any_of(Range: ValidFP16MinMaxTys, P: [&LT](MVT M) { return M == LT.second; })))
4302 return LT.first;
4303 }
4304
4305 static const TypeConversionCostTblEntry
4306 VectorSelectTbl[] = {
4307 { .ISD: ISD::SELECT, .Dst: MVT::v2i1, .Src: MVT::v2f32, .Cost: 2 },
4308 { .ISD: ISD::SELECT, .Dst: MVT::v2i1, .Src: MVT::v2f64, .Cost: 2 },
4309 { .ISD: ISD::SELECT, .Dst: MVT::v4i1, .Src: MVT::v4f32, .Cost: 2 },
4310 { .ISD: ISD::SELECT, .Dst: MVT::v4i1, .Src: MVT::v4f16, .Cost: 2 },
4311 { .ISD: ISD::SELECT, .Dst: MVT::v8i1, .Src: MVT::v8f16, .Cost: 2 },
4312 { .ISD: ISD::SELECT, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: 16 },
4313 { .ISD: ISD::SELECT, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: 8 },
4314 { .ISD: ISD::SELECT, .Dst: MVT::v16i1, .Src: MVT::v16i32, .Cost: 16 },
4315 { .ISD: ISD::SELECT, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: 4 * AmortizationCost },
4316 { .ISD: ISD::SELECT, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: 8 * AmortizationCost },
4317 { .ISD: ISD::SELECT, .Dst: MVT::v16i1, .Src: MVT::v16i64, .Cost: 16 * AmortizationCost }
4318 };
4319
4320 EVT SelCondTy = TLI->getValueType(DL, Ty: CondTy);
4321 EVT SelValTy = TLI->getValueType(DL, Ty: ValTy);
4322 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4323 if (const auto *Entry = ConvertCostTableLookup(Table: VectorSelectTbl, ISD,
4324 Dst: SelCondTy.getSimpleVT(),
4325 Src: SelValTy.getSimpleVT()))
4326 return Entry->Cost;
4327 }
4328 }
4329
4330 if (isa<FixedVectorType>(Val: ValTy) && ISD == ISD::SETCC) {
4331 Type *ValScalarTy = ValTy->getScalarType();
4332 if ((ValScalarTy->isHalfTy() && !ST->hasFullFP16()) ||
4333 ValScalarTy->isBFloatTy()) {
4334 auto *ValVTy = cast<FixedVectorType>(Val: ValTy);
4335
4336 // Without dedicated instructions we promote [b]f16 compares to f32.
4337 auto *PromotedTy =
4338 VectorType::get(ElementType: Type::getFloatTy(C&: ValTy->getContext()), Other: ValVTy);
4339
4340 InstructionCost Cost = 0;
4341 // Promote operands to float vectors.
4342 Cost += 2 * getCastInstrCost(Opcode: Instruction::FPExt, Dst: PromotedTy, Src: ValTy,
4343 CCH: TTI::CastContextHint::None, CostKind);
4344 // Compare float vectors.
4345 Cost += getCmpSelInstrCost(Opcode, ValTy: PromotedTy, CondTy, VecPred, CostKind,
4346 Op1Info, Op2Info);
4347 // During codegen we'll truncate the vector result from i32 to i16.
4348 Cost +=
4349 getCastInstrCost(Opcode: Instruction::Trunc, Dst: VectorType::getInteger(VTy: ValVTy),
4350 Src: VectorType::getInteger(VTy: PromotedTy),
4351 CCH: TTI::CastContextHint::None, CostKind);
4352 return Cost;
4353 }
4354 }
4355
4356 // Treat the icmp in icmp(and, 0), or in icmp(and, -1/1) when it can be
4357 // folded to icmp(and, 0), as free, since we can make use of ands, but only
4358 // if the comparison is not unsigned.
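// Illustrative example (assumed fold): for `icmp eq i32 (and i32 %a, %b), 0`
// the `and` and the compare combine into a single flag-setting `tst w0, w1`,
// so the compare itself is modeled as free.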
4359 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
4360 !CmpInst::isUnsigned(predicate: VecPred) &&
4361 TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: ValTy)) &&
4362 match(V: I->getOperand(i: 0), P: m_And(L: m_Value(), R: m_Value()))) {
4363 if (match(V: I->getOperand(i: 1), P: m_Zero()))
4364 return 0;
4365
4366 // x >= 1 / x < 1 -> x > 0 / x <= 0
4367 if (match(V: I->getOperand(i: 1), P: m_One()) &&
4368 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4369 return 0;
4370
4371 // x <= -1 / x > -1 -> x > 0 / x <= 0
4372 if (match(V: I->getOperand(i: 1), P: m_AllOnes()) &&
4373 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4374 return 0;
4375 }
4376
4377 // The base case handles scalable vectors fine for now, since it treats the
4378 // cost as 1 * legalization cost.
4379 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4380 Op1Info, Op2Info, I);
4381}
4382
4383AArch64TTIImpl::TTI::MemCmpExpansionOptions
4384AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4385 TTI::MemCmpExpansionOptions Options;
4386 if (ST->requiresStrictAlign()) {
4387 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4388 // a bunch of instructions when strict align is enabled.
4389 return Options;
4390 }
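// Illustrative example: with overlapping loads allowed, a 15-byte memcmp can
// be expanded as two 8-byte loads per buffer (at offsets 0 and 7) instead of
// an 8 + 4 + 2 + 1 byte load sequence.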
4391 Options.AllowOverlappingLoads = true;
4392 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4393 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4394 // TODO: Though vector loads usually perform well on AArch64, in some targets
4395 // they may wake up the FP unit, which raises the power consumption. Perhaps
4396 // they could be used with no holds barred (-O3).
4397 Options.LoadSizes = {8, 4, 2, 1};
4398 Options.AllowedTailExpansions = {3, 5, 6};
4399 return Options;
4400}
4401
4402bool AArch64TTIImpl::prefersVectorizedAddressing() const {
4403 return ST->hasSVE();
4404}
4405
4406InstructionCost
4407AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
4408 Align Alignment, unsigned AddressSpace,
4409 TTI::TargetCostKind CostKind) const {
4410 if (useNeonVector(Ty: Src))
4411 return BaseT::getMaskedMemoryOpCost(Opcode, DataTy: Src, Alignment, AddressSpace,
4412 CostKind);
4413 auto LT = getTypeLegalizationCost(Ty: Src);
4414 if (!LT.first.isValid())
4415 return InstructionCost::getInvalid();
4416
4417 // Return an invalid cost for element types that we are unable to lower.
4418 auto *VT = cast<VectorType>(Val: Src);
4419 if (VT->getElementType()->isIntegerTy(Bitwidth: 1))
4420 return InstructionCost::getInvalid();
4421
4422 // The code-generator is currently not able to handle scalable vectors
4423 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4424 // it. This change will be removed when code-generation for these types is
4425 // sufficiently reliable.
4426 if (VT->getElementCount() == ElementCount::getScalable(MinVal: 1))
4427 return InstructionCost::getInvalid();
4428
4429 return LT.first;
4430}
4431
4432 // This function returns the gather/scatter overhead, either from the
4433 // user-provided value or from per-target specialized values in \p ST.
4434static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4435 const AArch64Subtarget *ST) {
4436 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4437 "Should be called on only load or stores.");
4438 switch (Opcode) {
4439 case Instruction::Load:
4440 if (SVEGatherOverhead.getNumOccurrences() > 0)
4441 return SVEGatherOverhead;
4442 return ST->getGatherOverhead();
4443 break;
4444 case Instruction::Store:
4445 if (SVEScatterOverhead.getNumOccurrences() > 0)
4446 return SVEScatterOverhead;
4447 return ST->getScatterOverhead();
4448 break;
4449 default:
4450 llvm_unreachable("Shouldn't have reached here");
4451 }
4452}
4453
4454InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
4455 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
4456 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
4457 if (useNeonVector(Ty: DataTy) || !isLegalMaskedGatherScatter(DataType: DataTy))
4458 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
4459 Alignment, CostKind, I);
4460 auto *VT = cast<VectorType>(Val: DataTy);
4461 auto LT = getTypeLegalizationCost(Ty: DataTy);
4462 if (!LT.first.isValid())
4463 return InstructionCost::getInvalid();
4464
4465 // Return an invalid cost for element types that we are unable to lower.
4466 if (!LT.second.isVector() ||
4467 !isElementTypeLegalForScalableVector(Ty: VT->getElementType()) ||
4468 VT->getElementType()->isIntegerTy(Bitwidth: 1))
4469 return InstructionCost::getInvalid();
4470
4471 // The code-generator is currently not able to handle scalable vectors
4472 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4473 // it. This change will be removed when code-generation for these types is
4474 // sufficiently reliable.
4475 if (VT->getElementCount() == ElementCount::getScalable(MinVal: 1))
4476 return InstructionCost::getInvalid();
4477
4478 ElementCount LegalVF = LT.second.getVectorElementCount();
4479 InstructionCost MemOpCost =
4480 getMemoryOpCost(Opcode, Src: VT->getElementType(), Alignment, AddressSpace: 0, CostKind,
4481 OpInfo: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
4482 // Add on an overhead cost for using gathers/scatters.
4483 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
4484 return LT.first * MemOpCost * getMaxNumElements(VF: LegalVF);
4485}
4486
4487bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
4488 return isa<FixedVectorType>(Val: Ty) && !ST->useSVEForFixedLengthVectors();
4489}
4490
4491InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
4492 Align Alignment,
4493 unsigned AddressSpace,
4494 TTI::TargetCostKind CostKind,
4495 TTI::OperandValueInfo OpInfo,
4496 const Instruction *I) const {
4497 EVT VT = TLI->getValueType(DL, Ty, AllowUnknown: true);
4498 // Type legalization can't handle structs
4499 if (VT == MVT::Other)
4500 return BaseT::getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace,
4501 CostKind);
4502
4503 auto LT = getTypeLegalizationCost(Ty);
4504 if (!LT.first.isValid())
4505 return InstructionCost::getInvalid();
4506
4507 // The code-generator is currently not able to handle scalable vectors
4508 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4509 // it. This change will be removed when code-generation for these types is
4510 // sufficiently reliable.
4511 // We also only support full register predicate loads and stores.
4512 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
4513 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1) ||
4514 (VTy->getElementType()->isIntegerTy(Bitwidth: 1) &&
4515 !VTy->getElementCount().isKnownMultipleOf(
4516 RHS: ElementCount::getScalable(MinVal: 16))))
4517 return InstructionCost::getInvalid();
4518
4519 // TODO: consider latency as well for TCK_SizeAndLatency.
4520 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
4521 return LT.first;
4522
4523 if (CostKind != TTI::TCK_RecipThroughput)
4524 return 1;
4525
4526 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
4527 LT.second.is128BitVector() && Alignment < Align(16)) {
4528 // Unaligned stores are extremely inefficient. We don't split all
4529 // unaligned 128-bit stores because of the negative impact that has been
4530 // shown in practice on inlined block copy code.
4531 // We make such stores expensive so that we will only vectorize if there
4532 // are 6 other instructions getting vectorized.
4533 const int AmortizationCost = 6;
4534
4535 return LT.first * 2 * AmortizationCost;
4536 }
4537
4538 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
4539 if (Ty->isPtrOrPtrVectorTy())
4540 return LT.first;
4541
4542 if (useNeonVector(Ty)) {
4543 // Check truncating stores and extending loads.
4544 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
4545 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
4546 if (VT == MVT::v4i8)
4547 return 2;
4548 // Otherwise we need to scalarize.
4549 return cast<FixedVectorType>(Val: Ty)->getNumElements() * 2;
4550 }
4551 EVT EltVT = VT.getVectorElementType();
4552 unsigned EltSize = EltVT.getScalarSizeInBits();
4553 if (!isPowerOf2_32(Value: EltSize) || EltSize < 8 || EltSize > 64 ||
4554 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
4555 return LT.first;
4556 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
4557 // widening to v4i8, which produces suboptimal results.
4558 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
4559 return LT.first;
4560
4561 // Check non-power-of-2 loads/stores for legal vector element types with
4562 // NEON. Non-power-of-2 memory ops will get broken down to a set of
4563 // operations on smaller power-of-2 ops, including ld1/st1.
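// Illustrative example of the splitting below: a v7i8 store is costed as
// v4i8 + v2i8 + v1i8 (cost 3), and a v3i16 load as v2i16 + v1i16 (cost 2).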
4564 LLVMContext &C = Ty->getContext();
4565 InstructionCost Cost(0);
4566 SmallVector<EVT> TypeWorklist;
4567 TypeWorklist.push_back(Elt: VT);
4568 while (!TypeWorklist.empty()) {
4569 EVT CurrVT = TypeWorklist.pop_back_val();
4570 unsigned CurrNumElements = CurrVT.getVectorNumElements();
4571 if (isPowerOf2_32(Value: CurrNumElements)) {
4572 Cost += 1;
4573 continue;
4574 }
4575
4576 unsigned PrevPow2 = NextPowerOf2(A: CurrNumElements) / 2;
4577 TypeWorklist.push_back(Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: PrevPow2));
4578 TypeWorklist.push_back(
4579 Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: CurrNumElements - PrevPow2));
4580 }
4581 return Cost;
4582 }
4583
4584 return LT.first;
4585}
4586
4587InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
4588 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
4589 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
4590 bool UseMaskForCond, bool UseMaskForGaps) const {
4591 assert(Factor >= 2 && "Invalid interleave factor");
4592 auto *VecVTy = cast<VectorType>(Val: VecTy);
4593
4594 if (VecTy->isScalableTy() && !ST->hasSVE())
4595 return InstructionCost::getInvalid();
4596
4597 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
4598 // only have lowering for power-of-2 factors.
4599 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
4600 // InterleavedAccessPass for ld3/st3
4601 if (VecTy->isScalableTy() && !isPowerOf2_32(Value: Factor))
4602 return InstructionCost::getInvalid();
4603
4604 // Vectorization for masked interleaved accesses is only enabled for scalable
4605 // VF.
4606 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
4607 return InstructionCost::getInvalid();
4608
4609 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
4610 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
4611 auto *SubVecTy =
4612 VectorType::get(ElementType: VecVTy->getElementType(),
4613 EC: VecVTy->getElementCount().divideCoefficientBy(RHS: Factor));
4614
4615 // ldN/stN only support legal vector types of size 64 or 128 in bits.
4616 // Accesses having vector types that are a multiple of 128 bits can be
4617 // matched to more than one ldN/stN instruction.
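// Illustrative example: an interleaved load of <8 x i32> with Factor == 2
// uses SubVecTy <4 x i32> (one 128-bit register), which maps to a single
// ld2, so the cost returned below is 2 * 1.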
4618 bool UseScalable;
4619 if (MinElts % Factor == 0 &&
4620 TLI->isLegalInterleavedAccessType(VecTy: SubVecTy, DL, UseScalable))
4621 return Factor * TLI->getNumInterleavedAccesses(VecTy: SubVecTy, DL, UseScalable);
4622 }
4623
4624 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4625 Alignment, AddressSpace, CostKind,
4626 UseMaskForCond, UseMaskForGaps);
4627}
4628
4629InstructionCost
4630AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
4631 InstructionCost Cost = 0;
4632 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4633 for (auto *I : Tys) {
4634 if (!I->isVectorTy())
4635 continue;
4636 if (I->getScalarSizeInBits() * cast<FixedVectorType>(Val: I)->getNumElements() ==
4637 128)
4638 Cost += getMemoryOpCost(Opcode: Instruction::Store, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind) +
4639 getMemoryOpCost(Opcode: Instruction::Load, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind);
4640 }
4641 return Cost;
4642}
4643
4644unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
4645 return ST->getMaxInterleaveFactor();
4646}
4647
4648// For Falkor, we want to avoid having too many strided loads in a loop since
4649// that can exhaust the HW prefetcher resources. We adjust the unroller
4650// MaxCount preference below to attempt to ensure unrolling doesn't create too
4651// many strided loads.
4652static void
4653getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
4654 TargetTransformInfo::UnrollingPreferences &UP) {
4655 enum { MaxStridedLoads = 7 };
4656 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
4657 int StridedLoads = 0;
4658 // FIXME? We could make this more precise by looking at the CFG and
4659 // e.g. not counting loads in each side of an if-then-else diamond.
4660 for (const auto BB : L->blocks()) {
4661 for (auto &I : *BB) {
4662 LoadInst *LMemI = dyn_cast<LoadInst>(Val: &I);
4663 if (!LMemI)
4664 continue;
4665
4666 Value *PtrValue = LMemI->getPointerOperand();
4667 if (L->isLoopInvariant(V: PtrValue))
4668 continue;
4669
4670 const SCEV *LSCEV = SE.getSCEV(V: PtrValue);
4671 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(Val: LSCEV);
4672 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
4673 continue;
4674
4675 // FIXME? We could take pairing of unrolled load copies into account
4676 // by looking at the AddRec, but we would probably have to limit this
4677 // to loops with no stores or other memory optimization barriers.
4678 ++StridedLoads;
4679 // We've seen enough strided loads that seeing more won't make a
4680 // difference.
4681 if (StridedLoads > MaxStridedLoads / 2)
4682 return StridedLoads;
4683 }
4684 }
4685 return StridedLoads;
4686 };
4687
4688 int StridedLoads = countStridedLoads(L, SE);
4689 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
4690 << " strided loads\n");
4691 // Pick the largest power of 2 unroll count that won't result in too many
4692 // strided loads.
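// Illustrative example: with MaxStridedLoads == 7 and 2 strided loads
// detected, 7 / 2 == 3 and Log2_32(3) == 1, so MaxCount is set to 1 << 1 == 2.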
4693 if (StridedLoads) {
4694 UP.MaxCount = 1 << Log2_32(Value: MaxStridedLoads / StridedLoads);
4695 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
4696 << UP.MaxCount << '\n');
4697 }
4698}
4699
4700// This function returns true if the loop:
4701// 1. Has a valid cost, and
4702// 2. Has a cost within the supplied budget.
4703// Otherwise it returns false.
4704static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI,
4705 InstructionCost Budget,
4706 unsigned *FinalSize) {
4707 // Estimate the size of the loop.
4708 InstructionCost LoopCost = 0;
4709
4710 for (auto *BB : L->getBlocks()) {
4711 for (auto &I : *BB) {
4712 SmallVector<const Value *, 4> Operands(I.operand_values());
4713 InstructionCost Cost =
4714 TTI.getInstructionCost(U: &I, Operands, CostKind: TTI::TCK_CodeSize);
4715 // This can happen with intrinsics that don't currently have a cost model
4716 // or for some operations that require SVE.
4717 if (!Cost.isValid())
4718 return false;
4719
4720 LoopCost += Cost;
4721 if (LoopCost > Budget)
4722 return false;
4723 }
4724 }
4725
4726 if (FinalSize)
4727 *FinalSize = LoopCost.getValue();
4728 return true;
4729}
4730
4731static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
4732 const AArch64TTIImpl &TTI) {
4733 // Only consider loops with unknown trip counts for which we can determine
4734 // a symbolic expression. Multi-exit loops with small known trip counts will
4735 // likely be unrolled anyway.
4736 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
4737 if (isa<SCEVConstant>(Val: BTC) || isa<SCEVCouldNotCompute>(Val: BTC))
4738 return false;
4739
4740 // It might not be worth unrolling loops with low max trip counts. Restrict
4741 // this to max trip counts > 32 for now.
4742 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
4743 if (MaxTC > 0 && MaxTC <= 32)
4744 return false;
4745
4746 // Make sure the loop size is <= 5.
4747 if (!isLoopSizeWithinBudget(L, TTI, Budget: 5, FinalSize: nullptr))
4748 return false;
4749
4750 // Small search loops with multiple exits can be highly beneficial to unroll.
4751 // We only care about loops with exactly two exiting blocks, although each
4752 // block could jump to the same exit block.
4753 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
4754 if (Blocks.size() != 2)
4755 return false;
4756
4757 if (any_of(Range&: Blocks, P: [](BasicBlock *BB) {
4758 return !isa<BranchInst>(Val: BB->getTerminator());
4759 }))
4760 return false;
4761
4762 return true;
4763}
4764
4765 /// For Apple CPUs, we want to runtime-unroll loops to make better use of the
4766 /// OOO engine's wide instruction window and various predictors.
4767static void
4768getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
4769 TargetTransformInfo::UnrollingPreferences &UP,
4770 const AArch64TTIImpl &TTI) {
4771 // Limit to loops with structure that is highly likely to benefit from runtime
4772 // unrolling; that is, we exclude outer loops and loops with many blocks (i.e.
4773 // likely with complex control flow). Note that the heuristics here may be
4774 // overly conservative: we err on the side of avoiding runtime unrolling rather
4775 // than unrolling excessively. They are all subject to further refinement.
4776 if (!L->isInnermost() || L->getNumBlocks() > 8)
4777 return;
4778
4779 // Loops with multiple exits are handled by common code.
4780 if (!L->getExitBlock())
4781 return;
4782
4783 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
4784 if (isa<SCEVConstant>(Val: BTC) || isa<SCEVCouldNotCompute>(Val: BTC) ||
4785 (SE.getSmallConstantMaxTripCount(L) > 0 &&
4786 SE.getSmallConstantMaxTripCount(L) <= 32))
4787 return;
4788
4789 if (findStringMetadataForLoop(TheLoop: L, Name: "llvm.loop.isvectorized"))
4790 return;
4791
4792 if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L))
4793 return;
4794
4795 // Limit to loops with trip counts that are cheap to expand.
4796 UP.SCEVExpansionBudget = 1;
4797
4798 // Try to unroll small, single-block loops, if they have load/store
4799 // dependencies, to expose more parallel memory access streams.
4800 BasicBlock *Header = L->getHeader();
4801 if (Header == L->getLoopLatch()) {
4802 // Estimate the size of the loop.
4803 unsigned Size;
4804 if (!isLoopSizeWithinBudget(L, TTI, Budget: 8, FinalSize: &Size))
4805 return;
4806
4807 SmallPtrSet<Value *, 8> LoadedValues;
4808 SmallVector<StoreInst *> Stores;
4809 for (auto *BB : L->blocks()) {
4810 for (auto &I : *BB) {
4811 Value *Ptr = getLoadStorePointerOperand(V: &I);
4812 if (!Ptr)
4813 continue;
4814 const SCEV *PtrSCEV = SE.getSCEV(V: Ptr);
4815 if (SE.isLoopInvariant(S: PtrSCEV, L))
4816 continue;
4817 if (isa<LoadInst>(Val: &I))
4818 LoadedValues.insert(Ptr: &I);
4819 else
4820 Stores.push_back(Elt: cast<StoreInst>(Val: &I));
4821 }
4822 }
4823
4824 // Try to find an unroll count that maximizes the use of the instruction
4825 // window, i.e. trying to fetch as many instructions per cycle as possible.
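// Illustrative example: with an estimated loop Size of 16, the search below
// settles on BestUC == 3, since 3 * 16 == 48 instructions still fits the
// 48-instruction budget and is a multiple of MaxInstsPerLine.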
4826 unsigned MaxInstsPerLine = 16;
4827 unsigned UC = 1;
4828 unsigned BestUC = 1;
4829 unsigned SizeWithBestUC = BestUC * Size;
4830 while (UC <= 8) {
4831 unsigned SizeWithUC = UC * Size;
4832 if (SizeWithUC > 48)
4833 break;
4834 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4835 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4836 BestUC = UC;
4837 SizeWithBestUC = BestUC * Size;
4838 }
4839 UC++;
4840 }
4841
4842 if (BestUC == 1 || none_of(Range&: Stores, P: [&LoadedValues](StoreInst *SI) {
4843 return LoadedValues.contains(Ptr: SI->getOperand(i_nocapture: 0));
4844 }))
4845 return;
4846
4847 UP.Runtime = true;
4848 UP.DefaultUnrollRuntimeCount = BestUC;
4849 return;
4850 }
4851
4852 // Try to runtime-unroll loops with early-continues depending on loop-varying
4853 // loads; this helps with branch-prediction for the early-continues.
4854 auto *Term = dyn_cast<BranchInst>(Val: Header->getTerminator());
4855 auto *Latch = L->getLoopLatch();
4856 SmallVector<BasicBlock *> Preds(predecessors(BB: Latch));
4857 if (!Term || !Term->isConditional() || Preds.size() == 1 ||
4858 !llvm::is_contained(Range&: Preds, Element: Header) ||
4859 none_of(Range&: Preds, P: [L](BasicBlock *Pred) { return L->contains(BB: Pred); }))
4860 return;
4861
4862 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
4863 [&](Instruction *I, unsigned Depth) -> bool {
4864 if (isa<PHINode>(Val: I) || L->isLoopInvariant(V: I) || Depth > 8)
4865 return false;
4866
4867 if (isa<LoadInst>(Val: I))
4868 return true;
4869
4870 return any_of(Range: I->operands(), P: [&](Value *V) {
4871 auto *I = dyn_cast<Instruction>(Val: V);
4872 return I && DependsOnLoopLoad(I, Depth + 1);
4873 });
4874 };
4875 CmpPredicate Pred;
4876 Instruction *I;
4877 if (match(V: Term, P: m_Br(C: m_ICmp(Pred, L: m_Instruction(I), R: m_Value()), T: m_Value(),
4878 F: m_Value())) &&
4879 DependsOnLoopLoad(I, 0)) {
4880 UP.Runtime = true;
4881 }
4882}
4883
4884void AArch64TTIImpl::getUnrollingPreferences(
4885 Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
4886 OptimizationRemarkEmitter *ORE) const {
4887 // Enable partial unrolling and runtime unrolling.
4888 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
4889
4890 UP.UpperBound = true;
4891
4892 // An inner loop is more likely to be hot, and its runtime check can be
4893 // hoisted out by the LICM pass, so the overhead is lower; try a larger
4894 // threshold to unroll more loops.
4895 if (L->getLoopDepth() > 1)
4896 UP.PartialThreshold *= 2;
4897
4898 // Disable partial & runtime unrolling on -Os.
4899 UP.PartialOptSizeThreshold = 0;
4900
4901 // Scan the loop: don't unroll loops with calls as this could prevent
4902 // inlining. Don't unroll vector loops either, as they don't benefit much from
4903 // unrolling.
4904 for (auto *BB : L->getBlocks()) {
4905 for (auto &I : *BB) {
4906 // Don't unroll vectorized loops.
4907 if (I.getType()->isVectorTy())
4908 return;
4909
4910 if (isa<CallBase>(Val: I)) {
4911 if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I))
4912 if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction())
4913 if (!isLoweredToCall(F))
4914 continue;
4915 return;
4916 }
4917 }
4918 }
4919
4920 // Apply subtarget-specific unrolling preferences.
4921 switch (ST->getProcFamily()) {
4922 case AArch64Subtarget::AppleA14:
4923 case AArch64Subtarget::AppleA15:
4924 case AArch64Subtarget::AppleA16:
4925 case AArch64Subtarget::AppleM4:
4926 getAppleRuntimeUnrollPreferences(L, SE, UP, TTI: *this);
4927 break;
4928 case AArch64Subtarget::Falkor:
4929 if (EnableFalkorHWPFUnrollFix)
4930 getFalkorUnrollingPreferences(L, SE, UP);
4931 break;
4932 default:
4933 break;
4934 }
4935
4936 // If this is a small, multi-exit loop similar to something like std::find,
4937 // then there is typically a performance improvement achieved by unrolling.
4938 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, TTI: *this)) {
4939 UP.RuntimeUnrollMultiExit = true;
4940 UP.Runtime = true;
4941 // Limit unroll count.
4942 UP.DefaultUnrollRuntimeCount = 4;
4943 // Allow slightly more costly trip-count expansion to catch search loops
4944 // with pointer inductions.
4945 UP.SCEVExpansionBudget = 5;
4946 return;
4947 }
4948
4949 // Enable runtime unrolling for in-order models.
4950 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Generic, so
4951 // by checking for that case, we can ensure that the default behaviour is
4952 // unchanged.
4953 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
4954 !ST->getSchedModel().isOutOfOrder()) {
4955 UP.Runtime = true;
4956 UP.Partial = true;
4957 UP.UnrollRemainder = true;
4958 UP.DefaultUnrollRuntimeCount = 4;
4959
4960 UP.UnrollAndJam = true;
4961 UP.UnrollAndJamInnerLoopThreshold = 60;
4962 }
4963}
4964
4965void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
4966 TTI::PeelingPreferences &PP) const {
4967 BaseT::getPeelingPreferences(L, SE, PP);
4968}
4969
4970Value *
4971AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
4972 Type *ExpectedType) const {
4973 switch (Inst->getIntrinsicID()) {
4974 default:
4975 return nullptr;
4976 case Intrinsic::aarch64_neon_st2:
4977 case Intrinsic::aarch64_neon_st3:
4978 case Intrinsic::aarch64_neon_st4: {
4979 // The expected type needs to be a struct whose elements match the stored values.
4980 StructType *ST = dyn_cast<StructType>(Val: ExpectedType);
4981 if (!ST)
4982 return nullptr;
4983 unsigned NumElts = Inst->arg_size() - 1;
4984 if (ST->getNumElements() != NumElts)
4985 return nullptr;
4986 for (unsigned i = 0, e = NumElts; i != e; ++i) {
4987 if (Inst->getArgOperand(i)->getType() != ST->getElementType(N: i))
4988 return nullptr;
4989 }
4990 Value *Res = PoisonValue::get(T: ExpectedType);
4991 IRBuilder<> Builder(Inst);
4992 for (unsigned i = 0, e = NumElts; i != e; ++i) {
4993 Value *L = Inst->getArgOperand(i);
4994 Res = Builder.CreateInsertValue(Agg: Res, Val: L, Idxs: i);
4995 }
4996 return Res;
4997 }
4998 case Intrinsic::aarch64_neon_ld2:
4999 case Intrinsic::aarch64_neon_ld3:
5000 case Intrinsic::aarch64_neon_ld4:
5001 if (Inst->getType() == ExpectedType)
5002 return Inst;
5003 return nullptr;
5004 }
5005}
5006
5007bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
5008 MemIntrinsicInfo &Info) const {
5009 switch (Inst->getIntrinsicID()) {
5010 default:
5011 break;
5012 case Intrinsic::aarch64_neon_ld2:
5013 case Intrinsic::aarch64_neon_ld3:
5014 case Intrinsic::aarch64_neon_ld4:
5015 Info.ReadMem = true;
5016 Info.WriteMem = false;
5017 Info.PtrVal = Inst->getArgOperand(i: 0);
5018 break;
5019 case Intrinsic::aarch64_neon_st2:
5020 case Intrinsic::aarch64_neon_st3:
5021 case Intrinsic::aarch64_neon_st4:
5022 Info.ReadMem = false;
5023 Info.WriteMem = true;
5024 Info.PtrVal = Inst->getArgOperand(i: Inst->arg_size() - 1);
5025 break;
5026 }
5027
5028 switch (Inst->getIntrinsicID()) {
5029 default:
5030 return false;
5031 case Intrinsic::aarch64_neon_ld2:
5032 case Intrinsic::aarch64_neon_st2:
5033 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5034 break;
5035 case Intrinsic::aarch64_neon_ld3:
5036 case Intrinsic::aarch64_neon_st3:
5037 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5038 break;
5039 case Intrinsic::aarch64_neon_ld4:
5040 case Intrinsic::aarch64_neon_st4:
5041 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5042 break;
5043 }
5044 return true;
5045}
5046
5047 /// See if \p I should be considered for address type promotion. We check if
5048 /// \p I is a sext with the right type that is used in memory accesses. If it
5049 /// is used in a "complex" getelementptr, we allow it to be promoted without
5050 /// finding other sext instructions that sign extended the same initial value.
5051 /// A getelementptr is considered "complex" if it has more than 2 operands.
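/// For example (illustrative): a `sext i32 %i to i64` feeding
/// `getelementptr [16 x i32], ptr %p, i64 0, i64 %idx` (three operands) is
/// allowed to be promoted without a common header.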
5052bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
5053 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5054 bool Considerable = false;
5055 AllowPromotionWithoutCommonHeader = false;
5056 if (!isa<SExtInst>(Val: &I))
5057 return false;
5058 Type *ConsideredSExtType =
5059 Type::getInt64Ty(C&: I.getParent()->getParent()->getContext());
5060 if (I.getType() != ConsideredSExtType)
5061 return false;
5062 // See if the sext is the one with the right type and used in at least one
5063 // GetElementPtrInst.
5064 for (const User *U : I.users()) {
5065 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(Val: U)) {
5066 Considerable = true;
5067 // A getelementptr is considered as "complex" if it has more than 2
5068 // operands. We will promote a SExt used in such complex GEP as we
5069 // expect some computation to be merged if they are done on 64 bits.
5070 if (GEPInst->getNumOperands() > 2) {
5071 AllowPromotionWithoutCommonHeader = true;
5072 break;
5073 }
5074 }
5075 }
5076 return Considerable;
5077}
5078
5079bool AArch64TTIImpl::isLegalToVectorizeReduction(
5080 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5081 if (!VF.isScalable())
5082 return true;
5083
5084 Type *Ty = RdxDesc.getRecurrenceType();
5085 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5086 return false;
5087
5088 switch (RdxDesc.getRecurrenceKind()) {
5089 case RecurKind::Add:
5090 case RecurKind::FAdd:
5091 case RecurKind::And:
5092 case RecurKind::Or:
5093 case RecurKind::Xor:
5094 case RecurKind::SMin:
5095 case RecurKind::SMax:
5096 case RecurKind::UMin:
5097 case RecurKind::UMax:
5098 case RecurKind::FMin:
5099 case RecurKind::FMax:
5100 case RecurKind::FMulAdd:
5101 case RecurKind::AnyOf:
5102 return true;
5103 default:
5104 return false;
5105 }
5106}
5107
5108InstructionCost
5109AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
5110 FastMathFlags FMF,
5111 TTI::TargetCostKind CostKind) const {
5112 // The code-generator is currently not able to handle scalable vectors
5113 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5114 // it. This change will be removed when code-generation for these types is
5115 // sufficiently reliable.
5116 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
5117 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
5118 return InstructionCost::getInvalid();
5119
5120 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5121
5122 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5123 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5124
5125 InstructionCost LegalizationCost = 0;
5126 if (LT.first > 1) {
5127 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: Ty->getContext());
5128 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5129 LegalizationCost = getIntrinsicInstrCost(ICA: Attrs, CostKind) * (LT.first - 1);
5130 }
5131
5132 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5133}
5134
5135InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
5136 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5137 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
5138 InstructionCost LegalizationCost = 0;
5139 if (LT.first > 1) {
5140 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: ValTy->getContext());
5141 LegalizationCost = getArithmeticInstrCost(Opcode, Ty: LegalVTy, CostKind);
5142 LegalizationCost *= LT.first - 1;
5143 }
5144
5145 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5146 assert(ISD && "Invalid opcode");
5147 // Add the final reduction cost for the legal horizontal reduction
5148 switch (ISD) {
5149 case ISD::ADD:
5150 case ISD::AND:
5151 case ISD::OR:
5152 case ISD::XOR:
5153 case ISD::FADD:
5154 return LegalizationCost + 2;
5155 default:
5156 return InstructionCost::getInvalid();
5157 }
5158}
5159
5160InstructionCost
5161AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5162 std::optional<FastMathFlags> FMF,
5163 TTI::TargetCostKind CostKind) const {
5164 // The code-generator is currently not able to handle scalable vectors
5165 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5166 // it. This change will be removed when code-generation for these types is
5167 // sufficiently reliable.
5168 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: ValTy))
5169 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
5170 return InstructionCost::getInvalid();
5171
5172 if (TTI::requiresOrderedReduction(FMF)) {
5173 if (auto *FixedVTy = dyn_cast<FixedVectorType>(Val: ValTy)) {
5174 InstructionCost BaseCost =
5175 BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
5176 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5177 // end up vectorizing for more computationally intensive loops.
5178 return BaseCost + FixedVTy->getNumElements();
5179 }
5180
5181 if (Opcode != Instruction::FAdd)
5182 return InstructionCost::getInvalid();
5183
5184 auto *VTy = cast<ScalableVectorType>(Val: ValTy);
5185 InstructionCost Cost =
5186 getArithmeticInstrCost(Opcode, Ty: VTy->getScalarType(), CostKind);
5187 Cost *= getMaxNumElements(VF: VTy->getElementCount());
5188 return Cost;
5189 }
5190
5191 if (isa<ScalableVectorType>(Val: ValTy))
5192 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5193
5194 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
5195 MVT MTy = LT.second;
5196 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5197 assert(ISD && "Invalid opcode");
5198
5199 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5200 // instructions as twice a normal vector add, plus 1 for each additional
5201 // legalization step (LT.first - 1). This is the only arithmetic vector
5202 // reduction operation for which we have an instruction.
5203 // OR, XOR and AND costs should match the codegen from:
5204 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5205 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5206 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
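// Illustrative worked example (assuming the table and formula below): a
// v16i32 add reduction legalizes to four v4i32 parts (LT.first == 4), so its
// cost is (4 - 1) extra legalization steps plus the table entry of 2, i.e. 5.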
5207 static const CostTblEntry CostTblNoPairwise[]{
5208 {.ISD: ISD::ADD, .Type: MVT::v8i8, .Cost: 2},
5209 {.ISD: ISD::ADD, .Type: MVT::v16i8, .Cost: 2},
5210 {.ISD: ISD::ADD, .Type: MVT::v4i16, .Cost: 2},
5211 {.ISD: ISD::ADD, .Type: MVT::v8i16, .Cost: 2},
5212 {.ISD: ISD::ADD, .Type: MVT::v4i32, .Cost: 2},
5213 {.ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: 2},
5214 {.ISD: ISD::OR, .Type: MVT::v8i8, .Cost: 15},
5215 {.ISD: ISD::OR, .Type: MVT::v16i8, .Cost: 17},
5216 {.ISD: ISD::OR, .Type: MVT::v4i16, .Cost: 7},
5217 {.ISD: ISD::OR, .Type: MVT::v8i16, .Cost: 9},
5218 {.ISD: ISD::OR, .Type: MVT::v2i32, .Cost: 3},
5219 {.ISD: ISD::OR, .Type: MVT::v4i32, .Cost: 5},
5220 {.ISD: ISD::OR, .Type: MVT::v2i64, .Cost: 3},
5221 {.ISD: ISD::XOR, .Type: MVT::v8i8, .Cost: 15},
5222 {.ISD: ISD::XOR, .Type: MVT::v16i8, .Cost: 17},
5223 {.ISD: ISD::XOR, .Type: MVT::v4i16, .Cost: 7},
5224 {.ISD: ISD::XOR, .Type: MVT::v8i16, .Cost: 9},
5225 {.ISD: ISD::XOR, .Type: MVT::v2i32, .Cost: 3},
5226 {.ISD: ISD::XOR, .Type: MVT::v4i32, .Cost: 5},
5227 {.ISD: ISD::XOR, .Type: MVT::v2i64, .Cost: 3},
5228 {.ISD: ISD::AND, .Type: MVT::v8i8, .Cost: 15},
5229 {.ISD: ISD::AND, .Type: MVT::v16i8, .Cost: 17},
5230 {.ISD: ISD::AND, .Type: MVT::v4i16, .Cost: 7},
5231 {.ISD: ISD::AND, .Type: MVT::v8i16, .Cost: 9},
5232 {.ISD: ISD::AND, .Type: MVT::v2i32, .Cost: 3},
5233 {.ISD: ISD::AND, .Type: MVT::v4i32, .Cost: 5},
5234 {.ISD: ISD::AND, .Type: MVT::v2i64, .Cost: 3},
5235 };
5236 switch (ISD) {
5237 default:
5238 break;
5239 case ISD::FADD:
5240 if (Type *EltTy = ValTy->getScalarType();
5241 // FIXME: For half types without fullfp16 support, this could extend and
5242 // use a fp32 faddp reduction but current codegen unrolls.
5243 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5244 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5245 const unsigned NElts = MTy.getVectorNumElements();
5246 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5247 isPowerOf2_32(Value: NElts))
5248 // Reduction corresponding to series of fadd instructions is lowered to
5249 // series of faddp instructions. faddp has latency/throughput that
5250 // matches fadd instruction and hence, every faddp instruction can be
5251 // considered to have a relative cost = 1 with
5252 // CostKind = TCK_RecipThroughput.
5253 // An faddp pairwise-adds the vector elements, so the size of the input
5254 // vector halves on every step, requiring
5255 // #(faddp instructions) = Log2_32(NElts).
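// Illustrative example: a <8 x float> fadd reduction legalizes to v4f32
// (LT.first == 2, NElts == 4), giving a cost of (2 - 1) + Log2_32(4) == 3.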
5256 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(Value: NElts);
5257 }
5258 break;
5259 case ISD::ADD:
5260 if (const auto *Entry = CostTableLookup(Table: CostTblNoPairwise, ISD, Ty: MTy))
5261 return (LT.first - 1) + Entry->Cost;
5262 break;
5263 case ISD::XOR:
5264 case ISD::AND:
5265 case ISD::OR:
5266 const auto *Entry = CostTableLookup(Table: CostTblNoPairwise, ISD, Ty: MTy);
5267 if (!Entry)
5268 break;
5269 auto *ValVTy = cast<FixedVectorType>(Val: ValTy);
5270 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5271 isPowerOf2_32(Value: ValVTy->getNumElements())) {
5272 InstructionCost ExtraCost = 0;
5273 if (LT.first != 1) {
5274 // Type needs to be split, so there is an extra cost of LT.first - 1
5275 // arithmetic ops.
5276 auto *Ty = FixedVectorType::get(ElementType: ValTy->getElementType(),
5277 NumElts: MTy.getVectorNumElements());
5278 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5279 ExtraCost *= LT.first - 1;
5280 }
5281 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5282 auto Cost = ValVTy->getElementType()->isIntegerTy(Bitwidth: 1) ? 2 : Entry->Cost;
5283 return Cost + ExtraCost;
5284 }
5285 break;
5286 }
5287 return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
5288}
5289
5290InstructionCost AArch64TTIImpl::getExtendedReductionCost(
5291 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
5292 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
5293 EVT VecVT = TLI->getValueType(DL, Ty: VecTy);
5294 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
5295
5296 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5297 VecVT.getSizeInBits() >= 64) {
5298 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VecTy);
5299
5300 // The legal cases are:
5301 // UADDLV 8/16/32->32
5302 // UADDLP 32->64
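// Illustrative example: add-reducing zext(<16 x i8>) to an i32 result fits
// the UADDLV 8->32 case, so it is modelled as the flat cost of 2 below (plus
// 2 per extra legalization step).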
5303 unsigned RevVTSize = ResVT.getSizeInBits();
5304 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5305 RevVTSize <= 32) ||
5306 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5307 RevVTSize <= 32) ||
5308 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5309 RevVTSize <= 64))
5310 return (LT.first - 1) * 2 + 2;
5311 }
5312
5313 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: VecTy, FMF,
5314 CostKind);
5315}
5316
5317InstructionCost
5318AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
5319 VectorType *VecTy,
5320 TTI::TargetCostKind CostKind) const {
5321 EVT VecVT = TLI->getValueType(DL, Ty: VecTy);
5322 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
5323
5324 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple()) {
5325 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VecTy);
5326
5327 // The legal cases with dotprod are
5328 // UDOT 8->32,
5329 // which requires an additional uaddv to sum the i32 values.
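// Illustrative example: reducing mul(zext(<16 x i8>), zext(<16 x i8>)) into
// an i32 can be lowered as a udot into a v4i32 accumulator plus an addv,
// which is what the LT.first + 2 cost below models.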
5330 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5331 ResVT == MVT::i32)
5332 return LT.first + 2;
5333 }
5334
5335 return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, Ty: VecTy, CostKind);
5336}
5337
5338InstructionCost
5339AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index,
5340 TTI::TargetCostKind CostKind) const {
5341 static const CostTblEntry ShuffleTbl[] = {
5342 { .ISD: TTI::SK_Splice, .Type: MVT::nxv16i8, .Cost: 1 },
5343 { .ISD: TTI::SK_Splice, .Type: MVT::nxv8i16, .Cost: 1 },
5344 { .ISD: TTI::SK_Splice, .Type: MVT::nxv4i32, .Cost: 1 },
5345 { .ISD: TTI::SK_Splice, .Type: MVT::nxv2i64, .Cost: 1 },
5346 { .ISD: TTI::SK_Splice, .Type: MVT::nxv2f16, .Cost: 1 },
5347 { .ISD: TTI::SK_Splice, .Type: MVT::nxv4f16, .Cost: 1 },
5348 { .ISD: TTI::SK_Splice, .Type: MVT::nxv8f16, .Cost: 1 },
5349 { .ISD: TTI::SK_Splice, .Type: MVT::nxv2bf16, .Cost: 1 },
5350 { .ISD: TTI::SK_Splice, .Type: MVT::nxv4bf16, .Cost: 1 },
5351 { .ISD: TTI::SK_Splice, .Type: MVT::nxv8bf16, .Cost: 1 },
5352 { .ISD: TTI::SK_Splice, .Type: MVT::nxv2f32, .Cost: 1 },
5353 { .ISD: TTI::SK_Splice, .Type: MVT::nxv4f32, .Cost: 1 },
5354 { .ISD: TTI::SK_Splice, .Type: MVT::nxv2f64, .Cost: 1 },
5355 };
5356
5357 // The code-generator is currently not able to handle scalable vectors
5358 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5359 // it. This change will be removed when code-generation for these types is
5360 // sufficiently reliable.
5361 if (Tp->getElementCount() == ElementCount::getScalable(MinVal: 1))
5362 return InstructionCost::getInvalid();
5363
5364 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp);
5365 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: Tp->getContext());
5366 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5367 ? TLI->getPromotedVTForPredicate(VT: EVT(LT.second))
5368 : LT.second;
5369 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Context&: Tp->getContext());
5370 InstructionCost LegalizationCost = 0;
5371 if (Index < 0) {
5372 LegalizationCost =
5373 getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: PromotedVTy, CondTy: PromotedVTy,
5374 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind) +
5375 getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: PromotedVTy, CondTy: LegalVTy,
5376 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
5377 }
5378
5379 // Predicated splices are promoted when lowering. See AArch64ISelLowering.cpp.
5380 // The cost is computed on the promoted type.
5381 if (LT.second.getScalarType() == MVT::i1) {
5382 LegalizationCost +=
5383 getCastInstrCost(Opcode: Instruction::ZExt, Dst: PromotedVTy, Src: LegalVTy,
5384 CCH: TTI::CastContextHint::None, CostKind) +
5385 getCastInstrCost(Opcode: Instruction::Trunc, Dst: LegalVTy, Src: PromotedVTy,
5386 CCH: TTI::CastContextHint::None, CostKind);
5387 }
5388 const auto *Entry =
5389 CostTableLookup(Table: ShuffleTbl, ISD: TTI::SK_Splice, Ty: PromotedVT.getSimpleVT());
5390 assert(Entry && "Illegal Type for Splice");
5391 LegalizationCost += Entry->Cost;
5392 return LegalizationCost * LT.first;
5393}
5394
5395InstructionCost AArch64TTIImpl::getPartialReductionCost(
5396 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
5397 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
5398 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
5399 TTI::TargetCostKind CostKind) const {
5400 InstructionCost Invalid = InstructionCost::getInvalid();
5401 InstructionCost Cost(TTI::TCC_Basic);
5402
5403 if (CostKind != TTI::TCK_RecipThroughput)
5404 return Invalid;
5405
5406 // Sub opcodes currently only occur in chained cases.
5407 // Independent partial reduction subtractions are still costed as an add.
5408 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) ||
5409 OpAExtend == TTI::PR_None)
5410 return Invalid;
5411
5412 // We only support multiply binary operations for now, and for muls we
5413 // require the types being extended to be the same.
5414 // NOTE: For muls AArch64 supports lowering mixed extensions to a usdot but
5415 // only if the i8mm or sve/streaming features are available.
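// Illustrative example (assuming dotprod is available): a partial reduction
// that accumulates mul(zext(i8), zext(i8)) with VF 16 into a <4 x i32>
// accumulator corresponds to a single udot and gets the basic cost below.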
5416 if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB ||
5417 OpBExtend == TTI::PR_None ||
5418 (OpAExtend != OpBExtend && !ST->hasMatMulInt8() &&
5419 !ST->isSVEorStreamingSVEAvailable())))
5420 return Invalid;
5421 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
5422 "Unexpected values for OpBExtend or InputTypeB");
5423
5424 EVT InputEVT = EVT::getEVT(Ty: InputTypeA);
5425 EVT AccumEVT = EVT::getEVT(Ty: AccumType);
5426
5427 unsigned VFMinValue = VF.getKnownMinValue();
5428
5429 if (VF.isScalable()) {
5430 if (!ST->isSVEorStreamingSVEAvailable())
5431 return Invalid;
5432
5433 // Don't accept a partial reduction if the scaled accumulator is vscale x 1,
5434 // since we can't lower that type.
5435 unsigned Scale =
5436 AccumEVT.getScalarSizeInBits() / InputEVT.getScalarSizeInBits();
5437 if (VFMinValue == Scale)
5438 return Invalid;
5439 }
5440 if (VF.isFixed() &&
5441 (!ST->isNeonAvailable() || !ST->hasDotProd() || AccumEVT == MVT::i64))
5442 return Invalid;
5443
5444 if (InputEVT == MVT::i8) {
5445 switch (VFMinValue) {
5446 default:
5447 return Invalid;
5448 case 8:
5449 if (AccumEVT == MVT::i32)
5450 Cost *= 2;
5451 else if (AccumEVT != MVT::i64)
5452 return Invalid;
5453 break;
5454 case 16:
5455 if (AccumEVT == MVT::i64)
5456 Cost *= 2;
5457 else if (AccumEVT != MVT::i32)
5458 return Invalid;
5459 break;
5460 }
5461 } else if (InputEVT == MVT::i16) {
5462 // FIXME: Allow i32 accumulator but increase cost, as we would extend
5463 // it to i64.
5464 if (VFMinValue != 8 || AccumEVT != MVT::i64)
5465 return Invalid;
5466 } else
5467 return Invalid;
5468
5469 return Cost;
5470}
5471
5472InstructionCost
5473AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
5474 VectorType *SrcTy, ArrayRef<int> Mask,
5475 TTI::TargetCostKind CostKind, int Index,
5476 VectorType *SubTp, ArrayRef<const Value *> Args,
5477 const Instruction *CxtI) const {
5478 assert((Mask.empty() || DstTy->isScalableTy() ||
5479 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
5480 "Expected the Mask to match the return size if given");
5481 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
5482 "Expected the same scalar types");
5483 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy);
5484
5485 // If we have a Mask and the legalized type is split into multiple vectors,
5486 // split the Mask into smaller chunks and sum the cost of each sub-shuffle.
5487 if (!Mask.empty() && isa<FixedVectorType>(Val: SrcTy) && LT.second.isVector() &&
5488 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
5489 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
5490 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
5491 // Check for LD3/LD4 instructions, which are represented in llvm IR as
5492 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
5493 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
5494 // cost than just the load.
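// For example (illustrative), a mask of <0, 3, 6, 9, ...> applied to a loaded
// vector selects every third element, i.e. a factor-3 deinterleave that can
// become one of the results of an ld3.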
5495 if (Args.size() >= 1 && isa<LoadInst>(Val: Args[0]) &&
5496 (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 3) ||
5497 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 4)))
5498 return std::max<InstructionCost>(a: 1, b: LT.first / 4);
5499
5500 // Check for ST3/ST4 instructions, which are represented in llvm IR as
5501 // store(interleaving-shuffle). The shuffle cost could potentially be free,
5502 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
5503 // cost than just the store.
5504 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(Val: *CxtI->user_begin()) &&
5505 (ShuffleVectorInst::isInterleaveMask(
5506 Mask, Factor: 4, NumInputElts: SrcTy->getElementCount().getKnownMinValue() * 2) ||
5507 ShuffleVectorInst::isInterleaveMask(
5508 Mask, Factor: 3, NumInputElts: SrcTy->getElementCount().getKnownMinValue() * 2)))
5509 return LT.first;
5510
5511 unsigned TpNumElts = Mask.size();
5512 unsigned LTNumElts = LT.second.getVectorNumElements();
5513 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
5514 VectorType *NTp = VectorType::get(ElementType: SrcTy->getScalarType(),
5515 EC: LT.second.getVectorElementCount());
5516 InstructionCost Cost;
5517 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
5518 PreviousCosts;
5519 for (unsigned N = 0; N < NumVecs; N++) {
5520 SmallVector<int> NMask;
5521 // Split the existing mask into chunks of size LTNumElts. Track the source
5522 // sub-vectors to ensure the result has at most 2 inputs.
5523 unsigned Source1 = -1U, Source2 = -1U;
5524 unsigned NumSources = 0;
5525 for (unsigned E = 0; E < LTNumElts; E++) {
5526 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
5527 : PoisonMaskElem;
5528 if (MaskElt < 0) {
5529 NMask.push_back(Elt: PoisonMaskElem);
5530 continue;
5531 }
5532
5533 // Calculate which source from the input this comes from and whether it
5534 // is new to us.
5535 unsigned Source = MaskElt / LTNumElts;
5536 if (NumSources == 0) {
5537 Source1 = Source;
5538 NumSources = 1;
5539 } else if (NumSources == 1 && Source != Source1) {
5540 Source2 = Source;
5541 NumSources = 2;
5542 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
5543 NumSources++;
5544 }
5545
5546 // Add to the new mask. For the NumSources>2 case these are not correct,
5547 // but are only used for the modular lane number.
5548 if (Source == Source1)
5549 NMask.push_back(Elt: MaskElt % LTNumElts);
5550 else if (Source == Source2)
5551 NMask.push_back(Elt: MaskElt % LTNumElts + LTNumElts);
5552 else
5553 NMask.push_back(Elt: MaskElt % LTNumElts);
5554 }
5555 // Check if we have already generated this sub-shuffle, which means we
5556 // will have already generated the output. For example a <16 x i32> splat
5557 // will be the same sub-splat 4 times, which only needs to be generated
5558 // once and reused.
5559 auto Result =
5560 PreviousCosts.insert(x: {std::make_tuple(args&: Source1, args&: Source2, args&: NMask), 0});
5561 // Check if it was already in the map (already costed).
5562 if (!Result.second)
5563 continue;
5564 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
5565 // getShuffleCost. If not then cost it using the worst case as the number
5566 // of element moves into a new vector.
5567 InstructionCost NCost =
5568 NumSources <= 2
5569 ? getShuffleCost(Kind: NumSources <= 1 ? TTI::SK_PermuteSingleSrc
5570 : TTI::SK_PermuteTwoSrc,
5571 DstTy: NTp, SrcTy: NTp, Mask: NMask, CostKind, Index: 0, SubTp: nullptr, Args,
5572 CxtI)
5573 : LTNumElts;
5574 Result.first->second = NCost;
5575 Cost += NCost;
5576 }
5577 return Cost;
5578 }
5579
5580 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp);
5581 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
5582 // A subvector extract can be implemented with an ext (or trivial extract, if
5583 // from lane 0). This currently only handles low or high extracts to prevent
5584 // SLP vectorizer regressions.
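// For example (illustrative), extracting the high <2 x i32> half of a
// <4 x i32> vector needs a single ext (or equivalent lane move), while the
// low half is a plain subregister read and is free.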
5585 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
5586 if (LT.second.is128BitVector() &&
5587 cast<FixedVectorType>(Val: SubTp)->getNumElements() ==
5588 LT.second.getVectorNumElements() / 2) {
5589 if (Index == 0)
5590 return 0;
5591 if (Index == (int)LT.second.getVectorNumElements() / 2)
5592 return 1;
5593 }
5594 Kind = TTI::SK_PermuteSingleSrc;
5595 }
5596 // FIXME: This was added to keep the costs equal when adding DstTys. Update
5597 // the code to handle length-changing shuffles.
5598 if (Kind == TTI::SK_InsertSubvector) {
5599 LT = getTypeLegalizationCost(Ty: DstTy);
5600 SrcTy = DstTy;
5601 }
5602
5603 // Segmented shuffle matching.
5604 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(Val: SrcTy) &&
5605 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
5606 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
5607 RHS: AArch64::SVEBitsPerBlock)) {
5608
5609 FixedVectorType *VTy = cast<FixedVectorType>(Val: SrcTy);
5610 unsigned Segments =
5611 VTy->getPrimitiveSizeInBits() / AArch64::SVEBitsPerBlock;
5612 unsigned SegmentElts = VTy->getNumElements() / Segments;
5613
5614 // dupq zd.t, zn.t[idx]
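// (Illustrative: for a fixed <8 x i32>, i.e. two 128-bit segments of four
// elements, a mask such as <1,1,1,1,5,5,5,5> broadcasts lane 1 within each
// segment and may be selectable as a single dupq.)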
5615 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
5616 ST->isSVEorStreamingSVEAvailable() &&
5617 isDUPQMask(Mask, Segments, SegmentSize: SegmentElts))
5618 return LT.first;
5619
5620 // mov zd.q, vn
5621 if (ST->isSVEorStreamingSVEAvailable() &&
5622 isDUPFirstSegmentMask(Mask, Segments, SegmentSize: SegmentElts))
5623 return LT.first;
5624 }
5625
5626 // Check for broadcast loads, which are supported by the LD1R instruction.
5627 // In terms of code-size, the shuffle vector is free when a load + dup get
5628 // folded into a LD1R. That's what we check and return here. For performance
5629 // and reciprocal throughput, a LD1R is not completely free. In this case, we
5630 // return the cost for the broadcast below (i.e. 1 for most/all types), so
5631 // that we model the load + dup sequence slightly higher because LD1R is a
5632 // high latency instruction.
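// For example (illustrative), a scalar load splatted to <4 x i32> can fold
// into a single "ld1r { v0.4s }, [x0]", which is why the shuffle is free for
// code size here.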
5633 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
5634 bool IsLoad = !Args.empty() && isa<LoadInst>(Val: Args[0]);
5635 if (IsLoad && LT.second.isVector() &&
5636 isLegalBroadcastLoad(ElementTy: SrcTy->getElementType(),
5637 NumElements: LT.second.getVectorElementCount()))
5638 return 0;
5639 }
5640
5641 // If we have 4 elements for the shuffle and a Mask, get the cost straight
5642 // from the perfect shuffle tables.
5643 if (Mask.size() == 4 &&
5644 SrcTy->getElementCount() == ElementCount::getFixed(MinVal: 4) &&
5645 (SrcTy->getScalarSizeInBits() == 16 ||
5646 SrcTy->getScalarSizeInBits() == 32) &&
5647 all_of(Range&: Mask, P: [](int E) { return E < 8; }))
5648 return getPerfectShuffleCost(M: Mask);
5649
5650 // Check for identity masks, which we can treat as free.
5651 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
5652 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
5653 all_of(Range: enumerate(First&: Mask), P: [](const auto &M) {
5654 return M.value() < 0 || M.value() == (int)M.index();
5655 }))
5656 return 0;
5657
5658 // Check for other shuffles that are not SK_ kinds but we have native
5659 // instructions for, for example ZIP and UZP.
5660 unsigned Unused;
5661 if (LT.second.isFixedLengthVector() &&
5662 LT.second.getVectorNumElements() == Mask.size() &&
5663 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
5664 (isZIPMask(M: Mask, NumElts: LT.second.getVectorNumElements(), WhichResultOut&: Unused) ||
5665 isUZPMask(M: Mask, NumElts: LT.second.getVectorNumElements(), WhichResultOut&: Unused) ||
5666 isREVMask(M: Mask, EltSize: LT.second.getScalarSizeInBits(),
5667 NumElts: LT.second.getVectorNumElements(), BlockSize: 16) ||
5668 isREVMask(M: Mask, EltSize: LT.second.getScalarSizeInBits(),
5669 NumElts: LT.second.getVectorNumElements(), BlockSize: 32) ||
5670 isREVMask(M: Mask, EltSize: LT.second.getScalarSizeInBits(),
5671 NumElts: LT.second.getVectorNumElements(), BlockSize: 64) ||
5672 // Check for non-zero lane splats
5673 all_of(Range: drop_begin(RangeOrContainer&: Mask),
5674 P: [&Mask](int M) { return M < 0 || M == Mask[0]; })))
5675 return 1;
5676
5677 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
5678 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
5679 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
5680 static const CostTblEntry ShuffleTbl[] = {
5681 // Broadcast shuffle kinds can be performed with 'dup'.
5682 {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i8, .Cost: 1},
5683 {.ISD: TTI::SK_Broadcast, .Type: MVT::v16i8, .Cost: 1},
5684 {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i16, .Cost: 1},
5685 {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i16, .Cost: 1},
5686 {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i32, .Cost: 1},
5687 {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i32, .Cost: 1},
5688 {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i64, .Cost: 1},
5689 {.ISD: TTI::SK_Broadcast, .Type: MVT::v4f16, .Cost: 1},
5690 {.ISD: TTI::SK_Broadcast, .Type: MVT::v8f16, .Cost: 1},
5691 {.ISD: TTI::SK_Broadcast, .Type: MVT::v4bf16, .Cost: 1},
5692 {.ISD: TTI::SK_Broadcast, .Type: MVT::v8bf16, .Cost: 1},
5693 {.ISD: TTI::SK_Broadcast, .Type: MVT::v2f32, .Cost: 1},
5694 {.ISD: TTI::SK_Broadcast, .Type: MVT::v4f32, .Cost: 1},
5695 {.ISD: TTI::SK_Broadcast, .Type: MVT::v2f64, .Cost: 1},
5696 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
5697 // 'zip1/zip2' instructions.
5698 {.ISD: TTI::SK_Transpose, .Type: MVT::v8i8, .Cost: 1},
5699 {.ISD: TTI::SK_Transpose, .Type: MVT::v16i8, .Cost: 1},
5700 {.ISD: TTI::SK_Transpose, .Type: MVT::v4i16, .Cost: 1},
5701 {.ISD: TTI::SK_Transpose, .Type: MVT::v8i16, .Cost: 1},
5702 {.ISD: TTI::SK_Transpose, .Type: MVT::v2i32, .Cost: 1},
5703 {.ISD: TTI::SK_Transpose, .Type: MVT::v4i32, .Cost: 1},
5704 {.ISD: TTI::SK_Transpose, .Type: MVT::v2i64, .Cost: 1},
5705 {.ISD: TTI::SK_Transpose, .Type: MVT::v4f16, .Cost: 1},
5706 {.ISD: TTI::SK_Transpose, .Type: MVT::v8f16, .Cost: 1},
5707 {.ISD: TTI::SK_Transpose, .Type: MVT::v4bf16, .Cost: 1},
5708 {.ISD: TTI::SK_Transpose, .Type: MVT::v8bf16, .Cost: 1},
5709 {.ISD: TTI::SK_Transpose, .Type: MVT::v2f32, .Cost: 1},
5710 {.ISD: TTI::SK_Transpose, .Type: MVT::v4f32, .Cost: 1},
5711 {.ISD: TTI::SK_Transpose, .Type: MVT::v2f64, .Cost: 1},
5712 // Select shuffle kinds.
5713 // TODO: handle vXi8/vXi16.
5714 {.ISD: TTI::SK_Select, .Type: MVT::v2i32, .Cost: 1}, // mov.
5715 {.ISD: TTI::SK_Select, .Type: MVT::v4i32, .Cost: 2}, // rev+trn (or similar).
5716 {.ISD: TTI::SK_Select, .Type: MVT::v2i64, .Cost: 1}, // mov.
5717 {.ISD: TTI::SK_Select, .Type: MVT::v2f32, .Cost: 1}, // mov.
5718 {.ISD: TTI::SK_Select, .Type: MVT::v4f32, .Cost: 2}, // rev+trn (or similar).
5719 {.ISD: TTI::SK_Select, .Type: MVT::v2f64, .Cost: 1}, // mov.
5720 // PermuteSingleSrc shuffle kinds.
5721 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i32, .Cost: 1}, // mov.
5722 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i32, .Cost: 3}, // perfectshuffle worst case.
5723 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i64, .Cost: 1}, // mov.
5724 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f32, .Cost: 1}, // mov.
5725 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f32, .Cost: 3}, // perfectshuffle worst case.
5726 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f64, .Cost: 1}, // mov.
5727 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i16, .Cost: 3}, // perfectshuffle worst case.
5728 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f16, .Cost: 3}, // perfectshuffle worst case.
5729 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4bf16, .Cost: 3}, // same
5730 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i16, .Cost: 8}, // constpool + load + tbl
5731 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f16, .Cost: 8}, // constpool + load + tbl
5732 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8bf16, .Cost: 8}, // constpool + load + tbl
5733 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i8, .Cost: 8}, // constpool + load + tbl
5734 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i8, .Cost: 8}, // constpool + load + tbl
5735 // Reverse can be lowered with `rev`.
5736 {.ISD: TTI::SK_Reverse, .Type: MVT::v2i32, .Cost: 1}, // REV64
5737 {.ISD: TTI::SK_Reverse, .Type: MVT::v4i32, .Cost: 2}, // REV64; EXT
5738 {.ISD: TTI::SK_Reverse, .Type: MVT::v2i64, .Cost: 1}, // EXT
5739 {.ISD: TTI::SK_Reverse, .Type: MVT::v2f32, .Cost: 1}, // REV64
5740 {.ISD: TTI::SK_Reverse, .Type: MVT::v4f32, .Cost: 2}, // REV64; EXT
5741 {.ISD: TTI::SK_Reverse, .Type: MVT::v2f64, .Cost: 1}, // EXT
5742 {.ISD: TTI::SK_Reverse, .Type: MVT::v8f16, .Cost: 2}, // REV64; EXT
5743 {.ISD: TTI::SK_Reverse, .Type: MVT::v8bf16, .Cost: 2}, // REV64; EXT
5744 {.ISD: TTI::SK_Reverse, .Type: MVT::v8i16, .Cost: 2}, // REV64; EXT
5745 {.ISD: TTI::SK_Reverse, .Type: MVT::v16i8, .Cost: 2}, // REV64; EXT
5746 {.ISD: TTI::SK_Reverse, .Type: MVT::v4f16, .Cost: 1}, // REV64
5747 {.ISD: TTI::SK_Reverse, .Type: MVT::v4bf16, .Cost: 1}, // REV64
5748 {.ISD: TTI::SK_Reverse, .Type: MVT::v4i16, .Cost: 1}, // REV64
5749 {.ISD: TTI::SK_Reverse, .Type: MVT::v8i8, .Cost: 1}, // REV64
5750 // Splice can all be lowered as `ext`.
5751 {.ISD: TTI::SK_Splice, .Type: MVT::v2i32, .Cost: 1},
5752 {.ISD: TTI::SK_Splice, .Type: MVT::v4i32, .Cost: 1},
5753 {.ISD: TTI::SK_Splice, .Type: MVT::v2i64, .Cost: 1},
5754 {.ISD: TTI::SK_Splice, .Type: MVT::v2f32, .Cost: 1},
5755 {.ISD: TTI::SK_Splice, .Type: MVT::v4f32, .Cost: 1},
5756 {.ISD: TTI::SK_Splice, .Type: MVT::v2f64, .Cost: 1},
5757 {.ISD: TTI::SK_Splice, .Type: MVT::v8f16, .Cost: 1},
5758 {.ISD: TTI::SK_Splice, .Type: MVT::v8bf16, .Cost: 1},
5759 {.ISD: TTI::SK_Splice, .Type: MVT::v8i16, .Cost: 1},
5760 {.ISD: TTI::SK_Splice, .Type: MVT::v16i8, .Cost: 1},
5761 {.ISD: TTI::SK_Splice, .Type: MVT::v4f16, .Cost: 1},
5762 {.ISD: TTI::SK_Splice, .Type: MVT::v4bf16, .Cost: 1},
5763 {.ISD: TTI::SK_Splice, .Type: MVT::v4i16, .Cost: 1},
5764 {.ISD: TTI::SK_Splice, .Type: MVT::v8i8, .Cost: 1},
5765 // Broadcast shuffle kinds for scalable vectors
5766 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv16i8, .Cost: 1},
5767 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8i16, .Cost: 1},
5768 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4i32, .Cost: 1},
5769 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2i64, .Cost: 1},
5770 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f16, .Cost: 1},
5771 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4f16, .Cost: 1},
5772 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8f16, .Cost: 1},
5773 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2bf16, .Cost: 1},
5774 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4bf16, .Cost: 1},
5775 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8bf16, .Cost: 1},
5776 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f32, .Cost: 1},
5777 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4f32, .Cost: 1},
5778 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f64, .Cost: 1},
5779 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv16i1, .Cost: 1},
5780 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8i1, .Cost: 1},
5781 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4i1, .Cost: 1},
5782 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2i1, .Cost: 1},
5783 // Handle the cases for vector.reverse with scalable vectors
5784 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv16i8, .Cost: 1},
5785 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8i16, .Cost: 1},
5786 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4i32, .Cost: 1},
5787 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2i64, .Cost: 1},
5788 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f16, .Cost: 1},
5789 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4f16, .Cost: 1},
5790 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8f16, .Cost: 1},
5791 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2bf16, .Cost: 1},
5792 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4bf16, .Cost: 1},
5793 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8bf16, .Cost: 1},
5794 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f32, .Cost: 1},
5795 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4f32, .Cost: 1},
5796 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f64, .Cost: 1},
5797 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv16i1, .Cost: 1},
5798 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8i1, .Cost: 1},
5799 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4i1, .Cost: 1},
5800 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2i1, .Cost: 1},
5801 };
5802 if (const auto *Entry = CostTableLookup(Table: ShuffleTbl, ISD: Kind, Ty: LT.second))
5803 return LT.first * Entry->Cost;
5804 }
5805
5806 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Val: SrcTy))
5807 return getSpliceCost(Tp: SrcTy, Index, CostKind);
5808
5809 // Inserting a subvector can often be done with either a D, S or H register
5810 // move, so long as the inserted vector is "aligned".
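// For example (illustrative), inserting a <2 x i32> subvector at index 0 or 2
// of a <4 x i32> vector is a single 64-bit register move, so we only return
// SubLT.first below.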
5811 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
5812 LT.second.getSizeInBits() <= 128 && SubTp) {
5813 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(Ty: SubTp);
5814 if (SubLT.second.isVector()) {
5815 int NumElts = LT.second.getVectorNumElements();
5816 int NumSubElts = SubLT.second.getVectorNumElements();
5817 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
5818 return SubLT.first;
5819 }
5820 }
5821
5822 // Restore optimal kind.
5823 if (IsExtractSubvector)
5824 Kind = TTI::SK_ExtractSubvector;
5825 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
5826 Args, CxtI);
5827}
5828
5829static bool containsDecreasingPointers(Loop *TheLoop,
5830 PredicatedScalarEvolution *PSE) {
5831 const auto &Strides = DenseMap<Value *, const SCEV *>();
5832 for (BasicBlock *BB : TheLoop->blocks()) {
5833 // Scan the instructions in the block and look for addresses that are
5834 // consecutive and decreasing.
5835 for (Instruction &I : *BB) {
5836 if (isa<LoadInst>(Val: &I) || isa<StoreInst>(Val: &I)) {
5837 Value *Ptr = getLoadStorePointerOperand(V: &I);
5838 Type *AccessTy = getLoadStoreType(I: &I);
5839 if (getPtrStride(PSE&: *PSE, AccessTy, Ptr, Lp: TheLoop, StridesMap: Strides, /*Assume=*/true,
5840 /*ShouldCheckWrap=*/false)
5841 .value_or(u: 0) < 0)
5842 return true;
5843 }
5844 }
5845 }
5846 return false;
5847}
5848
5849bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost() const {
5850 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
5851 return SVEPreferFixedOverScalableIfEqualCost;
5852 return ST->useFixedOverScalableIfEqualCost();
5853}
5854
5855unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
5856 return ST->getEpilogueVectorizationMinVF();
5857}
5858
5859bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
5860 if (!ST->hasSVE())
5861 return false;
5862
5863 // We don't currently support vectorisation with interleaving for SVE - with
5864 // such loops we're better off not using tail-folding. This gives us a chance
5865 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
5866 if (TFI->IAI->hasGroups())
5867 return false;
5868
5869 TailFoldingOpts Required = TailFoldingOpts::Disabled;
5870 if (TFI->LVL->getReductionVars().size())
5871 Required |= TailFoldingOpts::Reductions;
5872 if (TFI->LVL->getFixedOrderRecurrences().size())
5873 Required |= TailFoldingOpts::Recurrences;
5874
5875 // We call this to discover whether any load/store pointers in the loop have
5876 // negative strides. This will require extra work to reverse the loop
5877 // predicate, which may be expensive.
5878 if (containsDecreasingPointers(TheLoop: TFI->LVL->getLoop(),
5879 PSE: TFI->LVL->getPredicatedScalarEvolution()))
5880 Required |= TailFoldingOpts::Reverse;
5881 if (Required == TailFoldingOpts::Disabled)
5882 Required |= TailFoldingOpts::Simple;
5883
5884 if (!TailFoldingOptionLoc.satisfies(DefaultBits: ST->getSVETailFoldingDefaultOpts(),
5885 Required))
5886 return false;
5887
5888 // Don't tail-fold for tight loops where we would be better off interleaving
5889 // with an unpredicated loop.
5890 unsigned NumInsns = 0;
5891 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
5892 NumInsns += BB->sizeWithoutDebug();
5893 }
5894
5895 // We expect 4 of these to be an IV PHI, an IV add, an IV compare and a branch.
5896 return NumInsns >= SVETailFoldInsnThreshold;
5897}
5898
5899InstructionCost
5900AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
5901 StackOffset BaseOffset, bool HasBaseReg,
5902 int64_t Scale, unsigned AddrSpace) const {
5903 // Scaling factors are not free at all.
5904 // Operands                     | Rt Latency
5905 // -------------------------------------------
5906 // Rt, [Xn, Xm]                 | 4
5907 // -------------------------------------------
5908 // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
5909 // Rt, [Xn, Wm, <extend> #imm]  |
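// For example (illustrative), "ldr w0, [x1, x2, lsl #2]" uses Scale == 4 and
// is charged a cost of 1 below, whereas the unscaled "ldr w0, [x1, x2]"
// (Scale == 1) is treated as free.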
5910 TargetLoweringBase::AddrMode AM;
5911 AM.BaseGV = BaseGV;
5912 AM.BaseOffs = BaseOffset.getFixed();
5913 AM.HasBaseReg = HasBaseReg;
5914 AM.Scale = Scale;
5915 AM.ScalableOffset = BaseOffset.getScalable();
5916 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AS: AddrSpace))
5917 // Scale represents reg2 * scale, so charge a cost of 1 when the scale is
5918 // neither 0 nor 1.
5919 return AM.Scale != 0 && AM.Scale != 1;
5920 return InstructionCost::getInvalid();
5921}
5922
5923bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(
5924 const Instruction *I) const {
5925 if (EnableOrLikeSelectOpt) {
5926 // For the binary operators (e.g. or) we need to be more careful than for
5927 // selects; here we only transform them if they are already at a natural
5928 // break point in the code, i.e. the end of a block with an unconditional
5929 // terminator.
5930 if (I->getOpcode() == Instruction::Or &&
5931 isa<BranchInst>(Val: I->getNextNode()) &&
5932 cast<BranchInst>(Val: I->getNextNode())->isUnconditional())
5933 return true;
5934
5935 if (I->getOpcode() == Instruction::Add ||
5936 I->getOpcode() == Instruction::Sub)
5937 return true;
5938 }
5939 return BaseT::shouldTreatInstructionLikeSelect(I);
5940}
5941
5942bool AArch64TTIImpl::isLSRCostLess(
5943 const TargetTransformInfo::LSRCost &C1,
5944 const TargetTransformInfo::LSRCost &C2) const {
5945 // What is AArch64-specific here is adding the number of instructions to the
5946 // comparison (though not as the first consideration, as some targets do),
5947 // along with changing the priority of the base additions.
5948 // TODO: Maybe a more nuanced tradeoff between instruction count
5949 // and number of registers? To be investigated at a later date.
5950 if (EnableLSRCostOpt)
5951 return std::tie(args: C1.NumRegs, args: C1.Insns, args: C1.NumBaseAdds, args: C1.AddRecCost,
5952 args: C1.NumIVMuls, args: C1.ScaleCost, args: C1.ImmCost, args: C1.SetupCost) <
5953 std::tie(args: C2.NumRegs, args: C2.Insns, args: C2.NumBaseAdds, args: C2.AddRecCost,
5954 args: C2.NumIVMuls, args: C2.ScaleCost, args: C2.ImmCost, args: C2.SetupCost);
5955
5956 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
5957}
5958
5959static bool isSplatShuffle(Value *V) {
5960 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Val: V))
5961 return all_equal(Range: Shuf->getShuffleMask());
5962 return false;
5963}
5964
5965/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
5966/// or upper half of the vector elements.
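/// For example (illustrative):
///   %s1 = shufflevector <4 x i16> %a, <4 x i16> poison, <2 x i32> <i32 2, i32 3>
///   %s2 = shufflevector <4 x i16> %b, <4 x i16> poison, <2 x i32> <i32 2, i32 3>
/// both extract the upper half of their input, so the pair is accepted.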
5967static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
5968 bool AllowSplat = false) {
5969 // Scalable types can't be extract shuffle vectors.
5970 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
5971 return false;
5972
5973 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
5974 auto *FullTy = FullV->getType();
5975 auto *HalfTy = HalfV->getType();
5976 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
5977 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
5978 };
5979
5980 auto extractHalf = [](Value *FullV, Value *HalfV) {
5981 auto *FullVT = cast<FixedVectorType>(Val: FullV->getType());
5982 auto *HalfVT = cast<FixedVectorType>(Val: HalfV->getType());
5983 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
5984 };
5985
5986 ArrayRef<int> M1, M2;
5987 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
5988 if (!match(V: Op1, P: m_Shuffle(v1: m_Value(V&: S1Op1), v2: m_Undef(), mask: m_Mask(M1))) ||
5989 !match(V: Op2, P: m_Shuffle(v1: m_Value(V&: S2Op1), v2: m_Undef(), mask: m_Mask(M2))))
5990 return false;
5991
5992 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
5993 // it is not checked as an extract below.
5994 if (AllowSplat && isSplatShuffle(V: Op1))
5995 S1Op1 = nullptr;
5996 if (AllowSplat && isSplatShuffle(V: Op2))
5997 S2Op1 = nullptr;
5998
5999 // Check that the operands are half as wide as the result and we extract
6000 // half of the elements of the input vectors.
6001 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6002 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6003 return false;
6004
6005 // Check the mask extracts either the lower or upper half of vector
6006 // elements.
6007 int M1Start = 0;
6008 int M2Start = 0;
6009 int NumElements = cast<FixedVectorType>(Val: Op1->getType())->getNumElements() * 2;
6010 if ((S1Op1 &&
6011 !ShuffleVectorInst::isExtractSubvectorMask(Mask: M1, NumSrcElts: NumElements, Index&: M1Start)) ||
6012 (S2Op1 &&
6013 !ShuffleVectorInst::isExtractSubvectorMask(Mask: M2, NumSrcElts: NumElements, Index&: M2Start)))
6014 return false;
6015
6016 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6017 (M2Start != 0 && M2Start != (NumElements / 2)))
6018 return false;
6019 if (S1Op1 && S2Op1 && M1Start != M2Start)
6020 return false;
6021
6022 return true;
6023}
6024
6025/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6026/// of the vector elements.
6027static bool areExtractExts(Value *Ext1, Value *Ext2) {
6028 auto areExtDoubled = [](Instruction *Ext) {
6029 return Ext->getType()->getScalarSizeInBits() ==
6030 2 * Ext->getOperand(i: 0)->getType()->getScalarSizeInBits();
6031 };
6032
6033 if (!match(V: Ext1, P: m_ZExtOrSExt(Op: m_Value())) ||
6034 !match(V: Ext2, P: m_ZExtOrSExt(Op: m_Value())) ||
6035 !areExtDoubled(cast<Instruction>(Val: Ext1)) ||
6036 !areExtDoubled(cast<Instruction>(Val: Ext2)))
6037 return false;
6038
6039 return true;
6040}
6041
6042/// Check if Op could be used with vmull_high_p64 intrinsic.
6043static bool isOperandOfVmullHighP64(Value *Op) {
6044 Value *VectorOperand = nullptr;
6045 ConstantInt *ElementIndex = nullptr;
6046 return match(V: Op, P: m_ExtractElt(Val: m_Value(V&: VectorOperand),
6047 Idx: m_ConstantInt(CI&: ElementIndex))) &&
6048 ElementIndex->getValue() == 1 &&
6049 isa<FixedVectorType>(Val: VectorOperand->getType()) &&
6050 cast<FixedVectorType>(Val: VectorOperand->getType())->getNumElements() == 2;
6051}
6052
6053/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6054static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6055 return isOperandOfVmullHighP64(Op: Op1) && isOperandOfVmullHighP64(Op: Op2);
6056}
6057
6058static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
6059 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6060 auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptrs);
6061 if (!GEP || GEP->getNumOperands() != 2)
6062 return false;
6063
6064 Value *Base = GEP->getOperand(i_nocapture: 0);
6065 Value *Offsets = GEP->getOperand(i_nocapture: 1);
6066
6067 // We only care about scalar_base+vector_offsets.
6068 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6069 return false;
6070
6071 // Sink extends that would allow us to use 32-bit offset vectors.
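// For example (illustrative), a masked gather whose pointers are
//   getelementptr float, ptr %base, <vscale x 4 x i64> zext(<vscale x 4 x i32> %idx)
// can use a 32-bit unpacked-offset addressing form once the zext is sunk next
// to the GEP.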
6072 if (isa<SExtInst>(Val: Offsets) || isa<ZExtInst>(Val: Offsets)) {
6073 auto *OffsetsInst = cast<Instruction>(Val: Offsets);
6074 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6075 OffsetsInst->getOperand(i: 0)->getType()->getScalarSizeInBits() <= 32)
6076 Ops.push_back(Elt: &GEP->getOperandUse(i: 1));
6077 }
6078
6079 // Sink the GEP.
6080 return true;
6081}
6082
6083/// We want to sink the following cases:
6084/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6085/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
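/// For example (illustrative):
///   %vs  = call i64 @llvm.vscale.i64()
///   %off = shl i64 %vs, 4
///   %gep = getelementptr i8, ptr %base, i64 %off
/// Sinking %off (and %vs) next to the gep lets instruction selection fold the
/// address into a vscale-scaled addressing mode.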
6086static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
6087 if (match(V: Op, P: m_VScale()))
6088 return true;
6089 if (match(V: Op, P: m_Shl(L: m_VScale(), R: m_ConstantInt())) ||
6090 match(V: Op, P: m_Mul(L: m_VScale(), R: m_ConstantInt()))) {
6091 Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: 0));
6092 return true;
6093 }
6094 if (match(V: Op, P: m_Shl(L: m_ZExt(Op: m_VScale()), R: m_ConstantInt())) ||
6095 match(V: Op, P: m_Mul(L: m_ZExt(Op: m_VScale()), R: m_ConstantInt()))) {
6096 Value *ZExtOp = cast<Instruction>(Val: Op)->getOperand(i: 0);
6097 Ops.push_back(Elt: &cast<Instruction>(Val: ZExtOp)->getOperandUse(i: 0));
6098 Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: 0));
6099 return true;
6100 }
6101 return false;
6102}
6103
6104/// Check if sinking \p I's operands to I's basic block is profitable, because
6105/// the operands can be folded into a target instruction, e.g.
6106/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
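/// For example (illustrative), when both operands of an
/// @llvm.aarch64.neon.umull call are shufflevectors extracting the high
/// halves of their inputs, sinking those shuffles next to the call lets
/// instruction selection form a single umull2.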
6107bool AArch64TTIImpl::isProfitableToSinkOperands(
6108 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6109 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: I)) {
6110 switch (II->getIntrinsicID()) {
6111 case Intrinsic::aarch64_neon_smull:
6112 case Intrinsic::aarch64_neon_umull:
6113 if (areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1),
6114 /*AllowSplat=*/true)) {
6115 Ops.push_back(Elt: &II->getOperandUse(i: 0));
6116 Ops.push_back(Elt: &II->getOperandUse(i: 1));
6117 return true;
6118 }
6119 [[fallthrough]];
6120
6121 case Intrinsic::fma:
6122 case Intrinsic::fmuladd:
6123 if (isa<VectorType>(Val: I->getType()) &&
6124 cast<VectorType>(Val: I->getType())->getElementType()->isHalfTy() &&
6125 !ST->hasFullFP16())
6126 return false;
6127 [[fallthrough]];
6128 case Intrinsic::aarch64_neon_sqdmull:
6129 case Intrinsic::aarch64_neon_sqdmulh:
6130 case Intrinsic::aarch64_neon_sqrdmulh:
6131 // Sink splats for index lane variants
6132 if (isSplatShuffle(V: II->getOperand(i_nocapture: 0)))
6133 Ops.push_back(Elt: &II->getOperandUse(i: 0));
6134 if (isSplatShuffle(V: II->getOperand(i_nocapture: 1)))
6135 Ops.push_back(Elt: &II->getOperandUse(i: 1));
6136 return !Ops.empty();
6137 case Intrinsic::aarch64_neon_fmlal:
6138 case Intrinsic::aarch64_neon_fmlal2:
6139 case Intrinsic::aarch64_neon_fmlsl:
6140 case Intrinsic::aarch64_neon_fmlsl2:
6141 // Sink splats for index lane variants
6142 if (isSplatShuffle(V: II->getOperand(i_nocapture: 1)))
6143 Ops.push_back(Elt: &II->getOperandUse(i: 1));
6144 if (isSplatShuffle(V: II->getOperand(i_nocapture: 2)))
6145 Ops.push_back(Elt: &II->getOperandUse(i: 2));
6146 return !Ops.empty();
6147 case Intrinsic::aarch64_sve_ptest_first:
6148 case Intrinsic::aarch64_sve_ptest_last:
6149 if (auto *IIOp = dyn_cast<IntrinsicInst>(Val: II->getOperand(i_nocapture: 0)))
6150 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6151 Ops.push_back(Elt: &II->getOperandUse(i: 0));
6152 return !Ops.empty();
6153 case Intrinsic::aarch64_sme_write_horiz:
6154 case Intrinsic::aarch64_sme_write_vert:
6155 case Intrinsic::aarch64_sme_writeq_horiz:
6156 case Intrinsic::aarch64_sme_writeq_vert: {
6157 auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 1));
6158 if (!Idx || Idx->getOpcode() != Instruction::Add)
6159 return false;
6160 Ops.push_back(Elt: &II->getOperandUse(i: 1));
6161 return true;
6162 }
6163 case Intrinsic::aarch64_sme_read_horiz:
6164 case Intrinsic::aarch64_sme_read_vert:
6165 case Intrinsic::aarch64_sme_readq_horiz:
6166 case Intrinsic::aarch64_sme_readq_vert:
6167 case Intrinsic::aarch64_sme_ld1b_vert:
6168 case Intrinsic::aarch64_sme_ld1h_vert:
6169 case Intrinsic::aarch64_sme_ld1w_vert:
6170 case Intrinsic::aarch64_sme_ld1d_vert:
6171 case Intrinsic::aarch64_sme_ld1q_vert:
6172 case Intrinsic::aarch64_sme_st1b_vert:
6173 case Intrinsic::aarch64_sme_st1h_vert:
6174 case Intrinsic::aarch64_sme_st1w_vert:
6175 case Intrinsic::aarch64_sme_st1d_vert:
6176 case Intrinsic::aarch64_sme_st1q_vert:
6177 case Intrinsic::aarch64_sme_ld1b_horiz:
6178 case Intrinsic::aarch64_sme_ld1h_horiz:
6179 case Intrinsic::aarch64_sme_ld1w_horiz:
6180 case Intrinsic::aarch64_sme_ld1d_horiz:
6181 case Intrinsic::aarch64_sme_ld1q_horiz:
6182 case Intrinsic::aarch64_sme_st1b_horiz:
6183 case Intrinsic::aarch64_sme_st1h_horiz:
6184 case Intrinsic::aarch64_sme_st1w_horiz:
6185 case Intrinsic::aarch64_sme_st1d_horiz:
6186 case Intrinsic::aarch64_sme_st1q_horiz: {
6187 auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 3));
6188 if (!Idx || Idx->getOpcode() != Instruction::Add)
6189 return false;
6190 Ops.push_back(Elt: &II->getOperandUse(i: 3));
6191 return true;
6192 }
6193 case Intrinsic::aarch64_neon_pmull:
6194 if (!areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1)))
6195 return false;
6196 Ops.push_back(Elt: &II->getOperandUse(i: 0));
6197 Ops.push_back(Elt: &II->getOperandUse(i: 1));
6198 return true;
6199 case Intrinsic::aarch64_neon_pmull64:
6200 if (!areOperandsOfVmullHighP64(Op1: II->getArgOperand(i: 0),
6201 Op2: II->getArgOperand(i: 1)))
6202 return false;
6203 Ops.push_back(Elt: &II->getArgOperandUse(i: 0));
6204 Ops.push_back(Elt: &II->getArgOperandUse(i: 1));
6205 return true;
6206 case Intrinsic::masked_gather:
6207 if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 0), Ops))
6208 return false;
6209 Ops.push_back(Elt: &II->getArgOperandUse(i: 0));
6210 return true;
6211 case Intrinsic::masked_scatter:
6212 if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 1), Ops))
6213 return false;
6214 Ops.push_back(Elt: &II->getArgOperandUse(i: 1));
6215 return true;
6216 default:
6217 return false;
6218 }
6219 }
6220
6221 auto ShouldSinkCondition = [](Value *Cond) -> bool {
6222 auto *II = dyn_cast<IntrinsicInst>(Val: Cond);
6223 return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or &&
6224 isa<ScalableVectorType>(Val: II->getOperand(i_nocapture: 0)->getType());
6225 };
6226
6227 switch (I->getOpcode()) {
6228 case Instruction::GetElementPtr:
6229 case Instruction::Add:
6230 case Instruction::Sub:
6231 // Sink vscales closer to uses for better isel
6232 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6233 if (shouldSinkVScale(Op: I->getOperand(i: Op), Ops)) {
6234 Ops.push_back(Elt: &I->getOperandUse(i: Op));
6235 return true;
6236 }
6237 }
6238 break;
6239 case Instruction::Select: {
6240 if (!ShouldSinkCondition(I->getOperand(i: 0)))
6241 return false;
6242
6243 Ops.push_back(Elt: &I->getOperandUse(i: 0));
6244 return true;
6245 }
6246 case Instruction::Br: {
6247 if (cast<BranchInst>(Val: I)->isUnconditional())
6248 return false;
6249
6250 if (!ShouldSinkCondition(cast<BranchInst>(Val: I)->getCondition()))
6251 return false;
6252
6253 Ops.push_back(Elt: &I->getOperandUse(i: 0));
6254 return true;
6255 }
6256 default:
6257 break;
6258 }
6259
6260 if (!I->getType()->isVectorTy())
6261 return false;
6262
6263 switch (I->getOpcode()) {
6264 case Instruction::Sub:
6265 case Instruction::Add: {
6266 if (!areExtractExts(Ext1: I->getOperand(i: 0), Ext2: I->getOperand(i: 1)))
6267 return false;
6268
6269 // If the exts' operands extract either the lower or upper elements, we
6270 // can sink them too.
6271 auto Ext1 = cast<Instruction>(Val: I->getOperand(i: 0));
6272 auto Ext2 = cast<Instruction>(Val: I->getOperand(i: 1));
6273 if (areExtractShuffleVectors(Op1: Ext1->getOperand(i: 0), Op2: Ext2->getOperand(i: 0))) {
6274 Ops.push_back(Elt: &Ext1->getOperandUse(i: 0));
6275 Ops.push_back(Elt: &Ext2->getOperandUse(i: 0));
6276 }
6277
6278 Ops.push_back(Elt: &I->getOperandUse(i: 0));
6279 Ops.push_back(Elt: &I->getOperandUse(i: 1));
6280
6281 return true;
6282 }
6283 case Instruction::Or: {
6284 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
6285 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
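// Illustrative IR shape of the pattern:
//   %nm = xor <16 x i8> %mask, splat (i8 -1)
//   %t0 = and <16 x i8> %nm, %a
//   %t1 = and <16 x i8> %mask, %b
//   %or = or <16 x i8> %t0, %t1
// Sinking the two ands and the not next to the or lets instruction selection
// form a single bsl.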
6286 if (ST->hasNEON()) {
6287 Instruction *OtherAnd, *IA, *IB;
6288 Value *MaskValue;
6289 // MainAnd refers to the And instruction that has 'Not' as one of its operands.
6290 if (match(V: I, P: m_c_Or(L: m_OneUse(SubPattern: m_Instruction(I&: OtherAnd)),
6291 R: m_OneUse(SubPattern: m_c_And(L: m_OneUse(SubPattern: m_Not(V: m_Value(V&: MaskValue))),
6292 R: m_Instruction(I&: IA)))))) {
6293 if (match(V: OtherAnd,
6294 P: m_c_And(L: m_Specific(V: MaskValue), R: m_Instruction(I&: IB)))) {
6295 Instruction *MainAnd = I->getOperand(i: 0) == OtherAnd
6296 ? cast<Instruction>(Val: I->getOperand(i: 1))
6297 : cast<Instruction>(Val: I->getOperand(i: 0));
6298
6299 // Both Ands should be in the same basic block as the Or.
6300 if (I->getParent() != MainAnd->getParent() ||
6301 I->getParent() != OtherAnd->getParent())
6302 return false;
6303
6304 // Non-mask operands of both Ands should also be in the same basic block.
6305 if (I->getParent() != IA->getParent() ||
6306 I->getParent() != IB->getParent())
6307 return false;
6308
6309 Ops.push_back(
6310 Elt: &MainAnd->getOperandUse(i: MainAnd->getOperand(i: 0) == IA ? 1 : 0));
6311 Ops.push_back(Elt: &I->getOperandUse(i: 0));
6312 Ops.push_back(Elt: &I->getOperandUse(i: 1));
6313
6314 return true;
6315 }
6316 }
6317 }
6318
6319 return false;
6320 }
6321 case Instruction::Mul: {
6322 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
6323 auto *Ty = cast<VectorType>(Val: V->getType());
6324 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6325 if (Ty->isScalableTy())
6326 return false;
6327
6328 // Indexed variants of Mul exist for i16 and i32 element types only.
6329 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
6330 };
6331
6332 int NumZExts = 0, NumSExts = 0;
6333 for (auto &Op : I->operands()) {
6334 // Make sure we are not already sinking this operand
6335 if (any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op; }))
6336 continue;
6337
6338 if (match(V: &Op, P: m_ZExtOrSExt(Op: m_Value()))) {
6339 auto *Ext = cast<Instruction>(Val&: Op);
6340 auto *ExtOp = Ext->getOperand(i: 0);
6341 if (isSplatShuffle(V: ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
6342 Ops.push_back(Elt: &Ext->getOperandUse(i: 0));
6343 Ops.push_back(Elt: &Op);
6344
6345 if (isa<SExtInst>(Val: Ext))
6346 NumSExts++;
6347 else
6348 NumZExts++;
6349
6350 continue;
6351 }
6352
6353 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Val&: Op);
6354 if (!Shuffle)
6355 continue;
6356
6357 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
6358 // operand and the s/zext can help create indexed s/umull. This is
6359 // especially useful to prevent i64 mul being scalarized.
6360 if (isSplatShuffle(V: Shuffle) &&
6361 match(V: Shuffle->getOperand(i_nocapture: 0), P: m_ZExtOrSExt(Op: m_Value()))) {
6362 Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0));
6363 Ops.push_back(Elt: &Op);
6364 if (match(V: Shuffle->getOperand(i_nocapture: 0), P: m_SExt(Op: m_Value())))
6365 NumSExts++;
6366 else
6367 NumZExts++;
6368 continue;
6369 }
6370
6371 Value *ShuffleOperand = Shuffle->getOperand(i_nocapture: 0);
6372 InsertElementInst *Insert = dyn_cast<InsertElementInst>(Val: ShuffleOperand);
6373 if (!Insert)
6374 continue;
6375
6376 Instruction *OperandInstr = dyn_cast<Instruction>(Val: Insert->getOperand(i_nocapture: 1));
6377 if (!OperandInstr)
6378 continue;
6379
6380 ConstantInt *ElementConstant =
6381 dyn_cast<ConstantInt>(Val: Insert->getOperand(i_nocapture: 2));
6382 // Check that the insertelement is inserting into element 0
6383 if (!ElementConstant || !ElementConstant->isZero())
6384 continue;
6385
6386 unsigned Opcode = OperandInstr->getOpcode();
6387 if (Opcode == Instruction::SExt)
6388 NumSExts++;
6389 else if (Opcode == Instruction::ZExt)
6390 NumZExts++;
6391 else {
6392 // If we find that the top bits are known 0, then we can sink and allow
6393 // the backend to generate a umull.
6394 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
6395 APInt UpperMask = APInt::getHighBitsSet(numBits: Bitwidth, hiBitsSet: Bitwidth / 2);
6396 if (!MaskedValueIsZero(V: OperandInstr, Mask: UpperMask, SQ: DL))
6397 continue;
6398 NumZExts++;
6399 }
6400
6401 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
6402 // the And, just to hoist it again back to the load.
6403 if (!match(V: OperandInstr, P: m_And(L: m_Load(Op: m_Value()), R: m_Value())))
6404 Ops.push_back(Elt: &Insert->getOperandUse(i: 1));
6405 Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0));
6406 Ops.push_back(Elt: &Op);
6407 }
6408
6409 // It is profitable to sink if we found two of the same type of extends.
6410 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
6411 return true;
6412
6413 // Otherwise, see if we should sink splats for indexed variants.
6414 if (!ShouldSinkSplatForIndexedVariant(I))
6415 return false;
6416
6417 Ops.clear();
6418 if (isSplatShuffle(V: I->getOperand(i: 0)))
6419 Ops.push_back(Elt: &I->getOperandUse(i: 0));
6420 if (isSplatShuffle(V: I->getOperand(i: 1)))
6421 Ops.push_back(Elt: &I->getOperandUse(i: 1));
6422
6423 return !Ops.empty();
6424 }
6425 case Instruction::FMul: {
6426 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6427 if (I->getType()->isScalableTy())
6428 return false;
6429
6430 if (cast<VectorType>(Val: I->getType())->getElementType()->isHalfTy() &&
6431 !ST->hasFullFP16())
6432 return false;
6433
6434 // Sink splats for index lane variants
6435 if (isSplatShuffle(V: I->getOperand(i: 0)))
6436 Ops.push_back(Elt: &I->getOperandUse(i: 0));
6437 if (isSplatShuffle(V: I->getOperand(i: 1)))
6438 Ops.push_back(Elt: &I->getOperandUse(i: 1));
6439 return !Ops.empty();
6440 }
6441 default:
6442 return false;
6443 }
6444 return false;
6445}
6446