1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "AArch64TargetTransformInfo.h"
10#include "AArch64ExpandImm.h"
11#include "AArch64PerfectShuffle.h"
12#include "AArch64SMEAttributes.h"
13#include "MCTargetDesc/AArch64AddressingModes.h"
14#include "llvm/ADT/DenseMap.h"
15#include "llvm/Analysis/LoopInfo.h"
16#include "llvm/Analysis/TargetTransformInfo.h"
17#include "llvm/CodeGen/BasicTTIImpl.h"
18#include "llvm/CodeGen/CostTable.h"
19#include "llvm/CodeGen/TargetLowering.h"
20#include "llvm/IR/DerivedTypes.h"
21#include "llvm/IR/IntrinsicInst.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
24#include "llvm/IR/PatternMatch.h"
25#include "llvm/Support/Debug.h"
26#include "llvm/TargetParser/AArch64TargetParser.h"
27#include "llvm/Transforms/InstCombine/InstCombiner.h"
28#include "llvm/Transforms/Utils/UnrollLoop.h"
29#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
// Hidden boolean flag "enable-falkor-hwpf-unroll-fix"; enabled by default.
// NOTE(review): judging by the name this toggles a Falkor hardware-prefetcher
// unrolling workaround -- confirm at the use site.
37static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
38 cl::init(Val: true), cl::Hidden);
39
// Hidden boolean flag "sve-prefer-fixed-over-scalable-if-equal". No explicit
// initial value is supplied here.
// NOTE(review): presumably a tie-breaker between fixed-width and scalable
// vectorisation when their costs are equal -- confirm at the use site.
40static cl::opt<bool> SVEPreferFixedOverScalableIfEqualCost(
41 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
42
// Hidden unsigned knob "sve-gather-overhead"; initial value 10.
// NOTE(review): presumably an extra cost charged for SVE gathers -- confirm.
43static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(Val: 10),
44 cl::Hidden);
45
// Hidden unsigned knob "sve-scatter-overhead"; initial value 10.
// NOTE(review): presumably an extra cost charged for SVE scatters -- confirm.
46static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
47 cl::init(Val: 10), cl::Hidden);
48
// Hidden unsigned knob "sve-tail-folding-insn-threshold"; initial value 15.
// NOTE(review): presumably an instruction-count threshold consulted when
// deciding on tail-folding -- confirm at the use site.
49static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
50 cl::init(Val: 15), cl::Hidden);
51
// Hidden unsigned knob "neon-nonconst-stride-overhead"; initial value 10.
// NOTE(review): presumably an extra cost for NEON accesses with a
// non-constant stride -- confirm at the use site.
52static cl::opt<unsigned>
53 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(Val: 10),
54 cl::Hidden);
55
// Multiplier (initial value 5) that getInlineCallPenalty() applies to the
// default call penalty when the call being costed is made directly from the
// function under consideration and requires a PSTATE.SM change.
56static cl::opt<unsigned> CallPenaltyChangeSM(
57 "call-penalty-sm-change", cl::init(Val: 5), cl::Hidden,
58 cl::desc(
59 "Penalty of calling a function that requires a change to PSTATE.SM"));
60
// Multiplier (initial value 10) that getInlineCallPenalty() applies to the
// default call penalty when the call being costed lives in a different
// function and both it and its caller's call site would need a PSTATE.SM
// change relative to the function under consideration.
61static cl::opt<unsigned> InlineCallPenaltyChangeSM(
62 "inline-call-penalty-sm-change", cl::init(Val: 10), cl::Hidden,
63 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
64
// Hidden boolean flag "enable-aarch64-or-like-select"; enabled by default.
// NOTE(review): use site not visible here -- confirm what it gates.
65static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
66 cl::init(Val: true), cl::Hidden);
67
// Hidden boolean flag "enable-aarch64-lsr-cost-opt"; enabled by default.
// NOTE(review): presumably toggles an AArch64-specific LSR cost comparison --
// confirm at the use site.
68static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69 cl::init(Val: true), cl::Hidden);
70
71// A complete guess as to a reasonable cost.
// getHistogramCost() returns this value (initial value 8), or a multiple of
// it, as the cost of a histogram update.
72static cl::opt<unsigned>
73 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(Val: 8), cl::Hidden,
74 cl::desc("The cost of a histcnt instruction"));
75
// Hidden unsigned knob; initial value 10. See the description string for its
// meaning.
76static cl::opt<unsigned> DMBLookaheadThreshold(
77 "dmb-lookahead-threshold", cl::init(Val: 10), cl::Hidden,
78 cl::desc("The number of instructions to search for a redundant dmb"));
79
// Hidden signed knob; initial value 0. See the description string for its
// meaning.
80static cl::opt<int> Aarch64ForceUnrollThreshold(
81 "aarch64-force-unroll-threshold", cl::init(Val: 0), cl::Hidden,
82 cl::desc("Threshold for forced unrolling of small loops in AArch64"));
83
84namespace {
85class TailFoldingOption {
86 // These bitfields will only ever be set to something non-zero in operator=,
87 // when setting the -sve-tail-folding option. This option should always be of
88 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
89 // InitialBits is one of (disabled|all|simple). EnableBits represents
90 // additional flags we're enabling, and DisableBits for those flags we're
91 // disabling. The default flag is tracked in the variable NeedsDefault, since
92 // at the time of setting the option we may not know what the default value
93 // for the CPU is.
94 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
95 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
96 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
97
98 // This value needs to be initialised to true in case the user does not
99 // explicitly set the -sve-tail-folding option.
100 bool NeedsDefault = true;
101
102 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
103
104 void setNeedsDefault(bool V) { NeedsDefault = V; }
105
106 void setEnableBit(TailFoldingOpts Bit) {
107 EnableBits |= Bit;
108 DisableBits &= ~Bit;
109 }
110
111 void setDisableBit(TailFoldingOpts Bit) {
112 EnableBits &= ~Bit;
113 DisableBits |= Bit;
114 }
115
116 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
117 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
118
119 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
120 "Initial bits should only include one of "
121 "(disabled|all|simple|default)");
122 Bits = NeedsDefault ? DefaultBits : InitialBits;
123 Bits |= EnableBits;
124 Bits &= ~DisableBits;
125
126 return Bits;
127 }
128
129 void reportError(std::string Opt) {
130 errs() << "invalid argument '" << Opt
131 << "' to -sve-tail-folding=; the option should be of the form\n"
132 " (disabled|all|default|simple)[+(reductions|recurrences"
133 "|reverse|noreductions|norecurrences|noreverse)]\n";
134 report_fatal_error(reason: "Unrecognised tail-folding option");
135 }
136
137public:
138
139 void operator=(const std::string &Val) {
140 // If the user explicitly sets -sve-tail-folding= then treat as an error.
141 if (Val.empty()) {
142 reportError(Opt: "");
143 return;
144 }
145
146 // Since the user is explicitly setting the option we don't automatically
147 // need the default unless they require it.
148 setNeedsDefault(false);
149
150 SmallVector<StringRef, 4> TailFoldTypes;
151 StringRef(Val).split(A&: TailFoldTypes, Separator: '+', MaxSplit: -1, KeepEmpty: false);
152
153 unsigned StartIdx = 1;
154 if (TailFoldTypes[0] == "disabled")
155 setInitialBits(TailFoldingOpts::Disabled);
156 else if (TailFoldTypes[0] == "all")
157 setInitialBits(TailFoldingOpts::All);
158 else if (TailFoldTypes[0] == "default")
159 setNeedsDefault(true);
160 else if (TailFoldTypes[0] == "simple")
161 setInitialBits(TailFoldingOpts::Simple);
162 else {
163 StartIdx = 0;
164 setInitialBits(TailFoldingOpts::Disabled);
165 }
166
167 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
168 if (TailFoldTypes[I] == "reductions")
169 setEnableBit(TailFoldingOpts::Reductions);
170 else if (TailFoldTypes[I] == "recurrences")
171 setEnableBit(TailFoldingOpts::Recurrences);
172 else if (TailFoldTypes[I] == "reverse")
173 setEnableBit(TailFoldingOpts::Reverse);
174 else if (TailFoldTypes[I] == "noreductions")
175 setDisableBit(TailFoldingOpts::Reductions);
176 else if (TailFoldTypes[I] == "norecurrences")
177 setDisableBit(TailFoldingOpts::Recurrences);
178 else if (TailFoldTypes[I] == "noreverse")
179 setDisableBit(TailFoldingOpts::Reverse);
180 else
181 reportError(Opt: Val);
182 }
183 }
184
185 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
186 return (getBits(DefaultBits) & Required) == Required;
187 }
188};
189} // namespace
190
// Storage backing the -sve-tail-folding option: SVETailFolding below is
// declared with external storage (second template argument `true`) and is
// bound to this object through cl::location.
191TailFoldingOption TailFoldingOptionLoc;
192
// User-facing declaration of -sve-tail-folding. The value is parsed as a
// std::string (cl::parser<std::string>); the description below lists the
// accepted initial modes and flags.
193static cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
194 "sve-tail-folding",
195 cl::desc(
196 "Control the use of vectorisation using tail-folding for SVE where the"
197 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
198 "\ndisabled (Initial) No loop types will vectorize using "
199 "tail-folding"
200 "\ndefault (Initial) Uses the default tail-folding settings for "
201 "the target CPU"
202 "\nall (Initial) All legal loop types will vectorize using "
203 "tail-folding"
204 "\nsimple (Initial) Use tail-folding for simple loops (not "
205 "reductions or recurrences)"
206 "\nreductions Use tail-folding for loops containing reductions"
207 "\nnoreductions Inverse of above"
208 "\nrecurrences Use tail-folding for loops containing fixed order "
209 "recurrences"
210 "\nnorecurrences Inverse of above"
211 "\nreverse Use tail-folding for loops requiring reversed "
212 "predicates"
213 "\nnoreverse Inverse of above"),
214 cl::location(L&: TailFoldingOptionLoc));
215
216// Experimental option that will only be fully functional when the
217// code-generator is changed to use SVE instead of NEON for all fixed-width
218// operations.
// Hidden; disabled by default.
219static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
220 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(Val: false), cl::Hidden);
221
222// Experimental option that will only be fully functional when the cost-model
223// and code-generator have been changed to avoid using scalable vector
224// instructions that are not legal in streaming SVE mode.
// Hidden; disabled by default.
225static cl::opt<bool> EnableScalableAutovecInStreamingMode(
226 "enable-scalable-autovec-in-streaming-mode", cl::init(Val: false), cl::Hidden);
227
228static bool isSMEABIRoutineCall(const CallInst &CI,
229 const AArch64TargetLowering &TLI) {
230 const auto *F = CI.getCalledFunction();
231 return F &&
232 SMEAttrs(F->getName(), TLI.getRuntimeLibcallsInfo()).isSMEABIRoutine();
233}
234
235/// Returns true if the function has explicit operations that can only be
236/// lowered using incompatible instructions for the selected mode. This also
237/// returns true if the function F may use or modify ZA state.
238static bool hasPossibleIncompatibleOps(const Function *F,
239 const AArch64TargetLowering &TLI) {
240 for (const BasicBlock &BB : *F) {
241 for (const Instruction &I : BB) {
242 // Be conservative for now and assume that any call to inline asm or to
243 // intrinsics could could result in non-streaming ops (e.g. calls to
244 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
245 // all native LLVM instructions can be lowered to compatible instructions.
246 if (isa<CallInst>(Val: I) && !I.isDebugOrPseudoInst() &&
247 (cast<CallInst>(Val: I).isInlineAsm() || isa<IntrinsicInst>(Val: I) ||
248 isSMEABIRoutineCall(CI: cast<CallInst>(Val: I), TLI)))
249 return true;
250 }
251 }
252 return false;
253}
254
255static void extractAttrFeatures(const Function &F, const AArch64TTIImpl *TTI,
256 SmallVectorImpl<StringRef> &Features) {
257 StringRef AttributeStr =
258 TTI->isMultiversionedFunction(F) ? "fmv-features" : "target-features";
259 StringRef FeatureStr = F.getFnAttribute(Kind: AttributeStr).getValueAsString();
260 FeatureStr.split(A&: Features, Separator: ",");
261}
262
263APInt AArch64TTIImpl::getFeatureMask(const Function &F) const {
264 SmallVector<StringRef, 8> Features;
265 extractAttrFeatures(F, TTI: this, Features);
266 return AArch64::getCpuSupportsMask(Features);
267}
268
269APInt AArch64TTIImpl::getPriorityMask(const Function &F) const {
270 SmallVector<StringRef, 8> Features;
271 extractAttrFeatures(F, TTI: this, Features);
272 return AArch64::getFMVPriority(Features);
273}
274
275bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const {
276 return F.hasFnAttribute(Kind: "fmv-features");
277}
278
279bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
280 const Function *Callee) const {
281 SMECallAttrs CallAttrs(*Caller, *Callee);
282
283 // Never inline a function explicitly marked as being streaming,
284 // into a non-streaming function. Assume it was marked as streaming
285 // for a reason.
286 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
287 CallAttrs.callee().hasStreamingInterfaceOrBody())
288 return false;
289
290 // When inlining, we should consider the body of the function, not the
291 // interface.
292 if (CallAttrs.callee().hasStreamingBody()) {
293 CallAttrs.callee().set(M: SMEAttrs::SM_Compatible, Enable: false);
294 CallAttrs.callee().set(M: SMEAttrs::SM_Enabled, Enable: true);
295 }
296
297 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
298 return false;
299
300 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
301 CallAttrs.requiresPreservingZT0() ||
302 CallAttrs.requiresPreservingAllZAState()) {
303 if (hasPossibleIncompatibleOps(F: Callee, TLI: *getTLI()))
304 return false;
305 }
306
307 return BaseT::areInlineCompatible(Caller, Callee);
308}
309
310bool AArch64TTIImpl::areTypesABICompatible(const Function *Caller,
311 const Function *Callee,
312 ArrayRef<Type *> Types) const {
313 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
314 return false;
315
316 // We need to ensure that argument promotion does not attempt to promote
317 // pointers to fixed-length vector types larger than 128 bits like
318 // <8 x float> (and pointers to aggregate types which have such fixed-length
319 // vector type members) into the values of the pointees. Such vector types
320 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
321 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
322 // types can be safely treated as 128-bit NEON types and they cannot be
323 // distinguished in IR.
324 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Range&: Types, P: [](Type *Ty) {
325 auto FVTy = dyn_cast<FixedVectorType>(Val: Ty);
326 return FVTy &&
327 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
328 }))
329 return false;
330
331 return true;
332}
333
334unsigned
335AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
336 unsigned DefaultCallPenalty) const {
337 // This function calculates a penalty for executing Call in F.
338 //
339 // There are two ways this function can be called:
340 // (1) F:
341 // call from F -> G (the call here is Call)
342 //
343 // For (1), Call.getCaller() == F, so it will always return a high cost if
344 // a streaming-mode change is required (thus promoting the need to inline the
345 // function)
346 //
347 // (2) F:
348 // call from F -> G (the call here is not Call)
349 // G:
350 // call from G -> H (the call here is Call)
351 //
352 // For (2), if after inlining the body of G into F the call to H requires a
353 // streaming-mode change, and the call to G from F would also require a
354 // streaming-mode change, then there is benefit to do the streaming-mode
355 // change only once and avoid inlining of G into F.
356
357 SMEAttrs FAttrs(*F);
358 SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
359
360 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
361 if (F == Call.getCaller()) // (1)
362 return CallPenaltyChangeSM * DefaultCallPenalty;
363 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
364 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
365 }
366
367 return DefaultCallPenalty;
368}
369
370bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
371 TargetTransformInfo::RegisterKind K) const {
372 assert(K != TargetTransformInfo::RGK_Scalar);
373
374 if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable())
375 return true;
376
377 return K == TargetTransformInfo::RGK_ScalableVector &&
378 ST->isSVEorStreamingSVEAvailable() &&
379 !ST->disableMaximizeScalableBandwidth();
380}
381
382/// Calculate the cost of materializing a 64-bit value. This helper
383/// method might only calculate a fraction of a larger immediate. Therefore it
384/// is valid to return a cost of ZERO.
385InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) const {
386 // Check if the immediate can be encoded within an instruction.
387 if (Val == 0 || AArch64_AM::isLogicalImmediate(imm: Val, regSize: 64))
388 return 0;
389
390 if (Val < 0)
391 Val = ~Val;
392
393 // Calculate how many moves we will need to materialize this constant.
394 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
395 AArch64_IMM::expandMOVImm(Imm: Val, BitSize: 64, Insn);
396 return Insn.size();
397}
398
399/// Calculate the cost of materializing the given constant.
400InstructionCost
401AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
402 TTI::TargetCostKind CostKind) const {
403 assert(Ty->isIntegerTy());
404
405 unsigned BitSize = Ty->getPrimitiveSizeInBits();
406 if (BitSize == 0)
407 return ~0U;
408
409 // Sign-extend all constants to a multiple of 64-bit.
410 APInt ImmVal = Imm;
411 if (BitSize & 0x3f)
412 ImmVal = Imm.sext(width: (BitSize + 63) & ~0x3fU);
413
414 // Split the constant into 64-bit chunks and calculate the cost for each
415 // chunk.
416 InstructionCost Cost = 0;
417 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
418 APInt Tmp = ImmVal.ashr(ShiftAmt: ShiftVal).sextOrTrunc(width: 64);
419 int64_t Val = Tmp.getSExtValue();
420 Cost += getIntImmCost(Val);
421 }
422 // We need at least one instruction to materialze the constant.
423 return std::max<InstructionCost>(a: 1, b: Cost);
424}
425
426InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
427 const APInt &Imm, Type *Ty,
428 TTI::TargetCostKind CostKind,
429 Instruction *Inst) const {
430 assert(Ty->isIntegerTy());
431
432 unsigned BitSize = Ty->getPrimitiveSizeInBits();
433 // There is no cost model for constants with a bit size of 0. Return TCC_Free
434 // here, so that constant hoisting will ignore this constant.
435 if (BitSize == 0)
436 return TTI::TCC_Free;
437
438 unsigned ImmIdx = ~0U;
439 switch (Opcode) {
440 default:
441 return TTI::TCC_Free;
442 case Instruction::GetElementPtr:
443 // Always hoist the base address of a GetElementPtr.
444 if (Idx == 0)
445 return 2 * TTI::TCC_Basic;
446 return TTI::TCC_Free;
447 case Instruction::Store:
448 ImmIdx = 0;
449 break;
450 case Instruction::Add:
451 case Instruction::Sub:
452 case Instruction::Mul:
453 case Instruction::UDiv:
454 case Instruction::SDiv:
455 case Instruction::URem:
456 case Instruction::SRem:
457 case Instruction::And:
458 case Instruction::Or:
459 case Instruction::Xor:
460 case Instruction::ICmp:
461 ImmIdx = 1;
462 break;
463 // Always return TCC_Free for the shift value of a shift instruction.
464 case Instruction::Shl:
465 case Instruction::LShr:
466 case Instruction::AShr:
467 if (Idx == 1)
468 return TTI::TCC_Free;
469 break;
470 case Instruction::Trunc:
471 case Instruction::ZExt:
472 case Instruction::SExt:
473 case Instruction::IntToPtr:
474 case Instruction::PtrToInt:
475 case Instruction::BitCast:
476 case Instruction::PHI:
477 case Instruction::Call:
478 case Instruction::Select:
479 case Instruction::Ret:
480 case Instruction::Load:
481 break;
482 }
483
484 if (Idx == ImmIdx) {
485 int NumConstants = (BitSize + 63) / 64;
486 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
487 return (Cost <= NumConstants * TTI::TCC_Basic)
488 ? static_cast<int>(TTI::TCC_Free)
489 : Cost;
490 }
491 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
492}
493
494InstructionCost
495AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
496 const APInt &Imm, Type *Ty,
497 TTI::TargetCostKind CostKind) const {
498 assert(Ty->isIntegerTy());
499
500 unsigned BitSize = Ty->getPrimitiveSizeInBits();
501 // There is no cost model for constants with a bit size of 0. Return TCC_Free
502 // here, so that constant hoisting will ignore this constant.
503 if (BitSize == 0)
504 return TTI::TCC_Free;
505
506 // Most (all?) AArch64 intrinsics do not support folding immediates into the
507 // selected instruction, so we compute the materialization cost for the
508 // immediate directly.
509 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
510 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
511
512 switch (IID) {
513 default:
514 return TTI::TCC_Free;
515 case Intrinsic::sadd_with_overflow:
516 case Intrinsic::uadd_with_overflow:
517 case Intrinsic::ssub_with_overflow:
518 case Intrinsic::usub_with_overflow:
519 case Intrinsic::smul_with_overflow:
520 case Intrinsic::umul_with_overflow:
521 if (Idx == 1) {
522 int NumConstants = (BitSize + 63) / 64;
523 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
524 return (Cost <= NumConstants * TTI::TCC_Basic)
525 ? static_cast<int>(TTI::TCC_Free)
526 : Cost;
527 }
528 break;
529 case Intrinsic::experimental_stackmap:
530 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue())))
531 return TTI::TCC_Free;
532 break;
533 case Intrinsic::experimental_patchpoint_void:
534 case Intrinsic::experimental_patchpoint:
535 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue())))
536 return TTI::TCC_Free;
537 break;
538 case Intrinsic::experimental_gc_statepoint:
539 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue())))
540 return TTI::TCC_Free;
541 break;
542 }
543 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
544}
545
546TargetTransformInfo::PopcntSupportKind
547AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
548 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
549 if (TyWidth == 32 || TyWidth == 64)
550 return TTI::PSK_FastHardware;
551 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
552 return TTI::PSK_Software;
553}
554
555InstructionCost AArch64TTIImpl::getBranchMispredictPenalty() const {
556 // MispredictPenalty is defined per-CPU in AArch64Sched*.td (e.g.,
557 // AArch64SchedNeoverseV2.td).
558 return ST->getSchedModel().MispredictPenalty;
559}
560
561static bool isUnpackedVectorVT(EVT VecVT) {
562 return VecVT.isScalableVector() &&
563 VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
564}
565
566static InstructionCost getHistogramCost(const AArch64Subtarget *ST,
567 const IntrinsicCostAttributes &ICA) {
568 // We need to know at least the number of elements in the vector of buckets
569 // and the size of each element to update.
570 if (ICA.getArgTypes().size() < 2)
571 return InstructionCost::getInvalid();
572
573 // Only interested in costing for the hardware instruction from SVE2.
574 if (!ST->hasSVE2())
575 return InstructionCost::getInvalid();
576
577 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
578 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
579 unsigned TotalHistCnts = 1;
580
581 unsigned EltSize = EltTy->getScalarSizeInBits();
582 // Only allow (up to 64b) integers or pointers
583 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
584 return InstructionCost::getInvalid();
585
586 // FIXME: We should be able to generate histcnt for fixed-length vectors
587 // using ptrue with a specific VL.
588 if (VectorType *VTy = dyn_cast<VectorType>(Val: BucketPtrsTy)) {
589 unsigned EC = VTy->getElementCount().getKnownMinValue();
590 if (!isPowerOf2_64(Value: EC) || !VTy->isScalableTy())
591 return InstructionCost::getInvalid();
592
593 // HistCnt only supports 32b and 64b element types
594 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
595
596 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
597 return InstructionCost(BaseHistCntCost);
598
599 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
600 TotalHistCnts = EC / NaturalVectorWidth;
601
602 return InstructionCost(BaseHistCntCost * TotalHistCnts);
603 }
604
605 return InstructionCost::getInvalid();
606}
607
608InstructionCost
609AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
610 TTI::TargetCostKind CostKind) const {
611 // The code-generator is currently not able to handle scalable vectors
612 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
613 // it. This change will be removed when code-generation for these types is
614 // sufficiently reliable.
615 auto *RetTy = ICA.getReturnType();
616 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: RetTy))
617 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
618 return InstructionCost::getInvalid();
619
620 switch (ICA.getID()) {
621 case Intrinsic::experimental_vector_histogram_add: {
622 InstructionCost HistCost = getHistogramCost(ST, ICA);
623 // If the cost isn't valid, we may still be able to scalarize
624 if (HistCost.isValid())
625 return HistCost;
626 break;
627 }
628 case Intrinsic::clmul: {
629 auto LT = getTypeLegalizationCost(Ty: RetTy);
630
631 // PMUL v8i8/v16i8 is always available on AArch64
632 if (ST->hasNEON()) {
633 if (LT.second == MVT::v8i8 || LT.second == MVT::v16i8)
634 return LT.first;
635
636 // Scalar i8 lowers through scalar/vector moves around PMUL.
637 if (TLI->getValueType(DL, Ty: RetTy, AllowUnknown: true) == MVT::i8) {
638 auto *VecTy =
639 FixedVectorType::get(ElementType: Type::getInt8Ty(C&: RetTy->getContext()), NumElts: 8);
640 return 1 +
641 getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, CostKind,
642 Index: -1, Op0: nullptr, Op1: nullptr) *
643 2 +
644 getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy, CostKind,
645 Index: -1, Op0: nullptr, Op1: nullptr);
646 }
647 }
648
649 if (LT.second.SimpleTy == MVT::nxv2i64)
650 if (ST->hasSVEAES() && (ST->isSVEAvailable() || ST->hasSSVE_AES()))
651 return LT.first * 3;
652
653 if (ST->hasSVE2() || ST->hasSME()) {
654 switch (LT.second.SimpleTy) {
655 case MVT::nxv16i8:
656 return LT.first;
657 case MVT::nxv8i16:
658 return LT.first * 6;
659 case MVT::nxv4i32:
660 return LT.first * 3;
661 case MVT::nxv2i64:
662 return LT.first * 8;
663 default:
664 break;
665 }
666 }
667
668 // Avoid +sve giving this cost 2 due to custom lowering: It's very slow
669 if (LT.second.SimpleTy == MVT::nxv2i64)
670 return 192;
671
672 if (ST->hasAES()) {
673 switch (LT.second.SimpleTy) {
674 case MVT::i16:
675 case MVT::i32:
676 case MVT::i64:
677 case MVT::i128: {
678 auto *VecTy =
679 FixedVectorType::get(ElementType: Type::getInt64Ty(C&: RetTy->getContext()), NumElts: 1);
680 return LT.first *
681 (1 +
682 getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, CostKind,
683 Index: -1, Op0: nullptr, Op1: nullptr) *
684 2 +
685 getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy, CostKind,
686 Index: -1, Op0: nullptr, Op1: nullptr));
687 }
688 case MVT::v1i64:
689 return LT.first;
690 case MVT::v2i64:
691 return LT.first * 3;
692 case MVT::v2i32:
693 return LT.first * 6;
694 case MVT::v4i32:
695 return LT.first * 11;
696 case MVT::v4i16:
697 return LT.first * 14;
698 default:
699 break;
700 }
701 }
702 break;
703 }
704 case Intrinsic::umin:
705 case Intrinsic::umax:
706 case Intrinsic::smin:
707 case Intrinsic::smax: {
708 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
709 MVT::v8i16, MVT::v2i32, MVT::v4i32,
710 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
711 MVT::nxv2i64};
712 auto LT = getTypeLegalizationCost(Ty: RetTy);
713 // v2i64 types get converted to cmp+bif hence the cost of 2
714 if (LT.second == MVT::v2i64)
715 return LT.first * 2;
716 if (any_of(Range: ValidMinMaxTys, P: equal_to(Arg&: LT.second)))
717 return LT.first;
718 break;
719 }
720 case Intrinsic::scmp:
721 case Intrinsic::ucmp: {
722 static const CostTblEntry BitreverseTbl[] = {
723 {.ISD: Intrinsic::scmp, .Type: MVT::i32, .Cost: 3}, // cmp+cset+csinv
724 {.ISD: Intrinsic::scmp, .Type: MVT::i64, .Cost: 3}, // cmp+cset+csinv
725 {.ISD: Intrinsic::scmp, .Type: MVT::v8i8, .Cost: 3}, // cmgt+cmgt+sub
726 {.ISD: Intrinsic::scmp, .Type: MVT::v16i8, .Cost: 3}, // cmgt+cmgt+sub
727 {.ISD: Intrinsic::scmp, .Type: MVT::v4i16, .Cost: 3}, // cmgt+cmgt+sub
728 {.ISD: Intrinsic::scmp, .Type: MVT::v8i16, .Cost: 3}, // cmgt+cmgt+sub
729 {.ISD: Intrinsic::scmp, .Type: MVT::v2i32, .Cost: 3}, // cmgt+cmgt+sub
730 {.ISD: Intrinsic::scmp, .Type: MVT::v4i32, .Cost: 3}, // cmgt+cmgt+sub
731 {.ISD: Intrinsic::scmp, .Type: MVT::v1i64, .Cost: 3}, // cmgt+cmgt+sub
732 {.ISD: Intrinsic::scmp, .Type: MVT::v2i64, .Cost: 3}, // cmgt+cmgt+sub
733 };
734 const auto LT = getTypeLegalizationCost(Ty: RetTy);
735 const auto *Entry =
736 CostTableLookup(Table: BitreverseTbl, ISD: Intrinsic::scmp, Ty: LT.second);
737 if (Entry)
738 return Entry->Cost * LT.first;
739 break;
740 }
741 case Intrinsic::sadd_sat:
742 case Intrinsic::ssub_sat:
743 case Intrinsic::uadd_sat:
744 case Intrinsic::usub_sat: {
745 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
746 MVT::v8i16, MVT::v2i32, MVT::v4i32,
747 MVT::v2i64};
748 auto LT = getTypeLegalizationCost(Ty: RetTy);
749 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
750 // need to extend the type, as it uses shr(qadd(shl, shl)).
751 unsigned Instrs =
752 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
753 if (any_of(Range: ValidSatTys, P: equal_to(Arg&: LT.second)))
754 return LT.first * Instrs;
755
756 TypeSize TS = getDataLayout().getTypeSizeInBits(Ty: RetTy);
757 uint64_t VectorSize = TS.getKnownMinValue();
758
759 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(Value: VectorSize))
760 return LT.first * Instrs;
761
762 break;
763 }
764 case Intrinsic::abs: {
765 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
766 MVT::v8i16, MVT::v2i32, MVT::v4i32,
767 MVT::v2i64, MVT::nxv16i8, MVT::nxv8i16,
768 MVT::nxv4i32, MVT::nxv2i64};
769 auto LT = getTypeLegalizationCost(Ty: RetTy);
770 if (any_of(Range: ValidAbsTys, P: equal_to(Arg&: LT.second)))
771 return LT.first;
772 break;
773 }
774 case Intrinsic::bswap: {
775 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
776 MVT::v4i32, MVT::v2i64};
777 auto LT = getTypeLegalizationCost(Ty: RetTy);
778 if (any_of(Range: ValidAbsTys, P: equal_to(Arg&: LT.second)) &&
779 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
780 return LT.first;
781 break;
782 }
783 case Intrinsic::fma:
784 case Intrinsic::fmuladd: {
785 // Given a fma or fmuladd, cost it the same as a fmul instruction which are
786 // usually the same for costs. TODO: Add fp16 and bf16 expansion costs.
787 Type *EltTy = RetTy->getScalarType();
788 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
789 (EltTy->isHalfTy() && ST->hasFullFP16()))
790 return getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: RetTy, CostKind);
791 break;
792 }
793 case Intrinsic::stepvector: {
794 InstructionCost Cost = 1; // Cost of the `index' instruction
795 auto LT = getTypeLegalizationCost(Ty: RetTy);
796 // Legalisation of illegal vectors involves an `index' instruction plus
797 // (LT.first - 1) vector adds.
798 if (LT.first > 1) {
799 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: RetTy->getContext());
800 InstructionCost AddCost =
801 getArithmeticInstrCost(Opcode: Instruction::Add, Ty: LegalVTy, CostKind);
802 Cost += AddCost * (LT.first - 1);
803 }
804 return Cost;
805 }
806 case Intrinsic::vector_extract:
807 case Intrinsic::vector_insert: {
808 // If both the vector and subvector types are legal types and the index
809 // is 0, then this should be a no-op or simple operation; return a
810 // relatively low cost.
811
812 // If arguments aren't actually supplied, then we cannot determine the
813 // value of the index. We also want to skip predicate types.
814 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
815 ICA.getReturnType()->getScalarType()->isIntegerTy(BitWidth: 1))
816 break;
817
818 LLVMContext &C = RetTy->getContext();
819 EVT VecVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]);
820 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
821 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, Ty: RetTy)
822 : getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[1]);
823 // Skip this if either the vector or subvector types are unpacked
824 // SVE types; they may get lowered to stack stores and loads.
825 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(VecVT: SubVecVT))
826 break;
827
828 TargetLoweringBase::LegalizeKind SubVecLK =
829 getTLI()->getTypeConversion(Context&: C, VT: SubVecVT);
830 TargetLoweringBase::LegalizeKind VecLK =
831 getTLI()->getTypeConversion(Context&: C, VT: VecVT);
832 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
833 const ConstantInt *CIdx = cast<ConstantInt>(Val: Idx);
834 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
835 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
836 return TTI::TCC_Free;
837 break;
838 }
839 case Intrinsic::bitreverse: {
840 static const CostTblEntry BitreverseTbl[] = {
841 {.ISD: Intrinsic::bitreverse, .Type: MVT::i32, .Cost: 1},
842 {.ISD: Intrinsic::bitreverse, .Type: MVT::i64, .Cost: 1},
843 {.ISD: Intrinsic::bitreverse, .Type: MVT::v8i8, .Cost: 1},
844 {.ISD: Intrinsic::bitreverse, .Type: MVT::v16i8, .Cost: 1},
845 {.ISD: Intrinsic::bitreverse, .Type: MVT::v4i16, .Cost: 2},
846 {.ISD: Intrinsic::bitreverse, .Type: MVT::v8i16, .Cost: 2},
847 {.ISD: Intrinsic::bitreverse, .Type: MVT::v2i32, .Cost: 2},
848 {.ISD: Intrinsic::bitreverse, .Type: MVT::v4i32, .Cost: 2},
849 {.ISD: Intrinsic::bitreverse, .Type: MVT::v1i64, .Cost: 2},
850 {.ISD: Intrinsic::bitreverse, .Type: MVT::v2i64, .Cost: 2},
851 };
852 const auto LegalisationCost = getTypeLegalizationCost(Ty: RetTy);
853 const auto *Entry =
854 CostTableLookup(Table: BitreverseTbl, ISD: ICA.getID(), Ty: LegalisationCost.second);
855 if (Entry) {
856 // Cost Model is using the legal type(i32) that i8 and i16 will be
857 // converted to +1 so that we match the actual lowering cost
858 if (TLI->getValueType(DL, Ty: RetTy, AllowUnknown: true) == MVT::i8 ||
859 TLI->getValueType(DL, Ty: RetTy, AllowUnknown: true) == MVT::i16)
860 return LegalisationCost.first * Entry->Cost + 1;
861
862 return LegalisationCost.first * Entry->Cost;
863 }
864 break;
865 }
866 case Intrinsic::ctpop: {
867 if (!ST->hasNEON()) {
868 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
869 return getTypeLegalizationCost(Ty: RetTy).first * 12;
870 }
871 static const CostTblEntry CtpopCostTbl[] = {
872 {.ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: 4},
873 {.ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: 3},
874 {.ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: 2},
875 {.ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: 1},
876 {.ISD: ISD::CTPOP, .Type: MVT::i64, .Cost: 4},
877 {.ISD: ISD::CTPOP, .Type: MVT::v2i32, .Cost: 3},
878 {.ISD: ISD::CTPOP, .Type: MVT::v4i16, .Cost: 2},
879 {.ISD: ISD::CTPOP, .Type: MVT::v8i8, .Cost: 1},
880 {.ISD: ISD::CTPOP, .Type: MVT::i32, .Cost: 5},
881 // SVE types (For targets that override NEON for fixed length vectors)
882 {.ISD: ISD::CTPOP, .Type: MVT::nxv2i64, .Cost: 1},
883 {.ISD: ISD::CTPOP, .Type: MVT::nxv4i32, .Cost: 1},
884 {.ISD: ISD::CTPOP, .Type: MVT::nxv8i16, .Cost: 1},
885 {.ISD: ISD::CTPOP, .Type: MVT::nxv16i8, .Cost: 1},
886 };
887 auto LT = getTypeLegalizationCost(Ty: RetTy);
888 MVT MTy = LT.second;
889
890 // When SVE is available CNT will be used for fixed and scalable vectors.
891 if (ST->isSVEorStreamingSVEAvailable() && MTy.isFixedLengthVector())
892 MTy = MVT::getScalableVectorVT(VT: MTy.getVectorElementType(),
893 NumElements: 128 / MTy.getScalarSizeInBits());
894
895 if (const auto *Entry = CostTableLookup(Table: CtpopCostTbl, ISD: ISD::CTPOP, Ty: MTy)) {
896 // Extra cost of +1 when illegal vector types are legalized by promoting
897 // the integer type.
898 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
899 RetTy->getScalarSizeInBits()
900 ? 1
901 : 0;
902 return LT.first * Entry->Cost + ExtraCost;
903 }
904 break;
905 }
906 case Intrinsic::sadd_with_overflow:
907 case Intrinsic::uadd_with_overflow:
908 case Intrinsic::ssub_with_overflow:
909 case Intrinsic::usub_with_overflow:
910 case Intrinsic::smul_with_overflow:
911 case Intrinsic::umul_with_overflow: {
912 static const CostTblEntry WithOverflowCostTbl[] = {
913 {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i8, .Cost: 3},
914 {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i8, .Cost: 3},
915 {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i16, .Cost: 3},
916 {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i16, .Cost: 3},
917 {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i32, .Cost: 1},
918 {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i32, .Cost: 1},
919 {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i64, .Cost: 1},
920 {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i64, .Cost: 1},
921 {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i8, .Cost: 3},
922 {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i8, .Cost: 3},
923 {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i16, .Cost: 3},
924 {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i16, .Cost: 3},
925 {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i32, .Cost: 1},
926 {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i32, .Cost: 1},
927 {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i64, .Cost: 1},
928 {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i64, .Cost: 1},
929 {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i8, .Cost: 5},
930 {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i8, .Cost: 4},
931 {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i16, .Cost: 5},
932 {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i16, .Cost: 4},
933 {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i32, .Cost: 2}, // eg umull;tst
934 {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i32, .Cost: 2}, // eg umull;cmp sxtw
935 {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i64, .Cost: 3}, // eg mul;smulh;cmp
936 {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i64, .Cost: 3}, // eg mul;umulh;cmp asr
937 };
938 EVT MTy = TLI->getValueType(DL, Ty: RetTy->getContainedType(i: 0), AllowUnknown: true);
939 if (MTy.isSimple())
940 if (const auto *Entry = CostTableLookup(Table: WithOverflowCostTbl, ISD: ICA.getID(),
941 Ty: MTy.getSimpleVT()))
942 return Entry->Cost;
943 break;
944 }
945 case Intrinsic::fptosi_sat:
946 case Intrinsic::fptoui_sat: {
947 if (ICA.getArgTypes().empty())
948 break;
949 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
950 auto LT = getTypeLegalizationCost(Ty: ICA.getArgTypes()[0]);
951 EVT MTy = TLI->getValueType(DL, Ty: RetTy);
952 // Check for the legal types, which are where the size of the input and the
953 // output are the same, or we are using cvt f64->i32 or f32->i64.
954 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
955 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
956 LT.second == MVT::v2f64)) {
957 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
958 (LT.second == MVT::f64 && MTy == MVT::i32) ||
959 (LT.second == MVT::f32 && MTy == MVT::i64)))
960 return LT.first;
961 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
962 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
963 MTy.getScalarSizeInBits() == 64)
964 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
965 }
966 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
967 // f32.
968 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
969 return LT.first + getIntrinsicInstrCost(
970 ICA: {ICA.getID(),
971 RetTy,
972 {ICA.getArgTypes()[0]->getWithNewType(
973 EltTy: Type::getFloatTy(C&: RetTy->getContext()))}},
974 CostKind);
975 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
976 (LT.second == MVT::f16 && MTy == MVT::i64) ||
977 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
978 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
979 return LT.first;
980 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
981 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
982 MTy.getScalarSizeInBits() == 32)
983 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
984 // Extending vector types v8f16->v8i32. These current scalarize but the
985 // codegen could be better.
986 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
987 MTy.getScalarSizeInBits() == 64)
988 return MTy.getVectorNumElements() * 3;
989
990 // If we can we use a legal convert followed by a min+max
991 if ((LT.second.getScalarType() == MVT::f32 ||
992 LT.second.getScalarType() == MVT::f64 ||
993 LT.second.getScalarType() == MVT::f16) &&
994 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
995 Type *LegalTy =
996 Type::getIntNTy(C&: RetTy->getContext(), N: LT.second.getScalarSizeInBits());
997 if (LT.second.isVector())
998 LegalTy = VectorType::get(ElementType: LegalTy, EC: LT.second.getVectorElementCount());
999 InstructionCost Cost = 1;
1000 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
1001 : Intrinsic::umin,
1002 LegalTy, {LegalTy, LegalTy});
1003 Cost += getIntrinsicInstrCost(ICA: Attrs1, CostKind);
1004 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
1005 : Intrinsic::umax,
1006 LegalTy, {LegalTy, LegalTy});
1007 Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
1008 return LT.first * Cost +
1009 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
1010 : 1);
1011 }
1012 // Otherwise we need to follow the default expansion that clamps the value
1013 // using a float min/max with a fcmp+sel for nan handling when signed.
1014 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
1015 RetTy = RetTy->getScalarType();
1016 if (LT.second.isVector()) {
1017 FPTy = VectorType::get(ElementType: FPTy, EC: LT.second.getVectorElementCount());
1018 RetTy = VectorType::get(ElementType: RetTy, EC: LT.second.getVectorElementCount());
1019 }
1020 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
1021 InstructionCost Cost = getIntrinsicInstrCost(ICA: Attrs1, CostKind);
1022 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
1023 Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
1024 Cost +=
1025 getCastInstrCost(Opcode: IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1026 Dst: RetTy, Src: FPTy, CCH: TTI::CastContextHint::None, CostKind);
1027 if (IsSigned) {
1028 Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1);
1029 Cost += getCmpSelInstrCost(Opcode: BinaryOperator::FCmp, ValTy: FPTy, CondTy,
1030 VecPred: CmpInst::FCMP_UNO, CostKind);
1031 Cost += getCmpSelInstrCost(Opcode: BinaryOperator::Select, ValTy: RetTy, CondTy,
1032 VecPred: CmpInst::FCMP_UNO, CostKind);
1033 }
1034 return LT.first * Cost;
1035 }
1036 case Intrinsic::fshl:
1037 case Intrinsic::fshr: {
1038 if (ICA.getArgs().empty())
1039 break;
1040
1041 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(V: ICA.getArgs()[2]);
1042
1043 // ROTR / ROTL is a funnel shift with equal first and second operand. For
1044 // ROTR on integer registers (i32/i64) this can be done in a single ror
1045 // instruction. A fshl with a non-constant shift uses a neg + ror.
1046 if (RetTy->isIntegerTy() && ICA.getArgs()[0] == ICA.getArgs()[1] &&
1047 (RetTy->getPrimitiveSizeInBits() == 32 ||
1048 RetTy->getPrimitiveSizeInBits() == 64)) {
1049 InstructionCost NegCost =
1050 (ICA.getID() == Intrinsic::fshl && !OpInfoZ.isConstant()) ? 1 : 0;
1051 return 1 + NegCost;
1052 }
1053
1054 // TODO: Add handling for fshl where third argument is not a constant.
1055 if (!OpInfoZ.isConstant())
1056 break;
1057
1058 const auto LegalisationCost = getTypeLegalizationCost(Ty: RetTy);
1059 if (OpInfoZ.isUniform()) {
1060 static const CostTblEntry FshlTbl[] = {
1061 {.ISD: Intrinsic::fshl, .Type: MVT::v4i32, .Cost: 2}, // shl + usra
1062 {.ISD: Intrinsic::fshl, .Type: MVT::v2i64, .Cost: 2}, {.ISD: Intrinsic::fshl, .Type: MVT::v16i8, .Cost: 2},
1063 {.ISD: Intrinsic::fshl, .Type: MVT::v8i16, .Cost: 2}, {.ISD: Intrinsic::fshl, .Type: MVT::v2i32, .Cost: 2},
1064 {.ISD: Intrinsic::fshl, .Type: MVT::v8i8, .Cost: 2}, {.ISD: Intrinsic::fshl, .Type: MVT::v4i16, .Cost: 2}};
1065 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
1066 // to avoid having to duplicate the costs.
1067 const auto *Entry =
1068 CostTableLookup(Table: FshlTbl, ISD: Intrinsic::fshl, Ty: LegalisationCost.second);
1069 if (Entry)
1070 return LegalisationCost.first * Entry->Cost;
1071 }
1072
1073 auto TyL = getTypeLegalizationCost(Ty: RetTy);
1074 if (!RetTy->isIntegerTy())
1075 break;
1076
1077 // Estimate cost manually, as types like i8 and i16 will get promoted to
1078 // i32 and CostTableLookup will ignore the extra conversion cost.
1079 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
1080 RetTy->getScalarSizeInBits() < 64) ||
1081 (RetTy->getScalarSizeInBits() % 64 != 0);
1082 unsigned ExtraCost = HigherCost ? 1 : 0;
1083 if (RetTy->getScalarSizeInBits() == 32 ||
1084 RetTy->getScalarSizeInBits() == 64)
1085 ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single
1086 // extr instruction.
1087 else if (HigherCost)
1088 ExtraCost = 1;
1089 else
1090 break;
1091 return TyL.first + ExtraCost;
1092 }
1093 case Intrinsic::get_active_lane_mask: {
1094 auto RetTy = cast<VectorType>(Val: ICA.getReturnType());
1095 EVT RetVT = getTLI()->getValueType(DL, Ty: RetTy);
1096 EVT OpVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]);
1097 if (getTLI()->shouldExpandGetActiveLaneMask(VT: RetVT, OpVT))
1098 break;
1099
1100 if (RetTy->isScalableTy()) {
1101 if (TLI->getTypeAction(Context&: RetTy->getContext(), VT: RetVT) !=
1102 TargetLowering::TypeSplitVector)
1103 break;
1104
1105 auto LT = getTypeLegalizationCost(Ty: RetTy);
1106 InstructionCost Cost = LT.first;
1107 // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
1108 // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
1109 // nxv32i1 = get_active_lane_mask(base, idx) ->
1110 // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
1111 if (ST->hasSVE2p1() || ST->hasSME2()) {
1112 Cost /= 2;
1113 if (Cost == 1)
1114 return Cost;
1115 }
1116
1117 // If more than one whilelo intrinsic is required, include the extra cost
1118 // required by the saturating add & select required to increment the
1119 // start value after the first intrinsic call.
1120 Type *OpTy = ICA.getArgTypes()[0];
1121 IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
1122 InstructionCost SplitCost = getIntrinsicInstrCost(ICA: AddAttrs, CostKind);
1123 Type *CondTy = OpTy->getWithNewBitWidth(NewBitWidth: 1);
1124 SplitCost += getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: OpTy, CondTy,
1125 VecPred: CmpInst::ICMP_UGT, CostKind);
1126 return Cost + (SplitCost * (Cost - 1));
1127 } else if (!getTLI()->isTypeLegal(VT: RetVT)) {
1128 // We don't have enough context at this point to determine if the mask
1129 // is going to be kept live after the block, which will force the vXi1
1130 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
1131 // For now, we just assume the vectorizer created this intrinsic and
1132 // the result will be the input for a PHI. In this case the cost will
1133 // be extremely high for fixed-width vectors.
1134 // NOTE: getScalarizationOverhead returns a cost that's far too
1135 // pessimistic for the actual generated codegen. In reality there are
1136 // two instructions generated per lane.
1137 return cast<FixedVectorType>(Val: RetTy)->getNumElements() * 2;
1138 }
1139 break;
1140 }
1141 case Intrinsic::experimental_vector_match: {
1142 auto *NeedleTy = cast<FixedVectorType>(Val: ICA.getArgTypes()[1]);
1143 EVT SearchVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]);
1144 unsigned SearchSize = NeedleTy->getNumElements();
1145 if (!getTLI()->shouldExpandVectorMatch(VT: SearchVT, SearchSize)) {
1146 // Base cost for MATCH instructions. At least on the Neoverse V2 and
1147 // Neoverse V3, these are cheap operations with the same latency as a
1148 // vector ADD. In most cases, however, we also need to do an extra DUP.
1149 // For fixed-length vectors we currently need an extra five--six
1150 // instructions besides the MATCH.
1151 InstructionCost Cost = 4;
1152 if (isa<FixedVectorType>(Val: RetTy))
1153 Cost += 10;
1154 return Cost;
1155 }
1156 break;
1157 }
1158 case Intrinsic::cttz: {
1159 auto LT = getTypeLegalizationCost(Ty: ICA.getArgTypes()[0]);
1160 if (LT.second == MVT::v8i8 || LT.second == MVT::v16i8)
1161 return LT.first * 2;
1162 if (LT.second == MVT::v4i16 || LT.second == MVT::v8i16 ||
1163 LT.second == MVT::v2i32 || LT.second == MVT::v4i32)
1164 return LT.first * 3;
1165 break;
1166 }
1167 case Intrinsic::experimental_cttz_elts: {
1168 EVT ArgVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]);
1169 if (!getTLI()->shouldExpandCttzElements(VT: ArgVT)) {
1170 // This will consist of a SVE brkb and a cntp instruction. These
1171 // typically have the same latency and half the throughput as a vector
1172 // add instruction.
1173 return 4;
1174 }
1175 break;
1176 }
1177 case Intrinsic::loop_dependence_raw_mask:
1178 case Intrinsic::loop_dependence_war_mask: {
1179 // The whilewr/rw instructions require SVE2 or SME.
1180 if (ST->hasSVE2() || ST->hasSME()) {
1181 EVT VecVT = getTLI()->getValueType(DL, Ty: RetTy);
1182 unsigned EltSizeInBytes =
1183 cast<ConstantInt>(Val: ICA.getArgs()[2])->getZExtValue();
1184 if (!is_contained(Set: {1u, 2u, 4u, 8u}, Element: EltSizeInBytes) ||
1185 VecVT.getVectorMinNumElements() != (16 / EltSizeInBytes))
1186 break;
1187 // For fixed-vector types we need to AND the mask with a ptrue vl<N>.
1188 return isa<FixedVectorType>(Val: RetTy) ? 2 : 1;
1189 }
1190 break;
1191 }
1192 case Intrinsic::experimental_vector_extract_last_active:
1193 if (ST->isSVEorStreamingSVEAvailable()) {
1194 auto [LegalCost, _] = getTypeLegalizationCost(Ty: ICA.getArgTypes()[0]);
1195 // This should turn into chained clastb instructions.
1196 return LegalCost;
1197 }
1198 break;
1199 case Intrinsic::pow: {
1200 // For scalar calls we know the target has the libcall, and for fixed-width
1201 // vectors we know for the worst case it can be scalarised.
1202 EVT VT = getTLI()->getValueType(DL, Ty: RetTy);
1203 RTLIB::Libcall LC = RTLIB::getPOW(RetVT: VT);
1204 bool HasLibcall = getTLI()->getLibcallImpl(Call: LC) != RTLIB::Unsupported;
1205 bool CanLowerWithLibcalls = !isa<ScalableVectorType>(Val: RetTy) || HasLibcall;
1206
1207 // If we know that the call can be lowered with libcalls then it's safe to
1208 // reduce the costs in some cases. This is important for scalable vectors,
1209 // since we cannot scalarize the call in the absence of a vector math
1210 // library.
1211 if (CanLowerWithLibcalls && ICA.getInst() && !ICA.getArgs().empty()) {
1212 // If we know the fast math flags and the exponent is a constant then the
1213 // cost may be less for some exponents like 0.25 and 0.75.
1214 const Constant *ExpC = dyn_cast<Constant>(Val: ICA.getArgs()[1]);
1215 if (ExpC && isa<VectorType>(Val: ExpC->getType()))
1216 ExpC = ExpC->getSplatValue();
1217 if (auto *ExpF = dyn_cast_or_null<ConstantFP>(Val: ExpC)) {
1218 // The argument must be a FP constant.
1219 bool Is025 = ExpF->getValueAPF().isExactlyValue(V: 0.25);
1220 bool Is075 = ExpF->getValueAPF().isExactlyValue(V: 0.75);
1221 FastMathFlags FMF = ICA.getInst()->getFastMathFlags();
1222 if ((Is025 || Is075) && FMF.noInfs() && FMF.approxFunc() &&
1223 (!Is025 || FMF.noSignedZeros())) {
1224 IntrinsicCostAttributes Attrs(Intrinsic::sqrt, RetTy, {RetTy}, FMF);
1225 InstructionCost Sqrt = getIntrinsicInstrCost(ICA: Attrs, CostKind);
1226 if (Is025)
1227 return 2 * Sqrt;
1228 InstructionCost FMul =
1229 getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: RetTy, CostKind);
1230 return (Sqrt * 2) + FMul;
1231 }
1232 // TODO: For 1/3 exponents we expect the cbrt call to be slightly
1233 // cheaper than pow.
1234 }
1235 }
1236
1237 if (HasLibcall)
1238 return getCallInstrCost(F: nullptr, RetTy, Tys: ICA.getArgTypes(), CostKind);
1239 break;
1240 }
1241 case Intrinsic::sqrt:
1242 case Intrinsic::fabs:
1243 case Intrinsic::ceil:
1244 case Intrinsic::floor:
1245 case Intrinsic::nearbyint:
1246 case Intrinsic::round:
1247 case Intrinsic::rint:
1248 case Intrinsic::roundeven:
1249 case Intrinsic::trunc:
1250 case Intrinsic::minnum:
1251 case Intrinsic::maxnum:
1252 case Intrinsic::minimum:
1253 case Intrinsic::maximum: {
1254 if (isa<ScalableVectorType>(Val: RetTy) && ST->isSVEorStreamingSVEAvailable()) {
1255 auto LT = getTypeLegalizationCost(Ty: RetTy);
1256 return LT.first;
1257 }
1258 break;
1259 }
1260 default:
1261 break;
1262 }
1263 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1264}
1265
1266/// The function will remove redundant reinterprets casting in the presence
1267/// of the control flow
1268static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1269 IntrinsicInst &II) {
1270 SmallVector<Instruction *, 32> Worklist;
1271 auto RequiredType = II.getType();
1272
1273 auto *PN = dyn_cast<PHINode>(Val: II.getArgOperand(i: 0));
1274 assert(PN && "Expected Phi Node!");
1275
1276 // Don't create a new Phi unless we can remove the old one.
1277 if (!PN->hasOneUse())
1278 return std::nullopt;
1279
1280 for (Value *IncValPhi : PN->incoming_values()) {
1281 auto *Reinterpret = dyn_cast<IntrinsicInst>(Val: IncValPhi);
1282 if (!Reinterpret ||
1283 Reinterpret->getIntrinsicID() !=
1284 Intrinsic::aarch64_sve_convert_to_svbool ||
1285 RequiredType != Reinterpret->getArgOperand(i: 0)->getType())
1286 return std::nullopt;
1287 }
1288
1289 // Create the new Phi
1290 IC.Builder.SetInsertPoint(PN);
1291 PHINode *NPN = IC.Builder.CreatePHI(Ty: RequiredType, NumReservedValues: PN->getNumIncomingValues());
1292 Worklist.push_back(Elt: PN);
1293
1294 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1295 auto *Reinterpret = cast<Instruction>(Val: PN->getIncomingValue(i: I));
1296 NPN->addIncoming(V: Reinterpret->getOperand(i: 0), BB: PN->getIncomingBlock(i: I));
1297 Worklist.push_back(Elt: Reinterpret);
1298 }
1299
1300 // Cleanup Phi Node and reinterprets
1301 return IC.replaceInstUsesWith(I&: II, V: NPN);
1302}
1303
1304// A collection of properties common to SVE intrinsics that allow for combines
1305// to be written without needing to know the specific intrinsic.
1306struct SVEIntrinsicInfo {
1307 //
1308 // Helper routines for common intrinsic definitions.
1309 //
1310
1311 // e.g. llvm.aarch64.sve.add pg, op1, op2
1312 // with IID ==> llvm.aarch64.sve.add_u
1313 static SVEIntrinsicInfo
1314 defaultMergingOp(Intrinsic::ID IID = Intrinsic::not_intrinsic) {
1315 return SVEIntrinsicInfo()
1316 .setGoverningPredicateOperandIdx(0)
1317 .setOperandIdxInactiveLanesTakenFrom(1)
1318 .setMatchingUndefIntrinsic(IID);
1319 }
1320
1321 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1322 static SVEIntrinsicInfo defaultMergingUnaryOp() {
1323 return SVEIntrinsicInfo()
1324 .setGoverningPredicateOperandIdx(1)
1325 .setOperandIdxInactiveLanesTakenFrom(0)
1326 .setOperandIdxWithNoActiveLanes(0);
1327 }
1328
1329 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1330 static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp() {
1331 return SVEIntrinsicInfo()
1332 .setGoverningPredicateOperandIdx(1)
1333 .setOperandIdxInactiveLanesTakenFrom(0);
1334 }
1335
1336 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1337 static SVEIntrinsicInfo defaultUndefOp() {
1338 return SVEIntrinsicInfo()
1339 .setGoverningPredicateOperandIdx(0)
1340 .setInactiveLanesAreNotDefined();
1341 }
1342
1343 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1344 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1345 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1346 return SVEIntrinsicInfo()
1347 .setGoverningPredicateOperandIdx(GPIndex)
1348 .setInactiveLanesAreUnused();
1349 }
1350
1351 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1352 // llvm.aarch64.sve.ld1 pg, ptr
1353 static SVEIntrinsicInfo defaultZeroingOp() {
1354 return SVEIntrinsicInfo()
1355 .setGoverningPredicateOperandIdx(0)
1356 .setInactiveLanesAreUnused()
1357 .setResultIsZeroInitialized();
1358 }
1359
1360 // All properties relate to predication and thus having a general predicate
1361 // is the minimum requirement to say there is intrinsic info to act on.
1362 explicit operator bool() const { return hasGoverningPredicate(); }
1363
1364 //
1365 // Properties relating to the governing predicate.
1366 //
1367
1368 bool hasGoverningPredicate() const {
1369 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1370 }
1371
1372 unsigned getGoverningPredicateOperandIdx() const {
1373 assert(hasGoverningPredicate() && "Propery not set!");
1374 return GoverningPredicateIdx;
1375 }
1376
1377 SVEIntrinsicInfo &setGoverningPredicateOperandIdx(unsigned Index) {
1378 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1379 GoverningPredicateIdx = Index;
1380 return *this;
1381 }
1382
1383 //
1384 // Properties relating to operations the intrinsic could be transformed into.
1385 // NOTE: This does not mean such a transformation is always possible, but the
1386 // knowledge makes it possible to reuse existing optimisations without needing
1387 // to embed specific handling for each intrinsic. For example, instruction
1388 // simplification can be used to optimise an intrinsic's active lanes.
1389 //
1390
1391 bool hasMatchingUndefIntrinsic() const {
1392 return UndefIntrinsic != Intrinsic::not_intrinsic;
1393 }
1394
1395 Intrinsic::ID getMatchingUndefIntrinsic() const {
1396 assert(hasMatchingUndefIntrinsic() && "Propery not set!");
1397 return UndefIntrinsic;
1398 }
1399
1400 SVEIntrinsicInfo &setMatchingUndefIntrinsic(Intrinsic::ID IID) {
1401 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1402 UndefIntrinsic = IID;
1403 return *this;
1404 }
1405
1406 bool hasMatchingIROpode() const { return IROpcode != 0; }
1407
1408 unsigned getMatchingIROpode() const {
1409 assert(hasMatchingIROpode() && "Propery not set!");
1410 return IROpcode;
1411 }
1412
1413 SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) {
1414 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1415 IROpcode = Opcode;
1416 return *this;
1417 }
1418
1419 //
1420 // Properties relating to the result of inactive lanes.
1421 //
1422
1423 bool inactiveLanesTakenFromOperand() const {
1424 return ResultLanes == InactiveLanesTakenFromOperand;
1425 }
1426
1427 unsigned getOperandIdxInactiveLanesTakenFrom() const {
1428 assert(inactiveLanesTakenFromOperand() && "Propery not set!");
1429 return OperandIdxForInactiveLanes;
1430 }
1431
1432 SVEIntrinsicInfo &setOperandIdxInactiveLanesTakenFrom(unsigned Index) {
1433 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1434 ResultLanes = InactiveLanesTakenFromOperand;
1435 OperandIdxForInactiveLanes = Index;
1436 return *this;
1437 }
1438
1439 bool inactiveLanesAreNotDefined() const {
1440 return ResultLanes == InactiveLanesAreNotDefined;
1441 }
1442
1443 SVEIntrinsicInfo &setInactiveLanesAreNotDefined() {
1444 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1445 ResultLanes = InactiveLanesAreNotDefined;
1446 return *this;
1447 }
1448
1449 bool inactiveLanesAreUnused() const {
1450 return ResultLanes == InactiveLanesAreUnused;
1451 }
1452
1453 SVEIntrinsicInfo &setInactiveLanesAreUnused() {
1454 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1455 ResultLanes = InactiveLanesAreUnused;
1456 return *this;
1457 }
1458
1459 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1460 // inactiveLanesAreZeroed =
1461 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1462 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1463
1464 SVEIntrinsicInfo &setResultIsZeroInitialized() {
1465 ResultIsZeroInitialized = true;
1466 return *this;
1467 }
1468
1469 //
1470 // The first operand of unary merging operations is typically only used to
1471 // set the result for inactive lanes. Knowing this allows us to deadcode the
1472 // operand when we can prove there are no inactive lanes.
1473 //
1474
1475 bool hasOperandWithNoActiveLanes() const {
1476 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1477 }
1478
1479 unsigned getOperandIdxWithNoActiveLanes() const {
1480 assert(hasOperandWithNoActiveLanes() && "Propery not set!");
1481 return OperandIdxWithNoActiveLanes;
1482 }
1483
1484 SVEIntrinsicInfo &setOperandIdxWithNoActiveLanes(unsigned Index) {
1485 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1486 OperandIdxWithNoActiveLanes = Index;
1487 return *this;
1488 }
1489
1490private:
1491 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1492
1493 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1494 unsigned IROpcode = 0;
1495
1496 enum PredicationStyle {
1497 Uninitialized,
1498 InactiveLanesTakenFromOperand,
1499 InactiveLanesAreNotDefined,
1500 InactiveLanesAreUnused
1501 } ResultLanes = Uninitialized;
1502
1503 bool ResultIsZeroInitialized = false;
1504 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1505 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1506};
1507
1508static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
1509 // Some SVE intrinsics do not use scalable vector types, but since they are
1510 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1511 if (!isa<ScalableVectorType>(Val: II.getType()) &&
1512 all_of(Range: II.args(), P: [&](const Value *V) {
1513 return !isa<ScalableVectorType>(Val: V->getType());
1514 }))
1515 return SVEIntrinsicInfo();
1516
1517 Intrinsic::ID IID = II.getIntrinsicID();
1518 switch (IID) {
1519 default:
1520 break;
1521 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1522 case Intrinsic::aarch64_sve_fcvt_f16f32:
1523 case Intrinsic::aarch64_sve_fcvt_f16f64:
1524 case Intrinsic::aarch64_sve_fcvt_f32f16:
1525 case Intrinsic::aarch64_sve_fcvt_f32f64:
1526 case Intrinsic::aarch64_sve_fcvt_f64f16:
1527 case Intrinsic::aarch64_sve_fcvt_f64f32:
1528 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1529 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1530 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1531 case Intrinsic::aarch64_sve_fcvtzs:
1532 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1533 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1534 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1535 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1536 case Intrinsic::aarch64_sve_fcvtzu:
1537 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1538 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1539 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1540 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1541 case Intrinsic::aarch64_sve_revb:
1542 case Intrinsic::aarch64_sve_revh:
1543 case Intrinsic::aarch64_sve_revw:
1544 case Intrinsic::aarch64_sve_revd:
1545 case Intrinsic::aarch64_sve_scvtf:
1546 case Intrinsic::aarch64_sve_scvtf_f16i32:
1547 case Intrinsic::aarch64_sve_scvtf_f16i64:
1548 case Intrinsic::aarch64_sve_scvtf_f32i64:
1549 case Intrinsic::aarch64_sve_scvtf_f64i32:
1550 case Intrinsic::aarch64_sve_ucvtf:
1551 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1552 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1553 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1554 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1555 return SVEIntrinsicInfo::defaultMergingUnaryOp();
1556
1557 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1558 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1559 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1560 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1561 return SVEIntrinsicInfo::defaultMergingUnaryNarrowingTopOp();
1562
1563 case Intrinsic::aarch64_sve_fabd:
1564 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fabd_u);
1565 case Intrinsic::aarch64_sve_fadd:
1566 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fadd_u)
1567 .setMatchingIROpcode(Instruction::FAdd);
1568 case Intrinsic::aarch64_sve_fdiv:
1569 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fdiv_u)
1570 .setMatchingIROpcode(Instruction::FDiv);
1571 case Intrinsic::aarch64_sve_fmax:
1572 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmax_u);
1573 case Intrinsic::aarch64_sve_fmaxnm:
1574 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmaxnm_u);
1575 case Intrinsic::aarch64_sve_fmin:
1576 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmin_u);
1577 case Intrinsic::aarch64_sve_fminnm:
1578 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fminnm_u);
1579 case Intrinsic::aarch64_sve_fmla:
1580 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmla_u);
1581 case Intrinsic::aarch64_sve_fmls:
1582 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmls_u);
1583 case Intrinsic::aarch64_sve_fmul:
1584 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmul_u)
1585 .setMatchingIROpcode(Instruction::FMul);
1586 case Intrinsic::aarch64_sve_fmulx:
1587 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmulx_u);
1588 case Intrinsic::aarch64_sve_fnmla:
1589 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fnmla_u);
1590 case Intrinsic::aarch64_sve_fnmls:
1591 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fnmls_u);
1592 case Intrinsic::aarch64_sve_fsub:
1593 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fsub_u)
1594 .setMatchingIROpcode(Instruction::FSub);
1595 case Intrinsic::aarch64_sve_add:
1596 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_add_u)
1597 .setMatchingIROpcode(Instruction::Add);
1598 case Intrinsic::aarch64_sve_mla:
1599 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mla_u);
1600 case Intrinsic::aarch64_sve_mls:
1601 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mls_u);
1602 case Intrinsic::aarch64_sve_mul:
1603 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mul_u)
1604 .setMatchingIROpcode(Instruction::Mul);
1605 case Intrinsic::aarch64_sve_sabd:
1606 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sabd_u);
1607 case Intrinsic::aarch64_sve_sdiv:
1608 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sdiv_u)
1609 .setMatchingIROpcode(Instruction::SDiv);
1610 case Intrinsic::aarch64_sve_smax:
1611 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smax_u);
1612 case Intrinsic::aarch64_sve_smin:
1613 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smin_u);
1614 case Intrinsic::aarch64_sve_smulh:
1615 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smulh_u);
1616 case Intrinsic::aarch64_sve_sub:
1617 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sub_u)
1618 .setMatchingIROpcode(Instruction::Sub);
1619 case Intrinsic::aarch64_sve_uabd:
1620 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uabd_u);
1621 case Intrinsic::aarch64_sve_udiv:
1622 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_udiv_u)
1623 .setMatchingIROpcode(Instruction::UDiv);
1624 case Intrinsic::aarch64_sve_umax:
1625 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umax_u);
1626 case Intrinsic::aarch64_sve_umin:
1627 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umin_u);
1628 case Intrinsic::aarch64_sve_umulh:
1629 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umulh_u);
1630 case Intrinsic::aarch64_sve_asr:
1631 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_asr_u)
1632 .setMatchingIROpcode(Instruction::AShr);
1633 case Intrinsic::aarch64_sve_lsl:
1634 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_lsl_u)
1635 .setMatchingIROpcode(Instruction::Shl);
1636 case Intrinsic::aarch64_sve_lsr:
1637 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_lsr_u)
1638 .setMatchingIROpcode(Instruction::LShr);
1639 case Intrinsic::aarch64_sve_and:
1640 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_and_u)
1641 .setMatchingIROpcode(Instruction::And);
1642 case Intrinsic::aarch64_sve_bic:
1643 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_bic_u);
1644 case Intrinsic::aarch64_sve_eor:
1645 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_eor_u)
1646 .setMatchingIROpcode(Instruction::Xor);
1647 case Intrinsic::aarch64_sve_orr:
1648 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_orr_u)
1649 .setMatchingIROpcode(Instruction::Or);
1650 case Intrinsic::aarch64_sve_shsub:
1651 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_shsub_u);
1652 case Intrinsic::aarch64_sve_shsubr:
1653 return SVEIntrinsicInfo::defaultMergingOp();
1654 case Intrinsic::aarch64_sve_sqrshl:
1655 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sqrshl_u);
1656 case Intrinsic::aarch64_sve_sqshl:
1657 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sqshl_u);
1658 case Intrinsic::aarch64_sve_sqsub:
1659 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sqsub_u);
1660 case Intrinsic::aarch64_sve_srshl:
1661 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_srshl_u);
1662 case Intrinsic::aarch64_sve_uhsub:
1663 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uhsub_u);
1664 case Intrinsic::aarch64_sve_uhsubr:
1665 return SVEIntrinsicInfo::defaultMergingOp();
1666 case Intrinsic::aarch64_sve_uqrshl:
1667 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uqrshl_u);
1668 case Intrinsic::aarch64_sve_uqshl:
1669 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uqshl_u);
1670 case Intrinsic::aarch64_sve_uqsub:
1671 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uqsub_u);
1672 case Intrinsic::aarch64_sve_urshl:
1673 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_urshl_u);
1674
1675 case Intrinsic::aarch64_sve_add_u:
1676 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1677 Instruction::Add);
1678 case Intrinsic::aarch64_sve_and_u:
1679 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1680 Instruction::And);
1681 case Intrinsic::aarch64_sve_asr_u:
1682 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1683 Instruction::AShr);
1684 case Intrinsic::aarch64_sve_eor_u:
1685 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1686 Instruction::Xor);
1687 case Intrinsic::aarch64_sve_fadd_u:
1688 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1689 Instruction::FAdd);
1690 case Intrinsic::aarch64_sve_fdiv_u:
1691 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1692 Instruction::FDiv);
1693 case Intrinsic::aarch64_sve_fmul_u:
1694 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1695 Instruction::FMul);
1696 case Intrinsic::aarch64_sve_fsub_u:
1697 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1698 Instruction::FSub);
1699 case Intrinsic::aarch64_sve_lsl_u:
1700 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1701 Instruction::Shl);
1702 case Intrinsic::aarch64_sve_lsr_u:
1703 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1704 Instruction::LShr);
1705 case Intrinsic::aarch64_sve_mul_u:
1706 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1707 Instruction::Mul);
1708 case Intrinsic::aarch64_sve_orr_u:
1709 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1710 Instruction::Or);
1711 case Intrinsic::aarch64_sve_sdiv_u:
1712 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1713 Instruction::SDiv);
1714 case Intrinsic::aarch64_sve_sub_u:
1715 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1716 Instruction::Sub);
1717 case Intrinsic::aarch64_sve_udiv_u:
1718 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1719 Instruction::UDiv);
1720
1721 case Intrinsic::aarch64_sve_addqv:
1722 case Intrinsic::aarch64_sve_and_z:
1723 case Intrinsic::aarch64_sve_bic_z:
1724 case Intrinsic::aarch64_sve_brka_z:
1725 case Intrinsic::aarch64_sve_brkb_z:
1726 case Intrinsic::aarch64_sve_brkn_z:
1727 case Intrinsic::aarch64_sve_brkpa_z:
1728 case Intrinsic::aarch64_sve_brkpb_z:
1729 case Intrinsic::aarch64_sve_cntp:
1730 case Intrinsic::aarch64_sve_compact:
1731 case Intrinsic::aarch64_sve_eor_z:
1732 case Intrinsic::aarch64_sve_eorv:
1733 case Intrinsic::aarch64_sve_eorqv:
1734 case Intrinsic::aarch64_sve_nand_z:
1735 case Intrinsic::aarch64_sve_nor_z:
1736 case Intrinsic::aarch64_sve_orn_z:
1737 case Intrinsic::aarch64_sve_orr_z:
1738 case Intrinsic::aarch64_sve_orv:
1739 case Intrinsic::aarch64_sve_orqv:
1740 case Intrinsic::aarch64_sve_pnext:
1741 case Intrinsic::aarch64_sve_rdffr_z:
1742 case Intrinsic::aarch64_sve_saddv:
1743 case Intrinsic::aarch64_sve_uaddv:
1744 case Intrinsic::aarch64_sve_umaxv:
1745 case Intrinsic::aarch64_sve_umaxqv:
1746 case Intrinsic::aarch64_sve_cmpeq:
1747 case Intrinsic::aarch64_sve_cmpeq_wide:
1748 case Intrinsic::aarch64_sve_cmpge:
1749 case Intrinsic::aarch64_sve_cmpge_wide:
1750 case Intrinsic::aarch64_sve_cmpgt:
1751 case Intrinsic::aarch64_sve_cmpgt_wide:
1752 case Intrinsic::aarch64_sve_cmphi:
1753 case Intrinsic::aarch64_sve_cmphi_wide:
1754 case Intrinsic::aarch64_sve_cmphs:
1755 case Intrinsic::aarch64_sve_cmphs_wide:
1756 case Intrinsic::aarch64_sve_cmple_wide:
1757 case Intrinsic::aarch64_sve_cmplo_wide:
1758 case Intrinsic::aarch64_sve_cmpls_wide:
1759 case Intrinsic::aarch64_sve_cmplt_wide:
1760 case Intrinsic::aarch64_sve_cmpne:
1761 case Intrinsic::aarch64_sve_cmpne_wide:
1762 case Intrinsic::aarch64_sve_facge:
1763 case Intrinsic::aarch64_sve_facgt:
1764 case Intrinsic::aarch64_sve_fcmpeq:
1765 case Intrinsic::aarch64_sve_fcmpge:
1766 case Intrinsic::aarch64_sve_fcmpgt:
1767 case Intrinsic::aarch64_sve_fcmpne:
1768 case Intrinsic::aarch64_sve_fcmpuo:
1769 case Intrinsic::aarch64_sve_ld1:
1770 case Intrinsic::aarch64_sve_ld1_gather:
1771 case Intrinsic::aarch64_sve_ld1_gather_index:
1772 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1773 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1774 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1775 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1776 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1777 case Intrinsic::aarch64_sve_ld1q_gather_index:
1778 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1779 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1780 case Intrinsic::aarch64_sve_ld1ro:
1781 case Intrinsic::aarch64_sve_ld1rq:
1782 case Intrinsic::aarch64_sve_ld1udq:
1783 case Intrinsic::aarch64_sve_ld1uwq:
1784 case Intrinsic::aarch64_sve_ld2_sret:
1785 case Intrinsic::aarch64_sve_ld2q_sret:
1786 case Intrinsic::aarch64_sve_ld3_sret:
1787 case Intrinsic::aarch64_sve_ld3q_sret:
1788 case Intrinsic::aarch64_sve_ld4_sret:
1789 case Intrinsic::aarch64_sve_ld4q_sret:
1790 case Intrinsic::aarch64_sve_ldff1:
1791 case Intrinsic::aarch64_sve_ldff1_gather:
1792 case Intrinsic::aarch64_sve_ldff1_gather_index:
1793 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1794 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1795 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1796 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1797 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1798 case Intrinsic::aarch64_sve_ldnf1:
1799 case Intrinsic::aarch64_sve_ldnt1:
1800 case Intrinsic::aarch64_sve_ldnt1_gather:
1801 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1802 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1803 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1804 return SVEIntrinsicInfo::defaultZeroingOp();
1805
1806 case Intrinsic::aarch64_sve_prf:
1807 case Intrinsic::aarch64_sve_prfb_gather_index:
1808 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1809 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1810 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1811 case Intrinsic::aarch64_sve_prfd_gather_index:
1812 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1813 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1814 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1815 case Intrinsic::aarch64_sve_prfh_gather_index:
1816 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1817 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1818 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1819 case Intrinsic::aarch64_sve_prfw_gather_index:
1820 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1821 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1822 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1823 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 0);
1824
1825 case Intrinsic::aarch64_sve_st1_scatter:
1826 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1827 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1828 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1829 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1830 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1831 case Intrinsic::aarch64_sve_st1dq:
1832 case Intrinsic::aarch64_sve_st1q_scatter_index:
1833 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1834 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1835 case Intrinsic::aarch64_sve_st1wq:
1836 case Intrinsic::aarch64_sve_stnt1:
1837 case Intrinsic::aarch64_sve_stnt1_scatter:
1838 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1839 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1840 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1841 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 1);
1842 case Intrinsic::aarch64_sve_st2:
1843 case Intrinsic::aarch64_sve_st2q:
1844 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 2);
1845 case Intrinsic::aarch64_sve_st3:
1846 case Intrinsic::aarch64_sve_st3q:
1847 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 3);
1848 case Intrinsic::aarch64_sve_st4:
1849 case Intrinsic::aarch64_sve_st4q:
1850 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 4);
1851 }
1852
1853 return SVEIntrinsicInfo();
1854}
1855
1856static bool isAllActivePredicate(Value *Pred) {
1857 Value *UncastedPred;
1858
1859 // Look through predicate casts that only remove lanes.
1860 if (match(V: Pred, P: m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1861 Op0: m_Value(V&: UncastedPred)))) {
1862 auto *OrigPredTy = cast<ScalableVectorType>(Val: Pred->getType());
1863 Pred = UncastedPred;
1864
1865 if (match(V: Pred, P: m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1866 Op0: m_Value(V&: UncastedPred))))
1867 // If the predicate has the same or less lanes than the uncasted predicate
1868 // then we know the casting has no effect.
1869 if (OrigPredTy->getMinNumElements() <=
1870 cast<ScalableVectorType>(Val: UncastedPred->getType())
1871 ->getMinNumElements())
1872 Pred = UncastedPred;
1873 }
1874
1875 auto *C = dyn_cast<Constant>(Val: Pred);
1876 return C && C->isAllOnesValue();
1877}
1878
1879// Simplify `V` by only considering the operations that affect active lanes.
1880// This function should only return existing Values or newly created Constants.
1881static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1882 auto *Dup = dyn_cast<IntrinsicInst>(Val: V);
1883 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1884 Dup->getOperand(i_nocapture: 1) == Pg && isa<Constant>(Val: Dup->getOperand(i_nocapture: 2)))
1885 return ConstantVector::getSplat(
1886 EC: cast<VectorType>(Val: V->getType())->getElementCount(),
1887 Elt: cast<Constant>(Val: Dup->getOperand(i_nocapture: 2)));
1888
1889 return V;
1890}
1891
1892static std::optional<Instruction *>
1893simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II,
1894 const SVEIntrinsicInfo &IInfo) {
1895 const unsigned Opc = IInfo.getMatchingIROpode();
1896 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1897
1898 Value *Pg = II.getOperand(i_nocapture: 0);
1899 Value *Op1 = II.getOperand(i_nocapture: 1);
1900 Value *Op2 = II.getOperand(i_nocapture: 2);
1901 const DataLayout &DL = II.getDataLayout();
1902
1903 // Canonicalise constants to the RHS.
1904 if (Instruction::isCommutative(Opcode: Opc) && IInfo.inactiveLanesAreNotDefined() &&
1905 isa<Constant>(Val: Op1) && !isa<Constant>(Val: Op2)) {
1906 IC.replaceOperand(I&: II, OpNum: 1, V: Op2);
1907 IC.replaceOperand(I&: II, OpNum: 2, V: Op1);
1908 return &II;
1909 }
1910
1911 // Only active lanes matter when simplifying the operation.
1912 Op1 = stripInactiveLanes(V: Op1, Pg);
1913 Op2 = stripInactiveLanes(V: Op2, Pg);
1914
1915 Value *SimpleII;
1916 if (auto FII = dyn_cast<FPMathOperator>(Val: &II))
1917 SimpleII = simplifyBinOp(Opcode: Opc, LHS: Op1, RHS: Op2, FMF: FII->getFastMathFlags(), Q: DL);
1918 else
1919 SimpleII = simplifyBinOp(Opcode: Opc, LHS: Op1, RHS: Op2, Q: DL);
1920
1921 // An SVE intrinsic's result is always defined. However, this is not the case
1922 // for its equivalent IR instruction (e.g. when shifting by an amount more
1923 // than the data's bitwidth). Simplifications to an undefined result must be
1924 // ignored to preserve the intrinsic's expected behaviour.
1925 if (!SimpleII || isa<UndefValue>(Val: SimpleII))
1926 return std::nullopt;
1927
1928 if (IInfo.inactiveLanesAreNotDefined())
1929 return IC.replaceInstUsesWith(I&: II, V: SimpleII);
1930
1931 Value *Inactive = II.getOperand(i_nocapture: IInfo.getOperandIdxInactiveLanesTakenFrom());
1932
1933 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1934 if (SimpleII == Inactive)
1935 return IC.replaceInstUsesWith(I&: II, V: SimpleII);
1936
1937 // Inactive lanes must be preserved.
1938 SimpleII = IC.Builder.CreateSelect(C: Pg, True: SimpleII, False: Inactive);
1939 return IC.replaceInstUsesWith(I&: II, V: SimpleII);
1940}
1941
1942// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1943// to operations with less strict inactive lane requirements.
1944static std::optional<Instruction *>
1945simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II,
1946 const SVEIntrinsicInfo &IInfo) {
1947 if (!IInfo.hasGoverningPredicate())
1948 return std::nullopt;
1949
1950 auto *OpPredicate = II.getOperand(i_nocapture: IInfo.getGoverningPredicateOperandIdx());
1951
1952 // If there are no active lanes.
1953 if (match(V: OpPredicate, P: m_ZeroInt())) {
1954 if (IInfo.inactiveLanesTakenFromOperand())
1955 return IC.replaceInstUsesWith(
1956 I&: II, V: II.getOperand(i_nocapture: IInfo.getOperandIdxInactiveLanesTakenFrom()));
1957
1958 if (IInfo.inactiveLanesAreUnused()) {
1959 if (IInfo.resultIsZeroInitialized())
1960 IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty: II.getType()));
1961
1962 return IC.eraseInstFromFunction(I&: II);
1963 }
1964 }
1965
1966 // If there are no inactive lanes.
1967 if (isAllActivePredicate(Pred: OpPredicate)) {
1968 if (IInfo.hasOperandWithNoActiveLanes()) {
1969 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1970 if (!isa<UndefValue>(Val: II.getOperand(i_nocapture: OpIdx)))
1971 return IC.replaceOperand(I&: II, OpNum: OpIdx, V: UndefValue::get(T: II.getType()));
1972 }
1973
1974 if (IInfo.hasMatchingUndefIntrinsic()) {
1975 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1976 M: II.getModule(), id: IInfo.getMatchingUndefIntrinsic(), OverloadTys: {II.getType()});
1977 II.setCalledFunction(NewDecl);
1978 return &II;
1979 }
1980 }
1981
1982 // Operation specific simplifications.
1983 if (IInfo.hasMatchingIROpode() &&
1984 Instruction::isBinaryOp(Opcode: IInfo.getMatchingIROpode()))
1985 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1986
1987 return std::nullopt;
1988}
1989
1990// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
1991// => (binop (pred) (from_svbool _) (from_svbool _))
1992//
1993// The above transformation eliminates a `to_svbool` in the predicate
1994// operand of bitwise operation `binop` by narrowing the vector width of
1995// the operation. For example, it would convert a `<vscale x 16 x i1>
1996// and` into a `<vscale x 4 x i1> and`. This is profitable because
1997// to_svbool must zero the new lanes during widening, whereas
1998// from_svbool is free.
1999static std::optional<Instruction *>
2000tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
2001 auto BinOp = dyn_cast<IntrinsicInst>(Val: II.getOperand(i_nocapture: 0));
2002 if (!BinOp)
2003 return std::nullopt;
2004
2005 auto IntrinsicID = BinOp->getIntrinsicID();
2006 switch (IntrinsicID) {
2007 case Intrinsic::aarch64_sve_and_z:
2008 case Intrinsic::aarch64_sve_bic_z:
2009 case Intrinsic::aarch64_sve_eor_z:
2010 case Intrinsic::aarch64_sve_nand_z:
2011 case Intrinsic::aarch64_sve_nor_z:
2012 case Intrinsic::aarch64_sve_orn_z:
2013 case Intrinsic::aarch64_sve_orr_z:
2014 break;
2015 default:
2016 return std::nullopt;
2017 }
2018
2019 auto BinOpPred = BinOp->getOperand(i_nocapture: 0);
2020 auto BinOpOp1 = BinOp->getOperand(i_nocapture: 1);
2021 auto BinOpOp2 = BinOp->getOperand(i_nocapture: 2);
2022
2023 auto PredIntr = dyn_cast<IntrinsicInst>(Val: BinOpPred);
2024 if (!PredIntr ||
2025 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
2026 return std::nullopt;
2027
2028 auto PredOp = PredIntr->getOperand(i_nocapture: 0);
2029 auto PredOpTy = cast<VectorType>(Val: PredOp->getType());
2030 if (PredOpTy != II.getType())
2031 return std::nullopt;
2032
2033 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
2034 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
2035 ID: Intrinsic::aarch64_sve_convert_from_svbool, OverloadTypes: {PredOpTy}, Args: {BinOpOp1});
2036 NarrowedBinOpArgs.push_back(Elt: NarrowBinOpOp1);
2037 if (BinOpOp1 == BinOpOp2)
2038 NarrowedBinOpArgs.push_back(Elt: NarrowBinOpOp1);
2039 else
2040 NarrowedBinOpArgs.push_back(Elt: IC.Builder.CreateIntrinsic(
2041 ID: Intrinsic::aarch64_sve_convert_from_svbool, OverloadTypes: {PredOpTy}, Args: {BinOpOp2}));
2042
2043 auto NarrowedBinOp =
2044 IC.Builder.CreateIntrinsic(ID: IntrinsicID, OverloadTypes: {PredOpTy}, Args: NarrowedBinOpArgs);
2045 return IC.replaceInstUsesWith(I&: II, V: NarrowedBinOp);
2046}
2047
2048static std::optional<Instruction *>
2049instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
2050 // If the reinterpret instruction operand is a PHI Node
2051 if (isa<PHINode>(Val: II.getArgOperand(i: 0)))
2052 return processPhiNode(IC, II);
2053
2054 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
2055 return BinOpCombine;
2056
2057 // Ignore converts to/from svcount_t.
2058 if (isa<TargetExtType>(Val: II.getArgOperand(i: 0)->getType()) ||
2059 isa<TargetExtType>(Val: II.getType()))
2060 return std::nullopt;
2061
2062 SmallVector<Instruction *, 32> CandidatesForRemoval;
2063 Value *Cursor = II.getOperand(i_nocapture: 0), *EarliestReplacement = nullptr;
2064
2065 const auto *IVTy = cast<VectorType>(Val: II.getType());
2066
2067 // Walk the chain of conversions.
2068 while (Cursor) {
2069 // If the type of the cursor has fewer lanes than the final result, zeroing
2070 // must take place, which breaks the equivalence chain.
2071 const auto *CursorVTy = cast<VectorType>(Val: Cursor->getType());
2072 if (CursorVTy->getElementCount().getKnownMinValue() <
2073 IVTy->getElementCount().getKnownMinValue())
2074 break;
2075
2076 // If the cursor has the same type as I, it is a viable replacement.
2077 if (Cursor->getType() == IVTy)
2078 EarliestReplacement = Cursor;
2079
2080 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Val: Cursor);
2081
2082 // If this is not an SVE conversion intrinsic, this is the end of the chain.
2083 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
2084 Intrinsic::aarch64_sve_convert_to_svbool ||
2085 IntrinsicCursor->getIntrinsicID() ==
2086 Intrinsic::aarch64_sve_convert_from_svbool))
2087 break;
2088
2089 CandidatesForRemoval.insert(I: CandidatesForRemoval.begin(), Elt: IntrinsicCursor);
2090 Cursor = IntrinsicCursor->getOperand(i_nocapture: 0);
2091 }
2092
2093 // If no viable replacement in the conversion chain was found, there is
2094 // nothing to do.
2095 if (!EarliestReplacement)
2096 return std::nullopt;
2097
2098 return IC.replaceInstUsesWith(I&: II, V: EarliestReplacement);
2099}
2100
2101static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
2102 IntrinsicInst &II) {
2103 // svsel(ptrue, x, y) => x
2104 auto *OpPredicate = II.getOperand(i_nocapture: 0);
2105 if (isAllActivePredicate(Pred: OpPredicate))
2106 return IC.replaceInstUsesWith(I&: II, V: II.getOperand(i_nocapture: 1));
2107
2108 auto Select =
2109 IC.Builder.CreateSelect(C: OpPredicate, True: II.getOperand(i_nocapture: 1), False: II.getOperand(i_nocapture: 2));
2110 return IC.replaceInstUsesWith(I&: II, V: Select);
2111}
2112
2113static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
2114 IntrinsicInst &II) {
2115 Value *Pg = II.getOperand(i_nocapture: 1);
2116
2117 // sve.dup(V, all_active, X) ==> splat(X)
2118 if (isAllActivePredicate(Pred: Pg)) {
2119 auto *RetTy = cast<ScalableVectorType>(Val: II.getType());
2120 Value *Splat = IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(),
2121 V: II.getArgOperand(i: 2));
2122 return IC.replaceInstUsesWith(I&: II, V: Splat);
2123 }
2124
2125 if (!match(V: Pg, P: m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
2126 Op0: m_SpecificInt(V: AArch64SVEPredPattern::vl1))))
2127 return std::nullopt;
2128
2129 // sve.dup(V, sve.ptrue(vl1), X) ==> insertelement V, X, 0
2130 Value *Insert = IC.Builder.CreateInsertElement(
2131 Vec: II.getArgOperand(i: 0), NewElt: II.getArgOperand(i: 2), Idx: uint64_t(0));
2132 return IC.replaceInstUsesWith(I&: II, V: Insert);
2133}
2134
2135static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
2136 IntrinsicInst &II) {
2137 // Replace DupX with a regular IR splat.
2138 auto *RetTy = cast<ScalableVectorType>(Val: II.getType());
2139 Value *Splat = IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(),
2140 V: II.getArgOperand(i: 0));
2141 Splat->takeName(V: &II);
2142 return IC.replaceInstUsesWith(I&: II, V: Splat);
2143}
2144
2145static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
2146 IntrinsicInst &II) {
2147 LLVMContext &Ctx = II.getContext();
2148
2149 if (!isAllActivePredicate(Pred: II.getArgOperand(i: 0)))
2150 return std::nullopt;
2151
2152 // Check that we have a compare of zero..
2153 auto *SplatValue =
2154 dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: II.getArgOperand(i: 2)));
2155 if (!SplatValue || !SplatValue->isZero())
2156 return std::nullopt;
2157
2158 // ..against a dupq
2159 auto *DupQLane = dyn_cast<IntrinsicInst>(Val: II.getArgOperand(i: 1));
2160 if (!DupQLane ||
2161 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
2162 return std::nullopt;
2163
2164 // Where the dupq is a lane 0 replicate of a vector insert
2165 auto *DupQLaneIdx = dyn_cast<ConstantInt>(Val: DupQLane->getArgOperand(i: 1));
2166 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
2167 return std::nullopt;
2168
2169 auto *VecIns = dyn_cast<IntrinsicInst>(Val: DupQLane->getArgOperand(i: 0));
2170 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
2171 return std::nullopt;
2172
2173 // Where the vector insert is a fixed constant vector insert into undef at
2174 // index zero
2175 if (!isa<UndefValue>(Val: VecIns->getArgOperand(i: 0)))
2176 return std::nullopt;
2177
2178 if (!cast<ConstantInt>(Val: VecIns->getArgOperand(i: 2))->isZero())
2179 return std::nullopt;
2180
2181 auto *ConstVec = dyn_cast<Constant>(Val: VecIns->getArgOperand(i: 1));
2182 if (!ConstVec)
2183 return std::nullopt;
2184
2185 auto *VecTy = dyn_cast<FixedVectorType>(Val: ConstVec->getType());
2186 auto *OutTy = dyn_cast<ScalableVectorType>(Val: II.getType());
2187 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
2188 return std::nullopt;
2189
2190 unsigned NumElts = VecTy->getNumElements();
2191 unsigned PredicateBits = 0;
2192
2193 // Expand intrinsic operands to a 16-bit byte level predicate
2194 for (unsigned I = 0; I < NumElts; ++I) {
2195 auto *Arg = dyn_cast<ConstantInt>(Val: ConstVec->getAggregateElement(Elt: I));
2196 if (!Arg)
2197 return std::nullopt;
2198 if (!Arg->isZero())
2199 PredicateBits |= 1 << (I * (16 / NumElts));
2200 }
2201
2202 // If all bits are zero bail early with an empty predicate
2203 if (PredicateBits == 0) {
2204 auto *PFalse = Constant::getNullValue(Ty: II.getType());
2205 PFalse->takeName(V: &II);
2206 return IC.replaceInstUsesWith(I&: II, V: PFalse);
2207 }
2208
2209 // Calculate largest predicate type used (where byte predicate is largest)
2210 unsigned Mask = 8;
2211 for (unsigned I = 0; I < 16; ++I)
2212 if ((PredicateBits & (1 << I)) != 0)
2213 Mask |= (I % 8);
2214
2215 unsigned PredSize = Mask & -Mask;
2216 auto *PredType = ScalableVectorType::get(
2217 ElementType: Type::getInt1Ty(C&: Ctx), MinNumElts: AArch64::SVEBitsPerBlock / (PredSize * 8));
2218
2219 // Ensure all relevant bits are set
2220 for (unsigned I = 0; I < 16; I += PredSize)
2221 if ((PredicateBits & (1 << I)) == 0)
2222 return std::nullopt;
2223
2224 auto *ConvertToSVBool =
2225 IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_convert_to_svbool,
2226 OverloadTypes: PredType, Args: ConstantInt::getTrue(Ty: PredType));
2227 auto *ConvertFromSVBool =
2228 IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_convert_from_svbool,
2229 OverloadTypes: II.getType(), Args: ConvertToSVBool);
2230
2231 ConvertFromSVBool->takeName(V: &II);
2232 return IC.replaceInstUsesWith(I&: II, V: ConvertFromSVBool);
2233}
2234
2235static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
2236 IntrinsicInst &II) {
2237 Value *Pg = II.getArgOperand(i: 0);
2238 Value *Vec = II.getArgOperand(i: 1);
2239 auto IntrinsicID = II.getIntrinsicID();
2240 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
2241
2242 // lastX(splat(X)) --> X
2243 if (auto *SplatVal = getSplatValue(V: Vec))
2244 return IC.replaceInstUsesWith(I&: II, V: SplatVal);
2245
2246 // If x and/or y is a splat value then:
2247 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
2248 Value *LHS, *RHS;
2249 if (match(V: Vec, P: m_OneUse(SubPattern: m_BinOp(L: m_Value(V&: LHS), R: m_Value(V&: RHS))))) {
2250 if (isSplatValue(V: LHS) || isSplatValue(V: RHS)) {
2251 auto *OldBinOp = cast<BinaryOperator>(Val: Vec);
2252 auto OpC = OldBinOp->getOpcode();
2253 auto *NewLHS =
2254 IC.Builder.CreateIntrinsic(ID: IntrinsicID, OverloadTypes: {Vec->getType()}, Args: {Pg, LHS});
2255 auto *NewRHS =
2256 IC.Builder.CreateIntrinsic(ID: IntrinsicID, OverloadTypes: {Vec->getType()}, Args: {Pg, RHS});
2257 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
2258 Opc: OpC, V1: NewLHS, V2: NewRHS, CopyO: OldBinOp, Name: OldBinOp->getName(), InsertBefore: II.getIterator());
2259 return IC.replaceInstUsesWith(I&: II, V: NewBinOp);
2260 }
2261 }
2262
2263 auto *C = dyn_cast<Constant>(Val: Pg);
2264 if (IsAfter && C && C->isNullValue()) {
2265 // The intrinsic is extracting lane 0 so use an extract instead.
2266 auto *IdxTy = Type::getInt64Ty(C&: II.getContext());
2267 auto *Extract = ExtractElementInst::Create(Vec, Idx: ConstantInt::get(Ty: IdxTy, V: 0));
2268 Extract->insertBefore(InsertPos: II.getIterator());
2269 Extract->takeName(V: &II);
2270 return IC.replaceInstUsesWith(I&: II, V: Extract);
2271 }
2272
2273 auto *IntrPG = dyn_cast<IntrinsicInst>(Val: Pg);
2274 if (!IntrPG)
2275 return std::nullopt;
2276
2277 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
2278 return std::nullopt;
2279
2280 const auto PTruePattern =
2281 cast<ConstantInt>(Val: IntrPG->getOperand(i_nocapture: 0))->getZExtValue();
2282
2283 // Can the intrinsic's predicate be converted to a known constant index?
2284 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern: PTruePattern);
2285 if (!MinNumElts)
2286 return std::nullopt;
2287
2288 unsigned Idx = MinNumElts - 1;
2289 // Increment the index if extracting the element after the last active
2290 // predicate element.
2291 if (IsAfter)
2292 ++Idx;
2293
2294 // Ignore extracts whose index is larger than the known minimum vector
2295 // length. NOTE: This is an artificial constraint where we prefer to
2296 // maintain what the user asked for until an alternative is proven faster.
2297 auto *PgVTy = cast<ScalableVectorType>(Val: Pg->getType());
2298 if (Idx >= PgVTy->getMinNumElements())
2299 return std::nullopt;
2300
2301 // The intrinsic is extracting a fixed lane so use an extract instead.
2302 auto *IdxTy = Type::getInt64Ty(C&: II.getContext());
2303 auto *Extract = ExtractElementInst::Create(Vec, Idx: ConstantInt::get(Ty: IdxTy, V: Idx));
2304 Extract->insertBefore(InsertPos: II.getIterator());
2305 Extract->takeName(V: &II);
2306 return IC.replaceInstUsesWith(I&: II, V: Extract);
2307}
2308
2309static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2310 IntrinsicInst &II) {
2311 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2312 // integer variant across a variety of micro-architectures. Replace scalar
2313 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2314 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2315 // depending on the micro-architecture, but has been observed as generally
2316 // being faster, particularly when the CLAST[AB] op is a loop-carried
2317 // dependency.
2318 Value *Pg = II.getArgOperand(i: 0);
2319 Value *Fallback = II.getArgOperand(i: 1);
2320 Value *Vec = II.getArgOperand(i: 2);
2321 Type *Ty = II.getType();
2322
2323 if (!Ty->isIntegerTy())
2324 return std::nullopt;
2325
2326 Type *FPTy;
2327 switch (cast<IntegerType>(Val: Ty)->getBitWidth()) {
2328 default:
2329 return std::nullopt;
2330 case 16:
2331 FPTy = IC.Builder.getHalfTy();
2332 break;
2333 case 32:
2334 FPTy = IC.Builder.getFloatTy();
2335 break;
2336 case 64:
2337 FPTy = IC.Builder.getDoubleTy();
2338 break;
2339 }
2340
2341 Value *FPFallBack = IC.Builder.CreateBitCast(V: Fallback, DestTy: FPTy);
2342 auto *FPVTy = VectorType::get(
2343 ElementType: FPTy, EC: cast<VectorType>(Val: Vec->getType())->getElementCount());
2344 Value *FPVec = IC.Builder.CreateBitCast(V: Vec, DestTy: FPVTy);
2345 auto *FPII = IC.Builder.CreateIntrinsic(
2346 ID: II.getIntrinsicID(), OverloadTypes: {FPVec->getType()}, Args: {Pg, FPFallBack, FPVec});
2347 Value *FPIItoInt = IC.Builder.CreateBitCast(V: FPII, DestTy: II.getType());
2348 return IC.replaceInstUsesWith(I&: II, V: FPIItoInt);
2349}
2350
2351static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2352 IntrinsicInst &II) {
2353 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2354 // can work with RDFFR_PP for ptest elimination.
2355 auto *RDFFR = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_rdffr_z,
2356 Args: ConstantInt::getTrue(Ty: II.getType()));
2357 RDFFR->takeName(V: &II);
2358 return IC.replaceInstUsesWith(I&: II, V: RDFFR);
2359}
2360
2361static std::optional<Instruction *>
2362instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
2363 const auto Pattern = cast<ConstantInt>(Val: II.getArgOperand(i: 0))->getZExtValue();
2364
2365 if (Pattern == AArch64SVEPredPattern::all) {
2366 Value *Cnt = IC.Builder.CreateElementCount(
2367 Ty: II.getType(), EC: ElementCount::getScalable(MinVal: NumElts));
2368 Cnt->takeName(V: &II);
2369 return IC.replaceInstUsesWith(I&: II, V: Cnt);
2370 }
2371
2372 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2373
2374 return MinNumElts && NumElts >= MinNumElts
2375 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2376 I&: II, V: ConstantInt::get(Ty: II.getType(), V: MinNumElts)))
2377 : std::nullopt;
2378}
2379
2380static std::optional<Instruction *>
2381instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II,
2382 const AArch64Subtarget *ST) {
2383 if (!ST->isStreaming())
2384 return std::nullopt;
2385
2386 // In streaming-mode, aarch64_sme_cntds is equivalent to aarch64_sve_cntd
2387 // with SVEPredPattern::all
2388 Value *Cnt =
2389 IC.Builder.CreateElementCount(Ty: II.getType(), EC: ElementCount::getScalable(MinVal: 2));
2390 Cnt->takeName(V: &II);
2391 return IC.replaceInstUsesWith(I&: II, V: Cnt);
2392}
2393
2394static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2395 IntrinsicInst &II) {
2396 Value *PgVal = II.getArgOperand(i: 0);
2397 Value *OpVal = II.getArgOperand(i: 1);
2398
2399 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2400 // Later optimizations prefer this form.
2401 if (PgVal == OpVal &&
2402 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2403 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2404 Value *Ops[] = {PgVal, OpVal};
2405 Type *Tys[] = {PgVal->getType()};
2406
2407 auto *PTest =
2408 IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptest_any, OverloadTypes: Tys, Args: Ops);
2409 PTest->takeName(V: &II);
2410
2411 return IC.replaceInstUsesWith(I&: II, V: PTest);
2412 }
2413
2414 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(Val: PgVal);
2415 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(Val: OpVal);
2416
2417 if (!Pg || !Op)
2418 return std::nullopt;
2419
2420 Intrinsic::ID OpIID = Op->getIntrinsicID();
2421
2422 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2423 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2424 Pg->getArgOperand(i: 0)->getType() == Op->getArgOperand(i: 0)->getType()) {
2425 Value *Ops[] = {Pg->getArgOperand(i: 0), Op->getArgOperand(i: 0)};
2426 Type *Tys[] = {Pg->getArgOperand(i: 0)->getType()};
2427
2428 auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), OverloadTypes: Tys, Args: Ops);
2429
2430 PTest->takeName(V: &II);
2431 return IC.replaceInstUsesWith(I&: II, V: PTest);
2432 }
2433
2434 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
2435 // Later optimizations may rewrite sequence to use the flag-setting variant
2436 // of instruction X to remove PTEST.
2437 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2438 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2439 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2440 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2441 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2442 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2443 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2444 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2445 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2446 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2447 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2448 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2449 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2450 Value *Ops[] = {Pg->getArgOperand(i: 0), Pg};
2451 Type *Tys[] = {Pg->getType()};
2452
2453 auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), OverloadTypes: Tys, Args: Ops);
2454 PTest->takeName(V: &II);
2455
2456 return IC.replaceInstUsesWith(I&: II, V: PTest);
2457 }
2458
2459 return std::nullopt;
2460}
2461
2462template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
2463static std::optional<Instruction *>
2464instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
2465 bool MergeIntoAddendOp) {
2466 Value *P = II.getOperand(i_nocapture: 0);
2467 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2468 if (MergeIntoAddendOp) {
2469 AddendOp = II.getOperand(i_nocapture: 1);
2470 Mul = II.getOperand(i_nocapture: 2);
2471 } else {
2472 AddendOp = II.getOperand(i_nocapture: 2);
2473 Mul = II.getOperand(i_nocapture: 1);
2474 }
2475
2476 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(V: P), m_Value(V&: MulOp0),
2477 m_Value(V&: MulOp1))))
2478 return std::nullopt;
2479
2480 if (!Mul->hasOneUse())
2481 return std::nullopt;
2482
2483 Instruction *FMFSource = nullptr;
2484 if (II.getType()->isFPOrFPVectorTy()) {
2485 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2486 // Stop the combine when the flags on the inputs differ in case dropping
2487 // flags would lead to us missing out on more beneficial optimizations.
2488 if (FAddFlags != cast<CallInst>(Val: Mul)->getFastMathFlags())
2489 return std::nullopt;
2490 if (!FAddFlags.allowContract())
2491 return std::nullopt;
2492 FMFSource = &II;
2493 }
2494
2495 Value *Res;
2496 if (MergeIntoAddendOp)
2497 Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, OverloadTypes: {II.getType()},
2498 Args: {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2499 else
2500 Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, OverloadTypes: {II.getType()},
2501 Args: {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2502
2503 return IC.replaceInstUsesWith(I&: II, V: Res);
2504}
2505
2506static std::optional<Instruction *>
2507instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2508 Value *Pred = II.getOperand(i_nocapture: 0);
2509 Value *PtrOp = II.getOperand(i_nocapture: 1);
2510 Type *VecTy = II.getType();
2511
2512 if (isAllActivePredicate(Pred)) {
2513 LoadInst *Load = IC.Builder.CreateLoad(Ty: VecTy, Ptr: PtrOp);
2514 Load->copyMetadata(SrcInst: II);
2515 return IC.replaceInstUsesWith(I&: II, V: Load);
2516 }
2517
2518 CallInst *MaskedLoad =
2519 IC.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL),
2520 Mask: Pred, PassThru: ConstantAggregateZero::get(Ty: VecTy));
2521 MaskedLoad->copyMetadata(SrcInst: II);
2522 return IC.replaceInstUsesWith(I&: II, V: MaskedLoad);
2523}
2524
2525static std::optional<Instruction *>
2526instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2527 Value *VecOp = II.getOperand(i_nocapture: 0);
2528 Value *Pred = II.getOperand(i_nocapture: 1);
2529 Value *PtrOp = II.getOperand(i_nocapture: 2);
2530
2531 if (isAllActivePredicate(Pred)) {
2532 StoreInst *Store = IC.Builder.CreateStore(Val: VecOp, Ptr: PtrOp);
2533 Store->copyMetadata(SrcInst: II);
2534 return IC.eraseInstFromFunction(I&: II);
2535 }
2536
2537 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2538 Val: VecOp, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL), Mask: Pred);
2539 MaskedStore->copyMetadata(SrcInst: II);
2540 return IC.eraseInstFromFunction(I&: II);
2541}
2542
2543static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
2544 switch (Intrinsic) {
2545 case Intrinsic::aarch64_sve_fmul_u:
2546 return Instruction::BinaryOps::FMul;
2547 case Intrinsic::aarch64_sve_fadd_u:
2548 return Instruction::BinaryOps::FAdd;
2549 case Intrinsic::aarch64_sve_fsub_u:
2550 return Instruction::BinaryOps::FSub;
2551 default:
2552 return Instruction::BinaryOpsEnd;
2553 }
2554}
2555
2556static std::optional<Instruction *>
2557instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
2558 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2559 if (II.isStrictFP())
2560 return std::nullopt;
2561
2562 auto *OpPredicate = II.getOperand(i_nocapture: 0);
2563 auto BinOpCode = intrinsicIDToBinOpCode(Intrinsic: II.getIntrinsicID());
2564 if (BinOpCode == Instruction::BinaryOpsEnd ||
2565 !isAllActivePredicate(Pred: OpPredicate))
2566 return std::nullopt;
2567 auto BinOp = IC.Builder.CreateBinOpFMF(
2568 Opc: BinOpCode, LHS: II.getOperand(i_nocapture: 1), RHS: II.getOperand(i_nocapture: 2), FMFSource: II.getFastMathFlags());
2569 return IC.replaceInstUsesWith(I&: II, V: BinOp);
2570}
2571
2572static std::optional<Instruction *>
2573instCombineSVEVectorMlaU(InstCombiner &IC, IntrinsicInst &II) {
2574 assert(II.getIntrinsicID() == Intrinsic::aarch64_sve_mla_u &&
2575 "Expected MLA_U intrinsic");
2576 Value *Acc = II.getArgOperand(i: 1);
2577 Value *MulOp0 = II.getArgOperand(i: 2);
2578 Value *MulOp1 = II.getArgOperand(i: 3);
2579
2580 // For mla_u, inactive lanes are undefined, so it is valid to drop the
2581 // predicate when replacing mla_u(acc, x, 1) with add(acc, x) or
2582 // mla_u(acc, x, -1) with sub(acc, x).
2583 if (match(V: MulOp0, P: m_One()))
2584 return IC.replaceInstUsesWith(I&: II, V: IC.Builder.CreateAdd(LHS: Acc, RHS: MulOp1));
2585 if (match(V: MulOp1, P: m_One()))
2586 return IC.replaceInstUsesWith(I&: II, V: IC.Builder.CreateAdd(LHS: Acc, RHS: MulOp0));
2587 if (match(V: MulOp0, P: m_AllOnes()))
2588 return IC.replaceInstUsesWith(I&: II, V: IC.Builder.CreateSub(LHS: Acc, RHS: MulOp1));
2589 if (match(V: MulOp1, P: m_AllOnes()))
2590 return IC.replaceInstUsesWith(I&: II, V: IC.Builder.CreateSub(LHS: Acc, RHS: MulOp0));
2591
2592 if (isa<Constant>(Val: MulOp0) && !isa<Constant>(Val: MulOp1)) {
2593 II.setArgOperand(i: 2, v: MulOp1);
2594 II.setArgOperand(i: 3, v: MulOp0);
2595 return &II;
2596 }
2597
2598 return std::nullopt;
2599}
2600
2601static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2602 IntrinsicInst &II) {
2603 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2604 Intrinsic::aarch64_sve_mla>(
2605 IC, II, MergeIntoAddendOp: true))
2606 return MLA;
2607 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2608 Intrinsic::aarch64_sve_mad>(
2609 IC, II, MergeIntoAddendOp: false))
2610 return MAD;
2611 return std::nullopt;
2612}
2613
2614static std::optional<Instruction *>
2615instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
2616 if (auto FMLA =
2617 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2618 Intrinsic::aarch64_sve_fmla>(IC, II,
2619 MergeIntoAddendOp: true))
2620 return FMLA;
2621 if (auto FMAD =
2622 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2623 Intrinsic::aarch64_sve_fmad>(IC, II,
2624 MergeIntoAddendOp: false))
2625 return FMAD;
2626 if (auto FMLA =
2627 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2628 Intrinsic::aarch64_sve_fmla>(IC, II,
2629 MergeIntoAddendOp: true))
2630 return FMLA;
2631 return std::nullopt;
2632}
2633
2634static std::optional<Instruction *>
2635instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
2636 if (auto FMLA =
2637 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2638 Intrinsic::aarch64_sve_fmla>(IC, II,
2639 MergeIntoAddendOp: true))
2640 return FMLA;
2641 if (auto FMAD =
2642 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2643 Intrinsic::aarch64_sve_fmad>(IC, II,
2644 MergeIntoAddendOp: false))
2645 return FMAD;
2646 if (auto FMLA_U =
2647 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2648 Intrinsic::aarch64_sve_fmla_u>(
2649 IC, II, MergeIntoAddendOp: true))
2650 return FMLA_U;
2651 return instCombineSVEVectorBinOp(IC, II);
2652}
2653
2654static std::optional<Instruction *>
2655instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
2656 if (auto FMLS =
2657 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2658 Intrinsic::aarch64_sve_fmls>(IC, II,
2659 MergeIntoAddendOp: true))
2660 return FMLS;
2661 if (auto FMSB =
2662 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2663 Intrinsic::aarch64_sve_fnmsb>(
2664 IC, II, MergeIntoAddendOp: false))
2665 return FMSB;
2666 if (auto FMLS =
2667 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2668 Intrinsic::aarch64_sve_fmls>(IC, II,
2669 MergeIntoAddendOp: true))
2670 return FMLS;
2671 return std::nullopt;
2672}
2673
2674static std::optional<Instruction *>
2675instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
2676 if (auto FMLS =
2677 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2678 Intrinsic::aarch64_sve_fmls>(IC, II,
2679 MergeIntoAddendOp: true))
2680 return FMLS;
2681 if (auto FMSB =
2682 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2683 Intrinsic::aarch64_sve_fnmsb>(
2684 IC, II, MergeIntoAddendOp: false))
2685 return FMSB;
2686 if (auto FMLS_U =
2687 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2688 Intrinsic::aarch64_sve_fmls_u>(
2689 IC, II, MergeIntoAddendOp: true))
2690 return FMLS_U;
2691 return instCombineSVEVectorBinOp(IC, II);
2692}
2693
2694static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2695 IntrinsicInst &II) {
2696 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2697 Intrinsic::aarch64_sve_mls>(
2698 IC, II, MergeIntoAddendOp: true))
2699 return MLS;
2700 return std::nullopt;
2701}
2702
2703static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2704 IntrinsicInst &II) {
2705 Value *UnpackArg = II.getArgOperand(i: 0);
2706 auto *RetTy = cast<ScalableVectorType>(Val: II.getType());
2707 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2708 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2709
2710 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2711 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2712 if (auto *ScalarArg = getSplatValue(V: UnpackArg)) {
2713 ScalarArg =
2714 IC.Builder.CreateIntCast(V: ScalarArg, DestTy: RetTy->getScalarType(), isSigned: IsSigned);
2715 Value *NewVal =
2716 IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(), V: ScalarArg);
2717 NewVal->takeName(V: &II);
2718 return IC.replaceInstUsesWith(I&: II, V: NewVal);
2719 }
2720
2721 return std::nullopt;
2722}
2723static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2724 IntrinsicInst &II) {
2725 auto *OpVal = II.getOperand(i_nocapture: 0);
2726 auto *OpIndices = II.getOperand(i_nocapture: 1);
2727 VectorType *VTy = cast<VectorType>(Val: II.getType());
2728
2729 // Check whether OpIndices is a constant splat value < minimal element count
2730 // of result.
2731 auto *SplatValue = dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: OpIndices));
2732 if (!SplatValue ||
2733 SplatValue->getValue().uge(RHS: VTy->getElementCount().getKnownMinValue()))
2734 return std::nullopt;
2735
2736 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2737 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2738 auto *Extract = IC.Builder.CreateExtractElement(Vec: OpVal, Idx: SplatValue);
2739 auto *VectorSplat =
2740 IC.Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Extract);
2741
2742 VectorSplat->takeName(V: &II);
2743 return IC.replaceInstUsesWith(I&: II, V: VectorSplat);
2744}
2745
2746static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2747 IntrinsicInst &II) {
2748 Value *A, *B;
2749 Type *RetTy = II.getType();
2750 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2751 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2752
2753 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2754 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2755 if ((match(V: II.getArgOperand(i: 0),
2756 P: m_Intrinsic<FromSVB>(Op0: m_Intrinsic<ToSVB>(Op0: m_Value(V&: A)))) &&
2757 match(V: II.getArgOperand(i: 1),
2758 P: m_Intrinsic<FromSVB>(Op0: m_Intrinsic<ToSVB>(Op0: m_Value(V&: B))))) ||
2759 (match(V: II.getArgOperand(i: 0), P: m_Intrinsic<ToSVB>(Op0: m_Value(V&: A))) &&
2760 match(V: II.getArgOperand(i: 1), P: m_Intrinsic<ToSVB>(Op0: m_Value(V&: B))))) {
2761 auto *TyA = cast<ScalableVectorType>(Val: A->getType());
2762 if (TyA == B->getType() &&
2763 RetTy == ScalableVectorType::getDoubleElementsVectorType(VTy: TyA)) {
2764 auto *SubVec = IC.Builder.CreateInsertVector(
2765 DstType: RetTy, SrcVec: PoisonValue::get(T: RetTy), SubVec: A, Idx: uint64_t(0));
2766 auto *ConcatVec = IC.Builder.CreateInsertVector(DstType: RetTy, SrcVec: SubVec, SubVec: B,
2767 Idx: TyA->getMinNumElements());
2768 ConcatVec->takeName(V: &II);
2769 return IC.replaceInstUsesWith(I&: II, V: ConcatVec);
2770 }
2771 }
2772
2773 return std::nullopt;
2774}
2775
2776static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2777 IntrinsicInst &II) {
2778 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2779 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2780 Value *A, *B;
2781 if (match(V: II.getArgOperand(i: 0),
2782 P: m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(Op0: m_Value(V&: A), Op1: m_Value(V&: B))) &&
2783 match(V: II.getArgOperand(i: 1), P: m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
2784 Op0: m_Specific(V: A), Op1: m_Specific(V: B))))
2785 return IC.replaceInstUsesWith(
2786 I&: II, V: (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2787
2788 return std::nullopt;
2789}
2790
2791static std::optional<Instruction *>
2792instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
2793 Value *Mask = II.getOperand(i_nocapture: 0);
2794 Value *BasePtr = II.getOperand(i_nocapture: 1);
2795 Value *Index = II.getOperand(i_nocapture: 2);
2796 Type *Ty = II.getType();
2797 Value *PassThru = ConstantAggregateZero::get(Ty);
2798
2799 // Contiguous gather => masked load.
2800 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2801 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2802 Value *IndexBase;
2803 if (match(V: Index, P: m_Intrinsic<Intrinsic::aarch64_sve_index>(Op0: m_Value(V&: IndexBase),
2804 Op1: m_One()))) {
2805 Align Alignment =
2806 BasePtr->getPointerAlignment(DL: II.getDataLayout());
2807
2808 Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(),
2809 Ptr: BasePtr, IdxList: IndexBase);
2810 CallInst *MaskedLoad =
2811 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2812 MaskedLoad->takeName(V: &II);
2813 return IC.replaceInstUsesWith(I&: II, V: MaskedLoad);
2814 }
2815
2816 return std::nullopt;
2817}
2818
2819static std::optional<Instruction *>
2820instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
2821 Value *Val = II.getOperand(i_nocapture: 0);
2822 Value *Mask = II.getOperand(i_nocapture: 1);
2823 Value *BasePtr = II.getOperand(i_nocapture: 2);
2824 Value *Index = II.getOperand(i_nocapture: 3);
2825 Type *Ty = Val->getType();
2826
2827 // Contiguous scatter => masked store.
2828 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2829 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2830 Value *IndexBase;
2831 if (match(V: Index, P: m_Intrinsic<Intrinsic::aarch64_sve_index>(Op0: m_Value(V&: IndexBase),
2832 Op1: m_One()))) {
2833 Align Alignment =
2834 BasePtr->getPointerAlignment(DL: II.getDataLayout());
2835
2836 Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(),
2837 Ptr: BasePtr, IdxList: IndexBase);
2838 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2839
2840 return IC.eraseInstFromFunction(I&: II);
2841 }
2842
2843 return std::nullopt;
2844}
2845
2846static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2847 IntrinsicInst &II) {
2848 Type *Int32Ty = IC.Builder.getInt32Ty();
2849 Value *Pred = II.getOperand(i_nocapture: 0);
2850 Value *Vec = II.getOperand(i_nocapture: 1);
2851 Value *DivVec = II.getOperand(i_nocapture: 2);
2852
2853 Value *SplatValue = getSplatValue(V: DivVec);
2854 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(Val: SplatValue);
2855 if (!SplatConstantInt)
2856 return std::nullopt;
2857
2858 APInt Divisor = SplatConstantInt->getValue();
2859 const int64_t DivisorValue = Divisor.getSExtValue();
2860 if (DivisorValue == -1)
2861 return std::nullopt;
2862 if (DivisorValue == 1)
2863 IC.replaceInstUsesWith(I&: II, V: Vec);
2864
2865 if (Divisor.isPowerOf2()) {
2866 Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2());
2867 auto ASRD = IC.Builder.CreateIntrinsic(
2868 ID: Intrinsic::aarch64_sve_asrd, OverloadTypes: {II.getType()}, Args: {Pred, Vec, DivisorLog2});
2869 return IC.replaceInstUsesWith(I&: II, V: ASRD);
2870 }
2871 if (Divisor.isNegatedPowerOf2()) {
2872 Divisor.negate();
2873 Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2());
2874 auto ASRD = IC.Builder.CreateIntrinsic(
2875 ID: Intrinsic::aarch64_sve_asrd, OverloadTypes: {II.getType()}, Args: {Pred, Vec, DivisorLog2});
2876 auto NEG = IC.Builder.CreateIntrinsic(
2877 ID: Intrinsic::aarch64_sve_neg, OverloadTypes: {ASRD->getType()}, Args: {ASRD, Pred, ASRD});
2878 return IC.replaceInstUsesWith(I&: II, V: NEG);
2879 }
2880
2881 return std::nullopt;
2882}
2883
2884bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2885 size_t VecSize = Vec.size();
2886 if (VecSize == 1)
2887 return true;
2888 if (!isPowerOf2_64(Value: VecSize))
2889 return false;
2890 size_t HalfVecSize = VecSize / 2;
2891
2892 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2893 RHS != Vec.end(); LHS++, RHS++) {
2894 if (*LHS != nullptr && *RHS != nullptr) {
2895 if (*LHS == *RHS)
2896 continue;
2897 else
2898 return false;
2899 }
2900 if (!AllowPoison)
2901 return false;
2902 if (*LHS == nullptr && *RHS != nullptr)
2903 *LHS = *RHS;
2904 }
2905
2906 Vec.resize(N: HalfVecSize);
2907 SimplifyValuePattern(Vec, AllowPoison);
2908 return true;
2909}
2910
2911// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2912// to dupqlane(f64(C)) where C is A concatenated with B
2913static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2914 IntrinsicInst &II) {
2915 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2916 if (!match(V: II.getOperand(i_nocapture: 0),
2917 P: m_Intrinsic<Intrinsic::vector_insert>(
2918 Op0: m_Value(V&: Default), Op1: m_Value(V&: CurrentInsertElt), Op2: m_Value())) ||
2919 !isa<FixedVectorType>(Val: CurrentInsertElt->getType()))
2920 return std::nullopt;
2921 auto IIScalableTy = cast<ScalableVectorType>(Val: II.getType());
2922
2923 // Insert the scalars into a container ordered by InsertElement index
2924 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2925 while (auto InsertElt = dyn_cast<InsertElementInst>(Val: CurrentInsertElt)) {
2926 auto Idx = cast<ConstantInt>(Val: InsertElt->getOperand(i_nocapture: 2));
2927 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(i_nocapture: 1);
2928 CurrentInsertElt = InsertElt->getOperand(i_nocapture: 0);
2929 }
2930
2931 bool AllowPoison =
2932 isa<PoisonValue>(Val: CurrentInsertElt) && isa<PoisonValue>(Val: Default);
2933 if (!SimplifyValuePattern(Vec&: Elts, AllowPoison))
2934 return std::nullopt;
2935
2936 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2937 Value *InsertEltChain = PoisonValue::get(T: CurrentInsertElt->getType());
2938 for (size_t I = 0; I < Elts.size(); I++) {
2939 if (Elts[I] == nullptr)
2940 continue;
2941 InsertEltChain = IC.Builder.CreateInsertElement(Vec: InsertEltChain, NewElt: Elts[I],
2942 Idx: IC.Builder.getInt64(C: I));
2943 }
2944 if (InsertEltChain == nullptr)
2945 return std::nullopt;
2946
2947 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2948 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2949 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2950 // be narrowed back to the original type.
2951 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2952 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2953 IIScalableTy->getMinNumElements() /
2954 PatternWidth;
2955
2956 IntegerType *WideTy = IC.Builder.getIntNTy(N: PatternWidth);
2957 auto *WideScalableTy = ScalableVectorType::get(ElementType: WideTy, MinNumElts: PatternElementCount);
2958 auto *WideShuffleMaskTy =
2959 ScalableVectorType::get(ElementType: IC.Builder.getInt32Ty(), MinNumElts: PatternElementCount);
2960
2961 auto InsertSubvector = IC.Builder.CreateInsertVector(
2962 DstType: II.getType(), SrcVec: PoisonValue::get(T: II.getType()), SubVec: InsertEltChain,
2963 Idx: uint64_t(0));
2964 auto WideBitcast =
2965 IC.Builder.CreateBitOrPointerCast(V: InsertSubvector, DestTy: WideScalableTy);
2966 auto WideShuffleMask = ConstantAggregateZero::get(Ty: WideShuffleMaskTy);
2967 auto WideShuffle = IC.Builder.CreateShuffleVector(
2968 V1: WideBitcast, V2: PoisonValue::get(T: WideScalableTy), Mask: WideShuffleMask);
2969 auto NarrowBitcast =
2970 IC.Builder.CreateBitOrPointerCast(V: WideShuffle, DestTy: II.getType());
2971
2972 return IC.replaceInstUsesWith(I&: II, V: NarrowBitcast);
2973}
2974
2975static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2976 IntrinsicInst &II) {
2977 Value *A = II.getArgOperand(i: 0);
2978 Value *B = II.getArgOperand(i: 1);
2979 if (A == B)
2980 return IC.replaceInstUsesWith(I&: II, V: A);
2981
2982 return std::nullopt;
2983}
2984
2985static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2986 IntrinsicInst &II) {
2987 Value *Pred = II.getOperand(i_nocapture: 0);
2988 Value *Vec = II.getOperand(i_nocapture: 1);
2989 Value *Shift = II.getOperand(i_nocapture: 2);
2990
2991 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2992 Value *AbsPred, *MergedValue;
2993 if (!match(V: Vec, P: m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
2994 Op0: m_Value(V&: MergedValue), Op1: m_Value(V&: AbsPred), Op2: m_Value())) &&
2995 !match(V: Vec, P: m_Intrinsic<Intrinsic::aarch64_sve_abs>(
2996 Op0: m_Value(V&: MergedValue), Op1: m_Value(V&: AbsPred), Op2: m_Value())))
2997
2998 return std::nullopt;
2999
3000 // Transform is valid if any of the following are true:
3001 // * The ABS merge value is an undef or non-negative
3002 // * The ABS predicate is all active
3003 // * The ABS predicate and the SRSHL predicates are the same
3004 if (!isa<UndefValue>(Val: MergedValue) && !match(V: MergedValue, P: m_NonNegative()) &&
3005 AbsPred != Pred && !isAllActivePredicate(Pred: AbsPred))
3006 return std::nullopt;
3007
3008 // Only valid when the shift amount is non-negative, otherwise the rounding
3009 // behaviour of SRSHL cannot be ignored.
3010 if (!match(V: Shift, P: m_NonNegative()))
3011 return std::nullopt;
3012
3013 auto LSL = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_lsl,
3014 OverloadTypes: {II.getType()}, Args: {Pred, Vec, Shift});
3015
3016 return IC.replaceInstUsesWith(I&: II, V: LSL);
3017}
3018
3019static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
3020 IntrinsicInst &II) {
3021 Value *Vec = II.getOperand(i_nocapture: 0);
3022
3023 if (getSplatValue(V: Vec) == II.getOperand(i_nocapture: 1))
3024 return IC.replaceInstUsesWith(I&: II, V: Vec);
3025
3026 return std::nullopt;
3027}
3028
3029static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
3030 IntrinsicInst &II) {
3031 // If this barrier is post-dominated by identical one we can remove it
3032 auto *NI = II.getNextNode();
3033 unsigned LookaheadThreshold = DMBLookaheadThreshold;
3034 auto CanSkipOver = [](Instruction *I) {
3035 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
3036 };
3037 while (LookaheadThreshold-- && CanSkipOver(NI)) {
3038 auto *NIBB = NI->getParent();
3039 NI = NI->getNextNode();
3040 if (!NI) {
3041 if (auto *SuccBB = NIBB->getUniqueSuccessor())
3042 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
3043 else
3044 break;
3045 }
3046 }
3047 auto *NextII = dyn_cast_or_null<IntrinsicInst>(Val: NI);
3048 if (NextII && II.isIdenticalTo(I: NextII))
3049 return IC.eraseInstFromFunction(I&: II);
3050
3051 return std::nullopt;
3052}
3053
3054static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
3055 IntrinsicInst &II) {
3056 return IC.replaceInstUsesWith(
3057 I&: II,
3058 V: IC.Builder.CreateIntrinsic(ID: Intrinsic::get_active_lane_mask,
3059 OverloadTypes: {II.getType(), II.getOperand(i_nocapture: 0)->getType()},
3060 Args: {II.getOperand(i_nocapture: 0), II.getOperand(i_nocapture: 1)}));
3061}
3062
3063static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
3064 IntrinsicInst &II) {
3065 unsigned PredPattern = cast<ConstantInt>(Val: II.getOperand(i_nocapture: 0))->getZExtValue();
3066 // SVE vector length is a power-of-two, thus pow2 is synonymous with all.
3067 if (PredPattern == AArch64SVEPredPattern::all ||
3068 PredPattern == AArch64SVEPredPattern::pow2)
3069 return IC.replaceInstUsesWith(I&: II, V: ConstantInt::getTrue(Ty: II.getType()));
3070 return std::nullopt;
3071}
3072
3073static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
3074 IntrinsicInst &II,
3075 unsigned NumBits) {
3076 Value *Passthru = II.getOperand(i_nocapture: 0);
3077 Value *Pg = II.getOperand(i_nocapture: 1);
3078 Value *Op = II.getOperand(i_nocapture: 2);
3079
3080 // Convert UXT[BHW] to AND.
3081 if (isa<UndefValue>(Val: Passthru) || isAllActivePredicate(Pred: Pg)) {
3082 auto *Ty = cast<VectorType>(Val: II.getType());
3083 auto MaskValue = APInt::getLowBitsSet(numBits: Ty->getScalarSizeInBits(), loBitsSet: NumBits);
3084 auto *Mask = ConstantInt::get(Ty, V: MaskValue);
3085 auto *And = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_and_u, OverloadTypes: {Ty},
3086 Args: {Pg, Op, Mask});
3087 return IC.replaceInstUsesWith(I&: II, V: And);
3088 }
3089
3090 return std::nullopt;
3091}
3092
3093static std::optional<Instruction *>
3094instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II) {
3095 SMEAttrs FnSMEAttrs(*II.getFunction());
3096 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
3097 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
3098 return IC.replaceInstUsesWith(
3099 I&: II, V: ConstantInt::getBool(Ty: II.getType(), V: IsStreaming));
3100 return std::nullopt;
3101}
3102
3103std::optional<Instruction *>
3104AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
3105 IntrinsicInst &II) const {
3106 const SVEIntrinsicInfo &IInfo = constructSVEIntrinsicInfo(II);
3107 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
3108 return I;
3109
3110 Intrinsic::ID IID = II.getIntrinsicID();
3111 switch (IID) {
3112 default:
3113 break;
3114 case Intrinsic::aarch64_dmb:
3115 return instCombineDMB(IC, II);
3116 case Intrinsic::aarch64_neon_fmaxnm:
3117 case Intrinsic::aarch64_neon_fminnm:
3118 return instCombineMaxMinNM(IC, II);
3119 case Intrinsic::aarch64_sve_convert_from_svbool:
3120 return instCombineConvertFromSVBool(IC, II);
3121 case Intrinsic::aarch64_sve_dup:
3122 return instCombineSVEDup(IC, II);
3123 case Intrinsic::aarch64_sve_dup_x:
3124 return instCombineSVEDupX(IC, II);
3125 case Intrinsic::aarch64_sve_cmpne:
3126 case Intrinsic::aarch64_sve_cmpne_wide:
3127 return instCombineSVECmpNE(IC, II);
3128 case Intrinsic::aarch64_sve_rdffr:
3129 return instCombineRDFFR(IC, II);
3130 case Intrinsic::aarch64_sve_lasta:
3131 case Intrinsic::aarch64_sve_lastb:
3132 return instCombineSVELast(IC, II);
3133 case Intrinsic::aarch64_sve_clasta_n:
3134 case Intrinsic::aarch64_sve_clastb_n:
3135 return instCombineSVECondLast(IC, II);
3136 case Intrinsic::aarch64_sve_cntd:
3137 return instCombineSVECntElts(IC, II, NumElts: 2);
3138 case Intrinsic::aarch64_sve_cntw:
3139 return instCombineSVECntElts(IC, II, NumElts: 4);
3140 case Intrinsic::aarch64_sve_cnth:
3141 return instCombineSVECntElts(IC, II, NumElts: 8);
3142 case Intrinsic::aarch64_sve_cntb:
3143 return instCombineSVECntElts(IC, II, NumElts: 16);
3144 case Intrinsic::aarch64_sme_cntsd:
3145 return instCombineSMECntsd(IC, II, ST);
3146 case Intrinsic::aarch64_sve_ptest_any:
3147 case Intrinsic::aarch64_sve_ptest_first:
3148 case Intrinsic::aarch64_sve_ptest_last:
3149 return instCombineSVEPTest(IC, II);
3150 case Intrinsic::aarch64_sve_fadd:
3151 return instCombineSVEVectorFAdd(IC, II);
3152 case Intrinsic::aarch64_sve_fadd_u:
3153 return instCombineSVEVectorFAddU(IC, II);
3154 case Intrinsic::aarch64_sve_fmul_u:
3155 return instCombineSVEVectorBinOp(IC, II);
3156 case Intrinsic::aarch64_sve_fsub:
3157 return instCombineSVEVectorFSub(IC, II);
3158 case Intrinsic::aarch64_sve_fsub_u:
3159 return instCombineSVEVectorFSubU(IC, II);
3160 case Intrinsic::aarch64_sve_add:
3161 return instCombineSVEVectorAdd(IC, II);
3162 case Intrinsic::aarch64_sve_add_u:
3163 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
3164 Intrinsic::aarch64_sve_mla_u>(
3165 IC, II, MergeIntoAddendOp: true);
3166 case Intrinsic::aarch64_sve_mla_u:
3167 return instCombineSVEVectorMlaU(IC, II);
3168 case Intrinsic::aarch64_sve_sub:
3169 return instCombineSVEVectorSub(IC, II);
3170 case Intrinsic::aarch64_sve_sub_u:
3171 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
3172 Intrinsic::aarch64_sve_mls_u>(
3173 IC, II, MergeIntoAddendOp: true);
3174 case Intrinsic::aarch64_sve_tbl:
3175 return instCombineSVETBL(IC, II);
3176 case Intrinsic::aarch64_sve_uunpkhi:
3177 case Intrinsic::aarch64_sve_uunpklo:
3178 case Intrinsic::aarch64_sve_sunpkhi:
3179 case Intrinsic::aarch64_sve_sunpklo:
3180 return instCombineSVEUnpack(IC, II);
3181 case Intrinsic::aarch64_sve_uzp1:
3182 return instCombineSVEUzp1(IC, II);
3183 case Intrinsic::aarch64_sve_zip1:
3184 case Intrinsic::aarch64_sve_zip2:
3185 return instCombineSVEZip(IC, II);
3186 case Intrinsic::aarch64_sve_ld1_gather_index:
3187 return instCombineLD1GatherIndex(IC, II);
3188 case Intrinsic::aarch64_sve_st1_scatter_index:
3189 return instCombineST1ScatterIndex(IC, II);
3190 case Intrinsic::aarch64_sve_ld1:
3191 return instCombineSVELD1(IC, II, DL);
3192 case Intrinsic::aarch64_sve_st1:
3193 return instCombineSVEST1(IC, II, DL);
3194 case Intrinsic::aarch64_sve_sdiv:
3195 return instCombineSVESDIV(IC, II);
3196 case Intrinsic::aarch64_sve_sel:
3197 return instCombineSVESel(IC, II);
3198 case Intrinsic::aarch64_sve_srshl:
3199 return instCombineSVESrshl(IC, II);
3200 case Intrinsic::aarch64_sve_dupq_lane:
3201 return instCombineSVEDupqLane(IC, II);
3202 case Intrinsic::aarch64_sve_insr:
3203 return instCombineSVEInsr(IC, II);
3204 case Intrinsic::aarch64_sve_whilelo:
3205 return instCombineWhilelo(IC, II);
3206 case Intrinsic::aarch64_sve_ptrue:
3207 return instCombinePTrue(IC, II);
3208 case Intrinsic::aarch64_sve_uxtb:
3209 return instCombineSVEUxt(IC, II, NumBits: 8);
3210 case Intrinsic::aarch64_sve_uxth:
3211 return instCombineSVEUxt(IC, II, NumBits: 16);
3212 case Intrinsic::aarch64_sve_uxtw:
3213 return instCombineSVEUxt(IC, II, NumBits: 32);
3214 case Intrinsic::aarch64_sme_in_streaming_mode:
3215 return instCombineInStreamingMode(IC, II);
3216 }
3217
3218 return std::nullopt;
3219}
3220
3221std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
3222 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
3223 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
3224 std::function<void(Instruction *, unsigned, APInt, APInt &)>
3225 SimplifyAndSetOp) const {
3226 switch (II.getIntrinsicID()) {
3227 default:
3228 break;
3229 case Intrinsic::aarch64_neon_fcvtxn:
3230 case Intrinsic::aarch64_neon_rshrn:
3231 case Intrinsic::aarch64_neon_sqrshrn:
3232 case Intrinsic::aarch64_neon_sqrshrun:
3233 case Intrinsic::aarch64_neon_sqshrn:
3234 case Intrinsic::aarch64_neon_sqshrun:
3235 case Intrinsic::aarch64_neon_sqxtn:
3236 case Intrinsic::aarch64_neon_sqxtun:
3237 case Intrinsic::aarch64_neon_uqrshrn:
3238 case Intrinsic::aarch64_neon_uqshrn:
3239 case Intrinsic::aarch64_neon_uqxtn:
3240 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
3241 break;
3242 }
3243
3244 return std::nullopt;
3245}
3246
3247bool AArch64TTIImpl::enableScalableVectorization() const {
3248 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3249 EnableScalableAutovecInStreamingMode);
3250}
3251
3252TypeSize
3253AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
3254 switch (K) {
3255 case TargetTransformInfo::RGK_Scalar:
3256 return TypeSize::getFixed(ExactSize: 64);
3257 case TargetTransformInfo::RGK_FixedWidthVector:
3258 if (ST->useSVEForFixedLengthVectors() &&
3259 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
3260 return TypeSize::getFixed(
3261 ExactSize: std::max(a: ST->getMinSVEVectorSizeInBits(), b: 128u));
3262 else if (ST->isNeonAvailable())
3263 return TypeSize::getFixed(ExactSize: 128);
3264 else
3265 return TypeSize::getFixed(ExactSize: 0);
3266 case TargetTransformInfo::RGK_ScalableVector:
3267 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3268 EnableScalableAutovecInStreamingMode))
3269 return TypeSize::getScalable(MinimumSize: 128);
3270 else
3271 return TypeSize::getScalable(MinimumSize: 0);
3272 }
3273 llvm_unreachable("Unsupported register kind");
3274}
3275
3276bool AArch64TTIImpl::isSingleExtWideningInstruction(
3277 unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
3278 Type *SrcOverrideTy) const {
3279 // A helper that returns a vector type from the given type. The number of
3280 // elements in type Ty determines the vector width.
3281 auto toVectorTy = [&](Type *ArgTy) {
3282 return VectorType::get(ElementType: ArgTy->getScalarType(),
3283 EC: cast<VectorType>(Val: DstTy)->getElementCount());
3284 };
3285
3286 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3287 // i32, i64]. SVE doesn't generally have the same set of instructions to
3288 // perform an extend with the add/sub/mul. There are SMULLB style
3289 // instructions, but they operate on top/bottom, requiring some sort of lane
3290 // interleaving to be used with zext/sext.
3291 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3292 if (!useNeonVector(Ty: DstTy) || Args.size() != 2 ||
3293 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3294 return false;
3295
3296 Type *SrcTy = SrcOverrideTy;
3297 switch (Opcode) {
3298 case Instruction::Add: // UADDW(2), SADDW(2).
3299 case Instruction::Sub: { // USUBW(2), SSUBW(2).
3300 // The second operand needs to be an extend
3301 if (isa<SExtInst>(Val: Args[1]) || isa<ZExtInst>(Val: Args[1])) {
3302 if (!SrcTy)
3303 SrcTy =
3304 toVectorTy(cast<Instruction>(Val: Args[1])->getOperand(i: 0)->getType());
3305 break;
3306 }
3307
3308 if (Opcode == Instruction::Sub)
3309 return false;
3310
3311 // UADDW(2), SADDW(2) can be commutted.
3312 if (isa<SExtInst>(Val: Args[0]) || isa<ZExtInst>(Val: Args[0])) {
3313 if (!SrcTy)
3314 SrcTy =
3315 toVectorTy(cast<Instruction>(Val: Args[0])->getOperand(i: 0)->getType());
3316 break;
3317 }
3318 return false;
3319 }
3320 default:
3321 return false;
3322 }
3323
3324 // Legalize the destination type and ensure it can be used in a widening
3325 // operation.
3326 auto DstTyL = getTypeLegalizationCost(Ty: DstTy);
3327 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3328 return false;
3329
3330 // Legalize the source type and ensure it can be used in a widening
3331 // operation.
3332 assert(SrcTy && "Expected some SrcTy");
3333 auto SrcTyL = getTypeLegalizationCost(Ty: SrcTy);
3334 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3335 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3336 return false;
3337
3338 // Get the total number of vector elements in the legalized types.
3339 InstructionCost NumDstEls =
3340 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3341 InstructionCost NumSrcEls =
3342 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3343
3344 // Return true if the legalized types have the same number of vector elements
3345 // and the destination element type size is twice that of the source type.
3346 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3347}
3348
3349Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
3350 ArrayRef<const Value *> Args,
3351 Type *SrcOverrideTy) const {
3352 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
3353 Opcode != Instruction::Mul)
3354 return nullptr;
3355
3356 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3357 // i32, i64]. SVE doesn't generally have the same set of instructions to
3358 // perform an extend with the add/sub/mul. There are SMULLB style
3359 // instructions, but they operate on top/bottom, requiring some sort of lane
3360 // interleaving to be used with zext/sext.
3361 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3362 if (!useNeonVector(Ty: DstTy) || Args.size() != 2 ||
3363 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3364 return nullptr;
3365
3366 auto getScalarSizeWithOverride = [&](const Value *V) {
3367 if (SrcOverrideTy)
3368 return SrcOverrideTy->getScalarSizeInBits();
3369 return cast<Instruction>(Val: V)
3370 ->getOperand(i: 0)
3371 ->getType()
3372 ->getScalarSizeInBits();
3373 };
3374
3375 unsigned MaxEltSize = 0;
3376 if ((isa<SExtInst>(Val: Args[0]) && isa<SExtInst>(Val: Args[1])) ||
3377 (isa<ZExtInst>(Val: Args[0]) && isa<ZExtInst>(Val: Args[1]))) {
3378 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3379 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3380 MaxEltSize = std::max(a: EltSize0, b: EltSize1);
3381 } else if (isa<SExtInst, ZExtInst>(Val: Args[0]) &&
3382 isa<SExtInst, ZExtInst>(Val: Args[1])) {
3383 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3384 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3385 // mul(sext, zext) will become smull(sext, zext) if the extends are large
3386 // enough.
3387 if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
3388 return nullptr;
3389 MaxEltSize = DstEltSize / 2;
3390 } else if (Opcode == Instruction::Mul &&
3391 (isa<ZExtInst>(Val: Args[0]) || isa<ZExtInst>(Val: Args[1]))) {
3392 // If one of the operands is a Zext and the other has enough zero bits
3393 // to be treated as unsigned, we can still generate a umull, meaning the
3394 // zext is free.
3395 KnownBits Known =
3396 computeKnownBits(V: isa<ZExtInst>(Val: Args[0]) ? Args[1] : Args[0], DL);
3397 if (Args[0]->getType()->getScalarSizeInBits() -
3398 Known.Zero.countLeadingOnes() >
3399 DstTy->getScalarSizeInBits() / 2)
3400 return nullptr;
3401
3402 MaxEltSize =
3403 getScalarSizeWithOverride(isa<ZExtInst>(Val: Args[0]) ? Args[0] : Args[1]);
3404 } else
3405 return nullptr;
3406
3407 if (MaxEltSize * 2 > DstEltSize)
3408 return nullptr;
3409
3410 Type *ExtTy = DstTy->getWithNewBitWidth(NewBitWidth: MaxEltSize * 2);
3411 if (ExtTy->getPrimitiveSizeInBits() <= 64)
3412 return nullptr;
3413 return ExtTy;
3414}
3415
3416// s/urhadd instructions implement the following pattern, making the
3417// extends free:
3418// %x = add ((zext i8 -> i16), 1)
3419// %y = (zext i8 -> i16)
3420// trunc i16 (lshr (add %x, %y), 1) -> i8
3421//
3422bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
3423 Type *Src) const {
3424 // The source should be a legal vector type.
3425 if (!Src->isVectorTy() || !TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: Src)) ||
3426 (Src->isScalableTy() && !ST->hasSVE2()))
3427 return false;
3428
3429 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
3430 return false;
3431
3432 // Look for trunc/shl/add before trying to match the pattern.
3433 const Instruction *Add = ExtUser;
3434 auto *AddUser =
3435 dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser());
3436 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3437 Add = AddUser;
3438
3439 auto *Shr = dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser());
3440 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3441 return false;
3442
3443 auto *Trunc = dyn_cast_or_null<Instruction>(Val: Shr->getUniqueUndroppableUser());
3444 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3445 Src->getScalarSizeInBits() !=
3446 cast<CastInst>(Val: Trunc)->getDestTy()->getScalarSizeInBits())
3447 return false;
3448
3449 // Try to match the whole pattern. Ext could be either the first or second
3450 // m_ZExtOrSExt matched.
3451 Instruction *Ex1, *Ex2;
3452 if (!(match(V: Add, P: m_c_Add(L: m_Instruction(I&: Ex1),
3453 R: m_c_Add(L: m_Instruction(I&: Ex2), R: m_One())))))
3454 return false;
3455
3456 // Ensure both extends are of the same type
3457 if (match(V: Ex1, P: m_ZExtOrSExt(Op: m_Value())) &&
3458 Ex1->getOpcode() == Ex2->getOpcode())
3459 return true;
3460
3461 return false;
3462}
3463
3464InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
3465 Type *Src,
3466 TTI::CastContextHint CCH,
3467 TTI::TargetCostKind CostKind,
3468 const Instruction *I) const {
3469 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3470 assert(ISD && "Invalid opcode");
3471 // If the cast is observable, and it is used by a widening instruction (e.g.,
3472 // uaddl, saddw, etc.), it may be free.
3473 if (I && I->hasOneUser()) {
3474 auto *SingleUser = cast<Instruction>(Val: *I->user_begin());
3475 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3476 if (Type *ExtTy = isBinExtWideningInstruction(
3477 Opcode: SingleUser->getOpcode(), DstTy: Dst, Args: Operands,
3478 SrcOverrideTy: Src != I->getOperand(i: 0)->getType() ? Src : nullptr)) {
3479 // The cost from Src->Src*2 needs to be added if required, the cost from
3480 // Src*2->ExtTy is free.
3481 if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
3482 Type *DoubleSrcTy =
3483 Src->getWithNewBitWidth(NewBitWidth: Src->getScalarSizeInBits() * 2);
3484 return getCastInstrCost(Opcode, Dst: DoubleSrcTy, Src,
3485 CCH: TTI::CastContextHint::None, CostKind);
3486 }
3487
3488 return 0;
3489 }
3490
3491 if (isSingleExtWideningInstruction(
3492 Opcode: SingleUser->getOpcode(), DstTy: Dst, Args: Operands,
3493 SrcOverrideTy: Src != I->getOperand(i: 0)->getType() ? Src : nullptr)) {
3494 // For adds only count the second operand as free if both operands are
3495 // extends but not the same operation. (i.e both operands are not free in
3496 // add(sext, zext)).
3497 if (SingleUser->getOpcode() == Instruction::Add) {
3498 if (I == SingleUser->getOperand(i: 1) ||
3499 (isa<CastInst>(Val: SingleUser->getOperand(i: 1)) &&
3500 cast<CastInst>(Val: SingleUser->getOperand(i: 1))->getOpcode() == Opcode))
3501 return 0;
3502 } else {
3503 // Others are free so long as isSingleExtWideningInstruction
3504 // returned true.
3505 return 0;
3506 }
3507 }
3508
3509 // The cast will be free for the s/urhadd instructions
3510 if ((isa<ZExtInst>(Val: I) || isa<SExtInst>(Val: I)) &&
3511 isExtPartOfAvgExpr(ExtUser: SingleUser, Dst, Src))
3512 return 0;
3513 }
3514
3515 EVT SrcTy = TLI->getValueType(DL, Ty: Src);
3516 EVT DstTy = TLI->getValueType(DL, Ty: Dst);
3517
3518 if (!SrcTy.isSimple() || !DstTy.isSimple())
3519 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3520
3521 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3522 // we use fcvtx under SVE2. Give them invalid costs.
3523 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3524 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3525 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3526 return InstructionCost::getInvalid();
3527
3528 static const TypeConversionCostTblEntry BF16Tbl[] = {
3529 {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f32, .Cost: 1}, // bfcvt
3530 {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f64, .Cost: 1}, // bfcvt
3531 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f32, .Cost: 1}, // bfcvtn
3532 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f32, .Cost: 2}, // bfcvtn+bfcvtn2
3533 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f64, .Cost: 2}, // bfcvtn+fcvtn
3534 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f64, .Cost: 3}, // fcvtn+fcvtl2+bfcvtn
3535 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f64, .Cost: 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3536 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2bf16, .Src: MVT::nxv2f32, .Cost: 1}, // bfcvt
3537 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4bf16, .Src: MVT::nxv4f32, .Cost: 1}, // bfcvt
3538 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8bf16, .Src: MVT::nxv8f32, .Cost: 3}, // bfcvt+bfcvt+uzp1
3539 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2bf16, .Src: MVT::nxv2f64, .Cost: 2}, // fcvtx+bfcvt
3540 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4bf16, .Src: MVT::nxv4f64, .Cost: 5}, // 2*fcvtx+2*bfcvt+uzp1
3541 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8bf16, .Src: MVT::nxv8f64, .Cost: 11}, // 4*fcvt+4*bfcvt+3*uzp
3542 };
3543
3544 if (ST->hasBF16())
3545 if (const auto *Entry = ConvertCostTableLookup(
3546 Table: BF16Tbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
3547 return Entry->Cost;
3548
3549 // We have to estimate a cost of fixed length operation upon
3550 // SVE registers(operations) with the number of registers required
3551 // for a fixed type to be represented upon SVE registers.
3552 EVT WiderTy = SrcTy.bitsGT(VT: DstTy) ? SrcTy : DstTy;
3553 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3554 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3555 ST->useSVEForFixedLengthVectors(VT: WiderTy)) {
3556 std::pair<InstructionCost, MVT> LT =
3557 getTypeLegalizationCost(Ty: WiderTy.getTypeForEVT(Context&: Dst->getContext()));
3558 unsigned NumElements =
3559 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3560 return LT.first *
3561 getCastInstrCost(
3562 Opcode,
3563 Dst: ScalableVectorType::get(ElementType: Dst->getScalarType(), MinNumElts: NumElements),
3564 Src: ScalableVectorType::get(ElementType: Src->getScalarType(), MinNumElts: NumElements), CCH,
3565 CostKind, I);
3566 }
3567
3568 // Symbolic constants for the SVE sitofp/uitofp entries in the table below
3569 // The cost of unpacking twice is artificially increased for now in order
3570 // to avoid regressions against NEON, which will use tbl instructions directly
3571 // instead of multiple layers of [s|u]unpk[lo|hi].
3572 // We use the unpacks in cases where the destination type is illegal and
3573 // requires splitting of the input, even if the input type itself is legal.
3574 const unsigned int SVE_EXT_COST = 1;
3575 const unsigned int SVE_FCVT_COST = 1;
3576 const unsigned int SVE_UNPACK_ONCE = 4;
3577 const unsigned int SVE_UNPACK_TWICE = 16;
3578
3579 static const TypeConversionCostTblEntry ConversionTbl[] = {
3580 {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i64, .Cost: 1}, // xtn
3581 {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i16, .Src: MVT::v2i64, .Cost: 1}, // xtn
3582 {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i32, .Src: MVT::v2i64, .Cost: 1}, // xtn
3583 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i32, .Cost: 1}, // xtn
3584 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i64, .Cost: 3}, // 2 xtn + 1 uzp1
3585 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i32, .Cost: 1}, // xtn
3586 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i64, .Cost: 2}, // 1 uzp1 + 1 xtn
3587 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: 1}, // 1 uzp1
3588 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i16, .Cost: 1}, // 1 xtn
3589 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i32, .Cost: 2}, // 1 uzp1 + 1 xtn
3590 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i64, .Cost: 4}, // 3 x uzp1 + xtn
3591 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: 1}, // 1 uzp1
3592 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i64, .Cost: 3}, // 3 x uzp1
3593 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i32, .Src: MVT::v8i64, .Cost: 2}, // 2 x uzp1
3594 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: 1}, // uzp1
3595 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: 3}, // (2 + 1) x uzp1
3596 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i64, .Cost: 7}, // (4 + 2 + 1) x uzp1
3597 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: 2}, // 2 x uzp1
3598 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i64, .Cost: 6}, // (4 + 2) x uzp1
3599 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i32, .Src: MVT::v16i64, .Cost: 4}, // 4 x uzp1
3600
3601 // Truncations on nxvmiN
3602 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i8, .Cost: 2},
3603 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i16, .Cost: 2},
3604 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i32, .Cost: 2},
3605 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i64, .Cost: 2},
3606 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i8, .Cost: 2},
3607 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i16, .Cost: 2},
3608 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i32, .Cost: 2},
3609 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i64, .Cost: 5},
3610 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i8, .Cost: 2},
3611 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i16, .Cost: 2},
3612 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i32, .Cost: 5},
3613 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i64, .Cost: 11},
3614 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i1, .Src: MVT::nxv16i8, .Cost: 2},
3615 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i16, .Cost: 0},
3616 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i32, .Cost: 0},
3617 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i64, .Cost: 0},
3618 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i16, .Src: MVT::nxv2i32, .Cost: 0},
3619 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i16, .Src: MVT::nxv2i64, .Cost: 0},
3620 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i32, .Src: MVT::nxv2i64, .Cost: 0},
3621 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i16, .Cost: 0},
3622 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i32, .Cost: 0},
3623 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i64, .Cost: 1},
3624 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i16, .Src: MVT::nxv4i32, .Cost: 0},
3625 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i16, .Src: MVT::nxv4i64, .Cost: 1},
3626 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i32, .Src: MVT::nxv4i64, .Cost: 1},
3627 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i16, .Cost: 0},
3628 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i32, .Cost: 1},
3629 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i64, .Cost: 3},
3630 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i16, .Src: MVT::nxv8i32, .Cost: 1},
3631 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i16, .Src: MVT::nxv8i64, .Cost: 3},
3632 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i16, .Cost: 1},
3633 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i32, .Cost: 3},
3634 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i64, .Cost: 7},
3635
3636 // The number of shll instructions for the extension.
3637 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3},
3638 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3},
3639 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2},
3640 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2},
3641 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3},
3642 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3},
3643 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2},
3644 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2},
3645 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7},
3646 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7},
3647 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6},
3648 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6},
3649 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2},
3650 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2},
3651 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6},
3652 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6},
3653
3654 // FP Ext and trunc
3655 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::f32, .Cost: 1}, // fcvt
3656 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2f32, .Cost: 1}, // fcvtl
3657 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4f32, .Cost: 2}, // fcvtl+fcvtl2
3658 // FP16
3659 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f32, .Src: MVT::f16, .Cost: 1}, // fcvt
3660 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::f16, .Cost: 1}, // fcvt
3661 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f32, .Src: MVT::v4f16, .Cost: 1}, // fcvtl
3662 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f32, .Src: MVT::v8f16, .Cost: 2}, // fcvtl+fcvtl2
3663 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2f16, .Cost: 2}, // fcvtl+fcvtl
3664 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4f16, .Cost: 3}, // fcvtl+fcvtl2+fcvtl
3665 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8f16, .Cost: 6}, // 2 * fcvtl+fcvtl2+fcvtl
3666 // BF16 (uses shift)
3667 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f32, .Src: MVT::bf16, .Cost: 1}, // shl
3668 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::bf16, .Cost: 2}, // shl+fcvt
3669 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f32, .Src: MVT::v4bf16, .Cost: 1}, // shll
3670 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f32, .Src: MVT::v8bf16, .Cost: 2}, // shll+shll2
3671 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2bf16, .Cost: 2}, // shll+fcvtl
3672 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4bf16, .Cost: 3}, // shll+fcvtl+fcvtl2
3673 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8bf16, .Cost: 6}, // 2 * shll+fcvtl+fcvtl2
3674 // FP Ext and trunc
3675 {.ISD: ISD::FP_ROUND, .Dst: MVT::f32, .Src: MVT::f64, .Cost: 1}, // fcvt
3676 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2f32, .Src: MVT::v2f64, .Cost: 1}, // fcvtn
3677 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f32, .Src: MVT::v4f64, .Cost: 2}, // fcvtn+fcvtn2
3678 // FP16
3679 {.ISD: ISD::FP_ROUND, .Dst: MVT::f16, .Src: MVT::f32, .Cost: 1}, // fcvt
3680 {.ISD: ISD::FP_ROUND, .Dst: MVT::f16, .Src: MVT::f64, .Cost: 1}, // fcvt
3681 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f16, .Src: MVT::v4f32, .Cost: 1}, // fcvtn
3682 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8f16, .Src: MVT::v8f32, .Cost: 2}, // fcvtn+fcvtn2
3683 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2f16, .Src: MVT::v2f64, .Cost: 2}, // fcvtn+fcvtn
3684 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f16, .Src: MVT::v4f64, .Cost: 3}, // fcvtn+fcvtn2+fcvtn
3685 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8f16, .Src: MVT::v8f64, .Cost: 6}, // 2 * fcvtn+fcvtn2+fcvtn
3686 // BF16 (more complex, with +bf16 is handled above)
3687 {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f32, .Cost: 8}, // Expansion is ~8 insns
3688 {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f64, .Cost: 9}, // fcvtn + above
3689 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f32, .Cost: 8},
3690 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f32, .Cost: 8},
3691 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f32, .Cost: 15},
3692 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f64, .Cost: 9},
3693 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f64, .Cost: 10},
3694 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f64, .Cost: 19},
3695
3696 // LowerVectorINT_TO_FP:
3697 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1},
3698 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1},
3699 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: 1},
3700 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1},
3701 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1},
3702 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: 1},
3703
3704 // SVE: to nxv2f16
3705 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i8,
3706 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3707 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3708 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3709 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3710 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i8,
3711 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3712 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3713 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3714 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3715
3716 // SVE: to nxv4f16
3717 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i8,
3718 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3719 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3720 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3721 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i8,
3722 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3723 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3724 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3725
3726 // SVE: to nxv8f16
3727 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i8,
3728 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3729 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i16, .Cost: SVE_FCVT_COST},
3730 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i8,
3731 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3732 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i16, .Cost: SVE_FCVT_COST},
3733
3734 // SVE: to nxv16f16
3735 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv16f16, .Src: MVT::nxv16i8,
3736 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3737 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv16f16, .Src: MVT::nxv16i8,
3738 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3739
3740 // Complex: to v2f32
3741 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3},
3742 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 3},
3743 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3},
3744 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 3},
3745
3746 // SVE: to nxv2f32
3747 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i8,
3748 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3749 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3750 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3751 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3752 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i8,
3753 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3754 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3755 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3756 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3757
3758 // Complex: to v4f32
3759 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 4},
3760 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2},
3761 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 3},
3762 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2},
3763
3764 // SVE: to nxv4f32
3765 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i8,
3766 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3767 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3768 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3769 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i8,
3770 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3771 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3772 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3773
3774 // Complex: to v8f32
3775 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10},
3776 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4},
3777 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10},
3778 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4},
3779
3780 // SVE: to nxv8f32
3781 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i8,
3782 .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3783 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i16,
3784 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3785 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i8,
3786 .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3787 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i16,
3788 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3789
3790 // SVE: to nxv16f32
3791 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv16f32, .Src: MVT::nxv16i8,
3792 .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3793 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv16f32, .Src: MVT::nxv16i8,
3794 .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3795
3796 // Complex: to v16f32
3797 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: 21},
3798 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: 21},
3799
3800 // Complex: to v2f64
3801 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4},
3802 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 4},
3803 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2},
3804 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4},
3805 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 4},
3806 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2},
3807
3808 // SVE: to nxv2f64
3809 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i8,
3810 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3811 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3812 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3813 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3814 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i8,
3815 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3816 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3817 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3818 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3819
3820 // Complex: to v4f64
3821 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: 4},
3822 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: 4},
3823
3824 // SVE: to nxv4f64
3825 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i8,
3826 .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3827 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i16,
3828 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3829 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i32,
3830 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3831 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i8,
3832 .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3833 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i16,
3834 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3835 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i32,
3836 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3837
3838 // SVE: to nxv8f64
3839 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i8,
3840 .Cost: SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3841 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i16,
3842 .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3843 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i8,
3844 .Cost: SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3845 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i16,
3846 .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3847
3848 // LowerVectorFP_TO_INT
3849 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f32, .Cost: 1},
3850 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1},
3851 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: 1},
3852 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f32, .Cost: 1},
3853 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1},
3854 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: 1},
3855
3856 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3857 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f32, .Cost: 2},
3858 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 1},
3859 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 1},
3860 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f32, .Cost: 2},
3861 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 1},
3862 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 1},
3863
3864 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3865 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2},
3866 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 2},
3867 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2},
3868 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 2},
3869
3870 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3871 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2},
3872 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f64, .Cost: 2},
3873 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f64, .Cost: 2},
3874 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2},
3875 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f64, .Cost: 2},
3876 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f64, .Cost: 2},
3877
3878 // Complex, from nxv2f32.
3879 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f32, .Cost: 1},
3880 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 1},
3881 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f32, .Cost: 1},
3882 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f32, .Cost: 1},
3883 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f32, .Cost: 1},
3884 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 1},
3885 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f32, .Cost: 1},
3886 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f32, .Cost: 1},
3887
3888 // Complex, from nxv2f64.
3889 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f64, .Cost: 1},
3890 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f64, .Cost: 1},
3891 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f64, .Cost: 1},
3892 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f64, .Cost: 1},
3893 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i1, .Src: MVT::nxv2f64, .Cost: 1},
3894 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f64, .Cost: 1},
3895 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f64, .Cost: 1},
3896 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f64, .Cost: 1},
3897 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f64, .Cost: 1},
3898 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i1, .Src: MVT::nxv2f64, .Cost: 1},
3899
3900 // Complex, from nxv4f32.
3901 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f32, .Cost: 4},
3902 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f32, .Cost: 1},
3903 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f32, .Cost: 1},
3904 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f32, .Cost: 1},
3905 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i1, .Src: MVT::nxv4f32, .Cost: 1},
3906 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f32, .Cost: 4},
3907 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f32, .Cost: 1},
3908 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f32, .Cost: 1},
3909 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f32, .Cost: 1},
3910 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i1, .Src: MVT::nxv4f32, .Cost: 1},
3911
3912 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3913 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f64, .Cost: 7},
3914 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f64, .Cost: 7},
3915 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f64, .Cost: 7},
3916 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f64, .Cost: 7},
3917
3918 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3919 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f64, .Cost: 3},
3920 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f64, .Cost: 3},
3921 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f64, .Cost: 3},
3922 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f64, .Cost: 3},
3923 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f64, .Cost: 3},
3924 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f64, .Cost: 3},
3925
3926 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3927 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f32, .Cost: 3},
3928 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f32, .Cost: 3},
3929 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f32, .Cost: 3},
3930 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f32, .Cost: 3},
3931
3932 // Complex, from nxv8f16.
3933 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i64, .Src: MVT::nxv8f16, .Cost: 10},
3934 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i32, .Src: MVT::nxv8f16, .Cost: 4},
3935 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f16, .Cost: 1},
3936 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f16, .Cost: 1},
3937 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i1, .Src: MVT::nxv8f16, .Cost: 1},
3938 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i64, .Src: MVT::nxv8f16, .Cost: 10},
3939 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i32, .Src: MVT::nxv8f16, .Cost: 4},
3940 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f16, .Cost: 1},
3941 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f16, .Cost: 1},
3942 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i1, .Src: MVT::nxv8f16, .Cost: 1},
3943
3944 // Complex, from nxv4f16.
3945 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f16, .Cost: 4},
3946 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f16, .Cost: 1},
3947 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 1},
3948 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f16, .Cost: 1},
3949 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f16, .Cost: 4},
3950 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f16, .Cost: 1},
3951 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 1},
3952 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f16, .Cost: 1},
3953
3954 // Complex, from nxv2f16.
3955 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f16, .Cost: 1},
3956 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f16, .Cost: 1},
3957 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 1},
3958 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f16, .Cost: 1},
3959 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f16, .Cost: 1},
3960 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f16, .Cost: 1},
3961 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 1},
3962 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f16, .Cost: 1},
3963
3964 // Truncate from nxvmf32 to nxvmf16.
3965 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f16, .Src: MVT::nxv2f32, .Cost: 1},
3966 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f16, .Src: MVT::nxv4f32, .Cost: 1},
3967 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f16, .Src: MVT::nxv8f32, .Cost: 3},
3968
3969 // Truncate from nxvmf32 to nxvmbf16.
3970 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2bf16, .Src: MVT::nxv2f32, .Cost: 8},
3971 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4bf16, .Src: MVT::nxv4f32, .Cost: 8},
3972 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8bf16, .Src: MVT::nxv8f32, .Cost: 17},
3973
3974 // Truncate from nxvmf64 to nxvmf16.
3975 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f16, .Src: MVT::nxv2f64, .Cost: 1},
3976 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f16, .Src: MVT::nxv4f64, .Cost: 3},
3977 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f16, .Src: MVT::nxv8f64, .Cost: 7},
3978
3979 // Truncate from nxvmf64 to nxvmbf16.
3980 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2bf16, .Src: MVT::nxv2f64, .Cost: 9},
3981 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4bf16, .Src: MVT::nxv4f64, .Cost: 19},
3982 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8bf16, .Src: MVT::nxv8f64, .Cost: 39},
3983
3984 // Truncate from nxvmf64 to nxvmf32.
3985 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2f64, .Cost: 1},
3986 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4f64, .Cost: 3},
3987 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8f64, .Cost: 6},
3988
3989 // Extend from nxvmf16 to nxvmf32.
3990 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2f16, .Cost: 1},
3991 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4f16, .Cost: 1},
3992 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8f16, .Cost: 2},
3993
3994 // Extend from nxvmbf16 to nxvmf32.
3995 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2bf16, .Cost: 1}, // lsl
3996 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4bf16, .Cost: 1}, // lsl
3997 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8bf16, .Cost: 4}, // unpck+unpck+lsl+lsl
3998
3999 // Extend from nxvmf16 to nxvmf64.
4000 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2f16, .Cost: 1},
4001 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4f16, .Cost: 2},
4002 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8f16, .Cost: 4},
4003
4004 // Extend from nxvmbf16 to nxvmf64.
4005 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2bf16, .Cost: 2}, // lsl+fcvt
4006 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4bf16, .Cost: 6}, // 2*unpck+2*lsl+2*fcvt
4007 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8bf16, .Cost: 14}, // 6*unpck+4*lsl+4*fcvt
4008
4009 // Extend from nxvmf32 to nxvmf64.
4010 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2f32, .Cost: 1},
4011 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4f32, .Cost: 2},
4012 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8f32, .Cost: 6},
4013
      // Bitcasts from integer to float
4015 {.ISD: ISD::BITCAST, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: 0},
4016 {.ISD: ISD::BITCAST, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: 0},
4017 {.ISD: ISD::BITCAST, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: 0},
4018
      // Bitcasts from float to integer
4020 {.ISD: ISD::BITCAST, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 0},
4021 {.ISD: ISD::BITCAST, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 0},
4022 {.ISD: ISD::BITCAST, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 0},
4023
4024 // Add cost for extending to illegal -too wide- scalable vectors.
4025 // zero/sign extend are implemented by multiple unpack operations,
4026 // where each operation has a cost of 1.
4027 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i16, .Src: MVT::nxv16i8, .Cost: 2},
4028 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i32, .Src: MVT::nxv16i8, .Cost: 6},
4029 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i64, .Src: MVT::nxv16i8, .Cost: 14},
4030 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i16, .Cost: 2},
4031 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv8i64, .Src: MVT::nxv8i16, .Cost: 6},
4032 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv4i64, .Src: MVT::nxv4i32, .Cost: 2},
4033
4034 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i16, .Src: MVT::nxv16i8, .Cost: 2},
4035 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i32, .Src: MVT::nxv16i8, .Cost: 6},
4036 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i64, .Src: MVT::nxv16i8, .Cost: 14},
4037 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i16, .Cost: 2},
4038 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv8i64, .Src: MVT::nxv8i16, .Cost: 6},
4039 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv4i64, .Src: MVT::nxv4i32, .Cost: 2},
4040 };
4041
4042 if (const auto *Entry = ConvertCostTableLookup(
4043 Table: ConversionTbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
4044 return Entry->Cost;
4045
4046 static const TypeConversionCostTblEntry FP16Tbl[] = {
4047 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f16, .Cost: 1}, // fcvtzs
4048 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f16, .Cost: 1},
4049 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f16, .Cost: 1}, // fcvtzs
4050 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f16, .Cost: 1},
4051 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f16, .Cost: 2}, // fcvtl+fcvtzs
4052 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f16, .Cost: 2},
4053 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i8, .Src: MVT::v8f16, .Cost: 2}, // fcvtzs+xtn
4054 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i8, .Src: MVT::v8f16, .Cost: 2},
4055 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f16, .Cost: 1}, // fcvtzs
4056 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f16, .Cost: 1},
4057 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f16, .Cost: 4}, // 2*fcvtl+2*fcvtzs
4058 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f16, .Cost: 4},
4059 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v16f16, .Cost: 3}, // 2*fcvtzs+xtn
4060 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v16f16, .Cost: 3},
4061 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v16f16, .Cost: 2}, // 2*fcvtzs
4062 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v16f16, .Cost: 2},
4063 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i32, .Src: MVT::v16f16, .Cost: 8}, // 4*fcvtl+4*fcvtzs
4064 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i32, .Src: MVT::v16f16, .Cost: 8},
4065 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f16, .Src: MVT::v8i8, .Cost: 2}, // ushll + ucvtf
4066 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f16, .Src: MVT::v8i8, .Cost: 2}, // sshll + scvtf
4067 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f16, .Src: MVT::v16i8, .Cost: 4}, // 2 * ushl(2) + 2 * ucvtf
4068 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f16, .Src: MVT::v16i8, .Cost: 4}, // 2 * sshl(2) + 2 * scvtf
4069 };
4070
4071 if (ST->hasFullFP16())
4072 if (const auto *Entry = ConvertCostTableLookup(
4073 Table: FP16Tbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
4074 return Entry->Cost;
4075
4076 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
4077 // double-rounding issues.
4078 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
4079 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
4080 isa<FixedVectorType>(Val: Dst) && isa<FixedVectorType>(Val: Src))
4081 return cast<FixedVectorType>(Val: Dst)->getNumElements() *
4082 getCastInstrCost(Opcode, Dst: Dst->getScalarType(),
4083 Src: Src->getScalarType(), CCH, CostKind) +
4084 BaseT::getScalarizationOverhead(InTy: cast<FixedVectorType>(Val: Src), Insert: false,
4085 Extract: true, CostKind) +
4086 BaseT::getScalarizationOverhead(InTy: cast<FixedVectorType>(Val: Dst), Insert: true,
4087 Extract: false, CostKind);
4088
4089 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
4090 CCH == TTI::CastContextHint::Masked &&
4091 ST->isSVEorStreamingSVEAvailable() &&
4092 TLI->getTypeAction(Context&: Src->getContext(), VT: SrcTy) ==
4093 TargetLowering::TypePromoteInteger &&
4094 TLI->getTypeAction(Context&: Dst->getContext(), VT: DstTy) ==
4095 TargetLowering::TypeSplitVector) {
4096 // The standard behaviour in the backend for these cases is to split the
4097 // extend up into two parts:
4098 // 1. Perform an extending load or masked load up to the legal type.
4099 // 2. Extend the loaded data to the final type.
4100 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: Src);
4101 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Context&: Src->getContext());
4102 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
4103 Opcode, Dst: LegalTy, Src, CCH, CostKind, I);
4104 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
4105 Opcode, Dst, Src: LegalTy, CCH: TTI::CastContextHint::None, CostKind, I);
4106 return Part1 + Part2;
4107 }
4108
4109 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
4110 // but we also want to include the TTI::CastContextHint::Masked case too.
4111 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
4112 CCH == TTI::CastContextHint::Masked &&
4113 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(VT: DstTy))
4114 CCH = TTI::CastContextHint::Normal;
4115
4116 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
4117}
4118
4119InstructionCost
4120AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
4121 VectorType *VecTy, unsigned Index,
4122 TTI::TargetCostKind CostKind) const {
4123
4124 // Make sure we were given a valid extend opcode.
4125 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
4126 "Invalid opcode");
4127
4128 // We are extending an element we extract from a vector, so the source type
4129 // of the extend is the element type of the vector.
4130 auto *Src = VecTy->getElementType();
4131
4132 // Sign- and zero-extends are for integer types only.
4133 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
4134
4135 // Get the cost for the extract. We compute the cost (if any) for the extend
4136 // below.
4137 InstructionCost Cost = getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy,
4138 CostKind, Index, Op0: nullptr, Op1: nullptr);
4139
4140 // Legalize the types.
4141 auto VecLT = getTypeLegalizationCost(Ty: VecTy);
4142 auto DstVT = TLI->getValueType(DL, Ty: Dst);
4143 auto SrcVT = TLI->getValueType(DL, Ty: Src);
4144
4145 // If the resulting type is still a vector and the destination type is legal,
4146 // we may get the extension for free. If not, get the default cost for the
4147 // extend.
4148 if (!VecLT.second.isVector() || !TLI->isTypeLegal(VT: DstVT))
4149 return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
4150 CostKind);
4151
4152 // The destination type should be larger than the element type. If not, get
4153 // the default cost for the extend.
4154 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
4155 return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
4156 CostKind);
4157
4158 switch (Opcode) {
4159 default:
4160 llvm_unreachable("Opcode should be either SExt or ZExt");
4161
4162 // For sign-extends, we only need a smov, which performs the extension
4163 // automatically.
4164 case Instruction::SExt:
4165 return Cost;
4166
4167 // For zero-extends, the extend is performed automatically by a umov unless
4168 // the destination type is i64 and the element type is i8 or i16.
4169 case Instruction::ZExt:
4170 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
4171 return Cost;
4172 }
4173
4174 // If we are unable to perform the extend for free, get the default cost.
4175 return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
4176 CostKind);
4177}
4178
4179InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
4180 TTI::TargetCostKind CostKind,
4181 const Instruction *I) const {
4182 if (CostKind != TTI::TCK_RecipThroughput)
4183 return Opcode == Instruction::PHI ? 0 : 1;
4184 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
4185 // Branches are assumed to be predicted.
4186 return 0;
4187}
4188
/// Compute the cost of an insertelement/extractelement on vector type \p Val.
/// Called by getVectorInstrCost.
///
/// \param Opcode   Instruction opcode; only Instruction::ExtractElement is
///                 tested explicitly (for the fmul-fusion case below).
/// \param Val      The vector type operated on (asserted to be a vector).
/// \param CostKind Cost model in use; TCK_CodeSize gets its own constants.
/// \param Index    Lane index, or -1U, in which case all per-lane special
///                 cases in the first `if` are skipped.
/// \param I        Optional instruction being costed. When \p Scalar is null
///                 and the fusion check runs, it is cast to
///                 ExtractElementInst.
/// \param Scalar   Optional scalar value whose users are inspected by the
///                 fusion check.
/// \param ScalarUserAndIdx Tuples (S, U, L); the fusion check scans the users
///                 of S and records L as the lane associated with a matching
///                 fmul user.
/// \param VIC      Context hint; VectorInstrContext::Load selects the LD1
///                 single-lane costing.
/// \returns 0 for the cases treated as free, otherwise a cost derived from
///          ST->getVectorInsertExtractBaseCost() (or a small constant for
///          TCK_CodeSize).
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
    const Instruction *I, Value *Scalar,
    ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
    TTI::VectorInstrContext VIC) const {
  assert(Val->isVectorTy() && "This must be a vector type");

  // Per-lane special cases; only applicable when a concrete lane is given.
  if (Index != -1U) {
    // Legalize the type.
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. For fixed-width vectors we can normalize the
    // index to the new type.
    if (LT.second.isFixedLengthVector()) {
      unsigned Width = LT.second.getVectorNumElements();
      Index = Index % Width;
    }

    // The element at index zero is already inside the vector, so accessing it
    // is free for non-integer element types. For integer elements an explicit
    // FPR -> GPR move is needed, so lane 0 keeps a non-zero cost and falls
    // through to the checks below.
    if (Index == 0 && !Val->getScalarType()->isIntegerTy())
      return 0;

    // This is recognising a LD1 single-element structure to one lane of one
    // register instruction. I.e., if this is an `insertelement` instruction,
    // and its second operand is a load, then we will generate a LD1, which
    // are expensive instructions on some uArchs.
    if (VIC == TTI::VectorInstrContext::Load) {
      if (ST->hasFastLD1Single())
        return 0;
      return CostKind == TTI::TCK_CodeSize
                 ? 0
                 : ST->getVectorInsertExtractBaseCost() + 1;
    }

    // i1 inserts and extract will include an extra cset or cmp of the vector
    // value. Increase the cost by 1 to account.
    if (Val->getScalarSizeInBits() == 1)
      return CostKind == TTI::TCK_CodeSize
                 ? 2
                 : ST->getVectorInsertExtractBaseCost() + 1;

    // FIXME:
    // If the extract-element and insert-element instructions could be
    // simplified away (e.g., could be combined into users by looking at use-def
    // context), they have no cost. This is not done in the first place for
    // compile-time considerations.
  }

  // In case of Neon, if there exists extractelement from lane != 0 such that
  // 1. extractelement does not necessitate a move from vector_reg -> GPR.
  // 2. extractelement result feeds into fmul.
  // 3. Other operand of fmul is an extractelement from lane 0 or lane
  //    equivalent to 0.
  // then the extractelement can be merged with fmul in the backend and it
  // incurs no cost.
  // e.g.
  // define double @foo(<2 x double> %a) {
  //   %1 = extractelement <2 x double> %a, i32 0
  //   %2 = extractelement <2 x double> %a, i32 1
  //   %res = fmul double %1, %2
  //   ret double %res
  // }
  // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
  //
  // The lambda captures by reference, so it sees `Index` after the
  // normalization performed above. It is only invoked when Opcode is
  // ExtractElement and at least one of I / Scalar is non-null.
  auto ExtractCanFuseWithFmul = [&]() {
    // We bail out if the extract is from lane 0.
    if (Index == 0)
      return false;

    // Check if the scalar element type of the vector operand of ExtractElement
    // instruction is one of the allowed types (float, double, or half when
    // the subtarget has full FP16).
    auto IsAllowedScalarTy = [&](const Type *T) {
      return T->isFloatTy() || T->isDoubleTy() ||
             (T->isHalfTy() && ST->hasFullFP16());
    };

    // Check if the extractelement user is scalar fmul.
    auto IsUserFMulScalarTy = [](const Value *EEUser) {
      // Check if the user is scalar fmul.
      const auto *BO = dyn_cast<BinaryOperator>(Val: EEUser);
      return BO && BO->getOpcode() == BinaryOperator::FMul &&
             !BO->getType()->isVectorTy();
    };

    // Check if the extract index is from lane 0 or lane equivalent to 0 for a
    // certain scalar type and a certain vector register width, i.e. the bit
    // offset Idx * EltSz is a multiple of the fixed-width register size.
    auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
      auto RegWidth =
          getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedValue();
      return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
    };

    // Check if the type constraints on input vector type and result scalar type
    // of extractelement instruction are satisfied.
    if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
      return false;

    if (Scalar) {
      // Scalar-based query: every user of Scalar must be a scalar fmul.
      DenseMap<User *, unsigned> UserToExtractIdx;
      for (auto *U : Scalar->users()) {
        if (!IsUserFMulScalarTy(U))
          return false;
        // Recording entry for the user is important. Index value is not
        // important.
        UserToExtractIdx[U];
      }
      if (UserToExtractIdx.empty())
        return false;
      // For each (S, U, L) tuple, find the first user of S that is one of the
      // recorded fmul users and remember L for it.
      // NOTE(review): the inner loop variable `U` shadows the `U` of the
      // structured binding; the tuple's User element is therefore unused here.
      for (auto &[S, U, L] : ScalarUserAndIdx) {
        for (auto *U : S->users()) {
          if (UserToExtractIdx.contains(Val: U)) {
            auto *FMul = cast<BinaryOperator>(Val: U);
            auto *Op0 = FMul->getOperand(i_nocapture: 0);
            auto *Op1 = FMul->getOperand(i_nocapture: 1);
            // NOTE(review): this condition is always true (either both
            // operands equal S, or at least one of them differs from S), so
            // the first matching user always records L and ends the scan.
            // Confirm whether a stricter check was intended.
            if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
              UserToExtractIdx[U] = L;
              break;
            }
          }
        }
      }
      // Fusion requires, for every fmul user, that either this extract's lane
      // or the recorded lane is equivalent to lane 0.
      // NOTE(review): the first operand of the && does not depend on the loop
      // variable, so it evaluates the same on every iteration.
      for (auto &[U, L] : UserToExtractIdx) {
        if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
            !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
          return false;
      }
    } else {
      // Instruction-based query. I is non-null on this path because the caller
      // below requires (I || Scalar); cast<> additionally expects it to be an
      // ExtractElementInst.
      const auto *EE = cast<ExtractElementInst>(Val: I);

      // Only constant lane indices are considered.
      const auto *IdxOp = dyn_cast<ConstantInt>(Val: EE->getIndexOperand());
      if (!IdxOp)
        return false;

      // There must be at least one user, and every user must be a scalar fmul
      // whose other operand (if it is an extractelement) reads a lane
      // equivalent to 0.
      return !EE->users().empty() && all_of(Range: EE->users(), P: [&](const User *U) {
        if (!IsUserFMulScalarTy(U))
          return false;

        // Check if the other operand of extractelement is also extractelement
        // from lane equivalent to 0.
        const auto *BO = cast<BinaryOperator>(Val: U);
        const auto *OtherEE = dyn_cast<ExtractElementInst>(
            Val: BO->getOperand(i_nocapture: 0) == EE ? BO->getOperand(i_nocapture: 1) : BO->getOperand(i_nocapture: 0));
        if (OtherEE) {
          // NOTE(review): this inner IdxOp shadows the outer one and is used
          // only for the null check; the index is re-derived with cast<>
          // below.
          const auto *IdxOp = dyn_cast<ConstantInt>(Val: OtherEE->getIndexOperand());
          if (!IdxOp)
            return false;
          return IsExtractLaneEquivalentToZero(
              cast<ConstantInt>(Val: OtherEE->getIndexOperand())
                  ->getValue()
                  .getZExtValue(),
              OtherEE->getType()->getScalarSizeInBits());
        }
        // The other operand is not an extractelement: accepted as-is.
        return true;
      });
    }
    return true;
  };

  // An extract that can be folded into its fmul user(s) is free.
  if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
      ExtractCanFuseWithFmul())
    return 0;

  // All other insert/extracts cost this much.
  return CostKind == TTI::TCK_CodeSize ? 1
                                       : ST->getVectorInsertExtractBaseCost();
}
4362
4363InstructionCost AArch64TTIImpl::getVectorInstrCost(
4364 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4365 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
4366 // Treat insert at lane 0 into a poison vector as having zero cost. This
4367 // ensures vector broadcasts via an insert + shuffle (and will be lowered to a
4368 // single dup) are treated as cheap.
4369 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4370 isa<PoisonValue>(Val: Op0))
4371 return 0;
4372 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, I: nullptr,
4373 Scalar: nullptr, ScalarUserAndIdx: {}, VIC);
4374}
4375
4376InstructionCost AArch64TTIImpl::getVectorInstrCost(
4377 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4378 Value *Scalar, ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4379 TTI::VectorInstrContext VIC) const {
4380 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, I: nullptr, Scalar,
4381 ScalarUserAndIdx, VIC);
4382}
4383
4384InstructionCost
4385AArch64TTIImpl::getVectorInstrCost(const Instruction &I, Type *Val,
4386 TTI::TargetCostKind CostKind, unsigned Index,
4387 TTI::VectorInstrContext VIC) const {
4388 return getVectorInstrCostHelper(Opcode: I.getOpcode(), Val, CostKind, Index, I: &I,
4389 Scalar: nullptr, ScalarUserAndIdx: {}, VIC);
4390}
4391
4392InstructionCost
4393AArch64TTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
4394 TTI::TargetCostKind CostKind,
4395 unsigned Index) const {
4396 if (isa<FixedVectorType>(Val))
4397 return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind,
4398 Index);
4399
4400 // This typically requires both while and lastb instructions in order
4401 // to extract the last element. If this is in a loop the while
4402 // instruction can at least be hoisted out, although it will consume a
4403 // predicate register. The cost should be more expensive than the base
4404 // extract cost, which is 2 for most CPUs.
4405 return CostKind == TTI::TCK_CodeSize
4406 ? 2
4407 : ST->getVectorInsertExtractBaseCost() + 1;
4408}
4409
4410InstructionCost AArch64TTIImpl::getScalarizationOverhead(
4411 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4412 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
4413 TTI::VectorInstrContext VIC) const {
4414 if (isa<ScalableVectorType>(Val: Ty))
4415 return InstructionCost::getInvalid();
4416 if (Ty->getElementType()->isFloatingPointTy())
4417 return BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, Insert, Extract,
4418 CostKind);
4419 unsigned VecInstCost =
4420 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
4421 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4422}
4423
4424std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4425 Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4426 TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
4427 std::function<InstructionCost(Type *)> InstCost) const {
4428 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4429 return std::nullopt;
4430 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4431 return std::nullopt;
4432 // If we have +sve-b16b16 the operation can be promoted to SVE.
4433 if (CanUseSVE && ST->hasSVEB16B16() && ST->isNonStreamingSVEorSME2Available())
4434 return std::nullopt;
4435
4436 Type *PromotedTy = Ty->getWithNewType(EltTy: Type::getFloatTy(C&: Ty->getContext()));
4437 InstructionCost Cost = getCastInstrCost(Opcode: Instruction::FPExt, Dst: PromotedTy, Src: Ty,
4438 CCH: TTI::CastContextHint::None, CostKind);
4439 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4440 Cost *= 2;
4441 Cost += InstCost(PromotedTy);
4442 if (IncludeTrunc)
4443 Cost += getCastInstrCost(Opcode: Instruction::FPTrunc, Dst: Ty, Src: PromotedTy,
4444 CCH: TTI::CastContextHint::None, CostKind);
4445 return Cost;
4446}
4447
/// AArch64 cost of the arithmetic instruction \p Opcode on type \p Ty.
///
/// Overview of the flow below:
///  - <vscale x 1 x eltty> types get an invalid cost.
///  - Any cost kind other than TCK_RecipThroughput is delegated to BaseT.
///  - Floating-point ops on half/bfloat that need promotion return the
///    promotion cost; fp128 floating-point ops are costed as libcalls.
///  - Widening multiplies of extended operands are costed on the narrower
///    type plus an extend.
///  - Everything else is dispatched on the ISD opcode in the switch.
///
/// \p Op1Info / \p Op2Info describe the operands (constant, uniform,
/// power-of-2, ...). \p Args and \p CxtI are optional context.
InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {

  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
    if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
      return InstructionCost::getInvalid();

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
                                         Opd2Info: Op2Info, Args, CxtI);

  // Legalize the type.
  // LT.first is used below as the cost multiplier and LT.second as the
  // legalized machine value type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Increase the cost for half and bfloat types if not architecturally
  // supported.
  if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
      ISD == ISD::FDIV || ISD == ISD::FREM) {
    if (auto PromotedCost = getFP16BF16PromoteCost(
            Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
            // There is not native support for fdiv/frem even with +sve-b16b16.
            /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
            InstCost: [&](Type *PromotedTy) {
              return getArithmeticInstrCost(Opcode, Ty: PromotedTy, CostKind,
                                            Op1Info, Op2Info);
            }))
      return *PromotedCost;

    // fp128 all go via libcalls
    // Note: CostKind is always TCK_RecipThroughput at this point because of
    // the early return above, so the multiplier used here is 10.
    if (Ty->getScalarType()->isFP128Ty())
      return (CostKind == TTI::TCK_CodeSize ? 1 : 10) * LT.first;
  }

  // If the operation is a widening instruction (smull or umull) and both
  // operands are extends the cost can be cheaper by considering that the
  // operation will operate on the narrowest type size possible (double the
  // largest input size) and a further extend.
  if (Type *ExtTy = isBinExtWideningInstruction(Opcode, DstTy: Ty, Args)) {
    if (ExtTy != Ty)
      return getArithmeticInstrCost(Opcode, Ty: ExtTy, CostKind) +
             getCastInstrCost(Opcode: Instruction::ZExt, Dst: Ty, Src: ExtTy,
                              CCH: TTI::CastContextHint::None, CostKind);
    return LT.first;
  }

  switch (ISD) {
  default:
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
                                         Opd2Info: Op2Info);
  case ISD::ADD:
  case ISD::SUB:
    return LT.first; // Also works for i128
  case ISD::MUL:
    if (LT.second == MVT::v2i64) {
      // When SVE is available, then we can lower the v2i64 operation using
      // the SVE mul instruction, which has a lower cost.
      if (ST->hasSVE())
        return LT.first;

      // When SVE is not available, there is no MUL.2d instruction,
      // which means mul <2 x i64> is expensive as elements are extracted
      // from the vectors and the muls scalarized.
      // As getScalarizationOverhead is a bit too pessimistic, we
      // estimate the cost for a i64 vector directly here, which is:
      // - four 2-cost i64 extracts,
      // - two 2-cost i64 inserts, and
      // - two 1-cost muls.
      // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
      // LT.first = 2 the cost is 28.
      return cast<VectorType>(Val: Ty)->getElementCount().getKnownMinValue() *
             (getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind) +
              getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, CostKind, Index: -1,
                                 Op0: nullptr, Op1: nullptr) *
                  2 +
              getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty, CostKind, Index: -1,
                                 Op0: nullptr, Op1: nullptr));
    }
    return LT.first;
  case ISD::SREM:
  case ISD::SDIV:
    /*
    Notes for sdiv/srem specific costs:
    1. This only considers the cases where the divisor is constant, uniform and
       (pow-of-2/non-pow-of-2). Other cases are not important since they either
       result in some form of (ldr + adrp), corresponding to constant vectors, or
       scalarization of the division operation.
    2. Constant divisors, either negative in whole or partially, don't result in
       significantly different codegen as compared to positive constant divisors.
       So, we don't consider negative divisors separately.
    3. If the codegen is significantly different with SVE, it has been indicated
       using comments at appropriate places.

    sdiv specific cases:
    -----------------------------------------------------------------------
    codegen                       | pow-of-2               | Type
    -----------------------------------------------------------------------
    add + cmp + csel + asr        | Y                      | i64
    add + cmp + csel + asr        | Y                      | i32
    -----------------------------------------------------------------------

    srem specific cases:
    -----------------------------------------------------------------------
    codegen                       | pow-of-2               | Type
    -----------------------------------------------------------------------
    negs + and + and + csneg      | Y                      | i64
    negs + and + and + csneg      | Y                      | i32
    -----------------------------------------------------------------------

    other sdiv/srem cases:
    -------------------------------------------------------------------------
    common codegen            | + srem     | + sdiv      | pow-of-2  | Type
    -------------------------------------------------------------------------
    smulh + asr + add + add   | -          | -           | N         | i64
    smull + lsr + add + add   | -          | -           | N         | i32
    usra                      | and + sub  | sshr        | Y         | <2 x i64>
    2 * (scalar code)         | -          | -           | N         | <2 x i64>
    usra                      | bic + sub  | sshr + neg  | Y         | <4 x i32>
    smull2 + smull + uzp2     | mls        | -           | N         | <4 x i32>
           + sshr + usra      |            |             |           |
    -------------------------------------------------------------------------
    */
    if (Op2Info.isConstant() && Op2Info.isUniform()) {
      // Building-block costs for the expansions listed in the table above.
      // Operand properties are dropped so the recursive queries do not take
      // constant-operand special cases.
      InstructionCost AddCost =
          getArithmeticInstrCost(Opcode: Instruction::Add, Ty, CostKind,
                                 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
      InstructionCost AsrCost =
          getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind,
                                 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
      InstructionCost MulCost =
          getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind,
                                 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
      // add/cmp/csel/csneg should have similar cost while asr/negs/and should
      // have similar cost.
      auto VT = TLI->getValueType(DL, Ty);
      if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
        if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
          // Neg can be folded into the asr instruction.
          return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
                                  : (3 * AsrCost + AddCost);
        } else {
          return MulCost + AsrCost + 2 * AddCost;
        }
      } else if (VT.isVector()) {
        InstructionCost UsraCost = 2 * AsrCost;
        if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
          // Division with scalable types corresponds to native 'asrd'
          // instruction when SVE is available.
          // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)

          // One more for the negation in SDIV
          InstructionCost Cost =
              (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
          if (Ty->isScalableTy() && ST->hasSVE())
            Cost += 2 * AsrCost;
          else {
            Cost +=
                UsraCost +
                (ISD == ISD::SDIV
                     ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
                     : 2 * AddCost);
          }
          return Cost;
        } else if (LT.second == MVT::v2i64) {
          // Non-power-of-2 divisor on v2i64: costed as the scalar operation
          // per element ("2 * (scalar code)" in the table above).
          return VT.getVectorNumElements() *
                 getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind,
                                        Op1Info: Op1Info.getNoProps(),
                                        Op2Info: Op2Info.getNoProps());
        } else {
          // When SVE is available, we get:
          //      smulh + lsr + add/sub + asr + add/sub.
          if (Ty->isScalableTy() && ST->hasSVE())
            return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
          return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
        }
      }
    }
    if (Op2Info.isConstant() && !Op2Info.isUniform() &&
        LT.second.isFixedLengthVector()) {
      // FIXME: When the constant vector is non-uniform, this may result in
      // loading the vector from constant pool or in some cases, may also result
      // in scalarization. For now, we are approximating this with the
      // scalarization cost.
      auto ExtractCost = 2 * getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty,
                                                CostKind, Index: -1, Op0: nullptr, Op1: nullptr);
      auto InsertCost = getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty,
                                           CostKind, Index: -1, Op0: nullptr, Op1: nullptr);
      unsigned NElts = cast<FixedVectorType>(Val: Ty)->getNumElements();
      return ExtractCost + InsertCost +
             NElts * getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(),
                                            CostKind, Op1Info: Op1Info.getNoProps(),
                                            Op2Info: Op2Info.getNoProps());
    }
    // Signed division/remainder not handled above shares the generic
    // division handling below.
    [[fallthrough]];
  case ISD::UDIV:
  case ISD::UREM: {
    auto VT = TLI->getValueType(DL, Ty);
    if (Op2Info.isConstant()) {
      // If the operand is a power of 2 we can use the shift or and cost.
      if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
        return getArithmeticInstrCost(Opcode: Instruction::LShr, Ty, CostKind,
                                      Op1Info: Op1Info.getNoProps(),
                                      Op2Info: Op2Info.getNoProps());
      if (ISD == ISD::UREM && Op2Info.isPowerOf2())
        return getArithmeticInstrCost(Opcode: Instruction::And, Ty, CostKind,
                                      Op1Info: Op1Info.getNoProps(),
                                      Op2Info: Op2Info.getNoProps());

      // SDIV/SREM can reach this case through the fallthrough above, hence
      // the explicit check for the unsigned opcodes.
      if (ISD == ISD::UDIV || ISD == ISD::UREM) {
        // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
        // The MULHU will be expanded to UMULL for the types not listed below,
        // and will become a pair of UMULL+MULL2 for 128bit vectors.
        bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
                       LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
                       LT.second == MVT::nxv16i8;
        bool Is128bit = LT.second.is128BitVector();

        InstructionCost MulCost =
            getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind,
                                   Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
        InstructionCost AddCost =
            getArithmeticInstrCost(Opcode: Instruction::Add, Ty, CostKind,
                                   Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
        // The shifts of the expansion are costed using the AShr opcode.
        InstructionCost ShrCost =
            getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind,
                                   Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
        InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
                                  (HasMULH ? 0 : ShrCost) +      // UMULL shift
                                  AddCost * 2 + ShrCost;
        // A remainder additionally pays one multiply and one add on top of
        // the division.
        return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
      }
    }

    // div i128's are lowered as libcalls.  Pass nullptr as (u)divti3 calls are
    // emitted by the backend even when those functions are not declared in the
    // module.
    if (!VT.isVector() && VT.getSizeInBits() > 64)
      return getCallInstrCost(/*Function*/ F: nullptr, RetTy: Ty, Tys: {Ty, Ty}, CostKind);

    InstructionCost Cost = BaseT::getArithmeticInstrCost(
        Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
    if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
      if (TLI->isOperationLegalOrCustom(Op: ISD, VT: LT.second) && ST->hasSVE()) {
        // SDIV/UDIV operations are lowered using SVE, then we can have less
        // costs.
        if (VT.isSimple() && isa<FixedVectorType>(Val: Ty) &&
            Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
          // Fixed costs for the sub-128-bit fixed vector types listed here.
          static const CostTblEntry DivTbl[]{
              {.ISD: ISD::SDIV, .Type: MVT::v2i8, .Cost: 5},  {.ISD: ISD::SDIV, .Type: MVT::v4i8, .Cost: 8},
              {.ISD: ISD::SDIV, .Type: MVT::v8i8, .Cost: 8},  {.ISD: ISD::SDIV, .Type: MVT::v2i16, .Cost: 5},
              {.ISD: ISD::SDIV, .Type: MVT::v4i16, .Cost: 5}, {.ISD: ISD::SDIV, .Type: MVT::v2i32, .Cost: 1},
              {.ISD: ISD::UDIV, .Type: MVT::v2i8, .Cost: 5},  {.ISD: ISD::UDIV, .Type: MVT::v4i8, .Cost: 8},
              {.ISD: ISD::UDIV, .Type: MVT::v8i8, .Cost: 8},  {.ISD: ISD::UDIV, .Type: MVT::v2i16, .Cost: 5},
              {.ISD: ISD::UDIV, .Type: MVT::v4i16, .Cost: 5}, {.ISD: ISD::UDIV, .Type: MVT::v2i32, .Cost: 1}};

          const auto *Entry = CostTableLookup(Table: DivTbl, ISD, Ty: VT.getSimpleVT());
          if (nullptr != Entry)
            return Entry->Cost;
        }
        // For 8/16-bit elements, the cost is higher because the type
        // requires promotion and possibly splitting:
        if (LT.second.getScalarType() == MVT::i8)
          Cost *= 8;
        else if (LT.second.getScalarType() == MVT::i16)
          Cost *= 4;
        return Cost;
      } else {
        // If one of the operands is a uniform constant then the cost for each
        // element is Cost for insertion, extraction and division.
        // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
        // operation with scalar type
        if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
            (Op2Info.isConstant() && Op2Info.isUniform())) {
          if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) {
            InstructionCost DivCost = BaseT::getArithmeticInstrCost(
                Opcode, Ty: Ty->getScalarType(), CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
            return (4 + DivCost) * VTy->getNumElements();
          }
        }
        // On AArch64, without SVE, vector divisions are expanded
        // into scalar divisions of each pair of elements.
        Cost += getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, CostKind,
                                   Index: -1, Op0: nullptr, Op1: nullptr);
        Cost += getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty, CostKind, Index: -1,
                                   Op0: nullptr, Op1: nullptr);
      }

      // TODO: if one of the arguments is scalar, then it's not necessary to
      // double the cost of handling the vector elements.
      Cost += Cost;
    }
    return Cost;
  }
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SHL:
    // These nodes are marked as 'custom' for combining purposes only.
    // We know that they are legal. See LowerAdd in ISelLowering.
    return LT.first;

  case ISD::FNEG:
    // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
    // This needs the context instruction: either its single user is an fmul
    // or its operand is an fmul.
    if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
         (Ty->isHalfTy() && ST->hasFullFP16())) &&
        CxtI &&
        ((CxtI->hasOneUse() &&
          match(V: *CxtI->user_begin(), P: m_FMul(L: m_Value(), R: m_Value()))) ||
         match(V: CxtI->getOperand(i: 0), P: m_FMul(L: m_Value(), R: m_Value()))))
      return 0;
    [[fallthrough]];
  case ISD::FADD:
  case ISD::FSUB:
    // fp128 falls through all the way to the BaseT query at the end of the
    // FMUL/FDIV case.
    if (!Ty->getScalarType()->isFP128Ty())
      return LT.first;
    [[fallthrough]];
  case ISD::FMUL:
  case ISD::FDIV:
    // These nodes are marked as 'custom' just to lower them to SVE.
    // We know said lowering will incur no additional cost.
    if (!Ty->getScalarType()->isFP128Ty())
      return 2 * LT.first;

    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
                                         Opd2Info: Op2Info);
  case ISD::FREM:
    // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
    // those functions are not declared in the module.
    if (!Ty->isVectorTy())
      return getCallInstrCost(/*Function*/ F: nullptr, RetTy: Ty, Tys: {Ty, Ty}, CostKind);
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
                                         Opd2Info: Op2Info);
  }
}
4791
4792InstructionCost
4793AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
4794 const SCEV *Ptr,
4795 TTI::TargetCostKind CostKind) const {
4796 // Address computations in vectorized code with non-consecutive addresses will
4797 // likely result in more instructions compared to scalar code where the
4798 // computation can more often be merged into the index mode. The resulting
4799 // extra micro-ops can significantly decrease throughput.
4800 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4801 int MaxMergeDistance = 64;
4802
4803 if (PtrTy->isVectorTy() && SE &&
4804 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MergeDistance: MaxMergeDistance + 1))
4805 return NumVectorInstToHideOverhead;
4806
4807 // In many cases the address computation is not merged into the instruction
4808 // addressing mode.
4809 return 1;
4810}
4811
4812/// Check whether Opcode1 has less throughput according to the scheduling
4813/// model than Opcode2.
4814bool AArch64TTIImpl::hasKnownLowerThroughputFromSchedulingModel(
4815 unsigned Opcode1, unsigned Opcode2) const {
4816 const MCSchedModel &Sched = ST->getSchedModel();
4817 const TargetInstrInfo *TII = ST->getInstrInfo();
4818 if (!Sched.hasInstrSchedModel())
4819 return false;
4820
4821 const MCSchedClassDesc *SCD1 =
4822 Sched.getSchedClassDesc(SchedClassIdx: TII->get(Opcode: Opcode1).getSchedClass());
4823 const MCSchedClassDesc *SCD2 =
4824 Sched.getSchedClassDesc(SchedClassIdx: TII->get(Opcode: Opcode2).getSchedClass());
4825 // We cannot handle variant scheduling classes without an MI. If we need to
4826 // support them for any of the instructions we query the information of we
4827 // might need to add a way to resolve them without a MI or not use the
4828 // scheduling info.
4829 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4830 "Cannot handle variant scheduling classes without an MI");
4831 if (!SCD1->isValid() || !SCD2->isValid())
4832 return false;
4833
4834 return MCSchedModel::getReciprocalThroughput(STI: *ST, SCDesc: *SCD1) >
4835 MCSchedModel::getReciprocalThroughput(STI: *ST, SCDesc: *SCD2);
4836}
4837
/// AArch64 cost of a compare or select instruction.
///
/// Overview of the flow below:
///  - Fixed-width vector selects: cheap when they can form a compare/select
///    pair on a supported type, otherwise looked up in a table of known
///    expensive cases.
///  - FCmp: half/bfloat promotion cost, libcall cost for unsupported element
///    types, otherwise a per-predicate factor times the legalization cost.
///  - Scalar icmp of an 'and' against 0 (or an equivalent 1 / -1 form) is
///    free for TCK_RecipThroughput.
///  - Everything else is delegated to BaseT.
///
/// \p I is the optional context instruction; it is used to recover a missing
/// predicate for selects and to inspect the operands of an icmp.
InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
    unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
    TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
    TTI::OperandValueInfo Op2Info, const Instruction *I) const {
  // We don't lower some vector selects well that are wider than the register
  // width. TODO: Improve this with different cost kinds.
  if (isa<FixedVectorType>(Val: ValTy) && Opcode == Instruction::Select) {
    // We would need this many instructions to hide the scalarization happening.
    const int AmortizationCost = 20;

    // If VecPred is not set, check if we can get a predicate from the context
    // instruction, if its type matches the requested ValTy.
    if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
      CmpPredicate CurrentPred;
      if (match(V: I, P: m_Select(C: m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value()), L: m_Value(),
                            R: m_Value())))
        VecPred = CurrentPred;
    }
    // Check if we have a compare/select chain that can be lowered using
    // a (F)CMxx & BFI pair.
    if (CmpInst::isIntPredicate(P: VecPred) || VecPred == CmpInst::FCMP_OLE ||
        VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
        VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
        VecPred == CmpInst::FCMP_UNE) {
      static const auto ValidMinMaxTys = {
          MVT::v8i8,  MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
          MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
      // The f16 vector types only qualify when full FP16 is available.
      static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};

      auto LT = getTypeLegalizationCost(Ty: ValTy);
      if (any_of(Range: ValidMinMaxTys, P: equal_to(Arg&: LT.second)) ||
          (ST->hasFullFP16() &&
           any_of(Range: ValidFP16MinMaxTys, P: equal_to(Arg&: LT.second))))
        return LT.first;
    }

    // Known select costs, keyed by (condition type, value type).
    static const TypeConversionCostTblEntry VectorSelectTbl[] = {
        {.ISD: Instruction::Select, .Dst: MVT::v2i1, .Src: MVT::v2f32, .Cost: 2},
        {.ISD: Instruction::Select, .Dst: MVT::v2i1, .Src: MVT::v2f64, .Cost: 2},
        {.ISD: Instruction::Select, .Dst: MVT::v4i1, .Src: MVT::v4f32, .Cost: 2},
        {.ISD: Instruction::Select, .Dst: MVT::v4i1, .Src: MVT::v4f16, .Cost: 2},
        {.ISD: Instruction::Select, .Dst: MVT::v8i1, .Src: MVT::v8f16, .Cost: 2},
        {.ISD: Instruction::Select, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: 16},
        {.ISD: Instruction::Select, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: 8},
        {.ISD: Instruction::Select, .Dst: MVT::v16i1, .Src: MVT::v16i32, .Cost: 16},
        {.ISD: Instruction::Select, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: 4 * AmortizationCost},
        {.ISD: Instruction::Select, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: 8 * AmortizationCost},
        {.ISD: Instruction::Select, .Dst: MVT::v16i1, .Src: MVT::v16i64, .Cost: 16 * AmortizationCost}};

    EVT SelCondTy = TLI->getValueType(DL, Ty: CondTy);
    EVT SelValTy = TLI->getValueType(DL, Ty: ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(Table: VectorSelectTbl, ISD: Opcode,
                                                     Dst: SelCondTy.getSimpleVT(),
                                                     Src: SelValTy.getSimpleVT()))
        return Entry->Cost;
    }
  }

  if (Opcode == Instruction::FCmp) {
    // Half/bfloat compares that need promotion: cost of the extends plus the
    // compare on the promoted type, plus (for vectors) a truncate of the
    // integer result vector back to the original element width.
    if (auto PromotedCost = getFP16BF16PromoteCost(
            Ty: ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
            // TODO: Consider costing SVE FCMPs.
            /*CanUseSVE=*/false, InstCost: [&](Type *PromotedTy) {
              InstructionCost Cost =
                  getCmpSelInstrCost(Opcode, ValTy: PromotedTy, CondTy, VecPred,
                                     CostKind, Op1Info, Op2Info);
              if (isa<VectorType>(Val: PromotedTy))
                Cost += getCastInstrCost(
                    Opcode: Instruction::Trunc,
                    Dst: VectorType::getInteger(VTy: cast<VectorType>(Val: ValTy)),
                    Src: VectorType::getInteger(VTy: cast<VectorType>(Val: PromotedTy)),
                    CCH: TTI::CastContextHint::None, CostKind);
              return Cost;
            }))
      return *PromotedCost;

    auto LT = getTypeLegalizationCost(Ty: ValTy);
    // Model unknown fp compares as a libcall.
    if (LT.second.getScalarType() != MVT::f64 &&
        LT.second.getScalarType() != MVT::f32 &&
        LT.second.getScalarType() != MVT::f16)
      return LT.first * getCallInstrCost(/*Function*/ F: nullptr, RetTy: ValTy,
                                         Tys: {ValTy, ValTy}, CostKind);

    // Some comparison operators require expanding to multiple compares + or.
    unsigned Factor = 1;
    if (!CondTy->isVectorTy() &&
        (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
      Factor = 2; // fcmp with 2 selects
    else if (isa<FixedVectorType>(Val: ValTy) &&
             (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
              VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
      Factor = 3; // fcmxx+fcmyy+or
    else if (isa<ScalableVectorType>(Val: ValTy) &&
             (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
      Factor = 3; // fcmxx+fcmyy+or

    // Double the factor for scalable vectors when the scheduling model
    // reports the SVE compare as having lower throughput than the NEON one.
    if (isa<ScalableVectorType>(Val: ValTy) &&
        CostKind == TTI::TCK_RecipThroughput &&
        hasKnownLowerThroughputFromSchedulingModel(Opcode1: AArch64::FCMEQ_PPzZZ_S,
                                                   Opcode2: AArch64::FCMEQv4f32))
      Factor *= 2;

    return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
  }

  // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
  // icmp(and, 0) as free, as we can make use of ands, but only if the
  // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
  // providing it will not cause performance regressions.
  if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
      Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(Pred: VecPred) &&
      TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: ValTy)) &&
      match(V: I->getOperand(i: 0), P: m_And(L: m_Value(), R: m_Value()))) {
    if (match(V: I->getOperand(i: 1), P: m_Zero()))
      return 0;

    // x >= 1 / x < 1 -> x > 0 / x <= 0
    if (match(V: I->getOperand(i: 1), P: m_One()) &&
        (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
      return 0;

    // x <= -1 / x > -1 -> x < 0 / x >= 0
    if (match(V: I->getOperand(i: 1), P: m_AllOnes()) &&
        (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
      return 0;
  }

  // The base case handles scalable vectors fine for now, since it treats the
  // cost as 1 * legalization cost.
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                   Op1Info, Op2Info, I);
}
4972
4973AArch64TTIImpl::TTI::MemCmpExpansionOptions
4974AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4975 TTI::MemCmpExpansionOptions Options;
4976 if (ST->requiresStrictAlign()) {
4977 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4978 // a bunch of instructions when strict align is enabled.
4979 return Options;
4980 }
4981 Options.AllowOverlappingLoads = true;
4982 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4983 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4984 // TODO: Though vector loads usually perform well on AArch64, in some targets
4985 // they may wake up the FP unit, which raises the power consumption. Perhaps
4986 // they could be used with no holds barred (-O3).
4987 Options.LoadSizes = {8, 4, 2, 1};
4988 Options.AllowedTailExpansions = {3, 5, 6};
4989 return Options;
4990}
4991
4992bool AArch64TTIImpl::prefersVectorizedAddressing() const {
4993 return ST->hasSVE();
4994}
4995
4996InstructionCost
4997AArch64TTIImpl::getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA,
4998 TTI::TargetCostKind CostKind) const {
4999 switch (MICA.getID()) {
5000 case Intrinsic::masked_scatter:
5001 case Intrinsic::masked_gather:
5002 return getGatherScatterOpCost(MICA, CostKind);
5003 case Intrinsic::masked_load:
5004 case Intrinsic::masked_expandload:
5005 case Intrinsic::masked_store:
5006 return getMaskedMemoryOpCost(MICA, CostKind);
5007 }
5008 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
5009}
5010
5011InstructionCost
5012AArch64TTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
5013 TTI::TargetCostKind CostKind) const {
5014 Type *Src = MICA.getDataType();
5015
5016 if (useNeonVector(Ty: Src))
5017 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
5018 auto LT = getTypeLegalizationCost(Ty: Src);
5019 if (!LT.first.isValid())
5020 return InstructionCost::getInvalid();
5021
5022 // Return an invalid cost for element types that we are unable to lower.
5023 auto *VT = cast<VectorType>(Val: Src);
5024 if (VT->getElementType()->isIntegerTy(BitWidth: 1))
5025 return InstructionCost::getInvalid();
5026
5027 // The code-generator is currently not able to handle scalable vectors
5028 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5029 // it. This change will be removed when code-generation for these types is
5030 // sufficiently reliable.
5031 if (VT->getElementCount() == ElementCount::getScalable(MinVal: 1))
5032 return InstructionCost::getInvalid();
5033
5034 InstructionCost MemOpCost = LT.first;
5035 if (MICA.getID() == Intrinsic::masked_expandload) {
5036 if (!isLegalMaskedExpandLoad(DataTy: Src, Alignment: MICA.getAlignment()))
5037 return InstructionCost::getInvalid();
5038
5039 // Operation will be split into expand of masked.load
5040 MemOpCost *= 2;
5041 }
5042
5043 // If we need to split the memory operation, we will also need to split the
5044 // mask. This will likely lead to overestimating the cost in some cases if
5045 // multiple memory operations use the same mask, but we often don't have
5046 // enough context to figure that out here.
5047 //
5048 // If the elements being loaded are bytes then the mask will already be split,
5049 // since the number of bits in a P register matches the number of bytes in a
5050 // Z register.
5051 if (LT.first > 1 && LT.second.getScalarSizeInBits() > 8)
5052 return MemOpCost * 2;
5053
5054 return MemOpCost;
5055}
5056
5057// This function returns gather/scatter overhead either from
5058// user-provided value or specialized values per-target from \p ST.
5059static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
5060 const AArch64Subtarget *ST) {
5061 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5062 "Should be called on only load or stores.");
5063 switch (Opcode) {
5064 case Instruction::Load:
5065 if (SVEGatherOverhead.getNumOccurrences() > 0)
5066 return SVEGatherOverhead;
5067 return ST->getGatherOverhead();
5068 break;
5069 case Instruction::Store:
5070 if (SVEScatterOverhead.getNumOccurrences() > 0)
5071 return SVEScatterOverhead;
5072 return ST->getScatterOverhead();
5073 break;
5074 default:
5075 llvm_unreachable("Shouldn't have reached here");
5076 }
5077}
5078
5079InstructionCost
5080AArch64TTIImpl::getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA,
5081 TTI::TargetCostKind CostKind) const {
5082
5083 unsigned Opcode = (MICA.getID() == Intrinsic::masked_gather ||
5084 MICA.getID() == Intrinsic::vp_gather)
5085 ? Instruction::Load
5086 : Instruction::Store;
5087
5088 Type *DataTy = MICA.getDataType();
5089 Align Alignment = MICA.getAlignment();
5090 const Instruction *I = MICA.getInst();
5091
5092 if (useNeonVector(Ty: DataTy) || !isLegalMaskedGatherScatter(DataType: DataTy))
5093 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
5094 auto *VT = cast<VectorType>(Val: DataTy);
5095 auto LT = getTypeLegalizationCost(Ty: DataTy);
5096 if (!LT.first.isValid())
5097 return InstructionCost::getInvalid();
5098
5099 // Return an invalid cost for element types that we are unable to lower.
5100 if (!LT.second.isVector() ||
5101 !isElementTypeLegalForScalableVector(Ty: VT->getElementType()) ||
5102 VT->getElementType()->isIntegerTy(BitWidth: 1))
5103 return InstructionCost::getInvalid();
5104
5105 // The code-generator is currently not able to handle scalable vectors
5106 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5107 // it. This change will be removed when code-generation for these types is
5108 // sufficiently reliable.
5109 if (VT->getElementCount() == ElementCount::getScalable(MinVal: 1))
5110 return InstructionCost::getInvalid();
5111
5112 ElementCount LegalVF = LT.second.getVectorElementCount();
5113 InstructionCost MemOpCost =
5114 getMemoryOpCost(Opcode, Src: VT->getElementType(), Alignment, AddressSpace: 0, CostKind,
5115 OpInfo: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
5116 // Add on an overhead cost for using gathers/scatters.
5117 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
5118 return LT.first * MemOpCost * getMaxNumElements(VF: LegalVF);
5119}
5120
5121bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
5122 return isa<FixedVectorType>(Val: Ty) && !ST->useSVEForFixedLengthVectors();
5123}
5124
5125InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
5126 Align Alignment,
5127 unsigned AddressSpace,
5128 TTI::TargetCostKind CostKind,
5129 TTI::OperandValueInfo OpInfo,
5130 const Instruction *I) const {
5131 EVT VT = TLI->getValueType(DL, Ty, AllowUnknown: true);
5132 // Type legalization can't handle structs, and load latency isn't handled here
5133 if (VT == MVT::Other ||
5134 (Opcode == Instruction::Load && CostKind == TTI::TCK_Latency))
5135 return BaseT::getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace,
5136 CostKind);
5137
5138 auto LT = getTypeLegalizationCost(Ty);
5139 if (!LT.first.isValid())
5140 return InstructionCost::getInvalid();
5141
5142 // The code-generator is currently not able to handle scalable vectors
5143 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5144 // it. This change will be removed when code-generation for these types is
5145 // sufficiently reliable.
5146 // We also only support full register predicate loads and stores.
5147 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
5148 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1) ||
5149 (VTy->getElementType()->isIntegerTy(BitWidth: 1) &&
5150 !VTy->getElementCount().isKnownMultipleOf(
5151 RHS: ElementCount::getScalable(MinVal: 16))))
5152 return InstructionCost::getInvalid();
5153
5154 // TODO: consider latency as well for TCK_SizeAndLatency.
5155 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
5156 return LT.first;
5157
5158 if (CostKind != TTI::TCK_RecipThroughput)
5159 return 1;
5160
5161 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
5162 LT.second.is128BitVector() && Alignment < Align(16)) {
5163 // Unaligned stores are extremely inefficient. We don't split all
5164 // unaligned 128-bit stores because the negative impact that has shown in
5165 // practice on inlined block copy code.
5166 // We make such stores expensive so that we will only vectorize if there
5167 // are 6 other instructions getting vectorized.
5168 const int AmortizationCost = 6;
5169
5170 return LT.first * 2 * AmortizationCost;
5171 }
5172
5173 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
5174 if (Ty->isPtrOrPtrVectorTy())
5175 return LT.first;
5176
5177 if (useNeonVector(Ty)) {
5178 // Check truncating stores and extending loads.
5179 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
5180 // v4i8 types are lowered to scalar a load/store and sshll/xtn.
5181 if (VT == MVT::v4i8)
5182 return 2;
5183 // Otherwise we need to scalarize.
5184 return cast<FixedVectorType>(Val: Ty)->getNumElements() * 2;
5185 }
5186 EVT EltVT = VT.getVectorElementType();
5187 unsigned EltSize = EltVT.getScalarSizeInBits();
5188 if (!isPowerOf2_32(Value: EltSize) || EltSize < 8 || EltSize > 64 ||
5189 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
5190 return LT.first;
5191 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
5192 // widening to v4i8, which produces suboptimal results.
5193 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
5194 return LT.first;
5195
5196 // Check non-power-of-2 loads/stores for legal vector element types with
5197 // NEON. Non-power-of-2 memory ops will get broken down to a set of
5198 // operations on smaller power-of-2 ops, including ld1/st1.
5199 LLVMContext &C = Ty->getContext();
5200 InstructionCost Cost(0);
5201 SmallVector<EVT> TypeWorklist;
5202 TypeWorklist.push_back(Elt: VT);
5203 while (!TypeWorklist.empty()) {
5204 EVT CurrVT = TypeWorklist.pop_back_val();
5205 unsigned CurrNumElements = CurrVT.getVectorNumElements();
5206 if (isPowerOf2_32(Value: CurrNumElements)) {
5207 Cost += 1;
5208 continue;
5209 }
5210
5211 unsigned PrevPow2 = NextPowerOf2(A: CurrNumElements) / 2;
5212 TypeWorklist.push_back(Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: PrevPow2));
5213 TypeWorklist.push_back(
5214 Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: CurrNumElements - PrevPow2));
5215 }
5216 return Cost;
5217 }
5218
5219 return LT.first;
5220}
5221
5222InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
5223 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
5224 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
5225 bool UseMaskForCond, bool UseMaskForGaps) const {
5226 assert(Factor >= 2 && "Invalid interleave factor");
5227 auto *VecVTy = cast<VectorType>(Val: VecTy);
5228
5229 if (VecTy->isScalableTy() && !ST->hasSVE())
5230 return InstructionCost::getInvalid();
5231
5232 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
5233 // only have lowering for power-of-2 factors.
5234 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
5235 // InterleavedAccessPass for ld3/st3
5236 if (VecTy->isScalableTy() && !isPowerOf2_32(Value: Factor))
5237 return InstructionCost::getInvalid();
5238
5239 // Vectorization for masked interleaved accesses is only enabled for scalable
5240 // VF.
5241 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
5242 return InstructionCost::getInvalid();
5243
5244 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
5245 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
5246 auto *SubVecTy =
5247 VectorType::get(ElementType: VecVTy->getElementType(),
5248 EC: VecVTy->getElementCount().divideCoefficientBy(RHS: Factor));
5249
5250 // ldN/stN only support legal vector types of size 64 or 128 in bits.
5251 // Accesses having vector types that are a multiple of 128 bits can be
5252 // matched to more than one ldN/stN instruction.
5253 bool UseScalable;
5254 if (MinElts % Factor == 0 &&
5255 TLI->isLegalInterleavedAccessType(VecTy: SubVecTy, DL, UseScalable))
5256 return Factor * TLI->getNumInterleavedAccesses(VecTy: SubVecTy, DL, UseScalable);
5257 }
5258
5259 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5260 Alignment, AddressSpace, CostKind,
5261 UseMaskForCond, UseMaskForGaps);
5262}
5263
5264InstructionCost
5265AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
5266 InstructionCost Cost = 0;
5267 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5268 for (auto *I : Tys) {
5269 if (!I->isVectorTy())
5270 continue;
5271 if (I->getScalarSizeInBits() * cast<FixedVectorType>(Val: I)->getNumElements() ==
5272 128)
5273 Cost += getMemoryOpCost(Opcode: Instruction::Store, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind) +
5274 getMemoryOpCost(Opcode: Instruction::Load, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind);
5275 }
5276 return Cost;
5277}
5278
5279bool AArch64TTIImpl::isLegalMaskedExpandLoad(Type *DataTy,
5280 Align Alignment) const {
5281 // Neon types should be scalarised when we are not choosing to use SVE.
5282 if (useNeonVector(Ty: DataTy))
5283 return false;
5284
5285 // Return true only if we are able to lower using the SVE2p2/SME2p2
5286 // expand instruction.
5287 return (ST->isSVEAvailable() && ST->hasSVE2p2()) ||
5288 (ST->isSVEorStreamingSVEAvailable() && ST->hasSME2p2());
5289}
5290
5291unsigned
5292AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF,
5293 bool HasUnorderedReductions) const {
5294 if (VF.isScalar() || (HasUnorderedReductions && VF.getKnownMinValue() <= 4))
5295 return 4;
5296 return ST->getMaxInterleaveFactor();
5297}
5298
5299// For Falkor, we want to avoid having too many strided loads in a loop since
5300// that can exhaust the HW prefetcher resources. We adjust the unroller
5301// MaxCount preference below to attempt to ensure unrolling doesn't create too
5302// many strided loads.
5303static void
5304getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
5305 TargetTransformInfo::UnrollingPreferences &UP) {
5306 enum { MaxStridedLoads = 7 };
5307 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
5308 int StridedLoads = 0;
5309 // FIXME? We could make this more precise by looking at the CFG and
5310 // e.g. not counting loads in each side of an if-then-else diamond.
5311 for (const auto BB : L->blocks()) {
5312 for (auto &I : *BB) {
5313 LoadInst *LMemI = dyn_cast<LoadInst>(Val: &I);
5314 if (!LMemI)
5315 continue;
5316
5317 Value *PtrValue = LMemI->getPointerOperand();
5318 if (L->isLoopInvariant(V: PtrValue))
5319 continue;
5320
5321 const SCEV *LSCEV = SE.getSCEV(V: PtrValue);
5322 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(Val: LSCEV);
5323 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
5324 continue;
5325
5326 // FIXME? We could take pairing of unrolled load copies into account
5327 // by looking at the AddRec, but we would probably have to limit this
5328 // to loops with no stores or other memory optimization barriers.
5329 ++StridedLoads;
5330 // We've seen enough strided loads that seeing more won't make a
5331 // difference.
5332 if (StridedLoads > MaxStridedLoads / 2)
5333 return StridedLoads;
5334 }
5335 }
5336 return StridedLoads;
5337 };
5338
5339 int StridedLoads = countStridedLoads(L, SE);
5340 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
5341 << " strided loads\n");
5342 // Pick the largest power of 2 unroll count that won't result in too many
5343 // strided loads.
5344 if (StridedLoads) {
5345 UP.MaxCount = 1 << Log2_32(Value: MaxStridedLoads / StridedLoads);
5346 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
5347 << UP.MaxCount << '\n');
5348 }
5349}
5350
5351// This function returns true if the loop:
5352// 1. Has a valid cost, and
5353// 2. Has a cost within the supplied budget.
5354// Otherwise it returns false.
5355static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI,
5356 InstructionCost Budget,
5357 unsigned *FinalSize) {
5358 // Estimate the size of the loop.
5359 InstructionCost LoopCost = 0;
5360
5361 for (auto *BB : L->getBlocks()) {
5362 for (auto &I : *BB) {
5363 SmallVector<const Value *, 4> Operands(I.operand_values());
5364 InstructionCost Cost =
5365 TTI.getInstructionCost(U: &I, Operands, CostKind: TTI::TCK_CodeSize);
5366 // This can happen with intrinsics that don't currently have a cost model
5367 // or for some operations that require SVE.
5368 if (!Cost.isValid())
5369 return false;
5370
5371 LoopCost += Cost;
5372 if (LoopCost > Budget)
5373 return false;
5374 }
5375 }
5376
5377 if (FinalSize)
5378 *FinalSize = LoopCost.getValue();
5379 return true;
5380}
5381
5382static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
5383 const AArch64TTIImpl &TTI) {
5384 // Only consider loops with unknown trip counts for which we can determine
5385 // a symbolic expression. Multi-exit loops with small known trip counts will
5386 // likely be unrolled anyway.
5387 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5388 if (isa<SCEVConstant>(Val: BTC) || isa<SCEVCouldNotCompute>(Val: BTC))
5389 return false;
5390
5391 // It might not be worth unrolling loops with low max trip counts. Restrict
5392 // this to max trip counts > 32 for now.
5393 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
5394 if (MaxTC > 0 && MaxTC <= 32)
5395 return false;
5396
5397 // Make sure the loop size is <= 5.
5398 if (!isLoopSizeWithinBudget(L, TTI, Budget: 5, FinalSize: nullptr))
5399 return false;
5400
5401 // Small search loops with multiple exits can be highly beneficial to unroll.
5402 // We only care about loops with exactly two exiting blocks, although each
5403 // block could jump to the same exit block.
5404 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
5405 if (Blocks.size() != 2)
5406 return false;
5407
5408 if (any_of(Range&: Blocks, P: [](BasicBlock *BB) {
5409 return !isa<UncondBrInst, CondBrInst>(Val: BB->getTerminator());
5410 }))
5411 return false;
5412
5413 return true;
5414}
5415
5416/// For Apple CPUs, we want to runtime-unroll loops to make better use if the
5417/// OOO engine's wide instruction window and various predictors.
5418static void
5419getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
5420 TargetTransformInfo::UnrollingPreferences &UP,
5421 const AArch64TTIImpl &TTI) {
5422 // Limit loops with structure that is highly likely to benefit from runtime
5423 // unrolling; that is we exclude outer loops and loops with many blocks (i.e.
5424 // likely with complex control flow). Note that the heuristics here may be
5425 // overly conservative and we err on the side of avoiding runtime unrolling
5426 // rather than unroll excessively. They are all subject to further refinement.
5427 if (!L->isInnermost() || L->getNumBlocks() > 8)
5428 return;
5429
5430 // Loops with multiple exits are handled by common code.
5431 if (!L->getExitBlock())
5432 return;
5433
5434 // Check if the loop contains any reductions that could be parallelized when
5435 // unrolling. If so, enable partial unrolling, if the trip count is know to be
5436 // a multiple of 2.
5437 bool HasParellelizableReductions =
5438 L->getNumBlocks() == 1 &&
5439 any_of(Range: L->getHeader()->phis(),
5440 P: [&SE, L](PHINode &Phi) {
5441 return canParallelizeReductionWhenUnrolling(Phi, L, SE: &SE);
5442 }) &&
5443 isLoopSizeWithinBudget(L, TTI, Budget: 12, FinalSize: nullptr);
5444 if (HasParellelizableReductions &&
5445 SE.getSmallConstantTripMultiple(L, ExitingBlock: L->getExitingBlock()) % 2 == 0) {
5446 UP.Partial = true;
5447 UP.MaxCount = 4;
5448 UP.AddAdditionalAccumulators = true;
5449 }
5450
5451 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5452 if (isa<SCEVConstant>(Val: BTC) || isa<SCEVCouldNotCompute>(Val: BTC) ||
5453 (SE.getSmallConstantMaxTripCount(L) > 0 &&
5454 SE.getSmallConstantMaxTripCount(L) <= 32))
5455 return;
5456
5457 if (findStringMetadataForLoop(TheLoop: L, Name: "llvm.loop.isvectorized"))
5458 return;
5459
5460 if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L))
5461 return;
5462
5463 // Limit to loops with trip counts that are cheap to expand.
5464 UP.SCEVExpansionBudget = 1;
5465
5466 if (HasParellelizableReductions) {
5467 UP.Runtime = true;
5468 UP.DefaultUnrollRuntimeCount = 4;
5469 UP.AddAdditionalAccumulators = true;
5470 }
5471
5472 // Try to unroll small loops, of few-blocks with low budget, if they have
5473 // load/store dependencies, to expose more parallel memory access streams,
5474 // or if they do little work inside a block (i.e. load -> X -> store pattern).
5475 BasicBlock *Header = L->getHeader();
5476 BasicBlock *Latch = L->getLoopLatch();
5477 if (Header == Latch) {
5478 // Estimate the size of the loop.
5479 unsigned Size;
5480 unsigned Width = 10;
5481 if (!isLoopSizeWithinBudget(L, TTI, Budget: Width, FinalSize: &Size))
5482 return;
5483
5484 // Try to find an unroll count that maximizes the use of the instruction
5485 // window, i.e. trying to fetch as many instructions per cycle as possible.
5486 unsigned MaxInstsPerLine = 16;
5487 unsigned UC = 1;
5488 unsigned BestUC = 1;
5489 unsigned SizeWithBestUC = BestUC * Size;
5490 while (UC <= 8) {
5491 unsigned SizeWithUC = UC * Size;
5492 if (SizeWithUC > 48)
5493 break;
5494 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5495 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5496 BestUC = UC;
5497 SizeWithBestUC = BestUC * Size;
5498 }
5499 UC++;
5500 }
5501
5502 if (BestUC == 1)
5503 return;
5504
5505 SmallPtrSet<Value *, 8> LoadedValuesPlus;
5506 SmallVector<StoreInst *> Stores;
5507 for (auto *BB : L->blocks()) {
5508 for (auto &I : *BB) {
5509 Value *Ptr = getLoadStorePointerOperand(V: &I);
5510 if (!Ptr)
5511 continue;
5512 const SCEV *PtrSCEV = SE.getSCEV(V: Ptr);
5513 if (SE.isLoopInvariant(S: PtrSCEV, L))
5514 continue;
5515 if (isa<LoadInst>(Val: &I)) {
5516 LoadedValuesPlus.insert(Ptr: &I);
5517 // Include in-loop 1st users of loaded values.
5518 for (auto *U : I.users())
5519 if (L->contains(Inst: cast<Instruction>(Val: U)))
5520 LoadedValuesPlus.insert(Ptr: U);
5521 } else
5522 Stores.push_back(Elt: cast<StoreInst>(Val: &I));
5523 }
5524 }
5525
5526 if (none_of(Range&: Stores, P: [&LoadedValuesPlus](StoreInst *SI) {
5527 return LoadedValuesPlus.contains(Ptr: SI->getOperand(i_nocapture: 0));
5528 }))
5529 return;
5530
5531 UP.Runtime = true;
5532 UP.DefaultUnrollRuntimeCount = BestUC;
5533 return;
5534 }
5535
5536 // Try to runtime-unroll loops with early-continues depending on loop-varying
5537 // loads; this helps with branch-prediction for the early-continues.
5538 auto *Term = dyn_cast<CondBrInst>(Val: Header->getTerminator());
5539 SmallVector<BasicBlock *> Preds(predecessors(BB: Latch));
5540 if (!Term || Preds.size() == 1 || !llvm::is_contained(Range&: Preds, Element: Header) ||
5541 none_of(Range&: Preds, P: [L](BasicBlock *Pred) { return L->contains(BB: Pred); }))
5542 return;
5543
5544 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5545 [&](Instruction *I, unsigned Depth) -> bool {
5546 if (isa<PHINode>(Val: I) || L->isLoopInvariant(V: I) || Depth > 8)
5547 return false;
5548
5549 if (isa<LoadInst>(Val: I))
5550 return true;
5551
5552 return any_of(Range: I->operands(), P: [&](Value *V) {
5553 auto *I = dyn_cast<Instruction>(Val: V);
5554 return I && DependsOnLoopLoad(I, Depth + 1);
5555 });
5556 };
5557 CmpPredicate Pred;
5558 Instruction *I;
5559 if (match(V: Term, P: m_Br(C: m_ICmp(Pred, L: m_Instruction(I), R: m_Value()), T: m_Value(),
5560 F: m_Value())) &&
5561 DependsOnLoopLoad(I, 0)) {
5562 UP.Runtime = true;
5563 }
5564}
5565
5566void AArch64TTIImpl::getUnrollingPreferences(
5567 Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
5568 OptimizationRemarkEmitter *ORE) const {
5569 // Enable partial unrolling and runtime unrolling.
5570 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5571
5572 UP.UpperBound = true;
5573
5574 // For inner loop, it is more likely to be a hot one, and the runtime check
5575 // can be promoted out from LICM pass, so the overhead is less, let's try
5576 // a larger threshold to unroll more loops.
5577 if (L->getLoopDepth() > 1)
5578 UP.PartialThreshold *= 2;
5579
5580 // Disable partial & runtime unrolling on -Os.
5581 UP.PartialOptSizeThreshold = 0;
5582
5583 // Scan the loop: don't unroll loops with calls as this could prevent
5584 // inlining. Don't unroll auto-vectorized loops either, though do allow
5585 // unrolling of the scalar remainder.
5586 bool IsVectorized = getBooleanLoopAttribute(TheLoop: L, Name: "llvm.loop.isvectorized");
5587 InstructionCost Cost = 0;
5588 for (auto *BB : L->getBlocks()) {
5589 for (auto &I : *BB) {
5590 // Both auto-vectorized loops and the scalar remainder have the
5591 // isvectorized attribute, so differentiate between them by the presence
5592 // of vector instructions.
5593 if (IsVectorized && I.getType()->isVectorTy())
5594 return;
5595 if (isa<CallBase>(Val: I)) {
5596 if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I))
5597 if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction())
5598 if (!isLoweredToCall(F))
5599 continue;
5600 return;
5601 }
5602
5603 SmallVector<const Value *, 4> Operands(I.operand_values());
5604 Cost += getInstructionCost(U: &I, Operands,
5605 CostKind: TargetTransformInfo::TCK_SizeAndLatency);
5606 }
5607 }
5608
5609 // Apply subtarget-specific unrolling preferences.
5610 if (ST->isAppleMLike())
5611 getAppleRuntimeUnrollPreferences(L, SE, UP, TTI: *this);
5612 else if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
5613 EnableFalkorHWPFUnrollFix)
5614 getFalkorUnrollingPreferences(L, SE, UP);
5615
5616 // If this is a small, multi-exit loop similar to something like std::find,
5617 // then there is typically a performance improvement achieved by unrolling.
5618 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, TTI: *this)) {
5619 UP.RuntimeUnrollMultiExit = true;
5620 UP.Runtime = true;
5621 // Limit unroll count.
5622 UP.DefaultUnrollRuntimeCount = 4;
5623 // Allow slightly more costly trip-count expansion to catch search loops
5624 // with pointer inductions.
5625 UP.SCEVExpansionBudget = 5;
5626 return;
5627 }
5628
5629 // Enable runtime unrolling for in-order models
5630 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
5631 // checking for that case, we can ensure that the default behaviour is
5632 // unchanged
5633 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5634 !ST->getSchedModel().isOutOfOrder()) {
5635 UP.Runtime = true;
5636 UP.Partial = true;
5637 UP.UnrollRemainder = true;
5638 UP.DefaultUnrollRuntimeCount = 4;
5639
5640 UP.UnrollAndJam = true;
5641 UP.UnrollAndJamInnerLoopThreshold = 60;
5642 }
5643
5644 // Force unrolling small loops can be very useful because of the branch
5645 // taken cost of the backedge.
5646 if (Cost < Aarch64ForceUnrollThreshold)
5647 UP.Force = true;
5648}
5649
5650void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
5651 TTI::PeelingPreferences &PP) const {
5652 BaseT::getPeelingPreferences(L, SE, PP);
5653}
5654
5655Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
5656 Type *ExpectedType,
5657 bool CanCreate) const {
5658 switch (Inst->getIntrinsicID()) {
5659 default:
5660 return nullptr;
5661 case Intrinsic::aarch64_neon_st1x2:
5662 case Intrinsic::aarch64_neon_st1x3:
5663 case Intrinsic::aarch64_neon_st1x4:
5664 case Intrinsic::aarch64_neon_st2:
5665 case Intrinsic::aarch64_neon_st3:
5666 case Intrinsic::aarch64_neon_st4: {
5667 // Create a struct type
5668 StructType *ST = dyn_cast<StructType>(Val: ExpectedType);
5669 if (!CanCreate || !ST)
5670 return nullptr;
5671 unsigned NumElts = Inst->arg_size() - 1;
5672 if (ST->getNumElements() != NumElts)
5673 return nullptr;
5674 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5675 if (Inst->getArgOperand(i)->getType() != ST->getElementType(N: i))
5676 return nullptr;
5677 }
5678 Value *Res = PoisonValue::get(T: ExpectedType);
5679 IRBuilder<> Builder(Inst);
5680 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5681 Value *L = Inst->getArgOperand(i);
5682 Res = Builder.CreateInsertValue(Agg: Res, Val: L, Idxs: i);
5683 }
5684 return Res;
5685 }
5686 case Intrinsic::aarch64_neon_ld1x2:
5687 case Intrinsic::aarch64_neon_ld1x3:
5688 case Intrinsic::aarch64_neon_ld1x4:
5689 case Intrinsic::aarch64_neon_ld2:
5690 case Intrinsic::aarch64_neon_ld3:
5691 case Intrinsic::aarch64_neon_ld4:
5692 if (Inst->getType() == ExpectedType)
5693 return Inst;
5694 return nullptr;
5695 }
5696}
5697
5698bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
5699 MemIntrinsicInfo &Info) const {
5700 switch (Inst->getIntrinsicID()) {
5701 default:
5702 break;
5703 case Intrinsic::aarch64_neon_ld1x2:
5704 case Intrinsic::aarch64_neon_ld1x3:
5705 case Intrinsic::aarch64_neon_ld1x4:
5706 case Intrinsic::aarch64_neon_ld2:
5707 case Intrinsic::aarch64_neon_ld3:
5708 case Intrinsic::aarch64_neon_ld4:
5709 Info.ReadMem = true;
5710 Info.WriteMem = false;
5711 Info.PtrVal = Inst->getArgOperand(i: 0);
5712 break;
5713 case Intrinsic::aarch64_neon_st1x2:
5714 case Intrinsic::aarch64_neon_st1x3:
5715 case Intrinsic::aarch64_neon_st1x4:
5716 case Intrinsic::aarch64_neon_st2:
5717 case Intrinsic::aarch64_neon_st3:
5718 case Intrinsic::aarch64_neon_st4:
5719 Info.ReadMem = false;
5720 Info.WriteMem = true;
5721 Info.PtrVal = Inst->getArgOperand(i: Inst->arg_size() - 1);
5722 break;
5723 }
5724
5725 // Use the ID of neon load as the "matching id".
5726 switch (Inst->getIntrinsicID()) {
5727 default:
5728 return false;
5729 case Intrinsic::aarch64_neon_ld1x2:
5730 case Intrinsic::aarch64_neon_st1x2:
5731 Info.MatchingId = Intrinsic::aarch64_neon_ld1x2;
5732 break;
5733 case Intrinsic::aarch64_neon_ld1x3:
5734 case Intrinsic::aarch64_neon_st1x3:
5735 Info.MatchingId = Intrinsic::aarch64_neon_ld1x3;
5736 break;
5737 case Intrinsic::aarch64_neon_ld1x4:
5738 case Intrinsic::aarch64_neon_st1x4:
5739 Info.MatchingId = Intrinsic::aarch64_neon_ld1x4;
5740 break;
5741 case Intrinsic::aarch64_neon_ld2:
5742 case Intrinsic::aarch64_neon_st2:
5743 Info.MatchingId = Intrinsic::aarch64_neon_ld2;
5744 break;
5745 case Intrinsic::aarch64_neon_ld3:
5746 case Intrinsic::aarch64_neon_st3:
5747 Info.MatchingId = Intrinsic::aarch64_neon_ld3;
5748 break;
5749 case Intrinsic::aarch64_neon_ld4:
5750 case Intrinsic::aarch64_neon_st4:
5751 Info.MatchingId = Intrinsic::aarch64_neon_ld4;
5752 break;
5753 }
5754 return true;
5755}
5756
5757/// See if \p I should be considered for address type promotion. We check if \p
5758/// I is a sext with right type and used in memory accesses. If it used in a
5759/// "complex" getelementptr, we allow it to be promoted without finding other
5760/// sext instructions that sign extended the same initial value. A getelementptr
5761/// is considered as "complex" if it has more than 2 operands.
5762bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
5763 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5764 bool Considerable = false;
5765 AllowPromotionWithoutCommonHeader = false;
5766 if (!isa<SExtInst>(Val: &I))
5767 return false;
5768 Type *ConsideredSExtType =
5769 Type::getInt64Ty(C&: I.getParent()->getParent()->getContext());
5770 if (I.getType() != ConsideredSExtType)
5771 return false;
5772 // See if the sext is the one with the right type and used in at least one
5773 // GetElementPtrInst.
5774 for (const User *U : I.users()) {
5775 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(Val: U)) {
5776 Considerable = true;
5777 // A getelementptr is considered as "complex" if it has more than 2
5778 // operands. We will promote a SExt used in such complex GEP as we
5779 // expect some computation to be merged if they are done on 64 bits.
5780 if (GEPInst->getNumOperands() > 2) {
5781 AllowPromotionWithoutCommonHeader = true;
5782 break;
5783 }
5784 }
5785 }
5786 return Considerable;
5787}
5788
5789bool AArch64TTIImpl::isLegalToVectorizeReduction(
5790 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5791 if (!VF.isScalable())
5792 return true;
5793
5794 Type *Ty = RdxDesc.getRecurrenceType();
5795 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5796 return false;
5797
5798 switch (RdxDesc.getRecurrenceKind()) {
5799 case RecurKind::Sub:
5800 case RecurKind::FSub:
5801 case RecurKind::AddChainWithSubs:
5802 case RecurKind::FAddChainWithSubs:
5803 case RecurKind::Add:
5804 case RecurKind::FAdd:
5805 case RecurKind::And:
5806 case RecurKind::Or:
5807 case RecurKind::Xor:
5808 case RecurKind::SMin:
5809 case RecurKind::SMax:
5810 case RecurKind::UMin:
5811 case RecurKind::UMax:
5812 case RecurKind::FMin:
5813 case RecurKind::FMax:
5814 case RecurKind::FMulAdd:
5815 case RecurKind::AnyOf:
5816 case RecurKind::FindLast:
5817 return true;
5818 default:
5819 return false;
5820 }
5821}
5822
5823InstructionCost
5824AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
5825 FastMathFlags FMF,
5826 TTI::TargetCostKind CostKind) const {
5827 // The code-generator is currently not able to handle scalable vectors
5828 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5829 // it. This change will be removed when code-generation for these types is
5830 // sufficiently reliable.
5831 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
5832 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
5833 return InstructionCost::getInvalid();
5834
5835 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5836
5837 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5838 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5839
5840 InstructionCost LegalizationCost = 0;
5841 if (LT.first > 1) {
5842 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: Ty->getContext());
5843 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5844 LegalizationCost = getIntrinsicInstrCost(ICA: Attrs, CostKind) * (LT.first - 1);
5845 }
5846
5847 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5848}
5849
5850InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
5851 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5852 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
5853 InstructionCost LegalizationCost = 0;
5854 if (LT.first > 1) {
5855 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: ValTy->getContext());
5856 LegalizationCost = getArithmeticInstrCost(Opcode, Ty: LegalVTy, CostKind);
5857 LegalizationCost *= LT.first - 1;
5858 }
5859
5860 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5861 assert(ISD && "Invalid opcode");
5862 // Add the final reduction cost for the legal horizontal reduction
5863 switch (ISD) {
5864 case ISD::ADD:
5865 case ISD::AND:
5866 case ISD::OR:
5867 case ISD::XOR:
5868 case ISD::FADD:
5869 return LegalizationCost + 2;
5870 default:
5871 return InstructionCost::getInvalid();
5872 }
5873}
5874
5875InstructionCost
5876AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5877 std::optional<FastMathFlags> FMF,
5878 TTI::TargetCostKind CostKind) const {
5879 // The code-generator is currently not able to handle scalable vectors
5880 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5881 // it. This change will be removed when code-generation for these types is
5882 // sufficiently reliable.
5883 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: ValTy))
5884 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
5885 return InstructionCost::getInvalid();
5886
5887 if (TTI::requiresOrderedReduction(FMF)) {
5888 if (auto *FixedVTy = dyn_cast<FixedVectorType>(Val: ValTy)) {
5889 InstructionCost BaseCost =
5890 BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
5891 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5892 // end up vectorizing for more computationally intensive loops.
5893 return BaseCost + FixedVTy->getNumElements();
5894 }
5895
5896 if (Opcode != Instruction::FAdd || ValTy->getElementType()->isBFloatTy())
5897 return InstructionCost::getInvalid();
5898
5899 auto *VTy = cast<ScalableVectorType>(Val: ValTy);
5900 InstructionCost Cost =
5901 getArithmeticInstrCost(Opcode, Ty: VTy->getScalarType(), CostKind);
5902 Cost *= getMaxNumElements(VF: VTy->getElementCount());
5903 return Cost;
5904 }
5905
5906 if (isa<ScalableVectorType>(Val: ValTy))
5907 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5908
5909 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
5910 MVT MTy = LT.second;
5911 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5912 assert(ISD && "Invalid opcode");
5913
5914 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5915 // instructions as twice a normal vector add, plus 1 for each legalization
5916 // step (LT.first). This is the only arithmetic vector reduction operation for
5917 // which we have an instruction.
5918 // OR, XOR and AND costs should match the codegen from:
5919 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5920 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5921 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
5922 static const CostTblEntry CostTblNoPairwise[]{
5923 {.ISD: ISD::ADD, .Type: MVT::v8i8, .Cost: 2},
5924 {.ISD: ISD::ADD, .Type: MVT::v16i8, .Cost: 2},
5925 {.ISD: ISD::ADD, .Type: MVT::v4i16, .Cost: 2},
5926 {.ISD: ISD::ADD, .Type: MVT::v8i16, .Cost: 2},
5927 {.ISD: ISD::ADD, .Type: MVT::v2i32, .Cost: 2},
5928 {.ISD: ISD::ADD, .Type: MVT::v4i32, .Cost: 2},
5929 {.ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: 2},
5930 {.ISD: ISD::OR, .Type: MVT::v8i8, .Cost: 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
5931 {.ISD: ISD::OR, .Type: MVT::v16i8, .Cost: 7}, // ext + orr + same as v8i8
5932 {.ISD: ISD::OR, .Type: MVT::v4i16, .Cost: 4}, // fmov + orr_lsr + lsr + orr
5933 {.ISD: ISD::OR, .Type: MVT::v8i16, .Cost: 6}, // ext + orr + same as v4i16
5934 {.ISD: ISD::OR, .Type: MVT::v2i32, .Cost: 3}, // fmov + lsr + orr
5935 {.ISD: ISD::OR, .Type: MVT::v4i32, .Cost: 5}, // ext + orr + same as v2i32
5936 {.ISD: ISD::OR, .Type: MVT::v2i64, .Cost: 3}, // ext + orr + fmov
5937 {.ISD: ISD::XOR, .Type: MVT::v8i8, .Cost: 5}, // Same as above for or...
5938 {.ISD: ISD::XOR, .Type: MVT::v16i8, .Cost: 7},
5939 {.ISD: ISD::XOR, .Type: MVT::v4i16, .Cost: 4},
5940 {.ISD: ISD::XOR, .Type: MVT::v8i16, .Cost: 6},
5941 {.ISD: ISD::XOR, .Type: MVT::v2i32, .Cost: 3},
5942 {.ISD: ISD::XOR, .Type: MVT::v4i32, .Cost: 5},
5943 {.ISD: ISD::XOR, .Type: MVT::v2i64, .Cost: 3},
5944 {.ISD: ISD::AND, .Type: MVT::v8i8, .Cost: 5}, // Same as above for or...
5945 {.ISD: ISD::AND, .Type: MVT::v16i8, .Cost: 7},
5946 {.ISD: ISD::AND, .Type: MVT::v4i16, .Cost: 4},
5947 {.ISD: ISD::AND, .Type: MVT::v8i16, .Cost: 6},
5948 {.ISD: ISD::AND, .Type: MVT::v2i32, .Cost: 3},
5949 {.ISD: ISD::AND, .Type: MVT::v4i32, .Cost: 5},
5950 {.ISD: ISD::AND, .Type: MVT::v2i64, .Cost: 3},
5951 };
5952 switch (ISD) {
5953 default:
5954 break;
5955 case ISD::FADD:
5956 if (Type *EltTy = ValTy->getScalarType();
5957 // FIXME: For half types without fullfp16 support, this could extend and
5958 // use a fp32 faddp reduction but current codegen unrolls.
5959 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5960 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5961 const unsigned NElts = MTy.getVectorNumElements();
5962 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5963 isPowerOf2_32(Value: NElts))
5964 // Reduction corresponding to series of fadd instructions is lowered to
5965 // series of faddp instructions. faddp has latency/throughput that
5966 // matches fadd instruction and hence, every faddp instruction can be
5967 // considered to have a relative cost = 1 with
5968 // CostKind = TCK_RecipThroughput.
5969 // An faddp will pairwise add vector elements, so the size of input
5970 // vector reduces by half every time, requiring
5971 // #(faddp instructions) = log2_32(NElts).
5972 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(Value: NElts);
5973 }
5974 break;
5975 case ISD::ADD:
5976 if (const auto *Entry = CostTableLookup(Table: CostTblNoPairwise, ISD, Ty: MTy))
5977 return (LT.first - 1) + Entry->Cost;
5978 break;
5979 case ISD::XOR:
5980 case ISD::AND:
5981 case ISD::OR:
5982 const auto *Entry = CostTableLookup(Table: CostTblNoPairwise, ISD, Ty: MTy);
5983 if (!Entry)
5984 break;
5985 auto *ValVTy = cast<FixedVectorType>(Val: ValTy);
5986 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5987 isPowerOf2_32(Value: ValVTy->getNumElements())) {
5988 InstructionCost ExtraCost = 0;
5989 if (LT.first != 1) {
5990 // Type needs to be split, so there is an extra cost of LT.first - 1
5991 // arithmetic ops.
5992 auto *Ty = FixedVectorType::get(ElementType: ValTy->getElementType(),
5993 NumElts: MTy.getVectorNumElements());
5994 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5995 ExtraCost *= LT.first - 1;
5996 }
5997 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5998 auto Cost = ValVTy->getElementType()->isIntegerTy(BitWidth: 1) ? 2 : Entry->Cost;
5999 return Cost + ExtraCost;
6000 }
6001 break;
6002 }
6003 return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
6004}
6005
6006InstructionCost AArch64TTIImpl::getExtendedReductionCost(
6007 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
6008 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
6009 EVT VecVT = TLI->getValueType(DL, Ty: VecTy);
6010 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
6011
6012 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
6013 VecVT.getSizeInBits() >= 64) {
6014 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VecTy);
6015
6016 // The legal cases are:
6017 // UADDLV 8/16/32->32
6018 // UADDLP 32->64
6019 unsigned RevVTSize = ResVT.getSizeInBits();
6020 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
6021 RevVTSize <= 32) ||
6022 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
6023 RevVTSize <= 32) ||
6024 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
6025 RevVTSize <= 64))
6026 return (LT.first - 1) * 2 + 2;
6027 }
6028
6029 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: VecTy, FMF,
6030 CostKind);
6031}
6032
6033InstructionCost
6034AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
6035 Type *ResTy, VectorType *VecTy,
6036 TTI::TargetCostKind CostKind) const {
6037 EVT VecVT = TLI->getValueType(DL, Ty: VecTy);
6038 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
6039
6040 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
6041 RedOpcode == Instruction::Add) {
6042 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VecTy);
6043
6044 // The legal cases with dotprod are
6045 // UDOT 8->32
6046 // Which requires an additional uaddv to sum the i32 values.
6047 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
6048 ResVT == MVT::i32)
6049 return LT.first + 2;
6050 }
6051
6052 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, Ty: VecTy,
6053 CostKind);
6054}
6055
6056InstructionCost
6057AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index,
6058 TTI::TargetCostKind CostKind) const {
6059 static const CostTblEntry ShuffleTbl[] = {
6060 { .ISD: TTI::SK_Splice, .Type: MVT::nxv16i8, .Cost: 1 },
6061 { .ISD: TTI::SK_Splice, .Type: MVT::nxv8i16, .Cost: 1 },
6062 { .ISD: TTI::SK_Splice, .Type: MVT::nxv4i32, .Cost: 1 },
6063 { .ISD: TTI::SK_Splice, .Type: MVT::nxv2i64, .Cost: 1 },
6064 { .ISD: TTI::SK_Splice, .Type: MVT::nxv2f16, .Cost: 1 },
6065 { .ISD: TTI::SK_Splice, .Type: MVT::nxv4f16, .Cost: 1 },
6066 { .ISD: TTI::SK_Splice, .Type: MVT::nxv8f16, .Cost: 1 },
6067 { .ISD: TTI::SK_Splice, .Type: MVT::nxv2bf16, .Cost: 1 },
6068 { .ISD: TTI::SK_Splice, .Type: MVT::nxv4bf16, .Cost: 1 },
6069 { .ISD: TTI::SK_Splice, .Type: MVT::nxv8bf16, .Cost: 1 },
6070 { .ISD: TTI::SK_Splice, .Type: MVT::nxv2f32, .Cost: 1 },
6071 { .ISD: TTI::SK_Splice, .Type: MVT::nxv4f32, .Cost: 1 },
6072 { .ISD: TTI::SK_Splice, .Type: MVT::nxv2f64, .Cost: 1 },
6073 };
6074
6075 // The code-generator is currently not able to handle scalable vectors
6076 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
6077 // it. This change will be removed when code-generation for these types is
6078 // sufficiently reliable.
6079 if (Tp->getElementCount() == ElementCount::getScalable(MinVal: 1))
6080 return InstructionCost::getInvalid();
6081
6082 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp);
6083 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: Tp->getContext());
6084 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
6085 ? TLI->getPromotedVTForPredicate(VT: EVT(LT.second))
6086 : LT.second;
6087 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Context&: Tp->getContext());
6088 InstructionCost LegalizationCost = 0;
6089 if (Index < 0) {
6090 LegalizationCost =
6091 getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: PromotedVTy, CondTy: PromotedVTy,
6092 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind) +
6093 getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: PromotedVTy, CondTy: LegalVTy,
6094 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
6095 }
6096
6097 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
6098 // Cost performed on a promoted type.
6099 if (LT.second.getScalarType() == MVT::i1) {
6100 LegalizationCost +=
6101 getCastInstrCost(Opcode: Instruction::ZExt, Dst: PromotedVTy, Src: LegalVTy,
6102 CCH: TTI::CastContextHint::None, CostKind) +
6103 getCastInstrCost(Opcode: Instruction::Trunc, Dst: LegalVTy, Src: PromotedVTy,
6104 CCH: TTI::CastContextHint::None, CostKind);
6105 }
6106 const auto *Entry =
6107 CostTableLookup(Table: ShuffleTbl, ISD: TTI::SK_Splice, Ty: PromotedVT.getSimpleVT());
6108 assert(Entry && "Illegal Type for Splice");
6109 LegalizationCost += Entry->Cost;
6110 return LegalizationCost * LT.first;
6111}
6112
6113InstructionCost AArch64TTIImpl::getPartialReductionCost(
6114 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
6115 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
6116 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
6117 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
6118 InstructionCost Invalid = InstructionCost::getInvalid();
6119
6120 if (CostKind != TTI::TCK_RecipThroughput)
6121 return Invalid;
6122
6123 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub &&
6124 Opcode != Instruction::FAdd && Opcode != Instruction::FSub) ||
6125 OpAExtend == TTI::PR_None)
6126 return Invalid;
6127
6128 // Floating-point partial reductions are invalid if `reassoc` and `contract`
6129 // are not allowed.
6130 if (AccumType->isFloatingPointTy()) {
6131 assert(FMF && "Missing FastMathFlags for floating-point partial reduction");
6132 if (!FMF->allowReassoc() || !FMF->allowContract())
6133 return Invalid;
6134 } else {
6135 assert(!FMF &&
6136 "FastMathFlags only apply to floating-point partial reductions");
6137 }
6138
6139 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
6140 (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
6141 "Unexpected values for OpBExtend or InputTypeB");
6142
6143 // We only support multiply binary operations for now, and for muls we
6144 // require the types being extended to be the same.
6145 if (BinOp && ((*BinOp != Instruction::Mul && *BinOp != Instruction::FMul) ||
6146 InputTypeA != InputTypeB))
6147 return Invalid;
6148
6149 bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
6150 // USDot is natively supported with +i8mm. With plain +dotprod, SUMLA is
6151 // lowered to two udots plus an eor and a sub.
6152 if (IsUSDot && !ST->hasMatMulInt8() && !ST->hasDotProd())
6153 // FIXME: Remove this early bailout in favour of expand cost.
6154 return Invalid;
6155
6156 unsigned Ratio =
6157 AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
6158 if (VF.getKnownMinValue() <= Ratio)
6159 return Invalid;
6160
6161 VectorType *InputVectorType = VectorType::get(ElementType: InputTypeA, EC: VF);
6162 VectorType *AccumVectorType =
6163 VectorType::get(ElementType: AccumType, EC: VF.divideCoefficientBy(RHS: Ratio));
6164 // We don't yet support all kinds of legalization.
6165 auto TC = TLI->getTypeConversion(Context&: AccumVectorType->getContext(),
6166 VT: EVT::getEVT(Ty: AccumVectorType));
6167 switch (TC.first) {
6168 default:
6169 return Invalid;
6170 case TargetLowering::TypeLegal:
6171 case TargetLowering::TypePromoteInteger:
6172 case TargetLowering::TypeSplitVector:
6173 // The legalised type (e.g. after splitting) must be legal too.
6174 if (TLI->getTypeAction(Context&: AccumVectorType->getContext(), VT: TC.second) !=
6175 TargetLowering::TypeLegal)
6176 return Invalid;
6177 break;
6178 }
6179
6180 std::pair<InstructionCost, MVT> AccumLT =
6181 getTypeLegalizationCost(Ty: AccumVectorType);
6182 std::pair<InstructionCost, MVT> InputLT =
6183 getTypeLegalizationCost(Ty: InputVectorType);
6184
6185 // Returns true if the subtarget supports the operation for a given type.
6186 auto IsSupported = [&](bool SVEPred, bool NEONPred) -> bool {
6187 return (ST->isSVEorStreamingSVEAvailable() && SVEPred) ||
6188 (AccumLT.second.isFixedLengthVector() &&
6189 AccumLT.second.getSizeInBits() <= 128 && ST->isNeonAvailable() &&
6190 NEONPred);
6191 };
6192
6193 bool IsSub = Opcode == Instruction::Sub || Opcode == Instruction::FSub;
6194 InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
6195 // Integer partial sub-reductions that don't map to a specific instruction,
6196 // carry an extra cost for implementing a double negation:
6197 // partial_reduce_umls acc, lhs, rhs
6198 // <=> -partial_reduce_umla -acc, lhs, rhs
6199 InstructionCost INegCost = IsSub ? 2 * InputLT.first * TTI::TCC_Basic : 0;
6200
6201 if (AccumLT.second.getScalarType() == MVT::i32 &&
6202 InputLT.second.getScalarType() == MVT::i8) {
6203 // i8 -> i32 is natively supported with udot/sdot for both NEON and SVE.
6204 if (!IsUSDot && IsSupported(true, ST->hasDotProd()))
6205 return Cost + INegCost;
6206 // i8 -> i32 usdot requires +i8mm
6207 if (IsUSDot && IsSupported(ST->hasMatMulInt8(), ST->hasMatMulInt8()))
6208 return Cost + INegCost;
6209 // Without +i8mm, lower SUMLA via two udots plus an eor and a sub on plain
6210 // +dotprod targets. Note that this is only implemented for NEON, as all
6211 // modern CPUs with SVE also have +i8mm. Charge an extra factor for the
6212 // expansion.
6213 if (IsUSDot && IsSupported(false, ST->hasDotProd()))
6214 return Cost * 3 + INegCost;
6215 }
6216
6217 if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
6218 // i16 -> i64 is natively supported for udot/sdot
6219 if (AccumLT.second.getScalarType() == MVT::i64 &&
6220 InputLT.second.getScalarType() == MVT::i16)
6221 return Cost + INegCost;
6222 // i16 -> i32 is natively supported with SVE2p1 udot/sdot.
6223 // For sub-reductions, we prefer using the *mlslb/t instructions.
6224 if (AccumLT.second.getScalarType() == MVT::i32 &&
6225 InputLT.second.getScalarType() == MVT::i16 &&
6226 (ST->hasSVE2p1() || ST->hasSME2()) && !IsSub)
6227 return Cost;
6228 // i8 -> i64 is supported with an extra level of extends
6229 if (AccumLT.second.getScalarType() == MVT::i64 &&
6230 InputLT.second.getScalarType() == MVT::i8)
6231 // FIXME: This cost should probably be a little higher, e.g. Cost + 2
6232 // because it requires two extra extends on the inputs. But if we'd change
6233 // that now, a regular reduction would be cheaper because the costs of
6234 // the extends in the IR are still counted. This can be fixed
6235 // after https://github.com/llvm/llvm-project/pull/147302 has landed.
6236 return Cost + INegCost;
6237 // i8 -> i16 is natively supported with SVE2p3 udot/sdot
6238 // For sub-reductions, we prefer using the *mlslb/t instructions.
6239 if (AccumLT.second.getScalarType() == MVT::i16 &&
6240 InputLT.second.getScalarType() == MVT::i8 &&
6241 (ST->hasSVE2p3() || ST->hasSME2p3()) && !IsSub)
6242 return Cost;
6243 }
6244
6245 // f16 -> f32 is natively supported for fdot using either
6246 // SVE or NEON instruction.
6247 if (Opcode == Instruction::FAdd && !IsSub &&
6248 IsSupported(ST->hasSME2() || ST->hasSVE2p1(), ST->hasF16F32DOT()) &&
6249 AccumLT.second.getScalarType() == MVT::f32 &&
6250 InputLT.second.getScalarType() == MVT::f16)
6251 return Cost;
6252
6253 // For a ratio of 2, we can use *mlal and *mlsl top/bottom instructions.
6254 if (Ratio == 2 && !IsUSDot) {
6255 MVT InVT = InputLT.second.getScalarType();
6256
6257 // SVE2 [us]ml[as]lb/t and NEON [us]ml[as]l(2)
6258 if (IsSupported(ST->hasSVE2() || ST->hasSME(), true) &&
6259 llvm::is_contained(Set: {MVT::i8, MVT::i16, MVT::i32}, Element: InVT.SimpleTy))
6260 return Cost * 2;
6261
6262 // SVE2 fml[as]lb/t and NEON fml[as]l(2)
6263 if (IsSupported(ST->hasSVE2(), ST->hasFP16FML()) && InVT == MVT::f16)
6264 return Cost * 2;
6265
6266 // SME2/SVE2p1 bfmlslb/t
6267 if (IsSupported(ST->hasSVE2p1() || ST->hasSME2(), false) &&
6268 InVT == MVT::bf16 && IsSub)
6269 return Cost * 2;
6270
6271 // FP partial sub-reductions that don't map to a specific instruction,
6272 // carry an extra cost for implementing an extra negation:
6273 // partial_reduce_fmls acc, lhs, rhs
6274 // <=> partial_reduce_fmla acc, lhs, -rhs
6275 InstructionCost FNegCost = IsSub ? InputLT.first * TTI::TCC_Basic : 0;
6276
6277 // SVE and NEON bfmlalb/t
6278 if (IsSupported(ST->hasBF16(), ST->hasBF16()) && InVT == MVT::bf16)
6279 return Cost * 2 + FNegCost;
6280 }
6281
6282 return BaseT::getPartialReductionCost(Opcode, InputTypeA, InputTypeB,
6283 AccumType, VF, OpAExtend, OpBExtend,
6284 BinOp, CostKind, FMF);
6285}
6286
6287InstructionCost
6288AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
6289 VectorType *SrcTy, ArrayRef<int> Mask,
6290 TTI::TargetCostKind CostKind, int Index,
6291 VectorType *SubTp, ArrayRef<const Value *> Args,
6292 const Instruction *CxtI) const {
6293 assert((Mask.empty() || DstTy->isScalableTy() ||
6294 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
6295 "Expected the Mask to match the return size if given");
6296 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
6297 "Expected the same scalar types");
6298 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy);
6299
6300 // If we have a Mask, and the LT is being legalized somehow, split the Mask
6301 // into smaller vectors and sum the cost of each shuffle.
6302 if (!Mask.empty() && isa<FixedVectorType>(Val: SrcTy) && LT.second.isVector() &&
6303 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
6304 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
6305 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
6306 // Check for LD3/LD4 instructions, which are represented in llvm IR as
6307 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
6308 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
6309 // cost than just the load.
6310 if (Args.size() >= 1 && isa<LoadInst>(Val: Args[0]) &&
6311 (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 3) ||
6312 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 4)))
6313 return std::max<InstructionCost>(a: 1, b: LT.first / 4);
6314
6315 // Check for ST3/ST4 instructions, which are represented in llvm IR as
6316 // store(interleaving-shuffle). The shuffle cost could potentially be free,
6317 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
6318 // cost than just the store.
6319 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(Val: *CxtI->user_begin()) &&
6320 (ShuffleVectorInst::isInterleaveMask(
6321 Mask, Factor: 4, NumInputElts: SrcTy->getElementCount().getKnownMinValue() * 2) ||
6322 ShuffleVectorInst::isInterleaveMask(
6323 Mask, Factor: 3, NumInputElts: SrcTy->getElementCount().getKnownMinValue() * 2)))
6324 return LT.first;
6325
6326 unsigned TpNumElts = Mask.size();
6327 unsigned LTNumElts = LT.second.getVectorNumElements();
6328 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
6329 VectorType *NTp = VectorType::get(ElementType: SrcTy->getScalarType(),
6330 EC: LT.second.getVectorElementCount());
6331 InstructionCost Cost;
6332 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
6333 PreviousCosts;
6334 for (unsigned N = 0; N < NumVecs; N++) {
6335 SmallVector<int> NMask;
6336 // Split the existing mask into chunks of size LTNumElts. Track the source
6337 // sub-vectors to ensure the result has at most 2 inputs.
6338 unsigned Source1 = -1U, Source2 = -1U;
6339 unsigned NumSources = 0;
6340 for (unsigned E = 0; E < LTNumElts; E++) {
6341 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
6342 : PoisonMaskElem;
6343 if (MaskElt < 0) {
6344 NMask.push_back(Elt: PoisonMaskElem);
6345 continue;
6346 }
6347
6348 // Calculate which source from the input this comes from and whether it
6349 // is new to us.
6350 unsigned Source = MaskElt / LTNumElts;
6351 if (NumSources == 0) {
6352 Source1 = Source;
6353 NumSources = 1;
6354 } else if (NumSources == 1 && Source != Source1) {
6355 Source2 = Source;
6356 NumSources = 2;
6357 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
6358 NumSources++;
6359 }
6360
6361 // Add to the new mask. For the NumSources>2 case these are not correct,
6362 // but are only used for the modular lane number.
6363 if (Source == Source1)
6364 NMask.push_back(Elt: MaskElt % LTNumElts);
6365 else if (Source == Source2)
6366 NMask.push_back(Elt: MaskElt % LTNumElts + LTNumElts);
6367 else
6368 NMask.push_back(Elt: MaskElt % LTNumElts);
6369 }
6370 // Check if we have already generated this sub-shuffle, which means we
6371 // will have already generated the output. For example a <16 x i32> splat
6372 // will be the same sub-splat 4 times, which only needs to be generated
6373 // once and reused.
6374 auto Result =
6375 PreviousCosts.insert(x: {std::make_tuple(args&: Source1, args&: Source2, args&: NMask), 0});
6376 // Check if it was already in the map (already costed).
6377 if (!Result.second)
6378 continue;
6379 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
6380 // getShuffleCost. If not then cost it using the worst case as the number
6381 // of element moves into a new vector.
6382 InstructionCost NCost =
6383 NumSources <= 2
6384 ? getShuffleCost(Kind: NumSources <= 1 ? TTI::SK_PermuteSingleSrc
6385 : TTI::SK_PermuteTwoSrc,
6386 DstTy: NTp, SrcTy: NTp, Mask: NMask, CostKind, Index: 0, SubTp: nullptr, Args,
6387 CxtI)
6388 : LTNumElts;
6389 Result.first->second = NCost;
6390 Cost += NCost;
6391 }
6392 return Cost;
6393 }
6394
6395 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp);
6396 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
6397 // A subvector extract can be implemented with a NEON/SVE ext (or trivial
6398 // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
6399 // This currently only handles low or high extracts to prevent SLP vectorizer
6400 // regressions.
6401 // Note that SVE's ext instruction is destructive, but it can be fused with
6402 // a movprfx to act like a constructive instruction.
6403 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
6404 if (LT.second.getFixedSizeInBits() >= 128 &&
6405 cast<FixedVectorType>(Val: SubTp)->getNumElements() ==
6406 LT.second.getVectorNumElements() / 2) {
6407 if (Index == 0)
6408 return 0;
6409 if (Index == (int)LT.second.getVectorNumElements() / 2)
6410 return 1;
6411 }
6412 Kind = TTI::SK_PermuteSingleSrc;
6413 }
6414 // FIXME: This was added to keep the costs equal when adding DstTys. Update
6415 // the code to handle length-changing shuffles.
6416 if (Kind == TTI::SK_InsertSubvector) {
6417 LT = getTypeLegalizationCost(Ty: DstTy);
6418 SrcTy = DstTy;
6419 }
6420
6421 // Check for identity masks, which we can treat as free for both fixed and
6422 // scalable vector paths.
6423 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6424 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6425 all_of(Range: enumerate(First&: Mask), P: [](const auto &M) {
6426 return M.value() < 0 || M.value() == (int)M.index();
6427 }))
6428 return 0;
6429
6430 // Segmented shuffle matching.
6431 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(Val: SrcTy) &&
6432 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
6433 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
6434 RHS: AArch64::SVEBitsPerBlock)) {
6435
6436 FixedVectorType *VTy = cast<FixedVectorType>(Val: SrcTy);
6437 unsigned Segments =
6438 VTy->getPrimitiveSizeInBits() / AArch64::SVEBitsPerBlock;
6439 unsigned SegmentElts = VTy->getNumElements() / Segments;
6440
6441 // dupq zd.t, zn.t[idx]
6442 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
6443 ST->isSVEorStreamingSVEAvailable() &&
6444 isDUPQMask(Mask, Segments, SegmentSize: SegmentElts))
6445 return LT.first;
6446
6447 // mov zd.q, vn
6448 if (ST->isSVEorStreamingSVEAvailable() &&
6449 isDUPFirstSegmentMask(Mask, Segments, SegmentSize: SegmentElts))
6450 return LT.first;
6451 }
6452
6453 // Check for broadcast loads, which are supported by the LD1R instruction.
6454 // In terms of code-size, the shuffle vector is free when a load + dup get
6455 // folded into a LD1R. That's what we check and return here. For performance
6456 // and reciprocal throughput, a LD1R is not completely free. In this case, we
6457 // return the cost for the broadcast below (i.e. 1 for most/all types), so
6458 // that we model the load + dup sequence slightly higher because LD1R is a
6459 // high latency instruction.
6460 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
6461 bool IsLoad = !Args.empty() && isa<LoadInst>(Val: Args[0]);
6462 if (IsLoad && LT.second.isVector() &&
6463 isLegalBroadcastLoad(ElementTy: SrcTy->getElementType(),
6464 NumElements: LT.second.getVectorElementCount()))
6465 return 0;
6466 }
6467
6468 // If we have 4 elements for the shuffle and a Mask, get the cost straight
6469 // from the perfect shuffle tables.
6470 if (Mask.size() == 4 &&
6471 SrcTy->getElementCount() == ElementCount::getFixed(MinVal: 4) &&
6472 (SrcTy->getScalarSizeInBits() == 16 ||
6473 SrcTy->getScalarSizeInBits() == 32) &&
6474 all_of(Range&: Mask, P: [](int E) { return E < 8; }))
6475 return getPerfectShuffleCost(M: Mask);
6476
6477 // Check for other shuffles that are not SK_ kinds but we have native
6478 // instructions for, for example ZIP and UZP.
6479 unsigned Unused;
6480 if (LT.second.isFixedLengthVector() &&
6481 LT.second.getVectorNumElements() == Mask.size() &&
6482 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc ||
6483 // Discrepancies between isTRNMask and ShuffleVectorInst::isTransposeMask
6484 // mean that we can end up with shuffles that satisfy isTRNMask, but end
6485 // up labelled as TTI::SK_InsertSubvector. (e.g. {2, 0}).
6486 Kind == TTI::SK_InsertSubvector) &&
6487 (isZIPMask(M: Mask, NumElts: LT.second.getVectorNumElements(), WhichResultOut&: Unused, OperandOrderOut&: Unused) ||
6488 isTRNMask(M: Mask, NumElts: LT.second.getVectorNumElements(), WhichResultOut&: Unused, OperandOrderOut&: Unused) ||
6489 isUZPMask(M: Mask, NumElts: LT.second.getVectorNumElements(), WhichResultOut&: Unused) ||
6490 isREVMask(M: Mask, EltSize: LT.second.getScalarSizeInBits(),
6491 NumElts: LT.second.getVectorNumElements(), BlockSize: 16) ||
6492 isREVMask(M: Mask, EltSize: LT.second.getScalarSizeInBits(),
6493 NumElts: LT.second.getVectorNumElements(), BlockSize: 32) ||
6494 isREVMask(M: Mask, EltSize: LT.second.getScalarSizeInBits(),
6495 NumElts: LT.second.getVectorNumElements(), BlockSize: 64) ||
6496 // Check for non-zero lane splats
6497 all_of(Range: drop_begin(RangeOrContainer&: Mask),
6498 P: [&Mask](int M) { return M < 0 || M == Mask[0]; })))
6499 return 1;
6500
6501 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
6502 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
6503 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
6504 static const CostTblEntry ShuffleTbl[] = {
6505 // Broadcast shuffle kinds can be performed with 'dup'.
6506 {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i8, .Cost: 1},
6507 {.ISD: TTI::SK_Broadcast, .Type: MVT::v16i8, .Cost: 1},
6508 {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i16, .Cost: 1},
6509 {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i16, .Cost: 1},
6510 {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i32, .Cost: 1},
6511 {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i32, .Cost: 1},
6512 {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i64, .Cost: 1},
6513 {.ISD: TTI::SK_Broadcast, .Type: MVT::v4f16, .Cost: 1},
6514 {.ISD: TTI::SK_Broadcast, .Type: MVT::v8f16, .Cost: 1},
6515 {.ISD: TTI::SK_Broadcast, .Type: MVT::v4bf16, .Cost: 1},
6516 {.ISD: TTI::SK_Broadcast, .Type: MVT::v8bf16, .Cost: 1},
6517 {.ISD: TTI::SK_Broadcast, .Type: MVT::v2f32, .Cost: 1},
6518 {.ISD: TTI::SK_Broadcast, .Type: MVT::v4f32, .Cost: 1},
6519 {.ISD: TTI::SK_Broadcast, .Type: MVT::v2f64, .Cost: 1},
6520 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
6521 // 'zip1/zip2' instructions.
6522 {.ISD: TTI::SK_Transpose, .Type: MVT::v8i8, .Cost: 1},
6523 {.ISD: TTI::SK_Transpose, .Type: MVT::v16i8, .Cost: 1},
6524 {.ISD: TTI::SK_Transpose, .Type: MVT::v4i16, .Cost: 1},
6525 {.ISD: TTI::SK_Transpose, .Type: MVT::v8i16, .Cost: 1},
6526 {.ISD: TTI::SK_Transpose, .Type: MVT::v2i32, .Cost: 1},
6527 {.ISD: TTI::SK_Transpose, .Type: MVT::v4i32, .Cost: 1},
6528 {.ISD: TTI::SK_Transpose, .Type: MVT::v2i64, .Cost: 1},
6529 {.ISD: TTI::SK_Transpose, .Type: MVT::v4f16, .Cost: 1},
6530 {.ISD: TTI::SK_Transpose, .Type: MVT::v8f16, .Cost: 1},
6531 {.ISD: TTI::SK_Transpose, .Type: MVT::v4bf16, .Cost: 1},
6532 {.ISD: TTI::SK_Transpose, .Type: MVT::v8bf16, .Cost: 1},
6533 {.ISD: TTI::SK_Transpose, .Type: MVT::v2f32, .Cost: 1},
6534 {.ISD: TTI::SK_Transpose, .Type: MVT::v4f32, .Cost: 1},
6535 {.ISD: TTI::SK_Transpose, .Type: MVT::v2f64, .Cost: 1},
6536 // Select shuffle kinds.
6537 // TODO: handle vXi8/vXi16.
6538 {.ISD: TTI::SK_Select, .Type: MVT::v2i32, .Cost: 1}, // mov.
6539 {.ISD: TTI::SK_Select, .Type: MVT::v4i32, .Cost: 2}, // rev+trn (or similar).
6540 {.ISD: TTI::SK_Select, .Type: MVT::v2i64, .Cost: 1}, // mov.
6541 {.ISD: TTI::SK_Select, .Type: MVT::v2f32, .Cost: 1}, // mov.
6542 {.ISD: TTI::SK_Select, .Type: MVT::v4f32, .Cost: 2}, // rev+trn (or similar).
6543 {.ISD: TTI::SK_Select, .Type: MVT::v2f64, .Cost: 1}, // mov.
6544 // PermuteSingleSrc shuffle kinds.
6545 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i32, .Cost: 1}, // mov.
6546 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i32, .Cost: 3}, // perfectshuffle worst case.
6547 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i64, .Cost: 1}, // mov.
6548 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f32, .Cost: 1}, // mov.
6549 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f32, .Cost: 3}, // perfectshuffle worst case.
6550 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f64, .Cost: 1}, // mov.
6551 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i16, .Cost: 3}, // perfectshuffle worst case.
6552 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f16, .Cost: 3}, // perfectshuffle worst case.
6553 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4bf16, .Cost: 3}, // same
6554 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i16, .Cost: 8}, // constpool + load + tbl
6555 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f16, .Cost: 8}, // constpool + load + tbl
6556 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8bf16, .Cost: 8}, // constpool + load + tbl
6557 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i8, .Cost: 8}, // constpool + load + tbl
6558 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i8, .Cost: 8}, // constpool + load + tbl
6559 // Reverse can be lowered with `rev`.
6560 {.ISD: TTI::SK_Reverse, .Type: MVT::v2i32, .Cost: 1}, // REV64
6561 {.ISD: TTI::SK_Reverse, .Type: MVT::v4i32, .Cost: 2}, // REV64; EXT
6562 {.ISD: TTI::SK_Reverse, .Type: MVT::v2i64, .Cost: 1}, // EXT
6563 {.ISD: TTI::SK_Reverse, .Type: MVT::v2f32, .Cost: 1}, // REV64
6564 {.ISD: TTI::SK_Reverse, .Type: MVT::v4f32, .Cost: 2}, // REV64; EXT
6565 {.ISD: TTI::SK_Reverse, .Type: MVT::v2f64, .Cost: 1}, // EXT
6566 {.ISD: TTI::SK_Reverse, .Type: MVT::v8f16, .Cost: 2}, // REV64; EXT
6567 {.ISD: TTI::SK_Reverse, .Type: MVT::v8bf16, .Cost: 2}, // REV64; EXT
6568 {.ISD: TTI::SK_Reverse, .Type: MVT::v8i16, .Cost: 2}, // REV64; EXT
6569 {.ISD: TTI::SK_Reverse, .Type: MVT::v16i8, .Cost: 2}, // REV64; EXT
6570 {.ISD: TTI::SK_Reverse, .Type: MVT::v4f16, .Cost: 1}, // REV64
6571 {.ISD: TTI::SK_Reverse, .Type: MVT::v4bf16, .Cost: 1}, // REV64
6572 {.ISD: TTI::SK_Reverse, .Type: MVT::v4i16, .Cost: 1}, // REV64
6573 {.ISD: TTI::SK_Reverse, .Type: MVT::v8i8, .Cost: 1}, // REV64
6574 // Splice can all be lowered as `ext`.
6575 {.ISD: TTI::SK_Splice, .Type: MVT::v2i32, .Cost: 1},
6576 {.ISD: TTI::SK_Splice, .Type: MVT::v4i32, .Cost: 1},
6577 {.ISD: TTI::SK_Splice, .Type: MVT::v2i64, .Cost: 1},
6578 {.ISD: TTI::SK_Splice, .Type: MVT::v2f32, .Cost: 1},
6579 {.ISD: TTI::SK_Splice, .Type: MVT::v4f32, .Cost: 1},
6580 {.ISD: TTI::SK_Splice, .Type: MVT::v2f64, .Cost: 1},
6581 {.ISD: TTI::SK_Splice, .Type: MVT::v8f16, .Cost: 1},
6582 {.ISD: TTI::SK_Splice, .Type: MVT::v8bf16, .Cost: 1},
6583 {.ISD: TTI::SK_Splice, .Type: MVT::v8i16, .Cost: 1},
6584 {.ISD: TTI::SK_Splice, .Type: MVT::v16i8, .Cost: 1},
6585 {.ISD: TTI::SK_Splice, .Type: MVT::v4f16, .Cost: 1},
6586 {.ISD: TTI::SK_Splice, .Type: MVT::v4bf16, .Cost: 1},
6587 {.ISD: TTI::SK_Splice, .Type: MVT::v4i16, .Cost: 1},
6588 {.ISD: TTI::SK_Splice, .Type: MVT::v8i8, .Cost: 1},
6589 // Broadcast shuffle kinds for scalable vectors
6590 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv16i8, .Cost: 1},
6591 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8i16, .Cost: 1},
6592 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4i32, .Cost: 1},
6593 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2i64, .Cost: 1},
6594 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f16, .Cost: 1},
6595 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4f16, .Cost: 1},
6596 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8f16, .Cost: 1},
6597 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2bf16, .Cost: 1},
6598 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4bf16, .Cost: 1},
6599 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8bf16, .Cost: 1},
6600 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f32, .Cost: 1},
6601 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4f32, .Cost: 1},
6602 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f64, .Cost: 1},
6603 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv16i1, .Cost: 1},
6604 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8i1, .Cost: 1},
6605 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4i1, .Cost: 1},
6606 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2i1, .Cost: 1},
6607 // Handle the cases for vector.reverse with scalable vectors
6608 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv16i8, .Cost: 1},
6609 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8i16, .Cost: 1},
6610 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4i32, .Cost: 1},
6611 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2i64, .Cost: 1},
6612 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f16, .Cost: 1},
6613 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4f16, .Cost: 1},
6614 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8f16, .Cost: 1},
6615 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2bf16, .Cost: 1},
6616 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4bf16, .Cost: 1},
6617 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8bf16, .Cost: 1},
6618 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f32, .Cost: 1},
6619 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4f32, .Cost: 1},
6620 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f64, .Cost: 1},
6621 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv16i1, .Cost: 1},
6622 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8i1, .Cost: 1},
6623 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4i1, .Cost: 1},
6624 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2i1, .Cost: 1},
6625 };
6626 if (const auto *Entry = CostTableLookup(Table: ShuffleTbl, ISD: Kind, Ty: LT.second))
6627 return LT.first * Entry->Cost;
6628 }
6629
6630 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Val: SrcTy))
6631 return getSpliceCost(Tp: SrcTy, Index, CostKind);
6632
6633 // Inserting a subvector can often be done with either a D, S or H register
6634 // move, so long as the inserted vector is "aligned".
6635 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6636 LT.second.getSizeInBits() <= 128 && SubTp) {
6637 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(Ty: SubTp);
6638 if (SubLT.second.isVector()) {
6639 int NumElts = LT.second.getVectorNumElements();
6640 int NumSubElts = SubLT.second.getVectorNumElements();
6641 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6642 return SubLT.first;
6643 }
6644 }
6645
6646 // Restore optimal kind.
6647 if (IsExtractSubvector)
6648 Kind = TTI::SK_ExtractSubvector;
6649 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6650 Args, CxtI);
6651}
6652
6653static bool containsDecreasingPointers(Loop *TheLoop,
6654 PredicatedScalarEvolution *PSE,
6655 const DominatorTree &DT) {
6656 const auto &Strides = DenseMap<Value *, const SCEV *>();
6657 for (BasicBlock *BB : TheLoop->blocks()) {
6658 // Scan the instructions in the block and look for addresses that are
6659 // consecutive and decreasing.
6660 for (Instruction &I : *BB) {
6661 if (isa<LoadInst>(Val: &I) || isa<StoreInst>(Val: &I)) {
6662 Value *Ptr = getLoadStorePointerOperand(V: &I);
6663 Type *AccessTy = getLoadStoreType(I: &I);
6664 if (getPtrStride(PSE&: *PSE, AccessTy, Ptr, Lp: TheLoop, DT, StridesMap: Strides,
6665 /*Assume=*/true, /*ShouldCheckWrap=*/false)
6666 .value_or(u: 0) < 0)
6667 return true;
6668 }
6669 }
6670 }
6671 return false;
6672}
6673
6674bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
6675 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6676 return SVEPreferFixedOverScalableIfEqualCost;
6677 // For cases like post-LTO vectorization, when we eventually know the trip
6678 // count, epilogue with fixed-width vectorization can be deleted if the trip
6679 // count is less than the epilogue iterations. That's why we prefer
6680 // fixed-width vectorization in epilogue in case of equal costs.
6681 if (IsEpilogue)
6682 return true;
6683 return ST->useFixedOverScalableIfEqualCost();
6684}
6685
6686unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
6687 return ST->getEpilogueVectorizationMinVF();
6688}
6689
6690bool AArch64TTIImpl::preferTailFoldingOverEpilogue(TailFoldingInfo *TFI) const {
6691 if (!ST->hasSVE())
6692 return false;
6693
6694 // We don't currently support vectorisation with interleaving for SVE - with
6695 // such loops we're better off not using tail-folding. This gives us a chance
6696 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6697 if (TFI->IAI->hasGroups())
6698 return false;
6699
6700 TailFoldingOpts Required = TailFoldingOpts::Disabled;
6701 if (TFI->LVL->getReductionVars().size())
6702 Required |= TailFoldingOpts::Reductions;
6703 if (TFI->LVL->getFixedOrderRecurrences().size())
6704 Required |= TailFoldingOpts::Recurrences;
6705
6706 // We call this to discover whether any load/store pointers in the loop have
6707 // negative strides. This will require extra work to reverse the loop
6708 // predicate, which may be expensive.
6709 if (containsDecreasingPointers(TheLoop: TFI->LVL->getLoop(),
6710 PSE: TFI->LVL->getPredicatedScalarEvolution(),
6711 DT: *TFI->LVL->getDominatorTree()))
6712 Required |= TailFoldingOpts::Reverse;
6713 if (Required == TailFoldingOpts::Disabled)
6714 Required |= TailFoldingOpts::Simple;
6715
6716 if (!TailFoldingOptionLoc.satisfies(DefaultBits: ST->getSVETailFoldingDefaultOpts(),
6717 Required))
6718 return false;
6719
6720 // Don't tail-fold for tight loops where we would be better off interleaving
6721 // with an unpredicated loop.
6722 unsigned NumInsns = 0;
6723 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6724 NumInsns += BB->size();
6725 }
6726
6727 // We expect 4 of these to be a IV PHI, IV add, IV compare and branch.
6728 return NumInsns >= SVETailFoldInsnThreshold;
6729}
6730
6731InstructionCost
6732AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6733 StackOffset BaseOffset, bool HasBaseReg,
6734 int64_t Scale, unsigned AddrSpace) const {
6735 // Scaling factors are not free at all.
6736 // Operands | Rt Latency
6737 // -------------------------------------------
6738 // Rt, [Xn, Xm] | 4
6739 // -------------------------------------------
6740 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6741 // Rt, [Xn, Wm, <extend> #imm] |
6742 TargetLoweringBase::AddrMode AM;
6743 AM.BaseGV = BaseGV;
6744 AM.BaseOffs = BaseOffset.getFixed();
6745 AM.HasBaseReg = HasBaseReg;
6746 AM.Scale = Scale;
6747 AM.ScalableOffset = BaseOffset.getScalable();
6748 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AS: AddrSpace))
6749 // Scale represents reg2 * scale, thus account for 1 if
6750 // it is not equal to 0 or 1.
6751 return AM.Scale != 0 && AM.Scale != 1;
6752 return InstructionCost::getInvalid();
6753}
6754
6755bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(
6756 const Instruction *I) const {
6757 if (EnableOrLikeSelectOpt) {
6758 // For the binary operators (e.g. or) we need to be more careful than
6759 // selects, here we only transform them if they are already at a natural
6760 // break point in the code - the end of a block with an unconditional
6761 // terminator.
6762 if (I->getOpcode() == Instruction::Or &&
6763 isa<UncondBrInst>(Val: I->getNextNode()))
6764 return true;
6765
6766 if (I->getOpcode() == Instruction::Add ||
6767 I->getOpcode() == Instruction::Sub)
6768 return true;
6769 }
6770 return BaseT::shouldTreatInstructionLikeSelect(I);
6771}
6772
6773bool AArch64TTIImpl::isLSRCostLess(
6774 const TargetTransformInfo::LSRCost &C1,
6775 const TargetTransformInfo::LSRCost &C2) const {
6776 // AArch64 specific here is adding the number of instructions to the
6777 // comparison (though not as the first consideration, as some targets do)
6778 // along with changing the priority of the base additions.
6779 // TODO: Maybe a more nuanced tradeoff between instruction count
6780 // and number of registers? To be investigated at a later date.
6781 if (EnableLSRCostOpt)
6782 return std::tie(args: C1.NumRegs, args: C1.Insns, args: C1.NumBaseAdds, args: C1.AddRecCost,
6783 args: C1.NumIVMuls, args: C1.ScaleCost, args: C1.ImmCost, args: C1.SetupCost) <
6784 std::tie(args: C2.NumRegs, args: C2.Insns, args: C2.NumBaseAdds, args: C2.AddRecCost,
6785 args: C2.NumIVMuls, args: C2.ScaleCost, args: C2.ImmCost, args: C2.SetupCost);
6786
6787 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
6788}
6789
6790static bool isSplatShuffle(Value *V) {
6791 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Val: V))
6792 return all_equal(Range: Shuf->getShuffleMask());
6793 return false;
6794}
6795
6796/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6797/// or upper half of the vector elements.
6798static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
6799 bool AllowSplat = false) {
6800 // Scalable types can't be extract shuffle vectors.
6801 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
6802 return false;
6803
6804 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
6805 auto *FullTy = FullV->getType();
6806 auto *HalfTy = HalfV->getType();
6807 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6808 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6809 };
6810
6811 auto extractHalf = [](Value *FullV, Value *HalfV) {
6812 auto *FullVT = cast<FixedVectorType>(Val: FullV->getType());
6813 auto *HalfVT = cast<FixedVectorType>(Val: HalfV->getType());
6814 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6815 };
6816
6817 ArrayRef<int> M1, M2;
6818 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
6819 if (!match(V: Op1, P: m_Shuffle(v1: m_Value(V&: S1Op1), v2: m_Undef(), mask: m_Mask(M1))) ||
6820 !match(V: Op2, P: m_Shuffle(v1: m_Value(V&: S2Op1), v2: m_Undef(), mask: m_Mask(M2))))
6821 return false;
6822
6823 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6824 // it is not checked as an extract below.
6825 if (AllowSplat && isSplatShuffle(V: Op1))
6826 S1Op1 = nullptr;
6827 if (AllowSplat && isSplatShuffle(V: Op2))
6828 S2Op1 = nullptr;
6829
6830 // Check that the operands are half as wide as the result and we extract
6831 // half of the elements of the input vectors.
6832 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6833 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6834 return false;
6835
6836 // Check the mask extracts either the lower or upper half of vector
6837 // elements.
6838 int M1Start = 0;
6839 int M2Start = 0;
6840 int NumElements = cast<FixedVectorType>(Val: Op1->getType())->getNumElements() * 2;
6841 if ((S1Op1 &&
6842 !ShuffleVectorInst::isExtractSubvectorMask(Mask: M1, NumSrcElts: NumElements, Index&: M1Start)) ||
6843 (S2Op1 &&
6844 !ShuffleVectorInst::isExtractSubvectorMask(Mask: M2, NumSrcElts: NumElements, Index&: M2Start)))
6845 return false;
6846
6847 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6848 (M2Start != 0 && M2Start != (NumElements / 2)))
6849 return false;
6850 if (S1Op1 && S2Op1 && M1Start != M2Start)
6851 return false;
6852
6853 return true;
6854}
6855
6856/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6857/// of the vector elements.
6858static bool areExtractExts(Value *Ext1, Value *Ext2) {
6859 auto areExtDoubled = [](Instruction *Ext) {
6860 return Ext->getType()->getScalarSizeInBits() ==
6861 2 * Ext->getOperand(i: 0)->getType()->getScalarSizeInBits();
6862 };
6863
6864 if (!match(V: Ext1, P: m_ZExtOrSExt(Op: m_Value())) ||
6865 !match(V: Ext2, P: m_ZExtOrSExt(Op: m_Value())) ||
6866 !areExtDoubled(cast<Instruction>(Val: Ext1)) ||
6867 !areExtDoubled(cast<Instruction>(Val: Ext2)))
6868 return false;
6869
6870 return true;
6871}
6872
6873/// Check if Op could be used with vmull_high_p64 intrinsic.
6874static bool isOperandOfVmullHighP64(Value *Op) {
6875 Value *VectorOperand = nullptr;
6876 ConstantInt *ElementIndex = nullptr;
6877 return match(V: Op, P: m_ExtractElt(Val: m_Value(V&: VectorOperand),
6878 Idx: m_ConstantInt(CI&: ElementIndex))) &&
6879 ElementIndex->getValue() == 1 &&
6880 isa<FixedVectorType>(Val: VectorOperand->getType()) &&
6881 cast<FixedVectorType>(Val: VectorOperand->getType())->getNumElements() == 2;
6882}
6883
6884/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6885static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6886 return isOperandOfVmullHighP64(Op: Op1) && isOperandOfVmullHighP64(Op: Op2);
6887}
6888
6889static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
6890 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6891 auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptrs);
6892 if (!GEP || GEP->getNumOperands() != 2)
6893 return false;
6894
6895 Value *Base = GEP->getOperand(i_nocapture: 0);
6896 Value *Offsets = GEP->getOperand(i_nocapture: 1);
6897
6898 // We only care about scalar_base+vector_offsets.
6899 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6900 return false;
6901
6902 // Sink extends that would allow us to use 32-bit offset vectors.
6903 if (isa<SExtInst>(Val: Offsets) || isa<ZExtInst>(Val: Offsets)) {
6904 auto *OffsetsInst = cast<Instruction>(Val: Offsets);
6905 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6906 OffsetsInst->getOperand(i: 0)->getType()->getScalarSizeInBits() <= 32)
6907 Ops.push_back(Elt: &GEP->getOperandUse(i: 1));
6908 }
6909
6910 // Sink the GEP.
6911 return true;
6912}
6913
6914/// We want to sink following cases:
6915/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6916/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
6917static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
6918 if (match(V: Op, P: m_VScale()))
6919 return true;
6920 if (match(V: Op, P: m_Shl(L: m_VScale(), R: m_ConstantInt())) ||
6921 match(V: Op, P: m_Mul(L: m_VScale(), R: m_ConstantInt()))) {
6922 Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: 0));
6923 return true;
6924 }
6925 if (match(V: Op, P: m_Shl(L: m_ZExt(Op: m_VScale()), R: m_ConstantInt())) ||
6926 match(V: Op, P: m_Mul(L: m_ZExt(Op: m_VScale()), R: m_ConstantInt()))) {
6927 Value *ZExtOp = cast<Instruction>(Val: Op)->getOperand(i: 0);
6928 Ops.push_back(Elt: &cast<Instruction>(Val: ZExtOp)->getOperandUse(i: 0));
6929 Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: 0));
6930 return true;
6931 }
6932 return false;
6933}
6934
6935static bool isFNeg(Value *Op) { return match(V: Op, P: m_FNeg(X: m_Value())); }
6936
6937/// Check if sinking \p I's operands to I's basic block is profitable, because
6938/// the operands can be folded into a target instruction, e.g.
6939/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
6940bool AArch64TTIImpl::isProfitableToSinkOperands(
6941 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6942 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: I)) {
6943 switch (II->getIntrinsicID()) {
6944 case Intrinsic::aarch64_neon_smull:
6945 case Intrinsic::aarch64_neon_umull:
6946 if (areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1),
6947 /*AllowSplat=*/true)) {
6948 Ops.push_back(Elt: &II->getOperandUse(i: 0));
6949 Ops.push_back(Elt: &II->getOperandUse(i: 1));
6950 return true;
6951 }
6952 [[fallthrough]];
6953
6954 case Intrinsic::fma:
6955 case Intrinsic::fmuladd:
6956 if (isa<VectorType>(Val: I->getType()) &&
6957 cast<VectorType>(Val: I->getType())->getElementType()->isHalfTy() &&
6958 !ST->hasFullFP16())
6959 return false;
6960
6961 if (isFNeg(Op: II->getOperand(i_nocapture: 0)))
6962 Ops.push_back(Elt: &II->getOperandUse(i: 0));
6963 if (isFNeg(Op: II->getOperand(i_nocapture: 1)))
6964 Ops.push_back(Elt: &II->getOperandUse(i: 1));
6965
6966 [[fallthrough]];
6967 case Intrinsic::aarch64_neon_sqdmull:
6968 case Intrinsic::aarch64_neon_sqdmulh:
6969 case Intrinsic::aarch64_neon_sqrdmulh:
6970 // Sink splats for index lane variants
6971 if (isSplatShuffle(V: II->getOperand(i_nocapture: 0)))
6972 Ops.push_back(Elt: &II->getOperandUse(i: 0));
6973 if (isSplatShuffle(V: II->getOperand(i_nocapture: 1)))
6974 Ops.push_back(Elt: &II->getOperandUse(i: 1));
6975 return !Ops.empty();
6976 case Intrinsic::aarch64_neon_fmlal:
6977 case Intrinsic::aarch64_neon_fmlal2:
6978 case Intrinsic::aarch64_neon_fmlsl:
6979 case Intrinsic::aarch64_neon_fmlsl2:
6980 // Sink splats for index lane variants
6981 if (isSplatShuffle(V: II->getOperand(i_nocapture: 1)))
6982 Ops.push_back(Elt: &II->getOperandUse(i: 1));
6983 if (isSplatShuffle(V: II->getOperand(i_nocapture: 2)))
6984 Ops.push_back(Elt: &II->getOperandUse(i: 2));
6985 return !Ops.empty();
6986 case Intrinsic::aarch64_sve_ptest_first:
6987 case Intrinsic::aarch64_sve_ptest_last:
6988 if (auto *IIOp = dyn_cast<IntrinsicInst>(Val: II->getOperand(i_nocapture: 0)))
6989 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6990 Ops.push_back(Elt: &II->getOperandUse(i: 0));
6991 return !Ops.empty();
6992 case Intrinsic::aarch64_sme_write_horiz:
6993 case Intrinsic::aarch64_sme_write_vert:
6994 case Intrinsic::aarch64_sme_writeq_horiz:
6995 case Intrinsic::aarch64_sme_writeq_vert: {
6996 auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 1));
6997 if (!Idx || Idx->getOpcode() != Instruction::Add)
6998 return false;
6999 Ops.push_back(Elt: &II->getOperandUse(i: 1));
7000 return true;
7001 }
7002 case Intrinsic::aarch64_sme_read_horiz:
7003 case Intrinsic::aarch64_sme_read_vert:
7004 case Intrinsic::aarch64_sme_readq_horiz:
7005 case Intrinsic::aarch64_sme_readq_vert:
7006 case Intrinsic::aarch64_sme_ld1b_vert:
7007 case Intrinsic::aarch64_sme_ld1h_vert:
7008 case Intrinsic::aarch64_sme_ld1w_vert:
7009 case Intrinsic::aarch64_sme_ld1d_vert:
7010 case Intrinsic::aarch64_sme_ld1q_vert:
7011 case Intrinsic::aarch64_sme_st1b_vert:
7012 case Intrinsic::aarch64_sme_st1h_vert:
7013 case Intrinsic::aarch64_sme_st1w_vert:
7014 case Intrinsic::aarch64_sme_st1d_vert:
7015 case Intrinsic::aarch64_sme_st1q_vert:
7016 case Intrinsic::aarch64_sme_ld1b_horiz:
7017 case Intrinsic::aarch64_sme_ld1h_horiz:
7018 case Intrinsic::aarch64_sme_ld1w_horiz:
7019 case Intrinsic::aarch64_sme_ld1d_horiz:
7020 case Intrinsic::aarch64_sme_ld1q_horiz:
7021 case Intrinsic::aarch64_sme_st1b_horiz:
7022 case Intrinsic::aarch64_sme_st1h_horiz:
7023 case Intrinsic::aarch64_sme_st1w_horiz:
7024 case Intrinsic::aarch64_sme_st1d_horiz:
7025 case Intrinsic::aarch64_sme_st1q_horiz: {
7026 auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 3));
7027 if (!Idx || Idx->getOpcode() != Instruction::Add)
7028 return false;
7029 Ops.push_back(Elt: &II->getOperandUse(i: 3));
7030 return true;
7031 }
7032 case Intrinsic::aarch64_neon_pmull:
7033 if (!areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1)))
7034 return false;
7035 Ops.push_back(Elt: &II->getOperandUse(i: 0));
7036 Ops.push_back(Elt: &II->getOperandUse(i: 1));
7037 return true;
7038 case Intrinsic::aarch64_neon_pmull64:
7039 if (!areOperandsOfVmullHighP64(Op1: II->getArgOperand(i: 0),
7040 Op2: II->getArgOperand(i: 1)))
7041 return false;
7042 Ops.push_back(Elt: &II->getArgOperandUse(i: 0));
7043 Ops.push_back(Elt: &II->getArgOperandUse(i: 1));
7044 return true;
7045 case Intrinsic::masked_gather:
7046 if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 0), Ops))
7047 return false;
7048 Ops.push_back(Elt: &II->getArgOperandUse(i: 0));
7049 return true;
7050 case Intrinsic::masked_scatter:
7051 if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 1), Ops))
7052 return false;
7053 Ops.push_back(Elt: &II->getArgOperandUse(i: 1));
7054 return true;
7055 default:
7056 return false;
7057 }
7058 }
7059
7060 auto ShouldSinkCondition = [](Value *Cond,
7061 SmallVectorImpl<Use *> &Ops) -> bool {
7062 if (!isa<IntrinsicInst>(Val: Cond))
7063 return false;
7064 auto *II = dyn_cast<IntrinsicInst>(Val: Cond);
7065 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
7066 !isa<ScalableVectorType>(Val: II->getOperand(i_nocapture: 0)->getType()))
7067 return false;
7068 if (isa<CmpInst>(Val: II->getOperand(i_nocapture: 0)))
7069 Ops.push_back(Elt: &II->getOperandUse(i: 0));
7070 return true;
7071 };
7072
7073 switch (I->getOpcode()) {
7074 case Instruction::GetElementPtr:
7075 case Instruction::Add:
7076 case Instruction::Sub:
7077 // Sink vscales closer to uses for better isel
7078 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
7079 if (shouldSinkVScale(Op: I->getOperand(i: Op), Ops)) {
7080 Ops.push_back(Elt: &I->getOperandUse(i: Op));
7081 return true;
7082 }
7083 }
7084 break;
7085 case Instruction::Select: {
7086 if (!ShouldSinkCondition(I->getOperand(i: 0), Ops))
7087 return false;
7088
7089 Ops.push_back(Elt: &I->getOperandUse(i: 0));
7090 return true;
7091 }
7092 case Instruction::UncondBr:
7093 return false;
7094 case Instruction::CondBr: {
7095 if (!ShouldSinkCondition(cast<CondBrInst>(Val: I)->getCondition(), Ops))
7096 return false;
7097
7098 Ops.push_back(Elt: &I->getOperandUse(i: 0));
7099 return true;
7100 }
7101 case Instruction::FMul:
7102 // fmul with contract flag can be combined with fadd into fma.
7103 // Sinking fneg into this block enables fmls pattern.
7104 if (cast<FPMathOperator>(Val: I)->hasAllowContract()) {
7105 if (isFNeg(Op: I->getOperand(i: 0)))
7106 Ops.push_back(Elt: &I->getOperandUse(i: 0));
7107 if (isFNeg(Op: I->getOperand(i: 1)))
7108 Ops.push_back(Elt: &I->getOperandUse(i: 1));
7109 }
7110 break;
7111
7112 // Type | BIC | ORN | EON
7113 // ----------------+-----------+-----------+-----------
7114 // scalar | Base | Base | Base
7115 // scalar w/shift | - | - | -
7116 // fixed vector | NEON/Base | NEON/Base | BSL2N/Base
7117 // scalable vector | SVE | - | BSL2N
7118 case Instruction::Xor:
7119 // EON only for scalars (possibly expanded fixed vectors)
7120 // and vectors using the SVE2/SME BSL2N instruction.
7121 if (I->getType()->isVectorTy() && ST->isNeonAvailable()) {
7122 bool HasBSL2N =
7123 ST->isSVEorStreamingSVEAvailable() && (ST->hasSVE2() || ST->hasSME());
7124 if (!HasBSL2N)
7125 break;
7126 }
7127 [[fallthrough]];
7128 case Instruction::And:
7129 case Instruction::Or:
7130 // Even though we could use the SVE2/SME BSL2N instruction,
7131 // it might pessimize with an extra MOV depending on register allocation.
7132 if (I->getOpcode() == Instruction::Or &&
7133 isa<ScalableVectorType>(Val: I->getType()))
7134 break;
7135 // Shift can be fold into scalar AND/ORR/EOR,
7136 // but not the non-negated operand of BIC/ORN/EON.
7137 if (!(I->getType()->isVectorTy() && ST->hasNEON()) &&
7138 match(V: I, P: m_c_BinOp(L: m_Shift(L: m_Value(), R: m_ConstantInt()), R: m_Value())))
7139 break;
7140 for (auto &Op : I->operands()) {
7141 // (and/or/xor X, (not Y)) -> (bic/orn/eon X, Y)
7142 if (match(V: Op.get(), P: m_Not(V: m_Value()))) {
7143 Ops.push_back(Elt: &Op);
7144 return true;
7145 }
7146 // (and/or/xor X, (splat (not Y))) -> (bic/orn/eon X, (splat Y))
7147 if (match(V: Op.get(),
7148 P: m_Shuffle(v1: m_InsertElt(Val: m_Value(), Elt: m_Not(V: m_Value()), Idx: m_ZeroInt()),
7149 v2: m_Value(), mask: m_ZeroMask()))) {
7150 Use &InsertElt = cast<Instruction>(Val&: Op)->getOperandUse(i: 0);
7151 Use &Not = cast<Instruction>(Val&: InsertElt)->getOperandUse(i: 1);
7152 Ops.push_back(Elt: &Not);
7153 Ops.push_back(Elt: &InsertElt);
7154 Ops.push_back(Elt: &Op);
7155 return true;
7156 }
7157 }
7158 break;
7159 default:
7160 break;
7161 }
7162
7163 if (!I->getType()->isVectorTy())
7164 return !Ops.empty();
7165
7166 switch (I->getOpcode()) {
7167 case Instruction::Sub:
7168 case Instruction::Add: {
7169 if (!areExtractExts(Ext1: I->getOperand(i: 0), Ext2: I->getOperand(i: 1)))
7170 return false;
7171
7172 // If the exts' operands extract either the lower or upper elements, we
7173 // can sink them too.
7174 auto Ext1 = cast<Instruction>(Val: I->getOperand(i: 0));
7175 auto Ext2 = cast<Instruction>(Val: I->getOperand(i: 1));
7176 if (areExtractShuffleVectors(Op1: Ext1->getOperand(i: 0), Op2: Ext2->getOperand(i: 0))) {
7177 Ops.push_back(Elt: &Ext1->getOperandUse(i: 0));
7178 Ops.push_back(Elt: &Ext2->getOperandUse(i: 0));
7179 }
7180
7181 Ops.push_back(Elt: &I->getOperandUse(i: 0));
7182 Ops.push_back(Elt: &I->getOperandUse(i: 1));
7183
7184 return true;
7185 }
7186 case Instruction::Or: {
7187 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
7188 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
7189 if (ST->hasNEON()) {
7190 Instruction *OtherAnd, *IA, *IB;
7191 Value *MaskValue;
7192 // MainAnd refers to And instruction that has 'Not' as one of its operands
7193 if (match(V: I, P: m_c_Or(L: m_OneUse(SubPattern: m_Instruction(I&: OtherAnd)),
7194 R: m_OneUse(SubPattern: m_c_And(L: m_OneUse(SubPattern: m_Not(V: m_Value(V&: MaskValue))),
7195 R: m_Instruction(I&: IA)))))) {
7196 if (match(V: OtherAnd,
7197 P: m_c_And(L: m_Specific(V: MaskValue), R: m_Instruction(I&: IB)))) {
7198 Instruction *MainAnd = I->getOperand(i: 0) == OtherAnd
7199 ? cast<Instruction>(Val: I->getOperand(i: 1))
7200 : cast<Instruction>(Val: I->getOperand(i: 0));
7201
7202 // Both Ands should be in same basic block as Or
7203 if (I->getParent() != MainAnd->getParent() ||
7204 I->getParent() != OtherAnd->getParent())
7205 return false;
7206
7207 // Non-mask operands of both Ands should also be in same basic block
7208 if (I->getParent() != IA->getParent() ||
7209 I->getParent() != IB->getParent())
7210 return false;
7211
7212 Ops.push_back(
7213 Elt: &MainAnd->getOperandUse(i: MainAnd->getOperand(i: 0) == IA ? 1 : 0));
7214 Ops.push_back(Elt: &I->getOperandUse(i: 0));
7215 Ops.push_back(Elt: &I->getOperandUse(i: 1));
7216
7217 return true;
7218 }
7219 }
7220 }
7221
7222 return false;
7223 }
7224 case Instruction::Mul: {
7225 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
7226 auto *Ty = cast<VectorType>(Val: V->getType());
7227 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
7228 if (Ty->isScalableTy())
7229 return false;
7230
7231 // Indexed variants of Mul exist for i16 and i32 element types only.
7232 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
7233 };
7234
7235 int NumZExts = 0, NumSExts = 0;
7236 for (auto &Op : I->operands()) {
7237 // Make sure we are not already sinking this operand
7238 if (any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op; }))
7239 continue;
7240
7241 if (match(V: &Op, P: m_ZExtOrSExt(Op: m_Value()))) {
7242 auto *Ext = cast<Instruction>(Val&: Op);
7243 auto *ExtOp = Ext->getOperand(i: 0);
7244 if (isSplatShuffle(V: ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
7245 Ops.push_back(Elt: &Ext->getOperandUse(i: 0));
7246 Ops.push_back(Elt: &Op);
7247
7248 if (isa<SExtInst>(Val: Ext)) {
7249 NumSExts++;
7250 } else {
7251 NumZExts++;
7252 // A zext(a) is also a sext(zext(a)), if we take more than 2 steps.
7253 if (Ext->getOperand(i: 0)->getType()->getScalarSizeInBits() * 2 <
7254 I->getType()->getScalarSizeInBits())
7255 NumSExts++;
7256 }
7257
7258 continue;
7259 }
7260
7261 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Val&: Op);
7262 if (!Shuffle)
7263 continue;
7264
7265 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
7266 // operand and the s/zext can help create indexed s/umull. This is
7267 // especially useful to prevent i64 mul being scalarized.
7268 if (isSplatShuffle(V: Shuffle) &&
7269 match(V: Shuffle->getOperand(i_nocapture: 0), P: m_ZExtOrSExt(Op: m_Value()))) {
7270 Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0));
7271 Ops.push_back(Elt: &Op);
7272 if (match(V: Shuffle->getOperand(i_nocapture: 0), P: m_SExt(Op: m_Value())))
7273 NumSExts++;
7274 else
7275 NumZExts++;
7276 continue;
7277 }
7278
7279 Value *ShuffleOperand = Shuffle->getOperand(i_nocapture: 0);
7280 InsertElementInst *Insert = dyn_cast<InsertElementInst>(Val: ShuffleOperand);
7281 if (!Insert)
7282 continue;
7283
7284 Instruction *OperandInstr = dyn_cast<Instruction>(Val: Insert->getOperand(i_nocapture: 1));
7285 if (!OperandInstr)
7286 continue;
7287
7288 ConstantInt *ElementConstant =
7289 dyn_cast<ConstantInt>(Val: Insert->getOperand(i_nocapture: 2));
7290 // Check that the insertelement is inserting into element 0
7291 if (!ElementConstant || !ElementConstant->isZero())
7292 continue;
7293
7294 unsigned Opcode = OperandInstr->getOpcode();
7295 if (Opcode == Instruction::SExt)
7296 NumSExts++;
7297 else if (Opcode == Instruction::ZExt)
7298 NumZExts++;
7299 else {
7300 // If we find that the top bits are known 0, then we can sink and allow
7301 // the backend to generate a umull.
7302 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
7303 APInt UpperMask = APInt::getHighBitsSet(numBits: Bitwidth, hiBitsSet: Bitwidth / 2);
7304 if (!MaskedValueIsZero(V: OperandInstr, Mask: UpperMask, SQ: DL))
7305 continue;
7306 NumZExts++;
7307 }
7308
7309 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
7310 // the And, just to hoist it again back to the load.
7311 if (!match(V: OperandInstr, P: m_And(L: m_Load(Op: m_Value()), R: m_Value())))
7312 Ops.push_back(Elt: &Insert->getOperandUse(i: 1));
7313 Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0));
7314 Ops.push_back(Elt: &Op);
7315 }
7316
7317 // It is profitable to sink if we found two of the same type of extends.
7318 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
7319 return true;
7320
7321 // Otherwise, see if we should sink splats for indexed variants.
7322 if (!ShouldSinkSplatForIndexedVariant(I))
7323 return false;
7324
7325 Ops.clear();
7326 if (isSplatShuffle(V: I->getOperand(i: 0)))
7327 Ops.push_back(Elt: &I->getOperandUse(i: 0));
7328 if (isSplatShuffle(V: I->getOperand(i: 1)))
7329 Ops.push_back(Elt: &I->getOperandUse(i: 1));
7330
7331 return !Ops.empty();
7332 }
7333 case Instruction::FMul: {
7334 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
7335 if (I->getType()->isScalableTy())
7336 return !Ops.empty();
7337
7338 if (cast<VectorType>(Val: I->getType())->getElementType()->isHalfTy() &&
7339 !ST->hasFullFP16())
7340 return !Ops.empty();
7341
7342 // Sink splats for index lane variants
7343 if (isSplatShuffle(V: I->getOperand(i: 0)))
7344 Ops.push_back(Elt: &I->getOperandUse(i: 0));
7345 if (isSplatShuffle(V: I->getOperand(i: 1)))
7346 Ops.push_back(Elt: &I->getOperandUse(i: 1));
7347 return !Ops.empty();
7348 }
7349 default:
7350 return false;
7351 }
7352 return false;
7353}
7354