1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "AArch64TargetTransformInfo.h"
10#include "AArch64ExpandImm.h"
11#include "AArch64PerfectShuffle.h"
12#include "AArch64SMEAttributes.h"
13#include "MCTargetDesc/AArch64AddressingModes.h"
14#include "llvm/ADT/DenseMap.h"
15#include "llvm/Analysis/LoopInfo.h"
16#include "llvm/Analysis/TargetTransformInfo.h"
17#include "llvm/CodeGen/BasicTTIImpl.h"
18#include "llvm/CodeGen/CostTable.h"
19#include "llvm/CodeGen/TargetLowering.h"
20#include "llvm/IR/DerivedTypes.h"
21#include "llvm/IR/IntrinsicInst.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
24#include "llvm/IR/PatternMatch.h"
25#include "llvm/Support/Debug.h"
26#include "llvm/TargetParser/AArch64TargetParser.h"
27#include "llvm/Transforms/InstCombine/InstCombiner.h"
28#include "llvm/Transforms/Utils/UnrollLoop.h"
29#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
// Enables the unrolling restriction that works around the Falkor hardware
// prefetcher.
static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(Val: true), cl::Hidden);

// If fixed-width and scalable vectorization cost the same, prefer the
// fixed-width plan.
static cl::opt<bool> SVEPreferFixedOverScalableIfEqualCost(
    "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);

// Additional cost charged for an SVE gather load.
static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(Val: 10),
                                           cl::Hidden);

// Additional cost charged for an SVE scatter store.
static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                            cl::init(Val: 10), cl::Hidden);

// Instruction-count threshold used when deciding whether to tail-fold a loop
// for SVE.
static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
                                                  cl::init(Val: 15), cl::Hidden);

// Additional cost charged for NEON strided accesses with a non-constant
// stride.
static cl::opt<unsigned>
    NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(Val: 10),
                               cl::Hidden);

static cl::opt<unsigned> CallPenaltyChangeSM(
    "call-penalty-sm-change", cl::init(Val: 5), cl::Hidden,
    cl::desc(
        "Penalty of calling a function that requires a change to PSTATE.SM"));

static cl::opt<unsigned> InlineCallPenaltyChangeSM(
    "inline-call-penalty-sm-change", cl::init(Val: 10), cl::Hidden,
    cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));

// Treat OR-like operations as select candidates (see the or-like select
// handling in this file).
static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
                                           cl::init(Val: true), cl::Hidden);

// Enables the AArch64-specific LSR cost adjustments.
static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
                                      cl::init(Val: true), cl::Hidden);

// A complete guess as to a reasonable cost.
static cl::opt<unsigned>
    BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(Val: 8), cl::Hidden,
                    cl::desc("The cost of a histcnt instruction"));

static cl::opt<unsigned> DMBLookaheadThreshold(
    "dmb-lookahead-threshold", cl::init(Val: 10), cl::Hidden,
    cl::desc("The number of instructions to search for a redundant dmb"));

static cl::opt<int> Aarch64ForceUnrollThreshold(
    "aarch64-force-unroll-threshold", cl::init(Val: 0), cl::Hidden,
    cl::desc("Threshold for forced unrolling of small loops in AArch64"));
83
84namespace {
85class TailFoldingOption {
86 // These bitfields will only ever be set to something non-zero in operator=,
87 // when setting the -sve-tail-folding option. This option should always be of
88 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
89 // InitialBits is one of (disabled|all|simple). EnableBits represents
90 // additional flags we're enabling, and DisableBits for those flags we're
91 // disabling. The default flag is tracked in the variable NeedsDefault, since
92 // at the time of setting the option we may not know what the default value
93 // for the CPU is.
94 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
95 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
96 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
97
98 // This value needs to be initialised to true in case the user does not
99 // explicitly set the -sve-tail-folding option.
100 bool NeedsDefault = true;
101
102 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
103
104 void setNeedsDefault(bool V) { NeedsDefault = V; }
105
106 void setEnableBit(TailFoldingOpts Bit) {
107 EnableBits |= Bit;
108 DisableBits &= ~Bit;
109 }
110
111 void setDisableBit(TailFoldingOpts Bit) {
112 EnableBits &= ~Bit;
113 DisableBits |= Bit;
114 }
115
116 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
117 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
118
119 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
120 "Initial bits should only include one of "
121 "(disabled|all|simple|default)");
122 Bits = NeedsDefault ? DefaultBits : InitialBits;
123 Bits |= EnableBits;
124 Bits &= ~DisableBits;
125
126 return Bits;
127 }
128
129 void reportError(std::string Opt) {
130 errs() << "invalid argument '" << Opt
131 << "' to -sve-tail-folding=; the option should be of the form\n"
132 " (disabled|all|default|simple)[+(reductions|recurrences"
133 "|reverse|noreductions|norecurrences|noreverse)]\n";
134 report_fatal_error(reason: "Unrecognised tail-folding option");
135 }
136
137public:
138
139 void operator=(const std::string &Val) {
140 // If the user explicitly sets -sve-tail-folding= then treat as an error.
141 if (Val.empty()) {
142 reportError(Opt: "");
143 return;
144 }
145
146 // Since the user is explicitly setting the option we don't automatically
147 // need the default unless they require it.
148 setNeedsDefault(false);
149
150 SmallVector<StringRef, 4> TailFoldTypes;
151 StringRef(Val).split(A&: TailFoldTypes, Separator: '+', MaxSplit: -1, KeepEmpty: false);
152
153 unsigned StartIdx = 1;
154 if (TailFoldTypes[0] == "disabled")
155 setInitialBits(TailFoldingOpts::Disabled);
156 else if (TailFoldTypes[0] == "all")
157 setInitialBits(TailFoldingOpts::All);
158 else if (TailFoldTypes[0] == "default")
159 setNeedsDefault(true);
160 else if (TailFoldTypes[0] == "simple")
161 setInitialBits(TailFoldingOpts::Simple);
162 else {
163 StartIdx = 0;
164 setInitialBits(TailFoldingOpts::Disabled);
165 }
166
167 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
168 if (TailFoldTypes[I] == "reductions")
169 setEnableBit(TailFoldingOpts::Reductions);
170 else if (TailFoldTypes[I] == "recurrences")
171 setEnableBit(TailFoldingOpts::Recurrences);
172 else if (TailFoldTypes[I] == "reverse")
173 setEnableBit(TailFoldingOpts::Reverse);
174 else if (TailFoldTypes[I] == "noreductions")
175 setDisableBit(TailFoldingOpts::Reductions);
176 else if (TailFoldTypes[I] == "norecurrences")
177 setDisableBit(TailFoldingOpts::Recurrences);
178 else if (TailFoldTypes[I] == "noreverse")
179 setDisableBit(TailFoldingOpts::Reverse);
180 else
181 reportError(Opt: Val);
182 }
183 }
184
185 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
186 return (getBits(DefaultBits) & Required) == Required;
187 }
188};
189} // namespace
190
// Global storage for the parsed -sve-tail-folding state; written through the
// cl::location binding below and queried elsewhere via satisfies().
TailFoldingOption TailFoldingOptionLoc;

static cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
    "sve-tail-folding",
    cl::desc(
        "Control the use of vectorisation using tail-folding for SVE where the"
        " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
        "\ndisabled    (Initial) No loop types will vectorize using "
        "tail-folding"
        "\ndefault     (Initial) Uses the default tail-folding settings for "
        "the target CPU"
        "\nall         (Initial) All legal loop types will vectorize using "
        "tail-folding"
        "\nsimple      (Initial) Use tail-folding for simple loops (not "
        "reductions or recurrences)"
        "\nreductions  Use tail-folding for loops containing reductions"
        "\nnoreductions Inverse of above"
        "\nrecurrences Use tail-folding for loops containing fixed order "
        "recurrences"
        "\nnorecurrences Inverse of above"
        "\nreverse     Use tail-folding for loops requiring reversed "
        "predicates"
        "\nnoreverse   Inverse of above"),
    cl::location(L&: TailFoldingOptionLoc));

// Experimental option that will only be fully functional when the
// code-generator is changed to use SVE instead of NEON for all fixed-width
// operations.
static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
    "enable-fixedwidth-autovec-in-streaming-mode", cl::init(Val: false), cl::Hidden);

// Experimental option that will only be fully functional when the cost-model
// and code-generator have been changed to avoid using scalable vector
// instructions that are not legal in streaming SVE mode.
static cl::opt<bool> EnableScalableAutovecInStreamingMode(
    "enable-scalable-autovec-in-streaming-mode", cl::init(Val: false), cl::Hidden);
227
228static bool isSMEABIRoutineCall(const CallInst &CI,
229 const AArch64TargetLowering &TLI) {
230 const auto *F = CI.getCalledFunction();
231 return F &&
232 SMEAttrs(F->getName(), TLI.getRuntimeLibcallsInfo()).isSMEABIRoutine();
233}
234
235/// Returns true if the function has explicit operations that can only be
236/// lowered using incompatible instructions for the selected mode. This also
237/// returns true if the function F may use or modify ZA state.
/// Returns true if the function has explicit operations that can only be
/// lowered using incompatible instructions for the selected mode. This also
/// returns true if the function F may use or modify ZA state.
static bool hasPossibleIncompatibleOps(const Function *F,
                                       const AArch64TargetLowering &TLI) {
  for (const BasicBlock &BB : *F) {
    for (const Instruction &I : BB) {
      // Be conservative for now and assume that any call to inline asm or to
      // intrinsics could result in non-streaming ops (e.g. calls to
      // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
      // all native LLVM instructions can be lowered to compatible instructions.
      if (isa<CallInst>(Val: I) && !I.isDebugOrPseudoInst() &&
          (cast<CallInst>(Val: I).isInlineAsm() || isa<IntrinsicInst>(Val: I) ||
           isSMEABIRoutineCall(CI: cast<CallInst>(Val: I), TLI)))
        return true;
    }
  }
  return false;
}
254
255static void extractAttrFeatures(const Function &F, const AArch64TTIImpl *TTI,
256 SmallVectorImpl<StringRef> &Features) {
257 StringRef AttributeStr =
258 TTI->isMultiversionedFunction(F) ? "fmv-features" : "target-features";
259 StringRef FeatureStr = F.getFnAttribute(Kind: AttributeStr).getValueAsString();
260 FeatureStr.split(A&: Features, Separator: ",");
261}
262
// Returns the CPU-supports bitmask derived from the function's feature
// attribute string (see extractAttrFeatures).
APInt AArch64TTIImpl::getFeatureMask(const Function &F) const {
  SmallVector<StringRef, 8> Features;
  extractAttrFeatures(F, TTI: this, Features);
  return AArch64::getCpuSupportsMask(Features);
}
268
// Returns the function-multiversioning priority derived from the function's
// feature attribute string (see extractAttrFeatures).
APInt AArch64TTIImpl::getPriorityMask(const Function &F) const {
  SmallVector<StringRef, 8> Features;
  extractAttrFeatures(F, TTI: this, Features);
  return AArch64::getFMVPriority(Features);
}
274
// A function takes part in function multiversioning iff it carries the
// "fmv-features" attribute.
bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const {
  return F.hasFnAttribute(Kind: "fmv-features");
}
278
// Features whose polarity is inverted for the inlining compatibility check in
// areInlineCompatible: they represent restrictions rather than capabilities
// (e.g. a "+execute-only" callee may be inlined into a caller without
// "+execute-only", but not the other way around).
const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
    AArch64::FeatureExecuteOnly,
};
282
/// Decide whether \p Callee may be inlined into \p Caller, considering both
/// SME streaming/ZA attributes and subtarget feature bits.
bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  SMECallAttrs CallAttrs(*Caller, *Callee);

  // Never inline a function explicitly marked as being streaming,
  // into a non-streaming function. Assume it was marked as streaming
  // for a reason.
  if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
      CallAttrs.callee().hasStreamingInterfaceOrBody())
    return false;

  // When inlining, we should consider the body of the function, not the
  // interface.
  if (CallAttrs.callee().hasStreamingBody()) {
    CallAttrs.callee().set(M: SMEAttrs::SM_Compatible, Enable: false);
    CallAttrs.callee().set(M: SMEAttrs::SM_Enabled, Enable: true);
  }

  // Functions that create fresh ZA/ZT0 state cannot be inlined.
  if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
    return false;

  // If inlining would remove a required mode change or ZA save/restore, only
  // allow it when the callee body contains no operations that depend on it.
  if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
      CallAttrs.requiresPreservingZT0() ||
      CallAttrs.requiresPreservingAllZAState()) {
    if (hasPossibleIncompatibleOps(F: Callee, TLI: *getTLI()))
      return false;
  }

  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();
  // Adjust the feature bitsets by inverting some of the bits. This is needed
  // for target features that represent restrictions rather than capabilities,
  // for example a "+execute-only" callee can be inlined into a caller without
  // "+execute-only", but not vice versa.
  FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
  FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;

  return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
}
325
326bool AArch64TTIImpl::areTypesABICompatible(const Function *Caller,
327 const Function *Callee,
328 ArrayRef<Type *> Types) const {
329 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
330 return false;
331
332 // We need to ensure that argument promotion does not attempt to promote
333 // pointers to fixed-length vector types larger than 128 bits like
334 // <8 x float> (and pointers to aggregate types which have such fixed-length
335 // vector type members) into the values of the pointees. Such vector types
336 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
337 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
338 // types can be safely treated as 128-bit NEON types and they cannot be
339 // distinguished in IR.
340 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Range&: Types, P: [](Type *Ty) {
341 auto FVTy = dyn_cast<FixedVectorType>(Val: Ty);
342 return FVTy &&
343 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
344 }))
345 return false;
346
347 return true;
348}
349
unsigned
AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
                                     unsigned DefaultCallPenalty) const {
  // This function calculates a penalty for executing Call in F.
  //
  // There are two ways this function can be called:
  // (1) F:
  //     call from F -> G (the call here is Call)
  //
  // For (1), Call.getCaller() == F, so it will always return a high cost if
  // a streaming-mode change is required (thus promoting the need to inline the
  // function)
  //
  // (2) F:
  //     call from F -> G (the call here is not Call)
  //  G:
  //     call from G -> H (the call here is Call)
  //
  // For (2), if after inlining the body of G into F the call to H requires a
  // streaming-mode change, and the call to G from F would also require a
  // streaming-mode change, then there is benefit to do the streaming-mode
  // change only once and avoid inlining of G into F.

  // SME attributes of the prospective caller F and of the call being costed.
  SMEAttrs FAttrs(*F);
  SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());

  if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
    if (F == Call.getCaller()) // (1)
      return CallPenaltyChangeSM * DefaultCallPenalty;
    if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
      return InlineCallPenaltyChangeSM * DefaultCallPenalty;
  }

  return DefaultCallPenalty;
}
385
386bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
387 TargetTransformInfo::RegisterKind K) const {
388 assert(K != TargetTransformInfo::RGK_Scalar);
389
390 if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable())
391 return true;
392
393 return K == TargetTransformInfo::RGK_ScalableVector &&
394 ST->isSVEorStreamingSVEAvailable() &&
395 !ST->disableMaximizeScalableBandwidth();
396}
397
398/// Calculate the cost of materializing a 64-bit value. This helper
399/// method might only calculate a fraction of a larger immediate. Therefore it
400/// is valid to return a cost of ZERO.
401InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) const {
402 // Check if the immediate can be encoded within an instruction.
403 if (Val == 0 || AArch64_AM::isLogicalImmediate(imm: Val, regSize: 64))
404 return 0;
405
406 if (Val < 0)
407 Val = ~Val;
408
409 // Calculate how many moves we will need to materialize this constant.
410 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
411 AArch64_IMM::expandMOVImm(Imm: Val, BitSize: 64, Insn);
412 return Insn.size();
413}
414
415/// Calculate the cost of materializing the given constant.
416InstructionCost
417AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
418 TTI::TargetCostKind CostKind) const {
419 assert(Ty->isIntegerTy());
420
421 unsigned BitSize = Ty->getPrimitiveSizeInBits();
422 if (BitSize == 0)
423 return ~0U;
424
425 // Sign-extend all constants to a multiple of 64-bit.
426 APInt ImmVal = Imm;
427 if (BitSize & 0x3f)
428 ImmVal = Imm.sext(width: (BitSize + 63) & ~0x3fU);
429
430 // Split the constant into 64-bit chunks and calculate the cost for each
431 // chunk.
432 InstructionCost Cost = 0;
433 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
434 APInt Tmp = ImmVal.ashr(ShiftAmt: ShiftVal).sextOrTrunc(width: 64);
435 int64_t Val = Tmp.getSExtValue();
436 Cost += getIntImmCost(Val);
437 }
438 // We need at least one instruction to materialze the constant.
439 return std::max<InstructionCost>(a: 1, b: Cost);
440}
441
/// Cost of immediate \p Imm appearing as operand \p Idx of an instruction
/// with opcode \p Opcode. Returns TCC_Free for operand positions where the
/// immediate can be folded into the instruction, so that constant hoisting
/// leaves it in place.
InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Operand position (per opcode) at which an immediate may be folded.
  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    // Cheap immediates (no more expensive than one basic op per 64-bit
    // chunk) are treated as free since they fold into the instruction.
    int NumConstants = (BitSize + 63) / 64;
    InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
509
/// Cost of immediate \p Imm as operand \p Idx of intrinsic \p IID. As with
/// getIntImmCostInst, TCC_Free indicates the immediate need not be hoisted.
InstructionCost
AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Most (all?) AArch64 intrinsics do not support folding immediates into the
  // selected instruction, so we compute the materialization cost for the
  // immediate directly.
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    // The second operand may fold if it is cheap to materialize.
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  // Stackmap/patchpoint/statepoint payload operands that fit in a signed
  // 64-bit value are encoded directly and therefore free.
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_gc_statepoint:
    if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
561
562TargetTransformInfo::PopcntSupportKind
563AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
564 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
565 if (TyWidth == 32 || TyWidth == 64)
566 return TTI::PSK_FastHardware;
567 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
568 return TTI::PSK_Software;
569}
570
// Returns true for scalable vector types whose known-minimum size is smaller
// than a full SVE register (i.e. "unpacked" SVE vector types).
static bool isUnpackedVectorVT(EVT VecVT) {
  return VecVT.isScalableVector() &&
         VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
}
575
/// Cost of an experimental.vector.histogram.add intrinsic when it can be
/// lowered to the SVE2 HISTCNT instruction; returns an invalid cost when it
/// cannot (callers may still fall back to scalarization).
static InstructionCost getHistogramCost(const AArch64Subtarget *ST,
                                        const IntrinsicCostAttributes &ICA) {
  // We need to know at least the number of elements in the vector of buckets
  // and the size of each element to update.
  if (ICA.getArgTypes().size() < 2)
    return InstructionCost::getInvalid();

  // Only interested in costing for the hardware instruction from SVE2.
  if (!ST->hasSVE2())
    return InstructionCost::getInvalid();

  Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
  Type *EltTy = ICA.getArgTypes()[1];        // Type of bucket elements
  unsigned TotalHistCnts = 1;

  unsigned EltSize = EltTy->getScalarSizeInBits();
  // Only allow (up to 64b) integers or pointers
  if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
    return InstructionCost::getInvalid();

  // FIXME: We should be able to generate histcnt for fixed-length vectors
  // using ptrue with a specific VL.
  if (VectorType *VTy = dyn_cast<VectorType>(Val: BucketPtrsTy)) {
    unsigned EC = VTy->getElementCount().getKnownMinValue();
    if (!isPowerOf2_64(Value: EC) || !VTy->isScalableTy())
      return InstructionCost::getInvalid();

    // HistCnt only supports 32b and 64b element types
    unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;

    // Small element counts map to a single HISTCNT.
    if (EC == 2 || (LegalEltSize == 32 && EC == 4))
      return InstructionCost(BaseHistCntCost);

    // Wider-than-legal vectors are split; each native-width part needs its
    // own HISTCNT.
    unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
    TotalHistCnts = EC / NaturalVectorWidth;

    return InstructionCost(BaseHistCntCost * TotalHistCnts);
  }

  return InstructionCost::getInvalid();
}
617
618InstructionCost
619AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
620 TTI::TargetCostKind CostKind) const {
621 // The code-generator is currently not able to handle scalable vectors
622 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
623 // it. This change will be removed when code-generation for these types is
624 // sufficiently reliable.
625 auto *RetTy = ICA.getReturnType();
626 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: RetTy))
627 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
628 return InstructionCost::getInvalid();
629
630 switch (ICA.getID()) {
631 case Intrinsic::experimental_vector_histogram_add: {
632 InstructionCost HistCost = getHistogramCost(ST, ICA);
633 // If the cost isn't valid, we may still be able to scalarize
634 if (HistCost.isValid())
635 return HistCost;
636 break;
637 }
638 case Intrinsic::umin:
639 case Intrinsic::umax:
640 case Intrinsic::smin:
641 case Intrinsic::smax: {
642 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
643 MVT::v8i16, MVT::v2i32, MVT::v4i32,
644 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
645 MVT::nxv2i64};
646 auto LT = getTypeLegalizationCost(Ty: RetTy);
647 // v2i64 types get converted to cmp+bif hence the cost of 2
648 if (LT.second == MVT::v2i64)
649 return LT.first * 2;
650 if (any_of(Range: ValidMinMaxTys, P: equal_to(Arg&: LT.second)))
651 return LT.first;
652 break;
653 }
654 case Intrinsic::scmp:
655 case Intrinsic::ucmp: {
656 static const CostTblEntry BitreverseTbl[] = {
657 {.ISD: Intrinsic::scmp, .Type: MVT::i32, .Cost: 3}, // cmp+cset+csinv
658 {.ISD: Intrinsic::scmp, .Type: MVT::i64, .Cost: 3}, // cmp+cset+csinv
659 {.ISD: Intrinsic::scmp, .Type: MVT::v8i8, .Cost: 3}, // cmgt+cmgt+sub
660 {.ISD: Intrinsic::scmp, .Type: MVT::v16i8, .Cost: 3}, // cmgt+cmgt+sub
661 {.ISD: Intrinsic::scmp, .Type: MVT::v4i16, .Cost: 3}, // cmgt+cmgt+sub
662 {.ISD: Intrinsic::scmp, .Type: MVT::v8i16, .Cost: 3}, // cmgt+cmgt+sub
663 {.ISD: Intrinsic::scmp, .Type: MVT::v2i32, .Cost: 3}, // cmgt+cmgt+sub
664 {.ISD: Intrinsic::scmp, .Type: MVT::v4i32, .Cost: 3}, // cmgt+cmgt+sub
665 {.ISD: Intrinsic::scmp, .Type: MVT::v1i64, .Cost: 3}, // cmgt+cmgt+sub
666 {.ISD: Intrinsic::scmp, .Type: MVT::v2i64, .Cost: 3}, // cmgt+cmgt+sub
667 };
668 const auto LT = getTypeLegalizationCost(Ty: RetTy);
669 const auto *Entry =
670 CostTableLookup(Table: BitreverseTbl, ISD: Intrinsic::scmp, Ty: LT.second);
671 if (Entry)
672 return Entry->Cost * LT.first;
673 break;
674 }
675 case Intrinsic::sadd_sat:
676 case Intrinsic::ssub_sat:
677 case Intrinsic::uadd_sat:
678 case Intrinsic::usub_sat: {
679 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
680 MVT::v8i16, MVT::v2i32, MVT::v4i32,
681 MVT::v2i64};
682 auto LT = getTypeLegalizationCost(Ty: RetTy);
683 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
684 // need to extend the type, as it uses shr(qadd(shl, shl)).
685 unsigned Instrs =
686 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
687 if (any_of(Range: ValidSatTys, P: equal_to(Arg&: LT.second)))
688 return LT.first * Instrs;
689
690 TypeSize TS = getDataLayout().getTypeSizeInBits(Ty: RetTy);
691 uint64_t VectorSize = TS.getKnownMinValue();
692
693 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(Value: VectorSize))
694 return LT.first * Instrs;
695
696 break;
697 }
698 case Intrinsic::abs: {
699 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
700 MVT::v8i16, MVT::v2i32, MVT::v4i32,
701 MVT::v2i64, MVT::nxv16i8, MVT::nxv8i16,
702 MVT::nxv4i32, MVT::nxv2i64};
703 auto LT = getTypeLegalizationCost(Ty: RetTy);
704 if (any_of(Range: ValidAbsTys, P: equal_to(Arg&: LT.second)))
705 return LT.first;
706 break;
707 }
708 case Intrinsic::bswap: {
709 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
710 MVT::v4i32, MVT::v2i64};
711 auto LT = getTypeLegalizationCost(Ty: RetTy);
712 if (any_of(Range: ValidAbsTys, P: equal_to(Arg&: LT.second)) &&
713 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
714 return LT.first;
715 break;
716 }
717 case Intrinsic::fma:
718 case Intrinsic::fmuladd: {
719 // Given a fma or fmuladd, cost it the same as a fmul instruction which are
720 // usually the same for costs. TODO: Add fp16 and bf16 expansion costs.
721 Type *EltTy = RetTy->getScalarType();
722 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
723 (EltTy->isHalfTy() && ST->hasFullFP16()))
724 return getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: RetTy, CostKind);
725 break;
726 }
727 case Intrinsic::stepvector: {
728 InstructionCost Cost = 1; // Cost of the `index' instruction
729 auto LT = getTypeLegalizationCost(Ty: RetTy);
730 // Legalisation of illegal vectors involves an `index' instruction plus
731 // (LT.first - 1) vector adds.
732 if (LT.first > 1) {
733 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: RetTy->getContext());
734 InstructionCost AddCost =
735 getArithmeticInstrCost(Opcode: Instruction::Add, Ty: LegalVTy, CostKind);
736 Cost += AddCost * (LT.first - 1);
737 }
738 return Cost;
739 }
740 case Intrinsic::vector_extract:
741 case Intrinsic::vector_insert: {
742 // If both the vector and subvector types are legal types and the index
743 // is 0, then this should be a no-op or simple operation; return a
744 // relatively low cost.
745
746 // If arguments aren't actually supplied, then we cannot determine the
747 // value of the index. We also want to skip predicate types.
748 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
749 ICA.getReturnType()->getScalarType()->isIntegerTy(Bitwidth: 1))
750 break;
751
752 LLVMContext &C = RetTy->getContext();
753 EVT VecVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]);
754 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
755 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, Ty: RetTy)
756 : getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[1]);
757 // Skip this if either the vector or subvector types are unpacked
758 // SVE types; they may get lowered to stack stores and loads.
759 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(VecVT: SubVecVT))
760 break;
761
762 TargetLoweringBase::LegalizeKind SubVecLK =
763 getTLI()->getTypeConversion(Context&: C, VT: SubVecVT);
764 TargetLoweringBase::LegalizeKind VecLK =
765 getTLI()->getTypeConversion(Context&: C, VT: VecVT);
766 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
767 const ConstantInt *CIdx = cast<ConstantInt>(Val: Idx);
768 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
769 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
770 return TTI::TCC_Free;
771 break;
772 }
773 case Intrinsic::bitreverse: {
774 static const CostTblEntry BitreverseTbl[] = {
775 {.ISD: Intrinsic::bitreverse, .Type: MVT::i32, .Cost: 1},
776 {.ISD: Intrinsic::bitreverse, .Type: MVT::i64, .Cost: 1},
777 {.ISD: Intrinsic::bitreverse, .Type: MVT::v8i8, .Cost: 1},
778 {.ISD: Intrinsic::bitreverse, .Type: MVT::v16i8, .Cost: 1},
779 {.ISD: Intrinsic::bitreverse, .Type: MVT::v4i16, .Cost: 2},
780 {.ISD: Intrinsic::bitreverse, .Type: MVT::v8i16, .Cost: 2},
781 {.ISD: Intrinsic::bitreverse, .Type: MVT::v2i32, .Cost: 2},
782 {.ISD: Intrinsic::bitreverse, .Type: MVT::v4i32, .Cost: 2},
783 {.ISD: Intrinsic::bitreverse, .Type: MVT::v1i64, .Cost: 2},
784 {.ISD: Intrinsic::bitreverse, .Type: MVT::v2i64, .Cost: 2},
785 };
786 const auto LegalisationCost = getTypeLegalizationCost(Ty: RetTy);
787 const auto *Entry =
788 CostTableLookup(Table: BitreverseTbl, ISD: ICA.getID(), Ty: LegalisationCost.second);
789 if (Entry) {
790 // Cost Model is using the legal type(i32) that i8 and i16 will be
791 // converted to +1 so that we match the actual lowering cost
792 if (TLI->getValueType(DL, Ty: RetTy, AllowUnknown: true) == MVT::i8 ||
793 TLI->getValueType(DL, Ty: RetTy, AllowUnknown: true) == MVT::i16)
794 return LegalisationCost.first * Entry->Cost + 1;
795
796 return LegalisationCost.first * Entry->Cost;
797 }
798 break;
799 }
800 case Intrinsic::ctpop: {
801 if (!ST->hasNEON()) {
802 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
803 return getTypeLegalizationCost(Ty: RetTy).first * 12;
804 }
805 static const CostTblEntry CtpopCostTbl[] = {
806 {.ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: 4},
807 {.ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: 3},
808 {.ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: 2},
809 {.ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: 1},
810 {.ISD: ISD::CTPOP, .Type: MVT::i64, .Cost: 4},
811 {.ISD: ISD::CTPOP, .Type: MVT::v2i32, .Cost: 3},
812 {.ISD: ISD::CTPOP, .Type: MVT::v4i16, .Cost: 2},
813 {.ISD: ISD::CTPOP, .Type: MVT::v8i8, .Cost: 1},
814 {.ISD: ISD::CTPOP, .Type: MVT::i32, .Cost: 5},
815 };
816 auto LT = getTypeLegalizationCost(Ty: RetTy);
817 MVT MTy = LT.second;
818 if (const auto *Entry = CostTableLookup(Table: CtpopCostTbl, ISD: ISD::CTPOP, Ty: MTy)) {
819 // Extra cost of +1 when illegal vector types are legalized by promoting
820 // the integer type.
821 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
822 RetTy->getScalarSizeInBits()
823 ? 1
824 : 0;
825 return LT.first * Entry->Cost + ExtraCost;
826 }
827 break;
828 }
829 case Intrinsic::sadd_with_overflow:
830 case Intrinsic::uadd_with_overflow:
831 case Intrinsic::ssub_with_overflow:
832 case Intrinsic::usub_with_overflow:
833 case Intrinsic::smul_with_overflow:
834 case Intrinsic::umul_with_overflow: {
835 static const CostTblEntry WithOverflowCostTbl[] = {
836 {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i8, .Cost: 3},
837 {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i8, .Cost: 3},
838 {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i16, .Cost: 3},
839 {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i16, .Cost: 3},
840 {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i32, .Cost: 1},
841 {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i32, .Cost: 1},
842 {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i64, .Cost: 1},
843 {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i64, .Cost: 1},
844 {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i8, .Cost: 3},
845 {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i8, .Cost: 3},
846 {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i16, .Cost: 3},
847 {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i16, .Cost: 3},
848 {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i32, .Cost: 1},
849 {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i32, .Cost: 1},
850 {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i64, .Cost: 1},
851 {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i64, .Cost: 1},
852 {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i8, .Cost: 5},
853 {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i8, .Cost: 4},
854 {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i16, .Cost: 5},
855 {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i16, .Cost: 4},
856 {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i32, .Cost: 2}, // eg umull;tst
857 {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i32, .Cost: 2}, // eg umull;cmp sxtw
858 {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i64, .Cost: 3}, // eg mul;smulh;cmp
859 {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i64, .Cost: 3}, // eg mul;umulh;cmp asr
860 };
861 EVT MTy = TLI->getValueType(DL, Ty: RetTy->getContainedType(i: 0), AllowUnknown: true);
862 if (MTy.isSimple())
863 if (const auto *Entry = CostTableLookup(Table: WithOverflowCostTbl, ISD: ICA.getID(),
864 Ty: MTy.getSimpleVT()))
865 return Entry->Cost;
866 break;
867 }
868 case Intrinsic::fptosi_sat:
869 case Intrinsic::fptoui_sat: {
870 if (ICA.getArgTypes().empty())
871 break;
872 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
873 auto LT = getTypeLegalizationCost(Ty: ICA.getArgTypes()[0]);
874 EVT MTy = TLI->getValueType(DL, Ty: RetTy);
875 // Check for the legal types, which are where the size of the input and the
876 // output are the same, or we are using cvt f64->i32 or f32->i64.
877 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
878 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
879 LT.second == MVT::v2f64)) {
880 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
881 (LT.second == MVT::f64 && MTy == MVT::i32) ||
882 (LT.second == MVT::f32 && MTy == MVT::i64)))
883 return LT.first;
884 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
885 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
886 MTy.getScalarSizeInBits() == 64)
887 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
888 }
889 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
890 // f32.
891 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
892 return LT.first + getIntrinsicInstrCost(
893 ICA: {ICA.getID(),
894 RetTy,
895 {ICA.getArgTypes()[0]->getWithNewType(
896 EltTy: Type::getFloatTy(C&: RetTy->getContext()))}},
897 CostKind);
898 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
899 (LT.second == MVT::f16 && MTy == MVT::i64) ||
900 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
901 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
902 return LT.first;
903 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
904 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
905 MTy.getScalarSizeInBits() == 32)
906 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
907 // Extending vector types v8f16->v8i32. These current scalarize but the
908 // codegen could be better.
909 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
910 MTy.getScalarSizeInBits() == 64)
911 return MTy.getVectorNumElements() * 3;
912
913 // If we can we use a legal convert followed by a min+max
914 if ((LT.second.getScalarType() == MVT::f32 ||
915 LT.second.getScalarType() == MVT::f64 ||
916 LT.second.getScalarType() == MVT::f16) &&
917 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
918 Type *LegalTy =
919 Type::getIntNTy(C&: RetTy->getContext(), N: LT.second.getScalarSizeInBits());
920 if (LT.second.isVector())
921 LegalTy = VectorType::get(ElementType: LegalTy, EC: LT.second.getVectorElementCount());
922 InstructionCost Cost = 1;
923 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
924 LegalTy, {LegalTy, LegalTy});
925 Cost += getIntrinsicInstrCost(ICA: Attrs1, CostKind);
926 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
927 LegalTy, {LegalTy, LegalTy});
928 Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
929 return LT.first * Cost +
930 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
931 : 1);
932 }
933 // Otherwise we need to follow the default expansion that clamps the value
934 // using a float min/max with a fcmp+sel for nan handling when signed.
935 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
936 RetTy = RetTy->getScalarType();
937 if (LT.second.isVector()) {
938 FPTy = VectorType::get(ElementType: FPTy, EC: LT.second.getVectorElementCount());
939 RetTy = VectorType::get(ElementType: RetTy, EC: LT.second.getVectorElementCount());
940 }
941 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
942 InstructionCost Cost = getIntrinsicInstrCost(ICA: Attrs1, CostKind);
943 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
944 Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
945 Cost +=
946 getCastInstrCost(Opcode: IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
947 Dst: RetTy, Src: FPTy, CCH: TTI::CastContextHint::None, CostKind);
948 if (IsSigned) {
949 Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1);
950 Cost += getCmpSelInstrCost(Opcode: BinaryOperator::FCmp, ValTy: FPTy, CondTy,
951 VecPred: CmpInst::FCMP_UNO, CostKind);
952 Cost += getCmpSelInstrCost(Opcode: BinaryOperator::Select, ValTy: RetTy, CondTy,
953 VecPred: CmpInst::FCMP_UNO, CostKind);
954 }
955 return LT.first * Cost;
956 }
957 case Intrinsic::fshl:
958 case Intrinsic::fshr: {
959 if (ICA.getArgs().empty())
960 break;
961
962 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(V: ICA.getArgs()[2]);
963
964 // ROTR / ROTL is a funnel shift with equal first and second operand. For
965 // ROTR on integer registers (i32/i64) this can be done in a single ror
966 // instruction. A fshl with a non-constant shift uses a neg + ror.
967 if (RetTy->isIntegerTy() && ICA.getArgs()[0] == ICA.getArgs()[1] &&
968 (RetTy->getPrimitiveSizeInBits() == 32 ||
969 RetTy->getPrimitiveSizeInBits() == 64)) {
970 InstructionCost NegCost =
971 (ICA.getID() == Intrinsic::fshl && !OpInfoZ.isConstant()) ? 1 : 0;
972 return 1 + NegCost;
973 }
974
975 // TODO: Add handling for fshl where third argument is not a constant.
976 if (!OpInfoZ.isConstant())
977 break;
978
979 const auto LegalisationCost = getTypeLegalizationCost(Ty: RetTy);
980 if (OpInfoZ.isUniform()) {
981 static const CostTblEntry FshlTbl[] = {
982 {.ISD: Intrinsic::fshl, .Type: MVT::v4i32, .Cost: 2}, // shl + usra
983 {.ISD: Intrinsic::fshl, .Type: MVT::v2i64, .Cost: 2}, {.ISD: Intrinsic::fshl, .Type: MVT::v16i8, .Cost: 2},
984 {.ISD: Intrinsic::fshl, .Type: MVT::v8i16, .Cost: 2}, {.ISD: Intrinsic::fshl, .Type: MVT::v2i32, .Cost: 2},
985 {.ISD: Intrinsic::fshl, .Type: MVT::v8i8, .Cost: 2}, {.ISD: Intrinsic::fshl, .Type: MVT::v4i16, .Cost: 2}};
986 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
987 // to avoid having to duplicate the costs.
988 const auto *Entry =
989 CostTableLookup(Table: FshlTbl, ISD: Intrinsic::fshl, Ty: LegalisationCost.second);
990 if (Entry)
991 return LegalisationCost.first * Entry->Cost;
992 }
993
994 auto TyL = getTypeLegalizationCost(Ty: RetTy);
995 if (!RetTy->isIntegerTy())
996 break;
997
998 // Estimate cost manually, as types like i8 and i16 will get promoted to
999 // i32 and CostTableLookup will ignore the extra conversion cost.
1000 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
1001 RetTy->getScalarSizeInBits() < 64) ||
1002 (RetTy->getScalarSizeInBits() % 64 != 0);
1003 unsigned ExtraCost = HigherCost ? 1 : 0;
1004 if (RetTy->getScalarSizeInBits() == 32 ||
1005 RetTy->getScalarSizeInBits() == 64)
1006 ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single
1007 // extr instruction.
1008 else if (HigherCost)
1009 ExtraCost = 1;
1010 else
1011 break;
1012 return TyL.first + ExtraCost;
1013 }
1014 case Intrinsic::get_active_lane_mask: {
1015 auto RetTy = cast<VectorType>(Val: ICA.getReturnType());
1016 EVT RetVT = getTLI()->getValueType(DL, Ty: RetTy);
1017 EVT OpVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]);
1018 if (getTLI()->shouldExpandGetActiveLaneMask(VT: RetVT, OpVT))
1019 break;
1020
1021 if (RetTy->isScalableTy()) {
1022 if (TLI->getTypeAction(Context&: RetTy->getContext(), VT: RetVT) !=
1023 TargetLowering::TypeSplitVector)
1024 break;
1025
1026 auto LT = getTypeLegalizationCost(Ty: RetTy);
1027 InstructionCost Cost = LT.first;
1028 // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
1029 // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
1030 // nxv32i1 = get_active_lane_mask(base, idx) ->
1031 // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
1032 if (ST->hasSVE2p1() || ST->hasSME2()) {
1033 Cost /= 2;
1034 if (Cost == 1)
1035 return Cost;
1036 }
1037
1038 // If more than one whilelo intrinsic is required, include the extra cost
1039 // required by the saturating add & select required to increment the
1040 // start value after the first intrinsic call.
1041 Type *OpTy = ICA.getArgTypes()[0];
1042 IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
1043 InstructionCost SplitCost = getIntrinsicInstrCost(ICA: AddAttrs, CostKind);
1044 Type *CondTy = OpTy->getWithNewBitWidth(NewBitWidth: 1);
1045 SplitCost += getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: OpTy, CondTy,
1046 VecPred: CmpInst::ICMP_UGT, CostKind);
1047 return Cost + (SplitCost * (Cost - 1));
1048 } else if (!getTLI()->isTypeLegal(VT: RetVT)) {
1049 // We don't have enough context at this point to determine if the mask
1050 // is going to be kept live after the block, which will force the vXi1
1051 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
1052 // For now, we just assume the vectorizer created this intrinsic and
1053 // the result will be the input for a PHI. In this case the cost will
1054 // be extremely high for fixed-width vectors.
1055 // NOTE: getScalarizationOverhead returns a cost that's far too
1056 // pessimistic for the actual generated codegen. In reality there are
1057 // two instructions generated per lane.
1058 return cast<FixedVectorType>(Val: RetTy)->getNumElements() * 2;
1059 }
1060 break;
1061 }
1062 case Intrinsic::experimental_vector_match: {
1063 auto *NeedleTy = cast<FixedVectorType>(Val: ICA.getArgTypes()[1]);
1064 EVT SearchVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]);
1065 unsigned SearchSize = NeedleTy->getNumElements();
1066 if (!getTLI()->shouldExpandVectorMatch(VT: SearchVT, SearchSize)) {
1067 // Base cost for MATCH instructions. At least on the Neoverse V2 and
1068 // Neoverse V3, these are cheap operations with the same latency as a
1069 // vector ADD. In most cases, however, we also need to do an extra DUP.
1070 // For fixed-length vectors we currently need an extra five--six
1071 // instructions besides the MATCH.
1072 InstructionCost Cost = 4;
1073 if (isa<FixedVectorType>(Val: RetTy))
1074 Cost += 10;
1075 return Cost;
1076 }
1077 break;
1078 }
1079 case Intrinsic::experimental_cttz_elts: {
1080 EVT ArgVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]);
1081 if (!getTLI()->shouldExpandCttzElements(VT: ArgVT)) {
1082 // This will consist of a SVE brkb and a cntp instruction. These
1083 // typically have the same latency and half the throughput as a vector
1084 // add instruction.
1085 return 4;
1086 }
1087 break;
1088 }
1089 case Intrinsic::loop_dependence_raw_mask:
1090 case Intrinsic::loop_dependence_war_mask: {
1091 // The whilewr/rw instructions require SVE2 or SME.
1092 if (ST->hasSVE2() || ST->hasSME()) {
1093 EVT VecVT = getTLI()->getValueType(DL, Ty: RetTy);
1094 unsigned EltSizeInBytes =
1095 cast<ConstantInt>(Val: ICA.getArgs()[2])->getZExtValue();
1096 if (!is_contained(Set: {1u, 2u, 4u, 8u}, Element: EltSizeInBytes) ||
1097 VecVT.getVectorMinNumElements() != (16 / EltSizeInBytes))
1098 break;
1099 // For fixed-vector types we need to AND the mask with a ptrue vl<N>.
1100 return isa<FixedVectorType>(Val: RetTy) ? 2 : 1;
1101 }
1102 break;
1103 }
1104 case Intrinsic::experimental_vector_extract_last_active:
1105 if (ST->isSVEorStreamingSVEAvailable()) {
1106 auto [LegalCost, _] = getTypeLegalizationCost(Ty: ICA.getArgTypes()[0]);
1107 // This should turn into chained clastb instructions.
1108 return LegalCost;
1109 }
1110 break;
1111 case Intrinsic::pow: {
1112 // For scalar calls we know the target has the libcall, and for fixed-width
1113 // vectors we know for the worst case it can be scalarised.
1114 EVT VT = getTLI()->getValueType(DL, Ty: RetTy);
1115 RTLIB::Libcall LC = RTLIB::getPOW(RetVT: VT);
1116 bool HasLibcall = getTLI()->getLibcallImpl(Call: LC) != RTLIB::Unsupported;
1117 bool CanLowerWithLibcalls = !isa<ScalableVectorType>(Val: RetTy) || HasLibcall;
1118
1119 // If we know that the call can be lowered with libcalls then it's safe to
1120 // reduce the costs in some cases. This is important for scalable vectors,
1121 // since we cannot scalarize the call in the absence of a vector math
1122 // library.
1123 if (CanLowerWithLibcalls && ICA.getInst() && !ICA.getArgs().empty()) {
1124 // If we know the fast math flags and the exponent is a constant then the
1125 // cost may be less for some exponents like 0.25 and 0.75.
1126 const Constant *ExpC = dyn_cast<Constant>(Val: ICA.getArgs()[1]);
1127 if (ExpC && isa<VectorType>(Val: ExpC->getType()))
1128 ExpC = ExpC->getSplatValue();
1129 if (auto *ExpF = dyn_cast_or_null<ConstantFP>(Val: ExpC)) {
1130 // The argument must be a FP constant.
1131 bool Is025 = ExpF->getValueAPF().isExactlyValue(V: 0.25);
1132 bool Is075 = ExpF->getValueAPF().isExactlyValue(V: 0.75);
1133 FastMathFlags FMF = ICA.getInst()->getFastMathFlags();
1134 if ((Is025 || Is075) && FMF.noInfs() && FMF.approxFunc() &&
1135 (!Is025 || FMF.noSignedZeros())) {
1136 IntrinsicCostAttributes Attrs(Intrinsic::sqrt, RetTy, {RetTy}, FMF);
1137 InstructionCost Sqrt = getIntrinsicInstrCost(ICA: Attrs, CostKind);
1138 if (Is025)
1139 return 2 * Sqrt;
1140 InstructionCost FMul =
1141 getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: RetTy, CostKind);
1142 return (Sqrt * 2) + FMul;
1143 }
1144 // TODO: For 1/3 exponents we expect the cbrt call to be slightly
1145 // cheaper than pow.
1146 }
1147 }
1148
1149 if (HasLibcall)
1150 return getCallInstrCost(F: nullptr, RetTy, Tys: ICA.getArgTypes(), CostKind);
1151 break;
1152 }
1153 case Intrinsic::sqrt:
1154 case Intrinsic::fabs:
1155 case Intrinsic::ceil:
1156 case Intrinsic::floor:
1157 case Intrinsic::nearbyint:
1158 case Intrinsic::round:
1159 case Intrinsic::rint:
1160 case Intrinsic::roundeven:
1161 case Intrinsic::trunc:
1162 case Intrinsic::minnum:
1163 case Intrinsic::maxnum:
1164 case Intrinsic::minimum:
1165 case Intrinsic::maximum: {
1166 if (isa<ScalableVectorType>(Val: RetTy) && ST->isSVEorStreamingSVEAvailable()) {
1167 auto LT = getTypeLegalizationCost(Ty: RetTy);
1168 return LT.first;
1169 }
1170 break;
1171 }
1172 default:
1173 break;
1174 }
1175 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1176}
1177
/// Remove a redundant SVE predicate reinterpret round-trip that was split
/// across control flow: when every incoming value of the phi feeding \p II is
/// an aarch64.sve.convert.to.svbool whose source already has the type \p II
/// produces, replace \p II with a new phi over those original sources.
/// NOTE(review): the caller is assumed to pass an SVE reinterpret intrinsic
/// (e.g. convert.from.svbool) whose first operand is a PHINode — the call
/// site is not visible in this chunk; confirm against the dispatcher.
///
/// \param IC InstCombiner used to build the replacement phi.
/// \param II The reinterpret intrinsic whose operand 0 must be a phi node.
/// \returns The replacement result on success, std::nullopt otherwise.
static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
                                                   IntrinsicInst &II) {
  SmallVector<Instruction *, 32> Worklist;
  auto RequiredType = II.getType();

  auto *PN = dyn_cast<PHINode>(Val: II.getArgOperand(i: 0));
  assert(PN && "Expected Phi Node!");

  // Don't create a new Phi unless we can remove the old one.
  if (!PN->hasOneUse())
    return std::nullopt;

  // Bail out unless every incoming value is a convert.to.svbool whose source
  // type already matches RequiredType, i.e. the round-trip is a no-op.
  for (Value *IncValPhi : PN->incoming_values()) {
    auto *Reinterpret = dyn_cast<IntrinsicInst>(Val: IncValPhi);
    if (!Reinterpret ||
        Reinterpret->getIntrinsicID() !=
            Intrinsic::aarch64_sve_convert_to_svbool ||
        RequiredType != Reinterpret->getArgOperand(i: 0)->getType())
      return std::nullopt;
  }

  // Create the new Phi
  IC.Builder.SetInsertPoint(PN);
  PHINode *NPN = IC.Builder.CreatePHI(Ty: RequiredType, NumReservedValues: PN->getNumIncomingValues());
  Worklist.push_back(Elt: PN);

  // Route each incoming edge to the pre-reinterpret source value.
  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
    auto *Reinterpret = cast<Instruction>(Val: PN->getIncomingValue(i: I));
    NPN->addIncoming(V: Reinterpret->getOperand(i: 0), BB: PN->getIncomingBlock(i: I));
    Worklist.push_back(Elt: Reinterpret);
  }

  // Cleanup Phi Node and reinterprets.
  // NOTE(review): Worklist is populated but not consumed in this function;
  // the now-dead phi/reinterprets appear to be left for later InstCombine
  // dead-code cleanup — confirm.
  return IC.replaceInstUsesWith(I&: II, V: NPN);
}
1215
1216// A collection of properties common to SVE intrinsics that allow for combines
1217// to be written without needing to know the specific intrinsic.
1218struct SVEIntrinsicInfo {
1219 //
1220 // Helper routines for common intrinsic definitions.
1221 //
1222
1223 // e.g. llvm.aarch64.sve.add pg, op1, op2
1224 // with IID ==> llvm.aarch64.sve.add_u
1225 static SVEIntrinsicInfo
1226 defaultMergingOp(Intrinsic::ID IID = Intrinsic::not_intrinsic) {
1227 return SVEIntrinsicInfo()
1228 .setGoverningPredicateOperandIdx(0)
1229 .setOperandIdxInactiveLanesTakenFrom(1)
1230 .setMatchingUndefIntrinsic(IID);
1231 }
1232
1233 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1234 static SVEIntrinsicInfo defaultMergingUnaryOp() {
1235 return SVEIntrinsicInfo()
1236 .setGoverningPredicateOperandIdx(1)
1237 .setOperandIdxInactiveLanesTakenFrom(0)
1238 .setOperandIdxWithNoActiveLanes(0);
1239 }
1240
1241 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1242 static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp() {
1243 return SVEIntrinsicInfo()
1244 .setGoverningPredicateOperandIdx(1)
1245 .setOperandIdxInactiveLanesTakenFrom(0);
1246 }
1247
1248 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1249 static SVEIntrinsicInfo defaultUndefOp() {
1250 return SVEIntrinsicInfo()
1251 .setGoverningPredicateOperandIdx(0)
1252 .setInactiveLanesAreNotDefined();
1253 }
1254
1255 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1256 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1257 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1258 return SVEIntrinsicInfo()
1259 .setGoverningPredicateOperandIdx(GPIndex)
1260 .setInactiveLanesAreUnused();
1261 }
1262
1263 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1264 // llvm.aarch64.sve.ld1 pg, ptr
1265 static SVEIntrinsicInfo defaultZeroingOp() {
1266 return SVEIntrinsicInfo()
1267 .setGoverningPredicateOperandIdx(0)
1268 .setInactiveLanesAreUnused()
1269 .setResultIsZeroInitialized();
1270 }
1271
1272 // All properties relate to predication and thus having a general predicate
1273 // is the minimum requirement to say there is intrinsic info to act on.
1274 explicit operator bool() const { return hasGoverningPredicate(); }
1275
1276 //
1277 // Properties relating to the governing predicate.
1278 //
1279
1280 bool hasGoverningPredicate() const {
1281 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1282 }
1283
1284 unsigned getGoverningPredicateOperandIdx() const {
1285 assert(hasGoverningPredicate() && "Propery not set!");
1286 return GoverningPredicateIdx;
1287 }
1288
1289 SVEIntrinsicInfo &setGoverningPredicateOperandIdx(unsigned Index) {
1290 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1291 GoverningPredicateIdx = Index;
1292 return *this;
1293 }
1294
1295 //
1296 // Properties relating to operations the intrinsic could be transformed into.
1297 // NOTE: This does not mean such a transformation is always possible, but the
1298 // knowledge makes it possible to reuse existing optimisations without needing
1299 // to embed specific handling for each intrinsic. For example, instruction
1300 // simplification can be used to optimise an intrinsic's active lanes.
1301 //
1302
1303 bool hasMatchingUndefIntrinsic() const {
1304 return UndefIntrinsic != Intrinsic::not_intrinsic;
1305 }
1306
1307 Intrinsic::ID getMatchingUndefIntrinsic() const {
1308 assert(hasMatchingUndefIntrinsic() && "Propery not set!");
1309 return UndefIntrinsic;
1310 }
1311
1312 SVEIntrinsicInfo &setMatchingUndefIntrinsic(Intrinsic::ID IID) {
1313 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1314 UndefIntrinsic = IID;
1315 return *this;
1316 }
1317
1318 bool hasMatchingIROpode() const { return IROpcode != 0; }
1319
1320 unsigned getMatchingIROpode() const {
1321 assert(hasMatchingIROpode() && "Propery not set!");
1322 return IROpcode;
1323 }
1324
1325 SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) {
1326 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1327 IROpcode = Opcode;
1328 return *this;
1329 }
1330
1331 //
1332 // Properties relating to the result of inactive lanes.
1333 //
1334
1335 bool inactiveLanesTakenFromOperand() const {
1336 return ResultLanes == InactiveLanesTakenFromOperand;
1337 }
1338
1339 unsigned getOperandIdxInactiveLanesTakenFrom() const {
1340 assert(inactiveLanesTakenFromOperand() && "Propery not set!");
1341 return OperandIdxForInactiveLanes;
1342 }
1343
1344 SVEIntrinsicInfo &setOperandIdxInactiveLanesTakenFrom(unsigned Index) {
1345 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1346 ResultLanes = InactiveLanesTakenFromOperand;
1347 OperandIdxForInactiveLanes = Index;
1348 return *this;
1349 }
1350
1351 bool inactiveLanesAreNotDefined() const {
1352 return ResultLanes == InactiveLanesAreNotDefined;
1353 }
1354
1355 SVEIntrinsicInfo &setInactiveLanesAreNotDefined() {
1356 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1357 ResultLanes = InactiveLanesAreNotDefined;
1358 return *this;
1359 }
1360
1361 bool inactiveLanesAreUnused() const {
1362 return ResultLanes == InactiveLanesAreUnused;
1363 }
1364
1365 SVEIntrinsicInfo &setInactiveLanesAreUnused() {
1366 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1367 ResultLanes = InactiveLanesAreUnused;
1368 return *this;
1369 }
1370
1371 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1372 // inactiveLanesAreZeroed =
1373 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1374 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1375
1376 SVEIntrinsicInfo &setResultIsZeroInitialized() {
1377 ResultIsZeroInitialized = true;
1378 return *this;
1379 }
1380
1381 //
1382 // The first operand of unary merging operations is typically only used to
1383 // set the result for inactive lanes. Knowing this allows us to deadcode the
1384 // operand when we can prove there are no inactive lanes.
1385 //
1386
1387 bool hasOperandWithNoActiveLanes() const {
1388 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1389 }
1390
1391 unsigned getOperandIdxWithNoActiveLanes() const {
1392 assert(hasOperandWithNoActiveLanes() && "Propery not set!");
1393 return OperandIdxWithNoActiveLanes;
1394 }
1395
1396 SVEIntrinsicInfo &setOperandIdxWithNoActiveLanes(unsigned Index) {
1397 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1398 OperandIdxWithNoActiveLanes = Index;
1399 return *this;
1400 }
1401
1402private:
1403 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1404
1405 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1406 unsigned IROpcode = 0;
1407
1408 enum PredicationStyle {
1409 Uninitialized,
1410 InactiveLanesTakenFromOperand,
1411 InactiveLanesAreNotDefined,
1412 InactiveLanesAreUnused
1413 } ResultLanes = Uninitialized;
1414
1415 bool ResultIsZeroInitialized = false;
1416 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1417 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1418};
1419
1420static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
1421 // Some SVE intrinsics do not use scalable vector types, but since they are
1422 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1423 if (!isa<ScalableVectorType>(Val: II.getType()) &&
1424 all_of(Range: II.args(), P: [&](const Value *V) {
1425 return !isa<ScalableVectorType>(Val: V->getType());
1426 }))
1427 return SVEIntrinsicInfo();
1428
1429 Intrinsic::ID IID = II.getIntrinsicID();
1430 switch (IID) {
1431 default:
1432 break;
1433 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1434 case Intrinsic::aarch64_sve_fcvt_f16f32:
1435 case Intrinsic::aarch64_sve_fcvt_f16f64:
1436 case Intrinsic::aarch64_sve_fcvt_f32f16:
1437 case Intrinsic::aarch64_sve_fcvt_f32f64:
1438 case Intrinsic::aarch64_sve_fcvt_f64f16:
1439 case Intrinsic::aarch64_sve_fcvt_f64f32:
1440 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1441 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1442 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1443 case Intrinsic::aarch64_sve_fcvtzs:
1444 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1445 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1446 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1447 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1448 case Intrinsic::aarch64_sve_fcvtzu:
1449 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1450 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1451 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1452 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1453 case Intrinsic::aarch64_sve_scvtf:
1454 case Intrinsic::aarch64_sve_scvtf_f16i32:
1455 case Intrinsic::aarch64_sve_scvtf_f16i64:
1456 case Intrinsic::aarch64_sve_scvtf_f32i64:
1457 case Intrinsic::aarch64_sve_scvtf_f64i32:
1458 case Intrinsic::aarch64_sve_ucvtf:
1459 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1460 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1461 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1462 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1463 return SVEIntrinsicInfo::defaultMergingUnaryOp();
1464
1465 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1466 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1467 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1468 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1469 return SVEIntrinsicInfo::defaultMergingUnaryNarrowingTopOp();
1470
1471 case Intrinsic::aarch64_sve_fabd:
1472 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fabd_u);
1473 case Intrinsic::aarch64_sve_fadd:
1474 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fadd_u)
1475 .setMatchingIROpcode(Instruction::FAdd);
1476 case Intrinsic::aarch64_sve_fdiv:
1477 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fdiv_u)
1478 .setMatchingIROpcode(Instruction::FDiv);
1479 case Intrinsic::aarch64_sve_fmax:
1480 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmax_u);
1481 case Intrinsic::aarch64_sve_fmaxnm:
1482 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmaxnm_u);
1483 case Intrinsic::aarch64_sve_fmin:
1484 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmin_u);
1485 case Intrinsic::aarch64_sve_fminnm:
1486 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fminnm_u);
1487 case Intrinsic::aarch64_sve_fmla:
1488 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmla_u);
1489 case Intrinsic::aarch64_sve_fmls:
1490 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmls_u);
1491 case Intrinsic::aarch64_sve_fmul:
1492 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmul_u)
1493 .setMatchingIROpcode(Instruction::FMul);
1494 case Intrinsic::aarch64_sve_fmulx:
1495 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmulx_u);
1496 case Intrinsic::aarch64_sve_fnmla:
1497 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fnmla_u);
1498 case Intrinsic::aarch64_sve_fnmls:
1499 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fnmls_u);
1500 case Intrinsic::aarch64_sve_fsub:
1501 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fsub_u)
1502 .setMatchingIROpcode(Instruction::FSub);
1503 case Intrinsic::aarch64_sve_add:
1504 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_add_u)
1505 .setMatchingIROpcode(Instruction::Add);
1506 case Intrinsic::aarch64_sve_mla:
1507 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mla_u);
1508 case Intrinsic::aarch64_sve_mls:
1509 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mls_u);
1510 case Intrinsic::aarch64_sve_mul:
1511 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mul_u)
1512 .setMatchingIROpcode(Instruction::Mul);
1513 case Intrinsic::aarch64_sve_sabd:
1514 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sabd_u);
1515 case Intrinsic::aarch64_sve_sdiv:
1516 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sdiv_u)
1517 .setMatchingIROpcode(Instruction::SDiv);
1518 case Intrinsic::aarch64_sve_smax:
1519 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smax_u);
1520 case Intrinsic::aarch64_sve_smin:
1521 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smin_u);
1522 case Intrinsic::aarch64_sve_smulh:
1523 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smulh_u);
1524 case Intrinsic::aarch64_sve_sub:
1525 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sub_u)
1526 .setMatchingIROpcode(Instruction::Sub);
1527 case Intrinsic::aarch64_sve_uabd:
1528 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uabd_u);
1529 case Intrinsic::aarch64_sve_udiv:
1530 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_udiv_u)
1531 .setMatchingIROpcode(Instruction::UDiv);
1532 case Intrinsic::aarch64_sve_umax:
1533 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umax_u);
1534 case Intrinsic::aarch64_sve_umin:
1535 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umin_u);
1536 case Intrinsic::aarch64_sve_umulh:
1537 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umulh_u);
1538 case Intrinsic::aarch64_sve_asr:
1539 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_asr_u)
1540 .setMatchingIROpcode(Instruction::AShr);
1541 case Intrinsic::aarch64_sve_lsl:
1542 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_lsl_u)
1543 .setMatchingIROpcode(Instruction::Shl);
1544 case Intrinsic::aarch64_sve_lsr:
1545 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_lsr_u)
1546 .setMatchingIROpcode(Instruction::LShr);
1547 case Intrinsic::aarch64_sve_and:
1548 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_and_u)
1549 .setMatchingIROpcode(Instruction::And);
1550 case Intrinsic::aarch64_sve_bic:
1551 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_bic_u);
1552 case Intrinsic::aarch64_sve_eor:
1553 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_eor_u)
1554 .setMatchingIROpcode(Instruction::Xor);
1555 case Intrinsic::aarch64_sve_orr:
1556 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_orr_u)
1557 .setMatchingIROpcode(Instruction::Or);
1558 case Intrinsic::aarch64_sve_shsub:
1559 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_shsub_u);
1560 case Intrinsic::aarch64_sve_shsubr:
1561 return SVEIntrinsicInfo::defaultMergingOp();
1562 case Intrinsic::aarch64_sve_sqrshl:
1563 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sqrshl_u);
1564 case Intrinsic::aarch64_sve_sqshl:
1565 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sqshl_u);
1566 case Intrinsic::aarch64_sve_sqsub:
1567 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sqsub_u);
1568 case Intrinsic::aarch64_sve_srshl:
1569 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_srshl_u);
1570 case Intrinsic::aarch64_sve_uhsub:
1571 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uhsub_u);
1572 case Intrinsic::aarch64_sve_uhsubr:
1573 return SVEIntrinsicInfo::defaultMergingOp();
1574 case Intrinsic::aarch64_sve_uqrshl:
1575 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uqrshl_u);
1576 case Intrinsic::aarch64_sve_uqshl:
1577 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uqshl_u);
1578 case Intrinsic::aarch64_sve_uqsub:
1579 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uqsub_u);
1580 case Intrinsic::aarch64_sve_urshl:
1581 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_urshl_u);
1582
1583 case Intrinsic::aarch64_sve_add_u:
1584 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1585 Instruction::Add);
1586 case Intrinsic::aarch64_sve_and_u:
1587 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1588 Instruction::And);
1589 case Intrinsic::aarch64_sve_asr_u:
1590 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1591 Instruction::AShr);
1592 case Intrinsic::aarch64_sve_eor_u:
1593 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1594 Instruction::Xor);
1595 case Intrinsic::aarch64_sve_fadd_u:
1596 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1597 Instruction::FAdd);
1598 case Intrinsic::aarch64_sve_fdiv_u:
1599 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1600 Instruction::FDiv);
1601 case Intrinsic::aarch64_sve_fmul_u:
1602 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1603 Instruction::FMul);
1604 case Intrinsic::aarch64_sve_fsub_u:
1605 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1606 Instruction::FSub);
1607 case Intrinsic::aarch64_sve_lsl_u:
1608 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1609 Instruction::Shl);
1610 case Intrinsic::aarch64_sve_lsr_u:
1611 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1612 Instruction::LShr);
1613 case Intrinsic::aarch64_sve_mul_u:
1614 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1615 Instruction::Mul);
1616 case Intrinsic::aarch64_sve_orr_u:
1617 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1618 Instruction::Or);
1619 case Intrinsic::aarch64_sve_sdiv_u:
1620 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1621 Instruction::SDiv);
1622 case Intrinsic::aarch64_sve_sub_u:
1623 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1624 Instruction::Sub);
1625 case Intrinsic::aarch64_sve_udiv_u:
1626 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1627 Instruction::UDiv);
1628
1629 case Intrinsic::aarch64_sve_addqv:
1630 case Intrinsic::aarch64_sve_and_z:
1631 case Intrinsic::aarch64_sve_bic_z:
1632 case Intrinsic::aarch64_sve_brka_z:
1633 case Intrinsic::aarch64_sve_brkb_z:
1634 case Intrinsic::aarch64_sve_brkn_z:
1635 case Intrinsic::aarch64_sve_brkpa_z:
1636 case Intrinsic::aarch64_sve_brkpb_z:
1637 case Intrinsic::aarch64_sve_cntp:
1638 case Intrinsic::aarch64_sve_compact:
1639 case Intrinsic::aarch64_sve_eor_z:
1640 case Intrinsic::aarch64_sve_eorv:
1641 case Intrinsic::aarch64_sve_eorqv:
1642 case Intrinsic::aarch64_sve_nand_z:
1643 case Intrinsic::aarch64_sve_nor_z:
1644 case Intrinsic::aarch64_sve_orn_z:
1645 case Intrinsic::aarch64_sve_orr_z:
1646 case Intrinsic::aarch64_sve_orv:
1647 case Intrinsic::aarch64_sve_orqv:
1648 case Intrinsic::aarch64_sve_pnext:
1649 case Intrinsic::aarch64_sve_rdffr_z:
1650 case Intrinsic::aarch64_sve_saddv:
1651 case Intrinsic::aarch64_sve_uaddv:
1652 case Intrinsic::aarch64_sve_umaxv:
1653 case Intrinsic::aarch64_sve_umaxqv:
1654 case Intrinsic::aarch64_sve_cmpeq:
1655 case Intrinsic::aarch64_sve_cmpeq_wide:
1656 case Intrinsic::aarch64_sve_cmpge:
1657 case Intrinsic::aarch64_sve_cmpge_wide:
1658 case Intrinsic::aarch64_sve_cmpgt:
1659 case Intrinsic::aarch64_sve_cmpgt_wide:
1660 case Intrinsic::aarch64_sve_cmphi:
1661 case Intrinsic::aarch64_sve_cmphi_wide:
1662 case Intrinsic::aarch64_sve_cmphs:
1663 case Intrinsic::aarch64_sve_cmphs_wide:
1664 case Intrinsic::aarch64_sve_cmple_wide:
1665 case Intrinsic::aarch64_sve_cmplo_wide:
1666 case Intrinsic::aarch64_sve_cmpls_wide:
1667 case Intrinsic::aarch64_sve_cmplt_wide:
1668 case Intrinsic::aarch64_sve_cmpne:
1669 case Intrinsic::aarch64_sve_cmpne_wide:
1670 case Intrinsic::aarch64_sve_facge:
1671 case Intrinsic::aarch64_sve_facgt:
1672 case Intrinsic::aarch64_sve_fcmpeq:
1673 case Intrinsic::aarch64_sve_fcmpge:
1674 case Intrinsic::aarch64_sve_fcmpgt:
1675 case Intrinsic::aarch64_sve_fcmpne:
1676 case Intrinsic::aarch64_sve_fcmpuo:
1677 case Intrinsic::aarch64_sve_ld1:
1678 case Intrinsic::aarch64_sve_ld1_gather:
1679 case Intrinsic::aarch64_sve_ld1_gather_index:
1680 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1681 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1682 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1683 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1684 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1685 case Intrinsic::aarch64_sve_ld1q_gather_index:
1686 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1687 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1688 case Intrinsic::aarch64_sve_ld1ro:
1689 case Intrinsic::aarch64_sve_ld1rq:
1690 case Intrinsic::aarch64_sve_ld1udq:
1691 case Intrinsic::aarch64_sve_ld1uwq:
1692 case Intrinsic::aarch64_sve_ld2_sret:
1693 case Intrinsic::aarch64_sve_ld2q_sret:
1694 case Intrinsic::aarch64_sve_ld3_sret:
1695 case Intrinsic::aarch64_sve_ld3q_sret:
1696 case Intrinsic::aarch64_sve_ld4_sret:
1697 case Intrinsic::aarch64_sve_ld4q_sret:
1698 case Intrinsic::aarch64_sve_ldff1:
1699 case Intrinsic::aarch64_sve_ldff1_gather:
1700 case Intrinsic::aarch64_sve_ldff1_gather_index:
1701 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1702 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1703 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1704 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1705 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1706 case Intrinsic::aarch64_sve_ldnf1:
1707 case Intrinsic::aarch64_sve_ldnt1:
1708 case Intrinsic::aarch64_sve_ldnt1_gather:
1709 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1710 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1711 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1712 return SVEIntrinsicInfo::defaultZeroingOp();
1713
1714 case Intrinsic::aarch64_sve_prf:
1715 case Intrinsic::aarch64_sve_prfb_gather_index:
1716 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1717 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1718 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1719 case Intrinsic::aarch64_sve_prfd_gather_index:
1720 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1721 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1722 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1723 case Intrinsic::aarch64_sve_prfh_gather_index:
1724 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1725 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1726 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1727 case Intrinsic::aarch64_sve_prfw_gather_index:
1728 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1729 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1730 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1731 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 0);
1732
1733 case Intrinsic::aarch64_sve_st1_scatter:
1734 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1735 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1736 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1737 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1738 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1739 case Intrinsic::aarch64_sve_st1dq:
1740 case Intrinsic::aarch64_sve_st1q_scatter_index:
1741 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1742 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1743 case Intrinsic::aarch64_sve_st1wq:
1744 case Intrinsic::aarch64_sve_stnt1:
1745 case Intrinsic::aarch64_sve_stnt1_scatter:
1746 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1747 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1748 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1749 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 1);
1750 case Intrinsic::aarch64_sve_st2:
1751 case Intrinsic::aarch64_sve_st2q:
1752 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 2);
1753 case Intrinsic::aarch64_sve_st3:
1754 case Intrinsic::aarch64_sve_st3q:
1755 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 3);
1756 case Intrinsic::aarch64_sve_st4:
1757 case Intrinsic::aarch64_sve_st4q:
1758 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 4);
1759 }
1760
1761 return SVEIntrinsicInfo();
1762}
1763
// Returns true when \p Pred is known to be an all-active (all-true) SVE
// predicate. Looks through svbool reinterpret casts that provably cannot
// clear any of the lanes relevant to the final type.
static bool isAllActivePredicate(Value *Pred) {
  Value *UncastedPred;

  // Look through predicate casts that only remove lanes.
  if (match(V: Pred, P: m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
                   Op0: m_Value(V&: UncastedPred)))) {
    auto *OrigPredTy = cast<ScalableVectorType>(Val: Pred->getType());
    Pred = UncastedPred;

    // Also look through a full round-trip: from_svbool(to_svbool(P)).
    if (match(V: Pred, P: m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
                     Op0: m_Value(V&: UncastedPred))))
      // If the predicate has the same or less lanes than the uncasted predicate
      // then we know the casting has no effect.
      if (OrigPredTy->getMinNumElements() <=
          cast<ScalableVectorType>(Val: UncastedPred->getType())
              ->getMinNumElements())
        Pred = UncastedPred;
  }

  // Whatever we resolved to must itself be a constant all-ones predicate.
  auto *C = dyn_cast<Constant>(Val: Pred);
  return C && C->isAllOnesValue();
}
1786
1787// Simplify `V` by only considering the operations that affect active lanes.
1788// This function should only return existing Values or newly created Constants.
1789static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1790 auto *Dup = dyn_cast<IntrinsicInst>(Val: V);
1791 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1792 Dup->getOperand(i_nocapture: 1) == Pg && isa<Constant>(Val: Dup->getOperand(i_nocapture: 2)))
1793 return ConstantVector::getSplat(
1794 EC: cast<VectorType>(Val: V->getType())->getElementCount(),
1795 Elt: cast<Constant>(Val: Dup->getOperand(i_nocapture: 2)));
1796
1797 return V;
1798}
1799
// Simplify a predicated SVE binary intrinsic (e.g. sve.mul) by running the
// generic IR binop simplifier over its data operands. Returns the replacement
// instruction, or std::nullopt when no simplification applies.
static std::optional<Instruction *>
simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II,
                          const SVEIntrinsicInfo &IInfo) {
  const unsigned Opc = IInfo.getMatchingIROpode();
  assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");

  Value *Pg = II.getOperand(i_nocapture: 0);
  Value *Op1 = II.getOperand(i_nocapture: 1);
  Value *Op2 = II.getOperand(i_nocapture: 2);
  const DataLayout &DL = II.getDataLayout();

  // Canonicalise constants to the RHS.
  if (Instruction::isCommutative(Opcode: Opc) && IInfo.inactiveLanesAreNotDefined() &&
      isa<Constant>(Val: Op1) && !isa<Constant>(Val: Op2)) {
    IC.replaceOperand(I&: II, OpNum: 1, V: Op2);
    IC.replaceOperand(I&: II, OpNum: 2, V: Op1);
    return &II;
  }

  // Only active lanes matter when simplifying the operation.
  Op1 = stripInactiveLanes(V: Op1, Pg);
  Op2 = stripInactiveLanes(V: Op2, Pg);

  // Reuse the generic simplifier, propagating fast-math flags when present.
  Value *SimpleII;
  if (auto FII = dyn_cast<FPMathOperator>(Val: &II))
    SimpleII = simplifyBinOp(Opcode: Opc, LHS: Op1, RHS: Op2, FMF: FII->getFastMathFlags(), Q: DL);
  else
    SimpleII = simplifyBinOp(Opcode: Opc, LHS: Op1, RHS: Op2, Q: DL);

  // An SVE intrinsic's result is always defined. However, this is not the case
  // for its equivalent IR instruction (e.g. when shifting by an amount more
  // than the data's bitwidth). Simplifications to an undefined result must be
  // ignored to preserve the intrinsic's expected behaviour.
  if (!SimpleII || isa<UndefValue>(Val: SimpleII))
    return std::nullopt;

  // With undefined inactive lanes the simplified value can replace the
  // intrinsic outright.
  if (IInfo.inactiveLanesAreNotDefined())
    return IC.replaceInstUsesWith(I&: II, V: SimpleII);

  Value *Inactive = II.getOperand(i_nocapture: IInfo.getOperandIdxInactiveLanesTakenFrom());

  // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
  if (SimpleII == Inactive)
    return IC.replaceInstUsesWith(I&: II, V: SimpleII);

  // Inactive lanes must be preserved.
  SimpleII = IC.Builder.CreateSelect(C: Pg, True: SimpleII, False: Inactive);
  return IC.replaceInstUsesWith(I&: II, V: SimpleII);
}
1849
// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
// to operations with less strict inactive lane requirements.
static std::optional<Instruction *>
simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II,
                     const SVEIntrinsicInfo &IInfo) {
  // All folds below reason about the governing predicate; bail if there is
  // none.
  if (!IInfo.hasGoverningPredicate())
    return std::nullopt;

  auto *OpPredicate = II.getOperand(i_nocapture: IInfo.getGoverningPredicateOperandIdx());

  // If there are no active lanes.
  if (match(V: OpPredicate, P: m_ZeroInt())) {
    // Merging ops pass through the operand supplying the inactive lanes.
    if (IInfo.inactiveLanesTakenFromOperand())
      return IC.replaceInstUsesWith(
          I&: II, V: II.getOperand(i_nocapture: IInfo.getOperandIdxInactiveLanesTakenFrom()));

    if (IInfo.inactiveLanesAreUnused()) {
      // Zero-initialised results become an all-zero constant; otherwise the
      // result is unused and the instruction can simply be erased.
      if (IInfo.resultIsZeroInitialized())
        IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty: II.getType()));

      return IC.eraseInstFromFunction(I&: II);
    }
  }

  // If there are no inactive lanes.
  if (isAllActivePredicate(Pred: OpPredicate)) {
    // An operand with no active lanes contributes nothing; undef it.
    if (IInfo.hasOperandWithNoActiveLanes()) {
      unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
      if (!isa<UndefValue>(Val: II.getOperand(i_nocapture: OpIdx)))
        return IC.replaceOperand(I&: II, OpNum: OpIdx, V: UndefValue::get(T: II.getType()));
    }

    // Canonicalise to the _u (undef inactive lanes) form of the intrinsic,
    // which places fewer constraints on later combines.
    if (IInfo.hasMatchingUndefIntrinsic()) {
      auto *NewDecl = Intrinsic::getOrInsertDeclaration(
          M: II.getModule(), id: IInfo.getMatchingUndefIntrinsic(), Tys: {II.getType()});
      II.setCalledFunction(NewDecl);
      return &II;
    }
  }

  // Operation specific simplifications.
  if (IInfo.hasMatchingIROpode() &&
      Instruction::isBinaryOp(Opcode: IInfo.getMatchingIROpode()))
    return simplifySVEIntrinsicBinOp(IC, II, IInfo);

  return std::nullopt;
}
1897
// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
// => (binop (pred) (from_svbool _) (from_svbool _))
//
// The above transformation eliminates a `to_svbool` in the predicate
// operand of bitwise operation `binop` by narrowing the vector width of
// the operation. For example, it would convert a `<vscale x 16 x i1>
// and` into a `<vscale x 4 x i1> and`. This is profitable because
// to_svbool must zero the new lanes during widening, whereas
// from_svbool is free.
static std::optional<Instruction *>
tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
  // II is the from_svbool; its operand must be the binop being narrowed.
  auto BinOp = dyn_cast<IntrinsicInst>(Val: II.getOperand(i_nocapture: 0));
  if (!BinOp)
    return std::nullopt;

  // Only the predicated bitwise logic intrinsics are handled.
  auto IntrinsicID = BinOp->getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::aarch64_sve_and_z:
  case Intrinsic::aarch64_sve_bic_z:
  case Intrinsic::aarch64_sve_eor_z:
  case Intrinsic::aarch64_sve_nand_z:
  case Intrinsic::aarch64_sve_nor_z:
  case Intrinsic::aarch64_sve_orn_z:
  case Intrinsic::aarch64_sve_orr_z:
    break;
  default:
    return std::nullopt;
  }

  auto BinOpPred = BinOp->getOperand(i_nocapture: 0);
  auto BinOpOp1 = BinOp->getOperand(i_nocapture: 1);
  auto BinOpOp2 = BinOp->getOperand(i_nocapture: 2);

  // The binop's predicate must itself be a widening to_svbool cast.
  auto PredIntr = dyn_cast<IntrinsicInst>(Val: BinOpPred);
  if (!PredIntr ||
      PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
    return std::nullopt;

  // The narrowed binop is created at the uncasted predicate's width, so it
  // must match the type of the outer from_svbool result.
  auto PredOp = PredIntr->getOperand(i_nocapture: 0);
  auto PredOpTy = cast<VectorType>(Val: PredOp->getType());
  if (PredOpTy != II.getType())
    return std::nullopt;

  // Narrow both data operands, reusing a single cast when they are the same
  // value.
  SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
  auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
      ID: Intrinsic::aarch64_sve_convert_from_svbool, Types: {PredOpTy}, Args: {BinOpOp1});
  NarrowedBinOpArgs.push_back(Elt: NarrowBinOpOp1);
  if (BinOpOp1 == BinOpOp2)
    NarrowedBinOpArgs.push_back(Elt: NarrowBinOpOp1);
  else
    NarrowedBinOpArgs.push_back(Elt: IC.Builder.CreateIntrinsic(
        ID: Intrinsic::aarch64_sve_convert_from_svbool, Types: {PredOpTy}, Args: {BinOpOp2}));

  auto NarrowedBinOp =
      IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {PredOpTy}, Args: NarrowedBinOpArgs);
  return IC.replaceInstUsesWith(I&: II, V: NarrowedBinOp);
}
1955
// Combine from_svbool by walking back through a chain of svbool conversions
// looking for a value of the result type that is equivalent to the final
// result, in which case the whole chain can be bypassed.
static std::optional<Instruction *>
instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
  // If the reinterpret instruction operand is a PHI Node
  if (isa<PHINode>(Val: II.getArgOperand(i: 0)))
    return processPhiNode(IC, II);

  if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
    return BinOpCombine;

  // Ignore converts to/from svcount_t.
  if (isa<TargetExtType>(Val: II.getArgOperand(i: 0)->getType()) ||
      isa<TargetExtType>(Val: II.getType()))
    return std::nullopt;

  SmallVector<Instruction *, 32> CandidatesForRemoval;
  Value *Cursor = II.getOperand(i_nocapture: 0), *EarliestReplacement = nullptr;

  const auto *IVTy = cast<VectorType>(Val: II.getType());

  // Walk the chain of conversions.
  while (Cursor) {
    // If the type of the cursor has fewer lanes than the final result, zeroing
    // must take place, which breaks the equivalence chain.
    const auto *CursorVTy = cast<VectorType>(Val: Cursor->getType());
    if (CursorVTy->getElementCount().getKnownMinValue() <
        IVTy->getElementCount().getKnownMinValue())
      break;

    // If the cursor has the same type as I, it is a viable replacement.
    // Keep walking: an even earlier equivalent value may exist.
    if (Cursor->getType() == IVTy)
      EarliestReplacement = Cursor;

    auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Val: Cursor);

    // If this is not an SVE conversion intrinsic, this is the end of the chain.
    if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
                                  Intrinsic::aarch64_sve_convert_to_svbool ||
                              IntrinsicCursor->getIntrinsicID() ==
                                  Intrinsic::aarch64_sve_convert_from_svbool))
      break;

    CandidatesForRemoval.insert(I: CandidatesForRemoval.begin(), Elt: IntrinsicCursor);
    Cursor = IntrinsicCursor->getOperand(i_nocapture: 0);
  }

  // If no viable replacement in the conversion chain was found, there is
  // nothing to do.
  if (!EarliestReplacement)
    return std::nullopt;

  return IC.replaceInstUsesWith(I&: II, V: EarliestReplacement);
}
2008
2009static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
2010 IntrinsicInst &II) {
2011 // svsel(ptrue, x, y) => x
2012 auto *OpPredicate = II.getOperand(i_nocapture: 0);
2013 if (isAllActivePredicate(Pred: OpPredicate))
2014 return IC.replaceInstUsesWith(I&: II, V: II.getOperand(i_nocapture: 1));
2015
2016 auto Select =
2017 IC.Builder.CreateSelect(C: OpPredicate, True: II.getOperand(i_nocapture: 1), False: II.getOperand(i_nocapture: 2));
2018 return IC.replaceInstUsesWith(I&: II, V: Select);
2019}
2020
2021static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
2022 IntrinsicInst &II) {
2023 Value *Pg = II.getOperand(i_nocapture: 1);
2024
2025 // sve.dup(V, all_active, X) ==> splat(X)
2026 if (isAllActivePredicate(Pred: Pg)) {
2027 auto *RetTy = cast<ScalableVectorType>(Val: II.getType());
2028 Value *Splat = IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(),
2029 V: II.getArgOperand(i: 2));
2030 return IC.replaceInstUsesWith(I&: II, V: Splat);
2031 }
2032
2033 if (!match(V: Pg, P: m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
2034 Op0: m_SpecificInt(V: AArch64SVEPredPattern::vl1))))
2035 return std::nullopt;
2036
2037 // sve.dup(V, sve.ptrue(vl1), X) ==> insertelement V, X, 0
2038 Value *Insert = IC.Builder.CreateInsertElement(
2039 Vec: II.getArgOperand(i: 0), NewElt: II.getArgOperand(i: 2), Idx: uint64_t(0));
2040 return IC.replaceInstUsesWith(I&: II, V: Insert);
2041}
2042
2043static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
2044 IntrinsicInst &II) {
2045 // Replace DupX with a regular IR splat.
2046 auto *RetTy = cast<ScalableVectorType>(Val: II.getType());
2047 Value *Splat = IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(),
2048 V: II.getArgOperand(i: 0));
2049 Splat->takeName(V: &II);
2050 return IC.replaceInstUsesWith(I&: II, V: Splat);
2051}
2052
// Fold cmpne(all_active, dupq_lane(<constant vector>), 0) to an equivalent
// ptrue-derived predicate when the constant's byte pattern matches one of the
// fixed SVE predicate element sizes.
static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  LLVMContext &Ctx = II.getContext();

  // The fold is only valid when every lane participates in the compare.
  if (!isAllActivePredicate(Pred: II.getArgOperand(i: 0)))
    return std::nullopt;

  // Check that we have a compare of zero..
  auto *SplatValue =
      dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: II.getArgOperand(i: 2)));
  if (!SplatValue || !SplatValue->isZero())
    return std::nullopt;

  // ..against a dupq
  auto *DupQLane = dyn_cast<IntrinsicInst>(Val: II.getArgOperand(i: 1));
  if (!DupQLane ||
      DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
    return std::nullopt;

  // Where the dupq is a lane 0 replicate of a vector insert
  auto *DupQLaneIdx = dyn_cast<ConstantInt>(Val: DupQLane->getArgOperand(i: 1));
  if (!DupQLaneIdx || !DupQLaneIdx->isZero())
    return std::nullopt;

  auto *VecIns = dyn_cast<IntrinsicInst>(Val: DupQLane->getArgOperand(i: 0));
  if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
    return std::nullopt;

  // Where the vector insert is a fixed constant vector insert into undef at
  // index zero
  if (!isa<UndefValue>(Val: VecIns->getArgOperand(i: 0)))
    return std::nullopt;

  if (!cast<ConstantInt>(Val: VecIns->getArgOperand(i: 2))->isZero())
    return std::nullopt;

  auto *ConstVec = dyn_cast<Constant>(Val: VecIns->getArgOperand(i: 1));
  if (!ConstVec)
    return std::nullopt;

  // The fixed constant must describe exactly one 128-bit block of the result.
  auto *VecTy = dyn_cast<FixedVectorType>(Val: ConstVec->getType());
  auto *OutTy = dyn_cast<ScalableVectorType>(Val: II.getType());
  if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
    return std::nullopt;

  unsigned NumElts = VecTy->getNumElements();
  unsigned PredicateBits = 0;

  // Expand intrinsic operands to a 16-bit byte level predicate
  for (unsigned I = 0; I < NumElts; ++I) {
    auto *Arg = dyn_cast<ConstantInt>(Val: ConstVec->getAggregateElement(Elt: I));
    if (!Arg)
      return std::nullopt;
    // A non-zero element compares not-equal to zero, i.e. an active lane;
    // each element covers 16/NumElts bytes of the 128-bit block.
    if (!Arg->isZero())
      PredicateBits |= 1 << (I * (16 / NumElts));
  }

  // If all bits are zero bail early with an empty predicate
  if (PredicateBits == 0) {
    auto *PFalse = Constant::getNullValue(Ty: II.getType());
    PFalse->takeName(V: &II);
    return IC.replaceInstUsesWith(I&: II, V: PFalse);
  }

  // Calculate largest predicate type used (where byte predicate is largest)
  // Mask's lowest set bit ends up being the largest power-of-two stride (in
  // bytes, capped at 8) that divides the index of every active byte.
  unsigned Mask = 8;
  for (unsigned I = 0; I < 16; ++I)
    if ((PredicateBits & (1 << I)) != 0)
      Mask |= (I % 8);

  unsigned PredSize = Mask & -Mask;
  auto *PredType = ScalableVectorType::get(
      ElementType: Type::getInt1Ty(C&: Ctx), MinNumElts: AArch64::SVEBitsPerBlock / (PredSize * 8));

  // Ensure all relevant bits are set
  // (i.e. the pattern is all-active at PredSize-byte granularity, making it
  // equivalent to a ptrue of that element size).
  for (unsigned I = 0; I < 16; I += PredSize)
    if ((PredicateBits & (1 << I)) == 0)
      return std::nullopt;

  // Build ptrue at the derived element size, then reinterpret to the original
  // predicate type via svbool.
  auto *PTruePat =
      ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: AArch64SVEPredPattern::all);
  auto *PTrue = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue,
                                           Types: {PredType}, Args: {PTruePat});
  auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
      ID: Intrinsic::aarch64_sve_convert_to_svbool, Types: {PredType}, Args: {PTrue});
  auto *ConvertFromSVBool =
      IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_convert_from_svbool,
                                 Types: {II.getType()}, Args: {ConvertToSVBool});

  ConvertFromSVBool->takeName(V: &II);
  return IC.replaceInstUsesWith(I&: II, V: ConvertFromSVBool);
}
2145
// Combine sve.lasta/sve.lastb: fold splats, distribute over single-use
// binops with a splat operand, and lower to extractelement when the
// extracted lane index is a known constant.
static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
                                                       IntrinsicInst &II) {
  Value *Pg = II.getArgOperand(i: 0);
  Value *Vec = II.getArgOperand(i: 1);
  auto IntrinsicID = II.getIntrinsicID();
  // lasta extracts the element AFTER the last active one; lastb the last
  // active element itself.
  bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;

  // lastX(splat(X)) --> X
  if (auto *SplatVal = getSplatValue(V: Vec))
    return IC.replaceInstUsesWith(I&: II, V: SplatVal);

  // If x and/or y is a splat value then:
  // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
  Value *LHS, *RHS;
  if (match(V: Vec, P: m_OneUse(SubPattern: m_BinOp(L: m_Value(V&: LHS), R: m_Value(V&: RHS))))) {
    if (isSplatValue(V: LHS) || isSplatValue(V: RHS)) {
      auto *OldBinOp = cast<BinaryOperator>(Val: Vec);
      auto OpC = OldBinOp->getOpcode();
      auto *NewLHS =
          IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {Vec->getType()}, Args: {Pg, LHS});
      auto *NewRHS =
          IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {Vec->getType()}, Args: {Pg, RHS});
      // Preserve the original binop's flags (e.g. nsw/fast-math).
      auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
          Opc: OpC, V1: NewLHS, V2: NewRHS, CopyO: OldBinOp, Name: OldBinOp->getName(), InsertBefore: II.getIterator());
      return IC.replaceInstUsesWith(I&: II, V: NewBinOp);
    }
  }

  // lasta with a null predicate wraps around to lane 0.
  auto *C = dyn_cast<Constant>(Val: Pg);
  if (IsAfter && C && C->isNullValue()) {
    // The intrinsic is extracting lane 0 so use an extract instead.
    auto *IdxTy = Type::getInt64Ty(C&: II.getContext());
    auto *Extract = ExtractElementInst::Create(Vec, Idx: ConstantInt::get(Ty: IdxTy, V: 0));
    Extract->insertBefore(InsertPos: II.getIterator());
    Extract->takeName(V: &II);
    return IC.replaceInstUsesWith(I&: II, V: Extract);
  }

  // From here on the predicate must be a ptrue with a fixed-length pattern.
  auto *IntrPG = dyn_cast<IntrinsicInst>(Val: Pg);
  if (!IntrPG)
    return std::nullopt;

  if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return std::nullopt;

  const auto PTruePattern =
      cast<ConstantInt>(Val: IntrPG->getOperand(i_nocapture: 0))->getZExtValue();

  // Can the intrinsic's predicate be converted to a known constant index?
  unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern: PTruePattern);
  if (!MinNumElts)
    return std::nullopt;

  unsigned Idx = MinNumElts - 1;
  // Increment the index if extracting the element after the last active
  // predicate element.
  if (IsAfter)
    ++Idx;

  // Ignore extracts whose index is larger than the known minimum vector
  // length. NOTE: This is an artificial constraint where we prefer to
  // maintain what the user asked for until an alternative is proven faster.
  auto *PgVTy = cast<ScalableVectorType>(Val: Pg->getType());
  if (Idx >= PgVTy->getMinNumElements())
    return std::nullopt;

  // The intrinsic is extracting a fixed lane so use an extract instead.
  auto *IdxTy = Type::getInt64Ty(C&: II.getContext());
  auto *Extract = ExtractElementInst::Create(Vec, Idx: ConstantInt::get(Ty: IdxTy, V: Idx));
  Extract->insertBefore(InsertPos: II.getIterator());
  Extract->takeName(V: &II);
  return IC.replaceInstUsesWith(I&: II, V: Extract);
}
2219
2220static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2221 IntrinsicInst &II) {
2222 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2223 // integer variant across a variety of micro-architectures. Replace scalar
2224 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2225 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2226 // depending on the micro-architecture, but has been observed as generally
2227 // being faster, particularly when the CLAST[AB] op is a loop-carried
2228 // dependency.
2229 Value *Pg = II.getArgOperand(i: 0);
2230 Value *Fallback = II.getArgOperand(i: 1);
2231 Value *Vec = II.getArgOperand(i: 2);
2232 Type *Ty = II.getType();
2233
2234 if (!Ty->isIntegerTy())
2235 return std::nullopt;
2236
2237 Type *FPTy;
2238 switch (cast<IntegerType>(Val: Ty)->getBitWidth()) {
2239 default:
2240 return std::nullopt;
2241 case 16:
2242 FPTy = IC.Builder.getHalfTy();
2243 break;
2244 case 32:
2245 FPTy = IC.Builder.getFloatTy();
2246 break;
2247 case 64:
2248 FPTy = IC.Builder.getDoubleTy();
2249 break;
2250 }
2251
2252 Value *FPFallBack = IC.Builder.CreateBitCast(V: Fallback, DestTy: FPTy);
2253 auto *FPVTy = VectorType::get(
2254 ElementType: FPTy, EC: cast<VectorType>(Val: Vec->getType())->getElementCount());
2255 Value *FPVec = IC.Builder.CreateBitCast(V: Vec, DestTy: FPVTy);
2256 auto *FPII = IC.Builder.CreateIntrinsic(
2257 ID: II.getIntrinsicID(), Types: {FPVec->getType()}, Args: {Pg, FPFallBack, FPVec});
2258 Value *FPIItoInt = IC.Builder.CreateBitCast(V: FPII, DestTy: II.getType());
2259 return IC.replaceInstUsesWith(I&: II, V: FPIItoInt);
2260}
2261
2262static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2263 IntrinsicInst &II) {
2264 LLVMContext &Ctx = II.getContext();
2265 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2266 // can work with RDFFR_PP for ptest elimination.
2267 auto *AllPat =
2268 ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: AArch64SVEPredPattern::all);
2269 auto *PTrue = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue,
2270 Types: {II.getType()}, Args: {AllPat});
2271 auto *RDFFR =
2272 IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_rdffr_z, Args: {PTrue});
2273 RDFFR->takeName(V: &II);
2274 return IC.replaceInstUsesWith(I&: II, V: RDFFR);
2275}
2276
2277static std::optional<Instruction *>
2278instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
2279 const auto Pattern = cast<ConstantInt>(Val: II.getArgOperand(i: 0))->getZExtValue();
2280
2281 if (Pattern == AArch64SVEPredPattern::all) {
2282 Value *Cnt = IC.Builder.CreateElementCount(
2283 Ty: II.getType(), EC: ElementCount::getScalable(MinVal: NumElts));
2284 Cnt->takeName(V: &II);
2285 return IC.replaceInstUsesWith(I&: II, V: Cnt);
2286 }
2287
2288 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2289
2290 return MinNumElts && NumElts >= MinNumElts
2291 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2292 I&: II, V: ConstantInt::get(Ty: II.getType(), V: MinNumElts)))
2293 : std::nullopt;
2294}
2295
2296static std::optional<Instruction *>
2297instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II,
2298 const AArch64Subtarget *ST) {
2299 if (!ST->isStreaming())
2300 return std::nullopt;
2301
2302 // In streaming-mode, aarch64_sme_cntds is equivalent to aarch64_sve_cntd
2303 // with SVEPredPattern::all
2304 Value *Cnt =
2305 IC.Builder.CreateElementCount(Ty: II.getType(), EC: ElementCount::getScalable(MinVal: 2));
2306 Cnt->takeName(V: &II);
2307 return IC.replaceInstUsesWith(I&: II, V: Cnt);
2308}
2309
// Simplify SVE ptest intrinsics (ptest.any / ptest.first / ptest.last).
// Three independent rewrites are attempted, in order:
//   1. ptest.first/last(X, X) -> ptest.any(X, X).
//   2. Strip matching convert.to.svbool casts from both operands.
//   3. ptest.any(X=OP(PG,...), X) -> ptest.any(PG, X) for flag-setting OPs.
static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  Value *PgVal = II.getArgOperand(i: 0);
  Value *OpVal = II.getArgOperand(i: 1);

  // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
  // Later optimizations prefer this form.
  if (PgVal == OpVal &&
      (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
       II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
    Value *Ops[] = {PgVal, OpVal};
    Type *Tys[] = {PgVal->getType()};

    auto *PTest =
        IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptest_any, Types: Tys, Args: Ops);
    PTest->takeName(V: &II);

    return IC.replaceInstUsesWith(I&: II, V: PTest);
  }

  IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(Val: PgVal);
  IntrinsicInst *Op = dyn_cast<IntrinsicInst>(Val: OpVal);

  if (!Pg || !Op)
    return std::nullopt;

  Intrinsic::ID OpIID = Op->getIntrinsicID();

  // Both operands widened from the same narrower predicate type: perform the
  // ptest directly on the narrow operands instead.
  if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
      OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
      Pg->getArgOperand(i: 0)->getType() == Op->getArgOperand(i: 0)->getType()) {
    Value *Ops[] = {Pg->getArgOperand(i: 0), Op->getArgOperand(i: 0)};
    Type *Tys[] = {Pg->getArgOperand(i: 0)->getType()};

    auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: Tys, Args: Ops);

    PTest->takeName(V: &II);
    return IC.replaceInstUsesWith(I&: II, V: PTest);
  }

  // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
  // Later optimizations may rewrite sequence to use the flag-setting variant
  // of instruction X to remove PTEST.
  // The listed intrinsics are the predicate-generating ops that have such a
  // flag-setting variant.
  if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
      ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
       (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
       (OpIID == Intrinsic::aarch64_sve_and_z) ||
       (OpIID == Intrinsic::aarch64_sve_bic_z) ||
       (OpIID == Intrinsic::aarch64_sve_eor_z) ||
       (OpIID == Intrinsic::aarch64_sve_nand_z) ||
       (OpIID == Intrinsic::aarch64_sve_nor_z) ||
       (OpIID == Intrinsic::aarch64_sve_orn_z) ||
       (OpIID == Intrinsic::aarch64_sve_orr_z))) {
    Value *Ops[] = {Pg->getArgOperand(i: 0), Pg};
    Type *Tys[] = {Pg->getType()};

    auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: Tys, Args: Ops);
    PTest->takeName(V: &II);

    return IC.replaceInstUsesWith(I&: II, V: PTest);
  }

  return std::nullopt;
}
2377
// Fold a predicated multiply feeding a predicated add/sub (II) into a single
// fused multiply-accumulate intrinsic.
//
// MulOpc is the multiply intrinsic to match and FuseOpc the fused
// replacement. MergeIntoAddendOp selects which operand of II is the addend:
//   true:  II = op(P, Addend, Mul) -> FuseOpc(P, Addend, MulOp0, MulOp1)
//          (MLA/FMLA-style, result merged into the addend operand)
//   false: II = op(P, Mul, Addend) -> FuseOpc(P, MulOp0, MulOp1, Addend)
//          (MAD/FMAD-style, result merged into a multiplicand operand)
template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
static std::optional<Instruction *>
instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
                                  bool MergeIntoAddendOp) {
  Value *P = II.getOperand(i_nocapture: 0);
  Value *MulOp0, *MulOp1, *AddendOp, *Mul;
  if (MergeIntoAddendOp) {
    AddendOp = II.getOperand(i_nocapture: 1);
    Mul = II.getOperand(i_nocapture: 2);
  } else {
    AddendOp = II.getOperand(i_nocapture: 2);
    Mul = II.getOperand(i_nocapture: 1);
  }

  // The multiply must be governed by the same predicate as II.
  if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(V: P), m_Value(V&: MulOp0),
                                      m_Value(V&: MulOp1))))
    return std::nullopt;

  // Fusing would leave the multiply alive (duplicated work) if it has other
  // users.
  if (!Mul->hasOneUse())
    return std::nullopt;

  Instruction *FMFSource = nullptr;
  if (II.getType()->isFPOrFPVectorTy()) {
    llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
    // Stop the combine when the flags on the inputs differ in case dropping
    // flags would lead to us missing out on more beneficial optimizations.
    if (FAddFlags != cast<CallInst>(Val: Mul)->getFastMathFlags())
      return std::nullopt;
    // FP contraction into an FMA-style op requires the 'contract' flag.
    if (!FAddFlags.allowContract())
      return std::nullopt;
    FMFSource = &II;
  }

  CallInst *Res;
  if (MergeIntoAddendOp)
    Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, Types: {II.getType()},
                                     Args: {P, AddendOp, MulOp0, MulOp1}, FMFSource);
  else
    Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, Types: {II.getType()},
                                     Args: {P, MulOp0, MulOp1, AddendOp}, FMFSource);

  return IC.replaceInstUsesWith(I&: II, V: Res);
}
2421
2422static std::optional<Instruction *>
2423instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2424 Value *Pred = II.getOperand(i_nocapture: 0);
2425 Value *PtrOp = II.getOperand(i_nocapture: 1);
2426 Type *VecTy = II.getType();
2427
2428 if (isAllActivePredicate(Pred)) {
2429 LoadInst *Load = IC.Builder.CreateLoad(Ty: VecTy, Ptr: PtrOp);
2430 Load->copyMetadata(SrcInst: II);
2431 return IC.replaceInstUsesWith(I&: II, V: Load);
2432 }
2433
2434 CallInst *MaskedLoad =
2435 IC.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL),
2436 Mask: Pred, PassThru: ConstantAggregateZero::get(Ty: VecTy));
2437 MaskedLoad->copyMetadata(SrcInst: II);
2438 return IC.replaceInstUsesWith(I&: II, V: MaskedLoad);
2439}
2440
2441static std::optional<Instruction *>
2442instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2443 Value *VecOp = II.getOperand(i_nocapture: 0);
2444 Value *Pred = II.getOperand(i_nocapture: 1);
2445 Value *PtrOp = II.getOperand(i_nocapture: 2);
2446
2447 if (isAllActivePredicate(Pred)) {
2448 StoreInst *Store = IC.Builder.CreateStore(Val: VecOp, Ptr: PtrOp);
2449 Store->copyMetadata(SrcInst: II);
2450 return IC.eraseInstFromFunction(I&: II);
2451 }
2452
2453 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2454 Val: VecOp, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL), Mask: Pred);
2455 MaskedStore->copyMetadata(SrcInst: II);
2456 return IC.eraseInstFromFunction(I&: II);
2457}
2458
2459static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
2460 switch (Intrinsic) {
2461 case Intrinsic::aarch64_sve_fmul_u:
2462 return Instruction::BinaryOps::FMul;
2463 case Intrinsic::aarch64_sve_fadd_u:
2464 return Instruction::BinaryOps::FAdd;
2465 case Intrinsic::aarch64_sve_fsub_u:
2466 return Instruction::BinaryOps::FSub;
2467 default:
2468 return Instruction::BinaryOpsEnd;
2469 }
2470}
2471
2472static std::optional<Instruction *>
2473instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
2474 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2475 if (II.isStrictFP())
2476 return std::nullopt;
2477
2478 auto *OpPredicate = II.getOperand(i_nocapture: 0);
2479 auto BinOpCode = intrinsicIDToBinOpCode(Intrinsic: II.getIntrinsicID());
2480 if (BinOpCode == Instruction::BinaryOpsEnd ||
2481 !isAllActivePredicate(Pred: OpPredicate))
2482 return std::nullopt;
2483 auto BinOp = IC.Builder.CreateBinOpFMF(
2484 Opc: BinOpCode, LHS: II.getOperand(i_nocapture: 1), RHS: II.getOperand(i_nocapture: 2), FMFSource: II.getFastMathFlags());
2485 return IC.replaceInstUsesWith(I&: II, V: BinOp);
2486}
2487
2488static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2489 IntrinsicInst &II) {
2490 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2491 Intrinsic::aarch64_sve_mla>(
2492 IC, II, MergeIntoAddendOp: true))
2493 return MLA;
2494 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2495 Intrinsic::aarch64_sve_mad>(
2496 IC, II, MergeIntoAddendOp: false))
2497 return MAD;
2498 return std::nullopt;
2499}
2500
2501static std::optional<Instruction *>
2502instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
2503 if (auto FMLA =
2504 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2505 Intrinsic::aarch64_sve_fmla>(IC, II,
2506 MergeIntoAddendOp: true))
2507 return FMLA;
2508 if (auto FMAD =
2509 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2510 Intrinsic::aarch64_sve_fmad>(IC, II,
2511 MergeIntoAddendOp: false))
2512 return FMAD;
2513 if (auto FMLA =
2514 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2515 Intrinsic::aarch64_sve_fmla>(IC, II,
2516 MergeIntoAddendOp: true))
2517 return FMLA;
2518 return std::nullopt;
2519}
2520
2521static std::optional<Instruction *>
2522instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
2523 if (auto FMLA =
2524 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2525 Intrinsic::aarch64_sve_fmla>(IC, II,
2526 MergeIntoAddendOp: true))
2527 return FMLA;
2528 if (auto FMAD =
2529 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2530 Intrinsic::aarch64_sve_fmad>(IC, II,
2531 MergeIntoAddendOp: false))
2532 return FMAD;
2533 if (auto FMLA_U =
2534 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2535 Intrinsic::aarch64_sve_fmla_u>(
2536 IC, II, MergeIntoAddendOp: true))
2537 return FMLA_U;
2538 return instCombineSVEVectorBinOp(IC, II);
2539}
2540
2541static std::optional<Instruction *>
2542instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
2543 if (auto FMLS =
2544 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2545 Intrinsic::aarch64_sve_fmls>(IC, II,
2546 MergeIntoAddendOp: true))
2547 return FMLS;
2548 if (auto FMSB =
2549 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2550 Intrinsic::aarch64_sve_fnmsb>(
2551 IC, II, MergeIntoAddendOp: false))
2552 return FMSB;
2553 if (auto FMLS =
2554 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2555 Intrinsic::aarch64_sve_fmls>(IC, II,
2556 MergeIntoAddendOp: true))
2557 return FMLS;
2558 return std::nullopt;
2559}
2560
2561static std::optional<Instruction *>
2562instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
2563 if (auto FMLS =
2564 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2565 Intrinsic::aarch64_sve_fmls>(IC, II,
2566 MergeIntoAddendOp: true))
2567 return FMLS;
2568 if (auto FMSB =
2569 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2570 Intrinsic::aarch64_sve_fnmsb>(
2571 IC, II, MergeIntoAddendOp: false))
2572 return FMSB;
2573 if (auto FMLS_U =
2574 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2575 Intrinsic::aarch64_sve_fmls_u>(
2576 IC, II, MergeIntoAddendOp: true))
2577 return FMLS_U;
2578 return instCombineSVEVectorBinOp(IC, II);
2579}
2580
2581static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2582 IntrinsicInst &II) {
2583 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2584 Intrinsic::aarch64_sve_mls>(
2585 IC, II, MergeIntoAddendOp: true))
2586 return MLS;
2587 return std::nullopt;
2588}
2589
2590static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2591 IntrinsicInst &II) {
2592 Value *UnpackArg = II.getArgOperand(i: 0);
2593 auto *RetTy = cast<ScalableVectorType>(Val: II.getType());
2594 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2595 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2596
2597 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2598 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2599 if (auto *ScalarArg = getSplatValue(V: UnpackArg)) {
2600 ScalarArg =
2601 IC.Builder.CreateIntCast(V: ScalarArg, DestTy: RetTy->getScalarType(), isSigned: IsSigned);
2602 Value *NewVal =
2603 IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(), V: ScalarArg);
2604 NewVal->takeName(V: &II);
2605 return IC.replaceInstUsesWith(I&: II, V: NewVal);
2606 }
2607
2608 return std::nullopt;
2609}
2610static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2611 IntrinsicInst &II) {
2612 auto *OpVal = II.getOperand(i_nocapture: 0);
2613 auto *OpIndices = II.getOperand(i_nocapture: 1);
2614 VectorType *VTy = cast<VectorType>(Val: II.getType());
2615
2616 // Check whether OpIndices is a constant splat value < minimal element count
2617 // of result.
2618 auto *SplatValue = dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: OpIndices));
2619 if (!SplatValue ||
2620 SplatValue->getValue().uge(RHS: VTy->getElementCount().getKnownMinValue()))
2621 return std::nullopt;
2622
2623 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2624 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2625 auto *Extract = IC.Builder.CreateExtractElement(Vec: OpVal, Idx: SplatValue);
2626 auto *VectorSplat =
2627 IC.Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Extract);
2628
2629 VectorSplat->takeName(V: &II);
2630 return IC.replaceInstUsesWith(I&: II, V: VectorSplat);
2631}
2632
// Fold uzp1 of two predicates widened (via convert.to.svbool, possibly
// round-tripped through convert.from.svbool) from the same narrower type
// into the plain concatenation <A, B>, built with two vector inserts.
static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
                                                       IntrinsicInst &II) {
  Value *A, *B;
  Type *RetTy = II.getType();
  constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
  constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;

  // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
  // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
  if ((match(V: II.getArgOperand(i: 0),
             P: m_Intrinsic<FromSVB>(Op0: m_Intrinsic<ToSVB>(Op0: m_Value(V&: A)))) &&
       match(V: II.getArgOperand(i: 1),
             P: m_Intrinsic<FromSVB>(Op0: m_Intrinsic<ToSVB>(Op0: m_Value(V&: B))))) ||
      (match(V: II.getArgOperand(i: 0), P: m_Intrinsic<ToSVB>(Op0: m_Value(V&: A))) &&
       match(V: II.getArgOperand(i: 1), P: m_Intrinsic<ToSVB>(Op0: m_Value(V&: B))))) {
    auto *TyA = cast<ScalableVectorType>(Val: A->getType());
    // Only fold when A and B have the same type and the result type is
    // exactly their concatenation (double the element count of A).
    if (TyA == B->getType() &&
        RetTy == ScalableVectorType::getDoubleElementsVectorType(VTy: TyA)) {
      auto *SubVec = IC.Builder.CreateInsertVector(
          DstType: RetTy, SrcVec: PoisonValue::get(T: RetTy), SubVec: A, Idx: uint64_t(0));
      auto *ConcatVec = IC.Builder.CreateInsertVector(DstType: RetTy, SrcVec: SubVec, SubVec: B,
                                                      Idx: TyA->getMinNumElements());
      ConcatVec->takeName(V: &II);
      return IC.replaceInstUsesWith(I&: II, V: ConcatVec);
    }
  }

  return std::nullopt;
}
2662
2663static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2664 IntrinsicInst &II) {
2665 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2666 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2667 Value *A, *B;
2668 if (match(V: II.getArgOperand(i: 0),
2669 P: m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(Op0: m_Value(V&: A), Op1: m_Value(V&: B))) &&
2670 match(V: II.getArgOperand(i: 1), P: m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
2671 Op0: m_Specific(V: A), Op1: m_Specific(V: B))))
2672 return IC.replaceInstUsesWith(
2673 I&: II, V: (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2674
2675 return std::nullopt;
2676}
2677
2678static std::optional<Instruction *>
2679instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
2680 Value *Mask = II.getOperand(i_nocapture: 0);
2681 Value *BasePtr = II.getOperand(i_nocapture: 1);
2682 Value *Index = II.getOperand(i_nocapture: 2);
2683 Type *Ty = II.getType();
2684 Value *PassThru = ConstantAggregateZero::get(Ty);
2685
2686 // Contiguous gather => masked load.
2687 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2688 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2689 Value *IndexBase;
2690 if (match(V: Index, P: m_Intrinsic<Intrinsic::aarch64_sve_index>(
2691 Op0: m_Value(V&: IndexBase), Op1: m_SpecificInt(V: 1)))) {
2692 Align Alignment =
2693 BasePtr->getPointerAlignment(DL: II.getDataLayout());
2694
2695 Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(),
2696 Ptr: BasePtr, IdxList: IndexBase);
2697 CallInst *MaskedLoad =
2698 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2699 MaskedLoad->takeName(V: &II);
2700 return IC.replaceInstUsesWith(I&: II, V: MaskedLoad);
2701 }
2702
2703 return std::nullopt;
2704}
2705
2706static std::optional<Instruction *>
2707instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
2708 Value *Val = II.getOperand(i_nocapture: 0);
2709 Value *Mask = II.getOperand(i_nocapture: 1);
2710 Value *BasePtr = II.getOperand(i_nocapture: 2);
2711 Value *Index = II.getOperand(i_nocapture: 3);
2712 Type *Ty = Val->getType();
2713
2714 // Contiguous scatter => masked store.
2715 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2716 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2717 Value *IndexBase;
2718 if (match(V: Index, P: m_Intrinsic<Intrinsic::aarch64_sve_index>(
2719 Op0: m_Value(V&: IndexBase), Op1: m_SpecificInt(V: 1)))) {
2720 Align Alignment =
2721 BasePtr->getPointerAlignment(DL: II.getDataLayout());
2722
2723 Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(),
2724 Ptr: BasePtr, IdxList: IndexBase);
2725 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2726
2727 return IC.eraseInstFromFunction(I&: II);
2728 }
2729
2730 return std::nullopt;
2731}
2732
2733static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2734 IntrinsicInst &II) {
2735 Type *Int32Ty = IC.Builder.getInt32Ty();
2736 Value *Pred = II.getOperand(i_nocapture: 0);
2737 Value *Vec = II.getOperand(i_nocapture: 1);
2738 Value *DivVec = II.getOperand(i_nocapture: 2);
2739
2740 Value *SplatValue = getSplatValue(V: DivVec);
2741 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(Val: SplatValue);
2742 if (!SplatConstantInt)
2743 return std::nullopt;
2744
2745 APInt Divisor = SplatConstantInt->getValue();
2746 const int64_t DivisorValue = Divisor.getSExtValue();
2747 if (DivisorValue == -1)
2748 return std::nullopt;
2749 if (DivisorValue == 1)
2750 IC.replaceInstUsesWith(I&: II, V: Vec);
2751
2752 if (Divisor.isPowerOf2()) {
2753 Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2());
2754 auto ASRD = IC.Builder.CreateIntrinsic(
2755 ID: Intrinsic::aarch64_sve_asrd, Types: {II.getType()}, Args: {Pred, Vec, DivisorLog2});
2756 return IC.replaceInstUsesWith(I&: II, V: ASRD);
2757 }
2758 if (Divisor.isNegatedPowerOf2()) {
2759 Divisor.negate();
2760 Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2());
2761 auto ASRD = IC.Builder.CreateIntrinsic(
2762 ID: Intrinsic::aarch64_sve_asrd, Types: {II.getType()}, Args: {Pred, Vec, DivisorLog2});
2763 auto NEG = IC.Builder.CreateIntrinsic(
2764 ID: Intrinsic::aarch64_sve_neg, Types: {ASRD->getType()}, Args: {ASRD, Pred, ASRD});
2765 return IC.replaceInstUsesWith(I&: II, V: NEG);
2766 }
2767
2768 return std::nullopt;
2769}
2770
// Returns true when Vec describes a repeating pattern: its first half equals
// its second half, element-wise. nullptr entries denote unwritten (poison)
// lanes; when AllowPoison is set they match anything and are filled in from
// the concrete value of the other half. On success Vec is recursively shrunk
// to the shortest repeating prefix. On failure Vec may have been partially
// updated, but callers abandon it in that case.
bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
  size_t VecSize = Vec.size();
  if (VecSize == 1)
    return true;
  if (!isPowerOf2_64(Value: VecSize))
    return false;
  size_t HalfVecSize = VecSize / 2;

  // Compare lane I against lane I + HalfVecSize for each I in the first half.
  for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
       RHS != Vec.end(); LHS++, RHS++) {
    if (*LHS != nullptr && *RHS != nullptr) {
      if (*LHS == *RHS)
        continue;
      else
        return false;
    }
    // At least one side is poison here; a mismatch is only tolerable when
    // poison lanes may be freely materialized.
    if (!AllowPoison)
      return false;
    // Adopt the concrete value so the surviving half is fully specified.
    if (*LHS == nullptr && *RHS != nullptr)
      *LHS = *RHS;
  }

  Vec.resize(N: HalfVecSize);
  // Best-effort further halving; the result is ignored because Vec already
  // holds a valid repeating pattern at the current length.
  SimplifyValuePattern(Vec, AllowPoison);
  return true;
}
2797
// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
// to dupqlane(f64(C)) where C is A concatenated with B
static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
                                                           IntrinsicInst &II) {
  // The operand must be a fixed-width vector placed into the scalable result
  // via vector_insert, with that vector built by a chain of insertelements.
  Value *CurrentInsertElt = nullptr, *Default = nullptr;
  if (!match(V: II.getOperand(i_nocapture: 0),
             P: m_Intrinsic<Intrinsic::vector_insert>(
                 Op0: m_Value(V&: Default), Op1: m_Value(V&: CurrentInsertElt), Op2: m_Value())) ||
      !isa<FixedVectorType>(Val: CurrentInsertElt->getType()))
    return std::nullopt;
  auto IIScalableTy = cast<ScalableVectorType>(Val: II.getType());

  // Insert the scalars into a container ordered by InsertElement index
  SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
  while (auto InsertElt = dyn_cast<InsertElementInst>(Val: CurrentInsertElt)) {
    auto Idx = cast<ConstantInt>(Val: InsertElt->getOperand(i_nocapture: 2));
    Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(i_nocapture: 1);
    CurrentInsertElt = InsertElt->getOperand(i_nocapture: 0);
  }

  // Lanes never written stay nullptr; they may only be treated as poison when
  // both the chain base and the vector_insert default are poison.
  bool AllowPoison =
      isa<PoisonValue>(Val: CurrentInsertElt) && isa<PoisonValue>(Val: Default);
  if (!SimplifyValuePattern(Vec&: Elts, AllowPoison))
    return std::nullopt;

  // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
  Value *InsertEltChain = PoisonValue::get(T: CurrentInsertElt->getType());
  for (size_t I = 0; I < Elts.size(); I++) {
    if (Elts[I] == nullptr)
      continue;
    InsertEltChain = IC.Builder.CreateInsertElement(Vec: InsertEltChain, NewElt: Elts[I],
                                                    Idx: IC.Builder.getInt64(C: I));
  }
  if (InsertEltChain == nullptr)
    return std::nullopt;

  // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
  // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
  // be bitcast to a type wide enough to fit the sequence, be splatted, and then
  // be narrowed back to the original type.
  unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
  unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
                                 IIScalableTy->getMinNumElements() /
                                 PatternWidth;

  IntegerType *WideTy = IC.Builder.getIntNTy(N: PatternWidth);
  auto *WideScalableTy = ScalableVectorType::get(ElementType: WideTy, MinNumElts: PatternElementCount);
  auto *WideShuffleMaskTy =
      ScalableVectorType::get(ElementType: IC.Builder.getInt32Ty(), MinNumElts: PatternElementCount);

  // A zero shuffle mask broadcasts wide lane 0 to every wide lane.
  auto InsertSubvector = IC.Builder.CreateInsertVector(
      DstType: II.getType(), SrcVec: PoisonValue::get(T: II.getType()), SubVec: InsertEltChain,
      Idx: uint64_t(0));
  auto WideBitcast =
      IC.Builder.CreateBitOrPointerCast(V: InsertSubvector, DestTy: WideScalableTy);
  auto WideShuffleMask = ConstantAggregateZero::get(Ty: WideShuffleMaskTy);
  auto WideShuffle = IC.Builder.CreateShuffleVector(
      V1: WideBitcast, V2: PoisonValue::get(T: WideScalableTy), Mask: WideShuffleMask);
  auto NarrowBitcast =
      IC.Builder.CreateBitOrPointerCast(V: WideShuffle, DestTy: II.getType());

  return IC.replaceInstUsesWith(I&: II, V: NarrowBitcast);
}
2861
2862static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2863 IntrinsicInst &II) {
2864 Value *A = II.getArgOperand(i: 0);
2865 Value *B = II.getArgOperand(i: 1);
2866 if (A == B)
2867 return IC.replaceInstUsesWith(I&: II, V: A);
2868
2869 return std::nullopt;
2870}
2871
// Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic:
// with a non-negative value and a non-negative shift amount, SRSHL's rounding
// behaviour can never be observed, so a plain left shift is equivalent.
static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  Value *Pred = II.getOperand(i_nocapture: 0);
  Value *Vec = II.getOperand(i_nocapture: 1);
  Value *Shift = II.getOperand(i_nocapture: 2);

  // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
  Value *AbsPred, *MergedValue;
  if (!match(V: Vec, P: m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
                       Op0: m_Value(V&: MergedValue), Op1: m_Value(V&: AbsPred), Op2: m_Value())) &&
      !match(V: Vec, P: m_Intrinsic<Intrinsic::aarch64_sve_abs>(
                       Op0: m_Value(V&: MergedValue), Op1: m_Value(V&: AbsPred), Op2: m_Value())))

    return std::nullopt;

  // Transform is valid if any of the following are true:
  // * The ABS merge value is an undef or non-negative
  // * The ABS predicate is all active
  // * The ABS predicate and the SRSHL predicates are the same
  if (!isa<UndefValue>(Val: MergedValue) && !match(V: MergedValue, P: m_NonNegative()) &&
      AbsPred != Pred && !isAllActivePredicate(Pred: AbsPred))
    return std::nullopt;

  // Only valid when the shift amount is non-negative, otherwise the rounding
  // behaviour of SRSHL cannot be ignored.
  if (!match(V: Shift, P: m_NonNegative()))
    return std::nullopt;

  auto LSL = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_lsl,
                                        Types: {II.getType()}, Args: {Pred, Vec, Shift});

  return IC.replaceInstUsesWith(I&: II, V: LSL);
}
2905
2906static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2907 IntrinsicInst &II) {
2908 Value *Vec = II.getOperand(i_nocapture: 0);
2909
2910 if (getSplatValue(V: Vec) == II.getOperand(i_nocapture: 1))
2911 return IC.replaceInstUsesWith(I&: II, V: Vec);
2912
2913 return std::nullopt;
2914}
2915
static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
                                                   IntrinsicInst &II) {
  // If this barrier is post-dominated by identical one we can remove it
  auto *NI = II.getNextNode();
  unsigned LookaheadThreshold = DMBLookaheadThreshold;
  // Instructions that neither touch memory nor have side effects cannot
  // interact with the barrier and may be stepped over.
  auto CanSkipOver = [](Instruction *I) {
    return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
  };
  // Walk forward up to LookaheadThreshold skippable instructions, following
  // into a unique successor block when the current block ends.
  while (LookaheadThreshold-- && CanSkipOver(NI)) {
    auto *NIBB = NI->getParent();
    NI = NI->getNextNode();
    if (!NI) {
      if (auto *SuccBB = NIBB->getUniqueSuccessor())
        NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
      else
        break;
    }
  }
  // Identical barrier found ahead with nothing significant in between: the
  // first barrier is redundant.
  auto *NextII = dyn_cast_or_null<IntrinsicInst>(Val: NI);
  if (NextII && II.isIdenticalTo(I: NextII))
    return IC.eraseInstFromFunction(I&: II);

  return std::nullopt;
}
2940
2941static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
2942 IntrinsicInst &II) {
2943 return IC.replaceInstUsesWith(
2944 I&: II,
2945 V: IC.Builder.CreateIntrinsic(ID: Intrinsic::get_active_lane_mask,
2946 Types: {II.getType(), II.getOperand(i_nocapture: 0)->getType()},
2947 Args: {II.getOperand(i_nocapture: 0), II.getOperand(i_nocapture: 1)}));
2948}
2949
2950static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2951 IntrinsicInst &II) {
2952 if (match(V: II.getOperand(i_nocapture: 0), P: m_ConstantInt<AArch64SVEPredPattern::all>()))
2953 return IC.replaceInstUsesWith(I&: II, V: Constant::getAllOnesValue(Ty: II.getType()));
2954 return std::nullopt;
2955}
2956
2957static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2958 IntrinsicInst &II,
2959 unsigned NumBits) {
2960 Value *Passthru = II.getOperand(i_nocapture: 0);
2961 Value *Pg = II.getOperand(i_nocapture: 1);
2962 Value *Op = II.getOperand(i_nocapture: 2);
2963
2964 // Convert UXT[BHW] to AND.
2965 if (isa<UndefValue>(Val: Passthru) || isAllActivePredicate(Pred: Pg)) {
2966 auto *Ty = cast<VectorType>(Val: II.getType());
2967 auto MaskValue = APInt::getLowBitsSet(numBits: Ty->getScalarSizeInBits(), loBitsSet: NumBits);
2968 auto *Mask = ConstantInt::get(Ty, V: MaskValue);
2969 auto *And = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_and_u, Types: {Ty},
2970 Args: {Pg, Op, Mask});
2971 return IC.replaceInstUsesWith(I&: II, V: And);
2972 }
2973
2974 return std::nullopt;
2975}
2976
2977static std::optional<Instruction *>
2978instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II) {
2979 SMEAttrs FnSMEAttrs(*II.getFunction());
2980 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
2981 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
2982 return IC.replaceInstUsesWith(
2983 I&: II, V: ConstantInt::getBool(Ty: II.getType(), V: IsStreaming));
2984 return std::nullopt;
2985}
2986
/// Target hook for InstCombine: attempt to simplify an AArch64 intrinsic
/// call. Generic SVE simplifications driven by SVEIntrinsicInfo are tried
/// first; the switch then dispatches to per-intrinsic combines above.
/// Returns the replacement (or erasure marker), or std::nullopt when no
/// simplification applies.
std::optional<Instruction *>
AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
                                     IntrinsicInst &II) const {
  const SVEIntrinsicInfo &IInfo = constructSVEIntrinsicInfo(II);
  if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
    return I;

  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::aarch64_dmb:
    return instCombineDMB(IC, II);
  case Intrinsic::aarch64_neon_fmaxnm:
  case Intrinsic::aarch64_neon_fminnm:
    return instCombineMaxMinNM(IC, II);
  case Intrinsic::aarch64_sve_convert_from_svbool:
    return instCombineConvertFromSVBool(IC, II);
  case Intrinsic::aarch64_sve_dup:
    return instCombineSVEDup(IC, II);
  case Intrinsic::aarch64_sve_dup_x:
    return instCombineSVEDupX(IC, II);
  case Intrinsic::aarch64_sve_cmpne:
  case Intrinsic::aarch64_sve_cmpne_wide:
    return instCombineSVECmpNE(IC, II);
  case Intrinsic::aarch64_sve_rdffr:
    return instCombineRDFFR(IC, II);
  case Intrinsic::aarch64_sve_lasta:
  case Intrinsic::aarch64_sve_lastb:
    return instCombineSVELast(IC, II);
  case Intrinsic::aarch64_sve_clasta_n:
  case Intrinsic::aarch64_sve_clastb_n:
    return instCombineSVECondLast(IC, II);
  // Element-count queries: NumElts is the count of that element size per
  // 128-bit granule.
  case Intrinsic::aarch64_sve_cntd:
    return instCombineSVECntElts(IC, II, NumElts: 2);
  case Intrinsic::aarch64_sve_cntw:
    return instCombineSVECntElts(IC, II, NumElts: 4);
  case Intrinsic::aarch64_sve_cnth:
    return instCombineSVECntElts(IC, II, NumElts: 8);
  case Intrinsic::aarch64_sve_cntb:
    return instCombineSVECntElts(IC, II, NumElts: 16);
  case Intrinsic::aarch64_sme_cntsd:
    return instCombineSMECntsd(IC, II, ST);
  case Intrinsic::aarch64_sve_ptest_any:
  case Intrinsic::aarch64_sve_ptest_first:
  case Intrinsic::aarch64_sve_ptest_last:
    return instCombineSVEPTest(IC, II);
  // Arithmetic combines, mostly mul+add/sub fusion into MLA/MAD-style ops.
  case Intrinsic::aarch64_sve_fadd:
    return instCombineSVEVectorFAdd(IC, II);
  case Intrinsic::aarch64_sve_fadd_u:
    return instCombineSVEVectorFAddU(IC, II);
  case Intrinsic::aarch64_sve_fmul_u:
    return instCombineSVEVectorBinOp(IC, II);
  case Intrinsic::aarch64_sve_fsub:
    return instCombineSVEVectorFSub(IC, II);
  case Intrinsic::aarch64_sve_fsub_u:
    return instCombineSVEVectorFSubU(IC, II);
  case Intrinsic::aarch64_sve_add:
    return instCombineSVEVectorAdd(IC, II);
  case Intrinsic::aarch64_sve_add_u:
    return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
                                             Intrinsic::aarch64_sve_mla_u>(
        IC, II, MergeIntoAddendOp: true);
  case Intrinsic::aarch64_sve_sub:
    return instCombineSVEVectorSub(IC, II);
  case Intrinsic::aarch64_sve_sub_u:
    return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
                                             Intrinsic::aarch64_sve_mls_u>(
        IC, II, MergeIntoAddendOp: true);
  case Intrinsic::aarch64_sve_tbl:
    return instCombineSVETBL(IC, II);
  case Intrinsic::aarch64_sve_uunpkhi:
  case Intrinsic::aarch64_sve_uunpklo:
  case Intrinsic::aarch64_sve_sunpkhi:
  case Intrinsic::aarch64_sve_sunpklo:
    return instCombineSVEUnpack(IC, II);
  case Intrinsic::aarch64_sve_uzp1:
    return instCombineSVEUzp1(IC, II);
  case Intrinsic::aarch64_sve_zip1:
  case Intrinsic::aarch64_sve_zip2:
    return instCombineSVEZip(IC, II);
  // Memory access combines.
  case Intrinsic::aarch64_sve_ld1_gather_index:
    return instCombineLD1GatherIndex(IC, II);
  case Intrinsic::aarch64_sve_st1_scatter_index:
    return instCombineST1ScatterIndex(IC, II);
  case Intrinsic::aarch64_sve_ld1:
    return instCombineSVELD1(IC, II, DL);
  case Intrinsic::aarch64_sve_st1:
    return instCombineSVEST1(IC, II, DL);
  case Intrinsic::aarch64_sve_sdiv:
    return instCombineSVESDIV(IC, II);
  case Intrinsic::aarch64_sve_sel:
    return instCombineSVESel(IC, II);
  case Intrinsic::aarch64_sve_srshl:
    return instCombineSVESrshl(IC, II);
  case Intrinsic::aarch64_sve_dupq_lane:
    return instCombineSVEDupqLane(IC, II);
  case Intrinsic::aarch64_sve_insr:
    return instCombineSVEInsr(IC, II);
  case Intrinsic::aarch64_sve_whilelo:
    return instCombineWhilelo(IC, II);
  case Intrinsic::aarch64_sve_ptrue:
    return instCombinePTrue(IC, II);
  // Zero-extension ops: NumBits is the width of the extended field.
  case Intrinsic::aarch64_sve_uxtb:
    return instCombineSVEUxt(IC, II, NumBits: 8);
  case Intrinsic::aarch64_sve_uxth:
    return instCombineSVEUxt(IC, II, NumBits: 16);
  case Intrinsic::aarch64_sve_uxtw:
    return instCombineSVEUxt(IC, II, NumBits: 32);
  case Intrinsic::aarch64_sme_in_streaming_mode:
    return instCombineInStreamingMode(IC, II);
  }

  return std::nullopt;
}
3102
3103std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
3104 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
3105 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
3106 std::function<void(Instruction *, unsigned, APInt, APInt &)>
3107 SimplifyAndSetOp) const {
3108 switch (II.getIntrinsicID()) {
3109 default:
3110 break;
3111 case Intrinsic::aarch64_neon_fcvtxn:
3112 case Intrinsic::aarch64_neon_rshrn:
3113 case Intrinsic::aarch64_neon_sqrshrn:
3114 case Intrinsic::aarch64_neon_sqrshrun:
3115 case Intrinsic::aarch64_neon_sqshrn:
3116 case Intrinsic::aarch64_neon_sqshrun:
3117 case Intrinsic::aarch64_neon_sqxtn:
3118 case Intrinsic::aarch64_neon_sqxtun:
3119 case Intrinsic::aarch64_neon_uqrshrn:
3120 case Intrinsic::aarch64_neon_uqshrn:
3121 case Intrinsic::aarch64_neon_uqxtn:
3122 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
3123 break;
3124 }
3125
3126 return std::nullopt;
3127}
3128
3129bool AArch64TTIImpl::enableScalableVectorization() const {
3130 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3131 EnableScalableAutovecInStreamingMode);
3132}
3133
3134TypeSize
3135AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
3136 switch (K) {
3137 case TargetTransformInfo::RGK_Scalar:
3138 return TypeSize::getFixed(ExactSize: 64);
3139 case TargetTransformInfo::RGK_FixedWidthVector:
3140 if (ST->useSVEForFixedLengthVectors() &&
3141 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
3142 return TypeSize::getFixed(
3143 ExactSize: std::max(a: ST->getMinSVEVectorSizeInBits(), b: 128u));
3144 else if (ST->isNeonAvailable())
3145 return TypeSize::getFixed(ExactSize: 128);
3146 else
3147 return TypeSize::getFixed(ExactSize: 0);
3148 case TargetTransformInfo::RGK_ScalableVector:
3149 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3150 EnableScalableAutovecInStreamingMode))
3151 return TypeSize::getScalable(MinimumSize: 128);
3152 else
3153 return TypeSize::getScalable(MinimumSize: 0);
3154 }
3155 llvm_unreachable("Unsupported register kind");
3156}
3157
3158bool AArch64TTIImpl::isSingleExtWideningInstruction(
3159 unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
3160 Type *SrcOverrideTy) const {
3161 // A helper that returns a vector type from the given type. The number of
3162 // elements in type Ty determines the vector width.
3163 auto toVectorTy = [&](Type *ArgTy) {
3164 return VectorType::get(ElementType: ArgTy->getScalarType(),
3165 EC: cast<VectorType>(Val: DstTy)->getElementCount());
3166 };
3167
3168 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3169 // i32, i64]. SVE doesn't generally have the same set of instructions to
3170 // perform an extend with the add/sub/mul. There are SMULLB style
3171 // instructions, but they operate on top/bottom, requiring some sort of lane
3172 // interleaving to be used with zext/sext.
3173 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3174 if (!useNeonVector(Ty: DstTy) || Args.size() != 2 ||
3175 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3176 return false;
3177
3178 Type *SrcTy = SrcOverrideTy;
3179 switch (Opcode) {
3180 case Instruction::Add: // UADDW(2), SADDW(2).
3181 case Instruction::Sub: { // USUBW(2), SSUBW(2).
3182 // The second operand needs to be an extend
3183 if (isa<SExtInst>(Val: Args[1]) || isa<ZExtInst>(Val: Args[1])) {
3184 if (!SrcTy)
3185 SrcTy =
3186 toVectorTy(cast<Instruction>(Val: Args[1])->getOperand(i: 0)->getType());
3187 break;
3188 }
3189
3190 if (Opcode == Instruction::Sub)
3191 return false;
3192
3193 // UADDW(2), SADDW(2) can be commutted.
3194 if (isa<SExtInst>(Val: Args[0]) || isa<ZExtInst>(Val: Args[0])) {
3195 if (!SrcTy)
3196 SrcTy =
3197 toVectorTy(cast<Instruction>(Val: Args[0])->getOperand(i: 0)->getType());
3198 break;
3199 }
3200 return false;
3201 }
3202 default:
3203 return false;
3204 }
3205
3206 // Legalize the destination type and ensure it can be used in a widening
3207 // operation.
3208 auto DstTyL = getTypeLegalizationCost(Ty: DstTy);
3209 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3210 return false;
3211
3212 // Legalize the source type and ensure it can be used in a widening
3213 // operation.
3214 assert(SrcTy && "Expected some SrcTy");
3215 auto SrcTyL = getTypeLegalizationCost(Ty: SrcTy);
3216 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3217 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3218 return false;
3219
3220 // Get the total number of vector elements in the legalized types.
3221 InstructionCost NumDstEls =
3222 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3223 InstructionCost NumSrcEls =
3224 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3225
3226 // Return true if the legalized types have the same number of vector elements
3227 // and the destination element type size is twice that of the source type.
3228 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3229}
3230
// Matches a binary add/sub/mul whose operands are *both* extends that a NEON
// widening instruction (uaddl/saddl, usubl/ssubl, umull/smull, ...) could fold
// away. Returns the type the operation would effectively execute in (twice
// the widest extend-source element width applied to DstTy's shape), or
// nullptr if the pattern does not apply. SrcOverrideTy, when non-null, stands
// in for the extends' source type.
Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
                                                 ArrayRef<const Value *> Args,
                                                 Type *SrcOverrideTy) const {
  // Only add, sub and mul have two-operand widening forms.
  if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
      Opcode != Instruction::Mul)
    return nullptr;

  // Exit early if DstTy is not a vector type whose elements are one of [i16,
  // i32, i64]. SVE doesn't generally have the same set of instructions to
  // perform an extend with the add/sub/mul. There are SMULLB style
  // instructions, but they operate on top/bottom, requiring some sort of lane
  // interleaving to be used with zext/sext.
  unsigned DstEltSize = DstTy->getScalarSizeInBits();
  if (!useNeonVector(Ty: DstTy) || Args.size() != 2 ||
      (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
    return nullptr;

  // Element width of an extend's source operand, honouring SrcOverrideTy
  // when the caller supplied one.
  auto getScalarSizeWithOverride = [&](const Value *V) {
    if (SrcOverrideTy)
      return SrcOverrideTy->getScalarSizeInBits();
    return cast<Instruction>(Val: V)
        ->getOperand(i: 0)
        ->getType()
        ->getScalarSizeInBits();
  };

  unsigned MaxEltSize = 0;
  if ((isa<SExtInst>(Val: Args[0]) && isa<SExtInst>(Val: Args[1])) ||
      (isa<ZExtInst>(Val: Args[0]) && isa<ZExtInst>(Val: Args[1]))) {
    // Both extends have the same signedness: s<op>l / u<op>l apply directly,
    // operating at twice the widest source element width.
    unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
    unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
    MaxEltSize = std::max(a: EltSize0, b: EltSize1);
  } else if (isa<SExtInst, ZExtInst>(Val: Args[0]) &&
             isa<SExtInst, ZExtInst>(Val: Args[1])) {
    // Mixed sext/zext operands (only reachable when the same-signedness case
    // above did not match).
    unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
    unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
    // mul(sext, zext) will become smull(sext, zext) if the extends are large
    // enough.
    if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
      return nullptr;
    MaxEltSize = DstEltSize / 2;
  } else if (Opcode == Instruction::Mul &&
             (isa<ZExtInst>(Val: Args[0]) || isa<ZExtInst>(Val: Args[1]))) {
    // If one of the operands is a Zext and the other has enough zero bits
    // to be treated as unsigned, we can still generate a umull, meaning the
    // zext is free.
    KnownBits Known =
        computeKnownBits(V: isa<ZExtInst>(Val: Args[0]) ? Args[1] : Args[0], DL);
    if (Args[0]->getType()->getScalarSizeInBits() -
            Known.Zero.countLeadingOnes() >
        DstTy->getScalarSizeInBits() / 2)
      return nullptr;

    MaxEltSize =
        getScalarSizeWithOverride(isa<ZExtInst>(Val: Args[0]) ? Args[0] : Args[1]);
  } else
    return nullptr;

  // A widening step only doubles the element width, so the sources must be at
  // most half the destination element size.
  if (MaxEltSize * 2 > DstEltSize)
    return nullptr;

  Type *ExtTy = DstTy->getWithNewBitWidth(NewBitWidth: MaxEltSize * 2);
  // NOTE(review): a widened type of <= 64 bits fits in a single D register,
  // presumably making the widening-instruction costing inapplicable here --
  // confirm against the NEON cost model.
  if (ExtTy->getPrimitiveSizeInBits() <= 64)
    return nullptr;
  return ExtTy;
}
3297
// s/urhadd instructions implement the following pattern, making the
// extends free:
// %x = add ((zext i8 -> i16), 1)
// %y = (zext i8 -> i16)
// trunc i16 (lshr (add %x, %y), 1) -> i8
//
// Returns true when the zext/sext whose (sole) user is ExtUser feeds the
// rounding-halving-add pattern above, so the extend will be absorbed by
// s/urhadd and is free. Dst/Src are the extend's destination/source types.
bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
                                        Type *Src) const {
  // The source should be a legal vector type.
  if (!Src->isVectorTy() || !TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: Src)) ||
      (Src->isScalableTy() && !ST->hasSVE2()))
    return false;

  // The extend must feed exactly one add.
  if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
    return false;

  // Look for trunc/lshr/add before trying to match the pattern.
  const Instruction *Add = ExtUser;
  auto *AddUser =
      dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser());
  // The "+1" may live in a second add chained onto the first; step over it so
  // the lshr lookup below starts from the outermost add.
  if (AddUser && AddUser->getOpcode() == Instruction::Add)
    Add = AddUser;

  // The add's unique user must be the halving shift (lshr by 1 is matched
  // structurally; the exact shift amount is implied by the m_c_Add pattern
  // match below plus the trunc width check).
  auto *Shr = dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser());
  if (!Shr || Shr->getOpcode() != Instruction::LShr)
    return false;

  // Finally, the shift must be truncated back to the original element width.
  auto *Trunc = dyn_cast_or_null<Instruction>(Val: Shr->getUniqueUndroppableUser());
  if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
      Src->getScalarSizeInBits() !=
          cast<CastInst>(Val: Trunc)->getDestTy()->getScalarSizeInBits())
    return false;

  // Try to match the whole pattern. Ext could be either the first or second
  // m_ZExtOrSExt matched.
  Instruction *Ex1, *Ex2;
  if (!(match(V: Add, P: m_c_Add(L: m_Instruction(I&: Ex1),
                         R: m_c_Add(L: m_Instruction(I&: Ex2), R: m_SpecificInt(V: 1))))))
    return false;

  // Ensure both extends are of the same type
  if (match(V: Ex1, P: m_ZExtOrSExt(Op: m_Value())) &&
      Ex1->getOpcode() == Ex2->getOpcode())
    return true;

  return false;
}
3345
3346InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
3347 Type *Src,
3348 TTI::CastContextHint CCH,
3349 TTI::TargetCostKind CostKind,
3350 const Instruction *I) const {
3351 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3352 assert(ISD && "Invalid opcode");
3353 // If the cast is observable, and it is used by a widening instruction (e.g.,
3354 // uaddl, saddw, etc.), it may be free.
3355 if (I && I->hasOneUser()) {
3356 auto *SingleUser = cast<Instruction>(Val: *I->user_begin());
3357 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3358 if (Type *ExtTy = isBinExtWideningInstruction(
3359 Opcode: SingleUser->getOpcode(), DstTy: Dst, Args: Operands,
3360 SrcOverrideTy: Src != I->getOperand(i: 0)->getType() ? Src : nullptr)) {
3361 // The cost from Src->Src*2 needs to be added if required, the cost from
3362 // Src*2->ExtTy is free.
3363 if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
3364 Type *DoubleSrcTy =
3365 Src->getWithNewBitWidth(NewBitWidth: Src->getScalarSizeInBits() * 2);
3366 return getCastInstrCost(Opcode, Dst: DoubleSrcTy, Src,
3367 CCH: TTI::CastContextHint::None, CostKind);
3368 }
3369
3370 return 0;
3371 }
3372
3373 if (isSingleExtWideningInstruction(
3374 Opcode: SingleUser->getOpcode(), DstTy: Dst, Args: Operands,
3375 SrcOverrideTy: Src != I->getOperand(i: 0)->getType() ? Src : nullptr)) {
3376 // For adds only count the second operand as free if both operands are
3377 // extends but not the same operation. (i.e both operands are not free in
3378 // add(sext, zext)).
3379 if (SingleUser->getOpcode() == Instruction::Add) {
3380 if (I == SingleUser->getOperand(i: 1) ||
3381 (isa<CastInst>(Val: SingleUser->getOperand(i: 1)) &&
3382 cast<CastInst>(Val: SingleUser->getOperand(i: 1))->getOpcode() == Opcode))
3383 return 0;
3384 } else {
3385 // Others are free so long as isSingleExtWideningInstruction
3386 // returned true.
3387 return 0;
3388 }
3389 }
3390
3391 // The cast will be free for the s/urhadd instructions
3392 if ((isa<ZExtInst>(Val: I) || isa<SExtInst>(Val: I)) &&
3393 isExtPartOfAvgExpr(ExtUser: SingleUser, Dst, Src))
3394 return 0;
3395 }
3396
3397 EVT SrcTy = TLI->getValueType(DL, Ty: Src);
3398 EVT DstTy = TLI->getValueType(DL, Ty: Dst);
3399
3400 if (!SrcTy.isSimple() || !DstTy.isSimple())
3401 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3402
3403 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3404 // we use fcvtx under SVE2. Give them invalid costs.
3405 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3406 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3407 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3408 return InstructionCost::getInvalid();
3409
3410 static const TypeConversionCostTblEntry BF16Tbl[] = {
3411 {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f32, .Cost: 1}, // bfcvt
3412 {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f64, .Cost: 1}, // bfcvt
3413 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f32, .Cost: 1}, // bfcvtn
3414 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f32, .Cost: 2}, // bfcvtn+bfcvtn2
3415 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f64, .Cost: 2}, // bfcvtn+fcvtn
3416 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f64, .Cost: 3}, // fcvtn+fcvtl2+bfcvtn
3417 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f64, .Cost: 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3418 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2bf16, .Src: MVT::nxv2f32, .Cost: 1}, // bfcvt
3419 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4bf16, .Src: MVT::nxv4f32, .Cost: 1}, // bfcvt
3420 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8bf16, .Src: MVT::nxv8f32, .Cost: 3}, // bfcvt+bfcvt+uzp1
3421 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2bf16, .Src: MVT::nxv2f64, .Cost: 2}, // fcvtx+bfcvt
3422 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4bf16, .Src: MVT::nxv4f64, .Cost: 5}, // 2*fcvtx+2*bfcvt+uzp1
3423 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8bf16, .Src: MVT::nxv8f64, .Cost: 11}, // 4*fcvt+4*bfcvt+3*uzp
3424 };
3425
3426 if (ST->hasBF16())
3427 if (const auto *Entry = ConvertCostTableLookup(
3428 Table: BF16Tbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
3429 return Entry->Cost;
3430
3431 // Symbolic constants for the SVE sitofp/uitofp entries in the table below
3432 // The cost of unpacking twice is artificially increased for now in order
3433 // to avoid regressions against NEON, which will use tbl instructions directly
3434 // instead of multiple layers of [s|u]unpk[lo|hi].
3435 // We use the unpacks in cases where the destination type is illegal and
3436 // requires splitting of the input, even if the input type itself is legal.
3437 const unsigned int SVE_EXT_COST = 1;
3438 const unsigned int SVE_FCVT_COST = 1;
3439 const unsigned int SVE_UNPACK_ONCE = 4;
3440 const unsigned int SVE_UNPACK_TWICE = 16;
3441
3442 static const TypeConversionCostTblEntry ConversionTbl[] = {
3443 {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i64, .Cost: 1}, // xtn
3444 {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i16, .Src: MVT::v2i64, .Cost: 1}, // xtn
3445 {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i32, .Src: MVT::v2i64, .Cost: 1}, // xtn
3446 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i32, .Cost: 1}, // xtn
3447 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i64, .Cost: 3}, // 2 xtn + 1 uzp1
3448 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i32, .Cost: 1}, // xtn
3449 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i64, .Cost: 2}, // 1 uzp1 + 1 xtn
3450 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: 1}, // 1 uzp1
3451 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i16, .Cost: 1}, // 1 xtn
3452 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i32, .Cost: 2}, // 1 uzp1 + 1 xtn
3453 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i64, .Cost: 4}, // 3 x uzp1 + xtn
3454 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: 1}, // 1 uzp1
3455 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i64, .Cost: 3}, // 3 x uzp1
3456 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i32, .Src: MVT::v8i64, .Cost: 2}, // 2 x uzp1
3457 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: 1}, // uzp1
3458 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: 3}, // (2 + 1) x uzp1
3459 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i64, .Cost: 7}, // (4 + 2 + 1) x uzp1
3460 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: 2}, // 2 x uzp1
3461 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i64, .Cost: 6}, // (4 + 2) x uzp1
3462 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i32, .Src: MVT::v16i64, .Cost: 4}, // 4 x uzp1
3463
3464 // Truncations on nxvmiN
3465 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i8, .Cost: 2},
3466 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i16, .Cost: 2},
3467 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i32, .Cost: 2},
3468 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i64, .Cost: 2},
3469 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i8, .Cost: 2},
3470 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i16, .Cost: 2},
3471 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i32, .Cost: 2},
3472 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i64, .Cost: 5},
3473 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i8, .Cost: 2},
3474 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i16, .Cost: 2},
3475 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i32, .Cost: 5},
3476 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i64, .Cost: 11},
3477 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i1, .Src: MVT::nxv16i8, .Cost: 2},
3478 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i16, .Cost: 0},
3479 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i32, .Cost: 0},
3480 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i64, .Cost: 0},
3481 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i16, .Src: MVT::nxv2i32, .Cost: 0},
3482 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i16, .Src: MVT::nxv2i64, .Cost: 0},
3483 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i32, .Src: MVT::nxv2i64, .Cost: 0},
3484 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i16, .Cost: 0},
3485 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i32, .Cost: 0},
3486 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i64, .Cost: 1},
3487 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i16, .Src: MVT::nxv4i32, .Cost: 0},
3488 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i16, .Src: MVT::nxv4i64, .Cost: 1},
3489 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i32, .Src: MVT::nxv4i64, .Cost: 1},
3490 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i16, .Cost: 0},
3491 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i32, .Cost: 1},
3492 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i64, .Cost: 3},
3493 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i16, .Src: MVT::nxv8i32, .Cost: 1},
3494 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i16, .Src: MVT::nxv8i64, .Cost: 3},
3495 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i16, .Cost: 1},
3496 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i32, .Cost: 3},
3497 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i64, .Cost: 7},
3498
3499 // The number of shll instructions for the extension.
3500 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3},
3501 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3},
3502 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2},
3503 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2},
3504 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3},
3505 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3},
3506 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2},
3507 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2},
3508 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7},
3509 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7},
3510 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6},
3511 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6},
3512 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2},
3513 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2},
3514 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6},
3515 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6},
3516
3517 // FP Ext and trunc
3518 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::f32, .Cost: 1}, // fcvt
3519 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2f32, .Cost: 1}, // fcvtl
3520 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4f32, .Cost: 2}, // fcvtl+fcvtl2
3521 // FP16
3522 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f32, .Src: MVT::f16, .Cost: 1}, // fcvt
3523 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::f16, .Cost: 1}, // fcvt
3524 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f32, .Src: MVT::v4f16, .Cost: 1}, // fcvtl
3525 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f32, .Src: MVT::v8f16, .Cost: 2}, // fcvtl+fcvtl2
3526 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2f16, .Cost: 2}, // fcvtl+fcvtl
3527 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4f16, .Cost: 3}, // fcvtl+fcvtl2+fcvtl
3528 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8f16, .Cost: 6}, // 2 * fcvtl+fcvtl2+fcvtl
3529 // BF16 (uses shift)
3530 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f32, .Src: MVT::bf16, .Cost: 1}, // shl
3531 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::bf16, .Cost: 2}, // shl+fcvt
3532 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f32, .Src: MVT::v4bf16, .Cost: 1}, // shll
3533 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f32, .Src: MVT::v8bf16, .Cost: 2}, // shll+shll2
3534 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2bf16, .Cost: 2}, // shll+fcvtl
3535 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4bf16, .Cost: 3}, // shll+fcvtl+fcvtl2
3536 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8bf16, .Cost: 6}, // 2 * shll+fcvtl+fcvtl2
3537 // FP Ext and trunc
3538 {.ISD: ISD::FP_ROUND, .Dst: MVT::f32, .Src: MVT::f64, .Cost: 1}, // fcvt
3539 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2f32, .Src: MVT::v2f64, .Cost: 1}, // fcvtn
3540 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f32, .Src: MVT::v4f64, .Cost: 2}, // fcvtn+fcvtn2
3541 // FP16
3542 {.ISD: ISD::FP_ROUND, .Dst: MVT::f16, .Src: MVT::f32, .Cost: 1}, // fcvt
3543 {.ISD: ISD::FP_ROUND, .Dst: MVT::f16, .Src: MVT::f64, .Cost: 1}, // fcvt
3544 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f16, .Src: MVT::v4f32, .Cost: 1}, // fcvtn
3545 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8f16, .Src: MVT::v8f32, .Cost: 2}, // fcvtn+fcvtn2
3546 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2f16, .Src: MVT::v2f64, .Cost: 2}, // fcvtn+fcvtn
3547 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f16, .Src: MVT::v4f64, .Cost: 3}, // fcvtn+fcvtn2+fcvtn
3548 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8f16, .Src: MVT::v8f64, .Cost: 6}, // 2 * fcvtn+fcvtn2+fcvtn
3549 // BF16 (more complex, with +bf16 is handled above)
3550 {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f32, .Cost: 8}, // Expansion is ~8 insns
3551 {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f64, .Cost: 9}, // fcvtn + above
3552 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f32, .Cost: 8},
3553 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f32, .Cost: 8},
3554 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f32, .Cost: 15},
3555 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f64, .Cost: 9},
3556 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f64, .Cost: 10},
3557 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f64, .Cost: 19},
3558
3559 // LowerVectorINT_TO_FP:
3560 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1},
3561 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1},
3562 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: 1},
3563 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1},
3564 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1},
3565 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: 1},
3566
3567 // SVE: to nxv2f16
3568 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i8,
3569 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3570 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3571 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3572 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3573 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i8,
3574 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3575 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3576 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3577 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3578
3579 // SVE: to nxv4f16
3580 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i8,
3581 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3582 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3583 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3584 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i8,
3585 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3586 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3587 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3588
3589 // SVE: to nxv8f16
3590 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i8,
3591 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3592 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i16, .Cost: SVE_FCVT_COST},
3593 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i8,
3594 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3595 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i16, .Cost: SVE_FCVT_COST},
3596
3597 // SVE: to nxv16f16
3598 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv16f16, .Src: MVT::nxv16i8,
3599 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3600 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv16f16, .Src: MVT::nxv16i8,
3601 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3602
3603 // Complex: to v2f32
3604 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3},
3605 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 3},
3606 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3},
3607 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 3},
3608
3609 // SVE: to nxv2f32
3610 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i8,
3611 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3612 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3613 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3614 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3615 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i8,
3616 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3617 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3618 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3619 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3620
3621 // Complex: to v4f32
3622 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 4},
3623 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2},
3624 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 3},
3625 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2},
3626
3627 // SVE: to nxv4f32
3628 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i8,
3629 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3630 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3631 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3632 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i8,
3633 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3634 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3635 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3636
3637 // Complex: to v8f32
3638 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10},
3639 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4},
3640 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10},
3641 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4},
3642
3643 // SVE: to nxv8f32
3644 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i8,
3645 .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3646 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i16,
3647 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3648 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i8,
3649 .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3650 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i16,
3651 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3652
3653 // SVE: to nxv16f32
3654 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv16f32, .Src: MVT::nxv16i8,
3655 .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3656 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv16f32, .Src: MVT::nxv16i8,
3657 .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3658
3659 // Complex: to v16f32
3660 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: 21},
3661 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: 21},
3662
3663 // Complex: to v2f64
3664 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4},
3665 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 4},
3666 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2},
3667 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4},
3668 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 4},
3669 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2},
3670
3671 // SVE: to nxv2f64
3672 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i8,
3673 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3674 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3675 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3676 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3677 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i8,
3678 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3679 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3680 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3681 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3682
3683 // Complex: to v4f64
3684 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: 4},
3685 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: 4},
3686
3687 // SVE: to nxv4f64
3688 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i8,
3689 .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3690 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i16,
3691 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3692 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i32,
3693 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3694 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i8,
3695 .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3696 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i16,
3697 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3698 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i32,
3699 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3700
3701 // SVE: to nxv8f64
3702 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i8,
3703 .Cost: SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3704 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i16,
3705 .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3706 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i8,
3707 .Cost: SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3708 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i16,
3709 .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3710
3711 // LowerVectorFP_TO_INT
3712 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f32, .Cost: 1},
3713 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1},
3714 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: 1},
3715 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f32, .Cost: 1},
3716 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1},
3717 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: 1},
3718
3719 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3720 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f32, .Cost: 2},
3721 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 1},
3722 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 1},
3723 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f32, .Cost: 2},
3724 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 1},
3725 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 1},
3726
3727 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3728 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2},
3729 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 2},
3730 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2},
3731 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 2},
3732
3733 // Complex, from nxv2f32.
3734 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f32, .Cost: 1},
3735 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 1},
3736 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f32, .Cost: 1},
3737 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f32, .Cost: 1},
3738 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f32, .Cost: 1},
3739 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 1},
3740 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f32, .Cost: 1},
3741 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f32, .Cost: 1},
3742
3743 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3744 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2},
3745 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f64, .Cost: 2},
3746 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f64, .Cost: 2},
3747 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2},
3748 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f64, .Cost: 2},
3749 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f64, .Cost: 2},
3750
3751 // Complex, from nxv2f64.
3752 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f64, .Cost: 1},
3753 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f64, .Cost: 1},
3754 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f64, .Cost: 1},
3755 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f64, .Cost: 1},
3756 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i1, .Src: MVT::nxv2f64, .Cost: 1},
3757 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f64, .Cost: 1},
3758 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f64, .Cost: 1},
3759 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f64, .Cost: 1},
3760 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f64, .Cost: 1},
3761 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i1, .Src: MVT::nxv2f64, .Cost: 1},
3762
3763 // Complex, from nxv4f32.
3764 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f32, .Cost: 4},
3765 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f32, .Cost: 1},
3766 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f32, .Cost: 1},
3767 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f32, .Cost: 1},
3768 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i1, .Src: MVT::nxv4f32, .Cost: 1},
3769 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f32, .Cost: 4},
3770 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f32, .Cost: 1},
3771 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f32, .Cost: 1},
3772 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f32, .Cost: 1},
3773 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i1, .Src: MVT::nxv4f32, .Cost: 1},
3774
3775 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3776 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f64, .Cost: 7},
3777 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f64, .Cost: 7},
3778 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f64, .Cost: 7},
3779 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f64, .Cost: 7},
3780
3781 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3782 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f64, .Cost: 3},
3783 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f64, .Cost: 3},
3784 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f64, .Cost: 3},
3785 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f64, .Cost: 3},
3786 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f64, .Cost: 3},
3787 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f64, .Cost: 3},
3788
3789 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3790 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f32, .Cost: 3},
3791 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f32, .Cost: 3},
3792 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f32, .Cost: 3},
3793 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f32, .Cost: 3},
3794
3795 // Complex, from nxv8f16.
3796 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i64, .Src: MVT::nxv8f16, .Cost: 10},
3797 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i32, .Src: MVT::nxv8f16, .Cost: 4},
3798 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f16, .Cost: 1},
3799 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f16, .Cost: 1},
3800 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i1, .Src: MVT::nxv8f16, .Cost: 1},
3801 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i64, .Src: MVT::nxv8f16, .Cost: 10},
3802 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i32, .Src: MVT::nxv8f16, .Cost: 4},
3803 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f16, .Cost: 1},
3804 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f16, .Cost: 1},
3805 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i1, .Src: MVT::nxv8f16, .Cost: 1},
3806
3807 // Complex, from nxv4f16.
3808 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f16, .Cost: 4},
3809 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f16, .Cost: 1},
3810 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 1},
3811 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f16, .Cost: 1},
3812 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f16, .Cost: 4},
3813 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f16, .Cost: 1},
3814 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 1},
3815 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f16, .Cost: 1},
3816
3817 // Complex, from nxv2f16.
3818 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f16, .Cost: 1},
3819 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f16, .Cost: 1},
3820 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 1},
3821 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f16, .Cost: 1},
3822 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f16, .Cost: 1},
3823 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f16, .Cost: 1},
3824 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 1},
3825 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f16, .Cost: 1},
3826
3827 // Truncate from nxvmf32 to nxvmf16.
3828 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f16, .Src: MVT::nxv2f32, .Cost: 1},
3829 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f16, .Src: MVT::nxv4f32, .Cost: 1},
3830 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f16, .Src: MVT::nxv8f32, .Cost: 3},
3831
3832 // Truncate from nxvmf32 to nxvmbf16.
3833 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2bf16, .Src: MVT::nxv2f32, .Cost: 8},
3834 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4bf16, .Src: MVT::nxv4f32, .Cost: 8},
3835 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8bf16, .Src: MVT::nxv8f32, .Cost: 17},
3836
3837 // Truncate from nxvmf64 to nxvmf16.
3838 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f16, .Src: MVT::nxv2f64, .Cost: 1},
3839 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f16, .Src: MVT::nxv4f64, .Cost: 3},
3840 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f16, .Src: MVT::nxv8f64, .Cost: 7},
3841
3842 // Truncate from nxvmf64 to nxvmbf16.
3843 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2bf16, .Src: MVT::nxv2f64, .Cost: 9},
3844 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4bf16, .Src: MVT::nxv4f64, .Cost: 19},
3845 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8bf16, .Src: MVT::nxv8f64, .Cost: 39},
3846
3847 // Truncate from nxvmf64 to nxvmf32.
3848 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2f64, .Cost: 1},
3849 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4f64, .Cost: 3},
3850 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8f64, .Cost: 6},
3851
3852 // Extend from nxvmf16 to nxvmf32.
3853 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2f16, .Cost: 1},
3854 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4f16, .Cost: 1},
3855 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8f16, .Cost: 2},
3856
3857 // Extend from nxvmbf16 to nxvmf32.
3858 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2bf16, .Cost: 1}, // lsl
3859 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4bf16, .Cost: 1}, // lsl
3860 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8bf16, .Cost: 4}, // unpck+unpck+lsl+lsl
3861
3862 // Extend from nxvmf16 to nxvmf64.
3863 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2f16, .Cost: 1},
3864 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4f16, .Cost: 2},
3865 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8f16, .Cost: 4},
3866
3867 // Extend from nxvmbf16 to nxvmf64.
3868 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2bf16, .Cost: 2}, // lsl+fcvt
3869 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4bf16, .Cost: 6}, // 2*unpck+2*lsl+2*fcvt
3870 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8bf16, .Cost: 14}, // 6*unpck+4*lsl+4*fcvt
3871
3872 // Extend from nxvmf32 to nxvmf64.
3873 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2f32, .Cost: 1},
3874 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4f32, .Cost: 2},
3875 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8f32, .Cost: 6},
3876
3877 // Bitcasts from float to integer
3878 {.ISD: ISD::BITCAST, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: 0},
3879 {.ISD: ISD::BITCAST, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: 0},
3880 {.ISD: ISD::BITCAST, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: 0},
3881
3882 // Bitcasts from integer to float
3883 {.ISD: ISD::BITCAST, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 0},
3884 {.ISD: ISD::BITCAST, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 0},
3885 {.ISD: ISD::BITCAST, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 0},
3886
3887 // Add cost for extending to illegal -too wide- scalable vectors.
3888 // zero/sign extend are implemented by multiple unpack operations,
3889 // where each operation has a cost of 1.
3890 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i16, .Src: MVT::nxv16i8, .Cost: 2},
3891 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i32, .Src: MVT::nxv16i8, .Cost: 6},
3892 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i64, .Src: MVT::nxv16i8, .Cost: 14},
3893 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i16, .Cost: 2},
3894 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv8i64, .Src: MVT::nxv8i16, .Cost: 6},
3895 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv4i64, .Src: MVT::nxv4i32, .Cost: 2},
3896
3897 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i16, .Src: MVT::nxv16i8, .Cost: 2},
3898 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i32, .Src: MVT::nxv16i8, .Cost: 6},
3899 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i64, .Src: MVT::nxv16i8, .Cost: 14},
3900 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i16, .Cost: 2},
3901 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv8i64, .Src: MVT::nxv8i16, .Cost: 6},
3902 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv4i64, .Src: MVT::nxv4i32, .Cost: 2},
3903 };
3904
3905 // We have to estimate a cost of fixed length operation upon
3906 // SVE registers(operations) with the number of registers required
3907 // for a fixed type to be represented upon SVE registers.
3908 EVT WiderTy = SrcTy.bitsGT(VT: DstTy) ? SrcTy : DstTy;
3909 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3910 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3911 ST->useSVEForFixedLengthVectors(VT: WiderTy)) {
3912 std::pair<InstructionCost, MVT> LT =
3913 getTypeLegalizationCost(Ty: WiderTy.getTypeForEVT(Context&: Dst->getContext()));
3914 unsigned NumElements =
3915 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3916 return LT.first *
3917 getCastInstrCost(
3918 Opcode,
3919 Dst: ScalableVectorType::get(ElementType: Dst->getScalarType(), MinNumElts: NumElements),
3920 Src: ScalableVectorType::get(ElementType: Src->getScalarType(), MinNumElts: NumElements), CCH,
3921 CostKind, I);
3922 }
3923
3924 if (const auto *Entry = ConvertCostTableLookup(
3925 Table: ConversionTbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
3926 return Entry->Cost;
3927
3928 static const TypeConversionCostTblEntry FP16Tbl[] = {
3929 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f16, .Cost: 1}, // fcvtzs
3930 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f16, .Cost: 1},
3931 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f16, .Cost: 1}, // fcvtzs
3932 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f16, .Cost: 1},
3933 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f16, .Cost: 2}, // fcvtl+fcvtzs
3934 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f16, .Cost: 2},
3935 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i8, .Src: MVT::v8f16, .Cost: 2}, // fcvtzs+xtn
3936 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i8, .Src: MVT::v8f16, .Cost: 2},
3937 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f16, .Cost: 1}, // fcvtzs
3938 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f16, .Cost: 1},
3939 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f16, .Cost: 4}, // 2*fcvtl+2*fcvtzs
3940 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f16, .Cost: 4},
3941 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v16f16, .Cost: 3}, // 2*fcvtzs+xtn
3942 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v16f16, .Cost: 3},
3943 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v16f16, .Cost: 2}, // 2*fcvtzs
3944 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v16f16, .Cost: 2},
3945 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i32, .Src: MVT::v16f16, .Cost: 8}, // 4*fcvtl+4*fcvtzs
3946 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i32, .Src: MVT::v16f16, .Cost: 8},
3947 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f16, .Src: MVT::v8i8, .Cost: 2}, // ushll + ucvtf
3948 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f16, .Src: MVT::v8i8, .Cost: 2}, // sshll + scvtf
3949 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f16, .Src: MVT::v16i8, .Cost: 4}, // 2 * ushl(2) + 2 * ucvtf
3950 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f16, .Src: MVT::v16i8, .Cost: 4}, // 2 * sshl(2) + 2 * scvtf
3951 };
3952
3953 if (ST->hasFullFP16())
3954 if (const auto *Entry = ConvertCostTableLookup(
3955 Table: FP16Tbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
3956 return Entry->Cost;
3957
3958 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3959 // double-rounding issues.
3960 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3961 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3962 isa<FixedVectorType>(Val: Dst) && isa<FixedVectorType>(Val: Src))
3963 return cast<FixedVectorType>(Val: Dst)->getNumElements() *
3964 getCastInstrCost(Opcode, Dst: Dst->getScalarType(),
3965 Src: Src->getScalarType(), CCH, CostKind) +
3966 BaseT::getScalarizationOverhead(InTy: cast<FixedVectorType>(Val: Src), Insert: false,
3967 Extract: true, CostKind) +
3968 BaseT::getScalarizationOverhead(InTy: cast<FixedVectorType>(Val: Dst), Insert: true,
3969 Extract: false, CostKind);
3970
3971 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3972 CCH == TTI::CastContextHint::Masked &&
3973 ST->isSVEorStreamingSVEAvailable() &&
3974 TLI->getTypeAction(Context&: Src->getContext(), VT: SrcTy) ==
3975 TargetLowering::TypePromoteInteger &&
3976 TLI->getTypeAction(Context&: Dst->getContext(), VT: DstTy) ==
3977 TargetLowering::TypeSplitVector) {
3978 // The standard behaviour in the backend for these cases is to split the
3979 // extend up into two parts:
3980 // 1. Perform an extending load or masked load up to the legal type.
3981 // 2. Extend the loaded data to the final type.
3982 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: Src);
3983 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Context&: Src->getContext());
3984 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
3985 Opcode, Dst: LegalTy, Src, CCH, CostKind, I);
3986 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
3987 Opcode, Dst, Src: LegalTy, CCH: TTI::CastContextHint::None, CostKind, I);
3988 return Part1 + Part2;
3989 }
3990
3991 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3992 // but we also want to include the TTI::CastContextHint::Masked case too.
3993 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3994 CCH == TTI::CastContextHint::Masked &&
3995 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(VT: DstTy))
3996 CCH = TTI::CastContextHint::Normal;
3997
3998 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3999}
4000
4001InstructionCost
4002AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
4003 VectorType *VecTy, unsigned Index,
4004 TTI::TargetCostKind CostKind) const {
4005
4006 // Make sure we were given a valid extend opcode.
4007 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
4008 "Invalid opcode");
4009
4010 // We are extending an element we extract from a vector, so the source type
4011 // of the extend is the element type of the vector.
4012 auto *Src = VecTy->getElementType();
4013
4014 // Sign- and zero-extends are for integer types only.
4015 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
4016
4017 // Get the cost for the extract. We compute the cost (if any) for the extend
4018 // below.
4019 InstructionCost Cost = getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy,
4020 CostKind, Index, Op0: nullptr, Op1: nullptr);
4021
4022 // Legalize the types.
4023 auto VecLT = getTypeLegalizationCost(Ty: VecTy);
4024 auto DstVT = TLI->getValueType(DL, Ty: Dst);
4025 auto SrcVT = TLI->getValueType(DL, Ty: Src);
4026
4027 // If the resulting type is still a vector and the destination type is legal,
4028 // we may get the extension for free. If not, get the default cost for the
4029 // extend.
4030 if (!VecLT.second.isVector() || !TLI->isTypeLegal(VT: DstVT))
4031 return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
4032 CostKind);
4033
4034 // The destination type should be larger than the element type. If not, get
4035 // the default cost for the extend.
4036 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
4037 return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
4038 CostKind);
4039
4040 switch (Opcode) {
4041 default:
4042 llvm_unreachable("Opcode should be either SExt or ZExt");
4043
4044 // For sign-extends, we only need a smov, which performs the extension
4045 // automatically.
4046 case Instruction::SExt:
4047 return Cost;
4048
4049 // For zero-extends, the extend is performed automatically by a umov unless
4050 // the destination type is i64 and the element type is i8 or i16.
4051 case Instruction::ZExt:
4052 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
4053 return Cost;
4054 }
4055
4056 // If we are unable to perform the extend for free, get the default cost.
4057 return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
4058 CostKind);
4059}
4060
4061InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
4062 TTI::TargetCostKind CostKind,
4063 const Instruction *I) const {
4064 if (CostKind != TTI::TCK_RecipThroughput)
4065 return Opcode == Instruction::PHI ? 0 : 1;
4066 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
4067 // Branches are assumed to be predicted.
4068 return 0;
4069}
4070
/// Shared implementation behind the getVectorInstrCost overloads below:
/// models the cost of one insertelement/extractelement. \p Index may be -1U
/// when the lane is unknown. Callers supply either \p I (the original
/// instruction) or \p Scalar together with \p ScalarUserAndIdx (or neither),
/// which feeds the extract+fmul fusion check towards the end.
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
    const Instruction *I, Value *Scalar,
    ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
    TTI::VectorInstrContext VIC) const {
  assert(Val->isVectorTy() && "This must be a vector type");

  // -1U encodes "lane unknown"; all the lane-specific special cases below
  // need a concrete index.
  if (Index != -1U) {
    // Legalize the type.
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. For fixed-width vectors we can normalize the
    // index to the new type.
    if (LT.second.isFixedLengthVector()) {
      unsigned Width = LT.second.getVectorNumElements();
      Index = Index % Width;
    }

    // The element at index zero is already inside the vector.
    // - For a insert-element or extract-element
    // instruction that extracts integers, an explicit FPR -> GPR move is
    // needed. So it has non-zero cost.
    if (Index == 0 && !Val->getScalarType()->isIntegerTy())
      return 0;

    // This is recognising a LD1 single-element structure to one lane of one
    // register instruction. I.e., if this is an `insertelement` instruction,
    // and its second operand is a load, then we will generate a LD1, which
    // are expensive instructions on some uArchs.
    if (VIC == TTI::VectorInstrContext::Load) {
      // On subtargets where LD1-to-lane is cheap, model it as free.
      if (ST->hasFastLD1Single())
        return 0;
      return CostKind == TTI::TCK_CodeSize
                 ? 0
                 : ST->getVectorInsertExtractBaseCost() + 1;
    }

    // i1 inserts and extract will include an extra cset or cmp of the vector
    // value. Increase the cost by 1 to account.
    if (Val->getScalarSizeInBits() == 1)
      return CostKind == TTI::TCK_CodeSize
                 ? 2
                 : ST->getVectorInsertExtractBaseCost() + 1;

    // FIXME:
    // If the extract-element and insert-element instructions could be
    // simplified away (e.g., could be combined into users by looking at use-def
    // context), they have no cost. This is not done in the first place for
    // compile-time considerations.
  }

  // In case of Neon, if there exists extractelement from lane != 0 such that
  // 1. extractelement does not necessitate a move from vector_reg -> GPR.
  // 2. extractelement result feeds into fmul.
  // 3. Other operand of fmul is an extractelement from lane 0 or lane
  // equivalent to 0.
  // then the extractelement can be merged with fmul in the backend and it
  // incurs no cost.
  // e.g.
  // define double @foo(<2 x double> %a) {
  //   %1 = extractelement <2 x double> %a, i32 0
  //   %2 = extractelement <2 x double> %a, i32 1
  //   %res = fmul double %1, %2
  //   ret double %res
  // }
  // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
  auto ExtractCanFuseWithFmul = [&]() {
    // We bail out if the extract is from lane 0.
    if (Index == 0)
      return false;

    // Check if the scalar element type of the vector operand of ExtractElement
    // instruction is one of the allowed types.
    auto IsAllowedScalarTy = [&](const Type *T) {
      return T->isFloatTy() || T->isDoubleTy() ||
             (T->isHalfTy() && ST->hasFullFP16());
    };

    // Check if the extractelement user is scalar fmul.
    auto IsUserFMulScalarTy = [](const Value *EEUser) {
      // Check if the user is scalar fmul.
      const auto *BO = dyn_cast<BinaryOperator>(Val: EEUser);
      return BO && BO->getOpcode() == BinaryOperator::FMul &&
             !BO->getType()->isVectorTy();
    };

    // Check if the extract index is from lane 0 or lane equivalent to 0 for a
    // certain scalar type and a certain vector register width.
    auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
      auto RegWidth =
          getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedValue();
      // A lane whose bit offset is a multiple of the register width behaves
      // like lane 0 of the containing register.
      return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
    };

    // Check if the type constraints on input vector type and result scalar type
    // of extractelement instruction are satisfied.
    if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
      return false;

    if (Scalar) {
      // SLP-style path: the callers describe the scalar's users explicitly.
      DenseMap<User *, unsigned> UserToExtractIdx;
      for (auto *U : Scalar->users()) {
        // Every user must be a scalar fmul for the fusion to apply.
        if (!IsUserFMulScalarTy(U))
          return false;
        // Recording entry for the user is important. Index value is not
        // important.
        UserToExtractIdx[U];
      }
      if (UserToExtractIdx.empty())
        return false;
      for (auto &[S, U, L] : ScalarUserAndIdx) {
        // NOTE(review): the loop variable U below shadows the tuple element U
        // from the structured binding above, which is otherwise unused here —
        // verify the shadowing is intentional.
        for (auto *U : S->users()) {
          if (UserToExtractIdx.contains(Val: U)) {
            auto *FMul = cast<BinaryOperator>(Val: U);
            auto *Op0 = FMul->getOperand(i_nocapture: 0);
            auto *Op1 = FMul->getOperand(i_nocapture: 1);
            // NOTE(review): this condition is a tautology — (A && B) || !A ||
            // !B is true for every Op0/Op1, so each matching user is always
            // recorded. Presumably a stricter operand filter was intended;
            // confirm before relying on it.
            if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
              UserToExtractIdx[U] = L;
              break;
            }
          }
        }
      }
      // The fusion only fires when, for every fmul user, either this extract
      // or the sibling extract feeding it comes from a lane equivalent to 0.
      for (auto &[U, L] : UserToExtractIdx) {
        if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
            !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
          return false;
      }
    } else {
      // IR path: inspect the users of the actual extractelement instruction.
      const auto *EE = cast<ExtractElementInst>(Val: I);

      const auto *IdxOp = dyn_cast<ConstantInt>(Val: EE->getIndexOperand());
      if (!IdxOp)
        return false;

      return !EE->users().empty() && all_of(Range: EE->users(), P: [&](const User *U) {
        if (!IsUserFMulScalarTy(U))
          return false;

        // Check if the other operand of extractelement is also extractelement
        // from lane equivalent to 0.
        const auto *BO = cast<BinaryOperator>(Val: U);
        const auto *OtherEE = dyn_cast<ExtractElementInst>(
            Val: BO->getOperand(i_nocapture: 0) == EE ? BO->getOperand(i_nocapture: 1) : BO->getOperand(i_nocapture: 0));
        if (OtherEE) {
          const auto *IdxOp = dyn_cast<ConstantInt>(Val: OtherEE->getIndexOperand());
          if (!IdxOp)
            return false;
          return IsExtractLaneEquivalentToZero(
              cast<ConstantInt>(Val: OtherEE->getIndexOperand())
                  ->getValue()
                  .getZExtValue(),
              OtherEE->getType()->getScalarSizeInBits());
        }
        // A non-extract other operand does not block fusion.
        return true;
      });
    }
    return true;
  };

  // Fusible extracts are free; the check needs either the instruction or the
  // scalar/user description to reason about uses.
  if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
      ExtractCanFuseWithFmul())
    return 0;

  // All other insert/extracts cost this much.
  return CostKind == TTI::TCK_CodeSize ? 1
                                       : ST->getVectorInsertExtractBaseCost();
}
4244
4245InstructionCost AArch64TTIImpl::getVectorInstrCost(
4246 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4247 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
4248 // Treat insert at lane 0 into a poison vector as having zero cost. This
4249 // ensures vector broadcasts via an insert + shuffle (and will be lowered to a
4250 // single dup) are treated as cheap.
4251 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4252 isa<PoisonValue>(Val: Op0))
4253 return 0;
4254 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, I: nullptr,
4255 Scalar: nullptr, ScalarUserAndIdx: {}, VIC);
4256}
4257
// Overload used when the caller can describe the extracted scalar and all of
// its users (Scalar plus ScalarUserAndIdx) instead of an IR instruction;
// forwards to the common helper with no instruction context.
InstructionCost AArch64TTIImpl::getVectorInstrCost(
    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
    Value *Scalar, ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
    TTI::VectorInstrContext VIC) const {
  return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, I: nullptr, Scalar,
                                  ScalarUserAndIdx, VIC);
}
4265
// Overload used when the actual insert/extract instruction is available;
// forwards to the common helper with the instruction as context and no
// explicit scalar-user description.
InstructionCost
AArch64TTIImpl::getVectorInstrCost(const Instruction &I, Type *Val,
                                   TTI::TargetCostKind CostKind, unsigned Index,
                                   TTI::VectorInstrContext VIC) const {
  return getVectorInstrCostHelper(Opcode: I.getOpcode(), Val, CostKind, Index, I: &I,
                                  Scalar: nullptr, ScalarUserAndIdx: {}, VIC);
}
4273
4274InstructionCost
4275AArch64TTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
4276 TTI::TargetCostKind CostKind,
4277 unsigned Index) const {
4278 if (isa<FixedVectorType>(Val))
4279 return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind,
4280 Index);
4281
4282 // This typically requires both while and lastb instructions in order
4283 // to extract the last element. If this is in a loop the while
4284 // instruction can at least be hoisted out, although it will consume a
4285 // predicate register. The cost should be more expensive than the base
4286 // extract cost, which is 2 for most CPUs.
4287 return CostKind == TTI::TCK_CodeSize
4288 ? 2
4289 : ST->getVectorInsertExtractBaseCost() + 1;
4290}
4291
4292InstructionCost AArch64TTIImpl::getScalarizationOverhead(
4293 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4294 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
4295 TTI::VectorInstrContext VIC) const {
4296 if (isa<ScalableVectorType>(Val: Ty))
4297 return InstructionCost::getInvalid();
4298 if (Ty->getElementType()->isFloatingPointTy())
4299 return BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, Insert, Extract,
4300 CostKind);
4301 unsigned VecInstCost =
4302 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
4303 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4304}
4305
4306std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4307 Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4308 TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
4309 std::function<InstructionCost(Type *)> InstCost) const {
4310 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4311 return std::nullopt;
4312 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4313 return std::nullopt;
4314 // If we have +sve-b16b16 the operation can be promoted to SVE.
4315 if (CanUseSVE && ST->hasSVEB16B16() && ST->isNonStreamingSVEorSME2Available())
4316 return std::nullopt;
4317
4318 Type *PromotedTy = Ty->getWithNewType(EltTy: Type::getFloatTy(C&: Ty->getContext()));
4319 InstructionCost Cost = getCastInstrCost(Opcode: Instruction::FPExt, Dst: PromotedTy, Src: Ty,
4320 CCH: TTI::CastContextHint::None, CostKind);
4321 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4322 Cost *= 2;
4323 Cost += InstCost(PromotedTy);
4324 if (IncludeTrunc)
4325 Cost += getCastInstrCost(Opcode: Instruction::FPTrunc, Dst: Ty, Src: PromotedTy,
4326 CCH: TTI::CastContextHint::None, CostKind);
4327 return Cost;
4328}
4329
// Returns the AArch64-specific cost of an arithmetic operation of the given
// opcode on type \p Ty, covering fp16/bf16 promotion, widening multiplies,
// and the expansions used for division/remainder by constants.
InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {

  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
    if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
      return InstructionCost::getInvalid();

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
                                         Opd2Info: Op2Info, Args, CxtI);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Increase the cost for half and bfloat types if not architecturally
  // supported.
  if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
      ISD == ISD::FDIV || ISD == ISD::FREM)
    if (auto PromotedCost = getFP16BF16PromoteCost(
            Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
            // There is not native support for fdiv/frem even with +sve-b16b16.
            /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
            InstCost: [&](Type *PromotedTy) {
              return getArithmeticInstrCost(Opcode, Ty: PromotedTy, CostKind,
                                            Op1Info, Op2Info);
            }))
      return *PromotedCost;

  // If the operation is a widening instruction (smull or umull) and both
  // operands are extends the cost can be cheaper by considering that the
  // operation will operate on the narrowest type size possible (double the
  // largest input size) and a further extend.
  if (Type *ExtTy = isBinExtWideningInstruction(Opcode, DstTy: Ty, Args)) {
    if (ExtTy != Ty)
      return getArithmeticInstrCost(Opcode, Ty: ExtTy, CostKind) +
             getCastInstrCost(Opcode: Instruction::ZExt, Dst: Ty, Src: ExtTy,
                              CCH: TTI::CastContextHint::None, CostKind);
    return LT.first;
  }

  switch (ISD) {
  default:
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
                                         Opd2Info: Op2Info);
  case ISD::SREM:
  case ISD::SDIV:
    /*
    Notes for sdiv/srem specific costs:
    1. This only considers the cases where the divisor is constant, uniform and
    (pow-of-2/non-pow-of-2). Other cases are not important since they either
    result in some form of (ldr + adrp), corresponding to constant vectors, or
    scalarization of the division operation.
    2. Constant divisors, either negative in whole or partially, don't result in
    significantly different codegen as compared to positive constant divisors.
    So, we don't consider negative divisors separately.
    3. If the codegen is significantly different with SVE, it has been indicated
    using comments at appropriate places.

    sdiv specific cases:
    -----------------------------------------------------------------------
    codegen                       | pow-of-2               | Type
    -----------------------------------------------------------------------
    add + cmp + csel + asr        | Y                      | i64
    add + cmp + csel + asr        | Y                      | i32
    -----------------------------------------------------------------------

    srem specific cases:
    -----------------------------------------------------------------------
    codegen                       | pow-of-2               | Type
    -----------------------------------------------------------------------
    negs + and + and + csneg      | Y                      | i64
    negs + and + and + csneg      | Y                      | i32
    -----------------------------------------------------------------------

    other sdiv/srem cases:
    -------------------------------------------------------------------------
    commom codegen            | + srem     | + sdiv     | pow-of-2  | Type
    -------------------------------------------------------------------------
    smulh + asr + add + add   | -          | -          | N         | i64
    smull + lsr + add + add   | -          | -          | N         | i32
    usra                      | and + sub  | sshr       | Y         | <2 x i64>
    2 * (scalar code)         | -          | -          | N         | <2 x i64>
    usra                      | bic + sub  | sshr + neg | Y         | <4 x i32>
    smull2 + smull + uzp2     | mls        | -          | N         | <4 x i32>
     + sshr + usra            |            |            |           |
    -------------------------------------------------------------------------
    */
    // Uniform-constant divisors: cost the expansions summarised in the
    // tables above.
    if (Op2Info.isConstant() && Op2Info.isUniform()) {
      InstructionCost AddCost =
          getArithmeticInstrCost(Opcode: Instruction::Add, Ty, CostKind,
                                 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
      InstructionCost AsrCost =
          getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind,
                                 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
      InstructionCost MulCost =
          getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind,
                                 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
      // add/cmp/csel/csneg should have similar cost while asr/negs/and should
      // have similar cost.
      auto VT = TLI->getValueType(DL, Ty);
      if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
        if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
          // Neg can be folded into the asr instruction.
          return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
                                  : (3 * AsrCost + AddCost);
        } else {
          return MulCost + AsrCost + 2 * AddCost;
        }
      } else if (VT.isVector()) {
        InstructionCost UsraCost = 2 * AsrCost;
        if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
          // Division with scalable types corresponds to native 'asrd'
          // instruction when SVE is available.
          // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)

          // One more for the negation in SDIV
          InstructionCost Cost =
              (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
          if (Ty->isScalableTy() && ST->hasSVE())
            Cost += 2 * AsrCost;
          else {
            Cost +=
                UsraCost +
                (ISD == ISD::SDIV
                     ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
                     : 2 * AddCost);
          }
          return Cost;
        } else if (LT.second == MVT::v2i64) {
          return VT.getVectorNumElements() *
                 getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind,
                                        Op1Info: Op1Info.getNoProps(),
                                        Op2Info: Op2Info.getNoProps());
        } else {
          // When SVE is available, we get:
          // smulh + lsr + add/sub + asr + add/sub.
          if (Ty->isScalableTy() && ST->hasSVE())
            return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
          return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
        }
      }
    }
    if (Op2Info.isConstant() && !Op2Info.isUniform() &&
        LT.second.isFixedLengthVector()) {
      // FIXME: When the constant vector is non-uniform, this may result in
      // loading the vector from constant pool or in some cases, may also result
      // in scalarization. For now, we are approximating this with the
      // scalarization cost.
      auto ExtractCost = 2 * getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty,
                                                CostKind, Index: -1, Op0: nullptr, Op1: nullptr);
      auto InsertCost = getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty,
                                           CostKind, Index: -1, Op0: nullptr, Op1: nullptr);
      unsigned NElts = cast<FixedVectorType>(Val: Ty)->getNumElements();
      return ExtractCost + InsertCost +
             NElts * getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(),
                                            CostKind, Op1Info: Op1Info.getNoProps(),
                                            Op2Info: Op2Info.getNoProps());
    }
    [[fallthrough]];
  case ISD::UDIV:
  case ISD::UREM: {
    auto VT = TLI->getValueType(DL, Ty);
    if (Op2Info.isConstant()) {
      // If the operand is a power of 2 we can use the shift or and cost.
      if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
        return getArithmeticInstrCost(Opcode: Instruction::LShr, Ty, CostKind,
                                      Op1Info: Op1Info.getNoProps(),
                                      Op2Info: Op2Info.getNoProps());
      if (ISD == ISD::UREM && Op2Info.isPowerOf2())
        return getArithmeticInstrCost(Opcode: Instruction::And, Ty, CostKind,
                                      Op1Info: Op1Info.getNoProps(),
                                      Op2Info: Op2Info.getNoProps());

      if (ISD == ISD::UDIV || ISD == ISD::UREM) {
        // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
        // The MULHU will be expanded to UMULL for the types not listed below,
        // and will become a pair of UMULL+MULL2 for 128bit vectors.
        bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
                       LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
                       LT.second == MVT::nxv16i8;
        bool Is128bit = LT.second.is128BitVector();

        InstructionCost MulCost =
            getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind,
                                   Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
        InstructionCost AddCost =
            getArithmeticInstrCost(Opcode: Instruction::Add, Ty, CostKind,
                                   Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
        InstructionCost ShrCost =
            getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind,
                                   Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
        InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
                                  (HasMULH ? 0 : ShrCost) +      // UMULL shift
                                  AddCost * 2 + ShrCost;
        return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
      }
    }

    // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
    // emitted by the backend even when those functions are not declared in the
    // module.
    if (!VT.isVector() && VT.getSizeInBits() > 64)
      return getCallInstrCost(/*Function*/ F: nullptr, RetTy: Ty, Tys: {Ty, Ty}, CostKind);

    InstructionCost Cost = BaseT::getArithmeticInstrCost(
        Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
    if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
      if (TLI->isOperationLegalOrCustom(Op: ISD, VT: LT.second) && ST->hasSVE()) {
        // SDIV/UDIV operations are lowered using SVE, then we can have less
        // costs.
        if (VT.isSimple() && isa<FixedVectorType>(Val: Ty) &&
            Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
          static const CostTblEntry DivTbl[]{
              {.ISD: ISD::SDIV, .Type: MVT::v2i8, .Cost: 5},  {.ISD: ISD::SDIV, .Type: MVT::v4i8, .Cost: 8},
              {.ISD: ISD::SDIV, .Type: MVT::v8i8, .Cost: 8},  {.ISD: ISD::SDIV, .Type: MVT::v2i16, .Cost: 5},
              {.ISD: ISD::SDIV, .Type: MVT::v4i16, .Cost: 5}, {.ISD: ISD::SDIV, .Type: MVT::v2i32, .Cost: 1},
              {.ISD: ISD::UDIV, .Type: MVT::v2i8, .Cost: 5},  {.ISD: ISD::UDIV, .Type: MVT::v4i8, .Cost: 8},
              {.ISD: ISD::UDIV, .Type: MVT::v8i8, .Cost: 8},  {.ISD: ISD::UDIV, .Type: MVT::v2i16, .Cost: 5},
              {.ISD: ISD::UDIV, .Type: MVT::v4i16, .Cost: 5}, {.ISD: ISD::UDIV, .Type: MVT::v2i32, .Cost: 1}};

          const auto *Entry = CostTableLookup(Table: DivTbl, ISD, Ty: VT.getSimpleVT());
          if (nullptr != Entry)
            return Entry->Cost;
        }
        // For 8/16-bit elements, the cost is higher because the type
        // requires promotion and possibly splitting:
        if (LT.second.getScalarType() == MVT::i8)
          Cost *= 8;
        else if (LT.second.getScalarType() == MVT::i16)
          Cost *= 4;
        return Cost;
      } else {
        // If one of the operands is a uniform constant then the cost for each
        // element is Cost for insertion, extraction and division.
        // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
        // operation with scalar type
        if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
            (Op2Info.isConstant() && Op2Info.isUniform())) {
          if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) {
            InstructionCost DivCost = BaseT::getArithmeticInstrCost(
                Opcode, Ty: Ty->getScalarType(), CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
            return (4 + DivCost) * VTy->getNumElements();
          }
        }
        // On AArch64, without SVE, vector divisions are expanded
        // into scalar divisions of each pair of elements.
        Cost += getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, CostKind,
                                   Index: -1, Op0: nullptr, Op1: nullptr);
        Cost += getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty, CostKind, Index: -1,
                                   Op0: nullptr, Op1: nullptr);
      }

      // TODO: if one of the arguments is scalar, then it's not necessary to
      // double the cost of handling the vector elements.
      Cost += Cost;
    }
    return Cost;
  }
  case ISD::MUL:
    // When SVE is available, then we can lower the v2i64 operation using
    // the SVE mul instruction, which has a lower cost.
    if (LT.second == MVT::v2i64 && ST->hasSVE())
      return LT.first;

    // When SVE is not available, there is no MUL.2d instruction,
    // which means mul <2 x i64> is expensive as elements are extracted
    // from the vectors and the muls scalarized.
    // As getScalarizationOverhead is a bit too pessimistic, we
    // estimate the cost for a i64 vector directly here, which is:
    // - four 2-cost i64 extracts,
    // - two 2-cost i64 inserts, and
    // - two 1-cost muls.
    // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
    // LT.first = 2 the cost is 28. If both operands are extensions it will not
    // need to scalarize so the cost can be cheaper (smull or umull).
    if (LT.second != MVT::v2i64)
      return LT.first;
    return cast<VectorType>(Val: Ty)->getElementCount().getKnownMinValue() *
           (getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind) +
            getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, CostKind, Index: -1,
                               Op0: nullptr, Op1: nullptr) *
                2 +
            getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty, CostKind, Index: -1,
                               Op0: nullptr, Op1: nullptr));
  case ISD::ADD:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SHL:
    // These nodes are marked as 'custom' for combining purposes only.
    // We know that they are legal. See LowerAdd in ISelLowering.
    return LT.first;

  case ISD::FNEG:
    // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
    if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
         (Ty->isHalfTy() && ST->hasFullFP16())) &&
        CxtI &&
        ((CxtI->hasOneUse() &&
          match(V: *CxtI->user_begin(), P: m_FMul(L: m_Value(), R: m_Value()))) ||
         match(V: CxtI->getOperand(i: 0), P: m_FMul(L: m_Value(), R: m_Value()))))
      return 0;
    [[fallthrough]];
  case ISD::FADD:
  case ISD::FSUB:
    if (!Ty->getScalarType()->isFP128Ty())
      return LT.first;
    [[fallthrough]];
  case ISD::FMUL:
  case ISD::FDIV:
    // These nodes are marked as 'custom' just to lower them to SVE.
    // We know said lowering will incur no additional cost.
    if (!Ty->getScalarType()->isFP128Ty())
      return 2 * LT.first;

    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
                                         Opd2Info: Op2Info);
  case ISD::FREM:
    // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
    // those functions are not declared in the module.
    if (!Ty->isVectorTy())
      return getCallInstrCost(/*Function*/ F: nullptr, RetTy: Ty, Tys: {Ty, Ty}, CostKind);
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
                                         Opd2Info: Op2Info);
  }
}
4665
4666InstructionCost
4667AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
4668 const SCEV *Ptr,
4669 TTI::TargetCostKind CostKind) const {
4670 // Address computations in vectorized code with non-consecutive addresses will
4671 // likely result in more instructions compared to scalar code where the
4672 // computation can more often be merged into the index mode. The resulting
4673 // extra micro-ops can significantly decrease throughput.
4674 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4675 int MaxMergeDistance = 64;
4676
4677 if (PtrTy->isVectorTy() && SE &&
4678 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MergeDistance: MaxMergeDistance + 1))
4679 return NumVectorInstToHideOverhead;
4680
4681 // In many cases the address computation is not merged into the instruction
4682 // addressing mode.
4683 return 1;
4684}
4685
4686/// Check whether Opcode1 has less throughput according to the scheduling
4687/// model than Opcode2.
4688bool AArch64TTIImpl::hasKnownLowerThroughputFromSchedulingModel(
4689 unsigned Opcode1, unsigned Opcode2) const {
4690 const MCSchedModel &Sched = ST->getSchedModel();
4691 const TargetInstrInfo *TII = ST->getInstrInfo();
4692 if (!Sched.hasInstrSchedModel())
4693 return false;
4694
4695 const MCSchedClassDesc *SCD1 =
4696 Sched.getSchedClassDesc(SchedClassIdx: TII->get(Opcode: Opcode1).getSchedClass());
4697 const MCSchedClassDesc *SCD2 =
4698 Sched.getSchedClassDesc(SchedClassIdx: TII->get(Opcode: Opcode2).getSchedClass());
4699 // We cannot handle variant scheduling classes without an MI. If we need to
4700 // support them for any of the instructions we query the information of we
4701 // might need to add a way to resolve them without a MI or not use the
4702 // scheduling info.
4703 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4704 "Cannot handle variant scheduling classes without an MI");
4705 if (!SCD1->isValid() || !SCD2->isValid())
4706 return false;
4707
4708 return MCSchedModel::getReciprocalThroughput(STI: *ST, SCDesc: *SCD1) >
4709 MCSchedModel::getReciprocalThroughput(STI: *ST, SCDesc: *SCD2);
4710}
4711
// Returns the AArch64-specific cost of a compare or select on \p ValTy,
// including special handling for wide fixed-vector selects, fp16/bf16
// promotion of fcmp, and icmp-with-and folds.
InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
    unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
    TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
    TTI::OperandValueInfo Op2Info, const Instruction *I) const {
  // We don't lower some vector selects well that are wider than the register
  // width. TODO: Improve this with different cost kinds.
  if (isa<FixedVectorType>(Val: ValTy) && Opcode == Instruction::Select) {
    // We would need this many instructions to hide the scalarization happening.
    const int AmortizationCost = 20;

    // If VecPred is not set, check if we can get a predicate from the context
    // instruction, if its type matches the requested ValTy.
    if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
      CmpPredicate CurrentPred;
      if (match(V: I, P: m_Select(C: m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value()), L: m_Value(),
                       R: m_Value())))
        VecPred = CurrentPred;
    }
    // Check if we have a compare/select chain that can be lowered using
    // a (F)CMxx & BFI pair.
    if (CmpInst::isIntPredicate(P: VecPred) || VecPred == CmpInst::FCMP_OLE ||
        VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
        VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
        VecPred == CmpInst::FCMP_UNE) {
      static const auto ValidMinMaxTys = {
          MVT::v8i8,  MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
          MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
      static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};

      auto LT = getTypeLegalizationCost(Ty: ValTy);
      if (any_of(Range: ValidMinMaxTys, P: equal_to(Arg&: LT.second)) ||
          (ST->hasFullFP16() &&
           any_of(Range: ValidFP16MinMaxTys, P: equal_to(Arg&: LT.second))))
        return LT.first;
    }

    // Fixed per-type costs for selects that don't match the patterns above;
    // wide i64 selects are heavily penalised via AmortizationCost.
    static const TypeConversionCostTblEntry VectorSelectTbl[] = {
        {.ISD: Instruction::Select, .Dst: MVT::v2i1, .Src: MVT::v2f32, .Cost: 2},
        {.ISD: Instruction::Select, .Dst: MVT::v2i1, .Src: MVT::v2f64, .Cost: 2},
        {.ISD: Instruction::Select, .Dst: MVT::v4i1, .Src: MVT::v4f32, .Cost: 2},
        {.ISD: Instruction::Select, .Dst: MVT::v4i1, .Src: MVT::v4f16, .Cost: 2},
        {.ISD: Instruction::Select, .Dst: MVT::v8i1, .Src: MVT::v8f16, .Cost: 2},
        {.ISD: Instruction::Select, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: 16},
        {.ISD: Instruction::Select, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: 8},
        {.ISD: Instruction::Select, .Dst: MVT::v16i1, .Src: MVT::v16i32, .Cost: 16},
        {.ISD: Instruction::Select, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: 4 * AmortizationCost},
        {.ISD: Instruction::Select, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: 8 * AmortizationCost},
        {.ISD: Instruction::Select, .Dst: MVT::v16i1, .Src: MVT::v16i64, .Cost: 16 * AmortizationCost}};

    EVT SelCondTy = TLI->getValueType(DL, Ty: CondTy);
    EVT SelValTy = TLI->getValueType(DL, Ty: ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(Table: VectorSelectTbl, ISD: Opcode,
                                                     Dst: SelCondTy.getSimpleVT(),
                                                     Src: SelValTy.getSimpleVT()))
        return Entry->Cost;
    }
  }

  if (Opcode == Instruction::FCmp) {
    // fp16/bf16 compares without native support are costed via promotion to
    // f32 (plus a trunc of the vector-of-i1 result back to the narrow mask).
    if (auto PromotedCost = getFP16BF16PromoteCost(
            Ty: ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
            // TODO: Consider costing SVE FCMPs.
            /*CanUseSVE=*/false, InstCost: [&](Type *PromotedTy) {
              InstructionCost Cost =
                  getCmpSelInstrCost(Opcode, ValTy: PromotedTy, CondTy, VecPred,
                                     CostKind, Op1Info, Op2Info);
              if (isa<VectorType>(Val: PromotedTy))
                Cost += getCastInstrCost(
                    Opcode: Instruction::Trunc,
                    Dst: VectorType::getInteger(VTy: cast<VectorType>(Val: ValTy)),
                    Src: VectorType::getInteger(VTy: cast<VectorType>(Val: PromotedTy)),
                    CCH: TTI::CastContextHint::None, CostKind);
              return Cost;
            }))
      return *PromotedCost;

    auto LT = getTypeLegalizationCost(Ty: ValTy);
    // Model unknown fp compares as a libcall.
    if (LT.second.getScalarType() != MVT::f64 &&
        LT.second.getScalarType() != MVT::f32 &&
        LT.second.getScalarType() != MVT::f16)
      return LT.first * getCallInstrCost(/*Function*/ F: nullptr, RetTy: ValTy,
                                         Tys: {ValTy, ValTy}, CostKind);

    // Some comparison operators require expanding to multiple compares + or.
    unsigned Factor = 1;
    if (!CondTy->isVectorTy() &&
        (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
      Factor = 2; // fcmp with 2 selects
    else if (isa<FixedVectorType>(Val: ValTy) &&
             (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
              VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
      Factor = 3; // fcmxx+fcmyy+or
    else if (isa<ScalableVectorType>(Val: ValTy) &&
             (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
      Factor = 3; // fcmxx+fcmyy+or

    // On cores where the scheduling model reports lower throughput for the
    // SVE compare than its NEON counterpart, scale the cost up.
    if (isa<ScalableVectorType>(Val: ValTy) &&
        CostKind == TTI::TCK_RecipThroughput &&
        hasKnownLowerThroughputFromSchedulingModel(Opcode1: AArch64::FCMEQ_PPzZZ_S,
                                                   Opcode2: AArch64::FCMEQv4f32))
      Factor *= 2;

    return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
  }

  // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
  // icmp(and, 0) as free, as we can make use of ands, but only if the
  // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
  // providing it will not cause performance regressions.
  if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
      Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(Pred: VecPred) &&
      TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: ValTy)) &&
      match(V: I->getOperand(i: 0), P: m_And(L: m_Value(), R: m_Value()))) {
    if (match(V: I->getOperand(i: 1), P: m_Zero()))
      return 0;

    // x >= 1 / x < 1 -> x > 0 / x <= 0
    if (match(V: I->getOperand(i: 1), P: m_One()) &&
        (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
      return 0;

    // x <= -1 / x > -1 -> x > 0 / x <= 0
    if (match(V: I->getOperand(i: 1), P: m_AllOnes()) &&
        (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
      return 0;
  }

  // The base case handles scalable vectors fine for now, since it treats the
  // cost as 1 * legalization cost.
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                   Op1Info, Op2Info, I);
}
4846
4847AArch64TTIImpl::TTI::MemCmpExpansionOptions
4848AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4849 TTI::MemCmpExpansionOptions Options;
4850 if (ST->requiresStrictAlign()) {
4851 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4852 // a bunch of instructions when strict align is enabled.
4853 return Options;
4854 }
4855 Options.AllowOverlappingLoads = true;
4856 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4857 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4858 // TODO: Though vector loads usually perform well on AArch64, in some targets
4859 // they may wake up the FP unit, which raises the power consumption. Perhaps
4860 // they could be used with no holds barred (-O3).
4861 Options.LoadSizes = {8, 4, 2, 1};
4862 Options.AllowedTailExpansions = {3, 5, 6};
4863 return Options;
4864}
4865
// Prefer vector-of-pointer addressing only when SVE is available, since the
// gather/scatter costing in this file (see getGatherScatterOpCost) only
// handles SVE-legal types.
bool AArch64TTIImpl::prefersVectorizedAddressing() const {
  return ST->hasSVE();
}
4869
4870InstructionCost
4871AArch64TTIImpl::getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA,
4872 TTI::TargetCostKind CostKind) const {
4873 switch (MICA.getID()) {
4874 case Intrinsic::masked_scatter:
4875 case Intrinsic::masked_gather:
4876 return getGatherScatterOpCost(MICA, CostKind);
4877 case Intrinsic::masked_load:
4878 case Intrinsic::masked_store:
4879 return getMaskedMemoryOpCost(MICA, CostKind);
4880 }
4881 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
4882}
4883
4884InstructionCost
4885AArch64TTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
4886 TTI::TargetCostKind CostKind) const {
4887 Type *Src = MICA.getDataType();
4888
4889 if (useNeonVector(Ty: Src))
4890 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
4891 auto LT = getTypeLegalizationCost(Ty: Src);
4892 if (!LT.first.isValid())
4893 return InstructionCost::getInvalid();
4894
4895 // Return an invalid cost for element types that we are unable to lower.
4896 auto *VT = cast<VectorType>(Val: Src);
4897 if (VT->getElementType()->isIntegerTy(Bitwidth: 1))
4898 return InstructionCost::getInvalid();
4899
4900 // The code-generator is currently not able to handle scalable vectors
4901 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4902 // it. This change will be removed when code-generation for these types is
4903 // sufficiently reliable.
4904 if (VT->getElementCount() == ElementCount::getScalable(MinVal: 1))
4905 return InstructionCost::getInvalid();
4906
4907 return LT.first;
4908}
4909
4910// This function returns gather/scatter overhead either from
4911// user-provided value or specialized values per-target from \p ST.
4912static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4913 const AArch64Subtarget *ST) {
4914 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4915 "Should be called on only load or stores.");
4916 switch (Opcode) {
4917 case Instruction::Load:
4918 if (SVEGatherOverhead.getNumOccurrences() > 0)
4919 return SVEGatherOverhead;
4920 return ST->getGatherOverhead();
4921 break;
4922 case Instruction::Store:
4923 if (SVEScatterOverhead.getNumOccurrences() > 0)
4924 return SVEScatterOverhead;
4925 return ST->getScatterOverhead();
4926 break;
4927 default:
4928 llvm_unreachable("Shouldn't have reached here");
4929 }
4930}
4931
// Cost of a gather (load) or scatter (store) memory intrinsic, modeled as a
// per-element scalar memory op scaled by a per-target SVE overhead factor.
InstructionCost
AArch64TTIImpl::getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA,
                                       TTI::TargetCostKind CostKind) const {

  // Map the intrinsic ID onto the corresponding memory opcode for costing.
  unsigned Opcode = (MICA.getID() == Intrinsic::masked_gather ||
                     MICA.getID() == Intrinsic::vp_gather)
                        ? Instruction::Load
                        : Instruction::Store;

  Type *DataTy = MICA.getDataType();
  Align Alignment = MICA.getAlignment();
  const Instruction *I = MICA.getInst();

  // Fall back to the base implementation for NEON-handled fixed vectors or
  // types that are not legal for a masked gather/scatter.
  if (useNeonVector(Ty: DataTy) || !isLegalMaskedGatherScatter(DataType: DataTy))
    return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
  auto *VT = cast<VectorType>(Val: DataTy);
  auto LT = getTypeLegalizationCost(Ty: DataTy);
  if (!LT.first.isValid())
    return InstructionCost::getInvalid();

  // Return an invalid cost for element types that we are unable to lower.
  if (!LT.second.isVector() ||
      !isElementTypeLegalForScalableVector(Ty: VT->getElementType()) ||
      VT->getElementType()->isIntegerTy(Bitwidth: 1))
    return InstructionCost::getInvalid();

  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  if (VT->getElementCount() == ElementCount::getScalable(MinVal: 1))
    return InstructionCost::getInvalid();

  // Cost one scalar-element memory op, then scale by the gather/scatter
  // overhead and the (maximum) number of elements per legalized vector.
  ElementCount LegalVF = LT.second.getVectorElementCount();
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, Src: VT->getElementType(), Alignment, AddressSpace: 0, CostKind,
                      OpInfo: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
  // Add on an overhead cost for using gathers/scatters.
  MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
  return LT.first * MemOpCost * getMaxNumElements(VF: LegalVF);
}
4973
4974bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
4975 return isa<FixedVectorType>(Val: Ty) && !ST->useSVEForFixedLengthVectors();
4976}
4977
/// Return the cost of a load/store of type \p Ty. Handles NEON-specific
/// special cases (misaligned 128-bit stores, truncating/extending memory ops,
/// non-power-of-2 element counts) and rejects scalable types that codegen
/// cannot yet handle.
InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
                                                Align Alignment,
                                                unsigned AddressSpace,
                                                TTI::TargetCostKind CostKind,
                                                TTI::OperandValueInfo OpInfo,
                                                const Instruction *I) const {
  EVT VT = TLI->getValueType(DL, Ty, AllowUnknown: true);
  // Type legalization can't handle structs
  if (VT == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace,
                                  CostKind);

  auto LT = getTypeLegalizationCost(Ty);
  if (!LT.first.isValid())
    return InstructionCost::getInvalid();

  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  // We also only support full register predicate loads and stores.
  if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
    if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1) ||
        (VTy->getElementType()->isIntegerTy(Bitwidth: 1) &&
         !VTy->getElementCount().isKnownMultipleOf(
             RHS: ElementCount::getScalable(MinVal: 16))))
      return InstructionCost::getInvalid();

  // TODO: consider latency as well for TCK_SizeAndLatency.
  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
    return LT.first;

  // For other non-throughput cost kinds, a memory op is modeled as unit cost.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
      LT.second.is128BitVector() && Alignment < Align(16)) {
    // Unaligned stores are extremely inefficient. We don't split all
    // unaligned 128-bit stores because the negative impact that has shown in
    // practice on inlined block copy code.
    // We make such stores expensive so that we will only vectorize if there
    // are 6 other instructions getting vectorized.
    const int AmortizationCost = 6;

    return LT.first * 2 * AmortizationCost;
  }

  // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
  if (Ty->isPtrOrPtrVectorTy())
    return LT.first;

  if (useNeonVector(Ty)) {
    // Check truncating stores and extending loads.
    if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
      // v4i8 types are lowered to scalar a load/store and sshll/xtn.
      if (VT == MVT::v4i8)
        return 2;
      // Otherwise we need to scalarize.
      return cast<FixedVectorType>(Val: Ty)->getNumElements() * 2;
    }
    EVT EltVT = VT.getVectorElementType();
    unsigned EltSize = EltVT.getScalarSizeInBits();
    // The remaining special-casing below only applies to unaligned, sub-128-bit
    // vectors with legal power-of-2 element sizes; everything else is just the
    // number of legalization steps.
    if (!isPowerOf2_32(Value: EltSize) || EltSize < 8 || EltSize > 64 ||
        VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
      return LT.first;
    // FIXME: v3i8 lowering currently is very inefficient, due to automatic
    // widening to v4i8, which produces suboptimal results.
    if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
      return LT.first;

    // Check non-power-of-2 loads/stores for legal vector element types with
    // NEON. Non-power-of-2 memory ops will get broken down to a set of
    // operations on smaller power-of-2 ops, including ld1/st1.
    LLVMContext &C = Ty->getContext();
    InstructionCost Cost(0);
    SmallVector<EVT> TypeWorklist;
    TypeWorklist.push_back(Elt: VT);
    // Recursively split off the largest power-of-2 prefix, charging one op per
    // power-of-2 piece (e.g. 7 elements -> 4 + 2 + 1 -> cost 3).
    while (!TypeWorklist.empty()) {
      EVT CurrVT = TypeWorklist.pop_back_val();
      unsigned CurrNumElements = CurrVT.getVectorNumElements();
      if (isPowerOf2_32(Value: CurrNumElements)) {
        Cost += 1;
        continue;
      }

      unsigned PrevPow2 = NextPowerOf2(A: CurrNumElements) / 2;
      TypeWorklist.push_back(Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: PrevPow2));
      TypeWorklist.push_back(
          Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: CurrNumElements - PrevPow2));
    }
    return Cost;
  }

  return LT.first;
}
5073
/// Cost of an interleaved memory access (e.g. ld2/st3 groups). Returns an
/// invalid cost for configurations codegen cannot lower; otherwise models
/// accesses that map to ldN/stN instructions, falling back to the generic
/// implementation for everything else.
InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) const {
  assert(Factor >= 2 && "Invalid interleave factor");
  auto *VecVTy = cast<VectorType>(Val: VecTy);

  // Scalable interleaved accesses require SVE.
  if (VecTy->isScalableTy() && !ST->hasSVE())
    return InstructionCost::getInvalid();

  // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
  // only have lowering for power-of-2 factors.
  // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
  // InterleavedAccessPass for ld3/st3
  if (VecTy->isScalableTy() && !isPowerOf2_32(Value: Factor))
    return InstructionCost::getInvalid();

  // Vectorization for masked interleaved accesses is only enabled for scalable
  // VF.
  if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
    return InstructionCost::getInvalid();

  if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
    // SubVecTy is the per-lane type, i.e. the full vector divided by Factor.
    auto *SubVecTy =
        VectorType::get(ElementType: VecVTy->getElementType(),
                        EC: VecVTy->getElementCount().divideCoefficientBy(RHS: Factor));

    // ldN/stN only support legal vector types of size 64 or 128 in bits.
    // Accesses having vector types that are a multiple of 128 bits can be
    // matched to more than one ldN/stN instruction.
    bool UseScalable;
    if (MinElts % Factor == 0 &&
        TLI->isLegalInterleavedAccessType(VecTy: SubVecTy, DL, UseScalable))
      return Factor * TLI->getNumInterleavedAccesses(VecTy: SubVecTy, DL, UseScalable);
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}
5115
5116InstructionCost
5117AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
5118 InstructionCost Cost = 0;
5119 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5120 for (auto *I : Tys) {
5121 if (!I->isVectorTy())
5122 continue;
5123 if (I->getScalarSizeInBits() * cast<FixedVectorType>(Val: I)->getNumElements() ==
5124 128)
5125 Cost += getMemoryOpCost(Opcode: Instruction::Store, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind) +
5126 getMemoryOpCost(Opcode: Instruction::Load, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind);
5127 }
5128 return Cost;
5129}
5130
/// Maximum interleave factor for unrolling/vectorization. The requested VF is
/// ignored; the subtarget defines a single uniform maximum.
unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
  return ST->getMaxInterleaveFactor();
}
5134
5135// For Falkor, we want to avoid having too many strided loads in a loop since
5136// that can exhaust the HW prefetcher resources. We adjust the unroller
5137// MaxCount preference below to attempt to ensure unrolling doesn't create too
5138// many strided loads.
5139static void
5140getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
5141 TargetTransformInfo::UnrollingPreferences &UP) {
5142 enum { MaxStridedLoads = 7 };
5143 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
5144 int StridedLoads = 0;
5145 // FIXME? We could make this more precise by looking at the CFG and
5146 // e.g. not counting loads in each side of an if-then-else diamond.
5147 for (const auto BB : L->blocks()) {
5148 for (auto &I : *BB) {
5149 LoadInst *LMemI = dyn_cast<LoadInst>(Val: &I);
5150 if (!LMemI)
5151 continue;
5152
5153 Value *PtrValue = LMemI->getPointerOperand();
5154 if (L->isLoopInvariant(V: PtrValue))
5155 continue;
5156
5157 const SCEV *LSCEV = SE.getSCEV(V: PtrValue);
5158 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(Val: LSCEV);
5159 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
5160 continue;
5161
5162 // FIXME? We could take pairing of unrolled load copies into account
5163 // by looking at the AddRec, but we would probably have to limit this
5164 // to loops with no stores or other memory optimization barriers.
5165 ++StridedLoads;
5166 // We've seen enough strided loads that seeing more won't make a
5167 // difference.
5168 if (StridedLoads > MaxStridedLoads / 2)
5169 return StridedLoads;
5170 }
5171 }
5172 return StridedLoads;
5173 };
5174
5175 int StridedLoads = countStridedLoads(L, SE);
5176 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
5177 << " strided loads\n");
5178 // Pick the largest power of 2 unroll count that won't result in too many
5179 // strided loads.
5180 if (StridedLoads) {
5181 UP.MaxCount = 1 << Log2_32(Value: MaxStridedLoads / StridedLoads);
5182 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
5183 << UP.MaxCount << '\n');
5184 }
5185}
5186
5187// This function returns true if the loop:
5188// 1. Has a valid cost, and
5189// 2. Has a cost within the supplied budget.
5190// Otherwise it returns false.
5191static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI,
5192 InstructionCost Budget,
5193 unsigned *FinalSize) {
5194 // Estimate the size of the loop.
5195 InstructionCost LoopCost = 0;
5196
5197 for (auto *BB : L->getBlocks()) {
5198 for (auto &I : *BB) {
5199 SmallVector<const Value *, 4> Operands(I.operand_values());
5200 InstructionCost Cost =
5201 TTI.getInstructionCost(U: &I, Operands, CostKind: TTI::TCK_CodeSize);
5202 // This can happen with intrinsics that don't currently have a cost model
5203 // or for some operations that require SVE.
5204 if (!Cost.isValid())
5205 return false;
5206
5207 LoopCost += Cost;
5208 if (LoopCost > Budget)
5209 return false;
5210 }
5211 }
5212
5213 if (FinalSize)
5214 *FinalSize = LoopCost.getValue();
5215 return true;
5216}
5217
5218static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
5219 const AArch64TTIImpl &TTI) {
5220 // Only consider loops with unknown trip counts for which we can determine
5221 // a symbolic expression. Multi-exit loops with small known trip counts will
5222 // likely be unrolled anyway.
5223 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5224 if (isa<SCEVConstant>(Val: BTC) || isa<SCEVCouldNotCompute>(Val: BTC))
5225 return false;
5226
5227 // It might not be worth unrolling loops with low max trip counts. Restrict
5228 // this to max trip counts > 32 for now.
5229 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
5230 if (MaxTC > 0 && MaxTC <= 32)
5231 return false;
5232
5233 // Make sure the loop size is <= 5.
5234 if (!isLoopSizeWithinBudget(L, TTI, Budget: 5, FinalSize: nullptr))
5235 return false;
5236
5237 // Small search loops with multiple exits can be highly beneficial to unroll.
5238 // We only care about loops with exactly two exiting blocks, although each
5239 // block could jump to the same exit block.
5240 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
5241 if (Blocks.size() != 2)
5242 return false;
5243
5244 if (any_of(Range&: Blocks, P: [](BasicBlock *BB) {
5245 return !isa<UncondBrInst, CondBrInst>(Val: BB->getTerminator());
5246 }))
5247 return false;
5248
5249 return true;
5250}
5251
/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
/// OOO engine's wide instruction window and various predictors.
static void
getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
                                 TargetTransformInfo::UnrollingPreferences &UP,
                                 const AArch64TTIImpl &TTI) {
  // Limit loops with structure that is highly likely to benefit from runtime
  // unrolling; that is we exclude outer loops and loops with many blocks (i.e.
  // likely with complex control flow). Note that the heuristics here may be
  // overly conservative and we err on the side of avoiding runtime unrolling
  // rather than unroll excessively. They are all subject to further refinement.
  if (!L->isInnermost() || L->getNumBlocks() > 8)
    return;

  // Loops with multiple exits are handled by common code.
  if (!L->getExitBlock())
    return;

  // Check if the loop contains any reductions that could be parallelized when
  // unrolling. If so, enable partial unrolling, if the trip count is known to
  // be a multiple of 2.
  bool HasParellelizableReductions =
      L->getNumBlocks() == 1 &&
      any_of(Range: L->getHeader()->phis(),
             P: [&SE, L](PHINode &Phi) {
               return canParallelizeReductionWhenUnrolling(Phi, L, SE: &SE);
             }) &&
      isLoopSizeWithinBudget(L, TTI, Budget: 12, FinalSize: nullptr);
  if (HasParellelizableReductions &&
      SE.getSmallConstantTripMultiple(L, ExitingBlock: L->getExitingBlock()) % 2 == 0) {
    UP.Partial = true;
    UP.MaxCount = 4;
    UP.AddAdditionalAccumulators = true;
  }

  // Below here we only consider runtime unrolling: require a symbolic (not
  // constant, not unknown) backedge-taken count with a max trip count > 32.
  const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
  if (isa<SCEVConstant>(Val: BTC) || isa<SCEVCouldNotCompute>(Val: BTC) ||
      (SE.getSmallConstantMaxTripCount(L) > 0 &&
       SE.getSmallConstantMaxTripCount(L) <= 32))
    return;

  // Don't second-guess the vectorizer's output.
  if (findStringMetadataForLoop(TheLoop: L, Name: "llvm.loop.isvectorized"))
    return;

  // Require an exact backedge-taken count for the runtime trip-count check.
  if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L))
    return;

  // Limit to loops with trip counts that are cheap to expand.
  UP.SCEVExpansionBudget = 1;

  if (HasParellelizableReductions) {
    UP.Runtime = true;
    UP.DefaultUnrollRuntimeCount = 4;
    UP.AddAdditionalAccumulators = true;
  }

  // Try to unroll small loops, of few-blocks with low budget, if they have
  // load/store dependencies, to expose more parallel memory access streams,
  // or if they do little work inside a block (i.e. load -> X -> store pattern).
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  if (Header == Latch) {
    // Estimate the size of the loop.
    unsigned Size;
    unsigned Width = 10;
    if (!isLoopSizeWithinBudget(L, TTI, Budget: Width, FinalSize: &Size))
      return;

    // Try to find an unroll count that maximizes the use of the instruction
    // window, i.e. trying to fetch as many instructions per cycle as possible.
    unsigned MaxInstsPerLine = 16;
    unsigned UC = 1;
    unsigned BestUC = 1;
    unsigned SizeWithBestUC = BestUC * Size;
    while (UC <= 8) {
      unsigned SizeWithUC = UC * Size;
      if (SizeWithUC > 48)
        break;
      if ((SizeWithUC % MaxInstsPerLine) == 0 ||
          (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
        BestUC = UC;
        SizeWithBestUC = BestUC * Size;
      }
      UC++;
    }

    if (BestUC == 1)
      return;

    // Collect loaded values (plus their first in-loop users) and stores with
    // loop-varying addresses; only unroll if some store consumes a loaded
    // value, i.e. there is a load -> ... -> store chain to overlap.
    SmallPtrSet<Value *, 8> LoadedValuesPlus;
    SmallVector<StoreInst *> Stores;
    for (auto *BB : L->blocks()) {
      for (auto &I : *BB) {
        Value *Ptr = getLoadStorePointerOperand(V: &I);
        if (!Ptr)
          continue;
        const SCEV *PtrSCEV = SE.getSCEV(V: Ptr);
        if (SE.isLoopInvariant(S: PtrSCEV, L))
          continue;
        if (isa<LoadInst>(Val: &I)) {
          LoadedValuesPlus.insert(Ptr: &I);
          // Include in-loop 1st users of loaded values.
          for (auto *U : I.users())
            if (L->contains(Inst: cast<Instruction>(Val: U)))
              LoadedValuesPlus.insert(Ptr: U);
        } else
          Stores.push_back(Elt: cast<StoreInst>(Val: &I));
      }
    }

    if (none_of(Range&: Stores, P: [&LoadedValuesPlus](StoreInst *SI) {
          return LoadedValuesPlus.contains(Ptr: SI->getOperand(i_nocapture: 0));
        }))
      return;

    UP.Runtime = true;
    UP.DefaultUnrollRuntimeCount = BestUC;
    return;
  }

  // Try to runtime-unroll loops with early-continues depending on loop-varying
  // loads; this helps with branch-prediction for the early-continues.
  auto *Term = dyn_cast<CondBrInst>(Val: Header->getTerminator());
  SmallVector<BasicBlock *> Preds(predecessors(BB: Latch));
  if (!Term || Preds.size() == 1 || !llvm::is_contained(Range&: Preds, Element: Header) ||
      none_of(Range&: Preds, P: [L](BasicBlock *Pred) { return L->contains(BB: Pred); }))
    return;

  // Depth-limited search: does \p I (transitively) depend on an in-loop load?
  std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
      [&](Instruction *I, unsigned Depth) -> bool {
    if (isa<PHINode>(Val: I) || L->isLoopInvariant(V: I) || Depth > 8)
      return false;

    if (isa<LoadInst>(Val: I))
      return true;

    return any_of(Range: I->operands(), P: [&](Value *V) {
      auto *I = dyn_cast<Instruction>(Val: V);
      return I && DependsOnLoopLoad(I, Depth + 1);
    });
  };
  CmpPredicate Pred;
  Instruction *I;
  // Header ends in a compare-and-branch whose compared value depends on a
  // loop-varying load -> enable runtime unrolling.
  if (match(V: Term, P: m_Br(C: m_ICmp(Pred, L: m_Instruction(I), R: m_Value()), T: m_Value(),
                         F: m_Value())) &&
      DependsOnLoopLoad(I, 0)) {
    UP.Runtime = true;
  }
}
5401
/// Configure unrolling preferences for \p L: base defaults, then scans the
/// loop to reject calls/vectorized bodies, then layers on subtarget-specific
/// heuristics (Apple, Falkor, multi-exit search loops, in-order cores).
void AArch64TTIImpl::getUnrollingPreferences(
    Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
    OptimizationRemarkEmitter *ORE) const {
  // Enable partial unrolling and runtime unrolling.
  BaseT::getUnrollingPreferences(L, SE, UP, ORE);

  UP.UpperBound = true;

  // For inner loop, it is more likely to be a hot one, and the runtime check
  // can be promoted out from LICM pass, so the overhead is less, let's try
  // a larger threshold to unroll more loops.
  if (L->getLoopDepth() > 1)
    UP.PartialThreshold *= 2;

  // Disable partial & runtime unrolling on -Os.
  UP.PartialOptSizeThreshold = 0;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining. Don't unroll auto-vectorized loops either, though do allow
  // unrolling of the scalar remainder.
  bool IsVectorized = getBooleanLoopAttribute(TheLoop: L, Name: "llvm.loop.isvectorized");
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Both auto-vectorized loops and the scalar remainder have the
      // isvectorized attribute, so differentiate between them by the presence
      // of vector instructions.
      if (IsVectorized && I.getType()->isVectorTy())
        return;
      if (isa<CallBase>(Val: I)) {
        // Calls that don't lower to an actual call (e.g. known intrinsics)
        // don't block unrolling.
        if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I))
          if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction())
            if (!isLoweredToCall(F))
              continue;
        return;
      }

      // Accumulate the loop body size for the force-unroll check below.
      SmallVector<const Value *, 4> Operands(I.operand_values());
      Cost += getInstructionCost(U: &I, Operands,
                                 CostKind: TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  // Apply subtarget-specific unrolling preferences.
  if (ST->isAppleMLike())
    getAppleRuntimeUnrollPreferences(L, SE, UP, TTI: *this);
  else if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
           EnableFalkorHWPFUnrollFix)
    getFalkorUnrollingPreferences(L, SE, UP);

  // If this is a small, multi-exit loop similar to something like std::find,
  // then there is typically a performance improvement achieved by unrolling.
  if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, TTI: *this)) {
    UP.RuntimeUnrollMultiExit = true;
    UP.Runtime = true;
    // Limit unroll count.
    UP.DefaultUnrollRuntimeCount = 4;
    // Allow slightly more costly trip-count expansion to catch search loops
    // with pointer inductions.
    UP.SCEVExpansionBudget = 5;
    return;
  }

  // Enable runtime unrolling for in-order models
  // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
  // checking for that case, we can ensure that the default behaviour is
  // unchanged
  if (ST->getProcFamily() != AArch64Subtarget::Generic &&
      !ST->getSchedModel().isOutOfOrder()) {
    UP.Runtime = true;
    UP.Partial = true;
    UP.UnrollRemainder = true;
    UP.DefaultUnrollRuntimeCount = 4;

    UP.UnrollAndJam = true;
    UP.UnrollAndJamInnerLoopThreshold = 60;
  }

  // Force unrolling small loops can be very useful because of the branch
  // taken cost of the backedge.
  if (Cost < Aarch64ForceUnrollThreshold)
    UP.Force = true;
}
5485
/// No AArch64-specific peeling preferences; defer entirely to the base
/// implementation.
void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::PeelingPreferences &PP) const {
  BaseT::getPeelingPreferences(L, SE, PP);
}
5490
/// For NEON structured memory intrinsics, produce (or identify) a value of
/// \p ExpectedType equivalent to the memory contents the intrinsic touches.
/// For stN this rebuilds the stored struct from the intrinsic's value
/// operands (only when \p CanCreate allows emitting new IR); for ldN the
/// intrinsic's own result is returned if the type already matches. Returns
/// nullptr when no such value can be provided.
Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                                         Type *ExpectedType,
                                                         bool CanCreate) const {
  switch (Inst->getIntrinsicID()) {
  default:
    return nullptr;
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4: {
    // Create a struct type
    StructType *ST = dyn_cast<StructType>(Val: ExpectedType);
    if (!CanCreate || !ST)
      return nullptr;
    // The last argument is the pointer; the rest are the stored values.
    unsigned NumElts = Inst->arg_size() - 1;
    if (ST->getNumElements() != NumElts)
      return nullptr;
    // Every stored value must match the corresponding struct field type.
    for (unsigned i = 0, e = NumElts; i != e; ++i) {
      if (Inst->getArgOperand(i)->getType() != ST->getElementType(N: i))
        return nullptr;
    }
    // Pack the stored values into a struct via insertvalue chains.
    Value *Res = PoisonValue::get(T: ExpectedType);
    IRBuilder<> Builder(Inst);
    for (unsigned i = 0, e = NumElts; i != e; ++i) {
      Value *L = Inst->getArgOperand(i);
      Res = Builder.CreateInsertValue(Agg: Res, Val: L, Idxs: i);
    }
    return Res;
  }
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    if (Inst->getType() == ExpectedType)
      return Inst;
    return nullptr;
  }
}
5527
5528bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
5529 MemIntrinsicInfo &Info) const {
5530 switch (Inst->getIntrinsicID()) {
5531 default:
5532 break;
5533 case Intrinsic::aarch64_neon_ld2:
5534 case Intrinsic::aarch64_neon_ld3:
5535 case Intrinsic::aarch64_neon_ld4:
5536 Info.ReadMem = true;
5537 Info.WriteMem = false;
5538 Info.PtrVal = Inst->getArgOperand(i: 0);
5539 break;
5540 case Intrinsic::aarch64_neon_st2:
5541 case Intrinsic::aarch64_neon_st3:
5542 case Intrinsic::aarch64_neon_st4:
5543 Info.ReadMem = false;
5544 Info.WriteMem = true;
5545 Info.PtrVal = Inst->getArgOperand(i: Inst->arg_size() - 1);
5546 break;
5547 }
5548
5549 switch (Inst->getIntrinsicID()) {
5550 default:
5551 return false;
5552 case Intrinsic::aarch64_neon_ld2:
5553 case Intrinsic::aarch64_neon_st2:
5554 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5555 break;
5556 case Intrinsic::aarch64_neon_ld3:
5557 case Intrinsic::aarch64_neon_st3:
5558 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5559 break;
5560 case Intrinsic::aarch64_neon_ld4:
5561 case Intrinsic::aarch64_neon_st4:
5562 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5563 break;
5564 }
5565 return true;
5566}
5567
5568/// See if \p I should be considered for address type promotion. We check if \p
5569/// I is a sext with right type and used in memory accesses. If it used in a
5570/// "complex" getelementptr, we allow it to be promoted without finding other
5571/// sext instructions that sign extended the same initial value. A getelementptr
5572/// is considered as "complex" if it has more than 2 operands.
5573bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
5574 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5575 bool Considerable = false;
5576 AllowPromotionWithoutCommonHeader = false;
5577 if (!isa<SExtInst>(Val: &I))
5578 return false;
5579 Type *ConsideredSExtType =
5580 Type::getInt64Ty(C&: I.getParent()->getParent()->getContext());
5581 if (I.getType() != ConsideredSExtType)
5582 return false;
5583 // See if the sext is the one with the right type and used in at least one
5584 // GetElementPtrInst.
5585 for (const User *U : I.users()) {
5586 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(Val: U)) {
5587 Considerable = true;
5588 // A getelementptr is considered as "complex" if it has more than 2
5589 // operands. We will promote a SExt used in such complex GEP as we
5590 // expect some computation to be merged if they are done on 64 bits.
5591 if (GEPInst->getNumOperands() > 2) {
5592 AllowPromotionWithoutCommonHeader = true;
5593 break;
5594 }
5595 }
5596 }
5597 return Considerable;
5598}
5599
5600bool AArch64TTIImpl::isLegalToVectorizeReduction(
5601 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5602 if (!VF.isScalable())
5603 return true;
5604
5605 Type *Ty = RdxDesc.getRecurrenceType();
5606 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5607 return false;
5608
5609 switch (RdxDesc.getRecurrenceKind()) {
5610 case RecurKind::Sub:
5611 case RecurKind::AddChainWithSubs:
5612 case RecurKind::Add:
5613 case RecurKind::FAdd:
5614 case RecurKind::And:
5615 case RecurKind::Or:
5616 case RecurKind::Xor:
5617 case RecurKind::SMin:
5618 case RecurKind::SMax:
5619 case RecurKind::UMin:
5620 case RecurKind::UMax:
5621 case RecurKind::FMin:
5622 case RecurKind::FMax:
5623 case RecurKind::FMulAdd:
5624 case RecurKind::AnyOf:
5625 case RecurKind::FindLast:
5626 return true;
5627 default:
5628 return false;
5629 }
5630}
5631
5632InstructionCost
5633AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
5634 FastMathFlags FMF,
5635 TTI::TargetCostKind CostKind) const {
5636 // The code-generator is currently not able to handle scalable vectors
5637 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5638 // it. This change will be removed when code-generation for these types is
5639 // sufficiently reliable.
5640 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
5641 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
5642 return InstructionCost::getInvalid();
5643
5644 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5645
5646 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5647 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5648
5649 InstructionCost LegalizationCost = 0;
5650 if (LT.first > 1) {
5651 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: Ty->getContext());
5652 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5653 LegalizationCost = getIntrinsicInstrCost(ICA: Attrs, CostKind) * (LT.first - 1);
5654 }
5655
5656 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5657}
5658
5659InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
5660 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5661 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
5662 InstructionCost LegalizationCost = 0;
5663 if (LT.first > 1) {
5664 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: ValTy->getContext());
5665 LegalizationCost = getArithmeticInstrCost(Opcode, Ty: LegalVTy, CostKind);
5666 LegalizationCost *= LT.first - 1;
5667 }
5668
5669 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5670 assert(ISD && "Invalid opcode");
5671 // Add the final reduction cost for the legal horizontal reduction
5672 switch (ISD) {
5673 case ISD::ADD:
5674 case ISD::AND:
5675 case ISD::OR:
5676 case ISD::XOR:
5677 case ISD::FADD:
5678 return LegalizationCost + 2;
5679 default:
5680 return InstructionCost::getInvalid();
5681 }
5682}
5683
/// Cost of a vector arithmetic reduction. Handles ordered FP reductions,
/// dispatches scalable types to the SVE model, and uses per-MVT cost tables
/// for add/or/xor/and plus a faddp-chain model for fast-math fadd.
InstructionCost
AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                                           std::optional<FastMathFlags> FMF,
                                           TTI::TargetCostKind CostKind) const {
  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  if (auto *VTy = dyn_cast<ScalableVectorType>(Val: ValTy))
    if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
      return InstructionCost::getInvalid();

  // Ordered (strict FP) reductions cannot be reassociated into a tree, so
  // they are modeled separately from the fast-math paths below.
  if (TTI::requiresOrderedReduction(FMF)) {
    if (auto *FixedVTy = dyn_cast<FixedVectorType>(Val: ValTy)) {
      InstructionCost BaseCost =
          BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
      // Add on extra cost to reflect the extra overhead on some CPUs. We still
      // end up vectorizing for more computationally intensive loops.
      return BaseCost + FixedVTy->getNumElements();
    }

    // Scalable ordered reductions are only supported for fadd.
    if (Opcode != Instruction::FAdd)
      return InstructionCost::getInvalid();

    // Model a scalable ordered fadd as one scalar fadd per (max) element.
    auto *VTy = cast<ScalableVectorType>(Val: ValTy);
    InstructionCost Cost =
        getArithmeticInstrCost(Opcode, Ty: VTy->getScalarType(), CostKind);
    Cost *= getMaxNumElements(VF: VTy->getElementCount());
    return Cost;
  }

  if (isa<ScalableVectorType>(Val: ValTy))
    return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
  MVT MTy = LT.second;
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Horizontal adds can use the 'addv' instruction. We model the cost of these
  // instructions as twice a normal vector add, plus 1 for each legalization
  // step (LT.first). This is the only arithmetic vector reduction operation for
  // which we have an instruction.
  // OR, XOR and AND costs should match the codegen from:
  // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
  // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
  // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
  static const CostTblEntry CostTblNoPairwise[]{
      {.ISD: ISD::ADD, .Type: MVT::v8i8, .Cost: 2},
      {.ISD: ISD::ADD, .Type: MVT::v16i8, .Cost: 2},
      {.ISD: ISD::ADD, .Type: MVT::v4i16, .Cost: 2},
      {.ISD: ISD::ADD, .Type: MVT::v8i16, .Cost: 2},
      {.ISD: ISD::ADD, .Type: MVT::v2i32, .Cost: 2},
      {.ISD: ISD::ADD, .Type: MVT::v4i32, .Cost: 2},
      {.ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: 2},
      {.ISD: ISD::OR, .Type: MVT::v8i8, .Cost: 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
      {.ISD: ISD::OR, .Type: MVT::v16i8, .Cost: 7}, // ext + orr + same as v8i8
      {.ISD: ISD::OR, .Type: MVT::v4i16, .Cost: 4}, // fmov + orr_lsr + lsr + orr
      {.ISD: ISD::OR, .Type: MVT::v8i16, .Cost: 6}, // ext + orr + same as v4i16
      {.ISD: ISD::OR, .Type: MVT::v2i32, .Cost: 3}, // fmov + lsr + orr
      {.ISD: ISD::OR, .Type: MVT::v4i32, .Cost: 5}, // ext + orr + same as v2i32
      {.ISD: ISD::OR, .Type: MVT::v2i64, .Cost: 3}, // ext + orr + fmov
      {.ISD: ISD::XOR, .Type: MVT::v8i8, .Cost: 5}, // Same as above for or...
      {.ISD: ISD::XOR, .Type: MVT::v16i8, .Cost: 7},
      {.ISD: ISD::XOR, .Type: MVT::v4i16, .Cost: 4},
      {.ISD: ISD::XOR, .Type: MVT::v8i16, .Cost: 6},
      {.ISD: ISD::XOR, .Type: MVT::v2i32, .Cost: 3},
      {.ISD: ISD::XOR, .Type: MVT::v4i32, .Cost: 5},
      {.ISD: ISD::XOR, .Type: MVT::v2i64, .Cost: 3},
      {.ISD: ISD::AND, .Type: MVT::v8i8, .Cost: 5}, // Same as above for or...
      {.ISD: ISD::AND, .Type: MVT::v16i8, .Cost: 7},
      {.ISD: ISD::AND, .Type: MVT::v4i16, .Cost: 4},
      {.ISD: ISD::AND, .Type: MVT::v8i16, .Cost: 6},
      {.ISD: ISD::AND, .Type: MVT::v2i32, .Cost: 3},
      {.ISD: ISD::AND, .Type: MVT::v4i32, .Cost: 5},
      {.ISD: ISD::AND, .Type: MVT::v2i64, .Cost: 3},
  };
  switch (ISD) {
  default:
    break;
  case ISD::FADD:
    if (Type *EltTy = ValTy->getScalarType();
        // FIXME: For half types without fullfp16 support, this could extend and
        // use a fp32 faddp reduction but current codegen unrolls.
        MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
                           (EltTy->isHalfTy() && ST->hasFullFP16()))) {
      const unsigned NElts = MTy.getVectorNumElements();
      if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
          isPowerOf2_32(Value: NElts))
        // Reduction corresponding to series of fadd instructions is lowered to
        // series of faddp instructions. faddp has latency/throughput that
        // matches fadd instruction and hence, every faddp instruction can be
        // considered to have a relative cost = 1 with
        // CostKind = TCK_RecipThroughput.
        // An faddp will pairwise add vector elements, so the size of input
        // vector reduces by half every time, requiring
        // #(faddp instructions) = log2_32(NElts).
        return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(Value: NElts);
    }
    break;
  case ISD::ADD:
    if (const auto *Entry = CostTableLookup(Table: CostTblNoPairwise, ISD, Ty: MTy))
      return (LT.first - 1) + Entry->Cost;
    break;
  case ISD::XOR:
  case ISD::AND:
  case ISD::OR:
    const auto *Entry = CostTableLookup(Table: CostTblNoPairwise, ISD, Ty: MTy);
    if (!Entry)
      break;
    auto *ValVTy = cast<FixedVectorType>(Val: ValTy);
    // Only use the table when the original type is no wider than the legal
    // type (modulo splitting, charged below) and has a power-of-2 length.
    if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
        isPowerOf2_32(Value: ValVTy->getNumElements())) {
      InstructionCost ExtraCost = 0;
      if (LT.first != 1) {
        // Type needs to be split, so there is an extra cost of LT.first - 1
        // arithmetic ops.
        auto *Ty = FixedVectorType::get(ElementType: ValTy->getElementType(),
                                        NumElts: MTy.getVectorNumElements());
        ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
        ExtraCost *= LT.first - 1;
      }
      // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
      auto Cost = ValVTy->getElementType()->isIntegerTy(Bitwidth: 1) ? 2 : Entry->Cost;
      return Cost + ExtraCost;
    }
    break;
  }
  return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
}
5814
5815InstructionCost AArch64TTIImpl::getExtendedReductionCost(
5816 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
5817 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
5818 EVT VecVT = TLI->getValueType(DL, Ty: VecTy);
5819 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
5820
5821 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5822 VecVT.getSizeInBits() >= 64) {
5823 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VecTy);
5824
5825 // The legal cases are:
5826 // UADDLV 8/16/32->32
5827 // UADDLP 32->64
5828 unsigned RevVTSize = ResVT.getSizeInBits();
5829 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5830 RevVTSize <= 32) ||
5831 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5832 RevVTSize <= 32) ||
5833 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5834 RevVTSize <= 64))
5835 return (LT.first - 1) * 2 + 2;
5836 }
5837
5838 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: VecTy, FMF,
5839 CostKind);
5840}
5841
5842InstructionCost
5843AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
5844 Type *ResTy, VectorType *VecTy,
5845 TTI::TargetCostKind CostKind) const {
5846 EVT VecVT = TLI->getValueType(DL, Ty: VecTy);
5847 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
5848
5849 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
5850 RedOpcode == Instruction::Add) {
5851 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VecTy);
5852
5853 // The legal cases with dotprod are
5854 // UDOT 8->32
5855 // Which requires an additional uaddv to sum the i32 values.
5856 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5857 ResVT == MVT::i32)
5858 return LT.first + 2;
5859 }
5860
5861 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, Ty: VecTy,
5862 CostKind);
5863}
5864
// Return the cost of a TTI::SK_Splice shuffle on a scalable vector type,
// accounting for the promotion of predicate (i1) vectors and, for negative
// indices, the compare/select sequence used to build the splice predicate.
InstructionCost
AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index,
                              TTI::TargetCostKind CostKind) const {
  // All natively-supported scalable types lower a splice to a single
  // instruction (SVE SPLICE / EXT-like sequence), hence cost 1.
  static const CostTblEntry ShuffleTbl[] = {
      { .ISD: TTI::SK_Splice, .Type: MVT::nxv16i8,  .Cost: 1 },
      { .ISD: TTI::SK_Splice, .Type: MVT::nxv8i16,  .Cost: 1 },
      { .ISD: TTI::SK_Splice, .Type: MVT::nxv4i32,  .Cost: 1 },
      { .ISD: TTI::SK_Splice, .Type: MVT::nxv2i64,  .Cost: 1 },
      { .ISD: TTI::SK_Splice, .Type: MVT::nxv2f16,  .Cost: 1 },
      { .ISD: TTI::SK_Splice, .Type: MVT::nxv4f16,  .Cost: 1 },
      { .ISD: TTI::SK_Splice, .Type: MVT::nxv8f16,  .Cost: 1 },
      { .ISD: TTI::SK_Splice, .Type: MVT::nxv2bf16, .Cost: 1 },
      { .ISD: TTI::SK_Splice, .Type: MVT::nxv4bf16, .Cost: 1 },
      { .ISD: TTI::SK_Splice, .Type: MVT::nxv8bf16, .Cost: 1 },
      { .ISD: TTI::SK_Splice, .Type: MVT::nxv2f32,  .Cost: 1 },
      { .ISD: TTI::SK_Splice, .Type: MVT::nxv4f32,  .Cost: 1 },
      { .ISD: TTI::SK_Splice, .Type: MVT::nxv2f64,  .Cost: 1 },
  };

  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  if (Tp->getElementCount() == ElementCount::getScalable(MinVal: 1))
    return InstructionCost::getInvalid();

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp);
  Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: Tp->getContext());
  // i1 (predicate) splices are performed on a promoted integer type.
  EVT PromotedVT = LT.second.getScalarType() == MVT::i1
                       ? TLI->getPromotedVTForPredicate(VT: EVT(LT.second))
                       : LT.second;
  Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Context&: Tp->getContext());
  InstructionCost LegalizationCost = 0;
  // A negative index (splice from the end) needs an icmp+select sequence to
  // construct the splice predicate.
  if (Index < 0) {
    LegalizationCost =
        getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: PromotedVTy, CondTy: PromotedVTy,
                           VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind) +
        getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: PromotedVTy, CondTy: LegalVTy,
                           VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
  }

  // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
  // Cost performed on a promoted type.
  if (LT.second.getScalarType() == MVT::i1) {
    // Account for the zext to the promoted type and the trunc back to i1.
    LegalizationCost +=
        getCastInstrCost(Opcode: Instruction::ZExt, Dst: PromotedVTy, Src: LegalVTy,
                         CCH: TTI::CastContextHint::None, CostKind) +
        getCastInstrCost(Opcode: Instruction::Trunc, Dst: LegalVTy, Src: PromotedVTy,
                         CCH: TTI::CastContextHint::None, CostKind);
  }
  const auto *Entry =
      CostTableLookup(Table: ShuffleTbl, ISD: TTI::SK_Splice, Ty: PromotedVT.getSimpleVT());
  assert(Entry && "Illegal Type for Splice");
  LegalizationCost += Entry->Cost;
  // Scale by the number of parts the type is legalized into.
  return LegalizationCost * LT.first;
}
5921
// Return the cost of a partial reduction (e.g. a dot-product style
// sum-of-products folded into a narrower accumulator), or an invalid cost
// when the target cannot profitably lower the requested combination of
// opcode, extend kinds and types.
InstructionCost AArch64TTIImpl::getPartialReductionCost(
    unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
    ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
    TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
    TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
  InstructionCost Invalid = InstructionCost::getInvalid();

  // This cost model is only defined for reciprocal-throughput costing.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Invalid;

  // Fixed-width partial reductions need either SVE or NEON with dotprod.
  if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
      (!ST->isNeonAvailable() || !ST->hasDotProd()))
    return Invalid;

  // Only add/sub/fadd reductions of extended operands are supported.
  if ((Opcode != Instruction::Add && Opcode != Instruction::Sub &&
       Opcode != Instruction::FAdd) ||
      OpAExtend == TTI::PR_None)
    return Invalid;

  // Floating-point partial reductions are invalid if `reassoc` and `contract`
  // are not allowed.
  if (AccumType->isFloatingPointTy()) {
    assert(FMF && "Missing FastMathFlags for floating-point partial reduction");
    if (!FMF->allowReassoc() || !FMF->allowContract())
      return Invalid;
  } else {
    assert(!FMF &&
           "FastMathFlags only apply to floating-point partial reductions");
  }

  // BinOp, OpBExtend and InputTypeB must be all-present or all-absent.
  assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
         (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
         "Unexpected values for OpBExtend or InputTypeB");

  // We only support multiply binary operations for now, and for muls we
  // require the types being extended to be the same.
  if (BinOp && ((*BinOp != Instruction::Mul && *BinOp != Instruction::FMul) ||
                InputTypeA != InputTypeB))
    return Invalid;

  // Mixed sign/zero extends map to USDOT, which requires the i8mm feature.
  bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
  if (IsUSDot && !ST->hasMatMulInt8())
    return Invalid;

  // The accumulator must have fewer (wider) lanes than the input; otherwise
  // this is not a partial reduction at all.
  unsigned Ratio =
      AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
  if (VF.getKnownMinValue() <= Ratio)
    return Invalid;

  VectorType *InputVectorType = VectorType::get(ElementType: InputTypeA, EC: VF);
  VectorType *AccumVectorType =
      VectorType::get(ElementType: AccumType, EC: VF.divideCoefficientBy(RHS: Ratio));
  // We don't yet support all kinds of legalization.
  auto TC = TLI->getTypeConversion(Context&: AccumVectorType->getContext(),
                                   VT: EVT::getEVT(Ty: AccumVectorType));
  switch (TC.first) {
  default:
    return Invalid;
  case TargetLowering::TypeLegal:
  case TargetLowering::TypePromoteInteger:
  case TargetLowering::TypeSplitVector:
    // The legalised type (e.g. after splitting) must be legal too.
    if (TLI->getTypeAction(Context&: AccumVectorType->getContext(), VT: TC.second) !=
        TargetLowering::TypeLegal)
      return Invalid;
    break;
  }

  std::pair<InstructionCost, MVT> AccumLT =
      getTypeLegalizationCost(Ty: AccumVectorType);
  std::pair<InstructionCost, MVT> InputLT =
      getTypeLegalizationCost(Ty: InputVectorType);

  // Base cost: one basic op per legalized part of the input vector.
  InstructionCost Cost = InputLT.first * TTI::TCC_Basic;

  // The sub/negation cannot be folded into the operands of
  // ISD::PARTIAL_REDUCE_*MLA, so make the cost more expensive.
  if (Opcode == Instruction::Sub)
    Cost += 8;

  // Prefer using full types by costing half-full input types as more expensive.
  if (TypeSize::isKnownLT(LHS: InputVectorType->getPrimitiveSizeInBits(),
                          RHS: TypeSize::getScalable(MinimumSize: 128)))
    // FIXME: This can be removed after the cost of the extends are folded into
    // the dot-product expression in VPlan, after landing:
    // https://github.com/llvm/llvm-project/pull/147302
    Cost *= 2;

  if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
    // i16 -> i64 is natively supported for udot/sdot
    if (AccumLT.second.getScalarType() == MVT::i64 &&
        InputLT.second.getScalarType() == MVT::i16)
      return Cost;
    // i16 -> i32 is natively supported with SVE2p1
    if (AccumLT.second.getScalarType() == MVT::i32 &&
        InputLT.second.getScalarType() == MVT::i16 &&
        (ST->hasSVE2p1() || ST->hasSME2()))
      return Cost;
    // i8 -> i64 is supported with an extra level of extends
    if (AccumLT.second.getScalarType() == MVT::i64 &&
        InputLT.second.getScalarType() == MVT::i8)
      // FIXME: This cost should probably be a little higher, e.g. Cost + 2
      // because it requires two extra extends on the inputs. But if we'd change
      // that now, a regular reduction would be cheaper because the costs of
      // the extends in the IR are still counted. This can be fixed
      // after https://github.com/llvm/llvm-project/pull/147302 has landed.
      return Cost;
  }

  // i8 -> i32 is natively supported for udot/sdot/usdot, both for NEON and SVE.
  if (ST->isSVEorStreamingSVEAvailable() ||
      (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() &&
       ST->hasDotProd())) {
    if (AccumLT.second.getScalarType() == MVT::i32 &&
        InputLT.second.getScalarType() == MVT::i8)
      return Cost;
  }

  // f16 -> f32 is natively supported for fdot
  if (Opcode == Instruction::FAdd && (ST->hasSME2() || ST->hasSVE2p1())) {
    if (AccumLT.second.getScalarType() == MVT::f32 &&
        InputLT.second.getScalarType() == MVT::f16 &&
        AccumLT.second.getVectorMinNumElements() == 4 &&
        InputLT.second.getVectorMinNumElements() == 8)
      return Cost;
    // Floating-point types aren't promoted, so expanding the partial reduction
    // is more expensive.
    return Cost + 20;
  }

  // Add additional cost for the extends that would need to be inserted.
  return Cost + 2;
}
6055
// Return the cost of a vector shuffle of kind Kind from SrcTy to DstTy.
// The routine tries, in order: splitting illegal masks into legal-width
// sub-shuffles, special-cased free/cheap patterns (identity, segmented dup,
// broadcast-load, perfect shuffles, zip/uzp/trn/rev), a per-kind cost table,
// SVE splice costing, and aligned subvector inserts, before falling back to
// the generic implementation.
InstructionCost
AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
                               VectorType *SrcTy, ArrayRef<int> Mask,
                               TTI::TargetCostKind CostKind, int Index,
                               VectorType *SubTp, ArrayRef<const Value *> Args,
                               const Instruction *CxtI) const {
  assert((Mask.empty() || DstTy->isScalableTy() ||
          Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
         "Expected the Mask to match the return size if given");
  assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
         "Expected the same scalar types");
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy);

  // If we have a Mask, and the LT is being legalized somehow, split the Mask
  // into smaller vectors and sum the cost of each shuffle.
  if (!Mask.empty() && isa<FixedVectorType>(Val: SrcTy) && LT.second.isVector() &&
      LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
      SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
      Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
    // Check for LD3/LD4 instructions, which are represented in llvm IR as
    // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
    // but we model it with a cost of LT.first so that LD3/LD4 have a higher
    // cost than just the load.
    if (Args.size() >= 1 && isa<LoadInst>(Val: Args[0]) &&
        (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 3) ||
         ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 4)))
      return std::max<InstructionCost>(a: 1, b: LT.first / 4);

    // Check for ST3/ST4 instructions, which are represented in llvm IR as
    // store(interleaving-shuffle). The shuffle cost could potentially be free,
    // but we model it with a cost of LT.first so that ST3/ST4 have a higher
    // cost than just the store.
    if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(Val: *CxtI->user_begin()) &&
        (ShuffleVectorInst::isInterleaveMask(
             Mask, Factor: 4, NumInputElts: SrcTy->getElementCount().getKnownMinValue() * 2) ||
         ShuffleVectorInst::isInterleaveMask(
             Mask, Factor: 3, NumInputElts: SrcTy->getElementCount().getKnownMinValue() * 2)))
      return LT.first;

    // Otherwise cost the mask chunk-by-chunk: split it into NumVecs chunks of
    // the legal width and sum the cost of shuffling each chunk.
    unsigned TpNumElts = Mask.size();
    unsigned LTNumElts = LT.second.getVectorNumElements();
    unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
    VectorType *NTp = VectorType::get(ElementType: SrcTy->getScalarType(),
                                      EC: LT.second.getVectorElementCount());
    InstructionCost Cost;
    // Cache of already-costed (Source1, Source2, sub-mask) chunks so repeated
    // identical sub-shuffles are only costed once.
    std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
        PreviousCosts;
    for (unsigned N = 0; N < NumVecs; N++) {
      SmallVector<int> NMask;
      // Split the existing mask into chunks of size LTNumElts. Track the source
      // sub-vectors to ensure the result has at most 2 inputs.
      unsigned Source1 = -1U, Source2 = -1U;
      unsigned NumSources = 0;
      for (unsigned E = 0; E < LTNumElts; E++) {
        int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
                                                      : PoisonMaskElem;
        if (MaskElt < 0) {
          NMask.push_back(Elt: PoisonMaskElem);
          continue;
        }

        // Calculate which source from the input this comes from and whether it
        // is new to us.
        unsigned Source = MaskElt / LTNumElts;
        if (NumSources == 0) {
          Source1 = Source;
          NumSources = 1;
        } else if (NumSources == 1 && Source != Source1) {
          Source2 = Source;
          NumSources = 2;
        } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
          NumSources++;
        }

        // Add to the new mask. For the NumSources>2 case these are not correct,
        // but are only used for the modular lane number.
        if (Source == Source1)
          NMask.push_back(Elt: MaskElt % LTNumElts);
        else if (Source == Source2)
          NMask.push_back(Elt: MaskElt % LTNumElts + LTNumElts);
        else
          NMask.push_back(Elt: MaskElt % LTNumElts);
      }
      // Check if we have already generated this sub-shuffle, which means we
      // will have already generated the output. For example a <16 x i32> splat
      // will be the same sub-splat 4 times, which only needs to be generated
      // once and reused.
      auto Result =
          PreviousCosts.insert(x: {std::make_tuple(args&: Source1, args&: Source2, args&: NMask), 0});
      // Check if it was already in the map (already costed).
      if (!Result.second)
        continue;
      // If the sub-mask has at most 2 input sub-vectors then re-cost it using
      // getShuffleCost. If not then cost it using the worst case as the number
      // of element moves into a new vector.
      InstructionCost NCost =
          NumSources <= 2
              ? getShuffleCost(Kind: NumSources <= 1 ? TTI::SK_PermuteSingleSrc
                                                 : TTI::SK_PermuteTwoSrc,
                               DstTy: NTp, SrcTy: NTp, Mask: NMask, CostKind, Index: 0, SubTp: nullptr, Args,
                               CxtI)
              : LTNumElts;
      Result.first->second = NCost;
      Cost += NCost;
    }
    return Cost;
  }

  // Refine the shuffle kind from the mask (e.g. a PermuteTwoSrc that is
  // really a select/broadcast/transpose) before the pattern checks below.
  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp);
  bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
  // A subvector extract can be implemented with a NEON/SVE ext (or trivial
  // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
  // This currently only handles low or high extracts to prevent SLP vectorizer
  // regressions.
  // Note that SVE's ext instruction is destructive, but it can be fused with
  // a movprfx to act like a constructive instruction.
  if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
    if (LT.second.getFixedSizeInBits() >= 128 &&
        cast<FixedVectorType>(Val: SubTp)->getNumElements() ==
            LT.second.getVectorNumElements() / 2) {
      if (Index == 0)
        return 0;
      if (Index == (int)LT.second.getVectorNumElements() / 2)
        return 1;
    }
    Kind = TTI::SK_PermuteSingleSrc;
  }
  // FIXME: This was added to keep the costs equal when adding DstTys. Update
  // the code to handle length-changing shuffles.
  if (Kind == TTI::SK_InsertSubvector) {
    LT = getTypeLegalizationCost(Ty: DstTy);
    SrcTy = DstTy;
  }

  // Check for identity masks, which we can treat as free for both fixed and
  // scalable vector paths.
  if (!Mask.empty() && LT.second.isFixedLengthVector() &&
      (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
      all_of(Range: enumerate(First&: Mask), P: [](const auto &M) {
        return M.value() < 0 || M.value() == (int)M.index();
      }))
    return 0;

  // Segmented shuffle matching: masks that repeat per 128-bit segment can use
  // the SVE2p1/SME2p1 DUPQ or a segment-broadcasting mov.
  if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(Val: SrcTy) &&
      !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
      SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
          RHS: AArch64::SVEBitsPerBlock)) {

    FixedVectorType *VTy = cast<FixedVectorType>(Val: SrcTy);
    unsigned Segments =
        VTy->getPrimitiveSizeInBits() / AArch64::SVEBitsPerBlock;
    unsigned SegmentElts = VTy->getNumElements() / Segments;

    // dupq zd.t, zn.t[idx]
    if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
        ST->isSVEorStreamingSVEAvailable() &&
        isDUPQMask(Mask, Segments, SegmentSize: SegmentElts))
      return LT.first;

    // mov zd.q, vn
    if (ST->isSVEorStreamingSVEAvailable() &&
        isDUPFirstSegmentMask(Mask, Segments, SegmentSize: SegmentElts))
      return LT.first;
  }

  // Check for broadcast loads, which are supported by the LD1R instruction.
  // In terms of code-size, the shuffle vector is free when a load + dup get
  // folded into a LD1R. That's what we check and return here. For performance
  // and reciprocal throughput, a LD1R is not completely free. In this case, we
  // return the cost for the broadcast below (i.e. 1 for most/all types), so
  // that we model the load + dup sequence slightly higher because LD1R is a
  // high latency instruction.
  if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
    bool IsLoad = !Args.empty() && isa<LoadInst>(Val: Args[0]);
    if (IsLoad && LT.second.isVector() &&
        isLegalBroadcastLoad(ElementTy: SrcTy->getElementType(),
                             NumElements: LT.second.getVectorElementCount()))
      return 0;
  }

  // If we have 4 elements for the shuffle and a Mask, get the cost straight
  // from the perfect shuffle tables.
  if (Mask.size() == 4 &&
      SrcTy->getElementCount() == ElementCount::getFixed(MinVal: 4) &&
      (SrcTy->getScalarSizeInBits() == 16 ||
       SrcTy->getScalarSizeInBits() == 32) &&
      all_of(Range&: Mask, P: [](int E) { return E < 8; }))
    return getPerfectShuffleCost(M: Mask);

  // Check for other shuffles that are not SK_ kinds but we have native
  // instructions for, for example ZIP and UZP.
  unsigned Unused;
  if (LT.second.isFixedLengthVector() &&
      LT.second.getVectorNumElements() == Mask.size() &&
      (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc ||
       // Discrepancies between isTRNMask and ShuffleVectorInst::isTransposeMask
       // mean that we can end up with shuffles that satisfy isTRNMask, but end
       // up labelled as TTI::SK_InsertSubvector. (e.g. {2, 0}).
       Kind == TTI::SK_InsertSubvector) &&
      (isZIPMask(M: Mask, NumElts: LT.second.getVectorNumElements(), WhichResultOut&: Unused, OperandOrderOut&: Unused) ||
       isTRNMask(M: Mask, NumElts: LT.second.getVectorNumElements(), WhichResultOut&: Unused, OperandOrderOut&: Unused) ||
       isUZPMask(M: Mask, NumElts: LT.second.getVectorNumElements(), WhichResultOut&: Unused) ||
       isREVMask(M: Mask, EltSize: LT.second.getScalarSizeInBits(),
                 NumElts: LT.second.getVectorNumElements(), BlockSize: 16) ||
       isREVMask(M: Mask, EltSize: LT.second.getScalarSizeInBits(),
                 NumElts: LT.second.getVectorNumElements(), BlockSize: 32) ||
       isREVMask(M: Mask, EltSize: LT.second.getScalarSizeInBits(),
                 NumElts: LT.second.getVectorNumElements(), BlockSize: 64) ||
       // Check for non-zero lane splats
       all_of(Range: drop_begin(RangeOrContainer&: Mask),
              P: [&Mask](int M) { return M < 0 || M == Mask[0]; })))
    return 1;

  // Table-driven costs for the remaining standard shuffle kinds.
  if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
      Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
      Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
    static const CostTblEntry ShuffleTbl[] = {
        // Broadcast shuffle kinds can be performed with 'dup'.
        {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i8, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::v16i8, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i16, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i16, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i32, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i32, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i64, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::v4f16, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::v8f16, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::v4bf16, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::v8bf16, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::v2f32, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::v4f32, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::v2f64, .Cost: 1},
        // Transpose shuffle kinds can be performed with 'trn1/trn2' and
        // 'zip1/zip2' instructions.
        {.ISD: TTI::SK_Transpose, .Type: MVT::v8i8, .Cost: 1},
        {.ISD: TTI::SK_Transpose, .Type: MVT::v16i8, .Cost: 1},
        {.ISD: TTI::SK_Transpose, .Type: MVT::v4i16, .Cost: 1},
        {.ISD: TTI::SK_Transpose, .Type: MVT::v8i16, .Cost: 1},
        {.ISD: TTI::SK_Transpose, .Type: MVT::v2i32, .Cost: 1},
        {.ISD: TTI::SK_Transpose, .Type: MVT::v4i32, .Cost: 1},
        {.ISD: TTI::SK_Transpose, .Type: MVT::v2i64, .Cost: 1},
        {.ISD: TTI::SK_Transpose, .Type: MVT::v4f16, .Cost: 1},
        {.ISD: TTI::SK_Transpose, .Type: MVT::v8f16, .Cost: 1},
        {.ISD: TTI::SK_Transpose, .Type: MVT::v4bf16, .Cost: 1},
        {.ISD: TTI::SK_Transpose, .Type: MVT::v8bf16, .Cost: 1},
        {.ISD: TTI::SK_Transpose, .Type: MVT::v2f32, .Cost: 1},
        {.ISD: TTI::SK_Transpose, .Type: MVT::v4f32, .Cost: 1},
        {.ISD: TTI::SK_Transpose, .Type: MVT::v2f64, .Cost: 1},
        // Select shuffle kinds.
        // TODO: handle vXi8/vXi16.
        {.ISD: TTI::SK_Select, .Type: MVT::v2i32, .Cost: 1}, // mov.
        {.ISD: TTI::SK_Select, .Type: MVT::v4i32, .Cost: 2}, // rev+trn (or similar).
        {.ISD: TTI::SK_Select, .Type: MVT::v2i64, .Cost: 1}, // mov.
        {.ISD: TTI::SK_Select, .Type: MVT::v2f32, .Cost: 1}, // mov.
        {.ISD: TTI::SK_Select, .Type: MVT::v4f32, .Cost: 2}, // rev+trn (or similar).
        {.ISD: TTI::SK_Select, .Type: MVT::v2f64, .Cost: 1}, // mov.
        // PermuteSingleSrc shuffle kinds.
        {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i32, .Cost: 1}, // mov.
        {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i32, .Cost: 3}, // perfectshuffle worst case.
        {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i64, .Cost: 1}, // mov.
        {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f32, .Cost: 1}, // mov.
        {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f32, .Cost: 3}, // perfectshuffle worst case.
        {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f64, .Cost: 1}, // mov.
        {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i16, .Cost: 3}, // perfectshuffle worst case.
        {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f16, .Cost: 3}, // perfectshuffle worst case.
        {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4bf16, .Cost: 3}, // same
        {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i16, .Cost: 8}, // constpool + load + tbl
        {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f16, .Cost: 8}, // constpool + load + tbl
        {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8bf16, .Cost: 8}, // constpool + load + tbl
        {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i8, .Cost: 8}, // constpool + load + tbl
        {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i8, .Cost: 8}, // constpool + load + tbl
        // Reverse can be lowered with `rev`.
        {.ISD: TTI::SK_Reverse, .Type: MVT::v2i32, .Cost: 1}, // REV64
        {.ISD: TTI::SK_Reverse, .Type: MVT::v4i32, .Cost: 2}, // REV64; EXT
        {.ISD: TTI::SK_Reverse, .Type: MVT::v2i64, .Cost: 1}, // EXT
        {.ISD: TTI::SK_Reverse, .Type: MVT::v2f32, .Cost: 1}, // REV64
        {.ISD: TTI::SK_Reverse, .Type: MVT::v4f32, .Cost: 2}, // REV64; EXT
        {.ISD: TTI::SK_Reverse, .Type: MVT::v2f64, .Cost: 1}, // EXT
        {.ISD: TTI::SK_Reverse, .Type: MVT::v8f16, .Cost: 2}, // REV64; EXT
        {.ISD: TTI::SK_Reverse, .Type: MVT::v8bf16, .Cost: 2}, // REV64; EXT
        {.ISD: TTI::SK_Reverse, .Type: MVT::v8i16, .Cost: 2}, // REV64; EXT
        {.ISD: TTI::SK_Reverse, .Type: MVT::v16i8, .Cost: 2}, // REV64; EXT
        {.ISD: TTI::SK_Reverse, .Type: MVT::v4f16, .Cost: 1}, // REV64
        {.ISD: TTI::SK_Reverse, .Type: MVT::v4bf16, .Cost: 1}, // REV64
        {.ISD: TTI::SK_Reverse, .Type: MVT::v4i16, .Cost: 1}, // REV64
        {.ISD: TTI::SK_Reverse, .Type: MVT::v8i8, .Cost: 1}, // REV64
        // Splice can all be lowered as `ext`.
        {.ISD: TTI::SK_Splice, .Type: MVT::v2i32, .Cost: 1},
        {.ISD: TTI::SK_Splice, .Type: MVT::v4i32, .Cost: 1},
        {.ISD: TTI::SK_Splice, .Type: MVT::v2i64, .Cost: 1},
        {.ISD: TTI::SK_Splice, .Type: MVT::v2f32, .Cost: 1},
        {.ISD: TTI::SK_Splice, .Type: MVT::v4f32, .Cost: 1},
        {.ISD: TTI::SK_Splice, .Type: MVT::v2f64, .Cost: 1},
        {.ISD: TTI::SK_Splice, .Type: MVT::v8f16, .Cost: 1},
        {.ISD: TTI::SK_Splice, .Type: MVT::v8bf16, .Cost: 1},
        {.ISD: TTI::SK_Splice, .Type: MVT::v8i16, .Cost: 1},
        {.ISD: TTI::SK_Splice, .Type: MVT::v16i8, .Cost: 1},
        {.ISD: TTI::SK_Splice, .Type: MVT::v4f16, .Cost: 1},
        {.ISD: TTI::SK_Splice, .Type: MVT::v4bf16, .Cost: 1},
        {.ISD: TTI::SK_Splice, .Type: MVT::v4i16, .Cost: 1},
        {.ISD: TTI::SK_Splice, .Type: MVT::v8i8, .Cost: 1},
        // Broadcast shuffle kinds for scalable vectors
        {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv16i8, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8i16, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4i32, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2i64, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f16, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4f16, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8f16, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2bf16, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4bf16, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8bf16, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f32, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4f32, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f64, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv16i1, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8i1, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4i1, .Cost: 1},
        {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2i1, .Cost: 1},
        // Handle the cases for vector.reverse with scalable vectors
        {.ISD: TTI::SK_Reverse, .Type: MVT::nxv16i8, .Cost: 1},
        {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8i16, .Cost: 1},
        {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4i32, .Cost: 1},
        {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2i64, .Cost: 1},
        {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f16, .Cost: 1},
        {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4f16, .Cost: 1},
        {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8f16, .Cost: 1},
        {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2bf16, .Cost: 1},
        {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4bf16, .Cost: 1},
        {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8bf16, .Cost: 1},
        {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f32, .Cost: 1},
        {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4f32, .Cost: 1},
        {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f64, .Cost: 1},
        {.ISD: TTI::SK_Reverse, .Type: MVT::nxv16i1, .Cost: 1},
        {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8i1, .Cost: 1},
        {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4i1, .Cost: 1},
        {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2i1, .Cost: 1},
    };
    if (const auto *Entry = CostTableLookup(Table: ShuffleTbl, ISD: Kind, Ty: LT.second))
      return LT.first * Entry->Cost;
  }

  // Scalable splices are costed separately (predicate promotion etc.).
  if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Val: SrcTy))
    return getSpliceCost(Tp: SrcTy, Index, CostKind);

  // Inserting a subvector can often be done with either a D, S or H register
  // move, so long as the inserted vector is "aligned".
  if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
      LT.second.getSizeInBits() <= 128 && SubTp) {
    std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(Ty: SubTp);
    if (SubLT.second.isVector()) {
      int NumElts = LT.second.getVectorNumElements();
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
    }
  }

  // Restore optimal kind.
  if (IsExtractSubvector)
    Kind = TTI::SK_ExtractSubvector;
  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
                               Args, CxtI);
}
6421
6422static bool containsDecreasingPointers(Loop *TheLoop,
6423 PredicatedScalarEvolution *PSE,
6424 const DominatorTree &DT) {
6425 const auto &Strides = DenseMap<Value *, const SCEV *>();
6426 for (BasicBlock *BB : TheLoop->blocks()) {
6427 // Scan the instructions in the block and look for addresses that are
6428 // consecutive and decreasing.
6429 for (Instruction &I : *BB) {
6430 if (isa<LoadInst>(Val: &I) || isa<StoreInst>(Val: &I)) {
6431 Value *Ptr = getLoadStorePointerOperand(V: &I);
6432 Type *AccessTy = getLoadStoreType(I: &I);
6433 if (getPtrStride(PSE&: *PSE, AccessTy, Ptr, Lp: TheLoop, DT, StridesMap: Strides,
6434 /*Assume=*/true, /*ShouldCheckWrap=*/false)
6435 .value_or(u: 0) < 0)
6436 return true;
6437 }
6438 }
6439 }
6440 return false;
6441}
6442
6443bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
6444 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6445 return SVEPreferFixedOverScalableIfEqualCost;
6446 // For cases like post-LTO vectorization, when we eventually know the trip
6447 // count, epilogue with fixed-width vectorization can be deleted if the trip
6448 // count is less than the epilogue iterations. That's why we prefer
6449 // fixed-width vectorization in epilogue in case of equal costs.
6450 if (IsEpilogue)
6451 return true;
6452 return ST->useFixedOverScalableIfEqualCost();
6453}
6454
// Minimum main-loop VF at which epilogue vectorization is considered;
// delegated to the subtarget so it can be tuned per CPU.
unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
  return ST->getEpilogueVectorizationMinVF();
}
6458
/// Decide whether tail-folding (a predicated main loop) is preferable to
/// emitting an epilogue loop for the candidate loop described by \p TFI.
bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
  // Tail-folding relies on predication, which requires SVE.
  if (!ST->hasSVE())
    return false;

  // We don't currently support vectorisation with interleaving for SVE - with
  // such loops we're better off not using tail-folding. This gives us a chance
  // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
  if (TFI->IAI->hasGroups())
    return false;

  // Accumulate the set of tail-folding features this loop needs.
  TailFoldingOpts Required = TailFoldingOpts::Disabled;
  if (TFI->LVL->getReductionVars().size())
    Required |= TailFoldingOpts::Reductions;
  if (TFI->LVL->getFixedOrderRecurrences().size())
    Required |= TailFoldingOpts::Recurrences;

  // We call this to discover whether any load/store pointers in the loop have
  // negative strides. This will require extra work to reverse the loop
  // predicate, which may be expensive.
  if (containsDecreasingPointers(TheLoop: TFI->LVL->getLoop(),
                                 PSE: TFI->LVL->getPredicatedScalarEvolution(),
                                 DT: *TFI->LVL->getDominatorTree()))
    Required |= TailFoldingOpts::Reverse;
  // A loop needing none of the special features still needs basic ("simple")
  // tail-folding support to be enabled.
  if (Required == TailFoldingOpts::Disabled)
    Required |= TailFoldingOpts::Simple;

  // Bail out if the effective tail-folding options (command-line location or
  // subtarget defaults) do not cover everything this loop requires.
  if (!TailFoldingOptionLoc.satisfies(DefaultBits: ST->getSVETailFoldingDefaultOpts(),
                                      Required))
    return false;

  // Don't tail-fold for tight loops where we would be better off interleaving
  // with an unpredicated loop.
  unsigned NumInsns = 0;
  for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
    NumInsns += BB->sizeWithoutDebug();
  }

  // We expect 4 of these to be a IV PHI, IV add, IV compare and branch.
  return NumInsns >= SVETailFoldInsnThreshold;
}
6499
6500InstructionCost
6501AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6502 StackOffset BaseOffset, bool HasBaseReg,
6503 int64_t Scale, unsigned AddrSpace) const {
6504 // Scaling factors are not free at all.
6505 // Operands | Rt Latency
6506 // -------------------------------------------
6507 // Rt, [Xn, Xm] | 4
6508 // -------------------------------------------
6509 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6510 // Rt, [Xn, Wm, <extend> #imm] |
6511 TargetLoweringBase::AddrMode AM;
6512 AM.BaseGV = BaseGV;
6513 AM.BaseOffs = BaseOffset.getFixed();
6514 AM.HasBaseReg = HasBaseReg;
6515 AM.Scale = Scale;
6516 AM.ScalableOffset = BaseOffset.getScalable();
6517 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AS: AddrSpace))
6518 // Scale represents reg2 * scale, thus account for 1 if
6519 // it is not equal to 0 or 1.
6520 return AM.Scale != 0 && AM.Scale != 1;
6521 return InstructionCost::getInvalid();
6522}
6523
6524bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(
6525 const Instruction *I) const {
6526 if (EnableOrLikeSelectOpt) {
6527 // For the binary operators (e.g. or) we need to be more careful than
6528 // selects, here we only transform them if they are already at a natural
6529 // break point in the code - the end of a block with an unconditional
6530 // terminator.
6531 if (I->getOpcode() == Instruction::Or &&
6532 isa<UncondBrInst>(Val: I->getNextNode()))
6533 return true;
6534
6535 if (I->getOpcode() == Instruction::Add ||
6536 I->getOpcode() == Instruction::Sub)
6537 return true;
6538 }
6539 return BaseT::shouldTreatInstructionLikeSelect(I);
6540}
6541
6542bool AArch64TTIImpl::isLSRCostLess(
6543 const TargetTransformInfo::LSRCost &C1,
6544 const TargetTransformInfo::LSRCost &C2) const {
6545 // AArch64 specific here is adding the number of instructions to the
6546 // comparison (though not as the first consideration, as some targets do)
6547 // along with changing the priority of the base additions.
6548 // TODO: Maybe a more nuanced tradeoff between instruction count
6549 // and number of registers? To be investigated at a later date.
6550 if (EnableLSRCostOpt)
6551 return std::tie(args: C1.NumRegs, args: C1.Insns, args: C1.NumBaseAdds, args: C1.AddRecCost,
6552 args: C1.NumIVMuls, args: C1.ScaleCost, args: C1.ImmCost, args: C1.SetupCost) <
6553 std::tie(args: C2.NumRegs, args: C2.Insns, args: C2.NumBaseAdds, args: C2.AddRecCost,
6554 args: C2.NumIVMuls, args: C2.ScaleCost, args: C2.ImmCost, args: C2.SetupCost);
6555
6556 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
6557}
6558
6559static bool isSplatShuffle(Value *V) {
6560 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Val: V))
6561 return all_equal(Range: Shuf->getShuffleMask());
6562 return false;
6563}
6564
/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
/// or upper half of the vector elements.
static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
                                     bool AllowSplat = false) {
  // Scalable types can't be extract shuffle vectors.
  if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
    return false;

  // True when HalfV's type is exactly half the bit-width of FullV's type.
  auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
    auto *FullTy = FullV->getType();
    auto *HalfTy = HalfV->getType();
    return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
           2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
  };

  // True when FullV has twice as many vector elements as HalfV.
  auto extractHalf = [](Value *FullV, Value *HalfV) {
    auto *FullVT = cast<FixedVectorType>(Val: FullV->getType());
    auto *HalfVT = cast<FixedVectorType>(Val: HalfV->getType());
    return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
  };

  // Both operands must be single-source shuffles (second vector undef).
  ArrayRef<int> M1, M2;
  Value *S1Op1 = nullptr, *S2Op1 = nullptr;
  if (!match(V: Op1, P: m_Shuffle(v1: m_Value(V&: S1Op1), v2: m_Undef(), mask: m_Mask(M1))) ||
      !match(V: Op2, P: m_Shuffle(v1: m_Value(V&: S2Op1), v2: m_Undef(), mask: m_Mask(M2))))
    return false;

  // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
  // it is not checked as an extract below.
  if (AllowSplat && isSplatShuffle(V: Op1))
    S1Op1 = nullptr;
  if (AllowSplat && isSplatShuffle(V: Op2))
    S2Op1 = nullptr;

  // Check that the operands are half as wide as the result and we extract
  // half of the elements of the input vectors.
  if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
      (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
    return false;

  // Check the mask extracts either the lower or upper half of vector
  // elements.
  int M1Start = 0;
  int M2Start = 0;
  int NumElements = cast<FixedVectorType>(Val: Op1->getType())->getNumElements() * 2;
  if ((S1Op1 &&
       !ShuffleVectorInst::isExtractSubvectorMask(Mask: M1, NumSrcElts: NumElements, Index&: M1Start)) ||
      (S2Op1 &&
       !ShuffleVectorInst::isExtractSubvectorMask(Mask: M2, NumSrcElts: NumElements, Index&: M2Start)))
    return false;

  // Each extract must begin at element 0 (low half) or the midpoint (high
  // half) of the source vector.
  if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
      (M2Start != 0 && M2Start != (NumElements / 2)))
    return false;
  // When both operands are genuine extracts they must take the same half.
  if (S1Op1 && S2Op1 && M1Start != M2Start)
    return false;

  return true;
}
6624
6625/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6626/// of the vector elements.
6627static bool areExtractExts(Value *Ext1, Value *Ext2) {
6628 auto areExtDoubled = [](Instruction *Ext) {
6629 return Ext->getType()->getScalarSizeInBits() ==
6630 2 * Ext->getOperand(i: 0)->getType()->getScalarSizeInBits();
6631 };
6632
6633 if (!match(V: Ext1, P: m_ZExtOrSExt(Op: m_Value())) ||
6634 !match(V: Ext2, P: m_ZExtOrSExt(Op: m_Value())) ||
6635 !areExtDoubled(cast<Instruction>(Val: Ext1)) ||
6636 !areExtDoubled(cast<Instruction>(Val: Ext2)))
6637 return false;
6638
6639 return true;
6640}
6641
/// Check if Op could be used with vmull_high_p64 intrinsic.
static bool isOperandOfVmullHighP64(Value *Op) {
  Value *VectorOperand = nullptr;
  ConstantInt *ElementIndex = nullptr;
  // The operand must be an extractelement of lane 1 from a two-element fixed
  // vector, i.e. the high half that vmull_high_p64 consumes.
  return match(V: Op, P: m_ExtractElt(Val: m_Value(V&: VectorOperand),
                                 Idx: m_ConstantInt(CI&: ElementIndex))) &&
         ElementIndex->getValue() == 1 &&
         isa<FixedVectorType>(Val: VectorOperand->getType()) &&
         cast<FixedVectorType>(Val: VectorOperand->getType())->getNumElements() == 2;
}
6652
/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
  // Both operands must individually satisfy the high-lane extract shape.
  return isOperandOfVmullHighP64(Op: Op1) && isOperandOfVmullHighP64(Op: Op2);
}
6657
6658static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
6659 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6660 auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptrs);
6661 if (!GEP || GEP->getNumOperands() != 2)
6662 return false;
6663
6664 Value *Base = GEP->getOperand(i_nocapture: 0);
6665 Value *Offsets = GEP->getOperand(i_nocapture: 1);
6666
6667 // We only care about scalar_base+vector_offsets.
6668 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6669 return false;
6670
6671 // Sink extends that would allow us to use 32-bit offset vectors.
6672 if (isa<SExtInst>(Val: Offsets) || isa<ZExtInst>(Val: Offsets)) {
6673 auto *OffsetsInst = cast<Instruction>(Val: Offsets);
6674 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6675 OffsetsInst->getOperand(i: 0)->getType()->getScalarSizeInBits() <= 32)
6676 Ops.push_back(Elt: &GEP->getOperandUse(i: 1));
6677 }
6678
6679 // Sink the GEP.
6680 return true;
6681}
6682
/// We want to sink following cases:
/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
  // A bare vscale is sinkable as-is; no extra operands need queueing.
  if (match(V: Op, P: m_VScale()))
    return true;
  // (mul|shl) vscale, imm: also queue the vscale operand itself.
  if (match(V: Op, P: m_Shl(L: m_VScale(), R: m_ConstantInt())) ||
      match(V: Op, P: m_Mul(L: m_VScale(), R: m_ConstantInt()))) {
    Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: 0));
    return true;
  }
  // (mul|shl) zext(vscale), imm: queue the vscale (through the zext) and the
  // zext itself, so the whole chain sinks together.
  if (match(V: Op, P: m_Shl(L: m_ZExt(Op: m_VScale()), R: m_ConstantInt())) ||
      match(V: Op, P: m_Mul(L: m_ZExt(Op: m_VScale()), R: m_ConstantInt()))) {
    Value *ZExtOp = cast<Instruction>(Val: Op)->getOperand(i: 0);
    Ops.push_back(Elt: &cast<Instruction>(Val: ZExtOp)->getOperandUse(i: 0));
    Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: 0));
    return true;
  }
  return false;
}
6703
/// Return true if \p Op matches a floating-point negation pattern.
static bool isFNeg(Value *Op) { return match(V: Op, P: m_FNeg(X: m_Value())); }
6705
6706/// Check if sinking \p I's operands to I's basic block is profitable, because
6707/// the operands can be folded into a target instruction, e.g.
6708/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
6709bool AArch64TTIImpl::isProfitableToSinkOperands(
6710 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6711 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: I)) {
6712 switch (II->getIntrinsicID()) {
6713 case Intrinsic::aarch64_neon_smull:
6714 case Intrinsic::aarch64_neon_umull:
6715 if (areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1),
6716 /*AllowSplat=*/true)) {
6717 Ops.push_back(Elt: &II->getOperandUse(i: 0));
6718 Ops.push_back(Elt: &II->getOperandUse(i: 1));
6719 return true;
6720 }
6721 [[fallthrough]];
6722
6723 case Intrinsic::fma:
6724 case Intrinsic::fmuladd:
6725 if (isa<VectorType>(Val: I->getType()) &&
6726 cast<VectorType>(Val: I->getType())->getElementType()->isHalfTy() &&
6727 !ST->hasFullFP16())
6728 return false;
6729
6730 if (isFNeg(Op: II->getOperand(i_nocapture: 0)))
6731 Ops.push_back(Elt: &II->getOperandUse(i: 0));
6732 if (isFNeg(Op: II->getOperand(i_nocapture: 1)))
6733 Ops.push_back(Elt: &II->getOperandUse(i: 1));
6734
6735 [[fallthrough]];
6736 case Intrinsic::aarch64_neon_sqdmull:
6737 case Intrinsic::aarch64_neon_sqdmulh:
6738 case Intrinsic::aarch64_neon_sqrdmulh:
6739 // Sink splats for index lane variants
6740 if (isSplatShuffle(V: II->getOperand(i_nocapture: 0)))
6741 Ops.push_back(Elt: &II->getOperandUse(i: 0));
6742 if (isSplatShuffle(V: II->getOperand(i_nocapture: 1)))
6743 Ops.push_back(Elt: &II->getOperandUse(i: 1));
6744 return !Ops.empty();
6745 case Intrinsic::aarch64_neon_fmlal:
6746 case Intrinsic::aarch64_neon_fmlal2:
6747 case Intrinsic::aarch64_neon_fmlsl:
6748 case Intrinsic::aarch64_neon_fmlsl2:
6749 // Sink splats for index lane variants
6750 if (isSplatShuffle(V: II->getOperand(i_nocapture: 1)))
6751 Ops.push_back(Elt: &II->getOperandUse(i: 1));
6752 if (isSplatShuffle(V: II->getOperand(i_nocapture: 2)))
6753 Ops.push_back(Elt: &II->getOperandUse(i: 2));
6754 return !Ops.empty();
6755 case Intrinsic::aarch64_sve_ptest_first:
6756 case Intrinsic::aarch64_sve_ptest_last:
6757 if (auto *IIOp = dyn_cast<IntrinsicInst>(Val: II->getOperand(i_nocapture: 0)))
6758 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6759 Ops.push_back(Elt: &II->getOperandUse(i: 0));
6760 return !Ops.empty();
6761 case Intrinsic::aarch64_sme_write_horiz:
6762 case Intrinsic::aarch64_sme_write_vert:
6763 case Intrinsic::aarch64_sme_writeq_horiz:
6764 case Intrinsic::aarch64_sme_writeq_vert: {
6765 auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 1));
6766 if (!Idx || Idx->getOpcode() != Instruction::Add)
6767 return false;
6768 Ops.push_back(Elt: &II->getOperandUse(i: 1));
6769 return true;
6770 }
6771 case Intrinsic::aarch64_sme_read_horiz:
6772 case Intrinsic::aarch64_sme_read_vert:
6773 case Intrinsic::aarch64_sme_readq_horiz:
6774 case Intrinsic::aarch64_sme_readq_vert:
6775 case Intrinsic::aarch64_sme_ld1b_vert:
6776 case Intrinsic::aarch64_sme_ld1h_vert:
6777 case Intrinsic::aarch64_sme_ld1w_vert:
6778 case Intrinsic::aarch64_sme_ld1d_vert:
6779 case Intrinsic::aarch64_sme_ld1q_vert:
6780 case Intrinsic::aarch64_sme_st1b_vert:
6781 case Intrinsic::aarch64_sme_st1h_vert:
6782 case Intrinsic::aarch64_sme_st1w_vert:
6783 case Intrinsic::aarch64_sme_st1d_vert:
6784 case Intrinsic::aarch64_sme_st1q_vert:
6785 case Intrinsic::aarch64_sme_ld1b_horiz:
6786 case Intrinsic::aarch64_sme_ld1h_horiz:
6787 case Intrinsic::aarch64_sme_ld1w_horiz:
6788 case Intrinsic::aarch64_sme_ld1d_horiz:
6789 case Intrinsic::aarch64_sme_ld1q_horiz:
6790 case Intrinsic::aarch64_sme_st1b_horiz:
6791 case Intrinsic::aarch64_sme_st1h_horiz:
6792 case Intrinsic::aarch64_sme_st1w_horiz:
6793 case Intrinsic::aarch64_sme_st1d_horiz:
6794 case Intrinsic::aarch64_sme_st1q_horiz: {
6795 auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 3));
6796 if (!Idx || Idx->getOpcode() != Instruction::Add)
6797 return false;
6798 Ops.push_back(Elt: &II->getOperandUse(i: 3));
6799 return true;
6800 }
6801 case Intrinsic::aarch64_neon_pmull:
6802 if (!areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1)))
6803 return false;
6804 Ops.push_back(Elt: &II->getOperandUse(i: 0));
6805 Ops.push_back(Elt: &II->getOperandUse(i: 1));
6806 return true;
6807 case Intrinsic::aarch64_neon_pmull64:
6808 if (!areOperandsOfVmullHighP64(Op1: II->getArgOperand(i: 0),
6809 Op2: II->getArgOperand(i: 1)))
6810 return false;
6811 Ops.push_back(Elt: &II->getArgOperandUse(i: 0));
6812 Ops.push_back(Elt: &II->getArgOperandUse(i: 1));
6813 return true;
6814 case Intrinsic::masked_gather:
6815 if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 0), Ops))
6816 return false;
6817 Ops.push_back(Elt: &II->getArgOperandUse(i: 0));
6818 return true;
6819 case Intrinsic::masked_scatter:
6820 if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 1), Ops))
6821 return false;
6822 Ops.push_back(Elt: &II->getArgOperandUse(i: 1));
6823 return true;
6824 default:
6825 return false;
6826 }
6827 }
6828
6829 auto ShouldSinkCondition = [](Value *Cond,
6830 SmallVectorImpl<Use *> &Ops) -> bool {
6831 if (!isa<IntrinsicInst>(Val: Cond))
6832 return false;
6833 auto *II = dyn_cast<IntrinsicInst>(Val: Cond);
6834 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
6835 !isa<ScalableVectorType>(Val: II->getOperand(i_nocapture: 0)->getType()))
6836 return false;
6837 if (isa<CmpInst>(Val: II->getOperand(i_nocapture: 0)))
6838 Ops.push_back(Elt: &II->getOperandUse(i: 0));
6839 return true;
6840 };
6841
6842 switch (I->getOpcode()) {
6843 case Instruction::GetElementPtr:
6844 case Instruction::Add:
6845 case Instruction::Sub:
6846 // Sink vscales closer to uses for better isel
6847 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6848 if (shouldSinkVScale(Op: I->getOperand(i: Op), Ops)) {
6849 Ops.push_back(Elt: &I->getOperandUse(i: Op));
6850 return true;
6851 }
6852 }
6853 break;
6854 case Instruction::Select: {
6855 if (!ShouldSinkCondition(I->getOperand(i: 0), Ops))
6856 return false;
6857
6858 Ops.push_back(Elt: &I->getOperandUse(i: 0));
6859 return true;
6860 }
6861 case Instruction::UncondBr:
6862 return false;
6863 case Instruction::CondBr: {
6864 if (!ShouldSinkCondition(cast<CondBrInst>(Val: I)->getCondition(), Ops))
6865 return false;
6866
6867 Ops.push_back(Elt: &I->getOperandUse(i: 0));
6868 return true;
6869 }
6870 case Instruction::FMul:
6871 // fmul with contract flag can be combined with fadd into fma.
6872 // Sinking fneg into this block enables fmls pattern.
6873 if (cast<FPMathOperator>(Val: I)->hasAllowContract()) {
6874 if (isFNeg(Op: I->getOperand(i: 0)))
6875 Ops.push_back(Elt: &I->getOperandUse(i: 0));
6876 if (isFNeg(Op: I->getOperand(i: 1)))
6877 Ops.push_back(Elt: &I->getOperandUse(i: 1));
6878 }
6879 break;
6880
6881 default:
6882 break;
6883 }
6884
6885 if (!I->getType()->isVectorTy())
6886 return !Ops.empty();
6887
6888 switch (I->getOpcode()) {
6889 case Instruction::Sub:
6890 case Instruction::Add: {
6891 if (!areExtractExts(Ext1: I->getOperand(i: 0), Ext2: I->getOperand(i: 1)))
6892 return false;
6893
6894 // If the exts' operands extract either the lower or upper elements, we
6895 // can sink them too.
6896 auto Ext1 = cast<Instruction>(Val: I->getOperand(i: 0));
6897 auto Ext2 = cast<Instruction>(Val: I->getOperand(i: 1));
6898 if (areExtractShuffleVectors(Op1: Ext1->getOperand(i: 0), Op2: Ext2->getOperand(i: 0))) {
6899 Ops.push_back(Elt: &Ext1->getOperandUse(i: 0));
6900 Ops.push_back(Elt: &Ext2->getOperandUse(i: 0));
6901 }
6902
6903 Ops.push_back(Elt: &I->getOperandUse(i: 0));
6904 Ops.push_back(Elt: &I->getOperandUse(i: 1));
6905
6906 return true;
6907 }
6908 case Instruction::Or: {
6909 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
6910 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
6911 if (ST->hasNEON()) {
6912 Instruction *OtherAnd, *IA, *IB;
6913 Value *MaskValue;
6914 // MainAnd refers to And instruction that has 'Not' as one of its operands
6915 if (match(V: I, P: m_c_Or(L: m_OneUse(SubPattern: m_Instruction(I&: OtherAnd)),
6916 R: m_OneUse(SubPattern: m_c_And(L: m_OneUse(SubPattern: m_Not(V: m_Value(V&: MaskValue))),
6917 R: m_Instruction(I&: IA)))))) {
6918 if (match(V: OtherAnd,
6919 P: m_c_And(L: m_Specific(V: MaskValue), R: m_Instruction(I&: IB)))) {
6920 Instruction *MainAnd = I->getOperand(i: 0) == OtherAnd
6921 ? cast<Instruction>(Val: I->getOperand(i: 1))
6922 : cast<Instruction>(Val: I->getOperand(i: 0));
6923
6924 // Both Ands should be in same basic block as Or
6925 if (I->getParent() != MainAnd->getParent() ||
6926 I->getParent() != OtherAnd->getParent())
6927 return false;
6928
6929 // Non-mask operands of both Ands should also be in same basic block
6930 if (I->getParent() != IA->getParent() ||
6931 I->getParent() != IB->getParent())
6932 return false;
6933
6934 Ops.push_back(
6935 Elt: &MainAnd->getOperandUse(i: MainAnd->getOperand(i: 0) == IA ? 1 : 0));
6936 Ops.push_back(Elt: &I->getOperandUse(i: 0));
6937 Ops.push_back(Elt: &I->getOperandUse(i: 1));
6938
6939 return true;
6940 }
6941 }
6942 }
6943
6944 return false;
6945 }
6946 case Instruction::Mul: {
6947 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
6948 auto *Ty = cast<VectorType>(Val: V->getType());
6949 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6950 if (Ty->isScalableTy())
6951 return false;
6952
6953 // Indexed variants of Mul exist for i16 and i32 element types only.
6954 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
6955 };
6956
6957 int NumZExts = 0, NumSExts = 0;
6958 for (auto &Op : I->operands()) {
6959 // Make sure we are not already sinking this operand
6960 if (any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op; }))
6961 continue;
6962
6963 if (match(V: &Op, P: m_ZExtOrSExt(Op: m_Value()))) {
6964 auto *Ext = cast<Instruction>(Val&: Op);
6965 auto *ExtOp = Ext->getOperand(i: 0);
6966 if (isSplatShuffle(V: ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
6967 Ops.push_back(Elt: &Ext->getOperandUse(i: 0));
6968 Ops.push_back(Elt: &Op);
6969
6970 if (isa<SExtInst>(Val: Ext)) {
6971 NumSExts++;
6972 } else {
6973 NumZExts++;
6974 // A zext(a) is also a sext(zext(a)), if we take more than 2 steps.
6975 if (Ext->getOperand(i: 0)->getType()->getScalarSizeInBits() * 2 <
6976 I->getType()->getScalarSizeInBits())
6977 NumSExts++;
6978 }
6979
6980 continue;
6981 }
6982
6983 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Val&: Op);
6984 if (!Shuffle)
6985 continue;
6986
6987 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
6988 // operand and the s/zext can help create indexed s/umull. This is
6989 // especially useful to prevent i64 mul being scalarized.
6990 if (isSplatShuffle(V: Shuffle) &&
6991 match(V: Shuffle->getOperand(i_nocapture: 0), P: m_ZExtOrSExt(Op: m_Value()))) {
6992 Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0));
6993 Ops.push_back(Elt: &Op);
6994 if (match(V: Shuffle->getOperand(i_nocapture: 0), P: m_SExt(Op: m_Value())))
6995 NumSExts++;
6996 else
6997 NumZExts++;
6998 continue;
6999 }
7000
7001 Value *ShuffleOperand = Shuffle->getOperand(i_nocapture: 0);
7002 InsertElementInst *Insert = dyn_cast<InsertElementInst>(Val: ShuffleOperand);
7003 if (!Insert)
7004 continue;
7005
7006 Instruction *OperandInstr = dyn_cast<Instruction>(Val: Insert->getOperand(i_nocapture: 1));
7007 if (!OperandInstr)
7008 continue;
7009
7010 ConstantInt *ElementConstant =
7011 dyn_cast<ConstantInt>(Val: Insert->getOperand(i_nocapture: 2));
7012 // Check that the insertelement is inserting into element 0
7013 if (!ElementConstant || !ElementConstant->isZero())
7014 continue;
7015
7016 unsigned Opcode = OperandInstr->getOpcode();
7017 if (Opcode == Instruction::SExt)
7018 NumSExts++;
7019 else if (Opcode == Instruction::ZExt)
7020 NumZExts++;
7021 else {
7022 // If we find that the top bits are known 0, then we can sink and allow
7023 // the backend to generate a umull.
7024 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
7025 APInt UpperMask = APInt::getHighBitsSet(numBits: Bitwidth, hiBitsSet: Bitwidth / 2);
7026 if (!MaskedValueIsZero(V: OperandInstr, Mask: UpperMask, SQ: DL))
7027 continue;
7028 NumZExts++;
7029 }
7030
7031 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
7032 // the And, just to hoist it again back to the load.
7033 if (!match(V: OperandInstr, P: m_And(L: m_Load(Op: m_Value()), R: m_Value())))
7034 Ops.push_back(Elt: &Insert->getOperandUse(i: 1));
7035 Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0));
7036 Ops.push_back(Elt: &Op);
7037 }
7038
7039 // It is profitable to sink if we found two of the same type of extends.
7040 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
7041 return true;
7042
7043 // Otherwise, see if we should sink splats for indexed variants.
7044 if (!ShouldSinkSplatForIndexedVariant(I))
7045 return false;
7046
7047 Ops.clear();
7048 if (isSplatShuffle(V: I->getOperand(i: 0)))
7049 Ops.push_back(Elt: &I->getOperandUse(i: 0));
7050 if (isSplatShuffle(V: I->getOperand(i: 1)))
7051 Ops.push_back(Elt: &I->getOperandUse(i: 1));
7052
7053 return !Ops.empty();
7054 }
7055 case Instruction::FMul: {
7056 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
7057 if (I->getType()->isScalableTy())
7058 return !Ops.empty();
7059
7060 if (cast<VectorType>(Val: I->getType())->getElementType()->isHalfTy() &&
7061 !ST->hasFullFP16())
7062 return !Ops.empty();
7063
7064 // Sink splats for index lane variants
7065 if (isSplatShuffle(V: I->getOperand(i: 0)))
7066 Ops.push_back(Elt: &I->getOperandUse(i: 0));
7067 if (isSplatShuffle(V: I->getOperand(i: 1)))
7068 Ops.push_back(Elt: &I->getOperandUse(i: 1));
7069 return !Ops.empty();
7070 }
7071 default:
7072 return false;
7073 }
7074 return false;
7075}
7076