1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "AArch64TargetTransformInfo.h"
10#include "AArch64ExpandImm.h"
11#include "AArch64PerfectShuffle.h"
12#include "AArch64SMEAttributes.h"
13#include "MCTargetDesc/AArch64AddressingModes.h"
14#include "llvm/ADT/DenseMap.h"
15#include "llvm/Analysis/LoopInfo.h"
16#include "llvm/Analysis/TargetTransformInfo.h"
17#include "llvm/CodeGen/BasicTTIImpl.h"
18#include "llvm/CodeGen/CostTable.h"
19#include "llvm/CodeGen/TargetLowering.h"
20#include "llvm/IR/DerivedTypes.h"
21#include "llvm/IR/IntrinsicInst.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
24#include "llvm/IR/PatternMatch.h"
25#include "llvm/Support/Debug.h"
26#include "llvm/TargetParser/AArch64TargetParser.h"
27#include "llvm/Transforms/InstCombine/InstCombiner.h"
28#include "llvm/Transforms/Utils/UnrollLoop.h"
29#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
// Workaround toggle for loop unrolling interacting badly with the Falkor
// hardware prefetcher; on by default. NOTE(review): semantics inferred from
// the option name -- confirm against the unrolling-preferences code.
static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(Val: true), cl::Hidden);

// When a fixed-width and a scalable vectorization plan cost the same,
// presumably prefer the fixed-width one -- TODO confirm at the use site.
static cl::opt<bool> SVEPreferFixedOverScalableIfEqualCost(
    "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);

// Extra cost (in abstract cost units) charged for an SVE gather.
static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(Val: 10),
                                           cl::Hidden);

// Extra cost (in abstract cost units) charged for an SVE scatter.
static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                            cl::init(Val: 10), cl::Hidden);

// Instruction-count threshold related to SVE tail-folding decisions.
// NOTE(review): exact comparison semantics live at the use site -- verify.
static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
                                                  cl::init(Val: 15), cl::Hidden);

// Extra cost charged for NEON strided accesses whose stride is not a
// compile-time constant.
static cl::opt<unsigned>
    NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(Val: 10),
                               cl::Hidden);

static cl::opt<unsigned> CallPenaltyChangeSM(
    "call-penalty-sm-change", cl::init(Val: 5), cl::Hidden,
    cl::desc(
        "Penalty of calling a function that requires a change to PSTATE.SM"));

static cl::opt<unsigned> InlineCallPenaltyChangeSM(
    "inline-call-penalty-sm-change", cl::init(Val: 10), cl::Hidden,
    cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));

// Enables treating OR-like patterns as selects for cost purposes; on by
// default. NOTE(review): inferred from name -- confirm at the use site.
static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
                                           cl::init(Val: true), cl::Hidden);

// Enables the AArch64-specific LSR cost adjustments; on by default.
static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
                                      cl::init(Val: true), cl::Hidden);

// A complete guess as to a reasonable cost.
static cl::opt<unsigned>
    BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(Val: 8), cl::Hidden,
                    cl::desc("The cost of a histcnt instruction"));

static cl::opt<unsigned> DMBLookaheadThreshold(
    "dmb-lookahead-threshold", cl::init(Val: 10), cl::Hidden,
    cl::desc("The number of instructions to search for a redundant dmb"));

static cl::opt<int> Aarch64ForceUnrollThreshold(
    "aarch64-force-unroll-threshold", cl::init(Val: 0), cl::Hidden,
    cl::desc("Threshold for forced unrolling of small loops in AArch64"));
83
84namespace {
85class TailFoldingOption {
86 // These bitfields will only ever be set to something non-zero in operator=,
87 // when setting the -sve-tail-folding option. This option should always be of
88 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
89 // InitialBits is one of (disabled|all|simple). EnableBits represents
90 // additional flags we're enabling, and DisableBits for those flags we're
91 // disabling. The default flag is tracked in the variable NeedsDefault, since
92 // at the time of setting the option we may not know what the default value
93 // for the CPU is.
94 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
95 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
96 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
97
98 // This value needs to be initialised to true in case the user does not
99 // explicitly set the -sve-tail-folding option.
100 bool NeedsDefault = true;
101
102 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
103
104 void setNeedsDefault(bool V) { NeedsDefault = V; }
105
106 void setEnableBit(TailFoldingOpts Bit) {
107 EnableBits |= Bit;
108 DisableBits &= ~Bit;
109 }
110
111 void setDisableBit(TailFoldingOpts Bit) {
112 EnableBits &= ~Bit;
113 DisableBits |= Bit;
114 }
115
116 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
117 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
118
119 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
120 "Initial bits should only include one of "
121 "(disabled|all|simple|default)");
122 Bits = NeedsDefault ? DefaultBits : InitialBits;
123 Bits |= EnableBits;
124 Bits &= ~DisableBits;
125
126 return Bits;
127 }
128
129 void reportError(std::string Opt) {
130 errs() << "invalid argument '" << Opt
131 << "' to -sve-tail-folding=; the option should be of the form\n"
132 " (disabled|all|default|simple)[+(reductions|recurrences"
133 "|reverse|noreductions|norecurrences|noreverse)]\n";
134 report_fatal_error(reason: "Unrecognised tail-folding option");
135 }
136
137public:
138
139 void operator=(const std::string &Val) {
140 // If the user explicitly sets -sve-tail-folding= then treat as an error.
141 if (Val.empty()) {
142 reportError(Opt: "");
143 return;
144 }
145
146 // Since the user is explicitly setting the option we don't automatically
147 // need the default unless they require it.
148 setNeedsDefault(false);
149
150 SmallVector<StringRef, 4> TailFoldTypes;
151 StringRef(Val).split(A&: TailFoldTypes, Separator: '+', MaxSplit: -1, KeepEmpty: false);
152
153 unsigned StartIdx = 1;
154 if (TailFoldTypes[0] == "disabled")
155 setInitialBits(TailFoldingOpts::Disabled);
156 else if (TailFoldTypes[0] == "all")
157 setInitialBits(TailFoldingOpts::All);
158 else if (TailFoldTypes[0] == "default")
159 setNeedsDefault(true);
160 else if (TailFoldTypes[0] == "simple")
161 setInitialBits(TailFoldingOpts::Simple);
162 else {
163 StartIdx = 0;
164 setInitialBits(TailFoldingOpts::Disabled);
165 }
166
167 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
168 if (TailFoldTypes[I] == "reductions")
169 setEnableBit(TailFoldingOpts::Reductions);
170 else if (TailFoldTypes[I] == "recurrences")
171 setEnableBit(TailFoldingOpts::Recurrences);
172 else if (TailFoldTypes[I] == "reverse")
173 setEnableBit(TailFoldingOpts::Reverse);
174 else if (TailFoldTypes[I] == "noreductions")
175 setDisableBit(TailFoldingOpts::Reductions);
176 else if (TailFoldTypes[I] == "norecurrences")
177 setDisableBit(TailFoldingOpts::Recurrences);
178 else if (TailFoldTypes[I] == "noreverse")
179 setDisableBit(TailFoldingOpts::Reverse);
180 else
181 reportError(Opt: Val);
182 }
183 }
184
185 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
186 return (getBits(DefaultBits) & Required) == Required;
187 }
188};
189} // namespace
190
// Backing storage for -sve-tail-folding; cl::location below binds the parsed
// option value to this object.
TailFoldingOption TailFoldingOptionLoc;

static cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
    "sve-tail-folding",
    cl::desc(
        "Control the use of vectorisation using tail-folding for SVE where the"
        " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
        "\ndisabled      (Initial) No loop types will vectorize using "
        "tail-folding"
        "\ndefault       (Initial) Uses the default tail-folding settings for "
        "the target CPU"
        "\nall           (Initial) All legal loop types will vectorize using "
        "tail-folding"
        "\nsimple        (Initial) Use tail-folding for simple loops (not "
        "reductions or recurrences)"
        "\nreductions    Use tail-folding for loops containing reductions"
        "\nnoreductions  Inverse of above"
        "\nrecurrences   Use tail-folding for loops containing fixed order "
        "recurrences"
        "\nnorecurrences Inverse of above"
        "\nreverse       Use tail-folding for loops requiring reversed "
        "predicates"
        "\nnoreverse     Inverse of above"),
    cl::location(L&: TailFoldingOptionLoc));

// Experimental option that will only be fully functional when the
// code-generator is changed to use SVE instead of NEON for all fixed-width
// operations.
static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
    "enable-fixedwidth-autovec-in-streaming-mode", cl::init(Val: false), cl::Hidden);

// Experimental option that will only be fully functional when the cost-model
// and code-generator have been changed to avoid using scalable vector
// instructions that are not legal in streaming SVE mode.
static cl::opt<bool> EnableScalableAutovecInStreamingMode(
    "enable-scalable-autovec-in-streaming-mode", cl::init(Val: false), cl::Hidden);
227
228static bool isSMEABIRoutineCall(const CallInst &CI,
229 const AArch64TargetLowering &TLI) {
230 const auto *F = CI.getCalledFunction();
231 return F &&
232 SMEAttrs(F->getName(), TLI.getRuntimeLibcallsInfo()).isSMEABIRoutine();
233}
234
235/// Returns true if the function has explicit operations that can only be
236/// lowered using incompatible instructions for the selected mode. This also
237/// returns true if the function F may use or modify ZA state.
238static bool hasPossibleIncompatibleOps(const Function *F,
239 const AArch64TargetLowering &TLI) {
240 for (const BasicBlock &BB : *F) {
241 for (const Instruction &I : BB) {
242 // Be conservative for now and assume that any call to inline asm or to
243 // intrinsics could could result in non-streaming ops (e.g. calls to
244 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
245 // all native LLVM instructions can be lowered to compatible instructions.
246 if (isa<CallInst>(Val: I) && !I.isDebugOrPseudoInst() &&
247 (cast<CallInst>(Val: I).isInlineAsm() || isa<IntrinsicInst>(Val: I) ||
248 isSMEABIRoutineCall(CI: cast<CallInst>(Val: I), TLI)))
249 return true;
250 }
251 }
252 return false;
253}
254
255static void extractAttrFeatures(const Function &F, const AArch64TTIImpl *TTI,
256 SmallVectorImpl<StringRef> &Features) {
257 StringRef AttributeStr =
258 TTI->isMultiversionedFunction(F) ? "fmv-features" : "target-features";
259 StringRef FeatureStr = F.getFnAttribute(Kind: AttributeStr).getValueAsString();
260 FeatureStr.split(A&: Features, Separator: ",");
261}
262
263APInt AArch64TTIImpl::getFeatureMask(const Function &F) const {
264 SmallVector<StringRef, 8> Features;
265 extractAttrFeatures(F, TTI: this, Features);
266 return AArch64::getCpuSupportsMask(Features);
267}
268
269APInt AArch64TTIImpl::getPriorityMask(const Function &F) const {
270 SmallVector<StringRef, 8> Features;
271 extractAttrFeatures(F, TTI: this, Features);
272 return AArch64::getFMVPriority(Features);
273}
274
275bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const {
276 return F.hasFnAttribute(Kind: "fmv-features");
277}
278
// Features that encode restrictions rather than capabilities. Their bits are
// inverted (XOR) before the inlining feature-subset check, so that e.g. a
// "+execute-only" callee may be inlined into a caller without that feature,
// but not the other way round.
const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
    AArch64::FeatureExecuteOnly,
};
282
283bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
284 const Function *Callee) const {
285 SMECallAttrs CallAttrs(*Caller, *Callee);
286
287 // Never inline a function explicitly marked as being streaming,
288 // into a non-streaming function. Assume it was marked as streaming
289 // for a reason.
290 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
291 CallAttrs.callee().hasStreamingInterfaceOrBody())
292 return false;
293
294 // When inlining, we should consider the body of the function, not the
295 // interface.
296 if (CallAttrs.callee().hasStreamingBody()) {
297 CallAttrs.callee().set(M: SMEAttrs::SM_Compatible, Enable: false);
298 CallAttrs.callee().set(M: SMEAttrs::SM_Enabled, Enable: true);
299 }
300
301 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
302 return false;
303
304 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
305 CallAttrs.requiresPreservingZT0() ||
306 CallAttrs.requiresPreservingAllZAState()) {
307 if (hasPossibleIncompatibleOps(F: Callee, TLI: *getTLI()))
308 return false;
309 }
310
311 const TargetMachine &TM = getTLI()->getTargetMachine();
312 const FeatureBitset &CallerBits =
313 TM.getSubtargetImpl(*Caller)->getFeatureBits();
314 const FeatureBitset &CalleeBits =
315 TM.getSubtargetImpl(*Callee)->getFeatureBits();
316 // Adjust the feature bitsets by inverting some of the bits. This is needed
317 // for target features that represent restrictions rather than capabilities,
318 // for example a "+execute-only" callee can be inlined into a caller without
319 // "+execute-only", but not vice versa.
320 FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
321 FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
322
323 return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
324}
325
326bool AArch64TTIImpl::areTypesABICompatible(const Function *Caller,
327 const Function *Callee,
328 ArrayRef<Type *> Types) const {
329 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
330 return false;
331
332 // We need to ensure that argument promotion does not attempt to promote
333 // pointers to fixed-length vector types larger than 128 bits like
334 // <8 x float> (and pointers to aggregate types which have such fixed-length
335 // vector type members) into the values of the pointees. Such vector types
336 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
337 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
338 // types can be safely treated as 128-bit NEON types and they cannot be
339 // distinguished in IR.
340 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Range&: Types, P: [](Type *Ty) {
341 auto FVTy = dyn_cast<FixedVectorType>(Val: Ty);
342 return FVTy &&
343 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
344 }))
345 return false;
346
347 return true;
348}
349
350unsigned
351AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
352 unsigned DefaultCallPenalty) const {
353 // This function calculates a penalty for executing Call in F.
354 //
355 // There are two ways this function can be called:
356 // (1) F:
357 // call from F -> G (the call here is Call)
358 //
359 // For (1), Call.getCaller() == F, so it will always return a high cost if
360 // a streaming-mode change is required (thus promoting the need to inline the
361 // function)
362 //
363 // (2) F:
364 // call from F -> G (the call here is not Call)
365 // G:
366 // call from G -> H (the call here is Call)
367 //
368 // For (2), if after inlining the body of G into F the call to H requires a
369 // streaming-mode change, and the call to G from F would also require a
370 // streaming-mode change, then there is benefit to do the streaming-mode
371 // change only once and avoid inlining of G into F.
372
373 SMEAttrs FAttrs(*F);
374 SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
375
376 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
377 if (F == Call.getCaller()) // (1)
378 return CallPenaltyChangeSM * DefaultCallPenalty;
379 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
380 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
381 }
382
383 return DefaultCallPenalty;
384}
385
386bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
387 TargetTransformInfo::RegisterKind K) const {
388 assert(K != TargetTransformInfo::RGK_Scalar);
389
390 if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable())
391 return true;
392
393 return K == TargetTransformInfo::RGK_ScalableVector &&
394 ST->isSVEorStreamingSVEAvailable() &&
395 !ST->disableMaximizeScalableBandwidth();
396}
397
398/// Calculate the cost of materializing a 64-bit value. This helper
399/// method might only calculate a fraction of a larger immediate. Therefore it
400/// is valid to return a cost of ZERO.
401InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) const {
402 // Check if the immediate can be encoded within an instruction.
403 if (Val == 0 || AArch64_AM::isLogicalImmediate(imm: Val, regSize: 64))
404 return 0;
405
406 if (Val < 0)
407 Val = ~Val;
408
409 // Calculate how many moves we will need to materialize this constant.
410 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
411 AArch64_IMM::expandMOVImm(Imm: Val, BitSize: 64, Insn);
412 return Insn.size();
413}
414
415/// Calculate the cost of materializing the given constant.
416InstructionCost
417AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
418 TTI::TargetCostKind CostKind) const {
419 assert(Ty->isIntegerTy());
420
421 unsigned BitSize = Ty->getPrimitiveSizeInBits();
422 if (BitSize == 0)
423 return ~0U;
424
425 // Sign-extend all constants to a multiple of 64-bit.
426 APInt ImmVal = Imm;
427 if (BitSize & 0x3f)
428 ImmVal = Imm.sext(width: (BitSize + 63) & ~0x3fU);
429
430 // Split the constant into 64-bit chunks and calculate the cost for each
431 // chunk.
432 InstructionCost Cost = 0;
433 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
434 APInt Tmp = ImmVal.ashr(ShiftAmt: ShiftVal).sextOrTrunc(width: 64);
435 int64_t Val = Tmp.getSExtValue();
436 Cost += getIntImmCost(Val);
437 }
438 // We need at least one instruction to materialze the constant.
439 return std::max<InstructionCost>(a: 1, b: Cost);
440}
441
442InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
443 const APInt &Imm, Type *Ty,
444 TTI::TargetCostKind CostKind,
445 Instruction *Inst) const {
446 assert(Ty->isIntegerTy());
447
448 unsigned BitSize = Ty->getPrimitiveSizeInBits();
449 // There is no cost model for constants with a bit size of 0. Return TCC_Free
450 // here, so that constant hoisting will ignore this constant.
451 if (BitSize == 0)
452 return TTI::TCC_Free;
453
454 unsigned ImmIdx = ~0U;
455 switch (Opcode) {
456 default:
457 return TTI::TCC_Free;
458 case Instruction::GetElementPtr:
459 // Always hoist the base address of a GetElementPtr.
460 if (Idx == 0)
461 return 2 * TTI::TCC_Basic;
462 return TTI::TCC_Free;
463 case Instruction::Store:
464 ImmIdx = 0;
465 break;
466 case Instruction::Add:
467 case Instruction::Sub:
468 case Instruction::Mul:
469 case Instruction::UDiv:
470 case Instruction::SDiv:
471 case Instruction::URem:
472 case Instruction::SRem:
473 case Instruction::And:
474 case Instruction::Or:
475 case Instruction::Xor:
476 case Instruction::ICmp:
477 ImmIdx = 1;
478 break;
479 // Always return TCC_Free for the shift value of a shift instruction.
480 case Instruction::Shl:
481 case Instruction::LShr:
482 case Instruction::AShr:
483 if (Idx == 1)
484 return TTI::TCC_Free;
485 break;
486 case Instruction::Trunc:
487 case Instruction::ZExt:
488 case Instruction::SExt:
489 case Instruction::IntToPtr:
490 case Instruction::PtrToInt:
491 case Instruction::BitCast:
492 case Instruction::PHI:
493 case Instruction::Call:
494 case Instruction::Select:
495 case Instruction::Ret:
496 case Instruction::Load:
497 break;
498 }
499
500 if (Idx == ImmIdx) {
501 int NumConstants = (BitSize + 63) / 64;
502 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
503 return (Cost <= NumConstants * TTI::TCC_Basic)
504 ? static_cast<int>(TTI::TCC_Free)
505 : Cost;
506 }
507 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
508}
509
510InstructionCost
511AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
512 const APInt &Imm, Type *Ty,
513 TTI::TargetCostKind CostKind) const {
514 assert(Ty->isIntegerTy());
515
516 unsigned BitSize = Ty->getPrimitiveSizeInBits();
517 // There is no cost model for constants with a bit size of 0. Return TCC_Free
518 // here, so that constant hoisting will ignore this constant.
519 if (BitSize == 0)
520 return TTI::TCC_Free;
521
522 // Most (all?) AArch64 intrinsics do not support folding immediates into the
523 // selected instruction, so we compute the materialization cost for the
524 // immediate directly.
525 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
526 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
527
528 switch (IID) {
529 default:
530 return TTI::TCC_Free;
531 case Intrinsic::sadd_with_overflow:
532 case Intrinsic::uadd_with_overflow:
533 case Intrinsic::ssub_with_overflow:
534 case Intrinsic::usub_with_overflow:
535 case Intrinsic::smul_with_overflow:
536 case Intrinsic::umul_with_overflow:
537 if (Idx == 1) {
538 int NumConstants = (BitSize + 63) / 64;
539 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
540 return (Cost <= NumConstants * TTI::TCC_Basic)
541 ? static_cast<int>(TTI::TCC_Free)
542 : Cost;
543 }
544 break;
545 case Intrinsic::experimental_stackmap:
546 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue())))
547 return TTI::TCC_Free;
548 break;
549 case Intrinsic::experimental_patchpoint_void:
550 case Intrinsic::experimental_patchpoint:
551 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue())))
552 return TTI::TCC_Free;
553 break;
554 case Intrinsic::experimental_gc_statepoint:
555 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue())))
556 return TTI::TCC_Free;
557 break;
558 }
559 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
560}
561
562TargetTransformInfo::PopcntSupportKind
563AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
564 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
565 if (TyWidth == 32 || TyWidth == 64)
566 return TTI::PSK_FastHardware;
567 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
568 return TTI::PSK_Software;
569}
570
571static bool isUnpackedVectorVT(EVT VecVT) {
572 return VecVT.isScalableVector() &&
573 VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
574}
575
576static InstructionCost getHistogramCost(const AArch64Subtarget *ST,
577 const IntrinsicCostAttributes &ICA) {
578 // We need to know at least the number of elements in the vector of buckets
579 // and the size of each element to update.
580 if (ICA.getArgTypes().size() < 2)
581 return InstructionCost::getInvalid();
582
583 // Only interested in costing for the hardware instruction from SVE2.
584 if (!ST->hasSVE2())
585 return InstructionCost::getInvalid();
586
587 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
588 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
589 unsigned TotalHistCnts = 1;
590
591 unsigned EltSize = EltTy->getScalarSizeInBits();
592 // Only allow (up to 64b) integers or pointers
593 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
594 return InstructionCost::getInvalid();
595
596 // FIXME: We should be able to generate histcnt for fixed-length vectors
597 // using ptrue with a specific VL.
598 if (VectorType *VTy = dyn_cast<VectorType>(Val: BucketPtrsTy)) {
599 unsigned EC = VTy->getElementCount().getKnownMinValue();
600 if (!isPowerOf2_64(Value: EC) || !VTy->isScalableTy())
601 return InstructionCost::getInvalid();
602
603 // HistCnt only supports 32b and 64b element types
604 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
605
606 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
607 return InstructionCost(BaseHistCntCost);
608
609 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
610 TotalHistCnts = EC / NaturalVectorWidth;
611
612 return InstructionCost(BaseHistCntCost * TotalHistCnts);
613 }
614
615 return InstructionCost::getInvalid();
616}
617
618InstructionCost
619AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
620 TTI::TargetCostKind CostKind) const {
621 // The code-generator is currently not able to handle scalable vectors
622 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
623 // it. This change will be removed when code-generation for these types is
624 // sufficiently reliable.
625 auto *RetTy = ICA.getReturnType();
626 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: RetTy))
627 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
628 return InstructionCost::getInvalid();
629
630 switch (ICA.getID()) {
631 case Intrinsic::experimental_vector_histogram_add: {
632 InstructionCost HistCost = getHistogramCost(ST, ICA);
633 // If the cost isn't valid, we may still be able to scalarize
634 if (HistCost.isValid())
635 return HistCost;
636 break;
637 }
638 case Intrinsic::umin:
639 case Intrinsic::umax:
640 case Intrinsic::smin:
641 case Intrinsic::smax: {
642 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
643 MVT::v8i16, MVT::v2i32, MVT::v4i32,
644 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
645 MVT::nxv2i64};
646 auto LT = getTypeLegalizationCost(Ty: RetTy);
647 // v2i64 types get converted to cmp+bif hence the cost of 2
648 if (LT.second == MVT::v2i64)
649 return LT.first * 2;
650 if (any_of(Range: ValidMinMaxTys, P: equal_to(Arg&: LT.second)))
651 return LT.first;
652 break;
653 }
654 case Intrinsic::scmp:
655 case Intrinsic::ucmp: {
656 static const CostTblEntry BitreverseTbl[] = {
657 {.ISD: Intrinsic::scmp, .Type: MVT::i32, .Cost: 3}, // cmp+cset+csinv
658 {.ISD: Intrinsic::scmp, .Type: MVT::i64, .Cost: 3}, // cmp+cset+csinv
659 {.ISD: Intrinsic::scmp, .Type: MVT::v8i8, .Cost: 3}, // cmgt+cmgt+sub
660 {.ISD: Intrinsic::scmp, .Type: MVT::v16i8, .Cost: 3}, // cmgt+cmgt+sub
661 {.ISD: Intrinsic::scmp, .Type: MVT::v4i16, .Cost: 3}, // cmgt+cmgt+sub
662 {.ISD: Intrinsic::scmp, .Type: MVT::v8i16, .Cost: 3}, // cmgt+cmgt+sub
663 {.ISD: Intrinsic::scmp, .Type: MVT::v2i32, .Cost: 3}, // cmgt+cmgt+sub
664 {.ISD: Intrinsic::scmp, .Type: MVT::v4i32, .Cost: 3}, // cmgt+cmgt+sub
665 {.ISD: Intrinsic::scmp, .Type: MVT::v1i64, .Cost: 3}, // cmgt+cmgt+sub
666 {.ISD: Intrinsic::scmp, .Type: MVT::v2i64, .Cost: 3}, // cmgt+cmgt+sub
667 };
668 const auto LT = getTypeLegalizationCost(Ty: RetTy);
669 const auto *Entry =
670 CostTableLookup(Table: BitreverseTbl, ISD: Intrinsic::scmp, Ty: LT.second);
671 if (Entry)
672 return Entry->Cost * LT.first;
673 break;
674 }
675 case Intrinsic::sadd_sat:
676 case Intrinsic::ssub_sat:
677 case Intrinsic::uadd_sat:
678 case Intrinsic::usub_sat: {
679 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
680 MVT::v8i16, MVT::v2i32, MVT::v4i32,
681 MVT::v2i64};
682 auto LT = getTypeLegalizationCost(Ty: RetTy);
683 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
684 // need to extend the type, as it uses shr(qadd(shl, shl)).
685 unsigned Instrs =
686 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
687 if (any_of(Range: ValidSatTys, P: equal_to(Arg&: LT.second)))
688 return LT.first * Instrs;
689
690 TypeSize TS = getDataLayout().getTypeSizeInBits(Ty: RetTy);
691 uint64_t VectorSize = TS.getKnownMinValue();
692
693 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(Value: VectorSize))
694 return LT.first * Instrs;
695
696 break;
697 }
698 case Intrinsic::abs: {
699 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
700 MVT::v8i16, MVT::v2i32, MVT::v4i32,
701 MVT::v2i64};
702 auto LT = getTypeLegalizationCost(Ty: RetTy);
703 if (any_of(Range: ValidAbsTys, P: equal_to(Arg&: LT.second)))
704 return LT.first;
705 break;
706 }
707 case Intrinsic::bswap: {
708 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
709 MVT::v4i32, MVT::v2i64};
710 auto LT = getTypeLegalizationCost(Ty: RetTy);
711 if (any_of(Range: ValidAbsTys, P: equal_to(Arg&: LT.second)) &&
712 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
713 return LT.first;
714 break;
715 }
716 case Intrinsic::fma:
717 case Intrinsic::fmuladd: {
718 // Given a fma or fmuladd, cost it the same as a fmul instruction which are
719 // usually the same for costs. TODO: Add fp16 and bf16 expansion costs.
720 Type *EltTy = RetTy->getScalarType();
721 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
722 (EltTy->isHalfTy() && ST->hasFullFP16()))
723 return getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: RetTy, CostKind);
724 break;
725 }
726 case Intrinsic::stepvector: {
727 InstructionCost Cost = 1; // Cost of the `index' instruction
728 auto LT = getTypeLegalizationCost(Ty: RetTy);
729 // Legalisation of illegal vectors involves an `index' instruction plus
730 // (LT.first - 1) vector adds.
731 if (LT.first > 1) {
732 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: RetTy->getContext());
733 InstructionCost AddCost =
734 getArithmeticInstrCost(Opcode: Instruction::Add, Ty: LegalVTy, CostKind);
735 Cost += AddCost * (LT.first - 1);
736 }
737 return Cost;
738 }
739 case Intrinsic::vector_extract:
740 case Intrinsic::vector_insert: {
741 // If both the vector and subvector types are legal types and the index
742 // is 0, then this should be a no-op or simple operation; return a
743 // relatively low cost.
744
745 // If arguments aren't actually supplied, then we cannot determine the
746 // value of the index. We also want to skip predicate types.
747 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
748 ICA.getReturnType()->getScalarType()->isIntegerTy(Bitwidth: 1))
749 break;
750
751 LLVMContext &C = RetTy->getContext();
752 EVT VecVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]);
753 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
754 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, Ty: RetTy)
755 : getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[1]);
756 // Skip this if either the vector or subvector types are unpacked
757 // SVE types; they may get lowered to stack stores and loads.
758 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(VecVT: SubVecVT))
759 break;
760
761 TargetLoweringBase::LegalizeKind SubVecLK =
762 getTLI()->getTypeConversion(Context&: C, VT: SubVecVT);
763 TargetLoweringBase::LegalizeKind VecLK =
764 getTLI()->getTypeConversion(Context&: C, VT: VecVT);
765 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
766 const ConstantInt *CIdx = cast<ConstantInt>(Val: Idx);
767 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
768 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
769 return TTI::TCC_Free;
770 break;
771 }
772 case Intrinsic::bitreverse: {
773 static const CostTblEntry BitreverseTbl[] = {
774 {.ISD: Intrinsic::bitreverse, .Type: MVT::i32, .Cost: 1},
775 {.ISD: Intrinsic::bitreverse, .Type: MVT::i64, .Cost: 1},
776 {.ISD: Intrinsic::bitreverse, .Type: MVT::v8i8, .Cost: 1},
777 {.ISD: Intrinsic::bitreverse, .Type: MVT::v16i8, .Cost: 1},
778 {.ISD: Intrinsic::bitreverse, .Type: MVT::v4i16, .Cost: 2},
779 {.ISD: Intrinsic::bitreverse, .Type: MVT::v8i16, .Cost: 2},
780 {.ISD: Intrinsic::bitreverse, .Type: MVT::v2i32, .Cost: 2},
781 {.ISD: Intrinsic::bitreverse, .Type: MVT::v4i32, .Cost: 2},
782 {.ISD: Intrinsic::bitreverse, .Type: MVT::v1i64, .Cost: 2},
783 {.ISD: Intrinsic::bitreverse, .Type: MVT::v2i64, .Cost: 2},
784 };
785 const auto LegalisationCost = getTypeLegalizationCost(Ty: RetTy);
786 const auto *Entry =
787 CostTableLookup(Table: BitreverseTbl, ISD: ICA.getID(), Ty: LegalisationCost.second);
788 if (Entry) {
789 // Cost Model is using the legal type(i32) that i8 and i16 will be
790 // converted to +1 so that we match the actual lowering cost
791 if (TLI->getValueType(DL, Ty: RetTy, AllowUnknown: true) == MVT::i8 ||
792 TLI->getValueType(DL, Ty: RetTy, AllowUnknown: true) == MVT::i16)
793 return LegalisationCost.first * Entry->Cost + 1;
794
795 return LegalisationCost.first * Entry->Cost;
796 }
797 break;
798 }
799 case Intrinsic::ctpop: {
800 if (!ST->hasNEON()) {
801 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
802 return getTypeLegalizationCost(Ty: RetTy).first * 12;
803 }
804 static const CostTblEntry CtpopCostTbl[] = {
805 {.ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: 4},
806 {.ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: 3},
807 {.ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: 2},
808 {.ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: 1},
809 {.ISD: ISD::CTPOP, .Type: MVT::i64, .Cost: 4},
810 {.ISD: ISD::CTPOP, .Type: MVT::v2i32, .Cost: 3},
811 {.ISD: ISD::CTPOP, .Type: MVT::v4i16, .Cost: 2},
812 {.ISD: ISD::CTPOP, .Type: MVT::v8i8, .Cost: 1},
813 {.ISD: ISD::CTPOP, .Type: MVT::i32, .Cost: 5},
814 };
815 auto LT = getTypeLegalizationCost(Ty: RetTy);
816 MVT MTy = LT.second;
817 if (const auto *Entry = CostTableLookup(Table: CtpopCostTbl, ISD: ISD::CTPOP, Ty: MTy)) {
818 // Extra cost of +1 when illegal vector types are legalized by promoting
819 // the integer type.
820 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
821 RetTy->getScalarSizeInBits()
822 ? 1
823 : 0;
824 return LT.first * Entry->Cost + ExtraCost;
825 }
826 break;
827 }
828 case Intrinsic::sadd_with_overflow:
829 case Intrinsic::uadd_with_overflow:
830 case Intrinsic::ssub_with_overflow:
831 case Intrinsic::usub_with_overflow:
832 case Intrinsic::smul_with_overflow:
833 case Intrinsic::umul_with_overflow: {
834 static const CostTblEntry WithOverflowCostTbl[] = {
835 {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i8, .Cost: 3},
836 {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i8, .Cost: 3},
837 {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i16, .Cost: 3},
838 {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i16, .Cost: 3},
839 {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i32, .Cost: 1},
840 {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i32, .Cost: 1},
841 {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i64, .Cost: 1},
842 {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i64, .Cost: 1},
843 {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i8, .Cost: 3},
844 {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i8, .Cost: 3},
845 {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i16, .Cost: 3},
846 {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i16, .Cost: 3},
847 {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i32, .Cost: 1},
848 {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i32, .Cost: 1},
849 {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i64, .Cost: 1},
850 {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i64, .Cost: 1},
851 {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i8, .Cost: 5},
852 {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i8, .Cost: 4},
853 {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i16, .Cost: 5},
854 {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i16, .Cost: 4},
855 {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i32, .Cost: 2}, // eg umull;tst
856 {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i32, .Cost: 2}, // eg umull;cmp sxtw
857 {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i64, .Cost: 3}, // eg mul;smulh;cmp
858 {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i64, .Cost: 3}, // eg mul;umulh;cmp asr
859 };
860 EVT MTy = TLI->getValueType(DL, Ty: RetTy->getContainedType(i: 0), AllowUnknown: true);
861 if (MTy.isSimple())
862 if (const auto *Entry = CostTableLookup(Table: WithOverflowCostTbl, ISD: ICA.getID(),
863 Ty: MTy.getSimpleVT()))
864 return Entry->Cost;
865 break;
866 }
867 case Intrinsic::fptosi_sat:
868 case Intrinsic::fptoui_sat: {
869 if (ICA.getArgTypes().empty())
870 break;
871 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
872 auto LT = getTypeLegalizationCost(Ty: ICA.getArgTypes()[0]);
873 EVT MTy = TLI->getValueType(DL, Ty: RetTy);
874 // Check for the legal types, which are where the size of the input and the
875 // output are the same, or we are using cvt f64->i32 or f32->i64.
876 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
877 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
878 LT.second == MVT::v2f64)) {
879 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
880 (LT.second == MVT::f64 && MTy == MVT::i32) ||
881 (LT.second == MVT::f32 && MTy == MVT::i64)))
882 return LT.first;
883 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
884 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
885 MTy.getScalarSizeInBits() == 64)
886 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
887 }
888 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
889 // f32.
890 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
891 return LT.first + getIntrinsicInstrCost(
892 ICA: {ICA.getID(),
893 RetTy,
894 {ICA.getArgTypes()[0]->getWithNewType(
895 EltTy: Type::getFloatTy(C&: RetTy->getContext()))}},
896 CostKind);
897 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
898 (LT.second == MVT::f16 && MTy == MVT::i64) ||
899 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
900 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
901 return LT.first;
902 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
903 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
904 MTy.getScalarSizeInBits() == 32)
905 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
906 // Extending vector types v8f16->v8i32. These current scalarize but the
907 // codegen could be better.
908 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
909 MTy.getScalarSizeInBits() == 64)
910 return MTy.getVectorNumElements() * 3;
911
912 // If we can we use a legal convert followed by a min+max
913 if ((LT.second.getScalarType() == MVT::f32 ||
914 LT.second.getScalarType() == MVT::f64 ||
915 LT.second.getScalarType() == MVT::f16) &&
916 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
917 Type *LegalTy =
918 Type::getIntNTy(C&: RetTy->getContext(), N: LT.second.getScalarSizeInBits());
919 if (LT.second.isVector())
920 LegalTy = VectorType::get(ElementType: LegalTy, EC: LT.second.getVectorElementCount());
921 InstructionCost Cost = 1;
922 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
923 LegalTy, {LegalTy, LegalTy});
924 Cost += getIntrinsicInstrCost(ICA: Attrs1, CostKind);
925 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
926 LegalTy, {LegalTy, LegalTy});
927 Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
928 return LT.first * Cost +
929 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
930 : 1);
931 }
932 // Otherwise we need to follow the default expansion that clamps the value
933 // using a float min/max with a fcmp+sel for nan handling when signed.
934 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
935 RetTy = RetTy->getScalarType();
936 if (LT.second.isVector()) {
937 FPTy = VectorType::get(ElementType: FPTy, EC: LT.second.getVectorElementCount());
938 RetTy = VectorType::get(ElementType: RetTy, EC: LT.second.getVectorElementCount());
939 }
940 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
941 InstructionCost Cost = getIntrinsicInstrCost(ICA: Attrs1, CostKind);
942 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
943 Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
944 Cost +=
945 getCastInstrCost(Opcode: IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
946 Dst: RetTy, Src: FPTy, CCH: TTI::CastContextHint::None, CostKind);
947 if (IsSigned) {
948 Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1);
949 Cost += getCmpSelInstrCost(Opcode: BinaryOperator::FCmp, ValTy: FPTy, CondTy,
950 VecPred: CmpInst::FCMP_UNO, CostKind);
951 Cost += getCmpSelInstrCost(Opcode: BinaryOperator::Select, ValTy: RetTy, CondTy,
952 VecPred: CmpInst::FCMP_UNO, CostKind);
953 }
954 return LT.first * Cost;
955 }
956 case Intrinsic::fshl:
957 case Intrinsic::fshr: {
958 if (ICA.getArgs().empty())
959 break;
960
961 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(V: ICA.getArgs()[2]);
962
963 // ROTR / ROTL is a funnel shift with equal first and second operand. For
964 // ROTR on integer registers (i32/i64) this can be done in a single ror
965 // instruction. A fshl with a non-constant shift uses a neg + ror.
966 if (RetTy->isIntegerTy() && ICA.getArgs()[0] == ICA.getArgs()[1] &&
967 (RetTy->getPrimitiveSizeInBits() == 32 ||
968 RetTy->getPrimitiveSizeInBits() == 64)) {
969 InstructionCost NegCost =
970 (ICA.getID() == Intrinsic::fshl && !OpInfoZ.isConstant()) ? 1 : 0;
971 return 1 + NegCost;
972 }
973
974 // TODO: Add handling for fshl where third argument is not a constant.
975 if (!OpInfoZ.isConstant())
976 break;
977
978 const auto LegalisationCost = getTypeLegalizationCost(Ty: RetTy);
979 if (OpInfoZ.isUniform()) {
980 static const CostTblEntry FshlTbl[] = {
981 {.ISD: Intrinsic::fshl, .Type: MVT::v4i32, .Cost: 2}, // shl + usra
982 {.ISD: Intrinsic::fshl, .Type: MVT::v2i64, .Cost: 2}, {.ISD: Intrinsic::fshl, .Type: MVT::v16i8, .Cost: 2},
983 {.ISD: Intrinsic::fshl, .Type: MVT::v8i16, .Cost: 2}, {.ISD: Intrinsic::fshl, .Type: MVT::v2i32, .Cost: 2},
984 {.ISD: Intrinsic::fshl, .Type: MVT::v8i8, .Cost: 2}, {.ISD: Intrinsic::fshl, .Type: MVT::v4i16, .Cost: 2}};
985 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
986 // to avoid having to duplicate the costs.
987 const auto *Entry =
988 CostTableLookup(Table: FshlTbl, ISD: Intrinsic::fshl, Ty: LegalisationCost.second);
989 if (Entry)
990 return LegalisationCost.first * Entry->Cost;
991 }
992
993 auto TyL = getTypeLegalizationCost(Ty: RetTy);
994 if (!RetTy->isIntegerTy())
995 break;
996
997 // Estimate cost manually, as types like i8 and i16 will get promoted to
998 // i32 and CostTableLookup will ignore the extra conversion cost.
999 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
1000 RetTy->getScalarSizeInBits() < 64) ||
1001 (RetTy->getScalarSizeInBits() % 64 != 0);
1002 unsigned ExtraCost = HigherCost ? 1 : 0;
1003 if (RetTy->getScalarSizeInBits() == 32 ||
1004 RetTy->getScalarSizeInBits() == 64)
1005 ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single
1006 // extr instruction.
1007 else if (HigherCost)
1008 ExtraCost = 1;
1009 else
1010 break;
1011 return TyL.first + ExtraCost;
1012 }
1013 case Intrinsic::get_active_lane_mask: {
1014 auto RetTy = cast<VectorType>(Val: ICA.getReturnType());
1015 EVT RetVT = getTLI()->getValueType(DL, Ty: RetTy);
1016 EVT OpVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]);
1017 if (getTLI()->shouldExpandGetActiveLaneMask(VT: RetVT, OpVT))
1018 break;
1019
1020 if (RetTy->isScalableTy()) {
1021 if (TLI->getTypeAction(Context&: RetTy->getContext(), VT: RetVT) !=
1022 TargetLowering::TypeSplitVector)
1023 break;
1024
1025 auto LT = getTypeLegalizationCost(Ty: RetTy);
1026 InstructionCost Cost = LT.first;
1027 // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
1028 // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
1029 // nxv32i1 = get_active_lane_mask(base, idx) ->
1030 // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
1031 if (ST->hasSVE2p1() || ST->hasSME2()) {
1032 Cost /= 2;
1033 if (Cost == 1)
1034 return Cost;
1035 }
1036
1037 // If more than one whilelo intrinsic is required, include the extra cost
1038 // required by the saturating add & select required to increment the
1039 // start value after the first intrinsic call.
1040 Type *OpTy = ICA.getArgTypes()[0];
1041 IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
1042 InstructionCost SplitCost = getIntrinsicInstrCost(ICA: AddAttrs, CostKind);
1043 Type *CondTy = OpTy->getWithNewBitWidth(NewBitWidth: 1);
1044 SplitCost += getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: OpTy, CondTy,
1045 VecPred: CmpInst::ICMP_UGT, CostKind);
1046 return Cost + (SplitCost * (Cost - 1));
1047 } else if (!getTLI()->isTypeLegal(VT: RetVT)) {
1048 // We don't have enough context at this point to determine if the mask
1049 // is going to be kept live after the block, which will force the vXi1
1050 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
1051 // For now, we just assume the vectorizer created this intrinsic and
1052 // the result will be the input for a PHI. In this case the cost will
1053 // be extremely high for fixed-width vectors.
1054 // NOTE: getScalarizationOverhead returns a cost that's far too
1055 // pessimistic for the actual generated codegen. In reality there are
1056 // two instructions generated per lane.
1057 return cast<FixedVectorType>(Val: RetTy)->getNumElements() * 2;
1058 }
1059 break;
1060 }
1061 case Intrinsic::experimental_vector_match: {
1062 auto *NeedleTy = cast<FixedVectorType>(Val: ICA.getArgTypes()[1]);
1063 EVT SearchVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]);
1064 unsigned SearchSize = NeedleTy->getNumElements();
1065 if (!getTLI()->shouldExpandVectorMatch(VT: SearchVT, SearchSize)) {
1066 // Base cost for MATCH instructions. At least on the Neoverse V2 and
1067 // Neoverse V3, these are cheap operations with the same latency as a
1068 // vector ADD. In most cases, however, we also need to do an extra DUP.
1069 // For fixed-length vectors we currently need an extra five--six
1070 // instructions besides the MATCH.
1071 InstructionCost Cost = 4;
1072 if (isa<FixedVectorType>(Val: RetTy))
1073 Cost += 10;
1074 return Cost;
1075 }
1076 break;
1077 }
1078 case Intrinsic::experimental_cttz_elts: {
1079 EVT ArgVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]);
1080 if (!getTLI()->shouldExpandCttzElements(VT: ArgVT)) {
1081 // This will consist of a SVE brkb and a cntp instruction. These
1082 // typically have the same latency and half the throughput as a vector
1083 // add instruction.
1084 return 4;
1085 }
1086 break;
1087 }
1088 case Intrinsic::loop_dependence_raw_mask:
1089 case Intrinsic::loop_dependence_war_mask: {
1090 // The whilewr/rw instructions require SVE2 or SME.
1091 if (ST->hasSVE2() || ST->hasSME()) {
1092 EVT VecVT = getTLI()->getValueType(DL, Ty: RetTy);
1093 unsigned EltSizeInBytes =
1094 cast<ConstantInt>(Val: ICA.getArgs()[2])->getZExtValue();
1095 if (!is_contained(Set: {1u, 2u, 4u, 8u}, Element: EltSizeInBytes) ||
1096 VecVT.getVectorMinNumElements() != (16 / EltSizeInBytes))
1097 break;
1098 // For fixed-vector types we need to AND the mask with a ptrue vl<N>.
1099 return isa<FixedVectorType>(Val: RetTy) ? 2 : 1;
1100 }
1101 break;
1102 }
1103 case Intrinsic::experimental_vector_extract_last_active:
1104 if (ST->isSVEorStreamingSVEAvailable()) {
1105 auto [LegalCost, _] = getTypeLegalizationCost(Ty: ICA.getArgTypes()[0]);
1106 // This should turn into chained clastb instructions.
1107 return LegalCost;
1108 }
1109 break;
1110 default:
1111 break;
1112 }
1113 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1114}
1115
1116/// The function will remove redundant reinterprets casting in the presence
1117/// of the control flow
1118static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1119 IntrinsicInst &II) {
1120 SmallVector<Instruction *, 32> Worklist;
1121 auto RequiredType = II.getType();
1122
1123 auto *PN = dyn_cast<PHINode>(Val: II.getArgOperand(i: 0));
1124 assert(PN && "Expected Phi Node!");
1125
1126 // Don't create a new Phi unless we can remove the old one.
1127 if (!PN->hasOneUse())
1128 return std::nullopt;
1129
1130 for (Value *IncValPhi : PN->incoming_values()) {
1131 auto *Reinterpret = dyn_cast<IntrinsicInst>(Val: IncValPhi);
1132 if (!Reinterpret ||
1133 Reinterpret->getIntrinsicID() !=
1134 Intrinsic::aarch64_sve_convert_to_svbool ||
1135 RequiredType != Reinterpret->getArgOperand(i: 0)->getType())
1136 return std::nullopt;
1137 }
1138
1139 // Create the new Phi
1140 IC.Builder.SetInsertPoint(PN);
1141 PHINode *NPN = IC.Builder.CreatePHI(Ty: RequiredType, NumReservedValues: PN->getNumIncomingValues());
1142 Worklist.push_back(Elt: PN);
1143
1144 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1145 auto *Reinterpret = cast<Instruction>(Val: PN->getIncomingValue(i: I));
1146 NPN->addIncoming(V: Reinterpret->getOperand(i: 0), BB: PN->getIncomingBlock(i: I));
1147 Worklist.push_back(Elt: Reinterpret);
1148 }
1149
1150 // Cleanup Phi Node and reinterprets
1151 return IC.replaceInstUsesWith(I&: II, V: NPN);
1152}
1153
1154// A collection of properties common to SVE intrinsics that allow for combines
1155// to be written without needing to know the specific intrinsic.
1156struct SVEIntrinsicInfo {
1157 //
1158 // Helper routines for common intrinsic definitions.
1159 //
1160
1161 // e.g. llvm.aarch64.sve.add pg, op1, op2
1162 // with IID ==> llvm.aarch64.sve.add_u
1163 static SVEIntrinsicInfo
1164 defaultMergingOp(Intrinsic::ID IID = Intrinsic::not_intrinsic) {
1165 return SVEIntrinsicInfo()
1166 .setGoverningPredicateOperandIdx(0)
1167 .setOperandIdxInactiveLanesTakenFrom(1)
1168 .setMatchingUndefIntrinsic(IID);
1169 }
1170
1171 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1172 static SVEIntrinsicInfo defaultMergingUnaryOp() {
1173 return SVEIntrinsicInfo()
1174 .setGoverningPredicateOperandIdx(1)
1175 .setOperandIdxInactiveLanesTakenFrom(0)
1176 .setOperandIdxWithNoActiveLanes(0);
1177 }
1178
1179 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1180 static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp() {
1181 return SVEIntrinsicInfo()
1182 .setGoverningPredicateOperandIdx(1)
1183 .setOperandIdxInactiveLanesTakenFrom(0);
1184 }
1185
1186 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1187 static SVEIntrinsicInfo defaultUndefOp() {
1188 return SVEIntrinsicInfo()
1189 .setGoverningPredicateOperandIdx(0)
1190 .setInactiveLanesAreNotDefined();
1191 }
1192
1193 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1194 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1195 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1196 return SVEIntrinsicInfo()
1197 .setGoverningPredicateOperandIdx(GPIndex)
1198 .setInactiveLanesAreUnused();
1199 }
1200
1201 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1202 // llvm.aarch64.sve.ld1 pg, ptr
1203 static SVEIntrinsicInfo defaultZeroingOp() {
1204 return SVEIntrinsicInfo()
1205 .setGoverningPredicateOperandIdx(0)
1206 .setInactiveLanesAreUnused()
1207 .setResultIsZeroInitialized();
1208 }
1209
1210 // All properties relate to predication and thus having a general predicate
1211 // is the minimum requirement to say there is intrinsic info to act on.
1212 explicit operator bool() const { return hasGoverningPredicate(); }
1213
1214 //
1215 // Properties relating to the governing predicate.
1216 //
1217
1218 bool hasGoverningPredicate() const {
1219 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1220 }
1221
1222 unsigned getGoverningPredicateOperandIdx() const {
1223 assert(hasGoverningPredicate() && "Propery not set!");
1224 return GoverningPredicateIdx;
1225 }
1226
1227 SVEIntrinsicInfo &setGoverningPredicateOperandIdx(unsigned Index) {
1228 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1229 GoverningPredicateIdx = Index;
1230 return *this;
1231 }
1232
1233 //
1234 // Properties relating to operations the intrinsic could be transformed into.
1235 // NOTE: This does not mean such a transformation is always possible, but the
1236 // knowledge makes it possible to reuse existing optimisations without needing
1237 // to embed specific handling for each intrinsic. For example, instruction
1238 // simplification can be used to optimise an intrinsic's active lanes.
1239 //
1240
1241 bool hasMatchingUndefIntrinsic() const {
1242 return UndefIntrinsic != Intrinsic::not_intrinsic;
1243 }
1244
1245 Intrinsic::ID getMatchingUndefIntrinsic() const {
1246 assert(hasMatchingUndefIntrinsic() && "Propery not set!");
1247 return UndefIntrinsic;
1248 }
1249
1250 SVEIntrinsicInfo &setMatchingUndefIntrinsic(Intrinsic::ID IID) {
1251 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1252 UndefIntrinsic = IID;
1253 return *this;
1254 }
1255
1256 bool hasMatchingIROpode() const { return IROpcode != 0; }
1257
1258 unsigned getMatchingIROpode() const {
1259 assert(hasMatchingIROpode() && "Propery not set!");
1260 return IROpcode;
1261 }
1262
1263 SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) {
1264 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1265 IROpcode = Opcode;
1266 return *this;
1267 }
1268
1269 //
1270 // Properties relating to the result of inactive lanes.
1271 //
1272
1273 bool inactiveLanesTakenFromOperand() const {
1274 return ResultLanes == InactiveLanesTakenFromOperand;
1275 }
1276
1277 unsigned getOperandIdxInactiveLanesTakenFrom() const {
1278 assert(inactiveLanesTakenFromOperand() && "Propery not set!");
1279 return OperandIdxForInactiveLanes;
1280 }
1281
1282 SVEIntrinsicInfo &setOperandIdxInactiveLanesTakenFrom(unsigned Index) {
1283 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1284 ResultLanes = InactiveLanesTakenFromOperand;
1285 OperandIdxForInactiveLanes = Index;
1286 return *this;
1287 }
1288
1289 bool inactiveLanesAreNotDefined() const {
1290 return ResultLanes == InactiveLanesAreNotDefined;
1291 }
1292
1293 SVEIntrinsicInfo &setInactiveLanesAreNotDefined() {
1294 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1295 ResultLanes = InactiveLanesAreNotDefined;
1296 return *this;
1297 }
1298
1299 bool inactiveLanesAreUnused() const {
1300 return ResultLanes == InactiveLanesAreUnused;
1301 }
1302
1303 SVEIntrinsicInfo &setInactiveLanesAreUnused() {
1304 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1305 ResultLanes = InactiveLanesAreUnused;
1306 return *this;
1307 }
1308
1309 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1310 // inactiveLanesAreZeroed =
1311 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1312 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1313
1314 SVEIntrinsicInfo &setResultIsZeroInitialized() {
1315 ResultIsZeroInitialized = true;
1316 return *this;
1317 }
1318
1319 //
1320 // The first operand of unary merging operations is typically only used to
1321 // set the result for inactive lanes. Knowing this allows us to deadcode the
1322 // operand when we can prove there are no inactive lanes.
1323 //
1324
1325 bool hasOperandWithNoActiveLanes() const {
1326 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1327 }
1328
1329 unsigned getOperandIdxWithNoActiveLanes() const {
1330 assert(hasOperandWithNoActiveLanes() && "Propery not set!");
1331 return OperandIdxWithNoActiveLanes;
1332 }
1333
1334 SVEIntrinsicInfo &setOperandIdxWithNoActiveLanes(unsigned Index) {
1335 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1336 OperandIdxWithNoActiveLanes = Index;
1337 return *this;
1338 }
1339
1340private:
1341 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1342
1343 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1344 unsigned IROpcode = 0;
1345
1346 enum PredicationStyle {
1347 Uninitialized,
1348 InactiveLanesTakenFromOperand,
1349 InactiveLanesAreNotDefined,
1350 InactiveLanesAreUnused
1351 } ResultLanes = Uninitialized;
1352
1353 bool ResultIsZeroInitialized = false;
1354 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1355 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1356};
1357
1358static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
1359 // Some SVE intrinsics do not use scalable vector types, but since they are
1360 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1361 if (!isa<ScalableVectorType>(Val: II.getType()) &&
1362 all_of(Range: II.args(), P: [&](const Value *V) {
1363 return !isa<ScalableVectorType>(Val: V->getType());
1364 }))
1365 return SVEIntrinsicInfo();
1366
1367 Intrinsic::ID IID = II.getIntrinsicID();
1368 switch (IID) {
1369 default:
1370 break;
1371 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1372 case Intrinsic::aarch64_sve_fcvt_f16f32:
1373 case Intrinsic::aarch64_sve_fcvt_f16f64:
1374 case Intrinsic::aarch64_sve_fcvt_f32f16:
1375 case Intrinsic::aarch64_sve_fcvt_f32f64:
1376 case Intrinsic::aarch64_sve_fcvt_f64f16:
1377 case Intrinsic::aarch64_sve_fcvt_f64f32:
1378 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1379 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1380 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1381 case Intrinsic::aarch64_sve_fcvtzs:
1382 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1383 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1384 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1385 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1386 case Intrinsic::aarch64_sve_fcvtzu:
1387 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1388 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1389 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1390 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1391 case Intrinsic::aarch64_sve_scvtf:
1392 case Intrinsic::aarch64_sve_scvtf_f16i32:
1393 case Intrinsic::aarch64_sve_scvtf_f16i64:
1394 case Intrinsic::aarch64_sve_scvtf_f32i64:
1395 case Intrinsic::aarch64_sve_scvtf_f64i32:
1396 case Intrinsic::aarch64_sve_ucvtf:
1397 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1398 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1399 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1400 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1401 return SVEIntrinsicInfo::defaultMergingUnaryOp();
1402
1403 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1404 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1405 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1406 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1407 return SVEIntrinsicInfo::defaultMergingUnaryNarrowingTopOp();
1408
1409 case Intrinsic::aarch64_sve_fabd:
1410 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fabd_u);
1411 case Intrinsic::aarch64_sve_fadd:
1412 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fadd_u)
1413 .setMatchingIROpcode(Instruction::FAdd);
1414 case Intrinsic::aarch64_sve_fdiv:
1415 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fdiv_u)
1416 .setMatchingIROpcode(Instruction::FDiv);
1417 case Intrinsic::aarch64_sve_fmax:
1418 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmax_u);
1419 case Intrinsic::aarch64_sve_fmaxnm:
1420 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmaxnm_u);
1421 case Intrinsic::aarch64_sve_fmin:
1422 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmin_u);
1423 case Intrinsic::aarch64_sve_fminnm:
1424 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fminnm_u);
1425 case Intrinsic::aarch64_sve_fmla:
1426 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmla_u);
1427 case Intrinsic::aarch64_sve_fmls:
1428 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmls_u);
1429 case Intrinsic::aarch64_sve_fmul:
1430 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmul_u)
1431 .setMatchingIROpcode(Instruction::FMul);
1432 case Intrinsic::aarch64_sve_fmulx:
1433 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmulx_u);
1434 case Intrinsic::aarch64_sve_fnmla:
1435 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fnmla_u);
1436 case Intrinsic::aarch64_sve_fnmls:
1437 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fnmls_u);
1438 case Intrinsic::aarch64_sve_fsub:
1439 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fsub_u)
1440 .setMatchingIROpcode(Instruction::FSub);
1441 case Intrinsic::aarch64_sve_add:
1442 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_add_u)
1443 .setMatchingIROpcode(Instruction::Add);
1444 case Intrinsic::aarch64_sve_mla:
1445 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mla_u);
1446 case Intrinsic::aarch64_sve_mls:
1447 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mls_u);
1448 case Intrinsic::aarch64_sve_mul:
1449 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mul_u)
1450 .setMatchingIROpcode(Instruction::Mul);
1451 case Intrinsic::aarch64_sve_sabd:
1452 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sabd_u);
1453 case Intrinsic::aarch64_sve_sdiv:
1454 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sdiv_u)
1455 .setMatchingIROpcode(Instruction::SDiv);
1456 case Intrinsic::aarch64_sve_smax:
1457 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smax_u);
1458 case Intrinsic::aarch64_sve_smin:
1459 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smin_u);
1460 case Intrinsic::aarch64_sve_smulh:
1461 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smulh_u);
1462 case Intrinsic::aarch64_sve_sub:
1463 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sub_u)
1464 .setMatchingIROpcode(Instruction::Sub);
1465 case Intrinsic::aarch64_sve_uabd:
1466 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uabd_u);
1467 case Intrinsic::aarch64_sve_udiv:
1468 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_udiv_u)
1469 .setMatchingIROpcode(Instruction::UDiv);
1470 case Intrinsic::aarch64_sve_umax:
1471 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umax_u);
1472 case Intrinsic::aarch64_sve_umin:
1473 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umin_u);
1474 case Intrinsic::aarch64_sve_umulh:
1475 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umulh_u);
1476 case Intrinsic::aarch64_sve_asr:
1477 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_asr_u)
1478 .setMatchingIROpcode(Instruction::AShr);
1479 case Intrinsic::aarch64_sve_lsl:
1480 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_lsl_u)
1481 .setMatchingIROpcode(Instruction::Shl);
1482 case Intrinsic::aarch64_sve_lsr:
1483 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_lsr_u)
1484 .setMatchingIROpcode(Instruction::LShr);
1485 case Intrinsic::aarch64_sve_and:
1486 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_and_u)
1487 .setMatchingIROpcode(Instruction::And);
1488 case Intrinsic::aarch64_sve_bic:
1489 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_bic_u);
1490 case Intrinsic::aarch64_sve_eor:
1491 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_eor_u)
1492 .setMatchingIROpcode(Instruction::Xor);
1493 case Intrinsic::aarch64_sve_orr:
1494 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_orr_u)
1495 .setMatchingIROpcode(Instruction::Or);
1496 case Intrinsic::aarch64_sve_shsub:
1497 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_shsub_u);
1498 case Intrinsic::aarch64_sve_shsubr:
1499 return SVEIntrinsicInfo::defaultMergingOp();
1500 case Intrinsic::aarch64_sve_sqrshl:
1501 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sqrshl_u);
1502 case Intrinsic::aarch64_sve_sqshl:
1503 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sqshl_u);
1504 case Intrinsic::aarch64_sve_sqsub:
1505 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sqsub_u);
1506 case Intrinsic::aarch64_sve_srshl:
1507 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_srshl_u);
1508 case Intrinsic::aarch64_sve_uhsub:
1509 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uhsub_u);
1510 case Intrinsic::aarch64_sve_uhsubr:
1511 return SVEIntrinsicInfo::defaultMergingOp();
1512 case Intrinsic::aarch64_sve_uqrshl:
1513 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uqrshl_u);
1514 case Intrinsic::aarch64_sve_uqshl:
1515 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uqshl_u);
1516 case Intrinsic::aarch64_sve_uqsub:
1517 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uqsub_u);
1518 case Intrinsic::aarch64_sve_urshl:
1519 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_urshl_u);
1520
1521 case Intrinsic::aarch64_sve_add_u:
1522 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1523 Instruction::Add);
1524 case Intrinsic::aarch64_sve_and_u:
1525 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1526 Instruction::And);
1527 case Intrinsic::aarch64_sve_asr_u:
1528 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1529 Instruction::AShr);
1530 case Intrinsic::aarch64_sve_eor_u:
1531 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1532 Instruction::Xor);
1533 case Intrinsic::aarch64_sve_fadd_u:
1534 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1535 Instruction::FAdd);
1536 case Intrinsic::aarch64_sve_fdiv_u:
1537 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1538 Instruction::FDiv);
1539 case Intrinsic::aarch64_sve_fmul_u:
1540 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1541 Instruction::FMul);
1542 case Intrinsic::aarch64_sve_fsub_u:
1543 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1544 Instruction::FSub);
1545 case Intrinsic::aarch64_sve_lsl_u:
1546 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1547 Instruction::Shl);
1548 case Intrinsic::aarch64_sve_lsr_u:
1549 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1550 Instruction::LShr);
1551 case Intrinsic::aarch64_sve_mul_u:
1552 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1553 Instruction::Mul);
1554 case Intrinsic::aarch64_sve_orr_u:
1555 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1556 Instruction::Or);
1557 case Intrinsic::aarch64_sve_sdiv_u:
1558 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1559 Instruction::SDiv);
1560 case Intrinsic::aarch64_sve_sub_u:
1561 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1562 Instruction::Sub);
1563 case Intrinsic::aarch64_sve_udiv_u:
1564 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1565 Instruction::UDiv);
1566
1567 case Intrinsic::aarch64_sve_addqv:
1568 case Intrinsic::aarch64_sve_and_z:
1569 case Intrinsic::aarch64_sve_bic_z:
1570 case Intrinsic::aarch64_sve_brka_z:
1571 case Intrinsic::aarch64_sve_brkb_z:
1572 case Intrinsic::aarch64_sve_brkn_z:
1573 case Intrinsic::aarch64_sve_brkpa_z:
1574 case Intrinsic::aarch64_sve_brkpb_z:
1575 case Intrinsic::aarch64_sve_cntp:
1576 case Intrinsic::aarch64_sve_compact:
1577 case Intrinsic::aarch64_sve_eor_z:
1578 case Intrinsic::aarch64_sve_eorv:
1579 case Intrinsic::aarch64_sve_eorqv:
1580 case Intrinsic::aarch64_sve_nand_z:
1581 case Intrinsic::aarch64_sve_nor_z:
1582 case Intrinsic::aarch64_sve_orn_z:
1583 case Intrinsic::aarch64_sve_orr_z:
1584 case Intrinsic::aarch64_sve_orv:
1585 case Intrinsic::aarch64_sve_orqv:
1586 case Intrinsic::aarch64_sve_pnext:
1587 case Intrinsic::aarch64_sve_rdffr_z:
1588 case Intrinsic::aarch64_sve_saddv:
1589 case Intrinsic::aarch64_sve_uaddv:
1590 case Intrinsic::aarch64_sve_umaxv:
1591 case Intrinsic::aarch64_sve_umaxqv:
1592 case Intrinsic::aarch64_sve_cmpeq:
1593 case Intrinsic::aarch64_sve_cmpeq_wide:
1594 case Intrinsic::aarch64_sve_cmpge:
1595 case Intrinsic::aarch64_sve_cmpge_wide:
1596 case Intrinsic::aarch64_sve_cmpgt:
1597 case Intrinsic::aarch64_sve_cmpgt_wide:
1598 case Intrinsic::aarch64_sve_cmphi:
1599 case Intrinsic::aarch64_sve_cmphi_wide:
1600 case Intrinsic::aarch64_sve_cmphs:
1601 case Intrinsic::aarch64_sve_cmphs_wide:
1602 case Intrinsic::aarch64_sve_cmple_wide:
1603 case Intrinsic::aarch64_sve_cmplo_wide:
1604 case Intrinsic::aarch64_sve_cmpls_wide:
1605 case Intrinsic::aarch64_sve_cmplt_wide:
1606 case Intrinsic::aarch64_sve_cmpne:
1607 case Intrinsic::aarch64_sve_cmpne_wide:
1608 case Intrinsic::aarch64_sve_facge:
1609 case Intrinsic::aarch64_sve_facgt:
1610 case Intrinsic::aarch64_sve_fcmpeq:
1611 case Intrinsic::aarch64_sve_fcmpge:
1612 case Intrinsic::aarch64_sve_fcmpgt:
1613 case Intrinsic::aarch64_sve_fcmpne:
1614 case Intrinsic::aarch64_sve_fcmpuo:
1615 case Intrinsic::aarch64_sve_ld1:
1616 case Intrinsic::aarch64_sve_ld1_gather:
1617 case Intrinsic::aarch64_sve_ld1_gather_index:
1618 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1619 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1620 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1621 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1622 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1623 case Intrinsic::aarch64_sve_ld1q_gather_index:
1624 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1625 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1626 case Intrinsic::aarch64_sve_ld1ro:
1627 case Intrinsic::aarch64_sve_ld1rq:
1628 case Intrinsic::aarch64_sve_ld1udq:
1629 case Intrinsic::aarch64_sve_ld1uwq:
1630 case Intrinsic::aarch64_sve_ld2_sret:
1631 case Intrinsic::aarch64_sve_ld2q_sret:
1632 case Intrinsic::aarch64_sve_ld3_sret:
1633 case Intrinsic::aarch64_sve_ld3q_sret:
1634 case Intrinsic::aarch64_sve_ld4_sret:
1635 case Intrinsic::aarch64_sve_ld4q_sret:
1636 case Intrinsic::aarch64_sve_ldff1:
1637 case Intrinsic::aarch64_sve_ldff1_gather:
1638 case Intrinsic::aarch64_sve_ldff1_gather_index:
1639 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1640 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1641 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1642 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1643 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1644 case Intrinsic::aarch64_sve_ldnf1:
1645 case Intrinsic::aarch64_sve_ldnt1:
1646 case Intrinsic::aarch64_sve_ldnt1_gather:
1647 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1648 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1649 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1650 return SVEIntrinsicInfo::defaultZeroingOp();
1651
1652 case Intrinsic::aarch64_sve_prf:
1653 case Intrinsic::aarch64_sve_prfb_gather_index:
1654 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1655 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1656 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1657 case Intrinsic::aarch64_sve_prfd_gather_index:
1658 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1659 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1660 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1661 case Intrinsic::aarch64_sve_prfh_gather_index:
1662 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1663 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1664 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1665 case Intrinsic::aarch64_sve_prfw_gather_index:
1666 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1667 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1668 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1669 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 0);
1670
1671 case Intrinsic::aarch64_sve_st1_scatter:
1672 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1673 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1674 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1675 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1676 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1677 case Intrinsic::aarch64_sve_st1dq:
1678 case Intrinsic::aarch64_sve_st1q_scatter_index:
1679 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1680 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1681 case Intrinsic::aarch64_sve_st1wq:
1682 case Intrinsic::aarch64_sve_stnt1:
1683 case Intrinsic::aarch64_sve_stnt1_scatter:
1684 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1685 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1686 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1687 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 1);
1688 case Intrinsic::aarch64_sve_st2:
1689 case Intrinsic::aarch64_sve_st2q:
1690 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 2);
1691 case Intrinsic::aarch64_sve_st3:
1692 case Intrinsic::aarch64_sve_st3q:
1693 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 3);
1694 case Intrinsic::aarch64_sve_st4:
1695 case Intrinsic::aarch64_sve_st4q:
1696 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 4);
1697 }
1698
1699 return SVEIntrinsicInfo();
1700}
1701
1702static bool isAllActivePredicate(Value *Pred) {
1703 Value *UncastedPred;
1704
1705 // Look through predicate casts that only remove lanes.
1706 if (match(V: Pred, P: m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1707 Op0: m_Value(V&: UncastedPred)))) {
1708 auto *OrigPredTy = cast<ScalableVectorType>(Val: Pred->getType());
1709 Pred = UncastedPred;
1710
1711 if (match(V: Pred, P: m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1712 Op0: m_Value(V&: UncastedPred))))
1713 // If the predicate has the same or less lanes than the uncasted predicate
1714 // then we know the casting has no effect.
1715 if (OrigPredTy->getMinNumElements() <=
1716 cast<ScalableVectorType>(Val: UncastedPred->getType())
1717 ->getMinNumElements())
1718 Pred = UncastedPred;
1719 }
1720
1721 auto *C = dyn_cast<Constant>(Val: Pred);
1722 return C && C->isAllOnesValue();
1723}
1724
1725// Simplify `V` by only considering the operations that affect active lanes.
1726// This function should only return existing Values or newly created Constants.
1727static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1728 auto *Dup = dyn_cast<IntrinsicInst>(Val: V);
1729 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1730 Dup->getOperand(i_nocapture: 1) == Pg && isa<Constant>(Val: Dup->getOperand(i_nocapture: 2)))
1731 return ConstantVector::getSplat(
1732 EC: cast<VectorType>(Val: V->getType())->getElementCount(),
1733 Elt: cast<Constant>(Val: Dup->getOperand(i_nocapture: 2)));
1734
1735 return V;
1736}
1737
1738static std::optional<Instruction *>
1739simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II,
1740 const SVEIntrinsicInfo &IInfo) {
1741 const unsigned Opc = IInfo.getMatchingIROpode();
1742 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1743
1744 Value *Pg = II.getOperand(i_nocapture: 0);
1745 Value *Op1 = II.getOperand(i_nocapture: 1);
1746 Value *Op2 = II.getOperand(i_nocapture: 2);
1747 const DataLayout &DL = II.getDataLayout();
1748
1749 // Canonicalise constants to the RHS.
1750 if (Instruction::isCommutative(Opcode: Opc) && IInfo.inactiveLanesAreNotDefined() &&
1751 isa<Constant>(Val: Op1) && !isa<Constant>(Val: Op2)) {
1752 IC.replaceOperand(I&: II, OpNum: 1, V: Op2);
1753 IC.replaceOperand(I&: II, OpNum: 2, V: Op1);
1754 return &II;
1755 }
1756
1757 // Only active lanes matter when simplifying the operation.
1758 Op1 = stripInactiveLanes(V: Op1, Pg);
1759 Op2 = stripInactiveLanes(V: Op2, Pg);
1760
1761 Value *SimpleII;
1762 if (auto FII = dyn_cast<FPMathOperator>(Val: &II))
1763 SimpleII = simplifyBinOp(Opcode: Opc, LHS: Op1, RHS: Op2, FMF: FII->getFastMathFlags(), Q: DL);
1764 else
1765 SimpleII = simplifyBinOp(Opcode: Opc, LHS: Op1, RHS: Op2, Q: DL);
1766
1767 // An SVE intrinsic's result is always defined. However, this is not the case
1768 // for its equivalent IR instruction (e.g. when shifting by an amount more
1769 // than the data's bitwidth). Simplifications to an undefined result must be
1770 // ignored to preserve the intrinsic's expected behaviour.
1771 if (!SimpleII || isa<UndefValue>(Val: SimpleII))
1772 return std::nullopt;
1773
1774 if (IInfo.inactiveLanesAreNotDefined())
1775 return IC.replaceInstUsesWith(I&: II, V: SimpleII);
1776
1777 Value *Inactive = II.getOperand(i_nocapture: IInfo.getOperandIdxInactiveLanesTakenFrom());
1778
1779 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1780 if (SimpleII == Inactive)
1781 return IC.replaceInstUsesWith(I&: II, V: SimpleII);
1782
1783 // Inactive lanes must be preserved.
1784 SimpleII = IC.Builder.CreateSelect(C: Pg, True: SimpleII, False: Inactive);
1785 return IC.replaceInstUsesWith(I&: II, V: SimpleII);
1786}
1787
1788// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1789// to operations with less strict inactive lane requirements.
1790static std::optional<Instruction *>
1791simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II,
1792 const SVEIntrinsicInfo &IInfo) {
1793 if (!IInfo.hasGoverningPredicate())
1794 return std::nullopt;
1795
1796 auto *OpPredicate = II.getOperand(i_nocapture: IInfo.getGoverningPredicateOperandIdx());
1797
1798 // If there are no active lanes.
1799 if (match(V: OpPredicate, P: m_ZeroInt())) {
1800 if (IInfo.inactiveLanesTakenFromOperand())
1801 return IC.replaceInstUsesWith(
1802 I&: II, V: II.getOperand(i_nocapture: IInfo.getOperandIdxInactiveLanesTakenFrom()));
1803
1804 if (IInfo.inactiveLanesAreUnused()) {
1805 if (IInfo.resultIsZeroInitialized())
1806 IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty: II.getType()));
1807
1808 return IC.eraseInstFromFunction(I&: II);
1809 }
1810 }
1811
1812 // If there are no inactive lanes.
1813 if (isAllActivePredicate(Pred: OpPredicate)) {
1814 if (IInfo.hasOperandWithNoActiveLanes()) {
1815 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1816 if (!isa<UndefValue>(Val: II.getOperand(i_nocapture: OpIdx)))
1817 return IC.replaceOperand(I&: II, OpNum: OpIdx, V: UndefValue::get(T: II.getType()));
1818 }
1819
1820 if (IInfo.hasMatchingUndefIntrinsic()) {
1821 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1822 M: II.getModule(), id: IInfo.getMatchingUndefIntrinsic(), Tys: {II.getType()});
1823 II.setCalledFunction(NewDecl);
1824 return &II;
1825 }
1826 }
1827
1828 // Operation specific simplifications.
1829 if (IInfo.hasMatchingIROpode() &&
1830 Instruction::isBinaryOp(Opcode: IInfo.getMatchingIROpode()))
1831 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1832
1833 return std::nullopt;
1834}
1835
1836// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
1837// => (binop (pred) (from_svbool _) (from_svbool _))
1838//
1839// The above transformation eliminates a `to_svbool` in the predicate
1840// operand of bitwise operation `binop` by narrowing the vector width of
1841// the operation. For example, it would convert a `<vscale x 16 x i1>
1842// and` into a `<vscale x 4 x i1> and`. This is profitable because
1843// to_svbool must zero the new lanes during widening, whereas
1844// from_svbool is free.
1845static std::optional<Instruction *>
1846tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
1847 auto BinOp = dyn_cast<IntrinsicInst>(Val: II.getOperand(i_nocapture: 0));
1848 if (!BinOp)
1849 return std::nullopt;
1850
1851 auto IntrinsicID = BinOp->getIntrinsicID();
1852 switch (IntrinsicID) {
1853 case Intrinsic::aarch64_sve_and_z:
1854 case Intrinsic::aarch64_sve_bic_z:
1855 case Intrinsic::aarch64_sve_eor_z:
1856 case Intrinsic::aarch64_sve_nand_z:
1857 case Intrinsic::aarch64_sve_nor_z:
1858 case Intrinsic::aarch64_sve_orn_z:
1859 case Intrinsic::aarch64_sve_orr_z:
1860 break;
1861 default:
1862 return std::nullopt;
1863 }
1864
1865 auto BinOpPred = BinOp->getOperand(i_nocapture: 0);
1866 auto BinOpOp1 = BinOp->getOperand(i_nocapture: 1);
1867 auto BinOpOp2 = BinOp->getOperand(i_nocapture: 2);
1868
1869 auto PredIntr = dyn_cast<IntrinsicInst>(Val: BinOpPred);
1870 if (!PredIntr ||
1871 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1872 return std::nullopt;
1873
1874 auto PredOp = PredIntr->getOperand(i_nocapture: 0);
1875 auto PredOpTy = cast<VectorType>(Val: PredOp->getType());
1876 if (PredOpTy != II.getType())
1877 return std::nullopt;
1878
1879 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1880 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1881 ID: Intrinsic::aarch64_sve_convert_from_svbool, Types: {PredOpTy}, Args: {BinOpOp1});
1882 NarrowedBinOpArgs.push_back(Elt: NarrowBinOpOp1);
1883 if (BinOpOp1 == BinOpOp2)
1884 NarrowedBinOpArgs.push_back(Elt: NarrowBinOpOp1);
1885 else
1886 NarrowedBinOpArgs.push_back(Elt: IC.Builder.CreateIntrinsic(
1887 ID: Intrinsic::aarch64_sve_convert_from_svbool, Types: {PredOpTy}, Args: {BinOpOp2}));
1888
1889 auto NarrowedBinOp =
1890 IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {PredOpTy}, Args: NarrowedBinOpArgs);
1891 return IC.replaceInstUsesWith(I&: II, V: NarrowedBinOp);
1892}
1893
1894static std::optional<Instruction *>
1895instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
1896 // If the reinterpret instruction operand is a PHI Node
1897 if (isa<PHINode>(Val: II.getArgOperand(i: 0)))
1898 return processPhiNode(IC, II);
1899
1900 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1901 return BinOpCombine;
1902
1903 // Ignore converts to/from svcount_t.
1904 if (isa<TargetExtType>(Val: II.getArgOperand(i: 0)->getType()) ||
1905 isa<TargetExtType>(Val: II.getType()))
1906 return std::nullopt;
1907
1908 SmallVector<Instruction *, 32> CandidatesForRemoval;
1909 Value *Cursor = II.getOperand(i_nocapture: 0), *EarliestReplacement = nullptr;
1910
1911 const auto *IVTy = cast<VectorType>(Val: II.getType());
1912
1913 // Walk the chain of conversions.
1914 while (Cursor) {
1915 // If the type of the cursor has fewer lanes than the final result, zeroing
1916 // must take place, which breaks the equivalence chain.
1917 const auto *CursorVTy = cast<VectorType>(Val: Cursor->getType());
1918 if (CursorVTy->getElementCount().getKnownMinValue() <
1919 IVTy->getElementCount().getKnownMinValue())
1920 break;
1921
1922 // If the cursor has the same type as I, it is a viable replacement.
1923 if (Cursor->getType() == IVTy)
1924 EarliestReplacement = Cursor;
1925
1926 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Val: Cursor);
1927
1928 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1929 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1930 Intrinsic::aarch64_sve_convert_to_svbool ||
1931 IntrinsicCursor->getIntrinsicID() ==
1932 Intrinsic::aarch64_sve_convert_from_svbool))
1933 break;
1934
1935 CandidatesForRemoval.insert(I: CandidatesForRemoval.begin(), Elt: IntrinsicCursor);
1936 Cursor = IntrinsicCursor->getOperand(i_nocapture: 0);
1937 }
1938
1939 // If no viable replacement in the conversion chain was found, there is
1940 // nothing to do.
1941 if (!EarliestReplacement)
1942 return std::nullopt;
1943
1944 return IC.replaceInstUsesWith(I&: II, V: EarliestReplacement);
1945}
1946
1947static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1948 IntrinsicInst &II) {
1949 // svsel(ptrue, x, y) => x
1950 auto *OpPredicate = II.getOperand(i_nocapture: 0);
1951 if (isAllActivePredicate(Pred: OpPredicate))
1952 return IC.replaceInstUsesWith(I&: II, V: II.getOperand(i_nocapture: 1));
1953
1954 auto Select =
1955 IC.Builder.CreateSelect(C: OpPredicate, True: II.getOperand(i_nocapture: 1), False: II.getOperand(i_nocapture: 2));
1956 return IC.replaceInstUsesWith(I&: II, V: Select);
1957}
1958
1959static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1960 IntrinsicInst &II) {
1961 Value *Pg = II.getOperand(i_nocapture: 1);
1962
1963 // sve.dup(V, all_active, X) ==> splat(X)
1964 if (isAllActivePredicate(Pred: Pg)) {
1965 auto *RetTy = cast<ScalableVectorType>(Val: II.getType());
1966 Value *Splat = IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(),
1967 V: II.getArgOperand(i: 2));
1968 return IC.replaceInstUsesWith(I&: II, V: Splat);
1969 }
1970
1971 if (!match(V: Pg, P: m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1972 Op0: m_SpecificInt(V: AArch64SVEPredPattern::vl1))))
1973 return std::nullopt;
1974
1975 // sve.dup(V, sve.ptrue(vl1), X) ==> insertelement V, X, 0
1976 Value *Insert = IC.Builder.CreateInsertElement(
1977 Vec: II.getArgOperand(i: 0), NewElt: II.getArgOperand(i: 2), Idx: uint64_t(0));
1978 return IC.replaceInstUsesWith(I&: II, V: Insert);
1979}
1980
1981static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1982 IntrinsicInst &II) {
1983 // Replace DupX with a regular IR splat.
1984 auto *RetTy = cast<ScalableVectorType>(Val: II.getType());
1985 Value *Splat = IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(),
1986 V: II.getArgOperand(i: 0));
1987 Splat->takeName(V: &II);
1988 return IC.replaceInstUsesWith(I&: II, V: Splat);
1989}
1990
1991static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1992 IntrinsicInst &II) {
1993 LLVMContext &Ctx = II.getContext();
1994
1995 if (!isAllActivePredicate(Pred: II.getArgOperand(i: 0)))
1996 return std::nullopt;
1997
1998 // Check that we have a compare of zero..
1999 auto *SplatValue =
2000 dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: II.getArgOperand(i: 2)));
2001 if (!SplatValue || !SplatValue->isZero())
2002 return std::nullopt;
2003
2004 // ..against a dupq
2005 auto *DupQLane = dyn_cast<IntrinsicInst>(Val: II.getArgOperand(i: 1));
2006 if (!DupQLane ||
2007 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
2008 return std::nullopt;
2009
2010 // Where the dupq is a lane 0 replicate of a vector insert
2011 auto *DupQLaneIdx = dyn_cast<ConstantInt>(Val: DupQLane->getArgOperand(i: 1));
2012 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
2013 return std::nullopt;
2014
2015 auto *VecIns = dyn_cast<IntrinsicInst>(Val: DupQLane->getArgOperand(i: 0));
2016 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
2017 return std::nullopt;
2018
2019 // Where the vector insert is a fixed constant vector insert into undef at
2020 // index zero
2021 if (!isa<UndefValue>(Val: VecIns->getArgOperand(i: 0)))
2022 return std::nullopt;
2023
2024 if (!cast<ConstantInt>(Val: VecIns->getArgOperand(i: 2))->isZero())
2025 return std::nullopt;
2026
2027 auto *ConstVec = dyn_cast<Constant>(Val: VecIns->getArgOperand(i: 1));
2028 if (!ConstVec)
2029 return std::nullopt;
2030
2031 auto *VecTy = dyn_cast<FixedVectorType>(Val: ConstVec->getType());
2032 auto *OutTy = dyn_cast<ScalableVectorType>(Val: II.getType());
2033 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
2034 return std::nullopt;
2035
2036 unsigned NumElts = VecTy->getNumElements();
2037 unsigned PredicateBits = 0;
2038
2039 // Expand intrinsic operands to a 16-bit byte level predicate
2040 for (unsigned I = 0; I < NumElts; ++I) {
2041 auto *Arg = dyn_cast<ConstantInt>(Val: ConstVec->getAggregateElement(Elt: I));
2042 if (!Arg)
2043 return std::nullopt;
2044 if (!Arg->isZero())
2045 PredicateBits |= 1 << (I * (16 / NumElts));
2046 }
2047
2048 // If all bits are zero bail early with an empty predicate
2049 if (PredicateBits == 0) {
2050 auto *PFalse = Constant::getNullValue(Ty: II.getType());
2051 PFalse->takeName(V: &II);
2052 return IC.replaceInstUsesWith(I&: II, V: PFalse);
2053 }
2054
2055 // Calculate largest predicate type used (where byte predicate is largest)
2056 unsigned Mask = 8;
2057 for (unsigned I = 0; I < 16; ++I)
2058 if ((PredicateBits & (1 << I)) != 0)
2059 Mask |= (I % 8);
2060
2061 unsigned PredSize = Mask & -Mask;
2062 auto *PredType = ScalableVectorType::get(
2063 ElementType: Type::getInt1Ty(C&: Ctx), MinNumElts: AArch64::SVEBitsPerBlock / (PredSize * 8));
2064
2065 // Ensure all relevant bits are set
2066 for (unsigned I = 0; I < 16; I += PredSize)
2067 if ((PredicateBits & (1 << I)) == 0)
2068 return std::nullopt;
2069
2070 auto *PTruePat =
2071 ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: AArch64SVEPredPattern::all);
2072 auto *PTrue = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue,
2073 Types: {PredType}, Args: {PTruePat});
2074 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
2075 ID: Intrinsic::aarch64_sve_convert_to_svbool, Types: {PredType}, Args: {PTrue});
2076 auto *ConvertFromSVBool =
2077 IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_convert_from_svbool,
2078 Types: {II.getType()}, Args: {ConvertToSVBool});
2079
2080 ConvertFromSVBool->takeName(V: &II);
2081 return IC.replaceInstUsesWith(I&: II, V: ConvertFromSVBool);
2082}
2083
2084static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
2085 IntrinsicInst &II) {
2086 Value *Pg = II.getArgOperand(i: 0);
2087 Value *Vec = II.getArgOperand(i: 1);
2088 auto IntrinsicID = II.getIntrinsicID();
2089 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
2090
2091 // lastX(splat(X)) --> X
2092 if (auto *SplatVal = getSplatValue(V: Vec))
2093 return IC.replaceInstUsesWith(I&: II, V: SplatVal);
2094
2095 // If x and/or y is a splat value then:
2096 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
2097 Value *LHS, *RHS;
2098 if (match(V: Vec, P: m_OneUse(SubPattern: m_BinOp(L: m_Value(V&: LHS), R: m_Value(V&: RHS))))) {
2099 if (isSplatValue(V: LHS) || isSplatValue(V: RHS)) {
2100 auto *OldBinOp = cast<BinaryOperator>(Val: Vec);
2101 auto OpC = OldBinOp->getOpcode();
2102 auto *NewLHS =
2103 IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {Vec->getType()}, Args: {Pg, LHS});
2104 auto *NewRHS =
2105 IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {Vec->getType()}, Args: {Pg, RHS});
2106 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
2107 Opc: OpC, V1: NewLHS, V2: NewRHS, CopyO: OldBinOp, Name: OldBinOp->getName(), InsertBefore: II.getIterator());
2108 return IC.replaceInstUsesWith(I&: II, V: NewBinOp);
2109 }
2110 }
2111
2112 auto *C = dyn_cast<Constant>(Val: Pg);
2113 if (IsAfter && C && C->isNullValue()) {
2114 // The intrinsic is extracting lane 0 so use an extract instead.
2115 auto *IdxTy = Type::getInt64Ty(C&: II.getContext());
2116 auto *Extract = ExtractElementInst::Create(Vec, Idx: ConstantInt::get(Ty: IdxTy, V: 0));
2117 Extract->insertBefore(InsertPos: II.getIterator());
2118 Extract->takeName(V: &II);
2119 return IC.replaceInstUsesWith(I&: II, V: Extract);
2120 }
2121
2122 auto *IntrPG = dyn_cast<IntrinsicInst>(Val: Pg);
2123 if (!IntrPG)
2124 return std::nullopt;
2125
2126 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
2127 return std::nullopt;
2128
2129 const auto PTruePattern =
2130 cast<ConstantInt>(Val: IntrPG->getOperand(i_nocapture: 0))->getZExtValue();
2131
2132 // Can the intrinsic's predicate be converted to a known constant index?
2133 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern: PTruePattern);
2134 if (!MinNumElts)
2135 return std::nullopt;
2136
2137 unsigned Idx = MinNumElts - 1;
2138 // Increment the index if extracting the element after the last active
2139 // predicate element.
2140 if (IsAfter)
2141 ++Idx;
2142
2143 // Ignore extracts whose index is larger than the known minimum vector
2144 // length. NOTE: This is an artificial constraint where we prefer to
2145 // maintain what the user asked for until an alternative is proven faster.
2146 auto *PgVTy = cast<ScalableVectorType>(Val: Pg->getType());
2147 if (Idx >= PgVTy->getMinNumElements())
2148 return std::nullopt;
2149
2150 // The intrinsic is extracting a fixed lane so use an extract instead.
2151 auto *IdxTy = Type::getInt64Ty(C&: II.getContext());
2152 auto *Extract = ExtractElementInst::Create(Vec, Idx: ConstantInt::get(Ty: IdxTy, V: Idx));
2153 Extract->insertBefore(InsertPos: II.getIterator());
2154 Extract->takeName(V: &II);
2155 return IC.replaceInstUsesWith(I&: II, V: Extract);
2156}
2157
2158static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2159 IntrinsicInst &II) {
2160 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2161 // integer variant across a variety of micro-architectures. Replace scalar
2162 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2163 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2164 // depending on the micro-architecture, but has been observed as generally
2165 // being faster, particularly when the CLAST[AB] op is a loop-carried
2166 // dependency.
2167 Value *Pg = II.getArgOperand(i: 0);
2168 Value *Fallback = II.getArgOperand(i: 1);
2169 Value *Vec = II.getArgOperand(i: 2);
2170 Type *Ty = II.getType();
2171
2172 if (!Ty->isIntegerTy())
2173 return std::nullopt;
2174
2175 Type *FPTy;
2176 switch (cast<IntegerType>(Val: Ty)->getBitWidth()) {
2177 default:
2178 return std::nullopt;
2179 case 16:
2180 FPTy = IC.Builder.getHalfTy();
2181 break;
2182 case 32:
2183 FPTy = IC.Builder.getFloatTy();
2184 break;
2185 case 64:
2186 FPTy = IC.Builder.getDoubleTy();
2187 break;
2188 }
2189
2190 Value *FPFallBack = IC.Builder.CreateBitCast(V: Fallback, DestTy: FPTy);
2191 auto *FPVTy = VectorType::get(
2192 ElementType: FPTy, EC: cast<VectorType>(Val: Vec->getType())->getElementCount());
2193 Value *FPVec = IC.Builder.CreateBitCast(V: Vec, DestTy: FPVTy);
2194 auto *FPII = IC.Builder.CreateIntrinsic(
2195 ID: II.getIntrinsicID(), Types: {FPVec->getType()}, Args: {Pg, FPFallBack, FPVec});
2196 Value *FPIItoInt = IC.Builder.CreateBitCast(V: FPII, DestTy: II.getType());
2197 return IC.replaceInstUsesWith(I&: II, V: FPIItoInt);
2198}
2199
2200static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2201 IntrinsicInst &II) {
2202 LLVMContext &Ctx = II.getContext();
2203 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2204 // can work with RDFFR_PP for ptest elimination.
2205 auto *AllPat =
2206 ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: AArch64SVEPredPattern::all);
2207 auto *PTrue = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue,
2208 Types: {II.getType()}, Args: {AllPat});
2209 auto *RDFFR =
2210 IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_rdffr_z, Args: {PTrue});
2211 RDFFR->takeName(V: &II);
2212 return IC.replaceInstUsesWith(I&: II, V: RDFFR);
2213}
2214
2215static std::optional<Instruction *>
2216instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
2217 const auto Pattern = cast<ConstantInt>(Val: II.getArgOperand(i: 0))->getZExtValue();
2218
2219 if (Pattern == AArch64SVEPredPattern::all) {
2220 Value *Cnt = IC.Builder.CreateElementCount(
2221 Ty: II.getType(), EC: ElementCount::getScalable(MinVal: NumElts));
2222 Cnt->takeName(V: &II);
2223 return IC.replaceInstUsesWith(I&: II, V: Cnt);
2224 }
2225
2226 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2227
2228 return MinNumElts && NumElts >= MinNumElts
2229 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2230 I&: II, V: ConstantInt::get(Ty: II.getType(), V: MinNumElts)))
2231 : std::nullopt;
2232}
2233
2234static std::optional<Instruction *>
2235instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II,
2236 const AArch64Subtarget *ST) {
2237 if (!ST->isStreaming())
2238 return std::nullopt;
2239
2240 // In streaming-mode, aarch64_sme_cntds is equivalent to aarch64_sve_cntd
2241 // with SVEPredPattern::all
2242 Value *Cnt =
2243 IC.Builder.CreateElementCount(Ty: II.getType(), EC: ElementCount::getScalable(MinVal: 2));
2244 Cnt->takeName(V: &II);
2245 return IC.replaceInstUsesWith(I&: II, V: Cnt);
2246}
2247
// Simplify SVE PTEST intrinsics: canonicalise PTEST_FIRST/LAST(X, X) to
// PTEST_ANY(X, X), look through matching convert.to.svbool casts on both
// operands, and rewrite PTEST_ANY(X=OP(PG,...), X) to PTEST_ANY(PG, X) so
// later passes can use the flag-setting form of OP and drop the PTEST.
static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  Value *PgVal = II.getArgOperand(i: 0);
  Value *OpVal = II.getArgOperand(i: 1);

  // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
  // Later optimizations prefer this form.
  if (PgVal == OpVal &&
      (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
       II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
    Value *Ops[] = {PgVal, OpVal};
    Type *Tys[] = {PgVal->getType()};

    auto *PTest =
        IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptest_any, Types: Tys, Args: Ops);
    PTest->takeName(V: &II);

    return IC.replaceInstUsesWith(I&: II, V: PTest);
  }

  IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(Val: PgVal);
  IntrinsicInst *Op = dyn_cast<IntrinsicInst>(Val: OpVal);

  if (!Pg || !Op)
    return std::nullopt;

  Intrinsic::ID OpIID = Op->getIntrinsicID();

  // PTEST(to_svbool(A), to_svbool(B)) -> PTEST(A, B) when A and B share the
  // same (narrower) predicate type; the widening casts do not change the
  // outcome of the test.
  if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
      OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
      Pg->getArgOperand(i: 0)->getType() == Op->getArgOperand(i: 0)->getType()) {
    Value *Ops[] = {Pg->getArgOperand(i: 0), Op->getArgOperand(i: 0)};
    Type *Tys[] = {Pg->getArgOperand(i: 0)->getType()};

    auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: Tys, Args: Ops);

    PTest->takeName(V: &II);
    return IC.replaceInstUsesWith(I&: II, V: PTest);
  }

  // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
  // Later optimizations may rewrite sequence to use the flag-setting variant
  // of instruction X to remove PTEST.
  // The list below enumerates the predicate-producing ops this holds for.
  if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
      ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
       (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
       (OpIID == Intrinsic::aarch64_sve_and_z) ||
       (OpIID == Intrinsic::aarch64_sve_bic_z) ||
       (OpIID == Intrinsic::aarch64_sve_eor_z) ||
       (OpIID == Intrinsic::aarch64_sve_nand_z) ||
       (OpIID == Intrinsic::aarch64_sve_nor_z) ||
       (OpIID == Intrinsic::aarch64_sve_orn_z) ||
       (OpIID == Intrinsic::aarch64_sve_orr_z))) {
    Value *Ops[] = {Pg->getArgOperand(i: 0), Pg};
    Type *Tys[] = {Pg->getType()};

    auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: Tys, Args: Ops);
    PTest->takeName(V: &II);

    return IC.replaceInstUsesWith(I&: II, V: PTest);
  }

  return std::nullopt;
}
2315
// Fuse a predicated multiply feeding a predicated add/sub into a single
// multiply-accumulate intrinsic.
//
// MulOpc is the multiply intrinsic to match; FuseOpc is the fused intrinsic
// to emit. When MergeIntoAddendOp is true, the multiply is operand 2 of II
// and the addend operand 1 (MLA/FMLA-style result); otherwise the roles are
// swapped (MAD/FMAD-style result). The multiply must use the same governing
// predicate as II and have no other users; for FP types the fast-math flags
// of both instructions must match and permit contraction.
template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
static std::optional<Instruction *>
instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
                                  bool MergeIntoAddendOp) {
  Value *P = II.getOperand(i_nocapture: 0);
  Value *MulOp0, *MulOp1, *AddendOp, *Mul;
  if (MergeIntoAddendOp) {
    AddendOp = II.getOperand(i_nocapture: 1);
    Mul = II.getOperand(i_nocapture: 2);
  } else {
    AddendOp = II.getOperand(i_nocapture: 2);
    Mul = II.getOperand(i_nocapture: 1);
  }

  // The multiply must be governed by the same predicate as the add/sub.
  if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(V: P), m_Value(V&: MulOp0),
                                      m_Value(V&: MulOp1))))
    return std::nullopt;

  // Fusing would duplicate the multiply if it had additional users.
  if (!Mul->hasOneUse())
    return std::nullopt;

  Instruction *FMFSource = nullptr;
  if (II.getType()->isFPOrFPVectorTy()) {
    llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
    // Stop the combine when the flags on the inputs differ in case dropping
    // flags would lead to us missing out on more beneficial optimizations.
    if (FAddFlags != cast<CallInst>(Val: Mul)->getFastMathFlags())
      return std::nullopt;
    // Contracting mul+add into a fused op requires the contract flag.
    if (!FAddFlags.allowContract())
      return std::nullopt;
    FMFSource = &II;
  }

  CallInst *Res;
  if (MergeIntoAddendOp)
    Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, Types: {II.getType()},
                                     Args: {P, AddendOp, MulOp0, MulOp1}, FMFSource);
  else
    Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, Types: {II.getType()},
                                     Args: {P, MulOp0, MulOp1, AddendOp}, FMFSource);

  return IC.replaceInstUsesWith(I&: II, V: Res);
}
2359
2360static std::optional<Instruction *>
2361instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2362 Value *Pred = II.getOperand(i_nocapture: 0);
2363 Value *PtrOp = II.getOperand(i_nocapture: 1);
2364 Type *VecTy = II.getType();
2365
2366 if (isAllActivePredicate(Pred)) {
2367 LoadInst *Load = IC.Builder.CreateLoad(Ty: VecTy, Ptr: PtrOp);
2368 Load->copyMetadata(SrcInst: II);
2369 return IC.replaceInstUsesWith(I&: II, V: Load);
2370 }
2371
2372 CallInst *MaskedLoad =
2373 IC.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL),
2374 Mask: Pred, PassThru: ConstantAggregateZero::get(Ty: VecTy));
2375 MaskedLoad->copyMetadata(SrcInst: II);
2376 return IC.replaceInstUsesWith(I&: II, V: MaskedLoad);
2377}
2378
2379static std::optional<Instruction *>
2380instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2381 Value *VecOp = II.getOperand(i_nocapture: 0);
2382 Value *Pred = II.getOperand(i_nocapture: 1);
2383 Value *PtrOp = II.getOperand(i_nocapture: 2);
2384
2385 if (isAllActivePredicate(Pred)) {
2386 StoreInst *Store = IC.Builder.CreateStore(Val: VecOp, Ptr: PtrOp);
2387 Store->copyMetadata(SrcInst: II);
2388 return IC.eraseInstFromFunction(I&: II);
2389 }
2390
2391 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2392 Val: VecOp, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL), Mask: Pred);
2393 MaskedStore->copyMetadata(SrcInst: II);
2394 return IC.eraseInstFromFunction(I&: II);
2395}
2396
2397static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
2398 switch (Intrinsic) {
2399 case Intrinsic::aarch64_sve_fmul_u:
2400 return Instruction::BinaryOps::FMul;
2401 case Intrinsic::aarch64_sve_fadd_u:
2402 return Instruction::BinaryOps::FAdd;
2403 case Intrinsic::aarch64_sve_fsub_u:
2404 return Instruction::BinaryOps::FSub;
2405 default:
2406 return Instruction::BinaryOpsEnd;
2407 }
2408}
2409
2410static std::optional<Instruction *>
2411instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
2412 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2413 if (II.isStrictFP())
2414 return std::nullopt;
2415
2416 auto *OpPredicate = II.getOperand(i_nocapture: 0);
2417 auto BinOpCode = intrinsicIDToBinOpCode(Intrinsic: II.getIntrinsicID());
2418 if (BinOpCode == Instruction::BinaryOpsEnd ||
2419 !isAllActivePredicate(Pred: OpPredicate))
2420 return std::nullopt;
2421 auto BinOp = IC.Builder.CreateBinOpFMF(
2422 Opc: BinOpCode, LHS: II.getOperand(i_nocapture: 1), RHS: II.getOperand(i_nocapture: 2), FMFSource: II.getFastMathFlags());
2423 return IC.replaceInstUsesWith(I&: II, V: BinOp);
2424}
2425
2426static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2427 IntrinsicInst &II) {
2428 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2429 Intrinsic::aarch64_sve_mla>(
2430 IC, II, MergeIntoAddendOp: true))
2431 return MLA;
2432 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2433 Intrinsic::aarch64_sve_mad>(
2434 IC, II, MergeIntoAddendOp: false))
2435 return MAD;
2436 return std::nullopt;
2437}
2438
2439static std::optional<Instruction *>
2440instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
2441 if (auto FMLA =
2442 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2443 Intrinsic::aarch64_sve_fmla>(IC, II,
2444 MergeIntoAddendOp: true))
2445 return FMLA;
2446 if (auto FMAD =
2447 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2448 Intrinsic::aarch64_sve_fmad>(IC, II,
2449 MergeIntoAddendOp: false))
2450 return FMAD;
2451 if (auto FMLA =
2452 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2453 Intrinsic::aarch64_sve_fmla>(IC, II,
2454 MergeIntoAddendOp: true))
2455 return FMLA;
2456 return std::nullopt;
2457}
2458
2459static std::optional<Instruction *>
2460instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
2461 if (auto FMLA =
2462 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2463 Intrinsic::aarch64_sve_fmla>(IC, II,
2464 MergeIntoAddendOp: true))
2465 return FMLA;
2466 if (auto FMAD =
2467 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2468 Intrinsic::aarch64_sve_fmad>(IC, II,
2469 MergeIntoAddendOp: false))
2470 return FMAD;
2471 if (auto FMLA_U =
2472 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2473 Intrinsic::aarch64_sve_fmla_u>(
2474 IC, II, MergeIntoAddendOp: true))
2475 return FMLA_U;
2476 return instCombineSVEVectorBinOp(IC, II);
2477}
2478
2479static std::optional<Instruction *>
2480instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
2481 if (auto FMLS =
2482 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2483 Intrinsic::aarch64_sve_fmls>(IC, II,
2484 MergeIntoAddendOp: true))
2485 return FMLS;
2486 if (auto FMSB =
2487 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2488 Intrinsic::aarch64_sve_fnmsb>(
2489 IC, II, MergeIntoAddendOp: false))
2490 return FMSB;
2491 if (auto FMLS =
2492 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2493 Intrinsic::aarch64_sve_fmls>(IC, II,
2494 MergeIntoAddendOp: true))
2495 return FMLS;
2496 return std::nullopt;
2497}
2498
2499static std::optional<Instruction *>
2500instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
2501 if (auto FMLS =
2502 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2503 Intrinsic::aarch64_sve_fmls>(IC, II,
2504 MergeIntoAddendOp: true))
2505 return FMLS;
2506 if (auto FMSB =
2507 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2508 Intrinsic::aarch64_sve_fnmsb>(
2509 IC, II, MergeIntoAddendOp: false))
2510 return FMSB;
2511 if (auto FMLS_U =
2512 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2513 Intrinsic::aarch64_sve_fmls_u>(
2514 IC, II, MergeIntoAddendOp: true))
2515 return FMLS_U;
2516 return instCombineSVEVectorBinOp(IC, II);
2517}
2518
2519static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2520 IntrinsicInst &II) {
2521 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2522 Intrinsic::aarch64_sve_mls>(
2523 IC, II, MergeIntoAddendOp: true))
2524 return MLS;
2525 return std::nullopt;
2526}
2527
2528static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2529 IntrinsicInst &II) {
2530 Value *UnpackArg = II.getArgOperand(i: 0);
2531 auto *RetTy = cast<ScalableVectorType>(Val: II.getType());
2532 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2533 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2534
2535 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2536 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2537 if (auto *ScalarArg = getSplatValue(V: UnpackArg)) {
2538 ScalarArg =
2539 IC.Builder.CreateIntCast(V: ScalarArg, DestTy: RetTy->getScalarType(), isSigned: IsSigned);
2540 Value *NewVal =
2541 IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(), V: ScalarArg);
2542 NewVal->takeName(V: &II);
2543 return IC.replaceInstUsesWith(I&: II, V: NewVal);
2544 }
2545
2546 return std::nullopt;
2547}
2548static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2549 IntrinsicInst &II) {
2550 auto *OpVal = II.getOperand(i_nocapture: 0);
2551 auto *OpIndices = II.getOperand(i_nocapture: 1);
2552 VectorType *VTy = cast<VectorType>(Val: II.getType());
2553
2554 // Check whether OpIndices is a constant splat value < minimal element count
2555 // of result.
2556 auto *SplatValue = dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: OpIndices));
2557 if (!SplatValue ||
2558 SplatValue->getValue().uge(RHS: VTy->getElementCount().getKnownMinValue()))
2559 return std::nullopt;
2560
2561 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2562 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2563 auto *Extract = IC.Builder.CreateExtractElement(Vec: OpVal, Idx: SplatValue);
2564 auto *VectorSplat =
2565 IC.Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Extract);
2566
2567 VectorSplat->takeName(V: &II);
2568 return IC.replaceInstUsesWith(I&: II, V: VectorSplat);
2569}
2570
// Fold uzp1 of two matching svbool round-trips into a direct concatenation
// <A, B> built with vector.insert operations.
static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
                                                       IntrinsicInst &II) {
  Value *A, *B;
  Type *RetTy = II.getType();
  constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
  constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;

  // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
  // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
  if ((match(V: II.getArgOperand(i: 0),
             P: m_Intrinsic<FromSVB>(Op0: m_Intrinsic<ToSVB>(Op0: m_Value(V&: A)))) &&
       match(V: II.getArgOperand(i: 1),
             P: m_Intrinsic<FromSVB>(Op0: m_Intrinsic<ToSVB>(Op0: m_Value(V&: B))))) ||
      (match(V: II.getArgOperand(i: 0), P: m_Intrinsic<ToSVB>(Op0: m_Value(V&: A))) &&
       match(V: II.getArgOperand(i: 1), P: m_Intrinsic<ToSVB>(Op0: m_Value(V&: B))))) {
    auto *TyA = cast<ScalableVectorType>(Val: A->getType());
    // The fold only applies when A and B have the same type and the result
    // holds exactly the two of them back to back.
    if (TyA == B->getType() &&
        RetTy == ScalableVectorType::getDoubleElementsVectorType(VTy: TyA)) {
      // Insert A at element 0 and B immediately after it.
      auto *SubVec = IC.Builder.CreateInsertVector(
          DstType: RetTy, SrcVec: PoisonValue::get(T: RetTy), SubVec: A, Idx: uint64_t(0));
      auto *ConcatVec = IC.Builder.CreateInsertVector(DstType: RetTy, SrcVec: SubVec, SubVec: B,
                                                      Idx: TyA->getMinNumElements());
      ConcatVec->takeName(V: &II);
      return IC.replaceInstUsesWith(I&: II, V: ConcatVec);
    }
  }

  return std::nullopt;
}
2600
2601static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2602 IntrinsicInst &II) {
2603 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2604 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2605 Value *A, *B;
2606 if (match(V: II.getArgOperand(i: 0),
2607 P: m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(Op0: m_Value(V&: A), Op1: m_Value(V&: B))) &&
2608 match(V: II.getArgOperand(i: 1), P: m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
2609 Op0: m_Specific(V: A), Op1: m_Specific(V: B))))
2610 return IC.replaceInstUsesWith(
2611 I&: II, V: (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2612
2613 return std::nullopt;
2614}
2615
2616static std::optional<Instruction *>
2617instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
2618 Value *Mask = II.getOperand(i_nocapture: 0);
2619 Value *BasePtr = II.getOperand(i_nocapture: 1);
2620 Value *Index = II.getOperand(i_nocapture: 2);
2621 Type *Ty = II.getType();
2622 Value *PassThru = ConstantAggregateZero::get(Ty);
2623
2624 // Contiguous gather => masked load.
2625 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2626 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2627 Value *IndexBase;
2628 if (match(V: Index, P: m_Intrinsic<Intrinsic::aarch64_sve_index>(
2629 Op0: m_Value(V&: IndexBase), Op1: m_SpecificInt(V: 1)))) {
2630 Align Alignment =
2631 BasePtr->getPointerAlignment(DL: II.getDataLayout());
2632
2633 Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(),
2634 Ptr: BasePtr, IdxList: IndexBase);
2635 CallInst *MaskedLoad =
2636 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2637 MaskedLoad->takeName(V: &II);
2638 return IC.replaceInstUsesWith(I&: II, V: MaskedLoad);
2639 }
2640
2641 return std::nullopt;
2642}
2643
2644static std::optional<Instruction *>
2645instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
2646 Value *Val = II.getOperand(i_nocapture: 0);
2647 Value *Mask = II.getOperand(i_nocapture: 1);
2648 Value *BasePtr = II.getOperand(i_nocapture: 2);
2649 Value *Index = II.getOperand(i_nocapture: 3);
2650 Type *Ty = Val->getType();
2651
2652 // Contiguous scatter => masked store.
2653 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2654 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2655 Value *IndexBase;
2656 if (match(V: Index, P: m_Intrinsic<Intrinsic::aarch64_sve_index>(
2657 Op0: m_Value(V&: IndexBase), Op1: m_SpecificInt(V: 1)))) {
2658 Align Alignment =
2659 BasePtr->getPointerAlignment(DL: II.getDataLayout());
2660
2661 Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(),
2662 Ptr: BasePtr, IdxList: IndexBase);
2663 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2664
2665 return IC.eraseInstFromFunction(I&: II);
2666 }
2667
2668 return std::nullopt;
2669}
2670
2671static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2672 IntrinsicInst &II) {
2673 Type *Int32Ty = IC.Builder.getInt32Ty();
2674 Value *Pred = II.getOperand(i_nocapture: 0);
2675 Value *Vec = II.getOperand(i_nocapture: 1);
2676 Value *DivVec = II.getOperand(i_nocapture: 2);
2677
2678 Value *SplatValue = getSplatValue(V: DivVec);
2679 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(Val: SplatValue);
2680 if (!SplatConstantInt)
2681 return std::nullopt;
2682
2683 APInt Divisor = SplatConstantInt->getValue();
2684 const int64_t DivisorValue = Divisor.getSExtValue();
2685 if (DivisorValue == -1)
2686 return std::nullopt;
2687 if (DivisorValue == 1)
2688 IC.replaceInstUsesWith(I&: II, V: Vec);
2689
2690 if (Divisor.isPowerOf2()) {
2691 Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2());
2692 auto ASRD = IC.Builder.CreateIntrinsic(
2693 ID: Intrinsic::aarch64_sve_asrd, Types: {II.getType()}, Args: {Pred, Vec, DivisorLog2});
2694 return IC.replaceInstUsesWith(I&: II, V: ASRD);
2695 }
2696 if (Divisor.isNegatedPowerOf2()) {
2697 Divisor.negate();
2698 Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2());
2699 auto ASRD = IC.Builder.CreateIntrinsic(
2700 ID: Intrinsic::aarch64_sve_asrd, Types: {II.getType()}, Args: {Pred, Vec, DivisorLog2});
2701 auto NEG = IC.Builder.CreateIntrinsic(
2702 ID: Intrinsic::aarch64_sve_neg, Types: {ASRD->getType()}, Args: {ASRD, Pred, ASRD});
2703 return IC.replaceInstUsesWith(I&: II, V: NEG);
2704 }
2705
2706 return std::nullopt;
2707}
2708
2709bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2710 size_t VecSize = Vec.size();
2711 if (VecSize == 1)
2712 return true;
2713 if (!isPowerOf2_64(Value: VecSize))
2714 return false;
2715 size_t HalfVecSize = VecSize / 2;
2716
2717 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2718 RHS != Vec.end(); LHS++, RHS++) {
2719 if (*LHS != nullptr && *RHS != nullptr) {
2720 if (*LHS == *RHS)
2721 continue;
2722 else
2723 return false;
2724 }
2725 if (!AllowPoison)
2726 return false;
2727 if (*LHS == nullptr && *RHS != nullptr)
2728 *LHS = *RHS;
2729 }
2730
2731 Vec.resize(N: HalfVecSize);
2732 SimplifyValuePattern(Vec, AllowPoison);
2733 return true;
2734}
2735
// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
// to dupqlane(f64(C)) where C is A concatenated with B
static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
                                                           IntrinsicInst &II) {
  // Expect the operand to be a fixed vector built by a chain of
  // insertelements that was widened into the scalable type via
  // vector.insert; otherwise the element values cannot be recovered.
  Value *CurrentInsertElt = nullptr, *Default = nullptr;
  if (!match(V: II.getOperand(i_nocapture: 0),
             P: m_Intrinsic<Intrinsic::vector_insert>(
                 Op0: m_Value(V&: Default), Op1: m_Value(V&: CurrentInsertElt), Op2: m_Value())) ||
      !isa<FixedVectorType>(Val: CurrentInsertElt->getType()))
    return std::nullopt;
  auto IIScalableTy = cast<ScalableVectorType>(Val: II.getType());

  // Insert the scalars into a container ordered by InsertElement index
  // (nullptr entries represent lanes never written, i.e. poison).
  SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
  while (auto InsertElt = dyn_cast<InsertElementInst>(Val: CurrentInsertElt)) {
    auto Idx = cast<ConstantInt>(Val: InsertElt->getOperand(i_nocapture: 2));
    Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(i_nocapture: 1);
    CurrentInsertElt = InsertElt->getOperand(i_nocapture: 0);
  }

  // Poison lanes may only be merged when both the chain base and the
  // vector.insert destination are poison.
  bool AllowPoison =
      isa<PoisonValue>(Val: CurrentInsertElt) && isa<PoisonValue>(Val: Default);
  if (!SimplifyValuePattern(Vec&: Elts, AllowPoison))
    return std::nullopt;

  // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
  Value *InsertEltChain = PoisonValue::get(T: CurrentInsertElt->getType());
  for (size_t I = 0; I < Elts.size(); I++) {
    if (Elts[I] == nullptr)
      continue;
    InsertEltChain = IC.Builder.CreateInsertElement(Vec: InsertEltChain, NewElt: Elts[I],
                                                    Idx: IC.Builder.getInt64(C: I));
  }
  if (InsertEltChain == nullptr)
    return std::nullopt;

  // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
  // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
  // be bitcast to a type wide enough to fit the sequence, be splatted, and then
  // be narrowed back to the original type.
  unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
  unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
                                 IIScalableTy->getMinNumElements() /
                                 PatternWidth;

  IntegerType *WideTy = IC.Builder.getIntNTy(N: PatternWidth);
  auto *WideScalableTy = ScalableVectorType::get(ElementType: WideTy, MinNumElts: PatternElementCount);
  auto *WideShuffleMaskTy =
      ScalableVectorType::get(ElementType: IC.Builder.getInt32Ty(), MinNumElts: PatternElementCount);

  // insert-subvector, then splat lane 0 of the wide view with a zero
  // shuffle mask, then cast back to the original element type.
  auto InsertSubvector = IC.Builder.CreateInsertVector(
      DstType: II.getType(), SrcVec: PoisonValue::get(T: II.getType()), SubVec: InsertEltChain,
      Idx: uint64_t(0));
  auto WideBitcast =
      IC.Builder.CreateBitOrPointerCast(V: InsertSubvector, DestTy: WideScalableTy);
  auto WideShuffleMask = ConstantAggregateZero::get(Ty: WideShuffleMaskTy);
  auto WideShuffle = IC.Builder.CreateShuffleVector(
      V1: WideBitcast, V2: PoisonValue::get(T: WideScalableTy), Mask: WideShuffleMask);
  auto NarrowBitcast =
      IC.Builder.CreateBitOrPointerCast(V: WideShuffle, DestTy: II.getType());

  return IC.replaceInstUsesWith(I&: II, V: NarrowBitcast);
}
2799
2800static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2801 IntrinsicInst &II) {
2802 Value *A = II.getArgOperand(i: 0);
2803 Value *B = II.getArgOperand(i: 1);
2804 if (A == B)
2805 return IC.replaceInstUsesWith(I&: II, V: A);
2806
2807 return std::nullopt;
2808}
2809
// srshl (signed rounding shift left) on a known-non-negative value with a
// non-negative shift amount behaves exactly like lsl, which is simpler for
// later passes.
static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  Value *Pred = II.getOperand(i_nocapture: 0);
  Value *Vec = II.getOperand(i_nocapture: 1);
  Value *Shift = II.getOperand(i_nocapture: 2);

  // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
  Value *AbsPred, *MergedValue;
  if (!match(V: Vec, P: m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
                       Op0: m_Value(V&: MergedValue), Op1: m_Value(V&: AbsPred), Op2: m_Value())) &&
      !match(V: Vec, P: m_Intrinsic<Intrinsic::aarch64_sve_abs>(
                       Op0: m_Value(V&: MergedValue), Op1: m_Value(V&: AbsPred), Op2: m_Value())))

    return std::nullopt;

  // Transform is valid if any of the following are true:
  // * The ABS merge value is an undef or non-negative
  // * The ABS predicate is all active
  // * The ABS predicate and the SRSHL predicates are the same
  // (any of these guarantees every lane SRSHL sees is non-negative).
  if (!isa<UndefValue>(Val: MergedValue) && !match(V: MergedValue, P: m_NonNegative()) &&
      AbsPred != Pred && !isAllActivePredicate(Pred: AbsPred))
    return std::nullopt;

  // Only valid when the shift amount is non-negative, otherwise the rounding
  // behaviour of SRSHL cannot be ignored.
  if (!match(V: Shift, P: m_NonNegative()))
    return std::nullopt;

  auto LSL = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_lsl,
                                        Types: {II.getType()}, Args: {Pred, Vec, Shift});

  return IC.replaceInstUsesWith(I&: II, V: LSL);
}
2843
2844static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2845 IntrinsicInst &II) {
2846 Value *Vec = II.getOperand(i_nocapture: 0);
2847
2848 if (getSplatValue(V: Vec) == II.getOperand(i_nocapture: 1))
2849 return IC.replaceInstUsesWith(I&: II, V: Vec);
2850
2851 return std::nullopt;
2852}
2853
// Remove a DMB barrier that is followed — within a bounded lookahead over
// memory-silent instructions, possibly continuing into a unique successor
// block — by an identical DMB: the later barrier already provides the
// ordering the first one would.
static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
                                                   IntrinsicInst &II) {
  // If this barrier is post-dominated by identical one we can remove it
  auto *NI = II.getNextNode();
  unsigned LookaheadThreshold = DMBLookaheadThreshold;
  // Instructions that neither touch memory nor have side effects cannot
  // interact with the barrier, so they may be stepped over.
  auto CanSkipOver = [](Instruction *I) {
    return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
  };
  while (LookaheadThreshold-- && CanSkipOver(NI)) {
    auto *NIBB = NI->getParent();
    NI = NI->getNextNode();
    if (!NI) {
      // Fell off the end of the block: only a unique successor lets us keep
      // scanning; with multiple successors the barrier must stay.
      if (auto *SuccBB = NIBB->getUniqueSuccessor())
        NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
      else
        break;
    }
  }
  auto *NextII = dyn_cast_or_null<IntrinsicInst>(Val: NI);
  if (NextII && II.isIdenticalTo(I: NextII))
    return IC.eraseInstFromFunction(I&: II);

  return std::nullopt;
}
2878
2879static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
2880 IntrinsicInst &II) {
2881 return IC.replaceInstUsesWith(
2882 I&: II,
2883 V: IC.Builder.CreateIntrinsic(ID: Intrinsic::get_active_lane_mask,
2884 Types: {II.getType(), II.getOperand(i_nocapture: 0)->getType()},
2885 Args: {II.getOperand(i_nocapture: 0), II.getOperand(i_nocapture: 1)}));
2886}
2887
2888static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2889 IntrinsicInst &II) {
2890 if (match(V: II.getOperand(i_nocapture: 0), P: m_ConstantInt<AArch64SVEPredPattern::all>()))
2891 return IC.replaceInstUsesWith(I&: II, V: Constant::getAllOnesValue(Ty: II.getType()));
2892 return std::nullopt;
2893}
2894
2895static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2896 IntrinsicInst &II,
2897 unsigned NumBits) {
2898 Value *Passthru = II.getOperand(i_nocapture: 0);
2899 Value *Pg = II.getOperand(i_nocapture: 1);
2900 Value *Op = II.getOperand(i_nocapture: 2);
2901
2902 // Convert UXT[BHW] to AND.
2903 if (isa<UndefValue>(Val: Passthru) || isAllActivePredicate(Pred: Pg)) {
2904 auto *Ty = cast<VectorType>(Val: II.getType());
2905 auto MaskValue = APInt::getLowBitsSet(numBits: Ty->getScalarSizeInBits(), loBitsSet: NumBits);
2906 auto *Mask = ConstantInt::get(Ty, V: MaskValue);
2907 auto *And = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_and_u, Types: {Ty},
2908 Args: {Pg, Op, Mask});
2909 return IC.replaceInstUsesWith(I&: II, V: And);
2910 }
2911
2912 return std::nullopt;
2913}
2914
2915static std::optional<Instruction *>
2916instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II) {
2917 SMEAttrs FnSMEAttrs(*II.getFunction());
2918 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
2919 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
2920 return IC.replaceInstUsesWith(
2921 I&: II, V: ConstantInt::getBool(Ty: II.getType(), V: IsStreaming));
2922 return std::nullopt;
2923}
2924
/// AArch64-specific instcombine hook. First runs the table-driven SVE
/// intrinsic simplifier, then dispatches to a per-intrinsic combine routine.
/// Returning std::nullopt means no target-specific fold applied.
std::optional<Instruction *>
AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
                                     IntrinsicInst &II) const {
  const SVEIntrinsicInfo &IInfo = constructSVEIntrinsicInfo(II);
  if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
    return I;

  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::aarch64_dmb:
    return instCombineDMB(IC, II);
  case Intrinsic::aarch64_neon_fmaxnm:
  case Intrinsic::aarch64_neon_fminnm:
    return instCombineMaxMinNM(IC, II);
  case Intrinsic::aarch64_sve_convert_from_svbool:
    return instCombineConvertFromSVBool(IC, II);
  case Intrinsic::aarch64_sve_dup:
    return instCombineSVEDup(IC, II);
  case Intrinsic::aarch64_sve_dup_x:
    return instCombineSVEDupX(IC, II);
  case Intrinsic::aarch64_sve_cmpne:
  case Intrinsic::aarch64_sve_cmpne_wide:
    return instCombineSVECmpNE(IC, II);
  case Intrinsic::aarch64_sve_rdffr:
    return instCombineRDFFR(IC, II);
  case Intrinsic::aarch64_sve_lasta:
  case Intrinsic::aarch64_sve_lastb:
    return instCombineSVELast(IC, II);
  case Intrinsic::aarch64_sve_clasta_n:
  case Intrinsic::aarch64_sve_clastb_n:
    return instCombineSVECondLast(IC, II);
  // cnt[dwhb]: NumElts is the number of elements per 128 bits.
  case Intrinsic::aarch64_sve_cntd:
    return instCombineSVECntElts(IC, II, NumElts: 2);
  case Intrinsic::aarch64_sve_cntw:
    return instCombineSVECntElts(IC, II, NumElts: 4);
  case Intrinsic::aarch64_sve_cnth:
    return instCombineSVECntElts(IC, II, NumElts: 8);
  case Intrinsic::aarch64_sve_cntb:
    return instCombineSVECntElts(IC, II, NumElts: 16);
  case Intrinsic::aarch64_sme_cntsd:
    return instCombineSMECntsd(IC, II, ST);
  case Intrinsic::aarch64_sve_ptest_any:
  case Intrinsic::aarch64_sve_ptest_first:
  case Intrinsic::aarch64_sve_ptest_last:
    return instCombineSVEPTest(IC, II);
  // Arithmetic combines: mostly mul+add/sub fusion into mla/mad/mls/fmla/...
  case Intrinsic::aarch64_sve_fadd:
    return instCombineSVEVectorFAdd(IC, II);
  case Intrinsic::aarch64_sve_fadd_u:
    return instCombineSVEVectorFAddU(IC, II);
  case Intrinsic::aarch64_sve_fmul_u:
    return instCombineSVEVectorBinOp(IC, II);
  case Intrinsic::aarch64_sve_fsub:
    return instCombineSVEVectorFSub(IC, II);
  case Intrinsic::aarch64_sve_fsub_u:
    return instCombineSVEVectorFSubU(IC, II);
  case Intrinsic::aarch64_sve_add:
    return instCombineSVEVectorAdd(IC, II);
  case Intrinsic::aarch64_sve_add_u:
    return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
                                             Intrinsic::aarch64_sve_mla_u>(
        IC, II, MergeIntoAddendOp: true);
  case Intrinsic::aarch64_sve_sub:
    return instCombineSVEVectorSub(IC, II);
  case Intrinsic::aarch64_sve_sub_u:
    return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
                                             Intrinsic::aarch64_sve_mls_u>(
        IC, II, MergeIntoAddendOp: true);
  case Intrinsic::aarch64_sve_tbl:
    return instCombineSVETBL(IC, II);
  case Intrinsic::aarch64_sve_uunpkhi:
  case Intrinsic::aarch64_sve_uunpklo:
  case Intrinsic::aarch64_sve_sunpkhi:
  case Intrinsic::aarch64_sve_sunpklo:
    return instCombineSVEUnpack(IC, II);
  case Intrinsic::aarch64_sve_uzp1:
    return instCombineSVEUzp1(IC, II);
  case Intrinsic::aarch64_sve_zip1:
  case Intrinsic::aarch64_sve_zip2:
    return instCombineSVEZip(IC, II);
  case Intrinsic::aarch64_sve_ld1_gather_index:
    return instCombineLD1GatherIndex(IC, II);
  case Intrinsic::aarch64_sve_st1_scatter_index:
    return instCombineST1ScatterIndex(IC, II);
  case Intrinsic::aarch64_sve_ld1:
    return instCombineSVELD1(IC, II, DL);
  case Intrinsic::aarch64_sve_st1:
    return instCombineSVEST1(IC, II, DL);
  case Intrinsic::aarch64_sve_sdiv:
    return instCombineSVESDIV(IC, II);
  case Intrinsic::aarch64_sve_sel:
    return instCombineSVESel(IC, II);
  case Intrinsic::aarch64_sve_srshl:
    return instCombineSVESrshl(IC, II);
  case Intrinsic::aarch64_sve_dupq_lane:
    return instCombineSVEDupqLane(IC, II);
  case Intrinsic::aarch64_sve_insr:
    return instCombineSVEInsr(IC, II);
  case Intrinsic::aarch64_sve_whilelo:
    return instCombineWhilelo(IC, II);
  case Intrinsic::aarch64_sve_ptrue:
    return instCombinePTrue(IC, II);
  // uxt[bhw]: NumBits is the width of the zero-extended field.
  case Intrinsic::aarch64_sve_uxtb:
    return instCombineSVEUxt(IC, II, NumBits: 8);
  case Intrinsic::aarch64_sve_uxth:
    return instCombineSVEUxt(IC, II, NumBits: 16);
  case Intrinsic::aarch64_sve_uxtw:
    return instCombineSVEUxt(IC, II, NumBits: 32);
  case Intrinsic::aarch64_sme_in_streaming_mode:
    return instCombineInStreamingMode(IC, II);
  }

  return std::nullopt;
}
3040
/// For the listed NEON narrowing/saturating intrinsics the demanded lanes of
/// the result correspond directly to the lanes of operand 0, so the demand
/// is simply forwarded to that operand via SimplifyAndSetOp. No value is
/// simplified here directly (always returns std::nullopt).
std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
    APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::aarch64_neon_fcvtxn:
  case Intrinsic::aarch64_neon_rshrn:
  case Intrinsic::aarch64_neon_sqrshrn:
  case Intrinsic::aarch64_neon_sqrshrun:
  case Intrinsic::aarch64_neon_sqshrn:
  case Intrinsic::aarch64_neon_sqshrun:
  case Intrinsic::aarch64_neon_sqxtn:
  case Intrinsic::aarch64_neon_sqxtun:
  case Intrinsic::aarch64_neon_uqrshrn:
  case Intrinsic::aarch64_neon_uqshrn:
  case Intrinsic::aarch64_neon_uqxtn:
    SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
    break;
  }

  return std::nullopt;
}
3066
3067bool AArch64TTIImpl::enableScalableVectorization() const {
3068 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3069 EnableScalableAutovecInStreamingMode);
3070}
3071
3072TypeSize
3073AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
3074 switch (K) {
3075 case TargetTransformInfo::RGK_Scalar:
3076 return TypeSize::getFixed(ExactSize: 64);
3077 case TargetTransformInfo::RGK_FixedWidthVector:
3078 if (ST->useSVEForFixedLengthVectors() &&
3079 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
3080 return TypeSize::getFixed(
3081 ExactSize: std::max(a: ST->getMinSVEVectorSizeInBits(), b: 128u));
3082 else if (ST->isNeonAvailable())
3083 return TypeSize::getFixed(ExactSize: 128);
3084 else
3085 return TypeSize::getFixed(ExactSize: 0);
3086 case TargetTransformInfo::RGK_ScalableVector:
3087 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3088 EnableScalableAutovecInStreamingMode))
3089 return TypeSize::getScalable(MinimumSize: 128);
3090 else
3091 return TypeSize::getScalable(MinimumSize: 0);
3092 }
3093 llvm_unreachable("Unsupported register kind");
3094}
3095
// Returns true when an add/sub whose *single* extended operand lets it lower
// to a NEON widening instruction (UADDW/SADDW/USUBW/SSUBW style), making that
// extend free. SrcOverrideTy, when non-null, supplies the pre-extend source
// type instead of reading it from the matched extend's operand (used by the
// cast-cost code when the visible operand type differs).
bool AArch64TTIImpl::isSingleExtWideningInstruction(
    unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
    Type *SrcOverrideTy) const {
  // A helper that returns a vector type from the given type. The number of
  // elements in type Ty determines the vector width.
  auto toVectorTy = [&](Type *ArgTy) {
    return VectorType::get(ElementType: ArgTy->getScalarType(),
                           EC: cast<VectorType>(Val: DstTy)->getElementCount());
  };

  // Exit early if DstTy is not a vector type whose elements are one of [i16,
  // i32, i64]. SVE doesn't generally have the same set of instructions to
  // perform an extend with the add/sub/mul. There are SMULLB style
  // instructions, but they operate on top/bottom, requiring some sort of lane
  // interleaving to be used with zext/sext.
  unsigned DstEltSize = DstTy->getScalarSizeInBits();
  if (!useNeonVector(Ty: DstTy) || Args.size() != 2 ||
      (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
    return false;

  Type *SrcTy = SrcOverrideTy;
  switch (Opcode) {
  case Instruction::Add: // UADDW(2), SADDW(2).
  case Instruction::Sub: { // USUBW(2), SSUBW(2).
    // The second operand needs to be an extend
    if (isa<SExtInst>(Val: Args[1]) || isa<ZExtInst>(Val: Args[1])) {
      if (!SrcTy)
        SrcTy =
            toVectorTy(cast<Instruction>(Val: Args[1])->getOperand(i: 0)->getType());
      break;
    }

    // USUBW only widens its second operand, so a sub with the extend on the
    // first operand cannot use the widening form.
    if (Opcode == Instruction::Sub)
      return false;

    // UADDW(2), SADDW(2) can be commutted.
    if (isa<SExtInst>(Val: Args[0]) || isa<ZExtInst>(Val: Args[0])) {
      if (!SrcTy)
        SrcTy =
            toVectorTy(cast<Instruction>(Val: Args[0])->getOperand(i: 0)->getType());
      break;
    }
    return false;
  }
  default:
    return false;
  }

  // Legalize the destination type and ensure it can be used in a widening
  // operation.
  auto DstTyL = getTypeLegalizationCost(Ty: DstTy);
  // Reject if legalization changed the element size (e.g. promotion), since
  // the widening relationship below is computed on the original element size.
  if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
    return false;

  // Legalize the source type and ensure it can be used in a widening
  // operation.
  assert(SrcTy && "Expected some SrcTy");
  auto SrcTyL = getTypeLegalizationCost(Ty: SrcTy);
  unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
  if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
    return false;

  // Get the total number of vector elements in the legalized types.
  InstructionCost NumDstEls =
      DstTyL.first * DstTyL.second.getVectorMinNumElements();
  InstructionCost NumSrcEls =
      SrcTyL.first * SrcTyL.second.getVectorMinNumElements();

  // Return true if the legalized types have the same number of vector elements
  // and the destination element type size is twice that of the source type.
  return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
}
3168
// If an add/sub/mul with *both* operands extended can be lowered as a NEON
// widening instruction (UADDL/SADDL/USUBL/SSUBL/UMULL/SMULL style), return
// the widened type the operation would execute in; otherwise return nullptr.
// SrcOverrideTy, when non-null, overrides the extends' source scalar size
// (used by the cast-cost code). The returned type lets the caller cost the
// operation at the narrower width and treat the extends as free.
Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
                                                  ArrayRef<const Value *> Args,
                                                  Type *SrcOverrideTy) const {
  if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
      Opcode != Instruction::Mul)
    return nullptr;

  // Exit early if DstTy is not a vector type whose elements are one of [i16,
  // i32, i64]. SVE doesn't generally have the same set of instructions to
  // perform an extend with the add/sub/mul. There are SMULLB style
  // instructions, but they operate on top/bottom, requiring some sort of lane
  // interleaving to be used with zext/sext.
  unsigned DstEltSize = DstTy->getScalarSizeInBits();
  if (!useNeonVector(Ty: DstTy) || Args.size() != 2 ||
      (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
    return nullptr;

  // Scalar size of an extend's input, honouring SrcOverrideTy when provided.
  auto getScalarSizeWithOverride = [&](const Value *V) {
    if (SrcOverrideTy)
      return SrcOverrideTy->getScalarSizeInBits();
    return cast<Instruction>(Val: V)
        ->getOperand(i: 0)
        ->getType()
        ->getScalarSizeInBits();
  };

  unsigned MaxEltSize = 0;
  if ((isa<SExtInst>(Val: Args[0]) && isa<SExtInst>(Val: Args[1])) ||
      (isa<ZExtInst>(Val: Args[0]) && isa<ZExtInst>(Val: Args[1]))) {
    // Both extends have the same signedness: the op can widen from the larger
    // of the two source widths.
    unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
    unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
    MaxEltSize = std::max(a: EltSize0, b: EltSize1);
  } else if (isa<SExtInst, ZExtInst>(Val: Args[0]) &&
             isa<SExtInst, ZExtInst>(Val: Args[1])) {
    unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
    unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
    // mul(sext, zext) will become smull(sext, zext) if the extends are large
    // enough.
    if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
      return nullptr;
    MaxEltSize = DstEltSize / 2;
  } else if (Opcode == Instruction::Mul &&
             (isa<ZExtInst>(Val: Args[0]) || isa<ZExtInst>(Val: Args[1]))) {
    // If one of the operands is a Zext and the other has enough zero bits
    // to be treated as unsigned, we can still generate a umull, meaning the
    // zext is free.
    KnownBits Known =
        computeKnownBits(V: isa<ZExtInst>(Val: Args[0]) ? Args[1] : Args[0], DL);
    if (Args[0]->getType()->getScalarSizeInBits() -
            Known.Zero.countLeadingOnes() >
        DstTy->getScalarSizeInBits() / 2)
      return nullptr;

    MaxEltSize =
        getScalarSizeWithOverride(isa<ZExtInst>(Val: Args[0]) ? Args[0] : Args[1]);
  } else
    return nullptr;

  // The widening op doubles the element size; it must not overshoot DstTy.
  if (MaxEltSize * 2 > DstEltSize)
    return nullptr;

  Type *ExtTy = DstTy->getWithNewBitWidth(NewBitWidth: MaxEltSize * 2);
  // NOTE(review): types of 64 bits or less are rejected — presumably because
  // the operation then fits in a 64-bit vector without needing the widening
  // form; confirm against the callers' cost handling.
  if (ExtTy->getPrimitiveSizeInBits() <= 64)
    return nullptr;
  return ExtTy;
}
3235
3236// s/urhadd instructions implement the following pattern, making the
3237// extends free:
3238// %x = add ((zext i8 -> i16), 1)
3239// %y = (zext i8 -> i16)
3240// trunc i16 (lshr (add %x, %y), 1) -> i8
3241//
// Returns true when the extend feeding ExtUser is part of a rounding-halving
// add pattern (see the comment above) that lowers to s/urhadd, in which case
// the extend itself is free. Dst/Src are the extend's destination and source
// types.
bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
                                        Type *Src) const {
  // The source should be a legal vector type.
  if (!Src->isVectorTy() || !TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: Src)) ||
      (Src->isScalableTy() && !ST->hasSVE2()))
    return false;

  // The extend must feed exactly one add.
  if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
    return false;

  // Look for trunc/shl/add before trying to match the pattern.
  const Instruction *Add = ExtUser;
  // The +1 may be folded into a second add; if so, descend to it so the
  // lshr/trunc checks below see the outermost add.
  auto *AddUser =
      dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser());
  if (AddUser && AddUser->getOpcode() == Instruction::Add)
    Add = AddUser;

  auto *Shr = dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser());
  if (!Shr || Shr->getOpcode() != Instruction::LShr)
    return false;

  // The truncate must bring the value back to the original element width.
  auto *Trunc = dyn_cast_or_null<Instruction>(Val: Shr->getUniqueUndroppableUser());
  if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
      Src->getScalarSizeInBits() !=
          cast<CastInst>(Val: Trunc)->getDestTy()->getScalarSizeInBits())
    return false;

  // Try to match the whole pattern. Ext could be either the first or second
  // m_ZExtOrSExt matched.
  Instruction *Ex1, *Ex2;
  if (!(match(V: Add, P: m_c_Add(L: m_Instruction(I&: Ex1),
                      R: m_c_Add(L: m_Instruction(I&: Ex2), R: m_SpecificInt(V: 1))))))
    return false;

  // Ensure both extends are of the same type
  if (match(V: Ex1, P: m_ZExtOrSExt(Op: m_Value())) &&
      Ex1->getOpcode() == Ex2->getOpcode())
    return true;

  return false;
}
3283
3284InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
3285 Type *Src,
3286 TTI::CastContextHint CCH,
3287 TTI::TargetCostKind CostKind,
3288 const Instruction *I) const {
3289 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3290 assert(ISD && "Invalid opcode");
3291 // If the cast is observable, and it is used by a widening instruction (e.g.,
3292 // uaddl, saddw, etc.), it may be free.
3293 if (I && I->hasOneUser()) {
3294 auto *SingleUser = cast<Instruction>(Val: *I->user_begin());
3295 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3296 if (Type *ExtTy = isBinExtWideningInstruction(
3297 Opcode: SingleUser->getOpcode(), DstTy: Dst, Args: Operands,
3298 SrcOverrideTy: Src != I->getOperand(i: 0)->getType() ? Src : nullptr)) {
3299 // The cost from Src->Src*2 needs to be added if required, the cost from
3300 // Src*2->ExtTy is free.
3301 if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
3302 Type *DoubleSrcTy =
3303 Src->getWithNewBitWidth(NewBitWidth: Src->getScalarSizeInBits() * 2);
3304 return getCastInstrCost(Opcode, Dst: DoubleSrcTy, Src,
3305 CCH: TTI::CastContextHint::None, CostKind);
3306 }
3307
3308 return 0;
3309 }
3310
3311 if (isSingleExtWideningInstruction(
3312 Opcode: SingleUser->getOpcode(), DstTy: Dst, Args: Operands,
3313 SrcOverrideTy: Src != I->getOperand(i: 0)->getType() ? Src : nullptr)) {
3314 // For adds only count the second operand as free if both operands are
3315 // extends but not the same operation. (i.e both operands are not free in
3316 // add(sext, zext)).
3317 if (SingleUser->getOpcode() == Instruction::Add) {
3318 if (I == SingleUser->getOperand(i: 1) ||
3319 (isa<CastInst>(Val: SingleUser->getOperand(i: 1)) &&
3320 cast<CastInst>(Val: SingleUser->getOperand(i: 1))->getOpcode() == Opcode))
3321 return 0;
3322 } else {
3323 // Others are free so long as isSingleExtWideningInstruction
3324 // returned true.
3325 return 0;
3326 }
3327 }
3328
3329 // The cast will be free for the s/urhadd instructions
3330 if ((isa<ZExtInst>(Val: I) || isa<SExtInst>(Val: I)) &&
3331 isExtPartOfAvgExpr(ExtUser: SingleUser, Dst, Src))
3332 return 0;
3333 }
3334
3335 EVT SrcTy = TLI->getValueType(DL, Ty: Src);
3336 EVT DstTy = TLI->getValueType(DL, Ty: Dst);
3337
3338 if (!SrcTy.isSimple() || !DstTy.isSimple())
3339 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3340
3341 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3342 // we use fcvtx under SVE2. Give them invalid costs.
3343 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3344 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3345 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3346 return InstructionCost::getInvalid();
3347
3348 static const TypeConversionCostTblEntry BF16Tbl[] = {
3349 {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f32, .Cost: 1}, // bfcvt
3350 {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f64, .Cost: 1}, // bfcvt
3351 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f32, .Cost: 1}, // bfcvtn
3352 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f32, .Cost: 2}, // bfcvtn+bfcvtn2
3353 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f64, .Cost: 2}, // bfcvtn+fcvtn
3354 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f64, .Cost: 3}, // fcvtn+fcvtl2+bfcvtn
3355 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f64, .Cost: 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3356 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2bf16, .Src: MVT::nxv2f32, .Cost: 1}, // bfcvt
3357 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4bf16, .Src: MVT::nxv4f32, .Cost: 1}, // bfcvt
3358 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8bf16, .Src: MVT::nxv8f32, .Cost: 3}, // bfcvt+bfcvt+uzp1
3359 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2bf16, .Src: MVT::nxv2f64, .Cost: 2}, // fcvtx+bfcvt
3360 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4bf16, .Src: MVT::nxv4f64, .Cost: 5}, // 2*fcvtx+2*bfcvt+uzp1
3361 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8bf16, .Src: MVT::nxv8f64, .Cost: 11}, // 4*fcvt+4*bfcvt+3*uzp
3362 };
3363
3364 if (ST->hasBF16())
3365 if (const auto *Entry = ConvertCostTableLookup(
3366 Table: BF16Tbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
3367 return Entry->Cost;
3368
3369 // Symbolic constants for the SVE sitofp/uitofp entries in the table below
3370 // The cost of unpacking twice is artificially increased for now in order
3371 // to avoid regressions against NEON, which will use tbl instructions directly
3372 // instead of multiple layers of [s|u]unpk[lo|hi].
3373 // We use the unpacks in cases where the destination type is illegal and
3374 // requires splitting of the input, even if the input type itself is legal.
3375 const unsigned int SVE_EXT_COST = 1;
3376 const unsigned int SVE_FCVT_COST = 1;
3377 const unsigned int SVE_UNPACK_ONCE = 4;
3378 const unsigned int SVE_UNPACK_TWICE = 16;
3379
3380 static const TypeConversionCostTblEntry ConversionTbl[] = {
3381 {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i64, .Cost: 1}, // xtn
3382 {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i16, .Src: MVT::v2i64, .Cost: 1}, // xtn
3383 {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i32, .Src: MVT::v2i64, .Cost: 1}, // xtn
3384 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i32, .Cost: 1}, // xtn
3385 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i64, .Cost: 3}, // 2 xtn + 1 uzp1
3386 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i32, .Cost: 1}, // xtn
3387 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i64, .Cost: 2}, // 1 uzp1 + 1 xtn
3388 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: 1}, // 1 uzp1
3389 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i16, .Cost: 1}, // 1 xtn
3390 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i32, .Cost: 2}, // 1 uzp1 + 1 xtn
3391 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i64, .Cost: 4}, // 3 x uzp1 + xtn
3392 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: 1}, // 1 uzp1
3393 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i64, .Cost: 3}, // 3 x uzp1
3394 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i32, .Src: MVT::v8i64, .Cost: 2}, // 2 x uzp1
3395 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: 1}, // uzp1
3396 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: 3}, // (2 + 1) x uzp1
3397 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i64, .Cost: 7}, // (4 + 2 + 1) x uzp1
3398 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: 2}, // 2 x uzp1
3399 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i64, .Cost: 6}, // (4 + 2) x uzp1
3400 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i32, .Src: MVT::v16i64, .Cost: 4}, // 4 x uzp1
3401
3402 // Truncations on nxvmiN
3403 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i8, .Cost: 2},
3404 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i16, .Cost: 2},
3405 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i32, .Cost: 2},
3406 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i64, .Cost: 2},
3407 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i8, .Cost: 2},
3408 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i16, .Cost: 2},
3409 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i32, .Cost: 2},
3410 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i64, .Cost: 5},
3411 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i8, .Cost: 2},
3412 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i16, .Cost: 2},
3413 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i32, .Cost: 5},
3414 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i64, .Cost: 11},
3415 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i1, .Src: MVT::nxv16i8, .Cost: 2},
3416 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i16, .Cost: 0},
3417 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i32, .Cost: 0},
3418 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i64, .Cost: 0},
3419 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i16, .Src: MVT::nxv2i32, .Cost: 0},
3420 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i16, .Src: MVT::nxv2i64, .Cost: 0},
3421 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i32, .Src: MVT::nxv2i64, .Cost: 0},
3422 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i16, .Cost: 0},
3423 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i32, .Cost: 0},
3424 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i64, .Cost: 1},
3425 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i16, .Src: MVT::nxv4i32, .Cost: 0},
3426 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i16, .Src: MVT::nxv4i64, .Cost: 1},
3427 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i32, .Src: MVT::nxv4i64, .Cost: 1},
3428 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i16, .Cost: 0},
3429 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i32, .Cost: 1},
3430 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i64, .Cost: 3},
3431 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i16, .Src: MVT::nxv8i32, .Cost: 1},
3432 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i16, .Src: MVT::nxv8i64, .Cost: 3},
3433 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i16, .Cost: 1},
3434 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i32, .Cost: 3},
3435 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i64, .Cost: 7},
3436
3437 // The number of shll instructions for the extension.
3438 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3},
3439 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3},
3440 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2},
3441 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2},
3442 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3},
3443 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3},
3444 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2},
3445 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2},
3446 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7},
3447 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7},
3448 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6},
3449 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6},
3450 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2},
3451 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2},
3452 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6},
3453 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6},
3454
3455 // FP Ext and trunc
3456 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::f32, .Cost: 1}, // fcvt
3457 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2f32, .Cost: 1}, // fcvtl
3458 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4f32, .Cost: 2}, // fcvtl+fcvtl2
3459 // FP16
3460 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f32, .Src: MVT::f16, .Cost: 1}, // fcvt
3461 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::f16, .Cost: 1}, // fcvt
3462 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f32, .Src: MVT::v4f16, .Cost: 1}, // fcvtl
3463 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f32, .Src: MVT::v8f16, .Cost: 2}, // fcvtl+fcvtl2
3464 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2f16, .Cost: 2}, // fcvtl+fcvtl
3465 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4f16, .Cost: 3}, // fcvtl+fcvtl2+fcvtl
3466 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8f16, .Cost: 6}, // 2 * fcvtl+fcvtl2+fcvtl
3467 // BF16 (uses shift)
3468 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f32, .Src: MVT::bf16, .Cost: 1}, // shl
3469 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::bf16, .Cost: 2}, // shl+fcvt
3470 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f32, .Src: MVT::v4bf16, .Cost: 1}, // shll
3471 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f32, .Src: MVT::v8bf16, .Cost: 2}, // shll+shll2
3472 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2bf16, .Cost: 2}, // shll+fcvtl
3473 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4bf16, .Cost: 3}, // shll+fcvtl+fcvtl2
3474 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8bf16, .Cost: 6}, // 2 * shll+fcvtl+fcvtl2
3475 // FP Ext and trunc
3476 {.ISD: ISD::FP_ROUND, .Dst: MVT::f32, .Src: MVT::f64, .Cost: 1}, // fcvt
3477 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2f32, .Src: MVT::v2f64, .Cost: 1}, // fcvtn
3478 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f32, .Src: MVT::v4f64, .Cost: 2}, // fcvtn+fcvtn2
3479 // FP16
3480 {.ISD: ISD::FP_ROUND, .Dst: MVT::f16, .Src: MVT::f32, .Cost: 1}, // fcvt
3481 {.ISD: ISD::FP_ROUND, .Dst: MVT::f16, .Src: MVT::f64, .Cost: 1}, // fcvt
3482 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f16, .Src: MVT::v4f32, .Cost: 1}, // fcvtn
3483 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8f16, .Src: MVT::v8f32, .Cost: 2}, // fcvtn+fcvtn2
3484 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2f16, .Src: MVT::v2f64, .Cost: 2}, // fcvtn+fcvtn
3485 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f16, .Src: MVT::v4f64, .Cost: 3}, // fcvtn+fcvtn2+fcvtn
3486 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8f16, .Src: MVT::v8f64, .Cost: 6}, // 2 * fcvtn+fcvtn2+fcvtn
3487 // BF16 (more complex, with +bf16 is handled above)
3488 {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f32, .Cost: 8}, // Expansion is ~8 insns
3489 {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f64, .Cost: 9}, // fcvtn + above
3490 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f32, .Cost: 8},
3491 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f32, .Cost: 8},
3492 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f32, .Cost: 15},
3493 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f64, .Cost: 9},
3494 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f64, .Cost: 10},
3495 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f64, .Cost: 19},
3496
3497 // LowerVectorINT_TO_FP:
3498 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1},
3499 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1},
3500 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: 1},
3501 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1},
3502 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1},
3503 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: 1},
3504
3505 // SVE: to nxv2f16
3506 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i8,
3507 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3508 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3509 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3510 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3511 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i8,
3512 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3513 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3514 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3515 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3516
3517 // SVE: to nxv4f16
3518 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i8,
3519 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3520 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3521 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3522 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i8,
3523 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3524 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3525 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3526
3527 // SVE: to nxv8f16
3528 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i8,
3529 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3530 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i16, .Cost: SVE_FCVT_COST},
3531 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i8,
3532 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3533 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i16, .Cost: SVE_FCVT_COST},
3534
3535 // SVE: to nxv16f16
3536 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv16f16, .Src: MVT::nxv16i8,
3537 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3538 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv16f16, .Src: MVT::nxv16i8,
3539 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3540
3541 // Complex: to v2f32
3542 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3},
3543 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 3},
3544 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3},
3545 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 3},
3546
3547 // SVE: to nxv2f32
3548 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i8,
3549 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3550 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3551 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3552 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3553 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i8,
3554 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3555 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3556 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3557 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3558
3559 // Complex: to v4f32
3560 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 4},
3561 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2},
3562 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 3},
3563 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2},
3564
3565 // SVE: to nxv4f32
3566 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i8,
3567 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3568 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3569 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3570 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i8,
3571 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3572 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3573 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3574
3575 // Complex: to v8f32
3576 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10},
3577 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4},
3578 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10},
3579 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4},
3580
3581 // SVE: to nxv8f32
3582 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i8,
3583 .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3584 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i16,
3585 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3586 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i8,
3587 .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3588 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i16,
3589 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3590
3591 // SVE: to nxv16f32
3592 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv16f32, .Src: MVT::nxv16i8,
3593 .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3594 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv16f32, .Src: MVT::nxv16i8,
3595 .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3596
3597 // Complex: to v16f32
3598 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: 21},
3599 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: 21},
3600
3601 // Complex: to v2f64
3602 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4},
3603 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 4},
3604 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2},
3605 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4},
3606 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 4},
3607 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2},
3608
3609 // SVE: to nxv2f64
3610 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i8,
3611 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3612 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3613 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3614 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3615 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i8,
3616 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3617 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3618 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3619 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3620
3621 // Complex: to v4f64
3622 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: 4},
3623 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: 4},
3624
3625 // SVE: to nxv4f64
3626 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i8,
3627 .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3628 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i16,
3629 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3630 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i32,
3631 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3632 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i8,
3633 .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3634 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i16,
3635 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3636 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i32,
3637 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3638
3639 // SVE: to nxv8f64
3640 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i8,
3641 .Cost: SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3642 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i16,
3643 .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3644 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i8,
3645 .Cost: SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3646 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i16,
3647 .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3648
3649 // LowerVectorFP_TO_INT
3650 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f32, .Cost: 1},
3651 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1},
3652 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: 1},
3653 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f32, .Cost: 1},
3654 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1},
3655 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: 1},
3656
3657 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3658 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f32, .Cost: 2},
3659 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 1},
3660 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 1},
3661 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f32, .Cost: 2},
3662 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 1},
3663 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 1},
3664
3665 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3666 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2},
3667 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 2},
3668 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2},
3669 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 2},
3670
3671 // Complex, from nxv2f32.
3672 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f32, .Cost: 1},
3673 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 1},
3674 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f32, .Cost: 1},
3675 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f32, .Cost: 1},
3676 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f32, .Cost: 1},
3677 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 1},
3678 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f32, .Cost: 1},
3679 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f32, .Cost: 1},
3680
3681 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3682 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2},
3683 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f64, .Cost: 2},
3684 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f64, .Cost: 2},
3685 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2},
3686 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f64, .Cost: 2},
3687 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f64, .Cost: 2},
3688
3689 // Complex, from nxv2f64.
3690 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f64, .Cost: 1},
3691 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f64, .Cost: 1},
3692 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f64, .Cost: 1},
3693 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f64, .Cost: 1},
3694 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i1, .Src: MVT::nxv2f64, .Cost: 1},
3695 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f64, .Cost: 1},
3696 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f64, .Cost: 1},
3697 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f64, .Cost: 1},
3698 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f64, .Cost: 1},
3699 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i1, .Src: MVT::nxv2f64, .Cost: 1},
3700
3701 // Complex, from nxv4f32.
3702 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f32, .Cost: 4},
3703 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f32, .Cost: 1},
3704 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f32, .Cost: 1},
3705 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f32, .Cost: 1},
3706 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i1, .Src: MVT::nxv4f32, .Cost: 1},
3707 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f32, .Cost: 4},
3708 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f32, .Cost: 1},
3709 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f32, .Cost: 1},
3710 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f32, .Cost: 1},
3711 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i1, .Src: MVT::nxv4f32, .Cost: 1},
3712
3713 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3714 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f64, .Cost: 7},
3715 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f64, .Cost: 7},
3716 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f64, .Cost: 7},
3717 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f64, .Cost: 7},
3718
3719 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3720 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f64, .Cost: 3},
3721 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f64, .Cost: 3},
3722 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f64, .Cost: 3},
3723 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f64, .Cost: 3},
3724 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f64, .Cost: 3},
3725 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f64, .Cost: 3},
3726
3727 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3728 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f32, .Cost: 3},
3729 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f32, .Cost: 3},
3730 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f32, .Cost: 3},
3731 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f32, .Cost: 3},
3732
3733 // Complex, from nxv8f16.
3734 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i64, .Src: MVT::nxv8f16, .Cost: 10},
3735 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i32, .Src: MVT::nxv8f16, .Cost: 4},
3736 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f16, .Cost: 1},
3737 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f16, .Cost: 1},
3738 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i1, .Src: MVT::nxv8f16, .Cost: 1},
3739 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i64, .Src: MVT::nxv8f16, .Cost: 10},
3740 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i32, .Src: MVT::nxv8f16, .Cost: 4},
3741 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f16, .Cost: 1},
3742 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f16, .Cost: 1},
3743 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i1, .Src: MVT::nxv8f16, .Cost: 1},
3744
3745 // Complex, from nxv4f16.
3746 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f16, .Cost: 4},
3747 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f16, .Cost: 1},
3748 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 1},
3749 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f16, .Cost: 1},
3750 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f16, .Cost: 4},
3751 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f16, .Cost: 1},
3752 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 1},
3753 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f16, .Cost: 1},
3754
3755 // Complex, from nxv2f16.
3756 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f16, .Cost: 1},
3757 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f16, .Cost: 1},
3758 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 1},
3759 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f16, .Cost: 1},
3760 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f16, .Cost: 1},
3761 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f16, .Cost: 1},
3762 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 1},
3763 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f16, .Cost: 1},
3764
3765 // Truncate from nxvmf32 to nxvmf16.
3766 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f16, .Src: MVT::nxv2f32, .Cost: 1},
3767 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f16, .Src: MVT::nxv4f32, .Cost: 1},
3768 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f16, .Src: MVT::nxv8f32, .Cost: 3},
3769
3770 // Truncate from nxvmf32 to nxvmbf16.
3771 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2bf16, .Src: MVT::nxv2f32, .Cost: 8},
3772 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4bf16, .Src: MVT::nxv4f32, .Cost: 8},
3773 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8bf16, .Src: MVT::nxv8f32, .Cost: 17},
3774
3775 // Truncate from nxvmf64 to nxvmf16.
3776 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f16, .Src: MVT::nxv2f64, .Cost: 1},
3777 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f16, .Src: MVT::nxv4f64, .Cost: 3},
3778 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f16, .Src: MVT::nxv8f64, .Cost: 7},
3779
3780 // Truncate from nxvmf64 to nxvmbf16.
3781 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2bf16, .Src: MVT::nxv2f64, .Cost: 9},
3782 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4bf16, .Src: MVT::nxv4f64, .Cost: 19},
3783 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8bf16, .Src: MVT::nxv8f64, .Cost: 39},
3784
3785 // Truncate from nxvmf64 to nxvmf32.
3786 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2f64, .Cost: 1},
3787 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4f64, .Cost: 3},
3788 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8f64, .Cost: 6},
3789
3790 // Extend from nxvmf16 to nxvmf32.
3791 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2f16, .Cost: 1},
3792 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4f16, .Cost: 1},
3793 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8f16, .Cost: 2},
3794
3795 // Extend from nxvmbf16 to nxvmf32.
3796 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2bf16, .Cost: 1}, // lsl
3797 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4bf16, .Cost: 1}, // lsl
3798 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8bf16, .Cost: 4}, // unpck+unpck+lsl+lsl
3799
3800 // Extend from nxvmf16 to nxvmf64.
3801 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2f16, .Cost: 1},
3802 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4f16, .Cost: 2},
3803 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8f16, .Cost: 4},
3804
3805 // Extend from nxvmbf16 to nxvmf64.
3806 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2bf16, .Cost: 2}, // lsl+fcvt
3807 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4bf16, .Cost: 6}, // 2*unpck+2*lsl+2*fcvt
3808 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8bf16, .Cost: 14}, // 6*unpck+4*lsl+4*fcvt
3809
3810 // Extend from nxvmf32 to nxvmf64.
3811 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2f32, .Cost: 1},
3812 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4f32, .Cost: 2},
3813 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8f32, .Cost: 6},
3814
    // Bitcasts from integer to float (Src is the integer type, Dst the float)
3816 {.ISD: ISD::BITCAST, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: 0},
3817 {.ISD: ISD::BITCAST, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: 0},
3818 {.ISD: ISD::BITCAST, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: 0},
3819
    // Bitcasts from float to integer (Src is the float type, Dst the integer)
3821 {.ISD: ISD::BITCAST, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 0},
3822 {.ISD: ISD::BITCAST, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 0},
3823 {.ISD: ISD::BITCAST, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 0},
3824
3825 // Add cost for extending to illegal -too wide- scalable vectors.
3826 // zero/sign extend are implemented by multiple unpack operations,
3827 // where each operation has a cost of 1.
3828 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i16, .Src: MVT::nxv16i8, .Cost: 2},
3829 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i32, .Src: MVT::nxv16i8, .Cost: 6},
3830 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i64, .Src: MVT::nxv16i8, .Cost: 14},
3831 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i16, .Cost: 2},
3832 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv8i64, .Src: MVT::nxv8i16, .Cost: 6},
3833 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv4i64, .Src: MVT::nxv4i32, .Cost: 2},
3834
3835 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i16, .Src: MVT::nxv16i8, .Cost: 2},
3836 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i32, .Src: MVT::nxv16i8, .Cost: 6},
3837 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i64, .Src: MVT::nxv16i8, .Cost: 14},
3838 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i16, .Cost: 2},
3839 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv8i64, .Src: MVT::nxv8i16, .Cost: 6},
3840 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv4i64, .Src: MVT::nxv4i32, .Cost: 2},
3841 };
3842
  // We have to estimate the cost of a fixed-length operation carried out on
  // SVE registers by scaling with the number of SVE registers required to
  // represent the fixed-width type.
3846 EVT WiderTy = SrcTy.bitsGT(VT: DstTy) ? SrcTy : DstTy;
3847 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3848 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3849 ST->useSVEForFixedLengthVectors(VT: WiderTy)) {
3850 std::pair<InstructionCost, MVT> LT =
3851 getTypeLegalizationCost(Ty: WiderTy.getTypeForEVT(Context&: Dst->getContext()));
3852 unsigned NumElements =
3853 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3854 return LT.first *
3855 getCastInstrCost(
3856 Opcode,
3857 Dst: ScalableVectorType::get(ElementType: Dst->getScalarType(), MinNumElts: NumElements),
3858 Src: ScalableVectorType::get(ElementType: Src->getScalarType(), MinNumElts: NumElements), CCH,
3859 CostKind, I);
3860 }
3861
3862 if (const auto *Entry = ConvertCostTableLookup(
3863 Table: ConversionTbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
3864 return Entry->Cost;
3865
3866 static const TypeConversionCostTblEntry FP16Tbl[] = {
3867 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f16, .Cost: 1}, // fcvtzs
3868 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f16, .Cost: 1},
3869 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f16, .Cost: 1}, // fcvtzs
3870 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f16, .Cost: 1},
3871 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f16, .Cost: 2}, // fcvtl+fcvtzs
3872 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f16, .Cost: 2},
3873 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i8, .Src: MVT::v8f16, .Cost: 2}, // fcvtzs+xtn
3874 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i8, .Src: MVT::v8f16, .Cost: 2},
3875 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f16, .Cost: 1}, // fcvtzs
3876 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f16, .Cost: 1},
3877 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f16, .Cost: 4}, // 2*fcvtl+2*fcvtzs
3878 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f16, .Cost: 4},
3879 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v16f16, .Cost: 3}, // 2*fcvtzs+xtn
3880 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v16f16, .Cost: 3},
3881 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v16f16, .Cost: 2}, // 2*fcvtzs
3882 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v16f16, .Cost: 2},
3883 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i32, .Src: MVT::v16f16, .Cost: 8}, // 4*fcvtl+4*fcvtzs
3884 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i32, .Src: MVT::v16f16, .Cost: 8},
3885 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f16, .Src: MVT::v8i8, .Cost: 2}, // ushll + ucvtf
3886 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f16, .Src: MVT::v8i8, .Cost: 2}, // sshll + scvtf
3887 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f16, .Src: MVT::v16i8, .Cost: 4}, // 2 * ushl(2) + 2 * ucvtf
3888 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f16, .Src: MVT::v16i8, .Cost: 4}, // 2 * sshl(2) + 2 * scvtf
3889 };
3890
3891 if (ST->hasFullFP16())
3892 if (const auto *Entry = ConvertCostTableLookup(
3893 Table: FP16Tbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
3894 return Entry->Cost;
3895
3896 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3897 // double-rounding issues.
3898 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3899 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3900 isa<FixedVectorType>(Val: Dst) && isa<FixedVectorType>(Val: Src))
3901 return cast<FixedVectorType>(Val: Dst)->getNumElements() *
3902 getCastInstrCost(Opcode, Dst: Dst->getScalarType(),
3903 Src: Src->getScalarType(), CCH, CostKind) +
3904 BaseT::getScalarizationOverhead(InTy: cast<FixedVectorType>(Val: Src), Insert: false,
3905 Extract: true, CostKind) +
3906 BaseT::getScalarizationOverhead(InTy: cast<FixedVectorType>(Val: Dst), Insert: true,
3907 Extract: false, CostKind);
3908
3909 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3910 CCH == TTI::CastContextHint::Masked &&
3911 ST->isSVEorStreamingSVEAvailable() &&
3912 TLI->getTypeAction(Context&: Src->getContext(), VT: SrcTy) ==
3913 TargetLowering::TypePromoteInteger &&
3914 TLI->getTypeAction(Context&: Dst->getContext(), VT: DstTy) ==
3915 TargetLowering::TypeSplitVector) {
3916 // The standard behaviour in the backend for these cases is to split the
3917 // extend up into two parts:
3918 // 1. Perform an extending load or masked load up to the legal type.
3919 // 2. Extend the loaded data to the final type.
3920 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: Src);
3921 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Context&: Src->getContext());
3922 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
3923 Opcode, Dst: LegalTy, Src, CCH, CostKind, I);
3924 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
3925 Opcode, Dst, Src: LegalTy, CCH: TTI::CastContextHint::None, CostKind, I);
3926 return Part1 + Part2;
3927 }
3928
3929 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3930 // but we also want to include the TTI::CastContextHint::Masked case too.
3931 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3932 CCH == TTI::CastContextHint::Masked &&
3933 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(VT: DstTy))
3934 CCH = TTI::CastContextHint::Normal;
3935
3936 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3937}
3938
3939InstructionCost
3940AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
3941 VectorType *VecTy, unsigned Index,
3942 TTI::TargetCostKind CostKind) const {
3943
3944 // Make sure we were given a valid extend opcode.
3945 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3946 "Invalid opcode");
3947
3948 // We are extending an element we extract from a vector, so the source type
3949 // of the extend is the element type of the vector.
3950 auto *Src = VecTy->getElementType();
3951
3952 // Sign- and zero-extends are for integer types only.
3953 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3954
3955 // Get the cost for the extract. We compute the cost (if any) for the extend
3956 // below.
3957 InstructionCost Cost = getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy,
3958 CostKind, Index, Op0: nullptr, Op1: nullptr);
3959
3960 // Legalize the types.
3961 auto VecLT = getTypeLegalizationCost(Ty: VecTy);
3962 auto DstVT = TLI->getValueType(DL, Ty: Dst);
3963 auto SrcVT = TLI->getValueType(DL, Ty: Src);
3964
3965 // If the resulting type is still a vector and the destination type is legal,
3966 // we may get the extension for free. If not, get the default cost for the
3967 // extend.
3968 if (!VecLT.second.isVector() || !TLI->isTypeLegal(VT: DstVT))
3969 return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
3970 CostKind);
3971
3972 // The destination type should be larger than the element type. If not, get
3973 // the default cost for the extend.
3974 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3975 return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
3976 CostKind);
3977
3978 switch (Opcode) {
3979 default:
3980 llvm_unreachable("Opcode should be either SExt or ZExt");
3981
3982 // For sign-extends, we only need a smov, which performs the extension
3983 // automatically.
3984 case Instruction::SExt:
3985 return Cost;
3986
3987 // For zero-extends, the extend is performed automatically by a umov unless
3988 // the destination type is i64 and the element type is i8 or i16.
3989 case Instruction::ZExt:
3990 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3991 return Cost;
3992 }
3993
3994 // If we are unable to perform the extend for free, get the default cost.
3995 return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
3996 CostKind);
3997}
3998
3999InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
4000 TTI::TargetCostKind CostKind,
4001 const Instruction *I) const {
4002 if (CostKind != TTI::TCK_RecipThroughput)
4003 return Opcode == Instruction::PHI ? 0 : 1;
4004 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
4005 // Branches are assumed to be predicted.
4006 return 0;
4007}
4008
/// Shared implementation for the getVectorInstrCost overloads: computes the
/// cost of an insert-element or extract-element at lane \p Index of vector
/// type \p Val. \p I, \p Scalar and \p ScalarUserAndIdx optionally supply
/// use-def context used to prove that an extract is free (e.g. it folds into
/// a scalar fmul as a lane operand).
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
    const Instruction *I, Value *Scalar,
    ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
    TTI::VectorInstrContext VIC) const {
  assert(Val->isVectorTy() && "This must be a vector type");

  // Index == -1U means the lane is unknown; skip the lane-based reasoning.
  if (Index != -1U) {
    // Legalize the type.
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. For fixed-width vectors we can normalize the
    // index to the new type.
    if (LT.second.isFixedLengthVector()) {
      unsigned Width = LT.second.getVectorNumElements();
      Index = Index % Width;
    }

    // The element at index zero is already inside the vector.
    // - For an insert-element or extract-element
    // instruction that extracts integers, an explicit FPR -> GPR move is
    // needed. So it has non-zero cost.
    if (Index == 0 && !Val->getScalarType()->isIntegerTy())
      return 0;

    // This is recognising a LD1 single-element structure to one lane of one
    // register instruction. I.e., if this is an `insertelement` instruction,
    // and its second operand is a load, then we will generate a LD1, which
    // are expensive instructions on some uArchs.
    if (VIC == TTI::VectorInstrContext::Load) {
      if (ST->hasFastLD1Single())
        return 0;
      return CostKind == TTI::TCK_CodeSize
                 ? 0
                 : ST->getVectorInsertExtractBaseCost() + 1;
    }

    // i1 inserts and extract will include an extra cset or cmp of the vector
    // value. Increase the cost by 1 to account.
    if (Val->getScalarSizeInBits() == 1)
      return CostKind == TTI::TCK_CodeSize
                 ? 2
                 : ST->getVectorInsertExtractBaseCost() + 1;

    // FIXME:
    // If the extract-element and insert-element instructions could be
    // simplified away (e.g., could be combined into users by looking at use-def
    // context), they have no cost. This is not done in the first place for
    // compile-time considerations.
  }

  // In case of Neon, if there exists extractelement from lane != 0 such that
  // 1. extractelement does not necessitate a move from vector_reg -> GPR.
  // 2. extractelement result feeds into fmul.
  // 3. Other operand of fmul is an extractelement from lane 0 or lane
  // equivalent to 0.
  // then the extractelement can be merged with fmul in the backend and it
  // incurs no cost.
  // e.g.
  // define double @foo(<2 x double> %a) {
  //   %1 = extractelement <2 x double> %a, i32 0
  //   %2 = extractelement <2 x double> %a, i32 1
  //   %res = fmul double %1, %2
  //   ret double %res
  // }
  // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
  auto ExtractCanFuseWithFmul = [&]() {
    // We bail out if the extract is from lane 0.
    if (Index == 0)
      return false;

    // Check if the scalar element type of the vector operand of ExtractElement
    // instruction is one of the allowed types.
    auto IsAllowedScalarTy = [&](const Type *T) {
      return T->isFloatTy() || T->isDoubleTy() ||
             (T->isHalfTy() && ST->hasFullFP16());
    };

    // Check if the extractelement user is scalar fmul.
    auto IsUserFMulScalarTy = [](const Value *EEUser) {
      // Check if the user is scalar fmul.
      const auto *BO = dyn_cast<BinaryOperator>(Val: EEUser);
      return BO && BO->getOpcode() == BinaryOperator::FMul &&
             !BO->getType()->isVectorTy();
    };

    // Check if the extract index is from lane 0 or lane equivalent to 0 for a
    // certain scalar type and a certain vector register width.
    auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
      auto RegWidth =
          getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedValue();
      return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
    };

    // Check if the type constraints on input vector type and result scalar type
    // of extractelement instruction are satisfied.
    if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
      return false;

    if (Scalar) {
      // SLP-style path: the extract does not exist as an instruction yet, so
      // reason from the scalar value's users instead.
      DenseMap<User *, unsigned> UserToExtractIdx;
      for (auto *U : Scalar->users()) {
        if (!IsUserFMulScalarTy(U))
          return false;
        // Recording entry for the user is important. Index value is not
        // important.
        UserToExtractIdx[U];
      }
      if (UserToExtractIdx.empty())
        return false;
      for (auto &[S, U, L] : ScalarUserAndIdx) {
        for (auto *U : S->users()) {
          if (UserToExtractIdx.contains(Val: U)) {
            auto *FMul = cast<BinaryOperator>(Val: U);
            auto *Op0 = FMul->getOperand(i_nocapture: 0);
            auto *Op1 = FMul->getOperand(i_nocapture: 1);
            // NOTE(review): this condition is a tautology — it is true for
            // every combination of Op0/Op1 relative to S, so the extract index
            // is recorded unconditionally. Confirm whether a stricter check
            // (e.g. exactly one operand equals S) was intended.
            if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
              UserToExtractIdx[U] = L;
              break;
            }
          }
        }
      }
      // The fusion only applies when, for every fmul user, either this extract
      // or the sibling extract feeding it comes from a lane equivalent to 0.
      for (auto &[U, L] : UserToExtractIdx) {
        if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
            !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
          return false;
      }
    } else {
      // Instruction path: inspect the actual extractelement's users directly.
      const auto *EE = cast<ExtractElementInst>(Val: I);

      const auto *IdxOp = dyn_cast<ConstantInt>(Val: EE->getIndexOperand());
      if (!IdxOp)
        return false;

      return !EE->users().empty() && all_of(Range: EE->users(), P: [&](const User *U) {
        if (!IsUserFMulScalarTy(U))
          return false;

        // Check if the other operand of extractelement is also extractelement
        // from lane equivalent to 0.
        const auto *BO = cast<BinaryOperator>(Val: U);
        const auto *OtherEE = dyn_cast<ExtractElementInst>(
            Val: BO->getOperand(i_nocapture: 0) == EE ? BO->getOperand(i_nocapture: 1) : BO->getOperand(i_nocapture: 0));
        if (OtherEE) {
          const auto *IdxOp = dyn_cast<ConstantInt>(Val: OtherEE->getIndexOperand());
          if (!IdxOp)
            return false;
          return IsExtractLaneEquivalentToZero(
              cast<ConstantInt>(Val: OtherEE->getIndexOperand())
                  ->getValue()
                  .getZExtValue(),
              OtherEE->getType()->getScalarSizeInBits());
        }
        return true;
      });
    }
    return true;
  };

  // Extracts that provably fuse into an fmul lane operand are free.
  if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
      ExtractCanFuseWithFmul())
    return 0;

  // All other insert/extracts cost this much.
  return CostKind == TTI::TCK_CodeSize ? 1
                                       : ST->getVectorInsertExtractBaseCost();
}
4182
4183InstructionCost AArch64TTIImpl::getVectorInstrCost(
4184 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4185 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
4186 // Treat insert at lane 0 into a poison vector as having zero cost. This
4187 // ensures vector broadcasts via an insert + shuffle (and will be lowered to a
4188 // single dup) are treated as cheap.
4189 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4190 isa<PoisonValue>(Val: Op0))
4191 return 0;
4192 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, I: nullptr,
4193 Scalar: nullptr, ScalarUserAndIdx: {}, VIC);
4194}
4195
/// Overload taking the scalar value being vectorized (\p Scalar) and the list
/// of its users with extract indices (\p ScalarUserAndIdx); this context is
/// forwarded so the helper can prove some extracts are free.
InstructionCost AArch64TTIImpl::getVectorInstrCost(
    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
    Value *Scalar, ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
    TTI::VectorInstrContext VIC) const {
  return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, I: nullptr, Scalar,
                                  ScalarUserAndIdx, VIC);
}
4203
/// Overload taking the concrete instruction \p I, which lets the helper
/// inspect its use-def context (e.g. to detect extracts that fuse with fmul).
InstructionCost
AArch64TTIImpl::getVectorInstrCost(const Instruction &I, Type *Val,
                                   TTI::TargetCostKind CostKind, unsigned Index,
                                   TTI::VectorInstrContext VIC) const {
  return getVectorInstrCostHelper(Opcode: I.getOpcode(), Val, CostKind, Index, I: &I,
                                  Scalar: nullptr, ScalarUserAndIdx: {}, VIC);
}
4211
4212InstructionCost
4213AArch64TTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
4214 TTI::TargetCostKind CostKind,
4215 unsigned Index) const {
4216 if (isa<FixedVectorType>(Val))
4217 return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind,
4218 Index);
4219
4220 // This typically requires both while and lastb instructions in order
4221 // to extract the last element. If this is in a loop the while
4222 // instruction can at least be hoisted out, although it will consume a
4223 // predicate register. The cost should be more expensive than the base
4224 // extract cost, which is 2 for most CPUs.
4225 return CostKind == TTI::TCK_CodeSize
4226 ? 2
4227 : ST->getVectorInsertExtractBaseCost() + 1;
4228}
4229
4230InstructionCost AArch64TTIImpl::getScalarizationOverhead(
4231 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4232 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
4233 TTI::VectorInstrContext VIC) const {
4234 if (isa<ScalableVectorType>(Val: Ty))
4235 return InstructionCost::getInvalid();
4236 if (Ty->getElementType()->isFloatingPointTy())
4237 return BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, Insert, Extract,
4238 CostKind);
4239 unsigned VecInstCost =
4240 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
4241 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4242}
4243
4244std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4245 Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4246 TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
4247 std::function<InstructionCost(Type *)> InstCost) const {
4248 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4249 return std::nullopt;
4250 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4251 return std::nullopt;
4252 if (CanUseSVE && Ty->isScalableTy() && ST->hasSVEB16B16() &&
4253 ST->isNonStreamingSVEorSME2Available())
4254 return std::nullopt;
4255
4256 Type *PromotedTy = Ty->getWithNewType(EltTy: Type::getFloatTy(C&: Ty->getContext()));
4257 InstructionCost Cost = getCastInstrCost(Opcode: Instruction::FPExt, Dst: PromotedTy, Src: Ty,
4258 CCH: TTI::CastContextHint::None, CostKind);
4259 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4260 Cost *= 2;
4261 Cost += InstCost(PromotedTy);
4262 if (IncludeTrunc)
4263 Cost += getCastInstrCost(Opcode: Instruction::FPTrunc, Dst: Ty, Src: PromotedTy,
4264 CCH: TTI::CastContextHint::None, CostKind);
4265 return Cost;
4266}
4267
/// Compute the reciprocal-throughput cost of a scalar or vector arithmetic
/// instruction on AArch64.
///
/// \p Op1Info / \p Op2Info describe operand properties (constant, uniform,
/// power-of-2, ...) which enable cheaper costings, e.g. for division by a
/// constant. \p Args and \p CxtI, when provided, carry the actual IR
/// operands/instruction so widening (smull/umull) and fneg-of-fmul (fnmul)
/// patterns can be recognized.
InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {

  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
    if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
      return InstructionCost::getInvalid();

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
                                         Opd2Info: Op2Info, Args, CxtI);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Increase the cost for half and bfloat types if not architecturally
  // supported. The promoted cost includes the fpext (and optional fptrunc)
  // around the operation performed at f32.
  if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
      ISD == ISD::FDIV || ISD == ISD::FREM)
    if (auto PromotedCost = getFP16BF16PromoteCost(
            Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
            // There is not native support for fdiv/frem even with +sve-b16b16.
            /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
            InstCost: [&](Type *PromotedTy) {
              return getArithmeticInstrCost(Opcode, Ty: PromotedTy, CostKind,
                                            Op1Info, Op2Info);
            }))
      return *PromotedCost;

  // If the operation is a widening instruction (smull or umull) and both
  // operands are extends the cost can be cheaper by considering that the
  // operation will operate on the narrowest type size possible (double the
  // largest input size) and a further extend.
  if (Type *ExtTy = isBinExtWideningInstruction(Opcode, DstTy: Ty, Args)) {
    if (ExtTy != Ty)
      return getArithmeticInstrCost(Opcode, Ty: ExtTy, CostKind) +
             getCastInstrCost(Opcode: Instruction::ZExt, Dst: Ty, Src: ExtTy,
                              CCH: TTI::CastContextHint::None, CostKind);
    return LT.first;
  }

  // Per-opcode costing; anything not handled below falls back to the base
  // implementation.
  switch (ISD) {
  default:
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
                                         Opd2Info: Op2Info);
  case ISD::SREM:
  case ISD::SDIV:
    /*
    Notes for sdiv/srem specific costs:
    1. This only considers the cases where the divisor is constant, uniform and
    (pow-of-2/non-pow-of-2). Other cases are not important since they either
    result in some form of (ldr + adrp), corresponding to constant vectors, or
    scalarization of the division operation.
    2. Constant divisors, either negative in whole or partially, don't result in
    significantly different codegen as compared to positive constant divisors.
    So, we don't consider negative divisors separately.
    3. If the codegen is significantly different with SVE, it has been indicated
    using comments at appropriate places.

    sdiv specific cases:
    -----------------------------------------------------------------------
    codegen                       | pow-of-2               | Type
    -----------------------------------------------------------------------
    add + cmp + csel + asr        | Y                      | i64
    add + cmp + csel + asr        | Y                      | i32
    -----------------------------------------------------------------------

    srem specific cases:
    -----------------------------------------------------------------------
    codegen                       | pow-of-2               | Type
    -----------------------------------------------------------------------
    negs + and + and + csneg      | Y                      | i64
    negs + and + and + csneg      | Y                      | i32
    -----------------------------------------------------------------------

    other sdiv/srem cases:
    -------------------------------------------------------------------------
    commom codegen            | + srem     | + sdiv       | pow-of-2  | Type
    -------------------------------------------------------------------------
    smulh + asr + add + add   | -          | -            | N         | i64
    smull + lsr + add + add   | -          | -            | N         | i32
    usra                      | and + sub  | sshr         | Y         | <2 x i64>
    2 * (scalar code)         | -          | -            | N         | <2 x i64>
    usra                      | bic + sub  | sshr + neg   | Y         | <4 x i32>
    smull2 + smull + uzp2     | mls        | -            | N         | <4 x i32>
     + sshr + usra            |            |              |           |
    -------------------------------------------------------------------------
    */
    if (Op2Info.isConstant() && Op2Info.isUniform()) {
      InstructionCost AddCost =
          getArithmeticInstrCost(Opcode: Instruction::Add, Ty, CostKind,
                                 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
      InstructionCost AsrCost =
          getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind,
                                 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
      InstructionCost MulCost =
          getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind,
                                 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
      // add/cmp/csel/csneg should have similar cost while asr/negs/and should
      // have similar cost.
      auto VT = TLI->getValueType(DL, Ty);
      if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
        if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
          // Neg can be folded into the asr instruction.
          return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
                                  : (3 * AsrCost + AddCost);
        } else {
          return MulCost + AsrCost + 2 * AddCost;
        }
      } else if (VT.isVector()) {
        InstructionCost UsraCost = 2 * AsrCost;
        if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
          // Division with scalable types corresponds to native 'asrd'
          // instruction when SVE is available.
          // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)

          // One more for the negation in SDIV
          InstructionCost Cost =
              (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
          if (Ty->isScalableTy() && ST->hasSVE())
            Cost += 2 * AsrCost;
          else {
            Cost +=
                UsraCost +
                (ISD == ISD::SDIV
                     ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
                     : 2 * AddCost);
          }
          return Cost;
        } else if (LT.second == MVT::v2i64) {
          // v2i64 non-pow-2 divisors are scalarized (see table above).
          return VT.getVectorNumElements() *
                 getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind,
                                        Op1Info: Op1Info.getNoProps(),
                                        Op2Info: Op2Info.getNoProps());
        } else {
          // When SVE is available, we get:
          // smulh + lsr + add/sub + asr + add/sub.
          if (Ty->isScalableTy() && ST->hasSVE())
            return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
          return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
        }
      }
    }
    if (Op2Info.isConstant() && !Op2Info.isUniform() &&
        LT.second.isFixedLengthVector()) {
      // FIXME: When the constant vector is non-uniform, this may result in
      // loading the vector from constant pool or in some cases, may also result
      // in scalarization. For now, we are approximating this with the
      // scalarization cost.
      auto ExtractCost = 2 * getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty,
                                                CostKind, Index: -1, Op0: nullptr, Op1: nullptr);
      auto InsertCost = getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty,
                                           CostKind, Index: -1, Op0: nullptr, Op1: nullptr);
      unsigned NElts = cast<FixedVectorType>(Val: Ty)->getNumElements();
      return ExtractCost + InsertCost +
             NElts * getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(),
                                            CostKind, Op1Info: Op1Info.getNoProps(),
                                            Op2Info: Op2Info.getNoProps());
    }
    // Fall through so signed div/rem by non-special constants share the
    // generic division costing below.
    [[fallthrough]];
  case ISD::UDIV:
  case ISD::UREM: {
    auto VT = TLI->getValueType(DL, Ty);
    if (Op2Info.isConstant()) {
      // If the operand is a power of 2 we can use the shift or and cost.
      if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
        return getArithmeticInstrCost(Opcode: Instruction::LShr, Ty, CostKind,
                                      Op1Info: Op1Info.getNoProps(),
                                      Op2Info: Op2Info.getNoProps());
      if (ISD == ISD::UREM && Op2Info.isPowerOf2())
        return getArithmeticInstrCost(Opcode: Instruction::And, Ty, CostKind,
                                      Op1Info: Op1Info.getNoProps(),
                                      Op2Info: Op2Info.getNoProps());

      if (ISD == ISD::UDIV || ISD == ISD::UREM) {
        // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
        // The MULHU will be expanded to UMULL for the types not listed below,
        // and will become a pair of UMULL+MULL2 for 128bit vectors.
        bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
                       LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
                       LT.second == MVT::nxv16i8;
        bool Is128bit = LT.second.is128BitVector();

        InstructionCost MulCost =
            getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind,
                                   Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
        InstructionCost AddCost =
            getArithmeticInstrCost(Opcode: Instruction::Add, Ty, CostKind,
                                   Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
        InstructionCost ShrCost =
            getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind,
                                   Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
        InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
                                  (HasMULH ? 0 : ShrCost) +      // UMULL shift
                                  AddCost * 2 + ShrCost;
        // UREM additionally multiplies the quotient back and subtracts.
        return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
      }
    }

    // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
    // emitted by the backend even when those functions are not declared in the
    // module.
    if (!VT.isVector() && VT.getSizeInBits() > 64)
      return getCallInstrCost(/*Function*/ F: nullptr, RetTy: Ty, Tys: {Ty, Ty}, CostKind);

    InstructionCost Cost = BaseT::getArithmeticInstrCost(
        Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
    if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
      if (TLI->isOperationLegalOrCustom(Op: ISD, VT: LT.second) && ST->hasSVE()) {
        // SDIV/UDIV operations are lowered using SVE, then we can have less
        // costs.
        if (VT.isSimple() && isa<FixedVectorType>(Val: Ty) &&
            Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
          static const CostTblEntry DivTbl[]{
              {.ISD: ISD::SDIV, .Type: MVT::v2i8, .Cost: 5},  {.ISD: ISD::SDIV, .Type: MVT::v4i8, .Cost: 8},
              {.ISD: ISD::SDIV, .Type: MVT::v8i8, .Cost: 8},  {.ISD: ISD::SDIV, .Type: MVT::v2i16, .Cost: 5},
              {.ISD: ISD::SDIV, .Type: MVT::v4i16, .Cost: 5}, {.ISD: ISD::SDIV, .Type: MVT::v2i32, .Cost: 1},
              {.ISD: ISD::UDIV, .Type: MVT::v2i8, .Cost: 5},  {.ISD: ISD::UDIV, .Type: MVT::v4i8, .Cost: 8},
              {.ISD: ISD::UDIV, .Type: MVT::v8i8, .Cost: 8},  {.ISD: ISD::UDIV, .Type: MVT::v2i16, .Cost: 5},
              {.ISD: ISD::UDIV, .Type: MVT::v4i16, .Cost: 5}, {.ISD: ISD::UDIV, .Type: MVT::v2i32, .Cost: 1}};

          const auto *Entry = CostTableLookup(Table: DivTbl, ISD, Ty: VT.getSimpleVT());
          if (nullptr != Entry)
            return Entry->Cost;
        }
        // For 8/16-bit elements, the cost is higher because the type
        // requires promotion and possibly splitting:
        if (LT.second.getScalarType() == MVT::i8)
          Cost *= 8;
        else if (LT.second.getScalarType() == MVT::i16)
          Cost *= 4;
        return Cost;
      } else {
        // If one of the operands is a uniform constant then the cost for each
        // element is Cost for insertion, extraction and division.
        // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
        // operation with scalar type
        if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
            (Op2Info.isConstant() && Op2Info.isUniform())) {
          if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) {
            InstructionCost DivCost = BaseT::getArithmeticInstrCost(
                Opcode, Ty: Ty->getScalarType(), CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
            return (4 + DivCost) * VTy->getNumElements();
          }
        }
        // On AArch64, without SVE, vector divisions are expanded
        // into scalar divisions of each pair of elements.
        Cost += getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, CostKind,
                                   Index: -1, Op0: nullptr, Op1: nullptr);
        Cost += getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty, CostKind, Index: -1,
                                   Op0: nullptr, Op1: nullptr);
      }

      // TODO: if one of the arguments is scalar, then it's not necessary to
      // double the cost of handling the vector elements.
      Cost += Cost;
    }
    return Cost;
  }
  case ISD::MUL:
    // When SVE is available, then we can lower the v2i64 operation using
    // the SVE mul instruction, which has a lower cost.
    if (LT.second == MVT::v2i64 && ST->hasSVE())
      return LT.first;

    // When SVE is not available, there is no MUL.2d instruction,
    // which means mul <2 x i64> is expensive as elements are extracted
    // from the vectors and the muls scalarized.
    // As getScalarizationOverhead is a bit too pessimistic, we
    // estimate the cost for a i64 vector directly here, which is:
    // - four 2-cost i64 extracts,
    // - two 2-cost i64 inserts, and
    // - two 1-cost muls.
    // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
    // LT.first = 2 the cost is 28. If both operands are extensions it will not
    // need to scalarize so the cost can be cheaper (smull or umull).
    if (LT.second != MVT::v2i64)
      return LT.first;
    return cast<VectorType>(Val: Ty)->getElementCount().getKnownMinValue() *
           (getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind) +
            getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, CostKind, Index: -1,
                               Op0: nullptr, Op1: nullptr) *
                2 +
            getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty, CostKind, Index: -1,
                               Op0: nullptr, Op1: nullptr));
  case ISD::ADD:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SHL:
    // These nodes are marked as 'custom' for combining purposes only.
    // We know that they are legal. See LowerAdd in ISelLowering.
    return LT.first;

  case ISD::FNEG:
    // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
    if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
         (Ty->isHalfTy() && ST->hasFullFP16())) &&
        CxtI &&
        ((CxtI->hasOneUse() &&
          match(V: *CxtI->user_begin(), P: m_FMul(L: m_Value(), R: m_Value()))) ||
         match(V: CxtI->getOperand(i: 0), P: m_FMul(L: m_Value(), R: m_Value()))))
      return 0;
    [[fallthrough]];
  case ISD::FADD:
  case ISD::FSUB:
    if (!Ty->getScalarType()->isFP128Ty())
      return LT.first;
    [[fallthrough]];
  case ISD::FMUL:
  case ISD::FDIV:
    // These nodes are marked as 'custom' just to lower them to SVE.
    // We know said lowering will incur no additional cost.
    if (!Ty->getScalarType()->isFP128Ty())
      return 2 * LT.first;

    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
                                         Opd2Info: Op2Info);
  case ISD::FREM:
    // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
    // those functions are not declared in the module.
    if (!Ty->isVectorTy())
      return getCallInstrCost(/*Function*/ F: nullptr, RetTy: Ty, Tys: {Ty, Ty}, CostKind);
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
                                         Opd2Info: Op2Info);
  }
}
4603
4604InstructionCost
4605AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
4606 const SCEV *Ptr,
4607 TTI::TargetCostKind CostKind) const {
4608 // Address computations in vectorized code with non-consecutive addresses will
4609 // likely result in more instructions compared to scalar code where the
4610 // computation can more often be merged into the index mode. The resulting
4611 // extra micro-ops can significantly decrease throughput.
4612 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4613 int MaxMergeDistance = 64;
4614
4615 if (PtrTy->isVectorTy() && SE &&
4616 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MergeDistance: MaxMergeDistance + 1))
4617 return NumVectorInstToHideOverhead;
4618
4619 // In many cases the address computation is not merged into the instruction
4620 // addressing mode.
4621 return 1;
4622}
4623
4624/// Check whether Opcode1 has less throughput according to the scheduling
4625/// model than Opcode2.
4626bool AArch64TTIImpl::hasKnownLowerThroughputFromSchedulingModel(
4627 unsigned Opcode1, unsigned Opcode2) const {
4628 const MCSchedModel &Sched = ST->getSchedModel();
4629 const TargetInstrInfo *TII = ST->getInstrInfo();
4630 if (!Sched.hasInstrSchedModel())
4631 return false;
4632
4633 const MCSchedClassDesc *SCD1 =
4634 Sched.getSchedClassDesc(SchedClassIdx: TII->get(Opcode: Opcode1).getSchedClass());
4635 const MCSchedClassDesc *SCD2 =
4636 Sched.getSchedClassDesc(SchedClassIdx: TII->get(Opcode: Opcode2).getSchedClass());
4637 // We cannot handle variant scheduling classes without an MI. If we need to
4638 // support them for any of the instructions we query the information of we
4639 // might need to add a way to resolve them without a MI or not use the
4640 // scheduling info.
4641 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4642 "Cannot handle variant scheduling classes without an MI");
4643 if (!SCD1->isValid() || !SCD2->isValid())
4644 return false;
4645
4646 return MCSchedModel::getReciprocalThroughput(STI: *ST, SCDesc: *SCD1) >
4647 MCSchedModel::getReciprocalThroughput(STI: *ST, SCDesc: *SCD2);
4648}
4649
/// Cost of compare and select instructions on AArch64.
///
/// Handles three special regions before deferring to the base implementation:
/// fixed-vector selects (cost-table driven, with cheap CMxx+BFI chains),
/// floating-point compares (fp16/bf16 promotion, libcall fallback for unknown
/// types, and predicate-dependent expansion factors), and integer compares
/// against 0/1/-1 that fold into a preceding 'and'.
InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
    unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
    TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
    TTI::OperandValueInfo Op2Info, const Instruction *I) const {
  // We don't lower some vector selects well that are wider than the register
  // width. TODO: Improve this with different cost kinds.
  if (isa<FixedVectorType>(Val: ValTy) && Opcode == Instruction::Select) {
    // We would need this many instructions to hide the scalarization happening.
    const int AmortizationCost = 20;

    // If VecPred is not set, check if we can get a predicate from the context
    // instruction, if its type matches the requested ValTy.
    if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
      CmpPredicate CurrentPred;
      if (match(V: I, P: m_Select(C: m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value()), L: m_Value(),
                             R: m_Value())))
        VecPred = CurrentPred;
    }
    // Check if we have a compare/select chain that can be lowered using
    // a (F)CMxx & BFI pair.
    if (CmpInst::isIntPredicate(P: VecPred) || VecPred == CmpInst::FCMP_OLE ||
        VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
        VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
        VecPred == CmpInst::FCMP_UNE) {
      static const auto ValidMinMaxTys = {
          MVT::v8i8,  MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
          MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
      static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};

      auto LT = getTypeLegalizationCost(Ty: ValTy);
      if (any_of(Range: ValidMinMaxTys, P: equal_to(Arg&: LT.second)) ||
          (ST->hasFullFP16() &&
           any_of(Range: ValidFP16MinMaxTys, P: equal_to(Arg&: LT.second))))
        return LT.first;
    }

    // Per-type costs for selects that do not match the cheap patterns above.
    // Wide i64 selects are heavily penalized via AmortizationCost.
    static const TypeConversionCostTblEntry VectorSelectTbl[] = {
        {.ISD: Instruction::Select, .Dst: MVT::v2i1, .Src: MVT::v2f32, .Cost: 2},
        {.ISD: Instruction::Select, .Dst: MVT::v2i1, .Src: MVT::v2f64, .Cost: 2},
        {.ISD: Instruction::Select, .Dst: MVT::v4i1, .Src: MVT::v4f32, .Cost: 2},
        {.ISD: Instruction::Select, .Dst: MVT::v4i1, .Src: MVT::v4f16, .Cost: 2},
        {.ISD: Instruction::Select, .Dst: MVT::v8i1, .Src: MVT::v8f16, .Cost: 2},
        {.ISD: Instruction::Select, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: 16},
        {.ISD: Instruction::Select, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: 8},
        {.ISD: Instruction::Select, .Dst: MVT::v16i1, .Src: MVT::v16i32, .Cost: 16},
        {.ISD: Instruction::Select, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: 4 * AmortizationCost},
        {.ISD: Instruction::Select, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: 8 * AmortizationCost},
        {.ISD: Instruction::Select, .Dst: MVT::v16i1, .Src: MVT::v16i64, .Cost: 16 * AmortizationCost}};

    EVT SelCondTy = TLI->getValueType(DL, Ty: CondTy);
    EVT SelValTy = TLI->getValueType(DL, Ty: ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(Table: VectorSelectTbl, ISD: Opcode,
                                                     Dst: SelCondTy.getSimpleVT(),
                                                     Src: SelValTy.getSimpleVT()))
        return Entry->Cost;
    }
  }

  if (Opcode == Instruction::FCmp) {
    // fp16/bf16 compares without native support are costed as the compare at
    // f32 plus the surrounding conversions (no fptrunc of the i1 result, but
    // vector results need an integer truncate of the mask).
    if (auto PromotedCost = getFP16BF16PromoteCost(
            Ty: ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
            // TODO: Consider costing SVE FCMPs.
            /*CanUseSVE=*/false, InstCost: [&](Type *PromotedTy) {
              InstructionCost Cost =
                  getCmpSelInstrCost(Opcode, ValTy: PromotedTy, CondTy, VecPred,
                                     CostKind, Op1Info, Op2Info);
              if (isa<VectorType>(Val: PromotedTy))
                Cost += getCastInstrCost(
                    Opcode: Instruction::Trunc,
                    Dst: VectorType::getInteger(VTy: cast<VectorType>(Val: ValTy)),
                    Src: VectorType::getInteger(VTy: cast<VectorType>(Val: PromotedTy)),
                    CCH: TTI::CastContextHint::None, CostKind);
              return Cost;
            }))
      return *PromotedCost;

    auto LT = getTypeLegalizationCost(Ty: ValTy);
    // Model unknown fp compares as a libcall.
    if (LT.second.getScalarType() != MVT::f64 &&
        LT.second.getScalarType() != MVT::f32 &&
        LT.second.getScalarType() != MVT::f16)
      return LT.first * getCallInstrCost(/*Function*/ F: nullptr, RetTy: ValTy,
                                         Tys: {ValTy, ValTy}, CostKind);

    // Some comparison operators require expanding to multiple compares + or.
    unsigned Factor = 1;
    if (!CondTy->isVectorTy() &&
        (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
      Factor = 2; // fcmp with 2 selects
    else if (isa<FixedVectorType>(Val: ValTy) &&
             (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
              VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
      Factor = 3; // fcmxx+fcmyy+or
    else if (isa<ScalableVectorType>(Val: ValTy) &&
             (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
      Factor = 3; // fcmxx+fcmyy+or

    // Double the cost when the scheduling model says SVE FCMEQ has lower
    // throughput than its NEON counterpart on this subtarget.
    if (isa<ScalableVectorType>(Val: ValTy) &&
        CostKind == TTI::TCK_RecipThroughput &&
        hasKnownLowerThroughputFromSchedulingModel(Opcode1: AArch64::FCMEQ_PPzZZ_S,
                                                   Opcode2: AArch64::FCMEQv4f32))
      Factor *= 2;

    return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
  }

  // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
  // icmp(and, 0) as free, as we can make use of ands, but only if the
  // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
  // providing it will not cause performance regressions.
  if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
      Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(predicate: VecPred) &&
      TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: ValTy)) &&
      match(V: I->getOperand(i: 0), P: m_And(L: m_Value(), R: m_Value()))) {
    if (match(V: I->getOperand(i: 1), P: m_Zero()))
      return 0;

    // x >= 1 / x < 1 -> x > 0 / x <= 0
    if (match(V: I->getOperand(i: 1), P: m_One()) &&
        (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
      return 0;

    // x <= -1 / x > -1 -> x > 0 / x <= 0
    if (match(V: I->getOperand(i: 1), P: m_AllOnes()) &&
        (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
      return 0;
  }

  // The base case handles scalable vectors fine for now, since it treats the
  // cost as 1 * legalization cost.
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                   Op1Info, Op2Info, I);
}
4784
4785AArch64TTIImpl::TTI::MemCmpExpansionOptions
4786AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4787 TTI::MemCmpExpansionOptions Options;
4788 if (ST->requiresStrictAlign()) {
4789 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4790 // a bunch of instructions when strict align is enabled.
4791 return Options;
4792 }
4793 Options.AllowOverlappingLoads = true;
4794 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4795 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4796 // TODO: Though vector loads usually perform well on AArch64, in some targets
4797 // they may wake up the FP unit, which raises the power consumption. Perhaps
4798 // they could be used with no holds barred (-O3).
4799 Options.LoadSizes = {8, 4, 2, 1};
4800 Options.AllowedTailExpansions = {3, 5, 6};
4801 return Options;
4802}
4803
4804bool AArch64TTIImpl::prefersVectorizedAddressing() const {
4805 return ST->hasSVE();
4806}
4807
4808InstructionCost
4809AArch64TTIImpl::getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA,
4810 TTI::TargetCostKind CostKind) const {
4811 switch (MICA.getID()) {
4812 case Intrinsic::masked_scatter:
4813 case Intrinsic::masked_gather:
4814 return getGatherScatterOpCost(MICA, CostKind);
4815 case Intrinsic::masked_load:
4816 case Intrinsic::masked_store:
4817 return getMaskedMemoryOpCost(MICA, CostKind);
4818 }
4819 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
4820}
4821
4822InstructionCost
4823AArch64TTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
4824 TTI::TargetCostKind CostKind) const {
4825 Type *Src = MICA.getDataType();
4826
4827 if (useNeonVector(Ty: Src))
4828 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
4829 auto LT = getTypeLegalizationCost(Ty: Src);
4830 if (!LT.first.isValid())
4831 return InstructionCost::getInvalid();
4832
4833 // Return an invalid cost for element types that we are unable to lower.
4834 auto *VT = cast<VectorType>(Val: Src);
4835 if (VT->getElementType()->isIntegerTy(Bitwidth: 1))
4836 return InstructionCost::getInvalid();
4837
4838 // The code-generator is currently not able to handle scalable vectors
4839 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4840 // it. This change will be removed when code-generation for these types is
4841 // sufficiently reliable.
4842 if (VT->getElementCount() == ElementCount::getScalable(MinVal: 1))
4843 return InstructionCost::getInvalid();
4844
4845 return LT.first;
4846}
4847
4848// This function returns gather/scatter overhead either from
4849// user-provided value or specialized values per-target from \p ST.
4850static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4851 const AArch64Subtarget *ST) {
4852 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4853 "Should be called on only load or stores.");
4854 switch (Opcode) {
4855 case Instruction::Load:
4856 if (SVEGatherOverhead.getNumOccurrences() > 0)
4857 return SVEGatherOverhead;
4858 return ST->getGatherOverhead();
4859 break;
4860 case Instruction::Store:
4861 if (SVEScatterOverhead.getNumOccurrences() > 0)
4862 return SVEScatterOverhead;
4863 return ST->getScatterOverhead();
4864 break;
4865 default:
4866 llvm_unreachable("Shouldn't have reached here");
4867 }
4868}
4869
/// Cost of a masked gather/scatter on AArch64.
///
/// NEON-only fixed vectors and illegal gather/scatter types fall back to the
/// generic implementation; otherwise the cost is the per-element memory-op
/// cost scaled by the SVE gather/scatter overhead and the (maximum) number of
/// elements processed.
InstructionCost
AArch64TTIImpl::getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA,
                                       TTI::TargetCostKind CostKind) const {

  // Gathers are loads; scatters (and vp_scatter) are stores.
  unsigned Opcode = (MICA.getID() == Intrinsic::masked_gather ||
                     MICA.getID() == Intrinsic::vp_gather)
                        ? Instruction::Load
                        : Instruction::Store;

  Type *DataTy = MICA.getDataType();
  Align Alignment = MICA.getAlignment();
  const Instruction *I = MICA.getInst();

  if (useNeonVector(Ty: DataTy) || !isLegalMaskedGatherScatter(DataType: DataTy))
    return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
  auto *VT = cast<VectorType>(Val: DataTy);
  auto LT = getTypeLegalizationCost(Ty: DataTy);
  if (!LT.first.isValid())
    return InstructionCost::getInvalid();

  // Return an invalid cost for element types that we are unable to lower.
  if (!LT.second.isVector() ||
      !isElementTypeLegalForScalableVector(Ty: VT->getElementType()) ||
      VT->getElementType()->isIntegerTy(Bitwidth: 1))
    return InstructionCost::getInvalid();

  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  if (VT->getElementCount() == ElementCount::getScalable(MinVal: 1))
    return InstructionCost::getInvalid();

  ElementCount LegalVF = LT.second.getVectorElementCount();
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, Src: VT->getElementType(), Alignment, AddressSpace: 0, CostKind,
                      OpInfo: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
  // Add on an overhead cost for using gathers/scatters.
  MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
  return LT.first * MemOpCost * getMaxNumElements(VF: LegalVF);
}
4911
4912bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
4913 return isa<FixedVectorType>(Val: Ty) && !ST->useSVEForFixedLengthVectors();
4914}
4915
/// Cost of a plain (unmasked) load/store on AArch64.
///
/// Special cases include invalid scalable predicate shapes, slow misaligned
/// 128-bit stores, truncating/extending NEON memory ops, and non-power-of-2
/// NEON vectors that are split into a series of power-of-2 ld1/st1 ops.
InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
                                                Align Alignment,
                                                unsigned AddressSpace,
                                                TTI::TargetCostKind CostKind,
                                                TTI::OperandValueInfo OpInfo,
                                                const Instruction *I) const {
  EVT VT = TLI->getValueType(DL, Ty, AllowUnknown: true);
  // Type legalization can't handle structs
  if (VT == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace,
                                  CostKind);

  auto LT = getTypeLegalizationCost(Ty);
  if (!LT.first.isValid())
    return InstructionCost::getInvalid();

  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  // We also only support full register predicate loads and stores.
  if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
    if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1) ||
        (VTy->getElementType()->isIntegerTy(Bitwidth: 1) &&
         !VTy->getElementCount().isKnownMultipleOf(
             RHS: ElementCount::getScalable(MinVal: 16))))
      return InstructionCost::getInvalid();

  // TODO: consider latency as well for TCK_SizeAndLatency.
  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
    return LT.first;

  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
      LT.second.is128BitVector() && Alignment < Align(16)) {
    // Unaligned stores are extremely inefficient. We don't split all
    // unaligned 128-bit stores because the negative impact that has shown in
    // practice on inlined block copy code.
    // We make such stores expensive so that we will only vectorize if there
    // are 6 other instructions getting vectorized.
    const int AmortizationCost = 6;

    return LT.first * 2 * AmortizationCost;
  }

  // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
  if (Ty->isPtrOrPtrVectorTy())
    return LT.first;

  if (useNeonVector(Ty)) {
    // Check truncating stores and extending loads.
    if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
      // v4i8 types are lowered to scalar a load/store and sshll/xtn.
      if (VT == MVT::v4i8)
        return 2;
      // Otherwise we need to scalarize.
      return cast<FixedVectorType>(Val: Ty)->getNumElements() * 2;
    }
    EVT EltVT = VT.getVectorElementType();
    unsigned EltSize = EltVT.getScalarSizeInBits();
    if (!isPowerOf2_32(Value: EltSize) || EltSize < 8 || EltSize > 64 ||
        VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
      return LT.first;
    // FIXME: v3i8 lowering currently is very inefficient, due to automatic
    // widening to v4i8, which produces suboptimal results.
    if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
      return LT.first;

    // Check non-power-of-2 loads/stores for legal vector element types with
    // NEON. Non-power-of-2 memory ops will get broken down to a set of
    // operations on smaller power-of-2 ops, including ld1/st1.
    LLVMContext &C = Ty->getContext();
    InstructionCost Cost(0);
    SmallVector<EVT> TypeWorklist;
    TypeWorklist.push_back(Elt: VT);
    while (!TypeWorklist.empty()) {
      EVT CurrVT = TypeWorklist.pop_back_val();
      unsigned CurrNumElements = CurrVT.getVectorNumElements();
      if (isPowerOf2_32(Value: CurrNumElements)) {
        // Power-of-2 piece maps to a single ld1/st1.
        Cost += 1;
        continue;
      }

      // Split a non-power-of-2 count into the largest power-of-2 prefix and
      // the remainder, and cost each piece recursively via the worklist.
      unsigned PrevPow2 = NextPowerOf2(A: CurrNumElements) / 2;
      TypeWorklist.push_back(Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: PrevPow2));
      TypeWorklist.push_back(
          Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: CurrNumElements - PrevPow2));
    }
    return Cost;
  }

  return LT.first;
}
5011
/// Cost model for interleaved (ldN/stN-style) memory accesses.
///
/// Returns an invalid cost when the access cannot be lowered at all
/// (scalable vectors without SVE, non-power-of-2 scalable factors, or masked
/// fixed-width accesses). When the per-lane sub-vector type is legal for the
/// ldN/stN instructions, the cost is the number of such instructions needed;
/// otherwise falls back to the base implementation's estimate.
InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) const {
  assert(Factor >= 2 && "Invalid interleave factor");
  auto *VecVTy = cast<VectorType>(Val: VecTy);

  // Scalable interleaved accesses require SVE.
  if (VecTy->isScalableTy() && !ST->hasSVE())
    return InstructionCost::getInvalid();

  // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
  // only have lowering for power-of-2 factors.
  // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
  // InterleavedAccessPass for ld3/st3
  if (VecTy->isScalableTy() && !isPowerOf2_32(Value: Factor))
    return InstructionCost::getInvalid();

  // Vectorization for masked interleaved accesses is only enabled for scalable
  // VF.
  if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
    return InstructionCost::getInvalid();

  if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
    // SubVecTy is the type seen by each of the Factor ldN/stN lanes.
    auto *SubVecTy =
        VectorType::get(ElementType: VecVTy->getElementType(),
                        EC: VecVTy->getElementCount().divideCoefficientBy(RHS: Factor));

    // ldN/stN only support legal vector types of size 64 or 128 in bits.
    // Accesses having vector types that are a multiple of 128 bits can be
    // matched to more than one ldN/stN instruction.
    bool UseScalable;
    if (MinElts % Factor == 0 &&
        TLI->isLegalInterleavedAccessType(VecTy: SubVecTy, DL, UseScalable))
      return Factor * TLI->getNumInterleavedAccesses(VecTy: SubVecTy, DL, UseScalable);
  }

  // Not directly lowerable to ldN/stN: use the generic (scalarizing) model.
  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}
5053
5054InstructionCost
5055AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
5056 InstructionCost Cost = 0;
5057 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5058 for (auto *I : Tys) {
5059 if (!I->isVectorTy())
5060 continue;
5061 if (I->getScalarSizeInBits() * cast<FixedVectorType>(Val: I)->getNumElements() ==
5062 128)
5063 Cost += getMemoryOpCost(Opcode: Instruction::Store, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind) +
5064 getMemoryOpCost(Opcode: Instruction::Load, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind);
5065 }
5066 return Cost;
5067}
5068
5069unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
5070 return ST->getMaxInterleaveFactor();
5071}
5072
// For Falkor, we want to avoid having too many strided loads in a loop since
// that can exhaust the HW prefetcher resources. We adjust the unroller
// MaxCount preference below to attempt to ensure unrolling doesn't create too
// many strided loads.
static void
getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                              TargetTransformInfo::UnrollingPreferences &UP,) {
  enum { MaxStridedLoads = 7 };
  // Count loads whose address is a loop-varying affine AddRec, i.e. loads the
  // HW prefetcher would treat as strided streams. Stops counting early once
  // more than MaxStridedLoads / 2 are seen, since beyond that point the
  // MaxCount computation below can no longer change.
  auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
    int StridedLoads = 0;
    // FIXME? We could make this more precise by looking at the CFG and
    // e.g. not counting loads in each side of an if-then-else diamond.
    for (const auto BB : L->blocks()) {
      for (auto &I : *BB) {
        LoadInst *LMemI = dyn_cast<LoadInst>(Val: &I);
        if (!LMemI)
          continue;

        Value *PtrValue = LMemI->getPointerOperand();
        if (L->isLoopInvariant(V: PtrValue))
          continue;

        const SCEV *LSCEV = SE.getSCEV(V: PtrValue);
        const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(Val: LSCEV);
        if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
          continue;

        // FIXME? We could take pairing of unrolled load copies into account
        // by looking at the AddRec, but we would probably have to limit this
        // to loops with no stores or other memory optimization barriers.
        ++StridedLoads;
        // We've seen enough strided loads that seeing more won't make a
        // difference.
        if (StridedLoads > MaxStridedLoads / 2)
          return StridedLoads;
      }
    }
    return StridedLoads;
  };

  int StridedLoads = countStridedLoads(L, SE);
  LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
                    << " strided loads\n");
  // Pick the largest power of 2 unroll count that won't result in too many
  // strided loads.
  if (StridedLoads) {
    UP.MaxCount = 1 << Log2_32(Value: MaxStridedLoads / StridedLoads);
    LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
                      << UP.MaxCount << '\n');
  }
}
5124
5125// This function returns true if the loop:
5126// 1. Has a valid cost, and
5127// 2. Has a cost within the supplied budget.
5128// Otherwise it returns false.
5129static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI,
5130 InstructionCost Budget,
5131 unsigned *FinalSize) {
5132 // Estimate the size of the loop.
5133 InstructionCost LoopCost = 0;
5134
5135 for (auto *BB : L->getBlocks()) {
5136 for (auto &I : *BB) {
5137 SmallVector<const Value *, 4> Operands(I.operand_values());
5138 InstructionCost Cost =
5139 TTI.getInstructionCost(U: &I, Operands, CostKind: TTI::TCK_CodeSize);
5140 // This can happen with intrinsics that don't currently have a cost model
5141 // or for some operations that require SVE.
5142 if (!Cost.isValid())
5143 return false;
5144
5145 LoopCost += Cost;
5146 if (LoopCost > Budget)
5147 return false;
5148 }
5149 }
5150
5151 if (FinalSize)
5152 *FinalSize = LoopCost.getValue();
5153 return true;
5154}
5155
/// Heuristic for runtime-unrolling small multi-exit (search-style) loops:
/// requires an unknown-but-symbolic trip count, a max trip count above 32, a
/// code-size cost of at most 5, exactly two blocks, and branch terminators
/// throughout.
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
                                      const AArch64TTIImpl &TTI) {
  // Only consider loops with unknown trip counts for which we can determine
  // a symbolic expression. Multi-exit loops with small known trip counts will
  // likely be unrolled anyway.
  const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
  if (isa<SCEVConstant>(Val: BTC) || isa<SCEVCouldNotCompute>(Val: BTC))
    return false;

  // It might not be worth unrolling loops with low max trip counts. Restrict
  // this to max trip counts > 32 for now.
  unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
  if (MaxTC > 0 && MaxTC <= 32)
    return false;

  // Make sure the loop size is <= 5.
  if (!isLoopSizeWithinBudget(L, TTI, Budget: 5, FinalSize: nullptr))
    return false;

  // Small search loops with multiple exits can be highly beneficial to unroll.
  // We only care about loops with exactly two exiting blocks, although each
  // block could jump to the same exit block.
  ArrayRef<BasicBlock *> Blocks = L->getBlocks();
  if (Blocks.size() != 2)
    return false;

  // Require plain conditional/unconditional branches; other terminators
  // (e.g. switches) are not handled by this heuristic.
  if (any_of(Range&: Blocks, P: [](BasicBlock *BB) {
        return !isa<BranchInst>(Val: BB->getTerminator());
      }))
    return false;

  return true;
}
5189
/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
/// OOO engine's wide instruction window and various predictors.
static void
getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
                                 TargetTransformInfo::UnrollingPreferences &UP,
                                 const AArch64TTIImpl &TTI) {
  // Limit loops with structure that is highly likely to benefit from runtime
  // unrolling; that is we exclude outer loops and loops with many blocks (i.e.
  // likely with complex control flow). Note that the heuristics here may be
  // overly conservative and we err on the side of avoiding runtime unrolling
  // rather than unroll excessively. They are all subject to further refinement.
  if (!L->isInnermost() || L->getNumBlocks() > 8)
    return;

  // Loops with multiple exits are handled by common code.
  if (!L->getExitBlock())
    return;

  // Check if the loop contains any reductions that could be parallelized when
  // unrolling. If so, enable partial unrolling, if the trip count is known to
  // be a multiple of 2.
  bool HasParellelizableReductions =
      L->getNumBlocks() == 1 &&
      any_of(Range: L->getHeader()->phis(),
             P: [&SE, L](PHINode &Phi) {
               return canParallelizeReductionWhenUnrolling(Phi, L, SE: &SE);
             }) &&
      isLoopSizeWithinBudget(L, TTI, Budget: 12, FinalSize: nullptr);
  if (HasParellelizableReductions &&
      SE.getSmallConstantTripMultiple(L, ExitingBlock: L->getExitingBlock()) % 2 == 0) {
    UP.Partial = true;
    UP.MaxCount = 4;
    UP.AddAdditionalAccumulators = true;
  }

  // Skip loops with known-constant or uncomputable trip counts, and loops
  // whose max trip count is small (<= 32); those gain little from runtime
  // unrolling.
  const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
  if (isa<SCEVConstant>(Val: BTC) || isa<SCEVCouldNotCompute>(Val: BTC) ||
      (SE.getSmallConstantMaxTripCount(L) > 0 &&
       SE.getSmallConstantMaxTripCount(L) <= 32))
    return;

  // Don't runtime-unroll loops the vectorizer already processed.
  if (findStringMetadataForLoop(TheLoop: L, Name: "llvm.loop.isvectorized"))
    return;

  if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L))
    return;

  // Limit to loops with trip counts that are cheap to expand.
  UP.SCEVExpansionBudget = 1;

  if (HasParellelizableReductions) {
    UP.Runtime = true;
    UP.DefaultUnrollRuntimeCount = 4;
    UP.AddAdditionalAccumulators = true;
  }

  // Try to unroll small loops, of few-blocks with low budget, if they have
  // load/store dependencies, to expose more parallel memory access streams,
  // or if they do little work inside a block (i.e. load -> X -> store pattern).
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  if (Header == Latch) {
    // Estimate the size of the loop.
    unsigned Size;
    unsigned Width = 10;
    if (!isLoopSizeWithinBudget(L, TTI, Budget: Width, FinalSize: &Size))
      return;

    // Try to find an unroll count that maximizes the use of the instruction
    // window, i.e. trying to fetch as many instructions per cycle as possible.
    unsigned MaxInstsPerLine = 16;
    unsigned UC = 1;
    unsigned BestUC = 1;
    unsigned SizeWithBestUC = BestUC * Size;
    while (UC <= 8) {
      unsigned SizeWithUC = UC * Size;
      if (SizeWithUC > 48)
        break;
      if ((SizeWithUC % MaxInstsPerLine) == 0 ||
          (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
        BestUC = UC;
        SizeWithBestUC = BestUC * Size;
      }
      UC++;
    }

    if (BestUC == 1)
      return;

    // Collect loop-varying loaded values (plus their first in-loop users) and
    // all loop-varying stores; unrolling only helps if a store feeds off one
    // of those values.
    SmallPtrSet<Value *, 8> LoadedValuesPlus;
    SmallVector<StoreInst *> Stores;
    for (auto *BB : L->blocks()) {
      for (auto &I : *BB) {
        Value *Ptr = getLoadStorePointerOperand(V: &I);
        if (!Ptr)
          continue;
        const SCEV *PtrSCEV = SE.getSCEV(V: Ptr);
        if (SE.isLoopInvariant(S: PtrSCEV, L))
          continue;
        if (isa<LoadInst>(Val: &I)) {
          LoadedValuesPlus.insert(Ptr: &I);
          // Include in-loop 1st users of loaded values.
          for (auto *U : I.users())
            if (L->contains(Inst: cast<Instruction>(Val: U)))
              LoadedValuesPlus.insert(Ptr: U);
        } else
          Stores.push_back(Elt: cast<StoreInst>(Val: &I));
      }
    }

    if (none_of(Range&: Stores, P: [&LoadedValuesPlus](StoreInst *SI) {
          return LoadedValuesPlus.contains(Ptr: SI->getOperand(i_nocapture: 0));
        }))
      return;

    UP.Runtime = true;
    UP.DefaultUnrollRuntimeCount = BestUC;
    return;
  }

  // Try to runtime-unroll loops with early-continues depending on loop-varying
  // loads; this helps with branch-prediction for the early-continues.
  auto *Term = dyn_cast<BranchInst>(Val: Header->getTerminator());
  SmallVector<BasicBlock *> Preds(predecessors(BB: Latch));
  if (!Term || !Term->isConditional() || Preds.size() == 1 ||
      !llvm::is_contained(Range&: Preds, Element: Header) ||
      none_of(Range&: Preds, P: [L](BasicBlock *Pred) { return L->contains(BB: Pred); }))
    return;

  // Walk the operand tree (depth-limited to 8) looking for a loop-varying
  // load feeding the early-continue condition.
  std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
      [&](Instruction *I, unsigned Depth) -> bool {
    if (isa<PHINode>(Val: I) || L->isLoopInvariant(V: I) || Depth > 8)
      return false;

    if (isa<LoadInst>(Val: I))
      return true;

    return any_of(Range: I->operands(), P: [&](Value *V) {
      auto *I = dyn_cast<Instruction>(Val: V);
      return I && DependsOnLoopLoad(I, Depth + 1);
    });
  };
  CmpPredicate Pred;
  Instruction *I;
  if (match(V: Term, P: m_Br(C: m_ICmp(Pred, L: m_Instruction(I), R: m_Value()), T: m_Value(),
                         F: m_Value())) &&
      DependsOnLoopLoad(I, 0)) {
    UP.Runtime = true;
  }
}
5340
/// AArch64 unrolling preferences: start from the generic preferences, then
/// bail out of loops containing calls or vector code, apply CPU-family
/// specific tuning (Apple, Falkor), boost small multi-exit search loops, and
/// enable runtime/partial unrolling plus unroll-and-jam for in-order cores.
void AArch64TTIImpl::getUnrollingPreferences(
    Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
    OptimizationRemarkEmitter *ORE) const {
  // Enable partial unrolling and runtime unrolling.
  BaseT::getUnrollingPreferences(L, SE, UP, ORE);

  UP.UpperBound = true;

  // For inner loop, it is more likely to be a hot one, and the runtime check
  // can be promoted out from LICM pass, so the overhead is less, let's try
  // a larger threshold to unroll more loops.
  if (L->getLoopDepth() > 1)
    UP.PartialThreshold *= 2;

  // Disable partial & runtime unrolling on -Os.
  UP.PartialOptSizeThreshold = 0;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining. Don't unroll auto-vectorized loops either, though do allow
  // unrolling of the scalar remainder.
  bool IsVectorized = getBooleanLoopAttribute(TheLoop: L, Name: "llvm.loop.isvectorized");
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Both auto-vectorized loops and the scalar remainder have the
      // isvectorized attribute, so differentiate between them by the presence
      // of vector instructions.
      if (IsVectorized && I.getType()->isVectorTy())
        return;
      if (isa<CallBase>(Val: I)) {
        // Calls that are not lowered to an actual call instruction (e.g.
        // known intrinsics) are fine; everything else blocks unrolling.
        if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I))
          if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction())
            if (!isLoweredToCall(F))
              continue;
        return;
      }

      SmallVector<const Value *, 4> Operands(I.operand_values());
      Cost += getInstructionCost(U: &I, Operands,
                                 CostKind: TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  // Apply subtarget-specific unrolling preferences.
  if (ST->isAppleMLike())
    getAppleRuntimeUnrollPreferences(L, SE, UP, TTI: *this);
  else if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
           EnableFalkorHWPFUnrollFix)
    getFalkorUnrollingPreferences(L, SE, UP);

  // If this is a small, multi-exit loop similar to something like std::find,
  // then there is typically a performance improvement achieved by unrolling.
  if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, TTI: *this)) {
    UP.RuntimeUnrollMultiExit = true;
    UP.Runtime = true;
    // Limit unroll count.
    UP.DefaultUnrollRuntimeCount = 4;
    // Allow slightly more costly trip-count expansion to catch search loops
    // with pointer inductions.
    UP.SCEVExpansionBudget = 5;
    return;
  }

  // Enable runtime unrolling for in-order models
  // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
  // checking for that case, we can ensure that the default behaviour is
  // unchanged
  if (ST->getProcFamily() != AArch64Subtarget::Generic &&
      !ST->getSchedModel().isOutOfOrder()) {
    UP.Runtime = true;
    UP.Partial = true;
    UP.UnrollRemainder = true;
    UP.DefaultUnrollRuntimeCount = 4;

    UP.UnrollAndJam = true;
    UP.UnrollAndJamInnerLoopThreshold = 60;
  }

  // Force unrolling small loops can be very useful because of the branch
  // taken cost of the backedge.
  if (Cost < Aarch64ForceUnrollThreshold)
    UP.Force = true;
}
5424
/// No AArch64-specific peeling tuning; use the generic preferences.
void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::PeelingPreferences &PP) const {
  BaseT::getPeelingPreferences(L, SE, PP);
}
5429
/// Map a NEON structured load/store intrinsic to a value of \p ExpectedType,
/// for reuse by redundancy elimination. For stN the stored operands are
/// repackaged into a struct (built only when \p CanCreate allows emitting new
/// IR); for ldN the intrinsic's own result is reused when the type matches.
/// Returns nullptr when no equivalent value can be produced.
Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                                         Type *ExpectedType,
                                                         bool CanCreate) const {
  switch (Inst->getIntrinsicID()) {
  default:
    return nullptr;
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4: {
    // Create a struct type
    StructType *ST = dyn_cast<StructType>(Val: ExpectedType);
    if (!CanCreate || !ST)
      return nullptr;
    // The last argument is the pointer; the rest are the stored values and
    // must match the struct's element types one-to-one.
    unsigned NumElts = Inst->arg_size() - 1;
    if (ST->getNumElements() != NumElts)
      return nullptr;
    for (unsigned i = 0, e = NumElts; i != e; ++i) {
      if (Inst->getArgOperand(i)->getType() != ST->getElementType(N: i))
        return nullptr;
    }
    Value *Res = PoisonValue::get(T: ExpectedType);
    IRBuilder<> Builder(Inst);
    for (unsigned i = 0, e = NumElts; i != e; ++i) {
      Value *L = Inst->getArgOperand(i);
      Res = Builder.CreateInsertValue(Agg: Res, Val: L, Idxs: i);
    }
    return Res;
  }
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    if (Inst->getType() == ExpectedType)
      return Inst;
    return nullptr;
  }
}
5466
5467bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
5468 MemIntrinsicInfo &Info) const {
5469 switch (Inst->getIntrinsicID()) {
5470 default:
5471 break;
5472 case Intrinsic::aarch64_neon_ld2:
5473 case Intrinsic::aarch64_neon_ld3:
5474 case Intrinsic::aarch64_neon_ld4:
5475 Info.ReadMem = true;
5476 Info.WriteMem = false;
5477 Info.PtrVal = Inst->getArgOperand(i: 0);
5478 break;
5479 case Intrinsic::aarch64_neon_st2:
5480 case Intrinsic::aarch64_neon_st3:
5481 case Intrinsic::aarch64_neon_st4:
5482 Info.ReadMem = false;
5483 Info.WriteMem = true;
5484 Info.PtrVal = Inst->getArgOperand(i: Inst->arg_size() - 1);
5485 break;
5486 }
5487
5488 switch (Inst->getIntrinsicID()) {
5489 default:
5490 return false;
5491 case Intrinsic::aarch64_neon_ld2:
5492 case Intrinsic::aarch64_neon_st2:
5493 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5494 break;
5495 case Intrinsic::aarch64_neon_ld3:
5496 case Intrinsic::aarch64_neon_st3:
5497 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5498 break;
5499 case Intrinsic::aarch64_neon_ld4:
5500 case Intrinsic::aarch64_neon_st4:
5501 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5502 break;
5503 }
5504 return true;
5505}
5506
5507/// See if \p I should be considered for address type promotion. We check if \p
5508/// I is a sext with right type and used in memory accesses. If it used in a
5509/// "complex" getelementptr, we allow it to be promoted without finding other
5510/// sext instructions that sign extended the same initial value. A getelementptr
5511/// is considered as "complex" if it has more than 2 operands.
5512bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
5513 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5514 bool Considerable = false;
5515 AllowPromotionWithoutCommonHeader = false;
5516 if (!isa<SExtInst>(Val: &I))
5517 return false;
5518 Type *ConsideredSExtType =
5519 Type::getInt64Ty(C&: I.getParent()->getParent()->getContext());
5520 if (I.getType() != ConsideredSExtType)
5521 return false;
5522 // See if the sext is the one with the right type and used in at least one
5523 // GetElementPtrInst.
5524 for (const User *U : I.users()) {
5525 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(Val: U)) {
5526 Considerable = true;
5527 // A getelementptr is considered as "complex" if it has more than 2
5528 // operands. We will promote a SExt used in such complex GEP as we
5529 // expect some computation to be merged if they are done on 64 bits.
5530 if (GEPInst->getNumOperands() > 2) {
5531 AllowPromotionWithoutCommonHeader = true;
5532 break;
5533 }
5534 }
5535 }
5536 return Considerable;
5537}
5538
5539bool AArch64TTIImpl::isLegalToVectorizeReduction(
5540 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5541 if (!VF.isScalable())
5542 return true;
5543
5544 Type *Ty = RdxDesc.getRecurrenceType();
5545 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5546 return false;
5547
5548 switch (RdxDesc.getRecurrenceKind()) {
5549 case RecurKind::Sub:
5550 case RecurKind::AddChainWithSubs:
5551 case RecurKind::Add:
5552 case RecurKind::FAdd:
5553 case RecurKind::And:
5554 case RecurKind::Or:
5555 case RecurKind::Xor:
5556 case RecurKind::SMin:
5557 case RecurKind::SMax:
5558 case RecurKind::UMin:
5559 case RecurKind::UMax:
5560 case RecurKind::FMin:
5561 case RecurKind::FMax:
5562 case RecurKind::FMulAdd:
5563 case RecurKind::AnyOf:
5564 case RecurKind::FindLast:
5565 return true;
5566 default:
5567 return false;
5568 }
5569}
5570
/// Cost of a min/max reduction: any splitting cost for illegal types plus a
/// fixed cost of 2 for the final horizontal reduction on the legal type.
InstructionCost
AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                       FastMathFlags FMF,
                                       TTI::TargetCostKind CostKind) const {
  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
    if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
      return InstructionCost::getInvalid();

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  // Without full fp16 support, f16 min/max has no native lowering; defer to
  // the generic model.
  if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  // If the type splits into LT.first legal parts, the parts are combined with
  // LT.first - 1 pairwise min/max intrinsics before the final reduction.
  InstructionCost LegalizationCost = 0;
  if (LT.first > 1) {
    Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: Ty->getContext());
    IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
    LegalizationCost = getIntrinsicInstrCost(ICA: Attrs, CostKind) * (LT.first - 1);
  }

  return LegalizationCost + /*Cost of horizontal reduction*/ 2;
}
5597
/// SVE arithmetic reduction cost: splitting cost for illegal types plus a
/// fixed cost of 2 for the supported horizontal reductions (add/and/or/xor/
/// fadd); other opcodes are reported as invalid.
InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
    unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
  // A type split into LT.first parts needs LT.first - 1 binary ops to combine
  // the parts before the final reduction.
  InstructionCost LegalizationCost = 0;
  if (LT.first > 1) {
    Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: ValTy->getContext());
    LegalizationCost = getArithmeticInstrCost(Opcode, Ty: LegalVTy, CostKind);
    LegalizationCost *= LT.first - 1;
  }

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");
  // Add the final reduction cost for the legal horizontal reduction
  switch (ISD) {
  case ISD::ADD:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::FADD:
    return LegalizationCost + 2;
  default:
    return InstructionCost::getInvalid();
  }
}
5622
/// Arithmetic vector reduction cost: handles ordered (strict) reductions,
/// dispatches scalable types to the SVE model, uses faddp chains for fp adds,
/// a cost table for integer add/and/or/xor, and falls back to the generic
/// model otherwise.
InstructionCost
AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                                           std::optional<FastMathFlags> FMF,
                                           TTI::TargetCostKind CostKind) const {
  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  if (auto *VTy = dyn_cast<ScalableVectorType>(Val: ValTy))
    if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
      return InstructionCost::getInvalid();

  if (TTI::requiresOrderedReduction(FMF)) {
    if (auto *FixedVTy = dyn_cast<FixedVectorType>(Val: ValTy)) {
      InstructionCost BaseCost =
          BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
      // Add on extra cost to reflect the extra overhead on some CPUs. We still
      // end up vectorizing for more computationally intensive loops.
      return BaseCost + FixedVTy->getNumElements();
    }

    // Ordered scalable reductions are only supported for fadd.
    if (Opcode != Instruction::FAdd)
      return InstructionCost::getInvalid();

    // Model a strict scalable fadd reduction as one scalar fadd per element
    // at the maximum possible vector length.
    auto *VTy = cast<ScalableVectorType>(Val: ValTy);
    InstructionCost Cost =
        getArithmeticInstrCost(Opcode, Ty: VTy->getScalarType(), CostKind);
    Cost *= getMaxNumElements(VF: VTy->getElementCount());
    return Cost;
  }

  if (isa<ScalableVectorType>(Val: ValTy))
    return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
  MVT MTy = LT.second;
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Horizontal adds can use the 'addv' instruction. We model the cost of these
  // instructions as twice a normal vector add, plus 1 for each legalization
  // step (LT.first). This is the only arithmetic vector reduction operation for
  // which we have an instruction.
  // OR, XOR and AND costs should match the codegen from:
  // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
  // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
  // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
  static const CostTblEntry CostTblNoPairwise[]{
      {.ISD: ISD::ADD, .Type: MVT::v8i8, .Cost: 2},
      {.ISD: ISD::ADD, .Type: MVT::v16i8, .Cost: 2},
      {.ISD: ISD::ADD, .Type: MVT::v4i16, .Cost: 2},
      {.ISD: ISD::ADD, .Type: MVT::v8i16, .Cost: 2},
      {.ISD: ISD::ADD, .Type: MVT::v2i32, .Cost: 2},
      {.ISD: ISD::ADD, .Type: MVT::v4i32, .Cost: 2},
      {.ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: 2},
      {.ISD: ISD::OR, .Type: MVT::v8i8, .Cost: 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
      {.ISD: ISD::OR, .Type: MVT::v16i8, .Cost: 7}, // ext + orr + same as v8i8
      {.ISD: ISD::OR, .Type: MVT::v4i16, .Cost: 4}, // fmov + orr_lsr + lsr + orr
      {.ISD: ISD::OR, .Type: MVT::v8i16, .Cost: 6}, // ext + orr + same as v4i16
      {.ISD: ISD::OR, .Type: MVT::v2i32, .Cost: 3}, // fmov + lsr + orr
      {.ISD: ISD::OR, .Type: MVT::v4i32, .Cost: 5}, // ext + orr + same as v2i32
      {.ISD: ISD::OR, .Type: MVT::v2i64, .Cost: 3}, // ext + orr + fmov
      {.ISD: ISD::XOR, .Type: MVT::v8i8, .Cost: 5}, // Same as above for or...
      {.ISD: ISD::XOR, .Type: MVT::v16i8, .Cost: 7},
      {.ISD: ISD::XOR, .Type: MVT::v4i16, .Cost: 4},
      {.ISD: ISD::XOR, .Type: MVT::v8i16, .Cost: 6},
      {.ISD: ISD::XOR, .Type: MVT::v2i32, .Cost: 3},
      {.ISD: ISD::XOR, .Type: MVT::v4i32, .Cost: 5},
      {.ISD: ISD::XOR, .Type: MVT::v2i64, .Cost: 3},
      {.ISD: ISD::AND, .Type: MVT::v8i8, .Cost: 5}, // Same as above for or...
      {.ISD: ISD::AND, .Type: MVT::v16i8, .Cost: 7},
      {.ISD: ISD::AND, .Type: MVT::v4i16, .Cost: 4},
      {.ISD: ISD::AND, .Type: MVT::v8i16, .Cost: 6},
      {.ISD: ISD::AND, .Type: MVT::v2i32, .Cost: 3},
      {.ISD: ISD::AND, .Type: MVT::v4i32, .Cost: 5},
      {.ISD: ISD::AND, .Type: MVT::v2i64, .Cost: 3},
  };
  switch (ISD) {
  default:
    break;
  case ISD::FADD:
    if (Type *EltTy = ValTy->getScalarType();
        // FIXME: For half types without fullfp16 support, this could extend and
        // use a fp32 faddp reduction but current codegen unrolls.
        MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
                           (EltTy->isHalfTy() && ST->hasFullFP16()))) {
      const unsigned NElts = MTy.getVectorNumElements();
      if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
          isPowerOf2_32(Value: NElts))
        // Reduction corresponding to series of fadd instructions is lowered to
        // series of faddp instructions. faddp has latency/throughput that
        // matches fadd instruction and hence, every faddp instruction can be
        // considered to have a relative cost = 1 with
        // CostKind = TCK_RecipThroughput.
        // An faddp will pairwise add vector elements, so the size of input
        // vector reduces by half every time, requiring
        // #(faddp instructions) = log2_32(NElts).
        return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(Value: NElts);
    }
    break;
  case ISD::ADD:
    if (const auto *Entry = CostTableLookup(Table: CostTblNoPairwise, ISD, Ty: MTy))
      return (LT.first - 1) + Entry->Cost;
    break;
  case ISD::XOR:
  case ISD::AND:
  case ISD::OR:
    const auto *Entry = CostTableLookup(Table: CostTblNoPairwise, ISD, Ty: MTy);
    if (!Entry)
      break;
    auto *ValVTy = cast<FixedVectorType>(Val: ValTy);
    if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
        isPowerOf2_32(Value: ValVTy->getNumElements())) {
      InstructionCost ExtraCost = 0;
      if (LT.first != 1) {
        // Type needs to be split, so there is an extra cost of LT.first - 1
        // arithmetic ops.
        auto *Ty = FixedVectorType::get(ElementType: ValTy->getElementType(),
                                        NumElts: MTy.getVectorNumElements());
        ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
        ExtraCost *= LT.first - 1;
      }
      // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
      auto Cost = ValVTy->getElementType()->isIntegerTy(Bitwidth: 1) ? 2 : Entry->Cost;
      return Cost + ExtraCost;
    }
    break;
  }
  return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
}
5753
/// Cost of a reduction combined with a zero/sign extension of the input.
/// Extending integer-add reductions that map onto UADDLV/UADDLP get a cheap
/// fixed cost; everything else defers to the base implementation.
InstructionCost AArch64TTIImpl::getExtendedReductionCost(
    unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
    std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
  EVT VecVT = TLI->getValueType(DL, Ty: VecTy);
  EVT ResVT = TLI->getValueType(DL, Ty: ResTy);

  if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
      VecVT.getSizeInBits() >= 64) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VecTy);

    // The legal cases are:
    // UADDLV 8/16/32->32
    // UADDLP 32->64
    unsigned RevVTSize = ResVT.getSizeInBits();
    if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
         RevVTSize <= 32) ||
        ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
         RevVTSize <= 32) ||
        ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
         RevVTSize <= 64))
      // 2 per legalization step beyond the first, plus 2 for the reduction.
      return (LT.first - 1) * 2 + 2;
  }

  return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: VecTy, FMF,
                                         CostKind);
}
5780
/// Cost of a multiply-accumulate reduction. With the dotprod feature, i8
/// inputs reducing to i32 map onto UDOT/SDOT plus a final UADDV and get a
/// cheap fixed cost; everything else defers to the base implementation.
InstructionCost
AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
                                       Type *ResTy, VectorType *VecTy,
                                       TTI::TargetCostKind CostKind) const {
  EVT VecVT = TLI->getValueType(DL, Ty: VecTy);
  EVT ResVT = TLI->getValueType(DL, Ty: ResTy);

  if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
      RedOpcode == Instruction::Add) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VecTy);

    // The legal cases with dotprod are
    // UDOT 8->32
    // Which requires an additional uaddv to sum the i32 values.
    if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
        ResVT == MVT::i32)
      return LT.first + 2;
  }

  return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, Ty: VecTy,
                                       CostKind);
}
5803
/// Return the cost of a vector splice (vector.splice / SK_Splice) of the
/// scalable vector type \p Tp at offset \p Index. The SVE SPLICE/EXT
/// instructions implement this directly for legal types; i1 predicate
/// vectors are first promoted, and a negative index additionally needs a
/// predicate formed from a compare plus a select.
InstructionCost
AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index,
                              TTI::TargetCostKind CostKind) const {
  // All legal (promoted) scalable element types splice in one instruction.
  static const CostTblEntry ShuffleTbl[] = {
      { TTI::SK_Splice, MVT::nxv16i8, 1 },
      { TTI::SK_Splice, MVT::nxv8i16, 1 },
      { TTI::SK_Splice, MVT::nxv4i32, 1 },
      { TTI::SK_Splice, MVT::nxv2i64, 1 },
      { TTI::SK_Splice, MVT::nxv2f16, 1 },
      { TTI::SK_Splice, MVT::nxv4f16, 1 },
      { TTI::SK_Splice, MVT::nxv8f16, 1 },
      { TTI::SK_Splice, MVT::nxv2bf16, 1 },
      { TTI::SK_Splice, MVT::nxv4bf16, 1 },
      { TTI::SK_Splice, MVT::nxv8bf16, 1 },
      { TTI::SK_Splice, MVT::nxv2f32, 1 },
      { TTI::SK_Splice, MVT::nxv4f32, 1 },
      { TTI::SK_Splice, MVT::nxv2f64, 1 },
  };

  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  if (Tp->getElementCount() == ElementCount::getScalable(1))
    return InstructionCost::getInvalid();

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
  Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
  // i1 splices are performed on the promoted (integer) predicate type.
  EVT PromotedVT = LT.second.getScalarType() == MVT::i1
                       ? TLI->getPromotedVTForPredicate(EVT(LT.second))
                       : LT.second;
  Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
  InstructionCost LegalizationCost = 0;
  // A negative index is lowered with a predicate built from a compare,
  // followed by a select; account for both.
  if (Index < 0) {
    LegalizationCost =
        getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
                           CmpInst::BAD_ICMP_PREDICATE, CostKind) +
        getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
                           CmpInst::BAD_ICMP_PREDICATE, CostKind);
  }

  // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
  // Cost performed on a promoted type.
  if (LT.second.getScalarType() == MVT::i1) {
    LegalizationCost +=
        getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
                         TTI::CastContextHint::None, CostKind) +
        getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
                         TTI::CastContextHint::None, CostKind);
  }
  const auto *Entry =
      CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
  assert(Entry && "Illegal Type for Splice");
  LegalizationCost += Entry->Cost;
  return LegalizationCost * LT.first;
}
5860
/// Return the cost of a partial reduction (e.g. the partial.reduce.add
/// intrinsic formed from extend/mul/accumulate chains), or an invalid cost
/// when the target cannot profitably lower it. The cheap costs returned
/// here correspond to the UDOT/SDOT/USDOT/FDOT family of instructions.
InstructionCost AArch64TTIImpl::getPartialReductionCost(
    unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
    ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
    TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
    TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
  InstructionCost Invalid = InstructionCost::getInvalid();

  // The costs below model reciprocal throughput only.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Invalid;

  // Fixed-width partial reductions need either SVE or NEON with dotprod.
  if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
      (!ST->isNeonAvailable() || !ST->hasDotProd()))
    return Invalid;

  // Only accumulating adds/subs (integer or FP) of extended operands are
  // supported.
  if ((Opcode != Instruction::Add && Opcode != Instruction::Sub &&
       Opcode != Instruction::FAdd) ||
      OpAExtend == TTI::PR_None)
    return Invalid;

  // Floating-point partial reductions are invalid if `reassoc` and `contract`
  // are not allowed.
  if (AccumType->isFloatingPointTy()) {
    assert(FMF && "Missing FastMathFlags for floating-point partial reduction");
    if (!FMF->allowReassoc() || !FMF->allowContract())
      return Invalid;
  } else {
    assert(!FMF &&
           "FastMathFlags only apply to floating-point partial reductions");
  }

  assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
         (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
         "Unexpected values for OpBExtend or InputTypeB");

  // We only support multiply binary operations for now, and for muls we
  // require the types being extended to be the same.
  if (BinOp && ((*BinOp != Instruction::Mul && *BinOp != Instruction::FMul) ||
                InputTypeA != InputTypeB))
    return Invalid;

  // Mixed-sign extends require the USDOT instruction (+i8mm feature).
  bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
  if (IsUSDot && !ST->hasMatMulInt8())
    return Invalid;

  // The VF must be wide enough that the accumulator vector actually shrinks;
  // otherwise this is not a partial reduction.
  unsigned Ratio =
      AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
  if (VF.getKnownMinValue() <= Ratio)
    return Invalid;

  VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
  VectorType *AccumVectorType =
      VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
  // We don't yet support all kinds of legalization.
  auto TC = TLI->getTypeConversion(AccumVectorType->getContext(),
                                   EVT::getEVT(AccumVectorType));
  switch (TC.first) {
  default:
    return Invalid;
  case TargetLowering::TypeLegal:
  case TargetLowering::TypePromoteInteger:
  case TargetLowering::TypeSplitVector:
    // The legalised type (e.g. after splitting) must be legal too.
    if (TLI->getTypeAction(AccumVectorType->getContext(), TC.second) !=
        TargetLowering::TypeLegal)
      return Invalid;
    break;
  }

  std::pair<InstructionCost, MVT> AccumLT =
      getTypeLegalizationCost(AccumVectorType);
  std::pair<InstructionCost, MVT> InputLT =
      getTypeLegalizationCost(InputVectorType);

  InstructionCost Cost = InputLT.first * TTI::TCC_Basic;

  // The sub/negation cannot be folded into the operands of
  // ISD::PARTIAL_REDUCE_*MLA, so make the cost more expensive.
  if (Opcode == Instruction::Sub)
    Cost += 8;

  // Prefer using full types by costing half-full input types as more expensive.
  if (TypeSize::isKnownLT(InputVectorType->getPrimitiveSizeInBits(),
                          TypeSize::getScalable(128)))
    // FIXME: This can be removed after the cost of the extends are folded into
    // the dot-product expression in VPlan, after landing:
    // https://github.com/llvm/llvm-project/pull/147302
    Cost *= 2;

  if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
    // i16 -> i64 is natively supported for udot/sdot
    if (AccumLT.second.getScalarType() == MVT::i64 &&
        InputLT.second.getScalarType() == MVT::i16)
      return Cost;
    // i16 -> i32 is natively supported with SVE2p1
    if (AccumLT.second.getScalarType() == MVT::i32 &&
        InputLT.second.getScalarType() == MVT::i16 &&
        (ST->hasSVE2p1() || ST->hasSME2()))
      return Cost;
    // i8 -> i64 is supported with an extra level of extends
    if (AccumLT.second.getScalarType() == MVT::i64 &&
        InputLT.second.getScalarType() == MVT::i8)
      // FIXME: This cost should probably be a little higher, e.g. Cost + 2
      // because it requires two extra extends on the inputs. But if we'd change
      // that now, a regular reduction would be cheaper because the costs of
      // the extends in the IR are still counted. This can be fixed
      // after https://github.com/llvm/llvm-project/pull/147302 has landed.
      return Cost;
  }

  // i8 -> i32 is natively supported for udot/sdot/usdot, both for NEON and SVE.
  if (ST->isSVEorStreamingSVEAvailable() ||
      (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() &&
       ST->hasDotProd())) {
    if (AccumLT.second.getScalarType() == MVT::i32 &&
        InputLT.second.getScalarType() == MVT::i8)
      return Cost;
  }

  // f16 -> f32 is natively supported for fdot
  if (Opcode == Instruction::FAdd && (ST->hasSME2() || ST->hasSVE2p1())) {
    if (AccumLT.second.getScalarType() == MVT::f32 &&
        InputLT.second.getScalarType() == MVT::f16 &&
        AccumLT.second.getVectorMinNumElements() == 4 &&
        InputLT.second.getVectorMinNumElements() == 8)
      return Cost;
    // Floating-point types aren't promoted, so expanding the partial reduction
    // is more expensive.
    return Cost + 20;
  }

  // Add additional cost for the extends that would need to be inserted.
  return Cost + 2;
}
5994
/// Return the cost of a shuffle of kind \p Kind from \p SrcTy to \p DstTy,
/// optionally described by \p Mask. Recognises AArch64-specific lowerings
/// (DUP/EXT/ZIP/UZP/TRN/REV, perfect shuffles, LD3/LD4 and ST3/ST4
/// interleaving, SVE splices and segmented DUPQ patterns) before deferring
/// to the generic implementation.
InstructionCost
AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
                               VectorType *SrcTy, ArrayRef<int> Mask,
                               TTI::TargetCostKind CostKind, int Index,
                               VectorType *SubTp, ArrayRef<const Value *> Args,
                               const Instruction *CxtI) const {
  assert((Mask.empty() || DstTy->isScalableTy() ||
          Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
         "Expected the Mask to match the return size if given");
  assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
         "Expected the same scalar types");
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);

  // If we have a Mask, and the LT is being legalized somehow, split the Mask
  // into smaller vectors and sum the cost of each shuffle.
  if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
      LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
      SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
      Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
    // Check for LD3/LD4 instructions, which are represented in llvm IR as
    // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
    // but we model it with a cost of LT.first so that LD3/LD4 have a higher
    // cost than just the load.
    if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
        (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 3) ||
         ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4)))
      return std::max<InstructionCost>(1, LT.first / 4);

    // Check for ST3/ST4 instructions, which are represented in llvm IR as
    // store(interleaving-shuffle). The shuffle cost could potentially be free,
    // but we model it with a cost of LT.first so that ST3/ST4 have a higher
    // cost than just the store.
    if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
        (ShuffleVectorInst::isInterleaveMask(
             Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
         ShuffleVectorInst::isInterleaveMask(
             Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
      return LT.first;

    unsigned TpNumElts = Mask.size();
    unsigned LTNumElts = LT.second.getVectorNumElements();
    // Number of destination legal-width sub-vectors (rounding up).
    unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
    VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
                                      LT.second.getVectorElementCount());
    InstructionCost Cost;
    // Cache of already-costed sub-shuffles, keyed on sources and sub-mask.
    std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
        PreviousCosts;
    for (unsigned N = 0; N < NumVecs; N++) {
      SmallVector<int> NMask;
      // Split the existing mask into chunks of size LTNumElts. Track the source
      // sub-vectors to ensure the result has at most 2 inputs.
      unsigned Source1 = -1U, Source2 = -1U;
      unsigned NumSources = 0;
      for (unsigned E = 0; E < LTNumElts; E++) {
        // Elements past the end of the original mask are padding (poison).
        int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
                                                      : PoisonMaskElem;
        if (MaskElt < 0) {
          NMask.push_back(PoisonMaskElem);
          continue;
        }

        // Calculate which source from the input this comes from and whether it
        // is new to us.
        unsigned Source = MaskElt / LTNumElts;
        if (NumSources == 0) {
          Source1 = Source;
          NumSources = 1;
        } else if (NumSources == 1 && Source != Source1) {
          Source2 = Source;
          NumSources = 2;
        } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
          NumSources++;
        }

        // Add to the new mask. For the NumSources>2 case these are not correct,
        // but are only used for the modular lane number.
        if (Source == Source1)
          NMask.push_back(MaskElt % LTNumElts);
        else if (Source == Source2)
          NMask.push_back(MaskElt % LTNumElts + LTNumElts);
        else
          NMask.push_back(MaskElt % LTNumElts);
      }
      // Check if we have already generated this sub-shuffle, which means we
      // will have already generated the output. For example a <16 x i32> splat
      // will be the same sub-splat 4 times, which only needs to be generated
      // once and reused.
      auto Result =
          PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
      // Check if it was already in the map (already costed).
      if (!Result.second)
        continue;
      // If the sub-mask has at most 2 input sub-vectors then re-cost it using
      // getShuffleCost. If not then cost it using the worst case as the number
      // of element moves into a new vector.
      InstructionCost NCost =
          NumSources <= 2
              ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
                                               : TTI::SK_PermuteTwoSrc,
                               NTp, NTp, NMask, CostKind, 0, nullptr, Args,
                               CxtI)
              : LTNumElts;
      Result.first->second = NCost;
      Cost += NCost;
    }
    return Cost;
  }

  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
  bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
  // A subvector extract can be implemented with a NEON/SVE ext (or trivial
  // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
  // This currently only handles low or high extracts to prevent SLP vectorizer
  // regressions.
  // Note that SVE's ext instruction is destructive, but it can be fused with
  // a movprfx to act like a constructive instruction.
  if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
    if (LT.second.getFixedSizeInBits() >= 128 &&
        cast<FixedVectorType>(SubTp)->getNumElements() ==
            LT.second.getVectorNumElements() / 2) {
      if (Index == 0)
        return 0;
      if (Index == (int)LT.second.getVectorNumElements() / 2)
        return 1;
    }
    Kind = TTI::SK_PermuteSingleSrc;
  }
  // FIXME: This was added to keep the costs equal when adding DstTys. Update
  // the code to handle length-changing shuffles.
  if (Kind == TTI::SK_InsertSubvector) {
    LT = getTypeLegalizationCost(DstTy);
    SrcTy = DstTy;
  }

  // Check for identity masks, which we can treat as free for both fixed and
  // scalable vector paths.
  if (!Mask.empty() && LT.second.isFixedLengthVector() &&
      (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
      all_of(enumerate(Mask), [](const auto &M) {
        return M.value() < 0 || M.value() == (int)M.index();
      }))
    return 0;

  // Segmented shuffle matching.
  if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
      !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
      SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
          AArch64::SVEBitsPerBlock)) {

    FixedVectorType *VTy = cast<FixedVectorType>(SrcTy);
    unsigned Segments =
        VTy->getPrimitiveSizeInBits() / AArch64::SVEBitsPerBlock;
    unsigned SegmentElts = VTy->getNumElements() / Segments;

    // dupq zd.t, zn.t[idx]
    if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
        ST->isSVEorStreamingSVEAvailable() &&
        isDUPQMask(Mask, Segments, SegmentElts))
      return LT.first;

    // mov zd.q, vn
    if (ST->isSVEorStreamingSVEAvailable() &&
        isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
      return LT.first;
  }

  // Check for broadcast loads, which are supported by the LD1R instruction.
  // In terms of code-size, the shuffle vector is free when a load + dup get
  // folded into a LD1R. That's what we check and return here. For performance
  // and reciprocal throughput, a LD1R is not completely free. In this case, we
  // return the cost for the broadcast below (i.e. 1 for most/all types), so
  // that we model the load + dup sequence slightly higher because LD1R is a
  // high latency instruction.
  if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
    bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
    if (IsLoad && LT.second.isVector() &&
        isLegalBroadcastLoad(SrcTy->getElementType(),
                             LT.second.getVectorElementCount()))
      return 0;
  }

  // If we have 4 elements for the shuffle and a Mask, get the cost straight
  // from the perfect shuffle tables.
  if (Mask.size() == 4 &&
      SrcTy->getElementCount() == ElementCount::getFixed(4) &&
      (SrcTy->getScalarSizeInBits() == 16 ||
       SrcTy->getScalarSizeInBits() == 32) &&
      all_of(Mask, [](int E) { return E < 8; }))
    return getPerfectShuffleCost(Mask);

  // Check for other shuffles that are not SK_ kinds but we have native
  // instructions for, for example ZIP and UZP.
  unsigned Unused;
  if (LT.second.isFixedLengthVector() &&
      LT.second.getVectorNumElements() == Mask.size() &&
      (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc ||
       // Discrepancies between isTRNMask and ShuffleVectorInst::isTransposeMask
       // mean that we can end up with shuffles that satisfy isTRNMask, but end
       // up labelled as TTI::SK_InsertSubvector. (e.g. {2, 0}).
       Kind == TTI::SK_InsertSubvector) &&
      (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
       isTRNMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
       isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
       isREVMask(Mask, LT.second.getScalarSizeInBits(),
                 LT.second.getVectorNumElements(), 16) ||
       isREVMask(Mask, LT.second.getScalarSizeInBits(),
                 LT.second.getVectorNumElements(), 32) ||
       isREVMask(Mask, LT.second.getScalarSizeInBits(),
                 LT.second.getVectorNumElements(), 64) ||
       // Check for non-zero lane splats
       all_of(drop_begin(Mask),
              [&Mask](int M) { return M < 0 || M == Mask[0]; })))
    return 1;

  if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
      Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
      Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
    static const CostTblEntry ShuffleTbl[] = {
        // Broadcast shuffle kinds can be performed with 'dup'.
        {TTI::SK_Broadcast, MVT::v8i8, 1},
        {TTI::SK_Broadcast, MVT::v16i8, 1},
        {TTI::SK_Broadcast, MVT::v4i16, 1},
        {TTI::SK_Broadcast, MVT::v8i16, 1},
        {TTI::SK_Broadcast, MVT::v2i32, 1},
        {TTI::SK_Broadcast, MVT::v4i32, 1},
        {TTI::SK_Broadcast, MVT::v2i64, 1},
        {TTI::SK_Broadcast, MVT::v4f16, 1},
        {TTI::SK_Broadcast, MVT::v8f16, 1},
        {TTI::SK_Broadcast, MVT::v4bf16, 1},
        {TTI::SK_Broadcast, MVT::v8bf16, 1},
        {TTI::SK_Broadcast, MVT::v2f32, 1},
        {TTI::SK_Broadcast, MVT::v4f32, 1},
        {TTI::SK_Broadcast, MVT::v2f64, 1},
        // Transpose shuffle kinds can be performed with 'trn1/trn2' and
        // 'zip1/zip2' instructions.
        {TTI::SK_Transpose, MVT::v8i8, 1},
        {TTI::SK_Transpose, MVT::v16i8, 1},
        {TTI::SK_Transpose, MVT::v4i16, 1},
        {TTI::SK_Transpose, MVT::v8i16, 1},
        {TTI::SK_Transpose, MVT::v2i32, 1},
        {TTI::SK_Transpose, MVT::v4i32, 1},
        {TTI::SK_Transpose, MVT::v2i64, 1},
        {TTI::SK_Transpose, MVT::v4f16, 1},
        {TTI::SK_Transpose, MVT::v8f16, 1},
        {TTI::SK_Transpose, MVT::v4bf16, 1},
        {TTI::SK_Transpose, MVT::v8bf16, 1},
        {TTI::SK_Transpose, MVT::v2f32, 1},
        {TTI::SK_Transpose, MVT::v4f32, 1},
        {TTI::SK_Transpose, MVT::v2f64, 1},
        // Select shuffle kinds.
        // TODO: handle vXi8/vXi16.
        {TTI::SK_Select, MVT::v2i32, 1}, // mov.
        {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
        {TTI::SK_Select, MVT::v2i64, 1}, // mov.
        {TTI::SK_Select, MVT::v2f32, 1}, // mov.
        {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
        {TTI::SK_Select, MVT::v2f64, 1}, // mov.
        // PermuteSingleSrc shuffle kinds.
        {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
        {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
        {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
        {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
        {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
        {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
        {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
        {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
        {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
        {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
        {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
        {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
        {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
        {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
        // Reverse can be lowered with `rev`.
        {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
        {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
        {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
        {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
        {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
        {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
        {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
        {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
        {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
        {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
        {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
        {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
        {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
        {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
        // Splice can all be lowered as `ext`.
        {TTI::SK_Splice, MVT::v2i32, 1},
        {TTI::SK_Splice, MVT::v4i32, 1},
        {TTI::SK_Splice, MVT::v2i64, 1},
        {TTI::SK_Splice, MVT::v2f32, 1},
        {TTI::SK_Splice, MVT::v4f32, 1},
        {TTI::SK_Splice, MVT::v2f64, 1},
        {TTI::SK_Splice, MVT::v8f16, 1},
        {TTI::SK_Splice, MVT::v8bf16, 1},
        {TTI::SK_Splice, MVT::v8i16, 1},
        {TTI::SK_Splice, MVT::v16i8, 1},
        {TTI::SK_Splice, MVT::v4f16, 1},
        {TTI::SK_Splice, MVT::v4bf16, 1},
        {TTI::SK_Splice, MVT::v4i16, 1},
        {TTI::SK_Splice, MVT::v8i8, 1},
        // Broadcast shuffle kinds for scalable vectors
        {TTI::SK_Broadcast, MVT::nxv16i8, 1},
        {TTI::SK_Broadcast, MVT::nxv8i16, 1},
        {TTI::SK_Broadcast, MVT::nxv4i32, 1},
        {TTI::SK_Broadcast, MVT::nxv2i64, 1},
        {TTI::SK_Broadcast, MVT::nxv2f16, 1},
        {TTI::SK_Broadcast, MVT::nxv4f16, 1},
        {TTI::SK_Broadcast, MVT::nxv8f16, 1},
        {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
        {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
        {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
        {TTI::SK_Broadcast, MVT::nxv2f32, 1},
        {TTI::SK_Broadcast, MVT::nxv4f32, 1},
        {TTI::SK_Broadcast, MVT::nxv2f64, 1},
        {TTI::SK_Broadcast, MVT::nxv16i1, 1},
        {TTI::SK_Broadcast, MVT::nxv8i1, 1},
        {TTI::SK_Broadcast, MVT::nxv4i1, 1},
        {TTI::SK_Broadcast, MVT::nxv2i1, 1},
        // Handle the cases for vector.reverse with scalable vectors
        {TTI::SK_Reverse, MVT::nxv16i8, 1},
        {TTI::SK_Reverse, MVT::nxv8i16, 1},
        {TTI::SK_Reverse, MVT::nxv4i32, 1},
        {TTI::SK_Reverse, MVT::nxv2i64, 1},
        {TTI::SK_Reverse, MVT::nxv2f16, 1},
        {TTI::SK_Reverse, MVT::nxv4f16, 1},
        {TTI::SK_Reverse, MVT::nxv8f16, 1},
        {TTI::SK_Reverse, MVT::nxv2bf16, 1},
        {TTI::SK_Reverse, MVT::nxv4bf16, 1},
        {TTI::SK_Reverse, MVT::nxv8bf16, 1},
        {TTI::SK_Reverse, MVT::nxv2f32, 1},
        {TTI::SK_Reverse, MVT::nxv4f32, 1},
        {TTI::SK_Reverse, MVT::nxv2f64, 1},
        {TTI::SK_Reverse, MVT::nxv16i1, 1},
        {TTI::SK_Reverse, MVT::nxv8i1, 1},
        {TTI::SK_Reverse, MVT::nxv4i1, 1},
        {TTI::SK_Reverse, MVT::nxv2i1, 1},
    };
    if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
  }

  // Scalable-vector splices have their own cost model.
  if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
    return getSpliceCost(SrcTy, Index, CostKind);

  // Inserting a subvector can often be done with either a D, S or H register
  // move, so long as the inserted vector is "aligned".
  if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
      LT.second.getSizeInBits() <= 128 && SubTp) {
    std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
    if (SubLT.second.isVector()) {
      int NumElts = LT.second.getVectorNumElements();
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
    }
  }

  // Restore optimal kind.
  if (IsExtractSubvector)
    Kind = TTI::SK_ExtractSubvector;
  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
                               Args, CxtI);
}
6360
/// Return true if any load/store in \p TheLoop has a pointer whose stride,
/// as computed by getPtrStride, is negative (i.e. the access walks memory
/// backwards). Used to detect loops where reversing the loop predicate
/// would be required for tail-folding.
static bool containsDecreasingPointers(Loop *TheLoop,
                                       PredicatedScalarEvolution *PSE,
                                       const DominatorTree &DT) {
  // No symbolic strides are provided; an empty map suffices.
  const auto &Strides = DenseMap<Value *, const SCEV *>();
  for (BasicBlock *BB : TheLoop->blocks()) {
    // Scan the instructions in the block and look for addresses that are
    // consecutive and decreasing.
    for (Instruction &I : *BB) {
      if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
        Value *Ptr = getLoadStorePointerOperand(&I);
        Type *AccessTy = getLoadStoreType(&I);
        // A std::nullopt stride (unknown) is treated as non-decreasing.
        if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
                         /*Assume=*/true, /*ShouldCheckWrap=*/false)
                .value_or(0) < 0)
          return true;
      }
    }
  }
  return false;
}
6381
/// Return true if fixed-width vectorization should win over scalable
/// vectorization when their costs are equal. The command-line option, when
/// given, overrides both the epilogue heuristic and the subtarget default.
bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
  if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
    return SVEPreferFixedOverScalableIfEqualCost;
  // For cases like post-LTO vectorization, when we eventually know the trip
  // count, epilogue with fixed-width vectorization can be deleted if the trip
  // count is less than the epilogue iterations. That's why we prefer
  // fixed-width vectorization in epilogue in case of equal costs.
  if (IsEpilogue)
    return true;
  return ST->useFixedOverScalableIfEqualCost();
}
6393
/// Forward the minimum epilogue-vectorization VF from the subtarget, which
/// may be tuned per CPU.
unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
  return ST->getEpilogueVectorizationMinVF();
}
6397
/// Decide whether the loop described by \p TFI should be vectorized with a
/// predicated (tail-folded) body rather than a scalar/vector epilogue.
/// Requires SVE; honours the sve-tail-folding option and an instruction
/// count threshold so that tight loops still get unpredicated code.
bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
  if (!ST->hasSVE())
    return false;

  // We don't currently support vectorisation with interleaving for SVE - with
  // such loops we're better off not using tail-folding. This gives us a chance
  // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
  if (TFI->IAI->hasGroups())
    return false;

  // Work out which tail-folding features this loop would need, then check
  // them against what the target/option configuration allows.
  TailFoldingOpts Required = TailFoldingOpts::Disabled;
  if (TFI->LVL->getReductionVars().size())
    Required |= TailFoldingOpts::Reductions;
  if (TFI->LVL->getFixedOrderRecurrences().size())
    Required |= TailFoldingOpts::Recurrences;

  // We call this to discover whether any load/store pointers in the loop have
  // negative strides. This will require extra work to reverse the loop
  // predicate, which may be expensive.
  if (containsDecreasingPointers(TFI->LVL->getLoop(),
                                 TFI->LVL->getPredicatedScalarEvolution(),
                                 *TFI->LVL->getDominatorTree()))
    Required |= TailFoldingOpts::Reverse;
  if (Required == TailFoldingOpts::Disabled)
    Required |= TailFoldingOpts::Simple;

  if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
                                      Required))
    return false;

  // Don't tail-fold for tight loops where we would be better off interleaving
  // with an unpredicated loop.
  unsigned NumInsns = 0;
  for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
    NumInsns += BB->sizeWithoutDebug();
  }

  // We expect 4 of these to be a IV PHI, IV add, IV compare and branch.
  return NumInsns >= SVETailFoldInsnThreshold;
}
6438
/// Return the cost of using a scaled register (reg2 * scale) in an
/// addressing mode for type \p Ty, or an invalid cost if the addressing
/// mode is not legal on AArch64.
InstructionCost
AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                     StackOffset BaseOffset, bool HasBaseReg,
                                     int64_t Scale, unsigned AddrSpace) const {
  // Scaling factors are not free at all.
  // Operands                     | Rt Latency
  // -------------------------------------------
  // Rt, [Xn, Xm]                 | 4
  // -------------------------------------------
  // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
  // Rt, [Xn, Wm, <extend> #imm]  |
  TargetLoweringBase::AddrMode AM;
  AM.BaseGV = BaseGV;
  AM.BaseOffs = BaseOffset.getFixed();
  AM.HasBaseReg = HasBaseReg;
  AM.Scale = Scale;
  AM.ScalableOffset = BaseOffset.getScalable();
  if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
    // Scale represents reg2 * scale, thus account for 1 if
    // it is not equal to 0 or 1.
    return AM.Scale != 0 && AM.Scale != 1;
  return InstructionCost::getInvalid();
}
6462
/// Return true if \p I should be costed/handled like a select for the
/// purposes of if-conversion style optimisations. On AArch64 this extends
/// to certain Or/Add/Sub instructions when the option is enabled.
bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(
    const Instruction *I) const {
  if (EnableOrLikeSelectOpt) {
    // For the binary operators (e.g. or) we need to be more careful than
    // selects, here we only transform them if they are already at a natural
    // break point in the code - the end of a block with an unconditional
    // terminator.
    if (I->getOpcode() == Instruction::Or &&
        isa<BranchInst>(I->getNextNode()) &&
        cast<BranchInst>(I->getNextNode())->isUnconditional())
      return true;

    if (I->getOpcode() == Instruction::Add ||
        I->getOpcode() == Instruction::Sub)
      return true;
  }
  return BaseT::shouldTreatInstructionLikeSelect(I);
}
6481
6482bool AArch64TTIImpl::isLSRCostLess(
6483 const TargetTransformInfo::LSRCost &C1,
6484 const TargetTransformInfo::LSRCost &C2) const {
6485 // AArch64 specific here is adding the number of instructions to the
6486 // comparison (though not as the first consideration, as some targets do)
6487 // along with changing the priority of the base additions.
6488 // TODO: Maybe a more nuanced tradeoff between instruction count
6489 // and number of registers? To be investigated at a later date.
6490 if (EnableLSRCostOpt)
6491 return std::tie(args: C1.NumRegs, args: C1.Insns, args: C1.NumBaseAdds, args: C1.AddRecCost,
6492 args: C1.NumIVMuls, args: C1.ScaleCost, args: C1.ImmCost, args: C1.SetupCost) <
6493 std::tie(args: C2.NumRegs, args: C2.Insns, args: C2.NumBaseAdds, args: C2.AddRecCost,
6494 args: C2.NumIVMuls, args: C2.ScaleCost, args: C2.ImmCost, args: C2.SetupCost);
6495
6496 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
6497}
6498
6499static bool isSplatShuffle(Value *V) {
6500 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Val: V))
6501 return all_equal(Range: Shuf->getShuffleMask());
6502 return false;
6503}
6504
/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
/// or upper half of the vector elements.
/// If \p AllowSplat is true, a splat shuffle is accepted for either operand
/// in place of a half-extract (its source checks are then skipped).
static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
                                     bool AllowSplat = false) {
  // Scalable types can't be extract shuffle vectors.
  if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
    return false;

  // True when FullV's fixed bit width is exactly double HalfV's.
  auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
    auto *FullTy = FullV->getType();
    auto *HalfTy = HalfV->getType();
    return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
           2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
  };

  // True when FullV has exactly twice as many elements as HalfV.
  auto extractHalf = [](Value *FullV, Value *HalfV) {
    auto *FullVT = cast<FixedVectorType>(Val: FullV->getType());
    auto *HalfVT = cast<FixedVectorType>(Val: HalfV->getType());
    return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
  };

  // Both operands must be single-source shuffles (second input undef).
  ArrayRef<int> M1, M2;
  Value *S1Op1 = nullptr, *S2Op1 = nullptr;
  if (!match(V: Op1, P: m_Shuffle(v1: m_Value(V&: S1Op1), v2: m_Undef(), mask: m_Mask(M1))) ||
      !match(V: Op2, P: m_Shuffle(v1: m_Value(V&: S2Op1), v2: m_Undef(), mask: m_Mask(M2))))
    return false;

  // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
  // it is not checked as an extract below.
  if (AllowSplat && isSplatShuffle(V: Op1))
    S1Op1 = nullptr;
  if (AllowSplat && isSplatShuffle(V: Op2))
    S2Op1 = nullptr;

  // Check that the operands are half as wide as the result and we extract
  // half of the elements of the input vectors.
  if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
      (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
    return false;

  // Check the mask extracts either the lower or upper half of vector
  // elements.
  int M1Start = 0;
  int M2Start = 0;
  // NumElements is the element count of the (wider) shuffle sources: twice
  // the result's element count.
  int NumElements = cast<FixedVectorType>(Val: Op1->getType())->getNumElements() * 2;
  if ((S1Op1 &&
       !ShuffleVectorInst::isExtractSubvectorMask(Mask: M1, NumSrcElts: NumElements, Index&: M1Start)) ||
      (S2Op1 &&
       !ShuffleVectorInst::isExtractSubvectorMask(Mask: M2, NumSrcElts: NumElements, Index&: M2Start)))
    return false;

  // Each extract must start at element 0 (low half) or the midpoint (high
  // half) ...
  if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
      (M2Start != 0 && M2Start != (NumElements / 2)))
    return false;
  // ... and when both operands are real extracts they must take the same half.
  if (S1Op1 && S2Op1 && M1Start != M2Start)
    return false;

  return true;
}
6564
6565/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6566/// of the vector elements.
6567static bool areExtractExts(Value *Ext1, Value *Ext2) {
6568 auto areExtDoubled = [](Instruction *Ext) {
6569 return Ext->getType()->getScalarSizeInBits() ==
6570 2 * Ext->getOperand(i: 0)->getType()->getScalarSizeInBits();
6571 };
6572
6573 if (!match(V: Ext1, P: m_ZExtOrSExt(Op: m_Value())) ||
6574 !match(V: Ext2, P: m_ZExtOrSExt(Op: m_Value())) ||
6575 !areExtDoubled(cast<Instruction>(Val: Ext1)) ||
6576 !areExtDoubled(cast<Instruction>(Val: Ext2)))
6577 return false;
6578
6579 return true;
6580}
6581
6582/// Check if Op could be used with vmull_high_p64 intrinsic.
6583static bool isOperandOfVmullHighP64(Value *Op) {
6584 Value *VectorOperand = nullptr;
6585 ConstantInt *ElementIndex = nullptr;
6586 return match(V: Op, P: m_ExtractElt(Val: m_Value(V&: VectorOperand),
6587 Idx: m_ConstantInt(CI&: ElementIndex))) &&
6588 ElementIndex->getValue() == 1 &&
6589 isa<FixedVectorType>(Val: VectorOperand->getType()) &&
6590 cast<FixedVectorType>(Val: VectorOperand->getType())->getNumElements() == 2;
6591}
6592
6593/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6594static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6595 return isOperandOfVmullHighP64(Op: Op1) && isOperandOfVmullHighP64(Op: Op2);
6596}
6597
6598static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
6599 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6600 auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptrs);
6601 if (!GEP || GEP->getNumOperands() != 2)
6602 return false;
6603
6604 Value *Base = GEP->getOperand(i_nocapture: 0);
6605 Value *Offsets = GEP->getOperand(i_nocapture: 1);
6606
6607 // We only care about scalar_base+vector_offsets.
6608 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6609 return false;
6610
6611 // Sink extends that would allow us to use 32-bit offset vectors.
6612 if (isa<SExtInst>(Val: Offsets) || isa<ZExtInst>(Val: Offsets)) {
6613 auto *OffsetsInst = cast<Instruction>(Val: Offsets);
6614 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6615 OffsetsInst->getOperand(i: 0)->getType()->getScalarSizeInBits() <= 32)
6616 Ops.push_back(Elt: &GEP->getOperandUse(i: 1));
6617 }
6618
6619 // Sink the GEP.
6620 return true;
6621}
6622
/// We want to sink following cases:
/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
/// On a match, any inner operand uses that should also be sunk are appended
/// to \p Ops; returns true when \p Op itself should be sunk.
static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
  // A bare vscale: sinkable with no extra operands.
  if (match(V: Op, P: m_VScale()))
    return true;
  // (mul|shl) vscale, imm: queue the vscale operand for sinking too.
  if (match(V: Op, P: m_Shl(L: m_VScale(), R: m_ConstantInt())) ||
      match(V: Op, P: m_Mul(L: m_VScale(), R: m_ConstantInt()))) {
    Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: 0));
    return true;
  }
  // (mul|shl) zext(vscale), imm: queue both the vscale (operand of the zext)
  // and the zext itself.
  if (match(V: Op, P: m_Shl(L: m_ZExt(Op: m_VScale()), R: m_ConstantInt())) ||
      match(V: Op, P: m_Mul(L: m_ZExt(Op: m_VScale()), R: m_ConstantInt()))) {
    Value *ZExtOp = cast<Instruction>(Val: Op)->getOperand(i: 0);
    Ops.push_back(Elt: &cast<Instruction>(Val: ZExtOp)->getOperandUse(i: 0));
    Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: 0));
    return true;
  }
  return false;
}
6643
6644static bool isFNeg(Value *Op) { return match(V: Op, P: m_FNeg(X: m_Value())); }
6645
/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
///
/// \param I instruction whose operands are candidates for sinking.
/// \param Ops filled with the operand uses that should be sunk into \p I's
///        block.
/// \returns true if sinking the uses collected in \p Ops is profitable.
bool AArch64TTIImpl::isProfitableToSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: I)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::aarch64_neon_smull:
    case Intrinsic::aarch64_neon_umull:
      // Half-vector extracts (and splats) can fold into the high-half
      // (s|u)mull2 forms.
      if (areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1),
                                   /*AllowSplat=*/true)) {
        Ops.push_back(Elt: &II->getOperandUse(i: 0));
        Ops.push_back(Elt: &II->getOperandUse(i: 1));
        return true;
      }
      [[fallthrough]];

    case Intrinsic::fma:
    case Intrinsic::fmuladd:
      // Half-precision vector fma needs full FP16 support; without it there is
      // no fused instruction to fold operands into.
      if (isa<VectorType>(Val: I->getType()) &&
          cast<VectorType>(Val: I->getType())->getElementType()->isHalfTy() &&
          !ST->hasFullFP16())
        return false;

      // Sink fnegs so isel can form the negated fused variants (e.g. fmls).
      if (isFNeg(Op: II->getOperand(i_nocapture: 0)))
        Ops.push_back(Elt: &II->getOperandUse(i: 0));
      if (isFNeg(Op: II->getOperand(i_nocapture: 1)))
        Ops.push_back(Elt: &II->getOperandUse(i: 1));

      [[fallthrough]];
    case Intrinsic::aarch64_neon_sqdmull:
    case Intrinsic::aarch64_neon_sqdmulh:
    case Intrinsic::aarch64_neon_sqrdmulh:
      // Sink splats for index lane variants
      if (isSplatShuffle(V: II->getOperand(i_nocapture: 0)))
        Ops.push_back(Elt: &II->getOperandUse(i: 0));
      if (isSplatShuffle(V: II->getOperand(i_nocapture: 1)))
        Ops.push_back(Elt: &II->getOperandUse(i: 1));
      return !Ops.empty();
    case Intrinsic::aarch64_neon_fmlal:
    case Intrinsic::aarch64_neon_fmlal2:
    case Intrinsic::aarch64_neon_fmlsl:
    case Intrinsic::aarch64_neon_fmlsl2:
      // Sink splats for index lane variants
      if (isSplatShuffle(V: II->getOperand(i_nocapture: 1)))
        Ops.push_back(Elt: &II->getOperandUse(i: 1));
      if (isSplatShuffle(V: II->getOperand(i_nocapture: 2)))
        Ops.push_back(Elt: &II->getOperandUse(i: 2));
      return !Ops.empty();
    case Intrinsic::aarch64_sve_ptest_first:
    case Intrinsic::aarch64_sve_ptest_last:
      // A ptrue feeding a ptest can be folded when adjacent.
      if (auto *IIOp = dyn_cast<IntrinsicInst>(Val: II->getOperand(i_nocapture: 0)))
        if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
          Ops.push_back(Elt: &II->getOperandUse(i: 0));
      return !Ops.empty();
    case Intrinsic::aarch64_sme_write_horiz:
    case Intrinsic::aarch64_sme_write_vert:
    case Intrinsic::aarch64_sme_writeq_horiz:
    case Intrinsic::aarch64_sme_writeq_vert: {
      // Sink an add used as the tile slice index (operand 1).
      auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 1));
      if (!Idx || Idx->getOpcode() != Instruction::Add)
        return false;
      Ops.push_back(Elt: &II->getOperandUse(i: 1));
      return true;
    }
    case Intrinsic::aarch64_sme_read_horiz:
    case Intrinsic::aarch64_sme_read_vert:
    case Intrinsic::aarch64_sme_readq_horiz:
    case Intrinsic::aarch64_sme_readq_vert:
    case Intrinsic::aarch64_sme_ld1b_vert:
    case Intrinsic::aarch64_sme_ld1h_vert:
    case Intrinsic::aarch64_sme_ld1w_vert:
    case Intrinsic::aarch64_sme_ld1d_vert:
    case Intrinsic::aarch64_sme_ld1q_vert:
    case Intrinsic::aarch64_sme_st1b_vert:
    case Intrinsic::aarch64_sme_st1h_vert:
    case Intrinsic::aarch64_sme_st1w_vert:
    case Intrinsic::aarch64_sme_st1d_vert:
    case Intrinsic::aarch64_sme_st1q_vert:
    case Intrinsic::aarch64_sme_ld1b_horiz:
    case Intrinsic::aarch64_sme_ld1h_horiz:
    case Intrinsic::aarch64_sme_ld1w_horiz:
    case Intrinsic::aarch64_sme_ld1d_horiz:
    case Intrinsic::aarch64_sme_ld1q_horiz:
    case Intrinsic::aarch64_sme_st1b_horiz:
    case Intrinsic::aarch64_sme_st1h_horiz:
    case Intrinsic::aarch64_sme_st1w_horiz:
    case Intrinsic::aarch64_sme_st1d_horiz:
    case Intrinsic::aarch64_sme_st1q_horiz: {
      // Same as above, but the tile slice index is operand 3 here.
      auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 3));
      if (!Idx || Idx->getOpcode() != Instruction::Add)
        return false;
      Ops.push_back(Elt: &II->getOperandUse(i: 3));
      return true;
    }
    case Intrinsic::aarch64_neon_pmull:
      if (!areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1)))
        return false;
      Ops.push_back(Elt: &II->getOperandUse(i: 0));
      Ops.push_back(Elt: &II->getOperandUse(i: 1));
      return true;
    case Intrinsic::aarch64_neon_pmull64:
      if (!areOperandsOfVmullHighP64(Op1: II->getArgOperand(i: 0),
                                     Op2: II->getArgOperand(i: 1)))
        return false;
      Ops.push_back(Elt: &II->getArgOperandUse(i: 0));
      Ops.push_back(Elt: &II->getArgOperandUse(i: 1));
      return true;
    case Intrinsic::masked_gather:
      // For gathers the pointer vector is argument 0.
      if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 0), Ops))
        return false;
      Ops.push_back(Elt: &II->getArgOperandUse(i: 0));
      return true;
    case Intrinsic::masked_scatter:
      // For scatters the pointer vector is argument 1.
      if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 1), Ops))
        return false;
      Ops.push_back(Elt: &II->getArgOperandUse(i: 1));
      return true;
    default:
      return false;
    }
  }

  // True when Cond is a vector_reduce_or over a scalable vector; a compare
  // feeding the reduction is queued for sinking as well.
  auto ShouldSinkCondition = [](Value *Cond,
                                SmallVectorImpl<Use *> &Ops) -> bool {
    if (!isa<IntrinsicInst>(Val: Cond))
      return false;
    auto *II = dyn_cast<IntrinsicInst>(Val: Cond);
    if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
        !isa<ScalableVectorType>(Val: II->getOperand(i_nocapture: 0)->getType()))
      return false;
    if (isa<CmpInst>(Val: II->getOperand(i_nocapture: 0)))
      Ops.push_back(Elt: &II->getOperandUse(i: 0));
    return true;
  };

  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
  case Instruction::Add:
  case Instruction::Sub:
    // Sink vscales closer to uses for better isel
    for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
      if (shouldSinkVScale(Op: I->getOperand(i: Op), Ops)) {
        Ops.push_back(Elt: &I->getOperandUse(i: Op));
        return true;
      }
    }
    break;
  case Instruction::Select: {
    if (!ShouldSinkCondition(I->getOperand(i: 0), Ops))
      return false;

    Ops.push_back(Elt: &I->getOperandUse(i: 0));
    return true;
  }
  case Instruction::Br: {
    if (cast<BranchInst>(Val: I)->isUnconditional())
      return false;

    if (!ShouldSinkCondition(cast<BranchInst>(Val: I)->getCondition(), Ops))
      return false;

    Ops.push_back(Elt: &I->getOperandUse(i: 0));
    return true;
  }
  case Instruction::FMul:
    // fmul with contract flag can be combined with fadd into fma.
    // Sinking fneg into this block enables fmls pattern.
    if (cast<FPMathOperator>(Val: I)->hasAllowContract()) {
      if (isFNeg(Op: I->getOperand(i: 0)))
        Ops.push_back(Elt: &I->getOperandUse(i: 0));
      if (isFNeg(Op: I->getOperand(i: 1)))
        Ops.push_back(Elt: &I->getOperandUse(i: 1));
    }
    break;

  default:
    break;
  }

  // The remaining patterns apply only to vector-producing instructions.
  if (!I->getType()->isVectorTy())
    return !Ops.empty();

  switch (I->getOpcode()) {
  case Instruction::Sub:
  case Instruction::Add: {
    if (!areExtractExts(Ext1: I->getOperand(i: 0), Ext2: I->getOperand(i: 1)))
      return false;

    // If the exts' operands extract either the lower or upper elements, we
    // can sink them too.
    auto Ext1 = cast<Instruction>(Val: I->getOperand(i: 0));
    auto Ext2 = cast<Instruction>(Val: I->getOperand(i: 1));
    if (areExtractShuffleVectors(Op1: Ext1->getOperand(i: 0), Op2: Ext2->getOperand(i: 0))) {
      Ops.push_back(Elt: &Ext1->getOperandUse(i: 0));
      Ops.push_back(Elt: &Ext2->getOperandUse(i: 0));
    }

    // Always sink the extends themselves.
    Ops.push_back(Elt: &I->getOperandUse(i: 0));
    Ops.push_back(Elt: &I->getOperandUse(i: 1));

    return true;
  }
  case Instruction::Or: {
    // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
    // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
    if (ST->hasNEON()) {
      Instruction *OtherAnd, *IA, *IB;
      Value *MaskValue;
      // MainAnd refers to And instruction that has 'Not' as one of its operands
      if (match(V: I, P: m_c_Or(L: m_OneUse(SubPattern: m_Instruction(I&: OtherAnd)),
                            R: m_OneUse(SubPattern: m_c_And(L: m_OneUse(SubPattern: m_Not(V: m_Value(V&: MaskValue))),
                                             R: m_Instruction(I&: IA)))))) {
        if (match(V: OtherAnd,
                  P: m_c_And(L: m_Specific(V: MaskValue), R: m_Instruction(I&: IB)))) {
          Instruction *MainAnd = I->getOperand(i: 0) == OtherAnd
                                     ? cast<Instruction>(Val: I->getOperand(i: 1))
                                     : cast<Instruction>(Val: I->getOperand(i: 0));

          // Both Ands should be in same basic block as Or
          if (I->getParent() != MainAnd->getParent() ||
              I->getParent() != OtherAnd->getParent())
            return false;

          // Non-mask operands of both Ands should also be in same basic block
          if (I->getParent() != IA->getParent() ||
              I->getParent() != IB->getParent())
            return false;

          Ops.push_back(
              Elt: &MainAnd->getOperandUse(i: MainAnd->getOperand(i: 0) == IA ? 1 : 0));
          Ops.push_back(Elt: &I->getOperandUse(i: 0));
          Ops.push_back(Elt: &I->getOperandUse(i: 1));

          return true;
        }
      }
    }

    return false;
  }
  case Instruction::Mul: {
    // True when splatting V's source lane could fold into an indexed multiply.
    auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
      auto *Ty = cast<VectorType>(Val: V->getType());
      // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
      if (Ty->isScalableTy())
        return false;

      // Indexed variants of Mul exist for i16 and i32 element types only.
      return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
    };

    int NumZExts = 0, NumSExts = 0;
    for (auto &Op : I->operands()) {
      // Make sure we are not already sinking this operand
      if (any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op; }))
        continue;

      if (match(V: &Op, P: m_ZExtOrSExt(Op: m_Value()))) {
        auto *Ext = cast<Instruction>(Val&: Op);
        auto *ExtOp = Ext->getOperand(i: 0);
        if (isSplatShuffle(V: ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
          Ops.push_back(Elt: &Ext->getOperandUse(i: 0));
        Ops.push_back(Elt: &Op);

        if (isa<SExtInst>(Val: Ext)) {
          NumSExts++;
        } else {
          NumZExts++;
          // A zext(a) is also a sext(zext(a)), if we take more than 2 steps.
          if (Ext->getOperand(i: 0)->getType()->getScalarSizeInBits() * 2 <
              I->getType()->getScalarSizeInBits())
            NumSExts++;
        }

        continue;
      }

      ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Val&: Op);
      if (!Shuffle)
        continue;

      // If the Shuffle is a splat and the operand is a zext/sext, sinking the
      // operand and the s/zext can help create indexed s/umull. This is
      // especially useful to prevent i64 mul being scalarized.
      if (isSplatShuffle(V: Shuffle) &&
          match(V: Shuffle->getOperand(i_nocapture: 0), P: m_ZExtOrSExt(Op: m_Value()))) {
        Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0));
        Ops.push_back(Elt: &Op);
        if (match(V: Shuffle->getOperand(i_nocapture: 0), P: m_SExt(Op: m_Value())))
          NumSExts++;
        else
          NumZExts++;
        continue;
      }

      // Otherwise look for shuffle(insertelement(_, x, 0), ...) — a scalar
      // broadcast whose scalar x may itself be an extend.
      Value *ShuffleOperand = Shuffle->getOperand(i_nocapture: 0);
      InsertElementInst *Insert = dyn_cast<InsertElementInst>(Val: ShuffleOperand);
      if (!Insert)
        continue;

      Instruction *OperandInstr = dyn_cast<Instruction>(Val: Insert->getOperand(i_nocapture: 1));
      if (!OperandInstr)
        continue;

      ConstantInt *ElementConstant =
          dyn_cast<ConstantInt>(Val: Insert->getOperand(i_nocapture: 2));
      // Check that the insertelement is inserting into element 0
      if (!ElementConstant || !ElementConstant->isZero())
        continue;

      unsigned Opcode = OperandInstr->getOpcode();
      if (Opcode == Instruction::SExt)
        NumSExts++;
      else if (Opcode == Instruction::ZExt)
        NumZExts++;
      else {
        // If we find that the top bits are known 0, then we can sink and allow
        // the backend to generate a umull.
        unsigned Bitwidth = I->getType()->getScalarSizeInBits();
        APInt UpperMask = APInt::getHighBitsSet(numBits: Bitwidth, hiBitsSet: Bitwidth / 2);
        if (!MaskedValueIsZero(V: OperandInstr, Mask: UpperMask, SQ: DL))
          continue;
        NumZExts++;
      }

      // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
      // the And, just to hoist it again back to the load.
      if (!match(V: OperandInstr, P: m_And(L: m_Load(Op: m_Value()), R: m_Value())))
        Ops.push_back(Elt: &Insert->getOperandUse(i: 1));
      Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0));
      Ops.push_back(Elt: &Op);
    }

    // It is profitable to sink if we found two of the same type of extends.
    if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
      return true;

    // Otherwise, see if we should sink splats for indexed variants.
    if (!ShouldSinkSplatForIndexedVariant(I))
      return false;

    Ops.clear();
    if (isSplatShuffle(V: I->getOperand(i: 0)))
      Ops.push_back(Elt: &I->getOperandUse(i: 0));
    if (isSplatShuffle(V: I->getOperand(i: 1)))
      Ops.push_back(Elt: &I->getOperandUse(i: 1));

    return !Ops.empty();
  }
  case Instruction::FMul: {
    // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
    if (I->getType()->isScalableTy())
      return !Ops.empty();

    // Indexed half-precision fmul needs full FP16 support.
    if (cast<VectorType>(Val: I->getType())->getElementType()->isHalfTy() &&
        !ST->hasFullFP16())
      return !Ops.empty();

    // Sink splats for index lane variants
    if (isSplatShuffle(V: I->getOperand(i: 0)))
      Ops.push_back(Elt: &I->getOperandUse(i: 0));
    if (isSplatShuffle(V: I->getOperand(i: 1)))
      Ops.push_back(Elt: &I->getOperandUse(i: 1));
    return !Ops.empty();
  }
  default:
    return false;
  }
  return false;
}
7017