| 1 | //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #include "AArch64TargetTransformInfo.h" |
| 10 | #include "AArch64ExpandImm.h" |
| 11 | #include "AArch64PerfectShuffle.h" |
| 12 | #include "MCTargetDesc/AArch64AddressingModes.h" |
| 13 | #include "Utils/AArch64SMEAttributes.h" |
| 14 | #include "llvm/ADT/DenseMap.h" |
| 15 | #include "llvm/Analysis/LoopInfo.h" |
| 16 | #include "llvm/Analysis/TargetTransformInfo.h" |
| 17 | #include "llvm/CodeGen/BasicTTIImpl.h" |
| 18 | #include "llvm/CodeGen/CostTable.h" |
| 19 | #include "llvm/CodeGen/TargetLowering.h" |
| 20 | #include "llvm/IR/DerivedTypes.h" |
| 21 | #include "llvm/IR/IntrinsicInst.h" |
| 22 | #include "llvm/IR/Intrinsics.h" |
| 23 | #include "llvm/IR/IntrinsicsAArch64.h" |
| 24 | #include "llvm/IR/PatternMatch.h" |
| 25 | #include "llvm/Support/Debug.h" |
| 26 | #include "llvm/TargetParser/AArch64TargetParser.h" |
| 27 | #include "llvm/Transforms/InstCombine/InstCombiner.h" |
| 28 | #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" |
| 29 | #include <algorithm> |
| 30 | #include <optional> |
| 31 | using namespace llvm; |
| 32 | using namespace llvm::PatternMatch; |
| 33 | |
| 34 | #define DEBUG_TYPE "aarch64tti" |
| 35 | |
| 36 | static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix" , |
| 37 | cl::init(Val: true), cl::Hidden); |
| 38 | |
| 39 | static cl::opt<bool> SVEPreferFixedOverScalableIfEqualCost( |
| 40 | "sve-prefer-fixed-over-scalable-if-equal" , cl::Hidden); |
| 41 | |
| 42 | static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead" , cl::init(Val: 10), |
| 43 | cl::Hidden); |
| 44 | |
| 45 | static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead" , |
| 46 | cl::init(Val: 10), cl::Hidden); |
| 47 | |
| 48 | static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold" , |
| 49 | cl::init(Val: 15), cl::Hidden); |
| 50 | |
| 51 | static cl::opt<unsigned> |
| 52 | NeonNonConstStrideOverhead("neon-nonconst-stride-overhead" , cl::init(Val: 10), |
| 53 | cl::Hidden); |
| 54 | |
| 55 | static cl::opt<unsigned> CallPenaltyChangeSM( |
| 56 | "call-penalty-sm-change" , cl::init(Val: 5), cl::Hidden, |
| 57 | cl::desc( |
| 58 | "Penalty of calling a function that requires a change to PSTATE.SM" )); |
| 59 | |
| 60 | static cl::opt<unsigned> InlineCallPenaltyChangeSM( |
| 61 | "inline-call-penalty-sm-change" , cl::init(Val: 10), cl::Hidden, |
| 62 | cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM" )); |
| 63 | |
| 64 | static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select" , |
| 65 | cl::init(Val: true), cl::Hidden); |
| 66 | |
| 67 | static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt" , |
| 68 | cl::init(Val: true), cl::Hidden); |
| 69 | |
| 70 | // A complete guess as to a reasonable cost. |
| 71 | static cl::opt<unsigned> |
| 72 | BaseHistCntCost("aarch64-base-histcnt-cost" , cl::init(Val: 8), cl::Hidden, |
| 73 | cl::desc("The cost of a histcnt instruction" )); |
| 74 | |
| 75 | static cl::opt<unsigned> DMBLookaheadThreshold( |
| 76 | "dmb-lookahead-threshold" , cl::init(Val: 10), cl::Hidden, |
| 77 | cl::desc("The number of instructions to search for a redundant dmb" )); |
| 78 | |
| 79 | namespace { |
| 80 | class TailFoldingOption { |
| 81 | // These bitfields will only ever be set to something non-zero in operator=, |
| 82 | // when setting the -sve-tail-folding option. This option should always be of |
| 83 | // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where |
| 84 | // InitialBits is one of (disabled|all|simple). EnableBits represents |
| 85 | // additional flags we're enabling, and DisableBits for those flags we're |
| 86 | // disabling. The default flag is tracked in the variable NeedsDefault, since |
| 87 | // at the time of setting the option we may not know what the default value |
| 88 | // for the CPU is. |
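|  | // For example, "-sve-tail-folding=all+noreverse" sets InitialBits to |
|  | // TailFoldingOpts::All and DisableBits to TailFoldingOpts::Reverse, with |
|  | // NeedsDefault set to false because the option was specified explicitly. |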
| 89 | TailFoldingOpts InitialBits = TailFoldingOpts::Disabled; |
| 90 | TailFoldingOpts EnableBits = TailFoldingOpts::Disabled; |
| 91 | TailFoldingOpts DisableBits = TailFoldingOpts::Disabled; |
| 92 | |
| 93 | // This value needs to be initialised to true in case the user does not |
| 94 | // explicitly set the -sve-tail-folding option. |
| 95 | bool NeedsDefault = true; |
| 96 | |
| 97 | void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; } |
| 98 | |
| 99 | void setNeedsDefault(bool V) { NeedsDefault = V; } |
| 100 | |
| 101 | void setEnableBit(TailFoldingOpts Bit) { |
| 102 | EnableBits |= Bit; |
| 103 | DisableBits &= ~Bit; |
| 104 | } |
| 105 | |
| 106 | void setDisableBit(TailFoldingOpts Bit) { |
| 107 | EnableBits &= ~Bit; |
| 108 | DisableBits |= Bit; |
| 109 | } |
| 110 | |
| 111 | TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const { |
| 112 | TailFoldingOpts Bits = TailFoldingOpts::Disabled; |
| 113 | |
| 114 | assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) && |
| 115 | "Initial bits should only include one of " |
| 116 | "(disabled|all|simple|default)" ); |
| 117 | Bits = NeedsDefault ? DefaultBits : InitialBits; |
| 118 | Bits |= EnableBits; |
| 119 | Bits &= ~DisableBits; |
| 120 | |
| 121 | return Bits; |
| 122 | } |
| 123 | |
| 124 | void reportError(std::string Opt) { |
| 125 | errs() << "invalid argument '" << Opt |
| 126 | << "' to -sve-tail-folding=; the option should be of the form\n" |
| 127 | " (disabled|all|default|simple)[+(reductions|recurrences" |
| 128 | "|reverse|noreductions|norecurrences|noreverse)]\n" ; |
| 129 | report_fatal_error(reason: "Unrecognised tail-folding option" ); |
| 130 | } |
| 131 | |
| 132 | public: |
| 133 | |
| 134 | void operator=(const std::string &Val) { |
| 135 | // If the user explicitly sets -sve-tail-folding= then treat as an error. |
| 136 | if (Val.empty()) { |
| 137 | reportError(Opt: "" ); |
| 138 | return; |
| 139 | } |
| 140 | |
| 141 | // Since the user is explicitly setting the option we don't automatically |
| 142 | // need the default unless they require it. |
| 143 | setNeedsDefault(false); |
| 144 | |
| 145 | SmallVector<StringRef, 4> TailFoldTypes; |
| 146 | StringRef(Val).split(A&: TailFoldTypes, Separator: '+', MaxSplit: -1, KeepEmpty: false); |
| 147 | |
| 148 | unsigned StartIdx = 1; |
| 149 | if (TailFoldTypes[0] == "disabled" ) |
| 150 | setInitialBits(TailFoldingOpts::Disabled); |
| 151 | else if (TailFoldTypes[0] == "all" ) |
| 152 | setInitialBits(TailFoldingOpts::All); |
| 153 | else if (TailFoldTypes[0] == "default" ) |
| 154 | setNeedsDefault(true); |
| 155 | else if (TailFoldTypes[0] == "simple" ) |
| 156 | setInitialBits(TailFoldingOpts::Simple); |
| 157 | else { |
| 158 | StartIdx = 0; |
| 159 | setInitialBits(TailFoldingOpts::Disabled); |
| 160 | } |
| 161 | |
| 162 | for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) { |
| 163 | if (TailFoldTypes[I] == "reductions" ) |
| 164 | setEnableBit(TailFoldingOpts::Reductions); |
| 165 | else if (TailFoldTypes[I] == "recurrences" ) |
| 166 | setEnableBit(TailFoldingOpts::Recurrences); |
| 167 | else if (TailFoldTypes[I] == "reverse" ) |
| 168 | setEnableBit(TailFoldingOpts::Reverse); |
| 169 | else if (TailFoldTypes[I] == "noreductions" ) |
| 170 | setDisableBit(TailFoldingOpts::Reductions); |
| 171 | else if (TailFoldTypes[I] == "norecurrences" ) |
| 172 | setDisableBit(TailFoldingOpts::Recurrences); |
| 173 | else if (TailFoldTypes[I] == "noreverse" ) |
| 174 | setDisableBit(TailFoldingOpts::Reverse); |
| 175 | else |
| 176 | reportError(Opt: Val); |
| 177 | } |
| 178 | } |
| 179 | |
| 180 | bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const { |
| 181 | return (getBits(DefaultBits) & Required) == Required; |
| 182 | } |
| 183 | }; |
| 184 | } // namespace |
| 185 | |
| 186 | TailFoldingOption TailFoldingOptionLoc; |
| 187 | |
| 188 | static cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding( |
| 189 | "sve-tail-folding" , |
| 190 | cl::desc( |
| 191 | "Control the use of vectorisation using tail-folding for SVE where the" |
| 192 | " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" |
| 193 | "\ndisabled (Initial) No loop types will vectorize using " |
| 194 | "tail-folding" |
| 195 | "\ndefault (Initial) Uses the default tail-folding settings for " |
| 196 | "the target CPU" |
| 197 | "\nall (Initial) All legal loop types will vectorize using " |
| 198 | "tail-folding" |
| 199 | "\nsimple (Initial) Use tail-folding for simple loops (not " |
| 200 | "reductions or recurrences)" |
| 201 | "\nreductions Use tail-folding for loops containing reductions" |
| 202 | "\nnoreductions Inverse of above" |
| 203 | "\nrecurrences Use tail-folding for loops containing fixed order " |
| 204 | "recurrences" |
| 205 | "\nnorecurrences Inverse of above" |
| 206 | "\nreverse Use tail-folding for loops requiring reversed " |
| 207 | "predicates" |
| 208 | "\nnoreverse Inverse of above" ), |
| 209 | cl::location(L&: TailFoldingOptionLoc)); |
| 210 | |
| 211 | // Experimental option that will only be fully functional when the |
| 212 | // code-generator is changed to use SVE instead of NEON for all fixed-width |
| 213 | // operations. |
| 214 | static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode( |
| 215 | "enable-fixedwidth-autovec-in-streaming-mode" , cl::init(Val: false), cl::Hidden); |
| 216 | |
| 217 | // Experimental option that will only be fully functional when the cost-model |
| 218 | // and code-generator have been changed to avoid using scalable vector |
| 219 | // instructions that are not legal in streaming SVE mode. |
| 220 | static cl::opt<bool> EnableScalableAutovecInStreamingMode( |
| 221 | "enable-scalable-autovec-in-streaming-mode" , cl::init(Val: false), cl::Hidden); |
| 222 | |
| 223 | static bool isSMEABIRoutineCall(const CallInst &CI) { |
| 224 | const auto *F = CI.getCalledFunction(); |
| 225 | return F && StringSwitch<bool>(F->getName()) |
| 226 | .Case(S: "__arm_sme_state" , Value: true) |
| 227 | .Case(S: "__arm_tpidr2_save" , Value: true) |
| 228 | .Case(S: "__arm_tpidr2_restore" , Value: true) |
| 229 | .Case(S: "__arm_za_disable" , Value: true) |
| 230 | .Default(Value: false); |
| 231 | } |
| 232 | |
| 233 | /// Returns true if the function has explicit operations that can only be |
| 234 | /// lowered using incompatible instructions for the selected mode. This also |
| 235 | /// returns true if the function F may use or modify ZA state. |
| 236 | static bool hasPossibleIncompatibleOps(const Function *F) { |
| 237 | for (const BasicBlock &BB : *F) { |
| 238 | for (const Instruction &I : BB) { |
| 239 | // Be conservative for now and assume that any call to inline asm or to |
| 240 | // intrinsics could result in non-streaming ops (e.g. calls to |
| 241 | // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that |
| 242 | // all native LLVM instructions can be lowered to compatible instructions. |
| 243 | if (isa<CallInst>(Val: I) && !I.isDebugOrPseudoInst() && |
| 244 | (cast<CallInst>(Val: I).isInlineAsm() || isa<IntrinsicInst>(Val: I) || |
| 245 | isSMEABIRoutineCall(CI: cast<CallInst>(Val: I)))) |
| 246 | return true; |
| 247 | } |
| 248 | } |
| 249 | return false; |
| 250 | } |
| 251 | |
| 252 | uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const { |
| 253 | StringRef AttributeStr = |
| 254 | isMultiversionedFunction(F) ? "fmv-features" : "target-features" ; |
| 255 | StringRef FeatureStr = F.getFnAttribute(Kind: AttributeStr).getValueAsString(); |
| 256 | SmallVector<StringRef, 8> Features; |
| 257 | FeatureStr.split(A&: Features, Separator: "," ); |
| 258 | return AArch64::getFMVPriority(Features); |
| 259 | } |
| 260 | |
| 261 | bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const { |
| 262 | return F.hasFnAttribute(Kind: "fmv-features" ); |
| 263 | } |
| 264 | |
| 265 | const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = { |
| 266 | AArch64::FeatureExecuteOnly, |
| 267 | }; |
| 268 | |
| 269 | bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, |
| 270 | const Function *Callee) const { |
| 271 | SMECallAttrs CallAttrs(*Caller, *Callee); |
| 272 | |
| 273 | // When inlining, we should consider the body of the function, not the |
| 274 | // interface. |
| 275 | if (CallAttrs.callee().hasStreamingBody()) { |
| 276 | CallAttrs.callee().set(M: SMEAttrs::SM_Compatible, Enable: false); |
| 277 | CallAttrs.callee().set(M: SMEAttrs::SM_Enabled, Enable: true); |
| 278 | } |
| 279 | |
| 280 | if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0()) |
| 281 | return false; |
| 282 | |
| 283 | if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() || |
| 284 | CallAttrs.requiresPreservingZT0() || |
| 285 | CallAttrs.requiresPreservingAllZAState()) { |
| 286 | if (hasPossibleIncompatibleOps(F: Callee)) |
| 287 | return false; |
| 288 | } |
| 289 | |
| 290 | const TargetMachine &TM = getTLI()->getTargetMachine(); |
| 291 | const FeatureBitset &CallerBits = |
| 292 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); |
| 293 | const FeatureBitset &CalleeBits = |
| 294 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); |
| 295 | // Adjust the feature bitsets by inverting some of the bits. This is needed |
| 296 | // for target features that represent restrictions rather than capabilities, |
| 297 | // for example a "+execute-only" callee can be inlined into a caller without |
| 298 | // "+execute-only", but not vice versa. |
| 299 | FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures; |
| 300 | FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures; |
| 301 | |
| 302 | return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits; |
| 303 | } |
| 304 | |
| 305 | bool AArch64TTIImpl::areTypesABICompatible( |
| 306 | const Function *Caller, const Function *Callee, |
| 307 | const ArrayRef<Type *> &Types) const { |
| 308 | if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) |
| 309 | return false; |
| 310 | |
| 311 | // We need to ensure that argument promotion does not attempt to promote |
| 312 | // pointers to fixed-length vector types larger than 128 bits like |
| 313 | // <8 x float> (and pointers to aggregate types which have such fixed-length |
| 314 | // vector type members) into the values of the pointees. Such vector types |
| 315 | // are used for SVE VLS but there is no ABI for SVE VLS arguments and the |
| 316 | // backend cannot lower such value arguments. The 128-bit fixed-length SVE |
| 317 | // types can be safely treated as 128-bit NEON types and they cannot be |
| 318 | // distinguished in IR. |
| 319 | if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Range: Types, P: [](Type *Ty) { |
| 320 | auto FVTy = dyn_cast<FixedVectorType>(Val: Ty); |
| 321 | return FVTy && |
| 322 | FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128; |
| 323 | })) |
| 324 | return false; |
| 325 | |
| 326 | return true; |
| 327 | } |
| 328 | |
| 329 | unsigned |
| 330 | AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call, |
| 331 | unsigned DefaultCallPenalty) const { |
| 332 | // This function calculates a penalty for executing Call in F. |
| 333 | // |
| 334 | // There are two ways this function can be called: |
| 335 | // (1) F: |
| 336 | // call from F -> G (the call here is Call) |
| 337 | // |
| 338 | // For (1), Call.getCaller() == F, so it will always return a high cost if |
| 339 | // a streaming-mode change is required (thus promoting the need to inline the |
| 340 | // function) |
| 341 | // |
| 342 | // (2) F: |
| 343 | // call from F -> G (the call here is not Call) |
| 344 | // G: |
| 345 | // call from G -> H (the call here is Call) |
| 346 | // |
| 347 | // For (2), if after inlining the body of G into F the call to H requires a |
| 348 | // streaming-mode change, and the call to G from F would also require a |
| 349 | // streaming-mode change, then there is benefit to do the streaming-mode |
| 350 | // change only once and avoid inlining of G into F. |
| 351 | |
| 352 | SMEAttrs FAttrs(*F); |
| 353 | SMECallAttrs CallAttrs(Call); |
| 354 | |
| 355 | if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) { |
| 356 | if (F == Call.getCaller()) // (1) |
| 357 | return CallPenaltyChangeSM * DefaultCallPenalty; |
| 358 | if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2) |
| 359 | return InlineCallPenaltyChangeSM * DefaultCallPenalty; |
| 360 | } |
| 361 | |
| 362 | return DefaultCallPenalty; |
| 363 | } |
| 364 | |
| 365 | bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( |
| 366 | TargetTransformInfo::RegisterKind K) const { |
| 367 | assert(K != TargetTransformInfo::RGK_Scalar); |
| 368 | return (K == TargetTransformInfo::RGK_FixedWidthVector && |
| 369 | ST->isNeonAvailable()); |
| 370 | } |
| 371 | |
| 372 | /// Calculate the cost of materializing a 64-bit value. This helper |
| 373 | /// method might only calculate a fraction of a larger immediate. Therefore it |
| 374 | /// is valid to return a cost of ZERO. |
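|  | /// For example, an arbitrary 64-bit immediate such as 0x123456789abcdef0, |
|  | /// whose four 16-bit chunks are all distinct and non-zero, typically expands |
|  | /// to one MOVZ plus three MOVKs, giving a cost of 4. |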
| 375 | InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) const { |
| 376 | // Check if the immediate can be encoded within an instruction. |
| 377 | if (Val == 0 || AArch64_AM::isLogicalImmediate(imm: Val, regSize: 64)) |
| 378 | return 0; |
| 379 | |
| 380 | if (Val < 0) |
| 381 | Val = ~Val; |
| 382 | |
| 383 | // Calculate how many moves we will need to materialize this constant. |
| 384 | SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; |
| 385 | AArch64_IMM::expandMOVImm(Imm: Val, BitSize: 64, Insn); |
| 386 | return Insn.size(); |
| 387 | } |
| 388 | |
| 389 | /// Calculate the cost of materializing the given constant. |
| 390 | InstructionCost |
| 391 | AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, |
| 392 | TTI::TargetCostKind CostKind) const { |
| 393 | assert(Ty->isIntegerTy()); |
| 394 | |
| 395 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
| 396 | if (BitSize == 0) |
| 397 | return ~0U; |
| 398 | |
| 399 | // Sign-extend all constants to a multiple of 64-bit. |
| 400 | APInt ImmVal = Imm; |
| 401 | if (BitSize & 0x3f) |
| 402 | ImmVal = Imm.sext(width: (BitSize + 63) & ~0x3fU); |
| 403 | |
| 404 | // Split the constant into 64-bit chunks and calculate the cost for each |
| 405 | // chunk. |
| 406 | InstructionCost Cost = 0; |
| 407 | for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { |
| 408 | APInt Tmp = ImmVal.ashr(ShiftAmt: ShiftVal).sextOrTrunc(width: 64); |
| 409 | int64_t Val = Tmp.getSExtValue(); |
| 410 | Cost += getIntImmCost(Val); |
| 411 | } |
| 412 | // We need at least one instruction to materialize the constant. |
| 413 | return std::max<InstructionCost>(a: 1, b: Cost); |
| 414 | } |
| 415 | |
| 416 | InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, |
| 417 | const APInt &Imm, Type *Ty, |
| 418 | TTI::TargetCostKind CostKind, |
| 419 | Instruction *Inst) const { |
| 420 | assert(Ty->isIntegerTy()); |
| 421 | |
| 422 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
| 423 | // There is no cost model for constants with a bit size of 0. Return TCC_Free |
| 424 | // here, so that constant hoisting will ignore this constant. |
| 425 | if (BitSize == 0) |
| 426 | return TTI::TCC_Free; |
| 427 | |
| 428 | unsigned ImmIdx = ~0U; |
| 429 | switch (Opcode) { |
| 430 | default: |
| 431 | return TTI::TCC_Free; |
| 432 | case Instruction::GetElementPtr: |
| 433 | // Always hoist the base address of a GetElementPtr. |
| 434 | if (Idx == 0) |
| 435 | return 2 * TTI::TCC_Basic; |
| 436 | return TTI::TCC_Free; |
| 437 | case Instruction::Store: |
| 438 | ImmIdx = 0; |
| 439 | break; |
| 440 | case Instruction::Add: |
| 441 | case Instruction::Sub: |
| 442 | case Instruction::Mul: |
| 443 | case Instruction::UDiv: |
| 444 | case Instruction::SDiv: |
| 445 | case Instruction::URem: |
| 446 | case Instruction::SRem: |
| 447 | case Instruction::And: |
| 448 | case Instruction::Or: |
| 449 | case Instruction::Xor: |
| 450 | case Instruction::ICmp: |
| 451 | ImmIdx = 1; |
| 452 | break; |
| 453 | // Always return TCC_Free for the shift value of a shift instruction. |
| 454 | case Instruction::Shl: |
| 455 | case Instruction::LShr: |
| 456 | case Instruction::AShr: |
| 457 | if (Idx == 1) |
| 458 | return TTI::TCC_Free; |
| 459 | break; |
| 460 | case Instruction::Trunc: |
| 461 | case Instruction::ZExt: |
| 462 | case Instruction::SExt: |
| 463 | case Instruction::IntToPtr: |
| 464 | case Instruction::PtrToInt: |
| 465 | case Instruction::BitCast: |
| 466 | case Instruction::PHI: |
| 467 | case Instruction::Call: |
| 468 | case Instruction::Select: |
| 469 | case Instruction::Ret: |
| 470 | case Instruction::Load: |
| 471 | break; |
| 472 | } |
| 473 | |
| 474 | if (Idx == ImmIdx) { |
| 475 | int NumConstants = (BitSize + 63) / 64; |
| 476 | InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
| 477 | return (Cost <= NumConstants * TTI::TCC_Basic) |
| 478 | ? static_cast<int>(TTI::TCC_Free) |
| 479 | : Cost; |
| 480 | } |
| 481 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
| 482 | } |
| 483 | |
| 484 | InstructionCost |
| 485 | AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, |
| 486 | const APInt &Imm, Type *Ty, |
| 487 | TTI::TargetCostKind CostKind) const { |
| 488 | assert(Ty->isIntegerTy()); |
| 489 | |
| 490 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
| 491 | // There is no cost model for constants with a bit size of 0. Return TCC_Free |
| 492 | // here, so that constant hoisting will ignore this constant. |
| 493 | if (BitSize == 0) |
| 494 | return TTI::TCC_Free; |
| 495 | |
| 496 | // Most (all?) AArch64 intrinsics do not support folding immediates into the |
| 497 | // selected instruction, so we compute the materialization cost for the |
| 498 | // immediate directly. |
| 499 | if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv) |
| 500 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
| 501 | |
| 502 | switch (IID) { |
| 503 | default: |
| 504 | return TTI::TCC_Free; |
| 505 | case Intrinsic::sadd_with_overflow: |
| 506 | case Intrinsic::uadd_with_overflow: |
| 507 | case Intrinsic::ssub_with_overflow: |
| 508 | case Intrinsic::usub_with_overflow: |
| 509 | case Intrinsic::smul_with_overflow: |
| 510 | case Intrinsic::umul_with_overflow: |
| 511 | if (Idx == 1) { |
| 512 | int NumConstants = (BitSize + 63) / 64; |
| 513 | InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
| 514 | return (Cost <= NumConstants * TTI::TCC_Basic) |
| 515 | ? static_cast<int>(TTI::TCC_Free) |
| 516 | : Cost; |
| 517 | } |
| 518 | break; |
| 519 | case Intrinsic::experimental_stackmap: |
| 520 | if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue()))) |
| 521 | return TTI::TCC_Free; |
| 522 | break; |
| 523 | case Intrinsic::experimental_patchpoint_void: |
| 524 | case Intrinsic::experimental_patchpoint: |
| 525 | if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue()))) |
| 526 | return TTI::TCC_Free; |
| 527 | break; |
| 528 | case Intrinsic::experimental_gc_statepoint: |
| 529 | if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue()))) |
| 530 | return TTI::TCC_Free; |
| 531 | break; |
| 532 | } |
| 533 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
| 534 | } |
| 535 | |
| 536 | TargetTransformInfo::PopcntSupportKind |
| 537 | AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const { |
| 538 | assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2" ); |
| 539 | if (TyWidth == 32 || TyWidth == 64) |
| 540 | return TTI::PSK_FastHardware; |
| 541 | // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount. |
| 542 | return TTI::PSK_Software; |
| 543 | } |
| 544 | |
| 545 | static bool isUnpackedVectorVT(EVT VecVT) { |
| 546 | return VecVT.isScalableVector() && |
| 547 | VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock; |
| 548 | } |
| 549 | |
| 550 | static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) { |
| 551 | Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers |
| 552 | Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements |
| 553 | unsigned TotalHistCnts = 1; |
| 554 | |
| 555 | unsigned EltSize = EltTy->getScalarSizeInBits(); |
| 556 | // Only allow (up to 64b) integers or pointers |
| 557 | if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64) |
| 558 | return InstructionCost::getInvalid(); |
| 559 | |
| 560 | // FIXME: We should be able to generate histcnt for fixed-length vectors |
| 561 | // using ptrue with a specific VL. |
| 562 | if (VectorType *VTy = dyn_cast<VectorType>(Val: BucketPtrsTy)) { |
| 563 | unsigned EC = VTy->getElementCount().getKnownMinValue(); |
| 564 | if (!isPowerOf2_64(Value: EC) || !VTy->isScalableTy()) |
| 565 | return InstructionCost::getInvalid(); |
| 566 | |
| 567 | // HistCnt only supports 32b and 64b element types |
| 568 | unsigned LegalEltSize = EltSize <= 32 ? 32 : 64; |
| 569 | |
| 570 | if (EC == 2 || (LegalEltSize == 32 && EC == 4)) |
| 571 | return InstructionCost(BaseHistCntCost); |
| 572 | |
| 573 | unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize; |
| 574 | TotalHistCnts = EC / NaturalVectorWidth; |
| 575 | } |
| 576 | |
| 577 | return InstructionCost(BaseHistCntCost * TotalHistCnts); |
| 578 | } |
| 579 | |
| 580 | InstructionCost |
| 581 | AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
| 582 | TTI::TargetCostKind CostKind) const { |
| 583 | // The code-generator is currently not able to handle scalable vectors |
| 584 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
| 585 | // it. This change will be removed when code-generation for these types is |
| 586 | // sufficiently reliable. |
| 587 | auto *RetTy = ICA.getReturnType(); |
| 588 | if (auto *VTy = dyn_cast<ScalableVectorType>(Val: RetTy)) |
| 589 | if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
| 590 | return InstructionCost::getInvalid(); |
| 591 | |
| 592 | switch (ICA.getID()) { |
| 593 | case Intrinsic::experimental_vector_histogram_add: |
| 594 | if (!ST->hasSVE2()) |
| 595 | return InstructionCost::getInvalid(); |
| 596 | return getHistogramCost(ICA); |
| 597 | case Intrinsic::umin: |
| 598 | case Intrinsic::umax: |
| 599 | case Intrinsic::smin: |
| 600 | case Intrinsic::smax: { |
| 601 | static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, |
| 602 | MVT::v8i16, MVT::v2i32, MVT::v4i32, |
| 603 | MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, |
| 604 | MVT::nxv2i64}; |
| 605 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
| 606 | // v2i64 types get converted to cmp+bif hence the cost of 2 |
| 607 | if (LT.second == MVT::v2i64) |
| 608 | return LT.first * 2; |
| 609 | if (any_of(Range: ValidMinMaxTys, P: [&LT](MVT M) { return M == LT.second; })) |
| 610 | return LT.first; |
| 611 | break; |
| 612 | } |
| 613 | case Intrinsic::sadd_sat: |
| 614 | case Intrinsic::ssub_sat: |
| 615 | case Intrinsic::uadd_sat: |
| 616 | case Intrinsic::usub_sat: { |
| 617 | static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, |
| 618 | MVT::v8i16, MVT::v2i32, MVT::v4i32, |
| 619 | MVT::v2i64}; |
| 620 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
| 621 | // This is a base cost of 1 for the vadd, plus 3 extra shifts if we |
| 622 | // need to extend the type, as it uses shr(qadd(shl, shl)). |
| 623 | unsigned Instrs = |
| 624 | LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4; |
| 625 | if (any_of(Range: ValidSatTys, P: [&LT](MVT M) { return M == LT.second; })) |
| 626 | return LT.first * Instrs; |
| 627 | break; |
| 628 | } |
| 629 | case Intrinsic::abs: { |
| 630 | static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, |
| 631 | MVT::v8i16, MVT::v2i32, MVT::v4i32, |
| 632 | MVT::v2i64}; |
| 633 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
| 634 | if (any_of(Range: ValidAbsTys, P: [&LT](MVT M) { return M == LT.second; })) |
| 635 | return LT.first; |
| 636 | break; |
| 637 | } |
| 638 | case Intrinsic::bswap: { |
| 639 | static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32, |
| 640 | MVT::v4i32, MVT::v2i64}; |
| 641 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
| 642 | if (any_of(Range: ValidAbsTys, P: [&LT](MVT M) { return M == LT.second; }) && |
| 643 | LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits()) |
| 644 | return LT.first; |
| 645 | break; |
| 646 | } |
| 647 | case Intrinsic::stepvector: { |
| 648 | InstructionCost Cost = 1; // Cost of the `index' instruction |
| 649 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
| 650 | // Legalisation of illegal vectors involves an `index' instruction plus |
| 651 | // (LT.first - 1) vector adds. |
| 652 | if (LT.first > 1) { |
| 653 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: RetTy->getContext()); |
| 654 | InstructionCost AddCost = |
| 655 | getArithmeticInstrCost(Opcode: Instruction::Add, Ty: LegalVTy, CostKind); |
| 656 | Cost += AddCost * (LT.first - 1); |
| 657 | } |
| 658 | return Cost; |
| 659 | } |
| 660 | case Intrinsic::vector_extract: |
| 661 | case Intrinsic::vector_insert: { |
| 662 | // If both the vector and subvector types are legal types and the index |
| 663 | // is 0, then this should be a no-op or simple operation; return a |
| 664 | // relatively low cost. |
| 665 | |
| 666 | // If arguments aren't actually supplied, then we cannot determine the |
| 667 | // value of the index. We also want to skip predicate types. |
| 668 | if (ICA.getArgs().size() != ICA.getArgTypes().size() || |
| 669 | ICA.getReturnType()->getScalarType()->isIntegerTy(Bitwidth: 1)) |
| 670 | break; |
| 671 | |
| 672 | LLVMContext &C = RetTy->getContext(); |
| 673 | EVT VecVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]); |
| 674 | bool IsExtract = ICA.getID() == Intrinsic::vector_extract; |
| 675 | EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, Ty: RetTy) |
| 676 | : getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[1]); |
| 677 | // Skip this if either the vector or subvector types are unpacked |
| 678 | // SVE types; they may get lowered to stack stores and loads. |
| 679 | if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(VecVT: SubVecVT)) |
| 680 | break; |
| 681 | |
| 682 | TargetLoweringBase::LegalizeKind SubVecLK = |
| 683 | getTLI()->getTypeConversion(Context&: C, VT: SubVecVT); |
| 684 | TargetLoweringBase::LegalizeKind VecLK = |
| 685 | getTLI()->getTypeConversion(Context&: C, VT: VecVT); |
| 686 | const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2]; |
| 687 | const ConstantInt *CIdx = cast<ConstantInt>(Val: Idx); |
| 688 | if (SubVecLK.first == TargetLoweringBase::TypeLegal && |
| 689 | VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero()) |
| 690 | return TTI::TCC_Free; |
| 691 | break; |
| 692 | } |
| 693 | case Intrinsic::bitreverse: { |
| 694 | static const CostTblEntry BitreverseTbl[] = { |
| 695 | {.ISD: Intrinsic::bitreverse, .Type: MVT::i32, .Cost: 1}, |
| 696 | {.ISD: Intrinsic::bitreverse, .Type: MVT::i64, .Cost: 1}, |
| 697 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v8i8, .Cost: 1}, |
| 698 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v16i8, .Cost: 1}, |
| 699 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v4i16, .Cost: 2}, |
| 700 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v8i16, .Cost: 2}, |
| 701 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v2i32, .Cost: 2}, |
| 702 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v4i32, .Cost: 2}, |
| 703 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v1i64, .Cost: 2}, |
| 704 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v2i64, .Cost: 2}, |
| 705 | }; |
| 706 | const auto LegalisationCost = getTypeLegalizationCost(Ty: RetTy); |
| 707 | const auto *Entry = |
| 708 | CostTableLookup(Table: BitreverseTbl, ISD: ICA.getID(), Ty: LegalisationCost.second); |
| 709 | if (Entry) { |
| 710 | // The cost model uses the legal type (i32) that i8 and i16 will be |
| 711 | // promoted to, plus 1 so that we match the actual lowering cost. |
| 712 | if (TLI->getValueType(DL, Ty: RetTy, AllowUnknown: true) == MVT::i8 || |
| 713 | TLI->getValueType(DL, Ty: RetTy, AllowUnknown: true) == MVT::i16) |
| 714 | return LegalisationCost.first * Entry->Cost + 1; |
| 715 | |
| 716 | return LegalisationCost.first * Entry->Cost; |
| 717 | } |
| 718 | break; |
| 719 | } |
| 720 | case Intrinsic::ctpop: { |
| 721 | if (!ST->hasNEON()) { |
| 722 | // 32-bit or 64-bit ctpop without NEON is 12 instructions. |
| 723 | return getTypeLegalizationCost(Ty: RetTy).first * 12; |
| 724 | } |
| 725 | static const CostTblEntry CtpopCostTbl[] = { |
| 726 | {.ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: 4}, |
| 727 | {.ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: 3}, |
| 728 | {.ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: 2}, |
| 729 | {.ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: 1}, |
| 730 | {.ISD: ISD::CTPOP, .Type: MVT::i64, .Cost: 4}, |
| 731 | {.ISD: ISD::CTPOP, .Type: MVT::v2i32, .Cost: 3}, |
| 732 | {.ISD: ISD::CTPOP, .Type: MVT::v4i16, .Cost: 2}, |
| 733 | {.ISD: ISD::CTPOP, .Type: MVT::v8i8, .Cost: 1}, |
| 734 | {.ISD: ISD::CTPOP, .Type: MVT::i32, .Cost: 5}, |
| 735 | }; |
| 736 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
| 737 | MVT MTy = LT.second; |
| 738 | if (const auto *Entry = CostTableLookup(Table: CtpopCostTbl, ISD: ISD::CTPOP, Ty: MTy)) { |
| 739 | // Extra cost of +1 when illegal vector types are legalized by promoting |
| 740 | // the integer type. |
| 741 | int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() != |
| 742 | RetTy->getScalarSizeInBits() |
| 743 | ? 1 |
| 744 | : 0; |
| 745 | return LT.first * Entry->Cost + ExtraCost; |
| 746 | } |
| 747 | break; |
| 748 | } |
| 749 | case Intrinsic::sadd_with_overflow: |
| 750 | case Intrinsic::uadd_with_overflow: |
| 751 | case Intrinsic::ssub_with_overflow: |
| 752 | case Intrinsic::usub_with_overflow: |
| 753 | case Intrinsic::smul_with_overflow: |
| 754 | case Intrinsic::umul_with_overflow: { |
| 755 | static const CostTblEntry WithOverflowCostTbl[] = { |
| 756 | {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i8, .Cost: 3}, |
| 757 | {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i8, .Cost: 3}, |
| 758 | {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i16, .Cost: 3}, |
| 759 | {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i16, .Cost: 3}, |
| 760 | {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i32, .Cost: 1}, |
| 761 | {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i32, .Cost: 1}, |
| 762 | {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i64, .Cost: 1}, |
| 763 | {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i64, .Cost: 1}, |
| 764 | {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i8, .Cost: 3}, |
| 765 | {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i8, .Cost: 3}, |
| 766 | {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i16, .Cost: 3}, |
| 767 | {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i16, .Cost: 3}, |
| 768 | {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i32, .Cost: 1}, |
| 769 | {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i32, .Cost: 1}, |
| 770 | {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i64, .Cost: 1}, |
| 771 | {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i64, .Cost: 1}, |
| 772 | {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i8, .Cost: 5}, |
| 773 | {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i8, .Cost: 4}, |
| 774 | {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i16, .Cost: 5}, |
| 775 | {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i16, .Cost: 4}, |
| 776 | {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i32, .Cost: 2}, // eg umull;tst |
| 777 | {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i32, .Cost: 2}, // eg umull;cmp sxtw |
| 778 | {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i64, .Cost: 3}, // eg mul;smulh;cmp |
| 779 | {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i64, .Cost: 3}, // eg mul;umulh;cmp asr |
| 780 | }; |
| 781 | EVT MTy = TLI->getValueType(DL, Ty: RetTy->getContainedType(i: 0), AllowUnknown: true); |
| 782 | if (MTy.isSimple()) |
| 783 | if (const auto *Entry = CostTableLookup(Table: WithOverflowCostTbl, ISD: ICA.getID(), |
| 784 | Ty: MTy.getSimpleVT())) |
| 785 | return Entry->Cost; |
| 786 | break; |
| 787 | } |
| 788 | case Intrinsic::fptosi_sat: |
| 789 | case Intrinsic::fptoui_sat: { |
| 790 | if (ICA.getArgTypes().empty()) |
| 791 | break; |
| 792 | bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; |
| 793 | auto LT = getTypeLegalizationCost(Ty: ICA.getArgTypes()[0]); |
| 794 | EVT MTy = TLI->getValueType(DL, Ty: RetTy); |
| 795 | // Check for the legal types, which are where the size of the input and the |
| 796 | // output are the same, or we are using cvt f64->i32 or f32->i64. |
| 797 | if ((LT.second == MVT::f32 || LT.second == MVT::f64 || |
| 798 | LT.second == MVT::v2f32 || LT.second == MVT::v4f32 || |
| 799 | LT.second == MVT::v2f64)) { |
| 800 | if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() || |
| 801 | (LT.second == MVT::f64 && MTy == MVT::i32) || |
| 802 | (LT.second == MVT::f32 && MTy == MVT::i64))) |
| 803 | return LT.first; |
| 804 | // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2 |
| 805 | if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() && |
| 806 | MTy.getScalarSizeInBits() == 64) |
| 807 | return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2); |
| 808 | } |
| 809 | // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to |
| 810 | // f32. |
| 811 | if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) |
| 812 | return LT.first + getIntrinsicInstrCost( |
| 813 | ICA: {ICA.getID(), |
| 814 | RetTy, |
| 815 | {ICA.getArgTypes()[0]->getWithNewType( |
| 816 | EltTy: Type::getFloatTy(C&: RetTy->getContext()))}}, |
| 817 | CostKind); |
| 818 | if ((LT.second == MVT::f16 && MTy == MVT::i32) || |
| 819 | (LT.second == MVT::f16 && MTy == MVT::i64) || |
| 820 | ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) && |
| 821 | (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))) |
| 822 | return LT.first; |
| 823 | // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2 |
| 824 | if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() && |
| 825 | MTy.getScalarSizeInBits() == 32) |
| 826 | return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2); |
| 827 | // Extending vector types such as v4f16->v4i64 currently scalarize, but the |
| 828 | // codegen could be better. |
| 829 | if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() && |
| 830 | MTy.getScalarSizeInBits() == 64) |
| 831 | return MTy.getVectorNumElements() * 3; |
| 832 | |
| 833 | // If we can, use a legal convert followed by a min+max |
| 834 | if ((LT.second.getScalarType() == MVT::f32 || |
| 835 | LT.second.getScalarType() == MVT::f64 || |
| 836 | LT.second.getScalarType() == MVT::f16) && |
| 837 | LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) { |
| 838 | Type *LegalTy = |
| 839 | Type::getIntNTy(C&: RetTy->getContext(), N: LT.second.getScalarSizeInBits()); |
| 840 | if (LT.second.isVector()) |
| 841 | LegalTy = VectorType::get(ElementType: LegalTy, EC: LT.second.getVectorElementCount()); |
| 842 | InstructionCost Cost = 1; |
| 843 | IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin, |
| 844 | LegalTy, {LegalTy, LegalTy}); |
| 845 | Cost += getIntrinsicInstrCost(ICA: Attrs1, CostKind); |
| 846 | IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax, |
| 847 | LegalTy, {LegalTy, LegalTy}); |
| 848 | Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind); |
| 849 | return LT.first * Cost + |
| 850 | ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0 |
| 851 | : 1); |
| 852 | } |
| 853 | // Otherwise we need to follow the default expansion that clamps the value |
| 854 | // using a float min/max with a fcmp+sel for nan handling when signed. |
| 855 | Type *FPTy = ICA.getArgTypes()[0]->getScalarType(); |
| 856 | RetTy = RetTy->getScalarType(); |
| 857 | if (LT.second.isVector()) { |
| 858 | FPTy = VectorType::get(ElementType: FPTy, EC: LT.second.getVectorElementCount()); |
| 859 | RetTy = VectorType::get(ElementType: RetTy, EC: LT.second.getVectorElementCount()); |
| 860 | } |
| 861 | IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy}); |
| 862 | InstructionCost Cost = getIntrinsicInstrCost(ICA: Attrs1, CostKind); |
| 863 | IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy}); |
| 864 | Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind); |
| 865 | Cost += |
| 866 | getCastInstrCost(Opcode: IsSigned ? Instruction::FPToSI : Instruction::FPToUI, |
| 867 | Dst: RetTy, Src: FPTy, CCH: TTI::CastContextHint::None, CostKind); |
| 868 | if (IsSigned) { |
| 869 | Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1); |
| 870 | Cost += getCmpSelInstrCost(Opcode: BinaryOperator::FCmp, ValTy: FPTy, CondTy, |
| 871 | VecPred: CmpInst::FCMP_UNO, CostKind); |
| 872 | Cost += getCmpSelInstrCost(Opcode: BinaryOperator::Select, ValTy: RetTy, CondTy, |
| 873 | VecPred: CmpInst::FCMP_UNO, CostKind); |
| 874 | } |
| 875 | return LT.first * Cost; |
| 876 | } |
| 877 | case Intrinsic::fshl: |
| 878 | case Intrinsic::fshr: { |
| 879 | if (ICA.getArgs().empty()) |
| 880 | break; |
| 881 | |
| 882 | // TODO: Add handling for fshl where third argument is not a constant. |
| 883 | const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(V: ICA.getArgs()[2]); |
| 884 | if (!OpInfoZ.isConstant()) |
| 885 | break; |
| 886 | |
| 887 | const auto LegalisationCost = getTypeLegalizationCost(Ty: RetTy); |
| 888 | if (OpInfoZ.isUniform()) { |
| 889 | static const CostTblEntry FshlTbl[] = { |
| 890 | {.ISD: Intrinsic::fshl, .Type: MVT::v4i32, .Cost: 2}, // shl + usra |
| 891 | {.ISD: Intrinsic::fshl, .Type: MVT::v2i64, .Cost: 2}, {.ISD: Intrinsic::fshl, .Type: MVT::v16i8, .Cost: 2}, |
| 892 | {.ISD: Intrinsic::fshl, .Type: MVT::v8i16, .Cost: 2}, {.ISD: Intrinsic::fshl, .Type: MVT::v2i32, .Cost: 2}, |
| 893 | {.ISD: Intrinsic::fshl, .Type: MVT::v8i8, .Cost: 2}, {.ISD: Intrinsic::fshl, .Type: MVT::v4i16, .Cost: 2}}; |
| 894 | // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl |
| 895 | // to avoid having to duplicate the costs. |
| 896 | const auto *Entry = |
| 897 | CostTableLookup(Table: FshlTbl, ISD: Intrinsic::fshl, Ty: LegalisationCost.second); |
| 898 | if (Entry) |
| 899 | return LegalisationCost.first * Entry->Cost; |
| 900 | } |
| 901 | |
| 902 | auto TyL = getTypeLegalizationCost(Ty: RetTy); |
| 903 | if (!RetTy->isIntegerTy()) |
| 904 | break; |
| 905 | |
| 906 | // Estimate cost manually, as types like i8 and i16 will get promoted to |
| 907 | // i32 and CostTableLookup will ignore the extra conversion cost. |
| 908 | bool HigherCost = (RetTy->getScalarSizeInBits() != 32 && |
| 909 | RetTy->getScalarSizeInBits() < 64) || |
| 910 | (RetTy->getScalarSizeInBits() % 64 != 0); |
| 911 | unsigned ExtraCost = HigherCost ? 1 : 0; |
| 912 | if (RetTy->getScalarSizeInBits() == 32 || |
| 913 | RetTy->getScalarSizeInBits() == 64) |
| 914 | ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single |
| 915 | // extr instruction. |
| 916 | else if (HigherCost) |
| 917 | ExtraCost = 1; |
| 918 | else |
| 919 | break; |
| 920 | return TyL.first + ExtraCost; |
| 921 | } |
| 922 | case Intrinsic::get_active_lane_mask: { |
| 923 | auto *RetTy = dyn_cast<FixedVectorType>(Val: ICA.getReturnType()); |
| 924 | if (RetTy) { |
| 925 | EVT RetVT = getTLI()->getValueType(DL, Ty: RetTy); |
| 926 | EVT OpVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]); |
| 927 | if (!getTLI()->shouldExpandGetActiveLaneMask(VT: RetVT, OpVT) && |
| 928 | !getTLI()->isTypeLegal(VT: RetVT)) { |
| 929 | // We don't have enough context at this point to determine if the mask |
| 930 | // is going to be kept live after the block, which will force the vXi1 |
| 931 | // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32. |
| 932 | // For now, we just assume the vectorizer created this intrinsic and |
| 933 | // the result will be the input for a PHI. In this case the cost will |
| 934 | // be extremely high for fixed-width vectors. |
| 935 | // NOTE: getScalarizationOverhead returns a cost that's far too |
| 936 | // pessimistic for the actual generated codegen. In reality there are |
| 937 | // two instructions generated per lane. |
| 938 | return RetTy->getNumElements() * 2; |
| 939 | } |
| 940 | } |
| 941 | break; |
| 942 | } |
| 943 | case Intrinsic::experimental_vector_match: { |
| 944 | auto *NeedleTy = cast<FixedVectorType>(Val: ICA.getArgTypes()[1]); |
| 945 | EVT SearchVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]); |
| 946 | unsigned SearchSize = NeedleTy->getNumElements(); |
| 947 | if (!getTLI()->shouldExpandVectorMatch(VT: SearchVT, SearchSize)) { |
| 948 | // Base cost for MATCH instructions. At least on the Neoverse V2 and |
| 949 | // Neoverse V3, these are cheap operations with the same latency as a |
| 950 | // vector ADD. In most cases, however, we also need to do an extra DUP. |
| 951 | // For fixed-length vectors we currently need an extra five to six |
| 952 | // instructions besides the MATCH. |
| 953 | InstructionCost Cost = 4; |
| 954 | if (isa<FixedVectorType>(Val: RetTy)) |
| 955 | Cost += 10; |
| 956 | return Cost; |
| 957 | } |
| 958 | break; |
| 959 | } |
| 960 | case Intrinsic::experimental_cttz_elts: { |
| 961 | EVT ArgVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]); |
| 962 | if (!getTLI()->shouldExpandCttzElements(VT: ArgVT)) { |
| 963 | // This will consist of a SVE brkb and a cntp instruction. These |
| 964 | // typically have the same latency and half the throughput as a vector |
| 965 | // add instruction. |
| 966 | return 4; |
| 967 | } |
| 968 | break; |
| 969 | } |
| 970 | default: |
| 971 | break; |
| 972 | } |
| 973 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
| 974 | } |
| 975 | |
| 976 | /// Remove redundant reinterpret (convert.to.svbool) casts in the presence |
| 977 | /// of control flow, i.e. across PHI nodes. |
| 978 | static std::optional<Instruction *> processPhiNode(InstCombiner &IC, |
| 979 | IntrinsicInst &II) { |
| 980 | SmallVector<Instruction *, 32> Worklist; |
| 981 | auto RequiredType = II.getType(); |
| 982 | |
| 983 | auto *PN = dyn_cast<PHINode>(Val: II.getArgOperand(i: 0)); |
| 984 | assert(PN && "Expected Phi Node!" ); |
| 985 | |
| 986 | // Don't create a new Phi unless we can remove the old one. |
| 987 | if (!PN->hasOneUse()) |
| 988 | return std::nullopt; |
| 989 | |
| 990 | for (Value *IncValPhi : PN->incoming_values()) { |
| 991 | auto *Reinterpret = dyn_cast<IntrinsicInst>(Val: IncValPhi); |
| 992 | if (!Reinterpret || |
| 993 | Reinterpret->getIntrinsicID() != |
| 994 | Intrinsic::aarch64_sve_convert_to_svbool || |
| 995 | RequiredType != Reinterpret->getArgOperand(i: 0)->getType()) |
| 996 | return std::nullopt; |
| 997 | } |
| 998 | |
| 999 | // Create the new Phi |
| 1000 | IC.Builder.SetInsertPoint(PN); |
| 1001 | PHINode *NPN = IC.Builder.CreatePHI(Ty: RequiredType, NumReservedValues: PN->getNumIncomingValues()); |
| 1002 | Worklist.push_back(Elt: PN); |
| 1003 | |
| 1004 | for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { |
| 1005 | auto *Reinterpret = cast<Instruction>(Val: PN->getIncomingValue(i: I)); |
| 1006 | NPN->addIncoming(V: Reinterpret->getOperand(i: 0), BB: PN->getIncomingBlock(i: I)); |
| 1007 | Worklist.push_back(Elt: Reinterpret); |
| 1008 | } |
| 1009 | |
| 1010 | // Cleanup Phi Node and reinterprets |
| 1011 | return IC.replaceInstUsesWith(I&: II, V: NPN); |
| 1012 | } |
| 1013 | |
| 1014 | // A collection of properties common to SVE intrinsics that allow for combines |
| 1015 | // to be written without needing to know the specific intrinsic. |
| 1016 | struct SVEIntrinsicInfo { |
| 1017 | // |
| 1018 | // Helper routines for common intrinsic definitions. |
| 1019 | // |
| 1020 | |
| 1021 | // e.g. llvm.aarch64.sve.add pg, op1, op2 |
| 1022 | // with IID ==> llvm.aarch64.sve.add_u |
| 1023 | static SVEIntrinsicInfo |
| 1024 | defaultMergingOp(Intrinsic::ID IID = Intrinsic::not_intrinsic) { |
| 1025 | return SVEIntrinsicInfo() |
| 1026 | .setGoverningPredicateOperandIdx(0) |
| 1027 | .setOperandIdxInactiveLanesTakenFrom(1) |
| 1028 | .setMatchingUndefIntrinsic(IID); |
| 1029 | } |
| 1030 | |
| 1031 | // e.g. llvm.aarch64.sve.neg inactive, pg, op |
| 1032 | static SVEIntrinsicInfo defaultMergingUnaryOp() { |
| 1033 | return SVEIntrinsicInfo() |
| 1034 | .setGoverningPredicateOperandIdx(1) |
| 1035 | .setOperandIdxInactiveLanesTakenFrom(0) |
| 1036 | .setOperandIdxWithNoActiveLanes(0); |
| 1037 | } |
| 1038 | |
| 1039 | // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op |
| 1040 | static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp() { |
| 1041 | return SVEIntrinsicInfo() |
| 1042 | .setGoverningPredicateOperandIdx(1) |
| 1043 | .setOperandIdxInactiveLanesTakenFrom(0); |
| 1044 | } |
| 1045 | |
| 1046 | // e.g. llvm.aarch64.sve.add_u pg, op1, op2 |
| 1047 | static SVEIntrinsicInfo defaultUndefOp() { |
| 1048 | return SVEIntrinsicInfo() |
| 1049 | .setGoverningPredicateOperandIdx(0) |
| 1050 | .setInactiveLanesAreNotDefined(); |
| 1051 | } |
| 1052 | |
| 1053 | // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0) |
| 1054 | // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1) |
| 1055 | static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) { |
| 1056 | return SVEIntrinsicInfo() |
| 1057 | .setGoverningPredicateOperandIdx(GPIndex) |
| 1058 | .setInactiveLanesAreUnused(); |
| 1059 | } |
| 1060 | |
| 1061 | // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2 |
| 1062 | // llvm.aarch64.sve.ld1 pg, ptr |
| 1063 | static SVEIntrinsicInfo defaultZeroingOp() { |
| 1064 | return SVEIntrinsicInfo() |
| 1065 | .setGoverningPredicateOperandIdx(0) |
| 1066 | .setInactiveLanesAreUnused() |
| 1067 | .setResultIsZeroInitialized(); |
| 1068 | } |
| 1069 | |
| 1070 | // All properties relate to predication and thus having a governing predicate |
| 1071 | // is the minimum requirement to say there is intrinsic info to act on. |
| 1072 | explicit operator bool() const { return hasGoverningPredicate(); } |
| 1073 | |
| 1074 | // |
| 1075 | // Properties relating to the governing predicate. |
| 1076 | // |
| 1077 | |
| 1078 | bool hasGoverningPredicate() const { |
| 1079 | return GoverningPredicateIdx != std::numeric_limits<unsigned>::max(); |
| 1080 | } |
| 1081 | |
| 1082 | unsigned getGoverningPredicateOperandIdx() const { |
| 1083 | assert(hasGoverningPredicate() && "Property not set!" ); |
| 1084 | return GoverningPredicateIdx; |
| 1085 | } |
| 1086 | |
| 1087 | SVEIntrinsicInfo &setGoverningPredicateOperandIdx(unsigned Index) { |
| 1088 | assert(!hasGoverningPredicate() && "Cannot set property twice!" ); |
| 1089 | GoverningPredicateIdx = Index; |
| 1090 | return *this; |
| 1091 | } |
| 1092 | |
| 1093 | // |
| 1094 | // Properties relating to operations the intrinsic could be transformed into. |
| 1095 | // NOTE: This does not mean such a transformation is always possible, but the |
| 1096 | // knowledge makes it possible to reuse existing optimisations without needing |
| 1097 | // to embed specific handling for each intrinsic. For example, instruction |
| 1098 | // simplification can be used to optimise an intrinsic's active lanes. |
| 1099 | // |
| 1100 | |
| 1101 | bool hasMatchingUndefIntrinsic() const { |
| 1102 | return UndefIntrinsic != Intrinsic::not_intrinsic; |
| 1103 | } |
| 1104 | |
| 1105 | Intrinsic::ID getMatchingUndefIntrinsic() const { |
| 1106 | assert(hasMatchingUndefIntrinsic() && "Property not set!" ); |
| 1107 | return UndefIntrinsic; |
| 1108 | } |
| 1109 | |
| 1110 | SVEIntrinsicInfo &setMatchingUndefIntrinsic(Intrinsic::ID IID) { |
| 1111 | assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!" ); |
| 1112 | UndefIntrinsic = IID; |
| 1113 | return *this; |
| 1114 | } |
| 1115 | |
| 1116 | bool hasMatchingIROpode() const { return IROpcode != 0; } |
| 1117 | |
| 1118 | unsigned getMatchingIROpode() const { |
| 1119 | assert(hasMatchingIROpode() && "Property not set!" ); |
| 1120 | return IROpcode; |
| 1121 | } |
| 1122 | |
| 1123 | SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) { |
| 1124 | assert(!hasMatchingIROpode() && "Cannot set property twice!" ); |
| 1125 | IROpcode = Opcode; |
| 1126 | return *this; |
| 1127 | } |
| 1128 | |
| 1129 | // |
| 1130 | // Properties relating to the result of inactive lanes. |
| 1131 | // |
| 1132 | |
| 1133 | bool inactiveLanesTakenFromOperand() const { |
| 1134 | return ResultLanes == InactiveLanesTakenFromOperand; |
| 1135 | } |
| 1136 | |
| 1137 | unsigned getOperandIdxInactiveLanesTakenFrom() const { |
| 1138 | assert(inactiveLanesTakenFromOperand() && "Property not set!" ); |
| 1139 | return OperandIdxForInactiveLanes; |
| 1140 | } |
| 1141 | |
| 1142 | SVEIntrinsicInfo &setOperandIdxInactiveLanesTakenFrom(unsigned Index) { |
| 1143 | assert(ResultLanes == Uninitialized && "Cannot set property twice!" ); |
| 1144 | ResultLanes = InactiveLanesTakenFromOperand; |
| 1145 | OperandIdxForInactiveLanes = Index; |
| 1146 | return *this; |
| 1147 | } |
| 1148 | |
| 1149 | bool inactiveLanesAreNotDefined() const { |
| 1150 | return ResultLanes == InactiveLanesAreNotDefined; |
| 1151 | } |
| 1152 | |
| 1153 | SVEIntrinsicInfo &setInactiveLanesAreNotDefined() { |
| 1154 | assert(ResultLanes == Uninitialized && "Cannot set property twice!" ); |
| 1155 | ResultLanes = InactiveLanesAreNotDefined; |
| 1156 | return *this; |
| 1157 | } |
| 1158 | |
| 1159 | bool inactiveLanesAreUnused() const { |
| 1160 | return ResultLanes == InactiveLanesAreUnused; |
| 1161 | } |
| 1162 | |
| 1163 | SVEIntrinsicInfo &setInactiveLanesAreUnused() { |
| 1164 | assert(ResultLanes == Uninitialized && "Cannot set property twice!" ); |
| 1165 | ResultLanes = InactiveLanesAreUnused; |
| 1166 | return *this; |
| 1167 | } |
| 1168 | |
| 1169 | // NOTE: Whilst not limited to only inactive lanes, the common use case is: |
| 1170 | // inactiveLanesAreZeroed = |
| 1171 | // resultIsZeroInitialized() && inactiveLanesAreUnused() |
| 1172 | bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; } |
| 1173 | |
| 1174 | SVEIntrinsicInfo &setResultIsZeroInitialized() { |
| 1175 | ResultIsZeroInitialized = true; |
| 1176 | return *this; |
| 1177 | } |
| 1178 | |
| 1179 | // |
| 1180 | // The first operand of unary merging operations is typically only used to |
| 1181 | // set the result for inactive lanes. Knowing this allows us to deadcode the |
| 1182 | // operand when we can prove there are no inactive lanes. |
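|  | // For example, in llvm.aarch64.sve.neg(inactive, pg, op) the "inactive" |
|  | // operand only contributes lanes where pg is false, so it can be ignored |
|  | // (e.g. replaced with undef) when pg is known to be all active. |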
| 1183 | // |
| 1184 | |
| 1185 | bool hasOperandWithNoActiveLanes() const { |
| 1186 | return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max(); |
| 1187 | } |
| 1188 | |
| 1189 | unsigned getOperandIdxWithNoActiveLanes() const { |
| 1190 | assert(hasOperandWithNoActiveLanes() && "Property not set!" ); |
| 1191 | return OperandIdxWithNoActiveLanes; |
| 1192 | } |
| 1193 | |
| 1194 | SVEIntrinsicInfo &setOperandIdxWithNoActiveLanes(unsigned Index) { |
| 1195 | assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!" ); |
| 1196 | OperandIdxWithNoActiveLanes = Index; |
| 1197 | return *this; |
| 1198 | } |
| 1199 | |
| 1200 | private: |
| 1201 | unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max(); |
| 1202 | |
| 1203 | Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic; |
| 1204 | unsigned IROpcode = 0; |
| 1205 | |
| 1206 | enum PredicationStyle { |
| 1207 | Uninitialized, |
| 1208 | InactiveLanesTakenFromOperand, |
| 1209 | InactiveLanesAreNotDefined, |
| 1210 | InactiveLanesAreUnused |
| 1211 | } ResultLanes = Uninitialized; |
| 1212 | |
| 1213 | bool ResultIsZeroInitialized = false; |
| 1214 | unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max(); |
| 1215 | unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max(); |
| 1216 | }; |
| 1217 | |
| 1218 | static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) { |
| 1219 | // Some SVE intrinsics do not use scalable vector types, but since they are |
| 1220 | // not relevant from an SVEIntrinsicInfo perspective, they are also ignored. |
| 1221 | if (!isa<ScalableVectorType>(Val: II.getType()) && |
| 1222 | all_of(Range: II.args(), P: [&](const Value *V) { |
| 1223 | return !isa<ScalableVectorType>(Val: V->getType()); |
| 1224 | })) |
| 1225 | return SVEIntrinsicInfo(); |
| 1226 | |
| 1227 | Intrinsic::ID IID = II.getIntrinsicID(); |
| 1228 | switch (IID) { |
| 1229 | default: |
| 1230 | break; |
| 1231 | case Intrinsic::aarch64_sve_fcvt_bf16f32_v2: |
| 1232 | case Intrinsic::aarch64_sve_fcvt_f16f32: |
| 1233 | case Intrinsic::aarch64_sve_fcvt_f16f64: |
| 1234 | case Intrinsic::aarch64_sve_fcvt_f32f16: |
| 1235 | case Intrinsic::aarch64_sve_fcvt_f32f64: |
| 1236 | case Intrinsic::aarch64_sve_fcvt_f64f16: |
| 1237 | case Intrinsic::aarch64_sve_fcvt_f64f32: |
| 1238 | case Intrinsic::aarch64_sve_fcvtlt_f32f16: |
| 1239 | case Intrinsic::aarch64_sve_fcvtlt_f64f32: |
| 1240 | case Intrinsic::aarch64_sve_fcvtx_f32f64: |
| 1241 | case Intrinsic::aarch64_sve_fcvtzs: |
| 1242 | case Intrinsic::aarch64_sve_fcvtzs_i32f16: |
| 1243 | case Intrinsic::aarch64_sve_fcvtzs_i32f64: |
| 1244 | case Intrinsic::aarch64_sve_fcvtzs_i64f16: |
| 1245 | case Intrinsic::aarch64_sve_fcvtzs_i64f32: |
| 1246 | case Intrinsic::aarch64_sve_fcvtzu: |
| 1247 | case Intrinsic::aarch64_sve_fcvtzu_i32f16: |
| 1248 | case Intrinsic::aarch64_sve_fcvtzu_i32f64: |
| 1249 | case Intrinsic::aarch64_sve_fcvtzu_i64f16: |
| 1250 | case Intrinsic::aarch64_sve_fcvtzu_i64f32: |
| 1251 | case Intrinsic::aarch64_sve_scvtf: |
| 1252 | case Intrinsic::aarch64_sve_scvtf_f16i32: |
| 1253 | case Intrinsic::aarch64_sve_scvtf_f16i64: |
| 1254 | case Intrinsic::aarch64_sve_scvtf_f32i64: |
| 1255 | case Intrinsic::aarch64_sve_scvtf_f64i32: |
| 1256 | case Intrinsic::aarch64_sve_ucvtf: |
| 1257 | case Intrinsic::aarch64_sve_ucvtf_f16i32: |
| 1258 | case Intrinsic::aarch64_sve_ucvtf_f16i64: |
| 1259 | case Intrinsic::aarch64_sve_ucvtf_f32i64: |
| 1260 | case Intrinsic::aarch64_sve_ucvtf_f64i32: |
| 1261 | return SVEIntrinsicInfo::defaultMergingUnaryOp(); |
| 1262 | |
| 1263 | case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2: |
| 1264 | case Intrinsic::aarch64_sve_fcvtnt_f16f32: |
| 1265 | case Intrinsic::aarch64_sve_fcvtnt_f32f64: |
| 1266 | case Intrinsic::aarch64_sve_fcvtxnt_f32f64: |
| 1267 | return SVEIntrinsicInfo::defaultMergingUnaryNarrowingTopOp(); |
| 1268 | |
| 1269 | case Intrinsic::aarch64_sve_fabd: |
| 1270 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fabd_u); |
| 1271 | case Intrinsic::aarch64_sve_fadd: |
| 1272 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fadd_u) |
| 1273 | .setMatchingIROpcode(Instruction::FAdd); |
| 1274 | case Intrinsic::aarch64_sve_fdiv: |
| 1275 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fdiv_u) |
| 1276 | .setMatchingIROpcode(Instruction::FDiv); |
| 1277 | case Intrinsic::aarch64_sve_fmax: |
| 1278 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmax_u); |
| 1279 | case Intrinsic::aarch64_sve_fmaxnm: |
| 1280 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmaxnm_u); |
| 1281 | case Intrinsic::aarch64_sve_fmin: |
| 1282 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmin_u); |
| 1283 | case Intrinsic::aarch64_sve_fminnm: |
| 1284 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fminnm_u); |
| 1285 | case Intrinsic::aarch64_sve_fmla: |
| 1286 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmla_u); |
| 1287 | case Intrinsic::aarch64_sve_fmls: |
| 1288 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmls_u); |
| 1289 | case Intrinsic::aarch64_sve_fmul: |
| 1290 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmul_u) |
| 1291 | .setMatchingIROpcode(Instruction::FMul); |
| 1292 | case Intrinsic::aarch64_sve_fmulx: |
| 1293 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmulx_u); |
| 1294 | case Intrinsic::aarch64_sve_fnmla: |
| 1295 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fnmla_u); |
| 1296 | case Intrinsic::aarch64_sve_fnmls: |
| 1297 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fnmls_u); |
| 1298 | case Intrinsic::aarch64_sve_fsub: |
| 1299 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fsub_u) |
| 1300 | .setMatchingIROpcode(Instruction::FSub); |
| 1301 | case Intrinsic::aarch64_sve_add: |
| 1302 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_add_u) |
| 1303 | .setMatchingIROpcode(Instruction::Add); |
| 1304 | case Intrinsic::aarch64_sve_mla: |
| 1305 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mla_u); |
| 1306 | case Intrinsic::aarch64_sve_mls: |
| 1307 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mls_u); |
| 1308 | case Intrinsic::aarch64_sve_mul: |
| 1309 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mul_u) |
| 1310 | .setMatchingIROpcode(Instruction::Mul); |
| 1311 | case Intrinsic::aarch64_sve_sabd: |
| 1312 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sabd_u); |
| 1313 | case Intrinsic::aarch64_sve_sdiv: |
| 1314 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sdiv_u) |
| 1315 | .setMatchingIROpcode(Instruction::SDiv); |
| 1316 | case Intrinsic::aarch64_sve_smax: |
| 1317 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smax_u); |
| 1318 | case Intrinsic::aarch64_sve_smin: |
| 1319 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smin_u); |
| 1320 | case Intrinsic::aarch64_sve_smulh: |
| 1321 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smulh_u); |
| 1322 | case Intrinsic::aarch64_sve_sub: |
| 1323 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sub_u) |
| 1324 | .setMatchingIROpcode(Instruction::Sub); |
| 1325 | case Intrinsic::aarch64_sve_uabd: |
| 1326 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uabd_u); |
| 1327 | case Intrinsic::aarch64_sve_udiv: |
| 1328 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_udiv_u) |
| 1329 | .setMatchingIROpcode(Instruction::UDiv); |
| 1330 | case Intrinsic::aarch64_sve_umax: |
| 1331 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umax_u); |
| 1332 | case Intrinsic::aarch64_sve_umin: |
| 1333 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umin_u); |
| 1334 | case Intrinsic::aarch64_sve_umulh: |
| 1335 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umulh_u); |
| 1336 | case Intrinsic::aarch64_sve_asr: |
| 1337 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_asr_u) |
| 1338 | .setMatchingIROpcode(Instruction::AShr); |
| 1339 | case Intrinsic::aarch64_sve_lsl: |
| 1340 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_lsl_u) |
| 1341 | .setMatchingIROpcode(Instruction::Shl); |
| 1342 | case Intrinsic::aarch64_sve_lsr: |
| 1343 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_lsr_u) |
| 1344 | .setMatchingIROpcode(Instruction::LShr); |
| 1345 | case Intrinsic::aarch64_sve_and: |
| 1346 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_and_u) |
| 1347 | .setMatchingIROpcode(Instruction::And); |
| 1348 | case Intrinsic::aarch64_sve_bic: |
| 1349 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_bic_u); |
| 1350 | case Intrinsic::aarch64_sve_eor: |
| 1351 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_eor_u) |
| 1352 | .setMatchingIROpcode(Instruction::Xor); |
| 1353 | case Intrinsic::aarch64_sve_orr: |
| 1354 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_orr_u) |
| 1355 | .setMatchingIROpcode(Instruction::Or); |
| 1356 | case Intrinsic::aarch64_sve_sqsub: |
| 1357 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sqsub_u); |
| 1358 | case Intrinsic::aarch64_sve_uqsub: |
| 1359 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uqsub_u); |
| 1360 | |
| 1361 | case Intrinsic::aarch64_sve_add_u: |
| 1362 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1363 | Instruction::Add); |
| 1364 | case Intrinsic::aarch64_sve_and_u: |
| 1365 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1366 | Instruction::And); |
| 1367 | case Intrinsic::aarch64_sve_asr_u: |
| 1368 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1369 | Instruction::AShr); |
| 1370 | case Intrinsic::aarch64_sve_eor_u: |
| 1371 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1372 | Instruction::Xor); |
| 1373 | case Intrinsic::aarch64_sve_fadd_u: |
| 1374 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1375 | Instruction::FAdd); |
| 1376 | case Intrinsic::aarch64_sve_fdiv_u: |
| 1377 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1378 | Instruction::FDiv); |
| 1379 | case Intrinsic::aarch64_sve_fmul_u: |
| 1380 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1381 | Instruction::FMul); |
| 1382 | case Intrinsic::aarch64_sve_fsub_u: |
| 1383 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1384 | Instruction::FSub); |
| 1385 | case Intrinsic::aarch64_sve_lsl_u: |
| 1386 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1387 | Instruction::Shl); |
| 1388 | case Intrinsic::aarch64_sve_lsr_u: |
| 1389 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1390 | Instruction::LShr); |
| 1391 | case Intrinsic::aarch64_sve_mul_u: |
| 1392 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1393 | Instruction::Mul); |
| 1394 | case Intrinsic::aarch64_sve_orr_u: |
| 1395 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1396 | Instruction::Or); |
| 1397 | case Intrinsic::aarch64_sve_sdiv_u: |
| 1398 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1399 | Instruction::SDiv); |
| 1400 | case Intrinsic::aarch64_sve_sub_u: |
| 1401 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1402 | Instruction::Sub); |
| 1403 | case Intrinsic::aarch64_sve_udiv_u: |
| 1404 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
| 1405 | Instruction::UDiv); |
| 1406 | |
| 1407 | case Intrinsic::aarch64_sve_addqv: |
| 1408 | case Intrinsic::aarch64_sve_and_z: |
| 1409 | case Intrinsic::aarch64_sve_bic_z: |
| 1410 | case Intrinsic::aarch64_sve_brka_z: |
| 1411 | case Intrinsic::aarch64_sve_brkb_z: |
| 1412 | case Intrinsic::aarch64_sve_brkn_z: |
| 1413 | case Intrinsic::aarch64_sve_brkpa_z: |
| 1414 | case Intrinsic::aarch64_sve_brkpb_z: |
| 1415 | case Intrinsic::aarch64_sve_cntp: |
| 1416 | case Intrinsic::aarch64_sve_compact: |
| 1417 | case Intrinsic::aarch64_sve_eor_z: |
| 1418 | case Intrinsic::aarch64_sve_eorv: |
| 1419 | case Intrinsic::aarch64_sve_eorqv: |
| 1420 | case Intrinsic::aarch64_sve_nand_z: |
| 1421 | case Intrinsic::aarch64_sve_nor_z: |
| 1422 | case Intrinsic::aarch64_sve_orn_z: |
| 1423 | case Intrinsic::aarch64_sve_orr_z: |
| 1424 | case Intrinsic::aarch64_sve_orv: |
| 1425 | case Intrinsic::aarch64_sve_orqv: |
| 1426 | case Intrinsic::aarch64_sve_pnext: |
| 1427 | case Intrinsic::aarch64_sve_rdffr_z: |
| 1428 | case Intrinsic::aarch64_sve_saddv: |
| 1429 | case Intrinsic::aarch64_sve_uaddv: |
| 1430 | case Intrinsic::aarch64_sve_umaxv: |
| 1431 | case Intrinsic::aarch64_sve_umaxqv: |
| 1432 | case Intrinsic::aarch64_sve_cmpeq: |
| 1433 | case Intrinsic::aarch64_sve_cmpeq_wide: |
| 1434 | case Intrinsic::aarch64_sve_cmpge: |
| 1435 | case Intrinsic::aarch64_sve_cmpge_wide: |
| 1436 | case Intrinsic::aarch64_sve_cmpgt: |
| 1437 | case Intrinsic::aarch64_sve_cmpgt_wide: |
| 1438 | case Intrinsic::aarch64_sve_cmphi: |
| 1439 | case Intrinsic::aarch64_sve_cmphi_wide: |
| 1440 | case Intrinsic::aarch64_sve_cmphs: |
| 1441 | case Intrinsic::aarch64_sve_cmphs_wide: |
| 1442 | case Intrinsic::aarch64_sve_cmple_wide: |
| 1443 | case Intrinsic::aarch64_sve_cmplo_wide: |
| 1444 | case Intrinsic::aarch64_sve_cmpls_wide: |
| 1445 | case Intrinsic::aarch64_sve_cmplt_wide: |
| 1446 | case Intrinsic::aarch64_sve_cmpne: |
| 1447 | case Intrinsic::aarch64_sve_cmpne_wide: |
| 1448 | case Intrinsic::aarch64_sve_facge: |
| 1449 | case Intrinsic::aarch64_sve_facgt: |
| 1450 | case Intrinsic::aarch64_sve_fcmpeq: |
| 1451 | case Intrinsic::aarch64_sve_fcmpge: |
| 1452 | case Intrinsic::aarch64_sve_fcmpgt: |
| 1453 | case Intrinsic::aarch64_sve_fcmpne: |
| 1454 | case Intrinsic::aarch64_sve_fcmpuo: |
| 1455 | case Intrinsic::aarch64_sve_ld1: |
| 1456 | case Intrinsic::aarch64_sve_ld1_gather: |
| 1457 | case Intrinsic::aarch64_sve_ld1_gather_index: |
| 1458 | case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: |
| 1459 | case Intrinsic::aarch64_sve_ld1_gather_sxtw: |
| 1460 | case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: |
| 1461 | case Intrinsic::aarch64_sve_ld1_gather_uxtw: |
| 1462 | case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: |
| 1463 | case Intrinsic::aarch64_sve_ld1q_gather_index: |
| 1464 | case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset: |
| 1465 | case Intrinsic::aarch64_sve_ld1q_gather_vector_offset: |
| 1466 | case Intrinsic::aarch64_sve_ld1ro: |
| 1467 | case Intrinsic::aarch64_sve_ld1rq: |
| 1468 | case Intrinsic::aarch64_sve_ld1udq: |
| 1469 | case Intrinsic::aarch64_sve_ld1uwq: |
| 1470 | case Intrinsic::aarch64_sve_ld2_sret: |
| 1471 | case Intrinsic::aarch64_sve_ld2q_sret: |
| 1472 | case Intrinsic::aarch64_sve_ld3_sret: |
| 1473 | case Intrinsic::aarch64_sve_ld3q_sret: |
| 1474 | case Intrinsic::aarch64_sve_ld4_sret: |
| 1475 | case Intrinsic::aarch64_sve_ld4q_sret: |
| 1476 | case Intrinsic::aarch64_sve_ldff1: |
| 1477 | case Intrinsic::aarch64_sve_ldff1_gather: |
| 1478 | case Intrinsic::aarch64_sve_ldff1_gather_index: |
| 1479 | case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset: |
| 1480 | case Intrinsic::aarch64_sve_ldff1_gather_sxtw: |
| 1481 | case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: |
| 1482 | case Intrinsic::aarch64_sve_ldff1_gather_uxtw: |
| 1483 | case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: |
| 1484 | case Intrinsic::aarch64_sve_ldnf1: |
| 1485 | case Intrinsic::aarch64_sve_ldnt1: |
| 1486 | case Intrinsic::aarch64_sve_ldnt1_gather: |
| 1487 | case Intrinsic::aarch64_sve_ldnt1_gather_index: |
| 1488 | case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: |
| 1489 | case Intrinsic::aarch64_sve_ldnt1_gather_uxtw: |
| 1490 | return SVEIntrinsicInfo::defaultZeroingOp(); |
| 1491 | |
| 1492 | case Intrinsic::aarch64_sve_prf: |
| 1493 | case Intrinsic::aarch64_sve_prfb_gather_index: |
| 1494 | case Intrinsic::aarch64_sve_prfb_gather_scalar_offset: |
| 1495 | case Intrinsic::aarch64_sve_prfb_gather_sxtw_index: |
| 1496 | case Intrinsic::aarch64_sve_prfb_gather_uxtw_index: |
| 1497 | case Intrinsic::aarch64_sve_prfd_gather_index: |
| 1498 | case Intrinsic::aarch64_sve_prfd_gather_scalar_offset: |
| 1499 | case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: |
| 1500 | case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: |
| 1501 | case Intrinsic::aarch64_sve_prfh_gather_index: |
| 1502 | case Intrinsic::aarch64_sve_prfh_gather_scalar_offset: |
| 1503 | case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: |
| 1504 | case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: |
| 1505 | case Intrinsic::aarch64_sve_prfw_gather_index: |
| 1506 | case Intrinsic::aarch64_sve_prfw_gather_scalar_offset: |
| 1507 | case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: |
| 1508 | case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: |
| 1509 | return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 0); |
| 1510 | |
| 1511 | case Intrinsic::aarch64_sve_st1_scatter: |
| 1512 | case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: |
| 1513 | case Intrinsic::aarch64_sve_st1_scatter_sxtw: |
| 1514 | case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: |
| 1515 | case Intrinsic::aarch64_sve_st1_scatter_uxtw: |
| 1516 | case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: |
| 1517 | case Intrinsic::aarch64_sve_st1dq: |
| 1518 | case Intrinsic::aarch64_sve_st1q_scatter_index: |
| 1519 | case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset: |
| 1520 | case Intrinsic::aarch64_sve_st1q_scatter_vector_offset: |
| 1521 | case Intrinsic::aarch64_sve_st1wq: |
| 1522 | case Intrinsic::aarch64_sve_stnt1: |
| 1523 | case Intrinsic::aarch64_sve_stnt1_scatter: |
| 1524 | case Intrinsic::aarch64_sve_stnt1_scatter_index: |
| 1525 | case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset: |
| 1526 | case Intrinsic::aarch64_sve_stnt1_scatter_uxtw: |
| 1527 | return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 1); |
| 1528 | case Intrinsic::aarch64_sve_st2: |
| 1529 | case Intrinsic::aarch64_sve_st2q: |
| 1530 | return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 2); |
| 1531 | case Intrinsic::aarch64_sve_st3: |
| 1532 | case Intrinsic::aarch64_sve_st3q: |
| 1533 | return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 3); |
| 1534 | case Intrinsic::aarch64_sve_st4: |
| 1535 | case Intrinsic::aarch64_sve_st4q: |
| 1536 | return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 4); |
| 1537 | } |
| 1538 | |
| 1539 | return SVEIntrinsicInfo(); |
| 1540 | } |
| 1541 | |
| 1542 | static bool isAllActivePredicate(Value *Pred) { |
| 1543 | // Look through convert.from.svbool(convert.to.svbool(...) chain. |
| 1544 | Value *UncastedPred; |
| 1545 | if (match(V: Pred, P: m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>( |
| 1546 | Op0: m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>( |
| 1547 | Op0: m_Value(V&: UncastedPred))))) |
| 1548 | // If the predicate has the same or fewer lanes than the uncasted
| 1549 | // predicate, then we know the casting has no effect.
| 1550 | if (cast<ScalableVectorType>(Val: Pred->getType())->getMinNumElements() <= |
| 1551 | cast<ScalableVectorType>(Val: UncastedPred->getType())->getMinNumElements()) |
| 1552 | Pred = UncastedPred; |
| 1553 | auto *C = dyn_cast<Constant>(Val: Pred); |
| 1554 | return (C && C->isAllOnesValue()); |
| 1555 | } |
| 1556 | |
| 1557 | // Simplify `V` by only considering the operations that affect active lanes. |
| 1558 | // This function should only return existing Values or newly created Constants. |
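|      | // For example (illustrative): sve.dup(V, Pg, C) with a constant scalar C
|      | // writes splat(C) to every active lane, so for the purpose of simplification
|      | // it can be treated as that constant splat.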
| 1559 | static Value *stripInactiveLanes(Value *V, const Value *Pg) { |
| 1560 | auto *Dup = dyn_cast<IntrinsicInst>(Val: V); |
| 1561 | if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup && |
| 1562 | Dup->getOperand(i_nocapture: 1) == Pg && isa<Constant>(Val: Dup->getOperand(i_nocapture: 2))) |
| 1563 | return ConstantVector::getSplat( |
| 1564 | EC: cast<VectorType>(Val: V->getType())->getElementCount(), |
| 1565 | Elt: cast<Constant>(Val: Dup->getOperand(i_nocapture: 2))); |
| 1566 | |
| 1567 | return V; |
| 1568 | } |
| 1569 | |
| 1570 | static std::optional<Instruction *> |
| 1571 | simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, |
| 1572 | const SVEIntrinsicInfo &IInfo) { |
| 1573 | const unsigned Opc = IInfo.getMatchingIROpode(); |
| 1574 | assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!" ); |
| 1575 | |
| 1576 | Value *Pg = II.getOperand(i_nocapture: 0); |
| 1577 | Value *Op1 = II.getOperand(i_nocapture: 1); |
| 1578 | Value *Op2 = II.getOperand(i_nocapture: 2); |
| 1579 | const DataLayout &DL = II.getDataLayout(); |
| 1580 | |
| 1581 | // Canonicalise constants to the RHS. |
| 1582 | if (Instruction::isCommutative(Opcode: Opc) && IInfo.inactiveLanesAreNotDefined() && |
| 1583 | isa<Constant>(Val: Op1) && !isa<Constant>(Val: Op2)) { |
| 1584 | IC.replaceOperand(I&: II, OpNum: 1, V: Op2); |
| 1585 | IC.replaceOperand(I&: II, OpNum: 2, V: Op1); |
| 1586 | return &II; |
| 1587 | } |
| 1588 | |
| 1589 | // Only active lanes matter when simplifying the operation. |
| 1590 | Op1 = stripInactiveLanes(V: Op1, Pg); |
| 1591 | Op2 = stripInactiveLanes(V: Op2, Pg); |
| 1592 | |
| 1593 | Value *SimpleII; |
| 1594 | if (auto FII = dyn_cast<FPMathOperator>(Val: &II)) |
| 1595 | SimpleII = simplifyBinOp(Opcode: Opc, LHS: Op1, RHS: Op2, FMF: FII->getFastMathFlags(), Q: DL); |
| 1596 | else |
| 1597 | SimpleII = simplifyBinOp(Opcode: Opc, LHS: Op1, RHS: Op2, Q: DL); |
| 1598 | |
| 1599 | // An SVE intrinsic's result is always defined. However, this is not the case |
| 1600 | // for its equivalent IR instruction (e.g. when shifting by an amount more |
| 1601 | // than the data's bitwidth). Simplifications to an undefined result must be |
| 1602 | // ignored to preserve the intrinsic's expected behaviour. |
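|      | // Illustrative example: for <vscale x 2 x i64> data, sve.lsl(pg, X, splat(70))
|      | // has a defined result, but the matching IR 'shl' by 70 simplifies to poison,
|      | // so that simplification is dropped here.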
| 1603 | if (!SimpleII || isa<UndefValue>(Val: SimpleII)) |
| 1604 | return std::nullopt; |
| 1605 | |
| 1606 | if (IInfo.inactiveLanesAreNotDefined()) |
| 1607 | return IC.replaceInstUsesWith(I&: II, V: SimpleII); |
| 1608 | |
| 1609 | Value *Inactive = II.getOperand(i_nocapture: IInfo.getOperandIdxInactiveLanesTakenFrom()); |
| 1610 | |
| 1611 | // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)). |
| 1612 | if (SimpleII == Inactive) |
| 1613 | return IC.replaceInstUsesWith(I&: II, V: SimpleII); |
| 1614 | |
| 1615 | // Inactive lanes must be preserved. |
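|      | // Illustrative example: sve.sub(pg, A, A) simplifies to zero for the active
|      | // lanes, but the inactive lanes must still come from A, giving
|      | // select(pg, splat(0), A).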
| 1616 | SimpleII = IC.Builder.CreateSelect(C: Pg, True: SimpleII, False: Inactive); |
| 1617 | return IC.replaceInstUsesWith(I&: II, V: SimpleII); |
| 1618 | } |
| 1619 | |
| 1620 | // Use SVE intrinsic info to eliminate redundant operands and/or canonicalise |
| 1621 | // to operations with less strict inactive lane requirements. |
| 1622 | static std::optional<Instruction *> |
| 1623 | simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, |
| 1624 | const SVEIntrinsicInfo &IInfo) { |
| 1625 | if (!IInfo.hasGoverningPredicate()) |
| 1626 | return std::nullopt; |
| 1627 | |
| 1628 | auto *OpPredicate = II.getOperand(i_nocapture: IInfo.getGoverningPredicateOperandIdx()); |
| 1629 | |
| 1630 | // If there are no active lanes. |
| 1631 | if (match(V: OpPredicate, P: m_ZeroInt())) { |
| 1632 | if (IInfo.inactiveLanesTakenFromOperand()) |
| 1633 | return IC.replaceInstUsesWith( |
| 1634 | I&: II, V: II.getOperand(i_nocapture: IInfo.getOperandIdxInactiveLanesTakenFrom())); |
| 1635 | |
| 1636 | if (IInfo.inactiveLanesAreUnused()) { |
| 1637 | if (IInfo.resultIsZeroInitialized()) |
| 1638 | IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty: II.getType())); |
| 1639 | |
| 1640 | return IC.eraseInstFromFunction(I&: II); |
| 1641 | } |
| 1642 | } |
| 1643 | |
| 1644 | // If there are no inactive lanes. |
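|      | // Illustrative example: sve.fadd(ptrue, A, B) has no inactive lanes, so it can
|      | // be rewritten to the less strict sve.fadd_u(ptrue, A, B), whose inactive-lane
|      | // result is unspecified.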
| 1645 | if (isAllActivePredicate(Pred: OpPredicate)) { |
| 1646 | if (IInfo.hasOperandWithNoActiveLanes()) { |
| 1647 | unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes(); |
| 1648 | if (!isa<UndefValue>(Val: II.getOperand(i_nocapture: OpIdx))) |
| 1649 | return IC.replaceOperand(I&: II, OpNum: OpIdx, V: UndefValue::get(T: II.getType())); |
| 1650 | } |
| 1651 | |
| 1652 | if (IInfo.hasMatchingUndefIntrinsic()) { |
| 1653 | auto *NewDecl = Intrinsic::getOrInsertDeclaration( |
| 1654 | M: II.getModule(), id: IInfo.getMatchingUndefIntrinsic(), Tys: {II.getType()}); |
| 1655 | II.setCalledFunction(NewDecl); |
| 1656 | return &II; |
| 1657 | } |
| 1658 | } |
| 1659 | |
| 1660 | // Operation specific simplifications. |
| 1661 | if (IInfo.hasMatchingIROpode() && |
| 1662 | Instruction::isBinaryOp(Opcode: IInfo.getMatchingIROpode())) |
| 1663 | return simplifySVEIntrinsicBinOp(IC, II, IInfo); |
| 1664 | |
| 1665 | return std::nullopt; |
| 1666 | } |
| 1667 | |
| 1668 | // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
| 1669 | // => (binop (pred) (from_svbool _) (from_svbool _)) |
| 1670 | // |
| 1671 | // The above transformation eliminates a `to_svbool` in the predicate |
| 1672 | // operand of bitwise operation `binop` by narrowing the vector width of |
| 1673 | // the operation. For example, it would convert a `<vscale x 16 x i1> |
| 1674 | // and` into a `<vscale x 4 x i1> and`. This is profitable because |
| 1675 | // to_svbool must zero the new lanes during widening, whereas |
| 1676 | // from_svbool is free. |
| 1677 | static std::optional<Instruction *> |
| 1678 | tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) { |
| 1679 | auto BinOp = dyn_cast<IntrinsicInst>(Val: II.getOperand(i_nocapture: 0)); |
| 1680 | if (!BinOp) |
| 1681 | return std::nullopt; |
| 1682 | |
| 1683 | auto IntrinsicID = BinOp->getIntrinsicID(); |
| 1684 | switch (IntrinsicID) { |
| 1685 | case Intrinsic::aarch64_sve_and_z: |
| 1686 | case Intrinsic::aarch64_sve_bic_z: |
| 1687 | case Intrinsic::aarch64_sve_eor_z: |
| 1688 | case Intrinsic::aarch64_sve_nand_z: |
| 1689 | case Intrinsic::aarch64_sve_nor_z: |
| 1690 | case Intrinsic::aarch64_sve_orn_z: |
| 1691 | case Intrinsic::aarch64_sve_orr_z: |
| 1692 | break; |
| 1693 | default: |
| 1694 | return std::nullopt; |
| 1695 | } |
| 1696 | |
| 1697 | auto BinOpPred = BinOp->getOperand(i_nocapture: 0); |
| 1698 | auto BinOpOp1 = BinOp->getOperand(i_nocapture: 1); |
| 1699 | auto BinOpOp2 = BinOp->getOperand(i_nocapture: 2); |
| 1700 | |
| 1701 | auto PredIntr = dyn_cast<IntrinsicInst>(Val: BinOpPred); |
| 1702 | if (!PredIntr || |
| 1703 | PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) |
| 1704 | return std::nullopt; |
| 1705 | |
| 1706 | auto PredOp = PredIntr->getOperand(i_nocapture: 0); |
| 1707 | auto PredOpTy = cast<VectorType>(Val: PredOp->getType()); |
| 1708 | if (PredOpTy != II.getType()) |
| 1709 | return std::nullopt; |
| 1710 | |
| 1711 | SmallVector<Value *> NarrowedBinOpArgs = {PredOp}; |
| 1712 | auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic( |
| 1713 | ID: Intrinsic::aarch64_sve_convert_from_svbool, Types: {PredOpTy}, Args: {BinOpOp1}); |
| 1714 | NarrowedBinOpArgs.push_back(Elt: NarrowBinOpOp1); |
| 1715 | if (BinOpOp1 == BinOpOp2) |
| 1716 | NarrowedBinOpArgs.push_back(Elt: NarrowBinOpOp1); |
| 1717 | else |
| 1718 | NarrowedBinOpArgs.push_back(Elt: IC.Builder.CreateIntrinsic( |
| 1719 | ID: Intrinsic::aarch64_sve_convert_from_svbool, Types: {PredOpTy}, Args: {BinOpOp2})); |
| 1720 | |
| 1721 | auto NarrowedBinOp = |
| 1722 | IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {PredOpTy}, Args: NarrowedBinOpArgs); |
| 1723 | return IC.replaceInstUsesWith(I&: II, V: NarrowedBinOp); |
| 1724 | } |
| 1725 | |
| 1726 | static std::optional<Instruction *> |
| 1727 | instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) { |
| 1728 | // If the reinterpret instruction operand is a PHI Node |
| 1729 | if (isa<PHINode>(Val: II.getArgOperand(i: 0))) |
| 1730 | return processPhiNode(IC, II); |
| 1731 | |
| 1732 | if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II)) |
| 1733 | return BinOpCombine; |
| 1734 | |
| 1735 | // Ignore converts to/from svcount_t. |
| 1736 | if (isa<TargetExtType>(Val: II.getArgOperand(i: 0)->getType()) || |
| 1737 | isa<TargetExtType>(Val: II.getType())) |
| 1738 | return std::nullopt; |
| 1739 | |
| 1740 | SmallVector<Instruction *, 32> CandidatesForRemoval; |
| 1741 | Value *Cursor = II.getOperand(i_nocapture: 0), *EarliestReplacement = nullptr; |
| 1742 | |
| 1743 | const auto *IVTy = cast<VectorType>(Val: II.getType()); |
| 1744 | |
| 1745 | // Walk the chain of conversions. |
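|      | // Illustrative example: from_svbool(to_svbool(X)), where X already has the
|      | // result type, can be replaced with X directly.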
| 1746 | while (Cursor) { |
| 1747 | // If the type of the cursor has fewer lanes than the final result, zeroing |
| 1748 | // must take place, which breaks the equivalence chain. |
| 1749 | const auto *CursorVTy = cast<VectorType>(Val: Cursor->getType()); |
| 1750 | if (CursorVTy->getElementCount().getKnownMinValue() < |
| 1751 | IVTy->getElementCount().getKnownMinValue()) |
| 1752 | break; |
| 1753 | |
| 1754 | // If the cursor has the same type as I, it is a viable replacement. |
| 1755 | if (Cursor->getType() == IVTy) |
| 1756 | EarliestReplacement = Cursor; |
| 1757 | |
| 1758 | auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Val: Cursor); |
| 1759 | |
| 1760 | // If this is not an SVE conversion intrinsic, this is the end of the chain. |
| 1761 | if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == |
| 1762 | Intrinsic::aarch64_sve_convert_to_svbool || |
| 1763 | IntrinsicCursor->getIntrinsicID() == |
| 1764 | Intrinsic::aarch64_sve_convert_from_svbool)) |
| 1765 | break; |
| 1766 | |
| 1767 | CandidatesForRemoval.insert(I: CandidatesForRemoval.begin(), Elt: IntrinsicCursor); |
| 1768 | Cursor = IntrinsicCursor->getOperand(i_nocapture: 0); |
| 1769 | } |
| 1770 | |
| 1771 | // If no viable replacement in the conversion chain was found, there is |
| 1772 | // nothing to do. |
| 1773 | if (!EarliestReplacement) |
| 1774 | return std::nullopt; |
| 1775 | |
| 1776 | return IC.replaceInstUsesWith(I&: II, V: EarliestReplacement); |
| 1777 | } |
| 1778 | |
| 1779 | static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC, |
| 1780 | IntrinsicInst &II) { |
| 1781 | // svsel(ptrue, x, y) => x |
| 1782 | auto *OpPredicate = II.getOperand(i_nocapture: 0); |
| 1783 | if (isAllActivePredicate(Pred: OpPredicate)) |
| 1784 | return IC.replaceInstUsesWith(I&: II, V: II.getOperand(i_nocapture: 1)); |
| 1785 | |
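|      | // Otherwise the intrinsic maps directly onto an IR select (illustrative):
|      | // svsel(p, x, y) => select p, x, y.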
| 1786 | auto Select = |
| 1787 | IC.Builder.CreateSelect(C: OpPredicate, True: II.getOperand(i_nocapture: 1), False: II.getOperand(i_nocapture: 2)); |
| 1788 | return IC.replaceInstUsesWith(I&: II, V: Select); |
| 1789 | } |
| 1790 | |
| 1791 | static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC, |
| 1792 | IntrinsicInst &II) { |
| 1793 | IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(Val: II.getArgOperand(i: 1)); |
| 1794 | if (!Pg) |
| 1795 | return std::nullopt; |
| 1796 | |
| 1797 | if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) |
| 1798 | return std::nullopt; |
| 1799 | |
| 1800 | const auto PTruePattern = |
| 1801 | cast<ConstantInt>(Val: Pg->getOperand(i_nocapture: 0))->getZExtValue(); |
| 1802 | if (PTruePattern != AArch64SVEPredPattern::vl1) |
| 1803 | return std::nullopt; |
| 1804 | |
| 1805 | // The intrinsic is inserting into lane zero so use an insert instead. |
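|      | // Illustrative example: sve.dup(V, ptrue(vl1), X) only writes lane 0, so it
|      | // becomes insertelement V, X, 0.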
| 1806 | auto *IdxTy = Type::getInt64Ty(C&: II.getContext()); |
| 1807 | auto *Insert = InsertElementInst::Create( |
| 1808 | Vec: II.getArgOperand(i: 0), NewElt: II.getArgOperand(i: 2), Idx: ConstantInt::get(Ty: IdxTy, V: 0)); |
| 1809 | Insert->insertBefore(InsertPos: II.getIterator()); |
| 1810 | Insert->takeName(V: &II); |
| 1811 | |
| 1812 | return IC.replaceInstUsesWith(I&: II, V: Insert); |
| 1813 | } |
| 1814 | |
| 1815 | static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC, |
| 1816 | IntrinsicInst &II) { |
| 1817 | // Replace DupX with a regular IR splat. |
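|      | // Illustrative example: sve.dup.x(X) => an IR splat of X across every lane of
|      | // the result type.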
| 1818 | auto *RetTy = cast<ScalableVectorType>(Val: II.getType()); |
| 1819 | Value *Splat = IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(), |
| 1820 | V: II.getArgOperand(i: 0)); |
| 1821 | Splat->takeName(V: &II); |
| 1822 | return IC.replaceInstUsesWith(I&: II, V: Splat); |
| 1823 | } |
| 1824 | |
| 1825 | static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC, |
| 1826 | IntrinsicInst &II) { |
| 1827 | LLVMContext &Ctx = II.getContext(); |
| 1828 | |
| 1829 | if (!isAllActivePredicate(Pred: II.getArgOperand(i: 0))) |
| 1830 | return std::nullopt; |
| 1831 | |
| 1832 | // Check that we have a compare of zero.. |
| 1833 | auto *SplatValue = |
| 1834 | dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: II.getArgOperand(i: 2))); |
| 1835 | if (!SplatValue || !SplatValue->isZero()) |
| 1836 | return std::nullopt; |
| 1837 | |
| 1838 | // ..against a dupq |
| 1839 | auto *DupQLane = dyn_cast<IntrinsicInst>(Val: II.getArgOperand(i: 1)); |
| 1840 | if (!DupQLane || |
| 1841 | DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane) |
| 1842 | return std::nullopt; |
| 1843 | |
| 1844 | // Where the dupq is a lane 0 replicate of a vector insert |
| 1845 | auto *DupQLaneIdx = dyn_cast<ConstantInt>(Val: DupQLane->getArgOperand(i: 1)); |
| 1846 | if (!DupQLaneIdx || !DupQLaneIdx->isZero()) |
| 1847 | return std::nullopt; |
| 1848 | |
| 1849 | auto *VecIns = dyn_cast<IntrinsicInst>(Val: DupQLane->getArgOperand(i: 0)); |
| 1850 | if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert) |
| 1851 | return std::nullopt; |
| 1852 | |
| 1853 | // Where the vector insert is a fixed constant vector insert into undef at |
| 1854 | // index zero |
| 1855 | if (!isa<UndefValue>(Val: VecIns->getArgOperand(i: 0))) |
| 1856 | return std::nullopt; |
| 1857 | |
| 1858 | if (!cast<ConstantInt>(Val: VecIns->getArgOperand(i: 2))->isZero()) |
| 1859 | return std::nullopt; |
| 1860 | |
| 1861 | auto *ConstVec = dyn_cast<Constant>(Val: VecIns->getArgOperand(i: 1)); |
| 1862 | if (!ConstVec) |
| 1863 | return std::nullopt; |
| 1864 | |
| 1865 | auto *VecTy = dyn_cast<FixedVectorType>(Val: ConstVec->getType()); |
| 1866 | auto *OutTy = dyn_cast<ScalableVectorType>(Val: II.getType()); |
| 1867 | if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements()) |
| 1868 | return std::nullopt; |
| 1869 | |
| 1870 | unsigned NumElts = VecTy->getNumElements(); |
| 1871 | unsigned PredicateBits = 0; |
| 1872 | |
| 1873 | // Expand intrinsic operands to a 16-bit byte level predicate |
| 1874 | for (unsigned I = 0; I < NumElts; ++I) { |
| 1875 | auto *Arg = dyn_cast<ConstantInt>(Val: ConstVec->getAggregateElement(Elt: I)); |
| 1876 | if (!Arg) |
| 1877 | return std::nullopt; |
| 1878 | if (!Arg->isZero()) |
| 1879 | PredicateBits |= 1 << (I * (16 / NumElts)); |
| 1880 | } |
| 1881 | |
| 1882 | // If all bits are zero, bail early with an empty predicate
| 1883 | if (PredicateBits == 0) { |
| 1884 | auto *PFalse = Constant::getNullValue(Ty: II.getType()); |
| 1885 | PFalse->takeName(V: &II); |
| 1886 | return IC.replaceInstUsesWith(I&: II, V: PFalse); |
| 1887 | } |
| 1888 | |
| 1889 | // Calculate largest predicate type used (where byte predicate is largest) |
| 1890 | unsigned Mask = 8; |
| 1891 | for (unsigned I = 0; I < 16; ++I) |
| 1892 | if ((PredicateBits & (1 << I)) != 0) |
| 1893 | Mask |= (I % 8); |
| 1894 | |
| 1895 | unsigned PredSize = Mask & -Mask; |
| 1896 | auto *PredType = ScalableVectorType::get( |
| 1897 | ElementType: Type::getInt1Ty(C&: Ctx), MinNumElts: AArch64::SVEBitsPerBlock / (PredSize * 8)); |
| 1898 | |
| 1899 | // Ensure all relevant bits are set |
| 1900 | for (unsigned I = 0; I < 16; I += PredSize) |
| 1901 | if ((PredicateBits & (1 << I)) == 0) |
| 1902 | return std::nullopt; |
| 1903 | |
| 1904 | auto *PTruePat = |
| 1905 | ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: AArch64SVEPredPattern::all); |
| 1906 | auto *PTrue = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue, |
| 1907 | Types: {PredType}, Args: {PTruePat}); |
| 1908 | auto *ConvertToSVBool = IC.Builder.CreateIntrinsic( |
| 1909 | ID: Intrinsic::aarch64_sve_convert_to_svbool, Types: {PredType}, Args: {PTrue}); |
| 1910 | auto *ConvertFromSVBool = |
| 1911 | IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_convert_from_svbool, |
| 1912 | Types: {II.getType()}, Args: {ConvertToSVBool}); |
| 1913 | |
| 1914 | ConvertFromSVBool->takeName(V: &II); |
| 1915 | return IC.replaceInstUsesWith(I&: II, V: ConvertFromSVBool); |
| 1916 | } |
| 1917 | |
| 1918 | static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC, |
| 1919 | IntrinsicInst &II) { |
| 1920 | Value *Pg = II.getArgOperand(i: 0); |
| 1921 | Value *Vec = II.getArgOperand(i: 1); |
| 1922 | auto IntrinsicID = II.getIntrinsicID(); |
| 1923 | bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta; |
| 1924 | |
| 1925 | // lastX(splat(X)) --> X |
| 1926 | if (auto *SplatVal = getSplatValue(V: Vec)) |
| 1927 | return IC.replaceInstUsesWith(I&: II, V: SplatVal); |
| 1928 | |
| 1929 | // If x and/or y is a splat value then: |
| 1930 | // lastX (binop (x, y)) --> binop(lastX(x), lastX(y)) |
| 1931 | Value *LHS, *RHS; |
| 1932 | if (match(V: Vec, P: m_OneUse(SubPattern: m_BinOp(L: m_Value(V&: LHS), R: m_Value(V&: RHS))))) { |
| 1933 | if (isSplatValue(V: LHS) || isSplatValue(V: RHS)) { |
| 1934 | auto *OldBinOp = cast<BinaryOperator>(Val: Vec); |
| 1935 | auto OpC = OldBinOp->getOpcode(); |
| 1936 | auto *NewLHS = |
| 1937 | IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {Vec->getType()}, Args: {Pg, LHS}); |
| 1938 | auto *NewRHS = |
| 1939 | IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {Vec->getType()}, Args: {Pg, RHS}); |
| 1940 | auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags( |
| 1941 | Opc: OpC, V1: NewLHS, V2: NewRHS, CopyO: OldBinOp, Name: OldBinOp->getName(), InsertBefore: II.getIterator()); |
| 1942 | return IC.replaceInstUsesWith(I&: II, V: NewBinOp); |
| 1943 | } |
| 1944 | } |
| 1945 | |
| 1946 | auto *C = dyn_cast<Constant>(Val: Pg); |
| 1947 | if (IsAfter && C && C->isNullValue()) { |
| 1948 | // The intrinsic is extracting lane 0 so use an extract instead. |
| 1949 | auto *IdxTy = Type::getInt64Ty(C&: II.getContext()); |
| 1950 | auto *Extract = ExtractElementInst::Create(Vec, Idx: ConstantInt::get(Ty: IdxTy, V: 0));
| 1951 | Extract->insertBefore(InsertPos: II.getIterator()); |
| 1952 | Extract->takeName(V: &II); |
| 1953 | return IC.replaceInstUsesWith(I&: II, V: Extract); |
| 1954 | } |
| 1955 | |
| 1956 | auto *IntrPG = dyn_cast<IntrinsicInst>(Val: Pg); |
| 1957 | if (!IntrPG) |
| 1958 | return std::nullopt; |
| 1959 | |
| 1960 | if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) |
| 1961 | return std::nullopt; |
| 1962 | |
| 1963 | const auto PTruePattern = |
| 1964 | cast<ConstantInt>(Val: IntrPG->getOperand(i_nocapture: 0))->getZExtValue(); |
| 1965 | |
| 1966 | // Can the intrinsic's predicate be converted to a known constant index? |
| 1967 | unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern: PTruePattern); |
| 1968 | if (!MinNumElts) |
| 1969 | return std::nullopt; |
| 1970 | |
| 1971 | unsigned Idx = MinNumElts - 1; |
| 1972 | // Increment the index if extracting the element after the last active |
| 1973 | // predicate element. |
| 1974 | if (IsAfter) |
| 1975 | ++Idx; |
| 1976 | |
| 1977 | // Ignore extracts whose index is larger than the known minimum vector |
| 1978 | // length. NOTE: This is an artificial constraint where we prefer to |
| 1979 | // maintain what the user asked for until an alternative is proven faster. |
| 1980 | auto *PgVTy = cast<ScalableVectorType>(Val: Pg->getType()); |
| 1981 | if (Idx >= PgVTy->getMinNumElements()) |
| 1982 | return std::nullopt; |
| 1983 | |
| 1984 | // The intrinsic is extracting a fixed lane so use an extract instead. |
| 1985 | auto *IdxTy = Type::getInt64Ty(C&: II.getContext()); |
| 1986 | auto *Extract = ExtractElementInst::Create(Vec, Idx: ConstantInt::get(Ty: IdxTy, V: Idx));
| 1987 | Extract->insertBefore(InsertPos: II.getIterator()); |
| 1988 | Extract->takeName(V: &II); |
| 1989 | return IC.replaceInstUsesWith(I&: II, V: Extract); |
| 1990 | } |
| 1991 | |
| 1992 | static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC, |
| 1993 | IntrinsicInst &II) { |
| 1994 | // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar |
| 1995 | // integer variant across a variety of micro-architectures. Replace scalar |
| 1996 | // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple |
| 1997 | // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more |
| 1998 | // depending on the micro-architecture, but has been observed as generally |
| 1999 | // being faster, particularly when the CLAST[AB] op is a loop-carried |
| 2000 | // dependency. |
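|      | // Illustrative example for a 32-bit element type:
|      | //   clasta(pg, i32 fallback, <vscale x 4 x i32> v)
|      | //     => bitcast(clasta(pg, bitcast(fallback to float),
|      | //                bitcast(v to <vscale x 4 x float>)) to i32)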
| 2001 | Value *Pg = II.getArgOperand(i: 0); |
| 2002 | Value *Fallback = II.getArgOperand(i: 1); |
| 2003 | Value *Vec = II.getArgOperand(i: 2); |
| 2004 | Type *Ty = II.getType(); |
| 2005 | |
| 2006 | if (!Ty->isIntegerTy()) |
| 2007 | return std::nullopt; |
| 2008 | |
| 2009 | Type *FPTy; |
| 2010 | switch (cast<IntegerType>(Val: Ty)->getBitWidth()) { |
| 2011 | default: |
| 2012 | return std::nullopt; |
| 2013 | case 16: |
| 2014 | FPTy = IC.Builder.getHalfTy(); |
| 2015 | break; |
| 2016 | case 32: |
| 2017 | FPTy = IC.Builder.getFloatTy(); |
| 2018 | break; |
| 2019 | case 64: |
| 2020 | FPTy = IC.Builder.getDoubleTy(); |
| 2021 | break; |
| 2022 | } |
| 2023 | |
| 2024 | Value *FPFallBack = IC.Builder.CreateBitCast(V: Fallback, DestTy: FPTy); |
| 2025 | auto *FPVTy = VectorType::get( |
| 2026 | ElementType: FPTy, EC: cast<VectorType>(Val: Vec->getType())->getElementCount()); |
| 2027 | Value *FPVec = IC.Builder.CreateBitCast(V: Vec, DestTy: FPVTy); |
| 2028 | auto *FPII = IC.Builder.CreateIntrinsic( |
| 2029 | ID: II.getIntrinsicID(), Types: {FPVec->getType()}, Args: {Pg, FPFallBack, FPVec}); |
| 2030 | Value *FPIItoInt = IC.Builder.CreateBitCast(V: FPII, DestTy: II.getType()); |
| 2031 | return IC.replaceInstUsesWith(I&: II, V: FPIItoInt); |
| 2032 | } |
| 2033 | |
| 2034 | static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC, |
| 2035 | IntrinsicInst &II) { |
| 2036 | LLVMContext &Ctx = II.getContext(); |
| 2037 | // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr |
| 2038 | // can work with RDFFR_PP for ptest elimination. |
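|      | // Illustrative example: rdffr() => rdffr.z(ptrue(all)).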
| 2039 | auto *AllPat = |
| 2040 | ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: AArch64SVEPredPattern::all); |
| 2041 | auto *PTrue = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue, |
| 2042 | Types: {II.getType()}, Args: {AllPat}); |
| 2043 | auto *RDFFR = |
| 2044 | IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_rdffr_z, Args: {PTrue}); |
| 2045 | RDFFR->takeName(V: &II); |
| 2046 | return IC.replaceInstUsesWith(I&: II, V: RDFFR); |
| 2047 | } |
| 2048 | |
| 2049 | static std::optional<Instruction *> |
| 2050 | instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { |
| 2051 | const auto Pattern = cast<ConstantInt>(Val: II.getArgOperand(i: 0))->getZExtValue(); |
| 2052 | |
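|      | // Illustrative example: with the 'all' pattern the result is the full element
|      | // count, e.g. cntw (NumElts == 4) becomes vscale * 4.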
| 2053 | if (Pattern == AArch64SVEPredPattern::all) { |
| 2054 | Value *Cnt = IC.Builder.CreateElementCount( |
| 2055 | Ty: II.getType(), EC: ElementCount::getScalable(MinVal: NumElts)); |
| 2056 | Cnt->takeName(V: &II); |
| 2057 | return IC.replaceInstUsesWith(I&: II, V: Cnt); |
| 2058 | } |
| 2059 | |
| 2060 | unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern); |
| 2061 | |
| 2062 | return MinNumElts && NumElts >= MinNumElts |
| 2063 | ? std::optional<Instruction *>(IC.replaceInstUsesWith( |
| 2064 | I&: II, V: ConstantInt::get(Ty: II.getType(), V: MinNumElts))) |
| 2065 | : std::nullopt; |
| 2066 | } |
| 2067 | |
| 2068 | static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, |
| 2069 | IntrinsicInst &II) { |
| 2070 | Value *PgVal = II.getArgOperand(i: 0); |
| 2071 | Value *OpVal = II.getArgOperand(i: 1); |
| 2072 | |
| 2073 | // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X). |
| 2074 | // Later optimizations prefer this form. |
| 2075 | if (PgVal == OpVal && |
| 2076 | (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first || |
| 2077 | II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) { |
| 2078 | Value *Ops[] = {PgVal, OpVal}; |
| 2079 | Type *Tys[] = {PgVal->getType()}; |
| 2080 | |
| 2081 | auto *PTest = |
| 2082 | IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptest_any, Types: Tys, Args: Ops); |
| 2083 | PTest->takeName(V: &II); |
| 2084 | |
| 2085 | return IC.replaceInstUsesWith(I&: II, V: PTest); |
| 2086 | } |
| 2087 | |
| 2088 | IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(Val: PgVal); |
| 2089 | IntrinsicInst *Op = dyn_cast<IntrinsicInst>(Val: OpVal); |
| 2090 | |
| 2091 | if (!Pg || !Op) |
| 2092 | return std::nullopt; |
| 2093 | |
| 2094 | Intrinsic::ID OpIID = Op->getIntrinsicID(); |
| 2095 | |
| 2096 | if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && |
| 2097 | OpIID == Intrinsic::aarch64_sve_convert_to_svbool && |
| 2098 | Pg->getArgOperand(i: 0)->getType() == Op->getArgOperand(i: 0)->getType()) { |
| 2099 | Value *Ops[] = {Pg->getArgOperand(i: 0), Op->getArgOperand(i: 0)}; |
| 2100 | Type *Tys[] = {Pg->getArgOperand(i: 0)->getType()}; |
| 2101 | |
| 2102 | auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: Tys, Args: Ops); |
| 2103 | |
| 2104 | PTest->takeName(V: &II); |
| 2105 | return IC.replaceInstUsesWith(I&: II, V: PTest); |
| 2106 | } |
| 2107 | |
| 2108 | // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X).
| 2109 | // Later optimizations may rewrite sequence to use the flag-setting variant |
| 2110 | // of instruction X to remove PTEST. |
| 2111 | if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) && |
| 2112 | ((OpIID == Intrinsic::aarch64_sve_brka_z) || |
| 2113 | (OpIID == Intrinsic::aarch64_sve_brkb_z) || |
| 2114 | (OpIID == Intrinsic::aarch64_sve_brkpa_z) || |
| 2115 | (OpIID == Intrinsic::aarch64_sve_brkpb_z) || |
| 2116 | (OpIID == Intrinsic::aarch64_sve_rdffr_z) || |
| 2117 | (OpIID == Intrinsic::aarch64_sve_and_z) || |
| 2118 | (OpIID == Intrinsic::aarch64_sve_bic_z) || |
| 2119 | (OpIID == Intrinsic::aarch64_sve_eor_z) || |
| 2120 | (OpIID == Intrinsic::aarch64_sve_nand_z) || |
| 2121 | (OpIID == Intrinsic::aarch64_sve_nor_z) || |
| 2122 | (OpIID == Intrinsic::aarch64_sve_orn_z) || |
| 2123 | (OpIID == Intrinsic::aarch64_sve_orr_z))) { |
| 2124 | Value *Ops[] = {Pg->getArgOperand(i: 0), Pg}; |
| 2125 | Type *Tys[] = {Pg->getType()}; |
| 2126 | |
| 2127 | auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: Tys, Args: Ops); |
| 2128 | PTest->takeName(V: &II); |
| 2129 | |
| 2130 | return IC.replaceInstUsesWith(I&: II, V: PTest); |
| 2131 | } |
| 2132 | |
| 2133 | return std::nullopt; |
| 2134 | } |
| 2135 | |
| 2136 | template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc> |
| 2137 | static std::optional<Instruction *> |
| 2138 | instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, |
| 2139 | bool MergeIntoAddendOp) { |
| 2140 | Value *P = II.getOperand(i_nocapture: 0); |
| 2141 | Value *MulOp0, *MulOp1, *AddendOp, *Mul; |
| 2142 | if (MergeIntoAddendOp) { |
| 2143 | AddendOp = II.getOperand(i_nocapture: 1); |
| 2144 | Mul = II.getOperand(i_nocapture: 2); |
| 2145 | } else { |
| 2146 | AddendOp = II.getOperand(i_nocapture: 2); |
| 2147 | Mul = II.getOperand(i_nocapture: 1); |
| 2148 | } |
| 2149 | |
| 2150 | if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(V: P), m_Value(V&: MulOp0), |
| 2151 | m_Value(V&: MulOp1)))) |
| 2152 | return std::nullopt; |
| 2153 | |
| 2154 | if (!Mul->hasOneUse()) |
| 2155 | return std::nullopt; |
| 2156 | |
| 2157 | Instruction *FMFSource = nullptr; |
| 2158 | if (II.getType()->isFPOrFPVectorTy()) { |
| 2159 | llvm::FastMathFlags FAddFlags = II.getFastMathFlags(); |
| 2160 | // Stop the combine when the flags on the inputs differ in case dropping |
| 2161 | // flags would lead to us missing out on more beneficial optimizations. |
| 2162 | if (FAddFlags != cast<CallInst>(Val: Mul)->getFastMathFlags()) |
| 2163 | return std::nullopt; |
| 2164 | if (!FAddFlags.allowContract()) |
| 2165 | return std::nullopt; |
| 2166 | FMFSource = &II; |
| 2167 | } |
| 2168 | |
| 2169 | CallInst *Res; |
| 2170 | if (MergeIntoAddendOp) |
| 2171 | Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, Types: {II.getType()}, |
| 2172 | Args: {P, AddendOp, MulOp0, MulOp1}, FMFSource); |
| 2173 | else |
| 2174 | Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, Types: {II.getType()}, |
| 2175 | Args: {P, MulOp0, MulOp1, AddendOp}, FMFSource); |
| 2176 | |
| 2177 | return IC.replaceInstUsesWith(I&: II, V: Res); |
| 2178 | } |
| 2179 | |
| 2180 | static std::optional<Instruction *> |
| 2181 | instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { |
| 2182 | Value *Pred = II.getOperand(i_nocapture: 0); |
| 2183 | Value *PtrOp = II.getOperand(i_nocapture: 1); |
| 2184 | Type *VecTy = II.getType(); |
| 2185 | |
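|      | // Illustrative example: ld1(ptrue, ptr) is just an unpredicated vector load;
|      | // any other predicate becomes a masked load with a zeroinitializer passthru.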
| 2186 | if (isAllActivePredicate(Pred)) { |
| 2187 | LoadInst *Load = IC.Builder.CreateLoad(Ty: VecTy, Ptr: PtrOp); |
| 2188 | Load->copyMetadata(SrcInst: II); |
| 2189 | return IC.replaceInstUsesWith(I&: II, V: Load); |
| 2190 | } |
| 2191 | |
| 2192 | CallInst *MaskedLoad = |
| 2193 | IC.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL), |
| 2194 | Mask: Pred, PassThru: ConstantAggregateZero::get(Ty: VecTy)); |
| 2195 | MaskedLoad->copyMetadata(SrcInst: II); |
| 2196 | return IC.replaceInstUsesWith(I&: II, V: MaskedLoad); |
| 2197 | } |
| 2198 | |
| 2199 | static std::optional<Instruction *> |
| 2200 | instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { |
| 2201 | Value *VecOp = II.getOperand(i_nocapture: 0); |
| 2202 | Value *Pred = II.getOperand(i_nocapture: 1); |
| 2203 | Value *PtrOp = II.getOperand(i_nocapture: 2); |
| 2204 | |
| 2205 | if (isAllActivePredicate(Pred)) { |
| 2206 | StoreInst *Store = IC.Builder.CreateStore(Val: VecOp, Ptr: PtrOp); |
| 2207 | Store->copyMetadata(SrcInst: II); |
| 2208 | return IC.eraseInstFromFunction(I&: II); |
| 2209 | } |
| 2210 | |
| 2211 | CallInst *MaskedStore = IC.Builder.CreateMaskedStore( |
| 2212 | Val: VecOp, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL), Mask: Pred); |
| 2213 | MaskedStore->copyMetadata(SrcInst: II); |
| 2214 | return IC.eraseInstFromFunction(I&: II); |
| 2215 | } |
| 2216 | |
| 2217 | static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) { |
| 2218 | switch (Intrinsic) { |
| 2219 | case Intrinsic::aarch64_sve_fmul_u: |
| 2220 | return Instruction::BinaryOps::FMul; |
| 2221 | case Intrinsic::aarch64_sve_fadd_u: |
| 2222 | return Instruction::BinaryOps::FAdd; |
| 2223 | case Intrinsic::aarch64_sve_fsub_u: |
| 2224 | return Instruction::BinaryOps::FSub; |
| 2225 | default: |
| 2226 | return Instruction::BinaryOpsEnd; |
| 2227 | } |
| 2228 | } |
| 2229 | |
| 2230 | static std::optional<Instruction *> |
| 2231 | instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) { |
| 2232 | // Bail due to missing support for ISD::STRICT_ scalable vector operations. |
| 2233 | if (II.isStrictFP()) |
| 2234 | return std::nullopt; |
| 2235 | |
| 2236 | auto *OpPredicate = II.getOperand(i_nocapture: 0); |
| 2237 | auto BinOpCode = intrinsicIDToBinOpCode(Intrinsic: II.getIntrinsicID()); |
| 2238 | if (BinOpCode == Instruction::BinaryOpsEnd || |
| 2239 | !isAllActivePredicate(Pred: OpPredicate)) |
| 2240 | return std::nullopt; |
| 2241 | auto BinOp = IC.Builder.CreateBinOpFMF( |
| 2242 | Opc: BinOpCode, LHS: II.getOperand(i_nocapture: 1), RHS: II.getOperand(i_nocapture: 2), FMFSource: II.getFastMathFlags()); |
| 2243 | return IC.replaceInstUsesWith(I&: II, V: BinOp); |
| 2244 | } |
| 2245 | |
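|      | // Illustrative examples of the fusions attempted below:
|      | //   add(pg, a, mul(pg, b, c)) => mla(pg, a, b, c)
|      | //   add(pg, mul(pg, a, b), c) => mad(pg, a, b, c)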
| 2246 | static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC, |
| 2247 | IntrinsicInst &II) { |
| 2248 | if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, |
| 2249 | Intrinsic::aarch64_sve_mla>( |
| 2250 | IC, II, MergeIntoAddendOp: true)) |
| 2251 | return MLA; |
| 2252 | if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, |
| 2253 | Intrinsic::aarch64_sve_mad>( |
| 2254 | IC, II, MergeIntoAddendOp: false)) |
| 2255 | return MAD; |
| 2256 | return std::nullopt; |
| 2257 | } |
| 2258 | |
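|      | // Illustrative example (the fast-math flags must match and allow contraction):
|      | //   fadd(pg, a, fmul(pg, b, c)) => fmla(pg, a, b, c)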
| 2259 | static std::optional<Instruction *> |
| 2260 | instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) { |
| 2261 | if (auto FMLA = |
| 2262 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
| 2263 | Intrinsic::aarch64_sve_fmla>(IC, II, |
| 2264 | MergeIntoAddendOp: true)) |
| 2265 | return FMLA; |
| 2266 | if (auto FMAD = |
| 2267 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
| 2268 | Intrinsic::aarch64_sve_fmad>(IC, II, |
| 2269 | MergeIntoAddendOp: false)) |
| 2270 | return FMAD; |
| 2271 | if (auto FMLA = |
| 2272 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, |
| 2273 | Intrinsic::aarch64_sve_fmla>(IC, II, |
| 2274 | MergeIntoAddendOp: true)) |
| 2275 | return FMLA; |
| 2276 | return std::nullopt; |
| 2277 | } |
| 2278 | |
| 2279 | static std::optional<Instruction *> |
| 2280 | instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) { |
| 2281 | if (auto FMLA = |
| 2282 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
| 2283 | Intrinsic::aarch64_sve_fmla>(IC, II, |
| 2284 | MergeIntoAddendOp: true)) |
| 2285 | return FMLA; |
| 2286 | if (auto FMAD = |
| 2287 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
| 2288 | Intrinsic::aarch64_sve_fmad>(IC, II, |
| 2289 | MergeIntoAddendOp: false)) |
| 2290 | return FMAD; |
| 2291 | if (auto FMLA_U = |
| 2292 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, |
| 2293 | Intrinsic::aarch64_sve_fmla_u>( |
| 2294 | IC, II, MergeIntoAddendOp: true)) |
| 2295 | return FMLA_U; |
| 2296 | return instCombineSVEVectorBinOp(IC, II); |
| 2297 | } |
| 2298 | |
| 2299 | static std::optional<Instruction *> |
| 2300 | instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) { |
| 2301 | if (auto FMLS = |
| 2302 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
| 2303 | Intrinsic::aarch64_sve_fmls>(IC, II, |
| 2304 | MergeIntoAddendOp: true)) |
| 2305 | return FMLS; |
| 2306 | if (auto FMSB = |
| 2307 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
| 2308 | Intrinsic::aarch64_sve_fnmsb>( |
| 2309 | IC, II, MergeIntoAddendOp: false)) |
| 2310 | return FMSB; |
| 2311 | if (auto FMLS = |
| 2312 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, |
| 2313 | Intrinsic::aarch64_sve_fmls>(IC, II, |
| 2314 | MergeIntoAddendOp: true)) |
| 2315 | return FMLS; |
| 2316 | return std::nullopt; |
| 2317 | } |
| 2318 | |
| 2319 | static std::optional<Instruction *> |
| 2320 | instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) { |
| 2321 | if (auto FMLS = |
| 2322 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
| 2323 | Intrinsic::aarch64_sve_fmls>(IC, II, |
| 2324 | MergeIntoAddendOp: true)) |
| 2325 | return FMLS; |
| 2326 | if (auto FMSB = |
| 2327 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
| 2328 | Intrinsic::aarch64_sve_fnmsb>( |
| 2329 | IC, II, MergeIntoAddendOp: false)) |
| 2330 | return FMSB; |
| 2331 | if (auto FMLS_U = |
| 2332 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, |
| 2333 | Intrinsic::aarch64_sve_fmls_u>( |
| 2334 | IC, II, MergeIntoAddendOp: true)) |
| 2335 | return FMLS_U; |
| 2336 | return instCombineSVEVectorBinOp(IC, II); |
| 2337 | } |
| 2338 | |
| 2339 | static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC, |
| 2340 | IntrinsicInst &II) { |
| 2341 | if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, |
| 2342 | Intrinsic::aarch64_sve_mls>( |
| 2343 | IC, II, MergeIntoAddendOp: true)) |
| 2344 | return MLS; |
| 2345 | return std::nullopt; |
| 2346 | } |
| 2347 | |
| 2348 | static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC, |
| 2349 | IntrinsicInst &II) { |
| 2350 | Value *UnpackArg = II.getArgOperand(i: 0); |
| 2351 | auto *RetTy = cast<ScalableVectorType>(Val: II.getType()); |
| 2352 | bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi || |
| 2353 | II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo; |
| 2354 | |
| 2355 | // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X)) |
| 2356 | // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X)) |
| 2357 | if (auto *ScalarArg = getSplatValue(V: UnpackArg)) { |
| 2358 | ScalarArg = |
| 2359 | IC.Builder.CreateIntCast(V: ScalarArg, DestTy: RetTy->getScalarType(), isSigned: IsSigned); |
| 2360 | Value *NewVal = |
| 2361 | IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(), V: ScalarArg); |
| 2362 | NewVal->takeName(V: &II); |
| 2363 | return IC.replaceInstUsesWith(I&: II, V: NewVal); |
| 2364 | } |
| 2365 | |
| 2366 | return std::nullopt; |
| 2367 | } |
| 2368 | static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC, |
| 2369 | IntrinsicInst &II) { |
| 2370 | auto *OpVal = II.getOperand(i_nocapture: 0); |
| 2371 | auto *OpIndices = II.getOperand(i_nocapture: 1); |
| 2372 | VectorType *VTy = cast<VectorType>(Val: II.getType()); |
| 2373 | |
| 2374 | // Check whether OpIndices is a constant splat value < minimal element count |
| 2375 | // of result. |
| 2376 | auto *SplatValue = dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: OpIndices)); |
| 2377 | if (!SplatValue || |
| 2378 | SplatValue->getValue().uge(RHS: VTy->getElementCount().getKnownMinValue())) |
| 2379 | return std::nullopt; |
| 2380 | |
| 2381 | // Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to
| 2382 | // splat_vector(extractelement(OpVal, SplatValue)) for further optimization. |
| 2383 | auto *Extract = IC.Builder.CreateExtractElement(Vec: OpVal, Idx: SplatValue);
| 2384 | auto *VectorSplat = |
| 2385 | IC.Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Extract); |
| 2386 | |
| 2387 | VectorSplat->takeName(V: &II); |
| 2388 | return IC.replaceInstUsesWith(I&: II, V: VectorSplat); |
| 2389 | } |
| 2390 | |
| 2391 | static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC, |
| 2392 | IntrinsicInst &II) { |
| 2393 | Value *A, *B; |
| 2394 | Type *RetTy = II.getType(); |
| 2395 | constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool; |
| 2396 | constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool; |
| 2397 | |
| 2398 | // uzp1(to_svbool(A), to_svbool(B)) --> <A, B> |
| 2399 | // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B> |
| 2400 | if ((match(V: II.getArgOperand(i: 0), |
| 2401 | P: m_Intrinsic<FromSVB>(Op0: m_Intrinsic<ToSVB>(Op0: m_Value(V&: A)))) && |
| 2402 | match(V: II.getArgOperand(i: 1), |
| 2403 | P: m_Intrinsic<FromSVB>(Op0: m_Intrinsic<ToSVB>(Op0: m_Value(V&: B))))) || |
| 2404 | (match(V: II.getArgOperand(i: 0), P: m_Intrinsic<ToSVB>(Op0: m_Value(V&: A))) && |
| 2405 | match(V: II.getArgOperand(i: 1), P: m_Intrinsic<ToSVB>(Op0: m_Value(V&: B))))) { |
| 2406 | auto *TyA = cast<ScalableVectorType>(Val: A->getType()); |
| 2407 | if (TyA == B->getType() && |
| 2408 | RetTy == ScalableVectorType::getDoubleElementsVectorType(VTy: TyA)) { |
| 2409 | auto *SubVec = IC.Builder.CreateInsertVector( |
| 2410 | DstType: RetTy, SrcVec: PoisonValue::get(T: RetTy), SubVec: A, Idx: uint64_t(0)); |
| 2411 | auto *ConcatVec = IC.Builder.CreateInsertVector(DstType: RetTy, SrcVec: SubVec, SubVec: B, |
| 2412 | Idx: TyA->getMinNumElements()); |
| 2413 | ConcatVec->takeName(V: &II); |
| 2414 | return IC.replaceInstUsesWith(I&: II, V: ConcatVec); |
| 2415 | } |
| 2416 | } |
| 2417 | |
| 2418 | return std::nullopt; |
| 2419 | } |
| 2420 | |
| 2421 | static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC, |
| 2422 | IntrinsicInst &II) { |
| 2423 | // zip1(uzp1(A, B), uzp2(A, B)) --> A |
| 2424 | // zip2(uzp1(A, B), uzp2(A, B)) --> B |
| 2425 | Value *A, *B; |
| 2426 | if (match(V: II.getArgOperand(i: 0), |
| 2427 | P: m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(Op0: m_Value(V&: A), Op1: m_Value(V&: B))) && |
| 2428 | match(V: II.getArgOperand(i: 1), P: m_Intrinsic<Intrinsic::aarch64_sve_uzp2>( |
| 2429 | Op0: m_Specific(V: A), Op1: m_Specific(V: B)))) |
| 2430 | return IC.replaceInstUsesWith( |
| 2431 | I&: II, V: (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B)); |
| 2432 | |
| 2433 | return std::nullopt; |
| 2434 | } |
| 2435 | |
| 2436 | static std::optional<Instruction *> |
| 2437 | instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) { |
| 2438 | Value *Mask = II.getOperand(i_nocapture: 0); |
| 2439 | Value *BasePtr = II.getOperand(i_nocapture: 1); |
| 2440 | Value *Index = II.getOperand(i_nocapture: 2); |
| 2441 | Type *Ty = II.getType(); |
| 2442 | Value *PassThru = ConstantAggregateZero::get(Ty); |
| 2443 | |
| 2444 | // Contiguous gather => masked load. |
| 2445 | // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1)) |
| 2446 | // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer) |
| 2447 | Value *IndexBase; |
| 2448 | if (match(V: Index, P: m_Intrinsic<Intrinsic::aarch64_sve_index>( |
| 2449 | Op0: m_Value(V&: IndexBase), Op1: m_SpecificInt(V: 1)))) { |
| 2450 | Align Alignment = |
| 2451 | BasePtr->getPointerAlignment(DL: II.getDataLayout()); |
| 2452 | |
| 2453 | Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(), |
| 2454 | Ptr: BasePtr, IdxList: IndexBase); |
| 2455 | CallInst *MaskedLoad = |
| 2456 | IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru); |
| 2457 | MaskedLoad->takeName(V: &II); |
| 2458 | return IC.replaceInstUsesWith(I&: II, V: MaskedLoad); |
| 2459 | } |
| 2460 | |
| 2461 | return std::nullopt; |
| 2462 | } |
| 2463 | |
| 2464 | static std::optional<Instruction *> |
| 2465 | instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) { |
| 2466 | Value *Val = II.getOperand(i_nocapture: 0); |
| 2467 | Value *Mask = II.getOperand(i_nocapture: 1); |
| 2468 | Value *BasePtr = II.getOperand(i_nocapture: 2); |
| 2469 | Value *Index = II.getOperand(i_nocapture: 3); |
| 2470 | Type *Ty = Val->getType(); |
| 2471 | |
| 2472 | // Contiguous scatter => masked store. |
| 2473 | // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1)) |
| 2474 | // => (masked.store Value (gep BasePtr IndexBase) Align Mask) |
| 2475 | Value *IndexBase; |
| 2476 | if (match(V: Index, P: m_Intrinsic<Intrinsic::aarch64_sve_index>( |
| 2477 | Op0: m_Value(V&: IndexBase), Op1: m_SpecificInt(V: 1)))) { |
| 2478 | Align Alignment = |
| 2479 | BasePtr->getPointerAlignment(DL: II.getDataLayout()); |
| 2480 | |
| 2481 | Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(), |
| 2482 | Ptr: BasePtr, IdxList: IndexBase); |
| 2483 | (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask); |
| 2484 | |
| 2485 | return IC.eraseInstFromFunction(I&: II); |
| 2486 | } |
| 2487 | |
| 2488 | return std::nullopt; |
| 2489 | } |
| 2490 | |
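| | // Fold an sve.sdiv whose divisor is a splatted constant, e.g. (illustrative): |
| | //   sdiv(pg, x, splat(8))  --> asrd(pg, x, 3) |
| | //   sdiv(pg, x, splat(-8)) --> neg(asrd(pg, x, 3)) |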
| 2491 | static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC, |
| 2492 | IntrinsicInst &II) { |
| 2493 | Type *Int32Ty = IC.Builder.getInt32Ty(); |
| 2494 | Value *Pred = II.getOperand(i_nocapture: 0); |
| 2495 | Value *Vec = II.getOperand(i_nocapture: 1); |
| 2496 | Value *DivVec = II.getOperand(i_nocapture: 2); |
| 2497 | |
| 2498 | Value *SplatValue = getSplatValue(V: DivVec); |
| 2499 | ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(Val: SplatValue); |
| 2500 | if (!SplatConstantInt) |
| 2501 | return std::nullopt; |
| 2502 | |
| 2503 | APInt Divisor = SplatConstantInt->getValue(); |
| 2504 | const int64_t DivisorValue = Divisor.getSExtValue(); |
| 2505 | if (DivisorValue == -1) |
| 2506 | return std::nullopt; |
| 2507 | if (DivisorValue == 1) |
| 2508 | IC.replaceInstUsesWith(I&: II, V: Vec); |
| 2509 | |
| 2510 | if (Divisor.isPowerOf2()) { |
| 2511 | Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2()); |
| 2512 | auto ASRD = IC.Builder.CreateIntrinsic( |
| 2513 | ID: Intrinsic::aarch64_sve_asrd, Types: {II.getType()}, Args: {Pred, Vec, DivisorLog2}); |
| 2514 | return IC.replaceInstUsesWith(I&: II, V: ASRD); |
| 2515 | } |
| 2516 | if (Divisor.isNegatedPowerOf2()) { |
| 2517 | Divisor.negate(); |
| 2518 | Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2()); |
| 2519 | auto ASRD = IC.Builder.CreateIntrinsic( |
| 2520 | ID: Intrinsic::aarch64_sve_asrd, Types: {II.getType()}, Args: {Pred, Vec, DivisorLog2}); |
| 2521 | auto NEG = IC.Builder.CreateIntrinsic( |
| 2522 | ID: Intrinsic::aarch64_sve_neg, Types: {ASRD->getType()}, Args: {ASRD, Pred, ASRD}); |
| 2523 | return IC.replaceInstUsesWith(I&: II, V: NEG); |
| 2524 | } |
| 2525 | |
| 2526 | return std::nullopt; |
| 2527 | } |
| 2528 | |
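| | // Check whether Vec is a repeating pattern by recursively halving it, e.g. |
| | // (A, B, A, B) simplifies to (A, B). Null entries stand for poison lanes and, |
| | // when AllowPoison is set, may be filled in from the matching lane. |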
| 2529 | static bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) { |
| 2530 | size_t VecSize = Vec.size(); |
| 2531 | if (VecSize == 1) |
| 2532 | return true; |
| 2533 | if (!isPowerOf2_64(Value: VecSize)) |
| 2534 | return false; |
| 2535 | size_t HalfVecSize = VecSize / 2; |
| 2536 | |
| 2537 | for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize; |
| 2538 | RHS != Vec.end(); LHS++, RHS++) { |
| 2539 | if (*LHS != nullptr && *RHS != nullptr) { |
| 2540 | if (*LHS == *RHS) |
| 2541 | continue; |
| 2542 | else |
| 2543 | return false; |
| 2544 | } |
| 2545 | if (!AllowPoison) |
| 2546 | return false; |
| 2547 | if (*LHS == nullptr && *RHS != nullptr) |
| 2548 | *LHS = *RHS; |
| 2549 | } |
| 2550 | |
| 2551 | Vec.resize(N: HalfVecSize); |
| 2552 | SimplifyValuePattern(Vec, AllowPoison); |
| 2553 | return true; |
| 2554 | } |
| 2555 | |
| 2556 | // Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B) |
| 2557 | // to dupqlane(f64(C)) where C is A concatenated with B |
| 2558 | static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC, |
| 2559 | IntrinsicInst &II) { |
| 2560 | Value *CurrentInsertElt = nullptr, *Default = nullptr; |
| 2561 | if (!match(V: II.getOperand(i_nocapture: 0), |
| 2562 | P: m_Intrinsic<Intrinsic::vector_insert>( |
| 2563 | Op0: m_Value(V&: Default), Op1: m_Value(V&: CurrentInsertElt), Op2: m_Value())) || |
| 2564 | !isa<FixedVectorType>(Val: CurrentInsertElt->getType())) |
| 2565 | return std::nullopt; |
| 2566 | auto IIScalableTy = cast<ScalableVectorType>(Val: II.getType()); |
| 2567 | |
| 2568 | // Insert the scalars into a container ordered by InsertElement index |
| 2569 | SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr); |
| 2570 | while (auto InsertElt = dyn_cast<InsertElementInst>(Val: CurrentInsertElt)) { |
| 2571 | auto Idx = cast<ConstantInt>(Val: InsertElt->getOperand(i_nocapture: 2)); |
| 2572 | Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(i_nocapture: 1); |
| 2573 | CurrentInsertElt = InsertElt->getOperand(i_nocapture: 0); |
| 2574 | } |
| 2575 | |
| 2576 | bool AllowPoison = |
| 2577 | isa<PoisonValue>(Val: CurrentInsertElt) && isa<PoisonValue>(Val: Default); |
| 2578 | if (!SimplifyValuePattern(Vec&: Elts, AllowPoison)) |
| 2579 | return std::nullopt; |
| 2580 | |
| 2581 | // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b) |
| 2582 | Value *InsertEltChain = PoisonValue::get(T: CurrentInsertElt->getType()); |
| 2583 | for (size_t I = 0; I < Elts.size(); I++) { |
| 2584 | if (Elts[I] == nullptr) |
| 2585 | continue; |
| 2586 | InsertEltChain = IC.Builder.CreateInsertElement(Vec: InsertEltChain, NewElt: Elts[I], |
| 2587 | Idx: IC.Builder.getInt64(C: I)); |
| 2588 | } |
| 2589 | if (InsertEltChain == nullptr) |
| 2590 | return std::nullopt; |
| 2591 | |
| 2592 | // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64 |
| 2593 | // value or (f16 a, f16 b) as one i32 value. This requires the InsertSubvector |
| 2594 | // to be bitcast to a type wide enough to fit the sequence, splatted, and then |
| 2595 | // narrowed back to the original type. |
| 2596 | unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size(); |
| 2597 | unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() * |
| 2598 | IIScalableTy->getMinNumElements() / |
| 2599 | PatternWidth; |
| 2600 | |
| 2601 | IntegerType *WideTy = IC.Builder.getIntNTy(N: PatternWidth); |
| 2602 | auto *WideScalableTy = ScalableVectorType::get(ElementType: WideTy, MinNumElts: PatternElementCount); |
| 2603 | auto *WideShuffleMaskTy = |
| 2604 | ScalableVectorType::get(ElementType: IC.Builder.getInt32Ty(), MinNumElts: PatternElementCount); |
| 2605 | |
| 2606 | auto InsertSubvector = IC.Builder.CreateInsertVector( |
| 2607 | DstType: II.getType(), SrcVec: PoisonValue::get(T: II.getType()), SubVec: InsertEltChain, |
| 2608 | Idx: uint64_t(0)); |
| 2609 | auto WideBitcast = |
| 2610 | IC.Builder.CreateBitOrPointerCast(V: InsertSubvector, DestTy: WideScalableTy); |
| 2611 | auto WideShuffleMask = ConstantAggregateZero::get(Ty: WideShuffleMaskTy); |
| 2612 | auto WideShuffle = IC.Builder.CreateShuffleVector( |
| 2613 | V1: WideBitcast, V2: PoisonValue::get(T: WideScalableTy), Mask: WideShuffleMask); |
| 2614 | auto NarrowBitcast = |
| 2615 | IC.Builder.CreateBitOrPointerCast(V: WideShuffle, DestTy: II.getType()); |
| 2616 | |
| 2617 | return IC.replaceInstUsesWith(I&: II, V: NarrowBitcast); |
| 2618 | } |
| 2619 | |
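| | // fmaxnm(x, x) and fminnm(x, x) both simplify to x. |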
| 2620 | static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC, |
| 2621 | IntrinsicInst &II) { |
| 2622 | Value *A = II.getArgOperand(i: 0); |
| 2623 | Value *B = II.getArgOperand(i: 1); |
| 2624 | if (A == B) |
| 2625 | return IC.replaceInstUsesWith(I&: II, V: A); |
| 2626 | |
| 2627 | return std::nullopt; |
| 2628 | } |
| 2629 | |
| 2630 | static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC, |
| 2631 | IntrinsicInst &II) { |
| 2632 | Value *Pred = II.getOperand(i_nocapture: 0); |
| 2633 | Value *Vec = II.getOperand(i_nocapture: 1); |
| 2634 | Value *Shift = II.getOperand(i_nocapture: 2); |
| 2635 | |
| 2636 | // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic. |
| 2637 | Value *AbsPred, *MergedValue; |
| 2638 | if (!match(V: Vec, P: m_Intrinsic<Intrinsic::aarch64_sve_sqabs>( |
| 2639 | Op0: m_Value(V&: MergedValue), Op1: m_Value(V&: AbsPred), Op2: m_Value())) && |
| 2640 | !match(V: Vec, P: m_Intrinsic<Intrinsic::aarch64_sve_abs>( |
| 2641 | Op0: m_Value(V&: MergedValue), Op1: m_Value(V&: AbsPred), Op2: m_Value()))) |
| 2643 | return std::nullopt; |
| 2644 | |
| 2645 | // Transform is valid if any of the following are true: |
| 2646 | // * The ABS merge value is an undef or non-negative |
| 2647 | // * The ABS predicate is all active |
| 2648 | // * The ABS predicate and the SRSHL predicates are the same |
| 2649 | if (!isa<UndefValue>(Val: MergedValue) && !match(V: MergedValue, P: m_NonNegative()) && |
| 2650 | AbsPred != Pred && !isAllActivePredicate(Pred: AbsPred)) |
| 2651 | return std::nullopt; |
| 2652 | |
| 2653 | // Only valid when the shift amount is non-negative, otherwise the rounding |
| 2654 | // behaviour of SRSHL cannot be ignored. |
| 2655 | if (!match(V: Shift, P: m_NonNegative())) |
| 2656 | return std::nullopt; |
| 2657 | |
| 2658 | auto LSL = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_lsl, |
| 2659 | Types: {II.getType()}, Args: {Pred, Vec, Shift}); |
| 2660 | |
| 2661 | return IC.replaceInstUsesWith(I&: II, V: LSL); |
| 2662 | } |
| 2663 | |
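| | // insr(splat(X), X) leaves every lane unchanged, so fold it to the splat. |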
| 2664 | static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC, |
| 2665 | IntrinsicInst &II) { |
| 2666 | Value *Vec = II.getOperand(i_nocapture: 0); |
| 2667 | |
| 2668 | if (getSplatValue(V: Vec) == II.getOperand(i_nocapture: 1)) |
| 2669 | return IC.replaceInstUsesWith(I&: II, V: Vec); |
| 2670 | |
| 2671 | return std::nullopt; |
| 2672 | } |
| 2673 | |
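| | // Drop a dmb that is followed, within a small lookahead window and ignoring |
| | // instructions that neither touch memory nor have other side effects, by an |
| | // identical dmb; e.g. two back-to-back "dmb ish" barriers collapse to one. |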
| 2674 | static std::optional<Instruction *> instCombineDMB(InstCombiner &IC, |
| 2675 | IntrinsicInst &II) { |
| 2676 | // If this barrier is post-dominated by an identical one, we can remove it. |
| 2677 | auto *NI = II.getNextNonDebugInstruction(); |
| 2678 | unsigned LookaheadThreshold = DMBLookaheadThreshold; |
| 2679 | auto CanSkipOver = [](Instruction *I) { |
| 2680 | return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects(); |
| 2681 | }; |
| 2682 | while (LookaheadThreshold-- && CanSkipOver(NI)) { |
| 2683 | auto *NIBB = NI->getParent(); |
| 2684 | NI = NI->getNextNonDebugInstruction(); |
| 2685 | if (!NI) { |
| 2686 | if (auto *SuccBB = NIBB->getUniqueSuccessor()) |
| 2687 | NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime(); |
| 2688 | else |
| 2689 | break; |
| 2690 | } |
| 2691 | } |
| 2692 | auto *NextII = dyn_cast_or_null<IntrinsicInst>(Val: NI); |
| 2693 | if (NextII && II.isIdenticalTo(I: NextII)) |
| 2694 | return IC.eraseInstFromFunction(I&: II); |
| 2695 | |
| 2696 | return std::nullopt; |
| 2697 | } |
| 2698 | |
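| | // ptrue with the "all" pattern is simply an all-true predicate constant. |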
| 2699 | static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC, |
| 2700 | IntrinsicInst &II) { |
| 2701 | if (match(V: II.getOperand(i_nocapture: 0), P: m_ConstantInt<AArch64SVEPredPattern::all>())) |
| 2702 | return IC.replaceInstUsesWith(I&: II, V: Constant::getAllOnesValue(Ty: II.getType())); |
| 2703 | return std::nullopt; |
| 2704 | } |
| 2705 | |
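| | // uxtb/uxth/uxtw keep only the low NumBits of each element, so with an undef |
| | // passthru or an all-active predicate they reduce to a predicated AND with the |
| | // matching low-bit mask, e.g. (illustrative): |
| | //   uxtb(undef, pg, x) --> and_u(pg, x, splat(0xff)) |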
| 2706 | static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC, |
| 2707 | IntrinsicInst &II, |
| 2708 | unsigned NumBits) { |
| 2709 | Value *Passthru = II.getOperand(i_nocapture: 0); |
| 2710 | Value *Pg = II.getOperand(i_nocapture: 1); |
| 2711 | Value *Op = II.getOperand(i_nocapture: 2); |
| 2712 | |
| 2713 | // Convert UXT[BHW] to AND. |
| 2714 | if (isa<UndefValue>(Val: Passthru) || isAllActivePredicate(Pred: Pg)) { |
| 2715 | auto *Ty = cast<VectorType>(Val: II.getType()); |
| 2716 | auto MaskValue = APInt::getLowBitsSet(numBits: Ty->getScalarSizeInBits(), loBitsSet: NumBits); |
| 2717 | auto *Mask = ConstantInt::get(Ty, V: MaskValue); |
| 2718 | auto *And = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_and_u, Types: {Ty}, |
| 2719 | Args: {Pg, Op, Mask}); |
| 2720 | return IC.replaceInstUsesWith(I&: II, V: And); |
| 2721 | } |
| 2722 | |
| 2723 | return std::nullopt; |
| 2724 | } |
| 2725 | |
| 2726 | std::optional<Instruction *> |
| 2727 | AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, |
| 2728 | IntrinsicInst &II) const { |
| 2729 | const SVEIntrinsicInfo &IInfo = constructSVEIntrinsicInfo(II); |
| 2730 | if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo)) |
| 2731 | return I; |
| 2732 | |
| 2733 | Intrinsic::ID IID = II.getIntrinsicID(); |
| 2734 | switch (IID) { |
| 2735 | default: |
| 2736 | break; |
| 2737 | case Intrinsic::aarch64_dmb: |
| 2738 | return instCombineDMB(IC, II); |
| 2739 | case Intrinsic::aarch64_neon_fmaxnm: |
| 2740 | case Intrinsic::aarch64_neon_fminnm: |
| 2741 | return instCombineMaxMinNM(IC, II); |
| 2742 | case Intrinsic::aarch64_sve_convert_from_svbool: |
| 2743 | return instCombineConvertFromSVBool(IC, II); |
| 2744 | case Intrinsic::aarch64_sve_dup: |
| 2745 | return instCombineSVEDup(IC, II); |
| 2746 | case Intrinsic::aarch64_sve_dup_x: |
| 2747 | return instCombineSVEDupX(IC, II); |
| 2748 | case Intrinsic::aarch64_sve_cmpne: |
| 2749 | case Intrinsic::aarch64_sve_cmpne_wide: |
| 2750 | return instCombineSVECmpNE(IC, II); |
| 2751 | case Intrinsic::aarch64_sve_rdffr: |
| 2752 | return instCombineRDFFR(IC, II); |
| 2753 | case Intrinsic::aarch64_sve_lasta: |
| 2754 | case Intrinsic::aarch64_sve_lastb: |
| 2755 | return instCombineSVELast(IC, II); |
| 2756 | case Intrinsic::aarch64_sve_clasta_n: |
| 2757 | case Intrinsic::aarch64_sve_clastb_n: |
| 2758 | return instCombineSVECondLast(IC, II); |
| 2759 | case Intrinsic::aarch64_sve_cntd: |
| 2760 | return instCombineSVECntElts(IC, II, NumElts: 2); |
| 2761 | case Intrinsic::aarch64_sve_cntw: |
| 2762 | return instCombineSVECntElts(IC, II, NumElts: 4); |
| 2763 | case Intrinsic::aarch64_sve_cnth: |
| 2764 | return instCombineSVECntElts(IC, II, NumElts: 8); |
| 2765 | case Intrinsic::aarch64_sve_cntb: |
| 2766 | return instCombineSVECntElts(IC, II, NumElts: 16); |
| 2767 | case Intrinsic::aarch64_sve_ptest_any: |
| 2768 | case Intrinsic::aarch64_sve_ptest_first: |
| 2769 | case Intrinsic::aarch64_sve_ptest_last: |
| 2770 | return instCombineSVEPTest(IC, II); |
| 2771 | case Intrinsic::aarch64_sve_fadd: |
| 2772 | return instCombineSVEVectorFAdd(IC, II); |
| 2773 | case Intrinsic::aarch64_sve_fadd_u: |
| 2774 | return instCombineSVEVectorFAddU(IC, II); |
| 2775 | case Intrinsic::aarch64_sve_fmul_u: |
| 2776 | return instCombineSVEVectorBinOp(IC, II); |
| 2777 | case Intrinsic::aarch64_sve_fsub: |
| 2778 | return instCombineSVEVectorFSub(IC, II); |
| 2779 | case Intrinsic::aarch64_sve_fsub_u: |
| 2780 | return instCombineSVEVectorFSubU(IC, II); |
| 2781 | case Intrinsic::aarch64_sve_add: |
| 2782 | return instCombineSVEVectorAdd(IC, II); |
| 2783 | case Intrinsic::aarch64_sve_add_u: |
| 2784 | return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u, |
| 2785 | Intrinsic::aarch64_sve_mla_u>( |
| 2786 | IC, II, MergeIntoAddendOp: true); |
| 2787 | case Intrinsic::aarch64_sve_sub: |
| 2788 | return instCombineSVEVectorSub(IC, II); |
| 2789 | case Intrinsic::aarch64_sve_sub_u: |
| 2790 | return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u, |
| 2791 | Intrinsic::aarch64_sve_mls_u>( |
| 2792 | IC, II, MergeIntoAddendOp: true); |
| 2793 | case Intrinsic::aarch64_sve_tbl: |
| 2794 | return instCombineSVETBL(IC, II); |
| 2795 | case Intrinsic::aarch64_sve_uunpkhi: |
| 2796 | case Intrinsic::aarch64_sve_uunpklo: |
| 2797 | case Intrinsic::aarch64_sve_sunpkhi: |
| 2798 | case Intrinsic::aarch64_sve_sunpklo: |
| 2799 | return instCombineSVEUnpack(IC, II); |
| 2800 | case Intrinsic::aarch64_sve_uzp1: |
| 2801 | return instCombineSVEUzp1(IC, II); |
| 2802 | case Intrinsic::aarch64_sve_zip1: |
| 2803 | case Intrinsic::aarch64_sve_zip2: |
| 2804 | return instCombineSVEZip(IC, II); |
| 2805 | case Intrinsic::aarch64_sve_ld1_gather_index: |
| 2806 | return instCombineLD1GatherIndex(IC, II); |
| 2807 | case Intrinsic::aarch64_sve_st1_scatter_index: |
| 2808 | return instCombineST1ScatterIndex(IC, II); |
| 2809 | case Intrinsic::aarch64_sve_ld1: |
| 2810 | return instCombineSVELD1(IC, II, DL); |
| 2811 | case Intrinsic::aarch64_sve_st1: |
| 2812 | return instCombineSVEST1(IC, II, DL); |
| 2813 | case Intrinsic::aarch64_sve_sdiv: |
| 2814 | return instCombineSVESDIV(IC, II); |
| 2815 | case Intrinsic::aarch64_sve_sel: |
| 2816 | return instCombineSVESel(IC, II); |
| 2817 | case Intrinsic::aarch64_sve_srshl: |
| 2818 | return instCombineSVESrshl(IC, II); |
| 2819 | case Intrinsic::aarch64_sve_dupq_lane: |
| 2820 | return instCombineSVEDupqLane(IC, II); |
| 2821 | case Intrinsic::aarch64_sve_insr: |
| 2822 | return instCombineSVEInsr(IC, II); |
| 2823 | case Intrinsic::aarch64_sve_ptrue: |
| 2824 | return instCombinePTrue(IC, II); |
| 2825 | case Intrinsic::aarch64_sve_uxtb: |
| 2826 | return instCombineSVEUxt(IC, II, NumBits: 8); |
| 2827 | case Intrinsic::aarch64_sve_uxth: |
| 2828 | return instCombineSVEUxt(IC, II, NumBits: 16); |
| 2829 | case Intrinsic::aarch64_sve_uxtw: |
| 2830 | return instCombineSVEUxt(IC, II, NumBits: 32); |
| 2831 | } |
| 2832 | |
| 2833 | return std::nullopt; |
| 2834 | } |
| 2835 | |
| 2836 | std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic( |
| 2837 | InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts, |
| 2838 | APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, |
| 2839 | std::function<void(Instruction *, unsigned, APInt, APInt &)> |
| 2840 | SimplifyAndSetOp) const { |
| 2841 | switch (II.getIntrinsicID()) { |
| 2842 | default: |
| 2843 | break; |
| 2844 | case Intrinsic::aarch64_neon_fcvtxn: |
| 2845 | case Intrinsic::aarch64_neon_rshrn: |
| 2846 | case Intrinsic::aarch64_neon_sqrshrn: |
| 2847 | case Intrinsic::aarch64_neon_sqrshrun: |
| 2848 | case Intrinsic::aarch64_neon_sqshrn: |
| 2849 | case Intrinsic::aarch64_neon_sqshrun: |
| 2850 | case Intrinsic::aarch64_neon_sqxtn: |
| 2851 | case Intrinsic::aarch64_neon_sqxtun: |
| 2852 | case Intrinsic::aarch64_neon_uqrshrn: |
| 2853 | case Intrinsic::aarch64_neon_uqshrn: |
| 2854 | case Intrinsic::aarch64_neon_uqxtn: |
| 2855 | SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts); |
| 2856 | break; |
| 2857 | } |
| 2858 | |
| 2859 | return std::nullopt; |
| 2860 | } |
| 2861 | |
| 2862 | bool AArch64TTIImpl::enableScalableVectorization() const { |
| 2863 | return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() && |
| 2864 | EnableScalableAutovecInStreamingMode); |
| 2865 | } |
| 2866 | |
| 2867 | TypeSize |
| 2868 | AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
| 2869 | switch (K) { |
| 2870 | case TargetTransformInfo::RGK_Scalar: |
| 2871 | return TypeSize::getFixed(ExactSize: 64); |
| 2872 | case TargetTransformInfo::RGK_FixedWidthVector: |
| 2873 | if (ST->useSVEForFixedLengthVectors() && |
| 2874 | (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode)) |
| 2875 | return TypeSize::getFixed( |
| 2876 | ExactSize: std::max(a: ST->getMinSVEVectorSizeInBits(), b: 128u)); |
| 2877 | else if (ST->isNeonAvailable()) |
| 2878 | return TypeSize::getFixed(ExactSize: 128); |
| 2879 | else |
| 2880 | return TypeSize::getFixed(ExactSize: 0); |
| 2881 | case TargetTransformInfo::RGK_ScalableVector: |
| 2882 | if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() && |
| 2883 | EnableScalableAutovecInStreamingMode)) |
| 2884 | return TypeSize::getScalable(MinimumSize: 128); |
| 2885 | else |
| 2886 | return TypeSize::getScalable(MinimumSize: 0); |
| 2887 | } |
| 2888 | llvm_unreachable("Unsupported register kind" ); |
| 2889 | } |
| 2890 | |
| 2891 | bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, |
| 2892 | ArrayRef<const Value *> Args, |
| 2893 | Type *SrcOverrideTy) const { |
| 2894 | // A helper that returns a vector type whose scalar type comes from ArgTy and |
| 2895 | // whose element count comes from DstTy, which determines the vector width. |
| 2896 | auto toVectorTy = [&](Type *ArgTy) { |
| 2897 | return VectorType::get(ElementType: ArgTy->getScalarType(), |
| 2898 | EC: cast<VectorType>(Val: DstTy)->getElementCount()); |
| 2899 | }; |
| 2900 | |
| 2901 | // Exit early if DstTy is not a vector type whose elements are one of [i16, |
| 2902 | // i32, i64]. SVE doesn't generally have the same set of instructions to |
| 2903 | // perform an extend with the add/sub/mul. There are SMULLB style |
| 2904 | // instructions, but they operate on top/bottom, requiring some sort of lane |
| 2905 | // interleaving to be used with zext/sext. |
| 2906 | unsigned DstEltSize = DstTy->getScalarSizeInBits(); |
| 2907 | if (!useNeonVector(Ty: DstTy) || Args.size() != 2 || |
| 2908 | (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) |
| 2909 | return false; |
| 2910 | |
| 2911 | // Determine if the operation has a widening variant. We consider both the |
| 2912 | // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the |
| 2913 | // instructions. |
| 2914 | // |
| 2915 | // TODO: Add additional widening operations (e.g., shl, etc.) once we |
| 2916 | // verify that their extending operands are eliminated during code |
| 2917 | // generation. |
| 2918 | Type *SrcTy = SrcOverrideTy; |
| 2919 | switch (Opcode) { |
| 2920 | case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). |
| 2921 | case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). |
| 2922 | // The second operand needs to be an extend |
| 2923 | if (isa<SExtInst>(Val: Args[1]) || isa<ZExtInst>(Val: Args[1])) { |
| 2924 | if (!SrcTy) |
| 2925 | SrcTy = |
| 2926 | toVectorTy(cast<Instruction>(Val: Args[1])->getOperand(i: 0)->getType()); |
| 2927 | } else |
| 2928 | return false; |
| 2929 | break; |
| 2930 | case Instruction::Mul: { // SMULL(2), UMULL(2) |
| 2931 | // Both operands need to be extends of the same type. |
| 2932 | if ((isa<SExtInst>(Val: Args[0]) && isa<SExtInst>(Val: Args[1])) || |
| 2933 | (isa<ZExtInst>(Val: Args[0]) && isa<ZExtInst>(Val: Args[1]))) { |
| 2934 | if (!SrcTy) |
| 2935 | SrcTy = |
| 2936 | toVectorTy(cast<Instruction>(Val: Args[0])->getOperand(i: 0)->getType()); |
| 2937 | } else if (isa<ZExtInst>(Val: Args[0]) || isa<ZExtInst>(Val: Args[1])) { |
| 2938 | // If one of the operands is a Zext and the other has enough zero bits to |
| 2939 | // be treated as unsigned, we can still generate a umull, meaning the zext |
| 2940 | // is free. |
| 2941 | KnownBits Known = |
| 2942 | computeKnownBits(V: isa<ZExtInst>(Val: Args[0]) ? Args[1] : Args[0], DL); |
| 2943 | if (Args[0]->getType()->getScalarSizeInBits() - |
| 2944 | Known.Zero.countLeadingOnes() > |
| 2945 | DstTy->getScalarSizeInBits() / 2) |
| 2946 | return false; |
| 2947 | if (!SrcTy) |
| 2948 | SrcTy = toVectorTy(Type::getIntNTy(C&: DstTy->getContext(), |
| 2949 | N: DstTy->getScalarSizeInBits() / 2)); |
| 2950 | } else |
| 2951 | return false; |
| 2952 | break; |
| 2953 | } |
| 2954 | default: |
| 2955 | return false; |
| 2956 | } |
| 2957 | |
| 2958 | // Legalize the destination type and ensure it can be used in a widening |
| 2959 | // operation. |
| 2960 | auto DstTyL = getTypeLegalizationCost(Ty: DstTy); |
| 2961 | if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits()) |
| 2962 | return false; |
| 2963 | |
| 2964 | // Legalize the source type and ensure it can be used in a widening |
| 2965 | // operation. |
| 2966 | assert(SrcTy && "Expected some SrcTy" ); |
| 2967 | auto SrcTyL = getTypeLegalizationCost(Ty: SrcTy); |
| 2968 | unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); |
| 2969 | if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) |
| 2970 | return false; |
| 2971 | |
| 2972 | // Get the total number of vector elements in the legalized types. |
| 2973 | InstructionCost NumDstEls = |
| 2974 | DstTyL.first * DstTyL.second.getVectorMinNumElements(); |
| 2975 | InstructionCost NumSrcEls = |
| 2976 | SrcTyL.first * SrcTyL.second.getVectorMinNumElements(); |
| 2977 | |
| 2978 | // Return true if the legalized types have the same number of vector elements |
| 2979 | // and the destination element type size is twice that of the source type. |
| 2980 | return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize; |
| 2981 | } |
| 2982 | |
| 2983 | // s/urhadd instructions implement the following pattern, making the |
| 2984 | // extends free: |
| 2985 | // %x = add ((zext i8 -> i16), 1) |
| 2986 | // %y = (zext i8 -> i16) |
| 2987 | // trunc i16 (lshr (add %x, %y), 1) -> i8 |
| 2988 | // |
| 2989 | bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, |
| 2990 | Type *Src) const { |
| 2991 | // The source should be a legal vector type. |
| 2992 | if (!Src->isVectorTy() || !TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: Src)) || |
| 2993 | (Src->isScalableTy() && !ST->hasSVE2())) |
| 2994 | return false; |
| 2995 | |
| 2996 | if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse()) |
| 2997 | return false; |
| 2998 | |
| 2999 | // Look for trunc/shl/add before trying to match the pattern. |
| 3000 | const Instruction *Add = ExtUser; |
| 3001 | auto *AddUser = |
| 3002 | dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser()); |
| 3003 | if (AddUser && AddUser->getOpcode() == Instruction::Add) |
| 3004 | Add = AddUser; |
| 3005 | |
| 3006 | auto *Shr = dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser()); |
| 3007 | if (!Shr || Shr->getOpcode() != Instruction::LShr) |
| 3008 | return false; |
| 3009 | |
| 3010 | auto *Trunc = dyn_cast_or_null<Instruction>(Val: Shr->getUniqueUndroppableUser()); |
| 3011 | if (!Trunc || Trunc->getOpcode() != Instruction::Trunc || |
| 3012 | Src->getScalarSizeInBits() != |
| 3013 | cast<CastInst>(Val: Trunc)->getDestTy()->getScalarSizeInBits()) |
| 3014 | return false; |
| 3015 | |
| 3016 | // Try to match the whole pattern. Ext could be either the first or second |
| 3017 | // m_ZExtOrSExt matched. |
| 3018 | Instruction *Ex1, *Ex2; |
| 3019 | if (!(match(V: Add, P: m_c_Add(L: m_Instruction(I&: Ex1), |
| 3020 | R: m_c_Add(L: m_Instruction(I&: Ex2), R: m_SpecificInt(V: 1)))))) |
| 3021 | return false; |
| 3022 | |
| 3023 | // Ensure both extends are of the same type |
| 3024 | if (match(V: Ex1, P: m_ZExtOrSExt(Op: m_Value())) && |
| 3025 | Ex1->getOpcode() == Ex2->getOpcode()) |
| 3026 | return true; |
| 3027 | |
| 3028 | return false; |
| 3029 | } |
| 3030 | |
| 3031 | InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, |
| 3032 | Type *Src, |
| 3033 | TTI::CastContextHint CCH, |
| 3034 | TTI::TargetCostKind CostKind, |
| 3035 | const Instruction *I) const { |
| 3036 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
| 3037 | assert(ISD && "Invalid opcode" ); |
| 3038 | // If the cast is observable, and it is used by a widening instruction (e.g., |
| 3039 | // uaddl, saddw, etc.), it may be free. |
| 3040 | if (I && I->hasOneUser()) { |
| 3041 | auto *SingleUser = cast<Instruction>(Val: *I->user_begin()); |
| 3042 | SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); |
| 3043 | if (isWideningInstruction(DstTy: Dst, Opcode: SingleUser->getOpcode(), Args: Operands, SrcOverrideTy: Src)) { |
| 3044 | // For adds, only count the second operand as free if both operands are |
| 3045 | // extends but not the same operation (i.e. both operands are not free in |
| 3046 | // add(sext, zext)). |
| 3047 | if (SingleUser->getOpcode() == Instruction::Add) { |
| 3048 | if (I == SingleUser->getOperand(i: 1) || |
| 3049 | (isa<CastInst>(Val: SingleUser->getOperand(i: 1)) && |
| 3050 | cast<CastInst>(Val: SingleUser->getOperand(i: 1))->getOpcode() == Opcode)) |
| 3051 | return 0; |
| 3052 | } else // Others are free so long as isWideningInstruction returned true. |
| 3053 | return 0; |
| 3054 | } |
| 3055 | |
| 3056 | // The cast will be free for the s/urhadd instructions |
| 3057 | if ((isa<ZExtInst>(Val: I) || isa<SExtInst>(Val: I)) && |
| 3058 | isExtPartOfAvgExpr(ExtUser: SingleUser, Dst, Src)) |
| 3059 | return 0; |
| 3060 | } |
| 3061 | |
| 3062 | // TODO: Allow non-throughput costs that aren't binary. |
| 3063 | auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { |
| 3064 | if (CostKind != TTI::TCK_RecipThroughput) |
| 3065 | return Cost == 0 ? 0 : 1; |
| 3066 | return Cost; |
| 3067 | }; |
| 3068 | |
| 3069 | EVT SrcTy = TLI->getValueType(DL, Ty: Src); |
| 3070 | EVT DstTy = TLI->getValueType(DL, Ty: Dst); |
| 3071 | |
| 3072 | if (!SrcTy.isSimple() || !DstTy.isSimple()) |
| 3073 | return AdjustCost( |
| 3074 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); |
| 3075 | |
| 3076 | static const TypeConversionCostTblEntry BF16Tbl[] = { |
| 3077 | {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f32, .Cost: 1}, // bfcvt |
| 3078 | {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f64, .Cost: 1}, // bfcvt |
| 3079 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f32, .Cost: 1}, // bfcvtn |
| 3080 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f32, .Cost: 2}, // bfcvtn+bfcvtn2 |
| 3081 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f64, .Cost: 2}, // bfcvtn+fcvtn |
| 3082 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f64, .Cost: 3}, // fcvtn+fcvtl2+bfcvtn |
| 3083 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f64, .Cost: 6}, // 2 * fcvtn+fcvtn2+bfcvtn |
| 3084 | }; |
| 3085 | |
| 3086 | if (ST->hasBF16()) |
| 3087 | if (const auto *Entry = ConvertCostTableLookup( |
| 3088 | Table: BF16Tbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT())) |
| 3089 | return AdjustCost(Entry->Cost); |
| 3090 | |
| 3091 | // Symbolic constants for the SVE sitofp/uitofp entries in the table below |
| 3092 | // The cost of unpacking twice is artificially increased for now in order |
| 3093 | // to avoid regressions against NEON, which will use tbl instructions directly |
| 3094 | // instead of multiple layers of [s|u]unpk[lo|hi]. |
| 3095 | // We use the unpacks in cases where the destination type is illegal and |
| 3096 | // requires splitting of the input, even if the input type itself is legal. |
| 3097 | const unsigned int SVE_EXT_COST = 1; |
| 3098 | const unsigned int SVE_FCVT_COST = 1; |
| 3099 | const unsigned int SVE_UNPACK_ONCE = 4; |
| 3100 | const unsigned int SVE_UNPACK_TWICE = 16; |
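| | // As a worked example of how these constants compose in the table below, |
| | // converting nxv8i8 to nxv8f32 with sitofp is costed as |
| | // SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST = 1 + 4 + 2 = 7. |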
| 3101 | |
| 3102 | static const TypeConversionCostTblEntry ConversionTbl[] = { |
| 3103 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i64, .Cost: 1}, // xtn |
| 3104 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i16, .Src: MVT::v2i64, .Cost: 1}, // xtn |
| 3105 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i32, .Src: MVT::v2i64, .Cost: 1}, // xtn |
| 3106 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i32, .Cost: 1}, // xtn |
| 3107 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i64, .Cost: 3}, // 2 xtn + 1 uzp1 |
| 3108 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i32, .Cost: 1}, // xtn |
| 3109 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i64, .Cost: 2}, // 1 uzp1 + 1 xtn |
| 3110 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: 1}, // 1 uzp1 |
| 3111 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i16, .Cost: 1}, // 1 xtn |
| 3112 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i32, .Cost: 2}, // 1 uzp1 + 1 xtn |
| 3113 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i64, .Cost: 4}, // 3 x uzp1 + xtn |
| 3114 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: 1}, // 1 uzp1 |
| 3115 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i64, .Cost: 3}, // 3 x uzp1 |
| 3116 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i32, .Src: MVT::v8i64, .Cost: 2}, // 2 x uzp1 |
| 3117 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: 1}, // uzp1 |
| 3118 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: 3}, // (2 + 1) x uzp1 |
| 3119 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i64, .Cost: 7}, // (4 + 2 + 1) x uzp1 |
| 3120 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: 2}, // 2 x uzp1 |
| 3121 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i64, .Cost: 6}, // (4 + 2) x uzp1 |
| 3122 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i32, .Src: MVT::v16i64, .Cost: 4}, // 4 x uzp1 |
| 3123 | |
| 3124 | // Truncations on nxvmiN |
| 3125 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i8, .Cost: 2}, |
| 3126 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i16, .Cost: 2}, |
| 3127 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i32, .Cost: 2}, |
| 3128 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i64, .Cost: 2}, |
| 3129 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i8, .Cost: 2}, |
| 3130 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i16, .Cost: 2}, |
| 3131 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i32, .Cost: 2}, |
| 3132 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i64, .Cost: 5}, |
| 3133 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i8, .Cost: 2}, |
| 3134 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i16, .Cost: 2}, |
| 3135 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i32, .Cost: 5}, |
| 3136 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i64, .Cost: 11}, |
| 3137 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i1, .Src: MVT::nxv16i8, .Cost: 2}, |
| 3138 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i16, .Cost: 0}, |
| 3139 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i32, .Cost: 0}, |
| 3140 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i64, .Cost: 0}, |
| 3141 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i16, .Src: MVT::nxv2i32, .Cost: 0}, |
| 3142 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i16, .Src: MVT::nxv2i64, .Cost: 0}, |
| 3143 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i32, .Src: MVT::nxv2i64, .Cost: 0}, |
| 3144 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i16, .Cost: 0}, |
| 3145 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i32, .Cost: 0}, |
| 3146 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i64, .Cost: 1}, |
| 3147 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i16, .Src: MVT::nxv4i32, .Cost: 0}, |
| 3148 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i16, .Src: MVT::nxv4i64, .Cost: 1}, |
| 3149 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i32, .Src: MVT::nxv4i64, .Cost: 1}, |
| 3150 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i16, .Cost: 0}, |
| 3151 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i32, .Cost: 1}, |
| 3152 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i64, .Cost: 3}, |
| 3153 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i16, .Src: MVT::nxv8i32, .Cost: 1}, |
| 3154 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i16, .Src: MVT::nxv8i64, .Cost: 3}, |
| 3155 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i16, .Cost: 1}, |
| 3156 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i32, .Cost: 3}, |
| 3157 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i64, .Cost: 7}, |
| 3158 | |
| 3159 | // The number of shll instructions for the extension. |
| 3160 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3}, |
| 3161 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3}, |
| 3162 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2}, |
| 3163 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2}, |
| 3164 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3}, |
| 3165 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3}, |
| 3166 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2}, |
| 3167 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2}, |
| 3168 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7}, |
| 3169 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7}, |
| 3170 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6}, |
| 3171 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6}, |
| 3172 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2}, |
| 3173 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2}, |
| 3174 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6}, |
| 3175 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6}, |
| 3176 | |
| 3177 | // FP Ext and trunc |
| 3178 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::f32, .Cost: 1}, // fcvt |
| 3179 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2f32, .Cost: 1}, // fcvtl |
| 3180 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4f32, .Cost: 2}, // fcvtl+fcvtl2 |
| 3181 | // FP16 |
| 3182 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::f32, .Src: MVT::f16, .Cost: 1}, // fcvt |
| 3183 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::f16, .Cost: 1}, // fcvt |
| 3184 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f32, .Src: MVT::v4f16, .Cost: 1}, // fcvtl |
| 3185 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f32, .Src: MVT::v8f16, .Cost: 2}, // fcvtl+fcvtl2 |
| 3186 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2f16, .Cost: 2}, // fcvtl+fcvtl |
| 3187 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4f16, .Cost: 3}, // fcvtl+fcvtl2+fcvtl |
| 3188 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8f16, .Cost: 6}, // 2 * fcvtl+fcvtl2+fcvtl |
| 3189 | // BF16 (uses shift) |
| 3190 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::f32, .Src: MVT::bf16, .Cost: 1}, // shl |
| 3191 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::bf16, .Cost: 2}, // shl+fcvt |
| 3192 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f32, .Src: MVT::v4bf16, .Cost: 1}, // shll |
| 3193 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f32, .Src: MVT::v8bf16, .Cost: 2}, // shll+shll2 |
| 3194 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2bf16, .Cost: 2}, // shll+fcvtl |
| 3195 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4bf16, .Cost: 3}, // shll+fcvtl+fcvtl2 |
| 3196 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8bf16, .Cost: 6}, // 2 * shll+fcvtl+fcvtl2 |
| 3197 | // FP Ext and trunc |
| 3198 | {.ISD: ISD::FP_ROUND, .Dst: MVT::f32, .Src: MVT::f64, .Cost: 1}, // fcvt |
| 3199 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v2f32, .Src: MVT::v2f64, .Cost: 1}, // fcvtn |
| 3200 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f32, .Src: MVT::v4f64, .Cost: 2}, // fcvtn+fcvtn2 |
| 3201 | // FP16 |
| 3202 | {.ISD: ISD::FP_ROUND, .Dst: MVT::f16, .Src: MVT::f32, .Cost: 1}, // fcvt |
| 3203 | {.ISD: ISD::FP_ROUND, .Dst: MVT::f16, .Src: MVT::f64, .Cost: 1}, // fcvt |
| 3204 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f16, .Src: MVT::v4f32, .Cost: 1}, // fcvtn |
| 3205 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v8f16, .Src: MVT::v8f32, .Cost: 2}, // fcvtn+fcvtn2 |
| 3206 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v2f16, .Src: MVT::v2f64, .Cost: 2}, // fcvtn+fcvtn |
| 3207 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f16, .Src: MVT::v4f64, .Cost: 3}, // fcvtn+fcvtn2+fcvtn |
| 3208 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v8f16, .Src: MVT::v8f64, .Cost: 6}, // 2 * fcvtn+fcvtn2+fcvtn |
| 3209 | // BF16 (more complex; the +bf16 case is handled above) |
| 3210 | {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f32, .Cost: 8}, // Expansion is ~8 insns |
| 3211 | {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f64, .Cost: 9}, // fcvtn + above |
| 3212 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f32, .Cost: 8}, |
| 3213 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f32, .Cost: 8}, |
| 3214 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f32, .Cost: 15}, |
| 3215 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f64, .Cost: 9}, |
| 3216 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f64, .Cost: 10}, |
| 3217 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f64, .Cost: 19}, |
| 3218 | |
| 3219 | // LowerVectorINT_TO_FP: |
| 3220 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1}, |
| 3221 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1}, |
| 3222 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: 1}, |
| 3223 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1}, |
| 3224 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1}, |
| 3225 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: 1}, |
| 3226 | |
| 3227 | // SVE: to nxv2f16 |
| 3228 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i8, |
| 3229 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
| 3230 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST}, |
| 3231 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST}, |
| 3232 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST}, |
| 3233 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i8, |
| 3234 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
| 3235 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST}, |
| 3236 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST}, |
| 3237 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST}, |
| 3238 | |
| 3239 | // SVE: to nxv4f16 |
| 3240 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i8, |
| 3241 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
| 3242 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST}, |
| 3243 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST}, |
| 3244 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i8, |
| 3245 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
| 3246 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST}, |
| 3247 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST}, |
| 3248 | |
| 3249 | // SVE: to nxv8f16 |
| 3250 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i8, |
| 3251 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
| 3252 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i16, .Cost: SVE_FCVT_COST}, |
| 3253 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i8, |
| 3254 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
| 3255 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i16, .Cost: SVE_FCVT_COST}, |
| 3256 | |
| 3257 | // SVE: to nxv16f16 |
| 3258 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv16f16, .Src: MVT::nxv16i8, |
| 3259 | .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3260 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv16f16, .Src: MVT::nxv16i8, |
| 3261 | .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3262 | |
| 3263 | // Complex: to v2f32 |
| 3264 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3}, |
| 3265 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 3}, |
| 3266 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3}, |
| 3267 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 3}, |
| 3268 | |
| 3269 | // SVE: to nxv2f32 |
| 3270 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i8, |
| 3271 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
| 3272 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST}, |
| 3273 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST}, |
| 3274 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST}, |
| 3275 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i8, |
| 3276 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
| 3277 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST}, |
| 3278 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST}, |
| 3279 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST}, |
| 3280 | |
| 3281 | // Complex: to v4f32 |
| 3282 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 4}, |
| 3283 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2}, |
| 3284 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 3}, |
| 3285 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2}, |
| 3286 | |
| 3287 | // SVE: to nxv4f32 |
| 3288 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i8, |
| 3289 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
| 3290 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST}, |
| 3291 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST}, |
| 3292 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i8, |
| 3293 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
| 3294 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST}, |
| 3295 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST}, |
| 3296 | |
| 3297 | // Complex: to v8f32 |
| 3298 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10}, |
| 3299 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4}, |
| 3300 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10}, |
| 3301 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4}, |
| 3302 | |
| 3303 | // SVE: to nxv8f32 |
| 3304 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i8, |
| 3305 | .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3306 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i16, |
| 3307 | .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3308 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i8, |
| 3309 | .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3310 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i16, |
| 3311 | .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3312 | |
| 3313 | // SVE: to nxv16f32 |
| 3314 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv16f32, .Src: MVT::nxv16i8, |
| 3315 | .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST}, |
| 3316 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv16f32, .Src: MVT::nxv16i8, |
| 3317 | .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST}, |
| 3318 | |
| 3319 | // Complex: to v16f32 |
| 3320 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: 21}, |
| 3321 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: 21}, |
| 3322 | |
| 3323 | // Complex: to v2f64 |
| 3324 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4}, |
| 3325 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 4}, |
| 3326 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2}, |
| 3327 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4}, |
| 3328 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 4}, |
| 3329 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2}, |
| 3330 | |
| 3331 | // SVE: to nxv2f64 |
| 3332 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i8, |
| 3333 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
| 3334 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST}, |
| 3335 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST}, |
| 3336 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST}, |
| 3337 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i8, |
| 3338 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
| 3339 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST}, |
| 3340 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST}, |
| 3341 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST}, |
| 3342 | |
| 3343 | // Complex: to v4f64 |
| 3344 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: 4}, |
| 3345 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: 4}, |
| 3346 | |
| 3347 | // SVE: to nxv4f64 |
| 3348 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i8, |
| 3349 | .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3350 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i16, |
| 3351 | .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3352 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i32, |
| 3353 | .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3354 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i8, |
| 3355 | .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3356 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i16, |
| 3357 | .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3358 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i32, |
| 3359 | .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
| 3360 | |
| 3361 | // SVE: to nxv8f64 |
| 3362 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i8, |
| 3363 | .Cost: SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST}, |
| 3364 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i16, |
| 3365 | .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST}, |
| 3366 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i8, |
| 3367 | .Cost: SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST}, |
| 3368 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i16, |
| 3369 | .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST}, |
| 3370 | |
| 3371 | // LowerVectorFP_TO_INT |
| 3372 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f32, .Cost: 1}, |
| 3373 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1}, |
| 3374 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: 1}, |
| 3375 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f32, .Cost: 1}, |
| 3376 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1}, |
| 3377 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: 1}, |
| 3378 | |
| 3379 | // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). |
| 3380 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f32, .Cost: 2}, |
| 3381 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 1}, |
| 3382 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 1}, |
| 3383 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f32, .Cost: 2}, |
| 3384 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 1}, |
| 3385 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 1}, |
| 3386 | |
| 3387 | // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 |
| 3388 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2}, |
| 3389 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 2}, |
| 3390 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2}, |
| 3391 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 2}, |
| 3392 | |
| 3393 | // Complex, from nxv2f32. |
| 3394 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f32, .Cost: 1}, |
| 3395 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 1}, |
| 3396 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f32, .Cost: 1}, |
| 3397 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f32, .Cost: 1}, |
| 3398 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f32, .Cost: 1}, |
| 3399 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 1}, |
| 3400 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f32, .Cost: 1}, |
| 3401 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f32, .Cost: 1}, |
| 3402 | |
| 3403 | // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. |
| 3404 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2}, |
| 3405 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f64, .Cost: 2}, |
| 3406 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f64, .Cost: 2}, |
| 3407 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2}, |
| 3408 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f64, .Cost: 2}, |
| 3409 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f64, .Cost: 2}, |
| 3410 | |
| 3411 | // Complex, from nxv2f64. |
| 3412 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f64, .Cost: 1}, |
| 3413 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f64, .Cost: 1}, |
| 3414 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f64, .Cost: 1}, |
| 3415 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f64, .Cost: 1}, |
| 3416 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i1, .Src: MVT::nxv2f64, .Cost: 1}, |
| 3417 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f64, .Cost: 1}, |
| 3418 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f64, .Cost: 1}, |
| 3419 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f64, .Cost: 1}, |
| 3420 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f64, .Cost: 1}, |
| 3421 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i1, .Src: MVT::nxv2f64, .Cost: 1}, |
| 3422 | |
| 3423 | // Complex, from nxv4f32. |
| 3424 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f32, .Cost: 4}, |
| 3425 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f32, .Cost: 1}, |
| 3426 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f32, .Cost: 1}, |
| 3427 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f32, .Cost: 1}, |
| 3428 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i1, .Src: MVT::nxv4f32, .Cost: 1}, |
| 3429 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f32, .Cost: 4}, |
| 3430 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f32, .Cost: 1}, |
| 3431 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f32, .Cost: 1}, |
| 3432 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f32, .Cost: 1}, |
| 3433 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i1, .Src: MVT::nxv4f32, .Cost: 1}, |
| 3434 | |
| 3435 | // Complex, from nxv8f64. Illegal -> illegal conversions not required. |
| 3436 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f64, .Cost: 7}, |
| 3437 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f64, .Cost: 7}, |
| 3438 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f64, .Cost: 7}, |
| 3439 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f64, .Cost: 7}, |
| 3440 | |
| 3441 | // Complex, from nxv4f64. Illegal -> illegal conversions not required. |
| 3442 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f64, .Cost: 3}, |
| 3443 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f64, .Cost: 3}, |
| 3444 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f64, .Cost: 3}, |
| 3445 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f64, .Cost: 3}, |
| 3446 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f64, .Cost: 3}, |
| 3447 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f64, .Cost: 3}, |
| 3448 | |
| 3449 | // Complex, from nxv8f32. Illegal -> illegal conversions not required. |
| 3450 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f32, .Cost: 3}, |
| 3451 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f32, .Cost: 3}, |
| 3452 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f32, .Cost: 3}, |
| 3453 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f32, .Cost: 3}, |
| 3454 | |
| 3455 | // Complex, from nxv8f16. |
| 3456 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i64, .Src: MVT::nxv8f16, .Cost: 10}, |
| 3457 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i32, .Src: MVT::nxv8f16, .Cost: 4}, |
| 3458 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f16, .Cost: 1}, |
| 3459 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f16, .Cost: 1}, |
| 3460 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i1, .Src: MVT::nxv8f16, .Cost: 1}, |
| 3461 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i64, .Src: MVT::nxv8f16, .Cost: 10}, |
| 3462 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i32, .Src: MVT::nxv8f16, .Cost: 4}, |
| 3463 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f16, .Cost: 1}, |
| 3464 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f16, .Cost: 1}, |
| 3465 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i1, .Src: MVT::nxv8f16, .Cost: 1}, |
| 3466 | |
| 3467 | // Complex, from nxv4f16. |
| 3468 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f16, .Cost: 4}, |
| 3469 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f16, .Cost: 1}, |
| 3470 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 1}, |
| 3471 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f16, .Cost: 1}, |
| 3472 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f16, .Cost: 4}, |
| 3473 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f16, .Cost: 1}, |
| 3474 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 1}, |
| 3475 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f16, .Cost: 1}, |
| 3476 | |
| 3477 | // Complex, from nxv2f16. |
| 3478 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f16, .Cost: 1}, |
| 3479 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f16, .Cost: 1}, |
| 3480 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 1}, |
| 3481 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f16, .Cost: 1}, |
| 3482 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f16, .Cost: 1}, |
| 3483 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f16, .Cost: 1}, |
| 3484 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 1}, |
| 3485 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f16, .Cost: 1}, |
| 3486 | |
| 3487 | // Truncate from nxvmf32 to nxvmf16. |
| 3488 | {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f16, .Src: MVT::nxv2f32, .Cost: 1}, |
| 3489 | {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f16, .Src: MVT::nxv4f32, .Cost: 1}, |
| 3490 | {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f16, .Src: MVT::nxv8f32, .Cost: 3}, |
| 3491 | |
| 3492 | // Truncate from nxvmf64 to nxvmf16. |
| 3493 | {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f16, .Src: MVT::nxv2f64, .Cost: 1}, |
| 3494 | {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f16, .Src: MVT::nxv4f64, .Cost: 3}, |
| 3495 | {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f16, .Src: MVT::nxv8f64, .Cost: 7}, |
| 3496 | |
| 3497 | // Truncate from nxvmf64 to nxvmf32. |
| 3498 | {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2f64, .Cost: 1}, |
| 3499 | {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4f64, .Cost: 3}, |
| 3500 | {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8f64, .Cost: 6}, |
| 3501 | |
| 3502 | // Extend from nxvmf16 to nxvmf32. |
| 3503 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2f16, .Cost: 1}, |
| 3504 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4f16, .Cost: 1}, |
| 3505 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8f16, .Cost: 2}, |
| 3506 | |
| 3507 | // Extend from nxvmf16 to nxvmf64. |
| 3508 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2f16, .Cost: 1}, |
| 3509 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4f16, .Cost: 2}, |
| 3510 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8f16, .Cost: 4}, |
| 3511 | |
| 3512 | // Extend from nxvmf32 to nxvmf64. |
| 3513 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2f32, .Cost: 1}, |
| 3514 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4f32, .Cost: 2}, |
| 3515 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8f32, .Cost: 6}, |
| 3516 | |
| 3517 | // Bitcasts from integer to float
| 3518 | {.ISD: ISD::BITCAST, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: 0}, |
| 3519 | {.ISD: ISD::BITCAST, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: 0}, |
| 3520 | {.ISD: ISD::BITCAST, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: 0}, |
| 3521 | |
| 3522 | // Bitcasts from float to integer
| 3523 | {.ISD: ISD::BITCAST, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 0}, |
| 3524 | {.ISD: ISD::BITCAST, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 0}, |
| 3525 | {.ISD: ISD::BITCAST, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 0}, |
| 3526 | |
| 3527 | // Add cost for extending to illegal (too wide) scalable vectors.
| 3528 | // Zero/sign extends are implemented by multiple unpack operations,
| 3529 | // where each operation has a cost of 1.
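// For example, extending nxv16i8 to nxv16i32 takes two unpacks to produce two
// nxv8i16 halves and four more to produce four nxv4i32 quarters, giving the
// cost of 6 used below.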
| 3530 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i16, .Src: MVT::nxv16i8, .Cost: 2}, |
| 3531 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i32, .Src: MVT::nxv16i8, .Cost: 6}, |
| 3532 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i64, .Src: MVT::nxv16i8, .Cost: 14}, |
| 3533 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i16, .Cost: 2}, |
| 3534 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv8i64, .Src: MVT::nxv8i16, .Cost: 6}, |
| 3535 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv4i64, .Src: MVT::nxv4i32, .Cost: 2}, |
| 3536 | |
| 3537 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i16, .Src: MVT::nxv16i8, .Cost: 2}, |
| 3538 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i32, .Src: MVT::nxv16i8, .Cost: 6}, |
| 3539 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i64, .Src: MVT::nxv16i8, .Cost: 14}, |
| 3540 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i16, .Cost: 2}, |
| 3541 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv8i64, .Src: MVT::nxv8i16, .Cost: 6}, |
| 3542 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv4i64, .Src: MVT::nxv4i32, .Cost: 2}, |
| 3543 | }; |
| 3544 | |
| 3545 | // Estimate the cost of a fixed-length operation performed on SVE
| 3546 | // registers as the cost of the equivalent scalable operation, scaled by
| 3547 | // the number of SVE registers required to represent the fixed type.
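// For example (illustrative), with 64-bit elements the scalable equivalent
// holds 128 / 64 = 2 elements per block, so a wide fixed-length conversion is
// costed as the corresponding nxv2 conversion times the legalization factor.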
| 3548 | EVT WiderTy = SrcTy.bitsGT(VT: DstTy) ? SrcTy : DstTy; |
| 3549 | if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() && |
| 3550 | SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() && |
| 3551 | ST->useSVEForFixedLengthVectors(VT: WiderTy)) { |
| 3552 | std::pair<InstructionCost, MVT> LT = |
| 3553 | getTypeLegalizationCost(Ty: WiderTy.getTypeForEVT(Context&: Dst->getContext())); |
| 3554 | unsigned NumElements = |
| 3555 | AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits(); |
| 3556 | return AdjustCost( |
| 3557 | LT.first * |
| 3558 | getCastInstrCost( |
| 3559 | Opcode, Dst: ScalableVectorType::get(ElementType: Dst->getScalarType(), MinNumElts: NumElements), |
| 3560 | Src: ScalableVectorType::get(ElementType: Src->getScalarType(), MinNumElts: NumElements), CCH, |
| 3561 | CostKind, I)); |
| 3562 | } |
| 3563 | |
| 3564 | if (const auto *Entry = ConvertCostTableLookup( |
| 3565 | Table: ConversionTbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT())) |
| 3566 | return AdjustCost(Entry->Cost); |
| 3567 | |
| 3568 | static const TypeConversionCostTblEntry FP16Tbl[] = { |
| 3569 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f16, .Cost: 1}, // fcvtzs |
| 3570 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f16, .Cost: 1}, |
| 3571 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f16, .Cost: 1}, // fcvtzs |
| 3572 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f16, .Cost: 1}, |
| 3573 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f16, .Cost: 2}, // fcvtl+fcvtzs |
| 3574 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f16, .Cost: 2}, |
| 3575 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i8, .Src: MVT::v8f16, .Cost: 2}, // fcvtzs+xtn |
| 3576 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i8, .Src: MVT::v8f16, .Cost: 2}, |
| 3577 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f16, .Cost: 1}, // fcvtzs |
| 3578 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f16, .Cost: 1}, |
| 3579 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f16, .Cost: 4}, // 2*fcvtl+2*fcvtzs |
| 3580 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f16, .Cost: 4}, |
| 3581 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v16f16, .Cost: 3}, // 2*fcvtzs+xtn |
| 3582 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v16f16, .Cost: 3}, |
| 3583 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v16f16, .Cost: 2}, // 2*fcvtzs |
| 3584 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v16f16, .Cost: 2}, |
| 3585 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i32, .Src: MVT::v16f16, .Cost: 8}, // 4*fcvtl+4*fcvtzs |
| 3586 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i32, .Src: MVT::v16f16, .Cost: 8}, |
| 3587 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f16, .Src: MVT::v8i8, .Cost: 2}, // ushll + ucvtf |
| 3588 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f16, .Src: MVT::v8i8, .Cost: 2}, // sshll + scvtf |
| 3589 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f16, .Src: MVT::v16i8, .Cost: 4}, // 2 * ushll(2) + 2 * ucvtf
| 3590 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f16, .Src: MVT::v16i8, .Cost: 4}, // 2 * sshll(2) + 2 * scvtf
| 3591 | }; |
| 3592 | |
| 3593 | if (ST->hasFullFP16()) |
| 3594 | if (const auto *Entry = ConvertCostTableLookup( |
| 3595 | Table: FP16Tbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT())) |
| 3596 | return AdjustCost(Entry->Cost); |
| 3597 | |
| 3598 | // INT_TO_FP of i64->f32 will scalarize, which is required to avoid |
| 3599 | // double-rounding issues. |
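// For example, sitofp <2 x i64> to <2 x float> is costed as two scalar
// i64 -> f32 conversions plus the extract/insert scalarization overhead
// computed below (illustrative).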
| 3600 | if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) && |
| 3601 | DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 && |
| 3602 | isa<FixedVectorType>(Val: Dst) && isa<FixedVectorType>(Val: Src)) |
| 3603 | return AdjustCost( |
| 3604 | cast<FixedVectorType>(Val: Dst)->getNumElements() * |
| 3605 | getCastInstrCost(Opcode, Dst: Dst->getScalarType(), Src: Src->getScalarType(), |
| 3606 | CCH, CostKind) + |
| 3607 | BaseT::getScalarizationOverhead(InTy: cast<FixedVectorType>(Val: Src), Insert: false, Extract: true, |
| 3608 | CostKind) + |
| 3609 | BaseT::getScalarizationOverhead(InTy: cast<FixedVectorType>(Val: Dst), Insert: true, Extract: false, |
| 3610 | CostKind)); |
| 3611 | |
| 3612 | if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) && |
| 3613 | CCH == TTI::CastContextHint::Masked && |
| 3614 | ST->isSVEorStreamingSVEAvailable() && |
| 3615 | TLI->getTypeAction(Context&: Src->getContext(), VT: SrcTy) == |
| 3616 | TargetLowering::TypePromoteInteger && |
| 3617 | TLI->getTypeAction(Context&: Dst->getContext(), VT: DstTy) == |
| 3618 | TargetLowering::TypeSplitVector) { |
| 3619 | // The standard behaviour in the backend for these cases is to split the |
| 3620 | // extend up into two parts: |
| 3621 | // 1. Perform an extending load or masked load up to the legal type. |
| 3622 | // 2. Extend the loaded data to the final type. |
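// For example (illustrative), a zero-extending masked load from nxv8i8 to
// nxv8i64 is costed as an extending masked load to the promoted type nxv8i16
// (Part1) plus a normal nxv8i16 -> nxv8i64 extend (Part2).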
| 3623 | std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: Src); |
| 3624 | Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Context&: Src->getContext()); |
| 3625 | InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost( |
| 3626 | Opcode, Dst: LegalTy, Src, CCH, CostKind, I); |
| 3627 | InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost( |
| 3628 | Opcode, Dst, Src: LegalTy, CCH: TTI::CastContextHint::None, CostKind, I); |
| 3629 | return Part1 + Part2; |
| 3630 | } |
| 3631 | |
| 3632 | // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
| 3633 | // but we also want to include the TTI::CastContextHint::Masked case.
| 3634 | if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) && |
| 3635 | CCH == TTI::CastContextHint::Masked && |
| 3636 | ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(VT: DstTy)) |
| 3637 | CCH = TTI::CastContextHint::Normal; |
| 3638 | |
| 3639 | return AdjustCost( |
| 3640 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); |
| 3641 | } |
| 3642 | |
| 3643 | InstructionCost |
| 3644 | AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
| 3645 | VectorType *VecTy, unsigned Index, |
| 3646 | TTI::TargetCostKind CostKind) const { |
| 3647 | |
| 3648 | // Make sure we were given a valid extend opcode. |
| 3649 | assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && |
| 3650 | "Invalid opcode" ); |
| 3651 | |
| 3652 | // We are extending an element we extract from a vector, so the source type |
| 3653 | // of the extend is the element type of the vector. |
| 3654 | auto *Src = VecTy->getElementType(); |
| 3655 | |
| 3656 | // Sign- and zero-extends are for integer types only. |
| 3657 | assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type" ); |
| 3658 | |
| 3659 | // Get the cost for the extract. We compute the cost (if any) for the extend |
| 3660 | // below. |
| 3661 | InstructionCost Cost = getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, |
| 3662 | CostKind, Index, Op0: nullptr, Op1: nullptr); |
| 3663 | |
| 3664 | // Legalize the types. |
| 3665 | auto VecLT = getTypeLegalizationCost(Ty: VecTy); |
| 3666 | auto DstVT = TLI->getValueType(DL, Ty: Dst); |
| 3667 | auto SrcVT = TLI->getValueType(DL, Ty: Src); |
| 3668 | |
| 3669 | // If the resulting type is still a vector and the destination type is legal, |
| 3670 | // we may get the extension for free. If not, get the default cost for the |
| 3671 | // extend. |
| 3672 | if (!VecLT.second.isVector() || !TLI->isTypeLegal(VT: DstVT)) |
| 3673 | return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None, |
| 3674 | CostKind); |
| 3675 | |
| 3676 | // The destination type should be larger than the element type. If not, get |
| 3677 | // the default cost for the extend. |
| 3678 | if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) |
| 3679 | return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None, |
| 3680 | CostKind); |
| 3681 | |
| 3682 | switch (Opcode) { |
| 3683 | default: |
| 3684 | llvm_unreachable("Opcode should be either SExt or ZExt" ); |
| 3685 | |
| 3686 | // For sign-extends, we only need a smov, which performs the extension |
| 3687 | // automatically. |
| 3688 | case Instruction::SExt: |
| 3689 | return Cost; |
| 3690 | |
| 3691 | // For zero-extends, the extend is performed automatically by a umov unless |
| 3692 | // the destination type is i64 and the element type is i8 or i16. |
| 3693 | case Instruction::ZExt: |
| 3694 | if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) |
| 3695 | return Cost; |
| 3696 | } |
| 3697 | |
| 3698 | // If we are unable to perform the extend for free, get the default cost. |
| 3699 | return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None, |
| 3700 | CostKind); |
| 3701 | } |
| 3702 | |
| 3703 | InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, |
| 3704 | TTI::TargetCostKind CostKind, |
| 3705 | const Instruction *I) const { |
| 3706 | if (CostKind != TTI::TCK_RecipThroughput) |
| 3707 | return Opcode == Instruction::PHI ? 0 : 1; |
| 3708 | assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind" ); |
| 3709 | // Branches are assumed to be predicted. |
| 3710 | return 0; |
| 3711 | } |
| 3712 | |
| 3713 | InstructionCost AArch64TTIImpl::getVectorInstrCostHelper( |
| 3714 | unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, |
| 3715 | bool HasRealUse, const Instruction *I, Value *Scalar, |
| 3716 | ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const { |
| 3717 | assert(Val->isVectorTy() && "This must be a vector type" ); |
| 3718 | |
| 3719 | if (Index != -1U) { |
| 3720 | // Legalize the type. |
| 3721 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Val); |
| 3722 | |
| 3723 | // This type is legalized to a scalar type. |
| 3724 | if (!LT.second.isVector()) |
| 3725 | return 0; |
| 3726 | |
| 3727 | // The type may be split. For fixed-width vectors we can normalize the |
| 3728 | // index to the new type. |
| 3729 | if (LT.second.isFixedLengthVector()) { |
| 3730 | unsigned Width = LT.second.getVectorNumElements(); |
| 3731 | Index = Index % Width; |
| 3732 | } |
| 3733 | |
| 3734 | // The element at index zero is already inside the vector. |
| 3735 | // - For a physical (HasRealUse==true) insert-element or extract-element |
| 3736 | // instruction that extracts integers, an explicit FPR -> GPR move is |
| 3737 | // needed. So it has non-zero cost. |
| 3738 | // - For the rest of cases (virtual instruction or element type is float), |
| 3739 | // consider the instruction free. |
| 3740 | if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy())) |
| 3741 | return 0; |
| 3742 | |
| 3743 | // This is recognising an LD1 single-element structure to one lane of one
| 3744 | // register instruction. I.e., if this is an `insertelement` instruction,
| 3745 | // and its second operand is a load, then we will generate an LD1, which
| 3746 | // is an expensive instruction.
| 3747 | if (I && dyn_cast<LoadInst>(Val: I->getOperand(i: 1))) |
| 3748 | return CostKind == TTI::TCK_CodeSize |
| 3749 | ? 0 |
| 3750 | : ST->getVectorInsertExtractBaseCost() + 1; |
| 3751 | |
| 3752 | // i1 inserts and extracts will include an extra cset or cmp of the vector
| 3753 | // value. Increase the cost by 1 to account for this.
| 3754 | if (Val->getScalarSizeInBits() == 1) |
| 3755 | return CostKind == TTI::TCK_CodeSize |
| 3756 | ? 2 |
| 3757 | : ST->getVectorInsertExtractBaseCost() + 1; |
| 3758 | |
| 3759 | // FIXME: |
| 3760 | // If the extract-element and insert-element instructions could be |
| 3761 | // simplified away (e.g., could be combined into users by looking at use-def |
| 3762 | // context), they have no cost. This is not done in the first place for |
| 3763 | // compile-time considerations. |
| 3764 | } |
| 3765 | |
| 3766 | // For Neon, if there exists an extractelement from lane != 0 such that
| 3767 | //   1. the extractelement does not necessitate a move from vector_reg -> GPR,
| 3768 | //   2. the extractelement result feeds into an fmul, and
| 3769 | //   3. the other operand of the fmul is an extractelement from lane 0 or a
| 3770 | //      lane equivalent to 0,
| 3771 | // then the extractelement can be merged with the fmul in the backend and
| 3772 | // incurs no cost.
| 3773 | // e.g. |
| 3774 | // define double @foo(<2 x double> %a) { |
| 3775 | // %1 = extractelement <2 x double> %a, i32 0 |
| 3776 | // %2 = extractelement <2 x double> %a, i32 1 |
| 3777 | // %res = fmul double %1, %2 |
| 3778 | // ret double %res |
| 3779 | // } |
| 3780 | // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1] |
| 3781 | auto ExtractCanFuseWithFmul = [&]() {
| 3782 | // We bail out if the extract is from lane 0. |
| 3783 | if (Index == 0) |
| 3784 | return false; |
| 3785 | |
| 3786 | // Check if the scalar element type of the vector operand of ExtractElement |
| 3787 | // instruction is one of the allowed types. |
| 3788 | auto IsAllowedScalarTy = [&](const Type *T) { |
| 3789 | return T->isFloatTy() || T->isDoubleTy() || |
| 3790 | (T->isHalfTy() && ST->hasFullFP16()); |
| 3791 | }; |
| 3792 | |
| 3793 | // Check if the extractelement user is scalar fmul. |
| 3794 | auto IsUserFMulScalarTy = [](const Value *EEUser) { |
| 3795 | // Check if the user is scalar fmul. |
| 3796 | const auto *BO = dyn_cast<BinaryOperator>(Val: EEUser); |
| 3797 | return BO && BO->getOpcode() == BinaryOperator::FMul && |
| 3798 | !BO->getType()->isVectorTy(); |
| 3799 | }; |
| 3800 | |
| 3801 | // Check if the extract index is from lane 0 or lane equivalent to 0 for a |
| 3802 | // certain scalar type and a certain vector register width. |
| 3803 | auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
| 3804 | auto RegWidth = |
| 3805 | getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector) |
| 3806 | .getFixedValue(); |
| 3807 | return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0); |
| 3808 | }; |
| 3809 | |
| 3810 | // Check if the type constraints on input vector type and result scalar type |
| 3811 | // of extractelement instruction are satisfied. |
| 3812 | if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType())) |
| 3813 | return false; |
| 3814 | |
| 3815 | if (Scalar) { |
| 3816 | DenseMap<User *, unsigned> UserToExtractIdx;
| 3817 | for (auto *U : Scalar->users()) { |
| 3818 | if (!IsUserFMulScalarTy(U)) |
| 3819 | return false; |
| 3820 | // Recording entry for the user is important. Index value is not |
| 3821 | // important. |
| 3822 | UserToExtractIdx[U]; |
| 3823 | } |
| 3824 | if (UserToExtractIdx.empty()) |
| 3825 | return false; |
| 3826 | for (auto &[S, U, L] : ScalarUserAndIdx) { |
| 3827 | for (auto *U : S->users()) { |
| 3828 | if (UserToExtractIdx.contains(Val: U)) { |
| 3829 | auto *FMul = cast<BinaryOperator>(Val: U); |
| 3830 | auto *Op0 = FMul->getOperand(i_nocapture: 0); |
| 3831 | auto *Op1 = FMul->getOperand(i_nocapture: 1); |
| 3832 | if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) { |
| 3833 | UserToExtractIdx[U] = L; |
| 3834 | break; |
| 3835 | } |
| 3836 | } |
| 3837 | } |
| 3838 | } |
| 3839 | for (auto &[U, L] : UserToExtractIdx) { |
| 3840 | if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) && |
| 3841 | !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits())) |
| 3842 | return false; |
| 3843 | } |
| 3844 | } else { |
| 3845 | const auto *EE = cast<ExtractElementInst>(Val: I); |
| 3846 | |
| 3847 | const auto *IdxOp = dyn_cast<ConstantInt>(Val: EE->getIndexOperand()); |
| 3848 | if (!IdxOp) |
| 3849 | return false; |
| 3850 | |
| 3851 | return !EE->users().empty() && all_of(Range: EE->users(), P: [&](const User *U) { |
| 3852 | if (!IsUserFMulScalarTy(U)) |
| 3853 | return false; |
| 3854 | |
| 3855 | // Check if the other operand of the fmul is also an extractelement
| 3856 | // from a lane equivalent to 0.
| 3857 | const auto *BO = cast<BinaryOperator>(Val: U); |
| 3858 | const auto *OtherEE = dyn_cast<ExtractElementInst>( |
| 3859 | Val: BO->getOperand(i_nocapture: 0) == EE ? BO->getOperand(i_nocapture: 1) : BO->getOperand(i_nocapture: 0)); |
| 3860 | if (OtherEE) { |
| 3861 | const auto *IdxOp = dyn_cast<ConstantInt>(Val: OtherEE->getIndexOperand()); |
| 3862 | if (!IdxOp) |
| 3863 | return false; |
| 3864 | return IsExtractLaneEquivalentToZero( |
| 3865 | cast<ConstantInt>(Val: OtherEE->getIndexOperand()) |
| 3866 | ->getValue() |
| 3867 | .getZExtValue(), |
| 3868 | OtherEE->getType()->getScalarSizeInBits()); |
| 3869 | } |
| 3870 | return true; |
| 3871 | }); |
| 3872 | } |
| 3873 | return true; |
| 3874 | }; |
| 3875 | |
| 3876 | if (Opcode == Instruction::ExtractElement && (I || Scalar) && |
| 3877 | ExtractCanFuseWithFmul()) |
| 3878 | return 0; |
| 3879 | |
| 3880 | // All other insert/extracts cost this much. |
| 3881 | return CostKind == TTI::TCK_CodeSize ? 1 |
| 3882 | : ST->getVectorInsertExtractBaseCost(); |
| 3883 | } |
| 3884 | |
| 3885 | InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, |
| 3886 | TTI::TargetCostKind CostKind, |
| 3887 | unsigned Index, |
| 3888 | const Value *Op0, |
| 3889 | const Value *Op1) const { |
| 3890 | bool HasRealUse = |
| 3891 | Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Val: Op0); |
| 3892 | return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse); |
| 3893 | } |
| 3894 | |
| 3895 | InstructionCost AArch64TTIImpl::getVectorInstrCost( |
| 3896 | unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, |
| 3897 | Value *Scalar, |
| 3898 | ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const { |
| 3899 | return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse: false, I: nullptr, |
| 3900 | Scalar, ScalarUserAndIdx); |
| 3901 | } |
| 3902 | |
| 3903 | InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I, |
| 3904 | Type *Val, |
| 3905 | TTI::TargetCostKind CostKind, |
| 3906 | unsigned Index) const { |
| 3907 | return getVectorInstrCostHelper(Opcode: I.getOpcode(), Val, CostKind, Index, |
| 3908 | HasRealUse: true /* HasRealUse */, I: &I); |
| 3909 | } |
| 3910 | |
| 3911 | InstructionCost AArch64TTIImpl::getScalarizationOverhead( |
| 3912 | VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
| 3913 | TTI::TargetCostKind CostKind, bool ForPoisonSrc, |
| 3914 | ArrayRef<Value *> VL) const { |
| 3915 | if (isa<ScalableVectorType>(Val: Ty)) |
| 3916 | return InstructionCost::getInvalid(); |
| 3917 | if (Ty->getElementType()->isFloatingPointTy()) |
| 3918 | return BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, Insert, Extract, |
| 3919 | CostKind); |
| 3920 | unsigned VecInstCost = |
| 3921 | CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost(); |
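// E.g. inserting four demanded i32 elements (Insert only) costs
// 4 * VecInstCost.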
| 3922 | return DemandedElts.popcount() * (Insert + Extract) * VecInstCost; |
| 3923 | } |
| 3924 | |
| 3925 | InstructionCost AArch64TTIImpl::getArithmeticInstrCost( |
| 3926 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
| 3927 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, |
| 3928 | ArrayRef<const Value *> Args, const Instruction *CxtI) const { |
| 3929 | |
| 3930 | // The code-generator is currently not able to handle scalable vectors |
| 3931 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
| 3932 | // it. This change will be removed when code-generation for these types is |
| 3933 | // sufficiently reliable. |
| 3934 | if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty)) |
| 3935 | if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
| 3936 | return InstructionCost::getInvalid(); |
| 3937 | |
| 3938 | // TODO: Handle more cost kinds. |
| 3939 | if (CostKind != TTI::TCK_RecipThroughput) |
| 3940 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, |
| 3941 | Opd2Info: Op2Info, Args, CxtI); |
| 3942 | |
| 3943 | // Legalize the type. |
| 3944 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
| 3945 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
| 3946 | |
| 3947 | switch (ISD) { |
| 3948 | default: |
| 3949 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, |
| 3950 | Opd2Info: Op2Info); |
| 3951 | case ISD::SREM: |
| 3952 | case ISD::SDIV: |
| 3953 | /* |
| 3954 | Notes for sdiv/srem specific costs: |
| 3955 | 1. This only considers the cases where the divisor is constant, uniform and |
| 3956 | (pow-of-2/non-pow-of-2). Other cases are not important since they either |
| 3957 | result in some form of (ldr + adrp), corresponding to constant vectors, or |
| 3958 | scalarization of the division operation. |
| 3959 | 2. Wholly or partially negative constant divisors don't result in
| 3960 | significantly different codegen compared to positive constant divisors,
| 3961 | so we don't consider negative divisors separately.
| 3962 | 3. If the codegen is significantly different with SVE, it has been indicated |
| 3963 | using comments at appropriate places. |
| 3964 | |
| 3965 | sdiv specific cases: |
| 3966 | ----------------------------------------------------------------------- |
| 3967 | codegen | pow-of-2 | Type |
| 3968 | ----------------------------------------------------------------------- |
| 3969 | add + cmp + csel + asr | Y | i64 |
| 3970 | add + cmp + csel + asr | Y | i32 |
| 3971 | ----------------------------------------------------------------------- |
| 3972 | |
| 3973 | srem specific cases: |
| 3974 | ----------------------------------------------------------------------- |
| 3975 | codegen | pow-of-2 | Type |
| 3976 | ----------------------------------------------------------------------- |
| 3977 | negs + and + and + csneg | Y | i64 |
| 3978 | negs + and + and + csneg | Y | i32 |
| 3979 | ----------------------------------------------------------------------- |
| 3980 | |
| 3981 | other sdiv/srem cases: |
| 3982 | ------------------------------------------------------------------------- |
| 3983 | common codegen | + srem | + sdiv | pow-of-2 | Type |
| 3984 | ------------------------------------------------------------------------- |
| 3985 | smulh + asr + add + add | - | - | N | i64 |
| 3986 | smull + lsr + add + add | - | - | N | i32 |
| 3987 | usra | and + sub | sshr | Y | <2 x i64> |
| 3988 | 2 * (scalar code) | - | - | N | <2 x i64> |
| 3989 | usra | bic + sub | sshr + neg | Y | <4 x i32> |
| 3990 | smull2 + smull + uzp2 | mls | - | N | <4 x i32> |
| 3991 | + sshr + usra | | | | |
| 3992 | ------------------------------------------------------------------------- |
| 3993 | */ |
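// For example (illustrative), a scalar sdiv i32 %x, 8 lowers to
// add + cmp + csel + asr, which is modelled as 3 * AddCost + AsrCost below.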
| 3994 | if (Op2Info.isConstant() && Op2Info.isUniform()) { |
| 3995 | InstructionCost AddCost = |
| 3996 | getArithmeticInstrCost(Opcode: Instruction::Add, Ty, CostKind, |
| 3997 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
| 3998 | InstructionCost AsrCost = |
| 3999 | getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind, |
| 4000 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
| 4001 | InstructionCost MulCost = |
| 4002 | getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind, |
| 4003 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
| 4004 | // add/cmp/csel/csneg should have similar costs, and likewise
| 4005 | // asr/negs/and.
| 4006 | auto VT = TLI->getValueType(DL, Ty); |
| 4007 | if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) { |
| 4008 | if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) { |
| 4009 | // Neg can be folded into the asr instruction. |
| 4010 | return ISD == ISD::SDIV ? (3 * AddCost + AsrCost) |
| 4011 | : (3 * AsrCost + AddCost); |
| 4012 | } else { |
| 4013 | return MulCost + AsrCost + 2 * AddCost; |
| 4014 | } |
| 4015 | } else if (VT.isVector()) { |
| 4016 | InstructionCost UsraCost = 2 * AsrCost; |
| 4017 | if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) { |
| 4018 | // Division with scalable types corresponds to native 'asrd' |
| 4019 | // instruction when SVE is available. |
| 4020 | // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8) |
| 4021 | |
| 4022 | // One more for the negation in SDIV |
| 4023 | InstructionCost Cost = |
| 4024 | (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0; |
| 4025 | if (Ty->isScalableTy() && ST->hasSVE()) |
| 4026 | Cost += 2 * AsrCost; |
| 4027 | else { |
| 4028 | Cost += |
| 4029 | UsraCost + |
| 4030 | (ISD == ISD::SDIV |
| 4031 | ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost |
| 4032 | : 2 * AddCost); |
| 4033 | } |
| 4034 | return Cost; |
| 4035 | } else if (LT.second == MVT::v2i64) { |
| 4036 | return VT.getVectorNumElements() * |
| 4037 | getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind, |
| 4038 | Op1Info: Op1Info.getNoProps(), |
| 4039 | Op2Info: Op2Info.getNoProps()); |
| 4040 | } else { |
| 4041 | // When SVE is available, we get: |
| 4042 | // smulh + lsr + add/sub + asr + add/sub. |
| 4043 | if (Ty->isScalableTy() && ST->hasSVE()) |
| 4044 | return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost; |
| 4045 | return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost; |
| 4046 | } |
| 4047 | } |
| 4048 | } |
| 4049 | if (Op2Info.isConstant() && !Op2Info.isUniform() && |
| 4050 | LT.second.isFixedLengthVector()) { |
| 4051 | // FIXME: When the constant vector is non-uniform, this may result in |
| 4052 | // loading the vector from constant pool or in some cases, may also result |
| 4053 | // in scalarization. For now, we are approximating this with the |
| 4054 | // scalarization cost. |
| 4055 | auto ExtractCost = 2 * getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty,
| 4056 | CostKind, Index: -1, Op0: nullptr, Op1: nullptr); |
| 4057 | auto InsertCost = getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty, |
| 4058 | CostKind, Index: -1, Op0: nullptr, Op1: nullptr); |
| 4059 | unsigned NElts = cast<FixedVectorType>(Val: Ty)->getNumElements(); |
| 4060 | return ExtractCost + InsertCost + |
| 4061 | NElts * getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), |
| 4062 | CostKind, Op1Info: Op1Info.getNoProps(), |
| 4063 | Op2Info: Op2Info.getNoProps()); |
| 4064 | } |
| 4065 | [[fallthrough]]; |
| 4066 | case ISD::UDIV: |
| 4067 | case ISD::UREM: { |
| 4068 | auto VT = TLI->getValueType(DL, Ty); |
| 4069 | if (Op2Info.isConstant()) { |
| 4070 | // If the operand is a power of 2 we can use the shift or and cost. |
| 4071 | if (ISD == ISD::UDIV && Op2Info.isPowerOf2()) |
| 4072 | return getArithmeticInstrCost(Opcode: Instruction::LShr, Ty, CostKind, |
| 4073 | Op1Info: Op1Info.getNoProps(), |
| 4074 | Op2Info: Op2Info.getNoProps()); |
| 4075 | if (ISD == ISD::UREM && Op2Info.isPowerOf2()) |
| 4076 | return getArithmeticInstrCost(Opcode: Instruction::And, Ty, CostKind, |
| 4077 | Op1Info: Op1Info.getNoProps(), |
| 4078 | Op2Info: Op2Info.getNoProps()); |
| 4079 | |
| 4080 | if (ISD == ISD::UDIV || ISD == ISD::UREM) { |
| 4081 | // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL. |
| 4082 | // The MULHU will be expanded to UMULL for the types not listed below, |
| 4083 | // and will become a pair of UMULL+UMULL2 for 128-bit vectors.
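// For example (illustrative), udiv <4 x i32> %x, splat (i32 7) uses a
// umull/umull2 pair for the MULHU plus the add and shift steps accounted for
// in DivCost below.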
| 4084 | bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 || |
| 4085 | LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 || |
| 4086 | LT.second == MVT::nxv16i8; |
| 4087 | bool Is128bit = LT.second.is128BitVector(); |
| 4088 | |
| 4089 | InstructionCost MulCost = |
| 4090 | getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind, |
| 4091 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
| 4092 | InstructionCost AddCost = |
| 4093 | getArithmeticInstrCost(Opcode: Instruction::Add, Ty, CostKind, |
| 4094 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
| 4095 | InstructionCost ShrCost = |
| 4096 | getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind, |
| 4097 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
| 4098 | InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH |
| 4099 | (HasMULH ? 0 : ShrCost) + // UMULL shift |
| 4100 | AddCost * 2 + ShrCost; |
| 4101 | return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0); |
| 4102 | } |
| 4103 | } |
| 4104 | |
| 4105 | // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are |
| 4106 | // emitted by the backend even when those functions are not declared in the |
| 4107 | // module. |
| 4108 | if (!VT.isVector() && VT.getSizeInBits() > 64) |
| 4109 | return getCallInstrCost(/*Function*/ F: nullptr, RetTy: Ty, Tys: {Ty, Ty}, CostKind); |
| 4110 | |
| 4111 | InstructionCost Cost = BaseT::getArithmeticInstrCost( |
| 4112 | Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info); |
| 4113 | if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) { |
| 4114 | if (TLI->isOperationLegalOrCustom(Op: ISD, VT: LT.second) && ST->hasSVE()) { |
| 4115 | // SDIV/UDIV operations are lowered using SVE, so the cost can be
| 4116 | // lower.
| 4117 | if (isa<FixedVectorType>(Val: Ty) && cast<FixedVectorType>(Val: Ty) |
| 4118 | ->getPrimitiveSizeInBits() |
| 4119 | .getFixedValue() < 128) { |
| 4120 | EVT VT = TLI->getValueType(DL, Ty); |
| 4121 | static const CostTblEntry DivTbl[]{ |
| 4122 | {.ISD: ISD::SDIV, .Type: MVT::v2i8, .Cost: 5}, {.ISD: ISD::SDIV, .Type: MVT::v4i8, .Cost: 8}, |
| 4123 | {.ISD: ISD::SDIV, .Type: MVT::v8i8, .Cost: 8}, {.ISD: ISD::SDIV, .Type: MVT::v2i16, .Cost: 5}, |
| 4124 | {.ISD: ISD::SDIV, .Type: MVT::v4i16, .Cost: 5}, {.ISD: ISD::SDIV, .Type: MVT::v2i32, .Cost: 1}, |
| 4125 | {.ISD: ISD::UDIV, .Type: MVT::v2i8, .Cost: 5}, {.ISD: ISD::UDIV, .Type: MVT::v4i8, .Cost: 8}, |
| 4126 | {.ISD: ISD::UDIV, .Type: MVT::v8i8, .Cost: 8}, {.ISD: ISD::UDIV, .Type: MVT::v2i16, .Cost: 5}, |
| 4127 | {.ISD: ISD::UDIV, .Type: MVT::v4i16, .Cost: 5}, {.ISD: ISD::UDIV, .Type: MVT::v2i32, .Cost: 1}}; |
| 4128 | |
| 4129 | const auto *Entry = CostTableLookup(Table: DivTbl, ISD, Ty: VT.getSimpleVT()); |
| 4130 | if (nullptr != Entry) |
| 4131 | return Entry->Cost; |
| 4132 | } |
| 4133 | // For 8/16-bit elements, the cost is higher because the type |
| 4134 | // requires promotion and possibly splitting: |
| 4135 | if (LT.second.getScalarType() == MVT::i8) |
| 4136 | Cost *= 8; |
| 4137 | else if (LT.second.getScalarType() == MVT::i16) |
| 4138 | Cost *= 4; |
| 4139 | return Cost; |
| 4140 | } else { |
| 4141 | // If one of the operands is a uniform constant then the cost for each
| 4142 | // element is the cost of insertion, extraction and the division.
| 4143 | // Insertion cost = 2, extraction cost = 2, division = cost of the
| 4144 | // operation with the scalar type.
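// That is, (2 + 2 + DivCost) per element, which gives the
// (4 + DivCost) * NumElements expression below.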
| 4145 | if ((Op1Info.isConstant() && Op1Info.isUniform()) || |
| 4146 | (Op2Info.isConstant() && Op2Info.isUniform())) { |
| 4147 | if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) { |
| 4148 | InstructionCost DivCost = BaseT::getArithmeticInstrCost( |
| 4149 | Opcode, Ty: Ty->getScalarType(), CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info); |
| 4150 | return (4 + DivCost) * VTy->getNumElements(); |
| 4151 | } |
| 4152 | } |
| 4153 | // On AArch64, without SVE, vector divisions are expanded |
| 4154 | // into scalar divisions of each pair of elements. |
| 4155 | Cost += getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, CostKind, |
| 4156 | Index: -1, Op0: nullptr, Op1: nullptr); |
| 4157 | Cost += getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty, CostKind, Index: -1, |
| 4158 | Op0: nullptr, Op1: nullptr); |
| 4159 | } |
| 4160 | |
| 4161 | // TODO: if one of the arguments is scalar, then it's not necessary to |
| 4162 | // double the cost of handling the vector elements. |
| 4163 | Cost += Cost; |
| 4164 | } |
| 4165 | return Cost; |
| 4166 | } |
| 4167 | case ISD::MUL: |
| 4168 | // When SVE is available, we can lower the v2i64 operation using
| 4169 | // the SVE mul instruction, which has a lower cost.
| 4170 | if (LT.second == MVT::v2i64 && ST->hasSVE()) |
| 4171 | return LT.first; |
| 4172 | |
| 4173 | // When SVE is not available, there is no MUL.2d instruction, |
| 4174 | // which means mul <2 x i64> is expensive as elements are extracted |
| 4175 | // from the vectors and the muls scalarized. |
| 4176 | // As getScalarizationOverhead is a bit too pessimistic, we |
| 4177 | // estimate the cost for an i64 vector directly here, which is:
| 4178 | // - four 2-cost i64 extracts, |
| 4179 | // - two 2-cost i64 inserts, and |
| 4180 | // - two 1-cost muls. |
| 4181 | // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
| 4182 | // LT.first = 2 the cost is 28. If both operands are extensions it will not
| 4183 | // need to scalarize so the cost can be cheaper (smull or umull).
| 4185 | if (LT.second != MVT::v2i64 || isWideningInstruction(DstTy: Ty, Opcode, Args)) |
| 4186 | return LT.first; |
| 4187 | return cast<VectorType>(Val: Ty)->getElementCount().getKnownMinValue() * |
| 4188 | (getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind) + |
| 4189 | getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, CostKind, Index: -1, |
| 4190 | Op0: nullptr, Op1: nullptr) * |
| 4191 | 2 + |
| 4192 | getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty, CostKind, Index: -1, |
| 4193 | Op0: nullptr, Op1: nullptr)); |
| 4194 | case ISD::ADD: |
| 4195 | case ISD::XOR: |
| 4196 | case ISD::OR: |
| 4197 | case ISD::AND: |
| 4198 | case ISD::SRL: |
| 4199 | case ISD::SRA: |
| 4200 | case ISD::SHL: |
| 4201 | // These nodes are marked as 'custom' for combining purposes only. |
| 4202 | // We know that they are legal. See LowerAdd in ISelLowering. |
| 4203 | return LT.first; |
| 4204 | |
| 4205 | case ISD::FNEG: |
| 4206 | // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul |
| 4207 | if ((Ty->isFloatTy() || Ty->isDoubleTy() || |
| 4208 | (Ty->isHalfTy() && ST->hasFullFP16())) && |
| 4209 | CxtI && |
| 4210 | ((CxtI->hasOneUse() && |
| 4211 | match(V: *CxtI->user_begin(), P: m_FMul(L: m_Value(), R: m_Value()))) || |
| 4212 | match(V: CxtI->getOperand(i: 0), P: m_FMul(L: m_Value(), R: m_Value())))) |
| 4213 | return 0; |
| 4214 | [[fallthrough]]; |
| 4215 | case ISD::FADD: |
| 4216 | case ISD::FSUB: |
| 4217 | // Increase the cost for half and bfloat types if not architecturally |
| 4218 | // supported. |
| 4219 | if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) || |
| 4220 | (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16())) |
| 4221 | return 2 * LT.first; |
| 4222 | if (!Ty->getScalarType()->isFP128Ty()) |
| 4223 | return LT.first; |
| 4224 | [[fallthrough]]; |
| 4225 | case ISD::FMUL: |
| 4226 | case ISD::FDIV: |
| 4227 | // These nodes are marked as 'custom' just to lower them to SVE. |
| 4228 | // We know said lowering will incur no additional cost. |
| 4229 | if (!Ty->getScalarType()->isFP128Ty()) |
| 4230 | return 2 * LT.first; |
| 4231 | |
| 4232 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, |
| 4233 | Opd2Info: Op2Info); |
| 4234 | case ISD::FREM: |
| 4235 | // Pass nullptr as fmod/fmodf calls are emitted by the backend even when |
| 4236 | // those functions are not declared in the module. |
| 4237 | if (!Ty->isVectorTy()) |
| 4238 | return getCallInstrCost(/*Function*/ F: nullptr, RetTy: Ty, Tys: {Ty, Ty}, CostKind); |
| 4239 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, |
| 4240 | Opd2Info: Op2Info); |
| 4241 | } |
| 4242 | } |
| 4243 | |
| 4244 | InstructionCost |
| 4245 | AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, |
| 4246 | const SCEV *Ptr) const { |
| 4247 | // Address computations in vectorized code with non-consecutive addresses will |
| 4248 | // likely result in more instructions compared to scalar code where the |
| 4249 | // computation can more often be merged into the index mode. The resulting |
| 4250 | // extra micro-ops can significantly decrease throughput. |
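// For example, a strided access such as a[3 * i] in vectorized code needs
// explicit index arithmetic that cannot be folded into the addressing mode
// (illustrative).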
| 4251 | unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead; |
| 4252 | int MaxMergeDistance = 64; |
| 4253 | |
| 4254 | if (Ty->isVectorTy() && SE && |
| 4255 | !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MergeDistance: MaxMergeDistance + 1)) |
| 4256 | return NumVectorInstToHideOverhead; |
| 4257 | |
| 4258 | // In many cases the address computation is not merged into the instruction |
| 4259 | // addressing mode. |
| 4260 | return 1; |
| 4261 | } |
| 4262 | |
| 4263 | InstructionCost AArch64TTIImpl::getCmpSelInstrCost( |
| 4264 | unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, |
| 4265 | TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, |
| 4266 | TTI::OperandValueInfo Op2Info, const Instruction *I) const { |
| 4267 | // TODO: Handle other cost kinds. |
| 4268 | if (CostKind != TTI::TCK_RecipThroughput) |
| 4269 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
| 4270 | Op1Info, Op2Info, I); |
| 4271 | |
| 4272 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
| 4273 | // Some vector selects that are wider than the register width are not
| 4274 | // lowered well.
| 4275 | if (isa<FixedVectorType>(Val: ValTy) && ISD == ISD::SELECT) { |
| 4276 | // We would need this many instructions to hide the scalarization happening. |
| 4277 | const int AmortizationCost = 20; |
| 4278 | |
| 4279 | // If VecPred is not set, check if we can get a predicate from the context |
| 4280 | // instruction, if its type matches the requested ValTy. |
| 4281 | if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) { |
| 4282 | CmpPredicate CurrentPred; |
| 4283 | if (match(V: I, P: m_Select(C: m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value()), L: m_Value(), |
| 4284 | R: m_Value()))) |
| 4285 | VecPred = CurrentPred; |
| 4286 | } |
| 4287 | // Check if we have a compare/select chain that can be lowered using |
| 4288 | // a (F)CMxx & BFI pair. |
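// E.g. a select of <4 x float> values on an fcmp result can lower to
// fcmgt + bsl (illustrative).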
| 4289 | if (CmpInst::isIntPredicate(P: VecPred) || VecPred == CmpInst::FCMP_OLE || |
| 4290 | VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT || |
| 4291 | VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ || |
| 4292 | VecPred == CmpInst::FCMP_UNE) { |
| 4293 | static const auto ValidMinMaxTys = { |
| 4294 | MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, |
| 4295 | MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64}; |
| 4296 | static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16}; |
| 4297 | |
| 4298 | auto LT = getTypeLegalizationCost(Ty: ValTy); |
| 4299 | if (any_of(Range: ValidMinMaxTys, P: [&LT](MVT M) { return M == LT.second; }) ||
| 4300 | (ST->hasFullFP16() && |
| 4301 | any_of(Range: ValidFP16MinMaxTys, P: [&LT](MVT M) { return M == LT.second; })))
| 4302 | return LT.first; |
| 4303 | } |
| 4304 | |
| 4305 | static const TypeConversionCostTblEntry |
| 4306 | VectorSelectTbl[] = { |
| 4307 | { .ISD: ISD::SELECT, .Dst: MVT::v2i1, .Src: MVT::v2f32, .Cost: 2 }, |
| 4308 | { .ISD: ISD::SELECT, .Dst: MVT::v2i1, .Src: MVT::v2f64, .Cost: 2 }, |
| 4309 | { .ISD: ISD::SELECT, .Dst: MVT::v4i1, .Src: MVT::v4f32, .Cost: 2 }, |
| 4310 | { .ISD: ISD::SELECT, .Dst: MVT::v4i1, .Src: MVT::v4f16, .Cost: 2 }, |
| 4311 | { .ISD: ISD::SELECT, .Dst: MVT::v8i1, .Src: MVT::v8f16, .Cost: 2 }, |
| 4312 | { .ISD: ISD::SELECT, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: 16 }, |
| 4313 | { .ISD: ISD::SELECT, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: 8 }, |
| 4314 | { .ISD: ISD::SELECT, .Dst: MVT::v16i1, .Src: MVT::v16i32, .Cost: 16 }, |
| 4315 | { .ISD: ISD::SELECT, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: 4 * AmortizationCost }, |
| 4316 | { .ISD: ISD::SELECT, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: 8 * AmortizationCost }, |
| 4317 | { .ISD: ISD::SELECT, .Dst: MVT::v16i1, .Src: MVT::v16i64, .Cost: 16 * AmortizationCost } |
| 4318 | }; |
| 4319 | |
| 4320 | EVT SelCondTy = TLI->getValueType(DL, Ty: CondTy); |
| 4321 | EVT SelValTy = TLI->getValueType(DL, Ty: ValTy); |
| 4322 | if (SelCondTy.isSimple() && SelValTy.isSimple()) { |
| 4323 | if (const auto *Entry = ConvertCostTableLookup(Table: VectorSelectTbl, ISD, |
| 4324 | Dst: SelCondTy.getSimpleVT(), |
| 4325 | Src: SelValTy.getSimpleVT())) |
| 4326 | return Entry->Cost; |
| 4327 | } |
| 4328 | } |
| 4329 | |
| 4330 | if (isa<FixedVectorType>(Val: ValTy) && ISD == ISD::SETCC) { |
| 4331 | Type *ValScalarTy = ValTy->getScalarType(); |
| 4332 | if ((ValScalarTy->isHalfTy() && !ST->hasFullFP16()) || |
| 4333 | ValScalarTy->isBFloatTy()) { |
| 4334 | auto *ValVTy = cast<FixedVectorType>(Val: ValTy); |
| 4335 | |
| 4336 | // Without dedicated instructions we promote [b]f16 compares to f32. |
| 4337 | auto *PromotedTy = |
| 4338 | VectorType::get(ElementType: Type::getFloatTy(C&: ValTy->getContext()), Other: ValVTy); |
| 4339 | |
| 4340 | InstructionCost Cost = 0; |
| 4341 | // Promote operands to float vectors. |
| 4342 | Cost += 2 * getCastInstrCost(Opcode: Instruction::FPExt, Dst: PromotedTy, Src: ValTy, |
| 4343 | CCH: TTI::CastContextHint::None, CostKind); |
| 4344 | // Compare float vectors. |
| 4345 | Cost += getCmpSelInstrCost(Opcode, ValTy: PromotedTy, CondTy, VecPred, CostKind, |
| 4346 | Op1Info, Op2Info); |
| 4347 | // During codegen we'll truncate the vector result from i32 to i16. |
| 4348 | Cost += |
| 4349 | getCastInstrCost(Opcode: Instruction::Trunc, Dst: VectorType::getInteger(VTy: ValVTy), |
| 4350 | Src: VectorType::getInteger(VTy: PromotedTy), |
| 4351 | CCH: TTI::CastContextHint::None, CostKind); |
| 4352 | return Cost; |
| 4353 | } |
| 4354 | } |
| 4355 | |
| 4356 | // Treat the icmp in icmp(and, 0), or in icmp(and, -1/1) when it can be
| 4357 | // folded to icmp(and, 0), as free, because we can make use of ands, but
| 4358 | // only if the comparison is not unsigned.
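// E.g. (icmp eq (and %a, %b), 0) can be lowered to a single flag-setting
// ands/tst, making the icmp itself free (illustrative).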
| 4359 | if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I && |
| 4360 | !CmpInst::isUnsigned(predicate: VecPred) && |
| 4361 | TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: ValTy)) && |
| 4362 | match(V: I->getOperand(i: 0), P: m_And(L: m_Value(), R: m_Value()))) { |
| 4363 | if (match(V: I->getOperand(i: 1), P: m_Zero())) |
| 4364 | return 0; |
| 4365 | |
| 4366 | // x >= 1 / x < 1 -> x > 0 / x <= 0 |
| 4367 | if (match(V: I->getOperand(i: 1), P: m_One()) && |
| 4368 | (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE)) |
| 4369 | return 0; |
| 4370 | |
| 4371 | // x <= -1 / x > -1 -> x > 0 / x <= 0 |
| 4372 | if (match(V: I->getOperand(i: 1), P: m_AllOnes()) && |
| 4373 | (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT)) |
| 4374 | return 0; |
| 4375 | } |
| 4376 | |
| 4377 | // The base case handles scalable vectors fine for now, since it treats the |
| 4378 | // cost as 1 * legalization cost. |
| 4379 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
| 4380 | Op1Info, Op2Info, I); |
| 4381 | } |
| 4382 | |
| 4383 | AArch64TTIImpl::TTI::MemCmpExpansionOptions |
| 4384 | AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { |
| 4385 | TTI::MemCmpExpansionOptions Options; |
| 4386 | if (ST->requiresStrictAlign()) { |
| 4387 | // TODO: Add cost modeling for strict align. Misaligned loads expand to |
| 4388 | // a bunch of instructions when strict align is enabled. |
| 4389 | return Options; |
| 4390 | } |
| 4391 | Options.AllowOverlappingLoads = true; |
| 4392 | Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); |
| 4393 | Options.NumLoadsPerBlock = Options.MaxNumLoads; |
| 4394 | // TODO: Though vector loads usually perform well on AArch64, in some targets |
| 4395 | // they may wake up the FP unit, which raises the power consumption. Perhaps |
| 4396 | // they could be used with no holds barred (-O3). |
| 4397 | Options.LoadSizes = {8, 4, 2, 1}; |
| 4398 | Options.AllowedTailExpansions = {3, 5, 6}; |
| 4399 | return Options; |
| 4400 | } |
| 4401 | |
| 4402 | bool AArch64TTIImpl::prefersVectorizedAddressing() const { |
| 4403 | return ST->hasSVE(); |
| 4404 | } |
| 4405 | |
| 4406 | InstructionCost |
| 4407 | AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, |
| 4408 | Align Alignment, unsigned AddressSpace, |
| 4409 | TTI::TargetCostKind CostKind) const { |
| 4410 | if (useNeonVector(Ty: Src)) |
| 4411 | return BaseT::getMaskedMemoryOpCost(Opcode, DataTy: Src, Alignment, AddressSpace, |
| 4412 | CostKind); |
| 4413 | auto LT = getTypeLegalizationCost(Ty: Src); |
| 4414 | if (!LT.first.isValid()) |
| 4415 | return InstructionCost::getInvalid(); |
| 4416 | |
| 4417 | // Return an invalid cost for element types that we are unable to lower. |
| 4418 | auto *VT = cast<VectorType>(Val: Src); |
| 4419 | if (VT->getElementType()->isIntegerTy(Bitwidth: 1)) |
| 4420 | return InstructionCost::getInvalid(); |
| 4421 | |
| 4422 | // The code-generator is currently not able to handle scalable vectors |
| 4423 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
| 4424 | // it. This change will be removed when code-generation for these types is |
| 4425 | // sufficiently reliable. |
| 4426 | if (VT->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
| 4427 | return InstructionCost::getInvalid(); |
| 4428 | |
| 4429 | return LT.first; |
| 4430 | } |
| 4431 | |
| 4432 | // This function returns the gather/scatter overhead, either from the
| 4433 | // user-provided value or from the per-target value in \p ST.
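// The returned value is used below as a multiplier on the per-element
// memory-op cost in getGatherScatterOpCost.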
| 4434 | static unsigned getSVEGatherScatterOverhead(unsigned Opcode, |
| 4435 | const AArch64Subtarget *ST) { |
| 4436 | assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && |
| 4437 | "Should be called on only load or stores." ); |
| 4438 | switch (Opcode) { |
| 4439 | case Instruction::Load: |
| 4440 | if (SVEGatherOverhead.getNumOccurrences() > 0) |
| 4441 | return SVEGatherOverhead; |
| 4442 | return ST->getGatherOverhead(); |
| 4444 | case Instruction::Store: |
| 4445 | if (SVEScatterOverhead.getNumOccurrences() > 0) |
| 4446 | return SVEScatterOverhead; |
| 4447 | return ST->getScatterOverhead(); |
| 4449 | default: |
| 4450 | llvm_unreachable("Shouldn't have reached here" ); |
| 4451 | } |
| 4452 | } |
| 4453 | |
| 4454 | InstructionCost AArch64TTIImpl::getGatherScatterOpCost( |
| 4455 | unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, |
| 4456 | Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const { |
| 4457 | if (useNeonVector(Ty: DataTy) || !isLegalMaskedGatherScatter(DataType: DataTy)) |
| 4458 | return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, |
| 4459 | Alignment, CostKind, I); |
| 4460 | auto *VT = cast<VectorType>(Val: DataTy); |
| 4461 | auto LT = getTypeLegalizationCost(Ty: DataTy); |
| 4462 | if (!LT.first.isValid()) |
| 4463 | return InstructionCost::getInvalid(); |
| 4464 | |
| 4465 | // Return an invalid cost for element types that we are unable to lower. |
| 4466 | if (!LT.second.isVector() || |
| 4467 | !isElementTypeLegalForScalableVector(Ty: VT->getElementType()) || |
| 4468 | VT->getElementType()->isIntegerTy(Bitwidth: 1)) |
| 4469 | return InstructionCost::getInvalid(); |
| 4470 | |
| 4471 | // The code-generator is currently not able to handle scalable vectors |
| 4472 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
| 4473 | // it. This change will be removed when code-generation for these types is |
| 4474 | // sufficiently reliable. |
| 4475 | if (VT->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
| 4476 | return InstructionCost::getInvalid(); |
| 4477 | |
| 4478 | ElementCount LegalVF = LT.second.getVectorElementCount(); |
| 4479 | InstructionCost MemOpCost = |
| 4480 | getMemoryOpCost(Opcode, Src: VT->getElementType(), Alignment, AddressSpace: 0, CostKind, |
| 4481 | OpInfo: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I); |
| 4482 | // Add on an overhead cost for using gathers/scatters. |
| 4483 | MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST); |
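|  | // E.g. a <vscale x 4 x i32> gather is costed as the scalar load cost times |
|  | // the gather overhead, scaled by the maximum number of i32 lanes per vector. |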
| 4484 | return LT.first * MemOpCost * getMaxNumElements(VF: LegalVF); |
| 4485 | } |
| 4486 | |
| 4487 | bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { |
| 4488 | return isa<FixedVectorType>(Val: Ty) && !ST->useSVEForFixedLengthVectors(); |
| 4489 | } |
| 4490 | |
| 4491 | InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, |
| 4492 | Align Alignment, |
| 4493 | unsigned AddressSpace, |
| 4494 | TTI::TargetCostKind CostKind, |
| 4495 | TTI::OperandValueInfo OpInfo, |
| 4496 | const Instruction *I) const { |
| 4497 | EVT VT = TLI->getValueType(DL, Ty, AllowUnknown: true); |
| 4498 | // Type legalization can't handle structs |
| 4499 | if (VT == MVT::Other) |
| 4500 | return BaseT::getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace, |
| 4501 | CostKind); |
| 4502 | |
| 4503 | auto LT = getTypeLegalizationCost(Ty); |
| 4504 | if (!LT.first.isValid()) |
| 4505 | return InstructionCost::getInvalid(); |
| 4506 | |
| 4507 | // The code-generator is currently not able to handle scalable vectors |
| 4508 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
| 4509 | // it. This change will be removed when code-generation for these types is |
| 4510 | // sufficiently reliable. |
| 4511 | // We also only support full register predicate loads and stores. |
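|  | // E.g. a <vscale x 16 x i1> access (one whole predicate register) is costed |
|  | // normally, while <vscale x 8 x i1> is rejected. |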
| 4512 | if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty)) |
| 4513 | if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1) || |
| 4514 | (VTy->getElementType()->isIntegerTy(Bitwidth: 1) && |
| 4515 | !VTy->getElementCount().isKnownMultipleOf( |
| 4516 | RHS: ElementCount::getScalable(MinVal: 16)))) |
| 4517 | return InstructionCost::getInvalid(); |
| 4518 | |
| 4519 | // TODO: consider latency as well for TCK_SizeAndLatency. |
| 4520 | if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) |
| 4521 | return LT.first; |
| 4522 | |
| 4523 | if (CostKind != TTI::TCK_RecipThroughput) |
| 4524 | return 1; |
| 4525 | |
| 4526 | if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && |
| 4527 | LT.second.is128BitVector() && Alignment < Align(16)) { |
| 4528 | // Unaligned stores are extremely inefficient. We don't split all |
| 4529 | // unaligned 128-bit stores because of the negative impact that has been |
| 4530 | // shown in practice on inlined block copy code. |
| 4531 | // We make such stores expensive so that we will only vectorize if there |
| 4532 | // are 6 other instructions getting vectorized. |
| 4533 | const int AmortizationCost = 6; |
| 4534 | |
| 4535 | return LT.first * 2 * AmortizationCost; |
| 4536 | } |
| 4537 | |
| 4538 | // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs. |
| 4539 | if (Ty->isPtrOrPtrVectorTy()) |
| 4540 | return LT.first; |
| 4541 | |
| 4542 | if (useNeonVector(Ty)) { |
| 4543 | // Check truncating stores and extending loads. |
| 4544 | if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) { |
| 4545 | // v4i8 types are lowered to a scalar load/store and sshll/xtn. |
| 4546 | if (VT == MVT::v4i8) |
| 4547 | return 2; |
| 4548 | // Otherwise we need to scalarize. |
| 4549 | return cast<FixedVectorType>(Val: Ty)->getNumElements() * 2; |
| 4550 | } |
| 4551 | EVT EltVT = VT.getVectorElementType(); |
| 4552 | unsigned EltSize = EltVT.getScalarSizeInBits(); |
| 4553 | if (!isPowerOf2_32(Value: EltSize) || EltSize < 8 || EltSize > 64 || |
| 4554 | VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1)) |
| 4555 | return LT.first; |
| 4556 | // FIXME: v3i8 lowering currently is very inefficient, due to automatic |
| 4557 | // widening to v4i8, which produces suboptimal results. |
| 4558 | if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8) |
| 4559 | return LT.first; |
| 4560 | |
| 4561 | // Check non-power-of-2 loads/stores for legal vector element types with |
| 4562 | // NEON. Non-power-of-2 memory ops will get broken down to a set of |
| 4563 | // operations on smaller power-of-2 ops, including ld1/st1. |
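|  | // E.g. an align(1) v7i16 access decomposes into v4i16 + v2i16 + v1i16 |
|  | // pieces, for a total cost of 3. |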
| 4564 | LLVMContext &C = Ty->getContext(); |
| 4565 | InstructionCost Cost(0); |
| 4566 | SmallVector<EVT> TypeWorklist; |
| 4567 | TypeWorklist.push_back(Elt: VT); |
| 4568 | while (!TypeWorklist.empty()) { |
| 4569 | EVT CurrVT = TypeWorklist.pop_back_val(); |
| 4570 | unsigned CurrNumElements = CurrVT.getVectorNumElements(); |
| 4571 | if (isPowerOf2_32(Value: CurrNumElements)) { |
| 4572 | Cost += 1; |
| 4573 | continue; |
| 4574 | } |
| 4575 | |
| 4576 | unsigned PrevPow2 = NextPowerOf2(A: CurrNumElements) / 2; |
| 4577 | TypeWorklist.push_back(Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: PrevPow2)); |
| 4578 | TypeWorklist.push_back( |
| 4579 | Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: CurrNumElements - PrevPow2)); |
| 4580 | } |
| 4581 | return Cost; |
| 4582 | } |
| 4583 | |
| 4584 | return LT.first; |
| 4585 | } |
| 4586 | |
| 4587 | InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( |
| 4588 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, |
| 4589 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, |
| 4590 | bool UseMaskForCond, bool UseMaskForGaps) const { |
| 4591 | assert(Factor >= 2 && "Invalid interleave factor" ); |
| 4592 | auto *VecVTy = cast<VectorType>(Val: VecTy); |
| 4593 | |
| 4594 | if (VecTy->isScalableTy() && !ST->hasSVE()) |
| 4595 | return InstructionCost::getInvalid(); |
| 4596 | |
| 4597 | // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we |
| 4598 | // only have lowering for power-of-2 factors. |
| 4599 | // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in |
| 4600 | // InterleavedAccessPass for ld3/st3 |
| 4601 | if (VecTy->isScalableTy() && !isPowerOf2_32(Value: Factor)) |
| 4602 | return InstructionCost::getInvalid(); |
| 4603 | |
| 4604 | // Vectorization for masked interleaved accesses is only enabled for scalable |
| 4605 | // VF. |
| 4606 | if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps)) |
| 4607 | return InstructionCost::getInvalid(); |
| 4608 | |
| 4609 | if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) { |
| 4610 | unsigned MinElts = VecVTy->getElementCount().getKnownMinValue(); |
| 4611 | auto *SubVecTy = |
| 4612 | VectorType::get(ElementType: VecVTy->getElementType(), |
| 4613 | EC: VecVTy->getElementCount().divideCoefficientBy(RHS: Factor)); |
| 4614 | |
| 4615 | // ldN/stN only support legal vector types of size 64 or 128 in bits. |
| 4616 | // Accesses having vector types that are a multiple of 128 bits can be |
| 4617 | // matched to more than one ldN/stN instruction. |
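|  | // E.g. a factor-2 interleave over <16 x i32> uses <8 x i32> sub-vectors, |
|  | // which map to two ld2/st2 operations, so the returned cost is 2 * 2 = 4. |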
| 4618 | bool UseScalable; |
| 4619 | if (MinElts % Factor == 0 && |
| 4620 | TLI->isLegalInterleavedAccessType(VecTy: SubVecTy, DL, UseScalable)) |
| 4621 | return Factor * TLI->getNumInterleavedAccesses(VecTy: SubVecTy, DL, UseScalable); |
| 4622 | } |
| 4623 | |
| 4624 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
| 4625 | Alignment, AddressSpace, CostKind, |
| 4626 | UseMaskForCond, UseMaskForGaps); |
| 4627 | } |
| 4628 | |
| 4629 | InstructionCost |
| 4630 | AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const { |
| 4631 | InstructionCost Cost = 0; |
| 4632 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
| 4633 | for (auto *I : Tys) { |
| 4634 | if (!I->isVectorTy()) |
| 4635 | continue; |
| 4636 | if (I->getScalarSizeInBits() * cast<FixedVectorType>(Val: I)->getNumElements() == |
| 4637 | 128) |
| 4638 | Cost += getMemoryOpCost(Opcode: Instruction::Store, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind) + |
| 4639 | getMemoryOpCost(Opcode: Instruction::Load, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind); |
| 4640 | } |
| 4641 | return Cost; |
| 4642 | } |
| 4643 | |
| 4644 | unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) const { |
| 4645 | return ST->getMaxInterleaveFactor(); |
| 4646 | } |
| 4647 | |
| 4648 | // For Falkor, we want to avoid having too many strided loads in a loop since |
| 4649 | // that can exhaust the HW prefetcher resources. We adjust the unroller |
| 4650 | // MaxCount preference below to attempt to ensure unrolling doesn't create too |
| 4651 | // many strided loads. |
| 4652 | static void |
| 4653 | getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, |
| 4654 | TargetTransformInfo::UnrollingPreferences &UP) { |
| 4655 | enum { MaxStridedLoads = 7 }; |
| 4656 | auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { |
| 4657 | int StridedLoads = 0; |
| 4658 | // FIXME? We could make this more precise by looking at the CFG and |
| 4659 | // e.g. not counting loads on each side of an if-then-else diamond. |
| 4660 | for (const auto BB : L->blocks()) { |
| 4661 | for (auto &I : *BB) { |
| 4662 | LoadInst *LMemI = dyn_cast<LoadInst>(Val: &I); |
| 4663 | if (!LMemI) |
| 4664 | continue; |
| 4665 | |
| 4666 | Value *PtrValue = LMemI->getPointerOperand(); |
| 4667 | if (L->isLoopInvariant(V: PtrValue)) |
| 4668 | continue; |
| 4669 | |
| 4670 | const SCEV *LSCEV = SE.getSCEV(V: PtrValue); |
| 4671 | const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(Val: LSCEV); |
| 4672 | if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) |
| 4673 | continue; |
| 4674 | |
| 4675 | // FIXME? We could take pairing of unrolled load copies into account |
| 4676 | // by looking at the AddRec, but we would probably have to limit this |
| 4677 | // to loops with no stores or other memory optimization barriers. |
| 4678 | ++StridedLoads; |
| 4679 | // We've seen enough strided loads that seeing more won't make a |
| 4680 | // difference. |
| 4681 | if (StridedLoads > MaxStridedLoads / 2) |
| 4682 | return StridedLoads; |
| 4683 | } |
| 4684 | } |
| 4685 | return StridedLoads; |
| 4686 | }; |
| 4687 | |
| 4688 | int StridedLoads = countStridedLoads(L, SE); |
| 4689 | LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads |
| 4690 | << " strided loads\n" ); |
| 4691 | // Pick the largest power of 2 unroll count that won't result in too many |
| 4692 | // strided loads. |
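|  | // E.g. three detected strided loads give MaxCount = 1 << Log2_32(7 / 3) == 2. |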
| 4693 | if (StridedLoads) { |
| 4694 | UP.MaxCount = 1 << Log2_32(Value: MaxStridedLoads / StridedLoads); |
| 4695 | LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " |
| 4696 | << UP.MaxCount << '\n'); |
| 4697 | } |
| 4698 | } |
| 4699 | |
| 4700 | // This function returns true if the loop: |
| 4701 | // 1. Has a valid cost, and |
| 4702 | // 2. Has a cost within the supplied budget. |
| 4703 | // Otherwise it returns false. |
| 4704 | static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, |
| 4705 | InstructionCost Budget, |
| 4706 | unsigned *FinalSize) { |
| 4707 | // Estimate the size of the loop. |
| 4708 | InstructionCost LoopCost = 0; |
| 4709 | |
| 4710 | for (auto *BB : L->getBlocks()) { |
| 4711 | for (auto &I : *BB) { |
| 4712 | SmallVector<const Value *, 4> Operands(I.operand_values()); |
| 4713 | InstructionCost Cost = |
| 4714 | TTI.getInstructionCost(U: &I, Operands, CostKind: TTI::TCK_CodeSize); |
| 4715 | // This can happen with intrinsics that don't currently have a cost model |
| 4716 | // or for some operations that require SVE. |
| 4717 | if (!Cost.isValid()) |
| 4718 | return false; |
| 4719 | |
| 4720 | LoopCost += Cost; |
| 4721 | if (LoopCost > Budget) |
| 4722 | return false; |
| 4723 | } |
| 4724 | } |
| 4725 | |
| 4726 | if (FinalSize) |
| 4727 | *FinalSize = LoopCost.getValue(); |
| 4728 | return true; |
| 4729 | } |
| 4730 | |
| 4731 | static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, |
| 4732 | const AArch64TTIImpl &TTI) { |
| 4733 | // Only consider loops with unknown trip counts for which we can determine |
| 4734 | // a symbolic expression. Multi-exit loops with small known trip counts will |
| 4735 | // likely be unrolled anyway. |
| 4736 | const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L); |
| 4737 | if (isa<SCEVConstant>(Val: BTC) || isa<SCEVCouldNotCompute>(Val: BTC)) |
| 4738 | return false; |
| 4739 | |
| 4740 | // It might not be worth unrolling loops with low max trip counts. Restrict |
| 4741 | // this to max trip counts > 32 for now. |
| 4742 | unsigned MaxTC = SE.getSmallConstantMaxTripCount(L); |
| 4743 | if (MaxTC > 0 && MaxTC <= 32) |
| 4744 | return false; |
| 4745 | |
| 4746 | // Make sure the loop size is <= 5. |
| 4747 | if (!isLoopSizeWithinBudget(L, TTI, Budget: 5, FinalSize: nullptr)) |
| 4748 | return false; |
| 4749 | |
| 4750 | // Small search loops with multiple exits can be highly beneficial to unroll. |
| 4751 | // We only care about loops with exactly two exiting blocks, although each |
| 4752 | // block could jump to the same exit block. |
| 4753 | ArrayRef<BasicBlock *> Blocks = L->getBlocks(); |
| 4754 | if (Blocks.size() != 2) |
| 4755 | return false; |
| 4756 | |
| 4757 | if (any_of(Range&: Blocks, P: [](BasicBlock *BB) { |
| 4758 | return !isa<BranchInst>(Val: BB->getTerminator()); |
| 4759 | })) |
| 4760 | return false; |
| 4761 | |
| 4762 | return true; |
| 4763 | } |
| 4764 | |
| 4765 | /// For Apple CPUs, we want to runtime-unroll loops to make better use of the |
| 4766 | /// OOO engine's wide instruction window and various predictors. |
| 4767 | static void |
| 4768 | getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, |
| 4769 | TargetTransformInfo::UnrollingPreferences &UP, |
| 4770 | const AArch64TTIImpl &TTI) { |
| 4771 | // Limit loops with structure that is highly likely to benefit from runtime |
| 4772 | // unrolling; that is, we exclude outer loops and loops with many blocks (i.e. |
| 4773 | // likely with complex control flow). Note that the heuristics here may be |
| 4774 | // overly conservative and we err on the side of avoiding runtime unrolling |
| 4775 | // rather than unrolling excessively. They are all subject to further refinement. |
| 4776 | if (!L->isInnermost() || L->getNumBlocks() > 8) |
| 4777 | return; |
| 4778 | |
| 4779 | // Loops with multiple exits are handled by common code. |
| 4780 | if (!L->getExitBlock()) |
| 4781 | return; |
| 4782 | |
| 4783 | const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L); |
| 4784 | if (isa<SCEVConstant>(Val: BTC) || isa<SCEVCouldNotCompute>(Val: BTC) || |
| 4785 | (SE.getSmallConstantMaxTripCount(L) > 0 && |
| 4786 | SE.getSmallConstantMaxTripCount(L) <= 32)) |
| 4787 | return; |
| 4788 | |
| 4789 | if (findStringMetadataForLoop(TheLoop: L, Name: "llvm.loop.isvectorized" )) |
| 4790 | return; |
| 4791 | |
| 4792 | if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L)) |
| 4793 | return; |
| 4794 | |
| 4795 | // Limit to loops with trip counts that are cheap to expand. |
| 4796 | UP.SCEVExpansionBudget = 1; |
| 4797 | |
| 4798 | // Try to unroll small, single block loops, if they have load/store |
| 4799 | // dependencies, to expose more parallel memory access streams. |
| 4800 | BasicBlock *Header = L->getHeader(); |
| 4801 | if (Header == L->getLoopLatch()) { |
| 4802 | // Estimate the size of the loop. |
| 4803 | unsigned Size; |
| 4804 | if (!isLoopSizeWithinBudget(L, TTI, Budget: 8, FinalSize: &Size)) |
| 4805 | return; |
| 4806 | |
| 4807 | SmallPtrSet<Value *, 8> LoadedValues; |
| 4808 | SmallVector<StoreInst *> Stores; |
| 4809 | for (auto *BB : L->blocks()) { |
| 4810 | for (auto &I : *BB) { |
| 4811 | Value *Ptr = getLoadStorePointerOperand(V: &I); |
| 4812 | if (!Ptr) |
| 4813 | continue; |
| 4814 | const SCEV *PtrSCEV = SE.getSCEV(V: Ptr); |
| 4815 | if (SE.isLoopInvariant(S: PtrSCEV, L)) |
| 4816 | continue; |
| 4817 | if (isa<LoadInst>(Val: &I)) |
| 4818 | LoadedValues.insert(Ptr: &I); |
| 4819 | else |
| 4820 | Stores.push_back(Elt: cast<StoreInst>(Val: &I)); |
| 4821 | } |
| 4822 | } |
| 4823 | |
| 4824 | // Try to find an unroll count that maximizes the use of the instruction |
| 4825 | // window, i.e. trying to fetch as many instructions per cycle as possible. |
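|  | // E.g. a loop body of Size == 8 settles on BestUC == 6, the largest count |
|  | // whose total size (48) stays within budget and is a multiple of |
|  | // MaxInstsPerLine. |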
| 4826 | unsigned MaxInstsPerLine = 16; |
| 4827 | unsigned UC = 1; |
| 4828 | unsigned BestUC = 1; |
| 4829 | unsigned SizeWithBestUC = BestUC * Size; |
| 4830 | while (UC <= 8) { |
| 4831 | unsigned SizeWithUC = UC * Size; |
| 4832 | if (SizeWithUC > 48) |
| 4833 | break; |
| 4834 | if ((SizeWithUC % MaxInstsPerLine) == 0 || |
| 4835 | (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) { |
| 4836 | BestUC = UC; |
| 4837 | SizeWithBestUC = BestUC * Size; |
| 4838 | } |
| 4839 | UC++; |
| 4840 | } |
| 4841 | |
| 4842 | if (BestUC == 1 || none_of(Range&: Stores, P: [&LoadedValues](StoreInst *SI) { |
| 4843 | return LoadedValues.contains(Ptr: SI->getOperand(i_nocapture: 0)); |
| 4844 | })) |
| 4845 | return; |
| 4846 | |
| 4847 | UP.Runtime = true; |
| 4848 | UP.DefaultUnrollRuntimeCount = BestUC; |
| 4849 | return; |
| 4850 | } |
| 4851 | |
| 4852 | // Try to runtime-unroll loops with early-continues depending on loop-varying |
| 4853 | // loads; this helps with branch-prediction for the early-continues. |
| 4854 | auto *Term = dyn_cast<BranchInst>(Val: Header->getTerminator()); |
| 4855 | auto *Latch = L->getLoopLatch(); |
| 4856 | SmallVector<BasicBlock *> Preds(predecessors(BB: Latch)); |
| 4857 | if (!Term || !Term->isConditional() || Preds.size() == 1 || |
| 4858 | !llvm::is_contained(Range&: Preds, Element: Header) || |
| 4859 | none_of(Range&: Preds, P: [L](BasicBlock *Pred) { return L->contains(BB: Pred); })) |
| 4860 | return; |
| 4861 | |
| 4862 | std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad = |
| 4863 | [&](Instruction *I, unsigned Depth) -> bool { |
| 4864 | if (isa<PHINode>(Val: I) || L->isLoopInvariant(V: I) || Depth > 8) |
| 4865 | return false; |
| 4866 | |
| 4867 | if (isa<LoadInst>(Val: I)) |
| 4868 | return true; |
| 4869 | |
| 4870 | return any_of(Range: I->operands(), P: [&](Value *V) { |
| 4871 | auto *I = dyn_cast<Instruction>(Val: V); |
| 4872 | return I && DependsOnLoopLoad(I, Depth + 1); |
| 4873 | }); |
| 4874 | }; |
| 4875 | CmpPredicate Pred; |
| 4876 | Instruction *I; |
| 4877 | if (match(V: Term, P: m_Br(C: m_ICmp(Pred, L: m_Instruction(I), R: m_Value()), T: m_Value(), |
| 4878 | F: m_Value())) && |
| 4879 | DependsOnLoopLoad(I, 0)) { |
| 4880 | UP.Runtime = true; |
| 4881 | } |
| 4882 | } |
| 4883 | |
| 4884 | void AArch64TTIImpl::getUnrollingPreferences( |
| 4885 | Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, |
| 4886 | OptimizationRemarkEmitter *ORE) const { |
| 4887 | // Enable partial unrolling and runtime unrolling. |
| 4888 | BaseT::getUnrollingPreferences(L, SE, UP, ORE); |
| 4889 | |
| 4890 | UP.UpperBound = true; |
| 4891 | |
| 4892 | // An inner loop is more likely to be hot, and its runtime checks can be |
| 4893 | // hoisted out by the LICM pass, so the overhead is lower; try a larger |
| 4894 | // threshold to unroll more loops. |
| 4895 | if (L->getLoopDepth() > 1) |
| 4896 | UP.PartialThreshold *= 2; |
| 4897 | |
| 4898 | // Disable partial & runtime unrolling on -Os. |
| 4899 | UP.PartialOptSizeThreshold = 0; |
| 4900 | |
| 4901 | // Scan the loop: don't unroll loops with calls as this could prevent |
| 4902 | // inlining. Don't unroll vector loops either, as they don't benefit much from |
| 4903 | // unrolling. |
| 4904 | for (auto *BB : L->getBlocks()) { |
| 4905 | for (auto &I : *BB) { |
| 4906 | // Don't unroll vectorized loops. |
| 4907 | if (I.getType()->isVectorTy()) |
| 4908 | return; |
| 4909 | |
| 4910 | if (isa<CallBase>(Val: I)) { |
| 4911 | if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I)) |
| 4912 | if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction()) |
| 4913 | if (!isLoweredToCall(F)) |
| 4914 | continue; |
| 4915 | return; |
| 4916 | } |
| 4917 | } |
| 4918 | } |
| 4919 | |
| 4920 | // Apply subtarget-specific unrolling preferences. |
| 4921 | switch (ST->getProcFamily()) { |
| 4922 | case AArch64Subtarget::AppleA14: |
| 4923 | case AArch64Subtarget::AppleA15: |
| 4924 | case AArch64Subtarget::AppleA16: |
| 4925 | case AArch64Subtarget::AppleM4: |
| 4926 | getAppleRuntimeUnrollPreferences(L, SE, UP, TTI: *this); |
| 4927 | break; |
| 4928 | case AArch64Subtarget::Falkor: |
| 4929 | if (EnableFalkorHWPFUnrollFix) |
| 4930 | getFalkorUnrollingPreferences(L, SE, UP); |
| 4931 | break; |
| 4932 | default: |
| 4933 | break; |
| 4934 | } |
| 4935 | |
| 4936 | // If this is a small, multi-exit loop similar to something like std::find, |
| 4937 | // then there is typically a performance improvement achieved by unrolling. |
| 4938 | if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, TTI: *this)) { |
| 4939 | UP.RuntimeUnrollMultiExit = true; |
| 4940 | UP.Runtime = true; |
| 4941 | // Limit unroll count. |
| 4942 | UP.DefaultUnrollRuntimeCount = 4; |
| 4943 | // Allow slightly more costly trip-count expansion to catch search loops |
| 4944 | // with pointer inductions. |
| 4945 | UP.SCEVExpansionBudget = 5; |
| 4946 | return; |
| 4947 | } |
| 4948 | |
| 4949 | // Enable runtime unrolling for in-order models. |
| 4950 | // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Generic, so |
| 4951 | // by checking for that case, we can ensure that the default behaviour is |
| 4952 | // unchanged. |
| 4953 | if (ST->getProcFamily() != AArch64Subtarget::Generic && |
| 4954 | !ST->getSchedModel().isOutOfOrder()) { |
| 4955 | UP.Runtime = true; |
| 4956 | UP.Partial = true; |
| 4957 | UP.UnrollRemainder = true; |
| 4958 | UP.DefaultUnrollRuntimeCount = 4; |
| 4959 | |
| 4960 | UP.UnrollAndJam = true; |
| 4961 | UP.UnrollAndJamInnerLoopThreshold = 60; |
| 4962 | } |
| 4963 | } |
| 4964 | |
| 4965 | void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
| 4966 | TTI::PeelingPreferences &PP) const { |
| 4967 | BaseT::getPeelingPreferences(L, SE, PP); |
| 4968 | } |
| 4969 | |
| 4970 | Value * |
| 4971 | AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, |
| 4972 | Type *ExpectedType) const { |
| 4973 | switch (Inst->getIntrinsicID()) { |
| 4974 | default: |
| 4975 | return nullptr; |
| 4976 | case Intrinsic::aarch64_neon_st2: |
| 4977 | case Intrinsic::aarch64_neon_st3: |
| 4978 | case Intrinsic::aarch64_neon_st4: { |
| 4979 | // Create a struct type |
| 4980 | StructType *ST = dyn_cast<StructType>(Val: ExpectedType); |
| 4981 | if (!ST) |
| 4982 | return nullptr; |
| 4983 | unsigned NumElts = Inst->arg_size() - 1; |
| 4984 | if (ST->getNumElements() != NumElts) |
| 4985 | return nullptr; |
| 4986 | for (unsigned i = 0, e = NumElts; i != e; ++i) { |
| 4987 | if (Inst->getArgOperand(i)->getType() != ST->getElementType(N: i)) |
| 4988 | return nullptr; |
| 4989 | } |
| 4990 | Value *Res = PoisonValue::get(T: ExpectedType); |
| 4991 | IRBuilder<> Builder(Inst); |
| 4992 | for (unsigned i = 0, e = NumElts; i != e; ++i) { |
| 4993 | Value *L = Inst->getArgOperand(i); |
| 4994 | Res = Builder.CreateInsertValue(Agg: Res, Val: L, Idxs: i); |
| 4995 | } |
| 4996 | return Res; |
| 4997 | } |
| 4998 | case Intrinsic::aarch64_neon_ld2: |
| 4999 | case Intrinsic::aarch64_neon_ld3: |
| 5000 | case Intrinsic::aarch64_neon_ld4: |
| 5001 | if (Inst->getType() == ExpectedType) |
| 5002 | return Inst; |
| 5003 | return nullptr; |
| 5004 | } |
| 5005 | } |
| 5006 | |
| 5007 | bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, |
| 5008 | MemIntrinsicInfo &Info) const { |
| 5009 | switch (Inst->getIntrinsicID()) { |
| 5010 | default: |
| 5011 | break; |
| 5012 | case Intrinsic::aarch64_neon_ld2: |
| 5013 | case Intrinsic::aarch64_neon_ld3: |
| 5014 | case Intrinsic::aarch64_neon_ld4: |
| 5015 | Info.ReadMem = true; |
| 5016 | Info.WriteMem = false; |
| 5017 | Info.PtrVal = Inst->getArgOperand(i: 0); |
| 5018 | break; |
| 5019 | case Intrinsic::aarch64_neon_st2: |
| 5020 | case Intrinsic::aarch64_neon_st3: |
| 5021 | case Intrinsic::aarch64_neon_st4: |
| 5022 | Info.ReadMem = false; |
| 5023 | Info.WriteMem = true; |
| 5024 | Info.PtrVal = Inst->getArgOperand(i: Inst->arg_size() - 1); |
| 5025 | break; |
| 5026 | } |
| 5027 | |
| 5028 | switch (Inst->getIntrinsicID()) { |
| 5029 | default: |
| 5030 | return false; |
| 5031 | case Intrinsic::aarch64_neon_ld2: |
| 5032 | case Intrinsic::aarch64_neon_st2: |
| 5033 | Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; |
| 5034 | break; |
| 5035 | case Intrinsic::aarch64_neon_ld3: |
| 5036 | case Intrinsic::aarch64_neon_st3: |
| 5037 | Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; |
| 5038 | break; |
| 5039 | case Intrinsic::aarch64_neon_ld4: |
| 5040 | case Intrinsic::aarch64_neon_st4: |
| 5041 | Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; |
| 5042 | break; |
| 5043 | } |
| 5044 | return true; |
| 5045 | } |
| 5046 | |
| 5047 | /// See if \p I should be considered for address type promotion. We check if \p |
| 5048 | /// I is a sext with the right type and used in memory accesses. If it is used in a |
| 5049 | /// "complex" getelementptr, we allow it to be promoted without finding other |
| 5050 | /// sext instructions that sign extended the same initial value. A getelementptr |
| 5051 | /// is considered as "complex" if it has more than 2 operands. |
| 5052 | bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( |
| 5053 | const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const { |
| 5054 | bool Considerable = false; |
| 5055 | AllowPromotionWithoutCommonHeader = false; |
| 5056 | if (!isa<SExtInst>(Val: &I)) |
| 5057 | return false; |
| 5058 | Type *ConsideredSExtType = |
| 5059 | Type::getInt64Ty(C&: I.getParent()->getParent()->getContext()); |
| 5060 | if (I.getType() != ConsideredSExtType) |
| 5061 | return false; |
| 5062 | // See if the sext is the one with the right type and used in at least one |
| 5063 | // GetElementPtrInst. |
| 5064 | for (const User *U : I.users()) { |
| 5065 | if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(Val: U)) { |
| 5066 | Considerable = true; |
| 5067 | // A getelementptr is considered as "complex" if it has more than 2 |
| 5068 | // operands. We will promote a SExt used in such complex GEP as we |
| 5069 | // expect some computation to be merged if they are done on 64 bits. |
| 5070 | if (GEPInst->getNumOperands() > 2) { |
| 5071 | AllowPromotionWithoutCommonHeader = true; |
| 5072 | break; |
| 5073 | } |
| 5074 | } |
| 5075 | } |
| 5076 | return Considerable; |
| 5077 | } |
| 5078 | |
| 5079 | bool AArch64TTIImpl::isLegalToVectorizeReduction( |
| 5080 | const RecurrenceDescriptor &RdxDesc, ElementCount VF) const { |
| 5081 | if (!VF.isScalable()) |
| 5082 | return true; |
| 5083 | |
| 5084 | Type *Ty = RdxDesc.getRecurrenceType(); |
| 5085 | if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty)) |
| 5086 | return false; |
| 5087 | |
| 5088 | switch (RdxDesc.getRecurrenceKind()) { |
| 5089 | case RecurKind::Add: |
| 5090 | case RecurKind::FAdd: |
| 5091 | case RecurKind::And: |
| 5092 | case RecurKind::Or: |
| 5093 | case RecurKind::Xor: |
| 5094 | case RecurKind::SMin: |
| 5095 | case RecurKind::SMax: |
| 5096 | case RecurKind::UMin: |
| 5097 | case RecurKind::UMax: |
| 5098 | case RecurKind::FMin: |
| 5099 | case RecurKind::FMax: |
| 5100 | case RecurKind::FMulAdd: |
| 5101 | case RecurKind::AnyOf: |
| 5102 | return true; |
| 5103 | default: |
| 5104 | return false; |
| 5105 | } |
| 5106 | } |
| 5107 | |
| 5108 | InstructionCost |
| 5109 | AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, |
| 5110 | FastMathFlags FMF, |
| 5111 | TTI::TargetCostKind CostKind) const { |
| 5112 | // The code-generator is currently not able to handle scalable vectors |
| 5113 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
| 5114 | // it. This change will be removed when code-generation for these types is |
| 5115 | // sufficiently reliable. |
| 5116 | if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty)) |
| 5117 | if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
| 5118 | return InstructionCost::getInvalid(); |
| 5119 | |
| 5120 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
| 5121 | |
| 5122 | if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) |
| 5123 | return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); |
| 5124 | |
| 5125 | InstructionCost LegalizationCost = 0; |
| 5126 | if (LT.first > 1) { |
| 5127 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: Ty->getContext()); |
| 5128 | IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF); |
| 5129 | LegalizationCost = getIntrinsicInstrCost(ICA: Attrs, CostKind) * (LT.first - 1); |
| 5130 | } |
| 5131 | |
| 5132 | return LegalizationCost + /*Cost of horizontal reduction*/ 2; |
| 5133 | } |
| 5134 | |
| 5135 | InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( |
| 5136 | unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const { |
| 5137 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
| 5138 | InstructionCost LegalizationCost = 0; |
| 5139 | if (LT.first > 1) { |
| 5140 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: ValTy->getContext()); |
| 5141 | LegalizationCost = getArithmeticInstrCost(Opcode, Ty: LegalVTy, CostKind); |
| 5142 | LegalizationCost *= LT.first - 1; |
| 5143 | } |
| 5144 | |
| 5145 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
| 5146 | assert(ISD && "Invalid opcode" ); |
| 5147 | // Add the final reduction cost for the legal horizontal reduction |
| 5148 | switch (ISD) { |
| 5149 | case ISD::ADD: |
| 5150 | case ISD::AND: |
| 5151 | case ISD::OR: |
| 5152 | case ISD::XOR: |
| 5153 | case ISD::FADD: |
| 5154 | return LegalizationCost + 2; |
| 5155 | default: |
| 5156 | return InstructionCost::getInvalid(); |
| 5157 | } |
| 5158 | } |
| 5159 | |
| 5160 | InstructionCost |
| 5161 | AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, |
| 5162 | std::optional<FastMathFlags> FMF, |
| 5163 | TTI::TargetCostKind CostKind) const { |
| 5164 | // The code-generator is currently not able to handle scalable vectors |
| 5165 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
| 5166 | // it. This change will be removed when code-generation for these types is |
| 5167 | // sufficiently reliable. |
| 5168 | if (auto *VTy = dyn_cast<ScalableVectorType>(Val: ValTy)) |
| 5169 | if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
| 5170 | return InstructionCost::getInvalid(); |
| 5171 | |
| 5172 | if (TTI::requiresOrderedReduction(FMF)) { |
| 5173 | if (auto *FixedVTy = dyn_cast<FixedVectorType>(Val: ValTy)) { |
| 5174 | InstructionCost BaseCost = |
| 5175 | BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind); |
| 5176 | // Add on extra cost to reflect the extra overhead on some CPUs. We still |
| 5177 | // end up vectorizing for more computationally intensive loops. |
| 5178 | return BaseCost + FixedVTy->getNumElements(); |
| 5179 | } |
| 5180 | |
| 5181 | if (Opcode != Instruction::FAdd) |
| 5182 | return InstructionCost::getInvalid(); |
| 5183 | |
| 5184 | auto *VTy = cast<ScalableVectorType>(Val: ValTy); |
| 5185 | InstructionCost Cost = |
| 5186 | getArithmeticInstrCost(Opcode, Ty: VTy->getScalarType(), CostKind); |
| 5187 | Cost *= getMaxNumElements(VF: VTy->getElementCount()); |
| 5188 | return Cost; |
| 5189 | } |
| 5190 | |
| 5191 | if (isa<ScalableVectorType>(Val: ValTy)) |
| 5192 | return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); |
| 5193 | |
| 5194 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
| 5195 | MVT MTy = LT.second; |
| 5196 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
| 5197 | assert(ISD && "Invalid opcode" ); |
| 5198 | |
| 5199 | // Horizontal adds can use the 'addv' instruction. We model the cost of these |
| 5200 | // instructions as twice a normal vector add, plus 1 for each legalization |
| 5201 | // step (LT.first). This is the only arithmetic vector reduction operation for |
| 5202 | // which we have an instruction. |
| 5203 | // OR, XOR and AND costs should match the codegen from: |
| 5204 | // OR: llvm/test/CodeGen/AArch64/reduce-or.ll |
| 5205 | // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll |
| 5206 | // AND: llvm/test/CodeGen/AArch64/reduce-and.ll |
| 5207 | static const CostTblEntry CostTblNoPairwise[]{ |
| 5208 | {.ISD: ISD::ADD, .Type: MVT::v8i8, .Cost: 2}, |
| 5209 | {.ISD: ISD::ADD, .Type: MVT::v16i8, .Cost: 2}, |
| 5210 | {.ISD: ISD::ADD, .Type: MVT::v4i16, .Cost: 2}, |
| 5211 | {.ISD: ISD::ADD, .Type: MVT::v8i16, .Cost: 2}, |
| 5212 | {.ISD: ISD::ADD, .Type: MVT::v4i32, .Cost: 2}, |
| 5213 | {.ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: 2}, |
| 5214 | {.ISD: ISD::OR, .Type: MVT::v8i8, .Cost: 15}, |
| 5215 | {.ISD: ISD::OR, .Type: MVT::v16i8, .Cost: 17}, |
| 5216 | {.ISD: ISD::OR, .Type: MVT::v4i16, .Cost: 7}, |
| 5217 | {.ISD: ISD::OR, .Type: MVT::v8i16, .Cost: 9}, |
| 5218 | {.ISD: ISD::OR, .Type: MVT::v2i32, .Cost: 3}, |
| 5219 | {.ISD: ISD::OR, .Type: MVT::v4i32, .Cost: 5}, |
| 5220 | {.ISD: ISD::OR, .Type: MVT::v2i64, .Cost: 3}, |
| 5221 | {.ISD: ISD::XOR, .Type: MVT::v8i8, .Cost: 15}, |
| 5222 | {.ISD: ISD::XOR, .Type: MVT::v16i8, .Cost: 17}, |
| 5223 | {.ISD: ISD::XOR, .Type: MVT::v4i16, .Cost: 7}, |
| 5224 | {.ISD: ISD::XOR, .Type: MVT::v8i16, .Cost: 9}, |
| 5225 | {.ISD: ISD::XOR, .Type: MVT::v2i32, .Cost: 3}, |
| 5226 | {.ISD: ISD::XOR, .Type: MVT::v4i32, .Cost: 5}, |
| 5227 | {.ISD: ISD::XOR, .Type: MVT::v2i64, .Cost: 3}, |
| 5228 | {.ISD: ISD::AND, .Type: MVT::v8i8, .Cost: 15}, |
| 5229 | {.ISD: ISD::AND, .Type: MVT::v16i8, .Cost: 17}, |
| 5230 | {.ISD: ISD::AND, .Type: MVT::v4i16, .Cost: 7}, |
| 5231 | {.ISD: ISD::AND, .Type: MVT::v8i16, .Cost: 9}, |
| 5232 | {.ISD: ISD::AND, .Type: MVT::v2i32, .Cost: 3}, |
| 5233 | {.ISD: ISD::AND, .Type: MVT::v4i32, .Cost: 5}, |
| 5234 | {.ISD: ISD::AND, .Type: MVT::v2i64, .Cost: 3}, |
| 5235 | }; |
| 5236 | switch (ISD) { |
| 5237 | default: |
| 5238 | break; |
| 5239 | case ISD::FADD: |
| 5240 | if (Type *EltTy = ValTy->getScalarType(); |
| 5241 | // FIXME: For half types without fullfp16 support, this could extend and |
| 5242 | // use a fp32 faddp reduction but current codegen unrolls. |
| 5243 | MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() || |
| 5244 | (EltTy->isHalfTy() && ST->hasFullFP16()))) { |
| 5245 | const unsigned NElts = MTy.getVectorNumElements(); |
| 5246 | if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 && |
| 5247 | isPowerOf2_32(Value: NElts)) |
| 5248 | // Reduction corresponding to series of fadd instructions is lowered to |
| 5249 | // series of faddp instructions. faddp has latency/throughput that |
| 5250 | // matches fadd instruction and hence, every faddp instruction can be |
| 5251 | // considered to have a relative cost = 1 with |
| 5252 | // CostKind = TCK_RecipThroughput. |
| 5253 | // An faddp will pairwise add vector elements, so the size of input |
| 5254 | // vector reduces by half every time, requiring |
| 5255 | // #(faddp instructions) = log2_32(NElts). |
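|  | // E.g. a v4f32 fadd reduction legalizes in one step and needs |
|  | // Log2_32(4) == 2 faddp instructions, giving a cost of 2. |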
| 5256 | return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(Value: NElts); |
| 5257 | } |
| 5258 | break; |
| 5259 | case ISD::ADD: |
| 5260 | if (const auto *Entry = CostTableLookup(Table: CostTblNoPairwise, ISD, Ty: MTy)) |
| 5261 | return (LT.first - 1) + Entry->Cost; |
| 5262 | break; |
| 5263 | case ISD::XOR: |
| 5264 | case ISD::AND: |
| 5265 | case ISD::OR: |
| 5266 | const auto *Entry = CostTableLookup(Table: CostTblNoPairwise, ISD, Ty: MTy); |
| 5267 | if (!Entry) |
| 5268 | break; |
| 5269 | auto *ValVTy = cast<FixedVectorType>(Val: ValTy); |
| 5270 | if (MTy.getVectorNumElements() <= ValVTy->getNumElements() && |
| 5271 | isPowerOf2_32(Value: ValVTy->getNumElements())) { |
| 5272 | InstructionCost ExtraCost = 0; |
| 5273 | if (LT.first != 1) { |
| 5274 | // Type needs to be split, so there is an extra cost of LT.first - 1 |
| 5275 | // arithmetic ops. |
| 5276 | auto *Ty = FixedVectorType::get(ElementType: ValTy->getElementType(), |
| 5277 | NumElts: MTy.getVectorNumElements()); |
| 5278 | ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind); |
| 5279 | ExtraCost *= LT.first - 1; |
| 5280 | } |
| 5281 | // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov |
| 5282 | auto Cost = ValVTy->getElementType()->isIntegerTy(Bitwidth: 1) ? 2 : Entry->Cost; |
| 5283 | return Cost + ExtraCost; |
| 5284 | } |
| 5285 | break; |
| 5286 | } |
| 5287 | return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind); |
| 5288 | } |
| 5289 | |
| 5290 | InstructionCost AArch64TTIImpl::getExtendedReductionCost( |
| 5291 | unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy, |
| 5292 | std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const { |
| 5293 | EVT VecVT = TLI->getValueType(DL, Ty: VecTy); |
| 5294 | EVT ResVT = TLI->getValueType(DL, Ty: ResTy); |
| 5295 | |
| 5296 | if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() && |
| 5297 | VecVT.getSizeInBits() >= 64) { |
| 5298 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VecTy); |
| 5299 | |
| 5300 | // The legal cases are: |
| 5301 | // UADDLV 8/16/32->32 |
| 5302 | // UADDLP 32->64 |
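|  | // E.g. an extending add reduction from <16 x i8> to i32 maps to a single |
|  | // UADDLV, giving (LT.first - 1) * 2 + 2 == 2. |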
| 5303 | unsigned RevVTSize = ResVT.getSizeInBits(); |
| 5304 | if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) && |
| 5305 | RevVTSize <= 32) || |
| 5306 | ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) && |
| 5307 | RevVTSize <= 32) || |
| 5308 | ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) && |
| 5309 | RevVTSize <= 64)) |
| 5310 | return (LT.first - 1) * 2 + 2; |
| 5311 | } |
| 5312 | |
| 5313 | return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: VecTy, FMF, |
| 5314 | CostKind); |
| 5315 | } |
| 5316 | |
| 5317 | InstructionCost |
| 5318 | AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy, |
| 5319 | VectorType *VecTy, |
| 5320 | TTI::TargetCostKind CostKind) const { |
| 5321 | EVT VecVT = TLI->getValueType(DL, Ty: VecTy); |
| 5322 | EVT ResVT = TLI->getValueType(DL, Ty: ResTy); |
| 5323 | |
| 5324 | if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple()) { |
| 5325 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VecTy); |
| 5326 | |
| 5327 | // The legal cases with dotprod are |
| 5328 | // UDOT 8->32 |
| 5329 | // Which requires an additional uaddv to sum the i32 values. |
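|  | // E.g. a <16 x i8> input accumulating into i32 returns LT.first + 2 == 3. |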
| 5330 | if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) && |
| 5331 | ResVT == MVT::i32) |
| 5332 | return LT.first + 2; |
| 5333 | } |
| 5334 | |
| 5335 | return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, Ty: VecTy, CostKind); |
| 5336 | } |
| 5337 | |
| 5338 | InstructionCost |
| 5339 | AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index, |
| 5340 | TTI::TargetCostKind CostKind) const { |
| 5341 | static const CostTblEntry ShuffleTbl[] = { |
| 5342 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv16i8, .Cost: 1 }, |
| 5343 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv8i16, .Cost: 1 }, |
| 5344 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv4i32, .Cost: 1 }, |
| 5345 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv2i64, .Cost: 1 }, |
| 5346 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv2f16, .Cost: 1 }, |
| 5347 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv4f16, .Cost: 1 }, |
| 5348 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv8f16, .Cost: 1 }, |
| 5349 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv2bf16, .Cost: 1 }, |
| 5350 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv4bf16, .Cost: 1 }, |
| 5351 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv8bf16, .Cost: 1 }, |
| 5352 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv2f32, .Cost: 1 }, |
| 5353 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv4f32, .Cost: 1 }, |
| 5354 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv2f64, .Cost: 1 }, |
| 5355 | }; |
| 5356 | |
| 5357 | // The code-generator is currently not able to handle scalable vectors |
| 5358 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
| 5359 | // it. This change will be removed when code-generation for these types is |
| 5360 | // sufficiently reliable. |
| 5361 | if (Tp->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
| 5362 | return InstructionCost::getInvalid(); |
| 5363 | |
| 5364 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp); |
| 5365 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: Tp->getContext()); |
| 5366 | EVT PromotedVT = LT.second.getScalarType() == MVT::i1 |
| 5367 | ? TLI->getPromotedVTForPredicate(VT: EVT(LT.second)) |
| 5368 | : LT.second; |
| 5369 | Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Context&: Tp->getContext()); |
| 5370 | InstructionCost LegalizationCost = 0; |
| 5371 | if (Index < 0) { |
| 5372 | LegalizationCost = |
| 5373 | getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: PromotedVTy, CondTy: PromotedVTy, |
| 5374 | VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind) + |
| 5375 | getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: PromotedVTy, CondTy: LegalVTy, |
| 5376 | VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind); |
| 5377 | } |
| 5378 | |
| 5379 | // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp |
| 5380 | // Cost performed on a promoted type. |
| 5381 | if (LT.second.getScalarType() == MVT::i1) { |
| 5382 | LegalizationCost += |
| 5383 | getCastInstrCost(Opcode: Instruction::ZExt, Dst: PromotedVTy, Src: LegalVTy, |
| 5384 | CCH: TTI::CastContextHint::None, CostKind) + |
| 5385 | getCastInstrCost(Opcode: Instruction::Trunc, Dst: LegalVTy, Src: PromotedVTy, |
| 5386 | CCH: TTI::CastContextHint::None, CostKind); |
| 5387 | } |
| 5388 | const auto *Entry = |
| 5389 | CostTableLookup(Table: ShuffleTbl, ISD: TTI::SK_Splice, Ty: PromotedVT.getSimpleVT()); |
| 5390 | assert(Entry && "Illegal Type for Splice" ); |
| 5391 | LegalizationCost += Entry->Cost; |
| 5392 | return LegalizationCost * LT.first; |
| 5393 | } |
| 5394 | |
| 5395 | InstructionCost AArch64TTIImpl::getPartialReductionCost( |
| 5396 | unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, |
| 5397 | ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, |
| 5398 | TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp, |
| 5399 | TTI::TargetCostKind CostKind) const { |
| 5400 | InstructionCost Invalid = InstructionCost::getInvalid(); |
| 5401 | InstructionCost Cost(TTI::TCC_Basic); |
| 5402 | |
| 5403 | if (CostKind != TTI::TCK_RecipThroughput) |
| 5404 | return Invalid; |
| 5405 | |
| 5406 | // Sub opcodes currently only occur in chained cases. |
| 5407 | // Independent partial reduction subtractions are still costed as an add |
| 5408 | if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) || |
| 5409 | OpAExtend == TTI::PR_None) |
| 5410 | return Invalid; |
| 5411 | |
| 5412 | // We only support multiply binary operations for now, and for muls we |
| 5413 | // require the types being extended to be the same. |
| 5414 | // NOTE: For muls AArch64 supports lowering mixed extensions to a usdot but |
| 5415 | // only if the i8mm or sve/streaming features are available. |
| 5416 | if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB || |
| 5417 | OpBExtend == TTI::PR_None || |
| 5418 | (OpAExtend != OpBExtend && !ST->hasMatMulInt8() && |
| 5419 | !ST->isSVEorStreamingSVEAvailable()))) |
| 5420 | return Invalid; |
| 5421 | assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) && |
| 5422 | "Unexpected values for OpBExtend or InputTypeB" ); |
| 5423 | |
| 5424 | EVT InputEVT = EVT::getEVT(Ty: InputTypeA); |
| 5425 | EVT AccumEVT = EVT::getEVT(Ty: AccumType); |
| 5426 | |
| 5427 | unsigned VFMinValue = VF.getKnownMinValue(); |
| 5428 | |
| 5429 | if (VF.isScalable()) { |
| 5430 | if (!ST->isSVEorStreamingSVEAvailable()) |
| 5431 | return Invalid; |
| 5432 | |
| 5433 | // Don't accept a partial reduction if the scaled accumulator is vscale x 1, |
| 5434 | // since we can't lower that type. |
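|  | // E.g. an i8 -> i32 partial reduction at VF == vscale x 4 would need a |
|  | // <vscale x 1 x i32> accumulator, which we cannot lower. |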
| 5435 | unsigned Scale = |
| 5436 | AccumEVT.getScalarSizeInBits() / InputEVT.getScalarSizeInBits(); |
| 5437 | if (VFMinValue == Scale) |
| 5438 | return Invalid; |
| 5439 | } |
| 5440 | if (VF.isFixed() && |
| 5441 | (!ST->isNeonAvailable() || !ST->hasDotProd() || AccumEVT == MVT::i64)) |
| 5442 | return Invalid; |
| 5443 | |
| 5444 | if (InputEVT == MVT::i8) { |
| 5445 | switch (VFMinValue) { |
| 5446 | default: |
| 5447 | return Invalid; |
| 5448 | case 8: |
| 5449 | if (AccumEVT == MVT::i32) |
| 5450 | Cost *= 2; |
| 5451 | else if (AccumEVT != MVT::i64) |
| 5452 | return Invalid; |
| 5453 | break; |
| 5454 | case 16: |
| 5455 | if (AccumEVT == MVT::i64) |
| 5456 | Cost *= 2; |
| 5457 | else if (AccumEVT != MVT::i32) |
| 5458 | return Invalid; |
| 5459 | break; |
| 5460 | } |
| 5461 | } else if (InputEVT == MVT::i16) { |
| 5462 | // FIXME: Allow i32 accumulator but increase cost, as we would extend |
| 5463 | // it to i64. |
| 5464 | if (VFMinValue != 8 || AccumEVT != MVT::i64) |
| 5465 | return Invalid; |
| 5466 | } else |
| 5467 | return Invalid; |
| 5468 | |
| 5469 | return Cost; |
| 5470 | } |
| 5471 | |
| 5472 | InstructionCost |
| 5473 | AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, |
| 5474 | VectorType *SrcTy, ArrayRef<int> Mask, |
| 5475 | TTI::TargetCostKind CostKind, int Index, |
| 5476 | VectorType *SubTp, ArrayRef<const Value *> Args, |
| 5477 | const Instruction *CxtI) const { |
| 5478 | assert((Mask.empty() || DstTy->isScalableTy() || |
| 5479 | Mask.size() == DstTy->getElementCount().getKnownMinValue()) && |
| 5480 | "Expected the Mask to match the return size if given" ); |
| 5481 | assert(SrcTy->getScalarType() == DstTy->getScalarType() && |
| 5482 | "Expected the same scalar types" ); |
| 5483 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy); |
| 5484 | |
| 5485 | // If we have a Mask, and the LT is being legalized somehow, split the Mask |
| 5486 | // into smaller vectors and sum the cost of each shuffle. |
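|  | // E.g. a 16-element mask on a type that legalizes to v4i32 is costed as four |
|  | // 4-element sub-shuffles, with repeated sub-masks costed only once. |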
| 5487 | if (!Mask.empty() && isa<FixedVectorType>(Val: SrcTy) && LT.second.isVector() && |
| 5488 | LT.second.getScalarSizeInBits() * Mask.size() > 128 && |
| 5489 | SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() && |
| 5490 | Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) { |
| 5491 | // Check for LD3/LD4 instructions, which are represented in llvm IR as |
| 5492 | // deinterleaving-shuffle(load). The shuffle cost could potentially be free, |
| 5493 | // but we model it with a cost of LT.first so that LD3/LD4 have a higher |
| 5494 | // cost than just the load. |
| 5495 | if (Args.size() >= 1 && isa<LoadInst>(Val: Args[0]) && |
| 5496 | (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 3) || |
| 5497 | ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 4))) |
| 5498 | return std::max<InstructionCost>(a: 1, b: LT.first / 4); |
| 5499 | |
| 5500 | // Check for ST3/ST4 instructions, which are represented in llvm IR as |
| 5501 | // store(interleaving-shuffle). The shuffle cost could potentially be free, |
| 5502 | // but we model it with a cost of LT.first so that ST3/ST4 have a higher |
| 5503 | // cost than just the store. |
| 5504 | if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(Val: *CxtI->user_begin()) && |
| 5505 | (ShuffleVectorInst::isInterleaveMask( |
| 5506 | Mask, Factor: 4, NumInputElts: SrcTy->getElementCount().getKnownMinValue() * 2) || |
| 5507 | ShuffleVectorInst::isInterleaveMask( |
| 5508 | Mask, Factor: 3, NumInputElts: SrcTy->getElementCount().getKnownMinValue() * 2))) |
| 5509 | return LT.first; |
| 5510 | |
| 5511 | unsigned TpNumElts = Mask.size(); |
| 5512 | unsigned LTNumElts = LT.second.getVectorNumElements(); |
| 5513 | unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts; |
| 5514 | VectorType *NTp = VectorType::get(ElementType: SrcTy->getScalarType(), |
| 5515 | EC: LT.second.getVectorElementCount()); |
| 5516 | InstructionCost Cost; |
| 5517 | std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost> |
| 5518 | PreviousCosts; |
| 5519 | for (unsigned N = 0; N < NumVecs; N++) { |
| 5520 | SmallVector<int> NMask; |
| 5521 | // Split the existing mask into chunks of size LTNumElts. Track the source |
| 5522 | // sub-vectors to ensure the result has at most 2 inputs. |
| 5523 | unsigned Source1 = -1U, Source2 = -1U; |
| 5524 | unsigned NumSources = 0; |
| 5525 | for (unsigned E = 0; E < LTNumElts; E++) { |
| 5526 | int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E] |
| 5527 | : PoisonMaskElem; |
| 5528 | if (MaskElt < 0) { |
| 5529 | NMask.push_back(Elt: PoisonMaskElem); |
| 5530 | continue; |
| 5531 | } |
| 5532 | |
| 5533 | // Calculate which source from the input this comes from and whether it |
| 5534 | // is new to us. |
| 5535 | unsigned Source = MaskElt / LTNumElts; |
| 5536 | if (NumSources == 0) { |
| 5537 | Source1 = Source; |
| 5538 | NumSources = 1; |
| 5539 | } else if (NumSources == 1 && Source != Source1) { |
| 5540 | Source2 = Source; |
| 5541 | NumSources = 2; |
| 5542 | } else if (NumSources >= 2 && Source != Source1 && Source != Source2) { |
| 5543 | NumSources++; |
| 5544 | } |
| 5545 | |
| 5546 | // Add to the new mask. For the NumSources>2 case these are not correct, |
| 5547 | // but are only used for the modular lane number. |
| 5548 | if (Source == Source1) |
| 5549 | NMask.push_back(Elt: MaskElt % LTNumElts); |
| 5550 | else if (Source == Source2) |
| 5551 | NMask.push_back(Elt: MaskElt % LTNumElts + LTNumElts); |
| 5552 | else |
| 5553 | NMask.push_back(Elt: MaskElt % LTNumElts); |
| 5554 | } |
| 5555 | // Check if we have already generated this sub-shuffle, which means we |
| 5556 | // will have already generated the output. For example a <16 x i32> splat |
| 5557 | // will be the same sub-splat 4 times, which only needs to be generated |
| 5558 | // once and reused. |
| 5559 | auto Result = |
| 5560 | PreviousCosts.insert(x: {std::make_tuple(args&: Source1, args&: Source2, args&: NMask), 0}); |
| 5561 | // Check if it was already in the map (already costed). |
| 5562 | if (!Result.second) |
| 5563 | continue; |
| 5564 | // If the sub-mask has at most 2 input sub-vectors then re-cost it using |
| 5565 | // getShuffleCost. If not, cost it using the worst case: the number of |
| 5566 | // elements moved into a new vector. |
| 5567 | InstructionCost NCost = |
| 5568 | NumSources <= 2 |
| 5569 | ? getShuffleCost(Kind: NumSources <= 1 ? TTI::SK_PermuteSingleSrc |
| 5570 | : TTI::SK_PermuteTwoSrc, |
| 5571 | DstTy: NTp, SrcTy: NTp, Mask: NMask, CostKind, Index: 0, SubTp: nullptr, Args, |
| 5572 | CxtI) |
| 5573 | : LTNumElts; |
| 5574 | Result.first->second = NCost; |
| 5575 | Cost += NCost; |
| 5576 | } |
| 5577 | return Cost; |
| 5578 | } |
| 5579 | |
| 5580 | Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp); |
| 5581 | bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector; |
| 5582 | // A subvector extract can be implemented with an ext (or trivial extract, if |
| 5583 | // from lane 0). This currently only handles low or high extracts to prevent |
| 5584 | // SLP vectorizer regressions. |
| 5585 | if (IsExtractSubvector && LT.second.isFixedLengthVector()) { |
| 5586 | if (LT.second.is128BitVector() && |
| 5587 | cast<FixedVectorType>(Val: SubTp)->getNumElements() == |
| 5588 | LT.second.getVectorNumElements() / 2) { |
| 5589 | if (Index == 0) |
| 5590 | return 0; |
| 5591 | if (Index == (int)LT.second.getVectorNumElements() / 2) |
| 5592 | return 1; |
| 5593 | } |
| 5594 | Kind = TTI::SK_PermuteSingleSrc; |
| 5595 | } |
| 5596 | // FIXME: This was added to keep the costs equal when adding DstTys. Update |
| 5597 | // the code to handle length-changing shuffles. |
| 5598 | if (Kind == TTI::SK_InsertSubvector) { |
| 5599 | LT = getTypeLegalizationCost(Ty: DstTy); |
| 5600 | SrcTy = DstTy; |
| 5601 | } |
| 5602 | |
| 5603 | // Segmented shuffle matching. |
| 5604 | if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(Val: SrcTy) && |
| 5605 | !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() && |
| 5606 | SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf( |
| 5607 | RHS: AArch64::SVEBitsPerBlock)) { |
| 5608 | |
| 5609 | FixedVectorType *VTy = cast<FixedVectorType>(Val: SrcTy); |
| 5610 | unsigned Segments = |
| 5611 | VTy->getPrimitiveSizeInBits() / AArch64::SVEBitsPerBlock; |
| 5612 | unsigned SegmentElts = VTy->getNumElements() / Segments; |
| 5613 | |
| 5614 | // dupq zd.t, zn.t[idx] |
| 5615 | if ((ST->hasSVE2p1() || ST->hasSME2p1()) && |
| 5616 | ST->isSVEorStreamingSVEAvailable() && |
| 5617 | isDUPQMask(Mask, Segments, SegmentSize: SegmentElts)) |
| 5618 | return LT.first; |
| 5619 | |
| 5620 | // mov zd.q, vn |
| 5621 | if (ST->isSVEorStreamingSVEAvailable() && |
| 5622 | isDUPFirstSegmentMask(Mask, Segments, SegmentSize: SegmentElts)) |
| 5623 | return LT.first; |
| 5624 | } |
| 5625 | |
| 5626 | // Check for broadcast loads, which are supported by the LD1R instruction. |
| 5627 | // In terms of code-size, the shuffle vector is free when a load + dup get |
| 5628 | // folded into a LD1R. That's what we check and return here. For performance |
| 5629 | // and reciprocal throughput, a LD1R is not completely free. In this case, we |
| 5630 | // return the cost for the broadcast below (i.e. 1 for most/all types), so |
| 5631 | // that we model the load + dup sequence slightly higher because LD1R is a |
| 5632 | // high latency instruction. |
| 5633 | if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) { |
| 5634 | bool IsLoad = !Args.empty() && isa<LoadInst>(Val: Args[0]); |
| 5635 | if (IsLoad && LT.second.isVector() && |
| 5636 | isLegalBroadcastLoad(ElementTy: SrcTy->getElementType(), |
| 5637 | NumElements: LT.second.getVectorElementCount())) |
| 5638 | return 0; |
| 5639 | } |
| 5640 | |
| 5641 | // If we have 4 elements for the shuffle and a Mask, get the cost straight |
| 5642 | // from the perfect shuffle tables. |
| 5643 | if (Mask.size() == 4 && |
| 5644 | SrcTy->getElementCount() == ElementCount::getFixed(MinVal: 4) && |
| 5645 | (SrcTy->getScalarSizeInBits() == 16 || |
| 5646 | SrcTy->getScalarSizeInBits() == 32) && |
| 5647 | all_of(Range&: Mask, P: [](int E) { return E < 8; })) |
| 5648 | return getPerfectShuffleCost(M: Mask); |
| 5649 | |
| 5650 | // Check for identity masks, which we can treat as free. |
| 5651 | if (!Mask.empty() && LT.second.isFixedLengthVector() && |
| 5652 | (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) && |
| 5653 | all_of(Range: enumerate(First&: Mask), P: [](const auto &M) { |
| 5654 | return M.value() < 0 || M.value() == (int)M.index(); |
| 5655 | })) |
| 5656 | return 0; |
| 5657 | |
| 5658 | // Check for other shuffles that are not SK_ kinds but we have native |
| 5659 | // instructions for, for example ZIP and UZP. |
| 5660 | unsigned Unused; |
| 5661 | if (LT.second.isFixedLengthVector() && |
| 5662 | LT.second.getVectorNumElements() == Mask.size() && |
| 5663 | (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) && |
| 5664 | (isZIPMask(M: Mask, NumElts: LT.second.getVectorNumElements(), WhichResultOut&: Unused) || |
| 5665 | isUZPMask(M: Mask, NumElts: LT.second.getVectorNumElements(), WhichResultOut&: Unused) || |
| 5666 | isREVMask(M: Mask, EltSize: LT.second.getScalarSizeInBits(), |
| 5667 | NumElts: LT.second.getVectorNumElements(), BlockSize: 16) || |
| 5668 | isREVMask(M: Mask, EltSize: LT.second.getScalarSizeInBits(), |
| 5669 | NumElts: LT.second.getVectorNumElements(), BlockSize: 32) || |
| 5670 | isREVMask(M: Mask, EltSize: LT.second.getScalarSizeInBits(), |
| 5671 | NumElts: LT.second.getVectorNumElements(), BlockSize: 64) || |
| 5672 | // Check for non-zero lane splats |
| 5673 | all_of(Range: drop_begin(RangeOrContainer&: Mask), |
| 5674 | P: [&Mask](int M) { return M < 0 || M == Mask[0]; }))) |
| 5675 | return 1; |
| 5676 | |
| 5677 | if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || |
| 5678 | Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || |
| 5679 | Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) { |
| 5680 | static const CostTblEntry ShuffleTbl[] = { |
| 5681 | // Broadcast shuffle kinds can be performed with 'dup'. |
| 5682 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i8, .Cost: 1}, |
| 5683 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16i8, .Cost: 1}, |
| 5684 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i16, .Cost: 1}, |
| 5685 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i16, .Cost: 1}, |
| 5686 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i32, .Cost: 1}, |
| 5687 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i32, .Cost: 1}, |
| 5688 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i64, .Cost: 1}, |
| 5689 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4f16, .Cost: 1}, |
| 5690 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8f16, .Cost: 1}, |
| 5691 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4bf16, .Cost: 1}, |
| 5692 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8bf16, .Cost: 1}, |
| 5693 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2f32, .Cost: 1}, |
| 5694 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4f32, .Cost: 1}, |
| 5695 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2f64, .Cost: 1}, |
| 5696 | // Transpose shuffle kinds can be performed with 'trn1/trn2' and |
| 5697 | // 'zip1/zip2' instructions. |
| 5698 | {.ISD: TTI::SK_Transpose, .Type: MVT::v8i8, .Cost: 1}, |
| 5699 | {.ISD: TTI::SK_Transpose, .Type: MVT::v16i8, .Cost: 1}, |
| 5700 | {.ISD: TTI::SK_Transpose, .Type: MVT::v4i16, .Cost: 1}, |
| 5701 | {.ISD: TTI::SK_Transpose, .Type: MVT::v8i16, .Cost: 1}, |
| 5702 | {.ISD: TTI::SK_Transpose, .Type: MVT::v2i32, .Cost: 1}, |
| 5703 | {.ISD: TTI::SK_Transpose, .Type: MVT::v4i32, .Cost: 1}, |
| 5704 | {.ISD: TTI::SK_Transpose, .Type: MVT::v2i64, .Cost: 1}, |
| 5705 | {.ISD: TTI::SK_Transpose, .Type: MVT::v4f16, .Cost: 1}, |
| 5706 | {.ISD: TTI::SK_Transpose, .Type: MVT::v8f16, .Cost: 1}, |
| 5707 | {.ISD: TTI::SK_Transpose, .Type: MVT::v4bf16, .Cost: 1}, |
| 5708 | {.ISD: TTI::SK_Transpose, .Type: MVT::v8bf16, .Cost: 1}, |
| 5709 | {.ISD: TTI::SK_Transpose, .Type: MVT::v2f32, .Cost: 1}, |
| 5710 | {.ISD: TTI::SK_Transpose, .Type: MVT::v4f32, .Cost: 1}, |
| 5711 | {.ISD: TTI::SK_Transpose, .Type: MVT::v2f64, .Cost: 1}, |
| 5712 | // Select shuffle kinds. |
| 5713 | // TODO: handle vXi8/vXi16. |
| 5714 | {.ISD: TTI::SK_Select, .Type: MVT::v2i32, .Cost: 1}, // mov. |
| 5715 | {.ISD: TTI::SK_Select, .Type: MVT::v4i32, .Cost: 2}, // rev+trn (or similar). |
| 5716 | {.ISD: TTI::SK_Select, .Type: MVT::v2i64, .Cost: 1}, // mov. |
| 5717 | {.ISD: TTI::SK_Select, .Type: MVT::v2f32, .Cost: 1}, // mov. |
| 5718 | {.ISD: TTI::SK_Select, .Type: MVT::v4f32, .Cost: 2}, // rev+trn (or similar). |
| 5719 | {.ISD: TTI::SK_Select, .Type: MVT::v2f64, .Cost: 1}, // mov. |
| 5720 | // PermuteSingleSrc shuffle kinds. |
| 5721 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i32, .Cost: 1}, // mov. |
| 5722 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i32, .Cost: 3}, // perfectshuffle worst case. |
| 5723 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i64, .Cost: 1}, // mov. |
| 5724 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f32, .Cost: 1}, // mov. |
| 5725 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f32, .Cost: 3}, // perfectshuffle worst case. |
| 5726 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f64, .Cost: 1}, // mov. |
| 5727 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i16, .Cost: 3}, // perfectshuffle worst case. |
| 5728 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f16, .Cost: 3}, // perfectshuffle worst case. |
| 5729 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4bf16, .Cost: 3}, // same |
| 5730 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i16, .Cost: 8}, // constpool + load + tbl |
| 5731 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f16, .Cost: 8}, // constpool + load + tbl |
| 5732 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8bf16, .Cost: 8}, // constpool + load + tbl |
| 5733 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i8, .Cost: 8}, // constpool + load + tbl |
| 5734 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i8, .Cost: 8}, // constpool + load + tbl |
| 5735 | // Reverse can be lowered with `rev`. |
| 5736 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2i32, .Cost: 1}, // REV64 |
| 5737 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4i32, .Cost: 2}, // REV64; EXT |
| 5738 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2i64, .Cost: 1}, // EXT |
| 5739 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2f32, .Cost: 1}, // REV64 |
| 5740 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4f32, .Cost: 2}, // REV64; EXT |
| 5741 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2f64, .Cost: 1}, // EXT |
| 5742 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8f16, .Cost: 2}, // REV64; EXT |
| 5743 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8bf16, .Cost: 2}, // REV64; EXT |
| 5744 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8i16, .Cost: 2}, // REV64; EXT |
| 5745 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16i8, .Cost: 2}, // REV64; EXT |
| 5746 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4f16, .Cost: 1}, // REV64 |
| 5747 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4bf16, .Cost: 1}, // REV64 |
| 5748 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4i16, .Cost: 1}, // REV64 |
| 5749 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8i8, .Cost: 1}, // REV64 |
| 5750 | // Splice can all be lowered as `ext`. |
| 5751 | {.ISD: TTI::SK_Splice, .Type: MVT::v2i32, .Cost: 1}, |
| 5752 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i32, .Cost: 1}, |
| 5753 | {.ISD: TTI::SK_Splice, .Type: MVT::v2i64, .Cost: 1}, |
| 5754 | {.ISD: TTI::SK_Splice, .Type: MVT::v2f32, .Cost: 1}, |
| 5755 | {.ISD: TTI::SK_Splice, .Type: MVT::v4f32, .Cost: 1}, |
| 5756 | {.ISD: TTI::SK_Splice, .Type: MVT::v2f64, .Cost: 1}, |
| 5757 | {.ISD: TTI::SK_Splice, .Type: MVT::v8f16, .Cost: 1}, |
| 5758 | {.ISD: TTI::SK_Splice, .Type: MVT::v8bf16, .Cost: 1}, |
| 5759 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i16, .Cost: 1}, |
| 5760 | {.ISD: TTI::SK_Splice, .Type: MVT::v16i8, .Cost: 1}, |
| 5761 | {.ISD: TTI::SK_Splice, .Type: MVT::v4f16, .Cost: 1}, |
| 5762 | {.ISD: TTI::SK_Splice, .Type: MVT::v4bf16, .Cost: 1}, |
| 5763 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i16, .Cost: 1}, |
| 5764 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i8, .Cost: 1}, |
| 5765 | // Broadcast shuffle kinds for scalable vectors |
| 5766 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv16i8, .Cost: 1}, |
| 5767 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8i16, .Cost: 1}, |
| 5768 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4i32, .Cost: 1}, |
| 5769 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2i64, .Cost: 1}, |
| 5770 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f16, .Cost: 1}, |
| 5771 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4f16, .Cost: 1}, |
| 5772 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8f16, .Cost: 1}, |
| 5773 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2bf16, .Cost: 1}, |
| 5774 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4bf16, .Cost: 1}, |
| 5775 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8bf16, .Cost: 1}, |
| 5776 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f32, .Cost: 1}, |
| 5777 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4f32, .Cost: 1}, |
| 5778 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f64, .Cost: 1}, |
| 5779 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv16i1, .Cost: 1}, |
| 5780 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8i1, .Cost: 1}, |
| 5781 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4i1, .Cost: 1}, |
| 5782 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2i1, .Cost: 1}, |
| 5783 | // Handle the cases for vector.reverse with scalable vectors |
| 5784 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv16i8, .Cost: 1}, |
| 5785 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8i16, .Cost: 1}, |
| 5786 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4i32, .Cost: 1}, |
| 5787 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2i64, .Cost: 1}, |
| 5788 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f16, .Cost: 1}, |
| 5789 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4f16, .Cost: 1}, |
| 5790 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8f16, .Cost: 1}, |
| 5791 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2bf16, .Cost: 1}, |
| 5792 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4bf16, .Cost: 1}, |
| 5793 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8bf16, .Cost: 1}, |
| 5794 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f32, .Cost: 1}, |
| 5795 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4f32, .Cost: 1}, |
| 5796 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f64, .Cost: 1}, |
| 5797 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv16i1, .Cost: 1}, |
| 5798 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8i1, .Cost: 1}, |
| 5799 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4i1, .Cost: 1}, |
| 5800 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2i1, .Cost: 1}, |
| 5801 | }; |
| 5802 | if (const auto *Entry = CostTableLookup(Table: ShuffleTbl, ISD: Kind, Ty: LT.second)) |
| 5803 | return LT.first * Entry->Cost; |
| 5804 | } |
| 5805 | |
| 5806 | if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Val: SrcTy)) |
| 5807 | return getSpliceCost(Tp: SrcTy, Index, CostKind); |
| 5808 | |
| 5809 | // Inserting a subvector can often be done with either a D, S or H register |
| 5810 | // move, so long as the inserted vector is "aligned". |
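|      | // For example, inserting a v2f32 subvector at index 0 or index 2 of a
|      | // v4f32 is expected to be a single register/lane move, so we only charge
|      | // the subvector's legalisation factor (SubLT.first) below.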
| 5811 | if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() && |
| 5812 | LT.second.getSizeInBits() <= 128 && SubTp) { |
| 5813 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(Ty: SubTp); |
| 5814 | if (SubLT.second.isVector()) { |
| 5815 | int NumElts = LT.second.getVectorNumElements(); |
| 5816 | int NumSubElts = SubLT.second.getVectorNumElements(); |
| 5817 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) |
| 5818 | return SubLT.first; |
| 5819 | } |
| 5820 | } |
| 5821 | |
| 5822 | // Restore optimal kind. |
| 5823 | if (IsExtractSubvector) |
| 5824 | Kind = TTI::SK_ExtractSubvector; |
| 5825 | return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp, |
| 5826 | Args, CxtI); |
| 5827 | } |
| 5828 | |
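|      | /// Return true if any load/store pointer in \p TheLoop has a negative
|      | /// (decreasing) stride, e.g. a loop walking an array backwards such as
|      | /// "for (i = n - 1; i >= 0; --i) sum += a[i];". Strides are computed
|      | /// speculatively (Assume=true) and without wrap checks.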
| 5829 | static bool containsDecreasingPointers(Loop *TheLoop, |
| 5830 | PredicatedScalarEvolution *PSE) { |
| 5831 | const auto &Strides = DenseMap<Value *, const SCEV *>(); |
| 5832 | for (BasicBlock *BB : TheLoop->blocks()) { |
| 5833 | // Scan the instructions in the block and look for addresses that are |
| 5834 | // consecutive and decreasing. |
| 5835 | for (Instruction &I : *BB) { |
| 5836 | if (isa<LoadInst>(Val: &I) || isa<StoreInst>(Val: &I)) { |
| 5837 | Value *Ptr = getLoadStorePointerOperand(V: &I); |
| 5838 | Type *AccessTy = getLoadStoreType(I: &I); |
| 5839 | if (getPtrStride(PSE&: *PSE, AccessTy, Ptr, Lp: TheLoop, StridesMap: Strides, /*Assume=*/true, |
| 5840 | /*ShouldCheckWrap=*/false) |
| 5841 | .value_or(u: 0) < 0) |
| 5842 | return true; |
| 5843 | } |
| 5844 | } |
| 5845 | } |
| 5846 | return false; |
| 5847 | } |
| 5848 | |
| 5849 | bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost() const { |
| 5850 | if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences()) |
| 5851 | return SVEPreferFixedOverScalableIfEqualCost; |
| 5852 | return ST->useFixedOverScalableIfEqualCost(); |
| 5853 | } |
| 5854 | |
| 5855 | unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const { |
| 5856 | return ST->getEpilogueVectorizationMinVF(); |
| 5857 | } |
| 5858 | |
| 5859 | bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { |
| 5860 | if (!ST->hasSVE()) |
| 5861 | return false; |
| 5862 | |
| 5863 | // We don't currently support vectorisation with interleaving for SVE - with |
| 5864 | // such loops we're better off not using tail-folding. This gives us a chance |
| 5865 | // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc. |
| 5866 | if (TFI->IAI->hasGroups()) |
| 5867 | return false; |
| 5868 | |
| 5869 | TailFoldingOpts Required = TailFoldingOpts::Disabled; |
| 5870 | if (TFI->LVL->getReductionVars().size()) |
| 5871 | Required |= TailFoldingOpts::Reductions; |
| 5872 | if (TFI->LVL->getFixedOrderRecurrences().size()) |
| 5873 | Required |= TailFoldingOpts::Recurrences; |
| 5874 | |
| 5875 | // We call this to discover whether any load/store pointers in the loop have |
| 5876 | // negative strides. This will require extra work to reverse the loop |
| 5877 | // predicate, which may be expensive. |
| 5878 | if (containsDecreasingPointers(TheLoop: TFI->LVL->getLoop(), |
| 5879 | PSE: TFI->LVL->getPredicatedScalarEvolution())) |
| 5880 | Required |= TailFoldingOpts::Reverse; |
| 5881 | if (Required == TailFoldingOpts::Disabled) |
| 5882 | Required |= TailFoldingOpts::Simple; |
| 5883 | |
| 5884 | if (!TailFoldingOptionLoc.satisfies(DefaultBits: ST->getSVETailFoldingDefaultOpts(), |
| 5885 | Required)) |
| 5886 | return false; |
| 5887 | |
| 5888 | // Don't tail-fold for tight loops where we would be better off interleaving |
| 5889 | // with an unpredicated loop. |
| 5890 | unsigned NumInsns = 0; |
| 5891 | for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) { |
| 5892 | NumInsns += BB->sizeWithoutDebug(); |
| 5893 | } |
| 5894 | |
| 5895 | // We expect 4 of these to be an IV PHI, an IV add, an IV compare and a branch.
| 5896 | return NumInsns >= SVETailFoldInsnThreshold; |
| 5897 | } |
| 5898 | |
| 5899 | InstructionCost |
| 5900 | AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, |
| 5901 | StackOffset BaseOffset, bool HasBaseReg, |
| 5902 | int64_t Scale, unsigned AddrSpace) const { |
| 5903 | // Scaling factors are not free at all. |
| 5904 | // Operands                     | Rt Latency
| 5905 | // -------------------------------------------
| 5906 | // Rt, [Xn, Xm]                 | 4
| 5907 | // -------------------------------------------
| 5908 | // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
| 5909 | // Rt, [Xn, Wm, <extend> #imm]  |
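|      | // For example (roughly): "ldr x0, [x1, x2, lsl #3]" uses a scaled index
|      | // and is costed as 1 below, whereas "ldr x0, [x1, x2]" with a scale of 1
|      | // is treated as free.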
| 5910 | TargetLoweringBase::AddrMode AM; |
| 5911 | AM.BaseGV = BaseGV; |
| 5912 | AM.BaseOffs = BaseOffset.getFixed(); |
| 5913 | AM.HasBaseReg = HasBaseReg; |
| 5914 | AM.Scale = Scale; |
| 5915 | AM.ScalableOffset = BaseOffset.getScalable(); |
| 5916 | if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AS: AddrSpace)) |
| 5917 | // Scale represents reg2 * scale, thus account for 1 if |
| 5918 | // it is not equal to 0 or 1. |
| 5919 | return AM.Scale != 0 && AM.Scale != 1; |
| 5920 | return InstructionCost::getInvalid(); |
| 5921 | } |
| 5922 | |
| 5923 | bool AArch64TTIImpl::shouldTreatInstructionLikeSelect( |
| 5924 | const Instruction *I) const { |
| 5925 | if (EnableOrLikeSelectOpt) { |
| 5926 | // For the binary operators (e.g. or) we need to be more careful than for
| 5927 | // selects; here we only transform them if they are already at a natural
| 5928 | // break point in the code - the end of a block with an unconditional
| 5929 | // terminator.
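|      | // For example, an "or i1 %a, %b" that is immediately followed by an
|      | // unconditional "br" is treated like a select here.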
| 5930 | if (I->getOpcode() == Instruction::Or && |
| 5931 | isa<BranchInst>(Val: I->getNextNode()) && |
| 5932 | cast<BranchInst>(Val: I->getNextNode())->isUnconditional()) |
| 5933 | return true; |
| 5934 | |
| 5935 | if (I->getOpcode() == Instruction::Add || |
| 5936 | I->getOpcode() == Instruction::Sub) |
| 5937 | return true; |
| 5938 | } |
| 5939 | return BaseT::shouldTreatInstructionLikeSelect(I); |
| 5940 | } |
| 5941 | |
| 5942 | bool AArch64TTIImpl::isLSRCostLess( |
| 5943 | const TargetTransformInfo::LSRCost &C1, |
| 5944 | const TargetTransformInfo::LSRCost &C2) const { |
| 5945 | // What is AArch64-specific here is adding the number of instructions to the
| 5946 | // comparison (though not as the first consideration, as some targets do),
| 5947 | // along with changing the priority of the base additions.
| 5948 | // TODO: Maybe a more nuanced tradeoff between instruction count |
| 5949 | // and number of registers? To be investigated at a later date. |
| 5950 | if (EnableLSRCostOpt) |
| 5951 | return std::tie(args: C1.NumRegs, args: C1.Insns, args: C1.NumBaseAdds, args: C1.AddRecCost, |
| 5952 | args: C1.NumIVMuls, args: C1.ScaleCost, args: C1.ImmCost, args: C1.SetupCost) < |
| 5953 | std::tie(args: C2.NumRegs, args: C2.Insns, args: C2.NumBaseAdds, args: C2.AddRecCost, |
| 5954 | args: C2.NumIVMuls, args: C2.ScaleCost, args: C2.ImmCost, args: C2.SetupCost); |
| 5955 | |
| 5956 | return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); |
| 5957 | } |
| 5958 | |
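|      | /// Return true if \p V is a shufflevector splat, i.e. its mask elements are
|      | /// all equal, e.g. "shufflevector <4 x i32> %v, <4 x i32> poison,
|      | ///                  <4 x i32> <i32 1, i32 1, i32 1, i32 1>".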
| 5959 | static bool isSplatShuffle(Value *V) { |
| 5960 | if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Val: V)) |
| 5961 | return all_equal(Range: Shuf->getShuffleMask()); |
| 5962 | return false; |
| 5963 | } |
| 5964 | |
| 5965 | /// Check if both Op1 and Op2 are shufflevector extracts of either the lower |
| 5966 | /// or upper half of the vector elements. |
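|      | /// For example, a pair of shuffles such as
|      | ///   "shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <4, 5, 6, 7>"
|      | ///   "shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <4, 5, 6, 7>"
|      | /// both extract the upper half of their inputs and qualify; mixing a
|      | /// lower-half extract with an upper-half extract does not.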
| 5967 | static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
| 5968 | bool AllowSplat = false) { |
| 5969 | // Scalable types can't be extract shuffle vectors. |
| 5970 | if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy()) |
| 5971 | return false; |
| 5972 | |
| 5973 | auto areTypesHalfed = [](Value *FullV, Value *HalfV) { |
| 5974 | auto *FullTy = FullV->getType(); |
| 5975 | auto *HalfTy = HalfV->getType(); |
| 5976 | return FullTy->getPrimitiveSizeInBits().getFixedValue() == |
| 5977 | 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue(); |
| 5978 | }; |
| 5979 | |
| 5980 | auto extractHalf = [](Value *FullV, Value *HalfV) {
| 5981 | auto *FullVT = cast<FixedVectorType>(Val: FullV->getType()); |
| 5982 | auto *HalfVT = cast<FixedVectorType>(Val: HalfV->getType()); |
| 5983 | return FullVT->getNumElements() == 2 * HalfVT->getNumElements(); |
| 5984 | }; |
| 5985 | |
| 5986 | ArrayRef<int> M1, M2; |
| 5987 | Value *S1Op1 = nullptr, *S2Op1 = nullptr; |
| 5988 | if (!match(V: Op1, P: m_Shuffle(v1: m_Value(V&: S1Op1), v2: m_Undef(), mask: m_Mask(M1))) || |
| 5989 | !match(V: Op2, P: m_Shuffle(v1: m_Value(V&: S2Op1), v2: m_Undef(), mask: m_Mask(M2)))) |
| 5990 | return false; |
| 5991 | |
| 5992 | // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that |
| 5993 | // it is not checked as an extract below. |
| 5994 | if (AllowSplat && isSplatShuffle(V: Op1)) |
| 5995 | S1Op1 = nullptr; |
| 5996 | if (AllowSplat && isSplatShuffle(V: Op2)) |
| 5997 | S2Op1 = nullptr; |
| 5998 | |
| 5999 | // Check that the operands are half as wide as the result and we extract |
| 6000 | // half of the elements of the input vectors. |
| 6001 | if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) || |
| 6002 | (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2)))) |
| 6003 | return false; |
| 6004 | |
| 6005 | // Check that the mask extracts either the lower or the upper half of the
| 6006 | // vector elements.
| 6007 | int M1Start = 0; |
| 6008 | int M2Start = 0; |
| 6009 | int NumElements = cast<FixedVectorType>(Val: Op1->getType())->getNumElements() * 2; |
| 6010 | if ((S1Op1 && |
| 6011 | !ShuffleVectorInst::isExtractSubvectorMask(Mask: M1, NumSrcElts: NumElements, Index&: M1Start)) || |
| 6012 | (S2Op1 && |
| 6013 | !ShuffleVectorInst::isExtractSubvectorMask(Mask: M2, NumSrcElts: NumElements, Index&: M2Start))) |
| 6014 | return false; |
| 6015 | |
| 6016 | if ((M1Start != 0 && M1Start != (NumElements / 2)) || |
| 6017 | (M2Start != 0 && M2Start != (NumElements / 2))) |
| 6018 | return false; |
| 6019 | if (S1Op1 && S2Op1 && M1Start != M2Start) |
| 6020 | return false; |
| 6021 | |
| 6022 | return true; |
| 6023 | } |
| 6024 | |
| 6025 | /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth |
| 6026 | /// of the vector elements. |
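|      | /// For example, "sext <4 x i16> %v to <4 x i32>" doubles the element width
|      | /// (16 -> 32 bits) and qualifies, whereas an extend from i8 to i32 does
|      | /// not.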
| 6027 | static bool areExtractExts(Value *Ext1, Value *Ext2) {
| 6028 | auto areExtDoubled = [](Instruction *Ext) { |
| 6029 | return Ext->getType()->getScalarSizeInBits() == |
| 6030 | 2 * Ext->getOperand(i: 0)->getType()->getScalarSizeInBits(); |
| 6031 | }; |
| 6032 | |
| 6033 | if (!match(V: Ext1, P: m_ZExtOrSExt(Op: m_Value())) || |
| 6034 | !match(V: Ext2, P: m_ZExtOrSExt(Op: m_Value())) || |
| 6035 | !areExtDoubled(cast<Instruction>(Val: Ext1)) || |
| 6036 | !areExtDoubled(cast<Instruction>(Val: Ext2))) |
| 6037 | return false; |
| 6038 | |
| 6039 | return true; |
| 6040 | } |
| 6041 | |
| 6042 | /// Check if Op could be used with vmull_high_p64 intrinsic. |
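|      | /// i.e. an "extractelement <2 x i64> %v, i64 1" style operand (typically
|      | /// the upper element of a 128-bit vector).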
| 6043 | static bool isOperandOfVmullHighP64(Value *Op) { |
| 6044 | Value *VectorOperand = nullptr; |
| 6045 | ConstantInt *ElementIndex = nullptr; |
| 6046 | return match(V: Op, P: m_ExtractElt(Val: m_Value(V&: VectorOperand), |
| 6047 | Idx: m_ConstantInt(CI&: ElementIndex))) && |
| 6048 | ElementIndex->getValue() == 1 && |
| 6049 | isa<FixedVectorType>(Val: VectorOperand->getType()) && |
| 6050 | cast<FixedVectorType>(Val: VectorOperand->getType())->getNumElements() == 2; |
| 6051 | } |
| 6052 | |
| 6053 | /// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic. |
| 6054 | static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) { |
| 6055 | return isOperandOfVmullHighP64(Op: Op1) && isOperandOfVmullHighP64(Op: Op2); |
| 6056 | } |
| 6057 | |
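|      | /// Check whether \p Ptrs, the vector-of-pointers operand of a masked
|      | /// gather/scatter, is a "scalar base + vector offsets" GEP worth sinking
|      | /// next to its use, e.g. "getelementptr float, ptr %base, <4 x i64> %offs".
|      | /// If the offsets are extended from <= 32 bits, the extend is queued for
|      | /// sinking too so that ISel can use 32-bit offset forms.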
| 6058 | static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) { |
| 6059 | // Restrict ourselves to the form CodeGenPrepare typically constructs. |
| 6060 | auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptrs); |
| 6061 | if (!GEP || GEP->getNumOperands() != 2) |
| 6062 | return false; |
| 6063 | |
| 6064 | Value *Base = GEP->getOperand(i_nocapture: 0); |
| 6065 | Value *Offsets = GEP->getOperand(i_nocapture: 1); |
| 6066 | |
| 6067 | // We only care about scalar_base+vector_offsets. |
| 6068 | if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy()) |
| 6069 | return false; |
| 6070 | |
| 6071 | // Sink extends that would allow us to use 32-bit offset vectors. |
| 6072 | if (isa<SExtInst>(Val: Offsets) || isa<ZExtInst>(Val: Offsets)) { |
| 6073 | auto *OffsetsInst = cast<Instruction>(Val: Offsets); |
| 6074 | if (OffsetsInst->getType()->getScalarSizeInBits() > 32 && |
| 6075 | OffsetsInst->getOperand(i: 0)->getType()->getScalarSizeInBits() <= 32) |
| 6076 | Ops.push_back(Elt: &GEP->getOperandUse(i: 1)); |
| 6077 | } |
| 6078 | |
| 6079 | // Sink the GEP. |
| 6080 | return true; |
| 6081 | } |
| 6082 | |
| 6083 | /// We want to sink following cases: |
| 6084 | /// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale; |
| 6085 | /// (add|sub|gep) A, ((mul|shl) zext(vscale), imm); |
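|      | /// For instance, given "%vs = call i64 @llvm.vscale.i64()" and
|      | /// "%m = mul nuw nsw i64 %vs, 16" feeding an add/sub/gep, the vscale use
|      | /// inside the mul is queued here so the whole expression can be sunk next
|      | /// to its user.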
| 6086 | static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) { |
| 6087 | if (match(V: Op, P: m_VScale())) |
| 6088 | return true; |
| 6089 | if (match(V: Op, P: m_Shl(L: m_VScale(), R: m_ConstantInt())) || |
| 6090 | match(V: Op, P: m_Mul(L: m_VScale(), R: m_ConstantInt()))) { |
| 6091 | Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: 0)); |
| 6092 | return true; |
| 6093 | } |
| 6094 | if (match(V: Op, P: m_Shl(L: m_ZExt(Op: m_VScale()), R: m_ConstantInt())) || |
| 6095 | match(V: Op, P: m_Mul(L: m_ZExt(Op: m_VScale()), R: m_ConstantInt()))) { |
| 6096 | Value *ZExtOp = cast<Instruction>(Val: Op)->getOperand(i: 0); |
| 6097 | Ops.push_back(Elt: &cast<Instruction>(Val: ZExtOp)->getOperandUse(i: 0)); |
| 6098 | Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: 0)); |
| 6099 | return true; |
| 6100 | } |
| 6101 | return false; |
| 6102 | } |
| 6103 | |
| 6104 | /// Check if sinking \p I's operands to I's basic block is profitable, because |
| 6105 | /// the operands can be folded into a target instruction, e.g. |
| 6106 | /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). |
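|      | /// Another example: for a vector mul fed by sext/zext (or splats of
|      | /// extends), sinking the extends/splats next to the mul lets the backend
|      | /// form smull/umull, possibly indexed by lane.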
| 6107 | bool AArch64TTIImpl::isProfitableToSinkOperands( |
| 6108 | Instruction *I, SmallVectorImpl<Use *> &Ops) const { |
| 6109 | if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: I)) { |
| 6110 | switch (II->getIntrinsicID()) { |
| 6111 | case Intrinsic::aarch64_neon_smull: |
| 6112 | case Intrinsic::aarch64_neon_umull: |
| 6113 | if (areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1), |
| 6114 | /*AllowSplat=*/true)) { |
| 6115 | Ops.push_back(Elt: &II->getOperandUse(i: 0)); |
| 6116 | Ops.push_back(Elt: &II->getOperandUse(i: 1)); |
| 6117 | return true; |
| 6118 | } |
| 6119 | [[fallthrough]]; |
| 6120 | |
| 6121 | case Intrinsic::fma: |
| 6122 | case Intrinsic::fmuladd: |
| 6123 | if (isa<VectorType>(Val: I->getType()) && |
| 6124 | cast<VectorType>(Val: I->getType())->getElementType()->isHalfTy() && |
| 6125 | !ST->hasFullFP16()) |
| 6126 | return false; |
| 6127 | [[fallthrough]]; |
| 6128 | case Intrinsic::aarch64_neon_sqdmull: |
| 6129 | case Intrinsic::aarch64_neon_sqdmulh: |
| 6130 | case Intrinsic::aarch64_neon_sqrdmulh: |
| 6131 | // Sink splats for index lane variants |
| 6132 | if (isSplatShuffle(V: II->getOperand(i_nocapture: 0))) |
| 6133 | Ops.push_back(Elt: &II->getOperandUse(i: 0)); |
| 6134 | if (isSplatShuffle(V: II->getOperand(i_nocapture: 1))) |
| 6135 | Ops.push_back(Elt: &II->getOperandUse(i: 1)); |
| 6136 | return !Ops.empty(); |
| 6137 | case Intrinsic::aarch64_neon_fmlal: |
| 6138 | case Intrinsic::aarch64_neon_fmlal2: |
| 6139 | case Intrinsic::aarch64_neon_fmlsl: |
| 6140 | case Intrinsic::aarch64_neon_fmlsl2: |
| 6141 | // Sink splats for index lane variants |
| 6142 | if (isSplatShuffle(V: II->getOperand(i_nocapture: 1))) |
| 6143 | Ops.push_back(Elt: &II->getOperandUse(i: 1)); |
| 6144 | if (isSplatShuffle(V: II->getOperand(i_nocapture: 2))) |
| 6145 | Ops.push_back(Elt: &II->getOperandUse(i: 2)); |
| 6146 | return !Ops.empty(); |
| 6147 | case Intrinsic::aarch64_sve_ptest_first: |
| 6148 | case Intrinsic::aarch64_sve_ptest_last: |
| 6149 | if (auto *IIOp = dyn_cast<IntrinsicInst>(Val: II->getOperand(i_nocapture: 0))) |
| 6150 | if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue) |
| 6151 | Ops.push_back(Elt: &II->getOperandUse(i: 0)); |
| 6152 | return !Ops.empty(); |
| 6153 | case Intrinsic::aarch64_sme_write_horiz: |
| 6154 | case Intrinsic::aarch64_sme_write_vert: |
| 6155 | case Intrinsic::aarch64_sme_writeq_horiz: |
| 6156 | case Intrinsic::aarch64_sme_writeq_vert: { |
| 6157 | auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 1)); |
| 6158 | if (!Idx || Idx->getOpcode() != Instruction::Add) |
| 6159 | return false; |
| 6160 | Ops.push_back(Elt: &II->getOperandUse(i: 1)); |
| 6161 | return true; |
| 6162 | } |
| 6163 | case Intrinsic::aarch64_sme_read_horiz: |
| 6164 | case Intrinsic::aarch64_sme_read_vert: |
| 6165 | case Intrinsic::aarch64_sme_readq_horiz: |
| 6166 | case Intrinsic::aarch64_sme_readq_vert: |
| 6167 | case Intrinsic::aarch64_sme_ld1b_vert: |
| 6168 | case Intrinsic::aarch64_sme_ld1h_vert: |
| 6169 | case Intrinsic::aarch64_sme_ld1w_vert: |
| 6170 | case Intrinsic::aarch64_sme_ld1d_vert: |
| 6171 | case Intrinsic::aarch64_sme_ld1q_vert: |
| 6172 | case Intrinsic::aarch64_sme_st1b_vert: |
| 6173 | case Intrinsic::aarch64_sme_st1h_vert: |
| 6174 | case Intrinsic::aarch64_sme_st1w_vert: |
| 6175 | case Intrinsic::aarch64_sme_st1d_vert: |
| 6176 | case Intrinsic::aarch64_sme_st1q_vert: |
| 6177 | case Intrinsic::aarch64_sme_ld1b_horiz: |
| 6178 | case Intrinsic::aarch64_sme_ld1h_horiz: |
| 6179 | case Intrinsic::aarch64_sme_ld1w_horiz: |
| 6180 | case Intrinsic::aarch64_sme_ld1d_horiz: |
| 6181 | case Intrinsic::aarch64_sme_ld1q_horiz: |
| 6182 | case Intrinsic::aarch64_sme_st1b_horiz: |
| 6183 | case Intrinsic::aarch64_sme_st1h_horiz: |
| 6184 | case Intrinsic::aarch64_sme_st1w_horiz: |
| 6185 | case Intrinsic::aarch64_sme_st1d_horiz: |
| 6186 | case Intrinsic::aarch64_sme_st1q_horiz: { |
| 6187 | auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 3)); |
| 6188 | if (!Idx || Idx->getOpcode() != Instruction::Add) |
| 6189 | return false; |
| 6190 | Ops.push_back(Elt: &II->getOperandUse(i: 3)); |
| 6191 | return true; |
| 6192 | } |
| 6193 | case Intrinsic::aarch64_neon_pmull: |
| 6194 | if (!areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1))) |
| 6195 | return false; |
| 6196 | Ops.push_back(Elt: &II->getOperandUse(i: 0)); |
| 6197 | Ops.push_back(Elt: &II->getOperandUse(i: 1)); |
| 6198 | return true; |
| 6199 | case Intrinsic::aarch64_neon_pmull64: |
| 6200 | if (!areOperandsOfVmullHighP64(Op1: II->getArgOperand(i: 0), |
| 6201 | Op2: II->getArgOperand(i: 1))) |
| 6202 | return false; |
| 6203 | Ops.push_back(Elt: &II->getArgOperandUse(i: 0)); |
| 6204 | Ops.push_back(Elt: &II->getArgOperandUse(i: 1)); |
| 6205 | return true; |
| 6206 | case Intrinsic::masked_gather: |
| 6207 | if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 0), Ops)) |
| 6208 | return false; |
| 6209 | Ops.push_back(Elt: &II->getArgOperandUse(i: 0)); |
| 6210 | return true; |
| 6211 | case Intrinsic::masked_scatter: |
| 6212 | if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 1), Ops)) |
| 6213 | return false; |
| 6214 | Ops.push_back(Elt: &II->getArgOperandUse(i: 1)); |
| 6215 | return true; |
| 6216 | default: |
| 6217 | return false; |
| 6218 | } |
| 6219 | } |
| 6220 | |
| 6221 | auto ShouldSinkCondition = [](Value *Cond) -> bool { |
| 6222 | auto *II = dyn_cast<IntrinsicInst>(Val: Cond); |
| 6223 | return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or && |
| 6224 | isa<ScalableVectorType>(Val: II->getOperand(i_nocapture: 0)->getType()); |
| 6225 | }; |
| 6226 | |
| 6227 | switch (I->getOpcode()) { |
| 6228 | case Instruction::GetElementPtr: |
| 6229 | case Instruction::Add: |
| 6230 | case Instruction::Sub: |
| 6231 | // Sink vscales closer to uses for better isel |
| 6232 | for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) { |
| 6233 | if (shouldSinkVScale(Op: I->getOperand(i: Op), Ops)) { |
| 6234 | Ops.push_back(Elt: &I->getOperandUse(i: Op)); |
| 6235 | return true; |
| 6236 | } |
| 6237 | } |
| 6238 | break; |
| 6239 | case Instruction::Select: { |
| 6240 | if (!ShouldSinkCondition(I->getOperand(i: 0))) |
| 6241 | return false; |
| 6242 | |
| 6243 | Ops.push_back(Elt: &I->getOperandUse(i: 0)); |
| 6244 | return true; |
| 6245 | } |
| 6246 | case Instruction::Br: { |
| 6247 | if (cast<BranchInst>(Val: I)->isUnconditional()) |
| 6248 | return false; |
| 6249 | |
| 6250 | if (!ShouldSinkCondition(cast<BranchInst>(Val: I)->getCondition())) |
| 6251 | return false; |
| 6252 | |
| 6253 | Ops.push_back(Elt: &I->getOperandUse(i: 0)); |
| 6254 | return true; |
| 6255 | } |
| 6256 | default: |
| 6257 | break; |
| 6258 | } |
| 6259 | |
| 6260 | if (!I->getType()->isVectorTy()) |
| 6261 | return false; |
| 6262 | |
| 6263 | switch (I->getOpcode()) { |
| 6264 | case Instruction::Sub: |
| 6265 | case Instruction::Add: { |
| 6266 | if (!areExtractExts(Ext1: I->getOperand(i: 0), Ext2: I->getOperand(i: 1))) |
| 6267 | return false; |
| 6268 | |
| 6269 | // If the exts' operands extract either the lower or upper elements, we |
| 6270 | // can sink them too. |
| 6271 | auto Ext1 = cast<Instruction>(Val: I->getOperand(i: 0)); |
| 6272 | auto Ext2 = cast<Instruction>(Val: I->getOperand(i: 1)); |
| 6273 | if (areExtractShuffleVectors(Op1: Ext1->getOperand(i: 0), Op2: Ext2->getOperand(i: 0))) { |
| 6274 | Ops.push_back(Elt: &Ext1->getOperandUse(i: 0)); |
| 6275 | Ops.push_back(Elt: &Ext2->getOperandUse(i: 0)); |
| 6276 | } |
| 6277 | |
| 6278 | Ops.push_back(Elt: &I->getOperandUse(i: 0)); |
| 6279 | Ops.push_back(Elt: &I->getOperandUse(i: 1)); |
| 6280 | |
| 6281 | return true; |
| 6282 | } |
| 6283 | case Instruction::Or: { |
| 6284 | // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) -> |
| 6285 | // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1) |
| 6286 | if (ST->hasNEON()) { |
| 6287 | Instruction *OtherAnd, *IA, *IB; |
| 6288 | Value *MaskValue; |
| 6289 | // MainAnd refers to the And instruction that has 'Not' as one of its operands
| 6290 | if (match(V: I, P: m_c_Or(L: m_OneUse(SubPattern: m_Instruction(I&: OtherAnd)), |
| 6291 | R: m_OneUse(SubPattern: m_c_And(L: m_OneUse(SubPattern: m_Not(V: m_Value(V&: MaskValue))), |
| 6292 | R: m_Instruction(I&: IA)))))) { |
| 6293 | if (match(V: OtherAnd, |
| 6294 | P: m_c_And(L: m_Specific(V: MaskValue), R: m_Instruction(I&: IB)))) { |
| 6295 | Instruction *MainAnd = I->getOperand(i: 0) == OtherAnd |
| 6296 | ? cast<Instruction>(Val: I->getOperand(i: 1)) |
| 6297 | : cast<Instruction>(Val: I->getOperand(i: 0)); |
| 6298 | |
| 6299 | // Both Ands should be in the same basic block as the Or
| 6300 | if (I->getParent() != MainAnd->getParent() || |
| 6301 | I->getParent() != OtherAnd->getParent()) |
| 6302 | return false; |
| 6303 | |
| 6304 | // Non-mask operands of both Ands should also be in the same basic block
| 6305 | if (I->getParent() != IA->getParent() || |
| 6306 | I->getParent() != IB->getParent()) |
| 6307 | return false; |
| 6308 | |
| 6309 | Ops.push_back( |
| 6310 | Elt: &MainAnd->getOperandUse(i: MainAnd->getOperand(i: 0) == IA ? 1 : 0)); |
| 6311 | Ops.push_back(Elt: &I->getOperandUse(i: 0)); |
| 6312 | Ops.push_back(Elt: &I->getOperandUse(i: 1)); |
| 6313 | |
| 6314 | return true; |
| 6315 | } |
| 6316 | } |
| 6317 | } |
| 6318 | |
| 6319 | return false; |
| 6320 | } |
| 6321 | case Instruction::Mul: { |
| 6322 | auto ShouldSinkSplatForIndexedVariant = [](Value *V) { |
| 6323 | auto *Ty = cast<VectorType>(Val: V->getType()); |
| 6324 | // For SVE the lane-indexing is within 128-bits, so we can't fold splats. |
| 6325 | if (Ty->isScalableTy()) |
| 6326 | return false; |
| 6327 | |
| 6328 | // Indexed variants of Mul exist for i16 and i32 element types only. |
| 6329 | return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32; |
| 6330 | }; |
| 6331 | |
| 6332 | int NumZExts = 0, NumSExts = 0; |
| 6333 | for (auto &Op : I->operands()) { |
| 6334 | // Make sure we are not already sinking this operand |
| 6335 | if (any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op; })) |
| 6336 | continue; |
| 6337 | |
| 6338 | if (match(V: &Op, P: m_ZExtOrSExt(Op: m_Value()))) { |
| 6339 | auto *Ext = cast<Instruction>(Val&: Op); |
| 6340 | auto *ExtOp = Ext->getOperand(i: 0); |
| 6341 | if (isSplatShuffle(V: ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp)) |
| 6342 | Ops.push_back(Elt: &Ext->getOperandUse(i: 0)); |
| 6343 | Ops.push_back(Elt: &Op); |
| 6344 | |
| 6345 | if (isa<SExtInst>(Val: Ext)) |
| 6346 | NumSExts++; |
| 6347 | else |
| 6348 | NumZExts++; |
| 6349 | |
| 6350 | continue; |
| 6351 | } |
| 6352 | |
| 6353 | ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Val&: Op); |
| 6354 | if (!Shuffle) |
| 6355 | continue; |
| 6356 | |
| 6357 | // If the Shuffle is a splat and the operand is a zext/sext, sinking the
| 6358 | // operand and the s/zext can help create indexed s/umull. This is
| 6359 | // especially useful to prevent an i64 mul from being scalarized.
| 6360 | if (isSplatShuffle(V: Shuffle) && |
| 6361 | match(V: Shuffle->getOperand(i_nocapture: 0), P: m_ZExtOrSExt(Op: m_Value()))) { |
| 6362 | Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0)); |
| 6363 | Ops.push_back(Elt: &Op); |
| 6364 | if (match(V: Shuffle->getOperand(i_nocapture: 0), P: m_SExt(Op: m_Value()))) |
| 6365 | NumSExts++; |
| 6366 | else |
| 6367 | NumZExts++; |
| 6368 | continue; |
| 6369 | } |
| 6370 | |
| 6371 | Value *ShuffleOperand = Shuffle->getOperand(i_nocapture: 0); |
| 6372 | InsertElementInst *Insert = dyn_cast<InsertElementInst>(Val: ShuffleOperand); |
| 6373 | if (!Insert) |
| 6374 | continue; |
| 6375 | |
| 6376 | Instruction *OperandInstr = dyn_cast<Instruction>(Val: Insert->getOperand(i_nocapture: 1)); |
| 6377 | if (!OperandInstr) |
| 6378 | continue; |
| 6379 | |
| 6380 | ConstantInt *ElementConstant = |
| 6381 | dyn_cast<ConstantInt>(Val: Insert->getOperand(i_nocapture: 2)); |
| 6382 | // Check that the insertelement is inserting into element 0 |
| 6383 | if (!ElementConstant || !ElementConstant->isZero()) |
| 6384 | continue; |
| 6385 | |
| 6386 | unsigned Opcode = OperandInstr->getOpcode(); |
| 6387 | if (Opcode == Instruction::SExt) |
| 6388 | NumSExts++; |
| 6389 | else if (Opcode == Instruction::ZExt) |
| 6390 | NumZExts++; |
| 6391 | else { |
| 6392 | // If we find that the top bits are known 0, then we can sink and allow |
| 6393 | // the backend to generate a umull. |
| 6394 | unsigned Bitwidth = I->getType()->getScalarSizeInBits(); |
| 6395 | APInt UpperMask = APInt::getHighBitsSet(numBits: Bitwidth, hiBitsSet: Bitwidth / 2); |
| 6396 | if (!MaskedValueIsZero(V: OperandInstr, Mask: UpperMask, SQ: DL)) |
| 6397 | continue; |
| 6398 | NumZExts++; |
| 6399 | } |
| 6400 | |
| 6401 | // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking |
| 6402 | // the And, just to hoist it again back to the load. |
| 6403 | if (!match(V: OperandInstr, P: m_And(L: m_Load(Op: m_Value()), R: m_Value()))) |
| 6404 | Ops.push_back(Elt: &Insert->getOperandUse(i: 1)); |
| 6405 | Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0)); |
| 6406 | Ops.push_back(Elt: &Op); |
| 6407 | } |
| 6408 | |
| 6409 | // It is profitable to sink if we found two extends of the same type.
| 6410 | if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2)) |
| 6411 | return true; |
| 6412 | |
| 6413 | // Otherwise, see if we should sink splats for indexed variants. |
| 6414 | if (!ShouldSinkSplatForIndexedVariant(I)) |
| 6415 | return false; |
| 6416 | |
| 6417 | Ops.clear(); |
| 6418 | if (isSplatShuffle(V: I->getOperand(i: 0))) |
| 6419 | Ops.push_back(Elt: &I->getOperandUse(i: 0)); |
| 6420 | if (isSplatShuffle(V: I->getOperand(i: 1))) |
| 6421 | Ops.push_back(Elt: &I->getOperandUse(i: 1)); |
| 6422 | |
| 6423 | return !Ops.empty(); |
| 6424 | } |
| 6425 | case Instruction::FMul: { |
| 6426 | // For SVE the lane-indexing is within 128-bits, so we can't fold splats. |
| 6427 | if (I->getType()->isScalableTy()) |
| 6428 | return false; |
| 6429 | |
| 6430 | if (cast<VectorType>(Val: I->getType())->getElementType()->isHalfTy() && |
| 6431 | !ST->hasFullFP16()) |
| 6432 | return false; |
| 6433 | |
| 6434 | // Sink splats for index lane variants |
| 6435 | if (isSplatShuffle(V: I->getOperand(i: 0))) |
| 6436 | Ops.push_back(Elt: &I->getOperandUse(i: 0)); |
| 6437 | if (isSplatShuffle(V: I->getOperand(i: 1))) |
| 6438 | Ops.push_back(Elt: &I->getOperandUse(i: 1)); |
| 6439 | return !Ops.empty(); |
| 6440 | } |
| 6441 | default: |
| 6442 | return false; |
| 6443 | } |
| 6444 | return false; |
| 6445 | } |
| 6446 | |