1 | //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "AArch64TargetTransformInfo.h" |
10 | #include "AArch64ExpandImm.h" |
11 | #include "AArch64PerfectShuffle.h" |
12 | #include "MCTargetDesc/AArch64AddressingModes.h" |
13 | #include "Utils/AArch64SMEAttributes.h" |
14 | #include "llvm/ADT/DenseMap.h" |
15 | #include "llvm/Analysis/LoopInfo.h" |
16 | #include "llvm/Analysis/TargetTransformInfo.h" |
17 | #include "llvm/CodeGen/BasicTTIImpl.h" |
18 | #include "llvm/CodeGen/CostTable.h" |
19 | #include "llvm/CodeGen/TargetLowering.h" |
20 | #include "llvm/IR/DerivedTypes.h" |
21 | #include "llvm/IR/IntrinsicInst.h" |
22 | #include "llvm/IR/Intrinsics.h" |
23 | #include "llvm/IR/IntrinsicsAArch64.h" |
24 | #include "llvm/IR/PatternMatch.h" |
25 | #include "llvm/Support/Debug.h" |
26 | #include "llvm/TargetParser/AArch64TargetParser.h" |
27 | #include "llvm/Transforms/InstCombine/InstCombiner.h" |
28 | #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" |
29 | #include <algorithm> |
30 | #include <optional> |
31 | using namespace llvm; |
32 | using namespace llvm::PatternMatch; |
33 | |
34 | #define DEBUG_TYPE "aarch64tti" |
35 | |
36 | static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix" , |
37 | cl::init(Val: true), cl::Hidden); |
38 | |
39 | static cl::opt<bool> SVEPreferFixedOverScalableIfEqualCost( |
40 | "sve-prefer-fixed-over-scalable-if-equal" , cl::Hidden); |
41 | |
42 | static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead" , cl::init(Val: 10), |
43 | cl::Hidden); |
44 | |
45 | static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead" , |
46 | cl::init(Val: 10), cl::Hidden); |
47 | |
48 | static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold" , |
49 | cl::init(Val: 15), cl::Hidden); |
50 | |
51 | static cl::opt<unsigned> |
52 | NeonNonConstStrideOverhead("neon-nonconst-stride-overhead" , cl::init(Val: 10), |
53 | cl::Hidden); |
54 | |
55 | static cl::opt<unsigned> CallPenaltyChangeSM( |
56 | "call-penalty-sm-change" , cl::init(Val: 5), cl::Hidden, |
57 | cl::desc( |
58 | "Penalty of calling a function that requires a change to PSTATE.SM" )); |
59 | |
60 | static cl::opt<unsigned> InlineCallPenaltyChangeSM( |
61 | "inline-call-penalty-sm-change" , cl::init(Val: 10), cl::Hidden, |
62 | cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM" )); |
63 | |
64 | static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select" , |
65 | cl::init(Val: true), cl::Hidden); |
66 | |
67 | static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt" , |
68 | cl::init(Val: true), cl::Hidden); |
69 | |
70 | // A complete guess as to a reasonable cost. |
71 | static cl::opt<unsigned> |
72 | BaseHistCntCost("aarch64-base-histcnt-cost" , cl::init(Val: 8), cl::Hidden, |
73 | cl::desc("The cost of a histcnt instruction" )); |
74 | |
75 | static cl::opt<unsigned> DMBLookaheadThreshold( |
76 | "dmb-lookahead-threshold" , cl::init(Val: 10), cl::Hidden, |
77 | cl::desc("The number of instructions to search for a redundant dmb" )); |
78 | |
79 | namespace { |
80 | class TailFoldingOption { |
81 | // These bitfields will only ever be set to something non-zero in operator=, |
82 | // when setting the -sve-tail-folding option. This option should always be of |
  // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where
  // InitialBits is one of (disabled|all|simple). EnableBits represents
85 | // additional flags we're enabling, and DisableBits for those flags we're |
86 | // disabling. The default flag is tracked in the variable NeedsDefault, since |
87 | // at the time of setting the option we may not know what the default value |
88 | // for the CPU is. |
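  //
  // For example, "-sve-tail-folding=all+noreductions" sets InitialBits to
  // TailFoldingOpts::All and records Reductions in DisableBits, whereas
  // "-sve-tail-folding=reductions" leaves InitialBits disabled and only adds
  // Reductions to EnableBits.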
89 | TailFoldingOpts InitialBits = TailFoldingOpts::Disabled; |
90 | TailFoldingOpts EnableBits = TailFoldingOpts::Disabled; |
91 | TailFoldingOpts DisableBits = TailFoldingOpts::Disabled; |
92 | |
93 | // This value needs to be initialised to true in case the user does not |
94 | // explicitly set the -sve-tail-folding option. |
95 | bool NeedsDefault = true; |
96 | |
97 | void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; } |
98 | |
99 | void setNeedsDefault(bool V) { NeedsDefault = V; } |
100 | |
101 | void setEnableBit(TailFoldingOpts Bit) { |
102 | EnableBits |= Bit; |
103 | DisableBits &= ~Bit; |
104 | } |
105 | |
106 | void setDisableBit(TailFoldingOpts Bit) { |
107 | EnableBits &= ~Bit; |
108 | DisableBits |= Bit; |
109 | } |
110 | |
111 | TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const { |
112 | TailFoldingOpts Bits = TailFoldingOpts::Disabled; |
113 | |
114 | assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) && |
115 | "Initial bits should only include one of " |
116 | "(disabled|all|simple|default)" ); |
117 | Bits = NeedsDefault ? DefaultBits : InitialBits; |
118 | Bits |= EnableBits; |
119 | Bits &= ~DisableBits; |
120 | |
121 | return Bits; |
122 | } |
123 | |
124 | void reportError(std::string Opt) { |
125 | errs() << "invalid argument '" << Opt |
126 | << "' to -sve-tail-folding=; the option should be of the form\n" |
127 | " (disabled|all|default|simple)[+(reductions|recurrences" |
128 | "|reverse|noreductions|norecurrences|noreverse)]\n" ; |
129 | report_fatal_error(reason: "Unrecognised tail-folding option" ); |
130 | } |
131 | |
132 | public: |
133 | |
134 | void operator=(const std::string &Val) { |
135 | // If the user explicitly sets -sve-tail-folding= then treat as an error. |
136 | if (Val.empty()) { |
137 | reportError(Opt: "" ); |
138 | return; |
139 | } |
140 | |
141 | // Since the user is explicitly setting the option we don't automatically |
142 | // need the default unless they require it. |
143 | setNeedsDefault(false); |
144 | |
145 | SmallVector<StringRef, 4> TailFoldTypes; |
146 | StringRef(Val).split(A&: TailFoldTypes, Separator: '+', MaxSplit: -1, KeepEmpty: false); |
147 | |
148 | unsigned StartIdx = 1; |
149 | if (TailFoldTypes[0] == "disabled" ) |
150 | setInitialBits(TailFoldingOpts::Disabled); |
151 | else if (TailFoldTypes[0] == "all" ) |
152 | setInitialBits(TailFoldingOpts::All); |
153 | else if (TailFoldTypes[0] == "default" ) |
154 | setNeedsDefault(true); |
155 | else if (TailFoldTypes[0] == "simple" ) |
156 | setInitialBits(TailFoldingOpts::Simple); |
157 | else { |
158 | StartIdx = 0; |
159 | setInitialBits(TailFoldingOpts::Disabled); |
160 | } |
161 | |
162 | for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) { |
163 | if (TailFoldTypes[I] == "reductions" ) |
164 | setEnableBit(TailFoldingOpts::Reductions); |
165 | else if (TailFoldTypes[I] == "recurrences" ) |
166 | setEnableBit(TailFoldingOpts::Recurrences); |
167 | else if (TailFoldTypes[I] == "reverse" ) |
168 | setEnableBit(TailFoldingOpts::Reverse); |
169 | else if (TailFoldTypes[I] == "noreductions" ) |
170 | setDisableBit(TailFoldingOpts::Reductions); |
171 | else if (TailFoldTypes[I] == "norecurrences" ) |
172 | setDisableBit(TailFoldingOpts::Recurrences); |
173 | else if (TailFoldTypes[I] == "noreverse" ) |
174 | setDisableBit(TailFoldingOpts::Reverse); |
175 | else |
176 | reportError(Opt: Val); |
177 | } |
178 | } |
179 | |
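  // Returns true if, after combining the initial/default bits with any
  // explicitly enabled or disabled flags, every bit in Required is set.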
180 | bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const { |
181 | return (getBits(DefaultBits) & Required) == Required; |
182 | } |
183 | }; |
184 | } // namespace |
185 | |
186 | TailFoldingOption TailFoldingOptionLoc; |
187 | |
188 | static cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding( |
189 | "sve-tail-folding" , |
190 | cl::desc( |
191 | "Control the use of vectorisation using tail-folding for SVE where the" |
192 | " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" |
193 | "\ndisabled (Initial) No loop types will vectorize using " |
194 | "tail-folding" |
195 | "\ndefault (Initial) Uses the default tail-folding settings for " |
196 | "the target CPU" |
197 | "\nall (Initial) All legal loop types will vectorize using " |
198 | "tail-folding" |
199 | "\nsimple (Initial) Use tail-folding for simple loops (not " |
200 | "reductions or recurrences)" |
201 | "\nreductions Use tail-folding for loops containing reductions" |
202 | "\nnoreductions Inverse of above" |
203 | "\nrecurrences Use tail-folding for loops containing fixed order " |
204 | "recurrences" |
205 | "\nnorecurrences Inverse of above" |
206 | "\nreverse Use tail-folding for loops requiring reversed " |
207 | "predicates" |
208 | "\nnoreverse Inverse of above" ), |
209 | cl::location(L&: TailFoldingOptionLoc)); |
210 | |
211 | // Experimental option that will only be fully functional when the |
212 | // code-generator is changed to use SVE instead of NEON for all fixed-width |
213 | // operations. |
214 | static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode( |
215 | "enable-fixedwidth-autovec-in-streaming-mode" , cl::init(Val: false), cl::Hidden); |
216 | |
217 | // Experimental option that will only be fully functional when the cost-model |
218 | // and code-generator have been changed to avoid using scalable vector |
219 | // instructions that are not legal in streaming SVE mode. |
220 | static cl::opt<bool> EnableScalableAutovecInStreamingMode( |
221 | "enable-scalable-autovec-in-streaming-mode" , cl::init(Val: false), cl::Hidden); |
222 | |
223 | static bool isSMEABIRoutineCall(const CallInst &CI) { |
224 | const auto *F = CI.getCalledFunction(); |
225 | return F && StringSwitch<bool>(F->getName()) |
226 | .Case(S: "__arm_sme_state" , Value: true) |
227 | .Case(S: "__arm_tpidr2_save" , Value: true) |
228 | .Case(S: "__arm_tpidr2_restore" , Value: true) |
229 | .Case(S: "__arm_za_disable" , Value: true) |
230 | .Default(Value: false); |
231 | } |
232 | |
233 | /// Returns true if the function has explicit operations that can only be |
234 | /// lowered using incompatible instructions for the selected mode. This also |
235 | /// returns true if the function F may use or modify ZA state. |
236 | static bool hasPossibleIncompatibleOps(const Function *F) { |
237 | for (const BasicBlock &BB : *F) { |
238 | for (const Instruction &I : BB) { |
239 | // Be conservative for now and assume that any call to inline asm or to |
      // intrinsics could result in non-streaming ops (e.g. calls to
241 | // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that |
242 | // all native LLVM instructions can be lowered to compatible instructions. |
243 | if (isa<CallInst>(Val: I) && !I.isDebugOrPseudoInst() && |
244 | (cast<CallInst>(Val: I).isInlineAsm() || isa<IntrinsicInst>(Val: I) || |
245 | isSMEABIRoutineCall(CI: cast<CallInst>(Val: I)))) |
246 | return true; |
247 | } |
248 | } |
249 | return false; |
250 | } |
251 | |
252 | uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const { |
253 | StringRef AttributeStr = |
254 | isMultiversionedFunction(F) ? "fmv-features" : "target-features" ; |
255 | StringRef FeatureStr = F.getFnAttribute(Kind: AttributeStr).getValueAsString(); |
256 | SmallVector<StringRef, 8> Features; |
257 | FeatureStr.split(A&: Features, Separator: "," ); |
258 | return AArch64::getFMVPriority(Features); |
259 | } |
260 | |
261 | bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const { |
262 | return F.hasFnAttribute(Kind: "fmv-features" ); |
263 | } |
264 | |
265 | const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = { |
266 | AArch64::FeatureExecuteOnly, |
267 | }; |
268 | |
269 | bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, |
270 | const Function *Callee) const { |
271 | SMECallAttrs CallAttrs(*Caller, *Callee); |
272 | |
273 | // When inlining, we should consider the body of the function, not the |
274 | // interface. |
275 | if (CallAttrs.callee().hasStreamingBody()) { |
276 | CallAttrs.callee().set(M: SMEAttrs::SM_Compatible, Enable: false); |
277 | CallAttrs.callee().set(M: SMEAttrs::SM_Enabled, Enable: true); |
278 | } |
279 | |
280 | if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0()) |
281 | return false; |
282 | |
283 | if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() || |
284 | CallAttrs.requiresPreservingZT0() || |
285 | CallAttrs.requiresPreservingAllZAState()) { |
286 | if (hasPossibleIncompatibleOps(F: Callee)) |
287 | return false; |
288 | } |
289 | |
290 | const TargetMachine &TM = getTLI()->getTargetMachine(); |
291 | const FeatureBitset &CallerBits = |
292 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); |
293 | const FeatureBitset &CalleeBits = |
294 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); |
295 | // Adjust the feature bitsets by inverting some of the bits. This is needed |
296 | // for target features that represent restrictions rather than capabilities, |
297 | // for example a "+execute-only" callee can be inlined into a caller without |
298 | // "+execute-only", but not vice versa. |
299 | FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures; |
300 | FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures; |
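  // After the XOR, a function that lacks a feature in InlineInverseFeatures
  // has the corresponding effective bit set, so the subset check below rejects
  // inlining a callee without such a restriction into a caller that has it.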
301 | |
302 | return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits; |
303 | } |
304 | |
305 | bool AArch64TTIImpl::areTypesABICompatible( |
306 | const Function *Caller, const Function *Callee, |
307 | const ArrayRef<Type *> &Types) const { |
308 | if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) |
309 | return false; |
310 | |
311 | // We need to ensure that argument promotion does not attempt to promote |
312 | // pointers to fixed-length vector types larger than 128 bits like |
313 | // <8 x float> (and pointers to aggregate types which have such fixed-length |
314 | // vector type members) into the values of the pointees. Such vector types |
315 | // are used for SVE VLS but there is no ABI for SVE VLS arguments and the |
316 | // backend cannot lower such value arguments. The 128-bit fixed-length SVE |
317 | // types can be safely treated as 128-bit NEON types and they cannot be |
318 | // distinguished in IR. |
319 | if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Range: Types, P: [](Type *Ty) { |
320 | auto FVTy = dyn_cast<FixedVectorType>(Val: Ty); |
321 | return FVTy && |
322 | FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128; |
323 | })) |
324 | return false; |
325 | |
326 | return true; |
327 | } |
328 | |
329 | unsigned |
330 | AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call, |
331 | unsigned DefaultCallPenalty) const { |
332 | // This function calculates a penalty for executing Call in F. |
333 | // |
334 | // There are two ways this function can be called: |
335 | // (1) F: |
336 | // call from F -> G (the call here is Call) |
337 | // |
338 | // For (1), Call.getCaller() == F, so it will always return a high cost if |
339 | // a streaming-mode change is required (thus promoting the need to inline the |
340 | // function) |
341 | // |
342 | // (2) F: |
343 | // call from F -> G (the call here is not Call) |
344 | // G: |
345 | // call from G -> H (the call here is Call) |
346 | // |
347 | // For (2), if after inlining the body of G into F the call to H requires a |
348 | // streaming-mode change, and the call to G from F would also require a |
349 | // streaming-mode change, then there is benefit to do the streaming-mode |
350 | // change only once and avoid inlining of G into F. |
351 | |
352 | SMEAttrs FAttrs(*F); |
353 | SMECallAttrs CallAttrs(Call); |
354 | |
355 | if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) { |
356 | if (F == Call.getCaller()) // (1) |
357 | return CallPenaltyChangeSM * DefaultCallPenalty; |
358 | if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2) |
359 | return InlineCallPenaltyChangeSM * DefaultCallPenalty; |
360 | } |
361 | |
362 | return DefaultCallPenalty; |
363 | } |
364 | |
365 | bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( |
366 | TargetTransformInfo::RegisterKind K) const { |
367 | assert(K != TargetTransformInfo::RGK_Scalar); |
368 | return (K == TargetTransformInfo::RGK_FixedWidthVector && |
369 | ST->isNeonAvailable()); |
370 | } |
371 | |
372 | /// Calculate the cost of materializing a 64-bit value. This helper |
373 | /// method might only calculate a fraction of a larger immediate. Therefore it |
374 | /// is valid to return a cost of ZERO. |
375 | InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) const { |
376 | // Check if the immediate can be encoded within an instruction. |
377 | if (Val == 0 || AArch64_AM::isLogicalImmediate(imm: Val, regSize: 64)) |
378 | return 0; |
379 | |
380 | if (Val < 0) |
381 | Val = ~Val; |
382 | |
383 | // Calculate how many moves we will need to materialize this constant. |
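  // For example, a value such as 0x123456789ABCDEF0, whose four 16-bit chunks
  // are all distinct and non-trivial, typically expands to one MOVZ plus three
  // MOVKs, giving a cost of 4.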
384 | SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; |
385 | AArch64_IMM::expandMOVImm(Imm: Val, BitSize: 64, Insn); |
386 | return Insn.size(); |
387 | } |
388 | |
389 | /// Calculate the cost of materializing the given constant. |
390 | InstructionCost |
391 | AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, |
392 | TTI::TargetCostKind CostKind) const { |
393 | assert(Ty->isIntegerTy()); |
394 | |
395 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
396 | if (BitSize == 0) |
397 | return ~0U; |
398 | |
399 | // Sign-extend all constants to a multiple of 64-bit. |
400 | APInt ImmVal = Imm; |
401 | if (BitSize & 0x3f) |
402 | ImmVal = Imm.sext(width: (BitSize + 63) & ~0x3fU); |
403 | |
404 | // Split the constant into 64-bit chunks and calculate the cost for each |
405 | // chunk. |
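  // For example, a 128-bit constant is costed as two independent 64-bit
  // materializations, subject to the minimum of one instruction applied below.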
406 | InstructionCost Cost = 0; |
407 | for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { |
408 | APInt Tmp = ImmVal.ashr(ShiftAmt: ShiftVal).sextOrTrunc(width: 64); |
409 | int64_t Val = Tmp.getSExtValue(); |
410 | Cost += getIntImmCost(Val); |
411 | } |
  // We need at least one instruction to materialize the constant.
413 | return std::max<InstructionCost>(a: 1, b: Cost); |
414 | } |
415 | |
416 | InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, |
417 | const APInt &Imm, Type *Ty, |
418 | TTI::TargetCostKind CostKind, |
419 | Instruction *Inst) const { |
420 | assert(Ty->isIntegerTy()); |
421 | |
422 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
423 | // There is no cost model for constants with a bit size of 0. Return TCC_Free |
424 | // here, so that constant hoisting will ignore this constant. |
425 | if (BitSize == 0) |
426 | return TTI::TCC_Free; |
427 | |
428 | unsigned ImmIdx = ~0U; |
429 | switch (Opcode) { |
430 | default: |
431 | return TTI::TCC_Free; |
432 | case Instruction::GetElementPtr: |
433 | // Always hoist the base address of a GetElementPtr. |
434 | if (Idx == 0) |
435 | return 2 * TTI::TCC_Basic; |
436 | return TTI::TCC_Free; |
437 | case Instruction::Store: |
438 | ImmIdx = 0; |
439 | break; |
440 | case Instruction::Add: |
441 | case Instruction::Sub: |
442 | case Instruction::Mul: |
443 | case Instruction::UDiv: |
444 | case Instruction::SDiv: |
445 | case Instruction::URem: |
446 | case Instruction::SRem: |
447 | case Instruction::And: |
448 | case Instruction::Or: |
449 | case Instruction::Xor: |
450 | case Instruction::ICmp: |
451 | ImmIdx = 1; |
452 | break; |
453 | // Always return TCC_Free for the shift value of a shift instruction. |
454 | case Instruction::Shl: |
455 | case Instruction::LShr: |
456 | case Instruction::AShr: |
457 | if (Idx == 1) |
458 | return TTI::TCC_Free; |
459 | break; |
460 | case Instruction::Trunc: |
461 | case Instruction::ZExt: |
462 | case Instruction::SExt: |
463 | case Instruction::IntToPtr: |
464 | case Instruction::PtrToInt: |
465 | case Instruction::BitCast: |
466 | case Instruction::PHI: |
467 | case Instruction::Call: |
468 | case Instruction::Select: |
469 | case Instruction::Ret: |
470 | case Instruction::Load: |
471 | break; |
472 | } |
473 | |
474 | if (Idx == ImmIdx) { |
475 | int NumConstants = (BitSize + 63) / 64; |
476 | InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
477 | return (Cost <= NumConstants * TTI::TCC_Basic) |
478 | ? static_cast<int>(TTI::TCC_Free) |
479 | : Cost; |
480 | } |
481 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
482 | } |
483 | |
484 | InstructionCost |
485 | AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, |
486 | const APInt &Imm, Type *Ty, |
487 | TTI::TargetCostKind CostKind) const { |
488 | assert(Ty->isIntegerTy()); |
489 | |
490 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
491 | // There is no cost model for constants with a bit size of 0. Return TCC_Free |
492 | // here, so that constant hoisting will ignore this constant. |
493 | if (BitSize == 0) |
494 | return TTI::TCC_Free; |
495 | |
496 | // Most (all?) AArch64 intrinsics do not support folding immediates into the |
497 | // selected instruction, so we compute the materialization cost for the |
498 | // immediate directly. |
499 | if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv) |
500 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
501 | |
502 | switch (IID) { |
503 | default: |
504 | return TTI::TCC_Free; |
505 | case Intrinsic::sadd_with_overflow: |
506 | case Intrinsic::uadd_with_overflow: |
507 | case Intrinsic::ssub_with_overflow: |
508 | case Intrinsic::usub_with_overflow: |
509 | case Intrinsic::smul_with_overflow: |
510 | case Intrinsic::umul_with_overflow: |
511 | if (Idx == 1) { |
512 | int NumConstants = (BitSize + 63) / 64; |
513 | InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
514 | return (Cost <= NumConstants * TTI::TCC_Basic) |
515 | ? static_cast<int>(TTI::TCC_Free) |
516 | : Cost; |
517 | } |
518 | break; |
519 | case Intrinsic::experimental_stackmap: |
520 | if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue()))) |
521 | return TTI::TCC_Free; |
522 | break; |
523 | case Intrinsic::experimental_patchpoint_void: |
524 | case Intrinsic::experimental_patchpoint: |
525 | if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue()))) |
526 | return TTI::TCC_Free; |
527 | break; |
528 | case Intrinsic::experimental_gc_statepoint: |
529 | if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue()))) |
530 | return TTI::TCC_Free; |
531 | break; |
532 | } |
533 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
534 | } |
535 | |
536 | TargetTransformInfo::PopcntSupportKind |
537 | AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const { |
538 | assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2" ); |
539 | if (TyWidth == 32 || TyWidth == 64) |
540 | return TTI::PSK_FastHardware; |
541 | // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount. |
542 | return TTI::PSK_Software; |
543 | } |
544 | |
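// Returns true for scalable vectors whose known minimum size does not fill a
// full 128-bit SVE register, e.g. <vscale x 2 x i32>.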
545 | static bool isUnpackedVectorVT(EVT VecVT) { |
546 | return VecVT.isScalableVector() && |
547 | VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock; |
548 | } |
549 | |
550 | static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) { |
551 | Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers |
552 | Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements |
553 | unsigned TotalHistCnts = 1; |
554 | |
555 | unsigned EltSize = EltTy->getScalarSizeInBits(); |
556 | // Only allow (up to 64b) integers or pointers |
557 | if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64) |
558 | return InstructionCost::getInvalid(); |
559 | |
560 | // FIXME: We should be able to generate histcnt for fixed-length vectors |
561 | // using ptrue with a specific VL. |
562 | if (VectorType *VTy = dyn_cast<VectorType>(Val: BucketPtrsTy)) { |
563 | unsigned EC = VTy->getElementCount().getKnownMinValue(); |
564 | if (!isPowerOf2_64(Value: EC) || !VTy->isScalableTy()) |
565 | return InstructionCost::getInvalid(); |
566 | |
567 | // HistCnt only supports 32b and 64b element types |
568 | unsigned LegalEltSize = EltSize <= 32 ? 32 : 64; |
569 | |
570 | if (EC == 2 || (LegalEltSize == 32 && EC == 4)) |
571 | return InstructionCost(BaseHistCntCost); |
572 | |
573 | unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize; |
574 | TotalHistCnts = EC / NaturalVectorWidth; |
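    // For example, <vscale x 8 x ptr> buckets of i32 elements legalise to two
    // <vscale x 4 x i32> HISTCNT operations, so the base cost is doubled.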
575 | } |
576 | |
577 | return InstructionCost(BaseHistCntCost * TotalHistCnts); |
578 | } |
579 | |
580 | InstructionCost |
581 | AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
582 | TTI::TargetCostKind CostKind) const { |
583 | // The code-generator is currently not able to handle scalable vectors |
584 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
585 | // it. This change will be removed when code-generation for these types is |
586 | // sufficiently reliable. |
587 | auto *RetTy = ICA.getReturnType(); |
588 | if (auto *VTy = dyn_cast<ScalableVectorType>(Val: RetTy)) |
589 | if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
590 | return InstructionCost::getInvalid(); |
591 | |
592 | switch (ICA.getID()) { |
593 | case Intrinsic::experimental_vector_histogram_add: |
594 | if (!ST->hasSVE2()) |
595 | return InstructionCost::getInvalid(); |
596 | return getHistogramCost(ICA); |
597 | case Intrinsic::umin: |
598 | case Intrinsic::umax: |
599 | case Intrinsic::smin: |
600 | case Intrinsic::smax: { |
601 | static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, |
602 | MVT::v8i16, MVT::v2i32, MVT::v4i32, |
603 | MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, |
604 | MVT::nxv2i64}; |
605 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
606 | // v2i64 types get converted to cmp+bif hence the cost of 2 |
607 | if (LT.second == MVT::v2i64) |
608 | return LT.first * 2; |
    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
610 | return LT.first; |
611 | break; |
612 | } |
613 | case Intrinsic::sadd_sat: |
614 | case Intrinsic::ssub_sat: |
615 | case Intrinsic::uadd_sat: |
616 | case Intrinsic::usub_sat: { |
617 | static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, |
618 | MVT::v8i16, MVT::v2i32, MVT::v4i32, |
619 | MVT::v2i64}; |
620 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
621 | // This is a base cost of 1 for the vadd, plus 3 extract shifts if we |
622 | // need to extend the type, as it uses shr(qadd(shl, shl)). |
623 | unsigned Instrs = |
624 | LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4; |
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
626 | return LT.first * Instrs; |
627 | break; |
628 | } |
629 | case Intrinsic::abs: { |
630 | static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, |
631 | MVT::v8i16, MVT::v2i32, MVT::v4i32, |
632 | MVT::v2i64}; |
633 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
635 | return LT.first; |
636 | break; |
637 | } |
638 | case Intrinsic::bswap: { |
639 | static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32, |
640 | MVT::v4i32, MVT::v2i64}; |
641 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
643 | LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits()) |
644 | return LT.first; |
645 | break; |
646 | } |
647 | case Intrinsic::stepvector: { |
648 | InstructionCost Cost = 1; // Cost of the `index' instruction |
649 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
650 | // Legalisation of illegal vectors involves an `index' instruction plus |
651 | // (LT.first - 1) vector adds. |
652 | if (LT.first > 1) { |
653 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: RetTy->getContext()); |
654 | InstructionCost AddCost = |
655 | getArithmeticInstrCost(Opcode: Instruction::Add, Ty: LegalVTy, CostKind); |
656 | Cost += AddCost * (LT.first - 1); |
657 | } |
658 | return Cost; |
659 | } |
660 | case Intrinsic::vector_extract: |
661 | case Intrinsic::vector_insert: { |
662 | // If both the vector and subvector types are legal types and the index |
663 | // is 0, then this should be a no-op or simple operation; return a |
664 | // relatively low cost. |
665 | |
666 | // If arguments aren't actually supplied, then we cannot determine the |
667 | // value of the index. We also want to skip predicate types. |
668 | if (ICA.getArgs().size() != ICA.getArgTypes().size() || |
669 | ICA.getReturnType()->getScalarType()->isIntegerTy(Bitwidth: 1)) |
670 | break; |
671 | |
672 | LLVMContext &C = RetTy->getContext(); |
673 | EVT VecVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]); |
    bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
675 | EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, Ty: RetTy) |
676 | : getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[1]); |
677 | // Skip this if either the vector or subvector types are unpacked |
678 | // SVE types; they may get lowered to stack stores and loads. |
679 | if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(VecVT: SubVecVT)) |
680 | break; |
681 | |
682 | TargetLoweringBase::LegalizeKind SubVecLK = |
683 | getTLI()->getTypeConversion(Context&: C, VT: SubVecVT); |
684 | TargetLoweringBase::LegalizeKind VecLK = |
685 | getTLI()->getTypeConversion(Context&: C, VT: VecVT); |
686 | const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2]; |
687 | const ConstantInt *CIdx = cast<ConstantInt>(Val: Idx); |
688 | if (SubVecLK.first == TargetLoweringBase::TypeLegal && |
689 | VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero()) |
690 | return TTI::TCC_Free; |
691 | break; |
692 | } |
693 | case Intrinsic::bitreverse: { |
694 | static const CostTblEntry BitreverseTbl[] = { |
695 | {.ISD: Intrinsic::bitreverse, .Type: MVT::i32, .Cost: 1}, |
696 | {.ISD: Intrinsic::bitreverse, .Type: MVT::i64, .Cost: 1}, |
697 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v8i8, .Cost: 1}, |
698 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v16i8, .Cost: 1}, |
699 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v4i16, .Cost: 2}, |
700 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v8i16, .Cost: 2}, |
701 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v2i32, .Cost: 2}, |
702 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v4i32, .Cost: 2}, |
703 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v1i64, .Cost: 2}, |
704 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v2i64, .Cost: 2}, |
705 | }; |
706 | const auto LegalisationCost = getTypeLegalizationCost(Ty: RetTy); |
707 | const auto *Entry = |
708 | CostTableLookup(Table: BitreverseTbl, ISD: ICA.getID(), Ty: LegalisationCost.second); |
709 | if (Entry) { |
      // The cost model uses the legal type (i32) that i8 and i16 are promoted
      // to, plus 1 so that we match the actual lowering cost.
712 | if (TLI->getValueType(DL, Ty: RetTy, AllowUnknown: true) == MVT::i8 || |
713 | TLI->getValueType(DL, Ty: RetTy, AllowUnknown: true) == MVT::i16) |
714 | return LegalisationCost.first * Entry->Cost + 1; |
715 | |
716 | return LegalisationCost.first * Entry->Cost; |
717 | } |
718 | break; |
719 | } |
720 | case Intrinsic::ctpop: { |
721 | if (!ST->hasNEON()) { |
722 | // 32-bit or 64-bit ctpop without NEON is 12 instructions. |
723 | return getTypeLegalizationCost(Ty: RetTy).first * 12; |
724 | } |
725 | static const CostTblEntry CtpopCostTbl[] = { |
726 | {.ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: 4}, |
727 | {.ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: 3}, |
728 | {.ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: 2}, |
729 | {.ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: 1}, |
730 | {.ISD: ISD::CTPOP, .Type: MVT::i64, .Cost: 4}, |
731 | {.ISD: ISD::CTPOP, .Type: MVT::v2i32, .Cost: 3}, |
732 | {.ISD: ISD::CTPOP, .Type: MVT::v4i16, .Cost: 2}, |
733 | {.ISD: ISD::CTPOP, .Type: MVT::v8i8, .Cost: 1}, |
734 | {.ISD: ISD::CTPOP, .Type: MVT::i32, .Cost: 5}, |
735 | }; |
736 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
737 | MVT MTy = LT.second; |
738 | if (const auto *Entry = CostTableLookup(Table: CtpopCostTbl, ISD: ISD::CTPOP, Ty: MTy)) { |
739 | // Extra cost of +1 when illegal vector types are legalized by promoting |
740 | // the integer type. |
      int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
                                            RetTy->getScalarSizeInBits()
                          ? 1
                          : 0;
745 | return LT.first * Entry->Cost + ExtraCost; |
746 | } |
747 | break; |
748 | } |
749 | case Intrinsic::sadd_with_overflow: |
750 | case Intrinsic::uadd_with_overflow: |
751 | case Intrinsic::ssub_with_overflow: |
752 | case Intrinsic::usub_with_overflow: |
753 | case Intrinsic::smul_with_overflow: |
754 | case Intrinsic::umul_with_overflow: { |
755 | static const CostTblEntry WithOverflowCostTbl[] = { |
756 | {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i8, .Cost: 3}, |
757 | {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i8, .Cost: 3}, |
758 | {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i16, .Cost: 3}, |
759 | {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i16, .Cost: 3}, |
760 | {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i32, .Cost: 1}, |
761 | {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i32, .Cost: 1}, |
762 | {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i64, .Cost: 1}, |
763 | {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i64, .Cost: 1}, |
764 | {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i8, .Cost: 3}, |
765 | {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i8, .Cost: 3}, |
766 | {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i16, .Cost: 3}, |
767 | {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i16, .Cost: 3}, |
768 | {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i32, .Cost: 1}, |
769 | {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i32, .Cost: 1}, |
770 | {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i64, .Cost: 1}, |
771 | {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i64, .Cost: 1}, |
772 | {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i8, .Cost: 5}, |
773 | {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i8, .Cost: 4}, |
774 | {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i16, .Cost: 5}, |
775 | {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i16, .Cost: 4}, |
776 | {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i32, .Cost: 2}, // eg umull;tst |
777 | {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i32, .Cost: 2}, // eg umull;cmp sxtw |
778 | {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i64, .Cost: 3}, // eg mul;smulh;cmp |
779 | {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i64, .Cost: 3}, // eg mul;umulh;cmp asr |
780 | }; |
781 | EVT MTy = TLI->getValueType(DL, Ty: RetTy->getContainedType(i: 0), AllowUnknown: true); |
782 | if (MTy.isSimple()) |
783 | if (const auto *Entry = CostTableLookup(Table: WithOverflowCostTbl, ISD: ICA.getID(), |
784 | Ty: MTy.getSimpleVT())) |
785 | return Entry->Cost; |
786 | break; |
787 | } |
788 | case Intrinsic::fptosi_sat: |
789 | case Intrinsic::fptoui_sat: { |
790 | if (ICA.getArgTypes().empty()) |
791 | break; |
792 | bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; |
793 | auto LT = getTypeLegalizationCost(Ty: ICA.getArgTypes()[0]); |
794 | EVT MTy = TLI->getValueType(DL, Ty: RetTy); |
795 | // Check for the legal types, which are where the size of the input and the |
796 | // output are the same, or we are using cvt f64->i32 or f32->i64. |
797 | if ((LT.second == MVT::f32 || LT.second == MVT::f64 || |
798 | LT.second == MVT::v2f32 || LT.second == MVT::v4f32 || |
799 | LT.second == MVT::v2f64)) { |
800 | if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() || |
801 | (LT.second == MVT::f64 && MTy == MVT::i32) || |
802 | (LT.second == MVT::f32 && MTy == MVT::i64))) |
803 | return LT.first; |
804 | // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2 |
805 | if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() && |
806 | MTy.getScalarSizeInBits() == 64) |
807 | return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2); |
808 | } |
809 | // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to |
810 | // f32. |
811 | if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) |
812 | return LT.first + getIntrinsicInstrCost( |
813 | ICA: {ICA.getID(), |
814 | RetTy, |
815 | {ICA.getArgTypes()[0]->getWithNewType( |
816 | EltTy: Type::getFloatTy(C&: RetTy->getContext()))}}, |
817 | CostKind); |
818 | if ((LT.second == MVT::f16 && MTy == MVT::i32) || |
819 | (LT.second == MVT::f16 && MTy == MVT::i64) || |
820 | ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) && |
821 | (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))) |
822 | return LT.first; |
823 | // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2 |
824 | if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() && |
825 | MTy.getScalarSizeInBits() == 32) |
826 | return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2); |
    // Extending vector types like v8f16->v8i64. These currently scalarize but
    // the codegen could be better.
829 | if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() && |
830 | MTy.getScalarSizeInBits() == 64) |
831 | return MTy.getVectorNumElements() * 3; |
832 | |
833 | // If we can we use a legal convert followed by a min+max |
834 | if ((LT.second.getScalarType() == MVT::f32 || |
835 | LT.second.getScalarType() == MVT::f64 || |
836 | LT.second.getScalarType() == MVT::f16) && |
837 | LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) { |
838 | Type *LegalTy = |
839 | Type::getIntNTy(C&: RetTy->getContext(), N: LT.second.getScalarSizeInBits()); |
840 | if (LT.second.isVector()) |
841 | LegalTy = VectorType::get(ElementType: LegalTy, EC: LT.second.getVectorElementCount()); |
842 | InstructionCost Cost = 1; |
843 | IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin, |
844 | LegalTy, {LegalTy, LegalTy}); |
845 | Cost += getIntrinsicInstrCost(ICA: Attrs1, CostKind); |
846 | IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax, |
847 | LegalTy, {LegalTy, LegalTy}); |
848 | Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind); |
849 | return LT.first * Cost + |
850 | ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0 |
851 | : 1); |
852 | } |
853 | // Otherwise we need to follow the default expansion that clamps the value |
854 | // using a float min/max with a fcmp+sel for nan handling when signed. |
855 | Type *FPTy = ICA.getArgTypes()[0]->getScalarType(); |
856 | RetTy = RetTy->getScalarType(); |
857 | if (LT.second.isVector()) { |
858 | FPTy = VectorType::get(ElementType: FPTy, EC: LT.second.getVectorElementCount()); |
859 | RetTy = VectorType::get(ElementType: RetTy, EC: LT.second.getVectorElementCount()); |
860 | } |
861 | IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy}); |
862 | InstructionCost Cost = getIntrinsicInstrCost(ICA: Attrs1, CostKind); |
863 | IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy}); |
864 | Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind); |
865 | Cost += |
866 | getCastInstrCost(Opcode: IsSigned ? Instruction::FPToSI : Instruction::FPToUI, |
867 | Dst: RetTy, Src: FPTy, CCH: TTI::CastContextHint::None, CostKind); |
868 | if (IsSigned) { |
869 | Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1); |
870 | Cost += getCmpSelInstrCost(Opcode: BinaryOperator::FCmp, ValTy: FPTy, CondTy, |
871 | VecPred: CmpInst::FCMP_UNO, CostKind); |
872 | Cost += getCmpSelInstrCost(Opcode: BinaryOperator::Select, ValTy: RetTy, CondTy, |
873 | VecPred: CmpInst::FCMP_UNO, CostKind); |
874 | } |
875 | return LT.first * Cost; |
876 | } |
877 | case Intrinsic::fshl: |
878 | case Intrinsic::fshr: { |
879 | if (ICA.getArgs().empty()) |
880 | break; |
881 | |
882 | // TODO: Add handling for fshl where third argument is not a constant. |
883 | const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(V: ICA.getArgs()[2]); |
884 | if (!OpInfoZ.isConstant()) |
885 | break; |
886 | |
887 | const auto LegalisationCost = getTypeLegalizationCost(Ty: RetTy); |
888 | if (OpInfoZ.isUniform()) { |
889 | static const CostTblEntry FshlTbl[] = { |
890 | {.ISD: Intrinsic::fshl, .Type: MVT::v4i32, .Cost: 2}, // shl + usra |
891 | {.ISD: Intrinsic::fshl, .Type: MVT::v2i64, .Cost: 2}, {.ISD: Intrinsic::fshl, .Type: MVT::v16i8, .Cost: 2}, |
892 | {.ISD: Intrinsic::fshl, .Type: MVT::v8i16, .Cost: 2}, {.ISD: Intrinsic::fshl, .Type: MVT::v2i32, .Cost: 2}, |
893 | {.ISD: Intrinsic::fshl, .Type: MVT::v8i8, .Cost: 2}, {.ISD: Intrinsic::fshl, .Type: MVT::v4i16, .Cost: 2}}; |
894 | // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl |
895 | // to avoid having to duplicate the costs. |
896 | const auto *Entry = |
897 | CostTableLookup(Table: FshlTbl, ISD: Intrinsic::fshl, Ty: LegalisationCost.second); |
898 | if (Entry) |
899 | return LegalisationCost.first * Entry->Cost; |
900 | } |
901 | |
902 | auto TyL = getTypeLegalizationCost(Ty: RetTy); |
903 | if (!RetTy->isIntegerTy()) |
904 | break; |
905 | |
906 | // Estimate cost manually, as types like i8 and i16 will get promoted to |
907 | // i32 and CostTableLookup will ignore the extra conversion cost. |
908 | bool HigherCost = (RetTy->getScalarSizeInBits() != 32 && |
909 | RetTy->getScalarSizeInBits() < 64) || |
910 | (RetTy->getScalarSizeInBits() % 64 != 0); |
    unsigned ExtraCost = HigherCost ? 1 : 0;
912 | if (RetTy->getScalarSizeInBits() == 32 || |
913 | RetTy->getScalarSizeInBits() == 64) |
914 | ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single |
915 | // extr instruction. |
916 | else if (HigherCost) |
917 | ExtraCost = 1; |
918 | else |
919 | break; |
920 | return TyL.first + ExtraCost; |
921 | } |
922 | case Intrinsic::get_active_lane_mask: { |
923 | auto *RetTy = dyn_cast<FixedVectorType>(Val: ICA.getReturnType()); |
924 | if (RetTy) { |
925 | EVT RetVT = getTLI()->getValueType(DL, Ty: RetTy); |
926 | EVT OpVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]); |
927 | if (!getTLI()->shouldExpandGetActiveLaneMask(VT: RetVT, OpVT) && |
928 | !getTLI()->isTypeLegal(VT: RetVT)) { |
929 | // We don't have enough context at this point to determine if the mask |
930 | // is going to be kept live after the block, which will force the vXi1 |
931 | // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32. |
932 | // For now, we just assume the vectorizer created this intrinsic and |
933 | // the result will be the input for a PHI. In this case the cost will |
934 | // be extremely high for fixed-width vectors. |
935 | // NOTE: getScalarizationOverhead returns a cost that's far too |
936 | // pessimistic for the actual generated codegen. In reality there are |
937 | // two instructions generated per lane. |
938 | return RetTy->getNumElements() * 2; |
939 | } |
940 | } |
941 | break; |
942 | } |
943 | case Intrinsic::experimental_vector_match: { |
944 | auto *NeedleTy = cast<FixedVectorType>(Val: ICA.getArgTypes()[1]); |
945 | EVT SearchVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]); |
946 | unsigned SearchSize = NeedleTy->getNumElements(); |
947 | if (!getTLI()->shouldExpandVectorMatch(VT: SearchVT, SearchSize)) { |
948 | // Base cost for MATCH instructions. At least on the Neoverse V2 and |
949 | // Neoverse V3, these are cheap operations with the same latency as a |
950 | // vector ADD. In most cases, however, we also need to do an extra DUP. |
      // For fixed-length vectors we currently need an extra five to six
952 | // instructions besides the MATCH. |
953 | InstructionCost Cost = 4; |
954 | if (isa<FixedVectorType>(Val: RetTy)) |
955 | Cost += 10; |
956 | return Cost; |
957 | } |
958 | break; |
959 | } |
960 | case Intrinsic::experimental_cttz_elts: { |
961 | EVT ArgVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]); |
962 | if (!getTLI()->shouldExpandCttzElements(VT: ArgVT)) { |
963 | // This will consist of a SVE brkb and a cntp instruction. These |
964 | // typically have the same latency and half the throughput as a vector |
965 | // add instruction. |
966 | return 4; |
967 | } |
968 | break; |
969 | } |
970 | default: |
971 | break; |
972 | } |
973 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
974 | } |
975 | |
/// The function removes redundant reinterpret casts in the presence of
/// control flow.
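/// e.g. a PHI whose incoming values are all aarch64.sve.convert.to.svbool
/// casts of the type required by the user intrinsic is rewritten as a PHI of
/// the original operands, allowing both the casts and the old PHI to be
/// removed.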
978 | static std::optional<Instruction *> processPhiNode(InstCombiner &IC, |
979 | IntrinsicInst &II) { |
980 | SmallVector<Instruction *, 32> Worklist; |
981 | auto RequiredType = II.getType(); |
982 | |
983 | auto *PN = dyn_cast<PHINode>(Val: II.getArgOperand(i: 0)); |
984 | assert(PN && "Expected Phi Node!" ); |
985 | |
986 | // Don't create a new Phi unless we can remove the old one. |
987 | if (!PN->hasOneUse()) |
988 | return std::nullopt; |
989 | |
990 | for (Value *IncValPhi : PN->incoming_values()) { |
991 | auto *Reinterpret = dyn_cast<IntrinsicInst>(Val: IncValPhi); |
992 | if (!Reinterpret || |
993 | Reinterpret->getIntrinsicID() != |
994 | Intrinsic::aarch64_sve_convert_to_svbool || |
995 | RequiredType != Reinterpret->getArgOperand(i: 0)->getType()) |
996 | return std::nullopt; |
997 | } |
998 | |
999 | // Create the new Phi |
1000 | IC.Builder.SetInsertPoint(PN); |
1001 | PHINode *NPN = IC.Builder.CreatePHI(Ty: RequiredType, NumReservedValues: PN->getNumIncomingValues()); |
1002 | Worklist.push_back(Elt: PN); |
1003 | |
1004 | for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { |
1005 | auto *Reinterpret = cast<Instruction>(Val: PN->getIncomingValue(i: I)); |
1006 | NPN->addIncoming(V: Reinterpret->getOperand(i: 0), BB: PN->getIncomingBlock(i: I)); |
1007 | Worklist.push_back(Elt: Reinterpret); |
1008 | } |
1009 | |
1010 | // Cleanup Phi Node and reinterprets |
1011 | return IC.replaceInstUsesWith(I&: II, V: NPN); |
1012 | } |
1013 | |
1014 | // A collection of properties common to SVE intrinsics that allow for combines |
1015 | // to be written without needing to know the specific intrinsic. |
1016 | struct SVEIntrinsicInfo { |
1017 | // |
1018 | // Helper routines for common intrinsic definitions. |
1019 | // |
1020 | |
1021 | // e.g. llvm.aarch64.sve.add pg, op1, op2 |
1022 | // with IID ==> llvm.aarch64.sve.add_u |
1023 | static SVEIntrinsicInfo |
1024 | defaultMergingOp(Intrinsic::ID IID = Intrinsic::not_intrinsic) { |
1025 | return SVEIntrinsicInfo() |
1026 | .setGoverningPredicateOperandIdx(0) |
1027 | .setOperandIdxInactiveLanesTakenFrom(1) |
1028 | .setMatchingUndefIntrinsic(IID); |
1029 | } |
1030 | |
1031 | // e.g. llvm.aarch64.sve.neg inactive, pg, op |
1032 | static SVEIntrinsicInfo defaultMergingUnaryOp() { |
1033 | return SVEIntrinsicInfo() |
1034 | .setGoverningPredicateOperandIdx(1) |
1035 | .setOperandIdxInactiveLanesTakenFrom(0) |
1036 | .setOperandIdxWithNoActiveLanes(0); |
1037 | } |
1038 | |
1039 | // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op |
1040 | static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp() { |
1041 | return SVEIntrinsicInfo() |
1042 | .setGoverningPredicateOperandIdx(1) |
1043 | .setOperandIdxInactiveLanesTakenFrom(0); |
1044 | } |
1045 | |
1046 | // e.g. llvm.aarch64.sve.add_u pg, op1, op2 |
1047 | static SVEIntrinsicInfo defaultUndefOp() { |
1048 | return SVEIntrinsicInfo() |
1049 | .setGoverningPredicateOperandIdx(0) |
1050 | .setInactiveLanesAreNotDefined(); |
1051 | } |
1052 | |
1053 | // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0) |
1054 | // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1) |
1055 | static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) { |
1056 | return SVEIntrinsicInfo() |
1057 | .setGoverningPredicateOperandIdx(GPIndex) |
1058 | .setInactiveLanesAreUnused(); |
1059 | } |
1060 | |
1061 | // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2 |
1062 | // llvm.aarch64.sve.ld1 pg, ptr |
1063 | static SVEIntrinsicInfo defaultZeroingOp() { |
1064 | return SVEIntrinsicInfo() |
1065 | .setGoverningPredicateOperandIdx(0) |
1066 | .setInactiveLanesAreUnused() |
1067 | .setResultIsZeroInitialized(); |
1068 | } |
1069 | |
  // All properties relate to predication and thus having a governing predicate
  // is the minimum requirement to say there is intrinsic info to act on.
1072 | explicit operator bool() const { return hasGoverningPredicate(); } |
1073 | |
1074 | // |
1075 | // Properties relating to the governing predicate. |
1076 | // |
1077 | |
1078 | bool hasGoverningPredicate() const { |
1079 | return GoverningPredicateIdx != std::numeric_limits<unsigned>::max(); |
1080 | } |
1081 | |
1082 | unsigned getGoverningPredicateOperandIdx() const { |
    assert(hasGoverningPredicate() && "Property not set!");
1084 | return GoverningPredicateIdx; |
1085 | } |
1086 | |
1087 | SVEIntrinsicInfo &setGoverningPredicateOperandIdx(unsigned Index) { |
1088 | assert(!hasGoverningPredicate() && "Cannot set property twice!" ); |
1089 | GoverningPredicateIdx = Index; |
1090 | return *this; |
1091 | } |
1092 | |
1093 | // |
1094 | // Properties relating to operations the intrinsic could be transformed into. |
1095 | // NOTE: This does not mean such a transformation is always possible, but the |
1096 | // knowledge makes it possible to reuse existing optimisations without needing |
1097 | // to embed specific handling for each intrinsic. For example, instruction |
1098 | // simplification can be used to optimise an intrinsic's active lanes. |
1099 | // |
1100 | |
1101 | bool hasMatchingUndefIntrinsic() const { |
1102 | return UndefIntrinsic != Intrinsic::not_intrinsic; |
1103 | } |
1104 | |
1105 | Intrinsic::ID getMatchingUndefIntrinsic() const { |
    assert(hasMatchingUndefIntrinsic() && "Property not set!");
1107 | return UndefIntrinsic; |
1108 | } |
1109 | |
1110 | SVEIntrinsicInfo &setMatchingUndefIntrinsic(Intrinsic::ID IID) { |
1111 | assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!" ); |
1112 | UndefIntrinsic = IID; |
1113 | return *this; |
1114 | } |
1115 | |
1116 | bool hasMatchingIROpode() const { return IROpcode != 0; } |
1117 | |
1118 | unsigned getMatchingIROpode() const { |
    assert(hasMatchingIROpode() && "Property not set!");
1120 | return IROpcode; |
1121 | } |
1122 | |
1123 | SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) { |
1124 | assert(!hasMatchingIROpode() && "Cannot set property twice!" ); |
1125 | IROpcode = Opcode; |
1126 | return *this; |
1127 | } |
1128 | |
1129 | // |
1130 | // Properties relating to the result of inactive lanes. |
1131 | // |
1132 | |
1133 | bool inactiveLanesTakenFromOperand() const { |
1134 | return ResultLanes == InactiveLanesTakenFromOperand; |
1135 | } |
1136 | |
1137 | unsigned getOperandIdxInactiveLanesTakenFrom() const { |
    assert(inactiveLanesTakenFromOperand() && "Property not set!");
1139 | return OperandIdxForInactiveLanes; |
1140 | } |
1141 | |
1142 | SVEIntrinsicInfo &setOperandIdxInactiveLanesTakenFrom(unsigned Index) { |
1143 | assert(ResultLanes == Uninitialized && "Cannot set property twice!" ); |
1144 | ResultLanes = InactiveLanesTakenFromOperand; |
1145 | OperandIdxForInactiveLanes = Index; |
1146 | return *this; |
1147 | } |
1148 | |
1149 | bool inactiveLanesAreNotDefined() const { |
1150 | return ResultLanes == InactiveLanesAreNotDefined; |
1151 | } |
1152 | |
1153 | SVEIntrinsicInfo &setInactiveLanesAreNotDefined() { |
1154 | assert(ResultLanes == Uninitialized && "Cannot set property twice!" ); |
1155 | ResultLanes = InactiveLanesAreNotDefined; |
1156 | return *this; |
1157 | } |
1158 | |
1159 | bool inactiveLanesAreUnused() const { |
1160 | return ResultLanes == InactiveLanesAreUnused; |
1161 | } |
1162 | |
1163 | SVEIntrinsicInfo &setInactiveLanesAreUnused() { |
1164 | assert(ResultLanes == Uninitialized && "Cannot set property twice!" ); |
1165 | ResultLanes = InactiveLanesAreUnused; |
1166 | return *this; |
1167 | } |
1168 | |
1169 | // NOTE: Whilst not limited to only inactive lanes, the common use case is: |
1170 | // inactiveLanesAreZeroed = |
1171 | // resultIsZeroInitialized() && inactiveLanesAreUnused() |
1172 | bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; } |
1173 | |
1174 | SVEIntrinsicInfo &setResultIsZeroInitialized() { |
1175 | ResultIsZeroInitialized = true; |
1176 | return *this; |
1177 | } |
1178 | |
1179 | // |
1180 | // The first operand of unary merging operations is typically only used to |
1181 | // set the result for inactive lanes. Knowing this allows us to deadcode the |
1182 | // operand when we can prove there are no inactive lanes. |
1183 | // |
1184 | |
1185 | bool hasOperandWithNoActiveLanes() const { |
1186 | return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max(); |
1187 | } |
1188 | |
1189 | unsigned getOperandIdxWithNoActiveLanes() const { |
    assert(hasOperandWithNoActiveLanes() && "Property not set!");
1191 | return OperandIdxWithNoActiveLanes; |
1192 | } |
1193 | |
1194 | SVEIntrinsicInfo &setOperandIdxWithNoActiveLanes(unsigned Index) { |
1195 | assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!" ); |
1196 | OperandIdxWithNoActiveLanes = Index; |
1197 | return *this; |
1198 | } |
1199 | |
1200 | private: |
1201 | unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max(); |
1202 | |
1203 | Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic; |
1204 | unsigned IROpcode = 0; |
1205 | |
1206 | enum PredicationStyle { |
1207 | Uninitialized, |
1208 | InactiveLanesTakenFromOperand, |
1209 | InactiveLanesAreNotDefined, |
1210 | InactiveLanesAreUnused |
1211 | } ResultLanes = Uninitialized; |
1212 | |
1213 | bool ResultIsZeroInitialized = false; |
1214 | unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max(); |
1215 | unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max(); |
1216 | }; |
1217 | |
1218 | static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) { |
1219 | // Some SVE intrinsics do not use scalable vector types, but since they are |
1220 | // not relevant from an SVEIntrinsicInfo perspective, they are also ignored. |
1221 | if (!isa<ScalableVectorType>(Val: II.getType()) && |
1222 | all_of(Range: II.args(), P: [&](const Value *V) { |
1223 | return !isa<ScalableVectorType>(Val: V->getType()); |
1224 | })) |
1225 | return SVEIntrinsicInfo(); |
1226 | |
1227 | Intrinsic::ID IID = II.getIntrinsicID(); |
1228 | switch (IID) { |
1229 | default: |
1230 | break; |
1231 | case Intrinsic::aarch64_sve_fcvt_bf16f32_v2: |
1232 | case Intrinsic::aarch64_sve_fcvt_f16f32: |
1233 | case Intrinsic::aarch64_sve_fcvt_f16f64: |
1234 | case Intrinsic::aarch64_sve_fcvt_f32f16: |
1235 | case Intrinsic::aarch64_sve_fcvt_f32f64: |
1236 | case Intrinsic::aarch64_sve_fcvt_f64f16: |
1237 | case Intrinsic::aarch64_sve_fcvt_f64f32: |
1238 | case Intrinsic::aarch64_sve_fcvtlt_f32f16: |
1239 | case Intrinsic::aarch64_sve_fcvtlt_f64f32: |
1240 | case Intrinsic::aarch64_sve_fcvtx_f32f64: |
1241 | case Intrinsic::aarch64_sve_fcvtzs: |
1242 | case Intrinsic::aarch64_sve_fcvtzs_i32f16: |
1243 | case Intrinsic::aarch64_sve_fcvtzs_i32f64: |
1244 | case Intrinsic::aarch64_sve_fcvtzs_i64f16: |
1245 | case Intrinsic::aarch64_sve_fcvtzs_i64f32: |
1246 | case Intrinsic::aarch64_sve_fcvtzu: |
1247 | case Intrinsic::aarch64_sve_fcvtzu_i32f16: |
1248 | case Intrinsic::aarch64_sve_fcvtzu_i32f64: |
1249 | case Intrinsic::aarch64_sve_fcvtzu_i64f16: |
1250 | case Intrinsic::aarch64_sve_fcvtzu_i64f32: |
1251 | case Intrinsic::aarch64_sve_scvtf: |
1252 | case Intrinsic::aarch64_sve_scvtf_f16i32: |
1253 | case Intrinsic::aarch64_sve_scvtf_f16i64: |
1254 | case Intrinsic::aarch64_sve_scvtf_f32i64: |
1255 | case Intrinsic::aarch64_sve_scvtf_f64i32: |
1256 | case Intrinsic::aarch64_sve_ucvtf: |
1257 | case Intrinsic::aarch64_sve_ucvtf_f16i32: |
1258 | case Intrinsic::aarch64_sve_ucvtf_f16i64: |
1259 | case Intrinsic::aarch64_sve_ucvtf_f32i64: |
1260 | case Intrinsic::aarch64_sve_ucvtf_f64i32: |
1261 | return SVEIntrinsicInfo::defaultMergingUnaryOp(); |
1262 | |
1263 | case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2: |
1264 | case Intrinsic::aarch64_sve_fcvtnt_f16f32: |
1265 | case Intrinsic::aarch64_sve_fcvtnt_f32f64: |
1266 | case Intrinsic::aarch64_sve_fcvtxnt_f32f64: |
1267 | return SVEIntrinsicInfo::defaultMergingUnaryNarrowingTopOp(); |
1268 | |
1269 | case Intrinsic::aarch64_sve_fabd: |
1270 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fabd_u); |
1271 | case Intrinsic::aarch64_sve_fadd: |
1272 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fadd_u) |
1273 | .setMatchingIROpcode(Instruction::FAdd); |
1274 | case Intrinsic::aarch64_sve_fdiv: |
1275 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fdiv_u) |
1276 | .setMatchingIROpcode(Instruction::FDiv); |
1277 | case Intrinsic::aarch64_sve_fmax: |
1278 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmax_u); |
1279 | case Intrinsic::aarch64_sve_fmaxnm: |
1280 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmaxnm_u); |
1281 | case Intrinsic::aarch64_sve_fmin: |
1282 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmin_u); |
1283 | case Intrinsic::aarch64_sve_fminnm: |
1284 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fminnm_u); |
1285 | case Intrinsic::aarch64_sve_fmla: |
1286 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmla_u); |
1287 | case Intrinsic::aarch64_sve_fmls: |
1288 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmls_u); |
1289 | case Intrinsic::aarch64_sve_fmul: |
1290 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmul_u) |
1291 | .setMatchingIROpcode(Instruction::FMul); |
1292 | case Intrinsic::aarch64_sve_fmulx: |
1293 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmulx_u); |
1294 | case Intrinsic::aarch64_sve_fnmla: |
1295 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fnmla_u); |
1296 | case Intrinsic::aarch64_sve_fnmls: |
1297 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fnmls_u); |
1298 | case Intrinsic::aarch64_sve_fsub: |
1299 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fsub_u) |
1300 | .setMatchingIROpcode(Instruction::FSub); |
1301 | case Intrinsic::aarch64_sve_add: |
1302 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_add_u) |
1303 | .setMatchingIROpcode(Instruction::Add); |
1304 | case Intrinsic::aarch64_sve_mla: |
1305 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mla_u); |
1306 | case Intrinsic::aarch64_sve_mls: |
1307 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mls_u); |
1308 | case Intrinsic::aarch64_sve_mul: |
1309 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mul_u) |
1310 | .setMatchingIROpcode(Instruction::Mul); |
1311 | case Intrinsic::aarch64_sve_sabd: |
1312 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sabd_u); |
1313 | case Intrinsic::aarch64_sve_sdiv: |
1314 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sdiv_u) |
1315 | .setMatchingIROpcode(Instruction::SDiv); |
1316 | case Intrinsic::aarch64_sve_smax: |
1317 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smax_u); |
1318 | case Intrinsic::aarch64_sve_smin: |
1319 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smin_u); |
1320 | case Intrinsic::aarch64_sve_smulh: |
1321 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smulh_u); |
1322 | case Intrinsic::aarch64_sve_sub: |
1323 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sub_u) |
1324 | .setMatchingIROpcode(Instruction::Sub); |
1325 | case Intrinsic::aarch64_sve_uabd: |
1326 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uabd_u); |
1327 | case Intrinsic::aarch64_sve_udiv: |
1328 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_udiv_u) |
1329 | .setMatchingIROpcode(Instruction::UDiv); |
1330 | case Intrinsic::aarch64_sve_umax: |
1331 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umax_u); |
1332 | case Intrinsic::aarch64_sve_umin: |
1333 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umin_u); |
1334 | case Intrinsic::aarch64_sve_umulh: |
1335 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umulh_u); |
1336 | case Intrinsic::aarch64_sve_asr: |
1337 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_asr_u) |
1338 | .setMatchingIROpcode(Instruction::AShr); |
1339 | case Intrinsic::aarch64_sve_lsl: |
1340 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_lsl_u) |
1341 | .setMatchingIROpcode(Instruction::Shl); |
1342 | case Intrinsic::aarch64_sve_lsr: |
1343 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_lsr_u) |
1344 | .setMatchingIROpcode(Instruction::LShr); |
1345 | case Intrinsic::aarch64_sve_and: |
1346 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_and_u) |
1347 | .setMatchingIROpcode(Instruction::And); |
1348 | case Intrinsic::aarch64_sve_bic: |
1349 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_bic_u); |
1350 | case Intrinsic::aarch64_sve_eor: |
1351 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_eor_u) |
1352 | .setMatchingIROpcode(Instruction::Xor); |
1353 | case Intrinsic::aarch64_sve_orr: |
1354 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_orr_u) |
1355 | .setMatchingIROpcode(Instruction::Or); |
1356 | case Intrinsic::aarch64_sve_sqsub: |
1357 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sqsub_u); |
1358 | case Intrinsic::aarch64_sve_uqsub: |
1359 | return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uqsub_u); |
1360 | |
1361 | case Intrinsic::aarch64_sve_add_u: |
1362 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
1363 | Instruction::Add); |
1364 | case Intrinsic::aarch64_sve_and_u: |
1365 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
1366 | Instruction::And); |
1367 | case Intrinsic::aarch64_sve_asr_u: |
1368 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
1369 | Instruction::AShr); |
1370 | case Intrinsic::aarch64_sve_eor_u: |
1371 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
1372 | Instruction::Xor); |
1373 | case Intrinsic::aarch64_sve_fadd_u: |
1374 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
1375 | Instruction::FAdd); |
1376 | case Intrinsic::aarch64_sve_fdiv_u: |
1377 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
1378 | Instruction::FDiv); |
1379 | case Intrinsic::aarch64_sve_fmul_u: |
1380 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
1381 | Instruction::FMul); |
1382 | case Intrinsic::aarch64_sve_fsub_u: |
1383 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
1384 | Instruction::FSub); |
1385 | case Intrinsic::aarch64_sve_lsl_u: |
1386 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
1387 | Instruction::Shl); |
1388 | case Intrinsic::aarch64_sve_lsr_u: |
1389 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
1390 | Instruction::LShr); |
1391 | case Intrinsic::aarch64_sve_mul_u: |
1392 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
1393 | Instruction::Mul); |
1394 | case Intrinsic::aarch64_sve_orr_u: |
1395 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
1396 | Instruction::Or); |
1397 | case Intrinsic::aarch64_sve_sdiv_u: |
1398 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
1399 | Instruction::SDiv); |
1400 | case Intrinsic::aarch64_sve_sub_u: |
1401 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
1402 | Instruction::Sub); |
1403 | case Intrinsic::aarch64_sve_udiv_u: |
1404 | return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( |
1405 | Instruction::UDiv); |
1406 | |
1407 | case Intrinsic::aarch64_sve_addqv: |
1408 | case Intrinsic::aarch64_sve_and_z: |
1409 | case Intrinsic::aarch64_sve_bic_z: |
1410 | case Intrinsic::aarch64_sve_brka_z: |
1411 | case Intrinsic::aarch64_sve_brkb_z: |
1412 | case Intrinsic::aarch64_sve_brkn_z: |
1413 | case Intrinsic::aarch64_sve_brkpa_z: |
1414 | case Intrinsic::aarch64_sve_brkpb_z: |
1415 | case Intrinsic::aarch64_sve_cntp: |
1416 | case Intrinsic::aarch64_sve_compact: |
1417 | case Intrinsic::aarch64_sve_eor_z: |
1418 | case Intrinsic::aarch64_sve_eorv: |
1419 | case Intrinsic::aarch64_sve_eorqv: |
1420 | case Intrinsic::aarch64_sve_nand_z: |
1421 | case Intrinsic::aarch64_sve_nor_z: |
1422 | case Intrinsic::aarch64_sve_orn_z: |
1423 | case Intrinsic::aarch64_sve_orr_z: |
1424 | case Intrinsic::aarch64_sve_orv: |
1425 | case Intrinsic::aarch64_sve_orqv: |
1426 | case Intrinsic::aarch64_sve_pnext: |
1427 | case Intrinsic::aarch64_sve_rdffr_z: |
1428 | case Intrinsic::aarch64_sve_saddv: |
1429 | case Intrinsic::aarch64_sve_uaddv: |
1430 | case Intrinsic::aarch64_sve_umaxv: |
1431 | case Intrinsic::aarch64_sve_umaxqv: |
1432 | case Intrinsic::aarch64_sve_cmpeq: |
1433 | case Intrinsic::aarch64_sve_cmpeq_wide: |
1434 | case Intrinsic::aarch64_sve_cmpge: |
1435 | case Intrinsic::aarch64_sve_cmpge_wide: |
1436 | case Intrinsic::aarch64_sve_cmpgt: |
1437 | case Intrinsic::aarch64_sve_cmpgt_wide: |
1438 | case Intrinsic::aarch64_sve_cmphi: |
1439 | case Intrinsic::aarch64_sve_cmphi_wide: |
1440 | case Intrinsic::aarch64_sve_cmphs: |
1441 | case Intrinsic::aarch64_sve_cmphs_wide: |
1442 | case Intrinsic::aarch64_sve_cmple_wide: |
1443 | case Intrinsic::aarch64_sve_cmplo_wide: |
1444 | case Intrinsic::aarch64_sve_cmpls_wide: |
1445 | case Intrinsic::aarch64_sve_cmplt_wide: |
1446 | case Intrinsic::aarch64_sve_cmpne: |
1447 | case Intrinsic::aarch64_sve_cmpne_wide: |
1448 | case Intrinsic::aarch64_sve_facge: |
1449 | case Intrinsic::aarch64_sve_facgt: |
1450 | case Intrinsic::aarch64_sve_fcmpeq: |
1451 | case Intrinsic::aarch64_sve_fcmpge: |
1452 | case Intrinsic::aarch64_sve_fcmpgt: |
1453 | case Intrinsic::aarch64_sve_fcmpne: |
1454 | case Intrinsic::aarch64_sve_fcmpuo: |
1455 | case Intrinsic::aarch64_sve_ld1: |
1456 | case Intrinsic::aarch64_sve_ld1_gather: |
1457 | case Intrinsic::aarch64_sve_ld1_gather_index: |
1458 | case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: |
1459 | case Intrinsic::aarch64_sve_ld1_gather_sxtw: |
1460 | case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: |
1461 | case Intrinsic::aarch64_sve_ld1_gather_uxtw: |
1462 | case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: |
1463 | case Intrinsic::aarch64_sve_ld1q_gather_index: |
1464 | case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset: |
1465 | case Intrinsic::aarch64_sve_ld1q_gather_vector_offset: |
1466 | case Intrinsic::aarch64_sve_ld1ro: |
1467 | case Intrinsic::aarch64_sve_ld1rq: |
1468 | case Intrinsic::aarch64_sve_ld1udq: |
1469 | case Intrinsic::aarch64_sve_ld1uwq: |
1470 | case Intrinsic::aarch64_sve_ld2_sret: |
1471 | case Intrinsic::aarch64_sve_ld2q_sret: |
1472 | case Intrinsic::aarch64_sve_ld3_sret: |
1473 | case Intrinsic::aarch64_sve_ld3q_sret: |
1474 | case Intrinsic::aarch64_sve_ld4_sret: |
1475 | case Intrinsic::aarch64_sve_ld4q_sret: |
1476 | case Intrinsic::aarch64_sve_ldff1: |
1477 | case Intrinsic::aarch64_sve_ldff1_gather: |
1478 | case Intrinsic::aarch64_sve_ldff1_gather_index: |
1479 | case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset: |
1480 | case Intrinsic::aarch64_sve_ldff1_gather_sxtw: |
1481 | case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: |
1482 | case Intrinsic::aarch64_sve_ldff1_gather_uxtw: |
1483 | case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: |
1484 | case Intrinsic::aarch64_sve_ldnf1: |
1485 | case Intrinsic::aarch64_sve_ldnt1: |
1486 | case Intrinsic::aarch64_sve_ldnt1_gather: |
1487 | case Intrinsic::aarch64_sve_ldnt1_gather_index: |
1488 | case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: |
1489 | case Intrinsic::aarch64_sve_ldnt1_gather_uxtw: |
1490 | return SVEIntrinsicInfo::defaultZeroingOp(); |
1491 | |
1492 | case Intrinsic::aarch64_sve_prf: |
1493 | case Intrinsic::aarch64_sve_prfb_gather_index: |
1494 | case Intrinsic::aarch64_sve_prfb_gather_scalar_offset: |
1495 | case Intrinsic::aarch64_sve_prfb_gather_sxtw_index: |
1496 | case Intrinsic::aarch64_sve_prfb_gather_uxtw_index: |
1497 | case Intrinsic::aarch64_sve_prfd_gather_index: |
1498 | case Intrinsic::aarch64_sve_prfd_gather_scalar_offset: |
1499 | case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: |
1500 | case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: |
1501 | case Intrinsic::aarch64_sve_prfh_gather_index: |
1502 | case Intrinsic::aarch64_sve_prfh_gather_scalar_offset: |
1503 | case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: |
1504 | case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: |
1505 | case Intrinsic::aarch64_sve_prfw_gather_index: |
1506 | case Intrinsic::aarch64_sve_prfw_gather_scalar_offset: |
1507 | case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: |
1508 | case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: |
1509 | return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 0); |
1510 | |
1511 | case Intrinsic::aarch64_sve_st1_scatter: |
1512 | case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: |
1513 | case Intrinsic::aarch64_sve_st1_scatter_sxtw: |
1514 | case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: |
1515 | case Intrinsic::aarch64_sve_st1_scatter_uxtw: |
1516 | case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: |
1517 | case Intrinsic::aarch64_sve_st1dq: |
1518 | case Intrinsic::aarch64_sve_st1q_scatter_index: |
1519 | case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset: |
1520 | case Intrinsic::aarch64_sve_st1q_scatter_vector_offset: |
1521 | case Intrinsic::aarch64_sve_st1wq: |
1522 | case Intrinsic::aarch64_sve_stnt1: |
1523 | case Intrinsic::aarch64_sve_stnt1_scatter: |
1524 | case Intrinsic::aarch64_sve_stnt1_scatter_index: |
1525 | case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset: |
1526 | case Intrinsic::aarch64_sve_stnt1_scatter_uxtw: |
1527 | return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 1); |
1528 | case Intrinsic::aarch64_sve_st2: |
1529 | case Intrinsic::aarch64_sve_st2q: |
1530 | return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 2); |
1531 | case Intrinsic::aarch64_sve_st3: |
1532 | case Intrinsic::aarch64_sve_st3q: |
1533 | return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 3); |
1534 | case Intrinsic::aarch64_sve_st4: |
1535 | case Intrinsic::aarch64_sve_st4q: |
1536 | return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 4); |
1537 | } |
1538 | |
1539 | return SVEIntrinsicInfo(); |
1540 | } |
1541 | |
1542 | static bool isAllActivePredicate(Value *Pred) { |
1543 | // Look through convert.from.svbool(convert.to.svbool(...) chain. |
1544 | Value *UncastedPred; |
1545 | if (match(V: Pred, P: m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>( |
1546 | Op0: m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>( |
1547 | Op0: m_Value(V&: UncastedPred))))) |
// If the predicate has no more lanes than the uncasted predicate then we
// know the casting has no effect.
1550 | if (cast<ScalableVectorType>(Val: Pred->getType())->getMinNumElements() <= |
1551 | cast<ScalableVectorType>(Val: UncastedPred->getType())->getMinNumElements()) |
1552 | Pred = UncastedPred; |
1553 | auto *C = dyn_cast<Constant>(Val: Pred); |
1554 | return (C && C->isAllOnesValue()); |
1555 | } |
1556 | |
1557 | // Simplify `V` by only considering the operations that affect active lanes. |
1558 | // This function should only return existing Values or newly created Constants. |
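// e.g. a dup like sve.dup(inactive, Pg, 7) can be treated as splat(7) here,
// because only the lanes governed by Pg are relevant to the caller.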
1559 | static Value *stripInactiveLanes(Value *V, const Value *Pg) { |
1560 | auto *Dup = dyn_cast<IntrinsicInst>(Val: V); |
1561 | if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup && |
1562 | Dup->getOperand(i_nocapture: 1) == Pg && isa<Constant>(Val: Dup->getOperand(i_nocapture: 2))) |
1563 | return ConstantVector::getSplat( |
1564 | EC: cast<VectorType>(Val: V->getType())->getElementCount(), |
1565 | Elt: cast<Constant>(Val: Dup->getOperand(i_nocapture: 2))); |
1566 | |
1567 | return V; |
1568 | } |
1569 | |
1570 | static std::optional<Instruction *> |
1571 | simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, |
1572 | const SVEIntrinsicInfo &IInfo) { |
1573 | const unsigned Opc = IInfo.getMatchingIROpode(); |
1574 | assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!" ); |
1575 | |
1576 | Value *Pg = II.getOperand(i_nocapture: 0); |
1577 | Value *Op1 = II.getOperand(i_nocapture: 1); |
1578 | Value *Op2 = II.getOperand(i_nocapture: 2); |
1579 | const DataLayout &DL = II.getDataLayout(); |
1580 | |
1581 | // Canonicalise constants to the RHS. |
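// e.g. sve.mul_u(pg, splat(2), x) => sve.mul_u(pg, x, splat(2)).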
1582 | if (Instruction::isCommutative(Opcode: Opc) && IInfo.inactiveLanesAreNotDefined() && |
1583 | isa<Constant>(Val: Op1) && !isa<Constant>(Val: Op2)) { |
1584 | IC.replaceOperand(I&: II, OpNum: 1, V: Op2); |
1585 | IC.replaceOperand(I&: II, OpNum: 2, V: Op1); |
1586 | return &II; |
1587 | } |
1588 | |
1589 | // Only active lanes matter when simplifying the operation. |
1590 | Op1 = stripInactiveLanes(V: Op1, Pg); |
1591 | Op2 = stripInactiveLanes(V: Op2, Pg); |
1592 | |
1593 | Value *SimpleII; |
1594 | if (auto FII = dyn_cast<FPMathOperator>(Val: &II)) |
1595 | SimpleII = simplifyBinOp(Opcode: Opc, LHS: Op1, RHS: Op2, FMF: FII->getFastMathFlags(), Q: DL); |
1596 | else |
1597 | SimpleII = simplifyBinOp(Opcode: Opc, LHS: Op1, RHS: Op2, Q: DL); |
1598 | |
1599 | // An SVE intrinsic's result is always defined. However, this is not the case |
1600 | // for its equivalent IR instruction (e.g. when shifting by an amount more |
1601 | // than the data's bitwidth). Simplifications to an undefined result must be |
1602 | // ignored to preserve the intrinsic's expected behaviour. |
1603 | if (!SimpleII || isa<UndefValue>(Val: SimpleII)) |
1604 | return std::nullopt; |
1605 | |
1606 | if (IInfo.inactiveLanesAreNotDefined()) |
1607 | return IC.replaceInstUsesWith(I&: II, V: SimpleII); |
1608 | |
1609 | Value *Inactive = II.getOperand(i_nocapture: IInfo.getOperandIdxInactiveLanesTakenFrom()); |
1610 | |
1611 | // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)). |
1612 | if (SimpleII == Inactive) |
1613 | return IC.replaceInstUsesWith(I&: II, V: SimpleII); |
1614 | |
1615 | // Inactive lanes must be preserved. |
1616 | SimpleII = IC.Builder.CreateSelect(C: Pg, True: SimpleII, False: Inactive); |
1617 | return IC.replaceInstUsesWith(I&: II, V: SimpleII); |
1618 | } |
1619 | |
1620 | // Use SVE intrinsic info to eliminate redundant operands and/or canonicalise |
1621 | // to operations with less strict inactive lane requirements. |
1622 | static std::optional<Instruction *> |
1623 | simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, |
1624 | const SVEIntrinsicInfo &IInfo) { |
1625 | if (!IInfo.hasGoverningPredicate()) |
1626 | return std::nullopt; |
1627 | |
1628 | auto *OpPredicate = II.getOperand(i_nocapture: IInfo.getGoverningPredicateOperandIdx()); |
1629 | |
1630 | // If there are no active lanes. |
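// e.g. a merging op like sve.fsub(pfalse, a, b) folds to its inactive-lanes
// operand (a), while a zeroing op like sve.ld1(pfalse, ptr) folds to
// zeroinitializer.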
1631 | if (match(V: OpPredicate, P: m_ZeroInt())) { |
1632 | if (IInfo.inactiveLanesTakenFromOperand()) |
1633 | return IC.replaceInstUsesWith( |
1634 | I&: II, V: II.getOperand(i_nocapture: IInfo.getOperandIdxInactiveLanesTakenFrom())); |
1635 | |
1636 | if (IInfo.inactiveLanesAreUnused()) { |
1637 | if (IInfo.resultIsZeroInitialized()) |
1638 | IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty: II.getType())); |
1639 | |
1640 | return IC.eraseInstFromFunction(I&: II); |
1641 | } |
1642 | } |
1643 | |
1644 | // If there are no inactive lanes. |
1645 | if (isAllActivePredicate(Pred: OpPredicate)) { |
1646 | if (IInfo.hasOperandWithNoActiveLanes()) { |
1647 | unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes(); |
1648 | if (!isa<UndefValue>(Val: II.getOperand(i_nocapture: OpIdx))) |
1649 | return IC.replaceOperand(I&: II, OpNum: OpIdx, V: UndefValue::get(T: II.getType())); |
1650 | } |
1651 | |
1652 | if (IInfo.hasMatchingUndefIntrinsic()) { |
1653 | auto *NewDecl = Intrinsic::getOrInsertDeclaration( |
1654 | M: II.getModule(), id: IInfo.getMatchingUndefIntrinsic(), Tys: {II.getType()}); |
1655 | II.setCalledFunction(NewDecl); |
1656 | return &II; |
1657 | } |
1658 | } |
1659 | |
1660 | // Operation specific simplifications. |
1661 | if (IInfo.hasMatchingIROpode() && |
1662 | Instruction::isBinaryOp(Opcode: IInfo.getMatchingIROpode())) |
1663 | return simplifySVEIntrinsicBinOp(IC, II, IInfo); |
1664 | |
1665 | return std::nullopt; |
1666 | } |
1667 | |
// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
1669 | // => (binop (pred) (from_svbool _) (from_svbool _)) |
1670 | // |
1671 | // The above transformation eliminates a `to_svbool` in the predicate |
1672 | // operand of bitwise operation `binop` by narrowing the vector width of |
1673 | // the operation. For example, it would convert a `<vscale x 16 x i1> |
1674 | // and` into a `<vscale x 4 x i1> and`. This is profitable because |
1675 | // to_svbool must zero the new lanes during widening, whereas |
1676 | // from_svbool is free. |
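//
// For example (same shorthand, with pred being a <vscale x 4 x i1> value):
//   (from_svbool (and_z (to_svbool pred) a b))
//     => (and_z pred (from_svbool a) (from_svbool b))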
1677 | static std::optional<Instruction *> |
1678 | tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) { |
1679 | auto BinOp = dyn_cast<IntrinsicInst>(Val: II.getOperand(i_nocapture: 0)); |
1680 | if (!BinOp) |
1681 | return std::nullopt; |
1682 | |
1683 | auto IntrinsicID = BinOp->getIntrinsicID(); |
1684 | switch (IntrinsicID) { |
1685 | case Intrinsic::aarch64_sve_and_z: |
1686 | case Intrinsic::aarch64_sve_bic_z: |
1687 | case Intrinsic::aarch64_sve_eor_z: |
1688 | case Intrinsic::aarch64_sve_nand_z: |
1689 | case Intrinsic::aarch64_sve_nor_z: |
1690 | case Intrinsic::aarch64_sve_orn_z: |
1691 | case Intrinsic::aarch64_sve_orr_z: |
1692 | break; |
1693 | default: |
1694 | return std::nullopt; |
1695 | } |
1696 | |
1697 | auto BinOpPred = BinOp->getOperand(i_nocapture: 0); |
1698 | auto BinOpOp1 = BinOp->getOperand(i_nocapture: 1); |
1699 | auto BinOpOp2 = BinOp->getOperand(i_nocapture: 2); |
1700 | |
1701 | auto PredIntr = dyn_cast<IntrinsicInst>(Val: BinOpPred); |
1702 | if (!PredIntr || |
1703 | PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) |
1704 | return std::nullopt; |
1705 | |
1706 | auto PredOp = PredIntr->getOperand(i_nocapture: 0); |
1707 | auto PredOpTy = cast<VectorType>(Val: PredOp->getType()); |
1708 | if (PredOpTy != II.getType()) |
1709 | return std::nullopt; |
1710 | |
1711 | SmallVector<Value *> NarrowedBinOpArgs = {PredOp}; |
1712 | auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic( |
1713 | ID: Intrinsic::aarch64_sve_convert_from_svbool, Types: {PredOpTy}, Args: {BinOpOp1}); |
1714 | NarrowedBinOpArgs.push_back(Elt: NarrowBinOpOp1); |
1715 | if (BinOpOp1 == BinOpOp2) |
1716 | NarrowedBinOpArgs.push_back(Elt: NarrowBinOpOp1); |
1717 | else |
1718 | NarrowedBinOpArgs.push_back(Elt: IC.Builder.CreateIntrinsic( |
1719 | ID: Intrinsic::aarch64_sve_convert_from_svbool, Types: {PredOpTy}, Args: {BinOpOp2})); |
1720 | |
1721 | auto NarrowedBinOp = |
1722 | IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {PredOpTy}, Args: NarrowedBinOpArgs); |
1723 | return IC.replaceInstUsesWith(I&: II, V: NarrowedBinOp); |
1724 | } |
1725 | |
1726 | static std::optional<Instruction *> |
1727 | instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) { |
1728 | // If the reinterpret instruction operand is a PHI Node |
1729 | if (isa<PHINode>(Val: II.getArgOperand(i: 0))) |
1730 | return processPhiNode(IC, II); |
1731 | |
1732 | if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II)) |
1733 | return BinOpCombine; |
1734 | |
1735 | // Ignore converts to/from svcount_t. |
1736 | if (isa<TargetExtType>(Val: II.getArgOperand(i: 0)->getType()) || |
1737 | isa<TargetExtType>(Val: II.getType())) |
1738 | return std::nullopt; |
1739 | |
1740 | SmallVector<Instruction *, 32> CandidatesForRemoval; |
1741 | Value *Cursor = II.getOperand(i_nocapture: 0), *EarliestReplacement = nullptr; |
1742 | |
1743 | const auto *IVTy = cast<VectorType>(Val: II.getType()); |
1744 | |
1745 | // Walk the chain of conversions. |
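// e.g. for from_svbool(to_svbool(from_svbool(to_svbool(x)))) where x already
// has the result type, the walk below finds x itself as the earliest viable
// replacement.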
1746 | while (Cursor) { |
1747 | // If the type of the cursor has fewer lanes than the final result, zeroing |
1748 | // must take place, which breaks the equivalence chain. |
1749 | const auto *CursorVTy = cast<VectorType>(Val: Cursor->getType()); |
1750 | if (CursorVTy->getElementCount().getKnownMinValue() < |
1751 | IVTy->getElementCount().getKnownMinValue()) |
1752 | break; |
1753 | |
1754 | // If the cursor has the same type as I, it is a viable replacement. |
1755 | if (Cursor->getType() == IVTy) |
1756 | EarliestReplacement = Cursor; |
1757 | |
1758 | auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Val: Cursor); |
1759 | |
1760 | // If this is not an SVE conversion intrinsic, this is the end of the chain. |
1761 | if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == |
1762 | Intrinsic::aarch64_sve_convert_to_svbool || |
1763 | IntrinsicCursor->getIntrinsicID() == |
1764 | Intrinsic::aarch64_sve_convert_from_svbool)) |
1765 | break; |
1766 | |
1767 | CandidatesForRemoval.insert(I: CandidatesForRemoval.begin(), Elt: IntrinsicCursor); |
1768 | Cursor = IntrinsicCursor->getOperand(i_nocapture: 0); |
1769 | } |
1770 | |
1771 | // If no viable replacement in the conversion chain was found, there is |
1772 | // nothing to do. |
1773 | if (!EarliestReplacement) |
1774 | return std::nullopt; |
1775 | |
1776 | return IC.replaceInstUsesWith(I&: II, V: EarliestReplacement); |
1777 | } |
1778 | |
1779 | static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC, |
1780 | IntrinsicInst &II) { |
1781 | // svsel(ptrue, x, y) => x |
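// svsel(pg, x, y)    => select pg, x, y (for any other predicate)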
1782 | auto *OpPredicate = II.getOperand(i_nocapture: 0); |
1783 | if (isAllActivePredicate(Pred: OpPredicate)) |
1784 | return IC.replaceInstUsesWith(I&: II, V: II.getOperand(i_nocapture: 1)); |
1785 | |
1786 | auto Select = |
1787 | IC.Builder.CreateSelect(C: OpPredicate, True: II.getOperand(i_nocapture: 1), False: II.getOperand(i_nocapture: 2)); |
1788 | return IC.replaceInstUsesWith(I&: II, V: Select); |
1789 | } |
1790 | |
1791 | static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC, |
1792 | IntrinsicInst &II) { |
1793 | IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(Val: II.getArgOperand(i: 1)); |
1794 | if (!Pg) |
1795 | return std::nullopt; |
1796 | |
1797 | if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) |
1798 | return std::nullopt; |
1799 | |
1800 | const auto PTruePattern = |
1801 | cast<ConstantInt>(Val: Pg->getOperand(i_nocapture: 0))->getZExtValue(); |
1802 | if (PTruePattern != AArch64SVEPredPattern::vl1) |
1803 | return std::nullopt; |
1804 | |
1805 | // The intrinsic is inserting into lane zero so use an insert instead. |
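// i.e. sve.dup(v, ptrue(vl1), x) only updates lane 0, so it is equivalent to
// insertelement v, x, i64 0.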
1806 | auto *IdxTy = Type::getInt64Ty(C&: II.getContext()); |
1807 | auto *Insert = InsertElementInst::Create( |
1808 | Vec: II.getArgOperand(i: 0), NewElt: II.getArgOperand(i: 2), Idx: ConstantInt::get(Ty: IdxTy, V: 0)); |
1809 | Insert->insertBefore(InsertPos: II.getIterator()); |
1810 | Insert->takeName(V: &II); |
1811 | |
1812 | return IC.replaceInstUsesWith(I&: II, V: Insert); |
1813 | } |
1814 | |
1815 | static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC, |
1816 | IntrinsicInst &II) { |
1817 | // Replace DupX with a regular IR splat. |
1818 | auto *RetTy = cast<ScalableVectorType>(Val: II.getType()); |
1819 | Value *Splat = IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(), |
1820 | V: II.getArgOperand(i: 0)); |
1821 | Splat->takeName(V: &II); |
1822 | return IC.replaceInstUsesWith(I&: II, V: Splat); |
1823 | } |
1824 | |
1825 | static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC, |
1826 | IntrinsicInst &II) { |
1827 | LLVMContext &Ctx = II.getContext(); |
1828 | |
1829 | if (!isAllActivePredicate(Pred: II.getArgOperand(i: 0))) |
1830 | return std::nullopt; |
1831 | |
1832 | // Check that we have a compare of zero.. |
1833 | auto *SplatValue = |
1834 | dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: II.getArgOperand(i: 2))); |
1835 | if (!SplatValue || !SplatValue->isZero()) |
1836 | return std::nullopt; |
1837 | |
1838 | // ..against a dupq |
1839 | auto *DupQLane = dyn_cast<IntrinsicInst>(Val: II.getArgOperand(i: 1)); |
1840 | if (!DupQLane || |
1841 | DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane) |
1842 | return std::nullopt; |
1843 | |
1844 | // Where the dupq is a lane 0 replicate of a vector insert |
1845 | auto *DupQLaneIdx = dyn_cast<ConstantInt>(Val: DupQLane->getArgOperand(i: 1)); |
1846 | if (!DupQLaneIdx || !DupQLaneIdx->isZero()) |
1847 | return std::nullopt; |
1848 | |
1849 | auto *VecIns = dyn_cast<IntrinsicInst>(Val: DupQLane->getArgOperand(i: 0)); |
1850 | if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert) |
1851 | return std::nullopt; |
1852 | |
1853 | // Where the vector insert is a fixed constant vector insert into undef at |
1854 | // index zero |
1855 | if (!isa<UndefValue>(Val: VecIns->getArgOperand(i: 0))) |
1856 | return std::nullopt; |
1857 | |
1858 | if (!cast<ConstantInt>(Val: VecIns->getArgOperand(i: 2))->isZero()) |
1859 | return std::nullopt; |
1860 | |
1861 | auto *ConstVec = dyn_cast<Constant>(Val: VecIns->getArgOperand(i: 1)); |
1862 | if (!ConstVec) |
1863 | return std::nullopt; |
1864 | |
1865 | auto *VecTy = dyn_cast<FixedVectorType>(Val: ConstVec->getType()); |
1866 | auto *OutTy = dyn_cast<ScalableVectorType>(Val: II.getType()); |
1867 | if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements()) |
1868 | return std::nullopt; |
1869 | |
1870 | unsigned NumElts = VecTy->getNumElements(); |
1871 | unsigned PredicateBits = 0; |
1872 | |
1873 | // Expand intrinsic operands to a 16-bit byte level predicate |
1874 | for (unsigned I = 0; I < NumElts; ++I) { |
1875 | auto *Arg = dyn_cast<ConstantInt>(Val: ConstVec->getAggregateElement(Elt: I)); |
1876 | if (!Arg) |
1877 | return std::nullopt; |
1878 | if (!Arg->isZero()) |
1879 | PredicateBits |= 1 << (I * (16 / NumElts)); |
1880 | } |
1881 | |
1882 | // If all bits are zero bail early with an empty predicate |
1883 | if (PredicateBits == 0) { |
1884 | auto *PFalse = Constant::getNullValue(Ty: II.getType()); |
1885 | PFalse->takeName(V: &II); |
1886 | return IC.replaceInstUsesWith(I&: II, V: PFalse); |
1887 | } |
1888 | |
1889 | // Calculate largest predicate type used (where byte predicate is largest) |
1890 | unsigned Mask = 8; |
1891 | for (unsigned I = 0; I < 16; ++I) |
1892 | if ((PredicateBits & (1 << I)) != 0) |
1893 | Mask |= (I % 8); |
1894 | |
1895 | unsigned PredSize = Mask & -Mask; |
1896 | auto *PredType = ScalableVectorType::get( |
1897 | ElementType: Type::getInt1Ty(C&: Ctx), MinNumElts: AArch64::SVEBitsPerBlock / (PredSize * 8)); |
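// E.g. comparing dupq(<4 x i32> <1, 1, 1, 1>) against zero sets bits 0, 4, 8
// and 12 above, giving Mask == 12, PredSize == 4 and PredType ==
// <vscale x 4 x i1>.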
1898 | |
1899 | // Ensure all relevant bits are set |
1900 | for (unsigned I = 0; I < 16; I += PredSize) |
1901 | if ((PredicateBits & (1 << I)) == 0) |
1902 | return std::nullopt; |
1903 | |
1904 | auto *PTruePat = |
1905 | ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: AArch64SVEPredPattern::all); |
1906 | auto *PTrue = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue, |
1907 | Types: {PredType}, Args: {PTruePat}); |
1908 | auto *ConvertToSVBool = IC.Builder.CreateIntrinsic( |
1909 | ID: Intrinsic::aarch64_sve_convert_to_svbool, Types: {PredType}, Args: {PTrue}); |
1910 | auto *ConvertFromSVBool = |
1911 | IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_convert_from_svbool, |
1912 | Types: {II.getType()}, Args: {ConvertToSVBool}); |
1913 | |
1914 | ConvertFromSVBool->takeName(V: &II); |
1915 | return IC.replaceInstUsesWith(I&: II, V: ConvertFromSVBool); |
1916 | } |
1917 | |
1918 | static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC, |
1919 | IntrinsicInst &II) { |
1920 | Value *Pg = II.getArgOperand(i: 0); |
1921 | Value *Vec = II.getArgOperand(i: 1); |
1922 | auto IntrinsicID = II.getIntrinsicID(); |
1923 | bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta; |
1924 | |
1925 | // lastX(splat(X)) --> X |
1926 | if (auto *SplatVal = getSplatValue(V: Vec)) |
1927 | return IC.replaceInstUsesWith(I&: II, V: SplatVal); |
1928 | |
1929 | // If x and/or y is a splat value then: |
1930 | // lastX (binop (x, y)) --> binop(lastX(x), lastX(y)) |
1931 | Value *LHS, *RHS; |
1932 | if (match(V: Vec, P: m_OneUse(SubPattern: m_BinOp(L: m_Value(V&: LHS), R: m_Value(V&: RHS))))) { |
1933 | if (isSplatValue(V: LHS) || isSplatValue(V: RHS)) { |
1934 | auto *OldBinOp = cast<BinaryOperator>(Val: Vec); |
1935 | auto OpC = OldBinOp->getOpcode(); |
1936 | auto *NewLHS = |
1937 | IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {Vec->getType()}, Args: {Pg, LHS}); |
1938 | auto *NewRHS = |
1939 | IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {Vec->getType()}, Args: {Pg, RHS}); |
1940 | auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags( |
1941 | Opc: OpC, V1: NewLHS, V2: NewRHS, CopyO: OldBinOp, Name: OldBinOp->getName(), InsertBefore: II.getIterator()); |
1942 | return IC.replaceInstUsesWith(I&: II, V: NewBinOp); |
1943 | } |
1944 | } |
1945 | |
1946 | auto *C = dyn_cast<Constant>(Val: Pg); |
1947 | if (IsAfter && C && C->isNullValue()) { |
1948 | // The intrinsic is extracting lane 0 so use an extract instead. |
1949 | auto *IdxTy = Type::getInt64Ty(C&: II.getContext()); |
auto *Extract = ExtractElementInst::Create(Vec, Idx: ConstantInt::get(Ty: IdxTy, V: 0));
1951 | Extract->insertBefore(InsertPos: II.getIterator()); |
1952 | Extract->takeName(V: &II); |
1953 | return IC.replaceInstUsesWith(I&: II, V: Extract); |
1954 | } |
1955 | |
1956 | auto *IntrPG = dyn_cast<IntrinsicInst>(Val: Pg); |
1957 | if (!IntrPG) |
1958 | return std::nullopt; |
1959 | |
1960 | if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) |
1961 | return std::nullopt; |
1962 | |
1963 | const auto PTruePattern = |
1964 | cast<ConstantInt>(Val: IntrPG->getOperand(i_nocapture: 0))->getZExtValue(); |
1965 | |
1966 | // Can the intrinsic's predicate be converted to a known constant index? |
1967 | unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern: PTruePattern); |
1968 | if (!MinNumElts) |
1969 | return std::nullopt; |
1970 | |
1971 | unsigned Idx = MinNumElts - 1; |
1972 | // Increment the index if extracting the element after the last active |
1973 | // predicate element. |
1974 | if (IsAfter) |
1975 | ++Idx; |
1976 | |
1977 | // Ignore extracts whose index is larger than the known minimum vector |
1978 | // length. NOTE: This is an artificial constraint where we prefer to |
1979 | // maintain what the user asked for until an alternative is proven faster. |
1980 | auto *PgVTy = cast<ScalableVectorType>(Val: Pg->getType()); |
1981 | if (Idx >= PgVTy->getMinNumElements()) |
1982 | return std::nullopt; |
1983 | |
1984 | // The intrinsic is extracting a fixed lane so use an extract instead. |
1985 | auto *IdxTy = Type::getInt64Ty(C&: II.getContext()); |
auto *Extract = ExtractElementInst::Create(Vec, Idx: ConstantInt::get(Ty: IdxTy, V: Idx));
1987 | Extract->insertBefore(InsertPos: II.getIterator()); |
1988 | Extract->takeName(V: &II); |
1989 | return IC.replaceInstUsesWith(I&: II, V: Extract); |
1990 | } |
1991 | |
1992 | static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC, |
1993 | IntrinsicInst &II) { |
1994 | // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar |
1995 | // integer variant across a variety of micro-architectures. Replace scalar |
1996 | // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple |
1997 | // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more |
1998 | // depending on the micro-architecture, but has been observed as generally |
1999 | // being faster, particularly when the CLAST[AB] op is a loop-carried |
2000 | // dependency. |
2001 | Value *Pg = II.getArgOperand(i: 0); |
2002 | Value *Fallback = II.getArgOperand(i: 1); |
2003 | Value *Vec = II.getArgOperand(i: 2); |
2004 | Type *Ty = II.getType(); |
2005 | |
2006 | if (!Ty->isIntegerTy()) |
2007 | return std::nullopt; |
2008 | |
2009 | Type *FPTy; |
2010 | switch (cast<IntegerType>(Val: Ty)->getBitWidth()) { |
2011 | default: |
2012 | return std::nullopt; |
2013 | case 16: |
2014 | FPTy = IC.Builder.getHalfTy(); |
2015 | break; |
2016 | case 32: |
2017 | FPTy = IC.Builder.getFloatTy(); |
2018 | break; |
2019 | case 64: |
2020 | FPTy = IC.Builder.getDoubleTy(); |
2021 | break; |
2022 | } |
2023 | |
2024 | Value *FPFallBack = IC.Builder.CreateBitCast(V: Fallback, DestTy: FPTy); |
2025 | auto *FPVTy = VectorType::get( |
2026 | ElementType: FPTy, EC: cast<VectorType>(Val: Vec->getType())->getElementCount()); |
2027 | Value *FPVec = IC.Builder.CreateBitCast(V: Vec, DestTy: FPVTy); |
2028 | auto *FPII = IC.Builder.CreateIntrinsic( |
2029 | ID: II.getIntrinsicID(), Types: {FPVec->getType()}, Args: {Pg, FPFallBack, FPVec}); |
2030 | Value *FPIItoInt = IC.Builder.CreateBitCast(V: FPII, DestTy: II.getType()); |
2031 | return IC.replaceInstUsesWith(I&: II, V: FPIItoInt); |
2032 | } |
2033 | |
2034 | static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC, |
2035 | IntrinsicInst &II) { |
2036 | LLVMContext &Ctx = II.getContext(); |
2037 | // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr |
2038 | // can work with RDFFR_PP for ptest elimination. |
2039 | auto *AllPat = |
2040 | ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: AArch64SVEPredPattern::all); |
2041 | auto *PTrue = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue, |
2042 | Types: {II.getType()}, Args: {AllPat}); |
2043 | auto *RDFFR = |
2044 | IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_rdffr_z, Args: {PTrue}); |
2045 | RDFFR->takeName(V: &II); |
2046 | return IC.replaceInstUsesWith(I&: II, V: RDFFR); |
2047 | } |
2048 | |
2049 | static std::optional<Instruction *> |
2050 | instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { |
2051 | const auto Pattern = cast<ConstantInt>(Val: II.getArgOperand(i: 0))->getZExtValue(); |
2052 | |
2053 | if (Pattern == AArch64SVEPredPattern::all) { |
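// For the 'all' pattern the count is simply vscale * NumElts,
// e.g. cntw(all) => vscale * 4.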
2054 | Value *Cnt = IC.Builder.CreateElementCount( |
2055 | Ty: II.getType(), EC: ElementCount::getScalable(MinVal: NumElts)); |
2056 | Cnt->takeName(V: &II); |
2057 | return IC.replaceInstUsesWith(I&: II, V: Cnt); |
2058 | } |
2059 | |
2060 | unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern); |
2061 | |
2062 | return MinNumElts && NumElts >= MinNumElts |
2063 | ? std::optional<Instruction *>(IC.replaceInstUsesWith( |
2064 | I&: II, V: ConstantInt::get(Ty: II.getType(), V: MinNumElts))) |
2065 | : std::nullopt; |
2066 | } |
2067 | |
2068 | static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, |
2069 | IntrinsicInst &II) { |
2070 | Value *PgVal = II.getArgOperand(i: 0); |
2071 | Value *OpVal = II.getArgOperand(i: 1); |
2072 | |
2073 | // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X). |
2074 | // Later optimizations prefer this form. |
2075 | if (PgVal == OpVal && |
2076 | (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first || |
2077 | II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) { |
2078 | Value *Ops[] = {PgVal, OpVal}; |
2079 | Type *Tys[] = {PgVal->getType()}; |
2080 | |
2081 | auto *PTest = |
2082 | IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptest_any, Types: Tys, Args: Ops); |
2083 | PTest->takeName(V: &II); |
2084 | |
2085 | return IC.replaceInstUsesWith(I&: II, V: PTest); |
2086 | } |
2087 | |
2088 | IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(Val: PgVal); |
2089 | IntrinsicInst *Op = dyn_cast<IntrinsicInst>(Val: OpVal); |
2090 | |
2091 | if (!Pg || !Op) |
2092 | return std::nullopt; |
2093 | |
2094 | Intrinsic::ID OpIID = Op->getIntrinsicID(); |
2095 | |
2096 | if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && |
2097 | OpIID == Intrinsic::aarch64_sve_convert_to_svbool && |
2098 | Pg->getArgOperand(i: 0)->getType() == Op->getArgOperand(i: 0)->getType()) { |
2099 | Value *Ops[] = {Pg->getArgOperand(i: 0), Op->getArgOperand(i: 0)}; |
2100 | Type *Tys[] = {Pg->getArgOperand(i: 0)->getType()}; |
2101 | |
2102 | auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: Tys, Args: Ops); |
2103 | |
2104 | PTest->takeName(V: &II); |
2105 | return IC.replaceInstUsesWith(I&: II, V: PTest); |
2106 | } |
2107 | |
// Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X).
2109 | // Later optimizations may rewrite sequence to use the flag-setting variant |
2110 | // of instruction X to remove PTEST. |
2111 | if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) && |
2112 | ((OpIID == Intrinsic::aarch64_sve_brka_z) || |
2113 | (OpIID == Intrinsic::aarch64_sve_brkb_z) || |
2114 | (OpIID == Intrinsic::aarch64_sve_brkpa_z) || |
2115 | (OpIID == Intrinsic::aarch64_sve_brkpb_z) || |
2116 | (OpIID == Intrinsic::aarch64_sve_rdffr_z) || |
2117 | (OpIID == Intrinsic::aarch64_sve_and_z) || |
2118 | (OpIID == Intrinsic::aarch64_sve_bic_z) || |
2119 | (OpIID == Intrinsic::aarch64_sve_eor_z) || |
2120 | (OpIID == Intrinsic::aarch64_sve_nand_z) || |
2121 | (OpIID == Intrinsic::aarch64_sve_nor_z) || |
2122 | (OpIID == Intrinsic::aarch64_sve_orn_z) || |
2123 | (OpIID == Intrinsic::aarch64_sve_orr_z))) { |
2124 | Value *Ops[] = {Pg->getArgOperand(i: 0), Pg}; |
2125 | Type *Tys[] = {Pg->getType()}; |
2126 | |
2127 | auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: Tys, Args: Ops); |
2128 | PTest->takeName(V: &II); |
2129 | |
2130 | return IC.replaceInstUsesWith(I&: II, V: PTest); |
2131 | } |
2132 | |
2133 | return std::nullopt; |
2134 | } |
2135 | |
2136 | template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc> |
2137 | static std::optional<Instruction *> |
2138 | instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, |
2139 | bool MergeIntoAddendOp) { |
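// Fold a predicated multiply feeding this add/sub into a single fused
// multiply-accumulate, e.g. fadd(pg, a, fmul(pg, b, c)) => fmla(pg, a, b, c),
// so long as the multiply has no other uses and, for FP, the fast-math flags
// match and allow contraction.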
2140 | Value *P = II.getOperand(i_nocapture: 0); |
2141 | Value *MulOp0, *MulOp1, *AddendOp, *Mul; |
2142 | if (MergeIntoAddendOp) { |
2143 | AddendOp = II.getOperand(i_nocapture: 1); |
2144 | Mul = II.getOperand(i_nocapture: 2); |
2145 | } else { |
2146 | AddendOp = II.getOperand(i_nocapture: 2); |
2147 | Mul = II.getOperand(i_nocapture: 1); |
2148 | } |
2149 | |
2150 | if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(V: P), m_Value(V&: MulOp0), |
2151 | m_Value(V&: MulOp1)))) |
2152 | return std::nullopt; |
2153 | |
2154 | if (!Mul->hasOneUse()) |
2155 | return std::nullopt; |
2156 | |
2157 | Instruction *FMFSource = nullptr; |
2158 | if (II.getType()->isFPOrFPVectorTy()) { |
2159 | llvm::FastMathFlags FAddFlags = II.getFastMathFlags(); |
2160 | // Stop the combine when the flags on the inputs differ in case dropping |
2161 | // flags would lead to us missing out on more beneficial optimizations. |
2162 | if (FAddFlags != cast<CallInst>(Val: Mul)->getFastMathFlags()) |
2163 | return std::nullopt; |
2164 | if (!FAddFlags.allowContract()) |
2165 | return std::nullopt; |
2166 | FMFSource = &II; |
2167 | } |
2168 | |
2169 | CallInst *Res; |
2170 | if (MergeIntoAddendOp) |
2171 | Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, Types: {II.getType()}, |
2172 | Args: {P, AddendOp, MulOp0, MulOp1}, FMFSource); |
2173 | else |
2174 | Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, Types: {II.getType()}, |
2175 | Args: {P, MulOp0, MulOp1, AddendOp}, FMFSource); |
2176 | |
2177 | return IC.replaceInstUsesWith(I&: II, V: Res); |
2178 | } |
2179 | |
2180 | static std::optional<Instruction *> |
2181 | instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { |
2182 | Value *Pred = II.getOperand(i_nocapture: 0); |
2183 | Value *PtrOp = II.getOperand(i_nocapture: 1); |
2184 | Type *VecTy = II.getType(); |
2185 | |
2186 | if (isAllActivePredicate(Pred)) { |
2187 | LoadInst *Load = IC.Builder.CreateLoad(Ty: VecTy, Ptr: PtrOp); |
2188 | Load->copyMetadata(SrcInst: II); |
2189 | return IC.replaceInstUsesWith(I&: II, V: Load); |
2190 | } |
2191 | |
2192 | CallInst *MaskedLoad = |
2193 | IC.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL), |
2194 | Mask: Pred, PassThru: ConstantAggregateZero::get(Ty: VecTy)); |
2195 | MaskedLoad->copyMetadata(SrcInst: II); |
2196 | return IC.replaceInstUsesWith(I&: II, V: MaskedLoad); |
2197 | } |
2198 | |
2199 | static std::optional<Instruction *> |
2200 | instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { |
2201 | Value *VecOp = II.getOperand(i_nocapture: 0); |
2202 | Value *Pred = II.getOperand(i_nocapture: 1); |
2203 | Value *PtrOp = II.getOperand(i_nocapture: 2); |
2204 | |
2205 | if (isAllActivePredicate(Pred)) { |
2206 | StoreInst *Store = IC.Builder.CreateStore(Val: VecOp, Ptr: PtrOp); |
2207 | Store->copyMetadata(SrcInst: II); |
2208 | return IC.eraseInstFromFunction(I&: II); |
2209 | } |
2210 | |
2211 | CallInst *MaskedStore = IC.Builder.CreateMaskedStore( |
2212 | Val: VecOp, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL), Mask: Pred); |
2213 | MaskedStore->copyMetadata(SrcInst: II); |
2214 | return IC.eraseInstFromFunction(I&: II); |
2215 | } |
2216 | |
2217 | static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) { |
2218 | switch (Intrinsic) { |
2219 | case Intrinsic::aarch64_sve_fmul_u: |
2220 | return Instruction::BinaryOps::FMul; |
2221 | case Intrinsic::aarch64_sve_fadd_u: |
2222 | return Instruction::BinaryOps::FAdd; |
2223 | case Intrinsic::aarch64_sve_fsub_u: |
2224 | return Instruction::BinaryOps::FSub; |
2225 | default: |
2226 | return Instruction::BinaryOpsEnd; |
2227 | } |
2228 | } |
2229 | |
2230 | static std::optional<Instruction *> |
2231 | instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) { |
2232 | // Bail due to missing support for ISD::STRICT_ scalable vector operations. |
2233 | if (II.isStrictFP()) |
2234 | return std::nullopt; |
2235 | |
2236 | auto *OpPredicate = II.getOperand(i_nocapture: 0); |
2237 | auto BinOpCode = intrinsicIDToBinOpCode(Intrinsic: II.getIntrinsicID()); |
2238 | if (BinOpCode == Instruction::BinaryOpsEnd || |
2239 | !isAllActivePredicate(Pred: OpPredicate)) |
2240 | return std::nullopt; |
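// e.g. fadd_u(ptrue, a, b) => fadd a, b, preserving the fast-math flags.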
2241 | auto BinOp = IC.Builder.CreateBinOpFMF( |
2242 | Opc: BinOpCode, LHS: II.getOperand(i_nocapture: 1), RHS: II.getOperand(i_nocapture: 2), FMFSource: II.getFastMathFlags()); |
2243 | return IC.replaceInstUsesWith(I&: II, V: BinOp); |
2244 | } |
2245 | |
2246 | static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC, |
2247 | IntrinsicInst &II) { |
2248 | if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, |
2249 | Intrinsic::aarch64_sve_mla>( |
2250 | IC, II, MergeIntoAddendOp: true)) |
2251 | return MLA; |
2252 | if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, |
2253 | Intrinsic::aarch64_sve_mad>( |
2254 | IC, II, MergeIntoAddendOp: false)) |
2255 | return MAD; |
2256 | return std::nullopt; |
2257 | } |
2258 | |
2259 | static std::optional<Instruction *> |
2260 | instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) { |
2261 | if (auto FMLA = |
2262 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
2263 | Intrinsic::aarch64_sve_fmla>(IC, II, |
2264 | MergeIntoAddendOp: true)) |
2265 | return FMLA; |
2266 | if (auto FMAD = |
2267 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
2268 | Intrinsic::aarch64_sve_fmad>(IC, II, |
2269 | MergeIntoAddendOp: false)) |
2270 | return FMAD; |
2271 | if (auto FMLA = |
2272 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, |
2273 | Intrinsic::aarch64_sve_fmla>(IC, II, |
2274 | MergeIntoAddendOp: true)) |
2275 | return FMLA; |
2276 | return std::nullopt; |
2277 | } |
2278 | |
2279 | static std::optional<Instruction *> |
2280 | instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) { |
2281 | if (auto FMLA = |
2282 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
2283 | Intrinsic::aarch64_sve_fmla>(IC, II, |
2284 | MergeIntoAddendOp: true)) |
2285 | return FMLA; |
2286 | if (auto FMAD = |
2287 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
2288 | Intrinsic::aarch64_sve_fmad>(IC, II, |
2289 | MergeIntoAddendOp: false)) |
2290 | return FMAD; |
2291 | if (auto FMLA_U = |
2292 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, |
2293 | Intrinsic::aarch64_sve_fmla_u>( |
2294 | IC, II, MergeIntoAddendOp: true)) |
2295 | return FMLA_U; |
2296 | return instCombineSVEVectorBinOp(IC, II); |
2297 | } |
2298 | |
2299 | static std::optional<Instruction *> |
2300 | instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) { |
2301 | if (auto FMLS = |
2302 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
2303 | Intrinsic::aarch64_sve_fmls>(IC, II, |
2304 | MergeIntoAddendOp: true)) |
2305 | return FMLS; |
2306 | if (auto FMSB = |
2307 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
2308 | Intrinsic::aarch64_sve_fnmsb>( |
2309 | IC, II, MergeIntoAddendOp: false)) |
2310 | return FMSB; |
2311 | if (auto FMLS = |
2312 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, |
2313 | Intrinsic::aarch64_sve_fmls>(IC, II, |
2314 | MergeIntoAddendOp: true)) |
2315 | return FMLS; |
2316 | return std::nullopt; |
2317 | } |
2318 | |
2319 | static std::optional<Instruction *> |
2320 | instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) { |
2321 | if (auto FMLS = |
2322 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
2323 | Intrinsic::aarch64_sve_fmls>(IC, II, |
2324 | MergeIntoAddendOp: true)) |
2325 | return FMLS; |
2326 | if (auto FMSB = |
2327 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
2328 | Intrinsic::aarch64_sve_fnmsb>( |
2329 | IC, II, MergeIntoAddendOp: false)) |
2330 | return FMSB; |
2331 | if (auto FMLS_U = |
2332 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, |
2333 | Intrinsic::aarch64_sve_fmls_u>( |
2334 | IC, II, MergeIntoAddendOp: true)) |
2335 | return FMLS_U; |
2336 | return instCombineSVEVectorBinOp(IC, II); |
2337 | } |
2338 | |
2339 | static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC, |
2340 | IntrinsicInst &II) { |
2341 | if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, |
2342 | Intrinsic::aarch64_sve_mls>( |
2343 | IC, II, MergeIntoAddendOp: true)) |
2344 | return MLS; |
2345 | return std::nullopt; |
2346 | } |
2347 | |
2348 | static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC, |
2349 | IntrinsicInst &II) { |
2350 | Value *UnpackArg = II.getArgOperand(i: 0); |
2351 | auto *RetTy = cast<ScalableVectorType>(Val: II.getType()); |
2352 | bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi || |
2353 | II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo; |
2354 | |
2355 | // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X)) |
2356 | // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X)) |
2357 | if (auto *ScalarArg = getSplatValue(V: UnpackArg)) { |
2358 | ScalarArg = |
2359 | IC.Builder.CreateIntCast(V: ScalarArg, DestTy: RetTy->getScalarType(), isSigned: IsSigned); |
2360 | Value *NewVal = |
2361 | IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(), V: ScalarArg); |
2362 | NewVal->takeName(V: &II); |
2363 | return IC.replaceInstUsesWith(I&: II, V: NewVal); |
2364 | } |
2365 | |
2366 | return std::nullopt; |
2367 | } |
2368 | static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC, |
2369 | IntrinsicInst &II) { |
2370 | auto *OpVal = II.getOperand(i_nocapture: 0); |
2371 | auto *OpIndices = II.getOperand(i_nocapture: 1); |
2372 | VectorType *VTy = cast<VectorType>(Val: II.getType()); |
2373 | |
2374 | // Check whether OpIndices is a constant splat value < minimal element count |
2375 | // of result. |
2376 | auto *SplatValue = dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: OpIndices)); |
2377 | if (!SplatValue || |
2378 | SplatValue->getValue().uge(RHS: VTy->getElementCount().getKnownMinValue())) |
2379 | return std::nullopt; |
2380 | |
// Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to
// splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
auto *Extract = IC.Builder.CreateExtractElement(Vec: OpVal, Idx: SplatValue);
2384 | auto *VectorSplat = |
2385 | IC.Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Extract); |
2386 | |
2387 | VectorSplat->takeName(V: &II); |
2388 | return IC.replaceInstUsesWith(I&: II, V: VectorSplat); |
2389 | } |
2390 | |
2391 | static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC, |
2392 | IntrinsicInst &II) { |
2393 | Value *A, *B; |
2394 | Type *RetTy = II.getType(); |
2395 | constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool; |
2396 | constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool; |
2397 | |
2398 | // uzp1(to_svbool(A), to_svbool(B)) --> <A, B> |
2399 | // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B> |
2400 | if ((match(V: II.getArgOperand(i: 0), |
2401 | P: m_Intrinsic<FromSVB>(Op0: m_Intrinsic<ToSVB>(Op0: m_Value(V&: A)))) && |
2402 | match(V: II.getArgOperand(i: 1), |
2403 | P: m_Intrinsic<FromSVB>(Op0: m_Intrinsic<ToSVB>(Op0: m_Value(V&: B))))) || |
2404 | (match(V: II.getArgOperand(i: 0), P: m_Intrinsic<ToSVB>(Op0: m_Value(V&: A))) && |
2405 | match(V: II.getArgOperand(i: 1), P: m_Intrinsic<ToSVB>(Op0: m_Value(V&: B))))) { |
2406 | auto *TyA = cast<ScalableVectorType>(Val: A->getType()); |
2407 | if (TyA == B->getType() && |
2408 | RetTy == ScalableVectorType::getDoubleElementsVectorType(VTy: TyA)) { |
2409 | auto *SubVec = IC.Builder.CreateInsertVector( |
2410 | DstType: RetTy, SrcVec: PoisonValue::get(T: RetTy), SubVec: A, Idx: uint64_t(0)); |
2411 | auto *ConcatVec = IC.Builder.CreateInsertVector(DstType: RetTy, SrcVec: SubVec, SubVec: B, |
2412 | Idx: TyA->getMinNumElements()); |
2413 | ConcatVec->takeName(V: &II); |
2414 | return IC.replaceInstUsesWith(I&: II, V: ConcatVec); |
2415 | } |
2416 | } |
2417 | |
2418 | return std::nullopt; |
2419 | } |
2420 | |
2421 | static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC, |
2422 | IntrinsicInst &II) { |
2423 | // zip1(uzp1(A, B), uzp2(A, B)) --> A |
2424 | // zip2(uzp1(A, B), uzp2(A, B)) --> B |
2425 | Value *A, *B; |
2426 | if (match(V: II.getArgOperand(i: 0), |
2427 | P: m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(Op0: m_Value(V&: A), Op1: m_Value(V&: B))) && |
2428 | match(V: II.getArgOperand(i: 1), P: m_Intrinsic<Intrinsic::aarch64_sve_uzp2>( |
2429 | Op0: m_Specific(V: A), Op1: m_Specific(V: B)))) |
2430 | return IC.replaceInstUsesWith( |
2431 | I&: II, V: (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B)); |
2432 | |
2433 | return std::nullopt; |
2434 | } |
2435 | |
2436 | static std::optional<Instruction *> |
2437 | instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) { |
2438 | Value *Mask = II.getOperand(i_nocapture: 0); |
2439 | Value *BasePtr = II.getOperand(i_nocapture: 1); |
2440 | Value *Index = II.getOperand(i_nocapture: 2); |
2441 | Type *Ty = II.getType(); |
2442 | Value *PassThru = ConstantAggregateZero::get(Ty); |
2443 | |
2444 | // Contiguous gather => masked load. |
2445 | // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1)) |
2446 | // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer) |
2447 | Value *IndexBase; |
2448 | if (match(V: Index, P: m_Intrinsic<Intrinsic::aarch64_sve_index>( |
2449 | Op0: m_Value(V&: IndexBase), Op1: m_SpecificInt(V: 1)))) { |
2450 | Align Alignment = |
2451 | BasePtr->getPointerAlignment(DL: II.getDataLayout()); |
2452 | |
2453 | Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(), |
2454 | Ptr: BasePtr, IdxList: IndexBase); |
2455 | CallInst *MaskedLoad = |
2456 | IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru); |
2457 | MaskedLoad->takeName(V: &II); |
2458 | return IC.replaceInstUsesWith(I&: II, V: MaskedLoad); |
2459 | } |
2460 | |
2461 | return std::nullopt; |
2462 | } |
2463 | |
2464 | static std::optional<Instruction *> |
2465 | instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) { |
2466 | Value *Val = II.getOperand(i_nocapture: 0); |
2467 | Value *Mask = II.getOperand(i_nocapture: 1); |
2468 | Value *BasePtr = II.getOperand(i_nocapture: 2); |
2469 | Value *Index = II.getOperand(i_nocapture: 3); |
2470 | Type *Ty = Val->getType(); |
2471 | |
2472 | // Contiguous scatter => masked store. |
2473 | // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1)) |
2474 | // => (masked.store Value (gep BasePtr IndexBase) Align Mask) |
2475 | Value *IndexBase; |
2476 | if (match(V: Index, P: m_Intrinsic<Intrinsic::aarch64_sve_index>( |
2477 | Op0: m_Value(V&: IndexBase), Op1: m_SpecificInt(V: 1)))) { |
2478 | Align Alignment = |
2479 | BasePtr->getPointerAlignment(DL: II.getDataLayout()); |
2480 | |
2481 | Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(), |
2482 | Ptr: BasePtr, IdxList: IndexBase); |
2483 | (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask); |
2484 | |
2485 | return IC.eraseInstFromFunction(I&: II); |
2486 | } |
2487 | |
2488 | return std::nullopt; |
2489 | } |
2490 | |
2491 | static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC, |
2492 | IntrinsicInst &II) { |
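  // Fold a signed divide by a (negated) power-of-two splat into asrd (+ neg).
  // A hand-written sketch (not from the tests):
  //   sve.sdiv(pg, %x, splat(8))  --> sve.asrd(pg, %x, 3)
  //   sve.sdiv(pg, %x, splat(-8)) --> sve.neg of sve.asrd(pg, %x, 3)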
2493 | Type *Int32Ty = IC.Builder.getInt32Ty(); |
2494 | Value *Pred = II.getOperand(i_nocapture: 0); |
2495 | Value *Vec = II.getOperand(i_nocapture: 1); |
2496 | Value *DivVec = II.getOperand(i_nocapture: 2); |
2497 | |
2498 | Value *SplatValue = getSplatValue(V: DivVec); |
2499 | ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(Val: SplatValue); |
2500 | if (!SplatConstantInt) |
2501 | return std::nullopt; |
2502 | |
2503 | APInt Divisor = SplatConstantInt->getValue(); |
2504 | const int64_t DivisorValue = Divisor.getSExtValue(); |
2505 | if (DivisorValue == -1) |
2506 | return std::nullopt; |
2507 | if (DivisorValue == 1) |
2508 | IC.replaceInstUsesWith(I&: II, V: Vec); |
2509 | |
2510 | if (Divisor.isPowerOf2()) { |
2511 | Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2()); |
2512 | auto ASRD = IC.Builder.CreateIntrinsic( |
2513 | ID: Intrinsic::aarch64_sve_asrd, Types: {II.getType()}, Args: {Pred, Vec, DivisorLog2}); |
2514 | return IC.replaceInstUsesWith(I&: II, V: ASRD); |
2515 | } |
2516 | if (Divisor.isNegatedPowerOf2()) { |
2517 | Divisor.negate(); |
2518 | Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2()); |
2519 | auto ASRD = IC.Builder.CreateIntrinsic( |
2520 | ID: Intrinsic::aarch64_sve_asrd, Types: {II.getType()}, Args: {Pred, Vec, DivisorLog2}); |
2521 | auto NEG = IC.Builder.CreateIntrinsic( |
2522 | ID: Intrinsic::aarch64_sve_neg, Types: {ASRD->getType()}, Args: {ASRD, Pred, ASRD}); |
2523 | return IC.replaceInstUsesWith(I&: II, V: NEG); |
2524 | } |
2525 | |
2526 | return std::nullopt; |
2527 | } |
2528 | |
2529 | bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) { |
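  // Halve Vec as long as the first half matches the second half (treating
  // nullptr as a poison placeholder when AllowPoison is set). For example:
  //   {a, b, a, b}     -> {a, b}
  //   {a, b, c, d}     -> unchanged, returns false
  //   {a, null, a, b}  -> {a, b} when AllowPoison is true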
2530 | size_t VecSize = Vec.size(); |
2531 | if (VecSize == 1) |
2532 | return true; |
2533 | if (!isPowerOf2_64(Value: VecSize)) |
2534 | return false; |
2535 | size_t HalfVecSize = VecSize / 2; |
2536 | |
2537 | for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize; |
2538 | RHS != Vec.end(); LHS++, RHS++) { |
2539 | if (*LHS != nullptr && *RHS != nullptr) { |
2540 | if (*LHS == *RHS) |
2541 | continue; |
2542 | else |
2543 | return false; |
2544 | } |
2545 | if (!AllowPoison) |
2546 | return false; |
2547 | if (*LHS == nullptr && *RHS != nullptr) |
2548 | *LHS = *RHS; |
2549 | } |
2550 | |
2551 | Vec.resize(N: HalfVecSize); |
2552 | SimplifyValuePattern(Vec, AllowPoison); |
2553 | return true; |
2554 | } |
2555 | |
// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
// to dupqlane(f64(C)), where C is A concatenated with B.
2558 | static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC, |
2559 | IntrinsicInst &II) { |
2560 | Value *CurrentInsertElt = nullptr, *Default = nullptr; |
2561 | if (!match(V: II.getOperand(i_nocapture: 0), |
2562 | P: m_Intrinsic<Intrinsic::vector_insert>( |
2563 | Op0: m_Value(V&: Default), Op1: m_Value(V&: CurrentInsertElt), Op2: m_Value())) || |
2564 | !isa<FixedVectorType>(Val: CurrentInsertElt->getType())) |
2565 | return std::nullopt; |
2566 | auto IIScalableTy = cast<ScalableVectorType>(Val: II.getType()); |
2567 | |
2568 | // Insert the scalars into a container ordered by InsertElement index |
2569 | SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr); |
2570 | while (auto InsertElt = dyn_cast<InsertElementInst>(Val: CurrentInsertElt)) { |
2571 | auto Idx = cast<ConstantInt>(Val: InsertElt->getOperand(i_nocapture: 2)); |
2572 | Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(i_nocapture: 1); |
2573 | CurrentInsertElt = InsertElt->getOperand(i_nocapture: 0); |
2574 | } |
2575 | |
2576 | bool AllowPoison = |
2577 | isa<PoisonValue>(Val: CurrentInsertElt) && isa<PoisonValue>(Val: Default); |
2578 | if (!SimplifyValuePattern(Vec&: Elts, AllowPoison)) |
2579 | return std::nullopt; |
2580 | |
2581 | // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b) |
2582 | Value *InsertEltChain = PoisonValue::get(T: CurrentInsertElt->getType()); |
2583 | for (size_t I = 0; I < Elts.size(); I++) { |
2584 | if (Elts[I] == nullptr) |
2585 | continue; |
2586 | InsertEltChain = IC.Builder.CreateInsertElement(Vec: InsertEltChain, NewElt: Elts[I], |
2587 | Idx: IC.Builder.getInt64(C: I)); |
2588 | } |
2589 | if (InsertEltChain == nullptr) |
2590 | return std::nullopt; |
2591 | |
2592 | // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64 |
2593 | // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector |
2594 | // be bitcast to a type wide enough to fit the sequence, be splatted, and then |
2595 | // be narrowed back to the original type. |
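  // For example (an assumed incoming type, for illustration only): four f16
  // values are inserted into an nxv8f16, bitcast to nxv2i64, splatted with an
  // all-zero shufflevector mask, and bitcast back to nxv8f16.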
2596 | unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size(); |
2597 | unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() * |
2598 | IIScalableTy->getMinNumElements() / |
2599 | PatternWidth; |
2600 | |
2601 | IntegerType *WideTy = IC.Builder.getIntNTy(N: PatternWidth); |
2602 | auto *WideScalableTy = ScalableVectorType::get(ElementType: WideTy, MinNumElts: PatternElementCount); |
2603 | auto *WideShuffleMaskTy = |
2604 | ScalableVectorType::get(ElementType: IC.Builder.getInt32Ty(), MinNumElts: PatternElementCount); |
2605 | |
2606 | auto InsertSubvector = IC.Builder.CreateInsertVector( |
2607 | DstType: II.getType(), SrcVec: PoisonValue::get(T: II.getType()), SubVec: InsertEltChain, |
2608 | Idx: uint64_t(0)); |
2609 | auto WideBitcast = |
2610 | IC.Builder.CreateBitOrPointerCast(V: InsertSubvector, DestTy: WideScalableTy); |
2611 | auto WideShuffleMask = ConstantAggregateZero::get(Ty: WideShuffleMaskTy); |
2612 | auto WideShuffle = IC.Builder.CreateShuffleVector( |
2613 | V1: WideBitcast, V2: PoisonValue::get(T: WideScalableTy), Mask: WideShuffleMask); |
2614 | auto NarrowBitcast = |
2615 | IC.Builder.CreateBitOrPointerCast(V: WideShuffle, DestTy: II.getType()); |
2616 | |
2617 | return IC.replaceInstUsesWith(I&: II, V: NarrowBitcast); |
2618 | } |
2619 | |
2620 | static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC, |
2621 | IntrinsicInst &II) { |
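  // fmaxnm/fminnm of a value with itself folds to that value, e.g. (sketch):
  //   neon.fmaxnm(%x, %x) --> %x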
2622 | Value *A = II.getArgOperand(i: 0); |
2623 | Value *B = II.getArgOperand(i: 1); |
2624 | if (A == B) |
2625 | return IC.replaceInstUsesWith(I&: II, V: A); |
2626 | |
2627 | return std::nullopt; |
2628 | } |
2629 | |
2630 | static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC, |
2631 | IntrinsicInst &II) { |
2632 | Value *Pred = II.getOperand(i_nocapture: 0); |
2633 | Value *Vec = II.getOperand(i_nocapture: 1); |
2634 | Value *Shift = II.getOperand(i_nocapture: 2); |
2635 | |
2636 | // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic. |
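  // For example (hand-written sketch): with %a = sve.abs(undef, pg, %x) and a
  // non-negative constant shift amount,
  //   sve.srshl(pg, %a, splat(2)) --> sve.lsl(pg, %a, splat(2))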
2637 | Value *AbsPred, *MergedValue; |
2638 | if (!match(V: Vec, P: m_Intrinsic<Intrinsic::aarch64_sve_sqabs>( |
2639 | Op0: m_Value(V&: MergedValue), Op1: m_Value(V&: AbsPred), Op2: m_Value())) && |
2640 | !match(V: Vec, P: m_Intrinsic<Intrinsic::aarch64_sve_abs>( |
                      Op0: m_Value(V&: MergedValue), Op1: m_Value(V&: AbsPred), Op2: m_Value())))
    return std::nullopt;
2644 | |
2645 | // Transform is valid if any of the following are true: |
2646 | // * The ABS merge value is an undef or non-negative |
2647 | // * The ABS predicate is all active |
2648 | // * The ABS predicate and the SRSHL predicates are the same |
2649 | if (!isa<UndefValue>(Val: MergedValue) && !match(V: MergedValue, P: m_NonNegative()) && |
2650 | AbsPred != Pred && !isAllActivePredicate(Pred: AbsPred)) |
2651 | return std::nullopt; |
2652 | |
2653 | // Only valid when the shift amount is non-negative, otherwise the rounding |
2654 | // behaviour of SRSHL cannot be ignored. |
2655 | if (!match(V: Shift, P: m_NonNegative())) |
2656 | return std::nullopt; |
2657 | |
2658 | auto LSL = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_lsl, |
2659 | Types: {II.getType()}, Args: {Pred, Vec, Shift}); |
2660 | |
2661 | return IC.replaceInstUsesWith(I&: II, V: LSL); |
2662 | } |
2663 | |
2664 | static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC, |
2665 | IntrinsicInst &II) { |
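  // insr shifts the vector up one element and inserts the scalar at lane 0, so
  // if the vector is already a splat of that same scalar the result is
  // unchanged, e.g. (sketch): sve.insr(splat(%x), %x) --> splat(%x).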
2666 | Value *Vec = II.getOperand(i_nocapture: 0); |
2667 | |
2668 | if (getSplatValue(V: Vec) == II.getOperand(i_nocapture: 1)) |
2669 | return IC.replaceInstUsesWith(I&: II, V: Vec); |
2670 | |
2671 | return std::nullopt; |
2672 | } |
2673 | |
2674 | static std::optional<Instruction *> instCombineDMB(InstCombiner &IC, |
2675 | IntrinsicInst &II) { |
  // If this barrier is post-dominated by an identical one, we can remove it.
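  // For example (sketch):
  //   call void @llvm.aarch64.dmb(i32 11)   ; dmb ish
  //   %x = add i32 %a, %b                   ; no memory access, skipped over
  //   call void @llvm.aarch64.dmb(i32 11)   ; dmb ish
  // The first dmb is redundant and gets erased.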
2677 | auto *NI = II.getNextNonDebugInstruction(); |
2678 | unsigned LookaheadThreshold = DMBLookaheadThreshold; |
2679 | auto CanSkipOver = [](Instruction *I) { |
2680 | return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects(); |
2681 | }; |
2682 | while (LookaheadThreshold-- && CanSkipOver(NI)) { |
2683 | auto *NIBB = NI->getParent(); |
2684 | NI = NI->getNextNonDebugInstruction(); |
2685 | if (!NI) { |
2686 | if (auto *SuccBB = NIBB->getUniqueSuccessor()) |
2687 | NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime(); |
2688 | else |
2689 | break; |
2690 | } |
2691 | } |
2692 | auto *NextII = dyn_cast_or_null<IntrinsicInst>(Val: NI); |
2693 | if (NextII && II.isIdenticalTo(I: NextII)) |
2694 | return IC.eraseInstFromFunction(I&: II); |
2695 | |
2696 | return std::nullopt; |
2697 | } |
2698 | |
2699 | static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC, |
2700 | IntrinsicInst &II) { |
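  // A ptrue with the "all" pattern produces an all-active predicate, e.g.
  // (sketch): sve.ptrue.nxv16i1(31) folds to an all-ones <vscale x 16 x i1>
  // constant.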
2701 | if (match(V: II.getOperand(i_nocapture: 0), P: m_ConstantInt<AArch64SVEPredPattern::all>())) |
2702 | return IC.replaceInstUsesWith(I&: II, V: Constant::getAllOnesValue(Ty: II.getType())); |
2703 | return std::nullopt; |
2704 | } |
2705 | |
2706 | static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC, |
2707 | IntrinsicInst &II, |
2708 | unsigned NumBits) { |
2709 | Value *Passthru = II.getOperand(i_nocapture: 0); |
2710 | Value *Pg = II.getOperand(i_nocapture: 1); |
2711 | Value *Op = II.getOperand(i_nocapture: 2); |
2712 | |
2713 | // Convert UXT[BHW] to AND. |
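  // For example (sketch): with an all-active predicate or an undef passthru,
  //   sve.uxtb(passthru, pg, %x) --> sve.and_u(pg, %x, splat(0xff))
  // and uxth/uxtw use 0xffff/0xffffffff masks respectively.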
2714 | if (isa<UndefValue>(Val: Passthru) || isAllActivePredicate(Pred: Pg)) { |
2715 | auto *Ty = cast<VectorType>(Val: II.getType()); |
2716 | auto MaskValue = APInt::getLowBitsSet(numBits: Ty->getScalarSizeInBits(), loBitsSet: NumBits); |
2717 | auto *Mask = ConstantInt::get(Ty, V: MaskValue); |
2718 | auto *And = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_and_u, Types: {Ty}, |
2719 | Args: {Pg, Op, Mask}); |
2720 | return IC.replaceInstUsesWith(I&: II, V: And); |
2721 | } |
2722 | |
2723 | return std::nullopt; |
2724 | } |
2725 | |
2726 | std::optional<Instruction *> |
2727 | AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, |
2728 | IntrinsicInst &II) const { |
2729 | const SVEIntrinsicInfo &IInfo = constructSVEIntrinsicInfo(II); |
2730 | if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo)) |
2731 | return I; |
2732 | |
2733 | Intrinsic::ID IID = II.getIntrinsicID(); |
2734 | switch (IID) { |
2735 | default: |
2736 | break; |
2737 | case Intrinsic::aarch64_dmb: |
2738 | return instCombineDMB(IC, II); |
2739 | case Intrinsic::aarch64_neon_fmaxnm: |
2740 | case Intrinsic::aarch64_neon_fminnm: |
2741 | return instCombineMaxMinNM(IC, II); |
2742 | case Intrinsic::aarch64_sve_convert_from_svbool: |
2743 | return instCombineConvertFromSVBool(IC, II); |
2744 | case Intrinsic::aarch64_sve_dup: |
2745 | return instCombineSVEDup(IC, II); |
2746 | case Intrinsic::aarch64_sve_dup_x: |
2747 | return instCombineSVEDupX(IC, II); |
2748 | case Intrinsic::aarch64_sve_cmpne: |
2749 | case Intrinsic::aarch64_sve_cmpne_wide: |
2750 | return instCombineSVECmpNE(IC, II); |
2751 | case Intrinsic::aarch64_sve_rdffr: |
2752 | return instCombineRDFFR(IC, II); |
2753 | case Intrinsic::aarch64_sve_lasta: |
2754 | case Intrinsic::aarch64_sve_lastb: |
2755 | return instCombineSVELast(IC, II); |
2756 | case Intrinsic::aarch64_sve_clasta_n: |
2757 | case Intrinsic::aarch64_sve_clastb_n: |
2758 | return instCombineSVECondLast(IC, II); |
2759 | case Intrinsic::aarch64_sve_cntd: |
2760 | return instCombineSVECntElts(IC, II, NumElts: 2); |
2761 | case Intrinsic::aarch64_sve_cntw: |
2762 | return instCombineSVECntElts(IC, II, NumElts: 4); |
2763 | case Intrinsic::aarch64_sve_cnth: |
2764 | return instCombineSVECntElts(IC, II, NumElts: 8); |
2765 | case Intrinsic::aarch64_sve_cntb: |
2766 | return instCombineSVECntElts(IC, II, NumElts: 16); |
2767 | case Intrinsic::aarch64_sve_ptest_any: |
2768 | case Intrinsic::aarch64_sve_ptest_first: |
2769 | case Intrinsic::aarch64_sve_ptest_last: |
2770 | return instCombineSVEPTest(IC, II); |
2771 | case Intrinsic::aarch64_sve_fadd: |
2772 | return instCombineSVEVectorFAdd(IC, II); |
2773 | case Intrinsic::aarch64_sve_fadd_u: |
2774 | return instCombineSVEVectorFAddU(IC, II); |
2775 | case Intrinsic::aarch64_sve_fmul_u: |
2776 | return instCombineSVEVectorBinOp(IC, II); |
2777 | case Intrinsic::aarch64_sve_fsub: |
2778 | return instCombineSVEVectorFSub(IC, II); |
2779 | case Intrinsic::aarch64_sve_fsub_u: |
2780 | return instCombineSVEVectorFSubU(IC, II); |
2781 | case Intrinsic::aarch64_sve_add: |
2782 | return instCombineSVEVectorAdd(IC, II); |
2783 | case Intrinsic::aarch64_sve_add_u: |
2784 | return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u, |
2785 | Intrinsic::aarch64_sve_mla_u>( |
2786 | IC, II, MergeIntoAddendOp: true); |
2787 | case Intrinsic::aarch64_sve_sub: |
2788 | return instCombineSVEVectorSub(IC, II); |
2789 | case Intrinsic::aarch64_sve_sub_u: |
2790 | return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u, |
2791 | Intrinsic::aarch64_sve_mls_u>( |
2792 | IC, II, MergeIntoAddendOp: true); |
2793 | case Intrinsic::aarch64_sve_tbl: |
2794 | return instCombineSVETBL(IC, II); |
2795 | case Intrinsic::aarch64_sve_uunpkhi: |
2796 | case Intrinsic::aarch64_sve_uunpklo: |
2797 | case Intrinsic::aarch64_sve_sunpkhi: |
2798 | case Intrinsic::aarch64_sve_sunpklo: |
2799 | return instCombineSVEUnpack(IC, II); |
2800 | case Intrinsic::aarch64_sve_uzp1: |
2801 | return instCombineSVEUzp1(IC, II); |
2802 | case Intrinsic::aarch64_sve_zip1: |
2803 | case Intrinsic::aarch64_sve_zip2: |
2804 | return instCombineSVEZip(IC, II); |
2805 | case Intrinsic::aarch64_sve_ld1_gather_index: |
2806 | return instCombineLD1GatherIndex(IC, II); |
2807 | case Intrinsic::aarch64_sve_st1_scatter_index: |
2808 | return instCombineST1ScatterIndex(IC, II); |
2809 | case Intrinsic::aarch64_sve_ld1: |
2810 | return instCombineSVELD1(IC, II, DL); |
2811 | case Intrinsic::aarch64_sve_st1: |
2812 | return instCombineSVEST1(IC, II, DL); |
2813 | case Intrinsic::aarch64_sve_sdiv: |
2814 | return instCombineSVESDIV(IC, II); |
2815 | case Intrinsic::aarch64_sve_sel: |
2816 | return instCombineSVESel(IC, II); |
2817 | case Intrinsic::aarch64_sve_srshl: |
2818 | return instCombineSVESrshl(IC, II); |
2819 | case Intrinsic::aarch64_sve_dupq_lane: |
2820 | return instCombineSVEDupqLane(IC, II); |
2821 | case Intrinsic::aarch64_sve_insr: |
2822 | return instCombineSVEInsr(IC, II); |
2823 | case Intrinsic::aarch64_sve_ptrue: |
2824 | return instCombinePTrue(IC, II); |
2825 | case Intrinsic::aarch64_sve_uxtb: |
2826 | return instCombineSVEUxt(IC, II, NumBits: 8); |
2827 | case Intrinsic::aarch64_sve_uxth: |
2828 | return instCombineSVEUxt(IC, II, NumBits: 16); |
2829 | case Intrinsic::aarch64_sve_uxtw: |
2830 | return instCombineSVEUxt(IC, II, NumBits: 32); |
2831 | } |
2832 | |
2833 | return std::nullopt; |
2834 | } |
2835 | |
2836 | std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic( |
2837 | InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts, |
2838 | APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, |
2839 | std::function<void(Instruction *, unsigned, APInt, APInt &)> |
2840 | SimplifyAndSetOp) const { |
2841 | switch (II.getIntrinsicID()) { |
2842 | default: |
2843 | break; |
2844 | case Intrinsic::aarch64_neon_fcvtxn: |
2845 | case Intrinsic::aarch64_neon_rshrn: |
2846 | case Intrinsic::aarch64_neon_sqrshrn: |
2847 | case Intrinsic::aarch64_neon_sqrshrun: |
2848 | case Intrinsic::aarch64_neon_sqshrn: |
2849 | case Intrinsic::aarch64_neon_sqshrun: |
2850 | case Intrinsic::aarch64_neon_sqxtn: |
2851 | case Intrinsic::aarch64_neon_sqxtun: |
2852 | case Intrinsic::aarch64_neon_uqrshrn: |
2853 | case Intrinsic::aarch64_neon_uqshrn: |
2854 | case Intrinsic::aarch64_neon_uqxtn: |
2855 | SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts); |
2856 | break; |
2857 | } |
2858 | |
2859 | return std::nullopt; |
2860 | } |
2861 | |
2862 | bool AArch64TTIImpl::enableScalableVectorization() const { |
2863 | return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() && |
2864 | EnableScalableAutovecInStreamingMode); |
2865 | } |
2866 | |
2867 | TypeSize |
2868 | AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
2869 | switch (K) { |
2870 | case TargetTransformInfo::RGK_Scalar: |
2871 | return TypeSize::getFixed(ExactSize: 64); |
2872 | case TargetTransformInfo::RGK_FixedWidthVector: |
2873 | if (ST->useSVEForFixedLengthVectors() && |
2874 | (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode)) |
2875 | return TypeSize::getFixed( |
2876 | ExactSize: std::max(a: ST->getMinSVEVectorSizeInBits(), b: 128u)); |
2877 | else if (ST->isNeonAvailable()) |
2878 | return TypeSize::getFixed(ExactSize: 128); |
2879 | else |
2880 | return TypeSize::getFixed(ExactSize: 0); |
2881 | case TargetTransformInfo::RGK_ScalableVector: |
2882 | if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() && |
2883 | EnableScalableAutovecInStreamingMode)) |
2884 | return TypeSize::getScalable(MinimumSize: 128); |
2885 | else |
2886 | return TypeSize::getScalable(MinimumSize: 0); |
2887 | } |
2888 | llvm_unreachable("Unsupported register kind" ); |
2889 | } |
2890 | |
2891 | bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, |
2892 | ArrayRef<const Value *> Args, |
2893 | Type *SrcOverrideTy) const { |
  // A helper that returns a vector type with the scalar type of ArgTy and the
  // element count of DstTy, i.e. the destination type determines the width.
2896 | auto toVectorTy = [&](Type *ArgTy) { |
2897 | return VectorType::get(ElementType: ArgTy->getScalarType(), |
2898 | EC: cast<VectorType>(Val: DstTy)->getElementCount()); |
2899 | }; |
2900 | |
2901 | // Exit early if DstTy is not a vector type whose elements are one of [i16, |
2902 | // i32, i64]. SVE doesn't generally have the same set of instructions to |
2903 | // perform an extend with the add/sub/mul. There are SMULLB style |
2904 | // instructions, but they operate on top/bottom, requiring some sort of lane |
2905 | // interleaving to be used with zext/sext. |
2906 | unsigned DstEltSize = DstTy->getScalarSizeInBits(); |
2907 | if (!useNeonVector(Ty: DstTy) || Args.size() != 2 || |
2908 | (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) |
2909 | return false; |
2910 | |
2911 | // Determine if the operation has a widening variant. We consider both the |
2912 | // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the |
2913 | // instructions. |
2914 | // |
2915 | // TODO: Add additional widening operations (e.g., shl, etc.) once we |
2916 | // verify that their extending operands are eliminated during code |
2917 | // generation. |
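  // For example (illustrative IR, not from the tests):
  //   %e = zext <8 x i8> %b to <8 x i16>
  //   %r = add <8 x i16> %a, %e
  // can be selected to uaddw, so the extend feeding the second operand is
  // treated as free.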
2918 | Type *SrcTy = SrcOverrideTy; |
2919 | switch (Opcode) { |
2920 | case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). |
2921 | case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). |
2922 | // The second operand needs to be an extend |
2923 | if (isa<SExtInst>(Val: Args[1]) || isa<ZExtInst>(Val: Args[1])) { |
2924 | if (!SrcTy) |
2925 | SrcTy = |
2926 | toVectorTy(cast<Instruction>(Val: Args[1])->getOperand(i: 0)->getType()); |
2927 | } else |
2928 | return false; |
2929 | break; |
2930 | case Instruction::Mul: { // SMULL(2), UMULL(2) |
2931 | // Both operands need to be extends of the same type. |
2932 | if ((isa<SExtInst>(Val: Args[0]) && isa<SExtInst>(Val: Args[1])) || |
2933 | (isa<ZExtInst>(Val: Args[0]) && isa<ZExtInst>(Val: Args[1]))) { |
2934 | if (!SrcTy) |
2935 | SrcTy = |
2936 | toVectorTy(cast<Instruction>(Val: Args[0])->getOperand(i: 0)->getType()); |
2937 | } else if (isa<ZExtInst>(Val: Args[0]) || isa<ZExtInst>(Val: Args[1])) { |
      // If one of the operands is a Zext and the other has enough zero bits
      // to be treated as unsigned, we can still generate a umull, meaning the
      // zext is free.
2941 | KnownBits Known = |
2942 | computeKnownBits(V: isa<ZExtInst>(Val: Args[0]) ? Args[1] : Args[0], DL); |
2943 | if (Args[0]->getType()->getScalarSizeInBits() - |
2944 | Known.Zero.countLeadingOnes() > |
2945 | DstTy->getScalarSizeInBits() / 2) |
2946 | return false; |
2947 | if (!SrcTy) |
2948 | SrcTy = toVectorTy(Type::getIntNTy(C&: DstTy->getContext(), |
2949 | N: DstTy->getScalarSizeInBits() / 2)); |
2950 | } else |
2951 | return false; |
2952 | break; |
2953 | } |
2954 | default: |
2955 | return false; |
2956 | } |
2957 | |
2958 | // Legalize the destination type and ensure it can be used in a widening |
2959 | // operation. |
2960 | auto DstTyL = getTypeLegalizationCost(Ty: DstTy); |
2961 | if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits()) |
2962 | return false; |
2963 | |
2964 | // Legalize the source type and ensure it can be used in a widening |
2965 | // operation. |
2966 | assert(SrcTy && "Expected some SrcTy" ); |
2967 | auto SrcTyL = getTypeLegalizationCost(Ty: SrcTy); |
2968 | unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); |
2969 | if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) |
2970 | return false; |
2971 | |
2972 | // Get the total number of vector elements in the legalized types. |
2973 | InstructionCost NumDstEls = |
2974 | DstTyL.first * DstTyL.second.getVectorMinNumElements(); |
2975 | InstructionCost NumSrcEls = |
2976 | SrcTyL.first * SrcTyL.second.getVectorMinNumElements(); |
2977 | |
2978 | // Return true if the legalized types have the same number of vector elements |
2979 | // and the destination element type size is twice that of the source type. |
2980 | return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize; |
2981 | } |
2982 | |
2983 | // s/urhadd instructions implement the following pattern, making the |
2984 | // extends free: |
2985 | // %x = add ((zext i8 -> i16), 1) |
2986 | // %y = (zext i8 -> i16) |
2987 | // trunc i16 (lshr (add %x, %y), 1) -> i8 |
2988 | // |
2989 | bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, |
2990 | Type *Src) const { |
2991 | // The source should be a legal vector type. |
2992 | if (!Src->isVectorTy() || !TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: Src)) || |
2993 | (Src->isScalableTy() && !ST->hasSVE2())) |
2994 | return false; |
2995 | |
2996 | if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse()) |
2997 | return false; |
2998 | |
2999 | // Look for trunc/shl/add before trying to match the pattern. |
3000 | const Instruction *Add = ExtUser; |
3001 | auto *AddUser = |
3002 | dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser()); |
3003 | if (AddUser && AddUser->getOpcode() == Instruction::Add) |
3004 | Add = AddUser; |
3005 | |
3006 | auto *Shr = dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser()); |
3007 | if (!Shr || Shr->getOpcode() != Instruction::LShr) |
3008 | return false; |
3009 | |
3010 | auto *Trunc = dyn_cast_or_null<Instruction>(Val: Shr->getUniqueUndroppableUser()); |
3011 | if (!Trunc || Trunc->getOpcode() != Instruction::Trunc || |
3012 | Src->getScalarSizeInBits() != |
3013 | cast<CastInst>(Val: Trunc)->getDestTy()->getScalarSizeInBits()) |
3014 | return false; |
3015 | |
3016 | // Try to match the whole pattern. Ext could be either the first or second |
3017 | // m_ZExtOrSExt matched. |
3018 | Instruction *Ex1, *Ex2; |
3019 | if (!(match(V: Add, P: m_c_Add(L: m_Instruction(I&: Ex1), |
3020 | R: m_c_Add(L: m_Instruction(I&: Ex2), R: m_SpecificInt(V: 1)))))) |
3021 | return false; |
3022 | |
3023 | // Ensure both extends are of the same type |
3024 | if (match(V: Ex1, P: m_ZExtOrSExt(Op: m_Value())) && |
3025 | Ex1->getOpcode() == Ex2->getOpcode()) |
3026 | return true; |
3027 | |
3028 | return false; |
3029 | } |
3030 | |
3031 | InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, |
3032 | Type *Src, |
3033 | TTI::CastContextHint CCH, |
3034 | TTI::TargetCostKind CostKind, |
3035 | const Instruction *I) const { |
3036 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
3037 | assert(ISD && "Invalid opcode" ); |
3038 | // If the cast is observable, and it is used by a widening instruction (e.g., |
3039 | // uaddl, saddw, etc.), it may be free. |
3040 | if (I && I->hasOneUser()) { |
3041 | auto *SingleUser = cast<Instruction>(Val: *I->user_begin()); |
3042 | SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); |
3043 | if (isWideningInstruction(DstTy: Dst, Opcode: SingleUser->getOpcode(), Args: Operands, SrcOverrideTy: Src)) { |
      // For adds, only count the second operand as free if both operands are
      // extends but not the same operation (i.e. both operands are not free
      // in add(sext, zext)).
3047 | if (SingleUser->getOpcode() == Instruction::Add) { |
3048 | if (I == SingleUser->getOperand(i: 1) || |
3049 | (isa<CastInst>(Val: SingleUser->getOperand(i: 1)) && |
3050 | cast<CastInst>(Val: SingleUser->getOperand(i: 1))->getOpcode() == Opcode)) |
3051 | return 0; |
3052 | } else // Others are free so long as isWideningInstruction returned true. |
3053 | return 0; |
3054 | } |
3055 | |
3056 | // The cast will be free for the s/urhadd instructions |
3057 | if ((isa<ZExtInst>(Val: I) || isa<SExtInst>(Val: I)) && |
3058 | isExtPartOfAvgExpr(ExtUser: SingleUser, Dst, Src)) |
3059 | return 0; |
3060 | } |
3061 | |
3062 | // TODO: Allow non-throughput costs that aren't binary. |
3063 | auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { |
3064 | if (CostKind != TTI::TCK_RecipThroughput) |
3065 | return Cost == 0 ? 0 : 1; |
3066 | return Cost; |
3067 | }; |
3068 | |
3069 | EVT SrcTy = TLI->getValueType(DL, Ty: Src); |
3070 | EVT DstTy = TLI->getValueType(DL, Ty: Dst); |
3071 | |
3072 | if (!SrcTy.isSimple() || !DstTy.isSimple()) |
3073 | return AdjustCost( |
3074 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); |
3075 | |
3076 | static const TypeConversionCostTblEntry BF16Tbl[] = { |
3077 | {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f32, .Cost: 1}, // bfcvt |
3078 | {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f64, .Cost: 1}, // bfcvt |
3079 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f32, .Cost: 1}, // bfcvtn |
3080 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f32, .Cost: 2}, // bfcvtn+bfcvtn2 |
3081 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f64, .Cost: 2}, // bfcvtn+fcvtn |
3082 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f64, .Cost: 3}, // fcvtn+fcvtl2+bfcvtn |
3083 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f64, .Cost: 6}, // 2 * fcvtn+fcvtn2+bfcvtn |
3084 | }; |
3085 | |
3086 | if (ST->hasBF16()) |
3087 | if (const auto *Entry = ConvertCostTableLookup( |
3088 | Table: BF16Tbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT())) |
3089 | return AdjustCost(Entry->Cost); |
3090 | |
  // Symbolic constants for the SVE sitofp/uitofp entries in the table below.
3092 | // The cost of unpacking twice is artificially increased for now in order |
3093 | // to avoid regressions against NEON, which will use tbl instructions directly |
3094 | // instead of multiple layers of [s|u]unpk[lo|hi]. |
3095 | // We use the unpacks in cases where the destination type is illegal and |
3096 | // requires splitting of the input, even if the input type itself is legal. |
3097 | const unsigned int SVE_EXT_COST = 1; |
3098 | const unsigned int SVE_FCVT_COST = 1; |
3099 | const unsigned int SVE_UNPACK_ONCE = 4; |
3100 | const unsigned int SVE_UNPACK_TWICE = 16; |
3101 | |
3102 | static const TypeConversionCostTblEntry ConversionTbl[] = { |
3103 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i64, .Cost: 1}, // xtn |
3104 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i16, .Src: MVT::v2i64, .Cost: 1}, // xtn |
3105 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i32, .Src: MVT::v2i64, .Cost: 1}, // xtn |
3106 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i32, .Cost: 1}, // xtn |
3107 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i64, .Cost: 3}, // 2 xtn + 1 uzp1 |
3108 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i32, .Cost: 1}, // xtn |
3109 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i64, .Cost: 2}, // 1 uzp1 + 1 xtn |
3110 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: 1}, // 1 uzp1 |
3111 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i16, .Cost: 1}, // 1 xtn |
3112 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i32, .Cost: 2}, // 1 uzp1 + 1 xtn |
3113 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i64, .Cost: 4}, // 3 x uzp1 + xtn |
3114 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: 1}, // 1 uzp1 |
3115 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i64, .Cost: 3}, // 3 x uzp1 |
3116 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i32, .Src: MVT::v8i64, .Cost: 2}, // 2 x uzp1 |
3117 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: 1}, // uzp1 |
3118 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: 3}, // (2 + 1) x uzp1 |
3119 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i64, .Cost: 7}, // (4 + 2 + 1) x uzp1 |
3120 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: 2}, // 2 x uzp1 |
3121 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i64, .Cost: 6}, // (4 + 2) x uzp1 |
3122 | {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i32, .Src: MVT::v16i64, .Cost: 4}, // 4 x uzp1 |
3123 | |
3124 | // Truncations on nxvmiN |
3125 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i8, .Cost: 2}, |
3126 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i16, .Cost: 2}, |
3127 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i32, .Cost: 2}, |
3128 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i64, .Cost: 2}, |
3129 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i8, .Cost: 2}, |
3130 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i16, .Cost: 2}, |
3131 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i32, .Cost: 2}, |
3132 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i64, .Cost: 5}, |
3133 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i8, .Cost: 2}, |
3134 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i16, .Cost: 2}, |
3135 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i32, .Cost: 5}, |
3136 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i64, .Cost: 11}, |
3137 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i1, .Src: MVT::nxv16i8, .Cost: 2}, |
3138 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i16, .Cost: 0}, |
3139 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i32, .Cost: 0}, |
3140 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i64, .Cost: 0}, |
3141 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i16, .Src: MVT::nxv2i32, .Cost: 0}, |
3142 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i16, .Src: MVT::nxv2i64, .Cost: 0}, |
3143 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i32, .Src: MVT::nxv2i64, .Cost: 0}, |
3144 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i16, .Cost: 0}, |
3145 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i32, .Cost: 0}, |
3146 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i64, .Cost: 1}, |
3147 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i16, .Src: MVT::nxv4i32, .Cost: 0}, |
3148 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i16, .Src: MVT::nxv4i64, .Cost: 1}, |
3149 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i32, .Src: MVT::nxv4i64, .Cost: 1}, |
3150 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i16, .Cost: 0}, |
3151 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i32, .Cost: 1}, |
3152 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i64, .Cost: 3}, |
3153 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i16, .Src: MVT::nxv8i32, .Cost: 1}, |
3154 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i16, .Src: MVT::nxv8i64, .Cost: 3}, |
3155 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i16, .Cost: 1}, |
3156 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i32, .Cost: 3}, |
3157 | {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i64, .Cost: 7}, |
3158 | |
3159 | // The number of shll instructions for the extension. |
3160 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3}, |
3161 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3}, |
3162 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2}, |
3163 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2}, |
3164 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3}, |
3165 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3}, |
3166 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2}, |
3167 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2}, |
3168 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7}, |
3169 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7}, |
3170 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6}, |
3171 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6}, |
3172 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2}, |
3173 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2}, |
3174 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6}, |
3175 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6}, |
3176 | |
3177 | // FP Ext and trunc |
3178 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::f32, .Cost: 1}, // fcvt |
3179 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2f32, .Cost: 1}, // fcvtl |
3180 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4f32, .Cost: 2}, // fcvtl+fcvtl2 |
3181 | // FP16 |
3182 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::f32, .Src: MVT::f16, .Cost: 1}, // fcvt |
3183 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::f16, .Cost: 1}, // fcvt |
3184 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f32, .Src: MVT::v4f16, .Cost: 1}, // fcvtl |
3185 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f32, .Src: MVT::v8f16, .Cost: 2}, // fcvtl+fcvtl2 |
3186 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2f16, .Cost: 2}, // fcvtl+fcvtl |
3187 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4f16, .Cost: 3}, // fcvtl+fcvtl2+fcvtl |
3188 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8f16, .Cost: 6}, // 2 * fcvtl+fcvtl2+fcvtl |
3189 | // BF16 (uses shift) |
3190 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::f32, .Src: MVT::bf16, .Cost: 1}, // shl |
3191 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::bf16, .Cost: 2}, // shl+fcvt |
3192 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f32, .Src: MVT::v4bf16, .Cost: 1}, // shll |
3193 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f32, .Src: MVT::v8bf16, .Cost: 2}, // shll+shll2 |
3194 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2bf16, .Cost: 2}, // shll+fcvtl |
3195 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4bf16, .Cost: 3}, // shll+fcvtl+fcvtl2 |
3196 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8bf16, .Cost: 6}, // 2 * shll+fcvtl+fcvtl2 |
3197 | // FP Ext and trunc |
3198 | {.ISD: ISD::FP_ROUND, .Dst: MVT::f32, .Src: MVT::f64, .Cost: 1}, // fcvt |
3199 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v2f32, .Src: MVT::v2f64, .Cost: 1}, // fcvtn |
3200 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f32, .Src: MVT::v4f64, .Cost: 2}, // fcvtn+fcvtn2 |
3201 | // FP16 |
3202 | {.ISD: ISD::FP_ROUND, .Dst: MVT::f16, .Src: MVT::f32, .Cost: 1}, // fcvt |
3203 | {.ISD: ISD::FP_ROUND, .Dst: MVT::f16, .Src: MVT::f64, .Cost: 1}, // fcvt |
3204 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f16, .Src: MVT::v4f32, .Cost: 1}, // fcvtn |
3205 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v8f16, .Src: MVT::v8f32, .Cost: 2}, // fcvtn+fcvtn2 |
3206 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v2f16, .Src: MVT::v2f64, .Cost: 2}, // fcvtn+fcvtn |
3207 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f16, .Src: MVT::v4f64, .Cost: 3}, // fcvtn+fcvtn2+fcvtn |
3208 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v8f16, .Src: MVT::v8f64, .Cost: 6}, // 2 * fcvtn+fcvtn2+fcvtn |
3209 | // BF16 (more complex, with +bf16 is handled above) |
3210 | {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f32, .Cost: 8}, // Expansion is ~8 insns |
3211 | {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f64, .Cost: 9}, // fcvtn + above |
3212 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f32, .Cost: 8}, |
3213 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f32, .Cost: 8}, |
3214 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f32, .Cost: 15}, |
3215 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f64, .Cost: 9}, |
3216 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f64, .Cost: 10}, |
3217 | {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f64, .Cost: 19}, |
3218 | |
3219 | // LowerVectorINT_TO_FP: |
3220 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1}, |
3221 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1}, |
3222 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: 1}, |
3223 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1}, |
3224 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1}, |
3225 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: 1}, |
3226 | |
3227 | // SVE: to nxv2f16 |
3228 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i8, |
3229 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
3230 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST}, |
3231 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST}, |
3232 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST}, |
3233 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i8, |
3234 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
3235 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST}, |
3236 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST}, |
3237 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST}, |
3238 | |
3239 | // SVE: to nxv4f16 |
3240 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i8, |
3241 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
3242 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST}, |
3243 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST}, |
3244 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i8, |
3245 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
3246 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST}, |
3247 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST}, |
3248 | |
3249 | // SVE: to nxv8f16 |
3250 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i8, |
3251 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
3252 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i16, .Cost: SVE_FCVT_COST}, |
3253 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i8, |
3254 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
3255 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i16, .Cost: SVE_FCVT_COST}, |
3256 | |
3257 | // SVE: to nxv16f16 |
3258 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv16f16, .Src: MVT::nxv16i8, |
3259 | .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
3260 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv16f16, .Src: MVT::nxv16i8, |
3261 | .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
3262 | |
3263 | // Complex: to v2f32 |
3264 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3}, |
3265 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 3}, |
3266 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3}, |
3267 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 3}, |
3268 | |
3269 | // SVE: to nxv2f32 |
3270 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i8, |
3271 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
3272 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST}, |
3273 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST}, |
3274 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST}, |
3275 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i8, |
3276 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
3277 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST}, |
3278 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST}, |
3279 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST}, |
3280 | |
3281 | // Complex: to v4f32 |
3282 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 4}, |
3283 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2}, |
3284 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 3}, |
3285 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2}, |
3286 | |
3287 | // SVE: to nxv4f32 |
3288 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i8, |
3289 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
3290 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST}, |
3291 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST}, |
3292 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i8, |
3293 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
3294 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST}, |
      {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3296 | |
3297 | // Complex: to v8f32 |
3298 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10}, |
3299 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4}, |
3300 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10}, |
3301 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4}, |
3302 | |
3303 | // SVE: to nxv8f32 |
3304 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i8, |
3305 | .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
3306 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i16, |
3307 | .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
3308 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i8, |
3309 | .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
3310 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i16, |
3311 | .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
3312 | |
3313 | // SVE: to nxv16f32 |
3314 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv16f32, .Src: MVT::nxv16i8, |
3315 | .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST}, |
3316 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv16f32, .Src: MVT::nxv16i8, |
3317 | .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST}, |
3318 | |
3319 | // Complex: to v16f32 |
3320 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: 21}, |
3321 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: 21}, |
3322 | |
3323 | // Complex: to v2f64 |
3324 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4}, |
3325 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 4}, |
3326 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2}, |
3327 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4}, |
3328 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 4}, |
3329 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2}, |
3330 | |
3331 | // SVE: to nxv2f64 |
3332 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i8, |
3333 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
3334 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST}, |
3335 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST}, |
3336 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST}, |
3337 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i8, |
3338 | .Cost: SVE_EXT_COST + SVE_FCVT_COST}, |
3339 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST}, |
3340 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST}, |
3341 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST}, |
3342 | |
3343 | // Complex: to v4f64 |
3344 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: 4}, |
3345 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: 4}, |
3346 | |
3347 | // SVE: to nxv4f64 |
3348 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i8, |
3349 | .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
3350 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i16, |
3351 | .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
3352 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i32, |
3353 | .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
3354 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i8, |
3355 | .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
3356 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i16, |
3357 | .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
3358 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i32, |
3359 | .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST}, |
3360 | |
3361 | // SVE: to nxv8f64 |
3362 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i8, |
3363 | .Cost: SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST}, |
3364 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i16, |
3365 | .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST}, |
3366 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i8, |
3367 | .Cost: SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST}, |
3368 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i16, |
3369 | .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST}, |
3370 | |
3371 | // LowerVectorFP_TO_INT |
3372 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f32, .Cost: 1}, |
3373 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1}, |
3374 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: 1}, |
3375 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f32, .Cost: 1}, |
3376 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1}, |
3377 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: 1}, |
3378 | |
3379 | // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). |
3380 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f32, .Cost: 2}, |
3381 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 1}, |
3382 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 1}, |
3383 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f32, .Cost: 2}, |
3384 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 1}, |
3385 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 1}, |
3386 | |
3387 | // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 |
3388 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2}, |
3389 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 2}, |
3390 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2}, |
3391 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 2}, |
3392 | |
3393 | // Complex, from nxv2f32. |
3394 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f32, .Cost: 1}, |
3395 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 1}, |
3396 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f32, .Cost: 1}, |
3397 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f32, .Cost: 1}, |
3398 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f32, .Cost: 1}, |
3399 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 1}, |
3400 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f32, .Cost: 1}, |
3401 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f32, .Cost: 1}, |
3402 | |
3403 | // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. |
3404 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2}, |
3405 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f64, .Cost: 2}, |
3406 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f64, .Cost: 2}, |
3407 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2}, |
3408 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f64, .Cost: 2}, |
3409 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f64, .Cost: 2}, |
3410 | |
3411 | // Complex, from nxv2f64. |
3412 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f64, .Cost: 1}, |
3413 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f64, .Cost: 1}, |
3414 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f64, .Cost: 1}, |
3415 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f64, .Cost: 1}, |
3416 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i1, .Src: MVT::nxv2f64, .Cost: 1}, |
3417 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f64, .Cost: 1}, |
3418 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f64, .Cost: 1}, |
3419 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f64, .Cost: 1}, |
3420 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f64, .Cost: 1}, |
3421 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i1, .Src: MVT::nxv2f64, .Cost: 1}, |
3422 | |
3423 | // Complex, from nxv4f32. |
3424 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f32, .Cost: 4}, |
3425 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f32, .Cost: 1}, |
3426 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f32, .Cost: 1}, |
3427 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f32, .Cost: 1}, |
3428 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i1, .Src: MVT::nxv4f32, .Cost: 1}, |
3429 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f32, .Cost: 4}, |
3430 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f32, .Cost: 1}, |
3431 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f32, .Cost: 1}, |
3432 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f32, .Cost: 1}, |
3433 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i1, .Src: MVT::nxv4f32, .Cost: 1}, |
3434 | |
3435 | // Complex, from nxv8f64. Illegal -> illegal conversions not required. |
3436 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f64, .Cost: 7}, |
3437 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f64, .Cost: 7}, |
3438 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f64, .Cost: 7}, |
3439 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f64, .Cost: 7}, |
3440 | |
3441 | // Complex, from nxv4f64. Illegal -> illegal conversions not required. |
3442 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f64, .Cost: 3}, |
3443 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f64, .Cost: 3}, |
3444 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f64, .Cost: 3}, |
3445 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f64, .Cost: 3}, |
3446 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f64, .Cost: 3}, |
3447 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f64, .Cost: 3}, |
3448 | |
3449 | // Complex, from nxv8f32. Illegal -> illegal conversions not required. |
3450 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f32, .Cost: 3}, |
3451 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f32, .Cost: 3}, |
3452 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f32, .Cost: 3}, |
3453 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f32, .Cost: 3}, |
3454 | |
3455 | // Complex, from nxv8f16. |
3456 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i64, .Src: MVT::nxv8f16, .Cost: 10}, |
3457 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i32, .Src: MVT::nxv8f16, .Cost: 4}, |
3458 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f16, .Cost: 1}, |
3459 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f16, .Cost: 1}, |
3460 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i1, .Src: MVT::nxv8f16, .Cost: 1}, |
3461 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i64, .Src: MVT::nxv8f16, .Cost: 10}, |
3462 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i32, .Src: MVT::nxv8f16, .Cost: 4}, |
3463 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f16, .Cost: 1}, |
3464 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f16, .Cost: 1}, |
3465 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i1, .Src: MVT::nxv8f16, .Cost: 1}, |
3466 | |
3467 | // Complex, from nxv4f16. |
3468 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f16, .Cost: 4}, |
3469 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f16, .Cost: 1}, |
3470 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 1}, |
3471 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f16, .Cost: 1}, |
3472 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f16, .Cost: 4}, |
3473 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f16, .Cost: 1}, |
3474 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 1}, |
3475 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f16, .Cost: 1}, |
3476 | |
3477 | // Complex, from nxv2f16. |
3478 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f16, .Cost: 1}, |
3479 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f16, .Cost: 1}, |
3480 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 1}, |
3481 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f16, .Cost: 1}, |
3482 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f16, .Cost: 1}, |
3483 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f16, .Cost: 1}, |
3484 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 1}, |
3485 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f16, .Cost: 1}, |
3486 | |
3487 | // Truncate from nxvmf32 to nxvmf16. |
3488 | {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f16, .Src: MVT::nxv2f32, .Cost: 1}, |
3489 | {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f16, .Src: MVT::nxv4f32, .Cost: 1}, |
3490 | {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f16, .Src: MVT::nxv8f32, .Cost: 3}, |
3491 | |
3492 | // Truncate from nxvmf64 to nxvmf16. |
3493 | {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f16, .Src: MVT::nxv2f64, .Cost: 1}, |
3494 | {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f16, .Src: MVT::nxv4f64, .Cost: 3}, |
3495 | {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f16, .Src: MVT::nxv8f64, .Cost: 7}, |
3496 | |
3497 | // Truncate from nxvmf64 to nxvmf32. |
3498 | {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2f64, .Cost: 1}, |
3499 | {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4f64, .Cost: 3}, |
3500 | {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8f64, .Cost: 6}, |
3501 | |
3502 | // Extend from nxvmf16 to nxvmf32. |
3503 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2f16, .Cost: 1}, |
3504 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4f16, .Cost: 1}, |
3505 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8f16, .Cost: 2}, |
3506 | |
3507 | // Extend from nxvmf16 to nxvmf64. |
3508 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2f16, .Cost: 1}, |
3509 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4f16, .Cost: 2}, |
3510 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8f16, .Cost: 4}, |
3511 | |
3512 | // Extend from nxvmf32 to nxvmf64. |
3513 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2f32, .Cost: 1}, |
3514 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4f32, .Cost: 2}, |
3515 | {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8f32, .Cost: 6}, |
3516 | |
      // Bitcasts from integer to float
3518 | {.ISD: ISD::BITCAST, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: 0}, |
3519 | {.ISD: ISD::BITCAST, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: 0}, |
3520 | {.ISD: ISD::BITCAST, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: 0}, |
3521 | |
      // Bitcasts from float to integer
3523 | {.ISD: ISD::BITCAST, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 0}, |
3524 | {.ISD: ISD::BITCAST, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 0}, |
3525 | {.ISD: ISD::BITCAST, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 0}, |
3526 | |
      // Add cost for extending to illegal (too wide) scalable vectors.
3528 | // zero/sign extend are implemented by multiple unpack operations, |
3529 | // where each operation has a cost of 1. |
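      // For illustration (a sketch of the expected lowering, not taken from
      // the table's own comments): a zext from nxv16i8 to nxv16i16 is
      // expected to lower to a uunpklo/uunpkhi pair, matching the cost of 2
      // in the first entry below.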
3530 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i16, .Src: MVT::nxv16i8, .Cost: 2}, |
3531 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i32, .Src: MVT::nxv16i8, .Cost: 6}, |
3532 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i64, .Src: MVT::nxv16i8, .Cost: 14}, |
3533 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i16, .Cost: 2}, |
3534 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv8i64, .Src: MVT::nxv8i16, .Cost: 6}, |
3535 | {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv4i64, .Src: MVT::nxv4i32, .Cost: 2}, |
3536 | |
3537 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i16, .Src: MVT::nxv16i8, .Cost: 2}, |
3538 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i32, .Src: MVT::nxv16i8, .Cost: 6}, |
3539 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i64, .Src: MVT::nxv16i8, .Cost: 14}, |
3540 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i16, .Cost: 2}, |
3541 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv8i64, .Src: MVT::nxv8i16, .Cost: 6}, |
3542 | {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv4i64, .Src: MVT::nxv4i32, .Cost: 2}, |
3543 | }; |
3544 | |
  // We have to estimate the cost of a fixed-length operation on SVE registers
  // by scaling the cost of the equivalent operation on scalable vectors with
  // the number of SVE registers required to represent the fixed-length type.
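  // For illustration (assuming a 128-bit SVE granule): an fptrunc from v8f64
  // to v8f32 is costed as LT.first copies of the equivalent
  // nxv2f64 -> nxv2f32 conversion.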
3548 | EVT WiderTy = SrcTy.bitsGT(VT: DstTy) ? SrcTy : DstTy; |
3549 | if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() && |
3550 | SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() && |
3551 | ST->useSVEForFixedLengthVectors(VT: WiderTy)) { |
3552 | std::pair<InstructionCost, MVT> LT = |
3553 | getTypeLegalizationCost(Ty: WiderTy.getTypeForEVT(Context&: Dst->getContext())); |
3554 | unsigned NumElements = |
3555 | AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits(); |
3556 | return AdjustCost( |
3557 | LT.first * |
3558 | getCastInstrCost( |
3559 | Opcode, Dst: ScalableVectorType::get(ElementType: Dst->getScalarType(), MinNumElts: NumElements), |
3560 | Src: ScalableVectorType::get(ElementType: Src->getScalarType(), MinNumElts: NumElements), CCH, |
3561 | CostKind, I)); |
3562 | } |
3563 | |
3564 | if (const auto *Entry = ConvertCostTableLookup( |
3565 | Table: ConversionTbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT())) |
3566 | return AdjustCost(Entry->Cost); |
3567 | |
3568 | static const TypeConversionCostTblEntry FP16Tbl[] = { |
3569 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f16, .Cost: 1}, // fcvtzs |
3570 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f16, .Cost: 1}, |
3571 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f16, .Cost: 1}, // fcvtzs |
3572 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f16, .Cost: 1}, |
3573 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f16, .Cost: 2}, // fcvtl+fcvtzs |
3574 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f16, .Cost: 2}, |
3575 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i8, .Src: MVT::v8f16, .Cost: 2}, // fcvtzs+xtn |
3576 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i8, .Src: MVT::v8f16, .Cost: 2}, |
3577 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f16, .Cost: 1}, // fcvtzs |
3578 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f16, .Cost: 1}, |
3579 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f16, .Cost: 4}, // 2*fcvtl+2*fcvtzs |
3580 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f16, .Cost: 4}, |
3581 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v16f16, .Cost: 3}, // 2*fcvtzs+xtn |
3582 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v16f16, .Cost: 3}, |
3583 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v16f16, .Cost: 2}, // 2*fcvtzs |
3584 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v16f16, .Cost: 2}, |
3585 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i32, .Src: MVT::v16f16, .Cost: 8}, // 4*fcvtl+4*fcvtzs |
3586 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i32, .Src: MVT::v16f16, .Cost: 8}, |
3587 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f16, .Src: MVT::v8i8, .Cost: 2}, // ushll + ucvtf |
3588 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f16, .Src: MVT::v8i8, .Cost: 2}, // sshll + scvtf |
3589 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f16, .Src: MVT::v16i8, .Cost: 4}, // 2 * ushl(2) + 2 * ucvtf |
3590 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f16, .Src: MVT::v16i8, .Cost: 4}, // 2 * sshl(2) + 2 * scvtf |
3591 | }; |
3592 | |
3593 | if (ST->hasFullFP16()) |
3594 | if (const auto *Entry = ConvertCostTableLookup( |
3595 | Table: FP16Tbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT())) |
3596 | return AdjustCost(Entry->Cost); |
3597 | |
3598 | // INT_TO_FP of i64->f32 will scalarize, which is required to avoid |
3599 | // double-rounding issues. |
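  // For illustration, the following is costed as two scalar i64 -> f32
  // conversions plus the extract/insert scalarization overhead computed
  // below:
  //   %r = sitofp <2 x i64> %v to <2 x float>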
3600 | if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) && |
3601 | DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 && |
3602 | isa<FixedVectorType>(Val: Dst) && isa<FixedVectorType>(Val: Src)) |
3603 | return AdjustCost( |
3604 | cast<FixedVectorType>(Val: Dst)->getNumElements() * |
3605 | getCastInstrCost(Opcode, Dst: Dst->getScalarType(), Src: Src->getScalarType(), |
3606 | CCH, CostKind) + |
3607 | BaseT::getScalarizationOverhead(InTy: cast<FixedVectorType>(Val: Src), Insert: false, Extract: true, |
3608 | CostKind) + |
3609 | BaseT::getScalarizationOverhead(InTy: cast<FixedVectorType>(Val: Dst), Insert: true, Extract: false, |
3610 | CostKind)); |
3611 | |
3612 | if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) && |
3613 | CCH == TTI::CastContextHint::Masked && |
3614 | ST->isSVEorStreamingSVEAvailable() && |
3615 | TLI->getTypeAction(Context&: Src->getContext(), VT: SrcTy) == |
3616 | TargetLowering::TypePromoteInteger && |
3617 | TLI->getTypeAction(Context&: Dst->getContext(), VT: DstTy) == |
3618 | TargetLowering::TypeSplitVector) { |
3619 | // The standard behaviour in the backend for these cases is to split the |
3620 | // extend up into two parts: |
3621 | // 1. Perform an extending load or masked load up to the legal type. |
3622 | // 2. Extend the loaded data to the final type. |
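    // For illustration, a masked zext from nxv8i8 to nxv8i64 is costed as an
    // extending masked load up to the promoted type (nxv8i16) plus a
    // separate nxv8i16 -> nxv8i64 extend.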
3623 | std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: Src); |
3624 | Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Context&: Src->getContext()); |
3625 | InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost( |
3626 | Opcode, Dst: LegalTy, Src, CCH, CostKind, I); |
3627 | InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost( |
3628 | Opcode, Dst, Src: LegalTy, CCH: TTI::CastContextHint::None, CostKind, I); |
3629 | return Part1 + Part2; |
3630 | } |
3631 | |
3632 | // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal, |
3633 | // but we also want to include the TTI::CastContextHint::Masked case too. |
3634 | if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) && |
3635 | CCH == TTI::CastContextHint::Masked && |
3636 | ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(VT: DstTy)) |
3637 | CCH = TTI::CastContextHint::Normal; |
3638 | |
3639 | return AdjustCost( |
3640 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); |
3641 | } |
3642 | |
3643 | InstructionCost |
AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
3645 | VectorType *VecTy, unsigned Index, |
3646 | TTI::TargetCostKind CostKind) const { |
3647 | |
3648 | // Make sure we were given a valid extend opcode. |
3649 | assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && |
3650 | "Invalid opcode" ); |
3651 | |
3652 | // We are extending an element we extract from a vector, so the source type |
3653 | // of the extend is the element type of the vector. |
3654 | auto *Src = VecTy->getElementType(); |
3655 | |
3656 | // Sign- and zero-extends are for integer types only. |
3657 | assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type" ); |
3658 | |
3659 | // Get the cost for the extract. We compute the cost (if any) for the extend |
3660 | // below. |
3661 | InstructionCost Cost = getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, |
3662 | CostKind, Index, Op0: nullptr, Op1: nullptr); |
3663 | |
3664 | // Legalize the types. |
3665 | auto VecLT = getTypeLegalizationCost(Ty: VecTy); |
3666 | auto DstVT = TLI->getValueType(DL, Ty: Dst); |
3667 | auto SrcVT = TLI->getValueType(DL, Ty: Src); |
3668 | |
3669 | // If the resulting type is still a vector and the destination type is legal, |
3670 | // we may get the extension for free. If not, get the default cost for the |
3671 | // extend. |
3672 | if (!VecLT.second.isVector() || !TLI->isTypeLegal(VT: DstVT)) |
3673 | return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None, |
3674 | CostKind); |
3675 | |
3676 | // The destination type should be larger than the element type. If not, get |
3677 | // the default cost for the extend. |
3678 | if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) |
3679 | return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None, |
3680 | CostKind); |
3681 | |
3682 | switch (Opcode) { |
3683 | default: |
3684 | llvm_unreachable("Opcode should be either SExt or ZExt" ); |
3685 | |
3686 | // For sign-extends, we only need a smov, which performs the extension |
3687 | // automatically. |
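  // For illustration, sext (extractelement <8 x i16> %v, i64 0) to i32 can be
  // selected as a single smov, so only the extract cost is returned.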
3688 | case Instruction::SExt: |
3689 | return Cost; |
3690 | |
3691 | // For zero-extends, the extend is performed automatically by a umov unless |
3692 | // the destination type is i64 and the element type is i8 or i16. |
3693 | case Instruction::ZExt: |
3694 | if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) |
3695 | return Cost; |
3696 | } |
3697 | |
3698 | // If we are unable to perform the extend for free, get the default cost. |
3699 | return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None, |
3700 | CostKind); |
3701 | } |
3702 | |
3703 | InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, |
3704 | TTI::TargetCostKind CostKind, |
3705 | const Instruction *I) const { |
3706 | if (CostKind != TTI::TCK_RecipThroughput) |
3707 | return Opcode == Instruction::PHI ? 0 : 1; |
3708 | assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind" ); |
3709 | // Branches are assumed to be predicted. |
3710 | return 0; |
3711 | } |
3712 | |
3713 | InstructionCost AArch64TTIImpl::getVectorInstrCostHelper( |
3714 | unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, |
3715 | bool HasRealUse, const Instruction *I, Value *Scalar, |
3716 | ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const { |
3717 | assert(Val->isVectorTy() && "This must be a vector type" ); |
3718 | |
3719 | if (Index != -1U) { |
3720 | // Legalize the type. |
3721 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Val); |
3722 | |
3723 | // This type is legalized to a scalar type. |
3724 | if (!LT.second.isVector()) |
3725 | return 0; |
3726 | |
3727 | // The type may be split. For fixed-width vectors we can normalize the |
3728 | // index to the new type. |
3729 | if (LT.second.isFixedLengthVector()) { |
3730 | unsigned Width = LT.second.getVectorNumElements(); |
3731 | Index = Index % Width; |
3732 | } |
3733 | |
3734 | // The element at index zero is already inside the vector. |
3735 | // - For a physical (HasRealUse==true) insert-element or extract-element |
3736 | // instruction that extracts integers, an explicit FPR -> GPR move is |
3737 | // needed. So it has non-zero cost. |
3738 | // - For the rest of cases (virtual instruction or element type is float), |
3739 | // consider the instruction free. |
3740 | if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy())) |
3741 | return 0; |
3742 | |
    // This recognises an LD1 (single-element structure to one lane of one
    // register) instruction. I.e., if this is an `insertelement` instruction,
    // and its second operand is a load, then we will generate a LD1, which
    // is an expensive instruction.
3747 | if (I && dyn_cast<LoadInst>(Val: I->getOperand(i: 1))) |
3748 | return CostKind == TTI::TCK_CodeSize |
3749 | ? 0 |
3750 | : ST->getVectorInsertExtractBaseCost() + 1; |
3751 | |
3752 | // i1 inserts and extract will include an extra cset or cmp of the vector |
3753 | // value. Increase the cost by 1 to account. |
3754 | if (Val->getScalarSizeInBits() == 1) |
3755 | return CostKind == TTI::TCK_CodeSize |
3756 | ? 2 |
3757 | : ST->getVectorInsertExtractBaseCost() + 1; |
3758 | |
3759 | // FIXME: |
3760 | // If the extract-element and insert-element instructions could be |
3761 | // simplified away (e.g., could be combined into users by looking at use-def |
3762 | // context), they have no cost. This is not done in the first place for |
3763 | // compile-time considerations. |
3764 | } |
3765 | |
3766 | // In case of Neon, if there exists extractelement from lane != 0 such that |
3767 | // 1. extractelement does not necessitate a move from vector_reg -> GPR. |
3768 | // 2. extractelement result feeds into fmul. |
3769 | // 3. Other operand of fmul is an extractelement from lane 0 or lane |
3770 | // equivalent to 0. |
3771 | // then the extractelement can be merged with fmul in the backend and it |
3772 | // incurs no cost. |
3773 | // e.g. |
3774 | // define double @foo(<2 x double> %a) { |
3775 | // %1 = extractelement <2 x double> %a, i32 0 |
3776 | // %2 = extractelement <2 x double> %a, i32 1 |
3777 | // %res = fmul double %1, %2 |
3778 | // ret double %res |
3779 | // } |
3780 | // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1] |
  auto ExtractCanFuseWithFmul = [&]() {
3782 | // We bail out if the extract is from lane 0. |
3783 | if (Index == 0) |
3784 | return false; |
3785 | |
3786 | // Check if the scalar element type of the vector operand of ExtractElement |
3787 | // instruction is one of the allowed types. |
3788 | auto IsAllowedScalarTy = [&](const Type *T) { |
3789 | return T->isFloatTy() || T->isDoubleTy() || |
3790 | (T->isHalfTy() && ST->hasFullFP16()); |
3791 | }; |
3792 | |
3793 | // Check if the extractelement user is scalar fmul. |
3794 | auto IsUserFMulScalarTy = [](const Value *EEUser) { |
3795 | // Check if the user is scalar fmul. |
3796 | const auto *BO = dyn_cast<BinaryOperator>(Val: EEUser); |
3797 | return BO && BO->getOpcode() == BinaryOperator::FMul && |
3798 | !BO->getType()->isVectorTy(); |
3799 | }; |
3800 | |
3801 | // Check if the extract index is from lane 0 or lane equivalent to 0 for a |
3802 | // certain scalar type and a certain vector register width. |
    auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
3804 | auto RegWidth = |
3805 | getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector) |
3806 | .getFixedValue(); |
3807 | return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0); |
3808 | }; |
3809 | |
3810 | // Check if the type constraints on input vector type and result scalar type |
3811 | // of extractelement instruction are satisfied. |
3812 | if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType())) |
3813 | return false; |
3814 | |
3815 | if (Scalar) { |
      DenseMap<User *, unsigned> UserToExtractIdx;
3817 | for (auto *U : Scalar->users()) { |
3818 | if (!IsUserFMulScalarTy(U)) |
3819 | return false; |
3820 | // Recording entry for the user is important. Index value is not |
3821 | // important. |
3822 | UserToExtractIdx[U]; |
3823 | } |
3824 | if (UserToExtractIdx.empty()) |
3825 | return false; |
3826 | for (auto &[S, U, L] : ScalarUserAndIdx) { |
3827 | for (auto *U : S->users()) { |
3828 | if (UserToExtractIdx.contains(Val: U)) { |
3829 | auto *FMul = cast<BinaryOperator>(Val: U); |
3830 | auto *Op0 = FMul->getOperand(i_nocapture: 0); |
3831 | auto *Op1 = FMul->getOperand(i_nocapture: 1); |
3832 | if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) { |
3833 | UserToExtractIdx[U] = L; |
3834 | break; |
3835 | } |
3836 | } |
3837 | } |
3838 | } |
3839 | for (auto &[U, L] : UserToExtractIdx) { |
3840 | if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) && |
3841 | !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits())) |
3842 | return false; |
3843 | } |
3844 | } else { |
3845 | const auto *EE = cast<ExtractElementInst>(Val: I); |
3846 | |
3847 | const auto *IdxOp = dyn_cast<ConstantInt>(Val: EE->getIndexOperand()); |
3848 | if (!IdxOp) |
3849 | return false; |
3850 | |
3851 | return !EE->users().empty() && all_of(Range: EE->users(), P: [&](const User *U) { |
3852 | if (!IsUserFMulScalarTy(U)) |
3853 | return false; |
3854 | |
3855 | // Check if the other operand of extractelement is also extractelement |
3856 | // from lane equivalent to 0. |
3857 | const auto *BO = cast<BinaryOperator>(Val: U); |
3858 | const auto *OtherEE = dyn_cast<ExtractElementInst>( |
3859 | Val: BO->getOperand(i_nocapture: 0) == EE ? BO->getOperand(i_nocapture: 1) : BO->getOperand(i_nocapture: 0)); |
3860 | if (OtherEE) { |
3861 | const auto *IdxOp = dyn_cast<ConstantInt>(Val: OtherEE->getIndexOperand()); |
3862 | if (!IdxOp) |
3863 | return false; |
3864 | return IsExtractLaneEquivalentToZero( |
3865 | cast<ConstantInt>(Val: OtherEE->getIndexOperand()) |
3866 | ->getValue() |
3867 | .getZExtValue(), |
3868 | OtherEE->getType()->getScalarSizeInBits()); |
3869 | } |
3870 | return true; |
3871 | }); |
3872 | } |
3873 | return true; |
3874 | }; |
3875 | |
3876 | if (Opcode == Instruction::ExtractElement && (I || Scalar) && |
3877 | ExtractCanFuseWithFmul()) |
3878 | return 0; |
3879 | |
3880 | // All other insert/extracts cost this much. |
3881 | return CostKind == TTI::TCK_CodeSize ? 1 |
3882 | : ST->getVectorInsertExtractBaseCost(); |
3883 | } |
3884 | |
3885 | InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, |
3886 | TTI::TargetCostKind CostKind, |
3887 | unsigned Index, |
3888 | const Value *Op0, |
3889 | const Value *Op1) const { |
3890 | bool HasRealUse = |
3891 | Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Val: Op0); |
3892 | return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse); |
3893 | } |
3894 | |
3895 | InstructionCost AArch64TTIImpl::getVectorInstrCost( |
3896 | unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, |
3897 | Value *Scalar, |
3898 | ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const { |
3899 | return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse: false, I: nullptr, |
3900 | Scalar, ScalarUserAndIdx); |
3901 | } |
3902 | |
3903 | InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I, |
3904 | Type *Val, |
3905 | TTI::TargetCostKind CostKind, |
3906 | unsigned Index) const { |
3907 | return getVectorInstrCostHelper(Opcode: I.getOpcode(), Val, CostKind, Index, |
3908 | HasRealUse: true /* HasRealUse */, I: &I); |
3909 | } |
3910 | |
3911 | InstructionCost AArch64TTIImpl::getScalarizationOverhead( |
    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
3913 | TTI::TargetCostKind CostKind, bool ForPoisonSrc, |
3914 | ArrayRef<Value *> VL) const { |
3915 | if (isa<ScalableVectorType>(Val: Ty)) |
3916 | return InstructionCost::getInvalid(); |
3917 | if (Ty->getElementType()->isFloatingPointTy()) |
3918 | return BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, Insert, Extract, |
3919 | CostKind); |
3920 | unsigned VecInstCost = |
3921 | CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost(); |
3922 | return DemandedElts.popcount() * (Insert + Extract) * VecInstCost; |
3923 | } |
3924 | |
3925 | InstructionCost AArch64TTIImpl::getArithmeticInstrCost( |
3926 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
3927 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, |
3928 | ArrayRef<const Value *> Args, const Instruction *CxtI) const { |
3929 | |
3930 | // The code-generator is currently not able to handle scalable vectors |
3931 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
3932 | // it. This change will be removed when code-generation for these types is |
3933 | // sufficiently reliable. |
3934 | if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty)) |
3935 | if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
3936 | return InstructionCost::getInvalid(); |
3937 | |
3938 | // TODO: Handle more cost kinds. |
3939 | if (CostKind != TTI::TCK_RecipThroughput) |
3940 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, |
3941 | Opd2Info: Op2Info, Args, CxtI); |
3942 | |
3943 | // Legalize the type. |
3944 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
3945 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
3946 | |
3947 | switch (ISD) { |
3948 | default: |
3949 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, |
3950 | Opd2Info: Op2Info); |
3951 | case ISD::SREM: |
3952 | case ISD::SDIV: |
3953 | /* |
3954 | Notes for sdiv/srem specific costs: |
3955 | 1. This only considers the cases where the divisor is constant, uniform and |
3956 | (pow-of-2/non-pow-of-2). Other cases are not important since they either |
3957 | result in some form of (ldr + adrp), corresponding to constant vectors, or |
3958 | scalarization of the division operation. |
3959 | 2. Constant divisors, either negative in whole or partially, don't result in |
3960 | significantly different codegen as compared to positive constant divisors. |
3961 | So, we don't consider negative divisors separately. |
3962 | 3. If the codegen is significantly different with SVE, it has been indicated |
3963 | using comments at appropriate places. |
3964 | |
3965 | sdiv specific cases: |
3966 | ----------------------------------------------------------------------- |
3967 | codegen | pow-of-2 | Type |
3968 | ----------------------------------------------------------------------- |
3969 | add + cmp + csel + asr | Y | i64 |
3970 | add + cmp + csel + asr | Y | i32 |
3971 | ----------------------------------------------------------------------- |
3972 | |
3973 | srem specific cases: |
3974 | ----------------------------------------------------------------------- |
3975 | codegen | pow-of-2 | Type |
3976 | ----------------------------------------------------------------------- |
3977 | negs + and + and + csneg | Y | i64 |
3978 | negs + and + and + csneg | Y | i32 |
3979 | ----------------------------------------------------------------------- |
3980 | |
3981 | other sdiv/srem cases: |
3982 | ------------------------------------------------------------------------- |
3983 | common codegen | + srem | + sdiv | pow-of-2 | Type |
3984 | ------------------------------------------------------------------------- |
3985 | smulh + asr + add + add | - | - | N | i64 |
3986 | smull + lsr + add + add | - | - | N | i32 |
3987 | usra | and + sub | sshr | Y | <2 x i64> |
3988 | 2 * (scalar code) | - | - | N | <2 x i64> |
3989 | usra | bic + sub | sshr + neg | Y | <4 x i32> |
3990 | smull2 + smull + uzp2 | mls | - | N | <4 x i32> |
3991 | + sshr + usra | | | | |
3992 | ------------------------------------------------------------------------- |
3993 | */ |
3994 | if (Op2Info.isConstant() && Op2Info.isUniform()) { |
3995 | InstructionCost AddCost = |
3996 | getArithmeticInstrCost(Opcode: Instruction::Add, Ty, CostKind, |
3997 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
3998 | InstructionCost AsrCost = |
3999 | getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind, |
4000 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
4001 | InstructionCost MulCost = |
4002 | getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind, |
4003 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
4004 | // add/cmp/csel/csneg should have similar cost while asr/negs/and should |
4005 | // have similar cost. |
4006 | auto VT = TLI->getValueType(DL, Ty); |
4007 | if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) { |
4008 | if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) { |
4009 | // Neg can be folded into the asr instruction. |
4010 | return ISD == ISD::SDIV ? (3 * AddCost + AsrCost) |
4011 | : (3 * AsrCost + AddCost); |
4012 | } else { |
4013 | return MulCost + AsrCost + 2 * AddCost; |
4014 | } |
4015 | } else if (VT.isVector()) { |
4016 | InstructionCost UsraCost = 2 * AsrCost; |
4017 | if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) { |
4018 | // Division with scalable types corresponds to native 'asrd' |
4019 | // instruction when SVE is available. |
4020 | // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8) |
4021 | |
4022 | // One more for the negation in SDIV |
4023 | InstructionCost Cost = |
4024 | (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0; |
4025 | if (Ty->isScalableTy() && ST->hasSVE()) |
4026 | Cost += 2 * AsrCost; |
4027 | else { |
4028 | Cost += |
4029 | UsraCost + |
4030 | (ISD == ISD::SDIV |
4031 | ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost |
4032 | : 2 * AddCost); |
4033 | } |
4034 | return Cost; |
4035 | } else if (LT.second == MVT::v2i64) { |
4036 | return VT.getVectorNumElements() * |
4037 | getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind, |
4038 | Op1Info: Op1Info.getNoProps(), |
4039 | Op2Info: Op2Info.getNoProps()); |
4040 | } else { |
4041 | // When SVE is available, we get: |
4042 | // smulh + lsr + add/sub + asr + add/sub. |
4043 | if (Ty->isScalableTy() && ST->hasSVE()) |
4044 | return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost; |
4045 | return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost; |
4046 | } |
4047 | } |
4048 | } |
4049 | if (Op2Info.isConstant() && !Op2Info.isUniform() && |
4050 | LT.second.isFixedLengthVector()) { |
      // FIXME: When the constant vector is non-uniform, this may result in
      // loading the vector from the constant pool or, in some cases, in
      // scalarization of the operation. For now, we approximate this with the
      // scalarization cost.
      auto ExtractCost =
          2 * getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty,
                                 CostKind, Index: -1, Op0: nullptr, Op1: nullptr);
4057 | auto InsertCost = getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty, |
4058 | CostKind, Index: -1, Op0: nullptr, Op1: nullptr); |
4059 | unsigned NElts = cast<FixedVectorType>(Val: Ty)->getNumElements(); |
4060 | return ExtractCost + InsertCost + |
4061 | NElts * getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), |
4062 | CostKind, Op1Info: Op1Info.getNoProps(), |
4063 | Op2Info: Op2Info.getNoProps()); |
4064 | } |
4065 | [[fallthrough]]; |
4066 | case ISD::UDIV: |
4067 | case ISD::UREM: { |
4068 | auto VT = TLI->getValueType(DL, Ty); |
4069 | if (Op2Info.isConstant()) { |
4070 | // If the operand is a power of 2 we can use the shift or and cost. |
4071 | if (ISD == ISD::UDIV && Op2Info.isPowerOf2()) |
4072 | return getArithmeticInstrCost(Opcode: Instruction::LShr, Ty, CostKind, |
4073 | Op1Info: Op1Info.getNoProps(), |
4074 | Op2Info: Op2Info.getNoProps()); |
4075 | if (ISD == ISD::UREM && Op2Info.isPowerOf2()) |
4076 | return getArithmeticInstrCost(Opcode: Instruction::And, Ty, CostKind, |
4077 | Op1Info: Op1Info.getNoProps(), |
4078 | Op2Info: Op2Info.getNoProps()); |
4079 | |
4080 | if (ISD == ISD::UDIV || ISD == ISD::UREM) { |
4081 | // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL. |
4082 | // The MULHU will be expanded to UMULL for the types not listed below, |
4083 | // and will become a pair of UMULL+MULL2 for 128bit vectors. |
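        // For illustration, %q = udiv i32 %x, 7 expands roughly to a
        // umull/lsr pair (the MULHU), then sub + lsr + add + lsr, as
        // described above.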
4084 | bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 || |
4085 | LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 || |
4086 | LT.second == MVT::nxv16i8; |
4087 | bool Is128bit = LT.second.is128BitVector(); |
4088 | |
4089 | InstructionCost MulCost = |
4090 | getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind, |
4091 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
4092 | InstructionCost AddCost = |
4093 | getArithmeticInstrCost(Opcode: Instruction::Add, Ty, CostKind, |
4094 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
4095 | InstructionCost ShrCost = |
4096 | getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind, |
4097 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
4098 | InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH |
4099 | (HasMULH ? 0 : ShrCost) + // UMULL shift |
4100 | AddCost * 2 + ShrCost; |
4101 | return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0); |
4102 | } |
4103 | } |
4104 | |
4105 | // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are |
4106 | // emitted by the backend even when those functions are not declared in the |
4107 | // module. |
4108 | if (!VT.isVector() && VT.getSizeInBits() > 64) |
4109 | return getCallInstrCost(/*Function*/ F: nullptr, RetTy: Ty, Tys: {Ty, Ty}, CostKind); |
4110 | |
4111 | InstructionCost Cost = BaseT::getArithmeticInstrCost( |
4112 | Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info); |
4113 | if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) { |
4114 | if (TLI->isOperationLegalOrCustom(Op: ISD, VT: LT.second) && ST->hasSVE()) { |
        // When SDIV/UDIV operations are lowered using SVE, the cost is
        // lower.
4117 | if (isa<FixedVectorType>(Val: Ty) && cast<FixedVectorType>(Val: Ty) |
4118 | ->getPrimitiveSizeInBits() |
4119 | .getFixedValue() < 128) { |
4120 | EVT VT = TLI->getValueType(DL, Ty); |
4121 | static const CostTblEntry DivTbl[]{ |
4122 | {.ISD: ISD::SDIV, .Type: MVT::v2i8, .Cost: 5}, {.ISD: ISD::SDIV, .Type: MVT::v4i8, .Cost: 8}, |
4123 | {.ISD: ISD::SDIV, .Type: MVT::v8i8, .Cost: 8}, {.ISD: ISD::SDIV, .Type: MVT::v2i16, .Cost: 5}, |
4124 | {.ISD: ISD::SDIV, .Type: MVT::v4i16, .Cost: 5}, {.ISD: ISD::SDIV, .Type: MVT::v2i32, .Cost: 1}, |
4125 | {.ISD: ISD::UDIV, .Type: MVT::v2i8, .Cost: 5}, {.ISD: ISD::UDIV, .Type: MVT::v4i8, .Cost: 8}, |
4126 | {.ISD: ISD::UDIV, .Type: MVT::v8i8, .Cost: 8}, {.ISD: ISD::UDIV, .Type: MVT::v2i16, .Cost: 5}, |
4127 | {.ISD: ISD::UDIV, .Type: MVT::v4i16, .Cost: 5}, {.ISD: ISD::UDIV, .Type: MVT::v2i32, .Cost: 1}}; |
4128 | |
4129 | const auto *Entry = CostTableLookup(Table: DivTbl, ISD, Ty: VT.getSimpleVT()); |
4130 | if (nullptr != Entry) |
4131 | return Entry->Cost; |
4132 | } |
4133 | // For 8/16-bit elements, the cost is higher because the type |
4134 | // requires promotion and possibly splitting: |
4135 | if (LT.second.getScalarType() == MVT::i8) |
4136 | Cost *= 8; |
4137 | else if (LT.second.getScalarType() == MVT::i16) |
4138 | Cost *= 4; |
4139 | return Cost; |
4140 | } else { |
4141 | // If one of the operands is a uniform constant then the cost for each |
4142 | // element is Cost for insertion, extraction and division. |
4143 | // Insertion cost = 2, Extraction Cost = 2, Division = cost for the |
4144 | // operation with scalar type |
4145 | if ((Op1Info.isConstant() && Op1Info.isUniform()) || |
4146 | (Op2Info.isConstant() && Op2Info.isUniform())) { |
4147 | if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) { |
4148 | InstructionCost DivCost = BaseT::getArithmeticInstrCost( |
4149 | Opcode, Ty: Ty->getScalarType(), CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info); |
4150 | return (4 + DivCost) * VTy->getNumElements(); |
4151 | } |
4152 | } |
4153 | // On AArch64, without SVE, vector divisions are expanded |
4154 | // into scalar divisions of each pair of elements. |
4155 | Cost += getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, CostKind, |
4156 | Index: -1, Op0: nullptr, Op1: nullptr); |
4157 | Cost += getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty, CostKind, Index: -1, |
4158 | Op0: nullptr, Op1: nullptr); |
4159 | } |
4160 | |
4161 | // TODO: if one of the arguments is scalar, then it's not necessary to |
4162 | // double the cost of handling the vector elements. |
4163 | Cost += Cost; |
4164 | } |
4165 | return Cost; |
4166 | } |
4167 | case ISD::MUL: |
    // When SVE is available, we can lower the v2i64 operation using
    // the SVE mul instruction, which has a lower cost.
4170 | if (LT.second == MVT::v2i64 && ST->hasSVE()) |
4171 | return LT.first; |
4172 | |
4173 | // When SVE is not available, there is no MUL.2d instruction, |
4174 | // which means mul <2 x i64> is expensive as elements are extracted |
4175 | // from the vectors and the muls scalarized. |
4176 | // As getScalarizationOverhead is a bit too pessimistic, we |
4177 | // estimate the cost for a i64 vector directly here, which is: |
4178 | // - four 2-cost i64 extracts, |
4179 | // - two 2-cost i64 inserts, and |
4180 | // - two 1-cost muls. |
    // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
    // LT.first = 2 the cost is 28. If both operands are extensions it will not
    // need to scalarize so the cost can be cheaper (smull or umull).
4185 | if (LT.second != MVT::v2i64 || isWideningInstruction(DstTy: Ty, Opcode, Args)) |
4186 | return LT.first; |
4187 | return cast<VectorType>(Val: Ty)->getElementCount().getKnownMinValue() * |
4188 | (getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind) + |
4189 | getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, CostKind, Index: -1, |
4190 | Op0: nullptr, Op1: nullptr) * |
4191 | 2 + |
4192 | getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty, CostKind, Index: -1, |
4193 | Op0: nullptr, Op1: nullptr)); |
4194 | case ISD::ADD: |
4195 | case ISD::XOR: |
4196 | case ISD::OR: |
4197 | case ISD::AND: |
4198 | case ISD::SRL: |
4199 | case ISD::SRA: |
4200 | case ISD::SHL: |
4201 | // These nodes are marked as 'custom' for combining purposes only. |
4202 | // We know that they are legal. See LowerAdd in ISelLowering. |
4203 | return LT.first; |
4204 | |
4205 | case ISD::FNEG: |
4206 | // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul |
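    // For illustration, fneg(fmul(%a, %b)) can be selected as a single fnmul,
    // so the fneg itself is treated as free here.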
4207 | if ((Ty->isFloatTy() || Ty->isDoubleTy() || |
4208 | (Ty->isHalfTy() && ST->hasFullFP16())) && |
4209 | CxtI && |
4210 | ((CxtI->hasOneUse() && |
4211 | match(V: *CxtI->user_begin(), P: m_FMul(L: m_Value(), R: m_Value()))) || |
4212 | match(V: CxtI->getOperand(i: 0), P: m_FMul(L: m_Value(), R: m_Value())))) |
4213 | return 0; |
4214 | [[fallthrough]]; |
4215 | case ISD::FADD: |
4216 | case ISD::FSUB: |
4217 | // Increase the cost for half and bfloat types if not architecturally |
4218 | // supported. |
4219 | if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) || |
4220 | (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16())) |
4221 | return 2 * LT.first; |
4222 | if (!Ty->getScalarType()->isFP128Ty()) |
4223 | return LT.first; |
4224 | [[fallthrough]]; |
4225 | case ISD::FMUL: |
4226 | case ISD::FDIV: |
4227 | // These nodes are marked as 'custom' just to lower them to SVE. |
4228 | // We know said lowering will incur no additional cost. |
4229 | if (!Ty->getScalarType()->isFP128Ty()) |
4230 | return 2 * LT.first; |
4231 | |
4232 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, |
4233 | Opd2Info: Op2Info); |
4234 | case ISD::FREM: |
4235 | // Pass nullptr as fmod/fmodf calls are emitted by the backend even when |
4236 | // those functions are not declared in the module. |
4237 | if (!Ty->isVectorTy()) |
4238 | return getCallInstrCost(/*Function*/ F: nullptr, RetTy: Ty, Tys: {Ty, Ty}, CostKind); |
4239 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, |
4240 | Opd2Info: Op2Info); |
4241 | } |
4242 | } |
4243 | |
4244 | InstructionCost |
4245 | AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, |
4246 | const SCEV *Ptr) const { |
4247 | // Address computations in vectorized code with non-consecutive addresses will |
4248 | // likely result in more instructions compared to scalar code where the |
4249 | // computation can more often be merged into the index mode. The resulting |
4250 | // extra micro-ops can significantly decrease throughput. |
4251 | unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead; |
4252 | int MaxMergeDistance = 64; |
4253 | |
4254 | if (Ty->isVectorTy() && SE && |
4255 | !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MergeDistance: MaxMergeDistance + 1)) |
4256 | return NumVectorInstToHideOverhead; |
4257 | |
4258 | // In many cases the address computation is not merged into the instruction |
4259 | // addressing mode. |
4260 | return 1; |
4261 | } |
4262 | |
4263 | InstructionCost AArch64TTIImpl::getCmpSelInstrCost( |
4264 | unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, |
4265 | TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, |
4266 | TTI::OperandValueInfo Op2Info, const Instruction *I) const { |
4267 | // TODO: Handle other cost kinds. |
4268 | if (CostKind != TTI::TCK_RecipThroughput) |
4269 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
4270 | Op1Info, Op2Info, I); |
4271 | |
4272 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
4273 | // We don't lower some vector selects well that are wider than the register |
4274 | // width. |
4275 | if (isa<FixedVectorType>(Val: ValTy) && ISD == ISD::SELECT) { |
4276 | // We would need this many instructions to hide the scalarization happening. |
4277 | const int AmortizationCost = 20; |
4278 | |
4279 | // If VecPred is not set, check if we can get a predicate from the context |
4280 | // instruction, if its type matches the requested ValTy. |
4281 | if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) { |
4282 | CmpPredicate CurrentPred; |
4283 | if (match(V: I, P: m_Select(C: m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value()), L: m_Value(), |
4284 | R: m_Value()))) |
4285 | VecPred = CurrentPred; |
4286 | } |
4287 | // Check if we have a compare/select chain that can be lowered using |
4288 | // a (F)CMxx & BFI pair. |
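    // For illustration, select (icmp sgt <4 x i32> %a, %b), %a, %b can be
    // lowered with a cmgt + bsl pair (or a single smax), so the cost returned
    // is just LT.first.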
4289 | if (CmpInst::isIntPredicate(P: VecPred) || VecPred == CmpInst::FCMP_OLE || |
4290 | VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT || |
4291 | VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ || |
4292 | VecPred == CmpInst::FCMP_UNE) { |
4293 | static const auto ValidMinMaxTys = { |
4294 | MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, |
4295 | MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64}; |
4296 | static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16}; |
4297 | |
4298 | auto LT = getTypeLegalizationCost(Ty: ValTy); |
      if (any_of(Range: ValidMinMaxTys, P: [&LT](MVT M) { return M == LT.second; }) ||
          (ST->hasFullFP16() &&
           any_of(Range: ValidFP16MinMaxTys, P: [&LT](MVT M) { return M == LT.second; })))
4302 | return LT.first; |
4303 | } |
4304 | |
4305 | static const TypeConversionCostTblEntry |
4306 | VectorSelectTbl[] = { |
4307 | { .ISD: ISD::SELECT, .Dst: MVT::v2i1, .Src: MVT::v2f32, .Cost: 2 }, |
4308 | { .ISD: ISD::SELECT, .Dst: MVT::v2i1, .Src: MVT::v2f64, .Cost: 2 }, |
4309 | { .ISD: ISD::SELECT, .Dst: MVT::v4i1, .Src: MVT::v4f32, .Cost: 2 }, |
4310 | { .ISD: ISD::SELECT, .Dst: MVT::v4i1, .Src: MVT::v4f16, .Cost: 2 }, |
4311 | { .ISD: ISD::SELECT, .Dst: MVT::v8i1, .Src: MVT::v8f16, .Cost: 2 }, |
4312 | { .ISD: ISD::SELECT, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: 16 }, |
4313 | { .ISD: ISD::SELECT, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: 8 }, |
4314 | { .ISD: ISD::SELECT, .Dst: MVT::v16i1, .Src: MVT::v16i32, .Cost: 16 }, |
4315 | { .ISD: ISD::SELECT, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: 4 * AmortizationCost }, |
4316 | { .ISD: ISD::SELECT, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: 8 * AmortizationCost }, |
4317 | { .ISD: ISD::SELECT, .Dst: MVT::v16i1, .Src: MVT::v16i64, .Cost: 16 * AmortizationCost } |
4318 | }; |
4319 | |
4320 | EVT SelCondTy = TLI->getValueType(DL, Ty: CondTy); |
4321 | EVT SelValTy = TLI->getValueType(DL, Ty: ValTy); |
4322 | if (SelCondTy.isSimple() && SelValTy.isSimple()) { |
4323 | if (const auto *Entry = ConvertCostTableLookup(Table: VectorSelectTbl, ISD, |
4324 | Dst: SelCondTy.getSimpleVT(), |
4325 | Src: SelValTy.getSimpleVT())) |
4326 | return Entry->Cost; |
4327 | } |
4328 | } |
4329 | |
4330 | if (isa<FixedVectorType>(Val: ValTy) && ISD == ISD::SETCC) { |
4331 | Type *ValScalarTy = ValTy->getScalarType(); |
4332 | if ((ValScalarTy->isHalfTy() && !ST->hasFullFP16()) || |
4333 | ValScalarTy->isBFloatTy()) { |
4334 | auto *ValVTy = cast<FixedVectorType>(Val: ValTy); |
4335 | |
4336 | // Without dedicated instructions we promote [b]f16 compares to f32. |
4337 | auto *PromotedTy = |
4338 | VectorType::get(ElementType: Type::getFloatTy(C&: ValTy->getContext()), Other: ValVTy); |
4339 | |
4340 | InstructionCost Cost = 0; |
4341 | // Promote operands to float vectors. |
4342 | Cost += 2 * getCastInstrCost(Opcode: Instruction::FPExt, Dst: PromotedTy, Src: ValTy, |
4343 | CCH: TTI::CastContextHint::None, CostKind); |
4344 | // Compare float vectors. |
4345 | Cost += getCmpSelInstrCost(Opcode, ValTy: PromotedTy, CondTy, VecPred, CostKind, |
4346 | Op1Info, Op2Info); |
4347 | // During codegen we'll truncate the vector result from i32 to i16. |
4348 | Cost += |
4349 | getCastInstrCost(Opcode: Instruction::Trunc, Dst: VectorType::getInteger(VTy: ValVTy), |
4350 | Src: VectorType::getInteger(VTy: PromotedTy), |
4351 | CCH: TTI::CastContextHint::None, CostKind); |
4352 | return Cost; |
4353 | } |
4354 | } |
4355 | |
4356 | // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to |
4357 | // icmp(and, 0) as free, as we can make use of ands, but only if the |
4358 | // comparison is not unsigned. |
4359 | if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I && |
4360 | !CmpInst::isUnsigned(predicate: VecPred) && |
4361 | TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: ValTy)) && |
4362 | match(V: I->getOperand(i: 0), P: m_And(L: m_Value(), R: m_Value()))) { |
4363 | if (match(V: I->getOperand(i: 1), P: m_Zero())) |
4364 | return 0; |
4365 | |
4366 | // x >= 1 / x < 1 -> x > 0 / x <= 0 |
4367 | if (match(V: I->getOperand(i: 1), P: m_One()) && |
4368 | (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE)) |
4369 | return 0; |
4370 | |
4371 | // x <= -1 / x > -1 -> x > 0 / x <= 0 |
4372 | if (match(V: I->getOperand(i: 1), P: m_AllOnes()) && |
4373 | (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT)) |
4374 | return 0; |
4375 | } |
4376 | |
4377 | // The base case handles scalable vectors fine for now, since it treats the |
4378 | // cost as 1 * legalization cost. |
4379 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
4380 | Op1Info, Op2Info, I); |
4381 | } |
4382 | |
4383 | AArch64TTIImpl::TTI::MemCmpExpansionOptions |
4384 | AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { |
4385 | TTI::MemCmpExpansionOptions Options; |
4386 | if (ST->requiresStrictAlign()) { |
4387 | // TODO: Add cost modeling for strict align. Misaligned loads expand to |
4388 | // a bunch of instructions when strict align is enabled. |
4389 | return Options; |
4390 | } |
4391 | Options.AllowOverlappingLoads = true; |
4392 | Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); |
4393 | Options.NumLoadsPerBlock = Options.MaxNumLoads; |
4394 | // TODO: Though vector loads usually perform well on AArch64, in some targets |
4395 | // they may wake up the FP unit, which raises the power consumption. Perhaps |
4396 | // they could be used with no holds barred (-O3). |
4397 | Options.LoadSizes = {8, 4, 2, 1}; |
4398 | Options.AllowedTailExpansions = {3, 5, 6}; |
4399 | return Options; |
4400 | } |
4401 | |
4402 | bool AArch64TTIImpl::prefersVectorizedAddressing() const { |
4403 | return ST->hasSVE(); |
4404 | } |
4405 | |
4406 | InstructionCost |
4407 | AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, |
4408 | Align Alignment, unsigned AddressSpace, |
4409 | TTI::TargetCostKind CostKind) const { |
4410 | if (useNeonVector(Ty: Src)) |
4411 | return BaseT::getMaskedMemoryOpCost(Opcode, DataTy: Src, Alignment, AddressSpace, |
4412 | CostKind); |
4413 | auto LT = getTypeLegalizationCost(Ty: Src); |
4414 | if (!LT.first.isValid()) |
4415 | return InstructionCost::getInvalid(); |
4416 | |
4417 | // Return an invalid cost for element types that we are unable to lower. |
4418 | auto *VT = cast<VectorType>(Val: Src); |
4419 | if (VT->getElementType()->isIntegerTy(Bitwidth: 1)) |
4420 | return InstructionCost::getInvalid(); |
4421 | |
4422 | // The code-generator is currently not able to handle scalable vectors |
4423 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
4424 | // it. This change will be removed when code-generation for these types is |
4425 | // sufficiently reliable. |
4426 | if (VT->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
4427 | return InstructionCost::getInvalid(); |
4428 | |
4429 | return LT.first; |
4430 | } |
4431 | |
// This function returns the gather/scatter overhead, either from the
// user-provided value or from the per-target specialized value in \p ST.
4434 | static unsigned getSVEGatherScatterOverhead(unsigned Opcode, |
4435 | const AArch64Subtarget *ST) { |
4436 | assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && |
4437 | "Should be called on only load or stores." ); |
4438 | switch (Opcode) { |
4439 | case Instruction::Load: |
4440 | if (SVEGatherOverhead.getNumOccurrences() > 0) |
4441 | return SVEGatherOverhead; |
4442 | return ST->getGatherOverhead(); |
4443 | break; |
4444 | case Instruction::Store: |
4445 | if (SVEScatterOverhead.getNumOccurrences() > 0) |
4446 | return SVEScatterOverhead; |
4447 | return ST->getScatterOverhead(); |
4448 | break; |
4449 | default: |
4450 | llvm_unreachable("Shouldn't have reached here" ); |
4451 | } |
4452 | } |
4453 | |
4454 | InstructionCost AArch64TTIImpl::getGatherScatterOpCost( |
4455 | unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, |
4456 | Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const { |
4457 | if (useNeonVector(Ty: DataTy) || !isLegalMaskedGatherScatter(DataType: DataTy)) |
4458 | return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, |
4459 | Alignment, CostKind, I); |
4460 | auto *VT = cast<VectorType>(Val: DataTy); |
4461 | auto LT = getTypeLegalizationCost(Ty: DataTy); |
4462 | if (!LT.first.isValid()) |
4463 | return InstructionCost::getInvalid(); |
4464 | |
4465 | // Return an invalid cost for element types that we are unable to lower. |
4466 | if (!LT.second.isVector() || |
4467 | !isElementTypeLegalForScalableVector(Ty: VT->getElementType()) || |
4468 | VT->getElementType()->isIntegerTy(Bitwidth: 1)) |
4469 | return InstructionCost::getInvalid(); |
4470 | |
4471 | // The code-generator is currently not able to handle scalable vectors |
4472 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
4473 | // it. This change will be removed when code-generation for these types is |
4474 | // sufficiently reliable. |
4475 | if (VT->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
4476 | return InstructionCost::getInvalid(); |
4477 | |
4478 | ElementCount LegalVF = LT.second.getVectorElementCount(); |
4479 | InstructionCost MemOpCost = |
4480 | getMemoryOpCost(Opcode, Src: VT->getElementType(), Alignment, AddressSpace: 0, CostKind, |
4481 | OpInfo: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I); |
4482 | // Add on an overhead cost for using gathers/scatters. |
4483 | MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST); |
4484 | return LT.first * MemOpCost * getMaxNumElements(VF: LegalVF); |
4485 | } |
4486 | |
4487 | bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { |
4488 | return isa<FixedVectorType>(Val: Ty) && !ST->useSVEForFixedLengthVectors(); |
4489 | } |
4490 | |
4491 | InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, |
4492 | Align Alignment, |
4493 | unsigned AddressSpace, |
4494 | TTI::TargetCostKind CostKind, |
4495 | TTI::OperandValueInfo OpInfo, |
4496 | const Instruction *I) const { |
4497 | EVT VT = TLI->getValueType(DL, Ty, AllowUnknown: true); |
4498 | // Type legalization can't handle structs |
4499 | if (VT == MVT::Other) |
4500 | return BaseT::getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace, |
4501 | CostKind); |
4502 | |
4503 | auto LT = getTypeLegalizationCost(Ty); |
4504 | if (!LT.first.isValid()) |
4505 | return InstructionCost::getInvalid(); |
4506 | |
4507 | // The code-generator is currently not able to handle scalable vectors |
4508 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
4509 | // it. This change will be removed when code-generation for these types is |
4510 | // sufficiently reliable. |
4511 | // We also only support full register predicate loads and stores. |
4512 | if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty)) |
4513 | if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1) || |
4514 | (VTy->getElementType()->isIntegerTy(Bitwidth: 1) && |
4515 | !VTy->getElementCount().isKnownMultipleOf( |
4516 | RHS: ElementCount::getScalable(MinVal: 16)))) |
4517 | return InstructionCost::getInvalid(); |
4518 | |
4519 | // TODO: consider latency as well for TCK_SizeAndLatency. |
4520 | if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) |
4521 | return LT.first; |
4522 | |
4523 | if (CostKind != TTI::TCK_RecipThroughput) |
4524 | return 1; |
4525 | |
4526 | if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && |
4527 | LT.second.is128BitVector() && Alignment < Align(16)) { |
    // Unaligned stores are extremely inefficient. We don't split all
    // unaligned 128-bit stores because of the negative impact that has been
    // shown in practice on inlined block copy code.
4531 | // We make such stores expensive so that we will only vectorize if there |
4532 | // are 6 other instructions getting vectorized. |
4533 | const int AmortizationCost = 6; |
4534 | |
4535 | return LT.first * 2 * AmortizationCost; |
4536 | } |
4537 | |
4538 | // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs. |
4539 | if (Ty->isPtrOrPtrVectorTy()) |
4540 | return LT.first; |
4541 | |
4542 | if (useNeonVector(Ty)) { |
4543 | // Check truncating stores and extending loads. |
4544 | if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) { |
      // v4i8 types are lowered to a scalar load/store and sshll/xtn.
4546 | if (VT == MVT::v4i8) |
4547 | return 2; |
4548 | // Otherwise we need to scalarize. |
4549 | return cast<FixedVectorType>(Val: Ty)->getNumElements() * 2; |
4550 | } |
4551 | EVT EltVT = VT.getVectorElementType(); |
4552 | unsigned EltSize = EltVT.getScalarSizeInBits(); |
4553 | if (!isPowerOf2_32(Value: EltSize) || EltSize < 8 || EltSize > 64 || |
4554 | VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1)) |
4555 | return LT.first; |
4556 | // FIXME: v3i8 lowering currently is very inefficient, due to automatic |
4557 | // widening to v4i8, which produces suboptimal results. |
4558 | if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8) |
4559 | return LT.first; |
4560 | |
4561 | // Check non-power-of-2 loads/stores for legal vector element types with |
4562 | // NEON. Non-power-of-2 memory ops will get broken down to a set of |
4563 | // operations on smaller power-of-2 ops, including ld1/st1. |
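    // For illustration, a v7i16 access is decomposed by the worklist below as
    // v4i16 + (v2i16 + v1i16), giving a cost of 3.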
4564 | LLVMContext &C = Ty->getContext(); |
4565 | InstructionCost Cost(0); |
4566 | SmallVector<EVT> TypeWorklist; |
4567 | TypeWorklist.push_back(Elt: VT); |
4568 | while (!TypeWorklist.empty()) { |
4569 | EVT CurrVT = TypeWorklist.pop_back_val(); |
4570 | unsigned CurrNumElements = CurrVT.getVectorNumElements(); |
4571 | if (isPowerOf2_32(Value: CurrNumElements)) { |
4572 | Cost += 1; |
4573 | continue; |
4574 | } |
4575 | |
4576 | unsigned PrevPow2 = NextPowerOf2(A: CurrNumElements) / 2; |
4577 | TypeWorklist.push_back(Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: PrevPow2)); |
4578 | TypeWorklist.push_back( |
4579 | Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: CurrNumElements - PrevPow2)); |
4580 | } |
4581 | return Cost; |
4582 | } |
4583 | |
4584 | return LT.first; |
4585 | } |
4586 | |
4587 | InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( |
4588 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, |
4589 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, |
4590 | bool UseMaskForCond, bool UseMaskForGaps) const { |
4591 | assert(Factor >= 2 && "Invalid interleave factor" ); |
4592 | auto *VecVTy = cast<VectorType>(Val: VecTy); |
4593 | |
4594 | if (VecTy->isScalableTy() && !ST->hasSVE()) |
4595 | return InstructionCost::getInvalid(); |
4596 | |
4597 | // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we |
4598 | // only have lowering for power-of-2 factors. |
4599 | // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in |
4600 | // InterleavedAccessPass for ld3/st3 |
4601 | if (VecTy->isScalableTy() && !isPowerOf2_32(Value: Factor)) |
4602 | return InstructionCost::getInvalid(); |
4603 | |
4604 | // Vectorization for masked interleaved accesses is only enabled for scalable |
4605 | // VF. |
4606 | if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps)) |
4607 | return InstructionCost::getInvalid(); |
4608 | |
4609 | if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) { |
4610 | unsigned MinElts = VecVTy->getElementCount().getKnownMinValue(); |
4611 | auto *SubVecTy = |
4612 | VectorType::get(ElementType: VecVTy->getElementType(), |
4613 | EC: VecVTy->getElementCount().divideCoefficientBy(RHS: Factor)); |
4614 | |
4615 | // ldN/stN only support legal vector types of size 64 or 128 in bits. |
4616 | // Accesses having vector types that are a multiple of 128 bits can be |
4617 | // matched to more than one ldN/stN instruction. |
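    // For illustration, an interleaved load of <16 x i32> with Factor == 2
    // uses SubVecTy = <8 x i32>, which needs two ld2 instructions, so the
    // cost returned below is 2 * 2 = 4.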
4618 | bool UseScalable; |
4619 | if (MinElts % Factor == 0 && |
4620 | TLI->isLegalInterleavedAccessType(VecTy: SubVecTy, DL, UseScalable)) |
4621 | return Factor * TLI->getNumInterleavedAccesses(VecTy: SubVecTy, DL, UseScalable); |
4622 | } |
4623 | |
4624 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
4625 | Alignment, AddressSpace, CostKind, |
4626 | UseMaskForCond, UseMaskForGaps); |
4627 | } |
4628 | |
4629 | InstructionCost |
4630 | AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const { |
4631 | InstructionCost Cost = 0; |
4632 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
4633 | for (auto *I : Tys) { |
4634 | if (!I->isVectorTy()) |
4635 | continue; |
4636 | if (I->getScalarSizeInBits() * cast<FixedVectorType>(Val: I)->getNumElements() == |
4637 | 128) |
4638 | Cost += getMemoryOpCost(Opcode: Instruction::Store, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind) + |
4639 | getMemoryOpCost(Opcode: Instruction::Load, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind); |
4640 | } |
4641 | return Cost; |
4642 | } |
4643 | |
4644 | unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) const { |
4645 | return ST->getMaxInterleaveFactor(); |
4646 | } |
4647 | |
4648 | // For Falkor, we want to avoid having too many strided loads in a loop since |
4649 | // that can exhaust the HW prefetcher resources. We adjust the unroller |
4650 | // MaxCount preference below to attempt to ensure unrolling doesn't create too |
4651 | // many strided loads. |
4652 | static void |
4653 | getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, |
4654 | TargetTransformInfo::UnrollingPreferences &UP) { |
4655 | enum { MaxStridedLoads = 7 }; |
4656 | auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { |
4657 | int StridedLoads = 0; |
4658 | // FIXME? We could make this more precise by looking at the CFG and |
4659 | // e.g. not counting loads in each side of an if-then-else diamond. |
4660 | for (const auto BB : L->blocks()) { |
4661 | for (auto &I : *BB) { |
4662 | LoadInst *LMemI = dyn_cast<LoadInst>(Val: &I); |
4663 | if (!LMemI) |
4664 | continue; |
4665 | |
4666 | Value *PtrValue = LMemI->getPointerOperand(); |
4667 | if (L->isLoopInvariant(V: PtrValue)) |
4668 | continue; |
4669 | |
4670 | const SCEV *LSCEV = SE.getSCEV(V: PtrValue); |
4671 | const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(Val: LSCEV); |
4672 | if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) |
4673 | continue; |
4674 | |
4675 | // FIXME? We could take pairing of unrolled load copies into account |
4676 | // by looking at the AddRec, but we would probably have to limit this |
4677 | // to loops with no stores or other memory optimization barriers. |
4678 | ++StridedLoads; |
4679 | // We've seen enough strided loads that seeing more won't make a |
4680 | // difference. |
4681 | if (StridedLoads > MaxStridedLoads / 2) |
4682 | return StridedLoads; |
4683 | } |
4684 | } |
4685 | return StridedLoads; |
4686 | }; |
4687 | |
4688 | int StridedLoads = countStridedLoads(L, SE); |
4689 | LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads |
4690 | << " strided loads\n" ); |
4691 | // Pick the largest power of 2 unroll count that won't result in too many |
4692 | // strided loads. |
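  // For example, with MaxStridedLoads = 7 and 2 strided loads detected, the
  // unroll count is capped at 1 << Log2_32(7 / 2) = 2.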
4693 | if (StridedLoads) { |
4694 | UP.MaxCount = 1 << Log2_32(Value: MaxStridedLoads / StridedLoads); |
4695 | LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " |
4696 | << UP.MaxCount << '\n'); |
4697 | } |
4698 | } |
4699 | |
4700 | // This function returns true if the loop: |
4701 | // 1. Has a valid cost, and |
4702 | // 2. Has a cost within the supplied budget. |
4703 | // Otherwise it returns false. |
4704 | static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, |
4705 | InstructionCost Budget, |
4706 | unsigned *FinalSize) { |
4707 | // Estimate the size of the loop. |
4708 | InstructionCost LoopCost = 0; |
4709 | |
4710 | for (auto *BB : L->getBlocks()) { |
4711 | for (auto &I : *BB) { |
4712 | SmallVector<const Value *, 4> Operands(I.operand_values()); |
4713 | InstructionCost Cost = |
4714 | TTI.getInstructionCost(U: &I, Operands, CostKind: TTI::TCK_CodeSize); |
4715 | // This can happen with intrinsics that don't currently have a cost model |
4716 | // or for some operations that require SVE. |
4717 | if (!Cost.isValid()) |
4718 | return false; |
4719 | |
4720 | LoopCost += Cost; |
4721 | if (LoopCost > Budget) |
4722 | return false; |
4723 | } |
4724 | } |
4725 | |
4726 | if (FinalSize) |
4727 | *FinalSize = LoopCost.getValue(); |
4728 | return true; |
4729 | } |
4730 | |
4731 | static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, |
4732 | const AArch64TTIImpl &TTI) { |
4733 | // Only consider loops with unknown trip counts for which we can determine |
4734 | // a symbolic expression. Multi-exit loops with small known trip counts will |
4735 | // likely be unrolled anyway. |
4736 | const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L); |
4737 | if (isa<SCEVConstant>(Val: BTC) || isa<SCEVCouldNotCompute>(Val: BTC)) |
4738 | return false; |
4739 | |
4740 | // It might not be worth unrolling loops with low max trip counts. Restrict |
4741 | // this to max trip counts > 32 for now. |
4742 | unsigned MaxTC = SE.getSmallConstantMaxTripCount(L); |
4743 | if (MaxTC > 0 && MaxTC <= 32) |
4744 | return false; |
4745 | |
4746 | // Make sure the loop size is <= 5. |
4747 | if (!isLoopSizeWithinBudget(L, TTI, Budget: 5, FinalSize: nullptr)) |
4748 | return false; |
4749 | |
4750 | // Small search loops with multiple exits can be highly beneficial to unroll. |
4751 | // We only care about loops with exactly two exiting blocks, although each |
4752 | // block could jump to the same exit block. |
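  // For example, a search loop such as 'while (I != E && *I != X) ++I;' is
  // typically lowered with one exiting block testing I != E and another
  // testing *I != X.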
4753 | ArrayRef<BasicBlock *> Blocks = L->getBlocks(); |
4754 | if (Blocks.size() != 2) |
4755 | return false; |
4756 | |
4757 | if (any_of(Range&: Blocks, P: [](BasicBlock *BB) { |
4758 | return !isa<BranchInst>(Val: BB->getTerminator()); |
4759 | })) |
4760 | return false; |
4761 | |
4762 | return true; |
4763 | } |
4764 | |
/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
/// OOO engine's wide instruction window and various predictors.
4767 | static void |
4768 | getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, |
4769 | TargetTransformInfo::UnrollingPreferences &UP, |
4770 | const AArch64TTIImpl &TTI) { |
  // Limit to loops with structure that is highly likely to benefit from
  // runtime unrolling; that is, we exclude outer loops and loops with many
  // blocks (i.e. likely with complex control flow). Note that the heuristics
  // here may be overly conservative and we err on the side of avoiding runtime
  // unrolling rather than unrolling excessively. They are all subject to
  // further refinement.
4776 | if (!L->isInnermost() || L->getNumBlocks() > 8) |
4777 | return; |
4778 | |
4779 | // Loops with multiple exits are handled by common code. |
4780 | if (!L->getExitBlock()) |
4781 | return; |
4782 | |
4783 | const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L); |
4784 | if (isa<SCEVConstant>(Val: BTC) || isa<SCEVCouldNotCompute>(Val: BTC) || |
4785 | (SE.getSmallConstantMaxTripCount(L) > 0 && |
4786 | SE.getSmallConstantMaxTripCount(L) <= 32)) |
4787 | return; |
4788 | |
4789 | if (findStringMetadataForLoop(TheLoop: L, Name: "llvm.loop.isvectorized" )) |
4790 | return; |
4791 | |
4792 | if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L)) |
4793 | return; |
4794 | |
4795 | // Limit to loops with trip counts that are cheap to expand. |
4796 | UP.SCEVExpansionBudget = 1; |
4797 | |
4798 | // Try to unroll small, single block loops, if they have load/store |
4799 | // dependencies, to expose more parallel memory access streams. |
  BasicBlock *Header = L->getHeader();
4801 | if (Header == L->getLoopLatch()) { |
4802 | // Estimate the size of the loop. |
4803 | unsigned Size; |
4804 | if (!isLoopSizeWithinBudget(L, TTI, Budget: 8, FinalSize: &Size)) |
4805 | return; |
4806 | |
4807 | SmallPtrSet<Value *, 8> LoadedValues; |
4808 | SmallVector<StoreInst *> Stores; |
4809 | for (auto *BB : L->blocks()) { |
4810 | for (auto &I : *BB) { |
4811 | Value *Ptr = getLoadStorePointerOperand(V: &I); |
4812 | if (!Ptr) |
4813 | continue; |
4814 | const SCEV *PtrSCEV = SE.getSCEV(V: Ptr); |
4815 | if (SE.isLoopInvariant(S: PtrSCEV, L)) |
4816 | continue; |
4817 | if (isa<LoadInst>(Val: &I)) |
4818 | LoadedValues.insert(Ptr: &I); |
4819 | else |
4820 | Stores.push_back(Elt: cast<StoreInst>(Val: &I)); |
4821 | } |
4822 | } |
4823 | |
4824 | // Try to find an unroll count that maximizes the use of the instruction |
4825 | // window, i.e. trying to fetch as many instructions per cycle as possible. |
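    // For example, an 8-instruction body ends up with BestUC = 6, i.e. 48
    // instructions filling three full 16-instruction fetch lines.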
4826 | unsigned MaxInstsPerLine = 16; |
4827 | unsigned UC = 1; |
4828 | unsigned BestUC = 1; |
4829 | unsigned SizeWithBestUC = BestUC * Size; |
4830 | while (UC <= 8) { |
4831 | unsigned SizeWithUC = UC * Size; |
4832 | if (SizeWithUC > 48) |
4833 | break; |
4834 | if ((SizeWithUC % MaxInstsPerLine) == 0 || |
4835 | (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) { |
4836 | BestUC = UC; |
4837 | SizeWithBestUC = BestUC * Size; |
4838 | } |
4839 | UC++; |
4840 | } |
4841 | |
4842 | if (BestUC == 1 || none_of(Range&: Stores, P: [&LoadedValues](StoreInst *SI) { |
4843 | return LoadedValues.contains(Ptr: SI->getOperand(i_nocapture: 0)); |
4844 | })) |
4845 | return; |
4846 | |
4847 | UP.Runtime = true; |
4848 | UP.DefaultUnrollRuntimeCount = BestUC; |
4849 | return; |
4850 | } |
4851 | |
4852 | // Try to runtime-unroll loops with early-continues depending on loop-varying |
4853 | // loads; this helps with branch-prediction for the early-continues. |
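  // For example: for (...) { if (A[i] > T) continue; ... }, where the
  // continue condition depends on the loop-varying load A[i].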
4854 | auto *Term = dyn_cast<BranchInst>(Val: Header->getTerminator()); |
4855 | auto *Latch = L->getLoopLatch(); |
4856 | SmallVector<BasicBlock *> Preds(predecessors(BB: Latch)); |
4857 | if (!Term || !Term->isConditional() || Preds.size() == 1 || |
4858 | !llvm::is_contained(Range&: Preds, Element: Header) || |
4859 | none_of(Range&: Preds, P: [L](BasicBlock *Pred) { return L->contains(BB: Pred); })) |
4860 | return; |
4861 | |
4862 | std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad = |
4863 | [&](Instruction *I, unsigned Depth) -> bool { |
4864 | if (isa<PHINode>(Val: I) || L->isLoopInvariant(V: I) || Depth > 8) |
4865 | return false; |
4866 | |
4867 | if (isa<LoadInst>(Val: I)) |
4868 | return true; |
4869 | |
4870 | return any_of(Range: I->operands(), P: [&](Value *V) { |
4871 | auto *I = dyn_cast<Instruction>(Val: V); |
4872 | return I && DependsOnLoopLoad(I, Depth + 1); |
4873 | }); |
4874 | }; |
4875 | CmpPredicate Pred; |
4876 | Instruction *I; |
4877 | if (match(V: Term, P: m_Br(C: m_ICmp(Pred, L: m_Instruction(I), R: m_Value()), T: m_Value(), |
4878 | F: m_Value())) && |
4879 | DependsOnLoopLoad(I, 0)) { |
4880 | UP.Runtime = true; |
4881 | } |
4882 | } |
4883 | |
void AArch64TTIImpl::getUnrollingPreferences(
4885 | Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, |
4886 | OptimizationRemarkEmitter *ORE) const { |
4887 | // Enable partial unrolling and runtime unrolling. |
4888 | BaseT::getUnrollingPreferences(L, SE, UP, ORE); |
4889 | |
4890 | UP.UpperBound = true; |
4891 | |
  // Inner loops are more likely to be hot, and their runtime checks can often
  // be hoisted out by LICM, so the overhead is lower; use a larger threshold
  // to unroll more of them.
4895 | if (L->getLoopDepth() > 1) |
4896 | UP.PartialThreshold *= 2; |
4897 | |
4898 | // Disable partial & runtime unrolling on -Os. |
4899 | UP.PartialOptSizeThreshold = 0; |
4900 | |
4901 | // Scan the loop: don't unroll loops with calls as this could prevent |
4902 | // inlining. Don't unroll vector loops either, as they don't benefit much from |
4903 | // unrolling. |
4904 | for (auto *BB : L->getBlocks()) { |
4905 | for (auto &I : *BB) { |
      // Don't unroll vectorised loops.
4907 | if (I.getType()->isVectorTy()) |
4908 | return; |
4909 | |
4910 | if (isa<CallBase>(Val: I)) { |
4911 | if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I)) |
4912 | if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction()) |
4913 | if (!isLoweredToCall(F)) |
4914 | continue; |
4915 | return; |
4916 | } |
4917 | } |
4918 | } |
4919 | |
4920 | // Apply subtarget-specific unrolling preferences. |
4921 | switch (ST->getProcFamily()) { |
4922 | case AArch64Subtarget::AppleA14: |
4923 | case AArch64Subtarget::AppleA15: |
4924 | case AArch64Subtarget::AppleA16: |
4925 | case AArch64Subtarget::AppleM4: |
4926 | getAppleRuntimeUnrollPreferences(L, SE, UP, TTI: *this); |
4927 | break; |
4928 | case AArch64Subtarget::Falkor: |
4929 | if (EnableFalkorHWPFUnrollFix) |
4930 | getFalkorUnrollingPreferences(L, SE, UP); |
4931 | break; |
4932 | default: |
4933 | break; |
4934 | } |
4935 | |
4936 | // If this is a small, multi-exit loop similar to something like std::find, |
4937 | // then there is typically a performance improvement achieved by unrolling. |
4938 | if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, TTI: *this)) { |
4939 | UP.RuntimeUnrollMultiExit = true; |
4940 | UP.Runtime = true; |
4941 | // Limit unroll count. |
4942 | UP.DefaultUnrollRuntimeCount = 4; |
4943 | // Allow slightly more costly trip-count expansion to catch search loops |
4944 | // with pointer inductions. |
4945 | UP.SCEVExpansionBudget = 5; |
4946 | return; |
4947 | } |
4948 | |
  // Enable runtime unrolling for in-order models.
  // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Generic, so
  // by checking for that case, we can ensure that the default behaviour is
  // unchanged.
4953 | if (ST->getProcFamily() != AArch64Subtarget::Generic && |
4954 | !ST->getSchedModel().isOutOfOrder()) { |
4955 | UP.Runtime = true; |
4956 | UP.Partial = true; |
4957 | UP.UnrollRemainder = true; |
4958 | UP.DefaultUnrollRuntimeCount = 4; |
4959 | |
4960 | UP.UnrollAndJam = true; |
4961 | UP.UnrollAndJamInnerLoopThreshold = 60; |
4962 | } |
4963 | } |
4964 | |
4965 | void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
4966 | TTI::PeelingPreferences &PP) const { |
4967 | BaseT::getPeelingPreferences(L, SE, PP); |
4968 | } |
4969 | |
4970 | Value * |
4971 | AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, |
4972 | Type *ExpectedType) const { |
4973 | switch (Inst->getIntrinsicID()) { |
4974 | default: |
4975 | return nullptr; |
4976 | case Intrinsic::aarch64_neon_st2: |
4977 | case Intrinsic::aarch64_neon_st3: |
4978 | case Intrinsic::aarch64_neon_st4: { |
4979 | // Create a struct type |
4980 | StructType *ST = dyn_cast<StructType>(Val: ExpectedType); |
4981 | if (!ST) |
4982 | return nullptr; |
4983 | unsigned NumElts = Inst->arg_size() - 1; |
4984 | if (ST->getNumElements() != NumElts) |
4985 | return nullptr; |
4986 | for (unsigned i = 0, e = NumElts; i != e; ++i) { |
4987 | if (Inst->getArgOperand(i)->getType() != ST->getElementType(N: i)) |
4988 | return nullptr; |
4989 | } |
4990 | Value *Res = PoisonValue::get(T: ExpectedType); |
4991 | IRBuilder<> Builder(Inst); |
4992 | for (unsigned i = 0, e = NumElts; i != e; ++i) { |
4993 | Value *L = Inst->getArgOperand(i); |
4994 | Res = Builder.CreateInsertValue(Agg: Res, Val: L, Idxs: i); |
4995 | } |
4996 | return Res; |
4997 | } |
4998 | case Intrinsic::aarch64_neon_ld2: |
4999 | case Intrinsic::aarch64_neon_ld3: |
5000 | case Intrinsic::aarch64_neon_ld4: |
5001 | if (Inst->getType() == ExpectedType) |
5002 | return Inst; |
5003 | return nullptr; |
5004 | } |
5005 | } |
5006 | |
5007 | bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, |
5008 | MemIntrinsicInfo &Info) const { |
5009 | switch (Inst->getIntrinsicID()) { |
5010 | default: |
5011 | break; |
5012 | case Intrinsic::aarch64_neon_ld2: |
5013 | case Intrinsic::aarch64_neon_ld3: |
5014 | case Intrinsic::aarch64_neon_ld4: |
5015 | Info.ReadMem = true; |
5016 | Info.WriteMem = false; |
5017 | Info.PtrVal = Inst->getArgOperand(i: 0); |
5018 | break; |
5019 | case Intrinsic::aarch64_neon_st2: |
5020 | case Intrinsic::aarch64_neon_st3: |
5021 | case Intrinsic::aarch64_neon_st4: |
5022 | Info.ReadMem = false; |
5023 | Info.WriteMem = true; |
5024 | Info.PtrVal = Inst->getArgOperand(i: Inst->arg_size() - 1); |
5025 | break; |
5026 | } |
5027 | |
5028 | switch (Inst->getIntrinsicID()) { |
5029 | default: |
5030 | return false; |
5031 | case Intrinsic::aarch64_neon_ld2: |
5032 | case Intrinsic::aarch64_neon_st2: |
5033 | Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; |
5034 | break; |
5035 | case Intrinsic::aarch64_neon_ld3: |
5036 | case Intrinsic::aarch64_neon_st3: |
5037 | Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; |
5038 | break; |
5039 | case Intrinsic::aarch64_neon_ld4: |
5040 | case Intrinsic::aarch64_neon_st4: |
5041 | Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; |
5042 | break; |
5043 | } |
5044 | return true; |
5045 | } |
5046 | |
/// See if \p I should be considered for address type promotion. We check if
/// \p I is a sext with the right type and used in memory accesses. If it is
/// used in a "complex" getelementptr, we allow it to be promoted without
/// finding other sext instructions that sign extended the same initial value.
/// A getelementptr is considered as "complex" if it has more than 2 operands.
5052 | bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( |
    const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5054 | bool Considerable = false; |
5055 | AllowPromotionWithoutCommonHeader = false; |
5056 | if (!isa<SExtInst>(Val: &I)) |
5057 | return false; |
5058 | Type *ConsideredSExtType = |
5059 | Type::getInt64Ty(C&: I.getParent()->getParent()->getContext()); |
5060 | if (I.getType() != ConsideredSExtType) |
5061 | return false; |
5062 | // See if the sext is the one with the right type and used in at least one |
5063 | // GetElementPtrInst. |
5064 | for (const User *U : I.users()) { |
5065 | if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(Val: U)) { |
5066 | Considerable = true; |
      // A getelementptr is considered as "complex" if it has more than 2
      // operands. We will promote a SExt used in such a complex GEP as we
      // expect some computation to be merged if it is done on 64 bits.
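      // For example, 'getelementptr [4 x i32], ptr %p, i64 %i, i64 %j' has 3
      // operands and is therefore treated as complex.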
5070 | if (GEPInst->getNumOperands() > 2) { |
5071 | AllowPromotionWithoutCommonHeader = true; |
5072 | break; |
5073 | } |
5074 | } |
5075 | } |
5076 | return Considerable; |
5077 | } |
5078 | |
5079 | bool AArch64TTIImpl::isLegalToVectorizeReduction( |
5080 | const RecurrenceDescriptor &RdxDesc, ElementCount VF) const { |
5081 | if (!VF.isScalable()) |
5082 | return true; |
5083 | |
5084 | Type *Ty = RdxDesc.getRecurrenceType(); |
5085 | if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty)) |
5086 | return false; |
5087 | |
5088 | switch (RdxDesc.getRecurrenceKind()) { |
5089 | case RecurKind::Add: |
5090 | case RecurKind::FAdd: |
5091 | case RecurKind::And: |
5092 | case RecurKind::Or: |
5093 | case RecurKind::Xor: |
5094 | case RecurKind::SMin: |
5095 | case RecurKind::SMax: |
5096 | case RecurKind::UMin: |
5097 | case RecurKind::UMax: |
5098 | case RecurKind::FMin: |
5099 | case RecurKind::FMax: |
5100 | case RecurKind::FMulAdd: |
5101 | case RecurKind::AnyOf: |
5102 | return true; |
5103 | default: |
5104 | return false; |
5105 | } |
5106 | } |
5107 | |
5108 | InstructionCost |
5109 | AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, |
5110 | FastMathFlags FMF, |
5111 | TTI::TargetCostKind CostKind) const { |
5112 | // The code-generator is currently not able to handle scalable vectors |
5113 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
5114 | // it. This change will be removed when code-generation for these types is |
5115 | // sufficiently reliable. |
5116 | if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty)) |
5117 | if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
5118 | return InstructionCost::getInvalid(); |
5119 | |
5120 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
5121 | |
5122 | if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) |
5123 | return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); |
5124 | |
5125 | InstructionCost LegalizationCost = 0; |
5126 | if (LT.first > 1) { |
5127 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: Ty->getContext()); |
5128 | IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF); |
5129 | LegalizationCost = getIntrinsicInstrCost(ICA: Attrs, CostKind) * (LT.first - 1); |
5130 | } |
5131 | |
5132 | return LegalizationCost + /*Cost of horizontal reduction*/ 2; |
5133 | } |
5134 | |
5135 | InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( |
5136 | unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const { |
5137 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
5138 | InstructionCost LegalizationCost = 0; |
5139 | if (LT.first > 1) { |
5140 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: ValTy->getContext()); |
5141 | LegalizationCost = getArithmeticInstrCost(Opcode, Ty: LegalVTy, CostKind); |
5142 | LegalizationCost *= LT.first - 1; |
5143 | } |
5144 | |
5145 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
5146 | assert(ISD && "Invalid opcode" ); |
5147 | // Add the final reduction cost for the legal horizontal reduction |
5148 | switch (ISD) { |
5149 | case ISD::ADD: |
5150 | case ISD::AND: |
5151 | case ISD::OR: |
5152 | case ISD::XOR: |
5153 | case ISD::FADD: |
5154 | return LegalizationCost + 2; |
5155 | default: |
5156 | return InstructionCost::getInvalid(); |
5157 | } |
5158 | } |
5159 | |
5160 | InstructionCost |
5161 | AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, |
5162 | std::optional<FastMathFlags> FMF, |
5163 | TTI::TargetCostKind CostKind) const { |
5164 | // The code-generator is currently not able to handle scalable vectors |
5165 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
5166 | // it. This change will be removed when code-generation for these types is |
5167 | // sufficiently reliable. |
5168 | if (auto *VTy = dyn_cast<ScalableVectorType>(Val: ValTy)) |
5169 | if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
5170 | return InstructionCost::getInvalid(); |
5171 | |
5172 | if (TTI::requiresOrderedReduction(FMF)) { |
5173 | if (auto *FixedVTy = dyn_cast<FixedVectorType>(Val: ValTy)) { |
5174 | InstructionCost BaseCost = |
5175 | BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind); |
5176 | // Add on extra cost to reflect the extra overhead on some CPUs. We still |
5177 | // end up vectorizing for more computationally intensive loops. |
5178 | return BaseCost + FixedVTy->getNumElements(); |
5179 | } |
5180 | |
5181 | if (Opcode != Instruction::FAdd) |
5182 | return InstructionCost::getInvalid(); |
5183 | |
5184 | auto *VTy = cast<ScalableVectorType>(Val: ValTy); |
5185 | InstructionCost Cost = |
5186 | getArithmeticInstrCost(Opcode, Ty: VTy->getScalarType(), CostKind); |
5187 | Cost *= getMaxNumElements(VF: VTy->getElementCount()); |
5188 | return Cost; |
5189 | } |
5190 | |
5191 | if (isa<ScalableVectorType>(Val: ValTy)) |
5192 | return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); |
5193 | |
5194 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
5195 | MVT MTy = LT.second; |
5196 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
5197 | assert(ISD && "Invalid opcode" ); |
5198 | |
5199 | // Horizontal adds can use the 'addv' instruction. We model the cost of these |
5200 | // instructions as twice a normal vector add, plus 1 for each legalization |
  // step beyond the first (LT.first - 1). This is the only arithmetic vector
  // reduction operation for which we have an instruction.
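  // For example, an add reduction of <8 x i32> legalizes to two v4i32 halves,
  // costing (LT.first - 1) + 2 = 3: one vector add plus an addv.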
5203 | // OR, XOR and AND costs should match the codegen from: |
5204 | // OR: llvm/test/CodeGen/AArch64/reduce-or.ll |
5205 | // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll |
5206 | // AND: llvm/test/CodeGen/AArch64/reduce-and.ll |
5207 | static const CostTblEntry CostTblNoPairwise[]{ |
5208 | {.ISD: ISD::ADD, .Type: MVT::v8i8, .Cost: 2}, |
5209 | {.ISD: ISD::ADD, .Type: MVT::v16i8, .Cost: 2}, |
5210 | {.ISD: ISD::ADD, .Type: MVT::v4i16, .Cost: 2}, |
5211 | {.ISD: ISD::ADD, .Type: MVT::v8i16, .Cost: 2}, |
5212 | {.ISD: ISD::ADD, .Type: MVT::v4i32, .Cost: 2}, |
5213 | {.ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: 2}, |
5214 | {.ISD: ISD::OR, .Type: MVT::v8i8, .Cost: 15}, |
5215 | {.ISD: ISD::OR, .Type: MVT::v16i8, .Cost: 17}, |
5216 | {.ISD: ISD::OR, .Type: MVT::v4i16, .Cost: 7}, |
5217 | {.ISD: ISD::OR, .Type: MVT::v8i16, .Cost: 9}, |
5218 | {.ISD: ISD::OR, .Type: MVT::v2i32, .Cost: 3}, |
5219 | {.ISD: ISD::OR, .Type: MVT::v4i32, .Cost: 5}, |
5220 | {.ISD: ISD::OR, .Type: MVT::v2i64, .Cost: 3}, |
5221 | {.ISD: ISD::XOR, .Type: MVT::v8i8, .Cost: 15}, |
5222 | {.ISD: ISD::XOR, .Type: MVT::v16i8, .Cost: 17}, |
5223 | {.ISD: ISD::XOR, .Type: MVT::v4i16, .Cost: 7}, |
5224 | {.ISD: ISD::XOR, .Type: MVT::v8i16, .Cost: 9}, |
5225 | {.ISD: ISD::XOR, .Type: MVT::v2i32, .Cost: 3}, |
5226 | {.ISD: ISD::XOR, .Type: MVT::v4i32, .Cost: 5}, |
5227 | {.ISD: ISD::XOR, .Type: MVT::v2i64, .Cost: 3}, |
5228 | {.ISD: ISD::AND, .Type: MVT::v8i8, .Cost: 15}, |
5229 | {.ISD: ISD::AND, .Type: MVT::v16i8, .Cost: 17}, |
5230 | {.ISD: ISD::AND, .Type: MVT::v4i16, .Cost: 7}, |
5231 | {.ISD: ISD::AND, .Type: MVT::v8i16, .Cost: 9}, |
5232 | {.ISD: ISD::AND, .Type: MVT::v2i32, .Cost: 3}, |
5233 | {.ISD: ISD::AND, .Type: MVT::v4i32, .Cost: 5}, |
5234 | {.ISD: ISD::AND, .Type: MVT::v2i64, .Cost: 3}, |
5235 | }; |
5236 | switch (ISD) { |
5237 | default: |
5238 | break; |
5239 | case ISD::FADD: |
5240 | if (Type *EltTy = ValTy->getScalarType(); |
5241 | // FIXME: For half types without fullfp16 support, this could extend and |
5242 | // use a fp32 faddp reduction but current codegen unrolls. |
5243 | MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() || |
5244 | (EltTy->isHalfTy() && ST->hasFullFP16()))) { |
5245 | const unsigned NElts = MTy.getVectorNumElements(); |
5246 | if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 && |
5247 | isPowerOf2_32(Value: NElts)) |
5248 | // Reduction corresponding to series of fadd instructions is lowered to |
5249 | // series of faddp instructions. faddp has latency/throughput that |
5250 | // matches fadd instruction and hence, every faddp instruction can be |
5251 | // considered to have a relative cost = 1 with |
5252 | // CostKind = TCK_RecipThroughput. |
5253 | // An faddp will pairwise add vector elements, so the size of input |
5254 | // vector reduces by half every time, requiring |
5255 | // #(faddp instructions) = log2_32(NElts). |
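        // For example, a v4f32 fadd reduction needs Log2_32(4) = 2 faddp
        // instructions.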
5256 | return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(Value: NElts); |
5257 | } |
5258 | break; |
5259 | case ISD::ADD: |
5260 | if (const auto *Entry = CostTableLookup(Table: CostTblNoPairwise, ISD, Ty: MTy)) |
5261 | return (LT.first - 1) + Entry->Cost; |
5262 | break; |
5263 | case ISD::XOR: |
5264 | case ISD::AND: |
5265 | case ISD::OR: |
5266 | const auto *Entry = CostTableLookup(Table: CostTblNoPairwise, ISD, Ty: MTy); |
5267 | if (!Entry) |
5268 | break; |
5269 | auto *ValVTy = cast<FixedVectorType>(Val: ValTy); |
5270 | if (MTy.getVectorNumElements() <= ValVTy->getNumElements() && |
5271 | isPowerOf2_32(Value: ValVTy->getNumElements())) { |
      InstructionCost ExtraCost = 0;
5273 | if (LT.first != 1) { |
5274 | // Type needs to be split, so there is an extra cost of LT.first - 1 |
5275 | // arithmetic ops. |
5276 | auto *Ty = FixedVectorType::get(ElementType: ValTy->getElementType(), |
5277 | NumElts: MTy.getVectorNumElements()); |
5278 | ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind); |
5279 | ExtraCost *= LT.first - 1; |
5280 | } |
5281 | // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov |
5282 | auto Cost = ValVTy->getElementType()->isIntegerTy(Bitwidth: 1) ? 2 : Entry->Cost; |
5283 | return Cost + ExtraCost; |
5284 | } |
5285 | break; |
5286 | } |
5287 | return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind); |
5288 | } |
5289 | |
5290 | InstructionCost AArch64TTIImpl::getExtendedReductionCost( |
5291 | unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy, |
5292 | std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const { |
5293 | EVT VecVT = TLI->getValueType(DL, Ty: VecTy); |
5294 | EVT ResVT = TLI->getValueType(DL, Ty: ResTy); |
5295 | |
5296 | if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() && |
5297 | VecVT.getSizeInBits() >= 64) { |
5298 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VecTy); |
5299 | |
5300 | // The legal cases are: |
5301 | // UADDLV 8/16/32->32 |
5302 | // UADDLP 32->64 |
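    // e.g. a zero-extending add reduction from <8 x i8> to i32 maps to UADDLV,
    // with a cost of (LT.first - 1) * 2 + 2 = 2.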
5303 | unsigned RevVTSize = ResVT.getSizeInBits(); |
5304 | if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) && |
5305 | RevVTSize <= 32) || |
5306 | ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) && |
5307 | RevVTSize <= 32) || |
5308 | ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) && |
5309 | RevVTSize <= 64)) |
5310 | return (LT.first - 1) * 2 + 2; |
5311 | } |
5312 | |
5313 | return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: VecTy, FMF, |
5314 | CostKind); |
5315 | } |
5316 | |
5317 | InstructionCost |
5318 | AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy, |
5319 | VectorType *VecTy, |
5320 | TTI::TargetCostKind CostKind) const { |
5321 | EVT VecVT = TLI->getValueType(DL, Ty: VecTy); |
5322 | EVT ResVT = TLI->getValueType(DL, Ty: ResTy); |
5323 | |
5324 | if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple()) { |
5325 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VecTy); |
5326 | |
5327 | // The legal cases with dotprod are |
5328 | // UDOT 8->32 |
5329 | // Which requires an additional uaddv to sum the i32 values. |
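    // For example, a <16 x i8> mul-accumulate reduction into i32 uses UDOT
    // into a v4i32 accumulator plus a final uaddv, costing LT.first + 2 = 3.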
5330 | if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) && |
5331 | ResVT == MVT::i32) |
5332 | return LT.first + 2; |
5333 | } |
5334 | |
5335 | return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, Ty: VecTy, CostKind); |
5336 | } |
5337 | |
5338 | InstructionCost |
5339 | AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index, |
5340 | TTI::TargetCostKind CostKind) const { |
5341 | static const CostTblEntry ShuffleTbl[] = { |
5342 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv16i8, .Cost: 1 }, |
5343 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv8i16, .Cost: 1 }, |
5344 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv4i32, .Cost: 1 }, |
5345 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv2i64, .Cost: 1 }, |
5346 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv2f16, .Cost: 1 }, |
5347 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv4f16, .Cost: 1 }, |
5348 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv8f16, .Cost: 1 }, |
5349 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv2bf16, .Cost: 1 }, |
5350 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv4bf16, .Cost: 1 }, |
5351 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv8bf16, .Cost: 1 }, |
5352 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv2f32, .Cost: 1 }, |
5353 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv4f32, .Cost: 1 }, |
5354 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv2f64, .Cost: 1 }, |
5355 | }; |
5356 | |
5357 | // The code-generator is currently not able to handle scalable vectors |
5358 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
5359 | // it. This change will be removed when code-generation for these types is |
5360 | // sufficiently reliable. |
5361 | if (Tp->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
5362 | return InstructionCost::getInvalid(); |
5363 | |
5364 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp); |
5365 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: Tp->getContext()); |
5366 | EVT PromotedVT = LT.second.getScalarType() == MVT::i1 |
5367 | ? TLI->getPromotedVTForPredicate(VT: EVT(LT.second)) |
5368 | : LT.second; |
5369 | Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Context&: Tp->getContext()); |
5370 | InstructionCost LegalizationCost = 0; |
5371 | if (Index < 0) { |
5372 | LegalizationCost = |
5373 | getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: PromotedVTy, CondTy: PromotedVTy, |
5374 | VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind) + |
5375 | getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: PromotedVTy, CondTy: LegalVTy, |
5376 | VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind); |
5377 | } |
5378 | |
  // Predicated splices are promoted when lowering, so the cost is computed on
  // the promoted type. See AArch64ISelLowering.cpp.
5381 | if (LT.second.getScalarType() == MVT::i1) { |
5382 | LegalizationCost += |
5383 | getCastInstrCost(Opcode: Instruction::ZExt, Dst: PromotedVTy, Src: LegalVTy, |
5384 | CCH: TTI::CastContextHint::None, CostKind) + |
5385 | getCastInstrCost(Opcode: Instruction::Trunc, Dst: LegalVTy, Src: PromotedVTy, |
5386 | CCH: TTI::CastContextHint::None, CostKind); |
5387 | } |
5388 | const auto *Entry = |
5389 | CostTableLookup(Table: ShuffleTbl, ISD: TTI::SK_Splice, Ty: PromotedVT.getSimpleVT()); |
5390 | assert(Entry && "Illegal Type for Splice" ); |
5391 | LegalizationCost += Entry->Cost; |
5392 | return LegalizationCost * LT.first; |
5393 | } |
5394 | |
5395 | InstructionCost AArch64TTIImpl::getPartialReductionCost( |
5396 | unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, |
5397 | ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, |
5398 | TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp, |
5399 | TTI::TargetCostKind CostKind) const { |
5400 | InstructionCost Invalid = InstructionCost::getInvalid(); |
5401 | InstructionCost Cost(TTI::TCC_Basic); |
5402 | |
5403 | if (CostKind != TTI::TCK_RecipThroughput) |
5404 | return Invalid; |
5405 | |
  // Sub opcodes currently only occur in chained cases; independent partial
  // reduction subtractions are still costed as an add.
5408 | if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) || |
5409 | OpAExtend == TTI::PR_None) |
5410 | return Invalid; |
5411 | |
5412 | // We only support multiply binary operations for now, and for muls we |
5413 | // require the types being extended to be the same. |
5414 | // NOTE: For muls AArch64 supports lowering mixed extensions to a usdot but |
5415 | // only if the i8mm or sve/streaming features are available. |
5416 | if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB || |
5417 | OpBExtend == TTI::PR_None || |
5418 | (OpAExtend != OpBExtend && !ST->hasMatMulInt8() && |
5419 | !ST->isSVEorStreamingSVEAvailable()))) |
5420 | return Invalid; |
5421 | assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) && |
5422 | "Unexpected values for OpBExtend or InputTypeB" ); |
5423 | |
5424 | EVT InputEVT = EVT::getEVT(Ty: InputTypeA); |
5425 | EVT AccumEVT = EVT::getEVT(Ty: AccumType); |
5426 | |
5427 | unsigned VFMinValue = VF.getKnownMinValue(); |
5428 | |
5429 | if (VF.isScalable()) { |
5430 | if (!ST->isSVEorStreamingSVEAvailable()) |
5431 | return Invalid; |
5432 | |
5433 | // Don't accept a partial reduction if the scaled accumulator is vscale x 1, |
5434 | // since we can't lower that type. |
5435 | unsigned Scale = |
5436 | AccumEVT.getScalarSizeInBits() / InputEVT.getScalarSizeInBits(); |
5437 | if (VFMinValue == Scale) |
5438 | return Invalid; |
5439 | } |
5440 | if (VF.isFixed() && |
5441 | (!ST->isNeonAvailable() || !ST->hasDotProd() || AccumEVT == MVT::i64)) |
5442 | return Invalid; |
5443 | |
5444 | if (InputEVT == MVT::i8) { |
5445 | switch (VFMinValue) { |
5446 | default: |
5447 | return Invalid; |
5448 | case 8: |
5449 | if (AccumEVT == MVT::i32) |
5450 | Cost *= 2; |
5451 | else if (AccumEVT != MVT::i64) |
5452 | return Invalid; |
5453 | break; |
5454 | case 16: |
5455 | if (AccumEVT == MVT::i64) |
5456 | Cost *= 2; |
5457 | else if (AccumEVT != MVT::i32) |
5458 | return Invalid; |
5459 | break; |
5460 | } |
5461 | } else if (InputEVT == MVT::i16) { |
5462 | // FIXME: Allow i32 accumulator but increase cost, as we would extend |
5463 | // it to i64. |
5464 | if (VFMinValue != 8 || AccumEVT != MVT::i64) |
5465 | return Invalid; |
5466 | } else |
5467 | return Invalid; |
5468 | |
5469 | return Cost; |
5470 | } |
5471 | |
5472 | InstructionCost |
5473 | AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, |
5474 | VectorType *SrcTy, ArrayRef<int> Mask, |
5475 | TTI::TargetCostKind CostKind, int Index, |
5476 | VectorType *SubTp, ArrayRef<const Value *> Args, |
5477 | const Instruction *CxtI) const { |
5478 | assert((Mask.empty() || DstTy->isScalableTy() || |
5479 | Mask.size() == DstTy->getElementCount().getKnownMinValue()) && |
5480 | "Expected the Mask to match the return size if given" ); |
5481 | assert(SrcTy->getScalarType() == DstTy->getScalarType() && |
5482 | "Expected the same scalar types" ); |
5483 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy); |
5484 | |
5485 | // If we have a Mask, and the LT is being legalized somehow, split the Mask |
5486 | // into smaller vectors and sum the cost of each shuffle. |
5487 | if (!Mask.empty() && isa<FixedVectorType>(Val: SrcTy) && LT.second.isVector() && |
5488 | LT.second.getScalarSizeInBits() * Mask.size() > 128 && |
5489 | SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() && |
5490 | Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) { |
5491 | // Check for LD3/LD4 instructions, which are represented in llvm IR as |
5492 | // deinterleaving-shuffle(load). The shuffle cost could potentially be free, |
5493 | // but we model it with a cost of LT.first so that LD3/LD4 have a higher |
5494 | // cost than just the load. |
5495 | if (Args.size() >= 1 && isa<LoadInst>(Val: Args[0]) && |
5496 | (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 3) || |
5497 | ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 4))) |
5498 | return std::max<InstructionCost>(a: 1, b: LT.first / 4); |
5499 | |
5500 | // Check for ST3/ST4 instructions, which are represented in llvm IR as |
5501 | // store(interleaving-shuffle). The shuffle cost could potentially be free, |
5502 | // but we model it with a cost of LT.first so that ST3/ST4 have a higher |
5503 | // cost than just the store. |
5504 | if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(Val: *CxtI->user_begin()) && |
5505 | (ShuffleVectorInst::isInterleaveMask( |
5506 | Mask, Factor: 4, NumInputElts: SrcTy->getElementCount().getKnownMinValue() * 2) || |
5507 | ShuffleVectorInst::isInterleaveMask( |
5508 | Mask, Factor: 3, NumInputElts: SrcTy->getElementCount().getKnownMinValue() * 2))) |
5509 | return LT.first; |
5510 | |
5511 | unsigned TpNumElts = Mask.size(); |
5512 | unsigned LTNumElts = LT.second.getVectorNumElements(); |
5513 | unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts; |
5514 | VectorType *NTp = VectorType::get(ElementType: SrcTy->getScalarType(), |
5515 | EC: LT.second.getVectorElementCount()); |
5516 | InstructionCost Cost; |
5517 | std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost> |
5518 | PreviousCosts; |
5519 | for (unsigned N = 0; N < NumVecs; N++) { |
5520 | SmallVector<int> NMask; |
5521 | // Split the existing mask into chunks of size LTNumElts. Track the source |
5522 | // sub-vectors to ensure the result has at most 2 inputs. |
5523 | unsigned Source1 = -1U, Source2 = -1U; |
5524 | unsigned NumSources = 0; |
5525 | for (unsigned E = 0; E < LTNumElts; E++) { |
5526 | int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E] |
5527 | : PoisonMaskElem; |
5528 | if (MaskElt < 0) { |
5529 | NMask.push_back(Elt: PoisonMaskElem); |
5530 | continue; |
5531 | } |
5532 | |
5533 | // Calculate which source from the input this comes from and whether it |
5534 | // is new to us. |
5535 | unsigned Source = MaskElt / LTNumElts; |
5536 | if (NumSources == 0) { |
5537 | Source1 = Source; |
5538 | NumSources = 1; |
5539 | } else if (NumSources == 1 && Source != Source1) { |
5540 | Source2 = Source; |
5541 | NumSources = 2; |
5542 | } else if (NumSources >= 2 && Source != Source1 && Source != Source2) { |
5543 | NumSources++; |
5544 | } |
5545 | |
5546 | // Add to the new mask. For the NumSources>2 case these are not correct, |
5547 | // but are only used for the modular lane number. |
5548 | if (Source == Source1) |
5549 | NMask.push_back(Elt: MaskElt % LTNumElts); |
5550 | else if (Source == Source2) |
5551 | NMask.push_back(Elt: MaskElt % LTNumElts + LTNumElts); |
5552 | else |
5553 | NMask.push_back(Elt: MaskElt % LTNumElts); |
5554 | } |
5555 | // Check if we have already generated this sub-shuffle, which means we |
5556 | // will have already generated the output. For example a <16 x i32> splat |
5557 | // will be the same sub-splat 4 times, which only needs to be generated |
5558 | // once and reused. |
5559 | auto Result = |
5560 | PreviousCosts.insert(x: {std::make_tuple(args&: Source1, args&: Source2, args&: NMask), 0}); |
5561 | // Check if it was already in the map (already costed). |
5562 | if (!Result.second) |
5563 | continue; |
5564 | // If the sub-mask has at most 2 input sub-vectors then re-cost it using |
5565 | // getShuffleCost. If not then cost it using the worst case as the number |
5566 | // of element moves into a new vector. |
5567 | InstructionCost NCost = |
5568 | NumSources <= 2 |
5569 | ? getShuffleCost(Kind: NumSources <= 1 ? TTI::SK_PermuteSingleSrc |
5570 | : TTI::SK_PermuteTwoSrc, |
5571 | DstTy: NTp, SrcTy: NTp, Mask: NMask, CostKind, Index: 0, SubTp: nullptr, Args, |
5572 | CxtI) |
5573 | : LTNumElts; |
5574 | Result.first->second = NCost; |
5575 | Cost += NCost; |
5576 | } |
5577 | return Cost; |
5578 | } |
5579 | |
5580 | Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp); |
  bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
5582 | // A subvector extract can be implemented with an ext (or trivial extract, if |
5583 | // from lane 0). This currently only handles low or high extracts to prevent |
5584 | // SLP vectorizer regressions. |
5585 | if (IsExtractSubvector && LT.second.isFixedLengthVector()) { |
5586 | if (LT.second.is128BitVector() && |
5587 | cast<FixedVectorType>(Val: SubTp)->getNumElements() == |
5588 | LT.second.getVectorNumElements() / 2) { |
5589 | if (Index == 0) |
5590 | return 0; |
5591 | if (Index == (int)LT.second.getVectorNumElements() / 2) |
5592 | return 1; |
5593 | } |
5594 | Kind = TTI::SK_PermuteSingleSrc; |
5595 | } |
5596 | // FIXME: This was added to keep the costs equal when adding DstTys. Update |
5597 | // the code to handle length-changing shuffles. |
5598 | if (Kind == TTI::SK_InsertSubvector) { |
5599 | LT = getTypeLegalizationCost(Ty: DstTy); |
5600 | SrcTy = DstTy; |
5601 | } |
5602 | |
5603 | // Segmented shuffle matching. |
5604 | if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(Val: SrcTy) && |
5605 | !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() && |
5606 | SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf( |
5607 | RHS: AArch64::SVEBitsPerBlock)) { |
5608 | |
5609 | FixedVectorType *VTy = cast<FixedVectorType>(Val: SrcTy); |
5610 | unsigned Segments = |
5611 | VTy->getPrimitiveSizeInBits() / AArch64::SVEBitsPerBlock; |
5612 | unsigned SegmentElts = VTy->getNumElements() / Segments; |
5613 | |
5614 | // dupq zd.t, zn.t[idx] |
5615 | if ((ST->hasSVE2p1() || ST->hasSME2p1()) && |
5616 | ST->isSVEorStreamingSVEAvailable() && |
5617 | isDUPQMask(Mask, Segments, SegmentSize: SegmentElts)) |
5618 | return LT.first; |
5619 | |
5620 | // mov zd.q, vn |
5621 | if (ST->isSVEorStreamingSVEAvailable() && |
5622 | isDUPFirstSegmentMask(Mask, Segments, SegmentSize: SegmentElts)) |
5623 | return LT.first; |
5624 | } |
5625 | |
5626 | // Check for broadcast loads, which are supported by the LD1R instruction. |
5627 | // In terms of code-size, the shuffle vector is free when a load + dup get |
5628 | // folded into a LD1R. That's what we check and return here. For performance |
5629 | // and reciprocal throughput, a LD1R is not completely free. In this case, we |
5630 | // return the cost for the broadcast below (i.e. 1 for most/all types), so |
5631 | // that we model the load + dup sequence slightly higher because LD1R is a |
5632 | // high latency instruction. |
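  // For example, broadcasting a loaded i32 to <4 x i32> folds into a single
  // 'ld1r { v0.4s }, [x0]', so the shuffle itself is free for code size.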
5633 | if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) { |
5634 | bool IsLoad = !Args.empty() && isa<LoadInst>(Val: Args[0]); |
5635 | if (IsLoad && LT.second.isVector() && |
5636 | isLegalBroadcastLoad(ElementTy: SrcTy->getElementType(), |
5637 | NumElements: LT.second.getVectorElementCount())) |
5638 | return 0; |
5639 | } |
5640 | |
5641 | // If we have 4 elements for the shuffle and a Mask, get the cost straight |
5642 | // from the perfect shuffle tables. |
5643 | if (Mask.size() == 4 && |
5644 | SrcTy->getElementCount() == ElementCount::getFixed(MinVal: 4) && |
5645 | (SrcTy->getScalarSizeInBits() == 16 || |
5646 | SrcTy->getScalarSizeInBits() == 32) && |
5647 | all_of(Range&: Mask, P: [](int E) { return E < 8; })) |
5648 | return getPerfectShuffleCost(M: Mask); |
5649 | |
5650 | // Check for identity masks, which we can treat as free. |
5651 | if (!Mask.empty() && LT.second.isFixedLengthVector() && |
5652 | (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) && |
5653 | all_of(Range: enumerate(First&: Mask), P: [](const auto &M) { |
5654 | return M.value() < 0 || M.value() == (int)M.index(); |
5655 | })) |
5656 | return 0; |
5657 | |
5658 | // Check for other shuffles that are not SK_ kinds but we have native |
5659 | // instructions for, for example ZIP and UZP. |
5660 | unsigned Unused; |
5661 | if (LT.second.isFixedLengthVector() && |
5662 | LT.second.getVectorNumElements() == Mask.size() && |
5663 | (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) && |
5664 | (isZIPMask(M: Mask, NumElts: LT.second.getVectorNumElements(), WhichResultOut&: Unused) || |
5665 | isUZPMask(M: Mask, NumElts: LT.second.getVectorNumElements(), WhichResultOut&: Unused) || |
5666 | isREVMask(M: Mask, EltSize: LT.second.getScalarSizeInBits(), |
5667 | NumElts: LT.second.getVectorNumElements(), BlockSize: 16) || |
5668 | isREVMask(M: Mask, EltSize: LT.second.getScalarSizeInBits(), |
5669 | NumElts: LT.second.getVectorNumElements(), BlockSize: 32) || |
5670 | isREVMask(M: Mask, EltSize: LT.second.getScalarSizeInBits(), |
5671 | NumElts: LT.second.getVectorNumElements(), BlockSize: 64) || |
5672 | // Check for non-zero lane splats |
5673 | all_of(Range: drop_begin(RangeOrContainer&: Mask), |
5674 | P: [&Mask](int M) { return M < 0 || M == Mask[0]; }))) |
5675 | return 1; |
5676 | |
5677 | if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || |
5678 | Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || |
5679 | Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) { |
5680 | static const CostTblEntry ShuffleTbl[] = { |
5681 | // Broadcast shuffle kinds can be performed with 'dup'. |
5682 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i8, .Cost: 1}, |
5683 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16i8, .Cost: 1}, |
5684 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i16, .Cost: 1}, |
5685 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i16, .Cost: 1}, |
5686 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i32, .Cost: 1}, |
5687 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i32, .Cost: 1}, |
5688 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i64, .Cost: 1}, |
5689 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4f16, .Cost: 1}, |
5690 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8f16, .Cost: 1}, |
5691 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4bf16, .Cost: 1}, |
5692 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8bf16, .Cost: 1}, |
5693 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2f32, .Cost: 1}, |
5694 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4f32, .Cost: 1}, |
5695 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2f64, .Cost: 1}, |
5696 | // Transpose shuffle kinds can be performed with 'trn1/trn2' and |
5697 | // 'zip1/zip2' instructions. |
5698 | {.ISD: TTI::SK_Transpose, .Type: MVT::v8i8, .Cost: 1}, |
5699 | {.ISD: TTI::SK_Transpose, .Type: MVT::v16i8, .Cost: 1}, |
5700 | {.ISD: TTI::SK_Transpose, .Type: MVT::v4i16, .Cost: 1}, |
5701 | {.ISD: TTI::SK_Transpose, .Type: MVT::v8i16, .Cost: 1}, |
5702 | {.ISD: TTI::SK_Transpose, .Type: MVT::v2i32, .Cost: 1}, |
5703 | {.ISD: TTI::SK_Transpose, .Type: MVT::v4i32, .Cost: 1}, |
5704 | {.ISD: TTI::SK_Transpose, .Type: MVT::v2i64, .Cost: 1}, |
5705 | {.ISD: TTI::SK_Transpose, .Type: MVT::v4f16, .Cost: 1}, |
5706 | {.ISD: TTI::SK_Transpose, .Type: MVT::v8f16, .Cost: 1}, |
5707 | {.ISD: TTI::SK_Transpose, .Type: MVT::v4bf16, .Cost: 1}, |
5708 | {.ISD: TTI::SK_Transpose, .Type: MVT::v8bf16, .Cost: 1}, |
5709 | {.ISD: TTI::SK_Transpose, .Type: MVT::v2f32, .Cost: 1}, |
5710 | {.ISD: TTI::SK_Transpose, .Type: MVT::v4f32, .Cost: 1}, |
5711 | {.ISD: TTI::SK_Transpose, .Type: MVT::v2f64, .Cost: 1}, |
5712 | // Select shuffle kinds. |
5713 | // TODO: handle vXi8/vXi16. |
5714 | {.ISD: TTI::SK_Select, .Type: MVT::v2i32, .Cost: 1}, // mov. |
5715 | {.ISD: TTI::SK_Select, .Type: MVT::v4i32, .Cost: 2}, // rev+trn (or similar). |
5716 | {.ISD: TTI::SK_Select, .Type: MVT::v2i64, .Cost: 1}, // mov. |
5717 | {.ISD: TTI::SK_Select, .Type: MVT::v2f32, .Cost: 1}, // mov. |
5718 | {.ISD: TTI::SK_Select, .Type: MVT::v4f32, .Cost: 2}, // rev+trn (or similar). |
5719 | {.ISD: TTI::SK_Select, .Type: MVT::v2f64, .Cost: 1}, // mov. |
5720 | // PermuteSingleSrc shuffle kinds. |
5721 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i32, .Cost: 1}, // mov. |
5722 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i32, .Cost: 3}, // perfectshuffle worst case. |
5723 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i64, .Cost: 1}, // mov. |
5724 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f32, .Cost: 1}, // mov. |
5725 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f32, .Cost: 3}, // perfectshuffle worst case. |
5726 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f64, .Cost: 1}, // mov. |
5727 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i16, .Cost: 3}, // perfectshuffle worst case. |
5728 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f16, .Cost: 3}, // perfectshuffle worst case. |
5729 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4bf16, .Cost: 3}, // same |
5730 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i16, .Cost: 8}, // constpool + load + tbl |
5731 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f16, .Cost: 8}, // constpool + load + tbl |
5732 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8bf16, .Cost: 8}, // constpool + load + tbl |
5733 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i8, .Cost: 8}, // constpool + load + tbl |
5734 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i8, .Cost: 8}, // constpool + load + tbl |
5735 | // Reverse can be lowered with `rev`. |
5736 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2i32, .Cost: 1}, // REV64 |
5737 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4i32, .Cost: 2}, // REV64; EXT |
5738 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2i64, .Cost: 1}, // EXT |
5739 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2f32, .Cost: 1}, // REV64 |
5740 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4f32, .Cost: 2}, // REV64; EXT |
5741 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2f64, .Cost: 1}, // EXT |
5742 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8f16, .Cost: 2}, // REV64; EXT |
5743 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8bf16, .Cost: 2}, // REV64; EXT |
5744 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8i16, .Cost: 2}, // REV64; EXT |
5745 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16i8, .Cost: 2}, // REV64; EXT |
5746 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4f16, .Cost: 1}, // REV64 |
5747 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4bf16, .Cost: 1}, // REV64 |
5748 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4i16, .Cost: 1}, // REV64 |
5749 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8i8, .Cost: 1}, // REV64 |
5750 | // Splice can all be lowered as `ext`. |
5751 | {.ISD: TTI::SK_Splice, .Type: MVT::v2i32, .Cost: 1}, |
5752 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i32, .Cost: 1}, |
5753 | {.ISD: TTI::SK_Splice, .Type: MVT::v2i64, .Cost: 1}, |
5754 | {.ISD: TTI::SK_Splice, .Type: MVT::v2f32, .Cost: 1}, |
5755 | {.ISD: TTI::SK_Splice, .Type: MVT::v4f32, .Cost: 1}, |
5756 | {.ISD: TTI::SK_Splice, .Type: MVT::v2f64, .Cost: 1}, |
5757 | {.ISD: TTI::SK_Splice, .Type: MVT::v8f16, .Cost: 1}, |
5758 | {.ISD: TTI::SK_Splice, .Type: MVT::v8bf16, .Cost: 1}, |
5759 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i16, .Cost: 1}, |
5760 | {.ISD: TTI::SK_Splice, .Type: MVT::v16i8, .Cost: 1}, |
5761 | {.ISD: TTI::SK_Splice, .Type: MVT::v4f16, .Cost: 1}, |
5762 | {.ISD: TTI::SK_Splice, .Type: MVT::v4bf16, .Cost: 1}, |
5763 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i16, .Cost: 1}, |
5764 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i8, .Cost: 1}, |
5765 | // Broadcast shuffle kinds for scalable vectors |
5766 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv16i8, .Cost: 1}, |
5767 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8i16, .Cost: 1}, |
5768 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4i32, .Cost: 1}, |
5769 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2i64, .Cost: 1}, |
5770 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f16, .Cost: 1}, |
5771 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4f16, .Cost: 1}, |
5772 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8f16, .Cost: 1}, |
5773 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2bf16, .Cost: 1}, |
5774 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4bf16, .Cost: 1}, |
5775 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8bf16, .Cost: 1}, |
5776 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f32, .Cost: 1}, |
5777 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4f32, .Cost: 1}, |
5778 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f64, .Cost: 1}, |
5779 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv16i1, .Cost: 1}, |
5780 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8i1, .Cost: 1}, |
5781 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4i1, .Cost: 1}, |
5782 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2i1, .Cost: 1}, |
5783 | // Handle the cases for vector.reverse with scalable vectors |
5784 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv16i8, .Cost: 1}, |
5785 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8i16, .Cost: 1}, |
5786 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4i32, .Cost: 1}, |
5787 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2i64, .Cost: 1}, |
5788 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f16, .Cost: 1}, |
5789 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4f16, .Cost: 1}, |
5790 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8f16, .Cost: 1}, |
5791 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2bf16, .Cost: 1}, |
5792 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4bf16, .Cost: 1}, |
5793 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8bf16, .Cost: 1}, |
5794 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f32, .Cost: 1}, |
5795 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4f32, .Cost: 1}, |
5796 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f64, .Cost: 1}, |
5797 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv16i1, .Cost: 1}, |
5798 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8i1, .Cost: 1}, |
5799 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4i1, .Cost: 1}, |
5800 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2i1, .Cost: 1}, |
5801 | }; |
5802 | if (const auto *Entry = CostTableLookup(Table: ShuffleTbl, ISD: Kind, Ty: LT.second)) |
5803 | return LT.first * Entry->Cost; |
5804 | } |
5805 | |
5806 | if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Val: SrcTy)) |
5807 | return getSpliceCost(Tp: SrcTy, Index, CostKind); |
5808 | |
5809 | // Inserting a subvector can often be done with either a D, S or H register |
5810 | // move, so long as the inserted vector is "aligned". |
5811 | if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() && |
5812 | LT.second.getSizeInBits() <= 128 && SubTp) { |
5813 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(Ty: SubTp); |
5814 | if (SubLT.second.isVector()) { |
5815 | int NumElts = LT.second.getVectorNumElements(); |
5816 | int NumSubElts = SubLT.second.getVectorNumElements(); |
5817 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) |
5818 | return SubLT.first; |
5819 | } |
5820 | } |
5821 | |
5822 | // Restore optimal kind. |
5823 | if (IsExtractSubvector) |
5824 | Kind = TTI::SK_ExtractSubvector; |
5825 | return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp, |
5826 | Args, CxtI); |
5827 | } |
5828 | |
5829 | static bool containsDecreasingPointers(Loop *TheLoop, |
5830 | PredicatedScalarEvolution *PSE) { |
5831 | const auto &Strides = DenseMap<Value *, const SCEV *>(); |
5832 | for (BasicBlock *BB : TheLoop->blocks()) { |
5833 | // Scan the instructions in the block and look for addresses that are |
5834 | // consecutive and decreasing. |
5835 | for (Instruction &I : *BB) { |
5836 | if (isa<LoadInst>(Val: &I) || isa<StoreInst>(Val: &I)) { |
5837 | Value *Ptr = getLoadStorePointerOperand(V: &I); |
5838 | Type *AccessTy = getLoadStoreType(I: &I); |
5839 | if (getPtrStride(PSE&: *PSE, AccessTy, Ptr, Lp: TheLoop, StridesMap: Strides, /*Assume=*/true, |
5840 | /*ShouldCheckWrap=*/false) |
5841 | .value_or(u: 0) < 0) |
5842 | return true; |
5843 | } |
5844 | } |
5845 | } |
5846 | return false; |
5847 | } |
5848 | |
5849 | bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost() const { |
5850 | if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences()) |
5851 | return SVEPreferFixedOverScalableIfEqualCost; |
5852 | return ST->useFixedOverScalableIfEqualCost(); |
5853 | } |
5854 | |
5855 | unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const { |
5856 | return ST->getEpilogueVectorizationMinVF(); |
5857 | } |
5858 | |
5859 | bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { |
5860 | if (!ST->hasSVE()) |
5861 | return false; |
5862 | |
5863 | // We don't currently support vectorisation with interleaving for SVE - with |
5864 | // such loops we're better off not using tail-folding. This gives us a chance |
5865 | // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc. |
5866 | if (TFI->IAI->hasGroups()) |
5867 | return false; |
5868 | |
5869 | TailFoldingOpts Required = TailFoldingOpts::Disabled; |
5870 | if (TFI->LVL->getReductionVars().size()) |
5871 | Required |= TailFoldingOpts::Reductions; |
5872 | if (TFI->LVL->getFixedOrderRecurrences().size()) |
5873 | Required |= TailFoldingOpts::Recurrences; |
5874 | |
5875 | // We call this to discover whether any load/store pointers in the loop have |
5876 | // negative strides. This will require extra work to reverse the loop |
5877 | // predicate, which may be expensive. |
5878 | if (containsDecreasingPointers(TheLoop: TFI->LVL->getLoop(), |
5879 | PSE: TFI->LVL->getPredicatedScalarEvolution())) |
5880 | Required |= TailFoldingOpts::Reverse; |
5881 | if (Required == TailFoldingOpts::Disabled) |
5882 | Required |= TailFoldingOpts::Simple; |
5883 | |
5884 | if (!TailFoldingOptionLoc.satisfies(DefaultBits: ST->getSVETailFoldingDefaultOpts(), |
5885 | Required)) |
5886 | return false; |
5887 | |
5888 | // Don't tail-fold for tight loops where we would be better off interleaving |
5889 | // with an unpredicated loop. |
5890 | unsigned NumInsns = 0; |
5891 | for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) { |
5892 | NumInsns += BB->sizeWithoutDebug(); |
5893 | } |
5894 | |
5895 | // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
5896 | return NumInsns >= SVETailFoldInsnThreshold; |
5897 | } |
5898 | |
5899 | InstructionCost |
5900 | AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, |
5901 | StackOffset BaseOffset, bool HasBaseReg, |
5902 | int64_t Scale, unsigned AddrSpace) const { |
5903 | // Scaling factors are not free at all. |
5904 | // Operands | Rt Latency |
5905 | // ------------------------------------------- |
5906 | // Rt, [Xn, Xm] | 4 |
5907 | // ------------------------------------------- |
5908 | // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 |
5909 | // Rt, [Xn, Wm, <extend> #imm] | |
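// For example, "ldr x0, [x1, x2, lsl #3]" uses a scaled register offset
// (Scale == 8), so it is modelled below as costing one extra unit, whereas
// "ldr x0, [x1, x2]" (Scale == 1) is treated as free.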
5910 | TargetLoweringBase::AddrMode AM; |
5911 | AM.BaseGV = BaseGV; |
5912 | AM.BaseOffs = BaseOffset.getFixed(); |
5913 | AM.HasBaseReg = HasBaseReg; |
5914 | AM.Scale = Scale; |
5915 | AM.ScalableOffset = BaseOffset.getScalable(); |
5916 | if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AS: AddrSpace)) |
5917 | // Scale represents reg2 * scale, thus account for 1 if |
5918 | // it is not equal to 0 or 1. |
5919 | return AM.Scale != 0 && AM.Scale != 1; |
5920 | return InstructionCost::getInvalid(); |
5921 | } |
5922 | |
5923 | bool AArch64TTIImpl::shouldTreatInstructionLikeSelect( |
5924 | const Instruction *I) const { |
5925 | if (EnableOrLikeSelectOpt) { |
5926 | // For the binary operators (e.g. or) we need to be more careful than with
5927 | // selects; here we only transform them if they are already at a natural
5928 | // break point in the code - the end of a block with an unconditional
5929 | // terminator.
5930 | if (I->getOpcode() == Instruction::Or && |
5931 | isa<BranchInst>(Val: I->getNextNode()) && |
5932 | cast<BranchInst>(Val: I->getNextNode())->isUnconditional()) |
5933 | return true; |
5934 | |
5935 | if (I->getOpcode() == Instruction::Add || |
5936 | I->getOpcode() == Instruction::Sub) |
5937 | return true; |
5938 | } |
5939 | return BaseT::shouldTreatInstructionLikeSelect(I); |
5940 | } |
5941 | |
5942 | bool AArch64TTIImpl::isLSRCostLess( |
5943 | const TargetTransformInfo::LSRCost &C1, |
5944 | const TargetTransformInfo::LSRCost &C2) const { |
5945 | // What is AArch64-specific here is adding the number of instructions to
5946 | // the comparison (though not as the first consideration, as some targets
5947 | // do), along with changing the priority of the base additions.
5948 | // TODO: Maybe a more nuanced tradeoff between instruction count |
5949 | // and number of registers? To be investigated at a later date. |
5950 | if (EnableLSRCostOpt) |
5951 | return std::tie(args: C1.NumRegs, args: C1.Insns, args: C1.NumBaseAdds, args: C1.AddRecCost, |
5952 | args: C1.NumIVMuls, args: C1.ScaleCost, args: C1.ImmCost, args: C1.SetupCost) < |
5953 | std::tie(args: C2.NumRegs, args: C2.Insns, args: C2.NumBaseAdds, args: C2.AddRecCost, |
5954 | args: C2.NumIVMuls, args: C2.ScaleCost, args: C2.ImmCost, args: C2.SetupCost); |
5955 | |
5956 | return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); |
5957 | } |
5958 | |
5959 | static bool isSplatShuffle(Value *V) { |
5960 | if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Val: V)) |
5961 | return all_equal(Range: Shuf->getShuffleMask()); |
5962 | return false; |
5963 | } |
5964 | |
5965 | /// Check if both Op1 and Op2 are shufflevector extracts of either the lower |
5966 | /// or upper half of the vector elements. |
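/// For example, with <8 x i16> inputs this matches pairs of shuffles that
/// both take the same half of their source, e.g. two
///   shufflevector <8 x i16> %x, <8 x i16> poison,
///                 <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// operands that each extract the upper four elements.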
5967 | static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
5968 | bool AllowSplat = false) {
5969 | // Scalable vector types cannot be extract shuffle vectors.
5970 | if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy()) |
5971 | return false; |
5972 | |
5973 | auto areTypesHalfed = [](Value *FullV, Value *HalfV) { |
5974 | auto *FullTy = FullV->getType(); |
5975 | auto *HalfTy = HalfV->getType(); |
5976 | return FullTy->getPrimitiveSizeInBits().getFixedValue() == |
5977 | 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue(); |
5978 | }; |
5979 | |
5980 | auto extractHalf = [](Value *FullV, Value *HalfV) {
5981 | auto *FullVT = cast<FixedVectorType>(Val: FullV->getType()); |
5982 | auto *HalfVT = cast<FixedVectorType>(Val: HalfV->getType()); |
5983 | return FullVT->getNumElements() == 2 * HalfVT->getNumElements(); |
5984 | }; |
5985 | |
5986 | ArrayRef<int> M1, M2; |
5987 | Value *S1Op1 = nullptr, *S2Op1 = nullptr; |
5988 | if (!match(V: Op1, P: m_Shuffle(v1: m_Value(V&: S1Op1), v2: m_Undef(), mask: m_Mask(M1))) || |
5989 | !match(V: Op2, P: m_Shuffle(v1: m_Value(V&: S2Op1), v2: m_Undef(), mask: m_Mask(M2)))) |
5990 | return false; |
5991 | |
5992 | // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that |
5993 | // it is not checked as an extract below. |
5994 | if (AllowSplat && isSplatShuffle(V: Op1)) |
5995 | S1Op1 = nullptr; |
5996 | if (AllowSplat && isSplatShuffle(V: Op2)) |
5997 | S2Op1 = nullptr; |
5998 | |
5999 | // Check that the operands are half as wide as the result and we extract |
6000 | // half of the elements of the input vectors. |
6001 | if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) || |
6002 | (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2)))) |
6003 | return false; |
6004 | |
6005 | // Check the mask extracts either the lower or upper half of vector |
6006 | // elements. |
6007 | int M1Start = 0; |
6008 | int M2Start = 0; |
6009 | int NumElements = cast<FixedVectorType>(Val: Op1->getType())->getNumElements() * 2; |
6010 | if ((S1Op1 && |
6011 | !ShuffleVectorInst::isExtractSubvectorMask(Mask: M1, NumSrcElts: NumElements, Index&: M1Start)) || |
6012 | (S2Op1 && |
6013 | !ShuffleVectorInst::isExtractSubvectorMask(Mask: M2, NumSrcElts: NumElements, Index&: M2Start))) |
6014 | return false; |
6015 | |
6016 | if ((M1Start != 0 && M1Start != (NumElements / 2)) || |
6017 | (M2Start != 0 && M2Start != (NumElements / 2))) |
6018 | return false; |
6019 | if (S1Op1 && S2Op1 && M1Start != M2Start) |
6020 | return false; |
6021 | |
6022 | return true; |
6023 | } |
6024 | |
6025 | /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth |
6026 | /// of the vector elements. |
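/// For example, "sext <4 x i16> %a to <4 x i32>" paired with
/// "zext <4 x i16> %b to <4 x i32>" would be accepted, since both widen the
/// elements from 16 to 32 bits.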
6027 | static bool areExtractExts(Value *Ext1, Value *Ext2) {
6028 | auto areExtDoubled = [](Instruction *Ext) { |
6029 | return Ext->getType()->getScalarSizeInBits() == |
6030 | 2 * Ext->getOperand(i: 0)->getType()->getScalarSizeInBits(); |
6031 | }; |
6032 | |
6033 | if (!match(V: Ext1, P: m_ZExtOrSExt(Op: m_Value())) || |
6034 | !match(V: Ext2, P: m_ZExtOrSExt(Op: m_Value())) || |
6035 | !areExtDoubled(cast<Instruction>(Val: Ext1)) || |
6036 | !areExtDoubled(cast<Instruction>(Val: Ext2))) |
6037 | return false; |
6038 | |
6039 | return true; |
6040 | } |
6041 | |
6042 | /// Check if Op could be used with vmull_high_p64 intrinsic. |
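/// That is, Op should be an extract of lane 1 of a 2-element fixed vector,
/// e.g. "extractelement <2 x i64> %v, i64 1", matching the high half that
/// pmull2 reads.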
6043 | static bool isOperandOfVmullHighP64(Value *Op) { |
6044 | Value *VectorOperand = nullptr; |
6045 | ConstantInt *ElementIndex = nullptr; |
6046 | return match(V: Op, P: m_ExtractElt(Val: m_Value(V&: VectorOperand), |
6047 | Idx: m_ConstantInt(CI&: ElementIndex))) && |
6048 | ElementIndex->getValue() == 1 && |
6049 | isa<FixedVectorType>(Val: VectorOperand->getType()) && |
6050 | cast<FixedVectorType>(Val: VectorOperand->getType())->getNumElements() == 2; |
6051 | } |
6052 | |
6053 | /// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic. |
6054 | static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) { |
6055 | return isOperandOfVmullHighP64(Op: Op1) && isOperandOfVmullHighP64(Op: Op2); |
6056 | } |
6057 | |
6058 | static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) { |
6059 | // Restrict ourselves to the form CodeGenPrepare typically constructs. |
6060 | auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptrs); |
6061 | if (!GEP || GEP->getNumOperands() != 2) |
6062 | return false; |
6063 | |
6064 | Value *Base = GEP->getOperand(i_nocapture: 0); |
6065 | Value *Offsets = GEP->getOperand(i_nocapture: 1); |
6066 | |
6067 | // We only care about scalar_base+vector_offsets. |
6068 | if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy()) |
6069 | return false; |
6070 | |
6071 | // Sink extends that would allow us to use 32-bit offset vectors. |
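// For example, if the offsets are "zext <vscale x 4 x i32> %idx to
// <vscale x 4 x i64>", sinking the extend (together with the GEP) down to the
// gather/scatter lets it use the 32-bit (sxtw/uxtw) offset addressing forms.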
6072 | if (isa<SExtInst>(Val: Offsets) || isa<ZExtInst>(Val: Offsets)) { |
6073 | auto *OffsetsInst = cast<Instruction>(Val: Offsets); |
6074 | if (OffsetsInst->getType()->getScalarSizeInBits() > 32 && |
6075 | OffsetsInst->getOperand(i: 0)->getType()->getScalarSizeInBits() <= 32) |
6076 | Ops.push_back(Elt: &GEP->getOperandUse(i: 1)); |
6077 | } |
6078 | |
6079 | // Sink the GEP. |
6080 | return true; |
6081 | } |
6082 | |
6083 | /// We want to sink following cases: |
6084 | /// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale; |
6085 | /// (add|sub|gep) A, ((mul|shl) zext(vscale), imm); |
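/// For example, in IR such as:
///   %vs  = call i64 @llvm.vscale.i64()
///   %mul = shl i64 %vs, 4
///   %gep = getelementptr i8, ptr %base, i64 %mul
/// sinking %vs (and the shl) next to the GEP lets instruction selection fold
/// the scalable offset into the address computation (e.g. via addvl).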
6086 | static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) { |
6087 | if (match(V: Op, P: m_VScale())) |
6088 | return true; |
6089 | if (match(V: Op, P: m_Shl(L: m_VScale(), R: m_ConstantInt())) || |
6090 | match(V: Op, P: m_Mul(L: m_VScale(), R: m_ConstantInt()))) { |
6091 | Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: 0)); |
6092 | return true; |
6093 | } |
6094 | if (match(V: Op, P: m_Shl(L: m_ZExt(Op: m_VScale()), R: m_ConstantInt())) || |
6095 | match(V: Op, P: m_Mul(L: m_ZExt(Op: m_VScale()), R: m_ConstantInt()))) { |
6096 | Value *ZExtOp = cast<Instruction>(Val: Op)->getOperand(i: 0); |
6097 | Ops.push_back(Elt: &cast<Instruction>(Val: ZExtOp)->getOperandUse(i: 0)); |
6098 | Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: 0)); |
6099 | return true; |
6100 | } |
6101 | return false; |
6102 | } |
6103 | |
6104 | /// Check if sinking \p I's operands to I's basic block is profitable, because |
6105 | /// the operands can be folded into a target instruction, e.g. |
6106 | /// shufflevector extracts and/or sext/zext can be folded into (u,s)subl(2).
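/// For example, sinking the extract-high shuffles feeding both operands of an
/// @llvm.aarch64.neon.smull call into the call's block allows it to be
/// selected as a single smull2 rather than a separate lane extract plus smull.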
6107 | bool AArch64TTIImpl::isProfitableToSinkOperands( |
6108 | Instruction *I, SmallVectorImpl<Use *> &Ops) const { |
6109 | if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: I)) { |
6110 | switch (II->getIntrinsicID()) { |
6111 | case Intrinsic::aarch64_neon_smull: |
6112 | case Intrinsic::aarch64_neon_umull: |
6113 | if (areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1), |
6114 | /*AllowSplat=*/true)) { |
6115 | Ops.push_back(Elt: &II->getOperandUse(i: 0)); |
6116 | Ops.push_back(Elt: &II->getOperandUse(i: 1)); |
6117 | return true; |
6118 | } |
6119 | [[fallthrough]]; |
6120 | |
6121 | case Intrinsic::fma: |
6122 | case Intrinsic::fmuladd: |
6123 | if (isa<VectorType>(Val: I->getType()) && |
6124 | cast<VectorType>(Val: I->getType())->getElementType()->isHalfTy() && |
6125 | !ST->hasFullFP16()) |
6126 | return false; |
6127 | [[fallthrough]]; |
6128 | case Intrinsic::aarch64_neon_sqdmull: |
6129 | case Intrinsic::aarch64_neon_sqdmulh: |
6130 | case Intrinsic::aarch64_neon_sqrdmulh: |
6131 | // Sink splats for index lane variants |
6132 | if (isSplatShuffle(V: II->getOperand(i_nocapture: 0))) |
6133 | Ops.push_back(Elt: &II->getOperandUse(i: 0)); |
6134 | if (isSplatShuffle(V: II->getOperand(i_nocapture: 1))) |
6135 | Ops.push_back(Elt: &II->getOperandUse(i: 1)); |
6136 | return !Ops.empty(); |
6137 | case Intrinsic::aarch64_neon_fmlal: |
6138 | case Intrinsic::aarch64_neon_fmlal2: |
6139 | case Intrinsic::aarch64_neon_fmlsl: |
6140 | case Intrinsic::aarch64_neon_fmlsl2: |
6141 | // Sink splats for index lane variants |
6142 | if (isSplatShuffle(V: II->getOperand(i_nocapture: 1))) |
6143 | Ops.push_back(Elt: &II->getOperandUse(i: 1)); |
6144 | if (isSplatShuffle(V: II->getOperand(i_nocapture: 2))) |
6145 | Ops.push_back(Elt: &II->getOperandUse(i: 2)); |
6146 | return !Ops.empty(); |
6147 | case Intrinsic::aarch64_sve_ptest_first: |
6148 | case Intrinsic::aarch64_sve_ptest_last: |
6149 | if (auto *IIOp = dyn_cast<IntrinsicInst>(Val: II->getOperand(i_nocapture: 0))) |
6150 | if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue) |
6151 | Ops.push_back(Elt: &II->getOperandUse(i: 0)); |
6152 | return !Ops.empty(); |
6153 | case Intrinsic::aarch64_sme_write_horiz: |
6154 | case Intrinsic::aarch64_sme_write_vert: |
6155 | case Intrinsic::aarch64_sme_writeq_horiz: |
6156 | case Intrinsic::aarch64_sme_writeq_vert: { |
6157 | auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 1)); |
6158 | if (!Idx || Idx->getOpcode() != Instruction::Add) |
6159 | return false; |
6160 | Ops.push_back(Elt: &II->getOperandUse(i: 1)); |
6161 | return true; |
6162 | } |
6163 | case Intrinsic::aarch64_sme_read_horiz: |
6164 | case Intrinsic::aarch64_sme_read_vert: |
6165 | case Intrinsic::aarch64_sme_readq_horiz: |
6166 | case Intrinsic::aarch64_sme_readq_vert: |
6167 | case Intrinsic::aarch64_sme_ld1b_vert: |
6168 | case Intrinsic::aarch64_sme_ld1h_vert: |
6169 | case Intrinsic::aarch64_sme_ld1w_vert: |
6170 | case Intrinsic::aarch64_sme_ld1d_vert: |
6171 | case Intrinsic::aarch64_sme_ld1q_vert: |
6172 | case Intrinsic::aarch64_sme_st1b_vert: |
6173 | case Intrinsic::aarch64_sme_st1h_vert: |
6174 | case Intrinsic::aarch64_sme_st1w_vert: |
6175 | case Intrinsic::aarch64_sme_st1d_vert: |
6176 | case Intrinsic::aarch64_sme_st1q_vert: |
6177 | case Intrinsic::aarch64_sme_ld1b_horiz: |
6178 | case Intrinsic::aarch64_sme_ld1h_horiz: |
6179 | case Intrinsic::aarch64_sme_ld1w_horiz: |
6180 | case Intrinsic::aarch64_sme_ld1d_horiz: |
6181 | case Intrinsic::aarch64_sme_ld1q_horiz: |
6182 | case Intrinsic::aarch64_sme_st1b_horiz: |
6183 | case Intrinsic::aarch64_sme_st1h_horiz: |
6184 | case Intrinsic::aarch64_sme_st1w_horiz: |
6185 | case Intrinsic::aarch64_sme_st1d_horiz: |
6186 | case Intrinsic::aarch64_sme_st1q_horiz: { |
6187 | auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 3)); |
6188 | if (!Idx || Idx->getOpcode() != Instruction::Add) |
6189 | return false; |
6190 | Ops.push_back(Elt: &II->getOperandUse(i: 3)); |
6191 | return true; |
6192 | } |
6193 | case Intrinsic::aarch64_neon_pmull: |
6194 | if (!areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1))) |
6195 | return false; |
6196 | Ops.push_back(Elt: &II->getOperandUse(i: 0)); |
6197 | Ops.push_back(Elt: &II->getOperandUse(i: 1)); |
6198 | return true; |
6199 | case Intrinsic::aarch64_neon_pmull64: |
6200 | if (!areOperandsOfVmullHighP64(Op1: II->getArgOperand(i: 0), |
6201 | Op2: II->getArgOperand(i: 1))) |
6202 | return false; |
6203 | Ops.push_back(Elt: &II->getArgOperandUse(i: 0)); |
6204 | Ops.push_back(Elt: &II->getArgOperandUse(i: 1)); |
6205 | return true; |
6206 | case Intrinsic::masked_gather: |
6207 | if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 0), Ops)) |
6208 | return false; |
6209 | Ops.push_back(Elt: &II->getArgOperandUse(i: 0)); |
6210 | return true; |
6211 | case Intrinsic::masked_scatter: |
6212 | if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 1), Ops)) |
6213 | return false; |
6214 | Ops.push_back(Elt: &II->getArgOperandUse(i: 1)); |
6215 | return true; |
6216 | default: |
6217 | return false; |
6218 | } |
6219 | } |
6220 | |
6221 | auto ShouldSinkCondition = [](Value *Cond) -> bool { |
6222 | auto *II = dyn_cast<IntrinsicInst>(Val: Cond); |
6223 | return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or && |
6224 | isa<ScalableVectorType>(Val: II->getOperand(i_nocapture: 0)->getType()); |
6225 | }; |
6226 | |
6227 | switch (I->getOpcode()) { |
6228 | case Instruction::GetElementPtr: |
6229 | case Instruction::Add: |
6230 | case Instruction::Sub: |
6231 | // Sink vscales closer to uses for better isel |
6232 | for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) { |
6233 | if (shouldSinkVScale(Op: I->getOperand(i: Op), Ops)) { |
6234 | Ops.push_back(Elt: &I->getOperandUse(i: Op)); |
6235 | return true; |
6236 | } |
6237 | } |
6238 | break; |
6239 | case Instruction::Select: { |
6240 | if (!ShouldSinkCondition(I->getOperand(i: 0))) |
6241 | return false; |
6242 | |
6243 | Ops.push_back(Elt: &I->getOperandUse(i: 0)); |
6244 | return true; |
6245 | } |
6246 | case Instruction::Br: { |
6247 | if (cast<BranchInst>(Val: I)->isUnconditional()) |
6248 | return false; |
6249 | |
6250 | if (!ShouldSinkCondition(cast<BranchInst>(Val: I)->getCondition())) |
6251 | return false; |
6252 | |
6253 | Ops.push_back(Elt: &I->getOperandUse(i: 0)); |
6254 | return true; |
6255 | } |
6256 | default: |
6257 | break; |
6258 | } |
6259 | |
6260 | if (!I->getType()->isVectorTy()) |
6261 | return false; |
6262 | |
6263 | switch (I->getOpcode()) { |
6264 | case Instruction::Sub: |
6265 | case Instruction::Add: { |
6266 | if (!areExtractExts(Ext1: I->getOperand(i: 0), Ext2: I->getOperand(i: 1))) |
6267 | return false; |
6268 | |
6269 | // If the exts' operands extract either the lower or upper elements, we |
6270 | // can sink them too. |
6271 | auto Ext1 = cast<Instruction>(Val: I->getOperand(i: 0)); |
6272 | auto Ext2 = cast<Instruction>(Val: I->getOperand(i: 1)); |
6273 | if (areExtractShuffleVectors(Op1: Ext1->getOperand(i: 0), Op2: Ext2->getOperand(i: 0))) { |
6274 | Ops.push_back(Elt: &Ext1->getOperandUse(i: 0)); |
6275 | Ops.push_back(Elt: &Ext2->getOperandUse(i: 0)); |
6276 | } |
6277 | |
6278 | Ops.push_back(Elt: &I->getOperandUse(i: 0)); |
6279 | Ops.push_back(Elt: &I->getOperandUse(i: 1)); |
6280 | |
6281 | return true; |
6282 | } |
6283 | case Instruction::Or: { |
6284 | // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) -> |
6285 | // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1) |
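// Sinking both Ands (and the non-mask operand of the "main" And) next to the
// Or lets the whole pattern be selected as a single BSL-style bitwise select
// rather than separate AND/BIC/ORR instructions.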
6286 | if (ST->hasNEON()) { |
6287 | Instruction *OtherAnd, *IA, *IB; |
6288 | Value *MaskValue; |
6289 | // MainAnd refers to the And instruction that has 'Not' as one of its operands.
6290 | if (match(V: I, P: m_c_Or(L: m_OneUse(SubPattern: m_Instruction(I&: OtherAnd)), |
6291 | R: m_OneUse(SubPattern: m_c_And(L: m_OneUse(SubPattern: m_Not(V: m_Value(V&: MaskValue))), |
6292 | R: m_Instruction(I&: IA)))))) { |
6293 | if (match(V: OtherAnd, |
6294 | P: m_c_And(L: m_Specific(V: MaskValue), R: m_Instruction(I&: IB)))) { |
6295 | Instruction *MainAnd = I->getOperand(i: 0) == OtherAnd |
6296 | ? cast<Instruction>(Val: I->getOperand(i: 1)) |
6297 | : cast<Instruction>(Val: I->getOperand(i: 0)); |
6298 | |
6299 | // Both Ands should be in the same basic block as the Or.
6300 | if (I->getParent() != MainAnd->getParent() || |
6301 | I->getParent() != OtherAnd->getParent()) |
6302 | return false; |
6303 | |
6304 | // Non-mask operands of both Ands should also be in the same basic block.
6305 | if (I->getParent() != IA->getParent() || |
6306 | I->getParent() != IB->getParent()) |
6307 | return false; |
6308 | |
6309 | Ops.push_back( |
6310 | Elt: &MainAnd->getOperandUse(i: MainAnd->getOperand(i: 0) == IA ? 1 : 0)); |
6311 | Ops.push_back(Elt: &I->getOperandUse(i: 0)); |
6312 | Ops.push_back(Elt: &I->getOperandUse(i: 1)); |
6313 | |
6314 | return true; |
6315 | } |
6316 | } |
6317 | } |
6318 | |
6319 | return false; |
6320 | } |
6321 | case Instruction::Mul: { |
6322 | auto ShouldSinkSplatForIndexedVariant = [](Value *V) { |
6323 | auto *Ty = cast<VectorType>(Val: V->getType()); |
6324 | // For SVE the lane-indexing is within 128-bits, so we can't fold splats. |
6325 | if (Ty->isScalableTy()) |
6326 | return false; |
6327 | |
6328 | // Indexed variants of Mul exist for i16 and i32 element types only. |
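// (NEON has by-element multiplies for .4h/.8h/.2s/.4s, but no i8 or i64
// by-element form.)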
6329 | return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32; |
6330 | }; |
6331 | |
6332 | int NumZExts = 0, NumSExts = 0; |
6333 | for (auto &Op : I->operands()) { |
6334 | // Make sure we are not already sinking this operand |
6335 | if (any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op; })) |
6336 | continue; |
6337 | |
6338 | if (match(V: &Op, P: m_ZExtOrSExt(Op: m_Value()))) { |
6339 | auto *Ext = cast<Instruction>(Val&: Op); |
6340 | auto *ExtOp = Ext->getOperand(i: 0); |
6341 | if (isSplatShuffle(V: ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp)) |
6342 | Ops.push_back(Elt: &Ext->getOperandUse(i: 0)); |
6343 | Ops.push_back(Elt: &Op); |
6344 | |
6345 | if (isa<SExtInst>(Val: Ext)) |
6346 | NumSExts++; |
6347 | else |
6348 | NumZExts++; |
6349 | |
6350 | continue; |
6351 | } |
6352 | |
6353 | ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Val&: Op); |
6354 | if (!Shuffle) |
6355 | continue; |
6356 | |
6357 | // If the Shuffle is a splat and the operand is a zext/sext, sinking the |
6358 | // operand and the s/zext can help create indexed s/umull. This is |
6359 | // especially useful to prevent i64 mul being scalarized. |
6360 | if (isSplatShuffle(V: Shuffle) && |
6361 | match(V: Shuffle->getOperand(i_nocapture: 0), P: m_ZExtOrSExt(Op: m_Value()))) { |
6362 | Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0)); |
6363 | Ops.push_back(Elt: &Op); |
6364 | if (match(V: Shuffle->getOperand(i_nocapture: 0), P: m_SExt(Op: m_Value()))) |
6365 | NumSExts++; |
6366 | else |
6367 | NumZExts++; |
6368 | continue; |
6369 | } |
6370 | |
6371 | Value *ShuffleOperand = Shuffle->getOperand(i_nocapture: 0); |
6372 | InsertElementInst *Insert = dyn_cast<InsertElementInst>(Val: ShuffleOperand); |
6373 | if (!Insert) |
6374 | continue; |
6375 | |
6376 | Instruction *OperandInstr = dyn_cast<Instruction>(Val: Insert->getOperand(i_nocapture: 1)); |
6377 | if (!OperandInstr) |
6378 | continue; |
6379 | |
6380 | ConstantInt *ElementConstant = |
6381 | dyn_cast<ConstantInt>(Val: Insert->getOperand(i_nocapture: 2)); |
6382 | // Check that the insertelement is inserting into element 0 |
6383 | if (!ElementConstant || !ElementConstant->isZero()) |
6384 | continue; |
6385 | |
6386 | unsigned Opcode = OperandInstr->getOpcode(); |
6387 | if (Opcode == Instruction::SExt) |
6388 | NumSExts++; |
6389 | else if (Opcode == Instruction::ZExt) |
6390 | NumZExts++; |
6391 | else { |
6392 | // If we find that the top bits are known 0, then we can sink and allow |
6393 | // the backend to generate a umull. |
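// For example, a v2i64 mul whose operand is known to fit in 32 bits can
// still be selected as a umull even though there is no explicit zext.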
6394 | unsigned Bitwidth = I->getType()->getScalarSizeInBits(); |
6395 | APInt UpperMask = APInt::getHighBitsSet(numBits: Bitwidth, hiBitsSet: Bitwidth / 2); |
6396 | if (!MaskedValueIsZero(V: OperandInstr, Mask: UpperMask, SQ: DL)) |
6397 | continue; |
6398 | NumZExts++; |
6399 | } |
6400 | |
6401 | // And(Load) is excluded to prevent CGP from getting stuck in a loop of
6402 | // sinking the And, only to hoist it back again next to the load.
6403 | if (!match(V: OperandInstr, P: m_And(L: m_Load(Op: m_Value()), R: m_Value()))) |
6404 | Ops.push_back(Elt: &Insert->getOperandUse(i: 1)); |
6405 | Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0)); |
6406 | Ops.push_back(Elt: &Op); |
6407 | } |
6408 | |
6409 | // It is profitable to sink if we found two of the same type of extends. |
6410 | if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2)) |
6411 | return true; |
6412 | |
6413 | // Otherwise, see if we should sink splats for indexed variants. |
6414 | if (!ShouldSinkSplatForIndexedVariant(I)) |
6415 | return false; |
6416 | |
6417 | Ops.clear(); |
6418 | if (isSplatShuffle(V: I->getOperand(i: 0))) |
6419 | Ops.push_back(Elt: &I->getOperandUse(i: 0)); |
6420 | if (isSplatShuffle(V: I->getOperand(i: 1))) |
6421 | Ops.push_back(Elt: &I->getOperandUse(i: 1)); |
6422 | |
6423 | return !Ops.empty(); |
6424 | } |
6425 | case Instruction::FMul: { |
6426 | // For SVE the lane-indexing is within 128-bits, so we can't fold splats. |
6427 | if (I->getType()->isScalableTy()) |
6428 | return false; |
6429 | |
6430 | if (cast<VectorType>(Val: I->getType())->getElementType()->isHalfTy() && |
6431 | !ST->hasFullFP16()) |
6432 | return false; |
6433 | |
6434 | // Sink splats for index lane variants |
6435 | if (isSplatShuffle(V: I->getOperand(i: 0))) |
6436 | Ops.push_back(Elt: &I->getOperandUse(i: 0)); |
6437 | if (isSplatShuffle(V: I->getOperand(i: 1))) |
6438 | Ops.push_back(Elt: &I->getOperandUse(i: 1)); |
6439 | return !Ops.empty(); |
6440 | } |
6441 | default: |
6442 | return false; |
6443 | } |
6444 | return false; |
6445 | } |
6446 | |