1 | //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "AArch64TargetTransformInfo.h" |
10 | #include "AArch64ExpandImm.h" |
11 | #include "AArch64PerfectShuffle.h" |
12 | #include "MCTargetDesc/AArch64AddressingModes.h" |
13 | #include "llvm/Analysis/IVDescriptors.h" |
14 | #include "llvm/Analysis/LoopInfo.h" |
15 | #include "llvm/Analysis/TargetTransformInfo.h" |
16 | #include "llvm/CodeGen/BasicTTIImpl.h" |
17 | #include "llvm/CodeGen/CostTable.h" |
18 | #include "llvm/CodeGen/TargetLowering.h" |
19 | #include "llvm/IR/IntrinsicInst.h" |
20 | #include "llvm/IR/Intrinsics.h" |
21 | #include "llvm/IR/IntrinsicsAArch64.h" |
22 | #include "llvm/IR/PatternMatch.h" |
23 | #include "llvm/Support/Debug.h" |
24 | #include "llvm/Transforms/InstCombine/InstCombiner.h" |
25 | #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" |
26 | #include <algorithm> |
27 | #include <optional> |
28 | using namespace llvm; |
29 | using namespace llvm::PatternMatch; |
30 | |
31 | #define DEBUG_TYPE "aarch64tti" |
32 | |
33 | static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix" , |
34 | cl::init(Val: true), cl::Hidden); |
35 | |
36 | static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead" , cl::init(Val: 10), |
37 | cl::Hidden); |
38 | |
39 | static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead" , |
40 | cl::init(Val: 10), cl::Hidden); |
41 | |
42 | static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold" , |
43 | cl::init(Val: 15), cl::Hidden); |
44 | |
45 | static cl::opt<unsigned> |
46 | NeonNonConstStrideOverhead("neon-nonconst-stride-overhead" , cl::init(Val: 10), |
47 | cl::Hidden); |
48 | |
49 | static cl::opt<unsigned> CallPenaltyChangeSM( |
50 | "call-penalty-sm-change" , cl::init(Val: 5), cl::Hidden, |
51 | cl::desc( |
52 | "Penalty of calling a function that requires a change to PSTATE.SM" )); |
53 | |
54 | static cl::opt<unsigned> InlineCallPenaltyChangeSM( |
55 | "inline-call-penalty-sm-change" , cl::init(Val: 10), cl::Hidden, |
56 | cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM" )); |
57 | |
58 | static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select" , |
59 | cl::init(Val: true), cl::Hidden); |
60 | |
61 | static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt" , |
62 | cl::init(Val: true), cl::Hidden); |
63 | |
64 | // A complete guess as to a reasonable cost. |
65 | static cl::opt<unsigned> |
66 | BaseHistCntCost("aarch64-base-histcnt-cost" , cl::init(Val: 8), cl::Hidden, |
67 | cl::desc("The cost of a histcnt instruction" )); |
68 | |
69 | namespace { |
70 | class TailFoldingOption { |
71 | // These bitfields will only ever be set to something non-zero in operator=, |
// when setting the -sve-tail-folding option. This option should always be of
// the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where
// InitialBits is one of (disabled|all|simple). EnableBits represents
75 | // additional flags we're enabling, and DisableBits for those flags we're |
76 | // disabling. The default flag is tracked in the variable NeedsDefault, since |
77 | // at the time of setting the option we may not know what the default value |
78 | // for the CPU is. |
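// For example (illustrative of the parsing in operator= below),
// "-sve-tail-folding=all+noreverse" sets InitialBits to TailFoldingOpts::All
// and records TailFoldingOpts::Reverse in DisableBits.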
79 | TailFoldingOpts InitialBits = TailFoldingOpts::Disabled; |
80 | TailFoldingOpts EnableBits = TailFoldingOpts::Disabled; |
81 | TailFoldingOpts DisableBits = TailFoldingOpts::Disabled; |
82 | |
83 | // This value needs to be initialised to true in case the user does not |
84 | // explicitly set the -sve-tail-folding option. |
85 | bool NeedsDefault = true; |
86 | |
87 | void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; } |
88 | |
89 | void setNeedsDefault(bool V) { NeedsDefault = V; } |
90 | |
91 | void setEnableBit(TailFoldingOpts Bit) { |
92 | EnableBits |= Bit; |
93 | DisableBits &= ~Bit; |
94 | } |
95 | |
96 | void setDisableBit(TailFoldingOpts Bit) { |
97 | EnableBits &= ~Bit; |
98 | DisableBits |= Bit; |
99 | } |
100 | |
101 | TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const { |
102 | TailFoldingOpts Bits = TailFoldingOpts::Disabled; |
103 | |
104 | assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) && |
105 | "Initial bits should only include one of " |
106 | "(disabled|all|simple|default)" ); |
107 | Bits = NeedsDefault ? DefaultBits : InitialBits; |
108 | Bits |= EnableBits; |
109 | Bits &= ~DisableBits; |
110 | |
111 | return Bits; |
112 | } |
113 | |
114 | void reportError(std::string Opt) { |
115 | errs() << "invalid argument '" << Opt |
116 | << "' to -sve-tail-folding=; the option should be of the form\n" |
117 | " (disabled|all|default|simple)[+(reductions|recurrences" |
118 | "|reverse|noreductions|norecurrences|noreverse)]\n" ; |
119 | report_fatal_error(reason: "Unrecognised tail-folding option" ); |
120 | } |
121 | |
122 | public: |
123 | |
124 | void operator=(const std::string &Val) { |
125 | // If the user explicitly sets -sve-tail-folding= then treat as an error. |
126 | if (Val.empty()) { |
127 | reportError(Opt: "" ); |
128 | return; |
129 | } |
130 | |
131 | // Since the user is explicitly setting the option we don't automatically |
132 | // need the default unless they require it. |
133 | setNeedsDefault(false); |
134 | |
135 | SmallVector<StringRef, 4> TailFoldTypes; |
136 | StringRef(Val).split(A&: TailFoldTypes, Separator: '+', MaxSplit: -1, KeepEmpty: false); |
137 | |
138 | unsigned StartIdx = 1; |
139 | if (TailFoldTypes[0] == "disabled" ) |
140 | setInitialBits(TailFoldingOpts::Disabled); |
141 | else if (TailFoldTypes[0] == "all" ) |
142 | setInitialBits(TailFoldingOpts::All); |
143 | else if (TailFoldTypes[0] == "default" ) |
144 | setNeedsDefault(true); |
145 | else if (TailFoldTypes[0] == "simple" ) |
146 | setInitialBits(TailFoldingOpts::Simple); |
147 | else { |
148 | StartIdx = 0; |
149 | setInitialBits(TailFoldingOpts::Disabled); |
150 | } |
151 | |
152 | for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) { |
153 | if (TailFoldTypes[I] == "reductions" ) |
154 | setEnableBit(TailFoldingOpts::Reductions); |
155 | else if (TailFoldTypes[I] == "recurrences" ) |
156 | setEnableBit(TailFoldingOpts::Recurrences); |
157 | else if (TailFoldTypes[I] == "reverse" ) |
158 | setEnableBit(TailFoldingOpts::Reverse); |
159 | else if (TailFoldTypes[I] == "noreductions" ) |
160 | setDisableBit(TailFoldingOpts::Reductions); |
161 | else if (TailFoldTypes[I] == "norecurrences" ) |
162 | setDisableBit(TailFoldingOpts::Recurrences); |
163 | else if (TailFoldTypes[I] == "noreverse" ) |
164 | setDisableBit(TailFoldingOpts::Reverse); |
165 | else |
166 | reportError(Opt: Val); |
167 | } |
168 | } |
169 | |
170 | bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const { |
171 | return (getBits(DefaultBits) & Required) == Required; |
172 | } |
173 | }; |
174 | } // namespace |
175 | |
176 | TailFoldingOption TailFoldingOptionLoc; |
177 | |
178 | cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding( |
179 | "sve-tail-folding" , |
180 | cl::desc( |
181 | "Control the use of vectorisation using tail-folding for SVE where the" |
182 | " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" |
183 | "\ndisabled (Initial) No loop types will vectorize using " |
184 | "tail-folding" |
185 | "\ndefault (Initial) Uses the default tail-folding settings for " |
186 | "the target CPU" |
187 | "\nall (Initial) All legal loop types will vectorize using " |
188 | "tail-folding" |
189 | "\nsimple (Initial) Use tail-folding for simple loops (not " |
190 | "reductions or recurrences)" |
191 | "\nreductions Use tail-folding for loops containing reductions" |
192 | "\nnoreductions Inverse of above" |
193 | "\nrecurrences Use tail-folding for loops containing fixed order " |
194 | "recurrences" |
195 | "\nnorecurrences Inverse of above" |
196 | "\nreverse Use tail-folding for loops requiring reversed " |
197 | "predicates" |
198 | "\nnoreverse Inverse of above" ), |
199 | cl::location(L&: TailFoldingOptionLoc)); |
200 | |
201 | // Experimental option that will only be fully functional when the |
202 | // code-generator is changed to use SVE instead of NEON for all fixed-width |
203 | // operations. |
204 | static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode( |
205 | "enable-fixedwidth-autovec-in-streaming-mode" , cl::init(Val: false), cl::Hidden); |
206 | |
207 | // Experimental option that will only be fully functional when the cost-model |
208 | // and code-generator have been changed to avoid using scalable vector |
209 | // instructions that are not legal in streaming SVE mode. |
210 | static cl::opt<bool> EnableScalableAutovecInStreamingMode( |
211 | "enable-scalable-autovec-in-streaming-mode" , cl::init(Val: false), cl::Hidden); |
212 | |
213 | static bool isSMEABIRoutineCall(const CallInst &CI) { |
214 | const auto *F = CI.getCalledFunction(); |
215 | return F && StringSwitch<bool>(F->getName()) |
216 | .Case(S: "__arm_sme_state" , Value: true) |
217 | .Case(S: "__arm_tpidr2_save" , Value: true) |
218 | .Case(S: "__arm_tpidr2_restore" , Value: true) |
219 | .Case(S: "__arm_za_disable" , Value: true) |
220 | .Default(Value: false); |
221 | } |
222 | |
223 | /// Returns true if the function has explicit operations that can only be |
224 | /// lowered using incompatible instructions for the selected mode. This also |
225 | /// returns true if the function F may use or modify ZA state. |
226 | static bool hasPossibleIncompatibleOps(const Function *F) { |
227 | for (const BasicBlock &BB : *F) { |
228 | for (const Instruction &I : BB) { |
229 | // Be conservative for now and assume that any call to inline asm or to |
// intrinsics could result in non-streaming ops (e.g. calls to
231 | // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that |
232 | // all native LLVM instructions can be lowered to compatible instructions. |
233 | if (isa<CallInst>(Val: I) && !I.isDebugOrPseudoInst() && |
234 | (cast<CallInst>(Val: I).isInlineAsm() || isa<IntrinsicInst>(Val: I) || |
235 | isSMEABIRoutineCall(CI: cast<CallInst>(Val: I)))) |
236 | return true; |
237 | } |
238 | } |
239 | return false; |
240 | } |
241 | |
242 | bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, |
243 | const Function *Callee) const { |
244 | SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee); |
245 | |
246 | // When inlining, we should consider the body of the function, not the |
247 | // interface. |
248 | if (CalleeAttrs.hasStreamingBody()) { |
249 | CalleeAttrs.set(M: SMEAttrs::SM_Compatible, Enable: false); |
250 | CalleeAttrs.set(M: SMEAttrs::SM_Enabled, Enable: true); |
251 | } |
252 | |
253 | if (CalleeAttrs.isNewZA()) |
254 | return false; |
255 | |
256 | if (CallerAttrs.requiresLazySave(Callee: CalleeAttrs) || |
257 | CallerAttrs.requiresSMChange(Callee: CalleeAttrs) || |
258 | CallerAttrs.requiresPreservingZT0(Callee: CalleeAttrs)) { |
259 | if (hasPossibleIncompatibleOps(F: Callee)) |
260 | return false; |
261 | } |
262 | |
263 | const TargetMachine &TM = getTLI()->getTargetMachine(); |
264 | |
265 | const FeatureBitset &CallerBits = |
266 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); |
267 | const FeatureBitset &CalleeBits = |
268 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); |
269 | |
// Inline a callee if its target-features are a subset of the caller's
271 | // target-features. |
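// For example (illustrative), a callee built with +sve2 is not considered
// inline-compatible with a caller that only enables +sve.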
272 | return (CallerBits & CalleeBits) == CalleeBits; |
273 | } |
274 | |
275 | bool AArch64TTIImpl::areTypesABICompatible( |
276 | const Function *Caller, const Function *Callee, |
277 | const ArrayRef<Type *> &Types) const { |
278 | if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) |
279 | return false; |
280 | |
281 | // We need to ensure that argument promotion does not attempt to promote |
282 | // pointers to fixed-length vector types larger than 128 bits like |
283 | // <8 x float> (and pointers to aggregate types which have such fixed-length |
284 | // vector type members) into the values of the pointees. Such vector types |
285 | // are used for SVE VLS but there is no ABI for SVE VLS arguments and the |
286 | // backend cannot lower such value arguments. The 128-bit fixed-length SVE |
287 | // types can be safely treated as 128-bit NEON types and they cannot be |
288 | // distinguished in IR. |
289 | if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Range: Types, P: [](Type *Ty) { |
290 | auto FVTy = dyn_cast<FixedVectorType>(Val: Ty); |
291 | return FVTy && |
292 | FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128; |
293 | })) |
294 | return false; |
295 | |
296 | return true; |
297 | } |
298 | |
299 | unsigned |
300 | AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call, |
301 | unsigned DefaultCallPenalty) const { |
302 | // This function calculates a penalty for executing Call in F. |
303 | // |
304 | // There are two ways this function can be called: |
305 | // (1) F: |
306 | // call from F -> G (the call here is Call) |
307 | // |
308 | // For (1), Call.getCaller() == F, so it will always return a high cost if |
309 | // a streaming-mode change is required (thus promoting the need to inline the |
310 | // function) |
311 | // |
312 | // (2) F: |
313 | // call from F -> G (the call here is not Call) |
314 | // G: |
315 | // call from G -> H (the call here is Call) |
316 | // |
317 | // For (2), if after inlining the body of G into F the call to H requires a |
318 | // streaming-mode change, and the call to G from F would also require a |
319 | // streaming-mode change, then there is benefit to do the streaming-mode |
320 | // change only once and avoid inlining of G into F. |
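// Illustrative: with the default option values above, case (1) scales
// DefaultCallPenalty by CallPenaltyChangeSM (5) and case (2) by
// InlineCallPenaltyChangeSM (10).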
321 | SMEAttrs FAttrs(*F); |
322 | SMEAttrs CalleeAttrs(Call); |
323 | if (FAttrs.requiresSMChange(Callee: CalleeAttrs)) { |
324 | if (F == Call.getCaller()) // (1) |
325 | return CallPenaltyChangeSM * DefaultCallPenalty; |
326 | if (FAttrs.requiresSMChange(Callee: SMEAttrs(*Call.getCaller()))) // (2) |
327 | return InlineCallPenaltyChangeSM * DefaultCallPenalty; |
328 | } |
329 | |
330 | return DefaultCallPenalty; |
331 | } |
332 | |
333 | bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( |
334 | TargetTransformInfo::RegisterKind K) const { |
335 | assert(K != TargetTransformInfo::RGK_Scalar); |
336 | return (K == TargetTransformInfo::RGK_FixedWidthVector && |
337 | ST->isNeonAvailable()); |
338 | } |
339 | |
340 | /// Calculate the cost of materializing a 64-bit value. This helper |
341 | /// method might only calculate a fraction of a larger immediate. Therefore it |
342 | /// is valid to return a cost of ZERO. |
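/// For example (illustrative): 0x0000000000AB0000 needs a single MOVZ
/// (cost 1), while 0x123456789ABCDEF0 expands to a MOVZ plus three MOVKs
/// (cost 4).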
343 | InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) { |
344 | // Check if the immediate can be encoded within an instruction. |
345 | if (Val == 0 || AArch64_AM::isLogicalImmediate(imm: Val, regSize: 64)) |
346 | return 0; |
347 | |
348 | if (Val < 0) |
349 | Val = ~Val; |
350 | |
351 | // Calculate how many moves we will need to materialize this constant. |
352 | SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; |
353 | AArch64_IMM::expandMOVImm(Imm: Val, BitSize: 64, Insn); |
354 | return Insn.size(); |
355 | } |
356 | |
357 | /// Calculate the cost of materializing the given constant. |
358 | InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, |
359 | TTI::TargetCostKind CostKind) { |
360 | assert(Ty->isIntegerTy()); |
361 | |
362 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
363 | if (BitSize == 0) |
364 | return ~0U; |
365 | |
// Sign-extend all constants to a multiple of 64 bits.
367 | APInt ImmVal = Imm; |
368 | if (BitSize & 0x3f) |
369 | ImmVal = Imm.sext(width: (BitSize + 63) & ~0x3fU); |
370 | |
371 | // Split the constant into 64-bit chunks and calculate the cost for each |
372 | // chunk. |
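// For example (illustrative), an i128 immediate is costed as two independent
// 64-bit materializations, subject to the minimum total cost of 1 below.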
373 | InstructionCost Cost = 0; |
374 | for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { |
375 | APInt Tmp = ImmVal.ashr(ShiftAmt: ShiftVal).sextOrTrunc(width: 64); |
376 | int64_t Val = Tmp.getSExtValue(); |
377 | Cost += getIntImmCost(Val); |
378 | } |
// We need at least one instruction to materialize the constant.
380 | return std::max<InstructionCost>(a: 1, b: Cost); |
381 | } |
382 | |
383 | InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, |
384 | const APInt &Imm, Type *Ty, |
385 | TTI::TargetCostKind CostKind, |
386 | Instruction *Inst) { |
387 | assert(Ty->isIntegerTy()); |
388 | |
389 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
390 | // There is no cost model for constants with a bit size of 0. Return TCC_Free |
391 | // here, so that constant hoisting will ignore this constant. |
392 | if (BitSize == 0) |
393 | return TTI::TCC_Free; |
394 | |
395 | unsigned ImmIdx = ~0U; |
396 | switch (Opcode) { |
397 | default: |
398 | return TTI::TCC_Free; |
399 | case Instruction::GetElementPtr: |
400 | // Always hoist the base address of a GetElementPtr. |
401 | if (Idx == 0) |
402 | return 2 * TTI::TCC_Basic; |
403 | return TTI::TCC_Free; |
404 | case Instruction::Store: |
405 | ImmIdx = 0; |
406 | break; |
407 | case Instruction::Add: |
408 | case Instruction::Sub: |
409 | case Instruction::Mul: |
410 | case Instruction::UDiv: |
411 | case Instruction::SDiv: |
412 | case Instruction::URem: |
413 | case Instruction::SRem: |
414 | case Instruction::And: |
415 | case Instruction::Or: |
416 | case Instruction::Xor: |
417 | case Instruction::ICmp: |
418 | ImmIdx = 1; |
419 | break; |
420 | // Always return TCC_Free for the shift value of a shift instruction. |
421 | case Instruction::Shl: |
422 | case Instruction::LShr: |
423 | case Instruction::AShr: |
424 | if (Idx == 1) |
425 | return TTI::TCC_Free; |
426 | break; |
427 | case Instruction::Trunc: |
428 | case Instruction::ZExt: |
429 | case Instruction::SExt: |
430 | case Instruction::IntToPtr: |
431 | case Instruction::PtrToInt: |
432 | case Instruction::BitCast: |
433 | case Instruction::PHI: |
434 | case Instruction::Call: |
435 | case Instruction::Select: |
436 | case Instruction::Ret: |
437 | case Instruction::Load: |
438 | break; |
439 | } |
440 | |
441 | if (Idx == ImmIdx) { |
442 | int NumConstants = (BitSize + 63) / 64; |
443 | InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
444 | return (Cost <= NumConstants * TTI::TCC_Basic) |
445 | ? static_cast<int>(TTI::TCC_Free) |
446 | : Cost; |
447 | } |
448 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
449 | } |
450 | |
451 | InstructionCost |
452 | AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, |
453 | const APInt &Imm, Type *Ty, |
454 | TTI::TargetCostKind CostKind) { |
455 | assert(Ty->isIntegerTy()); |
456 | |
457 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
458 | // There is no cost model for constants with a bit size of 0. Return TCC_Free |
459 | // here, so that constant hoisting will ignore this constant. |
460 | if (BitSize == 0) |
461 | return TTI::TCC_Free; |
462 | |
463 | // Most (all?) AArch64 intrinsics do not support folding immediates into the |
464 | // selected instruction, so we compute the materialization cost for the |
465 | // immediate directly. |
466 | if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv) |
467 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
468 | |
469 | switch (IID) { |
470 | default: |
471 | return TTI::TCC_Free; |
472 | case Intrinsic::sadd_with_overflow: |
473 | case Intrinsic::uadd_with_overflow: |
474 | case Intrinsic::ssub_with_overflow: |
475 | case Intrinsic::usub_with_overflow: |
476 | case Intrinsic::smul_with_overflow: |
477 | case Intrinsic::umul_with_overflow: |
478 | if (Idx == 1) { |
479 | int NumConstants = (BitSize + 63) / 64; |
480 | InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
481 | return (Cost <= NumConstants * TTI::TCC_Basic) |
482 | ? static_cast<int>(TTI::TCC_Free) |
483 | : Cost; |
484 | } |
485 | break; |
486 | case Intrinsic::experimental_stackmap: |
487 | if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue()))) |
488 | return TTI::TCC_Free; |
489 | break; |
490 | case Intrinsic::experimental_patchpoint_void: |
491 | case Intrinsic::experimental_patchpoint: |
492 | if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue()))) |
493 | return TTI::TCC_Free; |
494 | break; |
495 | case Intrinsic::experimental_gc_statepoint: |
496 | if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue()))) |
497 | return TTI::TCC_Free; |
498 | break; |
499 | } |
500 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
501 | } |
502 | |
503 | TargetTransformInfo::PopcntSupportKind |
504 | AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { |
505 | assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2" ); |
506 | if (TyWidth == 32 || TyWidth == 64) |
507 | return TTI::PSK_FastHardware; |
508 | // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount. |
509 | return TTI::PSK_Software; |
510 | } |
511 | |
512 | static bool isUnpackedVectorVT(EVT VecVT) { |
513 | return VecVT.isScalableVector() && |
514 | VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock; |
515 | } |
516 | |
517 | static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) { |
518 | Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers |
519 | Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements |
520 | |
521 | // Only allow (32b and 64b) integers or pointers for now... |
522 | if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || |
523 | (EltTy->getScalarSizeInBits() != 32 && |
524 | EltTy->getScalarSizeInBits() != 64)) |
525 | return InstructionCost::getInvalid(); |
526 | |
527 | // FIXME: Hacky check for legal vector types. We can promote smaller types |
528 | // but we cannot legalize vectors via splitting for histcnt. |
529 | // FIXME: We should be able to generate histcnt for fixed-length vectors |
530 | // using ptrue with a specific VL. |
531 | if (VectorType *VTy = dyn_cast<VectorType>(Val: BucketPtrsTy)) |
532 | if ((VTy->getElementCount().getKnownMinValue() != 2 && |
533 | VTy->getElementCount().getKnownMinValue() != 4) || |
534 | VTy->getPrimitiveSizeInBits().getKnownMinValue() > 128 || |
535 | !VTy->isScalableTy()) |
536 | return InstructionCost::getInvalid(); |
537 | |
538 | return InstructionCost(BaseHistCntCost); |
539 | } |
540 | |
541 | InstructionCost |
542 | AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
543 | TTI::TargetCostKind CostKind) { |
544 | // The code-generator is currently not able to handle scalable vectors |
545 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
546 | // it. This change will be removed when code-generation for these types is |
547 | // sufficiently reliable. |
548 | auto *RetTy = ICA.getReturnType(); |
549 | if (auto *VTy = dyn_cast<ScalableVectorType>(Val: RetTy)) |
550 | if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
551 | return InstructionCost::getInvalid(); |
552 | |
553 | switch (ICA.getID()) { |
554 | case Intrinsic::experimental_vector_histogram_add: |
555 | if (!ST->hasSVE2()) |
556 | return InstructionCost::getInvalid(); |
557 | return getHistogramCost(ICA); |
558 | case Intrinsic::umin: |
559 | case Intrinsic::umax: |
560 | case Intrinsic::smin: |
561 | case Intrinsic::smax: { |
562 | static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, |
563 | MVT::v8i16, MVT::v2i32, MVT::v4i32, |
564 | MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, |
565 | MVT::nxv2i64}; |
566 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
567 | // v2i64 types get converted to cmp+bif hence the cost of 2 |
568 | if (LT.second == MVT::v2i64) |
569 | return LT.first * 2; |
if (any_of(Range: ValidMinMaxTys, P: [&LT](MVT M) { return M == LT.second; }))
571 | return LT.first; |
572 | break; |
573 | } |
574 | case Intrinsic::sadd_sat: |
575 | case Intrinsic::ssub_sat: |
576 | case Intrinsic::uadd_sat: |
577 | case Intrinsic::usub_sat: { |
578 | static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, |
579 | MVT::v8i16, MVT::v2i32, MVT::v4i32, |
580 | MVT::v2i64}; |
581 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
582 | // This is a base cost of 1 for the vadd, plus 3 extract shifts if we |
583 | // need to extend the type, as it uses shr(qadd(shl, shl)). |
584 | unsigned Instrs = |
585 | LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4; |
if (any_of(Range: ValidSatTys, P: [&LT](MVT M) { return M == LT.second; }))
587 | return LT.first * Instrs; |
588 | break; |
589 | } |
590 | case Intrinsic::abs: { |
591 | static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, |
592 | MVT::v8i16, MVT::v2i32, MVT::v4i32, |
593 | MVT::v2i64}; |
594 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
if (any_of(Range: ValidAbsTys, P: [&LT](MVT M) { return M == LT.second; }))
596 | return LT.first; |
597 | break; |
598 | } |
599 | case Intrinsic::bswap: { |
600 | static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32, |
601 | MVT::v4i32, MVT::v2i64}; |
602 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
if (any_of(Range: ValidAbsTys, P: [&LT](MVT M) { return M == LT.second; }) &&
604 | LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits()) |
605 | return LT.first; |
606 | break; |
607 | } |
608 | case Intrinsic::experimental_stepvector: { |
609 | InstructionCost Cost = 1; // Cost of the `index' instruction |
610 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
611 | // Legalisation of illegal vectors involves an `index' instruction plus |
612 | // (LT.first - 1) vector adds. |
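// For example (illustrative, assuming an add cost of 1): an nxv8i64
// stepvector legalises as four nxv2i64 parts, giving 1 index + 3 adds = 4.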
613 | if (LT.first > 1) { |
614 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: RetTy->getContext()); |
615 | InstructionCost AddCost = |
616 | getArithmeticInstrCost(Opcode: Instruction::Add, Ty: LegalVTy, CostKind); |
617 | Cost += AddCost * (LT.first - 1); |
618 | } |
619 | return Cost; |
620 | } |
621 | case Intrinsic::vector_extract: |
622 | case Intrinsic::vector_insert: { |
623 | // If both the vector and subvector types are legal types and the index |
624 | // is 0, then this should be a no-op or simple operation; return a |
625 | // relatively low cost. |
626 | |
627 | // If arguments aren't actually supplied, then we cannot determine the |
628 | // value of the index. We also want to skip predicate types. |
629 | if (ICA.getArgs().size() != ICA.getArgTypes().size() || |
630 | ICA.getReturnType()->getScalarType()->isIntegerTy(Bitwidth: 1)) |
631 | break; |
632 | |
633 | LLVMContext &C = RetTy->getContext(); |
634 | EVT VecVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]); |
bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
636 | EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, Ty: RetTy) |
637 | : getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[1]); |
638 | // Skip this if either the vector or subvector types are unpacked |
639 | // SVE types; they may get lowered to stack stores and loads. |
640 | if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(VecVT: SubVecVT)) |
641 | break; |
642 | |
643 | TargetLoweringBase::LegalizeKind SubVecLK = |
644 | getTLI()->getTypeConversion(Context&: C, VT: SubVecVT); |
645 | TargetLoweringBase::LegalizeKind VecLK = |
646 | getTLI()->getTypeConversion(Context&: C, VT: VecVT); |
647 | const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2]; |
648 | const ConstantInt *CIdx = cast<ConstantInt>(Val: Idx); |
649 | if (SubVecLK.first == TargetLoweringBase::TypeLegal && |
650 | VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero()) |
651 | return TTI::TCC_Free; |
652 | break; |
653 | } |
654 | case Intrinsic::bitreverse: { |
655 | static const CostTblEntry BitreverseTbl[] = { |
656 | {.ISD: Intrinsic::bitreverse, .Type: MVT::i32, .Cost: 1}, |
657 | {.ISD: Intrinsic::bitreverse, .Type: MVT::i64, .Cost: 1}, |
658 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v8i8, .Cost: 1}, |
659 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v16i8, .Cost: 1}, |
660 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v4i16, .Cost: 2}, |
661 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v8i16, .Cost: 2}, |
662 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v2i32, .Cost: 2}, |
663 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v4i32, .Cost: 2}, |
664 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v1i64, .Cost: 2}, |
665 | {.ISD: Intrinsic::bitreverse, .Type: MVT::v2i64, .Cost: 2}, |
666 | }; |
667 | const auto LegalisationCost = getTypeLegalizationCost(Ty: RetTy); |
668 | const auto *Entry = |
669 | CostTableLookup(Table: BitreverseTbl, ISD: ICA.getID(), Ty: LegalisationCost.second); |
670 | if (Entry) { |
// The cost model uses the legal type (i32) that i8 and i16 are promoted to,
// plus 1 so that we match the actual lowering cost.
673 | if (TLI->getValueType(DL, Ty: RetTy, AllowUnknown: true) == MVT::i8 || |
674 | TLI->getValueType(DL, Ty: RetTy, AllowUnknown: true) == MVT::i16) |
675 | return LegalisationCost.first * Entry->Cost + 1; |
676 | |
677 | return LegalisationCost.first * Entry->Cost; |
678 | } |
679 | break; |
680 | } |
681 | case Intrinsic::ctpop: { |
682 | if (!ST->hasNEON()) { |
683 | // 32-bit or 64-bit ctpop without NEON is 12 instructions. |
684 | return getTypeLegalizationCost(Ty: RetTy).first * 12; |
685 | } |
686 | static const CostTblEntry CtpopCostTbl[] = { |
687 | {.ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: 4}, |
688 | {.ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: 3}, |
689 | {.ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: 2}, |
690 | {.ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: 1}, |
691 | {.ISD: ISD::CTPOP, .Type: MVT::i64, .Cost: 4}, |
692 | {.ISD: ISD::CTPOP, .Type: MVT::v2i32, .Cost: 3}, |
693 | {.ISD: ISD::CTPOP, .Type: MVT::v4i16, .Cost: 2}, |
694 | {.ISD: ISD::CTPOP, .Type: MVT::v8i8, .Cost: 1}, |
695 | {.ISD: ISD::CTPOP, .Type: MVT::i32, .Cost: 5}, |
696 | }; |
697 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
698 | MVT MTy = LT.second; |
699 | if (const auto *Entry = CostTableLookup(Table: CtpopCostTbl, ISD: ISD::CTPOP, Ty: MTy)) { |
700 | // Extra cost of +1 when illegal vector types are legalized by promoting |
701 | // the integer type. |
int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
RetTy->getScalarSizeInBits()
? 1
: 0;
706 | return LT.first * Entry->Cost + ExtraCost; |
707 | } |
708 | break; |
709 | } |
710 | case Intrinsic::sadd_with_overflow: |
711 | case Intrinsic::uadd_with_overflow: |
712 | case Intrinsic::ssub_with_overflow: |
713 | case Intrinsic::usub_with_overflow: |
714 | case Intrinsic::smul_with_overflow: |
715 | case Intrinsic::umul_with_overflow: { |
716 | static const CostTblEntry WithOverflowCostTbl[] = { |
717 | {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i8, .Cost: 3}, |
718 | {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i8, .Cost: 3}, |
719 | {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i16, .Cost: 3}, |
720 | {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i16, .Cost: 3}, |
721 | {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i32, .Cost: 1}, |
722 | {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i32, .Cost: 1}, |
723 | {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i64, .Cost: 1}, |
724 | {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i64, .Cost: 1}, |
725 | {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i8, .Cost: 3}, |
726 | {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i8, .Cost: 3}, |
727 | {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i16, .Cost: 3}, |
728 | {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i16, .Cost: 3}, |
729 | {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i32, .Cost: 1}, |
730 | {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i32, .Cost: 1}, |
731 | {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i64, .Cost: 1}, |
732 | {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i64, .Cost: 1}, |
733 | {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i8, .Cost: 5}, |
734 | {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i8, .Cost: 4}, |
735 | {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i16, .Cost: 5}, |
736 | {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i16, .Cost: 4}, |
737 | {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i32, .Cost: 2}, // eg umull;tst |
738 | {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i32, .Cost: 2}, // eg umull;cmp sxtw |
739 | {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i64, .Cost: 3}, // eg mul;smulh;cmp |
740 | {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i64, .Cost: 3}, // eg mul;umulh;cmp asr |
741 | }; |
742 | EVT MTy = TLI->getValueType(DL, Ty: RetTy->getContainedType(i: 0), AllowUnknown: true); |
743 | if (MTy.isSimple()) |
744 | if (const auto *Entry = CostTableLookup(Table: WithOverflowCostTbl, ISD: ICA.getID(), |
745 | Ty: MTy.getSimpleVT())) |
746 | return Entry->Cost; |
747 | break; |
748 | } |
749 | case Intrinsic::fptosi_sat: |
750 | case Intrinsic::fptoui_sat: { |
751 | if (ICA.getArgTypes().empty()) |
752 | break; |
753 | bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; |
754 | auto LT = getTypeLegalizationCost(Ty: ICA.getArgTypes()[0]); |
755 | EVT MTy = TLI->getValueType(DL, Ty: RetTy); |
756 | // Check for the legal types, which are where the size of the input and the |
757 | // output are the same, or we are using cvt f64->i32 or f32->i64. |
758 | if ((LT.second == MVT::f32 || LT.second == MVT::f64 || |
759 | LT.second == MVT::v2f32 || LT.second == MVT::v4f32 || |
760 | LT.second == MVT::v2f64) && |
761 | (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() || |
762 | (LT.second == MVT::f64 && MTy == MVT::i32) || |
763 | (LT.second == MVT::f32 && MTy == MVT::i64))) |
764 | return LT.first; |
765 | // Similarly for fp16 sizes |
766 | if (ST->hasFullFP16() && |
767 | ((LT.second == MVT::f16 && MTy == MVT::i32) || |
768 | ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) && |
769 | (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))) |
770 | return LT.first; |
771 | |
772 | // Otherwise we use a legal convert followed by a min+max |
773 | if ((LT.second.getScalarType() == MVT::f32 || |
774 | LT.second.getScalarType() == MVT::f64 || |
775 | (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) && |
776 | LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) { |
777 | Type *LegalTy = |
778 | Type::getIntNTy(C&: RetTy->getContext(), N: LT.second.getScalarSizeInBits()); |
779 | if (LT.second.isVector()) |
780 | LegalTy = VectorType::get(ElementType: LegalTy, EC: LT.second.getVectorElementCount()); |
781 | InstructionCost Cost = 1; |
782 | IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin, |
783 | LegalTy, {LegalTy, LegalTy}); |
784 | Cost += getIntrinsicInstrCost(ICA: Attrs1, CostKind); |
785 | IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax, |
786 | LegalTy, {LegalTy, LegalTy}); |
787 | Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind); |
788 | return LT.first * Cost; |
789 | } |
790 | break; |
791 | } |
792 | case Intrinsic::fshl: |
793 | case Intrinsic::fshr: { |
794 | if (ICA.getArgs().empty()) |
795 | break; |
796 | |
797 | // TODO: Add handling for fshl where third argument is not a constant. |
798 | const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(V: ICA.getArgs()[2]); |
799 | if (!OpInfoZ.isConstant()) |
800 | break; |
801 | |
802 | const auto LegalisationCost = getTypeLegalizationCost(Ty: RetTy); |
803 | if (OpInfoZ.isUniform()) { |
804 | // FIXME: The costs could be lower if the codegen is better. |
805 | static const CostTblEntry FshlTbl[] = { |
806 | {.ISD: Intrinsic::fshl, .Type: MVT::v4i32, .Cost: 3}, // ushr + shl + orr |
807 | {.ISD: Intrinsic::fshl, .Type: MVT::v2i64, .Cost: 3}, {.ISD: Intrinsic::fshl, .Type: MVT::v16i8, .Cost: 4}, |
808 | {.ISD: Intrinsic::fshl, .Type: MVT::v8i16, .Cost: 4}, {.ISD: Intrinsic::fshl, .Type: MVT::v2i32, .Cost: 3}, |
809 | {.ISD: Intrinsic::fshl, .Type: MVT::v8i8, .Cost: 4}, {.ISD: Intrinsic::fshl, .Type: MVT::v4i16, .Cost: 4}}; |
810 | // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl |
811 | // to avoid having to duplicate the costs. |
812 | const auto *Entry = |
813 | CostTableLookup(Table: FshlTbl, ISD: Intrinsic::fshl, Ty: LegalisationCost.second); |
814 | if (Entry) |
815 | return LegalisationCost.first * Entry->Cost; |
816 | } |
817 | |
818 | auto TyL = getTypeLegalizationCost(Ty: RetTy); |
819 | if (!RetTy->isIntegerTy()) |
820 | break; |
821 | |
822 | // Estimate cost manually, as types like i8 and i16 will get promoted to |
823 | // i32 and CostTableLookup will ignore the extra conversion cost. |
824 | bool HigherCost = (RetTy->getScalarSizeInBits() != 32 && |
825 | RetTy->getScalarSizeInBits() < 64) || |
826 | (RetTy->getScalarSizeInBits() % 64 != 0); |
unsigned ExtraCost = HigherCost ? 1 : 0;
828 | if (RetTy->getScalarSizeInBits() == 32 || |
829 | RetTy->getScalarSizeInBits() == 64) |
ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
831 | // extr instruction. |
832 | else if (HigherCost) |
833 | ExtraCost = 1; |
834 | else |
835 | break; |
836 | return TyL.first + ExtraCost; |
837 | } |
838 | case Intrinsic::get_active_lane_mask: { |
839 | auto *RetTy = dyn_cast<FixedVectorType>(Val: ICA.getReturnType()); |
840 | if (RetTy) { |
841 | EVT RetVT = getTLI()->getValueType(DL, Ty: RetTy); |
842 | EVT OpVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]); |
843 | if (!getTLI()->shouldExpandGetActiveLaneMask(VT: RetVT, OpVT) && |
844 | !getTLI()->isTypeLegal(VT: RetVT)) { |
845 | // We don't have enough context at this point to determine if the mask |
846 | // is going to be kept live after the block, which will force the vXi1 |
847 | // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32. |
848 | // For now, we just assume the vectorizer created this intrinsic and |
849 | // the result will be the input for a PHI. In this case the cost will |
850 | // be extremely high for fixed-width vectors. |
851 | // NOTE: getScalarizationOverhead returns a cost that's far too |
852 | // pessimistic for the actual generated codegen. In reality there are |
853 | // two instructions generated per lane. |
854 | return RetTy->getNumElements() * 2; |
855 | } |
856 | } |
857 | break; |
858 | } |
859 | default: |
860 | break; |
861 | } |
862 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
863 | } |
864 | |
/// Remove redundant reinterpret (convert to/from svbool) casts in the
/// presence of control flow, i.e. when the cast operand is a PHI node.
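/// Illustrative (pseudo-IR): a PHI whose incoming values are all
/// convert.to.svbool casts of <vscale x 4 x i1> values, and whose only use is
/// a convert.from.svbool back to <vscale x 4 x i1>, is rebuilt as a PHI over
/// the original <vscale x 4 x i1> operands, leaving the reinterprets dead.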
867 | static std::optional<Instruction *> processPhiNode(InstCombiner &IC, |
868 | IntrinsicInst &II) { |
869 | SmallVector<Instruction *, 32> Worklist; |
870 | auto RequiredType = II.getType(); |
871 | |
872 | auto *PN = dyn_cast<PHINode>(Val: II.getArgOperand(i: 0)); |
873 | assert(PN && "Expected Phi Node!" ); |
874 | |
875 | // Don't create a new Phi unless we can remove the old one. |
876 | if (!PN->hasOneUse()) |
877 | return std::nullopt; |
878 | |
879 | for (Value *IncValPhi : PN->incoming_values()) { |
880 | auto *Reinterpret = dyn_cast<IntrinsicInst>(Val: IncValPhi); |
881 | if (!Reinterpret || |
882 | Reinterpret->getIntrinsicID() != |
883 | Intrinsic::aarch64_sve_convert_to_svbool || |
884 | RequiredType != Reinterpret->getArgOperand(i: 0)->getType()) |
885 | return std::nullopt; |
886 | } |
887 | |
888 | // Create the new Phi |
889 | IC.Builder.SetInsertPoint(PN); |
890 | PHINode *NPN = IC.Builder.CreatePHI(Ty: RequiredType, NumReservedValues: PN->getNumIncomingValues()); |
891 | Worklist.push_back(Elt: PN); |
892 | |
893 | for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { |
894 | auto *Reinterpret = cast<Instruction>(Val: PN->getIncomingValue(i: I)); |
895 | NPN->addIncoming(V: Reinterpret->getOperand(i: 0), BB: PN->getIncomingBlock(i: I)); |
896 | Worklist.push_back(Elt: Reinterpret); |
897 | } |
898 | |
899 | // Cleanup Phi Node and reinterprets |
900 | return IC.replaceInstUsesWith(I&: II, V: NPN); |
901 | } |
902 | |
// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
904 | // => (binop (pred) (from_svbool _) (from_svbool _)) |
905 | // |
906 | // The above transformation eliminates a `to_svbool` in the predicate |
907 | // operand of bitwise operation `binop` by narrowing the vector width of |
908 | // the operation. For example, it would convert a `<vscale x 16 x i1> |
909 | // and` into a `<vscale x 4 x i1> and`. This is profitable because |
910 | // to_svbool must zero the new lanes during widening, whereas |
911 | // from_svbool is free. |
912 | static std::optional<Instruction *> |
913 | tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) { |
914 | auto BinOp = dyn_cast<IntrinsicInst>(Val: II.getOperand(i_nocapture: 0)); |
915 | if (!BinOp) |
916 | return std::nullopt; |
917 | |
918 | auto IntrinsicID = BinOp->getIntrinsicID(); |
919 | switch (IntrinsicID) { |
920 | case Intrinsic::aarch64_sve_and_z: |
921 | case Intrinsic::aarch64_sve_bic_z: |
922 | case Intrinsic::aarch64_sve_eor_z: |
923 | case Intrinsic::aarch64_sve_nand_z: |
924 | case Intrinsic::aarch64_sve_nor_z: |
925 | case Intrinsic::aarch64_sve_orn_z: |
926 | case Intrinsic::aarch64_sve_orr_z: |
927 | break; |
928 | default: |
929 | return std::nullopt; |
930 | } |
931 | |
932 | auto BinOpPred = BinOp->getOperand(i_nocapture: 0); |
933 | auto BinOpOp1 = BinOp->getOperand(i_nocapture: 1); |
934 | auto BinOpOp2 = BinOp->getOperand(i_nocapture: 2); |
935 | |
936 | auto PredIntr = dyn_cast<IntrinsicInst>(Val: BinOpPred); |
937 | if (!PredIntr || |
938 | PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) |
939 | return std::nullopt; |
940 | |
941 | auto PredOp = PredIntr->getOperand(i_nocapture: 0); |
942 | auto PredOpTy = cast<VectorType>(Val: PredOp->getType()); |
943 | if (PredOpTy != II.getType()) |
944 | return std::nullopt; |
945 | |
946 | SmallVector<Value *> NarrowedBinOpArgs = {PredOp}; |
947 | auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic( |
948 | ID: Intrinsic::aarch64_sve_convert_from_svbool, Types: {PredOpTy}, Args: {BinOpOp1}); |
949 | NarrowedBinOpArgs.push_back(Elt: NarrowBinOpOp1); |
950 | if (BinOpOp1 == BinOpOp2) |
951 | NarrowedBinOpArgs.push_back(Elt: NarrowBinOpOp1); |
952 | else |
953 | NarrowedBinOpArgs.push_back(Elt: IC.Builder.CreateIntrinsic( |
954 | ID: Intrinsic::aarch64_sve_convert_from_svbool, Types: {PredOpTy}, Args: {BinOpOp2})); |
955 | |
956 | auto NarrowedBinOp = |
957 | IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {PredOpTy}, Args: NarrowedBinOpArgs); |
958 | return IC.replaceInstUsesWith(I&: II, V: NarrowedBinOp); |
959 | } |
960 | |
961 | static std::optional<Instruction *> |
962 | instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) { |
963 | // If the reinterpret instruction operand is a PHI Node |
964 | if (isa<PHINode>(Val: II.getArgOperand(i: 0))) |
965 | return processPhiNode(IC, II); |
966 | |
967 | if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II)) |
968 | return BinOpCombine; |
969 | |
970 | // Ignore converts to/from svcount_t. |
971 | if (isa<TargetExtType>(Val: II.getArgOperand(i: 0)->getType()) || |
972 | isa<TargetExtType>(Val: II.getType())) |
973 | return std::nullopt; |
974 | |
975 | SmallVector<Instruction *, 32> CandidatesForRemoval; |
976 | Value *Cursor = II.getOperand(i_nocapture: 0), *EarliestReplacement = nullptr; |
977 | |
978 | const auto *IVTy = cast<VectorType>(Val: II.getType()); |
979 | |
980 | // Walk the chain of conversions. |
981 | while (Cursor) { |
982 | // If the type of the cursor has fewer lanes than the final result, zeroing |
983 | // must take place, which breaks the equivalence chain. |
984 | const auto *CursorVTy = cast<VectorType>(Val: Cursor->getType()); |
985 | if (CursorVTy->getElementCount().getKnownMinValue() < |
986 | IVTy->getElementCount().getKnownMinValue()) |
987 | break; |
988 | |
989 | // If the cursor has the same type as I, it is a viable replacement. |
990 | if (Cursor->getType() == IVTy) |
991 | EarliestReplacement = Cursor; |
992 | |
993 | auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Val: Cursor); |
994 | |
995 | // If this is not an SVE conversion intrinsic, this is the end of the chain. |
996 | if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == |
997 | Intrinsic::aarch64_sve_convert_to_svbool || |
998 | IntrinsicCursor->getIntrinsicID() == |
999 | Intrinsic::aarch64_sve_convert_from_svbool)) |
1000 | break; |
1001 | |
1002 | CandidatesForRemoval.insert(I: CandidatesForRemoval.begin(), Elt: IntrinsicCursor); |
1003 | Cursor = IntrinsicCursor->getOperand(i_nocapture: 0); |
1004 | } |
1005 | |
1006 | // If no viable replacement in the conversion chain was found, there is |
1007 | // nothing to do. |
1008 | if (!EarliestReplacement) |
1009 | return std::nullopt; |
1010 | |
1011 | return IC.replaceInstUsesWith(I&: II, V: EarliestReplacement); |
1012 | } |
1013 | |
1014 | static bool isAllActivePredicate(Value *Pred) { |
1015 | // Look through convert.from.svbool(convert.to.svbool(...) chain. |
1016 | Value *UncastedPred; |
1017 | if (match(V: Pred, P: m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>( |
1018 | Op0: m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>( |
1019 | Op0: m_Value(V&: UncastedPred))))) |
1020 | // If the predicate has the same or less lanes than the uncasted |
1021 | // predicate then we know the casting has no effect. |
1022 | if (cast<ScalableVectorType>(Val: Pred->getType())->getMinNumElements() <= |
1023 | cast<ScalableVectorType>(Val: UncastedPred->getType())->getMinNumElements()) |
1024 | Pred = UncastedPred; |
1025 | |
1026 | return match(V: Pred, P: m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( |
1027 | Op0: m_ConstantInt<AArch64SVEPredPattern::all>())); |
1028 | } |
1029 | |
// Erase a unary operation whose predicate has all inactive lanes.
1031 | static std::optional<Instruction *> |
1032 | instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II, |
1033 | int PredPos) { |
1034 | if (match(V: II.getOperand(i_nocapture: PredPos), P: m_ZeroInt())) { |
1035 | return IC.eraseInstFromFunction(I&: II); |
1036 | } |
1037 | return std::nullopt; |
1038 | } |
1039 | |
// Simplify a unary operation whose predicate has all inactive lanes by
// replacing the instruction with a zeroed result.
1042 | static std::optional<Instruction *> |
1043 | instCombineSVENoActiveUnaryZero(InstCombiner &IC, IntrinsicInst &II) { |
1044 | if (match(V: II.getOperand(i_nocapture: 0), P: m_ZeroInt())) { |
1045 | Constant *Node; |
1046 | Type *RetTy = II.getType(); |
1047 | if (RetTy->isStructTy()) { |
1048 | auto StructT = cast<StructType>(Val: RetTy); |
1049 | auto VecT = StructT->getElementType(N: 0); |
1050 | SmallVector<llvm::Constant *, 4> ZerVec; |
1051 | for (unsigned i = 0; i < StructT->getNumElements(); i++) { |
1052 | ZerVec.push_back(Elt: VecT->isFPOrFPVectorTy() ? ConstantFP::get(Ty: VecT, V: 0.0) |
1053 | : ConstantInt::get(Ty: VecT, V: 0)); |
1054 | } |
1055 | Node = ConstantStruct::get(T: StructT, V: ZerVec); |
1056 | } else if (RetTy->isFPOrFPVectorTy()) |
1057 | Node = ConstantFP::get(Ty: RetTy, V: 0.0); |
1058 | else |
1059 | Node = ConstantInt::get(Ty: II.getType(), V: 0); |
1060 | |
1061 | IC.replaceInstUsesWith(I&: II, V: Node); |
1062 | return IC.eraseInstFromFunction(I&: II); |
1063 | } |
1064 | return std::nullopt; |
1065 | } |
1066 | |
1067 | static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC, |
1068 | IntrinsicInst &II) { |
1069 | // svsel(ptrue, x, y) => x |
1070 | auto *OpPredicate = II.getOperand(i_nocapture: 0); |
1071 | if (isAllActivePredicate(Pred: OpPredicate)) |
1072 | return IC.replaceInstUsesWith(I&: II, V: II.getOperand(i_nocapture: 1)); |
1073 | |
1074 | auto Select = |
1075 | IC.Builder.CreateSelect(C: OpPredicate, True: II.getOperand(i_nocapture: 1), False: II.getOperand(i_nocapture: 2)); |
1076 | return IC.replaceInstUsesWith(I&: II, V: Select); |
1077 | } |
1078 | |
1079 | static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC, |
1080 | IntrinsicInst &II) { |
1081 | IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(Val: II.getArgOperand(i: 1)); |
1082 | if (!Pg) |
1083 | return std::nullopt; |
1084 | |
1085 | if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) |
1086 | return std::nullopt; |
1087 | |
1088 | const auto PTruePattern = |
1089 | cast<ConstantInt>(Val: Pg->getOperand(i_nocapture: 0))->getZExtValue(); |
1090 | if (PTruePattern != AArch64SVEPredPattern::vl1) |
1091 | return std::nullopt; |
1092 | |
1093 | // The intrinsic is inserting into lane zero so use an insert instead. |
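// i.e. (illustrative) dup(%vec, ptrue(vl1), %scalar) becomes
// insertelement %vec, %scalar, i64 0.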
1094 | auto *IdxTy = Type::getInt64Ty(C&: II.getContext()); |
1095 | auto *Insert = InsertElementInst::Create( |
1096 | Vec: II.getArgOperand(i: 0), NewElt: II.getArgOperand(i: 2), Idx: ConstantInt::get(Ty: IdxTy, V: 0)); |
1097 | Insert->insertBefore(InsertPos: &II); |
1098 | Insert->takeName(V: &II); |
1099 | |
1100 | return IC.replaceInstUsesWith(I&: II, V: Insert); |
1101 | } |
1102 | |
1103 | static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC, |
1104 | IntrinsicInst &II) { |
1105 | // Replace DupX with a regular IR splat. |
1106 | auto *RetTy = cast<ScalableVectorType>(Val: II.getType()); |
1107 | Value *Splat = IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(), |
1108 | V: II.getArgOperand(i: 0)); |
1109 | Splat->takeName(V: &II); |
1110 | return IC.replaceInstUsesWith(I&: II, V: Splat); |
1111 | } |
1112 | |
1113 | static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC, |
1114 | IntrinsicInst &II) { |
1115 | LLVMContext &Ctx = II.getContext(); |
1116 | |
1117 | // Check that the predicate is all active |
1118 | auto *Pg = dyn_cast<IntrinsicInst>(Val: II.getArgOperand(i: 0)); |
1119 | if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) |
1120 | return std::nullopt; |
1121 | |
1122 | const auto PTruePattern = |
1123 | cast<ConstantInt>(Val: Pg->getOperand(i_nocapture: 0))->getZExtValue(); |
1124 | if (PTruePattern != AArch64SVEPredPattern::all) |
1125 | return std::nullopt; |
1126 | |
1127 | // Check that we have a compare of zero.. |
1128 | auto *SplatValue = |
1129 | dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: II.getArgOperand(i: 2))); |
1130 | if (!SplatValue || !SplatValue->isZero()) |
1131 | return std::nullopt; |
1132 | |
1133 | // ..against a dupq |
1134 | auto *DupQLane = dyn_cast<IntrinsicInst>(Val: II.getArgOperand(i: 1)); |
1135 | if (!DupQLane || |
1136 | DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane) |
1137 | return std::nullopt; |
1138 | |
1139 | // Where the dupq is a lane 0 replicate of a vector insert |
1140 | if (!cast<ConstantInt>(Val: DupQLane->getArgOperand(i: 1))->isZero()) |
1141 | return std::nullopt; |
1142 | |
1143 | auto *VecIns = dyn_cast<IntrinsicInst>(Val: DupQLane->getArgOperand(i: 0)); |
1144 | if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert) |
1145 | return std::nullopt; |
1146 | |
1147 | // Where the vector insert is a fixed constant vector insert into undef at |
1148 | // index zero |
1149 | if (!isa<UndefValue>(Val: VecIns->getArgOperand(i: 0))) |
1150 | return std::nullopt; |
1151 | |
1152 | if (!cast<ConstantInt>(Val: VecIns->getArgOperand(i: 2))->isZero()) |
1153 | return std::nullopt; |
1154 | |
1155 | auto *ConstVec = dyn_cast<Constant>(Val: VecIns->getArgOperand(i: 1)); |
1156 | if (!ConstVec) |
1157 | return std::nullopt; |
1158 | |
1159 | auto *VecTy = dyn_cast<FixedVectorType>(Val: ConstVec->getType()); |
1160 | auto *OutTy = dyn_cast<ScalableVectorType>(Val: II.getType()); |
1161 | if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements()) |
1162 | return std::nullopt; |
1163 | |
1164 | unsigned NumElts = VecTy->getNumElements(); |
1165 | unsigned PredicateBits = 0; |
1166 | |
1167 | // Expand intrinsic operands to a 16-bit byte level predicate |
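// (Illustrative: for a 4-element constant vector, set elements map to bits
// 0, 4, 8 and 12 of the 16-bit mask.)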
1168 | for (unsigned I = 0; I < NumElts; ++I) { |
1169 | auto *Arg = dyn_cast<ConstantInt>(Val: ConstVec->getAggregateElement(Elt: I)); |
1170 | if (!Arg) |
1171 | return std::nullopt; |
1172 | if (!Arg->isZero()) |
1173 | PredicateBits |= 1 << (I * (16 / NumElts)); |
1174 | } |
1175 | |
1176 | // If all bits are zero bail early with an empty predicate |
1177 | if (PredicateBits == 0) { |
1178 | auto *PFalse = Constant::getNullValue(Ty: II.getType()); |
1179 | PFalse->takeName(V: &II); |
1180 | return IC.replaceInstUsesWith(I&: II, V: PFalse); |
1181 | } |
1182 | |
1183 | // Calculate largest predicate type used (where byte predicate is largest) |
1184 | unsigned Mask = 8; |
1185 | for (unsigned I = 0; I < 16; ++I) |
1186 | if ((PredicateBits & (1 << I)) != 0) |
1187 | Mask |= (I % 8); |
1188 | |
1189 | unsigned PredSize = Mask & -Mask; |
1190 | auto *PredType = ScalableVectorType::get( |
1191 | ElementType: Type::getInt1Ty(C&: Ctx), MinNumElts: AArch64::SVEBitsPerBlock / (PredSize * 8)); |
1192 | |
1193 | // Ensure all relevant bits are set |
1194 | for (unsigned I = 0; I < 16; I += PredSize) |
1195 | if ((PredicateBits & (1 << I)) == 0) |
1196 | return std::nullopt; |
1197 | |
1198 | auto *PTruePat = |
1199 | ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: AArch64SVEPredPattern::all); |
1200 | auto *PTrue = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue, |
1201 | Types: {PredType}, Args: {PTruePat}); |
1202 | auto *ConvertToSVBool = IC.Builder.CreateIntrinsic( |
1203 | ID: Intrinsic::aarch64_sve_convert_to_svbool, Types: {PredType}, Args: {PTrue}); |
1204 | auto *ConvertFromSVBool = |
1205 | IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_convert_from_svbool, |
1206 | Types: {II.getType()}, Args: {ConvertToSVBool}); |
1207 | |
1208 | ConvertFromSVBool->takeName(V: &II); |
1209 | return IC.replaceInstUsesWith(I&: II, V: ConvertFromSVBool); |
1210 | } |
1211 | |
1212 | static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC, |
1213 | IntrinsicInst &II) { |
1214 | Value *Pg = II.getArgOperand(i: 0); |
1215 | Value *Vec = II.getArgOperand(i: 1); |
1216 | auto IntrinsicID = II.getIntrinsicID(); |
1217 | bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta; |
1218 | |
1219 | // lastX(splat(X)) --> X |
1220 | if (auto *SplatVal = getSplatValue(V: Vec)) |
1221 | return IC.replaceInstUsesWith(I&: II, V: SplatVal); |
1222 | |
1223 | // If x and/or y is a splat value then: |
1224 | // lastX (binop (x, y)) --> binop(lastX(x), lastX(y)) |
1225 | Value *LHS, *RHS; |
1226 | if (match(V: Vec, P: m_OneUse(SubPattern: m_BinOp(L: m_Value(V&: LHS), R: m_Value(V&: RHS))))) { |
1227 | if (isSplatValue(V: LHS) || isSplatValue(V: RHS)) { |
1228 | auto *OldBinOp = cast<BinaryOperator>(Val: Vec); |
1229 | auto OpC = OldBinOp->getOpcode(); |
1230 | auto *NewLHS = |
1231 | IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {Vec->getType()}, Args: {Pg, LHS}); |
1232 | auto *NewRHS = |
1233 | IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {Vec->getType()}, Args: {Pg, RHS}); |
1234 | auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags( |
1235 | Opc: OpC, V1: NewLHS, V2: NewRHS, CopyO: OldBinOp, Name: OldBinOp->getName(), InsertBefore: II.getIterator()); |
1236 | return IC.replaceInstUsesWith(I&: II, V: NewBinOp); |
1237 | } |
1238 | } |
1239 | |
1240 | auto *C = dyn_cast<Constant>(Val: Pg); |
1241 | if (IsAfter && C && C->isNullValue()) { |
1242 | // The intrinsic is extracting lane 0 so use an extract instead. |
1243 | auto *IdxTy = Type::getInt64Ty(C&: II.getContext()); |
auto *Extract = ExtractElementInst::Create(Vec, Idx: ConstantInt::get(Ty: IdxTy, V: 0));
1245 | Extract->insertBefore(InsertPos: &II); |
1246 | Extract->takeName(V: &II); |
1247 | return IC.replaceInstUsesWith(I&: II, V: Extract); |
1248 | } |
1249 | |
1250 | auto *IntrPG = dyn_cast<IntrinsicInst>(Val: Pg); |
1251 | if (!IntrPG) |
1252 | return std::nullopt; |
1253 | |
1254 | if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) |
1255 | return std::nullopt; |
1256 | |
1257 | const auto PTruePattern = |
1258 | cast<ConstantInt>(Val: IntrPG->getOperand(i_nocapture: 0))->getZExtValue(); |
1259 | |
1260 | // Can the intrinsic's predicate be converted to a known constant index? |
1261 | unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern: PTruePattern); |
1262 | if (!MinNumElts) |
1263 | return std::nullopt; |
1264 | |
1265 | unsigned Idx = MinNumElts - 1; |
1266 | // Increment the index if extracting the element after the last active |
1267 | // predicate element. |
1268 | if (IsAfter) |
1269 | ++Idx; |
1270 | |
1271 | // Ignore extracts whose index is larger than the known minimum vector |
1272 | // length. NOTE: This is an artificial constraint where we prefer to |
1273 | // maintain what the user asked for until an alternative is proven faster. |
1274 | auto *PgVTy = cast<ScalableVectorType>(Val: Pg->getType()); |
1275 | if (Idx >= PgVTy->getMinNumElements()) |
1276 | return std::nullopt; |
1277 | |
1278 | // The intrinsic is extracting a fixed lane so use an extract instead. |
1279 | auto *IdxTy = Type::getInt64Ty(C&: II.getContext()); |
1280 | auto *Extract = ExtractElementInst::Create(Vec, Idx: ConstantInt::get(Ty: IdxTy, V: Idx)); |
1281 | Extract->insertBefore(InsertPos: &II); |
1282 | Extract->takeName(V: &II); |
1283 | return IC.replaceInstUsesWith(I&: II, V: Extract); |
1284 | } |
1285 | |
1286 | static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC, |
1287 | IntrinsicInst &II) { |
1288 | // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar |
1289 | // integer variant across a variety of micro-architectures. Replace scalar |
1290 | // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple |
1291 | // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more |
1292 | // depending on the micro-architecture, but has been observed as generally |
1293 | // being faster, particularly when the CLAST[AB] op is a loop-carried |
1294 | // dependency. |
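// Illustrative IR sketch of the rewrite (scalar i32 case; value names are
// placeholders, not taken from the source):
//   %r = call i32 @llvm.aarch64.sve.clasta.n.nxv4i32(<vscale x 4 x i1> %pg,
//                                                    i32 %fb, <vscale x 4 x i32> %v)
// becomes roughly:
//   %fbf = bitcast i32 %fb to float
//   %vf  = bitcast <vscale x 4 x i32> %v to <vscale x 4 x float>
//   %rf  = call float @llvm.aarch64.sve.clasta.n.nxv4f32(<vscale x 4 x i1> %pg,
//                                                        float %fbf, <vscale x 4 x float> %vf)
//   %r   = bitcast float %rf to i32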
1295 | Value *Pg = II.getArgOperand(i: 0); |
1296 | Value *Fallback = II.getArgOperand(i: 1); |
1297 | Value *Vec = II.getArgOperand(i: 2); |
1298 | Type *Ty = II.getType(); |
1299 | |
1300 | if (!Ty->isIntegerTy()) |
1301 | return std::nullopt; |
1302 | |
1303 | Type *FPTy; |
1304 | switch (cast<IntegerType>(Val: Ty)->getBitWidth()) { |
1305 | default: |
1306 | return std::nullopt; |
1307 | case 16: |
1308 | FPTy = IC.Builder.getHalfTy(); |
1309 | break; |
1310 | case 32: |
1311 | FPTy = IC.Builder.getFloatTy(); |
1312 | break; |
1313 | case 64: |
1314 | FPTy = IC.Builder.getDoubleTy(); |
1315 | break; |
1316 | } |
1317 | |
1318 | Value *FPFallBack = IC.Builder.CreateBitCast(V: Fallback, DestTy: FPTy); |
1319 | auto *FPVTy = VectorType::get( |
1320 | ElementType: FPTy, EC: cast<VectorType>(Val: Vec->getType())->getElementCount()); |
1321 | Value *FPVec = IC.Builder.CreateBitCast(V: Vec, DestTy: FPVTy); |
1322 | auto *FPII = IC.Builder.CreateIntrinsic( |
1323 | ID: II.getIntrinsicID(), Types: {FPVec->getType()}, Args: {Pg, FPFallBack, FPVec}); |
1324 | Value *FPIItoInt = IC.Builder.CreateBitCast(V: FPII, DestTy: II.getType()); |
1325 | return IC.replaceInstUsesWith(I&: II, V: FPIItoInt); |
1326 | } |
1327 | |
1328 | static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC, |
1329 | IntrinsicInst &II) { |
1330 | LLVMContext &Ctx = II.getContext(); |
1331 | // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr |
1332 | // can work with RDFFR_PP for ptest elimination. |
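// That is, rdffr() is rewritten as rdffr.z(ptrue(all)), which reads the FFR
// under an all-true predicate and yields the same value.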
1333 | auto *AllPat = |
1334 | ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: AArch64SVEPredPattern::all); |
1335 | auto *PTrue = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue, |
1336 | Types: {II.getType()}, Args: {AllPat}); |
1337 | auto *RDFFR = |
1338 | IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_rdffr_z, Types: {}, Args: {PTrue}); |
1339 | RDFFR->takeName(V: &II); |
1340 | return IC.replaceInstUsesWith(I&: II, V: RDFFR); |
1341 | } |
1342 | |
1343 | static std::optional<Instruction *> |
1344 | instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { |
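// With the 'all' pattern these intrinsics count every element, i.e.
// NumElts * vscale (e.g. cntw(all) --> vscale * 4); for other fixed patterns
// whose element count is known to fit, the result folds to that constant.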
1345 | const auto Pattern = cast<ConstantInt>(Val: II.getArgOperand(i: 0))->getZExtValue(); |
1346 | |
1347 | if (Pattern == AArch64SVEPredPattern::all) { |
1348 | Constant *StepVal = ConstantInt::get(Ty: II.getType(), V: NumElts); |
1349 | auto *VScale = IC.Builder.CreateVScale(Scaling: StepVal); |
1350 | VScale->takeName(V: &II); |
1351 | return IC.replaceInstUsesWith(I&: II, V: VScale); |
1352 | } |
1353 | |
1354 | unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern); |
1355 | |
1356 | return MinNumElts && NumElts >= MinNumElts |
1357 | ? std::optional<Instruction *>(IC.replaceInstUsesWith( |
1358 | I&: II, V: ConstantInt::get(Ty: II.getType(), V: MinNumElts))) |
1359 | : std::nullopt; |
1360 | } |
1361 | |
1362 | static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, |
1363 | IntrinsicInst &II) { |
1364 | Value *PgVal = II.getArgOperand(i: 0); |
1365 | Value *OpVal = II.getArgOperand(i: 1); |
1366 | |
1367 | // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X). |
1368 | // Later optimizations prefer this form. |
1369 | if (PgVal == OpVal && |
1370 | (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first || |
1371 | II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) { |
1372 | Value *Ops[] = {PgVal, OpVal}; |
1373 | Type *Tys[] = {PgVal->getType()}; |
1374 | |
1375 | auto *PTest = |
1376 | IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptest_any, Types: Tys, Args: Ops); |
1377 | PTest->takeName(V: &II); |
1378 | |
1379 | return IC.replaceInstUsesWith(I&: II, V: PTest); |
1380 | } |
1381 | |
1382 | IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(Val: PgVal); |
1383 | IntrinsicInst *Op = dyn_cast<IntrinsicInst>(Val: OpVal); |
1384 | |
1385 | if (!Pg || !Op) |
1386 | return std::nullopt; |
1387 | |
1388 | Intrinsic::ID OpIID = Op->getIntrinsicID(); |
1389 | |
1390 | if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && |
1391 | OpIID == Intrinsic::aarch64_sve_convert_to_svbool && |
1392 | Pg->getArgOperand(i: 0)->getType() == Op->getArgOperand(i: 0)->getType()) { |
1393 | Value *Ops[] = {Pg->getArgOperand(i: 0), Op->getArgOperand(i: 0)}; |
1394 | Type *Tys[] = {Pg->getArgOperand(i: 0)->getType()}; |
1395 | |
1396 | auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: Tys, Args: Ops); |
1397 | |
1398 | PTest->takeName(V: &II); |
1399 | return IC.replaceInstUsesWith(I&: II, V: PTest); |
1400 | } |
1401 | |
1402 | // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X). |
1403 | // Later optimizations may rewrite the sequence to use the flag-setting variant |
1404 | // of instruction X to remove the PTEST. |
1405 | if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) && |
1406 | ((OpIID == Intrinsic::aarch64_sve_brka_z) || |
1407 | (OpIID == Intrinsic::aarch64_sve_brkb_z) || |
1408 | (OpIID == Intrinsic::aarch64_sve_brkpa_z) || |
1409 | (OpIID == Intrinsic::aarch64_sve_brkpb_z) || |
1410 | (OpIID == Intrinsic::aarch64_sve_rdffr_z) || |
1411 | (OpIID == Intrinsic::aarch64_sve_and_z) || |
1412 | (OpIID == Intrinsic::aarch64_sve_bic_z) || |
1413 | (OpIID == Intrinsic::aarch64_sve_eor_z) || |
1414 | (OpIID == Intrinsic::aarch64_sve_nand_z) || |
1415 | (OpIID == Intrinsic::aarch64_sve_nor_z) || |
1416 | (OpIID == Intrinsic::aarch64_sve_orn_z) || |
1417 | (OpIID == Intrinsic::aarch64_sve_orr_z))) { |
1418 | Value *Ops[] = {Pg->getArgOperand(i: 0), Pg}; |
1419 | Type *Tys[] = {Pg->getType()}; |
1420 | |
1421 | auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: Tys, Args: Ops); |
1422 | PTest->takeName(V: &II); |
1423 | |
1424 | return IC.replaceInstUsesWith(I&: II, V: PTest); |
1425 | } |
1426 | |
1427 | return std::nullopt; |
1428 | } |
1429 | |
1430 | template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc> |
1431 | static std::optional<Instruction *> |
1432 | instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, |
1433 | bool MergeIntoAddendOp) { |
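// Fold a predicated multiply feeding this add/sub into the corresponding
// multiply-accumulate intrinsic (e.g. mul+add --> mla/mad, fmul+fadd -->
// fmla/fmad), provided both use the same predicate, the multiply has no other
// users and, for FP, the contract fast-math flag is present.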
1434 | Value *P = II.getOperand(i_nocapture: 0); |
1435 | Value *MulOp0, *MulOp1, *AddendOp, *Mul; |
1436 | if (MergeIntoAddendOp) { |
1437 | AddendOp = II.getOperand(i_nocapture: 1); |
1438 | Mul = II.getOperand(i_nocapture: 2); |
1439 | } else { |
1440 | AddendOp = II.getOperand(i_nocapture: 2); |
1441 | Mul = II.getOperand(i_nocapture: 1); |
1442 | } |
1443 | |
1444 | if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(V: P), m_Value(V&: MulOp0), |
1445 | m_Value(V&: MulOp1)))) |
1446 | return std::nullopt; |
1447 | |
1448 | if (!Mul->hasOneUse()) |
1449 | return std::nullopt; |
1450 | |
1451 | Instruction *FMFSource = nullptr; |
1452 | if (II.getType()->isFPOrFPVectorTy()) { |
1453 | llvm::FastMathFlags FAddFlags = II.getFastMathFlags(); |
1454 | // Stop the combine when the flags on the inputs differ, in case dropping |
1455 | // flags would cause us to miss more beneficial optimizations. |
1456 | if (FAddFlags != cast<CallInst>(Val: Mul)->getFastMathFlags()) |
1457 | return std::nullopt; |
1458 | if (!FAddFlags.allowContract()) |
1459 | return std::nullopt; |
1460 | FMFSource = &II; |
1461 | } |
1462 | |
1463 | CallInst *Res; |
1464 | if (MergeIntoAddendOp) |
1465 | Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, Types: {II.getType()}, |
1466 | Args: {P, AddendOp, MulOp0, MulOp1}, FMFSource); |
1467 | else |
1468 | Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, Types: {II.getType()}, |
1469 | Args: {P, MulOp0, MulOp1, AddendOp}, FMFSource); |
1470 | |
1471 | return IC.replaceInstUsesWith(I&: II, V: Res); |
1472 | } |
1473 | |
1474 | static std::optional<Instruction *> |
1475 | instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { |
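// With an all-active predicate an sve.ld1 is equivalent to a plain vector
// load; otherwise it is lowered to a masked.load with a zero passthru (or
// folded to zero outright when no lanes are active).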
1476 | Value *Pred = II.getOperand(i_nocapture: 0); |
1477 | Value *PtrOp = II.getOperand(i_nocapture: 1); |
1478 | Type *VecTy = II.getType(); |
1479 | |
1480 | // Replace with a zero constant when all lanes are inactive. |
1481 | if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II)) |
1482 | return II_NA; |
1483 | |
1484 | if (isAllActivePredicate(Pred)) { |
1485 | LoadInst *Load = IC.Builder.CreateLoad(Ty: VecTy, Ptr: PtrOp); |
1486 | Load->copyMetadata(SrcInst: II); |
1487 | return IC.replaceInstUsesWith(I&: II, V: Load); |
1488 | } |
1489 | |
1490 | CallInst *MaskedLoad = |
1491 | IC.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL), |
1492 | Mask: Pred, PassThru: ConstantAggregateZero::get(Ty: VecTy)); |
1493 | MaskedLoad->copyMetadata(SrcInst: II); |
1494 | return IC.replaceInstUsesWith(I&: II, V: MaskedLoad); |
1495 | } |
1496 | |
1497 | static std::optional<Instruction *> |
1498 | instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { |
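// With an all-active predicate an sve.st1 is equivalent to a plain vector
// store; otherwise it is lowered to a masked.store.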
1499 | Value *VecOp = II.getOperand(i_nocapture: 0); |
1500 | Value *Pred = II.getOperand(i_nocapture: 1); |
1501 | Value *PtrOp = II.getOperand(i_nocapture: 2); |
1502 | |
1503 | if (isAllActivePredicate(Pred)) { |
1504 | StoreInst *Store = IC.Builder.CreateStore(Val: VecOp, Ptr: PtrOp); |
1505 | Store->copyMetadata(SrcInst: II); |
1506 | return IC.eraseInstFromFunction(I&: II); |
1507 | } |
1508 | |
1509 | CallInst *MaskedStore = IC.Builder.CreateMaskedStore( |
1510 | Val: VecOp, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL), Mask: Pred); |
1511 | MaskedStore->copyMetadata(SrcInst: II); |
1512 | return IC.eraseInstFromFunction(I&: II); |
1513 | } |
1514 | |
1515 | static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) { |
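// Map unpredicated (_u) SVE FP arithmetic intrinsics to the equivalent IR
// binary opcode; BinaryOpsEnd signals that no direct mapping exists.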
1516 | switch (Intrinsic) { |
1517 | case Intrinsic::aarch64_sve_fmul_u: |
1518 | return Instruction::BinaryOps::FMul; |
1519 | case Intrinsic::aarch64_sve_fadd_u: |
1520 | return Instruction::BinaryOps::FAdd; |
1521 | case Intrinsic::aarch64_sve_fsub_u: |
1522 | return Instruction::BinaryOps::FSub; |
1523 | default: |
1524 | return Instruction::BinaryOpsEnd; |
1525 | } |
1526 | } |
1527 | |
1528 | static std::optional<Instruction *> |
1529 | instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) { |
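// When the governing predicate is ptrue(all), the unpredicated-form FP
// intrinsics handled by intrinsicIDToBinOpCode can be lowered to plain IR
// binary operators so that generic folds apply.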
1530 | // Bail due to missing support for ISD::STRICT_ scalable vector operations. |
1531 | if (II.isStrictFP()) |
1532 | return std::nullopt; |
1533 | |
1534 | auto *OpPredicate = II.getOperand(i_nocapture: 0); |
1535 | auto BinOpCode = intrinsicIDToBinOpCode(Intrinsic: II.getIntrinsicID()); |
1536 | if (BinOpCode == Instruction::BinaryOpsEnd || |
1537 | !match(V: OpPredicate, P: m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( |
1538 | Op0: m_ConstantInt<AArch64SVEPredPattern::all>()))) |
1539 | return std::nullopt; |
1540 | IRBuilderBase::FastMathFlagGuard FMFGuard(IC.Builder); |
1541 | IC.Builder.setFastMathFlags(II.getFastMathFlags()); |
1542 | auto BinOp = |
1543 | IC.Builder.CreateBinOp(Opc: BinOpCode, LHS: II.getOperand(i_nocapture: 1), RHS: II.getOperand(i_nocapture: 2)); |
1544 | return IC.replaceInstUsesWith(I&: II, V: BinOp); |
1545 | } |
1546 | |
1547 | // Canonicalise operations that take an all active predicate (e.g. sve.add -> |
1548 | // sve.add_u). |
1549 | static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II, |
1550 | Intrinsic::ID IID) { |
1551 | auto *OpPredicate = II.getOperand(i_nocapture: 0); |
1552 | if (!match(V: OpPredicate, P: m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( |
1553 | Op0: m_ConstantInt<AArch64SVEPredPattern::all>()))) |
1554 | return std::nullopt; |
1555 | |
1556 | auto *Mod = II.getModule(); |
1557 | auto *NewDecl = Intrinsic::getDeclaration(M: Mod, id: IID, Tys: {II.getType()}); |
1558 | II.setCalledFunction(NewDecl); |
1559 | |
1560 | return &II; |
1561 | } |
1562 | |
1563 | // Simplify operations when the predicate has all lanes inactive, or try to |
1564 | // replace them with the _u form when all lanes are active. |
1565 | static std::optional<Instruction *> |
1566 | instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II, |
1567 | Intrinsic::ID IID) { |
1568 | if (match(V: II.getOperand(i_nocapture: 0), P: m_ZeroInt())) { |
1569 | // llvm_ir, pred(0), op1, op2 - the spec says to return op1 when all lanes |
1570 | // are inactive for the merging form sv[func]_m. |
1571 | return IC.replaceInstUsesWith(I&: II, V: II.getOperand(i_nocapture: 1)); |
1572 | } |
1573 | return instCombineSVEAllActive(II, IID); |
1574 | } |
1575 | |
1576 | static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC, |
1577 | IntrinsicInst &II) { |
1578 | if (auto II_U = |
1579 | instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_add_u)) |
1580 | return II_U; |
1581 | if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, |
1582 | Intrinsic::aarch64_sve_mla>( |
1583 | IC, II, MergeIntoAddendOp: true)) |
1584 | return MLA; |
1585 | if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, |
1586 | Intrinsic::aarch64_sve_mad>( |
1587 | IC, II, MergeIntoAddendOp: false)) |
1588 | return MAD; |
1589 | return std::nullopt; |
1590 | } |
1591 | |
1592 | static std::optional<Instruction *> |
1593 | instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) { |
1594 | if (auto II_U = |
1595 | instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fadd_u)) |
1596 | return II_U; |
1597 | if (auto FMLA = |
1598 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
1599 | Intrinsic::aarch64_sve_fmla>(IC, II, |
1600 | MergeIntoAddendOp: true)) |
1601 | return FMLA; |
1602 | if (auto FMAD = |
1603 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
1604 | Intrinsic::aarch64_sve_fmad>(IC, II, |
1605 | MergeIntoAddendOp: false)) |
1606 | return FMAD; |
1607 | if (auto FMLA = |
1608 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, |
1609 | Intrinsic::aarch64_sve_fmla>(IC, II, |
1610 | MergeIntoAddendOp: true)) |
1611 | return FMLA; |
1612 | return std::nullopt; |
1613 | } |
1614 | |
1615 | static std::optional<Instruction *> |
1616 | instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) { |
1617 | if (auto FMLA = |
1618 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
1619 | Intrinsic::aarch64_sve_fmla>(IC, II, |
1620 | MergeIntoAddendOp: true)) |
1621 | return FMLA; |
1622 | if (auto FMAD = |
1623 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
1624 | Intrinsic::aarch64_sve_fmad>(IC, II, |
1625 | MergeIntoAddendOp: false)) |
1626 | return FMAD; |
1627 | if (auto FMLA_U = |
1628 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, |
1629 | Intrinsic::aarch64_sve_fmla_u>( |
1630 | IC, II, MergeIntoAddendOp: true)) |
1631 | return FMLA_U; |
1632 | return instCombineSVEVectorBinOp(IC, II); |
1633 | } |
1634 | |
1635 | static std::optional<Instruction *> |
1636 | instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) { |
1637 | if (auto II_U = |
1638 | instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fsub_u)) |
1639 | return II_U; |
1640 | if (auto FMLS = |
1641 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
1642 | Intrinsic::aarch64_sve_fmls>(IC, II, |
1643 | MergeIntoAddendOp: true)) |
1644 | return FMLS; |
1645 | if (auto FMSB = |
1646 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
1647 | Intrinsic::aarch64_sve_fnmsb>( |
1648 | IC, II, MergeIntoAddendOp: false)) |
1649 | return FMSB; |
1650 | if (auto FMLS = |
1651 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, |
1652 | Intrinsic::aarch64_sve_fmls>(IC, II, |
1653 | MergeIntoAddendOp: true)) |
1654 | return FMLS; |
1655 | return std::nullopt; |
1656 | } |
1657 | |
1658 | static std::optional<Instruction *> |
1659 | instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) { |
1660 | if (auto FMLS = |
1661 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
1662 | Intrinsic::aarch64_sve_fmls>(IC, II, |
1663 | MergeIntoAddendOp: true)) |
1664 | return FMLS; |
1665 | if (auto FMSB = |
1666 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
1667 | Intrinsic::aarch64_sve_fnmsb>( |
1668 | IC, II, MergeIntoAddendOp: false)) |
1669 | return FMSB; |
1670 | if (auto FMLS_U = |
1671 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, |
1672 | Intrinsic::aarch64_sve_fmls_u>( |
1673 | IC, II, MergeIntoAddendOp: true)) |
1674 | return FMLS_U; |
1675 | return instCombineSVEVectorBinOp(IC, II); |
1676 | } |
1677 | |
1678 | static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC, |
1679 | IntrinsicInst &II) { |
1680 | if (auto II_U = |
1681 | instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_sub_u)) |
1682 | return II_U; |
1683 | if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, |
1684 | Intrinsic::aarch64_sve_mls>( |
1685 | IC, II, MergeIntoAddendOp: true)) |
1686 | return MLS; |
1687 | return std::nullopt; |
1688 | } |
1689 | |
1690 | static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC, |
1691 | IntrinsicInst &II, |
1692 | Intrinsic::ID IID) { |
1693 | auto *OpPredicate = II.getOperand(i_nocapture: 0); |
1694 | auto *OpMultiplicand = II.getOperand(i_nocapture: 1); |
1695 | auto *OpMultiplier = II.getOperand(i_nocapture: 2); |
1696 | |
1697 | // Return true if a given instruction is a unit splat value, false otherwise. |
1698 | auto IsUnitSplat = [](auto *I) { |
1699 | auto *SplatValue = getSplatValue(I); |
1700 | if (!SplatValue) |
1701 | return false; |
1702 | return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); |
1703 | }; |
1704 | |
1705 | // Return true if a given instruction is an aarch64_sve_dup intrinsic call |
1706 | // with a unit splat value, false otherwise. |
1707 | auto IsUnitDup = [](auto *I) { |
1708 | auto *IntrI = dyn_cast<IntrinsicInst>(I); |
1709 | if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup) |
1710 | return false; |
1711 | |
1712 | auto *SplatValue = IntrI->getOperand(2); |
1713 | return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); |
1714 | }; |
1715 | |
1716 | if (IsUnitSplat(OpMultiplier)) { |
1717 | // [f]mul pg %n, (dupx 1) => %n |
1718 | OpMultiplicand->takeName(V: &II); |
1719 | return IC.replaceInstUsesWith(I&: II, V: OpMultiplicand); |
1720 | } else if (IsUnitDup(OpMultiplier)) { |
1721 | // [f]mul pg %n, (dup pg 1) => %n |
1722 | auto *DupInst = cast<IntrinsicInst>(Val: OpMultiplier); |
1723 | auto *DupPg = DupInst->getOperand(i_nocapture: 1); |
1724 | // TODO: this is naive. The optimization is still valid if DupPg |
1725 | // 'encompasses' OpPredicate, not only if they're the same predicate. |
1726 | if (OpPredicate == DupPg) { |
1727 | OpMultiplicand->takeName(V: &II); |
1728 | return IC.replaceInstUsesWith(I&: II, V: OpMultiplicand); |
1729 | } |
1730 | } |
1731 | |
1732 | return instCombineSVEVectorBinOp(IC, II); |
1733 | } |
1734 | |
1735 | static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC, |
1736 | IntrinsicInst &II) { |
1737 | Value *UnpackArg = II.getArgOperand(i: 0); |
1738 | auto *RetTy = cast<ScalableVectorType>(Val: II.getType()); |
1739 | bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi || |
1740 | II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo; |
1741 | |
1742 | // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X)) |
1743 | // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X)) |
1744 | if (auto *ScalarArg = getSplatValue(V: UnpackArg)) { |
1745 | ScalarArg = |
1746 | IC.Builder.CreateIntCast(V: ScalarArg, DestTy: RetTy->getScalarType(), isSigned: IsSigned); |
1747 | Value *NewVal = |
1748 | IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(), V: ScalarArg); |
1749 | NewVal->takeName(V: &II); |
1750 | return IC.replaceInstUsesWith(I&: II, V: NewVal); |
1751 | } |
1752 | |
1753 | return std::nullopt; |
1754 | } |
1755 | static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC, |
1756 | IntrinsicInst &II) { |
1757 | auto *OpVal = II.getOperand(i_nocapture: 0); |
1758 | auto *OpIndices = II.getOperand(i_nocapture: 1); |
1759 | VectorType *VTy = cast<VectorType>(Val: II.getType()); |
1760 | |
1761 | // Check whether OpIndices is a constant splat value smaller than the minimum |
1762 | // element count of the result. |
1763 | auto *SplatValue = dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: OpIndices)); |
1764 | if (!SplatValue || |
1765 | SplatValue->getValue().uge(RHS: VTy->getElementCount().getKnownMinValue())) |
1766 | return std::nullopt; |
1767 | |
1768 | // Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to |
1769 | // splat_vector(extractelement(OpVal, SplatValue)) for further optimization. |
1770 | auto *Extract = IC.Builder.CreateExtractElement(Vec: OpVal, Idx: SplatValue); |
1771 | auto *VectorSplat = |
1772 | IC.Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Extract); |
1773 | |
1774 | VectorSplat->takeName(V: &II); |
1775 | return IC.replaceInstUsesWith(I&: II, V: VectorSplat); |
1776 | } |
1777 | |
1778 | static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC, |
1779 | IntrinsicInst &II) { |
1780 | Value *A, *B; |
1781 | Type *RetTy = II.getType(); |
1782 | constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool; |
1783 | constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool; |
1784 | |
1785 | // uzp1(to_svbool(A), to_svbool(B)) --> <A, B> |
1786 | // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B> |
1787 | if ((match(V: II.getArgOperand(i: 0), |
1788 | P: m_Intrinsic<FromSVB>(Op0: m_Intrinsic<ToSVB>(Op0: m_Value(V&: A)))) && |
1789 | match(V: II.getArgOperand(i: 1), |
1790 | P: m_Intrinsic<FromSVB>(Op0: m_Intrinsic<ToSVB>(Op0: m_Value(V&: B))))) || |
1791 | (match(V: II.getArgOperand(i: 0), P: m_Intrinsic<ToSVB>(Op0: m_Value(V&: A))) && |
1792 | match(V: II.getArgOperand(i: 1), P: m_Intrinsic<ToSVB>(Op0: m_Value(V&: B))))) { |
1793 | auto *TyA = cast<ScalableVectorType>(Val: A->getType()); |
1794 | if (TyA == B->getType() && |
1795 | RetTy == ScalableVectorType::getDoubleElementsVectorType(VTy: TyA)) { |
1796 | auto *SubVec = IC.Builder.CreateInsertVector( |
1797 | DstType: RetTy, SrcVec: PoisonValue::get(T: RetTy), SubVec: A, Idx: IC.Builder.getInt64(C: 0)); |
1798 | auto *ConcatVec = IC.Builder.CreateInsertVector( |
1799 | DstType: RetTy, SrcVec: SubVec, SubVec: B, Idx: IC.Builder.getInt64(C: TyA->getMinNumElements())); |
1800 | ConcatVec->takeName(V: &II); |
1801 | return IC.replaceInstUsesWith(I&: II, V: ConcatVec); |
1802 | } |
1803 | } |
1804 | |
1805 | return std::nullopt; |
1806 | } |
1807 | |
1808 | static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC, |
1809 | IntrinsicInst &II) { |
1810 | // zip1(uzp1(A, B), uzp2(A, B)) --> A |
1811 | // zip2(uzp1(A, B), uzp2(A, B)) --> B |
1812 | Value *A, *B; |
1813 | if (match(V: II.getArgOperand(i: 0), |
1814 | P: m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(Op0: m_Value(V&: A), Op1: m_Value(V&: B))) && |
1815 | match(V: II.getArgOperand(i: 1), P: m_Intrinsic<Intrinsic::aarch64_sve_uzp2>( |
1816 | Op0: m_Specific(V: A), Op1: m_Specific(V: B)))) |
1817 | return IC.replaceInstUsesWith( |
1818 | I&: II, V: (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B)); |
1819 | |
1820 | return std::nullopt; |
1821 | } |
1822 | |
1823 | static std::optional<Instruction *> |
1824 | instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) { |
1825 | Value *Mask = II.getOperand(i_nocapture: 0); |
1826 | Value *BasePtr = II.getOperand(i_nocapture: 1); |
1827 | Value *Index = II.getOperand(i_nocapture: 2); |
1828 | Type *Ty = II.getType(); |
1829 | Value *PassThru = ConstantAggregateZero::get(Ty); |
1830 | |
1831 | // Replace with a zero constant when all lanes are inactive. |
1832 | if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II)) |
1833 | return II_NA; |
1834 | |
1835 | // Contiguous gather => masked load. |
1836 | // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1)) |
1837 | // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer) |
1838 | Value *IndexBase; |
1839 | if (match(V: Index, P: m_Intrinsic<Intrinsic::aarch64_sve_index>( |
1840 | Op0: m_Value(V&: IndexBase), Op1: m_SpecificInt(V: 1)))) { |
1841 | Align Alignment = |
1842 | BasePtr->getPointerAlignment(DL: II.getDataLayout()); |
1843 | |
1844 | Type *VecPtrTy = PointerType::getUnqual(ElementType: Ty); |
1845 | Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(), |
1846 | Ptr: BasePtr, IdxList: IndexBase); |
1847 | Ptr = IC.Builder.CreateBitCast(V: Ptr, DestTy: VecPtrTy); |
1848 | CallInst *MaskedLoad = |
1849 | IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru); |
1850 | MaskedLoad->takeName(V: &II); |
1851 | return IC.replaceInstUsesWith(I&: II, V: MaskedLoad); |
1852 | } |
1853 | |
1854 | return std::nullopt; |
1855 | } |
1856 | |
1857 | static std::optional<Instruction *> |
1858 | instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) { |
1859 | Value *Val = II.getOperand(i_nocapture: 0); |
1860 | Value *Mask = II.getOperand(i_nocapture: 1); |
1861 | Value *BasePtr = II.getOperand(i_nocapture: 2); |
1862 | Value *Index = II.getOperand(i_nocapture: 3); |
1863 | Type *Ty = Val->getType(); |
1864 | |
1865 | // Contiguous scatter => masked store. |
1866 | // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1)) |
1867 | // => (masked.store Value (gep BasePtr IndexBase) Align Mask) |
1868 | Value *IndexBase; |
1869 | if (match(V: Index, P: m_Intrinsic<Intrinsic::aarch64_sve_index>( |
1870 | Op0: m_Value(V&: IndexBase), Op1: m_SpecificInt(V: 1)))) { |
1871 | Align Alignment = |
1872 | BasePtr->getPointerAlignment(DL: II.getDataLayout()); |
1873 | |
1874 | Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(), |
1875 | Ptr: BasePtr, IdxList: IndexBase); |
1876 | Type *VecPtrTy = PointerType::getUnqual(ElementType: Ty); |
1877 | Ptr = IC.Builder.CreateBitCast(V: Ptr, DestTy: VecPtrTy); |
1878 | |
1879 | (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask); |
1880 | |
1881 | return IC.eraseInstFromFunction(I&: II); |
1882 | } |
1883 | |
1884 | return std::nullopt; |
1885 | } |
1886 | |
1887 | static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC, |
1888 | IntrinsicInst &II) { |
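// Fold a signed divide by a (possibly negated) power-of-two splat into an
// arithmetic-shift-round; illustratively:
//   sdiv(pg, %x, splat(8))  --> asrd(pg, %x, 3)
//   sdiv(pg, %x, splat(-8)) --> neg(pg, asrd(pg, %x, 3))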
1889 | Type *Int32Ty = IC.Builder.getInt32Ty(); |
1890 | Value *Pred = II.getOperand(i_nocapture: 0); |
1891 | Value *Vec = II.getOperand(i_nocapture: 1); |
1892 | Value *DivVec = II.getOperand(i_nocapture: 2); |
1893 | |
1894 | Value *SplatValue = getSplatValue(V: DivVec); |
1895 | ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(Val: SplatValue); |
1896 | if (!SplatConstantInt) |
1897 | return std::nullopt; |
1898 | APInt Divisor = SplatConstantInt->getValue(); |
1899 | |
1900 | if (Divisor.isPowerOf2()) { |
1901 | Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2()); |
1902 | auto ASRD = IC.Builder.CreateIntrinsic( |
1903 | ID: Intrinsic::aarch64_sve_asrd, Types: {II.getType()}, Args: {Pred, Vec, DivisorLog2}); |
1904 | return IC.replaceInstUsesWith(I&: II, V: ASRD); |
1905 | } |
1906 | if (Divisor.isNegatedPowerOf2()) { |
1907 | Divisor.negate(); |
1908 | Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2()); |
1909 | auto ASRD = IC.Builder.CreateIntrinsic( |
1910 | ID: Intrinsic::aarch64_sve_asrd, Types: {II.getType()}, Args: {Pred, Vec, DivisorLog2}); |
1911 | auto NEG = IC.Builder.CreateIntrinsic( |
1912 | ID: Intrinsic::aarch64_sve_neg, Types: {ASRD->getType()}, Args: {ASRD, Pred, ASRD}); |
1913 | return IC.replaceInstUsesWith(I&: II, V: NEG); |
1914 | } |
1915 | |
1916 | return std::nullopt; |
1917 | } |
1918 | |
1919 | bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) { |
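// Check whether Vec is a power-of-two-length repetition of its first half
// (nullptr entries stand for poison and may match anything when AllowPoison
// is set). If so, shrink Vec to its smallest repeating prefix, e.g.
// (a, b, a, b) becomes (a, b).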
1920 | size_t VecSize = Vec.size(); |
1921 | if (VecSize == 1) |
1922 | return true; |
1923 | if (!isPowerOf2_64(Value: VecSize)) |
1924 | return false; |
1925 | size_t HalfVecSize = VecSize / 2; |
1926 | |
1927 | for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize; |
1928 | RHS != Vec.end(); LHS++, RHS++) { |
1929 | if (*LHS != nullptr && *RHS != nullptr) { |
1930 | if (*LHS == *RHS) |
1931 | continue; |
1932 | else |
1933 | return false; |
1934 | } |
1935 | if (!AllowPoison) |
1936 | return false; |
1937 | if (*LHS == nullptr && *RHS != nullptr) |
1938 | *LHS = *RHS; |
1939 | } |
1940 | |
1941 | Vec.resize(N: HalfVecSize); |
1942 | SimplifyValuePattern(Vec, AllowPoison); |
1943 | return true; |
1944 | } |
1945 | |
1946 | // Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B) |
1947 | // to dupqlane(f64(C)) where C is A concatenated with B |
1948 | static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC, |
1949 | IntrinsicInst &II) { |
1950 | Value *CurrentInsertElt = nullptr, *Default = nullptr; |
1951 | if (!match(V: II.getOperand(i_nocapture: 0), |
1952 | P: m_Intrinsic<Intrinsic::vector_insert>( |
1953 | Op0: m_Value(V&: Default), Op1: m_Value(V&: CurrentInsertElt), Op2: m_Value())) || |
1954 | !isa<FixedVectorType>(Val: CurrentInsertElt->getType())) |
1955 | return std::nullopt; |
1956 | auto IIScalableTy = cast<ScalableVectorType>(Val: II.getType()); |
1957 | |
1958 | // Insert the scalars into a container ordered by InsertElement index |
1959 | SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr); |
1960 | while (auto InsertElt = dyn_cast<InsertElementInst>(Val: CurrentInsertElt)) { |
1961 | auto Idx = cast<ConstantInt>(Val: InsertElt->getOperand(i_nocapture: 2)); |
1962 | Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(i_nocapture: 1); |
1963 | CurrentInsertElt = InsertElt->getOperand(i_nocapture: 0); |
1964 | } |
1965 | |
1966 | bool AllowPoison = |
1967 | isa<PoisonValue>(Val: CurrentInsertElt) && isa<PoisonValue>(Val: Default); |
1968 | if (!SimplifyValuePattern(Vec&: Elts, AllowPoison)) |
1969 | return std::nullopt; |
1970 | |
1971 | // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b) |
1972 | Value *InsertEltChain = PoisonValue::get(T: CurrentInsertElt->getType()); |
1973 | for (size_t I = 0; I < Elts.size(); I++) { |
1974 | if (Elts[I] == nullptr) |
1975 | continue; |
1976 | InsertEltChain = IC.Builder.CreateInsertElement(Vec: InsertEltChain, NewElt: Elts[I], |
1977 | Idx: IC.Builder.getInt64(C: I)); |
1978 | } |
1979 | if (InsertEltChain == nullptr) |
1980 | return std::nullopt; |
1981 | |
1982 | // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64 |
1983 | // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector |
1984 | // be bitcast to a type wide enough to fit the sequence, be splatted, and then |
1985 | // be narrowed back to the original type. |
1986 | unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size(); |
1987 | unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() * |
1988 | IIScalableTy->getMinNumElements() / |
1989 | PatternWidth; |
1990 | |
1991 | IntegerType *WideTy = IC.Builder.getIntNTy(N: PatternWidth); |
1992 | auto *WideScalableTy = ScalableVectorType::get(ElementType: WideTy, MinNumElts: PatternElementCount); |
1993 | auto *WideShuffleMaskTy = |
1994 | ScalableVectorType::get(ElementType: IC.Builder.getInt32Ty(), MinNumElts: PatternElementCount); |
1995 | |
1996 | auto ZeroIdx = ConstantInt::get(Ty: IC.Builder.getInt64Ty(), V: APInt(64, 0)); |
1997 | auto InsertSubvector = IC.Builder.CreateInsertVector( |
1998 | DstType: II.getType(), SrcVec: PoisonValue::get(T: II.getType()), SubVec: InsertEltChain, Idx: ZeroIdx); |
1999 | auto WideBitcast = |
2000 | IC.Builder.CreateBitOrPointerCast(V: InsertSubvector, DestTy: WideScalableTy); |
2001 | auto WideShuffleMask = ConstantAggregateZero::get(Ty: WideShuffleMaskTy); |
2002 | auto WideShuffle = IC.Builder.CreateShuffleVector( |
2003 | V1: WideBitcast, V2: PoisonValue::get(T: WideScalableTy), Mask: WideShuffleMask); |
2004 | auto NarrowBitcast = |
2005 | IC.Builder.CreateBitOrPointerCast(V: WideShuffle, DestTy: II.getType()); |
2006 | |
2007 | return IC.replaceInstUsesWith(I&: II, V: NarrowBitcast); |
2008 | } |
2009 | |
2010 | static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC, |
2011 | IntrinsicInst &II) { |
2012 | Value *A = II.getArgOperand(i: 0); |
2013 | Value *B = II.getArgOperand(i: 1); |
2014 | if (A == B) |
2015 | return IC.replaceInstUsesWith(I&: II, V: A); |
2016 | |
2017 | return std::nullopt; |
2018 | } |
2019 | |
2020 | static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC, |
2021 | IntrinsicInst &II) { |
2022 | Value *Pred = II.getOperand(i_nocapture: 0); |
2023 | Value *Vec = II.getOperand(i_nocapture: 1); |
2024 | Value *Shift = II.getOperand(i_nocapture: 2); |
2025 | |
2026 | // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic. |
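// Illustratively: srshl(pg, abs(pg', %x), %amt) --> lsl(pg, abs(pg', %x), %amt),
// subject to the conditions checked below.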
2027 | Value *AbsPred, *MergedValue; |
2028 | if (!match(V: Vec, P: m_Intrinsic<Intrinsic::aarch64_sve_sqabs>( |
2029 | Op0: m_Value(V&: MergedValue), Op1: m_Value(V&: AbsPred), Op2: m_Value())) && |
2030 | !match(V: Vec, P: m_Intrinsic<Intrinsic::aarch64_sve_abs>( |
2031 | Op0: m_Value(V&: MergedValue), Op1: m_Value(V&: AbsPred), Op2: m_Value()))) |
2033 | return std::nullopt; |
2034 | |
2035 | // Transform is valid if any of the following are true: |
2036 | // * The ABS merge value is an undef or non-negative |
2037 | // * The ABS predicate is all active |
2038 | // * The ABS predicate and the SRSHL predicates are the same |
2039 | if (!isa<UndefValue>(Val: MergedValue) && !match(V: MergedValue, P: m_NonNegative()) && |
2040 | AbsPred != Pred && !isAllActivePredicate(Pred: AbsPred)) |
2041 | return std::nullopt; |
2042 | |
2043 | // Only valid when the shift amount is non-negative, otherwise the rounding |
2044 | // behaviour of SRSHL cannot be ignored. |
2045 | if (!match(V: Shift, P: m_NonNegative())) |
2046 | return std::nullopt; |
2047 | |
2048 | auto LSL = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_lsl, |
2049 | Types: {II.getType()}, Args: {Pred, Vec, Shift}); |
2050 | |
2051 | return IC.replaceInstUsesWith(I&: II, V: LSL); |
2052 | } |
2053 | |
2054 | std::optional<Instruction *> |
2055 | AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, |
2056 | IntrinsicInst &II) const { |
2057 | Intrinsic::ID IID = II.getIntrinsicID(); |
2058 | switch (IID) { |
2059 | default: |
2060 | break; |
2061 | |
2062 | case Intrinsic::aarch64_sve_st1_scatter: |
2063 | case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: |
2064 | case Intrinsic::aarch64_sve_st1_scatter_sxtw: |
2065 | case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: |
2066 | case Intrinsic::aarch64_sve_st1_scatter_uxtw: |
2067 | case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: |
2068 | case Intrinsic::aarch64_sve_st1dq: |
2069 | case Intrinsic::aarch64_sve_st1q_scatter_index: |
2070 | case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset: |
2071 | case Intrinsic::aarch64_sve_st1q_scatter_vector_offset: |
2072 | case Intrinsic::aarch64_sve_st1wq: |
2073 | case Intrinsic::aarch64_sve_stnt1: |
2074 | case Intrinsic::aarch64_sve_stnt1_scatter: |
2075 | case Intrinsic::aarch64_sve_stnt1_scatter_index: |
2076 | case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset: |
2077 | case Intrinsic::aarch64_sve_stnt1_scatter_uxtw: |
2078 | return instCombineSVENoActiveUnaryErase(IC, II, PredPos: 1); |
2079 | case Intrinsic::aarch64_sve_st2: |
2080 | case Intrinsic::aarch64_sve_st2q: |
2081 | return instCombineSVENoActiveUnaryErase(IC, II, PredPos: 2); |
2082 | case Intrinsic::aarch64_sve_st3: |
2083 | case Intrinsic::aarch64_sve_st3q: |
2084 | return instCombineSVENoActiveUnaryErase(IC, II, PredPos: 3); |
2085 | case Intrinsic::aarch64_sve_st4: |
2086 | case Intrinsic::aarch64_sve_st4q: |
2087 | return instCombineSVENoActiveUnaryErase(IC, II, PredPos: 4); |
2088 | case Intrinsic::aarch64_sve_ld1_gather: |
2089 | case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: |
2090 | case Intrinsic::aarch64_sve_ld1_gather_sxtw: |
2091 | case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: |
2092 | case Intrinsic::aarch64_sve_ld1_gather_uxtw: |
2093 | case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: |
2094 | case Intrinsic::aarch64_sve_ld1q_gather_index: |
2095 | case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset: |
2096 | case Intrinsic::aarch64_sve_ld1q_gather_vector_offset: |
2097 | case Intrinsic::aarch64_sve_ld1ro: |
2098 | case Intrinsic::aarch64_sve_ld1rq: |
2099 | case Intrinsic::aarch64_sve_ld1udq: |
2100 | case Intrinsic::aarch64_sve_ld1uwq: |
2101 | case Intrinsic::aarch64_sve_ld2_sret: |
2102 | case Intrinsic::aarch64_sve_ld2q_sret: |
2103 | case Intrinsic::aarch64_sve_ld3_sret: |
2104 | case Intrinsic::aarch64_sve_ld3q_sret: |
2105 | case Intrinsic::aarch64_sve_ld4_sret: |
2106 | case Intrinsic::aarch64_sve_ld4q_sret: |
2107 | case Intrinsic::aarch64_sve_ldff1: |
2108 | case Intrinsic::aarch64_sve_ldff1_gather: |
2109 | case Intrinsic::aarch64_sve_ldff1_gather_index: |
2110 | case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset: |
2111 | case Intrinsic::aarch64_sve_ldff1_gather_sxtw: |
2112 | case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: |
2113 | case Intrinsic::aarch64_sve_ldff1_gather_uxtw: |
2114 | case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: |
2115 | case Intrinsic::aarch64_sve_ldnf1: |
2116 | case Intrinsic::aarch64_sve_ldnt1: |
2117 | case Intrinsic::aarch64_sve_ldnt1_gather: |
2118 | case Intrinsic::aarch64_sve_ldnt1_gather_index: |
2119 | case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: |
2120 | case Intrinsic::aarch64_sve_ldnt1_gather_uxtw: |
2121 | return instCombineSVENoActiveUnaryZero(IC, II); |
2122 | case Intrinsic::aarch64_neon_fmaxnm: |
2123 | case Intrinsic::aarch64_neon_fminnm: |
2124 | return instCombineMaxMinNM(IC, II); |
2125 | case Intrinsic::aarch64_sve_convert_from_svbool: |
2126 | return instCombineConvertFromSVBool(IC, II); |
2127 | case Intrinsic::aarch64_sve_dup: |
2128 | return instCombineSVEDup(IC, II); |
2129 | case Intrinsic::aarch64_sve_dup_x: |
2130 | return instCombineSVEDupX(IC, II); |
2131 | case Intrinsic::aarch64_sve_cmpne: |
2132 | case Intrinsic::aarch64_sve_cmpne_wide: |
2133 | return instCombineSVECmpNE(IC, II); |
2134 | case Intrinsic::aarch64_sve_rdffr: |
2135 | return instCombineRDFFR(IC, II); |
2136 | case Intrinsic::aarch64_sve_lasta: |
2137 | case Intrinsic::aarch64_sve_lastb: |
2138 | return instCombineSVELast(IC, II); |
2139 | case Intrinsic::aarch64_sve_clasta_n: |
2140 | case Intrinsic::aarch64_sve_clastb_n: |
2141 | return instCombineSVECondLast(IC, II); |
2142 | case Intrinsic::aarch64_sve_cntd: |
2143 | return instCombineSVECntElts(IC, II, NumElts: 2); |
2144 | case Intrinsic::aarch64_sve_cntw: |
2145 | return instCombineSVECntElts(IC, II, NumElts: 4); |
2146 | case Intrinsic::aarch64_sve_cnth: |
2147 | return instCombineSVECntElts(IC, II, NumElts: 8); |
2148 | case Intrinsic::aarch64_sve_cntb: |
2149 | return instCombineSVECntElts(IC, II, NumElts: 16); |
2150 | case Intrinsic::aarch64_sve_ptest_any: |
2151 | case Intrinsic::aarch64_sve_ptest_first: |
2152 | case Intrinsic::aarch64_sve_ptest_last: |
2153 | return instCombineSVEPTest(IC, II); |
2154 | case Intrinsic::aarch64_sve_fabd: |
2155 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fabd_u); |
2156 | case Intrinsic::aarch64_sve_fadd: |
2157 | return instCombineSVEVectorFAdd(IC, II); |
2158 | case Intrinsic::aarch64_sve_fadd_u: |
2159 | return instCombineSVEVectorFAddU(IC, II); |
2160 | case Intrinsic::aarch64_sve_fdiv: |
2161 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fdiv_u); |
2162 | case Intrinsic::aarch64_sve_fmax: |
2163 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fmax_u); |
2164 | case Intrinsic::aarch64_sve_fmaxnm: |
2165 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fmaxnm_u); |
2166 | case Intrinsic::aarch64_sve_fmin: |
2167 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fmin_u); |
2168 | case Intrinsic::aarch64_sve_fminnm: |
2169 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fminnm_u); |
2170 | case Intrinsic::aarch64_sve_fmla: |
2171 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fmla_u); |
2172 | case Intrinsic::aarch64_sve_fmls: |
2173 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fmls_u); |
2174 | case Intrinsic::aarch64_sve_fmul: |
2175 | if (auto II_U = |
2176 | instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fmul_u)) |
2177 | return II_U; |
2178 | return instCombineSVEVectorMul(IC, II, IID: Intrinsic::aarch64_sve_fmul_u); |
2179 | case Intrinsic::aarch64_sve_fmul_u: |
2180 | return instCombineSVEVectorMul(IC, II, IID: Intrinsic::aarch64_sve_fmul_u); |
2181 | case Intrinsic::aarch64_sve_fmulx: |
2182 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fmulx_u); |
2183 | case Intrinsic::aarch64_sve_fnmla: |
2184 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fnmla_u); |
2185 | case Intrinsic::aarch64_sve_fnmls: |
2186 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fnmls_u); |
2187 | case Intrinsic::aarch64_sve_fsub: |
2188 | return instCombineSVEVectorFSub(IC, II); |
2189 | case Intrinsic::aarch64_sve_fsub_u: |
2190 | return instCombineSVEVectorFSubU(IC, II); |
2191 | case Intrinsic::aarch64_sve_add: |
2192 | return instCombineSVEVectorAdd(IC, II); |
2193 | case Intrinsic::aarch64_sve_add_u: |
2194 | return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u, |
2195 | Intrinsic::aarch64_sve_mla_u>( |
2196 | IC, II, MergeIntoAddendOp: true); |
2197 | case Intrinsic::aarch64_sve_mla: |
2198 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_mla_u); |
2199 | case Intrinsic::aarch64_sve_mls: |
2200 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_mls_u); |
2201 | case Intrinsic::aarch64_sve_mul: |
2202 | if (auto II_U = |
2203 | instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_mul_u)) |
2204 | return II_U; |
2205 | return instCombineSVEVectorMul(IC, II, IID: Intrinsic::aarch64_sve_mul_u); |
2206 | case Intrinsic::aarch64_sve_mul_u: |
2207 | return instCombineSVEVectorMul(IC, II, IID: Intrinsic::aarch64_sve_mul_u); |
2208 | case Intrinsic::aarch64_sve_sabd: |
2209 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_sabd_u); |
2210 | case Intrinsic::aarch64_sve_smax: |
2211 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_smax_u); |
2212 | case Intrinsic::aarch64_sve_smin: |
2213 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_smin_u); |
2214 | case Intrinsic::aarch64_sve_smulh: |
2215 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_smulh_u); |
2216 | case Intrinsic::aarch64_sve_sub: |
2217 | return instCombineSVEVectorSub(IC, II); |
2218 | case Intrinsic::aarch64_sve_sub_u: |
2219 | return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u, |
2220 | Intrinsic::aarch64_sve_mls_u>( |
2221 | IC, II, MergeIntoAddendOp: true); |
2222 | case Intrinsic::aarch64_sve_uabd: |
2223 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_uabd_u); |
2224 | case Intrinsic::aarch64_sve_umax: |
2225 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_umax_u); |
2226 | case Intrinsic::aarch64_sve_umin: |
2227 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_umin_u); |
2228 | case Intrinsic::aarch64_sve_umulh: |
2229 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_umulh_u); |
2230 | case Intrinsic::aarch64_sve_asr: |
2231 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_asr_u); |
2232 | case Intrinsic::aarch64_sve_lsl: |
2233 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_lsl_u); |
2234 | case Intrinsic::aarch64_sve_lsr: |
2235 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_lsr_u); |
2236 | case Intrinsic::aarch64_sve_and: |
2237 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_and_u); |
2238 | case Intrinsic::aarch64_sve_bic: |
2239 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_bic_u); |
2240 | case Intrinsic::aarch64_sve_eor: |
2241 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_eor_u); |
2242 | case Intrinsic::aarch64_sve_orr: |
2243 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_orr_u); |
2244 | case Intrinsic::aarch64_sve_sqsub: |
2245 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_sqsub_u); |
2246 | case Intrinsic::aarch64_sve_uqsub: |
2247 | return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_uqsub_u); |
2248 | case Intrinsic::aarch64_sve_tbl: |
2249 | return instCombineSVETBL(IC, II); |
2250 | case Intrinsic::aarch64_sve_uunpkhi: |
2251 | case Intrinsic::aarch64_sve_uunpklo: |
2252 | case Intrinsic::aarch64_sve_sunpkhi: |
2253 | case Intrinsic::aarch64_sve_sunpklo: |
2254 | return instCombineSVEUnpack(IC, II); |
2255 | case Intrinsic::aarch64_sve_uzp1: |
2256 | return instCombineSVEUzp1(IC, II); |
2257 | case Intrinsic::aarch64_sve_zip1: |
2258 | case Intrinsic::aarch64_sve_zip2: |
2259 | return instCombineSVEZip(IC, II); |
2260 | case Intrinsic::aarch64_sve_ld1_gather_index: |
2261 | return instCombineLD1GatherIndex(IC, II); |
2262 | case Intrinsic::aarch64_sve_st1_scatter_index: |
2263 | return instCombineST1ScatterIndex(IC, II); |
2264 | case Intrinsic::aarch64_sve_ld1: |
2265 | return instCombineSVELD1(IC, II, DL); |
2266 | case Intrinsic::aarch64_sve_st1: |
2267 | return instCombineSVEST1(IC, II, DL); |
2268 | case Intrinsic::aarch64_sve_sdiv: |
2269 | return instCombineSVESDIV(IC, II); |
2270 | case Intrinsic::aarch64_sve_sel: |
2271 | return instCombineSVESel(IC, II); |
2272 | case Intrinsic::aarch64_sve_srshl: |
2273 | return instCombineSVESrshl(IC, II); |
2274 | case Intrinsic::aarch64_sve_dupq_lane: |
2275 | return instCombineSVEDupqLane(IC, II); |
2276 | } |
2277 | |
2278 | return std::nullopt; |
2279 | } |
2280 | |
2281 | std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic( |
2282 | InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts, |
2283 | APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, |
2284 | std::function<void(Instruction *, unsigned, APInt, APInt &)> |
2285 | SimplifyAndSetOp) const { |
2286 | switch (II.getIntrinsicID()) { |
2287 | default: |
2288 | break; |
2289 | case Intrinsic::aarch64_neon_fcvtxn: |
2290 | case Intrinsic::aarch64_neon_rshrn: |
2291 | case Intrinsic::aarch64_neon_sqrshrn: |
2292 | case Intrinsic::aarch64_neon_sqrshrun: |
2293 | case Intrinsic::aarch64_neon_sqshrn: |
2294 | case Intrinsic::aarch64_neon_sqshrun: |
2295 | case Intrinsic::aarch64_neon_sqxtn: |
2296 | case Intrinsic::aarch64_neon_sqxtun: |
2297 | case Intrinsic::aarch64_neon_uqrshrn: |
2298 | case Intrinsic::aarch64_neon_uqshrn: |
2299 | case Intrinsic::aarch64_neon_uqxtn: |
2300 | SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts); |
2301 | break; |
2302 | } |
2303 | |
2304 | return std::nullopt; |
2305 | } |
2306 | |
2307 | bool AArch64TTIImpl::enableScalableVectorization() const { |
2308 | return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() && |
2309 | EnableScalableAutovecInStreamingMode); |
2310 | } |
2311 | |
2312 | TypeSize |
2313 | AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
2314 | switch (K) { |
2315 | case TargetTransformInfo::RGK_Scalar: |
2316 | return TypeSize::getFixed(ExactSize: 64); |
2317 | case TargetTransformInfo::RGK_FixedWidthVector: |
2318 | if (ST->useSVEForFixedLengthVectors() && |
2319 | (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode)) |
2320 | return TypeSize::getFixed( |
2321 | ExactSize: std::max(a: ST->getMinSVEVectorSizeInBits(), b: 128u)); |
2322 | else if (ST->isNeonAvailable()) |
2323 | return TypeSize::getFixed(ExactSize: 128); |
2324 | else |
2325 | return TypeSize::getFixed(ExactSize: 0); |
2326 | case TargetTransformInfo::RGK_ScalableVector: |
2327 | if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() && |
2328 | EnableScalableAutovecInStreamingMode)) |
2329 | return TypeSize::getScalable(MinimumSize: 128); |
2330 | else |
2331 | return TypeSize::getScalable(MinimumSize: 0); |
2332 | } |
2333 | llvm_unreachable("Unsupported register kind" ); |
2334 | } |
2335 | |
2336 | bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, |
2337 | ArrayRef<const Value *> Args, |
2338 | Type *SrcOverrideTy) { |
2339 | // A helper that returns a vector type whose element type is ArgTy's scalar |
2340 | // type and whose element count matches that of DstTy. |
2341 | auto toVectorTy = [&](Type *ArgTy) { |
2342 | return VectorType::get(ElementType: ArgTy->getScalarType(), |
2343 | EC: cast<VectorType>(Val: DstTy)->getElementCount()); |
2344 | }; |
2345 | |
2346 | // Exit early if DstTy is not a vector type whose elements are one of [i16, |
2347 | // i32, i64]. SVE doesn't generally have the same set of instructions to |
2348 | // perform an extend with the add/sub/mul. There are SMULLB style |
2349 | // instructions, but they operate on top/bottom, requiring some sort of lane |
2350 | // interleaving to be used with zext/sext. |
2351 | unsigned DstEltSize = DstTy->getScalarSizeInBits(); |
2352 | if (!useNeonVector(Ty: DstTy) || Args.size() != 2 || |
2353 | (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) |
2354 | return false; |
2355 | |
2356 | // Determine if the operation has a widening variant. We consider both the |
2357 | // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the |
2358 | // instructions. |
2359 | // |
2360 | // TODO: Add additional widening operations (e.g., shl, etc.) once we |
2361 | // verify that their extending operands are eliminated during code |
2362 | // generation. |
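// For example, with NEON the zext in
//   %e = zext <8 x i8> %a to <8 x i16>
//   %r = add <8 x i16> %b, %e
// folds into a single uaddw, so the extend is effectively free.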
2363 | Type *SrcTy = SrcOverrideTy; |
2364 | switch (Opcode) { |
2365 | case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). |
2366 | case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). |
2367 | // The second operand needs to be an extend |
2368 | if (isa<SExtInst>(Val: Args[1]) || isa<ZExtInst>(Val: Args[1])) { |
2369 | if (!SrcTy) |
2370 | SrcTy = |
2371 | toVectorTy(cast<Instruction>(Val: Args[1])->getOperand(i: 0)->getType()); |
2372 | } else |
2373 | return false; |
2374 | break; |
2375 | case Instruction::Mul: { // SMULL(2), UMULL(2) |
2376 | // Both operands need to be extends of the same type. |
2377 | if ((isa<SExtInst>(Val: Args[0]) && isa<SExtInst>(Val: Args[1])) || |
2378 | (isa<ZExtInst>(Val: Args[0]) && isa<ZExtInst>(Val: Args[1]))) { |
2379 | if (!SrcTy) |
2380 | SrcTy = |
2381 | toVectorTy(cast<Instruction>(Val: Args[0])->getOperand(i: 0)->getType()); |
2382 | } else if (isa<ZExtInst>(Val: Args[0]) || isa<ZExtInst>(Val: Args[1])) { |
2383 | // If one of the operands is a Zext and the other has enough zero bits to |
2384 | // be treated as unsigned, we can still generate a umull, meaning the zext |
2385 | // is free. |
2386 | KnownBits Known = |
2387 | computeKnownBits(V: isa<ZExtInst>(Val: Args[0]) ? Args[1] : Args[0], DL); |
2388 | if (Args[0]->getType()->getScalarSizeInBits() - |
2389 | Known.Zero.countLeadingOnes() > |
2390 | DstTy->getScalarSizeInBits() / 2) |
2391 | return false; |
2392 | if (!SrcTy) |
2393 | SrcTy = toVectorTy(Type::getIntNTy(C&: DstTy->getContext(), |
2394 | N: DstTy->getScalarSizeInBits() / 2)); |
2395 | } else |
2396 | return false; |
2397 | break; |
2398 | } |
2399 | default: |
2400 | return false; |
2401 | } |
2402 | |
2403 | // Legalize the destination type and ensure it can be used in a widening |
2404 | // operation. |
2405 | auto DstTyL = getTypeLegalizationCost(Ty: DstTy); |
2406 | if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits()) |
2407 | return false; |
2408 | |
2409 | // Legalize the source type and ensure it can be used in a widening |
2410 | // operation. |
2411 | assert(SrcTy && "Expected some SrcTy" ); |
2412 | auto SrcTyL = getTypeLegalizationCost(Ty: SrcTy); |
2413 | unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); |
2414 | if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) |
2415 | return false; |
2416 | |
2417 | // Get the total number of vector elements in the legalized types. |
2418 | InstructionCost NumDstEls = |
2419 | DstTyL.first * DstTyL.second.getVectorMinNumElements(); |
2420 | InstructionCost NumSrcEls = |
2421 | SrcTyL.first * SrcTyL.second.getVectorMinNumElements(); |
2422 | |
2423 | // Return true if the legalized types have the same number of vector elements |
2424 | // and the destination element type size is twice that of the source type. |
2425 | return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize; |
2426 | } |
2427 | |
2428 | // s/urhadd instructions implement the following pattern, making the |
2429 | // extends free: |
2430 | // %x = add ((zext i8 -> i16), 1) |
2431 | // %y = (zext i8 -> i16) |
2432 | // trunc i16 (lshr (add %x, %y), 1) -> i8 |
2433 | // |
2434 | bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, |
2435 | Type *Src) { |
2436 | // The source should be a legal vector type. |
2437 | if (!Src->isVectorTy() || !TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: Src)) || |
2438 | (Src->isScalableTy() && !ST->hasSVE2())) |
2439 | return false; |
2440 | |
2441 | if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse()) |
2442 | return false; |
2443 | |
2444 | // Look for trunc/lshr/add before trying to match the pattern. |
2445 | const Instruction *Add = ExtUser; |
2446 | auto *AddUser = |
2447 | dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser()); |
2448 | if (AddUser && AddUser->getOpcode() == Instruction::Add) |
2449 | Add = AddUser; |
2450 | |
2451 | auto *Shr = dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser()); |
2452 | if (!Shr || Shr->getOpcode() != Instruction::LShr) |
2453 | return false; |
2454 | |
2455 | auto *Trunc = dyn_cast_or_null<Instruction>(Val: Shr->getUniqueUndroppableUser()); |
2456 | if (!Trunc || Trunc->getOpcode() != Instruction::Trunc || |
2457 | Src->getScalarSizeInBits() != |
2458 | cast<CastInst>(Val: Trunc)->getDestTy()->getScalarSizeInBits()) |
2459 | return false; |
2460 | |
2461 | // Try to match the whole pattern. Ext could be either the first or second |
2462 | // m_ZExtOrSExt matched. |
2463 | Instruction *Ex1, *Ex2; |
2464 | if (!(match(V: Add, P: m_c_Add(L: m_Instruction(I&: Ex1), |
2465 | R: m_c_Add(L: m_Instruction(I&: Ex2), R: m_SpecificInt(V: 1)))))) |
2466 | return false; |
2467 | |
2468 | // Ensure both extends are of the same type |
2469 | if (match(V: Ex1, P: m_ZExtOrSExt(Op: m_Value())) && |
2470 | Ex1->getOpcode() == Ex2->getOpcode()) |
2471 | return true; |
2472 | |
2473 | return false; |
2474 | } |
2475 | |
2476 | InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, |
2477 | Type *Src, |
2478 | TTI::CastContextHint CCH, |
2479 | TTI::TargetCostKind CostKind, |
2480 | const Instruction *I) { |
2481 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
2482 | assert(ISD && "Invalid opcode" ); |
2483 | // If the cast is observable, and it is used by a widening instruction (e.g., |
2484 | // uaddl, saddw, etc.), it may be free. |
2485 | if (I && I->hasOneUser()) { |
2486 | auto *SingleUser = cast<Instruction>(Val: *I->user_begin()); |
2487 | SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); |
2488 | if (isWideningInstruction(DstTy: Dst, Opcode: SingleUser->getOpcode(), Args: Operands, SrcOverrideTy: Src)) { |
// For adds, only count the extend as free if it is the second operand, or
// if both operands are extends of the same kind (i.e., in add(sext, zext)
// only one of the extends can be free).
2492 | if (SingleUser->getOpcode() == Instruction::Add) { |
2493 | if (I == SingleUser->getOperand(i: 1) || |
2494 | (isa<CastInst>(Val: SingleUser->getOperand(i: 1)) && |
2495 | cast<CastInst>(Val: SingleUser->getOperand(i: 1))->getOpcode() == Opcode)) |
2496 | return 0; |
2497 | } else // Others are free so long as isWideningInstruction returned true. |
2498 | return 0; |
2499 | } |
2500 | |
2501 | // The cast will be free for the s/urhadd instructions |
2502 | if ((isa<ZExtInst>(Val: I) || isa<SExtInst>(Val: I)) && |
2503 | isExtPartOfAvgExpr(ExtUser: SingleUser, Dst, Src)) |
2504 | return 0; |
2505 | } |
2506 | |
2507 | // TODO: Allow non-throughput costs that aren't binary. |
2508 | auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { |
2509 | if (CostKind != TTI::TCK_RecipThroughput) |
2510 | return Cost == 0 ? 0 : 1; |
2511 | return Cost; |
2512 | }; |
2513 | |
2514 | EVT SrcTy = TLI->getValueType(DL, Ty: Src); |
2515 | EVT DstTy = TLI->getValueType(DL, Ty: Dst); |
2516 | |
2517 | if (!SrcTy.isSimple() || !DstTy.isSimple()) |
2518 | return AdjustCost( |
2519 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); |
2520 | |
2521 | static const TypeConversionCostTblEntry |
2522 | ConversionTbl[] = { |
2523 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i64, .Cost: 1}, // xtn |
2524 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i16, .Src: MVT::v2i64, .Cost: 1}, // xtn |
2525 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i32, .Src: MVT::v2i64, .Cost: 1}, // xtn |
2526 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i32, .Cost: 1}, // xtn |
2527 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i64, .Cost: 3}, // 2 xtn + 1 uzp1 |
2528 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i32, .Cost: 1}, // xtn |
2529 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i64, .Cost: 2}, // 1 uzp1 + 1 xtn |
2530 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: 1}, // 1 uzp1 |
2531 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i16, .Cost: 1}, // 1 xtn |
2532 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i32, .Cost: 2}, // 1 uzp1 + 1 xtn |
2533 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i64, .Cost: 4}, // 3 x uzp1 + xtn |
2534 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: 1}, // 1 uzp1 |
2535 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i64, .Cost: 3}, // 3 x uzp1 |
2536 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i32, .Src: MVT::v8i64, .Cost: 2}, // 2 x uzp1 |
2537 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: 1}, // uzp1 |
2538 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: 3}, // (2 + 1) x uzp1 |
2539 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i64, .Cost: 7}, // (4 + 2 + 1) x uzp1 |
2540 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: 2}, // 2 x uzp1 |
2541 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i64, .Cost: 6}, // (4 + 2) x uzp1 |
2542 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i32, .Src: MVT::v16i64, .Cost: 4}, // 4 x uzp1 |
2543 | |
2544 | // Truncations on nxvmiN |
2545 | { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i16, .Cost: 1 }, |
2546 | { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i32, .Cost: 1 }, |
2547 | { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i64, .Cost: 1 }, |
2548 | { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i16, .Cost: 1 }, |
2549 | { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i32, .Cost: 1 }, |
2550 | { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i64, .Cost: 2 }, |
2551 | { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i16, .Cost: 1 }, |
2552 | { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i32, .Cost: 3 }, |
2553 | { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i64, .Cost: 5 }, |
2554 | { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i1, .Src: MVT::nxv16i8, .Cost: 1 }, |
2555 | { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i16, .Src: MVT::nxv2i32, .Cost: 1 }, |
2556 | { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i32, .Src: MVT::nxv2i64, .Cost: 1 }, |
2557 | { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i16, .Src: MVT::nxv4i32, .Cost: 1 }, |
2558 | { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i32, .Src: MVT::nxv4i64, .Cost: 2 }, |
2559 | { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i16, .Src: MVT::nxv8i32, .Cost: 3 }, |
2560 | { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i64, .Cost: 6 }, |
2561 | |
2562 | // The number of shll instructions for the extension. |
2563 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3 }, |
2564 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3 }, |
2565 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2 }, |
2566 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2 }, |
2567 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3 }, |
2568 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3 }, |
2569 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2 }, |
2570 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2 }, |
2571 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7 }, |
2572 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7 }, |
2573 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6 }, |
2574 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6 }, |
2575 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2 }, |
2576 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2 }, |
2577 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6 }, |
2578 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6 }, |
2579 | |
2580 | // LowerVectorINT_TO_FP: |
2581 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1 }, |
2582 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1 }, |
2583 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: 1 }, |
2584 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1 }, |
2585 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1 }, |
2586 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: 1 }, |
2587 | |
2588 | // Complex: to v2f32 |
2589 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3 }, |
2590 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 3 }, |
2591 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i64, .Cost: 2 }, |
2592 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3 }, |
2593 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 3 }, |
2594 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i64, .Cost: 2 }, |
2595 | |
2596 | // Complex: to v4f32 |
2597 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 4 }, |
2598 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2 }, |
2599 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 3 }, |
2600 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2 }, |
2601 | |
2602 | // Complex: to v8f32 |
2603 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10 }, |
2604 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4 }, |
2605 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10 }, |
2606 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4 }, |
2607 | |
2608 | // Complex: to v16f32 |
2609 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: 21 }, |
2610 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: 21 }, |
2611 | |
2612 | // Complex: to v2f64 |
2613 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4 }, |
2614 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 4 }, |
2615 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2 }, |
2616 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4 }, |
2617 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 4 }, |
2618 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2 }, |
2619 | |
2620 | // Complex: to v4f64 |
2621 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: 4 }, |
2622 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: 4 }, |
2623 | |
2624 | // LowerVectorFP_TO_INT |
2625 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f32, .Cost: 1 }, |
2626 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1 }, |
2627 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: 1 }, |
2628 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f32, .Cost: 1 }, |
2629 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1 }, |
2630 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: 1 }, |
2631 | |
2632 | // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). |
2633 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f32, .Cost: 2 }, |
2634 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 1 }, |
2635 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 1 }, |
2636 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f32, .Cost: 2 }, |
2637 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 1 }, |
2638 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 1 }, |
2639 | |
2640 | // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 |
2641 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2 }, |
2642 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 2 }, |
2643 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2 }, |
2644 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 2 }, |
2645 | |
2646 | // Complex, from nxv2f32. |
2647 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f32, .Cost: 1 }, |
2648 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 1 }, |
2649 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f32, .Cost: 1 }, |
2650 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f32, .Cost: 1 }, |
2651 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f32, .Cost: 1 }, |
2652 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 1 }, |
2653 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f32, .Cost: 1 }, |
2654 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f32, .Cost: 1 }, |
2655 | |
2656 | // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. |
2657 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2 }, |
2658 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f64, .Cost: 2 }, |
2659 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f64, .Cost: 2 }, |
2660 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2 }, |
2661 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f64, .Cost: 2 }, |
2662 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f64, .Cost: 2 }, |
2663 | |
2664 | // Complex, from nxv2f64. |
2665 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f64, .Cost: 1 }, |
2666 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f64, .Cost: 1 }, |
2667 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f64, .Cost: 1 }, |
2668 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f64, .Cost: 1 }, |
2669 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f64, .Cost: 1 }, |
2670 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f64, .Cost: 1 }, |
2671 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f64, .Cost: 1 }, |
2672 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f64, .Cost: 1 }, |
2673 | |
2674 | // Complex, from nxv4f32. |
2675 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f32, .Cost: 4 }, |
2676 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f32, .Cost: 1 }, |
2677 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f32, .Cost: 1 }, |
2678 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f32, .Cost: 1 }, |
2679 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f32, .Cost: 4 }, |
2680 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f32, .Cost: 1 }, |
2681 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f32, .Cost: 1 }, |
2682 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f32, .Cost: 1 }, |
2683 | |
2684 | // Complex, from nxv8f64. Illegal -> illegal conversions not required. |
2685 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f64, .Cost: 7 }, |
2686 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f64, .Cost: 7 }, |
2687 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f64, .Cost: 7 }, |
2688 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f64, .Cost: 7 }, |
2689 | |
2690 | // Complex, from nxv4f64. Illegal -> illegal conversions not required. |
2691 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f64, .Cost: 3 }, |
2692 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f64, .Cost: 3 }, |
2693 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f64, .Cost: 3 }, |
2694 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f64, .Cost: 3 }, |
2695 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f64, .Cost: 3 }, |
2696 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f64, .Cost: 3 }, |
2697 | |
2698 | // Complex, from nxv8f32. Illegal -> illegal conversions not required. |
2699 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f32, .Cost: 3 }, |
2700 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f32, .Cost: 3 }, |
2701 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f32, .Cost: 3 }, |
2702 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f32, .Cost: 3 }, |
2703 | |
2704 | // Complex, from nxv8f16. |
2705 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i64, .Src: MVT::nxv8f16, .Cost: 10 }, |
2706 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i32, .Src: MVT::nxv8f16, .Cost: 4 }, |
2707 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f16, .Cost: 1 }, |
2708 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f16, .Cost: 1 }, |
2709 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i64, .Src: MVT::nxv8f16, .Cost: 10 }, |
2710 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i32, .Src: MVT::nxv8f16, .Cost: 4 }, |
2711 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f16, .Cost: 1 }, |
2712 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f16, .Cost: 1 }, |
2713 | |
2714 | // Complex, from nxv4f16. |
2715 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f16, .Cost: 4 }, |
2716 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f16, .Cost: 1 }, |
2717 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 1 }, |
2718 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f16, .Cost: 1 }, |
2719 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f16, .Cost: 4 }, |
2720 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f16, .Cost: 1 }, |
2721 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 1 }, |
2722 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f16, .Cost: 1 }, |
2723 | |
2724 | // Complex, from nxv2f16. |
2725 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f16, .Cost: 1 }, |
2726 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f16, .Cost: 1 }, |
2727 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 1 }, |
2728 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f16, .Cost: 1 }, |
2729 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f16, .Cost: 1 }, |
2730 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f16, .Cost: 1 }, |
2731 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 1 }, |
2732 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f16, .Cost: 1 }, |
2733 | |
2734 | // Truncate from nxvmf32 to nxvmf16. |
2735 | { .ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f16, .Src: MVT::nxv2f32, .Cost: 1 }, |
2736 | { .ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f16, .Src: MVT::nxv4f32, .Cost: 1 }, |
2737 | { .ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f16, .Src: MVT::nxv8f32, .Cost: 3 }, |
2738 | |
2739 | // Truncate from nxvmf64 to nxvmf16. |
2740 | { .ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f16, .Src: MVT::nxv2f64, .Cost: 1 }, |
2741 | { .ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f16, .Src: MVT::nxv4f64, .Cost: 3 }, |
2742 | { .ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f16, .Src: MVT::nxv8f64, .Cost: 7 }, |
2743 | |
2744 | // Truncate from nxvmf64 to nxvmf32. |
2745 | { .ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2f64, .Cost: 1 }, |
2746 | { .ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4f64, .Cost: 3 }, |
2747 | { .ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8f64, .Cost: 6 }, |
2748 | |
2749 | // Extend from nxvmf16 to nxvmf32. |
2750 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2f16, .Cost: 1}, |
2751 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4f16, .Cost: 1}, |
2752 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8f16, .Cost: 2}, |
2753 | |
2754 | // Extend from nxvmf16 to nxvmf64. |
2755 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2f16, .Cost: 1}, |
2756 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4f16, .Cost: 2}, |
2757 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8f16, .Cost: 4}, |
2758 | |
2759 | // Extend from nxvmf32 to nxvmf64. |
2760 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2f32, .Cost: 1}, |
2761 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4f32, .Cost: 2}, |
2762 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8f32, .Cost: 6}, |
2763 | |
// Bitcasts from integer to float
2765 | { .ISD: ISD::BITCAST, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: 0 }, |
2766 | { .ISD: ISD::BITCAST, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: 0 }, |
2767 | { .ISD: ISD::BITCAST, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: 0 }, |
2768 | |
// Bitcasts from float to integer
2770 | { .ISD: ISD::BITCAST, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 0 }, |
2771 | { .ISD: ISD::BITCAST, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 0 }, |
2772 | { .ISD: ISD::BITCAST, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 0 }, |
2773 | |
2774 | // Add cost for extending to illegal -too wide- scalable vectors. |
2775 | // zero/sign extend are implemented by multiple unpack operations, |
2776 | // where each operation has a cost of 1. |
2777 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i16, .Src: MVT::nxv16i8, .Cost: 2}, |
2778 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i32, .Src: MVT::nxv16i8, .Cost: 6}, |
2779 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i64, .Src: MVT::nxv16i8, .Cost: 14}, |
2780 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i16, .Cost: 2}, |
2781 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv8i64, .Src: MVT::nxv8i16, .Cost: 6}, |
2782 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv4i64, .Src: MVT::nxv4i32, .Cost: 2}, |
2783 | |
2784 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i16, .Src: MVT::nxv16i8, .Cost: 2}, |
2785 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i32, .Src: MVT::nxv16i8, .Cost: 6}, |
2786 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i64, .Src: MVT::nxv16i8, .Cost: 14}, |
2787 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i16, .Cost: 2}, |
2788 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv8i64, .Src: MVT::nxv8i16, .Cost: 6}, |
2789 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv4i64, .Src: MVT::nxv4i32, .Cost: 2}, |
2790 | }; |
2791 | |
// We have to estimate the cost of a fixed-length operation that is carried
// out on SVE registers, scaling by the number of SVE registers required to
// represent the fixed-length type.
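// For example, with 128-bit SVE vectors a fixed v8i64 result would need
// roughly four SVE registers, so the cost is modelled as LT.first times the
// cost of the equivalent single-register scalable conversion below.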
2795 | EVT WiderTy = SrcTy.bitsGT(VT: DstTy) ? SrcTy : DstTy; |
2796 | if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() && |
2797 | SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() && |
2798 | ST->useSVEForFixedLengthVectors(VT: WiderTy)) { |
2799 | std::pair<InstructionCost, MVT> LT = |
2800 | getTypeLegalizationCost(Ty: WiderTy.getTypeForEVT(Context&: Dst->getContext())); |
2801 | unsigned NumElements = AArch64::SVEBitsPerBlock / |
2802 | LT.second.getScalarSizeInBits(); |
2803 | return AdjustCost( |
2804 | LT.first * |
2805 | getCastInstrCost( |
2806 | Opcode, Dst: ScalableVectorType::get(ElementType: Dst->getScalarType(), MinNumElts: NumElements), |
2807 | Src: ScalableVectorType::get(ElementType: Src->getScalarType(), MinNumElts: NumElements), CCH, |
2808 | CostKind, I)); |
2809 | } |
2810 | |
2811 | if (const auto *Entry = ConvertCostTableLookup(Table: ConversionTbl, ISD, |
2812 | Dst: DstTy.getSimpleVT(), |
2813 | Src: SrcTy.getSimpleVT())) |
2814 | return AdjustCost(Entry->Cost); |
2815 | |
2816 | static const TypeConversionCostTblEntry FP16Tbl[] = { |
2817 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f16, .Cost: 1}, // fcvtzs |
2818 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f16, .Cost: 1}, |
2819 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f16, .Cost: 1}, // fcvtzs |
2820 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f16, .Cost: 1}, |
2821 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f16, .Cost: 2}, // fcvtl+fcvtzs |
2822 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f16, .Cost: 2}, |
2823 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i8, .Src: MVT::v8f16, .Cost: 2}, // fcvtzs+xtn |
2824 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i8, .Src: MVT::v8f16, .Cost: 2}, |
2825 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f16, .Cost: 1}, // fcvtzs |
2826 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f16, .Cost: 1}, |
2827 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f16, .Cost: 4}, // 2*fcvtl+2*fcvtzs |
2828 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f16, .Cost: 4}, |
2829 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v16f16, .Cost: 3}, // 2*fcvtzs+xtn |
2830 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v16f16, .Cost: 3}, |
2831 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v16f16, .Cost: 2}, // 2*fcvtzs |
2832 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v16f16, .Cost: 2}, |
2833 | {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i32, .Src: MVT::v16f16, .Cost: 8}, // 4*fcvtl+4*fcvtzs |
2834 | {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i32, .Src: MVT::v16f16, .Cost: 8}, |
2835 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f16, .Src: MVT::v8i8, .Cost: 2}, // ushll + ucvtf |
2836 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f16, .Src: MVT::v8i8, .Cost: 2}, // sshll + scvtf |
2837 | {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f16, .Src: MVT::v16i8, .Cost: 4}, // 2 * ushl(2) + 2 * ucvtf |
2838 | {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f16, .Src: MVT::v16i8, .Cost: 4}, // 2 * sshl(2) + 2 * scvtf |
2839 | }; |
2840 | |
2841 | if (ST->hasFullFP16()) |
2842 | if (const auto *Entry = ConvertCostTableLookup( |
2843 | Table: FP16Tbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT())) |
2844 | return AdjustCost(Entry->Cost); |
2845 | |
2846 | if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) && |
2847 | CCH == TTI::CastContextHint::Masked && |
2848 | ST->isSVEorStreamingSVEAvailable() && |
2849 | TLI->getTypeAction(Context&: Src->getContext(), VT: SrcTy) == |
2850 | TargetLowering::TypePromoteInteger && |
2851 | TLI->getTypeAction(Context&: Dst->getContext(), VT: DstTy) == |
2852 | TargetLowering::TypeSplitVector) { |
2853 | // The standard behaviour in the backend for these cases is to split the |
2854 | // extend up into two parts: |
2855 | // 1. Perform an extending load or masked load up to the legal type. |
2856 | // 2. Extend the loaded data to the final type. |
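// For example, a masked zero-extend from nxv8i8 to nxv8i64 would be costed
// as an extending masked load up to the promoted legal type (e.g. nxv8i16)
// plus a separate widening of that intermediate type to nxv8i64.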
2857 | std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: Src); |
2858 | Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Context&: Src->getContext()); |
2859 | InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost( |
2860 | Opcode, Dst: LegalTy, Src, CCH, CostKind, I); |
2861 | InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost( |
2862 | Opcode, Dst, Src: LegalTy, CCH: TTI::CastContextHint::None, CostKind, I); |
2863 | return Part1 + Part2; |
2864 | } |
2865 | |
2866 | // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal, |
2867 | // but we also want to include the TTI::CastContextHint::Masked case too. |
2868 | if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) && |
2869 | CCH == TTI::CastContextHint::Masked && |
2870 | ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(VT: DstTy)) |
2871 | CCH = TTI::CastContextHint::Normal; |
2872 | |
2873 | return AdjustCost( |
2874 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); |
2875 | } |
2876 | |
InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
2878 | Type *Dst, |
2879 | VectorType *VecTy, |
2880 | unsigned Index) { |
2881 | |
2882 | // Make sure we were given a valid extend opcode. |
2883 | assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && |
2884 | "Invalid opcode" ); |
2885 | |
2886 | // We are extending an element we extract from a vector, so the source type |
2887 | // of the extend is the element type of the vector. |
2888 | auto *Src = VecTy->getElementType(); |
2889 | |
2890 | // Sign- and zero-extends are for integer types only. |
2891 | assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type" ); |
2892 | |
2893 | // Get the cost for the extract. We compute the cost (if any) for the extend |
2894 | // below. |
2895 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
2896 | InstructionCost Cost = getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, |
2897 | CostKind, Index, Op0: nullptr, Op1: nullptr); |
2898 | |
2899 | // Legalize the types. |
2900 | auto VecLT = getTypeLegalizationCost(Ty: VecTy); |
2901 | auto DstVT = TLI->getValueType(DL, Ty: Dst); |
2902 | auto SrcVT = TLI->getValueType(DL, Ty: Src); |
2903 | |
2904 | // If the resulting type is still a vector and the destination type is legal, |
2905 | // we may get the extension for free. If not, get the default cost for the |
2906 | // extend. |
2907 | if (!VecLT.second.isVector() || !TLI->isTypeLegal(VT: DstVT)) |
2908 | return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None, |
2909 | CostKind); |
2910 | |
2911 | // The destination type should be larger than the element type. If not, get |
2912 | // the default cost for the extend. |
2913 | if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) |
2914 | return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None, |
2915 | CostKind); |
2916 | |
2917 | switch (Opcode) { |
2918 | default: |
2919 | llvm_unreachable("Opcode should be either SExt or ZExt" ); |
2920 | |
2921 | // For sign-extends, we only need a smov, which performs the extension |
2922 | // automatically. |
2923 | case Instruction::SExt: |
2924 | return Cost; |
2925 | |
2926 | // For zero-extends, the extend is performed automatically by a umov unless |
2927 | // the destination type is i64 and the element type is i8 or i16. |
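// For example, extracting an i32 lane into an i64 is covered by the 32-bit
// umov (writing a W register implicitly zeroes the upper bits), whereas an
// i8/i16 lane extended to i64 typically needs an extra instruction.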
2928 | case Instruction::ZExt: |
2929 | if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) |
2930 | return Cost; |
2931 | } |
2932 | |
2933 | // If we are unable to perform the extend for free, get the default cost. |
2934 | return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None, |
2935 | CostKind); |
2936 | } |
2937 | |
2938 | InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, |
2939 | TTI::TargetCostKind CostKind, |
2940 | const Instruction *I) { |
2941 | if (CostKind != TTI::TCK_RecipThroughput) |
2942 | return Opcode == Instruction::PHI ? 0 : 1; |
2943 | assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind" ); |
2944 | // Branches are assumed to be predicted. |
2945 | return 0; |
2946 | } |
2947 | |
2948 | InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I, |
2949 | Type *Val, |
2950 | unsigned Index, |
2951 | bool HasRealUse) { |
2952 | assert(Val->isVectorTy() && "This must be a vector type" ); |
2953 | |
2954 | if (Index != -1U) { |
2955 | // Legalize the type. |
2956 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Val); |
2957 | |
2958 | // This type is legalized to a scalar type. |
2959 | if (!LT.second.isVector()) |
2960 | return 0; |
2961 | |
2962 | // The type may be split. For fixed-width vectors we can normalize the |
2963 | // index to the new type. |
2964 | if (LT.second.isFixedLengthVector()) { |
2965 | unsigned Width = LT.second.getVectorNumElements(); |
2966 | Index = Index % Width; |
2967 | } |
2968 | |
2969 | // The element at index zero is already inside the vector. |
2970 | // - For a physical (HasRealUse==true) insert-element or extract-element |
2971 | // instruction that extracts integers, an explicit FPR -> GPR move is |
2972 | // needed. So it has non-zero cost. |
2973 | // - For the rest of cases (virtual instruction or element type is float), |
2974 | // consider the instruction free. |
2975 | if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy())) |
2976 | return 0; |
2977 | |
// This is recognising a LD1 single-element structure to one lane of one
// register instruction. I.e., if this is an `insertelement` instruction,
// and its second operand is a load, then we will generate a LD1, which is
// an expensive instruction.
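// For example, insertelement <4 x i16> %v, i16 %x, i64 1 where %x comes
// directly from a load would typically be selected as ld1 { v0.h }[1], [xN].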
2982 | if (I && dyn_cast<LoadInst>(Val: I->getOperand(i: 1))) |
2983 | return ST->getVectorInsertExtractBaseCost() + 1; |
2984 | |
2985 | // i1 inserts and extract will include an extra cset or cmp of the vector |
2986 | // value. Increase the cost by 1 to account. |
2987 | if (Val->getScalarSizeInBits() == 1) |
2988 | return ST->getVectorInsertExtractBaseCost() + 1; |
2989 | |
2990 | // FIXME: |
2991 | // If the extract-element and insert-element instructions could be |
2992 | // simplified away (e.g., could be combined into users by looking at use-def |
2993 | // context), they have no cost. This is not done in the first place for |
2994 | // compile-time considerations. |
2995 | } |
2996 | |
2997 | // All other insert/extracts cost this much. |
2998 | return ST->getVectorInsertExtractBaseCost(); |
2999 | } |
3000 | |
3001 | InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, |
3002 | TTI::TargetCostKind CostKind, |
3003 | unsigned Index, Value *Op0, |
3004 | Value *Op1) { |
3005 | bool HasRealUse = |
3006 | Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Val: Op0); |
3007 | return getVectorInstrCostHelper(I: nullptr, Val, Index, HasRealUse); |
3008 | } |
3009 | |
3010 | InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I, |
3011 | Type *Val, |
3012 | TTI::TargetCostKind CostKind, |
3013 | unsigned Index) { |
3014 | return getVectorInstrCostHelper(I: &I, Val, Index, HasRealUse: true /* HasRealUse */); |
3015 | } |
3016 | |
3017 | InstructionCost AArch64TTIImpl::getScalarizationOverhead( |
VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
3019 | TTI::TargetCostKind CostKind) { |
3020 | if (isa<ScalableVectorType>(Val: Ty)) |
3021 | return InstructionCost::getInvalid(); |
3022 | if (Ty->getElementType()->isFloatingPointTy()) |
3023 | return BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, Insert, Extract, |
3024 | CostKind); |
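// For integer elements the overhead is modelled directly; for example, a
// v4i32 vector with all lanes demanded and both Insert and Extract set would
// cost 4 * 2 * getVectorInsertExtractBaseCost().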
3025 | return DemandedElts.popcount() * (Insert + Extract) * |
3026 | ST->getVectorInsertExtractBaseCost(); |
3027 | } |
3028 | |
3029 | InstructionCost AArch64TTIImpl::getArithmeticInstrCost( |
3030 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
3031 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, |
3032 | ArrayRef<const Value *> Args, |
3033 | const Instruction *CxtI) { |
3034 | |
3035 | // The code-generator is currently not able to handle scalable vectors |
3036 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
3037 | // it. This change will be removed when code-generation for these types is |
3038 | // sufficiently reliable. |
3039 | if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty)) |
3040 | if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
3041 | return InstructionCost::getInvalid(); |
3042 | |
3043 | // TODO: Handle more cost kinds. |
3044 | if (CostKind != TTI::TCK_RecipThroughput) |
3045 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, |
3046 | Opd2Info: Op2Info, Args, CxtI); |
3047 | |
3048 | // Legalize the type. |
3049 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
3050 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
3051 | |
3052 | switch (ISD) { |
3053 | default: |
3054 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, |
3055 | Opd2Info: Op2Info); |
3056 | case ISD::SDIV: |
3057 | if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) { |
// On AArch64, scalar signed division by a power-of-two constant is
// normally expanded to the sequence ADD + CMP + SELECT + SRA.
// The OperandValue properties may not be the same as those of the
// previous operation; conservatively assume OP_None.
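// For example, "sdiv i32 %x, 4" is usually lowered to something like
// add w8, w0, #3; cmp w0, #0; csel w8, w8, w0, lt; asr w0, w8, #2, and the
// cost below is approximated as the sum of those four component operations.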
3062 | InstructionCost Cost = getArithmeticInstrCost( |
3063 | Opcode: Instruction::Add, Ty, CostKind, |
3064 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
3065 | Cost += getArithmeticInstrCost(Opcode: Instruction::Sub, Ty, CostKind, |
3066 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
3067 | Cost += getArithmeticInstrCost( |
3068 | Opcode: Instruction::Select, Ty, CostKind, |
3069 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
3070 | Cost += getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind, |
3071 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
3072 | return Cost; |
3073 | } |
3074 | [[fallthrough]]; |
3075 | case ISD::UDIV: { |
3076 | if (Op2Info.isConstant() && Op2Info.isUniform()) { |
3077 | auto VT = TLI->getValueType(DL, Ty); |
3078 | if (TLI->isOperationLegalOrCustom(Op: ISD::MULHU, VT)) { |
// Vector signed division by a constant is expanded to the
// sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
// to MULHS + SUB + SRL + ADD + SRL.
3082 | InstructionCost MulCost = getArithmeticInstrCost( |
3083 | Opcode: Instruction::Mul, Ty, CostKind, Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
3084 | InstructionCost AddCost = getArithmeticInstrCost( |
3085 | Opcode: Instruction::Add, Ty, CostKind, Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
3086 | InstructionCost ShrCost = getArithmeticInstrCost( |
3087 | Opcode: Instruction::AShr, Ty, CostKind, Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
3088 | return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1; |
3089 | } |
3090 | } |
3091 | |
3092 | InstructionCost Cost = BaseT::getArithmeticInstrCost( |
3093 | Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info); |
3094 | if (Ty->isVectorTy()) { |
3095 | if (TLI->isOperationLegalOrCustom(Op: ISD, VT: LT.second) && ST->hasSVE()) { |
// If SDIV/UDIV operations can be lowered using SVE, the cost is lower.
3098 | if (isa<FixedVectorType>(Val: Ty) && cast<FixedVectorType>(Val: Ty) |
3099 | ->getPrimitiveSizeInBits() |
3100 | .getFixedValue() < 128) { |
3101 | EVT VT = TLI->getValueType(DL, Ty); |
3102 | static const CostTblEntry DivTbl[]{ |
3103 | {.ISD: ISD::SDIV, .Type: MVT::v2i8, .Cost: 5}, {.ISD: ISD::SDIV, .Type: MVT::v4i8, .Cost: 8}, |
3104 | {.ISD: ISD::SDIV, .Type: MVT::v8i8, .Cost: 8}, {.ISD: ISD::SDIV, .Type: MVT::v2i16, .Cost: 5}, |
3105 | {.ISD: ISD::SDIV, .Type: MVT::v4i16, .Cost: 5}, {.ISD: ISD::SDIV, .Type: MVT::v2i32, .Cost: 1}, |
3106 | {.ISD: ISD::UDIV, .Type: MVT::v2i8, .Cost: 5}, {.ISD: ISD::UDIV, .Type: MVT::v4i8, .Cost: 8}, |
3107 | {.ISD: ISD::UDIV, .Type: MVT::v8i8, .Cost: 8}, {.ISD: ISD::UDIV, .Type: MVT::v2i16, .Cost: 5}, |
3108 | {.ISD: ISD::UDIV, .Type: MVT::v4i16, .Cost: 5}, {.ISD: ISD::UDIV, .Type: MVT::v2i32, .Cost: 1}}; |
3109 | |
3110 | const auto *Entry = CostTableLookup(Table: DivTbl, ISD, Ty: VT.getSimpleVT()); |
3111 | if (nullptr != Entry) |
3112 | return Entry->Cost; |
3113 | } |
3114 | // For 8/16-bit elements, the cost is higher because the type |
3115 | // requires promotion and possibly splitting: |
3116 | if (LT.second.getScalarType() == MVT::i8) |
3117 | Cost *= 8; |
3118 | else if (LT.second.getScalarType() == MVT::i16) |
3119 | Cost *= 4; |
3120 | return Cost; |
3121 | } else { |
3122 | // If one of the operands is a uniform constant then the cost for each |
3123 | // element is Cost for insertion, extraction and division. |
3124 | // Insertion cost = 2, Extraction Cost = 2, Division = cost for the |
3125 | // operation with scalar type |
3126 | if ((Op1Info.isConstant() && Op1Info.isUniform()) || |
3127 | (Op2Info.isConstant() && Op2Info.isUniform())) { |
3128 | if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) { |
3129 | InstructionCost DivCost = BaseT::getArithmeticInstrCost( |
3130 | Opcode, Ty: Ty->getScalarType(), CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info); |
3131 | return (4 + DivCost) * VTy->getNumElements(); |
3132 | } |
3133 | } |
3134 | // On AArch64, without SVE, vector divisions are expanded |
3135 | // into scalar divisions of each pair of elements. |
3136 | Cost += getArithmeticInstrCost(Opcode: Instruction::ExtractElement, Ty, |
3137 | CostKind, Op1Info, Op2Info); |
3138 | Cost += getArithmeticInstrCost(Opcode: Instruction::InsertElement, Ty, CostKind, |
3139 | Op1Info, Op2Info); |
3140 | } |
3141 | |
3142 | // TODO: if one of the arguments is scalar, then it's not necessary to |
3143 | // double the cost of handling the vector elements. |
3144 | Cost += Cost; |
3145 | } |
3146 | return Cost; |
3147 | } |
3148 | case ISD::MUL: |
3149 | // When SVE is available, then we can lower the v2i64 operation using |
3150 | // the SVE mul instruction, which has a lower cost. |
3151 | if (LT.second == MVT::v2i64 && ST->hasSVE()) |
3152 | return LT.first; |
3153 | |
3154 | // When SVE is not available, there is no MUL.2d instruction, |
3155 | // which means mul <2 x i64> is expensive as elements are extracted |
3156 | // from the vectors and the muls scalarized. |
3157 | // As getScalarizationOverhead is a bit too pessimistic, we |
3158 | // estimate the cost for a i64 vector directly here, which is: |
3159 | // - four 2-cost i64 extracts, |
3160 | // - two 2-cost i64 inserts, and |
3161 | // - two 1-cost muls. |
// So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
3163 | // LT.first = 2 the cost is 28. If both operands are extensions it will not |
3164 | // need to scalarize so the cost can be cheaper (smull or umull). |
3166 | if (LT.second != MVT::v2i64 || isWideningInstruction(DstTy: Ty, Opcode, Args)) |
3167 | return LT.first; |
3168 | return LT.first * 14; |
3169 | case ISD::ADD: |
3170 | case ISD::XOR: |
3171 | case ISD::OR: |
3172 | case ISD::AND: |
3173 | case ISD::SRL: |
3174 | case ISD::SRA: |
3175 | case ISD::SHL: |
3176 | // These nodes are marked as 'custom' for combining purposes only. |
3177 | // We know that they are legal. See LowerAdd in ISelLowering. |
3178 | return LT.first; |
3179 | |
3180 | case ISD::FNEG: |
3181 | case ISD::FADD: |
3182 | case ISD::FSUB: |
3183 | // Increase the cost for half and bfloat types if not architecturally |
3184 | // supported. |
3185 | if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) || |
3186 | (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16())) |
3187 | return 2 * LT.first; |
3188 | if (!Ty->getScalarType()->isFP128Ty()) |
3189 | return LT.first; |
3190 | [[fallthrough]]; |
3191 | case ISD::FMUL: |
3192 | case ISD::FDIV: |
3193 | // These nodes are marked as 'custom' just to lower them to SVE. |
3194 | // We know said lowering will incur no additional cost. |
3195 | if (!Ty->getScalarType()->isFP128Ty()) |
3196 | return 2 * LT.first; |
3197 | |
3198 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, |
3199 | Opd2Info: Op2Info); |
3200 | case ISD::FREM: |
3201 | // Pass nullptr as fmod/fmodf calls are emitted by the backend even when |
3202 | // those functions are not declared in the module. |
3203 | if (!Ty->isVectorTy()) |
3204 | return getCallInstrCost(/*Function*/ F: nullptr, RetTy: Ty, Tys: {Ty, Ty}, CostKind); |
3205 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, |
3206 | Opd2Info: Op2Info); |
3207 | } |
3208 | } |
3209 | |
3210 | InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty, |
3211 | ScalarEvolution *SE, |
3212 | const SCEV *Ptr) { |
3213 | // Address computations in vectorized code with non-consecutive addresses will |
3214 | // likely result in more instructions compared to scalar code where the |
3215 | // computation can more often be merged into the index mode. The resulting |
3216 | // extra micro-ops can significantly decrease throughput. |
3217 | unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead; |
3218 | int MaxMergeDistance = 64; |
3219 | |
3220 | if (Ty->isVectorTy() && SE && |
3221 | !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MergeDistance: MaxMergeDistance + 1)) |
3222 | return NumVectorInstToHideOverhead; |
3223 | |
3224 | // In many cases the address computation is not merged into the instruction |
3225 | // addressing mode. |
3226 | return 1; |
3227 | } |
3228 | |
3229 | InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, |
3230 | Type *CondTy, |
3231 | CmpInst::Predicate VecPred, |
3232 | TTI::TargetCostKind CostKind, |
3233 | const Instruction *I) { |
3234 | // TODO: Handle other cost kinds. |
3235 | if (CostKind != TTI::TCK_RecipThroughput) |
3236 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
3237 | I); |
3238 | |
3239 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
// We don't lower some vector selects well when they are wider than the
// register width.
3242 | if (isa<FixedVectorType>(Val: ValTy) && ISD == ISD::SELECT) { |
3243 | // We would need this many instructions to hide the scalarization happening. |
3244 | const int AmortizationCost = 20; |
3245 | |
3246 | // If VecPred is not set, check if we can get a predicate from the context |
3247 | // instruction, if its type matches the requested ValTy. |
3248 | if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) { |
3249 | CmpInst::Predicate CurrentPred; |
3250 | if (match(V: I, P: m_Select(C: m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value()), L: m_Value(), |
3251 | R: m_Value()))) |
3252 | VecPred = CurrentPred; |
3253 | } |
3254 | // Check if we have a compare/select chain that can be lowered using |
3255 | // a (F)CMxx & BFI pair. |
3256 | if (CmpInst::isIntPredicate(P: VecPred) || VecPred == CmpInst::FCMP_OLE || |
3257 | VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT || |
3258 | VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ || |
3259 | VecPred == CmpInst::FCMP_UNE) { |
3260 | static const auto ValidMinMaxTys = { |
3261 | MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, |
3262 | MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64}; |
3263 | static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16}; |
3264 | |
3265 | auto LT = getTypeLegalizationCost(Ty: ValTy); |
if (any_of(Range: ValidMinMaxTys, P: [&LT](MVT M) { return M == LT.second; }) ||
(ST->hasFullFP16() &&
any_of(Range: ValidFP16MinMaxTys, P: [&LT](MVT M) { return M == LT.second; })))
3269 | return LT.first; |
3270 | } |
3271 | |
3272 | static const TypeConversionCostTblEntry |
3273 | VectorSelectTbl[] = { |
3274 | { .ISD: ISD::SELECT, .Dst: MVT::v2i1, .Src: MVT::v2f32, .Cost: 2 }, |
3275 | { .ISD: ISD::SELECT, .Dst: MVT::v2i1, .Src: MVT::v2f64, .Cost: 2 }, |
3276 | { .ISD: ISD::SELECT, .Dst: MVT::v4i1, .Src: MVT::v4f32, .Cost: 2 }, |
3277 | { .ISD: ISD::SELECT, .Dst: MVT::v4i1, .Src: MVT::v4f16, .Cost: 2 }, |
3278 | { .ISD: ISD::SELECT, .Dst: MVT::v8i1, .Src: MVT::v8f16, .Cost: 2 }, |
3279 | { .ISD: ISD::SELECT, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: 16 }, |
3280 | { .ISD: ISD::SELECT, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: 8 }, |
3281 | { .ISD: ISD::SELECT, .Dst: MVT::v16i1, .Src: MVT::v16i32, .Cost: 16 }, |
3282 | { .ISD: ISD::SELECT, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: 4 * AmortizationCost }, |
3283 | { .ISD: ISD::SELECT, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: 8 * AmortizationCost }, |
3284 | { .ISD: ISD::SELECT, .Dst: MVT::v16i1, .Src: MVT::v16i64, .Cost: 16 * AmortizationCost } |
3285 | }; |
3286 | |
3287 | EVT SelCondTy = TLI->getValueType(DL, Ty: CondTy); |
3288 | EVT SelValTy = TLI->getValueType(DL, Ty: ValTy); |
3289 | if (SelCondTy.isSimple() && SelValTy.isSimple()) { |
3290 | if (const auto *Entry = ConvertCostTableLookup(Table: VectorSelectTbl, ISD, |
3291 | Dst: SelCondTy.getSimpleVT(), |
3292 | Src: SelValTy.getSimpleVT())) |
3293 | return Entry->Cost; |
3294 | } |
3295 | } |
3296 | |
3297 | if (isa<FixedVectorType>(Val: ValTy) && ISD == ISD::SETCC) { |
3298 | auto LT = getTypeLegalizationCost(Ty: ValTy); |
3299 | // Cost v4f16 FCmp without FP16 support via converting to v4f32 and back. |
3300 | if (LT.second == MVT::v4f16 && !ST->hasFullFP16()) |
3301 | return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn |
3302 | } |
3303 | |
3304 | // Treat the icmp in icmp(and, 0) as free, as we can make use of ands. |
3305 | // FIXME: This can apply to more conditions and add/sub if it can be shown to |
3306 | // be profitable. |
3307 | if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I && |
3308 | ICmpInst::isEquality(P: VecPred) && |
3309 | TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: ValTy)) && |
3310 | match(V: I->getOperand(i: 1), P: m_Zero()) && |
3311 | match(V: I->getOperand(i: 0), P: m_And(L: m_Value(), R: m_Value()))) |
3312 | return 0; |
3313 | |
3314 | // The base case handles scalable vectors fine for now, since it treats the |
3315 | // cost as 1 * legalization cost. |
3316 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); |
3317 | } |
3318 | |
3319 | AArch64TTIImpl::TTI::MemCmpExpansionOptions |
3320 | AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { |
3321 | TTI::MemCmpExpansionOptions Options; |
3322 | if (ST->requiresStrictAlign()) { |
3323 | // TODO: Add cost modeling for strict align. Misaligned loads expand to |
3324 | // a bunch of instructions when strict align is enabled. |
3325 | return Options; |
3326 | } |
3327 | Options.AllowOverlappingLoads = true; |
3328 | Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); |
3329 | Options.NumLoadsPerBlock = Options.MaxNumLoads; |
3330 | // TODO: Though vector loads usually perform well on AArch64, in some targets |
3331 | // they may wake up the FP unit, which raises the power consumption. Perhaps |
3332 | // they could be used with no holds barred (-O3). |
3333 | Options.LoadSizes = {8, 4, 2, 1}; |
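// Tail sizes of 3, 5 and 6 bytes can each be covered by a pair of (possibly
// overlapping) loads, e.g. 2+1, 4+1 and 4+2 bytes respectively, so allow
// them to be expanded as well.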
3334 | Options.AllowedTailExpansions = {3, 5, 6}; |
3335 | return Options; |
3336 | } |
3337 | |
3338 | bool AArch64TTIImpl::prefersVectorizedAddressing() const { |
3339 | return ST->hasSVE(); |
3340 | } |
3341 | |
3342 | InstructionCost |
3343 | AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, |
3344 | Align Alignment, unsigned AddressSpace, |
3345 | TTI::TargetCostKind CostKind) { |
3346 | if (useNeonVector(Ty: Src)) |
3347 | return BaseT::getMaskedMemoryOpCost(Opcode, DataTy: Src, Alignment, AddressSpace, |
3348 | CostKind); |
3349 | auto LT = getTypeLegalizationCost(Ty: Src); |
3350 | if (!LT.first.isValid()) |
3351 | return InstructionCost::getInvalid(); |
3352 | |
3353 | // Return an invalid cost for element types that we are unable to lower. |
3354 | auto *VT = cast<VectorType>(Val: Src); |
3355 | if (VT->getElementType()->isIntegerTy(Bitwidth: 1)) |
3356 | return InstructionCost::getInvalid(); |
3357 | |
3358 | // The code-generator is currently not able to handle scalable vectors |
3359 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
3360 | // it. This change will be removed when code-generation for these types is |
3361 | // sufficiently reliable. |
3362 | if (VT->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
3363 | return InstructionCost::getInvalid(); |
3364 | |
3365 | return LT.first; |
3366 | } |
3367 | |
3368 | static unsigned getSVEGatherScatterOverhead(unsigned Opcode) { |
3369 | return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead; |
3370 | } |
3371 | |
3372 | InstructionCost AArch64TTIImpl::getGatherScatterOpCost( |
3373 | unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, |
3374 | Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { |
3375 | if (useNeonVector(Ty: DataTy) || !isLegalMaskedGatherScatter(DataType: DataTy)) |
3376 | return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, |
3377 | Alignment, CostKind, I); |
3378 | auto *VT = cast<VectorType>(Val: DataTy); |
3379 | auto LT = getTypeLegalizationCost(Ty: DataTy); |
3380 | if (!LT.first.isValid()) |
3381 | return InstructionCost::getInvalid(); |
3382 | |
3383 | // Return an invalid cost for element types that we are unable to lower. |
3384 | if (!LT.second.isVector() || |
3385 | !isElementTypeLegalForScalableVector(Ty: VT->getElementType()) || |
3386 | VT->getElementType()->isIntegerTy(Bitwidth: 1)) |
3387 | return InstructionCost::getInvalid(); |
3388 | |
3389 | // The code-generator is currently not able to handle scalable vectors |
3390 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
3391 | // it. This change will be removed when code-generation for these types is |
3392 | // sufficiently reliable. |
3393 | if (VT->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
3394 | return InstructionCost::getInvalid(); |
3395 | |
3396 | ElementCount LegalVF = LT.second.getVectorElementCount(); |
3397 | InstructionCost MemOpCost = |
3398 | getMemoryOpCost(Opcode, Src: VT->getElementType(), Alignment, AddressSpace: 0, CostKind, |
3399 | OpInfo: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I); |
3400 | // Add on an overhead cost for using gathers/scatters. |
3401 | // TODO: At the moment this is applied unilaterally for all CPUs, but at some |
3402 | // point we may want a per-CPU overhead. |
3403 | MemOpCost *= getSVEGatherScatterOverhead(Opcode); |
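// For example, an nxv4i32 gather with a scalar load cost of 1, an overhead
// of 10 and a vscale upper bound of 2 would be costed as roughly
// 1 * 10 * 8 = 80, i.e. one overhead-scaled scalar access per lane.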
3404 | return LT.first * MemOpCost * getMaxNumElements(VF: LegalVF); |
3405 | } |
3406 | |
3407 | bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { |
3408 | return isa<FixedVectorType>(Val: Ty) && !ST->useSVEForFixedLengthVectors(); |
3409 | } |
3410 | |
3411 | InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, |
3412 | MaybeAlign Alignment, |
3413 | unsigned AddressSpace, |
3414 | TTI::TargetCostKind CostKind, |
3415 | TTI::OperandValueInfo OpInfo, |
3416 | const Instruction *I) { |
3417 | EVT VT = TLI->getValueType(DL, Ty, AllowUnknown: true); |
3418 | // Type legalization can't handle structs |
3419 | if (VT == MVT::Other) |
3420 | return BaseT::getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace, |
3421 | CostKind); |
3422 | |
3423 | auto LT = getTypeLegalizationCost(Ty); |
3424 | if (!LT.first.isValid()) |
3425 | return InstructionCost::getInvalid(); |
3426 | |
3427 | // The code-generator is currently not able to handle scalable vectors |
3428 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
3429 | // it. This change will be removed when code-generation for these types is |
3430 | // sufficiently reliable. |
3431 | // We also only support full register predicate loads and stores. |
3432 | if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty)) |
3433 | if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1) || |
3434 | (VTy->getElementType()->isIntegerTy(Bitwidth: 1) && |
3435 | !VTy->getElementCount().isKnownMultipleOf( |
3436 | RHS: ElementCount::getScalable(MinVal: 16)))) |
3437 | return InstructionCost::getInvalid(); |
3438 | |
3439 | // TODO: consider latency as well for TCK_SizeAndLatency. |
3440 | if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) |
3441 | return LT.first; |
3442 | |
3443 | if (CostKind != TTI::TCK_RecipThroughput) |
3444 | return 1; |
3445 | |
3446 | if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && |
3447 | LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) { |
// Unaligned stores are extremely inefficient. We don't split all
// unaligned 128-bit stores because of the negative impact that doing so
// has shown in practice on inlined block copy code.
3451 | // We make such stores expensive so that we will only vectorize if there |
3452 | // are 6 other instructions getting vectorized. |
3453 | const int AmortizationCost = 6; |
3454 | |
3455 | return LT.first * 2 * AmortizationCost; |
3456 | } |
3457 | |
3458 | // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs. |
3459 | if (Ty->isPtrOrPtrVectorTy()) |
3460 | return LT.first; |
3461 | |
3462 | if (useNeonVector(Ty)) { |
3463 | // Check truncating stores and extending loads. |
3464 | if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) { |
// v4i8 types are lowered to a scalar load/store and sshll/xtn.
3466 | if (VT == MVT::v4i8) |
3467 | return 2; |
3468 | // Otherwise we need to scalarize. |
3469 | return cast<FixedVectorType>(Val: Ty)->getNumElements() * 2; |
3470 | } |
3471 | EVT EltVT = VT.getVectorElementType(); |
3472 | unsigned EltSize = EltVT.getScalarSizeInBits(); |
3473 | if (!isPowerOf2_32(Value: EltSize) || EltSize < 8 || EltSize > 64 || |
3474 | VT.getVectorNumElements() >= (128 / EltSize) || !Alignment || |
3475 | *Alignment != Align(1)) |
3476 | return LT.first; |
3477 | // FIXME: v3i8 lowering currently is very inefficient, due to automatic |
3478 | // widening to v4i8, which produces suboptimal results. |
3479 | if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8) |
3480 | return LT.first; |
3481 | |
3482 | // Check non-power-of-2 loads/stores for legal vector element types with |
3483 | // NEON. Non-power-of-2 memory ops will get broken down to a set of |
3484 | // operations on smaller power-of-2 ops, including ld1/st1. |
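// For example, a v7i8 access would be decomposed into v4i8 + v3i8, and the
// v3i8 part further into v2i8 + v1i8, giving an estimated cost of 3.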
3485 | LLVMContext &C = Ty->getContext(); |
3486 | InstructionCost Cost(0); |
3487 | SmallVector<EVT> TypeWorklist; |
3488 | TypeWorklist.push_back(Elt: VT); |
3489 | while (!TypeWorklist.empty()) { |
3490 | EVT CurrVT = TypeWorklist.pop_back_val(); |
3491 | unsigned CurrNumElements = CurrVT.getVectorNumElements(); |
3492 | if (isPowerOf2_32(Value: CurrNumElements)) { |
3493 | Cost += 1; |
3494 | continue; |
3495 | } |
3496 | |
3497 | unsigned PrevPow2 = NextPowerOf2(A: CurrNumElements) / 2; |
3498 | TypeWorklist.push_back(Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: PrevPow2)); |
3499 | TypeWorklist.push_back( |
3500 | Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: CurrNumElements - PrevPow2)); |
3501 | } |
3502 | return Cost; |
3503 | } |
3504 | |
3505 | return LT.first; |
3506 | } |
3507 | |
3508 | InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( |
3509 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, |
3510 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, |
3511 | bool UseMaskForCond, bool UseMaskForGaps) { |
3512 | assert(Factor >= 2 && "Invalid interleave factor" ); |
3513 | auto *VecVTy = cast<VectorType>(Val: VecTy); |
3514 | |
3515 | if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2)) |
3516 | return InstructionCost::getInvalid(); |
3517 | |
3518 | // Vectorization for masked interleaved accesses is only enabled for scalable |
3519 | // VF. |
3520 | if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps)) |
3521 | return InstructionCost::getInvalid(); |
3522 | |
3523 | if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) { |
3524 | unsigned MinElts = VecVTy->getElementCount().getKnownMinValue(); |
3525 | auto *SubVecTy = |
3526 | VectorType::get(ElementType: VecVTy->getElementType(), |
3527 | EC: VecVTy->getElementCount().divideCoefficientBy(RHS: Factor)); |
3528 | |
3529 | // ldN/stN only support legal vector types of size 64 or 128 in bits. |
3530 | // Accesses having vector types that are a multiple of 128 bits can be |
3531 | // matched to more than one ldN/stN instruction. |
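    // For example (illustrative): a factor-2 group over v8i32 gives SubVecTy ==
    // v4i32, a single legal 128-bit ld2/st2, so the returned cost is 2 * 1.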
3532 | bool UseScalable; |
3533 | if (MinElts % Factor == 0 && |
3534 | TLI->isLegalInterleavedAccessType(VecTy: SubVecTy, DL, UseScalable)) |
3535 | return Factor * TLI->getNumInterleavedAccesses(VecTy: SubVecTy, DL, UseScalable); |
3536 | } |
3537 | |
3538 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
3539 | Alignment, AddressSpace, CostKind, |
3540 | UseMaskForCond, UseMaskForGaps); |
3541 | } |
3542 | |
3543 | InstructionCost |
3544 | AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { |
3545 | InstructionCost Cost = 0; |
3546 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
3547 | for (auto *I : Tys) { |
3548 | if (!I->isVectorTy()) |
3549 | continue; |
3550 | if (I->getScalarSizeInBits() * cast<FixedVectorType>(Val: I)->getNumElements() == |
3551 | 128) |
3552 | Cost += getMemoryOpCost(Opcode: Instruction::Store, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind) + |
3553 | getMemoryOpCost(Opcode: Instruction::Load, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind); |
3554 | } |
3555 | return Cost; |
3556 | } |
3557 | |
3558 | unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) { |
3559 | return ST->getMaxInterleaveFactor(); |
3560 | } |
3561 | |
3562 | // For Falkor, we want to avoid having too many strided loads in a loop since |
3563 | // that can exhaust the HW prefetcher resources. We adjust the unroller |
3564 | // MaxCount preference below to attempt to ensure unrolling doesn't create too |
3565 | // many strided loads. |
3566 | static void |
3567 | getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, |
3568 | TargetTransformInfo::UnrollingPreferences &UP) { |
3569 | enum { MaxStridedLoads = 7 }; |
3570 | auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { |
3571 | int StridedLoads = 0; |
3572 | // FIXME? We could make this more precise by looking at the CFG and |
3573 | // e.g. not counting loads in each side of an if-then-else diamond. |
3574 | for (const auto BB : L->blocks()) { |
3575 | for (auto &I : *BB) { |
3576 | LoadInst *LMemI = dyn_cast<LoadInst>(Val: &I); |
3577 | if (!LMemI) |
3578 | continue; |
3579 | |
3580 | Value *PtrValue = LMemI->getPointerOperand(); |
3581 | if (L->isLoopInvariant(V: PtrValue)) |
3582 | continue; |
3583 | |
3584 | const SCEV *LSCEV = SE.getSCEV(V: PtrValue); |
3585 | const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(Val: LSCEV); |
3586 | if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) |
3587 | continue; |
3588 | |
3589 | // FIXME? We could take pairing of unrolled load copies into account |
3590 | // by looking at the AddRec, but we would probably have to limit this |
3591 | // to loops with no stores or other memory optimization barriers. |
3592 | ++StridedLoads; |
3593 | // We've seen enough strided loads that seeing more won't make a |
3594 | // difference. |
3595 | if (StridedLoads > MaxStridedLoads / 2) |
3596 | return StridedLoads; |
3597 | } |
3598 | } |
3599 | return StridedLoads; |
3600 | }; |
3601 | |
3602 | int StridedLoads = countStridedLoads(L, SE); |
3603 | LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads |
3604 | << " strided loads\n" ); |
3605 | // Pick the largest power of 2 unroll count that won't result in too many |
3606 | // strided loads. |
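  // For example (a worked illustration with MaxStridedLoads == 7): 1 strided
  // load allows MaxCount = 4, 2 or 3 allow MaxCount = 2, and 4 or more give
  // MaxCount = 1.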
3607 | if (StridedLoads) { |
3608 | UP.MaxCount = 1 << Log2_32(Value: MaxStridedLoads / StridedLoads); |
3609 | LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " |
3610 | << UP.MaxCount << '\n'); |
3611 | } |
3612 | } |
3613 | |
void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
3615 | TTI::UnrollingPreferences &UP, |
3616 | OptimizationRemarkEmitter *ORE) { |
3617 | // Enable partial unrolling and runtime unrolling. |
3618 | BaseT::getUnrollingPreferences(L, SE, UP, ORE); |
3619 | |
3620 | UP.UpperBound = true; |
3621 | |
  // An inner loop is more likely to be hot, and its runtime checks can be
  // hoisted out by the LICM pass, so the overhead is lower; use a larger
  // threshold to unroll more loops.
3625 | if (L->getLoopDepth() > 1) |
3626 | UP.PartialThreshold *= 2; |
3627 | |
3628 | // Disable partial & runtime unrolling on -Os. |
3629 | UP.PartialOptSizeThreshold = 0; |
3630 | |
3631 | if (ST->getProcFamily() == AArch64Subtarget::Falkor && |
3632 | EnableFalkorHWPFUnrollFix) |
3633 | getFalkorUnrollingPreferences(L, SE, UP); |
3634 | |
3635 | // Scan the loop: don't unroll loops with calls as this could prevent |
3636 | // inlining. Don't unroll vector loops either, as they don't benefit much from |
3637 | // unrolling. |
3638 | for (auto *BB : L->getBlocks()) { |
3639 | for (auto &I : *BB) { |
      // Don't unroll vectorised loops.
3641 | if (I.getType()->isVectorTy()) |
3642 | return; |
3643 | |
3644 | if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I)) { |
3645 | if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction()) { |
3646 | if (!isLoweredToCall(F)) |
3647 | continue; |
3648 | } |
3649 | return; |
3650 | } |
3651 | } |
3652 | } |
3653 | |
  // Enable runtime unrolling for in-order models.
  // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so
  // by checking for that case we can ensure that the default behaviour is
  // unchanged.
3658 | if (ST->getProcFamily() != AArch64Subtarget::Others && |
3659 | !ST->getSchedModel().isOutOfOrder()) { |
3660 | UP.Runtime = true; |
3661 | UP.Partial = true; |
3662 | UP.UnrollRemainder = true; |
3663 | UP.DefaultUnrollRuntimeCount = 4; |
3664 | |
3665 | UP.UnrollAndJam = true; |
3666 | UP.UnrollAndJamInnerLoopThreshold = 60; |
3667 | } |
3668 | } |
3669 | |
3670 | void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
3671 | TTI::PeelingPreferences &PP) { |
3672 | BaseT::getPeelingPreferences(L, SE, PP); |
3673 | } |
3674 | |
3675 | Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, |
3676 | Type *ExpectedType) { |
3677 | switch (Inst->getIntrinsicID()) { |
3678 | default: |
3679 | return nullptr; |
3680 | case Intrinsic::aarch64_neon_st2: |
3681 | case Intrinsic::aarch64_neon_st3: |
3682 | case Intrinsic::aarch64_neon_st4: { |
    // The result must be a struct type whose elements match the stored values.
3684 | StructType *ST = dyn_cast<StructType>(Val: ExpectedType); |
3685 | if (!ST) |
3686 | return nullptr; |
3687 | unsigned NumElts = Inst->arg_size() - 1; |
3688 | if (ST->getNumElements() != NumElts) |
3689 | return nullptr; |
3690 | for (unsigned i = 0, e = NumElts; i != e; ++i) { |
3691 | if (Inst->getArgOperand(i)->getType() != ST->getElementType(N: i)) |
3692 | return nullptr; |
3693 | } |
3694 | Value *Res = PoisonValue::get(T: ExpectedType); |
3695 | IRBuilder<> Builder(Inst); |
3696 | for (unsigned i = 0, e = NumElts; i != e; ++i) { |
3697 | Value *L = Inst->getArgOperand(i); |
3698 | Res = Builder.CreateInsertValue(Agg: Res, Val: L, Idxs: i); |
3699 | } |
3700 | return Res; |
3701 | } |
3702 | case Intrinsic::aarch64_neon_ld2: |
3703 | case Intrinsic::aarch64_neon_ld3: |
3704 | case Intrinsic::aarch64_neon_ld4: |
3705 | if (Inst->getType() == ExpectedType) |
3706 | return Inst; |
3707 | return nullptr; |
3708 | } |
3709 | } |
3710 | |
3711 | bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, |
3712 | MemIntrinsicInfo &Info) { |
3713 | switch (Inst->getIntrinsicID()) { |
3714 | default: |
3715 | break; |
3716 | case Intrinsic::aarch64_neon_ld2: |
3717 | case Intrinsic::aarch64_neon_ld3: |
3718 | case Intrinsic::aarch64_neon_ld4: |
3719 | Info.ReadMem = true; |
3720 | Info.WriteMem = false; |
3721 | Info.PtrVal = Inst->getArgOperand(i: 0); |
3722 | break; |
3723 | case Intrinsic::aarch64_neon_st2: |
3724 | case Intrinsic::aarch64_neon_st3: |
3725 | case Intrinsic::aarch64_neon_st4: |
3726 | Info.ReadMem = false; |
3727 | Info.WriteMem = true; |
3728 | Info.PtrVal = Inst->getArgOperand(i: Inst->arg_size() - 1); |
3729 | break; |
3730 | } |
3731 | |
3732 | switch (Inst->getIntrinsicID()) { |
3733 | default: |
3734 | return false; |
3735 | case Intrinsic::aarch64_neon_ld2: |
3736 | case Intrinsic::aarch64_neon_st2: |
3737 | Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; |
3738 | break; |
3739 | case Intrinsic::aarch64_neon_ld3: |
3740 | case Intrinsic::aarch64_neon_st3: |
3741 | Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; |
3742 | break; |
3743 | case Intrinsic::aarch64_neon_ld4: |
3744 | case Intrinsic::aarch64_neon_st4: |
3745 | Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; |
3746 | break; |
3747 | } |
3748 | return true; |
3749 | } |
3750 | |
/// See if \p I should be considered for address type promotion. We check if \p
/// I is a sext with the right type that is used in memory accesses. If it is
/// used in a "complex" getelementptr, we allow it to be promoted without
/// finding other sext instructions that sign extended the same initial value.
/// A getelementptr is considered "complex" if it has more than 2 operands.
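/// For example (an illustrative IR sketch, not taken from the source):
///   %idx = sext i32 %i to i64
///   %p = getelementptr inbounds [16 x i32], ptr %base, i64 0, i64 %idx
/// Here the GEP has more than 2 operands, so the sext is considered promotable
/// without a common header.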
3756 | bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( |
    const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
3758 | bool Considerable = false; |
3759 | AllowPromotionWithoutCommonHeader = false; |
3760 | if (!isa<SExtInst>(Val: &I)) |
3761 | return false; |
3762 | Type *ConsideredSExtType = |
3763 | Type::getInt64Ty(C&: I.getParent()->getParent()->getContext()); |
3764 | if (I.getType() != ConsideredSExtType) |
3765 | return false; |
3766 | // See if the sext is the one with the right type and used in at least one |
3767 | // GetElementPtrInst. |
3768 | for (const User *U : I.users()) { |
3769 | if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(Val: U)) { |
3770 | Considerable = true; |
      // A getelementptr is considered as "complex" if it has more than 2
      // operands. We will promote a SExt used in such a complex GEP, as we
      // expect some of the computation to be merged if it is done on 64 bits.
3774 | if (GEPInst->getNumOperands() > 2) { |
3775 | AllowPromotionWithoutCommonHeader = true; |
3776 | break; |
3777 | } |
3778 | } |
3779 | } |
3780 | return Considerable; |
3781 | } |
3782 | |
3783 | bool AArch64TTIImpl::isLegalToVectorizeReduction( |
3784 | const RecurrenceDescriptor &RdxDesc, ElementCount VF) const { |
3785 | if (!VF.isScalable()) |
3786 | return true; |
3787 | |
3788 | Type *Ty = RdxDesc.getRecurrenceType(); |
3789 | if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty)) |
3790 | return false; |
3791 | |
3792 | switch (RdxDesc.getRecurrenceKind()) { |
3793 | case RecurKind::Add: |
3794 | case RecurKind::FAdd: |
3795 | case RecurKind::And: |
3796 | case RecurKind::Or: |
3797 | case RecurKind::Xor: |
3798 | case RecurKind::SMin: |
3799 | case RecurKind::SMax: |
3800 | case RecurKind::UMin: |
3801 | case RecurKind::UMax: |
3802 | case RecurKind::FMin: |
3803 | case RecurKind::FMax: |
3804 | case RecurKind::FMulAdd: |
3805 | case RecurKind::IAnyOf: |
3806 | case RecurKind::FAnyOf: |
3807 | return true; |
3808 | default: |
3809 | return false; |
3810 | } |
3811 | } |
3812 | |
3813 | InstructionCost |
3814 | AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, |
3815 | FastMathFlags FMF, |
3816 | TTI::TargetCostKind CostKind) { |
3817 | // The code-generator is currently not able to handle scalable vectors |
3818 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
3819 | // it. This change will be removed when code-generation for these types is |
3820 | // sufficiently reliable. |
3821 | if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty)) |
3822 | if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
3823 | return InstructionCost::getInvalid(); |
3824 | |
3825 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
3826 | |
3827 | if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) |
3828 | return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); |
3829 | |
3830 | InstructionCost LegalizationCost = 0; |
3831 | if (LT.first > 1) { |
3832 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: Ty->getContext()); |
3833 | IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF); |
3834 | LegalizationCost = getIntrinsicInstrCost(ICA: Attrs, CostKind) * (LT.first - 1); |
3835 | } |
3836 | |
3837 | return LegalizationCost + /*Cost of horizontal reduction*/ 2; |
3838 | } |
3839 | |
3840 | InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( |
3841 | unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) { |
3842 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
3843 | InstructionCost LegalizationCost = 0; |
3844 | if (LT.first > 1) { |
3845 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: ValTy->getContext()); |
3846 | LegalizationCost = getArithmeticInstrCost(Opcode, Ty: LegalVTy, CostKind); |
3847 | LegalizationCost *= LT.first - 1; |
3848 | } |
3849 | |
3850 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
3851 | assert(ISD && "Invalid opcode" ); |
3852 | // Add the final reduction cost for the legal horizontal reduction |
3853 | switch (ISD) { |
3854 | case ISD::ADD: |
3855 | case ISD::AND: |
3856 | case ISD::OR: |
3857 | case ISD::XOR: |
3858 | case ISD::FADD: |
3859 | return LegalizationCost + 2; |
3860 | default: |
3861 | return InstructionCost::getInvalid(); |
3862 | } |
3863 | } |
3864 | |
3865 | InstructionCost |
3866 | AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, |
3867 | std::optional<FastMathFlags> FMF, |
3868 | TTI::TargetCostKind CostKind) { |
3869 | // The code-generator is currently not able to handle scalable vectors |
3870 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
3871 | // it. This change will be removed when code-generation for these types is |
3872 | // sufficiently reliable. |
3873 | if (auto *VTy = dyn_cast<ScalableVectorType>(Val: ValTy)) |
3874 | if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
3875 | return InstructionCost::getInvalid(); |
3876 | |
3877 | if (TTI::requiresOrderedReduction(FMF)) { |
3878 | if (auto *FixedVTy = dyn_cast<FixedVectorType>(Val: ValTy)) { |
3879 | InstructionCost BaseCost = |
3880 | BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind); |
3881 | // Add on extra cost to reflect the extra overhead on some CPUs. We still |
3882 | // end up vectorizing for more computationally intensive loops. |
3883 | return BaseCost + FixedVTy->getNumElements(); |
3884 | } |
3885 | |
3886 | if (Opcode != Instruction::FAdd) |
3887 | return InstructionCost::getInvalid(); |
3888 | |
3889 | auto *VTy = cast<ScalableVectorType>(Val: ValTy); |
3890 | InstructionCost Cost = |
3891 | getArithmeticInstrCost(Opcode, Ty: VTy->getScalarType(), CostKind); |
3892 | Cost *= getMaxNumElements(VF: VTy->getElementCount()); |
3893 | return Cost; |
3894 | } |
3895 | |
3896 | if (isa<ScalableVectorType>(Val: ValTy)) |
3897 | return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); |
3898 | |
3899 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
3900 | MVT MTy = LT.second; |
3901 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
3902 | assert(ISD && "Invalid opcode" ); |
3903 | |
  // Horizontal adds can use the 'addv' instruction. We model the cost of these
  // instructions as twice a normal vector add, plus 1 for each extra
  // legalization step (LT.first - 1). This is the only arithmetic vector
  // reduction operation for which we have an instruction.
3908 | // OR, XOR and AND costs should match the codegen from: |
3909 | // OR: llvm/test/CodeGen/AArch64/reduce-or.ll |
3910 | // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll |
3911 | // AND: llvm/test/CodeGen/AArch64/reduce-and.ll |
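  // For example (a worked reading of the table and code below): an i32 add
  // reduction over v4i32 costs (1 - 1) + 2 = 2, while one over v8i32, which
  // legalizes to two v4i32 halves (LT.first == 2), costs (2 - 1) + 2 = 3.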
3912 | static const CostTblEntry CostTblNoPairwise[]{ |
3913 | {.ISD: ISD::ADD, .Type: MVT::v8i8, .Cost: 2}, |
3914 | {.ISD: ISD::ADD, .Type: MVT::v16i8, .Cost: 2}, |
3915 | {.ISD: ISD::ADD, .Type: MVT::v4i16, .Cost: 2}, |
3916 | {.ISD: ISD::ADD, .Type: MVT::v8i16, .Cost: 2}, |
3917 | {.ISD: ISD::ADD, .Type: MVT::v4i32, .Cost: 2}, |
3918 | {.ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: 2}, |
3919 | {.ISD: ISD::OR, .Type: MVT::v8i8, .Cost: 15}, |
3920 | {.ISD: ISD::OR, .Type: MVT::v16i8, .Cost: 17}, |
3921 | {.ISD: ISD::OR, .Type: MVT::v4i16, .Cost: 7}, |
3922 | {.ISD: ISD::OR, .Type: MVT::v8i16, .Cost: 9}, |
3923 | {.ISD: ISD::OR, .Type: MVT::v2i32, .Cost: 3}, |
3924 | {.ISD: ISD::OR, .Type: MVT::v4i32, .Cost: 5}, |
3925 | {.ISD: ISD::OR, .Type: MVT::v2i64, .Cost: 3}, |
3926 | {.ISD: ISD::XOR, .Type: MVT::v8i8, .Cost: 15}, |
3927 | {.ISD: ISD::XOR, .Type: MVT::v16i8, .Cost: 17}, |
3928 | {.ISD: ISD::XOR, .Type: MVT::v4i16, .Cost: 7}, |
3929 | {.ISD: ISD::XOR, .Type: MVT::v8i16, .Cost: 9}, |
3930 | {.ISD: ISD::XOR, .Type: MVT::v2i32, .Cost: 3}, |
3931 | {.ISD: ISD::XOR, .Type: MVT::v4i32, .Cost: 5}, |
3932 | {.ISD: ISD::XOR, .Type: MVT::v2i64, .Cost: 3}, |
3933 | {.ISD: ISD::AND, .Type: MVT::v8i8, .Cost: 15}, |
3934 | {.ISD: ISD::AND, .Type: MVT::v16i8, .Cost: 17}, |
3935 | {.ISD: ISD::AND, .Type: MVT::v4i16, .Cost: 7}, |
3936 | {.ISD: ISD::AND, .Type: MVT::v8i16, .Cost: 9}, |
3937 | {.ISD: ISD::AND, .Type: MVT::v2i32, .Cost: 3}, |
3938 | {.ISD: ISD::AND, .Type: MVT::v4i32, .Cost: 5}, |
3939 | {.ISD: ISD::AND, .Type: MVT::v2i64, .Cost: 3}, |
3940 | }; |
3941 | switch (ISD) { |
3942 | default: |
3943 | break; |
3944 | case ISD::ADD: |
3945 | if (const auto *Entry = CostTableLookup(Table: CostTblNoPairwise, ISD, Ty: MTy)) |
3946 | return (LT.first - 1) + Entry->Cost; |
3947 | break; |
3948 | case ISD::XOR: |
3949 | case ISD::AND: |
3950 | case ISD::OR: |
3951 | const auto *Entry = CostTableLookup(Table: CostTblNoPairwise, ISD, Ty: MTy); |
3952 | if (!Entry) |
3953 | break; |
3954 | auto *ValVTy = cast<FixedVectorType>(Val: ValTy); |
3955 | if (MTy.getVectorNumElements() <= ValVTy->getNumElements() && |
3956 | isPowerOf2_32(Value: ValVTy->getNumElements())) { |
      InstructionCost ExtraCost = 0;
3958 | if (LT.first != 1) { |
3959 | // Type needs to be split, so there is an extra cost of LT.first - 1 |
3960 | // arithmetic ops. |
3961 | auto *Ty = FixedVectorType::get(ElementType: ValTy->getElementType(), |
3962 | NumElts: MTy.getVectorNumElements()); |
3963 | ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind); |
3964 | ExtraCost *= LT.first - 1; |
3965 | } |
3966 | // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov |
3967 | auto Cost = ValVTy->getElementType()->isIntegerTy(Bitwidth: 1) ? 2 : Entry->Cost; |
3968 | return Cost + ExtraCost; |
3969 | } |
3970 | break; |
3971 | } |
3972 | return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind); |
3973 | } |
3974 | |
3975 | InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { |
3976 | static const CostTblEntry ShuffleTbl[] = { |
3977 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv16i8, .Cost: 1 }, |
3978 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv8i16, .Cost: 1 }, |
3979 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv4i32, .Cost: 1 }, |
3980 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv2i64, .Cost: 1 }, |
3981 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv2f16, .Cost: 1 }, |
3982 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv4f16, .Cost: 1 }, |
3983 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv8f16, .Cost: 1 }, |
3984 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv2bf16, .Cost: 1 }, |
3985 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv4bf16, .Cost: 1 }, |
3986 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv8bf16, .Cost: 1 }, |
3987 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv2f32, .Cost: 1 }, |
3988 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv4f32, .Cost: 1 }, |
3989 | { .ISD: TTI::SK_Splice, .Type: MVT::nxv2f64, .Cost: 1 }, |
3990 | }; |
3991 | |
3992 | // The code-generator is currently not able to handle scalable vectors |
3993 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
3994 | // it. This change will be removed when code-generation for these types is |
3995 | // sufficiently reliable. |
3996 | if (Tp->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
3997 | return InstructionCost::getInvalid(); |
3998 | |
3999 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp); |
4000 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: Tp->getContext()); |
4001 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
4002 | EVT PromotedVT = LT.second.getScalarType() == MVT::i1 |
4003 | ? TLI->getPromotedVTForPredicate(VT: EVT(LT.second)) |
4004 | : LT.second; |
4005 | Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Context&: Tp->getContext()); |
4006 | InstructionCost LegalizationCost = 0; |
4007 | if (Index < 0) { |
4008 | LegalizationCost = |
4009 | getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: PromotedVTy, CondTy: PromotedVTy, |
4010 | VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind) + |
4011 | getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: PromotedVTy, CondTy: LegalVTy, |
4012 | VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind); |
4013 | } |
4014 | |
  // Predicated splices are promoted when lowering (see AArch64ISelLowering.cpp),
  // so the cost is computed on the promoted type.
4017 | if (LT.second.getScalarType() == MVT::i1) { |
4018 | LegalizationCost += |
4019 | getCastInstrCost(Opcode: Instruction::ZExt, Dst: PromotedVTy, Src: LegalVTy, |
4020 | CCH: TTI::CastContextHint::None, CostKind) + |
4021 | getCastInstrCost(Opcode: Instruction::Trunc, Dst: LegalVTy, Src: PromotedVTy, |
4022 | CCH: TTI::CastContextHint::None, CostKind); |
4023 | } |
4024 | const auto *Entry = |
4025 | CostTableLookup(Table: ShuffleTbl, ISD: TTI::SK_Splice, Ty: PromotedVT.getSimpleVT()); |
4026 | assert(Entry && "Illegal Type for Splice" ); |
4027 | LegalizationCost += Entry->Cost; |
4028 | return LegalizationCost * LT.first; |
4029 | } |
4030 | |
4031 | InstructionCost AArch64TTIImpl::getShuffleCost( |
4032 | TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask, |
4033 | TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, |
4034 | ArrayRef<const Value *> Args, const Instruction *CxtI) { |
4035 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp); |
4036 | |
4037 | // If we have a Mask, and the LT is being legalized somehow, split the Mask |
4038 | // into smaller vectors and sum the cost of each shuffle. |
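  // For example (illustrative): a 16-element mask over a type legalized to
  // v8i16 is split into two 8-element sub-masks, and each chunk is re-costed
  // as a single- or two-source shuffle of the legal type.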
4039 | if (!Mask.empty() && isa<FixedVectorType>(Val: Tp) && LT.second.isVector() && |
4040 | Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() && |
4041 | Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) { |
4042 | |
    // Check for LD3/LD4 instructions, which are represented in llvm IR as
    // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
    // but we model it with a small cost (at least 1, scaling with LT.first) so
    // that LD3/LD4 still have a higher cost than just the load.
4047 | if (Args.size() >= 1 && isa<LoadInst>(Val: Args[0]) && |
4048 | (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 3) || |
4049 | ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 4))) |
4050 | return std::max<InstructionCost>(a: 1, b: LT.first / 4); |
4051 | |
4052 | // Check for ST3/ST4 instructions, which are represented in llvm IR as |
4053 | // store(interleaving-shuffle). The shuffle cost could potentially be free, |
4054 | // but we model it with a cost of LT.first so that ST3/ST4 have a higher |
4055 | // cost than just the store. |
4056 | if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(Val: *CxtI->user_begin()) && |
4057 | (ShuffleVectorInst::isInterleaveMask( |
4058 | Mask, Factor: 4, NumInputElts: Tp->getElementCount().getKnownMinValue() * 2) || |
4059 | ShuffleVectorInst::isInterleaveMask( |
4060 | Mask, Factor: 3, NumInputElts: Tp->getElementCount().getKnownMinValue() * 2))) |
4061 | return LT.first; |
4062 | |
4063 | unsigned TpNumElts = Mask.size(); |
4064 | unsigned LTNumElts = LT.second.getVectorNumElements(); |
4065 | unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts; |
4066 | VectorType *NTp = |
4067 | VectorType::get(ElementType: Tp->getScalarType(), EC: LT.second.getVectorElementCount()); |
4068 | InstructionCost Cost; |
4069 | for (unsigned N = 0; N < NumVecs; N++) { |
4070 | SmallVector<int> NMask; |
4071 | // Split the existing mask into chunks of size LTNumElts. Track the source |
4072 | // sub-vectors to ensure the result has at most 2 inputs. |
4073 | unsigned Source1, Source2; |
4074 | unsigned NumSources = 0; |
4075 | for (unsigned E = 0; E < LTNumElts; E++) { |
4076 | int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E] |
4077 | : PoisonMaskElem; |
4078 | if (MaskElt < 0) { |
4079 | NMask.push_back(Elt: PoisonMaskElem); |
4080 | continue; |
4081 | } |
4082 | |
4083 | // Calculate which source from the input this comes from and whether it |
4084 | // is new to us. |
4085 | unsigned Source = MaskElt / LTNumElts; |
4086 | if (NumSources == 0) { |
4087 | Source1 = Source; |
4088 | NumSources = 1; |
4089 | } else if (NumSources == 1 && Source != Source1) { |
4090 | Source2 = Source; |
4091 | NumSources = 2; |
4092 | } else if (NumSources >= 2 && Source != Source1 && Source != Source2) { |
4093 | NumSources++; |
4094 | } |
4095 | |
4096 | // Add to the new mask. For the NumSources>2 case these are not correct, |
4097 | // but are only used for the modular lane number. |
4098 | if (Source == Source1) |
4099 | NMask.push_back(Elt: MaskElt % LTNumElts); |
4100 | else if (Source == Source2) |
4101 | NMask.push_back(Elt: MaskElt % LTNumElts + LTNumElts); |
4102 | else |
4103 | NMask.push_back(Elt: MaskElt % LTNumElts); |
4104 | } |
4105 | // If the sub-mask has at most 2 input sub-vectors then re-cost it using |
4106 | // getShuffleCost. If not then cost it using the worst case. |
4107 | if (NumSources <= 2) |
4108 | Cost += getShuffleCost(Kind: NumSources <= 1 ? TTI::SK_PermuteSingleSrc |
4109 | : TTI::SK_PermuteTwoSrc, |
4110 | Tp: NTp, Mask: NMask, CostKind, Index: 0, SubTp: nullptr, Args, CxtI); |
4111 | else if (any_of(Range: enumerate(First&: NMask), P: [&](const auto &ME) { |
4112 | return ME.value() % LTNumElts == ME.index(); |
4113 | })) |
4114 | Cost += LTNumElts - 1; |
4115 | else |
4116 | Cost += LTNumElts; |
4117 | } |
4118 | return Cost; |
4119 | } |
4120 | |
4121 | Kind = improveShuffleKindFromMask(Kind, Mask, Ty: Tp, Index, SubTy&: SubTp); |
4122 | // Treat extractsubvector as single op permutation. |
  bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
4124 | if (IsExtractSubvector && LT.second.isFixedLengthVector()) |
4125 | Kind = TTI::SK_PermuteSingleSrc; |
4126 | |
4127 | // Check for broadcast loads, which are supported by the LD1R instruction. |
4128 | // In terms of code-size, the shuffle vector is free when a load + dup get |
4129 | // folded into a LD1R. That's what we check and return here. For performance |
4130 | // and reciprocal throughput, a LD1R is not completely free. In this case, we |
4131 | // return the cost for the broadcast below (i.e. 1 for most/all types), so |
4132 | // that we model the load + dup sequence slightly higher because LD1R is a |
4133 | // high latency instruction. |
4134 | if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) { |
4135 | bool IsLoad = !Args.empty() && isa<LoadInst>(Val: Args[0]); |
4136 | if (IsLoad && LT.second.isVector() && |
4137 | isLegalBroadcastLoad(ElementTy: Tp->getElementType(), |
4138 | NumElements: LT.second.getVectorElementCount())) |
4139 | return 0; |
4140 | } |
4141 | |
4142 | // If we have 4 elements for the shuffle and a Mask, get the cost straight |
4143 | // from the perfect shuffle tables. |
4144 | if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(MinVal: 4) && |
4145 | (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) && |
4146 | all_of(Range&: Mask, P: [](int E) { return E < 8; })) |
4147 | return getPerfectShuffleCost(M: Mask); |
4148 | |
4149 | // Check for identity masks, which we can treat as free. |
4150 | if (!Mask.empty() && LT.second.isFixedLengthVector() && |
4151 | (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) && |
4152 | all_of(Range: enumerate(First&: Mask), P: [](const auto &M) { |
4153 | return M.value() < 0 || M.value() == (int)M.index(); |
4154 | })) |
4155 | return 0; |
4156 | |
4157 | // Check for other shuffles that are not SK_ kinds but we have native |
4158 | // instructions for, for example ZIP and UZP. |
4159 | unsigned Unused; |
4160 | if (LT.second.isFixedLengthVector() && |
4161 | LT.second.getVectorNumElements() == Mask.size() && |
4162 | (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) && |
4163 | (isZIPMask(M: Mask, NumElts: LT.second.getVectorNumElements(), WhichResultOut&: Unused) || |
4164 | isUZPMask(M: Mask, NumElts: LT.second.getVectorNumElements(), WhichResultOut&: Unused) || |
4165 | // Check for non-zero lane splats |
4166 | all_of(Range: drop_begin(RangeOrContainer&: Mask), |
4167 | P: [&Mask](int M) { return M < 0 || M == Mask[0]; }))) |
4168 | return 1; |
4169 | |
4170 | if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || |
4171 | Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || |
4172 | Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) { |
4173 | static const CostTblEntry ShuffleTbl[] = { |
4174 | // Broadcast shuffle kinds can be performed with 'dup'. |
4175 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i8, .Cost: 1}, |
4176 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16i8, .Cost: 1}, |
4177 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i16, .Cost: 1}, |
4178 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i16, .Cost: 1}, |
4179 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i32, .Cost: 1}, |
4180 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i32, .Cost: 1}, |
4181 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i64, .Cost: 1}, |
4182 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4f16, .Cost: 1}, |
4183 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8f16, .Cost: 1}, |
4184 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2f32, .Cost: 1}, |
4185 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4f32, .Cost: 1}, |
4186 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2f64, .Cost: 1}, |
4187 | // Transpose shuffle kinds can be performed with 'trn1/trn2' and |
4188 | // 'zip1/zip2' instructions. |
4189 | {.ISD: TTI::SK_Transpose, .Type: MVT::v8i8, .Cost: 1}, |
4190 | {.ISD: TTI::SK_Transpose, .Type: MVT::v16i8, .Cost: 1}, |
4191 | {.ISD: TTI::SK_Transpose, .Type: MVT::v4i16, .Cost: 1}, |
4192 | {.ISD: TTI::SK_Transpose, .Type: MVT::v8i16, .Cost: 1}, |
4193 | {.ISD: TTI::SK_Transpose, .Type: MVT::v2i32, .Cost: 1}, |
4194 | {.ISD: TTI::SK_Transpose, .Type: MVT::v4i32, .Cost: 1}, |
4195 | {.ISD: TTI::SK_Transpose, .Type: MVT::v2i64, .Cost: 1}, |
4196 | {.ISD: TTI::SK_Transpose, .Type: MVT::v4f16, .Cost: 1}, |
4197 | {.ISD: TTI::SK_Transpose, .Type: MVT::v8f16, .Cost: 1}, |
4198 | {.ISD: TTI::SK_Transpose, .Type: MVT::v2f32, .Cost: 1}, |
4199 | {.ISD: TTI::SK_Transpose, .Type: MVT::v4f32, .Cost: 1}, |
4200 | {.ISD: TTI::SK_Transpose, .Type: MVT::v2f64, .Cost: 1}, |
4201 | // Select shuffle kinds. |
4202 | // TODO: handle vXi8/vXi16. |
4203 | {.ISD: TTI::SK_Select, .Type: MVT::v2i32, .Cost: 1}, // mov. |
4204 | {.ISD: TTI::SK_Select, .Type: MVT::v4i32, .Cost: 2}, // rev+trn (or similar). |
4205 | {.ISD: TTI::SK_Select, .Type: MVT::v2i64, .Cost: 1}, // mov. |
4206 | {.ISD: TTI::SK_Select, .Type: MVT::v2f32, .Cost: 1}, // mov. |
4207 | {.ISD: TTI::SK_Select, .Type: MVT::v4f32, .Cost: 2}, // rev+trn (or similar). |
4208 | {.ISD: TTI::SK_Select, .Type: MVT::v2f64, .Cost: 1}, // mov. |
4209 | // PermuteSingleSrc shuffle kinds. |
4210 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i32, .Cost: 1}, // mov. |
4211 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i32, .Cost: 3}, // perfectshuffle worst case. |
4212 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i64, .Cost: 1}, // mov. |
4213 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f32, .Cost: 1}, // mov. |
4214 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f32, .Cost: 3}, // perfectshuffle worst case. |
4215 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f64, .Cost: 1}, // mov. |
4216 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i16, .Cost: 3}, // perfectshuffle worst case. |
4217 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f16, .Cost: 3}, // perfectshuffle worst case. |
        {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4bf16, .Cost: 3}, // perfectshuffle worst case.
4219 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i16, .Cost: 8}, // constpool + load + tbl |
4220 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f16, .Cost: 8}, // constpool + load + tbl |
4221 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8bf16, .Cost: 8}, // constpool + load + tbl |
4222 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i8, .Cost: 8}, // constpool + load + tbl |
4223 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i8, .Cost: 8}, // constpool + load + tbl |
4224 | // Reverse can be lowered with `rev`. |
4225 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2i32, .Cost: 1}, // REV64 |
4226 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4i32, .Cost: 2}, // REV64; EXT |
4227 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2i64, .Cost: 1}, // EXT |
4228 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2f32, .Cost: 1}, // REV64 |
4229 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4f32, .Cost: 2}, // REV64; EXT |
4230 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2f64, .Cost: 1}, // EXT |
4231 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8f16, .Cost: 2}, // REV64; EXT |
4232 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8i16, .Cost: 2}, // REV64; EXT |
4233 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16i8, .Cost: 2}, // REV64; EXT |
4234 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4f16, .Cost: 1}, // REV64 |
4235 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4i16, .Cost: 1}, // REV64 |
4236 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8i8, .Cost: 1}, // REV64 |
4237 | // Splice can all be lowered as `ext`. |
4238 | {.ISD: TTI::SK_Splice, .Type: MVT::v2i32, .Cost: 1}, |
4239 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i32, .Cost: 1}, |
4240 | {.ISD: TTI::SK_Splice, .Type: MVT::v2i64, .Cost: 1}, |
4241 | {.ISD: TTI::SK_Splice, .Type: MVT::v2f32, .Cost: 1}, |
4242 | {.ISD: TTI::SK_Splice, .Type: MVT::v4f32, .Cost: 1}, |
4243 | {.ISD: TTI::SK_Splice, .Type: MVT::v2f64, .Cost: 1}, |
4244 | {.ISD: TTI::SK_Splice, .Type: MVT::v8f16, .Cost: 1}, |
4245 | {.ISD: TTI::SK_Splice, .Type: MVT::v8bf16, .Cost: 1}, |
4246 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i16, .Cost: 1}, |
4247 | {.ISD: TTI::SK_Splice, .Type: MVT::v16i8, .Cost: 1}, |
4248 | {.ISD: TTI::SK_Splice, .Type: MVT::v4bf16, .Cost: 1}, |
4249 | {.ISD: TTI::SK_Splice, .Type: MVT::v4f16, .Cost: 1}, |
4250 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i16, .Cost: 1}, |
4251 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i8, .Cost: 1}, |
4252 | // Broadcast shuffle kinds for scalable vectors |
4253 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv16i8, .Cost: 1}, |
4254 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8i16, .Cost: 1}, |
4255 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4i32, .Cost: 1}, |
4256 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2i64, .Cost: 1}, |
4257 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f16, .Cost: 1}, |
4258 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4f16, .Cost: 1}, |
4259 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8f16, .Cost: 1}, |
4260 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2bf16, .Cost: 1}, |
4261 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4bf16, .Cost: 1}, |
4262 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8bf16, .Cost: 1}, |
4263 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f32, .Cost: 1}, |
4264 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4f32, .Cost: 1}, |
4265 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f64, .Cost: 1}, |
4266 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv16i1, .Cost: 1}, |
4267 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8i1, .Cost: 1}, |
4268 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4i1, .Cost: 1}, |
4269 | {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2i1, .Cost: 1}, |
4270 | // Handle the cases for vector.reverse with scalable vectors |
4271 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv16i8, .Cost: 1}, |
4272 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8i16, .Cost: 1}, |
4273 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4i32, .Cost: 1}, |
4274 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2i64, .Cost: 1}, |
4275 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f16, .Cost: 1}, |
4276 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4f16, .Cost: 1}, |
4277 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8f16, .Cost: 1}, |
4278 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2bf16, .Cost: 1}, |
4279 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4bf16, .Cost: 1}, |
4280 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8bf16, .Cost: 1}, |
4281 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f32, .Cost: 1}, |
4282 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4f32, .Cost: 1}, |
4283 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f64, .Cost: 1}, |
4284 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv16i1, .Cost: 1}, |
4285 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8i1, .Cost: 1}, |
4286 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4i1, .Cost: 1}, |
4287 | {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2i1, .Cost: 1}, |
4288 | }; |
4289 | if (const auto *Entry = CostTableLookup(Table: ShuffleTbl, ISD: Kind, Ty: LT.second)) |
4290 | return LT.first * Entry->Cost; |
4291 | } |
4292 | |
4293 | if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Val: Tp)) |
4294 | return getSpliceCost(Tp, Index); |
4295 | |
4296 | // Inserting a subvector can often be done with either a D, S or H register |
4297 | // move, so long as the inserted vector is "aligned". |
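  // For example (illustrative): inserting a v2f32 subvector at index 0 or 2 of
  // a v4f32 satisfies the alignment checks below and is costed at SubLT.first.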
4298 | if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() && |
4299 | LT.second.getSizeInBits() <= 128 && SubTp) { |
4300 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(Ty: SubTp); |
4301 | if (SubLT.second.isVector()) { |
4302 | int NumElts = LT.second.getVectorNumElements(); |
4303 | int NumSubElts = SubLT.second.getVectorNumElements(); |
4304 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) |
4305 | return SubLT.first; |
4306 | } |
4307 | } |
4308 | |
4309 | // Restore optimal kind. |
4310 | if (IsExtractSubvector) |
4311 | Kind = TTI::SK_ExtractSubvector; |
4312 | return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args, |
4313 | CxtI); |
4314 | } |
4315 | |
4316 | static bool containsDecreasingPointers(Loop *TheLoop, |
4317 | PredicatedScalarEvolution *PSE) { |
4318 | const auto &Strides = DenseMap<Value *, const SCEV *>(); |
4319 | for (BasicBlock *BB : TheLoop->blocks()) { |
4320 | // Scan the instructions in the block and look for addresses that are |
4321 | // consecutive and decreasing. |
4322 | for (Instruction &I : *BB) { |
4323 | if (isa<LoadInst>(Val: &I) || isa<StoreInst>(Val: &I)) { |
4324 | Value *Ptr = getLoadStorePointerOperand(V: &I); |
4325 | Type *AccessTy = getLoadStoreType(I: &I); |
4326 | if (getPtrStride(PSE&: *PSE, AccessTy, Ptr, Lp: TheLoop, StridesMap: Strides, /*Assume=*/true, |
4327 | /*ShouldCheckWrap=*/false) |
4328 | .value_or(u: 0) < 0) |
4329 | return true; |
4330 | } |
4331 | } |
4332 | } |
4333 | return false; |
4334 | } |
4335 | |
4336 | bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) { |
4337 | if (!ST->hasSVE()) |
4338 | return false; |
4339 | |
4340 | // We don't currently support vectorisation with interleaving for SVE - with |
4341 | // such loops we're better off not using tail-folding. This gives us a chance |
4342 | // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc. |
4343 | if (TFI->IAI->hasGroups()) |
4344 | return false; |
4345 | |
4346 | TailFoldingOpts Required = TailFoldingOpts::Disabled; |
4347 | if (TFI->LVL->getReductionVars().size()) |
4348 | Required |= TailFoldingOpts::Reductions; |
4349 | if (TFI->LVL->getFixedOrderRecurrences().size()) |
4350 | Required |= TailFoldingOpts::Recurrences; |
4351 | |
4352 | // We call this to discover whether any load/store pointers in the loop have |
4353 | // negative strides. This will require extra work to reverse the loop |
4354 | // predicate, which may be expensive. |
4355 | if (containsDecreasingPointers(TheLoop: TFI->LVL->getLoop(), |
4356 | PSE: TFI->LVL->getPredicatedScalarEvolution())) |
4357 | Required |= TailFoldingOpts::Reverse; |
4358 | if (Required == TailFoldingOpts::Disabled) |
4359 | Required |= TailFoldingOpts::Simple; |
4360 | |
4361 | if (!TailFoldingOptionLoc.satisfies(DefaultBits: ST->getSVETailFoldingDefaultOpts(), |
4362 | Required)) |
4363 | return false; |
4364 | |
4365 | // Don't tail-fold for tight loops where we would be better off interleaving |
4366 | // with an unpredicated loop. |
4367 | unsigned NumInsns = 0; |
4368 | for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) { |
4369 | NumInsns += BB->sizeWithoutDebug(); |
4370 | } |
4371 | |
  // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
4373 | return NumInsns >= SVETailFoldInsnThreshold; |
4374 | } |
4375 | |
4376 | InstructionCost |
4377 | AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, |
4378 | StackOffset BaseOffset, bool HasBaseReg, |
4379 | int64_t Scale, unsigned AddrSpace) const { |
4380 | // Scaling factors are not free at all. |
4381 | // Operands | Rt Latency |
4382 | // ------------------------------------------- |
4383 | // Rt, [Xn, Xm] | 4 |
4384 | // ------------------------------------------- |
4385 | // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 |
4386 | // Rt, [Xn, Wm, <extend> #imm] | |
4387 | TargetLoweringBase::AddrMode AM; |
4388 | AM.BaseGV = BaseGV; |
4389 | AM.BaseOffs = BaseOffset.getFixed(); |
4390 | AM.HasBaseReg = HasBaseReg; |
4391 | AM.Scale = Scale; |
4392 | AM.ScalableOffset = BaseOffset.getScalable(); |
4393 | if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AS: AddrSpace)) |
4394 | // Scale represents reg2 * scale, thus account for 1 if |
4395 | // it is not equal to 0 or 1. |
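    // For example (illustrative): [Xn, Xm, lsl #2] (Scale == 4) costs 1, while
    // [Xn, Xm] (Scale == 1) and a plain base register (Scale == 0) are free.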
4396 | return AM.Scale != 0 && AM.Scale != 1; |
4397 | return -1; |
4398 | } |
4399 | |
4400 | bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) { |
  // For binary operators (e.g. or) we need to be more careful than with
  // selects; here we only transform them if they are already at a natural
  // break point in the code - the end of a block with an unconditional
  // terminator.
4405 | if (EnableOrLikeSelectOpt && I->getOpcode() == Instruction::Or && |
4406 | isa<BranchInst>(Val: I->getNextNode()) && |
4407 | cast<BranchInst>(Val: I->getNextNode())->isUnconditional()) |
4408 | return true; |
4409 | return BaseT::shouldTreatInstructionLikeSelect(I); |
4410 | } |
4411 | |
4412 | bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, |
4413 | const TargetTransformInfo::LSRCost &C2) { |
  // The AArch64-specific change here is to add the number of instructions to
  // the comparison (though not as the first consideration, as some targets
  // do), along with changing the priority of the base additions.
4417 | // TODO: Maybe a more nuanced tradeoff between instruction count |
4418 | // and number of registers? To be investigated at a later date. |
4419 | if (EnableLSRCostOpt) |
4420 | return std::tie(args: C1.NumRegs, args: C1.Insns, args: C1.NumBaseAdds, args: C1.AddRecCost, |
4421 | args: C1.NumIVMuls, args: C1.ScaleCost, args: C1.ImmCost, args: C1.SetupCost) < |
4422 | std::tie(args: C2.NumRegs, args: C2.Insns, args: C2.NumBaseAdds, args: C2.AddRecCost, |
4423 | args: C2.NumIVMuls, args: C2.ScaleCost, args: C2.ImmCost, args: C2.SetupCost); |
4424 | |
4425 | return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); |
4426 | } |
4427 | |