//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ARMTargetTransformInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/TargetParser/SubtargetFeature.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <optional>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "armtti"

static cl::opt<bool> EnableMaskedLoadStores(
    "enable-arm-maskedldst", cl::Hidden, cl::init(true),
    cl::desc("Enable the generation of masked loads and stores"));

static cl::opt<bool> DisableLowOverheadLoops(
    "disable-arm-loloops", cl::Hidden, cl::init(false),
    cl::desc("Disable the generation of low-overhead loops"));

static cl::opt<bool>
    AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
                  cl::desc("Enable the generation of WLS loops"));

static cl::opt<bool> UseWidenGlobalArrays(
    "widen-global-strings", cl::Hidden, cl::init(true),
    cl::desc("Enable the widening of global strings to alignment boundaries"));

extern cl::opt<TailPredication::Mode> EnableTailPredication;

extern cl::opt<bool> EnableMaskedGatherScatters;

extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;

/// Convert a vector load intrinsic into a simple llvm load instruction.
/// This is beneficial when the underlying object being addressed comes
/// from a constant, since we get constant-folding for free.
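/// For example (illustrative IR, assuming a suitably aligned pointer %p):
///   %v = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr %p, i32 4)
/// can be rewritten as a plain
///   %v = load <4 x i32>, ptr %p, align 4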
static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
                               InstCombiner::BuilderTy &Builder) {
  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));

  if (!IntrAlign)
    return nullptr;

  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
                           ? MemAlign
                           : IntrAlign->getLimitedValue();

  if (!isPowerOf2_32(Alignment))
    return nullptr;

  return Builder.CreateAlignedLoad(II.getType(), II.getArgOperand(0),
                                   Align(Alignment));
}

bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // To inline a callee, all features not in the allowed list must match
  // exactly.
  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
                    (CalleeBits & ~InlineFeaturesAllowed);
  // For features in the allowed list, the callee's features must be a subset
  // of the caller's.
  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
                     (CalleeBits & InlineFeaturesAllowed);
  return MatchExact && MatchSubset;
}

TTI::AddressingModeKind
ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
                                       ScalarEvolution *SE) const {
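  // MVE loads and stores can fold a post-increment of the base pointer into
  // the memory operation, so post-indexed addressing tends to be preferable
  // for MVE-enabled targets.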
  if (ST->hasMVEIntegerOps())
    return TTI::AMK_PostIndexed;

  if (L->getHeader()->getParent()->hasOptSize())
    return TTI::AMK_None;

  if (ST->isMClass() && ST->isThumb2() &&
      L->getNumBlocks() == 1)
    return TTI::AMK_PreIndexed;

  return TTI::AMK_None;
}

std::optional<Instruction *>
ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  using namespace PatternMatch;
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::arm_neon_vld1: {
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  }

  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    unsigned AlignArg = II.arg_size() - 1;
    Value *AlignArgOp = II.getArgOperand(AlignArg);
    MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
    if (Align && *Align < MemAlign) {
      return IC.replaceOperand(
          II, AlignArg,
          ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
                           false));
    }
    break;
  }

  case Intrinsic::arm_neon_vld1x2:
  case Intrinsic::arm_neon_vld1x3:
  case Intrinsic::arm_neon_vld1x4:
  case Intrinsic::arm_neon_vst1x2:
  case Intrinsic::arm_neon_vst1x3:
  case Intrinsic::arm_neon_vst1x4: {
    Align NewAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    Align OldAlign = II.getParamAlign(0).valueOrOne();
    if (NewAlign > OldAlign)
      II.addParamAttr(0,
                      Attribute::getWithAlignment(II.getContext(), NewAlign));
    break;
  }

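  // The i2v(v2i(x)) round trip below is a no-op when the predicate types
  // match, so the conversion pair can be folded away. Sketch of the pattern
  // (illustrative IR):
  //   %i = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %p)
  //   %v = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %i)  ; --> %p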
  case Intrinsic::arm_mve_pred_i2v: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                       PatternMatch::m_Value(ArgArg))) &&
        II.getType() == ArgArg->getType()) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
    Constant *XorMask;
    if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                             PatternMatch::m_Value(ArgArg)),
                         PatternMatch::m_Constant(XorMask))) &&
        II.getType() == ArgArg->getType()) {
      if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
        if (CI->getValue().trunc(16).isAllOnes()) {
          auto TrueVector = IC.Builder.CreateVectorSplat(
              cast<FixedVectorType>(II.getType())->getNumElements(),
              IC.Builder.getTrue());
          return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
        }
      }
    }
    KnownBits ScalarKnown(32);
    if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
                                ScalarKnown)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_pred_v2i: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
                       PatternMatch::m_Value(ArgArg)))) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }

    if (II.getMetadata(LLVMContext::MD_range))
      break;

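    // An MVE predicate only ever has its bottom 16 bits set, so the scalar
    // result is known to lie in [0, 0x10000).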
    ConstantRange Range(APInt(32, 0), APInt(32, 0x10000));

    if (auto CurrentRange = II.getRange()) {
      Range = Range.intersectWith(*CurrentRange);
      if (Range == CurrentRange)
        break;
    }

    II.addRangeRetAttr(Range);
    II.addRetAttr(Attribute::NoUndef);
    return &II;
  }
  case Intrinsic::arm_mve_vadc:
  case Intrinsic::arm_mve_vadc_predicated: {
    unsigned CarryOp =
        (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
    assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
           "Bad type for intrinsic!");

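    // Only bit 29 of the carry-in operand is consumed (the position of the
    // carry flag within FPSCR), so all other bits of that operand can be
    // simplified away.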
    KnownBits CarryKnown(32);
    if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
                                CarryKnown)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_vmldava: {
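    // If the only use of the result is an add, and the current accumulator
    // operand is zero, the add can be folded into the intrinsic (operand
    // names are illustrative):
    //   add(vmldava(a, b, c, 0, x, y), z) --> vmldava(a, b, c, z, x, y)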
    Instruction *I = cast<Instruction>(&II);
    if (I->hasOneUse()) {
      auto *User = cast<Instruction>(*I->user_begin());
      Value *OpZ;
      if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
          match(I->getOperand(3), m_Zero())) {
        Value *OpX = I->getOperand(4);
        Value *OpY = I->getOperand(5);
        Type *OpTy = OpX->getType();

        IC.Builder.SetInsertPoint(User);
        Value *V =
            IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
                                       {I->getOperand(0), I->getOperand(1),
                                        I->getOperand(2), OpZ, OpX, OpY});

        IC.replaceInstUsesWith(*User, V);
        return IC.eraseInstFromFunction(*User);
      }
    }
    return std::nullopt;
  }
  }
  return std::nullopt;
}

std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
    APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {

  // Compute the demanded elements for a narrowing MVE intrinsic. TopOpc is the
  // index of the operand that specifies whether this is a Top or Bottom
  // instruction, which differs between intrinsics.
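  // For example, a "top" narrowing operation writes the odd lanes of its
  // result and takes the even lanes from operand 0, so only the even lanes
  // of operand 0 are demanded (and vice versa for a "bottom" operation).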
  auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
    unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
    unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();

    // Only the odd/even lanes of operand 0 will be demanded, depending on
    // whether this is a top/bottom instruction.
    APInt DemandedElts =
        APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                       : APInt::getHighBitsSet(2, 1));
    SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
    // The other lanes will be defined from the inserted elements.
    UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                                : APInt::getHighBitsSet(2, 1));
    return std::nullopt;
  };

  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::arm_mve_vcvt_narrow:
    SimplifyNarrowInstrTopBottom(2);
    break;
  case Intrinsic::arm_mve_vqmovn:
    SimplifyNarrowInstrTopBottom(4);
    break;
  case Intrinsic::arm_mve_vshrn:
    SimplifyNarrowInstrTopBottom(7);
    break;
  }

  return std::nullopt;
}

InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) const {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Imm.getActiveBits() >= 64)
    return 4;

  int64_t SImmVal = Imm.getSExtValue();
  uint64_t ZImmVal = Imm.getZExtValue();
  if (!ST->isThumb()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  if (ST->isThumb2()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  // Thumb1: any i8 immediate costs 1.
  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
    return 1;
  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
    return 2;
  // Load from constant pool.
  return 3;
}

// Constants smaller than 256 fit in the immediate field of Thumb1
// instructions, so we return a cost of zero for them and 1 otherwise.
InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm,
                                                  Type *Ty) const {
  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
    return 0;

  return 1;
}

// Checks whether Inst is part of a min(max()) or max(min()) pattern
// that will match to an SSAT instruction. Returns the instruction being
// saturated, or null if no saturation pattern was found.
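// For example, with Imm == -128 the pattern
//   smax(smin(%x, 127), -128)
// clamps %x to the signed 8-bit range and can be selected as SSAT.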
static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  Value *LHS, *RHS;
  ConstantInt *C;
  SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;

  if (InstSPF == SPF_SMAX &&
      PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
      C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {

    auto isSSatMin = [&](Value *MinInst) {
      if (isa<SelectInst>(MinInst)) {
        Value *MinLHS, *MinRHS;
        ConstantInt *MinC;
        SelectPatternFlavor MinSPF =
            matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
        if (MinSPF == SPF_SMIN &&
            PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
            MinC->getValue() == ((-Imm) - 1))
          return true;
      }
      return false;
    };

    if (isSSatMin(Inst->getOperand(1)))
      return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
    if (Inst->hasNUses(2) &&
        (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
      return Inst->getOperand(1);
  }
  return nullptr;
}

// Look for an FP saturation pattern, where the instruction can be simplified
// to a fptosi.sat: max(min(fptosi)). The constant in this case is always free.
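// For example, a clamp such as
//   smax(smin(fptosi double %x to i64, 2147483647), -2147483648)
// can be selected as a saturating conversion, so the i64 -2147483648
// immediate never needs to be materialised.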
static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  if (Imm.getBitWidth() != 64 ||
      Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
    return false;
  Value *FP = isSSATMinMaxPattern(Inst, Imm);
  if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
    FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
  if (!FP)
    return false;
  return isa<FPToSIInst>(FP);
}

InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) const {
  // Division by a constant can be turned into multiplication, but only if we
  // know it's constant. So it's not so much that the immediate is cheap (it's
  // not), but that the alternative is worse.
  // FIXME: this is probably unneeded with GlobalISel.
  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
       Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
      Idx == 1)
    return 0;

  // Leave any gep offsets for CodeGenPrepare, which will do a better job at
  // splitting any large offsets.
  if (Opcode == Instruction::GetElementPtr && Idx != 0)
    return 0;

  if (Opcode == Instruction::And) {
    // UXTB/UXTH
    if (Imm == 255 || Imm == 65535)
      return 0;
    // Conversion to BIC is free, and means we can use ~Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(~Imm, Ty, CostKind));
  }

  if (Opcode == Instruction::Add)
    // Conversion to SUB is free, and means we can use -Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(-Imm, Ty, CostKind));

  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
      Ty->getIntegerBitWidth() == 32) {
    int64_t NegImm = -Imm.getSExtValue();
    if (ST->isThumb2() && NegImm < 1 << 12)
      // icmp X, #-C -> cmn X, #C
      return 0;
    if (ST->isThumb() && NegImm < 1 << 8)
      // icmp X, #-C -> adds X, #C
      return 0;
  }

  // xor a, -1 can always be folded to MVN.
  if (Opcode == Instruction::Xor && Imm.isAllOnes())
    return 0;

  // Ensure that negative constants in min(max()) or max(min()) patterns that
  // match SSAT instructions don't get hoisted.
  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
      Ty->getIntegerBitWidth() <= 32) {
    if (isSSATMinMaxPattern(Inst, Imm) ||
        (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
         isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
      return 0;
  }

  if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
    return 0;

  // We can convert <= -1 to < 0, which is generally quite cheap.
  if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
    ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
    if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
      return std::min(getIntImmCost(Imm, Ty, CostKind),
                      getIntImmCost(Imm + 1, Ty, CostKind));
  }

  return getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) const {
  if (CostKind == TTI::TCK_RecipThroughput &&
      (ST->hasNEON() || ST->hasMVEIntegerOps())) {
    // FIXME: The vectorizer is highly sensitive to the cost of these
    // instructions, which suggests that it may be using the costs incorrectly.
    // But, for now, just make them free to avoid performance regressions for
    // vector targets.
    return 0;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}

InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
                                             TTI::CastContextHint CCH,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) const {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // TODO: Allow non-throughput costs that aren't binary.
  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost == 0 ? 0 : 1;
    return Cost;
  };
  auto IsLegalFPType = [this](EVT VT) {
    EVT EltVT = VT.getScalarType();
    return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
           (EltVT == MVT::f64 && ST->hasFP64()) ||
           (EltVT == MVT::f16 && ST->hasFullFP16());
  };

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return AdjustCost(
        BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));

  // Extending masked loads and truncating masked stores are expensive because
  // we currently don't split them. This means that we'll likely end up
  // loading/storing each element individually (hence the high cost).
  if ((ST->hasMVEIntegerOps() &&
       (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
        Opcode == Instruction::SExt)) ||
      (ST->hasMVEFloatOps() &&
       (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
       IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
    if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
      return 2 * DstTy.getVectorNumElements() *
             ST->getMVEVectorCostFactor(CostKind);

  // The extension of other kinds of loads is free.
  if (CCH == TTI::CastContextHint::Normal ||
      CCH == TTI::CastContextHint::Masked) {
    static const TypeConversionCostTblEntry LoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
    };
    if (const auto *Entry = ConvertCostTableLookup(
            LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);

    static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        // The following extend from a legal type to an illegal type, so the
        // load needs to be split. This introduces an extra load operation,
        // but the extend is still "free".
        {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
        {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVELoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
        // FPExtends are similar but also require the VCVT instructions.
        {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    // The truncate of a store is free. This is the mirror of the extends above.
    static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
        {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
        {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
        {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
        {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }
  }

  // NEON vector operations that can extend their inputs.
  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
      I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
    static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
        // vaddl
        {ISD::ADD, MVT::v4i32, MVT::v4i16, 0},
        {ISD::ADD, MVT::v8i16, MVT::v8i8, 0},
        // vsubl
        {ISD::SUB, MVT::v4i32, MVT::v4i16, 0},
        {ISD::SUB, MVT::v8i16, MVT::v8i8, 0},
        // vmull
        {ISD::MUL, MVT::v4i32, MVT::v4i16, 0},
        {ISD::MUL, MVT::v8i16, MVT::v8i8, 0},
        // vshll
        {ISD::SHL, MVT::v4i32, MVT::v4i16, 0},
        {ISD::SHL, MVT::v8i16, MVT::v8i8, 0},
    };

    auto *User = cast<Instruction>(*I->user_begin());
    int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
    if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
                                             DstTy.getSimpleVT(),
                                             SrcTy.getSimpleVT())) {
      return AdjustCost(Entry->Cost);
    }
  }

  // Single to/from double precision conversions.
  if (Src->isVectorTy() && ST->hasNEON() &&
      ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
        DstTy.getScalarType() == MVT::f32) ||
       (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
        DstTy.getScalarType() == MVT::f64))) {
    static const CostTblEntry NEONFltDblTbl[] = {
        // Vector fptrunc/fpext conversions.
        {ISD::FP_ROUND, MVT::v2f64, 2},
        {ISD::FP_EXTEND, MVT::v2f32, 2},
        {ISD::FP_EXTEND, MVT::v4f32, 4}};

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
    if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
      return AdjustCost(LT.first * Entry->Cost);
  }

  // Some arithmetic, load and store operations have specific instructions
  // to cast up/down their types automatically at no extra cost.
  // TODO: Get these tables to know at least what the related operations are.
  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },

    // The number of vmovl instructions for the extension.
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // Operations that we legalize using splitting.
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },

    // Vector float <-> i32 conversions.
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },

    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },

    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },

    // Vector double <-> i32 conversions.
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },

    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },

    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
    { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
  };

  if (SrcTy.isVector() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar float to integer conversions.
  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
    { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
    { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
  };
  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar integer to float conversions.
  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
    { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
  };

  if (SrcTy.isInteger() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
  // are linearised, so they cost more.
  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
  };

  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
    if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
  }

  if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
    // As a general rule, fp converts that were not matched above are
    // scalarized and cost 1 vcvt per lane, so long as the instruction is
    // available. If not, they become a series of function calls.
    const InstructionCost CallCost =
        getCallInstrCost(nullptr, Dst, {Src}, CostKind);
    int Lanes = 1;
    if (SrcTy.isFixedLengthVector())
      Lanes = SrcTy.getVectorNumElements();

    if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
      return Lanes;
    else
      return Lanes * CallCost;
  }

  if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
      SrcTy.isFixedLengthVector()) {
    // Treat a truncate with a larger than legal source (128 bits for MVE) as
    // expensive, 2 instructions per lane.
    if ((SrcTy.getScalarType() == MVT::i8 ||
         SrcTy.getScalarType() == MVT::i16 ||
         SrcTy.getScalarType() == MVT::i32) &&
        SrcTy.getSizeInBits() > 128 &&
        SrcTy.getSizeInBits() > DstTy.getSizeInBits())
      return SrcTy.getVectorNumElements() * 2;
  }

  // Scalar integer conversion costs.
  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
    // i16 -> i64 requires two dependent operations.
    { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },

    // Truncates on i64 are assumed to be free.
    { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
    { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
    { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
    { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
  };

  if (SrcTy.isInteger()) {
    if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
                     ? ST->getMVEVectorCostFactor(CostKind)
                     : 1;
  return AdjustCost(
      BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}

InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, const Value *Op0,
                                               const Value *Op1) const {
  // Penalize inserting into a D-subregister. We end up with a three times
  // lower estimated throughput on Swift.
  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
      ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
    return 3;

  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
                        Opcode == Instruction::ExtractElement)) {
    // Cross-class copies are expensive on many microarchitectures,
    // so assume they are expensive by default.
    if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
      return 3;

    // Even if it's not a cross-class copy, this likely leads to mixing
    // of NEON and VFP code and should therefore be penalized.
    if (ValTy->isVectorTy() &&
        ValTy->getScalarSizeInBits() <= 32)
      return std::max<InstructionCost>(
          BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
          2U);
  }

  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
                                 Opcode == Instruction::ExtractElement)) {
    // Integer cross-lane moves are more expensive than float, which can
    // sometimes just be vmovs. Integers involve being passed to GPR
    // registers, causing more of a delay.
    std::pair<InstructionCost, MVT> LT =
        getTypeLegalizationCost(ValTy->getScalarType());
    return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
  }

  return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
}

InstructionCost ARMTTIImpl::getCmpSelInstrCost(
    unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
    TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
    TTI::OperandValueInfo Op2Info, const Instruction *I) const {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Thumb scalar code size cost for select.
  if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
      ST->isThumb() && !ValTy->isVectorTy()) {
    // Assume expensive structs.
    if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
      return TTI::TCC_Expensive;

    // Select costs can vary because they:
    // - may require one or more conditional mov (including an IT),
    // - can't operate directly on immediates,
    // - require live flags, which we can't copy around easily.
    InstructionCost Cost = getTypeLegalizationCost(ValTy).first;

    // Possible IT instruction for Thumb2, or more for Thumb1.
    ++Cost;

    // i1 values may need rematerialising by using mov immediates and/or
    // flag setting instructions.
    if (ValTy->isIntegerTy(1))
      ++Cost;

    return Cost;
  }

  // If this is a vector min/max/abs, use the cost of that intrinsic directly
  // instead. Hopefully when min/max intrinsics are more prevalent this code
  // will not be needed.
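  // For example, the select in `select (icmp slt %x, %y), %x, %y` is costed
  // as a call to llvm.smin(%x, %y), and the icmp feeding it is free.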
  const Instruction *Sel = I;
  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
      Sel->hasOneUse())
    Sel = cast<Instruction>(Sel->user_back());
  if (Sel && ValTy->isVectorTy() &&
      (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
    const Value *LHS, *RHS;
    SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
    unsigned IID = 0;
    switch (SPF) {
    case SPF_ABS:
      IID = Intrinsic::abs;
      break;
    case SPF_SMIN:
      IID = Intrinsic::smin;
      break;
    case SPF_SMAX:
      IID = Intrinsic::smax;
      break;
    case SPF_UMIN:
      IID = Intrinsic::umin;
      break;
    case SPF_UMAX:
      IID = Intrinsic::umax;
      break;
    case SPF_FMINNUM:
      IID = Intrinsic::minnum;
      break;
    case SPF_FMAXNUM:
      IID = Intrinsic::maxnum;
      break;
    default:
      break;
    }
    if (IID) {
      // The ICmp is free; the select gets the cost of the min/max/etc.
      if (Sel != I)
        return 0;
      IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
      return getIntrinsicInstrCost(CostAttrs, CostKind);
    }
  }

  // On NEON a vector select gets lowered to vbsl.
  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
    // Lowering of some vector selects is currently far from perfect.
    static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
    return LT.first;
  }

  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
      (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
      cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
    FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
    FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
    if (!VecCondTy)
      VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));

    // If we don't have mve.fp, any fp operations will need to be scalarized.
    if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
      // One scalarization insert, one scalarization extract and the cost of
      // the fcmps.
      return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
                                             /*Extract*/ true, CostKind) +
             BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
                                             /*Extract*/ false, CostKind) +
             VecValTy->getNumElements() *
                 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
                                    VecCondTy->getScalarType(), VecPred,
                                    CostKind, Op1Info, Op2Info, I);
    }

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
    int BaseCost = ST->getMVEVectorCostFactor(CostKind);
    // There are two types - the input that specifies the type of the compare
    // and the output vXi1 type. Because we don't know how the output will be
    // split, we may need an expensive shuffle to get two in sync. This has the
    // effect of making larger than legal compares (v8i32 for example)
    // expensive.
    if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
      if (LT.first > 1)
        return LT.first * BaseCost +
               BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
                                               /*Extract*/ false, CostKind);
      return BaseCost;
    }
  }

  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
  // for "multiple beats" potentially needed by MVE instructions.
  int BaseCost = 1;
  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
    BaseCost = ST->getMVEVectorCostFactor(CostKind);

  return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred,
                                              CostKind, Op1Info, Op2Info, I);
}

InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
                                                      ScalarEvolution *SE,
                                                      const SCEV *Ptr) const {
  // Address computations in vectorized code with non-consecutive addresses
  // will likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;
  int MaxMergeDistance = 64;

  if (ST->hasNEON()) {
    if (Ty->isVectorTy() && SE &&
        !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
      return NumVectorInstToHideOverhead;

    // In many cases the address computation is not merged into the instruction
    // addressing mode.
    return 1;
  }
  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}

bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) const {
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
    // If a VCTP is part of a chain, it's already profitable and shouldn't be
    // optimized, else LSR may block tail-predication.
    switch (II->getIntrinsicID()) {
    case Intrinsic::arm_mve_vctp8:
    case Intrinsic::arm_mve_vctp16:
    case Intrinsic::arm_mve_vctp32:
    case Intrinsic::arm_mve_vctp64:
      return true;
    default:
      break;
    }
  }
  return false;
}

bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment,
                                   unsigned /*AddressSpace*/) const {
  if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
    return false;

  if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
    // Don't support v2i1 yet.
    if (VecTy->getNumElements() == 2)
      return false;

    // We don't support extending fp types.
    unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
    if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
      return false;
  }

  unsigned EltWidth = DataTy->getScalarSizeInBits();
  return (EltWidth == 32 && Alignment >= 4) ||
         (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
}

bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const {
  if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
    return false;

  unsigned EltWidth = Ty->getScalarSizeInBits();
  return ((EltWidth == 32 && Alignment >= 4) ||
          (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
}

/// Given a memcpy/memset/memmove instruction, return the number of memory
/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if
/// a call is used.
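/// For example, a constant-size memcpy that findOptimalMemOpLowering breaks
/// into four i32 accesses is reported as 4 * 2 = 8 operations (each element
/// is both loaded and stored), while a memset of the same shape would report
/// 4 * 1 = 4.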
int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
  MemOp MOp;
  unsigned DstAddrSpace = ~0u;
  unsigned SrcAddrSpace = ~0u;
  const Function *F = I->getParent()->getParent();

  if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
    ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
    // If 'size' is not a constant, a library call will be generated.
    if (!C)
      return -1;

    const unsigned Size = C->getValue().getZExtValue();
    const Align DstAlign = MC->getDestAlign().valueOrOne();
    const Align SrcAlign = MC->getSourceAlign().valueOrOne();

    MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
                      /*IsVolatile*/ false);
    DstAddrSpace = MC->getDestAddressSpace();
    SrcAddrSpace = MC->getSourceAddressSpace();
  } else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
    ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
    // If 'size' is not a constant, a library call will be generated.
    if (!C)
      return -1;

    const unsigned Size = C->getValue().getZExtValue();
    const Align DstAlign = MS->getDestAlign().valueOrOne();

    MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
                     /*IsZeroMemset*/ false, /*IsVolatile*/ false);
    DstAddrSpace = MS->getDestAddressSpace();
  } else
    llvm_unreachable("Expected a memcpy/move or memset!");

  unsigned Limit, Factor = 2;
  switch (I->getIntrinsicID()) {
  case Intrinsic::memcpy:
    Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
    break;
  case Intrinsic::memmove:
    Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
    break;
  case Intrinsic::memset:
    Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
    Factor = 1;
    break;
  default:
    llvm_unreachable("Expected a memcpy/move or memset!");
  }

  // MemOps will be populated with a list of the data types that need to be
  // loaded and stored, which is why we multiply the number of elements by 2
  // to get the cost for this memcpy.
  std::vector<EVT> MemOps;
  if (getTLI()->findOptimalMemOpLowering(
          MemOps, Limit, MOp, DstAddrSpace,
          SrcAddrSpace, F->getAttributes()))
    return MemOps.size() * Factor;

  // If we can't find an optimal memop lowering, return the default cost.
  return -1;
}

InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) const {
  int NumOps = getNumMemOps(cast<IntrinsicInst>(I));

  // To model the cost of a library call, we assume 1 for the call, and
  // 3 for the argument setup.
  if (NumOps == -1)
    return 4;
  return NumOps;
}

InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *DstTy, VectorType *SrcTy,
                                           ArrayRef<int> Mask,
                                           TTI::TargetCostKind CostKind,
                                           int Index, VectorType *SubTp,
                                           ArrayRef<const Value *> Args,
                                           const Instruction *CxtI) const {
  assert((Mask.empty() || DstTy->isScalableTy() ||
          Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
         "Expected the Mask to match the return size if given");
  assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
         "Expected the same scalar types");

  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
  // Treat extractsubvector as single op permutation.
  bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
  if (IsExtractSubvector)
    Kind = TTI::SK_PermuteSingleSrc;
  if (ST->hasNEON()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry NEONDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};

      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
      if (const auto *Entry =
              CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Reverse) {
      static const CostTblEntry NEONShuffleTbl[] = {
          // Reverse shuffle costs one instruction if we are shuffling within a
          // double word (vrev) or two if we shuffle a quad word (vrev, vext).
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};

      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
      if (const auto *Entry =
              CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Select) {
      static const CostTblEntry NEONSelShuffleTbl[] = {
          // Select shuffle cost table for ARM. Cost is the number of
          // instructions required to create the shuffled vector.

          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},

          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},

          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};

      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
      if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
                                              ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
  }
  if (ST->hasMVEIntegerOps()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry MVEDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};

      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
      if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
                                              LT.second))
        return LT.first * Entry->Cost *
               ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
    }

1337 if (!Mask.empty()) {
1338 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy);
1339 if (LT.second.isVector() &&
1340 Mask.size() <= LT.second.getVectorNumElements() &&
1341 (isVREVMask(M: Mask, VT: LT.second, BlockSize: 16) || isVREVMask(M: Mask, VT: LT.second, BlockSize: 32) ||
1342 isVREVMask(M: Mask, VT: LT.second, BlockSize: 64)))
1343 return ST->getMVEVectorCostFactor(CostKind: TTI::TCK_RecipThroughput) * LT.first;
1344 }
1345 }
1346
1347 // Restore the original kind.
1348 if (IsExtractSubvector)
1349 Kind = TTI::SK_ExtractSubvector;
1350 int BaseCost = ST->hasMVEIntegerOps() && SrcTy->isVectorTy()
1351 ? ST->getMVEVectorCostFactor(CostKind: TTI::TCK_RecipThroughput)
1352 : 1;
1353 return BaseCost * BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind,
1354 Index, SubTp);
1355}
1356
1357InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1358 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1359 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1360 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
1361 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1362 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(Bitwidth: 1)) {
1363 // Make operations on i1 relatively expensive as this often involves
1364 // combining predicates. AND and XOR should be easier to handle with IT
1365 // blocks.
1366 switch (ISDOpcode) {
1367 default:
1368 break;
1369 case ISD::AND:
1370 case ISD::XOR:
1371 return 2;
1372 case ISD::OR:
1373 return 3;
1374 }
1375 }
1376
1377 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1378
1379 if (ST->hasNEON()) {
1380 const unsigned FunctionCallDivCost = 20;
1381 const unsigned ReciprocalDivCost = 10;
1382 static const CostTblEntry CostTbl[] = {
1383 // Division.
1384 // These costs are somewhat arbitrary. Choose a cost of 20 to indicate
1385 // that vectorizing division (an added function call) is very expensive.
1386 // Double registers types.
1387 { .ISD: ISD::SDIV, .Type: MVT::v1i64, .Cost: 1 * FunctionCallDivCost},
1388 { .ISD: ISD::UDIV, .Type: MVT::v1i64, .Cost: 1 * FunctionCallDivCost},
1389 { .ISD: ISD::SREM, .Type: MVT::v1i64, .Cost: 1 * FunctionCallDivCost},
1390 { .ISD: ISD::UREM, .Type: MVT::v1i64, .Cost: 1 * FunctionCallDivCost},
1391 { .ISD: ISD::SDIV, .Type: MVT::v2i32, .Cost: 2 * FunctionCallDivCost},
1392 { .ISD: ISD::UDIV, .Type: MVT::v2i32, .Cost: 2 * FunctionCallDivCost},
1393 { .ISD: ISD::SREM, .Type: MVT::v2i32, .Cost: 2 * FunctionCallDivCost},
1394 { .ISD: ISD::UREM, .Type: MVT::v2i32, .Cost: 2 * FunctionCallDivCost},
1395 { .ISD: ISD::SDIV, .Type: MVT::v4i16, .Cost: ReciprocalDivCost},
1396 { .ISD: ISD::UDIV, .Type: MVT::v4i16, .Cost: ReciprocalDivCost},
1397 { .ISD: ISD::SREM, .Type: MVT::v4i16, .Cost: 4 * FunctionCallDivCost},
1398 { .ISD: ISD::UREM, .Type: MVT::v4i16, .Cost: 4 * FunctionCallDivCost},
1399 { .ISD: ISD::SDIV, .Type: MVT::v8i8, .Cost: ReciprocalDivCost},
1400 { .ISD: ISD::UDIV, .Type: MVT::v8i8, .Cost: ReciprocalDivCost},
1401 { .ISD: ISD::SREM, .Type: MVT::v8i8, .Cost: 8 * FunctionCallDivCost},
1402 { .ISD: ISD::UREM, .Type: MVT::v8i8, .Cost: 8 * FunctionCallDivCost},
1403 // Quad register types.
1404 { .ISD: ISD::SDIV, .Type: MVT::v2i64, .Cost: 2 * FunctionCallDivCost},
1405 { .ISD: ISD::UDIV, .Type: MVT::v2i64, .Cost: 2 * FunctionCallDivCost},
1406 { .ISD: ISD::SREM, .Type: MVT::v2i64, .Cost: 2 * FunctionCallDivCost},
1407 { .ISD: ISD::UREM, .Type: MVT::v2i64, .Cost: 2 * FunctionCallDivCost},
1408 { .ISD: ISD::SDIV, .Type: MVT::v4i32, .Cost: 4 * FunctionCallDivCost},
1409 { .ISD: ISD::UDIV, .Type: MVT::v4i32, .Cost: 4 * FunctionCallDivCost},
1410 { .ISD: ISD::SREM, .Type: MVT::v4i32, .Cost: 4 * FunctionCallDivCost},
1411 { .ISD: ISD::UREM, .Type: MVT::v4i32, .Cost: 4 * FunctionCallDivCost},
1412 { .ISD: ISD::SDIV, .Type: MVT::v8i16, .Cost: 8 * FunctionCallDivCost},
1413 { .ISD: ISD::UDIV, .Type: MVT::v8i16, .Cost: 8 * FunctionCallDivCost},
1414 { .ISD: ISD::SREM, .Type: MVT::v8i16, .Cost: 8 * FunctionCallDivCost},
1415 { .ISD: ISD::UREM, .Type: MVT::v8i16, .Cost: 8 * FunctionCallDivCost},
1416 { .ISD: ISD::SDIV, .Type: MVT::v16i8, .Cost: 16 * FunctionCallDivCost},
1417 { .ISD: ISD::UDIV, .Type: MVT::v16i8, .Cost: 16 * FunctionCallDivCost},
1418 { .ISD: ISD::SREM, .Type: MVT::v16i8, .Cost: 16 * FunctionCallDivCost},
1419 { .ISD: ISD::UREM, .Type: MVT::v16i8, .Cost: 16 * FunctionCallDivCost},
1420 // Multiplication.
1421 };
1422
1423 if (const auto *Entry = CostTableLookup(Table: CostTbl, ISD: ISDOpcode, Ty: LT.second))
1424 return LT.first * Entry->Cost;
1425
1426 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1427 Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
1428
1429 // This is somewhat of a hack. The problem we are facing is that SROA
1430 // creates sequences of shift, and, or instructions to construct values.
1431 // These sequences are recognized by ISel and have zero cost, but the same
1432 // is not true of the vectorized code. Because we have support for v2i64
1433 // but not i64, those sequences look particularly beneficial to vectorize.
1434 // To work around this we increase the cost of v2i64 operations to make
1435 // them seem less beneficial.
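    // Illustrative sketch of the kind of scalar sequence meant above (value
    // names are examples only, not actual SROA output):
    //   %lo = zext i32 %a to i64
    //   %hi = zext i32 %b to i64
    //   %sh = shl i64 %hi, 32
    //   %v  = or i64 %sh, %lo
    // ISel folds this for scalars, but the equivalent <2 x i64> shift/or
    // with a uniform constant operand is not free, hence the +4 below.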
1436 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1437 Cost += 4;
1438
1439 return Cost;
1440 }
1441
1442 // If this operation is a shift on arm/thumb2, it might well be folded into
1443 // the following instruction, hence having a cost of 0.
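  // For example (illustrative), "%s = shl i32 %a, 2; %r = add i32 %b, %s"
  // can lower to a single "add r0, r1, r2, lsl #2", so such a shift is
  // treated as free when its sole user is one of the opcodes listed below.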
1444 auto LooksLikeAFreeShift = [&]() {
1445 if (ST->isThumb1Only() || Ty->isVectorTy())
1446 return false;
1447
1448 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1449 return false;
1450 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1451 return false;
1452
1453 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1454 switch (cast<Instruction>(Val: CxtI->user_back())->getOpcode()) {
1455 case Instruction::Add:
1456 case Instruction::Sub:
1457 case Instruction::And:
1458 case Instruction::Xor:
1459 case Instruction::Or:
1460 case Instruction::ICmp:
1461 return true;
1462 default:
1463 return false;
1464 }
1465 };
1466 if (LooksLikeAFreeShift())
1467 return 0;
1468
1469 // When targets have both DSP and MVE we find that the compiler will
1470 // attempt to vectorize as well as use scalar (S/U)MLAL operations. For
1471 // code matching the pattern ext(mul(ext(i16), ext(i16))) we find that
1472 // codegen performs better when only the scalar (S/U)MLAL ops are used,
1473 // instead of trying to mix vector ops with (S/U)MLAL ops. We therefore
1474 // check whether a mul instruction is used in a (U/S)MLAL pattern.
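  // Illustrative IR for the scalar pattern that maps onto SMLAL (value
  // names are examples only):
  //   %a32 = sext i16 %a to i32
  //   %b32 = sext i16 %b to i32
  //   %mul = mul i32 %a32, %b32
  //   %acc = sext i32 %mul to i64   ; every user of %mul is an extend
  // Returning a cost of 0 for such a mul below keeps these loops scalar.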
1476 auto MulInDSPMLALPattern = [&](const Instruction *I, unsigned Opcode,
1477 Type *Ty) -> bool {
1478 if (!ST->hasDSP())
1479 return false;
1480
1481 if (!I)
1482 return false;
1483
1484 if (Opcode != Instruction::Mul)
1485 return false;
1486
1487 if (Ty->isVectorTy())
1488 return false;
1489
1490 auto ValueOpcodesEqual = [](const Value *LHS, const Value *RHS) -> bool {
1491 return cast<Instruction>(Val: LHS)->getOpcode() ==
1492 cast<Instruction>(Val: RHS)->getOpcode();
1493 };
1494 auto IsExtInst = [](const Value *V) -> bool {
1495 return isa<ZExtInst>(Val: V) || isa<SExtInst>(Val: V);
1496 };
1497 auto IsExtensionFromHalf = [](const Value *V) -> bool {
1498 return cast<Instruction>(Val: V)->getOperand(i: 0)->getType()->isIntegerTy(Bitwidth: 16);
1499 };
1500
1501 // We check the arguments of the instruction to see if they're extends
1502 auto *BinOp = dyn_cast<BinaryOperator>(Val: I);
1503 if (!BinOp)
1504 return false;
1505 Value *Op0 = BinOp->getOperand(i_nocapture: 0);
1506 Value *Op1 = BinOp->getOperand(i_nocapture: 1);
1507 if (IsExtInst(Op0) && IsExtInst(Op1) && ValueOpcodesEqual(Op0, Op1)) {
1508 // We're interested in an ext of an i16
1509 if (!I->getType()->isIntegerTy(Bitwidth: 32) || !IsExtensionFromHalf(Op0) ||
1510 !IsExtensionFromHalf(Op1))
1511 return false;
1512 // We need to check if this result will be further extended to i64
1513 // and that all of its uses are extend instructions.
1514 for (auto *U : I->users())
1515 if (!IsExtInst(U))
1516 return false;
1517 return true;
1518 }
1519
1520 return false;
1521 };
1522
1523 if (MulInDSPMLALPattern(CxtI, Opcode, Ty))
1524 return 0;
1525
1526 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1527 // for "multiple beats" potentially needed by MVE instructions.
1528 int BaseCost = 1;
1529 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1530 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1531
1532 // The rest of this mostly follows what is done in
1533 // BaseT::getArithmeticInstrCost, without treating floats as more expensive
1534 // than scalars or increasing the costs for custom operations. The result
1535 // is also multiplied by the MVEVectorCostFactor where appropriate.
1536 if (TLI->isOperationLegalOrCustomOrPromote(Op: ISDOpcode, VT: LT.second))
1537 return LT.first * BaseCost;
1538
1539 // Else this is expand, assume that we need to scalarize this op.
1540 if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) {
1541 unsigned Num = VTy->getNumElements();
1542 InstructionCost Cost =
1543 getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind);
1544 // Return the cost of multiple scalar invocation plus the cost of
1545 // inserting and extracting the values.
1546 SmallVector<Type *> Tys(Args.size(), Ty);
1547 return BaseT::getScalarizationOverhead(RetTy: VTy, Args, Tys, CostKind) +
1548 Num * Cost;
1549 }
1550
1551 return BaseCost;
1552}
1553
1554InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1555 Align Alignment,
1556 unsigned AddressSpace,
1557 TTI::TargetCostKind CostKind,
1558 TTI::OperandValueInfo OpInfo,
1559 const Instruction *I) const {
1560 // TODO: Handle other cost kinds.
1561 if (CostKind != TTI::TCK_RecipThroughput)
1562 return 1;
1563
1564 // Type legalization can't handle structs
1565 if (TLI->getValueType(DL, Ty: Src, AllowUnknown: true) == MVT::Other)
1566 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1567 CostKind);
1568
1569 if (ST->hasNEON() && Src->isVectorTy() && Alignment != Align(16) &&
1570 cast<VectorType>(Val: Src)->getElementType()->isDoubleTy()) {
1571 // Unaligned loads/stores are extremely inefficient.
1572 // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
1573 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Src);
1574 return LT.first * 4;
1575 }
1576
1577 // MVE can optimize an fpext(load(4xhalf)) using an extending integer load.
1578 // The same applies to stores.
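  // Illustrative IR for the load case (assumed shape):
  //   %h = load <4 x half>, ptr %p
  //   %f = fpext <4 x half> %h to <4 x float>
  // The extend can be merged into the load, so this is costed as a single
  // MVE operation below.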
1579 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Val: Src) && I &&
1580 ((Opcode == Instruction::Load && I->hasOneUse() &&
1581 isa<FPExtInst>(Val: *I->user_begin())) ||
1582 (Opcode == Instruction::Store && isa<FPTruncInst>(Val: I->getOperand(i: 0))))) {
1583 FixedVectorType *SrcVTy = cast<FixedVectorType>(Val: Src);
1584 Type *DstTy =
1585 Opcode == Instruction::Load
1586 ? (*I->user_begin())->getType()
1587 : cast<Instruction>(Val: I->getOperand(i: 0))->getOperand(i: 0)->getType();
1588 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1589 DstTy->getScalarType()->isFloatTy())
1590 return ST->getMVEVectorCostFactor(CostKind);
1591 }
1592
1593 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1594 ? ST->getMVEVectorCostFactor(CostKind)
1595 : 1;
1596 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1597 CostKind, OpInfo, I);
1598}
1599
1600InstructionCost
1601ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1602 unsigned AddressSpace,
1603 TTI::TargetCostKind CostKind) const {
1604 if (ST->hasMVEIntegerOps()) {
1605 if (Opcode == Instruction::Load &&
1606 isLegalMaskedLoad(DataTy: Src, Alignment, AddressSpace))
1607 return ST->getMVEVectorCostFactor(CostKind);
1608 if (Opcode == Instruction::Store &&
1609 isLegalMaskedStore(DataTy: Src, Alignment, AddressSpace))
1610 return ST->getMVEVectorCostFactor(CostKind);
1611 }
1612 if (!isa<FixedVectorType>(Val: Src))
1613 return BaseT::getMaskedMemoryOpCost(Opcode, DataTy: Src, Alignment, AddressSpace,
1614 CostKind);
1615 // Scalar cost, which is currently very high due to the inefficiency of
1616 // the generated code.
1617 return cast<FixedVectorType>(Val: Src)->getNumElements() * 8;
1618}
1619
1620InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1621 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1622 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1623 bool UseMaskForCond, bool UseMaskForGaps) const {
1624 assert(Factor >= 2 && "Invalid interleave factor");
1625 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1626
1627 // vldN/vstN don't support vector types with i64/f64 elements.
1628 bool EltIs64Bits = DL.getTypeSizeInBits(Ty: VecTy->getScalarType()) == 64;
1629
1630 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1631 !UseMaskForCond && !UseMaskForGaps) {
1632 unsigned NumElts = cast<FixedVectorType>(Val: VecTy)->getNumElements();
1633 auto *SubVecTy =
1634 FixedVectorType::get(ElementType: VecTy->getScalarType(), NumElts: NumElts / Factor);
1635
1636 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1637 // Accesses having vector types that are a multiple of 128 bits can be
1638 // matched to more than one vldN/vstN instruction.
1639 int BaseCost =
1640 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1641 if (NumElts % Factor == 0 &&
1642 TLI->isLegalInterleavedAccessType(Factor, VecTy: SubVecTy, Alignment, DL))
1643 return Factor * BaseCost * TLI->getNumInterleavedAccesses(VecTy: SubVecTy, DL);
1644
1645 // Some smaller than legal interleaved patterns are cheap as we can make
1646 // use of the vmovn or vrev patterns to interleave a standard load. This is
1647 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1648 // promoted differently). The cost of 2 here is then a load and vrev or
1649 // vmovn.
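    // For example (illustrative), splitting an <8 x i8> block with
    // Factor == 2 can be done as one 64-bit load plus one vrev/vmovn style
    // rearrangement, which is what the 2 * BaseCost below models.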
1650 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1651 VecTy->isIntOrIntVectorTy() &&
1652 DL.getTypeSizeInBits(Ty: SubVecTy).getFixedValue() <= 64)
1653 return 2 * BaseCost;
1654 }
1655
1656 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1657 Alignment, AddressSpace, CostKind,
1658 UseMaskForCond, UseMaskForGaps);
1659}
1660
1661InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1662 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1663 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
1664 using namespace PatternMatch;
1665 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1666 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1667 Alignment, CostKind, I);
1668
1669 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1670 auto *VTy = cast<FixedVectorType>(Val: DataTy);
1671
1672 // TODO: Splitting, once we do that.
1673
1674 unsigned NumElems = VTy->getNumElements();
1675 unsigned EltSize = VTy->getScalarSizeInBits();
1676 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: DataTy);
1677
1678 // For now, it is assumed that for the MVE gather instructions the loads are
1679 // all effectively serialised. This means the cost is the scalar cost
1680 // multiplied by the number of elements being loaded. This is possibly very
1681 // conservative, but even so we still end up vectorising loops because the
1682 // cost per iteration for many loops is lower than for scalar loops.
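  // For example, a <4 x i32> gather that stays on the vector path below is
  // costed as roughly 4 * MVEVectorCostFactor (NumElems * LT.first * the
  // MVE factor), i.e. as four serialised element loads.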
1683 InstructionCost VectorCost =
1684 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1685 // The scalarization cost should be a lot higher. We use the number of vector
1686 // elements plus the scalarization overhead. If masking is required then a lot
1687 // of little blocks will be needed and potentially a scalarized p0 mask,
1688 // greatly increasing the cost.
1689 InstructionCost ScalarCost =
1690 NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1691 BaseT::getScalarizationOverhead(InTy: VTy, /*Insert*/ true, /*Extract*/ false,
1692 CostKind) +
1693 BaseT::getScalarizationOverhead(InTy: VTy, /*Insert*/ false, /*Extract*/ true,
1694 CostKind);
1695
1696 if (EltSize < 8 || Alignment < EltSize / 8)
1697 return ScalarCost;
1698
1699 unsigned ExtSize = EltSize;
1700 // Check whether there's a single user that asks for an extended type
1701 if (I != nullptr) {
1702 // Depending on the caller of this function, a gather instruction will
1703 // either have opcode Instruction::Load or be a call to the masked_gather
1704 // intrinsic.
1705 if ((I->getOpcode() == Instruction::Load ||
1706 match(V: I, P: m_Intrinsic<Intrinsic::masked_gather>())) &&
1707 I->hasOneUse()) {
1708 const User *Us = *I->users().begin();
1709 if (isa<ZExtInst>(Val: Us) || isa<SExtInst>(Val: Us)) {
1710 // only allow valid type combinations
1711 unsigned TypeSize =
1712 cast<Instruction>(Val: Us)->getType()->getScalarSizeInBits();
1713 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1714 (TypeSize == 16 && EltSize == 8)) &&
1715 TypeSize * NumElems == 128) {
1716 ExtSize = TypeSize;
1717 }
1718 }
1719 }
1720 // Check whether the input data needs to be truncated
1721 TruncInst *T;
1722 if ((I->getOpcode() == Instruction::Store ||
1723 match(V: I, P: m_Intrinsic<Intrinsic::masked_scatter>())) &&
1724 (T = dyn_cast<TruncInst>(Val: I->getOperand(i: 0)))) {
1725 // Only allow valid type combinations
1726 unsigned TypeSize = T->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits();
1727 if (((EltSize == 16 && TypeSize == 32) ||
1728 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1729 TypeSize * NumElems == 128)
1730 ExtSize = TypeSize;
1731 }
1732 }
1733
1734 if (ExtSize * NumElems != 128 || NumElems < 4)
1735 return ScalarCost;
1736
1737 // Any (aligned) i32 gather will not need to be scalarised.
1738 if (ExtSize == 32)
1739 return VectorCost;
1740 // For smaller types, we need to ensure that the gep's inputs are correctly
1741 // extended from a small enough value. Other sizes (including i64) are
1742 // scalarized for now.
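  // Illustrative IR shape that stays on the vector path (value names are
  // examples only): a gep of i16 elements whose vector index is zero
  // extended from a type no wider than the gathered element, e.g.
  //   %idx  = zext <8 x i16> %off to <8 x i32>
  //   %ptrs = getelementptr i16, ptr %base, <8 x i32> %idx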
1743 if (ExtSize != 8 && ExtSize != 16)
1744 return ScalarCost;
1745
1746 if (const auto *BC = dyn_cast<BitCastInst>(Val: Ptr))
1747 Ptr = BC->getOperand(i_nocapture: 0);
1748 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr)) {
1749 if (GEP->getNumOperands() != 2)
1750 return ScalarCost;
1751 unsigned Scale = DL.getTypeAllocSize(Ty: GEP->getResultElementType());
1752 // Scale needs to be correct (which is only relevant for i16s).
1753 if (Scale != 1 && Scale * 8 != ExtSize)
1754 return ScalarCost;
1755 // And we need to zext (not sext) the indexes from a small enough type.
1756 if (const auto *ZExt = dyn_cast<ZExtInst>(Val: GEP->getOperand(i_nocapture: 1))) {
1757 if (ZExt->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() <= ExtSize)
1758 return VectorCost;
1759 }
1760 return ScalarCost;
1761 }
1762 return ScalarCost;
1763}
1764
1765InstructionCost
1766ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1767 std::optional<FastMathFlags> FMF,
1768 TTI::TargetCostKind CostKind) const {
1769
1770 EVT ValVT = TLI->getValueType(DL, Ty: ValTy);
1771 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1772 unsigned EltSize = ValVT.getScalarSizeInBits();
1773
1774 // In general floating point reductions are a series of elementwise
1775 // operations, with free extracts on each step. These are either in-order or
1776 // treewise depending on whether that is allowed by the fast math flags.
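  // For example, with MVE a fast-math fadd reduction of <8 x float> is
  // modelled below as one v4f32 fadd (the halving vector step) plus four
  // scalar fadds for the remaining lanes.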
1777 if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
1778 ((EltSize == 32 && ST->hasVFP2Base()) ||
1779 (EltSize == 64 && ST->hasFP64()) ||
1780 (EltSize == 16 && ST->hasFullFP16()))) {
1781 unsigned NumElts = cast<FixedVectorType>(Val: ValTy)->getNumElements();
1782 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1783 InstructionCost VecCost = 0;
1784 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(Value: NumElts) &&
1785 NumElts * EltSize > VecLimit) {
1786 Type *VecTy = FixedVectorType::get(ElementType: ValTy->getElementType(), NumElts: NumElts / 2);
1787 VecCost += getArithmeticInstrCost(Opcode, Ty: VecTy, CostKind);
1788 NumElts /= 2;
1789 }
1790
1791 // For fp16 we need to extract the upper lane elements. MVE can instead
1792 // use a VREV plus another vector FADD/FMUL to perform one more vector step.
1793 InstructionCost ExtractCost = 0;
1794 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
1795 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
1796 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1797 NumElts /= 2;
1798 } else if (ValVT.getVectorElementType() == MVT::f16)
1799 ExtractCost = NumElts / 2;
1800
1801 return VecCost + ExtractCost +
1802 NumElts *
1803 getArithmeticInstrCost(Opcode, Ty: ValTy->getElementType(), CostKind);
1804 }
1805
1806 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
1807 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
1808 unsigned NumElts = cast<FixedVectorType>(Val: ValTy)->getNumElements();
1809 unsigned VecLimit =
1810 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1811 InstructionCost VecCost = 0;
1812 while (isPowerOf2_32(Value: NumElts) && NumElts * EltSize > VecLimit) {
1813 Type *VecTy = FixedVectorType::get(ElementType: ValTy->getElementType(), NumElts: NumElts / 2);
1814 VecCost += getArithmeticInstrCost(Opcode, Ty: VecTy, CostKind);
1815 NumElts /= 2;
1816 }
1817 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
1818 // step.
1819 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
1820 NumElts * EltSize == 64) {
1821 Type *VecTy = FixedVectorType::get(ElementType: ValTy->getElementType(), NumElts);
1822 VecCost += ST->getMVEVectorCostFactor(CostKind) +
1823 getArithmeticInstrCost(Opcode, Ty: VecTy, CostKind);
1824 NumElts /= 2;
1825 }
1826
1827 // From here we extract the elements and perform the and/or/xor.
1828 InstructionCost ExtractCost = NumElts;
1829 return VecCost + ExtractCost +
1830 (NumElts - 1) * getArithmeticInstrCost(
1831 Opcode, Ty: ValTy->getElementType(), CostKind);
1832 }
1833
1834 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
1835 TTI::requiresOrderedReduction(FMF))
1836 return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
1837
1838 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1839
1840 static const CostTblEntry CostTblAdd[]{
1841 {.ISD: ISD::ADD, .Type: MVT::v16i8, .Cost: 1},
1842 {.ISD: ISD::ADD, .Type: MVT::v8i16, .Cost: 1},
1843 {.ISD: ISD::ADD, .Type: MVT::v4i32, .Cost: 1},
1844 };
1845 if (const auto *Entry = CostTableLookup(Table: CostTblAdd, ISD, Ty: LT.second))
1846 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1847
1848 return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
1849}
1850
1851InstructionCost ARMTTIImpl::getExtendedReductionCost(
1852 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1853 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
1854 EVT ValVT = TLI->getValueType(DL, Ty: ValTy);
1855 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
1856
1857 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1858
1859 switch (ISD) {
1860 case ISD::ADD:
1861 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1862 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1863
1864 // The legal cases are:
1865 // VADDV u/s 8/16/32
1866 // VADDLV u/s 32
1867 // Codegen currently cannot always handle larger than legal vectors very
1868 // well, especially for predicated reductions where the mask needs to be
1869 // split, so restrict to 128bit or smaller input types.
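      // For example, a v4i32 add reduction with an i64 result maps to
      // VADDLV (RevVTSize <= 64), whereas v16i8/v8i16 only allow results of
      // up to 32 bits (VADDV).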
1870 unsigned RevVTSize = ResVT.getSizeInBits();
1871 if (ValVT.getSizeInBits() <= 128 &&
1872 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1873 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1874 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1875 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1876 }
1877 break;
1878 default:
1879 break;
1880 }
1881 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: ValTy, FMF,
1882 CostKind);
1883}
1884
1885InstructionCost
1886ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
1887 VectorType *ValTy,
1888 TTI::TargetCostKind CostKind) const {
1889 EVT ValVT = TLI->getValueType(DL, Ty: ValTy);
1890 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
1891
1892 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1893 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1894
1895 // The legal cases are:
1896 // VMLAV u/s 8/16/32
1897 // VMLALV u/s 16/32
1898 // Codegen currently cannot always handle larger than legal vectors very
1899 // well, especially for predicated reductions where the mask needs to be
1900 // split, so restrict to 128bit or smaller input types.
1901 unsigned RevVTSize = ResVT.getSizeInBits();
1902 if (ValVT.getSizeInBits() <= 128 &&
1903 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1904 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
1905 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1906 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1907 }
1908
1909 return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, Ty: ValTy, CostKind);
1910}
1911
1912InstructionCost
1913ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1914 FastMathFlags FMF,
1915 TTI::TargetCostKind CostKind) const {
1916 EVT ValVT = TLI->getValueType(DL, Ty);
1917
1918 // In general floating point reductions are a series of elementwise
1919 // operations, with free extracts on each step. These are either in-order or
1920 // treewise depending on whether that is allowed by the fast math flags.
1921 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
1922 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
1923 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
1924 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
1925 unsigned NumElts = cast<FixedVectorType>(Val: Ty)->getNumElements();
1926 unsigned EltSize = ValVT.getScalarSizeInBits();
1927 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1928 InstructionCost VecCost;
1929 while (isPowerOf2_32(Value: NumElts) && NumElts * EltSize > VecLimit) {
1930 Type *VecTy = FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumElts/2);
1931 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
1932 VecCost += getIntrinsicInstrCost(ICA, CostKind);
1933 NumElts /= 2;
1934 }
1935
1936 // For fp16 we need to extract the upper lane elements. MVE can add a
1937 // VREV+FMIN/MAX to perform another vector step instead.
1938 InstructionCost ExtractCost = 0;
1939 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
1940 NumElts == 8) {
1941 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1942 NumElts /= 2;
1943 } else if (ValVT.getVectorElementType() == MVT::f16)
1944 ExtractCost = cast<FixedVectorType>(Val: Ty)->getNumElements() / 2;
1945
1946 IntrinsicCostAttributes ICA(IID, Ty->getElementType(),
1947 {Ty->getElementType(), Ty->getElementType()},
1948 FMF);
1949 return VecCost + ExtractCost +
1950 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
1951 }
1952
1953 if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
1954 IID == Intrinsic::umin || IID == Intrinsic::umax) {
1955 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1956
1957 // All costs are the same for u/s min/max. These lower to vminv/vmaxv,
1958 // which are given a slightly higher cost as they tend to take multiple
1959 // cycles for smaller type sizes.
1960 static const CostTblEntry CostTblAdd[]{
1961 {.ISD: ISD::SMIN, .Type: MVT::v16i8, .Cost: 4},
1962 {.ISD: ISD::SMIN, .Type: MVT::v8i16, .Cost: 3},
1963 {.ISD: ISD::SMIN, .Type: MVT::v4i32, .Cost: 2},
1964 };
1965 if (const auto *Entry = CostTableLookup(Table: CostTblAdd, ISD: ISD::SMIN, Ty: LT.second))
1966 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1967 }
1968
1969 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1970}
1971
1972InstructionCost
1973ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1974 TTI::TargetCostKind CostKind) const {
1975 unsigned Opc = ICA.getID();
1976 switch (Opc) {
1977 case Intrinsic::get_active_lane_mask:
1978 // Currently we make a somewhat optimistic assumption that
1979 // active_lane_mask intrinsics are always free. In reality one may be
1980 // freely folded into a tail predicated loop, expanded into a VCTP or
1981 // expanded into a lot of add/icmp code. We may need to improve this in
1982 // the future, but being able to detect whether it is free or not involves
1983 // looking at a lot of other code. We currently assume that the vectorizer
1984 // inserted these, and knew what it was doing in adding one.
1985 if (ST->hasMVEIntegerOps())
1986 return 0;
1987 break;
1988 case Intrinsic::sadd_sat:
1989 case Intrinsic::ssub_sat:
1990 case Intrinsic::uadd_sat:
1991 case Intrinsic::usub_sat: {
1992 bool IsAdd = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
1993 bool IsSigned = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
1994 Type *RetTy = ICA.getReturnType();
1995
1996 if (auto *ITy = dyn_cast<IntegerType>(Val: RetTy)) {
1997 if (IsSigned && ST->hasDSP() && ITy->getBitWidth() == 32)
1998 return 1; // qadd / qsub
1999 if (ST->hasDSP() && (ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16))
2000 return 2; // uqadd16 / qadd16 / uqsub16 / qsub16 + possible extend.
2001 // Otherwise return the cost of expanding the node. Generally an add +
2002 // icmp + sel.
2003 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2004 Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1);
2005 return getArithmeticInstrCost(Opcode: IsAdd ? Instruction::Add : Instruction::Sub,
2006 Ty: RetTy, CostKind) +
2007 2 * getCmpSelInstrCost(Opcode: BinaryOperator::ICmp, ValTy: RetTy, CondTy, VecPred: Pred,
2008 CostKind) +
2009 2 * getCmpSelInstrCost(Opcode: BinaryOperator::Select, ValTy: RetTy, CondTy, VecPred: Pred,
2010 CostKind);
2011 }
2012
2013 if (!ST->hasMVEIntegerOps())
2014 break;
2015
2016 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);
2017 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2018 LT.second == MVT::v16i8) {
2019 // This is a base cost of 1 for the vqadd, plus 3 extra shifts if we
2020 // need to extend the type, as it uses shr(qadd(shl, shl)).
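      // For example, a saturating add on lanes narrower than the legal
      // element (e.g. a v4i8 op promoted to v4i32 lanes) becomes
      // shl + shl + vqadd + shr, i.e. 4 instructions rather than 1.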
2021 unsigned Instrs =
2022 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1
2023 : 4;
2024 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
2025 }
2026 break;
2027 }
2028 case Intrinsic::abs:
2029 case Intrinsic::smin:
2030 case Intrinsic::smax:
2031 case Intrinsic::umin:
2032 case Intrinsic::umax: {
2033 if (!ST->hasMVEIntegerOps())
2034 break;
2035 Type *VT = ICA.getReturnType();
2036
2037 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VT);
2038 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2039 LT.second == MVT::v16i8)
2040 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2041 break;
2042 }
2043 case Intrinsic::minnum:
2044 case Intrinsic::maxnum: {
2045 if (!ST->hasMVEFloatOps())
2046 break;
2047 Type *VT = ICA.getReturnType();
2048 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VT);
2049 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
2050 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2051 break;
2052 }
2053 case Intrinsic::fptosi_sat:
2054 case Intrinsic::fptoui_sat: {
2055 if (ICA.getArgTypes().empty())
2056 break;
2057 bool IsSigned = Opc == Intrinsic::fptosi_sat;
2058 auto LT = getTypeLegalizationCost(Ty: ICA.getArgTypes()[0]);
2059 EVT MTy = TLI->getValueType(DL, Ty: ICA.getReturnType());
2060 // Check for the legal types, with the correct subtarget features.
2061 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
2062 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
2063 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
2064 return LT.first;
2065
2066 // Equally for MVE vector types
2067 if (ST->hasMVEFloatOps() &&
2068 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
2069 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
2070 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2071
2072 // If we can, use a legal convert followed by a min+max.
2073 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
2074 (ST->hasFP64() && LT.second == MVT::f64) ||
2075 (ST->hasFullFP16() && LT.second == MVT::f16) ||
2076 (ST->hasMVEFloatOps() &&
2077 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
2078 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
2079 Type *LegalTy = Type::getIntNTy(C&: ICA.getReturnType()->getContext(),
2080 N: LT.second.getScalarSizeInBits());
2081 InstructionCost Cost =
2082 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
2083 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
2084 : Intrinsic::umin,
2085 LegalTy, {LegalTy, LegalTy});
2086 Cost += getIntrinsicInstrCost(ICA: Attrs1, CostKind);
2087 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
2088 : Intrinsic::umax,
2089 LegalTy, {LegalTy, LegalTy});
2090 Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
2091 return LT.first * Cost;
2092 }
2093 // Otherwise we need to follow the default expansion that clamps the value
2094 // using a float min/max, with an fcmp+sel for NaN handling when signed.
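    // For example, fptosi.sat from f32 to i64 (no single legal form) is
    // costed below as a minnum+maxnum clamp, the fptosi itself and, for the
    // signed case, an fcmp+select to force NaN inputs to zero.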
2095 Type *FPTy = ICA.getArgTypes()[0];
2096 Type *RetTy = ICA.getReturnType();
2097 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
2098 InstructionCost Cost = getIntrinsicInstrCost(ICA: Attrs1, CostKind);
2099 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
2100 Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
2101 Cost +=
2102 getCastInstrCost(Opcode: IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
2103 Dst: RetTy, Src: FPTy, CCH: TTI::CastContextHint::None, CostKind);
2104 if (IsSigned) {
2105 Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1);
2106 Cost += getCmpSelInstrCost(Opcode: BinaryOperator::FCmp, ValTy: FPTy, CondTy,
2107 VecPred: CmpInst::FCMP_UNO, CostKind);
2108 Cost += getCmpSelInstrCost(Opcode: BinaryOperator::Select, ValTy: RetTy, CondTy,
2109 VecPred: CmpInst::FCMP_UNO, CostKind);
2110 }
2111 return Cost;
2112 }
2113 }
2114
2115 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
2116}
2117
2118bool ARMTTIImpl::isLoweredToCall(const Function *F) const {
2119 if (!F->isIntrinsic())
2120 return BaseT::isLoweredToCall(F);
2121
2122 // Assume all Arm-specific intrinsics map to an instruction.
2123 if (F->getName().starts_with(Prefix: "llvm.arm"))
2124 return false;
2125
2126 switch (F->getIntrinsicID()) {
2127 default: break;
2128 case Intrinsic::powi:
2129 case Intrinsic::sin:
2130 case Intrinsic::cos:
2131 case Intrinsic::sincos:
2132 case Intrinsic::pow:
2133 case Intrinsic::log:
2134 case Intrinsic::log10:
2135 case Intrinsic::log2:
2136 case Intrinsic::exp:
2137 case Intrinsic::exp2:
2138 return true;
2139 case Intrinsic::sqrt:
2140 case Intrinsic::fabs:
2141 case Intrinsic::copysign:
2142 case Intrinsic::floor:
2143 case Intrinsic::ceil:
2144 case Intrinsic::trunc:
2145 case Intrinsic::rint:
2146 case Intrinsic::nearbyint:
2147 case Intrinsic::round:
2148 case Intrinsic::canonicalize:
2149 case Intrinsic::lround:
2150 case Intrinsic::llround:
2151 case Intrinsic::lrint:
2152 case Intrinsic::llrint:
2153 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
2154 return true;
2155 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
2156 return true;
2157 // Some operations can be handled by vector instructions and assume
2158 // unsupported vectors will be expanded into supported scalar ones.
2159 // TODO Handle scalar operations properly.
2160 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
2161 case Intrinsic::masked_store:
2162 case Intrinsic::masked_load:
2163 case Intrinsic::masked_gather:
2164 case Intrinsic::masked_scatter:
2165 return !ST->hasMVEIntegerOps();
2166 case Intrinsic::sadd_with_overflow:
2167 case Intrinsic::uadd_with_overflow:
2168 case Intrinsic::ssub_with_overflow:
2169 case Intrinsic::usub_with_overflow:
2170 case Intrinsic::sadd_sat:
2171 case Intrinsic::uadd_sat:
2172 case Intrinsic::ssub_sat:
2173 case Intrinsic::usub_sat:
2174 return false;
2175 }
2176
2177 return BaseT::isLoweredToCall(F);
2178}
2179
2180bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) const {
2181 unsigned ISD = TLI->InstructionOpcodeToISD(Opcode: I.getOpcode());
2182 EVT VT = TLI->getValueType(DL, Ty: I.getType(), AllowUnknown: true);
2183 if (TLI->getOperationAction(Op: ISD, VT) == TargetLowering::LibCall)
2184 return true;
2185
2186 // Check if an intrinsic will be lowered to a call and assume that any
2187 // other CallInst will generate a bl.
2188 if (auto *Call = dyn_cast<CallInst>(Val: &I)) {
2189 if (auto *II = dyn_cast<IntrinsicInst>(Val: Call)) {
2190 switch(II->getIntrinsicID()) {
2191 case Intrinsic::memcpy:
2192 case Intrinsic::memset:
2193 case Intrinsic::memmove:
2194 return getNumMemOps(I: II) == -1;
2195 default:
2196 if (const Function *F = Call->getCalledFunction())
2197 return isLoweredToCall(F);
2198 }
2199 }
2200 return true;
2201 }
2202
2203 // FPv5 provides conversions between integer, double-precision,
2204 // single-precision, and half-precision formats.
2205 switch (I.getOpcode()) {
2206 default:
2207 break;
2208 case Instruction::FPToSI:
2209 case Instruction::FPToUI:
2210 case Instruction::SIToFP:
2211 case Instruction::UIToFP:
2212 case Instruction::FPTrunc:
2213 case Instruction::FPExt:
2214 return !ST->hasFPARMv8Base();
2215 }
2216
2217 // FIXME: Unfortunately the approach of checking the Operation Action does
2218 // not catch all cases of Legalization that use library calls. Our
2219 // Legalization step categorizes some transformations into library calls as
2220 // Custom, Expand or even Legal when doing type legalization. So for now
2221 // we have to special case for instance the SDIV of 64bit integers and the
2222 // use of floating point emulation.
2223 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
2224 switch (ISD) {
2225 default:
2226 break;
2227 case ISD::SDIV:
2228 case ISD::UDIV:
2229 case ISD::SREM:
2230 case ISD::UREM:
2231 case ISD::SDIVREM:
2232 case ISD::UDIVREM:
2233 return true;
2234 }
2235 }
2236
2237 // Assume all other non-float operations are supported.
2238 if (!VT.isFloatingPoint())
2239 return false;
2240
2241 // We'll need a library call to handle most floats when using soft-float.
2242 if (TLI->useSoftFloat()) {
2243 switch (I.getOpcode()) {
2244 default:
2245 return true;
2246 case Instruction::Alloca:
2247 case Instruction::Load:
2248 case Instruction::Store:
2249 case Instruction::Select:
2250 case Instruction::PHI:
2251 return false;
2252 }
2253 }
2254
2255 // We'll need a libcall to perform double precision operations on a single
2256 // precision only FPU.
2257 if (I.getType()->isDoubleTy() && !ST->hasFP64())
2258 return true;
2259
2260 // Likewise for half precision arithmetic.
2261 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2262 return true;
2263
2264 return false;
2265}
2266
2267bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
2268 AssumptionCache &AC,
2269 TargetLibraryInfo *LibInfo,
2270 HardwareLoopInfo &HWLoopInfo) const {
2271 // Low-overhead branches are only supported in the 'low-overhead branch'
2272 // extension of v8.1-m.
2273 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2274 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2275 return false;
2276 }
2277
2278 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2279 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2280 return false;
2281 }
2282
2283 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2284 if (isa<SCEVCouldNotCompute>(Val: BackedgeTakenCount)) {
2285 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2286 return false;
2287 }
2288
2289 const SCEV *TripCountSCEV =
2290 SE.getAddExpr(LHS: BackedgeTakenCount,
2291 RHS: SE.getOne(Ty: BackedgeTakenCount->getType()));
2292
2293 // We need to store the trip count in LR, a 32-bit register.
2294 if (SE.getUnsignedRangeMax(S: TripCountSCEV).getBitWidth() > 32) {
2295 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2296 return false;
2297 }
2298
2299 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2300 // point in generating a hardware loop if that's going to happen.
2301
2302 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2303 if (auto *Call = dyn_cast<IntrinsicInst>(Val: &I)) {
2304 switch (Call->getIntrinsicID()) {
2305 default:
2306 break;
2307 case Intrinsic::start_loop_iterations:
2308 case Intrinsic::test_start_loop_iterations:
2309 case Intrinsic::loop_decrement:
2310 case Intrinsic::loop_decrement_reg:
2311 return true;
2312 }
2313 }
2314 return false;
2315 };
2316
2317 // Scan the instructions to see if there's any that we know will turn into a
2318 // call or if this loop is already a low-overhead loop or will become a tail
2319 // predicated loop.
2320 bool IsTailPredLoop = false;
2321 auto ScanLoop = [&](Loop *L) {
2322 for (auto *BB : L->getBlocks()) {
2323 for (auto &I : *BB) {
2324 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2325 isa<InlineAsm>(Val: I)) {
2326 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2327 return false;
2328 }
2329 if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
2330 IsTailPredLoop |=
2331 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2332 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2333 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2334 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2335 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2336 }
2337 }
2338 return true;
2339 };
2340
2341 // Visit inner loops.
2342 for (auto *Inner : *L)
2343 if (!ScanLoop(Inner))
2344 return false;
2345
2346 if (!ScanLoop(L))
2347 return false;
2348
2349 // TODO: Check whether the trip count calculation is expensive. If L is the
2350 // inner loop but we know it has a low trip count, calculating that trip
2351 // count (in the parent loop) may be detrimental.
2352
2353 LLVMContext &C = L->getHeader()->getContext();
2354 HWLoopInfo.CounterInReg = true;
2355 HWLoopInfo.IsNestingLegal = false;
2356 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2357 HWLoopInfo.CountType = Type::getInt32Ty(C);
2358 HWLoopInfo.LoopDecrement = ConstantInt::get(Ty: HWLoopInfo.CountType, V: 1);
2359 return true;
2360}
2361
2362static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2363 // We don't allow icmp's, and because we only look at single block loops,
2364 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2365 if (isa<ICmpInst>(Val: &I) && ++ICmpCount > 1)
2366 return false;
2367 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2368 // not currently canonical, but soon will be. Code without them uses icmp, and
2369 // so is not tail predicated as per the condition above. In order to get the
2370 // same performance we treat min and max the same as an icmp for tailpred
2371 // purposes for the moment (we often rely on non-tailpred and higher VF's to
2372 // pick more optimal instructions like VQDMULH. They need to be recognized
2373 // directly by the vectorizer).
2374 if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
2375 if ((II->getIntrinsicID() == Intrinsic::smin ||
2376 II->getIntrinsicID() == Intrinsic::smax ||
2377 II->getIntrinsicID() == Intrinsic::umin ||
2378 II->getIntrinsicID() == Intrinsic::umax) &&
2379 ++ICmpCount > 1)
2380 return false;
2381
2382 if (isa<FCmpInst>(Val: &I))
2383 return false;
2384
2385 // We could allow extending/narrowing FP loads/stores, but codegen is
2386 // too inefficient so reject this for now.
2387 if (isa<FPExtInst>(Val: &I) || isa<FPTruncInst>(Val: &I))
2388 return false;
2389
2390 // Extends have to be extending-loads
2391 if (isa<SExtInst>(Val: &I) || isa<ZExtInst>(Val: &I) )
2392 if (!I.getOperand(i: 0)->hasOneUse() || !isa<LoadInst>(Val: I.getOperand(i: 0)))
2393 return false;
2394
2395 // Truncs have to be narrowing-stores
2396 if (isa<TruncInst>(Val: &I) )
2397 if (!I.hasOneUse() || !isa<StoreInst>(Val: *I.user_begin()))
2398 return false;
2399
2400 return true;
2401}
2402
2403// To set up a tail-predicated loop, we need to know the total number of
2404// elements processed by that loop. Thus, we need to determine the element
2405// size and:
2406// 1) it should be uniform for all operations in the vector loop, so we
2407// e.g. don't want any widening/narrowing operations.
2408// 2) it should be smaller than i64s because we don't have vector operations
2409// that work on i64s.
2410// 3) we don't want elements to be reversed or shuffled, to make sure the
2411// tail-predication masks/predicates the right lanes.
2412//
2413static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2414 const DataLayout &DL,
2415 const LoopAccessInfo *LAI) {
2416 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2417
2418 // If there are live-out values, it is probably a reduction. We can predicate
2419 // most reduction operations freely under MVE using a combination of
2420 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2421 // floating point and integer reductions, but don't check for operators
2422 // specifically here. If the value ends up not being a reduction (and so the
2423 // vectorizer cannot tailfold the loop), we should fall back to standard
2424 // vectorization automatically.
2425 SmallVector< Instruction *, 8 > LiveOuts;
2426 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2427 bool ReductionsDisabled =
2428 EnableTailPredication == TailPredication::EnabledNoReductions ||
2429 EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2430
2431 for (auto *I : LiveOuts) {
2432 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2433 !I->getType()->isHalfTy()) {
2434 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2435 "live-out value\n");
2436 return false;
2437 }
2438 if (ReductionsDisabled) {
2439 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2440 return false;
2441 }
2442 }
2443
2444 // Next, check that all instructions can be tail-predicated.
2445 PredicatedScalarEvolution PSE = LAI->getPSE();
2446 int ICmpCount = 0;
2447
2448 for (BasicBlock *BB : L->blocks()) {
2449 for (Instruction &I : BB->instructionsWithoutDebug()) {
2450 if (isa<PHINode>(Val: &I))
2451 continue;
2452 if (!canTailPredicateInstruction(I, ICmpCount)) {
2453 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2454 return false;
2455 }
2456
2457 Type *T = I.getType();
2458 if (T->getScalarSizeInBits() > 32) {
2459 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2460 return false;
2461 }
2462 if (isa<StoreInst>(Val: I) || isa<LoadInst>(Val: I)) {
2463 Value *Ptr = getLoadStorePointerOperand(V: &I);
2464 Type *AccessTy = getLoadStoreType(I: &I);
2465 int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, Lp: L).value_or(u: 0);
2466 if (NextStride == 1) {
2467 // TODO: for now only allow consecutive strides of 1. We could support
2468 // other strides as long as they are uniform, but let's keep it simple
2469 // for now.
2470 continue;
2471 } else if (NextStride == -1 ||
2472 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2473 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2474 LLVM_DEBUG(dbgs()
2475 << "Consecutive strides of 2 found, vld2/vst2 can't "
2476 "be tail-predicated.\n");
2477 return false;
2478 // TODO: don't tail predicate if there is a reversed load?
2479 } else if (EnableMaskedGatherScatters) {
2480 // Gather/scatters do allow loading from arbitrary strides, at
2481 // least if they are loop invariant.
2482 // TODO: Loop variant strides should in theory work, too, but
2483 // this requires further testing.
2484 const SCEV *PtrScev = PSE.getSE()->getSCEV(V: Ptr);
2485 if (auto AR = dyn_cast<SCEVAddRecExpr>(Val: PtrScev)) {
2486 const SCEV *Step = AR->getStepRecurrence(SE&: *PSE.getSE());
2487 if (PSE.getSE()->isLoopInvariant(S: Step, L))
2488 continue;
2489 }
2490 }
2491 LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2492 "tail-predicate.\n");
2493 return false;
2494 }
2495 }
2496 }
2497
2498 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2499 return true;
2500}
2501
2502bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
2503 if (!EnableTailPredication) {
2504 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2505 return false;
2506 }
2507
2508 // Creating a predicated vector loop is the first step for generating a
2509 // tail-predicated hardware loop, for which we need the MVE masked
2510 // load/stores instructions:
2511 if (!ST->hasMVEIntegerOps())
2512 return false;
2513
2514 LoopVectorizationLegality *LVL = TFI->LVL;
2515 Loop *L = LVL->getLoop();
2516
2517 // For now, restrict this to single block loops.
2518 if (L->getNumBlocks() > 1) {
2519 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2520 "loop.\n");
2521 return false;
2522 }
2523
2524 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2525
2526 LoopInfo *LI = LVL->getLoopInfo();
2527 HardwareLoopInfo HWLoopInfo(L);
2528 if (!HWLoopInfo.canAnalyze(LI&: *LI)) {
2529 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2530 "analyzable.\n");
2531 return false;
2532 }
2533
2534 AssumptionCache *AC = LVL->getAssumptionCache();
2535 ScalarEvolution *SE = LVL->getScalarEvolution();
2536
2537 // This checks if we have the low-overhead branch architecture
2538 // extension, and if we will create a hardware-loop:
2539 if (!isHardwareLoopProfitable(L, SE&: *SE, AC&: *AC, LibInfo: TFI->TLI, HWLoopInfo)) {
2540 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2541 "profitable.\n");
2542 return false;
2543 }
2544
2545 DominatorTree *DT = LVL->getDominatorTree();
2546 if (!HWLoopInfo.isHardwareLoopCandidate(SE&: *SE, LI&: *LI, DT&: *DT)) {
2547 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2548 "a candidate.\n");
2549 return false;
2550 }
2551
2552 return canTailPredicateLoop(L, LI, SE&: *SE, DL, LAI: LVL->getLAI());
2553}
2554
2555TailFoldingStyle
2556ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
2557 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2558 return TailFoldingStyle::DataWithoutLaneMask;
2559
2560 // Intrinsic @llvm.get.active.lane.mask is supported.
2561 // It is used in the MVETailPredication pass, which requires the number of
2562 // elements processed by this vector loop to set up the tail-predicated
2563 // loop.
2564 return TailFoldingStyle::Data;
2565}

2566void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2567 TTI::UnrollingPreferences &UP,
2568 OptimizationRemarkEmitter *ORE) const {
2569 // Enable upper-bound unrolling universally, provided that we do not see an
2570 // active lane mask, which is better kept as a loop to become tail
2571 // predicated than to be conditionally unrolled.
2572 UP.UpperBound =
2573 !ST->hasMVEIntegerOps() || !any_of(Range&: *L->getHeader(), P: [](Instruction &I) {
2574 return isa<IntrinsicInst>(Val: I) &&
2575 cast<IntrinsicInst>(Val&: I).getIntrinsicID() ==
2576 Intrinsic::get_active_lane_mask;
2577 });
2578
2579 // Only currently enable these preferences for M-Class cores.
2580 if (!ST->isMClass())
2581 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2582
2583 // Disable loop unrolling for Oz and Os.
2584 UP.OptSizeThreshold = 0;
2585 UP.PartialOptSizeThreshold = 0;
2586 if (L->getHeader()->getParent()->hasOptSize())
2587 return;
2588
2589 SmallVector<BasicBlock*, 4> ExitingBlocks;
2590 L->getExitingBlocks(ExitingBlocks);
2591 LLVM_DEBUG(dbgs() << "Loop has:\n"
2592 << "Blocks: " << L->getNumBlocks() << "\n"
2593 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2594
2595 // Allow at most one exit other than the latch. This acts as an early exit
2596 // as it mirrors the profitability calculation of the runtime unroller.
2597 if (ExitingBlocks.size() > 2)
2598 return;
2599
2600 // Limit the CFG of the loop body for targets with a branch predictor.
2601 // Allowing 4 blocks permits if-then-else diamonds in the body.
2602 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2603 return;
2604
2605 // Don't unroll vectorized loops, including the remainder loop
2606 if (getBooleanLoopAttribute(TheLoop: L, Name: "llvm.loop.isvectorized"))
2607 return;
2608
2609 // Scan the loop: don't unroll loops with calls as this could prevent
2610 // inlining.
2611 InstructionCost Cost = 0;
2612 for (auto *BB : L->getBlocks()) {
2613 for (auto &I : *BB) {
2614 // Don't unroll vectorised loops. MVE does not benefit from unrolling as
2615 // much as scalar code does.
2616 if (I.getType()->isVectorTy())
2617 return;
2618
2619 if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I)) {
2620 if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction()) {
2621 if (!isLoweredToCall(F))
2622 continue;
2623 }
2624 return;
2625 }
2626
2627 SmallVector<const Value*, 4> Operands(I.operand_values());
2628 Cost += getInstructionCost(U: &I, Operands,
2629 CostKind: TargetTransformInfo::TCK_SizeAndLatency);
2630 }
2631 }
2632
2633 // On v6m cores, there are very few registers available. We can easily end up
2634 // spilling and reloading more registers in an unrolled loop. Look at the
2635 // number of LCSSA phis as a rough measure of how many registers will need to
2636 // be live out of the loop, reducing the default unroll count if more than 1
2637 // value is needed. In the long run, all of this should be learnt by a
2638 // machine.
2639 unsigned UnrollCount = 4;
2640 if (ST->isThumb1Only()) {
2641 unsigned ExitingValues = 0;
2642 SmallVector<BasicBlock *, 4> ExitBlocks;
2643 L->getExitBlocks(ExitBlocks);
2644 for (auto *Exit : ExitBlocks) {
2645 // Count the number of LCSSA phis. Exclude values coming from GEP's as
2646 // only the last is expected to be needed for address operands.
2647 unsigned LiveOuts = count_if(Range: Exit->phis(), P: [](auto &PH) {
2648 return PH.getNumOperands() != 1 ||
2649 !isa<GetElementPtrInst>(PH.getOperand(0));
2650 });
2651 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2652 }
2653 if (ExitingValues)
2654 UnrollCount /= ExitingValues;
2655 if (UnrollCount <= 1)
2656 return;
2657 }
2658
2659 // For processors with low overhead branching (LOB), runtime unrolling the
2660 // innermost loop is often detrimental to performance. In these cases the loop
2661 // remainder gets unrolled into a series of compare-and-jump blocks, which in
2662 // deeply nested loops get executed multiple times, negating the benefits of
2663 // LOB. This is particularly noticeable when the loop trip count of the
2664 // innermost loop varies within the outer loop, such as in the case of
2665 // triangular matrix decompositions. In these cases we will prefer to not
2666 // unroll the innermost loop, with the intention for it to be executed as a
2667 // low overhead loop.
2668 bool Runtime = true;
2669 if (ST->hasLOB()) {
2670 if (SE.hasLoopInvariantBackedgeTakenCount(L)) {
2671 const auto *BETC = SE.getBackedgeTakenCount(L);
2672 auto *Outer = L->getOutermostLoop();
2673 if ((L != Outer && Outer != L->getParentLoop()) ||
2674 (L != Outer && BETC && !SE.isLoopInvariant(S: BETC, L: Outer))) {
2675 Runtime = false;
2676 }
2677 }
2678 }
2679
2680 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2681 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2682
2683 UP.Partial = true;
2684 UP.Runtime = Runtime;
2685 UP.UnrollRemainder = true;
2686 UP.DefaultUnrollRuntimeCount = UnrollCount;
2687 UP.UnrollAndJam = true;
2688 UP.UnrollAndJamInnerLoopThreshold = 60;
2689
  // Forcing the unrolling of small loops can be very useful because of the
  // branch-taken cost of the backedge.
2692 if (Cost < 12)
2693 UP.Force = true;
2694}
2695
2696void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2697 TTI::PeelingPreferences &PP) const {
2698 BaseT::getPeelingPreferences(L, SE, PP);
2699}
2700
2701bool ARMTTIImpl::preferInLoopReduction(RecurKind Kind, Type *Ty) const {
2702 if (!ST->hasMVEIntegerOps())
2703 return false;
2704
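  // MVE provides vector add-reduction instructions (e.g. VADDV / VADDLV) that
  // accumulate into a scalar register, so in-loop integer add reductions are
  // generally profitable.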
2705 unsigned ScalarBits = Ty->getScalarSizeInBits();
2706 switch (Kind) {
2707 case RecurKind::Add:
2708 return ScalarBits <= 64;
2709 default:
2710 return false;
2711 }
2712}
2713
bool ARMTTIImpl::preferPredicatedReductionSelect() const {
  return ST->hasMVEIntegerOps();
}
2719
2720InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
2721 StackOffset BaseOffset,
2722 bool HasBaseReg, int64_t Scale,
2723 unsigned AddrSpace) const {
2724 TargetLoweringBase::AddrMode AM;
2725 AM.BaseGV = BaseGV;
2726 AM.BaseOffs = BaseOffset.getFixed();
2727 AM.HasBaseReg = HasBaseReg;
2728 AM.Scale = Scale;
2729 AM.ScalableOffset = BaseOffset.getScalable();
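  // e.g. (illustrative) an access of the form [r0, r1, lsl #2] corresponds to
  // HasBaseReg == true and Scale == 4, with no immediate offset.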
2730 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AS: AddrSpace)) {
2731 if (ST->hasFPAO())
2732 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2733 return 0;
2734 }
2735 return InstructionCost::getInvalid();
2736}
2737
2738bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2739 if (Thumb) {
2740 // B.W is available in any Thumb2-supporting target, and also in every
2741 // version of Armv8-M, even Baseline which does not include the rest of
2742 // Thumb2.
2743 return ST->isThumb2() || ST->hasV8MBaselineOps();
2744 } else {
2745 // B is available in all versions of the Arm ISA, so the only question is
2746 // whether that ISA is available at all.
2747 return ST->hasARMOps();
2748 }
2749}
2750
/// Check if Ext1 and Ext2 are extends (sext or zext) that double the bitwidth
/// of the vector elements.
2753static bool areExtractExts(Value *Ext1, Value *Ext2) {
2754 using namespace PatternMatch;
2755
2756 auto areExtDoubled = [](Instruction *Ext) {
2757 return Ext->getType()->getScalarSizeInBits() ==
2758 2 * Ext->getOperand(i: 0)->getType()->getScalarSizeInBits();
2759 };
2760
2761 if (!match(V: Ext1, P: m_ZExtOrSExt(Op: m_Value())) ||
2762 !match(V: Ext2, P: m_ZExtOrSExt(Op: m_Value())) ||
2763 !areExtDoubled(cast<Instruction>(Val: Ext1)) ||
2764 !areExtDoubled(cast<Instruction>(Val: Ext2)))
2765 return false;
2766
2767 return true;
2768}
2769
2770/// Check if sinking \p I's operands to I's basic block is profitable, because
2771/// the operands can be folded into a target instruction, e.g.
2772/// sext/zext can be folded into vsubl.
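/// For example (illustrative IR):
///   %a = sext <8 x i8> %x to <8 x i16>
///   %b = sext <8 x i8> %y to <8 x i16>
///   %s = sub <8 x i16> %a, %b   ; with the exts sunk this can select vsubl.s8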
2773bool ARMTTIImpl::isProfitableToSinkOperands(Instruction *I,
2774 SmallVectorImpl<Use *> &Ops) const {
2775 using namespace PatternMatch;
2776
2777 if (!I->getType()->isVectorTy())
2778 return false;
2779
2780 if (ST->hasNEON()) {
2781 switch (I->getOpcode()) {
2782 case Instruction::Sub:
2783 case Instruction::Add: {
2784 if (!areExtractExts(Ext1: I->getOperand(i: 0), Ext2: I->getOperand(i: 1)))
2785 return false;
2786 Ops.push_back(Elt: &I->getOperandUse(i: 0));
2787 Ops.push_back(Elt: &I->getOperandUse(i: 1));
2788 return true;
2789 }
2790 default:
2791 return false;
2792 }
2793 }
2794
2795 if (!ST->hasMVEIntegerOps())
2796 return false;
2797
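  // IsFMSMul: true if I is an fmul whose only use is the second operand of an
  // fsub, i.e. an 'a - b * c' pattern that is expected to become a fused
  // multiply-subtract.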
2798 auto IsFMSMul = [&](Instruction *I) {
2799 if (!I->hasOneUse())
2800 return false;
2801 auto *Sub = cast<Instruction>(Val: *I->users().begin());
2802 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(i: 1) == I;
2803 };
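  // IsFMS: true if either multiplicand of the fma intrinsic I is an fneg, so
  // the call is effectively a fused multiply-subtract.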
2804 auto IsFMS = [&](Instruction *I) {
2805 if (match(V: I->getOperand(i: 0), P: m_FNeg(X: m_Value())) ||
2806 match(V: I->getOperand(i: 1), P: m_FNeg(X: m_Value())))
2807 return true;
2808 return false;
2809 };
2810
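  // IsSinker: true if instruction I can make direct use of a splatted value in
  // operand position Operand. Most candidates accept the splat in either
  // position; subtracts, shifts and their saturating/predicated forms only as
  // the second operand.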
2811 auto IsSinker = [&](Instruction *I, int Operand) {
2812 switch (I->getOpcode()) {
2813 case Instruction::Add:
2814 case Instruction::Mul:
2815 case Instruction::FAdd:
2816 case Instruction::ICmp:
2817 case Instruction::FCmp:
2818 return true;
2819 case Instruction::FMul:
2820 return !IsFMSMul(I);
2821 case Instruction::Sub:
2822 case Instruction::FSub:
2823 case Instruction::Shl:
2824 case Instruction::LShr:
2825 case Instruction::AShr:
2826 return Operand == 1;
2827 case Instruction::Call:
2828 if (auto *II = dyn_cast<IntrinsicInst>(Val: I)) {
2829 switch (II->getIntrinsicID()) {
2830 case Intrinsic::fma:
2831 return !IsFMS(I);
2832 case Intrinsic::sadd_sat:
2833 case Intrinsic::uadd_sat:
2834 case Intrinsic::arm_mve_add_predicated:
2835 case Intrinsic::arm_mve_mul_predicated:
2836 case Intrinsic::arm_mve_qadd_predicated:
2837 case Intrinsic::arm_mve_vhadd:
2838 case Intrinsic::arm_mve_hadd_predicated:
2839 case Intrinsic::arm_mve_vqdmull:
2840 case Intrinsic::arm_mve_vqdmull_predicated:
2841 case Intrinsic::arm_mve_vqdmulh:
2842 case Intrinsic::arm_mve_qdmulh_predicated:
2843 case Intrinsic::arm_mve_vqrdmulh:
2844 case Intrinsic::arm_mve_qrdmulh_predicated:
2845 case Intrinsic::arm_mve_fma_predicated:
2846 return true;
2847 case Intrinsic::ssub_sat:
2848 case Intrinsic::usub_sat:
2849 case Intrinsic::arm_mve_sub_predicated:
2850 case Intrinsic::arm_mve_qsub_predicated:
2851 case Intrinsic::arm_mve_hsub_predicated:
2852 case Intrinsic::arm_mve_vhsub:
2853 return Operand == 1;
2854 default:
2855 return false;
2856 }
2857 }
2858 return false;
2859 default:
2860 return false;
2861 }
2862 };
2863
2864 for (auto OpIdx : enumerate(First: I->operands())) {
2865 Instruction *Op = dyn_cast<Instruction>(Val: OpIdx.value().get());
    // Make sure we are not already sinking this operand.
2867 if (!Op || any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op; }))
2868 continue;
2869
2870 Instruction *Shuffle = Op;
2871 if (Shuffle->getOpcode() == Instruction::BitCast)
2872 Shuffle = dyn_cast<Instruction>(Val: Shuffle->getOperand(i: 0));
2873 // We are looking for a splat that can be sunk.
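    // e.g. (illustrative IR):
    //   %ins   = insertelement <4 x i32> undef, i32 %s, i32 0
    //   %splat = shufflevector <4 x i32> %ins, <4 x i32> undef,
    //                          <4 x i32> zeroinitializer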
2874 if (!Shuffle || !match(V: Shuffle, P: m_Shuffle(v1: m_InsertElt(Val: m_Undef(), Elt: m_Value(),
2875 Idx: m_ZeroInt()),
2876 v2: m_Undef(), mask: m_ZeroMask())))
2877 continue;
2878 if (!IsSinker(I, OpIdx.index()))
2879 continue;
2880
    // All uses of the shuffle should be sunk to avoid duplicating it across
    // GPR and vector registers.
2883 for (Use &U : Op->uses()) {
2884 Instruction *Insn = cast<Instruction>(Val: U.getUser());
2885 if (!IsSinker(Insn, U.getOperandNo()))
2886 return false;
2887 }
2888
2889 Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0));
2890 if (Shuffle != Op)
2891 Ops.push_back(Elt: &Op->getOperandUse(i: 0));
2892 Ops.push_back(Elt: &OpIdx.value());
2893 }
2894 return true;
2895}
2896
2897unsigned ARMTTIImpl::getNumBytesToPadGlobalArray(unsigned Size,
2898 Type *ArrayType) const {
2899 if (!UseWidenGlobalArrays) {
2900 LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
    return 0;
2902 }
2903
  // Don't modify non-integer array types.
2905 if (!ArrayType || !ArrayType->isArrayTy() ||
2906 !ArrayType->getArrayElementType()->isIntegerTy())
2907 return 0;
2908
  // We pad to 4-byte boundaries.
2910 if (Size % 4 == 0)
2911 return 0;
2912
2913 unsigned NumBytesToPad = 4 - (Size % 4);
2914 unsigned NewSize = Size + NumBytesToPad;
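  // e.g. (illustrative) a 13-byte array gets NumBytesToPad == 3 and
  // NewSize == 16.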
2915
  // Maximum number of bytes that memcpy is allowed to lower to loads/stores
  // before it falls back to a library call (__aeabi_memcpy).
2918 unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold();
2919
2920 if (NewSize > MaxMemIntrinsicSize)
2921 return 0;
2922
2923 return NumBytesToPad;
2924}
2925