1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "ARMTargetTransformInfo.h"
10#include "ARMSubtarget.h"
11#include "MCTargetDesc/ARMAddressingModes.h"
12#include "llvm/ADT/APInt.h"
13#include "llvm/ADT/SmallVector.h"
14#include "llvm/Analysis/LoopInfo.h"
15#include "llvm/CodeGen/CostTable.h"
16#include "llvm/CodeGen/ISDOpcodes.h"
17#include "llvm/CodeGen/ValueTypes.h"
18#include "llvm/CodeGenTypes/MachineValueType.h"
19#include "llvm/IR/BasicBlock.h"
20#include "llvm/IR/DataLayout.h"
21#include "llvm/IR/DerivedTypes.h"
22#include "llvm/IR/Instruction.h"
23#include "llvm/IR/Instructions.h"
24#include "llvm/IR/IntrinsicInst.h"
25#include "llvm/IR/Intrinsics.h"
26#include "llvm/IR/IntrinsicsARM.h"
27#include "llvm/IR/PatternMatch.h"
28#include "llvm/IR/Type.h"
29#include "llvm/Support/Casting.h"
30#include "llvm/Support/KnownBits.h"
31#include "llvm/Target/TargetMachine.h"
32#include "llvm/TargetParser/SubtargetFeature.h"
33#include "llvm/Transforms/InstCombine/InstCombiner.h"
34#include "llvm/Transforms/Utils/Local.h"
35#include "llvm/Transforms/Utils/LoopUtils.h"
36#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
37#include <algorithm>
38#include <cassert>
39#include <cstdint>
40#include <optional>
41#include <utility>
42
43using namespace llvm;
44
45#define DEBUG_TYPE "armtti"
46
47static cl::opt<bool> EnableMaskedLoadStores(
48 "enable-arm-maskedldst", cl::Hidden, cl::init(Val: true),
49 cl::desc("Enable the generation of masked loads and stores"));
50
51static cl::opt<bool> DisableLowOverheadLoops(
52 "disable-arm-loloops", cl::Hidden, cl::init(Val: false),
53 cl::desc("Disable the generation of low-overhead loops"));
54
55static cl::opt<bool>
56 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(Val: true),
57 cl::desc("Enable the generation of WLS loops"));
58
59static cl::opt<bool> UseWidenGlobalArrays(
60 "widen-global-strings", cl::Hidden, cl::init(Val: true),
61 cl::desc("Enable the widening of global strings to alignment boundaries"));
62
63extern cl::opt<TailPredication::Mode> EnableTailPredication;
64
65extern cl::opt<bool> EnableMaskedGatherScatters;
66
67extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
68
69static cl::opt<int> ArmForceUnrollThreshold(
70 "arm-force-unroll-threshold", cl::init(Val: 12), cl::Hidden,
71 cl::desc(
72 "Threshold for forced unrolling of small loops in Arm architecture"));
73
74/// Convert a vector load intrinsic into a simple llvm load instruction.
75/// This is beneficial when the underlying object being addressed comes
76/// from a constant, since we get constant-folding for free.
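/// For example (an illustrative sketch), a call along the lines of
///   %v = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr %p, i32 16)
/// becomes a plain `load <4 x i32>, ptr %p, align 16`, which later passes can
/// constant-fold when %p refers to a constant global.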
77static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
78 InstCombiner::BuilderTy &Builder) {
79 auto *IntrAlign = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 1));
80
81 if (!IntrAlign)
82 return nullptr;
83
84 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
85 ? MemAlign
86 : IntrAlign->getLimitedValue();
87
88 if (!isPowerOf2_32(Value: Alignment))
89 return nullptr;
90
91 return Builder.CreateAlignedLoad(Ty: II.getType(), Ptr: II.getArgOperand(i: 0),
92 Align: Align(Alignment));
93}
94
95bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
96 const Function *Callee) const {
97 const TargetMachine &TM = getTLI()->getTargetMachine();
98 const FeatureBitset &CallerBits =
99 TM.getSubtargetImpl(*Caller)->getFeatureBits();
100 const FeatureBitset &CalleeBits =
101 TM.getSubtargetImpl(*Callee)->getFeatureBits();
102
  // To inline a callee, all features not in the allowed list must match
  // exactly.
  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
                    (CalleeBits & ~InlineFeaturesAllowed);
  // For features in the allowed list, the callee's features must be a subset
  // of the caller's.
108 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
109 (CalleeBits & InlineFeaturesAllowed);
110
111 LLVM_DEBUG({
112 if (!MatchExact || !MatchSubset) {
113 dbgs() << "=== Inline compatibility debug ===\n";
114 dbgs() << "Caller: " << Caller->getName() << "\n";
115 dbgs() << "Callee: " << Callee->getName() << "\n";
116
117 // Bit diffs
118 FeatureBitset MissingInCaller = CalleeBits & ~CallerBits; // callee-only
119 FeatureBitset ExtraInCaller = CallerBits & ~CalleeBits; // caller-only
120
121 // Counts
122 dbgs() << "Only-in-caller bit count: " << ExtraInCaller.count() << "\n";
123 dbgs() << "Only-in-callee bit count: " << MissingInCaller.count() << "\n";
124
125 dbgs() << "Only-in-caller feature indices [";
126 {
127 bool First = true;
128 for (size_t I = 0, E = ExtraInCaller.size(); I < E; ++I) {
129 if (ExtraInCaller.test(I)) {
130 if (!First)
131 dbgs() << ", ";
132 dbgs() << I;
133 First = false;
134 }
135 }
136 }
137 dbgs() << "]\n";
138
139 dbgs() << "Only-in-callee feature indices [";
140 {
141 bool First = true;
142 for (size_t I = 0, E = MissingInCaller.size(); I < E; ++I) {
143 if (MissingInCaller.test(I)) {
144 if (!First)
145 dbgs() << ", ";
146 dbgs() << I;
147 First = false;
148 }
149 }
150 }
151 dbgs() << "]\n";
152
153 // Indices map to features as found in
154 // llvm-project/(your_build)/lib/Target/ARM/ARMGenSubtargetInfo.inc
155 dbgs() << "MatchExact=" << (MatchExact ? "true" : "false")
156 << " MatchSubset=" << (MatchSubset ? "true" : "false") << "\n";
157 }
158 });
159 return MatchExact && MatchSubset;
160}
161
162TTI::AddressingModeKind
163ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
164 ScalarEvolution *SE) const {
165 if (ST->hasMVEIntegerOps())
166 return TTI::AMK_PostIndexed;
167
168 if (L->getHeader()->getParent()->hasOptSize())
169 return TTI::AMK_None;
170
  if (ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1)
173 return TTI::AMK_PreIndexed;
174
175 return TTI::AMK_None;
176}
177
178std::optional<Instruction *>
179ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
180 using namespace PatternMatch;
181 Intrinsic::ID IID = II.getIntrinsicID();
182 switch (IID) {
183 default:
184 break;
185 case Intrinsic::arm_neon_vld1: {
186 Align MemAlign =
187 getKnownAlignment(V: II.getArgOperand(i: 0), DL: IC.getDataLayout(), CxtI: &II,
188 AC: &IC.getAssumptionCache(), DT: &IC.getDominatorTree());
189 if (Value *V = simplifyNeonVld1(II, MemAlign: MemAlign.value(), Builder&: IC.Builder)) {
190 return IC.replaceInstUsesWith(I&: II, V);
191 }
192 break;
193 }
194
195 case Intrinsic::arm_neon_vld2:
196 case Intrinsic::arm_neon_vld3:
197 case Intrinsic::arm_neon_vld4:
198 case Intrinsic::arm_neon_vld2lane:
199 case Intrinsic::arm_neon_vld3lane:
200 case Intrinsic::arm_neon_vld4lane:
201 case Intrinsic::arm_neon_vst1:
202 case Intrinsic::arm_neon_vst2:
203 case Intrinsic::arm_neon_vst3:
204 case Intrinsic::arm_neon_vst4:
205 case Intrinsic::arm_neon_vst2lane:
206 case Intrinsic::arm_neon_vst3lane:
207 case Intrinsic::arm_neon_vst4lane: {
208 Align MemAlign =
209 getKnownAlignment(V: II.getArgOperand(i: 0), DL: IC.getDataLayout(), CxtI: &II,
210 AC: &IC.getAssumptionCache(), DT: &IC.getDominatorTree());
211 unsigned AlignArg = II.arg_size() - 1;
212 Value *AlignArgOp = II.getArgOperand(i: AlignArg);
213 MaybeAlign Align = cast<ConstantInt>(Val: AlignArgOp)->getMaybeAlignValue();
214 if (Align && *Align < MemAlign) {
215 return IC.replaceOperand(
216 I&: II, OpNum: AlignArg,
217 V: ConstantInt::get(Ty: Type::getInt32Ty(C&: II.getContext()), V: MemAlign.value(),
218 IsSigned: false));
219 }
220 break;
221 }
222
223 case Intrinsic::arm_neon_vld1x2:
224 case Intrinsic::arm_neon_vld1x3:
225 case Intrinsic::arm_neon_vld1x4:
226 case Intrinsic::arm_neon_vst1x2:
227 case Intrinsic::arm_neon_vst1x3:
228 case Intrinsic::arm_neon_vst1x4: {
229 Align NewAlign =
230 getKnownAlignment(V: II.getArgOperand(i: 0), DL: IC.getDataLayout(), CxtI: &II,
231 AC: &IC.getAssumptionCache(), DT: &IC.getDominatorTree());
232 Align OldAlign = II.getParamAlign(ArgNo: 0).valueOrOne();
233 if (NewAlign > OldAlign)
234 II.addParamAttr(ArgNo: 0,
235 Attr: Attribute::getWithAlignment(Context&: II.getContext(), Alignment: NewAlign));
236 break;
237 }
238
239 case Intrinsic::arm_mve_pred_i2v: {
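    // This folds a round trip through the predicate representation:
    // converting a vector predicate to an i32 and back yields the original
    // vector, and an intervening XOR with all-ones becomes a vector NOT.
    // Roughly:
    //   i2v(v2i(x))          -> x
    //   i2v(v2i(x) ^ 0xffff) -> xor x, splat(true)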
240 Value *Arg = II.getArgOperand(i: 0);
241 Value *ArgArg;
242 if (match(V: Arg, P: PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
243 Op0: PatternMatch::m_Value(V&: ArgArg))) &&
244 II.getType() == ArgArg->getType()) {
245 return IC.replaceInstUsesWith(I&: II, V: ArgArg);
246 }
247 Constant *XorMask;
248 if (match(V: Arg, P: m_Xor(L: PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
249 Op0: PatternMatch::m_Value(V&: ArgArg)),
250 R: PatternMatch::m_Constant(C&: XorMask))) &&
251 II.getType() == ArgArg->getType()) {
252 if (auto *CI = dyn_cast<ConstantInt>(Val: XorMask)) {
253 if (CI->getValue().trunc(width: 16).isAllOnes()) {
254 auto TrueVector = IC.Builder.CreateVectorSplat(
255 NumElts: cast<FixedVectorType>(Val: II.getType())->getNumElements(),
256 V: IC.Builder.getTrue());
257 return BinaryOperator::Create(Op: Instruction::Xor, S1: ArgArg, S2: TrueVector);
258 }
259 }
260 }
261 KnownBits ScalarKnown(32);
262 if (IC.SimplifyDemandedBits(I: &II, OpNo: 0, DemandedMask: APInt::getLowBitsSet(numBits: 32, loBitsSet: 16),
263 Known&: ScalarKnown)) {
264 return &II;
265 }
266 break;
267 }
268 case Intrinsic::arm_mve_pred_v2i: {
269 Value *Arg = II.getArgOperand(i: 0);
270 Value *ArgArg;
271 if (match(V: Arg, P: PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
272 Op0: PatternMatch::m_Value(V&: ArgArg)))) {
273 return IC.replaceInstUsesWith(I&: II, V: ArgArg);
274 }
275
276 if (II.getMetadata(KindID: LLVMContext::MD_range))
277 break;
278
279 ConstantRange Range(APInt(32, 0), APInt(32, 0x10000));
280
281 if (auto CurrentRange = II.getRange()) {
282 Range = Range.intersectWith(CR: *CurrentRange);
283 if (Range == CurrentRange)
284 break;
285 }
286
287 II.addRangeRetAttr(CR: Range);
288 II.addRetAttr(Kind: Attribute::NoUndef);
289 return &II;
290 }
291 case Intrinsic::arm_mve_vadc:
292 case Intrinsic::arm_mve_vadc_predicated: {
293 unsigned CarryOp =
294 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
295 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
296 "Bad type for intrinsic!");
297
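    // Only bit 29 of the carry operand is consumed (matching the position of
    // the carry flag in FPSCR), so only that bit needs to be demanded.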
298 KnownBits CarryKnown(32);
299 if (IC.SimplifyDemandedBits(I: &II, OpNo: CarryOp, DemandedMask: APInt::getOneBitSet(numBits: 32, BitNo: 29),
300 Known&: CarryKnown)) {
301 return &II;
302 }
303 break;
304 }
305 case Intrinsic::arm_mve_vmldava: {
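    // If the only use of a vmldava with a zero accumulator is an add, the
    // added value can be folded into the accumulator operand instead.
    // Roughly: add(vmldava(acc=0, x, y), z) -> vmldava(acc=z, x, y).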
306 Instruction *I = cast<Instruction>(Val: &II);
307 if (I->hasOneUse()) {
308 auto *User = cast<Instruction>(Val: *I->user_begin());
309 Value *OpZ;
310 if (match(V: User, P: m_c_Add(L: m_Specific(V: I), R: m_Value(V&: OpZ))) &&
311 match(V: I->getOperand(i: 3), P: m_Zero())) {
312 Value *OpX = I->getOperand(i: 4);
313 Value *OpY = I->getOperand(i: 5);
314 Type *OpTy = OpX->getType();
315
316 IC.Builder.SetInsertPoint(User);
317 Value *V =
318 IC.Builder.CreateIntrinsic(ID: Intrinsic::arm_mve_vmldava, Types: {OpTy},
319 Args: {I->getOperand(i: 0), I->getOperand(i: 1),
320 I->getOperand(i: 2), OpZ, OpX, OpY});
321
322 IC.replaceInstUsesWith(I&: *User, V);
323 return IC.eraseInstFromFunction(I&: *User);
324 }
325 }
326 return std::nullopt;
327 }
328 }
329 return std::nullopt;
330}
331
332std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
333 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
334 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
335 std::function<void(Instruction *, unsigned, APInt, APInt &)>
336 SimplifyAndSetOp) const {
337
  // Compute the demanded elements for a narrowing MVE intrinsic. TopOpc is
  // the index of the operand that selects the Top/Bottom form, which can
  // differ between intrinsics.
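  // For example, a "top" narrowing such as vmovnt overwrites the odd lanes
  // of the destination, so only the even (bottom) lanes of operand 0 remain
  // demanded.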
  auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
342 unsigned NumElts = cast<FixedVectorType>(Val: II.getType())->getNumElements();
343 unsigned IsTop = cast<ConstantInt>(Val: II.getOperand(i_nocapture: TopOpc))->getZExtValue();
344
    // Only the odd or even lanes of operand 0 are demanded, depending on
    // whether this is a top or bottom instruction.
347 APInt DemandedElts =
348 APInt::getSplat(NewLen: NumElts, V: IsTop ? APInt::getLowBitsSet(numBits: 2, loBitsSet: 1)
349 : APInt::getHighBitsSet(numBits: 2, hiBitsSet: 1));
350 SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
351 // The other lanes will be defined from the inserted elements.
352 UndefElts &= APInt::getSplat(NewLen: NumElts, V: IsTop ? APInt::getLowBitsSet(numBits: 2, loBitsSet: 1)
353 : APInt::getHighBitsSet(numBits: 2, hiBitsSet: 1));
354 return std::nullopt;
355 };
356
357 switch (II.getIntrinsicID()) {
358 default:
359 break;
360 case Intrinsic::arm_mve_vcvt_narrow:
361 SimplifyNarrowInstrTopBottom(2);
362 break;
363 case Intrinsic::arm_mve_vqmovn:
364 SimplifyNarrowInstrTopBottom(4);
365 break;
366 case Intrinsic::arm_mve_vshrn:
367 SimplifyNarrowInstrTopBottom(7);
368 break;
369 }
370
371 return std::nullopt;
372}
373
374InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
375 TTI::TargetCostKind CostKind) const {
376 assert(Ty->isIntegerTy());
377
378 unsigned Bits = Ty->getPrimitiveSizeInBits();
379 if (Bits == 0 || Imm.getActiveBits() >= 64)
380 return 4;
381
382 int64_t SImmVal = Imm.getSExtValue();
383 uint64_t ZImmVal = Imm.getZExtValue();
384 if (!ST->isThumb()) {
385 if ((SImmVal >= 0 && SImmVal < 65536) ||
386 (ARM_AM::getSOImmVal(Arg: ZImmVal) != -1) ||
387 (ARM_AM::getSOImmVal(Arg: ~ZImmVal) != -1))
388 return 1;
389 return ST->hasV6T2Ops() ? 2 : 3;
390 }
391 if (ST->isThumb2()) {
392 if ((SImmVal >= 0 && SImmVal < 65536) ||
393 (ARM_AM::getT2SOImmVal(Arg: ZImmVal) != -1) ||
394 (ARM_AM::getT2SOImmVal(Arg: ~ZImmVal) != -1))
395 return 1;
396 return ST->hasV6T2Ops() ? 2 : 3;
397 }
  // Thumb1: any i8 immediate costs 1.
399 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
400 return 1;
401 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(V: ZImmVal))
402 return 2;
403 // Load from constantpool.
404 return 3;
405}
406
// Constants smaller than 256 fit in the immediate field of Thumb1
// instructions, so we return a cost of zero; otherwise we return 1.
409InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
410 const APInt &Imm,
411 Type *Ty) const {
412 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
413 return 0;
414
415 return 1;
416}
417
418// Checks whether Inst is part of a min(max()) or max(min()) pattern
419// that will match to an SSAT instruction. Returns the instruction being
420// saturated, or null if no saturation pattern was found.
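// For example (an illustrative case), smin(smax(x, -2048), 2047) clamps x to
// a signed 12-bit range and can be selected as "ssat #12", so the -2048
// constant needs no separate materialisation.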
421static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
422 Value *LHS, *RHS;
423 ConstantInt *C;
424 SelectPatternFlavor InstSPF = matchSelectPattern(V: Inst, LHS, RHS).Flavor;
425
426 if (InstSPF == SPF_SMAX &&
427 PatternMatch::match(V: RHS, P: PatternMatch::m_ConstantInt(CI&: C)) &&
428 C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
429
430 auto isSSatMin = [&](Value *MinInst) {
431 if (isa<SelectInst>(Val: MinInst)) {
432 Value *MinLHS, *MinRHS;
433 ConstantInt *MinC;
434 SelectPatternFlavor MinSPF =
435 matchSelectPattern(V: MinInst, LHS&: MinLHS, RHS&: MinRHS).Flavor;
436 if (MinSPF == SPF_SMIN &&
437 PatternMatch::match(V: MinRHS, P: PatternMatch::m_ConstantInt(CI&: MinC)) &&
438 MinC->getValue() == ((-Imm) - 1))
439 return true;
440 }
441 return false;
442 };
443
444 if (isSSatMin(Inst->getOperand(i: 1)))
445 return cast<Instruction>(Val: Inst->getOperand(i: 1))->getOperand(i: 1);
446 if (Inst->hasNUses(N: 2) &&
447 (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
448 return Inst->getOperand(i: 1);
449 }
450 return nullptr;
451}
452
// Look for an FP saturation pattern, where the instruction can be simplified
// to a fptosi.sat: max(min(fptosi)). The constant here is always free.
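// For example (roughly), smax(smin(fptosi(%x), 2147483647), -2147483648) can
// be selected as a saturating fptosi.sat, so hoisting the INT32_MIN constant
// out of the pattern would only pessimise it.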
455static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
456 if (Imm.getBitWidth() != 64 ||
457 Imm != APInt::getHighBitsSet(numBits: 64, hiBitsSet: 33)) // -2147483648
458 return false;
459 Value *FP = isSSATMinMaxPattern(Inst, Imm);
460 if (!FP && isa<ICmpInst>(Val: Inst) && Inst->hasOneUse())
461 FP = isSSATMinMaxPattern(Inst: cast<Instruction>(Val: *Inst->user_begin()), Imm);
462 if (!FP)
463 return false;
464 return isa<FPToSIInst>(Val: FP);
465}
466
467InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
468 const APInt &Imm, Type *Ty,
469 TTI::TargetCostKind CostKind,
470 Instruction *Inst) const {
471 // Division by a constant can be turned into multiplication, but only if we
472 // know it's constant. So it's not so much that the immediate is cheap (it's
473 // not), but that the alternative is worse.
474 // FIXME: this is probably unneeded with GlobalISel.
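  // For example, a udiv by 10 is typically lowered to a multiply-high by a
  // "magic" constant plus shifts, so the divisor never needs materialising as
  // an immediate.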
475 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
476 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
477 Idx == 1)
478 return 0;
479
  // Leave any gep offsets to CodeGenPrepare, which will do a better job of
  // splitting any large offsets.
482 if (Opcode == Instruction::GetElementPtr && Idx != 0)
483 return 0;
484
485 if (Opcode == Instruction::And) {
486 // UXTB/UXTH
487 if (Imm == 255 || Imm == 65535)
488 return 0;
489 // Conversion to BIC is free, and means we can use ~Imm instead.
490 return std::min(a: getIntImmCost(Imm, Ty, CostKind),
491 b: getIntImmCost(Imm: ~Imm, Ty, CostKind));
492 }
493
494 if (Opcode == Instruction::Add)
495 // Conversion to SUB is free, and means we can use -Imm instead.
496 return std::min(a: getIntImmCost(Imm, Ty, CostKind),
497 b: getIntImmCost(Imm: -Imm, Ty, CostKind));
498
499 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
500 Ty->getIntegerBitWidth() == 32) {
501 int64_t NegImm = -Imm.getSExtValue();
502 if (ST->isThumb2() && NegImm < 1<<12)
503 // icmp X, #-C -> cmn X, #C
504 return 0;
505 if (ST->isThumb() && NegImm < 1<<8)
506 // icmp X, #-C -> adds X, #C
507 return 0;
508 }
509
510 // xor a, -1 can always be folded to MVN
511 if (Opcode == Instruction::Xor && Imm.isAllOnes())
512 return 0;
513
  // Ensure that negative constants in min(max()) or max(min()) patterns that
  // match SSAT instructions don't get hoisted.
516 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
517 Ty->getIntegerBitWidth() <= 32) {
518 if (isSSATMinMaxPattern(Inst, Imm) ||
519 (isa<ICmpInst>(Val: Inst) && Inst->hasOneUse() &&
520 isSSATMinMaxPattern(Inst: cast<Instruction>(Val: *Inst->user_begin()), Imm)))
521 return 0;
522 }
523
524 if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
525 return 0;
526
527 // We can convert <= -1 to < 0, which is generally quite cheap.
528 if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
529 ICmpInst::Predicate Pred = cast<ICmpInst>(Val: Inst)->getPredicate();
530 if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
531 return std::min(a: getIntImmCost(Imm, Ty, CostKind),
532 b: getIntImmCost(Imm: Imm + 1, Ty, CostKind));
533 }
534
535 return getIntImmCost(Imm, Ty, CostKind);
536}
537
538InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
539 TTI::TargetCostKind CostKind,
540 const Instruction *I) const {
541 if (CostKind == TTI::TCK_RecipThroughput &&
542 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
    // FIXME: The vectorizer is highly sensitive to the cost of these
544 // instructions, which suggests that it may be using the costs incorrectly.
545 // But, for now, just make them free to avoid performance regressions for
546 // vector targets.
547 return 0;
548 }
549 return BaseT::getCFInstrCost(Opcode, CostKind, I);
550}
551
552InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
553 Type *Src,
554 TTI::CastContextHint CCH,
555 TTI::TargetCostKind CostKind,
556 const Instruction *I) const {
557 int ISD = TLI->InstructionOpcodeToISD(Opcode);
558 assert(ISD && "Invalid opcode");
559
560 // TODO: Allow non-throughput costs that aren't binary.
561 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
562 if (CostKind != TTI::TCK_RecipThroughput)
563 return Cost == 0 ? 0 : 1;
564 return Cost;
565 };
566 auto IsLegalFPType = [this](EVT VT) {
567 EVT EltVT = VT.getScalarType();
568 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
569 (EltVT == MVT::f64 && ST->hasFP64()) ||
570 (EltVT == MVT::f16 && ST->hasFullFP16());
571 };
572
573 EVT SrcTy = TLI->getValueType(DL, Ty: Src);
574 EVT DstTy = TLI->getValueType(DL, Ty: Dst);
575
576 if (!SrcTy.isSimple() || !DstTy.isSimple())
577 return AdjustCost(
578 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
579
  // Extending masked loads and truncating masked stores are expensive because
  // we currently don't split them. This means that we'll likely end up
  // loading/storing each element individually (hence the high cost).
583 if ((ST->hasMVEIntegerOps() &&
584 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
585 Opcode == Instruction::SExt)) ||
586 (ST->hasMVEFloatOps() &&
587 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
588 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
589 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
590 return 2 * DstTy.getVectorNumElements() *
591 ST->getMVEVectorCostFactor(CostKind);
592
  // The extension of other kinds of loads is free.
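  // For example, ldrb/ldrsb and ldrh/ldrsh perform the zero/sign extension as
  // part of the load itself.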
594 if (CCH == TTI::CastContextHint::Normal ||
595 CCH == TTI::CastContextHint::Masked) {
596 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
597 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::i32, .Src: MVT::i16, .Cost: 0},
598 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::i32, .Src: MVT::i16, .Cost: 0},
599 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::i32, .Src: MVT::i8, .Cost: 0},
600 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::i32, .Src: MVT::i8, .Cost: 0},
601 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::i16, .Src: MVT::i8, .Cost: 0},
602 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::i16, .Src: MVT::i8, .Cost: 0},
603 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::i64, .Src: MVT::i32, .Cost: 1},
604 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::i64, .Src: MVT::i32, .Cost: 1},
605 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::i64, .Src: MVT::i16, .Cost: 1},
606 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::i64, .Src: MVT::i16, .Cost: 1},
607 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::i64, .Src: MVT::i8, .Cost: 1},
608 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::i64, .Src: MVT::i8, .Cost: 1},
609 };
610 if (const auto *Entry = ConvertCostTableLookup(
611 Table: LoadConversionTbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
612 return AdjustCost(Entry->Cost);
613
614 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
615 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 0},
616 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 0},
617 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 0},
618 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 0},
619 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 0},
620 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 0},
        // The following extend from a legal type to an illegal type, so we
        // need to split the load. This introduces an extra load operation,
        // but the extend is still "free".
624 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 1},
625 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 1},
626 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 3},
627 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 3},
628 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 1},
629 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 1},
630 };
631 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
632 if (const auto *Entry =
633 ConvertCostTableLookup(Table: MVELoadConversionTbl, ISD,
634 Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
635 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
636 }
637
638 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
639 // FPExtends are similar but also require the VCVT instructions.
640 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f32, .Src: MVT::v4f16, .Cost: 1},
641 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f32, .Src: MVT::v8f16, .Cost: 3},
642 };
643 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
644 if (const auto *Entry =
645 ConvertCostTableLookup(Table: MVEFLoadConversionTbl, ISD,
646 Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
647 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
648 }
649
650 // The truncate of a store is free. This is the mirror of extends above.
651 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
652 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 0},
653 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 0},
654 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 0},
655 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 1},
656 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 1},
657 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 3},
658 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 1},
659 };
660 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
661 if (const auto *Entry =
662 ConvertCostTableLookup(Table: MVEStoreConversionTbl, ISD,
663 Dst: SrcTy.getSimpleVT(), Src: DstTy.getSimpleVT()))
664 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
665 }
666
667 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
668 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f32, .Src: MVT::v4f16, .Cost: 1},
669 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8f32, .Src: MVT::v8f16, .Cost: 3},
670 };
671 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
672 if (const auto *Entry =
673 ConvertCostTableLookup(Table: MVEFStoreConversionTbl, ISD,
674 Dst: SrcTy.getSimpleVT(), Src: DstTy.getSimpleVT()))
675 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
676 }
677 }
678
679 // NEON vector operations that can extend their inputs.
680 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
681 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
682 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
683 // vaddl
684 { .ISD: ISD::ADD, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 0 },
685 { .ISD: ISD::ADD, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 0 },
686 // vsubl
687 { .ISD: ISD::SUB, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 0 },
688 { .ISD: ISD::SUB, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 0 },
689 // vmull
690 { .ISD: ISD::MUL, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 0 },
691 { .ISD: ISD::MUL, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 0 },
692 // vshll
693 { .ISD: ISD::SHL, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 0 },
694 { .ISD: ISD::SHL, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 0 },
695 };
696
697 auto *User = cast<Instruction>(Val: *I->user_begin());
698 int UserISD = TLI->InstructionOpcodeToISD(Opcode: User->getOpcode());
699 if (auto *Entry = ConvertCostTableLookup(Table: NEONDoubleWidthTbl, ISD: UserISD,
700 Dst: DstTy.getSimpleVT(),
701 Src: SrcTy.getSimpleVT())) {
702 return AdjustCost(Entry->Cost);
703 }
704 }
705
706 // Single to/from double precision conversions.
707 if (Src->isVectorTy() && ST->hasNEON() &&
708 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
709 DstTy.getScalarType() == MVT::f32) ||
710 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
711 DstTy.getScalarType() == MVT::f64))) {
712 static const CostTblEntry NEONFltDblTbl[] = {
713 // Vector fptrunc/fpext conversions.
714 {.ISD: ISD::FP_ROUND, .Type: MVT::v2f64, .Cost: 2},
715 {.ISD: ISD::FP_EXTEND, .Type: MVT::v2f32, .Cost: 2},
716 {.ISD: ISD::FP_EXTEND, .Type: MVT::v4f32, .Cost: 4}};
717
718 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Src);
719 if (const auto *Entry = CostTableLookup(Table: NEONFltDblTbl, ISD, Ty: LT.second))
720 return AdjustCost(LT.first * Entry->Cost);
721 }
722
723 // Some arithmetic, load and store operations have specific instructions
724 // to cast up/down their types automatically at no extra cost.
725 // TODO: Get these tables to know at least what the related operations are.
726 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
727 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 1 },
728 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 1 },
729 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i32, .Cost: 1 },
730 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i32, .Cost: 1 },
731 { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: 0 },
732 { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i32, .Cost: 1 },
733
734 // The number of vmovl instructions for the extension.
735 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 1 },
736 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 1 },
737 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 2 },
738 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 2 },
739 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i8, .Cost: 3 },
740 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i8, .Cost: 3 },
741 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i16, .Cost: 2 },
742 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i16, .Cost: 2 },
743 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3 },
744 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3 },
745 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3 },
746 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3 },
747 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7 },
748 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7 },
749 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6 },
750 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6 },
751 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6 },
752 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6 },
753
754 // Operations that we legalize using splitting.
755 { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: 6 },
756 { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i32, .Cost: 3 },
757
758 // Vector float <-> i32 conversions.
759 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1 },
760 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1 },
761
762 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3 },
763 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3 },
764 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 2 },
765 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 2 },
766 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1 },
767 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1 },
768 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i1, .Cost: 3 },
769 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i1, .Cost: 3 },
770 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 3 },
771 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 3 },
772 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2 },
773 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2 },
774 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4 },
775 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4 },
776 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i32, .Cost: 2 },
777 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i32, .Cost: 2 },
778 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i16, .Cost: 8 },
779 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i16, .Cost: 8 },
780 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i32, .Cost: 4 },
781 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i32, .Cost: 4 },
782
783 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1 },
784 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1 },
785 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 3 },
786 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 3 },
787 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2 },
788 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2 },
789
790 // Vector double <-> i32 conversions.
791 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2 },
792 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2 },
793
794 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4 },
795 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4 },
796 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 3 },
797 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 3 },
798 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2 },
799 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2 },
800
801 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2 },
802 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2 },
803 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f32, .Cost: 4 },
804 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f32, .Cost: 4 },
805 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v16f32, .Cost: 8 },
806 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v16f32, .Cost: 8 }
807 };
808
809 if (SrcTy.isVector() && ST->hasNEON()) {
810 if (const auto *Entry = ConvertCostTableLookup(Table: NEONVectorConversionTbl, ISD,
811 Dst: DstTy.getSimpleVT(),
812 Src: SrcTy.getSimpleVT()))
813 return AdjustCost(Entry->Cost);
814 }
815
816 // Scalar float to integer conversions.
817 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
818 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i1, .Src: MVT::f32, .Cost: 2 },
819 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i1, .Src: MVT::f32, .Cost: 2 },
820 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i1, .Src: MVT::f64, .Cost: 2 },
821 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i1, .Src: MVT::f64, .Cost: 2 },
822 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i8, .Src: MVT::f32, .Cost: 2 },
823 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i8, .Src: MVT::f32, .Cost: 2 },
824 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i8, .Src: MVT::f64, .Cost: 2 },
825 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i8, .Src: MVT::f64, .Cost: 2 },
826 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i16, .Src: MVT::f32, .Cost: 2 },
827 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i16, .Src: MVT::f32, .Cost: 2 },
828 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i16, .Src: MVT::f64, .Cost: 2 },
829 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i16, .Src: MVT::f64, .Cost: 2 },
830 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i32, .Src: MVT::f32, .Cost: 2 },
831 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i32, .Src: MVT::f32, .Cost: 2 },
832 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i32, .Src: MVT::f64, .Cost: 2 },
833 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i32, .Src: MVT::f64, .Cost: 2 },
834 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: 10 },
835 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: 10 },
836 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: 10 },
837 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: 10 }
838 };
839 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
840 if (const auto *Entry = ConvertCostTableLookup(Table: NEONFloatConversionTbl, ISD,
841 Dst: DstTy.getSimpleVT(),
842 Src: SrcTy.getSimpleVT()))
843 return AdjustCost(Entry->Cost);
844 }
845
846 // Scalar integer to float conversions.
847 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
848 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i1, .Cost: 2 },
849 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i1, .Cost: 2 },
850 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i1, .Cost: 2 },
851 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i1, .Cost: 2 },
852 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i8, .Cost: 2 },
853 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i8, .Cost: 2 },
854 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i8, .Cost: 2 },
855 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i8, .Cost: 2 },
856 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i16, .Cost: 2 },
857 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i16, .Cost: 2 },
858 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i16, .Cost: 2 },
859 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i16, .Cost: 2 },
860 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i32, .Cost: 2 },
861 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i32, .Cost: 2 },
862 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i32, .Cost: 2 },
863 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i32, .Cost: 2 },
864 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i64, .Cost: 10 },
865 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i64, .Cost: 10 },
866 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i64, .Cost: 10 },
867 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i64, .Cost: 10 }
868 };
869
870 if (SrcTy.isInteger() && ST->hasNEON()) {
871 if (const auto *Entry = ConvertCostTableLookup(Table: NEONIntegerConversionTbl,
872 ISD, Dst: DstTy.getSimpleVT(),
873 Src: SrcTy.getSimpleVT()))
874 return AdjustCost(Entry->Cost);
875 }
876
  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant, and
  // sexts are linearised so take more.
880 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
881 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 1 },
882 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 1 },
883 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 2 },
884 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 2 },
885 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i8, .Cost: 10 },
886 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i8, .Cost: 2 },
887 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 1 },
888 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 1 },
889 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i16, .Cost: 10 },
890 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i16, .Cost: 2 },
891 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i32, .Cost: 8 },
892 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i32, .Cost: 2 },
893 };
894
895 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
896 if (const auto *Entry = ConvertCostTableLookup(Table: MVEVectorConversionTbl,
897 ISD, Dst: DstTy.getSimpleVT(),
898 Src: SrcTy.getSimpleVT()))
899 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
900 }
901
902 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
    // As a general rule, fp converts that were not matched above are
    // scalarized and cost 1 vcvt for each lane, so long as the instruction is
    // available. If not, it will become a series of function calls.
906 const InstructionCost CallCost =
907 getCallInstrCost(F: nullptr, RetTy: Dst, Tys: {Src}, CostKind);
908 int Lanes = 1;
909 if (SrcTy.isFixedLengthVector())
910 Lanes = SrcTy.getVectorNumElements();
911
912 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
913 return Lanes;
914 else
915 return Lanes * CallCost;
916 }
917
918 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
919 SrcTy.isFixedLengthVector()) {
    // Treat a truncate with a larger-than-legal source (128 bits for MVE) as
    // expensive: 2 instructions per lane.
922 if ((SrcTy.getScalarType() == MVT::i8 ||
923 SrcTy.getScalarType() == MVT::i16 ||
924 SrcTy.getScalarType() == MVT::i32) &&
925 SrcTy.getSizeInBits() > 128 &&
926 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
927 return SrcTy.getVectorNumElements() * 2;
928 }
929
930 // Scalar integer conversion costs.
931 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
932 // i16 -> i64 requires two dependent operations.
933 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::i64, .Src: MVT::i16, .Cost: 2 },
934
935 // Truncates on i64 are assumed to be free.
936 { .ISD: ISD::TRUNCATE, .Dst: MVT::i32, .Src: MVT::i64, .Cost: 0 },
937 { .ISD: ISD::TRUNCATE, .Dst: MVT::i16, .Src: MVT::i64, .Cost: 0 },
938 { .ISD: ISD::TRUNCATE, .Dst: MVT::i8, .Src: MVT::i64, .Cost: 0 },
939 { .ISD: ISD::TRUNCATE, .Dst: MVT::i1, .Src: MVT::i64, .Cost: 0 }
940 };
941
942 if (SrcTy.isInteger()) {
943 if (const auto *Entry = ConvertCostTableLookup(Table: ARMIntegerConversionTbl, ISD,
944 Dst: DstTy.getSimpleVT(),
945 Src: SrcTy.getSimpleVT()))
946 return AdjustCost(Entry->Cost);
947 }
948
949 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
950 ? ST->getMVEVectorCostFactor(CostKind)
951 : 1;
952 return AdjustCost(
953 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
954}
955
956InstructionCost ARMTTIImpl::getVectorInstrCost(
957 unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
958 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
  // Penalize inserting into a D-subregister. We end up with a three times
  // lower estimated throughput on Swift.
961 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
962 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
963 return 3;
964
965 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
966 Opcode == Instruction::ExtractElement)) {
967 // Cross-class copies are expensive on many microarchitectures,
968 // so assume they are expensive by default.
969 if (cast<VectorType>(Val: ValTy)->getElementType()->isIntegerTy())
970 return 3;
971
972 // Even if it's not a cross class copy, this likely leads to mixing
973 // of NEON and VFP code and should be therefore penalized.
974 if (ValTy->isVectorTy() &&
975 ValTy->getScalarSizeInBits() <= 32)
976 return std::max<InstructionCost>(
977 a: BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1,
978 VIC),
979 b: 2U);
980 }
981
982 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
983 Opcode == Instruction::ExtractElement)) {
    // Integer cross-lane moves are more expensive than float, which can
    // sometimes just be vmovs. Integers involve being passed to GPR
    // registers, causing more of a delay.
987 std::pair<InstructionCost, MVT> LT =
988 getTypeLegalizationCost(Ty: ValTy->getScalarType());
989 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
990 }
991
992 return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1,
993 VIC);
994}
995
996InstructionCost ARMTTIImpl::getCmpSelInstrCost(
997 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
998 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
999 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
1000 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1001
1002 // Thumb scalar code size cost for select.
1003 if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
1004 ST->isThumb() && !ValTy->isVectorTy()) {
1005 // Assume expensive structs.
1006 if (TLI->getValueType(DL, Ty: ValTy, AllowUnknown: true) == MVT::Other)
1007 return TTI::TCC_Expensive;
1008
1009 // Select costs can vary because they:
1010 // - may require one or more conditional mov (including an IT),
1011 // - can't operate directly on immediates,
1012 // - require live flags, which we can't copy around easily.
1013 InstructionCost Cost = getTypeLegalizationCost(Ty: ValTy).first;
1014
1015 // Possible IT instruction for Thumb2, or more for Thumb1.
1016 ++Cost;
1017
1018 // i1 values may need rematerialising by using mov immediates and/or
1019 // flag setting instructions.
1020 if (ValTy->isIntegerTy(Bitwidth: 1))
1021 ++Cost;
1022
1023 return Cost;
1024 }
1025
1026 // If this is a vector min/max/abs, use the cost of that intrinsic directly
1027 // instead. Hopefully when min/max intrinsics are more prevalent this code
1028 // will not be needed.
1029 const Instruction *Sel = I;
1030 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
1031 Sel->hasOneUse())
1032 Sel = cast<Instruction>(Val: Sel->user_back());
1033 if (Sel && ValTy->isVectorTy() &&
1034 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
1035 const Value *LHS, *RHS;
1036 SelectPatternFlavor SPF = matchSelectPattern(V: Sel, LHS, RHS).Flavor;
1037 unsigned IID = 0;
1038 switch (SPF) {
1039 case SPF_ABS:
1040 IID = Intrinsic::abs;
1041 break;
1042 case SPF_SMIN:
1043 IID = Intrinsic::smin;
1044 break;
1045 case SPF_SMAX:
1046 IID = Intrinsic::smax;
1047 break;
1048 case SPF_UMIN:
1049 IID = Intrinsic::umin;
1050 break;
1051 case SPF_UMAX:
1052 IID = Intrinsic::umax;
1053 break;
1054 case SPF_FMINNUM:
1055 IID = Intrinsic::minnum;
1056 break;
1057 case SPF_FMAXNUM:
1058 IID = Intrinsic::maxnum;
1059 break;
1060 default:
1061 break;
1062 }
1063 if (IID) {
1064 // The ICmp is free, the select gets the cost of the min/max/etc
1065 if (Sel != I)
1066 return 0;
1067 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
1068 return getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
1069 }
1070 }
1071
1072 // On NEON a vector select gets lowered to vbsl.
1073 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
1074 // Lowering of some vector selects is currently far from perfect.
1075 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
1076 { .ISD: ISD::SELECT, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: 4*4 + 1*2 + 1 },
1077 { .ISD: ISD::SELECT, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: 50 },
1078 { .ISD: ISD::SELECT, .Dst: MVT::v16i1, .Src: MVT::v16i64, .Cost: 100 }
1079 };
1080
1081 EVT SelCondTy = TLI->getValueType(DL, Ty: CondTy);
1082 EVT SelValTy = TLI->getValueType(DL, Ty: ValTy);
1083 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1084 if (const auto *Entry = ConvertCostTableLookup(Table: NEONVectorSelectTbl, ISD,
1085 Dst: SelCondTy.getSimpleVT(),
1086 Src: SelValTy.getSimpleVT()))
1087 return Entry->Cost;
1088 }
1089
1090 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1091 return LT.first;
1092 }
1093
1094 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1095 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1096 cast<FixedVectorType>(Val: ValTy)->getNumElements() > 1) {
1097 FixedVectorType *VecValTy = cast<FixedVectorType>(Val: ValTy);
1098 FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(Val: CondTy);
1099 if (!VecCondTy)
1100 VecCondTy = cast<FixedVectorType>(Val: CmpInst::makeCmpResultType(opnd_type: VecValTy));
1101
1102 // If we don't have mve.fp any fp operations will need to be scalarized.
1103 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
      // One scalarization insert, one scalarization extract and the cost of
      // the fcmps.
1106 return BaseT::getScalarizationOverhead(InTy: VecValTy, /*Insert*/ false,
1107 /*Extract*/ true, CostKind) +
1108 BaseT::getScalarizationOverhead(InTy: VecCondTy, /*Insert*/ true,
1109 /*Extract*/ false, CostKind) +
1110 VecValTy->getNumElements() *
1111 getCmpSelInstrCost(Opcode, ValTy: ValTy->getScalarType(),
1112 CondTy: VecCondTy->getScalarType(), VecPred,
1113 CostKind, Op1Info, Op2Info, I);
1114 }
1115
1116 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1117 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1118 // There are two types - the input that specifies the type of the compare
1119 // and the output vXi1 type. Because we don't know how the output will be
1120 // split, we may need an expensive shuffle to get two in sync. This has the
1121 // effect of making larger than legal compares (v8i32 for example)
1122 // expensive.
1123 if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1124 if (LT.first > 1)
1125 return LT.first * BaseCost +
1126 BaseT::getScalarizationOverhead(InTy: VecCondTy, /*Insert*/ true,
1127 /*Extract*/ false, CostKind);
1128 return BaseCost;
1129 }
1130 }
1131
1132 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1133 // for "multiple beats" potentially needed by MVE instructions.
1134 int BaseCost = 1;
1135 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1136 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1137
1138 return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred,
1139 CostKind, Op1Info, Op2Info, I);
1140}
1141
1142InstructionCost
1143ARMTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
1144 const SCEV *Ptr,
1145 TTI::TargetCostKind CostKind) const {
1146 // Address computations in vectorized code with non-consecutive addresses will
1147 // likely result in more instructions compared to scalar code where the
1148 // computation can more often be merged into the index mode. The resulting
1149 // extra micro-ops can significantly decrease throughput.
1150 unsigned NumVectorInstToHideOverhead = 10;
1151 int MaxMergeDistance = 64;
1152
1153 if (ST->hasNEON()) {
1154 if (PtrTy->isVectorTy() && SE &&
1155 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MergeDistance: MaxMergeDistance + 1))
1156 return NumVectorInstToHideOverhead;
1157
1158 // In many cases the address computation is not merged into the instruction
1159 // addressing mode.
1160 return 1;
1161 }
1162 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1163}
1164
1165bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) const {
1166 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: I)) {
1167 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1168 // optimized, else LSR may block tail-predication.
1169 switch (II->getIntrinsicID()) {
1170 case Intrinsic::arm_mve_vctp8:
1171 case Intrinsic::arm_mve_vctp16:
1172 case Intrinsic::arm_mve_vctp32:
1173 case Intrinsic::arm_mve_vctp64:
1174 return true;
1175 default:
1176 break;
1177 }
1178 }
1179 return false;
1180}
1181
1182bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment,
1183 unsigned /*AddressSpace*/,
1184 TTI::MaskKind /*MaskKind*/) const {
1185 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1186 return false;
1187
1188 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: DataTy)) {
1189 // Don't support v2i1 yet.
1190 if (VecTy->getNumElements() == 2)
1191 return false;
1192
1193 // We don't support extending fp types.
1194 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1195 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1196 return false;
1197 }
1198
1199 unsigned EltWidth = DataTy->getScalarSizeInBits();
1200 return (EltWidth == 32 && Alignment >= 4) ||
1201 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1202}
1203
1204bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const {
1205 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1206 return false;
1207
1208 unsigned EltWidth = Ty->getScalarSizeInBits();
1209 return ((EltWidth == 32 && Alignment >= 4) ||
1210 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1211}
1212
1213/// Given a memcpy/memset/memmove instruction, return the number of memory
1214/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1215/// call is used.
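/// For example (illustrative), a 16-byte memcpy between 4-byte-aligned
/// pointers may lower to four i32 load/store pairs, which is reported here as
/// 8 memory operations (4 types times a Factor of 2).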
1216int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1217 MemOp MOp;
1218 unsigned DstAddrSpace = ~0u;
1219 unsigned SrcAddrSpace = ~0u;
1220 const Function *F = I->getParent()->getParent();
1221
1222 if (const auto *MC = dyn_cast<MemTransferInst>(Val: I)) {
1223 ConstantInt *C = dyn_cast<ConstantInt>(Val: MC->getLength());
1224 // If 'size' is not a constant, a library call will be generated.
1225 if (!C)
1226 return -1;
1227
1228 const unsigned Size = C->getValue().getZExtValue();
1229 const Align DstAlign = MC->getDestAlign().valueOrOne();
1230 const Align SrcAlign = MC->getSourceAlign().valueOrOne();
1231
1232 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1233 /*IsVolatile*/ false);
1234 DstAddrSpace = MC->getDestAddressSpace();
1235 SrcAddrSpace = MC->getSourceAddressSpace();
  } else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1238 ConstantInt *C = dyn_cast<ConstantInt>(Val: MS->getLength());
1239 // If 'size' is not a constant, a library call will be generated.
1240 if (!C)
1241 return -1;
1242
1243 const unsigned Size = C->getValue().getZExtValue();
1244 const Align DstAlign = MS->getDestAlign().valueOrOne();
1245
1246 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1247 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1248 DstAddrSpace = MS->getDestAddressSpace();
  } else
    llvm_unreachable("Expected a memcpy/move or memset!");
1252
1253 unsigned Limit, Factor = 2;
  switch (I->getIntrinsicID()) {
1255 case Intrinsic::memcpy:
1256 Limit = TLI->getMaxStoresPerMemcpy(OptSize: F->hasMinSize());
1257 break;
1258 case Intrinsic::memmove:
1259 Limit = TLI->getMaxStoresPerMemmove(OptSize: F->hasMinSize());
1260 break;
1261 case Intrinsic::memset:
1262 Limit = TLI->getMaxStoresPerMemset(OptSize: F->hasMinSize());
1263 Factor = 1;
1264 break;
1265 default:
1266 llvm_unreachable("Expected a memcpy/move or memset!");
1267 }
1268
  // MemOps will be populated with a list of data types that need to be
  // loaded and stored. That's why we multiply the number of elements by 2 to
1271 // get the cost for this memcpy.
1272 std::vector<EVT> MemOps;
1273 LLVMContext &C = F->getContext();
1274 if (getTLI()->findOptimalMemOpLowering(Context&: C, MemOps, Limit, Op: MOp, DstAS: DstAddrSpace,
1275 SrcAS: SrcAddrSpace, FuncAttributes: F->getAttributes(),
1276 LargestVT: nullptr))
1277 return MemOps.size() * Factor;
1278
1279 // If we can't find an optimal memop lowering, return the default cost
1280 return -1;
1281}
1282
1283InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) const {
1284 int NumOps = getNumMemOps(I: cast<IntrinsicInst>(Val: I));
1285
1286 // To model the cost of a library call, we assume 1 for the call, and
1287 // 3 for the argument setup.
1288 if (NumOps == -1)
1289 return 4;
1290 return NumOps;
1291}
1292
1293InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1294 VectorType *DstTy, VectorType *SrcTy,
1295 ArrayRef<int> Mask,
1296 TTI::TargetCostKind CostKind,
1297 int Index, VectorType *SubTp,
1298 ArrayRef<const Value *> Args,
1299 const Instruction *CxtI) const {
1300 assert((Mask.empty() || DstTy->isScalableTy() ||
1301 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
1302 "Expected the Mask to match the return size if given");
1303 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
1304 "Expected the same scalar types");
1305
1306 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp);
1307 // Treat extractsubvector as single op permutation.
1308 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
1309 if (IsExtractSubvector)
1310 Kind = TTI::SK_PermuteSingleSrc;
1311 if (ST->hasNEON()) {
1312 if (Kind == TTI::SK_Broadcast) {
1313 static const CostTblEntry NEONDupTbl[] = {
1314 // VDUP handles these cases.
1315 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2i32, .Cost: 1},
1316 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2f32, .Cost: 1},
1317 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2i64, .Cost: 1},
1318 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2f64, .Cost: 1},
1319 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4i16, .Cost: 1},
1320 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v8i8, .Cost: 1},
1321
1322 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4i32, .Cost: 1},
1323 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4f32, .Cost: 1},
1324 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v8i16, .Cost: 1},
1325 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v16i8, .Cost: 1}};
1326
1327 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy);
1328 if (const auto *Entry =
1329 CostTableLookup(Table: NEONDupTbl, ISD: ISD::VECTOR_SHUFFLE, Ty: LT.second))
1330 return LT.first * Entry->Cost;
1331 }
1332 if (Kind == TTI::SK_Reverse) {
1333 static const CostTblEntry NEONShuffleTbl[] = {
          // Reverse shuffles cost one instruction if we are shuffling within
          // a double word (vrev) or two if we shuffle a quad word (vrev,
          // vext).
1336 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2i32, .Cost: 1},
1337 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2f32, .Cost: 1},
1338 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2i64, .Cost: 1},
1339 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2f64, .Cost: 1},
1340 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4i16, .Cost: 1},
1341 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v8i8, .Cost: 1},
1342
1343 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4i32, .Cost: 2},
1344 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4f32, .Cost: 2},
1345 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v8i16, .Cost: 2},
1346 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v16i8, .Cost: 2}};
1347
1348 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy);
1349 if (const auto *Entry =
1350 CostTableLookup(Table: NEONShuffleTbl, ISD: ISD::VECTOR_SHUFFLE, Ty: LT.second))
1351 return LT.first * Entry->Cost;
1352 }
1353 if (Kind == TTI::SK_Select) {
1354 static const CostTblEntry NEONSelShuffleTbl[] = {
          // Select shuffle cost table for ARM. Cost is the number of
          // instructions required to create the shuffled vector.
1358
1359 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2f32, .Cost: 1},
1360 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2i64, .Cost: 1},
1361 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2f64, .Cost: 1},
1362 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2i32, .Cost: 1},
1363
1364 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4i32, .Cost: 2},
1365 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4f32, .Cost: 2},
1366 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4i16, .Cost: 2},
1367
1368 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v8i16, .Cost: 16},
1369
1370 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v16i8, .Cost: 32}};
1371
1372 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy);
1373 if (const auto *Entry = CostTableLookup(Table: NEONSelShuffleTbl,
1374 ISD: ISD::VECTOR_SHUFFLE, Ty: LT.second))
1375 return LT.first * Entry->Cost;
1376 }
1377 }
1378 if (ST->hasMVEIntegerOps()) {
1379 if (Kind == TTI::SK_Broadcast) {
1380 static const CostTblEntry MVEDupTbl[] = {
1381 // VDUP handles these cases.
1382 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4i32, .Cost: 1},
1383 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v8i16, .Cost: 1},
1384 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v16i8, .Cost: 1},
1385 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4f32, .Cost: 1},
1386 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v8f16, .Cost: 1}};
1387
1388 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy);
1389 if (const auto *Entry = CostTableLookup(Table: MVEDupTbl, ISD: ISD::VECTOR_SHUFFLE,
1390 Ty: LT.second))
1391 return LT.first * Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
1392 }
1393
1394 if (!Mask.empty()) {
1395 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy);
1396 // Check for LD2/LD4 instructions, which are represented in llvm IR as
1397 // deinterleaving-shuffle(load). The shuffle cost could potentially be
1398 // free, but we model it with a cost of LT.first so that LD2/LD4 have a
1399 // higher cost than just the load.
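      // For example (illustrative IR), a factor-2 deinterleave of a wide load
      //   %wide = load <8 x i16>, ptr %p
      //   %even = shufflevector <8 x i16> %wide, <8 x i16> poison,
      //                         <4 x i32> <i32 0, i32 2, i32 4, i32 6>
      // can be selected as part of a VLD2, which is what the check below
      // recognises.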
1400 if (Args.size() >= 1 && isa<LoadInst>(Val: Args[0]) &&
1401 (LT.second.getScalarSizeInBits() == 8 ||
1402 LT.second.getScalarSizeInBits() == 16 ||
1403 LT.second.getScalarSizeInBits() == 32) &&
1404 LT.second.getSizeInBits() == 128 &&
1405 ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
1406 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 2)) ||
1407 (TLI->getMaxSupportedInterleaveFactor() == 4 &&
1408 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 4))))
1409 return ST->getMVEVectorCostFactor(CostKind) *
1410 std::max<InstructionCost>(a: 1, b: LT.first / 4);
1411
1412 // Check for ST2/ST4 instructions, which are represented in llvm IR as
1413 // store(interleaving-shuffle). The shuffle cost could potentially be
1414 // free, but we model it with a cost of LT.first so that ST2/ST4 have a
1415 // higher cost than just the store.
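      // For example (illustrative IR), a factor-2 interleave feeding a store
      //   %v = shufflevector <4 x i32> %a, <4 x i32> %b,
      //            <8 x i32> <i32 0, i32 4, i32 1, i32 5,
      //                       i32 2, i32 6, i32 3, i32 7>
      //   store <8 x i32> %v, ptr %p
      // can be selected as a VST2, which is what the check below recognises.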
1416 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(Val: *CxtI->user_begin()) &&
1417 (LT.second.getScalarSizeInBits() == 8 ||
1418 LT.second.getScalarSizeInBits() == 16 ||
1419 LT.second.getScalarSizeInBits() == 32) &&
1420 LT.second.getSizeInBits() == 128 &&
1421 ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
1422 ShuffleVectorInst::isInterleaveMask(
1423 Mask, Factor: 2, NumInputElts: SrcTy->getElementCount().getKnownMinValue() * 2)) ||
1424 (TLI->getMaxSupportedInterleaveFactor() == 4 &&
1425 ShuffleVectorInst::isInterleaveMask(
1426 Mask, Factor: 4, NumInputElts: SrcTy->getElementCount().getKnownMinValue() * 2))))
1427 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1428
1429 if (LT.second.isVector() &&
1430 Mask.size() <= LT.second.getVectorNumElements() &&
1431 (isVREVMask(M: Mask, VT: LT.second, BlockSize: 16) || isVREVMask(M: Mask, VT: LT.second, BlockSize: 32) ||
1432 isVREVMask(M: Mask, VT: LT.second, BlockSize: 64)))
1433 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1434 }
1435 }
1436
1437 // Restore optimal kind.
1438 if (IsExtractSubvector)
1439 Kind = TTI::SK_ExtractSubvector;
1440 int BaseCost = ST->hasMVEIntegerOps() && SrcTy->isVectorTy()
1441 ? ST->getMVEVectorCostFactor(CostKind)
1442 : 1;
1443 return BaseCost * BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind,
1444 Index, SubTp);
1445}
1446
1447InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1448 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1449 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1450 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
1451 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1452 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(Bitwidth: 1)) {
1453 // Make operations on i1 relatively expensive as this often involves
1454 // combining predicates. AND and XOR should be easier to handle with IT
1455 // blocks.
1456 switch (ISDOpcode) {
1457 default:
1458 break;
1459 case ISD::AND:
1460 case ISD::XOR:
1461 return 2;
1462 case ISD::OR:
1463 return 3;
1464 }
1465 }
1466
1467 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1468
1469 if (ST->hasNEON()) {
1470 const unsigned FunctionCallDivCost = 20;
1471 const unsigned ReciprocalDivCost = 10;
1472 static const CostTblEntry CostTbl[] = {
1473 // Division.
      // These costs are somewhat random. Choose a cost of 20 to indicate that
      // vectorizing division (added function call) is going to be very
      // expensive.
1476 // Double registers types.
1477 { .ISD: ISD::SDIV, .Type: MVT::v1i64, .Cost: 1 * FunctionCallDivCost},
1478 { .ISD: ISD::UDIV, .Type: MVT::v1i64, .Cost: 1 * FunctionCallDivCost},
1479 { .ISD: ISD::SREM, .Type: MVT::v1i64, .Cost: 1 * FunctionCallDivCost},
1480 { .ISD: ISD::UREM, .Type: MVT::v1i64, .Cost: 1 * FunctionCallDivCost},
1481 { .ISD: ISD::SDIV, .Type: MVT::v2i32, .Cost: 2 * FunctionCallDivCost},
1482 { .ISD: ISD::UDIV, .Type: MVT::v2i32, .Cost: 2 * FunctionCallDivCost},
1483 { .ISD: ISD::SREM, .Type: MVT::v2i32, .Cost: 2 * FunctionCallDivCost},
1484 { .ISD: ISD::UREM, .Type: MVT::v2i32, .Cost: 2 * FunctionCallDivCost},
1485 { .ISD: ISD::SDIV, .Type: MVT::v4i16, .Cost: ReciprocalDivCost},
1486 { .ISD: ISD::UDIV, .Type: MVT::v4i16, .Cost: ReciprocalDivCost},
1487 { .ISD: ISD::SREM, .Type: MVT::v4i16, .Cost: 4 * FunctionCallDivCost},
1488 { .ISD: ISD::UREM, .Type: MVT::v4i16, .Cost: 4 * FunctionCallDivCost},
1489 { .ISD: ISD::SDIV, .Type: MVT::v8i8, .Cost: ReciprocalDivCost},
1490 { .ISD: ISD::UDIV, .Type: MVT::v8i8, .Cost: ReciprocalDivCost},
1491 { .ISD: ISD::SREM, .Type: MVT::v8i8, .Cost: 8 * FunctionCallDivCost},
1492 { .ISD: ISD::UREM, .Type: MVT::v8i8, .Cost: 8 * FunctionCallDivCost},
1493 // Quad register types.
1494 { .ISD: ISD::SDIV, .Type: MVT::v2i64, .Cost: 2 * FunctionCallDivCost},
1495 { .ISD: ISD::UDIV, .Type: MVT::v2i64, .Cost: 2 * FunctionCallDivCost},
1496 { .ISD: ISD::SREM, .Type: MVT::v2i64, .Cost: 2 * FunctionCallDivCost},
1497 { .ISD: ISD::UREM, .Type: MVT::v2i64, .Cost: 2 * FunctionCallDivCost},
1498 { .ISD: ISD::SDIV, .Type: MVT::v4i32, .Cost: 4 * FunctionCallDivCost},
1499 { .ISD: ISD::UDIV, .Type: MVT::v4i32, .Cost: 4 * FunctionCallDivCost},
1500 { .ISD: ISD::SREM, .Type: MVT::v4i32, .Cost: 4 * FunctionCallDivCost},
1501 { .ISD: ISD::UREM, .Type: MVT::v4i32, .Cost: 4 * FunctionCallDivCost},
1502 { .ISD: ISD::SDIV, .Type: MVT::v8i16, .Cost: 8 * FunctionCallDivCost},
1503 { .ISD: ISD::UDIV, .Type: MVT::v8i16, .Cost: 8 * FunctionCallDivCost},
1504 { .ISD: ISD::SREM, .Type: MVT::v8i16, .Cost: 8 * FunctionCallDivCost},
1505 { .ISD: ISD::UREM, .Type: MVT::v8i16, .Cost: 8 * FunctionCallDivCost},
1506 { .ISD: ISD::SDIV, .Type: MVT::v16i8, .Cost: 16 * FunctionCallDivCost},
1507 { .ISD: ISD::UDIV, .Type: MVT::v16i8, .Cost: 16 * FunctionCallDivCost},
1508 { .ISD: ISD::SREM, .Type: MVT::v16i8, .Cost: 16 * FunctionCallDivCost},
1509 { .ISD: ISD::UREM, .Type: MVT::v16i8, .Cost: 16 * FunctionCallDivCost},
1510 // Multiplication.
1511 };
1512
1513 if (const auto *Entry = CostTableLookup(Table: CostTbl, ISD: ISDOpcode, Ty: LT.second))
1514 return LT.first * Entry->Cost;
1515
1516 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1517 Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
1518
1519 // This is somewhat of a hack. The problem that we are facing is that SROA
1520 // creates a sequence of shift, and, or instructions to construct values.
1521 // These sequences are recognized by the ISel and have zero-cost. Not so for
1522 // the vectorized code. Because we have support for v2i64 but not i64 those
1523 // sequences look particularly beneficial to vectorize.
1524 // To work around this we increase the cost of v2i64 operations to make them
1525 // seem less beneficial.
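    // For example (illustrative IR), SROA may build an i64 from two i32 parts:
    //   %lo64 = zext i32 %lo to i64
    //   %hi64 = zext i32 %hi to i64
    //   %hish = shl i64 %hi64, 32
    //   %val  = or i64 %hish, %lo64
    // which ISel folds away for scalar i64, but not for the vectorized v2i64
    // form, hence the extra cost added below.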
1526 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1527 Cost += 4;
1528
1529 return Cost;
1530 }
1531
1532 // If this operation is a shift on arm/thumb2, it might well be folded into
1533 // the following instruction, hence having a cost of 0.
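  // For example (illustrative), "add r0, r1, r2, lsl #2" folds the left shift
  // into the add, so a shl with a single such user costs nothing extra.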
1534 auto LooksLikeAFreeShift = [&]() {
1535 if (ST->isThumb1Only() || Ty->isVectorTy())
1536 return false;
1537
1538 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1539 return false;
1540 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1541 return false;
1542
1543 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1544 switch (cast<Instruction>(Val: CxtI->user_back())->getOpcode()) {
1545 case Instruction::Add:
1546 case Instruction::Sub:
1547 case Instruction::And:
1548 case Instruction::Xor:
1549 case Instruction::Or:
1550 case Instruction::ICmp:
1551 return true;
1552 default:
1553 return false;
1554 }
1555 };
1556 if (LooksLikeAFreeShift())
1557 return 0;
1558
  // When targets have both DSP and MVE, the compiler will attempt to
  // vectorize as well as to use scalar (S/U)MLAL operations. For the pattern
  // ext(mul(ext(i16), ext(i16))) we find that codegen performs better when
  // only using (S/U)MLAL scalar ops instead of trying to mix vector ops with
  // (S/U)MLAL ops. We therefore check if a mul instruction is used in a
  // (S/U)MLAL pattern.
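  // For example (illustrative IR), the scalar shape we look for is:
  //   %a32 = sext i16 %a to i32
  //   %b32 = sext i16 %b to i32
  //   %mul = mul i32 %a32, %b32
  //   %acc = sext i32 %mul to i64   ; every user of %mul is an extend
  // which maps onto SMULL/SMLAL-style scalar ops rather than vector code.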
1566 auto MulInDSPMLALPattern = [&](const Instruction *I, unsigned Opcode,
1567 Type *Ty) -> bool {
1568 if (!ST->hasDSP())
1569 return false;
1570
1571 if (!I)
1572 return false;
1573
1574 if (Opcode != Instruction::Mul)
1575 return false;
1576
1577 if (Ty->isVectorTy())
1578 return false;
1579
1580 auto ValueOpcodesEqual = [](const Value *LHS, const Value *RHS) -> bool {
1581 return cast<Instruction>(Val: LHS)->getOpcode() ==
1582 cast<Instruction>(Val: RHS)->getOpcode();
1583 };
1584 auto IsExtInst = [](const Value *V) -> bool {
1585 return isa<ZExtInst>(Val: V) || isa<SExtInst>(Val: V);
1586 };
1587 auto IsExtensionFromHalf = [](const Value *V) -> bool {
1588 return cast<Instruction>(Val: V)->getOperand(i: 0)->getType()->isIntegerTy(Bitwidth: 16);
1589 };
1590
1591 // We check the arguments of the instruction to see if they're extends
1592 auto *BinOp = dyn_cast<BinaryOperator>(Val: I);
1593 if (!BinOp)
1594 return false;
1595 Value *Op0 = BinOp->getOperand(i_nocapture: 0);
1596 Value *Op1 = BinOp->getOperand(i_nocapture: 1);
1597 if (IsExtInst(Op0) && IsExtInst(Op1) && ValueOpcodesEqual(Op0, Op1)) {
1598 // We're interested in an ext of an i16
1599 if (!I->getType()->isIntegerTy(Bitwidth: 32) || !IsExtensionFromHalf(Op0) ||
1600 !IsExtensionFromHalf(Op1))
1601 return false;
      // All uses of the result must themselves be extends (e.g. a sign or
      // zero extend to i64 feeding the (S/U)MLAL accumulate).
1604 for (auto *U : I->users())
1605 if (!IsExtInst(U))
1606 return false;
1607 return true;
1608 }
1609
1610 return false;
1611 };
1612
1613 if (MulInDSPMLALPattern(CxtI, Opcode, Ty))
1614 return 0;
1615
1616 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1617 // for "multiple beats" potentially needed by MVE instructions.
1618 int BaseCost = 1;
1619 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1620 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1621
  // The rest of this mostly follows what is done in
  // BaseT::getArithmeticInstrCost, without treating floats as more expensive
  // than scalars or increasing the costs for custom operations. The result is
  // also multiplied by the MVEVectorCostFactor where appropriate.
1626 if (TLI->isOperationLegalOrCustomOrPromote(Op: ISDOpcode, VT: LT.second))
1627 return LT.first * BaseCost;
1628
1629 // Else this is expand, assume that we need to scalarize this op.
1630 if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) {
1631 unsigned Num = VTy->getNumElements();
1632 InstructionCost Cost =
1633 getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind);
1634 // Return the cost of multiple scalar invocation plus the cost of
1635 // inserting and extracting the values.
1636 SmallVector<Type *> Tys(Args.size(), Ty);
1637 return BaseT::getScalarizationOverhead(RetTy: VTy, Args, Tys, CostKind) +
1638 Num * Cost;
1639 }
1640
1641 return BaseCost;
1642}
1643
1644InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1645 Align Alignment,
1646 unsigned AddressSpace,
1647 TTI::TargetCostKind CostKind,
1648 TTI::OperandValueInfo OpInfo,
1649 const Instruction *I) const {
1650 // TODO: Handle other cost kinds.
1651 if (CostKind != TTI::TCK_RecipThroughput)
1652 return 1;
1653
1654 // Type legalization can't handle structs
1655 if (TLI->getValueType(DL, Ty: Src, AllowUnknown: true) == MVT::Other)
1656 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1657 CostKind);
1658
1659 if (ST->hasNEON() && Src->isVectorTy() && Alignment != Align(16) &&
1660 cast<VectorType>(Val: Src)->getElementType()->isDoubleTy()) {
1661 // Unaligned loads/stores are extremely inefficient.
    // We need 4 uops for vst1/vld1 vs 1 uop for vldr/vstr.
1663 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Src);
1664 return LT.first * 4;
1665 }
1666
1667 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1668 // Same for stores.
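  // For example (illustrative IR):
  //   %l = load <4 x half>, ptr %p
  //   %e = fpext <4 x half> %l to <4 x float>
  // is costed as a single MVE memory operation here, with the extend folded
  // into the load rather than charged separately.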
1669 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Val: Src) && I &&
1670 ((Opcode == Instruction::Load && I->hasOneUse() &&
1671 isa<FPExtInst>(Val: *I->user_begin())) ||
1672 (Opcode == Instruction::Store && isa<FPTruncInst>(Val: I->getOperand(i: 0))))) {
1673 FixedVectorType *SrcVTy = cast<FixedVectorType>(Val: Src);
1674 Type *DstTy =
1675 Opcode == Instruction::Load
1676 ? (*I->user_begin())->getType()
1677 : cast<Instruction>(Val: I->getOperand(i: 0))->getOperand(i: 0)->getType();
1678 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1679 DstTy->getScalarType()->isFloatTy())
1680 return ST->getMVEVectorCostFactor(CostKind);
1681 }
1682
1683 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1684 ? ST->getMVEVectorCostFactor(CostKind)
1685 : 1;
1686 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1687 CostKind, OpInfo, I);
1688}
1689
1690InstructionCost
1691ARMTTIImpl::getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA,
1692 TTI::TargetCostKind CostKind) const {
1693 switch (MICA.getID()) {
1694 case Intrinsic::masked_scatter:
1695 case Intrinsic::masked_gather:
1696 return getGatherScatterOpCost(MICA, CostKind);
1697 case Intrinsic::masked_load:
1698 case Intrinsic::masked_store:
1699 return getMaskedMemoryOpCost(MICA, CostKind);
1700 }
1701 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
1702}
1703
1704InstructionCost
1705ARMTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
1706 TTI::TargetCostKind CostKind) const {
1707 unsigned IID = MICA.getID();
1708 Type *Src = MICA.getDataType();
1709 Align Alignment = MICA.getAlignment();
1710 unsigned AddressSpace = MICA.getAddressSpace();
1711 if (ST->hasMVEIntegerOps()) {
1712 if (IID == Intrinsic::masked_load &&
1713 isLegalMaskedLoad(DataTy: Src, Alignment, AddressSpace))
1714 return ST->getMVEVectorCostFactor(CostKind);
1715 if (IID == Intrinsic::masked_store &&
1716 isLegalMaskedStore(DataTy: Src, Alignment, AddressSpace))
1717 return ST->getMVEVectorCostFactor(CostKind);
1718 }
1719 if (!isa<FixedVectorType>(Val: Src))
1720 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
  // Scalar cost, which is currently very high due to the inefficiency of the
  // generated code.
1723 return cast<FixedVectorType>(Val: Src)->getNumElements() * 8;
1724}
1725
1726InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1727 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1728 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1729 bool UseMaskForCond, bool UseMaskForGaps) const {
1730 assert(Factor >= 2 && "Invalid interleave factor");
1731 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1732
1733 // vldN/vstN doesn't support vector types of i64/f64 element.
1734 bool EltIs64Bits = DL.getTypeSizeInBits(Ty: VecTy->getScalarType()) == 64;
1735
1736 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1737 !UseMaskForCond && !UseMaskForGaps) {
1738 unsigned NumElts = cast<FixedVectorType>(Val: VecTy)->getNumElements();
1739 auto *SubVecTy =
1740 FixedVectorType::get(ElementType: VecTy->getScalarType(), NumElts: NumElts / Factor);
1741
1742 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1743 // Accesses having vector types that are a multiple of 128 bits can be
1744 // matched to more than one vldN/vstN instruction.
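    // For example (illustrative), a factor-2 access with <8 x i32> sub-vectors
    // legalizes to two VLD2/VST2 pairs, so the check below costs it as
    // 2 (Factor) * BaseCost * 2 (accesses).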
1745 int BaseCost =
1746 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1747 if (NumElts % Factor == 0 &&
1748 TLI->isLegalInterleavedAccessType(Factor, VecTy: SubVecTy, Alignment, DL))
1749 return Factor * BaseCost * TLI->getNumInterleavedAccesses(VecTy: SubVecTy, DL);
1750
1751 // Some smaller than legal interleaved patterns are cheap as we can make
1752 // use of the vmovn or vrev patterns to interleave a standard load. This is
1753 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1754 // promoted differently). The cost of 2 here is then a load and vrev or
1755 // vmovn.
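    // For example (illustrative), a factor-2 access with v4i8 sub-vectors
    // (an <8 x i8> group) is costed as 2 * BaseCost: one load/store plus one
    // vrev/vmovn to split or merge the lanes.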
1756 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1757 VecTy->isIntOrIntVectorTy() &&
1758 DL.getTypeSizeInBits(Ty: SubVecTy).getFixedValue() <= 64)
1759 return 2 * BaseCost;
1760 }
1761
1762 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1763 Alignment, AddressSpace, CostKind,
1764 UseMaskForCond, UseMaskForGaps);
1765}
1766
1767InstructionCost
1768ARMTTIImpl::getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA,
1769 TTI::TargetCostKind CostKind) const {
1770
1771 Type *DataTy = MICA.getDataType();
1772 const Value *Ptr = MICA.getPointer();
1773 bool VariableMask = MICA.getVariableMask();
1774 Align Alignment = MICA.getAlignment();
1775 const Instruction *I = MICA.getInst();
1776
1777 using namespace PatternMatch;
1778 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1779 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
1780
1781 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1782 auto *VTy = cast<FixedVectorType>(Val: DataTy);
1783
1784 // TODO: Splitting, once we do that.
1785
1786 unsigned NumElems = VTy->getNumElements();
1787 unsigned EltSize = VTy->getScalarSizeInBits();
1788 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: DataTy);
1789
1790 // For now, it is assumed that for the MVE gather instructions the loads are
1791 // all effectively serialised. This means the cost is the scalar cost
1792 // multiplied by the number of elements being loaded. This is possibly very
1793 // conservative, but even so we still end up vectorising loops because the
1794 // cost per iteration for many loops is lower than for scalar loops.
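  // For example (illustrative), a gather of <4 x i32> is costed here as
  // roughly 4 * the MVE vector cost factor, i.e. four serialised lane loads.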
1795 InstructionCost VectorCost =
1796 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1797 // The scalarization cost should be a lot higher. We use the number of vector
1798 // elements plus the scalarization overhead. If masking is required then a lot
1799 // of little blocks will be needed and potentially a scalarized p0 mask,
1800 // greatly increasing the cost.
1801 InstructionCost ScalarCost =
1802 NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1803 BaseT::getScalarizationOverhead(InTy: VTy, /*Insert*/ true, /*Extract*/ false,
1804 CostKind) +
1805 BaseT::getScalarizationOverhead(InTy: VTy, /*Insert*/ false, /*Extract*/ true,
1806 CostKind);
1807
1808 if (EltSize < 8 || Alignment < EltSize / 8)
1809 return ScalarCost;
1810
1811 unsigned ExtSize = EltSize;
1812 // Check whether there's a single user that asks for an extended type
1813 if (I != nullptr) {
    // Depending on the caller of this function, a gather instruction will
    // either have opcode Instruction::Load or be a call to the masked_gather
    // intrinsic.
1817 if ((I->getOpcode() == Instruction::Load ||
1818 match(V: I, P: m_Intrinsic<Intrinsic::masked_gather>())) &&
1819 I->hasOneUse()) {
1820 const User *Us = *I->users().begin();
1821 if (isa<ZExtInst>(Val: Us) || isa<SExtInst>(Val: Us)) {
1822 // only allow valid type combinations
1823 unsigned TypeSize =
1824 cast<Instruction>(Val: Us)->getType()->getScalarSizeInBits();
1825 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1826 (TypeSize == 16 && EltSize == 8)) &&
1827 TypeSize * NumElems == 128) {
1828 ExtSize = TypeSize;
1829 }
1830 }
1831 }
1832 // Check whether the input data needs to be truncated
1833 TruncInst *T;
1834 if ((I->getOpcode() == Instruction::Store ||
1835 match(V: I, P: m_Intrinsic<Intrinsic::masked_scatter>())) &&
1836 (T = dyn_cast<TruncInst>(Val: I->getOperand(i: 0)))) {
1837 // Only allow valid type combinations
1838 unsigned TypeSize = T->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits();
1839 if (((EltSize == 16 && TypeSize == 32) ||
1840 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1841 TypeSize * NumElems == 128)
1842 ExtSize = TypeSize;
1843 }
1844 }
1845
1846 if (ExtSize * NumElems != 128 || NumElems < 4)
1847 return ScalarCost;
1848
1849 // Any (aligned) i32 gather will not need to be scalarised.
1850 if (ExtSize == 32)
1851 return VectorCost;
1852 // For smaller types, we need to ensure that the gep's inputs are correctly
1853 // extended from a small enough value. Other sizes (including i64) are
1854 // scalarized for now.
1855 if (ExtSize != 8 && ExtSize != 16)
1856 return ScalarCost;
1857
1858 if (const auto *BC = dyn_cast<BitCastInst>(Val: Ptr))
1859 Ptr = BC->getOperand(i_nocapture: 0);
1860 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr)) {
1861 if (GEP->getNumOperands() != 2)
1862 return ScalarCost;
1863 unsigned Scale = DL.getTypeAllocSize(Ty: GEP->getResultElementType());
1864 // Scale needs to be correct (which is only relevant for i16s).
1865 if (Scale != 1 && Scale * 8 != ExtSize)
1866 return ScalarCost;
1867 // And we need to zext (not sext) the indexes from a small enough type.
1868 if (const auto *ZExt = dyn_cast<ZExtInst>(Val: GEP->getOperand(i_nocapture: 1))) {
1869 if (ZExt->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() <= ExtSize)
1870 return VectorCost;
1871 }
1872 return ScalarCost;
1873 }
1874 return ScalarCost;
1875}
1876
1877InstructionCost
1878ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1879 std::optional<FastMathFlags> FMF,
1880 TTI::TargetCostKind CostKind) const {
1881
1882 EVT ValVT = TLI->getValueType(DL, Ty: ValTy);
1883 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1884 unsigned EltSize = ValVT.getScalarSizeInBits();
1885
1886 // In general floating point reductions are a series of elementwise
1887 // operations, with free extracts on each step. These are either in-order or
1888 // treewise depending on whether that is allowed by the fast math flags.
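  // For example (illustrative), with MVE a fast-math fadd reduction of
  // <8 x float> is modelled as one v4f32 add (halving 256 bits to 128) plus
  // four scalar fadds, with the lane extracts assumed to be free.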
1889 if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
1890 ((EltSize == 32 && ST->hasVFP2Base()) ||
1891 (EltSize == 64 && ST->hasFP64()) ||
1892 (EltSize == 16 && ST->hasFullFP16()))) {
1893 unsigned NumElts = cast<FixedVectorType>(Val: ValTy)->getNumElements();
1894 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1895 InstructionCost VecCost = 0;
1896 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(Value: NumElts) &&
1897 NumElts * EltSize > VecLimit) {
1898 Type *VecTy = FixedVectorType::get(ElementType: ValTy->getElementType(), NumElts: NumElts / 2);
1899 VecCost += getArithmeticInstrCost(Opcode, Ty: VecTy, CostKind);
1900 NumElts /= 2;
1901 }
1902
1903 // For fp16 we need to extract the upper lane elements. MVE can add a
1904 // VREV+FMIN/MAX to perform another vector step instead.
1905 InstructionCost ExtractCost = 0;
1906 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
1907 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
1908 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1909 NumElts /= 2;
1910 } else if (ValVT.getVectorElementType() == MVT::f16)
1911 ExtractCost = NumElts / 2;
1912
1913 return VecCost + ExtractCost +
1914 NumElts *
1915 getArithmeticInstrCost(Opcode, Ty: ValTy->getElementType(), CostKind);
1916 }
1917
1918 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
1919 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
1920 unsigned NumElts = cast<FixedVectorType>(Val: ValTy)->getNumElements();
1921 unsigned VecLimit =
1922 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1923 InstructionCost VecCost = 0;
1924 while (isPowerOf2_32(Value: NumElts) && NumElts * EltSize > VecLimit) {
1925 Type *VecTy = FixedVectorType::get(ElementType: ValTy->getElementType(), NumElts: NumElts / 2);
1926 VecCost += getArithmeticInstrCost(Opcode, Ty: VecTy, CostKind);
1927 NumElts /= 2;
1928 }
1929 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
1930 // step.
1931 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
1932 NumElts * EltSize == 64) {
1933 Type *VecTy = FixedVectorType::get(ElementType: ValTy->getElementType(), NumElts);
1934 VecCost += ST->getMVEVectorCostFactor(CostKind) +
1935 getArithmeticInstrCost(Opcode, Ty: VecTy, CostKind);
1936 NumElts /= 2;
1937 }
1938
1939 // From here we extract the elements and perform the and/or/xor.
1940 InstructionCost ExtractCost = NumElts;
1941 return VecCost + ExtractCost +
1942 (NumElts - 1) * getArithmeticInstrCost(
1943 Opcode, Ty: ValTy->getElementType(), CostKind);
1944 }
1945
1946 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
1947 TTI::requiresOrderedReduction(FMF))
1948 return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
1949
1950 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1951
1952 static const CostTblEntry CostTblAdd[]{
1953 {.ISD: ISD::ADD, .Type: MVT::v16i8, .Cost: 1},
1954 {.ISD: ISD::ADD, .Type: MVT::v8i16, .Cost: 1},
1955 {.ISD: ISD::ADD, .Type: MVT::v4i32, .Cost: 1},
1956 };
1957 if (const auto *Entry = CostTableLookup(Table: CostTblAdd, ISD, Ty: LT.second))
1958 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1959
1960 return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
1961}
1962
1963InstructionCost ARMTTIImpl::getExtendedReductionCost(
1964 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1965 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
1966 EVT ValVT = TLI->getValueType(DL, Ty: ValTy);
1967 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
1968
1969 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1970
1971 switch (ISD) {
1972 case ISD::ADD:
1973 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1974 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1975
1976 // The legal cases are:
1977 // VADDV u/s 8/16/32
1978 // VADDLV u/s 32
1979 // Codegen currently cannot always handle larger than legal vectors very
1980 // well, especially for predicated reductions where the mask needs to be
1981 // split, so restrict to 128bit or smaller input types.
1982 unsigned RevVTSize = ResVT.getSizeInBits();
1983 if (ValVT.getSizeInBits() <= 128 &&
1984 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1985 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1986 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1987 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1988 }
1989 break;
1990 default:
1991 break;
1992 }
1993 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: ValTy, FMF,
1994 CostKind);
1995}
1996
1997InstructionCost
1998ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
1999 Type *ResTy, VectorType *ValTy,
2000 TTI::TargetCostKind CostKind) const {
2001 if (RedOpcode != Instruction::Add)
2002 return InstructionCost::getInvalid(Val: CostKind);
2003 EVT ValVT = TLI->getValueType(DL, Ty: ValTy);
2004 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
2005
2006 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
2007 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
2008
2009 // The legal cases are:
2010 // VMLAV u/s 8/16/32
2011 // VMLALV u/s 16/32
2012 // Codegen currently cannot always handle larger than legal vectors very
2013 // well, especially for predicated reductions where the mask needs to be
2014 // split, so restrict to 128bit or smaller input types.
2015 unsigned RevVTSize = ResVT.getSizeInBits();
2016 if (ValVT.getSizeInBits() <= 128 &&
2017 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
2018 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
2019 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
2020 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
2021 }
2022
2023 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, Ty: ValTy,
2024 CostKind);
2025}
2026
2027InstructionCost
2028ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
2029 FastMathFlags FMF,
2030 TTI::TargetCostKind CostKind) const {
2031 EVT ValVT = TLI->getValueType(DL, Ty);
2032
2033 // In general floating point reductions are a series of elementwise
2034 // operations, with free extracts on each step. These are either in-order or
2035 // treewise depending on whether that is allowed by the fast math flags.
2036 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
2037 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
2038 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
2039 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
2040 unsigned NumElts = cast<FixedVectorType>(Val: Ty)->getNumElements();
2041 unsigned EltSize = ValVT.getScalarSizeInBits();
2042 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
2043 InstructionCost VecCost;
2044 while (isPowerOf2_32(Value: NumElts) && NumElts * EltSize > VecLimit) {
2045 Type *VecTy = FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumElts/2);
2046 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
2047 VecCost += getIntrinsicInstrCost(ICA, CostKind);
2048 NumElts /= 2;
2049 }
2050
2051 // For fp16 we need to extract the upper lane elements. MVE can add a
2052 // VREV+FMIN/MAX to perform another vector step instead.
2053 InstructionCost ExtractCost = 0;
2054 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
2055 NumElts == 8) {
2056 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
2057 NumElts /= 2;
2058 } else if (ValVT.getVectorElementType() == MVT::f16)
2059 ExtractCost = cast<FixedVectorType>(Val: Ty)->getNumElements() / 2;
2060
2061 IntrinsicCostAttributes ICA(IID, Ty->getElementType(),
2062 {Ty->getElementType(), Ty->getElementType()},
2063 FMF);
2064 return VecCost + ExtractCost +
2065 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
2066 }
2067
2068 if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
2069 IID == Intrinsic::umin || IID == Intrinsic::umax) {
2070 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2071
2072 // All costs are the same for u/s min/max. These lower to vminv, which are
2073 // given a slightly higher cost as they tend to take multiple cycles for
2074 // smaller type sizes.
2075 static const CostTblEntry CostTblAdd[]{
2076 {.ISD: ISD::SMIN, .Type: MVT::v16i8, .Cost: 4},
2077 {.ISD: ISD::SMIN, .Type: MVT::v8i16, .Cost: 3},
2078 {.ISD: ISD::SMIN, .Type: MVT::v4i32, .Cost: 2},
2079 };
2080 if (const auto *Entry = CostTableLookup(Table: CostTblAdd, ISD: ISD::SMIN, Ty: LT.second))
2081 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
2082 }
2083
2084 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2085}
2086
2087InstructionCost
2088ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2089 TTI::TargetCostKind CostKind) const {
2090 unsigned Opc = ICA.getID();
2091 switch (Opc) {
2092 case Intrinsic::get_active_lane_mask:
    // Currently we make a somewhat optimistic assumption that
    // active_lane_mask intrinsics are always free. In reality one may be
    // freely folded into a tail-predicated loop, expanded into a VCTP or
    // expanded into a lot of add/icmp code. We may need to improve this in
    // the future, but being able to detect whether it is free or not involves
    // looking at a lot of other code. We currently assume that the vectorizer
    // inserted these, and knew what it was doing in adding one.
2100 if (ST->hasMVEIntegerOps())
2101 return 0;
2102 break;
2103 case Intrinsic::sadd_sat:
2104 case Intrinsic::ssub_sat:
2105 case Intrinsic::uadd_sat:
2106 case Intrinsic::usub_sat: {
2107 bool IsAdd = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
2108 bool IsSigned = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
2109 Type *RetTy = ICA.getReturnType();
2110
2111 if (auto *ITy = dyn_cast<IntegerType>(Val: RetTy)) {
2112 if (IsSigned && ST->hasDSP() && ITy->getBitWidth() == 32)
2113 return 1; // qadd / qsub
2114 if (ST->hasDSP() && (ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16))
2115 return 2; // uqadd16 / qadd16 / uqsub16 / qsub16 + possible extend.
2116 // Otherwise return the cost of expanding the node. Generally an add +
2117 // icmp + sel.
2118 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2119 Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1);
2120 return getArithmeticInstrCost(Opcode: IsAdd ? Instruction::Add : Instruction::Sub,
2121 Ty: RetTy, CostKind) +
2122 2 * getCmpSelInstrCost(Opcode: BinaryOperator::ICmp, ValTy: RetTy, CondTy, VecPred: Pred,
2123 CostKind) +
2124 2 * getCmpSelInstrCost(Opcode: BinaryOperator::Select, ValTy: RetTy, CondTy, VecPred: Pred,
2125 CostKind);
2126 }
2127
2128 if (!ST->hasMVEIntegerOps())
2129 break;
2130
2131 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);
2132 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2133 LT.second == MVT::v16i8) {
      // This is a base cost of 1 for the vqadd, plus 3 extra shifts if we
      // need to extend the type, as it uses shr(qadd(shl, shl)).
2136 unsigned Instrs =
2137 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1
2138 : 4;
2139 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
2140 }
2141 break;
2142 }
2143 case Intrinsic::abs:
2144 case Intrinsic::smin:
2145 case Intrinsic::smax:
2146 case Intrinsic::umin:
2147 case Intrinsic::umax: {
2148 if (!ST->hasMVEIntegerOps())
2149 break;
2150 Type *VT = ICA.getReturnType();
2151
2152 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VT);
2153 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2154 LT.second == MVT::v16i8)
2155 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2156 break;
2157 }
2158 case Intrinsic::minnum:
2159 case Intrinsic::maxnum: {
2160 if (!ST->hasMVEFloatOps())
2161 break;
2162 Type *VT = ICA.getReturnType();
2163 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VT);
2164 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
2165 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2166 break;
2167 }
2168 case Intrinsic::fptosi_sat:
2169 case Intrinsic::fptoui_sat: {
2170 if (ICA.getArgTypes().empty())
2171 break;
2172 bool IsSigned = Opc == Intrinsic::fptosi_sat;
2173 auto LT = getTypeLegalizationCost(Ty: ICA.getArgTypes()[0]);
2174 EVT MTy = TLI->getValueType(DL, Ty: ICA.getReturnType());
    // Check for the legal types, with the correct subtarget features.
2176 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
2177 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
2178 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
2179 return LT.first;
2180
2181 // Equally for MVE vector types
2182 if (ST->hasMVEFloatOps() &&
2183 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
2184 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
2185 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2186
2187 // If we can we use a legal convert followed by a min+max
2188 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
2189 (ST->hasFP64() && LT.second == MVT::f64) ||
2190 (ST->hasFullFP16() && LT.second == MVT::f16) ||
2191 (ST->hasMVEFloatOps() &&
2192 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
2193 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
2194 Type *LegalTy = Type::getIntNTy(C&: ICA.getReturnType()->getContext(),
2195 N: LT.second.getScalarSizeInBits());
2196 InstructionCost Cost =
2197 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
2198 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
2199 : Intrinsic::umin,
2200 LegalTy, {LegalTy, LegalTy});
2201 Cost += getIntrinsicInstrCost(ICA: Attrs1, CostKind);
2202 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
2203 : Intrinsic::umax,
2204 LegalTy, {LegalTy, LegalTy});
2205 Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
2206 return LT.first * Cost;
2207 }
2208 // Otherwise we need to follow the default expansion that clamps the value
2209 // using a float min/max with a fcmp+sel for nan handling when signed.
2210 Type *FPTy = ICA.getArgTypes()[0];
2211 Type *RetTy = ICA.getReturnType();
2212 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
2213 InstructionCost Cost = getIntrinsicInstrCost(ICA: Attrs1, CostKind);
2214 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
2215 Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
2216 Cost +=
2217 getCastInstrCost(Opcode: IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
2218 Dst: RetTy, Src: FPTy, CCH: TTI::CastContextHint::None, CostKind);
2219 if (IsSigned) {
2220 Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1);
2221 Cost += getCmpSelInstrCost(Opcode: BinaryOperator::FCmp, ValTy: FPTy, CondTy,
2222 VecPred: CmpInst::FCMP_UNO, CostKind);
2223 Cost += getCmpSelInstrCost(Opcode: BinaryOperator::Select, ValTy: RetTy, CondTy,
2224 VecPred: CmpInst::FCMP_UNO, CostKind);
2225 }
2226 return Cost;
2227 }
2228 }
2229
2230 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
2231}
2232
2233bool ARMTTIImpl::isLoweredToCall(const Function *F) const {
2234 if (!F->isIntrinsic())
2235 return BaseT::isLoweredToCall(F);
2236
2237 // Assume all Arm-specific intrinsics map to an instruction.
2238 if (F->getName().starts_with(Prefix: "llvm.arm"))
2239 return false;
2240
2241 switch (F->getIntrinsicID()) {
2242 default: break;
2243 case Intrinsic::powi:
2244 case Intrinsic::sin:
2245 case Intrinsic::cos:
2246 case Intrinsic::sincos:
2247 case Intrinsic::pow:
2248 case Intrinsic::log:
2249 case Intrinsic::log10:
2250 case Intrinsic::log2:
2251 case Intrinsic::exp:
2252 case Intrinsic::exp2:
2253 return true;
2254 case Intrinsic::sqrt:
2255 case Intrinsic::fabs:
2256 case Intrinsic::copysign:
2257 case Intrinsic::floor:
2258 case Intrinsic::ceil:
2259 case Intrinsic::trunc:
2260 case Intrinsic::rint:
2261 case Intrinsic::nearbyint:
2262 case Intrinsic::round:
2263 case Intrinsic::canonicalize:
2264 case Intrinsic::lround:
2265 case Intrinsic::llround:
2266 case Intrinsic::lrint:
2267 case Intrinsic::llrint:
2268 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
2269 return true;
2270 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
2271 return true;
2272 // Some operations can be handled by vector instructions and assume
2273 // unsupported vectors will be expanded into supported scalar ones.
2274 // TODO Handle scalar operations properly.
2275 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
2276 case Intrinsic::masked_store:
2277 case Intrinsic::masked_load:
2278 case Intrinsic::masked_gather:
2279 case Intrinsic::masked_scatter:
2280 return !ST->hasMVEIntegerOps();
2281 case Intrinsic::sadd_with_overflow:
2282 case Intrinsic::uadd_with_overflow:
2283 case Intrinsic::ssub_with_overflow:
2284 case Intrinsic::usub_with_overflow:
2285 case Intrinsic::sadd_sat:
2286 case Intrinsic::uadd_sat:
2287 case Intrinsic::ssub_sat:
2288 case Intrinsic::usub_sat:
2289 return false;
2290 }
2291
2292 return BaseT::isLoweredToCall(F);
2293}
2294
2295bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) const {
2296 unsigned ISD = TLI->InstructionOpcodeToISD(Opcode: I.getOpcode());
2297 EVT VT = TLI->getValueType(DL, Ty: I.getType(), AllowUnknown: true);
2298 if (TLI->getOperationAction(Op: ISD, VT) == TargetLowering::LibCall)
2299 return true;
2300
2301 // Check if an intrinsic will be lowered to a call and assume that any
2302 // other CallInst will generate a bl.
2303 if (auto *Call = dyn_cast<CallInst>(Val: &I)) {
2304 if (auto *II = dyn_cast<IntrinsicInst>(Val: Call)) {
      switch (II->getIntrinsicID()) {
2306 case Intrinsic::memcpy:
2307 case Intrinsic::memset:
2308 case Intrinsic::memmove:
2309 return getNumMemOps(I: II) == -1;
2310 default:
2311 if (const Function *F = Call->getCalledFunction())
2312 return isLoweredToCall(F);
2313 }
2314 }
2315 return true;
2316 }
2317
2318 // FPv5 provides conversions between integer, double-precision,
2319 // single-precision, and half-precision formats.
2320 switch (I.getOpcode()) {
2321 default:
2322 break;
2323 case Instruction::FPToSI:
2324 case Instruction::FPToUI:
2325 case Instruction::SIToFP:
2326 case Instruction::UIToFP:
2327 case Instruction::FPTrunc:
2328 case Instruction::FPExt:
2329 return !ST->hasFPARMv8Base();
2330 }
2331
2332 // FIXME: Unfortunately the approach of checking the Operation Action does
2333 // not catch all cases of Legalization that use library calls. Our
2334 // Legalization step categorizes some transformations into library calls as
2335 // Custom, Expand or even Legal when doing type legalization. So for now
2336 // we have to special case for instance the SDIV of 64bit integers and the
2337 // use of floating point emulation.
2338 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
2339 switch (ISD) {
2340 default:
2341 break;
2342 case ISD::SDIV:
2343 case ISD::UDIV:
2344 case ISD::SREM:
2345 case ISD::UREM:
2346 case ISD::SDIVREM:
2347 case ISD::UDIVREM:
2348 return true;
2349 }
2350 }
2351
2352 // Assume all other non-float operations are supported.
2353 if (!VT.isFloatingPoint())
2354 return false;
2355
2356 // We'll need a library call to handle most floats when using soft.
2357 if (TLI->useSoftFloat()) {
2358 switch (I.getOpcode()) {
2359 default:
2360 return true;
2361 case Instruction::Alloca:
2362 case Instruction::Load:
2363 case Instruction::Store:
2364 case Instruction::Select:
2365 case Instruction::PHI:
2366 return false;
2367 }
2368 }
2369
2370 // We'll need a libcall to perform double precision operations on a single
2371 // precision only FPU.
2372 if (I.getType()->isDoubleTy() && !ST->hasFP64())
2373 return true;
2374
2375 // Likewise for half precision arithmetic.
2376 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2377 return true;
2378
2379 return false;
2380}
2381
2382bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
2383 AssumptionCache &AC,
2384 TargetLibraryInfo *LibInfo,
2385 HardwareLoopInfo &HWLoopInfo) const {
2386 // Low-overhead branches are only supported in the 'low-overhead branch'
2387 // extension of v8.1-m.
2388 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2389 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2390 return false;
2391 }
2392
2393 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2394 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2395 return false;
2396 }
2397
2398 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2399 if (isa<SCEVCouldNotCompute>(Val: BackedgeTakenCount)) {
2400 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2401 return false;
2402 }
2403
2404 const SCEV *TripCountSCEV =
2405 SE.getAddExpr(LHS: BackedgeTakenCount,
2406 RHS: SE.getOne(Ty: BackedgeTakenCount->getType()));
2407
2408 // We need to store the trip count in LR, a 32-bit register.
2409 if (SE.getUnsignedRangeMax(S: TripCountSCEV).getBitWidth() > 32) {
2410 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2411 return false;
2412 }
2413
2414 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2415 // point in generating a hardware loop if that's going to happen.
2416
2417 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2418 if (auto *Call = dyn_cast<IntrinsicInst>(Val: &I)) {
2419 switch (Call->getIntrinsicID()) {
2420 default:
2421 break;
2422 case Intrinsic::start_loop_iterations:
2423 case Intrinsic::test_start_loop_iterations:
2424 case Intrinsic::loop_decrement:
2425 case Intrinsic::loop_decrement_reg:
2426 return true;
2427 }
2428 }
2429 return false;
2430 };
2431
2432 // Scan the instructions to see if there's any that we know will turn into a
2433 // call or if this loop is already a low-overhead loop or will become a tail
2434 // predicated loop.
2435 bool IsTailPredLoop = false;
2436 auto ScanLoop = [&](Loop *L) {
2437 for (auto *BB : L->getBlocks()) {
2438 for (auto &I : *BB) {
2439 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2440 isa<InlineAsm>(Val: I)) {
2441 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2442 return false;
2443 }
2444 if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
2445 IsTailPredLoop |=
2446 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2447 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2448 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2449 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2450 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2451 }
2452 }
2453 return true;
2454 };
2455
2456 // Visit inner loops.
2457 for (auto *Inner : *L)
2458 if (!ScanLoop(Inner))
2459 return false;
2460
2461 if (!ScanLoop(L))
2462 return false;
2463
2464 // TODO: Check whether the trip count calculation is expensive. If L is the
2465 // inner loop but we know it has a low trip count, calculating that trip
2466 // count (in the parent loop) may be detrimental.
2467
2468 LLVMContext &C = L->getHeader()->getContext();
2469 HWLoopInfo.CounterInReg = true;
2470 HWLoopInfo.IsNestingLegal = false;
2471 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2472 HWLoopInfo.CountType = Type::getInt32Ty(C);
2473 HWLoopInfo.LoopDecrement = ConstantInt::get(Ty: HWLoopInfo.CountType, V: 1);
2474 return true;
2475}
2476
2477static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2478 // We don't allow icmp's, and because we only look at single block loops,
2479 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2480 if (isa<ICmpInst>(Val: &I) && ++ICmpCount > 1)
2481 return false;
2482 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2483 // not currently canonical, but soon will be. Code without them uses icmp, and
2484 // so is not tail predicated as per the condition above. In order to get the
2485 // same performance we treat min and max the same as an icmp for tailpred
  // purposes for the moment (we often rely on non-tailpred and higher VFs to
  // pick more optimal instructions like VQDMULH. They need to be recognized
2488 // directly by the vectorizer).
2489 if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
2490 if ((II->getIntrinsicID() == Intrinsic::smin ||
2491 II->getIntrinsicID() == Intrinsic::smax ||
2492 II->getIntrinsicID() == Intrinsic::umin ||
2493 II->getIntrinsicID() == Intrinsic::umax) &&
2494 ++ICmpCount > 1)
2495 return false;
2496
2497 if (isa<FCmpInst>(Val: &I))
2498 return false;
2499
2500 // We could allow extending/narrowing FP loads/stores, but codegen is
2501 // too inefficient so reject this for now.
2502 if (isa<FPExtInst>(Val: &I) || isa<FPTruncInst>(Val: &I))
2503 return false;
2504
2505 // Extends have to be extending-loads
2506 if (isa<SExtInst>(Val: &I) || isa<ZExtInst>(Val: &I) )
2507 if (!I.getOperand(i: 0)->hasOneUse() || !isa<LoadInst>(Val: I.getOperand(i: 0)))
2508 return false;
2509
2510 // Truncs have to be narrowing-stores
2511 if (isa<TruncInst>(Val: &I) )
2512 if (!I.hasOneUse() || !isa<StoreInst>(Val: *I.user_begin()))
2513 return false;
2514
2515 return true;
2516}
2517
2518// To set up a tail-predicated loop, we need to know the total number of
2519// elements processed by that loop. Thus, we need to determine the element
2520// size and:
2521// 1) it should be uniform for all operations in the vector loop, so we
2522// e.g. don't want any widening/narrowing operations.
2523// 2) it should be smaller than i64s because we don't have vector operations
2524// that work on i64s.
2525// 3) we don't want elements to be reversed or shuffled, to make sure the
2526// tail-predication masks/predicates the right lanes.
2527//
2528static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2529 const DataLayout &DL,
2530 const LoopAccessInfo *LAI,
2531 const DominatorTree &DT) {
2532 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2533
2534 // If there are live-out values, it is probably a reduction. We can predicate
2535 // most reduction operations freely under MVE using a combination of
2536 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2537 // floating point and integer reductions, but don't check for operators
2538 // specifically here. If the value ends up not being a reduction (and so the
2539 // vectorizer cannot tailfold the loop), we should fall back to standard
2540 // vectorization automatically.
2541 SmallVector< Instruction *, 8 > LiveOuts;
2542 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2543 bool ReductionsDisabled =
2544 EnableTailPredication == TailPredication::EnabledNoReductions ||
2545 EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2546
2547 for (auto *I : LiveOuts) {
2548 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2549 !I->getType()->isHalfTy()) {
2550 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2551 "live-out value\n");
2552 return false;
2553 }
2554 if (ReductionsDisabled) {
2555 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2556 return false;
2557 }
2558 }
2559
2560 // Next, check that all instructions can be tail-predicated.
2561 PredicatedScalarEvolution PSE = LAI->getPSE();
2562 int ICmpCount = 0;
2563
2564 for (BasicBlock *BB : L->blocks()) {
2565 for (Instruction &I : BB->instructionsWithoutDebug()) {
2566 if (isa<PHINode>(Val: &I))
2567 continue;
2568 if (!canTailPredicateInstruction(I, ICmpCount)) {
2569 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2570 return false;
2571 }
2572
2573 Type *T = I.getType();
2574 if (T->getScalarSizeInBits() > 32) {
2575 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2576 return false;
2577 }
2578 if (isa<StoreInst>(Val: I) || isa<LoadInst>(Val: I)) {
2579 Value *Ptr = getLoadStorePointerOperand(V: &I);
2580 Type *AccessTy = getLoadStoreType(I: &I);
2581 int64_t NextStride =
2582 getPtrStride(PSE, AccessTy, Ptr, Lp: L, DT).value_or(u: 0);
2583 if (NextStride == 1) {
2584 // TODO: for now only allow consecutive strides of 1. We could support
2585 // other strides as long as it is uniform, but let's keep it simple
2586 // for now.
2587 continue;
2588 } else if (NextStride == -1 ||
2589 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2590 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
          LLVM_DEBUG(dbgs()
                     << "Consecutive strides of 2 found, vld2/vst2 can't "
                        "be tail-predicated.\n");
2594 return false;
2595 // TODO: don't tail predicate if there is a reversed load?
2596 } else if (EnableMaskedGatherScatters) {
2597 // Gather/scatters do allow loading from arbitrary strides, at
2598 // least if they are loop invariant.
2599 // TODO: Loop variant strides should in theory work, too, but
2600 // this requires further testing.
2601 const SCEV *PtrScev = PSE.getSE()->getSCEV(V: Ptr);
2602 if (auto AR = dyn_cast<SCEVAddRecExpr>(Val: PtrScev)) {
2603 const SCEV *Step = AR->getStepRecurrence(SE&: *PSE.getSE());
2604 if (PSE.getSE()->isLoopInvariant(S: Step, L))
2605 continue;
2606 }
2607 }
        LLVM_DEBUG(dbgs() << "Bad stride found, can't "
                             "tail-predicate.\n");
2610 return false;
2611 }
2612 }
2613 }
2614
2615 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2616 return true;
2617}
2618
2619bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
2620 if (!EnableTailPredication) {
2621 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2622 return false;
2623 }
2624
2625 // Creating a predicated vector loop is the first step for generating a
2626 // tail-predicated hardware loop, for which we need the MVE masked
2627 // load/stores instructions:
2628 if (!ST->hasMVEIntegerOps())
2629 return false;
2630
2631 LoopVectorizationLegality *LVL = TFI->LVL;
2632 Loop *L = LVL->getLoop();
2633
2634 // For now, restrict this to single block loops.
2635 if (L->getNumBlocks() > 1) {
2636 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2637 "loop.\n");
2638 return false;
2639 }
2640
2641 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2642
2643 LoopInfo *LI = LVL->getLoopInfo();
2644 HardwareLoopInfo HWLoopInfo(L);
2645 if (!HWLoopInfo.canAnalyze(LI&: *LI)) {
2646 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2647 "analyzable.\n");
2648 return false;
2649 }
2650
2651 AssumptionCache *AC = LVL->getAssumptionCache();
2652 ScalarEvolution *SE = LVL->getScalarEvolution();
2653
2654 // This checks if we have the low-overhead branch architecture
2655 // extension, and if we will create a hardware-loop:
2656 if (!isHardwareLoopProfitable(L, SE&: *SE, AC&: *AC, LibInfo: TFI->TLI, HWLoopInfo)) {
2657 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2658 "profitable.\n");
2659 return false;
2660 }
2661
2662 DominatorTree *DT = LVL->getDominatorTree();
2663 if (!HWLoopInfo.isHardwareLoopCandidate(SE&: *SE, LI&: *LI, DT&: *DT)) {
2664 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2665 "a candidate.\n");
2666 return false;
2667 }
2668
2669 return canTailPredicateLoop(L, LI, SE&: *SE, DL, LAI: LVL->getLAI(),
2670 DT: *LVL->getDominatorTree());
2671}
2672
2673TailFoldingStyle
2674ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
2675 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2676 return TailFoldingStyle::DataWithoutLaneMask;
2677
2678 // Intrinsic @llvm.get.active.lane.mask is supported.
2679 // It is used in the MVETailPredication pass, which requires the number of
2680 // elements processed by this vector loop to setup the tail-predicated
2681 // loop.
2682 return TailFoldingStyle::Data;
}

void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2685 TTI::UnrollingPreferences &UP,
2686 OptimizationRemarkEmitter *ORE) const {
2687 // Enable upper-bound unrolling universally, provided that we do not see an
2688 // active lane mask, which is better kept as a loop to be tail-predicated
2689 // than conditionally unrolled.
2690 UP.UpperBound =
2691 !ST->hasMVEIntegerOps() || !any_of(Range&: *L->getHeader(), P: [](Instruction &I) {
2692 return isa<IntrinsicInst>(Val: I) &&
2693 cast<IntrinsicInst>(Val&: I).getIntrinsicID() ==
2694 Intrinsic::get_active_lane_mask;
2695 });
2696
2697 // Currently, only enable these preferences for M-Class cores.
2698 if (!ST->isMClass())
2699 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2700
2701 // Disable loop unrolling for Oz and Os.
2702 UP.OptSizeThreshold = 0;
2703 UP.PartialOptSizeThreshold = 0;
2704 if (L->getHeader()->getParent()->hasOptSize())
2705 return;
2706
2707 SmallVector<BasicBlock*, 4> ExitingBlocks;
2708 L->getExitingBlocks(ExitingBlocks);
2709 LLVM_DEBUG(dbgs() << "Loop has:\n"
2710 << "Blocks: " << L->getNumBlocks() << "\n"
2711 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2712
2713 // Allow at most one exit other than the latch. This acts as an early exit,
2714 // mirroring the profitability calculation of the runtime unroller.
2715 if (ExitingBlocks.size() > 2)
2716 return;
2717
2718 // Limit the CFG of the loop body for targets with a branch predictor.
2719 // Allowing 4 blocks permits if-then-else diamonds in the body.
2720 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2721 return;
2722
2723 // Don't unroll vectorized loops, including the remainder loop.
2724 if (getBooleanLoopAttribute(TheLoop: L, Name: "llvm.loop.isvectorized"))
2725 return;
2726
2727 // Scan the loop: don't unroll loops with calls as this could prevent
2728 // inlining.
2729 InstructionCost Cost = 0;
2730 for (auto *BB : L->getBlocks()) {
2731 for (auto &I : *BB) {
2732 // Don't unroll vectorized loops. MVE code does not benefit from unrolling
2733 // as much as scalar code does.
2734 if (I.getType()->isVectorTy())
2735 return;
2736
2737 if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I)) {
2738 if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction()) {
2739 if (!isLoweredToCall(F))
2740 continue;
2741 }
2742 return;
2743 }
2744
2745 SmallVector<const Value*, 4> Operands(I.operand_values());
2746 Cost += getInstructionCost(U: &I, Operands,
2747 CostKind: TargetTransformInfo::TCK_SizeAndLatency);
2748 }
2749 }
2750
2751 // On v6-M cores, very few registers are available, so we can easily end up
2752 // spilling and reloading more registers in an unrolled loop. Look at the
2753 // number of LCSSA phis as a rough measure of how many registers will need to
2754 // be live out of the loop, reducing the default unroll count if more than one
2755 // value is needed. In the long run, all of this should be learnt by a
2756 // machine.
2757 unsigned UnrollCount = 4;
2758 if (ST->isThumb1Only()) {
2759 unsigned ExitingValues = 0;
2760 SmallVector<BasicBlock *, 4> ExitBlocks;
2761 L->getExitBlocks(ExitBlocks);
2762 for (auto *Exit : ExitBlocks) {
2763 // Count the number of LCSSA phis. Exclude values coming from GEPs, as
2764 // only the last one is expected to be needed for address operands.
2765 unsigned LiveOuts = count_if(Range: Exit->phis(), P: [](auto &PH) {
2766 return PH.getNumOperands() != 1 ||
2767 !isa<GetElementPtrInst>(PH.getOperand(0));
2768 });
2769 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2770 }
2771 if (ExitingValues)
2772 UnrollCount /= ExitingValues;
2773 if (UnrollCount <= 1)
2774 return;
2775 }
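
// For example, two live-out values (LCSSA phis not fed by a single GEP)
// halve the default count from 4 to 2, and four or more live-outs make this
// hook return without enabling any unrolling at all.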
2776
2777 // For processors with low-overhead branching (LOB), runtime unrolling the
2778 // innermost loop is often detrimental to performance. In these cases the loop
2779 // remainder gets unrolled into a series of compare-and-jump blocks, which in
2780 // deeply nested loops get executed multiple times, negating the benefits of
2781 // LOB. This is particularly noticeable when the loop trip count of the
2782 // innermost loop varies within the outer loop, such as in the case of
2783 // triangular matrix decompositions. In these cases we prefer not to unroll
2784 // the innermost loop, with the intention of executing it as a low-overhead
2785 // loop.
2786 bool Runtime = true;
2787 if (ST->hasLOB()) {
2788 if (SE.hasLoopInvariantBackedgeTakenCount(L)) {
2789 const auto *BETC = SE.getBackedgeTakenCount(L);
2790 auto *Outer = L->getOutermostLoop();
2791 if ((L != Outer && Outer != L->getParentLoop()) ||
2792 (L != Outer && BETC && !SE.isLoopInvariant(S: BETC, L: Outer))) {
2793 Runtime = false;
2794 }
2795 }
2796 }
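
// An illustrative sketch of the triangular-nest case described above:
//
//   for (int i = 0; i < N; i++)
//     for (int j = 0; j < i; j++) // inner trip count varies with i
//       Sum += A[i][j];
//
// The inner loop's backedge-taken count is not invariant in the outer loop,
// so Runtime is set to false and the inner loop is left as a candidate for a
// low-overhead (DLS/LE) hardware loop rather than being runtime-unrolled.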
2797
2798 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2799 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2800
2801 UP.Partial = true;
2802 UP.Runtime = Runtime;
2803 UP.UnrollRemainder = true;
2804 UP.DefaultUnrollRuntimeCount = UnrollCount;
2805 UP.UnrollAndJam = true;
2806 UP.UnrollAndJamInnerLoopThreshold = 60;
2807
2808 // Force-unrolling small loops can be very useful because of the branch-taken
2809 // cost of the backedge.
2810 if (Cost < ArmForceUnrollThreshold)
2811 UP.Force = true;
2812}
2813
2814void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2815 TTI::PeelingPreferences &PP) const {
2816 BaseT::getPeelingPreferences(L, SE, PP);
2817}
2818
2819bool ARMTTIImpl::preferInLoopReduction(RecurKind Kind, Type *Ty) const {
2820 if (!ST->hasMVEIntegerOps())
2821 return false;
2822
2823 unsigned ScalarBits = Ty->getScalarSizeInBits();
2824 switch (Kind) {
2825 case RecurKind::Add:
2826 return ScalarBits <= 64;
2827 default:
2828 return false;
2829 }
2830}
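
// Note on preferInLoopReduction above: in-loop integer add reductions map
// naturally onto MVE's accumulating reductions (e.g. VADDV/VADDLV), which is
// why only adds with scalar sizes of up to 64 bits are preferred in-loop.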
2831
2832bool ARMTTIImpl::preferPredicatedReductionSelect() const {
2833 if (!ST->hasMVEIntegerOps())
2834 return false;
2835 return true;
2836}
2837
2838InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
2839 StackOffset BaseOffset,
2840 bool HasBaseReg, int64_t Scale,
2841 unsigned AddrSpace) const {
2842 TargetLoweringBase::AddrMode AM;
2843 AM.BaseGV = BaseGV;
2844 AM.BaseOffs = BaseOffset.getFixed();
2845 AM.HasBaseReg = HasBaseReg;
2846 AM.Scale = Scale;
2847 AM.ScalableOffset = BaseOffset.getScalable();
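// For instance, Scale == 4 with a base register roughly models an address
// like [r0, r1, lsl #2]; whether that form is legal, and whether it carries
// an extra cost (e.g. a negative scale on cores with FPAO), is decided below.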
2848 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AS: AddrSpace)) {
2849 if (ST->hasFPAO())
2850 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2851 return 0;
2852 }
2853 return InstructionCost::getInvalid();
2854}
2855
2856bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2857 if (Thumb) {
2858 // B.W is available in any Thumb2-supporting target, and also in every
2859 // version of Armv8-M, even Baseline which does not include the rest of
2860 // Thumb2.
2861 return ST->isThumb2() || ST->hasV8MBaselineOps();
2862 } else {
2863 // B is available in all versions of the Arm ISA, so the only question is
2864 // whether that ISA is available at all.
2865 return ST->hasARMOps();
2866 }
2867}
2868
2869/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
2870/// of the vector elements.
2871static bool areExtractExts(Value *Ext1, Value *Ext2) {
2872 using namespace PatternMatch;
2873
2874 auto areExtDoubled = [](Instruction *Ext) {
2875 return Ext->getType()->getScalarSizeInBits() ==
2876 2 * Ext->getOperand(i: 0)->getType()->getScalarSizeInBits();
2877 };
2878
2879 if (!match(V: Ext1, P: m_ZExtOrSExt(Op: m_Value())) ||
2880 !match(V: Ext2, P: m_ZExtOrSExt(Op: m_Value())) ||
2881 !areExtDoubled(cast<Instruction>(Val: Ext1)) ||
2882 !areExtDoubled(cast<Instruction>(Val: Ext2)))
2883 return false;
2884
2885 return true;
2886}
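
// For illustration, the helper above matches operand pairs such as
//
//   %sa = sext <8 x i8> %a to <8 x i16>
//   %sb = sext <8 x i8> %b to <8 x i16>
//   %r  = sub <8 x i16> %sa, %sb
//
// where both extends double the element width, so that once they are sunk
// next to the sub they can fold into a single widening NEON instruction
// (vsubl.s8 in this sketch).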
2887
2888/// Check if sinking \p I's operands to I's basic block is profitable, because
2889/// the operands can be folded into a target instruction, e.g.
2890/// sext/zext can be folded into vsubl.
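/// For the MVE path below, the typical candidate is a splat defined in
/// another block, e.g. (illustrative IR):
///
///   %ins = insertelement <4 x i32> undef, i32 %x, i64 0
///   %splat = shufflevector <4 x i32> %ins, <4 x i32> undef,
///                          <4 x i32> zeroinitializer
///   ...
///   %r = add <4 x i32> %v, %splat
///
/// Sinking the splat next to the add lets it be selected as the scalar
/// operand of a vector-by-scalar MVE instruction such as vadd.i32 q0, q1, r0.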
2891bool ARMTTIImpl::isProfitableToSinkOperands(Instruction *I,
2892 SmallVectorImpl<Use *> &Ops) const {
2893 using namespace PatternMatch;
2894
2895 if (!I->getType()->isVectorTy())
2896 return false;
2897
2898 if (ST->hasNEON()) {
2899 switch (I->getOpcode()) {
2900 case Instruction::Sub:
2901 case Instruction::Add: {
2902 if (!areExtractExts(Ext1: I->getOperand(i: 0), Ext2: I->getOperand(i: 1)))
2903 return false;
2904 Ops.push_back(Elt: &I->getOperandUse(i: 0));
2905 Ops.push_back(Elt: &I->getOperandUse(i: 1));
2906 return true;
2907 }
2908 default:
2909 return false;
2910 }
2911 }
2912
2913 if (!ST->hasMVEIntegerOps())
2914 return false;
2915
2916 auto IsFMSMul = [&](Instruction *I) {
2917 if (!I->hasOneUse())
2918 return false;
2919 auto *Sub = cast<Instruction>(Val: *I->users().begin());
2920 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(i: 1) == I;
2921 };
2922 auto IsFMS = [&](Instruction *I) {
2923 if (match(V: I->getOperand(i: 0), P: m_FNeg(X: m_Value())) ||
2924 match(V: I->getOperand(i: 1), P: m_FNeg(X: m_Value())))
2925 return true;
2926 return false;
2927 };
2928
2929 auto IsSinker = [&](Instruction *I, int Operand) {
2930 switch (I->getOpcode()) {
2931 case Instruction::Add:
2932 case Instruction::Mul:
2933 case Instruction::FAdd:
2934 case Instruction::ICmp:
2935 case Instruction::FCmp:
2936 return true;
2937 case Instruction::FMul:
2938 return !IsFMSMul(I);
2939 case Instruction::Sub:
2940 case Instruction::FSub:
2941 case Instruction::Shl:
2942 case Instruction::LShr:
2943 case Instruction::AShr:
2944 return Operand == 1;
2945 case Instruction::Call:
2946 if (auto *II = dyn_cast<IntrinsicInst>(Val: I)) {
2947 switch (II->getIntrinsicID()) {
2948 case Intrinsic::fma:
2949 return !IsFMS(I);
2950 case Intrinsic::sadd_sat:
2951 case Intrinsic::uadd_sat:
2952 case Intrinsic::arm_mve_add_predicated:
2953 case Intrinsic::arm_mve_mul_predicated:
2954 case Intrinsic::arm_mve_qadd_predicated:
2955 case Intrinsic::arm_mve_vhadd:
2956 case Intrinsic::arm_mve_hadd_predicated:
2957 case Intrinsic::arm_mve_vqdmull:
2958 case Intrinsic::arm_mve_vqdmull_predicated:
2959 case Intrinsic::arm_mve_vqdmulh:
2960 case Intrinsic::arm_mve_qdmulh_predicated:
2961 case Intrinsic::arm_mve_vqrdmulh:
2962 case Intrinsic::arm_mve_qrdmulh_predicated:
2963 case Intrinsic::arm_mve_fma_predicated:
2964 return true;
2965 case Intrinsic::ssub_sat:
2966 case Intrinsic::usub_sat:
2967 case Intrinsic::arm_mve_sub_predicated:
2968 case Intrinsic::arm_mve_qsub_predicated:
2969 case Intrinsic::arm_mve_hsub_predicated:
2970 case Intrinsic::arm_mve_vhsub:
2971 return Operand == 1;
2972 default:
2973 return false;
2974 }
2975 }
2976 return false;
2977 default:
2978 return false;
2979 }
2980 };
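
// For the non-commutative cases above (sub, the shifts, and the
// saturating/halving subtracts), only the second operand can be sunk: the MVE
// vector-by-scalar forms take the scalar in the second source operand, as in
// vsub.i32 q0, q1, r2.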
2981
2982 for (auto OpIdx : enumerate(First: I->operands())) {
2983 Instruction *Op = dyn_cast<Instruction>(Val: OpIdx.value().get());
2984 // Make sure we are not already sinking this operand.
2985 if (!Op || any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op; }))
2986 continue;
2987
2988 Instruction *Shuffle = Op;
2989 if (Shuffle->getOpcode() == Instruction::BitCast)
2990 Shuffle = dyn_cast<Instruction>(Val: Shuffle->getOperand(i: 0));
2991 // We are looking for a splat that can be sunk.
2992 if (!Shuffle || !match(V: Shuffle, P: m_Shuffle(v1: m_InsertElt(Val: m_Undef(), Elt: m_Value(),
2993 Idx: m_ZeroInt()),
2994 v2: m_Undef(), mask: m_ZeroMask())))
2995 continue;
2996 if (!IsSinker(I, OpIdx.index()))
2997 continue;
2998
2999 // All uses of the shuffle should be sunk to avoid duplicating it across GPR
3000 // and vector registers.
3001 for (Use &U : Op->uses()) {
3002 Instruction *Insn = cast<Instruction>(Val: U.getUser());
3003 if (!IsSinker(Insn, U.getOperandNo()))
3004 return false;
3005 }
3006
3007 Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0));
3008 if (Shuffle != Op)
3009 Ops.push_back(Elt: &Op->getOperandUse(i: 0));
3010 Ops.push_back(Elt: &OpIdx.value());
3011 }
3012 return true;
3013}
3014
3015unsigned ARMTTIImpl::getNumBytesToPadGlobalArray(unsigned Size,
3016 Type *ArrayType) const {
3017 if (!UseWidenGlobalArrays) {
3018 LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
3019 return 0;
3020 }
3021
3022 // Don't modify non-integer array types.
3023 if (!ArrayType || !ArrayType->isArrayTy() ||
3024 !ArrayType->getArrayElementType()->isIntegerTy())
3025 return 0;
3026
3027 // We pad to 4-byte boundaries.
3028 if (Size % 4 == 0)
3029 return 0;
3030
3031 unsigned NumBytesToPad = 4 - (Size % 4);
3032 unsigned NewSize = Size + NumBytesToPad;
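
// Worked example: an 11-byte string has Size % 4 == 3, so NumBytesToPad is 1
// and NewSize is 12. If NewSize exceeded the memcpy inlining threshold
// queried below, we would return 0 and leave the array unpadded.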
3033
3034 // Max number of bytes that memcpy allows for lowering to loads/stores before
3035 // it falls back to a library function (__aeabi_memcpy).
3036 unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold();
3037
3038 if (NewSize > MaxMemIntrinsicSize)
3039 return 0;
3040
3041 return NumBytesToPad;
3042}
3043