//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ARMTargetTransformInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/TargetParser/SubtargetFeature.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <optional>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "armtti"

static cl::opt<bool> EnableMaskedLoadStores(
    "enable-arm-maskedldst", cl::Hidden, cl::init(true),
    cl::desc("Enable the generation of masked loads and stores"));

static cl::opt<bool> DisableLowOverheadLoops(
    "disable-arm-loloops", cl::Hidden, cl::init(false),
    cl::desc("Disable the generation of low-overhead loops"));

static cl::opt<bool>
    AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
                  cl::desc("Enable the generation of WLS loops"));

extern cl::opt<TailPredication::Mode> EnableTailPredication;

extern cl::opt<bool> EnableMaskedGatherScatters;

extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;

/// Convert a vector load intrinsic into a simple llvm load instruction.
/// This is beneficial when the underlying object being addressed comes
/// from a constant, since we get constant-folding for free.
static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
                               InstCombiner::BuilderTy &Builder) {
  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));

  if (!IntrAlign)
    return nullptr;

  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
                           ? MemAlign
                           : IntrAlign->getLimitedValue();

  if (!isPowerOf2_32(Alignment))
    return nullptr;

  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
                                          PointerType::get(II.getType(), 0));
  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
}

bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // To inline a callee, all features not in the allowed list must match
  // exactly.
  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
                    (CalleeBits & ~InlineFeaturesAllowed);
  // For features in the allowed list, the callee's features must be a subset
  // of the caller's.
  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
                     (CalleeBits & InlineFeaturesAllowed);
  return MatchExact && MatchSubset;
}
104
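// Prefer postindexed addressing when MVE is available, and preindexed
// addressing for simple single-block loops on Thumb2 M-class cores; otherwise
// express no preference.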
105TTI::AddressingModeKind
106ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
107 ScalarEvolution *SE) const {
108 if (ST->hasMVEIntegerOps())
109 return TTI::AMK_PostIndexed;
110
111 if (L->getHeader()->getParent()->hasOptSize())
112 return TTI::AMK_None;
113
114 if (ST->isMClass() && ST->isThumb2() &&
115 L->getNumBlocks() == 1)
116 return TTI::AMK_PreIndexed;
117
118 return TTI::AMK_None;
119}
120
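// Target-specific instcombine folds for ARM intrinsics (NEON vld/vst
// alignment, MVE predicate conversions, VADC carry bits and VMLDAVA chains).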
121std::optional<Instruction *>
122ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
123 using namespace PatternMatch;
124 Intrinsic::ID IID = II.getIntrinsicID();
125 switch (IID) {
126 default:
127 break;
128 case Intrinsic::arm_neon_vld1: {
129 Align MemAlign =
130 getKnownAlignment(V: II.getArgOperand(i: 0), DL: IC.getDataLayout(), CxtI: &II,
131 AC: &IC.getAssumptionCache(), DT: &IC.getDominatorTree());
132 if (Value *V = simplifyNeonVld1(II, MemAlign: MemAlign.value(), Builder&: IC.Builder)) {
133 return IC.replaceInstUsesWith(I&: II, V);
134 }
135 break;
136 }
137
138 case Intrinsic::arm_neon_vld2:
139 case Intrinsic::arm_neon_vld3:
140 case Intrinsic::arm_neon_vld4:
141 case Intrinsic::arm_neon_vld2lane:
142 case Intrinsic::arm_neon_vld3lane:
143 case Intrinsic::arm_neon_vld4lane:
144 case Intrinsic::arm_neon_vst1:
145 case Intrinsic::arm_neon_vst2:
146 case Intrinsic::arm_neon_vst3:
147 case Intrinsic::arm_neon_vst4:
148 case Intrinsic::arm_neon_vst2lane:
149 case Intrinsic::arm_neon_vst3lane:
150 case Intrinsic::arm_neon_vst4lane: {
151 Align MemAlign =
152 getKnownAlignment(V: II.getArgOperand(i: 0), DL: IC.getDataLayout(), CxtI: &II,
153 AC: &IC.getAssumptionCache(), DT: &IC.getDominatorTree());
154 unsigned AlignArg = II.arg_size() - 1;
155 Value *AlignArgOp = II.getArgOperand(i: AlignArg);
156 MaybeAlign Align = cast<ConstantInt>(Val: AlignArgOp)->getMaybeAlignValue();
157 if (Align && *Align < MemAlign) {
158 return IC.replaceOperand(
159 I&: II, OpNum: AlignArg,
160 V: ConstantInt::get(Ty: Type::getInt32Ty(C&: II.getContext()), V: MemAlign.value(),
161 IsSigned: false));
162 }
163 break;
164 }
165
166 case Intrinsic::arm_mve_pred_i2v: {
167 Value *Arg = II.getArgOperand(i: 0);
168 Value *ArgArg;
169 if (match(V: Arg, P: PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
170 Op0: PatternMatch::m_Value(V&: ArgArg))) &&
171 II.getType() == ArgArg->getType()) {
172 return IC.replaceInstUsesWith(I&: II, V: ArgArg);
173 }
174 Constant *XorMask;
175 if (match(V: Arg, P: m_Xor(L: PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
176 Op0: PatternMatch::m_Value(V&: ArgArg)),
177 R: PatternMatch::m_Constant(C&: XorMask))) &&
178 II.getType() == ArgArg->getType()) {
179 if (auto *CI = dyn_cast<ConstantInt>(Val: XorMask)) {
180 if (CI->getValue().trunc(width: 16).isAllOnes()) {
181 auto TrueVector = IC.Builder.CreateVectorSplat(
182 NumElts: cast<FixedVectorType>(Val: II.getType())->getNumElements(),
183 V: IC.Builder.getTrue());
184 return BinaryOperator::Create(Op: Instruction::Xor, S1: ArgArg, S2: TrueVector);
185 }
186 }
187 }
188 KnownBits ScalarKnown(32);
189 if (IC.SimplifyDemandedBits(I: &II, OpNo: 0, DemandedMask: APInt::getLowBitsSet(numBits: 32, loBitsSet: 16),
190 Known&: ScalarKnown)) {
191 return &II;
192 }
193 break;
194 }
195 case Intrinsic::arm_mve_pred_v2i: {
196 Value *Arg = II.getArgOperand(i: 0);
197 Value *ArgArg;
198 if (match(V: Arg, P: PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
199 Op0: PatternMatch::m_Value(V&: ArgArg)))) {
200 return IC.replaceInstUsesWith(I&: II, V: ArgArg);
201 }
202
203 if (II.getMetadata(KindID: LLVMContext::MD_range))
204 break;
205
206 ConstantRange Range(APInt(32, 0), APInt(32, 0x10000));
207
208 if (auto CurrentRange = II.getRange()) {
209 Range = Range.intersectWith(CR: *CurrentRange);
210 if (Range == CurrentRange)
211 break;
212 }
213
214 II.addRangeRetAttr(CR: Range);
215 II.addRetAttr(Kind: Attribute::NoUndef);
216 return &II;
217 }
218 case Intrinsic::arm_mve_vadc:
219 case Intrinsic::arm_mve_vadc_predicated: {
220 unsigned CarryOp =
221 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
222 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
223 "Bad type for intrinsic!");
224
225 KnownBits CarryKnown(32);
226 if (IC.SimplifyDemandedBits(I: &II, OpNo: CarryOp, DemandedMask: APInt::getOneBitSet(numBits: 32, BitNo: 29),
227 Known&: CarryKnown)) {
228 return &II;
229 }
230 break;
231 }
232 case Intrinsic::arm_mve_vmldava: {
233 Instruction *I = cast<Instruction>(Val: &II);
234 if (I->hasOneUse()) {
235 auto *User = cast<Instruction>(Val: *I->user_begin());
236 Value *OpZ;
237 if (match(V: User, P: m_c_Add(L: m_Specific(V: I), R: m_Value(V&: OpZ))) &&
238 match(V: I->getOperand(i: 3), P: m_Zero())) {
239 Value *OpX = I->getOperand(i: 4);
240 Value *OpY = I->getOperand(i: 5);
241 Type *OpTy = OpX->getType();
242
243 IC.Builder.SetInsertPoint(User);
244 Value *V =
245 IC.Builder.CreateIntrinsic(ID: Intrinsic::arm_mve_vmldava, Types: {OpTy},
246 Args: {I->getOperand(i: 0), I->getOperand(i: 1),
247 I->getOperand(i: 2), OpZ, OpX, OpY});
248
249 IC.replaceInstUsesWith(I&: *User, V);
250 return IC.eraseInstFromFunction(I&: *User);
251 }
252 }
253 return std::nullopt;
254 }
255 }
256 return std::nullopt;
257}
258
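// Simplify the demanded vector elements of the MVE narrowing intrinsics
// (vcvt_narrow, vqmovn, vshrn), which only demand the odd or even lanes of
// their first operand.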
259std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
260 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
261 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
262 std::function<void(Instruction *, unsigned, APInt, APInt &)>
263 SimplifyAndSetOp) const {
264
  // Compute the demanded bits for a narrowing MVE intrinsic. TopOpc is the
  // index of the operand that selects a top or bottom instruction, which can
  // change between instructions.
  auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
    unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
    unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();

    // Only the odd or even lanes of operand 0 will be demanded, depending on
    // whether this is a top or bottom instruction.
    APInt DemandedElts =
        APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                       : APInt::getHighBitsSet(2, 1));
    SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
    // The other lanes will be defined from the inserted elements.
    UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                                : APInt::getHighBitsSet(2, 1));
    return std::nullopt;
  };
283
284 switch (II.getIntrinsicID()) {
285 default:
286 break;
287 case Intrinsic::arm_mve_vcvt_narrow:
288 SimplifyNarrowInstrTopBottom(2);
289 break;
290 case Intrinsic::arm_mve_vqmovn:
291 SimplifyNarrowInstrTopBottom(4);
292 break;
293 case Intrinsic::arm_mve_vshrn:
294 SimplifyNarrowInstrTopBottom(7);
295 break;
296 }
297
298 return std::nullopt;
299}
300
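// Cost of materializing an integer immediate, based on which ARM, Thumb2 or
// Thumb1 encodings can represent it.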
301InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
302 TTI::TargetCostKind CostKind) {
303 assert(Ty->isIntegerTy());
304
305 unsigned Bits = Ty->getPrimitiveSizeInBits();
306 if (Bits == 0 || Imm.getActiveBits() >= 64)
307 return 4;
308
309 int64_t SImmVal = Imm.getSExtValue();
310 uint64_t ZImmVal = Imm.getZExtValue();
311 if (!ST->isThumb()) {
312 if ((SImmVal >= 0 && SImmVal < 65536) ||
313 (ARM_AM::getSOImmVal(Arg: ZImmVal) != -1) ||
314 (ARM_AM::getSOImmVal(Arg: ~ZImmVal) != -1))
315 return 1;
316 return ST->hasV6T2Ops() ? 2 : 3;
317 }
318 if (ST->isThumb2()) {
319 if ((SImmVal >= 0 && SImmVal < 65536) ||
320 (ARM_AM::getT2SOImmVal(Arg: ZImmVal) != -1) ||
321 (ARM_AM::getT2SOImmVal(Arg: ~ZImmVal) != -1))
322 return 1;
323 return ST->hasV6T2Ops() ? 2 : 3;
324 }
  // Thumb1: any i8 immediate costs 1.
326 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
327 return 1;
328 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(V: ZImmVal))
329 return 2;
330 // Load from constantpool.
331 return 3;
332}
333
// Constants smaller than 256 fit in the immediate field of
// Thumb1 instructions, so we return a cost of zero for them and 1 otherwise.
336InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
337 const APInt &Imm, Type *Ty) {
338 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
339 return 0;
340
341 return 1;
342}
343
344// Checks whether Inst is part of a min(max()) or max(min()) pattern
345// that will match to an SSAT instruction. Returns the instruction being
346// saturated, or null if no saturation pattern was found.
347static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
348 Value *LHS, *RHS;
349 ConstantInt *C;
350 SelectPatternFlavor InstSPF = matchSelectPattern(V: Inst, LHS, RHS).Flavor;
351
352 if (InstSPF == SPF_SMAX &&
353 PatternMatch::match(V: RHS, P: PatternMatch::m_ConstantInt(CI&: C)) &&
354 C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
355
356 auto isSSatMin = [&](Value *MinInst) {
357 if (isa<SelectInst>(Val: MinInst)) {
358 Value *MinLHS, *MinRHS;
359 ConstantInt *MinC;
360 SelectPatternFlavor MinSPF =
361 matchSelectPattern(V: MinInst, LHS&: MinLHS, RHS&: MinRHS).Flavor;
362 if (MinSPF == SPF_SMIN &&
363 PatternMatch::match(V: MinRHS, P: PatternMatch::m_ConstantInt(CI&: MinC)) &&
364 MinC->getValue() == ((-Imm) - 1))
365 return true;
366 }
367 return false;
368 };
369
370 if (isSSatMin(Inst->getOperand(i: 1)))
371 return cast<Instruction>(Val: Inst->getOperand(i: 1))->getOperand(i: 1);
372 if (Inst->hasNUses(N: 2) &&
373 (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
374 return Inst->getOperand(i: 1);
375 }
376 return nullptr;
377}
378
// Look for an FP saturation pattern, where the instruction can be simplified
// to a fptosi.sat of the form max(min(fptosi)). The constant in this case is
// always free.
381static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
382 if (Imm.getBitWidth() != 64 ||
383 Imm != APInt::getHighBitsSet(numBits: 64, hiBitsSet: 33)) // -2147483648
384 return false;
385 Value *FP = isSSATMinMaxPattern(Inst, Imm);
386 if (!FP && isa<ICmpInst>(Val: Inst) && Inst->hasOneUse())
387 FP = isSSATMinMaxPattern(Inst: cast<Instruction>(Val: *Inst->user_begin()), Imm);
388 if (!FP)
389 return false;
390 return isa<FPToSIInst>(Val: FP);
391}
392
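// Cost of an integer immediate when used as operand Idx of the given opcode.
// Many immediates are free here because they fold into the instruction
// (e.g. via BIC, SUB, CMN or SSAT patterns).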
393InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
394 const APInt &Imm, Type *Ty,
395 TTI::TargetCostKind CostKind,
396 Instruction *Inst) {
397 // Division by a constant can be turned into multiplication, but only if we
398 // know it's constant. So it's not so much that the immediate is cheap (it's
399 // not), but that the alternative is worse.
400 // FIXME: this is probably unneeded with GlobalISel.
401 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
402 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
403 Idx == 1)
404 return 0;
405
  // Leave any gep offsets for CodeGenPrepare, which will do a better job of
  // splitting any large offsets.
408 if (Opcode == Instruction::GetElementPtr && Idx != 0)
409 return 0;
410
411 if (Opcode == Instruction::And) {
412 // UXTB/UXTH
413 if (Imm == 255 || Imm == 65535)
414 return 0;
415 // Conversion to BIC is free, and means we can use ~Imm instead.
416 return std::min(a: getIntImmCost(Imm, Ty, CostKind),
417 b: getIntImmCost(Imm: ~Imm, Ty, CostKind));
418 }
419
420 if (Opcode == Instruction::Add)
421 // Conversion to SUB is free, and means we can use -Imm instead.
422 return std::min(a: getIntImmCost(Imm, Ty, CostKind),
423 b: getIntImmCost(Imm: -Imm, Ty, CostKind));
424
425 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
426 Ty->getIntegerBitWidth() == 32) {
427 int64_t NegImm = -Imm.getSExtValue();
428 if (ST->isThumb2() && NegImm < 1<<12)
429 // icmp X, #-C -> cmn X, #C
430 return 0;
431 if (ST->isThumb() && NegImm < 1<<8)
432 // icmp X, #-C -> adds X, #C
433 return 0;
434 }
435
436 // xor a, -1 can always be folded to MVN
437 if (Opcode == Instruction::Xor && Imm.isAllOnes())
438 return 0;
439
  // Ensure negative constants in min(max()) or max(min()) patterns that
  // match SSAT instructions don't get hoisted.
442 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
443 Ty->getIntegerBitWidth() <= 32) {
444 if (isSSATMinMaxPattern(Inst, Imm) ||
445 (isa<ICmpInst>(Val: Inst) && Inst->hasOneUse() &&
446 isSSATMinMaxPattern(Inst: cast<Instruction>(Val: *Inst->user_begin()), Imm)))
447 return 0;
448 }
449
450 if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
451 return 0;
452
453 // We can convert <= -1 to < 0, which is generally quite cheap.
454 if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
455 ICmpInst::Predicate Pred = cast<ICmpInst>(Val: Inst)->getPredicate();
456 if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
457 return std::min(a: getIntImmCost(Imm, Ty, CostKind),
458 b: getIntImmCost(Imm: Imm + 1, Ty, CostKind));
459 }
460
461 return getIntImmCost(Imm, Ty, CostKind);
462}
463
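// Cost of control-flow instructions. These are treated as free for throughput
// on NEON/MVE targets (see the FIXME below).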
464InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
465 TTI::TargetCostKind CostKind,
466 const Instruction *I) {
467 if (CostKind == TTI::TCK_RecipThroughput &&
468 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
    // FIXME: The vectorizer is highly sensitive to the cost of these
470 // instructions, which suggests that it may be using the costs incorrectly.
471 // But, for now, just make them free to avoid performance regressions for
472 // vector targets.
473 return 0;
474 }
475 return BaseT::getCFInstrCost(Opcode, CostKind, I);
476}
477
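// Cost of cast instructions, driven mainly by the NEON and MVE conversion
// cost tables below.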
478InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
479 Type *Src,
480 TTI::CastContextHint CCH,
481 TTI::TargetCostKind CostKind,
482 const Instruction *I) {
483 int ISD = TLI->InstructionOpcodeToISD(Opcode);
484 assert(ISD && "Invalid opcode");
485
486 // TODO: Allow non-throughput costs that aren't binary.
487 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
488 if (CostKind != TTI::TCK_RecipThroughput)
489 return Cost == 0 ? 0 : 1;
490 return Cost;
491 };
492 auto IsLegalFPType = [this](EVT VT) {
493 EVT EltVT = VT.getScalarType();
494 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
495 (EltVT == MVT::f64 && ST->hasFP64()) ||
496 (EltVT == MVT::f16 && ST->hasFullFP16());
497 };
498
499 EVT SrcTy = TLI->getValueType(DL, Ty: Src);
500 EVT DstTy = TLI->getValueType(DL, Ty: Dst);
501
502 if (!SrcTy.isSimple() || !DstTy.isSimple())
503 return AdjustCost(
504 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
505
  // Extending masked loads and truncating masked stores are expensive because
  // we currently don't split them. This means that we'll likely end up
  // loading/storing each element individually (hence the high cost).
509 if ((ST->hasMVEIntegerOps() &&
510 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
511 Opcode == Instruction::SExt)) ||
512 (ST->hasMVEFloatOps() &&
513 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
514 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
515 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
516 return 2 * DstTy.getVectorNumElements() *
517 ST->getMVEVectorCostFactor(CostKind);
518
  // The extension of other kinds of load is free.
520 if (CCH == TTI::CastContextHint::Normal ||
521 CCH == TTI::CastContextHint::Masked) {
522 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
523 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::i32, .Src: MVT::i16, .Cost: 0},
524 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::i32, .Src: MVT::i16, .Cost: 0},
525 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::i32, .Src: MVT::i8, .Cost: 0},
526 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::i32, .Src: MVT::i8, .Cost: 0},
527 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::i16, .Src: MVT::i8, .Cost: 0},
528 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::i16, .Src: MVT::i8, .Cost: 0},
529 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::i64, .Src: MVT::i32, .Cost: 1},
530 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::i64, .Src: MVT::i32, .Cost: 1},
531 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::i64, .Src: MVT::i16, .Cost: 1},
532 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::i64, .Src: MVT::i16, .Cost: 1},
533 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::i64, .Src: MVT::i8, .Cost: 1},
534 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::i64, .Src: MVT::i8, .Cost: 1},
535 };
536 if (const auto *Entry = ConvertCostTableLookup(
537 Table: LoadConversionTbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
538 return AdjustCost(Entry->Cost);
539
540 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
541 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 0},
542 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 0},
543 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 0},
544 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 0},
545 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 0},
546 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 0},
        // The following entries extend from a legal type to an illegal type,
        // so the load needs to be split. This introduces an extra load
        // operation, but the extend is still "free".
550 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 1},
551 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 1},
552 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 3},
553 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 3},
554 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 1},
555 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 1},
556 };
557 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
558 if (const auto *Entry =
559 ConvertCostTableLookup(Table: MVELoadConversionTbl, ISD,
560 Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
561 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
562 }
563
564 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
565 // FPExtends are similar but also require the VCVT instructions.
566 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f32, .Src: MVT::v4f16, .Cost: 1},
567 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f32, .Src: MVT::v8f16, .Cost: 3},
568 };
569 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
570 if (const auto *Entry =
571 ConvertCostTableLookup(Table: MVEFLoadConversionTbl, ISD,
572 Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
573 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
574 }
575
576 // The truncate of a store is free. This is the mirror of extends above.
577 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
578 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 0},
579 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 0},
580 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 0},
581 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 1},
582 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 1},
583 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 3},
584 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 1},
585 };
586 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
587 if (const auto *Entry =
588 ConvertCostTableLookup(Table: MVEStoreConversionTbl, ISD,
589 Dst: SrcTy.getSimpleVT(), Src: DstTy.getSimpleVT()))
590 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
591 }
592
593 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
594 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f32, .Src: MVT::v4f16, .Cost: 1},
595 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8f32, .Src: MVT::v8f16, .Cost: 3},
596 };
597 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
598 if (const auto *Entry =
599 ConvertCostTableLookup(Table: MVEFStoreConversionTbl, ISD,
600 Dst: SrcTy.getSimpleVT(), Src: DstTy.getSimpleVT()))
601 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
602 }
603 }
604
605 // NEON vector operations that can extend their inputs.
606 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
607 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
608 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
609 // vaddl
610 { .ISD: ISD::ADD, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 0 },
611 { .ISD: ISD::ADD, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 0 },
612 // vsubl
613 { .ISD: ISD::SUB, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 0 },
614 { .ISD: ISD::SUB, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 0 },
615 // vmull
616 { .ISD: ISD::MUL, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 0 },
617 { .ISD: ISD::MUL, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 0 },
618 // vshll
619 { .ISD: ISD::SHL, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 0 },
620 { .ISD: ISD::SHL, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 0 },
621 };
622
623 auto *User = cast<Instruction>(Val: *I->user_begin());
624 int UserISD = TLI->InstructionOpcodeToISD(Opcode: User->getOpcode());
625 if (auto *Entry = ConvertCostTableLookup(Table: NEONDoubleWidthTbl, ISD: UserISD,
626 Dst: DstTy.getSimpleVT(),
627 Src: SrcTy.getSimpleVT())) {
628 return AdjustCost(Entry->Cost);
629 }
630 }
631
632 // Single to/from double precision conversions.
633 if (Src->isVectorTy() && ST->hasNEON() &&
634 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
635 DstTy.getScalarType() == MVT::f32) ||
636 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
637 DstTy.getScalarType() == MVT::f64))) {
638 static const CostTblEntry NEONFltDblTbl[] = {
639 // Vector fptrunc/fpext conversions.
640 {.ISD: ISD::FP_ROUND, .Type: MVT::v2f64, .Cost: 2},
641 {.ISD: ISD::FP_EXTEND, .Type: MVT::v2f32, .Cost: 2},
642 {.ISD: ISD::FP_EXTEND, .Type: MVT::v4f32, .Cost: 4}};
643
644 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Src);
645 if (const auto *Entry = CostTableLookup(Table: NEONFltDblTbl, ISD, Ty: LT.second))
646 return AdjustCost(LT.first * Entry->Cost);
647 }
648
649 // Some arithmetic, load and store operations have specific instructions
650 // to cast up/down their types automatically at no extra cost.
651 // TODO: Get these tables to know at least what the related operations are.
652 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
653 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 1 },
654 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 1 },
655 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i32, .Cost: 1 },
656 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i32, .Cost: 1 },
657 { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: 0 },
658 { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i32, .Cost: 1 },
659
660 // The number of vmovl instructions for the extension.
661 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 1 },
662 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 1 },
663 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 2 },
664 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 2 },
665 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i8, .Cost: 3 },
666 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i8, .Cost: 3 },
667 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i16, .Cost: 2 },
668 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i16, .Cost: 2 },
669 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3 },
670 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3 },
671 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3 },
672 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3 },
673 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7 },
674 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7 },
675 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6 },
676 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6 },
677 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6 },
678 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6 },
679
680 // Operations that we legalize using splitting.
681 { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: 6 },
682 { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i32, .Cost: 3 },
683
684 // Vector float <-> i32 conversions.
685 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1 },
686 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1 },
687
688 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3 },
689 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3 },
690 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 2 },
691 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 2 },
692 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1 },
693 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1 },
694 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i1, .Cost: 3 },
695 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i1, .Cost: 3 },
696 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 3 },
697 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 3 },
698 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2 },
699 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2 },
700 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4 },
701 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4 },
702 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i32, .Cost: 2 },
703 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i32, .Cost: 2 },
704 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i16, .Cost: 8 },
705 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i16, .Cost: 8 },
706 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i32, .Cost: 4 },
707 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i32, .Cost: 4 },
708
709 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1 },
710 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1 },
711 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 3 },
712 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 3 },
713 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2 },
714 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2 },
715
716 // Vector double <-> i32 conversions.
717 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2 },
718 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2 },
719
720 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4 },
721 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4 },
722 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 3 },
723 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 3 },
724 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2 },
725 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2 },
726
727 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2 },
728 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2 },
729 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f32, .Cost: 4 },
730 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f32, .Cost: 4 },
731 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v16f32, .Cost: 8 },
732 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v16f32, .Cost: 8 }
733 };
734
735 if (SrcTy.isVector() && ST->hasNEON()) {
736 if (const auto *Entry = ConvertCostTableLookup(Table: NEONVectorConversionTbl, ISD,
737 Dst: DstTy.getSimpleVT(),
738 Src: SrcTy.getSimpleVT()))
739 return AdjustCost(Entry->Cost);
740 }
741
742 // Scalar float to integer conversions.
743 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
744 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i1, .Src: MVT::f32, .Cost: 2 },
745 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i1, .Src: MVT::f32, .Cost: 2 },
746 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i1, .Src: MVT::f64, .Cost: 2 },
747 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i1, .Src: MVT::f64, .Cost: 2 },
748 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i8, .Src: MVT::f32, .Cost: 2 },
749 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i8, .Src: MVT::f32, .Cost: 2 },
750 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i8, .Src: MVT::f64, .Cost: 2 },
751 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i8, .Src: MVT::f64, .Cost: 2 },
752 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i16, .Src: MVT::f32, .Cost: 2 },
753 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i16, .Src: MVT::f32, .Cost: 2 },
754 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i16, .Src: MVT::f64, .Cost: 2 },
755 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i16, .Src: MVT::f64, .Cost: 2 },
756 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i32, .Src: MVT::f32, .Cost: 2 },
757 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i32, .Src: MVT::f32, .Cost: 2 },
758 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i32, .Src: MVT::f64, .Cost: 2 },
759 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i32, .Src: MVT::f64, .Cost: 2 },
760 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: 10 },
761 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: 10 },
762 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: 10 },
763 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: 10 }
764 };
765 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
766 if (const auto *Entry = ConvertCostTableLookup(Table: NEONFloatConversionTbl, ISD,
767 Dst: DstTy.getSimpleVT(),
768 Src: SrcTy.getSimpleVT()))
769 return AdjustCost(Entry->Cost);
770 }
771
772 // Scalar integer to float conversions.
773 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
774 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i1, .Cost: 2 },
775 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i1, .Cost: 2 },
776 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i1, .Cost: 2 },
777 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i1, .Cost: 2 },
778 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i8, .Cost: 2 },
779 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i8, .Cost: 2 },
780 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i8, .Cost: 2 },
781 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i8, .Cost: 2 },
782 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i16, .Cost: 2 },
783 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i16, .Cost: 2 },
784 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i16, .Cost: 2 },
785 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i16, .Cost: 2 },
786 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i32, .Cost: 2 },
787 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i32, .Cost: 2 },
788 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i32, .Cost: 2 },
789 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i32, .Cost: 2 },
790 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i64, .Cost: 10 },
791 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i64, .Cost: 10 },
792 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i64, .Cost: 10 },
793 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i64, .Cost: 10 }
794 };
795
796 if (SrcTy.isInteger() && ST->hasNEON()) {
797 if (const auto *Entry = ConvertCostTableLookup(Table: NEONIntegerConversionTbl,
798 ISD, Dst: DstTy.getSimpleVT(),
799 Src: SrcTy.getSimpleVT()))
800 return AdjustCost(Entry->Cost);
801 }
802
  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
  // are linearised so take more.
806 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
807 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 1 },
808 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 1 },
809 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 2 },
810 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 2 },
811 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i8, .Cost: 10 },
812 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i8, .Cost: 2 },
813 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 1 },
814 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 1 },
815 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i16, .Cost: 10 },
816 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i16, .Cost: 2 },
817 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i32, .Cost: 8 },
818 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i32, .Cost: 2 },
819 };
820
821 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
822 if (const auto *Entry = ConvertCostTableLookup(Table: MVEVectorConversionTbl,
823 ISD, Dst: DstTy.getSimpleVT(),
824 Src: SrcTy.getSimpleVT()))
825 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
826 }
827
828 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
    // As a general rule, fp converts that were not matched above are
    // scalarized and cost one vcvt per lane, so long as the instruction is
    // available. If not, they become a series of function calls.
832 const InstructionCost CallCost =
833 getCallInstrCost(F: nullptr, RetTy: Dst, Tys: {Src}, CostKind);
834 int Lanes = 1;
835 if (SrcTy.isFixedLengthVector())
836 Lanes = SrcTy.getVectorNumElements();
837
838 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
839 return Lanes;
840 else
841 return Lanes * CallCost;
842 }
843
844 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
845 SrcTy.isFixedLengthVector()) {
    // Treat a truncate with a larger-than-legal source (128 bits for MVE) as
    // expensive: 2 instructions per lane.
848 if ((SrcTy.getScalarType() == MVT::i8 ||
849 SrcTy.getScalarType() == MVT::i16 ||
850 SrcTy.getScalarType() == MVT::i32) &&
851 SrcTy.getSizeInBits() > 128 &&
852 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
853 return SrcTy.getVectorNumElements() * 2;
854 }
855
856 // Scalar integer conversion costs.
857 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
858 // i16 -> i64 requires two dependent operations.
859 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::i64, .Src: MVT::i16, .Cost: 2 },
860
861 // Truncates on i64 are assumed to be free.
862 { .ISD: ISD::TRUNCATE, .Dst: MVT::i32, .Src: MVT::i64, .Cost: 0 },
863 { .ISD: ISD::TRUNCATE, .Dst: MVT::i16, .Src: MVT::i64, .Cost: 0 },
864 { .ISD: ISD::TRUNCATE, .Dst: MVT::i8, .Src: MVT::i64, .Cost: 0 },
865 { .ISD: ISD::TRUNCATE, .Dst: MVT::i1, .Src: MVT::i64, .Cost: 0 }
866 };
867
868 if (SrcTy.isInteger()) {
869 if (const auto *Entry = ConvertCostTableLookup(Table: ARMIntegerConversionTbl, ISD,
870 Dst: DstTy.getSimpleVT(),
871 Src: SrcTy.getSimpleVT()))
872 return AdjustCost(Entry->Cost);
873 }
874
875 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
876 ? ST->getMVEVectorCostFactor(CostKind)
877 : 1;
878 return AdjustCost(
879 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
880}
881
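// Cost of insertelement/extractelement, penalizing lane moves that cross
// between GPR and vector registers.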
882InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
883 TTI::TargetCostKind CostKind,
884 unsigned Index, Value *Op0,
885 Value *Op1) {
  // Penalize inserting into a D-subregister. We end up with a three times
  // lower estimated throughput on Swift.
888 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
889 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
890 return 3;
891
892 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
893 Opcode == Instruction::ExtractElement)) {
894 // Cross-class copies are expensive on many microarchitectures,
895 // so assume they are expensive by default.
896 if (cast<VectorType>(Val: ValTy)->getElementType()->isIntegerTy())
897 return 3;
898
    // Even if it's not a cross-class copy, this likely leads to mixing
    // of NEON and VFP code and should therefore be penalized.
901 if (ValTy->isVectorTy() &&
902 ValTy->getScalarSizeInBits() <= 32)
903 return std::max<InstructionCost>(
904 a: BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1),
905 b: 2U);
906 }
907
908 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
909 Opcode == Instruction::ExtractElement)) {
    // Integer cross-lane moves are more expensive than float, which can
    // sometimes just be vmovs. Integers involve being passed to GPR registers,
    // causing more of a delay.
913 std::pair<InstructionCost, MVT> LT =
914 getTypeLegalizationCost(Ty: ValTy->getScalarType());
915 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
916 }
917
918 return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1);
919}
920
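// Cost of compare and select instructions, including vector min/max patterns
// that are costed as the equivalent intrinsics.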
921InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
922 Type *CondTy,
923 CmpInst::Predicate VecPred,
924 TTI::TargetCostKind CostKind,
925 const Instruction *I) {
926 int ISD = TLI->InstructionOpcodeToISD(Opcode);
927
928 // Thumb scalar code size cost for select.
929 if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
930 ST->isThumb() && !ValTy->isVectorTy()) {
931 // Assume expensive structs.
932 if (TLI->getValueType(DL, Ty: ValTy, AllowUnknown: true) == MVT::Other)
933 return TTI::TCC_Expensive;
934
935 // Select costs can vary because they:
936 // - may require one or more conditional mov (including an IT),
937 // - can't operate directly on immediates,
938 // - require live flags, which we can't copy around easily.
939 InstructionCost Cost = getTypeLegalizationCost(Ty: ValTy).first;
940
941 // Possible IT instruction for Thumb2, or more for Thumb1.
942 ++Cost;
943
944 // i1 values may need rematerialising by using mov immediates and/or
945 // flag setting instructions.
946 if (ValTy->isIntegerTy(Bitwidth: 1))
947 ++Cost;
948
949 return Cost;
950 }
951
952 // If this is a vector min/max/abs, use the cost of that intrinsic directly
953 // instead. Hopefully when min/max intrinsics are more prevalent this code
954 // will not be needed.
955 const Instruction *Sel = I;
956 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
957 Sel->hasOneUse())
958 Sel = cast<Instruction>(Val: Sel->user_back());
959 if (Sel && ValTy->isVectorTy() &&
960 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
961 const Value *LHS, *RHS;
962 SelectPatternFlavor SPF = matchSelectPattern(V: Sel, LHS, RHS).Flavor;
963 unsigned IID = 0;
964 switch (SPF) {
965 case SPF_ABS:
966 IID = Intrinsic::abs;
967 break;
968 case SPF_SMIN:
969 IID = Intrinsic::smin;
970 break;
971 case SPF_SMAX:
972 IID = Intrinsic::smax;
973 break;
974 case SPF_UMIN:
975 IID = Intrinsic::umin;
976 break;
977 case SPF_UMAX:
978 IID = Intrinsic::umax;
979 break;
980 case SPF_FMINNUM:
981 IID = Intrinsic::minnum;
982 break;
983 case SPF_FMAXNUM:
984 IID = Intrinsic::maxnum;
985 break;
986 default:
987 break;
988 }
989 if (IID) {
990 // The ICmp is free, the select gets the cost of the min/max/etc
991 if (Sel != I)
992 return 0;
993 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
994 return getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
995 }
996 }
997
998 // On NEON a vector select gets lowered to vbsl.
999 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
1000 // Lowering of some vector selects is currently far from perfect.
1001 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
1002 { .ISD: ISD::SELECT, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: 4*4 + 1*2 + 1 },
1003 { .ISD: ISD::SELECT, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: 50 },
1004 { .ISD: ISD::SELECT, .Dst: MVT::v16i1, .Src: MVT::v16i64, .Cost: 100 }
1005 };
1006
1007 EVT SelCondTy = TLI->getValueType(DL, Ty: CondTy);
1008 EVT SelValTy = TLI->getValueType(DL, Ty: ValTy);
1009 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1010 if (const auto *Entry = ConvertCostTableLookup(Table: NEONVectorSelectTbl, ISD,
1011 Dst: SelCondTy.getSimpleVT(),
1012 Src: SelValTy.getSimpleVT()))
1013 return Entry->Cost;
1014 }
1015
1016 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1017 return LT.first;
1018 }
1019
1020 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1021 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1022 cast<FixedVectorType>(Val: ValTy)->getNumElements() > 1) {
1023 FixedVectorType *VecValTy = cast<FixedVectorType>(Val: ValTy);
1024 FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(Val: CondTy);
1025 if (!VecCondTy)
1026 VecCondTy = cast<FixedVectorType>(Val: CmpInst::makeCmpResultType(opnd_type: VecValTy));
1027
    // If we don't have mve.fp, any fp operations will need to be scalarized.
    if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
      // One scalarization insert, one scalarization extract and the cost of
      // the fcmps.
1032 return BaseT::getScalarizationOverhead(InTy: VecValTy, /*Insert*/ false,
1033 /*Extract*/ true, CostKind) +
1034 BaseT::getScalarizationOverhead(InTy: VecCondTy, /*Insert*/ true,
1035 /*Extract*/ false, CostKind) +
1036 VecValTy->getNumElements() *
1037 getCmpSelInstrCost(Opcode, ValTy: ValTy->getScalarType(),
1038 CondTy: VecCondTy->getScalarType(), VecPred,
1039 CostKind, I);
1040 }
1041
1042 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1043 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
    // There are two types - the input that specifies the type of the compare
    // and the output vXi1 type. Because we don't know how the output will be
    // split, we may need an expensive shuffle to get the two in sync. This has
    // the effect of making larger-than-legal compares (v8i32, for example)
    // expensive.
1049 if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1050 if (LT.first > 1)
1051 return LT.first * BaseCost +
1052 BaseT::getScalarizationOverhead(InTy: VecCondTy, /*Insert*/ true,
1053 /*Extract*/ false, CostKind);
1054 return BaseCost;
1055 }
1056 }
1057
1058 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1059 // for "multiple beats" potentially needed by MVE instructions.
1060 int BaseCost = 1;
1061 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1062 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1063
1064 return BaseCost *
1065 BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1066}
1067
1068InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
1069 ScalarEvolution *SE,
1070 const SCEV *Ptr) {
1071 // Address computations in vectorized code with non-consecutive addresses will
1072 // likely result in more instructions compared to scalar code where the
1073 // computation can more often be merged into the index mode. The resulting
1074 // extra micro-ops can significantly decrease throughput.
1075 unsigned NumVectorInstToHideOverhead = 10;
1076 int MaxMergeDistance = 64;
1077
1078 if (ST->hasNEON()) {
1079 if (Ty->isVectorTy() && SE &&
1080 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MergeDistance: MaxMergeDistance + 1))
1081 return NumVectorInstToHideOverhead;
1082
1083 // In many cases the address computation is not merged into the instruction
1084 // addressing mode.
1085 return 1;
1086 }
1087 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1088}
1089
1090bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
1091 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: I)) {
1092 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1093 // optimized, else LSR may block tail-predication.
1094 switch (II->getIntrinsicID()) {
1095 case Intrinsic::arm_mve_vctp8:
1096 case Intrinsic::arm_mve_vctp16:
1097 case Intrinsic::arm_mve_vctp32:
1098 case Intrinsic::arm_mve_vctp64:
1099 return true;
1100 default:
1101 break;
1102 }
1103 }
1104 return false;
1105}
1106
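// Masked loads are only legal with MVE, for 8/16/32-bit element types with
// sufficient alignment; v2i1 vectors and non-128-bit fp vectors are not
// supported.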
1107bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1108 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1109 return false;
1110
1111 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: DataTy)) {
1112 // Don't support v2i1 yet.
1113 if (VecTy->getNumElements() == 2)
1114 return false;
1115
1116 // We don't support extending fp types.
1117 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1118 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1119 return false;
1120 }
1121
1122 unsigned EltWidth = DataTy->getScalarSizeInBits();
1123 return (EltWidth == 32 && Alignment >= 4) ||
1124 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1125}
1126
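// MVE gathers/scatters are legal for 8/16/32-bit element types with
// sufficient alignment.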
1127bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1128 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1129 return false;
1130
1131 unsigned EltWidth = Ty->getScalarSizeInBits();
1132 return ((EltWidth == 32 && Alignment >= 4) ||
1133 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1134}
1135
1136/// Given a memcpy/memset/memmove instruction, return the number of memory
1137/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1138/// call is used.
1139int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1140 MemOp MOp;
1141 unsigned DstAddrSpace = ~0u;
1142 unsigned SrcAddrSpace = ~0u;
1143 const Function *F = I->getParent()->getParent();
1144
1145 if (const auto *MC = dyn_cast<MemTransferInst>(Val: I)) {
1146 ConstantInt *C = dyn_cast<ConstantInt>(Val: MC->getLength());
1147 // If 'size' is not a constant, a library call will be generated.
1148 if (!C)
1149 return -1;
1150
1151 const unsigned Size = C->getValue().getZExtValue();
1152 const Align DstAlign = *MC->getDestAlign();
1153 const Align SrcAlign = *MC->getSourceAlign();
1154
1155 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1156 /*IsVolatile*/ false);
1157 DstAddrSpace = MC->getDestAddressSpace();
1158 SrcAddrSpace = MC->getSourceAddressSpace();
1159 }
1160 else if (const auto *MS = dyn_cast<MemSetInst>(Val: I)) {
1161 ConstantInt *C = dyn_cast<ConstantInt>(Val: MS->getLength());
1162 // If 'size' is not a constant, a library call will be generated.
1163 if (!C)
1164 return -1;
1165
1166 const unsigned Size = C->getValue().getZExtValue();
1167 const Align DstAlign = *MS->getDestAlign();
1168
1169 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1170 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1171 DstAddrSpace = MS->getDestAddressSpace();
1172 }
1173 else
1174 llvm_unreachable("Expected a memcpy/move or memset!");
1175
1176 unsigned Limit, Factor = 2;
1177 switch(I->getIntrinsicID()) {
1178 case Intrinsic::memcpy:
1179 Limit = TLI->getMaxStoresPerMemcpy(OptSize: F->hasMinSize());
1180 break;
1181 case Intrinsic::memmove:
1182 Limit = TLI->getMaxStoresPerMemmove(OptSize: F->hasMinSize());
1183 break;
1184 case Intrinsic::memset:
1185 Limit = TLI->getMaxStoresPerMemset(OptSize: F->hasMinSize());
1186 Factor = 1;
1187 break;
1188 default:
1189 llvm_unreachable("Expected a memcpy/move or memset!");
1190 }
1191
  // MemOps will be populated with a list of data types that need to be
  // loaded and stored. That's why we multiply the number of elements by 2 to
  // get the cost for this memcpy.
1195 std::vector<EVT> MemOps;
1196 if (getTLI()->findOptimalMemOpLowering(
1197 MemOps, Limit, Op: MOp, DstAS: DstAddrSpace,
1198 SrcAS: SrcAddrSpace, FuncAttributes: F->getAttributes()))
1199 return MemOps.size() * Factor;
1200
1201 // If we can't find an optimal memop lowering, return the default cost
1202 return -1;
1203}
1204
1205InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1206 int NumOps = getNumMemOps(I: cast<IntrinsicInst>(Val: I));
1207
1208 // To model the cost of a library call, we assume 1 for the call, and
1209 // 3 for the argument setup.
1210 if (NumOps == -1)
1211 return 4;
1212 return NumOps;
1213}
1214
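// Shuffle costs, using the NEON/MVE tables below for broadcast, reverse and
// select shuffles, plus MVE VREV patterns.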
1215InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1216 VectorType *Tp, ArrayRef<int> Mask,
1217 TTI::TargetCostKind CostKind,
1218 int Index, VectorType *SubTp,
1219 ArrayRef<const Value *> Args,
1220 const Instruction *CxtI) {
1221 Kind = improveShuffleKindFromMask(Kind, Mask, Ty: Tp, Index, SubTy&: SubTp);
1222 // Treat extractsubvector as single op permutation.
1223 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
1224 if (IsExtractSubvector)
1225 Kind = TTI::SK_PermuteSingleSrc;
1226 if (ST->hasNEON()) {
1227 if (Kind == TTI::SK_Broadcast) {
1228 static const CostTblEntry NEONDupTbl[] = {
1229 // VDUP handles these cases.
1230 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2i32, .Cost: 1},
1231 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2f32, .Cost: 1},
1232 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2i64, .Cost: 1},
1233 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2f64, .Cost: 1},
1234 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4i16, .Cost: 1},
1235 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v8i8, .Cost: 1},
1236
1237 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4i32, .Cost: 1},
1238 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4f32, .Cost: 1},
1239 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v8i16, .Cost: 1},
1240 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v16i8, .Cost: 1}};
1241
1242 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp);
1243 if (const auto *Entry =
1244 CostTableLookup(Table: NEONDupTbl, ISD: ISD::VECTOR_SHUFFLE, Ty: LT.second))
1245 return LT.first * Entry->Cost;
1246 }
1247 if (Kind == TTI::SK_Reverse) {
1248 static const CostTblEntry NEONShuffleTbl[] = {
          // Reverse shuffles cost one instruction if we are shuffling within a
          // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1251 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2i32, .Cost: 1},
1252 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2f32, .Cost: 1},
1253 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2i64, .Cost: 1},
1254 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2f64, .Cost: 1},
1255 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4i16, .Cost: 1},
1256 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v8i8, .Cost: 1},
1257
1258 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4i32, .Cost: 2},
1259 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4f32, .Cost: 2},
1260 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v8i16, .Cost: 2},
1261 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v16i8, .Cost: 2}};
1262
1263 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp);
1264 if (const auto *Entry =
1265 CostTableLookup(Table: NEONShuffleTbl, ISD: ISD::VECTOR_SHUFFLE, Ty: LT.second))
1266 return LT.first * Entry->Cost;
1267 }
1268 if (Kind == TTI::SK_Select) {
1269 static const CostTblEntry NEONSelShuffleTbl[] = {
          // Select shuffle cost table for ARM. Cost is the number of
          // instructions required to create the shuffled vector.
1273
1274 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2f32, .Cost: 1},
1275 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2i64, .Cost: 1},
1276 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2f64, .Cost: 1},
1277 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2i32, .Cost: 1},
1278
1279 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4i32, .Cost: 2},
1280 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4f32, .Cost: 2},
1281 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4i16, .Cost: 2},
1282
1283 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v8i16, .Cost: 16},
1284
1285 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v16i8, .Cost: 32}};
1286
1287 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp);
1288 if (const auto *Entry = CostTableLookup(Table: NEONSelShuffleTbl,
1289 ISD: ISD::VECTOR_SHUFFLE, Ty: LT.second))
1290 return LT.first * Entry->Cost;
1291 }
1292 }
1293 if (ST->hasMVEIntegerOps()) {
1294 if (Kind == TTI::SK_Broadcast) {
1295 static const CostTblEntry MVEDupTbl[] = {
1296 // VDUP handles these cases.
1297 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4i32, .Cost: 1},
1298 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v8i16, .Cost: 1},
1299 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v16i8, .Cost: 1},
1300 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4f32, .Cost: 1},
1301 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v8f16, .Cost: 1}};
1302
1303 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp);
1304 if (const auto *Entry = CostTableLookup(Table: MVEDupTbl, ISD: ISD::VECTOR_SHUFFLE,
1305 Ty: LT.second))
1306 return LT.first * Entry->Cost *
1307 ST->getMVEVectorCostFactor(CostKind: TTI::TCK_RecipThroughput);
1308 }
1309
1310 if (!Mask.empty()) {
1311 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp);
1312 if (LT.second.isVector() &&
1313 Mask.size() <= LT.second.getVectorNumElements() &&
1314 (isVREVMask(M: Mask, VT: LT.second, BlockSize: 16) || isVREVMask(M: Mask, VT: LT.second, BlockSize: 32) ||
1315 isVREVMask(M: Mask, VT: LT.second, BlockSize: 64)))
1316 return ST->getMVEVectorCostFactor(CostKind: TTI::TCK_RecipThroughput) * LT.first;
1317 }
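  // For illustration: a mask such as <1, 0, 3, 2> on a v4i32 matches
  // isVREVMask(Mask, v4i32, 64) and corresponds to a single VREV64.32, so it
  // is costed above as one beat-adjusted MVE instruction rather than as a
  // generic shuffle.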
1318 }
1319
1320 // Restore optimal kind.
1321 if (IsExtractSubvector)
1322 Kind = TTI::SK_ExtractSubvector;
1323 int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1324 ? ST->getMVEVectorCostFactor(CostKind: TTI::TCK_RecipThroughput)
1325 : 1;
1326 return BaseCost *
1327 BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
1328}
1329
1330InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1331 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1332 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1333 ArrayRef<const Value *> Args,
1334 const Instruction *CxtI) {
1335 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1336 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(Bitwidth: 1)) {
1337 // Make operations on i1 relatively expensive as this often involves
1338 // combining predicates. AND and XOR should be easier to handle with IT
1339 // blocks.
1340 switch (ISDOpcode) {
1341 default:
1342 break;
1343 case ISD::AND:
1344 case ISD::XOR:
1345 return 2;
1346 case ISD::OR:
1347 return 3;
1348 }
1349 }
1350
1351 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1352
1353 if (ST->hasNEON()) {
1354 const unsigned FunctionCallDivCost = 20;
1355 const unsigned ReciprocalDivCost = 10;
1356 static const CostTblEntry CostTbl[] = {
1357 // Division.
1358 // These costs are somewhat random. Choose a cost of 20 to indicate that
1359 // vectorizing division (added function call) is going to be very expensive.
1360 // Double register types.
1361 { .ISD: ISD::SDIV, .Type: MVT::v1i64, .Cost: 1 * FunctionCallDivCost},
1362 { .ISD: ISD::UDIV, .Type: MVT::v1i64, .Cost: 1 * FunctionCallDivCost},
1363 { .ISD: ISD::SREM, .Type: MVT::v1i64, .Cost: 1 * FunctionCallDivCost},
1364 { .ISD: ISD::UREM, .Type: MVT::v1i64, .Cost: 1 * FunctionCallDivCost},
1365 { .ISD: ISD::SDIV, .Type: MVT::v2i32, .Cost: 2 * FunctionCallDivCost},
1366 { .ISD: ISD::UDIV, .Type: MVT::v2i32, .Cost: 2 * FunctionCallDivCost},
1367 { .ISD: ISD::SREM, .Type: MVT::v2i32, .Cost: 2 * FunctionCallDivCost},
1368 { .ISD: ISD::UREM, .Type: MVT::v2i32, .Cost: 2 * FunctionCallDivCost},
1369 { .ISD: ISD::SDIV, .Type: MVT::v4i16, .Cost: ReciprocalDivCost},
1370 { .ISD: ISD::UDIV, .Type: MVT::v4i16, .Cost: ReciprocalDivCost},
1371 { .ISD: ISD::SREM, .Type: MVT::v4i16, .Cost: 4 * FunctionCallDivCost},
1372 { .ISD: ISD::UREM, .Type: MVT::v4i16, .Cost: 4 * FunctionCallDivCost},
1373 { .ISD: ISD::SDIV, .Type: MVT::v8i8, .Cost: ReciprocalDivCost},
1374 { .ISD: ISD::UDIV, .Type: MVT::v8i8, .Cost: ReciprocalDivCost},
1375 { .ISD: ISD::SREM, .Type: MVT::v8i8, .Cost: 8 * FunctionCallDivCost},
1376 { .ISD: ISD::UREM, .Type: MVT::v8i8, .Cost: 8 * FunctionCallDivCost},
1377 // Quad register types.
1378 { .ISD: ISD::SDIV, .Type: MVT::v2i64, .Cost: 2 * FunctionCallDivCost},
1379 { .ISD: ISD::UDIV, .Type: MVT::v2i64, .Cost: 2 * FunctionCallDivCost},
1380 { .ISD: ISD::SREM, .Type: MVT::v2i64, .Cost: 2 * FunctionCallDivCost},
1381 { .ISD: ISD::UREM, .Type: MVT::v2i64, .Cost: 2 * FunctionCallDivCost},
1382 { .ISD: ISD::SDIV, .Type: MVT::v4i32, .Cost: 4 * FunctionCallDivCost},
1383 { .ISD: ISD::UDIV, .Type: MVT::v4i32, .Cost: 4 * FunctionCallDivCost},
1384 { .ISD: ISD::SREM, .Type: MVT::v4i32, .Cost: 4 * FunctionCallDivCost},
1385 { .ISD: ISD::UREM, .Type: MVT::v4i32, .Cost: 4 * FunctionCallDivCost},
1386 { .ISD: ISD::SDIV, .Type: MVT::v8i16, .Cost: 8 * FunctionCallDivCost},
1387 { .ISD: ISD::UDIV, .Type: MVT::v8i16, .Cost: 8 * FunctionCallDivCost},
1388 { .ISD: ISD::SREM, .Type: MVT::v8i16, .Cost: 8 * FunctionCallDivCost},
1389 { .ISD: ISD::UREM, .Type: MVT::v8i16, .Cost: 8 * FunctionCallDivCost},
1390 { .ISD: ISD::SDIV, .Type: MVT::v16i8, .Cost: 16 * FunctionCallDivCost},
1391 { .ISD: ISD::UDIV, .Type: MVT::v16i8, .Cost: 16 * FunctionCallDivCost},
1392 { .ISD: ISD::SREM, .Type: MVT::v16i8, .Cost: 16 * FunctionCallDivCost},
1393 { .ISD: ISD::UREM, .Type: MVT::v16i8, .Cost: 16 * FunctionCallDivCost},
1394 // Multiplication.
1395 };
1396
1397 if (const auto *Entry = CostTableLookup(Table: CostTbl, ISD: ISDOpcode, Ty: LT.second))
1398 return LT.first * Entry->Cost;
1399
1400 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1401 Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
1402
1403 // This is somewhat of a hack. The problem that we are facing is that SROA
1404 // creates a sequence of shift, and, or instructions to construct values.
1405 // These sequences are recognized by the ISel and have zero-cost. Not so for
1406 // the vectorized code. Because we have support for v2i64 but not i64 those
1407 // sequences look particularly beneficial to vectorize.
1408 // To work around this we increase the cost of v2i64 operations to make them
1409 // seem less beneficial.
1410 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1411 Cost += 4;
1412
1413 return Cost;
1414 }
1415
1416 // If this operation is a shift on arm/thumb2, it might well be folded into
1417 // the following instruction, hence having a cost of 0.
1418 auto LooksLikeAFreeShift = [&]() {
1419 if (ST->isThumb1Only() || Ty->isVectorTy())
1420 return false;
1421
1422 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1423 return false;
1424 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1425 return false;
1426
1427 // Folded into an ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1428 switch (cast<Instruction>(Val: CxtI->user_back())->getOpcode()) {
1429 case Instruction::Add:
1430 case Instruction::Sub:
1431 case Instruction::And:
1432 case Instruction::Xor:
1433 case Instruction::Or:
1434 case Instruction::ICmp:
1435 return true;
1436 default:
1437 return false;
1438 }
1439 };
1440 if (LooksLikeAFreeShift())
1441 return 0;
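  // For example, under the check above a sequence such as
  //   %s = shl i32 %x, 2
  //   %a = add i32 %y, %s
  // is expected to become a single "add r0, r1, r2, lsl #2", so the shift
  // itself is modelled as free.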
1442
1443 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1444 // for "multiple beats" potentially needed by MVE instructions.
1445 int BaseCost = 1;
1446 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1447 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1448
1449 // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1450 // without treating floats as more expensive than scalars or increasing the
1451 // costs for custom operations. The result is also multiplied by the
1452 // MVEVectorCostFactor where appropriate.
1453 if (TLI->isOperationLegalOrCustomOrPromote(Op: ISDOpcode, VT: LT.second))
1454 return LT.first * BaseCost;
1455
1456 // Else this is expand, assume that we need to scalarize this op.
1457 if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) {
1458 unsigned Num = VTy->getNumElements();
1459 InstructionCost Cost =
1460 getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind);
1461 // Return the cost of multiple scalar invocation plus the cost of
1462 // inserting and extracting the values.
1463 SmallVector<Type *> Tys(Args.size(), Ty);
1464 return BaseT::getScalarizationOverhead(RetTy: VTy, Args, Tys, CostKind) +
1465 Num * Cost;
1466 }
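  // As a rough worked example: MVE has no vector integer divide, so a
  // "udiv <4 x i32>" falls through to the scalarization above and is priced
  // as the overhead of extracting the operands and re-inserting the results
  // plus four scalar udivs.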
1467
1468 return BaseCost;
1469}
1470
1471InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1472 MaybeAlign Alignment,
1473 unsigned AddressSpace,
1474 TTI::TargetCostKind CostKind,
1475 TTI::OperandValueInfo OpInfo,
1476 const Instruction *I) {
1477 // TODO: Handle other cost kinds.
1478 if (CostKind != TTI::TCK_RecipThroughput)
1479 return 1;
1480
1481 // Type legalization can't handle structs
1482 if (TLI->getValueType(DL, Ty: Src, AllowUnknown: true) == MVT::Other)
1483 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1484 CostKind);
1485
1486 if (ST->hasNEON() && Src->isVectorTy() &&
1487 (Alignment && *Alignment != Align(16)) &&
1488 cast<VectorType>(Val: Src)->getElementType()->isDoubleTy()) {
1489 // Unaligned loads/stores are extremely inefficient.
1490 // We need 4 uops for vst1/vld1 vs 1 uop for vldr/vstr.
1491 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Src);
1492 return LT.first * 4;
1493 }
1494
1495 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1496 // Same for stores.
1497 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Val: Src) && I &&
1498 ((Opcode == Instruction::Load && I->hasOneUse() &&
1499 isa<FPExtInst>(Val: *I->user_begin())) ||
1500 (Opcode == Instruction::Store && isa<FPTruncInst>(Val: I->getOperand(i: 0))))) {
1501 FixedVectorType *SrcVTy = cast<FixedVectorType>(Val: Src);
1502 Type *DstTy =
1503 Opcode == Instruction::Load
1504 ? (*I->user_begin())->getType()
1505 : cast<Instruction>(Val: I->getOperand(i: 0))->getOperand(i: 0)->getType();
1506 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1507 DstTy->getScalarType()->isFloatTy())
1508 return ST->getMVEVectorCostFactor(CostKind);
1509 }
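  // For illustration, the check above covers IR along the lines of
  //   %l = load <4 x half>, ptr %p, align 2
  //   %e = fpext <4 x half> %l to <4 x float>
  // where the load and the extend are expected to fold into a single widening
  // MVE load, hence the single beat-adjusted cost.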
1510
1511 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1512 ? ST->getMVEVectorCostFactor(CostKind)
1513 : 1;
1514 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1515 CostKind, OpInfo, I);
1516}
1517
1518InstructionCost
1519ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1520 unsigned AddressSpace,
1521 TTI::TargetCostKind CostKind) {
1522 if (ST->hasMVEIntegerOps()) {
1523 if (Opcode == Instruction::Load && isLegalMaskedLoad(DataTy: Src, Alignment))
1524 return ST->getMVEVectorCostFactor(CostKind);
1525 if (Opcode == Instruction::Store && isLegalMaskedStore(DataTy: Src, Alignment))
1526 return ST->getMVEVectorCostFactor(CostKind);
1527 }
1528 if (!isa<FixedVectorType>(Val: Src))
1529 return BaseT::getMaskedMemoryOpCost(Opcode, DataTy: Src, Alignment, AddressSpace,
1530 CostKind);
1531 // Scalar cost, which is currently very high due to the inefficiency of the
1532 // generated code.
1533 return cast<FixedVectorType>(Val: Src)->getNumElements() * 8;
1534}
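// A sketch of the resulting numbers: a masked load of <4 x i32> that
// isLegalMaskedLoad accepts is costed as one beat-adjusted MVE instruction
// (a predicated VLDRW), whereas a <4 x i32> masked access that has to be
// scalarized is deliberately given the much higher cost of 4 * 8 = 32.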
1535
1536InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1537 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1538 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1539 bool UseMaskForCond, bool UseMaskForGaps) {
1540 assert(Factor >= 2 && "Invalid interleave factor");
1541 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1542
1543 // vldN/vstN doesn't support vector types of i64/f64 element.
1544 bool EltIs64Bits = DL.getTypeSizeInBits(Ty: VecTy->getScalarType()) == 64;
1545
1546 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1547 !UseMaskForCond && !UseMaskForGaps) {
1548 unsigned NumElts = cast<FixedVectorType>(Val: VecTy)->getNumElements();
1549 auto *SubVecTy =
1550 FixedVectorType::get(ElementType: VecTy->getScalarType(), NumElts: NumElts / Factor);
1551
1552 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1553 // Accesses having vector types that are a multiple of 128 bits can be
1554 // matched to more than one vldN/vstN instruction.
1555 int BaseCost =
1556 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1557 if (NumElts % Factor == 0 &&
1558 TLI->isLegalInterleavedAccessType(Factor, VecTy: SubVecTy, Alignment, DL))
1559 return Factor * BaseCost * TLI->getNumInterleavedAccesses(VecTy: SubVecTy, DL);
1560
1561 // Some smaller than legal interleaved patterns are cheap as we can make
1562 // use of the vmovn or vrev patterns to interleave a standard load. This is
1563 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1564 // promoted differently). The cost of 2 here is then a load and vrev or
1565 // vmovn.
1566 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1567 VecTy->isIntOrIntVectorTy() &&
1568 DL.getTypeSizeInBits(Ty: SubVecTy).getFixedValue() <= 64)
1569 return 2 * BaseCost;
1570 }
1571
1572 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1573 Alignment, AddressSpace, CostKind,
1574 UseMaskForCond, UseMaskForGaps);
1575}
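// A rough example of the formula above: a factor-2 interleaved load with a
// wide type of <8 x i16> splits into two <4 x i16> sub-vectors (64 bits,
// which is a legal interleaved access), so on NEON the returned cost is
// Factor(2) * BaseCost(1) * NumInterleavedAccesses(1) = 2 for the underlying
// vld2.16.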
1576
1577InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1578 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1579 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1580 using namespace PatternMatch;
1581 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1582 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1583 Alignment, CostKind, I);
1584
1585 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1586 auto *VTy = cast<FixedVectorType>(Val: DataTy);
1587
1588 // TODO: Splitting, once we do that.
1589
1590 unsigned NumElems = VTy->getNumElements();
1591 unsigned EltSize = VTy->getScalarSizeInBits();
1592 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: DataTy);
1593
1594 // For now, it is assumed that for the MVE gather instructions the loads are
1595 // all effectively serialised. This means the cost is the scalar cost
1596 // multiplied by the number of elements being loaded. This is possibly very
1597 // conservative, but even so we still end up vectorising loops because the
1598 // cost per iteration for many loops is lower than for scalar loops.
1599 InstructionCost VectorCost =
1600 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1601 // The scalarization cost should be a lot higher. We use the number of vector
1602 // elements plus the scalarization overhead. If masking is required then a lot
1603 // of little blocks will be needed and potentially a scalarized p0 mask,
1604 // greatly increasing the cost.
1605 InstructionCost ScalarCost =
1606 NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1607 BaseT::getScalarizationOverhead(InTy: VTy, /*Insert*/ true, /*Extract*/ false,
1608 CostKind) +
1609 BaseT::getScalarizationOverhead(InTy: VTy, /*Insert*/ false, /*Extract*/ true,
1610 CostKind);
1611
1612 if (EltSize < 8 || Alignment < EltSize / 8)
1613 return ScalarCost;
1614
1615 unsigned ExtSize = EltSize;
1616 // Check whether there's a single user that asks for an extended type
1617 if (I != nullptr) {
1618 // Depending on the caller of this function, a gather instruction will
1619 // either have opcode Instruction::Load or be a call to the masked_gather
1620 // intrinsic.
1621 if ((I->getOpcode() == Instruction::Load ||
1622 match(V: I, P: m_Intrinsic<Intrinsic::masked_gather>())) &&
1623 I->hasOneUse()) {
1624 const User *Us = *I->users().begin();
1625 if (isa<ZExtInst>(Val: Us) || isa<SExtInst>(Val: Us)) {
1626 // only allow valid type combinations
1627 unsigned TypeSize =
1628 cast<Instruction>(Val: Us)->getType()->getScalarSizeInBits();
1629 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1630 (TypeSize == 16 && EltSize == 8)) &&
1631 TypeSize * NumElems == 128) {
1632 ExtSize = TypeSize;
1633 }
1634 }
1635 }
1636 // Check whether the input data needs to be truncated
1637 TruncInst *T;
1638 if ((I->getOpcode() == Instruction::Store ||
1639 match(V: I, P: m_Intrinsic<Intrinsic::masked_scatter>())) &&
1640 (T = dyn_cast<TruncInst>(Val: I->getOperand(i: 0)))) {
1641 // Only allow valid type combinations
1642 unsigned TypeSize = T->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits();
1643 if (((EltSize == 16 && TypeSize == 32) ||
1644 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1645 TypeSize * NumElems == 128)
1646 ExtSize = TypeSize;
1647 }
1648 }
1649
1650 if (ExtSize * NumElems != 128 || NumElems < 4)
1651 return ScalarCost;
1652
1653 // Any (aligned) i32 gather will not need to be scalarised.
1654 if (ExtSize == 32)
1655 return VectorCost;
1656 // For smaller types, we need to ensure that the gep's inputs are correctly
1657 // extended from a small enough value. Other sizes (including i64) are
1658 // scalarized for now.
1659 if (ExtSize != 8 && ExtSize != 16)
1660 return ScalarCost;
1661
1662 if (const auto *BC = dyn_cast<BitCastInst>(Val: Ptr))
1663 Ptr = BC->getOperand(i_nocapture: 0);
1664 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr)) {
1665 if (GEP->getNumOperands() != 2)
1666 return ScalarCost;
1667 unsigned Scale = DL.getTypeAllocSize(Ty: GEP->getResultElementType());
1668 // Scale needs to be correct (which is only relevant for i16s).
1669 if (Scale != 1 && Scale * 8 != ExtSize)
1670 return ScalarCost;
1671 // And we need to zext (not sext) the indexes from a small enough type.
1672 if (const auto *ZExt = dyn_cast<ZExtInst>(Val: GEP->getOperand(i_nocapture: 1))) {
1673 if (ZExt->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() <= ExtSize)
1674 return VectorCost;
1675 }
1676 return ScalarCost;
1677 }
1678 return ScalarCost;
1679}
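// For illustration: an aligned gather of <4 x i32> has
// ExtSize * NumElems == 32 * 4 == 128, so it is given VectorCost above,
// i.e. NumElems * LT.first * getMVEVectorCostFactor(). Narrower gathers are
// only given VectorCost when a single zext/sext user plus a suitably
// zero-extended GEP index show that an extending gather can be used;
// everything else falls back to the much larger ScalarCost.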
1680
1681InstructionCost
1682ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1683 std::optional<FastMathFlags> FMF,
1684 TTI::TargetCostKind CostKind) {
1685
1686 EVT ValVT = TLI->getValueType(DL, Ty: ValTy);
1687 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1688 unsigned EltSize = ValVT.getScalarSizeInBits();
1689
1690 // In general floating point reductions are a series of elementwise
1691 // operations, with free extracts on each step. These are either in-order or
1692 // treewise depending on whether that is allowed by the fast math flags.
1693 if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
1694 ((EltSize == 32 && ST->hasVFP2Base()) ||
1695 (EltSize == 64 && ST->hasFP64()) ||
1696 (EltSize == 16 && ST->hasFullFP16()))) {
1697 unsigned NumElts = cast<FixedVectorType>(Val: ValTy)->getNumElements();
1698 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1699 InstructionCost VecCost = 0;
1700 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(Value: NumElts) &&
1701 NumElts * EltSize > VecLimit) {
1702 Type *VecTy = FixedVectorType::get(ElementType: ValTy->getElementType(), NumElts: NumElts / 2);
1703 VecCost += getArithmeticInstrCost(Opcode, Ty: VecTy, CostKind);
1704 NumElts /= 2;
1705 }
1706
1707 // For fp16 we need to extract the upper lane elements. MVE can add a
1708 // VREV+FMIN/MAX to perform another vector step instead.
1709 InstructionCost ExtractCost = 0;
1710 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
1711 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
1712 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1713 NumElts /= 2;
1714 } else if (ValVT.getVectorElementType() == MVT::f16)
1715 ExtractCost = NumElts / 2;
1716
1717 return VecCost + ExtractCost +
1718 NumElts *
1719 getArithmeticInstrCost(Opcode, Ty: ValTy->getElementType(), CostKind);
1720 }
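  // As a rough worked example of the block above: a fast-math fadd reduction
  // of <8 x float> under MVE is modelled as one <4 x float> vector fadd to
  // fold the two halves together, followed by 4 scalar fadds for the
  // remaining lanes, since the 128-bit VecLimit stops further vector steps.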
1721
1722 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
1723 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
1724 unsigned NumElts = cast<FixedVectorType>(Val: ValTy)->getNumElements();
1725 unsigned VecLimit =
1726 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1727 InstructionCost VecCost = 0;
1728 while (isPowerOf2_32(Value: NumElts) && NumElts * EltSize > VecLimit) {
1729 Type *VecTy = FixedVectorType::get(ElementType: ValTy->getElementType(), NumElts: NumElts / 2);
1730 VecCost += getArithmeticInstrCost(Opcode, Ty: VecTy, CostKind);
1731 NumElts /= 2;
1732 }
1733 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
1734 // step.
1735 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
1736 NumElts * EltSize == 64) {
1737 Type *VecTy = FixedVectorType::get(ElementType: ValTy->getElementType(), NumElts);
1738 VecCost += ST->getMVEVectorCostFactor(CostKind) +
1739 getArithmeticInstrCost(Opcode, Ty: VecTy, CostKind);
1740 NumElts /= 2;
1741 }
1742
1743 // From here we extract the elements and perform the and/or/xor.
1744 InstructionCost ExtractCost = NumElts;
1745 return VecCost + ExtractCost +
1746 (NumElts - 1) * getArithmeticInstrCost(
1747 Opcode, Ty: ValTy->getElementType(), CostKind);
1748 }
1749
1750 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
1751 TTI::requiresOrderedReduction(FMF))
1752 return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
1753
1754 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1755
1756 static const CostTblEntry CostTblAdd[]{
1757 {.ISD: ISD::ADD, .Type: MVT::v16i8, .Cost: 1},
1758 {.ISD: ISD::ADD, .Type: MVT::v8i16, .Cost: 1},
1759 {.ISD: ISD::ADD, .Type: MVT::v4i32, .Cost: 1},
1760 };
1761 if (const auto *Entry = CostTableLookup(Table: CostTblAdd, ISD, Ty: LT.second))
1762 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1763
1764 return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
1765}
1766
1767InstructionCost ARMTTIImpl::getExtendedReductionCost(
1768 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1769 FastMathFlags FMF, TTI::TargetCostKind CostKind) {
1770 EVT ValVT = TLI->getValueType(DL, Ty: ValTy);
1771 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
1772
1773 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1774
1775 switch (ISD) {
1776 case ISD::ADD:
1777 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1778 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1779
1780 // The legal cases are:
1781 // VADDV u/s 8/16/32
1782 // VADDLV u/s 32
1783 // Codegen currently cannot always handle larger than legal vectors very
1784 // well, especially for predicated reductions where the mask needs to be
1785 // split, so restrict to 128bit or smaller input types.
1786 unsigned RevVTSize = ResVT.getSizeInBits();
1787 if (ValVT.getSizeInBits() <= 128 &&
1788 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1789 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1790 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1791 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1792 }
1793 break;
1794 default:
1795 break;
1796 }
1797 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: ValTy, FMF,
1798 CostKind);
1799}
1800
1801InstructionCost
1802ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
1803 VectorType *ValTy,
1804 TTI::TargetCostKind CostKind) {
1805 EVT ValVT = TLI->getValueType(DL, Ty: ValTy);
1806 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
1807
1808 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1809 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1810
1811 // The legal cases are:
1812 // VMLAV u/s 8/16/32
1813 // VMLALV u/s 16/32
1814 // Codegen currently cannot always handle larger than legal vectors very
1815 // well, especially for predicated reductions where the mask needs to be
1816 // split, so restrict to 128bit or smaller input types.
1817 unsigned RevVTSize = ResVT.getSizeInBits();
1818 if (ValVT.getSizeInBits() <= 128 &&
1819 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1820 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
1821 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1822 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1823 }
1824
1825 return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, Ty: ValTy, CostKind);
1826}
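// For example, a dot-product style reduction that multiplies two <16 x i8>
// vectors and accumulates into an i32 legalizes to MVT::v16i8 with a 32-bit
// result, so it is costed above as a single beat-adjusted MVE instruction,
// roughly corresponding to a VMLADAV.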
1827
1828InstructionCost
1829ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1830 FastMathFlags FMF,
1831 TTI::TargetCostKind CostKind) {
1832 EVT ValVT = TLI->getValueType(DL, Ty);
1833
1834 // In general floating point reductions are a series of elementwise
1835 // operations, with free extracts on each step. These are either in-order or
1836 // treewise depending on whether that is allowed by the fast math flags.
1837 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
1838 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
1839 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
1840 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
1841 unsigned NumElts = cast<FixedVectorType>(Val: Ty)->getNumElements();
1842 unsigned EltSize = ValVT.getScalarSizeInBits();
1843 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1844 InstructionCost VecCost;
1845 while (isPowerOf2_32(Value: NumElts) && NumElts * EltSize > VecLimit) {
1846 Type *VecTy = FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumElts/2);
1847 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
1848 VecCost += getIntrinsicInstrCost(ICA, CostKind);
1849 NumElts /= 2;
1850 }
1851
1852 // For fp16 we need to extract the upper lane elements. MVE can add a
1853 // VREV+FMIN/MAX to perform another vector step instead.
1854 InstructionCost ExtractCost = 0;
1855 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
1856 NumElts == 8) {
1857 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1858 NumElts /= 2;
1859 } else if (ValVT.getVectorElementType() == MVT::f16)
1860 ExtractCost = cast<FixedVectorType>(Val: Ty)->getNumElements() / 2;
1861
1862 IntrinsicCostAttributes ICA(IID, Ty->getElementType(),
1863 {Ty->getElementType(), Ty->getElementType()},
1864 FMF);
1865 return VecCost + ExtractCost +
1866 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
1867 }
1868
1869 if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
1870 IID == Intrinsic::umin || IID == Intrinsic::umax) {
1871 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1872
1873 // All costs are the same for u/s min/max. These lower to vminv, which are
1874 // given a slightly higher cost as they tend to take multiple cycles for
1875 // smaller type sizes.
1876 static const CostTblEntry CostTblAdd[]{
1877 {.ISD: ISD::SMIN, .Type: MVT::v16i8, .Cost: 4},
1878 {.ISD: ISD::SMIN, .Type: MVT::v8i16, .Cost: 3},
1879 {.ISD: ISD::SMIN, .Type: MVT::v4i32, .Cost: 2},
1880 };
1881 if (const auto *Entry = CostTableLookup(Table: CostTblAdd, ISD: ISD::SMIN, Ty: LT.second))
1882 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1883 }
1884
1885 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1886}
1887
1888InstructionCost
1889ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1890 TTI::TargetCostKind CostKind) {
1891 switch (ICA.getID()) {
1892 case Intrinsic::get_active_lane_mask:
1893 // Currently we make a somewhat optimistic assumption that
1894 // active_lane_masks are always free. In reality one may be freely folded
1895 // into a tail predicated loop, expanded into a VCTP or expanded into a lot
1896 // of add/icmp code. We may need to improve this in the future, but being
1897 // able to detect if it is free or not involves looking at a lot of other
1898 // code. We currently assume that the vectorizer inserted these, and knew
1899 // what it was doing in adding one.
1900 if (ST->hasMVEIntegerOps())
1901 return 0;
1902 break;
1903 case Intrinsic::sadd_sat:
1904 case Intrinsic::ssub_sat:
1905 case Intrinsic::uadd_sat:
1906 case Intrinsic::usub_sat: {
1907 if (!ST->hasMVEIntegerOps())
1908 break;
1909 Type *VT = ICA.getReturnType();
1910
1911 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VT);
1912 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1913 LT.second == MVT::v16i8) {
1914 // This is a base cost of 1 for the vqadd, plus 3 extra shifts if we
1915 // need to extend the type, as it uses shr(qadd(shl, shl)).
1916 unsigned Instrs =
1917 LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1918 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1919 }
1920 break;
1921 }
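  // For instance, under the check above an @llvm.sadd.sat on <8 x i16> maps
  // directly onto a VQADD and costs one beat-adjusted instruction, whereas
  // <8 x i8> (promoted to v8i16) also pays for the shl/shl/shr needed to
  // emulate the narrower saturation, hence the factor of 4.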
1922 case Intrinsic::abs:
1923 case Intrinsic::smin:
1924 case Intrinsic::smax:
1925 case Intrinsic::umin:
1926 case Intrinsic::umax: {
1927 if (!ST->hasMVEIntegerOps())
1928 break;
1929 Type *VT = ICA.getReturnType();
1930
1931 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VT);
1932 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1933 LT.second == MVT::v16i8)
1934 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1935 break;
1936 }
1937 case Intrinsic::minnum:
1938 case Intrinsic::maxnum: {
1939 if (!ST->hasMVEFloatOps())
1940 break;
1941 Type *VT = ICA.getReturnType();
1942 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VT);
1943 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1944 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1945 break;
1946 }
1947 case Intrinsic::fptosi_sat:
1948 case Intrinsic::fptoui_sat: {
1949 if (ICA.getArgTypes().empty())
1950 break;
1951 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1952 auto LT = getTypeLegalizationCost(Ty: ICA.getArgTypes()[0]);
1953 EVT MTy = TLI->getValueType(DL, Ty: ICA.getReturnType());
1954 // Check for the legal types, with the correct subtarget features.
1955 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
1956 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
1957 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
1958 return LT.first;
1959
1960 // Equally for MVE vector types
1961 if (ST->hasMVEFloatOps() &&
1962 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
1963 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
1964 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1965
1966 // Otherwise we use a legal convert followed by a min+max
1967 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
1968 (ST->hasFP64() && LT.second == MVT::f64) ||
1969 (ST->hasFullFP16() && LT.second == MVT::f16) ||
1970 (ST->hasMVEFloatOps() &&
1971 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
1972 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
1973 Type *LegalTy = Type::getIntNTy(C&: ICA.getReturnType()->getContext(),
1974 N: LT.second.getScalarSizeInBits());
1975 InstructionCost Cost =
1976 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1977 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
1978 : Intrinsic::umin,
1979 LegalTy, {LegalTy, LegalTy});
1980 Cost += getIntrinsicInstrCost(ICA: Attrs1, CostKind);
1981 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
1982 : Intrinsic::umax,
1983 LegalTy, {LegalTy, LegalTy});
1984 Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
1985 return LT.first * Cost;
1986 }
1987 break;
1988 }
1989 }
1990
1991 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1992}
1993
1994bool ARMTTIImpl::isLoweredToCall(const Function *F) {
1995 if (!F->isIntrinsic())
1996 return BaseT::isLoweredToCall(F);
1997
1998 // Assume all Arm-specific intrinsics map to an instruction.
1999 if (F->getName().starts_with(Prefix: "llvm.arm"))
2000 return false;
2001
2002 switch (F->getIntrinsicID()) {
2003 default: break;
2004 case Intrinsic::powi:
2005 case Intrinsic::sin:
2006 case Intrinsic::cos:
2007 case Intrinsic::pow:
2008 case Intrinsic::log:
2009 case Intrinsic::log10:
2010 case Intrinsic::log2:
2011 case Intrinsic::exp:
2012 case Intrinsic::exp2:
2013 return true;
2014 case Intrinsic::sqrt:
2015 case Intrinsic::fabs:
2016 case Intrinsic::copysign:
2017 case Intrinsic::floor:
2018 case Intrinsic::ceil:
2019 case Intrinsic::trunc:
2020 case Intrinsic::rint:
2021 case Intrinsic::nearbyint:
2022 case Intrinsic::round:
2023 case Intrinsic::canonicalize:
2024 case Intrinsic::lround:
2025 case Intrinsic::llround:
2026 case Intrinsic::lrint:
2027 case Intrinsic::llrint:
2028 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
2029 return true;
2030 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
2031 return true;
2032 // Some operations can be handled by vector instructions; assume
2033 // unsupported vectors will be expanded into supported scalar ones.
2034 // TODO Handle scalar operations properly.
2035 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
2036 case Intrinsic::masked_store:
2037 case Intrinsic::masked_load:
2038 case Intrinsic::masked_gather:
2039 case Intrinsic::masked_scatter:
2040 return !ST->hasMVEIntegerOps();
2041 case Intrinsic::sadd_with_overflow:
2042 case Intrinsic::uadd_with_overflow:
2043 case Intrinsic::ssub_with_overflow:
2044 case Intrinsic::usub_with_overflow:
2045 case Intrinsic::sadd_sat:
2046 case Intrinsic::uadd_sat:
2047 case Intrinsic::ssub_sat:
2048 case Intrinsic::usub_sat:
2049 return false;
2050 }
2051
2052 return BaseT::isLoweredToCall(F);
2053}
2054
2055bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
2056 unsigned ISD = TLI->InstructionOpcodeToISD(Opcode: I.getOpcode());
2057 EVT VT = TLI->getValueType(DL, Ty: I.getType(), AllowUnknown: true);
2058 if (TLI->getOperationAction(Op: ISD, VT) == TargetLowering::LibCall)
2059 return true;
2060
2061 // Check if an intrinsic will be lowered to a call and assume that any
2062 // other CallInst will generate a bl.
2063 if (auto *Call = dyn_cast<CallInst>(Val: &I)) {
2064 if (auto *II = dyn_cast<IntrinsicInst>(Val: Call)) {
2065 switch(II->getIntrinsicID()) {
2066 case Intrinsic::memcpy:
2067 case Intrinsic::memset:
2068 case Intrinsic::memmove:
2069 return getNumMemOps(I: II) == -1;
2070 default:
2071 if (const Function *F = Call->getCalledFunction())
2072 return isLoweredToCall(F);
2073 }
2074 }
2075 return true;
2076 }
2077
2078 // FPv5 provides conversions between integer, double-precision,
2079 // single-precision, and half-precision formats.
2080 switch (I.getOpcode()) {
2081 default:
2082 break;
2083 case Instruction::FPToSI:
2084 case Instruction::FPToUI:
2085 case Instruction::SIToFP:
2086 case Instruction::UIToFP:
2087 case Instruction::FPTrunc:
2088 case Instruction::FPExt:
2089 return !ST->hasFPARMv8Base();
2090 }
2091
2092 // FIXME: Unfortunately the approach of checking the Operation Action does
2093 // not catch all cases of Legalization that use library calls. Our
2094 // Legalization step categorizes some transformations into library calls as
2095 // Custom, Expand or even Legal when doing type legalization. So for now
2096 // we have to special-case, for instance, the SDIV of 64-bit integers and the
2097 // use of floating point emulation.
2098 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
2099 switch (ISD) {
2100 default:
2101 break;
2102 case ISD::SDIV:
2103 case ISD::UDIV:
2104 case ISD::SREM:
2105 case ISD::UREM:
2106 case ISD::SDIVREM:
2107 case ISD::UDIVREM:
2108 return true;
2109 }
2110 }
2111
2112 // Assume all other non-float operations are supported.
2113 if (!VT.isFloatingPoint())
2114 return false;
2115
2116 // We'll need a library call to handle most floats when using soft float.
2117 if (TLI->useSoftFloat()) {
2118 switch (I.getOpcode()) {
2119 default:
2120 return true;
2121 case Instruction::Alloca:
2122 case Instruction::Load:
2123 case Instruction::Store:
2124 case Instruction::Select:
2125 case Instruction::PHI:
2126 return false;
2127 }
2128 }
2129
2130 // We'll need a libcall to perform double precision operations on a single
2131 // precision only FPU.
2132 if (I.getType()->isDoubleTy() && !ST->hasFP64())
2133 return true;
2134
2135 // Likewise for half precision arithmetic.
2136 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2137 return true;
2138
2139 return false;
2140}
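// For example, a scalar "sdiv i64" is caught by the 64-bit integer check
// above and reported as a call, since it is lowered to a library routine
// such as __aeabi_ldivmod rather than to inline code.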
2141
2142bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
2143 AssumptionCache &AC,
2144 TargetLibraryInfo *LibInfo,
2145 HardwareLoopInfo &HWLoopInfo) {
2146 // Low-overhead branches are only supported in the 'low-overhead branch'
2147 // extension of v8.1-m.
2148 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2149 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2150 return false;
2151 }
2152
2153 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2154 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2155 return false;
2156 }
2157
2158 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2159 if (isa<SCEVCouldNotCompute>(Val: BackedgeTakenCount)) {
2160 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2161 return false;
2162 }
2163
2164 const SCEV *TripCountSCEV =
2165 SE.getAddExpr(LHS: BackedgeTakenCount,
2166 RHS: SE.getOne(Ty: BackedgeTakenCount->getType()));
2167
2168 // We need to store the trip count in LR, a 32-bit register.
2169 if (SE.getUnsignedRangeMax(S: TripCountSCEV).getBitWidth() > 32) {
2170 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2171 return false;
2172 }
2173
2174 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2175 // point in generating a hardware loop if that's going to happen.
2176
2177 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2178 if (auto *Call = dyn_cast<IntrinsicInst>(Val: &I)) {
2179 switch (Call->getIntrinsicID()) {
2180 default:
2181 break;
2182 case Intrinsic::start_loop_iterations:
2183 case Intrinsic::test_start_loop_iterations:
2184 case Intrinsic::loop_decrement:
2185 case Intrinsic::loop_decrement_reg:
2186 return true;
2187 }
2188 }
2189 return false;
2190 };
2191
2192 // Scan the instructions to see if there's any that we know will turn into a
2193 // call or if this loop is already a low-overhead loop or will become a tail
2194 // predicated loop.
2195 bool IsTailPredLoop = false;
2196 auto ScanLoop = [&](Loop *L) {
2197 for (auto *BB : L->getBlocks()) {
2198 for (auto &I : *BB) {
2199 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2200 isa<InlineAsm>(Val: I)) {
2201 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2202 return false;
2203 }
2204 if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
2205 IsTailPredLoop |=
2206 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2207 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2208 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2209 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2210 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2211 }
2212 }
2213 return true;
2214 };
2215
2216 // Visit inner loops.
2217 for (auto *Inner : *L)
2218 if (!ScanLoop(Inner))
2219 return false;
2220
2221 if (!ScanLoop(L))
2222 return false;
2223
2224 // TODO: Check whether the trip count calculation is expensive. If L is the
2225 // inner loop but we know it has a low trip count, calculating that trip
2226 // count (in the parent loop) may be detrimental.
2227
2228 LLVMContext &C = L->getHeader()->getContext();
2229 HWLoopInfo.CounterInReg = true;
2230 HWLoopInfo.IsNestingLegal = false;
2231 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2232 HWLoopInfo.CountType = Type::getInt32Ty(C);
2233 HWLoopInfo.LoopDecrement = ConstantInt::get(Ty: HWLoopInfo.CountType, V: 1);
2234 return true;
2235}
2236
2237static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2238 // We don't allow icmp's, and because we only look at single block loops,
2239 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2240 if (isa<ICmpInst>(Val: &I) && ++ICmpCount > 1)
2241 return false;
2242 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2243 // not currently canonical, but soon will be. Code without them uses icmp, and
2244 // so is not tail predicated as per the condition above. In order to get the
2245 // same performance we treat min and max the same as an icmp for tailpred
2246 // purposes for the moment (we often rely on non-tailpred and higher VF's to
2247 // pick more optimal instructions like VQDMULH. They need to be recognized
2248 // directly by the vectorizer).
2249 if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
2250 if ((II->getIntrinsicID() == Intrinsic::smin ||
2251 II->getIntrinsicID() == Intrinsic::smax ||
2252 II->getIntrinsicID() == Intrinsic::umin ||
2253 II->getIntrinsicID() == Intrinsic::umax) &&
2254 ++ICmpCount > 1)
2255 return false;
2256
2257 if (isa<FCmpInst>(Val: &I))
2258 return false;
2259
2260 // We could allow extending/narrowing FP loads/stores, but codegen is
2261 // too inefficient so reject this for now.
2262 if (isa<FPExtInst>(Val: &I) || isa<FPTruncInst>(Val: &I))
2263 return false;
2264
2265 // Extends have to be extending-loads
2266 if (isa<SExtInst>(Val: &I) || isa<ZExtInst>(Val: &I) )
2267 if (!I.getOperand(i: 0)->hasOneUse() || !isa<LoadInst>(Val: I.getOperand(i: 0)))
2268 return false;
2269
2270 // Truncs have to be narrowing-stores
2271 if (isa<TruncInst>(Val: &I) )
2272 if (!I.hasOneUse() || !isa<StoreInst>(Val: *I.user_begin()))
2273 return false;
2274
2275 return true;
2276}
2277
2278// To set up a tail-predicated loop, we need to know the total number of
2279// elements processed by that loop. Thus, we need to determine the element
2280// size and:
2281// 1) it should be uniform for all operations in the vector loop, so we
2282// e.g. don't want any widening/narrowing operations.
2283// 2) it should be smaller than i64s because we don't have vector operations
2284// that work on i64s.
2285// 3) we don't want elements to be reversed or shuffled, to make sure the
2286// tail-predication masks/predicates the right lanes.
2287//
2288static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2289 const DataLayout &DL,
2290 const LoopAccessInfo *LAI) {
2291 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2292
2293 // If there are live-out values, it is probably a reduction. We can predicate
2294 // most reduction operations freely under MVE using a combination of
2295 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2296 // floating point and integer reductions, but don't check for operators
2297 // specifically here. If the value ends up not being a reduction (and so the
2298 // vectorizer cannot tailfold the loop), we should fall back to standard
2299 // vectorization automatically.
2300 SmallVector< Instruction *, 8 > LiveOuts;
2301 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2302 bool ReductionsDisabled =
2303 EnableTailPredication == TailPredication::EnabledNoReductions ||
2304 EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2305
2306 for (auto *I : LiveOuts) {
2307 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2308 !I->getType()->isHalfTy()) {
2309 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2310 "live-out value\n");
2311 return false;
2312 }
2313 if (ReductionsDisabled) {
2314 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2315 return false;
2316 }
2317 }
2318
2319 // Next, check that all instructions can be tail-predicated.
2320 PredicatedScalarEvolution PSE = LAI->getPSE();
2321 SmallVector<Instruction *, 16> LoadStores;
2322 int ICmpCount = 0;
2323
2324 for (BasicBlock *BB : L->blocks()) {
2325 for (Instruction &I : BB->instructionsWithoutDebug()) {
2326 if (isa<PHINode>(Val: &I))
2327 continue;
2328 if (!canTailPredicateInstruction(I, ICmpCount)) {
2329 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2330 return false;
2331 }
2332
2333 Type *T = I.getType();
2334 if (T->getScalarSizeInBits() > 32) {
2335 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2336 return false;
2337 }
2338 if (isa<StoreInst>(Val: I) || isa<LoadInst>(Val: I)) {
2339 Value *Ptr = getLoadStorePointerOperand(V: &I);
2340 Type *AccessTy = getLoadStoreType(I: &I);
2341 int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, Lp: L).value_or(u: 0);
2342 if (NextStride == 1) {
2343 // TODO: for now only allow consecutive strides of 1. We could support
2344 // other strides as long as it is uniform, but let's keep it simple
2345 // for now.
2346 continue;
2347 } else if (NextStride == -1 ||
2348 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2349 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2350 LLVM_DEBUG(dbgs()
2351 << "Consecutive strides of 2 found, vld2/vstr2 can't "
2352 "be tail-predicated\n.");
2353 return false;
2354 // TODO: don't tail predicate if there is a reversed load?
2355 } else if (EnableMaskedGatherScatters) {
2356 // Gather/scatters do allow loading from arbitrary strides, at
2357 // least if they are loop invariant.
2358 // TODO: Loop variant strides should in theory work, too, but
2359 // this requires further testing.
2360 const SCEV *PtrScev = PSE.getSE()->getSCEV(V: Ptr);
2361 if (auto AR = dyn_cast<SCEVAddRecExpr>(Val: PtrScev)) {
2362 const SCEV *Step = AR->getStepRecurrence(SE&: *PSE.getSE());
2363 if (PSE.getSE()->isLoopInvariant(S: Step, L))
2364 continue;
2365 }
2366 }
2367 LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2368 "tail-predicate\n.");
2369 return false;
2370 }
2371 }
2372 }
2373
2374 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2375 return true;
2376}
2377
2378bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
2379 if (!EnableTailPredication) {
2380 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2381 return false;
2382 }
2383
2384 // Creating a predicated vector loop is the first step for generating a
2385 // tail-predicated hardware loop, for which we need the MVE masked
2386 // load/stores instructions:
2387 if (!ST->hasMVEIntegerOps())
2388 return false;
2389
2390 LoopVectorizationLegality *LVL = TFI->LVL;
2391 Loop *L = LVL->getLoop();
2392
2393 // For now, restrict this to single block loops.
2394 if (L->getNumBlocks() > 1) {
2395 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2396 "loop.\n");
2397 return false;
2398 }
2399
2400 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2401
2402 LoopInfo *LI = LVL->getLoopInfo();
2403 HardwareLoopInfo HWLoopInfo(L);
2404 if (!HWLoopInfo.canAnalyze(LI&: *LI)) {
2405 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2406 "analyzable.\n");
2407 return false;
2408 }
2409
2410 AssumptionCache *AC = LVL->getAssumptionCache();
2411 ScalarEvolution *SE = LVL->getScalarEvolution();
2412
2413 // This checks if we have the low-overhead branch architecture
2414 // extension, and if we will create a hardware-loop:
2415 if (!isHardwareLoopProfitable(L, SE&: *SE, AC&: *AC, LibInfo: TFI->TLI, HWLoopInfo)) {
2416 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2417 "profitable.\n");
2418 return false;
2419 }
2420
2421 DominatorTree *DT = LVL->getDominatorTree();
2422 if (!HWLoopInfo.isHardwareLoopCandidate(SE&: *SE, LI&: *LI, DT&: *DT)) {
2423 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2424 "a candidate.\n");
2425 return false;
2426 }
2427
2428 return canTailPredicateLoop(L, LI, SE&: *SE, DL, LAI: LVL->getLAI());
2429}
2430
2431TailFoldingStyle
2432ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
2433 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2434 return TailFoldingStyle::DataWithoutLaneMask;
2435
2436 // Intrinsic @llvm.get.active.lane.mask is supported.
2437 // It is used in the MVETailPredication pass, which requires the number of
2438 // elements processed by this vector loop to setup the tail-predicated
2439 // loop.
2440 return TailFoldingStyle::Data;
2441}
2442void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2443 TTI::UnrollingPreferences &UP,
2444 OptimizationRemarkEmitter *ORE) {
2445 // Enable upper-bound unrolling universally, provided that we do not see an
2446 // active lane mask, which will be better kept as a loop to become tail
2447 // predicated than to be conditionally unrolled.
2448 UP.UpperBound =
2449 !ST->hasMVEIntegerOps() || !any_of(Range&: *L->getHeader(), P: [](Instruction &I) {
2450 return isa<IntrinsicInst>(Val: I) &&
2451 cast<IntrinsicInst>(Val&: I).getIntrinsicID() ==
2452 Intrinsic::get_active_lane_mask;
2453 });
2454
2455 // Only currently enable these preferences for M-Class cores.
2456 if (!ST->isMClass())
2457 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2458
2459 // Disable loop unrolling for Oz and Os.
2460 UP.OptSizeThreshold = 0;
2461 UP.PartialOptSizeThreshold = 0;
2462 if (L->getHeader()->getParent()->hasOptSize())
2463 return;
2464
2465 SmallVector<BasicBlock*, 4> ExitingBlocks;
2466 L->getExitingBlocks(ExitingBlocks);
2467 LLVM_DEBUG(dbgs() << "Loop has:\n"
2468 << "Blocks: " << L->getNumBlocks() << "\n"
2469 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2470
2471 // Only allow one exit other than the latch. This acts as an early exit
2472 // as it mirrors the profitability calculation of the runtime unroller.
2473 if (ExitingBlocks.size() > 2)
2474 return;
2475
2476 // Limit the CFG of the loop body for targets with a branch predictor.
2477 // Allowing 4 blocks permits if-then-else diamonds in the body.
2478 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2479 return;
2480
2481 // Don't unroll vectorized loops, including the remainder loop
2482 if (getBooleanLoopAttribute(TheLoop: L, Name: "llvm.loop.isvectorized"))
2483 return;
2484
2485 // Scan the loop: don't unroll loops with calls as this could prevent
2486 // inlining.
2487 InstructionCost Cost = 0;
2488 for (auto *BB : L->getBlocks()) {
2489 for (auto &I : *BB) {
2490 // Don't unroll vectorised loops. MVE does not benefit from it as much as
2491 // scalar code.
2492 if (I.getType()->isVectorTy())
2493 return;
2494
2495 if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I)) {
2496 if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction()) {
2497 if (!isLoweredToCall(F))
2498 continue;
2499 }
2500 return;
2501 }
2502
2503 SmallVector<const Value*, 4> Operands(I.operand_values());
2504 Cost += getInstructionCost(U: &I, Operands,
2505 CostKind: TargetTransformInfo::TCK_SizeAndLatency);
2506 }
2507 }
2508
2509 // On v6m cores, there are very few registers available. We can easily end up
2510 // spilling and reloading more registers in an unrolled loop. Look at the
2511 // number of LCSSA phis as a rough measure of how many registers will need to
2512 // be live out of the loop, reducing the default unroll count if more than 1
2513 // value is needed. In the long run, all of this should be learnt by a
2514 // machine.
2515 unsigned UnrollCount = 4;
2516 if (ST->isThumb1Only()) {
2517 unsigned ExitingValues = 0;
2518 SmallVector<BasicBlock *, 4> ExitBlocks;
2519 L->getExitBlocks(ExitBlocks);
2520 for (auto *Exit : ExitBlocks) {
2521 // Count the number of LCSSA phis. Exclude values coming from GEP's as
2522 // only the last is expected to be needed for address operands.
2523 unsigned LiveOuts = count_if(Range: Exit->phis(), P: [](auto &PH) {
2524 return PH.getNumOperands() != 1 ||
2525 !isa<GetElementPtrInst>(PH.getOperand(0));
2526 });
2527 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2528 }
2529 if (ExitingValues)
2530 UnrollCount /= ExitingValues;
2531 if (UnrollCount <= 1)
2532 return;
2533 }
2534
2535 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2536 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2537
2538 UP.Partial = true;
2539 UP.Runtime = true;
2540 UP.UnrollRemainder = true;
2541 UP.DefaultUnrollRuntimeCount = UnrollCount;
2542 UP.UnrollAndJam = true;
2543 UP.UnrollAndJamInnerLoopThreshold = 60;
2544
2545 // Force-unrolling small loops can be very useful because of the
2546 // branch-taken cost of the backedge.
2547 if (Cost < 12)
2548 UP.Force = true;
2549}
2550
2551void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2552 TTI::PeelingPreferences &PP) {
2553 BaseT::getPeelingPreferences(L, SE, PP);
2554}
2555
2556bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2557 TTI::ReductionFlags Flags) const {
2558 if (!ST->hasMVEIntegerOps())
2559 return false;
2560
2561 unsigned ScalarBits = Ty->getScalarSizeInBits();
2562 switch (Opcode) {
2563 case Instruction::Add:
2564 return ScalarBits <= 64;
2565 default:
2566 return false;
2567 }
2568}
2569
2570bool ARMTTIImpl::preferPredicatedReductionSelect(
2571 unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2572 if (!ST->hasMVEIntegerOps())
2573 return false;
2574 return true;
2575}
2576
2577InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
2578 StackOffset BaseOffset,
2579 bool HasBaseReg, int64_t Scale,
2580 unsigned AddrSpace) const {
2581 TargetLoweringBase::AddrMode AM;
2582 AM.BaseGV = BaseGV;
2583 AM.BaseOffs = BaseOffset.getFixed();
2584 AM.HasBaseReg = HasBaseReg;
2585 AM.Scale = Scale;
2586 AM.ScalableOffset = BaseOffset.getScalable();
2587 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AS: AddrSpace)) {
2588 if (ST->hasFPAO())
2589 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2590 return 0;
2591 }
2592 return -1;
2593}
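// For illustration: a reg+scaled-reg mode such as [r0, r1, lsl #2] for an
// i32 access (Scale == 4) is a legal Arm addressing mode, so the cost above
// is 0 (or 1 for a negative scale on cores with FPAO), while a combination
// that isLegalAddressingMode rejects yields -1 so callers treat it as
// unprofitable.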
2594
2595bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2596 if (Thumb) {
2597 // B.W is available in any Thumb2-supporting target, and also in every
2598 // version of Armv8-M, even Baseline which does not include the rest of
2599 // Thumb2.
2600 return ST->isThumb2() || ST->hasV8MBaselineOps();
2601 } else {
2602 // B is available in all versions of the Arm ISA, so the only question is
2603 // whether that ISA is available at all.
2604 return ST->hasARMOps();
2605 }
2606}
2607