//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ARMTargetTransformInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/TargetParser/SubtargetFeature.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <optional>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "armtti"

static cl::opt<bool> EnableMaskedLoadStores(
    "enable-arm-maskedldst", cl::Hidden, cl::init(true),
    cl::desc("Enable the generation of masked loads and stores"));

static cl::opt<bool> DisableLowOverheadLoops(
    "disable-arm-loloops", cl::Hidden, cl::init(false),
    cl::desc("Disable the generation of low-overhead loops"));

static cl::opt<bool>
    AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
                  cl::desc("Enable the generation of WLS loops"));

static cl::opt<bool> UseWidenGlobalArrays(
    "widen-global-strings", cl::Hidden, cl::init(true),
    cl::desc("Enable the widening of global strings to alignment boundaries"));

extern cl::opt<TailPredication::Mode> EnableTailPredication;

extern cl::opt<bool> EnableMaskedGatherScatters;

extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;

/// Convert a vector load intrinsic into a simple llvm load instruction.
/// This is beneficial when the underlying object being addressed comes
/// from a constant, since we get constant-folding for free.
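/// For example (illustrative IR, assuming a suitably aligned pointer %p):
///   %v = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr %p, i32 4)
/// can be rewritten as a plain
///   %v = load <4 x i32>, ptr %p, align 4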
static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
                               InstCombiner::BuilderTy &Builder) {
  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));

  if (!IntrAlign)
    return nullptr;

  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
                           ? MemAlign
                           : IntrAlign->getLimitedValue();

  if (!isPowerOf2_32(Alignment))
    return nullptr;

  return Builder.CreateAlignedLoad(II.getType(), II.getArgOperand(0),
                                   Align(Alignment));
}

bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // To inline a callee, all features not in the allowed list must match
  // exactly.
  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
                    (CalleeBits & ~InlineFeaturesAllowed);
  // For features in the allowed list, the callee's features must be a subset
  // of the caller's.
  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
                     (CalleeBits & InlineFeaturesAllowed);
  return MatchExact && MatchSubset;
}

TTI::AddressingModeKind
ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
                                       ScalarEvolution *SE) const {
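  // MVE loads and stores can fold a post-increment of the base pointer into
  // the memory operation, so post-indexed addressing tends to be preferable
  // for MVE-enabled targets.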
  if (ST->hasMVEIntegerOps())
    return TTI::AMK_PostIndexed;

  if (L->getHeader()->getParent()->hasOptSize())
    return TTI::AMK_None;

  if (ST->isMClass() && ST->isThumb2() &&
      L->getNumBlocks() == 1)
    return TTI::AMK_PreIndexed;

  return TTI::AMK_None;
}

std::optional<Instruction *>
ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  using namespace PatternMatch;
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::arm_neon_vld1: {
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  }

  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    unsigned AlignArg = II.arg_size() - 1;
    Value *AlignArgOp = II.getArgOperand(AlignArg);
    MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
    if (Align && *Align < MemAlign) {
      return IC.replaceOperand(
          II, AlignArg,
          ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
                           false));
    }
    break;
  }

  case Intrinsic::arm_neon_vld1x2:
  case Intrinsic::arm_neon_vld1x3:
  case Intrinsic::arm_neon_vld1x4:
  case Intrinsic::arm_neon_vst1x2:
  case Intrinsic::arm_neon_vst1x3:
  case Intrinsic::arm_neon_vst1x4: {
    Align NewAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    Align OldAlign = II.getParamAlign(0).valueOrOne();
    if (NewAlign > OldAlign)
      II.addParamAttr(0,
                      Attribute::getWithAlignment(II.getContext(), NewAlign));
    break;
  }

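  // The i2v(v2i(x)) round trip below is a no-op when the predicate types
  // match, so the conversion pair can be folded away. Sketch of the pattern
  // (illustrative IR):
  //   %i = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %p)
  //   %v = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %i)  ; --> %p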
  case Intrinsic::arm_mve_pred_i2v: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                       PatternMatch::m_Value(ArgArg))) &&
        II.getType() == ArgArg->getType()) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
    Constant *XorMask;
    if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                             PatternMatch::m_Value(ArgArg)),
                         PatternMatch::m_Constant(XorMask))) &&
        II.getType() == ArgArg->getType()) {
      if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
        if (CI->getValue().trunc(16).isAllOnes()) {
          auto TrueVector = IC.Builder.CreateVectorSplat(
              cast<FixedVectorType>(II.getType())->getNumElements(),
              IC.Builder.getTrue());
          return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
        }
      }
    }
    KnownBits ScalarKnown(32);
    if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
                                ScalarKnown)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_pred_v2i: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
                       PatternMatch::m_Value(ArgArg)))) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }

    if (II.getMetadata(LLVMContext::MD_range))
      break;

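    // An MVE predicate only ever has its bottom 16 bits set, so the scalar
    // result is known to lie in [0, 0x10000).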
    ConstantRange Range(APInt(32, 0), APInt(32, 0x10000));

    if (auto CurrentRange = II.getRange()) {
      Range = Range.intersectWith(*CurrentRange);
      if (Range == CurrentRange)
        break;
    }

    II.addRangeRetAttr(Range);
    II.addRetAttr(Attribute::NoUndef);
    return &II;
  }
  case Intrinsic::arm_mve_vadc:
  case Intrinsic::arm_mve_vadc_predicated: {
    unsigned CarryOp =
        (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
    assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
           "Bad type for intrinsic!");

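    // Only bit 29 of the carry-in operand is consumed (the position of the
    // carry flag within FPSCR), so all other bits of that operand can be
    // simplified away.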
    KnownBits CarryKnown(32);
    if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
                                CarryKnown)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_vmldava: {
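    // If the only use of the result is an add, and the current accumulator
    // operand is zero, the add can be folded into the intrinsic (operand
    // names are illustrative):
    //   add(vmldava(a, b, c, 0, x, y), z) --> vmldava(a, b, c, z, x, y)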
    Instruction *I = cast<Instruction>(&II);
    if (I->hasOneUse()) {
      auto *User = cast<Instruction>(*I->user_begin());
      Value *OpZ;
      if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
          match(I->getOperand(3), m_Zero())) {
        Value *OpX = I->getOperand(4);
        Value *OpY = I->getOperand(5);
        Type *OpTy = OpX->getType();

        IC.Builder.SetInsertPoint(User);
        Value *V =
            IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
                                       {I->getOperand(0), I->getOperand(1),
                                        I->getOperand(2), OpZ, OpX, OpY});

        IC.replaceInstUsesWith(*User, V);
        return IC.eraseInstFromFunction(*User);
      }
    }
    return std::nullopt;
  }
  }
  return std::nullopt;
}

std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
    APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {

  // Compute the demanded elements for a narrowing MVE intrinsic. TopOpc is the
  // index of the operand that specifies whether this is a Top or Bottom
  // instruction, which differs between intrinsics.
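  // For example, a "top" narrowing operation writes the odd lanes of its
  // result and takes the even lanes from operand 0, so only the even lanes
  // of operand 0 are demanded (and vice versa for a "bottom" operation).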
  auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
    unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
    unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();

    // Only the odd/even lanes of operand 0 will be demanded, depending on
    // whether this is a top/bottom instruction.
    APInt DemandedElts =
        APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                       : APInt::getHighBitsSet(2, 1));
    SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
    // The other lanes will be defined from the inserted elements.
    UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                                : APInt::getHighBitsSet(2, 1));
    return std::nullopt;
  };

  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::arm_mve_vcvt_narrow:
    SimplifyNarrowInstrTopBottom(2);
    break;
  case Intrinsic::arm_mve_vqmovn:
    SimplifyNarrowInstrTopBottom(4);
    break;
  case Intrinsic::arm_mve_vshrn:
    SimplifyNarrowInstrTopBottom(7);
    break;
  }

  return std::nullopt;
}

InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) const {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Imm.getActiveBits() >= 64)
    return 4;

  int64_t SImmVal = Imm.getSExtValue();
  uint64_t ZImmVal = Imm.getZExtValue();
  if (!ST->isThumb()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  if (ST->isThumb2()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  // Thumb1: any i8 immediate costs 1.
  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
    return 1;
  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
    return 2;
  // Load from constant pool.
  return 3;
}

// Constants smaller than 256 fit in the immediate field of Thumb1
// instructions, so we return a cost of zero for them and 1 otherwise.
InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm,
                                                  Type *Ty) const {
  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
    return 0;

  return 1;
}

// Checks whether Inst is part of a min(max()) or max(min()) pattern
// that will match to an SSAT instruction. Returns the instruction being
// saturated, or null if no saturation pattern was found.
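// For example, with Imm == -128 the pattern
//   smax(smin(%x, 127), -128)
// clamps %x to the signed 8-bit range and can be selected as SSAT.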
static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  Value *LHS, *RHS;
  ConstantInt *C;
  SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;

  if (InstSPF == SPF_SMAX &&
      PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
      C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {

    auto isSSatMin = [&](Value *MinInst) {
      if (isa<SelectInst>(MinInst)) {
        Value *MinLHS, *MinRHS;
        ConstantInt *MinC;
        SelectPatternFlavor MinSPF =
            matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
        if (MinSPF == SPF_SMIN &&
            PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
            MinC->getValue() == ((-Imm) - 1))
          return true;
      }
      return false;
    };

    if (isSSatMin(Inst->getOperand(1)))
      return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
    if (Inst->hasNUses(2) &&
        (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
      return Inst->getOperand(1);
  }
  return nullptr;
}

// Look for an FP saturation pattern, where the instruction can be simplified
// to a fptosi.sat: max(min(fptosi)). The constant in this case is always free.
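// For example, a clamp such as
//   smax(smin(fptosi double %x to i64, 2147483647), -2147483648)
// can be selected as a saturating conversion, so the i64 -2147483648
// immediate never needs to be materialised.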
static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  if (Imm.getBitWidth() != 64 ||
      Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
    return false;
  Value *FP = isSSATMinMaxPattern(Inst, Imm);
  if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
    FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
  if (!FP)
    return false;
  return isa<FPToSIInst>(FP);
}

InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) const {
  // Division by a constant can be turned into multiplication, but only if we
  // know it's constant. So it's not so much that the immediate is cheap (it's
  // not), but that the alternative is worse.
  // FIXME: this is probably unneeded with GlobalISel.
  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
       Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
      Idx == 1)
    return 0;

  // Leave any gep offsets for CodeGenPrepare, which will do a better job at
  // splitting any large offsets.
  if (Opcode == Instruction::GetElementPtr && Idx != 0)
    return 0;

  if (Opcode == Instruction::And) {
    // UXTB/UXTH
    if (Imm == 255 || Imm == 65535)
      return 0;
    // Conversion to BIC is free, and means we can use ~Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(~Imm, Ty, CostKind));
  }

  if (Opcode == Instruction::Add)
    // Conversion to SUB is free, and means we can use -Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(-Imm, Ty, CostKind));

  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
      Ty->getIntegerBitWidth() == 32) {
    int64_t NegImm = -Imm.getSExtValue();
    if (ST->isThumb2() && NegImm < 1 << 12)
      // icmp X, #-C -> cmn X, #C
      return 0;
    if (ST->isThumb() && NegImm < 1 << 8)
      // icmp X, #-C -> adds X, #C
      return 0;
  }

  // xor a, -1 can always be folded to MVN.
  if (Opcode == Instruction::Xor && Imm.isAllOnes())
    return 0;

  // Ensure that negative constants in min(max()) or max(min()) patterns that
  // match SSAT instructions don't get hoisted.
  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
      Ty->getIntegerBitWidth() <= 32) {
    if (isSSATMinMaxPattern(Inst, Imm) ||
        (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
         isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
      return 0;
  }

  if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
    return 0;

  // We can convert <= -1 to < 0, which is generally quite cheap.
  if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
    ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
    if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
      return std::min(getIntImmCost(Imm, Ty, CostKind),
                      getIntImmCost(Imm + 1, Ty, CostKind));
  }

  return getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) const {
  if (CostKind == TTI::TCK_RecipThroughput &&
      (ST->hasNEON() || ST->hasMVEIntegerOps())) {
    // FIXME: The vectorizer is highly sensitive to the cost of these
    // instructions, which suggests that it may be using the costs incorrectly.
    // But, for now, just make them free to avoid performance regressions for
    // vector targets.
    return 0;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}

InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
                                             TTI::CastContextHint CCH,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) const {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // TODO: Allow non-throughput costs that aren't binary.
  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost == 0 ? 0 : 1;
    return Cost;
  };
  auto IsLegalFPType = [this](EVT VT) {
    EVT EltVT = VT.getScalarType();
    return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
           (EltVT == MVT::f64 && ST->hasFP64()) ||
           (EltVT == MVT::f16 && ST->hasFullFP16());
  };

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return AdjustCost(
        BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));

  // Extending masked loads and truncating masked stores are expensive because
  // we currently don't split them. This means that we'll likely end up
  // loading/storing each element individually (hence the high cost).
  if ((ST->hasMVEIntegerOps() &&
       (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
        Opcode == Instruction::SExt)) ||
      (ST->hasMVEFloatOps() &&
       (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
       IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
    if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
      return 2 * DstTy.getVectorNumElements() *
             ST->getMVEVectorCostFactor(CostKind);

  // The extension of other kinds of loads is free.
  if (CCH == TTI::CastContextHint::Normal ||
      CCH == TTI::CastContextHint::Masked) {
    static const TypeConversionCostTblEntry LoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
    };
    if (const auto *Entry = ConvertCostTableLookup(
            LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);

    static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        // The following extend from a legal type to an illegal type, so the
        // load needs to be split. This introduces an extra load operation,
        // but the extend is still "free".
        {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
        {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVELoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
        // FPExtends are similar but also require the VCVT instructions.
        {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    // The truncate of a store is free. This is the mirror of the extends above.
    static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
        {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
        {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
        {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
        {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }
  }

  // NEON vector operations that can extend their inputs.
  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
      I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
    static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
        // vaddl
        {ISD::ADD, MVT::v4i32, MVT::v4i16, 0},
        {ISD::ADD, MVT::v8i16, MVT::v8i8, 0},
        // vsubl
        {ISD::SUB, MVT::v4i32, MVT::v4i16, 0},
        {ISD::SUB, MVT::v8i16, MVT::v8i8, 0},
        // vmull
        {ISD::MUL, MVT::v4i32, MVT::v4i16, 0},
        {ISD::MUL, MVT::v8i16, MVT::v8i8, 0},
        // vshll
        {ISD::SHL, MVT::v4i32, MVT::v4i16, 0},
        {ISD::SHL, MVT::v8i16, MVT::v8i8, 0},
    };

    auto *User = cast<Instruction>(*I->user_begin());
    int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
    if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
                                             DstTy.getSimpleVT(),
                                             SrcTy.getSimpleVT())) {
      return AdjustCost(Entry->Cost);
    }
  }

  // Single to/from double precision conversions.
  if (Src->isVectorTy() && ST->hasNEON() &&
      ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
        DstTy.getScalarType() == MVT::f32) ||
       (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
        DstTy.getScalarType() == MVT::f64))) {
    static const CostTblEntry NEONFltDblTbl[] = {
        // Vector fptrunc/fpext conversions.
        {ISD::FP_ROUND, MVT::v2f64, 2},
        {ISD::FP_EXTEND, MVT::v2f32, 2},
        {ISD::FP_EXTEND, MVT::v4f32, 4}};

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
    if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
      return AdjustCost(LT.first * Entry->Cost);
  }

  // Some arithmetic, load and store operations have specific instructions
  // to cast up/down their types automatically at no extra cost.
  // TODO: Get these tables to know at least what the related operations are.
  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },

    // The number of vmovl instructions for the extension.
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // Operations that we legalize using splitting.
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },

    // Vector float <-> i32 conversions.
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },

    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },

    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },

    // Vector double <-> i32 conversions.
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },

    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },

    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
    { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
  };

  if (SrcTy.isVector() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar float to integer conversions.
  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
    { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
    { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
  };
  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar integer to float conversions.
  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
    { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
  };

  if (SrcTy.isInteger() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
  // are linearised, so they cost more.
  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
  };

  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
    if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
  }

  if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
    // As a general rule, fp converts that were not matched above are
    // scalarized and cost 1 vcvt per lane, so long as the instruction is
    // available. If not, they become a series of function calls.
    const InstructionCost CallCost =
        getCallInstrCost(nullptr, Dst, {Src}, CostKind);
    int Lanes = 1;
    if (SrcTy.isFixedLengthVector())
      Lanes = SrcTy.getVectorNumElements();

    if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
      return Lanes;
    else
      return Lanes * CallCost;
  }

  if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
      SrcTy.isFixedLengthVector()) {
    // Treat a truncate with a larger than legal source (128 bits for MVE) as
    // expensive, 2 instructions per lane.
    if ((SrcTy.getScalarType() == MVT::i8 ||
         SrcTy.getScalarType() == MVT::i16 ||
         SrcTy.getScalarType() == MVT::i32) &&
        SrcTy.getSizeInBits() > 128 &&
        SrcTy.getSizeInBits() > DstTy.getSizeInBits())
      return SrcTy.getVectorNumElements() * 2;
  }

  // Scalar integer conversion costs.
  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
    // i16 -> i64 requires two dependent operations.
    { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },

    // Truncates on i64 are assumed to be free.
    { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
    { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
    { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
    { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
  };

  if (SrcTy.isInteger()) {
    if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
                     ? ST->getMVEVectorCostFactor(CostKind)
                     : 1;
  return AdjustCost(
      BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}

InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, const Value *Op0,
                                               const Value *Op1) const {
  // Penalize inserting into a D-subregister. We end up with a three times
  // lower estimated throughput on Swift.
  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
      ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
    return 3;

  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
                        Opcode == Instruction::ExtractElement)) {
    // Cross-class copies are expensive on many microarchitectures,
    // so assume they are expensive by default.
    if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
      return 3;

    // Even if it's not a cross-class copy, this likely leads to mixing
    // of NEON and VFP code and should therefore be penalized.
    if (ValTy->isVectorTy() &&
        ValTy->getScalarSizeInBits() <= 32)
      return std::max<InstructionCost>(
          BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
          2U);
  }

  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
                                 Opcode == Instruction::ExtractElement)) {
    // Integer cross-lane moves are more expensive than float, which can
    // sometimes just be vmovs. Integers involve being passed to GPR
    // registers, causing more of a delay.
    std::pair<InstructionCost, MVT> LT =
        getTypeLegalizationCost(ValTy->getScalarType());
    return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
  }

  return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
}

InstructionCost ARMTTIImpl::getCmpSelInstrCost(
    unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
    TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
    TTI::OperandValueInfo Op2Info, const Instruction *I) const {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Thumb scalar code size cost for select.
  if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
      ST->isThumb() && !ValTy->isVectorTy()) {
    // Assume expensive structs.
    if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
      return TTI::TCC_Expensive;

    // Select costs can vary because they:
    // - may require one or more conditional mov (including an IT),
    // - can't operate directly on immediates,
    // - require live flags, which we can't copy around easily.
    InstructionCost Cost = getTypeLegalizationCost(ValTy).first;

    // Possible IT instruction for Thumb2, or more for Thumb1.
    ++Cost;

    // i1 values may need rematerialising by using mov immediates and/or
    // flag setting instructions.
    if (ValTy->isIntegerTy(1))
      ++Cost;

    return Cost;
  }

  // If this is a vector min/max/abs, use the cost of that intrinsic directly
  // instead. Hopefully when min/max intrinsics are more prevalent this code
  // will not be needed.
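  // For example, the select in `select (icmp slt %x, %y), %x, %y` is costed
  // as a call to llvm.smin(%x, %y), and the icmp feeding it is free.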
  const Instruction *Sel = I;
  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
      Sel->hasOneUse())
    Sel = cast<Instruction>(Sel->user_back());
  if (Sel && ValTy->isVectorTy() &&
      (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
    const Value *LHS, *RHS;
    SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
    unsigned IID = 0;
    switch (SPF) {
    case SPF_ABS:
      IID = Intrinsic::abs;
      break;
    case SPF_SMIN:
      IID = Intrinsic::smin;
      break;
    case SPF_SMAX:
      IID = Intrinsic::smax;
      break;
    case SPF_UMIN:
      IID = Intrinsic::umin;
      break;
    case SPF_UMAX:
      IID = Intrinsic::umax;
      break;
    case SPF_FMINNUM:
      IID = Intrinsic::minnum;
      break;
    case SPF_FMAXNUM:
      IID = Intrinsic::maxnum;
      break;
    default:
      break;
    }
    if (IID) {
      // The ICmp is free; the select gets the cost of the min/max/etc.
      if (Sel != I)
        return 0;
      IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
      return getIntrinsicInstrCost(CostAttrs, CostKind);
    }
  }

  // On NEON a vector select gets lowered to vbsl.
  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
    // Lowering of some vector selects is currently far from perfect.
    static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
    return LT.first;
  }

  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
      (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
      cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
    FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
    FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
    if (!VecCondTy)
      VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));

    // If we don't have mve.fp, any fp operations will need to be scalarized.
    if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
      // One scalarization insert, one scalarization extract and the cost of
      // the fcmps.
      return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
                                             /*Extract*/ true, CostKind) +
             BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
                                             /*Extract*/ false, CostKind) +
             VecValTy->getNumElements() *
                 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
                                    VecCondTy->getScalarType(), VecPred,
                                    CostKind, Op1Info, Op2Info, I);
    }

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
    int BaseCost = ST->getMVEVectorCostFactor(CostKind);
    // There are two types - the input that specifies the type of the compare
    // and the output vXi1 type. Because we don't know how the output will be
    // split, we may need an expensive shuffle to get two in sync. This has the
    // effect of making larger than legal compares (v8i32 for example)
    // expensive.
    if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
      if (LT.first > 1)
        return LT.first * BaseCost +
               BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
                                               /*Extract*/ false, CostKind);
      return BaseCost;
    }
  }

  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
  // for "multiple beats" potentially needed by MVE instructions.
  int BaseCost = 1;
  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
    BaseCost = ST->getMVEVectorCostFactor(CostKind);

  return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred,
                                              CostKind, Op1Info, Op2Info, I);
}

InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
                                                      ScalarEvolution *SE,
                                                      const SCEV *Ptr) const {
  // Address computations in vectorized code with non-consecutive addresses
  // will likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;
  int MaxMergeDistance = 64;

  if (ST->hasNEON()) {
    if (Ty->isVectorTy() && SE &&
        !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
      return NumVectorInstToHideOverhead;

    // In many cases the address computation is not merged into the instruction
    // addressing mode.
    return 1;
  }
  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}

bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) const {
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
    // If a VCTP is part of a chain, it's already profitable and shouldn't be
    // optimized, else LSR may block tail-predication.
    switch (II->getIntrinsicID()) {
    case Intrinsic::arm_mve_vctp8:
    case Intrinsic::arm_mve_vctp16:
    case Intrinsic::arm_mve_vctp32:
    case Intrinsic::arm_mve_vctp64:
      return true;
    default:
      break;
    }
  }
  return false;
}

bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment,
                                   unsigned /*AddressSpace*/) const {
  if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
    return false;

  if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
    // Don't support v2i1 yet.
    if (VecTy->getNumElements() == 2)
      return false;

    // We don't support extending fp types.
    unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
    if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
      return false;
  }

  unsigned EltWidth = DataTy->getScalarSizeInBits();
  return (EltWidth == 32 && Alignment >= 4) ||
         (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
}

bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const {
  if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
    return false;

  unsigned EltWidth = Ty->getScalarSizeInBits();
  return ((EltWidth == 32 && Alignment >= 4) ||
          (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
}

/// Given a memcpy/memset/memmove instruction, return the number of memory
/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if
/// a call is used.
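/// For example, a constant-size memcpy that findOptimalMemOpLowering breaks
/// into four i32 accesses is reported as 4 * 2 = 8 operations (each element
/// is both loaded and stored), while a memset of the same shape would report
/// 4 * 1 = 4.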
int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
  MemOp MOp;
  unsigned DstAddrSpace = ~0u;
  unsigned SrcAddrSpace = ~0u;
  const Function *F = I->getParent()->getParent();

  if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
    ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
    // If 'size' is not a constant, a library call will be generated.
    if (!C)
      return -1;

    const unsigned Size = C->getValue().getZExtValue();
    const Align DstAlign = MC->getDestAlign().valueOrOne();
    const Align SrcAlign = MC->getSourceAlign().valueOrOne();

    MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
                      /*IsVolatile*/ false);
    DstAddrSpace = MC->getDestAddressSpace();
    SrcAddrSpace = MC->getSourceAddressSpace();
  } else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
    ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
    // If 'size' is not a constant, a library call will be generated.
    if (!C)
      return -1;

    const unsigned Size = C->getValue().getZExtValue();
    const Align DstAlign = MS->getDestAlign().valueOrOne();

    MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
                     /*IsZeroMemset*/ false, /*IsVolatile*/ false);
    DstAddrSpace = MS->getDestAddressSpace();
  } else
    llvm_unreachable("Expected a memcpy/move or memset!");

  unsigned Limit, Factor = 2;
  switch (I->getIntrinsicID()) {
  case Intrinsic::memcpy:
    Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
    break;
  case Intrinsic::memmove:
    Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
    break;
  case Intrinsic::memset:
    Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
    Factor = 1;
    break;
  default:
    llvm_unreachable("Expected a memcpy/move or memset!");
  }

  // MemOps will be populated with a list of the data types that need to be
  // loaded and stored, which is why we multiply the number of elements by 2
  // to get the cost for this memcpy.
  std::vector<EVT> MemOps;
  if (getTLI()->findOptimalMemOpLowering(
          MemOps, Limit, MOp, DstAddrSpace,
          SrcAddrSpace, F->getAttributes()))
    return MemOps.size() * Factor;

  // If we can't find an optimal memop lowering, return the default cost.
  return -1;
}

InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) const {
  int NumOps = getNumMemOps(cast<IntrinsicInst>(I));

  // To model the cost of a library call, we assume 1 for the call, and
  // 3 for the argument setup.
  if (NumOps == -1)
    return 4;
  return NumOps;
}

InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *DstTy, VectorType *SrcTy,
                                           ArrayRef<int> Mask,
                                           TTI::TargetCostKind CostKind,
                                           int Index, VectorType *SubTp,
                                           ArrayRef<const Value *> Args,
                                           const Instruction *CxtI) const {
  assert((Mask.empty() || DstTy->isScalableTy() ||
          Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
         "Expected the Mask to match the return size if given");
  assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
         "Expected the same scalar types");

  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
  // Treat extractsubvector as single op permutation.
  bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
  if (IsExtractSubvector)
    Kind = TTI::SK_PermuteSingleSrc;
  if (ST->hasNEON()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry NEONDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};

      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
      if (const auto *Entry =
              CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Reverse) {
      static const CostTblEntry NEONShuffleTbl[] = {
          // Reverse shuffle costs one instruction if we are shuffling within a
          // double word (vrev) or two if we shuffle a quad word (vrev, vext).
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};

      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
      if (const auto *Entry =
              CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Select) {
      static const CostTblEntry NEONSelShuffleTbl[] = {
          // Select shuffle cost table for ARM. Cost is the number of
          // instructions required to create the shuffled vector.

          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},

          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},

          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};

      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
      if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
                                              ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
  }
  if (ST->hasMVEIntegerOps()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry MVEDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};

      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
      if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
                                              LT.second))
        return LT.first * Entry->Cost *
               ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
    }

1337 if (!Mask.empty()) {
1338 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy);
1339 if (LT.second.isVector() &&
1340 Mask.size() <= LT.second.getVectorNumElements() &&
1341 (isVREVMask(M: Mask, VT: LT.second, BlockSize: 16) || isVREVMask(M: Mask, VT: LT.second, BlockSize: 32) ||
1342 isVREVMask(M: Mask, VT: LT.second, BlockSize: 64)))
1343 return ST->getMVEVectorCostFactor(CostKind: TTI::TCK_RecipThroughput) * LT.first;
1344 }
1345 }
1346
1347 // Restore the original kind.
1348 if (IsExtractSubvector)
1349 Kind = TTI::SK_ExtractSubvector;
1350 int BaseCost = ST->hasMVEIntegerOps() && SrcTy->isVectorTy()
1351 ? ST->getMVEVectorCostFactor(CostKind: TTI::TCK_RecipThroughput)
1352 : 1;
1353 return BaseCost * BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind,
1354 Index, SubTp);
1355}
1356
1357InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1358 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1359 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1360 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
1361 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1362 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(Bitwidth: 1)) {
1363 // Make operations on i1 relatively expensive as this often involves
1364 // combining predicates. AND and XOR should be easier to handle with IT
1365 // blocks.
1366 switch (ISDOpcode) {
1367 default:
1368 break;
1369 case ISD::AND:
1370 case ISD::XOR:
1371 return 2;
1372 case ISD::OR:
1373 return 3;
1374 }
1375 }
1376
1377 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1378
1379 if (ST->hasNEON()) {
1380 const unsigned FunctionCallDivCost = 20;
1381 const unsigned ReciprocalDivCost = 10;
1382 static const CostTblEntry CostTbl[] = {
1383 // Division.
1384 // These costs are somewhat arbitrary. Choose a cost of 20 to indicate
1385 // that vectorizing division (an added function call) is very expensive.
1386 // Double registers types.
1387 { .ISD: ISD::SDIV, .Type: MVT::v1i64, .Cost: 1 * FunctionCallDivCost},
1388 { .ISD: ISD::UDIV, .Type: MVT::v1i64, .Cost: 1 * FunctionCallDivCost},
1389 { .ISD: ISD::SREM, .Type: MVT::v1i64, .Cost: 1 * FunctionCallDivCost},
1390 { .ISD: ISD::UREM, .Type: MVT::v1i64, .Cost: 1 * FunctionCallDivCost},
1391 { .ISD: ISD::SDIV, .Type: MVT::v2i32, .Cost: 2 * FunctionCallDivCost},
1392 { .ISD: ISD::UDIV, .Type: MVT::v2i32, .Cost: 2 * FunctionCallDivCost},
1393 { .ISD: ISD::SREM, .Type: MVT::v2i32, .Cost: 2 * FunctionCallDivCost},
1394 { .ISD: ISD::UREM, .Type: MVT::v2i32, .Cost: 2 * FunctionCallDivCost},
1395 { .ISD: ISD::SDIV, .Type: MVT::v4i16, .Cost: ReciprocalDivCost},
1396 { .ISD: ISD::UDIV, .Type: MVT::v4i16, .Cost: ReciprocalDivCost},
1397 { .ISD: ISD::SREM, .Type: MVT::v4i16, .Cost: 4 * FunctionCallDivCost},
1398 { .ISD: ISD::UREM, .Type: MVT::v4i16, .Cost: 4 * FunctionCallDivCost},
1399 { .ISD: ISD::SDIV, .Type: MVT::v8i8, .Cost: ReciprocalDivCost},
1400 { .ISD: ISD::UDIV, .Type: MVT::v8i8, .Cost: ReciprocalDivCost},
1401 { .ISD: ISD::SREM, .Type: MVT::v8i8, .Cost: 8 * FunctionCallDivCost},
1402 { .ISD: ISD::UREM, .Type: MVT::v8i8, .Cost: 8 * FunctionCallDivCost},
1403 // Quad register types.
1404 { .ISD: ISD::SDIV, .Type: MVT::v2i64, .Cost: 2 * FunctionCallDivCost},
1405 { .ISD: ISD::UDIV, .Type: MVT::v2i64, .Cost: 2 * FunctionCallDivCost},
1406 { .ISD: ISD::SREM, .Type: MVT::v2i64, .Cost: 2 * FunctionCallDivCost},
1407 { .ISD: ISD::UREM, .Type: MVT::v2i64, .Cost: 2 * FunctionCallDivCost},
1408 { .ISD: ISD::SDIV, .Type: MVT::v4i32, .Cost: 4 * FunctionCallDivCost},
1409 { .ISD: ISD::UDIV, .Type: MVT::v4i32, .Cost: 4 * FunctionCallDivCost},
1410 { .ISD: ISD::SREM, .Type: MVT::v4i32, .Cost: 4 * FunctionCallDivCost},
1411 { .ISD: ISD::UREM, .Type: MVT::v4i32, .Cost: 4 * FunctionCallDivCost},
1412 { .ISD: ISD::SDIV, .Type: MVT::v8i16, .Cost: 8 * FunctionCallDivCost},
1413 { .ISD: ISD::UDIV, .Type: MVT::v8i16, .Cost: 8 * FunctionCallDivCost},
1414 { .ISD: ISD::SREM, .Type: MVT::v8i16, .Cost: 8 * FunctionCallDivCost},
1415 { .ISD: ISD::UREM, .Type: MVT::v8i16, .Cost: 8 * FunctionCallDivCost},
1416 { .ISD: ISD::SDIV, .Type: MVT::v16i8, .Cost: 16 * FunctionCallDivCost},
1417 { .ISD: ISD::UDIV, .Type: MVT::v16i8, .Cost: 16 * FunctionCallDivCost},
1418 { .ISD: ISD::SREM, .Type: MVT::v16i8, .Cost: 16 * FunctionCallDivCost},
1419 { .ISD: ISD::UREM, .Type: MVT::v16i8, .Cost: 16 * FunctionCallDivCost},
1420 // Multiplication.
1421 };
1422
1423 if (const auto *Entry = CostTableLookup(Table: CostTbl, ISD: ISDOpcode, Ty: LT.second))
1424 return LT.first * Entry->Cost;
1425
1426 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1427 Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
1428
1429 // This is somewhat of a hack. The problem we are facing is that SROA
1430 // creates sequences of shift, and, or instructions to construct values.
1431 // These sequences are recognized by ISel and have zero cost, but the same
1432 // is not true of the vectorized code. Because we have support for v2i64
1433 // but not i64, those sequences look particularly beneficial to vectorize.
1434 // To work around this we increase the cost of v2i64 operations to make
1435 // them seem less beneficial.
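    // Illustrative sketch of the kind of scalar sequence meant above (value
    // names are examples only, not actual SROA output):
    //   %lo = zext i32 %a to i64
    //   %hi = zext i32 %b to i64
    //   %sh = shl i64 %hi, 32
    //   %v  = or i64 %sh, %lo
    // ISel folds this for scalars, but the equivalent <2 x i64> shift/or
    // with a uniform constant operand is not free, hence the +4 below.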
1436 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1437 Cost += 4;
1438
1439 return Cost;
1440 }
1441
1442 // If this operation is a shift on arm/thumb2, it might well be folded into
1443 // the following instruction, hence having a cost of 0.
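  // For example (illustrative), "%s = shl i32 %a, 2; %r = add i32 %b, %s"
  // can lower to a single "add r0, r1, r2, lsl #2", so such a shift is
  // treated as free when its sole user is one of the opcodes listed below.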
1444 auto LooksLikeAFreeShift = [&]() {
1445 if (ST->isThumb1Only() || Ty->isVectorTy())
1446 return false;
1447
1448 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1449 return false;
1450 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1451 return false;
1452
1453 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1454 switch (cast<Instruction>(Val: CxtI->user_back())->getOpcode()) {
1455 case Instruction::Add:
1456 case Instruction::Sub:
1457 case Instruction::And:
1458 case Instruction::Xor:
1459 case Instruction::Or:
1460 case Instruction::ICmp:
1461 return true;
1462 default:
1463 return false;
1464 }
1465 };
1466 if (LooksLikeAFreeShift())
1467 return 0;
1468
1469 // When targets have both DSP and MVE we find that the compiler will
1470 // attempt to vectorize as well as use scalar (S/U)MLAL operations. For
1471 // code matching the pattern ext(mul(ext(i16), ext(i16))) we find that
1472 // codegen performs better when only the scalar (S/U)MLAL ops are used,
1473 // instead of trying to mix vector ops with (S/U)MLAL ops. We therefore
1474 // check whether a mul instruction is used in a (U/S)MLAL pattern.
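  // Illustrative IR for the scalar pattern that maps onto SMLAL (value
  // names are examples only):
  //   %a32 = sext i16 %a to i32
  //   %b32 = sext i16 %b to i32
  //   %mul = mul i32 %a32, %b32
  //   %acc = sext i32 %mul to i64   ; every user of %mul is an extend
  // Returning a cost of 0 for such a mul below keeps these loops scalar.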
1476 auto MulInDSPMLALPattern = [&](const Instruction *I, unsigned Opcode,
1477 Type *Ty) -> bool {
1478 if (!ST->hasDSP())
1479 return false;
1480
1481 if (!I)
1482 return false;
1483
1484 if (Opcode != Instruction::Mul)
1485 return false;
1486
1487 if (Ty->isVectorTy())
1488 return false;
1489
1490 auto ValueOpcodesEqual = [](const Value *LHS, const Value *RHS) -> bool {
1491 return cast<Instruction>(Val: LHS)->getOpcode() ==
1492 cast<Instruction>(Val: RHS)->getOpcode();
1493 };
1494 auto IsExtInst = [](const Value *V) -> bool {
1495 return isa<ZExtInst>(Val: V) || isa<SExtInst>(Val: V);
1496 };
1497 auto IsExtensionFromHalf = [](const Value *V) -> bool {
1498 return cast<Instruction>(Val: V)->getOperand(i: 0)->getType()->isIntegerTy(Bitwidth: 16);
1499 };
1500
1501 // We check the arguments of the instruction to see if they're extends
1502 auto *BinOp = dyn_cast<BinaryOperator>(Val: I);
1503 if (!BinOp)
1504 return false;
1505 Value *Op0 = BinOp->getOperand(i_nocapture: 0);
1506 Value *Op1 = BinOp->getOperand(i_nocapture: 1);
1507 if (IsExtInst(Op0) && IsExtInst(Op1) && ValueOpcodesEqual(Op0, Op1)) {
1508 // We're interested in an ext of an i16
1509 if (!I->getType()->isIntegerTy(Bitwidth: 32) || !IsExtensionFromHalf(Op0) ||
1510 !IsExtensionFromHalf(Op1))
1511 return false;
1512 // We need to check if this result will be further extended to i64
1513 // and that all of its uses are extend instructions.
1514 for (auto *U : I->users())
1515 if (!IsExtInst(U))
1516 return false;
1517 return true;
1518 }
1519
1520 return false;
1521 };
1522
1523 if (MulInDSPMLALPattern(CxtI, Opcode, Ty))
1524 return 0;
1525
1526 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1527 // for "multiple beats" potentially needed by MVE instructions.
1528 int BaseCost = 1;
1529 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1530 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1531
1532 // The rest of this mostly follows what is done in
1533 // BaseT::getArithmeticInstrCost, without treating floats as more expensive
1534 // than scalars or increasing the costs for custom operations. The result
1535 // is also multiplied by the MVEVectorCostFactor where appropriate.
1536 if (TLI->isOperationLegalOrCustomOrPromote(Op: ISDOpcode, VT: LT.second))
1537 return LT.first * BaseCost;
1538
1539 // Else this is expand, assume that we need to scalarize this op.
1540 if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) {
1541 unsigned Num = VTy->getNumElements();
1542 InstructionCost Cost =
1543 getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind);
1544 // Return the cost of multiple scalar invocation plus the cost of
1545 // inserting and extracting the values.
1546 SmallVector<Type *> Tys(Args.size(), Ty);
1547 return BaseT::getScalarizationOverhead(RetTy: VTy, Args, Tys, CostKind) +
1548 Num * Cost;
1549 }
1550
1551 return BaseCost;
1552}
1553
1554InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1555 Align Alignment,
1556 unsigned AddressSpace,
1557 TTI::TargetCostKind CostKind,
1558 TTI::OperandValueInfo OpInfo,
1559 const Instruction *I) const {
1560 // TODO: Handle other cost kinds.
1561 if (CostKind != TTI::TCK_RecipThroughput)
1562 return 1;
1563
1564 // Type legalization can't handle structs
1565 if (TLI->getValueType(DL, Ty: Src, AllowUnknown: true) == MVT::Other)
1566 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1567 CostKind);
1568
1569 if (ST->hasNEON() && Src->isVectorTy() && Alignment != Align(16) &&
1570 cast<VectorType>(Val: Src)->getElementType()->isDoubleTy()) {
1571 // Unaligned loads/stores are extremely inefficient.
1572 // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
1573 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Src);
1574 return LT.first * 4;
1575 }
1576
1577 // MVE can optimize an fpext(load(4xhalf)) using an extending integer load.
1578 // The same applies to stores.
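  // Illustrative IR for the load case (assumed shape):
  //   %h = load <4 x half>, ptr %p
  //   %f = fpext <4 x half> %h to <4 x float>
  // The extend can be merged into the load, so this is costed as a single
  // MVE operation below.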
1579 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Val: Src) && I &&
1580 ((Opcode == Instruction::Load && I->hasOneUse() &&
1581 isa<FPExtInst>(Val: *I->user_begin())) ||
1582 (Opcode == Instruction::Store && isa<FPTruncInst>(Val: I->getOperand(i: 0))))) {
1583 FixedVectorType *SrcVTy = cast<FixedVectorType>(Val: Src);
1584 Type *DstTy =
1585 Opcode == Instruction::Load
1586 ? (*I->user_begin())->getType()
1587 : cast<Instruction>(Val: I->getOperand(i: 0))->getOperand(i: 0)->getType();
1588 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1589 DstTy->getScalarType()->isFloatTy())
1590 return ST->getMVEVectorCostFactor(CostKind);
1591 }
1592
1593 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1594 ? ST->getMVEVectorCostFactor(CostKind)
1595 : 1;
1596 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1597 CostKind, OpInfo, I);
1598}
1599
1600InstructionCost
1601ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1602 unsigned AddressSpace,
1603 TTI::TargetCostKind CostKind) const {
1604 if (ST->hasMVEIntegerOps()) {
1605 if (Opcode == Instruction::Load &&
1606 isLegalMaskedLoad(DataTy: Src, Alignment, AddressSpace))
1607 return ST->getMVEVectorCostFactor(CostKind);
1608 if (Opcode == Instruction::Store &&
1609 isLegalMaskedStore(DataTy: Src, Alignment, AddressSpace))
1610 return ST->getMVEVectorCostFactor(CostKind);
1611 }
1612 if (!isa<FixedVectorType>(Val: Src))
1613 return BaseT::getMaskedMemoryOpCost(Opcode, DataTy: Src, Alignment, AddressSpace,
1614 CostKind);
1615 // Scalar cost, which is currently very high due to the inefficiency of
1616 // the generated code.
1617 return cast<FixedVectorType>(Val: Src)->getNumElements() * 8;
1618}
1619
1620InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1621 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1622 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1623 bool UseMaskForCond, bool UseMaskForGaps) const {
1624 assert(Factor >= 2 && "Invalid interleave factor");
1625 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1626
1627 // vldN/vstN don't support vector types with i64/f64 elements.
1628 bool EltIs64Bits = DL.getTypeSizeInBits(Ty: VecTy->getScalarType()) == 64;
1629
1630 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1631 !UseMaskForCond && !UseMaskForGaps) {
1632 unsigned NumElts = cast<FixedVectorType>(Val: VecTy)->getNumElements();
1633 auto *SubVecTy =
1634 FixedVectorType::get(ElementType: VecTy->getScalarType(), NumElts: NumElts / Factor);
1635
1636 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1637 // Accesses having vector types that are a multiple of 128 bits can be
1638 // matched to more than one vldN/vstN instruction.
1639 int BaseCost =
1640 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1641 if (NumElts % Factor == 0 &&
1642 TLI->isLegalInterleavedAccessType(Factor, VecTy: SubVecTy, Alignment, DL))
1643 return Factor * BaseCost * TLI->getNumInterleavedAccesses(VecTy: SubVecTy, DL);
1644
1645 // Some smaller than legal interleaved patterns are cheap as we can make
1646 // use of the vmovn or vrev patterns to interleave a standard load. This is
1647 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1648 // promoted differently). The cost of 2 here is then a load and vrev or
1649 // vmovn.
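    // For example (illustrative), splitting an <8 x i8> block with
    // Factor == 2 can be done as one 64-bit load plus one vrev/vmovn style
    // rearrangement, which is what the 2 * BaseCost below models.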
1650 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1651 VecTy->isIntOrIntVectorTy() &&
1652 DL.getTypeSizeInBits(Ty: SubVecTy).getFixedValue() <= 64)
1653 return 2 * BaseCost;
1654 }
1655
1656 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1657 Alignment, AddressSpace, CostKind,
1658 UseMaskForCond, UseMaskForGaps);
1659}
1660
1661InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1662 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1663 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
1664 using namespace PatternMatch;
1665 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1666 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1667 Alignment, CostKind, I);
1668
1669 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1670 auto *VTy = cast<FixedVectorType>(Val: DataTy);
1671
1672 // TODO: Splitting, once we do that.
1673
1674 unsigned NumElems = VTy->getNumElements();
1675 unsigned EltSize = VTy->getScalarSizeInBits();
1676 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: DataTy);
1677
1678 // For now, it is assumed that for the MVE gather instructions the loads are
1679 // all effectively serialised. This means the cost is the scalar cost
1680 // multiplied by the number of elements being loaded. This is possibly very
1681 // conservative, but even so we still end up vectorising loops because the
1682 // cost per iteration for many loops is lower than for scalar loops.
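  // For example, a <4 x i32> gather that stays on the vector path below is
  // costed as roughly 4 * MVEVectorCostFactor (NumElems * LT.first * the
  // MVE factor), i.e. as four serialised element loads.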
1683 InstructionCost VectorCost =
1684 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1685 // The scalarization cost should be a lot higher. We use the number of vector
1686 // elements plus the scalarization overhead. If masking is required then a lot
1687 // of little blocks will be needed and potentially a scalarized p0 mask,
1688 // greatly increasing the cost.
1689 InstructionCost ScalarCost =
1690 NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1691 BaseT::getScalarizationOverhead(InTy: VTy, /*Insert*/ true, /*Extract*/ false,
1692 CostKind) +
1693 BaseT::getScalarizationOverhead(InTy: VTy, /*Insert*/ false, /*Extract*/ true,
1694 CostKind);
1695
1696 if (EltSize < 8 || Alignment < EltSize / 8)
1697 return ScalarCost;
1698
1699 unsigned ExtSize = EltSize;
1700 // Check whether there's a single user that asks for an extended type
1701 if (I != nullptr) {
1702 // Depending on the caller of this function, a gather instruction will
1703 // either have opcode Instruction::Load or be a call to the masked_gather
1704 // intrinsic.
1705 if ((I->getOpcode() == Instruction::Load ||
1706 match(V: I, P: m_Intrinsic<Intrinsic::masked_gather>())) &&
1707 I->hasOneUse()) {
1708 const User *Us = *I->users().begin();
1709 if (isa<ZExtInst>(Val: Us) || isa<SExtInst>(Val: Us)) {
1710 // only allow valid type combinations
1711 unsigned TypeSize =
1712 cast<Instruction>(Val: Us)->getType()->getScalarSizeInBits();
1713 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1714 (TypeSize == 16 && EltSize == 8)) &&
1715 TypeSize * NumElems == 128) {
1716 ExtSize = TypeSize;
1717 }
1718 }
1719 }
1720 // Check whether the input data needs to be truncated
1721 TruncInst *T;
1722 if ((I->getOpcode() == Instruction::Store ||
1723 match(V: I, P: m_Intrinsic<Intrinsic::masked_scatter>())) &&
1724 (T = dyn_cast<TruncInst>(Val: I->getOperand(i: 0)))) {
1725 // Only allow valid type combinations
1726 unsigned TypeSize = T->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits();
1727 if (((EltSize == 16 && TypeSize == 32) ||
1728 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1729 TypeSize * NumElems == 128)
1730 ExtSize = TypeSize;
1731 }
1732 }
1733
1734 if (ExtSize * NumElems != 128 || NumElems < 4)
1735 return ScalarCost;
1736
1737 // Any (aligned) i32 gather will not need to be scalarised.
1738 if (ExtSize == 32)
1739 return VectorCost;
1740 // For smaller types, we need to ensure that the gep's inputs are correctly
1741 // extended from a small enough value. Other sizes (including i64) are
1742 // scalarized for now.
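  // Illustrative IR shape that stays on the vector path (value names are
  // examples only): a gep of i16 elements whose vector index is zero
  // extended from a type no wider than the gathered element, e.g.
  //   %idx  = zext <8 x i16> %off to <8 x i32>
  //   %ptrs = getelementptr i16, ptr %base, <8 x i32> %idx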
1743 if (ExtSize != 8 && ExtSize != 16)
1744 return ScalarCost;
1745
1746 if (const auto *BC = dyn_cast<BitCastInst>(Val: Ptr))
1747 Ptr = BC->getOperand(i_nocapture: 0);
1748 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr)) {
1749 if (GEP->getNumOperands() != 2)
1750 return ScalarCost;
1751 unsigned Scale = DL.getTypeAllocSize(Ty: GEP->getResultElementType());
1752 // Scale needs to be correct (which is only relevant for i16s).
1753 if (Scale != 1 && Scale * 8 != ExtSize)
1754 return ScalarCost;
1755 // And we need to zext (not sext) the indexes from a small enough type.
1756 if (const auto *ZExt = dyn_cast<ZExtInst>(Val: GEP->getOperand(i_nocapture: 1))) {
1757 if (ZExt->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() <= ExtSize)
1758 return VectorCost;
1759 }
1760 return ScalarCost;
1761 }
1762 return ScalarCost;
1763}
1764
1765InstructionCost
1766ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1767 std::optional<FastMathFlags> FMF,
1768 TTI::TargetCostKind CostKind) const {
1769
1770 EVT ValVT = TLI->getValueType(DL, Ty: ValTy);
1771 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1772 unsigned EltSize = ValVT.getScalarSizeInBits();
1773
1774 // In general floating point reductions are a series of elementwise
1775 // operations, with free extracts on each step. These are either in-order or
1776 // treewise depending on whether that is allowed by the fast math flags.
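  // For example, with MVE a fast-math fadd reduction of <8 x float> is
  // modelled below as one v4f32 fadd (the halving vector step) plus four
  // scalar fadds for the remaining lanes.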
1777 if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
1778 ((EltSize == 32 && ST->hasVFP2Base()) ||
1779 (EltSize == 64 && ST->hasFP64()) ||
1780 (EltSize == 16 && ST->hasFullFP16()))) {
1781 unsigned NumElts = cast<FixedVectorType>(Val: ValTy)->getNumElements();
1782 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1783 InstructionCost VecCost = 0;
1784 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(Value: NumElts) &&
1785 NumElts * EltSize > VecLimit) {
1786 Type *VecTy = FixedVectorType::get(ElementType: ValTy->getElementType(), NumElts: NumElts / 2);
1787 VecCost += getArithmeticInstrCost(Opcode, Ty: VecTy, CostKind);
1788 NumElts /= 2;
1789 }
1790
1791 // For fp16 we need to extract the upper lane elements. MVE can instead
1792 // use a VREV plus another vector FADD/FMUL to perform one more vector step.
1793 InstructionCost ExtractCost = 0;
1794 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
1795 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
1796 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1797 NumElts /= 2;
1798 } else if (ValVT.getVectorElementType() == MVT::f16)
1799 ExtractCost = NumElts / 2;
1800
1801 return VecCost + ExtractCost +
1802 NumElts *
1803 getArithmeticInstrCost(Opcode, Ty: ValTy->getElementType(), CostKind);
1804 }
1805
1806 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
1807 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
1808 unsigned NumElts = cast<FixedVectorType>(Val: ValTy)->getNumElements();
1809 unsigned VecLimit =
1810 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1811 InstructionCost VecCost = 0;
1812 while (isPowerOf2_32(Value: NumElts) && NumElts * EltSize > VecLimit) {
1813 Type *VecTy = FixedVectorType::get(ElementType: ValTy->getElementType(), NumElts: NumElts / 2);
1814 VecCost += getArithmeticInstrCost(Opcode, Ty: VecTy, CostKind);
1815 NumElts /= 2;
1816 }
1817 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
1818 // step.
1819 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
1820 NumElts * EltSize == 64) {
1821 Type *VecTy = FixedVectorType::get(ElementType: ValTy->getElementType(), NumElts);
1822 VecCost += ST->getMVEVectorCostFactor(CostKind) +
1823 getArithmeticInstrCost(Opcode, Ty: VecTy, CostKind);
1824 NumElts /= 2;
1825 }
1826
1827 // From here we extract the elements and perform the and/or/xor.
1828 InstructionCost ExtractCost = NumElts;
1829 return VecCost + ExtractCost +
1830 (NumElts - 1) * getArithmeticInstrCost(
1831 Opcode, Ty: ValTy->getElementType(), CostKind);
1832 }
1833
1834 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
1835 TTI::requiresOrderedReduction(FMF))
1836 return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
1837
1838 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1839
1840 static const CostTblEntry CostTblAdd[]{
1841 {.ISD: ISD::ADD, .Type: MVT::v16i8, .Cost: 1},
1842 {.ISD: ISD::ADD, .Type: MVT::v8i16, .Cost: 1},
1843 {.ISD: ISD::ADD, .Type: MVT::v4i32, .Cost: 1},
1844 };
1845 if (const auto *Entry = CostTableLookup(Table: CostTblAdd, ISD, Ty: LT.second))
1846 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1847
1848 return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
1849}
1850
1851InstructionCost ARMTTIImpl::getExtendedReductionCost(
1852 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1853 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
1854 EVT ValVT = TLI->getValueType(DL, Ty: ValTy);
1855 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
1856
1857 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1858
1859 switch (ISD) {
1860 case ISD::ADD:
1861 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1862 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1863
1864 // The legal cases are:
1865 // VADDV u/s 8/16/32
1866 // VADDLV u/s 32
1867 // Codegen currently cannot always handle larger than legal vectors very
1868 // well, especially for predicated reductions where the mask needs to be
1869 // split, so restrict to 128bit or smaller input types.
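      // For example, a v4i32 add reduction with an i64 result maps to
      // VADDLV (RevVTSize <= 64), whereas v16i8/v8i16 only allow results of
      // up to 32 bits (VADDV).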
1870 unsigned RevVTSize = ResVT.getSizeInBits();
1871 if (ValVT.getSizeInBits() <= 128 &&
1872 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1873 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1874 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1875 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1876 }
1877 break;
1878 default:
1879 break;
1880 }
1881 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: ValTy, FMF,
1882 CostKind);
1883}
1884
1885InstructionCost
1886ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
1887 VectorType *ValTy,
1888 TTI::TargetCostKind CostKind) const {
1889 EVT ValVT = TLI->getValueType(DL, Ty: ValTy);
1890 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
1891
1892 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1893 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1894
1895 // The legal cases are:
1896 // VMLAV u/s 8/16/32
1897 // VMLALV u/s 16/32
1898 // Codegen currently cannot always handle larger than legal vectors very
1899 // well, especially for predicated reductions where the mask needs to be
1900 // split, so restrict to 128bit or smaller input types.
1901 unsigned RevVTSize = ResVT.getSizeInBits();
1902 if (ValVT.getSizeInBits() <= 128 &&
1903 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1904 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
1905 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1906 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1907 }
1908
1909 return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, Ty: ValTy, CostKind);
1910}
1911
1912InstructionCost
1913ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1914 FastMathFlags FMF,
1915 TTI::TargetCostKind CostKind) const {
1916 EVT ValVT = TLI->getValueType(DL, Ty);
1917
1918 // In general floating point reductions are a series of elementwise
1919 // operations, with free extracts on each step. These are either in-order or
1920 // treewise depending on whether that is allowed by the fast math flags.
1921 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
1922 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
1923 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
1924 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
1925 unsigned NumElts = cast<FixedVectorType>(Val: Ty)->getNumElements();
1926 unsigned EltSize = ValVT.getScalarSizeInBits();
1927 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1928 InstructionCost VecCost;
1929 while (isPowerOf2_32(Value: NumElts) && NumElts * EltSize > VecLimit) {
1930 Type *VecTy = FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumElts/2);
1931 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
1932 VecCost += getIntrinsicInstrCost(ICA, CostKind);
1933 NumElts /= 2;
1934 }
1935
1936 // For fp16 we need to extract the upper lane elements. MVE can add a
1937 // VREV+FMIN/MAX to perform another vector step instead.
1938 InstructionCost ExtractCost = 0;
1939 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
1940 NumElts == 8) {
1941 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1942 NumElts /= 2;
1943 } else if (ValVT.getVectorElementType() == MVT::f16)
1944 ExtractCost = cast<FixedVectorType>(Val: Ty)->getNumElements() / 2;
1945
1946 IntrinsicCostAttributes ICA(IID, Ty->getElementType(),
1947 {Ty->getElementType(), Ty->getElementType()},
1948 FMF);
1949 return VecCost + ExtractCost +
1950 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
1951 }
1952
1953 if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
1954 IID == Intrinsic::umin || IID == Intrinsic::umax) {
1955 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1956
1957 // All costs are the same for u/s min/max. These lower to vminv/vmaxv,
1958 // which are given a slightly higher cost as they tend to take multiple
1959 // cycles for smaller type sizes.
1960 static const CostTblEntry CostTblAdd[]{
1961 {.ISD: ISD::SMIN, .Type: MVT::v16i8, .Cost: 4},
1962 {.ISD: ISD::SMIN, .Type: MVT::v8i16, .Cost: 3},
1963 {.ISD: ISD::SMIN, .Type: MVT::v4i32, .Cost: 2},
1964 };
1965 if (const auto *Entry = CostTableLookup(Table: CostTblAdd, ISD: ISD::SMIN, Ty: LT.second))
1966 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1967 }
1968
1969 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1970}
1971
1972InstructionCost
1973ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1974 TTI::TargetCostKind CostKind) const {
1975 unsigned Opc = ICA.getID();
1976 switch (Opc) {
1977 case Intrinsic::get_active_lane_mask:
1978 // Currently we make a somewhat optimistic assumption that
1979 // active_lane_mask intrinsics are always free. In reality one may be
1980 // freely folded into a tail predicated loop, expanded into a VCTP or
1981 // expanded into a lot of add/icmp code. We may need to improve this in
1982 // the future, but being able to detect whether it is free or not involves
1983 // looking at a lot of other code. We currently assume that the vectorizer
1984 // inserted these, and knew what it was doing in adding one.
1985 if (ST->hasMVEIntegerOps())
1986 return 0;
1987 break;
1988 case Intrinsic::sadd_sat:
1989 case Intrinsic::ssub_sat:
1990 case Intrinsic::uadd_sat:
1991 case Intrinsic::usub_sat: {
1992 bool IsAdd = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
1993 bool IsSigned = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
1994 Type *RetTy = ICA.getReturnType();
1995
1996 if (auto *ITy = dyn_cast<IntegerType>(Val: RetTy)) {
1997 if (IsSigned && ST->hasDSP() && ITy->getBitWidth() == 32)
1998 return 1; // qadd / qsub
1999 if (ST->hasDSP() && (ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16))
2000 return 2; // uqadd16 / qadd16 / uqsub16 / qsub16 + possible extend.
2001 // Otherwise return the cost of expanding the node. Generally an add +
2002 // icmp + sel.
2003 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2004 Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1);
2005 return getArithmeticInstrCost(Opcode: IsAdd ? Instruction::Add : Instruction::Sub,
2006 Ty: RetTy, CostKind) +
2007 2 * getCmpSelInstrCost(Opcode: BinaryOperator::ICmp, ValTy: RetTy, CondTy, VecPred: Pred,
2008 CostKind) +
2009 2 * getCmpSelInstrCost(Opcode: BinaryOperator::Select, ValTy: RetTy, CondTy, VecPred: Pred,
2010 CostKind);
2011 }
2012
2013 if (!ST->hasMVEIntegerOps())
2014 break;
2015
2016 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);
2017 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2018 LT.second == MVT::v16i8) {
2019 // This is a base cost of 1 for the vqadd, plus 3 extra shifts if we
2020 // need to extend the type, as it uses shr(qadd(shl, shl)).
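      // For example, a saturating add on lanes narrower than the legal
      // element (e.g. a v4i8 op promoted to v4i32 lanes) becomes
      // shl + shl + vqadd + shr, i.e. 4 instructions rather than 1.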
2021 unsigned Instrs =
2022 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1
2023 : 4;
2024 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
2025 }
2026 break;
2027 }
2028 case Intrinsic::abs:
2029 case Intrinsic::smin:
2030 case Intrinsic::smax:
2031 case Intrinsic::umin:
2032 case Intrinsic::umax: {
2033 if (!ST->hasMVEIntegerOps())
2034 break;
2035 Type *VT = ICA.getReturnType();
2036
2037 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VT);
2038 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2039 LT.second == MVT::v16i8)
2040 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2041 break;
2042 }
2043 case Intrinsic::minnum:
2044 case Intrinsic::maxnum: {
2045 if (!ST->hasMVEFloatOps())
2046 break;
2047 Type *VT = ICA.getReturnType();
2048 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VT);
2049 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
2050 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2051 break;
2052 }
2053 case Intrinsic::fptosi_sat:
2054 case Intrinsic::fptoui_sat: {
2055 if (ICA.getArgTypes().empty())
2056 break;
2057 bool IsSigned = Opc == Intrinsic::fptosi_sat;
2058 auto LT = getTypeLegalizationCost(Ty: ICA.getArgTypes()[0]);
2059 EVT MTy = TLI->getValueType(DL, Ty: ICA.getReturnType());
2060 // Check for the legal types, with the correct subtarget features.
2061 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
2062 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
2063 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
2064 return LT.first;
2065
2066 // Equally for MVE vector types
2067 if (ST->hasMVEFloatOps() &&
2068 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
2069 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
2070 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2071
2072 // If we can, use a legal convert followed by a min+max.
2073 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
2074 (ST->hasFP64() && LT.second == MVT::f64) ||
2075 (ST->hasFullFP16() && LT.second == MVT::f16) ||
2076 (ST->hasMVEFloatOps() &&
2077 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
2078 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
2079 Type *LegalTy = Type::getIntNTy(C&: ICA.getReturnType()->getContext(),
2080 N: LT.second.getScalarSizeInBits());
2081 InstructionCost Cost =
2082 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
2083 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
2084 : Intrinsic::umin,
2085 LegalTy, {LegalTy, LegalTy});
2086 Cost += getIntrinsicInstrCost(ICA: Attrs1, CostKind);
2087 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
2088 : Intrinsic::umax,
2089 LegalTy, {LegalTy, LegalTy});
2090 Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
2091 return LT.first * Cost;
2092 }
2093 // Otherwise we need to follow the default expansion that clamps the value
2094 // using a float min/max, with an fcmp+sel for NaN handling when signed.
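    // For example, fptosi.sat from f32 to i64 (no single legal form) is
    // costed below as a minnum+maxnum clamp, the fptosi itself and, for the
    // signed case, an fcmp+select to force NaN inputs to zero.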
2095 Type *FPTy = ICA.getArgTypes()[0];
2096 Type *RetTy = ICA.getReturnType();
2097 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
2098 InstructionCost Cost = getIntrinsicInstrCost(ICA: Attrs1, CostKind);
2099 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
2100 Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
2101 Cost +=
2102 getCastInstrCost(Opcode: IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
2103 Dst: RetTy, Src: FPTy, CCH: TTI::CastContextHint::None, CostKind);
2104 if (IsSigned) {
2105 Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1);
2106 Cost += getCmpSelInstrCost(Opcode: BinaryOperator::FCmp, ValTy: FPTy, CondTy,
2107 VecPred: CmpInst::FCMP_UNO, CostKind);
2108 Cost += getCmpSelInstrCost(Opcode: BinaryOperator::Select, ValTy: RetTy, CondTy,
2109 VecPred: CmpInst::FCMP_UNO, CostKind);
2110 }
2111 return Cost;
2112 }
2113 }
2114
2115 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
2116}
2117
2118bool ARMTTIImpl::isLoweredToCall(const Function *F) const {
2119 if (!F->isIntrinsic())
2120 return BaseT::isLoweredToCall(F);
2121
2122 // Assume all Arm-specific intrinsics map to an instruction.
2123 if (F->getName().starts_with(Prefix: "llvm.arm"))
2124 return false;
2125
2126 switch (F->getIntrinsicID()) {
2127 default: break;
2128 case Intrinsic::powi:
2129 case Intrinsic::sin:
2130 case Intrinsic::cos:
2131 case Intrinsic::sincos:
2132 case Intrinsic::pow:
2133 case Intrinsic::log:
2134 case Intrinsic::log10:
2135 case Intrinsic::log2:
2136 case Intrinsic::exp:
2137 case Intrinsic::exp2:
2138 return true;
2139 case Intrinsic::sqrt:
2140 case Intrinsic::fabs:
2141 case Intrinsic::copysign:
2142 case Intrinsic::floor:
2143 case Intrinsic::ceil:
2144 case Intrinsic::trunc:
2145 case Intrinsic::rint:
2146 case Intrinsic::nearbyint:
2147 case Intrinsic::round:
2148 case Intrinsic::canonicalize:
2149 case Intrinsic::lround:
2150 case Intrinsic::llround:
2151 case Intrinsic::lrint:
2152 case Intrinsic::llrint:
2153 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
2154 return true;
2155 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
2156 return true;
2157 // Some operations can be handled by vector instructions and assume
2158 // unsupported vectors will be expanded into supported scalar ones.
2159 // TODO Handle scalar operations properly.
2160 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
2161 case Intrinsic::masked_store:
2162 case Intrinsic::masked_load:
2163 case Intrinsic::masked_gather:
2164 case Intrinsic::masked_scatter:
2165 return !ST->hasMVEIntegerOps();
2166 case Intrinsic::sadd_with_overflow:
2167 case Intrinsic::uadd_with_overflow:
2168 case Intrinsic::ssub_with_overflow:
2169 case Intrinsic::usub_with_overflow:
2170 case Intrinsic::sadd_sat:
2171 case Intrinsic::uadd_sat:
2172 case Intrinsic::ssub_sat:
2173 case Intrinsic::usub_sat:
2174 return false;
2175 }
2176
2177 return BaseT::isLoweredToCall(F);
2178}
2179
2180bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) const {
2181 unsigned ISD = TLI->InstructionOpcodeToISD(Opcode: I.getOpcode());
2182 EVT VT = TLI->getValueType(DL, Ty: I.getType(), AllowUnknown: true);
2183 if (TLI->getOperationAction(Op: ISD, VT) == TargetLowering::LibCall)
2184 return true;
2185
2186 // Check if an intrinsic will be lowered to a call and assume that any
2187 // other CallInst will generate a bl.
2188 if (auto *Call = dyn_cast<CallInst>(Val: &I)) {
2189 if (auto *II = dyn_cast<IntrinsicInst>(Val: Call)) {
2190 switch(II->getIntrinsicID()) {
2191 case Intrinsic::memcpy:
2192 case Intrinsic::memset:
2193 case Intrinsic::memmove:
2194 return getNumMemOps(I: II) == -1;
2195 default:
2196 if (const Function *F = Call->getCalledFunction())
2197 return isLoweredToCall(F);
2198 }
2199 }
2200 return true;
2201 }
2202
2203 // FPv5 provides conversions between integer, double-precision,
2204 // single-precision, and half-precision formats.
2205 switch (I.getOpcode()) {
2206 default:
2207 break;
2208 case Instruction::FPToSI:
2209 case Instruction::FPToUI:
2210 case Instruction::SIToFP:
2211 case Instruction::UIToFP:
2212 case Instruction::FPTrunc:
2213 case Instruction::FPExt:
2214 return !ST->hasFPARMv8Base();
2215 }
2216
2217 // FIXME: Unfortunately the approach of checking the Operation Action does
2218 // not catch all cases of Legalization that use library calls. Our
2219 // Legalization step categorizes some transformations into library calls as
2220 // Custom, Expand or even Legal when doing type legalization. So for now
2221 // we have to special case for instance the SDIV of 64bit integers and the
2222 // use of floating point emulation.
2223 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
2224 switch (ISD) {
2225 default:
2226 break;
2227 case ISD::SDIV:
2228 case ISD::UDIV:
2229 case ISD::SREM:
2230 case ISD::UREM:
2231 case ISD::SDIVREM:
2232 case ISD::UDIVREM:
2233 return true;
2234 }
2235 }
2236
2237 // Assume all other non-float operations are supported.
2238 if (!VT.isFloatingPoint())
2239 return false;
2240
2241 // We'll need a library call to handle most floats when using soft-float.
2242 if (TLI->useSoftFloat()) {
2243 switch (I.getOpcode()) {
2244 default:
2245 return true;
2246 case Instruction::Alloca:
2247 case Instruction::Load:
2248 case Instruction::Store:
2249 case Instruction::Select:
2250 case Instruction::PHI:
2251 return false;
2252 }
2253 }
2254
2255 // We'll need a libcall to perform double precision operations on a single
2256 // precision only FPU.
2257 if (I.getType()->isDoubleTy() && !ST->hasFP64())
2258 return true;
2259
2260 // Likewise for half precision arithmetic.
2261 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2262 return true;
2263
2264 return false;
2265}
2266
2267bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
2268 AssumptionCache &AC,
2269 TargetLibraryInfo *LibInfo,
2270 HardwareLoopInfo &HWLoopInfo) const {
2271 // Low-overhead branches are only supported in the 'low-overhead branch'
2272 // extension of v8.1-m.
2273 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2274 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2275 return false;
2276 }
2277
2278 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2279 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2280 return false;
2281 }
2282
2283 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2284 if (isa<SCEVCouldNotCompute>(Val: BackedgeTakenCount)) {
2285 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2286 return false;
2287 }
2288
2289 const SCEV *TripCountSCEV =
2290 SE.getAddExpr(LHS: BackedgeTakenCount,
2291 RHS: SE.getOne(Ty: BackedgeTakenCount->getType()));
2292
2293 // We need to store the trip count in LR, a 32-bit register.
2294 if (SE.getUnsignedRangeMax(S: TripCountSCEV).getBitWidth() > 32) {
2295 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2296 return false;
2297 }
2298
2299 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2300 // point in generating a hardware loop if that's going to happen.
2301
2302 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2303 if (auto *Call = dyn_cast<IntrinsicInst>(Val: &I)) {
2304 switch (Call->getIntrinsicID()) {
2305 default:
2306 break;
2307 case Intrinsic::start_loop_iterations:
2308 case Intrinsic::test_start_loop_iterations:
2309 case Intrinsic::loop_decrement:
2310 case Intrinsic::loop_decrement_reg:
2311 return true;
2312 }
2313 }
2314 return false;
2315 };
2316
2317 // Scan the instructions to see if there's any that we know will turn into a
2318 // call or if this loop is already a low-overhead loop or will become a tail
2319 // predicated loop.
2320 bool IsTailPredLoop = false;
2321 auto ScanLoop = [&](Loop *L) {
2322 for (auto *BB : L->getBlocks()) {
2323 for (auto &I : *BB) {
2324 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2325 isa<InlineAsm>(Val: I)) {
2326 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2327 return false;
2328 }
2329 if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
2330 IsTailPredLoop |=
2331 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2332 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2333 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2334 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2335 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2336 }
2337 }
2338 return true;
2339 };
2340
2341 // Visit inner loops.
2342 for (auto *Inner : *L)
2343 if (!ScanLoop(Inner))
2344 return false;
2345
2346 if (!ScanLoop(L))
2347 return false;
2348
2349 // TODO: Check whether the trip count calculation is expensive. If L is the
2350 // inner loop but we know it has a low trip count, calculating that trip
2351 // count (in the parent loop) may be detrimental.
2352
2353 LLVMContext &C = L->getHeader()->getContext();
2354 HWLoopInfo.CounterInReg = true;
2355 HWLoopInfo.IsNestingLegal = false;
2356 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2357 HWLoopInfo.CountType = Type::getInt32Ty(C);
2358 HWLoopInfo.LoopDecrement = ConstantInt::get(Ty: HWLoopInfo.CountType, V: 1);
2359 return true;
2360}
2361
2362static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2363 // We don't allow icmp's, and because we only look at single block loops,
2364 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2365 if (isa<ICmpInst>(Val: &I) && ++ICmpCount > 1)
2366 return false;
2367 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2368 // not currently canonical, but soon will be. Code without them uses icmp, and
2369 // so is not tail predicated as per the condition above. In order to get the
2370 // same performance we treat min and max the same as an icmp for tailpred
2371 // purposes for the moment (we often rely on non-tailpred and higher VF's to
2372 // pick more optimal instructions like VQDMULH. They need to be recognized
2373 // directly by the vectorizer).
2374 if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
2375 if ((II->getIntrinsicID() == Intrinsic::smin ||
2376 II->getIntrinsicID() == Intrinsic::smax ||
2377 II->getIntrinsicID() == Intrinsic::umin ||
2378 II->getIntrinsicID() == Intrinsic::umax) &&
2379 ++ICmpCount > 1)
2380 return false;
2381
2382 if (isa<FCmpInst>(Val: &I))
2383 return false;
2384
2385 // We could allow extending/narrowing FP loads/stores, but codegen is
2386 // too inefficient so reject this for now.
2387 if (isa<FPExtInst>(Val: &I) || isa<FPTruncInst>(Val: &I))
2388 return false;
2389
2390 // Extends have to be extending-loads
2391 if (isa<SExtInst>(Val: &I) || isa<ZExtInst>(Val: &I) )
2392 if (!I.getOperand(i: 0)->hasOneUse() || !isa<LoadInst>(Val: I.getOperand(i: 0)))
2393 return false;
2394
2395 // Truncs have to be narrowing-stores
2396 if (isa<TruncInst>(Val: &I) )
2397 if (!I.hasOneUse() || !isa<StoreInst>(Val: *I.user_begin()))
2398 return false;
2399
2400 return true;
2401}
2402
2403// To set up a tail-predicated loop, we need to know the total number of
2404// elements processed by that loop. Thus, we need to determine the element
2405// size and:
2406// 1) it should be uniform for all operations in the vector loop, so we
2407// e.g. don't want any widening/narrowing operations.
2408// 2) it should be smaller than i64s because we don't have vector operations
2409// that work on i64s.
2410// 3) we don't want elements to be reversed or shuffled, to make sure the
2411// tail-predication masks/predicates the right lanes.
2412//
2413static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2414 const DataLayout &DL,
2415 const LoopAccessInfo *LAI) {
2416 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2417
2418 // If there are live-out values, it is probably a reduction. We can predicate
2419 // most reduction operations freely under MVE using a combination of
2420 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2421 // floating point and integer reductions, but don't check for operators
2422 // specifically here. If the value ends up not being a reduction (and so the
2423 // vectorizer cannot tailfold the loop), we should fall back to standard
2424 // vectorization automatically.
2425 SmallVector< Instruction *, 8 > LiveOuts;
2426 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2427 bool ReductionsDisabled =
2428 EnableTailPredication == TailPredication::EnabledNoReductions ||
2429 EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2430
2431 for (auto *I : LiveOuts) {
2432 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2433 !I->getType()->isHalfTy()) {
2434 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2435 "live-out value\n");
2436 return false;
2437 }
2438 if (ReductionsDisabled) {
2439 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2440 return false;
2441 }
2442 }
2443
2444 // Next, check that all instructions can be tail-predicated.
2445 PredicatedScalarEvolution PSE = LAI->getPSE();
2446 int ICmpCount = 0;
2447
2448 for (BasicBlock *BB : L->blocks()) {
2449 for (Instruction &I : BB->instructionsWithoutDebug()) {
2450 if (isa<PHINode>(Val: &I))
2451 continue;
2452 if (!canTailPredicateInstruction(I, ICmpCount)) {
2453 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2454 return false;
2455 }
2456
2457 Type *T = I.getType();
2458 if (T->getScalarSizeInBits() > 32) {
2459 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2460 return false;
2461 }
2462 if (isa<StoreInst>(Val: I) || isa<LoadInst>(Val: I)) {
2463 Value *Ptr = getLoadStorePointerOperand(V: &I);
2464 Type *AccessTy = getLoadStoreType(I: &I);
2465 int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, Lp: L).value_or(u: 0);
2466 if (NextStride == 1) {
2467 // TODO: for now only allow consecutive strides of 1. We could support
2468 // other strides as long as they are uniform, but let's keep it simple
2469 // for now.
2470 continue;
2471 } else if (NextStride == -1 ||
2472 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2473 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2474 LLVM_DEBUG(dbgs()
2475 << "Consecutive strides of 2 found, vld2/vst2 can't "
2476 "be tail-predicated.\n");
2477 return false;
2478 // TODO: don't tail predicate if there is a reversed load?
2479 } else if (EnableMaskedGatherScatters) {
2480 // Gather/scatters do allow loading from arbitrary strides, at
2481 // least if they are loop invariant.
2482 // TODO: Loop variant strides should in theory work, too, but
2483 // this requires further testing.
2484 const SCEV *PtrScev = PSE.getSE()->getSCEV(V: Ptr);
2485 if (auto AR = dyn_cast<SCEVAddRecExpr>(Val: PtrScev)) {
2486 const SCEV *Step = AR->getStepRecurrence(SE&: *PSE.getSE());
2487 if (PSE.getSE()->isLoopInvariant(S: Step, L))
2488 continue;
2489 }
2490 }
2491 LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2492 "tail-predicate.\n");
2493 return false;
2494 }
2495 }
2496 }
2497
2498 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2499 return true;
2500}
2501
2502bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
2503 if (!EnableTailPredication) {
2504 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2505 return false;
2506 }
2507
2508 // Creating a predicated vector loop is the first step for generating a
2509 // tail-predicated hardware loop, for which we need the MVE masked
2510 // load/stores instructions:
2511 if (!ST->hasMVEIntegerOps())
2512 return false;
2513
2514 LoopVectorizationLegality *LVL = TFI->LVL;
2515 Loop *L = LVL->getLoop();
2516
2517 // For now, restrict this to single block loops.
2518 if (L->getNumBlocks() > 1) {
2519 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2520 "loop.\n");
2521 return false;
2522 }
2523
2524 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2525
2526 LoopInfo *LI = LVL->getLoopInfo();
2527 HardwareLoopInfo HWLoopInfo(L);
2528 if (!HWLoopInfo.canAnalyze(LI&: *LI)) {
2529 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2530 "analyzable.\n");
2531 return false;
2532 }
2533
2534 AssumptionCache *AC = LVL->getAssumptionCache();
2535 ScalarEvolution *SE = LVL->getScalarEvolution();
2536
2537 // This checks if we have the low-overhead branch architecture
2538 // extension, and if we will create a hardware-loop:
2539 if (!isHardwareLoopProfitable(L, SE&: *SE, AC&: *AC, LibInfo: TFI->TLI, HWLoopInfo)) {
2540 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2541 "profitable.\n");
2542 return false;
2543 }
2544
2545 DominatorTree *DT = LVL->getDominatorTree();
2546 if (!HWLoopInfo.isHardwareLoopCandidate(SE&: *SE, LI&: *LI, DT&: *DT)) {
2547 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2548 "a candidate.\n");
2549 return false;
2550 }
2551
2552 return canTailPredicateLoop(L, LI, SE&: *SE, DL, LAI: LVL->getLAI());
2553}
2554
2555TailFoldingStyle
2556ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
2557 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2558 return TailFoldingStyle::DataWithoutLaneMask;
2559
2560 // Intrinsic @llvm.get.active.lane.mask is supported.
2561 // It is used in the MVETailPredication pass, which requires the number of
2562 // elements processed by this vector loop to set up the tail-predicated
2563 // loop.
2564 return TailFoldingStyle::Data;
2565}

2566void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2567 TTI::UnrollingPreferences &UP,
2568 OptimizationRemarkEmitter *ORE) const {
2569 // Enable upper-bound unrolling universally, provided that we do not see an
2570 // active lane mask, which is better kept as a loop to become tail
2571 // predicated than to be conditionally unrolled.
2572 UP.UpperBound =
2573 !ST->hasMVEIntegerOps() || !any_of(Range&: *L->getHeader(), P: [](Instruction &I) {
2574 return isa<IntrinsicInst>(Val: I) &&
2575 cast<IntrinsicInst>(Val&: I).getIntrinsicID() ==
2576 Intrinsic::get_active_lane_mask;
2577 });
2578
2579 // Only currently enable these preferences for M-Class cores.
2580 if (!ST->isMClass())
2581 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2582
2583 // Disable loop unrolling for Oz and Os.
2584 UP.OptSizeThreshold = 0;
2585 UP.PartialOptSizeThreshold = 0;
2586 if (L->getHeader()->getParent()->hasOptSize())
2587 return;
2588
2589 SmallVector<BasicBlock*, 4> ExitingBlocks;
2590 L->getExitingBlocks(ExitingBlocks);
2591 LLVM_DEBUG(dbgs() << "Loop has:\n"
2592 << "Blocks: " << L->getNumBlocks() << "\n"
2593 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2594
2595 // Allow at most one exit other than the latch. This acts as an early exit
2596 // as it mirrors the profitability calculation of the runtime unroller.
2597 if (ExitingBlocks.size() > 2)
2598 return;
2599
2600 // Limit the CFG of the loop body for targets with a branch predictor.
2601 // Allowing 4 blocks permits if-then-else diamonds in the body.
2602 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2603 return;
2604
2605 // Don't unroll vectorized loops, including the remainder loop
2606 if (getBooleanLoopAttribute(TheLoop: L, Name: "llvm.loop.isvectorized"))
2607 return;
2608
2609 // Scan the loop: don't unroll loops with calls as this could prevent
2610 // inlining.
2611 InstructionCost Cost = 0;
2612 for (auto *BB : L->getBlocks()) {
2613 for (auto &I : *BB) {
2614 // Don't unroll vectorised loops. MVE does not benefit from unrolling as
2615 // much as scalar code does.
2616 if (I.getType()->isVectorTy())
2617 return;
2618
2619 if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I)) {
2620 if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction()) {
2621 if (!isLoweredToCall(F))
2622 continue;
2623 }
2624 return;
2625 }
2626
2627 SmallVector<const Value*, 4> Operands(I.operand_values());
2628 Cost += getInstructionCost(U: &I, Operands,
2629 CostKind: TargetTransformInfo::TCK_SizeAndLatency);
2630 }
2631 }
2632
2633 // On v6m cores, there are very few registers available. We can easily end up
2634 // spilling and reloading more registers in an unrolled loop. Look at the
2635 // number of LCSSA phis as a rough measure of how many registers will need to
2636 // be live out of the loop, reducing the default unroll count if more than 1
2637 // value is needed. In the long run, all of this should be learnt by a
2638 // machine.
2639 unsigned UnrollCount = 4;
2640 if (ST->isThumb1Only()) {
2641 unsigned ExitingValues = 0;
2642 SmallVector<BasicBlock *, 4> ExitBlocks;
2643 L->getExitBlocks(ExitBlocks);
2644 for (auto *Exit : ExitBlocks) {
2645 // Count the number of LCSSA phis. Exclude values coming from GEP's as
2646 // only the last is expected to be needed for address operands.
2647 unsigned LiveOuts = count_if(Range: Exit->phis(), P: [](auto &PH) {
2648 return PH.getNumOperands() != 1 ||
2649 !isa<GetElementPtrInst>(PH.getOperand(0));
2650 });
2651 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2652 }
2653 if (ExitingValues)
2654 UnrollCount /= ExitingValues;
2655 if (UnrollCount <= 1)
2656 return;
2657 }
2658
2659 // For processors with low overhead branching (LOB), runtime unrolling the
2660 // innermost loop is often detrimental to performance. In these cases the loop
2661 // remainder gets unrolled into a series of compare-and-jump blocks, which in
2662 // deeply nested loops get executed multiple times, negating the benefits of
2663 // LOB. This is particularly noticeable when the loop trip count of the
2664 // innermost loop varies within the outer loop, such as in the case of
2665 // triangular matrix decompositions. In these cases we will prefer to not
2666 // unroll the innermost loop, with the intention for it to be executed as a
2667 // low overhead loop.
2668 bool Runtime = true;
2669 if (ST->hasLOB()) {
2670 if (SE.hasLoopInvariantBackedgeTakenCount(L)) {
2671 const auto *BETC = SE.getBackedgeTakenCount(L);
2672 auto *Outer = L->getOutermostLoop();
2673 if ((L != Outer && Outer != L->getParentLoop()) ||
2674 (L != Outer && BETC && !SE.isLoopInvariant(S: BETC, L: Outer))) {
2675 Runtime = false;
2676 }
2677 }
2678 }
2679
2680 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2681 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2682
2683 UP.Partial = true;
2684 UP.Runtime = Runtime;
2685 UP.UnrollRemainder = true;
2686 UP.DefaultUnrollRuntimeCount = UnrollCount;
2687 UP.UnrollAndJam = true;
2688 UP.UnrollAndJamInnerLoopThreshold = 60;
2689
  // Forcing the unrolling of small loops can be very useful because of the
  // branch-taken cost of the backedge.
2692 if (Cost < 12)
2693 UP.Force = true;
2694}
2695
2696void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2697 TTI::PeelingPreferences &PP) const {
2698 BaseT::getPeelingPreferences(L, SE, PP);
2699}
2700
2701bool ARMTTIImpl::preferInLoopReduction(RecurKind Kind, Type *Ty) const {
2702 if (!ST->hasMVEIntegerOps())
2703 return false;
2704
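  // MVE provides vector add-reduction instructions (e.g. VADDV / VADDLV) that
  // accumulate into a scalar register, so in-loop integer add reductions are
  // generally profitable.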
2705 unsigned ScalarBits = Ty->getScalarSizeInBits();
2706 switch (Kind) {
2707 case RecurKind::Add:
2708 return ScalarBits <= 64;
2709 default:
2710 return false;
2711 }
2712}
2713
bool ARMTTIImpl::preferPredicatedReductionSelect() const {
  return ST->hasMVEIntegerOps();
}
2719
2720InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
2721 StackOffset BaseOffset,
2722 bool HasBaseReg, int64_t Scale,
2723 unsigned AddrSpace) const {
2724 TargetLoweringBase::AddrMode AM;
2725 AM.BaseGV = BaseGV;
2726 AM.BaseOffs = BaseOffset.getFixed();
2727 AM.HasBaseReg = HasBaseReg;
2728 AM.Scale = Scale;
2729 AM.ScalableOffset = BaseOffset.getScalable();
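  // e.g. (illustrative) an access of the form [r0, r1, lsl #2] corresponds to
  // HasBaseReg == true and Scale == 4, with no immediate offset.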
2730 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AS: AddrSpace)) {
2731 if (ST->hasFPAO())
2732 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2733 return 0;
2734 }
2735 return InstructionCost::getInvalid();
2736}
2737
2738bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2739 if (Thumb) {
2740 // B.W is available in any Thumb2-supporting target, and also in every
2741 // version of Armv8-M, even Baseline which does not include the rest of
2742 // Thumb2.
2743 return ST->isThumb2() || ST->hasV8MBaselineOps();
2744 } else {
2745 // B is available in all versions of the Arm ISA, so the only question is
2746 // whether that ISA is available at all.
2747 return ST->hasARMOps();
2748 }
2749}
2750
/// Check if Ext1 and Ext2 are extends (sext or zext) that double the bitwidth
/// of the vector elements.
2753static bool areExtractExts(Value *Ext1, Value *Ext2) {
2754 using namespace PatternMatch;
2755
2756 auto areExtDoubled = [](Instruction *Ext) {
2757 return Ext->getType()->getScalarSizeInBits() ==
2758 2 * Ext->getOperand(i: 0)->getType()->getScalarSizeInBits();
2759 };
2760
2761 if (!match(V: Ext1, P: m_ZExtOrSExt(Op: m_Value())) ||
2762 !match(V: Ext2, P: m_ZExtOrSExt(Op: m_Value())) ||
2763 !areExtDoubled(cast<Instruction>(Val: Ext1)) ||
2764 !areExtDoubled(cast<Instruction>(Val: Ext2)))
2765 return false;
2766
2767 return true;
2768}
2769
2770/// Check if sinking \p I's operands to I's basic block is profitable, because
2771/// the operands can be folded into a target instruction, e.g.
2772/// sext/zext can be folded into vsubl.
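/// For example (illustrative IR):
///   %a = sext <8 x i8> %x to <8 x i16>
///   %b = sext <8 x i8> %y to <8 x i16>
///   %s = sub <8 x i16> %a, %b   ; with the exts sunk this can select vsubl.s8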
2773bool ARMTTIImpl::isProfitableToSinkOperands(Instruction *I,
2774 SmallVectorImpl<Use *> &Ops) const {
2775 using namespace PatternMatch;
2776
2777 if (!I->getType()->isVectorTy())
2778 return false;
2779
2780 if (ST->hasNEON()) {
2781 switch (I->getOpcode()) {
2782 case Instruction::Sub:
2783 case Instruction::Add: {
2784 if (!areExtractExts(Ext1: I->getOperand(i: 0), Ext2: I->getOperand(i: 1)))
2785 return false;
2786 Ops.push_back(Elt: &I->getOperandUse(i: 0));
2787 Ops.push_back(Elt: &I->getOperandUse(i: 1));
2788 return true;
2789 }
2790 default:
2791 return false;
2792 }
2793 }
2794
2795 if (!ST->hasMVEIntegerOps())
2796 return false;
2797
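  // IsFMSMul: true if I is an fmul whose only use is the second operand of an
  // fsub, i.e. an 'a - b * c' pattern that is expected to become a fused
  // multiply-subtract.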
2798 auto IsFMSMul = [&](Instruction *I) {
2799 if (!I->hasOneUse())
2800 return false;
2801 auto *Sub = cast<Instruction>(Val: *I->users().begin());
2802 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(i: 1) == I;
2803 };
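  // IsFMS: true if either multiplicand of the fma intrinsic I is an fneg, so
  // the call is effectively a fused multiply-subtract.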
2804 auto IsFMS = [&](Instruction *I) {
2805 if (match(V: I->getOperand(i: 0), P: m_FNeg(X: m_Value())) ||
2806 match(V: I->getOperand(i: 1), P: m_FNeg(X: m_Value())))
2807 return true;
2808 return false;
2809 };
2810
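  // IsSinker: true if instruction I can make direct use of a splatted value in
  // operand position Operand. Most candidates accept the splat in either
  // position; subtracts, shifts and their saturating/predicated forms only as
  // the second operand.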
2811 auto IsSinker = [&](Instruction *I, int Operand) {
2812 switch (I->getOpcode()) {
2813 case Instruction::Add:
2814 case Instruction::Mul:
2815 case Instruction::FAdd:
2816 case Instruction::ICmp:
2817 case Instruction::FCmp:
2818 return true;
2819 case Instruction::FMul:
2820 return !IsFMSMul(I);
2821 case Instruction::Sub:
2822 case Instruction::FSub:
2823 case Instruction::Shl:
2824 case Instruction::LShr:
2825 case Instruction::AShr:
2826 return Operand == 1;
2827 case Instruction::Call:
2828 if (auto *II = dyn_cast<IntrinsicInst>(Val: I)) {
2829 switch (II->getIntrinsicID()) {
2830 case Intrinsic::fma:
2831 return !IsFMS(I);
2832 case Intrinsic::sadd_sat:
2833 case Intrinsic::uadd_sat:
2834 case Intrinsic::arm_mve_add_predicated:
2835 case Intrinsic::arm_mve_mul_predicated:
2836 case Intrinsic::arm_mve_qadd_predicated:
2837 case Intrinsic::arm_mve_vhadd:
2838 case Intrinsic::arm_mve_hadd_predicated:
2839 case Intrinsic::arm_mve_vqdmull:
2840 case Intrinsic::arm_mve_vqdmull_predicated:
2841 case Intrinsic::arm_mve_vqdmulh:
2842 case Intrinsic::arm_mve_qdmulh_predicated:
2843 case Intrinsic::arm_mve_vqrdmulh:
2844 case Intrinsic::arm_mve_qrdmulh_predicated:
2845 case Intrinsic::arm_mve_fma_predicated:
2846 return true;
2847 case Intrinsic::ssub_sat:
2848 case Intrinsic::usub_sat:
2849 case Intrinsic::arm_mve_sub_predicated:
2850 case Intrinsic::arm_mve_qsub_predicated:
2851 case Intrinsic::arm_mve_hsub_predicated:
2852 case Intrinsic::arm_mve_vhsub:
2853 return Operand == 1;
2854 default:
2855 return false;
2856 }
2857 }
2858 return false;
2859 default:
2860 return false;
2861 }
2862 };
2863
2864 for (auto OpIdx : enumerate(First: I->operands())) {
2865 Instruction *Op = dyn_cast<Instruction>(Val: OpIdx.value().get());
    // Make sure we are not already sinking this operand.
2867 if (!Op || any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op; }))
2868 continue;
2869
2870 Instruction *Shuffle = Op;
2871 if (Shuffle->getOpcode() == Instruction::BitCast)
2872 Shuffle = dyn_cast<Instruction>(Val: Shuffle->getOperand(i: 0));
2873 // We are looking for a splat that can be sunk.
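    // e.g. (illustrative IR):
    //   %ins   = insertelement <4 x i32> undef, i32 %s, i32 0
    //   %splat = shufflevector <4 x i32> %ins, <4 x i32> undef,
    //                          <4 x i32> zeroinitializer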
2874 if (!Shuffle || !match(V: Shuffle, P: m_Shuffle(v1: m_InsertElt(Val: m_Undef(), Elt: m_Value(),
2875 Idx: m_ZeroInt()),
2876 v2: m_Undef(), mask: m_ZeroMask())))
2877 continue;
2878 if (!IsSinker(I, OpIdx.index()))
2879 continue;
2880
    // All uses of the shuffle should be sunk to avoid duplicating it across
    // GPR and vector registers.
2883 for (Use &U : Op->uses()) {
2884 Instruction *Insn = cast<Instruction>(Val: U.getUser());
2885 if (!IsSinker(Insn, U.getOperandNo()))
2886 return false;
2887 }
2888
2889 Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0));
2890 if (Shuffle != Op)
2891 Ops.push_back(Elt: &Op->getOperandUse(i: 0));
2892 Ops.push_back(Elt: &OpIdx.value());
2893 }
2894 return true;
2895}
2896
2897unsigned ARMTTIImpl::getNumBytesToPadGlobalArray(unsigned Size,
2898 Type *ArrayType) const {
2899 if (!UseWidenGlobalArrays) {
2900 LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
    return 0;
2902 }
2903
  // Don't modify non-integer array types.
2905 if (!ArrayType || !ArrayType->isArrayTy() ||
2906 !ArrayType->getArrayElementType()->isIntegerTy())
2907 return 0;
2908
  // We pad to 4-byte boundaries.
2910 if (Size % 4 == 0)
2911 return 0;
2912
2913 unsigned NumBytesToPad = 4 - (Size % 4);
2914 unsigned NewSize = Size + NumBytesToPad;
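  // e.g. (illustrative) a 13-byte array gets NumBytesToPad == 3 and
  // NewSize == 16.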
2915
  // Maximum number of bytes that memcpy is allowed to lower to loads/stores
  // before it falls back to a library call (__aeabi_memcpy).
2918 unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold();
2919
2920 if (NewSize > MaxMemIntrinsicSize)
2921 return 0;
2922
2923 return NumBytesToPad;
2924}
2925