1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "ARMTargetTransformInfo.h"
10#include "ARMSubtarget.h"
11#include "MCTargetDesc/ARMAddressingModes.h"
12#include "llvm/ADT/APInt.h"
13#include "llvm/ADT/SmallVector.h"
14#include "llvm/Analysis/LoopInfo.h"
15#include "llvm/CodeGen/CostTable.h"
16#include "llvm/CodeGen/ISDOpcodes.h"
17#include "llvm/CodeGen/ValueTypes.h"
18#include "llvm/CodeGenTypes/MachineValueType.h"
19#include "llvm/IR/BasicBlock.h"
20#include "llvm/IR/DataLayout.h"
21#include "llvm/IR/DerivedTypes.h"
22#include "llvm/IR/Instruction.h"
23#include "llvm/IR/Instructions.h"
24#include "llvm/IR/IntrinsicInst.h"
25#include "llvm/IR/Intrinsics.h"
26#include "llvm/IR/IntrinsicsARM.h"
27#include "llvm/IR/PatternMatch.h"
28#include "llvm/IR/Type.h"
29#include "llvm/Support/Casting.h"
30#include "llvm/Support/KnownBits.h"
31#include "llvm/Target/TargetMachine.h"
32#include "llvm/TargetParser/SubtargetFeature.h"
33#include "llvm/Transforms/InstCombine/InstCombiner.h"
34#include "llvm/Transforms/Utils/Local.h"
35#include "llvm/Transforms/Utils/LoopUtils.h"
36#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
37#include <algorithm>
38#include <cassert>
39#include <cstdint>
40#include <optional>
41#include <utility>
42
43using namespace llvm;
44
45#define DEBUG_TYPE "armtti"
46
47static cl::opt<bool> EnableMaskedLoadStores(
48 "enable-arm-maskedldst", cl::Hidden, cl::init(Val: true),
49 cl::desc("Enable the generation of masked loads and stores"));
50
51static cl::opt<bool> DisableLowOverheadLoops(
52 "disable-arm-loloops", cl::Hidden, cl::init(Val: false),
53 cl::desc("Disable the generation of low-overhead loops"));
54
55static cl::opt<bool>
56 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(Val: true),
57 cl::desc("Enable the generation of WLS loops"));
58
59static cl::opt<bool> UseWidenGlobalArrays(
60 "widen-global-strings", cl::Hidden, cl::init(Val: true),
61 cl::desc("Enable the widening of global strings to alignment boundaries"));
62
63extern cl::opt<TailPredication::Mode> EnableTailPredication;
64
65extern cl::opt<bool> EnableMaskedGatherScatters;
66
67extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
68
69static cl::opt<int> ArmForceUnrollThreshold(
70 "arm-force-unroll-threshold", cl::init(Val: 12), cl::Hidden,
71 cl::desc(
72 "Threshold for forced unrolling of small loops in Arm architecture"));
73
74/// Convert a vector load intrinsic into a simple llvm load instruction.
75/// This is beneficial when the underlying object being addressed comes
76/// from a constant, since we get constant-folding for free.
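/// For example (an illustrative sketch), a call along the lines of
///   %v = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr %p, i32 16)
/// becomes a plain `load <4 x i32>, ptr %p, align 16`, which later passes can
/// constant-fold when %p refers to a constant global.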
77static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
78 InstCombiner::BuilderTy &Builder) {
79 auto *IntrAlign = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 1));
80
81 if (!IntrAlign)
82 return nullptr;
83
84 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
85 ? MemAlign
86 : IntrAlign->getLimitedValue();
87
88 if (!isPowerOf2_32(Value: Alignment))
89 return nullptr;
90
91 return Builder.CreateAlignedLoad(Ty: II.getType(), Ptr: II.getArgOperand(i: 0),
92 Align: Align(Alignment));
93}
94
95bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
96 const Function *Callee) const {
97 const TargetMachine &TM = getTLI()->getTargetMachine();
98 const FeatureBitset &CallerBits =
99 TM.getSubtargetImpl(*Caller)->getFeatureBits();
100 const FeatureBitset &CalleeBits =
101 TM.getSubtargetImpl(*Callee)->getFeatureBits();
102
  // To inline a callee, all features not in the allowed list must match
  // exactly.
  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
                    (CalleeBits & ~InlineFeaturesAllowed);
  // For features in the allowed list, the callee's features must be a subset
  // of the caller's.
108 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
109 (CalleeBits & InlineFeaturesAllowed);
110
111 LLVM_DEBUG({
112 if (!MatchExact || !MatchSubset) {
113 dbgs() << "=== Inline compatibility debug ===\n";
114 dbgs() << "Caller: " << Caller->getName() << "\n";
115 dbgs() << "Callee: " << Callee->getName() << "\n";
116
117 // Bit diffs
118 FeatureBitset MissingInCaller = CalleeBits & ~CallerBits; // callee-only
119 FeatureBitset ExtraInCaller = CallerBits & ~CalleeBits; // caller-only
120
121 // Counts
122 dbgs() << "Only-in-caller bit count: " << ExtraInCaller.count() << "\n";
123 dbgs() << "Only-in-callee bit count: " << MissingInCaller.count() << "\n";
124
125 dbgs() << "Only-in-caller feature indices [";
126 {
127 bool First = true;
128 for (size_t I = 0, E = ExtraInCaller.size(); I < E; ++I) {
129 if (ExtraInCaller.test(I)) {
130 if (!First)
131 dbgs() << ", ";
132 dbgs() << I;
133 First = false;
134 }
135 }
136 }
137 dbgs() << "]\n";
138
139 dbgs() << "Only-in-callee feature indices [";
140 {
141 bool First = true;
142 for (size_t I = 0, E = MissingInCaller.size(); I < E; ++I) {
143 if (MissingInCaller.test(I)) {
144 if (!First)
145 dbgs() << ", ";
146 dbgs() << I;
147 First = false;
148 }
149 }
150 }
151 dbgs() << "]\n";
152
153 // Indices map to features as found in
154 // llvm-project/(your_build)/lib/Target/ARM/ARMGenSubtargetInfo.inc
155 dbgs() << "MatchExact=" << (MatchExact ? "true" : "false")
156 << " MatchSubset=" << (MatchSubset ? "true" : "false") << "\n";
157 }
158 });
159 return MatchExact && MatchSubset;
160}
161
162TTI::AddressingModeKind
163ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
164 ScalarEvolution *SE) const {
165 if (ST->hasMVEIntegerOps())
166 return TTI::AMK_PostIndexed;
167
168 if (L->getHeader()->getParent()->hasOptSize())
169 return TTI::AMK_None;
170
  if (ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1)
173 return TTI::AMK_PreIndexed;
174
175 return TTI::AMK_None;
176}
177
178std::optional<Instruction *>
179ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
180 using namespace PatternMatch;
181 Intrinsic::ID IID = II.getIntrinsicID();
182 switch (IID) {
183 default:
184 break;
185 case Intrinsic::arm_neon_vld1: {
186 Align MemAlign =
187 getKnownAlignment(V: II.getArgOperand(i: 0), DL: IC.getDataLayout(), CxtI: &II,
188 AC: &IC.getAssumptionCache(), DT: &IC.getDominatorTree());
189 if (Value *V = simplifyNeonVld1(II, MemAlign: MemAlign.value(), Builder&: IC.Builder)) {
190 return IC.replaceInstUsesWith(I&: II, V);
191 }
192 break;
193 }
194
195 case Intrinsic::arm_neon_vld2:
196 case Intrinsic::arm_neon_vld3:
197 case Intrinsic::arm_neon_vld4:
198 case Intrinsic::arm_neon_vld2lane:
199 case Intrinsic::arm_neon_vld3lane:
200 case Intrinsic::arm_neon_vld4lane:
201 case Intrinsic::arm_neon_vst1:
202 case Intrinsic::arm_neon_vst2:
203 case Intrinsic::arm_neon_vst3:
204 case Intrinsic::arm_neon_vst4:
205 case Intrinsic::arm_neon_vst2lane:
206 case Intrinsic::arm_neon_vst3lane:
207 case Intrinsic::arm_neon_vst4lane: {
208 Align MemAlign =
209 getKnownAlignment(V: II.getArgOperand(i: 0), DL: IC.getDataLayout(), CxtI: &II,
210 AC: &IC.getAssumptionCache(), DT: &IC.getDominatorTree());
211 unsigned AlignArg = II.arg_size() - 1;
212 Value *AlignArgOp = II.getArgOperand(i: AlignArg);
213 MaybeAlign Align = cast<ConstantInt>(Val: AlignArgOp)->getMaybeAlignValue();
214 if (Align && *Align < MemAlign) {
215 return IC.replaceOperand(
216 I&: II, OpNum: AlignArg,
217 V: ConstantInt::get(Ty: Type::getInt32Ty(C&: II.getContext()), V: MemAlign.value(),
218 IsSigned: false));
219 }
220 break;
221 }
222
223 case Intrinsic::arm_neon_vld1x2:
224 case Intrinsic::arm_neon_vld1x3:
225 case Intrinsic::arm_neon_vld1x4:
226 case Intrinsic::arm_neon_vst1x2:
227 case Intrinsic::arm_neon_vst1x3:
228 case Intrinsic::arm_neon_vst1x4: {
229 Align NewAlign =
230 getKnownAlignment(V: II.getArgOperand(i: 0), DL: IC.getDataLayout(), CxtI: &II,
231 AC: &IC.getAssumptionCache(), DT: &IC.getDominatorTree());
232 Align OldAlign = II.getParamAlign(ArgNo: 0).valueOrOne();
233 if (NewAlign > OldAlign)
234 II.addParamAttr(ArgNo: 0,
235 Attr: Attribute::getWithAlignment(Context&: II.getContext(), Alignment: NewAlign));
236 break;
237 }
238
239 case Intrinsic::arm_mve_pred_i2v: {
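    // This folds a round trip through the predicate representation:
    // converting a vector predicate to an i32 and back yields the original
    // vector, and an intervening XOR with all-ones becomes a vector NOT.
    // Roughly:
    //   i2v(v2i(x))          -> x
    //   i2v(v2i(x) ^ 0xffff) -> xor x, splat(true)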
240 Value *Arg = II.getArgOperand(i: 0);
241 Value *ArgArg;
242 if (match(V: Arg, P: PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
243 Op0: PatternMatch::m_Value(V&: ArgArg))) &&
244 II.getType() == ArgArg->getType()) {
245 return IC.replaceInstUsesWith(I&: II, V: ArgArg);
246 }
247 Constant *XorMask;
248 if (match(V: Arg, P: m_Xor(L: PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
249 Op0: PatternMatch::m_Value(V&: ArgArg)),
250 R: PatternMatch::m_Constant(C&: XorMask))) &&
251 II.getType() == ArgArg->getType()) {
252 if (auto *CI = dyn_cast<ConstantInt>(Val: XorMask)) {
253 if (CI->getValue().trunc(width: 16).isAllOnes()) {
254 auto TrueVector = IC.Builder.CreateVectorSplat(
255 NumElts: cast<FixedVectorType>(Val: II.getType())->getNumElements(),
256 V: IC.Builder.getTrue());
257 return BinaryOperator::Create(Op: Instruction::Xor, S1: ArgArg, S2: TrueVector);
258 }
259 }
260 }
261 KnownBits ScalarKnown(32);
262 if (IC.SimplifyDemandedBits(I: &II, OpNo: 0, DemandedMask: APInt::getLowBitsSet(numBits: 32, loBitsSet: 16),
263 Known&: ScalarKnown)) {
264 return &II;
265 }
266 break;
267 }
268 case Intrinsic::arm_mve_pred_v2i: {
269 Value *Arg = II.getArgOperand(i: 0);
270 Value *ArgArg;
271 if (match(V: Arg, P: PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
272 Op0: PatternMatch::m_Value(V&: ArgArg)))) {
273 return IC.replaceInstUsesWith(I&: II, V: ArgArg);
274 }
275
276 if (II.getMetadata(KindID: LLVMContext::MD_range))
277 break;
278
279 ConstantRange Range(APInt(32, 0), APInt(32, 0x10000));
280
281 if (auto CurrentRange = II.getRange()) {
282 Range = Range.intersectWith(CR: *CurrentRange);
283 if (Range == CurrentRange)
284 break;
285 }
286
287 II.addRangeRetAttr(CR: Range);
288 II.addRetAttr(Kind: Attribute::NoUndef);
289 return &II;
290 }
291 case Intrinsic::arm_mve_vadc:
292 case Intrinsic::arm_mve_vadc_predicated: {
293 unsigned CarryOp =
294 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
295 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
296 "Bad type for intrinsic!");
297
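    // Only bit 29 of the carry operand is consumed (matching the position of
    // the carry flag in FPSCR), so only that bit needs to be demanded.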
298 KnownBits CarryKnown(32);
299 if (IC.SimplifyDemandedBits(I: &II, OpNo: CarryOp, DemandedMask: APInt::getOneBitSet(numBits: 32, BitNo: 29),
300 Known&: CarryKnown)) {
301 return &II;
302 }
303 break;
304 }
305 case Intrinsic::arm_mve_vmldava: {
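    // If the only use of a vmldava with a zero accumulator is an add, the
    // added value can be folded into the accumulator operand instead.
    // Roughly: add(vmldava(acc=0, x, y), z) -> vmldava(acc=z, x, y).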
306 Instruction *I = cast<Instruction>(Val: &II);
307 if (I->hasOneUse()) {
308 auto *User = cast<Instruction>(Val: *I->user_begin());
309 Value *OpZ;
310 if (match(V: User, P: m_c_Add(L: m_Specific(V: I), R: m_Value(V&: OpZ))) &&
311 match(V: I->getOperand(i: 3), P: m_Zero())) {
312 Value *OpX = I->getOperand(i: 4);
313 Value *OpY = I->getOperand(i: 5);
314 Type *OpTy = OpX->getType();
315
316 IC.Builder.SetInsertPoint(User);
317 Value *V =
318 IC.Builder.CreateIntrinsic(ID: Intrinsic::arm_mve_vmldava, Types: {OpTy},
319 Args: {I->getOperand(i: 0), I->getOperand(i: 1),
320 I->getOperand(i: 2), OpZ, OpX, OpY});
321
322 IC.replaceInstUsesWith(I&: *User, V);
323 return IC.eraseInstFromFunction(I&: *User);
324 }
325 }
326 return std::nullopt;
327 }
328 }
329 return std::nullopt;
330}
331
332std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
333 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
334 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
335 std::function<void(Instruction *, unsigned, APInt, APInt &)>
336 SimplifyAndSetOp) const {
337
  // Compute the demanded elements for a narrowing MVE intrinsic. TopOpc is
  // the index of the operand that selects the Top/Bottom form, which can
  // differ between intrinsics.
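  // For example, a "top" narrowing such as vmovnt overwrites the odd lanes
  // of the destination, so only the even (bottom) lanes of operand 0 remain
  // demanded.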
  auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
342 unsigned NumElts = cast<FixedVectorType>(Val: II.getType())->getNumElements();
343 unsigned IsTop = cast<ConstantInt>(Val: II.getOperand(i_nocapture: TopOpc))->getZExtValue();
344
    // Only the odd or even lanes of operand 0 are demanded, depending on
    // whether this is a top or bottom instruction.
347 APInt DemandedElts =
348 APInt::getSplat(NewLen: NumElts, V: IsTop ? APInt::getLowBitsSet(numBits: 2, loBitsSet: 1)
349 : APInt::getHighBitsSet(numBits: 2, hiBitsSet: 1));
350 SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
351 // The other lanes will be defined from the inserted elements.
352 UndefElts &= APInt::getSplat(NewLen: NumElts, V: IsTop ? APInt::getLowBitsSet(numBits: 2, loBitsSet: 1)
353 : APInt::getHighBitsSet(numBits: 2, hiBitsSet: 1));
354 return std::nullopt;
355 };
356
357 switch (II.getIntrinsicID()) {
358 default:
359 break;
360 case Intrinsic::arm_mve_vcvt_narrow:
361 SimplifyNarrowInstrTopBottom(2);
362 break;
363 case Intrinsic::arm_mve_vqmovn:
364 SimplifyNarrowInstrTopBottom(4);
365 break;
366 case Intrinsic::arm_mve_vshrn:
367 SimplifyNarrowInstrTopBottom(7);
368 break;
369 }
370
371 return std::nullopt;
372}
373
374InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
375 TTI::TargetCostKind CostKind) const {
376 assert(Ty->isIntegerTy());
377
378 unsigned Bits = Ty->getPrimitiveSizeInBits();
379 if (Bits == 0 || Imm.getActiveBits() >= 64)
380 return 4;
381
382 int64_t SImmVal = Imm.getSExtValue();
383 uint64_t ZImmVal = Imm.getZExtValue();
384 if (!ST->isThumb()) {
385 if ((SImmVal >= 0 && SImmVal < 65536) ||
386 (ARM_AM::getSOImmVal(Arg: ZImmVal) != -1) ||
387 (ARM_AM::getSOImmVal(Arg: ~ZImmVal) != -1))
388 return 1;
389 return ST->hasV6T2Ops() ? 2 : 3;
390 }
391 if (ST->isThumb2()) {
392 if ((SImmVal >= 0 && SImmVal < 65536) ||
393 (ARM_AM::getT2SOImmVal(Arg: ZImmVal) != -1) ||
394 (ARM_AM::getT2SOImmVal(Arg: ~ZImmVal) != -1))
395 return 1;
396 return ST->hasV6T2Ops() ? 2 : 3;
397 }
  // Thumb1: any i8 immediate costs 1.
399 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
400 return 1;
401 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(V: ZImmVal))
402 return 2;
403 // Load from constantpool.
404 return 3;
405}
406
// Constants smaller than 256 fit in the immediate field of Thumb1
// instructions, so we return a cost of zero; otherwise we return 1.
409InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
410 const APInt &Imm,
411 Type *Ty) const {
412 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
413 return 0;
414
415 return 1;
416}
417
418// Checks whether Inst is part of a min(max()) or max(min()) pattern
419// that will match to an SSAT instruction. Returns the instruction being
420// saturated, or null if no saturation pattern was found.
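// For example (an illustrative case), smin(smax(x, -2048), 2047) clamps x to
// a signed 12-bit range and can be selected as "ssat #12", so the -2048
// constant needs no separate materialisation.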
421static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
422 Value *LHS, *RHS;
423 ConstantInt *C;
424 SelectPatternFlavor InstSPF = matchSelectPattern(V: Inst, LHS, RHS).Flavor;
425
426 if (InstSPF == SPF_SMAX &&
427 PatternMatch::match(V: RHS, P: PatternMatch::m_ConstantInt(CI&: C)) &&
428 C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
429
430 auto isSSatMin = [&](Value *MinInst) {
431 if (isa<SelectInst>(Val: MinInst)) {
432 Value *MinLHS, *MinRHS;
433 ConstantInt *MinC;
434 SelectPatternFlavor MinSPF =
435 matchSelectPattern(V: MinInst, LHS&: MinLHS, RHS&: MinRHS).Flavor;
436 if (MinSPF == SPF_SMIN &&
437 PatternMatch::match(V: MinRHS, P: PatternMatch::m_ConstantInt(CI&: MinC)) &&
438 MinC->getValue() == ((-Imm) - 1))
439 return true;
440 }
441 return false;
442 };
443
444 if (isSSatMin(Inst->getOperand(i: 1)))
445 return cast<Instruction>(Val: Inst->getOperand(i: 1))->getOperand(i: 1);
446 if (Inst->hasNUses(N: 2) &&
447 (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
448 return Inst->getOperand(i: 1);
449 }
450 return nullptr;
451}
452
// Look for an FP saturation pattern, where the instruction can be simplified
// to a fptosi.sat: max(min(fptosi)). The constant here is always free.
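// For example (roughly), smax(smin(fptosi(%x), 2147483647), -2147483648) can
// be selected as a saturating fptosi.sat, so hoisting the INT32_MIN constant
// out of the pattern would only pessimise it.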
455static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
456 if (Imm.getBitWidth() != 64 ||
457 Imm != APInt::getHighBitsSet(numBits: 64, hiBitsSet: 33)) // -2147483648
458 return false;
459 Value *FP = isSSATMinMaxPattern(Inst, Imm);
460 if (!FP && isa<ICmpInst>(Val: Inst) && Inst->hasOneUse())
461 FP = isSSATMinMaxPattern(Inst: cast<Instruction>(Val: *Inst->user_begin()), Imm);
462 if (!FP)
463 return false;
464 return isa<FPToSIInst>(Val: FP);
465}
466
467InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
468 const APInt &Imm, Type *Ty,
469 TTI::TargetCostKind CostKind,
470 Instruction *Inst) const {
471 // Division by a constant can be turned into multiplication, but only if we
472 // know it's constant. So it's not so much that the immediate is cheap (it's
473 // not), but that the alternative is worse.
474 // FIXME: this is probably unneeded with GlobalISel.
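  // For example, a udiv by 10 is typically lowered to a multiply-high by a
  // "magic" constant plus shifts, so the divisor never needs materialising as
  // an immediate.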
475 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
476 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
477 Idx == 1)
478 return 0;
479
  // Leave any gep offsets to CodeGenPrepare, which will do a better job of
  // splitting any large offsets.
482 if (Opcode == Instruction::GetElementPtr && Idx != 0)
483 return 0;
484
485 if (Opcode == Instruction::And) {
486 // UXTB/UXTH
487 if (Imm == 255 || Imm == 65535)
488 return 0;
489 // Conversion to BIC is free, and means we can use ~Imm instead.
490 return std::min(a: getIntImmCost(Imm, Ty, CostKind),
491 b: getIntImmCost(Imm: ~Imm, Ty, CostKind));
492 }
493
494 if (Opcode == Instruction::Add)
495 // Conversion to SUB is free, and means we can use -Imm instead.
496 return std::min(a: getIntImmCost(Imm, Ty, CostKind),
497 b: getIntImmCost(Imm: -Imm, Ty, CostKind));
498
499 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
500 Ty->getIntegerBitWidth() == 32) {
501 int64_t NegImm = -Imm.getSExtValue();
502 if (ST->isThumb2() && NegImm < 1<<12)
503 // icmp X, #-C -> cmn X, #C
504 return 0;
505 if (ST->isThumb() && NegImm < 1<<8)
506 // icmp X, #-C -> adds X, #C
507 return 0;
508 }
509
510 // xor a, -1 can always be folded to MVN
511 if (Opcode == Instruction::Xor && Imm.isAllOnes())
512 return 0;
513
  // Ensure that negative constants in min(max()) or max(min()) patterns that
  // match SSAT instructions don't get hoisted.
516 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
517 Ty->getIntegerBitWidth() <= 32) {
518 if (isSSATMinMaxPattern(Inst, Imm) ||
519 (isa<ICmpInst>(Val: Inst) && Inst->hasOneUse() &&
520 isSSATMinMaxPattern(Inst: cast<Instruction>(Val: *Inst->user_begin()), Imm)))
521 return 0;
522 }
523
524 if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
525 return 0;
526
527 // We can convert <= -1 to < 0, which is generally quite cheap.
528 if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
529 ICmpInst::Predicate Pred = cast<ICmpInst>(Val: Inst)->getPredicate();
530 if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
531 return std::min(a: getIntImmCost(Imm, Ty, CostKind),
532 b: getIntImmCost(Imm: Imm + 1, Ty, CostKind));
533 }
534
535 return getIntImmCost(Imm, Ty, CostKind);
536}
537
538InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
539 TTI::TargetCostKind CostKind,
540 const Instruction *I) const {
541 if (CostKind == TTI::TCK_RecipThroughput &&
542 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
    // FIXME: The vectorizer is highly sensitive to the cost of these
544 // instructions, which suggests that it may be using the costs incorrectly.
545 // But, for now, just make them free to avoid performance regressions for
546 // vector targets.
547 return 0;
548 }
549 return BaseT::getCFInstrCost(Opcode, CostKind, I);
550}
551
552InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
553 Type *Src,
554 TTI::CastContextHint CCH,
555 TTI::TargetCostKind CostKind,
556 const Instruction *I) const {
557 int ISD = TLI->InstructionOpcodeToISD(Opcode);
558 assert(ISD && "Invalid opcode");
559
560 // TODO: Allow non-throughput costs that aren't binary.
561 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
562 if (CostKind != TTI::TCK_RecipThroughput)
563 return Cost == 0 ? 0 : 1;
564 return Cost;
565 };
566 auto IsLegalFPType = [this](EVT VT) {
567 EVT EltVT = VT.getScalarType();
568 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
569 (EltVT == MVT::f64 && ST->hasFP64()) ||
570 (EltVT == MVT::f16 && ST->hasFullFP16());
571 };
572
573 EVT SrcTy = TLI->getValueType(DL, Ty: Src);
574 EVT DstTy = TLI->getValueType(DL, Ty: Dst);
575
576 if (!SrcTy.isSimple() || !DstTy.isSimple())
577 return AdjustCost(
578 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
579
  // Extending masked loads and truncating masked stores are expensive because
  // we currently don't split them. This means that we'll likely end up
  // loading/storing each element individually (hence the high cost).
583 if ((ST->hasMVEIntegerOps() &&
584 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
585 Opcode == Instruction::SExt)) ||
586 (ST->hasMVEFloatOps() &&
587 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
588 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
589 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
590 return 2 * DstTy.getVectorNumElements() *
591 ST->getMVEVectorCostFactor(CostKind);
592
  // The extension of other kinds of loads is free.
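  // For example, ldrb/ldrsb and ldrh/ldrsh perform the zero/sign extension as
  // part of the load itself.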
594 if (CCH == TTI::CastContextHint::Normal ||
595 CCH == TTI::CastContextHint::Masked) {
596 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
597 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::i32, .Src: MVT::i16, .Cost: 0},
598 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::i32, .Src: MVT::i16, .Cost: 0},
599 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::i32, .Src: MVT::i8, .Cost: 0},
600 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::i32, .Src: MVT::i8, .Cost: 0},
601 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::i16, .Src: MVT::i8, .Cost: 0},
602 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::i16, .Src: MVT::i8, .Cost: 0},
603 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::i64, .Src: MVT::i32, .Cost: 1},
604 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::i64, .Src: MVT::i32, .Cost: 1},
605 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::i64, .Src: MVT::i16, .Cost: 1},
606 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::i64, .Src: MVT::i16, .Cost: 1},
607 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::i64, .Src: MVT::i8, .Cost: 1},
608 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::i64, .Src: MVT::i8, .Cost: 1},
609 };
610 if (const auto *Entry = ConvertCostTableLookup(
611 Table: LoadConversionTbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
612 return AdjustCost(Entry->Cost);
613
614 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
615 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 0},
616 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 0},
617 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 0},
618 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 0},
619 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 0},
620 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 0},
        // The following extend from a legal type to an illegal type, so we
        // need to split the load. This introduces an extra load operation,
        // but the extend is still "free".
624 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 1},
625 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 1},
626 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 3},
627 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 3},
628 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 1},
629 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 1},
630 };
631 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
632 if (const auto *Entry =
633 ConvertCostTableLookup(Table: MVELoadConversionTbl, ISD,
634 Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
635 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
636 }
637
638 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
639 // FPExtends are similar but also require the VCVT instructions.
640 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f32, .Src: MVT::v4f16, .Cost: 1},
641 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f32, .Src: MVT::v8f16, .Cost: 3},
642 };
643 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
644 if (const auto *Entry =
645 ConvertCostTableLookup(Table: MVEFLoadConversionTbl, ISD,
646 Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
647 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
648 }
649
650 // The truncate of a store is free. This is the mirror of extends above.
651 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
652 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 0},
653 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 0},
654 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 0},
655 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 1},
656 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 1},
657 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 3},
658 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 1},
659 };
660 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
661 if (const auto *Entry =
662 ConvertCostTableLookup(Table: MVEStoreConversionTbl, ISD,
663 Dst: SrcTy.getSimpleVT(), Src: DstTy.getSimpleVT()))
664 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
665 }
666
667 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
668 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f32, .Src: MVT::v4f16, .Cost: 1},
669 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8f32, .Src: MVT::v8f16, .Cost: 3},
670 };
671 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
672 if (const auto *Entry =
673 ConvertCostTableLookup(Table: MVEFStoreConversionTbl, ISD,
674 Dst: SrcTy.getSimpleVT(), Src: DstTy.getSimpleVT()))
675 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
676 }
677 }
678
679 // NEON vector operations that can extend their inputs.
680 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
681 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
682 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
683 // vaddl
684 { .ISD: ISD::ADD, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 0 },
685 { .ISD: ISD::ADD, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 0 },
686 // vsubl
687 { .ISD: ISD::SUB, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 0 },
688 { .ISD: ISD::SUB, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 0 },
689 // vmull
690 { .ISD: ISD::MUL, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 0 },
691 { .ISD: ISD::MUL, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 0 },
692 // vshll
693 { .ISD: ISD::SHL, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 0 },
694 { .ISD: ISD::SHL, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 0 },
695 };
696
697 auto *User = cast<Instruction>(Val: *I->user_begin());
698 int UserISD = TLI->InstructionOpcodeToISD(Opcode: User->getOpcode());
699 if (auto *Entry = ConvertCostTableLookup(Table: NEONDoubleWidthTbl, ISD: UserISD,
700 Dst: DstTy.getSimpleVT(),
701 Src: SrcTy.getSimpleVT())) {
702 return AdjustCost(Entry->Cost);
703 }
704 }
705
706 // Single to/from double precision conversions.
707 if (Src->isVectorTy() && ST->hasNEON() &&
708 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
709 DstTy.getScalarType() == MVT::f32) ||
710 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
711 DstTy.getScalarType() == MVT::f64))) {
712 static const CostTblEntry NEONFltDblTbl[] = {
713 // Vector fptrunc/fpext conversions.
714 {.ISD: ISD::FP_ROUND, .Type: MVT::v2f64, .Cost: 2},
715 {.ISD: ISD::FP_EXTEND, .Type: MVT::v2f32, .Cost: 2},
716 {.ISD: ISD::FP_EXTEND, .Type: MVT::v4f32, .Cost: 4}};
717
718 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Src);
719 if (const auto *Entry = CostTableLookup(Table: NEONFltDblTbl, ISD, Ty: LT.second))
720 return AdjustCost(LT.first * Entry->Cost);
721 }
722
723 // Some arithmetic, load and store operations have specific instructions
724 // to cast up/down their types automatically at no extra cost.
725 // TODO: Get these tables to know at least what the related operations are.
726 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
727 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 1 },
728 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 1 },
729 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i32, .Cost: 1 },
730 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i32, .Cost: 1 },
731 { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: 0 },
732 { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i32, .Cost: 1 },
733
734 // The number of vmovl instructions for the extension.
735 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 1 },
736 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 1 },
737 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 2 },
738 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 2 },
739 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i8, .Cost: 3 },
740 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i8, .Cost: 3 },
741 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i16, .Cost: 2 },
742 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i16, .Cost: 2 },
743 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3 },
744 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3 },
745 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3 },
746 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3 },
747 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7 },
748 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7 },
749 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6 },
750 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6 },
751 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6 },
752 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6 },
753
754 // Operations that we legalize using splitting.
755 { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: 6 },
756 { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i32, .Cost: 3 },
757
758 // Vector float <-> i32 conversions.
759 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1 },
760 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1 },
761
762 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3 },
763 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3 },
764 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 2 },
765 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 2 },
766 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1 },
767 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1 },
768 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i1, .Cost: 3 },
769 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i1, .Cost: 3 },
770 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 3 },
771 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 3 },
772 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2 },
773 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2 },
774 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4 },
775 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4 },
776 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i32, .Cost: 2 },
777 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i32, .Cost: 2 },
778 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i16, .Cost: 8 },
779 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i16, .Cost: 8 },
780 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i32, .Cost: 4 },
781 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i32, .Cost: 4 },
782
783 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1 },
784 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1 },
785 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 3 },
786 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 3 },
787 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2 },
788 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2 },
789
790 // Vector double <-> i32 conversions.
791 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2 },
792 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2 },
793
794 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4 },
795 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4 },
796 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 3 },
797 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 3 },
798 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2 },
799 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2 },
800
801 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2 },
802 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2 },
803 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f32, .Cost: 4 },
804 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f32, .Cost: 4 },
805 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v16f32, .Cost: 8 },
806 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v16f32, .Cost: 8 }
807 };
808
809 if (SrcTy.isVector() && ST->hasNEON()) {
810 if (const auto *Entry = ConvertCostTableLookup(Table: NEONVectorConversionTbl, ISD,
811 Dst: DstTy.getSimpleVT(),
812 Src: SrcTy.getSimpleVT()))
813 return AdjustCost(Entry->Cost);
814 }
815
816 // Scalar float to integer conversions.
817 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
818 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i1, .Src: MVT::f32, .Cost: 2 },
819 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i1, .Src: MVT::f32, .Cost: 2 },
820 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i1, .Src: MVT::f64, .Cost: 2 },
821 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i1, .Src: MVT::f64, .Cost: 2 },
822 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i8, .Src: MVT::f32, .Cost: 2 },
823 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i8, .Src: MVT::f32, .Cost: 2 },
824 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i8, .Src: MVT::f64, .Cost: 2 },
825 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i8, .Src: MVT::f64, .Cost: 2 },
826 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i16, .Src: MVT::f32, .Cost: 2 },
827 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i16, .Src: MVT::f32, .Cost: 2 },
828 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i16, .Src: MVT::f64, .Cost: 2 },
829 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i16, .Src: MVT::f64, .Cost: 2 },
830 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i32, .Src: MVT::f32, .Cost: 2 },
831 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i32, .Src: MVT::f32, .Cost: 2 },
832 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i32, .Src: MVT::f64, .Cost: 2 },
833 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i32, .Src: MVT::f64, .Cost: 2 },
834 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: 10 },
835 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: 10 },
836 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: 10 },
837 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: 10 }
838 };
839 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
840 if (const auto *Entry = ConvertCostTableLookup(Table: NEONFloatConversionTbl, ISD,
841 Dst: DstTy.getSimpleVT(),
842 Src: SrcTy.getSimpleVT()))
843 return AdjustCost(Entry->Cost);
844 }
845
846 // Scalar integer to float conversions.
847 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
848 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i1, .Cost: 2 },
849 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i1, .Cost: 2 },
850 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i1, .Cost: 2 },
851 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i1, .Cost: 2 },
852 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i8, .Cost: 2 },
853 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i8, .Cost: 2 },
854 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i8, .Cost: 2 },
855 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i8, .Cost: 2 },
856 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i16, .Cost: 2 },
857 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i16, .Cost: 2 },
858 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i16, .Cost: 2 },
859 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i16, .Cost: 2 },
860 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i32, .Cost: 2 },
861 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i32, .Cost: 2 },
862 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i32, .Cost: 2 },
863 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i32, .Cost: 2 },
864 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i64, .Cost: 10 },
865 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i64, .Cost: 10 },
866 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i64, .Cost: 10 },
867 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i64, .Cost: 10 }
868 };
869
870 if (SrcTy.isInteger() && ST->hasNEON()) {
871 if (const auto *Entry = ConvertCostTableLookup(Table: NEONIntegerConversionTbl,
872 ISD, Dst: DstTy.getSimpleVT(),
873 Src: SrcTy.getSimpleVT()))
874 return AdjustCost(Entry->Cost);
875 }
876
  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant, and
  // sexts are linearised so take more.
880 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
881 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 1 },
882 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 1 },
883 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 2 },
884 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 2 },
885 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i8, .Cost: 10 },
886 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i8, .Cost: 2 },
887 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 1 },
888 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 1 },
889 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i16, .Cost: 10 },
890 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i16, .Cost: 2 },
891 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i32, .Cost: 8 },
892 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i32, .Cost: 2 },
893 };
894
895 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
896 if (const auto *Entry = ConvertCostTableLookup(Table: MVEVectorConversionTbl,
897 ISD, Dst: DstTy.getSimpleVT(),
898 Src: SrcTy.getSimpleVT()))
899 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
900 }
901
902 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
    // As a general rule, fp converts that were not matched above are
    // scalarized and cost 1 vcvt for each lane, so long as the instruction is
    // available. If not, it will become a series of function calls.
906 const InstructionCost CallCost =
907 getCallInstrCost(F: nullptr, RetTy: Dst, Tys: {Src}, CostKind);
908 int Lanes = 1;
909 if (SrcTy.isFixedLengthVector())
910 Lanes = SrcTy.getVectorNumElements();
911
912 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
913 return Lanes;
914 else
915 return Lanes * CallCost;
916 }
917
918 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
919 SrcTy.isFixedLengthVector()) {
    // Treat a truncate with a larger-than-legal source (128 bits for MVE) as
    // expensive: 2 instructions per lane.
922 if ((SrcTy.getScalarType() == MVT::i8 ||
923 SrcTy.getScalarType() == MVT::i16 ||
924 SrcTy.getScalarType() == MVT::i32) &&
925 SrcTy.getSizeInBits() > 128 &&
926 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
927 return SrcTy.getVectorNumElements() * 2;
928 }
929
930 // Scalar integer conversion costs.
931 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
932 // i16 -> i64 requires two dependent operations.
933 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::i64, .Src: MVT::i16, .Cost: 2 },
934
935 // Truncates on i64 are assumed to be free.
936 { .ISD: ISD::TRUNCATE, .Dst: MVT::i32, .Src: MVT::i64, .Cost: 0 },
937 { .ISD: ISD::TRUNCATE, .Dst: MVT::i16, .Src: MVT::i64, .Cost: 0 },
938 { .ISD: ISD::TRUNCATE, .Dst: MVT::i8, .Src: MVT::i64, .Cost: 0 },
939 { .ISD: ISD::TRUNCATE, .Dst: MVT::i1, .Src: MVT::i64, .Cost: 0 }
940 };
941
942 if (SrcTy.isInteger()) {
943 if (const auto *Entry = ConvertCostTableLookup(Table: ARMIntegerConversionTbl, ISD,
944 Dst: DstTy.getSimpleVT(),
945 Src: SrcTy.getSimpleVT()))
946 return AdjustCost(Entry->Cost);
947 }
948
949 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
950 ? ST->getMVEVectorCostFactor(CostKind)
951 : 1;
952 return AdjustCost(
953 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
954}
955
956InstructionCost ARMTTIImpl::getVectorInstrCost(
957 unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
958 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
  // Penalize inserting into a D-subregister. We end up with a three times
  // lower estimated throughput on Swift.
961 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
962 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
963 return 3;
964
965 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
966 Opcode == Instruction::ExtractElement)) {
967 // Cross-class copies are expensive on many microarchitectures,
968 // so assume they are expensive by default.
969 if (cast<VectorType>(Val: ValTy)->getElementType()->isIntegerTy())
970 return 3;
971
972 // Even if it's not a cross class copy, this likely leads to mixing
973 // of NEON and VFP code and should be therefore penalized.
974 if (ValTy->isVectorTy() &&
975 ValTy->getScalarSizeInBits() <= 32)
976 return std::max<InstructionCost>(
977 a: BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1,
978 VIC),
979 b: 2U);
980 }
981
982 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
983 Opcode == Instruction::ExtractElement)) {
    // Integer cross-lane moves are more expensive than float, which can
    // sometimes just be vmovs. Integers involve being passed to GPR
    // registers, causing more of a delay.
987 std::pair<InstructionCost, MVT> LT =
988 getTypeLegalizationCost(Ty: ValTy->getScalarType());
989 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
990 }
991
992 return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1,
993 VIC);
994}
995
996InstructionCost ARMTTIImpl::getCmpSelInstrCost(
997 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
998 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
999 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
1000 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1001
1002 // Thumb scalar code size cost for select.
1003 if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
1004 ST->isThumb() && !ValTy->isVectorTy()) {
1005 // Assume expensive structs.
1006 if (TLI->getValueType(DL, Ty: ValTy, AllowUnknown: true) == MVT::Other)
1007 return TTI::TCC_Expensive;
1008
1009 // Select costs can vary because they:
1010 // - may require one or more conditional mov (including an IT),
1011 // - can't operate directly on immediates,
1012 // - require live flags, which we can't copy around easily.
1013 InstructionCost Cost = getTypeLegalizationCost(Ty: ValTy).first;
1014
1015 // Possible IT instruction for Thumb2, or more for Thumb1.
1016 ++Cost;
1017
1018 // i1 values may need rematerialising by using mov immediates and/or
1019 // flag setting instructions.
1020 if (ValTy->isIntegerTy(Bitwidth: 1))
1021 ++Cost;
1022
1023 return Cost;
1024 }
1025
1026 // If this is a vector min/max/abs, use the cost of that intrinsic directly
1027 // instead. Hopefully when min/max intrinsics are more prevalent this code
1028 // will not be needed.
1029 const Instruction *Sel = I;
1030 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
1031 Sel->hasOneUse())
1032 Sel = cast<Instruction>(Val: Sel->user_back());
1033 if (Sel && ValTy->isVectorTy() &&
1034 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
1035 const Value *LHS, *RHS;
1036 SelectPatternFlavor SPF = matchSelectPattern(V: Sel, LHS, RHS).Flavor;
1037 unsigned IID = 0;
1038 switch (SPF) {
1039 case SPF_ABS:
1040 IID = Intrinsic::abs;
1041 break;
1042 case SPF_SMIN:
1043 IID = Intrinsic::smin;
1044 break;
1045 case SPF_SMAX:
1046 IID = Intrinsic::smax;
1047 break;
1048 case SPF_UMIN:
1049 IID = Intrinsic::umin;
1050 break;
1051 case SPF_UMAX:
1052 IID = Intrinsic::umax;
1053 break;
1054 case SPF_FMINNUM:
1055 IID = Intrinsic::minnum;
1056 break;
1057 case SPF_FMAXNUM:
1058 IID = Intrinsic::maxnum;
1059 break;
1060 default:
1061 break;
1062 }
1063 if (IID) {
1064 // The ICmp is free, the select gets the cost of the min/max/etc
1065 if (Sel != I)
1066 return 0;
1067 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
1068 return getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
1069 }
1070 }
1071
1072 // On NEON a vector select gets lowered to vbsl.
1073 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
1074 // Lowering of some vector selects is currently far from perfect.
1075 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
1076 { .ISD: ISD::SELECT, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: 4*4 + 1*2 + 1 },
1077 { .ISD: ISD::SELECT, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: 50 },
1078 { .ISD: ISD::SELECT, .Dst: MVT::v16i1, .Src: MVT::v16i64, .Cost: 100 }
1079 };
1080
1081 EVT SelCondTy = TLI->getValueType(DL, Ty: CondTy);
1082 EVT SelValTy = TLI->getValueType(DL, Ty: ValTy);
1083 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1084 if (const auto *Entry = ConvertCostTableLookup(Table: NEONVectorSelectTbl, ISD,
1085 Dst: SelCondTy.getSimpleVT(),
1086 Src: SelValTy.getSimpleVT()))
1087 return Entry->Cost;
1088 }
1089
1090 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1091 return LT.first;
1092 }
1093
1094 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1095 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1096 cast<FixedVectorType>(Val: ValTy)->getNumElements() > 1) {
1097 FixedVectorType *VecValTy = cast<FixedVectorType>(Val: ValTy);
1098 FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(Val: CondTy);
1099 if (!VecCondTy)
1100 VecCondTy = cast<FixedVectorType>(Val: CmpInst::makeCmpResultType(opnd_type: VecValTy));
1101
1102 // If we don't have mve.fp any fp operations will need to be scalarized.
1103 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
      // One scalarization insert, one scalarization extract and the cost of
      // the fcmps.
1106 return BaseT::getScalarizationOverhead(InTy: VecValTy, /*Insert*/ false,
1107 /*Extract*/ true, CostKind) +
1108 BaseT::getScalarizationOverhead(InTy: VecCondTy, /*Insert*/ true,
1109 /*Extract*/ false, CostKind) +
1110 VecValTy->getNumElements() *
1111 getCmpSelInstrCost(Opcode, ValTy: ValTy->getScalarType(),
1112 CondTy: VecCondTy->getScalarType(), VecPred,
1113 CostKind, Op1Info, Op2Info, I);
1114 }
1115
1116 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1117 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1118 // There are two types - the input that specifies the type of the compare
1119 // and the output vXi1 type. Because we don't know how the output will be
1120 // split, we may need an expensive shuffle to get two in sync. This has the
1121 // effect of making larger than legal compares (v8i32 for example)
1122 // expensive.
1123 if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1124 if (LT.first > 1)
1125 return LT.first * BaseCost +
1126 BaseT::getScalarizationOverhead(InTy: VecCondTy, /*Insert*/ true,
1127 /*Extract*/ false, CostKind);
1128 return BaseCost;
1129 }
1130 }
1131
1132 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1133 // for "multiple beats" potentially needed by MVE instructions.
1134 int BaseCost = 1;
1135 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1136 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1137
1138 return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred,
1139 CostKind, Op1Info, Op2Info, I);
1140}
1141
1142InstructionCost
1143ARMTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
1144 const SCEV *Ptr,
1145 TTI::TargetCostKind CostKind) const {
1146 // Address computations in vectorized code with non-consecutive addresses will
1147 // likely result in more instructions compared to scalar code where the
1148 // computation can more often be merged into the index mode. The resulting
1149 // extra micro-ops can significantly decrease throughput.
1150 unsigned NumVectorInstToHideOverhead = 10;
1151 int MaxMergeDistance = 64;
1152
1153 if (ST->hasNEON()) {
1154 if (PtrTy->isVectorTy() && SE &&
1155 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MergeDistance: MaxMergeDistance + 1))
1156 return NumVectorInstToHideOverhead;
1157
1158 // In many cases the address computation is not merged into the instruction
1159 // addressing mode.
1160 return 1;
1161 }
1162 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1163}
1164
1165bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) const {
1166 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: I)) {
1167 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1168 // optimized, else LSR may block tail-predication.
1169 switch (II->getIntrinsicID()) {
1170 case Intrinsic::arm_mve_vctp8:
1171 case Intrinsic::arm_mve_vctp16:
1172 case Intrinsic::arm_mve_vctp32:
1173 case Intrinsic::arm_mve_vctp64:
1174 return true;
1175 default:
1176 break;
1177 }
1178 }
1179 return false;
1180}
1181
1182bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment,
1183 unsigned /*AddressSpace*/,
1184 TTI::MaskKind /*MaskKind*/) const {
1185 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1186 return false;
1187
1188 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: DataTy)) {
1189 // Don't support v2i1 yet.
1190 if (VecTy->getNumElements() == 2)
1191 return false;
1192
1193 // We don't support extending fp types.
1194 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1195 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1196 return false;
1197 }
1198
1199 unsigned EltWidth = DataTy->getScalarSizeInBits();
1200 return (EltWidth == 32 && Alignment >= 4) ||
1201 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1202}
1203
1204bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const {
1205 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1206 return false;
1207
1208 unsigned EltWidth = Ty->getScalarSizeInBits();
1209 return ((EltWidth == 32 && Alignment >= 4) ||
1210 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1211}
1212
1213/// Given a memcpy/memset/memmove instruction, return the number of memory
1214/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1215/// call is used.
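/// For example (illustrative), a 16-byte memcpy between 4-byte-aligned
/// pointers may lower to four i32 load/store pairs, which is reported here as
/// 8 memory operations (4 types times a Factor of 2).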
1216int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1217 MemOp MOp;
1218 unsigned DstAddrSpace = ~0u;
1219 unsigned SrcAddrSpace = ~0u;
1220 const Function *F = I->getParent()->getParent();
1221
1222 if (const auto *MC = dyn_cast<MemTransferInst>(Val: I)) {
1223 ConstantInt *C = dyn_cast<ConstantInt>(Val: MC->getLength());
1224 // If 'size' is not a constant, a library call will be generated.
1225 if (!C)
1226 return -1;
1227
1228 const unsigned Size = C->getValue().getZExtValue();
1229 const Align DstAlign = MC->getDestAlign().valueOrOne();
1230 const Align SrcAlign = MC->getSourceAlign().valueOrOne();
1231
1232 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1233 /*IsVolatile*/ false);
1234 DstAddrSpace = MC->getDestAddressSpace();
1235 SrcAddrSpace = MC->getSourceAddressSpace();
  } else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1238 ConstantInt *C = dyn_cast<ConstantInt>(Val: MS->getLength());
1239 // If 'size' is not a constant, a library call will be generated.
1240 if (!C)
1241 return -1;
1242
1243 const unsigned Size = C->getValue().getZExtValue();
1244 const Align DstAlign = MS->getDestAlign().valueOrOne();
1245
1246 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1247 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1248 DstAddrSpace = MS->getDestAddressSpace();
  } else
    llvm_unreachable("Expected a memcpy/move or memset!");
1252
1253 unsigned Limit, Factor = 2;
  switch (I->getIntrinsicID()) {
1255 case Intrinsic::memcpy:
1256 Limit = TLI->getMaxStoresPerMemcpy(OptSize: F->hasMinSize());
1257 break;
1258 case Intrinsic::memmove:
1259 Limit = TLI->getMaxStoresPerMemmove(OptSize: F->hasMinSize());
1260 break;
1261 case Intrinsic::memset:
1262 Limit = TLI->getMaxStoresPerMemset(OptSize: F->hasMinSize());
1263 Factor = 1;
1264 break;
1265 default:
1266 llvm_unreachable("Expected a memcpy/move or memset!");
1267 }
1268
  // MemOps will be populated with a list of data types that need to be
  // loaded and stored. That's why we multiply the number of elements by 2 to
1271 // get the cost for this memcpy.
1272 std::vector<EVT> MemOps;
1273 LLVMContext &C = F->getContext();
1274 if (getTLI()->findOptimalMemOpLowering(Context&: C, MemOps, Limit, Op: MOp, DstAS: DstAddrSpace,
1275 SrcAS: SrcAddrSpace, FuncAttributes: F->getAttributes(),
1276 LargestVT: nullptr))
1277 return MemOps.size() * Factor;
1278
1279 // If we can't find an optimal memop lowering, return the default cost
1280 return -1;
1281}
1282
1283InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) const {
1284 int NumOps = getNumMemOps(I: cast<IntrinsicInst>(Val: I));
1285
1286 // To model the cost of a library call, we assume 1 for the call, and
1287 // 3 for the argument setup.
1288 if (NumOps == -1)
1289 return 4;
1290 return NumOps;
1291}
1292
1293InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1294 VectorType *DstTy, VectorType *SrcTy,
1295 ArrayRef<int> Mask,
1296 TTI::TargetCostKind CostKind,
1297 int Index, VectorType *SubTp,
1298 ArrayRef<const Value *> Args,
1299 const Instruction *CxtI) const {
1300 assert((Mask.empty() || DstTy->isScalableTy() ||
1301 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
1302 "Expected the Mask to match the return size if given");
1303 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
1304 "Expected the same scalar types");
1305
1306 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp);
1307 // Treat extractsubvector as single op permutation.
1308 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
1309 if (IsExtractSubvector)
1310 Kind = TTI::SK_PermuteSingleSrc;
1311 if (ST->hasNEON()) {
1312 if (Kind == TTI::SK_Broadcast) {
1313 static const CostTblEntry NEONDupTbl[] = {
1314 // VDUP handles these cases.
1315 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2i32, .Cost: 1},
1316 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2f32, .Cost: 1},
1317 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2i64, .Cost: 1},
1318 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2f64, .Cost: 1},
1319 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4i16, .Cost: 1},
1320 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v8i8, .Cost: 1},
1321
1322 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4i32, .Cost: 1},
1323 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4f32, .Cost: 1},
1324 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v8i16, .Cost: 1},
1325 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v16i8, .Cost: 1}};
1326
1327 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy);
1328 if (const auto *Entry =
1329 CostTableLookup(Table: NEONDupTbl, ISD: ISD::VECTOR_SHUFFLE, Ty: LT.second))
1330 return LT.first * Entry->Cost;
1331 }
1332 if (Kind == TTI::SK_Reverse) {
1333 static const CostTblEntry NEONShuffleTbl[] = {
          // Reverse shuffles cost one instruction if we are shuffling within
          // a double word (vrev) or two if we shuffle a quad word (vrev,
          // vext).
1336 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2i32, .Cost: 1},
1337 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2f32, .Cost: 1},
1338 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2i64, .Cost: 1},
1339 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2f64, .Cost: 1},
1340 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4i16, .Cost: 1},
1341 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v8i8, .Cost: 1},
1342
1343 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4i32, .Cost: 2},
1344 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4f32, .Cost: 2},
1345 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v8i16, .Cost: 2},
1346 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v16i8, .Cost: 2}};
1347
1348 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy);
1349 if (const auto *Entry =
1350 CostTableLookup(Table: NEONShuffleTbl, ISD: ISD::VECTOR_SHUFFLE, Ty: LT.second))
1351 return LT.first * Entry->Cost;
1352 }
1353 if (Kind == TTI::SK_Select) {
1354 static const CostTblEntry NEONSelShuffleTbl[] = {
          // Select shuffle cost table for ARM. Cost is the number of
          // instructions required to create the shuffled vector.
1358
1359 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2f32, .Cost: 1},
1360 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2i64, .Cost: 1},
1361 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2f64, .Cost: 1},
1362 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v2i32, .Cost: 1},
1363
1364 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4i32, .Cost: 2},
1365 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4f32, .Cost: 2},
1366 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4i16, .Cost: 2},
1367
1368 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v8i16, .Cost: 16},
1369
1370 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v16i8, .Cost: 32}};
1371
1372 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy);
1373 if (const auto *Entry = CostTableLookup(Table: NEONSelShuffleTbl,
1374 ISD: ISD::VECTOR_SHUFFLE, Ty: LT.second))
1375 return LT.first * Entry->Cost;
1376 }
1377 }
1378 if (ST->hasMVEIntegerOps()) {
1379 if (Kind == TTI::SK_Broadcast) {
1380 static const CostTblEntry MVEDupTbl[] = {
1381 // VDUP handles these cases.
1382 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4i32, .Cost: 1},
1383 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v8i16, .Cost: 1},
1384 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v16i8, .Cost: 1},
1385 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v4f32, .Cost: 1},
1386 {.ISD: ISD::VECTOR_SHUFFLE, .Type: MVT::v8f16, .Cost: 1}};
1387
1388 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy);
1389 if (const auto *Entry = CostTableLookup(Table: MVEDupTbl, ISD: ISD::VECTOR_SHUFFLE,
1390 Ty: LT.second))
1391 return LT.first * Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
1392 }
1393
1394 if (!Mask.empty()) {
1395 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy);
1396 // Check for LD2/LD4 instructions, which are represented in llvm IR as
1397 // deinterleaving-shuffle(load). The shuffle cost could potentially be
1398 // free, but we model it with a cost of LT.first so that LD2/LD4 have a
1399 // higher cost than just the load.
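      // For example (illustrative IR), a factor-2 deinterleave of a wide load
      //   %wide = load <8 x i16>, ptr %p
      //   %even = shufflevector <8 x i16> %wide, <8 x i16> poison,
      //                         <4 x i32> <i32 0, i32 2, i32 4, i32 6>
      // can be selected as part of a VLD2, which is what the check below
      // recognises.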
1400 if (Args.size() >= 1 && isa<LoadInst>(Val: Args[0]) &&
1401 (LT.second.getScalarSizeInBits() == 8 ||
1402 LT.second.getScalarSizeInBits() == 16 ||
1403 LT.second.getScalarSizeInBits() == 32) &&
1404 LT.second.getSizeInBits() == 128 &&
1405 ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
1406 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 2)) ||
1407 (TLI->getMaxSupportedInterleaveFactor() == 4 &&
1408 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 4))))
1409 return ST->getMVEVectorCostFactor(CostKind) *
1410 std::max<InstructionCost>(a: 1, b: LT.first / 4);
1411
1412 // Check for ST2/ST4 instructions, which are represented in llvm IR as
1413 // store(interleaving-shuffle). The shuffle cost could potentially be
1414 // free, but we model it with a cost of LT.first so that ST2/ST4 have a
1415 // higher cost than just the store.
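      // For example (illustrative IR), a factor-2 interleave feeding a store
      //   %v = shufflevector <4 x i32> %a, <4 x i32> %b,
      //            <8 x i32> <i32 0, i32 4, i32 1, i32 5,
      //                       i32 2, i32 6, i32 3, i32 7>
      //   store <8 x i32> %v, ptr %p
      // can be selected as a VST2, which is what the check below recognises.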
1416 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(Val: *CxtI->user_begin()) &&
1417 (LT.second.getScalarSizeInBits() == 8 ||
1418 LT.second.getScalarSizeInBits() == 16 ||
1419 LT.second.getScalarSizeInBits() == 32) &&
1420 LT.second.getSizeInBits() == 128 &&
1421 ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
1422 ShuffleVectorInst::isInterleaveMask(
1423 Mask, Factor: 2, NumInputElts: SrcTy->getElementCount().getKnownMinValue() * 2)) ||
1424 (TLI->getMaxSupportedInterleaveFactor() == 4 &&
1425 ShuffleVectorInst::isInterleaveMask(
1426 Mask, Factor: 4, NumInputElts: SrcTy->getElementCount().getKnownMinValue() * 2))))
1427 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1428
1429 if (LT.second.isVector() &&
1430 Mask.size() <= LT.second.getVectorNumElements() &&
1431 (isVREVMask(M: Mask, VT: LT.second, BlockSize: 16) || isVREVMask(M: Mask, VT: LT.second, BlockSize: 32) ||
1432 isVREVMask(M: Mask, VT: LT.second, BlockSize: 64)))
1433 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1434 }
1435 }
1436
1437 // Restore optimal kind.
1438 if (IsExtractSubvector)
1439 Kind = TTI::SK_ExtractSubvector;
1440 int BaseCost = ST->hasMVEIntegerOps() && SrcTy->isVectorTy()
1441 ? ST->getMVEVectorCostFactor(CostKind)
1442 : 1;
1443 return BaseCost * BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind,
1444 Index, SubTp);
1445}
1446
1447InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1448 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1449 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1450 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
1451 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1452 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(Bitwidth: 1)) {
1453 // Make operations on i1 relatively expensive as this often involves
1454 // combining predicates. AND and XOR should be easier to handle with IT
1455 // blocks.
1456 switch (ISDOpcode) {
1457 default:
1458 break;
1459 case ISD::AND:
1460 case ISD::XOR:
1461 return 2;
1462 case ISD::OR:
1463 return 3;
1464 }
1465 }
1466
1467 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1468
1469 if (ST->hasNEON()) {
1470 const unsigned FunctionCallDivCost = 20;
1471 const unsigned ReciprocalDivCost = 10;
1472 static const CostTblEntry CostTbl[] = {
1473 // Division.
      // These costs are somewhat random. Choose a cost of 20 to indicate that
      // vectorizing division (added function call) is going to be very
      // expensive.
1476 // Double registers types.
1477 { .ISD: ISD::SDIV, .Type: MVT::v1i64, .Cost: 1 * FunctionCallDivCost},
1478 { .ISD: ISD::UDIV, .Type: MVT::v1i64, .Cost: 1 * FunctionCallDivCost},
1479 { .ISD: ISD::SREM, .Type: MVT::v1i64, .Cost: 1 * FunctionCallDivCost},
1480 { .ISD: ISD::UREM, .Type: MVT::v1i64, .Cost: 1 * FunctionCallDivCost},
1481 { .ISD: ISD::SDIV, .Type: MVT::v2i32, .Cost: 2 * FunctionCallDivCost},
1482 { .ISD: ISD::UDIV, .Type: MVT::v2i32, .Cost: 2 * FunctionCallDivCost},
1483 { .ISD: ISD::SREM, .Type: MVT::v2i32, .Cost: 2 * FunctionCallDivCost},
1484 { .ISD: ISD::UREM, .Type: MVT::v2i32, .Cost: 2 * FunctionCallDivCost},
1485 { .ISD: ISD::SDIV, .Type: MVT::v4i16, .Cost: ReciprocalDivCost},
1486 { .ISD: ISD::UDIV, .Type: MVT::v4i16, .Cost: ReciprocalDivCost},
1487 { .ISD: ISD::SREM, .Type: MVT::v4i16, .Cost: 4 * FunctionCallDivCost},
1488 { .ISD: ISD::UREM, .Type: MVT::v4i16, .Cost: 4 * FunctionCallDivCost},
1489 { .ISD: ISD::SDIV, .Type: MVT::v8i8, .Cost: ReciprocalDivCost},
1490 { .ISD: ISD::UDIV, .Type: MVT::v8i8, .Cost: ReciprocalDivCost},
1491 { .ISD: ISD::SREM, .Type: MVT::v8i8, .Cost: 8 * FunctionCallDivCost},
1492 { .ISD: ISD::UREM, .Type: MVT::v8i8, .Cost: 8 * FunctionCallDivCost},
1493 // Quad register types.
1494 { .ISD: ISD::SDIV, .Type: MVT::v2i64, .Cost: 2 * FunctionCallDivCost},
1495 { .ISD: ISD::UDIV, .Type: MVT::v2i64, .Cost: 2 * FunctionCallDivCost},
1496 { .ISD: ISD::SREM, .Type: MVT::v2i64, .Cost: 2 * FunctionCallDivCost},
1497 { .ISD: ISD::UREM, .Type: MVT::v2i64, .Cost: 2 * FunctionCallDivCost},
1498 { .ISD: ISD::SDIV, .Type: MVT::v4i32, .Cost: 4 * FunctionCallDivCost},
1499 { .ISD: ISD::UDIV, .Type: MVT::v4i32, .Cost: 4 * FunctionCallDivCost},
1500 { .ISD: ISD::SREM, .Type: MVT::v4i32, .Cost: 4 * FunctionCallDivCost},
1501 { .ISD: ISD::UREM, .Type: MVT::v4i32, .Cost: 4 * FunctionCallDivCost},
1502 { .ISD: ISD::SDIV, .Type: MVT::v8i16, .Cost: 8 * FunctionCallDivCost},
1503 { .ISD: ISD::UDIV, .Type: MVT::v8i16, .Cost: 8 * FunctionCallDivCost},
1504 { .ISD: ISD::SREM, .Type: MVT::v8i16, .Cost: 8 * FunctionCallDivCost},
1505 { .ISD: ISD::UREM, .Type: MVT::v8i16, .Cost: 8 * FunctionCallDivCost},
1506 { .ISD: ISD::SDIV, .Type: MVT::v16i8, .Cost: 16 * FunctionCallDivCost},
1507 { .ISD: ISD::UDIV, .Type: MVT::v16i8, .Cost: 16 * FunctionCallDivCost},
1508 { .ISD: ISD::SREM, .Type: MVT::v16i8, .Cost: 16 * FunctionCallDivCost},
1509 { .ISD: ISD::UREM, .Type: MVT::v16i8, .Cost: 16 * FunctionCallDivCost},
1510 // Multiplication.
1511 };
1512
1513 if (const auto *Entry = CostTableLookup(Table: CostTbl, ISD: ISDOpcode, Ty: LT.second))
1514 return LT.first * Entry->Cost;
1515
1516 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1517 Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
1518
1519 // This is somewhat of a hack. The problem that we are facing is that SROA
1520 // creates a sequence of shift, and, or instructions to construct values.
1521 // These sequences are recognized by the ISel and have zero-cost. Not so for
1522 // the vectorized code. Because we have support for v2i64 but not i64 those
1523 // sequences look particularly beneficial to vectorize.
1524 // To work around this we increase the cost of v2i64 operations to make them
1525 // seem less beneficial.
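    // For example (illustrative IR), SROA may build an i64 from two i32 parts:
    //   %lo64 = zext i32 %lo to i64
    //   %hi64 = zext i32 %hi to i64
    //   %hish = shl i64 %hi64, 32
    //   %val  = or i64 %hish, %lo64
    // which ISel folds away for scalar i64, but not for the vectorized v2i64
    // form, hence the extra cost added below.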
1526 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1527 Cost += 4;
1528
1529 return Cost;
1530 }
1531
1532 // If this operation is a shift on arm/thumb2, it might well be folded into
1533 // the following instruction, hence having a cost of 0.
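  // For example (illustrative), "add r0, r1, r2, lsl #2" folds the left shift
  // into the add, so a shl with a single such user costs nothing extra.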
1534 auto LooksLikeAFreeShift = [&]() {
1535 if (ST->isThumb1Only() || Ty->isVectorTy())
1536 return false;
1537
1538 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1539 return false;
1540 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1541 return false;
1542
1543 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1544 switch (cast<Instruction>(Val: CxtI->user_back())->getOpcode()) {
1545 case Instruction::Add:
1546 case Instruction::Sub:
1547 case Instruction::And:
1548 case Instruction::Xor:
1549 case Instruction::Or:
1550 case Instruction::ICmp:
1551 return true;
1552 default:
1553 return false;
1554 }
1555 };
1556 if (LooksLikeAFreeShift())
1557 return 0;
1558
  // When targets have both DSP and MVE, the compiler will attempt to
  // vectorize as well as to use scalar (S/U)MLAL operations. For the pattern
  // ext(mul(ext(i16), ext(i16))) we find that codegen performs better when
  // only using (S/U)MLAL scalar ops instead of trying to mix vector ops with
  // (S/U)MLAL ops. We therefore check if a mul instruction is used in a
  // (S/U)MLAL pattern.
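  // For example (illustrative IR), the scalar shape we look for is:
  //   %a32 = sext i16 %a to i32
  //   %b32 = sext i16 %b to i32
  //   %mul = mul i32 %a32, %b32
  //   %acc = sext i32 %mul to i64   ; every user of %mul is an extend
  // which maps onto SMULL/SMLAL-style scalar ops rather than vector code.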
1566 auto MulInDSPMLALPattern = [&](const Instruction *I, unsigned Opcode,
1567 Type *Ty) -> bool {
1568 if (!ST->hasDSP())
1569 return false;
1570
1571 if (!I)
1572 return false;
1573
1574 if (Opcode != Instruction::Mul)
1575 return false;
1576
1577 if (Ty->isVectorTy())
1578 return false;
1579
1580 auto ValueOpcodesEqual = [](const Value *LHS, const Value *RHS) -> bool {
1581 return cast<Instruction>(Val: LHS)->getOpcode() ==
1582 cast<Instruction>(Val: RHS)->getOpcode();
1583 };
1584 auto IsExtInst = [](const Value *V) -> bool {
1585 return isa<ZExtInst>(Val: V) || isa<SExtInst>(Val: V);
1586 };
1587 auto IsExtensionFromHalf = [](const Value *V) -> bool {
1588 return cast<Instruction>(Val: V)->getOperand(i: 0)->getType()->isIntegerTy(Bitwidth: 16);
1589 };
1590
1591 // We check the arguments of the instruction to see if they're extends
1592 auto *BinOp = dyn_cast<BinaryOperator>(Val: I);
1593 if (!BinOp)
1594 return false;
1595 Value *Op0 = BinOp->getOperand(i_nocapture: 0);
1596 Value *Op1 = BinOp->getOperand(i_nocapture: 1);
1597 if (IsExtInst(Op0) && IsExtInst(Op1) && ValueOpcodesEqual(Op0, Op1)) {
1598 // We're interested in an ext of an i16
1599 if (!I->getType()->isIntegerTy(Bitwidth: 32) || !IsExtensionFromHalf(Op0) ||
1600 !IsExtensionFromHalf(Op1))
1601 return false;
      // All uses of the result must themselves be extends (e.g. a sign or
      // zero extend to i64 feeding the (S/U)MLAL accumulate).
1604 for (auto *U : I->users())
1605 if (!IsExtInst(U))
1606 return false;
1607 return true;
1608 }
1609
1610 return false;
1611 };
1612
1613 if (MulInDSPMLALPattern(CxtI, Opcode, Ty))
1614 return 0;
1615
1616 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1617 // for "multiple beats" potentially needed by MVE instructions.
1618 int BaseCost = 1;
1619 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1620 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1621
  // The rest of this mostly follows what is done in
  // BaseT::getArithmeticInstrCost, without treating floats as more expensive
  // than scalars or increasing the costs for custom operations. The result is
  // also multiplied by the MVEVectorCostFactor where appropriate.
1626 if (TLI->isOperationLegalOrCustomOrPromote(Op: ISDOpcode, VT: LT.second))
1627 return LT.first * BaseCost;
1628
1629 // Else this is expand, assume that we need to scalarize this op.
1630 if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) {
1631 unsigned Num = VTy->getNumElements();
1632 InstructionCost Cost =
1633 getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind);
1634 // Return the cost of multiple scalar invocation plus the cost of
1635 // inserting and extracting the values.
1636 SmallVector<Type *> Tys(Args.size(), Ty);
1637 return BaseT::getScalarizationOverhead(RetTy: VTy, Args, Tys, CostKind) +
1638 Num * Cost;
1639 }
1640
1641 return BaseCost;
1642}
1643
1644InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1645 Align Alignment,
1646 unsigned AddressSpace,
1647 TTI::TargetCostKind CostKind,
1648 TTI::OperandValueInfo OpInfo,
1649 const Instruction *I) const {
1650 // TODO: Handle other cost kinds.
1651 if (CostKind != TTI::TCK_RecipThroughput)
1652 return 1;
1653
1654 // Type legalization can't handle structs
1655 if (TLI->getValueType(DL, Ty: Src, AllowUnknown: true) == MVT::Other)
1656 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1657 CostKind);
1658
1659 if (ST->hasNEON() && Src->isVectorTy() && Alignment != Align(16) &&
1660 cast<VectorType>(Val: Src)->getElementType()->isDoubleTy()) {
1661 // Unaligned loads/stores are extremely inefficient.
    // We need 4 uops for vst1/vld1 vs 1 uop for vldr/vstr.
1663 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Src);
1664 return LT.first * 4;
1665 }
1666
1667 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1668 // Same for stores.
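  // For example (illustrative IR):
  //   %l = load <4 x half>, ptr %p
  //   %e = fpext <4 x half> %l to <4 x float>
  // is costed as a single MVE memory operation here, with the extend folded
  // into the load rather than charged separately.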
1669 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Val: Src) && I &&
1670 ((Opcode == Instruction::Load && I->hasOneUse() &&
1671 isa<FPExtInst>(Val: *I->user_begin())) ||
1672 (Opcode == Instruction::Store && isa<FPTruncInst>(Val: I->getOperand(i: 0))))) {
1673 FixedVectorType *SrcVTy = cast<FixedVectorType>(Val: Src);
1674 Type *DstTy =
1675 Opcode == Instruction::Load
1676 ? (*I->user_begin())->getType()
1677 : cast<Instruction>(Val: I->getOperand(i: 0))->getOperand(i: 0)->getType();
1678 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1679 DstTy->getScalarType()->isFloatTy())
1680 return ST->getMVEVectorCostFactor(CostKind);
1681 }
1682
1683 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1684 ? ST->getMVEVectorCostFactor(CostKind)
1685 : 1;
1686 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1687 CostKind, OpInfo, I);
1688}
1689
1690InstructionCost
1691ARMTTIImpl::getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA,
1692 TTI::TargetCostKind CostKind) const {
1693 switch (MICA.getID()) {
1694 case Intrinsic::masked_scatter:
1695 case Intrinsic::masked_gather:
1696 return getGatherScatterOpCost(MICA, CostKind);
1697 case Intrinsic::masked_load:
1698 case Intrinsic::masked_store:
1699 return getMaskedMemoryOpCost(MICA, CostKind);
1700 }
1701 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
1702}
1703
1704InstructionCost
1705ARMTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
1706 TTI::TargetCostKind CostKind) const {
1707 unsigned IID = MICA.getID();
1708 Type *Src = MICA.getDataType();
1709 Align Alignment = MICA.getAlignment();
1710 unsigned AddressSpace = MICA.getAddressSpace();
1711 if (ST->hasMVEIntegerOps()) {
1712 if (IID == Intrinsic::masked_load &&
1713 isLegalMaskedLoad(DataTy: Src, Alignment, AddressSpace))
1714 return ST->getMVEVectorCostFactor(CostKind);
1715 if (IID == Intrinsic::masked_store &&
1716 isLegalMaskedStore(DataTy: Src, Alignment, AddressSpace))
1717 return ST->getMVEVectorCostFactor(CostKind);
1718 }
1719 if (!isa<FixedVectorType>(Val: Src))
1720 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
  // Scalar cost, which is currently very high due to the inefficiency of the
  // generated code.
1723 return cast<FixedVectorType>(Val: Src)->getNumElements() * 8;
1724}
1725
1726InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1727 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1728 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1729 bool UseMaskForCond, bool UseMaskForGaps) const {
1730 assert(Factor >= 2 && "Invalid interleave factor");
1731 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1732
1733 // vldN/vstN doesn't support vector types of i64/f64 element.
1734 bool EltIs64Bits = DL.getTypeSizeInBits(Ty: VecTy->getScalarType()) == 64;
1735
1736 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1737 !UseMaskForCond && !UseMaskForGaps) {
1738 unsigned NumElts = cast<FixedVectorType>(Val: VecTy)->getNumElements();
1739 auto *SubVecTy =
1740 FixedVectorType::get(ElementType: VecTy->getScalarType(), NumElts: NumElts / Factor);
1741
1742 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1743 // Accesses having vector types that are a multiple of 128 bits can be
1744 // matched to more than one vldN/vstN instruction.
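    // For example (illustrative), a factor-2 access with <8 x i32> sub-vectors
    // legalizes to two VLD2/VST2 pairs, so the check below costs it as
    // 2 (Factor) * BaseCost * 2 (accesses).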
1745 int BaseCost =
1746 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1747 if (NumElts % Factor == 0 &&
1748 TLI->isLegalInterleavedAccessType(Factor, VecTy: SubVecTy, Alignment, DL))
1749 return Factor * BaseCost * TLI->getNumInterleavedAccesses(VecTy: SubVecTy, DL);
1750
1751 // Some smaller than legal interleaved patterns are cheap as we can make
1752 // use of the vmovn or vrev patterns to interleave a standard load. This is
1753 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1754 // promoted differently). The cost of 2 here is then a load and vrev or
1755 // vmovn.
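    // For example (illustrative), a factor-2 access with v4i8 sub-vectors
    // (an <8 x i8> group) is costed as 2 * BaseCost: one load/store plus one
    // vrev/vmovn to split or merge the lanes.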
1756 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1757 VecTy->isIntOrIntVectorTy() &&
1758 DL.getTypeSizeInBits(Ty: SubVecTy).getFixedValue() <= 64)
1759 return 2 * BaseCost;
1760 }
1761
1762 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1763 Alignment, AddressSpace, CostKind,
1764 UseMaskForCond, UseMaskForGaps);
1765}
1766
1767InstructionCost
1768ARMTTIImpl::getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA,
1769 TTI::TargetCostKind CostKind) const {
1770
1771 Type *DataTy = MICA.getDataType();
1772 const Value *Ptr = MICA.getPointer();
1773 bool VariableMask = MICA.getVariableMask();
1774 Align Alignment = MICA.getAlignment();
1775 const Instruction *I = MICA.getInst();
1776
1777 using namespace PatternMatch;
1778 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1779 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
1780
1781 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1782 auto *VTy = cast<FixedVectorType>(Val: DataTy);
1783
1784 // TODO: Splitting, once we do that.
1785
1786 unsigned NumElems = VTy->getNumElements();
1787 unsigned EltSize = VTy->getScalarSizeInBits();
1788 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: DataTy);
1789
1790 // For now, it is assumed that for the MVE gather instructions the loads are
1791 // all effectively serialised. This means the cost is the scalar cost
1792 // multiplied by the number of elements being loaded. This is possibly very
1793 // conservative, but even so we still end up vectorising loops because the
1794 // cost per iteration for many loops is lower than for scalar loops.
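  // For example (illustrative), a gather of <4 x i32> is costed here as
  // roughly 4 * the MVE vector cost factor, i.e. four serialised lane loads.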
1795 InstructionCost VectorCost =
1796 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1797 // The scalarization cost should be a lot higher. We use the number of vector
1798 // elements plus the scalarization overhead. If masking is required then a lot
1799 // of little blocks will be needed and potentially a scalarized p0 mask,
1800 // greatly increasing the cost.
1801 InstructionCost ScalarCost =
1802 NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1803 BaseT::getScalarizationOverhead(InTy: VTy, /*Insert*/ true, /*Extract*/ false,
1804 CostKind) +
1805 BaseT::getScalarizationOverhead(InTy: VTy, /*Insert*/ false, /*Extract*/ true,
1806 CostKind);
1807
1808 if (EltSize < 8 || Alignment < EltSize / 8)
1809 return ScalarCost;
1810
1811 unsigned ExtSize = EltSize;
1812 // Check whether there's a single user that asks for an extended type
1813 if (I != nullptr) {
    // Depending on the caller of this function, a gather instruction will
    // either have opcode Instruction::Load or be a call to the masked_gather
    // intrinsic.
1817 if ((I->getOpcode() == Instruction::Load ||
1818 match(V: I, P: m_Intrinsic<Intrinsic::masked_gather>())) &&
1819 I->hasOneUse()) {
1820 const User *Us = *I->users().begin();
1821 if (isa<ZExtInst>(Val: Us) || isa<SExtInst>(Val: Us)) {
1822 // only allow valid type combinations
1823 unsigned TypeSize =
1824 cast<Instruction>(Val: Us)->getType()->getScalarSizeInBits();
1825 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1826 (TypeSize == 16 && EltSize == 8)) &&
1827 TypeSize * NumElems == 128) {
1828 ExtSize = TypeSize;
1829 }
1830 }
1831 }
1832 // Check whether the input data needs to be truncated
1833 TruncInst *T;
1834 if ((I->getOpcode() == Instruction::Store ||
1835 match(V: I, P: m_Intrinsic<Intrinsic::masked_scatter>())) &&
1836 (T = dyn_cast<TruncInst>(Val: I->getOperand(i: 0)))) {
1837 // Only allow valid type combinations
1838 unsigned TypeSize = T->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits();
1839 if (((EltSize == 16 && TypeSize == 32) ||
1840 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1841 TypeSize * NumElems == 128)
1842 ExtSize = TypeSize;
1843 }
1844 }
1845
1846 if (ExtSize * NumElems != 128 || NumElems < 4)
1847 return ScalarCost;
1848
1849 // Any (aligned) i32 gather will not need to be scalarised.
1850 if (ExtSize == 32)
1851 return VectorCost;
1852 // For smaller types, we need to ensure that the gep's inputs are correctly
1853 // extended from a small enough value. Other sizes (including i64) are
1854 // scalarized for now.
1855 if (ExtSize != 8 && ExtSize != 16)
1856 return ScalarCost;
1857
1858 if (const auto *BC = dyn_cast<BitCastInst>(Val: Ptr))
1859 Ptr = BC->getOperand(i_nocapture: 0);
1860 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr)) {
1861 if (GEP->getNumOperands() != 2)
1862 return ScalarCost;
1863 unsigned Scale = DL.getTypeAllocSize(Ty: GEP->getResultElementType());
1864 // Scale needs to be correct (which is only relevant for i16s).
1865 if (Scale != 1 && Scale * 8 != ExtSize)
1866 return ScalarCost;
1867 // And we need to zext (not sext) the indexes from a small enough type.
1868 if (const auto *ZExt = dyn_cast<ZExtInst>(Val: GEP->getOperand(i_nocapture: 1))) {
1869 if (ZExt->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() <= ExtSize)
1870 return VectorCost;
1871 }
1872 return ScalarCost;
1873 }
1874 return ScalarCost;
1875}
1876
1877InstructionCost
1878ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1879 std::optional<FastMathFlags> FMF,
1880 TTI::TargetCostKind CostKind) const {
1881
1882 EVT ValVT = TLI->getValueType(DL, Ty: ValTy);
1883 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1884 unsigned EltSize = ValVT.getScalarSizeInBits();
1885
1886 // In general floating point reductions are a series of elementwise
1887 // operations, with free extracts on each step. These are either in-order or
1888 // treewise depending on whether that is allowed by the fast math flags.
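  // For example (illustrative), with MVE a fast-math fadd reduction of
  // <8 x float> is modelled as one v4f32 add (halving 256 bits to 128) plus
  // four scalar fadds, with the lane extracts assumed to be free.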
1889 if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
1890 ((EltSize == 32 && ST->hasVFP2Base()) ||
1891 (EltSize == 64 && ST->hasFP64()) ||
1892 (EltSize == 16 && ST->hasFullFP16()))) {
1893 unsigned NumElts = cast<FixedVectorType>(Val: ValTy)->getNumElements();
1894 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1895 InstructionCost VecCost = 0;
1896 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(Value: NumElts) &&
1897 NumElts * EltSize > VecLimit) {
1898 Type *VecTy = FixedVectorType::get(ElementType: ValTy->getElementType(), NumElts: NumElts / 2);
1899 VecCost += getArithmeticInstrCost(Opcode, Ty: VecTy, CostKind);
1900 NumElts /= 2;
1901 }
1902
1903 // For fp16 we need to extract the upper lane elements. MVE can add a
1904 // VREV+FMIN/MAX to perform another vector step instead.
1905 InstructionCost ExtractCost = 0;
1906 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
1907 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
1908 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1909 NumElts /= 2;
1910 } else if (ValVT.getVectorElementType() == MVT::f16)
1911 ExtractCost = NumElts / 2;
1912
1913 return VecCost + ExtractCost +
1914 NumElts *
1915 getArithmeticInstrCost(Opcode, Ty: ValTy->getElementType(), CostKind);
1916 }
1917
1918 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
1919 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
1920 unsigned NumElts = cast<FixedVectorType>(Val: ValTy)->getNumElements();
1921 unsigned VecLimit =
1922 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1923 InstructionCost VecCost = 0;
1924 while (isPowerOf2_32(Value: NumElts) && NumElts * EltSize > VecLimit) {
1925 Type *VecTy = FixedVectorType::get(ElementType: ValTy->getElementType(), NumElts: NumElts / 2);
1926 VecCost += getArithmeticInstrCost(Opcode, Ty: VecTy, CostKind);
1927 NumElts /= 2;
1928 }
1929 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
1930 // step.
1931 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
1932 NumElts * EltSize == 64) {
1933 Type *VecTy = FixedVectorType::get(ElementType: ValTy->getElementType(), NumElts);
1934 VecCost += ST->getMVEVectorCostFactor(CostKind) +
1935 getArithmeticInstrCost(Opcode, Ty: VecTy, CostKind);
1936 NumElts /= 2;
1937 }
1938
1939 // From here we extract the elements and perform the and/or/xor.
1940 InstructionCost ExtractCost = NumElts;
1941 return VecCost + ExtractCost +
1942 (NumElts - 1) * getArithmeticInstrCost(
1943 Opcode, Ty: ValTy->getElementType(), CostKind);
1944 }
1945
1946 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
1947 TTI::requiresOrderedReduction(FMF))
1948 return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
1949
1950 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1951
1952 static const CostTblEntry CostTblAdd[]{
1953 {.ISD: ISD::ADD, .Type: MVT::v16i8, .Cost: 1},
1954 {.ISD: ISD::ADD, .Type: MVT::v8i16, .Cost: 1},
1955 {.ISD: ISD::ADD, .Type: MVT::v4i32, .Cost: 1},
1956 };
1957 if (const auto *Entry = CostTableLookup(Table: CostTblAdd, ISD, Ty: LT.second))
1958 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1959
1960 return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
1961}
1962
1963InstructionCost ARMTTIImpl::getExtendedReductionCost(
1964 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1965 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
1966 EVT ValVT = TLI->getValueType(DL, Ty: ValTy);
1967 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
1968
1969 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1970
1971 switch (ISD) {
1972 case ISD::ADD:
1973 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1974 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1975
1976 // The legal cases are:
1977 // VADDV u/s 8/16/32
1978 // VADDLV u/s 32
1979 // Codegen currently cannot always handle larger than legal vectors very
1980 // well, especially for predicated reductions where the mask needs to be
1981 // split, so restrict to 128bit or smaller input types.
1982 unsigned RevVTSize = ResVT.getSizeInBits();
1983 if (ValVT.getSizeInBits() <= 128 &&
1984 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1985 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1986 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1987 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1988 }
1989 break;
1990 default:
1991 break;
1992 }
1993 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: ValTy, FMF,
1994 CostKind);
1995}
1996
1997InstructionCost
1998ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
1999 Type *ResTy, VectorType *ValTy,
2000 TTI::TargetCostKind CostKind) const {
2001 if (RedOpcode != Instruction::Add)
2002 return InstructionCost::getInvalid(Val: CostKind);
2003 EVT ValVT = TLI->getValueType(DL, Ty: ValTy);
2004 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
2005
2006 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
2007 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
2008
2009 // The legal cases are:
2010 // VMLAV u/s 8/16/32
2011 // VMLALV u/s 16/32
2012 // Codegen currently cannot always handle larger than legal vectors very
2013 // well, especially for predicated reductions where the mask needs to be
2014 // split, so restrict to 128bit or smaller input types.
2015 unsigned RevVTSize = ResVT.getSizeInBits();
2016 if (ValVT.getSizeInBits() <= 128 &&
2017 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
2018 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
2019 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
2020 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
2021 }
2022
2023 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, Ty: ValTy,
2024 CostKind);
2025}
2026
2027InstructionCost
2028ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
2029 FastMathFlags FMF,
2030 TTI::TargetCostKind CostKind) const {
2031 EVT ValVT = TLI->getValueType(DL, Ty);
2032
2033 // In general floating point reductions are a series of elementwise
2034 // operations, with free extracts on each step. These are either in-order or
2035 // treewise depending on whether that is allowed by the fast math flags.
2036 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
2037 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
2038 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
2039 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
2040 unsigned NumElts = cast<FixedVectorType>(Val: Ty)->getNumElements();
2041 unsigned EltSize = ValVT.getScalarSizeInBits();
2042 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
2043 InstructionCost VecCost;
2044 while (isPowerOf2_32(Value: NumElts) && NumElts * EltSize > VecLimit) {
2045 Type *VecTy = FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumElts/2);
2046 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
2047 VecCost += getIntrinsicInstrCost(ICA, CostKind);
2048 NumElts /= 2;
2049 }
2050
2051 // For fp16 we need to extract the upper lane elements. MVE can add a
2052 // VREV+FMIN/MAX to perform another vector step instead.
2053 InstructionCost ExtractCost = 0;
2054 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
2055 NumElts == 8) {
2056 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
2057 NumElts /= 2;
2058 } else if (ValVT.getVectorElementType() == MVT::f16)
2059 ExtractCost = cast<FixedVectorType>(Val: Ty)->getNumElements() / 2;
2060
2061 IntrinsicCostAttributes ICA(IID, Ty->getElementType(),
2062 {Ty->getElementType(), Ty->getElementType()},
2063 FMF);
2064 return VecCost + ExtractCost +
2065 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
2066 }
2067
2068 if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
2069 IID == Intrinsic::umin || IID == Intrinsic::umax) {
2070 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2071
2072 // All costs are the same for u/s min/max. These lower to vminv, which are
2073 // given a slightly higher cost as they tend to take multiple cycles for
2074 // smaller type sizes.
2075 static const CostTblEntry CostTblAdd[]{
2076 {.ISD: ISD::SMIN, .Type: MVT::v16i8, .Cost: 4},
2077 {.ISD: ISD::SMIN, .Type: MVT::v8i16, .Cost: 3},
2078 {.ISD: ISD::SMIN, .Type: MVT::v4i32, .Cost: 2},
2079 };
2080 if (const auto *Entry = CostTableLookup(Table: CostTblAdd, ISD: ISD::SMIN, Ty: LT.second))
2081 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
2082 }
2083
2084 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2085}
2086
2087InstructionCost
2088ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2089 TTI::TargetCostKind CostKind) const {
2090 unsigned Opc = ICA.getID();
2091 switch (Opc) {
2092 case Intrinsic::get_active_lane_mask:
    // Currently we make a somewhat optimistic assumption that
    // active_lane_mask intrinsics are always free. In reality one may be
    // freely folded into a tail-predicated loop, expanded into a VCTP or
    // expanded into a lot of add/icmp code. We may need to improve this in
    // the future, but being able to detect whether it is free or not involves
    // looking at a lot of other code. We currently assume that the vectorizer
    // inserted these, and knew what it was doing in adding one.
2100 if (ST->hasMVEIntegerOps())
2101 return 0;
2102 break;
2103 case Intrinsic::sadd_sat:
2104 case Intrinsic::ssub_sat:
2105 case Intrinsic::uadd_sat:
2106 case Intrinsic::usub_sat: {
2107 bool IsAdd = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
2108 bool IsSigned = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
2109 Type *RetTy = ICA.getReturnType();
2110
2111 if (auto *ITy = dyn_cast<IntegerType>(Val: RetTy)) {
2112 if (IsSigned && ST->hasDSP() && ITy->getBitWidth() == 32)
2113 return 1; // qadd / qsub
2114 if (ST->hasDSP() && (ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16))
2115 return 2; // uqadd16 / qadd16 / uqsub16 / qsub16 + possible extend.
2116 // Otherwise return the cost of expanding the node. Generally an add +
2117 // icmp + sel.
2118 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2119 Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1);
2120 return getArithmeticInstrCost(Opcode: IsAdd ? Instruction::Add : Instruction::Sub,
2121 Ty: RetTy, CostKind) +
2122 2 * getCmpSelInstrCost(Opcode: BinaryOperator::ICmp, ValTy: RetTy, CondTy, VecPred: Pred,
2123 CostKind) +
2124 2 * getCmpSelInstrCost(Opcode: BinaryOperator::Select, ValTy: RetTy, CondTy, VecPred: Pred,
2125 CostKind);
2126 }
2127
2128 if (!ST->hasMVEIntegerOps())
2129 break;
2130
2131 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);
2132 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2133 LT.second == MVT::v16i8) {
      // This is a base cost of 1 for the vqadd, plus 3 extra shifts if we
      // need to extend the type, as it uses shr(qadd(shl, shl)).
2136 unsigned Instrs =
2137 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1
2138 : 4;
2139 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
2140 }
2141 break;
2142 }
2143 case Intrinsic::abs:
2144 case Intrinsic::smin:
2145 case Intrinsic::smax:
2146 case Intrinsic::umin:
2147 case Intrinsic::umax: {
2148 if (!ST->hasMVEIntegerOps())
2149 break;
2150 Type *VT = ICA.getReturnType();
2151
2152 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VT);
2153 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2154 LT.second == MVT::v16i8)
2155 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2156 break;
2157 }
2158 case Intrinsic::minnum:
2159 case Intrinsic::maxnum: {
2160 if (!ST->hasMVEFloatOps())
2161 break;
2162 Type *VT = ICA.getReturnType();
2163 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VT);
2164 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
2165 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2166 break;
2167 }
2168 case Intrinsic::fptosi_sat:
2169 case Intrinsic::fptoui_sat: {
2170 if (ICA.getArgTypes().empty())
2171 break;
2172 bool IsSigned = Opc == Intrinsic::fptosi_sat;
2173 auto LT = getTypeLegalizationCost(Ty: ICA.getArgTypes()[0]);
2174 EVT MTy = TLI->getValueType(DL, Ty: ICA.getReturnType());
    // Check for the legal types, with the correct subtarget features.
2176 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
2177 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
2178 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
2179 return LT.first;
2180
2181 // Equally for MVE vector types
2182 if (ST->hasMVEFloatOps() &&
2183 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
2184 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
2185 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2186
2187 // If we can we use a legal convert followed by a min+max
2188 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
2189 (ST->hasFP64() && LT.second == MVT::f64) ||
2190 (ST->hasFullFP16() && LT.second == MVT::f16) ||
2191 (ST->hasMVEFloatOps() &&
2192 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
2193 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
2194 Type *LegalTy = Type::getIntNTy(C&: ICA.getReturnType()->getContext(),
2195 N: LT.second.getScalarSizeInBits());
2196 InstructionCost Cost =
2197 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
2198 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
2199 : Intrinsic::umin,
2200 LegalTy, {LegalTy, LegalTy});
2201 Cost += getIntrinsicInstrCost(ICA: Attrs1, CostKind);
2202 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
2203 : Intrinsic::umax,
2204 LegalTy, {LegalTy, LegalTy});
2205 Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
2206 return LT.first * Cost;
2207 }
2208 // Otherwise we need to follow the default expansion that clamps the value
2209 // using a float min/max with a fcmp+sel for nan handling when signed.
2210 Type *FPTy = ICA.getArgTypes()[0];
2211 Type *RetTy = ICA.getReturnType();
2212 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
2213 InstructionCost Cost = getIntrinsicInstrCost(ICA: Attrs1, CostKind);
2214 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
2215 Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
2216 Cost +=
2217 getCastInstrCost(Opcode: IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
2218 Dst: RetTy, Src: FPTy, CCH: TTI::CastContextHint::None, CostKind);
2219 if (IsSigned) {
2220 Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1);
2221 Cost += getCmpSelInstrCost(Opcode: BinaryOperator::FCmp, ValTy: FPTy, CondTy,
2222 VecPred: CmpInst::FCMP_UNO, CostKind);
2223 Cost += getCmpSelInstrCost(Opcode: BinaryOperator::Select, ValTy: RetTy, CondTy,
2224 VecPred: CmpInst::FCMP_UNO, CostKind);
2225 }
2226 return Cost;
2227 }
2228 }
2229
2230 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
2231}
2232
2233bool ARMTTIImpl::isLoweredToCall(const Function *F) const {
2234 if (!F->isIntrinsic())
2235 return BaseT::isLoweredToCall(F);
2236
2237 // Assume all Arm-specific intrinsics map to an instruction.
2238 if (F->getName().starts_with(Prefix: "llvm.arm"))
2239 return false;
2240
2241 switch (F->getIntrinsicID()) {
2242 default: break;
2243 case Intrinsic::powi:
2244 case Intrinsic::sin:
2245 case Intrinsic::cos:
2246 case Intrinsic::sincos:
2247 case Intrinsic::pow:
2248 case Intrinsic::log:
2249 case Intrinsic::log10:
2250 case Intrinsic::log2:
2251 case Intrinsic::exp:
2252 case Intrinsic::exp2:
2253 return true;
2254 case Intrinsic::sqrt:
2255 case Intrinsic::fabs:
2256 case Intrinsic::copysign:
2257 case Intrinsic::floor:
2258 case Intrinsic::ceil:
2259 case Intrinsic::trunc:
2260 case Intrinsic::rint:
2261 case Intrinsic::nearbyint:
2262 case Intrinsic::round:
2263 case Intrinsic::canonicalize:
2264 case Intrinsic::lround:
2265 case Intrinsic::llround:
2266 case Intrinsic::lrint:
2267 case Intrinsic::llrint:
2268 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
2269 return true;
2270 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
2271 return true;
2272 // Some operations can be handled by vector instructions and assume
2273 // unsupported vectors will be expanded into supported scalar ones.
2274 // TODO Handle scalar operations properly.
2275 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
2276 case Intrinsic::masked_store:
2277 case Intrinsic::masked_load:
2278 case Intrinsic::masked_gather:
2279 case Intrinsic::masked_scatter:
2280 return !ST->hasMVEIntegerOps();
2281 case Intrinsic::sadd_with_overflow:
2282 case Intrinsic::uadd_with_overflow:
2283 case Intrinsic::ssub_with_overflow:
2284 case Intrinsic::usub_with_overflow:
2285 case Intrinsic::sadd_sat:
2286 case Intrinsic::uadd_sat:
2287 case Intrinsic::ssub_sat:
2288 case Intrinsic::usub_sat:
2289 return false;
2290 }
2291
2292 return BaseT::isLoweredToCall(F);
2293}
2294
2295bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) const {
2296 unsigned ISD = TLI->InstructionOpcodeToISD(Opcode: I.getOpcode());
2297 EVT VT = TLI->getValueType(DL, Ty: I.getType(), AllowUnknown: true);
2298 if (TLI->getOperationAction(Op: ISD, VT) == TargetLowering::LibCall)
2299 return true;
2300
2301 // Check if an intrinsic will be lowered to a call and assume that any
2302 // other CallInst will generate a bl.
2303 if (auto *Call = dyn_cast<CallInst>(Val: &I)) {
2304 if (auto *II = dyn_cast<IntrinsicInst>(Val: Call)) {
      switch (II->getIntrinsicID()) {
2306 case Intrinsic::memcpy:
2307 case Intrinsic::memset:
2308 case Intrinsic::memmove:
2309 return getNumMemOps(I: II) == -1;
2310 default:
2311 if (const Function *F = Call->getCalledFunction())
2312 return isLoweredToCall(F);
2313 }
2314 }
2315 return true;
2316 }
2317
2318 // FPv5 provides conversions between integer, double-precision,
2319 // single-precision, and half-precision formats.
2320 switch (I.getOpcode()) {
2321 default:
2322 break;
2323 case Instruction::FPToSI:
2324 case Instruction::FPToUI:
2325 case Instruction::SIToFP:
2326 case Instruction::UIToFP:
2327 case Instruction::FPTrunc:
2328 case Instruction::FPExt:
2329 return !ST->hasFPARMv8Base();
2330 }
2331
2332 // FIXME: Unfortunately the approach of checking the Operation Action does
2333 // not catch all cases of Legalization that use library calls. Our
2334 // Legalization step categorizes some transformations into library calls as
2335 // Custom, Expand or even Legal when doing type legalization. So for now
2336 // we have to special case for instance the SDIV of 64bit integers and the
2337 // use of floating point emulation.
2338 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
2339 switch (ISD) {
2340 default:
2341 break;
2342 case ISD::SDIV:
2343 case ISD::UDIV:
2344 case ISD::SREM:
2345 case ISD::UREM:
2346 case ISD::SDIVREM:
2347 case ISD::UDIVREM:
2348 return true;
2349 }
2350 }
2351
2352 // Assume all other non-float operations are supported.
2353 if (!VT.isFloatingPoint())
2354 return false;
2355
2356 // We'll need a library call to handle most floats when using soft.
2357 if (TLI->useSoftFloat()) {
2358 switch (I.getOpcode()) {
2359 default:
2360 return true;
2361 case Instruction::Alloca:
2362 case Instruction::Load:
2363 case Instruction::Store:
2364 case Instruction::Select:
2365 case Instruction::PHI:
2366 return false;
2367 }
2368 }
2369
2370 // We'll need a libcall to perform double precision operations on a single
2371 // precision only FPU.
2372 if (I.getType()->isDoubleTy() && !ST->hasFP64())
2373 return true;
2374
2375 // Likewise for half precision arithmetic.
2376 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2377 return true;
2378
2379 return false;
2380}
2381
2382bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
2383 AssumptionCache &AC,
2384 TargetLibraryInfo *LibInfo,
2385 HardwareLoopInfo &HWLoopInfo) const {
2386 // Low-overhead branches are only supported in the 'low-overhead branch'
2387 // extension of v8.1-m.
2388 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2389 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2390 return false;
2391 }
2392
2393 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2394 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2395 return false;
2396 }
2397
2398 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2399 if (isa<SCEVCouldNotCompute>(Val: BackedgeTakenCount)) {
2400 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2401 return false;
2402 }
2403
2404 const SCEV *TripCountSCEV =
2405 SE.getAddExpr(LHS: BackedgeTakenCount,
2406 RHS: SE.getOne(Ty: BackedgeTakenCount->getType()));
2407
2408 // We need to store the trip count in LR, a 32-bit register.
2409 if (SE.getUnsignedRangeMax(S: TripCountSCEV).getBitWidth() > 32) {
2410 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2411 return false;
2412 }
2413
2414 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2415 // point in generating a hardware loop if that's going to happen.
2416
2417 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2418 if (auto *Call = dyn_cast<IntrinsicInst>(Val: &I)) {
2419 switch (Call->getIntrinsicID()) {
2420 default:
2421 break;
2422 case Intrinsic::start_loop_iterations:
2423 case Intrinsic::test_start_loop_iterations:
2424 case Intrinsic::loop_decrement:
2425 case Intrinsic::loop_decrement_reg:
2426 return true;
2427 }
2428 }
2429 return false;
2430 };
2431
2432 // Scan the instructions to see if there's any that we know will turn into a
2433 // call or if this loop is already a low-overhead loop or will become a tail
2434 // predicated loop.
2435 bool IsTailPredLoop = false;
2436 auto ScanLoop = [&](Loop *L) {
2437 for (auto *BB : L->getBlocks()) {
2438 for (auto &I : *BB) {
2439 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2440 isa<InlineAsm>(Val: I)) {
2441 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2442 return false;
2443 }
2444 if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
2445 IsTailPredLoop |=
2446 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2447 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2448 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2449 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2450 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2451 }
2452 }
2453 return true;
2454 };
2455
2456 // Visit inner loops.
2457 for (auto *Inner : *L)
2458 if (!ScanLoop(Inner))
2459 return false;
2460
2461 if (!ScanLoop(L))
2462 return false;
2463
2464 // TODO: Check whether the trip count calculation is expensive. If L is the
2465 // inner loop but we know it has a low trip count, calculating that trip
2466 // count (in the parent loop) may be detrimental.
2467
2468 LLVMContext &C = L->getHeader()->getContext();
2469 HWLoopInfo.CounterInReg = true;
2470 HWLoopInfo.IsNestingLegal = false;
2471 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2472 HWLoopInfo.CountType = Type::getInt32Ty(C);
2473 HWLoopInfo.LoopDecrement = ConstantInt::get(Ty: HWLoopInfo.CountType, V: 1);
2474 return true;
2475}
2476
2477static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2478 // We don't allow icmp's, and because we only look at single block loops,
2479 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2480 if (isa<ICmpInst>(Val: &I) && ++ICmpCount > 1)
2481 return false;
2482 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2483 // not currently canonical, but soon will be. Code without them uses icmp, and
2484 // so is not tail predicated as per the condition above. In order to get the
2485 // same performance we treat min and max the same as an icmp for tailpred
  // purposes for the moment (we often rely on non-tailpred and higher VFs to
  // pick more optimal instructions like VQDMULH. They need to be recognized
2488 // directly by the vectorizer).
2489 if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
2490 if ((II->getIntrinsicID() == Intrinsic::smin ||
2491 II->getIntrinsicID() == Intrinsic::smax ||
2492 II->getIntrinsicID() == Intrinsic::umin ||
2493 II->getIntrinsicID() == Intrinsic::umax) &&
2494 ++ICmpCount > 1)
2495 return false;
2496
2497 if (isa<FCmpInst>(Val: &I))
2498 return false;
2499
2500 // We could allow extending/narrowing FP loads/stores, but codegen is
2501 // too inefficient so reject this for now.
2502 if (isa<FPExtInst>(Val: &I) || isa<FPTruncInst>(Val: &I))
2503 return false;
2504
2505 // Extends have to be extending-loads
2506 if (isa<SExtInst>(Val: &I) || isa<ZExtInst>(Val: &I) )
2507 if (!I.getOperand(i: 0)->hasOneUse() || !isa<LoadInst>(Val: I.getOperand(i: 0)))
2508 return false;
2509
2510 // Truncs have to be narrowing-stores
2511 if (isa<TruncInst>(Val: &I) )
2512 if (!I.hasOneUse() || !isa<StoreInst>(Val: *I.user_begin()))
2513 return false;
2514
2515 return true;
2516}
2517
2518// To set up a tail-predicated loop, we need to know the total number of
2519// elements processed by that loop. Thus, we need to determine the element
2520// size and:
2521// 1) it should be uniform for all operations in the vector loop, so we
2522// e.g. don't want any widening/narrowing operations.
2523// 2) it should be smaller than i64s because we don't have vector operations
2524// that work on i64s.
2525// 3) we don't want elements to be reversed or shuffled, to make sure the
2526// tail-predication masks/predicates the right lanes.
2527//
2528static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2529 const DataLayout &DL,
2530 const LoopAccessInfo *LAI,
2531 const DominatorTree &DT) {
2532 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2533
2534 // If there are live-out values, it is probably a reduction. We can predicate
2535 // most reduction operations freely under MVE using a combination of
2536 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2537 // floating point and integer reductions, but don't check for operators
2538 // specifically here. If the value ends up not being a reduction (and so the
2539 // vectorizer cannot tailfold the loop), we should fall back to standard
2540 // vectorization automatically.
2541 SmallVector< Instruction *, 8 > LiveOuts;
2542 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2543 bool ReductionsDisabled =
2544 EnableTailPredication == TailPredication::EnabledNoReductions ||
2545 EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2546
2547 for (auto *I : LiveOuts) {
2548 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2549 !I->getType()->isHalfTy()) {
2550 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2551 "live-out value\n");
2552 return false;
2553 }
2554 if (ReductionsDisabled) {
2555 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2556 return false;
2557 }
2558 }
2559
2560 // Next, check that all instructions can be tail-predicated.
2561 PredicatedScalarEvolution PSE = LAI->getPSE();
2562 int ICmpCount = 0;
2563
2564 for (BasicBlock *BB : L->blocks()) {
2565 for (Instruction &I : BB->instructionsWithoutDebug()) {
2566 if (isa<PHINode>(Val: &I))
2567 continue;
2568 if (!canTailPredicateInstruction(I, ICmpCount)) {
2569 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2570 return false;
2571 }
2572
2573 Type *T = I.getType();
2574 if (T->getScalarSizeInBits() > 32) {
2575 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2576 return false;
2577 }
2578 if (isa<StoreInst>(Val: I) || isa<LoadInst>(Val: I)) {
2579 Value *Ptr = getLoadStorePointerOperand(V: &I);
2580 Type *AccessTy = getLoadStoreType(I: &I);
2581 int64_t NextStride =
2582 getPtrStride(PSE, AccessTy, Ptr, Lp: L, DT).value_or(u: 0);
2583 if (NextStride == 1) {
2584 // TODO: for now only allow consecutive strides of 1. We could support
2585 // other strides as long as it is uniform, but let's keep it simple
2586 // for now.
2587 continue;
2588 } else if (NextStride == -1 ||
2589 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2590 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
          LLVM_DEBUG(dbgs()
                     << "Consecutive strides of 2 found, vld2/vst2 can't "
                        "be tail-predicated.\n");
2594 return false;
2595 // TODO: don't tail predicate if there is a reversed load?
2596 } else if (EnableMaskedGatherScatters) {
2597 // Gather/scatters do allow loading from arbitrary strides, at
2598 // least if they are loop invariant.
2599 // TODO: Loop variant strides should in theory work, too, but
2600 // this requires further testing.
2601 const SCEV *PtrScev = PSE.getSE()->getSCEV(V: Ptr);
2602 if (auto AR = dyn_cast<SCEVAddRecExpr>(Val: PtrScev)) {
2603 const SCEV *Step = AR->getStepRecurrence(SE&: *PSE.getSE());
2604 if (PSE.getSE()->isLoopInvariant(S: Step, L))
2605 continue;
2606 }
2607 }
        LLVM_DEBUG(dbgs() << "Bad stride found, can't "
                             "tail-predicate.\n");
2610 return false;
2611 }
2612 }
2613 }
2614
2615 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2616 return true;
2617}
2618
2619bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
2620 if (!EnableTailPredication) {
2621 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2622 return false;
2623 }
2624
2625 // Creating a predicated vector loop is the first step for generating a
2626 // tail-predicated hardware loop, for which we need the MVE masked
2627 // load/stores instructions:
2628 if (!ST->hasMVEIntegerOps())
2629 return false;
2630
2631 LoopVectorizationLegality *LVL = TFI->LVL;
2632 Loop *L = LVL->getLoop();
2633
2634 // For now, restrict this to single block loops.
2635 if (L->getNumBlocks() > 1) {
2636 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2637 "loop.\n");
2638 return false;
2639 }
2640
2641 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2642
2643 LoopInfo *LI = LVL->getLoopInfo();
2644 HardwareLoopInfo HWLoopInfo(L);
2645 if (!HWLoopInfo.canAnalyze(LI&: *LI)) {
2646 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2647 "analyzable.\n");
2648 return false;
2649 }
2650
2651 AssumptionCache *AC = LVL->getAssumptionCache();
2652 ScalarEvolution *SE = LVL->getScalarEvolution();
2653
2654 // This checks if we have the low-overhead branch architecture
2655 // extension, and if we will create a hardware-loop:
2656 if (!isHardwareLoopProfitable(L, SE&: *SE, AC&: *AC, LibInfo: TFI->TLI, HWLoopInfo)) {
2657 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2658 "profitable.\n");
2659 return false;
2660 }
2661
2662 DominatorTree *DT = LVL->getDominatorTree();
2663 if (!HWLoopInfo.isHardwareLoopCandidate(SE&: *SE, LI&: *LI, DT&: *DT)) {
2664 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2665 "a candidate.\n");
2666 return false;
2667 }
2668
2669 return canTailPredicateLoop(L, LI, SE&: *SE, DL, LAI: LVL->getLAI(),
2670 DT: *LVL->getDominatorTree());
2671}
2672
2673TailFoldingStyle
2674ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
2675 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2676 return TailFoldingStyle::DataWithoutLaneMask;
2677
2678 // Intrinsic @llvm.get.active.lane.mask is supported.
2679 // It is used in the MVETailPredication pass, which requires the number of
2680 // elements processed by this vector loop to setup the tail-predicated
2681 // loop.
2682 return TailFoldingStyle::Data;
}

void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2685 TTI::UnrollingPreferences &UP,
2686 OptimizationRemarkEmitter *ORE) const {
2687 // Enable upper-bound unrolling universally, provided that we do not see an
2688 // active lane mask, which is better kept as a loop to be tail-predicated
2689 // than conditionally unrolled.
2690 UP.UpperBound =
2691 !ST->hasMVEIntegerOps() || !any_of(Range&: *L->getHeader(), P: [](Instruction &I) {
2692 return isa<IntrinsicInst>(Val: I) &&
2693 cast<IntrinsicInst>(Val&: I).getIntrinsicID() ==
2694 Intrinsic::get_active_lane_mask;
2695 });
2696
2697 // Currently, only enable these preferences for M-Class cores.
2698 if (!ST->isMClass())
2699 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2700
2701 // Disable loop unrolling for Oz and Os.
2702 UP.OptSizeThreshold = 0;
2703 UP.PartialOptSizeThreshold = 0;
2704 if (L->getHeader()->getParent()->hasOptSize())
2705 return;
2706
2707 SmallVector<BasicBlock*, 4> ExitingBlocks;
2708 L->getExitingBlocks(ExitingBlocks);
2709 LLVM_DEBUG(dbgs() << "Loop has:\n"
2710 << "Blocks: " << L->getNumBlocks() << "\n"
2711 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2712
2713 // Allow at most one exit other than the latch. This acts as an early exit,
2714 // mirroring the profitability calculation of the runtime unroller.
2715 if (ExitingBlocks.size() > 2)
2716 return;
2717
2718 // Limit the CFG of the loop body for targets with a branch predictor.
2719 // Allowing 4 blocks permits if-then-else diamonds in the body.
2720 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2721 return;
2722
2723 // Don't unroll vectorized loops, including the remainder loop.
2724 if (getBooleanLoopAttribute(TheLoop: L, Name: "llvm.loop.isvectorized"))
2725 return;
2726
2727 // Scan the loop: don't unroll loops with calls as this could prevent
2728 // inlining.
2729 InstructionCost Cost = 0;
2730 for (auto *BB : L->getBlocks()) {
2731 for (auto &I : *BB) {
2732 // Don't unroll vectorized loops. MVE code does not benefit from unrolling
2733 // as much as scalar code does.
2734 if (I.getType()->isVectorTy())
2735 return;
2736
2737 if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I)) {
2738 if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction()) {
2739 if (!isLoweredToCall(F))
2740 continue;
2741 }
2742 return;
2743 }
2744
2745 SmallVector<const Value*, 4> Operands(I.operand_values());
2746 Cost += getInstructionCost(U: &I, Operands,
2747 CostKind: TargetTransformInfo::TCK_SizeAndLatency);
2748 }
2749 }
2750
2751 // On v6-M cores, very few registers are available, so we can easily end up
2752 // spilling and reloading more registers in an unrolled loop. Look at the
2753 // number of LCSSA phis as a rough measure of how many registers will need to
2754 // be live out of the loop, reducing the default unroll count if more than one
2755 // value is needed. In the long run, all of this should be learnt by a
2756 // machine.
2757 unsigned UnrollCount = 4;
2758 if (ST->isThumb1Only()) {
2759 unsigned ExitingValues = 0;
2760 SmallVector<BasicBlock *, 4> ExitBlocks;
2761 L->getExitBlocks(ExitBlocks);
2762 for (auto *Exit : ExitBlocks) {
2763 // Count the number of LCSSA phis. Exclude values coming from GEPs, as
2764 // only the last one is expected to be needed for address operands.
2765 unsigned LiveOuts = count_if(Range: Exit->phis(), P: [](auto &PH) {
2766 return PH.getNumOperands() != 1 ||
2767 !isa<GetElementPtrInst>(PH.getOperand(0));
2768 });
2769 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2770 }
2771 if (ExitingValues)
2772 UnrollCount /= ExitingValues;
2773 if (UnrollCount <= 1)
2774 return;
2775 }
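
// For example, two live-out values (LCSSA phis not fed by a single GEP)
// halve the default count from 4 to 2, and four or more live-outs make this
// hook return without enabling any unrolling at all.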
2776
2777 // For processors with low-overhead branching (LOB), runtime unrolling the
2778 // innermost loop is often detrimental to performance. In these cases the loop
2779 // remainder gets unrolled into a series of compare-and-jump blocks, which in
2780 // deeply nested loops get executed multiple times, negating the benefits of
2781 // LOB. This is particularly noticeable when the loop trip count of the
2782 // innermost loop varies within the outer loop, such as in the case of
2783 // triangular matrix decompositions. In these cases we prefer not to unroll
2784 // the innermost loop, with the intention of executing it as a low-overhead
2785 // loop.
2786 bool Runtime = true;
2787 if (ST->hasLOB()) {
2788 if (SE.hasLoopInvariantBackedgeTakenCount(L)) {
2789 const auto *BETC = SE.getBackedgeTakenCount(L);
2790 auto *Outer = L->getOutermostLoop();
2791 if ((L != Outer && Outer != L->getParentLoop()) ||
2792 (L != Outer && BETC && !SE.isLoopInvariant(S: BETC, L: Outer))) {
2793 Runtime = false;
2794 }
2795 }
2796 }
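
// An illustrative sketch of the triangular-nest case described above:
//
//   for (int i = 0; i < N; i++)
//     for (int j = 0; j < i; j++) // inner trip count varies with i
//       Sum += A[i][j];
//
// The inner loop's backedge-taken count is not invariant in the outer loop,
// so Runtime is set to false and the inner loop is left as a candidate for a
// low-overhead (DLS/LE) hardware loop rather than being runtime-unrolled.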
2797
2798 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2799 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2800
2801 UP.Partial = true;
2802 UP.Runtime = Runtime;
2803 UP.UnrollRemainder = true;
2804 UP.DefaultUnrollRuntimeCount = UnrollCount;
2805 UP.UnrollAndJam = true;
2806 UP.UnrollAndJamInnerLoopThreshold = 60;
2807
2808 // Force-unrolling small loops can be very useful because of the branch-taken
2809 // cost of the backedge.
2810 if (Cost < ArmForceUnrollThreshold)
2811 UP.Force = true;
2812}
2813
2814void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2815 TTI::PeelingPreferences &PP) const {
2816 BaseT::getPeelingPreferences(L, SE, PP);
2817}
2818
2819bool ARMTTIImpl::preferInLoopReduction(RecurKind Kind, Type *Ty) const {
2820 if (!ST->hasMVEIntegerOps())
2821 return false;
2822
2823 unsigned ScalarBits = Ty->getScalarSizeInBits();
2824 switch (Kind) {
2825 case RecurKind::Add:
2826 return ScalarBits <= 64;
2827 default:
2828 return false;
2829 }
2830}
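
// Note on preferInLoopReduction above: in-loop integer add reductions map
// naturally onto MVE's accumulating reductions (e.g. VADDV/VADDLV), which is
// why only adds with scalar sizes of up to 64 bits are preferred in-loop.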
2831
2832bool ARMTTIImpl::preferPredicatedReductionSelect() const {
2833 if (!ST->hasMVEIntegerOps())
2834 return false;
2835 return true;
2836}
2837
2838InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
2839 StackOffset BaseOffset,
2840 bool HasBaseReg, int64_t Scale,
2841 unsigned AddrSpace) const {
2842 TargetLoweringBase::AddrMode AM;
2843 AM.BaseGV = BaseGV;
2844 AM.BaseOffs = BaseOffset.getFixed();
2845 AM.HasBaseReg = HasBaseReg;
2846 AM.Scale = Scale;
2847 AM.ScalableOffset = BaseOffset.getScalable();
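// For instance, Scale == 4 with a base register roughly models an address
// like [r0, r1, lsl #2]; whether that form is legal, and whether it carries
// an extra cost (e.g. a negative scale on cores with FPAO), is decided below.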
2848 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AS: AddrSpace)) {
2849 if (ST->hasFPAO())
2850 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2851 return 0;
2852 }
2853 return InstructionCost::getInvalid();
2854}
2855
2856bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2857 if (Thumb) {
2858 // B.W is available in any Thumb2-supporting target, and also in every
2859 // version of Armv8-M, even Baseline which does not include the rest of
2860 // Thumb2.
2861 return ST->isThumb2() || ST->hasV8MBaselineOps();
2862 } else {
2863 // B is available in all versions of the Arm ISA, so the only question is
2864 // whether that ISA is available at all.
2865 return ST->hasARMOps();
2866 }
2867}
2868
2869/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
2870/// of the vector elements.
2871static bool areExtractExts(Value *Ext1, Value *Ext2) {
2872 using namespace PatternMatch;
2873
2874 auto areExtDoubled = [](Instruction *Ext) {
2875 return Ext->getType()->getScalarSizeInBits() ==
2876 2 * Ext->getOperand(i: 0)->getType()->getScalarSizeInBits();
2877 };
2878
2879 if (!match(V: Ext1, P: m_ZExtOrSExt(Op: m_Value())) ||
2880 !match(V: Ext2, P: m_ZExtOrSExt(Op: m_Value())) ||
2881 !areExtDoubled(cast<Instruction>(Val: Ext1)) ||
2882 !areExtDoubled(cast<Instruction>(Val: Ext2)))
2883 return false;
2884
2885 return true;
2886}
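
// For illustration, the helper above matches operand pairs such as
//
//   %sa = sext <8 x i8> %a to <8 x i16>
//   %sb = sext <8 x i8> %b to <8 x i16>
//   %r  = sub <8 x i16> %sa, %sb
//
// where both extends double the element width, so that once they are sunk
// next to the sub they can fold into a single widening NEON instruction
// (vsubl.s8 in this sketch).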
2887
2888/// Check if sinking \p I's operands to I's basic block is profitable, because
2889/// the operands can be folded into a target instruction, e.g.
2890/// sext/zext can be folded into vsubl.
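/// For the MVE path below, the typical candidate is a splat defined in
/// another block, e.g. (illustrative IR):
///
///   %ins = insertelement <4 x i32> undef, i32 %x, i64 0
///   %splat = shufflevector <4 x i32> %ins, <4 x i32> undef,
///                          <4 x i32> zeroinitializer
///   ...
///   %r = add <4 x i32> %v, %splat
///
/// Sinking the splat next to the add lets it be selected as the scalar
/// operand of a vector-by-scalar MVE instruction such as vadd.i32 q0, q1, r0.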
2891bool ARMTTIImpl::isProfitableToSinkOperands(Instruction *I,
2892 SmallVectorImpl<Use *> &Ops) const {
2893 using namespace PatternMatch;
2894
2895 if (!I->getType()->isVectorTy())
2896 return false;
2897
2898 if (ST->hasNEON()) {
2899 switch (I->getOpcode()) {
2900 case Instruction::Sub:
2901 case Instruction::Add: {
2902 if (!areExtractExts(Ext1: I->getOperand(i: 0), Ext2: I->getOperand(i: 1)))
2903 return false;
2904 Ops.push_back(Elt: &I->getOperandUse(i: 0));
2905 Ops.push_back(Elt: &I->getOperandUse(i: 1));
2906 return true;
2907 }
2908 default:
2909 return false;
2910 }
2911 }
2912
2913 if (!ST->hasMVEIntegerOps())
2914 return false;
2915
2916 auto IsFMSMul = [&](Instruction *I) {
2917 if (!I->hasOneUse())
2918 return false;
2919 auto *Sub = cast<Instruction>(Val: *I->users().begin());
2920 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(i: 1) == I;
2921 };
2922 auto IsFMS = [&](Instruction *I) {
2923 if (match(V: I->getOperand(i: 0), P: m_FNeg(X: m_Value())) ||
2924 match(V: I->getOperand(i: 1), P: m_FNeg(X: m_Value())))
2925 return true;
2926 return false;
2927 };
2928
2929 auto IsSinker = [&](Instruction *I, int Operand) {
2930 switch (I->getOpcode()) {
2931 case Instruction::Add:
2932 case Instruction::Mul:
2933 case Instruction::FAdd:
2934 case Instruction::ICmp:
2935 case Instruction::FCmp:
2936 return true;
2937 case Instruction::FMul:
2938 return !IsFMSMul(I);
2939 case Instruction::Sub:
2940 case Instruction::FSub:
2941 case Instruction::Shl:
2942 case Instruction::LShr:
2943 case Instruction::AShr:
2944 return Operand == 1;
2945 case Instruction::Call:
2946 if (auto *II = dyn_cast<IntrinsicInst>(Val: I)) {
2947 switch (II->getIntrinsicID()) {
2948 case Intrinsic::fma:
2949 return !IsFMS(I);
2950 case Intrinsic::sadd_sat:
2951 case Intrinsic::uadd_sat:
2952 case Intrinsic::arm_mve_add_predicated:
2953 case Intrinsic::arm_mve_mul_predicated:
2954 case Intrinsic::arm_mve_qadd_predicated:
2955 case Intrinsic::arm_mve_vhadd:
2956 case Intrinsic::arm_mve_hadd_predicated:
2957 case Intrinsic::arm_mve_vqdmull:
2958 case Intrinsic::arm_mve_vqdmull_predicated:
2959 case Intrinsic::arm_mve_vqdmulh:
2960 case Intrinsic::arm_mve_qdmulh_predicated:
2961 case Intrinsic::arm_mve_vqrdmulh:
2962 case Intrinsic::arm_mve_qrdmulh_predicated:
2963 case Intrinsic::arm_mve_fma_predicated:
2964 return true;
2965 case Intrinsic::ssub_sat:
2966 case Intrinsic::usub_sat:
2967 case Intrinsic::arm_mve_sub_predicated:
2968 case Intrinsic::arm_mve_qsub_predicated:
2969 case Intrinsic::arm_mve_hsub_predicated:
2970 case Intrinsic::arm_mve_vhsub:
2971 return Operand == 1;
2972 default:
2973 return false;
2974 }
2975 }
2976 return false;
2977 default:
2978 return false;
2979 }
2980 };
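
// For the non-commutative cases above (sub, the shifts, and the
// saturating/halving subtracts), only the second operand can be sunk: the MVE
// vector-by-scalar forms take the scalar in the second source operand, as in
// vsub.i32 q0, q1, r2.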
2981
2982 for (auto OpIdx : enumerate(First: I->operands())) {
2983 Instruction *Op = dyn_cast<Instruction>(Val: OpIdx.value().get());
2984 // Make sure we are not already sinking this operand.
2985 if (!Op || any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op; }))
2986 continue;
2987
2988 Instruction *Shuffle = Op;
2989 if (Shuffle->getOpcode() == Instruction::BitCast)
2990 Shuffle = dyn_cast<Instruction>(Val: Shuffle->getOperand(i: 0));
2991 // We are looking for a splat that can be sunk.
2992 if (!Shuffle || !match(V: Shuffle, P: m_Shuffle(v1: m_InsertElt(Val: m_Undef(), Elt: m_Value(),
2993 Idx: m_ZeroInt()),
2994 v2: m_Undef(), mask: m_ZeroMask())))
2995 continue;
2996 if (!IsSinker(I, OpIdx.index()))
2997 continue;
2998
2999 // All uses of the shuffle should be sunk to avoid duplicating it across GPR
3000 // and vector registers.
3001 for (Use &U : Op->uses()) {
3002 Instruction *Insn = cast<Instruction>(Val: U.getUser());
3003 if (!IsSinker(Insn, U.getOperandNo()))
3004 return false;
3005 }
3006
3007 Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0));
3008 if (Shuffle != Op)
3009 Ops.push_back(Elt: &Op->getOperandUse(i: 0));
3010 Ops.push_back(Elt: &OpIdx.value());
3011 }
3012 return true;
3013}
3014
3015unsigned ARMTTIImpl::getNumBytesToPadGlobalArray(unsigned Size,
3016 Type *ArrayType) const {
3017 if (!UseWidenGlobalArrays) {
3018 LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
3019 return 0;
3020 }
3021
3022 // Don't modify non-integer array types.
3023 if (!ArrayType || !ArrayType->isArrayTy() ||
3024 !ArrayType->getArrayElementType()->isIntegerTy())
3025 return 0;
3026
3027 // We pad to 4-byte boundaries.
3028 if (Size % 4 == 0)
3029 return 0;
3030
3031 unsigned NumBytesToPad = 4 - (Size % 4);
3032 unsigned NewSize = Size + NumBytesToPad;
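
// Worked example: an 11-byte string has Size % 4 == 3, so NumBytesToPad is 1
// and NewSize is 12. If NewSize exceeded the memcpy inlining threshold
// queried below, we would return 0 and leave the array unpadded.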
3033
3034 // Max number of bytes that memcpy allows for lowering to loads/stores before
3035 // it falls back to a library function (__aeabi_memcpy).
3036 unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold();
3037
3038 if (NewSize > MaxMemIntrinsicSize)
3039 return 0;
3040
3041 return NumBytesToPad;
3042}
3043