1 | //===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements a TargetTransformInfo analysis pass specific to the |
10 | // SystemZ target machine. It uses the target's detailed information to provide |
11 | // more precise answers to certain TTI queries, while letting the target |
12 | // independent and default TTI implementations handle the rest. |
13 | // |
14 | //===----------------------------------------------------------------------===// |
15 | |
16 | #include "SystemZTargetTransformInfo.h" |
17 | #include "llvm/Analysis/TargetTransformInfo.h" |
18 | #include "llvm/CodeGen/BasicTTIImpl.h" |
19 | #include "llvm/CodeGen/TargetLowering.h" |
20 | #include "llvm/IR/DerivedTypes.h" |
21 | #include "llvm/IR/InstIterator.h" |
22 | #include "llvm/IR/IntrinsicInst.h" |
23 | #include "llvm/IR/Intrinsics.h" |
24 | #include "llvm/Support/Debug.h" |
25 | #include "llvm/Support/InstructionCost.h" |
26 | #include "llvm/Support/MathExtras.h" |
27 | |
28 | using namespace llvm; |
29 | |
30 | #define DEBUG_TYPE "systemztti" |
31 | |
32 | //===----------------------------------------------------------------------===// |
33 | // |
34 | // SystemZ cost model. |
35 | // |
36 | //===----------------------------------------------------------------------===// |
37 | |
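// Return true if V is used as the source of a non-volatile memcpy, looking
// through bitcasts and GEPs. OtherUse is set to true if V (or a bitcast/GEP
// of it) has any other kind of use.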
static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
  bool UsedAsMemCpySource = false;
  for (const User *U : V->users())
    if (const Instruction *User = dyn_cast<Instruction>(U)) {
      if (isa<BitCastInst>(User) || isa<GetElementPtrInst>(User)) {
        UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse);
        continue;
      }
      if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) {
        if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) {
          UsedAsMemCpySource = true;
          continue;
        }
      }
      OtherUse = true;
    }
  return UsedAsMemCpySource;
}
56 | |
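// Count the number of non-volatile loads and stores of Ptr (looking through
// GEPs) that occur inside function F.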
static void countNumMemAccesses(const Value *Ptr, unsigned &NumStores,
                                unsigned &NumLoads, const Function *F) {
  if (!isa<PointerType>(Ptr->getType()))
    return;
  for (const User *U : Ptr->users())
    if (const Instruction *User = dyn_cast<Instruction>(U)) {
      if (User->getParent()->getParent() == F) {
        if (const auto *SI = dyn_cast<StoreInst>(User)) {
          if (SI->getPointerOperand() == Ptr && !SI->isVolatile())
            NumStores++;
        } else if (const auto *LI = dyn_cast<LoadInst>(User)) {
          if (LI->getPointerOperand() == Ptr && !LI->isVolatile())
            NumLoads++;
        } else if (const auto *GEP = dyn_cast<GetElementPtrInst>(User)) {
          if (GEP->getPointerOperand() == Ptr)
            countNumMemAccesses(GEP, NumStores, NumLoads, F);
        }
      }
    }
}
77 | |
78 | unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const { |
79 | unsigned Bonus = 0; |
80 | const Function *Caller = CB->getParent()->getParent(); |
81 | const Function *Callee = CB->getCalledFunction(); |
82 | if (!Callee) |
83 | return 0; |
84 | |
85 | // Increase the threshold if an incoming argument is used only as a memcpy |
86 | // source. |
87 | for (const Argument &Arg : Callee->args()) { |
88 | bool OtherUse = false; |
89 | if (isUsedAsMemCpySource(V: &Arg, OtherUse) && !OtherUse) { |
90 | Bonus = 1000; |
91 | break; |
92 | } |
93 | } |
94 | |
  // Give a bonus for globals that are used heavily in both the caller and a
  // relatively small callee.
97 | unsigned InstrCount = 0; |
98 | SmallDenseMap<const Value *, unsigned> Ptr2NumUses; |
99 | for (auto &I : instructions(F: Callee)) { |
100 | if (++InstrCount == 200) { |
101 | Ptr2NumUses.clear(); |
102 | break; |
103 | } |
104 | if (const auto *SI = dyn_cast<StoreInst>(Val: &I)) { |
105 | if (!SI->isVolatile()) |
106 | if (auto *GV = dyn_cast<GlobalVariable>(Val: SI->getPointerOperand())) |
107 | Ptr2NumUses[GV]++; |
108 | } else if (const auto *LI = dyn_cast<LoadInst>(Val: &I)) { |
109 | if (!LI->isVolatile()) |
110 | if (auto *GV = dyn_cast<GlobalVariable>(Val: LI->getPointerOperand())) |
111 | Ptr2NumUses[GV]++; |
112 | } else if (const auto *GEP = dyn_cast<GetElementPtrInst>(Val: &I)) { |
113 | if (auto *GV = dyn_cast<GlobalVariable>(Val: GEP->getPointerOperand())) { |
114 | unsigned NumStores = 0, NumLoads = 0; |
115 | countNumMemAccesses(Ptr: GEP, NumStores, NumLoads, F: Callee); |
116 | Ptr2NumUses[GV] += NumLoads + NumStores; |
117 | } |
118 | } |
119 | } |
120 | |
121 | for (auto [Ptr, NumCalleeUses] : Ptr2NumUses) |
122 | if (NumCalleeUses > 10) { |
123 | unsigned CallerStores = 0, CallerLoads = 0; |
124 | countNumMemAccesses(Ptr, NumStores&: CallerStores, NumLoads&: CallerLoads, F: Caller); |
125 | if (CallerStores + CallerLoads > 10) { |
126 | Bonus = 1000; |
127 | break; |
128 | } |
129 | } |
130 | |
131 | // Give bonus when Callee accesses an Alloca of Caller heavily. |
132 | unsigned NumStores = 0; |
133 | unsigned NumLoads = 0; |
134 | for (unsigned OpIdx = 0; OpIdx != Callee->arg_size(); ++OpIdx) { |
135 | Value *CallerArg = CB->getArgOperand(i: OpIdx); |
136 | Argument *CalleeArg = Callee->getArg(i: OpIdx); |
137 | if (isa<AllocaInst>(Val: CallerArg)) |
138 | countNumMemAccesses(Ptr: CalleeArg, NumStores, NumLoads, F: Callee); |
139 | } |
140 | if (NumLoads > 10) |
141 | Bonus += NumLoads * 50; |
142 | if (NumStores > 10) |
143 | Bonus += NumStores * 50; |
  Bonus = std::min(Bonus, unsigned(1000));

  LLVM_DEBUG(if (Bonus)
               dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
148 | return Bonus; |
149 | } |
150 | |
151 | InstructionCost |
152 | SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, |
153 | TTI::TargetCostKind CostKind) const { |
154 | assert(Ty->isIntegerTy()); |
155 | |
156 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
157 | // There is no cost model for constants with a bit size of 0. Return TCC_Free |
158 | // here, so that constant hoisting will ignore this constant. |
159 | if (BitSize == 0) |
160 | return TTI::TCC_Free; |
161 | // No cost model for operations on integers larger than 128 bit implemented yet. |
162 | if ((!ST->hasVector() && BitSize > 64) || BitSize > 128) |
163 | return TTI::TCC_Free; |
164 | |
165 | if (Imm == 0) |
166 | return TTI::TCC_Free; |
167 | |
168 | if (Imm.getBitWidth() <= 64) { |
169 | // Constants loaded via lgfi. |
170 | if (isInt<32>(x: Imm.getSExtValue())) |
171 | return TTI::TCC_Basic; |
172 | // Constants loaded via llilf. |
173 | if (isUInt<32>(x: Imm.getZExtValue())) |
174 | return TTI::TCC_Basic; |
175 | // Constants loaded via llihf: |
176 | if ((Imm.getZExtValue() & 0xffffffff) == 0) |
177 | return TTI::TCC_Basic; |
178 | |
179 | return 2 * TTI::TCC_Basic; |
180 | } |
181 | |
  // i128 immediates are loaded from the constant pool.
183 | return 2 * TTI::TCC_Basic; |
184 | } |
185 | |
186 | InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, |
187 | const APInt &Imm, Type *Ty, |
188 | TTI::TargetCostKind CostKind, |
189 | Instruction *Inst) const { |
190 | assert(Ty->isIntegerTy()); |
191 | |
192 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
193 | // There is no cost model for constants with a bit size of 0. Return TCC_Free |
194 | // here, so that constant hoisting will ignore this constant. |
195 | if (BitSize == 0) |
196 | return TTI::TCC_Free; |
197 | // No cost model for operations on integers larger than 64 bit implemented yet. |
198 | if (BitSize > 64) |
199 | return TTI::TCC_Free; |
200 | |
201 | switch (Opcode) { |
202 | default: |
203 | return TTI::TCC_Free; |
204 | case Instruction::GetElementPtr: |
205 | // Always hoist the base address of a GetElementPtr. This prevents the |
206 | // creation of new constants for every base constant that gets constant |
207 | // folded with the offset. |
208 | if (Idx == 0) |
209 | return 2 * TTI::TCC_Basic; |
210 | return TTI::TCC_Free; |
211 | case Instruction::Store: |
212 | if (Idx == 0 && Imm.getBitWidth() <= 64) { |
      // Any 8-bit immediate store can be implemented via mvi.
      if (BitSize == 8)
        return TTI::TCC_Free;
      // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
      if (isInt<16>(Imm.getSExtValue()))
        return TTI::TCC_Free;
219 | } |
220 | break; |
221 | case Instruction::ICmp: |
222 | if (Idx == 1 && Imm.getBitWidth() <= 64) { |
223 | // Comparisons against signed 32-bit immediates implemented via cgfi. |
224 | if (isInt<32>(x: Imm.getSExtValue())) |
225 | return TTI::TCC_Free; |
226 | // Comparisons against unsigned 32-bit immediates implemented via clgfi. |
227 | if (isUInt<32>(x: Imm.getZExtValue())) |
228 | return TTI::TCC_Free; |
229 | } |
230 | break; |
231 | case Instruction::Add: |
232 | case Instruction::Sub: |
233 | if (Idx == 1 && Imm.getBitWidth() <= 64) { |
234 | // We use algfi/slgfi to add/subtract 32-bit unsigned immediates. |
235 | if (isUInt<32>(x: Imm.getZExtValue())) |
236 | return TTI::TCC_Free; |
237 | // Or their negation, by swapping addition vs. subtraction. |
238 | if (isUInt<32>(x: -Imm.getSExtValue())) |
239 | return TTI::TCC_Free; |
240 | } |
241 | break; |
242 | case Instruction::Mul: |
243 | if (Idx == 1 && Imm.getBitWidth() <= 64) { |
244 | // We use msgfi to multiply by 32-bit signed immediates. |
245 | if (isInt<32>(x: Imm.getSExtValue())) |
246 | return TTI::TCC_Free; |
247 | } |
248 | break; |
249 | case Instruction::Or: |
250 | case Instruction::Xor: |
251 | if (Idx == 1 && Imm.getBitWidth() <= 64) { |
252 | // Masks supported by oilf/xilf. |
253 | if (isUInt<32>(x: Imm.getZExtValue())) |
254 | return TTI::TCC_Free; |
255 | // Masks supported by oihf/xihf. |
256 | if ((Imm.getZExtValue() & 0xffffffff) == 0) |
257 | return TTI::TCC_Free; |
258 | } |
259 | break; |
260 | case Instruction::And: |
261 | if (Idx == 1 && Imm.getBitWidth() <= 64) { |
      // Any 32-bit AND operation can be implemented via nilf.
263 | if (BitSize <= 32) |
264 | return TTI::TCC_Free; |
265 | // 64-bit masks supported by nilf. |
266 | if (isUInt<32>(x: ~Imm.getZExtValue())) |
267 | return TTI::TCC_Free; |
268 | // 64-bit masks supported by nilh. |
269 | if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff) |
270 | return TTI::TCC_Free; |
271 | // Some 64-bit AND operations can be implemented via risbg. |
272 | const SystemZInstrInfo *TII = ST->getInstrInfo(); |
273 | unsigned Start, End; |
274 | if (TII->isRxSBGMask(Mask: Imm.getZExtValue(), BitSize, Start, End)) |
275 | return TTI::TCC_Free; |
276 | } |
277 | break; |
278 | case Instruction::Shl: |
279 | case Instruction::LShr: |
280 | case Instruction::AShr: |
281 | // Always return TCC_Free for the shift value of a shift instruction. |
282 | if (Idx == 1) |
283 | return TTI::TCC_Free; |
284 | break; |
285 | case Instruction::UDiv: |
286 | case Instruction::SDiv: |
287 | case Instruction::URem: |
288 | case Instruction::SRem: |
289 | case Instruction::Trunc: |
290 | case Instruction::ZExt: |
291 | case Instruction::SExt: |
292 | case Instruction::IntToPtr: |
293 | case Instruction::PtrToInt: |
294 | case Instruction::BitCast: |
295 | case Instruction::PHI: |
296 | case Instruction::Call: |
297 | case Instruction::Select: |
298 | case Instruction::Ret: |
299 | case Instruction::Load: |
300 | break; |
301 | } |
302 | |
303 | return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind); |
304 | } |
305 | |
306 | InstructionCost |
307 | SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, |
308 | const APInt &Imm, Type *Ty, |
309 | TTI::TargetCostKind CostKind) const { |
310 | assert(Ty->isIntegerTy()); |
311 | |
312 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
313 | // There is no cost model for constants with a bit size of 0. Return TCC_Free |
314 | // here, so that constant hoisting will ignore this constant. |
315 | if (BitSize == 0) |
316 | return TTI::TCC_Free; |
317 | // No cost model for operations on integers larger than 64 bit implemented yet. |
318 | if (BitSize > 64) |
319 | return TTI::TCC_Free; |
320 | |
321 | switch (IID) { |
322 | default: |
323 | return TTI::TCC_Free; |
324 | case Intrinsic::sadd_with_overflow: |
325 | case Intrinsic::uadd_with_overflow: |
326 | case Intrinsic::ssub_with_overflow: |
327 | case Intrinsic::usub_with_overflow: |
328 | // These get expanded to include a normal addition/subtraction. |
329 | if (Idx == 1 && Imm.getBitWidth() <= 64) { |
330 | if (isUInt<32>(x: Imm.getZExtValue())) |
331 | return TTI::TCC_Free; |
332 | if (isUInt<32>(x: -Imm.getSExtValue())) |
333 | return TTI::TCC_Free; |
334 | } |
335 | break; |
336 | case Intrinsic::smul_with_overflow: |
337 | case Intrinsic::umul_with_overflow: |
338 | // These get expanded to include a normal multiplication. |
339 | if (Idx == 1 && Imm.getBitWidth() <= 64) { |
340 | if (isInt<32>(x: Imm.getSExtValue())) |
341 | return TTI::TCC_Free; |
342 | } |
343 | break; |
344 | case Intrinsic::experimental_stackmap: |
345 | if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue()))) |
346 | return TTI::TCC_Free; |
347 | break; |
348 | case Intrinsic::experimental_patchpoint_void: |
349 | case Intrinsic::experimental_patchpoint: |
350 | if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue()))) |
351 | return TTI::TCC_Free; |
352 | break; |
353 | } |
354 | return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind); |
355 | } |
356 | |
357 | TargetTransformInfo::PopcntSupportKind |
358 | SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) const { |
359 | assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2" ); |
360 | if (ST->hasPopulationCount() && TyWidth <= 64) |
361 | return TTI::PSK_FastHardware; |
362 | return TTI::PSK_Software; |
363 | } |
364 | |
void SystemZTTIImpl::getUnrollingPreferences(
    Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
    OptimizationRemarkEmitter *ORE) const {
368 | // Find out if L contains a call, what the machine instruction count |
369 | // estimate is, and how many stores there are. |
370 | bool HasCall = false; |
371 | InstructionCost NumStores = 0; |
372 | for (auto &BB : L->blocks()) |
373 | for (auto &I : *BB) { |
374 | if (isa<CallInst>(Val: &I) || isa<InvokeInst>(Val: &I)) { |
375 | if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction()) { |
376 | if (isLoweredToCall(F)) |
377 | HasCall = true; |
378 | if (F->getIntrinsicID() == Intrinsic::memcpy || |
379 | F->getIntrinsicID() == Intrinsic::memset) |
380 | NumStores++; |
381 | } else { // indirect call. |
382 | HasCall = true; |
383 | } |
384 | } |
385 | if (isa<StoreInst>(Val: &I)) { |
386 | Type *MemAccessTy = I.getOperand(i: 0)->getType(); |
387 | NumStores += getMemoryOpCost(Opcode: Instruction::Store, Src: MemAccessTy, Alignment: Align(), |
388 | AddressSpace: 0, CostKind: TTI::TCK_RecipThroughput); |
389 | } |
390 | } |
391 | |
392 | // The z13 processor will run out of store tags if too many stores |
393 | // are fed into it too quickly. Therefore make sure there are not |
394 | // too many stores in the resulting unrolled loop. |
395 | unsigned const NumStoresVal = NumStores.getValue(); |
396 | unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX); |
397 | |
398 | if (HasCall) { |
    // If the loop has calls, only allow full unrolling (no partial unrolling),
    // still limited by the store count above.
400 | UP.FullUnrollMaxCount = Max; |
401 | UP.MaxCount = 1; |
402 | return; |
403 | } |
404 | |
405 | UP.MaxCount = Max; |
406 | if (UP.MaxCount <= 1) |
407 | return; |
408 | |
409 | // Allow partial and runtime trip count unrolling. |
410 | UP.Partial = UP.Runtime = true; |
411 | |
412 | UP.PartialThreshold = 75; |
413 | UP.DefaultUnrollRuntimeCount = 4; |
414 | |
415 | // Allow expensive instructions in the pre-header of the loop. |
416 | UP.AllowExpensiveTripCount = true; |
417 | |
418 | UP.Force = true; |
419 | } |
420 | |
421 | void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
422 | TTI::PeelingPreferences &PP) const { |
423 | BaseT::getPeelingPreferences(L, SE, PP); |
424 | } |
425 | |
bool SystemZTTIImpl::isLSRCostLess(
    const TargetTransformInfo::LSRCost &C1,
    const TargetTransformInfo::LSRCost &C2) const {
  // SystemZ specific: check instruction count (first), and don't care about
  // ImmCost, since offsets are checked explicitly.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
                  C1.NumBaseAdds, C1.ScaleCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
                  C2.NumBaseAdds, C2.ScaleCost, C2.SetupCost);
}
438 | |
439 | bool SystemZTTIImpl::areInlineCompatible(const Function *Caller, |
440 | const Function *Callee) const { |
441 | const TargetMachine &TM = getTLI()->getTargetMachine(); |
442 | |
443 | const FeatureBitset &CallerBits = |
444 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); |
445 | const FeatureBitset &CalleeBits = |
446 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); |
447 | |
448 | // Support only equal feature bitsets. Restriction should be relaxed in the |
449 | // future to allow inlining when callee's bits are subset of the caller's. |
450 | return CallerBits == CalleeBits; |
451 | } |
452 | |
453 | unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const { |
454 | bool Vector = (ClassID == 1); |
455 | if (!Vector) |
456 | // Discount the stack pointer. Also leave out %r0, since it can't |
457 | // be used in an address. |
458 | return 14; |
459 | if (ST->hasVector()) |
460 | return 32; |
461 | return 0; |
462 | } |
463 | |
464 | TypeSize |
465 | SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
466 | switch (K) { |
467 | case TargetTransformInfo::RGK_Scalar: |
468 | return TypeSize::getFixed(ExactSize: 64); |
469 | case TargetTransformInfo::RGK_FixedWidthVector: |
470 | return TypeSize::getFixed(ExactSize: ST->hasVector() ? 128 : 0); |
471 | case TargetTransformInfo::RGK_ScalableVector: |
472 | return TypeSize::getScalable(MinimumSize: 0); |
473 | } |
474 | |
475 | llvm_unreachable("Unsupported register kind" ); |
476 | } |
477 | |
478 | unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses, |
479 | unsigned NumStridedMemAccesses, |
480 | unsigned NumPrefetches, |
481 | bool HasCall) const { |
482 | // Don't prefetch a loop with many far apart accesses. |
483 | if (NumPrefetches > 16) |
484 | return UINT_MAX; |
485 | |
486 | // Emit prefetch instructions for smaller strides in cases where we think |
487 | // the hardware prefetcher might not be able to keep up. |
488 | if (NumStridedMemAccesses > 32 && !HasCall && |
489 | (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses) |
490 | return 1; |
491 | |
492 | return ST->hasMiscellaneousExtensions3() ? 8192 : 2048; |
493 | } |
494 | |
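// SystemZ's divide instructions produce both the quotient and the remainder,
// so report native div/rem support for legal scalar integer types.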
bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) const {
  EVT VT = TLI->getValueType(DL, DataType);
  return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}
499 | |
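// Return true if Op is a single-use load whose user is not a store: such an
// element load can be done directly with VLE at no extra cost (a stored
// element is instead better handled with MVC).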
static bool isFreeEltLoad(const Value *Op) {
  if (isa<LoadInst>(Op) && Op->hasOneUse()) {
    const Instruction *UserI = cast<Instruction>(*Op->user_begin());
    return !isa<StoreInst>(UserI); // Prefer MVC
  }
  return false;
}
507 | |
InstructionCost SystemZTTIImpl::getScalarizationOverhead(
    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
    TTI::TargetCostKind CostKind, bool ForPoisonSrc,
    ArrayRef<Value *> VL) const {
  unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
  InstructionCost Cost = 0;

  if (Insert && Ty->isIntOrIntVectorTy(64)) {
    // VLVGP will insert two GPRs with one instruction, while VLE will load
    // an element directly with no extra cost.
    assert((VL.empty() || VL.size() == NumElts) &&
           "Type does not match the number of values.");
    InstructionCost CurrVectorCost = 0;
    for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
      if (DemandedElts[Idx] && !(VL.size() && isFreeEltLoad(VL[Idx])))
        ++CurrVectorCost;
      if (Idx % 2 == 1) {
        Cost += std::min(InstructionCost(1), CurrVectorCost);
        CurrVectorCost = 0;
      }
    }
    Insert = false;
  }

  Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
                                          CostKind, ForPoisonSrc, VL);
  return Cost;
}
536 | |
537 | // Return the bit size for the scalar type or vector element |
538 | // type. getScalarSizeInBits() returns 0 for a pointer type. |
539 | static unsigned getScalarSizeInBits(Type *Ty) { |
540 | unsigned Size = |
541 | (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits()); |
542 | assert(Size > 0 && "Element must have non-zero size." ); |
543 | return Size; |
544 | } |
545 | |
546 | // getNumberOfParts() calls getTypeLegalizationCost() which splits the vector |
547 | // type until it is legal. This would e.g. return 4 for <6 x i64>, instead of |
548 | // 3. |
static unsigned getNumVectorRegs(Type *Ty) {
  auto *VTy = cast<FixedVectorType>(Ty);
  unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
  assert(WideBits > 0 && "Could not compute size of vector");
  return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
}
555 | |
556 | InstructionCost SystemZTTIImpl::getArithmeticInstrCost( |
557 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
558 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, |
559 | ArrayRef<const Value *> Args, const Instruction *CxtI) const { |
560 | |
561 | // TODO: Handle more cost kinds. |
562 | if (CostKind != TTI::TCK_RecipThroughput) |
563 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, |
564 | Opd2Info: Op2Info, Args, CxtI); |
565 | |
566 | // TODO: return a good value for BB-VECTORIZER that includes the |
567 | // immediate loads, which we do not want to count for the loop |
568 | // vectorizer, since they are hopefully hoisted out of the loop. This |
569 | // would require a new parameter 'InLoop', but not sure if constant |
570 | // args are common enough to motivate this. |
571 | |
572 | unsigned ScalarBits = Ty->getScalarSizeInBits(); |
573 | |
  // There are three cases of division and remainder: dividing by a register
  // needs a divide instruction. A divisor which is a power of two constant
  // can be implemented with a sequence of shifts. Any other constant needs a
  // multiply and shifts.
578 | const unsigned DivInstrCost = 20; |
579 | const unsigned DivMulSeqCost = 10; |
580 | const unsigned SDivPow2Cost = 4; |
581 | |
582 | bool SignedDivRem = |
583 | Opcode == Instruction::SDiv || Opcode == Instruction::SRem; |
584 | bool UnsignedDivRem = |
585 | Opcode == Instruction::UDiv || Opcode == Instruction::URem; |
586 | |
587 | // Check for a constant divisor. |
588 | bool DivRemConst = false; |
589 | bool DivRemConstPow2 = false; |
590 | if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) { |
591 | if (const Constant *C = dyn_cast<Constant>(Val: Args[1])) { |
592 | const ConstantInt *CVal = |
593 | (C->getType()->isVectorTy() |
594 | ? dyn_cast_or_null<const ConstantInt>(Val: C->getSplatValue()) |
595 | : dyn_cast<const ConstantInt>(Val: C)); |
596 | if (CVal && (CVal->getValue().isPowerOf2() || |
597 | CVal->getValue().isNegatedPowerOf2())) |
598 | DivRemConstPow2 = true; |
599 | else |
600 | DivRemConst = true; |
601 | } |
602 | } |
603 | |
604 | if (!Ty->isVectorTy()) { |
605 | // These FP operations are supported with a dedicated instruction for |
606 | // float, double and fp128 (base implementation assumes float generally |
607 | // costs 2). |
608 | if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub || |
609 | Opcode == Instruction::FMul || Opcode == Instruction::FDiv) |
610 | return 1; |
611 | |
612 | // There is no native support for FRem. |
613 | if (Opcode == Instruction::FRem) |
614 | return LIBCALL_COST; |
615 | |
616 | // Give discount for some combined logical operations if supported. |
617 | if (Args.size() == 2) { |
618 | if (Opcode == Instruction::Xor) { |
619 | for (const Value *A : Args) { |
620 | if (const Instruction *I = dyn_cast<Instruction>(Val: A)) |
621 | if (I->hasOneUse() && |
622 | (I->getOpcode() == Instruction::Or || |
623 | I->getOpcode() == Instruction::And || |
624 | I->getOpcode() == Instruction::Xor)) |
625 | if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) || |
626 | (isInt128InVR(Ty) && |
627 | (I->getOpcode() == Instruction::Or || ST->hasVectorEnhancements1()))) |
628 | return 0; |
629 | } |
630 | } |
631 | else if (Opcode == Instruction::And || Opcode == Instruction::Or) { |
632 | for (const Value *A : Args) { |
633 | if (const Instruction *I = dyn_cast<Instruction>(Val: A)) |
634 | if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) && |
635 | ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) || |
636 | (isInt128InVR(Ty) && |
637 | (Opcode == Instruction::And || ST->hasVectorEnhancements1())))) |
638 | return 0; |
639 | } |
640 | } |
641 | } |
642 | |
643 | // Or requires one instruction, although it has custom handling for i64. |
644 | if (Opcode == Instruction::Or) |
645 | return 1; |
646 | |
647 | if (Opcode == Instruction::Xor && ScalarBits == 1) { |
648 | if (ST->hasLoadStoreOnCond2()) |
649 | return 5; // 2 * (li 0; loc 1); xor |
650 | return 7; // 2 * ipm sequences ; xor ; shift ; compare |
651 | } |
652 | |
653 | if (DivRemConstPow2) |
654 | return (SignedDivRem ? SDivPow2Cost : 1); |
655 | if (DivRemConst) |
656 | return DivMulSeqCost; |
657 | if (SignedDivRem || UnsignedDivRem) |
658 | return DivInstrCost; |
659 | } |
660 | else if (ST->hasVector()) { |
661 | auto *VTy = cast<FixedVectorType>(Val: Ty); |
662 | unsigned VF = VTy->getNumElements(); |
663 | unsigned NumVectors = getNumVectorRegs(Ty); |
664 | |
665 | // These vector operations are custom handled, but are still supported |
666 | // with one instruction per vector, regardless of element size. |
667 | if (Opcode == Instruction::Shl || Opcode == Instruction::LShr || |
668 | Opcode == Instruction::AShr) { |
669 | return NumVectors; |
670 | } |
671 | |
672 | if (DivRemConstPow2) |
673 | return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1)); |
674 | if (DivRemConst) { |
675 | SmallVector<Type *> Tys(Args.size(), Ty); |
676 | return VF * DivMulSeqCost + |
677 | BaseT::getScalarizationOverhead(RetTy: VTy, Args, Tys, CostKind); |
678 | } |
679 | if (SignedDivRem || UnsignedDivRem) { |
680 | if (ST->hasVectorEnhancements3() && ScalarBits >= 32) |
681 | return NumVectors * DivInstrCost; |
682 | else if (VF > 4) |
683 | // Temporary hack: disable high vectorization factors with integer |
684 | // division/remainder, which will get scalarized and handled with |
685 | // GR128 registers. The mischeduler is not clever enough to avoid |
686 | // spilling yet. |
687 | return 1000; |
688 | } |
689 | |
690 | // These FP operations are supported with a single vector instruction for |
691 | // double (base implementation assumes float generally costs 2). For |
692 | // FP128, the scalar cost is 1, and there is no overhead since the values |
693 | // are already in scalar registers. |
694 | if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub || |
695 | Opcode == Instruction::FMul || Opcode == Instruction::FDiv) { |
696 | switch (ScalarBits) { |
697 | case 32: { |
698 | // The vector enhancements facility 1 provides v4f32 instructions. |
699 | if (ST->hasVectorEnhancements1()) |
700 | return NumVectors; |
701 | // Return the cost of multiple scalar invocation plus the cost of |
702 | // inserting and extracting the values. |
703 | InstructionCost ScalarCost = |
704 | getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind); |
705 | SmallVector<Type *> Tys(Args.size(), Ty); |
706 | InstructionCost Cost = |
707 | (VF * ScalarCost) + |
708 | BaseT::getScalarizationOverhead(RetTy: VTy, Args, Tys, CostKind); |
709 | // FIXME: VF 2 for these FP operations are currently just as |
710 | // expensive as for VF 4. |
711 | if (VF == 2) |
712 | Cost *= 2; |
713 | return Cost; |
714 | } |
715 | case 64: |
716 | case 128: |
717 | return NumVectors; |
718 | default: |
719 | break; |
720 | } |
721 | } |
722 | |
723 | // There is no native support for FRem. |
724 | if (Opcode == Instruction::FRem) { |
725 | SmallVector<Type *> Tys(Args.size(), Ty); |
726 | InstructionCost Cost = |
727 | (VF * LIBCALL_COST) + |
728 | BaseT::getScalarizationOverhead(RetTy: VTy, Args, Tys, CostKind); |
729 | // FIXME: VF 2 for float is currently just as expensive as for VF 4. |
730 | if (VF == 2 && ScalarBits == 32) |
731 | Cost *= 2; |
732 | return Cost; |
733 | } |
734 | } |
735 | |
736 | // Fallback to the default implementation. |
737 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info, |
738 | Args, CxtI); |
739 | } |
740 | |
741 | InstructionCost |
742 | SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, |
743 | VectorType *SrcTy, ArrayRef<int> Mask, |
744 | TTI::TargetCostKind CostKind, int Index, |
745 | VectorType *SubTp, ArrayRef<const Value *> Args, |
746 | const Instruction *CxtI) const { |
747 | Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp); |
748 | if (ST->hasVector()) { |
749 | unsigned NumVectors = getNumVectorRegs(Ty: SrcTy); |
750 | |
751 | // TODO: Since fp32 is expanded, the shuffle cost should always be 0. |
752 | |
753 | // FP128 values are always in scalar registers, so there is no work |
754 | // involved with a shuffle, except for broadcast. In that case register |
755 | // moves are done with a single instruction per element. |
756 | if (SrcTy->getScalarType()->isFP128Ty()) |
757 | return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0); |
758 | |
759 | switch (Kind) { |
760 | case TargetTransformInfo::SK_ExtractSubvector: |
761 | // ExtractSubvector Index indicates start offset. |
762 | |
763 | // Extracting a subvector from first index is a noop. |
764 | return (Index == 0 ? 0 : NumVectors); |
765 | |
766 | case TargetTransformInfo::SK_Broadcast: |
767 | // Loop vectorizer calls here to figure out the extra cost of |
768 | // broadcasting a loaded value to all elements of a vector. Since vlrep |
769 | // loads and replicates with a single instruction, adjust the returned |
770 | // value. |
771 | return NumVectors - 1; |
772 | |
773 | default: |
774 | |
775 | // SystemZ supports single instruction permutation / replication. |
776 | return NumVectors; |
777 | } |
778 | } |
779 | |
780 | return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, |
781 | SubTp); |
782 | } |
783 | |
784 | // Return the log2 difference of the element sizes of the two vector types. |
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
  unsigned Bits0 = Ty0->getScalarSizeInBits();
  unsigned Bits1 = Ty1->getScalarSizeInBits();

  if (Bits1 > Bits0)
    return (Log2_32(Bits1) - Log2_32(Bits0));

  return (Log2_32(Bits0) - Log2_32(Bits1));
}
794 | |
795 | // Return the number of instructions needed to truncate SrcTy to DstTy. |
796 | unsigned SystemZTTIImpl::getVectorTruncCost(Type *SrcTy, Type *DstTy) const { |
797 | assert (SrcTy->isVectorTy() && DstTy->isVectorTy()); |
798 | assert(SrcTy->getPrimitiveSizeInBits().getFixedValue() > |
799 | DstTy->getPrimitiveSizeInBits().getFixedValue() && |
800 | "Packing must reduce size of vector type." ); |
801 | assert(cast<FixedVectorType>(SrcTy)->getNumElements() == |
802 | cast<FixedVectorType>(DstTy)->getNumElements() && |
803 | "Packing should not change number of elements." ); |
804 | |
805 | // TODO: Since fp32 is expanded, the extract cost should always be 0. |
806 | |
807 | unsigned NumParts = getNumVectorRegs(Ty: SrcTy); |
808 | if (NumParts <= 2) |
809 | // Up to 2 vector registers can be truncated efficiently with pack or |
810 | // permute. The latter requires an immediate mask to be loaded, which |
811 | // typically gets hoisted out of a loop. TODO: return a good value for |
812 | // BB-VECTORIZER that includes the immediate loads, which we do not want |
813 | // to count for the loop vectorizer. |
814 | return 1; |
815 | |
816 | unsigned Cost = 0; |
817 | unsigned Log2Diff = getElSizeLog2Diff(Ty0: SrcTy, Ty1: DstTy); |
818 | unsigned VF = cast<FixedVectorType>(Val: SrcTy)->getNumElements(); |
819 | for (unsigned P = 0; P < Log2Diff; ++P) { |
820 | if (NumParts > 1) |
821 | NumParts /= 2; |
822 | Cost += NumParts; |
823 | } |
824 | |
825 | // Currently, a general mix of permutes and pack instructions is output by |
826 | // isel, which follow the cost computation above except for this case which |
827 | // is one instruction less: |
828 | if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 && |
829 | DstTy->getScalarSizeInBits() == 8) |
830 | Cost--; |
831 | |
832 | return Cost; |
833 | } |
834 | |
835 | // Return the cost of converting a vector bitmask produced by a compare |
836 | // (SrcTy), to the type of the select or extend instruction (DstTy). |
837 | unsigned SystemZTTIImpl::getVectorBitmaskConversionCost(Type *SrcTy, |
838 | Type *DstTy) const { |
839 | assert (SrcTy->isVectorTy() && DstTy->isVectorTy() && |
840 | "Should only be called with vector types." ); |
841 | |
842 | unsigned PackCost = 0; |
843 | unsigned SrcScalarBits = SrcTy->getScalarSizeInBits(); |
844 | unsigned DstScalarBits = DstTy->getScalarSizeInBits(); |
845 | unsigned Log2Diff = getElSizeLog2Diff(Ty0: SrcTy, Ty1: DstTy); |
846 | if (SrcScalarBits > DstScalarBits) |
847 | // The bitmask will be truncated. |
848 | PackCost = getVectorTruncCost(SrcTy, DstTy); |
849 | else if (SrcScalarBits < DstScalarBits) { |
850 | unsigned DstNumParts = getNumVectorRegs(Ty: DstTy); |
851 | // Each vector select needs its part of the bitmask unpacked. |
852 | PackCost = Log2Diff * DstNumParts; |
853 | // Extra cost for moving part of mask before unpacking. |
854 | PackCost += DstNumParts - 1; |
855 | } |
856 | |
857 | return PackCost; |
858 | } |
859 | |
860 | // Return the type of the compared operands. This is needed to compute the |
861 | // cost for a Select / ZExt or SExt instruction. |
static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
  Type *OpTy = nullptr;
  if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
    OpTy = CI->getOperand(0)->getType();
  else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
    if (LogicI->getNumOperands() == 2)
      if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
        if (isa<CmpInst>(LogicI->getOperand(1)))
          OpTy = CI0->getOperand(0)->getType();

  if (OpTy != nullptr) {
    if (VF == 1) {
      assert(!OpTy->isVectorTy() && "Expected scalar type");
      return OpTy;
    }
    // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
    // be either scalar or already vectorized with a same or lesser VF.
    Type *ElTy = OpTy->getScalarType();
    return FixedVectorType::get(ElTy, VF);
  }

  return nullptr;
}
885 | |
886 | // Get the cost of converting a boolean vector to a vector with same width |
887 | // and element size as Dst, plus the cost of zero extending if needed. |
888 | unsigned |
889 | SystemZTTIImpl::getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst, |
890 | const Instruction *I) const { |
891 | auto *DstVTy = cast<FixedVectorType>(Val: Dst); |
892 | unsigned VF = DstVTy->getNumElements(); |
893 | unsigned Cost = 0; |
  // If we know the widths of the compared operands, get any cost of
  // converting the bitmask to match Dst. Otherwise assume the same widths.
896 | Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr); |
897 | if (CmpOpTy != nullptr) |
898 | Cost = getVectorBitmaskConversionCost(SrcTy: CmpOpTy, DstTy: Dst); |
899 | if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP) |
900 | // One 'vn' per dst vector with an immediate mask. |
901 | Cost += getNumVectorRegs(Ty: Dst); |
902 | return Cost; |
903 | } |
904 | |
905 | InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, |
906 | Type *Src, |
907 | TTI::CastContextHint CCH, |
908 | TTI::TargetCostKind CostKind, |
909 | const Instruction *I) const { |
910 | // FIXME: Can the logic below also be used for these cost kinds? |
911 | if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) { |
912 | auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); |
913 | return BaseCost == 0 ? BaseCost : 1; |
914 | } |
915 | |
916 | unsigned DstScalarBits = Dst->getScalarSizeInBits(); |
917 | unsigned SrcScalarBits = Src->getScalarSizeInBits(); |
918 | |
919 | if (!Src->isVectorTy()) { |
920 | if (Dst->isVectorTy()) |
921 | return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); |
922 | |
923 | if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) { |
924 | if (Src->isIntegerTy(Bitwidth: 128)) |
925 | return LIBCALL_COST; |
926 | if (SrcScalarBits >= 32 || |
927 | (I != nullptr && isa<LoadInst>(Val: I->getOperand(i: 0)))) |
928 | return 1; |
929 | return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/; |
930 | } |
931 | |
932 | if ((Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) && |
933 | Dst->isIntegerTy(Bitwidth: 128)) |
934 | return LIBCALL_COST; |
935 | |
936 | if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) { |
937 | if (Src->isIntegerTy(Bitwidth: 1)) { |
938 | if (DstScalarBits == 128) { |
939 | if (Opcode == Instruction::SExt && ST->hasVectorEnhancements3()) |
940 | return 0;/*VCEQQ*/ |
941 | return 5 /*branch seq.*/; |
942 | } |
943 | |
944 | if (ST->hasLoadStoreOnCond2()) |
945 | return 2; // li 0; loc 1 |
946 | |
947 | // This should be extension of a compare i1 result, which is done with |
948 | // ipm and a varying sequence of instructions. |
949 | unsigned Cost = 0; |
950 | if (Opcode == Instruction::SExt) |
951 | Cost = (DstScalarBits < 64 ? 3 : 4); |
952 | if (Opcode == Instruction::ZExt) |
953 | Cost = 3; |
954 | Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr); |
955 | if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy()) |
956 | // If operands of an fp-type was compared, this costs +1. |
957 | Cost++; |
958 | return Cost; |
959 | } |
      else if (isInt128InVR(Dst)) {
        // Extensions from GPR to i128 (in VR) typically cost two instructions,
        // but a zero-extending load would be just one extra instruction.
        if (Opcode == Instruction::ZExt && I != nullptr)
          if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
            if (Ld->hasOneUse())
              return 1;
        return 2;
      }
969 | } |
970 | |
971 | if (Opcode == Instruction::Trunc && isInt128InVR(Ty: Src) && I != nullptr) { |
972 | if (LoadInst *Ld = dyn_cast<LoadInst>(Val: I->getOperand(i: 0))) |
973 | if (Ld->hasOneUse()) |
974 | return 0; // Will be converted to GPR load. |
975 | bool OnlyTruncatingStores = true; |
976 | for (const User *U : I->users()) |
977 | if (!isa<StoreInst>(Val: U)) { |
978 | OnlyTruncatingStores = false; |
979 | break; |
980 | } |
981 | if (OnlyTruncatingStores) |
982 | return 0; |
983 | return 2; // Vector element extraction. |
984 | } |
985 | } |
986 | else if (ST->hasVector()) { |
987 | // Vector to scalar cast. |
988 | auto *SrcVecTy = cast<FixedVectorType>(Val: Src); |
989 | auto *DstVecTy = dyn_cast<FixedVectorType>(Val: Dst); |
990 | if (!DstVecTy) { |
991 | // TODO: tune vector-to-scalar cast. |
992 | return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); |
993 | } |
994 | unsigned VF = SrcVecTy->getNumElements(); |
995 | unsigned NumDstVectors = getNumVectorRegs(Ty: Dst); |
996 | unsigned NumSrcVectors = getNumVectorRegs(Ty: Src); |
997 | |
998 | if (Opcode == Instruction::Trunc) { |
999 | if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits()) |
1000 | return 0; // Check for NOOP conversions. |
1001 | return getVectorTruncCost(SrcTy: Src, DstTy: Dst); |
1002 | } |
1003 | |
1004 | if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { |
1005 | if (SrcScalarBits >= 8) { |
1006 | // ZExt will use either a single unpack or a vector permute. |
1007 | if (Opcode == Instruction::ZExt) |
1008 | return NumDstVectors; |
1009 | |
1010 | // SExt will be handled with one unpack per doubling of width. |
1011 | unsigned NumUnpacks = getElSizeLog2Diff(Ty0: Src, Ty1: Dst); |
1012 | |
1013 | // For types that spans multiple vector registers, some additional |
1014 | // instructions are used to setup the unpacking. |
1015 | unsigned NumSrcVectorOps = |
1016 | (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors) |
1017 | : (NumDstVectors / 2)); |
1018 | |
1019 | return (NumUnpacks * NumDstVectors) + NumSrcVectorOps; |
1020 | } |
1021 | else if (SrcScalarBits == 1) |
1022 | return getBoolVecToIntConversionCost(Opcode, Dst, I); |
1023 | } |
1024 | |
1025 | if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP || |
1026 | Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) { |
1027 | // TODO: Fix base implementation which could simplify things a bit here |
1028 | // (seems to miss on differentiating on scalar/vector types). |
1029 | |
1030 | // Only 64 bit vector conversions are natively supported before z15. |
1031 | if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) { |
1032 | if (SrcScalarBits == DstScalarBits) |
1033 | return NumDstVectors; |
1034 | |
1035 | if (SrcScalarBits == 1) |
1036 | return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors; |
1037 | } |
1038 | |
1039 | // Return the cost of multiple scalar invocation plus the cost of |
1040 | // inserting and extracting the values. Base implementation does not |
1041 | // realize float->int gets scalarized. |
1042 | InstructionCost ScalarCost = getCastInstrCost( |
1043 | Opcode, Dst: Dst->getScalarType(), Src: Src->getScalarType(), CCH, CostKind); |
1044 | InstructionCost TotCost = VF * ScalarCost; |
      bool NeedsInserts = true, NeedsExtracts = true;
      // FP128 registers do not get inserted or extracted.
      if (DstScalarBits == 128 &&
          (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
        NeedsInserts = false;
      if (SrcScalarBits == 128 &&
          (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
        NeedsExtracts = false;

      TotCost += BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
                                                 NeedsExtracts, CostKind);
      TotCost += BaseT::getScalarizationOverhead(DstVecTy, NeedsInserts,
                                                 /*Extract*/ false, CostKind);
1058 | |
1059 | // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4. |
1060 | if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32) |
1061 | TotCost *= 2; |
1062 | |
1063 | return TotCost; |
1064 | } |
1065 | |
1066 | if (Opcode == Instruction::FPTrunc) { |
1067 | if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements. |
1068 | return VF /*ldxbr/lexbr*/ + |
1069 | BaseT::getScalarizationOverhead(InTy: DstVecTy, /*Insert*/ true, |
1070 | /*Extract*/ false, CostKind); |
1071 | else // double -> float |
1072 | return VF / 2 /*vledb*/ + std::max(a: 1U, b: VF / 4 /*vperm*/); |
1073 | } |
1074 | |
1075 | if (Opcode == Instruction::FPExt) { |
1076 | if (SrcScalarBits == 32 && DstScalarBits == 64) { |
1077 | // float -> double is very rare and currently unoptimized. Instead of |
1078 | // using vldeb, which can do two at a time, all conversions are |
1079 | // scalarized. |
1080 | return VF * 2; |
1081 | } |
1082 | // -> fp128. VF * lxdb/lxeb + extraction of elements. |
1083 | return VF + BaseT::getScalarizationOverhead(InTy: SrcVecTy, /*Insert*/ false, |
1084 | /*Extract*/ true, CostKind); |
1085 | } |
1086 | } |
1087 | |
1088 | return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); |
1089 | } |
1090 | |
1091 | // Scalar i8 / i16 operations will typically be made after first extending |
1092 | // the operands to i32. |
1093 | static unsigned getOperandsExtensionCost(const Instruction *I) { |
1094 | unsigned ExtCost = 0; |
1095 | for (Value *Op : I->operands()) |
1096 | // A load of i8 or i16 sign/zero extends to i32. |
1097 | if (!isa<LoadInst>(Val: Op) && !isa<ConstantInt>(Val: Op)) |
1098 | ExtCost++; |
1099 | |
1100 | return ExtCost; |
1101 | } |
1102 | |
1103 | InstructionCost SystemZTTIImpl::getCmpSelInstrCost( |
1104 | unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, |
1105 | TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, |
1106 | TTI::OperandValueInfo Op2Info, const Instruction *I) const { |
1107 | if (CostKind != TTI::TCK_RecipThroughput) |
1108 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
1109 | Op1Info, Op2Info); |
1110 | |
1111 | if (!ValTy->isVectorTy()) { |
1112 | switch (Opcode) { |
1113 | case Instruction::ICmp: { |
1114 | // A loaded value compared with 0 with multiple users becomes Load and |
1115 | // Test. The load is then not foldable, so return 0 cost for the ICmp. |
1116 | unsigned ScalarBits = ValTy->getScalarSizeInBits(); |
1117 | if (I != nullptr && (ScalarBits == 32 || ScalarBits == 64)) |
1118 | if (LoadInst *Ld = dyn_cast<LoadInst>(Val: I->getOperand(i: 0))) |
1119 | if (const ConstantInt *C = dyn_cast<ConstantInt>(Val: I->getOperand(i: 1))) |
1120 | if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() && |
1121 | C->isZero()) |
1122 | return 0; |
1123 | |
1124 | unsigned Cost = 1; |
1125 | if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16) |
1126 | Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2); |
1127 | return Cost; |
1128 | } |
1129 | case Instruction::Select: |
1130 | if (ValTy->isFloatingPointTy()) |
1131 | return 4; // No LOC for FP - costs a conditional jump. |
1132 | |
1133 | // When selecting based on an i128 comparison, LOC / VSEL is possible |
1134 | // if i128 comparisons are directly supported. |
1135 | if (I != nullptr) |
1136 | if (ICmpInst *CI = dyn_cast<ICmpInst>(Val: I->getOperand(i: 0))) |
1137 | if (CI->getOperand(i_nocapture: 0)->getType()->isIntegerTy(Bitwidth: 128)) |
1138 | return ST->hasVectorEnhancements3() ? 1 : 4; |
1139 | |
1140 | // Load On Condition / Select Register available, except for i128. |
1141 | return !isInt128InVR(Ty: ValTy) ? 1 : 4; |
1142 | } |
1143 | } |
1144 | else if (ST->hasVector()) { |
1145 | unsigned VF = cast<FixedVectorType>(Val: ValTy)->getNumElements(); |
1146 | |
1147 | // Called with a compare instruction. |
1148 | if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) { |
      unsigned PredicateExtraCost = 0;
      if (I != nullptr) {
        // Some predicates cost one or two extra instructions.
        switch (cast<CmpInst>(I)->getPredicate()) {
1153 | case CmpInst::Predicate::ICMP_NE: |
1154 | case CmpInst::Predicate::ICMP_UGE: |
1155 | case CmpInst::Predicate::ICMP_ULE: |
1156 | case CmpInst::Predicate::ICMP_SGE: |
1157 | case CmpInst::Predicate::ICMP_SLE: |
1158 | PredicateExtraCost = 1; |
1159 | break; |
1160 | case CmpInst::Predicate::FCMP_ONE: |
1161 | case CmpInst::Predicate::FCMP_ORD: |
1162 | case CmpInst::Predicate::FCMP_UEQ: |
1163 | case CmpInst::Predicate::FCMP_UNO: |
1164 | PredicateExtraCost = 2; |
1165 | break; |
1166 | default: |
1167 | break; |
1168 | } |
1169 | } |
1170 | |
1171 | // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of |
1172 | // floats. FIXME: <2 x float> generates same code as <4 x float>. |
1173 | unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1); |
1174 | unsigned NumVecs_cmp = getNumVectorRegs(Ty: ValTy); |
1175 | |
1176 | unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost)); |
1177 | return Cost; |
1178 | } |
1179 | else { // Called with a select instruction. |
1180 | assert (Opcode == Instruction::Select); |
1181 | |
1182 | // We can figure out the extra cost of packing / unpacking if the |
1183 | // instruction was passed and the compare instruction is found. |
1184 | unsigned PackCost = 0; |
1185 | Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr); |
1186 | if (CmpOpTy != nullptr) |
1187 | PackCost = |
1188 | getVectorBitmaskConversionCost(SrcTy: CmpOpTy, DstTy: ValTy); |
1189 | |
1190 | return getNumVectorRegs(Ty: ValTy) /*vsel*/ + PackCost; |
1191 | } |
1192 | } |
1193 | |
1194 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
1195 | Op1Info, Op2Info); |
1196 | } |
1197 | |
1198 | InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, |
1199 | TTI::TargetCostKind CostKind, |
1200 | unsigned Index, |
1201 | const Value *Op0, |
1202 | const Value *Op1) const { |
1203 | if (Opcode == Instruction::InsertElement) { |
1204 | // Vector Element Load. |
1205 | if (Op1 != nullptr && isFreeEltLoad(Op: Op1)) |
1206 | return 0; |
1207 | |
1208 | // vlvgp will insert two grs into a vector register, so count half the |
1209 | // number of instructions as an estimate when we don't have the full |
1210 | // picture (as in getScalarizationOverhead()). |
1211 | if (Val->isIntOrIntVectorTy(BitWidth: 64)) |
1212 | return ((Index % 2 == 0) ? 1 : 0); |
1213 | } |
1214 | |
1215 | if (Opcode == Instruction::ExtractElement) { |
1216 | int Cost = ((getScalarSizeInBits(Ty: Val) == 1) ? 2 /*+test-under-mask*/ : 1); |
1217 | |
1218 | // Give a slight penalty for moving out of vector pipeline to FXU unit. |
1219 | if (Index == 0 && Val->isIntOrIntVectorTy()) |
1220 | Cost += 1; |
1221 | |
1222 | return Cost; |
1223 | } |
1224 | |
1225 | return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1); |
1226 | } |
1227 | |
1228 | // Check if a load may be folded as a memory operand in its user. |
1229 | bool SystemZTTIImpl::isFoldableLoad(const LoadInst *Ld, |
1230 | const Instruction *&FoldedValue) const { |
1231 | if (!Ld->hasOneUse()) |
1232 | return false; |
1233 | FoldedValue = Ld; |
1234 | const Instruction *UserI = cast<Instruction>(Val: *Ld->user_begin()); |
1235 | unsigned LoadedBits = getScalarSizeInBits(Ty: Ld->getType()); |
1236 | unsigned TruncBits = 0; |
1237 | unsigned SExtBits = 0; |
1238 | unsigned ZExtBits = 0; |
1239 | if (UserI->hasOneUse()) { |
1240 | unsigned UserBits = UserI->getType()->getScalarSizeInBits(); |
1241 | if (isa<TruncInst>(Val: UserI)) |
1242 | TruncBits = UserBits; |
1243 | else if (isa<SExtInst>(Val: UserI)) |
1244 | SExtBits = UserBits; |
1245 | else if (isa<ZExtInst>(Val: UserI)) |
1246 | ZExtBits = UserBits; |
1247 | } |
1248 | if (TruncBits || SExtBits || ZExtBits) { |
1249 | FoldedValue = UserI; |
1250 | UserI = cast<Instruction>(Val: *UserI->user_begin()); |
1251 | // Load (single use) -> trunc/extend (single use) -> UserI |
1252 | } |
1253 | if ((UserI->getOpcode() == Instruction::Sub || |
1254 | UserI->getOpcode() == Instruction::SDiv || |
1255 | UserI->getOpcode() == Instruction::UDiv) && |
1256 | UserI->getOperand(i: 1) != FoldedValue) |
1257 | return false; // Not commutative, only RHS foldable. |
1258 | // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an |
1259 | // extension was made of the load. |
1260 | unsigned LoadOrTruncBits = |
1261 | ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits)); |
1262 | switch (UserI->getOpcode()) { |
1263 | case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64 |
1264 | case Instruction::Sub: |
1265 | case Instruction::ICmp: |
1266 | if (LoadedBits == 32 && ZExtBits == 64) |
1267 | return true; |
1268 | [[fallthrough]]; |
1269 | case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64 |
1270 | if (UserI->getOpcode() != Instruction::ICmp) { |
1271 | if (LoadedBits == 16 && |
1272 | (SExtBits == 32 || |
1273 | (SExtBits == 64 && ST->hasMiscellaneousExtensions2()))) |
1274 | return true; |
1275 | if (LoadOrTruncBits == 16) |
1276 | return true; |
1277 | } |
1278 | [[fallthrough]]; |
1279 | case Instruction::SDiv:// SE: 32->64 |
1280 | if (LoadedBits == 32 && SExtBits == 64) |
1281 | return true; |
1282 | [[fallthrough]]; |
1283 | case Instruction::UDiv: |
1284 | case Instruction::And: |
1285 | case Instruction::Or: |
1286 | case Instruction::Xor: |
1287 | // This also makes sense for float operations, but disabled for now due |
1288 | // to regressions. |
1289 | // case Instruction::FCmp: |
1290 | // case Instruction::FAdd: |
1291 | // case Instruction::FSub: |
1292 | // case Instruction::FMul: |
1293 | // case Instruction::FDiv: |
1294 | |
1295 | // All possible extensions of memory checked above. |
1296 | |
1297 | // Comparison between memory and immediate. |
1298 | if (UserI->getOpcode() == Instruction::ICmp) |
1299 | if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: UserI->getOperand(i: 1))) |
1300 | if (CI->getValue().isIntN(N: 16)) |
1301 | return true; |
1302 | return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64); |
1303 | break; |
1304 | } |
1305 | return false; |
1306 | } |
1307 | |
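// Return true if V is a call to the llvm.bswap intrinsic.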
static bool isBswapIntrinsicCall(const Value *V) {
  if (const Instruction *I = dyn_cast<Instruction>(V))
    if (auto *CI = dyn_cast<CallInst>(I))
      if (auto *F = CI->getCalledFunction())
        if (F->getIntrinsicID() == Intrinsic::bswap)
          return true;
  return false;
}
1316 | |
1317 | InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, |
1318 | Align Alignment, |
1319 | unsigned AddressSpace, |
1320 | TTI::TargetCostKind CostKind, |
1321 | TTI::OperandValueInfo OpInfo, |
1322 | const Instruction *I) const { |
1323 | assert(!Src->isVoidTy() && "Invalid type" ); |
1324 | |
1325 | // TODO: Handle other cost kinds. |
1326 | if (CostKind != TTI::TCK_RecipThroughput) |
1327 | return 1; |
1328 | |
1329 | if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) { |
1330 | // Store the load or its truncated or extended value in FoldedValue. |
1331 | const Instruction *FoldedValue = nullptr; |
1332 | if (isFoldableLoad(Ld: cast<LoadInst>(Val: I), FoldedValue)) { |
1333 | const Instruction *UserI = cast<Instruction>(Val: *FoldedValue->user_begin()); |
1334 | assert (UserI->getNumOperands() == 2 && "Expected a binop." ); |
1335 | |
1336 | // UserI can't fold two loads, so in that case return 0 cost only |
1337 | // half of the time. |
1338 | for (unsigned i = 0; i < 2; ++i) { |
1339 | if (UserI->getOperand(i) == FoldedValue) |
1340 | continue; |
1341 | |
1342 | if (Instruction *OtherOp = dyn_cast<Instruction>(Val: UserI->getOperand(i))){ |
1343 | LoadInst *OtherLoad = dyn_cast<LoadInst>(Val: OtherOp); |
1344 | if (!OtherLoad && |
1345 | (isa<TruncInst>(Val: OtherOp) || isa<SExtInst>(Val: OtherOp) || |
1346 | isa<ZExtInst>(Val: OtherOp))) |
1347 | OtherLoad = dyn_cast<LoadInst>(Val: OtherOp->getOperand(i: 0)); |
1348 | if (OtherLoad && isFoldableLoad(Ld: OtherLoad, FoldedValue/*dummy*/)) |
1349 | return i == 0; // Both operands foldable. |
1350 | } |
1351 | } |
1352 | |
1353 | return 0; // Only I is foldable in user. |
1354 | } |
1355 | } |
1356 | |
1357 | // Type legalization (via getNumberOfParts) can't handle structs |
1358 | if (TLI->getValueType(DL, Ty: Src, AllowUnknown: true) == MVT::Other) |
1359 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, |
1360 | CostKind); |
1361 | |
1362 | // FP128 is a legal type but kept in a register pair on older CPUs. |
1363 | if (Src->isFP128Ty() && !ST->hasVectorEnhancements1()) |
1364 | return 2; |
1365 | |
1366 | unsigned NumOps = |
1367 | (Src->isVectorTy() ? getNumVectorRegs(Ty: Src) : getNumberOfParts(Tp: Src)); |
1368 | |
1369 | // Store/Load reversed saves one instruction. |
  if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
      I != nullptr) {
    if (Opcode == Instruction::Load && I->hasOneUse()) {
      const Instruction *LdUser = cast<Instruction>(*I->user_begin());
      // In case of load -> bswap -> store, return normal cost for the load.
      if (isBswapIntrinsicCall(LdUser) &&
          (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
        return 0;
    }
    else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
      const Value *StoredVal = SI->getValueOperand();
      if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
        return 0;
    }
  }

  return NumOps;
}

// The generic implementation of getInterleavedMemoryOpCost() is based on
// adding costs of the memory operations plus all the extracts and inserts
// needed for using / defining the vector operands. The SystemZ version does
// roughly the same but bases the computations on vector permutations
// instead.
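//
// For example, deinterleaving a factor-2 load of <8 x i32> (two 128-bit
// vector registers, both indices used) is modeled as 2 vector loads plus
// 2 permutes, i.e. a total cost of 4.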
InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) const {
  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);
  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
  assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
  unsigned VF = NumElts / Factor;
  unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
  unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
  unsigned NumPermutes = 0;

  if (Opcode == Instruction::Load) {
    // Loading interleave groups may have gaps, which may mean fewer
    // loads. Find out how many vectors are loaded in total, and how many
    // of them each extracted value (index) appears in.
    BitVector UsedInsts(NumVectorMemOps, false);
    std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
    for (unsigned Index : Indices)
      for (unsigned Elt = 0; Elt < VF; ++Elt) {
        unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
        UsedInsts.set(Vec);
        ValueVecs[Index].set(Vec);
      }
    NumVectorMemOps = UsedInsts.count();

    for (unsigned Index : Indices) {
      // Estimate one permute per loaded source vector containing this Index,
      // except that the first vperm producing each destination vector can
      // combine two input registers.
      unsigned NumSrcVecs = ValueVecs[Index].count();
      unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
      assert(NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
      NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
    }
  } else {
    // Estimate the permutes for each stored vector as the smaller of the
    // number of elements and the number of source vectors. Subtract one per
    // destination vector since vperm combines two inputs (see above).
    unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
    unsigned NumDstVecs = NumVectorMemOps;
    NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
  }

  // Cost of load/store operations and the permutations needed.
  return NumVectorMemOps + NumPermutes;
}

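// Cost of an unordered integer add reduction: a binary tree of vector adds
// over the input vector registers, followed by a VSUM-based reduction of the
// final vector (one extra step for element types narrower than 32 bits).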
static InstructionCost getIntAddReductionCost(unsigned NumVec,
                                              unsigned ScalarBits) {
  InstructionCost Cost = 0;
  // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
  Cost += NumVec - 1;
  // For integer adds, VSUM creates shorter reductions on the final vector.
  Cost += (ScalarBits < 32) ? 3 : 2;
  return Cost;
}

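// Cost of the remaining unordered reductions (FAdd, FMul, Mul): a binary tree
// of vector operations over the input registers, plus a shuffle + op pair for
// each halving step of the final vector.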
static InstructionCost getFastReductionCost(unsigned NumVec, unsigned NumElems,
                                            unsigned ScalarBits) {
  unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits);
  InstructionCost Cost = 0;
  // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
  Cost += NumVec - 1;
  // For each shuffle / arithmetic layer, we need 2 instructions, and we need
  // log2(Elements in Last Vector) layers.
  Cost += 2 * Log2_32_Ceil(std::min(NumElems, NumEltsPerVecReg));
  return Cost;
}

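// Reduction opcodes for which getArithmeticReductionCost() below returns a
// custom cost instead of deferring to the generic implementation.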
static bool customCostReductions(unsigned Opcode) {
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::Add || Opcode == Instruction::Mul;
}

InstructionCost
SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                           std::optional<FastMathFlags> FMF,
                                           TTI::TargetCostKind CostKind) const {
  unsigned ScalarBits = Ty->getScalarSizeInBits();
  // The following is only for subtargets with vector math, non-ordered
  // reductions, and reasonable scalar sizes for int and fp add/mul.
  if (customCostReductions(Opcode) && ST->hasVector() &&
      !TTI::requiresOrderedReduction(FMF) &&
      ScalarBits <= SystemZ::VectorBits) {
    unsigned NumVectors = getNumVectorRegs(Ty);
    unsigned NumElems = cast<FixedVectorType>(Ty)->getNumElements();
    // Integer Add uses custom code generation, which needs to be accounted
    // for.
    if (Opcode == Instruction::Add)
      return getIntAddReductionCost(NumVectors, ScalarBits);
    // The base cost is the same across all other arithmetic instructions.
    InstructionCost Cost =
        getFastReductionCost(NumVectors, NumElems, ScalarBits);
    // But we need to account for the final op involving the scalar operand.
    if ((Opcode == Instruction::FAdd) || (Opcode == Instruction::FMul))
      Cost += 1;
    return Cost;
  }
  // Otherwise, fall back to the standard implementation.
  return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
}

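// Min/max reductions are modeled as a binary tree of vector min/max
// instructions over the input registers, followed by (#elements - 1)
// shuffle + min/max pairs on the final vector.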
InstructionCost
SystemZTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                       FastMathFlags FMF,
                                       TTI::TargetCostKind CostKind) const {
  // Return custom costs only on subtargets with vector enhancements.
  if (ST->hasVectorEnhancements1()) {
    unsigned NumVectors = getNumVectorRegs(Ty);
    unsigned NumElems = cast<FixedVectorType>(Ty)->getNumElements();
    unsigned ScalarBits = Ty->getScalarSizeInBits();
    InstructionCost Cost = 0;
    // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
    Cost += NumVectors - 1;
    // For the final vector, we need shuffle + min/max operations, and
    // we need #Elements - 1 of them.
    Cost += 2 * (std::min(NumElems, SystemZ::VectorBits / ScalarBits) - 1);
    return Cost;
  }
  // For other subtargets, fall back to the standard implementation.
  return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
}

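// Return the cost of a vector intrinsic with a known SystemZ lowering
// (currently only vector bswap, which becomes one VPERM per vector register),
// or -1 to defer to the generic implementation.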
static int
getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                            const SmallVectorImpl<Type *> &ParamTys) {
  if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
    return getNumVectorRegs(RetTy); // VPERM

  return -1;
}

InstructionCost
SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) const {
  InstructionCost Cost = getVectorIntrinsicInstrCost(
      ICA.getID(), ICA.getReturnType(), ICA.getArgTypes());
  if (Cost != -1)
    return Cost;
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

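// For example, @llvm.vector.reduce.add.v4i32 on a full 128-bit vector is kept
// as an intrinsic (the backend handles it directly), while i64 element types
// and partial vectors are expanded by the generic code.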
bool SystemZTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Always expand on Subtargets without vector instructions.
  if (!ST->hasVector())
    return true;

  // Whether or not to expand is a per-intrinsic decision.
  switch (II->getIntrinsicID()) {
  default:
    return true;
  // Do not expand vector.reduce.add...
  case Intrinsic::vector_reduce_add:
    auto *VType = cast<FixedVectorType>(II->getOperand(0)->getType());
    // ...unless the scalar size is i64 or larger,
    // or the operand vector is not full, since the
    // performance benefit is dubious in those cases.
    return VType->getScalarSizeInBits() >= 64 ||
           VType->getPrimitiveSizeInBits() < SystemZ::VectorBits;
  }
}
