SystemZTargetTransformInfo.cpp source code [llvm_projects/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp]

1	//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file implements a TargetTransformInfo analysis pass specific to the
10	// SystemZ target machine. It uses the target's detailed information to provide
11	// more precise answers to certain TTI queries, while letting the target
12	// independent and default TTI implementations handle the rest.
13	//
14	//===----------------------------------------------------------------------===//
15
16	#include "SystemZTargetTransformInfo.h"
17	#include "llvm/Analysis/TargetTransformInfo.h"
18	#include "llvm/CodeGen/BasicTTIImpl.h"
19	#include "llvm/CodeGen/TargetLowering.h"
20	#include "llvm/IR/DerivedTypes.h"
21	#include "llvm/IR/InstIterator.h"
22	#include "llvm/IR/IntrinsicInst.h"
23	#include "llvm/IR/Intrinsics.h"
24	#include "llvm/Support/Debug.h"
25	#include "llvm/Support/InstructionCost.h"
26	#include "llvm/Support/MathExtras.h"
27
28	using namespace llvm;
29
30	#define DEBUG_TYPE "systemztti"
31
32	//===----------------------------------------------------------------------===//
33	//
34	// SystemZ cost model.
35	//
36	//===----------------------------------------------------------------------===//
37
38	static bool isUsedAsMemCpySource(const Value V, bool* &OtherUse) {
39	bool UsedAsMemCpySource = false;
40	for (const User *U : V->users())
41	if (const Instruction *User = dyn_cast<Instruction>(Val: U)) {
42	if (isa<BitCastInst>(Val: User) \|\| isa<GetElementPtrInst>(Val: User)) {
43	UsedAsMemCpySource \|= isUsedAsMemCpySource(V: User, OtherUse);
44	continue;
45	}
46	if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(Val: User)) {
47	if (Memcpy->getOperand(i_nocapture: `1`) == V && !Memcpy->isVolatile()) {
48	UsedAsMemCpySource = true;
49	continue;
50	}
51	}
52	OtherUse = true;
53	}
54	return UsedAsMemCpySource;
55	}
56
57	static void countNumMemAccesses(const Value Ptr, unsigned* &NumStores,
58	unsigned &NumLoads, const Function *F) {
59	if (!isa<PointerType>(Val: Ptr->getType()))
60	return;
61	for (const User *U : Ptr->users())
62	if (const Instruction *User = dyn_cast<Instruction>(Val: U)) {
63	if (User->getParent()->getParent() == F) {
64	if (const auto *SI = dyn_cast<StoreInst>(Val: User)) {
65	if (SI->getPointerOperand() == Ptr && !SI->isVolatile())
66	NumStores++;
67	} else if (const auto *LI = dyn_cast<LoadInst>(Val: User)) {
68	if (LI->getPointerOperand() == Ptr && !LI->isVolatile())
69	NumLoads++;
70	} else if (const auto *GEP = dyn_cast<GetElementPtrInst>(Val: User)) {
71	if (GEP->getPointerOperand() == Ptr)
72	countNumMemAccesses(Ptr: GEP, NumStores, NumLoads, F);
73	}
74	}
75	}
76	}
77
78	unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase CB) const* {
79	unsigned Bonus = `0`;
80	const Function *Caller = CB->getParent()->getParent();
81	const Function *Callee = CB->getCalledFunction();
82	if (!Callee)
83	return `0`;
84
85	// Increase the threshold if an incoming argument is used only as a memcpy
86	// source.
87	for (const Argument &Arg : Callee->args()) {
88	bool OtherUse = false;
89	if (isUsedAsMemCpySource(V: &Arg, OtherUse) && !OtherUse) {
90	Bonus = `1000`;
91	break;
92	}
93	}
94
95	// Give bonus for globals used much in both caller and a relatively small
96	// callee.
97	unsigned InstrCount = `0`;
98	SmallDenseMap<const Value , unsigned*> Ptr2NumUses;
99	for (auto &I : instructions(F: Callee)) {
100	if (++InstrCount == `200`) {
101	Ptr2NumUses.clear();
102	break;
103	}
104	if (const auto *SI = dyn_cast<StoreInst>(Val: &I)) {
105	if (!SI->isVolatile())
106	if (auto *GV = dyn_cast<GlobalVariable>(Val: SI->getPointerOperand()))
107	Ptr2NumUses [GV]++;
108	} else if (const auto *LI = dyn_cast<LoadInst>(Val: &I)) {
109	if (!LI->isVolatile())
110	if (auto *GV = dyn_cast<GlobalVariable>(Val: LI->getPointerOperand()))
111	Ptr2NumUses [GV]++;
112	} else if (const auto *GEP = dyn_cast<GetElementPtrInst>(Val: &I)) {
113	if (auto *GV = dyn_cast<GlobalVariable>(Val: GEP->getPointerOperand())) {
114	unsigned NumStores = `0`, NumLoads = `0`;
115	countNumMemAccesses(Ptr: GEP, NumStores, NumLoads, F: Callee);
116	Ptr2NumUses [GV] += NumLoads + NumStores;
117	}
118	}
119	}
120
121	for (auto [Ptr, NumCalleeUses] : Ptr2NumUses)
122	if (NumCalleeUses > `10`) {
123	unsigned CallerStores = `0`, CallerLoads = `0`;
124	countNumMemAccesses(Ptr, NumStores&: CallerStores, NumLoads&: CallerLoads, F: Caller);
125	if (CallerStores + CallerLoads > `10`) {
126	Bonus = `1000`;
127	break;
128	}
129	}
130
131	// Give bonus when Callee accesses an Alloca of Caller heavily.
132	unsigned NumStores = `0`;
133	unsigned NumLoads = `0`;
134	for (unsigned OpIdx = `0`; OpIdx != Callee->arg_size(); ++OpIdx) {
135	Value *CallerArg = CB->getArgOperand(i: OpIdx);
136	Argument *CalleeArg = Callee->getArg(i: OpIdx);
137	if (isa<AllocaInst>(Val: CallerArg))
138	countNumMemAccesses(Ptr: CalleeArg, NumStores, NumLoads, F: Callee);
139	}
140	if (NumLoads > `10`)
141	Bonus += NumLoads * `50`;
142	if (NumStores > `10`)
143	Bonus += NumStores * `50`;
144	Bonus = std::min(a: Bonus, b: unsigned(`1000`));
145
146	LLVM_DEBUG(if (Bonus)
147	dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
148	return Bonus;
149	}
150
151	InstructionCost
152	SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
153	TTI::TargetCostKind CostKind) const {
154	assert(Ty->isIntegerTy());
155
156	unsigned BitSize = Ty->getPrimitiveSizeInBits();
157	// There is no cost model for constants with a bit size of 0. Return TCC_Free
158	// here, so that constant hoisting will ignore this constant.
159	if (BitSize == `0`)
160	return TTI::TCC_Free;
161	// No cost model for operations on integers larger than 128 bit implemented yet.
162	if ((!ST->hasVector() && BitSize > `64`) \|\| BitSize > `128`)
163	return TTI::TCC_Free;
164
165	if (Imm == `0`)
166	return TTI::TCC_Free;
167
168	if (Imm.getBitWidth() <= `64`) {
169	// Constants loaded via lgfi.
170	if (isInt<`32`>(x: Imm.getSExtValue()))
171	return TTI::TCC_Basic;
172	// Constants loaded via llilf.
173	if (isUInt<`32`>(x: Imm.getZExtValue()))
174	return TTI::TCC_Basic;
175	// Constants loaded via llihf:
176	if ((Imm.getZExtValue() & `0xffffffff`) == `0`)
177	return TTI::TCC_Basic;
178
179	return `2` * TTI::TCC_Basic;
180	}
181
182	// i128 immediates loads from Constant Pool
183	return `2` * TTI::TCC_Basic;
184	}
185
186	InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
187	const APInt &Imm, Type *Ty,
188	TTI::TargetCostKind CostKind,
189	Instruction Inst) const* {
190	assert(Ty->isIntegerTy());
191
192	unsigned BitSize = Ty->getPrimitiveSizeInBits();
193	// There is no cost model for constants with a bit size of 0. Return TCC_Free
194	// here, so that constant hoisting will ignore this constant.
195	if (BitSize == `0`)
196	return TTI::TCC_Free;
197	// No cost model for operations on integers larger than 64 bit implemented yet.
198	if (BitSize > `64`)
199	return TTI::TCC_Free;
200
201	switch (Opcode) {
202	default:
203	return TTI::TCC_Free;
204	case Instruction::GetElementPtr:
205	// Always hoist the base address of a GetElementPtr. This prevents the
206	// creation of new constants for every base constant that gets constant
207	// folded with the offset.
208	if (Idx == `0`)
209	return `2` * TTI::TCC_Basic;
210	return TTI::TCC_Free;
211	case Instruction::Store:
212	if (Idx == `0` && Imm.getBitWidth() <= `64`) {
213	// Any 8-bit immediate store can by implemented via mvi.
214	if (BitSize == `8`)
215	return TTI::TCC_Free;
216	// 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
217	if (isInt<`16`>(x: Imm.getSExtValue()))
218	return TTI::TCC_Free;
219	}
220	break;
221	case Instruction::ICmp:
222	if (Idx == `1` && Imm.getBitWidth() <= `64`) {
223	// Comparisons against signed 32-bit immediates implemented via cgfi.
224	if (isInt<`32`>(x: Imm.getSExtValue()))
225	return TTI::TCC_Free;
226	// Comparisons against unsigned 32-bit immediates implemented via clgfi.
227	if (isUInt<`32`>(x: Imm.getZExtValue()))
228	return TTI::TCC_Free;
229	}
230	break;
231	case Instruction::Add:
232	case Instruction::Sub:
233	if (Idx == `1` && Imm.getBitWidth() <= `64`) {
234	// We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
235	if (isUInt<`32`>(x: Imm.getZExtValue()))
236	return TTI::TCC_Free;
237	// Or their negation, by swapping addition vs. subtraction.
238	if (isUInt<`32`>(x: -Imm.getSExtValue()))
239	return TTI::TCC_Free;
240	}
241	break;
242	case Instruction::Mul:
243	if (Idx == `1` && Imm.getBitWidth() <= `64`) {
244	// We use msgfi to multiply by 32-bit signed immediates.
245	if (isInt<`32`>(x: Imm.getSExtValue()))
246	return TTI::TCC_Free;
247	}
248	break;
249	case Instruction::Or:
250	case Instruction::Xor:
251	if (Idx == `1` && Imm.getBitWidth() <= `64`) {
252	// Masks supported by oilf/xilf.
253	if (isUInt<`32`>(x: Imm.getZExtValue()))
254	return TTI::TCC_Free;
255	// Masks supported by oihf/xihf.
256	if ((Imm.getZExtValue() & `0xffffffff`) == `0`)
257	return TTI::TCC_Free;
258	}
259	break;
260	case Instruction::And:
261	if (Idx == `1` && Imm.getBitWidth() <= `64`) {
262	// Any 32-bit AND operation can by implemented via nilf.
263	if (BitSize <= `32`)
264	return TTI::TCC_Free;
265	// 64-bit masks supported by nilf.
266	if (isUInt<`32`>(x: ~Imm.getZExtValue()))
267	return TTI::TCC_Free;
268	// 64-bit masks supported by nilh.
269	if ((Imm.getZExtValue() & `0xffffffff`) == `0xffffffff`)
270	return TTI::TCC_Free;
271	// Some 64-bit AND operations can be implemented via risbg.
272	const SystemZInstrInfo *TII = ST->getInstrInfo();
273	unsigned Start, End;
274	if (TII->isRxSBGMask(Mask: Imm.getZExtValue(), BitSize, Start, End))
275	return TTI::TCC_Free;
276	}
277	break;
278	case Instruction::Shl:
279	case Instruction::LShr:
280	case Instruction::AShr:
281	// Always return TCC_Free for the shift value of a shift instruction.
282	if (Idx == `1`)
283	return TTI::TCC_Free;
284	break;
285	case Instruction::UDiv:
286	case Instruction::SDiv:
287	case Instruction::URem:
288	case Instruction::SRem:
289	case Instruction::Trunc:
290	case Instruction::ZExt:
291	case Instruction::SExt:
292	case Instruction::IntToPtr:
293	case Instruction::PtrToInt:
294	case Instruction::BitCast:
295	case Instruction::PHI:
296	case Instruction::Call:
297	case Instruction::Select:
298	case Instruction::Ret:
299	case Instruction::Load:
300	break;
301	}
302
303	return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
304	}
305
306	InstructionCost
307	SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
308	const APInt &Imm, Type *Ty,
309	TTI::TargetCostKind CostKind) const {
310	assert(Ty->isIntegerTy());
311
312	unsigned BitSize = Ty->getPrimitiveSizeInBits();
313	// There is no cost model for constants with a bit size of 0. Return TCC_Free
314	// here, so that constant hoisting will ignore this constant.
315	if (BitSize == `0`)
316	return TTI::TCC_Free;
317	// No cost model for operations on integers larger than 64 bit implemented yet.
318	if (BitSize > `64`)
319	return TTI::TCC_Free;
320
321	switch (IID) {
322	default:
323	return TTI::TCC_Free;
324	case Intrinsic::sadd_with_overflow:
325	case Intrinsic::uadd_with_overflow:
326	case Intrinsic::ssub_with_overflow:
327	case Intrinsic::usub_with_overflow:
328	// These get expanded to include a normal addition/subtraction.
329	if (Idx == `1` && Imm.getBitWidth() <= `64`) {
330	if (isUInt<`32`>(x: Imm.getZExtValue()))
331	return TTI::TCC_Free;
332	if (isUInt<`32`>(x: -Imm.getSExtValue()))
333	return TTI::TCC_Free;
334	}
335	break;
336	case Intrinsic::smul_with_overflow:
337	case Intrinsic::umul_with_overflow:
338	// These get expanded to include a normal multiplication.
339	if (Idx == `1` && Imm.getBitWidth() <= `64`) {
340	if (isInt<`32`>(x: Imm.getSExtValue()))
341	return TTI::TCC_Free;
342	}
343	break;
344	case Intrinsic::experimental_stackmap:
345	if ((Idx < `2`) \|\| (Imm.getBitWidth() <= `64` && isInt<`64`>(x: Imm.getSExtValue())))
346	return TTI::TCC_Free;
347	break;
348	case Intrinsic::experimental_patchpoint_void:
349	case Intrinsic::experimental_patchpoint:
350	if ((Idx < `4`) \|\| (Imm.getBitWidth() <= `64` && isInt<`64`>(x: Imm.getSExtValue())))
351	return TTI::TCC_Free;
352	break;
353	}
354	return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
355	}
356
357	TargetTransformInfo::PopcntSupportKind
358	SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) const {
359	assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
360	if (ST->hasPopulationCount() && TyWidth <= `64`)
361	return TTI::PSK_FastHardware;
362	return TTI::PSK_Software;
363	}
364
365	void SystemZTTIImpl::getUnrollingPreferences(
366	Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
367	OptimizationRemarkEmitter ORE) const* {
368	// Find out if L contains a call, what the machine instruction count
369	// estimate is, and how many stores there are.
370	bool HasCall = false;
371	InstructionCost NumStores = `0`;
372	for (auto &BB : L->blocks())
373	for (auto &I : *BB) {
374	if (isa<CallInst>(Val: &I) \|\| isa<InvokeInst>(Val: &I)) {
375	if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction()) {
376	if (isLoweredToCall(F))
377	HasCall = true;
378	if (F->getIntrinsicID() == Intrinsic::memcpy \|\|
379	F->getIntrinsicID() == Intrinsic::memset)
380	NumStores ++;
381	} else { // indirect call.
382	HasCall = true;
383	}
384	}
385	if (isa<StoreInst>(Val: &I)) {
386	Type *MemAccessTy = I.getOperand(i: `0`)->getType();
387	NumStores += getMemoryOpCost(Opcode: Instruction::Store, Src: MemAccessTy, Alignment: Align (),
388	AddressSpace: `0`, CostKind: TTI::TCK_RecipThroughput);
389	}
390	}
391
392	// The z13 processor will run out of store tags if too many stores
393	// are fed into it too quickly. Therefore make sure there are not
394	// too many stores in the resulting unrolled loop.
395	unsigned const NumStoresVal = NumStores.getValue();
396	unsigned const Max = (NumStoresVal ? (`12` / NumStoresVal) : UINT_MAX);
397
398	if (HasCall) {
399	// Only allow full unrolling if loop has any calls.
400	UP.FullUnrollMaxCount = Max;
401	UP.MaxCount = `1`;
402	return;
403	}
404
405	UP.MaxCount = Max;
406	if (UP.MaxCount <= `1`)
407	return;
408
409	// Allow partial and runtime trip count unrolling.
410	UP.Partial = UP.Runtime = true;
411
412	UP.PartialThreshold = `75`;
413	UP.DefaultUnrollRuntimeCount = `4`;
414
415	// Allow expensive instructions in the pre-header of the loop.
416	UP.AllowExpensiveTripCount = true;
417
418	UP.Force = true;
419	}
420
421	void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
422	TTI::PeelingPreferences &PP) const {
423	BaseT::getPeelingPreferences(L, SE, PP);
424	}
425
426	bool SystemZTTIImpl::isLSRCostLess(
427	const TargetTransformInfo::LSRCost &C1,
428	const TargetTransformInfo::LSRCost &C2) const {
429	// SystemZ specific: check instruction count (first), and don't care about
430	// ImmCost, since offsets are checked explicitly.
431	return std::tie(args: C1.Insns, args: C1.NumRegs, args: C1.AddRecCost,
432	args: C1.NumIVMuls, args: C1.NumBaseAdds,
433	args: C1.ScaleCost, args: C1.SetupCost) <
434	std::tie(args: C2.Insns, args: C2.NumRegs, args: C2.AddRecCost,
435	args: C2.NumIVMuls, args: C2.NumBaseAdds,
436	args: C2.ScaleCost, args: C2.SetupCost);
437	}
438
439	unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
440	bool Vector = (ClassID == `1`);
441	if (!Vector)
442	// Discount the stack pointer. Also leave out %r0, since it can't
443	// be used in an address.
444	return `14`;
445	if (ST->hasVector())
446	return `32`;
447	return `0`;
448	}
449
450	TypeSize
451	SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
452	switch (K) {
453	case TargetTransformInfo::RGK_Scalar:
454	return TypeSize::getFixed(ExactSize: `64`);
455	case TargetTransformInfo::RGK_FixedWidthVector:
456	return TypeSize::getFixed(ExactSize: ST->hasVector() ? `128` : `0`);
457	case TargetTransformInfo::RGK_ScalableVector:
458	return TypeSize::getScalable(MinimumSize: `0`);
459	}
460
461	llvm_unreachable("Unsupported register kind");
462	}
463
464	unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
465	unsigned NumStridedMemAccesses,
466	unsigned NumPrefetches,
467	bool HasCall) const {
468	// Don't prefetch a loop with many far apart accesses.
469	if (NumPrefetches > `16`)
470	return UINT_MAX;
471
472	// Emit prefetch instructions for smaller strides in cases where we think
473	// the hardware prefetcher might not be able to keep up.
474	if (NumStridedMemAccesses > `32` && !HasCall &&
475	(NumMemAccesses - NumStridedMemAccesses) * `32` <= NumStridedMemAccesses)
476	return `1`;
477
478	return ST->hasMiscellaneousExtensions3() ? `8192` : `2048`;
479	}
480
481	bool SystemZTTIImpl::hasDivRemOp(Type DataType, bool* IsSigned) const {
482	EVT VT = TLI->getValueType(DL, Ty: DataType);
483	return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
484	}
485
486	static bool isFreeEltLoad(const Value *Op) {
487	if (isa<LoadInst>(Val: Op) && Op->hasOneUse()) {
488	const Instruction UserI = cast<Instruction>(Val: Op->user_begin());
489	return !isa<StoreInst>(Val: UserI); // Prefer MVC
490	}
491	return false;
492	}
493
494	InstructionCost SystemZTTIImpl::getScalarizationOverhead(
495	VectorType Ty, const* APInt &DemandedElts, bool Insert, bool Extract,
496	TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
497	TTI::VectorInstrContext VIC) const {
498	unsigned NumElts = cast<FixedVectorType>(Val: Ty)->getNumElements();
499	InstructionCost Cost = `0`;
500
501	if (Insert && Ty->isIntOrIntVectorTy(BitWidth: `64`)) {
502	// VLVGP will insert two GPRs with one instruction, while VLE will load
503	// an element directly with no extra cost
504	assert((VL.empty() \|\| VL.size() == NumElts) &&
505	"Type does not match the number of values.");
506	InstructionCost CurrVectorCost = `0`;
507	for (unsigned Idx = `0`; Idx < NumElts; ++Idx) {
508	if (DemandedElts [Idx] && !(VL.size() && isFreeEltLoad(Op: VL [Idx])))
509	++CurrVectorCost;
510	if (Idx % `2` == `1`) {
511	Cost += std::min(a: InstructionCost (`1`), b: CurrVectorCost);
512	CurrVectorCost = `0`;
513	}
514	}
515	Insert = false;
516	}
517
518	Cost += BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, Insert, Extract,
519	CostKind, ForPoisonSrc, VL);
520	return Cost;
521	}
522
523	// Return the bit size for the scalar type or vector element
524	// type. getScalarSizeInBits() returns 0 for a pointer type.
525	static unsigned getScalarSizeInBits(Type *Ty) {
526	unsigned Size =
527	(Ty->isPtrOrPtrVectorTy() ? `64U` : Ty->getScalarSizeInBits());
528	assert(Size > `0` && "Element must have non-zero size.");
529	return Size;
530	}
531
532	// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
533	// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
534	// 3.
535	static unsigned getNumVectorRegs(Type *Ty) {
536	auto *VTy = cast<FixedVectorType>(Val: Ty);
537	unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
538	assert(WideBits > `0` && "Could not compute size of vector");
539	return ((WideBits % `128U`) ? ((WideBits / `128U`) + `1`) : (WideBits / `128U`));
540	}
541
542	InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
543	unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
544	TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
545	ArrayRef<const Value > Args, const* Instruction CxtI) const* {
546
547	// TODO: Handle more cost kinds.
548	if (CostKind != TTI::TCK_RecipThroughput)
549	return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
550	Opd2Info: Op2Info, Args, CxtI);
551
552	// TODO: return a good value for BB-VECTORIZER that includes the
553	// immediate loads, which we do not want to count for the loop
554	// vectorizer, since they are hopefully hoisted out of the loop. This
555	// would require a new parameter 'InLoop', but not sure if constant
556	// args are common enough to motivate this.
557
558	unsigned ScalarBits = Ty->getScalarSizeInBits();
559
560	// There are thre cases of division and remainder: Dividing with a register
561	// needs a divide instruction. A divisor which is a power of two constant
562	// can be implemented with a sequence of shifts. Any other constant needs a
563	// multiply and shifts.
564	const unsigned DivInstrCost = `20`;
565	const unsigned DivMulSeqCost = `10`;
566	const unsigned SDivPow2Cost = `4`;
567
568	bool SignedDivRem =
569	Opcode == Instruction::SDiv \|\| Opcode == Instruction::SRem;
570	bool UnsignedDivRem =
571	Opcode == Instruction::UDiv \|\| Opcode == Instruction::URem;
572
573	// Check for a constant divisor.
574	bool DivRemConst = false;
575	bool DivRemConstPow2 = false;
576	if ((SignedDivRem \|\| UnsignedDivRem) && Args.size() == `2`) {
577	if (const Constant *C = dyn_cast<Constant>(Val: Args [`1`])) {
578	const ConstantInt *CVal =
579	(C->getType()->isVectorTy()
580	? dyn_cast_or_null<const ConstantInt>(Val: C->getSplatValue())
581	: dyn_cast<const ConstantInt>(Val: C));
582	if (CVal && (CVal->getValue().isPowerOf2() \|\|
583	CVal->getValue().isNegatedPowerOf2()))
584	DivRemConstPow2 = true;
585	else
586	DivRemConst = true;
587	}
588	}
589
590	if (!Ty->isVectorTy()) {
591	// These FP operations are supported with a dedicated instruction for
592	// float, double and fp128 (base implementation assumes float generally
593	// costs 2).
594	if (Opcode == Instruction::FAdd \|\| Opcode == Instruction::FSub \|\|
595	Opcode == Instruction::FMul \|\| Opcode == Instruction::FDiv)
596	return `1`;
597
598	// There is no native support for FRem.
599	if (Opcode == Instruction::FRem)
600	return LIBCALL_COST;
601
602	// Give discount for some combined logical operations if supported.
603	if (Args.size() == `2`) {
604	if (Opcode == Instruction::Xor) {
605	for (const Value *A : Args) {
606	if (const Instruction *I = dyn_cast<Instruction>(Val: A))
607	if (I->hasOneUse() &&
608	(I->getOpcode() == Instruction::Or \|\|
609	I->getOpcode() == Instruction::And \|\|
610	I->getOpcode() == Instruction::Xor))
611	if ((ScalarBits <= `64` && ST->hasMiscellaneousExtensions3()) \|\|
612	(isInt128InVR(Ty) &&
613	(I->getOpcode() == Instruction::Or \|\| ST->hasVectorEnhancements1())))
614	return `0`;
615	}
616	}
617	else if (Opcode == Instruction::And \|\| Opcode == Instruction::Or) {
618	for (const Value *A : Args) {
619	if (const Instruction *I = dyn_cast<Instruction>(Val: A))
620	if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) &&
621	((ScalarBits <= `64` && ST->hasMiscellaneousExtensions3()) \|\|
622	(isInt128InVR(Ty) &&
623	(Opcode == Instruction::And \|\| ST->hasVectorEnhancements1()))))
624	return `0`;
625	}
626	}
627	}
628
629	// Or requires one instruction, although it has custom handling for i64.
630	if (Opcode == Instruction::Or)
631	return `1`;
632
633	if (Opcode == Instruction::Xor && ScalarBits == `1`) {
634	if (ST->hasLoadStoreOnCond2())
635	return `5`; // 2 (li 0; loc 1); xor*
636	return `7`; // 2 ipm sequences ; xor ; shift ; compare*
637	}
638
639	if (DivRemConstPow2)
640	return (SignedDivRem ? SDivPow2Cost : `1`);
641	if (DivRemConst)
642	return DivMulSeqCost;
643	if (SignedDivRem \|\| UnsignedDivRem)
644	return DivInstrCost;
645	}
646	else if (ST->hasVector()) {
647	auto *VTy = cast<FixedVectorType>(Val: Ty);
648	unsigned VF = VTy->getNumElements();
649	unsigned NumVectors = getNumVectorRegs(Ty);
650
651	// These vector operations are custom handled, but are still supported
652	// with one instruction per vector, regardless of element size.
653	if (Opcode == Instruction::Shl \|\| Opcode == Instruction::LShr \|\|
654	Opcode == Instruction::AShr) {
655	return NumVectors;
656	}
657
658	if (DivRemConstPow2)
659	return (NumVectors * (SignedDivRem ? SDivPow2Cost : `1`));
660	if (DivRemConst) {
661	SmallVector<Type *> Tys(Args.size(), Ty);
662	return VF * DivMulSeqCost +
663	BaseT::getScalarizationOverhead(RetTy: VTy, Args, Tys, CostKind);
664	}
665	if (SignedDivRem \|\| UnsignedDivRem) {
666	if (ST->hasVectorEnhancements3() && ScalarBits >= `32`)
667	return NumVectors * DivInstrCost;
668	else if (VF > `4`)
669	// Temporary hack: disable high vectorization factors with integer
670	// division/remainder, which will get scalarized and handled with
671	// GR128 registers. The mischeduler is not clever enough to avoid
672	// spilling yet.
673	return `1000`;
674	}
675
676	// These FP operations are supported with a single vector instruction for
677	// double (base implementation assumes float generally costs 2). For
678	// FP128, the scalar cost is 1, and there is no overhead since the values
679	// are already in scalar registers.
680	if (Opcode == Instruction::FAdd \|\| Opcode == Instruction::FSub \|\|
681	Opcode == Instruction::FMul \|\| Opcode == Instruction::FDiv) {
682	switch (ScalarBits) {
683	case `32`: {
684	// The vector enhancements facility 1 provides v4f32 instructions.
685	if (ST->hasVectorEnhancements1())
686	return NumVectors;
687	// Return the cost of multiple scalar invocation plus the cost of
688	// inserting and extracting the values.
689	InstructionCost ScalarCost =
690	getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind);
691	SmallVector<Type *> Tys(Args.size(), Ty);
692	InstructionCost Cost =
693	(VF * ScalarCost) +
694	BaseT::getScalarizationOverhead(RetTy: VTy, Args, Tys, CostKind);
695	// FIXME: VF 2 for these FP operations are currently just as
696	// expensive as for VF 4.
697	if (VF == `2`)
698	Cost *= `2`;
699	return Cost;
700	}
701	case `64`:
702	case `128`:
703	return NumVectors;
704	default:
705	break;
706	}
707	}
708
709	// There is no native support for FRem.
710	if (Opcode == Instruction::FRem) {
711	SmallVector<Type *> Tys(Args.size(), Ty);
712	InstructionCost Cost =
713	(VF * LIBCALL_COST) +
714	BaseT::getScalarizationOverhead(RetTy: VTy, Args, Tys, CostKind);
715	// FIXME: VF 2 for float is currently just as expensive as for VF 4.
716	if (VF == `2` && ScalarBits == `32`)
717	Cost *= `2`;
718	return Cost;
719	}
720	}
721
722	// Fallback to the default implementation.
723	return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info,
724	Args, CxtI);
725	}
726
727	InstructionCost
728	SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
729	VectorType SrcTy, ArrayRef<int*> Mask,
730	TTI::TargetCostKind CostKind, int Index,
731	VectorType SubTp, ArrayRef<const* Value *> Args,
732	const Instruction CxtI) const* {
733	Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp);
734	if (ST->hasVector()) {
735	unsigned NumVectors = getNumVectorRegs(Ty: SrcTy);
736
737	// TODO: Since fp32 is expanded, the shuffle cost should always be 0.
738
739	// FP128 values are always in scalar registers, so there is no work
740	// involved with a shuffle, except for broadcast. In that case register
741	// moves are done with a single instruction per element.
742	if (SrcTy->getScalarType()->isFP128Ty())
743	return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - `1` : `0`);
744
745	switch (Kind) {
746	case TargetTransformInfo::SK_ExtractSubvector:
747	// ExtractSubvector Index indicates start offset.
748
749	// Extracting a subvector from first index is a noop.
750	return (Index == `0` ? `0` : NumVectors);
751
752	case TargetTransformInfo::SK_Broadcast:
753	// Loop vectorizer calls here to figure out the extra cost of
754	// broadcasting a loaded value to all elements of a vector. Since vlrep
755	// loads and replicates with a single instruction, adjust the returned
756	// value.
757	return NumVectors - `1`;
758
759	default:
760
761	// SystemZ supports single instruction permutation / replication.
762	return NumVectors;
763	}
764	}
765
766	return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
767	SubTp);
768	}
769
770	// Return the log2 difference of the element sizes of the two vector types.
771	static unsigned getElSizeLog2Diff(Type Ty0, Type Ty1) {
772	unsigned Bits0 = Ty0->getScalarSizeInBits();
773	unsigned Bits1 = Ty1->getScalarSizeInBits();
774
775	if (Bits1 > Bits0)
776	return (Log2_32(Value: Bits1) - Log2_32(Value: Bits0));
777
778	return (Log2_32(Value: Bits0) - Log2_32(Value: Bits1));
779	}
780
781	// Return the number of instructions needed to truncate SrcTy to DstTy.
782	unsigned SystemZTTIImpl::getVectorTruncCost(Type SrcTy, Type DstTy) const {
783	assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
784	assert(SrcTy->getPrimitiveSizeInBits().getFixedValue() >
785	DstTy->getPrimitiveSizeInBits().getFixedValue() &&
786	"Packing must reduce size of vector type.");
787	assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
788	cast<FixedVectorType>(DstTy)->getNumElements() &&
789	"Packing should not change number of elements.");
790
791	// TODO: Since fp32 is expanded, the extract cost should always be 0.
792
793	unsigned NumParts = getNumVectorRegs(Ty: SrcTy);
794	if (NumParts <= `2`)
795	// Up to 2 vector registers can be truncated efficiently with pack or
796	// permute. The latter requires an immediate mask to be loaded, which
797	// typically gets hoisted out of a loop. TODO: return a good value for
798	// BB-VECTORIZER that includes the immediate loads, which we do not want
799	// to count for the loop vectorizer.
800	return `1`;
801
802	unsigned Cost = `0`;
803	unsigned Log2Diff = getElSizeLog2Diff(Ty0: SrcTy, Ty1: DstTy);
804	unsigned VF = cast<FixedVectorType>(Val: SrcTy)->getNumElements();
805	for (unsigned P = `0`; P < Log2Diff; ++P) {
806	if (NumParts > `1`)
807	NumParts /= `2`;
808	Cost += NumParts;
809	}
810
811	// Currently, a general mix of permutes and pack instructions is output by
812	// isel, which follow the cost computation above except for this case which
813	// is one instruction less:
814	if (VF == `8` && SrcTy->getScalarSizeInBits() == `64` &&
815	DstTy->getScalarSizeInBits() == `8`)
816	Cost--;
817
818	return Cost;
819	}
820
821	// Return the cost of converting a vector bitmask produced by a compare
822	// (SrcTy), to the type of the select or extend instruction (DstTy).
823	unsigned SystemZTTIImpl::getVectorBitmaskConversionCost(Type *SrcTy,
824	Type DstTy) const* {
825	assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
826	"Should only be called with vector types.");
827
828	unsigned PackCost = `0`;
829	unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
830	unsigned DstScalarBits = DstTy->getScalarSizeInBits();
831	unsigned Log2Diff = getElSizeLog2Diff(Ty0: SrcTy, Ty1: DstTy);
832	if (SrcScalarBits > DstScalarBits)
833	// The bitmask will be truncated.
834	PackCost = getVectorTruncCost(SrcTy, DstTy);
835	else if (SrcScalarBits < DstScalarBits) {
836	unsigned DstNumParts = getNumVectorRegs(Ty: DstTy);
837	// Each vector select needs its part of the bitmask unpacked.
838	PackCost = Log2Diff * DstNumParts;
839	// Extra cost for moving part of mask before unpacking.
840	PackCost += DstNumParts - `1`;
841	}
842
843	return PackCost;
844	}
845
846	// Return the type of the compared operands. This is needed to compute the
847	// cost for a Select / ZExt or SExt instruction.
848	static Type getCmpOpsType(const* Instruction I, unsigned* VF = `1`) {
849	Type OpTy = nullptr*;
850	if (CmpInst *CI = dyn_cast<CmpInst>(Val: I->getOperand(i: `0`)))
851	OpTy = CI->getOperand(i_nocapture: `0`)->getType();
852	else if (Instruction *LogicI = dyn_cast<Instruction>(Val: I->getOperand(i: `0`)))
853	if (LogicI->getNumOperands() == `2`)
854	if (CmpInst *CI0 = dyn_cast<CmpInst>(Val: LogicI->getOperand(i: `0`)))
855	if (isa<CmpInst>(Val: LogicI->getOperand(i: `1`)))
856	OpTy = CI0->getOperand(i_nocapture: `0`)->getType();
857
858	if (OpTy != nullptr) {
859	if (VF == `1`) {
860	assert (!OpTy->isVectorTy() && "Expected scalar type");
861	return OpTy;
862	}
863	// Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
864	// be either scalar or already vectorized with a same or lesser VF.
865	Type *ElTy = OpTy->getScalarType();
866	return FixedVectorType::get(ElementType: ElTy, NumElts: VF);
867	}
868
869	return nullptr;
870	}
871
872	// Get the cost of converting a boolean vector to a vector with same width
873	// and element size as Dst, plus the cost of zero extending if needed.
874	unsigned
875	SystemZTTIImpl::getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
876	const Instruction I) const* {
877	auto *DstVTy = cast<FixedVectorType>(Val: Dst);
878	unsigned VF = DstVTy->getNumElements();
879	unsigned Cost = `0`;
880	// If we know what the widths of the compared operands, get any cost of
881	// converting it to match Dst. Otherwise assume same widths.
882	Type CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr*);
883	if (CmpOpTy != nullptr)
884	Cost = getVectorBitmaskConversionCost(SrcTy: CmpOpTy, DstTy: Dst);
885	if (Opcode == Instruction::ZExt \|\| Opcode == Instruction::UIToFP)
886	// One 'vn' per dst vector with an immediate mask.
887	Cost += getNumVectorRegs(Ty: Dst);
888	return Cost;
889	}
890
891	InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
892	Type *Src,
893	TTI::CastContextHint CCH,
894	TTI::TargetCostKind CostKind,
895	const Instruction I) const* {
896	// FIXME: Can the logic below also be used for these cost kinds?
897	if (CostKind == TTI::TCK_CodeSize \|\| CostKind == TTI::TCK_SizeAndLatency) {
898	auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
899	return BaseCost == `0` ? BaseCost : `1`;
900	}
901
902	unsigned DstScalarBits = Dst->getScalarSizeInBits();
903	unsigned SrcScalarBits = Src->getScalarSizeInBits();
904
905	if (!Src->isVectorTy()) {
906	if (Dst->isVectorTy())
907	return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
908
909	if (Opcode == Instruction::SIToFP \|\| Opcode == Instruction::UIToFP) {
910	if (Src->isIntegerTy(Bitwidth: `128`))
911	return LIBCALL_COST;
912	if (SrcScalarBits >= `32` \|\|
913	(I != nullptr && isa<LoadInst>(Val: I->getOperand(i: `0`))))
914	return `1`;
915	return SrcScalarBits > `1` ? `2` /i8/i16 extend/ : `5` /branch seq./;
916	}
917
918	if ((Opcode == Instruction::FPToSI \|\| Opcode == Instruction::FPToUI) &&
919	Dst->isIntegerTy(Bitwidth: `128`))
920	return LIBCALL_COST;
921
922	if ((Opcode == Instruction::ZExt \|\| Opcode == Instruction::SExt)) {
923	if (Src->isIntegerTy(Bitwidth: `1`)) {
924	if (DstScalarBits == `128`) {
925	if (Opcode == Instruction::SExt && ST->hasVectorEnhancements3())
926	return `0`;/VCEQQ/
927	return `5` /branch seq./;
928	}
929
930	if (ST->hasLoadStoreOnCond2())
931	return `2`; // li 0; loc 1
932
933	// This should be extension of a compare i1 result, which is done with
934	// ipm and a varying sequence of instructions.
935	unsigned Cost = `0`;
936	if (Opcode == Instruction::SExt)
937	Cost = (DstScalarBits < `64` ? `3` : `4`);
938	if (Opcode == Instruction::ZExt)
939	Cost = `3`;
940	Type CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr*);
941	if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
942	// If operands of an fp-type was compared, this costs +1.
943	Cost++;
944	return Cost;
945	}
946	else if (isInt128InVR(Ty: Dst)) {
947	// Extensions from GPR to i128 (in VR) typically costs two instructions,
948	// but a zero-extending load would be just one extra instruction.
949	if (Opcode == Instruction::ZExt && I != nullptr)
950	if (LoadInst *Ld = dyn_cast<LoadInst>(Val: I->getOperand(i: `0`)))
951	if (Ld->hasOneUse())
952	return `1`;
953	return `2`;
954	}
955	}
956
957	if (Opcode == Instruction::Trunc && isInt128InVR(Ty: Src) && I != nullptr) {
958	if (LoadInst *Ld = dyn_cast<LoadInst>(Val: I->getOperand(i: `0`)))
959	if (Ld->hasOneUse())
960	return `0`; // Will be converted to GPR load.
961	bool OnlyTruncatingStores = true;
962	for (const User *U : I->users())
963	if (!isa<StoreInst>(Val: U)) {
964	OnlyTruncatingStores = false;
965	break;
966	}
967	if (OnlyTruncatingStores)
968	return `0`;
969	return `2`; // Vector element extraction.
970	}
971	}
972	else if (ST->hasVector()) {
973	// Vector to scalar cast.
974	auto *SrcVecTy = cast<FixedVectorType>(Val: Src);
975	auto *DstVecTy = dyn_cast<FixedVectorType>(Val: Dst);
976	if (!DstVecTy) {
977	// TODO: tune vector-to-scalar cast.
978	return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
979	}
980	unsigned VF = SrcVecTy->getNumElements();
981	unsigned NumDstVectors = getNumVectorRegs(Ty: Dst);
982	unsigned NumSrcVectors = getNumVectorRegs(Ty: Src);
983
984	if (Opcode == Instruction::Trunc) {
985	if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
986	return `0`; // Check for NOOP conversions.
987	return getVectorTruncCost(SrcTy: Src, DstTy: Dst);
988	}
989
990	if (Opcode == Instruction::ZExt \|\| Opcode == Instruction::SExt) {
991	if (SrcScalarBits >= `8`) {
992	// ZExt will use either a single unpack or a vector permute.
993	if (Opcode == Instruction::ZExt)
994	return NumDstVectors;
995
996	// SExt will be handled with one unpack per doubling of width.
997	unsigned NumUnpacks = getElSizeLog2Diff(Ty0: Src, Ty1: Dst);
998
999	// For types that spans multiple vector registers, some additional
1000	// instructions are used to setup the unpacking.
1001	unsigned NumSrcVectorOps =
1002	(NumUnpacks > `1` ? (NumDstVectors - NumSrcVectors)
1003	: (NumDstVectors / `2`));
1004
1005	return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
1006	}
1007	else if (SrcScalarBits == `1`)
1008	return getBoolVecToIntConversionCost(Opcode, Dst, I);
1009	}
1010
1011	if (Opcode == Instruction::SIToFP \|\| Opcode == Instruction::UIToFP \|\|
1012	Opcode == Instruction::FPToSI \|\| Opcode == Instruction::FPToUI) {
1013	// TODO: Fix base implementation which could simplify things a bit here
1014	// (seems to miss on differentiating on scalar/vector types).
1015
1016	// Only 64 bit vector conversions are natively supported before z15.
1017	if (DstScalarBits == `64` \|\| ST->hasVectorEnhancements2()) {
1018	if (SrcScalarBits == DstScalarBits)
1019	return NumDstVectors;
1020
1021	if (SrcScalarBits == `1`)
1022	return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
1023	}
1024
1025	// Return the cost of multiple scalar invocation plus the cost of
1026	// inserting and extracting the values. Base implementation does not
1027	// realize float->int gets scalarized.
1028	InstructionCost ScalarCost = getCastInstrCost(
1029	Opcode, Dst: Dst->getScalarType(), Src: Src->getScalarType(), CCH, CostKind);
1030	InstructionCost TotCost = VF * ScalarCost;
1031	bool NeedsInserts = true, NeedsExtracts = true;
1032	// FP128 registers do not get inserted or extracted.
1033	if (DstScalarBits == `128` &&
1034	(Opcode == Instruction::SIToFP \|\| Opcode == Instruction::UIToFP))
1035	NeedsInserts = false;
1036	if (SrcScalarBits == `128` &&
1037	(Opcode == Instruction::FPToSI \|\| Opcode == Instruction::FPToUI))
1038	NeedsExtracts = false;
1039
1040	TotCost += BaseT::getScalarizationOverhead(InTy: SrcVecTy, /Insert/ false,
1041	Extract: NeedsExtracts, CostKind);
1042	TotCost += BaseT::getScalarizationOverhead(InTy: DstVecTy, Insert: NeedsInserts,
1043	/Extract/ false, CostKind);
1044
1045	// FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
1046	if (VF == `2` && SrcScalarBits == `32` && DstScalarBits == `32`)
1047	TotCost *= `2`;
1048
1049	return TotCost;
1050	}
1051
1052	if (Opcode == Instruction::FPTrunc) {
1053	if (SrcScalarBits == `128`) // fp128 -> double/float + inserts of elements.
1054	return VF /ldxbr/lexbr/ +
1055	BaseT::getScalarizationOverhead(InTy: DstVecTy, /Insert/ true,
1056	/Extract/ false, CostKind);
1057	else // double -> float
1058	return VF / `2` /vledb/ + std::max(a: `1U`, b: VF / `4` /vperm/);
1059	}
1060
1061	if (Opcode == Instruction::FPExt) {
1062	if (SrcScalarBits == `32` && DstScalarBits == `64`) {
1063	// float -> double is very rare and currently unoptimized. Instead of
1064	// using vldeb, which can do two at a time, all conversions are
1065	// scalarized.
1066	return VF * `2`;
1067	}
1068	// -> fp128. VF lxdb/lxeb + extraction of elements.*
1069	return VF + BaseT::getScalarizationOverhead(InTy: SrcVecTy, /Insert/ false,
1070	/Extract/ true, CostKind);
1071	}
1072	}
1073
1074	return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1075	}
1076
1077	// Scalar i8 / i16 operations will typically be made after first extending
1078	// the operands to i32.
1079	static unsigned getOperandsExtensionCost(const Instruction *I) {
1080	unsigned ExtCost = `0`;
1081	for (Value *Op : I->operands())
1082	// A load of i8 or i16 sign/zero extends to i32.
1083	if (!isa<LoadInst>(Val: Op) && !isa<ConstantInt>(Val: Op))
1084	ExtCost++;
1085
1086	return ExtCost;
1087	}
1088
1089	InstructionCost SystemZTTIImpl::getCmpSelInstrCost(
1090	unsigned Opcode, Type ValTy, Type CondTy, CmpInst::Predicate VecPred,
1091	TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
1092	TTI::OperandValueInfo Op2Info, const Instruction I) const* {
1093	if (CostKind != TTI::TCK_RecipThroughput)
1094	return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1095	Op1Info, Op2Info);
1096
1097	if (!ValTy->isVectorTy()) {
1098	switch (Opcode) {
1099	case Instruction::ICmp: {
1100	// A loaded value compared with 0 with multiple users becomes Load and
1101	// Test. The load is then not foldable, so return 0 cost for the ICmp.
1102	unsigned ScalarBits = ValTy->getScalarSizeInBits();
1103	if (I != nullptr && (ScalarBits == `32` \|\| ScalarBits == `64`))
1104	if (LoadInst *Ld = dyn_cast<LoadInst>(Val: I->getOperand(i: `0`)))
1105	if (const ConstantInt *C = dyn_cast<ConstantInt>(Val: I->getOperand(i: `1`)))
1106	if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
1107	C->isZero())
1108	return `0`;
1109
1110	unsigned Cost = `1`;
1111	if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= `16`)
1112	Cost += (I != nullptr ? getOperandsExtensionCost(I) : `2`);
1113	return Cost;
1114	}
1115	case Instruction::Select:
1116	if (ValTy->isFloatingPointTy())
1117	return `4`; // No LOC for FP - costs a conditional jump.
1118
1119	// When selecting based on an i128 comparison, LOC / VSEL is possible
1120	// if i128 comparisons are directly supported.
1121	if (I != nullptr)
1122	if (ICmpInst *CI = dyn_cast<ICmpInst>(Val: I->getOperand(i: `0`)))
1123	if (CI->getOperand(i_nocapture: `0`)->getType()->isIntegerTy(Bitwidth: `128`))
1124	return ST->hasVectorEnhancements3() ? `1` : `4`;
1125
1126	// Load On Condition / Select Register available, except for i128.
1127	return !isInt128InVR(Ty: ValTy) ? `1` : `4`;
1128	}
1129	}
1130	else if (ST->hasVector()) {
1131	unsigned VF = cast<FixedVectorType>(Val: ValTy)->getNumElements();
1132
1133	// Called with a compare instruction.
1134	if (Opcode == Instruction::ICmp \|\| Opcode == Instruction::FCmp) {
1135	unsigned PredicateExtraCost = `0`;
1136	if (I != nullptr) {
1137	// Some predicates cost one or two extra instructions.
1138	switch (cast<CmpInst>(Val: I)->getPredicate()) {
1139	case CmpInst::Predicate::ICMP_NE:
1140	case CmpInst::Predicate::ICMP_UGE:
1141	case CmpInst::Predicate::ICMP_ULE:
1142	case CmpInst::Predicate::ICMP_SGE:
1143	case CmpInst::Predicate::ICMP_SLE:
1144	PredicateExtraCost = `1`;
1145	break;
1146	case CmpInst::Predicate::FCMP_ONE:
1147	case CmpInst::Predicate::FCMP_ORD:
1148	case CmpInst::Predicate::FCMP_UEQ:
1149	case CmpInst::Predicate::FCMP_UNO:
1150	PredicateExtraCost = `2`;
1151	break;
1152	default:
1153	break;
1154	}
1155	}
1156
1157	// Float is handled with 2vmr[lh]f + 2vldeb + vfchdb for each pair of
1158	// floats. FIXME: <2 x float> generates same code as <4 x float>.
1159	unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? `10` : `1`);
1160	unsigned NumVecs_cmp = getNumVectorRegs(Ty: ValTy);
1161
1162	unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
1163	return Cost;
1164	}
1165	else { // Called with a select instruction.
1166	assert (Opcode == Instruction::Select);
1167
1168	// We can figure out the extra cost of packing / unpacking if the
1169	// instruction was passed and the compare instruction is found.
1170	unsigned PackCost = `0`;
1171	Type CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr*);
1172	if (CmpOpTy != nullptr)
1173	PackCost =
1174	getVectorBitmaskConversionCost(SrcTy: CmpOpTy, DstTy: ValTy);
1175
1176	return getNumVectorRegs(Ty: ValTy) /vsel/ + PackCost;
1177	}
1178	}
1179
1180	return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1181	Op1Info, Op2Info);
1182	}
1183
1184	InstructionCost SystemZTTIImpl::getVectorInstrCost(
1185	unsigned Opcode, Type Val, TTI::TargetCostKind CostKind, unsigned* Index,
1186	const Value Op0, const* Value Op1, TTI::VectorInstrContext VIC) const* {
1187	if (Opcode == Instruction::InsertElement) {
1188	// Vector Element Load.
1189	if (Op1 != nullptr && isFreeEltLoad(Op: Op1))
1190	return `0`;
1191
1192	// vlvgp will insert two grs into a vector register, so count half the
1193	// number of instructions as an estimate when we don't have the full
1194	// picture (as in getScalarizationOverhead()).
1195	if (Val->isIntOrIntVectorTy(BitWidth: `64`))
1196	return ((Index % `2` == `0`) ? `1` : `0`);
1197	}
1198
1199	if (Opcode == Instruction::ExtractElement) {
1200	int Cost = ((getScalarSizeInBits(Ty: Val) == `1`) ? `2` /+test-under-mask/ : `1`);
1201
1202	// Give a slight penalty for moving out of vector pipeline to FXU unit.
1203	if (Index == `0` && Val->isIntOrIntVectorTy())
1204	Cost += `1`;
1205
1206	return Cost;
1207	}
1208
1209	return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1, VIC);
1210	}
1211
1212	// Check if a load may be folded as a memory operand in its user.
1213	bool SystemZTTIImpl::isFoldableLoad(const LoadInst *Ld,
1214	const Instruction &FoldedValue) const* {
1215	if (!Ld->hasOneUse())
1216	return false;
1217	FoldedValue = Ld;
1218	const Instruction UserI = cast<Instruction>(Val: Ld->user_begin());
1219	unsigned LoadedBits = getScalarSizeInBits(Ty: Ld->getType());
1220	unsigned TruncBits = `0`;
1221	unsigned SExtBits = `0`;
1222	unsigned ZExtBits = `0`;
1223	if (UserI->hasOneUse()) {
1224	unsigned UserBits = UserI->getType()->getScalarSizeInBits();
1225	if (isa<TruncInst>(Val: UserI))
1226	TruncBits = UserBits;
1227	else if (isa<SExtInst>(Val: UserI))
1228	SExtBits = UserBits;
1229	else if (isa<ZExtInst>(Val: UserI))
1230	ZExtBits = UserBits;
1231	}
1232	if (TruncBits \|\| SExtBits \|\| ZExtBits) {
1233	FoldedValue = UserI;
1234	UserI = cast<Instruction>(Val: *UserI->user_begin());
1235	// Load (single use) -> trunc/extend (single use) -> UserI
1236	}
1237	if ((UserI->getOpcode() == Instruction::Sub \|\|
1238	UserI->getOpcode() == Instruction::SDiv \|\|
1239	UserI->getOpcode() == Instruction::UDiv) &&
1240	UserI->getOperand(i: `1`) != FoldedValue)
1241	return false; // Not commutative, only RHS foldable.
1242	// LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
1243	// extension was made of the load.
1244	unsigned LoadOrTruncBits =
1245	((SExtBits \|\| ZExtBits) ? `0` : (TruncBits ? TruncBits : LoadedBits));
1246	switch (UserI->getOpcode()) {
1247	case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
1248	case Instruction::Sub:
1249	case Instruction::ICmp:
1250	if (LoadedBits == `32` && ZExtBits == `64`)
1251	return true;
1252	[[fallthrough]];
1253	case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
1254	if (UserI->getOpcode() != Instruction::ICmp) {
1255	if (LoadedBits == `16` &&
1256	(SExtBits == `32` \|\|
1257	(SExtBits == `64` && ST->hasMiscellaneousExtensions2())))
1258	return true;
1259	if (LoadOrTruncBits == `16`)
1260	return true;
1261	}
1262	[[fallthrough]];
1263	case Instruction::SDiv:// SE: 32->64
1264	if (LoadedBits == `32` && SExtBits == `64`)
1265	return true;
1266	[[fallthrough]];
1267	case Instruction::UDiv:
1268	case Instruction::And:
1269	case Instruction::Or:
1270	case Instruction::Xor:
1271	// This also makes sense for float operations, but disabled for now due
1272	// to regressions.
1273	// case Instruction::FCmp:
1274	// case Instruction::FAdd:
1275	// case Instruction::FSub:
1276	// case Instruction::FMul:
1277	// case Instruction::FDiv:
1278
1279	// All possible extensions of memory checked above.
1280
1281	// Comparison between memory and immediate.
1282	if (UserI->getOpcode() == Instruction::ICmp)
1283	if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: UserI->getOperand(i: `1`)))
1284	if (CI->getValue().isIntN(N: `16`))
1285	return true;
1286	return (LoadOrTruncBits == `32` \|\| LoadOrTruncBits == `64`);
1287	break;
1288	}
1289	return false;
1290	}
1291
1292	static bool isBswapIntrinsicCall(const Value *V) {
1293	if (const Instruction *I = dyn_cast<Instruction>(Val: V))
1294	if (auto *CI = dyn_cast<CallInst>(Val: I))
1295	if (auto *F = CI->getCalledFunction())
1296	if (F->getIntrinsicID() == Intrinsic::bswap)
1297	return true;
1298	return false;
1299	}
1300
1301	InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1302	Align Alignment,
1303	unsigned AddressSpace,
1304	TTI::TargetCostKind CostKind,
1305	TTI::OperandValueInfo OpInfo,
1306	const Instruction I) const* {
1307	assert(!Src->isVoidTy() && "Invalid type");
1308
1309	// TODO: Handle other cost kinds.
1310	if (CostKind != TTI::TCK_RecipThroughput)
1311	return `1`;
1312
1313	if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
1314	// Store the load or its truncated or extended value in FoldedValue.
1315	const Instruction FoldedValue = nullptr*;
1316	if (isFoldableLoad(Ld: cast<LoadInst>(Val: I), FoldedValue)) {
1317	const Instruction UserI = cast<Instruction>(Val: FoldedValue->user_begin());
1318	assert (UserI->getNumOperands() == `2` && "Expected a binop.");
1319
1320	// UserI can't fold two loads, so in that case return 0 cost only
1321	// half of the time.
1322	for (unsigned i = `0`; i < `2`; ++i) {
1323	if (UserI->getOperand(i) == FoldedValue)
1324	continue;
1325
1326	if (Instruction *OtherOp = dyn_cast<Instruction>(Val: UserI->getOperand(i))){
1327	LoadInst *OtherLoad = dyn_cast<LoadInst>(Val: OtherOp);
1328	if (!OtherLoad &&
1329	(isa<TruncInst>(Val: OtherOp) \|\| isa<SExtInst>(Val: OtherOp) \|\|
1330	isa<ZExtInst>(Val: OtherOp)))
1331	OtherLoad = dyn_cast<LoadInst>(Val: OtherOp->getOperand(i: `0`));
1332	if (OtherLoad && isFoldableLoad(Ld: OtherLoad, FoldedValue/dummy/))
1333	return i == `0`; // Both operands foldable.
1334	}
1335	}
1336
1337	return `0`; // Only I is foldable in user.
1338	}
1339	}
1340
1341	// Type legalization (via getNumberOfParts) can't handle structs
1342	if (TLI->getValueType(DL, Ty: Src, AllowUnknown: true) == MVT::Other)
1343	return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1344	CostKind);
1345
1346	// FP128 is a legal type but kept in a register pair on older CPUs.
1347	if (Src->isFP128Ty() && !ST->hasVectorEnhancements1())
1348	return `2`;
1349
1350	unsigned NumOps =
1351	(Src->isVectorTy() ? getNumVectorRegs(Ty: Src) : getNumberOfParts(Tp: Src));
1352
1353	// Store/Load reversed saves one instruction.
1354	if (((!Src->isVectorTy() && NumOps == `1`) \|\| ST->hasVectorEnhancements2()) &&
1355	I != nullptr) {
1356	if (Opcode == Instruction::Load && I->hasOneUse()) {
1357	const Instruction LdUser = cast<Instruction>(Val: I->user_begin());
1358	// In case of load -> bswap -> store, return normal cost for the load.
1359	if (isBswapIntrinsicCall(V: LdUser) &&
1360	(!LdUser->hasOneUse() \|\| !isa<StoreInst>(Val: *LdUser->user_begin())))
1361	return `0`;
1362	}
1363	else if (const StoreInst *SI = dyn_cast<StoreInst>(Val: I)) {
1364	const Value *StoredVal = SI->getValueOperand();
1365	if (StoredVal->hasOneUse() && isBswapIntrinsicCall(V: StoredVal))
1366	return `0`;
1367	}
1368	}
1369
1370	return NumOps;
1371	}
1372
1373	// The generic implementation of getInterleavedMemoryOpCost() is based on
1374	// adding costs of the memory operations plus all the extracts and inserts
1375	// needed for using / defining the vector operands. The SystemZ version does
1376	// roughly the same but bases the computations on vector permutations
1377	// instead.
1378	InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
1379	unsigned Opcode, Type VecTy, unsigned* Factor, ArrayRef<unsigned> Indices,
1380	Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1381	bool UseMaskForCond, bool UseMaskForGaps) const {
1382	if (UseMaskForCond \|\| UseMaskForGaps)
1383	return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1384	Alignment, AddressSpace, CostKind,
1385	UseMaskForCond, UseMaskForGaps);
1386	assert(isa<VectorType>(VecTy) &&
1387	"Expect a vector type for interleaved memory op");
1388
1389	unsigned NumElts = cast<FixedVectorType>(Val: VecTy)->getNumElements();
1390	assert(Factor > `1` && NumElts % Factor == `0` && "Invalid interleave factor");
1391	unsigned VF = NumElts / Factor;
1392	unsigned NumEltsPerVecReg = (`128U` / getScalarSizeInBits(Ty: VecTy));
1393	unsigned NumVectorMemOps = getNumVectorRegs(Ty: VecTy);
1394	unsigned NumPermutes = `0`;
1395
1396	if (Opcode == Instruction::Load) {
1397	// Loading interleave groups may have gaps, which may mean fewer
1398	// loads. Find out how many vectors will be loaded in total, and in how
1399	// many of them each value will be in.
1400	BitVector UsedInsts(NumVectorMemOps, false);
1401	std::vector<BitVector> ValueVecs(Factor, BitVector (NumVectorMemOps, false));
1402	for (unsigned Index : Indices)
1403	for (unsigned Elt = `0`; Elt < VF; ++Elt) {
1404	unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
1405	UsedInsts.set(Vec);
1406	ValueVecs [Index].set(Vec);
1407	}
1408	NumVectorMemOps = UsedInsts.count();
1409
1410	for (unsigned Index : Indices) {
1411	// Estimate that each loaded source vector containing this Index
1412	// requires one operation, except that vperm can handle two input
1413	// registers first time for each dst vector.
1414	unsigned NumSrcVecs = ValueVecs [Index].count();
1415	unsigned NumDstVecs = divideCeil(Numerator: VF * getScalarSizeInBits(Ty: VecTy), Denominator: `128U`);
1416	assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
1417	NumPermutes += std::max(a: `1U`, b: NumSrcVecs - NumDstVecs);
1418	}
1419	} else {
1420	// Estimate the permutes for each stored vector as the smaller of the
1421	// number of elements and the number of source vectors. Subtract one per
1422	// dst vector for vperm (S.A.).
1423	unsigned NumSrcVecs = std::min(a: NumEltsPerVecReg, b: Factor);
1424	unsigned NumDstVecs = NumVectorMemOps;
1425	NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
1426	}
1427
1428	// Cost of load/store operations and the permutations needed.
1429	return NumVectorMemOps + NumPermutes;
1430	}
1431
1432	InstructionCost getIntAddReductionCost(unsigned NumVec, unsigned ScalarBits) {
1433	InstructionCost Cost = `0`;
1434	// Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
1435	Cost += NumVec - `1`;
1436	// For integer adds, VSUM creates shorter reductions on the final vector.
1437	Cost += (ScalarBits < `32`) ? `3` : `2`;
1438	return Cost;
1439	}
1440
1441	InstructionCost getFastReductionCost(unsigned NumVec, unsigned NumElems,
1442	unsigned ScalarBits) {
1443	unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits);
1444	InstructionCost Cost = `0`;
1445	// Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
1446	Cost += NumVec - `1`;
1447	// For each shuffle / arithmetic layer, we need 2 instructions, and we need
1448	// log2(Elements in Last Vector) layers.
1449	Cost += `2` * Log2_32_Ceil(Value: std::min(a: NumElems, b: NumEltsPerVecReg));
1450	return Cost;
1451	}
1452
1453	inline bool customCostReductions(unsigned Opcode) {
1454	return Opcode == Instruction::FAdd \|\| Opcode == Instruction::FMul \|\|
1455	Opcode == Instruction::Add \|\| Opcode == Instruction::Mul;
1456	}
1457
1458	InstructionCost
1459	SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1460	std::optional<FastMathFlags> FMF,
1461	TTI::TargetCostKind CostKind) const {
1462	unsigned ScalarBits = Ty->getScalarSizeInBits();
1463	// The following is only for subtargets with vector math, non-ordered
1464	// reductions, and reasonable scalar sizes for int and fp add/mul.
1465	if (customCostReductions(Opcode) && ST->hasVector() &&
1466	!TTI::requiresOrderedReduction(FMF) &&
1467	ScalarBits <= SystemZ::VectorBits) {
1468	unsigned NumVectors = getNumVectorRegs(Ty);
1469	unsigned NumElems = ((FixedVectorType *)Ty)->getNumElements();
1470	// Integer Add is using custom code gen, that needs to be accounted for.
1471	if (Opcode == Instruction::Add)
1472	return getIntAddReductionCost(NumVec: NumVectors, ScalarBits);
1473	// The base cost is the same across all other arithmetic instructions
1474	InstructionCost Cost =
1475	getFastReductionCost(NumVec: NumVectors, NumElems, ScalarBits);
1476	// But we need to account for the final op involving the scalar operand.
1477	if ((Opcode == Instruction::FAdd) \|\| (Opcode == Instruction::FMul))
1478	Cost += `1`;
1479	return Cost;
1480	}
1481	// otherwise, fall back to the standard implementation
1482	return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1483	}
1484
1485	InstructionCost
1486	SystemZTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1487	FastMathFlags FMF,
1488	TTI::TargetCostKind CostKind) const {
1489	// Return custom costs only on subtargets with vector enhancements.
1490	if (ST->hasVectorEnhancements1()) {
1491	unsigned NumVectors = getNumVectorRegs(Ty);
1492	unsigned NumElems = ((FixedVectorType *)Ty)->getNumElements();
1493	unsigned ScalarBits = Ty->getScalarSizeInBits();
1494	InstructionCost Cost = `0`;
1495	// Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
1496	Cost += NumVectors - `1`;
1497	// For the final vector, we need shuffle + min/max operations, and
1498	// we need #Elements - 1 of them.
1499	Cost += `2` * (std::min(a: NumElems, b: SystemZ::VectorBits / ScalarBits) - `1`);
1500	return Cost;
1501	}
1502	// For other targets, fall back to the standard implementation
1503	return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1504	}
1505
1506	static int
1507	getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
1508	const SmallVectorImpl<Type *> &ParamTys) {
1509	if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
1510	return getNumVectorRegs(Ty: RetTy); // VPERM
1511
1512	return -`1`;
1513	}
1514
1515	InstructionCost
1516	SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1517	TTI::TargetCostKind CostKind) const {
1518	InstructionCost Cost = getVectorIntrinsicInstrCost(
1519	ID: ICA.getID(), RetTy: ICA.getReturnType(), ParamTys: ICA.getArgTypes());
1520	if (Cost != -`1`)
1521	return Cost;
1522	return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1523	}
1524
1525	bool SystemZTTIImpl::shouldExpandReduction(const IntrinsicInst II) const* {
1526	// Always expand on Subtargets without vector instructions.
1527	if (!ST->hasVector())
1528	return true;
1529
1530	// Whether or not to expand is a per-intrinsic decision.
1531	switch (II->getIntrinsicID()) {
1532	default:
1533	return true;
1534	// Do not expand vector.reduce.add...
1535	case Intrinsic::vector_reduce_add:
1536	auto *VType = cast<FixedVectorType>(Val: II->getOperand(i_nocapture: `0`)->getType());
1537	// ...unless the scalar size is i64 or larger,
1538	// or the operand vector is not full, since the
1539	// performance benefit is dubious in those cases.
1540	return VType->getScalarSizeInBits() >= `64` \|\|
1541	VType->getPrimitiveSizeInBits() < SystemZ::VectorBits;
1542	}
1543	}
1544

Browse the source code of llvm_projects/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp