1 | //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // \file |
10 | // This file implements a TargetTransformInfo analysis pass specific to the |
11 | // AMDGPU target machine. It uses the target's detailed information to provide |
12 | // more precise answers to certain TTI queries, while letting the target |
13 | // independent and default TTI implementations handle the rest. |
14 | // |
15 | //===----------------------------------------------------------------------===// |
16 | |
17 | #include "AMDGPUTargetTransformInfo.h" |
18 | #include "AMDGPUTargetMachine.h" |
19 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
20 | #include "SIModeRegisterDefaults.h" |
21 | #include "llvm/Analysis/InlineCost.h" |
22 | #include "llvm/Analysis/LoopInfo.h" |
23 | #include "llvm/Analysis/ValueTracking.h" |
24 | #include "llvm/CodeGen/Analysis.h" |
25 | #include "llvm/IR/IRBuilder.h" |
26 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
27 | #include "llvm/IR/PatternMatch.h" |
28 | #include "llvm/Support/KnownBits.h" |
29 | #include <optional> |
30 | |
31 | using namespace llvm; |
32 | |
33 | #define DEBUG_TYPE "AMDGPUtti" |
34 | |
static cl::opt<unsigned> UnrollThresholdPrivate(
    "amdgpu-unroll-threshold-private",
    cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
    cl::init(2700), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
    "amdgpu-unroll-threshold-local",
    cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
    cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
    "amdgpu-unroll-threshold-if",
    cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
    cl::init(200), cl::Hidden);

static cl::opt<bool> UnrollRuntimeLocal(
    "amdgpu-unroll-runtime-local",
    cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
    cl::init(true), cl::Hidden);

static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
    "amdgpu-unroll-max-block-to-analyze",
    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
    cl::init(32), cl::Hidden);

static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
                                       cl::Hidden, cl::init(4000),
                                       cl::desc("Cost of alloca argument"));

// If the amount of scratch memory to eliminate exceeds our ability to allocate
// it into registers we gain nothing by aggressively inlining functions for that
// heuristic.
static cl::opt<unsigned>
    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
                    cl::init(256),
                    cl::desc("Maximum alloca size to use for inline cost"));

// Inliner constraint to achieve reasonable compilation time.
static cl::opt<size_t> InlineMaxBB(
    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
    cl::desc("Maximum number of BBs allowed in a function after inlining"
             " (compile time constraint)"));
77 | |
78 | static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, |
79 | unsigned Depth = 0) { |
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
             return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth + 1))
92 | return true; |
93 | } |
94 | return false; |
95 | } |
96 | |
97 | AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) |
98 | : BaseT(TM, F.getDataLayout()), |
99 | TargetTriple(TM->getTargetTriple()), |
100 | ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))), |
101 | TLI(ST->getTargetLowering()) {} |
102 | |
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP,
                                            OptimizationRemarkEmitter *ORE) {
106 | const Function &F = *L->getHeader()->getParent(); |
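  // The base threshold defaults to 300 and can be overridden per function via
  // the "amdgpu-unroll-threshold" string attribute, e.g.
  // "amdgpu-unroll-threshold"="500".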
  UP.Threshold =
      F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
109 | UP.MaxCount = std::numeric_limits<unsigned>::max(); |
110 | UP.Partial = true; |
111 | |
  // A conditional branch in a loop back edge needs 3 additional exec
  // manipulations on average.
114 | UP.BEInsns += 3; |
115 | |
116 | // We want to run unroll even for the loops which have been vectorized. |
117 | UP.UnrollVectorizedLoop = true; |
118 | |
119 | // TODO: Do we want runtime unrolling? |
120 | |
  // Maximum alloca size that can fit in registers. Reserve 16 registers.
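  // Each of the 256 addressable VGPRs holds 4 bytes per lane, so this is
  // (256 - 16) * 4 = 960 bytes of private memory that can plausibly be
  // promoted to registers.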
122 | const unsigned MaxAlloca = (256 - 16) * 4; |
123 | unsigned ThresholdPrivate = UnrollThresholdPrivate; |
124 | unsigned ThresholdLocal = UnrollThresholdLocal; |
125 | |
  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
  // provided threshold value as the default for Threshold.
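  // The metadata uses the usual !llvm.loop operand form, e.g. (illustrative):
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"amdgpu.loop.unroll.threshold", i32 100}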
  if (MDNode *LoopUnrollThreshold =
          findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
    if (LoopUnrollThreshold->getNumOperands() == 2) {
      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
          LoopUnrollThreshold->getOperand(1));
133 | if (MetaThresholdValue) { |
134 | // We will also use the supplied value for PartialThreshold for now. |
135 | // We may introduce additional metadata if it becomes necessary in the |
136 | // future. |
137 | UP.Threshold = MetaThresholdValue->getSExtValue(); |
138 | UP.PartialThreshold = UP.Threshold; |
        ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
        ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
141 | } |
142 | } |
143 | } |
144 | |
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
            return SubLoop->contains(BB); }))
152 | continue; // Block belongs to an inner loop. |
153 | |
154 | for (const Instruction &I : *BB) { |
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate the
      // if region and potentially even the PHI itself, saving on both
      // divergence and the registers used for the PHI.
      // Add a small bonus for each such "if" statement.
160 | if (const BranchInst *Br = dyn_cast<BranchInst>(Val: &I)) { |
161 | if (UP.Threshold < MaxBoost && Br->isConditional()) { |
162 | BasicBlock *Succ0 = Br->getSuccessor(i: 0); |
163 | BasicBlock *Succ1 = Br->getSuccessor(i: 1); |
164 | if ((L->contains(BB: Succ0) && L->isLoopExiting(BB: Succ0)) || |
165 | (L->contains(BB: Succ1) && L->isLoopExiting(BB: Succ1))) |
166 | continue; |
167 | if (dependsOnLocalPhi(L, Cond: Br->getCondition())) { |
168 | UP.Threshold += UnrollThresholdIf; |
169 | LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold |
170 | << " for loop:\n" |
171 | << *L << " due to " << *Br << '\n'); |
172 | if (UP.Threshold >= MaxBoost) |
173 | return; |
174 | } |
175 | } |
176 | continue; |
177 | } |
178 | |
179 | const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: &I); |
180 | if (!GEP) |
181 | continue; |
182 | |
183 | unsigned AS = GEP->getAddressSpace(); |
184 | unsigned Threshold = 0; |
185 | if (AS == AMDGPUAS::PRIVATE_ADDRESS) |
186 | Threshold = ThresholdPrivate; |
187 | else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) |
188 | Threshold = ThresholdLocal; |
189 | else |
190 | continue; |
191 | |
192 | if (UP.Threshold >= Threshold) |
193 | continue; |
194 | |
195 | if (AS == AMDGPUAS::PRIVATE_ADDRESS) { |
196 | const Value *Ptr = GEP->getPointerOperand(); |
197 | const AllocaInst *Alloca = |
198 | dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: Ptr)); |
199 | if (!Alloca || !Alloca->isStaticAlloca()) |
200 | continue; |
201 | Type *Ty = Alloca->getAllocatedType(); |
202 | unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0; |
203 | if (AllocaSize > MaxAlloca) |
204 | continue; |
205 | } else if (AS == AMDGPUAS::LOCAL_ADDRESS || |
206 | AS == AMDGPUAS::REGION_ADDRESS) { |
207 | LocalGEPsSeen++; |
        // Inhibit unrolling for local memory if we have seen addressing not to
        // a variable; most likely we will be unable to combine it.
        // Do not unroll too-deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
212 | if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 || |
213 | (!isa<GlobalVariable>(Val: GEP->getPointerOperand()) && |
214 | !isa<Argument>(Val: GEP->getPointerOperand()))) |
215 | continue; |
216 | LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n" |
217 | << *L << " due to LDS use.\n" ); |
218 | UP.Runtime = UnrollRuntimeLocal; |
219 | } |
220 | |
221 | // Check if GEP depends on a value defined by this loop itself. |
222 | bool HasLoopDef = false; |
223 | for (const Value *Op : GEP->operands()) { |
224 | const Instruction *Inst = dyn_cast<Instruction>(Val: Op); |
225 | if (!Inst || L->isLoopInvariant(V: Op)) |
226 | continue; |
227 | |
228 | if (llvm::any_of(Range: L->getSubLoops(), P: [Inst](const Loop* SubLoop) { |
229 | return SubLoop->contains(Inst); })) |
230 | continue; |
231 | HasLoopDef = true; |
232 | break; |
233 | } |
234 | if (!HasLoopDef) |
235 | continue; |
236 | |
237 | // We want to do whatever we can to limit the number of alloca |
238 | // instructions that make it through to the code generator. allocas |
239 | // require us to use indirect addressing, which is slow and prone to |
240 | // compiler bugs. If this loop does an address calculation on an |
241 | // alloca ptr, then we want to use a higher than normal loop unroll |
242 | // threshold. This will give SROA a better chance to eliminate these |
243 | // allocas. |
244 | // |
245 | // We also want to have more unrolling for local memory to let ds |
246 | // instructions with different offsets combine. |
247 | // |
248 | // Don't use the maximum allowed value here as it will make some |
249 | // programs way too big. |
250 | UP.Threshold = Threshold; |
251 | LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold |
252 | << " for loop:\n" |
253 | << *L << " due to " << *GEP << '\n'); |
254 | if (UP.Threshold >= MaxBoost) |
255 | return; |
256 | } |
257 | |
    // If we got a GEP in a small BB from an inner loop then increase the max
    // trip count to analyze for a better cost estimation in the unroller.
260 | if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze) |
261 | UP.MaxIterationsCountToAnalyze = 32; |
262 | } |
263 | } |
264 | |
265 | void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
266 | TTI::PeelingPreferences &PP) { |
267 | BaseT::getPeelingPreferences(L, SE, PP); |
268 | } |
269 | |
270 | int64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const { |
271 | return 1024; |
272 | } |
273 | |
274 | const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = { |
275 | // Codegen control options which don't matter. |
276 | AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler, |
277 | AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal, |
278 | AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess, |
279 | AMDGPU::FeatureUnalignedAccessMode, |
280 | |
281 | AMDGPU::FeatureAutoWaitcntBeforeBarrier, |
282 | |
283 | // Property of the kernel/environment which can't actually differ. |
284 | AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK, |
285 | AMDGPU::FeatureTrapHandler, |
286 | |
287 | // The default assumption needs to be ecc is enabled, but no directly |
288 | // exposed operations depend on it, so it can be safely inlined. |
289 | AMDGPU::FeatureSRAMECC, |
290 | |
291 | // Perf-tuning features |
292 | AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops}; |
293 | |
294 | GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) |
295 | : BaseT(TM, F.getDataLayout()), |
296 | ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))), |
297 | TLI(ST->getTargetLowering()), CommonTTI(TM, F), |
298 | IsGraphics(AMDGPU::isGraphics(CC: F.getCallingConv())) { |
299 | SIModeRegisterDefaults Mode(F, *ST); |
300 | HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign(); |
301 | HasFP64FP16Denormals = |
302 | Mode.FP64FP16Denormals != DenormalMode::getPreserveSign(); |
303 | } |
304 | |
305 | bool GCNTTIImpl::hasBranchDivergence(const Function *F) const { |
306 | return !F || !ST->isSingleLaneExecution(Kernel: *F); |
307 | } |
308 | |
309 | unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const { |
  // NB: RCID is not a real register class ID. In fact it is 0 or 1 for scalar
  // or vector registers. See getRegisterClassForType for the implementation.
  // In this case "vector registers" are not vector in terms of VGPRs, but
  // those which can hold multiple values.
314 | |
315 | // This is really the number of registers to fill when vectorizing / |
316 | // interleaving loops, so we lie to avoid trying to use all registers. |
317 | return 4; |
318 | } |
319 | |
320 | TypeSize |
321 | GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
322 | switch (K) { |
323 | case TargetTransformInfo::RGK_Scalar: |
324 | return TypeSize::getFixed(ExactSize: 32); |
325 | case TargetTransformInfo::RGK_FixedWidthVector: |
326 | return TypeSize::getFixed(ExactSize: ST->hasPackedFP32Ops() ? 64 : 32); |
327 | case TargetTransformInfo::RGK_ScalableVector: |
328 | return TypeSize::getScalable(MinimumSize: 0); |
329 | } |
330 | llvm_unreachable("Unsupported register kind" ); |
331 | } |
332 | |
333 | unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const { |
334 | return 32; |
335 | } |
336 | |
337 | unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { |
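  // Loads and stores are vectorized up to 128 bits (e.g. 4 x i32 or 8 x i16);
  // other operations only benefit from packing two 16-bit elements (or two
  // 32-bit elements on subtargets with packed FP32 instructions).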
338 | if (Opcode == Instruction::Load || Opcode == Instruction::Store) |
339 | return 32 * 4 / ElemWidth; |
340 | return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 |
341 | : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2 |
342 | : 1; |
343 | } |
344 | |
345 | unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize, |
346 | unsigned ChainSizeInBytes, |
347 | VectorType *VecTy) const { |
348 | unsigned VecRegBitWidth = VF * LoadSize; |
349 | if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32) |
350 | // TODO: Support element-size less than 32bit? |
351 | return 128 / LoadSize; |
352 | |
353 | return VF; |
354 | } |
355 | |
356 | unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize, |
357 | unsigned ChainSizeInBytes, |
358 | VectorType *VecTy) const { |
359 | unsigned VecRegBitWidth = VF * StoreSize; |
360 | if (VecRegBitWidth > 128) |
361 | return 128 / StoreSize; |
362 | |
363 | return VF; |
364 | } |
365 | |
366 | unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { |
367 | if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS || |
368 | AddrSpace == AMDGPUAS::CONSTANT_ADDRESS || |
369 | AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT || |
370 | AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER || |
371 | AddrSpace == AMDGPUAS::BUFFER_RESOURCE || |
372 | AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) { |
373 | return 512; |
374 | } |
375 | |
376 | if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) |
377 | return 8 * ST->getMaxPrivateElementSize(); |
378 | |
379 | // Common to flat, global, local and region. Assume for unknown addrspace. |
380 | return 128; |
381 | } |
382 | |
383 | bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, |
384 | Align Alignment, |
385 | unsigned AddrSpace) const { |
386 | // We allow vectorization of flat stores, even though we may need to decompose |
387 | // them later if they may access private memory. We don't have enough context |
388 | // here, and legalization can handle it. |
389 | if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { |
390 | return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) && |
391 | ChainSizeInBytes <= ST->getMaxPrivateElementSize(); |
392 | } |
393 | return true; |
394 | } |
395 | |
396 | bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, |
397 | Align Alignment, |
398 | unsigned AddrSpace) const { |
399 | return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); |
400 | } |
401 | |
402 | bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, |
403 | Align Alignment, |
404 | unsigned AddrSpace) const { |
405 | return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); |
406 | } |
407 | |
408 | int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const { |
409 | return 1024; |
410 | } |
411 | |
412 | // FIXME: Really we would like to issue multiple 128-bit loads and stores per |
413 | // iteration. Should we report a larger size and let it legalize? |
414 | // |
415 | // FIXME: Should we use narrower types for local/region, or account for when |
416 | // unaligned access is legal? |
417 | // |
418 | // FIXME: This could use fine tuning and microbenchmarks. |
419 | Type *GCNTTIImpl::getMemcpyLoopLoweringType( |
420 | LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, |
421 | unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, |
422 | std::optional<uint32_t> AtomicElementSize) const { |
423 | |
424 | if (AtomicElementSize) |
425 | return Type::getIntNTy(C&: Context, N: *AtomicElementSize * 8); |
426 | |
427 | unsigned MinAlign = std::min(a: SrcAlign, b: DestAlign); |
428 | |
429 | // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the |
430 | // hardware into byte accesses. If you assume all alignments are equally |
431 | // probable, it's more efficient on average to use short accesses for this |
432 | // case. |
433 | if (MinAlign == 2) |
434 | return Type::getInt16Ty(C&: Context); |
435 | |
436 | // Not all subtargets have 128-bit DS instructions, and we currently don't |
437 | // form them by default. |
438 | if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS || |
439 | SrcAddrSpace == AMDGPUAS::REGION_ADDRESS || |
440 | DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS || |
441 | DestAddrSpace == AMDGPUAS::REGION_ADDRESS) { |
442 | return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: 2); |
443 | } |
444 | |
445 | // Global memory works best with 16-byte accesses. Private memory will also |
446 | // hit this, although they'll be decomposed. |
447 | return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: 4); |
448 | } |
449 | |
450 | void GCNTTIImpl::getMemcpyLoopResidualLoweringType( |
451 | SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context, |
452 | unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, |
453 | unsigned SrcAlign, unsigned DestAlign, |
454 | std::optional<uint32_t> AtomicCpySize) const { |
455 | assert(RemainingBytes < 16); |
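  // For example, 11 residual bytes with 4-byte (or greater) alignment are
  // lowered as i64 + i16 + i8, and with 2-byte alignment as 5 x i16 + i8.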
456 | |
457 | if (AtomicCpySize) |
458 | BaseT::getMemcpyLoopResidualLoweringType( |
459 | OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign, |
460 | DestAlign, AtomicCpySize); |
461 | |
462 | unsigned MinAlign = std::min(a: SrcAlign, b: DestAlign); |
463 | |
464 | if (MinAlign != 2) { |
465 | Type *I64Ty = Type::getInt64Ty(C&: Context); |
466 | while (RemainingBytes >= 8) { |
467 | OpsOut.push_back(Elt: I64Ty); |
468 | RemainingBytes -= 8; |
469 | } |
470 | |
471 | Type *I32Ty = Type::getInt32Ty(C&: Context); |
472 | while (RemainingBytes >= 4) { |
473 | OpsOut.push_back(Elt: I32Ty); |
474 | RemainingBytes -= 4; |
475 | } |
476 | } |
477 | |
478 | Type *I16Ty = Type::getInt16Ty(C&: Context); |
479 | while (RemainingBytes >= 2) { |
480 | OpsOut.push_back(Elt: I16Ty); |
481 | RemainingBytes -= 2; |
482 | } |
483 | |
484 | Type *I8Ty = Type::getInt8Ty(C&: Context); |
485 | while (RemainingBytes) { |
486 | OpsOut.push_back(Elt: I8Ty); |
487 | --RemainingBytes; |
488 | } |
489 | } |
490 | |
491 | unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) { |
492 | // Disable unrolling if the loop is not vectorized. |
493 | // TODO: Enable this again. |
494 | if (VF.isScalar()) |
495 | return 1; |
496 | |
497 | return 8; |
498 | } |
499 | |
500 | bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, |
501 | MemIntrinsicInfo &Info) const { |
502 | switch (Inst->getIntrinsicID()) { |
503 | case Intrinsic::amdgcn_ds_ordered_add: |
504 | case Intrinsic::amdgcn_ds_ordered_swap: { |
505 | auto *Ordering = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: 2)); |
506 | auto *Volatile = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: 4)); |
507 | if (!Ordering || !Volatile) |
508 | return false; // Invalid. |
509 | |
510 | unsigned OrderingVal = Ordering->getZExtValue(); |
511 | if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent)) |
512 | return false; |
513 | |
514 | Info.PtrVal = Inst->getArgOperand(i: 0); |
515 | Info.Ordering = static_cast<AtomicOrdering>(OrderingVal); |
516 | Info.ReadMem = true; |
517 | Info.WriteMem = true; |
518 | Info.IsVolatile = !Volatile->isZero(); |
519 | return true; |
520 | } |
521 | default: |
522 | return false; |
523 | } |
524 | } |
525 | |
526 | InstructionCost GCNTTIImpl::getArithmeticInstrCost( |
527 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
528 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, |
529 | ArrayRef<const Value *> Args, |
530 | const Instruction *CxtI) { |
531 | |
532 | // Legalize the type. |
533 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
534 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
535 | |
536 | // Because we don't have any legal vector operations, but the legal types, we |
537 | // need to account for split vectors. |
538 | unsigned NElts = LT.second.isVector() ? |
539 | LT.second.getVectorNumElements() : 1; |
540 | |
541 | MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy; |
542 | |
543 | switch (ISD) { |
544 | case ISD::SHL: |
545 | case ISD::SRL: |
546 | case ISD::SRA: |
547 | if (SLT == MVT::i64) |
548 | return get64BitInstrCost(CostKind) * LT.first * NElts; |
549 | |
550 | if (ST->has16BitInsts() && SLT == MVT::i16) |
551 | NElts = (NElts + 1) / 2; |
552 | |
553 | // i32 |
554 | return getFullRateInstrCost() * LT.first * NElts; |
555 | case ISD::ADD: |
556 | case ISD::SUB: |
557 | case ISD::AND: |
558 | case ISD::OR: |
559 | case ISD::XOR: |
560 | if (SLT == MVT::i64) { |
561 | // and, or and xor are typically split into 2 VALU instructions. |
562 | return 2 * getFullRateInstrCost() * LT.first * NElts; |
563 | } |
564 | |
565 | if (ST->has16BitInsts() && SLT == MVT::i16) |
566 | NElts = (NElts + 1) / 2; |
567 | |
568 | return LT.first * NElts * getFullRateInstrCost(); |
569 | case ISD::MUL: { |
570 | const int QuarterRateCost = getQuarterRateInstrCost(CostKind); |
571 | if (SLT == MVT::i64) { |
572 | const int FullRateCost = getFullRateInstrCost(); |
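      // A 64-bit multiply is expanded into 32-bit multiplies (quarter rate)
      // plus carry-propagating adds (full rate); the constants below roughly
      // model that expansion.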
573 | return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts; |
574 | } |
575 | |
576 | if (ST->has16BitInsts() && SLT == MVT::i16) |
577 | NElts = (NElts + 1) / 2; |
578 | |
579 | // i32 |
580 | return QuarterRateCost * NElts * LT.first; |
581 | } |
582 | case ISD::FMUL: |
    // Check for a possible fusion of {fadd|fsub}(a, fmul(b, c)) and return a
    // zero cost for the fmul(b, c), assuming the fadd|fsub will be charged the
    // cost of the whole fused operation.
586 | if (CxtI && CxtI->hasOneUse()) |
587 | if (const auto *FAdd = dyn_cast<BinaryOperator>(Val: *CxtI->user_begin())) { |
588 | const int OPC = TLI->InstructionOpcodeToISD(Opcode: FAdd->getOpcode()); |
589 | if (OPC == ISD::FADD || OPC == ISD::FSUB) { |
590 | if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals) |
591 | return TargetTransformInfo::TCC_Free; |
592 | if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals) |
593 | return TargetTransformInfo::TCC_Free; |
594 | |
595 | // Estimate all types may be fused with contract/unsafe flags |
596 | const TargetOptions &Options = TLI->getTargetMachine().Options; |
597 | if (Options.AllowFPOpFusion == FPOpFusion::Fast || |
598 | Options.UnsafeFPMath || |
599 | (FAdd->hasAllowContract() && CxtI->hasAllowContract())) |
600 | return TargetTransformInfo::TCC_Free; |
601 | } |
602 | } |
603 | [[fallthrough]]; |
604 | case ISD::FADD: |
605 | case ISD::FSUB: |
606 | if (ST->hasPackedFP32Ops() && SLT == MVT::f32) |
607 | NElts = (NElts + 1) / 2; |
608 | if (SLT == MVT::f64) |
609 | return LT.first * NElts * get64BitInstrCost(CostKind); |
610 | |
611 | if (ST->has16BitInsts() && SLT == MVT::f16) |
612 | NElts = (NElts + 1) / 2; |
613 | |
614 | if (SLT == MVT::f32 || SLT == MVT::f16) |
615 | return LT.first * NElts * getFullRateInstrCost(); |
616 | break; |
617 | case ISD::FDIV: |
618 | case ISD::FREM: |
619 | // FIXME: frem should be handled separately. The fdiv in it is most of it, |
620 | // but the current lowering is also not entirely correct. |
621 | if (SLT == MVT::f64) { |
622 | int Cost = 7 * get64BitInstrCost(CostKind) + |
623 | getQuarterRateInstrCost(CostKind) + |
624 | 3 * getHalfRateInstrCost(CostKind); |
625 | // Add cost of workaround. |
626 | if (!ST->hasUsableDivScaleConditionOutput()) |
627 | Cost += 3 * getFullRateInstrCost(); |
628 | |
629 | return LT.first * Cost * NElts; |
630 | } |
631 | |
632 | if (!Args.empty() && match(V: Args[0], P: PatternMatch::m_FPOne())) { |
633 | // TODO: This is more complicated, unsafe flags etc. |
634 | if ((SLT == MVT::f32 && !HasFP32Denormals) || |
635 | (SLT == MVT::f16 && ST->has16BitInsts())) { |
636 | return LT.first * getQuarterRateInstrCost(CostKind) * NElts; |
637 | } |
638 | } |
639 | |
640 | if (SLT == MVT::f16 && ST->has16BitInsts()) { |
641 | // 2 x v_cvt_f32_f16 |
642 | // f32 rcp |
643 | // f32 fmul |
644 | // v_cvt_f16_f32 |
645 | // f16 div_fixup |
646 | int Cost = |
647 | 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind); |
648 | return LT.first * Cost * NElts; |
649 | } |
650 | |
651 | if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) || |
652 | TLI->getTargetMachine().Options.UnsafeFPMath)) { |
653 | // Fast unsafe fdiv lowering: |
654 | // f32 rcp |
655 | // f32 fmul |
656 | int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost(); |
657 | return LT.first * Cost * NElts; |
658 | } |
659 | |
660 | if (SLT == MVT::f32 || SLT == MVT::f16) { |
661 | // 4 more v_cvt_* insts without f16 insts support |
662 | int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() + |
663 | 1 * getQuarterRateInstrCost(CostKind); |
664 | |
665 | if (!HasFP32Denormals) { |
666 | // FP mode switches. |
667 | Cost += 2 * getFullRateInstrCost(); |
668 | } |
669 | |
670 | return LT.first * NElts * Cost; |
671 | } |
672 | break; |
673 | case ISD::FNEG: |
    // Use the backend's estimation. If fneg is not free, each element will
    // cost one additional instruction.
    return TLI->isFNegFree(SLT) ? 0 : NElts;
677 | default: |
678 | break; |
679 | } |
680 | |
681 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info, |
682 | Args, CxtI); |
683 | } |
684 | |
685 | // Return true if there's a potential benefit from using v2f16/v2i16 |
686 | // instructions for an intrinsic, even if it requires nontrivial legalization. |
687 | static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) { |
688 | switch (ID) { |
689 | case Intrinsic::fma: // TODO: fmuladd |
690 | // There's a small benefit to using vector ops in the legalized code. |
691 | case Intrinsic::round: |
692 | case Intrinsic::uadd_sat: |
693 | case Intrinsic::usub_sat: |
694 | case Intrinsic::sadd_sat: |
695 | case Intrinsic::ssub_sat: |
696 | return true; |
697 | default: |
698 | return false; |
699 | } |
700 | } |
701 | |
702 | InstructionCost |
703 | GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
704 | TTI::TargetCostKind CostKind) { |
705 | if (ICA.getID() == Intrinsic::fabs) |
706 | return 0; |
707 | |
708 | if (!intrinsicHasPackedVectorBenefit(ID: ICA.getID())) |
709 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
710 | |
711 | Type *RetTy = ICA.getReturnType(); |
712 | |
713 | // Legalize the type. |
714 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy); |
715 | |
716 | unsigned NElts = LT.second.isVector() ? |
717 | LT.second.getVectorNumElements() : 1; |
718 | |
719 | MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy; |
720 | |
721 | if (SLT == MVT::f64) |
722 | return LT.first * NElts * get64BitInstrCost(CostKind); |
723 | |
724 | if ((ST->has16BitInsts() && SLT == MVT::f16) || |
725 | (ST->hasPackedFP32Ops() && SLT == MVT::f32)) |
726 | NElts = (NElts + 1) / 2; |
727 | |
728 | // TODO: Get more refined intrinsic costs? |
729 | unsigned InstRate = getQuarterRateInstrCost(CostKind); |
730 | |
731 | switch (ICA.getID()) { |
732 | case Intrinsic::fma: |
733 | InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind) |
734 | : getQuarterRateInstrCost(CostKind); |
735 | break; |
736 | case Intrinsic::uadd_sat: |
737 | case Intrinsic::usub_sat: |
738 | case Intrinsic::sadd_sat: |
739 | case Intrinsic::ssub_sat: |
    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      NElts = 1;
743 | break; |
744 | } |
745 | |
746 | return LT.first * NElts * InstRate; |
747 | } |
748 | |
749 | InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode, |
750 | TTI::TargetCostKind CostKind, |
751 | const Instruction *I) { |
752 | assert((I == nullptr || I->getOpcode() == Opcode) && |
753 | "Opcode should reflect passed instruction." ); |
754 | const bool SCost = |
755 | (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency); |
756 | const int CBrCost = SCost ? 5 : 7; |
757 | switch (Opcode) { |
758 | case Instruction::Br: { |
759 | // Branch instruction takes about 4 slots on gfx900. |
760 | auto BI = dyn_cast_or_null<BranchInst>(Val: I); |
761 | if (BI && BI->isUnconditional()) |
762 | return SCost ? 1 : 4; |
    // Assume a conditional branch takes an additional 3 exec-manipulation
    // instructions on average.
765 | return CBrCost; |
766 | } |
767 | case Instruction::Switch: { |
768 | auto SI = dyn_cast_or_null<SwitchInst>(Val: I); |
    // Each case (including default) takes 1 cmp + 1 cbr instruction on
    // average.
771 | return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1); |
772 | } |
773 | case Instruction::Ret: |
774 | return SCost ? 1 : 10; |
775 | } |
776 | return BaseT::getCFInstrCost(Opcode, CostKind, I); |
777 | } |
778 | |
779 | InstructionCost |
780 | GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, |
781 | std::optional<FastMathFlags> FMF, |
782 | TTI::TargetCostKind CostKind) { |
783 | if (TTI::requiresOrderedReduction(FMF)) |
784 | return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); |
785 | |
786 | EVT OrigTy = TLI->getValueType(DL, Ty); |
787 | |
  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
790 | if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16) |
791 | return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); |
792 | |
793 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
794 | return LT.first * getFullRateInstrCost(); |
795 | } |
796 | |
797 | InstructionCost |
798 | GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, |
799 | FastMathFlags FMF, |
800 | TTI::TargetCostKind CostKind) { |
801 | EVT OrigTy = TLI->getValueType(DL, Ty); |
802 | |
  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
805 | if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16) |
806 | return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); |
807 | |
808 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
809 | return LT.first * getHalfRateInstrCost(CostKind); |
810 | } |
811 | |
812 | InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, |
813 | TTI::TargetCostKind CostKind, |
814 | unsigned Index, Value *Op0, |
815 | Value *Op1) { |
816 | switch (Opcode) { |
817 | case Instruction::ExtractElement: |
818 | case Instruction::InsertElement: { |
819 | unsigned EltSize |
820 | = DL.getTypeSizeInBits(Ty: cast<VectorType>(Val: ValTy)->getElementType()); |
821 | if (EltSize < 32) { |
822 | if (EltSize == 16 && Index == 0 && ST->has16BitInsts()) |
823 | return 0; |
824 | return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, |
825 | Op1); |
826 | } |
827 | |
828 | // Extracts are just reads of a subregister, so are free. Inserts are |
829 | // considered free because we don't want to have any cost for scalarizing |
830 | // operations, and we don't have to copy into a different register class. |
831 | |
832 | // Dynamic indexing isn't free and is best avoided. |
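    // (In this interface an Index of ~0u denotes an unknown, dynamic index.)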
833 | return Index == ~0u ? 2 : 0; |
834 | } |
835 | default: |
836 | return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1); |
837 | } |
838 | } |
839 | |
840 | /// Analyze if the results of inline asm are divergent. If \p Indices is empty, |
841 | /// this is analyzing the collective result of all output registers. Otherwise, |
842 | /// this is only querying a specific result index if this returns multiple |
843 | /// registers in a struct. |
844 | bool GCNTTIImpl::isInlineAsmSourceOfDivergence( |
845 | const CallInst *CI, ArrayRef<unsigned> Indices) const { |
846 | // TODO: Handle complex extract indices |
847 | if (Indices.size() > 1) |
848 | return true; |
849 | |
850 | const DataLayout &DL = CI->getDataLayout(); |
851 | const SIRegisterInfo *TRI = ST->getRegisterInfo(); |
852 | TargetLowering::AsmOperandInfoVector TargetConstraints = |
853 | TLI->ParseConstraints(DL, TRI: ST->getRegisterInfo(), Call: *CI); |
854 | |
855 | const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0]; |
856 | |
857 | int OutputIdx = 0; |
858 | for (auto &TC : TargetConstraints) { |
859 | if (TC.Type != InlineAsm::isOutput) |
860 | continue; |
861 | |
862 | // Skip outputs we don't care about. |
863 | if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++) |
864 | continue; |
865 | |
866 | TLI->ComputeConstraintToUse(OpInfo&: TC, Op: SDValue()); |
867 | |
868 | const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint( |
869 | TRI, Constraint: TC.ConstraintCode, VT: TC.ConstraintVT).second; |
870 | |
871 | // For AGPR constraints null is returned on subtargets without AGPRs, so |
872 | // assume divergent for null. |
873 | if (!RC || !TRI->isSGPRClass(RC)) |
874 | return true; |
875 | } |
876 | |
877 | return false; |
878 | } |
879 | |
880 | bool GCNTTIImpl::isReadRegisterSourceOfDivergence( |
881 | const IntrinsicInst *ReadReg) const { |
882 | Metadata *MD = |
883 | cast<MetadataAsValue>(Val: ReadReg->getArgOperand(i: 0))->getMetadata(); |
884 | StringRef RegName = |
885 | cast<MDString>(Val: cast<MDNode>(Val: MD)->getOperand(I: 0))->getString(); |
886 | |
887 | // Special case registers that look like VCC. |
888 | MVT VT = MVT::getVT(Ty: ReadReg->getType()); |
889 | if (VT == MVT::i1) |
890 | return true; |
891 | |
892 | // Special case scalar registers that start with 'v'. |
893 | if (RegName.starts_with(Prefix: "vcc" ) || RegName.empty()) |
894 | return false; |
895 | |
896 | // VGPR or AGPR is divergent. There aren't any specially named vector |
897 | // registers. |
898 | return RegName[0] == 'v' || RegName[0] == 'a'; |
899 | } |
900 | |
901 | /// \returns true if the result of the value could potentially be |
902 | /// different across workitems in a wavefront. |
903 | bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const { |
904 | if (const Argument *A = dyn_cast<Argument>(Val: V)) |
905 | return !AMDGPU::isArgPassedInSGPR(Arg: A); |
906 | |
907 | // Loads from the private and flat address spaces are divergent, because |
908 | // threads can execute the load instruction with the same inputs and get |
909 | // different results. |
910 | // |
911 | // All other loads are not divergent, because if threads issue loads with the |
912 | // same arguments, they will always get the same result. |
913 | if (const LoadInst *Load = dyn_cast<LoadInst>(Val: V)) |
914 | return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS || |
915 | Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS; |
916 | |
917 | // Atomics are divergent because they are executed sequentially: when an |
918 | // atomic operation refers to the same address in each thread, then each |
919 | // thread after the first sees the value written by the previous thread as |
920 | // original value. |
921 | if (isa<AtomicRMWInst>(Val: V) || isa<AtomicCmpXchgInst>(Val: V)) |
922 | return true; |
923 | |
924 | if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V)) { |
925 | if (Intrinsic->getIntrinsicID() == Intrinsic::read_register) |
926 | return isReadRegisterSourceOfDivergence(ReadReg: Intrinsic); |
927 | |
928 | return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: Intrinsic->getIntrinsicID()); |
929 | } |
930 | |
931 | // Assume all function calls are a source of divergence. |
932 | if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) { |
933 | if (CI->isInlineAsm()) |
934 | return isInlineAsmSourceOfDivergence(CI); |
935 | return true; |
936 | } |
937 | |
938 | // Assume all function calls are a source of divergence. |
939 | if (isa<InvokeInst>(Val: V)) |
940 | return true; |
941 | |
942 | return false; |
943 | } |
944 | |
945 | bool GCNTTIImpl::isAlwaysUniform(const Value *V) const { |
946 | if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V)) |
947 | return AMDGPU::isIntrinsicAlwaysUniform(IntrID: Intrinsic->getIntrinsicID()); |
948 | |
949 | if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) { |
950 | if (CI->isInlineAsm()) |
951 | return !isInlineAsmSourceOfDivergence(CI); |
952 | return false; |
953 | } |
954 | |
955 | // In most cases TID / wavefrontsize is uniform. |
956 | // |
  // However, if a kernel has uneven dimensions we can have a value of
  // workitem-id-x divided by the wavefrontsize non-uniform. For example
  // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
  // packed into the same wave which gives 1 and 0 after the division by 64
  // respectively.
  //
  // FIXME: limit it to 1D kernels only, although it should be possible
  // to perform this optimization if the size of the X dimension is a power
  // of 2; we just do not currently have the infrastructure to query it.
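  // For a wave64 1D kernel the patterns below match e.g.
  // (workitem.id.x >> 6) and (workitem.id.x & -64), both uniform per wave.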
966 | using namespace llvm::PatternMatch; |
967 | uint64_t C; |
968 | if (match(V, P: m_LShr(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(), |
969 | R: m_ConstantInt(V&: C))) || |
970 | match(V, P: m_AShr(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(), |
971 | R: m_ConstantInt(V&: C)))) { |
972 | const Function *F = cast<Instruction>(Val: V)->getFunction(); |
973 | return C >= ST->getWavefrontSizeLog2() && |
974 | ST->getMaxWorkitemID(Kernel: *F, Dimension: 1) == 0 && ST->getMaxWorkitemID(Kernel: *F, Dimension: 2) == 0; |
975 | } |
976 | |
977 | Value *Mask; |
978 | if (match(V, P: m_c_And(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(), |
979 | R: m_Value(V&: Mask)))) { |
980 | const Function *F = cast<Instruction>(Val: V)->getFunction(); |
981 | const DataLayout &DL = F->getDataLayout(); |
982 | return computeKnownBits(V: Mask, DL).countMinTrailingZeros() >= |
983 | ST->getWavefrontSizeLog2() && |
984 | ST->getMaxWorkitemID(Kernel: *F, Dimension: 1) == 0 && ST->getMaxWorkitemID(Kernel: *F, Dimension: 2) == 0; |
985 | } |
986 | |
987 | const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(Val: V); |
988 | if (!ExtValue) |
989 | return false; |
990 | |
991 | const CallInst *CI = dyn_cast<CallInst>(Val: ExtValue->getOperand(i_nocapture: 0)); |
992 | if (!CI) |
993 | return false; |
994 | |
995 | if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: CI)) { |
996 | switch (Intrinsic->getIntrinsicID()) { |
997 | default: |
998 | return false; |
999 | case Intrinsic::amdgcn_if: |
1000 | case Intrinsic::amdgcn_else: { |
1001 | ArrayRef<unsigned> Indices = ExtValue->getIndices(); |
1002 | return Indices.size() == 1 && Indices[0] == 1; |
1003 | } |
1004 | } |
1005 | } |
1006 | |
1007 | // If we have inline asm returning mixed SGPR and VGPR results, we inferred |
1008 | // divergent for the overall struct return. We need to override it in the |
1009 | // case we're extracting an SGPR component here. |
1010 | if (CI->isInlineAsm()) |
1011 | return !isInlineAsmSourceOfDivergence(CI, Indices: ExtValue->getIndices()); |
1012 | |
1013 | return false; |
1014 | } |
1015 | |
1016 | bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, |
1017 | Intrinsic::ID IID) const { |
1018 | switch (IID) { |
1019 | case Intrinsic::amdgcn_is_shared: |
1020 | case Intrinsic::amdgcn_is_private: |
1021 | case Intrinsic::amdgcn_flat_atomic_fadd: |
1022 | case Intrinsic::amdgcn_flat_atomic_fmax: |
1023 | case Intrinsic::amdgcn_flat_atomic_fmin: |
1024 | case Intrinsic::amdgcn_flat_atomic_fmax_num: |
1025 | case Intrinsic::amdgcn_flat_atomic_fmin_num: |
1026 | OpIndexes.push_back(Elt: 0); |
1027 | return true; |
1028 | default: |
1029 | return false; |
1030 | } |
1031 | } |
1032 | |
1033 | Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, |
1034 | Value *OldV, |
1035 | Value *NewV) const { |
1036 | auto IntrID = II->getIntrinsicID(); |
1037 | switch (IntrID) { |
1038 | case Intrinsic::amdgcn_is_shared: |
1039 | case Intrinsic::amdgcn_is_private: { |
1040 | unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ? |
1041 | AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS; |
1042 | unsigned NewAS = NewV->getType()->getPointerAddressSpace(); |
1043 | LLVMContext &Ctx = NewV->getType()->getContext(); |
1044 | ConstantInt *NewVal = (TrueAS == NewAS) ? |
1045 | ConstantInt::getTrue(Context&: Ctx) : ConstantInt::getFalse(Context&: Ctx); |
1046 | return NewVal; |
1047 | } |
1048 | case Intrinsic::ptrmask: { |
1049 | unsigned OldAS = OldV->getType()->getPointerAddressSpace(); |
1050 | unsigned NewAS = NewV->getType()->getPointerAddressSpace(); |
1051 | Value *MaskOp = II->getArgOperand(i: 1); |
1052 | Type *MaskTy = MaskOp->getType(); |
1053 | |
1054 | bool DoTruncate = false; |
1055 | |
1056 | const GCNTargetMachine &TM = |
1057 | static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine()); |
1058 | if (!TM.isNoopAddrSpaceCast(SrcAS: OldAS, DestAS: NewAS)) { |
      // All valid 64-bit to 32-bit casts work by chopping off the high
      // bits. Any masking that only clears the low bits will also apply in the
      // new address space.
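      // For example, an alignment mask such as 0xFFFFFFFFFFFFFFF0 has all of
      // its high 32 bits set and can safely be truncated to a 32-bit mask.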
1062 | if (DL.getPointerSizeInBits(AS: OldAS) != 64 || |
1063 | DL.getPointerSizeInBits(AS: NewAS) != 32) |
1064 | return nullptr; |
1065 | |
1066 | // TODO: Do we need to thread more context in here? |
1067 | KnownBits Known = computeKnownBits(V: MaskOp, DL, Depth: 0, AC: nullptr, CxtI: II); |
1068 | if (Known.countMinLeadingOnes() < 32) |
1069 | return nullptr; |
1070 | |
1071 | DoTruncate = true; |
1072 | } |
1073 | |
1074 | IRBuilder<> B(II); |
1075 | if (DoTruncate) { |
1076 | MaskTy = B.getInt32Ty(); |
1077 | MaskOp = B.CreateTrunc(V: MaskOp, DestTy: MaskTy); |
1078 | } |
1079 | |
1080 | return B.CreateIntrinsic(ID: Intrinsic::ptrmask, Types: {NewV->getType(), MaskTy}, |
1081 | Args: {NewV, MaskOp}); |
1082 | } |
1083 | case Intrinsic::amdgcn_flat_atomic_fadd: |
1084 | case Intrinsic::amdgcn_flat_atomic_fmax: |
1085 | case Intrinsic::amdgcn_flat_atomic_fmin: |
1086 | case Intrinsic::amdgcn_flat_atomic_fmax_num: |
1087 | case Intrinsic::amdgcn_flat_atomic_fmin_num: { |
1088 | Type *DestTy = II->getType(); |
1089 | Type *SrcTy = NewV->getType(); |
1090 | unsigned NewAS = SrcTy->getPointerAddressSpace(); |
1091 | if (!AMDGPU::isExtendedGlobalAddrSpace(AS: NewAS)) |
1092 | return nullptr; |
1093 | Module *M = II->getModule(); |
1094 | Function *NewDecl = Intrinsic::getDeclaration(M, id: II->getIntrinsicID(), |
1095 | Tys: {DestTy, SrcTy, DestTy}); |
1096 | II->setArgOperand(i: 0, v: NewV); |
1097 | II->setCalledFunction(NewDecl); |
1098 | return II; |
1099 | } |
1100 | default: |
1101 | return nullptr; |
1102 | } |
1103 | } |
1104 | |
1105 | InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, |
1106 | VectorType *VT, ArrayRef<int> Mask, |
1107 | TTI::TargetCostKind CostKind, |
1108 | int Index, VectorType *SubTp, |
1109 | ArrayRef<const Value *> Args, |
1110 | const Instruction *CxtI) { |
1111 | if (!isa<FixedVectorType>(Val: VT)) |
1112 | return BaseT::getShuffleCost(Kind, Tp: VT, Mask, CostKind, Index, SubTp); |
1113 | |
1114 | Kind = improveShuffleKindFromMask(Kind, Mask, Ty: VT, Index, SubTy&: SubTp); |
1115 | |
1116 | // Larger vector widths may require additional instructions, but are |
1117 | // typically cheaper than scalarized versions. |
1118 | unsigned NumVectorElts = cast<FixedVectorType>(Val: VT)->getNumElements(); |
1119 | if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && |
1120 | DL.getTypeSizeInBits(Ty: VT->getElementType()) == 16) { |
1121 | bool HasVOP3P = ST->hasVOP3PInsts(); |
1122 | unsigned RequestedElts = |
1123 | count_if(Range&: Mask, P: [](int MaskElt) { return MaskElt != -1; }); |
1124 | if (RequestedElts == 0) |
1125 | return 0; |
1126 | switch (Kind) { |
1127 | case TTI::SK_Broadcast: |
1128 | case TTI::SK_Reverse: |
1129 | case TTI::SK_PermuteSingleSrc: { |
1130 | // With op_sel VOP3P instructions freely can access the low half or high |
1131 | // half of a register, so any swizzle of two elements is free. |
1132 | if (HasVOP3P && NumVectorElts == 2) |
1133 | return 0; |
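      // Otherwise each pair of 16-bit elements is typically handled with a
      // permute instruction (e.g. v_perm_b32), which also needs a mask
      // constant.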
1134 | unsigned NumPerms = alignTo(Value: RequestedElts, Align: 2) / 2; |
1135 | // SK_Broadcast just reuses the same mask |
1136 | unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms; |
1137 | return NumPerms + NumPermMasks; |
1138 | } |
1139 | case TTI::SK_ExtractSubvector: |
1140 | case TTI::SK_InsertSubvector: { |
1141 | // Even aligned accesses are free |
1142 | if (!(Index % 2)) |
1143 | return 0; |
1144 | // Insert/extract subvectors only require shifts / extract code to get the |
1145 | // relevant bits |
1146 | return alignTo(Value: RequestedElts, Align: 2) / 2; |
1147 | } |
1148 | case TTI::SK_PermuteTwoSrc: |
1149 | case TTI::SK_Splice: |
1150 | case TTI::SK_Select: { |
1151 | unsigned NumPerms = alignTo(Value: RequestedElts, Align: 2) / 2; |
1152 | // SK_Select just reuses the same mask |
1153 | unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms; |
1154 | return NumPerms + NumPermMasks; |
1155 | } |
1156 | |
1157 | default: |
1158 | break; |
1159 | } |
1160 | } |
1161 | |
1162 | return BaseT::getShuffleCost(Kind, Tp: VT, Mask, CostKind, Index, SubTp); |
1163 | } |
1164 | |
1165 | bool GCNTTIImpl::areInlineCompatible(const Function *Caller, |
1166 | const Function *Callee) const { |
1167 | const TargetMachine &TM = getTLI()->getTargetMachine(); |
1168 | const GCNSubtarget *CallerST |
1169 | = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller)); |
1170 | const GCNSubtarget *CalleeST |
1171 | = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee)); |
1172 | |
1173 | const FeatureBitset &CallerBits = CallerST->getFeatureBits(); |
1174 | const FeatureBitset &CalleeBits = CalleeST->getFeatureBits(); |
1175 | |
1176 | FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; |
1177 | FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; |
1178 | if ((RealCallerBits & RealCalleeBits) != RealCalleeBits) |
1179 | return false; |
1180 | |
1181 | // FIXME: dx10_clamp can just take the caller setting, but there seems to be |
1182 | // no way to support merge for backend defined attributes. |
1183 | SIModeRegisterDefaults CallerMode(*Caller, *CallerST); |
1184 | SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST); |
1185 | if (!CallerMode.isInlineCompatible(CalleeMode)) |
1186 | return false; |
1187 | |
1188 | if (Callee->hasFnAttribute(Kind: Attribute::AlwaysInline) || |
1189 | Callee->hasFnAttribute(Kind: Attribute::InlineHint)) |
1190 | return true; |
1191 | |
1192 | // Hack to make compile times reasonable. |
1193 | if (InlineMaxBB) { |
1194 | // Single BB does not increase total BB amount. |
1195 | if (Callee->size() == 1) |
1196 | return true; |
1197 | size_t BBSize = Caller->size() + Callee->size() - 1; |
1198 | return BBSize <= InlineMaxBB; |
1199 | } |
1200 | |
1201 | return true; |
1202 | } |
1203 | |
1204 | static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, |
1205 | const SITargetLowering *TLI, |
1206 | const GCNTTIImpl *TTIImpl) { |
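  // Rough number of SGPR/VGPR registers available for passing arguments before
  // further arguments have to go through the stack (a heuristic, not an exact
  // ABI limit).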
1207 | const int NrOfSGPRUntilSpill = 26; |
1208 | const int NrOfVGPRUntilSpill = 32; |
1209 | |
1210 | const DataLayout &DL = TTIImpl->getDataLayout(); |
1211 | |
1212 | unsigned adjustThreshold = 0; |
1213 | int SGPRsInUse = 0; |
1214 | int VGPRsInUse = 0; |
1215 | for (const Use &A : CB->args()) { |
1216 | SmallVector<EVT, 4> ValueVTs; |
1217 | ComputeValueVTs(TLI: *TLI, DL, Ty: A.get()->getType(), ValueVTs); |
1218 | for (auto ArgVT : ValueVTs) { |
1219 | unsigned CCRegNum = TLI->getNumRegistersForCallingConv( |
1220 | Context&: CB->getContext(), CC: CB->getCallingConv(), VT: ArgVT); |
1221 | if (AMDGPU::isArgPassedInSGPR(CB, ArgNo: CB->getArgOperandNo(U: &A))) |
1222 | SGPRsInUse += CCRegNum; |
1223 | else |
1224 | VGPRsInUse += CCRegNum; |
1225 | } |
1226 | } |
1227 | |
  // The cost of passing a function argument through the stack:
  // 1 instruction to put the argument on the stack in the caller.
  // 1 instruction to take the argument from the stack in the callee.
  // 1 instruction to explicitly take care of data dependencies in the callee
  // function.
1233 | InstructionCost ArgStackCost(1); |
1234 | ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost( |
1235 | Opcode: Instruction::Store, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align(4), |
1236 | AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency); |
1237 | ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost( |
1238 | Opcode: Instruction::Load, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align(4), |
1239 | AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency); |
1240 | |
1241 | // The penalty cost is computed relative to the cost of instructions and does |
1242 | // not model any storage costs. |
1243 | adjustThreshold += std::max(a: 0, b: SGPRsInUse - NrOfSGPRUntilSpill) * |
1244 | *ArgStackCost.getValue() * InlineConstants::getInstrCost(); |
1245 | adjustThreshold += std::max(a: 0, b: VGPRsInUse - NrOfVGPRUntilSpill) * |
1246 | *ArgStackCost.getValue() * InlineConstants::getInstrCost(); |
1247 | return adjustThreshold; |
1248 | } |
1249 | |
1250 | static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, |
1251 | const DataLayout &DL) { |
1252 | // If we have a pointer to a private array passed into a function |
1253 | // it will not be optimized out, leaving scratch usage. |
1254 | // This function calculates the total size in bytes of the memory that would |
1255 | // end in scratch if the call was not inlined. |
1256 | unsigned AllocaSize = 0; |
1257 | SmallPtrSet<const AllocaInst *, 8> AIVisited; |
1258 | for (Value *PtrArg : CB->args()) { |
1259 | PointerType *Ty = dyn_cast<PointerType>(Val: PtrArg->getType()); |
1260 | if (!Ty) |
1261 | continue; |
1262 | |
1263 | unsigned AddrSpace = Ty->getAddressSpace(); |
1264 | if (AddrSpace != AMDGPUAS::FLAT_ADDRESS && |
1265 | AddrSpace != AMDGPUAS::PRIVATE_ADDRESS) |
1266 | continue; |
1267 | |
1268 | const AllocaInst *AI = dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: PtrArg)); |
1269 | if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(Ptr: AI).second) |
1270 | continue; |
1271 | |
1272 | AllocaSize += DL.getTypeAllocSize(Ty: AI->getAllocatedType()); |
1273 | } |
1274 | return AllocaSize; |
1275 | } |
1276 | |
1277 | unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const { |
1278 | unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, TTIImpl: this); |
1279 | |
1280 | // Private object passed as arguments may end up in scratch usage if the call |
1281 | // is not inlined. Increase the inline threshold to promote inlining. |
1282 | unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL); |
1283 | if (AllocaSize > 0) |
1284 | Threshold += ArgAllocaCost; |
1285 | return Threshold; |
1286 | } |
1287 | |
1288 | unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB, |
1289 | const AllocaInst *AI) const { |
1290 | |
  // Below the cutoff, assume that the private memory objects would be
  // optimized away.
1293 | auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL); |
1294 | if (AllocaSize <= ArgAllocaCutoff) |
1295 | return 0; |
1296 | |
  // Above the cutoff, we give a cost to each private memory object
  // depending on its size. If the array can be optimized by SROA this cost is
  // not added to the total cost in the inliner cost analysis.
1300 | // |
1301 | // We choose the total cost of the alloca such that their sum cancels the |
1302 | // bonus given in the threshold (ArgAllocaCost). |
1303 | // |
1304 | // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost |
1305 | // |
1306 | // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier, |
1307 | // the single-bb bonus and the vector-bonus. |
1308 | // |
1309 | // We compensate the first two multipliers, by repeating logic from the |
1310 | // inliner-cost in here. The vector-bonus is 0 on AMDGPU. |
1311 | static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0" ); |
1312 | unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier(); |
1313 | |
1314 | bool SingleBB = none_of(Range&: *CB->getCalledFunction(), P: [](const BasicBlock &BB) { |
1315 | return BB.getTerminator()->getNumSuccessors() > 1; |
1316 | }); |
1317 | if (SingleBB) { |
1318 | Threshold += Threshold / 2; |
1319 | } |
1320 | |
1321 | auto ArgAllocaSize = DL.getTypeAllocSize(Ty: AI->getAllocatedType()); |
1322 | |
1323 | // Attribute the bonus proportionally to the alloca size |
1324 | unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize; |
1325 | |
1326 | return AllocaThresholdBonus; |
1327 | } |
1328 | |
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) {
1332 | CommonTTI.getUnrollingPreferences(L, SE, UP, ORE); |
1333 | } |
1334 | |
1335 | void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
1336 | TTI::PeelingPreferences &PP) { |
1337 | CommonTTI.getPeelingPreferences(L, SE, PP); |
1338 | } |
1339 | |
1340 | int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const { |
1341 | return ST->hasFullRate64Ops() |
1342 | ? getFullRateInstrCost() |
1343 | : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind) |
1344 | : getQuarterRateInstrCost(CostKind); |
1345 | } |
1346 | |
1347 | std::pair<InstructionCost, MVT> |
1348 | GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const { |
1349 | std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty); |
1350 | auto Size = DL.getTypeSizeInBits(Ty); |
1351 | // Maximum load or store can handle 8 dwords for scalar and 4 for |
1352 | // vector ALU. Let's assume anything above 8 dwords is expensive |
1353 | // even if legal. |
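  // For example, a 1024-bit type gets an extra cost of (1024 + 255) / 256 = 4
  // below.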
1354 | if (Size <= 256) |
1355 | return Cost; |
1356 | |
1357 | Cost.first += (Size + 255) / 256; |
1358 | return Cost; |
1359 | } |
1360 | |
1361 | unsigned GCNTTIImpl::getPrefetchDistance() const { |
1362 | return ST->hasPrefetch() ? 128 : 0; |
1363 | } |
1364 | |
1365 | bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const { |
1366 | return AMDGPU::isFlatGlobalAddrSpace(AS); |
1367 | } |
1368 | |