AMDGPUTargetTransformInfo.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp]

1	//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// \file
10	// This file implements a TargetTransformInfo analysis pass specific to the
11	// AMDGPU target machine. It uses the target's detailed information to provide
12	// more precise answers to certain TTI queries, while letting the target
13	// independent and default TTI implementations handle the rest.
14	//
15	//===----------------------------------------------------------------------===//
16
17	#include "AMDGPUTargetTransformInfo.h"
18	#include "AMDGPUTargetMachine.h"
19	#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20	#include "SIModeRegisterDefaults.h"
21	#include "llvm/Analysis/InlineCost.h"
22	#include "llvm/Analysis/LoopInfo.h"
23	#include "llvm/Analysis/ValueTracking.h"
24	#include "llvm/CodeGen/Analysis.h"
25	#include "llvm/IR/Function.h"
26	#include "llvm/IR/IRBuilder.h"
27	#include "llvm/IR/IntrinsicsAMDGPU.h"
28	#include "llvm/IR/PatternMatch.h"
29	#include "llvm/Support/KnownBits.h"
30	#include "llvm/Transforms/Utils/UnrollLoop.h"
31	#include <optional>
32
33	using namespace llvm;
34
35	#define DEBUG_TYPE "AMDGPUtti"
36
37	static cl::opt<unsigned> UnrollThresholdPrivate(
38	"amdgpu-unroll-threshold-private",
39	cl::desc ("Unroll threshold for AMDGPU if private memory used in a loop"),
40	cl::init(Val: `2700`), cl::Hidden);
41
42	static cl::opt<unsigned> UnrollThresholdLocal(
43	"amdgpu-unroll-threshold-local",
44	cl::desc ("Unroll threshold for AMDGPU if local memory used in a loop"),
45	cl::init(Val: `1000`), cl::Hidden);
46
47	static cl::opt<unsigned> UnrollThresholdIf(
48	"amdgpu-unroll-threshold-if",
49	cl::desc ("Unroll threshold increment for AMDGPU for each if statement inside loop"),
50	cl::init(Val: `200`), cl::Hidden);
51
52	static cl::opt<bool> UnrollRuntimeLocal(
53	"amdgpu-unroll-runtime-local",
54	cl::desc ("Allow runtime unroll for AMDGPU if local memory used in a loop"),
55	cl::init(Val: true), cl::Hidden);
56
57	static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
58	"amdgpu-unroll-max-block-to-analyze",
59	cl::desc ("Inner loop block size threshold to analyze in unroll for AMDGPU"),
60	cl::init(Val: `32`), cl::Hidden);
61
62	static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
63	cl::Hidden, cl::init(Val: `4000`),
64	cl::desc ("Cost of alloca argument"));
65
66	// If the amount of scratch memory to eliminate exceeds our ability to allocate
67	// it into registers we gain nothing by aggressively inlining functions for that
68	// heuristic.
69	static cl::opt<unsigned>
70	ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
71	cl::init(Val: `256`),
72	cl::desc ("Maximum alloca size to use for inline cost"));
73
74	// Inliner constraint to achieve reasonable compilation time.
75	static cl::opt<size_t> InlineMaxBB(
76	"amdgpu-inline-max-bb", cl::Hidden, cl::init(Val: `1100`),
77	cl::desc ("Maximum number of BBs allowed in a function after inlining"
78	" (compile time constraint)"));
79
80	// This default unroll factor is based on microbenchmarks on gfx1030.
81	static cl::opt<unsigned> MemcpyLoopUnroll(
82	"amdgpu-memcpy-loop-unroll",
83	cl::desc ("Unroll factor (affecting 4x32-bit operations) to use for memory "
84	"operations when lowering statically-sized memcpy, memmove, or"
85	"memset as a loop"),
86	cl::init(Val: `16`), cl::Hidden);
87
88	static bool dependsOnLocalPhi(const Loop L, const* Value *Cond,
89	unsigned Depth = `0`) {
90	const Instruction *I = dyn_cast<Instruction>(Val: Cond);
91	if (!I)
92	return false;
93
94	for (const Value *V : I->operand_values()) {
95	if (!L->contains(Inst: I))
96	continue;
97	if (const PHINode *PHI = dyn_cast<PHINode>(Val: V)) {
98	if (llvm::none_of(Range: L->getSubLoops(), P: [PHI](const Loop* SubLoop) {
99	return SubLoop->contains(Inst: PHI); }))
100	return true;
101	} else if (Depth < `10` && dependsOnLocalPhi(L, Cond: V, Depth: Depth+`1`))
102	return true;
103	}
104	return false;
105	}
106
107	AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine TM, const* Function &F)
108	: BaseT (TM, F.getDataLayout()),
109	TargetTriple (TM->getTargetTriple()),
110	ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
111	TLI(ST->getTargetLowering()) {}
112
113	void AMDGPUTTIImpl::getUnrollingPreferences(
114	Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
115	OptimizationRemarkEmitter ORE) const* {
116	const Function &F = *L->getHeader()->getParent();
117	UP.Threshold =
118	F.getFnAttributeAsParsedInteger(Kind: "amdgpu-unroll-threshold", Default: `300`);
119	UP.MaxCount = std::numeric_limits<unsigned>::max();
120	UP.Partial = true;
121
122	// Conditional branch in a loop back edge needs 3 additional exec
123	// manipulations in average.
124	UP.BEInsns += `3`;
125
126	// We want to run unroll even for the loops which have been vectorized.
127	UP.UnrollVectorizedLoop = true;
128
129	// TODO: Do we want runtime unrolling?
130
131	// Maximum alloca size than can fit registers. Reserve 16 registers.
132	const unsigned MaxAlloca = (`256` - `16`) * `4`;
133	unsigned ThresholdPrivate = UnrollThresholdPrivate;
134	unsigned ThresholdLocal = UnrollThresholdLocal;
135
136	// If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
137	// provided threshold value as the default for Threshold
138	if (MDNode *LoopUnrollThreshold =
139	findOptionMDForLoop(TheLoop: L, Name: "amdgpu.loop.unroll.threshold")) {
140	if (LoopUnrollThreshold->getNumOperands() == `2`) {
141	ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
142	MD: LoopUnrollThreshold->getOperand(I: `1`));
143	if (MetaThresholdValue) {
144	// We will also use the supplied value for PartialThreshold for now.
145	// We may introduce additional metadata if it becomes necessary in the
146	// future.
147	UP.Threshold = MetaThresholdValue->getSExtValue();
148	UP.PartialThreshold = UP.Threshold;
149	ThresholdPrivate = std::min(a: ThresholdPrivate, b: UP.Threshold);
150	ThresholdLocal = std::min(a: ThresholdLocal, b: UP.Threshold);
151	}
152	}
153	}
154
155	unsigned MaxBoost = std::max(a: ThresholdPrivate, b: ThresholdLocal);
156	for (const BasicBlock *BB : L->getBlocks()) {
157	const DataLayout &DL = BB->getDataLayout();
158	unsigned LocalGEPsSeen = `0`;
159
160	if (llvm::any_of(Range: L->getSubLoops(), P: [BB](const Loop* SubLoop) {
161	return SubLoop->contains(BB); }))
162	continue; // Block belongs to an inner loop.
163
164	for (const Instruction &I : *BB) {
165	// Unroll a loop which contains an "if" statement whose condition
166	// defined by a PHI belonging to the loop. This may help to eliminate
167	// if region and potentially even PHI itself, saving on both divergence
168	// and registers used for the PHI.
169	// Add a small bonus for each of such "if" statements.
170	if (const CondBrInst *Br = dyn_cast<CondBrInst>(Val: &I)) {
171	if (UP.Threshold < MaxBoost) {
172	BasicBlock *Succ0 = Br->getSuccessor(i: `0`);
173	BasicBlock *Succ1 = Br->getSuccessor(i: `1`);
174	if ((L->contains(BB: Succ0) && L->isLoopExiting(BB: Succ0)) \|\|
175	(L->contains(BB: Succ1) && L->isLoopExiting(BB: Succ1)))
176	continue;
177	if (dependsOnLocalPhi(L, Cond: Br->getCondition())) {
178	UP.Threshold += UnrollThresholdIf;
179	LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
180	<< " for loop:\n"
181	<< L << " due to " << Br << `'\n'`);
182	if (UP.Threshold >= MaxBoost)
183	return;
184	}
185	}
186	continue;
187	}
188
189	const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: &I);
190	if (!GEP)
191	continue;
192
193	unsigned AS = GEP->getAddressSpace();
194	unsigned Threshold = `0`;
195	if (AS == AMDGPUAS::PRIVATE_ADDRESS)
196	Threshold = ThresholdPrivate;
197	else if (AS == AMDGPUAS::LOCAL_ADDRESS \|\| AS == AMDGPUAS::REGION_ADDRESS)
198	Threshold = ThresholdLocal;
199	else
200	continue;
201
202	if (UP.Threshold >= Threshold)
203	continue;
204
205	if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
206	const Value *Ptr = GEP->getPointerOperand();
207	const AllocaInst *Alloca =
208	dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: Ptr));
209	if (!Alloca \|\| !Alloca->isStaticAlloca())
210	continue;
211	auto AllocaSize = Alloca->getAllocationSize(DL);
212	if (!AllocaSize \|\| AllocaSize ->getFixedValue() > MaxAlloca)
213	continue;
214	} else if (AS == AMDGPUAS::LOCAL_ADDRESS \|\|
215	AS == AMDGPUAS::REGION_ADDRESS) {
216	LocalGEPsSeen++;
217	// Inhibit unroll for local memory if we have seen addressing not to
218	// a variable, most likely we will be unable to combine it.
219	// Do not unroll too deep inner loops for local memory to give a chance
220	// to unroll an outer loop for a more important reason.
221	if (LocalGEPsSeen > `1` \|\| L->getLoopDepth() > `2` \|\|
222	(!isa<GlobalVariable>(Val: GEP->getPointerOperand()) &&
223	!isa<Argument>(Val: GEP->getPointerOperand())))
224	continue;
225	LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
226	<< *L << " due to LDS use.\n");
227	UP.Runtime = UnrollRuntimeLocal;
228	}
229
230	// Check if GEP depends on a value defined by this loop itself.
231	bool HasLoopDef = false;
232	for (const Value *Op : GEP->operands()) {
233	const Instruction *Inst = dyn_cast<Instruction>(Val: Op);
234	if (!Inst \|\| L->isLoopInvariant(V: Op))
235	continue;
236
237	if (llvm::any_of(Range: L->getSubLoops(), P: [Inst](const Loop* SubLoop) {
238	return SubLoop->contains(Inst); }))
239	continue;
240	HasLoopDef = true;
241	break;
242	}
243	if (!HasLoopDef)
244	continue;
245
246	// We want to do whatever we can to limit the number of alloca
247	// instructions that make it through to the code generator. allocas
248	// require us to use indirect addressing, which is slow and prone to
249	// compiler bugs. If this loop does an address calculation on an
250	// alloca ptr, then we want to use a higher than normal loop unroll
251	// threshold. This will give SROA a better chance to eliminate these
252	// allocas.
253	//
254	// We also want to have more unrolling for local memory to let ds
255	// instructions with different offsets combine.
256	//
257	// Don't use the maximum allowed value here as it will make some
258	// programs way too big.
259	UP.Threshold = Threshold;
260	LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
261	<< " for loop:\n"
262	<< L << " due to " << GEP << `'\n'`);
263	if (UP.Threshold >= MaxBoost)
264	return;
265	}
266
267	// If we got a GEP in a small BB from inner loop then increase max trip
268	// count to analyze for better estimation cost in unroll
269	if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
270	UP.MaxIterationsCountToAnalyze = `32`;
271	}
272	// If a user provided an explicit unroll pragma (with or without count),
273	// override expensive trip count checks
274	UnrollPragmaInfo PInfo(L);
275	if (PInfo.PragmaEnableUnroll \|\| PInfo.PragmaCount > `0`)
276	UP.AllowExpensiveTripCount = true;
277	}
278
279	void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
280	TTI::PeelingPreferences &PP) const {
281	BaseT::getPeelingPreferences(L, SE, PP);
282	}
283
284	uint64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
285	return `1024`;
286	}
287
288	const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
289	// Codegen control options which don't matter.
290	AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
291	AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureUseFlatForGlobal,
292	AMDGPU::FeatureUnalignedScratchAccess, AMDGPU::FeatureUnalignedAccessMode,
293
294	AMDGPU::FeatureAutoWaitcntBeforeBarrier,
295
296	// Property of the kernel/environment which can't actually differ.
297	AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
298	AMDGPU::FeatureTrapHandler,
299
300	// The default assumption needs to be ecc is enabled, but no directly
301	// exposed operations depend on it, so it can be safely inlined.
302	AMDGPU::FeatureSRAMECC,
303
304	// Perf-tuning features
305	AMDGPU::FeatureFastFMAF32, AMDGPU::FeatureHalfRate64Ops};
306
307	GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine TM, const* Function &F)
308	: BaseT (TM, F.getDataLayout()),
309	ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
310	TLI(ST->getTargetLowering()), CommonTTI (TM, F),
311	IsGraphics(AMDGPU::isGraphics(CC: F.getCallingConv())) {
312	SIModeRegisterDefaults Mode(F, *ST);
313	HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
314	HasFP64FP16Denormals =
315	Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
316	}
317
318	bool GCNTTIImpl::hasBranchDivergence(const Function F) const* {
319	return !F \|\| !ST->isSingleLaneExecution(Kernel: *F);
320	}
321
322	unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
323	// NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
324	// registers. See getRegisterClassForType for the implementation.
325	// In this case vector registers are not vector in terms of
326	// VGPRs, but those which can hold multiple values.
327
328	// This is really the number of registers to fill when vectorizing /
329	// interleaving loops, so we lie to avoid trying to use all registers.
330	return `4`;
331	}
332
333	TypeSize
334	GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
335	switch (K) {
336	case TargetTransformInfo::RGK_Scalar:
337	return TypeSize::getFixed(ExactSize: `32`);
338	case TargetTransformInfo::RGK_FixedWidthVector:
339	return TypeSize::getFixed(ExactSize: ST->hasPackedFP32Ops() ? `64` : `32`);
340	case TargetTransformInfo::RGK_ScalableVector:
341	return TypeSize::getScalable(MinimumSize: `0`);
342	}
343	llvm_unreachable("Unsupported register kind");
344	}
345
346	unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
347	return `32`;
348	}
349
350	unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
351	if (Opcode == Instruction::Load \|\| Opcode == Instruction::Store)
352	return `32` * `4` / ElemWidth;
353	// For a given width return the max 0number of elements that can be combined
354	// into a wider bit value:
355	return (ElemWidth == `8` && ST->has16BitInsts()) ? `4`
356	: (ElemWidth == `16` && ST->has16BitInsts()) ? `2`
357	: (ElemWidth == `32` && ST->hasPackedFP32Ops()) ? `2`
358	: `1`;
359	}
360
361	unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
362	unsigned ChainSizeInBytes,
363	VectorType VecTy) const* {
364	unsigned VecRegBitWidth = VF * LoadSize;
365	if (VecRegBitWidth > `128` && VecTy->getScalarSizeInBits() < `32`)
366	// TODO: Support element-size less than 32bit?
367	return `128` / LoadSize;
368
369	return VF;
370	}
371
372	unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
373	unsigned ChainSizeInBytes,
374	VectorType VecTy) const* {
375	unsigned VecRegBitWidth = VF * StoreSize;
376	if (VecRegBitWidth > `128`)
377	return `128` / StoreSize;
378
379	return VF;
380	}
381
382	unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
383	if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS \|\|
384	AddrSpace == AMDGPUAS::CONSTANT_ADDRESS \|\|
385	AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT \|\|
386	AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER \|\|
387	AddrSpace == AMDGPUAS::BUFFER_RESOURCE \|\|
388	AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
389	return `512`;
390	}
391
392	if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
393	return `8` * ST->getMaxPrivateElementSize();
394
395	// Common to flat, global, local and region. Assume for unknown addrspace.
396	return `128`;
397	}
398
399	bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
400	Align Alignment,
401	unsigned AddrSpace) const {
402	// We allow vectorization of flat stores, even though we may need to decompose
403	// them later if they may access private memory. We don't have enough context
404	// here, and legalization can handle it.
405	if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
406	return (Alignment >= `4` \|\| ST->hasUnalignedScratchAccessEnabled()) &&
407	ChainSizeInBytes <= ST->getMaxPrivateElementSize();
408	}
409	return true;
410	}
411
412	bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
413	Align Alignment,
414	unsigned AddrSpace) const {
415	return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
416	}
417
418	bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
419	Align Alignment,
420	unsigned AddrSpace) const {
421	return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
422	}
423
424	uint64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
425	return `1024`;
426	}
427
428	Type *GCNTTIImpl::getMemcpyLoopLoweringType(
429	LLVMContext &Context, Value Length, unsigned* SrcAddrSpace,
430	unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
431	std::optional<uint32_t> AtomicElementSize) const {
432
433	if (AtomicElementSize)
434	return Type::getIntNTy(C&: Context, N: AtomicElementSize `8`);
435
436	// 16-byte accesses achieve the highest copy throughput.
437	// If the operation has a fixed known length that is large enough, it is
438	// worthwhile to return an even wider type and let legalization lower it into
439	// multiple accesses, effectively unrolling the memcpy loop.
440	// We also rely on legalization to decompose into smaller accesses for
441	// subtargets and address spaces where it is necessary.
442	//
443	// Don't unroll if Length is not a constant, since unrolling leads to worse
444	// performance for length values that are smaller or slightly larger than the
445	// total size of the type returned here. Mitigating that would require a more
446	// complex lowering for variable-length memcpy and memmove.
447	unsigned I32EltsInVector = `4`;
448	if (MemcpyLoopUnroll > `0` && isa<ConstantInt>(Val: Length))
449	return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context),
450	NumElts: MemcpyLoopUnroll * I32EltsInVector);
451
452	return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: I32EltsInVector);
453	}
454
455	void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
456	SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
457	unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
458	Align SrcAlign, Align DestAlign,
459	std::optional<uint32_t> AtomicCpySize) const {
460
461	if (AtomicCpySize)
462	BaseT::getMemcpyLoopResidualLoweringType(
463	OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
464	DestAlign, AtomicCpySize);
465
466	Type *I32x4Ty = FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: `4`);
467	while (RemainingBytes >= `16`) {
468	OpsOut.push_back(Elt: I32x4Ty);
469	RemainingBytes -= `16`;
470	}
471
472	Type *I64Ty = Type::getInt64Ty(C&: Context);
473	while (RemainingBytes >= `8`) {
474	OpsOut.push_back(Elt: I64Ty);
475	RemainingBytes -= `8`;
476	}
477
478	Type *I32Ty = Type::getInt32Ty(C&: Context);
479	while (RemainingBytes >= `4`) {
480	OpsOut.push_back(Elt: I32Ty);
481	RemainingBytes -= `4`;
482	}
483
484	Type *I16Ty = Type::getInt16Ty(C&: Context);
485	while (RemainingBytes >= `2`) {
486	OpsOut.push_back(Elt: I16Ty);
487	RemainingBytes -= `2`;
488	}
489
490	Type *I8Ty = Type::getInt8Ty(C&: Context);
491	while (RemainingBytes) {
492	OpsOut.push_back(Elt: I8Ty);
493	--RemainingBytes;
494	}
495	}
496
497	unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
498	// Disable unrolling if the loop is not vectorized.
499	// TODO: Enable this again.
500	if (VF.isScalar())
501	return `1`;
502
503	return `8`;
504	}
505
506	bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
507	MemIntrinsicInfo &Info) const {
508	switch (Inst->getIntrinsicID()) {
509	case Intrinsic::amdgcn_ds_ordered_add:
510	case Intrinsic::amdgcn_ds_ordered_swap: {
511	auto *Ordering = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: `2`));
512	auto *Volatile = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: `4`));
513	if (!Ordering \|\| !Volatile)
514	return false; // Invalid.
515
516	unsigned OrderingVal = Ordering->getZExtValue();
517	if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
518	return false;
519
520	Info.PtrVal = Inst->getArgOperand(i: `0`);
521	Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
522	Info.ReadMem = true;
523	Info.WriteMem = true;
524	Info.IsVolatile = !Volatile->isZero();
525	return true;
526	}
527	default:
528	return false;
529	}
530	}
531
532	InstructionCost GCNTTIImpl::getArithmeticInstrCost(
533	unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
534	TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
535	ArrayRef<const Value > Args, const* Instruction CxtI) const* {
536
537	// Legalize the type.
538	std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
539	int ISD = TLI->InstructionOpcodeToISD(Opcode);
540
541	// Because we don't have any legal vector operations, but the legal types, we
542	// need to account for split vectors.
543	unsigned NElts = LT.second.isVector() ?
544	LT.second.getVectorNumElements() : `1`;
545
546	MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
547
548	switch (ISD) {
549	case ISD::SHL:
550	case ISD::SRL:
551	case ISD::SRA:
552	if (SLT == MVT::i64)
553	return get64BitInstrCost(CostKind) * LT.first * NElts;
554
555	if (ST->has16BitInsts() && SLT == MVT::i16)
556	NElts = (NElts + `1`) / `2`;
557
558	// i32
559	return getFullRateInstrCost() * LT.first * NElts;
560	case ISD::ADD:
561	case ISD::SUB:
562	case ISD::AND:
563	case ISD::OR:
564	case ISD::XOR:
565	if (SLT == MVT::i64) {
566	// and, or and xor are typically split into 2 VALU instructions.
567	return `2` * getFullRateInstrCost() * LT.first * NElts;
568	}
569
570	if (ST->has16BitInsts() && SLT == MVT::i16)
571	NElts = (NElts + `1`) / `2`;
572
573	return LT.first * NElts * getFullRateInstrCost();
574	case ISD::MUL: {
575	const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
576	if (SLT == MVT::i64) {
577	const int FullRateCost = getFullRateInstrCost();
578	return (`4` * QuarterRateCost + (`2` * `2`) * FullRateCost) * LT.first * NElts;
579	}
580
581	if (ST->has16BitInsts() && SLT == MVT::i16)
582	NElts = (NElts + `1`) / `2`;
583
584	// i32
585	return QuarterRateCost * NElts * LT.first;
586	}
587	case ISD::FMUL:
588	// Check possible fuse {fadd\|fsub}(a,fmul(b,c)) and return zero cost for
589	// fmul(b,c) supposing the fadd\|fsub will get estimated cost for the whole
590	// fused operation.
591	if (CxtI && CxtI->hasOneUse())
592	if (const auto FAdd = dyn_cast<BinaryOperator>(Val: CxtI->user_begin())) {
593	const int OPC = TLI->InstructionOpcodeToISD(Opcode: FAdd->getOpcode());
594	if (OPC == ISD::FADD \|\| OPC == ISD::FSUB) {
595	if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
596	return TargetTransformInfo::TCC_Free;
597	if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
598	return TargetTransformInfo::TCC_Free;
599
600	// Estimate all types may be fused with contract/unsafe flags
601	const TargetOptions &Options = TLI->getTargetMachine().Options;
602	if (Options.AllowFPOpFusion == FPOpFusion::Fast \|\|
603	(FAdd->hasAllowContract() && CxtI->hasAllowContract()))
604	return TargetTransformInfo::TCC_Free;
605	}
606	}
607	[[fallthrough]];
608	case ISD::FADD:
609	case ISD::FSUB:
610	if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
611	NElts = (NElts + `1`) / `2`;
612	if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
613	NElts = (NElts + `1`) / `2`;
614	if (SLT == MVT::f64)
615	return LT.first * NElts * get64BitInstrCost(CostKind);
616
617	if (ST->has16BitInsts() && SLT == MVT::f16)
618	NElts = (NElts + `1`) / `2`;
619
620	if (SLT == MVT::f32 \|\| SLT == MVT::f16 \|\| SLT == MVT::bf16)
621	return LT.first * NElts * getFullRateInstrCost();
622	break;
623	case ISD::FDIV:
624	case ISD::FREM:
625	// FIXME: frem should be handled separately. The fdiv in it is most of it,
626	// but the current lowering is also not entirely correct.
627	if (SLT == MVT::f64) {
628	int Cost = `7` * get64BitInstrCost(CostKind) +
629	getQuarterRateInstrCost(CostKind) +
630	`3` * getHalfRateInstrCost(CostKind);
631	// Add cost of workaround.
632	if (!ST->hasUsableDivScaleConditionOutput())
633	Cost += `3` * getFullRateInstrCost();
634
635	return LT.first * Cost * NElts;
636	}
637
638	if (!Args.empty() && match(V: Args [`0`], P: PatternMatch::m_FPOne())) {
639	// TODO: This is more complicated, unsafe flags etc.
640	if ((SLT == MVT::f32 && !HasFP32Denormals) \|\|
641	(SLT == MVT::f16 && ST->has16BitInsts())) {
642	return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
643	}
644	}
645
646	if (SLT == MVT::f16 && ST->has16BitInsts()) {
647	// 2 x v_cvt_f32_f16
648	// f32 rcp
649	// f32 fmul
650	// v_cvt_f16_f32
651	// f16 div_fixup
652	int Cost =
653	`4` * getFullRateInstrCost() + `2` * getQuarterRateInstrCost(CostKind);
654	return LT.first * Cost * NElts;
655	}
656
657	if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {
658	// Fast unsafe fdiv lowering:
659	// f32 rcp
660	// f32 fmul
661	int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
662	return LT.first * Cost * NElts;
663	}
664
665	if (SLT == MVT::f32 \|\| SLT == MVT::f16) {
666	// 4 more v_cvt_ insts without f16 insts support*
667	int Cost = (SLT == MVT::f16 ? `14` : `10`) * getFullRateInstrCost() +
668	`1` * getQuarterRateInstrCost(CostKind);
669
670	if (!HasFP32Denormals) {
671	// FP mode switches.
672	Cost += `2` * getFullRateInstrCost();
673	}
674
675	return LT.first * NElts * Cost;
676	}
677	break;
678	case ISD::FNEG:
679	// Use the backend' estimation. If fneg is not free each element will cost
680	// one additional instruction.
681	return TLI->isFNegFree(VT: SLT) ? `0` : NElts;
682	default:
683	break;
684	}
685
686	return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info,
687	Args, CxtI);
688	}
689
690	// Return true if there's a potential benefit from using v2f16/v2i16
691	// instructions for an intrinsic, even if it requires nontrivial legalization.
692	static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
693	switch (ID) {
694	case Intrinsic::fma:
695	case Intrinsic::fmuladd:
696	case Intrinsic::copysign:
697	case Intrinsic::minimumnum:
698	case Intrinsic::maximumnum:
699	case Intrinsic::canonicalize:
700	// There's a small benefit to using vector ops in the legalized code.
701	case Intrinsic::round:
702	case Intrinsic::uadd_sat:
703	case Intrinsic::usub_sat:
704	case Intrinsic::sadd_sat:
705	case Intrinsic::ssub_sat:
706	case Intrinsic::abs:
707	return true;
708	default:
709	return false;
710	}
711	}
712
713	InstructionCost
714	GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
715	TTI::TargetCostKind CostKind) const {
716	switch (ICA.getID()) {
717	case Intrinsic::fabs:
718	// Free source modifier in the common case.
719	return `0`;
720	case Intrinsic::amdgcn_workitem_id_x:
721	case Intrinsic::amdgcn_workitem_id_y:
722	case Intrinsic::amdgcn_workitem_id_z:
723	// TODO: If hasPackedTID, or if the calling context is not an entry point
724	// there may be a bit instruction.
725	return `0`;
726	case Intrinsic::amdgcn_workgroup_id_x:
727	case Intrinsic::amdgcn_workgroup_id_y:
728	case Intrinsic::amdgcn_workgroup_id_z:
729	case Intrinsic::amdgcn_lds_kernel_id:
730	case Intrinsic::amdgcn_dispatch_ptr:
731	case Intrinsic::amdgcn_dispatch_id:
732	case Intrinsic::amdgcn_implicitarg_ptr:
733	case Intrinsic::amdgcn_queue_ptr:
734	// Read from an argument register.
735	return `0`;
736	default:
737	break;
738	}
739
740	Type *RetTy = ICA.getReturnType();
741
742	Intrinsic::ID IID = ICA.getID();
743	switch (IID) {
744	case Intrinsic::exp:
745	case Intrinsic::exp2:
746	case Intrinsic::exp10: {
747	// Legalize the type.
748	std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);
749	MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
750	unsigned NElts =
751	LT.second.isVector() ? LT.second.getVectorNumElements() : `1`;
752
753	if (SLT == MVT::f64) {
754	unsigned NumOps = `20`;
755	if (IID == Intrinsic::exp)
756	++NumOps;
757	else if (IID == Intrinsic::exp10)
758	NumOps += `3`;
759
760	return LT.first * NElts * NumOps * get64BitInstrCost(CostKind);
761	}
762
763	if (SLT == MVT::f32) {
764	unsigned NumFullRateOps = `0`;
765	// v_exp_f32 (quarter rate).
766	unsigned NumQuarterRateOps = `1`;
767
768	if (!ICA.getFlags().approxFunc() && IID != Intrinsic::exp2) {
769	// Non-AFN exp/exp10: range reduction + v_exp_f32 + ldexp +
770	// overflow/underflow checks (lowerFEXP). Denorm is also handled.
771	// FMA preamble: ~13 full-rate ops; non-FMA: ~17.
772	NumFullRateOps = ST->hasFastFMAF32() ? `13` : `17`;
773	} else {
774	if (IID == Intrinsic::exp) {
775	// lowerFEXPUnsafe: fmul (base conversion) + v_exp_f32.
776	NumFullRateOps = `1`;
777	} else if (IID == Intrinsic::exp10) {
778	// lowerFEXP10Unsafe: 3 fmul + 2 v_exp_f32 (double-exp2).
779	NumFullRateOps = `3`;
780	NumQuarterRateOps = `2`;
781	}
782	// Denorm scaling adds setcc + select + fadd + select + fmul.
783	if (HasFP32Denormals)
784	NumFullRateOps += `5`;
785	}
786
787	InstructionCost Cost =
788	NumFullRateOps * getFullRateInstrCost() +
789	NumQuarterRateOps * getQuarterRateInstrCost(CostKind);
790	return LT.first * NElts * Cost;
791	}
792
793	break;
794	}
795	default:
796	break;
797	}
798
799	if (!intrinsicHasPackedVectorBenefit(ID: ICA.getID()))
800	return BaseT::getIntrinsicInstrCost(ICA, CostKind);
801
802	std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);
803	MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
804	unsigned NElts = LT.second.isVector() ? LT.second.getVectorNumElements() : `1`;
805
806	if ((ST->hasVOP3PInsts() &&
807	(SLT == MVT::f16 \|\| SLT == MVT::i16 \|\|
808	(SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) \|\|
809	(ST->hasPackedFP32Ops() && SLT == MVT::f32))
810	NElts = (NElts + `1`) / `2`;
811
812	// TODO: Get more refined intrinsic costs?
813	unsigned InstRate = getQuarterRateInstrCost(CostKind);
814
815	switch (ICA.getID()) {
816	case Intrinsic::fma:
817	case Intrinsic::fmuladd:
818	if (SLT == MVT::f64) {
819	InstRate = get64BitInstrCost(CostKind);
820	break;
821	}
822
823	if ((SLT == MVT::f32 && ST->hasFastFMAF32()) \|\| SLT == MVT::f16)
824	InstRate = getFullRateInstrCost();
825	else {
826	InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
827	: getQuarterRateInstrCost(CostKind);
828	}
829	break;
830	case Intrinsic::copysign:
831	return NElts * getFullRateInstrCost();
832	case Intrinsic::minimumnum:
833	case Intrinsic::maximumnum: {
834	// Instruction + 2 canonicalizes. For cases that need type promotion, we the
835	// promotion takes the place of the canonicalize.
836	unsigned NumOps = `3`;
837	if (const IntrinsicInst *II = ICA.getInst()) {
838	// Directly legal with ieee=0
839	// TODO: Not directly legal with strictfp
840	if (fpenvIEEEMode(I: *II) == KnownIEEEMode::Off)
841	NumOps = `1`;
842	}
843
844	unsigned BaseRate =
845	SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
846	InstRate = BaseRate * NumOps;
847	break;
848	}
849	case Intrinsic::canonicalize: {
850	InstRate =
851	SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
852	break;
853	}
854	case Intrinsic::uadd_sat:
855	case Intrinsic::usub_sat:
856	case Intrinsic::sadd_sat:
857	case Intrinsic::ssub_sat: {
858	if (SLT == MVT::i16 \|\| SLT == MVT::i32)
859	InstRate = getFullRateInstrCost();
860
861	static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
862	if (any_of(Range: ValidSatTys, P: equal_to(Arg&: LT.second)))
863	NElts = `1`;
864	break;
865	}
866	case Intrinsic::abs:
867	// Expansion takes 2 instructions for VALU
868	if (SLT == MVT::i16 \|\| SLT == MVT::i32)
869	InstRate = `2` * getFullRateInstrCost();
870	break;
871	default:
872	break;
873	}
874
875	return LT.first * NElts * InstRate;
876	}
877
878	InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
879	TTI::TargetCostKind CostKind,
880	const Instruction I) const* {
881	assert((I == nullptr \|\| I->getOpcode() == Opcode) &&
882	"Opcode should reflect passed instruction.");
883	const bool SCost =
884	(CostKind == TTI::TCK_CodeSize \|\| CostKind == TTI::TCK_SizeAndLatency);
885	const int CBrCost = SCost ? `5` : `7`;
886	switch (Opcode) {
887	case Instruction::UncondBr:
888	// Branch instruction takes about 4 slots on gfx900.
889	return SCost ? `1` : `4`;
890	case Instruction::CondBr:
891	// Suppose conditional branch takes additional 3 exec manipulations
892	// instructions in average.
893	return CBrCost;
894	case Instruction::Switch: {
895	const auto *SI = dyn_cast_or_null<SwitchInst>(Val: I);
896	// Each case (including default) takes 1 cmp + 1 cbr instructions in
897	// average.
898	return (SI ? (SI->getNumCases() + `1`) : `4`) * (CBrCost + `1`);
899	}
900	case Instruction::Ret:
901	return SCost ? `1` : `10`;
902	}
903	return BaseT::getCFInstrCost(Opcode, CostKind, I);
904	}
905
906	InstructionCost
907	GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
908	std::optional<FastMathFlags> FMF,
909	TTI::TargetCostKind CostKind) const {
910	if (TTI::requiresOrderedReduction(FMF))
911	return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
912
913	EVT OrigTy = TLI->getValueType(DL, Ty);
914
915	// Computes cost on targets that have packed math instructions(which support
916	// 16-bit types only).
917	if (!ST->hasVOP3PInsts() \|\| OrigTy.getScalarSizeInBits() != `16`)
918	return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
919
920	std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
921	return LT.first * getFullRateInstrCost();
922	}
923
924	InstructionCost
925	GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
926	FastMathFlags FMF,
927	TTI::TargetCostKind CostKind) const {
928	EVT OrigTy = TLI->getValueType(DL, Ty);
929
930	// Computes cost on targets that have packed math instructions(which support
931	// 16-bit types only).
932	if (!ST->hasVOP3PInsts() \|\| OrigTy.getScalarSizeInBits() != `16`)
933	return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
934
935	std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
936	return LT.first * getHalfRateInstrCost(CostKind);
937	}
938
939	InstructionCost GCNTTIImpl::getVectorInstrCost(
940	unsigned Opcode, Type ValTy, TTI::TargetCostKind CostKind, unsigned* Index,
941	const Value Op0, const* Value Op1, TTI::VectorInstrContext VIC) const* {
942	switch (Opcode) {
943	case Instruction::ExtractElement:
944	case Instruction::InsertElement: {
945	unsigned EltSize
946	= DL.getTypeSizeInBits(Ty: cast<VectorType>(Val: ValTy)->getElementType());
947	if (EltSize < `32`) {
948	if (EltSize == `16` && Index == `0` && ST->has16BitInsts())
949	return `0`;
950	return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1,
951	VIC);
952	}
953
954	// Extracts are just reads of a subregister, so are free. Inserts are
955	// considered free because we don't want to have any cost for scalarizing
956	// operations, and we don't have to copy into a different register class.
957
958	// Dynamic indexing isn't free and is best avoided.
959	return Index == ~`0u` ? `2` : `0`;
960	}
961	default:
962	return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1,
963	VIC);
964	}
965	}
966
967	/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
968	/// this is analyzing the collective result of all output registers. Otherwise,
969	/// this is only querying a specific result index if this returns multiple
970	/// registers in a struct.
971	bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
972	const CallInst CI, ArrayRef<unsigned> Indices) const* {
973	// TODO: Handle complex extract indices
974	if (Indices.size() > `1`)
975	return true;
976
977	const DataLayout &DL = CI->getDataLayout();
978	const SIRegisterInfo *TRI = ST->getRegisterInfo();
979	TargetLowering::AsmOperandInfoVector TargetConstraints =
980	TLI->ParseConstraints(DL, TRI: ST->getRegisterInfo(), Call: *CI);
981
982	const int TargetOutputIdx = Indices.empty() ? -`1` : Indices [`0`];
983
984	int OutputIdx = `0`;
985	for (auto &TC : TargetConstraints) {
986	if (TC.Type != InlineAsm::isOutput)
987	continue;
988
989	// Skip outputs we don't care about.
990	if (TargetOutputIdx != -`1` && TargetOutputIdx != OutputIdx++)
991	continue;
992
993	TLI->ComputeConstraintToUse(OpInfo&: TC, Op: SDValue ());
994
995	const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
996	TRI, Constraint: TC.ConstraintCode, VT: TC.ConstraintVT).second;
997
998	// For AGPR constraints null is returned on subtargets without AGPRs, so
999	// assume divergent for null.
1000	if (!RC \|\| !TRI->isSGPRClass(RC))
1001	return true;
1002	}
1003
1004	return false;
1005	}
1006
1007	bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
1008	const IntrinsicInst ReadReg) const* {
1009	Metadata *MD =
1010	cast<MetadataAsValue>(Val: ReadReg->getArgOperand(i: `0`))->getMetadata();
1011	StringRef RegName =
1012	cast<MDString>(Val: cast<MDNode>(Val: MD)->getOperand(I: `0`))->getString();
1013
1014	// Special case registers that look like VCC.
1015	MVT VT = MVT::getVT(Ty: ReadReg->getType());
1016	if (VT == MVT::i1)
1017	return true;
1018
1019	// Special case scalar registers that start with 'v'.
1020	if (RegName.starts_with(Prefix: "vcc") \|\| RegName.empty())
1021	return false;
1022
1023	// VGPR or AGPR is divergent. There aren't any specially named vector
1024	// registers.
1025	return RegName [`0`] == `'v'` \|\| RegName [`0`] == `'a'`;
1026	}
1027
1028	/// \returns true if the result of the value could potentially be
1029	/// different across workitems in a wavefront.
1030	bool GCNTTIImpl::isSourceOfDivergence(const Value V) const* {
1031	if (const Argument *A = dyn_cast<Argument>(Val: V))
1032	return !AMDGPU::isArgPassedInSGPR(Arg: A);
1033
1034	// Loads from the private and flat address spaces are divergent, because
1035	// threads can execute the load instruction with the same inputs and get
1036	// different results.
1037	//
1038	// All other loads are not divergent, because if threads issue loads with the
1039	// same arguments, they will always get the same result.
1040	if (const LoadInst *Load = dyn_cast<LoadInst>(Val: V))
1041	return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS \|\|
1042	Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
1043
1044	// Atomics are divergent because they are executed sequentially: when an
1045	// atomic operation refers to the same address in each thread, then each
1046	// thread after the first sees the value written by the previous thread as
1047	// original value.
1048	if (isa<AtomicRMWInst, AtomicCmpXchgInst>(Val: V))
1049	return true;
1050
1051	if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V)) {
1052	Intrinsic::ID IID = Intrinsic->getIntrinsicID();
1053	switch (IID) {
1054	case Intrinsic::read_register:
1055	return isReadRegisterSourceOfDivergence(ReadReg: Intrinsic);
1056	case Intrinsic::amdgcn_addrspacecast_nonnull: {
1057	unsigned SrcAS =
1058	Intrinsic->getOperand(i_nocapture: `0`)->getType()->getPointerAddressSpace();
1059	unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
1060	return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
1061	DstAS == AMDGPUAS::FLAT_ADDRESS &&
1062	ST->hasGloballyAddressableScratch();
1063	}
1064	case Intrinsic::amdgcn_workitem_id_y:
1065	case Intrinsic::amdgcn_workitem_id_z: {
1066	const Function *F = Intrinsic->getFunction();
1067	bool HasUniformYZ =
1068	ST->hasWavefrontsEvenlySplittingXDim(F: F, /RequitezUniformYZ=/REquiresUniformYZ: true*);
1069	std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
1070	F: *F, Dim: IID == Intrinsic::amdgcn_workitem_id_y ? `1` : `2`);
1071	return !HasUniformYZ && (!ThisDimSize \|\| *ThisDimSize != `1`);
1072	}
1073	default:
1074	return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: IID);
1075	}
1076	}
1077
1078	// Assume all function calls are a source of divergence.
1079	if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
1080	if (CI->isInlineAsm())
1081	return isInlineAsmSourceOfDivergence(CI);
1082	return true;
1083	}
1084
1085	// Assume all function calls are a source of divergence.
1086	if (isa<InvokeInst>(Val: V))
1087	return true;
1088
1089	// If the target supports globally addressable scratch, the mapping from
1090	// scratch memory to the flat aperture changes therefore an address space cast
1091	// is no longer uniform.
1092	if (auto *CastI = dyn_cast<AddrSpaceCastInst>(Val: V)) {
1093	return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
1094	CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
1095	ST->hasGloballyAddressableScratch();
1096	}
1097
1098	return false;
1099	}
1100
1101	bool GCNTTIImpl::isAlwaysUniform(const Value V) const* {
1102	if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V))
1103	return AMDGPU::isIntrinsicAlwaysUniform(IntrID: Intrinsic->getIntrinsicID());
1104
1105	if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
1106	if (CI->isInlineAsm())
1107	return !isInlineAsmSourceOfDivergence(CI);
1108	return false;
1109	}
1110
1111	// In most cases TID / wavefrontsize is uniform.
1112	//
1113	// However, if a kernel has uneven dimesions we can have a value of
1114	// workitem-id-x divided by the wavefrontsize non-uniform. For example
1115	// dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
1116	// packed into a same wave which gives 1 and 0 after the division by 64
1117	// respectively.
1118	//
1119	// The X dimension doesn't reset within a wave if either both the Y
1120	// and Z dimensions are of length 1, or if the X dimension's required
1121	// size is a power of 2. Note, however, if the X dimension's maximum
1122	// size is a power of 2 < the wavefront size, division by the wavefront
1123	// size is guaranteed to yield 0, so this is also a no-reset case.
1124	bool XDimDoesntResetWithinWaves = false;
1125	if (auto *I = dyn_cast<Instruction>(Val: V)) {
1126	const Function *F = I->getFunction();
1127	XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(F: *F);
1128	}
1129	using namespace llvm::PatternMatch;
1130	uint64_t C;
1131	if (match(V, P: m_LShr(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1132	R: m_ConstantInt(V&: C))) \|\|
1133	match(V, P: m_AShr(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1134	R: m_ConstantInt(V&: C)))) {
1135	return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
1136	}
1137
1138	Value *Mask;
1139	if (match(V, P: m_c_And(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1140	R: m_Value(V&: Mask)))) {
1141	return computeKnownBits(V: Mask, DL).countMinTrailingZeros() >=
1142	ST->getWavefrontSizeLog2() &&
1143	XDimDoesntResetWithinWaves;
1144	}
1145
1146	const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(Val: V);
1147	if (!ExtValue)
1148	return false;
1149
1150	const CallInst *CI = dyn_cast<CallInst>(Val: ExtValue->getOperand(i_nocapture: `0`));
1151	if (!CI)
1152	return false;
1153
1154	if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: CI)) {
1155	switch (Intrinsic->getIntrinsicID()) {
1156	default:
1157	return false;
1158	case Intrinsic::amdgcn_if:
1159	case Intrinsic::amdgcn_else: {
1160	ArrayRef<unsigned> Indices = ExtValue->getIndices();
1161	return Indices.size() == `1` && Indices [`0`] == `1`;
1162	}
1163	}
1164	}
1165
1166	// If we have inline asm returning mixed SGPR and VGPR results, we inferred
1167	// divergent for the overall struct return. We need to override it in the
1168	// case we're extracting an SGPR component here.
1169	if (CI->isInlineAsm())
1170	return !isInlineAsmSourceOfDivergence(CI, Indices: ExtValue->getIndices());
1171
1172	return false;
1173	}
1174
1175	bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1176	Intrinsic::ID IID) const {
1177	switch (IID) {
1178	case Intrinsic::amdgcn_is_shared:
1179	case Intrinsic::amdgcn_is_private:
1180	case Intrinsic::amdgcn_flat_atomic_fmax_num:
1181	case Intrinsic::amdgcn_flat_atomic_fmin_num:
1182	case Intrinsic::amdgcn_load_to_lds:
1183	case Intrinsic::amdgcn_make_buffer_rsrc:
1184	OpIndexes.push_back(Elt: `0`);
1185	return true;
1186	default:
1187	return false;
1188	}
1189	}
1190
1191	Value GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst II,
1192	Value *OldV,
1193	Value NewV) const* {
1194	auto IntrID = II->getIntrinsicID();
1195	switch (IntrID) {
1196	case Intrinsic::amdgcn_is_shared:
1197	case Intrinsic::amdgcn_is_private: {
1198	unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1199	AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
1200	unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1201	LLVMContext &Ctx = NewV->getType()->getContext();
1202	ConstantInt *NewVal = (TrueAS == NewAS) ?
1203	ConstantInt::getTrue(Context&: Ctx) : ConstantInt::getFalse(Context&: Ctx);
1204	return NewVal;
1205	}
1206	case Intrinsic::amdgcn_flat_atomic_fmax_num:
1207	case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1208	Type *DestTy = II->getType();
1209	Type *SrcTy = NewV->getType();
1210	unsigned NewAS = SrcTy->getPointerAddressSpace();
1211	if (!AMDGPU::isExtendedGlobalAddrSpace(AS: NewAS))
1212	return nullptr;
1213	Module *M = II->getModule();
1214	Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1215	M, id: II->getIntrinsicID(), Tys: {DestTy, SrcTy, DestTy});
1216	II->setArgOperand(i: `0`, v: NewV);
1217	II->setCalledFunction(NewDecl);
1218	return II;
1219	}
1220	case Intrinsic::amdgcn_load_to_lds: {
1221	Type *SrcTy = NewV->getType();
1222	Module *M = II->getModule();
1223	Function *NewDecl =
1224	Intrinsic::getOrInsertDeclaration(M, id: II->getIntrinsicID(), Tys: {SrcTy});
1225	II->setArgOperand(i: `0`, v: NewV);
1226	II->setCalledFunction(NewDecl);
1227	return II;
1228	}
1229	case Intrinsic::amdgcn_make_buffer_rsrc: {
1230	Type *SrcTy = NewV->getType();
1231	Type *DstTy = II->getType();
1232	Module *M = II->getModule();
1233	Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1234	M, id: II->getIntrinsicID(), Tys: {DstTy, SrcTy});
1235	II->setArgOperand(i: `0`, v: NewV);
1236	II->setCalledFunction(NewDecl);
1237	return II;
1238	}
1239	default:
1240	return nullptr;
1241	}
1242	}
1243
1244	InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1245	VectorType DstTy, VectorType SrcTy,
1246	ArrayRef<int> Mask,
1247	TTI::TargetCostKind CostKind,
1248	int Index, VectorType *SubTp,
1249	ArrayRef<const Value *> Args,
1250	const Instruction CxtI) const* {
1251	if (!isa<FixedVectorType>(Val: SrcTy))
1252	return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1253	SubTp);
1254
1255	Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp);
1256
1257	unsigned ScalarSize = DL.getTypeSizeInBits(Ty: SrcTy->getElementType());
1258	if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1259	(ScalarSize == `16` \|\| ScalarSize == `8`)) {
1260	// Larger vector widths may require additional instructions, but are
1261	// typically cheaper than scalarized versions.
1262	//
1263	// We assume that shuffling at a register granularity can be done for free.
1264	// This is not true for vectors fed into memory instructions, but it is
1265	// effectively true for all other shuffling. The emphasis of the logic here
1266	// is to assist generic transform in cleaning up / canonicalizing those
1267	// shuffles.
1268
1269	// With op_sel VOP3P instructions freely can access the low half or high
1270	// half of a register, so any swizzle of two elements is free.
1271	if (auto *SrcVecTy = dyn_cast<FixedVectorType>(Val: SrcTy)) {
1272	unsigned NumSrcElts = SrcVecTy->getNumElements();
1273	if (ST->hasVOP3PInsts() && ScalarSize == `16` && NumSrcElts == `2` &&
1274	(Kind == TTI::SK_Broadcast \|\| Kind == TTI::SK_Reverse \|\|
1275	Kind == TTI::SK_PermuteSingleSrc))
1276	return `0`;
1277	}
1278
1279	unsigned EltsPerReg = `32` / ScalarSize;
1280	switch (Kind) {
1281	case TTI::SK_Broadcast:
1282	// A single v_perm_b32 can be re-used for all destination registers.
1283	return `1`;
1284	case TTI::SK_Reverse:
1285	// One instruction per register.
1286	if (auto *DstVecTy = dyn_cast<FixedVectorType>(Val: DstTy))
1287	return divideCeil(Numerator: DstVecTy->getNumElements(), Denominator: EltsPerReg);
1288	return InstructionCost::getInvalid();
1289	case TTI::SK_ExtractSubvector:
1290	if (Index % EltsPerReg == `0`)
1291	return `0`; // Shuffling at register granularity
1292	if (auto *DstVecTy = dyn_cast<FixedVectorType>(Val: DstTy))
1293	return divideCeil(Numerator: DstVecTy->getNumElements(), Denominator: EltsPerReg);
1294	return InstructionCost::getInvalid();
1295	case TTI::SK_InsertSubvector: {
1296	auto *DstVecTy = dyn_cast<FixedVectorType>(Val: DstTy);
1297	if (!DstVecTy)
1298	return InstructionCost::getInvalid();
1299	unsigned NumDstElts = DstVecTy->getNumElements();
1300	unsigned NumInsertElts = cast<FixedVectorType>(Val: SubTp)->getNumElements();
1301	unsigned EndIndex = Index + NumInsertElts;
1302	unsigned BeginSubIdx = Index % EltsPerReg;
1303	unsigned EndSubIdx = EndIndex % EltsPerReg;
1304	unsigned Cost = `0`;
1305
1306	if (BeginSubIdx != `0`) {
1307	// Need to shift the inserted vector into place. The cost is the number
1308	// of destination registers overlapped by the inserted vector.
1309	Cost = divideCeil(Numerator: EndIndex, Denominator: EltsPerReg) - (Index / EltsPerReg);
1310	}
1311
1312	// If the last register overlap is partial, there may be three source
1313	// registers feeding into it; that takes an extra instruction.
1314	if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
1315	Cost += `1`;
1316
1317	return Cost;
1318	}
1319	case TTI::SK_Splice: {
1320	auto *DstVecTy = dyn_cast<FixedVectorType>(Val: DstTy);
1321	if (!DstVecTy)
1322	return InstructionCost::getInvalid();
1323	unsigned NumElts = DstVecTy->getNumElements();
1324	assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements());
1325	// Determine the sub-region of the result vector that requires
1326	// sub-register shuffles / mixing.
1327	unsigned EltsFromLHS = NumElts - Index;
1328	bool LHSIsAligned = (Index % EltsPerReg) == `0`;
1329	bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == `0`;
1330	if (LHSIsAligned && RHSIsAligned)
1331	return `0`;
1332	if (LHSIsAligned && !RHSIsAligned)
1333	return divideCeil(Numerator: NumElts, Denominator: EltsPerReg) - (EltsFromLHS / EltsPerReg);
1334	if (!LHSIsAligned && RHSIsAligned)
1335	return divideCeil(Numerator: EltsFromLHS, Denominator: EltsPerReg);
1336	return divideCeil(Numerator: NumElts, Denominator: EltsPerReg);
1337	}
1338	default:
1339	break;
1340	}
1341
1342	if (!Mask.empty()) {
1343	unsigned NumSrcElts = cast<FixedVectorType>(Val: SrcTy)->getNumElements();
1344
1345	// Generically estimate the cost by assuming that each destination
1346	// register is derived from sources via v_perm_b32 instructions if it
1347	// can't be copied as-is.
1348	//
1349	// For each destination register, derive the cost of obtaining it based
1350	// on the number of source registers that feed into it.
1351	unsigned Cost = `0`;
1352	for (unsigned DstIdx = `0`; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
1353	SmallVector<int, `4`> Regs;
1354	bool Aligned = true;
1355	for (unsigned I = `0`; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
1356	int SrcIdx = Mask [DstIdx + I];
1357	if (SrcIdx == -`1`)
1358	continue;
1359	int Reg;
1360	if (SrcIdx < (int)NumSrcElts) {
1361	Reg = SrcIdx / EltsPerReg;
1362	if (SrcIdx % EltsPerReg != I)
1363	Aligned = false;
1364	} else {
1365	Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
1366	if ((SrcIdx - NumSrcElts) % EltsPerReg != I)
1367	Aligned = false;
1368	}
1369	if (!llvm::is_contained(Range&: Regs, Element: Reg))
1370	Regs.push_back(Elt: Reg);
1371	}
1372	if (Regs.size() >= `2`)
1373	Cost += Regs.size() - `1`;
1374	else if (!Aligned)
1375	Cost += `1`;
1376	}
1377	return Cost;
1378	}
1379	}
1380
1381	return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1382	SubTp);
1383	}
1384
1385	/// Whether it is profitable to sink the operands of an
1386	/// Instruction I to the basic block of I.
1387	/// This helps using several modifiers (like abs and neg) more often.
1388	bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
1389	SmallVectorImpl<Use > &Ops) const* {
1390	using namespace PatternMatch;
1391
1392	for (auto &Op : I->operands()) {
1393	// Ensure we are not already sinking this operand.
1394	if (any_of(Range&: Ops, P: [&](Use U) { return* U->get() == Op.get(); }))
1395	continue;
1396
1397	if (match(V: &Op, P: m_FAbs(Op0: m_Value())) \|\| match(V: &Op, P: m_FNeg(X: m_Value()))) {
1398	Ops.push_back(Elt: &Op);
1399	continue;
1400	}
1401
1402	// Check for zero-cost multiple use InsertElement/ExtractElement
1403	// instructions
1404	if (Instruction *OpInst = dyn_cast<Instruction>(Val: Op.get())) {
1405	if (OpInst->getType()->isVectorTy() && OpInst->getNumOperands() > `1`) {
1406	Instruction *VecOpInst = dyn_cast<Instruction>(Val: OpInst->getOperand(i: `0`));
1407	if (VecOpInst && VecOpInst->hasOneUse())
1408	continue;
1409
1410	if (getVectorInstrCost(Opcode: OpInst->getOpcode(), ValTy: OpInst->getType(),
1411	CostKind: TTI::TCK_RecipThroughput, Index: `0`,
1412	Op0: OpInst->getOperand(i: `0`),
1413	Op1: OpInst->getOperand(i: `1`)) == `0`) {
1414	Ops.push_back(Elt: &Op);
1415	continue;
1416	}
1417	}
1418	}
1419
1420	if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Val: Op.get())) {
1421
1422	unsigned EltSize = DL.getTypeSizeInBits(
1423	Ty: cast<VectorType>(Val: Shuffle->getType())->getElementType());
1424
1425	// For i32 (or greater) shufflevectors, these will be lowered into a
1426	// series of insert / extract elements, which will be coalesced away.
1427	if (EltSize < `16` \|\| !ST->has16BitInsts())
1428	continue;
1429
1430	int NumSubElts, SubIndex;
1431	if (Shuffle->changesLength()) {
1432	if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
1433	Ops.push_back(Elt: &Op);
1434	continue;
1435	}
1436
1437	if ((Shuffle->isExtractSubvectorMask(Index&: SubIndex) \|\|
1438	Shuffle->isInsertSubvectorMask(NumSubElts, Index&: SubIndex)) &&
1439	!(SubIndex & `0x1`)) {
1440	Ops.push_back(Elt: &Op);
1441	continue;
1442	}
1443	}
1444
1445	if (Shuffle->isReverse() \|\| Shuffle->isZeroEltSplat() \|\|
1446	Shuffle->isSingleSource()) {
1447	Ops.push_back(Elt: &Op);
1448	continue;
1449	}
1450	}
1451	}
1452
1453	return !Ops.empty();
1454	}
1455
1456	bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1457	const Function Callee) const* {
1458	const TargetMachine &TM = getTLI()->getTargetMachine();
1459	const GCNSubtarget *CallerST
1460	= static_cast<const GCNSubtarget >(TM.getSubtargetImpl(Caller));
1461	const GCNSubtarget *CalleeST
1462	= static_cast<const GCNSubtarget >(TM.getSubtargetImpl(Callee));
1463
1464	const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1465	const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1466
1467	FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1468	FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1469	if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1470	return false;
1471
1472	// FIXME: dx10_clamp can just take the caller setting, but there seems to be
1473	// no way to support merge for backend defined attributes.
1474	SIModeRegisterDefaults CallerMode(Caller, CallerST);
1475	SIModeRegisterDefaults CalleeMode(Callee, CalleeST);
1476	if (!CallerMode.isInlineCompatible(CalleeMode))
1477	return false;
1478
1479	if (Callee->hasFnAttribute(Kind: Attribute::AlwaysInline) \|\|
1480	Callee->hasFnAttribute(Kind: Attribute::InlineHint))
1481	return true;
1482
1483	// Hack to make compile times reasonable.
1484	if (InlineMaxBB) {
1485	// Single BB does not increase total BB amount.
1486	if (Callee->size() == `1`)
1487	return true;
1488	size_t BBSize = Caller->size() + Callee->size() - `1`;
1489	return BBSize <= InlineMaxBB;
1490	}
1491
1492	return true;
1493	}
1494
1495	static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
1496	const SITargetLowering *TLI,
1497	const GCNTTIImpl *TTIImpl) {
1498	const int NrOfSGPRUntilSpill = `26`;
1499	const int NrOfVGPRUntilSpill = `32`;
1500
1501	const DataLayout &DL = TTIImpl->getDataLayout();
1502
1503	unsigned adjustThreshold = `0`;
1504	int SGPRsInUse = `0`;
1505	int VGPRsInUse = `0`;
1506	for (const Use &A : CB->args()) {
1507	SmallVector<EVT, `4`> ValueVTs;
1508	ComputeValueVTs(TLI: *TLI, DL, Ty: A.get()->getType(), ValueVTs);
1509	for (auto ArgVT : ValueVTs) {
1510	unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1511	Context&: CB->getContext(), CC: CB->getCallingConv(), VT: ArgVT);
1512	if (AMDGPU::isArgPassedInSGPR(CB, ArgNo: CB->getArgOperandNo(U: &A)))
1513	SGPRsInUse += CCRegNum;
1514	else
1515	VGPRsInUse += CCRegNum;
1516	}
1517	}
1518
1519	// The cost of passing function arguments through the stack:
1520	// 1 instruction to put a function argument on the stack in the caller.
1521	// 1 instruction to take a function argument from the stack in callee.
1522	// 1 instruction is explicitly take care of data dependencies in callee
1523	// function.
1524	InstructionCost ArgStackCost(`1`);
1525	ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1526	Opcode: Instruction::Store, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align (`4`),
1527	AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency);
1528	ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1529	Opcode: Instruction::Load, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align (`4`),
1530	AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency);
1531
1532	// The penalty cost is computed relative to the cost of instructions and does
1533	// not model any storage costs.
1534	adjustThreshold += std::max(a: `0`, b: SGPRsInUse - NrOfSGPRUntilSpill) *
1535	ArgStackCost.getValue() * InlineConstants::getInstrCost();
1536	adjustThreshold += std::max(a: `0`, b: VGPRsInUse - NrOfVGPRUntilSpill) *
1537	ArgStackCost.getValue() * InlineConstants::getInstrCost();
1538	return adjustThreshold;
1539	}
1540
1541	static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1542	const DataLayout &DL) {
1543	// If we have a pointer to a private array passed into a function
1544	// it will not be optimized out, leaving scratch usage.
1545	// This function calculates the total size in bytes of the memory that would
1546	// end in scratch if the call was not inlined.
1547	unsigned AllocaSize = `0`;
1548	SmallPtrSet<const AllocaInst *, `8`> AIVisited;
1549	for (Value *PtrArg : CB->args()) {
1550	PointerType *Ty = dyn_cast<PointerType>(Val: PtrArg->getType());
1551	if (!Ty)
1552	continue;
1553
1554	unsigned AddrSpace = Ty->getAddressSpace();
1555	if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1556	AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1557	continue;
1558
1559	const AllocaInst *AI = dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: PtrArg));
1560	if (!AI \|\| !AI->isStaticAlloca() \|\| !AIVisited.insert(Ptr: AI).second)
1561	continue;
1562
1563	if (auto Size = AI->getAllocationSize(DL))
1564	AllocaSize += Size ->getFixedValue();
1565	}
1566	return AllocaSize;
1567	}
1568
1569	int GCNTTIImpl::getInliningLastCallToStaticBonus() const {
1570	return BaseT::getInliningLastCallToStaticBonus() *
1571	getInliningThresholdMultiplier();
1572	}
1573
1574	unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase CB) const* {
1575	unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, TTIImpl: this);
1576
1577	// Private object passed as arguments may end up in scratch usage if the call
1578	// is not inlined. Increase the inline threshold to promote inlining.
1579	unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1580	if (AllocaSize > `0`)
1581	Threshold += ArgAllocaCost;
1582	return Threshold;
1583	}
1584
1585	unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
1586	const AllocaInst AI) const* {
1587
1588	// Below the cutoff, assume that the private memory objects would be
1589	// optimized
1590	auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1591	if (AllocaSize <= ArgAllocaCutoff)
1592	return `0`;
1593
1594	// Above the cutoff, we give a cost to each private memory object
1595	// depending its size. If the array can be optimized by SROA this cost is not
1596	// added to the total-cost in the inliner cost analysis.
1597	//
1598	// We choose the total cost of the alloca such that their sum cancels the
1599	// bonus given in the threshold (ArgAllocaCost).
1600	//
1601	// Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1602	//
1603	// Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1604	// the single-bb bonus and the vector-bonus.
1605	//
1606	// We compensate the first two multipliers, by repeating logic from the
1607	// inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1608	static_assert(InlinerVectorBonusPercent == `0`, "vector bonus assumed to be 0");
1609	unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1610
1611	bool SingleBB = none_of(Range&: CB->getCalledFunction(), P: [](const* BasicBlock &BB) {
1612	return BB.getTerminator()->getNumSuccessors() > `1`;
1613	});
1614	if (SingleBB) {
1615	Threshold += Threshold / `2`;
1616	}
1617
1618	auto ArgAllocaSize = AI->getAllocationSize(DL);
1619	if (!ArgAllocaSize)
1620	return `0`;
1621
1622	// Attribute the bonus proportionally to the alloca size
1623	unsigned AllocaThresholdBonus =
1624	(Threshold * ArgAllocaSize ->getFixedValue()) / AllocaSize;
1625
1626	return AllocaThresholdBonus;
1627	}
1628
1629	void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1630	TTI::UnrollingPreferences &UP,
1631	OptimizationRemarkEmitter ORE) const* {
1632	CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1633	}
1634
1635	void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1636	TTI::PeelingPreferences &PP) const {
1637	CommonTTI.getPeelingPreferences(L, SE, PP);
1638	}
1639
1640	int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1641	return ST->hasFullRate64Ops()
1642	? getFullRateInstrCost()
1643	: ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1644	: getQuarterRateInstrCost(CostKind);
1645	}
1646
1647	std::pair<InstructionCost, MVT>
1648	GCNTTIImpl::getTypeLegalizationCost(Type Ty) const* {
1649	std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1650	auto Size = DL.getTypeSizeInBits(Ty);
1651	// Maximum load or store can handle 8 dwords for scalar and 4 for
1652	// vector ALU. Let's assume anything above 8 dwords is expensive
1653	// even if legal.
1654	if (Size <= `256`)
1655	return Cost;
1656
1657	Cost.first += (Size + `255`) / `256`;
1658	return Cost;
1659	}
1660
1661	unsigned GCNTTIImpl::getPrefetchDistance() const {
1662	return ST->hasPrefetch() ? `128` : `0`;
1663	}
1664
1665	bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
1666	return AMDGPU::isFlatGlobalAddrSpace(AS);
1667	}
1668
1669	void GCNTTIImpl::collectKernelLaunchBounds(
1670	const Function &F,
1671	SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
1672	SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
1673	LB.push_back(Elt: {"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups [`0`]});
1674	LB.push_back(Elt: {"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups [`1`]});
1675	LB.push_back(Elt: {"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups [`2`]});
1676	std::pair<unsigned, unsigned> FlatWorkGroupSize =
1677	ST->getFlatWorkGroupSizes(F);
1678	LB.push_back(Elt: {"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1679	LB.push_back(Elt: {"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1680	std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
1681	LB.push_back(Elt: {"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1682	LB.push_back(Elt: {"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1683	}
1684
1685	GCNTTIImpl::KnownIEEEMode
1686	GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
1687	if (!ST->hasFeature(Feature: AMDGPU::FeatureDX10ClampAndIEEEMode))
1688	return KnownIEEEMode::On; // Only mode on gfx1170+
1689
1690	const Function *F = I.getFunction();
1691	if (!F)
1692	return KnownIEEEMode::Unknown;
1693
1694	Attribute IEEEAttr = F->getFnAttribute(Kind: "amdgpu-ieee");
1695	if (IEEEAttr.isValid())
1696	return IEEEAttr.getValueAsBool() ? KnownIEEEMode::On : KnownIEEEMode::Off;
1697
1698	return AMDGPU::isShader(CC: F->getCallingConv()) ? KnownIEEEMode::Off
1699	: KnownIEEEMode::On;
1700	}
1701
1702	InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1703	Align Alignment,
1704	unsigned AddressSpace,
1705	TTI::TargetCostKind CostKind,
1706	TTI::OperandValueInfo OpInfo,
1707	const Instruction I) const* {
1708	if (VectorType *VecTy = dyn_cast<VectorType>(Val: Src)) {
1709	if ((Opcode == Instruction::Load \|\| Opcode == Instruction::Store) &&
1710	VecTy->getElementType()->isIntegerTy(Bitwidth: `8`)) {
1711	return divideCeil(Numerator: DL.getTypeSizeInBits(Ty: VecTy) - `1`,
1712	Denominator: getLoadStoreVecRegBitWidth(AddrSpace: AddressSpace));
1713	}
1714	}
1715	return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1716	OpInfo, I);
1717	}
1718
1719	unsigned GCNTTIImpl::getNumberOfParts(Type Tp) const* {
1720	if (VectorType *VecTy = dyn_cast<VectorType>(Val: Tp)) {
1721	if (VecTy->getElementType()->isIntegerTy(Bitwidth: `8`)) {
1722	unsigned ElementCount = VecTy->getElementCount().getFixedValue();
1723	return divideCeil(Numerator: ElementCount - `1`, Denominator: `4`);
1724	}
1725	}
1726	return BaseT::getNumberOfParts(Tp);
1727	}
1728
1729	InstructionUniformity
1730	GCNTTIImpl::getInstructionUniformity(const Value V) const* {
1731	if (isAlwaysUniform(V))
1732	return InstructionUniformity::AlwaysUniform;
1733
1734	if (isSourceOfDivergence(V))
1735	return InstructionUniformity::NeverUniform;
1736
1737	return InstructionUniformity::Default;
1738	}
1739

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp