1//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUTargetTransformInfo.h"
18#include "AMDGPUTargetMachine.h"
19#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20#include "SIModeRegisterDefaults.h"
21#include "llvm/Analysis/InlineCost.h"
22#include "llvm/Analysis/LoopInfo.h"
23#include "llvm/Analysis/ValueTracking.h"
24#include "llvm/CodeGen/Analysis.h"
25#include "llvm/IR/IRBuilder.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27#include "llvm/IR/PatternMatch.h"
28#include "llvm/Support/KnownBits.h"
29#include <optional>
30
31using namespace llvm;
32
33#define DEBUG_TYPE "AMDGPUtti"
34
// Boosted unroll threshold used when a loop indexes private (scratch) memory;
// aggressive unrolling helps SROA eliminate the alloca.
static cl::opt<unsigned> UnrollThresholdPrivate(
    "amdgpu-unroll-threshold-private",
    cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
    cl::init(Val: 2700), cl::Hidden);

// Boosted unroll threshold used when a loop addresses local (LDS) or region
// memory.
static cl::opt<unsigned> UnrollThresholdLocal(
    "amdgpu-unroll-threshold-local",
    cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
    cl::init(Val: 1000), cl::Hidden);

// Additive threshold bonus granted per loop-local "if" whose condition depends
// on a PHI of the loop (see getUnrollingPreferences).
static cl::opt<unsigned> UnrollThresholdIf(
    "amdgpu-unroll-threshold-if",
    cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
    cl::init(Val: 200), cl::Hidden);

// Whether LDS-using loops may be runtime-unrolled (trip count unknown at
// compile time).
static cl::opt<bool> UnrollRuntimeLocal(
    "amdgpu-unroll-runtime-local",
    cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
    cl::init(Val: true), cl::Hidden);

static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
    "amdgpu-unroll-max-block-to-analyze",
    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
    cl::init(Val: 32), cl::Hidden);

static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
                                       cl::Hidden, cl::init(Val: 4000),
                                       cl::desc("Cost of alloca argument"));

// If the amount of scratch memory to eliminate exceeds our ability to allocate
// it into registers we gain nothing by aggressively inlining functions for that
// heuristic.
static cl::opt<unsigned>
    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
                    cl::init(Val: 256),
                    cl::desc("Maximum alloca size to use for inline cost"));

// Inliner constraint to achieve reasonable compilation time.
static cl::opt<size_t> InlineMaxBB(
    "amdgpu-inline-max-bb", cl::Hidden, cl::init(Val: 1100),
    cl::desc("Maximum number of BBs allowed in a function after inlining"
             " (compile time constraint)"));

// This default unroll factor is based on microbenchmarks on gfx1030.
static cl::opt<unsigned> MemcpyLoopUnroll(
    "amdgpu-memcpy-loop-unroll",
    cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
             "operations when lowering memcpy as a loop"),
    cl::init(Val: 16), cl::Hidden);
84
85static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
86 unsigned Depth = 0) {
87 const Instruction *I = dyn_cast<Instruction>(Val: Cond);
88 if (!I)
89 return false;
90
91 for (const Value *V : I->operand_values()) {
92 if (!L->contains(Inst: I))
93 continue;
94 if (const PHINode *PHI = dyn_cast<PHINode>(Val: V)) {
95 if (llvm::none_of(Range: L->getSubLoops(), P: [PHI](const Loop* SubLoop) {
96 return SubLoop->contains(Inst: PHI); }))
97 return true;
98 } else if (Depth < 10 && dependsOnLocalPhi(L, Cond: V, Depth: Depth+1))
99 return true;
100 }
101 return false;
102}
103
// Construct the common (non-GCN-specific) AMDGPU TTI for function \p F.
// NOTE(review): the subtarget is cast to GCNSubtarget unconditionally —
// presumably this is only constructed for GCN target machines; confirm
// against callers.
AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getDataLayout()),
      TargetTriple(TM->getTargetTriple()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()) {}
109
// Tune loop unrolling for AMDGPU. The base threshold comes from the
// "amdgpu-unroll-threshold" function attribute (default 300) and is then
// boosted, capped by MaxBoost, for loops whose unrolling is likely to
// eliminate private (scratch) or local (LDS) addressing, or divergent "if"
// regions controlled by loop PHIs.
void AMDGPUTTIImpl::getUnrollingPreferences(
    Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
    OptimizationRemarkEmitter *ORE) const {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold =
      F.getFnAttributeAsParsedInteger(Kind: "amdgpu-unroll-threshold", Default: 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // Conditional branch in a loop back edge needs 3 additional exec
  // manipulations in average.
  UP.BEInsns += 3;

  // We want to run unroll even for the loops which have been vectorized.
  UP.UnrollVectorizedLoop = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size than can fit registers. Reserve 16 registers.
  // (256 VGPRs minus 16 reserved, 4 bytes each.)
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;

  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
  // provided threshold value as the default for Threshold
  if (MDNode *LoopUnrollThreshold =
          findOptionMDForLoop(TheLoop: L, Name: "amdgpu.loop.unroll.threshold")) {
    if (LoopUnrollThreshold->getNumOperands() == 2) {
      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
          MD: LoopUnrollThreshold->getOperand(I: 1));
      if (MetaThresholdValue) {
        // We will also use the supplied value for PartialThreshold for now.
        // We may introduce additional metadata if it becomes necessary in the
        // future.
        UP.Threshold = MetaThresholdValue->getSExtValue();
        UP.PartialThreshold = UP.Threshold;
        ThresholdPrivate = std::min(a: ThresholdPrivate, b: UP.Threshold);
        ThresholdLocal = std::min(a: ThresholdLocal, b: UP.Threshold);
      }
    }
  }

  // Upper bound for all per-branch / per-GEP threshold bonuses below.
  unsigned MaxBoost = std::max(a: ThresholdPrivate, b: ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(Range: L->getSubLoops(), P: [BB](const Loop* SubLoop) {
          return SubLoop->contains(BB); }))
      continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition
      // defined by a PHI belonging to the loop. This may help to eliminate
      // if region and potentially even PHI itself, saving on both divergence
      // and registers used for the PHI.
      // Add a small bonus for each of such "if" statements.
      if (const BranchInst *Br = dyn_cast<BranchInst>(Val: &I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(i: 0);
          BasicBlock *Succ1 = Br->getSuccessor(i: 1);
          // Skip branches that can exit the loop; unrolling does not remove
          // those.
          if ((L->contains(BB: Succ0) && L->isLoopExiting(BB: Succ0)) ||
              (L->contains(BB: Succ1) && L->isLoopExiting(BB: Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Cond: Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: &I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        // Only boost for accesses into a static alloca small enough to be
        // promoted to registers (see MaxAlloca above).
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: Ptr));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unroll for local memory if we have seen addressing not to
        // a variable, most likely we will be unable to combine it.
        // Do not unroll too deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(Val: GEP->getPointerOperand()) &&
             !isa<Argument>(Val: GEP->getPointerOperand())))
          continue;
        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
                          << *L << " due to LDS use.\n");
        UP.Runtime = UnrollRuntimeLocal;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Val: Op);
        if (!Inst || L->isLoopInvariant(V: Op))
          continue;

        if (llvm::any_of(Range: L->getSubLoops(), P: [Inst](const Loop* SubLoop) {
              return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }

    // If we got a GEP in a small BB from inner loop then increase max trip
    // count to analyze for better estimation cost in unroll
    if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
      UP.MaxIterationsCountToAnalyze = 32;
  }
}
271
// No AMDGPU-specific peeling heuristics; defer entirely to the generic
// implementation.
void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) const {
  BaseT::getPeelingPreferences(L, SE, PP);
}
276
// Memory intrinsics (memcpy/memmove/memset) up to this many bytes may be
// expanded inline instead of being lowered to a call.
uint64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
  return 1024;
}
280
// Subtarget features whose mismatch between caller and callee should not
// block inlining — as the category comments below explain, they do not change
// the semantics visible to inlined code.
const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
    AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
    AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
    AMDGPU::FeatureUnalignedAccessMode,

    AMDGPU::FeatureAutoWaitcntBeforeBarrier,

    // Property of the kernel/environment which can't actually differ.
    AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
    AMDGPU::FeatureTrapHandler,

    // The default assumption needs to be ecc is enabled, but no directly
    // exposed operations depend on it, so it can be safely inlined.
    AMDGPU::FeatureSRAMECC,

    // Perf-tuning features
    AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
300
// Construct the GCN-specific TTI for function \p F.
GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getDataLayout()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()), CommonTTI(TM, F),
      IsGraphics(AMDGPU::isGraphics(CC: F.getCallingConv())) {
  // Cache the function's denormal-flushing modes: several cost decisions
  // (free fmul fusion, rcp-based fdiv) are only taken when denormals are
  // flushed (preserve-sign).
  SIModeRegisterDefaults Mode(F, *ST);
  HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
  HasFP64FP16Denormals =
      Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
}
311
// Branches are potentially divergent unless the function is known to execute
// with a single lane. A null \p F conservatively reports divergence.
bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
  return !F || !ST->isSingleLaneExecution(Kernel: *F);
}
315
// \returns the register budget reported to the vectorizer / interleaver for
// register class pseudo-id \p RCID.
unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
  // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
  // registers. See getRegisterClassForType for the implementation.
  // In this case vector registers are not vector in terms of
  // VGPRs, but those which can hold multiple values.

  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return 4;
}
326
// Register width reported per register kind.
TypeSize
GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ExactSize: 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    // Subtargets with packed-FP32 ops can operate on 64 bits (2 x f32) at a
    // time.
    return TypeSize::getFixed(ExactSize: ST->hasPackedFP32Ops() ? 64 : 32);
  case TargetTransformInfo::RGK_ScalableVector:
    // No scalable vectors on AMDGPU.
    return TypeSize::getScalable(MinimumSize: 0);
  }
  llvm_unreachable("Unsupported register kind");
}
339
// Smallest vector register width worth targeting; matches the 32-bit scalar
// register width reported above.
unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}
343
// Maximum vectorization factor for an operation on \p ElemWidth-bit elements.
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  // Loads and stores may be combined up to a 128-bit (4 x 32-bit) access.
  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
    return 32 * 4 / ElemWidth;
  // For a given width return the max number of elements that can be combined
  // into a wider bit value:
  return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
       : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
       : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
       : 1;
}
354
355unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
356 unsigned ChainSizeInBytes,
357 VectorType *VecTy) const {
358 unsigned VecRegBitWidth = VF * LoadSize;
359 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
360 // TODO: Support element-size less than 32bit?
361 return 128 / LoadSize;
362
363 return VF;
364}
365
366unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
367 unsigned ChainSizeInBytes,
368 VectorType *VecTy) const {
369 unsigned VecRegBitWidth = VF * StoreSize;
370 if (VecRegBitWidth > 128)
371 return 128 / StoreSize;
372
373 return VF;
374}
375
376unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
377 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
378 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
379 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
380 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
381 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
382 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
383 return 512;
384 }
385
386 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
387 return 8 * ST->getMaxPrivateElementSize();
388
389 // Common to flat, global, local and region. Assume for unknown addrspace.
390 return 128;
391}
392
393bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
394 Align Alignment,
395 unsigned AddrSpace) const {
396 // We allow vectorization of flat stores, even though we may need to decompose
397 // them later if they may access private memory. We don't have enough context
398 // here, and legalization can handle it.
399 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
400 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
401 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
402 }
403 return true;
404}
405
// Loads follow the same legality rule as generic memory chains; see
// isLegalToVectorizeMemChain.
bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
411
// Stores follow the same legality rule as generic memory chains; see
// isLegalToVectorizeMemChain.
bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
417
// Memory intrinsics up to this many bytes may be expanded inline instead of
// being lowered to a call (mirrors the common AMDGPUTTIImpl value).
uint64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
  return 1024;
}
421
422Type *GCNTTIImpl::getMemcpyLoopLoweringType(
423 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
424 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
425 std::optional<uint32_t> AtomicElementSize) const {
426
427 if (AtomicElementSize)
428 return Type::getIntNTy(C&: Context, N: *AtomicElementSize * 8);
429
430 // 16-byte accesses achieve the highest copy throughput.
431 // If the operation has a fixed known length that is large enough, it is
432 // worthwhile to return an even wider type and let legalization lower it into
433 // multiple accesses, effectively unrolling the memcpy loop.
434 // We also rely on legalization to decompose into smaller accesses for
435 // subtargets and address spaces where it is necessary.
436 //
437 // Don't unroll if Length is not a constant, since unrolling leads to worse
438 // performance for length values that are smaller or slightly larger than the
439 // total size of the type returned here. Mitigating that would require a more
440 // complex lowering for variable-length memcpy and memmove.
441 unsigned I32EltsInVector = 4;
442 if (MemcpyLoopUnroll > 0 && isa<ConstantInt>(Val: Length))
443 return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context),
444 NumElts: MemcpyLoopUnroll * I32EltsInVector);
445
446 return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: I32EltsInVector);
447}
448
449void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
450 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
451 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
452 Align SrcAlign, Align DestAlign,
453 std::optional<uint32_t> AtomicCpySize) const {
454
455 if (AtomicCpySize)
456 BaseT::getMemcpyLoopResidualLoweringType(
457 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
458 DestAlign, AtomicCpySize);
459
460 Type *I32x4Ty = FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: 4);
461 while (RemainingBytes >= 16) {
462 OpsOut.push_back(Elt: I32x4Ty);
463 RemainingBytes -= 16;
464 }
465
466 Type *I64Ty = Type::getInt64Ty(C&: Context);
467 while (RemainingBytes >= 8) {
468 OpsOut.push_back(Elt: I64Ty);
469 RemainingBytes -= 8;
470 }
471
472 Type *I32Ty = Type::getInt32Ty(C&: Context);
473 while (RemainingBytes >= 4) {
474 OpsOut.push_back(Elt: I32Ty);
475 RemainingBytes -= 4;
476 }
477
478 Type *I16Ty = Type::getInt16Ty(C&: Context);
479 while (RemainingBytes >= 2) {
480 OpsOut.push_back(Elt: I16Ty);
481 RemainingBytes -= 2;
482 }
483
484 Type *I8Ty = Type::getInt8Ty(C&: Context);
485 while (RemainingBytes) {
486 OpsOut.push_back(Elt: I8Ty);
487 --RemainingBytes;
488 }
489}
490
491unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
492 // Disable unrolling if the loop is not vectorized.
493 // TODO: Enable this again.
494 if (VF.isScalar())
495 return 1;
496
497 return 8;
498}
499
// Describe target intrinsics that behave like memory operations so generic
// TTI clients can reason about their memory effects. Returns true and fills
// \p Info when \p Inst is recognized.
bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // Operand 2 carries the atomic ordering, operand 4 the volatile flag;
    // both must be compile-time constants.
    auto *Ordering = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: 2));
    auto *Volatile = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: 4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    // Reject ordering values outside the AtomicOrdering enum range.
    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(i: 0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isZero();
    return true;
  }
  default:
    return false;
  }
}
525
// Cost model for scalar/vector arithmetic. Costs are expressed in units of
// the subtarget's instruction issue rates (full/half/quarter rate, plus a
// separate rate for 64-bit ops); unhandled opcodes defer to the generic
// implementation.
InstructionCost GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but the legal types, we
  // need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost(CostKind) * LT.first * NElts;

    // Packed 16-bit ops process two elements per instruction.
    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      // 64-bit multiply expands to 4 quarter-rate multiplies plus 4 full-rate
      // add/carry instructions.
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FMUL:
    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
    // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
    // fused operation.
    if (CxtI && CxtI->hasOneUse())
      if (const auto *FAdd = dyn_cast<BinaryOperator>(Val: *CxtI->user_begin())) {
        const int OPC = TLI->InstructionOpcodeToISD(Opcode: FAdd->getOpcode());
        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
          // Fusion into mad/mac is only value-safe when denormals are flushed.
          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
            return TargetTransformInfo::TCC_Free;
          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
            return TargetTransformInfo::TCC_Free;

          // Estimate all types may be fused with contract/unsafe flags
          const TargetOptions &Options = TLI->getTargetMachine().Options;
          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
              Options.UnsafeFPMath ||
              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
            return TargetTransformInfo::TCC_Free;
        }
      }
    [[fallthrough]];
  case ISD::FADD:
  case ISD::FSUB:
    // Packed f32 ops process two elements per instruction.
    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
      NElts = (NElts + 1) / 2;
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost(CostKind);

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 7 * get64BitInstrCost(CostKind) +
                 getQuarterRateInstrCost(CostKind) +
                 3 * getHalfRateInstrCost(CostKind);
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(V: Args[0], P: PatternMatch::m_FPOne())) {
      // Reciprocal (1.0 / x) lowers to a single rcp instruction.
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost =
          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) ||
                            TLI->getTargetMachine().Options.UnsafeFPMath)) {
      // Fast unsafe fdiv lowering:
      // f32 rcp
      // f32 fmul
      int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      // 4 more v_cvt_* insts without f16 insts support
      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
                 1 * getQuarterRateInstrCost(CostKind);

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  case ISD::FNEG:
    // Use the backend' estimation. If fneg is not free each element will cost
    // one additional instruction.
    return TLI->isFNegFree(VT: SLT) ? 0 : NElts;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info,
                                       Args, CxtI);
}
683
684// Return true if there's a potential benefit from using v2f16/v2i16
685// instructions for an intrinsic, even if it requires nontrivial legalization.
686static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
687 switch (ID) {
688 case Intrinsic::fma:
689 case Intrinsic::fmuladd:
690 case Intrinsic::copysign:
691 case Intrinsic::minimumnum:
692 case Intrinsic::maximumnum:
693 case Intrinsic::canonicalize:
694 // There's a small benefit to using vector ops in the legalized code.
695 case Intrinsic::round:
696 case Intrinsic::uadd_sat:
697 case Intrinsic::usub_sat:
698 case Intrinsic::sadd_sat:
699 case Intrinsic::ssub_sat:
700 case Intrinsic::abs:
701 return true;
702 default:
703 return false;
704 }
705}
706
// Cost model for intrinsic calls. AMDGPU ID/pointer-query intrinsics are free
// (they read pre-loaded argument registers); intrinsics that benefit from
// packed 16-bit (or packed f32) math are costed below; everything else goes
// to the generic implementation.
InstructionCost
GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) const {
  switch (ICA.getID()) {
  case Intrinsic::fabs:
    // Free source modifier in the common case.
    return 0;
  case Intrinsic::amdgcn_workitem_id_x:
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::amdgcn_workitem_id_z:
    // TODO: If hasPackedTID, or if the calling context is not an entry point
    // there may be a bit instruction.
    return 0;
  case Intrinsic::amdgcn_workgroup_id_x:
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::amdgcn_lds_kernel_id:
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_dispatch_id:
  case Intrinsic::amdgcn_implicitarg_ptr:
  case Intrinsic::amdgcn_queue_ptr:
    // Read from an argument register.
    return 0;
  default:
    break;
  }

  if (!intrinsicHasPackedVectorBenefit(ID: ICA.getID()))
    return BaseT::getIntrinsicInstrCost(ICA, CostKind);

  Type *RetTy = ICA.getReturnType();

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);

  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  // Packed ops process two elements per instruction.
  if ((ST->hasVOP3PInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) ||
      (ST->hasPackedFP32Ops() && SLT == MVT::f32))
    NElts = (NElts + 1) / 2;

  // TODO: Get more refined intrinsic costs?
  unsigned InstRate = getQuarterRateInstrCost(CostKind);

  switch (ICA.getID()) {
  case Intrinsic::fma:
  case Intrinsic::fmuladd:
    if (SLT == MVT::f64) {
      InstRate = get64BitInstrCost(CostKind);
      break;
    }

    if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
      InstRate = getFullRateInstrCost();
    else {
      InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
                                     : getQuarterRateInstrCost(CostKind);
    }
    break;
  case Intrinsic::copysign:
    return NElts * getFullRateInstrCost();
  case Intrinsic::minimumnum:
  case Intrinsic::maximumnum: {
    // Instruction + 2 canonicalizes. For cases that need type promotion, the
    // promotion takes the place of the canonicalize.
    unsigned NumOps = 3;
    if (const IntrinsicInst *II = ICA.getInst()) {
      // Directly legal with ieee=0
      // TODO: Not directly legal with strictfp
      if (fpenvIEEEMode(I: *II) == KnownIEEEMode::Off)
        NumOps = 1;
    }

    unsigned BaseRate =
        SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
    InstRate = BaseRate * NumOps;
    break;
  }
  case Intrinsic::canonicalize: {
    InstRate =
        SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
    break;
  }
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat: {
    if (SLT == MVT::i16 || SLT == MVT::i32)
      InstRate = getFullRateInstrCost();

    // Legal packed saturating types are handled as a single operation.
    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
    if (any_of(Range: ValidSatTys, P: [&LT](MVT M) { return M == LT.second; }))
      NElts = 1;
    break;
  }
  case Intrinsic::abs:
    // Expansion takes 2 instructions for VALU
    if (SLT == MVT::i16 || SLT == MVT::i32)
      InstRate = 2 * getFullRateInstrCost();
    break;
  default:
    break;
  }

  return LT.first * NElts * InstRate;
}
816
// Control-flow instruction costs, in approximate gfx900 instruction slots.
// Size-oriented cost kinds (CodeSize/SizeAndLatency) use the smaller SCost
// values.
InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) const {
  assert((I == nullptr || I->getOpcode() == Opcode) &&
         "Opcode should reflect passed instruction.");
  const bool SCost =
      (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
  const int CBrCost = SCost ? 5 : 7;
  switch (Opcode) {
  case Instruction::Br: {
    // Branch instruction takes about 4 slots on gfx900.
    const auto *BI = dyn_cast_or_null<BranchInst>(Val: I);
    if (BI && BI->isUnconditional())
      return SCost ? 1 : 4;
    // Suppose conditional branch takes additional 3 exec manipulations
    // instructions in average.
    return CBrCost;
  }
  case Instruction::Switch: {
    const auto *SI = dyn_cast_or_null<SwitchInst>(Val: I);
    // Each case (including default) takes 1 cmp + 1 cbr instructions in
    // average. Without the instruction, assume 4 cases.
    return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
  }
  case Instruction::Ret:
    return SCost ? 1 : 10;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}
846
847InstructionCost
848GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
849 std::optional<FastMathFlags> FMF,
850 TTI::TargetCostKind CostKind) const {
851 if (TTI::requiresOrderedReduction(FMF))
852 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
853
854 EVT OrigTy = TLI->getValueType(DL, Ty);
855
856 // Computes cost on targets that have packed math instructions(which support
857 // 16-bit types only).
858 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
859 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
860
861 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
862 return LT.first * getFullRateInstrCost();
863}
864
865InstructionCost
866GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
867 FastMathFlags FMF,
868 TTI::TargetCostKind CostKind) const {
869 EVT OrigTy = TLI->getValueType(DL, Ty);
870
871 // Computes cost on targets that have packed math instructions(which support
872 // 16-bit types only).
873 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
874 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
875
876 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
877 return LT.first * getHalfRateInstrCost(CostKind);
878}
879
880InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
881 TTI::TargetCostKind CostKind,
882 unsigned Index, const Value *Op0,
883 const Value *Op1) const {
884 switch (Opcode) {
885 case Instruction::ExtractElement:
886 case Instruction::InsertElement: {
887 unsigned EltSize
888 = DL.getTypeSizeInBits(Ty: cast<VectorType>(Val: ValTy)->getElementType());
889 if (EltSize < 32) {
890 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
891 return 0;
892 return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0,
893 Op1);
894 }
895
896 // Extracts are just reads of a subregister, so are free. Inserts are
897 // considered free because we don't want to have any cost for scalarizing
898 // operations, and we don't have to copy into a different register class.
899
900 // Dynamic indexing isn't free and is best avoided.
901 return Index == ~0u ? 2 : 0;
902 }
903 default:
904 return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1);
905 }
906}
907
908/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
909/// this is analyzing the collective result of all output registers. Otherwise,
910/// this is only querying a specific result index if this returns multiple
911/// registers in a struct.
bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
    const CallInst *CI, ArrayRef<unsigned> Indices) const {
  // TODO: Handle complex extract indices
  if (Indices.size() > 1)
    return true;

  const DataLayout &DL = CI->getDataLayout();
  const SIRegisterInfo *TRI = ST->getRegisterInfo();
  TargetLowering::AsmOperandInfoVector TargetConstraints =
      TLI->ParseConstraints(DL, TRI: ST->getRegisterInfo(), Call: *CI);

  // -1 means "consider every output collectively"; otherwise only the single
  // requested struct-result index is inspected.
  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];

  int OutputIdx = 0;
  for (auto &TC : TargetConstraints) {
    if (TC.Type != InlineAsm::isOutput)
      continue;

    // Skip outputs we don't care about.
    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
      continue;

    // Resolve which constraint will actually be used before querying its
    // register class.
    TLI->ComputeConstraintToUse(OpInfo&: TC, Op: SDValue());

    const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
        TRI, Constraint: TC.ConstraintCode, VT: TC.ConstraintVT).second;

    // For AGPR constraints null is returned on subtargets without AGPRs, so
    // assume divergent for null.
    if (!RC || !TRI->isSGPRClass(RC))
      return true;
  }

  // Every inspected output lives in an SGPR class, so the result is uniform.
  return false;
}
947
948bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
949 const IntrinsicInst *ReadReg) const {
950 Metadata *MD =
951 cast<MetadataAsValue>(Val: ReadReg->getArgOperand(i: 0))->getMetadata();
952 StringRef RegName =
953 cast<MDString>(Val: cast<MDNode>(Val: MD)->getOperand(I: 0))->getString();
954
955 // Special case registers that look like VCC.
956 MVT VT = MVT::getVT(Ty: ReadReg->getType());
957 if (VT == MVT::i1)
958 return true;
959
960 // Special case scalar registers that start with 'v'.
961 if (RegName.starts_with(Prefix: "vcc") || RegName.empty())
962 return false;
963
964 // VGPR or AGPR is divergent. There aren't any specially named vector
965 // registers.
966 return RegName[0] == 'v' || RegName[0] == 'a';
967}
968
969/// \returns true if the result of the value could potentially be
970/// different across workitems in a wavefront.
971bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
972 if (const Argument *A = dyn_cast<Argument>(Val: V))
973 return !AMDGPU::isArgPassedInSGPR(Arg: A);
974
975 // Loads from the private and flat address spaces are divergent, because
976 // threads can execute the load instruction with the same inputs and get
977 // different results.
978 //
979 // All other loads are not divergent, because if threads issue loads with the
980 // same arguments, they will always get the same result.
981 if (const LoadInst *Load = dyn_cast<LoadInst>(Val: V))
982 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
983 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
984
985 // Atomics are divergent because they are executed sequentially: when an
986 // atomic operation refers to the same address in each thread, then each
987 // thread after the first sees the value written by the previous thread as
988 // original value.
989 if (isa<AtomicRMWInst, AtomicCmpXchgInst>(Val: V))
990 return true;
991
992 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V)) {
993 if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
994 return isReadRegisterSourceOfDivergence(ReadReg: Intrinsic);
995
996 return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: Intrinsic->getIntrinsicID());
997 }
998
999 // Assume all function calls are a source of divergence.
1000 if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
1001 if (CI->isInlineAsm())
1002 return isInlineAsmSourceOfDivergence(CI);
1003 return true;
1004 }
1005
1006 // Assume all function calls are a source of divergence.
1007 if (isa<InvokeInst>(Val: V))
1008 return true;
1009
1010 return false;
1011}
1012
bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  // Intrinsics the target declares to always produce a uniform result.
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V))
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID: Intrinsic->getIntrinsicID());

  // Inline asm whose outputs are all SGPRs is uniform; any other call is not
  // known to be.
  if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
    if (CI->isInlineAsm())
      return !isInlineAsmSourceOfDivergence(CI);
    return false;
  }

  // In most cases TID / wavefrontsize is uniform.
  //
  // However, if a kernel has uneven dimensions we can have a value of
  // workitem-id-x divided by the wavefrontsize non-uniform. For example
  // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
  // packed into a same wave which gives 1 and 0 after the division by 64
  // respectively.
  //
  // FIXME: limit it to 1D kernels only, although it should be possible
  // to perform this optimization if the size of the X dimension is a power
  // of 2; we just do not currently have infrastructure to query it.
  using namespace llvm::PatternMatch;
  uint64_t C;
  if (match(V, P: m_LShr(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       R: m_ConstantInt(V&: C))) ||
      match(V, P: m_AShr(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       R: m_ConstantInt(V&: C)))) {
    const Function *F = cast<Instruction>(Val: V)->getFunction();
    // Uniform only when shifting away at least log2(wavesize) bits and the
    // kernel is effectively 1D (max workitem id is 0 in Y and Z).
    return C >= ST->getWavefrontSizeLog2() &&
           ST->getMaxWorkitemID(Kernel: *F, Dimension: 1) == 0 && ST->getMaxWorkitemID(Kernel: *F, Dimension: 2) == 0;
  }

  Value *Mask;
  if (match(V, P: m_c_And(L: m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                        R: m_Value(V&: Mask)))) {
    const Function *F = cast<Instruction>(Val: V)->getFunction();
    const DataLayout &DL = F->getDataLayout();
    // Masking off all the low wavefront-size bits is uniform under the same
    // 1D-kernel condition as the shift case above.
    return computeKnownBits(V: Mask, DL).countMinTrailingZeros() >=
               ST->getWavefrontSizeLog2() &&
           ST->getMaxWorkitemID(Kernel: *F, Dimension: 1) == 0 && ST->getMaxWorkitemID(Kernel: *F, Dimension: 2) == 0;
  }

  // The remaining cases look through extractvalue from a call result.
  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(Val: V);
  if (!ExtValue)
    return false;

  const CallInst *CI = dyn_cast<CallInst>(Val: ExtValue->getOperand(i_nocapture: 0));
  if (!CI)
    return false;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: CI)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else: {
      // Element 1 of the amdgcn.if/else result struct is uniform.
      ArrayRef<unsigned> Indices = ExtValue->getIndices();
      return Indices.size() == 1 && Indices[0] == 1;
    }
    }
  }

  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
  // divergent for the overall struct return. We need to override it in the
  // case we're extracting an SGPR component here.
  if (CI->isInlineAsm())
    return !isInlineAsmSourceOfDivergence(CI, Indices: ExtValue->getIndices());

  return false;
}
1083
1084bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1085 Intrinsic::ID IID) const {
1086 switch (IID) {
1087 case Intrinsic::amdgcn_is_shared:
1088 case Intrinsic::amdgcn_is_private:
1089 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1090 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1091 case Intrinsic::amdgcn_load_to_lds:
1092 case Intrinsic::amdgcn_make_buffer_rsrc:
1093 OpIndexes.push_back(Elt: 0);
1094 return true;
1095 default:
1096 return false;
1097 }
1098}
1099
Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                    Value *OldV,
                                                    Value *NewV) const {
  auto IntrID = II->getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    // Once the pointer's address space is known, the query folds to a
    // constant true/false.
    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    LLVMContext &Ctx = NewV->getType()->getContext();
    ConstantInt *NewVal = (TrueAS == NewAS) ?
      ConstantInt::getTrue(Context&: Ctx) : ConstantInt::getFalse(Context&: Ctx);
    return NewVal;
  }
  case Intrinsic::ptrmask: {
    unsigned OldAS = OldV->getType()->getPointerAddressSpace();
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    Value *MaskOp = II->getArgOperand(i: 1);
    Type *MaskTy = MaskOp->getType();

    // Whether the 64-bit mask must be narrowed to match a 32-bit pointer.
    bool DoTruncate = false;

    const GCNTargetMachine &TM =
        static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
    if (!TM.isNoopAddrSpaceCast(SrcAS: OldAS, DestAS: NewAS)) {
      // All valid 64-bit to 32-bit casts work by chopping off the high
      // bits. Any masking only clearing the low bits will also apply in the new
      // address space.
      if (DL.getPointerSizeInBits(AS: OldAS) != 64 ||
          DL.getPointerSizeInBits(AS: NewAS) != 32)
        return nullptr;

      // TODO: Do we need to thread more context in here?
      KnownBits Known = computeKnownBits(V: MaskOp, DL, AC: nullptr, CxtI: II);
      if (Known.countMinLeadingOnes() < 32)
        return nullptr;

      DoTruncate = true;
    }

    // Rebuild the ptrmask with the new pointer type (and narrowed mask if
    // required).
    IRBuilder<> B(II);
    if (DoTruncate) {
      MaskTy = B.getInt32Ty();
      MaskOp = B.CreateTrunc(V: MaskOp, DestTy: MaskTy);
    }

    return B.CreateIntrinsic(ID: Intrinsic::ptrmask, Types: {NewV->getType(), MaskTy},
                             Args: {NewV, MaskOp});
  }
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num: {
    // Retarget the flat atomic at the new (global-like) address space by
    // re-declaring the intrinsic with the new pointer type in place.
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    unsigned NewAS = SrcTy->getPointerAddressSpace();
    if (!AMDGPU::isExtendedGlobalAddrSpace(AS: NewAS))
      return nullptr;
    Module *M = II->getModule();
    Function *NewDecl = Intrinsic::getOrInsertDeclaration(
        M, id: II->getIntrinsicID(), Tys: {DestTy, SrcTy, DestTy});
    II->setArgOperand(i: 0, v: NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  case Intrinsic::amdgcn_load_to_lds: {
    // Same in-place re-declaration, keyed only on the source pointer type.
    Type *SrcTy = NewV->getType();
    Module *M = II->getModule();
    Function *NewDecl =
        Intrinsic::getOrInsertDeclaration(M, id: II->getIntrinsicID(), Tys: {SrcTy});
    II->setArgOperand(i: 0, v: NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  case Intrinsic::amdgcn_make_buffer_rsrc: {
    // Re-declare with the new base pointer type; the result type is kept.
    Type *SrcTy = NewV->getType();
    Type *DstTy = II->getType();
    Module *M = II->getModule();
    Function *NewDecl = Intrinsic::getOrInsertDeclaration(
        M, id: II->getIntrinsicID(), Tys: {DstTy, SrcTy});
    II->setArgOperand(i: 0, v: NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  default:
    // nullptr signals that this intrinsic cannot be rewritten.
    return nullptr;
  }
}
1187
InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *DstTy, VectorType *SrcTy,
                                           ArrayRef<int> Mask,
                                           TTI::TargetCostKind CostKind,
                                           int Index, VectorType *SubTp,
                                           ArrayRef<const Value *> Args,
                                           const Instruction *CxtI) const {
  // Scalable vectors are not handled here.
  if (!isa<FixedVectorType>(Val: SrcTy))
    return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
                                 SubTp);

  // Refine the shuffle kind from the concrete mask if possible.
  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp);

  // The special-cased costing below only applies to sub-dword (8/16-bit)
  // elements on VI+.
  unsigned ScalarSize = DL.getTypeSizeInBits(Ty: SrcTy->getElementType());
  if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      (ScalarSize == 16 || ScalarSize == 8)) {
    // Larger vector widths may require additional instructions, but are
    // typically cheaper than scalarized versions.
    unsigned NumVectorElts = cast<FixedVectorType>(Val: SrcTy)->getNumElements();
    // Only elements actually selected by the mask (!= -1) contribute cost.
    unsigned RequestedElts =
        count_if(Range&: Mask, P: [](int MaskElt) { return MaskElt != -1; });
    // How many sub-dword elements fit in one 32-bit register.
    unsigned EltsPerReg = 32 / ScalarSize;
    if (RequestedElts == 0)
      return 0;
    switch (Kind) {
    case TTI::SK_Broadcast:
    case TTI::SK_Reverse:
    case TTI::SK_PermuteSingleSrc: {
      // With op_sel VOP3P instructions freely can access the low half or high
      // half of a register, so any swizzle of two elements is free.
      if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
        return 0;
      // One permute instruction per 32-bit register touched, plus the cost
      // of materializing each distinct permute mask.
      unsigned NumPerms = alignTo(Value: RequestedElts, Align: EltsPerReg) / EltsPerReg;
      // SK_Broadcast just reuses the same mask
      unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
      return NumPerms + NumPermMasks;
    }
    case TTI::SK_ExtractSubvector:
    case TTI::SK_InsertSubvector: {
      // Even aligned accesses are free
      if (!(Index % 2))
        return 0;
      // Insert/extract subvectors only require shifts / extract code to get the
      // relevant bits
      return alignTo(Value: RequestedElts, Align: EltsPerReg) / EltsPerReg;
    }
    case TTI::SK_PermuteTwoSrc:
    case TTI::SK_Splice:
    case TTI::SK_Select: {
      unsigned NumPerms = alignTo(Value: RequestedElts, Align: EltsPerReg) / EltsPerReg;
      // SK_Select just reuses the same mask
      unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
      return NumPerms + NumPermMasks;
    }

    default:
      break;
    }
  }

  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
                               SubTp);
}
1251
1252/// Whether it is profitable to sink the operands of an
1253/// Instruction I to the basic block of I.
1254/// This helps using several modifiers (like abs and neg) more often.
1255bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
1256 SmallVectorImpl<Use *> &Ops) const {
1257 using namespace PatternMatch;
1258
1259 for (auto &Op : I->operands()) {
1260 // Ensure we are not already sinking this operand.
1261 if (any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op.get(); }))
1262 continue;
1263
1264 if (match(V: &Op, P: m_FAbs(Op0: m_Value())) || match(V: &Op, P: m_FNeg(X: m_Value())))
1265 Ops.push_back(Elt: &Op);
1266 }
1267
1268 return !Ops.empty();
1269}
1270
bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const GCNSubtarget *CallerST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
  const GCNSubtarget *CalleeST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));

  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();

  // Ignoring the features in InlineFeatureIgnoreList, the callee's features
  // must be a subset of the caller's.
  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
  // no way to support merge for backend defined attributes.
  SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
  SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
  if (!CallerMode.isInlineCompatible(CalleeMode))
    return false;

  // Explicit inlining hints bypass the size heuristics below.
  if (Callee->hasFnAttribute(Kind: Attribute::AlwaysInline) ||
      Callee->hasFnAttribute(Kind: Attribute::InlineHint))
    return true;

  // Hack to make compile times reasonable.
  if (InlineMaxBB) {
    // Single BB does not increase total BB amount.
    if (Callee->size() == 1)
      return true;
    // Allow inlining only while the combined block count stays within the
    // cap (the callee's entry merges into the call site, hence -1).
    size_t BBSize = Caller->size() + Callee->size() - 1;
    return BBSize <= InlineMaxBB;
  }

  return true;
}
1309
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
                                                   const SITargetLowering *TLI,
                                                   const GCNTTIImpl *TTIImpl) {
  // Register budgets for argument passing before spilling to the stack
  // begins (heuristic constants).
  const int NrOfSGPRUntilSpill = 26;
  const int NrOfVGPRUntilSpill = 32;

  const DataLayout &DL = TTIImpl->getDataLayout();

  unsigned adjustThreshold = 0;
  int SGPRsInUse = 0;
  int VGPRsInUse = 0;
  // Count the SGPRs/VGPRs the call's arguments occupy, per legalized value
  // type of each argument.
  for (const Use &A : CB->args()) {
    SmallVector<EVT, 4> ValueVTs;
    ComputeValueVTs(TLI: *TLI, DL, Ty: A.get()->getType(), ValueVTs);
    for (auto ArgVT : ValueVTs) {
      unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
          Context&: CB->getContext(), CC: CB->getCallingConv(), VT: ArgVT);
      if (AMDGPU::isArgPassedInSGPR(CB, ArgNo: CB->getArgOperandNo(U: &A)))
        SGPRsInUse += CCRegNum;
      else
        VGPRsInUse += CCRegNum;
    }
  }

  // The cost of passing function arguments through the stack:
  // 1 instruction to put a function argument on the stack in the caller.
  // 1 instruction to take a function argument from the stack in callee.
  // 1 instruction is explicitly take care of data dependencies in callee
  // function.
  InstructionCost ArgStackCost(1);
  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
      Opcode: Instruction::Store, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align(4),
      AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency);
  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
      Opcode: Instruction::Load, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align(4),
      AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency);

  // The penalty cost is computed relative to the cost of instructions and does
  // not model any storage costs. Only registers beyond the spill budget
  // contribute.
  adjustThreshold += std::max(a: 0, b: SGPRsInUse - NrOfSGPRUntilSpill) *
                     ArgStackCost.getValue() * InlineConstants::getInstrCost();
  adjustThreshold += std::max(a: 0, b: VGPRsInUse - NrOfVGPRUntilSpill) *
                     ArgStackCost.getValue() * InlineConstants::getInstrCost();
  return adjustThreshold;
}
1355
1356static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1357 const DataLayout &DL) {
1358 // If we have a pointer to a private array passed into a function
1359 // it will not be optimized out, leaving scratch usage.
1360 // This function calculates the total size in bytes of the memory that would
1361 // end in scratch if the call was not inlined.
1362 unsigned AllocaSize = 0;
1363 SmallPtrSet<const AllocaInst *, 8> AIVisited;
1364 for (Value *PtrArg : CB->args()) {
1365 PointerType *Ty = dyn_cast<PointerType>(Val: PtrArg->getType());
1366 if (!Ty)
1367 continue;
1368
1369 unsigned AddrSpace = Ty->getAddressSpace();
1370 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1371 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1372 continue;
1373
1374 const AllocaInst *AI = dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: PtrArg));
1375 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(Ptr: AI).second)
1376 continue;
1377
1378 AllocaSize += DL.getTypeAllocSize(Ty: AI->getAllocatedType());
1379 }
1380 return AllocaSize;
1381}
1382
1383int GCNTTIImpl::getInliningLastCallToStaticBonus() const {
1384 return BaseT::getInliningLastCallToStaticBonus() *
1385 getInliningThresholdMultiplier();
1386}
1387
1388unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1389 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, TTIImpl: this);
1390
1391 // Private object passed as arguments may end up in scratch usage if the call
1392 // is not inlined. Increase the inline threshold to promote inlining.
1393 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1394 if (AllocaSize > 0)
1395 Threshold += ArgAllocaCost;
1396 return Threshold;
1397}
1398
unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
                                         const AllocaInst *AI) const {

  // Below the cutoff, assume that the private memory objects would be
  // optimized
  auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
  if (AllocaSize <= ArgAllocaCutoff)
    return 0;

  // Above the cutoff, we give a cost to each private memory object
  // depending its size. If the array can be optimized by SROA this cost is not
  // added to the total-cost in the inliner cost analysis.
  //
  // We choose the total cost of the alloca such that their sum cancels the
  // bonus given in the threshold (ArgAllocaCost).
  //
  // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
  //
  // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
  // the single-bb bonus and the vector-bonus.
  //
  // We compensate the first two multipliers, by repeating logic from the
  // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
  static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
  unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();

  // Mirror the inliner's single-basic-block bonus: a callee with no
  // multi-successor terminators gets a 1.5x threshold.
  bool SingleBB = none_of(Range&: *CB->getCalledFunction(), P: [](const BasicBlock &BB) {
    return BB.getTerminator()->getNumSuccessors() > 1;
  });
  if (SingleBB) {
    Threshold += Threshold / 2;
  }

  auto ArgAllocaSize = DL.getTypeAllocSize(Ty: AI->getAllocatedType());

  // Attribute the bonus proportionally to the alloca size
  // (AllocaSize > ArgAllocaCutoff >= 0 here, so the division is safe).
  unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;

  return AllocaThresholdBonus;
}
1439
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) const {
  // Delegate to the common AMDGPU implementation.
  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
}
1445
void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) const {
  // Delegate to the common AMDGPU implementation.
  CommonTTI.getPeelingPreferences(L, SE, PP);
}
1450
1451int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1452 return ST->hasFullRate64Ops()
1453 ? getFullRateInstrCost()
1454 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1455 : getQuarterRateInstrCost(CostKind);
1456}
1457
1458std::pair<InstructionCost, MVT>
1459GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1460 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1461 auto Size = DL.getTypeSizeInBits(Ty);
1462 // Maximum load or store can handle 8 dwords for scalar and 4 for
1463 // vector ALU. Let's assume anything above 8 dwords is expensive
1464 // even if legal.
1465 if (Size <= 256)
1466 return Cost;
1467
1468 Cost.first += (Size + 255) / 256;
1469 return Cost;
1470}
1471
1472unsigned GCNTTIImpl::getPrefetchDistance() const {
1473 return ST->hasPrefetch() ? 128 : 0;
1474}
1475
1476bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
1477 return AMDGPU::isFlatGlobalAddrSpace(AS);
1478}
1479
1480void GCNTTIImpl::collectKernelLaunchBounds(
1481 const Function &F,
1482 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
1483 SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
1484 LB.push_back(Elt: {"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1485 LB.push_back(Elt: {"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1486 LB.push_back(Elt: {"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1487 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1488 ST->getFlatWorkGroupSizes(F);
1489 LB.push_back(Elt: {"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1490 LB.push_back(Elt: {"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1491 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
1492 LB.push_back(Elt: {"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1493 LB.push_back(Elt: {"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1494}
1495
1496GCNTTIImpl::KnownIEEEMode
1497GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
1498 if (!ST->hasIEEEMode()) // Only mode on gfx12
1499 return KnownIEEEMode::On;
1500
1501 const Function *F = I.getFunction();
1502 if (!F)
1503 return KnownIEEEMode::Unknown;
1504
1505 Attribute IEEEAttr = F->getFnAttribute(Kind: "amdgpu-ieee");
1506 if (IEEEAttr.isValid())
1507 return IEEEAttr.getValueAsBool() ? KnownIEEEMode::On : KnownIEEEMode::Off;
1508
1509 return AMDGPU::isShader(CC: F->getCallingConv()) ? KnownIEEEMode::Off
1510 : KnownIEEEMode::On;
1511}
1512
1513InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1514 Align Alignment,
1515 unsigned AddressSpace,
1516 TTI::TargetCostKind CostKind,
1517 TTI::OperandValueInfo OpInfo,
1518 const Instruction *I) const {
1519 if (VectorType *VecTy = dyn_cast<VectorType>(Val: Src)) {
1520 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1521 VecTy->getElementType()->isIntegerTy(Bitwidth: 8)) {
1522 return divideCeil(Numerator: DL.getTypeSizeInBits(Ty: VecTy) - 1,
1523 Denominator: getLoadStoreVecRegBitWidth(AddrSpace: AddressSpace));
1524 }
1525 }
1526 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1527 OpInfo, I);
1528}
1529
1530unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
1531 if (VectorType *VecTy = dyn_cast<VectorType>(Val: Tp)) {
1532 if (VecTy->getElementType()->isIntegerTy(Bitwidth: 8)) {
1533 unsigned ElementCount = VecTy->getElementCount().getFixedValue();
1534 return divideCeil(Numerator: ElementCount - 1, Denominator: 4);
1535 }
1536 }
1537 return BaseT::getNumberOfParts(Tp);
1538}
1539