//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> EnablePowerSched(
    "amdgpu-enable-power-sched",
    cl::desc("Enable scheduling to minimize mAI power bursts"),
    cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

static cl::opt<unsigned> NSAThreshold("amdgpu-nsa-threshold",
    cl::desc("Number of addresses from which to enable MIMG NSA."),
    cl::init(3), cl::Hidden);

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
      !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    // If there is no default wave size it must be a generation before gfx10;
    // these have FeatureWavefrontSize64 in their definition already. For
    // gfx10+ set wave32 as the default.
    ToggleFeature(AMDGPU::FeatureWavefrontSize32);
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  AddressableLocalMemorySize = LocalMemorySize;

  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

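// Report a diagnostic if the subtarget's wavefront-size configuration is
// inconsistent: exactly one of the wavefrontsize32 and wavefrontsize64
// features must be set.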
void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
  LLVMContext &Ctx = F.getContext();
  if (hasFeature(AMDGPU::FeatureWavefrontSize32) ==
      hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    Ctx.diagnose(DiagnosticInfoUnsupported(
        F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
  }
}

AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}

bool AMDGPUSubtarget::useRealTrue16Insts() const {
  return hasTrue16BitInsts() && EnableRealTrue16Insts;
}

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);
  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
  InlineAsmLoweringInfo =
      std::make_unique<InlineAsmLowering>(getTargetLowering());
  Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
  InstSelector =
      std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
}

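// Maximum number of SGPR/literal operands a single VALU instruction may read
// from the constant bus. Targets before GFX10 allow only one; GFX10+ allows
// two, except for the 64-bit shifts listed below, which remain limited to one.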
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
unsigned
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                 const Function &F) const {
  const unsigned WaveSize = getWavefrontSize();
  const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned WavesPerWorkgroup =
      std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);

  const unsigned WorkGroupsPerCU =
      std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);

  return getLocalMemorySize() / WorkGroupsPerCU;
}

// FIXME: Should return min,max range.
//
// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
// be achieved when only the given function is running on the machine, taking
// into account the overall number of wave slots, the (maximum) workgroup
// size, and the per-workgroup LDS allocation size.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                       const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves per CU.
  const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Number of waves per EU (SIMD).
  MaxWaves = divideCeil(MaxWaves, getEUsPerCU());

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::pair(1, getWavefrontSize());
  default:
    return std::pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
    std::pair<unsigned, unsigned> Requested,
    std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default minimum
  // number of waves per execution unit to the value implied by the requested
  // maximum flat work group size.
  unsigned MinImpliedByFlatWorkGroupSize =
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by
  // requested minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
  return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
  for (int I = 0; I < 3; ++I) {
    if (getMaxWorkitemID(Func, I) > 0)
      return false;
  }

  return true;
}

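// Attach a bound on the result of a workitem ID or local size query, either
// as a range return attribute on the call or as !range metadata, based on the
// kernel's flat work group size and any reqd_work_group_size metadata.
// Returns true if a range was attached.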
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  APInt Lower{32, MinSize};
  APInt Upper{32, MaxSize};
  if (auto *CI = dyn_cast<CallBase>(I)) {
    ConstantRange Range(Lower, Upper);
    CI->addRangeRetAttr(Range);
  } else {
    MDBuilder MDB(I->getContext());
    MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper);
    I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  }
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default
  const Module *M = F.getParent();
  unsigned NBytes =
      AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
  return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
                                         NBytes);
}

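// Compute the total size in bytes of the explicit kernel arguments, honoring
// byref argument types and per-argument alignment from the data layout. The
// largest argument alignment seen is returned through MaxAlign.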
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
      F.getCallingConv() != CallingConv::SPIR_KERNEL)
    return 0;

  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset();

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
  if (isWave32()) {
    // Fix implicit $vcc operands after MIParser has verified that they match
    // the instruction definitions.
    for (auto &MBB : MF) {
      for (auto &MI : MBB)
        InstrInfo.fixImplicitOperands(MI);
    }
  }
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
                                                   getGeneration());
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
}

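// Number of SGPRs that must be reserved for VCC and, depending on the
// generation and whether flat scratch is in use, FLAT_SCRATCH and XNACK.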
unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
      std::min(getMaxWavesPerEU(), getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

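// Upper bound on the number of user, system and synthetic SGPRs that may be
// preloaded for a kernel, used when only the IR function is available.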
static unsigned getMaxNumPreloadedSGPRs() {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1;  // private segment wave byte offset

  // Max number of synthetic SGPRs
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

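// Refine the latency of data dependencies that involve bundles by locating
// the bundled instruction that actually defines or reads the register, and
// work around zero-latency VCC_LO dependencies created by the implicit
// operands that SIInstrInfo::fixImplicitOperands adds.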
void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
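// DAG mutation that adds artificial edges so that independent SALU
// instructions can be scheduled into the shadow of long-latency MFMA
// instructions instead of filling it with VALU work.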
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From))
        if (DAG->addEdge(SU, SDep(From, SDep::Artificial)))
          ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) &&
            DAG->canAddEdge(SUv, SU))
          DAG->addEdge(SUv, SDep(SU, SDep::Artificial));
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts())
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. It is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for (; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) ||
            !DAG->canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo)
                          : nullptr;
}

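// Minimum number of image addresses for which the MIMG NSA encoding is used.
// The command-line option and the "amdgpu-nsa-threshold" function attribute
// can override the default; the result is clamped to at least 2.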
unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX12)
    return 0; // Not MIMG encoding.

  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(NSAThreshold.getValue(), 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);

  return 3;
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<GCNSubtarget>());
  return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget &>(TM.getSubtarget<GCNSubtarget>(F));
  return static_cast<const AMDGPUSubtarget &>(
      TM.getSubtarget<R600Subtarget>(F));
}

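// Determine which user SGPRs the function requires, based on its calling
// convention, attributes, and the subtarget, and accumulate the total number
// of user SGPRs used.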
GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
  // FIXME: Should have analysis or something rather than attribute to detect
  // calls.
  const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
  // FIXME: This attribute is a hack, we just need an analysis on the function
  // to look for allocas.
  const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");

  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  // TODO: This could be refined a lot. The attribute is a poor way of
  // detecting calls or stack objects that may require it before argument
  // lowering.
  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }

  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);

  if (hasPrivateSegmentSize())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
}

void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}

unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}

SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3);
}