GCNSubtarget.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp]

1	//===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// Implements the GCN specific subclass of TargetSubtarget.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "GCNSubtarget.h"
15	#include "AMDGPUCallLowering.h"
16	#include "AMDGPUInstructionSelector.h"
17	#include "AMDGPULegalizerInfo.h"
18	#include "AMDGPURegisterBankInfo.h"
19	#include "AMDGPUSelectionDAGInfo.h"
20	#include "AMDGPUTargetMachine.h"
21	#include "SIMachineFunctionInfo.h"
22	#include "Utils/AMDGPUBaseInfo.h"
23	#include "llvm/ADT/SmallString.h"
24	#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25	#include "llvm/CodeGen/MachineScheduler.h"
26	#include "llvm/CodeGen/TargetFrameLowering.h"
27	#include "llvm/IR/DiagnosticInfo.h"
28	#include "llvm/IR/MDBuilder.h"
29	#include <algorithm>
30
31	using namespace llvm;
32
33	#define DEBUG_TYPE "gcn-subtarget"
34
35	#define GET_SUBTARGETINFO_TARGET_DESC
36	#define GET_SUBTARGETINFO_CTOR
37	#define AMDGPUSubtarget GCNSubtarget
38	#include "AMDGPUGenSubtargetInfo.inc"
39	#undef AMDGPUSubtarget
40
41	static cl::opt<bool> EnableVGPRIndexMode(
42	"amdgpu-vgpr-index-mode",
43	cl::desc ("Use GPR indexing mode instead of movrel for vector indexing"),
44	cl::init(Val: false));
45
46	static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
47	cl::desc ("Enable the use of AA during codegen."),
48	cl::init(Val: true));
49
50	static cl::opt<unsigned>
51	NSAThreshold("amdgpu-nsa-threshold",
52	cl::desc ("Number of addresses from which to enable MIMG NSA."),
53	cl::init(Val: `2`), cl::Hidden);
54
55	GCNSubtarget::~GCNSubtarget() = default;
56
57	GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
58	StringRef GPU,
59	StringRef FS) {
60	// Determine default and user-specified characteristics
61	//
62	// We want to be able to turn these off, but making this a subtarget feature
63	// for SI has the unhelpful behavior that it unsets everything else if you
64	// disable it.
65	//
66	// Similarly we want enable-prt-strict-null to be on by default and not to
67	// unset everything else if it is disabled
68
69	SmallString<`256`> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");
70
71	// Turn on features that HSA ABI requires. Also turn on FlatForGlobal by
72	// default
73	if (isAmdHsaOS())
74	FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
75
76	FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
77
78	// Disable mutually exclusive bits.
79	if (FS.contains_insensitive(Other: "+wavefrontsize")) {
80	if (!FS.contains_insensitive(Other: "wavefrontsize16"))
81	FullFS += "-wavefrontsize16,";
82	if (!FS.contains_insensitive(Other: "wavefrontsize32"))
83	FullFS += "-wavefrontsize32,";
84	if (!FS.contains_insensitive(Other: "wavefrontsize64"))
85	FullFS += "-wavefrontsize64,";
86	}
87
88	FullFS += FS;
89
90	ParseSubtargetFeatures(CPU: GPU, /TuneCPU/ GPU, FS: FullFS);
91
92	// Implement the "generic" processors, which acts as the default when no
93	// generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
94	// the first amdgcn target that supports flat addressing. Other OSes defaults
95	// to the first amdgcn target.
96	if (Gen == AMDGPUSubtarget::INVALID) {
97	Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
98	: AMDGPUSubtarget::SOUTHERN_ISLANDS;
99	// Assume wave64 for the unknown target, if not explicitly set.
100	if (getWavefrontSizeLog2() == `0`)
101	WavefrontSizeLog2 = `6`;
102	} else if (!hasFeature(Feature: AMDGPU::FeatureWavefrontSize32) &&
103	!hasFeature(Feature: AMDGPU::FeatureWavefrontSize64)) {
104	// If there is no default wave size it must be a generation before gfx10,
105	// these have FeatureWavefrontSize64 in their definition already. For gfx10+
106	// set wave32 as a default.
107	ToggleFeature(FB: AMDGPU::FeatureWavefrontSize32);
108	WavefrontSizeLog2 = getGeneration() >= AMDGPUSubtarget::GFX10 ? `5` : `6`;
109	}
110
111	// We don't support FP64 for EG/NI atm.
112	assert(!hasFP64() \|\| (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
113
114	// Targets must either support 64-bit offsets for MUBUF instructions, and/or
115	// support flat operations, otherwise they cannot access a 64-bit global
116	// address space
117	assert(hasAddr64() \|\| hasFlat());
118	// Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
119	// that do not support ADDR64 variants of MUBUF instructions. Such targets
120	// cannot use a 64 bit offset with a MUBUF instruction to access the global
121	// address space
122	if (!hasAddr64() && !FS.contains(Other: "flat-for-global") && !FlatForGlobal) {
123	ToggleFeature(FB: AMDGPU::FeatureFlatForGlobal);
124	FlatForGlobal = true;
125	}
126	// Unless +-flat-for-global is specified, use MUBUF instructions for global
127	// address space access if flat operations are not available.
128	if (!hasFlat() && !FS.contains(Other: "flat-for-global") && FlatForGlobal) {
129	ToggleFeature(FB: AMDGPU::FeatureFlatForGlobal);
130	FlatForGlobal = false;
131	}
132
133	// Set defaults if needed.
134	if (MaxPrivateElementSize == `0`)
135	MaxPrivateElementSize = `4`;
136
137	if (LDSBankCount == `0`)
138	LDSBankCount = `32`;
139
140	if (TT.isAMDGCN() && AddressableLocalMemorySize == `0`)
141	AddressableLocalMemorySize = `32768`;
142
143	LocalMemorySize = AddressableLocalMemorySize;
144	if (AMDGPU::isGFX10Plus(STI: *this) &&
145	!getFeatureBits().test(I: AMDGPU::FeatureCuMode))
146	LocalMemorySize *= `2`;
147
148	HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
149	HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;
150
151	TargetID.setTargetIDFromFeaturesString(FS);
152
153	LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
154	<< TargetID.getXnackSetting() << `'\n'`);
155	LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
156	<< TargetID.getSramEccSetting() << `'\n'`);
157
158	return *this;
159	}
160
161	void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
162	LLVMContext &Ctx = F.getContext();
163	if (hasFeature(Feature: AMDGPU::FeatureWavefrontSize32) &&
164	hasFeature(Feature: AMDGPU::FeatureWavefrontSize64)) {
165	Ctx.diagnose(DI: DiagnosticInfoUnsupported (
166	F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
167	}
168	}
169
170	GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
171	const GCNTargetMachine &TM)
172	: // clang-format off
173	AMDGPUGenSubtargetInfo (TT, GPU, /TuneCPU/ GPU, FS),
174	AMDGPUSubtarget (TT),
175	TargetTriple (TT),
176	TargetID (*this),
177	InstrItins(getInstrItineraryForCPU(CPU: GPU)),
178	InstrInfo (initializeSubtargetDependencies(TT, GPU, FS)),
179	TLInfo (TM, *this),
180	FrameLowering (TargetFrameLowering::StackGrowsUp, getStackAlignment(), `0`) {
181	// clang-format on
182	MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(STI: this);
183	EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(STI: this);
184
185	TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();
186
187	CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(args: *getTargetLowering());
188	InlineAsmLoweringInfo =
189	std::make_unique<InlineAsmLowering>(args: getTargetLowering());
190	Legalizer = std::make_unique<AMDGPULegalizerInfo>(args&: *this, args: TM);
191	RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(args&: *this);
192	InstSelector =
193	std::make_unique<AMDGPUInstructionSelector>(args&: *this, args&: *RegBankInfo, args: TM);
194	}
195
196	const SelectionDAGTargetInfo GCNSubtarget::getSelectionDAGInfo() const* {
197	return TSInfo.get();
198	}
199
200	unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
201	if (getGeneration() < GFX10)
202	return `1`;
203
204	switch (Opcode) {
205	case AMDGPU::V_LSHLREV_B64_e64:
206	case AMDGPU::V_LSHLREV_B64_gfx10:
207	case AMDGPU::V_LSHLREV_B64_e64_gfx11:
208	case AMDGPU::V_LSHLREV_B64_e32_gfx12:
209	case AMDGPU::V_LSHLREV_B64_e64_gfx12:
210	case AMDGPU::V_LSHL_B64_e64:
211	case AMDGPU::V_LSHRREV_B64_e64:
212	case AMDGPU::V_LSHRREV_B64_gfx10:
213	case AMDGPU::V_LSHRREV_B64_e64_gfx11:
214	case AMDGPU::V_LSHRREV_B64_e64_gfx12:
215	case AMDGPU::V_LSHR_B64_e64:
216	case AMDGPU::V_ASHRREV_I64_e64:
217	case AMDGPU::V_ASHRREV_I64_gfx10:
218	case AMDGPU::V_ASHRREV_I64_e64_gfx11:
219	case AMDGPU::V_ASHRREV_I64_e64_gfx12:
220	case AMDGPU::V_ASHR_I64_e64:
221	return `1`;
222	}
223
224	return `2`;
225	}
226
227	/// This list was mostly derived from experimentation.
228	bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
229	switch (Opcode) {
230	case AMDGPU::V_CVT_F16_F32_e32:
231	case AMDGPU::V_CVT_F16_F32_e64:
232	case AMDGPU::V_CVT_F16_U16_e32:
233	case AMDGPU::V_CVT_F16_U16_e64:
234	case AMDGPU::V_CVT_F16_I16_e32:
235	case AMDGPU::V_CVT_F16_I16_e64:
236	case AMDGPU::V_RCP_F16_e64:
237	case AMDGPU::V_RCP_F16_e32:
238	case AMDGPU::V_RSQ_F16_e64:
239	case AMDGPU::V_RSQ_F16_e32:
240	case AMDGPU::V_SQRT_F16_e64:
241	case AMDGPU::V_SQRT_F16_e32:
242	case AMDGPU::V_LOG_F16_e64:
243	case AMDGPU::V_LOG_F16_e32:
244	case AMDGPU::V_EXP_F16_e64:
245	case AMDGPU::V_EXP_F16_e32:
246	case AMDGPU::V_SIN_F16_e64:
247	case AMDGPU::V_SIN_F16_e32:
248	case AMDGPU::V_COS_F16_e64:
249	case AMDGPU::V_COS_F16_e32:
250	case AMDGPU::V_FLOOR_F16_e64:
251	case AMDGPU::V_FLOOR_F16_e32:
252	case AMDGPU::V_CEIL_F16_e64:
253	case AMDGPU::V_CEIL_F16_e32:
254	case AMDGPU::V_TRUNC_F16_e64:
255	case AMDGPU::V_TRUNC_F16_e32:
256	case AMDGPU::V_RNDNE_F16_e64:
257	case AMDGPU::V_RNDNE_F16_e32:
258	case AMDGPU::V_FRACT_F16_e64:
259	case AMDGPU::V_FRACT_F16_e32:
260	case AMDGPU::V_FREXP_MANT_F16_e64:
261	case AMDGPU::V_FREXP_MANT_F16_e32:
262	case AMDGPU::V_FREXP_EXP_I16_F16_e64:
263	case AMDGPU::V_FREXP_EXP_I16_F16_e32:
264	case AMDGPU::V_LDEXP_F16_e64:
265	case AMDGPU::V_LDEXP_F16_e32:
266	case AMDGPU::V_LSHLREV_B16_e64:
267	case AMDGPU::V_LSHLREV_B16_e32:
268	case AMDGPU::V_LSHRREV_B16_e64:
269	case AMDGPU::V_LSHRREV_B16_e32:
270	case AMDGPU::V_ASHRREV_I16_e64:
271	case AMDGPU::V_ASHRREV_I16_e32:
272	case AMDGPU::V_ADD_U16_e64:
273	case AMDGPU::V_ADD_U16_e32:
274	case AMDGPU::V_SUB_U16_e64:
275	case AMDGPU::V_SUB_U16_e32:
276	case AMDGPU::V_SUBREV_U16_e64:
277	case AMDGPU::V_SUBREV_U16_e32:
278	case AMDGPU::V_MUL_LO_U16_e64:
279	case AMDGPU::V_MUL_LO_U16_e32:
280	case AMDGPU::V_ADD_F16_e64:
281	case AMDGPU::V_ADD_F16_e32:
282	case AMDGPU::V_SUB_F16_e64:
283	case AMDGPU::V_SUB_F16_e32:
284	case AMDGPU::V_SUBREV_F16_e64:
285	case AMDGPU::V_SUBREV_F16_e32:
286	case AMDGPU::V_MUL_F16_e64:
287	case AMDGPU::V_MUL_F16_e32:
288	case AMDGPU::V_MAX_F16_e64:
289	case AMDGPU::V_MAX_F16_e32:
290	case AMDGPU::V_MIN_F16_e64:
291	case AMDGPU::V_MIN_F16_e32:
292	case AMDGPU::V_MAX_U16_e64:
293	case AMDGPU::V_MAX_U16_e32:
294	case AMDGPU::V_MIN_U16_e64:
295	case AMDGPU::V_MIN_U16_e32:
296	case AMDGPU::V_MAX_I16_e64:
297	case AMDGPU::V_MAX_I16_e32:
298	case AMDGPU::V_MIN_I16_e64:
299	case AMDGPU::V_MIN_I16_e32:
300	case AMDGPU::V_MAD_F16_e64:
301	case AMDGPU::V_MAD_U16_e64:
302	case AMDGPU::V_MAD_I16_e64:
303	case AMDGPU::V_FMA_F16_e64:
304	case AMDGPU::V_DIV_FIXUP_F16_e64:
305	// On gfx10, all 16-bit instructions preserve the high bits.
306	return getGeneration() <= AMDGPUSubtarget::GFX9;
307	case AMDGPU::V_MADAK_F16:
308	case AMDGPU::V_MADMK_F16:
309	case AMDGPU::V_MAC_F16_e64:
310	case AMDGPU::V_MAC_F16_e32:
311	case AMDGPU::V_FMAMK_F16:
312	case AMDGPU::V_FMAAK_F16:
313	case AMDGPU::V_FMAC_F16_e64:
314	case AMDGPU::V_FMAC_F16_e32:
315	// In gfx9, the preferred handling of the unused high 16-bits changed. Most
316	// instructions maintain the legacy behavior of 0ing. Some instructions
317	// changed to preserving the high bits.
318	return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
319	case AMDGPU::V_MAD_MIXLO_F16:
320	case AMDGPU::V_MAD_MIXHI_F16:
321	default:
322	return false;
323	}
324	}
325
326	void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
327	unsigned NumRegionInstrs) const {
328	// Track register pressure so the scheduler can try to decrease
329	// pressure once register usage is above the threshold defined by
330	// SIRegisterInfo::getRegPressureSetLimit()
331	Policy.ShouldTrackPressure = true;
332
333	// Enabling both top down and bottom up scheduling seems to give us less
334	// register spills than just using one of these approaches on its own.
335	Policy.OnlyTopDown = false;
336	Policy.OnlyBottomUp = false;
337
338	// Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
339	if (!enableSIScheduler())
340	Policy.ShouldTrackLaneMasks = true;
341	}
342
343	void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
344	if (isWave32()) {
345	// Fix implicit $vcc operands after MIParser has verified that they match
346	// the instruction definitions.
347	for (auto &MBB : MF) {
348	for (auto &MI : MBB)
349	InstrInfo.fixImplicitOperands(MI);
350	}
351	}
352	}
353
354	bool GCNSubtarget::hasMadF16() const {
355	return InstrInfo.pseudoToMCOpcode(Opcode: AMDGPU::V_MAD_F16_e64) != -`1`;
356	}
357
358	bool GCNSubtarget::useVGPRIndexMode() const {
359	return hasVGPRIndexMode() && (!hasMovrel() \|\| EnableVGPRIndexMode);
360	}
361
362	bool GCNSubtarget::useAA() const { return UseAA; }
363
364	unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
365	return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, MaxWaves: getMaxWavesPerEU(),
366	Gen: getGeneration());
367	}
368
369	unsigned
370	GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs,
371	unsigned DynamicVGPRBlockSize) const {
372	return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(STI: this, NumVGPRs,
373	DynamicVGPRBlockSize);
374	}
375
376	unsigned
377	GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
378	if (getGeneration() >= AMDGPUSubtarget::GFX10)
379	return `2`; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
380
381	if (HasFlatScratch \|\| HasArchitectedFlatScratch) {
382	if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
383	return `6`; // FLAT_SCRATCH, XNACK, VCC (in that order).
384	if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
385	return `4`; // FLAT_SCRATCH, VCC (in that order).
386	}
387
388	if (isXNACKEnabled())
389	return `4`; // XNACK, VCC (in that order).
390	return `2`; // VCC.
391	}
392
393	unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
394	const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
395	return getBaseReservedNumSGPRs(HasFlatScratch: MFI.getUserSGPRInfo().hasFlatScratchInit());
396	}
397
398	unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
399	// In principle we do not need to reserve SGPR pair used for flat_scratch if
400	// we know flat instructions do not access the stack anywhere in the
401	// program. For now assume it's needed if we have flat instructions.
402	const bool KernelUsesFlatScratch = hasFlatAddressSpace();
403	return getBaseReservedNumSGPRs(HasFlatScratch: KernelUsesFlatScratch);
404	}
405
406	std::pair<unsigned, unsigned>
407	GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
408	unsigned NumSGPRs, unsigned NumVGPRs) const {
409	unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
410	// Temporarily check both the attribute and the subtarget feature until the
411	// latter is removed.
412	if (DynamicVGPRBlockSize == `0` && isDynamicVGPREnabled())
413	DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
414
415	auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSBytes: LDSSize, F);
416	unsigned SGPROcc = getOccupancyWithNumSGPRs(SGPRs: NumSGPRs);
417	unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);
418
419	// Maximum occupancy may be further limited by high SGPR/VGPR usage.
420	MaxOcc = std::min(a: MaxOcc, b: std::min(a: SGPROcc, b: VGPROcc));
421	return {std::min(a: MinOcc, b: MaxOcc), MaxOcc};
422	}
423
424	unsigned GCNSubtarget::getBaseMaxNumSGPRs(
425	const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
426	unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
427	// Compute maximum number of SGPRs function can use using default/requested
428	// minimum number of waves per execution unit.
429	unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU: WavesPerEU.first, Addressable: false);
430	unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU: WavesPerEU.first, Addressable: true);
431
432	// Check if maximum number of SGPRs was explicitly requested using
433	// "amdgpu-num-sgpr" attribute.
434	unsigned Requested =
435	F.getFnAttributeAsParsedInteger(Kind: "amdgpu-num-sgpr", Default: MaxNumSGPRs);
436
437	if (Requested != MaxNumSGPRs) {
438	// Make sure requested value does not violate subtarget's specifications.
439	if (Requested && (Requested <= ReservedNumSGPRs))
440	Requested = `0`;
441
442	// If more SGPRs are required to support the input user/system SGPRs,
443	// increase to accommodate them.
444	//
445	// FIXME: This really ends up using the requested number of SGPRs + number
446	// of reserved special registers in total. Theoretically you could re-use
447	// the last input registers for these special registers, but this would
448	// require a lot of complexity to deal with the weird aliasing.
449	unsigned InputNumSGPRs = PreloadedSGPRs;
450	if (Requested && Requested < InputNumSGPRs)
451	Requested = InputNumSGPRs;
452
453	// Make sure requested value is compatible with values implied by
454	// default/requested minimum/maximum number of waves per execution unit.
455	if (Requested && Requested > getMaxNumSGPRs(WavesPerEU: WavesPerEU.first, Addressable: false))
456	Requested = `0`;
457	if (WavesPerEU.second && Requested &&
458	Requested < getMinNumSGPRs(WavesPerEU: WavesPerEU.second))
459	Requested = `0`;
460
461	if (Requested)
462	MaxNumSGPRs = Requested;
463	}
464
465	if (hasSGPRInitBug())
466	MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
467
468	return std::min(a: MaxNumSGPRs - ReservedNumSGPRs, b: MaxAddressableNumSGPRs);
469	}
470
471	unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
472	const Function &F = MF.getFunction();
473	const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
474	return getBaseMaxNumSGPRs(F, WavesPerEU: MFI.getWavesPerEU(), PreloadedSGPRs: MFI.getNumPreloadedSGPRs(),
475	ReservedNumSGPRs: getReservedNumSGPRs(MF));
476	}
477
478	unsigned GCNSubtarget::getMaxNumPreloadedSGPRs() const {
479	using USI = GCNUserSGPRUsageInfo;
480	// Max number of user SGPRs
481	const unsigned MaxUserSGPRs =
482	USI::getNumUserSGPRForField(ID: USI::PrivateSegmentBufferID) +
483	USI::getNumUserSGPRForField(ID: USI::DispatchPtrID) +
484	USI::getNumUserSGPRForField(ID: USI::QueuePtrID) +
485	USI::getNumUserSGPRForField(ID: USI::KernargSegmentPtrID) +
486	USI::getNumUserSGPRForField(ID: USI::DispatchIdID) +
487	USI::getNumUserSGPRForField(ID: USI::FlatScratchInitID) +
488	USI::getNumUserSGPRForField(ID: USI::ImplicitBufferPtrID);
489
490	// Max number of system SGPRs
491	const unsigned MaxSystemSGPRs = `1` + // WorkGroupIDX
492	`1` + // WorkGroupIDY
493	`1` + // WorkGroupIDZ
494	`1` + // WorkGroupInfo
495	`1`; // private segment wave byte offset
496
497	// Max number of synthetic SGPRs
498	const unsigned SyntheticSGPRs = `1`; // LDSKernelId
499
500	return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
501	}
502
503	unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
504	return getBaseMaxNumSGPRs(F, WavesPerEU: getWavesPerEU(F), PreloadedSGPRs: getMaxNumPreloadedSGPRs(),
505	ReservedNumSGPRs: getReservedNumSGPRs(F));
506	}
507
508	unsigned GCNSubtarget::getBaseMaxNumVGPRs(
509	const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
510	const auto &[Min, Max] = NumVGPRBounds;
511
512	// Check if maximum number of VGPRs was explicitly requested using
513	// "amdgpu-num-vgpr" attribute.
514
515	unsigned Requested = F.getFnAttributeAsParsedInteger(Kind: "amdgpu-num-vgpr", Default: Max);
516	if (Requested != Max && hasGFX90AInsts())
517	Requested *= `2`;
518
519	// Make sure requested value is inside the range of possible VGPR usage.
520	return std::clamp(val: Requested, lo: Min, hi: Max);
521	}
522
523	unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
524	// Temporarily check both the attribute and the subtarget feature, until the
525	// latter is removed.
526	unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
527	if (DynamicVGPRBlockSize == `0` && isDynamicVGPREnabled())
528	DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
529
530	std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
531	return getBaseMaxNumVGPRs(
532	F, NumVGPRBounds: {getMinNumVGPRs(WavesPerEU: Waves.second, DynamicVGPRBlockSize),
533	getMaxNumVGPRs(WavesPerEU: Waves.first, DynamicVGPRBlockSize)});
534	}
535
536	unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
537	return getMaxNumVGPRs(F: MF.getFunction());
538	}
539
540	void GCNSubtarget::adjustSchedDependency(
541	SUnit Def, int* DefOpIdx, SUnit Use, int* UseOpIdx, SDep &Dep,
542	const TargetSchedModel SchedModel) const* {
543	if (Dep.getKind() != SDep::Kind::Data \|\| !Dep.getReg() \|\| !Def->isInstr() \|\|
544	!Use->isInstr())
545	return;
546
547	MachineInstr *DefI = Def->getInstr();
548	MachineInstr *UseI = Use->getInstr();
549
550	if (DefI->isBundle()) {
551	const SIRegisterInfo *TRI = getRegisterInfo();
552	auto Reg = Dep.getReg();
553	MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
554	MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
555	unsigned Lat = `0`;
556	for (++I; I != E && I ->isBundledWithPred(); ++I) {
557	if (I ->modifiesRegister(Reg, TRI))
558	Lat = InstrInfo.getInstrLatency(ItinData: getInstrItineraryData(), MI: *I);
559	else if (Lat)
560	--Lat;
561	}
562	Dep.setLatency(Lat);
563	} else if (UseI->isBundle()) {
564	const SIRegisterInfo *TRI = getRegisterInfo();
565	auto Reg = Dep.getReg();
566	MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
567	MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
568	unsigned Lat = InstrInfo.getInstrLatency(ItinData: getInstrItineraryData(), MI: *DefI);
569	for (++I; I != E && I ->isBundledWithPred() && Lat; ++I) {
570	if (I ->readsRegister(Reg, TRI))
571	break;
572	--Lat;
573	}
574	Dep.setLatency(Lat);
575	} else if (Dep.getLatency() == `0` && Dep.getReg() == AMDGPU::VCC_LO) {
576	// Work around the fact that SIInstrInfo::fixImplicitOperands modifies
577	// implicit operands which come from the MCInstrDesc, which can fool
578	// ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
579	// pseudo operands.
580	Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
581	DefMI: DefI, DefOperIdx: DefOpIdx, UseMI: UseI, UseOperIdx: UseOpIdx));
582	}
583	}
584
585	unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
586	if (getGeneration() >= AMDGPUSubtarget::GFX12)
587	return `0`; // Not MIMG encoding.
588
589	if (NSAThreshold.getNumOccurrences() > `0`)
590	return std::max(a: NSAThreshold.getValue(), b: `2u`);
591
592	int Value = MF.getFunction().getFnAttributeAsParsedInteger(
593	Kind: "amdgpu-nsa-threshold", Default: -`1`);
594	if (Value > `0`)
595	return std::max(a: Value, b: `2`);
596
597	return NSAThreshold;
598	}
599
600	GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
601	const GCNSubtarget &ST)
602	: ST(ST) {
603	const CallingConv::ID CC = F.getCallingConv();
604	const bool IsKernel =
605	CC == CallingConv::AMDGPU_KERNEL \|\| CC == CallingConv::SPIR_KERNEL;
606
607	if (IsKernel && (!F.arg_empty() \|\| ST.getImplicitArgNumBytes(F) != `0`))
608	KernargSegmentPtr = true;
609
610	bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
611	if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
612	PrivateSegmentBuffer = true;
613	else if (ST.isMesaGfxShader(F))
614	ImplicitBufferPtr = true;
615
616	if (!AMDGPU::isGraphics(CC)) {
617	if (!F.hasFnAttribute(Kind: "amdgpu-no-dispatch-ptr"))
618	DispatchPtr = true;
619
620	// FIXME: Can this always be disabled with < COv5?
621	if (!F.hasFnAttribute(Kind: "amdgpu-no-queue-ptr"))
622	QueuePtr = true;
623
624	if (!F.hasFnAttribute(Kind: "amdgpu-no-dispatch-id"))
625	DispatchID = true;
626	}
627
628	if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
629	(IsAmdHsaOrMesa \|\| ST.enableFlatScratch()) &&
630	// FlatScratchInit cannot be true for graphics CC if enableFlatScratch()
631	// is false.
632	(ST.enableFlatScratch() \|\|
633	(!AMDGPU::isGraphics(CC) &&
634	!F.hasFnAttribute(Kind: "amdgpu-no-flat-scratch-init"))) &&
635	!ST.flatScratchIsArchitected()) {
636	FlatScratchInit = true;
637	}
638
639	if (hasImplicitBufferPtr())
640	NumUsedUserSGPRs += getNumUserSGPRForField(ID: ImplicitBufferPtrID);
641
642	if (hasPrivateSegmentBuffer())
643	NumUsedUserSGPRs += getNumUserSGPRForField(ID: PrivateSegmentBufferID);
644
645	if (hasDispatchPtr())
646	NumUsedUserSGPRs += getNumUserSGPRForField(ID: DispatchPtrID);
647
648	if (hasQueuePtr())
649	NumUsedUserSGPRs += getNumUserSGPRForField(ID: QueuePtrID);
650
651	if (hasKernargSegmentPtr())
652	NumUsedUserSGPRs += getNumUserSGPRForField(ID: KernargSegmentPtrID);
653
654	if (hasDispatchID())
655	NumUsedUserSGPRs += getNumUserSGPRForField(ID: DispatchIdID);
656
657	if (hasFlatScratchInit())
658	NumUsedUserSGPRs += getNumUserSGPRForField(ID: FlatScratchInitID);
659
660	if (hasPrivateSegmentSize())
661	NumUsedUserSGPRs += getNumUserSGPRForField(ID: PrivateSegmentSizeID);
662	}
663
664	void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
665	assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
666	NumKernargPreloadSGPRs += NumSGPRs;
667	NumUsedUserSGPRs += NumSGPRs;
668	}
669
670	unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
671	return AMDGPU::getMaxNumUserSGPRs(STI: ST) - NumUsedUserSGPRs;
672	}
673

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp