GCNSubtarget.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp]

1	//===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// Implements the GCN specific subclass of TargetSubtarget.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "GCNSubtarget.h"
15	#include "AMDGPUCallLowering.h"
16	#include "AMDGPUInstructionSelector.h"
17	#include "AMDGPULegalizerInfo.h"
18	#include "AMDGPURegisterBankInfo.h"
19	#include "AMDGPUSelectionDAGInfo.h"
20	#include "AMDGPUTargetMachine.h"
21	#include "SIMachineFunctionInfo.h"
22	#include "Utils/AMDGPUBaseInfo.h"
23	#include "llvm/ADT/SmallString.h"
24	#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25	#include "llvm/CodeGen/MachineScheduler.h"
26	#include "llvm/CodeGen/TargetFrameLowering.h"
27	#include "llvm/IR/DiagnosticInfo.h"
28	#include "llvm/IR/MDBuilder.h"
29	#include <algorithm>
30
31	using namespace llvm;
32
33	#define DEBUG_TYPE "gcn-subtarget"
34
35	#define GET_SUBTARGETINFO_TARGET_DESC
36	#define GET_SUBTARGETINFO_CTOR
37	#define AMDGPUSubtarget GCNSubtarget
38	#include "AMDGPUGenSubtargetInfo.inc"
39	#undef AMDGPUSubtarget
40
41	static cl::opt<bool> EnableVGPRIndexMode(
42	"amdgpu-vgpr-index-mode",
43	cl::desc ("Use GPR indexing mode instead of movrel for vector indexing"),
44	cl::init(Val: false));
45
46	static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
47	cl::desc ("Enable the use of AA during codegen."),
48	cl::init(Val: true));
49
50	static cl::opt<unsigned>
51	NSAThreshold("amdgpu-nsa-threshold",
52	cl::desc ("Number of addresses from which to enable MIMG NSA."),
53	cl::init(Val: `2`), cl::Hidden);
54
55	GCNSubtarget::~GCNSubtarget() = default;
56
57	GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
58	StringRef GPU,
59	StringRef FS) {
60	// Determine default and user-specified characteristics
61	//
62	// We want to be able to turn these off, but making this a subtarget feature
63	// for SI has the unhelpful behavior that it unsets everything else if you
64	// disable it.
65	//
66	// Similarly we want enable-prt-strict-null to be on by default and not to
67	// unset everything else if it is disabled
68
69	SmallString<`256`> FullFS("+load-store-opt,+enable-ds128,");
70
71	// Turn on features that HSA ABI requires. Also turn on FlatForGlobal by
72	// default
73	if (isAmdHsaOS())
74	FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
75
76	FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
77
78	// Disable mutually exclusive bits.
79	if (FS.contains_insensitive(Other: "+wavefrontsize")) {
80	if (!FS.contains_insensitive(Other: "wavefrontsize16"))
81	FullFS += "-wavefrontsize16,";
82	if (!FS.contains_insensitive(Other: "wavefrontsize32"))
83	FullFS += "-wavefrontsize32,";
84	if (!FS.contains_insensitive(Other: "wavefrontsize64"))
85	FullFS += "-wavefrontsize64,";
86	}
87
88	FullFS += FS;
89
90	ParseSubtargetFeatures(CPU: GPU, /TuneCPU/ GPU, FS: FullFS);
91
92	// Implement the "generic" processors, which acts as the default when no
93	// generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
94	// the first amdgcn target that supports flat addressing. Other OSes defaults
95	// to the first amdgcn target.
96	if (Gen == AMDGPUSubtarget::INVALID) {
97	Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
98	: AMDGPUSubtarget::SOUTHERN_ISLANDS;
99	// Assume wave64 for the unknown target, if not explicitly set.
100	if (getWavefrontSizeLog2() == `0`)
101	WavefrontSizeLog2 = `6`;
102	} else if (!hasFeature(Feature: AMDGPU::FeatureWavefrontSize32) &&
103	!hasFeature(Feature: AMDGPU::FeatureWavefrontSize64)) {
104	// If there is no default wave size it must be a generation before gfx10,
105	// these have FeatureWavefrontSize64 in their definition already. For gfx10+
106	// set wave32 as a default.
107	ToggleFeature(FB: AMDGPU::FeatureWavefrontSize32);
108	WavefrontSizeLog2 = getGeneration() >= AMDGPUSubtarget::GFX10 ? `5` : `6`;
109	}
110
111	// We don't support FP64 for EG/NI atm.
112	assert(!hasFP64() \|\| (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
113
114	// Targets must either support 64-bit offsets for MUBUF instructions, and/or
115	// support flat operations, otherwise they cannot access a 64-bit global
116	// address space
117	assert(hasAddr64() \|\| hasFlat());
118	// Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
119	// that do not support ADDR64 variants of MUBUF instructions. Such targets
120	// cannot use a 64 bit offset with a MUBUF instruction to access the global
121	// address space
122	if (!hasAddr64() && !FS.contains(Other: "flat-for-global") && !UseFlatForGlobal) {
123	ToggleFeature(FB: AMDGPU::FeatureUseFlatForGlobal);
124	UseFlatForGlobal = true;
125	}
126	// Unless +-flat-for-global is specified, use MUBUF instructions for global
127	// address space access if flat operations are not available.
128	if (!hasFlat() && !FS.contains(Other: "flat-for-global") && UseFlatForGlobal) {
129	ToggleFeature(FB: AMDGPU::FeatureUseFlatForGlobal);
130	UseFlatForGlobal = false;
131	}
132
133	// Set defaults if needed.
134	if (MaxPrivateElementSize == `0`)
135	MaxPrivateElementSize = `4`;
136
137	if (LDSBankCount == `0`)
138	LDSBankCount = `32`;
139
140	if (AddressableLocalMemorySize == `0`)
141	AddressableLocalMemorySize = `32768`;
142
143	if (FlatOffsetBitWidth == `0`)
144	FlatOffsetBitWidth = `13`;
145
146	LocalMemorySize = AMDGPU::IsaInfo::getLocalMemorySize(STI: this);
147
148	HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
149	HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;
150
151	// InstCacheLineSize is set from TableGen subtarget features
152	// (FeatureInstCacheLineSize64 / FeatureInstCacheLineSize128).
153	// Fall back to 64 if no feature was specified (e.g. generic targets).
154	if (InstCacheLineSize == `0`)
155	InstCacheLineSize = `64`;
156
157	assert(llvm::isPowerOf2_32(InstCacheLineSize) &&
158	"InstCacheLineSize must be a power of 2");
159
160	TargetID.setTargetIDFromFeaturesString(FS);
161
162	LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
163	<< TargetID.getXnackSetting() << `'\n'`);
164	LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
165	<< TargetID.getSramEccSetting() << `'\n'`);
166
167	return *this;
168	}
169
170	void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
171	LLVMContext &Ctx = F.getContext();
172	if (hasFeature(Feature: AMDGPU::FeatureWavefrontSize32) &&
173	hasFeature(Feature: AMDGPU::FeatureWavefrontSize64)) {
174	Ctx.diagnose(DI: DiagnosticInfoUnsupported (
175	F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
176	}
177	}
178
179	GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
180	const GCNTargetMachine &TM)
181	: // clang-format off
182	AMDGPUGenSubtargetInfo (TT, GPU, /TuneCPU/ GPU, FS),
183	AMDGPUSubtarget (TT),
184	TargetID (*this),
185	InstrItins(getInstrItineraryForCPU(CPU: GPU)),
186	InstrInfo (initializeSubtargetDependencies(TT, GPU, FS)),
187	TLInfo (TM, *this),
188	FrameLowering (TargetFrameLowering::StackGrowsUp, getStackAlignment(), `0`) {
189	// clang-format on
190	MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(STI: this);
191	EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(STI: this);
192
193	TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();
194
195	CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(args: *getTargetLowering());
196	InlineAsmLoweringInfo =
197	std::make_unique<InlineAsmLowering>(args: getTargetLowering());
198	Legalizer = std::make_unique<AMDGPULegalizerInfo>(args&: *this, args: TM);
199	RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(args&: *this);
200	InstSelector =
201	std::make_unique<AMDGPUInstructionSelector>(args&: *this, args&: *RegBankInfo, args: TM);
202	}
203
204	const SelectionDAGTargetInfo GCNSubtarget::getSelectionDAGInfo() const* {
205	return TSInfo.get();
206	}
207
208	unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
209	if (getGeneration() < GFX10)
210	return `1`;
211
212	switch (Opcode) {
213	case AMDGPU::V_LSHLREV_B64_e64:
214	case AMDGPU::V_LSHLREV_B64_gfx10:
215	case AMDGPU::V_LSHLREV_B64_e64_gfx11:
216	case AMDGPU::V_LSHLREV_B64_e32_gfx12:
217	case AMDGPU::V_LSHLREV_B64_e64_gfx12:
218	case AMDGPU::V_LSHL_B64_e64:
219	case AMDGPU::V_LSHRREV_B64_e64:
220	case AMDGPU::V_LSHRREV_B64_gfx10:
221	case AMDGPU::V_LSHRREV_B64_e64_gfx11:
222	case AMDGPU::V_LSHRREV_B64_e64_gfx12:
223	case AMDGPU::V_LSHR_B64_e64:
224	case AMDGPU::V_ASHRREV_I64_e64:
225	case AMDGPU::V_ASHRREV_I64_gfx10:
226	case AMDGPU::V_ASHRREV_I64_e64_gfx11:
227	case AMDGPU::V_ASHRREV_I64_e64_gfx12:
228	case AMDGPU::V_ASHR_I64_e64:
229	return `1`;
230	}
231
232	return `2`;
233	}
234
235	/// This list was mostly derived from experimentation.
236	bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
237	switch (Opcode) {
238	case AMDGPU::V_CVT_F16_F32_e32:
239	case AMDGPU::V_CVT_F16_F32_e64:
240	case AMDGPU::V_CVT_F16_U16_e32:
241	case AMDGPU::V_CVT_F16_U16_e64:
242	case AMDGPU::V_CVT_F16_I16_e32:
243	case AMDGPU::V_CVT_F16_I16_e64:
244	case AMDGPU::V_RCP_F16_e64:
245	case AMDGPU::V_RCP_F16_e32:
246	case AMDGPU::V_RSQ_F16_e64:
247	case AMDGPU::V_RSQ_F16_e32:
248	case AMDGPU::V_SQRT_F16_e64:
249	case AMDGPU::V_SQRT_F16_e32:
250	case AMDGPU::V_LOG_F16_e64:
251	case AMDGPU::V_LOG_F16_e32:
252	case AMDGPU::V_EXP_F16_e64:
253	case AMDGPU::V_EXP_F16_e32:
254	case AMDGPU::V_SIN_F16_e64:
255	case AMDGPU::V_SIN_F16_e32:
256	case AMDGPU::V_COS_F16_e64:
257	case AMDGPU::V_COS_F16_e32:
258	case AMDGPU::V_FLOOR_F16_e64:
259	case AMDGPU::V_FLOOR_F16_e32:
260	case AMDGPU::V_CEIL_F16_e64:
261	case AMDGPU::V_CEIL_F16_e32:
262	case AMDGPU::V_TRUNC_F16_e64:
263	case AMDGPU::V_TRUNC_F16_e32:
264	case AMDGPU::V_RNDNE_F16_e64:
265	case AMDGPU::V_RNDNE_F16_e32:
266	case AMDGPU::V_FRACT_F16_e64:
267	case AMDGPU::V_FRACT_F16_e32:
268	case AMDGPU::V_FREXP_MANT_F16_e64:
269	case AMDGPU::V_FREXP_MANT_F16_e32:
270	case AMDGPU::V_FREXP_EXP_I16_F16_e64:
271	case AMDGPU::V_FREXP_EXP_I16_F16_e32:
272	case AMDGPU::V_LDEXP_F16_e64:
273	case AMDGPU::V_LDEXP_F16_e32:
274	case AMDGPU::V_LSHLREV_B16_e64:
275	case AMDGPU::V_LSHLREV_B16_e32:
276	case AMDGPU::V_LSHRREV_B16_e64:
277	case AMDGPU::V_LSHRREV_B16_e32:
278	case AMDGPU::V_ASHRREV_I16_e64:
279	case AMDGPU::V_ASHRREV_I16_e32:
280	case AMDGPU::V_ADD_U16_e64:
281	case AMDGPU::V_ADD_U16_e32:
282	case AMDGPU::V_SUB_U16_e64:
283	case AMDGPU::V_SUB_U16_e32:
284	case AMDGPU::V_SUBREV_U16_e64:
285	case AMDGPU::V_SUBREV_U16_e32:
286	case AMDGPU::V_MUL_LO_U16_e64:
287	case AMDGPU::V_MUL_LO_U16_e32:
288	case AMDGPU::V_ADD_F16_e64:
289	case AMDGPU::V_ADD_F16_e32:
290	case AMDGPU::V_SUB_F16_e64:
291	case AMDGPU::V_SUB_F16_e32:
292	case AMDGPU::V_SUBREV_F16_e64:
293	case AMDGPU::V_SUBREV_F16_e32:
294	case AMDGPU::V_MUL_F16_e64:
295	case AMDGPU::V_MUL_F16_e32:
296	case AMDGPU::V_MAX_F16_e64:
297	case AMDGPU::V_MAX_F16_e32:
298	case AMDGPU::V_MIN_F16_e64:
299	case AMDGPU::V_MIN_F16_e32:
300	case AMDGPU::V_MAX_U16_e64:
301	case AMDGPU::V_MAX_U16_e32:
302	case AMDGPU::V_MIN_U16_e64:
303	case AMDGPU::V_MIN_U16_e32:
304	case AMDGPU::V_MAX_I16_e64:
305	case AMDGPU::V_MAX_I16_e32:
306	case AMDGPU::V_MIN_I16_e64:
307	case AMDGPU::V_MIN_I16_e32:
308	case AMDGPU::V_MAD_F16_e64:
309	case AMDGPU::V_MAD_U16_e64:
310	case AMDGPU::V_MAD_I16_e64:
311	case AMDGPU::V_FMA_F16_e64:
312	case AMDGPU::V_DIV_FIXUP_F16_e64:
313	// On gfx10, all 16-bit instructions preserve the high bits.
314	return getGeneration() <= AMDGPUSubtarget::GFX9;
315	case AMDGPU::V_MADAK_F16:
316	case AMDGPU::V_MADMK_F16:
317	case AMDGPU::V_MAC_F16_e64:
318	case AMDGPU::V_MAC_F16_e32:
319	case AMDGPU::V_FMAMK_F16:
320	case AMDGPU::V_FMAAK_F16:
321	case AMDGPU::V_FMAC_F16_e64:
322	case AMDGPU::V_FMAC_F16_e32:
323	// In gfx9, the preferred handling of the unused high 16-bits changed. Most
324	// instructions maintain the legacy behavior of 0ing. Some instructions
325	// changed to preserving the high bits.
326	return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
327	case AMDGPU::V_MAD_MIXLO_F16:
328	case AMDGPU::V_MAD_MIXHI_F16:
329	default:
330	return false;
331	}
332	}
333
334	void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
335	const SchedRegion &Region) const {
336	// Track register pressure so the scheduler can try to decrease
337	// pressure once register usage is above the threshold defined by
338	// SIRegisterInfo::getRegPressureSetLimit()
339	Policy.ShouldTrackPressure = true;
340
341	// Enabling both top down and bottom up scheduling seems to give us less
342	// register spills than just using one of these approaches on its own.
343	Policy.OnlyTopDown = false;
344	Policy.OnlyBottomUp = false;
345
346	// Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
347	if (!enableSIScheduler())
348	Policy.ShouldTrackLaneMasks = true;
349	}
350
351	void GCNSubtarget::overridePostRASchedPolicy(MachineSchedPolicy &Policy,
352	const SchedRegion &Region) const {
353	const Function &F = Region.RegionBegin ->getMF()->getFunction();
354	Attribute PostRADirectionAttr = F.getFnAttribute(Kind: "amdgpu-post-ra-direction");
355	if (!PostRADirectionAttr.isValid())
356	return;
357
358	StringRef PostRADirectionStr = PostRADirectionAttr.getValueAsString();
359	if (PostRADirectionStr == "topdown") {
360	Policy.OnlyTopDown = true;
361	Policy.OnlyBottomUp = false;
362	} else if (PostRADirectionStr == "bottomup") {
363	Policy.OnlyTopDown = false;
364	Policy.OnlyBottomUp = true;
365	} else if (PostRADirectionStr == "bidirectional") {
366	Policy.OnlyTopDown = false;
367	Policy.OnlyBottomUp = false;
368	} else {
369	DiagnosticInfoOptimizationFailure Diag(
370	F, F.getSubprogram(), "invalid value for postRA direction attribute");
371	F.getContext().diagnose(DI: Diag);
372	}
373
374	LLVM_DEBUG({
375	const char *DirStr = "default";
376	if (Policy.OnlyTopDown && !Policy.OnlyBottomUp)
377	DirStr = "topdown";
378	else if (!Policy.OnlyTopDown && Policy.OnlyBottomUp)
379	DirStr = "bottomup";
380	else if (!Policy.OnlyTopDown && !Policy.OnlyBottomUp)
381	DirStr = "bidirectional";
382
383	dbgs() << "Post-MI-sched direction (" << F.getName() << "): " << DirStr
384	<< `'\n'`;
385	});
386	}
387
388	void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
389	if (isWave32()) {
390	// Fix implicit $vcc operands after MIParser has verified that they match
391	// the instruction definitions.
392	for (auto &MBB : MF) {
393	for (auto &MI : MBB)
394	InstrInfo.fixImplicitOperands(MI);
395	}
396	}
397	}
398
399	bool GCNSubtarget::hasMadF16() const {
400	return InstrInfo.pseudoToMCOpcode(Opcode: AMDGPU::V_MAD_F16_e64) != -`1`;
401	}
402
403	bool GCNSubtarget::useVGPRIndexMode() const {
404	return hasVGPRIndexMode() && (!hasMovrel() \|\| EnableVGPRIndexMode);
405	}
406
407	bool GCNSubtarget::useAA() const { return UseAA; }
408
409	unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
410	return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, MaxWaves: getMaxWavesPerEU(),
411	Gen: getGeneration());
412	}
413
414	unsigned
415	GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs,
416	unsigned DynamicVGPRBlockSize) const {
417	return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(STI: this, NumVGPRs,
418	DynamicVGPRBlockSize);
419	}
420
421	unsigned
422	GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
423	if (getGeneration() >= AMDGPUSubtarget::GFX10)
424	return `2`; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
425
426	if (HasFlatScratch \|\| HasArchitectedFlatScratch) {
427	if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
428	return `6`; // FLAT_SCRATCH, XNACK, VCC (in that order).
429	if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
430	return `4`; // FLAT_SCRATCH, VCC (in that order).
431	}
432
433	if (isXNACKEnabled())
434	return `4`; // XNACK, VCC (in that order).
435	return `2`; // VCC.
436	}
437
438	unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
439	const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
440	return getBaseReservedNumSGPRs(HasFlatScratch: MFI.getUserSGPRInfo().hasFlatScratchInit());
441	}
442
443	unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
444	// In principle we do not need to reserve SGPR pair used for flat_scratch if
445	// we know flat instructions do not access the stack anywhere in the
446	// program. For now assume it's needed if we have flat instructions.
447	const bool KernelUsesFlatScratch = hasFlatAddressSpace();
448	return getBaseReservedNumSGPRs(HasFlatScratch: KernelUsesFlatScratch);
449	}
450
451	std::pair<unsigned, unsigned>
452	GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
453	unsigned NumSGPRs, unsigned NumVGPRs) const {
454	unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
455	// Temporarily check both the attribute and the subtarget feature until the
456	// latter is removed.
457	if (DynamicVGPRBlockSize == `0` && isDynamicVGPREnabled())
458	DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
459
460	auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSBytes: LDSSize, F);
461	unsigned SGPROcc = getOccupancyWithNumSGPRs(SGPRs: NumSGPRs);
462	unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);
463
464	// Maximum occupancy may be further limited by high SGPR/VGPR usage.
465	MaxOcc = std::min(a: MaxOcc, b: std::min(a: SGPROcc, b: VGPROcc));
466	return {std::min(a: MinOcc, b: MaxOcc), MaxOcc};
467	}
468
469	unsigned GCNSubtarget::getBaseMaxNumSGPRs(
470	const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
471	unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
472	// Compute maximum number of SGPRs function can use using default/requested
473	// minimum number of waves per execution unit.
474	unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU: WavesPerEU.first, Addressable: false);
475	unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU: WavesPerEU.first, Addressable: true);
476
477	// Check if maximum number of SGPRs was explicitly requested using
478	// "amdgpu-num-sgpr" attribute.
479	unsigned Requested =
480	F.getFnAttributeAsParsedInteger(Kind: "amdgpu-num-sgpr", Default: MaxNumSGPRs);
481
482	if (Requested != MaxNumSGPRs) {
483	// Make sure requested value does not violate subtarget's specifications.
484	if (Requested && (Requested <= ReservedNumSGPRs))
485	Requested = `0`;
486
487	// If more SGPRs are required to support the input user/system SGPRs,
488	// increase to accommodate them.
489	//
490	// FIXME: This really ends up using the requested number of SGPRs + number
491	// of reserved special registers in total. Theoretically you could re-use
492	// the last input registers for these special registers, but this would
493	// require a lot of complexity to deal with the weird aliasing.
494	unsigned InputNumSGPRs = PreloadedSGPRs;
495	if (Requested && Requested < InputNumSGPRs)
496	Requested = InputNumSGPRs;
497
498	// Make sure requested value is compatible with values implied by
499	// default/requested minimum/maximum number of waves per execution unit.
500	if (Requested && Requested > getMaxNumSGPRs(WavesPerEU: WavesPerEU.first, Addressable: false))
501	Requested = `0`;
502	if (WavesPerEU.second && Requested &&
503	Requested < getMinNumSGPRs(WavesPerEU: WavesPerEU.second))
504	Requested = `0`;
505
506	if (Requested)
507	MaxNumSGPRs = Requested;
508	}
509
510	if (hasSGPRInitBug())
511	MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
512
513	return std::min(a: MaxNumSGPRs - ReservedNumSGPRs, b: MaxAddressableNumSGPRs);
514	}
515
516	unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
517	const Function &F = MF.getFunction();
518	const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
519	return getBaseMaxNumSGPRs(F, WavesPerEU: MFI.getWavesPerEU(), PreloadedSGPRs: MFI.getNumPreloadedSGPRs(),
520	ReservedNumSGPRs: getReservedNumSGPRs(MF));
521	}
522
523	unsigned GCNSubtarget::getMaxNumPreloadedSGPRs() const {
524	using USI = GCNUserSGPRUsageInfo;
525	// Max number of user SGPRs
526	const unsigned MaxUserSGPRs =
527	USI::getNumUserSGPRForField(ID: USI::PrivateSegmentBufferID) +
528	USI::getNumUserSGPRForField(ID: USI::DispatchPtrID) +
529	USI::getNumUserSGPRForField(ID: USI::QueuePtrID) +
530	USI::getNumUserSGPRForField(ID: USI::KernargSegmentPtrID) +
531	USI::getNumUserSGPRForField(ID: USI::DispatchIdID) +
532	USI::getNumUserSGPRForField(ID: USI::FlatScratchInitID) +
533	USI::getNumUserSGPRForField(ID: USI::ImplicitBufferPtrID);
534
535	// Max number of system SGPRs
536	const unsigned MaxSystemSGPRs = `1` + // WorkGroupIDX
537	`1` + // WorkGroupIDY
538	`1` + // WorkGroupIDZ
539	`1` + // WorkGroupInfo
540	`1`; // private segment wave byte offset
541
542	// Max number of synthetic SGPRs
543	const unsigned SyntheticSGPRs = `1`; // LDSKernelId
544
545	return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
546	}
547
548	unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
549	return getBaseMaxNumSGPRs(F, WavesPerEU: getWavesPerEU(F), PreloadedSGPRs: getMaxNumPreloadedSGPRs(),
550	ReservedNumSGPRs: getReservedNumSGPRs(F));
551	}
552
553	unsigned GCNSubtarget::getBaseMaxNumVGPRs(
554	const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
555	const auto [Min, Max] = NumVGPRBounds;
556
557	// Check if maximum number of VGPRs was explicitly requested using
558	// "amdgpu-num-vgpr" attribute.
559
560	unsigned Requested = F.getFnAttributeAsParsedInteger(Kind: "amdgpu-num-vgpr", Default: Max);
561	if (Requested != Max && hasGFX90AInsts())
562	Requested *= `2`;
563
564	// Make sure requested value is inside the range of possible VGPR usage.
565	return std::clamp(val: Requested, lo: Min, hi: Max);
566	}
567
568	unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
569	// Temporarily check both the attribute and the subtarget feature, until the
570	// latter is removed.
571	unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
572	if (DynamicVGPRBlockSize == `0` && isDynamicVGPREnabled())
573	DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
574
575	std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
576	return getBaseMaxNumVGPRs(
577	F, NumVGPRBounds: {getMinNumVGPRs(WavesPerEU: Waves.second, DynamicVGPRBlockSize),
578	getMaxNumVGPRs(WavesPerEU: Waves.first, DynamicVGPRBlockSize)});
579	}
580
581	unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
582	return getMaxNumVGPRs(F: MF.getFunction());
583	}
584
585	std::pair<unsigned, unsigned>
586	GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
587	const unsigned MaxVectorRegs = getMaxNumVGPRs(F);
588
589	unsigned MaxNumVGPRs = MaxVectorRegs;
590	unsigned MaxNumAGPRs = `0`;
591	unsigned NumArchVGPRs = getAddressableNumArchVGPRs();
592
593	// On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
594	// a wave may have up to 512 total vector registers combining together both
595	// VGPRs and AGPRs. Hence, in an entry function without calls and without
596	// AGPRs used within it, it is possible to use the whole vector register
597	// budget for VGPRs.
598	//
599	// TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
600	// register file accordingly.
601	if (hasGFX90AInsts()) {
602	unsigned MinNumAGPRs = `0`;
603	const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
604
605	const std::pair<unsigned, unsigned> DefaultNumAGPR = {~`0u`, ~`0u`};
606
607	// TODO: The lower bound should probably force the number of required
608	// registers up, overriding amdgpu-waves-per-eu.
609	std::tie(args&: MinNumAGPRs, args&: MaxNumAGPRs) =
610	AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-agpr-alloc", Default: DefaultNumAGPR,
611	/OnlyFirstRequired=/true);
612
613	if (MinNumAGPRs == DefaultNumAGPR.first) {
614	// Default to splitting half the registers if AGPRs are required.
615	MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / `2`;
616	} else {
617	// Align to accum_offset's allocation granularity.
618	MinNumAGPRs = alignTo(Value: MinNumAGPRs, Align: `4`);
619
620	MinNumAGPRs = std::min(a: MinNumAGPRs, b: TotalNumAGPRs);
621	}
622
623	// Clamp values to be inbounds of our limits, and ensure min <= max.
624
625	MaxNumAGPRs = std::min(a: std::max(a: MinNumAGPRs, b: MaxNumAGPRs), b: MaxVectorRegs);
626	MinNumAGPRs = std::min(a: std::min(a: MinNumAGPRs, b: TotalNumAGPRs), b: MaxNumAGPRs);
627
628	MaxNumVGPRs = std::min(a: MaxVectorRegs - MinNumAGPRs, b: NumArchVGPRs);
629	MaxNumAGPRs = std::min(a: MaxVectorRegs - MaxNumVGPRs, b: MaxNumAGPRs);
630
631	assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
632	MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= NumArchVGPRs &&
633	"invalid register counts");
634	} else if (hasMAIInsts()) {
635	// On gfx908 the number of AGPRs always equals the number of VGPRs.
636	MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
637	}
638
639	return std::pair(MaxNumVGPRs, MaxNumAGPRs);
640	}
641
642	void GCNSubtarget::adjustSchedDependency(
643	SUnit Def, int* DefOpIdx, SUnit Use, int* UseOpIdx, SDep &Dep,
644	const TargetSchedModel SchedModel) const* {
645	if (Dep.getKind() != SDep::Kind::Data \|\| !Dep.getReg() \|\| !Def->isInstr() \|\|
646	!Use->isInstr())
647	return;
648
649	MachineInstr *DefI = Def->getInstr();
650	MachineInstr *UseI = Use->getInstr();
651
652	if (DefI->isBundle()) {
653	const SIRegisterInfo *TRI = getRegisterInfo();
654	auto Reg = Dep.getReg();
655	MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
656	MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
657	unsigned Lat = `0`;
658	for (++I; I != E && I ->isBundledWithPred(); ++I) {
659	if (I ->isMetaInstruction())
660	continue;
661	if (I ->modifiesRegister(Reg, TRI))
662	Lat = InstrInfo.getInstrLatency(ItinData: getInstrItineraryData(), MI: *I);
663	else if (Lat)
664	--Lat;
665	}
666	Dep.setLatency(Lat);
667	} else if (UseI->isBundle()) {
668	const SIRegisterInfo *TRI = getRegisterInfo();
669	auto Reg = Dep.getReg();
670	MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
671	MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
672	unsigned Lat = InstrInfo.getInstrLatency(ItinData: getInstrItineraryData(), MI: *DefI);
673	for (++I; I != E && I ->isBundledWithPred() && Lat; ++I) {
674	if (I ->isMetaInstruction())
675	continue;
676	if (I ->readsRegister(Reg, TRI))
677	break;
678	--Lat;
679	}
680	Dep.setLatency(Lat);
681	} else if (Dep.getLatency() == `0` && Dep.getReg() == AMDGPU::VCC_LO) {
682	// Work around the fact that SIInstrInfo::fixImplicitOperands modifies
683	// implicit operands which come from the MCInstrDesc, which can fool
684	// ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
685	// pseudo operands.
686	Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
687	DefMI: DefI, DefOperIdx: DefOpIdx, UseMI: UseI, UseOperIdx: UseOpIdx));
688	}
689	}
690
691	unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
692	if (getGeneration() >= AMDGPUSubtarget::GFX12)
693	return `0`; // Not MIMG encoding.
694
695	if (NSAThreshold.getNumOccurrences() > `0`)
696	return std::max(a: NSAThreshold.getValue(), b: `2u`);
697
698	int Value = MF.getFunction().getFnAttributeAsParsedInteger(
699	Kind: "amdgpu-nsa-threshold", Default: -`1`);
700	if (Value > `0`)
701	return std::max(a: Value, b: `2`);
702
703	return NSAThreshold;
704	}
705
706	GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
707	const GCNSubtarget &ST)
708	: ST(ST) {
709	const CallingConv::ID CC = F.getCallingConv();
710	const bool IsKernel =
711	CC == CallingConv::AMDGPU_KERNEL \|\| CC == CallingConv::SPIR_KERNEL;
712
713	if (IsKernel && (!F.arg_empty() \|\| ST.getImplicitArgNumBytes(F) != `0`))
714	KernargSegmentPtr = true;
715
716	bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
717	if (IsAmdHsaOrMesa && !ST.hasFlatScratchEnabled())
718	PrivateSegmentBuffer = true;
719	else if (ST.isMesaGfxShader(F))
720	ImplicitBufferPtr = true;
721
722	if (!AMDGPU::isGraphics(CC)) {
723	if (!F.hasFnAttribute(Kind: "amdgpu-no-dispatch-ptr"))
724	DispatchPtr = true;
725
726	// FIXME: Can this always be disabled with < COv5?
727	if (!F.hasFnAttribute(Kind: "amdgpu-no-queue-ptr"))
728	QueuePtr = true;
729
730	if (!F.hasFnAttribute(Kind: "amdgpu-no-dispatch-id"))
731	DispatchID = true;
732	}
733
734	if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
735	(IsAmdHsaOrMesa \|\| ST.hasFlatScratchEnabled()) &&
736	// FlatScratchInit cannot be true for graphics CC if
737	// hasFlatScratchEnabled() is false.
738	(ST.hasFlatScratchEnabled() \|\|
739	(!AMDGPU::isGraphics(CC) &&
740	!F.hasFnAttribute(Kind: "amdgpu-no-flat-scratch-init"))) &&
741	!ST.hasArchitectedFlatScratch()) {
742	FlatScratchInit = true;
743	}
744
745	if (hasImplicitBufferPtr())
746	NumUsedUserSGPRs += getNumUserSGPRForField(ID: ImplicitBufferPtrID);
747
748	if (hasPrivateSegmentBuffer())
749	NumUsedUserSGPRs += getNumUserSGPRForField(ID: PrivateSegmentBufferID);
750
751	if (hasDispatchPtr())
752	NumUsedUserSGPRs += getNumUserSGPRForField(ID: DispatchPtrID);
753
754	if (hasQueuePtr())
755	NumUsedUserSGPRs += getNumUserSGPRForField(ID: QueuePtrID);
756
757	if (hasKernargSegmentPtr())
758	NumUsedUserSGPRs += getNumUserSGPRForField(ID: KernargSegmentPtrID);
759
760	if (hasDispatchID())
761	NumUsedUserSGPRs += getNumUserSGPRForField(ID: DispatchIdID);
762
763	if (hasFlatScratchInit())
764	NumUsedUserSGPRs += getNumUserSGPRForField(ID: FlatScratchInitID);
765
766	if (hasPrivateSegmentSize())
767	NumUsedUserSGPRs += getNumUserSGPRForField(ID: PrivateSegmentSizeID);
768	}
769
770	void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
771	assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
772	NumKernargPreloadSGPRs += NumSGPRs;
773	NumUsedUserSGPRs += NumSGPRs;
774	}
775
776	unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
777	return AMDGPU::getMaxNumUserSGPRs(STI: ST) - NumUsedUserSGPRs;
778	}
779

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp