//===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the GCN specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "GCNSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSelectionDAGInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "gcn-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

static cl::opt<unsigned>
    NSAThreshold("amdgpu-nsa-threshold",
                 cl::desc("Number of addresses from which to enable MIMG NSA."),
                 cl::init(2), cl::Hidden);

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                            StringRef GPU,
                                                            StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). The HSA OS defaults
  // to the first amdgcn target that supports flat addressing. Other OSes
  // default to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
    // Assume wave64 for the unknown target, if not explicitly set.
    if (getWavefrontSizeLog2() == 0)
      WavefrontSizeLog2 = 6;
  } else if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
             !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    // If there is no default wave size, this must be a generation before
    // gfx10: those already have FeatureWavefrontSize64 in their definition.
    // For gfx10+ set wave32 as the default.
    ToggleFeature(AMDGPU::FeatureWavefrontSize32);
    WavefrontSizeLog2 = getGeneration() >= AMDGPUSubtarget::GFX10 ? 5 : 6;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !UseFlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureUseFlatForGlobal);
    UseFlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && UseFlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureUseFlatForGlobal);
    UseFlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (AddressableLocalMemorySize == 0)
    AddressableLocalMemorySize = 32768;

  LocalMemorySize = AMDGPU::IsaInfo::getLocalMemorySize(this);

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
  LLVMContext &Ctx = F.getContext();
  if (hasFeature(AMDGPU::FeatureWavefrontSize32) &&
      hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    Ctx.diagnose(DiagnosticInfoUnsupported(
        F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
  }
}

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);

  TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();

  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
  InlineAsmLoweringInfo =
      std::make_unique<InlineAsmLowering>(getTargetLowering());
  Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
  InstSelector =
      std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
}

const SelectionDAGTargetInfo *GCNSubtarget::getSelectionDAGInfo() const {
  return TSInfo.get();
}

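// Return how many scalar (SGPR or literal) operands a single VALU instruction
// may read on this subtarget. Before GFX10 the constant bus allows one read;
// GFX10+ allows two, except for the 64-bit shift opcodes listed below, which
// remain limited to one.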
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing them. Some
    // instructions changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       const SchedRegion &Region) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

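// Let the "amdgpu-post-ra-direction" function attribute override the post-RA
// scheduling direction: "topdown", "bottomup", or "bidirectional".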
void GCNSubtarget::overridePostRASchedPolicy(MachineSchedPolicy &Policy,
                                             const SchedRegion &Region) const {
  const Function &F = Region.RegionBegin->getMF()->getFunction();
  Attribute PostRADirectionAttr = F.getFnAttribute("amdgpu-post-ra-direction");
  if (!PostRADirectionAttr.isValid())
    return;

  StringRef PostRADirectionStr = PostRADirectionAttr.getValueAsString();
  if (PostRADirectionStr == "topdown") {
    Policy.OnlyTopDown = true;
    Policy.OnlyBottomUp = false;
  } else if (PostRADirectionStr == "bottomup") {
    Policy.OnlyTopDown = false;
    Policy.OnlyBottomUp = true;
  } else if (PostRADirectionStr == "bidirectional") {
    Policy.OnlyTopDown = false;
    Policy.OnlyBottomUp = false;
  } else {
    DiagnosticInfoOptimizationFailure Diag(
        F, F.getSubprogram(), "invalid value for postRA direction attribute");
    F.getContext().diagnose(Diag);
  }

  LLVM_DEBUG({
    const char *DirStr = "default";
    if (Policy.OnlyTopDown && !Policy.OnlyBottomUp)
      DirStr = "topdown";
    else if (!Policy.OnlyTopDown && Policy.OnlyBottomUp)
      DirStr = "bottomup";
    else if (!Policy.OnlyTopDown && !Policy.OnlyBottomUp)
      DirStr = "bidirectional";

    dbgs() << "Post-MI-sched direction (" << F.getName() << "): " << DirStr
           << '\n';
  });
}

void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
  if (isWave32()) {
    // Fix implicit $vcc operands after MIParser has verified that they match
    // the instruction definitions.
    for (auto &MBB : MF) {
      for (auto &MI : MBB)
        InstrInfo.fixImplicitOperands(MI);
    }
  }
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

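// Prefer GPR indexing mode over movrel when movrel is unavailable, or when it
// is explicitly requested via -amdgpu-vgpr-index-mode.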
bool GCNSubtarget::useVGPRIndexMode() const {
  return hasVGPRIndexMode() && (!hasMovrel() || EnableVGPRIndexMode);
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
                                                   getGeneration());
}

unsigned
GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs,
                                       unsigned DynamicVGPRBlockSize) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs,
                                                       DynamicVGPRBlockSize);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

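// Compute the {minimum, maximum} waves-per-EU occupancy for \p F given its LDS
// usage, workgroup sizes, and register counts. High SGPR/VGPR usage can lower
// the maximum further.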
std::pair<unsigned, unsigned>
GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                               unsigned NumSGPRs, unsigned NumVGPRs) const {
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  // Temporarily check both the attribute and the subtarget feature until the
  // latter is removed.
  if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
    DynamicVGPRBlockSize = getDynamicVGPRBlockSize();

  auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
  unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
  unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);

  // Maximum occupancy may be further limited by high SGPR/VGPR usage.
  MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
  return {std::min(MinOcc, MaxOcc), MaxOcc};
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  unsigned Requested =
      F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

  if (Requested != MaxNumSGPRs) {
    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

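// Conservative upper bound on preloaded SGPRs: the sum of the maximum possible
// user, system, and synthetic SGPR counts. Used when only the Function is
// available and the exact preloaded count is not known.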
unsigned GCNSubtarget::getMaxNumPreloadedSGPRs() const {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1;  // private segment wave byte offset

  // Max number of synthetic SGPRs
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

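// Clamp an explicit "amdgpu-num-vgpr" request, if present, to the given
// [Min, Max] VGPR bounds.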
unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
  const auto [Min, Max] = NumVGPRBounds;

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  unsigned Requested = F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", Max);
  if (Requested != Max && hasGFX90AInsts())
    Requested *= 2;

  // Make sure the requested value is inside the range of possible VGPR usage.
  return std::clamp(Requested, Min, Max);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is removed.
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
    DynamicVGPRBlockSize = getDynamicVGPRBlockSize();

  std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
  return getBaseMaxNumVGPRs(
      F, {getMinNumVGPRs(Waves.second, DynamicVGPRBlockSize),
          getMaxNumVGPRs(Waves.first, DynamicVGPRBlockSize)});
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  return getMaxNumVGPRs(MF.getFunction());
}

std::pair<unsigned, unsigned>
GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
  const unsigned MaxVectorRegs = getMaxNumVGPRs(F);

  unsigned MaxNumVGPRs = MaxVectorRegs;
  unsigned MaxNumAGPRs = 0;
  unsigned NumArchVGPRs = has1024AddressableVGPRs() ? 1024 : 256;

  // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
  // a wave may have up to 512 total vector registers combining together both
  // VGPRs and AGPRs. Hence, in an entry function without calls and without
  // AGPRs used within it, it is possible to use the whole vector register
  // budget for VGPRs.
  //
  // TODO: it should be possible to estimate the maximum AGPR/VGPR pressure
  // and split the register file accordingly.
  if (hasGFX90AInsts()) {
    unsigned MinNumAGPRs = 0;
    const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();

    const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};

    // TODO: The lower bound should probably force the number of required
    // registers up, overriding amdgpu-waves-per-eu.
    std::tie(MinNumAGPRs, MaxNumAGPRs) =
        AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", DefaultNumAGPR,
                                        /*OnlyFirstRequired=*/true);

    if (MinNumAGPRs == DefaultNumAGPR.first) {
      // Default to splitting half the registers if AGPRs are required.
      MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
    } else {
      // Align to accum_offset's allocation granularity.
      MinNumAGPRs = alignTo(MinNumAGPRs, 4);

      MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs);
    }

    // Clamp values to be inbounds of our limits, and ensure min <= max.

    MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
    MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);

    MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, NumArchVGPRs);
    MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);

    assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
           MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= NumArchVGPRs &&
           "invalid register counts");
  } else if (hasMAIInsts()) {
    // On gfx908 the number of AGPRs always equals the number of VGPRs.
    MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
  }

  return std::pair(MaxNumVGPRs, MaxNumAGPRs);
}

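// Fix up data-dependence latencies the generic scheduler cannot compute on its
// own: defs and uses hidden inside bundles, and VCC_LO dependencies whose
// implicit operands were rewritten by SIInstrInfo::fixImplicitOperands.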
void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() ||
      !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->isMetaInstruction())
        continue;
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->isMetaInstruction())
        continue;
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

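// Minimum number of image addresses required before MIMG NSA encoding is used.
// Both the command-line option and the "amdgpu-nsa-threshold" attribute are
// clamped to at least 2; GFX12+ returns 0 since it does not use the MIMG
// encoding.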
unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX12)
    return 0; // Not MIMG encoding.

  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(NSAThreshold.getValue(), 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);

  return NSAThreshold;
}

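// Determine which user SGPR inputs \p F needs based on its calling convention,
// "amdgpu-no-*" attributes, and subtarget features, and count how many user
// SGPRs those inputs occupy.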
GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;

  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.hasFlatScratchEnabled())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.hasFlatScratchEnabled()) &&
      // FlatScratchInit cannot be true for graphics CC if
      // hasFlatScratchEnabled() is false.
      (ST.hasFlatScratchEnabled() ||
       (!AMDGPU::isGraphics(CC) &&
        !F.hasFnAttribute("amdgpu-no-flat-scratch-init"))) &&
      !ST.hasArchitectedFlatScratch()) {
    FlatScratchInit = true;
  }

  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);

  if (hasPrivateSegmentSize())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
}

void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}

unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}