1//===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Implements the GCN specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#include "GCNSubtarget.h"
15#include "AMDGPUCallLowering.h"
16#include "AMDGPUInstructionSelector.h"
17#include "AMDGPULegalizerInfo.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUSelectionDAGInfo.h"
20#include "AMDGPUTargetMachine.h"
21#include "SIMachineFunctionInfo.h"
22#include "Utils/AMDGPUBaseInfo.h"
23#include "llvm/ADT/SmallString.h"
24#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25#include "llvm/CodeGen/MachineScheduler.h"
26#include "llvm/CodeGen/TargetFrameLowering.h"
27#include "llvm/IR/DiagnosticInfo.h"
28#include "llvm/IR/MDBuilder.h"
29#include <algorithm>
30
31using namespace llvm;
32
33#define DEBUG_TYPE "gcn-subtarget"
34
35#define GET_SUBTARGETINFO_TARGET_DESC
36#define GET_SUBTARGETINFO_CTOR
37#define AMDGPUSubtarget GCNSubtarget
38#include "AMDGPUGenSubtargetInfo.inc"
39#undef AMDGPUSubtarget
40
// Command-line override to prefer GPR-indexing mode over movrel when both are
// available; consulted by GCNSubtarget::useVGPRIndexMode().
static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(Val: false));

// Whether codegen may consult alias analysis; returned by
// GCNSubtarget::useAA().
static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(Val: true));

// Minimum number of image addresses from which MIMG NSA encoding is used;
// consulted (and clamped to >= 2) by GCNSubtarget::getNSAThreshold().
static cl::opt<unsigned>
    NSAThreshold("amdgpu-nsa-threshold",
                 cl::desc("Number of addresses from which to enable MIMG NSA."),
                 cl::init(Val: 2), cl::Hidden);
54
55GCNSubtarget::~GCNSubtarget() = default;
56
/// Parse the feature string \p FS (combined with OS/ABI-mandated defaults)
/// for processor \p GPU, then resolve interdependent subtarget fields that
/// cannot be expressed as plain TableGen features (generation fallback,
/// wavefront size default, flat-for-global, per-field defaults).
/// \returns *this so it can be used inside the constructor initializer list.
GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                            StringRef GPU,
                                                            StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by
  // default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive(Other: "+wavefrontsize")) {
    if (!FS.contains_insensitive(Other: "wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive(Other: "wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive(Other: "wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  // Append the user feature string last so user settings win over the
  // defaults added above.
  FullFS += FS;

  ParseSubtargetFeatures(CPU: GPU, /*TuneCPU*/ GPU, FS: FullFS);

  // Implement the "generic" processors, which acts as the default when no
  // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes defaults
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
    // Assume wave64 for the unknown target, if not explicitly set.
    if (getWavefrontSizeLog2() == 0)
      WavefrontSizeLog2 = 6;
  } else if (!hasFeature(Feature: AMDGPU::FeatureWavefrontSize32) &&
             !hasFeature(Feature: AMDGPU::FeatureWavefrontSize64)) {
    // If there is no default wave size it must be a generation before gfx10,
    // these have FeatureWavefrontSize64 in their definition already. For gfx10+
    // set wave32 as a default.
    ToggleFeature(FB: AMDGPU::FeatureWavefrontSize32);
    WavefrontSizeLog2 = getGeneration() >= AMDGPUSubtarget::GFX10 ? 5 : 6;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
  // support flat operations, otherwise they cannot access a 64-bit global
  // address space
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space
  if (!hasAddr64() && !FS.contains(Other: "flat-for-global") && !UseFlatForGlobal) {
    ToggleFeature(FB: AMDGPU::FeatureUseFlatForGlobal);
    UseFlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains(Other: "flat-for-global") && UseFlatForGlobal) {
    ToggleFeature(FB: AMDGPU::FeatureUseFlatForGlobal);
    UseFlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (AddressableLocalMemorySize == 0)
    AddressableLocalMemorySize = 32768;

  if (FlatOffsetBitWidth == 0)
    FlatOffsetBitWidth = 13;

  LocalMemorySize = AMDGPU::IsaInfo::getLocalMemorySize(STI: this);

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  // InstCacheLineSize is set from TableGen subtarget features
  // (FeatureInstCacheLineSize64 / FeatureInstCacheLineSize128).
  // Fall back to 64 if no feature was specified (e.g. generic targets).
  if (InstCacheLineSize == 0)
    InstCacheLineSize = 64;

  assert(llvm::isPowerOf2_32(InstCacheLineSize) &&
         "InstCacheLineSize must be a power of 2");

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}
169
170void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
171 LLVMContext &Ctx = F.getContext();
172 if (hasFeature(Feature: AMDGPU::FeatureWavefrontSize32) &&
173 hasFeature(Feature: AMDGPU::FeatureWavefrontSize64)) {
174 Ctx.diagnose(DI: DiagnosticInfoUnsupported(
175 F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
176 }
177}
178
/// Construct the subtarget for processor \p GPU with feature string \p FS.
/// Note: initializeSubtargetDependencies() is deliberately run inside the
/// initializer list (via the InstrInfo initializer) so that InstrInfo,
/// TLInfo and FrameLowering are built against the fully resolved features.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(CPU: GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(STI: this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(STI: this);

  TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();

  // GlobalISel components are created after the subtarget is fully
  // initialized; InstSelector additionally depends on RegBankInfo, so
  // construction order below matters.
  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(args: *getTargetLowering());
  InlineAsmLoweringInfo =
      std::make_unique<InlineAsmLowering>(args: getTargetLowering());
  Legalizer = std::make_unique<AMDGPULegalizerInfo>(args&: *this, args: TM);
  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(args&: *this);
  InstSelector =
      std::make_unique<AMDGPUInstructionSelector>(args&: *this, args&: *RegBankInfo, args: TM);
}
203
/// \returns the SelectionDAG target hooks owned by this subtarget (created in
/// the constructor; never null after construction).
const SelectionDAGTargetInfo *GCNSubtarget::getSelectionDAGInfo() const {
  return TSInfo.get();
}
207
/// \returns how many of instruction \p Opcode's operands may be sourced from
/// the constant bus on this subtarget. Pre-GFX10 targets allow only one;
/// GFX10+ targets allow two, except for the 64-bit shift opcodes listed
/// below, which remain limited to one.
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  // 64-bit shifts (all encodings/generations) keep the single-operand limit.
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}
234
/// This list was mostly derived from experimentation.
/// \returns true if 16-bit instruction \p Opcode writes zeros to the high
/// 16 bits of its 32-bit destination VGPR on this subtarget (as opposed to
/// preserving them). Generation-dependent: see the notes on each case group.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of 0ing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}
333
334void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
335 const SchedRegion &Region) const {
336 // Track register pressure so the scheduler can try to decrease
337 // pressure once register usage is above the threshold defined by
338 // SIRegisterInfo::getRegPressureSetLimit()
339 Policy.ShouldTrackPressure = true;
340
341 // Enabling both top down and bottom up scheduling seems to give us less
342 // register spills than just using one of these approaches on its own.
343 Policy.OnlyTopDown = false;
344 Policy.OnlyBottomUp = false;
345
346 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
347 if (!enableSIScheduler())
348 Policy.ShouldTrackLaneMasks = true;
349}
350
351void GCNSubtarget::overridePostRASchedPolicy(MachineSchedPolicy &Policy,
352 const SchedRegion &Region) const {
353 const Function &F = Region.RegionBegin->getMF()->getFunction();
354 Attribute PostRADirectionAttr = F.getFnAttribute(Kind: "amdgpu-post-ra-direction");
355 if (!PostRADirectionAttr.isValid())
356 return;
357
358 StringRef PostRADirectionStr = PostRADirectionAttr.getValueAsString();
359 if (PostRADirectionStr == "topdown") {
360 Policy.OnlyTopDown = true;
361 Policy.OnlyBottomUp = false;
362 } else if (PostRADirectionStr == "bottomup") {
363 Policy.OnlyTopDown = false;
364 Policy.OnlyBottomUp = true;
365 } else if (PostRADirectionStr == "bidirectional") {
366 Policy.OnlyTopDown = false;
367 Policy.OnlyBottomUp = false;
368 } else {
369 DiagnosticInfoOptimizationFailure Diag(
370 F, F.getSubprogram(), "invalid value for postRA direction attribute");
371 F.getContext().diagnose(DI: Diag);
372 }
373
374 LLVM_DEBUG({
375 const char *DirStr = "default";
376 if (Policy.OnlyTopDown && !Policy.OnlyBottomUp)
377 DirStr = "topdown";
378 else if (!Policy.OnlyTopDown && Policy.OnlyBottomUp)
379 DirStr = "bottomup";
380 else if (!Policy.OnlyTopDown && !Policy.OnlyBottomUp)
381 DirStr = "bidirectional";
382
383 dbgs() << "Post-MI-sched direction (" << F.getName() << "): " << DirStr
384 << '\n';
385 });
386}
387
388void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
389 if (isWave32()) {
390 // Fix implicit $vcc operands after MIParser has verified that they match
391 // the instruction definitions.
392 for (auto &MBB : MF) {
393 for (auto &MI : MBB)
394 InstrInfo.fixImplicitOperands(MI);
395 }
396 }
397}
398
/// \returns true if V_MAD_F16 is selectable on this subtarget, i.e. its
/// pseudo maps to a real MC opcode.
bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(Opcode: AMDGPU::V_MAD_F16_e64) != -1;
}
402
/// \returns true if dynamic VGPR indexing should use GPR indexing mode:
/// the target must support it, and either lack movrel or have indexing mode
/// forced via -amdgpu-vgpr-index-mode.
bool GCNSubtarget::useVGPRIndexMode() const {
  return hasVGPRIndexMode() && (!hasMovrel() || EnableVGPRIndexMode);
}
406
407bool GCNSubtarget::useAA() const { return UseAA; }
408
/// \returns the waves-per-EU occupancy achievable when each wave uses
/// \p SGPRs scalar registers, capped by this subtarget's max waves per EU.
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, MaxWaves: getMaxWavesPerEU(),
                                                   Gen: getGeneration());
}
413
/// \returns the waves-per-EU occupancy achievable when each wave uses
/// \p NumVGPRs vector registers. \p DynamicVGPRBlockSize is the dynamic-VGPR
/// allocation granule (0 when dynamic VGPRs are not in use).
unsigned
GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs,
                                       unsigned DynamicVGPRBlockSize) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(STI: this, NumVGPRs,
                                                       DynamicVGPRBlockSize);
}
420
/// \returns the number of SGPRs reserved for special registers (VCC, and,
/// generation-permitting, FLAT_SCRATCH and XNACK) and therefore unavailable
/// to the allocator. \p HasFlatScratch indicates flat-scratch initialization
/// is in use.
unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}
437
/// \returns reserved SGPR count for \p MF, using the function's actual
/// flat-scratch-init user SGPR usage.
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(HasFlatScratch: MFI.getUserSGPRInfo().hasFlatScratchInit());
}
442
/// \returns a conservative reserved SGPR count for \p F, before any machine
/// function info exists.
unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve SGPR pair used for flat_scratch if
  // we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(HasFlatScratch: KernelUsesFlatScratch);
}
450
/// \returns the {min, max} waves-per-EU occupancy for \p F given \p LDSSize
/// bytes of LDS and per-wave register usage of \p NumSGPRs / \p NumVGPRs.
/// The maximum from workgroup sizes is further capped by register pressure.
std::pair<unsigned, unsigned>
GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                               unsigned NumSGPRs, unsigned NumVGPRs) const {
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  // Temporarily check both the attribute and the subtarget feature until the
  // latter is removed.
  if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
    DynamicVGPRBlockSize = getDynamicVGPRBlockSize();

  auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSBytes: LDSSize, F);
  unsigned SGPROcc = getOccupancyWithNumSGPRs(SGPRs: NumSGPRs);
  unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);

  // Maximum occupancy may be further limited by high SGPR/VGPR usage.
  MaxOcc = std::min(a: MaxOcc, b: std::min(a: SGPROcc, b: VGPROcc));
  // Keep the invariant min <= max even after the register-pressure cap.
  return {std::min(a: MinOcc, b: MaxOcc), MaxOcc};
}
468
/// \returns the maximum number of SGPRs \p F may allocate, honoring the
/// "amdgpu-num-sgpr" attribute where valid, the \p WavesPerEU bounds, the
/// \p PreloadedSGPRs required for user/system inputs, and \p ReservedNumSGPRs
/// held back for special registers.
unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU: WavesPerEU.first, Addressable: false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU: WavesPerEU.first, Addressable: true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  unsigned Requested =
      F.getFnAttributeAsParsedInteger(Kind: "amdgpu-num-sgpr", Default: MaxNumSGPRs);

  if (Requested != MaxNumSGPRs) {
    // Make sure requested value does not violate subtarget's specifications.
    // A zeroed Requested means "ignore the attribute".
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU: WavesPerEU.first, Addressable: false))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumSGPRs(WavesPerEU: WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(a: MaxNumSGPRs - ReservedNumSGPRs, b: MaxAddressableNumSGPRs);
}
515
/// \returns max allocatable SGPRs for \p MF, using the machine function's
/// actual waves-per-EU and preloaded-SGPR counts.
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, WavesPerEU: MFI.getWavesPerEU(), PreloadedSGPRs: MFI.getNumPreloadedSGPRs(),
                            ReservedNumSGPRs: getReservedNumSGPRs(MF));
}
522
523unsigned GCNSubtarget::getMaxNumPreloadedSGPRs() const {
524 using USI = GCNUserSGPRUsageInfo;
525 // Max number of user SGPRs
526 const unsigned MaxUserSGPRs =
527 USI::getNumUserSGPRForField(ID: USI::PrivateSegmentBufferID) +
528 USI::getNumUserSGPRForField(ID: USI::DispatchPtrID) +
529 USI::getNumUserSGPRForField(ID: USI::QueuePtrID) +
530 USI::getNumUserSGPRForField(ID: USI::KernargSegmentPtrID) +
531 USI::getNumUserSGPRForField(ID: USI::DispatchIdID) +
532 USI::getNumUserSGPRForField(ID: USI::FlatScratchInitID) +
533 USI::getNumUserSGPRForField(ID: USI::ImplicitBufferPtrID);
534
535 // Max number of system SGPRs
536 const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
537 1 + // WorkGroupIDY
538 1 + // WorkGroupIDZ
539 1 + // WorkGroupInfo
540 1; // private segment wave byte offset
541
542 // Max number of synthetic SGPRs
543 const unsigned SyntheticSGPRs = 1; // LDSKernelId
544
545 return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
546}
547
/// \returns max allocatable SGPRs for \p F, conservatively assuming the
/// maximum possible number of preloaded SGPRs.
unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, WavesPerEU: getWavesPerEU(F), PreloadedSGPRs: getMaxNumPreloadedSGPRs(),
                            ReservedNumSGPRs: getReservedNumSGPRs(F));
}
552
553unsigned GCNSubtarget::getBaseMaxNumVGPRs(
554 const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
555 const auto [Min, Max] = NumVGPRBounds;
556
557 // Check if maximum number of VGPRs was explicitly requested using
558 // "amdgpu-num-vgpr" attribute.
559
560 unsigned Requested = F.getFnAttributeAsParsedInteger(Kind: "amdgpu-num-vgpr", Default: Max);
561 if (Requested != Max && hasGFX90AInsts())
562 Requested *= 2;
563
564 // Make sure requested value is inside the range of possible VGPR usage.
565 return std::clamp(val: Requested, lo: Min, hi: Max);
566}
567
/// \returns the maximum number of VGPRs \p F may use, derived from its
/// waves-per-EU bounds and the dynamic-VGPR block size (if any).
unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is removed.
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
    DynamicVGPRBlockSize = getDynamicVGPRBlockSize();

  // Max waves (Waves.second) gives the tightest (minimum) VGPR bound; min
  // waves (Waves.first) gives the loosest (maximum) bound.
  std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
  return getBaseMaxNumVGPRs(
      F, NumVGPRBounds: {getMinNumVGPRs(WavesPerEU: Waves.second, DynamicVGPRBlockSize),
          getMaxNumVGPRs(WavesPerEU: Waves.first, DynamicVGPRBlockSize)});
}
580
/// Convenience overload: forwards to the IR-function variant.
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  return getMaxNumVGPRs(F: MF.getFunction());
}
584
/// \returns the {max VGPRs, max AGPRs} budget for \p F, splitting the total
/// vector-register budget between architectural VGPRs and AGPRs according to
/// the target generation and the "amdgpu-agpr-alloc" attribute.
std::pair<unsigned, unsigned>
GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
  const unsigned MaxVectorRegs = getMaxNumVGPRs(F);

  unsigned MaxNumVGPRs = MaxVectorRegs;
  unsigned MaxNumAGPRs = 0;
  unsigned NumArchVGPRs = getAddressableNumArchVGPRs();

  // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
  // a wave may have up to 512 total vector registers combining together both
  // VGPRs and AGPRs. Hence, in an entry function without calls and without
  // AGPRs used within it, it is possible to use the whole vector register
  // budget for VGPRs.
  //
  // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
  // register file accordingly.
  if (hasGFX90AInsts()) {
    unsigned MinNumAGPRs = 0;
    const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();

    // Sentinel meaning "attribute absent".
    const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};

    // TODO: The lower bound should probably force the number of required
    // registers up, overriding amdgpu-waves-per-eu.
    std::tie(args&: MinNumAGPRs, args&: MaxNumAGPRs) =
        AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-agpr-alloc", Default: DefaultNumAGPR,
                                        /*OnlyFirstRequired=*/true);

    if (MinNumAGPRs == DefaultNumAGPR.first) {
      // Default to splitting half the registers if AGPRs are required.
      MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
    } else {
      // Align to accum_offset's allocation granularity.
      MinNumAGPRs = alignTo(Value: MinNumAGPRs, Align: 4);

      MinNumAGPRs = std::min(a: MinNumAGPRs, b: TotalNumAGPRs);
    }

    // Clamp values to be inbounds of our limits, and ensure min <= max.

    MaxNumAGPRs = std::min(a: std::max(a: MinNumAGPRs, b: MaxNumAGPRs), b: MaxVectorRegs);
    MinNumAGPRs = std::min(a: std::min(a: MinNumAGPRs, b: TotalNumAGPRs), b: MaxNumAGPRs);

    MaxNumVGPRs = std::min(a: MaxVectorRegs - MinNumAGPRs, b: NumArchVGPRs);
    MaxNumAGPRs = std::min(a: MaxVectorRegs - MaxNumVGPRs, b: MaxNumAGPRs);

    assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
           MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= NumArchVGPRs &&
           "invalid register counts");
  } else if (hasMAIInsts()) {
    // On gfx908 the number of AGPRs always equals the number of VGPRs.
    MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
  }

  return std::pair(MaxNumVGPRs, MaxNumAGPRs);
}
641
/// Adjust the latency of the data dependency \p Dep between \p Def and
/// \p Use. For bundles, the bundle-level latency is refined by walking the
/// bundled instructions to find where the register is actually written/read;
/// a VCC_LO special case works around operand-count mismatches after
/// fixImplicitOperands.
void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  // Only register data dependencies between real instructions are adjusted.
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() ||
      !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    // Walk the bundle: restart the latency count at the last instruction
    // that writes Reg, and count down for instructions issued after it.
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->isMetaInstruction())
        continue;
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(ItinData: getInstrItineraryData(), MI: *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    // Start from the def's full latency and subtract one per bundled
    // instruction issued before the first reader of Reg.
    unsigned Lat = InstrInfo.getInstrLatency(ItinData: getInstrItineraryData(), MI: *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->isMetaInstruction())
        continue;
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefMI: DefI, DefOperIdx: DefOpIdx, UseMI: UseI, UseOperIdx: UseOpIdx));
  }
}
690
/// \returns the minimum number of image addresses at which MIMG NSA encoding
/// is used for \p MF, or 0 when the target has no MIMG encoding (GFX12+).
/// Priority: -amdgpu-nsa-threshold flag, then the function attribute, then
/// the flag's default; explicit values are clamped to >= 2.
unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX12)
    return 0; // Not MIMG encoding.

  // An explicit command-line setting takes precedence over the attribute.
  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(a: NSAThreshold.getValue(), b: 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      Kind: "amdgpu-nsa-threshold", Default: -1);
  if (Value > 0)
    return std::max(a: Value, b: 2);

  return NSAThreshold;
}
705
/// Determine which user SGPR inputs \p F needs on subtarget \p ST, based on
/// its calling convention, OS/ABI, and "amdgpu-no-*" attributes, and total
/// up the number of user SGPRs those inputs occupy.
GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;

  // Kernels need the kernarg segment pointer if they take any explicit or
  // implicit arguments.
  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.hasFlatScratchEnabled())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  // Compute-only inputs, each suppressible via an "amdgpu-no-*" attribute.
  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute(Kind: "amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute(Kind: "amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute(Kind: "amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.hasFlatScratchEnabled()) &&
      // FlatScratchInit cannot be true for graphics CC if
      // hasFlatScratchEnabled() is false.
      (ST.hasFlatScratchEnabled() ||
       (!AMDGPU::isGraphics(CC) &&
        !F.hasFnAttribute(Kind: "amdgpu-no-flat-scratch-init"))) &&
      !ST.hasArchitectedFlatScratch()) {
    FlatScratchInit = true;
  }

  // Total the SGPRs occupied by every enabled field.
  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ID: ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(ID: PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ID: DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ID: QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ID: KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(ID: DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(ID: FlatScratchInitID);

  if (hasPrivateSegmentSize())
    NumUsedUserSGPRs += getNumUserSGPRForField(ID: PrivateSegmentSizeID);
}
769
/// Reserve \p NumSGPRs additional user SGPRs for kernarg preloading.
/// Callers must not exceed the target's user SGPR budget (asserted).
void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}
775
/// \returns how many user SGPRs remain unallocated on this subtarget.
unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(STI: ST) - NumUsedUserSGPRs;
}
779