1//===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Implements the GCN specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#include "GCNSubtarget.h"
15#include "AMDGPUCallLowering.h"
16#include "AMDGPUInstructionSelector.h"
17#include "AMDGPULegalizerInfo.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUSelectionDAGInfo.h"
20#include "AMDGPUTargetMachine.h"
21#include "SIMachineFunctionInfo.h"
22#include "Utils/AMDGPUBaseInfo.h"
23#include "llvm/ADT/SmallString.h"
24#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25#include "llvm/CodeGen/MachineScheduler.h"
26#include "llvm/CodeGen/TargetFrameLowering.h"
27#include "llvm/IR/DiagnosticInfo.h"
28#include "llvm/IR/MDBuilder.h"
29#include <algorithm>
30
31using namespace llvm;
32
33#define DEBUG_TYPE "gcn-subtarget"
34
35#define GET_SUBTARGETINFO_TARGET_DESC
36#define GET_SUBTARGETINFO_CTOR
37#define AMDGPUSubtarget GCNSubtarget
38#include "AMDGPUGenSubtargetInfo.inc"
39#undef AMDGPUSubtarget
40
// Command-line knob to prefer GPR-indexing mode over movrel for vector
// indexing on subtargets that support both (consulted by useVGPRIndexMode()).
static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(Val: false));

// Whether codegen is allowed to query alias analysis (returned by useAA()).
static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(Val: true));

// Minimum number of addresses at which the MIMG NSA encoding is used;
// consulted by getNSAThreshold(), which clamps it to at least 2.
static cl::opt<unsigned>
    NSAThreshold("amdgpu-nsa-threshold",
                 cl::desc("Number of addresses from which to enable MIMG NSA."),
                 cl::init(Val: 2), cl::Hidden);
54
55GCNSubtarget::~GCNSubtarget() = default;
56
// Build the final feature string for this subtarget, parse it, and then fix
// up derived state that plain subtarget features cannot express: generic
// processors, the default wave size, flat-for-global fallbacks, and default
// sizes/limits. Returns *this so the constructor can chain it when
// initializing InstrInfo.
GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                            StringRef GPU,
                                                            StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by
  // default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive(Other: "+wavefrontsize")) {
    if (!FS.contains_insensitive(Other: "wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive(Other: "wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive(Other: "wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  // User-specified features are appended last so they take precedence over
  // the defaults added above.
  FullFS += FS;

  ParseSubtargetFeatures(CPU: GPU, /*TuneCPU*/ GPU, FS: FullFS);

  // Implement the "generic" processors, which acts as the default when no
  // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes defaults
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
    // Assume wave64 for the unknown target, if not explicitly set.
    if (getWavefrontSizeLog2() == 0)
      WavefrontSizeLog2 = 6;
  } else if (!hasFeature(Feature: AMDGPU::FeatureWavefrontSize32) &&
             !hasFeature(Feature: AMDGPU::FeatureWavefrontSize64)) {
    // If there is no default wave size it must be a generation before gfx10,
    // these have FeatureWavefrontSize64 in their definition already. For gfx10+
    // set wave32 as a default.
    ToggleFeature(FB: AMDGPU::FeatureWavefrontSize32);
    WavefrontSizeLog2 = getGeneration() >= AMDGPUSubtarget::GFX10 ? 5 : 6;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
  // support flat operations, otherwise they cannot access a 64-bit global
  // address space
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space
  if (!hasAddr64() && !FS.contains(Other: "flat-for-global") && !UseFlatForGlobal) {
    ToggleFeature(FB: AMDGPU::FeatureUseFlatForGlobal);
    UseFlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains(Other: "flat-for-global") && UseFlatForGlobal) {
    ToggleFeature(FB: AMDGPU::FeatureUseFlatForGlobal);
    UseFlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (AddressableLocalMemorySize == 0)
    AddressableLocalMemorySize = 32768;

  if (FlatOffsetBitWidth == 0)
    FlatOffsetBitWidth = 13;

  LocalMemorySize = AMDGPU::IsaInfo::getLocalMemorySize(STI: this);

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  // InstCacheLineSize is set from TableGen subtarget features
  // (FeatureInstCacheLineSize64 / FeatureInstCacheLineSize128).
  // Fall back to 64 if no feature was specified (e.g. generic targets).
  if (InstCacheLineSize == 0)
    InstCacheLineSize = 64;

  assert(llvm::isPowerOf2_32(InstCacheLineSize) &&
         "InstCacheLineSize must be a power of 2");

  // Derive the xnack/sramecc settings from the final feature string.
  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}
169
170void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
171 LLVMContext &Ctx = F.getContext();
172 if (hasFeature(Feature: AMDGPU::FeatureWavefrontSize32) &&
173 hasFeature(Feature: AMDGPU::FeatureWavefrontSize64)) {
174 Ctx.diagnose(DI: DiagnosticInfoUnsupported(
175 F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
176 }
177}
178
// Construct the subtarget. The init list resolves all subtarget features via
// initializeSubtargetDependencies() before InstrInfo/TLInfo are built; the
// body then wires up the SelectionDAG info and the GlobalISel pipeline
// objects (call lowering, inline-asm lowering, legalizer, register banks,
// instruction selector).
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(CPU: GPU)),
    // Feature parsing must happen before InstrInfo/TLInfo consult features.
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(STI: this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(STI: this);

  TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();

  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(args: *getTargetLowering());
  InlineAsmLoweringInfo =
      std::make_unique<InlineAsmLowering>(args: getTargetLowering());
  Legalizer = std::make_unique<AMDGPULegalizerInfo>(args&: *this, args: TM);
  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(args&: *this);
  InstSelector =
      std::make_unique<AMDGPUInstructionSelector>(args&: *this, args&: *RegBankInfo, args: TM);
}
203
// The SelectionDAG target info is owned by this subtarget (TSInfo); hand out
// a non-owning pointer.
const SelectionDAGTargetInfo *GCNSubtarget::getSelectionDAGInfo() const {
  return TSInfo.get();
}
207
208unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
209 if (getGeneration() < GFX10)
210 return 1;
211
212 switch (Opcode) {
213 case AMDGPU::V_LSHLREV_B64_e64:
214 case AMDGPU::V_LSHLREV_B64_gfx10:
215 case AMDGPU::V_LSHLREV_B64_e64_gfx11:
216 case AMDGPU::V_LSHLREV_B64_e32_gfx12:
217 case AMDGPU::V_LSHLREV_B64_e64_gfx12:
218 case AMDGPU::V_LSHL_B64_e64:
219 case AMDGPU::V_LSHRREV_B64_e64:
220 case AMDGPU::V_LSHRREV_B64_gfx10:
221 case AMDGPU::V_LSHRREV_B64_e64_gfx11:
222 case AMDGPU::V_LSHRREV_B64_e64_gfx12:
223 case AMDGPU::V_LSHR_B64_e64:
224 case AMDGPU::V_ASHRREV_I64_e64:
225 case AMDGPU::V_ASHRREV_I64_gfx10:
226 case AMDGPU::V_ASHRREV_I64_e64_gfx11:
227 case AMDGPU::V_ASHRREV_I64_e64_gfx12:
228 case AMDGPU::V_ASHR_I64_e64:
229 return 1;
230 }
231
232 return 2;
233}
234
235/// This list was mostly derived from experimentation.
/// This list was mostly derived from experimentation.
/// Return true if \p Opcode writes zero to the unused high 16 bits of its
/// 32-bit destination on this subtarget (as opposed to preserving them).
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of 0ing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  // The MAD_MIX halves never zero the other half; fall through to default.
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}
333
334void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
335 const SchedRegion &Region) const {
336 // Track register pressure so the scheduler can try to decrease
337 // pressure once register usage is above the threshold defined by
338 // SIRegisterInfo::getRegPressureSetLimit()
339 Policy.ShouldTrackPressure = true;
340
341 const Function &F = Region.RegionBegin->getMF()->getFunction();
342 if (AMDGPU::getSchedStrategy(F) == "coexec") {
343 Policy.OnlyTopDown = true;
344 Policy.OnlyBottomUp = false;
345 return;
346 }
347
348 // Enabling both top down and bottom up scheduling seems to give us less
349 // register spills than just using one of these approaches on its own.
350 Policy.OnlyTopDown = false;
351 Policy.OnlyBottomUp = false;
352
353 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
354 if (!enableSIScheduler())
355 Policy.ShouldTrackLaneMasks = true;
356}
357
358void GCNSubtarget::overridePostRASchedPolicy(MachineSchedPolicy &Policy,
359 const SchedRegion &Region) const {
360 const Function &F = Region.RegionBegin->getMF()->getFunction();
361 Attribute PostRADirectionAttr = F.getFnAttribute(Kind: "amdgpu-post-ra-direction");
362 if (!PostRADirectionAttr.isValid())
363 return;
364
365 StringRef PostRADirectionStr = PostRADirectionAttr.getValueAsString();
366 if (PostRADirectionStr == "topdown") {
367 Policy.OnlyTopDown = true;
368 Policy.OnlyBottomUp = false;
369 } else if (PostRADirectionStr == "bottomup") {
370 Policy.OnlyTopDown = false;
371 Policy.OnlyBottomUp = true;
372 } else if (PostRADirectionStr == "bidirectional") {
373 Policy.OnlyTopDown = false;
374 Policy.OnlyBottomUp = false;
375 } else {
376 DiagnosticInfoOptimizationFailure Diag(
377 F, F.getSubprogram(), "invalid value for postRA direction attribute");
378 F.getContext().diagnose(DI: Diag);
379 }
380
381 LLVM_DEBUG({
382 const char *DirStr = "default";
383 if (Policy.OnlyTopDown && !Policy.OnlyBottomUp)
384 DirStr = "topdown";
385 else if (!Policy.OnlyTopDown && Policy.OnlyBottomUp)
386 DirStr = "bottomup";
387 else if (!Policy.OnlyTopDown && !Policy.OnlyBottomUp)
388 DirStr = "bidirectional";
389
390 dbgs() << "Post-MI-sched direction (" << F.getName() << "): " << DirStr
391 << '\n';
392 });
393}
394
395void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
396 if (isWave32()) {
397 // Fix implicit $vcc operands after MIParser has verified that they match
398 // the instruction definitions.
399 for (auto &MBB : MF) {
400 for (auto &MI : MBB)
401 InstrInfo.fixImplicitOperands(MI);
402 }
403 }
404}
405
// V_MAD_F16 is available iff its pseudo maps to a real MC opcode on this
// subtarget.
bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(Opcode: AMDGPU::V_MAD_F16_e64) != -1;
}
409
// Use GPR-indexing mode when the hardware supports it and either movrel is
// unavailable or the user forced it via -amdgpu-vgpr-index-mode.
bool GCNSubtarget::useVGPRIndexMode() const {
  return hasVGPRIndexMode() && (!hasMovrel() || EnableVGPRIndexMode);
}
413
414bool GCNSubtarget::useAA() const { return UseAA; }
415
// Occupancy (waves/EU) achievable when \p SGPRs SGPRs are in use, per the
// ISA info tables for this generation.
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, MaxWaves: getMaxWavesPerEU(),
                                                   Gen: getGeneration());
}
420
// Occupancy (waves/EU) achievable when \p NumVGPRs VGPRs are in use, given
// the dynamic-VGPR block size (0 = dynamic VGPRs disabled).
unsigned
GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs,
                                       unsigned DynamicVGPRBlockSize) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(STI: this, NumVGPRs,
                                                       DynamicVGPRBlockSize);
}
427
428unsigned
429GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
430 if (getGeneration() >= AMDGPUSubtarget::GFX10)
431 return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
432
433 if (HasFlatScratch || HasArchitectedFlatScratch) {
434 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
435 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
436 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
437 return 4; // FLAT_SCRATCH, VCC (in that order).
438 }
439
440 if (isXNACKEnabled())
441 return 4; // XNACK, VCC (in that order).
442 return 2; // VCC.
443}
444
// Reserved SGPR count for a concrete machine function: flat scratch usage is
// known exactly from the user-SGPR info.
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(HasFlatScratch: MFI.getUserSGPRInfo().hasFlatScratchInit());
}
449
// Reserved SGPR count estimated from the IR function only (no MachineFunction
// yet), so flat scratch usage must be assumed conservatively.
unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve SGPR pair used for flat_scratch if
  // we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(HasFlatScratch: KernelUsesFlatScratch);
}
457
458std::pair<unsigned, unsigned>
459GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
460 unsigned NumSGPRs, unsigned NumVGPRs) const {
461 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
462 // Temporarily check both the attribute and the subtarget feature until the
463 // latter is removed.
464 if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
465 DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
466
467 auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSBytes: LDSSize, F);
468 unsigned SGPROcc = getOccupancyWithNumSGPRs(SGPRs: NumSGPRs);
469 unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);
470
471 // Maximum occupancy may be further limited by high SGPR/VGPR usage.
472 MaxOcc = std::min(a: MaxOcc, b: std::min(a: SGPROcc, b: VGPROcc));
473 return {std::min(a: MinOcc, b: MaxOcc), MaxOcc};
474}
475
// Compute the maximum number of SGPRs the function may use. An explicit
// "amdgpu-num-sgpr" attribute is honored only when it is compatible with the
// reserved registers, the preloaded input SGPRs, and the waves-per-EU
// bounds; otherwise it is discarded (Requested = 0 means "ignored").
unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU: WavesPerEU.first, Addressable: false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU: WavesPerEU.first, Addressable: true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  unsigned Requested =
      F.getFnAttributeAsParsedInteger(Kind: "amdgpu-num-sgpr", Default: MaxNumSGPRs);

  if (Requested != MaxNumSGPRs) {
    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU: WavesPerEU.first, Addressable: false))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumSGPRs(WavesPerEU: WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware with the SGPR-init bug must always allocate a fixed count.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(a: MaxNumSGPRs - ReservedNumSGPRs, b: MaxAddressableNumSGPRs);
}
522
// Maximum SGPR count for a concrete machine function, using its exact
// waves-per-EU range and preloaded SGPR count.
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, WavesPerEU: MFI.getWavesPerEU(), PreloadedSGPRs: MFI.getNumPreloadedSGPRs(),
                            ReservedNumSGPRs: getReservedNumSGPRs(MF));
}
529
530unsigned GCNSubtarget::getMaxNumPreloadedSGPRs() const {
531 using USI = GCNUserSGPRUsageInfo;
532 // Max number of user SGPRs
533 const unsigned MaxUserSGPRs =
534 USI::getNumUserSGPRForField(ID: USI::PrivateSegmentBufferID) +
535 USI::getNumUserSGPRForField(ID: USI::DispatchPtrID) +
536 USI::getNumUserSGPRForField(ID: USI::QueuePtrID) +
537 USI::getNumUserSGPRForField(ID: USI::KernargSegmentPtrID) +
538 USI::getNumUserSGPRForField(ID: USI::DispatchIdID) +
539 USI::getNumUserSGPRForField(ID: USI::FlatScratchInitID) +
540 USI::getNumUserSGPRForField(ID: USI::ImplicitBufferPtrID);
541
542 // Max number of system SGPRs
543 const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
544 1 + // WorkGroupIDY
545 1 + // WorkGroupIDZ
546 1 + // WorkGroupInfo
547 1; // private segment wave byte offset
548
549 // Max number of synthetic SGPRs
550 const unsigned SyntheticSGPRs = 1; // LDSKernelId
551
552 return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
553}
554
// Maximum SGPR count estimated from the IR function only; preloaded SGPRs
// are not known yet, so the worst case is assumed.
unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, WavesPerEU: getWavesPerEU(F), PreloadedSGPRs: getMaxNumPreloadedSGPRs(),
                            ReservedNumSGPRs: getReservedNumSGPRs(F));
}
559
560unsigned GCNSubtarget::getBaseMaxNumVGPRs(
561 const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
562 const auto [Min, Max] = NumVGPRBounds;
563
564 // Check if maximum number of VGPRs was explicitly requested using
565 // "amdgpu-num-vgpr" attribute.
566
567 unsigned Requested = F.getFnAttributeAsParsedInteger(Kind: "amdgpu-num-vgpr", Default: Max);
568 if (Requested != Max && hasGFX90AInsts())
569 Requested *= 2;
570
571 // Make sure requested value is inside the range of possible VGPR usage.
572 return std::clamp(val: Requested, lo: Min, hi: Max);
573}
574
575unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
576 // Temporarily check both the attribute and the subtarget feature, until the
577 // latter is removed.
578 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
579 if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
580 DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
581
582 std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
583 return getBaseMaxNumVGPRs(
584 F, NumVGPRBounds: {getMinNumVGPRs(WavesPerEU: Waves.second, DynamicVGPRBlockSize),
585 getMaxNumVGPRs(WavesPerEU: Waves.first, DynamicVGPRBlockSize)});
586}
587
// Maximum VGPR count for a machine function; delegates to the IR-function
// overload.
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  return getMaxNumVGPRs(F: MF.getFunction());
}
591
// Split the function's vector-register budget into {max VGPRs, max AGPRs}.
// On GFX90A+ the split is controlled by the "amdgpu-agpr-alloc" attribute
// (defaulting to half/half); on gfx908 AGPR and VGPR counts are equal; on
// targets without MAI instructions no AGPRs are available.
std::pair<unsigned, unsigned>
GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
  const unsigned MaxVectorRegs = getMaxNumVGPRs(F);

  unsigned MaxNumVGPRs = MaxVectorRegs;
  unsigned MaxNumAGPRs = 0;
  unsigned NumArchVGPRs = getAddressableNumArchVGPRs();

  // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
  // a wave may have up to 512 total vector registers combining together both
  // VGPRs and AGPRs. Hence, in an entry function without calls and without
  // AGPRs used within it, it is possible to use the whole vector register
  // budget for VGPRs.
  //
  // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
  // register file accordingly.
  if (hasGFX90AInsts()) {
    unsigned MinNumAGPRs = 0;
    const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();

    // Sentinel meaning "no attribute value given".
    const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};

    // TODO: The lower bound should probably force the number of required
    // registers up, overriding amdgpu-waves-per-eu.
    std::tie(args&: MinNumAGPRs, args&: MaxNumAGPRs) =
        AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-agpr-alloc", Default: DefaultNumAGPR,
                                        /*OnlyFirstRequired=*/true);

    if (MinNumAGPRs == DefaultNumAGPR.first) {
      // Default to splitting half the registers if AGPRs are required.
      MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
    } else {
      // Align to accum_offset's allocation granularity.
      MinNumAGPRs = alignTo(Value: MinNumAGPRs, Align: 4);

      MinNumAGPRs = std::min(a: MinNumAGPRs, b: TotalNumAGPRs);
    }

    // Clamp values to be inbounds of our limits, and ensure min <= max.

    MaxNumAGPRs = std::min(a: std::max(a: MinNumAGPRs, b: MaxNumAGPRs), b: MaxVectorRegs);
    MinNumAGPRs = std::min(a: std::min(a: MinNumAGPRs, b: TotalNumAGPRs), b: MaxNumAGPRs);

    // VGPRs get whatever the minimum AGPR reservation leaves over; AGPRs may
    // then use any budget the VGPRs do not.
    MaxNumVGPRs = std::min(a: MaxVectorRegs - MinNumAGPRs, b: NumArchVGPRs);
    MaxNumAGPRs = std::min(a: MaxVectorRegs - MaxNumVGPRs, b: MaxNumAGPRs);

    assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
           MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= NumArchVGPRs &&
           "invalid register counts");
  } else if (hasMAIInsts()) {
    // On gfx908 the number of AGPRs always equals the number of VGPRs.
    MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
  }

  return std::pair(MaxNumVGPRs, MaxNumAGPRs);
}
648
// Refine the latency of physical-register data dependencies that involve
// bundles, and fix up zero-latency VCC_LO dependencies created by implicit
// operand rewriting.
void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  // Only data dependencies on a physical register between real instructions
  // are adjusted here.
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() ||
      !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    // Def is a bundle header: use the latency of the last instruction in the
    // bundle that writes Reg, reduced by one for each (non-meta) instruction
    // issued after it within the bundle.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->isMetaInstruction())
        continue;
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(ItinData: getInstrItineraryData(), MI: *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    // Use is a bundle header: start from the def's latency and subtract one
    // for each (non-meta) bundled instruction issued before the first reader
    // of Reg, stopping early if the latency is exhausted.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(ItinData: getInstrItineraryData(), MI: *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->isMetaInstruction())
        continue;
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefMI: DefI, DefOperIdx: DefOpIdx, UseMI: UseI, UseOperIdx: UseOpIdx));
  }
}
697
698unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
699 if (getGeneration() >= AMDGPUSubtarget::GFX12)
700 return 0; // Not MIMG encoding.
701
702 if (NSAThreshold.getNumOccurrences() > 0)
703 return std::max(a: NSAThreshold.getValue(), b: 2u);
704
705 int Value = MF.getFunction().getFnAttributeAsParsedInteger(
706 Kind: "amdgpu-nsa-threshold", Default: -1);
707 if (Value > 0)
708 return std::max(a: Value, b: 2);
709
710 return NSAThreshold;
711}
712
// Determine which user/system SGPR inputs \p F requires based on its calling
// convention, attributes, and the subtarget, then count the user SGPRs they
// occupy in NumUsedUserSGPRs.
GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;

  // Kernels need the kernarg segment pointer when they have explicit or
  // implicit arguments.
  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.hasFlatScratchEnabled())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  // Compute-like conventions get the dispatch/queue pointers and dispatch id
  // unless the corresponding "amdgpu-no-*" attribute opts out.
  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute(Kind: "amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute(Kind: "amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute(Kind: "amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.hasFlatScratchEnabled()) &&
      // FlatScratchInit cannot be true for graphics CC if
      // hasFlatScratchEnabled() is false.
      (ST.hasFlatScratchEnabled() ||
       (!AMDGPU::isGraphics(CC) &&
        !F.hasFnAttribute(Kind: "amdgpu-no-flat-scratch-init"))) &&
      !ST.hasArchitectedFlatScratch()) {
    FlatScratchInit = true;
  }

  // Accumulate the user SGPR footprint of every field selected above.
  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ID: ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(ID: PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ID: DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ID: QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ID: KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(ID: DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(ID: FlatScratchInitID);

  if (hasPrivateSegmentSize())
    NumUsedUserSGPRs += getNumUserSGPRForField(ID: PrivateSegmentSizeID);
}
776
// Reserve \p NumSGPRs additional user SGPRs for kernarg preloading; must not
// exceed the subtarget's user-SGPR budget.
void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}
782
// User SGPRs still unallocated out of the subtarget's maximum.
unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(STI: ST) - NumUsedUserSGPRs;
}
786