//===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the GCN specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "GCNSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSelectionDAGInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "gcn-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

static cl::opt<unsigned>
    NSAThreshold("amdgpu-nsa-threshold",
                 cl::desc("Number of addresses from which to enable MIMG NSA."),
                 cl::init(2), cl::Hidden);

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                            StringRef GPU,
                                                            StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
    // Assume wave64 for the unknown target, if not explicitly set.
    if (getWavefrontSizeLog2() == 0)
      WavefrontSizeLog2 = 6;
  } else if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
             !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    // If there is no default wave size it must be a generation before gfx10;
    // these have FeatureWavefrontSize64 in their definition already. For
    // gfx10+ set wave32 as the default.
    ToggleFeature(AMDGPU::FeatureWavefrontSize32);
    WavefrontSizeLog2 = getGeneration() >= AMDGPUSubtarget::GFX10 ? 5 : 6;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.isAMDGCN() && AddressableLocalMemorySize == 0)
    AddressableLocalMemorySize = 32768;

  LocalMemorySize = AddressableLocalMemorySize;
  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

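// Diagnose subtarget feature combinations that can never be selected;
// currently this is just asking for both wavefrontsize32 and wavefrontsize64.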
void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
  LLVMContext &Ctx = F.getContext();
  if (hasFeature(AMDGPU::FeatureWavefrontSize32) &&
      hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    Ctx.diagnose(DiagnosticInfoUnsupported(
        F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
  }
}

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);

  TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();

  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
  InlineAsmLoweringInfo =
      std::make_unique<InlineAsmLowering>(getTargetLowering());
  Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
  InstSelector =
      std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
}

const SelectionDAGTargetInfo *GCNSubtarget::getSelectionDAGInfo() const {
  return TSInfo.get();
}

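// Before GFX10 only a single constant-bus operand (SGPR or literal) is allowed
// per VALU instruction. GFX10+ generally allows two, except for the 64-bit
// shift instructions listed below, which still accept only one.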
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of zeroing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
  if (isWave32()) {
    // Fix implicit $vcc operands after MIParser has verified that they match
    // the instruction definitions.
    for (auto &MBB : MF) {
      for (auto &MI : MBB)
        InstrInfo.fixImplicitOperands(MI);
    }
  }
}

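// V_MAD_F16 is only usable if its pseudo maps to a real MC opcode on this
// subtarget.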
bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

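// Use VGPR indexing mode when movrel is not available, or when it has been
// explicitly requested with -amdgpu-vgpr-index-mode.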
bool GCNSubtarget::useVGPRIndexMode() const {
  return hasVGPRIndexMode() && (!hasMovrel() || EnableVGPRIndexMode);
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
                                                   getGeneration());
}

unsigned
GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs,
                                       unsigned DynamicVGPRBlockSize) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs,
                                                       DynamicVGPRBlockSize);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2;   // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

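// Returns the minimum and maximum achievable occupancy (waves per EU) for the
// function: start from the bounds implied by the workgroup sizes and LDS
// usage, then cap the maximum by SGPR and VGPR usage.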
std::pair<unsigned, unsigned>
GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                               unsigned NumSGPRs, unsigned NumVGPRs) const {
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  // Temporarily check both the attribute and the subtarget feature until the
  // latter is removed.
  if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
    DynamicVGPRBlockSize = getDynamicVGPRBlockSize();

  auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
  unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
  unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);

  // Maximum occupancy may be further limited by high SGPR/VGPR usage.
  MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
  return {std::min(MinOcc, MaxOcc), MaxOcc};
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  unsigned Requested =
      F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

  if (Requested != MaxNumSGPRs) {
    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

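// Upper bound on the number of preloaded SGPRs: the sum of all user, system,
// and synthetic SGPR fields. Used by the Function-only getMaxNumSGPRs overload
// below, where the actual preload count is not known.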
unsigned GCNSubtarget::getMaxNumPreloadedSGPRs() const {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1;  // private segment wave byte offset

  // Max number of synthetic SGPRs
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
  const auto &[Min, Max] = NumVGPRBounds;

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.

  unsigned Requested = F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", Max);
  if (Requested != Max && hasGFX90AInsts())
    Requested *= 2;

  // Make sure the requested value is inside the range of possible VGPR usage.
  return std::clamp(Requested, Min, Max);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is removed.
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
    DynamicVGPRBlockSize = getDynamicVGPRBlockSize();

  std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
  return getBaseMaxNumVGPRs(
      F, {getMinNumVGPRs(Waves.second, DynamicVGPRBlockSize),
          getMaxNumVGPRs(Waves.first, DynamicVGPRBlockSize)});
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  return getMaxNumVGPRs(MF.getFunction());
}

void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() ||
      !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

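  // If the def is a bundle, take the latency of the last bundled instruction
  // that writes the register, reduced by one for each bundled instruction
  // issued after it.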
  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
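    // If the use is a bundle, start from the def's latency and reduce it by
    // one for each bundled instruction issued before the first one that reads
    // the register.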
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

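// Minimum number of addresses at which to use the NSA (non-sequential address)
// form of MIMG instructions. The command-line option takes precedence over the
// "amdgpu-nsa-threshold" function attribute; both are clamped to at least 2.
// GFX12+ does not use the MIMG encoding at all.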
unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX12)
    return 0; // Not MIMG encoding.

  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(NSAThreshold.getValue(), 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);

  return NSAThreshold;
}

GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;

  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      // FlatScratchInit cannot be true for graphics CC if enableFlatScratch()
      // is false.
      (ST.enableFlatScratch() ||
       (!AMDGPU::isGraphics(CC) &&
        !F.hasFnAttribute("amdgpu-no-flat-scratch-init"))) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }

  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);

  if (hasPrivateSegmentSize())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
}

void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}

unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}