1//===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Implements the GCN specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#include "GCNSubtarget.h"
15#include "AMDGPUCallLowering.h"
16#include "AMDGPUInstructionSelector.h"
17#include "AMDGPULegalizerInfo.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUSelectionDAGInfo.h"
20#include "AMDGPUTargetMachine.h"
21#include "SIMachineFunctionInfo.h"
22#include "Utils/AMDGPUBaseInfo.h"
23#include "llvm/ADT/SmallString.h"
24#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25#include "llvm/CodeGen/MachineScheduler.h"
26#include "llvm/CodeGen/TargetFrameLowering.h"
27#include "llvm/IR/DiagnosticInfo.h"
28#include "llvm/IR/MDBuilder.h"
29#include <algorithm>
30
31using namespace llvm;
32
33#define DEBUG_TYPE "gcn-subtarget"
34
35#define GET_SUBTARGETINFO_TARGET_DESC
36#define GET_SUBTARGETINFO_CTOR
37#define AMDGPUSubtarget GCNSubtarget
38#include "AMDGPUGenSubtargetInfo.inc"
39#undef AMDGPUSubtarget
40
41static cl::opt<bool> EnableVGPRIndexMode(
42 "amdgpu-vgpr-index-mode",
43 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
44 cl::init(Val: false));
45
46static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
47 cl::desc("Enable the use of AA during codegen."),
48 cl::init(Val: true));
49
50static cl::opt<unsigned>
51 NSAThreshold("amdgpu-nsa-threshold",
52 cl::desc("Number of addresses from which to enable MIMG NSA."),
53 cl::init(Val: 2), cl::Hidden);
54
// The destructor is defaulted here, out of line, rather than in the header.
// NOTE(review): presumably so the smart-pointer members are destroyed in a
// translation unit where their pointee types are complete -- confirm against
// GCNSubtarget.h.
GCNSubtarget::~GCNSubtarget() = default;
56
57GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
58 StringRef GPU,
59 StringRef FS) {
60 // Determine default and user-specified characteristics
61 //
62 // We want to be able to turn these off, but making this a subtarget feature
63 // for SI has the unhelpful behavior that it unsets everything else if you
64 // disable it.
65 //
66 // Similarly we want enable-prt-strict-null to be on by default and not to
67 // unset everything else if it is disabled
68
69 SmallString<256> FullFS("+load-store-opt,+enable-ds128,");
70
71 // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by
72 // default
73 if (isAmdHsaOS())
74 FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
75
76 FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
77
78 // Disable mutually exclusive bits.
79 if (FS.contains_insensitive(Other: "+wavefrontsize")) {
80 if (!FS.contains_insensitive(Other: "wavefrontsize16"))
81 FullFS += "-wavefrontsize16,";
82 if (!FS.contains_insensitive(Other: "wavefrontsize32"))
83 FullFS += "-wavefrontsize32,";
84 if (!FS.contains_insensitive(Other: "wavefrontsize64"))
85 FullFS += "-wavefrontsize64,";
86 }
87
88 FullFS += FS;
89
90 ParseSubtargetFeatures(CPU: GPU, /*TuneCPU*/ GPU, FS: FullFS);
91
92 // Implement the "generic" processors, which acts as the default when no
93 // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
94 // the first amdgcn target that supports flat addressing. Other OSes defaults
95 // to the first amdgcn target.
96 if (Gen == AMDGPUSubtarget::INVALID) {
97 Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
98 : AMDGPUSubtarget::SOUTHERN_ISLANDS;
99 // Assume wave64 for the unknown target, if not explicitly set.
100 if (getWavefrontSizeLog2() == 0)
101 WavefrontSizeLog2 = 6;
102 } else if (!hasFeature(Feature: AMDGPU::FeatureWavefrontSize32) &&
103 !hasFeature(Feature: AMDGPU::FeatureWavefrontSize64)) {
104 // If there is no default wave size it must be a generation before gfx10,
105 // these have FeatureWavefrontSize64 in their definition already. For gfx10+
106 // set wave32 as a default.
107 ToggleFeature(FB: AMDGPU::FeatureWavefrontSize32);
108 WavefrontSizeLog2 = getGeneration() >= AMDGPUSubtarget::GFX10 ? 5 : 6;
109 }
110
111 // We don't support FP64 for EG/NI atm.
112 assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
113
114 // Targets must either support 64-bit offsets for MUBUF instructions, and/or
115 // support flat operations, otherwise they cannot access a 64-bit global
116 // address space
117 assert(hasAddr64() || hasFlat());
118 // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
119 // that do not support ADDR64 variants of MUBUF instructions. Such targets
120 // cannot use a 64 bit offset with a MUBUF instruction to access the global
121 // address space
122 if (!hasAddr64() && !FS.contains(Other: "flat-for-global") && !UseFlatForGlobal) {
123 ToggleFeature(FB: AMDGPU::FeatureUseFlatForGlobal);
124 UseFlatForGlobal = true;
125 }
126 // Unless +-flat-for-global is specified, use MUBUF instructions for global
127 // address space access if flat operations are not available.
128 if (!hasFlat() && !FS.contains(Other: "flat-for-global") && UseFlatForGlobal) {
129 ToggleFeature(FB: AMDGPU::FeatureUseFlatForGlobal);
130 UseFlatForGlobal = false;
131 }
132
133 // Set defaults if needed.
134 if (MaxPrivateElementSize == 0)
135 MaxPrivateElementSize = 4;
136
137 if (LDSBankCount == 0)
138 LDSBankCount = 32;
139
140 if (AddressableLocalMemorySize == 0)
141 AddressableLocalMemorySize = 32768;
142
143 if (FlatOffsetBitWidth == 0)
144 FlatOffsetBitWidth = 13;
145
146 LocalMemorySize = AMDGPU::IsaInfo::getLocalMemorySize(STI: *this);
147 // LDS Allocation Granularity calculated in bytes from dwords
148 LDSAllocationGranularity =
149 AMDGPU::getLdsDwGranularity(ST: *this) * sizeof(uint32_t);
150
151 HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
152 HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;
153
154 // InstCacheLineSize is set from TableGen subtarget features
155 // (FeatureInstCacheLineSize64 / FeatureInstCacheLineSize128).
156 // Fall back to 64 if no feature was specified (e.g. generic targets).
157 if (InstCacheLineSize == 0)
158 InstCacheLineSize = 64;
159
160 assert(llvm::isPowerOf2_32(InstCacheLineSize) &&
161 "InstCacheLineSize must be a power of 2");
162
163 LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
164 << TargetID.getXnackSetting() << '\n');
165 LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
166 << TargetID.getSramEccSetting() << '\n');
167
168 return *this;
169}
170
171void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
172 LLVMContext &Ctx = F.getContext();
173 if (hasFeature(Feature: AMDGPU::FeatureWavefrontSize32) &&
174 hasFeature(Feature: AMDGPU::FeatureWavefrontSize64)) {
175 Ctx.diagnose(DI: DiagnosticInfoUnsupported(
176 F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
177 }
178}
179
180GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
181 const GCNTargetMachine &TM, bool BufferOOBRelaxed,
182 bool TBufferOOBRelaxed)
183 : // clang-format off
184 AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
185 AMDGPUSubtarget(TT),
186 TargetID(AMDGPU::createAMDGPUTargetID(STI: *this, FeatureString: FS)),
187 InstrItins(getInstrItineraryForCPU(CPU: GPU)),
188 BufferOOBRelaxed(BufferOOBRelaxed),
189 TBufferOOBRelaxed(TBufferOOBRelaxed),
190 InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
191 TLInfo(TM, *this),
192 // Frame index expansion sometimes assumes the low bit of SP is 0
193 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0,
194 /*TransAl=*/Align(4)) {
195 // clang-format on
196 MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(STI: *this);
197 EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(STI: *this);
198
199 TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();
200
201 CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(args: *getTargetLowering());
202 InlineAsmLoweringInfo =
203 std::make_unique<InlineAsmLowering>(args: getTargetLowering());
204 Legalizer = std::make_unique<AMDGPULegalizerInfo>(args&: *this, args: TM);
205 RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(args&: *this);
206 InstSelector =
207 std::make_unique<AMDGPUInstructionSelector>(args&: *this, args&: *RegBankInfo);
208}
209
210const SelectionDAGTargetInfo *GCNSubtarget::getSelectionDAGInfo() const {
211 return TSInfo.get();
212}
213
214unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
215 if (getGeneration() < GFX10)
216 return 1;
217
218 switch (Opcode) {
219 case AMDGPU::V_LSHLREV_B64_e64:
220 case AMDGPU::V_LSHLREV_B64_gfx10:
221 case AMDGPU::V_LSHLREV_B64_e64_gfx11:
222 case AMDGPU::V_LSHLREV_B64_e32_gfx12:
223 case AMDGPU::V_LSHLREV_B64_e64_gfx12:
224 case AMDGPU::V_LSHL_B64_e64:
225 case AMDGPU::V_LSHRREV_B64_e64:
226 case AMDGPU::V_LSHRREV_B64_gfx10:
227 case AMDGPU::V_LSHRREV_B64_e64_gfx11:
228 case AMDGPU::V_LSHRREV_B64_e64_gfx12:
229 case AMDGPU::V_LSHR_B64_e64:
230 case AMDGPU::V_ASHRREV_I64_e64:
231 case AMDGPU::V_ASHRREV_I64_gfx10:
232 case AMDGPU::V_ASHRREV_I64_e64_gfx11:
233 case AMDGPU::V_ASHRREV_I64_e64_gfx12:
234 case AMDGPU::V_ASHR_I64_e64:
235 return 1;
236 }
237
238 return 2;
239}
240
241/// This list was mostly derived from experimentation.
242bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
243 switch (Opcode) {
244 case AMDGPU::V_CVT_F16_F32_e32:
245 case AMDGPU::V_CVT_F16_F32_e64:
246 case AMDGPU::V_CVT_F16_U16_e32:
247 case AMDGPU::V_CVT_F16_U16_e64:
248 case AMDGPU::V_CVT_F16_I16_e32:
249 case AMDGPU::V_CVT_F16_I16_e64:
250 case AMDGPU::V_RCP_F16_e64:
251 case AMDGPU::V_RCP_F16_e32:
252 case AMDGPU::V_RSQ_F16_e64:
253 case AMDGPU::V_RSQ_F16_e32:
254 case AMDGPU::V_SQRT_F16_e64:
255 case AMDGPU::V_SQRT_F16_e32:
256 case AMDGPU::V_LOG_F16_e64:
257 case AMDGPU::V_LOG_F16_e32:
258 case AMDGPU::V_EXP_F16_e64:
259 case AMDGPU::V_EXP_F16_e32:
260 case AMDGPU::V_SIN_F16_e64:
261 case AMDGPU::V_SIN_F16_e32:
262 case AMDGPU::V_COS_F16_e64:
263 case AMDGPU::V_COS_F16_e32:
264 case AMDGPU::V_FLOOR_F16_e64:
265 case AMDGPU::V_FLOOR_F16_e32:
266 case AMDGPU::V_CEIL_F16_e64:
267 case AMDGPU::V_CEIL_F16_e32:
268 case AMDGPU::V_TRUNC_F16_e64:
269 case AMDGPU::V_TRUNC_F16_e32:
270 case AMDGPU::V_RNDNE_F16_e64:
271 case AMDGPU::V_RNDNE_F16_e32:
272 case AMDGPU::V_FRACT_F16_e64:
273 case AMDGPU::V_FRACT_F16_e32:
274 case AMDGPU::V_FREXP_MANT_F16_e64:
275 case AMDGPU::V_FREXP_MANT_F16_e32:
276 case AMDGPU::V_FREXP_EXP_I16_F16_e64:
277 case AMDGPU::V_FREXP_EXP_I16_F16_e32:
278 case AMDGPU::V_LDEXP_F16_e64:
279 case AMDGPU::V_LDEXP_F16_e32:
280 case AMDGPU::V_LSHLREV_B16_e64:
281 case AMDGPU::V_LSHLREV_B16_e32:
282 case AMDGPU::V_LSHRREV_B16_e64:
283 case AMDGPU::V_LSHRREV_B16_e32:
284 case AMDGPU::V_ASHRREV_I16_e64:
285 case AMDGPU::V_ASHRREV_I16_e32:
286 case AMDGPU::V_ADD_U16_e64:
287 case AMDGPU::V_ADD_U16_e32:
288 case AMDGPU::V_SUB_U16_e64:
289 case AMDGPU::V_SUB_U16_e32:
290 case AMDGPU::V_SUBREV_U16_e64:
291 case AMDGPU::V_SUBREV_U16_e32:
292 case AMDGPU::V_MUL_LO_U16_e64:
293 case AMDGPU::V_MUL_LO_U16_e32:
294 case AMDGPU::V_ADD_F16_e64:
295 case AMDGPU::V_ADD_F16_e32:
296 case AMDGPU::V_SUB_F16_e64:
297 case AMDGPU::V_SUB_F16_e32:
298 case AMDGPU::V_SUBREV_F16_e64:
299 case AMDGPU::V_SUBREV_F16_e32:
300 case AMDGPU::V_MUL_F16_e64:
301 case AMDGPU::V_MUL_F16_e32:
302 case AMDGPU::V_MAX_F16_e64:
303 case AMDGPU::V_MAX_F16_e32:
304 case AMDGPU::V_MIN_F16_e64:
305 case AMDGPU::V_MIN_F16_e32:
306 case AMDGPU::V_MAX_U16_e64:
307 case AMDGPU::V_MAX_U16_e32:
308 case AMDGPU::V_MIN_U16_e64:
309 case AMDGPU::V_MIN_U16_e32:
310 case AMDGPU::V_MAX_I16_e64:
311 case AMDGPU::V_MAX_I16_e32:
312 case AMDGPU::V_MIN_I16_e64:
313 case AMDGPU::V_MIN_I16_e32:
314 case AMDGPU::V_MAD_F16_e64:
315 case AMDGPU::V_MAD_U16_e64:
316 case AMDGPU::V_MAD_I16_e64:
317 case AMDGPU::V_FMA_F16_e64:
318 case AMDGPU::V_DIV_FIXUP_F16_e64:
319 // On gfx10, all 16-bit instructions preserve the high bits.
320 return getGeneration() <= AMDGPUSubtarget::GFX9;
321 case AMDGPU::V_MADAK_F16:
322 case AMDGPU::V_MADMK_F16:
323 case AMDGPU::V_MAC_F16_e64:
324 case AMDGPU::V_MAC_F16_e32:
325 case AMDGPU::V_FMAMK_F16:
326 case AMDGPU::V_FMAAK_F16:
327 case AMDGPU::V_FMAC_F16_e64:
328 case AMDGPU::V_FMAC_F16_e32:
329 // In gfx9, the preferred handling of the unused high 16-bits changed. Most
330 // instructions maintain the legacy behavior of 0ing. Some instructions
331 // changed to preserving the high bits.
332 return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
333 case AMDGPU::V_MAD_MIXLO_F16:
334 case AMDGPU::V_MAD_MIXHI_F16:
335 default:
336 return false;
337 }
338}
339
340void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
341 const SchedRegion &Region) const {
342 // Track register pressure so the scheduler can try to decrease
343 // pressure once register usage is above the threshold defined by
344 // SIRegisterInfo::getRegPressureSetLimit()
345 Policy.ShouldTrackPressure = true;
346
347 const Function &F = Region.RegionBegin->getMF()->getFunction();
348 if (AMDGPU::getSchedStrategy(F) == "coexec") {
349 Policy.OnlyTopDown = true;
350 Policy.OnlyBottomUp = false;
351 return;
352 }
353
354 // Enabling both top down and bottom up scheduling seems to give us less
355 // register spills than just using one of these approaches on its own.
356 Policy.OnlyTopDown = false;
357 Policy.OnlyBottomUp = false;
358
359 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
360 if (!enableSIScheduler())
361 Policy.ShouldTrackLaneMasks = true;
362}
363
364void GCNSubtarget::overridePostRASchedPolicy(MachineSchedPolicy &Policy,
365 const SchedRegion &Region) const {
366 const Function &F = Region.RegionBegin->getMF()->getFunction();
367 Attribute PostRADirectionAttr = F.getFnAttribute(Kind: "amdgpu-post-ra-direction");
368 if (!PostRADirectionAttr.isValid())
369 return;
370
371 StringRef PostRADirectionStr = PostRADirectionAttr.getValueAsString();
372 if (PostRADirectionStr == "topdown") {
373 Policy.OnlyTopDown = true;
374 Policy.OnlyBottomUp = false;
375 } else if (PostRADirectionStr == "bottomup") {
376 Policy.OnlyTopDown = false;
377 Policy.OnlyBottomUp = true;
378 } else if (PostRADirectionStr == "bidirectional") {
379 Policy.OnlyTopDown = false;
380 Policy.OnlyBottomUp = false;
381 } else {
382 DiagnosticInfoOptimizationFailure Diag(
383 F, F.getSubprogram(), "invalid value for postRA direction attribute");
384 F.getContext().diagnose(DI: Diag);
385 }
386
387 LLVM_DEBUG({
388 const char *DirStr = "default";
389 if (Policy.OnlyTopDown && !Policy.OnlyBottomUp)
390 DirStr = "topdown";
391 else if (!Policy.OnlyTopDown && Policy.OnlyBottomUp)
392 DirStr = "bottomup";
393 else if (!Policy.OnlyTopDown && !Policy.OnlyBottomUp)
394 DirStr = "bidirectional";
395
396 dbgs() << "Post-MI-sched direction (" << F.getName() << "): " << DirStr
397 << '\n';
398 });
399}
400
401void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
402 if (isWave32()) {
403 // Fix implicit $vcc operands after MIParser has verified that they match
404 // the instruction definitions.
405 for (auto &MBB : MF) {
406 for (auto &MI : MBB)
407 InstrInfo.fixImplicitOperands(MI);
408 }
409 }
410}
411
412bool GCNSubtarget::hasMadF16() const {
413 return InstrInfo.pseudoToMCOpcode(Opcode: AMDGPU::V_MAD_F16_e64) != -1;
414}
415
416bool GCNSubtarget::useVGPRIndexMode() const {
417 return hasVGPRIndexMode() && (!hasMovrel() || EnableVGPRIndexMode);
418}
419
420bool GCNSubtarget::useAA() const { return UseAA; }
421
422unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
423 return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(STI: *this, SGPRs);
424}
425
426unsigned
427GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs,
428 unsigned DynamicVGPRBlockSize) const {
429 return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(STI: *this, NumVGPRs,
430 DynamicVGPRBlockSize);
431}
432
433unsigned
434GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
435 if (getGeneration() >= AMDGPUSubtarget::GFX10)
436 return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
437
438 if (HasFlatScratch || HasArchitectedFlatScratch) {
439 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
440 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
441 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
442 return 4; // FLAT_SCRATCH, VCC (in that order).
443 }
444
445 if (isXNACKEnabled())
446 return 4; // XNACK, VCC (in that order).
447 return 2; // VCC.
448}
449
450unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
451 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
452 return getBaseReservedNumSGPRs(HasFlatScratch: MFI.getUserSGPRInfo().hasFlatScratchInit());
453}
454
455unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
456 // In principle we do not need to reserve SGPR pair used for flat_scratch if
457 // we know flat instructions do not access the stack anywhere in the
458 // program. For now assume it's needed if we have flat instructions.
459 const bool KernelUsesFlatScratch = hasFlatAddressSpace();
460 return getBaseReservedNumSGPRs(HasFlatScratch: KernelUsesFlatScratch);
461}
462
463std::pair<unsigned, unsigned>
464GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
465 unsigned NumSGPRs, unsigned NumVGPRs) const {
466 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
467 // Temporarily check both the attribute and the subtarget feature until the
468 // latter is removed.
469 if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
470 DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
471
472 auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSBytes: LDSSize, F);
473 unsigned SGPROcc = getOccupancyWithNumSGPRs(SGPRs: NumSGPRs);
474 unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);
475
476 // Maximum occupancy may be further limited by high SGPR/VGPR usage.
477 MaxOcc = std::min(a: MaxOcc, b: std::min(a: SGPROcc, b: VGPROcc));
478 return {std::min(a: MinOcc, b: MaxOcc), MaxOcc};
479}
480
481unsigned GCNSubtarget::getBaseMaxNumSGPRs(
482 const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
483 unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
484 // Compute maximum number of SGPRs function can use using default/requested
485 // minimum number of waves per execution unit.
486 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU: WavesPerEU.first, Addressable: false);
487 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU: WavesPerEU.first, Addressable: true);
488
489 // Check if maximum number of SGPRs was explicitly requested using
490 // "amdgpu-num-sgpr" attribute.
491 unsigned Requested =
492 F.getFnAttributeAsParsedInteger(Kind: "amdgpu-num-sgpr", Default: MaxNumSGPRs);
493
494 if (Requested != MaxNumSGPRs) {
495 // Make sure requested value does not violate subtarget's specifications.
496 if (Requested && (Requested <= ReservedNumSGPRs))
497 Requested = 0;
498
499 // If more SGPRs are required to support the input user/system SGPRs,
500 // increase to accommodate them.
501 //
502 // FIXME: This really ends up using the requested number of SGPRs + number
503 // of reserved special registers in total. Theoretically you could re-use
504 // the last input registers for these special registers, but this would
505 // require a lot of complexity to deal with the weird aliasing.
506 unsigned InputNumSGPRs = PreloadedSGPRs;
507 if (Requested && Requested < InputNumSGPRs)
508 Requested = InputNumSGPRs;
509
510 // Make sure requested value is compatible with values implied by
511 // default/requested minimum/maximum number of waves per execution unit.
512 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU: WavesPerEU.first, Addressable: false))
513 Requested = 0;
514 if (WavesPerEU.second && Requested &&
515 Requested < getMinNumSGPRs(WavesPerEU: WavesPerEU.second))
516 Requested = 0;
517
518 if (Requested)
519 MaxNumSGPRs = Requested;
520 }
521
522 if (hasSGPRInitBug())
523 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
524
525 return std::min(a: MaxNumSGPRs - ReservedNumSGPRs, b: MaxAddressableNumSGPRs);
526}
527
528unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
529 const Function &F = MF.getFunction();
530 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
531 return getBaseMaxNumSGPRs(F, WavesPerEU: MFI.getWavesPerEU(), PreloadedSGPRs: MFI.getNumPreloadedSGPRs(),
532 ReservedNumSGPRs: getReservedNumSGPRs(MF));
533}
534
535unsigned GCNSubtarget::getMaxNumPreloadedSGPRs() const {
536 using USI = GCNUserSGPRUsageInfo;
537 // Max number of user SGPRs
538 const unsigned MaxUserSGPRs =
539 USI::getNumUserSGPRForField(ID: USI::PrivateSegmentBufferID) +
540 USI::getNumUserSGPRForField(ID: USI::DispatchPtrID) +
541 USI::getNumUserSGPRForField(ID: USI::QueuePtrID) +
542 USI::getNumUserSGPRForField(ID: USI::KernargSegmentPtrID) +
543 USI::getNumUserSGPRForField(ID: USI::DispatchIdID) +
544 USI::getNumUserSGPRForField(ID: USI::FlatScratchInitID) +
545 USI::getNumUserSGPRForField(ID: USI::ImplicitBufferPtrID);
546
547 // Max number of system SGPRs
548 const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
549 1 + // WorkGroupIDY
550 1 + // WorkGroupIDZ
551 1 + // WorkGroupInfo
552 1; // private segment wave byte offset
553
554 // Max number of synthetic SGPRs
555 const unsigned SyntheticSGPRs = 1; // LDSKernelId
556
557 return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
558}
559
560unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
561 return getBaseMaxNumSGPRs(F, WavesPerEU: getWavesPerEU(F), PreloadedSGPRs: getMaxNumPreloadedSGPRs(),
562 ReservedNumSGPRs: getReservedNumSGPRs(F));
563}
564
565unsigned GCNSubtarget::getBaseMaxNumVGPRs(
566 const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
567 const auto [Min, Max] = NumVGPRBounds;
568
569 // Check if maximum number of VGPRs was explicitly requested using
570 // "amdgpu-num-vgpr" attribute.
571
572 unsigned Requested = F.getFnAttributeAsParsedInteger(Kind: "amdgpu-num-vgpr", Default: Max);
573 if (Requested != Max && hasGFX90AInsts())
574 Requested *= 2;
575
576 // Make sure requested value is inside the range of possible VGPR usage.
577 return std::clamp(val: Requested, lo: Min, hi: Max);
578}
579
580unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
581 // Temporarily check both the attribute and the subtarget feature, until the
582 // latter is removed.
583 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
584 if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
585 DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
586
587 std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
588 return getBaseMaxNumVGPRs(
589 F, NumVGPRBounds: {getMinNumVGPRs(WavesPerEU: Waves.second, DynamicVGPRBlockSize),
590 getMaxNumVGPRs(WavesPerEU: Waves.first, DynamicVGPRBlockSize)});
591}
592
593unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
594 return getMaxNumVGPRs(F: MF.getFunction());
595}
596
597std::pair<unsigned, unsigned>
598GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
599 const unsigned MaxVectorRegs = getMaxNumVGPRs(F);
600
601 unsigned MaxNumVGPRs = MaxVectorRegs;
602 unsigned MaxNumAGPRs = 0;
603 unsigned NumArchVGPRs = getAddressableNumArchVGPRs();
604
605 // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
606 // a wave may have up to 512 total vector registers combining together both
607 // VGPRs and AGPRs. Hence, in an entry function without calls and without
608 // AGPRs used within it, it is possible to use the whole vector register
609 // budget for VGPRs.
610 //
611 // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
612 // register file accordingly.
613 if (hasGFX90AInsts()) {
614 unsigned MinNumAGPRs = 0;
615 const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
616
617 const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};
618
619 // TODO: The lower bound should probably force the number of required
620 // registers up, overriding amdgpu-waves-per-eu.
621 std::tie(args&: MinNumAGPRs, args&: MaxNumAGPRs) =
622 AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-agpr-alloc", Default: DefaultNumAGPR,
623 /*OnlyFirstRequired=*/true);
624
625 if (MinNumAGPRs == DefaultNumAGPR.first) {
626 // Default to splitting half the registers if AGPRs are required.
627 MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
628 } else {
629 // Align to accum_offset's allocation granularity.
630 MinNumAGPRs = alignTo(Value: MinNumAGPRs, Align: 4);
631
632 MinNumAGPRs = std::min(a: MinNumAGPRs, b: TotalNumAGPRs);
633 }
634
635 // Clamp values to be inbounds of our limits, and ensure min <= max.
636
637 MaxNumAGPRs = std::min(a: std::max(a: MinNumAGPRs, b: MaxNumAGPRs), b: MaxVectorRegs);
638 MinNumAGPRs = std::min(a: std::min(a: MinNumAGPRs, b: TotalNumAGPRs), b: MaxNumAGPRs);
639
640 MaxNumVGPRs = std::min(a: MaxVectorRegs - MinNumAGPRs, b: NumArchVGPRs);
641 MaxNumAGPRs = std::min(a: MaxVectorRegs - MaxNumVGPRs, b: MaxNumAGPRs);
642
643 assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
644 MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= NumArchVGPRs &&
645 "invalid register counts");
646 } else if (hasMAIInsts()) {
647 // On gfx908 the number of AGPRs always equals the number of VGPRs.
648 MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
649 }
650
651 return std::pair(MaxNumVGPRs, MaxNumAGPRs);
652}
653
654// Check to which source operand UseOpIdx points to and return a pointer to the
655// operand of the corresponding source modifier.
656// Return nullptr if UseOpIdx either doesn't point to src0/1/2 or if there is no
657// operand for the corresponding source modifier.
658static const MachineOperand *
659getVOP3PSourceModifierFromOpIdx(const MachineInstr &UseI, int UseOpIdx,
660 const SIInstrInfo &InstrInfo) {
661 AMDGPU::OpName UseName =
662 AMDGPU::getOperandIdxName(Opcode: UseI.getOpcode(), Idx: UseOpIdx);
663 switch (UseName) {
664 case AMDGPU::OpName::src0:
665 return InstrInfo.getNamedOperand(MI: UseI, OperandName: AMDGPU::OpName::src0_modifiers);
666 case AMDGPU::OpName::src1:
667 return InstrInfo.getNamedOperand(MI: UseI, OperandName: AMDGPU::OpName::src1_modifiers);
668 case AMDGPU::OpName::src2:
669 return InstrInfo.getNamedOperand(MI: UseI, OperandName: AMDGPU::OpName::src2_modifiers);
670 default:
671 return nullptr;
672 }
673}
674
675// Get the subreg idx of the subreg that is used by the given instruction
676// operand, considering the given op_sel modifier.
677// Return 0 if the whole register is used or as a conservative fallback.
678static unsigned getEffectiveSubRegIdx(const SIRegisterInfo &TRI,
679 const SIInstrInfo &InstrInfo,
680 const MachineInstr &I,
681 const MachineOperand &Op) {
682 if (!InstrInfo.isVOP3P(MI: I) || InstrInfo.isWMMA(MI: I) || InstrInfo.isSWMMAC(MI: I))
683 return AMDGPU::NoSubRegister;
684
685 const MachineOperand *OpMod =
686 getVOP3PSourceModifierFromOpIdx(UseI: I, UseOpIdx: Op.getOperandNo(), InstrInfo);
687 if (!OpMod)
688 return AMDGPU::NoSubRegister;
689
690 // Note: the FMA_MIX* and MAD_MIX* instructions have different semantics for
691 // the op_sel and op_sel_hi source modifiers:
692 // - op_sel: selects low/high operand bits as input to the operation;
693 // has only meaning for 16-bit source operands
694 // - op_sel_hi: specifies the size of the source operands (16 or 32 bits);
695 // a value of 0 indicates 32 bit, 1 indicates 16 bit
696 // For the other VOP3P instructions, the semantics are:
697 // - op_sel: selects low/high operand bits as input to the operation which
698 // results in the lower-half of the destination
699 // - op_sel_hi: selects the low/high operand bits as input to the operation
700 // which results in the higher-half of the destination
701 int64_t OpSel = OpMod->getImm() & SISrcMods::OP_SEL_0;
702 int64_t OpSelHi = OpMod->getImm() & SISrcMods::OP_SEL_1;
703
704 // Check if all parts of the register are being used (= op_sel and op_sel_hi
705 // differ for VOP3P or op_sel_hi=0 for VOP3PMix). In that case we can return
706 // early.
707 if ((!InstrInfo.isVOP3PMix(MI: I) && (!OpSel || !OpSelHi) &&
708 (OpSel || OpSelHi)) ||
709 (InstrInfo.isVOP3PMix(MI: I) && !OpSelHi))
710 return AMDGPU::NoSubRegister;
711
712 const MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
713 const TargetRegisterClass *RC = TRI.getRegClassForOperandReg(MRI, MO: Op);
714
715 if (unsigned SubRegIdx = OpSel ? AMDGPU::sub1 : AMDGPU::sub0;
716 TRI.getSubClassWithSubReg(RC, SubRegIdx) == RC)
717 return SubRegIdx;
718 if (unsigned SubRegIdx = OpSel ? AMDGPU::hi16 : AMDGPU::lo16;
719 TRI.getSubClassWithSubReg(RC, SubRegIdx) == RC)
720 return SubRegIdx;
721
722 return AMDGPU::NoSubRegister;
723}
724
725Register GCNSubtarget::getRealSchedDependency(const MachineInstr &DefI,
726 int DefOpIdx,
727 const MachineInstr &UseI,
728 int UseOpIdx) const {
729 const SIRegisterInfo *TRI = getRegisterInfo();
730 const MachineOperand &DefOp = DefI.getOperand(i: DefOpIdx);
731 const MachineOperand &UseOp = UseI.getOperand(i: UseOpIdx);
732 Register DefReg = DefOp.getReg();
733 Register UseReg = UseOp.getReg();
734
735 // If the registers aren't restricted to a sub-register, there is no point in
736 // further analysis. This check makes only sense for virtual registers because
737 // physical registers may form a tuple and thus be part of a superregister
738 // although they are not a subregister themselves (vgpr0 is a "subreg" of
739 // vgpr0_vgpr1 without being a subreg in itself).
740 unsigned DefSubRegIdx = DefOp.getSubReg();
741 if (DefReg.isVirtual() && DefSubRegIdx == AMDGPU::NoSubRegister)
742 return DefReg;
743 unsigned UseSubRegIdx = getEffectiveSubRegIdx(TRI: *TRI, InstrInfo, I: UseI, Op: UseOp);
744 if (UseReg.isVirtual() && UseSubRegIdx == AMDGPU::NoSubRegister)
745 return DefReg;
746
747 if (!TRI->checkSubRegInterference(RegA: DefReg, SubA: DefSubRegIdx, RegB: UseReg, SubB: UseSubRegIdx))
748 return Register(); // No real dependency
749
750 // UseReg might be smaller or larger than DefReg, depending on the subreg and
751 // on whether DefReg is a subreg, too. -> Find the smaller one. This does not
752 // apply to virtual registers because we cannot construct a subreg for them.
753 if (DefReg.isVirtual())
754 return DefReg;
755 MCRegister DefMCReg =
756 DefSubRegIdx ? TRI->getSubReg(Reg: DefReg, Idx: DefSubRegIdx) : DefReg.asMCReg();
757 MCRegister UseMCReg =
758 UseSubRegIdx ? TRI->getSubReg(Reg: UseReg, Idx: UseSubRegIdx) : UseReg.asMCReg();
759 return TRI->isSubRegisterEq(RegA: DefMCReg, RegB: UseMCReg) ? UseMCReg : DefMCReg;
760}
761
762void GCNSubtarget::adjustSchedDependency(
763 SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
764 const TargetSchedModel *SchedModel) const {
765 if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() ||
766 !Use->isInstr())
767 return;
768
769 MachineInstr *DefI = Def->getInstr();
770 MachineInstr *UseI = Use->getInstr();
771
772 // Check for false latency on $tensorcnt / $asynccnt dependencies
773 if (Dep.getReg() == AMDGPU::TENSORcnt || Dep.getReg() == AMDGPU::ASYNCcnt) {
774 unsigned UseOp = UseI->getOpcode();
775 // Do not adjust latency for load->s_wait
776 bool IsBarrierCase =
777 InstrInfo.isLDSDMA(MI: *DefI) &&
778 (UseOp == AMDGPU::S_WAIT_TENSORCNT || UseOp == AMDGPU::S_WAIT_ASYNCCNT);
779 if (!IsBarrierCase) {
780 Dep.setLatency(1);
781 return;
782 }
783 }
784
785 if (Register Reg = getRealSchedDependency(DefI: *DefI, DefOpIdx, UseI: *UseI, UseOpIdx)) {
786 Dep.setReg(Reg);
787 } else {
788 Dep = SDep(Def, SDep::Artificial);
789 return; // This is not a data dependency anymore.
790 }
791
792 if (DefI->isBundle()) {
793 const SIRegisterInfo *TRI = getRegisterInfo();
794 auto Reg = Dep.getReg();
795 MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
796 MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
797 unsigned Lat = 0;
798 for (++I; I != E && I->isBundledWithPred(); ++I) {
799 if (I->isMetaInstruction())
800 continue;
801 if (I->modifiesRegister(Reg, TRI))
802 Lat = InstrInfo.getInstrLatency(ItinData: getInstrItineraryData(), MI: *I);
803 else if (Lat)
804 --Lat;
805 }
806 Dep.setLatency(Lat);
807 } else if (UseI->isBundle()) {
808 const SIRegisterInfo *TRI = getRegisterInfo();
809 auto Reg = Dep.getReg();
810 MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
811 MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
812 unsigned Lat = InstrInfo.getInstrLatency(ItinData: getInstrItineraryData(), MI: *DefI);
813 for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
814 if (I->isMetaInstruction())
815 continue;
816 if (I->readsRegister(Reg, TRI))
817 break;
818 --Lat;
819 }
820 Dep.setLatency(Lat);
821 } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
822 // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
823 // implicit operands which come from the MCInstrDesc, which can fool
824 // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
825 // pseudo operands.
826 Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
827 DefMI: DefI, DefOperIdx: DefOpIdx, UseMI: UseI, UseOperIdx: UseOpIdx));
828 }
829}
830
831unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
832 if (getGeneration() >= AMDGPUSubtarget::GFX12)
833 return 0; // Not MIMG encoding.
834
835 if (NSAThreshold.getNumOccurrences() > 0)
836 return std::max(a: NSAThreshold.getValue(), b: 2u);
837
838 int Value = MF.getFunction().getFnAttributeAsParsedInteger(
839 Kind: "amdgpu-nsa-threshold", Default: -1);
840 if (Value > 0)
841 return std::max(a: Value, b: 2);
842
843 return NSAThreshold;
844}
845
846GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
847 const GCNSubtarget &ST)
848 : ST(ST) {
849 const CallingConv::ID CC = F.getCallingConv();
850 const bool IsKernel =
851 CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
852
853 if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
854 KernargSegmentPtr = true;
855
856 bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
857 if (IsAmdHsaOrMesa && !ST.hasFlatScratchEnabled())
858 PrivateSegmentBuffer = true;
859 else if (ST.isMesaGfxShader(F))
860 ImplicitBufferPtr = true;
861
862 if (!AMDGPU::isGraphics(CC)) {
863 if (!F.hasFnAttribute(Kind: "amdgpu-no-dispatch-ptr"))
864 DispatchPtr = true;
865
866 // FIXME: Can this always be disabled with < COv5?
867 if (!F.hasFnAttribute(Kind: "amdgpu-no-queue-ptr"))
868 QueuePtr = true;
869
870 if (!F.hasFnAttribute(Kind: "amdgpu-no-dispatch-id"))
871 DispatchID = true;
872 }
873
874 if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
875 (IsAmdHsaOrMesa || ST.hasFlatScratchEnabled()) &&
876 // FlatScratchInit cannot be true for graphics CC if
877 // hasFlatScratchEnabled() is false.
878 (ST.hasFlatScratchEnabled() ||
879 (!AMDGPU::isGraphics(CC) &&
880 !F.hasFnAttribute(Kind: "amdgpu-no-flat-scratch-init"))) &&
881 !ST.hasArchitectedFlatScratch()) {
882 FlatScratchInit = true;
883 }
884
885 if (hasImplicitBufferPtr())
886 NumUsedUserSGPRs += getNumUserSGPRForField(ID: ImplicitBufferPtrID);
887
888 if (hasPrivateSegmentBuffer())
889 NumUsedUserSGPRs += getNumUserSGPRForField(ID: PrivateSegmentBufferID);
890
891 if (hasDispatchPtr())
892 NumUsedUserSGPRs += getNumUserSGPRForField(ID: DispatchPtrID);
893
894 if (hasQueuePtr())
895 NumUsedUserSGPRs += getNumUserSGPRForField(ID: QueuePtrID);
896
897 if (hasKernargSegmentPtr())
898 NumUsedUserSGPRs += getNumUserSGPRForField(ID: KernargSegmentPtrID);
899
900 if (hasDispatchID())
901 NumUsedUserSGPRs += getNumUserSGPRForField(ID: DispatchIdID);
902
903 if (hasFlatScratchInit())
904 NumUsedUserSGPRs += getNumUserSGPRForField(ID: FlatScratchInitID);
905
906 if (hasPrivateSegmentSize())
907 NumUsedUserSGPRs += getNumUserSGPRForField(ID: PrivateSegmentSizeID);
908}
909
910void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
911 assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
912 NumKernargPreloadSGPRs += NumSGPRs;
913 NumUsedUserSGPRs += NumSGPRs;
914}
915
916unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
917 return AMDGPU::getMaxNumUserSGPRs(STI: ST) - NumUsedUserSGPRs;
918}
919