//===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the GCN specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "GCNSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSelectionDAGInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "gcn-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

static cl::opt<unsigned>
    NSAThreshold("amdgpu-nsa-threshold",
                 cl::desc("Number of addresses from which to enable MIMG NSA."),
                 cl::init(2), cl::Hidden);

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                            StringRef GPU,
                                                            StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). The HSA OS defaults
  // to the first amdgcn target that supports flat addressing; other OSes
  // default to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
    // Assume wave64 for the unknown target, if not explicitly set.
    if (getWavefrontSizeLog2() == 0)
      WavefrontSizeLog2 = 6;
  } else if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
             !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    // If there is no default wave size it must be a generation before gfx10;
    // these have FeatureWavefrontSize64 in their definition already. For
    // gfx10+, set wave32 as the default.
    ToggleFeature(AMDGPU::FeatureWavefrontSize32);
    WavefrontSizeLog2 = getGeneration() >= AMDGPUSubtarget::GFX10 ? 5 : 6;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.isAMDGCN() && AddressableLocalMemorySize == 0)
    AddressableLocalMemorySize = 32768;

  LocalMemorySize = AddressableLocalMemorySize;
  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

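// Diagnose subtarget feature combinations that cannot be supported; currently
// this is just requesting both wave32 and wave64 at the same time.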
void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
  LLVMContext &Ctx = F.getContext();
  if (hasFeature(AMDGPU::FeatureWavefrontSize32) &&
      hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    Ctx.diagnose(DiagnosticInfoUnsupported(
        F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
  }
}

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);

  TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();

  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
  InlineAsmLoweringInfo =
      std::make_unique<InlineAsmLowering>(getTargetLowering());
  Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
  InstSelector =
      std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
}

const SelectionDAGTargetInfo *GCNSubtarget::getSelectionDAGInfo() const {
  return TSInfo.get();
}

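// Number of operands of a single instruction that may read the constant bus
// (SGPRs or a literal). Pre-GFX10 hardware allows only one; GFX10+ allows two,
// except for the 64-bit shift opcodes listed below, which remain limited to
// one.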
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing them; some
    // instructions changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
  if (isWave32()) {
    // Fix implicit $vcc operands after MIParser has verified that they match
    // the instruction definitions.
    for (auto &MBB : MF) {
      for (auto &MI : MBB)
        InstrInfo.fixImplicitOperands(MI);
    }
  }
}

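// True if V_MAD_F16 has a real encoding on this subtarget, i.e. the pseudo
// maps to a valid MC opcode.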
bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

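// Use GPR indexing mode for vector indexing when movrel is unavailable, or
// when it is explicitly requested via -amdgpu-vgpr-index-mode.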
bool GCNSubtarget::useVGPRIndexMode() const {
  return hasVGPRIndexMode() && (!hasMovrel() || EnableVGPRIndexMode);
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
                                                   getGeneration());
}

unsigned
GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs,
                                       unsigned DynamicVGPRBlockSize) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs,
                                                       DynamicVGPRBlockSize);
}

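// Number of SGPRs that must be set aside for VCC and, depending on the
// generation and whether flat scratch is used, FLAT_SCRATCH and XNACK.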
unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now, assume it is needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

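// Compute the {minimum, maximum} achievable occupancy (waves per EU) for a
// function, starting from the workgroup-size/LDS based bounds and further
// limiting the maximum by SGPR and VGPR usage.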
std::pair<unsigned, unsigned>
GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                               unsigned NumSGPRs, unsigned NumVGPRs) const {
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  // Temporarily check both the attribute and the subtarget feature until the
  // latter is removed.
  if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
    DynamicVGPRBlockSize = getDynamicVGPRBlockSize();

  auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
  unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
  unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);

  // Maximum occupancy may be further limited by high SGPR/VGPR usage.
  MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
  return {std::min(MinOcc, MaxOcc), MaxOcc};
}

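// Maximum number of SGPRs the function may use: start from the limit implied
// by the requested waves per EU, honor an explicit "amdgpu-num-sgpr" request
// where it is valid, and subtract the reserved special registers.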
unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  unsigned Requested =
      F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

  if (Requested != MaxNumSGPRs) {
    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

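// Upper bound on the number of user, system, and synthetic SGPRs that may be
// preloaded; used as the preloaded-SGPR estimate when only the IR function is
// available.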
unsigned GCNSubtarget::getMaxNumPreloadedSGPRs() const {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1;  // private segment wave byte offset

  // Max number of synthetic SGPRs
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
  const auto &[Min, Max] = NumVGPRBounds;

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.

  unsigned Requested = F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", Max);
  if (Requested != Max && hasGFX90AInsts())
    Requested *= 2;

  // Make sure requested value is inside the range of possible VGPR usage.
  return std::clamp(Requested, Min, Max);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is removed.
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
    DynamicVGPRBlockSize = getDynamicVGPRBlockSize();

  std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
  return getBaseMaxNumVGPRs(
      F, {getMinNumVGPRs(Waves.second, DynamicVGPRBlockSize),
          getMaxNumVGPRs(Waves.first, DynamicVGPRBlockSize)});
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  return getMaxNumVGPRs(MF.getFunction());
}

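// Adjust the latency of data dependencies in the scheduling DAG. When the def
// or use is a bundle, derive the latency from the bundled instruction that
// actually defines or reads the register; also recompute the latency of
// implicit $vcc_lo operands that fixImplicitOperands introduced.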
void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() ||
      !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

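// Minimum number of image addresses from which MIMG NSA encoding is used. The
// -amdgpu-nsa-threshold command-line option takes precedence over the
// "amdgpu-nsa-threshold" function attribute, and both are clamped to at least
// 2. GFX12+ does not use the MIMG encoding, so the threshold there is 0.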
unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX12)
    return 0; // Not MIMG encoding.

  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(NSAThreshold.getValue(), 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);

  return NSAThreshold;
}

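// Determine which user SGPRs (kernarg segment pointer, dispatch pointer, queue
// pointer, dispatch id, flat scratch init, etc.) a function needs based on its
// calling convention, its "amdgpu-no-*" attributes, and the subtarget, and
// tally the number of user SGPRs they occupy.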
GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;

  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      // FlatScratchInit cannot be true for graphics CC if enableFlatScratch()
      // is false.
      (ST.enableFlatScratch() ||
       (!AMDGPU::isGraphics(CC) &&
        !F.hasFnAttribute("amdgpu-no-flat-scratch-init"))) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }

  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);

  if (hasPrivateSegmentSize())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
}

void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}

unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}