//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> EnablePowerSched(
    "amdgpu-enable-power-sched",
    cl::desc("Enable scheduling to minimize mAI power bursts"),
    cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

static cl::opt<unsigned> NSAThreshold("amdgpu-nsa-threshold",
    cl::desc("Number of addresses from which to enable MIMG NSA."),
    cl::init(3), cl::Hidden);

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
      !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    // If there is no default wave size it must be a generation before gfx10;
    // these have FeatureWavefrontSize64 in their definition already. For
    // gfx10+ set wave32 as the default.
    ToggleFeature(AMDGPU::FeatureWavefrontSize32);
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  AddressableLocalMemorySize = LocalMemorySize;

  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

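// Report a diagnostic if the subtarget's wavefront-size configuration is
// inconsistent: exactly one of the wavefrontsize32 and wavefrontsize64
// features must be set.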
void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
  LLVMContext &Ctx = F.getContext();
  if (hasFeature(AMDGPU::FeatureWavefrontSize32) ==
      hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    Ctx.diagnose(DiagnosticInfoUnsupported(
        F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
  }
}

AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}

bool AMDGPUSubtarget::useRealTrue16Insts() const {
  return hasTrue16BitInsts() && EnableRealTrue16Insts;
}

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);
  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
  InlineAsmLoweringInfo =
      std::make_unique<InlineAsmLowering>(getTargetLowering());
  Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
  InstSelector =
      std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
}

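// Maximum number of SGPR/literal operands a single VALU instruction may read
// from the constant bus. Targets before GFX10 allow only one; GFX10+ allows
// two, except for the 64-bit shifts listed below, which remain limited to one.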
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
unsigned
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                 const Function &F) const {
  const unsigned WaveSize = getWavefrontSize();
  const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned WavesPerWorkgroup =
      std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);

  const unsigned WorkGroupsPerCU =
      std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);

  return getLocalMemorySize() / WorkGroupsPerCU;
}

// FIXME: Should return min,max range.
//
// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
// be achieved when only the given function is running on the machine, taking
// into account the overall number of wave slots, the (maximum) workgroup
// size, and the per-workgroup LDS allocation size.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                       const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves per CU.
  const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Number of waves per EU (SIMD).
  MaxWaves = divideCeil(MaxWaves, getEUsPerCU());

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::pair(1, getWavefrontSize());
  default:
    return std::pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
    std::pair<unsigned, unsigned> Requested,
    std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default minimum
  // number of waves per execution unit to the value implied by the requested
  // maximum flat work group size.
  unsigned MinImpliedByFlatWorkGroupSize =
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by
  // requested minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
  return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
  for (int I = 0; I < 3; ++I) {
    if (getMaxWorkitemID(Func, I) > 0)
      return false;
  }

  return true;
}

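// Attach a bound on the result of a workitem ID or local size query, either
// as a range return attribute on the call or as !range metadata, based on the
// kernel's flat work group size and any reqd_work_group_size metadata.
// Returns true if a range was attached.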
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  APInt Lower{32, MinSize};
  APInt Upper{32, MaxSize};
  if (auto *CI = dyn_cast<CallBase>(I)) {
    ConstantRange Range(Lower, Upper);
    CI->addRangeRetAttr(Range);
  } else {
    MDBuilder MDB(I->getContext());
    MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper);
    I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  }
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default
  const Module *M = F.getParent();
  unsigned NBytes =
      AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
  return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
                                         NBytes);
}

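// Compute the total size in bytes of the explicit kernel arguments, honoring
// byref argument types and per-argument alignment from the data layout. The
// largest argument alignment seen is returned through MaxAlign.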
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
      F.getCallingConv() != CallingConv::SPIR_KERNEL)
    return 0;

  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset();

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
  if (isWave32()) {
    // Fix implicit $vcc operands after MIParser has verified that they match
    // the instruction definitions.
    for (auto &MBB : MF) {
      for (auto &MI : MBB)
        InstrInfo.fixImplicitOperands(MI);
    }
  }
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
                                                   getGeneration());
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
}

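// Number of SGPRs that must be reserved for VCC and, depending on the
// generation and whether flat scratch is in use, FLAT_SCRATCH and XNACK.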
unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
      std::min(getMaxWavesPerEU(), getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

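// Upper bound on the number of user, system and synthetic SGPRs that may be
// preloaded for a kernel, used when only the IR function is available.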
static unsigned getMaxNumPreloadedSGPRs() {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1;  // private segment wave byte offset

  // Max number of synthetic SGPRs
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

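// Refine the latency of data dependencies that involve bundles by locating
// the bundled instruction that actually defines or reads the register, and
// work around zero-latency VCC_LO dependencies created by the implicit
// operands that SIInstrInfo::fixImplicitOperands adds.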
void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
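// DAG mutation that adds artificial edges so that independent SALU
// instructions can be scheduled into the shadow of long-latency MFMA
// instructions instead of filling it with VALU work.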
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From))
        if (DAG->addEdge(SU, SDep(From, SDep::Artificial)))
          ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) &&
            DAG->canAddEdge(SUv, SU))
          DAG->addEdge(SUv, SDep(SU, SDep::Artificial));
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts())
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. It is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for (; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) ||
            !DAG->canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo)
                          : nullptr;
}

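// Minimum number of image addresses for which the MIMG NSA encoding is used.
// The command-line option and the "amdgpu-nsa-threshold" function attribute
// can override the default; the result is clamped to at least 2.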
unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX12)
    return 0; // Not MIMG encoding.

  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(NSAThreshold.getValue(), 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);

  return 3;
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<GCNSubtarget>());
  return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget &>(TM.getSubtarget<GCNSubtarget>(F));
  return static_cast<const AMDGPUSubtarget &>(
      TM.getSubtarget<R600Subtarget>(F));
}

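// Determine which user SGPRs the function requires, based on its calling
// convention, attributes, and the subtarget, and accumulate the total number
// of user SGPRs used.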
GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
  // FIXME: Should have analysis or something rather than attribute to detect
  // calls.
  const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
  // FIXME: This attribute is a hack, we just need an analysis on the function
  // to look for allocas.
  const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");

  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  // TODO: This could be refined a lot. The attribute is a poor way of
  // detecting calls or stack objects that may require it before argument
  // lowering.
  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }

  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);

  if (hasPrivateSegmentSize())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
}

void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}

unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}

SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3);
}