1//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// AMD GCN specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16
17#include "AMDGPUCallLowering.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUSubtarget.h"
20#include "SIFrameLowering.h"
21#include "SIISelLowering.h"
22#include "SIInstrInfo.h"
23#include "Utils/AMDGPUBaseInfo.h"
24#include "llvm/Support/ErrorHandling.h"
25
26#define GET_SUBTARGETINFO_HEADER
27#include "AMDGPUGenSubtargetInfo.inc"
28
29namespace llvm {
30
31class GCNTargetMachine;
32
33class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
34 public AMDGPUSubtarget {
35public:
36 using AMDGPUSubtarget::getMaxWavesPerEU;
37
38 // Following 2 enums are documented at:
39 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
40 enum class TrapHandlerAbi {
41 NONE = 0x00,
42 AMDHSA = 0x01,
43 };
44
45 enum class TrapID {
46 LLVMAMDHSATrap = 0x02,
47 LLVMAMDHSADebugTrap = 0x03,
48 };
49
50private:
51 /// SelectionDAGISel related APIs.
52 std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
53
54 /// GlobalISel related APIs.
55 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
56 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
57 std::unique_ptr<InstructionSelector> InstSelector;
58 std::unique_ptr<LegalizerInfo> Legalizer;
59 std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
60
61protected:
62 // Basic subtarget description.
63 AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
64 unsigned Gen = INVALID;
65 InstrItineraryData InstrItins;
66 int LDSBankCount = 0;
67 unsigned MaxPrivateElementSize = 0;
68
69 // Dynamically set bits that enable features.
70 bool DynamicVGPR = false;
71 bool DynamicVGPRBlockSize32 = false;
72 bool ScalarizeGlobal = false;
73
74 /// The maximum number of instructions that may be placed within an S_CLAUSE,
75 /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
76 /// indicates a lack of S_CLAUSE support.
77 unsigned MaxHardClauseLength = 0;
78
79#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
80 bool ATTRIBUTE = DEFAULT;
81#include "AMDGPUGenSubtargetInfo.inc"
82
83private:
84 SIInstrInfo InstrInfo;
85 SITargetLowering TLInfo;
86 SIFrameLowering FrameLowering;
87
88public:
89 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
90 const GCNTargetMachine &TM);
91 ~GCNSubtarget() override;
92
93 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, StringRef GPU,
94 StringRef FS);
95
96 /// Diagnose inconsistent subtarget features before attempting to codegen
97 /// function \p F.
98 void checkSubtargetFeatures(const Function &F) const;
99
100 const SIInstrInfo *getInstrInfo() const override { return &InstrInfo; }
101
102 const SIFrameLowering *getFrameLowering() const override {
103 return &FrameLowering;
104 }
105
106 const SITargetLowering *getTargetLowering() const override { return &TLInfo; }
107
108 const SIRegisterInfo *getRegisterInfo() const override {
109 return &InstrInfo.getRegisterInfo();
110 }
111
112 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
113
114 const CallLowering *getCallLowering() const override {
115 return CallLoweringInfo.get();
116 }
117
118 const InlineAsmLowering *getInlineAsmLowering() const override {
119 return InlineAsmLoweringInfo.get();
120 }
121
122 InstructionSelector *getInstructionSelector() const override {
123 return InstSelector.get();
124 }
125
126 const LegalizerInfo *getLegalizerInfo() const override {
127 return Legalizer.get();
128 }
129
130 const AMDGPURegisterBankInfo *getRegBankInfo() const override {
131 return RegBankInfo.get();
132 }
133
134 const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
135 return TargetID;
136 }
137
138 const InstrItineraryData *getInstrItineraryData() const override {
139 return &InstrItins;
140 }
141
142 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
143
144 Generation getGeneration() const { return (Generation)Gen; }
145
146 bool isGFX11Plus() const { return getGeneration() >= GFX11; }
147
148#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
149 bool GETTER() const override { return ATTRIBUTE; }
150#include "AMDGPUGenSubtargetInfo.inc"
151
152 unsigned getMaxWaveScratchSize() const {
153 // See COMPUTE_TMPRING_SIZE.WAVESIZE.
154 if (getGeneration() >= GFX12) {
155 // 18-bit field in units of 64-dword.
156 return (64 * 4) * ((1 << 18) - 1);
157 }
158 if (getGeneration() == GFX11) {
159 // 15-bit field in units of 64-dword.
160 return (64 * 4) * ((1 << 15) - 1);
161 }
162 // 13-bit field in units of 256-dword.
163 return (256 * 4) * ((1 << 13) - 1);
164 }
165
166 /// Return the number of high bits known to be zero for a frame index.
167 unsigned getKnownHighZeroBitsForFrameIndex() const {
168 return llvm::countl_zero(Val: getMaxWaveScratchSize()) + getWavefrontSizeLog2();
169 }
170
171 int getLDSBankCount() const { return LDSBankCount; }
172
173 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
174 return (ForBufferRSrc || !hasFlatScratchEnabled()) ? MaxPrivateElementSize
175 : 16;
176 }
177
178 unsigned getConstantBusLimit(unsigned Opcode) const;
179
180 /// Returns if the result of this instruction with a 16-bit result returned in
181 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
182 /// the original value.
183 bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
184
185 bool supportsWGP() const {
186 if (HasGFX1250Insts)
187 return false;
188 return getGeneration() >= GFX10;
189 }
190
191 bool hasHWFP64() const { return HasFP64; }
192
193 bool hasAddr64() const {
194 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
195 }
196
197 bool hasFlat() const {
198 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
199 }
200
201 // Return true if the target only has the reverse operand versions of VALU
202 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
203 bool hasOnlyRevVALUShifts() const {
204 return getGeneration() >= VOLCANIC_ISLANDS;
205 }
206
207 bool hasFractBug() const { return getGeneration() == SOUTHERN_ISLANDS; }
208
209 bool hasMed3_16() const { return getGeneration() >= AMDGPUSubtarget::GFX9; }
210
211 bool hasMin3Max3_16() const {
212 return getGeneration() >= AMDGPUSubtarget::GFX9;
213 }
214
215 bool hasSwap() const { return HasGFX9Insts; }
216
217 bool hasScalarPackInsts() const { return HasGFX9Insts; }
218
219 bool hasScalarMulHiInsts() const { return HasGFX9Insts; }
220
221 bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
222
223 TrapHandlerAbi getTrapHandlerAbi() const {
224 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
225 }
226
227 bool supportsGetDoorbellID() const {
228 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
229 return getGeneration() >= GFX9;
230 }
231
232 /// True if the offset field of DS instructions works as expected. On SI, the
233 /// offset uses a 16-bit adder and does not always wrap properly.
234 bool hasUsableDSOffset() const { return getGeneration() >= SEA_ISLANDS; }
235
236 bool unsafeDSOffsetFoldingEnabled() const {
237 return EnableUnsafeDSOffsetFolding;
238 }
239
240 /// Condition output from div_scale is usable.
241 bool hasUsableDivScaleConditionOutput() const {
242 return getGeneration() != SOUTHERN_ISLANDS;
243 }
244
245 /// Extra wait hazard is needed in some cases before
246 /// s_cbranch_vccnz/s_cbranch_vccz.
247 bool hasReadVCCZBug() const { return getGeneration() <= SEA_ISLANDS; }
248
249 /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
250 bool partialVCCWritesUpdateVCCZ() const { return getGeneration() >= GFX10; }
251
252 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
253 /// was written by a VALU instruction.
254 bool hasSMRDReadVALUDefHazard() const {
255 return getGeneration() == SOUTHERN_ISLANDS;
256 }
257
258 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
259 /// SGPR was written by a VALU Instruction.
260 bool hasVMEMReadSGPRVALUDefHazard() const {
261 return getGeneration() >= VOLCANIC_ISLANDS;
262 }
263
264 bool hasRFEHazards() const { return getGeneration() >= VOLCANIC_ISLANDS; }
265
266 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
267 unsigned getSetRegWaitStates() const {
268 return getGeneration() <= SEA_ISLANDS ? 1 : 2;
269 }
270
271 /// Return the amount of LDS that can be used that will not restrict the
272 /// occupancy lower than WaveCount.
273 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
274 const Function &) const;
275
276 bool supportsMinMaxDenormModes() const {
277 return getGeneration() >= AMDGPUSubtarget::GFX9;
278 }
279
280 /// \returns If target supports S_DENORM_MODE.
281 bool hasDenormModeInst() const {
282 return getGeneration() >= AMDGPUSubtarget::GFX10;
283 }
284
285 /// \returns If target supports ds_read/write_b128 and user enables generation
286 /// of ds_read/write_b128.
287 bool useDS128() const { return HasCIInsts && EnableDS128; }
288
289 /// \return If target supports ds_read/write_b96/128.
290 bool hasDS96AndDS128() const { return HasCIInsts; }
291
292 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
293 bool haveRoundOpsF64() const { return HasCIInsts; }
294
295 /// \returns If MUBUF instructions always perform range checking, even for
296 /// buffer resources used for private memory access.
297 bool privateMemoryResourceIsRangeChecked() const {
298 return getGeneration() < AMDGPUSubtarget::GFX9;
299 }
300
301 /// \returns If target requires PRT Struct NULL support (zero result registers
302 /// for sparse texture support).
303 bool usePRTStrictNull() const { return EnablePRTStrictNull; }
304
305 bool hasUnalignedBufferAccessEnabled() const {
306 return HasUnalignedBufferAccess && HasUnalignedAccessMode;
307 }
308
309 bool hasUnalignedDSAccessEnabled() const {
310 return HasUnalignedDSAccess && HasUnalignedAccessMode;
311 }
312
313 bool hasUnalignedScratchAccessEnabled() const {
314 return HasUnalignedScratchAccess && HasUnalignedAccessMode;
315 }
316
317 bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); }
318
319 bool isTgSplitEnabled() const { return EnableTgSplit; }
320
321 bool isCuModeEnabled() const { return EnableCuMode; }
322
323 bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
324
325 bool hasFlatScrRegister() const { return hasFlatAddressSpace(); }
326
327 // Check if target supports ST addressing mode with FLAT scratch instructions.
328 // The ST addressing mode means no registers are used, either VGPR or SGPR,
329 // but only immediate offset is swizzled and added to the FLAT scratch base.
330 bool hasFlatScratchSTMode() const {
331 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
332 }
333
334 bool hasFlatScratchSVSMode() const { return HasGFX940Insts || HasGFX11Insts; }
335
336 bool hasFlatScratchEnabled() const {
337 return hasArchitectedFlatScratch() ||
338 (EnableFlatScratch && hasFlatScratchInsts());
339 }
340
341 bool hasGlobalAddTidInsts() const { return HasGFX10_BEncoding; }
342
343 bool hasAtomicCSub() const { return HasGFX10_BEncoding; }
344
345 bool hasMTBUFInsts() const { return !hasGFX1250Insts(); }
346
347 bool hasFormattedMUBUFInsts() const { return !hasGFX1250Insts(); }
348
349 bool hasExportInsts() const {
350 return !hasGFX940Insts() && !hasGFX1250Insts();
351 }
352
353 bool hasVINTERPEncoding() const {
354 return HasGFX11Insts && !hasGFX1250Insts();
355 }
356
357 // DS_ADD_F64/DS_ADD_RTN_F64
358 bool hasLdsAtomicAddF64() const {
359 return hasGFX90AInsts() || hasGFX1250Insts();
360 }
361
362 bool hasMultiDwordFlatScratchAddressing() const {
363 return getGeneration() >= GFX9;
364 }
365
366 bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; }
367
368 bool hasD16LoadStore() const { return getGeneration() >= GFX9; }
369
370 bool d16PreservesUnusedBits() const {
371 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
372 }
373
374 bool hasD16Images() const { return getGeneration() >= VOLCANIC_ISLANDS; }
375
376 /// Return if most LDS instructions have an m0 use that require m0 to be
377 /// initialized.
378 bool ldsRequiresM0Init() const { return getGeneration() < GFX9; }
379
380 // True if the hardware rewinds and replays GWS operations if a wave is
381 // preempted.
382 //
383 // If this is false, a GWS operation requires testing if a nack set the
384 // MEM_VIOL bit, and repeating if so.
385 bool hasGWSAutoReplay() const { return getGeneration() >= GFX9; }
386
387 /// \returns if target has ds_gws_sema_release_all instruction.
388 bool hasGWSSemaReleaseAll() const { return HasCIInsts; }
389
390 bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
391
392 bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
393
394 // Covers VS/PS/CS graphics shaders
395 bool isMesaGfxShader(const Function &F) const {
396 return isMesa3DOS() && AMDGPU::isShader(CC: F.getCallingConv());
397 }
398
399 bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; }
400
401 bool hasAtomicFaddInsts() const {
402 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
403 }
404
405 bool vmemWriteNeedsExpWaitcnt() const {
406 return getGeneration() < SEA_ISLANDS;
407 }
408
409 bool hasInstPrefetch() const {
410 return getGeneration() == GFX10 || getGeneration() == GFX11;
411 }
412
413 bool hasPrefetch() const { return HasGFX12Insts; }
414
415 // Has s_cmpk_* instructions.
416 bool hasSCmpK() const { return getGeneration() < GFX12; }
417
418 // Scratch is allocated in 256 dword per wave blocks for the entire
419 // wavefront. When viewed from the perspective of an arbitrary workitem, this
420 // is 4-byte aligned.
421 //
422 // Only 4-byte alignment is really needed to access anything. Transformations
423 // on the pointer value itself may rely on the alignment / known low bits of
424 // the pointer. Set this to something above the minimum to avoid needing
425 // dynamic realignment in common cases.
426 Align getStackAlignment() const { return Align(16); }
427
428 bool enableMachineScheduler() const override { return true; }
429
430 bool useAA() const override;
431
432 bool enableSubRegLiveness() const override { return true; }
433
434 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
435 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
436
437 // static wrappers
438 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
439
440 // XXX - Why is this here if it isn't in the default pass set?
441 bool enableEarlyIfConversion() const override { return true; }
442
443 void overrideSchedPolicy(MachineSchedPolicy &Policy,
444 const SchedRegion &Region) const override;
445
446 void overridePostRASchedPolicy(MachineSchedPolicy &Policy,
447 const SchedRegion &Region) const override;
448
449 void mirFileLoaded(MachineFunction &MF) const override;
450
451 unsigned getMaxNumUserSGPRs() const {
452 return AMDGPU::getMaxNumUserSGPRs(STI: *this);
453 }
454
455 bool useVGPRIndexMode() const;
456
457 bool hasScalarCompareEq64() const {
458 return getGeneration() >= VOLCANIC_ISLANDS;
459 }
460
461 bool hasLDSFPAtomicAddF32() const { return HasGFX8Insts; }
462 bool hasLDSFPAtomicAddF64() const {
463 return HasGFX90AInsts || HasGFX1250Insts;
464 }
465
466 /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
467 bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
468
469 /// \returns true if the subtarget has the v_permlane64_b32 instruction.
470 bool hasPermLane64() const { return getGeneration() >= GFX11; }
471
472 bool hasDPPBroadcasts() const { return HasDPP && getGeneration() < GFX10; }
473
474 bool hasDPPWavefrontShifts() const {
475 return HasDPP && getGeneration() < GFX10;
476 }
477
478 // Has V_PK_MOV_B32 opcode
479 bool hasPkMovB32() const { return HasGFX90AInsts; }
480
481 bool hasFmaakFmamkF32Insts() const {
482 return getGeneration() >= GFX10 || hasGFX940Insts();
483 }
484
485 bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); }
486
487 bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
488
489 unsigned getNSAMaxSize(bool HasSampler = false) const {
490 return AMDGPU::getNSAMaxSize(STI: *this, HasSampler);
491 }
492
493 bool hasMadF16() const;
494
495 bool hasMovB64() const { return HasGFX940Insts || HasGFX1250Insts; }
496
497 // Scalar and global loads support scale_offset bit.
498 bool hasScaleOffset() const { return HasGFX1250Insts; }
499
500 // FLAT GLOBAL VOffset is signed
501 bool hasSignedGVSOffset() const { return HasGFX1250Insts; }
502
503 bool loadStoreOptEnabled() const { return EnableLoadStoreOpt; }
504
505 bool hasUserSGPRInit16BugInWave32() const {
506 return HasUserSGPRInit16Bug && isWave32();
507 }
508
509 bool has12DWordStoreHazard() const {
510 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
511 }
512
513 // \returns true if the subtarget supports DWORDX3 load/store instructions.
514 bool hasDwordx3LoadStores() const { return HasCIInsts; }
515
516 bool hasReadM0MovRelInterpHazard() const {
517 return getGeneration() == AMDGPUSubtarget::GFX9;
518 }
519
520 bool hasReadM0SendMsgHazard() const {
521 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
522 getGeneration() <= AMDGPUSubtarget::GFX9;
523 }
524
525 bool hasReadM0LdsDmaHazard() const {
526 return getGeneration() == AMDGPUSubtarget::GFX9;
527 }
528
529 bool hasReadM0LdsDirectHazard() const {
530 return getGeneration() == AMDGPUSubtarget::GFX9;
531 }
532
533 bool hasLDSMisalignedBugInWGPMode() const {
534 return HasLDSMisalignedBug && !EnableCuMode;
535 }
536
537 // Shift amount of a 64 bit shift cannot be a highest allocated register
538 // if also at the end of the allocation block.
539 bool hasShift64HighRegBug() const {
540 return HasGFX90AInsts && !HasGFX940Insts;
541 }
542
543 // Has one cycle hazard on transcendental instruction feeding a
544 // non transcendental VALU.
545 bool hasTransForwardingHazard() const { return HasGFX940Insts; }
546
547 // Has one cycle hazard on a VALU instruction partially writing dst with
548 // a shift of result bits feeding another VALU instruction.
549 bool hasDstSelForwardingHazard() const { return HasGFX940Insts; }
550
551 // Cannot use op_sel with v_dot instructions.
552 bool hasDOTOpSelHazard() const { return HasGFX940Insts || HasGFX11Insts; }
553
554 // Does not have HW interlocs for VALU writing and then reading SGPRs.
555 bool hasVDecCoExecHazard() const { return HasGFX940Insts; }
556
557 bool hasHardClauses() const { return MaxHardClauseLength > 0; }
558
559 bool hasFPAtomicToDenormModeHazard() const {
560 return getGeneration() == GFX10;
561 }
562
563 bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
564
565 bool hasLdsDirect() const { return getGeneration() >= GFX11; }
566
567 bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
568
569 bool hasVALUPartialForwardingHazard() const {
570 return getGeneration() == GFX11;
571 }
572
573 bool hasCvtScaleForwardingHazard() const { return HasGFX950Insts; }
574
575 bool requiresCodeObjectV6() const { return RequiresCOV6; }
576
577 bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
578
579 bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
580
581 bool hasVALUReadSGPRHazard() const {
582 return HasGFX12Insts && !HasGFX1250Insts;
583 }
584
585 bool setRegModeNeedsVNOPs() const {
586 return HasGFX1250Insts && getGeneration() == GFX12;
587 }
588
589 /// Return if operations acting on VGPR tuples require even alignment.
590 bool needsAlignedVGPRs() const { return RequiresAlignVGPR; }
591
592 /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
593 bool hasSPackHL() const { return HasGFX11Insts; }
594
595 /// Return true if the target's EXP instruction has the COMPR flag, which
596 /// affects the meaning of the EN (enable) bits.
597 bool hasCompressedExport() const { return !HasGFX11Insts; }
598
599 /// Return true if the target's EXP instruction supports the NULL export
600 /// target.
601 bool hasNullExportTarget() const { return !HasGFX11Insts; }
602
603 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
604
605 /// Return true if the target has the S_DELAY_ALU instruction.
606 bool hasDelayAlu() const { return HasGFX11Insts; }
607
608 /// Returns true if the target supports
609 /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
610 /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
611 bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); }
612
613 /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
614 /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
615 bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
616
617 /// \returns true if inline constants are not supported for F16 pseudo
618 /// scalar transcendentals.
619 bool hasNoF16PseudoScalarTransInlineConstants() const {
620 return getGeneration() == GFX12;
621 }
622
623 /// \returns true if the target has packed f32 instructions that only read 32
624 /// bits from a scalar operand (SGPR or literal) and replicates the bits to
625 /// both channels.
626 bool hasPKF32InstsReplicatingLower32BitsOfScalarInput() const {
627 return getGeneration() == GFX12 && HasGFX1250Insts;
628 }
629
630 bool hasAddPC64Inst() const { return HasGFX1250Insts; }
631
632 /// \returns true if the target supports expert scheduling mode 2 which relies
633 /// on the compiler to insert waits to avoid hazards between VMEM and VALU
634 /// instructions in some instances.
635 bool hasExpertSchedulingMode() const { return getGeneration() >= GFX12; }
636
637 /// \returns The maximum number of instructions that can be enclosed in an
638 /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
639 /// instruction.
640 unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
641
642 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
643 /// SGPRs
644 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
645
646 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
647 /// VGPRs
648 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
649 unsigned DynamicVGPRBlockSize) const;
650
651 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
652 /// be achieved when the only function running on a CU is \p F, each workgroup
653 /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
654 /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
655 /// range, so this returns a range as well.
656 ///
657 /// Note that occupancy can be affected by the scratch allocation as well, but
658 /// we do not have enough information to compute it.
659 std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
660 unsigned LDSSize = 0,
661 unsigned NumSGPRs = 0,
662 unsigned NumVGPRs = 0) const;
663
664 /// \returns true if the flat_scratch register should be initialized with the
665 /// pointer to the wave's scratch memory rather than a size and offset.
666 bool flatScratchIsPointer() const {
667 return getGeneration() >= AMDGPUSubtarget::GFX9;
668 }
669
670 /// \returns true if the machine has merged shaders in which s0-s7 are
671 /// reserved by the hardware and user SGPRs start at s8
672 bool hasMergedShaders() const { return getGeneration() >= GFX9; }
673
674 // \returns true if the target supports the pre-NGG legacy geometry path.
675 bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
676
677 // \returns true if the target has split barriers feature
678 bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
679
680 // \returns true if the target has DX10_CLAMP kernel descriptor mode bit
681 bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
682
683 // \returns true if the target has IEEE kernel descriptor mode bit
684 bool hasIEEEMode() const { return getGeneration() < GFX12; }
685
686 // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
687 bool hasRrWGMode() const { return getGeneration() >= GFX12; }
688
689 /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
690 /// values.
691 bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
692
693 bool hasINVWBL2WaitCntRequirement() const { return HasGFX1250Insts; }
694
695 bool hasVOPD3() const { return HasGFX1250Insts; }
696
697 // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
698 bool hasVectorMulU64() const { return HasGFX1250Insts; }
699
700 // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32
701 // instructions.
702 bool hasMadU64U32NoCarry() const { return HasGFX1250Insts; }
703
704 // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions.
705 bool hasIntMinMax64() const { return HasGFX1250Insts; }
706
707 // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
708 bool hasPkMinMax3Insts() const { return HasGFX1250Insts; }
709
  // \returns true if target has S_GET_SHADER_CYCLES_U64 instruction.
  bool hasSGetShaderCyclesInst() const { return HasGFX1250Insts; }
712
713 // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
714 // of sign-extending. Note that GFX1250 has not only fixed the bug but also
715 // extended VA to 57 bits.
716 bool hasGetPCZeroExtension() const {
717 return HasGFX12Insts && !HasGFX1250Insts;
718 }
719
720 // \returns true if the target needs to create a prolog for backward
721 // compatibility when preloading kernel arguments.
722 bool needsKernArgPreloadProlog() const {
723 return hasKernargPreload() && !HasGFX1250Insts;
724 }
725
726 bool hasCondSubInsts() const { return HasGFX12Insts; }
727
728 bool hasSubClampInsts() const { return hasGFX10_3Insts(); }
729
730 /// \returns SGPR allocation granularity supported by the subtarget.
731 unsigned getSGPRAllocGranule() const {
732 return AMDGPU::IsaInfo::getSGPRAllocGranule(STI: this);
733 }
734
735 /// \returns SGPR encoding granularity supported by the subtarget.
736 unsigned getSGPREncodingGranule() const {
737 return AMDGPU::IsaInfo::getSGPREncodingGranule(STI: this);
738 }
739
740 /// \returns Total number of SGPRs supported by the subtarget.
741 unsigned getTotalNumSGPRs() const {
742 return AMDGPU::IsaInfo::getTotalNumSGPRs(STI: this);
743 }
744
745 /// \returns Addressable number of SGPRs supported by the subtarget.
746 unsigned getAddressableNumSGPRs() const {
747 return AMDGPU::IsaInfo::getAddressableNumSGPRs(STI: this);
748 }
749
750 /// \returns Minimum number of SGPRs that meets the given number of waves per
751 /// execution unit requirement supported by the subtarget.
752 unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
753 return AMDGPU::IsaInfo::getMinNumSGPRs(STI: this, WavesPerEU);
754 }
755
756 /// \returns Maximum number of SGPRs that meets the given number of waves per
757 /// execution unit requirement supported by the subtarget.
758 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
759 return AMDGPU::IsaInfo::getMaxNumSGPRs(STI: this, WavesPerEU, Addressable);
760 }
761
762 /// \returns Reserved number of SGPRs. This is common
763 /// utility function called by MachineFunction and
764 /// Function variants of getReservedNumSGPRs.
765 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
766 /// \returns Reserved number of SGPRs for given machine function \p MF.
767 unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
768
769 /// \returns Reserved number of SGPRs for given function \p F.
770 unsigned getReservedNumSGPRs(const Function &F) const;
771
772 /// \returns Maximum number of preloaded SGPRs for the subtarget.
773 unsigned getMaxNumPreloadedSGPRs() const;
774
775 /// \returns max num SGPRs. This is the common utility
776 /// function called by MachineFunction and Function
777 /// variants of getMaxNumSGPRs.
778 unsigned getBaseMaxNumSGPRs(const Function &F,
779 std::pair<unsigned, unsigned> WavesPerEU,
780 unsigned PreloadedSGPRs,
781 unsigned ReservedNumSGPRs) const;
782
783 /// \returns Maximum number of SGPRs that meets number of waves per execution
784 /// unit requirement for function \p MF, or number of SGPRs explicitly
785 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
786 ///
787 /// \returns Value that meets number of waves per execution unit requirement
788 /// if explicitly requested value cannot be converted to integer, violates
789 /// subtarget's specifications, or does not meet number of waves per execution
790 /// unit requirement.
791 unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
792
793 /// \returns Maximum number of SGPRs that meets number of waves per execution
794 /// unit requirement for function \p F, or number of SGPRs explicitly
795 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
796 ///
797 /// \returns Value that meets number of waves per execution unit requirement
798 /// if explicitly requested value cannot be converted to integer, violates
799 /// subtarget's specifications, or does not meet number of waves per execution
800 /// unit requirement.
801 unsigned getMaxNumSGPRs(const Function &F) const;
802
803 /// \returns VGPR allocation granularity supported by the subtarget.
804 unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
805 return AMDGPU::IsaInfo::getVGPRAllocGranule(STI: this, DynamicVGPRBlockSize);
806 }
807
808 /// \returns VGPR encoding granularity supported by the subtarget.
809 unsigned getVGPREncodingGranule() const {
810 return AMDGPU::IsaInfo::getVGPREncodingGranule(STI: this);
811 }
812
813 /// \returns Total number of VGPRs supported by the subtarget.
814 unsigned getTotalNumVGPRs() const {
815 return AMDGPU::IsaInfo::getTotalNumVGPRs(STI: this);
816 }
817
818 /// \returns Addressable number of architectural VGPRs supported by the
819 /// subtarget.
820 unsigned getAddressableNumArchVGPRs() const {
821 return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(STI: this);
822 }
823
824 /// \returns Addressable number of VGPRs supported by the subtarget.
825 unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
826 return AMDGPU::IsaInfo::getAddressableNumVGPRs(STI: this, DynamicVGPRBlockSize);
827 }
828
829 /// \returns the minimum number of VGPRs that will prevent achieving more than
830 /// the specified number of waves \p WavesPerEU.
831 unsigned getMinNumVGPRs(unsigned WavesPerEU,
832 unsigned DynamicVGPRBlockSize) const {
833 return AMDGPU::IsaInfo::getMinNumVGPRs(STI: this, WavesPerEU,
834 DynamicVGPRBlockSize);
835 }
836
837 /// \returns the maximum number of VGPRs that can be used and still achieved
838 /// at least the specified number of waves \p WavesPerEU.
839 unsigned getMaxNumVGPRs(unsigned WavesPerEU,
840 unsigned DynamicVGPRBlockSize) const {
841 return AMDGPU::IsaInfo::getMaxNumVGPRs(STI: this, WavesPerEU,
842 DynamicVGPRBlockSize);
843 }
844
  /// \returns max num VGPRs. This is the common utility function
  /// called by MachineFunction and Function variants of getMaxNumVGPRs.
  /// \p NumVGPRBounds presumably carries the {min, max} VGPR bounds to clamp
  /// against — confirm in the out-of-line definition.
  unsigned
  getBaseMaxNumVGPRs(const Function &F,
                     std::pair<unsigned, unsigned> NumVGPRBounds) const;

  /// \returns Maximum number of VGPRs that meets number of waves per execution
  /// unit requirement for function \p F, or number of VGPRs explicitly
  /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
  ///
  /// \returns Value that meets number of waves per execution unit requirement
  /// if explicitly requested value cannot be converted to integer, violates
  /// subtarget's specifications, or does not meet number of waves per execution
  /// unit requirement. Defined out-of-line.
  unsigned getMaxNumVGPRs(const Function &F) const;
860
861 unsigned getMaxNumAGPRs(const Function &F) const { return getMaxNumVGPRs(F); }
862
  /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
  /// of waves per execution unit required for the function \p F. Defined
  /// out-of-line.
  std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const;

  /// \returns Maximum number of VGPRs that meets number of waves per execution
  /// unit requirement for function \p MF, or number of VGPRs explicitly
  /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
  ///
  /// \returns Value that meets number of waves per execution unit requirement
  /// if explicitly requested value cannot be converted to integer, violates
  /// subtarget's specifications, or does not meet number of waves per execution
  /// unit requirement. Defined out-of-line.
  unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
876
877 bool supportsWave32() const { return getGeneration() >= GFX10; }
878
879 bool supportsWave64() const { return !hasGFX1250Insts(); }
880
881 bool isWave32() const { return getWavefrontSize() == 32; }
882
883 bool isWave64() const { return getWavefrontSize() == 64; }
884
885 /// Returns if the wavesize of this subtarget is known reliable. This is false
886 /// only for the a default target-cpu that does not have an explicit
887 /// +wavefrontsize target feature.
888 bool isWaveSizeKnown() const {
889 return hasFeature(Feature: AMDGPU::FeatureWavefrontSize32) ||
890 hasFeature(Feature: AMDGPU::FeatureWavefrontSize64);
891 }
892
893 const TargetRegisterClass *getBoolRC() const {
894 return getRegisterInfo()->getBoolRC();
895 }
896
897 /// \returns Maximum number of work groups per compute unit supported by the
898 /// subtarget and limited by given \p FlatWorkGroupSize.
899 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
900 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(STI: this, FlatWorkGroupSize);
901 }
902
903 /// \returns Minimum flat work group size supported by the subtarget.
904 unsigned getMinFlatWorkGroupSize() const override {
905 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(STI: this);
906 }
907
908 /// \returns Maximum flat work group size supported by the subtarget.
909 unsigned getMaxFlatWorkGroupSize() const override {
910 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(STI: this);
911 }
912
913 /// \returns Number of waves per execution unit required to support the given
914 /// \p FlatWorkGroupSize.
915 unsigned
916 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
917 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(STI: this, FlatWorkGroupSize);
918 }
919
920 /// \returns Minimum number of waves per execution unit supported by the
921 /// subtarget.
922 unsigned getMinWavesPerEU() const override {
923 return AMDGPU::IsaInfo::getMinWavesPerEU(STI: this);
924 }
925
  /// Target override of the scheduler hook that adjusts the dependency \p Dep
  /// between \p Def and \p Use (see TargetSubtargetInfo::adjustSchedDependency).
  /// Implemented out-of-line.
  void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
                             SDep &Dep,
                             const TargetSchedModel *SchedModel) const override;
929
930 // \returns true if it's beneficial on this subtarget for the scheduler to
931 // cluster stores as well as loads.
932 bool shouldClusterStores() const { return getGeneration() >= GFX11; }
933
  // \returns the number of address arguments from which to enable MIMG NSA
  // on supported architectures. Defined out-of-line.
  unsigned getNSAThreshold(const MachineFunction &MF) const;
937
  // \returns true if the subtarget has a hazard requiring an "s_nop 0"
  // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)". Subtargets
  // with GFX1250 instructions do not have this hazard.
  bool requiresNopBeforeDeallocVGPRs() const { return !HasGFX1250Insts; }
941
  // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on
  // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER. Only subtargets
  // with GFX1250 instructions require this workaround.
  bool requiresWaitIdleBeforeGetReg() const { return HasGFX1250Insts; }
945
946 bool isDynamicVGPREnabled() const { return DynamicVGPR; }
947 unsigned getDynamicVGPRBlockSize() const {
948 return DynamicVGPRBlockSize32 ? 32 : 16;
949 }
950
  /// Target hook: AMDGPU doesn't care if early-clobber and undef operands are
  /// allocated to the same register, so the requirement is waived.
  bool requiresDisjointEarlyClobberAndUndef() const override {
    // AMDGPU doesn't care if early-clobber and undef operands are allocated
    // to the same register.
    return false;
  }
956
957 // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything
958 // and surronded by S_WAIT_ALU(0xFFE3).
959 bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const {
960 return getGeneration() == GFX12;
961 }
962
963 // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base
964 // read.
965 bool hasScratchBaseForwardingHazard() const {
966 return HasGFX1250Insts && getGeneration() == GFX12;
967 }
968
969 // src_flat_scratch_hi cannot be used as a source in SALU producing a 64-bit
970 // result.
971 bool hasFlatScratchHiInB64InstHazard() const {
972 return HasGFX1250Insts && getGeneration() == GFX12;
973 }
974
  /// \returns true if the subtarget requires a wait for xcnt before VMEM
  /// accesses that must never be repeated in the event of a page fault/re-try.
  /// Atomic stores/rmw and all volatile accesses fall under this criterion.
  /// Only subtargets with GFX1250 instructions are affected.
  bool requiresWaitXCntForSingleAccessInstructions() const {
    return HasGFX1250Insts;
  }
981
982 /// \returns the number of significant bits in the immediate field of the
983 /// S_NOP instruction.
984 unsigned getSNopBits() const {
985 if (getGeneration() >= AMDGPUSubtarget::GFX12)
986 return 7;
987 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
988 return 4;
989 return 3;
990 }
991
992 bool supportsBPermute() const {
993 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
994 }
995
996 bool supportsWaveWideBPermute() const {
997 return (getGeneration() <= AMDGPUSubtarget::GFX9 ||
998 getGeneration() == AMDGPUSubtarget::GFX12) ||
999 isWave32();
1000 }
1001
1002 /// Return true if real (non-fake) variants of True16 instructions using
1003 /// 16-bit registers should be code-generated. Fake True16 instructions are
1004 /// identical to non-fake ones except that they take 32-bit registers as
1005 /// operands and always use their low halves.
1006 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
1007 // supported and the support for fake True16 instructions is removed.
1008 bool useRealTrue16Insts() const {
1009 return hasTrue16BitInsts() && EnableRealTrue16Insts;
1010 }
1011
1012 bool requiresWaitOnWorkgroupReleaseFence() const {
1013 return getGeneration() >= GFX10 || isTgSplitEnabled();
1014 }
1015};
1016
1017class GCNUserSGPRUsageInfo {
1018public:
1019 bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
1020
1021 bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
1022
1023 bool hasDispatchPtr() const { return DispatchPtr; }
1024
1025 bool hasQueuePtr() const { return QueuePtr; }
1026
1027 bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
1028
1029 bool hasDispatchID() const { return DispatchID; }
1030
1031 bool hasFlatScratchInit() const { return FlatScratchInit; }
1032
1033 bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }
1034
1035 unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1036
1037 unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1038
1039 unsigned getNumFreeUserSGPRs();
1040
1041 void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1042
1043 enum UserSGPRID : unsigned {
1044 ImplicitBufferPtrID = 0,
1045 PrivateSegmentBufferID = 1,
1046 DispatchPtrID = 2,
1047 QueuePtrID = 3,
1048 KernargSegmentPtrID = 4,
1049 DispatchIdID = 5,
1050 FlatScratchInitID = 6,
1051 PrivateSegmentSizeID = 7
1052 };
1053
1054 // Returns the size in number of SGPRs for preload user SGPR field.
1055 static unsigned getNumUserSGPRForField(UserSGPRID ID) {
1056 switch (ID) {
1057 case ImplicitBufferPtrID:
1058 return 2;
1059 case PrivateSegmentBufferID:
1060 return 4;
1061 case DispatchPtrID:
1062 return 2;
1063 case QueuePtrID:
1064 return 2;
1065 case KernargSegmentPtrID:
1066 return 2;
1067 case DispatchIdID:
1068 return 2;
1069 case FlatScratchInitID:
1070 return 2;
1071 case PrivateSegmentSizeID:
1072 return 1;
1073 }
1074 llvm_unreachable("Unknown UserSGPRID.");
1075 }
1076
1077 GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
1078
1079private:
1080 const GCNSubtarget &ST;
1081
1082 // Private memory buffer
1083 // Compute directly in sgpr[0:1]
1084 // Other shaders indirect 64-bits at sgpr[0:1]
1085 bool ImplicitBufferPtr = false;
1086
1087 bool PrivateSegmentBuffer = false;
1088
1089 bool DispatchPtr = false;
1090
1091 bool QueuePtr = false;
1092
1093 bool KernargSegmentPtr = false;
1094
1095 bool DispatchID = false;
1096
1097 bool FlatScratchInit = false;
1098
1099 bool PrivateSegmentSize = false;
1100
1101 unsigned NumKernargPreloadSGPRs = 0;
1102
1103 unsigned NumUsedUserSGPRs = 0;
1104};
1105
1106} // end namespace llvm
1107
1108#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1109