1//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// AMD GCN specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16
17#include "AMDGPUCallLowering.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUSubtarget.h"
20#include "SIFrameLowering.h"
21#include "SIISelLowering.h"
22#include "SIInstrInfo.h"
23#include "Utils/AMDGPUBaseInfo.h"
24#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
25#include "llvm/Support/ErrorHandling.h"
26
27#define GET_SUBTARGETINFO_HEADER
28#include "AMDGPUGenSubtargetInfo.inc"
29
30namespace llvm {
31
32class GCNTargetMachine;
33
34class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
35 public AMDGPUSubtarget {
36public:
37 using AMDGPUSubtarget::getMaxWavesPerEU;
38
39 // Following 2 enums are documented at:
40 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
/// Trap handler ABI selector: NONE has value 0 and AMDHSA has value 1.
enum class TrapHandlerAbi {
  NONE = 0,
  AMDHSA = 1,
};
45
/// Trap identifiers: the LLVM AMDHSA trap is 2 and the debug trap is 3.
enum class TrapID {
  LLVMAMDHSATrap = 2,
  LLVMAMDHSADebugTrap = 3,
};
50
51private:
52 /// GlobalISel related APIs.
53 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
54 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
55 std::unique_ptr<InstructionSelector> InstSelector;
56 std::unique_ptr<LegalizerInfo> Legalizer;
57 std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
58
59protected:
60 // Basic subtarget description.
61 Triple TargetTriple;
62 AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
63 unsigned Gen = INVALID;
64 InstrItineraryData InstrItins;
65 int LDSBankCount = 0;
66 unsigned MaxPrivateElementSize = 0;
67
68 // Possibly statically set by tablegen, but may want to be overridden.
69 bool FastDenormalF32 = false;
70 bool HalfRate64Ops = false;
71 bool FullRate64Ops = false;
72
73 // Dynamically set bits that enable features.
74 bool FlatForGlobal = false;
75 bool AutoWaitcntBeforeBarrier = false;
76 bool BackOffBarrier = false;
77 bool UnalignedScratchAccess = false;
78 bool UnalignedAccessMode = false;
79 bool HasApertureRegs = false;
80 bool SupportsXNACK = false;
81 bool KernargPreload = false;
82
83 // This should not be used directly. 'TargetID' tracks the dynamic settings
84 // for XNACK.
85 bool EnableXNACK = false;
86
87 bool EnableTgSplit = false;
88 bool EnableCuMode = false;
89 bool TrapHandler = false;
90 bool EnablePreciseMemory = false;
91
92 // Used as options.
93 bool EnableLoadStoreOpt = false;
94 bool EnableUnsafeDSOffsetFolding = false;
95 bool EnableSIScheduler = false;
96 bool EnableDS128 = false;
97 bool EnablePRTStrictNull = false;
98 bool DumpCode = false;
99
// Static subtarget properties set by tablegen.
101 bool FP64 = false;
102 bool FMA = false;
103 bool MIMG_R128 = false;
104 bool CIInsts = false;
105 bool GFX8Insts = false;
106 bool GFX9Insts = false;
107 bool GFX90AInsts = false;
108 bool GFX940Insts = false;
109 bool GFX10Insts = false;
110 bool GFX11Insts = false;
111 bool GFX12Insts = false;
112 bool GFX10_3Insts = false;
113 bool GFX7GFX8GFX9Insts = false;
114 bool SGPRInitBug = false;
115 bool UserSGPRInit16Bug = false;
116 bool NegativeScratchOffsetBug = false;
117 bool NegativeUnalignedScratchOffsetBug = false;
118 bool HasSMemRealTime = false;
119 bool HasIntClamp = false;
120 bool HasFmaMixInsts = false;
121 bool HasMovrel = false;
122 bool HasVGPRIndexMode = false;
123 bool HasScalarDwordx3Loads = false;
124 bool HasScalarStores = false;
125 bool HasScalarAtomics = false;
126 bool HasSDWAOmod = false;
127 bool HasSDWAScalar = false;
128 bool HasSDWASdst = false;
129 bool HasSDWAMac = false;
130 bool HasSDWAOutModsVOPC = false;
131 bool HasDPP = false;
132 bool HasDPP8 = false;
133 bool HasDPALU_DPP = false;
134 bool HasDPPSrc1SGPR = false;
135 bool HasPackedFP32Ops = false;
136 bool HasImageInsts = false;
137 bool HasExtendedImageInsts = false;
138 bool HasR128A16 = false;
139 bool HasA16 = false;
140 bool HasG16 = false;
141 bool HasNSAEncoding = false;
142 bool HasPartialNSAEncoding = false;
143 bool GFX10_AEncoding = false;
144 bool GFX10_BEncoding = false;
145 bool HasDLInsts = false;
146 bool HasFmacF64Inst = false;
147 bool HasDot1Insts = false;
148 bool HasDot2Insts = false;
149 bool HasDot3Insts = false;
150 bool HasDot4Insts = false;
151 bool HasDot5Insts = false;
152 bool HasDot6Insts = false;
153 bool HasDot7Insts = false;
154 bool HasDot8Insts = false;
155 bool HasDot9Insts = false;
156 bool HasDot10Insts = false;
157 bool HasDot11Insts = false;
158 bool HasMAIInsts = false;
159 bool HasFP8Insts = false;
160 bool HasFP8ConversionInsts = false;
161 bool HasPkFmacF16Inst = false;
162 bool HasAtomicFMinFMaxF32GlobalInsts = false;
163 bool HasAtomicFMinFMaxF64GlobalInsts = false;
164 bool HasAtomicFMinFMaxF32FlatInsts = false;
165 bool HasAtomicFMinFMaxF64FlatInsts = false;
166 bool HasAtomicDsPkAdd16Insts = false;
167 bool HasAtomicFlatPkAdd16Insts = false;
168 bool HasAtomicFaddRtnInsts = false;
169 bool HasAtomicFaddNoRtnInsts = false;
170 bool HasMemoryAtomicFaddF32DenormalSupport = false;
171 bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
172 bool HasAtomicBufferGlobalPkAddF16Insts = false;
173 bool HasAtomicCSubNoRtnInsts = false;
174 bool HasAtomicGlobalPkAddBF16Inst = false;
175 bool HasAtomicBufferPkAddBF16Inst = false;
176 bool HasFlatAtomicFaddF32Inst = false;
177 bool HasFlatBufferGlobalAtomicFaddF64Inst = false;
178 bool HasDefaultComponentZero = false;
179 bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false;
180 bool HasDefaultComponentBroadcast = false;
181 /// The maximum number of instructions that may be placed within an S_CLAUSE,
182 /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
183 /// indicates a lack of S_CLAUSE support.
184 unsigned MaxHardClauseLength = 0;
185 bool SupportsSRAMECC = false;
186
187 // This should not be used directly. 'TargetID' tracks the dynamic settings
188 // for SRAMECC.
189 bool EnableSRAMECC = false;
190
191 bool HasNoSdstCMPX = false;
192 bool HasVscnt = false;
193 bool HasGetWaveIdInst = false;
194 bool HasSMemTimeInst = false;
195 bool HasShaderCyclesRegister = false;
196 bool HasShaderCyclesHiLoRegisters = false;
197 bool HasVOP3Literal = false;
198 bool HasNoDataDepHazard = false;
199 bool FlatAddressSpace = false;
200 bool FlatInstOffsets = false;
201 bool FlatGlobalInsts = false;
202 bool FlatScratchInsts = false;
203 bool ScalarFlatScratchInsts = false;
204 bool HasArchitectedFlatScratch = false;
205 bool EnableFlatScratch = false;
206 bool HasArchitectedSGPRs = false;
207 bool HasGDS = false;
208 bool HasGWS = false;
209 bool AddNoCarryInsts = false;
210 bool HasUnpackedD16VMem = false;
211 bool LDSMisalignedBug = false;
212 bool HasMFMAInlineLiteralBug = false;
213 bool UnalignedBufferAccess = false;
214 bool UnalignedDSAccess = false;
215 bool HasPackedTID = false;
216 bool ScalarizeGlobal = false;
217 bool HasSALUFloatInsts = false;
218 bool HasVGPRSingleUseHintInsts = false;
219 bool HasPseudoScalarTrans = false;
220 bool HasRestrictedSOffset = false;
221
222 bool HasVcmpxPermlaneHazard = false;
223 bool HasVMEMtoScalarWriteHazard = false;
224 bool HasSMEMtoVectorWriteHazard = false;
225 bool HasInstFwdPrefetchBug = false;
226 bool HasVcmpxExecWARHazard = false;
227 bool HasLdsBranchVmemWARHazard = false;
228 bool HasNSAtoVMEMBug = false;
229 bool HasNSAClauseBug = false;
230 bool HasOffset3fBug = false;
231 bool HasFlatSegmentOffsetBug = false;
232 bool HasImageStoreD16Bug = false;
233 bool HasImageGather4D16Bug = false;
234 bool HasMSAALoadDstSelBug = false;
235 bool HasPrivEnabledTrap2NopBug = false;
236 bool Has1_5xVGPRs = false;
237 bool HasMADIntraFwdBug = false;
238 bool HasVOPDInsts = false;
239 bool HasVALUTransUseHazard = false;
240 bool HasForceStoreSC0SC1 = false;
241 bool HasRequiredExportPriority = false;
242 bool HasVmemWriteVgprInOrder = false;
243
244 bool RequiresCOV6 = false;
245
246 // Dummy feature to use for assembler in tablegen.
247 bool FeatureDisable = false;
248
249 SelectionDAGTargetInfo TSInfo;
250private:
251 SIInstrInfo InstrInfo;
252 SITargetLowering TLInfo;
253 SIFrameLowering FrameLowering;
254
255public:
256 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
257 const GCNTargetMachine &TM);
258 ~GCNSubtarget() override;
259
260 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
261 StringRef GPU, StringRef FS);
262
263 /// Diagnose inconsistent subtarget features before attempting to codegen
264 /// function \p F.
265 void checkSubtargetFeatures(const Function &F) const;
266
267 const SIInstrInfo *getInstrInfo() const override {
268 return &InstrInfo;
269 }
270
271 const SIFrameLowering *getFrameLowering() const override {
272 return &FrameLowering;
273 }
274
275 const SITargetLowering *getTargetLowering() const override {
276 return &TLInfo;
277 }
278
279 const SIRegisterInfo *getRegisterInfo() const override {
280 return &InstrInfo.getRegisterInfo();
281 }
282
283 const CallLowering *getCallLowering() const override {
284 return CallLoweringInfo.get();
285 }
286
287 const InlineAsmLowering *getInlineAsmLowering() const override {
288 return InlineAsmLoweringInfo.get();
289 }
290
291 InstructionSelector *getInstructionSelector() const override {
292 return InstSelector.get();
293 }
294
295 const LegalizerInfo *getLegalizerInfo() const override {
296 return Legalizer.get();
297 }
298
299 const AMDGPURegisterBankInfo *getRegBankInfo() const override {
300 return RegBankInfo.get();
301 }
302
303 const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
304 return TargetID;
305 }
306
307 // Nothing implemented, just prevent crashes on use.
308 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
309 return &TSInfo;
310 }
311
312 const InstrItineraryData *getInstrItineraryData() const override {
313 return &InstrItins;
314 }
315
316 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
317
318 Generation getGeneration() const {
319 return (Generation)Gen;
320 }
321
322 unsigned getMaxWaveScratchSize() const {
323 // See COMPUTE_TMPRING_SIZE.WAVESIZE.
324 if (getGeneration() >= GFX12) {
325 // 18-bit field in units of 64-dword.
326 return (64 * 4) * ((1 << 18) - 1);
327 }
328 if (getGeneration() == GFX11) {
329 // 15-bit field in units of 64-dword.
330 return (64 * 4) * ((1 << 15) - 1);
331 }
332 // 13-bit field in units of 256-dword.
333 return (256 * 4) * ((1 << 13) - 1);
334 }
335
336 /// Return the number of high bits known to be zero for a frame index.
337 unsigned getKnownHighZeroBitsForFrameIndex() const {
338 return llvm::countl_zero(Val: getMaxWaveScratchSize()) + getWavefrontSizeLog2();
339 }
340
341 int getLDSBankCount() const {
342 return LDSBankCount;
343 }
344
345 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
346 return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
347 }
348
349 unsigned getConstantBusLimit(unsigned Opcode) const;
350
351 /// Returns if the result of this instruction with a 16-bit result returned in
352 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
353 /// the original value.
354 bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
355
356 bool supportsWGP() const { return getGeneration() >= GFX10; }
357
358 bool hasIntClamp() const {
359 return HasIntClamp;
360 }
361
362 bool hasFP64() const {
363 return FP64;
364 }
365
366 bool hasMIMG_R128() const {
367 return MIMG_R128;
368 }
369
370 bool hasHWFP64() const {
371 return FP64;
372 }
373
374 bool hasHalfRate64Ops() const {
375 return HalfRate64Ops;
376 }
377
378 bool hasFullRate64Ops() const {
379 return FullRate64Ops;
380 }
381
382 bool hasAddr64() const {
383 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
384 }
385
386 bool hasFlat() const {
387 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
388 }
389
390 // Return true if the target only has the reverse operand versions of VALU
391 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
392 bool hasOnlyRevVALUShifts() const {
393 return getGeneration() >= VOLCANIC_ISLANDS;
394 }
395
396 bool hasFractBug() const {
397 return getGeneration() == SOUTHERN_ISLANDS;
398 }
399
400 bool hasBFE() const {
401 return true;
402 }
403
404 bool hasBFI() const {
405 return true;
406 }
407
408 bool hasBFM() const {
409 return hasBFE();
410 }
411
412 bool hasBCNT(unsigned Size) const {
413 return true;
414 }
415
416 bool hasFFBL() const {
417 return true;
418 }
419
420 bool hasFFBH() const {
421 return true;
422 }
423
424 bool hasMed3_16() const {
425 return getGeneration() >= AMDGPUSubtarget::GFX9;
426 }
427
428 bool hasMin3Max3_16() const {
429 return getGeneration() >= AMDGPUSubtarget::GFX9;
430 }
431
432 bool hasFmaMixInsts() const {
433 return HasFmaMixInsts;
434 }
435
436 bool hasCARRY() const {
437 return true;
438 }
439
440 bool hasFMA() const {
441 return FMA;
442 }
443
444 bool hasSwap() const {
445 return GFX9Insts;
446 }
447
448 bool hasScalarPackInsts() const {
449 return GFX9Insts;
450 }
451
452 bool hasScalarMulHiInsts() const {
453 return GFX9Insts;
454 }
455
456 bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
457
458 TrapHandlerAbi getTrapHandlerAbi() const {
459 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
460 }
461
462 bool supportsGetDoorbellID() const {
463 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
464 return getGeneration() >= GFX9;
465 }
466
467 /// True if the offset field of DS instructions works as expected. On SI, the
468 /// offset uses a 16-bit adder and does not always wrap properly.
469 bool hasUsableDSOffset() const {
470 return getGeneration() >= SEA_ISLANDS;
471 }
472
473 bool unsafeDSOffsetFoldingEnabled() const {
474 return EnableUnsafeDSOffsetFolding;
475 }
476
477 /// Condition output from div_scale is usable.
478 bool hasUsableDivScaleConditionOutput() const {
479 return getGeneration() != SOUTHERN_ISLANDS;
480 }
481
482 /// Extra wait hazard is needed in some cases before
483 /// s_cbranch_vccnz/s_cbranch_vccz.
484 bool hasReadVCCZBug() const {
485 return getGeneration() <= SEA_ISLANDS;
486 }
487
488 /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
489 bool partialVCCWritesUpdateVCCZ() const {
490 return getGeneration() >= GFX10;
491 }
492
493 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
494 /// was written by a VALU instruction.
495 bool hasSMRDReadVALUDefHazard() const {
496 return getGeneration() == SOUTHERN_ISLANDS;
497 }
498
499 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
500 /// SGPR was written by a VALU Instruction.
501 bool hasVMEMReadSGPRVALUDefHazard() const {
502 return getGeneration() >= VOLCANIC_ISLANDS;
503 }
504
505 bool hasRFEHazards() const {
506 return getGeneration() >= VOLCANIC_ISLANDS;
507 }
508
509 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
510 unsigned getSetRegWaitStates() const {
511 return getGeneration() <= SEA_ISLANDS ? 1 : 2;
512 }
513
514 bool dumpCode() const {
515 return DumpCode;
516 }
517
518 /// Return the amount of LDS that can be used that will not restrict the
519 /// occupancy lower than WaveCount.
520 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
521 const Function &) const;
522
523 bool supportsMinMaxDenormModes() const {
524 return getGeneration() >= AMDGPUSubtarget::GFX9;
525 }
526
527 /// \returns If target supports S_DENORM_MODE.
528 bool hasDenormModeInst() const {
529 return getGeneration() >= AMDGPUSubtarget::GFX10;
530 }
531
532 bool useFlatForGlobal() const {
533 return FlatForGlobal;
534 }
535
536 /// \returns If target supports ds_read/write_b128 and user enables generation
537 /// of ds_read/write_b128.
538 bool useDS128() const {
539 return CIInsts && EnableDS128;
540 }
541
542 /// \return If target supports ds_read/write_b96/128.
543 bool hasDS96AndDS128() const {
544 return CIInsts;
545 }
546
547 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
548 bool haveRoundOpsF64() const {
549 return CIInsts;
550 }
551
552 /// \returns If MUBUF instructions always perform range checking, even for
553 /// buffer resources used for private memory access.
554 bool privateMemoryResourceIsRangeChecked() const {
555 return getGeneration() < AMDGPUSubtarget::GFX9;
556 }
557
558 /// \returns If target requires PRT Struct NULL support (zero result registers
559 /// for sparse texture support).
560 bool usePRTStrictNull() const {
561 return EnablePRTStrictNull;
562 }
563
564 bool hasAutoWaitcntBeforeBarrier() const {
565 return AutoWaitcntBeforeBarrier;
566 }
567
568 /// \returns true if the target supports backing off of s_barrier instructions
569 /// when an exception is raised.
570 bool supportsBackOffBarrier() const {
571 return BackOffBarrier;
572 }
573
574 bool hasUnalignedBufferAccess() const {
575 return UnalignedBufferAccess;
576 }
577
578 bool hasUnalignedBufferAccessEnabled() const {
579 return UnalignedBufferAccess && UnalignedAccessMode;
580 }
581
582 bool hasUnalignedDSAccess() const {
583 return UnalignedDSAccess;
584 }
585
586 bool hasUnalignedDSAccessEnabled() const {
587 return UnalignedDSAccess && UnalignedAccessMode;
588 }
589
590 bool hasUnalignedScratchAccess() const {
591 return UnalignedScratchAccess;
592 }
593
594 bool hasUnalignedAccessMode() const {
595 return UnalignedAccessMode;
596 }
597
598 bool hasApertureRegs() const {
599 return HasApertureRegs;
600 }
601
602 bool isTrapHandlerEnabled() const {
603 return TrapHandler;
604 }
605
606 bool isXNACKEnabled() const {
607 return TargetID.isXnackOnOrAny();
608 }
609
610 bool isTgSplitEnabled() const {
611 return EnableTgSplit;
612 }
613
614 bool isCuModeEnabled() const {
615 return EnableCuMode;
616 }
617
618 bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
619
620 bool hasFlatAddressSpace() const {
621 return FlatAddressSpace;
622 }
623
624 bool hasFlatScrRegister() const {
625 return hasFlatAddressSpace();
626 }
627
628 bool hasFlatInstOffsets() const {
629 return FlatInstOffsets;
630 }
631
632 bool hasFlatGlobalInsts() const {
633 return FlatGlobalInsts;
634 }
635
636 bool hasFlatScratchInsts() const {
637 return FlatScratchInsts;
638 }
639
640 // Check if target supports ST addressing mode with FLAT scratch instructions.
641 // The ST addressing mode means no registers are used, either VGPR or SGPR,
642 // but only immediate offset is swizzled and added to the FLAT scratch base.
643 bool hasFlatScratchSTMode() const {
644 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
645 }
646
647 bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
648
649 bool hasScalarFlatScratchInsts() const {
650 return ScalarFlatScratchInsts;
651 }
652
653 bool enableFlatScratch() const {
654 return flatScratchIsArchitected() ||
655 (EnableFlatScratch && hasFlatScratchInsts());
656 }
657
658 bool hasGlobalAddTidInsts() const {
659 return GFX10_BEncoding;
660 }
661
662 bool hasAtomicCSub() const {
663 return GFX10_BEncoding;
664 }
665
666 bool hasExportInsts() const {
667 return !hasGFX940Insts();
668 }
669
670 bool hasVINTERPEncoding() const {
671 return GFX11Insts;
672 }
673
674 // DS_ADD_F64/DS_ADD_RTN_F64
675 bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); }
676
677 bool hasMultiDwordFlatScratchAddressing() const {
678 return getGeneration() >= GFX9;
679 }
680
681 bool hasFlatSegmentOffsetBug() const {
682 return HasFlatSegmentOffsetBug;
683 }
684
685 bool hasFlatLgkmVMemCountInOrder() const {
686 return getGeneration() > GFX9;
687 }
688
689 bool hasD16LoadStore() const {
690 return getGeneration() >= GFX9;
691 }
692
693 bool d16PreservesUnusedBits() const {
694 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
695 }
696
697 bool hasD16Images() const {
698 return getGeneration() >= VOLCANIC_ISLANDS;
699 }
700
701 /// Return if most LDS instructions have an m0 use that require m0 to be
702 /// initialized.
703 bool ldsRequiresM0Init() const {
704 return getGeneration() < GFX9;
705 }
706
707 // True if the hardware rewinds and replays GWS operations if a wave is
708 // preempted.
709 //
710 // If this is false, a GWS operation requires testing if a nack set the
711 // MEM_VIOL bit, and repeating if so.
712 bool hasGWSAutoReplay() const {
713 return getGeneration() >= GFX9;
714 }
715
716 /// \returns if target has ds_gws_sema_release_all instruction.
717 bool hasGWSSemaReleaseAll() const {
718 return CIInsts;
719 }
720
721 /// \returns true if the target has integer add/sub instructions that do not
722 /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
723 /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
724 /// for saturation.
725 bool hasAddNoCarry() const {
726 return AddNoCarryInsts;
727 }
728
729 bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
730
731 bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
732
733 bool hasUnpackedD16VMem() const {
734 return HasUnpackedD16VMem;
735 }
736
737 // Covers VS/PS/CS graphics shaders
738 bool isMesaGfxShader(const Function &F) const {
739 return isMesa3DOS() && AMDGPU::isShader(CC: F.getCallingConv());
740 }
741
742 bool hasMad64_32() const {
743 return getGeneration() >= SEA_ISLANDS;
744 }
745
746 bool hasSDWAOmod() const {
747 return HasSDWAOmod;
748 }
749
750 bool hasSDWAScalar() const {
751 return HasSDWAScalar;
752 }
753
754 bool hasSDWASdst() const {
755 return HasSDWASdst;
756 }
757
758 bool hasSDWAMac() const {
759 return HasSDWAMac;
760 }
761
762 bool hasSDWAOutModsVOPC() const {
763 return HasSDWAOutModsVOPC;
764 }
765
766 bool hasDLInsts() const {
767 return HasDLInsts;
768 }
769
770 bool hasFmacF64Inst() const { return HasFmacF64Inst; }
771
772 bool hasDot1Insts() const {
773 return HasDot1Insts;
774 }
775
776 bool hasDot2Insts() const {
777 return HasDot2Insts;
778 }
779
780 bool hasDot3Insts() const {
781 return HasDot3Insts;
782 }
783
784 bool hasDot4Insts() const {
785 return HasDot4Insts;
786 }
787
788 bool hasDot5Insts() const {
789 return HasDot5Insts;
790 }
791
792 bool hasDot6Insts() const {
793 return HasDot6Insts;
794 }
795
796 bool hasDot7Insts() const {
797 return HasDot7Insts;
798 }
799
800 bool hasDot8Insts() const {
801 return HasDot8Insts;
802 }
803
804 bool hasDot9Insts() const {
805 return HasDot9Insts;
806 }
807
808 bool hasDot10Insts() const {
809 return HasDot10Insts;
810 }
811
812 bool hasDot11Insts() const {
813 return HasDot11Insts;
814 }
815
816 bool hasMAIInsts() const {
817 return HasMAIInsts;
818 }
819
820 bool hasFP8Insts() const {
821 return HasFP8Insts;
822 }
823
824 bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
825
826 bool hasPkFmacF16Inst() const {
827 return HasPkFmacF16Inst;
828 }
829
830 bool hasAtomicFMinFMaxF32GlobalInsts() const {
831 return HasAtomicFMinFMaxF32GlobalInsts;
832 }
833
834 bool hasAtomicFMinFMaxF64GlobalInsts() const {
835 return HasAtomicFMinFMaxF64GlobalInsts;
836 }
837
838 bool hasAtomicFMinFMaxF32FlatInsts() const {
839 return HasAtomicFMinFMaxF32FlatInsts;
840 }
841
842 bool hasAtomicFMinFMaxF64FlatInsts() const {
843 return HasAtomicFMinFMaxF64FlatInsts;
844 }
845
846 bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
847
848 bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
849
850 bool hasAtomicFaddInsts() const {
851 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
852 }
853
854 bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
855
856 bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
857
858 bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
859 return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
860 }
861
862 bool hasAtomicBufferGlobalPkAddF16Insts() const {
863 return HasAtomicBufferGlobalPkAddF16Insts;
864 }
865
866 bool hasAtomicGlobalPkAddBF16Inst() const {
867 return HasAtomicGlobalPkAddBF16Inst;
868 }
869
870 bool hasAtomicBufferPkAddBF16Inst() const {
871 return HasAtomicBufferPkAddBF16Inst;
872 }
873
874 bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
875
876 /// \return true if the target has flat, global, and buffer atomic fadd for
877 /// double.
878 bool hasFlatBufferGlobalAtomicFaddF64Inst() const {
879 return HasFlatBufferGlobalAtomicFaddF64Inst;
880 }
881
882 /// \return true if the target's flat, global, and buffer atomic fadd for
883 /// float supports denormal handling.
884 bool hasMemoryAtomicFaddF32DenormalSupport() const {
885 return HasMemoryAtomicFaddF32DenormalSupport;
886 }
887
888 /// \return true if atomic operations targeting fine-grained memory work
889 /// correctly at device scope, in allocations in host or peer PCIe device
890 /// memory.
891 bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const {
892 return HasAgentScopeFineGrainedRemoteMemoryAtomics;
893 }
894
895 bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
896
897 bool hasDefaultComponentBroadcast() const {
898 return HasDefaultComponentBroadcast;
899 }
900
901 bool hasNoSdstCMPX() const {
902 return HasNoSdstCMPX;
903 }
904
905 bool hasVscnt() const {
906 return HasVscnt;
907 }
908
909 bool hasGetWaveIdInst() const {
910 return HasGetWaveIdInst;
911 }
912
913 bool hasSMemTimeInst() const {
914 return HasSMemTimeInst;
915 }
916
917 bool hasShaderCyclesRegister() const {
918 return HasShaderCyclesRegister;
919 }
920
921 bool hasShaderCyclesHiLoRegisters() const {
922 return HasShaderCyclesHiLoRegisters;
923 }
924
925 bool hasVOP3Literal() const {
926 return HasVOP3Literal;
927 }
928
929 bool hasNoDataDepHazard() const {
930 return HasNoDataDepHazard;
931 }
932
933 bool vmemWriteNeedsExpWaitcnt() const {
934 return getGeneration() < SEA_ISLANDS;
935 }
936
937 bool hasInstPrefetch() const {
938 return getGeneration() == GFX10 || getGeneration() == GFX11;
939 }
940
941 bool hasPrefetch() const { return GFX12Insts; }
942
943 // Has s_cmpk_* instructions.
944 bool hasSCmpK() const { return getGeneration() < GFX12; }
945
946 // Scratch is allocated in 256 dword per wave blocks for the entire
947 // wavefront. When viewed from the perspective of an arbitrary workitem, this
948 // is 4-byte aligned.
949 //
950 // Only 4-byte alignment is really needed to access anything. Transformations
951 // on the pointer value itself may rely on the alignment / known low bits of
952 // the pointer. Set this to something above the minimum to avoid needing
953 // dynamic realignment in common cases.
954 Align getStackAlignment() const { return Align(16); }
955
956 bool enableMachineScheduler() const override {
957 return true;
958 }
959
960 bool useAA() const override;
961
962 bool enableSubRegLiveness() const override {
963 return true;
964 }
965
966 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
967 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
968
969 // static wrappers
970 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
971
972 // XXX - Why is this here if it isn't in the default pass set?
973 bool enableEarlyIfConversion() const override {
974 return true;
975 }
976
977 void overrideSchedPolicy(MachineSchedPolicy &Policy,
978 unsigned NumRegionInstrs) const override;
979
980 void mirFileLoaded(MachineFunction &MF) const override;
981
982 unsigned getMaxNumUserSGPRs() const {
983 return AMDGPU::getMaxNumUserSGPRs(STI: *this);
984 }
985
986 bool hasSMemRealTime() const {
987 return HasSMemRealTime;
988 }
989
990 bool hasMovrel() const {
991 return HasMovrel;
992 }
993
994 bool hasVGPRIndexMode() const {
995 return HasVGPRIndexMode;
996 }
997
998 bool useVGPRIndexMode() const;
999
1000 bool hasScalarCompareEq64() const {
1001 return getGeneration() >= VOLCANIC_ISLANDS;
1002 }
1003
1004 bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
1005
1006 bool hasScalarStores() const {
1007 return HasScalarStores;
1008 }
1009
1010 bool hasScalarAtomics() const {
1011 return HasScalarAtomics;
1012 }
1013
1014 bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
1015 bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
1016
1017 /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
1018 bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
1019
1020 /// \returns true if the subtarget has the v_permlane64_b32 instruction.
1021 bool hasPermLane64() const { return getGeneration() >= GFX11; }
1022
1023 bool hasDPP() const {
1024 return HasDPP;
1025 }
1026
1027 bool hasDPPBroadcasts() const {
1028 return HasDPP && getGeneration() < GFX10;
1029 }
1030
1031 bool hasDPPWavefrontShifts() const {
1032 return HasDPP && getGeneration() < GFX10;
1033 }
1034
1035 bool hasDPP8() const {
1036 return HasDPP8;
1037 }
1038
1039 bool hasDPALU_DPP() const {
1040 return HasDPALU_DPP;
1041 }
1042
1043 bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
1044
1045 bool hasPackedFP32Ops() const {
1046 return HasPackedFP32Ops;
1047 }
1048
1049 // Has V_PK_MOV_B32 opcode
1050 bool hasPkMovB32() const {
1051 return GFX90AInsts;
1052 }
1053
1054 bool hasFmaakFmamkF32Insts() const {
1055 return getGeneration() >= GFX10 || hasGFX940Insts();
1056 }
1057
1058 bool hasImageInsts() const {
1059 return HasImageInsts;
1060 }
1061
1062 bool hasExtendedImageInsts() const {
1063 return HasExtendedImageInsts;
1064 }
1065
1066 bool hasR128A16() const {
1067 return HasR128A16;
1068 }
1069
1070 bool hasA16() const { return HasA16; }
1071
1072 bool hasG16() const { return HasG16; }
1073
1074 bool hasOffset3fBug() const {
1075 return HasOffset3fBug;
1076 }
1077
1078 bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
1079
1080 bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
1081
1082 bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
1083
1084 bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
1085
1086 bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; }
1087
1088 bool hasNSAEncoding() const { return HasNSAEncoding; }
1089
1090 bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
1091
1092 bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
1093
1094 unsigned getNSAMaxSize(bool HasSampler = false) const {
1095 return AMDGPU::getNSAMaxSize(STI: *this, HasSampler);
1096 }
1097
1098 bool hasGFX10_AEncoding() const {
1099 return GFX10_AEncoding;
1100 }
1101
1102 bool hasGFX10_BEncoding() const {
1103 return GFX10_BEncoding;
1104 }
1105
1106 bool hasGFX10_3Insts() const {
1107 return GFX10_3Insts;
1108 }
1109
1110 bool hasMadF16() const;
1111
1112 bool hasMovB64() const { return GFX940Insts; }
1113
1114 bool hasLshlAddB64() const { return GFX940Insts; }
1115
1116 bool enableSIScheduler() const {
1117 return EnableSIScheduler;
1118 }
1119
1120 bool loadStoreOptEnabled() const {
1121 return EnableLoadStoreOpt;
1122 }
1123
1124 bool hasSGPRInitBug() const {
1125 return SGPRInitBug;
1126 }
1127
1128 bool hasUserSGPRInit16Bug() const {
1129 return UserSGPRInit16Bug && isWave32();
1130 }
1131
1132 bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
1133
1134 bool hasNegativeUnalignedScratchOffsetBug() const {
1135 return NegativeUnalignedScratchOffsetBug;
1136 }
1137
1138 bool hasMFMAInlineLiteralBug() const {
1139 return HasMFMAInlineLiteralBug;
1140 }
1141
1142 bool has12DWordStoreHazard() const {
1143 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
1144 }
1145
1146 // \returns true if the subtarget supports DWORDX3 load/store instructions.
1147 bool hasDwordx3LoadStores() const {
1148 return CIInsts;
1149 }
1150
1151 bool hasReadM0MovRelInterpHazard() const {
1152 return getGeneration() == AMDGPUSubtarget::GFX9;
1153 }
1154
1155 bool hasReadM0SendMsgHazard() const {
1156 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1157 getGeneration() <= AMDGPUSubtarget::GFX9;
1158 }
1159
1160 bool hasReadM0LdsDmaHazard() const {
1161 return getGeneration() == AMDGPUSubtarget::GFX9;
1162 }
1163
1164 bool hasReadM0LdsDirectHazard() const {
1165 return getGeneration() == AMDGPUSubtarget::GFX9;
1166 }
1167
1168 bool hasVcmpxPermlaneHazard() const {
1169 return HasVcmpxPermlaneHazard;
1170 }
1171
1172 bool hasVMEMtoScalarWriteHazard() const {
1173 return HasVMEMtoScalarWriteHazard;
1174 }
1175
1176 bool hasSMEMtoVectorWriteHazard() const {
1177 return HasSMEMtoVectorWriteHazard;
1178 }
1179
1180 bool hasLDSMisalignedBug() const {
1181 return LDSMisalignedBug && !EnableCuMode;
1182 }
1183
1184 bool hasInstFwdPrefetchBug() const {
1185 return HasInstFwdPrefetchBug;
1186 }
1187
1188 bool hasVcmpxExecWARHazard() const {
1189 return HasVcmpxExecWARHazard;
1190 }
1191
1192 bool hasLdsBranchVmemWARHazard() const {
1193 return HasLdsBranchVmemWARHazard;
1194 }
1195
1196 // Shift amount of a 64 bit shift cannot be a highest allocated register
1197 // if also at the end of the allocation block.
1198 bool hasShift64HighRegBug() const {
1199 return GFX90AInsts && !GFX940Insts;
1200 }
1201
1202 // Has one cycle hazard on transcendental instruction feeding a
1203 // non transcendental VALU.
1204 bool hasTransForwardingHazard() const { return GFX940Insts; }
1205
1206 // Has one cycle hazard on a VALU instruction partially writing dst with
1207 // a shift of result bits feeding another VALU instruction.
1208 bool hasDstSelForwardingHazard() const { return GFX940Insts; }
1209
1210 // Cannot use op_sel with v_dot instructions.
1211 bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
1212
1213 // Does not have HW interlocs for VALU writing and then reading SGPRs.
1214 bool hasVDecCoExecHazard() const {
1215 return GFX940Insts;
1216 }
1217
1218 bool hasNSAtoVMEMBug() const {
1219 return HasNSAtoVMEMBug;
1220 }
1221
1222 bool hasNSAClauseBug() const { return HasNSAClauseBug; }
1223
1224 bool hasHardClauses() const { return MaxHardClauseLength > 0; }
1225
1226 bool hasGFX90AInsts() const { return GFX90AInsts; }
1227
1228 bool hasFPAtomicToDenormModeHazard() const {
1229 return getGeneration() == GFX10;
1230 }
1231
1232 bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
1233
1234 bool hasLdsDirect() const { return getGeneration() >= GFX11; }
1235
1236 bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
1237
1238 bool hasVALUPartialForwardingHazard() const {
1239 return getGeneration() == GFX11;
1240 }
1241
1242 bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
1243
1244 bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }
1245
1246 bool requiresCodeObjectV6() const { return RequiresCOV6; }
1247
1248 bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
1249
1250 /// Return if operations acting on VGPR tuples require even alignment.
1251 bool needsAlignedVGPRs() const { return GFX90AInsts; }
1252
1253 /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
1254 bool hasSPackHL() const { return GFX11Insts; }
1255
1256 /// Return true if the target's EXP instruction has the COMPR flag, which
1257 /// affects the meaning of the EN (enable) bits.
1258 bool hasCompressedExport() const { return !GFX11Insts; }
1259
1260 /// Return true if the target's EXP instruction supports the NULL export
1261 /// target.
1262 bool hasNullExportTarget() const { return !GFX11Insts; }
1263
1264 bool has1_5xVGPRs() const { return Has1_5xVGPRs; }
1265
1266 bool hasVOPDInsts() const { return HasVOPDInsts; }
1267
1268 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
1269
1270 /// Return true if the target has the S_DELAY_ALU instruction.
1271 bool hasDelayAlu() const { return GFX11Insts; }
1272
1273 bool hasPackedTID() const { return HasPackedTID; }
1274
1275 // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that
1276 // hasGFX90AInsts is also true.
1277 bool hasGFX940Insts() const { return GFX940Insts; }
1278
1279 bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
1280
1281 bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; }
1282
1283 bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
1284
1285 bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
1286
1287 bool hasRequiredExportPriority() const { return HasRequiredExportPriority; }
1288
1289 bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; }
1290
1291 /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
1292 /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
1293 bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
1294
1295 /// \returns true if inline constants are not supported for F16 pseudo
1296 /// scalar transcendentals.
1297 bool hasNoF16PseudoScalarTransInlineConstants() const {
1298 return getGeneration() == GFX12;
1299 }
1300
1301 /// \returns The maximum number of instructions that can be enclosed in an
1302 /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
1303 /// instruction.
1304 unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
1305
1306 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
1307 /// SGPRs
1308 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
1309
1310 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
1311 /// VGPRs
1312 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
1313
/// Return occupancy for the given function. Uses the LDS size and the
/// number of registers if provided.
1316 /// Note, occupancy can be affected by the scratch allocation as well, but
1317 /// we do not have enough information to compute it.
1318 unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
1319 unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
1320
1321 /// \returns true if the flat_scratch register should be initialized with the
1322 /// pointer to the wave's scratch memory rather than a size and offset.
1323 bool flatScratchIsPointer() const {
1324 return getGeneration() >= AMDGPUSubtarget::GFX9;
1325 }
1326
1327 /// \returns true if the flat_scratch register is initialized by the HW.
1328 /// In this case it is readonly.
1329 bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
1330
1331 /// \returns true if the architected SGPRs are enabled.
1332 bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }
1333
1334 /// \returns true if Global Data Share is supported.
1335 bool hasGDS() const { return HasGDS; }
1336
1337 /// \returns true if Global Wave Sync is supported.
1338 bool hasGWS() const { return HasGWS; }
1339
1340 /// \returns true if the machine has merged shaders in which s0-s7 are
1341 /// reserved by the hardware and user SGPRs start at s8
1342 bool hasMergedShaders() const {
1343 return getGeneration() >= GFX9;
1344 }
1345
1346 // \returns true if the target supports the pre-NGG legacy geometry path.
1347 bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
1348
1349 // \returns true if preloading kernel arguments is supported.
1350 bool hasKernargPreload() const { return KernargPreload; }
1351
1352 // \returns true if the target has split barriers feature
1353 bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
1354
1355 // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
1356 bool hasCvtFP8VOP1Bug() const { return true; }
1357
1358 // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
1359 // no-return form.
1360 bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
1361
1362 // \returns true if the target has DX10_CLAMP kernel descriptor mode bit
1363 bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
1364
1365 // \returns true if the target has IEEE kernel descriptor mode bit
1366 bool hasIEEEMode() const { return getGeneration() < GFX12; }
1367
1368 // \returns true if the target has IEEE fminimum/fmaximum instructions
1369 bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
1370
1371 // \returns true if the target has IEEE fminimum3/fmaximum3 instructions
1372 bool hasIEEEMinMax3() const { return hasIEEEMinMax(); }
1373
1374 // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
1375 bool hasRrWGMode() const { return getGeneration() >= GFX12; }
1376
1377 /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
1378 /// values.
1379 bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
1380
1381 // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
1382 // of sign-extending.
1383 bool hasGetPCZeroExtension() const { return GFX12Insts; }
1384
1385 /// \returns SGPR allocation granularity supported by the subtarget.
1386 unsigned getSGPRAllocGranule() const {
1387 return AMDGPU::IsaInfo::getSGPRAllocGranule(STI: this);
1388 }
1389
1390 /// \returns SGPR encoding granularity supported by the subtarget.
1391 unsigned getSGPREncodingGranule() const {
1392 return AMDGPU::IsaInfo::getSGPREncodingGranule(STI: this);
1393 }
1394
1395 /// \returns Total number of SGPRs supported by the subtarget.
1396 unsigned getTotalNumSGPRs() const {
1397 return AMDGPU::IsaInfo::getTotalNumSGPRs(STI: this);
1398 }
1399
1400 /// \returns Addressable number of SGPRs supported by the subtarget.
1401 unsigned getAddressableNumSGPRs() const {
1402 return AMDGPU::IsaInfo::getAddressableNumSGPRs(STI: this);
1403 }
1404
1405 /// \returns Minimum number of SGPRs that meets the given number of waves per
1406 /// execution unit requirement supported by the subtarget.
1407 unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1408 return AMDGPU::IsaInfo::getMinNumSGPRs(STI: this, WavesPerEU);
1409 }
1410
1411 /// \returns Maximum number of SGPRs that meets the given number of waves per
1412 /// execution unit requirement supported by the subtarget.
1413 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1414 return AMDGPU::IsaInfo::getMaxNumSGPRs(STI: this, WavesPerEU, Addressable);
1415 }
1416
1417 /// \returns Reserved number of SGPRs. This is common
1418 /// utility function called by MachineFunction and
1419 /// Function variants of getReservedNumSGPRs.
1420 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
1421 /// \returns Reserved number of SGPRs for given machine function \p MF.
1422 unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1423
1424 /// \returns Reserved number of SGPRs for given function \p F.
1425 unsigned getReservedNumSGPRs(const Function &F) const;
1426
1427 /// \returns max num SGPRs. This is the common utility
1428 /// function called by MachineFunction and Function
1429 /// variants of getMaxNumSGPRs.
1430 unsigned getBaseMaxNumSGPRs(const Function &F,
1431 std::pair<unsigned, unsigned> WavesPerEU,
1432 unsigned PreloadedSGPRs,
1433 unsigned ReservedNumSGPRs) const;
1434
1435 /// \returns Maximum number of SGPRs that meets number of waves per execution
1436 /// unit requirement for function \p MF, or number of SGPRs explicitly
1437 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
1438 ///
1439 /// \returns Value that meets number of waves per execution unit requirement
1440 /// if explicitly requested value cannot be converted to integer, violates
1441 /// subtarget's specifications, or does not meet number of waves per execution
1442 /// unit requirement.
1443 unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1444
1445 /// \returns Maximum number of SGPRs that meets number of waves per execution
1446 /// unit requirement for function \p F, or number of SGPRs explicitly
1447 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
1448 ///
1449 /// \returns Value that meets number of waves per execution unit requirement
1450 /// if explicitly requested value cannot be converted to integer, violates
1451 /// subtarget's specifications, or does not meet number of waves per execution
1452 /// unit requirement.
1453 unsigned getMaxNumSGPRs(const Function &F) const;
1454
1455 /// \returns VGPR allocation granularity supported by the subtarget.
1456 unsigned getVGPRAllocGranule() const {
1457 return AMDGPU::IsaInfo::getVGPRAllocGranule(STI: this);
1458 }
1459
1460 /// \returns VGPR encoding granularity supported by the subtarget.
1461 unsigned getVGPREncodingGranule() const {
1462 return AMDGPU::IsaInfo::getVGPREncodingGranule(STI: this);
1463 }
1464
1465 /// \returns Total number of VGPRs supported by the subtarget.
1466 unsigned getTotalNumVGPRs() const {
1467 return AMDGPU::IsaInfo::getTotalNumVGPRs(STI: this);
1468 }
1469
1470 /// \returns Addressable number of architectural VGPRs supported by the
1471 /// subtarget.
1472 unsigned getAddressableNumArchVGPRs() const {
1473 return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(STI: this);
1474 }
1475
1476 /// \returns Addressable number of VGPRs supported by the subtarget.
1477 unsigned getAddressableNumVGPRs() const {
1478 return AMDGPU::IsaInfo::getAddressableNumVGPRs(STI: this);
1479 }
1480
1481 /// \returns the minimum number of VGPRs that will prevent achieving more than
1482 /// the specified number of waves \p WavesPerEU.
1483 unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1484 return AMDGPU::IsaInfo::getMinNumVGPRs(STI: this, WavesPerEU);
1485 }
1486
1487 /// \returns the maximum number of VGPRs that can be used and still achieved
1488 /// at least the specified number of waves \p WavesPerEU.
1489 unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1490 return AMDGPU::IsaInfo::getMaxNumVGPRs(STI: this, WavesPerEU);
1491 }
1492
1493 /// \returns max num VGPRs. This is the common utility function
1494 /// called by MachineFunction and Function variants of getMaxNumVGPRs.
1495 unsigned getBaseMaxNumVGPRs(const Function &F,
1496 std::pair<unsigned, unsigned> WavesPerEU) const;
1497 /// \returns Maximum number of VGPRs that meets number of waves per execution
1498 /// unit requirement for function \p F, or number of VGPRs explicitly
1499 /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
1500 ///
1501 /// \returns Value that meets number of waves per execution unit requirement
1502 /// if explicitly requested value cannot be converted to integer, violates
1503 /// subtarget's specifications, or does not meet number of waves per execution
1504 /// unit requirement.
1505 unsigned getMaxNumVGPRs(const Function &F) const;
1506
1507 unsigned getMaxNumAGPRs(const Function &F) const {
1508 return getMaxNumVGPRs(F);
1509 }
1510
1511 /// \returns Maximum number of VGPRs that meets number of waves per execution
1512 /// unit requirement for function \p MF, or number of VGPRs explicitly
1513 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
1514 ///
1515 /// \returns Value that meets number of waves per execution unit requirement
1516 /// if explicitly requested value cannot be converted to integer, violates
1517 /// subtarget's specifications, or does not meet number of waves per execution
1518 /// unit requirement.
1519 unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1520
1521 void getPostRAMutations(
1522 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
1523 const override;
1524
1525 std::unique_ptr<ScheduleDAGMutation>
1526 createFillMFMAShadowMutation(const TargetInstrInfo *TII) const;
1527
1528 bool isWave32() const {
1529 return getWavefrontSize() == 32;
1530 }
1531
1532 bool isWave64() const {
1533 return getWavefrontSize() == 64;
1534 }
1535
1536 const TargetRegisterClass *getBoolRC() const {
1537 return getRegisterInfo()->getBoolRC();
1538 }
1539
1540 /// \returns Maximum number of work groups per compute unit supported by the
1541 /// subtarget and limited by given \p FlatWorkGroupSize.
1542 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1543 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(STI: this, FlatWorkGroupSize);
1544 }
1545
1546 /// \returns Minimum flat work group size supported by the subtarget.
1547 unsigned getMinFlatWorkGroupSize() const override {
1548 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(STI: this);
1549 }
1550
1551 /// \returns Maximum flat work group size supported by the subtarget.
1552 unsigned getMaxFlatWorkGroupSize() const override {
1553 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(STI: this);
1554 }
1555
1556 /// \returns Number of waves per execution unit required to support the given
1557 /// \p FlatWorkGroupSize.
1558 unsigned
1559 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1560 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(STI: this, FlatWorkGroupSize);
1561 }
1562
1563 /// \returns Minimum number of waves per execution unit supported by the
1564 /// subtarget.
1565 unsigned getMinWavesPerEU() const override {
1566 return AMDGPU::IsaInfo::getMinWavesPerEU(STI: this);
1567 }
1568
1569 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
1570 SDep &Dep,
1571 const TargetSchedModel *SchedModel) const override;
1572
1573 // \returns true if it's beneficial on this subtarget for the scheduler to
1574 // cluster stores as well as loads.
1575 bool shouldClusterStores() const { return getGeneration() >= GFX11; }
1576
1577 // \returns the number of address arguments from which to enable MIMG NSA
1578 // on supported architectures.
1579 unsigned getNSAThreshold(const MachineFunction &MF) const;
1580
1581 // \returns true if the subtarget has a hazard requiring an "s_nop 0"
1582 // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
1583 bool requiresNopBeforeDeallocVGPRs() const {
1584 // Currently all targets that support the dealloc VGPRs message also require
1585 // the nop.
1586 return true;
1587 }
1588};
1589
1590class GCNUserSGPRUsageInfo {
1591public:
1592 bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
1593
1594 bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
1595
1596 bool hasDispatchPtr() const { return DispatchPtr; }
1597
1598 bool hasQueuePtr() const { return QueuePtr; }
1599
1600 bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
1601
1602 bool hasDispatchID() const { return DispatchID; }
1603
1604 bool hasFlatScratchInit() const { return FlatScratchInit; }
1605
1606 bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }
1607
1608 unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1609
1610 unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1611
1612 unsigned getNumFreeUserSGPRs();
1613
1614 void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1615
1616 enum UserSGPRID : unsigned {
1617 ImplicitBufferPtrID = 0,
1618 PrivateSegmentBufferID = 1,
1619 DispatchPtrID = 2,
1620 QueuePtrID = 3,
1621 KernargSegmentPtrID = 4,
1622 DispatchIdID = 5,
1623 FlatScratchInitID = 6,
1624 PrivateSegmentSizeID = 7
1625 };
1626
1627 // Returns the size in number of SGPRs for preload user SGPR field.
1628 static unsigned getNumUserSGPRForField(UserSGPRID ID) {
1629 switch (ID) {
1630 case ImplicitBufferPtrID:
1631 return 2;
1632 case PrivateSegmentBufferID:
1633 return 4;
1634 case DispatchPtrID:
1635 return 2;
1636 case QueuePtrID:
1637 return 2;
1638 case KernargSegmentPtrID:
1639 return 2;
1640 case DispatchIdID:
1641 return 2;
1642 case FlatScratchInitID:
1643 return 2;
1644 case PrivateSegmentSizeID:
1645 return 1;
1646 }
1647 llvm_unreachable("Unknown UserSGPRID.");
1648 }
1649
1650 GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
1651
1652private:
1653 const GCNSubtarget &ST;
1654
1655 // Private memory buffer
1656 // Compute directly in sgpr[0:1]
1657 // Other shaders indirect 64-bits at sgpr[0:1]
1658 bool ImplicitBufferPtr = false;
1659
1660 bool PrivateSegmentBuffer = false;
1661
1662 bool DispatchPtr = false;
1663
1664 bool QueuePtr = false;
1665
1666 bool KernargSegmentPtr = false;
1667
1668 bool DispatchID = false;
1669
1670 bool FlatScratchInit = false;
1671
1672 bool PrivateSegmentSize = false;
1673
1674 unsigned NumKernargPreloadSGPRs = 0;
1675
1676 unsigned NumUsedUserSGPRs = 0;
1677};
1678
1679} // end namespace llvm
1680
1681#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1682