1//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// AMD GCN specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16
17#include "AMDGPUCallLowering.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUSubtarget.h"
20#include "SIFrameLowering.h"
21#include "SIISelLowering.h"
22#include "SIInstrInfo.h"
23#include "Utils/AMDGPUBaseInfo.h"
24#include "llvm/Support/ErrorHandling.h"
25
26#define GET_SUBTARGETINFO_HEADER
27#include "AMDGPUGenSubtargetInfo.inc"
28
29namespace llvm {
30
31class GCNTargetMachine;
32
33class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
34 public AMDGPUSubtarget {
35public:
36 using AMDGPUSubtarget::getMaxWavesPerEU;
37
38 // Following 2 enums are documented at:
39 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
40 enum class TrapHandlerAbi {
41 NONE = 0x00,
42 AMDHSA = 0x01,
43 };
44
45 enum class TrapID {
46 LLVMAMDHSATrap = 0x02,
47 LLVMAMDHSADebugTrap = 0x03,
48 };
49
50private:
51 /// SelectionDAGISel related APIs.
52 std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
53
54 /// GlobalISel related APIs.
55 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
56 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
57 std::unique_ptr<InstructionSelector> InstSelector;
58 std::unique_ptr<LegalizerInfo> Legalizer;
59 std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
60
61protected:
62 // Basic subtarget description.
63 Triple TargetTriple;
64 AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
65 unsigned Gen = INVALID;
66 InstrItineraryData InstrItins;
67 int LDSBankCount = 0;
68 unsigned MaxPrivateElementSize = 0;
69
70 // Possibly statically set by tablegen, but may want to be overridden.
71 bool FastDenormalF32 = false;
72 bool HalfRate64Ops = false;
73 bool FullRate64Ops = false;
74
75 // Dynamically set bits that enable features.
76 bool FlatForGlobal = false;
77 bool AutoWaitcntBeforeBarrier = false;
78 bool BackOffBarrier = false;
79 bool UnalignedScratchAccess = false;
80 bool UnalignedAccessMode = false;
81 bool RelaxedBufferOOBMode = false;
82 bool HasApertureRegs = false;
83 bool SupportsXNACK = false;
84 bool KernargPreload = false;
85
86 // This should not be used directly. 'TargetID' tracks the dynamic settings
87 // for XNACK.
88 bool EnableXNACK = false;
89
90 bool EnableTgSplit = false;
91 bool EnableCuMode = false;
92 bool TrapHandler = false;
93 bool EnablePreciseMemory = false;
94
95 // Used as options.
96 bool EnableLoadStoreOpt = false;
97 bool EnableUnsafeDSOffsetFolding = false;
98 bool EnableSIScheduler = false;
99 bool EnableDS128 = false;
100 bool EnablePRTStrictNull = false;
101 bool DumpCode = false;
102
103 // Subtarget statically properties set by tablegen
104 bool FP64 = false;
105 bool FMA = false;
106 bool MIMG_R128 = false;
107 bool CIInsts = false;
108 bool GFX8Insts = false;
109 bool GFX9Insts = false;
110 bool GFX90AInsts = false;
111 bool GFX940Insts = false;
112 bool GFX950Insts = false;
113 bool GFX10Insts = false;
114 bool GFX11Insts = false;
115 bool GFX12Insts = false;
116 bool GFX1250Insts = false;
117 bool GFX10_3Insts = false;
118 bool GFX7GFX8GFX9Insts = false;
119 bool SGPRInitBug = false;
120 bool UserSGPRInit16Bug = false;
121 bool NegativeScratchOffsetBug = false;
122 bool NegativeUnalignedScratchOffsetBug = false;
123 bool HasSMemRealTime = false;
124 bool HasIntClamp = false;
125 bool HasFmaMixInsts = false;
126 bool HasMovrel = false;
127 bool HasVGPRIndexMode = false;
128 bool HasScalarDwordx3Loads = false;
129 bool HasScalarStores = false;
130 bool HasScalarAtomics = false;
131 bool HasSDWAOmod = false;
132 bool HasSDWAScalar = false;
133 bool HasSDWASdst = false;
134 bool HasSDWAMac = false;
135 bool HasSDWAOutModsVOPC = false;
136 bool HasDPP = false;
137 bool HasDPP8 = false;
138 bool HasDPALU_DPP = false;
139 bool HasDPPSrc1SGPR = false;
140 bool HasPackedFP32Ops = false;
141 bool HasImageInsts = false;
142 bool HasExtendedImageInsts = false;
143 bool HasR128A16 = false;
144 bool HasA16 = false;
145 bool HasG16 = false;
146 bool HasNSAEncoding = false;
147 bool HasPartialNSAEncoding = false;
148 bool GFX10_AEncoding = false;
149 bool GFX10_BEncoding = false;
150 bool HasDLInsts = false;
151 bool HasFmacF64Inst = false;
152 bool HasDot1Insts = false;
153 bool HasDot2Insts = false;
154 bool HasDot3Insts = false;
155 bool HasDot4Insts = false;
156 bool HasDot5Insts = false;
157 bool HasDot6Insts = false;
158 bool HasDot7Insts = false;
159 bool HasDot8Insts = false;
160 bool HasDot9Insts = false;
161 bool HasDot10Insts = false;
162 bool HasDot11Insts = false;
163 bool HasDot12Insts = false;
164 bool HasDot13Insts = false;
165 bool HasMAIInsts = false;
166 bool HasFP8Insts = false;
167 bool HasFP8ConversionInsts = false;
168 bool HasCvtFP8Vop1Bug = false;
169 bool HasPkFmacF16Inst = false;
170 bool HasAtomicFMinFMaxF32GlobalInsts = false;
171 bool HasAtomicFMinFMaxF64GlobalInsts = false;
172 bool HasAtomicFMinFMaxF32FlatInsts = false;
173 bool HasAtomicFMinFMaxF64FlatInsts = false;
174 bool HasAtomicDsPkAdd16Insts = false;
175 bool HasAtomicFlatPkAdd16Insts = false;
176 bool HasAtomicFaddRtnInsts = false;
177 bool HasAtomicFaddNoRtnInsts = false;
178 bool HasMemoryAtomicFaddF32DenormalSupport = false;
179 bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
180 bool HasAtomicBufferGlobalPkAddF16Insts = false;
181 bool HasAtomicCSubNoRtnInsts = false;
182 bool HasAtomicGlobalPkAddBF16Inst = false;
183 bool HasAtomicBufferPkAddBF16Inst = false;
184 bool HasFlatAtomicFaddF32Inst = false;
185 bool HasFlatBufferGlobalAtomicFaddF64Inst = false;
186 bool HasDefaultComponentZero = false;
187 bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false;
188 bool HasDefaultComponentBroadcast = false;
189 bool HasXF32Insts = false;
190 /// The maximum number of instructions that may be placed within an S_CLAUSE,
191 /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
192 /// indicates a lack of S_CLAUSE support.
193 unsigned MaxHardClauseLength = 0;
194 bool SupportsSRAMECC = false;
195 bool DynamicVGPR = false;
196 bool DynamicVGPRBlockSize32 = false;
197 bool HasVMemToLDSLoad = false;
198
199 // This should not be used directly. 'TargetID' tracks the dynamic settings
200 // for SRAMECC.
201 bool EnableSRAMECC = false;
202
203 bool HasNoSdstCMPX = false;
204 bool HasVscnt = false;
205 bool HasWaitXcnt = false;
206 bool HasGetWaveIdInst = false;
207 bool HasSMemTimeInst = false;
208 bool HasShaderCyclesRegister = false;
209 bool HasShaderCyclesHiLoRegisters = false;
210 bool HasVOP3Literal = false;
211 bool HasNoDataDepHazard = false;
212 bool FlatAddressSpace = false;
213 bool FlatInstOffsets = false;
214 bool FlatGlobalInsts = false;
215 bool FlatScratchInsts = false;
216 bool ScalarFlatScratchInsts = false;
217 bool HasArchitectedFlatScratch = false;
218 bool EnableFlatScratch = false;
219 bool HasArchitectedSGPRs = false;
220 bool HasGDS = false;
221 bool HasGWS = false;
222 bool AddNoCarryInsts = false;
223 bool HasUnpackedD16VMem = false;
224 bool LDSMisalignedBug = false;
225 bool HasMFMAInlineLiteralBug = false;
226 bool UnalignedBufferAccess = false;
227 bool UnalignedDSAccess = false;
228 bool HasPackedTID = false;
229 bool ScalarizeGlobal = false;
230 bool HasSALUFloatInsts = false;
231 bool HasPseudoScalarTrans = false;
232 bool HasRestrictedSOffset = false;
233 bool HasBitOp3Insts = false;
234 bool HasTransposeLoadF4F6Insts = false;
235 bool HasPrngInst = false;
236 bool HasBVHDualAndBVH8Insts = false;
237 bool HasPermlane16Swap = false;
238 bool HasPermlane32Swap = false;
239 bool HasVcmpxPermlaneHazard = false;
240 bool HasVMEMtoScalarWriteHazard = false;
241 bool HasSMEMtoVectorWriteHazard = false;
242 bool HasInstFwdPrefetchBug = false;
243 bool HasSafeSmemPrefetch = false;
244 bool HasVcmpxExecWARHazard = false;
245 bool HasLdsBranchVmemWARHazard = false;
246 bool HasNSAtoVMEMBug = false;
247 bool HasNSAClauseBug = false;
248 bool HasOffset3fBug = false;
249 bool HasFlatSegmentOffsetBug = false;
250 bool HasImageStoreD16Bug = false;
251 bool HasImageGather4D16Bug = false;
252 bool HasMSAALoadDstSelBug = false;
253 bool HasPrivEnabledTrap2NopBug = false;
254 bool Has1_5xVGPRs = false;
255 bool HasMADIntraFwdBug = false;
256 bool HasVOPDInsts = false;
257 bool HasVALUTransUseHazard = false;
258 bool HasRequiredExportPriority = false;
259 bool HasVmemWriteVgprInOrder = false;
260 bool HasAshrPkInsts = false;
261 bool HasMinimum3Maximum3F32 = false;
262 bool HasMinimum3Maximum3F16 = false;
263 bool HasMinimum3Maximum3PKF16 = false;
264 bool HasLshlAddU64Inst = false;
265 bool HasPointSampleAccel = false;
266 bool HasLdsBarrierArriveAtomic = false;
267 bool HasSetPrioIncWgInst = false;
268
269 bool RequiresCOV6 = false;
270 bool UseBlockVGPROpsForCSR = false;
271
272 // Dummy feature to use for assembler in tablegen.
273 bool FeatureDisable = false;
274
275private:
276 SIInstrInfo InstrInfo;
277 SITargetLowering TLInfo;
278 SIFrameLowering FrameLowering;
279
280public:
281 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
282 const GCNTargetMachine &TM);
283 ~GCNSubtarget() override;
284
285 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
286 StringRef GPU, StringRef FS);
287
288 /// Diagnose inconsistent subtarget features before attempting to codegen
289 /// function \p F.
290 void checkSubtargetFeatures(const Function &F) const;
291
292 const SIInstrInfo *getInstrInfo() const override {
293 return &InstrInfo;
294 }
295
296 const SIFrameLowering *getFrameLowering() const override {
297 return &FrameLowering;
298 }
299
300 const SITargetLowering *getTargetLowering() const override {
301 return &TLInfo;
302 }
303
304 const SIRegisterInfo *getRegisterInfo() const override {
305 return &InstrInfo.getRegisterInfo();
306 }
307
308 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
309
310 const CallLowering *getCallLowering() const override {
311 return CallLoweringInfo.get();
312 }
313
314 const InlineAsmLowering *getInlineAsmLowering() const override {
315 return InlineAsmLoweringInfo.get();
316 }
317
318 InstructionSelector *getInstructionSelector() const override {
319 return InstSelector.get();
320 }
321
322 const LegalizerInfo *getLegalizerInfo() const override {
323 return Legalizer.get();
324 }
325
326 const AMDGPURegisterBankInfo *getRegBankInfo() const override {
327 return RegBankInfo.get();
328 }
329
330 const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
331 return TargetID;
332 }
333
334 const InstrItineraryData *getInstrItineraryData() const override {
335 return &InstrItins;
336 }
337
338 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
339
340 Generation getGeneration() const {
341 return (Generation)Gen;
342 }
343
344 unsigned getMaxWaveScratchSize() const {
345 // See COMPUTE_TMPRING_SIZE.WAVESIZE.
346 if (getGeneration() >= GFX12) {
347 // 18-bit field in units of 64-dword.
348 return (64 * 4) * ((1 << 18) - 1);
349 }
350 if (getGeneration() == GFX11) {
351 // 15-bit field in units of 64-dword.
352 return (64 * 4) * ((1 << 15) - 1);
353 }
354 // 13-bit field in units of 256-dword.
355 return (256 * 4) * ((1 << 13) - 1);
356 }
357
358 /// Return the number of high bits known to be zero for a frame index.
359 unsigned getKnownHighZeroBitsForFrameIndex() const {
360 return llvm::countl_zero(Val: getMaxWaveScratchSize()) + getWavefrontSizeLog2();
361 }
362
363 int getLDSBankCount() const {
364 return LDSBankCount;
365 }
366
367 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
368 return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
369 }
370
371 unsigned getConstantBusLimit(unsigned Opcode) const;
372
373 /// Returns if the result of this instruction with a 16-bit result returned in
374 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
375 /// the original value.
376 bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
377
378 bool supportsWGP() const { return getGeneration() >= GFX10; }
379
380 bool hasIntClamp() const {
381 return HasIntClamp;
382 }
383
384 bool hasFP64() const {
385 return FP64;
386 }
387
388 bool hasMIMG_R128() const {
389 return MIMG_R128;
390 }
391
392 bool hasHWFP64() const {
393 return FP64;
394 }
395
396 bool hasHalfRate64Ops() const {
397 return HalfRate64Ops;
398 }
399
400 bool hasFullRate64Ops() const {
401 return FullRate64Ops;
402 }
403
404 bool hasAddr64() const {
405 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
406 }
407
408 bool hasFlat() const {
409 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
410 }
411
412 // Return true if the target only has the reverse operand versions of VALU
413 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
414 bool hasOnlyRevVALUShifts() const {
415 return getGeneration() >= VOLCANIC_ISLANDS;
416 }
417
418 bool hasFractBug() const {
419 return getGeneration() == SOUTHERN_ISLANDS;
420 }
421
422 bool hasBFE() const {
423 return true;
424 }
425
426 bool hasBFI() const {
427 return true;
428 }
429
430 bool hasBFM() const {
431 return hasBFE();
432 }
433
434 bool hasBCNT(unsigned Size) const {
435 return true;
436 }
437
438 bool hasFFBL() const {
439 return true;
440 }
441
442 bool hasFFBH() const {
443 return true;
444 }
445
446 bool hasMed3_16() const {
447 return getGeneration() >= AMDGPUSubtarget::GFX9;
448 }
449
450 bool hasMin3Max3_16() const {
451 return getGeneration() >= AMDGPUSubtarget::GFX9;
452 }
453
454 bool hasFmaMixInsts() const {
455 return HasFmaMixInsts;
456 }
457
458 bool hasCARRY() const {
459 return true;
460 }
461
462 bool hasFMA() const {
463 return FMA;
464 }
465
466 bool hasSwap() const {
467 return GFX9Insts;
468 }
469
470 bool hasScalarPackInsts() const {
471 return GFX9Insts;
472 }
473
474 bool hasScalarMulHiInsts() const {
475 return GFX9Insts;
476 }
477
478 bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
479
480 TrapHandlerAbi getTrapHandlerAbi() const {
481 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
482 }
483
484 bool supportsGetDoorbellID() const {
485 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
486 return getGeneration() >= GFX9;
487 }
488
489 /// True if the offset field of DS instructions works as expected. On SI, the
490 /// offset uses a 16-bit adder and does not always wrap properly.
491 bool hasUsableDSOffset() const {
492 return getGeneration() >= SEA_ISLANDS;
493 }
494
495 bool unsafeDSOffsetFoldingEnabled() const {
496 return EnableUnsafeDSOffsetFolding;
497 }
498
499 /// Condition output from div_scale is usable.
500 bool hasUsableDivScaleConditionOutput() const {
501 return getGeneration() != SOUTHERN_ISLANDS;
502 }
503
504 /// Extra wait hazard is needed in some cases before
505 /// s_cbranch_vccnz/s_cbranch_vccz.
506 bool hasReadVCCZBug() const {
507 return getGeneration() <= SEA_ISLANDS;
508 }
509
510 /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
511 bool partialVCCWritesUpdateVCCZ() const {
512 return getGeneration() >= GFX10;
513 }
514
515 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
516 /// was written by a VALU instruction.
517 bool hasSMRDReadVALUDefHazard() const {
518 return getGeneration() == SOUTHERN_ISLANDS;
519 }
520
521 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
522 /// SGPR was written by a VALU Instruction.
523 bool hasVMEMReadSGPRVALUDefHazard() const {
524 return getGeneration() >= VOLCANIC_ISLANDS;
525 }
526
527 bool hasRFEHazards() const {
528 return getGeneration() >= VOLCANIC_ISLANDS;
529 }
530
531 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
532 unsigned getSetRegWaitStates() const {
533 return getGeneration() <= SEA_ISLANDS ? 1 : 2;
534 }
535
536 bool dumpCode() const {
537 return DumpCode;
538 }
539
540 /// Return the amount of LDS that can be used that will not restrict the
541 /// occupancy lower than WaveCount.
542 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
543 const Function &) const;
544
545 bool supportsMinMaxDenormModes() const {
546 return getGeneration() >= AMDGPUSubtarget::GFX9;
547 }
548
549 /// \returns If target supports S_DENORM_MODE.
550 bool hasDenormModeInst() const {
551 return getGeneration() >= AMDGPUSubtarget::GFX10;
552 }
553
554 bool useFlatForGlobal() const {
555 return FlatForGlobal;
556 }
557
558 /// \returns If target supports ds_read/write_b128 and user enables generation
559 /// of ds_read/write_b128.
560 bool useDS128() const {
561 return CIInsts && EnableDS128;
562 }
563
564 /// \return If target supports ds_read/write_b96/128.
565 bool hasDS96AndDS128() const {
566 return CIInsts;
567 }
568
569 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
570 bool haveRoundOpsF64() const {
571 return CIInsts;
572 }
573
574 /// \returns If MUBUF instructions always perform range checking, even for
575 /// buffer resources used for private memory access.
576 bool privateMemoryResourceIsRangeChecked() const {
577 return getGeneration() < AMDGPUSubtarget::GFX9;
578 }
579
580 /// \returns If target requires PRT Struct NULL support (zero result registers
581 /// for sparse texture support).
582 bool usePRTStrictNull() const {
583 return EnablePRTStrictNull;
584 }
585
586 bool hasAutoWaitcntBeforeBarrier() const {
587 return AutoWaitcntBeforeBarrier;
588 }
589
590 /// \returns true if the target supports backing off of s_barrier instructions
591 /// when an exception is raised.
592 bool supportsBackOffBarrier() const {
593 return BackOffBarrier;
594 }
595
596 bool hasUnalignedBufferAccess() const {
597 return UnalignedBufferAccess;
598 }
599
600 bool hasUnalignedBufferAccessEnabled() const {
601 return UnalignedBufferAccess && UnalignedAccessMode;
602 }
603
604 bool hasUnalignedDSAccess() const {
605 return UnalignedDSAccess;
606 }
607
608 bool hasUnalignedDSAccessEnabled() const {
609 return UnalignedDSAccess && UnalignedAccessMode;
610 }
611
612 bool hasUnalignedScratchAccess() const {
613 return UnalignedScratchAccess;
614 }
615
616 bool hasUnalignedScratchAccessEnabled() const {
617 return UnalignedScratchAccess && UnalignedAccessMode;
618 }
619
620 bool hasUnalignedAccessMode() const {
621 return UnalignedAccessMode;
622 }
623
624 bool hasRelaxedBufferOOBMode() const { return RelaxedBufferOOBMode; }
625
626 bool hasApertureRegs() const {
627 return HasApertureRegs;
628 }
629
630 bool isTrapHandlerEnabled() const {
631 return TrapHandler;
632 }
633
634 bool isXNACKEnabled() const {
635 return TargetID.isXnackOnOrAny();
636 }
637
638 bool isTgSplitEnabled() const {
639 return EnableTgSplit;
640 }
641
642 bool isCuModeEnabled() const {
643 return EnableCuMode;
644 }
645
646 bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
647
648 bool hasFlatAddressSpace() const {
649 return FlatAddressSpace;
650 }
651
652 bool hasFlatScrRegister() const {
653 return hasFlatAddressSpace();
654 }
655
656 bool hasFlatInstOffsets() const {
657 return FlatInstOffsets;
658 }
659
660 bool hasFlatGlobalInsts() const {
661 return FlatGlobalInsts;
662 }
663
664 bool hasFlatScratchInsts() const {
665 return FlatScratchInsts;
666 }
667
668 // Check if target supports ST addressing mode with FLAT scratch instructions.
669 // The ST addressing mode means no registers are used, either VGPR or SGPR,
670 // but only immediate offset is swizzled and added to the FLAT scratch base.
671 bool hasFlatScratchSTMode() const {
672 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
673 }
674
675 bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
676
677 bool hasScalarFlatScratchInsts() const {
678 return ScalarFlatScratchInsts;
679 }
680
  /// \returns true if scratch accesses should be lowered using FLAT scratch
  /// instructions: either the hardware architects flat scratch, or the user
  /// requested it and the target has the instructions.
  bool enableFlatScratch() const {
    return flatScratchIsArchitected() ||
           (EnableFlatScratch && hasFlatScratchInsts());
  }
685
686 bool hasGlobalAddTidInsts() const {
687 return GFX10_BEncoding;
688 }
689
690 bool hasAtomicCSub() const {
691 return GFX10_BEncoding;
692 }
693
694 bool hasMTBUFInsts() const { return !hasGFX1250Insts(); }
695
696 bool hasFormattedMUBUFInsts() const { return !hasGFX1250Insts(); }
697
698 bool hasExportInsts() const {
699 return !hasGFX940Insts() && !hasGFX1250Insts();
700 }
701
702 bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); }
703
704 // DS_ADD_F64/DS_ADD_RTN_F64
705 bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); }
706
707 bool hasMultiDwordFlatScratchAddressing() const {
708 return getGeneration() >= GFX9;
709 }
710
711 bool hasFlatSegmentOffsetBug() const {
712 return HasFlatSegmentOffsetBug;
713 }
714
715 bool hasFlatLgkmVMemCountInOrder() const {
716 return getGeneration() > GFX9;
717 }
718
719 bool hasD16LoadStore() const {
720 return getGeneration() >= GFX9;
721 }
722
723 bool d16PreservesUnusedBits() const {
724 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
725 }
726
727 bool hasD16Images() const {
728 return getGeneration() >= VOLCANIC_ISLANDS;
729 }
730
731 /// Return if most LDS instructions have an m0 use that require m0 to be
732 /// initialized.
733 bool ldsRequiresM0Init() const {
734 return getGeneration() < GFX9;
735 }
736
737 // True if the hardware rewinds and replays GWS operations if a wave is
738 // preempted.
739 //
740 // If this is false, a GWS operation requires testing if a nack set the
741 // MEM_VIOL bit, and repeating if so.
742 bool hasGWSAutoReplay() const {
743 return getGeneration() >= GFX9;
744 }
745
746 /// \returns if target has ds_gws_sema_release_all instruction.
747 bool hasGWSSemaReleaseAll() const {
748 return CIInsts;
749 }
750
751 /// \returns true if the target has integer add/sub instructions that do not
752 /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
753 /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
754 /// for saturation.
755 bool hasAddNoCarry() const {
756 return AddNoCarryInsts;
757 }
758
759 bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
760
761 bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
762
763 bool hasUnpackedD16VMem() const {
764 return HasUnpackedD16VMem;
765 }
766
767 // Covers VS/PS/CS graphics shaders
768 bool isMesaGfxShader(const Function &F) const {
769 return isMesa3DOS() && AMDGPU::isShader(CC: F.getCallingConv());
770 }
771
772 bool hasMad64_32() const {
773 return getGeneration() >= SEA_ISLANDS;
774 }
775
776 bool hasSDWAOmod() const {
777 return HasSDWAOmod;
778 }
779
780 bool hasSDWAScalar() const {
781 return HasSDWAScalar;
782 }
783
784 bool hasSDWASdst() const {
785 return HasSDWASdst;
786 }
787
788 bool hasSDWAMac() const {
789 return HasSDWAMac;
790 }
791
792 bool hasSDWAOutModsVOPC() const {
793 return HasSDWAOutModsVOPC;
794 }
795
796 bool hasDLInsts() const {
797 return HasDLInsts;
798 }
799
800 bool hasFmacF64Inst() const { return HasFmacF64Inst; }
801
802 bool hasDot1Insts() const {
803 return HasDot1Insts;
804 }
805
806 bool hasDot2Insts() const {
807 return HasDot2Insts;
808 }
809
810 bool hasDot3Insts() const {
811 return HasDot3Insts;
812 }
813
814 bool hasDot4Insts() const {
815 return HasDot4Insts;
816 }
817
818 bool hasDot5Insts() const {
819 return HasDot5Insts;
820 }
821
822 bool hasDot6Insts() const {
823 return HasDot6Insts;
824 }
825
826 bool hasDot7Insts() const {
827 return HasDot7Insts;
828 }
829
830 bool hasDot8Insts() const {
831 return HasDot8Insts;
832 }
833
834 bool hasDot9Insts() const {
835 return HasDot9Insts;
836 }
837
838 bool hasDot10Insts() const {
839 return HasDot10Insts;
840 }
841
842 bool hasDot11Insts() const {
843 return HasDot11Insts;
844 }
845
846 bool hasDot12Insts() const {
847 return HasDot12Insts;
848 }
849
850 bool hasDot13Insts() const {
851 return HasDot13Insts;
852 }
853
854 bool hasMAIInsts() const {
855 return HasMAIInsts;
856 }
857
858 bool hasFP8Insts() const {
859 return HasFP8Insts;
860 }
861
862 bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
863
864 bool hasPkFmacF16Inst() const {
865 return HasPkFmacF16Inst;
866 }
867
868 bool hasAtomicFMinFMaxF32GlobalInsts() const {
869 return HasAtomicFMinFMaxF32GlobalInsts;
870 }
871
872 bool hasAtomicFMinFMaxF64GlobalInsts() const {
873 return HasAtomicFMinFMaxF64GlobalInsts;
874 }
875
876 bool hasAtomicFMinFMaxF32FlatInsts() const {
877 return HasAtomicFMinFMaxF32FlatInsts;
878 }
879
880 bool hasAtomicFMinFMaxF64FlatInsts() const {
881 return HasAtomicFMinFMaxF64FlatInsts;
882 }
883
884 bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
885
886 bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
887
888 bool hasAtomicFaddInsts() const {
889 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
890 }
891
892 bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
893
894 bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
895
896 bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
897 return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
898 }
899
900 bool hasAtomicBufferGlobalPkAddF16Insts() const {
901 return HasAtomicBufferGlobalPkAddF16Insts;
902 }
903
904 bool hasAtomicGlobalPkAddBF16Inst() const {
905 return HasAtomicGlobalPkAddBF16Inst;
906 }
907
908 bool hasAtomicBufferPkAddBF16Inst() const {
909 return HasAtomicBufferPkAddBF16Inst;
910 }
911
912 bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
913
914 /// \return true if the target has flat, global, and buffer atomic fadd for
915 /// double.
916 bool hasFlatBufferGlobalAtomicFaddF64Inst() const {
917 return HasFlatBufferGlobalAtomicFaddF64Inst;
918 }
919
920 /// \return true if the target's flat, global, and buffer atomic fadd for
921 /// float supports denormal handling.
922 bool hasMemoryAtomicFaddF32DenormalSupport() const {
923 return HasMemoryAtomicFaddF32DenormalSupport;
924 }
925
926 /// \return true if atomic operations targeting fine-grained memory work
927 /// correctly at device scope, in allocations in host or peer PCIe device
928 /// memory.
929 bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const {
930 return HasAgentScopeFineGrainedRemoteMemoryAtomics;
931 }
932
933 bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
934
935 bool hasDefaultComponentBroadcast() const {
936 return HasDefaultComponentBroadcast;
937 }
938
939 bool hasNoSdstCMPX() const {
940 return HasNoSdstCMPX;
941 }
942
943 bool hasVscnt() const {
944 return HasVscnt;
945 }
946
947 bool hasGetWaveIdInst() const {
948 return HasGetWaveIdInst;
949 }
950
951 bool hasSMemTimeInst() const {
952 return HasSMemTimeInst;
953 }
954
955 bool hasShaderCyclesRegister() const {
956 return HasShaderCyclesRegister;
957 }
958
959 bool hasShaderCyclesHiLoRegisters() const {
960 return HasShaderCyclesHiLoRegisters;
961 }
962
963 bool hasVOP3Literal() const {
964 return HasVOP3Literal;
965 }
966
967 bool hasNoDataDepHazard() const {
968 return HasNoDataDepHazard;
969 }
970
971 bool vmemWriteNeedsExpWaitcnt() const {
972 return getGeneration() < SEA_ISLANDS;
973 }
974
975 bool hasInstPrefetch() const {
976 return getGeneration() == GFX10 || getGeneration() == GFX11;
977 }
978
979 bool hasPrefetch() const { return GFX12Insts; }
980
981 bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; }
982
983 // Has s_cmpk_* instructions.
984 bool hasSCmpK() const { return getGeneration() < GFX12; }
985
986 // Scratch is allocated in 256 dword per wave blocks for the entire
987 // wavefront. When viewed from the perspective of an arbitrary workitem, this
988 // is 4-byte aligned.
989 //
990 // Only 4-byte alignment is really needed to access anything. Transformations
991 // on the pointer value itself may rely on the alignment / known low bits of
992 // the pointer. Set this to something above the minimum to avoid needing
993 // dynamic realignment in common cases.
994 Align getStackAlignment() const { return Align(16); }
995
996 bool enableMachineScheduler() const override {
997 return true;
998 }
999
1000 bool useAA() const override;
1001
1002 bool enableSubRegLiveness() const override {
1003 return true;
1004 }
1005
1006 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
1007 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
1008
1009 // static wrappers
1010 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
1011
1012 // XXX - Why is this here if it isn't in the default pass set?
1013 bool enableEarlyIfConversion() const override {
1014 return true;
1015 }
1016
1017 void overrideSchedPolicy(MachineSchedPolicy &Policy,
1018 unsigned NumRegionInstrs) const override;
1019
1020 void mirFileLoaded(MachineFunction &MF) const override;
1021
1022 unsigned getMaxNumUserSGPRs() const {
1023 return AMDGPU::getMaxNumUserSGPRs(STI: *this);
1024 }
1025
1026 bool hasSMemRealTime() const {
1027 return HasSMemRealTime;
1028 }
1029
1030 bool hasMovrel() const {
1031 return HasMovrel;
1032 }
1033
1034 bool hasVGPRIndexMode() const {
1035 return HasVGPRIndexMode;
1036 }
1037
1038 bool useVGPRIndexMode() const;
1039
1040 bool hasScalarCompareEq64() const {
1041 return getGeneration() >= VOLCANIC_ISLANDS;
1042 }
1043
1044 bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
1045
1046 bool hasScalarStores() const {
1047 return HasScalarStores;
1048 }
1049
1050 bool hasScalarAtomics() const {
1051 return HasScalarAtomics;
1052 }
1053
1054 bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
1055 bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
1056
1057 /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
1058 bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
1059
1060 /// \returns true if the subtarget has the v_permlane64_b32 instruction.
1061 bool hasPermLane64() const { return getGeneration() >= GFX11; }
1062
1063 bool hasDPP() const {
1064 return HasDPP;
1065 }
1066
1067 bool hasDPPBroadcasts() const {
1068 return HasDPP && getGeneration() < GFX10;
1069 }
1070
1071 bool hasDPPWavefrontShifts() const {
1072 return HasDPP && getGeneration() < GFX10;
1073 }
1074
1075 bool hasDPP8() const {
1076 return HasDPP8;
1077 }
1078
1079 bool hasDPALU_DPP() const {
1080 return HasDPALU_DPP;
1081 }
1082
1083 bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
1084
1085 bool hasPackedFP32Ops() const {
1086 return HasPackedFP32Ops;
1087 }
1088
1089 // Has V_PK_MOV_B32 opcode
1090 bool hasPkMovB32() const {
1091 return GFX90AInsts;
1092 }
1093
1094 bool hasFmaakFmamkF32Insts() const {
1095 return getGeneration() >= GFX10 || hasGFX940Insts();
1096 }
1097
1098 bool hasImageInsts() const {
1099 return HasImageInsts;
1100 }
1101
1102 bool hasExtendedImageInsts() const {
1103 return HasExtendedImageInsts;
1104 }
1105
1106 bool hasR128A16() const {
1107 return HasR128A16;
1108 }
1109
1110 bool hasA16() const { return HasA16; }
1111
1112 bool hasG16() const { return HasG16; }
1113
1114 bool hasOffset3fBug() const {
1115 return HasOffset3fBug;
1116 }
1117
1118 bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
1119
1120 bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
1121
1122 bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
1123
1124 bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
1125
1126 bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; }
1127
1128 bool hasNSAEncoding() const { return HasNSAEncoding; }
1129
1130 bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
1131
1132 bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
1133
1134 unsigned getNSAMaxSize(bool HasSampler = false) const {
1135 return AMDGPU::getNSAMaxSize(STI: *this, HasSampler);
1136 }
1137
1138 bool hasGFX10_AEncoding() const {
1139 return GFX10_AEncoding;
1140 }
1141
1142 bool hasGFX10_BEncoding() const {
1143 return GFX10_BEncoding;
1144 }
1145
1146 bool hasGFX10_3Insts() const {
1147 return GFX10_3Insts;
1148 }
1149
1150 bool hasMadF16() const;
1151
1152 bool hasMovB64() const { return GFX940Insts; }
1153
1154 bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; }
1155
1156 bool enableSIScheduler() const {
1157 return EnableSIScheduler;
1158 }
1159
1160 bool loadStoreOptEnabled() const {
1161 return EnableLoadStoreOpt;
1162 }
1163
1164 bool hasSGPRInitBug() const {
1165 return SGPRInitBug;
1166 }
1167
  /// \returns true if the user-SGPR-init-16 hardware bug applies; per the
  /// condition below it is only relevant (and worked around) in wave32 mode.
  bool hasUserSGPRInit16Bug() const {
    return UserSGPRInit16Bug && isWave32();
  }
1171
1172 bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
1173
1174 bool hasNegativeUnalignedScratchOffsetBug() const {
1175 return NegativeUnalignedScratchOffsetBug;
1176 }
1177
1178 bool hasMFMAInlineLiteralBug() const {
1179 return HasMFMAInlineLiteralBug;
1180 }
1181
1182 bool has12DWordStoreHazard() const {
1183 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
1184 }
1185
1186 // \returns true if the subtarget supports DWORDX3 load/store instructions.
1187 bool hasDwordx3LoadStores() const {
1188 return CIInsts;
1189 }
1190
1191 bool hasReadM0MovRelInterpHazard() const {
1192 return getGeneration() == AMDGPUSubtarget::GFX9;
1193 }
1194
1195 bool hasReadM0SendMsgHazard() const {
1196 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1197 getGeneration() <= AMDGPUSubtarget::GFX9;
1198 }
1199
1200 bool hasReadM0LdsDmaHazard() const {
1201 return getGeneration() == AMDGPUSubtarget::GFX9;
1202 }
1203
1204 bool hasReadM0LdsDirectHazard() const {
1205 return getGeneration() == AMDGPUSubtarget::GFX9;
1206 }
1207
1208 bool hasVcmpxPermlaneHazard() const {
1209 return HasVcmpxPermlaneHazard;
1210 }
1211
1212 bool hasVMEMtoScalarWriteHazard() const {
1213 return HasVMEMtoScalarWriteHazard;
1214 }
1215
1216 bool hasSMEMtoVectorWriteHazard() const {
1217 return HasSMEMtoVectorWriteHazard;
1218 }
1219
1220 bool hasLDSMisalignedBug() const {
1221 return LDSMisalignedBug && !EnableCuMode;
1222 }
1223
1224 bool hasInstFwdPrefetchBug() const {
1225 return HasInstFwdPrefetchBug;
1226 }
1227
1228 bool hasVcmpxExecWARHazard() const {
1229 return HasVcmpxExecWARHazard;
1230 }
1231
1232 bool hasLdsBranchVmemWARHazard() const {
1233 return HasLdsBranchVmemWARHazard;
1234 }
1235
  // The shift amount of a 64-bit shift cannot be the highest allocated
  // register if it is also at the end of the allocation block.
1238 bool hasShift64HighRegBug() const {
1239 return GFX90AInsts && !GFX940Insts;
1240 }
1241
1242 // Has one cycle hazard on transcendental instruction feeding a
1243 // non transcendental VALU.
1244 bool hasTransForwardingHazard() const { return GFX940Insts; }
1245
1246 // Has one cycle hazard on a VALU instruction partially writing dst with
1247 // a shift of result bits feeding another VALU instruction.
1248 bool hasDstSelForwardingHazard() const { return GFX940Insts; }
1249
1250 // Cannot use op_sel with v_dot instructions.
1251 bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
1252
  // Does not have HW interlocks for VALU writing and then reading SGPRs.
1254 bool hasVDecCoExecHazard() const {
1255 return GFX940Insts;
1256 }
1257
1258 bool hasNSAtoVMEMBug() const {
1259 return HasNSAtoVMEMBug;
1260 }
1261
1262 bool hasNSAClauseBug() const { return HasNSAClauseBug; }
1263
1264 bool hasHardClauses() const { return MaxHardClauseLength > 0; }
1265
1266 bool hasGFX90AInsts() const { return GFX90AInsts; }
1267
1268 bool hasFPAtomicToDenormModeHazard() const {
1269 return getGeneration() == GFX10;
1270 }
1271
1272 bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
1273
1274 bool hasLdsDirect() const { return getGeneration() >= GFX11; }
1275
1276 bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
1277
1278 bool hasVALUPartialForwardingHazard() const {
1279 return getGeneration() == GFX11;
1280 }
1281
1282 bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
1283
1284 bool hasCvtScaleForwardingHazard() const { return GFX950Insts; }
1285
1286 bool requiresCodeObjectV6() const { return RequiresCOV6; }
1287
1288 bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
1289
1290 bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
1291
1292 bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }
1293
1294 /// Return if operations acting on VGPR tuples require even alignment.
1295 bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; }
1296
1297 /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
1298 bool hasSPackHL() const { return GFX11Insts; }
1299
1300 /// Return true if the target's EXP instruction has the COMPR flag, which
1301 /// affects the meaning of the EN (enable) bits.
1302 bool hasCompressedExport() const { return !GFX11Insts; }
1303
1304 /// Return true if the target's EXP instruction supports the NULL export
1305 /// target.
1306 bool hasNullExportTarget() const { return !GFX11Insts; }
1307
1308 bool has1_5xVGPRs() const { return Has1_5xVGPRs; }
1309
1310 bool hasVOPDInsts() const { return HasVOPDInsts; }
1311
1312 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
1313
1314 /// Return true if the target has the S_DELAY_ALU instruction.
1315 bool hasDelayAlu() const { return GFX11Insts; }
1316
1317 bool hasPackedTID() const { return HasPackedTID; }
1318
  // GFX94* is a derivation of GFX90A. hasGFX940Insts() being true implies that
  // hasGFX90AInsts is also true.
1321 bool hasGFX940Insts() const { return GFX940Insts; }
1322
  // GFX950 is a derivation of GFX94*. hasGFX950Insts() implies that
  // hasGFX940Insts and hasGFX90AInsts are also true.
1325 bool hasGFX950Insts() const { return GFX950Insts; }
1326
1327 /// Returns true if the target supports
1328 /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
1329 /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
1330 bool hasLDSLoadB96_B128() const {
1331 return hasGFX950Insts();
1332 }
1333
1334 bool hasVMemToLDSLoad() const { return HasVMemToLDSLoad; }
1335
1336 bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
1337
1338 bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
1339
1340 bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
1341
1342 bool hasRequiredExportPriority() const { return HasRequiredExportPriority; }
1343
1344 bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; }
1345
1346 /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
1347 /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
1348 bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
1349
1350 /// \returns true if inline constants are not supported for F16 pseudo
1351 /// scalar transcendentals.
1352 bool hasNoF16PseudoScalarTransInlineConstants() const {
1353 return getGeneration() == GFX12;
1354 }
1355
1356 /// \returns true if the target has instructions with xf32 format support.
1357 bool hasXF32Insts() const { return HasXF32Insts; }
1358
1359 bool hasBitOp3Insts() const { return HasBitOp3Insts; }
1360
1361 bool hasPermlane16Swap() const { return HasPermlane16Swap; }
1362 bool hasPermlane32Swap() const { return HasPermlane32Swap; }
1363 bool hasAshrPkInsts() const { return HasAshrPkInsts; }
1364
1365 bool hasMinimum3Maximum3F32() const {
1366 return HasMinimum3Maximum3F32;
1367 }
1368
1369 bool hasMinimum3Maximum3F16() const {
1370 return HasMinimum3Maximum3F16;
1371 }
1372
1373 bool hasMinimum3Maximum3PKF16() const {
1374 return HasMinimum3Maximum3PKF16;
1375 }
1376
1377 bool hasTransposeLoadF4F6Insts() const { return HasTransposeLoadF4F6Insts; }
1378
1379 /// \returns true if the target has s_wait_xcnt insertion. Supported for
1380 /// GFX1250.
1381 bool hasWaitXCnt() const { return HasWaitXcnt; }
1382
1383 bool hasPointSampleAccel() const { return HasPointSampleAccel; }
1384
1385 bool hasLdsBarrierArriveAtomic() const { return HasLdsBarrierArriveAtomic; }
1386
1387 /// \returns The maximum number of instructions that can be enclosed in an
1388 /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
1389 /// instruction.
1390 unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
1391
1392 bool hasPrngInst() const { return HasPrngInst; }
1393
1394 bool hasBVHDualAndBVH8Insts() const { return HasBVHDualAndBVH8Insts; }
1395
1396 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
1397 /// SGPRs
1398 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
1399
1400 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
1401 /// VGPRs
1402 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
1403 unsigned DynamicVGPRBlockSize) const;
1404
1405 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
1406 /// be achieved when the only function running on a CU is \p F, each workgroup
1407 /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
1408 /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
1409 /// range, so this returns a range as well.
1410 ///
1411 /// Note that occupancy can be affected by the scratch allocation as well, but
1412 /// we do not have enough information to compute it.
1413 std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
1414 unsigned LDSSize = 0,
1415 unsigned NumSGPRs = 0,
1416 unsigned NumVGPRs = 0) const;
1417
1418 /// \returns true if the flat_scratch register should be initialized with the
1419 /// pointer to the wave's scratch memory rather than a size and offset.
1420 bool flatScratchIsPointer() const {
1421 return getGeneration() >= AMDGPUSubtarget::GFX9;
1422 }
1423
1424 /// \returns true if the flat_scratch register is initialized by the HW.
1425 /// In this case it is readonly.
1426 bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
1427
1428 /// \returns true if the architected SGPRs are enabled.
1429 bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }
1430
1431 /// \returns true if Global Data Share is supported.
1432 bool hasGDS() const { return HasGDS; }
1433
1434 /// \returns true if Global Wave Sync is supported.
1435 bool hasGWS() const { return HasGWS; }
1436
1437 /// \returns true if the machine has merged shaders in which s0-s7 are
1438 /// reserved by the hardware and user SGPRs start at s8
1439 bool hasMergedShaders() const {
1440 return getGeneration() >= GFX9;
1441 }
1442
1443 // \returns true if the target supports the pre-NGG legacy geometry path.
1444 bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
1445
1446 // \returns true if preloading kernel arguments is supported.
1447 bool hasKernargPreload() const { return KernargPreload; }
1448
1449 // \returns true if the target has split barriers feature
1450 bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
1451
1452 // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
1453 bool hasCvtFP8VOP1Bug() const { return HasCvtFP8Vop1Bug; }
1454
1455 // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
1456 // no-return form.
1457 bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
1458
1459 // \returns true if the target has DX10_CLAMP kernel descriptor mode bit
1460 bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
1461
1462 // \returns true if the target has IEEE kernel descriptor mode bit
1463 bool hasIEEEMode() const { return getGeneration() < GFX12; }
1464
1465 // \returns true if the target has IEEE fminimum/fmaximum instructions
1466 bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
1467
1468 // \returns true if the target has IEEE fminimum3/fmaximum3 instructions
1469 bool hasIEEEMinMax3() const { return hasIEEEMinMax(); }
1470
1471 // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
1472 bool hasRrWGMode() const { return getGeneration() >= GFX12; }
1473
1474 /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
1475 /// values.
1476 bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
1477
1478 bool hasGFX1250Insts() const { return GFX1250Insts; }
1479
1480 // \returns true if target has S_SETPRIO_INC_WG instruction.
1481 bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; }
1482
1483 // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
1484 // of sign-extending.
1485 bool hasGetPCZeroExtension() const { return GFX12Insts; }
1486
1487 /// \returns SGPR allocation granularity supported by the subtarget.
1488 unsigned getSGPRAllocGranule() const {
1489 return AMDGPU::IsaInfo::getSGPRAllocGranule(STI: this);
1490 }
1491
1492 /// \returns SGPR encoding granularity supported by the subtarget.
1493 unsigned getSGPREncodingGranule() const {
1494 return AMDGPU::IsaInfo::getSGPREncodingGranule(STI: this);
1495 }
1496
1497 /// \returns Total number of SGPRs supported by the subtarget.
1498 unsigned getTotalNumSGPRs() const {
1499 return AMDGPU::IsaInfo::getTotalNumSGPRs(STI: this);
1500 }
1501
1502 /// \returns Addressable number of SGPRs supported by the subtarget.
1503 unsigned getAddressableNumSGPRs() const {
1504 return AMDGPU::IsaInfo::getAddressableNumSGPRs(STI: this);
1505 }
1506
1507 /// \returns Minimum number of SGPRs that meets the given number of waves per
1508 /// execution unit requirement supported by the subtarget.
1509 unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1510 return AMDGPU::IsaInfo::getMinNumSGPRs(STI: this, WavesPerEU);
1511 }
1512
1513 /// \returns Maximum number of SGPRs that meets the given number of waves per
1514 /// execution unit requirement supported by the subtarget.
1515 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1516 return AMDGPU::IsaInfo::getMaxNumSGPRs(STI: this, WavesPerEU, Addressable);
1517 }
1518
1519 /// \returns Reserved number of SGPRs. This is common
1520 /// utility function called by MachineFunction and
1521 /// Function variants of getReservedNumSGPRs.
1522 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
1523 /// \returns Reserved number of SGPRs for given machine function \p MF.
1524 unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1525
1526 /// \returns Reserved number of SGPRs for given function \p F.
1527 unsigned getReservedNumSGPRs(const Function &F) const;
1528
1529 /// \returns Maximum number of preloaded SGPRs for the subtarget.
1530 unsigned getMaxNumPreloadedSGPRs() const;
1531
1532 /// \returns max num SGPRs. This is the common utility
1533 /// function called by MachineFunction and Function
1534 /// variants of getMaxNumSGPRs.
1535 unsigned getBaseMaxNumSGPRs(const Function &F,
1536 std::pair<unsigned, unsigned> WavesPerEU,
1537 unsigned PreloadedSGPRs,
1538 unsigned ReservedNumSGPRs) const;
1539
1540 /// \returns Maximum number of SGPRs that meets number of waves per execution
1541 /// unit requirement for function \p MF, or number of SGPRs explicitly
1542 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
1543 ///
1544 /// \returns Value that meets number of waves per execution unit requirement
1545 /// if explicitly requested value cannot be converted to integer, violates
1546 /// subtarget's specifications, or does not meet number of waves per execution
1547 /// unit requirement.
1548 unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1549
1550 /// \returns Maximum number of SGPRs that meets number of waves per execution
1551 /// unit requirement for function \p F, or number of SGPRs explicitly
1552 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
1553 ///
1554 /// \returns Value that meets number of waves per execution unit requirement
1555 /// if explicitly requested value cannot be converted to integer, violates
1556 /// subtarget's specifications, or does not meet number of waves per execution
1557 /// unit requirement.
1558 unsigned getMaxNumSGPRs(const Function &F) const;
1559
1560 /// \returns VGPR allocation granularity supported by the subtarget.
1561 unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
1562 return AMDGPU::IsaInfo::getVGPRAllocGranule(STI: this, DynamicVGPRBlockSize);
1563 }
1564
1565 /// \returns VGPR encoding granularity supported by the subtarget.
1566 unsigned getVGPREncodingGranule() const {
1567 return AMDGPU::IsaInfo::getVGPREncodingGranule(STI: this);
1568 }
1569
1570 /// \returns Total number of VGPRs supported by the subtarget.
1571 unsigned getTotalNumVGPRs() const {
1572 return AMDGPU::IsaInfo::getTotalNumVGPRs(STI: this);
1573 }
1574
1575 /// \returns Addressable number of architectural VGPRs supported by the
1576 /// subtarget.
1577 unsigned getAddressableNumArchVGPRs() const {
1578 return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(STI: this);
1579 }
1580
1581 /// \returns Addressable number of VGPRs supported by the subtarget.
1582 unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
1583 return AMDGPU::IsaInfo::getAddressableNumVGPRs(STI: this, DynamicVGPRBlockSize);
1584 }
1585
1586 /// \returns the minimum number of VGPRs that will prevent achieving more than
1587 /// the specified number of waves \p WavesPerEU.
1588 unsigned getMinNumVGPRs(unsigned WavesPerEU,
1589 unsigned DynamicVGPRBlockSize) const {
1590 return AMDGPU::IsaInfo::getMinNumVGPRs(STI: this, WavesPerEU,
1591 DynamicVGPRBlockSize);
1592 }
1593
1594 /// \returns the maximum number of VGPRs that can be used and still achieved
1595 /// at least the specified number of waves \p WavesPerEU.
1596 unsigned getMaxNumVGPRs(unsigned WavesPerEU,
1597 unsigned DynamicVGPRBlockSize) const {
1598 return AMDGPU::IsaInfo::getMaxNumVGPRs(STI: this, WavesPerEU,
1599 DynamicVGPRBlockSize);
1600 }
1601
1602 /// \returns max num VGPRs. This is the common utility function
1603 /// called by MachineFunction and Function variants of getMaxNumVGPRs.
1604 unsigned
1605 getBaseMaxNumVGPRs(const Function &F,
1606 std::pair<unsigned, unsigned> NumVGPRBounds) const;
1607
1608 /// \returns Maximum number of VGPRs that meets number of waves per execution
1609 /// unit requirement for function \p F, or number of VGPRs explicitly
1610 /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
1611 ///
1612 /// \returns Value that meets number of waves per execution unit requirement
1613 /// if explicitly requested value cannot be converted to integer, violates
1614 /// subtarget's specifications, or does not meet number of waves per execution
1615 /// unit requirement.
1616 unsigned getMaxNumVGPRs(const Function &F) const;
1617
1618 unsigned getMaxNumAGPRs(const Function &F) const {
1619 return getMaxNumVGPRs(F);
1620 }
1621
1622 /// \returns Maximum number of VGPRs that meets number of waves per execution
1623 /// unit requirement for function \p MF, or number of VGPRs explicitly
1624 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
1625 ///
1626 /// \returns Value that meets number of waves per execution unit requirement
1627 /// if explicitly requested value cannot be converted to integer, violates
1628 /// subtarget's specifications, or does not meet number of waves per execution
1629 /// unit requirement.
1630 unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1631
1632 bool isWave32() const {
1633 return getWavefrontSize() == 32;
1634 }
1635
1636 bool isWave64() const {
1637 return getWavefrontSize() == 64;
1638 }
1639
1640 /// Returns if the wavesize of this subtarget is known reliable. This is false
1641 /// only for the a default target-cpu that does not have an explicit
1642 /// +wavefrontsize target feature.
1643 bool isWaveSizeKnown() const {
1644 return hasFeature(Feature: AMDGPU::FeatureWavefrontSize32) ||
1645 hasFeature(Feature: AMDGPU::FeatureWavefrontSize64);
1646 }
1647
1648 const TargetRegisterClass *getBoolRC() const {
1649 return getRegisterInfo()->getBoolRC();
1650 }
1651
1652 /// \returns Maximum number of work groups per compute unit supported by the
1653 /// subtarget and limited by given \p FlatWorkGroupSize.
1654 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1655 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(STI: this, FlatWorkGroupSize);
1656 }
1657
1658 /// \returns Minimum flat work group size supported by the subtarget.
1659 unsigned getMinFlatWorkGroupSize() const override {
1660 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(STI: this);
1661 }
1662
1663 /// \returns Maximum flat work group size supported by the subtarget.
1664 unsigned getMaxFlatWorkGroupSize() const override {
1665 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(STI: this);
1666 }
1667
1668 /// \returns Number of waves per execution unit required to support the given
1669 /// \p FlatWorkGroupSize.
1670 unsigned
1671 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1672 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(STI: this, FlatWorkGroupSize);
1673 }
1674
1675 /// \returns Minimum number of waves per execution unit supported by the
1676 /// subtarget.
1677 unsigned getMinWavesPerEU() const override {
1678 return AMDGPU::IsaInfo::getMinWavesPerEU(STI: this);
1679 }
1680
1681 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
1682 SDep &Dep,
1683 const TargetSchedModel *SchedModel) const override;
1684
1685 // \returns true if it's beneficial on this subtarget for the scheduler to
1686 // cluster stores as well as loads.
1687 bool shouldClusterStores() const { return getGeneration() >= GFX11; }
1688
1689 // \returns the number of address arguments from which to enable MIMG NSA
1690 // on supported architectures.
1691 unsigned getNSAThreshold(const MachineFunction &MF) const;
1692
1693 // \returns true if the subtarget has a hazard requiring an "s_nop 0"
1694 // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
1695 bool requiresNopBeforeDeallocVGPRs() const {
1696 // Currently all targets that support the dealloc VGPRs message also require
1697 // the nop.
1698 return true;
1699 }
1700
1701 bool isDynamicVGPREnabled() const { return DynamicVGPR; }
1702 unsigned getDynamicVGPRBlockSize() const {
1703 return DynamicVGPRBlockSize32 ? 32 : 16;
1704 }
1705
1706 bool requiresDisjointEarlyClobberAndUndef() const override {
1707 // AMDGPU doesn't care if early-clobber and undef operands are allocated
1708 // to the same register.
1709 return false;
1710 }
1711};
1712
/// Computes and tracks which user SGPR fields (dispatch pointer, queue
/// pointer, kernarg segment pointer, etc.) a function requires on a given
/// subtarget, plus how many user SGPRs are used or still free.
class GCNUserSGPRUsageInfo {
public:
  bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }

  bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }

  bool hasDispatchPtr() const { return DispatchPtr; }

  bool hasQueuePtr() const { return QueuePtr; }

  bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }

  bool hasDispatchID() const { return DispatchID; }

  bool hasFlatScratchInit() const { return FlatScratchInit; }

  bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }

  /// \returns the number of SGPRs reserved for preloaded kernel arguments.
  unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }

  /// \returns the total number of user SGPRs in use.
  unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }

  /// \returns the number of user SGPRs still available (defined out of line).
  unsigned getNumFreeUserSGPRs();

  /// Reserve \p NumSGPRs user SGPRs for kernarg preloading (defined out of
  /// line).
  void allocKernargPreloadSGPRs(unsigned NumSGPRs);

  /// Identifiers for the individual user SGPR fields.
  enum UserSGPRID : unsigned {
    ImplicitBufferPtrID = 0,
    PrivateSegmentBufferID = 1,
    DispatchPtrID = 2,
    QueuePtrID = 3,
    KernargSegmentPtrID = 4,
    DispatchIdID = 5,
    FlatScratchInitID = 6,
    PrivateSegmentSizeID = 7
  };

  // Returns the size in number of SGPRs for preload user SGPR field.
  static unsigned getNumUserSGPRForField(UserSGPRID ID) {
    switch (ID) {
    case ImplicitBufferPtrID:
      return 2;
    case PrivateSegmentBufferID:
      return 4;
    case DispatchPtrID:
      return 2;
    case QueuePtrID:
      return 2;
    case KernargSegmentPtrID:
      return 2;
    case DispatchIdID:
      return 2;
    case FlatScratchInitID:
      return 2;
    case PrivateSegmentSizeID:
      return 1;
    }
    llvm_unreachable("Unknown UserSGPRID.");
  }

  GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);

private:
  const GCNSubtarget &ST;

  // Private memory buffer
  // Compute directly in sgpr[0:1]
  // Other shaders indirect 64-bits at sgpr[0:1]
  bool ImplicitBufferPtr = false;

  bool PrivateSegmentBuffer = false;

  bool DispatchPtr = false;

  bool QueuePtr = false;

  bool KernargSegmentPtr = false;

  bool DispatchID = false;

  bool FlatScratchInit = false;

  bool PrivateSegmentSize = false;

  unsigned NumKernargPreloadSGPRs = 0;

  unsigned NumUsedUserSGPRs = 0;
};
1801
1802} // end namespace llvm
1803
1804#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1805