GCNSubtarget.h source code [llvm_projects/llvm/lib/Target/AMDGPU/GCNSubtarget.h]

1	//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//==-----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// AMD GCN specific subclass of TargetSubtarget.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15	#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16
17	#include "AMDGPUCallLowering.h"
18	#include "AMDGPURegisterBankInfo.h"
19	#include "AMDGPUSubtarget.h"
20	#include "SIFrameLowering.h"
21	#include "SIISelLowering.h"
22	#include "SIInstrInfo.h"
23	#include "Utils/AMDGPUBaseInfo.h"
24	#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
25	#include "llvm/Support/ErrorHandling.h"
26
27	#define GET_SUBTARGETINFO_HEADER
28	#include "AMDGPUGenSubtargetInfo.inc"
29
30	namespace llvm {
31
32	class GCNTargetMachine;
33
34	class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
35	public AMDGPUSubtarget {
36	public:
37	using AMDGPUSubtarget::getMaxWavesPerEU;
38
39	// Following 2 enums are documented at:
40	// - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
/// Trap handler ABI in effect for the subtarget; see getTrapHandlerAbi().
enum class TrapHandlerAbi {
  NONE = 0x00,   // Reported when the OS is not AMDHSA.
  AMDHSA = 0x01, // Reported when isAmdHsaOS() is true.
};
45
/// Numeric trap identifiers used with the AMDHSA trap handler ABI.
enum class TrapID {
  LLVMAMDHSATrap = 0x02,
  LLVMAMDHSADebugTrap = 0x03,
};
50
51	private:
52	/// GlobalISel related APIs.
53	std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
54	std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
55	std::unique_ptr<InstructionSelector> InstSelector;
56	std::unique_ptr<LegalizerInfo> Legalizer;
57	std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
58
59	protected:
60	// Basic subtarget description.
61	Triple TargetTriple;
62	AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
63	unsigned Gen = INVALID;
64	InstrItineraryData InstrItins;
// Both default to zero until the subtarget features are parsed.
int LDSBankCount = 0;
unsigned MaxPrivateElementSize = 0;
67
68	// Possibly statically set by tablegen, but may want to be overridden.
69	bool FastDenormalF32 = false;
70	bool HalfRate64Ops = false;
71	bool FullRate64Ops = false;
72
73	// Dynamically set bits that enable features.
74	bool FlatForGlobal = false;
75	bool AutoWaitcntBeforeBarrier = false;
76	bool BackOffBarrier = false;
77	bool UnalignedScratchAccess = false;
78	bool UnalignedAccessMode = false;
79	bool HasApertureRegs = false;
80	bool SupportsXNACK = false;
81	bool KernargPreload = false;
82
83	// This should not be used directly. 'TargetID' tracks the dynamic settings
84	// for XNACK.
85	bool EnableXNACK = false;
86
87	bool EnableTgSplit = false;
88	bool EnableCuMode = false;
89	bool TrapHandler = false;
90	bool EnablePreciseMemory = false;
91
92	// Used as options.
93	bool EnableLoadStoreOpt = false;
94	bool EnableUnsafeDSOffsetFolding = false;
95	bool EnableSIScheduler = false;
96	bool EnableDS128 = false;
97	bool EnablePRTStrictNull = false;
98	bool DumpCode = false;
99
100	// Subtarget properties statically set by tablegen.
101	bool FP64 = false;
102	bool FMA = false;
103	bool MIMG_R128 = false;
104	bool CIInsts = false;
105	bool GFX8Insts = false;
106	bool GFX9Insts = false;
107	bool GFX90AInsts = false;
108	bool GFX940Insts = false;
109	bool GFX10Insts = false;
110	bool GFX11Insts = false;
111	bool GFX12Insts = false;
112	bool GFX10_3Insts = false;
113	bool GFX7GFX8GFX9Insts = false;
114	bool SGPRInitBug = false;
115	bool UserSGPRInit16Bug = false;
116	bool NegativeScratchOffsetBug = false;
117	bool NegativeUnalignedScratchOffsetBug = false;
118	bool HasSMemRealTime = false;
119	bool HasIntClamp = false;
120	bool HasFmaMixInsts = false;
121	bool HasMovrel = false;
122	bool HasVGPRIndexMode = false;
123	bool HasScalarDwordx3Loads = false;
124	bool HasScalarStores = false;
125	bool HasScalarAtomics = false;
126	bool HasSDWAOmod = false;
127	bool HasSDWAScalar = false;
128	bool HasSDWASdst = false;
129	bool HasSDWAMac = false;
130	bool HasSDWAOutModsVOPC = false;
131	bool HasDPP = false;
132	bool HasDPP8 = false;
133	bool HasDPALU_DPP = false;
134	bool HasDPPSrc1SGPR = false;
135	bool HasPackedFP32Ops = false;
136	bool HasImageInsts = false;
137	bool HasExtendedImageInsts = false;
138	bool HasR128A16 = false;
139	bool HasA16 = false;
140	bool HasG16 = false;
141	bool HasNSAEncoding = false;
142	bool HasPartialNSAEncoding = false;
143	bool GFX10_AEncoding = false;
144	bool GFX10_BEncoding = false;
145	bool HasDLInsts = false;
146	bool HasFmacF64Inst = false;
147	bool HasDot1Insts = false;
148	bool HasDot2Insts = false;
149	bool HasDot3Insts = false;
150	bool HasDot4Insts = false;
151	bool HasDot5Insts = false;
152	bool HasDot6Insts = false;
153	bool HasDot7Insts = false;
154	bool HasDot8Insts = false;
155	bool HasDot9Insts = false;
156	bool HasDot10Insts = false;
157	bool HasDot11Insts = false;
158	bool HasMAIInsts = false;
159	bool HasFP8Insts = false;
160	bool HasFP8ConversionInsts = false;
161	bool HasPkFmacF16Inst = false;
162	bool HasAtomicFMinFMaxF32GlobalInsts = false;
163	bool HasAtomicFMinFMaxF64GlobalInsts = false;
164	bool HasAtomicFMinFMaxF32FlatInsts = false;
165	bool HasAtomicFMinFMaxF64FlatInsts = false;
166	bool HasAtomicDsPkAdd16Insts = false;
167	bool HasAtomicFlatPkAdd16Insts = false;
168	bool HasAtomicFaddRtnInsts = false;
169	bool HasAtomicFaddNoRtnInsts = false;
170	bool HasMemoryAtomicFaddF32DenormalSupport = false;
171	bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
172	bool HasAtomicBufferGlobalPkAddF16Insts = false;
173	bool HasAtomicCSubNoRtnInsts = false;
174	bool HasAtomicGlobalPkAddBF16Inst = false;
175	bool HasAtomicBufferPkAddBF16Inst = false;
176	bool HasFlatAtomicFaddF32Inst = false;
177	bool HasFlatBufferGlobalAtomicFaddF64Inst = false;
178	bool HasDefaultComponentZero = false;
179	bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false;
180	bool HasDefaultComponentBroadcast = false;
181	/// The maximum number of instructions that may be placed within an S_CLAUSE,
182	/// which is one greater than the maximum argument to S_CLAUSE. A value of 0
183	/// indicates a lack of S_CLAUSE support.
unsigned MaxHardClauseLength = 0;
185	bool SupportsSRAMECC = false;
186
187	// This should not be used directly. 'TargetID' tracks the dynamic settings
188	// for SRAMECC.
189	bool EnableSRAMECC = false;
190
191	bool HasNoSdstCMPX = false;
192	bool HasVscnt = false;
193	bool HasGetWaveIdInst = false;
194	bool HasSMemTimeInst = false;
195	bool HasShaderCyclesRegister = false;
196	bool HasShaderCyclesHiLoRegisters = false;
197	bool HasVOP3Literal = false;
198	bool HasNoDataDepHazard = false;
199	bool FlatAddressSpace = false;
200	bool FlatInstOffsets = false;
201	bool FlatGlobalInsts = false;
202	bool FlatScratchInsts = false;
203	bool ScalarFlatScratchInsts = false;
204	bool HasArchitectedFlatScratch = false;
205	bool EnableFlatScratch = false;
206	bool HasArchitectedSGPRs = false;
207	bool HasGDS = false;
208	bool HasGWS = false;
209	bool AddNoCarryInsts = false;
210	bool HasUnpackedD16VMem = false;
211	bool LDSMisalignedBug = false;
212	bool HasMFMAInlineLiteralBug = false;
213	bool UnalignedBufferAccess = false;
214	bool UnalignedDSAccess = false;
215	bool HasPackedTID = false;
216	bool ScalarizeGlobal = false;
217	bool HasSALUFloatInsts = false;
218	bool HasVGPRSingleUseHintInsts = false;
219	bool HasPseudoScalarTrans = false;
220	bool HasRestrictedSOffset = false;
221
222	bool HasVcmpxPermlaneHazard = false;
223	bool HasVMEMtoScalarWriteHazard = false;
224	bool HasSMEMtoVectorWriteHazard = false;
225	bool HasInstFwdPrefetchBug = false;
226	bool HasVcmpxExecWARHazard = false;
227	bool HasLdsBranchVmemWARHazard = false;
228	bool HasNSAtoVMEMBug = false;
229	bool HasNSAClauseBug = false;
230	bool HasOffset3fBug = false;
231	bool HasFlatSegmentOffsetBug = false;
232	bool HasImageStoreD16Bug = false;
233	bool HasImageGather4D16Bug = false;
234	bool HasMSAALoadDstSelBug = false;
235	bool HasPrivEnabledTrap2NopBug = false;
236	bool Has1_5xVGPRs = false;
237	bool HasMADIntraFwdBug = false;
238	bool HasVOPDInsts = false;
239	bool HasVALUTransUseHazard = false;
240	bool HasForceStoreSC0SC1 = false;
241	bool HasRequiredExportPriority = false;
242	bool HasVmemWriteVgprInOrder = false;
243
244	bool RequiresCOV6 = false;
245
246	// Dummy feature to use for assembler in tablegen.
247	bool FeatureDisable = false;
248
249	SelectionDAGTargetInfo TSInfo;
250	private:
251	SIInstrInfo InstrInfo;
252	SITargetLowering TLInfo;
253	SIFrameLowering FrameLowering;
254
255	public:
256	GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
257	const GCNTargetMachine &TM);
258	~GCNSubtarget() override;
259
260	GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
261	StringRef GPU, StringRef FS);
262
263	/// Diagnose inconsistent subtarget features before attempting to codegen
264	/// function \p F.
265	void checkSubtargetFeatures(const Function &F) const;
266
267	const SIInstrInfo getInstrInfo() const* override {
268	return &InstrInfo;
269	}
270
271	const SIFrameLowering getFrameLowering() const* override {
272	return &FrameLowering;
273	}
274
275	const SITargetLowering getTargetLowering() const* override {
276	return &TLInfo;
277	}
278
279	const SIRegisterInfo getRegisterInfo() const* override {
280	return &InstrInfo.getRegisterInfo();
281	}
282
283	const CallLowering getCallLowering() const* override {
284	return CallLoweringInfo.get();
285	}
286
287	const InlineAsmLowering getInlineAsmLowering() const* override {
288	return InlineAsmLoweringInfo.get();
289	}
290
291	InstructionSelector getInstructionSelector() const* override {
292	return InstSelector.get();
293	}
294
295	const LegalizerInfo getLegalizerInfo() const* override {
296	return Legalizer.get();
297	}
298
299	const AMDGPURegisterBankInfo getRegBankInfo() const* override {
300	return RegBankInfo.get();
301	}
302
303	const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
304	return TargetID;
305	}
306
307	// Nothing implemented, just prevent crashes on use.
308	const SelectionDAGTargetInfo getSelectionDAGInfo() const* override {
309	return &TSInfo;
310	}
311
312	const InstrItineraryData getInstrItineraryData() const* override {
313	return &InstrItins;
314	}
315
316	void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
317
318	Generation getGeneration() const {
319	return (Generation)Gen;
320	}
321
322	unsigned getMaxWaveScratchSize() const {
323	// See COMPUTE_TMPRING_SIZE.WAVESIZE.
324	if (getGeneration() >= GFX12) {
325	// 18-bit field in units of 64-dword.
326	return (`64` * `4`) * ((`1` << `18`) - `1`);
327	}
328	if (getGeneration() == GFX11) {
329	// 15-bit field in units of 64-dword.
330	return (`64` * `4`) * ((`1` << `15`) - `1`);
331	}
332	// 13-bit field in units of 256-dword.
333	return (`256` * `4`) * ((`1` << `13`) - `1`);
334	}
335
336	/// Return the number of high bits known to be zero for a frame index.
337	unsigned getKnownHighZeroBitsForFrameIndex() const {
338	return llvm::countl_zero(Val: getMaxWaveScratchSize()) + getWavefrontSizeLog2();
339	}
340
341	int getLDSBankCount() const {
342	return LDSBankCount;
343	}
344
345	unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
346	return (ForBufferRSrc \|\| !enableFlatScratch()) ? MaxPrivateElementSize : `16`;
347	}
348
349	unsigned getConstantBusLimit(unsigned Opcode) const;
350
351	/// Returns if the result of this instruction with a 16-bit result returned in
352	/// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
353	/// the original value.
354	bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
355
356	bool supportsWGP() const { return getGeneration() >= GFX10; }
357
358	bool hasIntClamp() const {
359	return HasIntClamp;
360	}
361
362	bool hasFP64() const {
363	return FP64;
364	}
365
366	bool hasMIMG_R128() const {
367	return MIMG_R128;
368	}
369
370	bool hasHWFP64() const {
371	return FP64;
372	}
373
374	bool hasHalfRate64Ops() const {
375	return HalfRate64Ops;
376	}
377
378	bool hasFullRate64Ops() const {
379	return FullRate64Ops;
380	}
381
382	bool hasAddr64() const {
383	return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
384	}
385
386	bool hasFlat() const {
387	return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
388	}
389
390	// Return true if the target only has the reverse operand versions of VALU
391	// shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
392	bool hasOnlyRevVALUShifts() const {
393	return getGeneration() >= VOLCANIC_ISLANDS;
394	}
395
396	bool hasFractBug() const {
397	return getGeneration() == SOUTHERN_ISLANDS;
398	}
399
400	bool hasBFE() const {
401	return true;
402	}
403
404	bool hasBFI() const {
405	return true;
406	}
407
408	bool hasBFM() const {
409	return hasBFE();
410	}
411
412	bool hasBCNT(unsigned Size) const {
413	return true;
414	}
415
416	bool hasFFBL() const {
417	return true;
418	}
419
420	bool hasFFBH() const {
421	return true;
422	}
423
424	bool hasMed3_16() const {
425	return getGeneration() >= AMDGPUSubtarget::GFX9;
426	}
427
428	bool hasMin3Max3_16() const {
429	return getGeneration() >= AMDGPUSubtarget::GFX9;
430	}
431
432	bool hasFmaMixInsts() const {
433	return HasFmaMixInsts;
434	}
435
436	bool hasCARRY() const {
437	return true;
438	}
439
440	bool hasFMA() const {
441	return FMA;
442	}
443
444	bool hasSwap() const {
445	return GFX9Insts;
446	}
447
448	bool hasScalarPackInsts() const {
449	return GFX9Insts;
450	}
451
452	bool hasScalarMulHiInsts() const {
453	return GFX9Insts;
454	}
455
456	bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
457
458	TrapHandlerAbi getTrapHandlerAbi() const {
459	return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
460	}
461
462	bool supportsGetDoorbellID() const {
463	// The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
464	return getGeneration() >= GFX9;
465	}
466
467	/// True if the offset field of DS instructions works as expected. On SI, the
468	/// offset uses a 16-bit adder and does not always wrap properly.
469	bool hasUsableDSOffset() const {
470	return getGeneration() >= SEA_ISLANDS;
471	}
472
473	bool unsafeDSOffsetFoldingEnabled() const {
474	return EnableUnsafeDSOffsetFolding;
475	}
476
477	/// Condition output from div_scale is usable.
478	bool hasUsableDivScaleConditionOutput() const {
479	return getGeneration() != SOUTHERN_ISLANDS;
480	}
481
482	/// Extra wait hazard is needed in some cases before
483	/// s_cbranch_vccnz/s_cbranch_vccz.
484	bool hasReadVCCZBug() const {
485	return getGeneration() <= SEA_ISLANDS;
486	}
487
488	/// Writes to VCC_LO/VCC_HI update the VCCZ flag.
489	bool partialVCCWritesUpdateVCCZ() const {
490	return getGeneration() >= GFX10;
491	}
492
493	/// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
494	/// was written by a VALU instruction.
495	bool hasSMRDReadVALUDefHazard() const {
496	return getGeneration() == SOUTHERN_ISLANDS;
497	}
498
499	/// A read of an SGPR by a VMEM instruction requires 5 wait states when the
500	/// SGPR was written by a VALU Instruction.
501	bool hasVMEMReadSGPRVALUDefHazard() const {
502	return getGeneration() >= VOLCANIC_ISLANDS;
503	}
504
505	bool hasRFEHazards() const {
506	return getGeneration() >= VOLCANIC_ISLANDS;
507	}
508
509	/// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
510	unsigned getSetRegWaitStates() const {
511	return getGeneration() <= SEA_ISLANDS ? `1` : `2`;
512	}
513
514	bool dumpCode() const {
515	return DumpCode;
516	}
517
518	/// Return the amount of LDS that can be used that will not restrict the
519	/// occupancy lower than WaveCount.
520	unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
521	const Function &) const;
522
523	bool supportsMinMaxDenormModes() const {
524	return getGeneration() >= AMDGPUSubtarget::GFX9;
525	}
526
527	/// \returns If target supports S_DENORM_MODE.
528	bool hasDenormModeInst() const {
529	return getGeneration() >= AMDGPUSubtarget::GFX10;
530	}
531
532	bool useFlatForGlobal() const {
533	return FlatForGlobal;
534	}
535
536	/// \returns If target supports ds_read/write_b128 and user enables generation
537	/// of ds_read/write_b128.
538	bool useDS128() const {
539	return CIInsts && EnableDS128;
540	}
541
542	/// \return If target supports ds_read/write_b96/128.
543	bool hasDS96AndDS128() const {
544	return CIInsts;
545	}
546
547	/// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
548	bool haveRoundOpsF64() const {
549	return CIInsts;
550	}
551
552	/// \returns If MUBUF instructions always perform range checking, even for
553	/// buffer resources used for private memory access.
554	bool privateMemoryResourceIsRangeChecked() const {
555	return getGeneration() < AMDGPUSubtarget::GFX9;
556	}
557
558	/// \returns If target requires PRT Strict NULL support (zero result registers
559	/// for sparse texture support).
560	bool usePRTStrictNull() const {
561	return EnablePRTStrictNull;
562	}
563
564	bool hasAutoWaitcntBeforeBarrier() const {
565	return AutoWaitcntBeforeBarrier;
566	}
567
568	/// \returns true if the target supports backing off of s_barrier instructions
569	/// when an exception is raised.
570	bool supportsBackOffBarrier() const {
571	return BackOffBarrier;
572	}
573
574	bool hasUnalignedBufferAccess() const {
575	return UnalignedBufferAccess;
576	}
577
578	bool hasUnalignedBufferAccessEnabled() const {
579	return UnalignedBufferAccess && UnalignedAccessMode;
580	}
581
582	bool hasUnalignedDSAccess() const {
583	return UnalignedDSAccess;
584	}
585
586	bool hasUnalignedDSAccessEnabled() const {
587	return UnalignedDSAccess && UnalignedAccessMode;
588	}
589
590	bool hasUnalignedScratchAccess() const {
591	return UnalignedScratchAccess;
592	}
593
594	bool hasUnalignedAccessMode() const {
595	return UnalignedAccessMode;
596	}
597
598	bool hasApertureRegs() const {
599	return HasApertureRegs;
600	}
601
602	bool isTrapHandlerEnabled() const {
603	return TrapHandler;
604	}
605
606	bool isXNACKEnabled() const {
607	return TargetID.isXnackOnOrAny();
608	}
609
610	bool isTgSplitEnabled() const {
611	return EnableTgSplit;
612	}
613
614	bool isCuModeEnabled() const {
615	return EnableCuMode;
616	}
617
618	bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
619
620	bool hasFlatAddressSpace() const {
621	return FlatAddressSpace;
622	}
623
624	bool hasFlatScrRegister() const {
625	return hasFlatAddressSpace();
626	}
627
628	bool hasFlatInstOffsets() const {
629	return FlatInstOffsets;
630	}
631
632	bool hasFlatGlobalInsts() const {
633	return FlatGlobalInsts;
634	}
635
636	bool hasFlatScratchInsts() const {
637	return FlatScratchInsts;
638	}
639
640	// Check if target supports ST addressing mode with FLAT scratch instructions.
641	// The ST addressing mode means no registers are used, either VGPR or SGPR,
642	// but only immediate offset is swizzled and added to the FLAT scratch base.
643	bool hasFlatScratchSTMode() const {
644	return hasFlatScratchInsts() && (hasGFX10_3Insts() \|\| hasGFX940Insts());
645	}
646
647	bool hasFlatScratchSVSMode() const { return GFX940Insts \|\| GFX11Insts; }
648
649	bool hasScalarFlatScratchInsts() const {
650	return ScalarFlatScratchInsts;
651	}
652
653	bool enableFlatScratch() const {
654	return flatScratchIsArchitected() \|\|
655	(EnableFlatScratch && hasFlatScratchInsts());
656	}
657
658	bool hasGlobalAddTidInsts() const {
659	return GFX10_BEncoding;
660	}
661
662	bool hasAtomicCSub() const {
663	return GFX10_BEncoding;
664	}
665
666	bool hasExportInsts() const {
667	return !hasGFX940Insts();
668	}
669
670	bool hasVINTERPEncoding() const {
671	return GFX11Insts;
672	}
673
674	// DS_ADD_F64/DS_ADD_RTN_F64
675	bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); }
676
677	bool hasMultiDwordFlatScratchAddressing() const {
678	return getGeneration() >= GFX9;
679	}
680
681	bool hasFlatSegmentOffsetBug() const {
682	return HasFlatSegmentOffsetBug;
683	}
684
685	bool hasFlatLgkmVMemCountInOrder() const {
686	return getGeneration() > GFX9;
687	}
688
689	bool hasD16LoadStore() const {
690	return getGeneration() >= GFX9;
691	}
692
693	bool d16PreservesUnusedBits() const {
694	return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
695	}
696
697	bool hasD16Images() const {
698	return getGeneration() >= VOLCANIC_ISLANDS;
699	}
700
701	/// Return if most LDS instructions have an m0 use that require m0 to be
702	/// initialized.
703	bool ldsRequiresM0Init() const {
704	return getGeneration() < GFX9;
705	}
706
707	// True if the hardware rewinds and replays GWS operations if a wave is
708	// preempted.
709	//
710	// If this is false, a GWS operation requires testing if a nack set the
711	// MEM_VIOL bit, and repeating if so.
712	bool hasGWSAutoReplay() const {
713	return getGeneration() >= GFX9;
714	}
715
716	/// \returns if target has ds_gws_sema_release_all instruction.
717	bool hasGWSSemaReleaseAll() const {
718	return CIInsts;
719	}
720
721	/// \returns true if the target has integer add/sub instructions that do not
722	/// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
723	/// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
724	/// for saturation.
725	bool hasAddNoCarry() const {
726	return AddNoCarryInsts;
727	}
728
729	bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
730
731	bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
732
733	bool hasUnpackedD16VMem() const {
734	return HasUnpackedD16VMem;
735	}
736
737	// Covers VS/PS/CS graphics shaders
738	bool isMesaGfxShader(const Function &F) const {
739	return isMesa3DOS() && AMDGPU::isShader(CC: F.getCallingConv());
740	}
741
742	bool hasMad64_32() const {
743	return getGeneration() >= SEA_ISLANDS;
744	}
745
746	bool hasSDWAOmod() const {
747	return HasSDWAOmod;
748	}
749
750	bool hasSDWAScalar() const {
751	return HasSDWAScalar;
752	}
753
754	bool hasSDWASdst() const {
755	return HasSDWASdst;
756	}
757
758	bool hasSDWAMac() const {
759	return HasSDWAMac;
760	}
761
762	bool hasSDWAOutModsVOPC() const {
763	return HasSDWAOutModsVOPC;
764	}
765
766	bool hasDLInsts() const {
767	return HasDLInsts;
768	}
769
770	bool hasFmacF64Inst() const { return HasFmacF64Inst; }
771
772	bool hasDot1Insts() const {
773	return HasDot1Insts;
774	}
775
776	bool hasDot2Insts() const {
777	return HasDot2Insts;
778	}
779
780	bool hasDot3Insts() const {
781	return HasDot3Insts;
782	}
783
784	bool hasDot4Insts() const {
785	return HasDot4Insts;
786	}
787
788	bool hasDot5Insts() const {
789	return HasDot5Insts;
790	}
791
792	bool hasDot6Insts() const {
793	return HasDot6Insts;
794	}
795
796	bool hasDot7Insts() const {
797	return HasDot7Insts;
798	}
799
800	bool hasDot8Insts() const {
801	return HasDot8Insts;
802	}
803
804	bool hasDot9Insts() const {
805	return HasDot9Insts;
806	}
807
808	bool hasDot10Insts() const {
809	return HasDot10Insts;
810	}
811
812	bool hasDot11Insts() const {
813	return HasDot11Insts;
814	}
815
816	bool hasMAIInsts() const {
817	return HasMAIInsts;
818	}
819
820	bool hasFP8Insts() const {
821	return HasFP8Insts;
822	}
823
824	bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
825
826	bool hasPkFmacF16Inst() const {
827	return HasPkFmacF16Inst;
828	}
829
830	bool hasAtomicFMinFMaxF32GlobalInsts() const {
831	return HasAtomicFMinFMaxF32GlobalInsts;
832	}
833
834	bool hasAtomicFMinFMaxF64GlobalInsts() const {
835	return HasAtomicFMinFMaxF64GlobalInsts;
836	}
837
838	bool hasAtomicFMinFMaxF32FlatInsts() const {
839	return HasAtomicFMinFMaxF32FlatInsts;
840	}
841
842	bool hasAtomicFMinFMaxF64FlatInsts() const {
843	return HasAtomicFMinFMaxF64FlatInsts;
844	}
845
846	bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
847
848	bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
849
850	bool hasAtomicFaddInsts() const {
851	return HasAtomicFaddRtnInsts \|\| HasAtomicFaddNoRtnInsts;
852	}
853
854	bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
855
856	bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
857
858	bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
859	return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
860	}
861
862	bool hasAtomicBufferGlobalPkAddF16Insts() const {
863	return HasAtomicBufferGlobalPkAddF16Insts;
864	}
865
866	bool hasAtomicGlobalPkAddBF16Inst() const {
867	return HasAtomicGlobalPkAddBF16Inst;
868	}
869
870	bool hasAtomicBufferPkAddBF16Inst() const {
871	return HasAtomicBufferPkAddBF16Inst;
872	}
873
874	bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
875
876	/// \return true if the target has flat, global, and buffer atomic fadd for
877	/// double.
878	bool hasFlatBufferGlobalAtomicFaddF64Inst() const {
879	return HasFlatBufferGlobalAtomicFaddF64Inst;
880	}
881
882	/// \return true if the target's flat, global, and buffer atomic fadd for
883	/// float supports denormal handling.
884	bool hasMemoryAtomicFaddF32DenormalSupport() const {
885	return HasMemoryAtomicFaddF32DenormalSupport;
886	}
887
888	/// \return true if atomic operations targeting fine-grained memory work
889	/// correctly at device scope, in allocations in host or peer PCIe device
890	/// memory.
891	bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const {
892	return HasAgentScopeFineGrainedRemoteMemoryAtomics;
893	}
894
895	bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
896
897	bool hasDefaultComponentBroadcast() const {
898	return HasDefaultComponentBroadcast;
899	}
900
901	bool hasNoSdstCMPX() const {
902	return HasNoSdstCMPX;
903	}
904
905	bool hasVscnt() const {
906	return HasVscnt;
907	}
908
909	bool hasGetWaveIdInst() const {
910	return HasGetWaveIdInst;
911	}
912
913	bool hasSMemTimeInst() const {
914	return HasSMemTimeInst;
915	}
916
917	bool hasShaderCyclesRegister() const {
918	return HasShaderCyclesRegister;
919	}
920
921	bool hasShaderCyclesHiLoRegisters() const {
922	return HasShaderCyclesHiLoRegisters;
923	}
924
925	bool hasVOP3Literal() const {
926	return HasVOP3Literal;
927	}
928
929	bool hasNoDataDepHazard() const {
930	return HasNoDataDepHazard;
931	}
932
933	bool vmemWriteNeedsExpWaitcnt() const {
934	return getGeneration() < SEA_ISLANDS;
935	}
936
937	bool hasInstPrefetch() const {
938	return getGeneration() == GFX10 \|\| getGeneration() == GFX11;
939	}
940
941	bool hasPrefetch() const { return GFX12Insts; }
942
943	// Has s_cmpk_* instructions.
944	bool hasSCmpK() const { return getGeneration() < GFX12; }
945
946	// Scratch is allocated in 256 dword per wave blocks for the entire
947	// wavefront. When viewed from the perspective of an arbitrary workitem, this
948	// is 4-byte aligned.
949	//
950	// Only 4-byte alignment is really needed to access anything. Transformations
951	// on the pointer value itself may rely on the alignment / known low bits of
952	// the pointer. Set this to something above the minimum to avoid needing
953	// dynamic realignment in common cases.
954	Align getStackAlignment() const { return Align (`16`); }
955
956	bool enableMachineScheduler() const override {
957	return true;
958	}
959
960	bool useAA() const override;
961
962	bool enableSubRegLiveness() const override {
963	return true;
964	}
965
966	void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
967	bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
968
969	// static wrappers
970	static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
971
972	// XXX - Why is this here if it isn't in the default pass set?
973	bool enableEarlyIfConversion() const override {
974	return true;
975	}
976
977	void overrideSchedPolicy(MachineSchedPolicy &Policy,
978	unsigned NumRegionInstrs) const override;
979
980	void mirFileLoaded(MachineFunction &MF) const override;
981
982	unsigned getMaxNumUserSGPRs() const {
983	return AMDGPU::getMaxNumUserSGPRs(STI: *this);
984	}
985
986	bool hasSMemRealTime() const {
987	return HasSMemRealTime;
988	}
989
990	bool hasMovrel() const {
991	return HasMovrel;
992	}
993
994	bool hasVGPRIndexMode() const {
995	return HasVGPRIndexMode;
996	}
997
998	bool useVGPRIndexMode() const;
999
1000	bool hasScalarCompareEq64() const {
1001	return getGeneration() >= VOLCANIC_ISLANDS;
1002	}
1003
1004	bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
1005
1006	bool hasScalarStores() const {
1007	return HasScalarStores;
1008	}
1009
1010	bool hasScalarAtomics() const {
1011	return HasScalarAtomics;
1012	}
1013
1014	bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
1015	bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
1016
1017	/// \returns true if the subtarget has the v_permlanex16_b32 instruction.
1018	bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
1019
1020	/// \returns true if the subtarget has the v_permlane64_b32 instruction.
1021	bool hasPermLane64() const { return getGeneration() >= GFX11; }
1022
1023	bool hasDPP() const {
1024	return HasDPP;
1025	}
1026
1027	bool hasDPPBroadcasts() const {
1028	return HasDPP && getGeneration() < GFX10;
1029	}
1030
1031	bool hasDPPWavefrontShifts() const {
1032	return HasDPP && getGeneration() < GFX10;
1033	}
1034
1035	bool hasDPP8() const {
1036	return HasDPP8;
1037	}
1038
1039	bool hasDPALU_DPP() const {
1040	return HasDPALU_DPP;
1041	}
1042
1043	bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
1044
1045	bool hasPackedFP32Ops() const {
1046	return HasPackedFP32Ops;
1047	}
1048
1049	// Has V_PK_MOV_B32 opcode
1050	bool hasPkMovB32() const {
1051	return GFX90AInsts;
1052	}
1053
1054	bool hasFmaakFmamkF32Insts() const {
1055	return getGeneration() >= GFX10 \|\| hasGFX940Insts();
1056	}
1057
1058	bool hasImageInsts() const {
1059	return HasImageInsts;
1060	}
1061
1062	bool hasExtendedImageInsts() const {
1063	return HasExtendedImageInsts;
1064	}
1065
1066	bool hasR128A16() const {
1067	return HasR128A16;
1068	}
1069
1070	bool hasA16() const { return HasA16; }
1071
1072	bool hasG16() const { return HasG16; }
1073
1074	bool hasOffset3fBug() const {
1075	return HasOffset3fBug;
1076	}
1077
1078	bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
1079
1080	bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
1081
1082	bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
1083
1084	bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
1085
1086	bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; }
1087
1088	bool hasNSAEncoding() const { return HasNSAEncoding; }
1089
1090	bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
1091
1092	bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
1093
1094	unsigned getNSAMaxSize(bool HasSampler = false) const {
1095	return AMDGPU::getNSAMaxSize(STI: *this, HasSampler);
1096	}
1097
1098	bool hasGFX10_AEncoding() const {
1099	return GFX10_AEncoding;
1100	}
1101
1102	bool hasGFX10_BEncoding() const {
1103	return GFX10_BEncoding;
1104	}
1105
1106	bool hasGFX10_3Insts() const {
1107	return GFX10_3Insts;
1108	}
1109
1110	bool hasMadF16() const;
1111
1112	bool hasMovB64() const { return GFX940Insts; }
1113
1114	bool hasLshlAddB64() const { return GFX940Insts; }
1115
1116	bool enableSIScheduler() const {
1117	return EnableSIScheduler;
1118	}
1119
1120	bool loadStoreOptEnabled() const {
1121	return EnableLoadStoreOpt;
1122	}
1123
1124	bool hasSGPRInitBug() const {
1125	return SGPRInitBug;
1126	}
1127
1128	bool hasUserSGPRInit16Bug() const {
1129	return UserSGPRInit16Bug && isWave32();
1130	}
1131
1132	bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
1133
1134	bool hasNegativeUnalignedScratchOffsetBug() const {
1135	return NegativeUnalignedScratchOffsetBug;
1136	}
1137
1138	bool hasMFMAInlineLiteralBug() const {
1139	return HasMFMAInlineLiteralBug;
1140	}
1141
1142	bool has12DWordStoreHazard() const {
1143	return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
1144	}
1145
1146	// \returns true if the subtarget supports DWORDX3 load/store instructions.
1147	bool hasDwordx3LoadStores() const {
1148	return CIInsts;
1149	}
1150
1151	bool hasReadM0MovRelInterpHazard() const {
1152	return getGeneration() == AMDGPUSubtarget::GFX9;
1153	}
1154
1155	bool hasReadM0SendMsgHazard() const {
1156	return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1157	getGeneration() <= AMDGPUSubtarget::GFX9;
1158	}
1159
1160	bool hasReadM0LdsDmaHazard() const {
1161	return getGeneration() == AMDGPUSubtarget::GFX9;
1162	}
1163
1164	bool hasReadM0LdsDirectHazard() const {
1165	return getGeneration() == AMDGPUSubtarget::GFX9;
1166	}
1167
1168	bool hasVcmpxPermlaneHazard() const {
1169	return HasVcmpxPermlaneHazard;
1170	}
1171
1172	bool hasVMEMtoScalarWriteHazard() const {
1173	return HasVMEMtoScalarWriteHazard;
1174	}
1175
1176	bool hasSMEMtoVectorWriteHazard() const {
1177	return HasSMEMtoVectorWriteHazard;
1178	}
1179
1180	bool hasLDSMisalignedBug() const {
1181	return LDSMisalignedBug && !EnableCuMode;
1182	}
1183
1184	bool hasInstFwdPrefetchBug() const {
1185	return HasInstFwdPrefetchBug;
1186	}
1187
1188	bool hasVcmpxExecWARHazard() const {
1189	return HasVcmpxExecWARHazard;
1190	}
1191
1192	bool hasLdsBranchVmemWARHazard() const {
1193	return HasLdsBranchVmemWARHazard;
1194	}
1195
1196	// Shift amount of a 64 bit shift cannot be a highest allocated register
1197	// if also at the end of the allocation block.
1198	bool hasShift64HighRegBug() const {
1199	return GFX90AInsts && !GFX940Insts;
1200	}
1201
1202	// Has one cycle hazard on transcendental instruction feeding a
1203	// non transcendental VALU.
1204	bool hasTransForwardingHazard() const { return GFX940Insts; }
1205
1206	// Has one cycle hazard on a VALU instruction partially writing dst with
1207	// a shift of result bits feeding another VALU instruction.
1208	bool hasDstSelForwardingHazard() const { return GFX940Insts; }
1209
1210	// Cannot use op_sel with v_dot instructions.
1211	bool hasDOTOpSelHazard() const { return GFX940Insts \|\| GFX11Insts; }
1212
1213	// Does not have HW interlocs for VALU writing and then reading SGPRs.
1214	bool hasVDecCoExecHazard() const {
1215	return GFX940Insts;
1216	}
1217
1218	bool hasNSAtoVMEMBug() const {
1219	return HasNSAtoVMEMBug;
1220	}
1221
1222	bool hasNSAClauseBug() const { return HasNSAClauseBug; }
1223
1224	bool hasHardClauses() const { return MaxHardClauseLength > `0`; }
1225
1226	bool hasGFX90AInsts() const { return GFX90AInsts; }
1227
1228	bool hasFPAtomicToDenormModeHazard() const {
1229	return getGeneration() == GFX10;
1230	}
1231
1232	bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
1233
1234	bool hasLdsDirect() const { return getGeneration() >= GFX11; }
1235
1236	bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
1237
1238	bool hasVALUPartialForwardingHazard() const {
1239	return getGeneration() == GFX11;
1240	}
1241
1242	bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
1243
1244	bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }
1245
1246	bool requiresCodeObjectV6() const { return RequiresCOV6; }
1247
1248	bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
1249
1250	/// Return if operations acting on VGPR tuples require even alignment.
1251	bool needsAlignedVGPRs() const { return GFX90AInsts; }
1252
1253	/// Return true if the target has the S_PACK_HL_B32_B16 instruction.
1254	bool hasSPackHL() const { return GFX11Insts; }
1255
1256	/// Return true if the target's EXP instruction has the COMPR flag, which
1257	/// affects the meaning of the EN (enable) bits.
1258	bool hasCompressedExport() const { return !GFX11Insts; }
1259
1260	/// Return true if the target's EXP instruction supports the NULL export
1261	/// target.
1262	bool hasNullExportTarget() const { return !GFX11Insts; }
1263
1264	bool has1_5xVGPRs() const { return Has1_5xVGPRs; }
1265
1266	bool hasVOPDInsts() const { return HasVOPDInsts; }
1267
1268	bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
1269
1270	/// Return true if the target has the S_DELAY_ALU instruction.
1271	bool hasDelayAlu() const { return GFX11Insts; }
1272
1273	bool hasPackedTID() const { return HasPackedTID; }
1274
1275	// GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that
1276	// hasGFX90AInsts is also true.
1277	bool hasGFX940Insts() const { return GFX940Insts; }
1278
1279	bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
1280
1281	bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; }
1282
1283	bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
1284
1285	bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
1286
1287	bool hasRequiredExportPriority() const { return HasRequiredExportPriority; }
1288
1289	bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; }
1290
1291	/// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
1292	/// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
1293	bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
1294
1295	/// \returns true if inline constants are not supported for F16 pseudo
1296	/// scalar transcendentals.
1297	bool hasNoF16PseudoScalarTransInlineConstants() const {
1298	return getGeneration() == GFX12;
1299	}
1300
1301	/// \returns The maximum number of instructions that can be enclosed in an
1302	/// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
1303	/// instruction.
1304	unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
1305
1306	/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
1307	/// SGPRs
1308	unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
1309
1310	/// Return the maximum number of waves per SIMD for kernels using \p VGPRs
1311	/// VGPRs
1312	unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
1313
1314	/// Return occupancy for the given function. Used LDS and a number of
1315	/// registers if provided.
1316	/// Note, occupancy can be affected by the scratch allocation as well, but
1317	/// we do not have enough information to compute it.
1318	unsigned computeOccupancy(const Function &F, unsigned LDSSize = `0`,
1319	unsigned NumSGPRs = `0`, unsigned NumVGPRs = `0`) const;
1320
1321	/// \returns true if the flat_scratch register should be initialized with the
1322	/// pointer to the wave's scratch memory rather than a size and offset.
1323	bool flatScratchIsPointer() const {
1324	return getGeneration() >= AMDGPUSubtarget::GFX9;
1325	}
1326
1327	/// \returns true if the flat_scratch register is initialized by the HW.
1328	/// In this case it is readonly.
1329	bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
1330
1331	/// \returns true if the architected SGPRs are enabled.
1332	bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }
1333
1334	/// \returns true if Global Data Share is supported.
1335	bool hasGDS() const { return HasGDS; }
1336
1337	/// \returns true if Global Wave Sync is supported.
1338	bool hasGWS() const { return HasGWS; }
1339
1340	/// \returns true if the machine has merged shaders in which s0-s7 are
1341	/// reserved by the hardware and user SGPRs start at s8
1342	bool hasMergedShaders() const {
1343	return getGeneration() >= GFX9;
1344	}
1345
1346	// \returns true if the target supports the pre-NGG legacy geometry path.
1347	bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
1348
1349	// \returns true if preloading kernel arguments is supported.
1350	bool hasKernargPreload() const { return KernargPreload; }
1351
1352	// \returns true if the target has split barriers feature
1353	bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
1354
1355	// \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
1356	bool hasCvtFP8VOP1Bug() const { return true; }
1357
1358	// \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
1359	// no-return form.
1360	bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
1361
1362	// \returns true if the target has DX10_CLAMP kernel descriptor mode bit
1363	bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
1364
1365	// \returns true if the target has IEEE kernel descriptor mode bit
1366	bool hasIEEEMode() const { return getGeneration() < GFX12; }
1367
1368	// \returns true if the target has IEEE fminimum/fmaximum instructions
1369	bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
1370
1371	// \returns true if the target has IEEE fminimum3/fmaximum3 instructions
1372	bool hasIEEEMinMax3() const { return hasIEEEMinMax(); }
1373
1374	// \returns true if the target has WG_RR_MODE kernel descriptor mode bit
1375	bool hasRrWGMode() const { return getGeneration() >= GFX12; }
1376
1377	/// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
1378	/// values.
1379	bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
1380
1381	// \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
1382	// of sign-extending.
1383	bool hasGetPCZeroExtension() const { return GFX12Insts; }
1384
1385	/// \returns SGPR allocation granularity supported by the subtarget.
1386	unsigned getSGPRAllocGranule() const {
1387	return AMDGPU::IsaInfo::getSGPRAllocGranule(STI: this);
1388	}
1389
1390	/// \returns SGPR encoding granularity supported by the subtarget.
1391	unsigned getSGPREncodingGranule() const {
1392	return AMDGPU::IsaInfo::getSGPREncodingGranule(STI: this);
1393	}
1394
1395	/// \returns Total number of SGPRs supported by the subtarget.
1396	unsigned getTotalNumSGPRs() const {
1397	return AMDGPU::IsaInfo::getTotalNumSGPRs(STI: this);
1398	}
1399
1400	/// \returns Addressable number of SGPRs supported by the subtarget.
1401	unsigned getAddressableNumSGPRs() const {
1402	return AMDGPU::IsaInfo::getAddressableNumSGPRs(STI: this);
1403	}
1404
1405	/// \returns Minimum number of SGPRs that meets the given number of waves per
1406	/// execution unit requirement supported by the subtarget.
1407	unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1408	return AMDGPU::IsaInfo::getMinNumSGPRs(STI: this, WavesPerEU);
1409	}
1410
1411	/// \returns Maximum number of SGPRs that meets the given number of waves per
1412	/// execution unit requirement supported by the subtarget.
1413	unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1414	return AMDGPU::IsaInfo::getMaxNumSGPRs(STI: this, WavesPerEU, Addressable);
1415	}
1416
1417	/// \returns Reserved number of SGPRs. This is common
1418	/// utility function called by MachineFunction and
1419	/// Function variants of getReservedNumSGPRs.
1420	unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
1421	/// \returns Reserved number of SGPRs for given machine function \p MF.
1422	unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1423
1424	/// \returns Reserved number of SGPRs for given function \p F.
1425	unsigned getReservedNumSGPRs(const Function &F) const;
1426
1427	/// \returns max num SGPRs. This is the common utility
1428	/// function called by MachineFunction and Function
1429	/// variants of getMaxNumSGPRs.
1430	unsigned getBaseMaxNumSGPRs(const Function &F,
1431	std::pair<unsigned, unsigned> WavesPerEU,
1432	unsigned PreloadedSGPRs,
1433	unsigned ReservedNumSGPRs) const;
1434
1435	/// \returns Maximum number of SGPRs that meets number of waves per execution
1436	/// unit requirement for function \p MF, or number of SGPRs explicitly
1437	/// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
1438	///
1439	/// \returns Value that meets number of waves per execution unit requirement
1440	/// if explicitly requested value cannot be converted to integer, violates
1441	/// subtarget's specifications, or does not meet number of waves per execution
1442	/// unit requirement.
1443	unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1444
1445	/// \returns Maximum number of SGPRs that meets number of waves per execution
1446	/// unit requirement for function \p F, or number of SGPRs explicitly
1447	/// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
1448	///
1449	/// \returns Value that meets number of waves per execution unit requirement
1450	/// if explicitly requested value cannot be converted to integer, violates
1451	/// subtarget's specifications, or does not meet number of waves per execution
1452	/// unit requirement.
1453	unsigned getMaxNumSGPRs(const Function &F) const;
1454
1455	/// \returns VGPR allocation granularity supported by the subtarget.
1456	unsigned getVGPRAllocGranule() const {
1457	return AMDGPU::IsaInfo::getVGPRAllocGranule(STI: this);
1458	}
1459
1460	/// \returns VGPR encoding granularity supported by the subtarget.
1461	unsigned getVGPREncodingGranule() const {
1462	return AMDGPU::IsaInfo::getVGPREncodingGranule(STI: this);
1463	}
1464
1465	/// \returns Total number of VGPRs supported by the subtarget.
1466	unsigned getTotalNumVGPRs() const {
1467	return AMDGPU::IsaInfo::getTotalNumVGPRs(STI: this);
1468	}
1469
1470	/// \returns Addressable number of architectural VGPRs supported by the
1471	/// subtarget.
1472	unsigned getAddressableNumArchVGPRs() const {
1473	return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(STI: this);
1474	}
1475
1476	/// \returns Addressable number of VGPRs supported by the subtarget.
1477	unsigned getAddressableNumVGPRs() const {
1478	return AMDGPU::IsaInfo::getAddressableNumVGPRs(STI: this);
1479	}
1480
1481	/// \returns the minimum number of VGPRs that will prevent achieving more than
1482	/// the specified number of waves \p WavesPerEU.
1483	unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1484	return AMDGPU::IsaInfo::getMinNumVGPRs(STI: this, WavesPerEU);
1485	}
1486
1487	/// \returns the maximum number of VGPRs that can be used and still achieved
1488	/// at least the specified number of waves \p WavesPerEU.
1489	unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1490	return AMDGPU::IsaInfo::getMaxNumVGPRs(STI: this, WavesPerEU);
1491	}
1492
1493	/// \returns max num VGPRs. This is the common utility function
1494	/// called by MachineFunction and Function variants of getMaxNumVGPRs.
1495	unsigned getBaseMaxNumVGPRs(const Function &F,
1496	std::pair<unsigned, unsigned> WavesPerEU) const;
1497	/// \returns Maximum number of VGPRs that meets number of waves per execution
1498	/// unit requirement for function \p F, or number of VGPRs explicitly
1499	/// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
1500	///
1501	/// \returns Value that meets number of waves per execution unit requirement
1502	/// if explicitly requested value cannot be converted to integer, violates
1503	/// subtarget's specifications, or does not meet number of waves per execution
1504	/// unit requirement.
1505	unsigned getMaxNumVGPRs(const Function &F) const;
1506
1507	unsigned getMaxNumAGPRs(const Function &F) const {
1508	return getMaxNumVGPRs(F);
1509	}
1510
1511	/// \returns Maximum number of VGPRs that meets number of waves per execution
1512	/// unit requirement for function \p MF, or number of VGPRs explicitly
1513	/// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
1514	///
1515	/// \returns Value that meets number of waves per execution unit requirement
1516	/// if explicitly requested value cannot be converted to integer, violates
1517	/// subtarget's specifications, or does not meet number of waves per execution
1518	/// unit requirement.
1519	unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1520
1521	void getPostRAMutations(
1522	std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
1523	const override;
1524
1525	std::unique_ptr<ScheduleDAGMutation>
1526	createFillMFMAShadowMutation(const TargetInstrInfo TII) const*;
1527
1528	bool isWave32() const {
1529	return getWavefrontSize() == `32`;
1530	}
1531
1532	bool isWave64() const {
1533	return getWavefrontSize() == `64`;
1534	}
1535
1536	const TargetRegisterClass getBoolRC() const* {
1537	return getRegisterInfo()->getBoolRC();
1538	}
1539
1540	/// \returns Maximum number of work groups per compute unit supported by the
1541	/// subtarget and limited by given \p FlatWorkGroupSize.
1542	unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1543	return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(STI: this, FlatWorkGroupSize);
1544	}
1545
1546	/// \returns Minimum flat work group size supported by the subtarget.
1547	unsigned getMinFlatWorkGroupSize() const override {
1548	return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(STI: this);
1549	}
1550
1551	/// \returns Maximum flat work group size supported by the subtarget.
1552	unsigned getMaxFlatWorkGroupSize() const override {
1553	return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(STI: this);
1554	}
1555
1556	/// \returns Number of waves per execution unit required to support the given
1557	/// \p FlatWorkGroupSize.
1558	unsigned
1559	getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1560	return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(STI: this, FlatWorkGroupSize);
1561	}
1562
1563	/// \returns Minimum number of waves per execution unit supported by the
1564	/// subtarget.
1565	unsigned getMinWavesPerEU() const override {
1566	return AMDGPU::IsaInfo::getMinWavesPerEU(STI: this);
1567	}
1568
1569	void adjustSchedDependency(SUnit Def, int* DefOpIdx, SUnit Use, int* UseOpIdx,
1570	SDep &Dep,
1571	const TargetSchedModel SchedModel) const* override;
1572
1573	// \returns true if it's beneficial on this subtarget for the scheduler to
1574	// cluster stores as well as loads.
1575	bool shouldClusterStores() const { return getGeneration() >= GFX11; }
1576
1577	// \returns the number of address arguments from which to enable MIMG NSA
1578	// on supported architectures.
1579	unsigned getNSAThreshold(const MachineFunction &MF) const;
1580
1581	// \returns true if the subtarget has a hazard requiring an "s_nop 0"
1582	// instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
1583	bool requiresNopBeforeDeallocVGPRs() const {
1584	// Currently all targets that support the dealloc VGPRs message also require
1585	// the nop.
1586	return true;
1587	}
1588	};
1589
1590	class GCNUserSGPRUsageInfo {
1591	public:
1592	bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
1593
1594	bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
1595
1596	bool hasDispatchPtr() const { return DispatchPtr; }
1597
1598	bool hasQueuePtr() const { return QueuePtr; }
1599
1600	bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
1601
1602	bool hasDispatchID() const { return DispatchID; }
1603
1604	bool hasFlatScratchInit() const { return FlatScratchInit; }
1605
1606	bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }
1607
1608	unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1609
1610	unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1611
1612	unsigned getNumFreeUserSGPRs();
1613
1614	void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1615
1616	enum UserSGPRID : unsigned {
1617	ImplicitBufferPtrID = `0`,
1618	PrivateSegmentBufferID = `1`,
1619	DispatchPtrID = `2`,
1620	QueuePtrID = `3`,
1621	KernargSegmentPtrID = `4`,
1622	DispatchIdID = `5`,
1623	FlatScratchInitID = `6`,
1624	PrivateSegmentSizeID = `7`
1625	};
1626
1627	// Returns the size in number of SGPRs for preload user SGPR field.
1628	static unsigned getNumUserSGPRForField(UserSGPRID ID) {
1629	switch (ID) {
1630	case ImplicitBufferPtrID:
1631	return `2`;
1632	case PrivateSegmentBufferID:
1633	return `4`;
1634	case DispatchPtrID:
1635	return `2`;
1636	case QueuePtrID:
1637	return `2`;
1638	case KernargSegmentPtrID:
1639	return `2`;
1640	case DispatchIdID:
1641	return `2`;
1642	case FlatScratchInitID:
1643	return `2`;
1644	case PrivateSegmentSizeID:
1645	return `1`;
1646	}
1647	llvm_unreachable("Unknown UserSGPRID.");
1648	}
1649
1650	GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
1651
1652	private:
1653	const GCNSubtarget &ST;
1654
1655	// Private memory buffer
1656	// Compute directly in sgpr[0:1]
1657	// Other shaders indirect 64-bits at sgpr[0:1]
1658	bool ImplicitBufferPtr = false;
1659
1660	bool PrivateSegmentBuffer = false;
1661
1662	bool DispatchPtr = false;
1663
1664	bool QueuePtr = false;
1665
1666	bool KernargSegmentPtr = false;
1667
1668	bool DispatchID = false;
1669
1670	bool FlatScratchInit = false;
1671
1672	bool PrivateSegmentSize = false;
1673
1674	unsigned NumKernargPreloadSGPRs = `0`;
1675
1676	unsigned NumUsedUserSGPRs = `0`;
1677	};
1678
1679	} // end namespace llvm
1680
1681	#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1682

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/GCNSubtarget.h