1//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// AMD GCN specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16
17#include "AMDGPUCallLowering.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUSubtarget.h"
20#include "SIFrameLowering.h"
21#include "SIISelLowering.h"
22#include "SIInstrInfo.h"
23#include "Utils/AMDGPUBaseInfo.h"
24#include "llvm/Support/ErrorHandling.h"
25
26#define GET_SUBTARGETINFO_HEADER
27#include "AMDGPUGenSubtargetInfo.inc"
28
29namespace llvm {
30
31class GCNTargetMachine;
32
33class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
34 public AMDGPUSubtarget {
35public:
36 using AMDGPUSubtarget::getMaxWavesPerEU;
37
38 // Following 2 enums are documented at:
39 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
40 enum class TrapHandlerAbi {
41 NONE = 0x00,
42 AMDHSA = 0x01,
43 };
44
45 enum class TrapID {
46 LLVMAMDHSATrap = 0x02,
47 LLVMAMDHSADebugTrap = 0x03,
48 };
49
50private:
51 /// SelectionDAGISel related APIs.
52 std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
53
54 /// GlobalISel related APIs.
55 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
56 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
57 std::unique_ptr<InstructionSelector> InstSelector;
58 std::unique_ptr<LegalizerInfo> Legalizer;
59 std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
60
61protected:
62 // Basic subtarget description.
63 Triple TargetTriple;
64 AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
65 unsigned Gen = INVALID;
66 InstrItineraryData InstrItins;
67 int LDSBankCount = 0;
68 unsigned MaxPrivateElementSize = 0;
69
70 // Possibly statically set by tablegen, but may want to be overridden.
71 bool FastDenormalF32 = false;
72 bool HalfRate64Ops = false;
73 bool FullRate64Ops = false;
74
75 // Dynamically set bits that enable features.
76 bool FlatForGlobal = false;
77 bool AutoWaitcntBeforeBarrier = false;
78 bool BackOffBarrier = false;
79 bool UnalignedScratchAccess = false;
80 bool UnalignedAccessMode = false;
81 bool RelaxedBufferOOBMode = false;
82 bool HasApertureRegs = false;
83 bool SupportsXNACK = false;
84 bool KernargPreload = false;
85
86 // This should not be used directly. 'TargetID' tracks the dynamic settings
87 // for XNACK.
88 bool EnableXNACK = false;
89
90 bool EnableTgSplit = false;
91 bool EnableCuMode = false;
92 bool TrapHandler = false;
93 bool EnablePreciseMemory = false;
94
95 // Used as options.
96 bool EnableLoadStoreOpt = false;
97 bool EnableUnsafeDSOffsetFolding = false;
98 bool EnableSIScheduler = false;
99 bool EnableDS128 = false;
100 bool EnablePRTStrictNull = false;
101 bool DumpCode = false;
102
103 // Subtarget statically properties set by tablegen
104 bool FP64 = false;
105 bool FMA = false;
106 bool MIMG_R128 = false;
107 bool CIInsts = false;
108 bool GFX8Insts = false;
109 bool GFX9Insts = false;
110 bool GFX90AInsts = false;
111 bool GFX940Insts = false;
112 bool GFX950Insts = false;
113 bool GFX10Insts = false;
114 bool GFX11Insts = false;
115 bool GFX12Insts = false;
116 bool GFX1250Insts = false;
117 bool GFX10_3Insts = false;
118 bool GFX7GFX8GFX9Insts = false;
119 bool SGPRInitBug = false;
120 bool UserSGPRInit16Bug = false;
121 bool NegativeScratchOffsetBug = false;
122 bool NegativeUnalignedScratchOffsetBug = false;
123 bool HasSMemRealTime = false;
124 bool HasIntClamp = false;
125 bool HasFmaMixInsts = false;
126 bool HasMovrel = false;
127 bool HasVGPRIndexMode = false;
128 bool HasScalarDwordx3Loads = false;
129 bool HasScalarStores = false;
130 bool HasScalarAtomics = false;
131 bool HasSDWAOmod = false;
132 bool HasSDWAScalar = false;
133 bool HasSDWASdst = false;
134 bool HasSDWAMac = false;
135 bool HasSDWAOutModsVOPC = false;
136 bool HasDPP = false;
137 bool HasDPP8 = false;
138 bool HasDPALU_DPP = false;
139 bool HasDPPSrc1SGPR = false;
140 bool HasPackedFP32Ops = false;
141 bool HasImageInsts = false;
142 bool HasExtendedImageInsts = false;
143 bool HasR128A16 = false;
144 bool HasA16 = false;
145 bool HasG16 = false;
146 bool HasNSAEncoding = false;
147 bool HasPartialNSAEncoding = false;
148 bool GFX10_AEncoding = false;
149 bool GFX10_BEncoding = false;
150 bool HasDLInsts = false;
151 bool HasFmacF64Inst = false;
152 bool HasDot1Insts = false;
153 bool HasDot2Insts = false;
154 bool HasDot3Insts = false;
155 bool HasDot4Insts = false;
156 bool HasDot5Insts = false;
157 bool HasDot6Insts = false;
158 bool HasDot7Insts = false;
159 bool HasDot8Insts = false;
160 bool HasDot9Insts = false;
161 bool HasDot10Insts = false;
162 bool HasDot11Insts = false;
163 bool HasDot12Insts = false;
164 bool HasDot13Insts = false;
165 bool HasMAIInsts = false;
166 bool HasFP8Insts = false;
167 bool HasFP8ConversionInsts = false;
168 bool HasCvtFP8Vop1Bug = false;
169 bool HasPkFmacF16Inst = false;
170 bool HasAtomicFMinFMaxF32GlobalInsts = false;
171 bool HasAtomicFMinFMaxF64GlobalInsts = false;
172 bool HasAtomicFMinFMaxF32FlatInsts = false;
173 bool HasAtomicFMinFMaxF64FlatInsts = false;
174 bool HasAtomicDsPkAdd16Insts = false;
175 bool HasAtomicFlatPkAdd16Insts = false;
176 bool HasAtomicFaddRtnInsts = false;
177 bool HasAtomicFaddNoRtnInsts = false;
178 bool HasMemoryAtomicFaddF32DenormalSupport = false;
179 bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
180 bool HasAtomicBufferGlobalPkAddF16Insts = false;
181 bool HasAtomicCSubNoRtnInsts = false;
182 bool HasAtomicGlobalPkAddBF16Inst = false;
183 bool HasAtomicBufferPkAddBF16Inst = false;
184 bool HasFlatAtomicFaddF32Inst = false;
185 bool HasFlatBufferGlobalAtomicFaddF64Inst = false;
186 bool HasDefaultComponentZero = false;
187 bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false;
188 bool HasDefaultComponentBroadcast = false;
189 bool HasXF32Insts = false;
190 /// The maximum number of instructions that may be placed within an S_CLAUSE,
191 /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
192 /// indicates a lack of S_CLAUSE support.
193 unsigned MaxHardClauseLength = 0;
194 bool SupportsSRAMECC = false;
195 bool DynamicVGPR = false;
196 bool DynamicVGPRBlockSize32 = false;
197 bool HasVMemToLDSLoad = false;
198
199 // This should not be used directly. 'TargetID' tracks the dynamic settings
200 // for SRAMECC.
201 bool EnableSRAMECC = false;
202
203 bool HasNoSdstCMPX = false;
204 bool HasVscnt = false;
205 bool HasWaitXcnt = false;
206 bool HasGetWaveIdInst = false;
207 bool HasSMemTimeInst = false;
208 bool HasShaderCyclesRegister = false;
209 bool HasShaderCyclesHiLoRegisters = false;
210 bool HasVOP3Literal = false;
211 bool HasNoDataDepHazard = false;
212 bool FlatAddressSpace = false;
213 bool FlatInstOffsets = false;
214 bool FlatGlobalInsts = false;
215 bool FlatScratchInsts = false;
216 bool ScalarFlatScratchInsts = false;
217 bool HasArchitectedFlatScratch = false;
218 bool EnableFlatScratch = false;
219 bool HasArchitectedSGPRs = false;
220 bool HasGDS = false;
221 bool HasGWS = false;
222 bool AddNoCarryInsts = false;
223 bool HasUnpackedD16VMem = false;
224 bool LDSMisalignedBug = false;
225 bool HasMFMAInlineLiteralBug = false;
226 bool UnalignedBufferAccess = false;
227 bool UnalignedDSAccess = false;
228 bool HasPackedTID = false;
229 bool ScalarizeGlobal = false;
230 bool HasSALUFloatInsts = false;
231 bool HasPseudoScalarTrans = false;
232 bool HasRestrictedSOffset = false;
233 bool HasBitOp3Insts = false;
234 bool HasTransposeLoadF4F6Insts = false;
235 bool HasPrngInst = false;
236 bool HasBVHDualAndBVH8Insts = false;
237 bool HasPermlane16Swap = false;
238 bool HasPermlane32Swap = false;
239 bool HasVcmpxPermlaneHazard = false;
240 bool HasVMEMtoScalarWriteHazard = false;
241 bool HasSMEMtoVectorWriteHazard = false;
242 bool HasInstFwdPrefetchBug = false;
243 bool HasSafeSmemPrefetch = false;
244 bool HasVcmpxExecWARHazard = false;
245 bool HasLdsBranchVmemWARHazard = false;
246 bool HasNSAtoVMEMBug = false;
247 bool HasNSAClauseBug = false;
248 bool HasOffset3fBug = false;
249 bool HasFlatSegmentOffsetBug = false;
250 bool HasImageStoreD16Bug = false;
251 bool HasImageGather4D16Bug = false;
252 bool HasMSAALoadDstSelBug = false;
253 bool HasPrivEnabledTrap2NopBug = false;
254 bool Has1_5xVGPRs = false;
255 bool HasMADIntraFwdBug = false;
256 bool HasVOPDInsts = false;
257 bool HasVALUTransUseHazard = false;
258 bool HasRequiredExportPriority = false;
259 bool HasVmemWriteVgprInOrder = false;
260 bool HasAshrPkInsts = false;
261 bool HasMinimum3Maximum3F32 = false;
262 bool HasMinimum3Maximum3F16 = false;
263 bool HasMinimum3Maximum3PKF16 = false;
264 bool HasLshlAddU64Inst = false;
265 bool HasPointSampleAccel = false;
266 bool HasLdsBarrierArriveAtomic = false;
267 bool HasSetPrioIncWgInst = false;
268
269 bool RequiresCOV6 = false;
270 bool UseBlockVGPROpsForCSR = false;
271
272 // Dummy feature to use for assembler in tablegen.
273 bool FeatureDisable = false;
274
275private:
276 SIInstrInfo InstrInfo;
277 SITargetLowering TLInfo;
278 SIFrameLowering FrameLowering;
279
280public:
281 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
282 const GCNTargetMachine &TM);
283 ~GCNSubtarget() override;
284
285 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
286 StringRef GPU, StringRef FS);
287
288 /// Diagnose inconsistent subtarget features before attempting to codegen
289 /// function \p F.
290 void checkSubtargetFeatures(const Function &F) const;
291
292 const SIInstrInfo *getInstrInfo() const override {
293 return &InstrInfo;
294 }
295
296 const SIFrameLowering *getFrameLowering() const override {
297 return &FrameLowering;
298 }
299
300 const SITargetLowering *getTargetLowering() const override {
301 return &TLInfo;
302 }
303
304 const SIRegisterInfo *getRegisterInfo() const override {
305 return &InstrInfo.getRegisterInfo();
306 }
307
308 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
309
310 const CallLowering *getCallLowering() const override {
311 return CallLoweringInfo.get();
312 }
313
314 const InlineAsmLowering *getInlineAsmLowering() const override {
315 return InlineAsmLoweringInfo.get();
316 }
317
318 InstructionSelector *getInstructionSelector() const override {
319 return InstSelector.get();
320 }
321
322 const LegalizerInfo *getLegalizerInfo() const override {
323 return Legalizer.get();
324 }
325
326 const AMDGPURegisterBankInfo *getRegBankInfo() const override {
327 return RegBankInfo.get();
328 }
329
330 const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
331 return TargetID;
332 }
333
334 const InstrItineraryData *getInstrItineraryData() const override {
335 return &InstrItins;
336 }
337
338 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
339
340 Generation getGeneration() const {
341 return (Generation)Gen;
342 }
343
344 unsigned getMaxWaveScratchSize() const {
345 // See COMPUTE_TMPRING_SIZE.WAVESIZE.
346 if (getGeneration() >= GFX12) {
347 // 18-bit field in units of 64-dword.
348 return (64 * 4) * ((1 << 18) - 1);
349 }
350 if (getGeneration() == GFX11) {
351 // 15-bit field in units of 64-dword.
352 return (64 * 4) * ((1 << 15) - 1);
353 }
354 // 13-bit field in units of 256-dword.
355 return (256 * 4) * ((1 << 13) - 1);
356 }
357
358 /// Return the number of high bits known to be zero for a frame index.
359 unsigned getKnownHighZeroBitsForFrameIndex() const {
360 return llvm::countl_zero(Val: getMaxWaveScratchSize()) + getWavefrontSizeLog2();
361 }
362
363 int getLDSBankCount() const {
364 return LDSBankCount;
365 }
366
367 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
368 return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
369 }
370
371 unsigned getConstantBusLimit(unsigned Opcode) const;
372
373 /// Returns if the result of this instruction with a 16-bit result returned in
374 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
375 /// the original value.
376 bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
377
378 bool supportsWGP() const { return getGeneration() >= GFX10; }
379
380 bool hasIntClamp() const {
381 return HasIntClamp;
382 }
383
384 bool hasFP64() const {
385 return FP64;
386 }
387
388 bool hasMIMG_R128() const {
389 return MIMG_R128;
390 }
391
392 bool hasHWFP64() const {
393 return FP64;
394 }
395
396 bool hasHalfRate64Ops() const {
397 return HalfRate64Ops;
398 }
399
400 bool hasFullRate64Ops() const {
401 return FullRate64Ops;
402 }
403
404 bool hasAddr64() const {
405 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
406 }
407
408 bool hasFlat() const {
409 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
410 }
411
412 // Return true if the target only has the reverse operand versions of VALU
413 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
414 bool hasOnlyRevVALUShifts() const {
415 return getGeneration() >= VOLCANIC_ISLANDS;
416 }
417
418 bool hasFractBug() const {
419 return getGeneration() == SOUTHERN_ISLANDS;
420 }
421
422 bool hasBFE() const {
423 return true;
424 }
425
426 bool hasBFI() const {
427 return true;
428 }
429
430 bool hasBFM() const {
431 return hasBFE();
432 }
433
434 bool hasBCNT(unsigned Size) const {
435 return true;
436 }
437
438 bool hasFFBL() const {
439 return true;
440 }
441
442 bool hasFFBH() const {
443 return true;
444 }
445
446 bool hasMed3_16() const {
447 return getGeneration() >= AMDGPUSubtarget::GFX9;
448 }
449
450 bool hasMin3Max3_16() const {
451 return getGeneration() >= AMDGPUSubtarget::GFX9;
452 }
453
454 bool hasFmaMixInsts() const {
455 return HasFmaMixInsts;
456 }
457
458 bool hasCARRY() const {
459 return true;
460 }
461
462 bool hasFMA() const {
463 return FMA;
464 }
465
466 bool hasSwap() const {
467 return GFX9Insts;
468 }
469
470 bool hasScalarPackInsts() const {
471 return GFX9Insts;
472 }
473
474 bool hasScalarMulHiInsts() const {
475 return GFX9Insts;
476 }
477
478 bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
479
480 TrapHandlerAbi getTrapHandlerAbi() const {
481 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
482 }
483
484 bool supportsGetDoorbellID() const {
485 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
486 return getGeneration() >= GFX9;
487 }
488
489 /// True if the offset field of DS instructions works as expected. On SI, the
490 /// offset uses a 16-bit adder and does not always wrap properly.
491 bool hasUsableDSOffset() const {
492 return getGeneration() >= SEA_ISLANDS;
493 }
494
495 bool unsafeDSOffsetFoldingEnabled() const {
496 return EnableUnsafeDSOffsetFolding;
497 }
498
499 /// Condition output from div_scale is usable.
500 bool hasUsableDivScaleConditionOutput() const {
501 return getGeneration() != SOUTHERN_ISLANDS;
502 }
503
504 /// Extra wait hazard is needed in some cases before
505 /// s_cbranch_vccnz/s_cbranch_vccz.
506 bool hasReadVCCZBug() const {
507 return getGeneration() <= SEA_ISLANDS;
508 }
509
510 /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
511 bool partialVCCWritesUpdateVCCZ() const {
512 return getGeneration() >= GFX10;
513 }
514
515 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
516 /// was written by a VALU instruction.
517 bool hasSMRDReadVALUDefHazard() const {
518 return getGeneration() == SOUTHERN_ISLANDS;
519 }
520
521 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
522 /// SGPR was written by a VALU Instruction.
523 bool hasVMEMReadSGPRVALUDefHazard() const {
524 return getGeneration() >= VOLCANIC_ISLANDS;
525 }
526
527 bool hasRFEHazards() const {
528 return getGeneration() >= VOLCANIC_ISLANDS;
529 }
530
531 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
532 unsigned getSetRegWaitStates() const {
533 return getGeneration() <= SEA_ISLANDS ? 1 : 2;
534 }
535
536 bool dumpCode() const {
537 return DumpCode;
538 }
539
540 /// Return the amount of LDS that can be used that will not restrict the
541 /// occupancy lower than WaveCount.
542 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
543 const Function &) const;
544
545 bool supportsMinMaxDenormModes() const {
546 return getGeneration() >= AMDGPUSubtarget::GFX9;
547 }
548
549 /// \returns If target supports S_DENORM_MODE.
550 bool hasDenormModeInst() const {
551 return getGeneration() >= AMDGPUSubtarget::GFX10;
552 }
553
554 bool useFlatForGlobal() const {
555 return FlatForGlobal;
556 }
557
558 /// \returns If target supports ds_read/write_b128 and user enables generation
559 /// of ds_read/write_b128.
560 bool useDS128() const {
561 return CIInsts && EnableDS128;
562 }
563
564 /// \return If target supports ds_read/write_b96/128.
565 bool hasDS96AndDS128() const {
566 return CIInsts;
567 }
568
569 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
570 bool haveRoundOpsF64() const {
571 return CIInsts;
572 }
573
574 /// \returns If MUBUF instructions always perform range checking, even for
575 /// buffer resources used for private memory access.
576 bool privateMemoryResourceIsRangeChecked() const {
577 return getGeneration() < AMDGPUSubtarget::GFX9;
578 }
579
580 /// \returns If target requires PRT Struct NULL support (zero result registers
581 /// for sparse texture support).
582 bool usePRTStrictNull() const {
583 return EnablePRTStrictNull;
584 }
585
586 bool hasAutoWaitcntBeforeBarrier() const {
587 return AutoWaitcntBeforeBarrier;
588 }
589
590 /// \returns true if the target supports backing off of s_barrier instructions
591 /// when an exception is raised.
592 bool supportsBackOffBarrier() const {
593 return BackOffBarrier;
594 }
595
596 bool hasUnalignedBufferAccess() const {
597 return UnalignedBufferAccess;
598 }
599
600 bool hasUnalignedBufferAccessEnabled() const {
601 return UnalignedBufferAccess && UnalignedAccessMode;
602 }
603
604 bool hasUnalignedDSAccess() const {
605 return UnalignedDSAccess;
606 }
607
608 bool hasUnalignedDSAccessEnabled() const {
609 return UnalignedDSAccess && UnalignedAccessMode;
610 }
611
612 bool hasUnalignedScratchAccess() const {
613 return UnalignedScratchAccess;
614 }
615
616 bool hasUnalignedScratchAccessEnabled() const {
617 return UnalignedScratchAccess && UnalignedAccessMode;
618 }
619
620 bool hasUnalignedAccessMode() const {
621 return UnalignedAccessMode;
622 }
623
624 bool hasRelaxedBufferOOBMode() const { return RelaxedBufferOOBMode; }
625
626 bool hasApertureRegs() const {
627 return HasApertureRegs;
628 }
629
630 bool isTrapHandlerEnabled() const {
631 return TrapHandler;
632 }
633
634 bool isXNACKEnabled() const {
635 return TargetID.isXnackOnOrAny();
636 }
637
638 bool isTgSplitEnabled() const {
639 return EnableTgSplit;
640 }
641
642 bool isCuModeEnabled() const {
643 return EnableCuMode;
644 }
645
646 bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
647
648 bool hasFlatAddressSpace() const {
649 return FlatAddressSpace;
650 }
651
652 bool hasFlatScrRegister() const {
653 return hasFlatAddressSpace();
654 }
655
656 bool hasFlatInstOffsets() const {
657 return FlatInstOffsets;
658 }
659
660 bool hasFlatGlobalInsts() const {
661 return FlatGlobalInsts;
662 }
663
664 bool hasFlatScratchInsts() const {
665 return FlatScratchInsts;
666 }
667
668 // Check if target supports ST addressing mode with FLAT scratch instructions.
669 // The ST addressing mode means no registers are used, either VGPR or SGPR,
670 // but only immediate offset is swizzled and added to the FLAT scratch base.
671 bool hasFlatScratchSTMode() const {
672 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
673 }
674
675 bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
676
677 bool hasScalarFlatScratchInsts() const {
678 return ScalarFlatScratchInsts;
679 }
680
  /// \returns true if scratch accesses should be lowered using FLAT scratch
  /// instructions: either the hardware architects flat scratch, or the user
  /// requested it and the target has the instructions.
  bool enableFlatScratch() const {
    return flatScratchIsArchitected() ||
           (EnableFlatScratch && hasFlatScratchInsts());
  }
685
686 bool hasGlobalAddTidInsts() const {
687 return GFX10_BEncoding;
688 }
689
690 bool hasAtomicCSub() const {
691 return GFX10_BEncoding;
692 }
693
694 bool hasMTBUFInsts() const { return !hasGFX1250Insts(); }
695
696 bool hasFormattedMUBUFInsts() const { return !hasGFX1250Insts(); }
697
698 bool hasExportInsts() const {
699 return !hasGFX940Insts() && !hasGFX1250Insts();
700 }
701
702 bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); }
703
704 // DS_ADD_F64/DS_ADD_RTN_F64
705 bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); }
706
707 bool hasMultiDwordFlatScratchAddressing() const {
708 return getGeneration() >= GFX9;
709 }
710
711 bool hasFlatSegmentOffsetBug() const {
712 return HasFlatSegmentOffsetBug;
713 }
714
715 bool hasFlatLgkmVMemCountInOrder() const {
716 return getGeneration() > GFX9;
717 }
718
719 bool hasD16LoadStore() const {
720 return getGeneration() >= GFX9;
721 }
722
723 bool d16PreservesUnusedBits() const {
724 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
725 }
726
727 bool hasD16Images() const {
728 return getGeneration() >= VOLCANIC_ISLANDS;
729 }
730
731 /// Return if most LDS instructions have an m0 use that require m0 to be
732 /// initialized.
733 bool ldsRequiresM0Init() const {
734 return getGeneration() < GFX9;
735 }
736
737 // True if the hardware rewinds and replays GWS operations if a wave is
738 // preempted.
739 //
740 // If this is false, a GWS operation requires testing if a nack set the
741 // MEM_VIOL bit, and repeating if so.
742 bool hasGWSAutoReplay() const {
743 return getGeneration() >= GFX9;
744 }
745
746 /// \returns if target has ds_gws_sema_release_all instruction.
747 bool hasGWSSemaReleaseAll() const {
748 return CIInsts;
749 }
750
751 /// \returns true if the target has integer add/sub instructions that do not
752 /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
753 /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
754 /// for saturation.
755 bool hasAddNoCarry() const {
756 return AddNoCarryInsts;
757 }
758
759 bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
760
761 bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
762
763 bool hasUnpackedD16VMem() const {
764 return HasUnpackedD16VMem;
765 }
766
767 // Covers VS/PS/CS graphics shaders
768 bool isMesaGfxShader(const Function &F) const {
769 return isMesa3DOS() && AMDGPU::isShader(CC: F.getCallingConv());
770 }
771
772 bool hasMad64_32() const {
773 return getGeneration() >= SEA_ISLANDS;
774 }
775
776 bool hasSDWAOmod() const {
777 return HasSDWAOmod;
778 }
779
780 bool hasSDWAScalar() const {
781 return HasSDWAScalar;
782 }
783
784 bool hasSDWASdst() const {
785 return HasSDWASdst;
786 }
787
788 bool hasSDWAMac() const {
789 return HasSDWAMac;
790 }
791
792 bool hasSDWAOutModsVOPC() const {
793 return HasSDWAOutModsVOPC;
794 }
795
796 bool hasDLInsts() const {
797 return HasDLInsts;
798 }
799
800 bool hasFmacF64Inst() const { return HasFmacF64Inst; }
801
802 bool hasDot1Insts() const {
803 return HasDot1Insts;
804 }
805
806 bool hasDot2Insts() const {
807 return HasDot2Insts;
808 }
809
810 bool hasDot3Insts() const {
811 return HasDot3Insts;
812 }
813
814 bool hasDot4Insts() const {
815 return HasDot4Insts;
816 }
817
818 bool hasDot5Insts() const {
819 return HasDot5Insts;
820 }
821
822 bool hasDot6Insts() const {
823 return HasDot6Insts;
824 }
825
826 bool hasDot7Insts() const {
827 return HasDot7Insts;
828 }
829
830 bool hasDot8Insts() const {
831 return HasDot8Insts;
832 }
833
834 bool hasDot9Insts() const {
835 return HasDot9Insts;
836 }
837
838 bool hasDot10Insts() const {
839 return HasDot10Insts;
840 }
841
842 bool hasDot11Insts() const {
843 return HasDot11Insts;
844 }
845
846 bool hasDot12Insts() const {
847 return HasDot12Insts;
848 }
849
850 bool hasDot13Insts() const {
851 return HasDot13Insts;
852 }
853
854 bool hasMAIInsts() const {
855 return HasMAIInsts;
856 }
857
858 bool hasFP8Insts() const {
859 return HasFP8Insts;
860 }
861
862 bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
863
864 bool hasPkFmacF16Inst() const {
865 return HasPkFmacF16Inst;
866 }
867
868 bool hasAtomicFMinFMaxF32GlobalInsts() const {
869 return HasAtomicFMinFMaxF32GlobalInsts;
870 }
871
872 bool hasAtomicFMinFMaxF64GlobalInsts() const {
873 return HasAtomicFMinFMaxF64GlobalInsts;
874 }
875
876 bool hasAtomicFMinFMaxF32FlatInsts() const {
877 return HasAtomicFMinFMaxF32FlatInsts;
878 }
879
880 bool hasAtomicFMinFMaxF64FlatInsts() const {
881 return HasAtomicFMinFMaxF64FlatInsts;
882 }
883
884 bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
885
886 bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
887
888 bool hasAtomicFaddInsts() const {
889 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
890 }
891
892 bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
893
894 bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
895
896 bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
897 return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
898 }
899
900 bool hasAtomicBufferGlobalPkAddF16Insts() const {
901 return HasAtomicBufferGlobalPkAddF16Insts;
902 }
903
904 bool hasAtomicGlobalPkAddBF16Inst() const {
905 return HasAtomicGlobalPkAddBF16Inst;
906 }
907
908 bool hasAtomicBufferPkAddBF16Inst() const {
909 return HasAtomicBufferPkAddBF16Inst;
910 }
911
912 bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
913
914 /// \return true if the target has flat, global, and buffer atomic fadd for
915 /// double.
916 bool hasFlatBufferGlobalAtomicFaddF64Inst() const {
917 return HasFlatBufferGlobalAtomicFaddF64Inst;
918 }
919
920 /// \return true if the target's flat, global, and buffer atomic fadd for
921 /// float supports denormal handling.
922 bool hasMemoryAtomicFaddF32DenormalSupport() const {
923 return HasMemoryAtomicFaddF32DenormalSupport;
924 }
925
926 /// \return true if atomic operations targeting fine-grained memory work
927 /// correctly at device scope, in allocations in host or peer PCIe device
928 /// memory.
929 bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const {
930 return HasAgentScopeFineGrainedRemoteMemoryAtomics;
931 }
932
933 bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
934
935 bool hasDefaultComponentBroadcast() const {
936 return HasDefaultComponentBroadcast;
937 }
938
939 bool hasNoSdstCMPX() const {
940 return HasNoSdstCMPX;
941 }
942
943 bool hasVscnt() const {
944 return HasVscnt;
945 }
946
947 bool hasGetWaveIdInst() const {
948 return HasGetWaveIdInst;
949 }
950
951 bool hasSMemTimeInst() const {
952 return HasSMemTimeInst;
953 }
954
955 bool hasShaderCyclesRegister() const {
956 return HasShaderCyclesRegister;
957 }
958
959 bool hasShaderCyclesHiLoRegisters() const {
960 return HasShaderCyclesHiLoRegisters;
961 }
962
963 bool hasVOP3Literal() const {
964 return HasVOP3Literal;
965 }
966
967 bool hasNoDataDepHazard() const {
968 return HasNoDataDepHazard;
969 }
970
971 bool vmemWriteNeedsExpWaitcnt() const {
972 return getGeneration() < SEA_ISLANDS;
973 }
974
975 bool hasInstPrefetch() const {
976 return getGeneration() == GFX10 || getGeneration() == GFX11;
977 }
978
979 bool hasPrefetch() const { return GFX12Insts; }
980
981 bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; }
982
983 // Has s_cmpk_* instructions.
984 bool hasSCmpK() const { return getGeneration() < GFX12; }
985
986 // Scratch is allocated in 256 dword per wave blocks for the entire
987 // wavefront. When viewed from the perspective of an arbitrary workitem, this
988 // is 4-byte aligned.
989 //
990 // Only 4-byte alignment is really needed to access anything. Transformations
991 // on the pointer value itself may rely on the alignment / known low bits of
992 // the pointer. Set this to something above the minimum to avoid needing
993 // dynamic realignment in common cases.
994 Align getStackAlignment() const { return Align(16); }
995
996 bool enableMachineScheduler() const override {
997 return true;
998 }
999
1000 bool useAA() const override;
1001
1002 bool enableSubRegLiveness() const override {
1003 return true;
1004 }
1005
1006 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
1007 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
1008
1009 // static wrappers
1010 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
1011
1012 // XXX - Why is this here if it isn't in the default pass set?
1013 bool enableEarlyIfConversion() const override {
1014 return true;
1015 }
1016
1017 void overrideSchedPolicy(MachineSchedPolicy &Policy,
1018 unsigned NumRegionInstrs) const override;
1019
1020 void mirFileLoaded(MachineFunction &MF) const override;
1021
1022 unsigned getMaxNumUserSGPRs() const {
1023 return AMDGPU::getMaxNumUserSGPRs(STI: *this);
1024 }
1025
1026 bool hasSMemRealTime() const {
1027 return HasSMemRealTime;
1028 }
1029
1030 bool hasMovrel() const {
1031 return HasMovrel;
1032 }
1033
1034 bool hasVGPRIndexMode() const {
1035 return HasVGPRIndexMode;
1036 }
1037
1038 bool useVGPRIndexMode() const;
1039
1040 bool hasScalarCompareEq64() const {
1041 return getGeneration() >= VOLCANIC_ISLANDS;
1042 }
1043
1044 bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
1045
1046 bool hasScalarStores() const {
1047 return HasScalarStores;
1048 }
1049
1050 bool hasScalarAtomics() const {
1051 return HasScalarAtomics;
1052 }
1053
1054 bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
1055 bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
1056
1057 /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
1058 bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
1059
1060 /// \returns true if the subtarget has the v_permlane64_b32 instruction.
1061 bool hasPermLane64() const { return getGeneration() >= GFX11; }
1062
1063 bool hasDPP() const {
1064 return HasDPP;
1065 }
1066
1067 bool hasDPPBroadcasts() const {
1068 return HasDPP && getGeneration() < GFX10;
1069 }
1070
1071 bool hasDPPWavefrontShifts() const {
1072 return HasDPP && getGeneration() < GFX10;
1073 }
1074
1075 bool hasDPP8() const {
1076 return HasDPP8;
1077 }
1078
1079 bool hasDPALU_DPP() const {
1080 return HasDPALU_DPP;
1081 }
1082
1083 bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
1084
1085 bool hasPackedFP32Ops() const {
1086 return HasPackedFP32Ops;
1087 }
1088
1089 // Has V_PK_MOV_B32 opcode
1090 bool hasPkMovB32() const {
1091 return GFX90AInsts;
1092 }
1093
1094 bool hasFmaakFmamkF32Insts() const {
1095 return getGeneration() >= GFX10 || hasGFX940Insts();
1096 }
1097
1098 bool hasImageInsts() const {
1099 return HasImageInsts;
1100 }
1101
1102 bool hasExtendedImageInsts() const {
1103 return HasExtendedImageInsts;
1104 }
1105
1106 bool hasR128A16() const {
1107 return HasR128A16;
1108 }
1109
1110 bool hasA16() const { return HasA16; }
1111
1112 bool hasG16() const { return HasG16; }
1113
1114 bool hasOffset3fBug() const {
1115 return HasOffset3fBug;
1116 }
1117
1118 bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
1119
1120 bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
1121
1122 bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
1123
1124 bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
1125
1126 bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; }
1127
1128 bool hasNSAEncoding() const { return HasNSAEncoding; }
1129
1130 bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
1131
1132 bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
1133
1134 unsigned getNSAMaxSize(bool HasSampler = false) const {
1135 return AMDGPU::getNSAMaxSize(STI: *this, HasSampler);
1136 }
1137
1138 bool hasGFX10_AEncoding() const {
1139 return GFX10_AEncoding;
1140 }
1141
1142 bool hasGFX10_BEncoding() const {
1143 return GFX10_BEncoding;
1144 }
1145
1146 bool hasGFX10_3Insts() const {
1147 return GFX10_3Insts;
1148 }
1149
1150 bool hasMadF16() const;
1151
1152 bool hasMovB64() const { return GFX940Insts; }
1153
1154 bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; }
1155
1156 bool enableSIScheduler() const {
1157 return EnableSIScheduler;
1158 }
1159
1160 bool loadStoreOptEnabled() const {
1161 return EnableLoadStoreOpt;
1162 }
1163
1164 bool hasSGPRInitBug() const {
1165 return SGPRInitBug;
1166 }
1167
  /// \returns true if the user-SGPR-init-16 hardware bug applies; per the
  /// condition below it is only relevant (and worked around) in wave32 mode.
  bool hasUserSGPRInit16Bug() const {
    return UserSGPRInit16Bug && isWave32();
  }
1171
1172 bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
1173
1174 bool hasNegativeUnalignedScratchOffsetBug() const {
1175 return NegativeUnalignedScratchOffsetBug;
1176 }
1177
1178 bool hasMFMAInlineLiteralBug() const {
1179 return HasMFMAInlineLiteralBug;
1180 }
1181
1182 bool has12DWordStoreHazard() const {
1183 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
1184 }
1185
1186 // \returns true if the subtarget supports DWORDX3 load/store instructions.
1187 bool hasDwordx3LoadStores() const {
1188 return CIInsts;
1189 }
1190
1191 bool hasReadM0MovRelInterpHazard() const {
1192 return getGeneration() == AMDGPUSubtarget::GFX9;
1193 }
1194
1195 bool hasReadM0SendMsgHazard() const {
1196 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1197 getGeneration() <= AMDGPUSubtarget::GFX9;
1198 }
1199
1200 bool hasReadM0LdsDmaHazard() const {
1201 return getGeneration() == AMDGPUSubtarget::GFX9;
1202 }
1203
1204 bool hasReadM0LdsDirectHazard() const {
1205 return getGeneration() == AMDGPUSubtarget::GFX9;
1206 }
1207
1208 bool hasVcmpxPermlaneHazard() const {
1209 return HasVcmpxPermlaneHazard;
1210 }
1211
1212 bool hasVMEMtoScalarWriteHazard() const {
1213 return HasVMEMtoScalarWriteHazard;
1214 }
1215
1216 bool hasSMEMtoVectorWriteHazard() const {
1217 return HasSMEMtoVectorWriteHazard;
1218 }
1219
1220 bool hasLDSMisalignedBug() const {
1221 return LDSMisalignedBug && !EnableCuMode;
1222 }
1223
1224 bool hasInstFwdPrefetchBug() const {
1225 return HasInstFwdPrefetchBug;
1226 }
1227
1228 bool hasVcmpxExecWARHazard() const {
1229 return HasVcmpxExecWARHazard;
1230 }
1231
1232 bool hasLdsBranchVmemWARHazard() const {
1233 return HasLdsBranchVmemWARHazard;
1234 }
1235
  // The shift amount of a 64-bit shift cannot be the highest allocated
  // register if it is also at the end of the allocation block.
1238 bool hasShift64HighRegBug() const {
1239 return GFX90AInsts && !GFX940Insts;
1240 }
1241
1242 // Has one cycle hazard on transcendental instruction feeding a
1243 // non transcendental VALU.
1244 bool hasTransForwardingHazard() const { return GFX940Insts; }
1245
1246 // Has one cycle hazard on a VALU instruction partially writing dst with
1247 // a shift of result bits feeding another VALU instruction.
1248 bool hasDstSelForwardingHazard() const { return GFX940Insts; }
1249
1250 // Cannot use op_sel with v_dot instructions.
1251 bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
1252
  // Does not have HW interlocks for VALU writing and then reading SGPRs.
1254 bool hasVDecCoExecHazard() const {
1255 return GFX940Insts;
1256 }
1257
1258 bool hasNSAtoVMEMBug() const {
1259 return HasNSAtoVMEMBug;
1260 }
1261
1262 bool hasNSAClauseBug() const { return HasNSAClauseBug; }
1263
1264 bool hasHardClauses() const { return MaxHardClauseLength > 0; }
1265
1266 bool hasGFX90AInsts() const { return GFX90AInsts; }
1267
1268 bool hasFPAtomicToDenormModeHazard() const {
1269 return getGeneration() == GFX10;
1270 }
1271
1272 bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
1273
1274 bool hasLdsDirect() const { return getGeneration() >= GFX11; }
1275
1276 bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
1277
1278 bool hasVALUPartialForwardingHazard() const {
1279 return getGeneration() == GFX11;
1280 }
1281
1282 bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
1283
1284 bool hasCvtScaleForwardingHazard() const { return GFX950Insts; }
1285
1286 bool requiresCodeObjectV6() const { return RequiresCOV6; }
1287
1288 bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
1289
1290 bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
1291
1292 bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }
1293
1294 /// Return if operations acting on VGPR tuples require even alignment.
1295 bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; }
1296
1297 /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
1298 bool hasSPackHL() const { return GFX11Insts; }
1299
1300 /// Return true if the target's EXP instruction has the COMPR flag, which
1301 /// affects the meaning of the EN (enable) bits.
1302 bool hasCompressedExport() const { return !GFX11Insts; }
1303
1304 /// Return true if the target's EXP instruction supports the NULL export
1305 /// target.
1306 bool hasNullExportTarget() const { return !GFX11Insts; }
1307
1308 bool has1_5xVGPRs() const { return Has1_5xVGPRs; }
1309
1310 bool hasVOPDInsts() const { return HasVOPDInsts; }
1311
1312 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
1313
1314 /// Return true if the target has the S_DELAY_ALU instruction.
1315 bool hasDelayAlu() const { return GFX11Insts; }
1316
1317 bool hasPackedTID() const { return HasPackedTID; }
1318
  // GFX94* is a derivation of GFX90A. hasGFX940Insts() being true implies that
  // hasGFX90AInsts is also true.
1321 bool hasGFX940Insts() const { return GFX940Insts; }
1322
  // GFX950 is a derivation of GFX94*. hasGFX950Insts() implies that
  // hasGFX940Insts and hasGFX90AInsts are also true.
1325 bool hasGFX950Insts() const { return GFX950Insts; }
1326
1327 /// Returns true if the target supports
1328 /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
1329 /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
1330 bool hasLDSLoadB96_B128() const {
1331 return hasGFX950Insts();
1332 }
1333
1334 bool hasVMemToLDSLoad() const { return HasVMemToLDSLoad; }
1335
1336 bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
1337
1338 bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
1339
1340 bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
1341
1342 bool hasRequiredExportPriority() const { return HasRequiredExportPriority; }
1343
1344 bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; }
1345
1346 /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
1347 /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
1348 bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
1349
1350 /// \returns true if inline constants are not supported for F16 pseudo
1351 /// scalar transcendentals.
1352 bool hasNoF16PseudoScalarTransInlineConstants() const {
1353 return getGeneration() == GFX12;
1354 }
1355
1356 /// \returns true if the target has instructions with xf32 format support.
1357 bool hasXF32Insts() const { return HasXF32Insts; }
1358
1359 bool hasBitOp3Insts() const { return HasBitOp3Insts; }
1360
1361 bool hasPermlane16Swap() const { return HasPermlane16Swap; }
1362 bool hasPermlane32Swap() const { return HasPermlane32Swap; }
1363 bool hasAshrPkInsts() const { return HasAshrPkInsts; }
1364
1365 bool hasMinimum3Maximum3F32() const {
1366 return HasMinimum3Maximum3F32;
1367 }
1368
1369 bool hasMinimum3Maximum3F16() const {
1370 return HasMinimum3Maximum3F16;
1371 }
1372
1373 bool hasMinimum3Maximum3PKF16() const {
1374 return HasMinimum3Maximum3PKF16;
1375 }
1376
1377 bool hasTransposeLoadF4F6Insts() const { return HasTransposeLoadF4F6Insts; }
1378
1379 /// \returns true if the target has s_wait_xcnt insertion. Supported for
1380 /// GFX1250.
1381 bool hasWaitXCnt() const { return HasWaitXcnt; }
1382
1383 bool hasPointSampleAccel() const { return HasPointSampleAccel; }
1384
1385 bool hasLdsBarrierArriveAtomic() const { return HasLdsBarrierArriveAtomic; }
1386
1387 /// \returns The maximum number of instructions that can be enclosed in an
1388 /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
1389 /// instruction.
1390 unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
1391
1392 bool hasPrngInst() const { return HasPrngInst; }
1393
1394 bool hasBVHDualAndBVH8Insts() const { return HasBVHDualAndBVH8Insts; }
1395
1396 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
1397 /// SGPRs
1398 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
1399
1400 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
1401 /// VGPRs
1402 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
1403 unsigned DynamicVGPRBlockSize) const;
1404
1405 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
1406 /// be achieved when the only function running on a CU is \p F, each workgroup
1407 /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
1408 /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
1409 /// range, so this returns a range as well.
1410 ///
1411 /// Note that occupancy can be affected by the scratch allocation as well, but
1412 /// we do not have enough information to compute it.
1413 std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
1414 unsigned LDSSize = 0,
1415 unsigned NumSGPRs = 0,
1416 unsigned NumVGPRs = 0) const;
1417
1418 /// \returns true if the flat_scratch register should be initialized with the
1419 /// pointer to the wave's scratch memory rather than a size and offset.
1420 bool flatScratchIsPointer() const {
1421 return getGeneration() >= AMDGPUSubtarget::GFX9;
1422 }
1423
1424 /// \returns true if the flat_scratch register is initialized by the HW.
1425 /// In this case it is readonly.
1426 bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
1427
1428 /// \returns true if the architected SGPRs are enabled.
1429 bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }
1430
1431 /// \returns true if Global Data Share is supported.
1432 bool hasGDS() const { return HasGDS; }
1433
1434 /// \returns true if Global Wave Sync is supported.
1435 bool hasGWS() const { return HasGWS; }
1436
1437 /// \returns true if the machine has merged shaders in which s0-s7 are
1438 /// reserved by the hardware and user SGPRs start at s8
1439 bool hasMergedShaders() const {
1440 return getGeneration() >= GFX9;
1441 }
1442
1443 // \returns true if the target supports the pre-NGG legacy geometry path.
1444 bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
1445
1446 // \returns true if preloading kernel arguments is supported.
1447 bool hasKernargPreload() const { return KernargPreload; }
1448
1449 // \returns true if the target has split barriers feature
1450 bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
1451
1452 // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
1453 bool hasCvtFP8VOP1Bug() const { return HasCvtFP8Vop1Bug; }
1454
1455 // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
1456 // no-return form.
1457 bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
1458
1459 // \returns true if the target has DX10_CLAMP kernel descriptor mode bit
1460 bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
1461
1462 // \returns true if the target has IEEE kernel descriptor mode bit
1463 bool hasIEEEMode() const { return getGeneration() < GFX12; }
1464
1465 // \returns true if the target has IEEE fminimum/fmaximum instructions
1466 bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
1467
1468 // \returns true if the target has IEEE fminimum3/fmaximum3 instructions
1469 bool hasIEEEMinMax3() const { return hasIEEEMinMax(); }
1470
1471 // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
1472 bool hasRrWGMode() const { return getGeneration() >= GFX12; }
1473
1474 /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
1475 /// values.
1476 bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
1477
1478 bool hasGFX1250Insts() const { return GFX1250Insts; }
1479
1480 // \returns true if target has S_SETPRIO_INC_WG instruction.
1481 bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; }
1482
1483 // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
1484 // of sign-extending.
1485 bool hasGetPCZeroExtension() const { return GFX12Insts; }
1486
1487 /// \returns SGPR allocation granularity supported by the subtarget.
1488 unsigned getSGPRAllocGranule() const {
1489 return AMDGPU::IsaInfo::getSGPRAllocGranule(STI: this);
1490 }
1491
1492 /// \returns SGPR encoding granularity supported by the subtarget.
1493 unsigned getSGPREncodingGranule() const {
1494 return AMDGPU::IsaInfo::getSGPREncodingGranule(STI: this);
1495 }
1496
1497 /// \returns Total number of SGPRs supported by the subtarget.
1498 unsigned getTotalNumSGPRs() const {
1499 return AMDGPU::IsaInfo::getTotalNumSGPRs(STI: this);
1500 }
1501
1502 /// \returns Addressable number of SGPRs supported by the subtarget.
1503 unsigned getAddressableNumSGPRs() const {
1504 return AMDGPU::IsaInfo::getAddressableNumSGPRs(STI: this);
1505 }
1506
1507 /// \returns Minimum number of SGPRs that meets the given number of waves per
1508 /// execution unit requirement supported by the subtarget.
1509 unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1510 return AMDGPU::IsaInfo::getMinNumSGPRs(STI: this, WavesPerEU);
1511 }
1512
1513 /// \returns Maximum number of SGPRs that meets the given number of waves per
1514 /// execution unit requirement supported by the subtarget.
1515 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1516 return AMDGPU::IsaInfo::getMaxNumSGPRs(STI: this, WavesPerEU, Addressable);
1517 }
1518
1519 /// \returns Reserved number of SGPRs. This is common
1520 /// utility function called by MachineFunction and
1521 /// Function variants of getReservedNumSGPRs.
1522 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
1523 /// \returns Reserved number of SGPRs for given machine function \p MF.
1524 unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1525
1526 /// \returns Reserved number of SGPRs for given function \p F.
1527 unsigned getReservedNumSGPRs(const Function &F) const;
1528
1529 /// \returns Maximum number of preloaded SGPRs for the subtarget.
1530 unsigned getMaxNumPreloadedSGPRs() const;
1531
1532 /// \returns max num SGPRs. This is the common utility
1533 /// function called by MachineFunction and Function
1534 /// variants of getMaxNumSGPRs.
1535 unsigned getBaseMaxNumSGPRs(const Function &F,
1536 std::pair<unsigned, unsigned> WavesPerEU,
1537 unsigned PreloadedSGPRs,
1538 unsigned ReservedNumSGPRs) const;
1539
1540 /// \returns Maximum number of SGPRs that meets number of waves per execution
1541 /// unit requirement for function \p MF, or number of SGPRs explicitly
1542 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
1543 ///
1544 /// \returns Value that meets number of waves per execution unit requirement
1545 /// if explicitly requested value cannot be converted to integer, violates
1546 /// subtarget's specifications, or does not meet number of waves per execution
1547 /// unit requirement.
1548 unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1549
1550 /// \returns Maximum number of SGPRs that meets number of waves per execution
1551 /// unit requirement for function \p F, or number of SGPRs explicitly
1552 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
1553 ///
1554 /// \returns Value that meets number of waves per execution unit requirement
1555 /// if explicitly requested value cannot be converted to integer, violates
1556 /// subtarget's specifications, or does not meet number of waves per execution
1557 /// unit requirement.
1558 unsigned getMaxNumSGPRs(const Function &F) const;
1559
1560 /// \returns VGPR allocation granularity supported by the subtarget.
1561 unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
1562 return AMDGPU::IsaInfo::getVGPRAllocGranule(STI: this, DynamicVGPRBlockSize);
1563 }
1564
1565 /// \returns VGPR encoding granularity supported by the subtarget.
1566 unsigned getVGPREncodingGranule() const {
1567 return AMDGPU::IsaInfo::getVGPREncodingGranule(STI: this);
1568 }
1569
1570 /// \returns Total number of VGPRs supported by the subtarget.
1571 unsigned getTotalNumVGPRs() const {
1572 return AMDGPU::IsaInfo::getTotalNumVGPRs(STI: this);
1573 }
1574
1575 /// \returns Addressable number of architectural VGPRs supported by the
1576 /// subtarget.
1577 unsigned getAddressableNumArchVGPRs() const {
1578 return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(STI: this);
1579 }
1580
1581 /// \returns Addressable number of VGPRs supported by the subtarget.
1582 unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
1583 return AMDGPU::IsaInfo::getAddressableNumVGPRs(STI: this, DynamicVGPRBlockSize);
1584 }
1585
1586 /// \returns the minimum number of VGPRs that will prevent achieving more than
1587 /// the specified number of waves \p WavesPerEU.
1588 unsigned getMinNumVGPRs(unsigned WavesPerEU,
1589 unsigned DynamicVGPRBlockSize) const {
1590 return AMDGPU::IsaInfo::getMinNumVGPRs(STI: this, WavesPerEU,
1591 DynamicVGPRBlockSize);
1592 }
1593
1594 /// \returns the maximum number of VGPRs that can be used and still achieved
1595 /// at least the specified number of waves \p WavesPerEU.
1596 unsigned getMaxNumVGPRs(unsigned WavesPerEU,
1597 unsigned DynamicVGPRBlockSize) const {
1598 return AMDGPU::IsaInfo::getMaxNumVGPRs(STI: this, WavesPerEU,
1599 DynamicVGPRBlockSize);
1600 }
1601
1602 /// \returns max num VGPRs. This is the common utility function
1603 /// called by MachineFunction and Function variants of getMaxNumVGPRs.
1604 unsigned
1605 getBaseMaxNumVGPRs(const Function &F,
1606 std::pair<unsigned, unsigned> NumVGPRBounds) const;
1607
1608 /// \returns Maximum number of VGPRs that meets number of waves per execution
1609 /// unit requirement for function \p F, or number of VGPRs explicitly
1610 /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
1611 ///
1612 /// \returns Value that meets number of waves per execution unit requirement
1613 /// if explicitly requested value cannot be converted to integer, violates
1614 /// subtarget's specifications, or does not meet number of waves per execution
1615 /// unit requirement.
1616 unsigned getMaxNumVGPRs(const Function &F) const;
1617
1618 unsigned getMaxNumAGPRs(const Function &F) const {
1619 return getMaxNumVGPRs(F);
1620 }
1621
1622 /// \returns Maximum number of VGPRs that meets number of waves per execution
1623 /// unit requirement for function \p MF, or number of VGPRs explicitly
1624 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
1625 ///
1626 /// \returns Value that meets number of waves per execution unit requirement
1627 /// if explicitly requested value cannot be converted to integer, violates
1628 /// subtarget's specifications, or does not meet number of waves per execution
1629 /// unit requirement.
1630 unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1631
1632 bool isWave32() const {
1633 return getWavefrontSize() == 32;
1634 }
1635
1636 bool isWave64() const {
1637 return getWavefrontSize() == 64;
1638 }
1639
1640 /// Returns if the wavesize of this subtarget is known reliable. This is false
1641 /// only for the a default target-cpu that does not have an explicit
1642 /// +wavefrontsize target feature.
1643 bool isWaveSizeKnown() const {
1644 return hasFeature(Feature: AMDGPU::FeatureWavefrontSize32) ||
1645 hasFeature(Feature: AMDGPU::FeatureWavefrontSize64);
1646 }
1647
1648 const TargetRegisterClass *getBoolRC() const {
1649 return getRegisterInfo()->getBoolRC();
1650 }
1651
1652 /// \returns Maximum number of work groups per compute unit supported by the
1653 /// subtarget and limited by given \p FlatWorkGroupSize.
1654 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1655 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(STI: this, FlatWorkGroupSize);
1656 }
1657
1658 /// \returns Minimum flat work group size supported by the subtarget.
1659 unsigned getMinFlatWorkGroupSize() const override {
1660 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(STI: this);
1661 }
1662
1663 /// \returns Maximum flat work group size supported by the subtarget.
1664 unsigned getMaxFlatWorkGroupSize() const override {
1665 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(STI: this);
1666 }
1667
1668 /// \returns Number of waves per execution unit required to support the given
1669 /// \p FlatWorkGroupSize.
1670 unsigned
1671 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1672 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(STI: this, FlatWorkGroupSize);
1673 }
1674
1675 /// \returns Minimum number of waves per execution unit supported by the
1676 /// subtarget.
1677 unsigned getMinWavesPerEU() const override {
1678 return AMDGPU::IsaInfo::getMinWavesPerEU(STI: this);
1679 }
1680
1681 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
1682 SDep &Dep,
1683 const TargetSchedModel *SchedModel) const override;
1684
1685 // \returns true if it's beneficial on this subtarget for the scheduler to
1686 // cluster stores as well as loads.
1687 bool shouldClusterStores() const { return getGeneration() >= GFX11; }
1688
1689 // \returns the number of address arguments from which to enable MIMG NSA
1690 // on supported architectures.
1691 unsigned getNSAThreshold(const MachineFunction &MF) const;
1692
1693 // \returns true if the subtarget has a hazard requiring an "s_nop 0"
1694 // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
1695 bool requiresNopBeforeDeallocVGPRs() const {
1696 // Currently all targets that support the dealloc VGPRs message also require
1697 // the nop.
1698 return true;
1699 }
1700
1701 bool isDynamicVGPREnabled() const { return DynamicVGPR; }
1702 unsigned getDynamicVGPRBlockSize() const {
1703 return DynamicVGPRBlockSize32 ? 32 : 16;
1704 }
1705
1706 bool requiresDisjointEarlyClobberAndUndef() const override {
1707 // AMDGPU doesn't care if early-clobber and undef operands are allocated
1708 // to the same register.
1709 return false;
1710 }
1711};
1712
/// Computes and tracks which user SGPR fields (dispatch pointer, queue
/// pointer, kernarg segment pointer, etc.) a function requires on a given
/// subtarget, plus how many user SGPRs are used or still free.
class GCNUserSGPRUsageInfo {
public:
  bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }

  bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }

  bool hasDispatchPtr() const { return DispatchPtr; }

  bool hasQueuePtr() const { return QueuePtr; }

  bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }

  bool hasDispatchID() const { return DispatchID; }

  bool hasFlatScratchInit() const { return FlatScratchInit; }

  bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }

  /// \returns the number of SGPRs reserved for preloaded kernel arguments.
  unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }

  /// \returns the total number of user SGPRs in use.
  unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }

  /// \returns the number of user SGPRs still available (defined out of line).
  unsigned getNumFreeUserSGPRs();

  /// Reserve \p NumSGPRs user SGPRs for kernarg preloading (defined out of
  /// line).
  void allocKernargPreloadSGPRs(unsigned NumSGPRs);

  /// Identifiers for the individual user SGPR fields.
  enum UserSGPRID : unsigned {
    ImplicitBufferPtrID = 0,
    PrivateSegmentBufferID = 1,
    DispatchPtrID = 2,
    QueuePtrID = 3,
    KernargSegmentPtrID = 4,
    DispatchIdID = 5,
    FlatScratchInitID = 6,
    PrivateSegmentSizeID = 7
  };

  // Returns the size in number of SGPRs for preload user SGPR field.
  static unsigned getNumUserSGPRForField(UserSGPRID ID) {
    switch (ID) {
    case ImplicitBufferPtrID:
      return 2;
    case PrivateSegmentBufferID:
      return 4;
    case DispatchPtrID:
      return 2;
    case QueuePtrID:
      return 2;
    case KernargSegmentPtrID:
      return 2;
    case DispatchIdID:
      return 2;
    case FlatScratchInitID:
      return 2;
    case PrivateSegmentSizeID:
      return 1;
    }
    llvm_unreachable("Unknown UserSGPRID.");
  }

  GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);

private:
  const GCNSubtarget &ST;

  // Private memory buffer
  // Compute directly in sgpr[0:1]
  // Other shaders indirect 64-bits at sgpr[0:1]
  bool ImplicitBufferPtr = false;

  bool PrivateSegmentBuffer = false;

  bool DispatchPtr = false;

  bool QueuePtr = false;

  bool KernargSegmentPtr = false;

  bool DispatchID = false;

  bool FlatScratchInit = false;

  bool PrivateSegmentSize = false;

  unsigned NumKernargPreloadSGPRs = 0;

  unsigned NumUsedUserSGPRs = 0;
};
1801
1802} // end namespace llvm
1803
1804#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1805