GCNSubtarget.h source code [llvm_projects/llvm/lib/Target/AMDGPU/GCNSubtarget.h]

1	//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------- C++ --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//==-----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// AMD GCN specific subclass of TargetSubtarget.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15	#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16
17	#include "AMDGPUCallLowering.h"
18	#include "AMDGPURegisterBankInfo.h"
19	#include "AMDGPUSubtarget.h"
20	#include "SIFrameLowering.h"
21	#include "SIISelLowering.h"
22	#include "SIInstrInfo.h"
23	#include "Utils/AMDGPUBaseInfo.h"
24	#include "llvm/Support/ErrorHandling.h"
25
26	#define GET_SUBTARGETINFO_HEADER
27	#include "AMDGPUGenSubtargetInfo.inc"
28
29	namespace llvm {
30
31	class GCNTargetMachine;
32
33	class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
34	public AMDGPUSubtarget {
35	public:
36	using AMDGPUSubtarget::getMaxWavesPerEU;
37
38	// Following 2 enums are documented at:
39	// - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
40	enum class TrapHandlerAbi {
41	NONE = `0x00`,
42	AMDHSA = `0x01`,
43	};
44
45	enum class TrapID {
46	LLVMAMDHSATrap = `0x02`,
47	LLVMAMDHSADebugTrap = `0x03`,
48	};
49
50	private:
51	/// SelectionDAGISel related APIs.
52	std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
53
54	/// GlobalISel related APIs.
55	std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
56	std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
57	std::unique_ptr<InstructionSelector> InstSelector;
58	std::unique_ptr<LegalizerInfo> Legalizer;
59	std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
60
61	protected:
62	// Basic subtarget description.
63	AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
64	unsigned Gen = INVALID;
65	InstrItineraryData InstrItins;
66	int LDSBankCount = `0`;
67	unsigned MaxPrivateElementSize = `0`;
68
69	// Instruction cache line size in bytes; set from TableGen subtarget features.
70	unsigned InstCacheLineSize = `0`;
71
72	// Dynamically set bits that enable features.
73	bool DynamicVGPR = false;
74	bool DynamicVGPRBlockSize32 = false;
75	bool ScalarizeGlobal = false;
76
77	/// The maximum number of instructions that may be placed within an S_CLAUSE,
78	/// which is one greater than the maximum argument to S_CLAUSE. A value of 0
79	/// indicates a lack of S_CLAUSE support.
80	unsigned MaxHardClauseLength = `0`;
81
82	#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
83	bool ATTRIBUTE = DEFAULT;
84	#include "AMDGPUGenSubtargetInfo.inc"
85
86	private:
87	SIInstrInfo InstrInfo;
88	SITargetLowering TLInfo;
89	SIFrameLowering FrameLowering;
90
91	public:
92	GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
93	const GCNTargetMachine &TM);
94	~GCNSubtarget() override;
95
96	GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, StringRef GPU,
97	StringRef FS);
98
99	/// Diagnose inconsistent subtarget features before attempting to codegen
100	/// function \p F.
101	void checkSubtargetFeatures(const Function &F) const;
102
103	const SIInstrInfo getInstrInfo() const* override { return &InstrInfo; }
104
105	const SIFrameLowering getFrameLowering() const* override {
106	return &FrameLowering;
107	}
108
109	const SITargetLowering getTargetLowering() const* override { return &TLInfo; }
110
111	const SIRegisterInfo getRegisterInfo() const* override {
112	return &InstrInfo.getRegisterInfo();
113	}
114
115	const SelectionDAGTargetInfo getSelectionDAGInfo() const* override;
116
117	const CallLowering getCallLowering() const* override {
118	return CallLoweringInfo.get();
119	}
120
121	const InlineAsmLowering getInlineAsmLowering() const* override {
122	return InlineAsmLoweringInfo.get();
123	}
124
125	InstructionSelector getInstructionSelector() const* override {
126	return InstSelector.get();
127	}
128
129	const LegalizerInfo getLegalizerInfo() const* override {
130	return Legalizer.get();
131	}
132
133	const AMDGPURegisterBankInfo getRegBankInfo() const* override {
134	return RegBankInfo.get();
135	}
136
137	const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
138	return TargetID;
139	}
140
141	const InstrItineraryData getInstrItineraryData() const* override {
142	return &InstrItins;
143	}
144
145	void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
146
147	Generation getGeneration() const { return (Generation)Gen; }
148
149	bool isGFX11Plus() const { return getGeneration() >= GFX11; }
150
151	#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
152	bool GETTER() const override { return ATTRIBUTE; }
153	#include "AMDGPUGenSubtargetInfo.inc"
154
155	unsigned getMaxWaveScratchSize() const {
156	// See COMPUTE_TMPRING_SIZE.WAVESIZE.
157	if (getGeneration() >= GFX12) {
158	// 18-bit field in units of 64-dword.
159	return (`64` * `4`) * ((`1` << `18`) - `1`);
160	}
161	if (getGeneration() == GFX11) {
162	// 15-bit field in units of 64-dword.
163	return (`64` * `4`) * ((`1` << `15`) - `1`);
164	}
165	// 13-bit field in units of 256-dword.
166	return (`256` * `4`) * ((`1` << `13`) - `1`);
167	}
168
169	/// Return the number of high bits known to be zero for a frame index.
170	unsigned getKnownHighZeroBitsForFrameIndex() const {
171	return llvm::countl_zero(Val: getMaxWaveScratchSize()) + getWavefrontSizeLog2();
172	}
173
174	int getLDSBankCount() const { return LDSBankCount; }
175
176	/// Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
177	unsigned getInstCacheLineSize() const { return InstCacheLineSize; }
178
179	unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
180	return (ForBufferRSrc \|\| !hasFlatScratchEnabled()) ? MaxPrivateElementSize
181	: `16`;
182	}
183
184	unsigned getConstantBusLimit(unsigned Opcode) const;
185
186	/// Returns if the result of this instruction with a 16-bit result returned in
187	/// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
188	/// the original value.
189	bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
190
191	bool supportsWGP() const {
192	if (HasGFX1250Insts)
193	return false;
194	return getGeneration() >= GFX10;
195	}
196
197	bool hasHWFP64() const { return HasFP64; }
198
199	bool hasAddr64() const {
200	return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
201	}
202
203	bool hasFlat() const {
204	return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
205	}
206
207	// Return true if the target only has the reverse operand versions of VALU
208	// shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
209	bool hasOnlyRevVALUShifts() const {
210	return getGeneration() >= VOLCANIC_ISLANDS;
211	}
212
213	bool hasFractBug() const { return getGeneration() == SOUTHERN_ISLANDS; }
214
215	bool hasMed3_16() const { return getGeneration() >= AMDGPUSubtarget::GFX9; }
216
217	bool hasMin3Max3_16() const {
218	return getGeneration() >= AMDGPUSubtarget::GFX9;
219	}
220
221	bool hasSwap() const { return HasGFX9Insts; }
222
223	bool hasScalarPackInsts() const { return HasGFX9Insts; }
224
225	bool hasScalarMulHiInsts() const { return HasGFX9Insts; }
226
227	bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
228
229	TrapHandlerAbi getTrapHandlerAbi() const {
230	return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
231	}
232
233	bool supportsGetDoorbellID() const {
234	// The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
235	return getGeneration() >= GFX9;
236	}
237
238	/// True if the offset field of DS instructions works as expected. On SI, the
239	/// offset uses a 16-bit adder and does not always wrap properly.
240	bool hasUsableDSOffset() const { return getGeneration() >= SEA_ISLANDS; }
241
242	bool unsafeDSOffsetFoldingEnabled() const {
243	return EnableUnsafeDSOffsetFolding;
244	}
245
246	/// Condition output from div_scale is usable.
247	bool hasUsableDivScaleConditionOutput() const {
248	return getGeneration() != SOUTHERN_ISLANDS;
249	}
250
251	/// Extra wait hazard is needed in some cases before
252	/// s_cbranch_vccnz/s_cbranch_vccz.
253	bool hasReadVCCZBug() const { return getGeneration() <= SEA_ISLANDS; }
254
255	/// Writes to VCC_LO/VCC_HI update the VCCZ flag.
256	bool partialVCCWritesUpdateVCCZ() const { return getGeneration() >= GFX10; }
257
258	/// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
259	/// was written by a VALU instruction.
260	bool hasSMRDReadVALUDefHazard() const {
261	return getGeneration() == SOUTHERN_ISLANDS;
262	}
263
264	/// A read of an SGPR by a VMEM instruction requires 5 wait states when the
265	/// SGPR was written by a VALU Instruction.
266	bool hasVMEMReadSGPRVALUDefHazard() const {
267	return getGeneration() >= VOLCANIC_ISLANDS;
268	}
269
270	bool hasRFEHazards() const { return getGeneration() >= VOLCANIC_ISLANDS; }
271
272	/// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
273	unsigned getSetRegWaitStates() const {
274	return getGeneration() <= SEA_ISLANDS ? `1` : `2`;
275	}
276
277	/// Return the amount of LDS that can be used that will not restrict the
278	/// occupancy lower than WaveCount.
279	unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
280	const Function &) const;
281
282	bool supportsMinMaxDenormModes() const {
283	return getGeneration() >= AMDGPUSubtarget::GFX9;
284	}
285
286	/// \returns If target supports S_DENORM_MODE.
287	bool hasDenormModeInst() const {
288	return getGeneration() >= AMDGPUSubtarget::GFX10;
289	}
290
291	/// \returns If target supports ds_read/write_b128 and user enables generation
292	/// of ds_read/write_b128.
293	bool useDS128() const { return HasCIInsts && EnableDS128; }
294
295	/// \return If target supports ds_read/write_b96/128.
296	bool hasDS96AndDS128() const { return HasCIInsts; }
297
298	/// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
299	bool haveRoundOpsF64() const { return HasCIInsts; }
300
301	/// \returns If MUBUF instructions always perform range checking, even for
302	/// buffer resources used for private memory access.
303	bool privateMemoryResourceIsRangeChecked() const {
304	return getGeneration() < AMDGPUSubtarget::GFX9;
305	}
306
307	/// \returns If target requires PRT Struct NULL support (zero result registers
308	/// for sparse texture support).
309	bool usePRTStrictNull() const { return EnablePRTStrictNull; }
310
311	bool hasUnalignedBufferAccessEnabled() const {
312	return HasUnalignedBufferAccess && HasUnalignedAccessMode;
313	}
314
315	bool hasUnalignedDSAccessEnabled() const {
316	return HasUnalignedDSAccess && HasUnalignedAccessMode;
317	}
318
319	bool hasUnalignedScratchAccessEnabled() const {
320	return HasUnalignedScratchAccess && HasUnalignedAccessMode;
321	}
322
323	bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); }
324
325	bool isTgSplitEnabled() const { return EnableTgSplit; }
326
327	bool isCuModeEnabled() const { return EnableCuMode; }
328
329	bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
330
331	bool hasFlatScrRegister() const { return hasFlatAddressSpace(); }
332
333	// Check if target supports ST addressing mode with FLAT scratch instructions.
334	// The ST addressing mode means no registers are used, either VGPR or SGPR,
335	// but only immediate offset is swizzled and added to the FLAT scratch base.
336	bool hasFlatScratchSTMode() const {
337	return hasFlatScratchInsts() && (hasGFX10_3Insts() \|\| hasGFX940Insts());
338	}
339
340	bool hasFlatScratchSVSMode() const { return HasGFX940Insts \|\| HasGFX11Insts; }
341
342	bool hasFlatScratchEnabled() const {
343	return hasArchitectedFlatScratch() \|\|
344	(EnableFlatScratch && hasFlatScratchInsts());
345	}
346
347	bool hasGlobalAddTidInsts() const { return HasGFX10_BEncoding; }
348
349	bool hasAtomicCSub() const { return HasGFX10_BEncoding; }
350
351	bool hasMTBUFInsts() const { return !hasGFX1250Insts(); }
352
353	bool hasFormattedMUBUFInsts() const { return !hasGFX1250Insts(); }
354
355	bool hasExportInsts() const {
356	return !hasGFX940Insts() && !hasGFX1250Insts();
357	}
358
359	bool hasVINTERPEncoding() const {
360	return HasGFX11Insts && !hasGFX1250Insts();
361	}
362
363	// DS_ADD_F64/DS_ADD_RTN_F64
364	bool hasLdsAtomicAddF64() const {
365	return hasGFX90AInsts() \|\| hasGFX1250Insts();
366	}
367
368	bool hasMultiDwordFlatScratchAddressing() const {
369	return getGeneration() >= GFX9;
370	}
371
372	bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; }
373
374	bool hasD16LoadStore() const { return getGeneration() >= GFX9; }
375
376	bool d16PreservesUnusedBits() const {
377	return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
378	}
379
380	bool hasD16Images() const { return getGeneration() >= VOLCANIC_ISLANDS; }
381
382	/// Return if most LDS instructions have an m0 use that require m0 to be
383	/// initialized.
384	bool ldsRequiresM0Init() const { return getGeneration() < GFX9; }
385
386	// True if the hardware rewinds and replays GWS operations if a wave is
387	// preempted.
388	//
389	// If this is false, a GWS operation requires testing if a nack set the
390	// MEM_VIOL bit, and repeating if so.
391	bool hasGWSAutoReplay() const { return getGeneration() >= GFX9; }
392
393	/// \returns if target has ds_gws_sema_release_all instruction.
394	bool hasGWSSemaReleaseAll() const { return HasCIInsts; }
395
396	bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
397
398	bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
399
400	// Covers VS/PS/CS graphics shaders
401	bool isMesaGfxShader(const Function &F) const {
402	return isMesa3DOS() && AMDGPU::isShader(CC: F.getCallingConv());
403	}
404
405	bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; }
406
407	bool hasAtomicFaddInsts() const {
408	return HasAtomicFaddRtnInsts \|\| HasAtomicFaddNoRtnInsts;
409	}
410
411	bool vmemWriteNeedsExpWaitcnt() const {
412	return getGeneration() < SEA_ISLANDS;
413	}
414
415	bool hasInstPrefetch() const {
416	return getGeneration() == GFX10 \|\| getGeneration() == GFX11;
417	}
418
419	bool hasPrefetch() const { return HasGFX12Insts; }
420
421	// Has s_cmpk_ instructions.*
422	bool hasSCmpK() const { return getGeneration() < GFX12; }
423
424	// Scratch is allocated in 256 dword per wave blocks for the entire
425	// wavefront. When viewed from the perspective of an arbitrary workitem, this
426	// is 4-byte aligned.
427	//
428	// Only 4-byte alignment is really needed to access anything. Transformations
429	// on the pointer value itself may rely on the alignment / known low bits of
430	// the pointer. Set this to something above the minimum to avoid needing
431	// dynamic realignment in common cases.
432	Align getStackAlignment() const { return Align (`16`); }
433
434	bool enableMachineScheduler() const override { return true; }
435
436	bool useAA() const override;
437
438	bool enableSubRegLiveness() const override { return true; }
439
440	void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
441	bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
442
443	// static wrappers
444	static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
445
446	// XXX - Why is this here if it isn't in the default pass set?
447	bool enableEarlyIfConversion() const override { return true; }
448
449	void overrideSchedPolicy(MachineSchedPolicy &Policy,
450	const SchedRegion &Region) const override;
451
452	void overridePostRASchedPolicy(MachineSchedPolicy &Policy,
453	const SchedRegion &Region) const override;
454
455	void mirFileLoaded(MachineFunction &MF) const override;
456
457	unsigned getMaxNumUserSGPRs() const {
458	return AMDGPU::getMaxNumUserSGPRs(STI: *this);
459	}
460
461	bool useVGPRIndexMode() const;
462
463	bool hasScalarCompareEq64() const {
464	return getGeneration() >= VOLCANIC_ISLANDS;
465	}
466
467	bool hasLDSFPAtomicAddF32() const { return HasGFX8Insts; }
468	bool hasLDSFPAtomicAddF64() const {
469	return HasGFX90AInsts \|\| HasGFX1250Insts;
470	}
471
472	/// \returns true if the subtarget has the v_permlanex16_b32 instruction.
473	bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
474
475	/// \returns true if the subtarget has the v_permlane64_b32 instruction.
476	bool hasPermLane64() const { return getGeneration() >= GFX11; }
477
478	bool hasDPPRowShare() const {
479	return HasDPP && (HasGFX90AInsts \|\| getGeneration() >= GFX10);
480	}
481
482	// Has V_PK_MOV_B32 opcode
483	bool hasPkMovB32() const { return HasGFX90AInsts; }
484
485	bool hasFmaakFmamkF32Insts() const {
486	return getGeneration() >= GFX10 \|\| hasGFX940Insts();
487	}
488
489	bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); }
490
491	bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
492
493	unsigned getNSAMaxSize(bool HasSampler = false) const {
494	return AMDGPU::getNSAMaxSize(STI: *this, HasSampler);
495	}
496
497	bool hasMadF16() const;
498
499	bool hasMovB64() const { return HasGFX940Insts \|\| HasGFX1250Insts; }
500
501	// Scalar and global loads support scale_offset bit.
502	bool hasScaleOffset() const { return HasGFX1250Insts; }
503
504	// FLAT GLOBAL VOffset is signed
505	bool hasSignedGVSOffset() const { return HasGFX1250Insts; }
506
507	bool loadStoreOptEnabled() const { return EnableLoadStoreOpt; }
508
509	bool hasUserSGPRInit16BugInWave32() const {
510	return HasUserSGPRInit16Bug && isWave32();
511	}
512
513	bool has12DWordStoreHazard() const {
514	return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
515	}
516
517	// \returns true if the subtarget supports DWORDX3 load/store instructions.
518	bool hasDwordx3LoadStores() const { return HasCIInsts; }
519
520	bool hasReadM0MovRelInterpHazard() const {
521	return getGeneration() == AMDGPUSubtarget::GFX9;
522	}
523
524	bool hasReadM0SendMsgHazard() const {
525	return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
526	getGeneration() <= AMDGPUSubtarget::GFX9;
527	}
528
529	bool hasReadM0LdsDmaHazard() const {
530	return getGeneration() == AMDGPUSubtarget::GFX9;
531	}
532
533	bool hasReadM0LdsDirectHazard() const {
534	return getGeneration() == AMDGPUSubtarget::GFX9;
535	}
536
537	bool hasLDSMisalignedBugInWGPMode() const {
538	return HasLDSMisalignedBug && !EnableCuMode;
539	}
540
541	// Shift amount of a 64 bit shift cannot be a highest allocated register
542	// if also at the end of the allocation block.
543	bool hasShift64HighRegBug() const { return HasGFX90AInsts; }
544
545	// Has one cycle hazard on transcendental instruction feeding a
546	// non transcendental VALU.
547	bool hasTransForwardingHazard() const { return HasGFX940Insts; }
548
549	// Has one cycle hazard on a VALU instruction partially writing dst with
550	// a shift of result bits feeding another VALU instruction.
551	bool hasDstSelForwardingHazard() const { return HasGFX940Insts; }
552
553	// Cannot use op_sel with v_dot instructions.
554	bool hasDOTOpSelHazard() const { return HasGFX940Insts \|\| HasGFX11Insts; }
555
556	// Does not have HW interlocs for VALU writing and then reading SGPRs.
557	bool hasVDecCoExecHazard() const { return HasGFX940Insts; }
558
559	bool hasHardClauses() const { return MaxHardClauseLength > `0`; }
560
561	bool hasFPAtomicToDenormModeHazard() const {
562	return getGeneration() == GFX10;
563	}
564
565	bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
566
567	bool hasLdsDirect() const { return getGeneration() >= GFX11; }
568
569	bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
570
571	bool hasVALUPartialForwardingHazard() const {
572	return getGeneration() == GFX11;
573	}
574
575	bool hasCvtScaleForwardingHazard() const { return HasGFX950Insts; }
576
577	// All GFX9 targets experience a fetch delay when an instruction at the start
578	// of a loop header is split by a 32-byte fetch window boundary, but GFX950
579	// is uniquely sensitive to this: the delay triggers further performance
580	// degradation beyond the fetch latency itself.
581	bool hasLoopHeadInstSplitSensitivity() const { return HasGFX950Insts; }
582
583	bool requiresCodeObjectV6() const { return RequiresCOV6; }
584
585	bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
586
587	bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
588
589	bool hasVALUReadSGPRHazard() const {
590	return HasGFX12Insts && !HasGFX1250Insts;
591	}
592
593	bool setRegModeNeedsVNOPs() const {
594	return HasGFX1250Insts && getGeneration() == GFX12;
595	}
596
597	/// Return if operations acting on VGPR tuples require even alignment.
598	bool needsAlignedVGPRs() const { return RequiresAlignVGPR; }
599
600	/// Return true if the target has the S_PACK_HL_B32_B16 instruction.
601	bool hasSPackHL() const { return HasGFX11Insts; }
602
603	/// Return true if the target's EXP instruction has the COMPR flag, which
604	/// affects the meaning of the EN (enable) bits.
605	bool hasCompressedExport() const { return !HasGFX11Insts; }
606
607	/// Return true if the target's EXP instruction supports the NULL export
608	/// target.
609	bool hasNullExportTarget() const { return !HasGFX11Insts; }
610
611	bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
612
613	/// Return true if the target has the S_DELAY_ALU instruction.
614	bool hasDelayAlu() const { return HasGFX11Insts; }
615
616	/// Returns true if the target supports
617	/// global_load_lds_dwordx3/global_load_lds_dwordx4 or
618	/// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
619	bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); }
620
621	/// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
622	/// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
623	bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
624
625	/// \returns true if inline constants are not supported for F16 pseudo
626	/// scalar transcendentals.
627	bool hasNoF16PseudoScalarTransInlineConstants() const {
628	return getGeneration() == GFX12;
629	}
630
631	/// \returns true if the target has packed f32 instructions that only read 32
632	/// bits from a scalar operand (SGPR or literal) and replicates the bits to
633	/// both channels.
634	bool hasPKF32InstsReplicatingLower32BitsOfScalarInput() const {
635	return getGeneration() == GFX12 && HasGFX1250Insts;
636	}
637
638	bool hasAddPC64Inst() const { return HasGFX1250Insts; }
639
640	/// \returns true if the target supports expert scheduling mode 2 which relies
641	/// on the compiler to insert waits to avoid hazards between VMEM and VALU
642	/// instructions in some instances.
643	bool hasExpertSchedulingMode() const { return getGeneration() >= GFX12; }
644
645	/// \returns The maximum number of instructions that can be enclosed in an
646	/// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
647	/// instruction.
648	unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
649
650	/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
651	/// SGPRs
652	unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
653
654	/// Return the maximum number of waves per SIMD for kernels using \p VGPRs
655	/// VGPRs
656	unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
657	unsigned DynamicVGPRBlockSize) const;
658
659	/// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
660	/// be achieved when the only function running on a CU is \p F, each workgroup
661	/// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
662	/// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
663	/// range, so this returns a range as well.
664	///
665	/// Note that occupancy can be affected by the scratch allocation as well, but
666	/// we do not have enough information to compute it.
667	std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
668	unsigned LDSSize = `0`,
669	unsigned NumSGPRs = `0`,
670	unsigned NumVGPRs = `0`) const;
671
672	/// \returns true if the flat_scratch register should be initialized with the
673	/// pointer to the wave's scratch memory rather than a size and offset.
674	bool flatScratchIsPointer() const {
675	return getGeneration() >= AMDGPUSubtarget::GFX9;
676	}
677
678	/// \returns true if the machine has merged shaders in which s0-s7 are
679	/// reserved by the hardware and user SGPRs start at s8
680	bool hasMergedShaders() const { return getGeneration() >= GFX9; }
681
682	// \returns true if the target supports the pre-NGG legacy geometry path.
683	bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
684
685	// \returns true if the target has split barriers feature
686	bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
687
688	// \returns true if the target has WG_RR_MODE kernel descriptor mode bit
689	bool hasRrWGMode() const { return getGeneration() >= GFX12; }
690
691	/// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
692	/// values.
693	bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
694
695	bool hasINVWBL2WaitCntRequirement() const { return HasGFX1250Insts; }
696
697	bool hasVOPD3() const { return HasGFX1250Insts; }
698
699	// \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
700	bool hasVectorMulU64() const { return HasGFX1250Insts; }
701
702	// \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32
703	// instructions.
704	bool hasMadU64U32NoCarry() const { return HasGFX1250Insts; }
705
706	// \returns true if the target has V_{MIN\|MAX}_{I\|U}64 instructions.
707	bool hasIntMinMax64() const { return HasGFX1250Insts; }
708
709	// \returns true if the target has V_PK_{MIN\|MAX}3_{I\|U}16 instructions.
710	bool hasPkMinMax3Insts() const { return HasGFX1250Insts; }
711
712	// \returns ture if target has S_GET_SHADER_CYCLES_U64 instruction.
713	bool hasSGetShaderCyclesInst() const { return HasGFX1250Insts; }
714
715	// \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
716	// of sign-extending. Note that GFX1250 has not only fixed the bug but also
717	// extended VA to 57 bits.
718	bool hasGetPCZeroExtension() const {
719	return HasGFX12Insts && !HasGFX1250Insts;
720	}
721
722	// \returns true if the target needs to create a prolog for backward
723	// compatibility when preloading kernel arguments.
724	bool needsKernArgPreloadProlog() const {
725	return hasKernargPreload() && !HasGFX1250Insts;
726	}
727
728	bool hasCondSubInsts() const { return HasGFX12Insts; }
729
730	bool hasSubClampInsts() const { return hasGFX10_3Insts(); }
731
732	/// \returns SGPR allocation granularity supported by the subtarget.
733	unsigned getSGPRAllocGranule() const {
734	return AMDGPU::IsaInfo::getSGPRAllocGranule(STI: this);
735	}
736
737	/// \returns SGPR encoding granularity supported by the subtarget.
738	unsigned getSGPREncodingGranule() const {
739	return AMDGPU::IsaInfo::getSGPREncodingGranule(STI: this);
740	}
741
742	/// \returns Total number of SGPRs supported by the subtarget.
743	unsigned getTotalNumSGPRs() const {
744	return AMDGPU::IsaInfo::getTotalNumSGPRs(STI: this);
745	}
746
747	/// \returns Addressable number of SGPRs supported by the subtarget.
748	unsigned getAddressableNumSGPRs() const {
749	return AMDGPU::IsaInfo::getAddressableNumSGPRs(STI: this);
750	}
751
752	/// \returns Minimum number of SGPRs that meets the given number of waves per
753	/// execution unit requirement supported by the subtarget.
754	unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
755	return AMDGPU::IsaInfo::getMinNumSGPRs(STI: this, WavesPerEU);
756	}
757
758	/// \returns Maximum number of SGPRs that meets the given number of waves per
759	/// execution unit requirement supported by the subtarget.
760	unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
761	return AMDGPU::IsaInfo::getMaxNumSGPRs(STI: this, WavesPerEU, Addressable);
762	}
763
764	/// \returns Reserved number of SGPRs. This is common
765	/// utility function called by MachineFunction and
766	/// Function variants of getReservedNumSGPRs.
767	unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
768	/// \returns Reserved number of SGPRs for given machine function \p MF.
769	unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
770
771	/// \returns Reserved number of SGPRs for given function \p F.
772	unsigned getReservedNumSGPRs(const Function &F) const;
773
774	/// \returns Maximum number of preloaded SGPRs for the subtarget.
775	unsigned getMaxNumPreloadedSGPRs() const;
776
777	/// \returns max num SGPRs. This is the common utility
778	/// function called by MachineFunction and Function
779	/// variants of getMaxNumSGPRs.
780	unsigned getBaseMaxNumSGPRs(const Function &F,
781	std::pair<unsigned, unsigned> WavesPerEU,
782	unsigned PreloadedSGPRs,
783	unsigned ReservedNumSGPRs) const;
784
785	/// \returns Maximum number of SGPRs that meets number of waves per execution
786	/// unit requirement for function \p MF, or number of SGPRs explicitly
787	/// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
788	///
789	/// \returns Value that meets number of waves per execution unit requirement
790	/// if explicitly requested value cannot be converted to integer, violates
791	/// subtarget's specifications, or does not meet number of waves per execution
792	/// unit requirement.
793	unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
794
795	/// \returns Maximum number of SGPRs that meets number of waves per execution
796	/// unit requirement for function \p F, or number of SGPRs explicitly
797	/// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
798	///
799	/// \returns Value that meets number of waves per execution unit requirement
800	/// if explicitly requested value cannot be converted to integer, violates
801	/// subtarget's specifications, or does not meet number of waves per execution
802	/// unit requirement.
803	unsigned getMaxNumSGPRs(const Function &F) const;
804
805	/// \returns VGPR allocation granularity supported by the subtarget.
806	unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
807	return AMDGPU::IsaInfo::getVGPRAllocGranule(STI: this, DynamicVGPRBlockSize);
808	}
809
810	/// \returns VGPR encoding granularity supported by the subtarget.
811	unsigned getVGPREncodingGranule() const {
812	return AMDGPU::IsaInfo::getVGPREncodingGranule(STI: this);
813	}
814
815	/// \returns Total number of VGPRs supported by the subtarget.
816	unsigned getTotalNumVGPRs() const {
817	return AMDGPU::IsaInfo::getTotalNumVGPRs(STI: this);
818	}
819
820	/// \returns Addressable number of architectural VGPRs supported by the
821	/// subtarget.
822	unsigned getAddressableNumArchVGPRs() const {
823	return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(STI: this);
824	}
825
826	/// \returns Addressable number of VGPRs supported by the subtarget.
827	unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
828	return AMDGPU::IsaInfo::getAddressableNumVGPRs(STI: this, DynamicVGPRBlockSize);
829	}
830
831	/// \returns the minimum number of VGPRs that will prevent achieving more than
832	/// the specified number of waves \p WavesPerEU.
833	unsigned getMinNumVGPRs(unsigned WavesPerEU,
834	unsigned DynamicVGPRBlockSize) const {
835	return AMDGPU::IsaInfo::getMinNumVGPRs(STI: this, WavesPerEU,
836	DynamicVGPRBlockSize);
837	}
838
839	/// \returns the maximum number of VGPRs that can be used and still achieved
840	/// at least the specified number of waves \p WavesPerEU.
841	unsigned getMaxNumVGPRs(unsigned WavesPerEU,
842	unsigned DynamicVGPRBlockSize) const {
843	return AMDGPU::IsaInfo::getMaxNumVGPRs(STI: this, WavesPerEU,
844	DynamicVGPRBlockSize);
845	}
846
847	/// \returns max num VGPRs. This is the common utility function
848	/// called by MachineFunction and Function variants of getMaxNumVGPRs.
849	unsigned
850	getBaseMaxNumVGPRs(const Function &F,
851	std::pair<unsigned, unsigned> NumVGPRBounds) const;
852
853	/// \returns Maximum number of VGPRs that meets number of waves per execution
854	/// unit requirement for function \p F, or number of VGPRs explicitly
855	/// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
856	///
857	/// \returns Value that meets number of waves per execution unit requirement
858	/// if explicitly requested value cannot be converted to integer, violates
859	/// subtarget's specifications, or does not meet number of waves per execution
860	/// unit requirement.
861	unsigned getMaxNumVGPRs(const Function &F) const;
862
863	unsigned getMaxNumAGPRs(const Function &F) const { return getMaxNumVGPRs(F); }
864
865	/// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
866	/// of waves per execution unit required for the function \p MF.
867	std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const;
868
869	/// \returns Maximum number of VGPRs that meets number of waves per execution
870	/// unit requirement for function \p MF, or number of VGPRs explicitly
871	/// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
872	///
873	/// \returns Value that meets number of waves per execution unit requirement
874	/// if explicitly requested value cannot be converted to integer, violates
875	/// subtarget's specifications, or does not meet number of waves per execution
876	/// unit requirement.
877	unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
878
879	bool supportsWave32() const { return getGeneration() >= GFX10; }
880
881	bool supportsWave64() const { return !hasGFX1250Insts(); }
882
883	bool isWave32() const { return getWavefrontSize() == `32`; }
884
885	bool isWave64() const { return getWavefrontSize() == `64`; }
886
887	/// Returns if the wavesize of this subtarget is known reliable. This is false
888	/// only for the a default target-cpu that does not have an explicit
889	/// +wavefrontsize target feature.
890	bool isWaveSizeKnown() const {
891	return hasFeature(Feature: AMDGPU::FeatureWavefrontSize32) \|\|
892	hasFeature(Feature: AMDGPU::FeatureWavefrontSize64);
893	}
894
895	const TargetRegisterClass getBoolRC() const* {
896	return getRegisterInfo()->getBoolRC();
897	}
898
899	/// \returns Maximum number of work groups per compute unit supported by the
900	/// subtarget and limited by given \p FlatWorkGroupSize.
901	unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
902	return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(STI: this, FlatWorkGroupSize);
903	}
904
905	/// \returns Minimum flat work group size supported by the subtarget.
906	unsigned getMinFlatWorkGroupSize() const override {
907	return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(STI: this);
908	}
909
910	/// \returns Maximum flat work group size supported by the subtarget.
911	unsigned getMaxFlatWorkGroupSize() const override {
912	return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize();
913	}
914
915	/// \returns Number of waves per execution unit required to support the given
916	/// \p FlatWorkGroupSize.
917	unsigned
918	getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
919	return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(STI: this, FlatWorkGroupSize);
920	}
921
922	/// \returns Minimum number of waves per execution unit supported by the
923	/// subtarget.
924	unsigned getMinWavesPerEU() const override {
925	return AMDGPU::IsaInfo::getMinWavesPerEU(STI: this);
926	}
927
928	void adjustSchedDependency(SUnit Def, int* DefOpIdx, SUnit Use, int* UseOpIdx,
929	SDep &Dep,
930	const TargetSchedModel SchedModel) const* override;
931
932	// \returns true if it's beneficial on this subtarget for the scheduler to
933	// cluster stores as well as loads.
934	bool shouldClusterStores() const { return getGeneration() >= GFX11; }
935
936	// \returns the number of address arguments from which to enable MIMG NSA
937	// on supported architectures.
938	unsigned getNSAThreshold(const MachineFunction &MF) const;
939
940	// \returns true if the subtarget has a hazard requiring an "s_nop 0"
941	// instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
942	bool requiresNopBeforeDeallocVGPRs() const { return !HasGFX1250Insts; }
943
944	// \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on
945	// STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER.
946	bool requiresWaitIdleBeforeGetReg() const { return HasGFX1250Insts; }
947
948	bool isDynamicVGPREnabled() const { return DynamicVGPR; }
949	unsigned getDynamicVGPRBlockSize() const {
950	return DynamicVGPRBlockSize32 ? `32` : `16`;
951	}
952
953	bool requiresDisjointEarlyClobberAndUndef() const override {
954	// AMDGPU doesn't care if early-clobber and undef operands are allocated
955	// to the same register.
956	return false;
957	}
958
959	// DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything
960	// and surronded by S_WAIT_ALU(0xFFE3).
961	bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const {
962	return getGeneration() == GFX12;
963	}
964
965	// Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base
966	// read.
967	bool hasScratchBaseForwardingHazard() const {
968	return HasGFX1250Insts && getGeneration() == GFX12;
969	}
970
971	// src_flat_scratch_hi cannot be used as a source in SALU producing a 64-bit
972	// result.
973	bool hasFlatScratchHiInB64InstHazard() const {
974	return HasGFX1250Insts && getGeneration() == GFX12;
975	}
976
977	/// \returns true if the subtarget requires a wait for xcnt before VMEM
978	/// accesses that must never be repeated in the event of a page fault/re-try.
979	/// Atomic stores/rmw and all volatile accesses fall under this criteria.
980	bool requiresWaitXCntForSingleAccessInstructions() const {
981	return HasGFX1250Insts;
982	}
983
984	/// \returns the number of significant bits in the immediate field of the
985	/// S_NOP instruction.
986	unsigned getSNopBits() const {
987	if (getGeneration() >= AMDGPUSubtarget::GFX12)
988	return `7`;
989	if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
990	return `4`;
991	return `3`;
992	}
993
994	bool supportsBPermute() const {
995	return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
996	}
997
998	bool supportsWaveWideBPermute() const {
999	return (getGeneration() <= AMDGPUSubtarget::GFX9 \|\|
1000	getGeneration() == AMDGPUSubtarget::GFX12) \|\|
1001	isWave32();
1002	}
1003
1004	/// Return true if real (non-fake) variants of True16 instructions using
1005	/// 16-bit registers should be code-generated. Fake True16 instructions are
1006	/// identical to non-fake ones except that they take 32-bit registers as
1007	/// operands and always use their low halves.
1008	// TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
1009	// supported and the support for fake True16 instructions is removed.
1010	bool useRealTrue16Insts() const {
1011	return hasTrue16BitInsts() && EnableRealTrue16Insts;
1012	}
1013
1014	bool requiresWaitOnWorkgroupReleaseFence() const {
1015	return getGeneration() >= GFX10 \|\| isTgSplitEnabled();
1016	}
1017	};
1018
1019	class GCNUserSGPRUsageInfo {
1020	public:
1021	bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
1022
1023	bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
1024
1025	bool hasDispatchPtr() const { return DispatchPtr; }
1026
1027	bool hasQueuePtr() const { return QueuePtr; }
1028
1029	bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
1030
1031	bool hasDispatchID() const { return DispatchID; }
1032
1033	bool hasFlatScratchInit() const { return FlatScratchInit; }
1034
1035	bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }
1036
1037	unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1038
1039	unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1040
1041	unsigned getNumFreeUserSGPRs();
1042
1043	void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1044
1045	enum UserSGPRID : unsigned {
1046	ImplicitBufferPtrID = `0`,
1047	PrivateSegmentBufferID = `1`,
1048	DispatchPtrID = `2`,
1049	QueuePtrID = `3`,
1050	KernargSegmentPtrID = `4`,
1051	DispatchIdID = `5`,
1052	FlatScratchInitID = `6`,
1053	PrivateSegmentSizeID = `7`
1054	};
1055
1056	// Returns the size in number of SGPRs for preload user SGPR field.
1057	static unsigned getNumUserSGPRForField(UserSGPRID ID) {
1058	switch (ID) {
1059	case ImplicitBufferPtrID:
1060	return `2`;
1061	case PrivateSegmentBufferID:
1062	return `4`;
1063	case DispatchPtrID:
1064	return `2`;
1065	case QueuePtrID:
1066	return `2`;
1067	case KernargSegmentPtrID:
1068	return `2`;
1069	case DispatchIdID:
1070	return `2`;
1071	case FlatScratchInitID:
1072	return `2`;
1073	case PrivateSegmentSizeID:
1074	return `1`;
1075	}
1076	llvm_unreachable("Unknown UserSGPRID.");
1077	}
1078
1079	GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
1080
1081	private:
1082	const GCNSubtarget &ST;
1083
1084	// Private memory buffer
1085	// Compute directly in sgpr[0:1]
1086	// Other shaders indirect 64-bits at sgpr[0:1]
1087	bool ImplicitBufferPtr = false;
1088
1089	bool PrivateSegmentBuffer = false;
1090
1091	bool DispatchPtr = false;
1092
1093	bool QueuePtr = false;
1094
1095	bool KernargSegmentPtr = false;
1096
1097	bool DispatchID = false;
1098
1099	bool FlatScratchInit = false;
1100
1101	bool PrivateSegmentSize = false;
1102
1103	unsigned NumKernargPreloadSGPRs = `0`;
1104
1105	unsigned NumUsedUserSGPRs = `0`;
1106	};
1107
1108	} // end namespace llvm
1109
1110	#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1111

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/GCNSubtarget.h