1//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// AMD GCN specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16
17#include "AMDGPUCallLowering.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUSubtarget.h"
20#include "SIFrameLowering.h"
21#include "SIISelLowering.h"
22#include "SIInstrInfo.h"
23#include "Utils/AMDGPUBaseInfo.h"
24#include "llvm/Support/ErrorHandling.h"
25
26#define GET_SUBTARGETINFO_HEADER
27#include "AMDGPUGenSubtargetInfo.inc"
28
29namespace llvm {
30
31class GCNTargetMachine;
32
33class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
34 public AMDGPUSubtarget {
35public:
36 using AMDGPUSubtarget::getMaxWavesPerEU;
37
38 // Following 2 enums are documented at:
39 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
40 enum class TrapHandlerAbi {
41 NONE = 0x00,
42 AMDHSA = 0x01,
43 };
44
45 enum class TrapID {
46 LLVMAMDHSATrap = 0x02,
47 LLVMAMDHSADebugTrap = 0x03,
48 };
49
50private:
51 /// SelectionDAGISel related APIs.
52 std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
53
54 /// GlobalISel related APIs.
55 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
56 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
57 std::unique_ptr<InstructionSelector> InstSelector;
58 std::unique_ptr<LegalizerInfo> Legalizer;
59 std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
60
61protected:
62 // Basic subtarget description.
63 AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
64 unsigned Gen = INVALID;
65 InstrItineraryData InstrItins;
66 int LDSBankCount = 0;
67 unsigned MaxPrivateElementSize = 0;
68
69 // Instruction cache line size in bytes; set from TableGen subtarget features.
70 unsigned InstCacheLineSize = 0;
71
72 // Dynamically set bits that enable features.
73 bool DynamicVGPR = false;
74 bool DynamicVGPRBlockSize32 = false;
75 bool ScalarizeGlobal = false;
76
77 /// The maximum number of instructions that may be placed within an S_CLAUSE,
78 /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
79 /// indicates a lack of S_CLAUSE support.
80 unsigned MaxHardClauseLength = 0;
81
82#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
83 bool ATTRIBUTE = DEFAULT;
84#include "AMDGPUGenSubtargetInfo.inc"
85
86private:
87 SIInstrInfo InstrInfo;
88 SITargetLowering TLInfo;
89 SIFrameLowering FrameLowering;
90
91public:
92 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
93 const GCNTargetMachine &TM);
94 ~GCNSubtarget() override;
95
96 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, StringRef GPU,
97 StringRef FS);
98
99 /// Diagnose inconsistent subtarget features before attempting to codegen
100 /// function \p F.
101 void checkSubtargetFeatures(const Function &F) const;
102
103 const SIInstrInfo *getInstrInfo() const override { return &InstrInfo; }
104
105 const SIFrameLowering *getFrameLowering() const override {
106 return &FrameLowering;
107 }
108
109 const SITargetLowering *getTargetLowering() const override { return &TLInfo; }
110
111 const SIRegisterInfo *getRegisterInfo() const override {
112 return &InstrInfo.getRegisterInfo();
113 }
114
115 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
116
117 const CallLowering *getCallLowering() const override {
118 return CallLoweringInfo.get();
119 }
120
121 const InlineAsmLowering *getInlineAsmLowering() const override {
122 return InlineAsmLoweringInfo.get();
123 }
124
125 InstructionSelector *getInstructionSelector() const override {
126 return InstSelector.get();
127 }
128
129 const LegalizerInfo *getLegalizerInfo() const override {
130 return Legalizer.get();
131 }
132
133 const AMDGPURegisterBankInfo *getRegBankInfo() const override {
134 return RegBankInfo.get();
135 }
136
137 const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
138 return TargetID;
139 }
140
141 const InstrItineraryData *getInstrItineraryData() const override {
142 return &InstrItins;
143 }
144
145 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
146
147 Generation getGeneration() const { return (Generation)Gen; }
148
149 bool isGFX11Plus() const { return getGeneration() >= GFX11; }
150
151#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
152 bool GETTER() const override { return ATTRIBUTE; }
153#include "AMDGPUGenSubtargetInfo.inc"
154
155 unsigned getMaxWaveScratchSize() const {
156 // See COMPUTE_TMPRING_SIZE.WAVESIZE.
157 if (getGeneration() >= GFX12) {
158 // 18-bit field in units of 64-dword.
159 return (64 * 4) * ((1 << 18) - 1);
160 }
161 if (getGeneration() == GFX11) {
162 // 15-bit field in units of 64-dword.
163 return (64 * 4) * ((1 << 15) - 1);
164 }
165 // 13-bit field in units of 256-dword.
166 return (256 * 4) * ((1 << 13) - 1);
167 }
168
169 /// Return the number of high bits known to be zero for a frame index.
170 unsigned getKnownHighZeroBitsForFrameIndex() const {
171 return llvm::countl_zero(Val: getMaxWaveScratchSize()) + getWavefrontSizeLog2();
172 }
173
174 int getLDSBankCount() const { return LDSBankCount; }
175
176 /// Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
177 unsigned getInstCacheLineSize() const { return InstCacheLineSize; }
178
179 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
180 return (ForBufferRSrc || !hasFlatScratchEnabled()) ? MaxPrivateElementSize
181 : 16;
182 }
183
184 unsigned getConstantBusLimit(unsigned Opcode) const;
185
186 /// Returns if the result of this instruction with a 16-bit result returned in
187 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
188 /// the original value.
189 bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
190
191 bool supportsWGP() const {
192 if (HasGFX1250Insts)
193 return false;
194 return getGeneration() >= GFX10;
195 }
196
197 bool hasHWFP64() const { return HasFP64; }
198
199 bool hasAddr64() const {
200 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
201 }
202
203 bool hasFlat() const {
204 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
205 }
206
207 // Return true if the target only has the reverse operand versions of VALU
208 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
209 bool hasOnlyRevVALUShifts() const {
210 return getGeneration() >= VOLCANIC_ISLANDS;
211 }
212
213 bool hasFractBug() const { return getGeneration() == SOUTHERN_ISLANDS; }
214
215 bool hasMed3_16() const { return getGeneration() >= AMDGPUSubtarget::GFX9; }
216
217 bool hasMin3Max3_16() const {
218 return getGeneration() >= AMDGPUSubtarget::GFX9;
219 }
220
221 bool hasSwap() const { return HasGFX9Insts; }
222
223 bool hasScalarPackInsts() const { return HasGFX9Insts; }
224
225 bool hasScalarMulHiInsts() const { return HasGFX9Insts; }
226
227 bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
228
229 TrapHandlerAbi getTrapHandlerAbi() const {
230 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
231 }
232
233 bool supportsGetDoorbellID() const {
234 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
235 return getGeneration() >= GFX9;
236 }
237
238 /// True if the offset field of DS instructions works as expected. On SI, the
239 /// offset uses a 16-bit adder and does not always wrap properly.
240 bool hasUsableDSOffset() const { return getGeneration() >= SEA_ISLANDS; }
241
242 bool unsafeDSOffsetFoldingEnabled() const {
243 return EnableUnsafeDSOffsetFolding;
244 }
245
246 /// Condition output from div_scale is usable.
247 bool hasUsableDivScaleConditionOutput() const {
248 return getGeneration() != SOUTHERN_ISLANDS;
249 }
250
251 /// Extra wait hazard is needed in some cases before
252 /// s_cbranch_vccnz/s_cbranch_vccz.
253 bool hasReadVCCZBug() const { return getGeneration() <= SEA_ISLANDS; }
254
255 /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
256 bool partialVCCWritesUpdateVCCZ() const { return getGeneration() >= GFX10; }
257
258 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
259 /// was written by a VALU instruction.
260 bool hasSMRDReadVALUDefHazard() const {
261 return getGeneration() == SOUTHERN_ISLANDS;
262 }
263
264 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
265 /// SGPR was written by a VALU Instruction.
266 bool hasVMEMReadSGPRVALUDefHazard() const {
267 return getGeneration() >= VOLCANIC_ISLANDS;
268 }
269
270 bool hasRFEHazards() const { return getGeneration() >= VOLCANIC_ISLANDS; }
271
272 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
273 unsigned getSetRegWaitStates() const {
274 return getGeneration() <= SEA_ISLANDS ? 1 : 2;
275 }
276
277 /// Return the amount of LDS that can be used that will not restrict the
278 /// occupancy lower than WaveCount.
279 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
280 const Function &) const;
281
282 bool supportsMinMaxDenormModes() const {
283 return getGeneration() >= AMDGPUSubtarget::GFX9;
284 }
285
286 /// \returns If target supports S_DENORM_MODE.
287 bool hasDenormModeInst() const {
288 return getGeneration() >= AMDGPUSubtarget::GFX10;
289 }
290
291 /// \returns If target supports ds_read/write_b128 and user enables generation
292 /// of ds_read/write_b128.
293 bool useDS128() const { return HasCIInsts && EnableDS128; }
294
295 /// \return If target supports ds_read/write_b96/128.
296 bool hasDS96AndDS128() const { return HasCIInsts; }
297
298 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
299 bool haveRoundOpsF64() const { return HasCIInsts; }
300
301 /// \returns If MUBUF instructions always perform range checking, even for
302 /// buffer resources used for private memory access.
303 bool privateMemoryResourceIsRangeChecked() const {
304 return getGeneration() < AMDGPUSubtarget::GFX9;
305 }
306
307 /// \returns If target requires PRT Struct NULL support (zero result registers
308 /// for sparse texture support).
309 bool usePRTStrictNull() const { return EnablePRTStrictNull; }
310
311 bool hasUnalignedBufferAccessEnabled() const {
312 return HasUnalignedBufferAccess && HasUnalignedAccessMode;
313 }
314
315 bool hasUnalignedDSAccessEnabled() const {
316 return HasUnalignedDSAccess && HasUnalignedAccessMode;
317 }
318
319 bool hasUnalignedScratchAccessEnabled() const {
320 return HasUnalignedScratchAccess && HasUnalignedAccessMode;
321 }
322
323 bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); }
324
325 bool isTgSplitEnabled() const { return EnableTgSplit; }
326
327 bool isCuModeEnabled() const { return EnableCuMode; }
328
329 bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
330
331 bool hasFlatScrRegister() const { return hasFlatAddressSpace(); }
332
333 // Check if target supports ST addressing mode with FLAT scratch instructions.
334 // The ST addressing mode means no registers are used, either VGPR or SGPR,
335 // but only immediate offset is swizzled and added to the FLAT scratch base.
336 bool hasFlatScratchSTMode() const {
337 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
338 }
339
340 bool hasFlatScratchSVSMode() const { return HasGFX940Insts || HasGFX11Insts; }
341
342 bool hasFlatScratchEnabled() const {
343 return hasArchitectedFlatScratch() ||
344 (EnableFlatScratch && hasFlatScratchInsts());
345 }
346
347 bool hasGlobalAddTidInsts() const { return HasGFX10_BEncoding; }
348
349 bool hasAtomicCSub() const { return HasGFX10_BEncoding; }
350
351 bool hasMTBUFInsts() const { return !hasGFX1250Insts(); }
352
353 bool hasFormattedMUBUFInsts() const { return !hasGFX1250Insts(); }
354
355 bool hasExportInsts() const {
356 return !hasGFX940Insts() && !hasGFX1250Insts();
357 }
358
359 bool hasVINTERPEncoding() const {
360 return HasGFX11Insts && !hasGFX1250Insts();
361 }
362
363 // DS_ADD_F64/DS_ADD_RTN_F64
364 bool hasLdsAtomicAddF64() const {
365 return hasGFX90AInsts() || hasGFX1250Insts();
366 }
367
368 bool hasMultiDwordFlatScratchAddressing() const {
369 return getGeneration() >= GFX9;
370 }
371
372 bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; }
373
374 bool hasD16LoadStore() const { return getGeneration() >= GFX9; }
375
  /// \returns true if D16 loads preserve the unused high bits of the
  /// destination register. Requires D16 load/store support, and does not hold
  /// when SRAM-ECC is (or may be) enabled on this target.
  bool d16PreservesUnusedBits() const {
    return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
  }
379
380 bool hasD16Images() const { return getGeneration() >= VOLCANIC_ISLANDS; }
381
382 /// Return if most LDS instructions have an m0 use that require m0 to be
383 /// initialized.
384 bool ldsRequiresM0Init() const { return getGeneration() < GFX9; }
385
386 // True if the hardware rewinds and replays GWS operations if a wave is
387 // preempted.
388 //
389 // If this is false, a GWS operation requires testing if a nack set the
390 // MEM_VIOL bit, and repeating if so.
391 bool hasGWSAutoReplay() const { return getGeneration() >= GFX9; }
392
393 /// \returns if target has ds_gws_sema_release_all instruction.
394 bool hasGWSSemaReleaseAll() const { return HasCIInsts; }
395
396 bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
397
398 bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
399
400 // Covers VS/PS/CS graphics shaders
401 bool isMesaGfxShader(const Function &F) const {
402 return isMesa3DOS() && AMDGPU::isShader(CC: F.getCallingConv());
403 }
404
405 bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; }
406
407 bool hasAtomicFaddInsts() const {
408 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
409 }
410
411 bool vmemWriteNeedsExpWaitcnt() const {
412 return getGeneration() < SEA_ISLANDS;
413 }
414
415 bool hasInstPrefetch() const {
416 return getGeneration() == GFX10 || getGeneration() == GFX11;
417 }
418
419 bool hasPrefetch() const { return HasGFX12Insts; }
420
421 // Has s_cmpk_* instructions.
422 bool hasSCmpK() const { return getGeneration() < GFX12; }
423
424 // Scratch is allocated in 256 dword per wave blocks for the entire
425 // wavefront. When viewed from the perspective of an arbitrary workitem, this
426 // is 4-byte aligned.
427 //
428 // Only 4-byte alignment is really needed to access anything. Transformations
429 // on the pointer value itself may rely on the alignment / known low bits of
430 // the pointer. Set this to something above the minimum to avoid needing
431 // dynamic realignment in common cases.
432 Align getStackAlignment() const { return Align(16); }
433
434 bool enableMachineScheduler() const override { return true; }
435
436 bool useAA() const override;
437
438 bool enableSubRegLiveness() const override { return true; }
439
440 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
441 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
442
443 // static wrappers
444 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
445
446 // XXX - Why is this here if it isn't in the default pass set?
447 bool enableEarlyIfConversion() const override { return true; }
448
449 void overrideSchedPolicy(MachineSchedPolicy &Policy,
450 const SchedRegion &Region) const override;
451
452 void overridePostRASchedPolicy(MachineSchedPolicy &Policy,
453 const SchedRegion &Region) const override;
454
455 void mirFileLoaded(MachineFunction &MF) const override;
456
457 unsigned getMaxNumUserSGPRs() const {
458 return AMDGPU::getMaxNumUserSGPRs(STI: *this);
459 }
460
461 bool useVGPRIndexMode() const;
462
463 bool hasScalarCompareEq64() const {
464 return getGeneration() >= VOLCANIC_ISLANDS;
465 }
466
467 bool hasLDSFPAtomicAddF32() const { return HasGFX8Insts; }
468 bool hasLDSFPAtomicAddF64() const {
469 return HasGFX90AInsts || HasGFX1250Insts;
470 }
471
472 /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
473 bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
474
475 /// \returns true if the subtarget has the v_permlane64_b32 instruction.
476 bool hasPermLane64() const { return getGeneration() >= GFX11; }
477
478 bool hasDPPRowShare() const {
479 return HasDPP && (HasGFX90AInsts || getGeneration() >= GFX10);
480 }
481
482 // Has V_PK_MOV_B32 opcode
483 bool hasPkMovB32() const { return HasGFX90AInsts; }
484
485 bool hasFmaakFmamkF32Insts() const {
486 return getGeneration() >= GFX10 || hasGFX940Insts();
487 }
488
489 bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); }
490
491 bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
492
493 unsigned getNSAMaxSize(bool HasSampler = false) const {
494 return AMDGPU::getNSAMaxSize(STI: *this, HasSampler);
495 }
496
497 bool hasMadF16() const;
498
499 bool hasMovB64() const { return HasGFX940Insts || HasGFX1250Insts; }
500
501 // Scalar and global loads support scale_offset bit.
502 bool hasScaleOffset() const { return HasGFX1250Insts; }
503
504 // FLAT GLOBAL VOffset is signed
505 bool hasSignedGVSOffset() const { return HasGFX1250Insts; }
506
507 bool loadStoreOptEnabled() const { return EnableLoadStoreOpt; }
508
  /// \returns true if the user-SGPR-init-16 hardware bug workaround is
  /// needed: the target has the bug and is running in wave32 mode.
  bool hasUserSGPRInit16BugInWave32() const {
    return HasUserSGPRInit16Bug && isWave32();
  }
512
513 bool has12DWordStoreHazard() const {
514 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
515 }
516
517 // \returns true if the subtarget supports DWORDX3 load/store instructions.
518 bool hasDwordx3LoadStores() const { return HasCIInsts; }
519
520 bool hasReadM0MovRelInterpHazard() const {
521 return getGeneration() == AMDGPUSubtarget::GFX9;
522 }
523
524 bool hasReadM0SendMsgHazard() const {
525 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
526 getGeneration() <= AMDGPUSubtarget::GFX9;
527 }
528
529 bool hasReadM0LdsDmaHazard() const {
530 return getGeneration() == AMDGPUSubtarget::GFX9;
531 }
532
533 bool hasReadM0LdsDirectHazard() const {
534 return getGeneration() == AMDGPUSubtarget::GFX9;
535 }
536
  /// \returns true if the LDS-misaligned-access hardware bug is a concern:
  /// the target has the bug and CU mode is not enabled (i.e. WGP mode).
  bool hasLDSMisalignedBugInWGPMode() const {
    return HasLDSMisalignedBug && !EnableCuMode;
  }
540
541 // Shift amount of a 64 bit shift cannot be a highest allocated register
542 // if also at the end of the allocation block.
543 bool hasShift64HighRegBug() const { return HasGFX90AInsts; }
544
545 // Has one cycle hazard on transcendental instruction feeding a
546 // non transcendental VALU.
547 bool hasTransForwardingHazard() const { return HasGFX940Insts; }
548
549 // Has one cycle hazard on a VALU instruction partially writing dst with
550 // a shift of result bits feeding another VALU instruction.
551 bool hasDstSelForwardingHazard() const { return HasGFX940Insts; }
552
553 // Cannot use op_sel with v_dot instructions.
554 bool hasDOTOpSelHazard() const { return HasGFX940Insts || HasGFX11Insts; }
555
556 // Does not have HW interlocs for VALU writing and then reading SGPRs.
557 bool hasVDecCoExecHazard() const { return HasGFX940Insts; }
558
559 bool hasHardClauses() const { return MaxHardClauseLength > 0; }
560
561 bool hasFPAtomicToDenormModeHazard() const {
562 return getGeneration() == GFX10;
563 }
564
565 bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
566
567 bool hasLdsDirect() const { return getGeneration() >= GFX11; }
568
569 bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
570
571 bool hasVALUPartialForwardingHazard() const {
572 return getGeneration() == GFX11;
573 }
574
575 bool hasCvtScaleForwardingHazard() const { return HasGFX950Insts; }
576
577 // All GFX9 targets experience a fetch delay when an instruction at the start
578 // of a loop header is split by a 32-byte fetch window boundary, but GFX950
579 // is uniquely sensitive to this: the delay triggers further performance
580 // degradation beyond the fetch latency itself.
581 bool hasLoopHeadInstSplitSensitivity() const { return HasGFX950Insts; }
582
583 bool requiresCodeObjectV6() const { return RequiresCOV6; }
584
585 bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
586
587 bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
588
  /// \returns true if a hazard exists when a VALU instruction reads an SGPR:
  /// applies to GFX12 targets, but not to GFX1250.
  bool hasVALUReadSGPRHazard() const {
    return HasGFX12Insts && !HasGFX1250Insts;
  }
592
593 bool setRegModeNeedsVNOPs() const {
594 return HasGFX1250Insts && getGeneration() == GFX12;
595 }
596
597 /// Return if operations acting on VGPR tuples require even alignment.
598 bool needsAlignedVGPRs() const { return RequiresAlignVGPR; }
599
600 /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
601 bool hasSPackHL() const { return HasGFX11Insts; }
602
603 /// Return true if the target's EXP instruction has the COMPR flag, which
604 /// affects the meaning of the EN (enable) bits.
605 bool hasCompressedExport() const { return !HasGFX11Insts; }
606
607 /// Return true if the target's EXP instruction supports the NULL export
608 /// target.
609 bool hasNullExportTarget() const { return !HasGFX11Insts; }
610
611 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
612
613 /// Return true if the target has the S_DELAY_ALU instruction.
614 bool hasDelayAlu() const { return HasGFX11Insts; }
615
616 /// Returns true if the target supports
617 /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
618 /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
619 bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); }
620
621 /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
622 /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
623 bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
624
625 /// \returns true if the target has packed f32 instructions that only read 32
626 /// bits from a scalar operand (SGPR or literal) and replicates the bits to
627 /// both channels.
628 bool hasPKF32InstsReplicatingLower32BitsOfScalarInput() const {
629 return getGeneration() == GFX12 && HasGFX1250Insts;
630 }
631
632 bool hasAddPC64Inst() const { return HasGFX1250Insts; }
633
634 /// \returns true if the target supports expert scheduling mode 2 which relies
635 /// on the compiler to insert waits to avoid hazards between VMEM and VALU
636 /// instructions in some instances.
637 bool hasExpertSchedulingMode() const { return getGeneration() >= GFX12; }
638
639 /// \returns The maximum number of instructions that can be enclosed in an
640 /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
641 /// instruction.
642 unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
643
644 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
645 /// SGPRs
646 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
647
648 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
649 /// VGPRs
650 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
651 unsigned DynamicVGPRBlockSize) const;
652
653 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
654 /// be achieved when the only function running on a CU is \p F, each workgroup
655 /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
656 /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
657 /// range, so this returns a range as well.
658 ///
659 /// Note that occupancy can be affected by the scratch allocation as well, but
660 /// we do not have enough information to compute it.
661 std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
662 unsigned LDSSize = 0,
663 unsigned NumSGPRs = 0,
664 unsigned NumVGPRs = 0) const;
665
666 /// \returns true if the flat_scratch register should be initialized with the
667 /// pointer to the wave's scratch memory rather than a size and offset.
668 bool flatScratchIsPointer() const {
669 return getGeneration() >= AMDGPUSubtarget::GFX9;
670 }
671
672 /// \returns true if the machine has merged shaders in which s0-s7 are
673 /// reserved by the hardware and user SGPRs start at s8
674 bool hasMergedShaders() const { return getGeneration() >= GFX9; }
675
676 // \returns true if the target supports the pre-NGG legacy geometry path.
677 bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
678
679 // \returns true if the target has split barriers feature
680 bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
681
682 // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
683 bool hasRrWGMode() const { return getGeneration() >= GFX12; }
684
685 /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
686 /// values.
687 bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
688
689 bool hasINVWBL2WaitCntRequirement() const { return HasGFX1250Insts; }
690
691 bool hasVOPD3() const { return HasGFX1250Insts; }
692
693 // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
694 bool hasVectorMulU64() const { return HasGFX1250Insts; }
695
696 // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32
697 // instructions.
698 bool hasMadU64U32NoCarry() const { return HasGFX1250Insts; }
699
700 // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions.
701 bool hasIntMinMax64() const { return HasGFX1250Insts; }
702
703 // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
704 bool hasPkMinMax3Insts() const { return HasGFX1250Insts; }
705
  // \returns true if target has S_GET_SHADER_CYCLES_U64 instruction.
  bool hasSGetShaderCyclesInst() const { return HasGFX1250Insts; }
708
709 // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
710 // of sign-extending. Note that GFX1250 has not only fixed the bug but also
711 // extended VA to 57 bits.
712 bool hasGetPCZeroExtension() const {
713 return HasGFX12Insts && !HasGFX1250Insts;
714 }
715
716 // \returns true if the target needs to create a prolog for backward
717 // compatibility when preloading kernel arguments.
718 bool needsKernArgPreloadProlog() const {
719 return hasKernargPreload() && !HasGFX1250Insts;
720 }
721
722 bool hasCondSubInsts() const { return HasGFX12Insts; }
723
724 bool hasSubClampInsts() const { return hasGFX10_3Insts(); }
725
726 /// \returns SGPR allocation granularity supported by the subtarget.
727 unsigned getSGPRAllocGranule() const {
728 return AMDGPU::IsaInfo::getSGPRAllocGranule(STI: this);
729 }
730
731 /// \returns SGPR encoding granularity supported by the subtarget.
732 unsigned getSGPREncodingGranule() const {
733 return AMDGPU::IsaInfo::getSGPREncodingGranule(STI: this);
734 }
735
736 /// \returns Total number of SGPRs supported by the subtarget.
737 unsigned getTotalNumSGPRs() const {
738 return AMDGPU::IsaInfo::getTotalNumSGPRs(STI: this);
739 }
740
741 /// \returns Addressable number of SGPRs supported by the subtarget.
742 unsigned getAddressableNumSGPRs() const {
743 return AMDGPU::IsaInfo::getAddressableNumSGPRs(STI: this);
744 }
745
746 /// \returns Minimum number of SGPRs that meets the given number of waves per
747 /// execution unit requirement supported by the subtarget.
748 unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
749 return AMDGPU::IsaInfo::getMinNumSGPRs(STI: this, WavesPerEU);
750 }
751
752 /// \returns Maximum number of SGPRs that meets the given number of waves per
753 /// execution unit requirement supported by the subtarget.
754 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
755 return AMDGPU::IsaInfo::getMaxNumSGPRs(STI: this, WavesPerEU, Addressable);
756 }
757
758 /// \returns Reserved number of SGPRs. This is common
759 /// utility function called by MachineFunction and
760 /// Function variants of getReservedNumSGPRs.
761 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
762 /// \returns Reserved number of SGPRs for given machine function \p MF.
763 unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
764
765 /// \returns Reserved number of SGPRs for given function \p F.
766 unsigned getReservedNumSGPRs(const Function &F) const;
767
768 /// \returns Maximum number of preloaded SGPRs for the subtarget.
769 unsigned getMaxNumPreloadedSGPRs() const;
770
771 /// \returns max num SGPRs. This is the common utility
772 /// function called by MachineFunction and Function
773 /// variants of getMaxNumSGPRs.
774 unsigned getBaseMaxNumSGPRs(const Function &F,
775 std::pair<unsigned, unsigned> WavesPerEU,
776 unsigned PreloadedSGPRs,
777 unsigned ReservedNumSGPRs) const;
778
779 /// \returns Maximum number of SGPRs that meets number of waves per execution
780 /// unit requirement for function \p MF, or number of SGPRs explicitly
781 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
782 ///
783 /// \returns Value that meets number of waves per execution unit requirement
784 /// if explicitly requested value cannot be converted to integer, violates
785 /// subtarget's specifications, or does not meet number of waves per execution
786 /// unit requirement.
787 unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
788
789 /// \returns Maximum number of SGPRs that meets number of waves per execution
790 /// unit requirement for function \p F, or number of SGPRs explicitly
791 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
792 ///
793 /// \returns Value that meets number of waves per execution unit requirement
794 /// if explicitly requested value cannot be converted to integer, violates
795 /// subtarget's specifications, or does not meet number of waves per execution
796 /// unit requirement.
797 unsigned getMaxNumSGPRs(const Function &F) const;
798
799 /// \returns VGPR allocation granularity supported by the subtarget.
800 unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
801 return AMDGPU::IsaInfo::getVGPRAllocGranule(STI: this, DynamicVGPRBlockSize);
802 }
803
804 /// \returns VGPR encoding granularity supported by the subtarget.
805 unsigned getVGPREncodingGranule() const {
806 return AMDGPU::IsaInfo::getVGPREncodingGranule(STI: this);
807 }
808
809 /// \returns Total number of VGPRs supported by the subtarget.
810 unsigned getTotalNumVGPRs() const {
811 return AMDGPU::IsaInfo::getTotalNumVGPRs(STI: this);
812 }
813
814 /// \returns Addressable number of architectural VGPRs supported by the
815 /// subtarget.
816 unsigned getAddressableNumArchVGPRs() const {
817 return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(STI: this);
818 }
819
820 /// \returns Addressable number of VGPRs supported by the subtarget.
821 unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
822 return AMDGPU::IsaInfo::getAddressableNumVGPRs(STI: this, DynamicVGPRBlockSize);
823 }
824
825 /// \returns the minimum number of VGPRs that will prevent achieving more than
826 /// the specified number of waves \p WavesPerEU.
827 unsigned getMinNumVGPRs(unsigned WavesPerEU,
828 unsigned DynamicVGPRBlockSize) const {
829 return AMDGPU::IsaInfo::getMinNumVGPRs(STI: this, WavesPerEU,
830 DynamicVGPRBlockSize);
831 }
832
833 /// \returns the maximum number of VGPRs that can be used and still achieved
834 /// at least the specified number of waves \p WavesPerEU.
835 unsigned getMaxNumVGPRs(unsigned WavesPerEU,
836 unsigned DynamicVGPRBlockSize) const {
837 return AMDGPU::IsaInfo::getMaxNumVGPRs(STI: this, WavesPerEU,
838 DynamicVGPRBlockSize);
839 }
840
841 /// \returns max num VGPRs. This is the common utility function
842 /// called by MachineFunction and Function variants of getMaxNumVGPRs.
843 unsigned
844 getBaseMaxNumVGPRs(const Function &F,
845 std::pair<unsigned, unsigned> NumVGPRBounds) const;
846
  /// \returns Maximum number of VGPRs that meets the number of waves per
  /// execution unit requirement for function \p F, or the number of VGPRs
  /// explicitly requested using the "amdgpu-num-vgpr" attribute attached to
  /// function \p F.
  ///
  /// \returns Value that meets the number of waves per execution unit
  /// requirement if the explicitly requested value cannot be converted to
  /// integer, violates the subtarget's specifications, or does not meet the
  /// number of waves per execution unit requirement.
  unsigned getMaxNumVGPRs(const Function &F) const;
856
857 unsigned getMaxNumAGPRs(const Function &F) const { return getMaxNumVGPRs(F); }
858
  /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
  /// of waves per execution unit required for the function \p F.
  std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const;
862
  /// \returns Maximum number of VGPRs that meets the number of waves per
  /// execution unit requirement for machine function \p MF, or the number of
  /// VGPRs explicitly requested using the "amdgpu-num-vgpr" attribute attached
  /// to the function of \p MF.
  ///
  /// \returns Value that meets the number of waves per execution unit
  /// requirement if the explicitly requested value cannot be converted to
  /// integer, violates the subtarget's specifications, or does not meet the
  /// number of waves per execution unit requirement.
  unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
872
873 bool supportsWave32() const { return getGeneration() >= GFX10; }
874
875 bool supportsWave64() const { return !hasGFX1250Insts(); }
876
877 bool isWave32() const { return getWavefrontSize() == 32; }
878
879 bool isWave64() const { return getWavefrontSize() == 64; }
880
881 /// Returns if the wavesize of this subtarget is known reliable. This is false
882 /// only for the a default target-cpu that does not have an explicit
883 /// +wavefrontsize target feature.
884 bool isWaveSizeKnown() const {
885 return hasFeature(Feature: AMDGPU::FeatureWavefrontSize32) ||
886 hasFeature(Feature: AMDGPU::FeatureWavefrontSize64);
887 }
888
889 const TargetRegisterClass *getBoolRC() const {
890 return getRegisterInfo()->getBoolRC();
891 }
892
893 /// \returns Maximum number of work groups per compute unit supported by the
894 /// subtarget and limited by given \p FlatWorkGroupSize.
895 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
896 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(STI: this, FlatWorkGroupSize);
897 }
898
899 /// \returns Minimum flat work group size supported by the subtarget.
900 unsigned getMinFlatWorkGroupSize() const override {
901 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(STI: this);
902 }
903
904 /// \returns Maximum flat work group size supported by the subtarget.
905 unsigned getMaxFlatWorkGroupSize() const override {
906 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize();
907 }
908
909 /// \returns Number of waves per execution unit required to support the given
910 /// \p FlatWorkGroupSize.
911 unsigned
912 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
913 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(STI: this, FlatWorkGroupSize);
914 }
915
916 /// \returns Minimum number of waves per execution unit supported by the
917 /// subtarget.
918 unsigned getMinWavesPerEU() const override {
919 return AMDGPU::IsaInfo::getMinWavesPerEU(STI: this);
920 }
921
  /// Target hook that may adjust the scheduling dependency \p Dep between the
  /// \p Def and \p Use scheduling units (operand indices \p DefOpIdx and
  /// \p UseOpIdx); defined out of line.
  void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
                             SDep &Dep,
                             const TargetSchedModel *SchedModel) const override;
925
926 // \returns true if it's beneficial on this subtarget for the scheduler to
927 // cluster stores as well as loads.
928 bool shouldClusterStores() const { return getGeneration() >= GFX11; }
929
  // \returns the number of address arguments from which to enable MIMG NSA
  // on supported architectures; defined out of line.
  unsigned getNSAThreshold(const MachineFunction &MF) const;
933
934 // \returns true if the subtarget has a hazard requiring an "s_nop 0"
935 // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
936 bool requiresNopBeforeDeallocVGPRs() const { return !HasGFX1250Insts; }
937
938 // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on
939 // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER.
940 bool requiresWaitIdleBeforeGetReg() const { return HasGFX1250Insts; }
941
942 bool isDynamicVGPREnabled() const { return DynamicVGPR; }
943 unsigned getDynamicVGPRBlockSize() const {
944 return DynamicVGPRBlockSize32 ? 32 : 16;
945 }
946
947 bool requiresDisjointEarlyClobberAndUndef() const override {
948 // AMDGPU doesn't care if early-clobber and undef operands are allocated
949 // to the same register.
950 return false;
951 }
952
953 // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything
954 // and surronded by S_WAIT_ALU(0xFFE3).
955 bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const {
956 return getGeneration() == GFX12;
957 }
958
959 // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base
960 // read.
961 bool hasScratchBaseForwardingHazard() const {
962 return HasGFX1250Insts && getGeneration() == GFX12;
963 }
964
965 // src_flat_scratch_hi cannot be used as a source in SALU producing a 64-bit
966 // result.
967 bool hasFlatScratchHiInB64InstHazard() const {
968 return HasGFX1250Insts && getGeneration() == GFX12;
969 }
970
971 /// \returns true if the subtarget requires a wait for xcnt before VMEM
972 /// accesses that must never be repeated in the event of a page fault/re-try.
973 /// Atomic stores/rmw and all volatile accesses fall under this criteria.
974 bool requiresWaitXCntForSingleAccessInstructions() const {
975 return HasGFX1250Insts;
976 }
977
978 /// \returns the number of significant bits in the immediate field of the
979 /// S_NOP instruction.
980 unsigned getSNopBits() const {
981 if (getGeneration() >= AMDGPUSubtarget::GFX12)
982 return 7;
983 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
984 return 4;
985 return 3;
986 }
987
988 bool supportsBPermute() const {
989 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
990 }
991
992 bool supportsWaveWideBPermute() const {
993 return (getGeneration() <= AMDGPUSubtarget::GFX9 ||
994 getGeneration() == AMDGPUSubtarget::GFX12) ||
995 isWave32();
996 }
997
998 /// Return true if real (non-fake) variants of True16 instructions using
999 /// 16-bit registers should be code-generated. Fake True16 instructions are
1000 /// identical to non-fake ones except that they take 32-bit registers as
1001 /// operands and always use their low halves.
1002 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
1003 // supported and the support for fake True16 instructions is removed.
1004 bool useRealTrue16Insts() const {
1005 return hasTrue16BitInsts() && EnableRealTrue16Insts;
1006 }
1007
1008 bool requiresWaitOnWorkgroupReleaseFence() const {
1009 return getGeneration() >= GFX10 || isTgSplitEnabled();
1010 }
1011};
1012
1013class GCNUserSGPRUsageInfo {
1014public:
1015 bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
1016
1017 bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
1018
1019 bool hasDispatchPtr() const { return DispatchPtr; }
1020
1021 bool hasQueuePtr() const { return QueuePtr; }
1022
1023 bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
1024
1025 bool hasDispatchID() const { return DispatchID; }
1026
1027 bool hasFlatScratchInit() const { return FlatScratchInit; }
1028
1029 bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }
1030
1031 unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1032
1033 unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1034
1035 unsigned getNumFreeUserSGPRs();
1036
1037 void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1038
1039 enum UserSGPRID : unsigned {
1040 ImplicitBufferPtrID = 0,
1041 PrivateSegmentBufferID = 1,
1042 DispatchPtrID = 2,
1043 QueuePtrID = 3,
1044 KernargSegmentPtrID = 4,
1045 DispatchIdID = 5,
1046 FlatScratchInitID = 6,
1047 PrivateSegmentSizeID = 7
1048 };
1049
1050 // Returns the size in number of SGPRs for preload user SGPR field.
1051 static unsigned getNumUserSGPRForField(UserSGPRID ID) {
1052 switch (ID) {
1053 case ImplicitBufferPtrID:
1054 return 2;
1055 case PrivateSegmentBufferID:
1056 return 4;
1057 case DispatchPtrID:
1058 return 2;
1059 case QueuePtrID:
1060 return 2;
1061 case KernargSegmentPtrID:
1062 return 2;
1063 case DispatchIdID:
1064 return 2;
1065 case FlatScratchInitID:
1066 return 2;
1067 case PrivateSegmentSizeID:
1068 return 1;
1069 }
1070 llvm_unreachable("Unknown UserSGPRID.");
1071 }
1072
1073 GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
1074
1075private:
1076 const GCNSubtarget &ST;
1077
1078 // Private memory buffer
1079 // Compute directly in sgpr[0:1]
1080 // Other shaders indirect 64-bits at sgpr[0:1]
1081 bool ImplicitBufferPtr = false;
1082
1083 bool PrivateSegmentBuffer = false;
1084
1085 bool DispatchPtr = false;
1086
1087 bool QueuePtr = false;
1088
1089 bool KernargSegmentPtr = false;
1090
1091 bool DispatchID = false;
1092
1093 bool FlatScratchInit = false;
1094
1095 bool PrivateSegmentSize = false;
1096
1097 unsigned NumKernargPreloadSGPRs = 0;
1098
1099 unsigned NumUsedUserSGPRs = 0;
1100};
1101
1102} // end namespace llvm
1103
1104#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1105