1//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// AMD GCN specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16
17#include "AMDGPUCallLowering.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUSubtarget.h"
20#include "SIFrameLowering.h"
21#include "SIISelLowering.h"
22#include "SIInstrInfo.h"
23#include "Utils/AMDGPUBaseInfo.h"
24#include "llvm/Support/ErrorHandling.h"
25
26#define GET_SUBTARGETINFO_HEADER
27#include "AMDGPUGenSubtargetInfo.inc"
28
29namespace llvm {
30
31class GCNTargetMachine;
32
class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
                           public AMDGPUSubtarget {
public:
  using AMDGPUSubtarget::getMaxWavesPerEU;

  // Following 2 enums are documented at:
  //   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  enum class TrapHandlerAbi {
    NONE = 0x00,
    AMDHSA = 0x01,
  };

  // Trap IDs; see the trap handler ABI documentation linked above.
  enum class TrapID {
    LLVMAMDHSATrap = 0x02,
    LLVMAMDHSADebugTrap = 0x03,
  };
private:
  /// SelectionDAGISel related APIs.
  std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;

  /// GlobalISel related APIs.
  std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
  std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
  std::unique_ptr<InstructionSelector> InstSelector;
  std::unique_ptr<LegalizerInfo> Legalizer;
  std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;

protected:
  // Basic subtarget description.
  AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
  unsigned Gen = INVALID;
  InstrItineraryData InstrItins;
  int LDSBankCount = 0;
  unsigned MaxPrivateElementSize = 0;

  // Instruction cache line size in bytes; set from TableGen subtarget features.
  unsigned InstCacheLineSize = 0;

  // Dynamically set bits that enable features.
  bool DynamicVGPR = false;
  bool DynamicVGPRBlockSize32 = false;
  bool ScalarizeGlobal = false;

  /// The maximum number of instructions that may be placed within an S_CLAUSE,
  /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
  /// indicates a lack of S_CLAUSE support.
  unsigned MaxHardClauseLength = 0;

  // Expand one boolean member per TableGen-defined subtarget feature.
#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER)                    \
  bool ATTRIBUTE = DEFAULT;
#include "AMDGPUGenSubtargetInfo.inc"

private:
  SIInstrInfo InstrInfo;
  SITargetLowering TLInfo;
  SIFrameLowering FrameLowering;
90
public:
  GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
               const GCNTargetMachine &TM);
  ~GCNSubtarget() override;

  GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, StringRef GPU,
                                                StringRef FS);

  /// Diagnose inconsistent subtarget features before attempting to codegen
  /// function \p F.
  void checkSubtargetFeatures(const Function &F) const;

  const SIInstrInfo *getInstrInfo() const override { return &InstrInfo; }

  const SIFrameLowering *getFrameLowering() const override {
    return &FrameLowering;
  }

  const SITargetLowering *getTargetLowering() const override { return &TLInfo; }

  const SIRegisterInfo *getRegisterInfo() const override {
    return &InstrInfo.getRegisterInfo();
  }

  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;

  const CallLowering *getCallLowering() const override {
    return CallLoweringInfo.get();
  }

  const InlineAsmLowering *getInlineAsmLowering() const override {
    return InlineAsmLoweringInfo.get();
  }

  InstructionSelector *getInstructionSelector() const override {
    return InstSelector.get();
  }

  const LegalizerInfo *getLegalizerInfo() const override {
    return Legalizer.get();
  }

  const AMDGPURegisterBankInfo *getRegBankInfo() const override {
    return RegBankInfo.get();
  }

  const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
    return TargetID;
  }

  const InstrItineraryData *getInstrItineraryData() const override {
    return &InstrItins;
  }

  void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);

  Generation getGeneration() const { return (Generation)Gen; }

  bool isGFX11Plus() const { return getGeneration() >= GFX11; }

  // Expand one getter per TableGen-defined subtarget feature.
#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER)                    \
  bool GETTER() const override { return ATTRIBUTE; }
#include "AMDGPUGenSubtargetInfo.inc"
154
155 unsigned getMaxWaveScratchSize() const {
156 // See COMPUTE_TMPRING_SIZE.WAVESIZE.
157 if (getGeneration() >= GFX12) {
158 // 18-bit field in units of 64-dword.
159 return (64 * 4) * ((1 << 18) - 1);
160 }
161 if (getGeneration() == GFX11) {
162 // 15-bit field in units of 64-dword.
163 return (64 * 4) * ((1 << 15) - 1);
164 }
165 // 13-bit field in units of 256-dword.
166 return (256 * 4) * ((1 << 13) - 1);
167 }
168
169 /// Return the number of high bits known to be zero for a frame index.
170 unsigned getKnownHighZeroBitsForFrameIndex() const {
171 return llvm::countl_zero(Val: getMaxWaveScratchSize()) + getWavefrontSizeLog2();
172 }
173
  int getLDSBankCount() const { return LDSBankCount; }

  /// Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
  unsigned getInstCacheLineSize() const { return InstCacheLineSize; }

  // When flat scratch is enabled (and not addressing via a buffer resource),
  // private accesses may use the full 16-byte element size.
  unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
    return (ForBufferRSrc || !hasFlatScratchEnabled()) ? MaxPrivateElementSize
                                                       : 16;
  }

  unsigned getConstantBusLimit(unsigned Opcode) const;

  /// Returns if the result of this instruction with a 16-bit result returned in
  /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
  /// the original value.
  bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
190
191 bool supportsWGP() const {
192 if (HasGFX1250Insts)
193 return false;
194 return getGeneration() >= GFX10;
195 }
196
  bool hasHWFP64() const { return HasFP64; }

  bool hasAddr64() const {
    return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
  }

  bool hasFlat() const {
    return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
  }

  // Return true if the target only has the reverse operand versions of VALU
  // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
  bool hasOnlyRevVALUShifts() const {
    return getGeneration() >= VOLCANIC_ISLANDS;
  }

  bool hasFractBug() const { return getGeneration() == SOUTHERN_ISLANDS; }

  bool hasMed3_16() const { return getGeneration() >= AMDGPUSubtarget::GFX9; }

  bool hasMin3Max3_16() const {
    return getGeneration() >= AMDGPUSubtarget::GFX9;
  }

  bool hasSwap() const { return HasGFX9Insts; }

  bool hasScalarPackInsts() const { return HasGFX9Insts; }

  bool hasScalarMulHiInsts() const { return HasGFX9Insts; }

  bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }

  // AMDHSA is the only OS with a trap handler ABI; all others get NONE.
  TrapHandlerAbi getTrapHandlerAbi() const {
    return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
  }

  bool supportsGetDoorbellID() const {
    // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
    return getGeneration() >= GFX9;
  }

  /// True if the offset field of DS instructions works as expected. On SI, the
  /// offset uses a 16-bit adder and does not always wrap properly.
  bool hasUsableDSOffset() const { return getGeneration() >= SEA_ISLANDS; }

  bool unsafeDSOffsetFoldingEnabled() const {
    return EnableUnsafeDSOffsetFolding;
  }

  /// Condition output from div_scale is usable.
  bool hasUsableDivScaleConditionOutput() const {
    return getGeneration() != SOUTHERN_ISLANDS;
  }

  /// Extra wait hazard is needed in some cases before
  /// s_cbranch_vccnz/s_cbranch_vccz.
  bool hasReadVCCZBug() const { return getGeneration() <= SEA_ISLANDS; }

  /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
  bool partialVCCWritesUpdateVCCZ() const { return getGeneration() >= GFX10; }

  /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
  /// was written by a VALU instruction.
  bool hasSMRDReadVALUDefHazard() const {
    return getGeneration() == SOUTHERN_ISLANDS;
  }

  /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
  /// SGPR was written by a VALU Instruction.
  bool hasVMEMReadSGPRVALUDefHazard() const {
    return getGeneration() >= VOLCANIC_ISLANDS;
  }

  bool hasRFEHazards() const { return getGeneration() >= VOLCANIC_ISLANDS; }

  /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
  unsigned getSetRegWaitStates() const {
    return getGeneration() <= SEA_ISLANDS ? 1 : 2;
  }

  /// Return the amount of LDS that can be used that will not restrict the
  /// occupancy lower than WaveCount.
  unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
                                           const Function &) const;

  bool supportsMinMaxDenormModes() const {
    return getGeneration() >= AMDGPUSubtarget::GFX9;
  }

  /// \returns If target supports S_DENORM_MODE.
  bool hasDenormModeInst() const {
    return getGeneration() >= AMDGPUSubtarget::GFX10;
  }

  /// \returns If target supports ds_read/write_b128 and user enables generation
  /// of ds_read/write_b128.
  bool useDS128() const { return HasCIInsts && EnableDS128; }

  /// \return If target supports ds_read/write_b96/128.
  bool hasDS96AndDS128() const { return HasCIInsts; }
297
  /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
  bool haveRoundOpsF64() const { return HasCIInsts; }

  /// \returns If MUBUF instructions always perform range checking, even for
  /// buffer resources used for private memory access.
  bool privateMemoryResourceIsRangeChecked() const {
    return getGeneration() < AMDGPUSubtarget::GFX9;
  }

  /// \returns If target requires PRT Struct NULL support (zero result registers
  /// for sparse texture support).
  bool usePRTStrictNull() const { return EnablePRTStrictNull; }

  // The "Enabled" variants below require both the hardware capability and the
  // unaligned-access-mode feature to be active.
  bool hasUnalignedBufferAccessEnabled() const {
    return HasUnalignedBufferAccess && HasUnalignedAccessMode;
  }

  bool hasUnalignedDSAccessEnabled() const {
    return HasUnalignedDSAccess && HasUnalignedAccessMode;
  }

  bool hasUnalignedScratchAccessEnabled() const {
    return HasUnalignedScratchAccess && HasUnalignedAccessMode;
  }

  bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); }

  bool isTgSplitEnabled() const { return EnableTgSplit; }

  bool isCuModeEnabled() const { return EnableCuMode; }

  bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }

  bool hasFlatScrRegister() const { return hasFlatAddressSpace(); }

  // Check if target supports ST addressing mode with FLAT scratch instructions.
  // The ST addressing mode means no registers are used, either VGPR or SGPR,
  // but only immediate offset is swizzled and added to the FLAT scratch base.
  bool hasFlatScratchSTMode() const {
    return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
  }

  bool hasFlatScratchSVSMode() const { return HasGFX940Insts || HasGFX11Insts; }

  bool hasFlatScratchEnabled() const {
    return hasArchitectedFlatScratch() ||
           (EnableFlatScratch && hasFlatScratchInsts());
  }

  bool hasGlobalAddTidInsts() const { return HasGFX10_BEncoding; }

  bool hasAtomicCSub() const { return HasGFX10_BEncoding; }

  bool hasMTBUFInsts() const { return !hasGFX1250Insts(); }

  bool hasFormattedMUBUFInsts() const { return !hasGFX1250Insts(); }

  bool hasExportInsts() const {
    return !hasGFX940Insts() && !hasGFX1250Insts();
  }

  bool hasVINTERPEncoding() const {
    return HasGFX11Insts && !hasGFX1250Insts();
  }

  // DS_ADD_F64/DS_ADD_RTN_F64
  bool hasLdsAtomicAddF64() const {
    return hasGFX90AInsts() || hasGFX1250Insts();
  }

  bool hasMultiDwordFlatScratchAddressing() const {
    return getGeneration() >= GFX9;
  }

  bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; }

  bool hasD16LoadStore() const { return getGeneration() >= GFX9; }

  bool d16PreservesUnusedBits() const {
    return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
  }

  bool hasD16Images() const { return getGeneration() >= VOLCANIC_ISLANDS; }

  /// Return if most LDS instructions have an m0 use that require m0 to be
  /// initialized.
  bool ldsRequiresM0Init() const { return getGeneration() < GFX9; }

  // True if the hardware rewinds and replays GWS operations if a wave is
  // preempted.
  //
  // If this is false, a GWS operation requires testing if a nack set the
  // MEM_VIOL bit, and repeating if so.
  bool hasGWSAutoReplay() const { return getGeneration() >= GFX9; }

  /// \returns if target has ds_gws_sema_release_all instruction.
  bool hasGWSSemaReleaseAll() const { return HasCIInsts; }

  bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }

  bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
400 // Covers VS/PS/CS graphics shaders
401 bool isMesaGfxShader(const Function &F) const {
402 return isMesa3DOS() && AMDGPU::isShader(CC: F.getCallingConv());
403 }
404
  bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; }

  // True if either the returning or non-returning FP32 atomic-add variant
  // exists.
  bool hasAtomicFaddInsts() const {
    return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
  }

  bool vmemWriteNeedsExpWaitcnt() const {
    return getGeneration() < SEA_ISLANDS;
  }

  bool hasInstPrefetch() const {
    return getGeneration() == GFX10 || getGeneration() == GFX11;
  }

  bool hasPrefetch() const { return HasGFX12Insts; }

  // Has s_cmpk_* instructions.
  bool hasSCmpK() const { return getGeneration() < GFX12; }

  // Scratch is allocated in 256 dword per wave blocks for the entire
  // wavefront. When viewed from the perspective of an arbitrary workitem, this
  // is 4-byte aligned.
  //
  // Only 4-byte alignment is really needed to access anything. Transformations
  // on the pointer value itself may rely on the alignment / known low bits of
  // the pointer. Set this to something above the minimum to avoid needing
  // dynamic realignment in common cases.
  Align getStackAlignment() const { return Align(16); }

  bool enableMachineScheduler() const override { return true; }

  bool useAA() const override;

  bool enableSubRegLiveness() const override { return true; }

  void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
  bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }

  // static wrappers
  static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);

  // XXX - Why is this here if it isn't in the default pass set?
  bool enableEarlyIfConversion() const override { return true; }

  void overrideSchedPolicy(MachineSchedPolicy &Policy,
                           const SchedRegion &Region) const override;

  void overridePostRASchedPolicy(MachineSchedPolicy &Policy,
                                 const SchedRegion &Region) const override;

  void mirFileLoaded(MachineFunction &MF) const override;
457 unsigned getMaxNumUserSGPRs() const {
458 return AMDGPU::getMaxNumUserSGPRs(STI: *this);
459 }
460
  bool useVGPRIndexMode() const;

  bool hasScalarCompareEq64() const {
    return getGeneration() >= VOLCANIC_ISLANDS;
  }

  bool hasLDSFPAtomicAddF32() const { return HasGFX8Insts; }
  bool hasLDSFPAtomicAddF64() const {
    return HasGFX90AInsts || HasGFX1250Insts;
  }

  /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
  bool hasPermLaneX16() const { return getGeneration() >= GFX10; }

  /// \returns true if the subtarget has the v_permlane64_b32 instruction.
  bool hasPermLane64() const { return getGeneration() >= GFX11; }

  bool hasDPPRowShare() const {
    return HasDPP && (HasGFX90AInsts || getGeneration() >= GFX10);
  }

  // Has V_PK_MOV_B32 opcode
  bool hasPkMovB32() const { return HasGFX90AInsts; }

  bool hasFmaakFmamkF32Insts() const {
    return getGeneration() >= GFX10 || hasGFX940Insts();
  }

  bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); }

  bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
493 unsigned getNSAMaxSize(bool HasSampler = false) const {
494 return AMDGPU::getNSAMaxSize(STI: *this, HasSampler);
495 }
496
  bool hasMadF16() const;

  bool hasMovB64() const { return HasGFX940Insts || HasGFX1250Insts; }

  // Scalar and global loads support scale_offset bit.
  bool hasScaleOffset() const { return HasGFX1250Insts; }

  // FLAT GLOBAL VOffset is signed
  bool hasSignedGVSOffset() const { return HasGFX1250Insts; }

  bool loadStoreOptEnabled() const { return EnableLoadStoreOpt; }

  bool hasUserSGPRInit16BugInWave32() const {
    return HasUserSGPRInit16Bug && isWave32();
  }

  bool has12DWordStoreHazard() const {
    return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // \returns true if the subtarget supports DWORDX3 load/store instructions.
  bool hasDwordx3LoadStores() const { return HasCIInsts; }

  bool hasReadM0MovRelInterpHazard() const {
    return getGeneration() == AMDGPUSubtarget::GFX9;
  }

  bool hasReadM0SendMsgHazard() const {
    return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
           getGeneration() <= AMDGPUSubtarget::GFX9;
  }

  bool hasReadM0LdsDmaHazard() const {
    return getGeneration() == AMDGPUSubtarget::GFX9;
  }

  bool hasReadM0LdsDirectHazard() const {
    return getGeneration() == AMDGPUSubtarget::GFX9;
  }

  bool hasLDSMisalignedBugInWGPMode() const {
    return HasLDSMisalignedBug && !EnableCuMode;
  }

  // Shift amount of a 64 bit shift cannot be a highest allocated register
  // if also at the end of the allocation block.
  bool hasShift64HighRegBug() const { return HasGFX90AInsts; }

  // Has one cycle hazard on transcendental instruction feeding a
  // non transcendental VALU.
  bool hasTransForwardingHazard() const { return HasGFX940Insts; }

  // Has one cycle hazard on a VALU instruction partially writing dst with
  // a shift of result bits feeding another VALU instruction.
  bool hasDstSelForwardingHazard() const { return HasGFX940Insts; }

  // Cannot use op_sel with v_dot instructions.
  bool hasDOTOpSelHazard() const { return HasGFX940Insts || HasGFX11Insts; }

  // Does not have HW interlocs for VALU writing and then reading SGPRs.
  bool hasVDecCoExecHazard() const { return HasGFX940Insts; }

  // S_CLAUSE is supported iff a nonzero maximum clause length is configured.
  bool hasHardClauses() const { return MaxHardClauseLength > 0; }
560
  bool hasFPAtomicToDenormModeHazard() const {
    return getGeneration() == GFX10;
  }

  bool hasVOP3DPP() const { return getGeneration() >= GFX11; }

  bool hasLdsDirect() const { return getGeneration() >= GFX11; }

  bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }

  bool hasVALUPartialForwardingHazard() const {
    return getGeneration() == GFX11;
  }

  bool hasCvtScaleForwardingHazard() const { return HasGFX950Insts; }

  // All GFX9 targets experience a fetch delay when an instruction at the start
  // of a loop header is split by a 32-byte fetch window boundary, but GFX950
  // is uniquely sensitive to this: the delay triggers further performance
  // degradation beyond the fetch latency itself.
  bool hasLoopHeadInstSplitSensitivity() const { return HasGFX950Insts; }

  bool requiresCodeObjectV6() const { return RequiresCOV6; }

  bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }

  bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }

  bool hasVALUReadSGPRHazard() const {
    return HasGFX12Insts && !HasGFX1250Insts;
  }

  bool setRegModeNeedsVNOPs() const {
    return HasGFX1250Insts && getGeneration() == GFX12;
  }

  /// Return if operations acting on VGPR tuples require even alignment.
  bool needsAlignedVGPRs() const { return RequiresAlignVGPR; }

  /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
  bool hasSPackHL() const { return HasGFX11Insts; }

  /// Return true if the target's EXP instruction has the COMPR flag, which
  /// affects the meaning of the EN (enable) bits.
  bool hasCompressedExport() const { return !HasGFX11Insts; }

  /// Return true if the target's EXP instruction supports the NULL export
  /// target.
  bool hasNullExportTarget() const { return !HasGFX11Insts; }

  bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }

  /// Return true if the target has the S_DELAY_ALU instruction.
  bool hasDelayAlu() const { return HasGFX11Insts; }

  /// Returns true if the target supports
  /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
  /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
  bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); }

  /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
  /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
  bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }

  /// \returns true if inline constants are not supported for F16 pseudo
  /// scalar transcendentals.
  bool hasNoF16PseudoScalarTransInlineConstants() const {
    return getGeneration() == GFX12;
  }

  /// \returns true if the target has packed f32 instructions that only read 32
  /// bits from a scalar operand (SGPR or literal) and replicates the bits to
  /// both channels.
  bool hasPKF32InstsReplicatingLower32BitsOfScalarInput() const {
    return getGeneration() == GFX12 && HasGFX1250Insts;
  }

  bool hasAddPC64Inst() const { return HasGFX1250Insts; }

  /// \returns true if the target supports expert scheduling mode 2 which relies
  /// on the compiler to insert waits to avoid hazards between VMEM and VALU
  /// instructions in some instances.
  bool hasExpertSchedulingMode() const { return getGeneration() >= GFX12; }

  /// \returns The maximum number of instructions that can be enclosed in an
  /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
  /// instruction.
  unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
649
  /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
  /// SGPRs
  unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;

  /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
  /// VGPRs
  unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
                                    unsigned DynamicVGPRBlockSize) const;

  /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
  /// be achieved when the only function running on a CU is \p F, each workgroup
  /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
  /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
  /// range, so this returns a range as well.
  ///
  /// Note that occupancy can be affected by the scratch allocation as well, but
  /// we do not have enough information to compute it.
  std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
                                                 unsigned LDSSize = 0,
                                                 unsigned NumSGPRs = 0,
                                                 unsigned NumVGPRs = 0) const;

  /// \returns true if the flat_scratch register should be initialized with the
  /// pointer to the wave's scratch memory rather than a size and offset.
  bool flatScratchIsPointer() const {
    return getGeneration() >= AMDGPUSubtarget::GFX9;
  }

  /// \returns true if the machine has merged shaders in which s0-s7 are
  /// reserved by the hardware and user SGPRs start at s8
  bool hasMergedShaders() const { return getGeneration() >= GFX9; }

  // \returns true if the target supports the pre-NGG legacy geometry path.
  bool hasLegacyGeometry() const { return getGeneration() < GFX11; }

  // \returns true if the target has split barriers feature
  bool hasSplitBarriers() const { return getGeneration() >= GFX12; }

  // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
  bool hasRrWGMode() const { return getGeneration() >= GFX12; }

  /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
  /// values.
  bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }

  bool hasINVWBL2WaitCntRequirement() const { return HasGFX1250Insts; }

  bool hasVOPD3() const { return HasGFX1250Insts; }

  // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
  bool hasVectorMulU64() const { return HasGFX1250Insts; }

  // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32
  // instructions.
  bool hasMadU64U32NoCarry() const { return HasGFX1250Insts; }

  // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions.
  bool hasIntMinMax64() const { return HasGFX1250Insts; }

  // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
  bool hasPkMinMax3Insts() const { return HasGFX1250Insts; }

  // \returns true if target has S_GET_SHADER_CYCLES_U64 instruction.
  bool hasSGetShaderCyclesInst() const { return HasGFX1250Insts; }

  // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
  // of sign-extending. Note that GFX1250 has not only fixed the bug but also
  // extended VA to 57 bits.
  bool hasGetPCZeroExtension() const {
    return HasGFX12Insts && !HasGFX1250Insts;
  }

  // \returns true if the target needs to create a prolog for backward
  // compatibility when preloading kernel arguments.
  bool needsKernArgPreloadProlog() const {
    return hasKernargPreload() && !HasGFX1250Insts;
  }

  bool hasCondSubInsts() const { return HasGFX12Insts; }

  bool hasSubClampInsts() const { return hasGFX10_3Insts(); }
732 /// \returns SGPR allocation granularity supported by the subtarget.
733 unsigned getSGPRAllocGranule() const {
734 return AMDGPU::IsaInfo::getSGPRAllocGranule(STI: this);
735 }
736
737 /// \returns SGPR encoding granularity supported by the subtarget.
738 unsigned getSGPREncodingGranule() const {
739 return AMDGPU::IsaInfo::getSGPREncodingGranule(STI: this);
740 }
741
742 /// \returns Total number of SGPRs supported by the subtarget.
743 unsigned getTotalNumSGPRs() const {
744 return AMDGPU::IsaInfo::getTotalNumSGPRs(STI: this);
745 }
746
747 /// \returns Addressable number of SGPRs supported by the subtarget.
748 unsigned getAddressableNumSGPRs() const {
749 return AMDGPU::IsaInfo::getAddressableNumSGPRs(STI: this);
750 }
751
752 /// \returns Minimum number of SGPRs that meets the given number of waves per
753 /// execution unit requirement supported by the subtarget.
754 unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
755 return AMDGPU::IsaInfo::getMinNumSGPRs(STI: this, WavesPerEU);
756 }
757
758 /// \returns Maximum number of SGPRs that meets the given number of waves per
759 /// execution unit requirement supported by the subtarget.
760 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
761 return AMDGPU::IsaInfo::getMaxNumSGPRs(STI: this, WavesPerEU, Addressable);
762 }
763
  /// \returns Reserved number of SGPRs. This is common
  /// utility function called by MachineFunction and
  /// Function variants of getReservedNumSGPRs.
  unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;

  /// \returns Reserved number of SGPRs for given machine function \p MF.
  unsigned getReservedNumSGPRs(const MachineFunction &MF) const;

  /// \returns Reserved number of SGPRs for given function \p F.
  unsigned getReservedNumSGPRs(const Function &F) const;

  /// \returns Maximum number of preloaded SGPRs for the subtarget.
  unsigned getMaxNumPreloadedSGPRs() const;

  /// \returns max num SGPRs. This is the common utility
  /// function called by MachineFunction and Function
  /// variants of getMaxNumSGPRs.
  unsigned getBaseMaxNumSGPRs(const Function &F,
                              std::pair<unsigned, unsigned> WavesPerEU,
                              unsigned PreloadedSGPRs,
                              unsigned ReservedNumSGPRs) const;

  /// \returns Maximum number of SGPRs that meets number of waves per execution
  /// unit requirement for function \p MF, or number of SGPRs explicitly
  /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
  ///
  /// \returns Value that meets number of waves per execution unit requirement
  /// if explicitly requested value cannot be converted to integer, violates
  /// subtarget's specifications, or does not meet number of waves per execution
  /// unit requirement.
  unsigned getMaxNumSGPRs(const MachineFunction &MF) const;

  /// \returns Maximum number of SGPRs that meets number of waves per execution
  /// unit requirement for function \p F, or number of SGPRs explicitly
  /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
  ///
  /// \returns Value that meets number of waves per execution unit requirement
  /// if explicitly requested value cannot be converted to integer, violates
  /// subtarget's specifications, or does not meet number of waves per execution
  /// unit requirement.
  unsigned getMaxNumSGPRs(const Function &F) const;
804
805 /// \returns VGPR allocation granularity supported by the subtarget.
806 unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
807 return AMDGPU::IsaInfo::getVGPRAllocGranule(STI: this, DynamicVGPRBlockSize);
808 }
809
810 /// \returns VGPR encoding granularity supported by the subtarget.
811 unsigned getVGPREncodingGranule() const {
812 return AMDGPU::IsaInfo::getVGPREncodingGranule(STI: this);
813 }
814
815 /// \returns Total number of VGPRs supported by the subtarget.
816 unsigned getTotalNumVGPRs() const {
817 return AMDGPU::IsaInfo::getTotalNumVGPRs(STI: this);
818 }
819
820 /// \returns Addressable number of architectural VGPRs supported by the
821 /// subtarget.
822 unsigned getAddressableNumArchVGPRs() const {
823 return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(STI: this);
824 }
825
826 /// \returns Addressable number of VGPRs supported by the subtarget.
827 unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
828 return AMDGPU::IsaInfo::getAddressableNumVGPRs(STI: this, DynamicVGPRBlockSize);
829 }
830
831 /// \returns the minimum number of VGPRs that will prevent achieving more than
832 /// the specified number of waves \p WavesPerEU.
833 unsigned getMinNumVGPRs(unsigned WavesPerEU,
834 unsigned DynamicVGPRBlockSize) const {
835 return AMDGPU::IsaInfo::getMinNumVGPRs(STI: this, WavesPerEU,
836 DynamicVGPRBlockSize);
837 }
838
839 /// \returns the maximum number of VGPRs that can be used and still achieved
840 /// at least the specified number of waves \p WavesPerEU.
841 unsigned getMaxNumVGPRs(unsigned WavesPerEU,
842 unsigned DynamicVGPRBlockSize) const {
843 return AMDGPU::IsaInfo::getMaxNumVGPRs(STI: this, WavesPerEU,
844 DynamicVGPRBlockSize);
845 }
846
  /// \returns max num VGPRs. This is the common utility function
  /// called by MachineFunction and Function variants of getMaxNumVGPRs.
  unsigned
  getBaseMaxNumVGPRs(const Function &F,
                     std::pair<unsigned, unsigned> NumVGPRBounds) const;

  /// \returns Maximum number of VGPRs that meets number of waves per execution
  /// unit requirement for function \p F, or number of VGPRs explicitly
  /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
  ///
  /// \returns Value that meets number of waves per execution unit requirement
  /// if explicitly requested value cannot be converted to integer, violates
  /// subtarget's specifications, or does not meet number of waves per execution
  /// unit requirement.
  unsigned getMaxNumVGPRs(const Function &F) const;

  // AGPRs share the same per-function budget as VGPRs.
  unsigned getMaxNumAGPRs(const Function &F) const { return getMaxNumVGPRs(F); }

  /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
  /// of waves per execution unit required for the function \p MF.
  std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const;

  /// \returns Maximum number of VGPRs that meets number of waves per execution
  /// unit requirement for function \p MF, or number of VGPRs explicitly
  /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
  ///
  /// \returns Value that meets number of waves per execution unit requirement
  /// if explicitly requested value cannot be converted to integer, violates
  /// subtarget's specifications, or does not meet number of waves per execution
  /// unit requirement.
  unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
878
879 bool supportsWave32() const { return getGeneration() >= GFX10; }
880
881 bool supportsWave64() const { return !hasGFX1250Insts(); }
882
883 bool isWave32() const { return getWavefrontSize() == 32; }
884
885 bool isWave64() const { return getWavefrontSize() == 64; }
886
887 /// Returns if the wavesize of this subtarget is known reliable. This is false
888 /// only for the a default target-cpu that does not have an explicit
889 /// +wavefrontsize target feature.
890 bool isWaveSizeKnown() const {
891 return hasFeature(Feature: AMDGPU::FeatureWavefrontSize32) ||
892 hasFeature(Feature: AMDGPU::FeatureWavefrontSize64);
893 }
894
895 const TargetRegisterClass *getBoolRC() const {
896 return getRegisterInfo()->getBoolRC();
897 }
898
899 /// \returns Maximum number of work groups per compute unit supported by the
900 /// subtarget and limited by given \p FlatWorkGroupSize.
901 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
902 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(STI: this, FlatWorkGroupSize);
903 }
904
905 /// \returns Minimum flat work group size supported by the subtarget.
906 unsigned getMinFlatWorkGroupSize() const override {
907 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(STI: this);
908 }
909
910 /// \returns Maximum flat work group size supported by the subtarget.
911 unsigned getMaxFlatWorkGroupSize() const override {
912 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize();
913 }
914
915 /// \returns Number of waves per execution unit required to support the given
916 /// \p FlatWorkGroupSize.
917 unsigned
918 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
919 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(STI: this, FlatWorkGroupSize);
920 }
921
922 /// \returns Minimum number of waves per execution unit supported by the
923 /// subtarget.
924 unsigned getMinWavesPerEU() const override {
925 return AMDGPU::IsaInfo::getMinWavesPerEU(STI: this);
926 }
927
  /// Target hook to adjust the scheduling dependency \p Dep between \p Def and
  /// \p Use (e.g. its latency); defined out of line.
  void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
                             SDep &Dep,
                             const TargetSchedModel *SchedModel) const override;

  // \returns true if it's beneficial on this subtarget (GFX11+) for the
  // scheduler to cluster stores as well as loads.
  bool shouldClusterStores() const { return getGeneration() >= GFX11; }

  // \returns the number of address arguments from which to enable MIMG NSA
  // on supported architectures; defined out of line.
  unsigned getNSAThreshold(const MachineFunction &MF) const;
939
940 // \returns true if the subtarget has a hazard requiring an "s_nop 0"
941 // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
942 bool requiresNopBeforeDeallocVGPRs() const { return !HasGFX1250Insts; }
943
944 // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on
945 // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER.
946 bool requiresWaitIdleBeforeGetReg() const { return HasGFX1250Insts; }
947
948 bool isDynamicVGPREnabled() const { return DynamicVGPR; }
949 unsigned getDynamicVGPRBlockSize() const {
950 return DynamicVGPRBlockSize32 ? 32 : 16;
951 }
952
953 bool requiresDisjointEarlyClobberAndUndef() const override {
954 // AMDGPU doesn't care if early-clobber and undef operands are allocated
955 // to the same register.
956 return false;
957 }
958
959 // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything
960 // and surronded by S_WAIT_ALU(0xFFE3).
961 bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const {
962 return getGeneration() == GFX12;
963 }
964
965 // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base
966 // read.
967 bool hasScratchBaseForwardingHazard() const {
968 return HasGFX1250Insts && getGeneration() == GFX12;
969 }
970
971 // src_flat_scratch_hi cannot be used as a source in SALU producing a 64-bit
972 // result.
973 bool hasFlatScratchHiInB64InstHazard() const {
974 return HasGFX1250Insts && getGeneration() == GFX12;
975 }
976
977 /// \returns true if the subtarget requires a wait for xcnt before VMEM
978 /// accesses that must never be repeated in the event of a page fault/re-try.
979 /// Atomic stores/rmw and all volatile accesses fall under this criteria.
980 bool requiresWaitXCntForSingleAccessInstructions() const {
981 return HasGFX1250Insts;
982 }
983
984 /// \returns the number of significant bits in the immediate field of the
985 /// S_NOP instruction.
986 unsigned getSNopBits() const {
987 if (getGeneration() >= AMDGPUSubtarget::GFX12)
988 return 7;
989 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
990 return 4;
991 return 3;
992 }
993
994 bool supportsBPermute() const {
995 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
996 }
997
998 bool supportsWaveWideBPermute() const {
999 return (getGeneration() <= AMDGPUSubtarget::GFX9 ||
1000 getGeneration() == AMDGPUSubtarget::GFX12) ||
1001 isWave32();
1002 }
1003
1004 /// Return true if real (non-fake) variants of True16 instructions using
1005 /// 16-bit registers should be code-generated. Fake True16 instructions are
1006 /// identical to non-fake ones except that they take 32-bit registers as
1007 /// operands and always use their low halves.
1008 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
1009 // supported and the support for fake True16 instructions is removed.
1010 bool useRealTrue16Insts() const {
1011 return hasTrue16BitInsts() && EnableRealTrue16Insts;
1012 }
1013
1014 bool requiresWaitOnWorkgroupReleaseFence() const {
1015 return getGeneration() >= GFX10 || isTgSplitEnabled();
1016 }
1017};
1018
/// Tracks which user SGPR inputs are required by a function and how many user
/// SGPRs they occupy, including SGPRs reserved for kernarg preloading.
class GCNUserSGPRUsageInfo {
public:
  /// \returns true if the implicit buffer pointer input is required.
  bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }

  /// \returns true if the private segment buffer input is required.
  bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }

  /// \returns true if the dispatch packet pointer input is required.
  bool hasDispatchPtr() const { return DispatchPtr; }

  /// \returns true if the queue pointer input is required.
  bool hasQueuePtr() const { return QueuePtr; }

  /// \returns true if the kernarg segment pointer input is required.
  bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }

  /// \returns true if the dispatch ID input is required.
  bool hasDispatchID() const { return DispatchID; }

  /// \returns true if the flat scratch init input is required.
  bool hasFlatScratchInit() const { return FlatScratchInit; }

  /// \returns true if the private segment size input is required.
  bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }

  /// \returns the number of user SGPRs allocated for kernarg preloading.
  unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }

  /// \returns the total number of user SGPRs in use.
  unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }

  /// \returns the number of user SGPRs still available for allocation
  /// (defined out of line).
  unsigned getNumFreeUserSGPRs();

  /// Reserves \p NumSGPRs user SGPRs for kernarg preloading (defined out of
  /// line).
  void allocKernargPreloadSGPRs(unsigned NumSGPRs);

  /// Identifiers for the individual user SGPR input fields.
  enum UserSGPRID : unsigned {
    ImplicitBufferPtrID = 0,
    PrivateSegmentBufferID = 1,
    DispatchPtrID = 2,
    QueuePtrID = 3,
    KernargSegmentPtrID = 4,
    DispatchIdID = 5,
    FlatScratchInitID = 6,
    PrivateSegmentSizeID = 7
  };

  // Returns the size in number of SGPRs occupied by the user SGPR field \p ID
  // (pointers take 2, the private segment buffer descriptor takes 4, the
  // private segment size takes 1).
  static unsigned getNumUserSGPRForField(UserSGPRID ID) {
    switch (ID) {
    case ImplicitBufferPtrID:
      return 2;
    case PrivateSegmentBufferID:
      return 4;
    case DispatchPtrID:
      return 2;
    case QueuePtrID:
      return 2;
    case KernargSegmentPtrID:
      return 2;
    case DispatchIdID:
      return 2;
    case FlatScratchInitID:
      return 2;
    case PrivateSegmentSizeID:
      return 1;
    }
    llvm_unreachable("Unknown UserSGPRID.");
  }

  /// Computes the user SGPR usage for function \p F on subtarget \p ST
  /// (defined out of line).
  GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);

private:
  // Subtarget the usage information was computed for.
  const GCNSubtarget &ST;

  // Private memory buffer
  // Compute directly in sgpr[0:1]
  // Other shaders indirect 64-bits at sgpr[0:1]
  bool ImplicitBufferPtr = false;

  bool PrivateSegmentBuffer = false;

  bool DispatchPtr = false;

  bool QueuePtr = false;

  bool KernargSegmentPtr = false;

  bool DispatchID = false;

  bool FlatScratchInit = false;

  bool PrivateSegmentSize = false;

  // Number of user SGPRs reserved for kernarg preloading.
  unsigned NumKernargPreloadSGPRs = 0;

  // Total number of user SGPRs currently in use.
  unsigned NumUsedUserSGPRs = 0;
};
1107
1108} // end namespace llvm
1109
1110#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1111