1//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// AMD GCN specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16
17#include "AMDGPUCallLowering.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUSubtarget.h"
20#include "SIFrameLowering.h"
21#include "SIISelLowering.h"
22#include "SIInstrInfo.h"
23#include "Utils/AMDGPUBaseInfo.h"
24#include "llvm/Support/ErrorHandling.h"
25
26#define GET_SUBTARGETINFO_HEADER
27#include "AMDGPUGenSubtargetInfo.inc"
28
29namespace llvm {
30
31class GCNTargetMachine;
32
33class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
34 public AMDGPUSubtarget {
35public:
  using AMDGPUSubtarget::getMaxWavesPerEU;

  // Following 2 enums are documented at:
  // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  enum class TrapHandlerAbi {
    NONE = 0x00,   // No trap handler ABI (traps lower to something else).
    AMDHSA = 0x01, // amdhsa OS trap handler ABI.
  };

  // Trap codes passed to the trap handler under the amdhsa ABI.
  enum class TrapID {
    LLVMAMDHSATrap = 0x02,      // Code used for llvm.trap.
    LLVMAMDHSADebugTrap = 0x03, // Code used for llvm.debugtrap.
  };
49
private:
  /// SelectionDAGISel related APIs.
  std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;

  /// GlobalISel related APIs.
  std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
  std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
  std::unique_ptr<InstructionSelector> InstSelector;
  std::unique_ptr<LegalizerInfo> Legalizer;
  std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;

protected:
  // Basic subtarget description.
  AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
  // Generation enum value; INVALID until subtarget features are parsed.
  unsigned Gen = INVALID;
  InstrItineraryData InstrItins;
  // Number of LDS banks; 0 until initialized from the target description.
  int LDSBankCount = 0;
  // Largest element size (bytes) used for private (scratch) memory access;
  // see getMaxPrivateElementSize().
  unsigned MaxPrivateElementSize = 0;

  // Dynamically set bits that enable features.
  bool DynamicVGPR = false;
  bool DynamicVGPRBlockSize32 = false;
  // Set via setScalarizeGlobalBehavior(); queried by
  // getScalarizeGlobalBehavior().
  bool ScalarizeGlobal = false;

  /// The maximum number of instructions that may be placed within an S_CLAUSE,
  /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
  /// indicates a lack of S_CLAUSE support.
  unsigned MaxHardClauseLength = 0;

  // Declare a boolean member (initialized to its TableGen default) for every
  // subtarget feature attribute.
#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
  bool ATTRIBUTE = DEFAULT;
#include "AMDGPUGenSubtargetInfo.inc"
87
public:
  GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
               const GCNTargetMachine &TM);
  ~GCNSubtarget() override;

  GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, StringRef GPU,
                                                StringRef FS);

  /// Diagnose inconsistent subtarget features before attempting to codegen
  /// function \p F.
  void checkSubtargetFeatures(const Function &F) const;

  const SIInstrInfo *getInstrInfo() const override { return &InstrInfo; }

  const SIFrameLowering *getFrameLowering() const override {
    return &FrameLowering;
  }

  const SITargetLowering *getTargetLowering() const override { return &TLInfo; }

  const SIRegisterInfo *getRegisterInfo() const override {
    return &InstrInfo.getRegisterInfo();
  }

  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;

  // GlobalISel interface accessors. These return null until the corresponding
  // objects have been created.
  const CallLowering *getCallLowering() const override {
    return CallLoweringInfo.get();
  }

  const InlineAsmLowering *getInlineAsmLowering() const override {
    return InlineAsmLoweringInfo.get();
  }

  InstructionSelector *getInstructionSelector() const override {
    return InstSelector.get();
  }

  const LegalizerInfo *getLegalizerInfo() const override {
    return Legalizer.get();
  }

  const AMDGPURegisterBankInfo *getRegBankInfo() const override {
    return RegBankInfo.get();
  }

  const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
    return TargetID;
  }

  const InstrItineraryData *getInstrItineraryData() const override {
    return &InstrItins;
  }

  // Parses the subtarget feature string, setting the feature members above.
  void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);

  Generation getGeneration() const { return (Generation)Gen; }

  bool isGFX11Plus() const { return getGeneration() >= GFX11; }

  // Emit an overriding getter for every TableGen'd subtarget feature
  // attribute declared above.
#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
  bool GETTER() const override { return ATTRIBUTE; }
#include "AMDGPUGenSubtargetInfo.inc"
151
152 unsigned getMaxWaveScratchSize() const {
153 // See COMPUTE_TMPRING_SIZE.WAVESIZE.
154 if (getGeneration() >= GFX12) {
155 // 18-bit field in units of 64-dword.
156 return (64 * 4) * ((1 << 18) - 1);
157 }
158 if (getGeneration() == GFX11) {
159 // 15-bit field in units of 64-dword.
160 return (64 * 4) * ((1 << 15) - 1);
161 }
162 // 13-bit field in units of 256-dword.
163 return (256 * 4) * ((1 << 13) - 1);
164 }
165
166 /// Return the number of high bits known to be zero for a frame index.
167 unsigned getKnownHighZeroBitsForFrameIndex() const {
168 return llvm::countl_zero(Val: getMaxWaveScratchSize()) + getWavefrontSizeLog2();
169 }
170
  /// \returns the number of LDS memory banks on this subtarget.
  int getLDSBankCount() const { return LDSBankCount; }

  /// \returns the maximum element size, in bytes, for a private (scratch)
  /// memory access. When flat scratch is enabled (and \p ForBufferRSrc is
  /// false), 16-byte accesses are allowed regardless of the configured
  /// MaxPrivateElementSize.
  unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
    return (ForBufferRSrc || !hasFlatScratchEnabled()) ? MaxPrivateElementSize
                                                       : 16;
  }

  /// \returns the constant bus usage limit for the given opcode.
  unsigned getConstantBusLimit(unsigned Opcode) const;

  /// Returns if the result of this instruction with a 16-bit result returned in
  /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
  /// the original value.
  bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
184
  // True if the target supports workgroup-processor (WGP) mode: GFX10 and
  // later, except GFX1250 targets.
  bool supportsWGP() const {
    if (HasGFX1250Insts)
      return false;
    return getGeneration() >= GFX10;
  }

  // True if the hardware has 64-bit floating point support.
  bool hasHWFP64() const { return HasFP64; }

  // ADDR64 addressing is only available before Volcanic Islands.
  bool hasAddr64() const {
    return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
  }

  // FLAT instructions are available on everything after Southern Islands.
  bool hasFlat() const {
    return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
  }

  // Return true if the target only has the reverse operand versions of VALU
  // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
  bool hasOnlyRevVALUShifts() const {
    return getGeneration() >= VOLCANIC_ISLANDS;
  }

  // True if v_fract is affected by a hardware bug (Southern Islands only).
  bool hasFractBug() const { return getGeneration() == SOUTHERN_ISLANDS; }

  bool hasMed3_16() const { return getGeneration() >= AMDGPUSubtarget::GFX9; }

  bool hasMin3Max3_16() const {
    return getGeneration() >= AMDGPUSubtarget::GFX9;
  }

  bool hasSwap() const { return HasGFX9Insts; }

  bool hasScalarPackInsts() const { return HasGFX9Insts; }

  bool hasScalarMulHiInsts() const { return HasGFX9Insts; }

  // Scalar loads of sub-dword sizes are available from GFX12 onward.
  bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }

  // The trap handler ABI is determined solely by the OS: amdhsa or none.
  TrapHandlerAbi getTrapHandlerAbi() const {
    return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
  }

  bool supportsGetDoorbellID() const {
    // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
    return getGeneration() >= GFX9;
  }
231
  /// True if the offset field of DS instructions works as expected. On SI, the
  /// offset uses a 16-bit adder and does not always wrap properly.
  bool hasUsableDSOffset() const { return getGeneration() >= SEA_ISLANDS; }

  // Whether the user explicitly opted in to unsafe DS offset folding.
  bool unsafeDSOffsetFoldingEnabled() const {
    return EnableUnsafeDSOffsetFolding;
  }

  /// Condition output from div_scale is usable.
  bool hasUsableDivScaleConditionOutput() const {
    return getGeneration() != SOUTHERN_ISLANDS;
  }

  /// Extra wait hazard is needed in some cases before
  /// s_cbranch_vccnz/s_cbranch_vccz.
  bool hasReadVCCZBug() const { return getGeneration() <= SEA_ISLANDS; }

  /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
  bool partialVCCWritesUpdateVCCZ() const { return getGeneration() >= GFX10; }

  /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
  /// was written by a VALU instruction.
  bool hasSMRDReadVALUDefHazard() const {
    return getGeneration() == SOUTHERN_ISLANDS;
  }

  /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
  /// SGPR was written by a VALU Instruction.
  bool hasVMEMReadSGPRVALUDefHazard() const {
    return getGeneration() >= VOLCANIC_ISLANDS;
  }

  // Whether S_RFE requires hazard handling (Volcanic Islands and later).
  bool hasRFEHazards() const { return getGeneration() >= VOLCANIC_ISLANDS; }

  /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
  unsigned getSetRegWaitStates() const {
    return getGeneration() <= SEA_ISLANDS ? 1 : 2;
  }
270
  /// Return the amount of LDS that can be used that will not restrict the
  /// occupancy lower than WaveCount.
  unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
                                           const Function &) const;

  // Whether min/max instructions flush denormals per the current mode
  // (GFX9 onward).
  bool supportsMinMaxDenormModes() const {
    return getGeneration() >= AMDGPUSubtarget::GFX9;
  }

  /// \returns If target supports S_DENORM_MODE.
  bool hasDenormModeInst() const {
    return getGeneration() >= AMDGPUSubtarget::GFX10;
  }

  /// \returns If target supports ds_read/write_b128 and user enables generation
  /// of ds_read/write_b128.
  bool useDS128() const { return HasCIInsts && EnableDS128; }

  /// \return If target supports ds_read/write_b96/128.
  bool hasDS96AndDS128() const { return HasCIInsts; }

  /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
  bool haveRoundOpsF64() const { return HasCIInsts; }

  /// \returns If MUBUF instructions always perform range checking, even for
  /// buffer resources used for private memory access.
  bool privateMemoryResourceIsRangeChecked() const {
    return getGeneration() < AMDGPUSubtarget::GFX9;
  }

  /// \returns If target requires PRT Struct NULL support (zero result registers
  /// for sparse texture support).
  bool usePRTStrictNull() const { return EnablePRTStrictNull; }
304
  // The *Enabled predicates below require both the hardware capability and
  // the unaligned-access-mode feature to be on.
  bool hasUnalignedBufferAccessEnabled() const {
    return HasUnalignedBufferAccess && HasUnalignedAccessMode;
  }

  bool hasUnalignedDSAccessEnabled() const {
    return HasUnalignedDSAccess && HasUnalignedAccessMode;
  }

  bool hasUnalignedScratchAccessEnabled() const {
    return HasUnalignedScratchAccess && HasUnalignedAccessMode;
  }

  // True if XNACK is on, or if the target ID leaves XNACK unspecified.
  bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); }

  bool isTgSplitEnabled() const { return EnableTgSplit; }

  bool isCuModeEnabled() const { return EnableCuMode; }

  bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }

  bool hasFlatScrRegister() const { return hasFlatAddressSpace(); }

  // Check if target supports ST addressing mode with FLAT scratch instructions.
  // The ST addressing mode means no registers are used, either VGPR or SGPR,
  // but only immediate offset is swizzled and added to the FLAT scratch base.
  bool hasFlatScratchSTMode() const {
    return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
  }

  // SVS addressing: both a VGPR and an SGPR address operand.
  bool hasFlatScratchSVSMode() const { return HasGFX940Insts || HasGFX11Insts; }

  // True when flat scratch instructions will actually be used for scratch
  // access: either forced by architected flat scratch, or opted-in and
  // supported.
  bool hasFlatScratchEnabled() const {
    return hasArchitectedFlatScratch() ||
           (EnableFlatScratch && hasFlatScratchInsts());
  }
340
  bool hasGlobalAddTidInsts() const { return HasGFX10_BEncoding; }

  bool hasAtomicCSub() const { return HasGFX10_BEncoding; }

  // MTBUF and formatted MUBUF instructions were removed on GFX1250.
  bool hasMTBUFInsts() const { return !hasGFX1250Insts(); }

  bool hasFormattedMUBUFInsts() const { return !hasGFX1250Insts(); }

  // EXP instructions are absent on GFX940 and GFX1250.
  bool hasExportInsts() const {
    return !hasGFX940Insts() && !hasGFX1250Insts();
  }

  bool hasVINTERPEncoding() const {
    return HasGFX11Insts && !hasGFX1250Insts();
  }

  // DS_ADD_F64/DS_ADD_RTN_F64
  bool hasLdsAtomicAddF64() const {
    return hasGFX90AInsts() || hasGFX1250Insts();
  }

  bool hasMultiDwordFlatScratchAddressing() const {
    return getGeneration() >= GFX9;
  }

  bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; }

  // D16 (16-bit data) loads/stores are available from GFX9.
  bool hasD16LoadStore() const { return getGeneration() >= GFX9; }

  // The unused half of a D16 destination is preserved unless SRAM-ECC is
  // (or may be) enabled.
  bool d16PreservesUnusedBits() const {
    return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
  }

  bool hasD16Images() const { return getGeneration() >= VOLCANIC_ISLANDS; }

  /// Return if most LDS instructions have an m0 use that require m0 to be
  /// initialized.
  bool ldsRequiresM0Init() const { return getGeneration() < GFX9; }

  // True if the hardware rewinds and replays GWS operations if a wave is
  // preempted.
  //
  // If this is false, a GWS operation requires testing if a nack set the
  // MEM_VIOL bit, and repeating if so.
  bool hasGWSAutoReplay() const { return getGeneration() >= GFX9; }

  /// \returns if target has ds_gws_sema_release_all instruction.
  bool hasGWSSemaReleaseAll() const { return HasCIInsts; }

  // 64-bit scalar add/sub and mul are new in GFX12.
  bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }

  bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
394 // Covers VS/PS/CS graphics shaders
395 bool isMesaGfxShader(const Function &F) const {
396 return isMesa3DOS() && AMDGPU::isShader(CC: F.getCallingConv());
397 }
398
  // GFX1170 is the GFX11 variant distinguished by 128b WMMA support.
  bool isGFX1170() const {
    return getGeneration() == GFX11 && hasWMMA128bInsts();
  }

  bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; }

  // True if either the returning or non-returning form of the global fadd
  // atomic exists.
  bool hasAtomicFaddInsts() const {
    return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
  }

  bool vmemWriteNeedsExpWaitcnt() const {
    return getGeneration() < SEA_ISLANDS;
  }

  // Instruction (I$) prefetch is only present on GFX10 and GFX11.
  bool hasInstPrefetch() const {
    return getGeneration() == GFX10 || getGeneration() == GFX11;
  }

  bool hasPrefetch() const { return HasGFX12Insts; }

  // Has s_cmpk_* instructions.
  bool hasSCmpK() const { return getGeneration() < GFX12; }
421
  // Scratch is allocated in 256 dword per wave blocks for the entire
  // wavefront. When viewed from the perspective of an arbitrary workitem, this
  // is 4-byte aligned.
  //
  // Only 4-byte alignment is really needed to access anything. Transformations
  // on the pointer value itself may rely on the alignment / known low bits of
  // the pointer. Set this to something above the minimum to avoid needing
  // dynamic realignment in common cases.
  Align getStackAlignment() const { return Align(16); }

  bool enableMachineScheduler() const override { return true; }

  bool useAA() const override;

  // Sub-register liveness tracking is always on for this target.
  bool enableSubRegLiveness() const override { return true; }

  // Accessors for the ScalarizeGlobal flag set by the target machine.
  void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
  bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }

  // static wrappers
  static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);

  // XXX - Why is this here if it isn't in the default pass set?
  bool enableEarlyIfConversion() const override { return true; }

  // Scheduling policy hooks for the pre- and post-RA machine schedulers.
  void overrideSchedPolicy(MachineSchedPolicy &Policy,
                           const SchedRegion &Region) const override;

  void overridePostRASchedPolicy(MachineSchedPolicy &Policy,
                                 const SchedRegion &Region) const override;

  void mirFileLoaded(MachineFunction &MF) const override;
454
455 unsigned getMaxNumUserSGPRs() const {
456 return AMDGPU::getMaxNumUserSGPRs(STI: *this);
457 }
458
  bool useVGPRIndexMode() const;

  // 64-bit scalar compare-equal is available from Volcanic Islands.
  bool hasScalarCompareEq64() const {
    return getGeneration() >= VOLCANIC_ISLANDS;
  }

  // LDS floating-point atomic add availability.
  bool hasLDSFPAtomicAddF32() const { return HasGFX8Insts; }
  bool hasLDSFPAtomicAddF64() const {
    return HasGFX90AInsts || HasGFX1250Insts;
  }

  /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
  bool hasPermLaneX16() const { return getGeneration() >= GFX10; }

  /// \returns true if the subtarget has the v_permlane64_b32 instruction.
  bool hasPermLane64() const { return getGeneration() >= GFX11; }

  // DPP row_share is a GFX90A/GFX10+ DPP control.
  bool hasDPPRowShare() const {
    return HasDPP && (HasGFX90AInsts || getGeneration() >= GFX10);
  }

  // Has V_PK_MOV_B32 opcode
  bool hasPkMovB32() const { return HasGFX90AInsts; }

  // v_fmaak_f32/v_fmamk_f32 (fma with inline 32-bit literal).
  bool hasFmaakFmamkF32Insts() const {
    return getGeneration() >= GFX10 || hasGFX940Insts();
  }

  bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); }

  // Pre-GFX12 image instructions have a non-NSA (sequential VGPR) encoding.
  bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
490
491 unsigned getNSAMaxSize(bool HasSampler = false) const {
492 return AMDGPU::getNSAMaxSize(STI: *this, HasSampler);
493 }
494
  bool hasMadF16() const;

  // V_MOV_B64 exists on GFX940 and GFX1250.
  bool hasMovB64() const { return HasGFX940Insts || HasGFX1250Insts; }

  // Scalar and global loads support scale_offset bit.
  bool hasScaleOffset() const { return HasGFX1250Insts; }

  // FLAT GLOBAL VOffset is signed
  bool hasSignedGVSOffset() const { return HasGFX1250Insts; }

  bool loadStoreOptEnabled() const { return EnableLoadStoreOpt; }

  // The user-SGPR-init bug only manifests in wave32 mode.
  bool hasUserSGPRInit16BugInWave32() const {
    return HasUserSGPRInit16Bug && isWave32();
  }

  bool has12DWordStoreHazard() const {
    return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // \returns true if the subtarget supports DWORDX3 load/store instructions.
  bool hasDwordx3LoadStores() const { return HasCIInsts; }

  // GFX9-specific hazards on reads of m0 after it is written by various
  // producers (v_movrel/interp, s_sendmsg, LDS DMA, LDS direct).
  bool hasReadM0MovRelInterpHazard() const {
    return getGeneration() == AMDGPUSubtarget::GFX9;
  }

  bool hasReadM0SendMsgHazard() const {
    return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
           getGeneration() <= AMDGPUSubtarget::GFX9;
  }

  bool hasReadM0LdsDmaHazard() const {
    return getGeneration() == AMDGPUSubtarget::GFX9;
  }

  bool hasReadM0LdsDirectHazard() const {
    return getGeneration() == AMDGPUSubtarget::GFX9;
  }

  // The LDS misalignment bug only applies when running in WGP (non-CU) mode.
  bool hasLDSMisalignedBugInWGPMode() const {
    return HasLDSMisalignedBug && !EnableCuMode;
  }
538
  // Shift amount of a 64 bit shift cannot be a highest allocated register
  // if also at the end of the allocation block.
  bool hasShift64HighRegBug() const {
    return HasGFX90AInsts && !HasGFX940Insts;
  }

  // Has one cycle hazard on transcendental instruction feeding a
  // non transcendental VALU.
  bool hasTransForwardingHazard() const { return HasGFX940Insts; }

  // Has one cycle hazard on a VALU instruction partially writing dst with
  // a shift of result bits feeding another VALU instruction.
  bool hasDstSelForwardingHazard() const { return HasGFX940Insts; }

  // Cannot use op_sel with v_dot instructions.
  bool hasDOTOpSelHazard() const { return HasGFX940Insts || HasGFX11Insts; }

  // Does not have HW interlocks for VALU writing and then reading SGPRs.
  bool hasVDecCoExecHazard() const { return HasGFX940Insts; }

  // True when the target supports S_CLAUSE at all; see MaxHardClauseLength.
  bool hasHardClauses() const { return MaxHardClauseLength > 0; }

  bool hasFPAtomicToDenormModeHazard() const {
    return getGeneration() == GFX10;
  }

  bool hasVOP3DPP() const { return getGeneration() >= GFX11; }

  bool hasLdsDirect() const { return getGeneration() >= GFX11; }

  bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }

  bool hasVALUPartialForwardingHazard() const {
    return getGeneration() == GFX11;
  }

  bool hasCvtScaleForwardingHazard() const { return HasGFX950Insts; }

  // All GFX9 targets experience a fetch delay when an instruction at the start
  // of a loop header is split by a 32-byte fetch window boundary, but GFX950
  // is uniquely sensitive to this: the delay triggers further performance
  // degradation beyond the fetch latency itself.
  bool hasLoopHeadInstSplitSensitivity() const { return HasGFX950Insts; }

  bool requiresCodeObjectV6() const { return RequiresCOV6; }

  // Whether callee-saved registers are spilled/restored with block VGPR ops.
  bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }

  bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }

  // GFX12, excluding GFX1250.
  bool hasVALUReadSGPRHazard() const {
    return HasGFX12Insts && !HasGFX1250Insts;
  }

  bool setRegModeNeedsVNOPs() const {
    return HasGFX1250Insts && getGeneration() == GFX12;
  }
596
  /// Return if operations acting on VGPR tuples require even alignment.
  bool needsAlignedVGPRs() const { return RequiresAlignVGPR; }

  /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
  bool hasSPackHL() const { return HasGFX11Insts; }

  /// Return true if the target's EXP instruction has the COMPR flag, which
  /// affects the meaning of the EN (enable) bits.
  bool hasCompressedExport() const { return !HasGFX11Insts; }

  /// Return true if the target's EXP instruction supports the NULL export
  /// target.
  bool hasNullExportTarget() const { return !HasGFX11Insts; }

  // GFX11-only hardware bug in FLAT scratch SVS swizzling.
  bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }

  /// Return true if the target has the S_DELAY_ALU instruction.
  bool hasDelayAlu() const { return HasGFX11Insts; }

  /// Returns true if the target supports
  /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
  /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
  bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); }

  /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
  /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
  bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }

  /// \returns true if inline constants are not supported for F16 pseudo
  /// scalar transcendentals.
  bool hasNoF16PseudoScalarTransInlineConstants() const {
    return getGeneration() == GFX12;
  }

  /// \returns true if the target has packed f32 instructions that only read 32
  /// bits from a scalar operand (SGPR or literal) and replicates the bits to
  /// both channels.
  bool hasPKF32InstsReplicatingLower32BitsOfScalarInput() const {
    return getGeneration() == GFX12 && HasGFX1250Insts;
  }

  bool hasAddPC64Inst() const { return HasGFX1250Insts; }

  /// \returns true if the target supports expert scheduling mode 2 which relies
  /// on the compiler to insert waits to avoid hazards between VMEM and VALU
  /// instructions in some instances.
  bool hasExpertSchedulingMode() const { return getGeneration() >= GFX12; }

  /// \returns The maximum number of instructions that can be enclosed in an
  /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
  /// instruction.
  unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
649
  /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
  /// SGPRs
  unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;

  /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
  /// VGPRs
  unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
                                    unsigned DynamicVGPRBlockSize) const;

  /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
  /// be achieved when the only function running on a CU is \p F, each workgroup
  /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
  /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
  /// range, so this returns a range as well.
  ///
  /// Note that occupancy can be affected by the scratch allocation as well, but
  /// we do not have enough information to compute it.
  std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
                                                 unsigned LDSSize = 0,
                                                 unsigned NumSGPRs = 0,
                                                 unsigned NumVGPRs = 0) const;

  /// \returns true if the flat_scratch register should be initialized with the
  /// pointer to the wave's scratch memory rather than a size and offset.
  bool flatScratchIsPointer() const {
    return getGeneration() >= AMDGPUSubtarget::GFX9;
  }

  /// \returns true if the machine has merged shaders in which s0-s7 are
  /// reserved by the hardware and user SGPRs start at s8
  bool hasMergedShaders() const { return getGeneration() >= GFX9; }

  // \returns true if the target supports the pre-NGG legacy geometry path.
  bool hasLegacyGeometry() const { return getGeneration() < GFX11; }

  // \returns true if the target has split barriers feature
  bool hasSplitBarriers() const { return getGeneration() >= GFX12; }

  // \returns true if the target has DX10_CLAMP kernel descriptor mode bit
  bool hasDX10ClampMode() const { return getGeneration() < GFX12; }

  // \returns true if the target has IEEE kernel descriptor mode bit
  bool hasIEEEMode() const { return getGeneration() < GFX12; }

  // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
  bool hasRrWGMode() const { return getGeneration() >= GFX12; }

  /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
  /// values.
  bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }

  bool hasINVWBL2WaitCntRequirement() const { return HasGFX1250Insts; }

  bool hasVOPD3() const { return HasGFX1250Insts; }

  // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
  bool hasVectorMulU64() const { return HasGFX1250Insts; }

  // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32
  // instructions.
  bool hasMadU64U32NoCarry() const { return HasGFX1250Insts; }

  // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions.
  bool hasIntMinMax64() const { return HasGFX1250Insts; }

  // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
  bool hasPkMinMax3Insts() const { return HasGFX1250Insts; }

  // \returns true if target has S_GET_SHADER_CYCLES_U64 instruction.
  bool hasSGetShaderCyclesInst() const { return HasGFX1250Insts; }

  // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
  // of sign-extending. Note that GFX1250 has not only fixed the bug but also
  // extended VA to 57 bits.
  bool hasGetPCZeroExtension() const {
    return HasGFX12Insts && !HasGFX1250Insts;
  }

  // \returns true if the target needs to create a prolog for backward
  // compatibility when preloading kernel arguments.
  bool needsKernArgPreloadProlog() const {
    return hasKernargPreload() && !HasGFX1250Insts;
  }

  bool hasCondSubInsts() const { return HasGFX12Insts; }

  bool hasSubClampInsts() const { return hasGFX10_3Insts(); }
737
738 /// \returns SGPR allocation granularity supported by the subtarget.
739 unsigned getSGPRAllocGranule() const {
740 return AMDGPU::IsaInfo::getSGPRAllocGranule(STI: this);
741 }
742
743 /// \returns SGPR encoding granularity supported by the subtarget.
744 unsigned getSGPREncodingGranule() const {
745 return AMDGPU::IsaInfo::getSGPREncodingGranule(STI: this);
746 }
747
748 /// \returns Total number of SGPRs supported by the subtarget.
749 unsigned getTotalNumSGPRs() const {
750 return AMDGPU::IsaInfo::getTotalNumSGPRs(STI: this);
751 }
752
753 /// \returns Addressable number of SGPRs supported by the subtarget.
754 unsigned getAddressableNumSGPRs() const {
755 return AMDGPU::IsaInfo::getAddressableNumSGPRs(STI: this);
756 }
757
758 /// \returns Minimum number of SGPRs that meets the given number of waves per
759 /// execution unit requirement supported by the subtarget.
760 unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
761 return AMDGPU::IsaInfo::getMinNumSGPRs(STI: this, WavesPerEU);
762 }
763
764 /// \returns Maximum number of SGPRs that meets the given number of waves per
765 /// execution unit requirement supported by the subtarget.
766 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
767 return AMDGPU::IsaInfo::getMaxNumSGPRs(STI: this, WavesPerEU, Addressable);
768 }
769
  /// \returns Reserved number of SGPRs. This is common
  /// utility function called by MachineFunction and
  /// Function variants of getReservedNumSGPRs.
  unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
  /// \returns Reserved number of SGPRs for given machine function \p MF.
  unsigned getReservedNumSGPRs(const MachineFunction &MF) const;

  /// \returns Reserved number of SGPRs for given function \p F.
  unsigned getReservedNumSGPRs(const Function &F) const;

  /// \returns Maximum number of preloaded SGPRs for the subtarget.
  unsigned getMaxNumPreloadedSGPRs() const;

  /// \returns max num SGPRs. This is the common utility
  /// function called by MachineFunction and Function
  /// variants of getMaxNumSGPRs.
  unsigned getBaseMaxNumSGPRs(const Function &F,
                              std::pair<unsigned, unsigned> WavesPerEU,
                              unsigned PreloadedSGPRs,
                              unsigned ReservedNumSGPRs) const;

  /// \returns Maximum number of SGPRs that meets number of waves per execution
  /// unit requirement for function \p MF, or number of SGPRs explicitly
  /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
  ///
  /// \returns Value that meets number of waves per execution unit requirement
  /// if explicitly requested value cannot be converted to integer, violates
  /// subtarget's specifications, or does not meet number of waves per execution
  /// unit requirement.
  unsigned getMaxNumSGPRs(const MachineFunction &MF) const;

  /// \returns Maximum number of SGPRs that meets number of waves per execution
  /// unit requirement for function \p F, or number of SGPRs explicitly
  /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
  ///
  /// \returns Value that meets number of waves per execution unit requirement
  /// if explicitly requested value cannot be converted to integer, violates
  /// subtarget's specifications, or does not meet number of waves per execution
  /// unit requirement.
  unsigned getMaxNumSGPRs(const Function &F) const;
810
811 /// \returns VGPR allocation granularity supported by the subtarget.
812 unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
813 return AMDGPU::IsaInfo::getVGPRAllocGranule(STI: this, DynamicVGPRBlockSize);
814 }
815
816 /// \returns VGPR encoding granularity supported by the subtarget.
817 unsigned getVGPREncodingGranule() const {
818 return AMDGPU::IsaInfo::getVGPREncodingGranule(STI: this);
819 }
820
821 /// \returns Total number of VGPRs supported by the subtarget.
822 unsigned getTotalNumVGPRs() const {
823 return AMDGPU::IsaInfo::getTotalNumVGPRs(STI: this);
824 }
825
826 /// \returns Addressable number of architectural VGPRs supported by the
827 /// subtarget.
828 unsigned getAddressableNumArchVGPRs() const {
829 return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(STI: this);
830 }
831
832 /// \returns Addressable number of VGPRs supported by the subtarget.
833 unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
834 return AMDGPU::IsaInfo::getAddressableNumVGPRs(STI: this, DynamicVGPRBlockSize);
835 }
836
837 /// \returns the minimum number of VGPRs that will prevent achieving more than
838 /// the specified number of waves \p WavesPerEU.
839 unsigned getMinNumVGPRs(unsigned WavesPerEU,
840 unsigned DynamicVGPRBlockSize) const {
841 return AMDGPU::IsaInfo::getMinNumVGPRs(STI: this, WavesPerEU,
842 DynamicVGPRBlockSize);
843 }
844
845 /// \returns the maximum number of VGPRs that can be used and still achieved
846 /// at least the specified number of waves \p WavesPerEU.
847 unsigned getMaxNumVGPRs(unsigned WavesPerEU,
848 unsigned DynamicVGPRBlockSize) const {
849 return AMDGPU::IsaInfo::getMaxNumVGPRs(STI: this, WavesPerEU,
850 DynamicVGPRBlockSize);
851 }
852
  /// \returns max num VGPRs. This is the common utility function
  /// called by MachineFunction and Function variants of getMaxNumVGPRs, with
  /// \p NumVGPRBounds giving the (min, max) bounds to clamp against.
  unsigned
  getBaseMaxNumVGPRs(const Function &F,
                     std::pair<unsigned, unsigned> NumVGPRBounds) const;

  /// \returns Maximum number of VGPRs that meets number of waves per execution
  /// unit requirement for function \p F, or number of VGPRs explicitly
  /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
  ///
  /// \returns Value that meets number of waves per execution unit requirement
  /// if explicitly requested value cannot be converted to integer, violates
  /// subtarget's specifications, or does not meet number of waves per execution
  /// unit requirement.
  unsigned getMaxNumVGPRs(const Function &F) const;

  /// \returns Maximum number of AGPRs for function \p F; currently identical
  /// to getMaxNumVGPRs(\p F).
  unsigned getMaxNumAGPRs(const Function &F) const { return getMaxNumVGPRs(F); }

  /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
  /// of waves per execution unit required for the function \p F.
  std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const;

  /// \returns Maximum number of VGPRs that meets number of waves per execution
  /// unit requirement for function \p MF, or number of VGPRs explicitly
  /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
  ///
  /// \returns Value that meets number of waves per execution unit requirement
  /// if explicitly requested value cannot be converted to integer, violates
  /// subtarget's specifications, or does not meet number of waves per execution
  /// unit requirement.
  unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
884
  /// \returns true if the subtarget supports wave32 mode (GFX10 and later).
  bool supportsWave32() const { return getGeneration() >= GFX10; }

  /// \returns true if the subtarget supports wave64 mode; subtargets with
  /// GFX1250 instructions do not.
  bool supportsWave64() const { return !hasGFX1250Insts(); }

  /// \returns true if the selected wavefront size is 32.
  bool isWave32() const { return getWavefrontSize() == 32; }

  /// \returns true if the selected wavefront size is 64.
  bool isWave64() const { return getWavefrontSize() == 64; }
892
893 /// Returns if the wavesize of this subtarget is known reliable. This is false
894 /// only for the a default target-cpu that does not have an explicit
895 /// +wavefrontsize target feature.
896 bool isWaveSizeKnown() const {
897 return hasFeature(Feature: AMDGPU::FeatureWavefrontSize32) ||
898 hasFeature(Feature: AMDGPU::FeatureWavefrontSize64);
899 }
900
  /// \returns the register class used for boolean values; delegates to the
  /// subtarget's register info.
  const TargetRegisterClass *getBoolRC() const {
    return getRegisterInfo()->getBoolRC();
  }
904
905 /// \returns Maximum number of work groups per compute unit supported by the
906 /// subtarget and limited by given \p FlatWorkGroupSize.
907 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
908 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(STI: this, FlatWorkGroupSize);
909 }
910
911 /// \returns Minimum flat work group size supported by the subtarget.
912 unsigned getMinFlatWorkGroupSize() const override {
913 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(STI: this);
914 }
915
916 /// \returns Maximum flat work group size supported by the subtarget.
917 unsigned getMaxFlatWorkGroupSize() const override {
918 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(STI: this);
919 }
920
921 /// \returns Number of waves per execution unit required to support the given
922 /// \p FlatWorkGroupSize.
923 unsigned
924 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
925 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(STI: this, FlatWorkGroupSize);
926 }
927
928 /// \returns Minimum number of waves per execution unit supported by the
929 /// subtarget.
930 unsigned getMinWavesPerEU() const override {
931 return AMDGPU::IsaInfo::getMinWavesPerEU(STI: this);
932 }
933
  /// Target hook: adjusts the scheduling dependency \p Dep between \p Def and
  /// \p Use (operand indices \p DefOpIdx / \p UseOpIdx) for this subtarget.
  void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
                             SDep &Dep,
                             const TargetSchedModel *SchedModel) const override;

  // \returns true if it's beneficial on this subtarget for the scheduler to
  // cluster stores as well as loads (GFX11 and later).
  bool shouldClusterStores() const { return getGeneration() >= GFX11; }

  // \returns the number of address arguments from which to enable MIMG NSA
  // on supported architectures.
  unsigned getNSAThreshold(const MachineFunction &MF) const;
945
  // \returns true if the subtarget has a hazard requiring an "s_nop 0"
  // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)". Not needed on
  // subtargets with GFX1250 instructions.
  bool requiresNopBeforeDeallocVGPRs() const { return !HasGFX1250Insts; }

  // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on
  // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER.
  bool requiresWaitIdleBeforeGetReg() const { return HasGFX1250Insts; }

  /// \returns true if dynamic VGPR allocation is enabled for this subtarget.
  bool isDynamicVGPREnabled() const { return DynamicVGPR; }

  /// \returns the dynamic VGPR block size: 32 when the DynamicVGPRBlockSize32
  /// feature is set, otherwise 16.
  unsigned getDynamicVGPRBlockSize() const {
    return DynamicVGPRBlockSize32 ? 32 : 16;
  }
958
  /// \returns false for AMDGPU: early-clobber and undef operands may share a
  /// register.
  bool requiresDisjointEarlyClobberAndUndef() const override {
    // AMDGPU doesn't care if early-clobber and undef operands are allocated
    // to the same register.
    return false;
  }
964
  // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything
  // and must be surrounded by S_WAIT_ALU(0xFFE3). Only GFX12 is affected.
  bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const {
    return getGeneration() == GFX12;
  }

  // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base
  // read. Applies to GFX12 subtargets with GFX1250 instructions.
  bool hasScratchBaseForwardingHazard() const {
    return HasGFX1250Insts && getGeneration() == GFX12;
  }

  // src_flat_scratch_hi cannot be used as a source in SALU producing a 64-bit
  // result. Applies to GFX12 subtargets with GFX1250 instructions.
  bool hasFlatScratchHiInB64InstHazard() const {
    return HasGFX1250Insts && getGeneration() == GFX12;
  }
982
  /// \returns true if the subtarget requires a wait for xcnt before VMEM
  /// accesses that must never be repeated in the event of a page fault/re-try.
  /// Atomic stores/rmw and all volatile accesses fall under this criteria.
  /// Only subtargets with GFX1250 instructions require this.
  bool requiresWaitXCntForSingleAccessInstructions() const {
    return HasGFX1250Insts;
  }
989
990 /// \returns the number of significant bits in the immediate field of the
991 /// S_NOP instruction.
992 unsigned getSNopBits() const {
993 if (getGeneration() >= AMDGPUSubtarget::GFX12)
994 return 7;
995 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
996 return 4;
997 return 3;
998 }
999
1000 bool supportsBPermute() const {
1001 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
1002 }
1003
1004 bool supportsWaveWideBPermute() const {
1005 return (getGeneration() <= AMDGPUSubtarget::GFX9 ||
1006 getGeneration() == AMDGPUSubtarget::GFX12) ||
1007 isWave32();
1008 }
1009
  /// Return true if real (non-fake) variants of True16 instructions using
  /// 16-bit registers should be code-generated. Fake True16 instructions are
  /// identical to non-fake ones except that they take 32-bit registers as
  /// operands and always use their low halves.
  // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
  // supported and the support for fake True16 instructions is removed.
  bool useRealTrue16Insts() const {
    return hasTrue16BitInsts() && EnableRealTrue16Insts;
  }

  /// \returns true if a wait is required on fences with workgroup release
  /// semantics: GFX10 and later, or whenever thread-group split is enabled.
  bool requiresWaitOnWorkgroupReleaseFence() const {
    return getGeneration() >= GFX10 || isTgSplitEnabled();
  }
1023};
1024
/// Tracks which user SGPR inputs a kernel requires and how many SGPRs they
/// consume; computed by the constructor from function \p F and subtarget
/// \p ST.
class GCNUserSGPRUsageInfo {
public:
  /// \returns true if the implicit buffer pointer user SGPRs are needed.
  bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }

  /// \returns true if the private segment buffer user SGPRs are needed.
  bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }

  /// \returns true if the dispatch pointer user SGPRs are needed.
  bool hasDispatchPtr() const { return DispatchPtr; }

  /// \returns true if the queue pointer user SGPRs are needed.
  bool hasQueuePtr() const { return QueuePtr; }

  /// \returns true if the kernarg segment pointer user SGPRs are needed.
  bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }

  /// \returns true if the dispatch ID user SGPRs are needed.
  bool hasDispatchID() const { return DispatchID; }

  /// \returns true if the flat scratch init user SGPRs are needed.
  bool hasFlatScratchInit() const { return FlatScratchInit; }

  /// \returns true if the private segment size user SGPR is needed.
  bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }

  /// \returns the number of SGPRs allocated for kernarg preloading.
  unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }

  /// \returns the total number of user SGPRs in use.
  unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }

  /// \returns the number of user SGPRs still available for allocation.
  unsigned getNumFreeUserSGPRs();

  /// Reserves \p NumSGPRs user SGPRs for kernarg preloading.
  void allocKernargPreloadSGPRs(unsigned NumSGPRs);

  /// Identifiers for the individual user SGPR input fields.
  enum UserSGPRID : unsigned {
    ImplicitBufferPtrID = 0,
    PrivateSegmentBufferID = 1,
    DispatchPtrID = 2,
    QueuePtrID = 3,
    KernargSegmentPtrID = 4,
    DispatchIdID = 5,
    FlatScratchInitID = 6,
    PrivateSegmentSizeID = 7
  };

  // Returns the size in number of SGPRs for preload user SGPR field.
  static unsigned getNumUserSGPRForField(UserSGPRID ID) {
    switch (ID) {
    case ImplicitBufferPtrID:
      return 2;
    case PrivateSegmentBufferID:
      return 4; // Widest field: four SGPRs.
    case DispatchPtrID:
      return 2;
    case QueuePtrID:
      return 2;
    case KernargSegmentPtrID:
      return 2;
    case DispatchIdID:
      return 2;
    case FlatScratchInitID:
      return 2;
    case PrivateSegmentSizeID:
      return 1; // Single 32-bit size value.
    }
    llvm_unreachable("Unknown UserSGPRID.");
  }

  /// Computes the user SGPR requirements of function \p F on subtarget \p ST.
  GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);

private:
  const GCNSubtarget &ST;

  // Private memory buffer
  // Compute directly in sgpr[0:1]
  // Other shaders indirect 64-bits at sgpr[0:1]
  bool ImplicitBufferPtr = false;

  bool PrivateSegmentBuffer = false;

  bool DispatchPtr = false;

  bool QueuePtr = false;

  bool KernargSegmentPtr = false;

  bool DispatchID = false;

  bool FlatScratchInit = false;

  bool PrivateSegmentSize = false;

  // SGPRs currently reserved for preloaded kernel arguments.
  unsigned NumKernargPreloadSGPRs = 0;

  // Running total of user SGPRs consumed.
  unsigned NumUsedUserSGPRs = 0;
};
1113
1114} // end namespace llvm
1115
1116#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1117