1//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// AMD GCN specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16
17#include "AMDGPUCallLowering.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUSubtarget.h"
20#include "SIFrameLowering.h"
21#include "SIISelLowering.h"
22#include "SIInstrInfo.h"
23#include "Utils/AMDGPUBaseInfo.h"
24#include "llvm/Support/AMDHSAKernelDescriptor.h"
25#include "llvm/Support/ErrorHandling.h"
26
27#define GET_SUBTARGETINFO_HEADER
28#include "AMDGPUGenSubtargetInfo.inc"
29
30namespace llvm {
31
32class GCNTargetMachine;
33
34/// Module flag names controlling out-of-bounds buffer access semantics.
35/// Each flag is an i32 with Module::Max merge behaviour and tri-state values:
36/// 0 = any (absent/default - backend currently treats as strict)
37/// 1 = relaxed
38/// 2 = strict
namespace AMDGPUOOBMode {
// Name of the module flag consulted for buffer out-of-bounds semantics.
inline constexpr StringLiteral BufferFlag("amdgpu.buffer.oob.mode");
// Name of the module flag consulted for tbuffer out-of-bounds semantics.
inline constexpr StringLiteral TBufferFlag("amdgpu.tbuffer.oob.mode");
} // namespace AMDGPUOOBMode
43
44class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
45 public AMDGPUSubtarget {
46public:
47 using AMDGPUSubtarget::getMaxWavesPerEU;
48
49 // Following 2 enums are documented at:
50 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
// Trap handler ABI selector; getTrapHandlerAbi() yields AMDHSA on the AMDHSA
// OS and NONE otherwise.
enum class TrapHandlerAbi {
  NONE = 0x00,
  AMDHSA = 0x01,
};
55
// Numeric trap identifiers; see the trap-handler ABI documentation linked
// above for their meaning.
enum class TrapID {
  LLVMAMDHSATrap = 0x02,
  LLVMAMDHSADebugTrap = 0x03,
};
60
61private:
62 /// SelectionDAGISel related APIs.
63 std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
64
65 /// GlobalISel related APIs.
66 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
67 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
68 std::unique_ptr<InstructionSelector> InstSelector;
69 std::unique_ptr<LegalizerInfo> Legalizer;
70 std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
71
72protected:
73 // Basic subtarget description.
74 AMDGPU::TargetID TargetID;
75 unsigned Gen = INVALID;
76 InstrItineraryData InstrItins;
77 int LDSBankCount = 0;
78 unsigned MaxPrivateElementSize = 0;
79
80 // Instruction cache line size in bytes; set from TableGen subtarget features.
81 unsigned InstCacheLineSize = 0;
82
83 // Dynamically set bits that enable features.
84 bool DynamicVGPR = false;
85 bool DynamicVGPRBlockSize32 = false;
86 bool ScalarizeGlobal = false;
87 const bool BufferOOBRelaxed;
88 const bool TBufferOOBRelaxed;
89
90 /// The maximum number of instructions that may be placed within an S_CLAUSE,
91 /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
92 /// indicates a lack of S_CLAUSE support.
93 unsigned MaxHardClauseLength = 0;
94
95#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
96 bool ATTRIBUTE = DEFAULT;
97#include "AMDGPUGenSubtargetInfo.inc"
98
99private:
100 SIInstrInfo InstrInfo;
101 SITargetLowering TLInfo;
102 SIFrameLowering FrameLowering;
103
104 /// Get the register that represents the actual dependency between the
105 /// definition and the use. The definition might only affect a subregister
106 /// that is not actually used. Works for both virtual and physical registers.
/// Note: Currently supports VOP3P instructions (without WMMA and SWMMAC).
108 /// Returns the definition register if there is a real dependency and no
109 /// better match is found.
110 Register getRealSchedDependency(const MachineInstr &DefI, int DefOpIdx,
111 const MachineInstr &UseI, int UseOpIdx) const;
112
113public:
114 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
115 const GCNTargetMachine &TM, bool BufferOOBRelaxed = false,
116 bool TBufferOOBRelaxed = false);
117 ~GCNSubtarget() override;
118
119 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, StringRef GPU,
120 StringRef FS);
121
122 /// Diagnose inconsistent subtarget features before attempting to codegen
123 /// function \p F.
124 void checkSubtargetFeatures(const Function &F) const;
125
126 const SIInstrInfo *getInstrInfo() const override { return &InstrInfo; }
127
128 const SIFrameLowering *getFrameLowering() const override {
129 return &FrameLowering;
130 }
131
132 const SITargetLowering *getTargetLowering() const override { return &TLInfo; }
133
134 const SIRegisterInfo *getRegisterInfo() const override {
135 return &InstrInfo.getRegisterInfo();
136 }
137
138 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
139
140 const CallLowering *getCallLowering() const override {
141 return CallLoweringInfo.get();
142 }
143
144 const InlineAsmLowering *getInlineAsmLowering() const override {
145 return InlineAsmLoweringInfo.get();
146 }
147
148 InstructionSelector *getInstructionSelector() const override {
149 return InstSelector.get();
150 }
151
152 const LegalizerInfo *getLegalizerInfo() const override {
153 return Legalizer.get();
154 }
155
156 const AMDGPURegisterBankInfo *getRegBankInfo() const override {
157 return RegBankInfo.get();
158 }
159
160 const AMDGPU::TargetID &getTargetID() const { return TargetID; }
161
162 const InstrItineraryData *getInstrItineraryData() const override {
163 return &InstrItins;
164 }
165
166 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
167
168 Generation getGeneration() const { return (Generation)Gen; }
169
170 bool isGFX11Plus() const { return getGeneration() >= GFX11; }
171
172#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
173 bool GETTER() const override { return ATTRIBUTE; }
174#include "AMDGPUGenSubtargetInfo.inc"
175
176 unsigned getMaxWaveScratchSize() const {
177 // See COMPUTE_TMPRING_SIZE.WAVESIZE.
178 if (getGeneration() >= GFX12) {
179 // 18-bit field in units of 64-dword.
180 return (64 * 4) * ((1 << 18) - 1);
181 }
182 if (getGeneration() == GFX11) {
183 // 15-bit field in units of 64-dword.
184 return (64 * 4) * ((1 << 15) - 1);
185 }
186 // 13-bit field in units of 256-dword.
187 return (256 * 4) * ((1 << 13) - 1);
188 }
189
190 /// Return the number of high bits known to be zero for a frame index.
191 unsigned getKnownHighZeroBitsForFrameIndex() const {
192 return llvm::countl_zero(Val: getMaxWaveScratchSize()) + getWavefrontSizeLog2();
193 }
194
195 int getLDSBankCount() const { return LDSBankCount; }
196
197 /// Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
198 unsigned getInstCacheLineSize() const { return InstCacheLineSize; }
199
200 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
201 return (ForBufferRSrc || !hasFlatScratchEnabled()) ? MaxPrivateElementSize
202 : 16;
203 }
204
205 unsigned getConstantBusLimit(unsigned Opcode) const;
206
207 /// Returns if the result of this instruction with a 16-bit result returned in
208 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
209 /// the original value.
210 bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
211
212 bool supportsWGP() const {
213 if (HasGFX1250Insts)
214 return false;
215 return getGeneration() >= GFX10;
216 }
217
218 bool hasHWFP64() const { return HasFP64; }
219
220 bool hasAddr64() const {
221 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
222 }
223
224 bool hasFlat() const {
225 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
226 }
227
228 // Return true if the target only has the reverse operand versions of VALU
229 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
230 bool hasOnlyRevVALUShifts() const {
231 return getGeneration() >= VOLCANIC_ISLANDS;
232 }
233
234 bool hasFractBug() const { return getGeneration() == SOUTHERN_ISLANDS; }
235
236 bool hasMed3_16() const { return getGeneration() >= AMDGPUSubtarget::GFX9; }
237
238 bool hasMin3Max3_16() const {
239 return getGeneration() >= AMDGPUSubtarget::GFX9;
240 }
241
242 bool hasSwap() const { return HasGFX9Insts; }
243
244 bool hasScalarPackInsts() const { return HasGFX9Insts; }
245
246 bool hasScalarMulHiInsts() const { return HasGFX9Insts; }
247
248 bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
249
250 bool hasAsyncMark() const { return hasVMemToLDSLoad() || HasAsynccnt; }
251
252 TrapHandlerAbi getTrapHandlerAbi() const {
253 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
254 }
255
256 bool supportsGetDoorbellID() const {
257 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
258 return getGeneration() >= GFX9;
259 }
260
261 /// True if the offset field of DS instructions works as expected. On SI, the
262 /// offset uses a 16-bit adder and does not always wrap properly.
263 bool hasUsableDSOffset() const { return getGeneration() >= SEA_ISLANDS; }
264
265 bool unsafeDSOffsetFoldingEnabled() const {
266 return EnableUnsafeDSOffsetFolding;
267 }
268
269 /// Condition output from div_scale is usable.
270 bool hasUsableDivScaleConditionOutput() const {
271 return getGeneration() != SOUTHERN_ISLANDS;
272 }
273
274 /// Extra wait hazard is needed in some cases before
275 /// s_cbranch_vccnz/s_cbranch_vccz.
276 bool hasReadVCCZBug() const { return getGeneration() <= SEA_ISLANDS; }
277
278 /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
279 bool partialVCCWritesUpdateVCCZ() const { return getGeneration() >= GFX10; }
280
281 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
282 /// was written by a VALU instruction.
283 bool hasSMRDReadVALUDefHazard() const {
284 return getGeneration() == SOUTHERN_ISLANDS;
285 }
286
287 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
288 /// SGPR was written by a VALU Instruction.
289 bool hasVMEMReadSGPRVALUDefHazard() const {
290 return getGeneration() >= VOLCANIC_ISLANDS;
291 }
292
293 bool hasRFEHazards() const { return getGeneration() >= VOLCANIC_ISLANDS; }
294
295 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
296 unsigned getSetRegWaitStates() const {
297 return getGeneration() <= SEA_ISLANDS ? 1 : 2;
298 }
299
300 /// Return the amount of LDS that can be used that will not restrict the
301 /// occupancy lower than WaveCount.
302 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
303 const Function &) const;
304
305 bool supportsMinMaxDenormModes() const {
306 return getGeneration() >= AMDGPUSubtarget::GFX9;
307 }
308
309 /// \returns If target supports S_DENORM_MODE.
310 bool hasDenormModeInst() const {
311 return getGeneration() >= AMDGPUSubtarget::GFX10;
312 }
313
314 /// \returns If target supports ds_read/write_b128 and user enables generation
315 /// of ds_read/write_b128.
316 bool useDS128() const { return HasCIInsts && EnableDS128; }
317
318 /// \return If target supports ds_read/write_b96/128.
319 bool hasDS96AndDS128() const { return HasCIInsts; }
320
321 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
322 bool haveRoundOpsF64() const { return HasCIInsts; }
323
324 /// \returns If MUBUF instructions always perform range checking, even for
325 /// buffer resources used for private memory access.
326 bool privateMemoryResourceIsRangeChecked() const {
327 return getGeneration() < AMDGPUSubtarget::GFX9;
328 }
329
/// \returns If target requires PRT Strict NULL support (zero result registers
331 /// for sparse texture support).
332 bool usePRTStrictNull() const { return EnablePRTStrictNull; }
333
334 bool hasUnalignedBufferAccessEnabled() const {
335 return HasUnalignedBufferAccess && HasUnalignedAccessMode;
336 }
337
338 bool hasUnalignedDSAccessEnabled() const {
339 return HasUnalignedDSAccess && HasUnalignedAccessMode;
340 }
341
342 bool hasUnalignedScratchAccessEnabled() const {
343 return HasUnalignedScratchAccess && HasUnalignedAccessMode;
344 }
345
346 bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); }
347
348 bool isTgSplitEnabled() const { return EnableTgSplit; }
349
350 bool hasRelaxedBufferOOBMode() const { return BufferOOBRelaxed; }
351 bool hasRelaxedTBufferOOBMode() const { return TBufferOOBRelaxed; }
352
353 bool isCuModeEnabled() const { return EnableCuMode; }
354
355 bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
356
357 bool hasFlatScrRegister() const { return hasFlatAddressSpace(); }
358
359 // Check if target supports ST addressing mode with FLAT scratch instructions.
360 // The ST addressing mode means no registers are used, either VGPR or SGPR,
361 // but only immediate offset is swizzled and added to the FLAT scratch base.
362 bool hasFlatScratchSTMode() const {
363 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
364 }
365
366 bool hasFlatScratchSVSMode() const { return HasGFX940Insts || HasGFX11Insts; }
367
368 bool hasFlatScratchEnabled() const {
369 return hasArchitectedFlatScratch() ||
370 (EnableFlatScratch && hasFlatScratchInsts());
371 }
372
373 bool hasGlobalAddTidInsts() const { return HasGFX10_BEncoding; }
374
375 bool hasAtomicCSub() const { return HasGFX10_BEncoding; }
376
377 bool hasExportInsts() const {
378 return !hasGFX940Insts() && !hasGFX1250Insts();
379 }
380
381 bool hasVINTERPEncoding() const {
382 return HasGFX11Insts && !hasGFX1250Insts();
383 }
384
385 // DS_ADD_F64/DS_ADD_RTN_F64
386 bool hasLdsAtomicAddF64() const {
387 return hasGFX90AInsts() || hasGFX1250Insts();
388 }
389
390 bool hasMultiDwordFlatScratchAddressing() const {
391 return getGeneration() >= GFX9;
392 }
393
394 bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; }
395
396 bool hasD16LoadStore() const { return getGeneration() >= GFX9; }
397
398 bool d16PreservesUnusedBits() const {
399 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
400 }
401
402 bool hasD16Images() const { return getGeneration() >= VOLCANIC_ISLANDS; }
403
404 /// Return if most LDS instructions have an m0 use that require m0 to be
405 /// initialized.
406 bool ldsRequiresM0Init() const { return getGeneration() < GFX9; }
407
408 // True if the hardware rewinds and replays GWS operations if a wave is
409 // preempted.
410 //
411 // If this is false, a GWS operation requires testing if a nack set the
412 // MEM_VIOL bit, and repeating if so.
413 bool hasGWSAutoReplay() const { return getGeneration() >= GFX9; }
414
415 /// \returns if target has ds_gws_sema_release_all instruction.
416 bool hasGWSSemaReleaseAll() const { return HasCIInsts; }
417
418 bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
419
420 bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
421
422 // Covers VS/PS/CS graphics shaders
423 bool isMesaGfxShader(const Function &F) const {
424 return isMesa3DOS() && AMDGPU::isShader(CC: F.getCallingConv());
425 }
426
427 bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; }
428
429 bool hasAtomicFaddInsts() const {
430 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
431 }
432
433 bool vmemWriteNeedsExpWaitcnt() const {
434 return getGeneration() < SEA_ISLANDS;
435 }
436
437 bool hasInstPrefetch() const {
438 return getGeneration() == GFX10 || getGeneration() == GFX11;
439 }
440
441 bool hasPrefetch() const { return HasGFX12Insts; }
442
443 bool hasInstPrefSize() const { return isGFX11Plus(); }
444
445 void getInstPrefSizeArgs(uint32_t &Mask, uint32_t &Shift, uint32_t &Width,
446 uint32_t &CacheLineSize) const {
447 assert(isGFX11Plus());
448 CacheLineSize = getInstCacheLineSize();
449 if (getGeneration() == GFX11) {
450 Mask = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
451 Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
452 Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
453 } else {
454 Mask = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
455 Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
456 Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
457 }
458 }
459
460 // Has s_cmpk_* instructions.
461 bool hasSCmpK() const { return getGeneration() < GFX12; }
462
463 // Scratch is allocated in 256 dword per wave blocks for the entire
464 // wavefront. When viewed from the perspective of an arbitrary workitem, this
465 // is 4-byte aligned.
466 //
467 // Only 4-byte alignment is really needed to access anything. Transformations
468 // on the pointer value itself may rely on the alignment / known low bits of
469 // the pointer. Set this to something above the minimum to avoid needing
470 // dynamic realignment in common cases.
471 Align getStackAlignment() const { return Align(16); }
472
473 bool enableMachineScheduler() const override { return true; }
474
475 bool useAA() const override;
476
477 bool enableSubRegLiveness() const override { return true; }
478
479 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
480 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
481
482 // static wrappers
483 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
484
485 // XXX - Why is this here if it isn't in the default pass set?
486 bool enableEarlyIfConversion() const override { return true; }
487
488 void overrideSchedPolicy(MachineSchedPolicy &Policy,
489 const SchedRegion &Region) const override;
490
491 void overridePostRASchedPolicy(MachineSchedPolicy &Policy,
492 const SchedRegion &Region) const override;
493
494 void mirFileLoaded(MachineFunction &MF) const override;
495
496 unsigned getMaxNumUserSGPRs() const {
497 return AMDGPU::getMaxNumUserSGPRs(STI: *this);
498 }
499
500 bool useVGPRIndexMode() const;
501
502 bool hasScalarCompareEq64() const {
503 return getGeneration() >= VOLCANIC_ISLANDS;
504 }
505
506 bool hasLDSFPAtomicAddF32() const { return HasGFX8Insts; }
507 bool hasLDSFPAtomicAddF64() const {
508 return HasGFX90AInsts || HasGFX1250Insts;
509 }
510
511 /// \returns true if the subtarget has the v_permlane64_b32 instruction.
512 bool hasPermLane64() const { return getGeneration() >= GFX11; }
513
514 /// \returns true if the subtarget supports the ds_swizzle rotate and FFT
515 /// swizzle modes (GFX9+).
516 bool hasDsSwizzleRotateMode() const { return getGeneration() >= GFX9; }
517
518 bool hasDPPRowShare() const {
519 return HasDPP && (HasGFX90AInsts || getGeneration() >= GFX10);
520 }
521
522 // Has V_PK_MOV_B32 opcode
523 bool hasPkMovB32() const { return HasGFX90AInsts; }
524
525 bool hasFmaakFmamkF32Insts() const {
526 return getGeneration() >= GFX10 || hasGFX940Insts();
527 }
528
529 bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); }
530
531 bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
532
533 unsigned getNSAMaxSize(bool HasSampler = false) const {
534 return AMDGPU::getNSAMaxSize(STI: *this, HasSampler);
535 }
536
537 bool hasMadF16() const;
538
539 // Scalar and global loads support scale_offset bit.
540 bool hasScaleOffset() const { return HasGFX1250Insts; }
541
542 // FLAT GLOBAL VOffset is signed
543 bool hasSignedGVSOffset() const { return HasGFX1250Insts; }
544
545 bool loadStoreOptEnabled() const { return EnableLoadStoreOpt; }
546
547 bool hasUserSGPRInit16BugInWave32() const {
548 return HasUserSGPRInit16Bug && isWave32();
549 }
550
551 bool has12DWordStoreHazard() const {
552 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
553 }
554
555 // \returns true if the subtarget supports DWORDX3 load/store instructions.
556 bool hasDwordx3LoadStores() const { return HasCIInsts; }
557
558 bool hasReadM0MovRelInterpHazard() const {
559 return getGeneration() == AMDGPUSubtarget::GFX9;
560 }
561
562 bool hasReadM0SendMsgHazard() const {
563 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
564 getGeneration() <= AMDGPUSubtarget::GFX9;
565 }
566
567 bool hasReadM0LdsDmaHazard() const {
568 return getGeneration() == AMDGPUSubtarget::GFX9;
569 }
570
571 bool hasReadM0LdsDirectHazard() const {
572 return getGeneration() == AMDGPUSubtarget::GFX9;
573 }
574
575 bool hasLDSMisalignedBugInWGPMode() const {
576 return HasLDSMisalignedBug && !EnableCuMode;
577 }
578
579 // Shift amount of a 64 bit shift cannot be a highest allocated register
580 // if also at the end of the allocation block.
581 bool hasShift64HighRegBug() const { return HasGFX90AInsts; }
582
583 // Has one cycle hazard on transcendental instruction feeding a
584 // non transcendental VALU.
585 bool hasTransForwardingHazard() const { return HasGFX940Insts; }
586
587 // Has one cycle hazard on a VALU instruction partially writing dst with
588 // a shift of result bits feeding another VALU instruction.
589 bool hasDstSelForwardingHazard() const { return HasGFX940Insts; }
590
591 // Cannot use op_sel with v_dot instructions.
592 bool hasDOTOpSelHazard() const { return HasGFX940Insts || HasGFX11Insts; }
593
// Does not have HW interlocks for VALU writing and then reading SGPRs.
595 bool hasVDecCoExecHazard() const { return HasGFX940Insts; }
596
597 bool hasHardClauses() const { return MaxHardClauseLength > 0; }
598
599 bool hasFPAtomicToDenormModeHazard() const {
600 return getGeneration() == GFX10;
601 }
602
603 bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
604
605 bool hasLdsDirect() const { return getGeneration() >= GFX11; }
606
607 bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
608
609 bool hasVALUPartialForwardingHazard() const {
610 return getGeneration() == GFX11;
611 }
612
613 bool hasCvtScaleForwardingHazard() const { return HasGFX950Insts; }
614
615 // All GFX9 targets experience a fetch delay when an instruction at the start
616 // of a loop header is split by a 32-byte fetch window boundary, but GFX950
617 // is uniquely sensitive to this: the delay triggers further performance
618 // degradation beyond the fetch latency itself.
619 bool hasLoopHeadInstSplitSensitivity() const { return HasGFX950Insts; }
620
621 bool requiresCodeObjectV6() const { return RequiresCOV6; }
622
623 bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
624
625 bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
626
627 bool hasVALUReadSGPRHazard() const {
628 return HasGFX12Insts && !HasGFX1250Insts;
629 }
630
631 bool setRegModeNeedsVNOPs() const {
632 return HasGFX1250Insts && getGeneration() == GFX12;
633 }
634
635 /// Return if operations acting on VGPR tuples require even alignment.
636 bool needsAlignedVGPRs() const { return RequiresAlignVGPR; }
637
638 /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
639 bool hasSPackHL() const { return HasGFX11Insts; }
640
641 /// Return true if the target has the V_CVT_PK_I16_F32/V_CVT_PK_U16_F32
642 /// instructions.
643 bool hasVCvtPkIU16F32() const { return HasGFX11Insts; }
644
645 /// Return true if the target's EXP instruction has the COMPR flag, which
646 /// affects the meaning of the EN (enable) bits.
647 bool hasCompressedExport() const { return !HasGFX11Insts; }
648
649 /// Return true if the target's EXP instruction supports the NULL export
650 /// target.
651 bool hasNullExportTarget() const { return !HasGFX11Insts; }
652
653 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
654
655 /// Return true if the target has the S_DELAY_ALU instruction.
656 bool hasDelayAlu() const { return HasGFX11Insts; }
657
658 /// Returns true if the target supports
659 /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
660 /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
661 bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); }
662
663 /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
664 /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
665 bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
666
667 /// \returns true if the target has packed f32 instructions that only read 32
668 /// bits from a scalar operand (SGPR or literal) and replicates the bits to
669 /// both channels.
670 bool hasPKF32InstsReplicatingLower32BitsOfScalarInput() const {
671 return getGeneration() == GFX12 && HasGFX1250Insts;
672 }
673
674 bool hasAddPC64Inst() const { return HasGFX1250Insts; }
675
676 /// \returns true if the target supports expert scheduling mode 2 which relies
677 /// on the compiler to insert waits to avoid hazards between VMEM and VALU
678 /// instructions in some instances.
679 bool hasExpertSchedulingMode() const { return getGeneration() >= GFX12; }
680
681 /// \returns The maximum number of instructions that can be enclosed in an
682 /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
683 /// instruction.
684 unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
685
686 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
687 /// SGPRs
688 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
689
690 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
691 /// VGPRs
692 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
693 unsigned DynamicVGPRBlockSize) const;
694
695 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
696 /// be achieved when the only function running on a CU is \p F, each workgroup
697 /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
698 /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
699 /// range, so this returns a range as well.
700 ///
701 /// Note that occupancy can be affected by the scratch allocation as well, but
702 /// we do not have enough information to compute it.
703 std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
704 unsigned LDSSize = 0,
705 unsigned NumSGPRs = 0,
706 unsigned NumVGPRs = 0) const;
707
708 /// \returns true if the flat_scratch register should be initialized with the
709 /// pointer to the wave's scratch memory rather than a size and offset.
710 bool flatScratchIsPointer() const {
711 return getGeneration() >= AMDGPUSubtarget::GFX9;
712 }
713
714 /// \returns true if the machine has merged shaders in which s0-s7 are
715 /// reserved by the hardware and user SGPRs start at s8
716 bool hasMergedShaders() const { return getGeneration() >= GFX9; }
717
718 // \returns true if the target supports the pre-NGG legacy geometry path.
719 bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
720
721 // \returns true if the target has split barriers feature
722 bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
723
724 // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
725 bool hasRrWGMode() const { return getGeneration() >= GFX12; }
726
727 /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
728 /// values.
729 bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
730
731 bool hasINVWBL2WaitCntRequirement() const { return HasGFX1250Insts; }
732
733 bool hasVOPD3() const { return HasGFX1250Insts; }
734
735 // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
736 bool hasPkMinMax3Insts() const { return HasGFX1250Insts; }
737
// \returns true if the target has the S_GET_SHADER_CYCLES_U64 instruction.
739 bool hasSGetShaderCyclesInst() const { return HasGFX1250Insts; }
740
741 // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
742 // of sign-extending. Note that GFX1250 has not only fixed the bug but also
743 // extended VA to 57 bits.
744 bool hasGetPCZeroExtension() const {
745 return HasGFX12Insts && !HasGFX1250Insts;
746 }
747
748 // \returns true if the target needs to create a prolog for backward
749 // compatibility when preloading kernel arguments.
750 bool needsKernArgPreloadProlog() const {
751 return hasKernargPreload() && !HasGFX1250Insts;
752 }
753
754 bool hasCondSubInsts() const { return HasGFX12Insts; }
755
756 bool hasSubClampInsts() const { return hasGFX10_3Insts(); }
757
758 bool hasFmaLegacy32Insts() const { return hasGFX10_3Insts(); }
759
760 /// \returns SGPR allocation granularity supported by the subtarget.
761 unsigned getSGPRAllocGranule() const {
762 return AMDGPU::IsaInfo::getSGPRAllocGranule(STI: *this);
763 }
764
765 /// \returns SGPR encoding granularity supported by the subtarget.
766 unsigned getSGPREncodingGranule() const {
767 return AMDGPU::IsaInfo::getSGPREncodingGranule(STI: *this);
768 }
769
770 /// \returns Total number of SGPRs supported by the subtarget.
771 unsigned getTotalNumSGPRs() const {
772 return AMDGPU::IsaInfo::getTotalNumSGPRs(STI: *this);
773 }
774
775 /// \returns Addressable number of SGPRs supported by the subtarget.
776 unsigned getAddressableNumSGPRs() const {
777 return AMDGPU::IsaInfo::getAddressableNumSGPRs(STI: *this);
778 }
779
780 /// \returns Minimum number of SGPRs that meets the given number of waves per
781 /// execution unit requirement supported by the subtarget.
782 unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
783 return AMDGPU::IsaInfo::getMinNumSGPRs(STI: *this, WavesPerEU);
784 }
785
786 /// \returns Maximum number of SGPRs that meets the given number of waves per
787 /// execution unit requirement supported by the subtarget.
788 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
789 return AMDGPU::IsaInfo::getMaxNumSGPRs(STI: *this, WavesPerEU, Addressable);
790 }
791
792 /// \returns Reserved number of SGPRs. This is common
793 /// utility function called by MachineFunction and
794 /// Function variants of getReservedNumSGPRs.
795 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
796 /// \returns Reserved number of SGPRs for given machine function \p MF.
797 unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
798
799 /// \returns Reserved number of SGPRs for given function \p F.
800 unsigned getReservedNumSGPRs(const Function &F) const;
801
802 /// \returns Maximum number of preloaded SGPRs for the subtarget.
803 unsigned getMaxNumPreloadedSGPRs() const;
804
805 /// \returns max num SGPRs. This is the common utility
806 /// function called by MachineFunction and Function
807 /// variants of getMaxNumSGPRs.
808 unsigned getBaseMaxNumSGPRs(const Function &F,
809 std::pair<unsigned, unsigned> WavesPerEU,
810 unsigned PreloadedSGPRs,
811 unsigned ReservedNumSGPRs) const;
812
813 /// \returns Maximum number of SGPRs that meets number of waves per execution
814 /// unit requirement for function \p MF, or number of SGPRs explicitly
815 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
816 ///
817 /// \returns Value that meets number of waves per execution unit requirement
818 /// if explicitly requested value cannot be converted to integer, violates
819 /// subtarget's specifications, or does not meet number of waves per execution
820 /// unit requirement.
821 unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
822
823 /// \returns Maximum number of SGPRs that meets number of waves per execution
824 /// unit requirement for function \p F, or number of SGPRs explicitly
825 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
826 ///
827 /// \returns Value that meets number of waves per execution unit requirement
828 /// if explicitly requested value cannot be converted to integer, violates
829 /// subtarget's specifications, or does not meet number of waves per execution
830 /// unit requirement.
831 unsigned getMaxNumSGPRs(const Function &F) const;
832
833 /// \returns VGPR allocation granularity supported by the subtarget.
834 unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
835 return AMDGPU::IsaInfo::getVGPRAllocGranule(STI: *this, DynamicVGPRBlockSize);
836 }
837
838 /// \returns VGPR encoding granularity supported by the subtarget.
839 unsigned getVGPREncodingGranule() const {
840 return AMDGPU::IsaInfo::getVGPREncodingGranule(STI: *this);
841 }
842
843 /// \returns Total number of VGPRs supported by the subtarget.
844 unsigned getTotalNumVGPRs() const {
845 return AMDGPU::IsaInfo::getTotalNumVGPRs(STI: *this);
846 }
847
848 /// \returns Addressable number of architectural VGPRs supported by the
849 /// subtarget.
850 unsigned getAddressableNumArchVGPRs() const {
851 return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(STI: *this);
852 }
853
854 /// \returns Addressable number of VGPRs supported by the subtarget.
855 unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
856 return AMDGPU::IsaInfo::getAddressableNumVGPRs(STI: *this, DynamicVGPRBlockSize);
857 }
858
859 /// \returns the minimum number of VGPRs that will prevent achieving more than
860 /// the specified number of waves \p WavesPerEU.
861 unsigned getMinNumVGPRs(unsigned WavesPerEU,
862 unsigned DynamicVGPRBlockSize) const {
863 return AMDGPU::IsaInfo::getMinNumVGPRs(STI: *this, WavesPerEU,
864 DynamicVGPRBlockSize);
865 }
866
867 /// \returns the maximum number of VGPRs that can be used and still achieved
868 /// at least the specified number of waves \p WavesPerEU.
869 unsigned getMaxNumVGPRs(unsigned WavesPerEU,
870 unsigned DynamicVGPRBlockSize) const {
871 return AMDGPU::IsaInfo::getMaxNumVGPRs(STI: *this, WavesPerEU,
872 DynamicVGPRBlockSize);
873 }
874
/// Common utility function called by the MachineFunction and Function
/// variants of getMaxNumVGPRs.
///
/// NOTE(review): \p NumVGPRBounds is presumably a {min, max} pair of VGPR
/// counts - confirm against the out-of-line definition.
///
/// \returns max num VGPRs.
unsigned
getBaseMaxNumVGPRs(const Function &F,
                   std::pair<unsigned, unsigned> NumVGPRBounds) const;
880
/// \returns Maximum number of VGPRs that meets number of waves per execution
/// unit requirement for function \p F, or number of VGPRs explicitly
/// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
///
/// The value that meets the number of waves per execution unit requirement
/// is returned instead if the explicitly requested value cannot be converted
/// to integer, violates subtarget's specifications, or does not meet number
/// of waves per execution unit requirement.
unsigned getMaxNumVGPRs(const Function &F) const;
890
891 unsigned getMaxNumAGPRs(const Function &F) const { return getMaxNumVGPRs(F); }
892
/// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
/// of waves per execution unit required for the function \p F.
std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const;
896
/// \returns Maximum number of VGPRs that meets number of waves per execution
/// unit requirement for function \p MF, or number of VGPRs explicitly
/// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
///
/// The value that meets the number of waves per execution unit requirement
/// is returned instead if the explicitly requested value cannot be converted
/// to integer, violates subtarget's specifications, or does not meet number
/// of waves per execution unit requirement.
unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
906
907 bool supportsWave32() const { return getGeneration() >= GFX10; }
908
909 bool supportsWave64() const { return !hasGFX1250Insts() || HasGFX13Insts; }
910
911 bool isWave32() const { return getWavefrontSize() == 32; }
912
913 bool isWave64() const { return getWavefrontSize() == 64; }
914
915 /// Returns if the wavesize of this subtarget is known reliable. This is false
916 /// only for the a default target-cpu that does not have an explicit
917 /// +wavefrontsize target feature.
918 bool isWaveSizeKnown() const {
919 return hasFeature(Feature: AMDGPU::FeatureWavefrontSize32) ||
920 hasFeature(Feature: AMDGPU::FeatureWavefrontSize64);
921 }
922
923 const TargetRegisterClass *getBoolRC() const {
924 return getRegisterInfo()->getBoolRC();
925 }
926
927 /// \returns Maximum number of work groups per compute unit supported by the
928 /// subtarget and limited by given \p FlatWorkGroupSize.
929 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
930 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(STI: *this, FlatWorkGroupSize);
931 }
932
933 /// \returns Minimum flat work group size supported by the subtarget.
934 unsigned getMinFlatWorkGroupSize() const override {
935 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(STI: *this);
936 }
937
938 /// \returns Maximum flat work group size supported by the subtarget.
939 unsigned getMaxFlatWorkGroupSize() const override {
940 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize();
941 }
942
943 /// \returns Number of waves per execution unit required to support the given
944 /// \p FlatWorkGroupSize.
945 unsigned
946 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
947 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(STI: *this, FlatWorkGroupSize);
948 }
949
950 /// \returns Minimum number of waves per execution unit supported by the
951 /// subtarget.
952 unsigned getMinWavesPerEU() const override {
953 return AMDGPU::IsaInfo::getMinWavesPerEU(STI: *this);
954 }
955
/// Override of the base-class scheduling-dependency hook; receives the
/// dependency \p Dep between operand \p DefOpIdx of \p Def and operand
/// \p UseOpIdx of \p Use together with the scheduling model \p SchedModel.
///
/// NOTE(review): the actual adjustment is implemented out of line and is not
/// visible in this header - consult the definition for details.
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
                           SDep &Dep,
                           const TargetSchedModel *SchedModel) const override;
959
960 // \returns true if it's beneficial on this subtarget for the scheduler to
961 // cluster stores as well as loads.
962 bool shouldClusterStores() const { return getGeneration() >= GFX11; }
963
/// \returns the number of address arguments from which to enable MIMG NSA
/// on supported architectures.
///
/// Declared here only; the definition lives out of line.
unsigned getNSAThreshold(const MachineFunction &MF) const;
967
968 // \returns true if the subtarget has a hazard requiring an "s_nop 0"
969 // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
970 bool requiresNopBeforeDeallocVGPRs() const { return !HasGFX1250Insts; }
971
972 // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on
973 // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER.
974 bool requiresWaitIdleBeforeGetReg() const { return HasGFX1250Insts; }
975
976 bool isDynamicVGPREnabled() const { return DynamicVGPR; }
977 unsigned getDynamicVGPRBlockSize() const {
978 return DynamicVGPRBlockSize32 ? 32 : 16;
979 }
980
981 bool requiresDisjointEarlyClobberAndUndef() const override {
982 // AMDGPU doesn't care if early-clobber and undef operands are allocated
983 // to the same register.
984 return false;
985 }
986
987 // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything
988 // and surronded by S_WAIT_ALU(0xFFE3).
989 bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const {
990 return getGeneration() == GFX12;
991 }
992
993 // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base
994 // read.
995 bool hasScratchBaseForwardingHazard() const {
996 return HasGFX1250Insts && getGeneration() == GFX12;
997 }
998
999 // src_flat_scratch_hi cannot be used as a source in SALU producing a 64-bit
1000 // result.
1001 bool hasFlatScratchHiInB64InstHazard() const {
1002 return HasGFX1250Insts && getGeneration() == GFX12;
1003 }
1004
1005 /// \returns true if the subtarget requires a wait for xcnt before VMEM
1006 /// accesses that must never be repeated in the event of a page fault/re-try.
1007 /// Atomic stores/rmw and all volatile accesses fall under this criteria.
1008 bool requiresWaitXCntForSingleAccessInstructions() const {
1009 return HasGFX1250Insts;
1010 }
1011
1012 /// \returns the number of significant bits in the immediate field of the
1013 /// S_NOP instruction.
1014 unsigned getSNopBits() const {
1015 if (getGeneration() >= AMDGPUSubtarget::GFX12)
1016 return 7;
1017 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1018 return 4;
1019 return 3;
1020 }
1021
1022 bool supportsBPermute() const {
1023 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
1024 }
1025
1026 bool supportsWaveWideBPermute() const {
1027 return (getGeneration() <= AMDGPUSubtarget::GFX9 ||
1028 getGeneration() == AMDGPUSubtarget::GFX12) ||
1029 isWave32();
1030 }
1031
1032 /// Return true if real (non-fake) variants of True16 instructions using
1033 /// 16-bit registers should be code-generated. Fake True16 instructions are
1034 /// identical to non-fake ones except that they take 32-bit registers as
1035 /// operands and always use their low halves.
1036 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
1037 // supported and the support for fake True16 instructions is removed.
1038 bool useRealTrue16Insts() const {
1039 return hasTrue16BitInsts() && EnableRealTrue16Insts;
1040 }
1041
1042 bool requiresWaitOnWorkgroupReleaseFence() const {
1043 return getGeneration() >= GFX10 || isTgSplitEnabled();
1044 }
1045};
1046
1047class GCNUserSGPRUsageInfo {
1048public:
1049 bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
1050
1051 bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
1052
1053 bool hasDispatchPtr() const { return DispatchPtr; }
1054
1055 bool hasQueuePtr() const { return QueuePtr; }
1056
1057 bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
1058
1059 bool hasDispatchID() const { return DispatchID; }
1060
1061 bool hasFlatScratchInit() const { return FlatScratchInit; }
1062
1063 bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }
1064
1065 unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1066
1067 unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1068
1069 unsigned getNumFreeUserSGPRs();
1070
1071 void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1072
1073 enum UserSGPRID : unsigned {
1074 ImplicitBufferPtrID = 0,
1075 PrivateSegmentBufferID = 1,
1076 DispatchPtrID = 2,
1077 QueuePtrID = 3,
1078 KernargSegmentPtrID = 4,
1079 DispatchIdID = 5,
1080 FlatScratchInitID = 6,
1081 PrivateSegmentSizeID = 7
1082 };
1083
1084 // Returns the size in number of SGPRs for preload user SGPR field.
1085 static unsigned getNumUserSGPRForField(UserSGPRID ID) {
1086 switch (ID) {
1087 case ImplicitBufferPtrID:
1088 return 2;
1089 case PrivateSegmentBufferID:
1090 return 4;
1091 case DispatchPtrID:
1092 return 2;
1093 case QueuePtrID:
1094 return 2;
1095 case KernargSegmentPtrID:
1096 return 2;
1097 case DispatchIdID:
1098 return 2;
1099 case FlatScratchInitID:
1100 return 2;
1101 case PrivateSegmentSizeID:
1102 return 1;
1103 }
1104 llvm_unreachable("Unknown UserSGPRID.");
1105 }
1106
1107 GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
1108
1109private:
1110 const GCNSubtarget &ST;
1111
1112 // Private memory buffer
1113 // Compute directly in sgpr[0:1]
1114 // Other shaders indirect 64-bits at sgpr[0:1]
1115 bool ImplicitBufferPtr = false;
1116
1117 bool PrivateSegmentBuffer = false;
1118
1119 bool DispatchPtr = false;
1120
1121 bool QueuePtr = false;
1122
1123 bool KernargSegmentPtr = false;
1124
1125 bool DispatchID = false;
1126
1127 bool FlatScratchInit = false;
1128
1129 bool PrivateSegmentSize = false;
1130
1131 unsigned NumKernargPreloadSGPRs = 0;
1132
1133 unsigned NumUsedUserSGPRs = 0;
1134};
1135
1136} // end namespace llvm
1137
1138#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1139