| 1 | //===- AMDGPUWaitcntUtils.h -------------------------------------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUWAITCNTUTILS_H |
| 10 | #define LLVM_LIB_TARGET_AMDGPU_AMDGPUWAITCNTUTILS_H |
| 11 | |
| 12 | #include "llvm/ADT/Sequence.h" |
| 13 | #include "llvm/ADT/StringExtras.h" |
| 14 | #include "llvm/Support/Debug.h" |
| 15 | #include "llvm/Support/raw_ostream.h" |
| 16 | #include "llvm/TargetParser/AMDGPUTargetParser.h" |
| 17 | |
| 18 | namespace llvm { |
| 19 | |
| 20 | namespace AMDGPU { |
| 21 | |
/// Identifies one wait counter. The enumerators start at 0 and are used as
/// array indices; the NUM_* entries are sentinels that mark the end of each
/// group and alias the first enumerator of the following group.
enum InstCounterType {
  LOAD_CNT = 0, // VMcnt prior to gfx12.
  DS_CNT,       // LGKMcnt prior to gfx12.
  EXP_CNT,      //
  STORE_CNT,    // VScnt in gfx10/gfx11.
  NUM_NORMAL_INST_CNTS, // Sentinel: number of counters listed above.
  SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
  BVH_CNT,                           // gfx12+ only.
  KM_CNT,                            // gfx12+ only.
  X_CNT,                             // gfx1250.
  ASYNC_CNT,                         // gfx1250.
  TENSOR_CNT,                        // gfx1250.
  NUM_EXTENDED_INST_CNTS, // Sentinel: number of counters listed above.
  VA_VDST = NUM_EXTENDED_INST_CNTS, // gfx12+ expert mode only.
  VM_VSRC,                          // gfx12+ expert mode only.
  NUM_EXPERT_INST_CNTS, // Sentinel: number of counters listed above.
  NUM_INST_CNTS = NUM_EXPERT_INST_CNTS // Total number of counters.
};
| 40 | |
/// \returns a StringLiteral naming counter \p T. Defined out of line;
/// Waitcnt::print uses the result as the label printed before each value.
StringLiteral getInstCounterName(InstCounterType T);
| 42 | |
// Return a range over all counters between LOAD_CNT (the first counter)
// and \c MaxCounter (exclusive; the default value yields an enumeration over
// all counters).
iota_range<InstCounterType>
inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS);
| 48 | |
| 49 | /// Represents the hardware counter limits for different wait count types. |
| 50 | struct HardwareLimits { |
| 51 | unsigned LoadcntMax; // Corresponds to Vmcnt prior to gfx12. |
| 52 | unsigned ExpcntMax; |
| 53 | unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12. |
| 54 | unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11. |
| 55 | unsigned SamplecntMax; // gfx12+ only. |
| 56 | unsigned BvhcntMax; // gfx12+ only. |
| 57 | unsigned KmcntMax; // gfx12+ only. |
| 58 | unsigned XcntMax; // gfx1250. |
| 59 | unsigned AsyncMax; // gfx1250. |
| 60 | unsigned VaVdstMax; // gfx12+ expert mode only. |
| 61 | unsigned VmVsrcMax; // gfx12+ expert mode only. |
| 62 | |
| 63 | HardwareLimits() = default; |
| 64 | |
| 65 | /// Initializes hardware limits from ISA version. |
| 66 | HardwareLimits(const IsaVersion &IV); |
| 67 | |
| 68 | unsigned get(InstCounterType T) const; |
| 69 | }; |
| 70 | |
| 71 | } // namespace AMDGPU |
| 72 | |
/// Marks AMDGPU::InstCounterType as iterable by setting is_iterable to true.
/// NOTE(review): presumably this is what permits the iota_range over the enum
/// returned by inst_counter_types() -- confirm against llvm/ADT/Sequence.h.
template <> struct enum_iteration_traits<AMDGPU::InstCounterType> {
  static constexpr bool is_iterable = true;
};
| 76 | |
| 77 | namespace AMDGPU { |
| 78 | |
| 79 | /// Represents the counter values to wait for in an s_waitcnt instruction. |
| 80 | /// |
| 81 | /// Large values (including the maximum possible integer) can be used to |
| 82 | /// represent "don't care" waits. |
| 83 | class Waitcnt { |
| 84 | std::array<unsigned, NUM_INST_CNTS> Cnt; |
| 85 | |
| 86 | public: |
| 87 | unsigned get(InstCounterType T) const { return Cnt[T]; } |
| 88 | void set(InstCounterType T, unsigned Val) { Cnt[T] = Val; } |
| 89 | |
| 90 | Waitcnt() { fill(Range&: Cnt, Value: ~0u); } |
| 91 | // Pre-gfx12 constructor. |
| 92 | Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt, unsigned VsCnt) |
| 93 | : Waitcnt() { |
| 94 | Cnt[LOAD_CNT] = VmCnt; |
| 95 | Cnt[EXP_CNT] = ExpCnt; |
| 96 | Cnt[DS_CNT] = LgkmCnt; |
| 97 | Cnt[STORE_CNT] = VsCnt; |
| 98 | } |
| 99 | |
| 100 | // gfx12+ constructor. |
| 101 | Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt, |
| 102 | unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt, |
| 103 | unsigned AsyncCnt, unsigned TensorCnt, unsigned VaVdst, |
| 104 | unsigned VmVsrc) |
| 105 | : Waitcnt() { |
| 106 | Cnt[LOAD_CNT] = LoadCnt; |
| 107 | Cnt[DS_CNT] = DsCnt; |
| 108 | Cnt[EXP_CNT] = ExpCnt; |
| 109 | Cnt[STORE_CNT] = StoreCnt; |
| 110 | Cnt[SAMPLE_CNT] = SampleCnt; |
| 111 | Cnt[BVH_CNT] = BvhCnt; |
| 112 | Cnt[KM_CNT] = KmCnt; |
| 113 | Cnt[X_CNT] = XCnt; |
| 114 | Cnt[ASYNC_CNT] = AsyncCnt; |
| 115 | Cnt[TENSOR_CNT] = TensorCnt; |
| 116 | Cnt[VA_VDST] = VaVdst; |
| 117 | Cnt[VM_VSRC] = VmVsrc; |
| 118 | } |
| 119 | |
| 120 | bool hasWait() const { |
| 121 | return any_of(Range: Cnt, P: [](unsigned Val) { return Val != ~0u; }); |
| 122 | } |
| 123 | |
| 124 | bool hasWaitExceptStoreCnt() const { |
| 125 | for (InstCounterType T : inst_counter_types()) { |
| 126 | if (T == STORE_CNT) |
| 127 | continue; |
| 128 | if (Cnt[T] != ~0u) |
| 129 | return true; |
| 130 | } |
| 131 | return false; |
| 132 | } |
| 133 | |
| 134 | void add(AMDGPU::InstCounterType T, unsigned Count) { |
| 135 | set(T, Val: std::min(a: get(T), b: Count)); |
| 136 | } |
| 137 | |
| 138 | void clear(AMDGPU::InstCounterType T) { set(T, Val: ~0u); } |
| 139 | |
| 140 | bool hasWaitStoreCnt() const { return Cnt[STORE_CNT] != ~0u; } |
| 141 | |
| 142 | bool hasWaitDepctr() const { |
| 143 | return Cnt[VA_VDST] != ~0u || Cnt[VM_VSRC] != ~0u; |
| 144 | } |
| 145 | |
| 146 | Waitcnt combined(const Waitcnt &Other) const { |
| 147 | // Does the right thing provided self and Other are either both pre-gfx12 |
| 148 | // or both gfx12+. |
| 149 | Waitcnt Wait; |
| 150 | for (InstCounterType T : inst_counter_types()) |
| 151 | Wait.Cnt[T] = std::min(a: Cnt[T], b: Other.Cnt[T]); |
| 152 | return Wait; |
| 153 | } |
| 154 | |
| 155 | void print(raw_ostream &OS) const { |
| 156 | ListSeparator LS; |
| 157 | for (InstCounterType T : inst_counter_types()) |
| 158 | OS << LS << getInstCounterName(T) << ": " << Cnt[T]; |
| 159 | if (LS.unused()) |
| 160 | OS << "none" ; |
| 161 | OS << '\n'; |
| 162 | } |
| 163 | |
| 164 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 165 | LLVM_DUMP_METHOD void dump() const; |
| 166 | #endif |
| 167 | |
| 168 | friend raw_ostream &operator<<(raw_ostream &OS, const AMDGPU::Waitcnt &Wait) { |
| 169 | Wait.print(OS); |
| 170 | return OS; |
| 171 | } |
| 172 | }; |
| 173 | |
/// \returns a Waitcnt obtained from \p Encoded for the given isa \p Version.
/// NOTE(review): defined out of line; the bit layout of \p Encoded is not
/// visible in this header.
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded);

/// \returns an unsigned value obtained from \p Decoded for the given isa
/// \p Version. NOTE(review): defined out of line; presumably the inverse of
/// decodeWaitcnt -- confirm against the definition.
unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded);
| 177 | |
// The following are only meaningful on targets that support
// S_WAIT_LOADCNT_DSCNT and S_WAIT_STORECNT_DSCNT.

/// \returns Decoded Waitcnt structure from given \p LoadcntDscnt for given
/// isa \p Version.
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt);

/// \returns Decoded Waitcnt structure from given \p StorecntDscnt for given
/// isa \p Version.
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt);

/// \returns The Loadcnt and Dscnt components of \p Decoded encoded as an
/// immediate that can be used with S_WAIT_LOADCNT_DSCNT for given isa
/// \p Version.
unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded);

/// \returns The Storecnt and Dscnt components of \p Decoded encoded as an
/// immediate that can be used with S_WAIT_STORECNT_DSCNT for given isa
/// \p Version.
unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded);
| 198 | |
/// Determine if \p Opcode is a gfx12+ single-counter S_WAIT_*CNT instruction,
/// and if so, which counter it is waiting on. The result is returned as a
/// std::optional of the counter type.
std::optional<AMDGPU::InstCounterType> counterTypeForInstr(unsigned Opcode);
| 202 | |
| 203 | } // namespace AMDGPU |
| 204 | |
| 205 | } // namespace llvm |
| 206 | |
| 207 | #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUWAITCNTUTILS_H |
| 208 | |