| 1 | //===- AMDGPUCoExecSchedStrategy.h - CoExec Scheduling Strategy -*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | /// \file |
| 10 | /// Coexecution-focused scheduling strategy for AMDGPU. |
| 11 | // |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H |
| 15 | #define LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H |
| 16 | |
| 17 | #include "GCNSchedStrategy.h" |
| 18 | #include "llvm/CodeGen/MachineScheduler.h" |
| 19 | |
| 20 | namespace llvm { |
| 21 | |
| 22 | namespace AMDGPU { |
| 23 | |
| 24 | //===----------------------------------------------------------------------===// |
| 25 | // Instruction Flavor Classification |
| 26 | //===----------------------------------------------------------------------===// |
| 27 | |
| 28 | enum class InstructionFlavor : uint8_t { |
| 29 | WMMA, // WMMA/MFMA matrix operations |
| 30 | SingleCycleVALU, // Single-cycle VALU (not TRANS32, not multi-cycle CVT) |
| 31 | TRANS, // Transcendental ops (v_exp, v_log, etc.) |
| 32 | MultiCycleVALU, // VALU instructions with repeat rate > 1 |
| 33 | VMEM, // FLAT/GLOBAL memory operations |
| 34 | DS, // LDS/GDS operations |
| 35 | SALU, // Scalar ALU |
| 36 | DMA, // Tensor DMA operations |
| 37 | Fence, // Fences and waits |
| 38 | Other, // Everything else |
| 39 | NUM_FLAVORS |
| 40 | }; |
| 41 | |
| 42 | inline StringRef getFlavorName(InstructionFlavor F) { |
| 43 | switch (F) { |
| 44 | case InstructionFlavor::WMMA: |
| 45 | return "WMMA" ; |
| 46 | case InstructionFlavor::SingleCycleVALU: |
| 47 | return "VALU(1c)" ; |
| 48 | case InstructionFlavor::TRANS: |
| 49 | return "TRANS" ; |
| 50 | case InstructionFlavor::MultiCycleVALU: |
| 51 | return "VALU(Nc)" ; |
| 52 | case InstructionFlavor::VMEM: |
| 53 | return "VMEM" ; |
| 54 | case InstructionFlavor::DS: |
| 55 | return "DS" ; |
| 56 | case InstructionFlavor::SALU: |
| 57 | return "SALU" ; |
| 58 | case InstructionFlavor::DMA: |
| 59 | return "DMA" ; |
| 60 | case InstructionFlavor::Fence: |
| 61 | return "Fence" ; |
| 62 | case InstructionFlavor::Other: |
| 63 | return "Other" ; |
| 64 | case InstructionFlavor::NUM_FLAVORS: |
| 65 | return "???" ; |
| 66 | } |
| 67 | llvm_unreachable("Unknown InstructionFlavor" ); |
| 68 | } |
| 69 | |
| 70 | inline StringRef getFlavorShortName(InstructionFlavor F) { |
| 71 | switch (F) { |
| 72 | case InstructionFlavor::WMMA: |
| 73 | return "W" ; |
| 74 | case InstructionFlavor::SingleCycleVALU: |
| 75 | return "V" ; |
| 76 | case InstructionFlavor::TRANS: |
| 77 | return "T" ; |
| 78 | case InstructionFlavor::MultiCycleVALU: |
| 79 | return "C" ; |
| 80 | case InstructionFlavor::VMEM: |
| 81 | return "M" ; |
| 82 | case InstructionFlavor::DS: |
| 83 | return "D" ; |
| 84 | case InstructionFlavor::SALU: |
| 85 | return "S" ; |
| 86 | case InstructionFlavor::DMA: |
| 87 | return "X" ; |
| 88 | case InstructionFlavor::Fence: |
| 89 | return "F" ; |
| 90 | case InstructionFlavor::Other: |
| 91 | return "O" ; |
| 92 | case InstructionFlavor::NUM_FLAVORS: |
| 93 | return "?" ; |
| 94 | } |
| 95 | llvm_unreachable("Unknown InstructionFlavor" ); |
| 96 | } |
| 97 | |
| 98 | InstructionFlavor classifyFlavor(const MachineInstr &MI, |
| 99 | const SIInstrInfo &SII); |
| 100 | |
| 101 | using FlavorGroup = SmallVector<InstructionFlavor, 4>; |
| 102 | |
| 103 | namespace FlavorGroups { |
| 104 | inline FlavorGroup allVALU() { |
| 105 | return {InstructionFlavor::SingleCycleVALU, InstructionFlavor::TRANS, |
| 106 | InstructionFlavor::MultiCycleVALU}; |
| 107 | } |
| 108 | inline FlavorGroup allMem() { |
| 109 | return {InstructionFlavor::VMEM, InstructionFlavor::DS, |
| 110 | InstructionFlavor::DMA}; |
| 111 | } |
| 112 | inline FlavorGroup individual(InstructionFlavor F) { return {F}; } |
| 113 | inline FlavorGroup all() { |
| 114 | FlavorGroup G; |
| 115 | for (unsigned I = 0; |
| 116 | I < static_cast<unsigned>(InstructionFlavor::NUM_FLAVORS); ++I) |
| 117 | G.push_back(Elt: static_cast<InstructionFlavor>(I)); |
| 118 | return G; |
| 119 | } |
| 120 | } // namespace FlavorGroups |
| 121 | |
| 122 | /// AMDGPU-specific scheduling decision reasons. These provide more granularity |
| 123 | /// than the generic CandReason enum for debugging purposes. |
| 124 | enum class AMDGPUSchedReason : uint8_t { |
| 125 | None, |
| 126 | CritResourceBalance, // tryCriticalResource chose based on resource pressure |
| 127 | CritResourceDep, // tryCriticalResourceDependency chose based on enabling |
| 128 | NUM_REASONS |
| 129 | }; |
| 130 | |
| 131 | inline StringRef getReasonName(AMDGPUSchedReason R) { |
| 132 | switch (R) { |
| 133 | case AMDGPUSchedReason::None: |
| 134 | return "None" ; |
| 135 | case AMDGPUSchedReason::CritResourceBalance: |
| 136 | return "CritResource" ; |
| 137 | case AMDGPUSchedReason::CritResourceDep: |
| 138 | return "CritResourceDep" ; |
| 139 | case AMDGPUSchedReason::NUM_REASONS: |
| 140 | return "???" ; |
| 141 | } |
| 142 | llvm_unreachable("Unknown AMDGPUSchedReason" ); |
| 143 | } |
| 144 | |
| 145 | } // End namespace AMDGPU |
| 146 | |
| 147 | //===----------------------------------------------------------------------===// |
| 148 | // Hardware Unit Information |
| 149 | //===----------------------------------------------------------------------===// |
| 150 | |
| 151 | /// HardwareUnitInfo is a wrapper class which maps to some real hardware |
| 152 | /// resource. This is used to model hardware resource pressure per region, and |
| 153 | /// guide scheduling heuristics. |
| 154 | class HardwareUnitInfo { |
| 155 | private: |
| 156 | /// PrioritySUs maintains a list of the SUs we want to prioritize scheduling |
| 157 | /// for this HardwareUnit. This is used for agreement between |
| 158 | /// tryCriticalResourceDependency and tryCriticalResource: we schedule the |
| 159 | /// dependencies for a SU on critical resource, then schedule that same SU on |
| 160 | /// the critical resource. This agreement results in shorter live ranges and |
| 161 | /// more regular HardwareUnit access patterns. SUs are prioritized based on |
| 162 | /// depth for top-down scheduling. |
| 163 | SmallSetVector<SUnit *, 16> PrioritySUs; |
| 164 | /// All the SUs in the region that consume this resource |
| 165 | SmallSetVector<SUnit *, 16> AllSUs; |
| 166 | /// The total number of busy cycles for this HardwareUnit for a given region. |
| 167 | unsigned TotalCycles = 0; |
| 168 | // InstructionFlavor mapping |
| 169 | AMDGPU::InstructionFlavor Type; |
| 170 | // Whether or not instructions on this HardwareUnit may produce a window in |
| 171 | // which instructions in other HardwareUnits can coexecute. For example, WMMA |
| 172 | // / MFMA instructions may take multiple cycles, which may be overlapped with |
| 173 | // instructions on other HardwareUnits |
| 174 | bool ProducesCoexecWindow = false; |
| 175 | |
| 176 | public: |
| 177 | HardwareUnitInfo() {} |
| 178 | |
| 179 | unsigned size() { return AllSUs.size(); } |
| 180 | |
| 181 | unsigned getTotalCycles() { return TotalCycles; } |
| 182 | |
| 183 | void setType(unsigned TheType) { |
| 184 | assert(TheType < (unsigned)AMDGPU::InstructionFlavor::NUM_FLAVORS); |
| 185 | Type = (AMDGPU::InstructionFlavor)(TheType); |
| 186 | } |
| 187 | |
| 188 | AMDGPU::InstructionFlavor getType() const { return Type; } |
| 189 | |
| 190 | bool producesCoexecWindow() const { return ProducesCoexecWindow; } |
| 191 | |
| 192 | void setProducesCoexecWindow(bool Val) { ProducesCoexecWindow = Val; } |
| 193 | |
| 194 | bool contains(SUnit *SU) const { return AllSUs.contains(key: SU); } |
| 195 | |
| 196 | /// \returns true if there is a difference in priority between \p SU and \p |
| 197 | /// Other. If so, \returns the SUnit with higher priority. This |
| 198 | /// method looks through the PrioritySUs to determine if one SU is more |
| 199 | /// prioritized than the other. If neither are in the PrioritySUs list, then |
| 200 | /// neither have priority over each other. |
| 201 | SUnit *getHigherPriority(SUnit *SU, SUnit *Other) const { |
| 202 | for (auto *SUOrder : PrioritySUs) { |
| 203 | if (SUOrder == SU) |
| 204 | return SU; |
| 205 | |
| 206 | if (SUOrder == Other) |
| 207 | return Other; |
| 208 | } |
| 209 | return nullptr; |
| 210 | } |
| 211 | |
| 212 | void reset() { |
| 213 | AllSUs.clear(); |
| 214 | PrioritySUs.clear(); |
| 215 | TotalCycles = 0; |
| 216 | Type = AMDGPU::InstructionFlavor::Other; |
| 217 | ProducesCoexecWindow = false; |
| 218 | } |
| 219 | |
| 220 | /// \returns the next SU in PrioritySUs that is not ready. If \p LookDeep is |
| 221 | /// set, we will look beyond the PrioritySUs (if all the PrioritySUs are |
| 222 | /// ready) to AllSUs to attempt to find a target SU. When looking through |
| 223 | /// AllSUs we sort pick the target SU by minimal depth for top-down |
| 224 | /// scheduling. getNextTargetSU is useful for determining which SU on this |
| 225 | /// HardwareUnit we are trying to schedule - this info helps us determine |
| 226 | /// which dependencies to schedule. LookDeep is useful if the dependencies are |
| 227 | /// long latency (e.g. memory instructions). If we have many long latency |
| 228 | /// dependencies, it is beneficial to enable SUs multiple levels ahead. |
| 229 | SUnit *getNextTargetSU(bool LookDeep = false) const; |
| 230 | /// Insert the \p SU into the AllSUs and account its \p BlockingCycles into |
| 231 | /// the TotalCycles. This maintains the list of PrioritySUs. |
| 232 | void insert(SUnit *SU, unsigned BlockingCycles); |
| 233 | /// Update the state for \p SU being scheduled by removing it from the AllSus |
| 234 | /// and reducing its \p BlockingCycles from the TotalCycles. This maintains |
| 235 | /// the list of PrioritySUS. |
| 236 | void markScheduled(SUnit *SU, unsigned BlockingCycles); |
| 237 | }; |
| 238 | |
| 239 | //===----------------------------------------------------------------------===// |
| 240 | // Candidate Heuristics |
| 241 | //===----------------------------------------------------------------------===// |
| 242 | |
| 243 | /// CandidateHeuristics contains state and implementations to facilitate making |
| 244 | /// per instruction scheduling decisions; it contains methods used in |
| 245 | /// tryCandidate to decide which instruction to schedule next. |
| 246 | class CandidateHeuristics { |
| 247 | protected: |
| 248 | ScheduleDAGMI *DAG; |
| 249 | const SIInstrInfo *SII; |
| 250 | const SIRegisterInfo *SRI; |
| 251 | const TargetSchedModel *SchedModel; |
| 252 | SmallVector<HardwareUnitInfo, 8> HWUInfo; |
| 253 | |
| 254 | /// Walk over the region and collect total usage per HardwareUnit |
| 255 | void collectHWUIPressure(); |
| 256 | |
| 257 | /// Compute the blocking cycles for the appropriate HardwareUnit given an \p |
| 258 | /// SU |
| 259 | unsigned getHWUICyclesForInst(SUnit *SU); |
| 260 | |
| 261 | /// Given a \p Flavor , find the corresponding HardwareUnit. \returns the |
| 262 | /// mapped HardwareUnit. |
| 263 | HardwareUnitInfo *getHWUIFromFlavor(AMDGPU::InstructionFlavor Flavor); |
| 264 | |
| 265 | public: |
| 266 | CandidateHeuristics() = default; |
| 267 | |
| 268 | void initialize(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel, |
| 269 | const TargetRegisterInfo *TRI); |
| 270 | |
| 271 | /// Update the state to reflect that \p SU is going to be scheduled. |
| 272 | void updateForScheduling(SUnit *SU); |
| 273 | |
| 274 | /// Sort the HWUInfo vector. After sorting, the HardwareUnits that are highest |
| 275 | /// priority are first. Priority is determined by maximizing coexecution and |
| 276 | /// keeping the critical HardwareUnit busy. |
| 277 | void sortHWUIResources(); |
| 278 | |
| 279 | /// Check for critical resource consumption. Prefer the candidate that uses |
| 280 | /// the most prioritized HardwareUnit. If both candidates use the same |
| 281 | /// HarwareUnit, prefer the candidate with higher priority on that |
| 282 | /// HardwareUnit. |
| 283 | bool tryCriticalResource(GenericSchedulerBase::SchedCandidate &TryCand, |
| 284 | GenericSchedulerBase::SchedCandidate &Cand, |
| 285 | SchedBoundary *Zone) const; |
| 286 | |
| 287 | /// Check for dependencies of instructions that use prioritized HardwareUnits. |
| 288 | /// Prefer the candidate that is a dependency of an instruction that uses the |
| 289 | /// most prioritized HardwareUnit. If both candidates enable the same |
| 290 | /// HardwareUnit, prefer the candidate that enables the higher priority |
| 291 | /// instruction on that HardwareUnit. |
| 292 | bool |
| 293 | tryCriticalResourceDependency(GenericSchedulerBase::SchedCandidate &TryCand, |
| 294 | GenericSchedulerBase::SchedCandidate &Cand, |
| 295 | SchedBoundary *Zone) const; |
| 296 | |
| 297 | void dumpRegionSummary(); |
| 298 | }; |
| 299 | |
| 300 | class AMDGPUCoExecSchedStrategy final : public GCNSchedStrategy { |
| 301 | protected: |
| 302 | bool tryEffectiveStall(SchedCandidate &Cand, SchedCandidate &TryCand, |
| 303 | SchedBoundary &Zone) const; |
| 304 | AMDGPU::AMDGPUSchedReason LastAMDGPUReason = AMDGPU::AMDGPUSchedReason::None; |
| 305 | CandidateHeuristics Heurs; |
| 306 | |
| 307 | #ifndef NDEBUG |
| 308 | void dumpPickSummary(SUnit *SU, bool IsTopNode, SchedCandidate &Cand); |
| 309 | #endif |
| 310 | |
| 311 | bool tryCandidateCoexec(SchedCandidate &Cand, SchedCandidate &TryCand, |
| 312 | SchedBoundary *Zone); |
| 313 | void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, |
| 314 | const RegPressureTracker &RPTracker, |
| 315 | SchedCandidate &Cand, bool &PickedPending, |
| 316 | bool IsBottomUp); |
| 317 | |
| 318 | public: |
| 319 | AMDGPUCoExecSchedStrategy(const MachineSchedContext *C); |
| 320 | |
| 321 | void initPolicy(MachineBasicBlock::iterator Begin, |
| 322 | MachineBasicBlock::iterator End, |
| 323 | unsigned NumRegionInstrs) override; |
| 324 | void initialize(ScheduleDAGMI *DAG) override; |
| 325 | SUnit *pickNode(bool &IsTopNode) override; |
| 326 | void schedNode(SUnit *SU, bool IsTopNode) override; |
| 327 | }; |
| 328 | |
| 329 | ScheduleDAGInstrs *createGCNCoExecMachineScheduler(MachineSchedContext *C); |
| 330 | ScheduleDAGInstrs *createGCNNoopPostMachineScheduler(MachineSchedContext *C); |
| 331 | |
| 332 | } // End namespace llvm |
| 333 | |
| 334 | #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H |
| 335 | |