1//===- AMDGPUCoExecSchedStrategy.h - CoExec Scheduling Strategy -*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Coexecution-focused scheduling strategy for AMDGPU.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
15#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
16
17#include "GCNSchedStrategy.h"
18#include "llvm/CodeGen/MachineScheduler.h"
19
20namespace llvm {
21
22namespace AMDGPU {
23
24//===----------------------------------------------------------------------===//
25// Instruction Flavor Classification
26//===----------------------------------------------------------------------===//
27
28enum class InstructionFlavor : uint8_t {
29 WMMA, // WMMA/MFMA matrix operations
30 SingleCycleVALU, // Single-cycle VALU (not TRANS32, not multi-cycle CVT)
31 TRANS, // Transcendental ops (v_exp, v_log, etc.)
32 MultiCycleVALU, // VALU instructions with repeat rate > 1
33 VMEM, // FLAT/GLOBAL memory operations
34 DS, // LDS/GDS operations
35 SALU, // Scalar ALU
36 DMA, // Tensor DMA operations
37 Fence, // Fences and waits
38 Other, // Everything else
39 NUM_FLAVORS
40};
41
42inline StringRef getFlavorName(InstructionFlavor F) {
43 switch (F) {
44 case InstructionFlavor::WMMA:
45 return "WMMA";
46 case InstructionFlavor::SingleCycleVALU:
47 return "VALU(1c)";
48 case InstructionFlavor::TRANS:
49 return "TRANS";
50 case InstructionFlavor::MultiCycleVALU:
51 return "VALU(Nc)";
52 case InstructionFlavor::VMEM:
53 return "VMEM";
54 case InstructionFlavor::DS:
55 return "DS";
56 case InstructionFlavor::SALU:
57 return "SALU";
58 case InstructionFlavor::DMA:
59 return "DMA";
60 case InstructionFlavor::Fence:
61 return "Fence";
62 case InstructionFlavor::Other:
63 return "Other";
64 case InstructionFlavor::NUM_FLAVORS:
65 llvm_unreachable("Unknown InstructionFlavor");
66 }
67 llvm_unreachable("Unknown InstructionFlavor");
68}
69
70inline StringRef getFlavorShortName(InstructionFlavor F) {
71 switch (F) {
72 case InstructionFlavor::WMMA:
73 return "W";
74 case InstructionFlavor::SingleCycleVALU:
75 return "V";
76 case InstructionFlavor::TRANS:
77 return "T";
78 case InstructionFlavor::MultiCycleVALU:
79 return "C";
80 case InstructionFlavor::VMEM:
81 return "M";
82 case InstructionFlavor::DS:
83 return "D";
84 case InstructionFlavor::SALU:
85 return "S";
86 case InstructionFlavor::DMA:
87 return "X";
88 case InstructionFlavor::Fence:
89 return "F";
90 case InstructionFlavor::Other:
91 return "O";
92 case InstructionFlavor::NUM_FLAVORS:
93 llvm_unreachable("Unknown InstructionFlavor");
94 }
95 llvm_unreachable("Unknown InstructionFlavor");
96}
97
98InstructionFlavor classifyFlavor(const MachineInstr &MI,
99 const SIInstrInfo &SII);
100
101using FlavorGroup = SmallVector<InstructionFlavor, 4>;
102
103namespace FlavorGroups {
104inline FlavorGroup allVALU() {
105 return {InstructionFlavor::SingleCycleVALU, InstructionFlavor::TRANS,
106 InstructionFlavor::MultiCycleVALU};
107}
108inline FlavorGroup allMem() {
109 return {InstructionFlavor::VMEM, InstructionFlavor::DS,
110 InstructionFlavor::DMA};
111}
112inline FlavorGroup individual(InstructionFlavor F) { return {F}; }
113inline FlavorGroup all() {
114 FlavorGroup G;
115 for (unsigned I = 0;
116 I < static_cast<unsigned>(InstructionFlavor::NUM_FLAVORS); ++I)
117 G.push_back(Elt: static_cast<InstructionFlavor>(I));
118 return G;
119}
120} // namespace FlavorGroups
121
122/// AMDGPU-specific scheduling decision reasons. These provide more granularity
123/// than the generic CandReason enum for debugging purposes.
124enum class AMDGPUSchedReason : uint8_t {
125 None,
126 CritResourceBalance, // tryCriticalResource chose based on resource pressure
127 CritResourceDep, // tryCriticalResourceDependency chose based on enabling
128 NUM_REASONS
129};
130
131inline StringRef getReasonName(AMDGPUSchedReason R) {
132 switch (R) {
133 case AMDGPUSchedReason::None:
134 return "None";
135 case AMDGPUSchedReason::CritResourceBalance:
136 return "CritResource";
137 case AMDGPUSchedReason::CritResourceDep:
138 return "CritResourceDep";
139 case AMDGPUSchedReason::NUM_REASONS:
140 llvm_unreachable("Unknown AMDGPUSchedReason");
141 }
142 llvm_unreachable("Unknown AMDGPUSchedReason");
143}
144
145} // End namespace AMDGPU
146
147//===----------------------------------------------------------------------===//
148// Hardware Unit Information
149//===----------------------------------------------------------------------===//
150
151/// HardwareUnitInfo is a wrapper class which maps to some real hardware
152/// resource. This is used to model hardware resource pressure per region, and
153/// guide scheduling heuristics.
154class HardwareUnitInfo {
155private:
156 /// PrioritySUs maintains a list of the SUs we want to prioritize scheduling
157 /// for this HardwareUnit. This is used for agreement between
158 /// tryCriticalResourceDependency and tryCriticalResource: we schedule the
159 /// dependencies for a SU on critical resource, then schedule that same SU on
160 /// the critical resource. This agreement results in shorter live ranges and
161 /// more regular HardwareUnit access patterns. SUs are prioritized based on
162 /// depth for top-down scheduling.
163 SmallSetVector<SUnit *, 16> PrioritySUs;
164 /// All the SUs in the region that consume this resource.
165 SmallSetVector<SUnit *, 16> AllSUs;
166 /// The total number of busy cycles for this HardwareUnit for a given region.
167 unsigned TotalCycles = 0;
168 /// InstructionFlavor mapping.
169 AMDGPU::InstructionFlavor Type;
170 /// Whether or not instructions on this HardwareUnit may produce a window in
171 /// which instructions in other HardwareUnits can coexecute. For example, WMMA
172 /// / MFMA instructions may take multiple cycles, which may be overlapped with
173 /// instructions on other HardwareUnits.
174 bool ProducesCoexecWindow = false;
175
176public:
177 HardwareUnitInfo() {}
178
179 unsigned size() { return AllSUs.size(); }
180
181 unsigned getTotalCycles() { return TotalCycles; }
182
183 void setType(unsigned TheType) {
184 assert(TheType < (unsigned)AMDGPU::InstructionFlavor::NUM_FLAVORS);
185 Type = (AMDGPU::InstructionFlavor)(TheType);
186 }
187
188 AMDGPU::InstructionFlavor getType() const { return Type; }
189
190 bool producesCoexecWindow() const { return ProducesCoexecWindow; }
191
192 void setProducesCoexecWindow(bool Val) { ProducesCoexecWindow = Val; }
193
194 bool contains(SUnit *SU) const { return AllSUs.contains(key: SU); }
195
196 /// \returns the SUnit with higher priority or nullptr if they are the same.
197 /// This method looks through the PrioritySUs to determine if one SU is more
198 /// prioritized than the other. If neither are in the PrioritySUs list, then
199 /// neither have priority over each other.
200 SUnit *getHigherPriority(SUnit *SU, SUnit *Other) const {
201 for (SUnit *SUOrder : PrioritySUs) {
202 if (SUOrder == SU)
203 return SU;
204
205 if (SUOrder == Other)
206 return Other;
207 }
208 return nullptr;
209 }
210
211 void reset() {
212 AllSUs.clear();
213 PrioritySUs.clear();
214 TotalCycles = 0;
215 Type = AMDGPU::InstructionFlavor::Other;
216 ProducesCoexecWindow = false;
217 }
218
219 /// \returns the next SU in PrioritySUs that is not ready. If \p LookDeep is
220 /// set, we will look beyond the PrioritySUs (if all the PrioritySUs are
221 /// ready) to AllSUs to attempt to find a target SU. When looking through
222 /// AllSUs we sort pick the target SU by minimal depth for top-down
223 /// scheduling. getNextTargetSU is useful for determining which SU on this
224 /// HardwareUnit we are trying to schedule - this info helps us determine
225 /// which dependencies to schedule. LookDeep is useful if the dependencies are
226 /// long latency (e.g. memory instructions). If we have many long latency
227 /// dependencies, it is beneficial to enable SUs multiple levels ahead.
228 SUnit *getNextTargetSU(bool LookDeep = false) const;
229 /// Insert the \p SU into AllSUs and account its \p BlockingCycles into
230 /// the TotalCycles. This maintains the list of PrioritySUs.
231 void insert(SUnit *SU, unsigned BlockingCycles);
232 /// Update the state for \p SU being scheduled by removing it from the AllSUs
233 /// and reducing its \p BlockingCycles from the TotalCycles. This maintains
234 /// the list of PrioritySUs.
235 void markScheduled(SUnit *SU, unsigned BlockingCycles);
236};
237
238//===----------------------------------------------------------------------===//
239// Candidate Heuristics
240//===----------------------------------------------------------------------===//
241
242/// CandidateHeuristics contains state and implementations to facilitate making
243/// per instruction scheduling decisions; it contains methods used in
244/// tryCandidate to decide which instruction to schedule next.
245class CandidateHeuristics {
246protected:
247 ScheduleDAGMI *DAG;
248 const SIInstrInfo *SII;
249 const SIRegisterInfo *SRI;
250 const TargetSchedModel *SchedModel;
251 SmallVector<HardwareUnitInfo, 8> HWUInfo;
252
253 /// Walk over the region and collect total usage per HardwareUnit.
254 void collectHWUIPressure();
255
256 /// Compute the blocking cycles for the appropriate HardwareUnit given an \p
257 /// SU.
258 unsigned getHWUICyclesForInst(SUnit *SU);
259
260 /// Given a \p Flavor , find the corresponding HardwareUnit. \returns the
261 /// mapped HardwareUnit.
262 HardwareUnitInfo *getHWUIFromFlavor(AMDGPU::InstructionFlavor Flavor);
263
264public:
265 CandidateHeuristics() = default;
266
267 void initialize(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel,
268 const TargetRegisterInfo *TRI);
269
270 /// Update the state to reflect that \p SU is going to be scheduled.
271 void updateForScheduling(SUnit *SU);
272
273 /// Sort the HWUInfo vector. After sorting, the HardwareUnits that are highest
274 /// priority are first. Priority is determined by maximizing coexecution and
275 /// keeping the critical HardwareUnit busy.
276 void sortHWUIResources();
277
278 /// Check for critical resource consumption. Prefer the candidate that uses
279 /// the most prioritized HardwareUnit. If both candidates use the same
280 /// HarwareUnit, prefer the candidate with higher priority on that
281 /// HardwareUnit.
282 bool tryCriticalResource(GenericSchedulerBase::SchedCandidate &TryCand,
283 GenericSchedulerBase::SchedCandidate &Cand,
284 SchedBoundary *Zone) const;
285
286 /// Check for dependencies of instructions that use prioritized HardwareUnits.
287 /// Prefer the candidate that is a dependency of an instruction that uses the
288 /// most prioritized HardwareUnit. If both candidates enable the same
289 /// HardwareUnit, prefer the candidate that enables the higher priority
290 /// instruction on that HardwareUnit.
291 bool
292 tryCriticalResourceDependency(GenericSchedulerBase::SchedCandidate &TryCand,
293 GenericSchedulerBase::SchedCandidate &Cand,
294 SchedBoundary *Zone) const;
295
296 void dumpRegionSummary();
297};
298
299class AMDGPUCoExecSchedStrategy final : public GCNSchedStrategy {
300protected:
301 bool tryEffectiveStall(SchedCandidate &Cand, SchedCandidate &TryCand,
302 SchedBoundary &Zone) const;
303 AMDGPU::AMDGPUSchedReason LastAMDGPUReason = AMDGPU::AMDGPUSchedReason::None;
304 CandidateHeuristics Heurs;
305
306#ifndef NDEBUG
307 void dumpPickSummary(SUnit *SU, bool IsTopNode, SchedCandidate &Cand);
308#endif
309
310 bool tryCandidateCoexec(SchedCandidate &Cand, SchedCandidate &TryCand,
311 SchedBoundary *Zone);
312 void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
313 const RegPressureTracker &RPTracker,
314 SchedCandidate &Cand, bool &PickedPending,
315 bool IsBottomUp);
316
317public:
318 AMDGPUCoExecSchedStrategy(const MachineSchedContext *C);
319
320 void initPolicy(MachineBasicBlock::iterator Begin,
321 MachineBasicBlock::iterator End,
322 unsigned NumRegionInstrs) override;
323 void initialize(ScheduleDAGMI *DAG) override;
324 SUnit *pickNode(bool &IsTopNode) override;
325 void schedNode(SUnit *SU, bool IsTopNode) override;
326};
327
328ScheduleDAGInstrs *createGCNCoExecMachineScheduler(MachineSchedContext *C);
329ScheduleDAGInstrs *createGCNNoopPostMachineScheduler(MachineSchedContext *C);
330
331} // End namespace llvm
332
333#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
334