1//===- AMDGPUCoExecSchedStrategy.h - CoExec Scheduling Strategy -*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Coexecution-focused scheduling strategy for AMDGPU.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
15#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
16
17#include "GCNSchedStrategy.h"
18#include "llvm/CodeGen/MachineScheduler.h"
19
20namespace llvm {
21
22namespace AMDGPU {
23
24//===----------------------------------------------------------------------===//
25// Instruction Flavor Classification
26//===----------------------------------------------------------------------===//
27
/// Coarse classification of machine instructions by the hardware resource
/// they occupy. Used to model per-HardwareUnit pressure and to guide the
/// coexecution scheduling heuristics.
enum class InstructionFlavor : uint8_t {
  WMMA,            // WMMA/MFMA matrix operations
  SingleCycleVALU, // Single-cycle VALU (not TRANS32, not multi-cycle CVT)
  TRANS,           // Transcendental ops (v_exp, v_log, etc.)
  MultiCycleVALU,  // VALU instructions with repeat rate > 1
  VMEM,            // FLAT/GLOBAL memory operations
  DS,              // LDS/GDS operations
  SALU,            // Scalar ALU
  DMA,             // Tensor DMA operations
  Fence,           // Fences and waits
  Other,           // Everything else
  NUM_FLAVORS      // Sentinel: count of real flavors above; not a flavor.
};
41
42inline StringRef getFlavorName(InstructionFlavor F) {
43 switch (F) {
44 case InstructionFlavor::WMMA:
45 return "WMMA";
46 case InstructionFlavor::SingleCycleVALU:
47 return "VALU(1c)";
48 case InstructionFlavor::TRANS:
49 return "TRANS";
50 case InstructionFlavor::MultiCycleVALU:
51 return "VALU(Nc)";
52 case InstructionFlavor::VMEM:
53 return "VMEM";
54 case InstructionFlavor::DS:
55 return "DS";
56 case InstructionFlavor::SALU:
57 return "SALU";
58 case InstructionFlavor::DMA:
59 return "DMA";
60 case InstructionFlavor::Fence:
61 return "Fence";
62 case InstructionFlavor::Other:
63 return "Other";
64 case InstructionFlavor::NUM_FLAVORS:
65 return "???";
66 }
67 llvm_unreachable("Unknown InstructionFlavor");
68}
69
70inline StringRef getFlavorShortName(InstructionFlavor F) {
71 switch (F) {
72 case InstructionFlavor::WMMA:
73 return "W";
74 case InstructionFlavor::SingleCycleVALU:
75 return "V";
76 case InstructionFlavor::TRANS:
77 return "T";
78 case InstructionFlavor::MultiCycleVALU:
79 return "C";
80 case InstructionFlavor::VMEM:
81 return "M";
82 case InstructionFlavor::DS:
83 return "D";
84 case InstructionFlavor::SALU:
85 return "S";
86 case InstructionFlavor::DMA:
87 return "X";
88 case InstructionFlavor::Fence:
89 return "F";
90 case InstructionFlavor::Other:
91 return "O";
92 case InstructionFlavor::NUM_FLAVORS:
93 return "?";
94 }
95 llvm_unreachable("Unknown InstructionFlavor");
96}
97
/// Classify \p MI into an InstructionFlavor, using \p SII to query
/// target-specific instruction properties. Defined out of line.
InstructionFlavor classifyFlavor(const MachineInstr &MI,
                                 const SIInstrInfo &SII);

/// A set of flavors treated as one unit by a heuristic (e.g. "all VALU-like
/// flavors").
using FlavorGroup = SmallVector<InstructionFlavor, 4>;
102
103namespace FlavorGroups {
104inline FlavorGroup allVALU() {
105 return {InstructionFlavor::SingleCycleVALU, InstructionFlavor::TRANS,
106 InstructionFlavor::MultiCycleVALU};
107}
108inline FlavorGroup allMem() {
109 return {InstructionFlavor::VMEM, InstructionFlavor::DS,
110 InstructionFlavor::DMA};
111}
112inline FlavorGroup individual(InstructionFlavor F) { return {F}; }
113inline FlavorGroup all() {
114 FlavorGroup G;
115 for (unsigned I = 0;
116 I < static_cast<unsigned>(InstructionFlavor::NUM_FLAVORS); ++I)
117 G.push_back(Elt: static_cast<InstructionFlavor>(I));
118 return G;
119}
120} // namespace FlavorGroups
121
/// AMDGPU-specific scheduling decision reasons. These provide more granularity
/// than the generic CandReason enum for debugging purposes.
enum class AMDGPUSchedReason : uint8_t {
  None,
  CritResourceBalance, // tryCriticalResource chose based on resource pressure
  CritResourceDep,     // tryCriticalResourceDependency chose based on enabling
  NUM_REASONS          // Sentinel: count of real reasons above; not a reason.
};
130
131inline StringRef getReasonName(AMDGPUSchedReason R) {
132 switch (R) {
133 case AMDGPUSchedReason::None:
134 return "None";
135 case AMDGPUSchedReason::CritResourceBalance:
136 return "CritResource";
137 case AMDGPUSchedReason::CritResourceDep:
138 return "CritResourceDep";
139 case AMDGPUSchedReason::NUM_REASONS:
140 return "???";
141 }
142 llvm_unreachable("Unknown AMDGPUSchedReason");
143}
144
145} // End namespace AMDGPU
146
147//===----------------------------------------------------------------------===//
148// Hardware Unit Information
149//===----------------------------------------------------------------------===//
150
151/// HardwareUnitInfo is a wrapper class which maps to some real hardware
152/// resource. This is used to model hardware resource pressure per region, and
153/// guide scheduling heuristics.
154class HardwareUnitInfo {
155private:
156 /// PrioritySUs maintains a list of the SUs we want to prioritize scheduling
157 /// for this HardwareUnit. This is used for agreement between
158 /// tryCriticalResourceDependency and tryCriticalResource: we schedule the
159 /// dependencies for a SU on critical resource, then schedule that same SU on
160 /// the critical resource. This agreement results in shorter live ranges and
161 /// more regular HardwareUnit access patterns. SUs are prioritized based on
162 /// depth for top-down scheduling.
163 SmallSetVector<SUnit *, 16> PrioritySUs;
164 /// All the SUs in the region that consume this resource
165 SmallSetVector<SUnit *, 16> AllSUs;
166 /// The total number of busy cycles for this HardwareUnit for a given region.
167 unsigned TotalCycles = 0;
168 // InstructionFlavor mapping
169 AMDGPU::InstructionFlavor Type;
170 // Whether or not instructions on this HardwareUnit may produce a window in
171 // which instructions in other HardwareUnits can coexecute. For example, WMMA
172 // / MFMA instructions may take multiple cycles, which may be overlapped with
173 // instructions on other HardwareUnits
174 bool ProducesCoexecWindow = false;
175
176public:
177 HardwareUnitInfo() {}
178
179 unsigned size() { return AllSUs.size(); }
180
181 unsigned getTotalCycles() { return TotalCycles; }
182
183 void setType(unsigned TheType) {
184 assert(TheType < (unsigned)AMDGPU::InstructionFlavor::NUM_FLAVORS);
185 Type = (AMDGPU::InstructionFlavor)(TheType);
186 }
187
188 AMDGPU::InstructionFlavor getType() const { return Type; }
189
190 bool producesCoexecWindow() const { return ProducesCoexecWindow; }
191
192 void setProducesCoexecWindow(bool Val) { ProducesCoexecWindow = Val; }
193
194 bool contains(SUnit *SU) const { return AllSUs.contains(key: SU); }
195
196 /// \returns true if there is a difference in priority between \p SU and \p
197 /// Other. If so, \returns the SUnit with higher priority. This
198 /// method looks through the PrioritySUs to determine if one SU is more
199 /// prioritized than the other. If neither are in the PrioritySUs list, then
200 /// neither have priority over each other.
201 SUnit *getHigherPriority(SUnit *SU, SUnit *Other) const {
202 for (auto *SUOrder : PrioritySUs) {
203 if (SUOrder == SU)
204 return SU;
205
206 if (SUOrder == Other)
207 return Other;
208 }
209 return nullptr;
210 }
211
212 void reset() {
213 AllSUs.clear();
214 PrioritySUs.clear();
215 TotalCycles = 0;
216 Type = AMDGPU::InstructionFlavor::Other;
217 ProducesCoexecWindow = false;
218 }
219
220 /// \returns the next SU in PrioritySUs that is not ready. If \p LookDeep is
221 /// set, we will look beyond the PrioritySUs (if all the PrioritySUs are
222 /// ready) to AllSUs to attempt to find a target SU. When looking through
223 /// AllSUs we sort pick the target SU by minimal depth for top-down
224 /// scheduling. getNextTargetSU is useful for determining which SU on this
225 /// HardwareUnit we are trying to schedule - this info helps us determine
226 /// which dependencies to schedule. LookDeep is useful if the dependencies are
227 /// long latency (e.g. memory instructions). If we have many long latency
228 /// dependencies, it is beneficial to enable SUs multiple levels ahead.
229 SUnit *getNextTargetSU(bool LookDeep = false) const;
230 /// Insert the \p SU into the AllSUs and account its \p BlockingCycles into
231 /// the TotalCycles. This maintains the list of PrioritySUs.
232 void insert(SUnit *SU, unsigned BlockingCycles);
233 /// Update the state for \p SU being scheduled by removing it from the AllSus
234 /// and reducing its \p BlockingCycles from the TotalCycles. This maintains
235 /// the list of PrioritySUS.
236 void markScheduled(SUnit *SU, unsigned BlockingCycles);
237};
238
239//===----------------------------------------------------------------------===//
240// Candidate Heuristics
241//===----------------------------------------------------------------------===//
242
243/// CandidateHeuristics contains state and implementations to facilitate making
244/// per instruction scheduling decisions; it contains methods used in
245/// tryCandidate to decide which instruction to schedule next.
246class CandidateHeuristics {
247protected:
248 ScheduleDAGMI *DAG;
249 const SIInstrInfo *SII;
250 const SIRegisterInfo *SRI;
251 const TargetSchedModel *SchedModel;
252 SmallVector<HardwareUnitInfo, 8> HWUInfo;
253
254 /// Walk over the region and collect total usage per HardwareUnit
255 void collectHWUIPressure();
256
257 /// Compute the blocking cycles for the appropriate HardwareUnit given an \p
258 /// SU
259 unsigned getHWUICyclesForInst(SUnit *SU);
260
261 /// Given a \p Flavor , find the corresponding HardwareUnit. \returns the
262 /// mapped HardwareUnit.
263 HardwareUnitInfo *getHWUIFromFlavor(AMDGPU::InstructionFlavor Flavor);
264
265public:
266 CandidateHeuristics() = default;
267
268 void initialize(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel,
269 const TargetRegisterInfo *TRI);
270
271 /// Update the state to reflect that \p SU is going to be scheduled.
272 void updateForScheduling(SUnit *SU);
273
274 /// Sort the HWUInfo vector. After sorting, the HardwareUnits that are highest
275 /// priority are first. Priority is determined by maximizing coexecution and
276 /// keeping the critical HardwareUnit busy.
277 void sortHWUIResources();
278
279 /// Check for critical resource consumption. Prefer the candidate that uses
280 /// the most prioritized HardwareUnit. If both candidates use the same
281 /// HarwareUnit, prefer the candidate with higher priority on that
282 /// HardwareUnit.
283 bool tryCriticalResource(GenericSchedulerBase::SchedCandidate &TryCand,
284 GenericSchedulerBase::SchedCandidate &Cand,
285 SchedBoundary *Zone) const;
286
287 /// Check for dependencies of instructions that use prioritized HardwareUnits.
288 /// Prefer the candidate that is a dependency of an instruction that uses the
289 /// most prioritized HardwareUnit. If both candidates enable the same
290 /// HardwareUnit, prefer the candidate that enables the higher priority
291 /// instruction on that HardwareUnit.
292 bool
293 tryCriticalResourceDependency(GenericSchedulerBase::SchedCandidate &TryCand,
294 GenericSchedulerBase::SchedCandidate &Cand,
295 SchedBoundary *Zone) const;
296
297 void dumpRegionSummary();
298};
299
/// Scheduling strategy that extends GCNSchedStrategy with coexecution-aware
/// candidate heuristics (see CandidateHeuristics). Method bodies are defined
/// out of line.
class AMDGPUCoExecSchedStrategy final : public GCNSchedStrategy {
protected:
  bool tryEffectiveStall(SchedCandidate &Cand, SchedCandidate &TryCand,
                         SchedBoundary &Zone) const;
  // The AMDGPU-specific reason for the most recent pick, for debug output.
  AMDGPU::AMDGPUSchedReason LastAMDGPUReason = AMDGPU::AMDGPUSchedReason::None;
  // Coexecution heuristic state shared across picks within a region.
  CandidateHeuristics Heurs;

#ifndef NDEBUG
  void dumpPickSummary(SUnit *SU, bool IsTopNode, SchedCandidate &Cand);
#endif

  bool tryCandidateCoexec(SchedCandidate &Cand, SchedCandidate &TryCand,
                          SchedBoundary *Zone);
  void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
                         const RegPressureTracker &RPTracker,
                         SchedCandidate &Cand, bool &PickedPending,
                         bool IsBottomUp);

public:
  AMDGPUCoExecSchedStrategy(const MachineSchedContext *C);

  // MachineSchedStrategy interface overrides.
  void initPolicy(MachineBasicBlock::iterator Begin,
                  MachineBasicBlock::iterator End,
                  unsigned NumRegionInstrs) override;
  void initialize(ScheduleDAGMI *DAG) override;
  SUnit *pickNode(bool &IsTopNode) override;
  void schedNode(SUnit *SU, bool IsTopNode) override;
};
328
/// Factory for a machine scheduler driven by AMDGPUCoExecSchedStrategy.
ScheduleDAGInstrs *createGCNCoExecMachineScheduler(MachineSchedContext *C);
/// Factory for a no-op post-RA machine scheduler.
ScheduleDAGInstrs *createGCNNoopPostMachineScheduler(MachineSchedContext *C);
331
332} // End namespace llvm
333
334#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
335