AMDGPUCoExecSchedStrategy.h source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h]

1	//===- AMDGPUCoExecSchedStrategy.h - CoExec Scheduling Strategy -- C++ --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// Coexecution-focused scheduling strategy for AMDGPU.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
15	#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
16
17	#include "GCNSchedStrategy.h"
18	#include "llvm/CodeGen/MachineScheduler.h"
19
20	namespace llvm {
21
22	namespace AMDGPU {
23
24	//===----------------------------------------------------------------------===//
25	// Instruction Flavor Classification
26	//===----------------------------------------------------------------------===//
27
28	enum class InstructionFlavor : uint8_t {
29	WMMA, // WMMA/MFMA matrix operations
30	SingleCycleVALU, // Single-cycle VALU (not TRANS32, not multi-cycle CVT)
31	TRANS, // Transcendental ops (v_exp, v_log, etc.)
32	MultiCycleVALU, // VALU instructions with repeat rate > 1
33	VMEM, // FLAT/GLOBAL memory operations
34	DS, // LDS/GDS operations
35	SALU, // Scalar ALU
36	DMA, // Tensor DMA operations
37	Fence, // Fences and waits
38	Other, // Everything else
39	NUM_FLAVORS
40	};
41
42	inline StringRef getFlavorName(InstructionFlavor F) {
43	switch (F) {
44	case InstructionFlavor::WMMA:
45	return "WMMA";
46	case InstructionFlavor::SingleCycleVALU:
47	return "VALU(1c)";
48	case InstructionFlavor::TRANS:
49	return "TRANS";
50	case InstructionFlavor::MultiCycleVALU:
51	return "VALU(Nc)";
52	case InstructionFlavor::VMEM:
53	return "VMEM";
54	case InstructionFlavor::DS:
55	return "DS";
56	case InstructionFlavor::SALU:
57	return "SALU";
58	case InstructionFlavor::DMA:
59	return "DMA";
60	case InstructionFlavor::Fence:
61	return "Fence";
62	case InstructionFlavor::Other:
63	return "Other";
64	case InstructionFlavor::NUM_FLAVORS:
65	llvm_unreachable("Unknown InstructionFlavor");
66	}
67	llvm_unreachable("Unknown InstructionFlavor");
68	}
69
70	inline StringRef getFlavorShortName(InstructionFlavor F) {
71	switch (F) {
72	case InstructionFlavor::WMMA:
73	return "W";
74	case InstructionFlavor::SingleCycleVALU:
75	return "V";
76	case InstructionFlavor::TRANS:
77	return "T";
78	case InstructionFlavor::MultiCycleVALU:
79	return "C";
80	case InstructionFlavor::VMEM:
81	return "M";
82	case InstructionFlavor::DS:
83	return "D";
84	case InstructionFlavor::SALU:
85	return "S";
86	case InstructionFlavor::DMA:
87	return "X";
88	case InstructionFlavor::Fence:
89	return "F";
90	case InstructionFlavor::Other:
91	return "O";
92	case InstructionFlavor::NUM_FLAVORS:
93	llvm_unreachable("Unknown InstructionFlavor");
94	}
95	llvm_unreachable("Unknown InstructionFlavor");
96	}
97
98	InstructionFlavor classifyFlavor(const MachineInstr &MI,
99	const SIInstrInfo &SII);
100
101	using FlavorGroup = SmallVector<InstructionFlavor, `4`>;
102
103	namespace FlavorGroups {
104	inline FlavorGroup allVALU() {
105	return {InstructionFlavor::SingleCycleVALU, InstructionFlavor::TRANS,
106	InstructionFlavor::MultiCycleVALU};
107	}
108	inline FlavorGroup allMem() {
109	return {InstructionFlavor::VMEM, InstructionFlavor::DS,
110	InstructionFlavor::DMA};
111	}
112	inline FlavorGroup individual(InstructionFlavor F) { return {F}; }
113	inline FlavorGroup all() {
114	FlavorGroup G;
115	for (unsigned I = `0`;
116	I < static_cast<unsigned>(InstructionFlavor::NUM_FLAVORS); ++I)
117	G.push_back(Elt: static_cast<InstructionFlavor>(I));
118	return G;
119	}
120	} // namespace FlavorGroups
121
122	/// AMDGPU-specific scheduling decision reasons. These provide more granularity
123	/// than the generic CandReason enum for debugging purposes.
124	enum class AMDGPUSchedReason : uint8_t {
125	None,
126	CritResourceBalance, // tryCriticalResource chose based on resource pressure
127	CritResourceDep, // tryCriticalResourceDependency chose based on enabling
128	NUM_REASONS
129	};
130
131	inline StringRef getReasonName(AMDGPUSchedReason R) {
132	switch (R) {
133	case AMDGPUSchedReason::None:
134	return "None";
135	case AMDGPUSchedReason::CritResourceBalance:
136	return "CritResource";
137	case AMDGPUSchedReason::CritResourceDep:
138	return "CritResourceDep";
139	case AMDGPUSchedReason::NUM_REASONS:
140	llvm_unreachable("Unknown AMDGPUSchedReason");
141	}
142	llvm_unreachable("Unknown AMDGPUSchedReason");
143	}
144
145	} // End namespace AMDGPU
146
147	//===----------------------------------------------------------------------===//
148	// Hardware Unit Information
149	//===----------------------------------------------------------------------===//
150
151	/// HardwareUnitInfo is a wrapper class which maps to some real hardware
152	/// resource. This is used to model hardware resource pressure per region, and
153	/// guide scheduling heuristics.
154	class HardwareUnitInfo {
155	private:
156	/// PrioritySUs maintains a list of the SUs we want to prioritize scheduling
157	/// for this HardwareUnit. This is used for agreement between
158	/// tryCriticalResourceDependency and tryCriticalResource: we schedule the
159	/// dependencies for a SU on critical resource, then schedule that same SU on
160	/// the critical resource. This agreement results in shorter live ranges and
161	/// more regular HardwareUnit access patterns. SUs are prioritized based on
162	/// depth for top-down scheduling.
163	SmallSetVector<SUnit *, `16`> PrioritySUs;
164	/// All the SUs in the region that consume this resource.
165	SmallSetVector<SUnit *, `16`> AllSUs;
166	/// The total number of busy cycles for this HardwareUnit for a given region.
167	unsigned TotalCycles = `0`;
168	/// InstructionFlavor mapping.
169	AMDGPU::InstructionFlavor Type;
170	/// Whether or not instructions on this HardwareUnit may produce a window in
171	/// which instructions in other HardwareUnits can coexecute. For example, WMMA
172	/// / MFMA instructions may take multiple cycles, which may be overlapped with
173	/// instructions on other HardwareUnits.
174	bool ProducesCoexecWindow = false;
175
176	public:
177	HardwareUnitInfo() {}
178
179	unsigned size() { return AllSUs.size(); }
180
181	unsigned getTotalCycles() { return TotalCycles; }
182
183	void setType(unsigned TheType) {
184	assert(TheType < (unsigned)AMDGPU::InstructionFlavor::NUM_FLAVORS);
185	Type = (AMDGPU::InstructionFlavor)(TheType);
186	}
187
188	AMDGPU::InstructionFlavor getType() const { return Type; }
189
190	bool producesCoexecWindow() const { return ProducesCoexecWindow; }
191
192	void setProducesCoexecWindow(bool Val) { ProducesCoexecWindow = Val; }
193
194	bool contains(SUnit SU) const* { return AllSUs.contains(key: SU); }
195
196	/// \returns the SUnit with higher priority or nullptr if they are the same.
197	/// This method looks through the PrioritySUs to determine if one SU is more
198	/// prioritized than the other. If neither are in the PrioritySUs list, then
199	/// neither have priority over each other.
200	SUnit getHigherPriority(SUnit SU, SUnit Other) const* {
201	for (SUnit *SUOrder : PrioritySUs) {
202	if (SUOrder == SU)
203	return SU;
204
205	if (SUOrder == Other)
206	return Other;
207	}
208	return nullptr;
209	}
210
211	void reset() {
212	AllSUs.clear();
213	PrioritySUs.clear();
214	TotalCycles = `0`;
215	Type = AMDGPU::InstructionFlavor::Other;
216	ProducesCoexecWindow = false;
217	}
218
219	/// \returns the next SU in PrioritySUs that is not ready. If \p LookDeep is
220	/// set, we will look beyond the PrioritySUs (if all the PrioritySUs are
221	/// ready) to AllSUs to attempt to find a target SU. When looking through
222	/// AllSUs we sort pick the target SU by minimal depth for top-down
223	/// scheduling. getNextTargetSU is useful for determining which SU on this
224	/// HardwareUnit we are trying to schedule - this info helps us determine
225	/// which dependencies to schedule. LookDeep is useful if the dependencies are
226	/// long latency (e.g. memory instructions). If we have many long latency
227	/// dependencies, it is beneficial to enable SUs multiple levels ahead.
228	SUnit getNextTargetSU(bool* LookDeep = false) const;
229	/// Insert the \p SU into AllSUs and account its \p BlockingCycles into
230	/// the TotalCycles. This maintains the list of PrioritySUs.
231	void insert(SUnit SU, unsigned* BlockingCycles);
232	/// Update the state for \p SU being scheduled by removing it from the AllSUs
233	/// and reducing its \p BlockingCycles from the TotalCycles. This maintains
234	/// the list of PrioritySUs.
235	void markScheduled(SUnit SU, unsigned* BlockingCycles);
236	};
237
238	//===----------------------------------------------------------------------===//
239	// Candidate Heuristics
240	//===----------------------------------------------------------------------===//
241
242	/// CandidateHeuristics contains state and implementations to facilitate making
243	/// per instruction scheduling decisions; it contains methods used in
244	/// tryCandidate to decide which instruction to schedule next.
245	class CandidateHeuristics {
246	protected:
247	ScheduleDAGMI *DAG;
248	const SIInstrInfo *SII;
249	const SIRegisterInfo *SRI;
250	const TargetSchedModel *SchedModel;
251	SmallVector<HardwareUnitInfo, `8`> HWUInfo;
252
253	/// Walk over the region and collect total usage per HardwareUnit.
254	void collectHWUIPressure();
255
256	/// Compute the blocking cycles for the appropriate HardwareUnit given an \p
257	/// SU.
258	unsigned getHWUICyclesForInst(SUnit *SU);
259
260	/// Given a \p Flavor , find the corresponding HardwareUnit. \returns the
261	/// mapped HardwareUnit.
262	HardwareUnitInfo *getHWUIFromFlavor(AMDGPU::InstructionFlavor Flavor);
263
264	public:
265	CandidateHeuristics() = default;
266
267	void initialize(ScheduleDAGMI DAG, const* TargetSchedModel *SchedModel,
268	const TargetRegisterInfo *TRI);
269
270	/// Update the state to reflect that \p SU is going to be scheduled.
271	void updateForScheduling(SUnit *SU);
272
273	/// Sort the HWUInfo vector. After sorting, the HardwareUnits that are highest
274	/// priority are first. Priority is determined by maximizing coexecution and
275	/// keeping the critical HardwareUnit busy.
276	void sortHWUIResources();
277
278	/// Check for critical resource consumption. Prefer the candidate that uses
279	/// the most prioritized HardwareUnit. If both candidates use the same
280	/// HarwareUnit, prefer the candidate with higher priority on that
281	/// HardwareUnit.
282	bool tryCriticalResource(GenericSchedulerBase::SchedCandidate &TryCand,
283	GenericSchedulerBase::SchedCandidate &Cand,
284	SchedBoundary Zone) const*;
285
286	/// Check for dependencies of instructions that use prioritized HardwareUnits.
287	/// Prefer the candidate that is a dependency of an instruction that uses the
288	/// most prioritized HardwareUnit. If both candidates enable the same
289	/// HardwareUnit, prefer the candidate that enables the higher priority
290	/// instruction on that HardwareUnit.
291	bool
292	tryCriticalResourceDependency(GenericSchedulerBase::SchedCandidate &TryCand,
293	GenericSchedulerBase::SchedCandidate &Cand,
294	SchedBoundary Zone) const*;
295
296	void dumpRegionSummary();
297	};
298
299	class AMDGPUCoExecSchedStrategy final : public GCNSchedStrategy {
300	protected:
301	bool tryEffectiveStall(SchedCandidate &Cand, SchedCandidate &TryCand,
302	SchedBoundary &Zone) const;
303	AMDGPU::AMDGPUSchedReason LastAMDGPUReason = AMDGPU::AMDGPUSchedReason::None;
304	CandidateHeuristics Heurs;
305
306	#ifndef NDEBUG
307	void dumpPickSummary(SUnit SU, bool* IsTopNode, SchedCandidate &Cand);
308	#endif
309
310	bool tryCandidateCoexec(SchedCandidate &Cand, SchedCandidate &TryCand,
311	SchedBoundary *Zone);
312	void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
313	const RegPressureTracker &RPTracker,
314	SchedCandidate &Cand, bool &PickedPending,
315	bool IsBottomUp);
316
317	public:
318	AMDGPUCoExecSchedStrategy(const MachineSchedContext *C);
319
320	void initPolicy(MachineBasicBlock::iterator Begin,
321	MachineBasicBlock::iterator End,
322	unsigned NumRegionInstrs) override;
323	void initialize(ScheduleDAGMI *DAG) override;
324	SUnit pickNode(bool* &IsTopNode) override;
325	void schedNode(SUnit SU, bool* IsTopNode) override;
326	};
327
328	ScheduleDAGInstrs createGCNCoExecMachineScheduler(MachineSchedContext C);
329	ScheduleDAGInstrs createGCNNoopPostMachineScheduler(MachineSchedContext C);
330
331	} // End namespace llvm
332
333	#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
334

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h