AMDGPUIGroupLP.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp]

1	//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// \file This file defines a set of schedule DAG mutations that can be used to
10	// override default scheduler behavior to enforce specific scheduling patterns.
11	// They should be used in cases where runtime performance considerations such as
12	// inter-wavefront interactions, mean that compile-time heuristics cannot
13	// predict the optimal instruction ordering, or in kernels where optimum
14	// instruction scheduling is important enough to warrant manual intervention.
15	//
16	//===----------------------------------------------------------------------===//
17
18	#include "AMDGPUIGroupLP.h"
19	#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20	#include "SIInstrInfo.h"
21	#include "SIMachineFunctionInfo.h"
22	#include "llvm/ADT/BitmaskEnum.h"
23	#include "llvm/ADT/DenseMap.h"
24	#include "llvm/CodeGen/MachineScheduler.h"
25	#include "llvm/CodeGen/TargetOpcodes.h"
26
27	#include <type_traits>
28
29	using namespace llvm;
30	using namespace llvm::AMDGPU;
31
32	#define DEBUG_TYPE "igrouplp"
33
34	namespace {
35
36	static cl::opt<bool> EnableExactSolver(
37	"amdgpu-igrouplp-exact-solver", cl::Hidden,
38	cl::desc("Whether to use the exponential time solver to fit "
39	"the instructions to the pipeline as closely as "
40	"possible."),
41	cl::init(Val: false));
42
43	static cl::opt<unsigned> CutoffForExact(
44	"amdgpu-igrouplp-exact-solver-cutoff", cl::init(Val: `0`), cl::Hidden,
45	cl::desc("The maximum number of scheduling group conflicts "
46	"which we attempt to solve with the exponential time "
47	"exact solver. Problem sizes greater than this will"
48	"be solved by the less accurate greedy algorithm. Selecting "
49	"solver by size is superseded by manually selecting "
50	"the solver (e.g. by amdgpu-igrouplp-exact-solver"));
51
52	static cl::opt<uint64_t> MaxBranchesExplored(
53	"amdgpu-igrouplp-exact-solver-max-branches", cl::init(Val: `0`), cl::Hidden,
54	cl::desc("The amount of branches that we are willing to explore with"
55	"the exact algorithm before giving up."));
56
57	static cl::opt<bool> UseCostHeur(
58	"amdgpu-igrouplp-exact-solver-cost-heur", cl::init(Val: true), cl::Hidden,
59	cl::desc("Whether to use the cost heuristic to make choices as we "
60	"traverse the search space using the exact solver. Defaulted "
61	"to on, and if turned off, we will use the node order -- "
62	"attempting to put the later nodes in the later sched groups. "
63	"Experimentally, results are mixed, so this should be set on a "
64	"case-by-case basis."));
65
66	// Components of the mask that determines which instruction types may be may be
67	// classified into a SchedGroup.
68	enum class SchedGroupMask {
69	NONE = `0u`,
70	ALU = `1u` << `0`,
71	VALU = `1u` << `1`,
72	SALU = `1u` << `2`,
73	MFMA = `1u` << `3`,
74	VMEM = `1u` << `4`,
75	VMEM_READ = `1u` << `5`,
76	VMEM_WRITE = `1u` << `6`,
77	DS = `1u` << `7`,
78	DS_READ = `1u` << `8`,
79	DS_WRITE = `1u` << `9`,
80	TRANS = `1u` << `10`,
81	LDSDMA = `1u` << `11`,
82	ALL = ALU \| VALU \| SALU \| MFMA \| VMEM \| VMEM_READ \| VMEM_WRITE \| DS \|
83	DS_READ \| DS_WRITE \| TRANS \| LDSDMA,
84	LLVM_MARK_AS_BITMASK_ENUM(/ LargestFlag = / ALL)
85	};
86
87	class SchedGroup;
88
89	// InstructionRule class is used to enact a filter which determines whether or
90	// not an SU maps to a given SchedGroup. It contains complementary data
91	// structures (e.g Cache) to help those filters.
92	class InstructionRule {
93	protected:
94	const SIInstrInfo *TII;
95	unsigned SGID;
96	// A cache made available to the Filter to store SUnits for subsequent
97	// invocations of the Filter
98	std::optional<SmallVector<SUnit *, `4`>> Cache;
99
100	public:
101	virtual bool
102	apply(const SUnit , const* ArrayRef<SUnit *>,
103	SmallVectorImpl<SchedGroup> &) {
104	return true;
105	};
106
107	InstructionRule(const SIInstrInfo TII, unsigned* SGID,
108	bool NeedsCache = false)
109	: TII(TII), SGID(SGID) {
110	if (NeedsCache) {
111	Cache = SmallVector<SUnit *, `4`>();
112	}
113	}
114
115	virtual ~InstructionRule() = default;
116	};
117
118	using SUnitsToCandidateSGsMap = DenseMap<SUnit , SmallVector<int*, `4`>>;
119
120	// Classify instructions into groups to enable fine tuned control over the
121	// scheduler. These groups may be more specific than current SchedModel
122	// instruction classes.
123	class SchedGroup {
124	private:
125	// Mask that defines which instruction types can be classified into this
126	// SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
127	// and SCHED_GROUP_BARRIER.
128	SchedGroupMask SGMask;
129
130	// Maximum number of SUnits that can be added to this group.
131	std::optional<unsigned> MaxSize;
132
133	// SchedGroups will only synchronize with other SchedGroups that have the same
134	// SyncID.
135	int SyncID = `0`;
136
137	// SGID is used to map instructions to candidate SchedGroups
138	unsigned SGID;
139
140	// The different rules each instruction in this SchedGroup must conform to
141	SmallVector<std::shared_ptr<InstructionRule>, `4`> Rules;
142
143	// Count of the number of created SchedGroups, used to initialize SGID.
144	static unsigned NumSchedGroups;
145
146	// Use SGMask to determine whether we can classify MI as a member of this
147	// SchedGroup object.
148	bool canAddMI(const MachineInstr &MI) const;
149
150	public:
151	// Collection of SUnits that are classified as members of this group.
152	SmallVector<SUnit *, `32`> Collection;
153
154	ScheduleDAGInstrs *DAG;
155	const SIInstrInfo *TII;
156
157	// Try to add and edge from SU A to SU B.
158	bool tryAddEdge(SUnit A, SUnit B);
159
160	// Returns true if SU can be added to this SchedGroup.
161	bool canAddSU(SUnit &SU) const;
162
163	// Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
164	// MakePred is true, SU will be a predecessor of the SUnits in this
165	// SchedGroup, otherwise SU will be a successor.
166	void link(SUnit &SU, bool MakePred = false);
167
168	// Add DAG dependencies and track which edges are added, and the count of
169	// missed edges
170	int link(SUnit &SU, bool MakePred,
171	std::list<std::pair<SUnit , SUnit >> &AddedEdges);
172
173	// Add DAG dependencies from all SUnits in this SchedGroup and this SU.
174	// Use the predicate to determine whether SU should be a predecessor (P =
175	// true) or a successor (P = false) of this SchedGroup.
176	void link(SUnit &SU, function_ref<bool(const SUnit A, const* SUnit *B)> P);
177
178	// Add DAG dependencies such that SUnits in this group shall be ordered
179	// before SUnits in OtherGroup.
180	void link(SchedGroup &OtherGroup);
181
182	// Returns true if no more instructions may be added to this group.
183	bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }
184
185	// Append a constraint that SUs must meet in order to fit into this
186	// SchedGroup. Since many rules involve the relationship between a SchedGroup
187	// and the SUnits in other SchedGroups, rules are checked at Pipeline Solve
188	// time (rather than SchedGroup init time.)
189	void addRule(std::shared_ptr<InstructionRule> NewRule) {
190	Rules.push_back(Elt: NewRule);
191	}
192
193	// Returns true if the SU matches all rules
194	bool allowedByRules(const SUnit *SU,
195	SmallVectorImpl<SchedGroup> &SyncPipe) const {
196	for (auto &Rule : Rules) {
197	if (!Rule ->apply(SU, Collection, SyncPipe))
198	return false;
199	}
200	return true;
201	}
202
203	// Add SU to the SchedGroup.
204	void add(SUnit &SU) {
205	LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
206	<< format_hex((int)SGMask, `10`, true) << " adding "
207	<< *SU.getInstr());
208	Collection.push_back(Elt: &SU);
209	}
210
211	// Remove last element in the SchedGroup
212	void pop() { Collection.pop_back(); }
213
214	template <class T>
215	void findCandidateSUnits(T Begin, T End,
216	SUnitsToCandidateSGsMap &SyncedInstrs);
217
218	/// Find each SUnit in the DAG that could potentially be added to
219	/// this SchedGroup and add the SGID to the candidate SchedGroups
220	/// for SU in \p SyncedInstrs.
221	void findCandidateSUnits(SUnitsToCandidateSGsMap &SyncedInstrs);
222
223	int getSyncID() { return SyncID; }
224
225	int getSGID() { return SGID; }
226
227	SchedGroupMask getMask() { return SGMask; }
228
229	SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
230	ScheduleDAGInstrs DAG, const* SIInstrInfo *TII)
231	: SGMask(SGMask), MaxSize (MaxSize), DAG(DAG), TII(TII) {
232	SGID = NumSchedGroups++;
233	}
234
235	SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
236	ScheduleDAGInstrs DAG, const* SIInstrInfo *TII)
237	: SGMask(SGMask), MaxSize (MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
238	SGID = NumSchedGroups++;
239	}
240	};
241
242	using SUToCandSGsPair = std::pair<SUnit , SmallVector<int*, `4`>>;
243	using SUsToCandSGsVec = SmallVector<SUToCandSGsPair, `4`>;
244
245	// The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
246	// in non-trivial cases. For example, if the requested pipeline is
247	// {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
248	// in the DAG, then we will have an instruction that can not be trivially
249	// assigned to a SchedGroup. The PipelineSolver class implements two algorithms
250	// to find a good solution to the pipeline -- a greedy algorithm and an exact
251	// algorithm. The exact algorithm has an exponential time complexity and should
252	// only be used for small sized problems or medium sized problems where an exact
253	// solution is highly desired.
254	class PipelineSolver {
255	[[maybe_unused]] ScheduleDAGMI *DAG;
256
257	// Instructions that can be assigned to multiple SchedGroups
258	DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
259	SmallVector<SUsToCandSGsVec, `4`> PipelineInstrs;
260	DenseMap<int, SmallVector<SchedGroup, `4`>> SyncedSchedGroups;
261	// The current working pipeline
262	SmallVector<SmallVector<SchedGroup, `4`>, `4`> CurrPipeline;
263	// The pipeline that has the best solution found so far
264	SmallVector<SmallVector<SchedGroup, `4`>, `4`> BestPipeline;
265
266	// Whether or not we actually have any SyncedInstrs to try to solve.
267	bool NeedsSolver = false;
268
269	// Compute an estimate of the size of search tree -- the true size is
270	// the product of each conflictedInst.Matches.size() across all SyncPipelines
271	unsigned computeProblemSize();
272
273	// The cost penalty of not assigning a SU to a SchedGroup
274	int MissPenalty = `0`;
275
276	// Costs in terms of the number of edges we are unable to add
277	int BestCost = -`1`;
278	int CurrCost = `0`;
279
280	// Index pointing to the conflicting instruction that is currently being
281	// fitted
282	int CurrConflInstNo = `0`;
283	// Index to the pipeline that is currently being fitted
284	int CurrSyncGroupIdx = `0`;
285	// The first non trivial pipeline
286	int BeginSyncGroupIdx = `0`;
287
288	// How many branches we have explored
289	uint64_t BranchesExplored = `0`;
290
291	// The direction in which we process the candidate SchedGroups per SU
292	bool IsBottomUp = true;
293
294	// Update indices to fit next conflicting instruction
295	void advancePosition();
296	// Recede indices to attempt to find better fit for previous conflicting
297	// instruction
298	void retreatPosition();
299
300	// The exponential time algorithm which finds the provably best fit
301	bool solveExact();
302	// The polynomial time algorithm which attempts to find a good fit
303	bool solveGreedy();
304	// Find the best SchedGroup for the current SU using the heuristic given all
305	// current information. One step in the greedy algorithm. Templated against
306	// the SchedGroup iterator (either reverse or forward).
307	template <typename T>
308	void greedyFind(std::list<std::pair<SUnit , SUnit >> &AddedEdges, T I, T E);
309	// Whether or not the current solution is optimal
310	bool checkOptimal();
311	// Populate the ready list, prioiritizing fewest missed edges first
312	// Templated against the SchedGroup iterator (either reverse or forward).
313	template <typename T>
314	void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,
315	T E);
316	// Add edges corresponding to the SchedGroups as assigned by solver
317	void makePipeline();
318	// Link the SchedGroups in the best found pipeline.
319	// Tmplated against the SchedGroup iterator (either reverse or forward).
320	template <typename T> void linkSchedGroups(T I, T E);
321	// Add the edges from the SU to the other SchedGroups in pipeline, and
322	// return the number of edges missed.
323	int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit SU, int* SGID,
324	std::list<std::pair<SUnit , SUnit >> &AddedEdges);
325
326	/// This class is used to build the edge set implied by an
327	/// assignment of an SUnit to a SchedGroup and to compute the cost
328	/// (edges that cannot be assigned without introducing cycles) of
329	/// the assignment.
330	class EdgeSetBuilder {
331	SUnit *SU;
332	SmallVectorImpl<SchedGroup> &SyncPipeline;
333	bool IsBottomUp;
334	DenseSet<SUnit *> InitialPreds;
335	DenseSet<SUnit *> Succs;
336	bool Initialized = false;
337
338	/// Compute reachability via DFS. If ComputePreds is true, follows
339	/// predecessor edges; otherwise follows successor edges.
340	template <bool ComputePreds>
341	static void computeReachable(DenseSet<SUnit > &Reachable, SUnit Start);
342
343	/// Compute all nodes that can reach Start via predecessor edges, including
344	/// Start itself.
345	static void computePreds(DenseSet<SUnit > &Preds, SUnit Start);
346
347	/// Compute all nodes reachable from Start via successor edges, including
348	/// Start itself.
349	static void computeSuccs(DenseSet<SUnit > &Succs, SUnit Start);
350
351	public:
352	EdgeSetBuilder(SUnit *SU, SmallVectorImpl<SchedGroup> &SyncPipeline,
353	bool IsBottomUp)
354	: SU(SU), SyncPipeline(SyncPipeline), IsBottomUp(IsBottomUp) {}
355
356	/// Determine the edges implied by assigning SU to the SchedGroup
357	/// with ID SGID. Edges are added to NewEdges unless they
358	/// introduce cycles. Return the number of edges that cannot be
359	/// added.
360	int build(int SGID, std::list<std::pair<SUnit , SUnit >> &NewEdges);
361
362	private:
363	template <typename T>
364	int buildImpl(int SGID, const iterator_range<T> SchedGroups,
365	std::list<std::pair<SUnit , SUnit >> &NewEdges);
366	};
367
368	/// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It
369	/// returns the cost (in terms of missed pipeline edges), and tracks the edges
370	/// added in \p AddedEdges
371	template <typename T>
372	int linkSUnit(SUnit SU, int* SGID,
373	std::list<std::pair<SUnit , SUnit >> &AddedEdges, T I, T E);
374	/// Remove the edges passed via \p AddedEdges
375	void removeEdges(const std::list<std::pair<SUnit , SUnit >> &AddedEdges);
376	// Convert the passed in maps to arrays for bidirectional iterators
377	void convertSyncMapsToArrays();
378
379	void reset();
380
381	public:
382	// Invoke the solver to map instructions to instruction groups. Heuristic &&
383	// command-line-option determines to use exact or greedy algorithm.
384	void solve();
385
386	PipelineSolver(DenseMap<int, SmallVector<SchedGroup, `4`>> &SyncedSchedGroups,
387	DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
388	ScheduleDAGMI DAG, bool* IsBottomUp = true)
389	: DAG(DAG), SyncedInstrs (SyncedInstrs),
390	SyncedSchedGroups (SyncedSchedGroups), IsBottomUp(IsBottomUp) {
391
392	for (auto &PipelineInstrs : SyncedInstrs) {
393	if (!PipelineInstrs.second.empty()) {
394	NeedsSolver = true;
395	break;
396	}
397	}
398
399	if (!NeedsSolver)
400	return;
401
402	convertSyncMapsToArrays();
403
404	CurrPipeline = BestPipeline;
405
406	while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
407	PipelineInstrs [BeginSyncGroupIdx].empty())
408	++BeginSyncGroupIdx;
409
410	if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
411	return;
412	}
413	};
414
415	void PipelineSolver::reset() {
416
417	for (auto &SyncPipeline : CurrPipeline) {
418	for (auto &SG : SyncPipeline) {
419	SmallVector<SUnit *, `32`> TempCollection = SG.Collection;
420	SG.Collection.clear();
421	auto SchedBarr = llvm::find_if(Range&: TempCollection, P: [](SUnit SU) {
422	return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
423	});
424	if (SchedBarr != TempCollection.end())
425	SG.Collection.push_back(Elt: *SchedBarr);
426	}
427	}
428
429	CurrSyncGroupIdx = BeginSyncGroupIdx;
430	CurrConflInstNo = `0`;
431	CurrCost = `0`;
432	}
433
434	void PipelineSolver::convertSyncMapsToArrays() {
435	for (auto &SyncPipe : SyncedSchedGroups) {
436	BestPipeline.insert(I: BestPipeline.begin(), Elt: SyncPipe.second);
437	}
438
439	int PipelineIDx = SyncedInstrs.size() - `1`;
440	PipelineInstrs.resize(N: SyncedInstrs.size());
441	for (auto &SyncInstrMap : SyncedInstrs) {
442	for (auto &SUsToCandSGs : SyncInstrMap.second) {
443	if (PipelineInstrs [PipelineIDx].empty()) {
444	PipelineInstrs [PipelineIDx].push_back(
445	Elt: std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
446	continue;
447	}
448	auto *SortPosition = PipelineInstrs [PipelineIDx].begin();
449	// Insert them in sorted order -- this allows for good parsing order in
450	// the greedy algorithm
451	while (SortPosition != PipelineInstrs [PipelineIDx].end() &&
452	SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
453	++SortPosition;
454	PipelineInstrs [PipelineIDx].insert(
455	I: SortPosition, Elt: std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
456	}
457	--PipelineIDx;
458	}
459	}
460
461	template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) {
462	for (; I != E; ++I) {
463	auto &GroupA = *I;
464	for (auto J = std::next(I); J != E; ++J) {
465	auto &GroupB = *J;
466	GroupA.link(GroupB);
467	}
468	}
469	}
470
471	void PipelineSolver::makePipeline() {
472	// Preserve the order of barrier for subsequent SchedGroupBarrier mutations
473	for (auto &SyncPipeline : BestPipeline) {
474	LLVM_DEBUG(dbgs() << "Printing SchedGroups\n");
475	for (auto &SG : SyncPipeline) {
476	LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID()
477	<< " has: \n");
478	SUnit SGBarr = nullptr*;
479	for (auto &SU : SG.Collection) {
480	if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
481	SGBarr = SU;
482	LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n");
483	}
484	// Command line requested IGroupLP doesn't have SGBarr
485	if (!SGBarr)
486	continue;
487	SG.link(SU&: SGBarr, MakePred: false*);
488	}
489	}
490
491	for (auto &SyncPipeline : BestPipeline) {
492	IsBottomUp ? linkSchedGroups(I: SyncPipeline.rbegin(), E: SyncPipeline.rend())
493	: linkSchedGroups(I: SyncPipeline.begin(), E: SyncPipeline.end());
494	}
495	}
496
497	template <typename T>
498	int PipelineSolver::linkSUnit(
499	SUnit SU, int* SGID, std::list<std::pair<SUnit , SUnit >> &AddedEdges,
500	T I, T E) {
501	bool MakePred = false;
502	int AddedCost = `0`;
503	for (; I < E; ++I) {
504	if (I->getSGID() == SGID) {
505	MakePred = true;
506	continue;
507	}
508	auto Group = *I;
509	AddedCost += Group.link(*SU, MakePred, AddedEdges);
510	assert(AddedCost >= `0`);
511	}
512	return AddedCost;
513	}
514
515	template <bool ComputePreds>
516	void PipelineSolver::EdgeSetBuilder::computeReachable(
517	DenseSet<SUnit > &Reachable, SUnit Start) {
518	if (!Reachable.insert(V: Start).second)
519	return;
520
521	SmallVector<SUnit *, `32`> WorkList = {Start};
522
523	while (!WorkList.empty()) {
524	SUnit *Current = WorkList.pop_back_val();
525
526	for (const SDep &Dep : ComputePreds ? Current->Preds : Current->Succs) {
527	if (Reachable.insert(V: Dep.getSUnit()).second)
528	WorkList.push_back(Elt: Dep.getSUnit());
529	}
530	}
531	}
532
533	void PipelineSolver::EdgeSetBuilder::computePreds(DenseSet<SUnit *> &Preds,
534	SUnit *Start) {
535	computeReachable</ComputePreds/ true>(Reachable&: Preds, Start);
536	}
537
538	void PipelineSolver::EdgeSetBuilder::computeSuccs(DenseSet<SUnit *> &Succs,
539	SUnit *Start) {
540	computeReachable</ComputePreds/ false>(Reachable&: Succs, Start);
541	}
542
543	int PipelineSolver::EdgeSetBuilder::build(
544	int SGID, std::list<std::pair<SUnit , SUnit >> &NewEdges) {
545	if (!Initialized) {
546	computePreds(Preds&: InitialPreds, Start: SU);
547	computeSuccs(Succs, Start: SU);
548	Initialized = true;
549	}
550
551	// See comment in addEdges concerning the iterator direction.
552	return IsBottomUp ? buildImpl(SGID, SchedGroups: reverse(C&: SyncPipeline), NewEdges)
553	: buildImpl(SGID,
554	SchedGroups: llvm::make_range(x: SyncPipeline.begin(),
555	y: SyncPipeline.end()),
556	NewEdges);
557	}
558
559	template <typename T>
560	int PipelineSolver::EdgeSetBuilder::buildImpl(
561	int SGID, iterator_range<T> SchedGroups,
562	std::list<std::pair<SUnit , SUnit >> &NewEdges) {
563
564	// Determine the edges that will be added to the DAG if SU is
565	// assigned to the SchedGroup SG with the given SGID. It might be
566	// impossible to add some edges because they would introduce
567	// cycles. The number of such edges is counted and returned, all
568	// other edges are added to NewEdges.
569	//
570	// SU is made a successor of SUnits in SchedGroups before SG, and a
571	// predecessor of SUnits after SG. In each case, the cycle check
572	// requires reachability information for the opposing direction.
573
574	// Nodes U that can reach SU (U ~> SU).
575	// Will be extended as new edges are added and hence cannot be
576	// shared between calls to this function, in contrast to Succs.
577	DenseSet<SUnit *> Preds = InitialPreds;
578
579	int MissedEdges = `0`;
580	bool MakePred = false;
581	for (SchedGroup &SG : SchedGroups) {
582	if (SG.getSGID() == SGID) {
583	MakePred = true;
584	continue;
585	}
586
587	for (SUnit *A : SG.Collection) {
588	if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
589	continue;
590
591	if (MakePred) {
592	// Try add SU -> A.
593	if (Preds.contains(V: A)) { // Would add cycle since A ~> SU.
594	++MissedEdges;
595	continue;
596	}
597	// Succs does not need to be updated, since it will not be
598	// queried after entering the MakePred case.
599	NewEdges.emplace_back(args&: SU, args&: A);
600	continue;
601	}
602
603	// Try add A -> SU.
604	if (Succs.contains(V: A)) { // Would add cycle since SU ~> A.
605	++MissedEdges;
606	continue;
607	}
608	NewEdges.emplace_back(args&: A, args&: SU);
609	computePreds(Preds, Start: A);
610	}
611	}
612
613	return MissedEdges;
614	}
615
616	int PipelineSolver::addEdges(
617	SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit SU, int* SGID,
618	std::list<std::pair<SUnit , SUnit >> &AddedEdges) {
619
620	// For IsBottomUp, the first SchedGroup in SyncPipeline contains the
621	// instructions that are the ultimate successors in the resultant mutation.
622	// Therefore, in such a configuration, the SchedGroups occurring before the
623	// candidate SGID are successors of the candidate SchedGroup, thus the current
624	// SU should be linked as a predecessor to SUs in those SchedGroups. The
625	// opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple
626	// SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using
627	// IsBottomUp (in reverse).
628	return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, I: SyncPipeline.rbegin(),
629	E: SyncPipeline.rend())
630	: linkSUnit(SU, SGID, AddedEdges, I: SyncPipeline.begin(),
631	E: SyncPipeline.end());
632	}
633
634	void PipelineSolver::removeEdges(
635	const std::list<std::pair<SUnit , SUnit >> &EdgesToRemove) {
636	// Only remove the edges that we have added when testing
637	// the fit.
638	for (auto &PredSuccPair : EdgesToRemove) {
639	SUnit *Pred = PredSuccPair.first;
640	SUnit *Succ = PredSuccPair.second;
641
642	auto *Match = llvm::find_if(Range&: Succ->Preds, P: [&Pred](SDep &P) {
643	return P.getSUnit() == Pred && P.isArtificial();
644	});
645	if (Match != Succ->Preds.end())
646	Succ->removePred(D: *Match);
647	}
648	}
649
650	void PipelineSolver::advancePosition() {
651	++CurrConflInstNo;
652
653	if (static_cast<size_t>(CurrConflInstNo) >=
654	PipelineInstrs [CurrSyncGroupIdx].size()) {
655	CurrConflInstNo = `0`;
656	++CurrSyncGroupIdx;
657	// Advance to next non-trivial pipeline
658	while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
659	PipelineInstrs [CurrSyncGroupIdx].empty())
660	++CurrSyncGroupIdx;
661	}
662	}
663
664	void PipelineSolver::retreatPosition() {
665	assert(CurrConflInstNo >= `0`);
666	assert(CurrSyncGroupIdx >= `0`);
667
668	if (CurrConflInstNo > `0`) {
669	--CurrConflInstNo;
670	return;
671	}
672
673	if (CurrConflInstNo == `0`) {
674	// If we return to the starting position, we have explored
675	// the entire tree
676	if (CurrSyncGroupIdx == BeginSyncGroupIdx)
677	return;
678
679	--CurrSyncGroupIdx;
680	// Go to previous non-trivial pipeline
681	while (PipelineInstrs [CurrSyncGroupIdx].empty())
682	--CurrSyncGroupIdx;
683
684	CurrConflInstNo = PipelineInstrs [CurrSyncGroupIdx].size() - `1`;
685	}
686	}
687
688	bool PipelineSolver::checkOptimal() {
689	if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
690	if (BestCost == -`1` \|\| CurrCost < BestCost) {
691	BestPipeline = CurrPipeline;
692	BestCost = CurrCost;
693	LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
694	}
695	assert(BestCost >= `0`);
696	}
697
698	bool DoneExploring = false;
699	if (MaxBranchesExplored > `0` && BranchesExplored >= MaxBranchesExplored)
700	DoneExploring = true;
701
702	return (DoneExploring \|\| BestCost == `0`);
703	}
704
705	template <typename T>
706	void PipelineSolver::populateReadyList(
707	SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) {
708	SUToCandSGsPair CurrSU = PipelineInstrs [CurrSyncGroupIdx][CurrConflInstNo];
709	auto SyncPipeline = CurrPipeline [CurrSyncGroupIdx];
710	assert(CurrSU.second.size() >= `1`);
711
712	for (; I != E; ++I) {
713	std::list<std::pair<SUnit , SUnit >> AddedEdges;
714	int CandSGID = *I;
715	SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
716	return SG.getSGID() == CandSGID;
717	});
718	assert(Match);
719
720	if (UseCostHeur) {
721	if (Match->isFull()) {
722	ReadyList.push_back(Elt: std::pair(*I, MissPenalty));
723	continue;
724	}
725
726	int TempCost = addEdges(SyncPipeline, SU: CurrSU.first, SGID: CandSGID, AddedEdges);
727	ReadyList.push_back(Elt: std::pair(*I, TempCost));
728	removeEdges(EdgesToRemove: AddedEdges);
729	} else
730	ReadyList.push_back(Elt: std::pair(*I, -`1`));
731	}
732
733	if (UseCostHeur)
734	std::sort(first: ReadyList.begin(), last: ReadyList.end(), comp: llvm::less_second());
735
736	assert(ReadyList.size() == CurrSU.second.size());
737	}
738
739	bool PipelineSolver::solveExact() {
740	if (checkOptimal())
741	return true;
742
743	if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
744	return false;
745
746	assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
747	assert(static_cast<size_t>(CurrConflInstNo) <
748	PipelineInstrs[CurrSyncGroupIdx].size());
749	SUToCandSGsPair CurrSU = PipelineInstrs [CurrSyncGroupIdx][CurrConflInstNo];
750	LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
751	<< ") in Pipeline # " << CurrSyncGroupIdx << "\n");
752
753	// SchedGroup -> Cost pairs
754	SmallVector<std::pair<int, int>, `4`> ReadyList;
755	// Prioritize the candidate sched groups in terms of lowest cost first
756	IsBottomUp ? populateReadyList(ReadyList, I: CurrSU.second.rbegin(),
757	E: CurrSU.second.rend())
758	: populateReadyList(ReadyList, I: CurrSU.second.begin(),
759	E: CurrSU.second.end());
760
761	auto *I = ReadyList.begin();
762	auto *E = ReadyList.end();
763	for (; I != E; ++I) {
764	// If we are trying SGs in least cost order, and the current SG is cost
765	// infeasible, then all subsequent SGs will also be cost infeasible, so we
766	// can prune.
767	if (BestCost != -`1` && (CurrCost + I->second > BestCost))
768	return false;
769
770	int CandSGID = I->first;
771	int AddedCost = `0`;
772	std::list<std::pair<SUnit , SUnit >> AddedEdges;
773	auto &SyncPipeline = CurrPipeline [CurrSyncGroupIdx];
774	SchedGroup *Match = llvm::find_if(Range&: SyncPipeline, P: [CandSGID](SchedGroup &SG) {
775	return SG.getSGID() == CandSGID;
776	});
777	assert(Match);
778
779	if (Match->isFull())
780	continue;
781
782	if (!Match->allowedByRules(SU: CurrSU.first, SyncPipe&: SyncPipeline))
783	continue;
784
785	LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
786	<< (int)Match->getMask() << "and ID " << CandSGID
787	<< "\n");
788	Match->add(SU&: *CurrSU.first);
789	AddedCost = addEdges(SyncPipeline, SU: CurrSU.first, SGID: CandSGID, AddedEdges);
790	LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
791	CurrCost += AddedCost;
792	advancePosition();
793	++BranchesExplored;
794	bool FinishedExploring = false;
795	// If the Cost after adding edges is greater than a known solution,
796	// backtrack
797	if (CurrCost < BestCost \|\| BestCost == -`1`) {
798	if (solveExact()) {
799	FinishedExploring = BestCost != `0`;
800	if (!FinishedExploring)
801	return true;
802	}
803	}
804
805	retreatPosition();
806	CurrCost -= AddedCost;
807	removeEdges(EdgesToRemove: AddedEdges);
808	Match->pop();
809	CurrPipeline [CurrSyncGroupIdx] = SyncPipeline;
810	if (FinishedExploring)
811	return true;
812	}
813
814	// Try the pipeline where the current instruction is omitted
815	// Potentially if we omit a problematic instruction from the pipeline,
816	// all the other instructions can nicely fit.
817	CurrCost += MissPenalty;
818	advancePosition();
819
820	LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");
821
822	bool FinishedExploring = false;
823	if (CurrCost < BestCost \|\| BestCost == -`1`) {
824	if (solveExact()) {
825	bool FinishedExploring = BestCost != `0`;
826	if (!FinishedExploring)
827	return true;
828	}
829	}
830
831	retreatPosition();
832	CurrCost -= MissPenalty;
833	return FinishedExploring;
834	}
835
836	template <typename T>
837	void PipelineSolver::greedyFind(
838	std::list<std::pair<SUnit , SUnit >> &AddedEdges, T I, T E) {
839	SUToCandSGsPair CurrSU = PipelineInstrs [CurrSyncGroupIdx][CurrConflInstNo];
840
841	struct GroupInfo {
842	SchedGroup *SG;
843	std::list<std::pair<SUnit , SUnit >> Edges;
844	int Cost = `0`;
845	};
846	std::optional<GroupInfo> Best;
847
848	auto &SyncPipeline = CurrPipeline [CurrSyncGroupIdx];
849	LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
850	<< ") in Pipeline # " << CurrSyncGroupIdx << "\n");
851
852	EdgeSetBuilder Builder(CurrSU.first, SyncPipeline, IsBottomUp);
853
854	// Since we have added the potential SchedGroups from bottom up, but
855	// traversed the DAG from top down, parse over the groups from last to
856	// first. If we fail to do this for the greedy algorithm, the solution will
857	// likely not be good in more complex cases.
858	for (; I != E; ++I) {
859	int CandSGID = *I;
860	SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
861	return SG.getSGID() == CandSGID;
862	});
863	assert(Match);
864
865	LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
866	<< (int)Match->getMask() << "\n");
867
868	if (Match->isFull()) {
869	LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
870	continue;
871	}
872	if (!Match->allowedByRules(SU: CurrSU.first, SyncPipe&: SyncPipeline)) {
873	LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n");
874	continue;
875	}
876
877	std::list<std::pair<SUnit , SUnit >> TempEdges;
878	int TempCost = Builder.build(SGID: CandSGID, NewEdges&: TempEdges);
879	LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
880
881	if (!Best \|\| TempCost < Best->Cost) {
882	Best = {Match, TempEdges, TempCost};
883	if (Best->Cost == `0`)
884	break;
885	}
886	}
887
888	if (Best) {
889	SchedGroup *SG = Best->SG;
890	std::list<std::pair<SUnit , SUnit >> &Edges = Best->Edges;
891
892	SG->add(SU&: *CurrSU.first);
893	if (AddedEdges.empty())
894	AddedEdges = Edges;
895	else
896	AddedEdges.splice(position: std::prev(x: AddedEdges.cend()), x&: Edges);
897
898	for (const std::pair<SUnit , SUnit > &E : Edges) {
899	if (!SG->tryAddEdge(A: E.first, B: E.second))
900	llvm_unreachable("Edges known to be insertable.");
901	}
902
903	LLVM_DEBUG(dbgs() << "Best Group has ID: " << SG->getSGID() << " and Mask"
904	<< (int)SG->getMask() << "\n");
905	BestCost += Best->Cost;
906	} else
907	BestCost += MissPenalty;
908	}
909
910	bool PipelineSolver::solveGreedy() {
911	BestCost = `0`;
912	std::list<std::pair<SUnit , SUnit >> AddedEdges;
913
914	while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
915	SUToCandSGsPair CurrSU = PipelineInstrs [CurrSyncGroupIdx][CurrConflInstNo];
916	IsBottomUp
917	? greedyFind(AddedEdges, I: CurrSU.second.rbegin(), E: CurrSU.second.rend())
918	: greedyFind(AddedEdges, I: CurrSU.second.begin(), E: CurrSU.second.end());
919	advancePosition();
920	}
921	BestPipeline = CurrPipeline;
922	removeEdges(EdgesToRemove: AddedEdges);
923	return false;
924	}
925
926	unsigned PipelineSolver::computeProblemSize() {
927	unsigned ProblemSize = `0`;
928	for (auto &PipeConflicts : PipelineInstrs) {
929	ProblemSize += PipeConflicts.size();
930	}
931
932	return ProblemSize;
933	}
934
935	void PipelineSolver::solve() {
936	if (!NeedsSolver)
937	return;
938
939	unsigned ProblemSize = computeProblemSize();
940	assert(ProblemSize > `0`);
941
942	bool BelowCutoff = (CutoffForExact > `0`) && ProblemSize <= CutoffForExact;
943	MissPenalty = (ProblemSize / `2`) + `1`;
944
945	LLVM_DEBUG(DAG->dump());
946	if (EnableExactSolver \|\| BelowCutoff) {
947	LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
948	solveGreedy();
949	reset();
950	LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
951	if (BestCost > `0`) {
952	LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
953	solveExact();
954	LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
955	}
956	} else { // Use the Greedy Algorithm by default
957	LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
958	solveGreedy();
959	LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
960	}
961
962	makePipeline();
963	LLVM_DEBUG(dbgs() << "After applying mutation\n");
964	LLVM_DEBUG(DAG->dump());
965	}
966
967	// Implement a IGLP scheduling strategy.
968	class IGLPStrategy {
969	protected:
970	ScheduleDAGInstrs *DAG;
971
972	const SIInstrInfo *TII;
973
974	public:
975	/// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.
976	virtual bool applyIGLPStrategy(
977	DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
978	DenseMap<int, SmallVector<SchedGroup, `4`>> &SyncedSchedGroups,
979	AMDGPU::SchedulingPhase Phase) = `0`;
980
981	// Returns true if this strategy should be applied to a ScheduleDAG.
982	virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
983	AMDGPU::SchedulingPhase Phase) = `0`;
984
985	bool IsBottomUp = true;
986
987	IGLPStrategy(ScheduleDAGInstrs DAG, const* SIInstrInfo *TII)
988	: DAG(DAG), TII(TII) {}
989
990	virtual ~IGLPStrategy() = default;
991	};
992
993	class MFMASmallGemmOpt final : public IGLPStrategy {
994	private:
995	public:
996	bool applyIGLPStrategy(
997	DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
998	DenseMap<int, SmallVector<SchedGroup, `4`>> &SyncedSchedGroups,
999	AMDGPU::SchedulingPhase Phase) override;
1000
1001	bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
1002	AMDGPU::SchedulingPhase Phase) override {
1003	return true;
1004	}
1005
1006	MFMASmallGemmOpt(ScheduleDAGInstrs DAG, const* SIInstrInfo *TII)
1007	: IGLPStrategy (DAG, TII) {
1008	IsBottomUp = true;
1009	}
1010	};
1011
1012	bool MFMASmallGemmOpt::applyIGLPStrategy(
1013	DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1014	DenseMap<int, SmallVector<SchedGroup, `4`>> &SyncedSchedGroups,
1015	AMDGPU::SchedulingPhase Phase) {
1016	// Count the number of MFMA instructions.
1017	unsigned MFMACount = `0`;
1018	for (const MachineInstr &I : *DAG)
1019	if (TII->isMFMAorWMMA(MI: I))
1020	++MFMACount;
1021
1022	const unsigned PipelineSyncID = `0`;
1023	SchedGroup SG = nullptr*;
1024	for (unsigned I = `0`; I < MFMACount * `3`; ++I) {
1025	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
1026	Args: SchedGroupMask::DS, Args: `2`, Args: PipelineSyncID, Args&: DAG, Args&: TII);
1027	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
1028
1029	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
1030	Args: SchedGroupMask::MFMA, Args: `1`, Args: PipelineSyncID, Args&: DAG, Args&: TII);
1031	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
1032	}
1033
1034	return true;
1035	}
1036
1037	class MFMAExpInterleaveOpt final : public IGLPStrategy {
1038	private:
1039	// The count of TRANS SUs involved in the interleaved pipeline
1040	static unsigned TransPipeCount;
1041	// The count of MFMA SUs involved in the interleaved pipeline
1042	static unsigned MFMAPipeCount;
1043	// The count of Add SUs involved in the interleaved pipeline
1044	static unsigned AddPipeCount;
1045	// The number of transitive MFMA successors for each TRANS SU
1046	static unsigned MFMAEnablement;
1047	// The number of transitive TRANS predecessors for each MFMA SU
1048	static unsigned ExpRequirement;
1049	// The count of independent "chains" of MFMA instructions in the pipeline
1050	static unsigned MFMAChains;
1051	// Whether or not the pipeline has V_CVT instructions
1052	static bool HasCvt;
1053	// Whether or not there are instructions between the TRANS instruction and
1054	// V_CVT
1055	static bool HasChainBetweenCvt;
// The first occurring DS_READ which feeds an MFMA chain
1057	static std::optional<unsigned> FirstPipeDSR;
1058	// The MFMAPipe SUs with no MFMA predecessors
1059	SmallVector<SUnit *, `4`> MFMAChainSeeds;
1060	// Compute the heuristics for the pipeline, returning whether or not the DAG
1061	// is well formatted for the mutation
1062	bool analyzeDAG(const SIInstrInfo *TII);
1063
1064	/// Whether or not the instruction is a transitive predecessor of an MFMA
1065	/// instruction
1066	class IsPipeExp final : public InstructionRule {
1067	public:
1068	bool apply(const SUnit SU, const* ArrayRef<SUnit *> Collection,
1069	SmallVectorImpl<SchedGroup> &SyncPipe) override {
1070
1071	auto *DAG = SyncPipe [`0`].DAG;
1072
1073	if (Cache ->empty()) {
1074	auto I = DAG->SUnits.rbegin();
1075	auto E = DAG->SUnits.rend();
1076	for (; I != E; I ++) {
1077	if (TII->isMFMAorWMMA(MI: *I ->getInstr()))
1078	Cache ->push_back(Elt: &*I);
1079	}
1080	if (Cache ->empty())
1081	return false;
1082	}
1083
1084	auto Reaches = any_of(Range&: Cache, P: [&SU, &DAG](SUnit TargetSU) {
1085	return DAG->IsReachable(SU: TargetSU, TargetSU: const_cast<SUnit *>(SU));
1086	});
1087
1088	return Reaches;
1089	}
1090	IsPipeExp(const SIInstrInfo TII, unsigned* SGID, bool NeedsCache = false)
1091	: InstructionRule (TII, SGID, NeedsCache) {}
1092	};
1093
1094	/// Whether or not the instruction is a transitive predecessor of the
1095	/// \p Number th MFMA of the MFMAs occuring after a TRANS instruction
1096	class EnablesNthMFMA final : public InstructionRule {
1097	private:
1098	unsigned Number = `1`;
1099
1100	public:
1101	bool apply(const SUnit SU, const* ArrayRef<SUnit *> Collection,
1102	SmallVectorImpl<SchedGroup> &SyncPipe) override {
1103	bool FoundTrans = false;
1104	unsigned Counter = `1`;
1105	auto *DAG = SyncPipe [`0`].DAG;
1106
1107	if (Cache ->empty()) {
1108	auto I = DAG->SUnits.begin();
1109	auto E = DAG->SUnits.end();
1110	for (; I != E; I ++) {
1111	if (FoundTrans && TII->isMFMAorWMMA(MI: *I ->getInstr())) {
1112	if (Counter == Number) {
1113	Cache ->push_back(Elt: &*I);
1114	break;
1115	}
1116	++Counter;
1117	}
1118	if (!FoundTrans && TII->isTRANS(Opcode: I ->getInstr()->getOpcode()))
1119	FoundTrans = true;
1120	}
1121	if (Cache ->empty())
1122	return false;
1123	}
1124
1125	return DAG->IsReachable(SU: (Cache)[`0`], TargetSU: const_cast<SUnit >(SU));
1126	}
1127
1128	EnablesNthMFMA(unsigned Number, const SIInstrInfo TII, unsigned* SGID,
1129	bool NeedsCache = false)
1130	: InstructionRule (TII, SGID, NeedsCache), Number(Number) {}
1131	};
1132
1133	/// Whether or not the instruction enables the exact MFMA that is the \p
1134	/// Number th MFMA in the chain starting with \p ChainSeed
1135	class EnablesNthMFMAInChain final : public InstructionRule {
1136	private:
1137	unsigned Number = `1`;
1138	SUnit *ChainSeed;
1139
1140	public:
1141	bool apply(const SUnit SU, const* ArrayRef<SUnit *> Collection,
1142	SmallVectorImpl<SchedGroup> &SyncPipe) override {
1143	auto *DAG = SyncPipe [`0`].DAG;
1144
1145	if (!SU \|\| !TII->isMFMAorWMMA(MI: *ChainSeed->getInstr()))
1146	return false;
1147
1148	if (Cache ->empty()) {
1149	auto *TempSU = ChainSeed;
1150	auto Depth = Number;
1151	while (Depth > `0`) {
1152	--Depth;
1153	bool Found = false;
1154	for (auto &Succ : TempSU->Succs) {
1155	if (TII->isMFMAorWMMA(MI: *Succ.getSUnit()->getInstr())) {
1156	TempSU = Succ.getSUnit();
1157	Found = true;
1158	break;
1159	}
1160	}
1161	if (!Found)
1162	return false;
1163	}
1164
1165	Cache ->push_back(Elt: TempSU);
1166	}
1167	// If we failed to find the instruction to be placed into the cache, we
1168	// would have already exited.
1169	assert(!Cache->empty());
1170
1171	return DAG->IsReachable(SU: (Cache)[`0`], TargetSU: const_cast<SUnit >(SU));
1172	}
1173
1174	EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed,
1175	const SIInstrInfo TII, unsigned* SGID,
1176	bool NeedsCache = false)
1177	: InstructionRule (TII, SGID, NeedsCache), Number(Number),
1178	ChainSeed(ChainSeed) {}
1179	};
1180
1181	/// Whether or not the instruction has less than \p Size immediate successors.
1182	/// If \p HasIntermediary is true, this tests also whether all successors of
1183	/// the SUnit have less than \p Size successors.
1184	class LessThanNSuccs final : public InstructionRule {
1185	private:
1186	unsigned Size = `1`;
1187	bool HasIntermediary = false;
1188
1189	public:
1190	bool apply(const SUnit SU, const* ArrayRef<SUnit *> Collection,
1191	SmallVectorImpl<SchedGroup> &SyncPipe) override {
1192	if (!SyncPipe.size())
1193	return false;
1194
1195	unsigned SuccSize = llvm::count_if(Range: SU->Succs, P: [](const SDep &Succ) {
1196	return Succ.getKind() == SDep::Data;
1197	});
1198	if (SuccSize >= Size)
1199	return false;
1200
1201	if (HasIntermediary) {
1202	for (auto Succ : SU->Succs) {
1203	unsigned SuccSize =
1204	llvm::count_if(Range&: Succ.getSUnit()->Succs, P: [](const SDep &SuccSucc) {
1205	return SuccSucc.getKind() == SDep::Data;
1206	});
1207	if (SuccSize >= Size)
1208	return false;
1209	}
1210	}
1211
1212	return true;
1213	}
1214	LessThanNSuccs(unsigned Size, const SIInstrInfo TII, unsigned* SGID,
1215	bool HasIntermediary = false, bool NeedsCache = false)
1216	: InstructionRule (TII, SGID, NeedsCache), Size(Size),
1217	HasIntermediary(HasIntermediary) {}
1218	};
1219
1220	/// Whether or not the instruction has greater than or equal to \p Size
1221	/// immediate successors. If \p HasIntermediary is true, this tests also
1222	/// whether all successors of the SUnit have greater than or equal to \p Size
1223	/// successors.
1224	class GreaterThanOrEqualToNSuccs final : public InstructionRule {
1225	private:
1226	unsigned Size = `1`;
1227	bool HasIntermediary = false;
1228
1229	public:
1230	bool apply(const SUnit SU, const* ArrayRef<SUnit *> Collection,
1231	SmallVectorImpl<SchedGroup> &SyncPipe) override {
1232	if (!SyncPipe.size())
1233	return false;
1234
1235	unsigned SuccSize = llvm::count_if(Range: SU->Succs, P: [](const SDep &Succ) {
1236	return Succ.getKind() == SDep::Data;
1237	});
1238	if (SuccSize >= Size)
1239	return true;
1240
1241	if (HasIntermediary) {
1242	for (auto Succ : SU->Succs) {
1243	unsigned SuccSize =
1244	llvm::count_if(Range&: Succ.getSUnit()->Succs, P: [](const SDep &SuccSucc) {
1245	return SuccSucc.getKind() == SDep::Data;
1246	});
1247	if (SuccSize >= Size)
1248	return true;
1249	}
1250	}
1251
1252	return false;
1253	}
1254	GreaterThanOrEqualToNSuccs(unsigned Size, const SIInstrInfo *TII,
1255	unsigned SGID, bool HasIntermediary = false,
1256	bool NeedsCache = false)
1257	: InstructionRule (TII, SGID, NeedsCache), Size(Size),
1258	HasIntermediary(HasIntermediary) {}
1259	};
1260
1261	// Whether or not the instruction is a relevant V_CVT instruction.
1262	class IsCvt final : public InstructionRule {
1263	public:
1264	bool apply(const SUnit SU, const* ArrayRef<SUnit *> Collection,
1265	SmallVectorImpl<SchedGroup> &SyncPipe) override {
1266	auto Opc = SU->getInstr()->getOpcode();
1267	return Opc == AMDGPU::V_CVT_F16_F32_e32 \|\|
1268	Opc == AMDGPU::V_CVT_I32_F32_e32;
1269	}
1270	IsCvt(const SIInstrInfo TII, unsigned* SGID, bool NeedsCache = false)
1271	: InstructionRule (TII, SGID, NeedsCache) {}
1272	};
1273
1274	// Whether or not the instruction is FMA_F32.
1275	class IsFMA final : public InstructionRule {
1276	public:
1277	bool apply(const SUnit SU, const* ArrayRef<SUnit *> Collection,
1278	SmallVectorImpl<SchedGroup> &SyncPipe) override {
1279	return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64 \|\|
1280	SU->getInstr()->getOpcode() == AMDGPU::V_PK_FMA_F32;
1281	}
1282	IsFMA(const SIInstrInfo TII, unsigned* SGID, bool NeedsCache = false)
1283	: InstructionRule (TII, SGID, NeedsCache) {}
1284	};
1285
1286	// Whether or not the instruction is a V_ADD_F32 instruction.
1287	class IsPipeAdd final : public InstructionRule {
1288	public:
1289	bool apply(const SUnit SU, const* ArrayRef<SUnit *> Collection,
1290	SmallVectorImpl<SchedGroup> &SyncPipe) override {
1291	return SU->getInstr()->getOpcode() == AMDGPU::V_ADD_F32_e32;
1292	}
1293	IsPipeAdd(const SIInstrInfo TII, unsigned* SGID, bool NeedsCache = false)
1294	: InstructionRule (TII, SGID, NeedsCache) {}
1295	};
1296
1297	/// Whether or not the instruction is an immediate RAW successor
1298	/// of the SchedGroup \p Distance steps before.
1299	class IsSuccOfPrevNthGroup final : public InstructionRule {
1300	private:
1301	unsigned Distance = `1`;
1302
1303	public:
1304	bool apply(const SUnit SU, const* ArrayRef<SUnit *> Collection,
1305	SmallVectorImpl<SchedGroup> &SyncPipe) override {
1306	SchedGroup OtherGroup = nullptr*;
1307	if (!SyncPipe.size())
1308	return false;
1309
1310	for (auto &PipeSG : SyncPipe) {
1311	if ((unsigned)PipeSG.getSGID() == SGID - Distance)
1312	OtherGroup = &PipeSG;
1313	}
1314
1315	if (!OtherGroup)
1316	return false;
1317	if (!OtherGroup->Collection.size())
1318	return true;
1319
1320	for (auto &OtherEle : OtherGroup->Collection) {
1321	for (auto &Succ : OtherEle->Succs) {
1322	if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data)
1323	return true;
1324	}
1325	}
1326
1327	return false;
1328	}
1329	IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
1330	unsigned SGID, bool NeedsCache = false)
1331	: InstructionRule (TII, SGID, NeedsCache), Distance(Distance) {}
1332	};
1333
1334	/// Whether or not the instruction is a transitive successor of any
1335	/// instruction the the SchedGroup \p Distance steps before.
1336	class IsReachableFromPrevNthGroup final : public InstructionRule {
1337	private:
1338	unsigned Distance = `1`;
1339
1340	public:
1341	bool apply(const SUnit SU, const* ArrayRef<SUnit *> Collection,
1342	SmallVectorImpl<SchedGroup> &SyncPipe) override {
1343	SchedGroup OtherGroup = nullptr*;
1344	if (!SyncPipe.size())
1345	return false;
1346
1347	for (auto &PipeSG : SyncPipe) {
1348	if ((unsigned)PipeSG.getSGID() == SGID - Distance)
1349	OtherGroup = &PipeSG;
1350	}
1351
1352	if (!OtherGroup)
1353	return false;
1354	if (!OtherGroup->Collection.size())
1355	return true;
1356
1357	auto *DAG = SyncPipe [`0`].DAG;
1358
1359	for (auto &OtherEle : OtherGroup->Collection)
1360	if (DAG->IsReachable(SU: const_cast<SUnit *>(SU), TargetSU: OtherEle))
1361	return true;
1362
1363	return false;
1364	}
1365	IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
1366	unsigned SGID, bool NeedsCache = false)
1367	: InstructionRule (TII, SGID, NeedsCache), Distance(Distance) {}
1368	};
1369
1370	/// Whether or not the instruction occurs after the SU with NodeNUm \p Number
1371	class OccursAtOrAfterNode final : public InstructionRule {
1372	private:
1373	unsigned Number = `1`;
1374
1375	public:
1376	bool apply(const SUnit SU, const* ArrayRef<SUnit *> Collection,
1377	SmallVectorImpl<SchedGroup> &SyncPipe) override {
1378
1379	return SU->NodeNum >= Number;
1380	}
1381	OccursAtOrAfterNode(unsigned Number, const SIInstrInfo TII, unsigned* SGID,
1382	bool NeedsCache = false)
1383	: InstructionRule (TII, SGID, NeedsCache), Number(Number) {}
1384	};
1385
1386	/// Whether or not the SU is exactly the \p Number th MFMA in the chain
1387	/// starting with \p ChainSeed
1388	class IsExactMFMA final : public InstructionRule {
1389	private:
1390	unsigned Number = `1`;
1391	SUnit *ChainSeed;
1392
1393	public:
1394	bool apply(const SUnit SU, const* ArrayRef<SUnit *> Collection,
1395	SmallVectorImpl<SchedGroup> &SyncPipe) override {
1396	if (!SU \|\| !TII->isMFMAorWMMA(MI: *ChainSeed->getInstr()))
1397	return false;
1398
1399	if (Cache ->empty()) {
1400	auto *TempSU = ChainSeed;
1401	auto Depth = Number;
1402	while (Depth > `0`) {
1403	--Depth;
1404	bool Found = false;
1405	for (auto &Succ : TempSU->Succs) {
1406	if (TII->isMFMAorWMMA(MI: *Succ.getSUnit()->getInstr())) {
1407	TempSU = Succ.getSUnit();
1408	Found = true;
1409	break;
1410	}
1411	}
1412	if (!Found) {
1413	return false;
1414	}
1415	}
1416	Cache ->push_back(Elt: TempSU);
1417	}
1418	// If we failed to find the instruction to be placed into the cache, we
1419	// would have already exited.
1420	assert(!Cache->empty());
1421
1422	return (*Cache)[`0`] == SU;
1423	}
1424
1425	IsExactMFMA(unsigned Number, SUnit ChainSeed, const* SIInstrInfo *TII,
1426	unsigned SGID, bool NeedsCache = false)
1427	: InstructionRule (TII, SGID, NeedsCache), Number(Number),
1428	ChainSeed(ChainSeed) {}
1429	};
1430
1431	// Whether the instruction occurs after the first TRANS instruction. This
1432	// implies the instruction can not be a predecessor of the first TRANS
1433	// insruction
1434	class OccursAfterExp final : public InstructionRule {
1435	public:
1436	bool apply(const SUnit SU, const* ArrayRef<SUnit *> Collection,
1437	SmallVectorImpl<SchedGroup> &SyncPipe) override {
1438
1439	auto *DAG = SyncPipe [`0`].DAG;
1440	if (Cache ->empty()) {
1441	for (auto &SU : DAG->SUnits)
1442	if (TII->isTRANS(Opcode: SU.getInstr()->getOpcode())) {
1443	Cache ->push_back(Elt: &SU);
1444	break;
1445	}
1446	if (Cache ->empty())
1447	return false;
1448	}
1449
1450	return SU->NodeNum > (*Cache)[`0`]->NodeNum;
1451	}
1452
1453	OccursAfterExp(const SIInstrInfo TII, unsigned* SGID,
1454	bool NeedsCache = false)
1455	: InstructionRule (TII, SGID, NeedsCache) {}
1456	};
1457
1458	public:
1459	bool applyIGLPStrategy(
1460	DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1461	DenseMap<int, SmallVector<SchedGroup, `4`>> &SyncedSchedGroups,
1462	AMDGPU::SchedulingPhase Phase) override;
1463
1464	bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
1465	AMDGPU::SchedulingPhase Phase) override;
1466
1467	MFMAExpInterleaveOpt(ScheduleDAGInstrs DAG, const* SIInstrInfo *TII)
1468	: IGLPStrategy (DAG, TII) {
1469	IsBottomUp = false;
1470	}
1471	};
1472
1473	unsigned MFMAExpInterleaveOpt::TransPipeCount = `0`;
1474	unsigned MFMAExpInterleaveOpt::MFMAPipeCount = `0`;
1475	unsigned MFMAExpInterleaveOpt::AddPipeCount = `0`;
1476	unsigned MFMAExpInterleaveOpt::MFMAEnablement = `0`;
1477	unsigned MFMAExpInterleaveOpt::ExpRequirement = `0`;
1478	unsigned MFMAExpInterleaveOpt::MFMAChains = `0`;
1479	bool MFMAExpInterleaveOpt::HasCvt = false;
1480	bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
1481	std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
1482
1483	bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
1484	SmallVector<SUnit *, `10`> ExpPipeCands;
1485	SmallVector<SUnit *, `10`> MFMAPipeCands;
1486	SmallVector<SUnit *, `10`> MFMAPipeSUs;
1487	SmallVector<SUnit *, `10`> PackSUs;
1488	SmallVector<SUnit *, `10`> CvtSUs;
1489
1490	auto isBitPack = [](unsigned Opc) {
1491	return Opc == AMDGPU::V_PACK_B32_F16_e64 \|\| Opc == AMDGPU::V_PERM_B32_e64;
1492	};
1493
1494	auto isCvt = [](unsigned Opc) {
1495	return Opc == AMDGPU::V_CVT_F16_F32_e32 \|\| Opc == AMDGPU::V_CVT_I32_F32_e32;
1496	};
1497
1498	auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; };
1499
1500	AddPipeCount = `0`;
1501	for (SUnit &SU : DAG->SUnits) {
1502	auto Opc = SU.getInstr()->getOpcode();
1503	if (TII->isTRANS(Opcode: Opc)) {
1504	// Avoid counting a potential bonus V_EXP which all the MFMA depend on
1505	if (SU.Succs.size() >= `7`)
1506	continue;
1507	for (auto &Succ : SU.Succs) {
1508	if (Succ.getSUnit()->Succs.size() >= `7`)
1509	continue;
1510	}
1511	ExpPipeCands.push_back(Elt: &SU);
1512	}
1513
1514	if (TII->isMFMAorWMMA(MI: *SU.getInstr()))
1515	MFMAPipeCands.push_back(Elt: &SU);
1516
1517	if (isBitPack (Opc))
1518	PackSUs.push_back(Elt: &SU);
1519
1520	if (isCvt (Opc))
1521	CvtSUs.push_back(Elt: &SU);
1522
1523	if (isAdd (Opc))
1524	++AddPipeCount;
1525	}
1526
1527	if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size()))
1528	return false;
1529
1530	TransPipeCount = `0`;
1531
1532	std::optional<SUnit *> TempMFMA;
1533	std::optional<SUnit *> TempExp;
1534	// Count the number of EXPs that reach an MFMA
1535	for (auto &PredSU : ExpPipeCands) {
1536	for (auto &SuccSU : MFMAPipeCands) {
1537	if (DAG->IsReachable(SU: SuccSU, TargetSU: PredSU)) {
1538	if (!TempExp) {
1539	TempExp = PredSU;
1540	TempMFMA = SuccSU;
1541	}
1542	MFMAPipeSUs.push_back(Elt: SuccSU);
1543	++TransPipeCount;
1544	break;
1545	}
1546	}
1547	}
1548
1549	if (!(TempExp && TempMFMA))
1550	return false;
1551
1552	HasChainBetweenCvt = none_of(Range&: (*TempExp)->Succs, P: [&isCvt](SDep &Succ) {
1553	return isCvt (Succ.getSUnit()->getInstr()->getOpcode());
1554	});
1555
1556	// Count the number of MFMAs that are reached by an EXP
1557	for (auto &SuccSU : MFMAPipeCands) {
1558	if (MFMAPipeSUs.size() &&
1559	any_of(Range&: MFMAPipeSUs, P: [&SuccSU](SUnit *PotentialMatch) {
1560	return PotentialMatch->NodeNum == SuccSU->NodeNum;
1561	}))
1562	continue;
1563
1564	for (auto &PredSU : ExpPipeCands) {
1565	if (DAG->IsReachable(SU: SuccSU, TargetSU: PredSU)) {
1566	MFMAPipeSUs.push_back(Elt: SuccSU);
1567	break;
1568	}
1569	}
1570	}
1571
1572	MFMAPipeCount = MFMAPipeSUs.size();
1573
1574	assert(TempExp && TempMFMA);
1575	assert(MFMAPipeCount > `0`);
1576
1577	std::optional<SUnit *> TempCvt;
1578	for (auto &SuccSU : CvtSUs) {
1579	if (DAG->IsReachable(SU: SuccSU, TargetSU: *TempExp)) {
1580	TempCvt = SuccSU;
1581	break;
1582	}
1583	}
1584
1585	HasCvt = false;
1586	if (TempCvt.has_value()) {
1587	for (auto &SuccSU : MFMAPipeSUs) {
1588	if (DAG->IsReachable(SU: SuccSU, TargetSU: *TempCvt)) {
1589	HasCvt = true;
1590	break;
1591	}
1592	}
1593	}
1594
1595	MFMAChains = `0`;
1596	for (auto &MFMAPipeSU : MFMAPipeSUs) {
1597	if (is_contained(Range&: MFMAChainSeeds, Element: MFMAPipeSU))
1598	continue;
1599	if (none_of(Range&: MFMAPipeSU->Preds, P: [&TII](SDep &Succ) {
1600	return TII->isMFMAorWMMA(MI: *Succ.getSUnit()->getInstr());
1601	})) {
1602	MFMAChainSeeds.push_back(Elt: MFMAPipeSU);
1603	++MFMAChains;
1604	}
1605	}
1606
1607	if (!MFMAChains)
1608	return false;
1609
1610	for (auto Pred : MFMAChainSeeds [`0`]->Preds) {
1611	if (TII->isDS(Opcode: Pred.getSUnit()->getInstr()->getOpcode()) &&
1612	Pred.getSUnit()->getInstr()->mayLoad())
1613	FirstPipeDSR = Pred.getSUnit()->NodeNum;
1614	}
1615
1616	// The number of bit pack operations that depend on a single V_EXP
1617	unsigned PackSuccCount =
1618	llvm::count_if(Range&: PackSUs, P: [this, &TempExp](SUnit *VPack) {
1619	return DAG->IsReachable(SU: VPack, TargetSU: *TempExp);
1620	});
1621
1622	// The number of bit pack operations an MFMA depends on
1623	unsigned PackPredCount =
1624	llvm::count_if(Range&: (*TempMFMA)->Preds, P: [&isBitPack](SDep &Pred) {
1625	auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
1626	return isBitPack (Opc);
1627	});
1628
1629	auto PackPred = llvm::find_if(Range&: (TempMFMA)->Preds, P: [&isBitPack](SDep &Pred) {
1630	auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
1631	return isBitPack (Opc);
1632	});
1633
1634	if (PackPred == (*TempMFMA)->Preds.end())
1635	return false;
1636
1637	MFMAEnablement = `0`;
1638	ExpRequirement = `0`;
1639	// How many MFMAs depend on a single bit pack operation
1640	MFMAEnablement =
1641	llvm::count_if(Range&: PackPred->getSUnit()->Succs, P: [&TII](SDep &Succ) {
1642	return TII->isMFMAorWMMA(MI: *Succ.getSUnit()->getInstr());
1643	});
1644
1645	// The number of MFMAs that depend on a single V_EXP
1646	MFMAEnablement *= PackSuccCount;
1647
1648	// The number of V_EXPs required to resolve all dependencies for an MFMA
1649	ExpRequirement =
1650	llvm::count_if(Range&: ExpPipeCands, P: [this, &PackPred](SUnit *ExpBase) {
1651	return DAG->IsReachable(SU: PackPred->getSUnit(), TargetSU: ExpBase);
1652	});
1653
1654	ExpRequirement *= PackPredCount;
1655	return true;
1656	}
1657
1658	bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG,
1659	AMDGPU::SchedulingPhase Phase) {
1660	const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
1661	const SIInstrInfo *TII = ST.getInstrInfo();
1662
1663	if (Phase != AMDGPU::SchedulingPhase::PostRA)
1664	MFMAChainSeeds.clear();
1665	if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII))
1666	return false;
1667
1668	return true;
1669	}
1670
1671	bool MFMAExpInterleaveOpt::applyIGLPStrategy(
1672	DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1673	DenseMap<int, SmallVector<SchedGroup, `4`>> &SyncedSchedGroups,
1674	AMDGPU::SchedulingPhase Phase) {
1675
1676	bool IsSmallKernelType =
1677	MFMAEnablement == `2` && ExpRequirement == `4` && TransPipeCount == `32`;
1678	bool IsLargeKernelType =
1679	MFMAEnablement == `4` && ExpRequirement == `4` && TransPipeCount == `64`;
1680
1681	if (!(IsSmallKernelType \|\| IsLargeKernelType))
1682	return false;
1683
1684	const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
1685	const SIInstrInfo *TII = ST.getInstrInfo();
1686
1687	unsigned PipelineSyncID = `0`;
1688	SchedGroup SG = nullptr*;
1689
1690	unsigned MFMAChain = `0`;
1691	unsigned PositionInChain = `0`;
1692	unsigned CurrMFMAForTransPosition = `0`;
1693
1694	auto incrementTransPosition = [&MFMAChain, &PositionInChain,
1695	&CurrMFMAForTransPosition]() {
1696	CurrMFMAForTransPosition += MFMAEnablement;
1697	PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
1698	MFMAChain = CurrMFMAForTransPosition % MFMAChains;
1699	};
1700
1701	auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
1702	auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
1703	return (TempMFMAForTrans / MFMAChains);
1704	};
1705
1706	auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
1707	auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
1708	return TempMFMAForTrans % MFMAChains;
1709	};
1710
1711	unsigned CurrMFMAPosition = `0`;
1712	unsigned MFMAChainForMFMA = `0`;
1713	unsigned PositionInChainForMFMA = `0`;
1714
1715	auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
1716	&PositionInChainForMFMA]() {
1717	++CurrMFMAPosition;
1718	MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
1719	PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;
1720	};
1721
1722	bool IsPostRA = Phase == AMDGPU::SchedulingPhase::PostRA;
1723	assert(IsPostRA \|\| MFMAChainSeeds.size() == MFMAChains);
1724
1725	bool UsesFMA = IsSmallKernelType \|\| !IsPostRA;
1726	bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR;
1727	bool UsesCvt = HasCvt && (IsSmallKernelType \|\| !IsPostRA);
1728	bool UsesVALU = IsSmallKernelType;
1729
1730	// PHASE 1: "Prefetch"
1731	if (UsesFMA) {
1732	// First Round FMA
1733	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
1734	Args: SchedGroupMask::VALU, Args&: ExpRequirement, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
1735	if (!IsPostRA && MFMAChains) {
1736	SG->addRule(NewRule: std::make_shared<EnablesNthMFMAInChain>(
1737	args&: PositionInChain, args&: MFMAChainSeeds [MFMAChain], args&: TII, args: SG->getSGID(),
1738	args: true));
1739	} else
1740	SG->addRule(
1741	NewRule: std::make_shared<EnablesNthMFMA>(args: `1`, args&: TII, args: SG->getSGID(), args: true));
1742	SG->addRule(NewRule: std::make_shared<IsFMA>(args&: TII, args: SG->getSGID()));
1743	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
1744
1745	// Second Round FMA
1746	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
1747	Args: SchedGroupMask::VALU, Args&: ExpRequirement, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
1748	if (!IsPostRA && MFMAChains) {
1749	SG->addRule(NewRule: std::make_shared<EnablesNthMFMAInChain>(
1750	args: getNextTransPositionInChain (),
1751	args&: MFMAChainSeeds [getNextTransMFMAChain ()], args&: TII, args: SG->getSGID(), args: true));
1752	} else
1753	SG->addRule(NewRule: std::make_shared<EnablesNthMFMA>(args: MFMAEnablement + `1`, args&: TII,
1754	args: SG->getSGID(), args: true));
1755	SG->addRule(NewRule: std::make_shared<IsFMA>(args&: TII, args: SG->getSGID()));
1756	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
1757	}
1758
1759	if (UsesDSRead) {
1760	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
1761	Args: SchedGroupMask::DS_READ, Args: `2`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
1762	SG->addRule(NewRule: std::make_shared<OccursAtOrAfterNode>(args&: *FirstPipeDSR, args&: TII,
1763	args: SG->getSGID()));
1764	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
1765	}
1766
1767	// First Round EXP
1768	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
1769	Args: SchedGroupMask::TRANS, Args&: ExpRequirement, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
1770	if (!IsPostRA && MFMAChains)
1771	SG->addRule(NewRule: std::make_shared<EnablesNthMFMAInChain>(
1772	args&: PositionInChain, args&: MFMAChainSeeds [MFMAChain], args&: TII, args: SG->getSGID(), args: true));
1773	else
1774	SG->addRule(NewRule: std::make_shared<EnablesNthMFMA>(args: `1`, args&: TII, args: SG->getSGID(), args: true));
1775	SG->addRule(NewRule: std::make_shared<IsPipeExp>(args&: TII, args: SG->getSGID(), args: true));
1776	SG->addRule(NewRule: std::make_shared<LessThanNSuccs>(args: `8`, args&: TII, args: SG->getSGID(),
1777	args&: HasChainBetweenCvt));
1778	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
1779
1780	incrementTransPosition ();
1781
1782	// First Round CVT, Third Round FMA, Second Round EXP; interleaved
1783	for (unsigned I = `0`; I < ExpRequirement; I++) {
1784	// First Round CVT
1785	if (UsesCvt) {
1786	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
1787	Args: SchedGroupMask::VALU, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
1788	SG->addRule(NewRule: std::make_shared<IsCvt>(args&: TII, args: SG->getSGID()));
1789	if (HasChainBetweenCvt)
1790	SG->addRule(NewRule: std::make_shared<IsReachableFromPrevNthGroup>(
1791	args: `1` + (`2` + UsesFMA) * I, args&: TII, args: SG->getSGID()));
1792	else
1793	SG->addRule(NewRule: std::make_shared<IsSuccOfPrevNthGroup>(
1794	args: `1` + (`2` + UsesFMA) * I, args&: TII, args: SG->getSGID()));
1795	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
1796	}
1797
1798	// Third Round FMA
1799	if (UsesFMA) {
1800	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
1801	Args: SchedGroupMask::VALU, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
1802	if (!IsPostRA && MFMAChains) {
1803	SG->addRule(NewRule: std::make_shared<EnablesNthMFMAInChain>(
1804	args: getNextTransPositionInChain (),
1805	args&: MFMAChainSeeds [getNextTransMFMAChain ()], args&: TII, args: SG->getSGID(), args: true));
1806	} else
1807	SG->addRule(NewRule: std::make_shared<EnablesNthMFMA>(args: `2` * MFMAEnablement + `1`,
1808	args&: TII, args: SG->getSGID(), args: true));
1809	SG->addRule(NewRule: std::make_shared<IsFMA>(args&: TII, args: SG->getSGID()));
1810	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
1811	}
1812
1813	// Second Round EXP
1814	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
1815	Args: SchedGroupMask::TRANS, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
1816	if (!IsPostRA && MFMAChains)
1817	SG->addRule(NewRule: std::make_shared<EnablesNthMFMAInChain>(
1818	args&: PositionInChain, args&: MFMAChainSeeds [MFMAChain], args&: TII, args: SG->getSGID(),
1819	args: true));
1820	else
1821	SG->addRule(NewRule: std::make_shared<EnablesNthMFMA>(args: MFMAEnablement + `1`, args&: TII,
1822	args: SG->getSGID(), args: true));
1823	SG->addRule(NewRule: std::make_shared<IsPipeExp>(args&: TII, args: SG->getSGID(), args: true));
1824	SG->addRule(NewRule: std::make_shared<LessThanNSuccs>(args: `8`, args&: TII, args: SG->getSGID(),
1825	args&: HasChainBetweenCvt));
1826	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
1827	}
1828
1829	// The "extra" EXP which enables all MFMA
1830	// TODO: UsesExtraExp
1831	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
1832	Args: SchedGroupMask::TRANS, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
1833	SG->addRule(NewRule: std::make_shared<IsPipeExp>(args&: TII, args: SG->getSGID(), args: true));
1834	SG->addRule(NewRule: std::make_shared<GreaterThanOrEqualToNSuccs>(
1835	args: `8`, args&: TII, args: SG->getSGID(), args&: HasChainBetweenCvt));
1836	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
1837
1838	// PHASE 2: Main Interleave Loop
1839
1840	// The number of MFMAs per iteration
1841	unsigned MFMARatio =
1842	MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : `1`;
1843	// The number of Exps per iteration
1844	unsigned ExpRatio =
1845	MFMAEnablement > ExpRequirement ? `1` : ExpRequirement / MFMAEnablement;
1846	// The remaining Exps
1847	unsigned RemainingExp = TransPipeCount > (`2` * ExpRequirement)
1848	? TransPipeCount - (`2` * ExpRequirement)
1849	: `0`;
1850	unsigned ExpLoopCount = RemainingExp / ExpRatio;
1851	// In loop MFMAs
1852	unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * `2`)
1853	? MFMAPipeCount - (MFMAEnablement * `2`)
1854	: `0`;
1855	unsigned MFMALoopCount = MFMAInLoop / MFMARatio;
1856	unsigned VALUOps =
1857	AddPipeCount < MFMAPipeCount ? `1` : AddPipeCount / MFMAPipeCount;
1858	unsigned LoopSize = std::min(a: ExpLoopCount, b: MFMALoopCount);
1859
1860	for (unsigned I = `0`; I < LoopSize; I++) {
1861	if (!(I * ExpRatio % ExpRequirement))
1862	incrementTransPosition ();
1863
1864	// Round N MFMA
1865	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
1866	Args: SchedGroupMask::MFMA, Args&: MFMARatio, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
1867	if (!IsPostRA && MFMAChains)
1868	SG->addRule(NewRule: std::make_shared<IsExactMFMA>(
1869	args&: PositionInChainForMFMA, args&: MFMAChainSeeds [MFMAChainForMFMA], args&: TII,
1870	args: SG->getSGID(), args: true));
1871	else
1872	SG->addRule(NewRule: std::make_shared<OccursAfterExp>(args&: TII, args: SG->getSGID(), args: true));
1873	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
1874	incrementMFMAPosition ();
1875
1876	if (UsesVALU) {
1877	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
1878	Args: SchedGroupMask::VALU, Args&: VALUOps, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
1879	SG->addRule(NewRule: std::make_shared<IsPipeAdd>(args&: TII, args: SG->getSGID()));
1880	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
1881	}
1882
1883	if (UsesDSRead && !(I % `4`)) {
1884	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
1885	Args: SchedGroupMask::DS_READ, Args: `2`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
1886	SG->addRule(NewRule: std::make_shared<OccursAtOrAfterNode>(args&: *FirstPipeDSR, args&: TII,
1887	args: SG->getSGID()));
1888	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
1889	}
1890
1891	// CVT, EXP, FMA Interleaving
1892	for (unsigned J = `0`; J < ExpRatio; J++) {
1893	auto MFMAOffset = (`1` + UsesVALU) * MFMARatio * (I + `1`);
1894	auto MaxMFMAOffset =
1895	(`1` + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio;
1896
1897	// Round N + 1 CVT
1898	if (UsesCvt) {
1899	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
1900	Args: SchedGroupMask::VALU, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
1901	SG->addRule(NewRule: std::make_shared<IsCvt>(args&: TII, args: SG->getSGID()));
1902	auto BaseDiff = (`2` + UsesFMA) * (ExpRequirement - `1`) + `1`;
1903	auto DSROffset = I / `4` + `1`;
1904	auto MaxDSROffset = MaxMFMAOffset / `4`;
1905	// TODO: UsesExtraExp
1906	auto ExpOffset = I * ExpRatio + J >= ExpRequirement ? `0` : `1`;
1907	auto CurrentOffset = UsesDSRead * std::min(a: MaxDSROffset, b: DSROffset) +
1908	std::min(a: MaxMFMAOffset, b: MFMAOffset) + BaseDiff +
1909	ExpOffset;
1910	if (HasChainBetweenCvt)
1911	SG->addRule(NewRule: std::make_shared<IsReachableFromPrevNthGroup>(
1912	args&: CurrentOffset, args&: TII, args: SG->getSGID()));
1913	else
1914	SG->addRule(NewRule: std::make_shared<IsSuccOfPrevNthGroup>(args&: CurrentOffset, args&: TII,
1915	args: SG->getSGID()));
1916	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
1917	}
1918
1919	// Round N + 3 FMA
1920	if (UsesFMA) {
1921	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
1922	Args: SchedGroupMask::VALU, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
1923	if (!IsPostRA && MFMAChains)
1924	SG->addRule(NewRule: std::make_shared<EnablesNthMFMAInChain>(
1925	args: getNextTransPositionInChain (),
1926	args&: MFMAChainSeeds [getNextTransMFMAChain ()], args&: TII, args: SG->getSGID(),
1927	args: true));
1928	else
1929	SG->addRule(NewRule: std::make_shared<EnablesNthMFMA>(
1930	args: (((I * ExpRatio + J) / ExpRequirement) + `3`) * MFMAEnablement + `1`,
1931	args&: TII, args: SG->getSGID(), args: true));
1932	SG->addRule(NewRule: std::make_shared<IsFMA>(args&: TII, args: SG->getSGID()));
1933	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
1934	}
1935
1936	// Round N + 2 Exp
1937	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
1938	Args: SchedGroupMask::TRANS, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
1939	if (!IsPostRA && MFMAChains)
1940	SG->addRule(NewRule: std::make_shared<EnablesNthMFMAInChain>(
1941	args&: PositionInChain, args&: MFMAChainSeeds [MFMAChain], args&: TII, args: SG->getSGID(),
1942	args: true));
1943	else
1944	SG->addRule(NewRule: std::make_shared<EnablesNthMFMA>(
1945	args: (((I * ExpRatio + J) / ExpRequirement) + `2`) * MFMAEnablement + `1`,
1946	args&: TII, args: SG->getSGID(), args: true));
1947	SG->addRule(NewRule: std::make_shared<IsPipeExp>(args&: TII, args: SG->getSGID(), args: true));
1948	SG->addRule(NewRule: std::make_shared<LessThanNSuccs>(args: `8`, args&: TII, args: SG->getSGID(),
1949	args&: HasChainBetweenCvt));
1950	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
1951	}
1952	}
1953
1954	// PHASE 3: Remaining MFMAs
1955	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
1956	Args: SchedGroupMask::MFMA, Args: MFMAEnablement * `2`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
1957	SG->addRule(NewRule: std::make_shared<OccursAfterExp>(args&: TII, args: SG->getSGID(), args: true));
1958	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
1959	return true;
1960	}
1961
1962	class MFMAExpSimpleInterleaveOpt final : public IGLPStrategy {
1963	public:
1964	bool applyIGLPStrategy(
1965	DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1966	DenseMap<int, SmallVector<SchedGroup, `4`>> &SyncedSchedGroups,
1967	AMDGPU::SchedulingPhase Phase) override;
1968
1969	bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
1970	AMDGPU::SchedulingPhase Phase) override {
1971	return true;
1972	}
1973
1974	MFMAExpSimpleInterleaveOpt(ScheduleDAGInstrs DAG, const* SIInstrInfo *TII)
1975	: IGLPStrategy (DAG, TII) {
1976	IsBottomUp = true;
1977	}
1978	};
1979
1980	bool MFMAExpSimpleInterleaveOpt::applyIGLPStrategy(
1981	DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1982	DenseMap<int, SmallVector<SchedGroup, `4`>> &SyncedSchedGroups,
1983	AMDGPU::SchedulingPhase Phase) {
1984	// Count the number of MFMA instructions.
1985	unsigned MFMACount = `0`;
1986	for (const MachineInstr &I : *DAG)
1987	if (TII->isMFMAorWMMA(MI: I))
1988	++MFMACount;
1989
1990	const unsigned PipelineSyncID = `0`;
1991	for (unsigned I = `0`; I < MFMACount * `3`; ++I) {
1992	SchedGroup *SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
1993	Args: SchedGroupMask::TRANS, Args: `1`, Args: PipelineSyncID, Args&: DAG, Args&: TII);
1994	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
1995
1996	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
1997	Args: SchedGroupMask::MFMA, Args: `1`, Args: PipelineSyncID, Args&: DAG, Args&: TII);
1998	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
1999	}
2000
2001	return true;
2002	}
2003
2004	class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
2005	private:
2006	// Whether the DS_READ is a predecessor of first four MFMA in region
2007	class EnablesInitialMFMA final : public InstructionRule {
2008	public:
2009	bool apply(const SUnit SU, const* ArrayRef<SUnit *> Collection,
2010	SmallVectorImpl<SchedGroup> &SyncPipe) override {
2011	if (!SyncPipe.size())
2012	return false;
2013	int MFMAsFound = `0`;
2014	if (!Cache ->size()) {
2015	for (auto &Elt : SyncPipe [`0`].DAG->SUnits) {
2016	if (TII->isMFMAorWMMA(MI: *Elt.getInstr())) {
2017	++MFMAsFound;
2018	if (MFMAsFound > `4`)
2019	break;
2020	Cache ->push_back(Elt: &Elt);
2021	}
2022	}
2023	}
2024
2025	auto *DAG = SyncPipe [`0`].DAG;
2026	for (auto &Elt : *Cache) {
2027	if (DAG->IsReachable(SU: Elt, TargetSU: const_cast<SUnit *>(SU)))
2028	return true;
2029	}
2030	return false;
2031	}
2032
2033	EnablesInitialMFMA(const SIInstrInfo TII, unsigned* SGID,
2034	bool NeedsCache = false)
2035	: InstructionRule (TII, SGID, NeedsCache) {}
2036	};
2037
2038	// Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE
2039	class IsPermForDSW final : public InstructionRule {
2040	public:
2041	bool apply(const SUnit SU, const* ArrayRef<SUnit *> Collection,
2042	SmallVectorImpl<SchedGroup> &SyncPipe) override {
2043	auto *MI = SU->getInstr();
2044	if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64)
2045	return false;
2046
2047	bool FitsInGroup = false;
2048	// Does the VALU have a DS_WRITE successor
2049	if (!Collection.size()) {
2050	for (auto &Succ : SU->Succs) {
2051	SUnit *SuccUnit = Succ.getSUnit();
2052	if (TII->isDS(MI: *SuccUnit->getInstr()) &&
2053	SuccUnit->getInstr()->mayStore()) {
2054	Cache ->push_back(Elt: SuccUnit);
2055	FitsInGroup = true;
2056	}
2057	}
2058	return FitsInGroup;
2059	}
2060
2061	// Does the VALU have a DS_WRITE successor that is the same as other
2062	// VALU already in the group. The V_PERMs will all share 1 DS_W succ
2063	return llvm::any_of(Range&: Cache, P: [&SU](SUnit Elt) {
2064	return llvm::any_of(Range: SU->Succs, P: [&Elt](const SDep &ThisSucc) {
2065	return ThisSucc.getSUnit() == Elt;
2066	});
2067	});
2068	}
2069
2070	IsPermForDSW(const SIInstrInfo TII, unsigned* SGID, bool NeedsCache = false)
2071	: InstructionRule (TII, SGID, NeedsCache) {}
2072	};
2073
2074	// Whether the SU is a successor of any element in previous SchedGroup
2075	class IsSuccOfPrevGroup final : public InstructionRule {
2076	public:
2077	bool apply(const SUnit SU, const* ArrayRef<SUnit *> Collection,
2078	SmallVectorImpl<SchedGroup> &SyncPipe) override {
2079	SchedGroup OtherGroup = nullptr*;
2080	for (auto &PipeSG : SyncPipe) {
2081	if ((unsigned)PipeSG.getSGID() == SGID - `1`) {
2082	OtherGroup = &PipeSG;
2083	}
2084	}
2085
2086	if (!OtherGroup)
2087	return false;
2088	if (!OtherGroup->Collection.size())
2089	return true;
2090
2091	// Does the previous VALU have this DS_Write as a successor
2092	return any_of(Range&: OtherGroup->Collection, P: [&SU](SUnit *Elt) {
2093	return any_of(Range&: Elt->Succs,
2094	P: [&SU](SDep &Succ) { return Succ.getSUnit() == SU; });
2095	});
2096	}
2097	IsSuccOfPrevGroup(const SIInstrInfo TII, unsigned* SGID,
2098	bool NeedsCache = false)
2099	: InstructionRule (TII, SGID, NeedsCache) {}
2100	};
2101
2102	// Whether the combined load width of group is 128 bits
2103	class VMEMSize final : public InstructionRule {
2104	public:
2105	bool apply(const SUnit SU, const* ArrayRef<SUnit *> Collection,
2106	SmallVectorImpl<SchedGroup> &SyncPipe) override {
2107	auto *MI = SU->getInstr();
2108	if (MI->getOpcode() == TargetOpcode::BUNDLE)
2109	return false;
2110	if (!Collection.size())
2111	return true;
2112
2113	int NumBits = `0`;
2114
2115	auto TRI = TII->getRegisterInfo();
2116	auto &MRI = MI->getMF()->getRegInfo();
2117	for (auto &Elt : Collection) {
2118	auto Op = Elt->getInstr()->getOperand(i: `0`);
2119	auto Size =
2120	TRI.getRegSizeInBits(RC: *TRI.getRegClassForOperandReg(MRI, MO: Op));
2121	NumBits += Size;
2122	}
2123
2124	if (NumBits < `128`) {
2125	assert(TII->isVMEM(*MI) && MI->mayLoad());
2126	if (NumBits + TRI.getRegSizeInBits(RC: *TRI.getRegClassForOperandReg(
2127	MRI, MO: MI->getOperand(i: `0`))) <=
2128	`128`)
2129	return true;
2130	}
2131
2132	return false;
2133	}
2134
2135	VMEMSize(const SIInstrInfo TII, unsigned* SGID, bool NeedsCache = false)
2136	: InstructionRule (TII, SGID, NeedsCache) {}
2137	};
2138
2139	/// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup
2140	/// that is \p Distance steps away
2141	class SharesPredWithPrevNthGroup final : public InstructionRule {
2142	private:
2143	unsigned Distance = `1`;
2144
2145	public:
2146	bool apply(const SUnit SU, const* ArrayRef<SUnit *> Collection,
2147	SmallVectorImpl<SchedGroup> &SyncPipe) override {
2148	SchedGroup OtherGroup = nullptr*;
2149	if (!SyncPipe.size())
2150	return false;
2151
2152	if (!Cache ->size()) {
2153
2154	for (auto &PipeSG : SyncPipe) {
2155	if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
2156	OtherGroup = &PipeSG;
2157	}
2158	}
2159
2160	if (!OtherGroup)
2161	return false;
2162	if (!OtherGroup->Collection.size())
2163	return true;
2164
2165	for (auto &OtherEle : OtherGroup->Collection) {
2166	for (auto &Pred : OtherEle->Preds) {
2167	if (Pred.getSUnit()->getInstr()->getOpcode() ==
2168	AMDGPU::V_PERM_B32_e64)
2169	Cache ->push_back(Elt: Pred.getSUnit());
2170	}
2171	}
2172
2173	// If the other group has no PERM preds, then this group won't share any
2174	if (!Cache ->size())
2175	return false;
2176	}
2177
2178	auto *DAG = SyncPipe [`0`].DAG;
2179	// Does the previous DS_WRITE share a V_PERM predecessor with this
2180	// VMEM_READ
2181	return llvm::any_of(Range&: Cache, P: [&SU, &DAG](SUnit Elt) {
2182	return DAG->IsReachable(SU: const_cast<SUnit *>(SU), TargetSU: Elt);
2183	});
2184	}
2185	SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
2186	unsigned SGID, bool NeedsCache = false)
2187	: InstructionRule (TII, SGID, NeedsCache), Distance(Distance) {}
2188	};
2189
2190	public:
2191	bool applyIGLPStrategy(
2192	DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
2193	DenseMap<int, SmallVector<SchedGroup, `4`>> &SyncedSchedGroups,
2194	AMDGPU::SchedulingPhase Phase) override;
2195
2196	bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
2197	AMDGPU::SchedulingPhase Phase) override {
2198	return true;
2199	}
2200
2201	MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs DAG, const* SIInstrInfo *TII)
2202	: IGLPStrategy (DAG, TII) {
2203	IsBottomUp = false;
2204	}
2205	};
2206
// File-scope counters used by MFMASmallGemmSingleWaveOpt::applyIGLPStrategy.
// That function only updates them when invoked for the Initial scheduling
// phase (and asserts they are zero on entry in that phase); invocations for
// other phases read the values left behind.

// Number of DS instructions that may store.
static unsigned DSWCount = 0;
// Number of those DS stores that have a V_PERM_B32_e64 predecessor.
static unsigned DSWWithPermCount = 0;
// Number of V_PERM-fed DS stores that were paired up because their V_PERMs
// feed the same VMEM loads; grows in steps of two.
static unsigned DSWWithSharedVMEMCount = 0;
2210
2211	bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
2212	DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
2213	DenseMap<int, SmallVector<SchedGroup, `4`>> &SyncedSchedGroups,
2214	AMDGPU::SchedulingPhase Phase) {
2215	unsigned MFMACount = `0`;
2216	unsigned DSRCount = `0`;
2217
2218	bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial;
2219
2220	assert((!IsInitial \|\| (DSWCount == `0` && DSWWithPermCount == `0` &&
2221	DSWWithSharedVMEMCount == `0`)) &&
2222	"DSWCounters should be zero in pre-RA scheduling!");
2223	SmallVector<SUnit *, `6`> DSWithPerms;
2224	for (auto &SU : DAG->SUnits) {
2225	auto *I = SU.getInstr();
2226	if (TII->isMFMAorWMMA(MI: *I))
2227	++MFMACount;
2228	else if (TII->isDS(MI: *I)) {
2229	if (I->mayLoad())
2230	++DSRCount;
2231	else if (I->mayStore() && IsInitial) {
2232	++DSWCount;
2233	for (auto Pred : SU.Preds) {
2234	if (Pred.getSUnit()->getInstr()->getOpcode() ==
2235	AMDGPU::V_PERM_B32_e64) {
2236	DSWithPerms.push_back(Elt: &SU);
2237	break;
2238	}
2239	}
2240	}
2241	}
2242	}
2243
2244	if (IsInitial) {
2245	DSWWithPermCount = DSWithPerms.size();
2246	auto *I = DSWithPerms.begin();
2247	auto *E = DSWithPerms.end();
2248
2249	// Get the count of DS_WRITES with V_PERM predecessors which
2250	// have loop carried dependencies (WAR) on the same VMEM_READs.
2251	// We consider partial overlap as a miss -- in other words,
2252	// for a given DS_W, we only consider another DS_W as matching
2253	// if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
2254	// for every V_PERM pred of this DS_W.
2255	DenseMap<MachineInstr , SUnit > VMEMLookup;
2256	SmallVector<SUnit *, `6`> Counted;
2257	for (; I != E; I++) {
2258	SUnit Cand = nullptr*;
2259	bool MissedAny = false;
2260	for (auto &Pred : (*I)->Preds) {
2261	if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
2262	continue;
2263
2264	if (Cand && llvm::is_contained(Range&: Counted, Element: Cand))
2265	break;
2266
2267	for (auto &Succ : Pred.getSUnit()->Succs) {
2268	auto *MI = Succ.getSUnit()->getInstr();
2269	if (!TII->isVMEM(MI: *MI) \|\| !MI->mayLoad())
2270	continue;
2271
2272	if (MissedAny \|\| !VMEMLookup.size()) {
2273	MissedAny = true;
2274	VMEMLookup [MI] = *I;
2275	continue;
2276	}
2277
2278	auto [It, Inserted] = VMEMLookup.try_emplace(Key: MI, Args&: *I);
2279	if (Inserted) {
2280	MissedAny = true;
2281	continue;
2282	}
2283
2284	Cand = It ->second;
2285	if (llvm::is_contained(Range&: Counted, Element: Cand)) {
2286	MissedAny = true;
2287	break;
2288	}
2289	}
2290	}
2291	if (!MissedAny && Cand) {
2292	DSWWithSharedVMEMCount += `2`;
2293	Counted.push_back(Elt: Cand);
2294	Counted.push_back(Elt: *I);
2295	}
2296	}
2297	}
2298
2299	assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
2300	SchedGroup *SG;
2301	unsigned PipelineSyncID = `0`;
2302	// For kernels with V_PERM, there are enough VALU to mix in between MFMAs
2303	if (DSWWithPermCount) {
2304	for (unsigned I = `0`; I < MFMACount; I++) {
2305	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2306	Args: SchedGroupMask::MFMA, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2307	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2308
2309	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2310	Args: SchedGroupMask::VALU, Args: `2`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2311	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2312	}
2313	}
2314
2315	PipelineSyncID = `1`;
2316	// Phase 1: Break up DS_READ and MFMA clusters.
2317	// First DS_READ to make ready initial MFMA, then interleave MFMA with DS_READ
2318	// prefetch
2319
2320	// Make ready initial MFMA
2321	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2322	Args: SchedGroupMask::DS_READ, Args: `4`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2323	SG->addRule(NewRule: std::make_shared<EnablesInitialMFMA>(args&: TII, args: SG->getSGID(), args: true));
2324	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2325
2326	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2327	Args: SchedGroupMask::MFMA, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2328	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2329
2330	// Interleave MFMA with DS_READ prefetch
2331	for (unsigned I = `4`; I < DSRCount; ++I) {
2332	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2333	Args: SchedGroupMask::DS_READ, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2334	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2335
2336	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2337	Args: SchedGroupMask::MFMA, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2338	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2339	}
2340
2341	// Phase 2a: Loop carried dependency with V_PERM
2342	// Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
2343	// depend on. Interleave MFMA to keep XDL unit busy throughout.
2344	for (unsigned I = DSWWithSharedVMEMCount; I < DSWWithPermCount; ++I) {
2345	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2346	Args: SchedGroupMask::VALU, Args: `4`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2347	SG->addRule(NewRule: std::make_shared<IsPermForDSW>(args&: TII, args: SG->getSGID(), args: true));
2348	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2349
2350	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2351	Args: SchedGroupMask::DS_WRITE, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2352	SG->addRule(NewRule: std::make_shared<IsSuccOfPrevGroup>(args&: TII, args: SG->getSGID()));
2353	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2354
2355	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2356	Args: SchedGroupMask::VMEM_READ, Args: `4`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2357	SG->addRule(NewRule: std::make_shared<SharesPredWithPrevNthGroup>(
2358	args: `1`, args&: TII, args: SG->getSGID(), args: true));
2359	SG->addRule(NewRule: std::make_shared<VMEMSize>(args&: TII, args: SG->getSGID()));
2360	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2361
2362	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2363	Args: SchedGroupMask::MFMA, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2364	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2365
2366	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2367	Args: SchedGroupMask::VMEM_READ, Args: `4`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2368	SG->addRule(NewRule: std::make_shared<SharesPredWithPrevNthGroup>(
2369	args: `3`, args&: TII, args: SG->getSGID(), args: true));
2370	SG->addRule(NewRule: std::make_shared<VMEMSize>(args&: TII, args: SG->getSGID()));
2371	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2372
2373	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2374	Args: SchedGroupMask::MFMA, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2375	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2376	}
2377
2378	// Phase 2b: Loop carried dependency without V_PERM
2379	// Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on.
2380	// Interleave MFMA to keep XDL unit busy throughout.
2381	for (unsigned I = DSWWithPermCount; I < DSWCount; I++) {
2382	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2383	Args: SchedGroupMask::DS_WRITE, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2384	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2385
2386	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2387	Args: SchedGroupMask::VMEM_READ, Args: `4`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2388	SG->addRule(NewRule: std::make_shared<VMEMSize>(args&: TII, args: SG->getSGID()));
2389	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2390
2391	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2392	Args: SchedGroupMask::MFMA, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2393	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2394	}
2395
2396	// Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are
2397	// ultimately used by two DS_WRITE
2398	// Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
2399	// depend on. Interleave MFMA to keep XDL unit busy throughout.
2400
2401	for (unsigned I = `0`; I < DSWWithSharedVMEMCount; ++I) {
2402	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2403	Args: SchedGroupMask::VALU, Args: `4`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2404	SG->addRule(NewRule: std::make_shared<IsPermForDSW>(args&: TII, args: SG->getSGID(), args: true));
2405	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2406
2407	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2408	Args: SchedGroupMask::DS_WRITE, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2409	SG->addRule(NewRule: std::make_shared<IsSuccOfPrevGroup>(args&: TII, args: SG->getSGID()));
2410	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2411
2412	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2413	Args: SchedGroupMask::MFMA, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2414	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2415
2416	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2417	Args: SchedGroupMask::VALU, Args: `4`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2418	SG->addRule(NewRule: std::make_shared<IsPermForDSW>(args&: TII, args: SG->getSGID(), args: true));
2419	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2420
2421	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2422	Args: SchedGroupMask::DS_WRITE, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2423	SG->addRule(NewRule: std::make_shared<IsSuccOfPrevGroup>(args&: TII, args: SG->getSGID()));
2424	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2425
2426	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2427	Args: SchedGroupMask::MFMA, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2428	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2429
2430	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2431	Args: SchedGroupMask::VMEM_READ, Args: `4`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2432	SG->addRule(NewRule: std::make_shared<SharesPredWithPrevNthGroup>(
2433	args: `2`, args&: TII, args: SG->getSGID(), args: true));
2434	SG->addRule(NewRule: std::make_shared<VMEMSize>(args&: TII, args: SG->getSGID()));
2435	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2436
2437	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2438	Args: SchedGroupMask::MFMA, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2439	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2440
2441	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2442	Args: SchedGroupMask::VMEM_READ, Args: `4`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2443	SG->addRule(NewRule: std::make_shared<SharesPredWithPrevNthGroup>(
2444	args: `4`, args&: TII, args: SG->getSGID(), args: true));
2445	SG->addRule(NewRule: std::make_shared<VMEMSize>(args&: TII, args: SG->getSGID()));
2446	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2447
2448	SG = &SyncedSchedGroups [PipelineSyncID].emplace_back(
2449	Args: SchedGroupMask::MFMA, Args: `1`, Args&: PipelineSyncID, Args&: DAG, Args&: TII);
2450	SG->findCandidateSUnits(SyncedInstrs&: SyncedInstrs [SG->getSyncID()]);
2451	}
2452
2453	return true;
2454	}
2455
2456	static std::unique_ptr<IGLPStrategy>
2457	createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
2458	const SIInstrInfo *TII) {
2459	switch (ID) {
2460	case MFMASmallGemmOptID:
2461	return std::make_unique<MFMASmallGemmOpt>(args&: DAG, args&: TII);
2462	case MFMASmallGemmSingleWaveOptID:
2463	return std::make_unique<MFMASmallGemmSingleWaveOpt>(args&: DAG, args&: TII);
2464	case MFMAExpInterleaveID:
2465	return std::make_unique<MFMAExpInterleaveOpt>(args&: DAG, args&: TII);
2466	case MFMAExpSimpleInterleaveID:
2467	return std::make_unique<MFMAExpSimpleInterleaveOpt>(args&: DAG, args&: TII);
2468	}
2469
2470	llvm_unreachable("Unknown IGLPStrategyID");
2471	}
2472
2473	class IGroupLPDAGMutation : public ScheduleDAGMutation {
2474	private:
2475	const SIInstrInfo *TII;
2476
2477	ScheduleDAGMI *DAG;
2478
2479	// Organize lists of SchedGroups by their SyncID. SchedGroups /
2480	// SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
2481	// between then.
2482	DenseMap<int, SmallVector<SchedGroup, `4`>> SyncedSchedGroups;
2483
2484	// Used to track instructions that can be mapped to multiple sched groups
2485	DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
2486
2487	// Add DAG edges that enforce SCHED_BARRIER ordering.
2488	void addSchedBarrierEdges(SUnit &SU);
2489
2490	// Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
2491	// not be reordered accross the SCHED_BARRIER. This is used for the base
2492	// SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
2493	// SCHED_BARRIER will always block all instructions that can be classified
2494	// into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size
2495	// and may only synchronize with some SchedGroups. Returns the inverse of
2496	// Mask. SCHED_BARRIER's mask describes which instruction types should be
2497	// allowed to be scheduled across it. Invert the mask to get the
2498	// SchedGroupMask of instructions that should be barred.
2499	SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;
2500
2501	// Create SchedGroups for a SCHED_GROUP_BARRIER.
2502	void initSchedGroupBarrierPipelineStage(
2503	std::vector<SUnit>::reverse_iterator RIter);
2504
2505	bool initIGLPOpt(SUnit &SU);
2506
2507	public:
2508	void apply(ScheduleDAGInstrs *DAGInstrs) override;
2509
2510	// The order in which the PipelineSolver should process the candidate
2511	// SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last
2512	// created SchedGroup first, and will consider that as the ultimate
2513	// predecessor group when linking. TOP_DOWN instead links and processes the
2514	// first created SchedGroup first.
2515	bool IsBottomUp = true;
2516
2517	// The scheduling phase this application of IGLP corresponds with.
2518	AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial;
2519
2520	IGroupLPDAGMutation() = default;
2521	IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {}
2522	};
2523
2524	unsigned SchedGroup::NumSchedGroups = `0`;
2525
2526	bool SchedGroup::tryAddEdge(SUnit A, SUnit B) {
2527	return A != B && DAG->addEdge(SuccSU: B, PredDep: SDep (A, SDep::Artificial));
2528	}
2529
2530	bool SchedGroup::canAddMI(const MachineInstr &MI) const {
2531	bool Result = false;
2532	if (MI.isMetaInstruction())
2533	Result = false;
2534
2535	else if (MI.isInlineAsm()) {
2536	const SIRegisterInfo &TRI = TII->getRegisterInfo();
2537	auto &MRI = MI.getParent()->getParent()->getRegInfo();
2538	bool SGPR_used = false, SGPR_big_def = false, VGPR_used = false,
2539	VMFMA_used = false, VReg32_used = false, MayLoad = MI.mayLoad(),
2540	MayStore = MI.mayStore();
2541	for (const MachineOperand &Operand : MI.operands())
2542	if (Operand.isReg()) {
2543	const TargetRegisterClass &RegClass =
2544	*TRI.getRegClassForOperandReg(MRI, MO: Operand);
2545	if (TRI.hasVGPRs(RC: &RegClass)) {
2546	VGPR_used = true;
2547	if (Operand.isUse() && TRI.getRegSizeInBits(RC: RegClass) == `32`)
2548	VReg32_used = true;
2549	}
2550	// > 128 bit registers are usually only used by MFMA instructions, so
2551	// we're using that as a heuristic to guess the schedule group mask of
2552	// the inline asm.
2553	if (TRI.hasAGPRs(RC: &RegClass) \|\| TRI.getRegSizeInBits(RC: RegClass) > `128`)
2554	VMFMA_used = true;
2555	if (TRI.hasSGPRs(RC: &RegClass))
2556	SGPR_used = true;
2557	if (TRI.getRegSizeInBits(RC: RegClass) > `64` && Operand.isDef())
2558	SGPR_big_def = true;
2559	}
2560
2561	typedef std::underlying_type_t<SchedGroupMask> SGMask_t;
2562	SGMask_t InlineAsmMask = `0`;
2563	if (VGPR_used && !VMFMA_used && !MayLoad && !MayStore)
2564	InlineAsmMask \|= (SGMask_t)SchedGroupMask::VALU;
2565	if (SGPR_used && !VGPR_used && !MayLoad && !MayStore)
2566	InlineAsmMask \|= (SGMask_t)SchedGroupMask::SALU;
2567	if (VMFMA_used)
2568	InlineAsmMask \|= (SGMask_t)SchedGroupMask::MFMA;
2569	if (VGPR_used && MayLoad)
2570	InlineAsmMask \|= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_READ
2571	: SchedGroupMask::VMEM_READ);
2572	if (VGPR_used && MayStore)
2573	InlineAsmMask \|= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_WRITE
2574	: SchedGroupMask::VMEM_WRITE);
2575	if (SGPR_big_def)
2576	InlineAsmMask \|= (SGMask_t)SchedGroupMask::DS_READ;
2577	if (InlineAsmMask & (SGMask_t)SchedGroupMask::VALU \|\|
2578	InlineAsmMask & (SGMask_t)SchedGroupMask::SALU)
2579	InlineAsmMask \|= (SGMask_t)SchedGroupMask::ALU;
2580	if (InlineAsmMask & (SGMask_t)SchedGroupMask::DS_READ \|\|
2581	InlineAsmMask & (SGMask_t)SchedGroupMask::DS_WRITE)
2582	InlineAsmMask \|= (SGMask_t)SchedGroupMask::DS;
2583	if (InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_READ \|\|
2584	InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_WRITE)
2585	InlineAsmMask \|= (SGMask_t)SchedGroupMask::VMEM;
2586
2587	Result = ((SGMask_t)SGMask & InlineAsmMask) != `0`;
2588	}
2589
2590	else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
2591	(TII->isVALU(MI, /AllowLDSDMA=/true) \|\| TII->isMFMAorWMMA(MI) \|\|
2592	TII->isSALU(MI) \|\| TII->isTRANS(MI)))
2593	Result = !MI.mayLoadOrStore();
2594
2595	else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
2596	TII->isVALU(MI, /AllowLDSDMA=/true) && !TII->isMFMAorWMMA(MI) &&
2597	!TII->isTRANS(MI) && !TII->isLDSDMA(MI)) {
2598	// Some memory instructions may be marked as VALU (e.g. BUFFER_LOAD__LDS).*
2599	// For our purposes, these shall not be classified as VALU as this results
2600	// in unexpected behavior.
2601	Result = !MI.mayLoadOrStore();
2602	}
2603
2604	else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
2605	TII->isSALU(MI))
2606	Result = !MI.mayLoadOrStore();
2607
2608	else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
2609	TII->isMFMAorWMMA(MI))
2610	Result = true;
2611
2612	else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
2613	(TII->isVMEM(MI) \|\| TII->isLDSDMA(MI)))
2614	Result = true;
2615
2616	else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
2617	MI.mayLoad() && TII->isVMEM(MI))
2618	Result = true;
2619
2620	else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
2621	MI.mayStore() && TII->isVMEM(MI))
2622	Result = true;
2623
2624	else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
2625	(TII->isDS(MI) \|\| TII->isLDSDMA(MI)))
2626	Result = true;
2627
2628	else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
2629	MI.mayLoad() && TII->isDS(MI))
2630	Result = true;
2631
2632	else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
2633	MI.mayStore() && TII->isDS(MI))
2634	Result = true;
2635
2636	else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&
2637	TII->isTRANS(MI))
2638	Result = true;
2639
2640	else if (((SGMask & SchedGroupMask::LDSDMA) != SchedGroupMask::NONE) &&
2641	TII->isLDSDMA(MI))
2642	Result = true;
2643
2644	LLVM_DEBUG(
2645	dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, `10`, true)
2646	<< (Result ? " could classify " : " unable to classify ") << MI);
2647
2648	return Result;
2649	}
2650
2651	int SchedGroup::link(SUnit &SU, bool MakePred,
2652	std::list<std::pair<SUnit , SUnit >> &AddedEdges) {
2653	int MissedEdges = `0`;
2654	for (auto *A : Collection) {
2655	SUnit *B = &SU;
2656	if (A == B \|\| A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2657	continue;
2658	if (MakePred)
2659	std::swap(a&: A, b&: B);
2660
2661	if (DAG->IsReachable(SU: B, TargetSU: A))
2662	continue;
2663
2664	// tryAddEdge returns false if there is a dependency that makes adding
2665	// the A->B edge impossible, otherwise it returns true;
2666	bool Added = tryAddEdge(A, B);
2667	if (Added)
2668	AddedEdges.emplace_back(args&: A, args&: B);
2669	else
2670	++MissedEdges;
2671	}
2672
2673	return MissedEdges;
2674	}
2675
2676	void SchedGroup::link(SUnit &SU, bool MakePred) {
2677	for (auto *A : Collection) {
2678	SUnit *B = &SU;
2679	if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2680	continue;
2681	if (MakePred)
2682	std::swap(a&: A, b&: B);
2683
2684	tryAddEdge(A, B);
2685	}
2686	}
2687
2688	void SchedGroup::link(SUnit &SU,
2689	function_ref<bool(const SUnit A, const* SUnit *B)> P) {
2690	for (auto *A : Collection) {
2691	SUnit *B = &SU;
2692	if (P (A, B))
2693	std::swap(a&: A, b&: B);
2694
2695	tryAddEdge(A, B);
2696	}
2697	}
2698
2699	void SchedGroup::link(SchedGroup &OtherGroup) {
2700	for (auto *B : OtherGroup.Collection)
2701	link(SU&: *B);
2702	}
2703
2704	bool SchedGroup::canAddSU(SUnit &SU) const {
2705	MachineInstr &MI = *SU.getInstr();
2706	if (MI.getOpcode() != TargetOpcode::BUNDLE)
2707	return canAddMI(MI);
2708
2709	// Special case for bundled MIs.
2710	const MachineBasicBlock *MBB = MI.getParent();
2711	MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
2712	while (E != MBB->end() && E ->isBundledWithPred())
2713	++E;
2714
2715	// Return true if all of the bundled MIs can be added to this group.
2716	return std::all_of(first: B, last: E, pred: [this](MachineInstr &MI) { return canAddMI(MI); });
2717	}
2718
2719	template <class T>
2720	void SchedGroup::findCandidateSUnits(T Begin, T End,
2721	SUnitsToCandidateSGsMap &SyncedInstrs) {
2722	for (SUnit &SU : make_range(Begin, End)) {
2723	if (canAddSU(SU))
2724	SyncedInstrs [&SU].push_back(Elt: SGID);
2725	}
2726	}
2727
2728	void SchedGroup::findCandidateSUnits(SUnitsToCandidateSGsMap &SyncedInstrs) {
2729	findCandidateSUnits(Begin: DAG->SUnits.rbegin(), End: DAG->SUnits.rend(), SyncedInstrs);
2730	}
2731
2732	void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
2733	const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
2734	if (!TSchedModel \|\| DAGInstrs->SUnits.empty())
2735	return;
2736
2737	LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
2738	const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
2739	TII = ST.getInstrInfo();
2740	DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
2741	SyncedSchedGroups.clear();
2742	SyncedInstrs.clear();
2743	bool FoundSB = false;
2744	bool FoundIGLP = false;
2745	bool ShouldApplyIGLP = false;
2746	for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
2747	unsigned Opc = R ->getInstr()->getOpcode();
2748	// SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
2749	if (Opc == AMDGPU::SCHED_BARRIER) {
2750	addSchedBarrierEdges(SU&: *R);
2751	FoundSB = true;
2752	} else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
2753	initSchedGroupBarrierPipelineStage(RIter: R);
2754	FoundSB = true;
2755	} else if (Opc == AMDGPU::IGLP_OPT) {
2756	if (!FoundSB && !FoundIGLP) {
2757	FoundIGLP = true;
2758	ShouldApplyIGLP = initIGLPOpt(SU&: *R);
2759	}
2760	}
2761	}
2762
2763	if (FoundSB \|\| (FoundIGLP && ShouldApplyIGLP)) {
2764	PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
2765	// PipelineSolver performs the mutation by adding the edges it
2766	// determined as the best
2767	PS.solve();
2768	return;
2769	}
2770	}
2771
2772	void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
2773	MachineInstr &MI = *SchedBarrier.getInstr();
2774	assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
2775	LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: "
2776	<< MI.getOperand(`0`).getImm() << "\n");
2777	auto InvertedMask =
2778	invertSchedBarrierMask(Mask: (SchedGroupMask)MI.getOperand(i: `0`).getImm());
2779	SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
2780
2781	for (SUnit &SU : DAG->SUnits)
2782	if (SG.canAddSU(SU))
2783	SG.add(SU);
2784
2785	// Preserve original instruction ordering relative to the SCHED_BARRIER.
2786	SG.link(
2787	SU&: SchedBarrier,
2788	P: (function_ref<bool(const SUnit A, const* SUnit *B)>)[](
2789	const SUnit A, const* SUnit B) { return* A->NodeNum > B->NodeNum; });
2790	}
2791
2792	SchedGroupMask
2793	IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
2794	// Invert mask and erase bits for types of instructions that are implied to be
2795	// allowed past the SCHED_BARRIER.
2796	SchedGroupMask InvertedMask = ~Mask;
2797
2798	static constexpr std::pair<SchedGroupMask, SchedGroupMask> ImpliedGroups[] = {
2799	{SchedGroupMask::ALU, SchedGroupMask::VALU \| SchedGroupMask::SALU \|
2800	SchedGroupMask::MFMA \| SchedGroupMask::TRANS},
2801	{SchedGroupMask::VMEM, SchedGroupMask::VMEM_READ \|
2802	SchedGroupMask::VMEM_WRITE \|
2803	SchedGroupMask::LDSDMA},
2804	{SchedGroupMask::DS, SchedGroupMask::DS_READ \| SchedGroupMask::DS_WRITE \|
2805	SchedGroupMask::LDSDMA},
2806	};
2807
2808	for (auto [Aggregate, Members] : ImpliedGroups) {
2809	// Aggregate allowed past the barrier implies all its members are too.
2810	if ((InvertedMask & Aggregate) == SchedGroupMask::NONE)
2811	InvertedMask &= ~Members;
2812	// Any member allowed past the barrier implies the aggregate is too.
2813	else if ((InvertedMask & Members) != Members)
2814	InvertedMask &= ~Aggregate;
2815	}
2816
2817	LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask
2818	<< "\n");
2819
2820	return InvertedMask;
2821	}
2822
2823	void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
2824	std::vector<SUnit>::reverse_iterator RIter) {
2825	MachineInstr &SGB = *RIter ->getInstr();
2826	assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
2827	int32_t SGMask = SGB.getOperand(i: `0`).getImm();
2828	int32_t Size = SGB.getOperand(i: `1`).getImm();
2829	int32_t SyncID = SGB.getOperand(i: `2`).getImm();
2830
2831	Size++; // Make room for the SCHED_GROUP_BARRIER instruction
2832	auto &SG = SyncedSchedGroups [SyncID].emplace_back(Args: (SchedGroupMask)SGMask,
2833	Args&: Size, Args&: SyncID, Args&: DAG, Args&: TII);
2834	SG.add(SU&: *RIter);
2835	SG.findCandidateSUnits(Begin: RIter, End: SG.DAG->SUnits.rend(),
2836	SyncedInstrs&: SyncedInstrs [SG.getSyncID()]);
2837	}
2838
2839	bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
2840	IGLPStrategyID StrategyID =
2841	(IGLPStrategyID)SU.getInstr()->getOperand(i: `0`).getImm();
2842	auto S = createIGLPStrategy(ID: StrategyID, DAG, TII);
2843	if (!S ->shouldApplyStrategy(DAG, Phase))
2844	return false;
2845
2846	IsBottomUp = S ->IsBottomUp;
2847	return S ->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase);
2848	}
2849
2850	} // namespace
2851
2852	/// \p Phase specifes whether or not this is a reentry into the
2853	/// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
2854	/// same scheduling region (e.g. pre and post-RA scheduling / multiple
2855	/// scheduling "phases"), we can reenter this mutation framework more than once
2856	/// for a given region.
2857	std::unique_ptr<ScheduleDAGMutation>
2858	llvm::createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {
2859	return std::make_unique<IGroupLPDAGMutation>(args&: Phase);
2860	}
2861

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp