1 | //===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // \file This file defines a set of schedule DAG mutations that can be used to |
10 | // override default scheduler behavior to enforce specific scheduling patterns. |
11 | // They should be used in cases where runtime performance considerations, such |
12 | // as inter-wavefront interactions, mean that compile-time heuristics cannot |
13 | // predict the optimal instruction ordering, or in kernels where optimal |
14 | // instruction scheduling is important enough to warrant manual intervention. |
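//
// These mutations are driven by the SCHED_BARRIER, SCHED_GROUP_BARRIER and
// IGLP_OPT pseudo instructions handled below (emitted for the corresponding
// llvm.amdgcn.* intrinsics). As a rough illustration -- assuming the clang
// builtin spelling __builtin_amdgcn_sched_group_barrier(mask, size, syncid) --
// a kernel could request groups of two DS reads followed by one MFMA,
// synchronized under sync ID 0:
//
//   __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // 2 x DS_READ
//   __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // 1 x MFMA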
15 | // |
16 | //===----------------------------------------------------------------------===// |
17 | |
18 | #include "AMDGPUIGroupLP.h" |
19 | #include "AMDGPUTargetMachine.h" |
20 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
21 | #include "SIInstrInfo.h" |
22 | #include "SIMachineFunctionInfo.h" |
23 | #include "llvm/ADT/BitmaskEnum.h" |
24 | #include "llvm/ADT/DenseMap.h" |
25 | #include "llvm/CodeGen/MachineScheduler.h" |
26 | #include "llvm/CodeGen/TargetOpcodes.h" |
27 | |
28 | using namespace llvm; |
29 | |
30 | #define DEBUG_TYPE "igrouplp" |
31 | |
32 | namespace { |
33 | |
34 | static cl::opt<bool> EnableExactSolver( |
35 | "amdgpu-igrouplp-exact-solver" , cl::Hidden, |
36 | cl::desc("Whether to use the exponential time solver to fit " |
37 | "the instructions to the pipeline as closely as " |
38 | "possible." ), |
39 | cl::init(Val: false)); |
40 | |
41 | static cl::opt<unsigned> CutoffForExact( |
42 | "amdgpu-igrouplp-exact-solver-cutoff" , cl::init(Val: 0), cl::Hidden, |
43 | cl::desc("The maximum number of scheduling group conflicts " |
44 | "which we attempt to solve with the exponential time " |
45 | "exact solver. Problem sizes greater than this will" |
46 | "be solved by the less accurate greedy algorithm. Selecting " |
47 | "solver by size is superseded by manually selecting " |
48 | "the solver (e.g. by amdgpu-igrouplp-exact-solver" )); |
49 | |
50 | static cl::opt<uint64_t> MaxBranchesExplored( |
51 | "amdgpu-igrouplp-exact-solver-max-branches" , cl::init(Val: 0), cl::Hidden, |
52 | cl::desc("The amount of branches that we are willing to explore with" |
53 | "the exact algorithm before giving up." )); |
54 | |
55 | static cl::opt<bool> UseCostHeur( |
56 | "amdgpu-igrouplp-exact-solver-cost-heur" , cl::init(Val: true), cl::Hidden, |
57 | cl::desc("Whether to use the cost heuristic to make choices as we " |
58 | "traverse the search space using the exact solver. Defaulted " |
59 | "to on, and if turned off, we will use the node order -- " |
60 | "attempting to put the later nodes in the later sched groups. " |
61 | "Experimentally, results are mixed, so this should be set on a " |
62 | "case-by-case basis." )); |
63 | |
64 | // Components of the mask that determines which instruction types may be |
65 | // classified into a SchedGroup. |
66 | enum class SchedGroupMask { |
67 | NONE = 0u, |
68 | ALU = 1u << 0, |
69 | VALU = 1u << 1, |
70 | SALU = 1u << 2, |
71 | MFMA = 1u << 3, |
72 | VMEM = 1u << 4, |
73 | VMEM_READ = 1u << 5, |
74 | VMEM_WRITE = 1u << 6, |
75 | DS = 1u << 7, |
76 | DS_READ = 1u << 8, |
77 | DS_WRITE = 1u << 9, |
78 | TRANS = 1u << 10, |
79 | ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS | |
80 | DS_READ | DS_WRITE | TRANS, |
81 | LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) |
82 | }; |
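// For example, a SchedGroup created with (SchedGroupMask::DS_READ |
// SchedGroupMask::MFMA) will accept both DS-read and MFMA instructions, while
// a group created with a single bit accepts only that instruction class.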
83 | |
84 | class SchedGroup; |
85 | |
86 | // InstructionRule class is used to enact a filter which determines whether or |
87 | // not an SU maps to a given SchedGroup. It contains complementary data |
88 | // structures (e.g. Cache) to help those filters. |
89 | class InstructionRule { |
90 | protected: |
91 | const SIInstrInfo *TII; |
92 | unsigned SGID; |
93 | // A cache made available to the Filter to store SUnits for subsequent |
94 | // invocations of the Filter |
95 | std::optional<SmallVector<SUnit *, 4>> Cache; |
96 | |
97 | public: |
98 | virtual bool |
99 | apply(const SUnit *, const ArrayRef<SUnit *>, |
100 | SmallVectorImpl<SchedGroup> &) { |
101 | return true; |
102 | }; |
103 | |
104 | InstructionRule(const SIInstrInfo *TII, unsigned SGID, |
105 | bool NeedsCache = false) |
106 | : TII(TII), SGID(SGID) { |
107 | if (NeedsCache) { |
108 | Cache = SmallVector<SUnit *, 4>(); |
109 | } |
110 | } |
111 | |
112 | virtual ~InstructionRule() = default; |
113 | }; |
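// Concrete rules override apply(); see e.g. the IsPipeExp and EnablesNthMFMA
// filters defined for MFMAExpInterleaveOpt further below, which admit an SU
// into a SchedGroup only if it transitively feeds the relevant MFMAs.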
114 | |
115 | using SUnitsToCandidateSGsMap = DenseMap<SUnit *, SmallVector<int, 4>>; |
116 | |
117 | // Classify instructions into groups to enable fine tuned control over the |
118 | // scheduler. These groups may be more specific than current SchedModel |
119 | // instruction classes. |
120 | class SchedGroup { |
121 | private: |
122 | // Mask that defines which instruction types can be classified into this |
123 | // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER |
124 | // and SCHED_GROUP_BARRIER. |
125 | SchedGroupMask SGMask; |
126 | |
127 | // Maximum number of SUnits that can be added to this group. |
128 | std::optional<unsigned> MaxSize; |
129 | |
130 | // SchedGroups will only synchronize with other SchedGroups that have the same |
131 | // SyncID. |
132 | int SyncID = 0; |
133 | |
134 | // SGID is used to map instructions to candidate SchedGroups |
135 | unsigned SGID; |
136 | |
137 | // The different rules each instruction in this SchedGroup must conform to |
138 | SmallVector<std::shared_ptr<InstructionRule>, 4> Rules; |
139 | |
140 | // Count of the number of created SchedGroups, used to initialize SGID. |
141 | static unsigned NumSchedGroups; |
142 | |
143 | // Try to add an edge from SU A to SU B. |
144 | bool tryAddEdge(SUnit *A, SUnit *B); |
145 | |
146 | // Use SGMask to determine whether we can classify MI as a member of this |
147 | // SchedGroup object. |
148 | bool canAddMI(const MachineInstr &MI) const; |
149 | |
150 | public: |
151 | // Collection of SUnits that are classified as members of this group. |
152 | SmallVector<SUnit *, 32> Collection; |
153 | |
154 | ScheduleDAGInstrs *DAG; |
155 | const SIInstrInfo *TII; |
156 | |
157 | // Returns true if SU can be added to this SchedGroup. |
158 | bool canAddSU(SUnit &SU) const; |
159 | |
160 | // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If |
161 | // MakePred is true, SU will be a predecessor of the SUnits in this |
162 | // SchedGroup, otherwise SU will be a successor. |
163 | void link(SUnit &SU, bool MakePred = false); |
164 | |
165 | // Add DAG dependencies and track which edges are added, and the count of |
166 | // missed edges |
167 | int link(SUnit &SU, bool MakePred, |
168 | std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); |
169 | |
170 | // Add DAG dependencies from all SUnits in this SchedGroup and this SU. |
171 | // Use the predicate to determine whether SU should be a predecessor (P = |
172 | // true) or a successor (P = false) of this SchedGroup. |
173 | void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P); |
174 | |
175 | // Add DAG dependencies such that SUnits in this group shall be ordered |
176 | // before SUnits in OtherGroup. |
177 | void link(SchedGroup &OtherGroup); |
178 | |
179 | // Returns true if no more instructions may be added to this group. |
180 | bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; } |
181 | |
182 | // Append a constraint that SUs must meet in order to fit into this |
183 | // SchedGroup. Since many rules involve the relationship between a SchedGroup |
184 | // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve |
185 | // time (rather than at SchedGroup init time). |
186 | void addRule(std::shared_ptr<InstructionRule> NewRule) { |
187 | Rules.push_back(NewRule); |
188 | } |
189 | |
190 | // Returns true if the SU matches all rules |
191 | bool allowedByRules(const SUnit *SU, |
192 | SmallVectorImpl<SchedGroup> &SyncPipe) const { |
193 | for (auto &Rule : Rules) { |
194 | if (!Rule.get()->apply(SU, Collection, SyncPipe)) |
195 | return false; |
196 | } |
197 | return true; |
198 | } |
199 | |
200 | // Add SU to the SchedGroup. |
201 | void add(SUnit &SU) { |
202 | LLVM_DEBUG(dbgs() << "For SchedGroup with mask " |
203 | << format_hex((int)SGMask, 10, true) << " adding " |
204 | << *SU.getInstr()); |
205 | Collection.push_back(&SU); |
206 | } |
207 | |
208 | // Remove last element in the SchedGroup |
209 | void pop() { Collection.pop_back(); } |
210 | |
211 | // Identify and add all relevant SUs from the DAG to this SchedGroup. |
212 | void initSchedGroup(); |
213 | |
214 | // Add instructions to the SchedGroup bottom up starting from RIter. |
215 | // PipelineInstrs is a set of instructions that should not be added to the |
216 | // SchedGroup even when the other conditions for adding it are satisfied. |
217 | // RIter will be added to the SchedGroup as well, and dependencies will be |
218 | // added so that RIter will always be scheduled at the end of the group. |
219 | void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter, |
220 | SUnitsToCandidateSGsMap &SyncedInstrs); |
221 | |
222 | void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs); |
223 | |
224 | int getSyncID() { return SyncID; } |
225 | |
226 | int getSGID() { return SGID; } |
227 | |
228 | SchedGroupMask getMask() { return SGMask; } |
229 | |
230 | SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, |
231 | ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) |
232 | : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) { |
233 | SGID = NumSchedGroups++; |
234 | } |
235 | |
236 | SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID, |
237 | ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) |
238 | : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) { |
239 | SGID = NumSchedGroups++; |
240 | } |
241 | }; |
242 | |
243 | // Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER. |
244 | static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) { |
245 | assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER || |
246 | SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER || |
247 | SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT); |
248 | |
249 | while (!SU.Preds.empty()) |
250 | for (auto &P : SU.Preds) |
251 | SU.removePred(P); |
252 | |
253 | while (!SU.Succs.empty()) |
254 | for (auto &S : SU.Succs) |
255 | for (auto &SP : S.getSUnit()->Preds) |
256 | if (SP.getSUnit() == &SU) |
257 | S.getSUnit()->removePred(SP); |
258 | } |
259 | |
260 | using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>; |
261 | using SUsToCandSGsVec = SmallVector<SUToCandSGsPair, 4>; |
262 | |
263 | // The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline |
264 | // in non-trivial cases. For example, if the requested pipeline is |
265 | // {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction |
266 | // in the DAG, then we will have an instruction that cannot be trivially |
267 | // assigned to a SchedGroup. The PipelineSolver class implements two algorithms |
268 | // to find a good solution to the pipeline -- a greedy algorithm and an exact |
269 | // algorithm. The exact algorithm has an exponential time complexity and should |
270 | // only be used for small problems, or for medium-sized problems where an exact |
271 | // solution is highly desired. |
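// Continuing the example above, the extra VMEM_READ can legally map to either
// SchedGroup 0 or SchedGroup 3 of the requested pipeline. The solver scores
// each candidate assignment by the number of pipeline ordering edges it would
// fail to add, and keeps the lowest-cost fit (greedily by default, or with
// exhaustive backtracking when the exact solver is enabled).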
272 | class PipelineSolver { |
273 | ScheduleDAGMI *DAG; |
274 | |
275 | // Instructions that can be assigned to multiple SchedGroups |
276 | DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs; |
277 | SmallVector<SUsToCandSGsVec, 4> PipelineInstrs; |
278 | DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups; |
279 | // The current working pipeline |
280 | SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline; |
281 | // The pipeline that has the best solution found so far |
282 | SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline; |
283 | |
284 | // Whether or not we actually have any SyncedInstrs to try to solve. |
285 | bool NeedsSolver = false; |
286 | |
287 | // Compute an estimate of the size of the search tree -- the true size is |
288 | // the product of each conflictedInst.Matches.size() across all SyncPipelines |
289 | unsigned computeProblemSize(); |
290 | |
291 | // The cost penalty of not assigning a SU to a SchedGroup |
292 | int MissPenalty = 0; |
293 | |
294 | // Costs in terms of the number of edges we are unable to add |
295 | int BestCost = -1; |
296 | int CurrCost = 0; |
297 | |
298 | // Index pointing to the conflicting instruction that is currently being |
299 | // fitted |
300 | int CurrConflInstNo = 0; |
301 | // Index to the pipeline that is currently being fitted |
302 | int CurrSyncGroupIdx = 0; |
303 | // The first non trivial pipeline |
304 | int BeginSyncGroupIdx = 0; |
305 | |
306 | // How many branches we have explored |
307 | uint64_t BranchesExplored = 0; |
308 | |
309 | // The direction in which we process the candidate SchedGroups per SU |
310 | bool IsBottomUp = true; |
311 | |
312 | // Update indices to fit next conflicting instruction |
313 | void advancePosition(); |
314 | // Recede indices to attempt to find better fit for previous conflicting |
315 | // instruction |
316 | void retreatPosition(); |
317 | |
318 | // The exponential time algorithm which finds the provably best fit |
319 | bool solveExact(); |
320 | // The polynomial time algorithm which attempts to find a good fit |
321 | bool solveGreedy(); |
322 | // Find the best SchedGroup for the current SU using the heuristic given all |
323 | // current information. One step in the greedy algorithm. Templated against |
324 | // the SchedGroup iterator (either reverse or forward). |
325 | template <typename T> |
326 | void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, |
327 | T E); |
328 | // Whether or not the current solution is optimal |
329 | bool checkOptimal(); |
330 | // Populate the ready list, prioritizing fewest missed edges first. |
331 | // Templated against the SchedGroup iterator (either reverse or forward). |
332 | template <typename T> |
333 | void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, |
334 | T E); |
335 | // Add edges corresponding to the SchedGroups as assigned by solver |
336 | void makePipeline(); |
337 | // Link the SchedGroups in the best found pipeline. |
338 | // Templated against the SchedGroup iterator (either reverse or forward). |
339 | template <typename T> void linkSchedGroups(T I, T E); |
340 | // Add the edges from the SU to the other SchedGroups in pipeline, and |
341 | // return the number of edges missed. |
342 | int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID, |
343 | std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); |
344 | /// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It |
345 | /// returns the cost (in terms of missed pipeline edges), and tracks the edges |
346 | /// added in \p AddedEdges |
347 | template <typename T> |
348 | int linkSUnit(SUnit *SU, int SGID, |
349 | std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E); |
350 | /// Remove the edges passed via \p AddedEdges |
351 | void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); |
352 | // Convert the passed in maps to arrays for bidirectional iterators |
353 | void convertSyncMapsToArrays(); |
354 | |
355 | void reset(); |
356 | |
357 | public: |
358 | // Invoke the solver to map instructions to instruction groups. The problem |
359 | // size and command-line options determine whether the exact or greedy algorithm is used. |
360 | void solve(); |
361 | |
362 | PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
363 | DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
364 | ScheduleDAGMI *DAG, bool IsBottomUp = true) |
365 | : DAG(DAG), SyncedInstrs(SyncedInstrs), |
366 | SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) { |
367 | |
368 | for (auto &PipelineInstrs : SyncedInstrs) { |
369 | if (PipelineInstrs.second.size() > 0) { |
370 | NeedsSolver = true; |
371 | break; |
372 | } |
373 | } |
374 | |
375 | if (!NeedsSolver) |
376 | return; |
377 | |
378 | convertSyncMapsToArrays(); |
379 | |
380 | CurrPipeline = BestPipeline; |
381 | |
382 | while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() && |
383 | PipelineInstrs[BeginSyncGroupIdx].size() == 0) |
384 | ++BeginSyncGroupIdx; |
385 | |
386 | if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size()) |
387 | return; |
388 | } |
389 | }; |
390 | |
391 | void PipelineSolver::reset() { |
392 | |
393 | for (auto &SyncPipeline : CurrPipeline) { |
394 | for (auto &SG : SyncPipeline) { |
395 | SmallVector<SUnit *, 32> TempCollection = SG.Collection; |
396 | SG.Collection.clear(); |
397 | auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) { |
398 | return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER; |
399 | }); |
400 | if (SchedBarr != TempCollection.end()) |
401 | SG.Collection.push_back(*SchedBarr); |
402 | } |
403 | } |
404 | |
405 | CurrSyncGroupIdx = BeginSyncGroupIdx; |
406 | CurrConflInstNo = 0; |
407 | CurrCost = 0; |
408 | } |
409 | |
410 | void PipelineSolver::convertSyncMapsToArrays() { |
411 | for (auto &SyncPipe : SyncedSchedGroups) { |
412 | BestPipeline.insert(BestPipeline.begin(), SyncPipe.second); |
413 | } |
414 | |
415 | int PipelineIDx = SyncedInstrs.size() - 1; |
416 | PipelineInstrs.resize(SyncedInstrs.size()); |
417 | for (auto &SyncInstrMap : SyncedInstrs) { |
418 | for (auto &SUsToCandSGs : SyncInstrMap.second) { |
419 | if (PipelineInstrs[PipelineIDx].size() == 0) { |
420 | PipelineInstrs[PipelineIDx].push_back( |
421 | std::pair(SUsToCandSGs.first, SUsToCandSGs.second)); |
422 | continue; |
423 | } |
424 | auto SortPosition = PipelineInstrs[PipelineIDx].begin(); |
425 | // Insert them in sorted order -- this allows for good parsing order in |
426 | // the greedy algorithm |
427 | while (SortPosition != PipelineInstrs[PipelineIDx].end() && |
428 | SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum) |
429 | ++SortPosition; |
430 | PipelineInstrs[PipelineIDx].insert( |
431 | SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second)); |
432 | } |
433 | --PipelineIDx; |
434 | } |
435 | } |
436 | |
437 | template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) { |
438 | for (; I != E; ++I) { |
439 | auto &GroupA = *I; |
440 | for (auto J = std::next(I); J != E; ++J) { |
441 | auto &GroupB = *J; |
442 | GroupA.link(GroupB); |
443 | } |
444 | } |
445 | } |
446 | |
447 | void PipelineSolver::makePipeline() { |
448 | // Preserve the order of barrier for subsequent SchedGroupBarrier mutations |
449 | for (auto &SyncPipeline : BestPipeline) { |
450 | LLVM_DEBUG(dbgs() << "Printing SchedGroups\n" ); |
451 | for (auto &SG : SyncPipeline) { |
452 | LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID() |
453 | << " has: \n" ); |
454 | SUnit *SGBarr = nullptr; |
455 | for (auto &SU : SG.Collection) { |
456 | if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) |
457 | SGBarr = SU; |
458 | LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n" ); |
459 | } |
460 | // Command line requested IGroupLP doesn't have SGBarr |
461 | if (!SGBarr) |
462 | continue; |
463 | resetEdges(*SGBarr, DAG); |
464 | SG.link(*SGBarr, false); |
465 | } |
466 | } |
467 | |
468 | for (auto &SyncPipeline : BestPipeline) { |
469 | IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend()) |
470 | : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end()); |
471 | } |
472 | } |
473 | |
474 | template <typename T> |
475 | int PipelineSolver::linkSUnit( |
476 | SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, |
477 | T I, T E) { |
478 | bool MakePred = false; |
479 | int AddedCost = 0; |
480 | for (; I < E; ++I) { |
481 | if (I->getSGID() == SGID) { |
482 | MakePred = true; |
483 | continue; |
484 | } |
485 | auto Group = *I; |
486 | AddedCost += Group.link(*SU, MakePred, AddedEdges); |
487 | assert(AddedCost >= 0); |
488 | } |
489 | return AddedCost; |
490 | } |
491 | |
492 | int PipelineSolver::addEdges( |
493 | SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID, |
494 | std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { |
495 | |
496 | // For IsBottomUp, the first SchedGroup in SyncPipeline contains the |
497 | // instructions that are the ultimate successors in the resultant mutation. |
498 | // Therefore, in such a configuration, the SchedGroups occurring before the |
499 | // candidate SGID are successors of the candidate SchedGroup, thus the current |
500 | // SU should be linked as a predecessor to SUs in those SchedGroups. The |
501 | // opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple |
502 | // SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using |
503 | // IsBottomUp (in reverse). |
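// For instance, given a pipeline {SG0, SG1, SG2} with IsBottomUp == true,
// linking an SU that belongs to SG1 makes it a predecessor of the SUs
// already placed in SG0 and a successor of the SUs already placed in SG2.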
504 | return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(), |
505 | SyncPipeline.rend()) |
506 | : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(), |
507 | SyncPipeline.end()); |
508 | } |
509 | |
510 | void PipelineSolver::removeEdges( |
511 | const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) { |
512 | // Only remove the edges that we have added when testing |
513 | // the fit. |
514 | for (auto &PredSuccPair : EdgesToRemove) { |
515 | SUnit *Pred = PredSuccPair.first; |
516 | SUnit *Succ = PredSuccPair.second; |
517 | |
518 | auto Match = llvm::find_if( |
519 | Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; }); |
520 | if (Match != Succ->Preds.end()) { |
521 | assert(Match->isArtificial()); |
522 | Succ->removePred(*Match); |
523 | } |
524 | } |
525 | } |
526 | |
527 | void PipelineSolver::advancePosition() { |
528 | ++CurrConflInstNo; |
529 | |
530 | if (static_cast<size_t>(CurrConflInstNo) >= |
531 | PipelineInstrs[CurrSyncGroupIdx].size()) { |
532 | CurrConflInstNo = 0; |
533 | ++CurrSyncGroupIdx; |
534 | // Advance to next non-trivial pipeline |
535 | while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() && |
536 | PipelineInstrs[CurrSyncGroupIdx].size() == 0) |
537 | ++CurrSyncGroupIdx; |
538 | } |
539 | } |
540 | |
541 | void PipelineSolver::retreatPosition() { |
542 | assert(CurrConflInstNo >= 0); |
543 | assert(CurrSyncGroupIdx >= 0); |
544 | |
545 | if (CurrConflInstNo > 0) { |
546 | --CurrConflInstNo; |
547 | return; |
548 | } |
549 | |
550 | if (CurrConflInstNo == 0) { |
551 | // If we return to the starting position, we have explored |
552 | // the entire tree |
553 | if (CurrSyncGroupIdx == BeginSyncGroupIdx) |
554 | return; |
555 | |
556 | --CurrSyncGroupIdx; |
557 | // Go to previous non-trivial pipeline |
558 | while (PipelineInstrs[CurrSyncGroupIdx].size() == 0) |
559 | --CurrSyncGroupIdx; |
560 | |
561 | CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1; |
562 | } |
563 | } |
564 | |
565 | bool PipelineSolver::checkOptimal() { |
566 | if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) { |
567 | if (BestCost == -1 || CurrCost < BestCost) { |
568 | BestPipeline = CurrPipeline; |
569 | BestCost = CurrCost; |
570 | LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n" ); |
571 | } |
572 | assert(BestCost >= 0); |
573 | } |
574 | |
575 | bool DoneExploring = false; |
576 | if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored) |
577 | DoneExploring = true; |
578 | |
579 | return (DoneExploring || BestCost == 0); |
580 | } |
581 | |
582 | template <typename T> |
583 | void PipelineSolver::populateReadyList( |
584 | SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) { |
585 | SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; |
586 | auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; |
587 | assert(CurrSU.second.size() >= 1); |
588 | |
589 | for (; I != E; ++I) { |
590 | std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; |
591 | int CandSGID = *I; |
592 | SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) { |
593 | return SG.getSGID() == CandSGID; |
594 | }); |
595 | assert(Match); |
596 | |
597 | if (UseCostHeur) { |
598 | if (Match->isFull()) { |
599 | ReadyList.push_back(std::pair(*I, MissPenalty)); |
600 | continue; |
601 | } |
602 | |
603 | int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); |
604 | ReadyList.push_back(std::pair(*I, TempCost)); |
605 | removeEdges(AddedEdges); |
606 | } else |
607 | ReadyList.push_back(std::pair(*I, -1)); |
608 | } |
609 | |
610 | if (UseCostHeur) { |
611 | std::sort(ReadyList.begin(), ReadyList.end(), |
612 | [](std::pair<int, int> A, std::pair<int, int> B) { |
613 | return A.second < B.second; |
614 | }); |
615 | } |
616 | |
617 | assert(ReadyList.size() == CurrSU.second.size()); |
618 | } |
619 | |
620 | bool PipelineSolver::solveExact() { |
621 | if (checkOptimal()) |
622 | return true; |
623 | |
624 | if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) |
625 | return false; |
626 | |
627 | assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()); |
628 | assert(static_cast<size_t>(CurrConflInstNo) < |
629 | PipelineInstrs[CurrSyncGroupIdx].size()); |
630 | SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; |
631 | LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum |
632 | << ") in Pipeline # " << CurrSyncGroupIdx << "\n" ); |
633 | |
634 | // SchedGroup -> Cost pairs |
635 | SmallVector<std::pair<int, int>, 4> ReadyList; |
636 | // Prioritize the candidate sched groups in terms of lowest cost first |
637 | IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(), |
638 | CurrSU.second.rend()) |
639 | : populateReadyList(ReadyList, CurrSU.second.begin(), |
640 | CurrSU.second.end()); |
641 | |
642 | auto I = ReadyList.begin(); |
643 | auto E = ReadyList.end(); |
644 | for (; I != E; ++I) { |
645 | // If we are trying SGs in least cost order, and the current SG is cost |
646 | // infeasible, then all subsequent SGs will also be cost infeasible, so we |
647 | // can prune. |
648 | if (BestCost != -1 && (CurrCost + I->second > BestCost)) |
649 | return false; |
650 | |
651 | int CandSGID = I->first; |
652 | int AddedCost = 0; |
653 | std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; |
654 | auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; |
655 | SchedGroup *Match; |
656 | for (auto &SG : SyncPipeline) { |
657 | if (SG.getSGID() == CandSGID) |
658 | Match = &SG; |
659 | } |
660 | |
661 | if (Match->isFull()) |
662 | continue; |
663 | |
664 | if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) |
665 | continue; |
666 | |
667 | LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask " |
668 | << (int)Match->getMask() << "and ID " << CandSGID |
669 | << "\n" ); |
670 | Match->add(*CurrSU.first); |
671 | AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); |
672 | LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n" ); |
673 | CurrCost += AddedCost; |
674 | advancePosition(); |
675 | ++BranchesExplored; |
676 | bool FinishedExploring = false; |
677 | // If the Cost after adding edges is greater than a known solution, |
678 | // backtrack |
679 | if (CurrCost < BestCost || BestCost == -1) { |
680 | if (solveExact()) { |
681 | FinishedExploring = BestCost != 0; |
682 | if (!FinishedExploring) |
683 | return true; |
684 | } |
685 | } |
686 | |
687 | retreatPosition(); |
688 | CurrCost -= AddedCost; |
689 | removeEdges(AddedEdges); |
690 | Match->pop(); |
691 | CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; |
692 | if (FinishedExploring) |
693 | return true; |
694 | } |
695 | |
696 | // Try the pipeline where the current instruction is omitted |
697 | // Potentially if we omit a problematic instruction from the pipeline, |
698 | // all the other instructions can nicely fit. |
699 | CurrCost += MissPenalty; |
700 | advancePosition(); |
701 | |
702 | LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n" ); |
703 | |
704 | bool FinishedExploring = false; |
705 | if (CurrCost < BestCost || BestCost == -1) { |
706 | if (solveExact()) { |
707 | bool FinishedExploring = BestCost != 0; |
708 | if (!FinishedExploring) |
709 | return true; |
710 | } |
711 | } |
712 | |
713 | retreatPosition(); |
714 | CurrCost -= MissPenalty; |
715 | return FinishedExploring; |
716 | } |
717 | |
718 | template <typename T> |
719 | void PipelineSolver::greedyFind( |
720 | std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) { |
721 | SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; |
722 | int BestNodeCost = -1; |
723 | int TempCost; |
724 | SchedGroup *BestGroup = nullptr; |
725 | int BestGroupID = -1; |
726 | auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; |
727 | LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum |
728 | << ") in Pipeline # " << CurrSyncGroupIdx << "\n" ); |
729 | |
730 | // Since we have added the potential SchedGroups from bottom up, but |
731 | // traversed the DAG from top down, parse over the groups from last to |
732 | // first. If we fail to do this for the greedy algorithm, the solution will |
733 | // likely not be good in more complex cases. |
734 | for (; I != E; ++I) { |
735 | std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; |
736 | int CandSGID = *I; |
737 | SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) { |
738 | return SG.getSGID() == CandSGID; |
739 | }); |
740 | assert(Match); |
741 | |
742 | LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask " |
743 | << (int)Match->getMask() << "\n" ); |
744 | |
745 | if (Match->isFull()) { |
746 | LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n" ); |
747 | continue; |
748 | } |
749 | if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) { |
750 | LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n" ); |
751 | continue; |
752 | } |
753 | TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); |
754 | LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n" ); |
755 | if (TempCost < BestNodeCost || BestNodeCost == -1) { |
756 | BestGroup = Match; |
757 | BestNodeCost = TempCost; |
758 | BestGroupID = CandSGID; |
759 | } |
760 | removeEdges(AddedEdges); |
761 | if (BestNodeCost == 0) |
762 | break; |
763 | } |
764 | |
765 | if (BestGroupID != -1) { |
766 | BestGroup->add(*CurrSU.first); |
767 | addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges); |
768 | LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask" |
769 | << (int)BestGroup->getMask() << "\n" ); |
770 | BestCost += TempCost; |
771 | } else |
772 | BestCost += MissPenalty; |
773 | |
774 | CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; |
775 | } |
776 | |
777 | bool PipelineSolver::solveGreedy() { |
778 | BestCost = 0; |
779 | std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; |
780 | |
781 | while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) { |
782 | SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; |
783 | IsBottomUp |
784 | ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend()) |
785 | : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end()); |
786 | advancePosition(); |
787 | } |
788 | BestPipeline = CurrPipeline; |
789 | removeEdges(AddedEdges); |
790 | return false; |
791 | } |
792 | |
793 | unsigned PipelineSolver::computeProblemSize() { |
794 | unsigned ProblemSize = 0; |
795 | for (auto &PipeConflicts : PipelineInstrs) { |
796 | ProblemSize += PipeConflicts.size(); |
797 | } |
798 | |
799 | return ProblemSize; |
800 | } |
801 | |
802 | void PipelineSolver::solve() { |
803 | if (!NeedsSolver) |
804 | return; |
805 | |
806 | unsigned ProblemSize = computeProblemSize(); |
807 | assert(ProblemSize > 0); |
808 | |
809 | bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact; |
810 | MissPenalty = (ProblemSize / 2) + 1; |
811 | |
812 | LLVM_DEBUG(DAG->dump()); |
813 | if (EnableExactSolver || BelowCutoff) { |
814 | LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n" ); |
815 | solveGreedy(); |
816 | reset(); |
817 | LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n" ); |
818 | if (BestCost > 0) { |
819 | LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n" ); |
820 | solveExact(); |
821 | LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n" ); |
822 | } |
823 | } else { // Use the Greedy Algorithm by default |
824 | LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n" ); |
825 | solveGreedy(); |
826 | } |
827 | |
828 | makePipeline(); |
829 | LLVM_DEBUG(dbgs() << "After applying mutation\n" ); |
830 | LLVM_DEBUG(DAG->dump()); |
831 | } |
832 | |
833 | enum IGLPStrategyID : int { |
834 | MFMASmallGemmOptID = 0, |
835 | MFMASmallGemmSingleWaveOptID = 1, |
836 | MFMAExpInterleave = 2 |
837 | }; |
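// The strategy ID is the immediate operand of the IGLP_OPT pseudo instruction
// (i.e. the value passed to the llvm.amdgcn.iglp.opt intrinsic) and selects
// which of the IGLPStrategy subclasses below is applied.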
838 | |
839 | // Implements an IGLP scheduling strategy. |
840 | class IGLPStrategy { |
841 | protected: |
842 | ScheduleDAGInstrs *DAG; |
843 | |
844 | const SIInstrInfo *TII; |
845 | |
846 | public: |
847 | /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy. |
848 | virtual bool applyIGLPStrategy( |
849 | DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
850 | DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
851 | AMDGPU::SchedulingPhase Phase) = 0; |
852 | |
853 | // Returns true if this strategy should be applied to a ScheduleDAG. |
854 | virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, |
855 | AMDGPU::SchedulingPhase Phase) = 0; |
856 | |
857 | bool IsBottomUp = true; |
858 | |
859 | IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) |
860 | : DAG(DAG), TII(TII) {} |
861 | |
862 | virtual ~IGLPStrategy() = default; |
863 | }; |
864 | |
865 | class MFMASmallGemmOpt final : public IGLPStrategy { |
866 | private: |
867 | public: |
868 | bool applyIGLPStrategy( |
869 | DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
870 | DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
871 | AMDGPU::SchedulingPhase Phase) override; |
872 | |
873 | bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, |
874 | AMDGPU::SchedulingPhase Phase) override { |
875 | return true; |
876 | } |
877 | |
878 | MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) |
879 | : IGLPStrategy(DAG, TII) { |
880 | IsBottomUp = true; |
881 | } |
882 | }; |
883 | |
884 | bool MFMASmallGemmOpt::applyIGLPStrategy( |
885 | DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
886 | DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
887 | AMDGPU::SchedulingPhase Phase) { |
888 | // Count the number of MFMA instructions. |
889 | unsigned MFMACount = 0; |
890 | for (const MachineInstr &I : *DAG) |
891 | if (TII->isMFMAorWMMA(I)) |
892 | ++MFMACount; |
893 | |
894 | const unsigned PipelineSyncID = 0; |
895 | SchedGroup *SG = nullptr; |
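// Request the interleaved pipeline {2 x DS, 1 x MFMA} repeated MFMACount * 3
// times on a single sync ID, so that each MFMA is preceded by a pair of LDS
// operations.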
896 | for (unsigned I = 0; I < MFMACount * 3; ++I) { |
897 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
898 | SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII); |
899 | SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
900 | |
901 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
902 | SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
903 | SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
904 | } |
905 | |
906 | return true; |
907 | } |
908 | |
909 | class MFMAExpInterleaveOpt final : public IGLPStrategy { |
910 | private: |
911 | // The count of TRANS SUs involved in the interleaved pipeline |
912 | static unsigned TransPipeCount; |
913 | // The count of MFMA SUs involved in the interleaved pipeline |
914 | static unsigned MFMAPipeCount; |
915 | // The count of Add SUs involved in the interleaved pipeline |
916 | static unsigned AddPipeCount; |
917 | // The number of transitive MFMA successors for each TRANS SU |
918 | static unsigned MFMAEnablement; |
919 | // The number of transitive TRANS predecessors for each MFMA SU |
920 | static unsigned ExpRequirement; |
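// A hypothetical example (not taken from a real kernel): if each TRANS op
// feeds two bit-pack ops and each bit-pack feeds two MFMAs, MFMAEnablement is
// 4; conversely, if each MFMA consumes two bit-packs and each bit-pack
// depends on two TRANS ops, ExpRequirement is 4. See analyzeDAG().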
921 | // The count of independent "chains" of MFMA instructions in the pipeline |
922 | static unsigned MFMAChains; |
923 | // The length of each independent "chain" of MFMA instructions |
924 | static unsigned MFMAChainLength; |
925 | // Whether or not the pipeline has V_CVT instructions |
926 | static bool HasCvt; |
927 | // Whether or not there are instructions between the TRANS instruction and |
928 | // V_CVT |
929 | static bool HasChainBetweenCvt; |
930 | // The first occurring DS_READ which feeds an MFMA chain |
931 | static std::optional<unsigned> FirstPipeDSR; |
932 | // The MFMAPipe SUs with no MFMA predecessors |
933 | SmallVector<SUnit *, 4> MFMAChainSeeds; |
934 | // Compute the heuristics for the pipeline, returning whether or not the DAG |
935 | // is well formatted for the mutation |
936 | bool analyzeDAG(const SIInstrInfo *TII); |
937 | |
938 | /// Whether or not the instruction is a transitive predecessor of an MFMA |
939 | /// instruction |
940 | class IsPipeExp final : public InstructionRule { |
941 | public: |
942 | bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
943 | SmallVectorImpl<SchedGroup> &SyncPipe) override { |
944 | |
945 | auto DAG = SyncPipe[0].DAG; |
946 | |
947 | if (Cache->empty()) { |
948 | auto I = DAG->SUnits.rbegin(); |
949 | auto E = DAG->SUnits.rend(); |
950 | for (; I != E; I++) { |
951 | if (TII->isMFMAorWMMA(*I->getInstr())) |
952 | Cache->push_back(&*I); |
953 | } |
954 | if (Cache->empty()) |
955 | return false; |
956 | } |
957 | |
958 | auto Reaches = (std::any_of( |
959 | Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *TargetSU) { |
960 | return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU)); |
961 | })); |
962 | |
963 | return Reaches; |
964 | } |
965 | IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) |
966 | : InstructionRule(TII, SGID, NeedsCache) {} |
967 | }; |
968 | |
969 | /// Whether or not the instruction is a transitive predecessor of the |
970 | /// \p Number th MFMA of the MFMAs occurring after a TRANS instruction |
971 | class EnablesNthMFMA final : public InstructionRule { |
972 | private: |
973 | unsigned Number = 1; |
974 | |
975 | public: |
976 | bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
977 | SmallVectorImpl<SchedGroup> &SyncPipe) override { |
978 | bool FoundTrans = false; |
979 | unsigned Counter = 1; |
980 | auto DAG = SyncPipe[0].DAG; |
981 | |
982 | if (Cache->empty()) { |
983 | SmallVector<SUnit *, 8> Worklist; |
984 | |
985 | auto I = DAG->SUnits.begin(); |
986 | auto E = DAG->SUnits.end(); |
987 | for (; I != E; I++) { |
988 | if (FoundTrans && TII->isMFMAorWMMA(*I->getInstr())) { |
989 | if (Counter == Number) { |
990 | Cache->push_back(&*I); |
991 | break; |
992 | } |
993 | ++Counter; |
994 | } |
995 | if (!FoundTrans && TII->isTRANS(I->getInstr()->getOpcode())) |
996 | FoundTrans = true; |
997 | } |
998 | if (Cache->empty()) |
999 | return false; |
1000 | } |
1001 | |
1002 | return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU)); |
1003 | } |
1004 | |
1005 | EnablesNthMFMA(unsigned Number, const SIInstrInfo *TII, unsigned SGID, |
1006 | bool NeedsCache = false) |
1007 | : InstructionRule(TII, SGID, NeedsCache), Number(Number) {} |
1008 | }; |
1009 | |
1010 | /// Whether or not the instruction enables the exact MFMA that is the \p |
1011 | /// Number th MFMA in the chain starting with \p ChainSeed |
1012 | class EnablesNthMFMAInChain final : public InstructionRule { |
1013 | private: |
1014 | unsigned Number = 1; |
1015 | SUnit *ChainSeed; |
1016 | |
1017 | public: |
1018 | bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
1019 | SmallVectorImpl<SchedGroup> &SyncPipe) override { |
1020 | auto DAG = SyncPipe[0].DAG; |
1021 | |
1022 | if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr())) |
1023 | return false; |
1024 | |
1025 | if (Cache->empty()) { |
1026 | auto TempSU = ChainSeed; |
1027 | auto Depth = Number; |
1028 | while (Depth > 0) { |
1029 | --Depth; |
1030 | bool Found = false; |
1031 | for (auto &Succ : TempSU->Succs) { |
1032 | if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) { |
1033 | TempSU = Succ.getSUnit(); |
1034 | Found = true; |
1035 | break; |
1036 | } |
1037 | } |
1038 | if (!Found) |
1039 | return false; |
1040 | } |
1041 | |
1042 | Cache->push_back(TempSU); |
1043 | } |
1044 | // If we failed to find the instruction to be placed into the cache, we |
1045 | // would have already exited. |
1046 | assert(!Cache->empty()); |
1047 | |
1048 | return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU)); |
1049 | } |
1050 | |
1051 | EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed, |
1052 | const SIInstrInfo *TII, unsigned SGID, |
1053 | bool NeedsCache = false) |
1054 | : InstructionRule(TII, SGID, NeedsCache), Number(Number), |
1055 | ChainSeed(ChainSeed) {} |
1056 | }; |
1057 | |
1058 | /// Whether or not the instruction has fewer than \p Size immediate successors. |
1059 | /// If \p HasIntermediary is true, this also tests whether all successors of |
1060 | /// the SUnit have fewer than \p Size successors. |
1061 | class LessThanNSuccs final : public InstructionRule { |
1062 | private: |
1063 | unsigned Size = 1; |
1064 | bool HasIntermediary = false; |
1065 | |
1066 | public: |
1067 | bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
1068 | SmallVectorImpl<SchedGroup> &SyncPipe) override { |
1069 | if (!SyncPipe.size()) |
1070 | return false; |
1071 | |
1072 | auto SuccSize = std::count_if( |
1073 | SU->Succs.begin(), SU->Succs.end(), |
1074 | [](const SDep &Succ) { return Succ.getKind() == SDep::Data; }); |
1075 | if (SuccSize >= Size) |
1076 | return false; |
1077 | |
1078 | if (HasIntermediary) { |
1079 | for (auto Succ : SU->Succs) { |
1080 | auto SuccSize = std::count_if( |
1081 | Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(), |
1082 | [](const SDep &SuccSucc) { |
1083 | return SuccSucc.getKind() == SDep::Data; |
1084 | }); |
1085 | if (SuccSize >= Size) |
1086 | return false; |
1087 | } |
1088 | } |
1089 | |
1090 | return true; |
1091 | } |
1092 | LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID, |
1093 | bool HasIntermediary = false, bool NeedsCache = false) |
1094 | : InstructionRule(TII, SGID, NeedsCache), Size(Size), |
1095 | HasIntermediary(HasIntermediary) {} |
1096 | }; |
1097 | |
1098 | /// Whether or not the instruction has greater than or equal to \p Size |
1099 | /// immediate successors. If \p HasIntermediary is true, this also tests |
1100 | /// whether all successors of the SUnit have greater than or equal to \p Size |
1101 | /// successors. |
1102 | class GreaterThanOrEqualToNSuccs final : public InstructionRule { |
1103 | private: |
1104 | unsigned Size = 1; |
1105 | bool HasIntermediary = false; |
1106 | |
1107 | public: |
1108 | bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
1109 | SmallVectorImpl<SchedGroup> &SyncPipe) override { |
1110 | if (!SyncPipe.size()) |
1111 | return false; |
1112 | |
1113 | auto SuccSize = std::count_if( |
1114 | SU->Succs.begin(), SU->Succs.end(), |
1115 | [](const SDep &Succ) { return Succ.getKind() == SDep::Data; }); |
1116 | if (SuccSize >= Size) |
1117 | return true; |
1118 | |
1119 | if (HasIntermediary) { |
1120 | for (auto Succ : SU->Succs) { |
1121 | auto SuccSize = std::count_if( |
1122 | Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(), |
1123 | [](const SDep &SuccSucc) { |
1124 | return SuccSucc.getKind() == SDep::Data; |
1125 | }); |
1126 | if (SuccSize >= Size) |
1127 | return true; |
1128 | } |
1129 | } |
1130 | |
1131 | return false; |
1132 | } |
1133 | GreaterThanOrEqualToNSuccs(unsigned Size, const SIInstrInfo *TII, |
1134 | unsigned SGID, bool HasIntermediary = false, |
1135 | bool NeedsCache = false) |
1136 | : InstructionRule(TII, SGID, NeedsCache), Size(Size), |
1137 | HasIntermediary(HasIntermediary) {} |
1138 | }; |
1139 | |
1140 | // Whether or not the instruction is a relevant V_CVT instruction. |
1141 | class IsCvt final : public InstructionRule { |
1142 | public: |
1143 | bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
1144 | SmallVectorImpl<SchedGroup> &SyncPipe) override { |
1145 | auto Opc = SU->getInstr()->getOpcode(); |
1146 | return Opc == AMDGPU::V_CVT_F16_F32_e32 || |
1147 | Opc == AMDGPU::V_CVT_I32_F32_e32; |
1148 | } |
1149 | IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) |
1150 | : InstructionRule(TII, SGID, NeedsCache) {} |
1151 | }; |
1152 | |
1153 | // Whether or not the instruction is FMA_F32. |
1154 | class IsFMA final : public InstructionRule { |
1155 | public: |
1156 | bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
1157 | SmallVectorImpl<SchedGroup> &SyncPipe) override { |
1158 | return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64 || |
1159 | SU->getInstr()->getOpcode() == AMDGPU::V_PK_FMA_F32; |
1160 | } |
1161 | IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) |
1162 | : InstructionRule(TII, SGID, NeedsCache) {} |
1163 | }; |
1164 | |
1165 | // Whether or not the instruction is a V_ADD_F32 instruction. |
1166 | class IsPipeAdd final : public InstructionRule { |
1167 | public: |
1168 | bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
1169 | SmallVectorImpl<SchedGroup> &SyncPipe) override { |
1170 | return SU->getInstr()->getOpcode() == AMDGPU::V_ADD_F32_e32; |
1171 | } |
1172 | IsPipeAdd(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) |
1173 | : InstructionRule(TII, SGID, NeedsCache) {} |
1174 | }; |
1175 | |
1176 | /// Whether or not the instruction is an immediate RAW successor |
1177 | /// of the SchedGroup \p Distance steps before. |
1178 | class IsSuccOfPrevNthGroup final : public InstructionRule { |
1179 | private: |
1180 | unsigned Distance = 1; |
1181 | |
1182 | public: |
1183 | bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
1184 | SmallVectorImpl<SchedGroup> &SyncPipe) override { |
1185 | SchedGroup *OtherGroup = nullptr; |
1186 | if (!SyncPipe.size()) |
1187 | return false; |
1188 | |
1189 | for (auto &PipeSG : SyncPipe) { |
1190 | if ((unsigned)PipeSG.getSGID() == SGID - Distance) |
1191 | OtherGroup = &PipeSG; |
1192 | } |
1193 | |
1194 | if (!OtherGroup) |
1195 | return false; |
1196 | if (!OtherGroup->Collection.size()) |
1197 | return true; |
1198 | |
1199 | for (auto &OtherEle : OtherGroup->Collection) { |
1200 | for (auto &Succ : OtherEle->Succs) { |
1201 | if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data) |
1202 | return true; |
1203 | } |
1204 | } |
1205 | |
1206 | return false; |
1207 | } |
1208 | IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, |
1209 | unsigned SGID, bool NeedsCache = false) |
1210 | : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} |
1211 | }; |
1212 | |
1213 | /// Whether or not the instruction is a transitive successor of any |
1214 | /// instruction in the SchedGroup \p Distance steps before. |
1215 | class IsReachableFromPrevNthGroup final : public InstructionRule { |
1216 | private: |
1217 | unsigned Distance = 1; |
1218 | |
1219 | public: |
1220 | bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
1221 | SmallVectorImpl<SchedGroup> &SyncPipe) override { |
1222 | SchedGroup *OtherGroup = nullptr; |
1223 | if (!SyncPipe.size()) |
1224 | return false; |
1225 | |
1226 | for (auto &PipeSG : SyncPipe) { |
1227 | if ((unsigned)PipeSG.getSGID() == SGID - Distance) |
1228 | OtherGroup = &PipeSG; |
1229 | } |
1230 | |
1231 | if (!OtherGroup) |
1232 | return false; |
1233 | if (!OtherGroup->Collection.size()) |
1234 | return true; |
1235 | |
1236 | auto DAG = SyncPipe[0].DAG; |
1237 | |
1238 | for (auto &OtherEle : OtherGroup->Collection) |
1239 | if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle)) |
1240 | return true; |
1241 | |
1242 | return false; |
1243 | } |
1244 | IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, |
1245 | unsigned SGID, bool NeedsCache = false) |
1246 | : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} |
1247 | }; |
1248 | |
1249 | /// Whether or not the instruction occurs at or after the SU with NodeNum \p Number. |
1250 | class OccursAtOrAfterNode final : public InstructionRule { |
1251 | private: |
1252 | unsigned Number = 1; |
1253 | |
1254 | public: |
1255 | bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
1256 | SmallVectorImpl<SchedGroup> &SyncPipe) override { |
1257 | |
1258 | return SU->NodeNum >= Number; |
1259 | } |
1260 | OccursAtOrAfterNode(unsigned Number, const SIInstrInfo *TII, unsigned SGID, |
1261 | bool NeedsCache = false) |
1262 | : InstructionRule(TII, SGID, NeedsCache), Number(Number) {} |
1263 | }; |
1264 | |
1265 | /// Whether or not the SU is exactly the \p Number th MFMA in the chain |
1266 | /// starting with \p ChainSeed |
1267 | class IsExactMFMA final : public InstructionRule { |
1268 | private: |
1269 | unsigned Number = 1; |
1270 | SUnit *ChainSeed; |
1271 | |
1272 | public: |
1273 | bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
1274 | SmallVectorImpl<SchedGroup> &SyncPipe) override { |
1275 | if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr())) |
1276 | return false; |
1277 | |
1278 | if (Cache->empty()) { |
1279 | auto TempSU = ChainSeed; |
1280 | auto Depth = Number; |
1281 | while (Depth > 0) { |
1282 | --Depth; |
1283 | bool Found = false; |
1284 | for (auto &Succ : TempSU->Succs) { |
1285 | if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) { |
1286 | TempSU = Succ.getSUnit(); |
1287 | Found = true; |
1288 | break; |
1289 | } |
1290 | } |
1291 | if (!Found) { |
1292 | return false; |
1293 | } |
1294 | } |
1295 | Cache->push_back(TempSU); |
1296 | } |
1297 | // If we failed to find the instruction to be placed into the cache, we |
1298 | // would have already exited. |
1299 | assert(!Cache->empty()); |
1300 | |
1301 | return (*Cache)[0] == SU; |
1302 | } |
1303 | |
1304 | IsExactMFMA(unsigned Number, SUnit *ChainSeed, const SIInstrInfo *TII, |
1305 | unsigned SGID, bool NeedsCache = false) |
1306 | : InstructionRule(TII, SGID, NeedsCache), Number(Number), |
1307 | ChainSeed(ChainSeed) {} |
1308 | }; |
1309 | |
1310 | // Whether the instruction occurs after the first TRANS instruction. This |
1311 | // implies the instruction cannot be a predecessor of the first TRANS |
1312 | // instruction. |
1313 | class OccursAfterExp final : public InstructionRule { |
1314 | public: |
1315 | bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
1316 | SmallVectorImpl<SchedGroup> &SyncPipe) override { |
1317 | |
1318 | SmallVector<SUnit *, 12> Worklist; |
1319 | auto DAG = SyncPipe[0].DAG; |
1320 | if (Cache->empty()) { |
1321 | for (auto &SU : DAG->SUnits) |
1322 | if (TII->isTRANS(SU.getInstr()->getOpcode())) { |
1323 | Cache->push_back(&SU); |
1324 | break; |
1325 | } |
1326 | if (Cache->empty()) |
1327 | return false; |
1328 | } |
1329 | |
1330 | return SU->NodeNum > (*Cache)[0]->NodeNum; |
1331 | } |
1332 | |
1333 | OccursAfterExp(const SIInstrInfo *TII, unsigned SGID, |
1334 | bool NeedsCache = false) |
1335 | : InstructionRule(TII, SGID, NeedsCache) {} |
1336 | }; |
1337 | |
1338 | public: |
1339 | bool applyIGLPStrategy( |
1340 | DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
1341 | DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
1342 | AMDGPU::SchedulingPhase Phase) override; |
1343 | |
1344 | bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, |
1345 | AMDGPU::SchedulingPhase Phase) override; |
1346 | |
1347 | MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) |
1348 | : IGLPStrategy(DAG, TII) { |
1349 | IsBottomUp = false; |
1350 | } |
1351 | }; |
1352 | |
1353 | unsigned MFMAExpInterleaveOpt::TransPipeCount = 0; |
1354 | unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0; |
1355 | unsigned MFMAExpInterleaveOpt::AddPipeCount = 0; |
1356 | unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0; |
1357 | unsigned MFMAExpInterleaveOpt::ExpRequirement = 0; |
1358 | unsigned MFMAExpInterleaveOpt::MFMAChains = 0; |
1359 | unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0; |
1360 | bool MFMAExpInterleaveOpt::HasCvt = false; |
1361 | bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false; |
1362 | std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt; |
1363 | |
1364 | bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) { |
1365 | SmallVector<SUnit *, 10> ExpPipeCands; |
1366 | SmallVector<SUnit *, 10> MFMAPipeCands; |
1367 | SmallVector<SUnit *, 10> MFMAPipeSUs; |
1368 | SmallVector<SUnit *, 10> PackSUs; |
1369 | SmallVector<SUnit *, 10> CvtSUs; |
1370 | |
1371 | auto isBitPack = [](unsigned Opc) { |
1372 | return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64; |
1373 | }; |
1374 | |
1375 | auto isCvt = [](unsigned Opc) { |
1376 | return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32; |
1377 | }; |
1378 | |
1379 | auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; }; |
1380 | |
1381 | AddPipeCount = 0; |
1382 | for (SUnit &SU : DAG->SUnits) { |
1383 | auto Opc = SU.getInstr()->getOpcode(); |
1384 | if (TII->isTRANS(Opc)) { |
1385 | // Avoid counting a potential bonus V_EXP which all the MFMAs depend on |
1386 | if (SU.Succs.size() >= 7) |
1387 | continue; |
1388 | for (auto &Succ : SU.Succs) { |
1389 | if (Succ.getSUnit()->Succs.size() >= 7) |
1390 | continue; |
1391 | } |
1392 | ExpPipeCands.push_back(&SU); |
1393 | } |
1394 | |
1395 | if (TII->isMFMAorWMMA(*SU.getInstr())) |
1396 | MFMAPipeCands.push_back(&SU); |
1397 | |
1398 | if (isBitPack(Opc)) |
1399 | PackSUs.push_back(&SU); |
1400 | |
1401 | if (isCvt(Opc)) |
1402 | CvtSUs.push_back(&SU); |
1403 | |
1404 | if (isAdd(Opc)) |
1405 | ++AddPipeCount; |
1406 | } |
1407 | |
1408 | if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size())) |
1409 | return false; |
1410 | |
1411 | TransPipeCount = 0; |
1412 | |
1413 | std::optional<SUnit *> TempMFMA; |
1414 | std::optional<SUnit *> TempExp; |
1415 | // Count the number of EXPs that reach an MFMA |
1416 | for (auto &PredSU : ExpPipeCands) { |
1417 | for (auto &SuccSU : MFMAPipeCands) { |
1418 | if (DAG->IsReachable(SuccSU, PredSU)) { |
1419 | if (!TempExp) { |
1420 | TempExp = PredSU; |
1421 | TempMFMA = SuccSU; |
1422 | } |
1423 | MFMAPipeSUs.push_back(SuccSU); |
1424 | ++TransPipeCount; |
1425 | break; |
1426 | } |
1427 | } |
1428 | } |
1429 | |
1430 | if (!(TempExp && TempMFMA)) |
1431 | return false; |
1432 | |
1433 | HasChainBetweenCvt = |
1434 | std::find_if((*TempExp)->Succs.begin(), (*TempExp)->Succs.end(), |
1435 | [&isCvt](SDep &Succ) { |
1436 | return isCvt(Succ.getSUnit()->getInstr()->getOpcode()); |
1437 | }) == (*TempExp)->Succs.end(); |
1438 | |
1439 | // Count the number of MFMAs that are reached by an EXP |
1440 | for (auto &SuccSU : MFMAPipeCands) { |
1441 | if (MFMAPipeSUs.size() && |
1442 | std::find_if(MFMAPipeSUs.begin(), MFMAPipeSUs.end(), |
1443 | [&SuccSU](SUnit *PotentialMatch) { |
1444 | return PotentialMatch->NodeNum == SuccSU->NodeNum; |
1445 | }) != MFMAPipeSUs.end()) |
1446 | continue; |
1447 | |
1448 | for (auto &PredSU : ExpPipeCands) { |
1449 | if (DAG->IsReachable(SuccSU, PredSU)) { |
1450 | MFMAPipeSUs.push_back(SuccSU); |
1451 | break; |
1452 | } |
1453 | } |
1454 | } |
1455 | |
1456 | MFMAPipeCount = MFMAPipeSUs.size(); |
1457 | |
1458 | assert(TempExp && TempMFMA); |
1459 | assert(MFMAPipeCount > 0); |
1460 | |
1461 | std::optional<SUnit *> TempCvt; |
1462 | for (auto &SuccSU : CvtSUs) { |
1463 | if (DAG->IsReachable(SuccSU, *TempExp)) { |
1464 | TempCvt = SuccSU; |
1465 | break; |
1466 | } |
1467 | } |
1468 | |
1469 | HasCvt = false; |
1470 | if (TempCvt.has_value()) { |
1471 | for (auto &SuccSU : MFMAPipeSUs) { |
1472 | if (DAG->IsReachable(SU: SuccSU, TargetSU: *TempCvt)) { |
1473 | HasCvt = true; |
1474 | break; |
1475 | } |
1476 | } |
1477 | } |
1478 | |
1479 | MFMAChains = 0; |
1480 | for (auto &MFMAPipeSU : MFMAPipeSUs) { |
1481 | if (is_contained(Range&: MFMAChainSeeds, Element: MFMAPipeSU)) |
1482 | continue; |
1483 | if (!std::any_of(first: MFMAPipeSU->Preds.begin(), last: MFMAPipeSU->Preds.end(), |
1484 | pred: [&TII](SDep &Succ) { |
1485 | return TII->isMFMAorWMMA(MI: *Succ.getSUnit()->getInstr()); |
1486 | })) { |
1487 | MFMAChainSeeds.push_back(Elt: MFMAPipeSU); |
1488 | ++MFMAChains; |
1489 | } |
1490 | } |
1491 | |
1492 | if (!MFMAChains) |
1493 | return false; |
1494 | |
1495 | for (auto Pred : MFMAChainSeeds[0]->Preds) { |
1496 | if (TII->isDS(Opcode: Pred.getSUnit()->getInstr()->getOpcode()) && |
1497 | Pred.getSUnit()->getInstr()->mayLoad()) |
1498 | FirstPipeDSR = Pred.getSUnit()->NodeNum; |
1499 | } |
1500 | |
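// For example (hypothetical counts), 32 EXP-fed MFMAs split across 4
// independent chains gives a chain length of 8.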
1501 | MFMAChainLength = MFMAPipeCount / MFMAChains; |
1502 | |
1503 | // The number of bit pack operations that depend on a single V_EXP |
1504 | unsigned PackSuccCount = std::count_if( |
1505 | first: PackSUs.begin(), last: PackSUs.end(), pred: [this, &TempExp](SUnit *VPack) { |
1506 | return DAG->IsReachable(SU: VPack, TargetSU: *TempExp); |
1507 | }); |
1508 | |
1509 | // The number of bit pack operations an MFMA depends on |
1510 | unsigned PackPredCount = |
1511 | std::count_if(first: (*TempMFMA)->Preds.begin(), last: (*TempMFMA)->Preds.end(), |
1512 | pred: [&isBitPack](SDep &Pred) { |
1513 | auto Opc = Pred.getSUnit()->getInstr()->getOpcode(); |
1514 | return isBitPack(Opc); |
1515 | }); |
1516 | |
1517 | auto PackPred = |
1518 | std::find_if(first: (*TempMFMA)->Preds.begin(), last: (*TempMFMA)->Preds.end(), |
1519 | pred: [&isBitPack](SDep &Pred) { |
1520 | auto Opc = Pred.getSUnit()->getInstr()->getOpcode(); |
1521 | return isBitPack(Opc); |
1522 | }); |
1523 | |
1524 | if (PackPred == (*TempMFMA)->Preds.end()) |
1525 | return false; |
1526 | |
1527 | MFMAEnablement = 0; |
1528 | ExpRequirement = 0; |
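// Worked example with hypothetical counts: if the representative bit-pack
// feeds 2 MFMAs and the representative EXP feeds 2 bit-packs, MFMAEnablement
// becomes 2 * 2 = 4; if the representative MFMA depends on 2 bit-packs and 2
// EXPs reach that bit-pack, ExpRequirement becomes 2 * 2 = 4.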
1529 | // How many MFMAs depend on a single bit pack operation |
1530 | MFMAEnablement = |
1531 | std::count_if(first: PackPred->getSUnit()->Succs.begin(), |
1532 | last: PackPred->getSUnit()->Succs.end(), pred: [&TII](SDep &Succ) { |
1533 | return TII->isMFMAorWMMA(MI: *Succ.getSUnit()->getInstr()); |
1534 | }); |
1535 | |
1536 | // The number of MFMAs that depend on a single V_EXP |
1537 | MFMAEnablement *= PackSuccCount; |
1538 | |
1539 | // The number of V_EXPs required to resolve all dependencies for an MFMA |
1540 | ExpRequirement = |
1541 | std::count_if(first: ExpPipeCands.begin(), last: ExpPipeCands.end(), |
1542 | pred: [this, &PackPred](SUnit *ExpBase) { |
1543 | return DAG->IsReachable(SU: PackPred->getSUnit(), TargetSU: ExpBase); |
1544 | }); |
1545 | |
1546 | ExpRequirement *= PackPredCount; |
1547 | return true; |
1548 | } |
1549 | |
1550 | bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG, |
1551 | AMDGPU::SchedulingPhase Phase) { |
1552 | const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>(); |
1553 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1554 | |
1555 | if (Phase != AMDGPU::SchedulingPhase::PostRA) |
1556 | MFMAChainSeeds.clear(); |
1557 | if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII)) |
1558 | return false; |
1559 | |
1560 | return true; |
1561 | } |
1562 | |
1563 | bool MFMAExpInterleaveOpt::applyIGLPStrategy( |
1564 | DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
1565 | DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
1566 | AMDGPU::SchedulingPhase Phase) { |
1567 | |
1568 | bool IsSmallKernelType = |
1569 | MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32; |
1570 | bool IsLargeKernelType = |
1571 | MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64; |
1572 | |
1573 | if (!(IsSmallKernelType || IsLargeKernelType)) |
1574 | return false; |
1575 | |
1576 | const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>(); |
1577 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1578 | |
1579 | unsigned PipelineSyncID = 0; |
1580 | SchedGroup *SG = nullptr; |
1581 | |
1582 | unsigned MFMAChain = 0; |
1583 | unsigned PositionInChain = 0; |
1584 | unsigned CurrMFMAForTransPosition = 0; |
1585 | |
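// Transcendental work is distributed round-robin across the MFMA chains:
// after advancing CurrMFMAForTransPosition by MFMAEnablement, the quotient by
// MFMAChains gives the position within a chain and the remainder selects
// which chain seed to target.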
1586 | auto incrementTransPosition = [&MFMAChain, &PositionInChain, |
1587 | &CurrMFMAForTransPosition]() { |
1588 | CurrMFMAForTransPosition += MFMAEnablement; |
1589 | PositionInChain = (CurrMFMAForTransPosition / MFMAChains); |
1590 | MFMAChain = CurrMFMAForTransPosition % MFMAChains; |
1591 | }; |
1592 | |
1593 | auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() { |
1594 | auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement; |
1595 | return (TempMFMAForTrans / MFMAChains); |
1596 | }; |
1597 | |
1598 | auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() { |
1599 | auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement; |
1600 | return TempMFMAForTrans % MFMAChains; |
1601 | }; |
1602 | |
1603 | unsigned CurrMFMAPosition = 0; |
1604 | unsigned MFMAChainForMFMA = 0; |
1605 | unsigned PositionInChainForMFMA = 0; |
1606 | |
1607 | auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA, |
1608 | &PositionInChainForMFMA]() { |
1609 | ++CurrMFMAPosition; |
1610 | MFMAChainForMFMA = CurrMFMAPosition % MFMAChains; |
1611 | PositionInChainForMFMA = CurrMFMAPosition / MFMAChains; |
1612 | }; |
1613 | |
1614 | bool IsPostRA = Phase == AMDGPU::SchedulingPhase::PostRA; |
1615 | assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains); |
1616 | |
1617 | bool UsesFMA = IsSmallKernelType || !IsPostRA; |
1618 | bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR; |
1619 | bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA); |
1620 | bool UsesVALU = IsSmallKernelType; |
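// These flags gate which SchedGroups are emitted below: UsesFMA adds the
// VALU groups holding FMAs, UsesDSRead the DS_READ prefetch groups, UsesCvt
// the conversion groups, and UsesVALU the in-loop VALU group for the add
// pipeline.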
1621 | |
1622 | // PHASE 1: "Prefetch" |
1623 | if (UsesFMA) { |
1624 | // First Round FMA |
1625 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
1626 | Args: SchedGroupMask::VALU, Args&: ExpRequirement, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
1627 | if (!IsPostRA && MFMAChains) { |
1628 | SG->addRule(NewRule: std::make_shared<EnablesNthMFMAInChain>( |
1629 | args&: PositionInChain, args&: MFMAChainSeeds[MFMAChain], args&: TII, args: SG->getSGID(), |
1630 | args: true)); |
1631 | } else |
1632 | SG->addRule( |
1633 | NewRule: std::make_shared<EnablesNthMFMA>(args: 1, args&: TII, args: SG->getSGID(), args: true)); |
1634 | SG->addRule(NewRule: std::make_shared<IsFMA>(args&: TII, args: SG->getSGID())); |
1635 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
1636 | |
1637 | // Second Round FMA |
1638 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
1639 | Args: SchedGroupMask::VALU, Args&: ExpRequirement, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
1640 | if (!IsPostRA && MFMAChains) { |
1641 | SG->addRule(NewRule: std::make_shared<EnablesNthMFMAInChain>( |
1642 | args: getNextTransPositionInChain(), |
1643 | args&: MFMAChainSeeds[getNextTransMFMAChain()], args&: TII, args: SG->getSGID(), args: true)); |
1644 | } else |
1645 | SG->addRule(NewRule: std::make_shared<EnablesNthMFMA>(args: MFMAEnablement + 1, args&: TII, |
1646 | args: SG->getSGID(), args: true)); |
1647 | SG->addRule(NewRule: std::make_shared<IsFMA>(args&: TII, args: SG->getSGID())); |
1648 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
1649 | } |
1650 | |
1651 | if (UsesDSRead) { |
1652 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
1653 | Args: SchedGroupMask::DS_READ, Args: 2, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
1654 | SG->addRule(NewRule: std::make_shared<OccursAtOrAfterNode>(args&: *FirstPipeDSR, args&: TII, |
1655 | args: SG->getSGID())); |
1656 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
1657 | } |
1658 | |
1659 | // First Round EXP |
1660 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
1661 | Args: SchedGroupMask::TRANS, Args&: ExpRequirement, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
1662 | if (!IsPostRA && MFMAChains) |
1663 | SG->addRule(NewRule: std::make_shared<EnablesNthMFMAInChain>( |
1664 | args&: PositionInChain, args&: MFMAChainSeeds[MFMAChain], args&: TII, args: SG->getSGID(), args: true)); |
1665 | else |
1666 | SG->addRule(NewRule: std::make_shared<EnablesNthMFMA>(args: 1, args&: TII, args: SG->getSGID(), args: true)); |
1667 | SG->addRule(NewRule: std::make_shared<IsPipeExp>(args&: TII, args: SG->getSGID(), args: true)); |
1668 | SG->addRule(NewRule: std::make_shared<LessThanNSuccs>(args: 8, args&: TII, args: SG->getSGID(), |
1669 | args&: HasChainBetweenCvt)); |
1670 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
1671 | |
1672 | incrementTransPosition(); |
1673 | |
1674 | // First Round CVT, Third Round FMA, Second Round EXP; interleaved |
1675 | for (unsigned I = 0; I < ExpRequirement; I++) { |
1676 | // First Round CVT |
1677 | if (UsesCvt) { |
1678 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
1679 | Args: SchedGroupMask::VALU, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
1680 | SG->addRule(NewRule: std::make_shared<IsCvt>(args&: TII, args: SG->getSGID())); |
1681 | if (HasChainBetweenCvt) |
1682 | SG->addRule(NewRule: std::make_shared<IsReachableFromPrevNthGroup>( |
1683 | args: 1 + (2 + UsesFMA) * I, args&: TII, args: SG->getSGID())); |
1684 | else |
1685 | SG->addRule(NewRule: std::make_shared<IsSuccOfPrevNthGroup>( |
1686 | args: 1 + (2 + UsesFMA) * I, args&: TII, args: SG->getSGID())); |
1687 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
1688 | } |
1689 | |
1690 | // Third Round FMA |
1691 | if (UsesFMA) { |
1692 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
1693 | Args: SchedGroupMask::VALU, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
1694 | if (!IsPostRA && MFMAChains) { |
1695 | SG->addRule(NewRule: std::make_shared<EnablesNthMFMAInChain>( |
1696 | args: getNextTransPositionInChain(), |
1697 | args&: MFMAChainSeeds[getNextTransMFMAChain()], args&: TII, args: SG->getSGID(), args: true)); |
1698 | } else |
1699 | SG->addRule(NewRule: std::make_shared<EnablesNthMFMA>(args: 2 * MFMAEnablement + 1, |
1700 | args&: TII, args: SG->getSGID(), args: true)); |
1701 | SG->addRule(NewRule: std::make_shared<IsFMA>(args&: TII, args: SG->getSGID())); |
1702 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
1703 | } |
1704 | |
1705 | // Second Round EXP |
1706 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
1707 | Args: SchedGroupMask::TRANS, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
1708 | if (!IsPostRA && MFMAChains) |
1709 | SG->addRule(NewRule: std::make_shared<EnablesNthMFMAInChain>( |
1710 | args&: PositionInChain, args&: MFMAChainSeeds[MFMAChain], args&: TII, args: SG->getSGID(), |
1711 | args: true)); |
1712 | else |
1713 | SG->addRule(NewRule: std::make_shared<EnablesNthMFMA>(args: MFMAEnablement + 1, args&: TII, |
1714 | args: SG->getSGID(), args: true)); |
1715 | SG->addRule(NewRule: std::make_shared<IsPipeExp>(args&: TII, args: SG->getSGID(), args: true)); |
1716 | SG->addRule(NewRule: std::make_shared<LessThanNSuccs>(args: 8, args&: TII, args: SG->getSGID(), |
1717 | args&: HasChainBetweenCvt)); |
1718 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
1719 | } |
1720 | |
1721 | // The "extra" EXP which enables all MFMAs |
1722 | // TODO: UsesExtraExp |
1723 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
1724 | Args: SchedGroupMask::TRANS, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
1725 | SG->addRule(NewRule: std::make_shared<IsPipeExp>(args&: TII, args: SG->getSGID(), args: true)); |
1726 | SG->addRule(NewRule: std::make_shared<GreaterThanOrEqualToNSuccs>( |
1727 | args: 8, args&: TII, args: SG->getSGID(), args&: HasChainBetweenCvt)); |
1728 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
1729 | |
1730 | // PHASE 2: Main Interleave Loop |
1731 | |
1732 | // The number of MFMAs per iteration |
1733 | unsigned MFMARatio = |
1734 | MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1; |
1735 | // The number of Exps per iteration |
1736 | unsigned ExpRatio = |
1737 | MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement; |
1738 | // The remaining EXPs |
1739 | unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement) |
1740 | ? TransPipeCount - (2 * ExpRequirement) |
1741 | : 0; |
1742 | unsigned ExpLoopCount = RemainingExp / ExpRatio; |
1743 | // In-loop MFMAs |
1744 | unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2) |
1745 | ? MFMAPipeCount - (MFMAEnablement * 2) |
1746 | : 0; |
1747 | unsigned MFMALoopCount = MFMAInLoop / MFMARatio; |
1748 | unsigned VALUOps = |
1749 | AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount; |
1750 | unsigned LoopSize = std::min(a: ExpLoopCount, b: MFMALoopCount); |
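// Example for the large kernel type (hypothetical pipeline counts): with
// MFMAEnablement == ExpRequirement == 4, both MFMARatio and ExpRatio are 1,
// so each iteration schedules one MFMA group and one EXP group (plus any
// gated CVT/FMA/VALU/DS_READ groups); RemainingExp = TransPipeCount - 8 and
// MFMAInLoop = MFMAPipeCount - 8 bound the trip count via LoopSize.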
1751 | |
1752 | for (unsigned I = 0; I < LoopSize; I++) { |
1753 | if (!(I * ExpRatio % ExpRequirement)) |
1754 | incrementTransPosition(); |
1755 | |
1756 | // Round N MFMA |
1757 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
1758 | Args: SchedGroupMask::MFMA, Args&: MFMARatio, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
1759 | if (!IsPostRA && MFMAChains) |
1760 | SG->addRule(NewRule: std::make_shared<IsExactMFMA>( |
1761 | args&: PositionInChainForMFMA, args&: MFMAChainSeeds[MFMAChainForMFMA], args&: TII, |
1762 | args: SG->getSGID(), args: true)); |
1763 | else |
1764 | SG->addRule(NewRule: std::make_shared<OccursAfterExp>(args&: TII, args: SG->getSGID(), args: true)); |
1765 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
1766 | incrementMFMAPosition(); |
1767 | |
1768 | if (UsesVALU) { |
1769 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
1770 | Args: SchedGroupMask::VALU, Args&: VALUOps, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
1771 | SG->addRule(NewRule: std::make_shared<IsPipeAdd>(args&: TII, args: SG->getSGID())); |
1772 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
1773 | } |
1774 | |
1775 | if (UsesDSRead && !(I % 4)) { |
1776 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
1777 | Args: SchedGroupMask::DS_READ, Args: 2, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
1778 | SG->addRule(NewRule: std::make_shared<OccursAtOrAfterNode>(args&: *FirstPipeDSR, args&: TII, |
1779 | args: SG->getSGID())); |
1780 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
1781 | } |
1782 | |
1783 | // CVT, EXP, FMA Interleaving |
1784 | for (unsigned J = 0; J < ExpRatio; J++) { |
1785 | auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (I + 1); |
1786 | auto MaxMFMAOffset = |
1787 | (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio; |
1788 | |
1789 | // Round N + 1 CVT |
1790 | if (UsesCvt) { |
1791 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
1792 | Args: SchedGroupMask::VALU, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
1793 | SG->addRule(NewRule: std::make_shared<IsCvt>(args&: TII, args: SG->getSGID())); |
1794 | auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1; |
1795 | auto DSROffset = I / 4 + 1; |
1796 | auto MaxDSROffset = MaxMFMAOffset / 4; |
1797 | // TODO: UsesExtraExp |
1798 | auto ExpOffset = I * ExpRatio + J >= ExpRequirement ? 0 : 1; |
1799 | auto CurrentOffset = UsesDSRead * std::min(a: MaxDSROffset, b: DSROffset) + |
1800 | std::min(a: MaxMFMAOffset, b: MFMAOffset) + BaseDiff + |
1801 | ExpOffset; |
1802 | if (HasChainBetweenCvt) |
1803 | SG->addRule(NewRule: std::make_shared<IsReachableFromPrevNthGroup>( |
1804 | args&: CurrentOffset, args&: TII, args: SG->getSGID())); |
1805 | else |
1806 | SG->addRule(NewRule: std::make_shared<IsSuccOfPrevNthGroup>(args&: CurrentOffset, args&: TII, |
1807 | args: SG->getSGID())); |
1808 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
1809 | } |
1810 | |
1811 | // Round N + 3 FMA |
1812 | if (UsesFMA) { |
1813 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
1814 | Args: SchedGroupMask::VALU, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
1815 | if (!IsPostRA && MFMAChains) |
1816 | SG->addRule(NewRule: std::make_shared<EnablesNthMFMAInChain>( |
1817 | args: getNextTransPositionInChain(), |
1818 | args&: MFMAChainSeeds[getNextTransMFMAChain()], args&: TII, args: SG->getSGID(), |
1819 | args: true)); |
1820 | else |
1821 | SG->addRule(NewRule: std::make_shared<EnablesNthMFMA>( |
1822 | args: (((I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1, |
1823 | args&: TII, args: SG->getSGID(), args: true)); |
1824 | SG->addRule(NewRule: std::make_shared<IsFMA>(args&: TII, args: SG->getSGID())); |
1825 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
1826 | } |
1827 | |
1828 | // Round N + 2 Exp |
1829 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
1830 | Args: SchedGroupMask::TRANS, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
1831 | if (!IsPostRA && MFMAChains) |
1832 | SG->addRule(NewRule: std::make_shared<EnablesNthMFMAInChain>( |
1833 | args&: PositionInChain, args&: MFMAChainSeeds[MFMAChain], args&: TII, args: SG->getSGID(), |
1834 | args: true)); |
1835 | else |
1836 | SG->addRule(NewRule: std::make_shared<EnablesNthMFMA>( |
1837 | args: (((I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1, |
1838 | args&: TII, args: SG->getSGID(), args: true)); |
1839 | SG->addRule(NewRule: std::make_shared<IsPipeExp>(args&: TII, args: SG->getSGID(), args: true)); |
1840 | SG->addRule(NewRule: std::make_shared<LessThanNSuccs>(args: 8, args&: TII, args: SG->getSGID(), |
1841 | args&: HasChainBetweenCvt)); |
1842 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
1843 | } |
1844 | } |
1845 | |
1846 | // PHASE 3: Remaining MFMAs |
1847 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
1848 | Args: SchedGroupMask::MFMA, Args: MFMAEnablement * 2, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
1849 | SG->addRule(NewRule: std::make_shared<OccursAfterExp>(args&: TII, args: SG->getSGID(), args: true)); |
1850 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
1851 | return true; |
1852 | } |
1853 | |
1854 | class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { |
1855 | private: |
1856 | // Whether the DS_READ is a predecessor of any of the first four MFMAs in the region |
1857 | class EnablesInitialMFMA final : public InstructionRule { |
1858 | public: |
1859 | bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
1860 | SmallVectorImpl<SchedGroup> &SyncPipe) override { |
1861 | if (!SyncPipe.size()) |
1862 | return false; |
1863 | int MFMAsFound = 0; |
1864 | if (!Cache->size()) { |
1865 | for (auto &Elt : SyncPipe[0].DAG->SUnits) { |
1866 | if (TII->isMFMAorWMMA(MI: *Elt.getInstr())) { |
1867 | ++MFMAsFound; |
1868 | if (MFMAsFound > 4) |
1869 | break; |
1870 | Cache->push_back(Elt: &Elt); |
1871 | } |
1872 | } |
1873 | } |
1874 | |
1875 | assert(Cache->size()); |
1876 | auto DAG = SyncPipe[0].DAG; |
1877 | for (auto &Elt : *Cache) { |
1878 | if (DAG->IsReachable(SU: Elt, TargetSU: const_cast<SUnit *>(SU))) |
1879 | return true; |
1880 | } |
1881 | return false; |
1882 | } |
1883 | |
1884 | EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID, |
1885 | bool NeedsCache = false) |
1886 | : InstructionRule(TII, SGID, NeedsCache) {} |
1887 | }; |
1888 | |
1889 | // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE |
1890 | class IsPermForDSW final : public InstructionRule { |
1891 | public: |
1892 | bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
1893 | SmallVectorImpl<SchedGroup> &SyncPipe) override { |
1894 | auto MI = SU->getInstr(); |
1895 | if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64) |
1896 | return false; |
1897 | |
1898 | bool FitsInGroup = false; |
1899 | // Does the VALU have a DS_WRITE successor? |
1900 | if (!Collection.size()) { |
1901 | for (auto &Succ : SU->Succs) { |
1902 | SUnit *SuccUnit = Succ.getSUnit(); |
1903 | if (TII->isDS(MI: *SuccUnit->getInstr()) && |
1904 | SuccUnit->getInstr()->mayStore()) { |
1905 | Cache->push_back(Elt: SuccUnit); |
1906 | FitsInGroup = true; |
1907 | } |
1908 | } |
1909 | return FitsInGroup; |
1910 | } |
1911 | |
1912 | assert(Cache->size()); |
1913 | |
1914 | // Does the VALU have a DS_WRITE successor that is the same as another |
1915 | // VALU already in the group? The V_PERMs will all share one DS_WRITE successor. |
1916 | return llvm::any_of(Range&: *Cache, P: [&SU](SUnit *Elt) { |
1917 | return llvm::any_of(Range: SU->Succs, P: [&Elt](const SDep &ThisSucc) { |
1918 | return ThisSucc.getSUnit() == Elt; |
1919 | }); |
1920 | }); |
1921 | } |
1922 | |
1923 | IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) |
1924 | : InstructionRule(TII, SGID, NeedsCache) {} |
1925 | }; |
1926 | |
1927 | // Whether the SU is a successor of any element in previous SchedGroup |
1928 | class IsSuccOfPrevGroup final : public InstructionRule { |
1929 | public: |
1930 | bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
1931 | SmallVectorImpl<SchedGroup> &SyncPipe) override { |
1932 | SchedGroup *OtherGroup = nullptr; |
1933 | for (auto &PipeSG : SyncPipe) { |
1934 | if ((unsigned)PipeSG.getSGID() == SGID - 1) { |
1935 | OtherGroup = &PipeSG; |
1936 | } |
1937 | } |
1938 | |
1939 | if (!OtherGroup) |
1940 | return false; |
1941 | if (!OtherGroup->Collection.size()) |
1942 | return true; |
1943 | |
1944 | // Does the previous VALU have this DS_WRITE as a successor? |
1945 | return (std::any_of(first: OtherGroup->Collection.begin(), |
1946 | last: OtherGroup->Collection.end(), pred: [&SU](SUnit *Elt) { |
1947 | return std::any_of(first: Elt->Succs.begin(), |
1948 | last: Elt->Succs.end(), |
1949 | pred: [&SU](SDep &Succ) { |
1950 | return Succ.getSUnit() == SU; |
1951 | }); |
1952 | })); |
1953 | } |
1954 | IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID, |
1955 | bool NeedsCache = false) |
1956 | : InstructionRule(TII, SGID, NeedsCache) {} |
1957 | }; |
1958 | |
1959 | // Whether the combined load width of the group stays within 128 bits |
1960 | class VMEMSize final : public InstructionRule { |
1961 | public: |
1962 | bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
1963 | SmallVectorImpl<SchedGroup> &SyncPipe) override { |
1964 | auto MI = SU->getInstr(); |
1965 | if (MI->getOpcode() == TargetOpcode::BUNDLE) |
1966 | return false; |
1967 | if (!Collection.size()) |
1968 | return true; |
1969 | |
1970 | int NumBits = 0; |
1971 | |
1972 | auto TRI = TII->getRegisterInfo(); |
1973 | auto &MRI = MI->getParent()->getParent()->getRegInfo(); |
1974 | for (auto &Elt : Collection) { |
1975 | auto Op = Elt->getInstr()->getOperand(i: 0); |
1976 | auto Size = |
1977 | TRI.getRegSizeInBits(RC: *TRI.getRegClassForOperandReg(MRI, MO: Op)); |
1978 | NumBits += Size; |
1979 | } |
1980 | |
1981 | if (NumBits < 128) { |
1982 | assert(TII->isVMEM(*MI) && MI->mayLoad()); |
1983 | if (NumBits + TRI.getRegSizeInBits(RC: *TRI.getRegClassForOperandReg( |
1984 | MRI, MO: MI->getOperand(i: 0))) <= |
1985 | 128) |
1986 | return true; |
1987 | } |
1988 | |
1989 | return false; |
1990 | } |
1991 | |
1992 | VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) |
1993 | : InstructionRule(TII, SGID, NeedsCache) {} |
1994 | }; |
1995 | |
1996 | /// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup |
1997 | /// that is \p Distance steps away |
1998 | class SharesPredWithPrevNthGroup final : public InstructionRule { |
1999 | private: |
2000 | unsigned Distance = 1; |
2001 | |
2002 | public: |
2003 | bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
2004 | SmallVectorImpl<SchedGroup> &SyncPipe) override { |
2005 | SchedGroup *OtherGroup = nullptr; |
2006 | if (!SyncPipe.size()) |
2007 | return false; |
2008 | |
2009 | if (!Cache->size()) { |
2010 | |
2011 | for (auto &PipeSG : SyncPipe) { |
2012 | if ((unsigned)PipeSG.getSGID() == SGID - Distance) { |
2013 | OtherGroup = &PipeSG; |
2014 | } |
2015 | } |
2016 | |
2017 | if (!OtherGroup) |
2018 | return false; |
2019 | if (!OtherGroup->Collection.size()) |
2020 | return true; |
2021 | |
2022 | for (auto &OtherEle : OtherGroup->Collection) { |
2023 | for (auto &Pred : OtherEle->Preds) { |
2024 | if (Pred.getSUnit()->getInstr()->getOpcode() == |
2025 | AMDGPU::V_PERM_B32_e64) |
2026 | Cache->push_back(Elt: Pred.getSUnit()); |
2027 | } |
2028 | } |
2029 | |
2030 | // If the other group has no PERM preds, then this group won't share any |
2031 | if (!Cache->size()) |
2032 | return false; |
2033 | } |
2034 | |
2035 | auto DAG = SyncPipe[0].DAG; |
2036 | // Does the previous DS_WRITE share a V_PERM predecessor with this |
2037 | // VMEM_READ |
2038 | return llvm::any_of(Range&: *Cache, P: [&SU, &DAG](SUnit *Elt) { |
2039 | return DAG->IsReachable(SU: const_cast<SUnit *>(SU), TargetSU: Elt); |
2040 | }); |
2041 | } |
2042 | SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, |
2043 | unsigned SGID, bool NeedsCache = false) |
2044 | : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} |
2045 | }; |
2046 | |
2047 | public: |
2048 | bool applyIGLPStrategy( |
2049 | DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
2050 | DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
2051 | AMDGPU::SchedulingPhase Phase) override; |
2052 | |
2053 | bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, |
2054 | AMDGPU::SchedulingPhase Phase) override { |
2055 | return true; |
2056 | } |
2057 | |
2058 | MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) |
2059 | : IGLPStrategy(DAG, TII) { |
2060 | IsBottomUp = false; |
2061 | } |
2062 | }; |
2063 | |
2064 | static unsigned DSWCount = 0; |
2065 | static unsigned DSWWithPermCount = 0; |
2066 | static unsigned DSWWithSharedVMEMCount = 0; |
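// These counters live at file scope so that values computed during the
// initial (pre-RA) scheduling phase can be reused when the strategy is
// re-applied in later phases.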
2067 | |
2068 | bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( |
2069 | DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
2070 | DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
2071 | AMDGPU::SchedulingPhase Phase) { |
2072 | unsigned MFMACount = 0; |
2073 | unsigned DSRCount = 0; |
2074 | |
2075 | bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial; |
2076 | |
2077 | assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 && |
2078 | DSWWithSharedVMEMCount == 0)) && |
2079 | "DSWCounters should be zero in pre-RA scheduling!" ); |
2080 | SmallVector<SUnit *, 6> DSWithPerms; |
2081 | for (auto &SU : DAG->SUnits) { |
2082 | auto I = SU.getInstr(); |
2083 | if (TII->isMFMAorWMMA(MI: *I)) |
2084 | ++MFMACount; |
2085 | else if (TII->isDS(MI: *I)) { |
2086 | if (I->mayLoad()) |
2087 | ++DSRCount; |
2088 | else if (I->mayStore() && IsInitial) { |
2089 | ++DSWCount; |
2090 | for (auto Pred : SU.Preds) { |
2091 | if (Pred.getSUnit()->getInstr()->getOpcode() == |
2092 | AMDGPU::V_PERM_B32_e64) { |
2093 | DSWithPerms.push_back(Elt: &SU); |
2094 | break; |
2095 | } |
2096 | } |
2097 | } |
2098 | } |
2099 | } |
2100 | |
2101 | if (IsInitial) { |
2102 | DSWWithPermCount = DSWithPerms.size(); |
2103 | auto I = DSWithPerms.begin(); |
2104 | auto E = DSWithPerms.end(); |
2105 | |
2106 | // Get the count of DS_WRITES with V_PERM predecessors which |
2107 | // have loop carried dependencies (WAR) on the same VMEM_READs. |
2108 | // We consider partial overlap as a miss -- in other words, |
2109 | // for a given DS_W, we only consider another DS_W as matching |
2110 | // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred |
2111 | // for every V_PERM pred of this DS_W. |
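// Example: if DS_W0's V_PERM preds are fed by VMEM loads {A, B} and DS_W1's
// V_PERM preds are fed by the same {A, B}, the pair is counted; if DS_W1's
// preds also pull from a load C that DS_W0 never touched, the overlap is
// only partial and the pair is treated as a miss.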
2112 | DenseMap<MachineInstr *, SUnit *> VMEMLookup; |
2113 | SmallVector<SUnit *, 6> Counted; |
2114 | for (; I != E; I++) { |
2115 | SUnit *Cand = nullptr; |
2116 | bool MissedAny = false; |
2117 | for (auto &Pred : (*I)->Preds) { |
2118 | if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64) |
2119 | continue; |
2120 | |
2121 | if (Cand && llvm::is_contained(Range&: Counted, Element: Cand)) |
2122 | break; |
2123 | |
2124 | for (auto &Succ : Pred.getSUnit()->Succs) { |
2125 | auto MI = Succ.getSUnit()->getInstr(); |
2126 | if (!TII->isVMEM(MI: *MI) || !MI->mayLoad()) |
2127 | continue; |
2128 | |
2129 | if (MissedAny || !VMEMLookup.size()) { |
2130 | MissedAny = true; |
2131 | VMEMLookup[MI] = *I; |
2132 | continue; |
2133 | } |
2134 | |
2135 | if (!VMEMLookup.contains(Val: MI)) { |
2136 | MissedAny = true; |
2137 | VMEMLookup[MI] = *I; |
2138 | continue; |
2139 | } |
2140 | |
2141 | Cand = VMEMLookup[MI]; |
2142 | if (llvm::is_contained(Range&: Counted, Element: Cand)) { |
2143 | MissedAny = true; |
2144 | break; |
2145 | } |
2146 | } |
2147 | } |
2148 | if (!MissedAny && Cand) { |
2149 | DSWWithSharedVMEMCount += 2; |
2150 | Counted.push_back(Elt: Cand); |
2151 | Counted.push_back(Elt: *I); |
2152 | } |
2153 | } |
2154 | } |
2155 | |
2156 | assert(DSWWithSharedVMEMCount <= DSWWithPermCount); |
2157 | SchedGroup *SG; |
2158 | unsigned PipelineSyncID = 0; |
2159 | // For kernels with V_PERM, there are enough VALU instructions to mix in between MFMAs |
2160 | if (DSWWithPermCount) { |
2161 | for (unsigned I = 0; I < MFMACount; I++) { |
2162 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2163 | Args: SchedGroupMask::MFMA, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2164 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2165 | |
2166 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2167 | Args: SchedGroupMask::VALU, Args: 2, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2168 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2169 | } |
2170 | } |
2171 | |
2172 | PipelineSyncID = 1; |
2173 | // Phase 1: Break up DS_READ and MFMA clusters. |
2174 | // First, DS_READs to make the initial MFMAs ready, then interleave MFMAs with |
2175 | // DS_READ prefetch |
2176 | |
2177 | // Make the initial MFMAs ready |
2178 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2179 | Args: SchedGroupMask::DS_READ, Args: 4, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2180 | SG->addRule(NewRule: std::make_shared<EnablesInitialMFMA>(args&: TII, args: SG->getSGID(), args: true)); |
2181 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2182 | |
2183 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2184 | Args: SchedGroupMask::MFMA, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2185 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2186 | |
2187 | // Interleave MFMA with DS_READ prefetch |
2188 | for (unsigned I = 0; I < DSRCount - 4; ++I) { |
2189 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2190 | Args: SchedGroupMask::DS_READ, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2191 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2192 | |
2193 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2194 | Args: SchedGroupMask::MFMA, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2195 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2196 | } |
2197 | |
2198 | // Phase 2a: Loop carried dependency with V_PERM |
2199 | // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they |
2200 | // depend on. Interleave MFMA to keep XDL unit busy throughout. |
2201 | for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) { |
2202 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2203 | Args: SchedGroupMask::VALU, Args: 4, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2204 | SG->addRule(NewRule: std::make_shared<IsPermForDSW>(args&: TII, args: SG->getSGID(), args: true)); |
2205 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2206 | |
2207 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2208 | Args: SchedGroupMask::DS_WRITE, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2209 | SG->addRule(NewRule: std::make_shared<IsSuccOfPrevGroup>(args&: TII, args: SG->getSGID())); |
2210 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2211 | |
2212 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2213 | Args: SchedGroupMask::VMEM_READ, Args: 4, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2214 | SG->addRule(NewRule: std::make_shared<SharesPredWithPrevNthGroup>( |
2215 | args: 1, args&: TII, args: SG->getSGID(), args: true)); |
2216 | SG->addRule(NewRule: std::make_shared<VMEMSize>(args&: TII, args: SG->getSGID())); |
2217 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2218 | |
2219 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2220 | Args: SchedGroupMask::MFMA, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2221 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2222 | |
2223 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2224 | Args: SchedGroupMask::VMEM_READ, Args: 4, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2225 | SG->addRule(NewRule: std::make_shared<SharesPredWithPrevNthGroup>( |
2226 | args: 3, args&: TII, args: SG->getSGID(), args: true)); |
2227 | SG->addRule(NewRule: std::make_shared<VMEMSize>(args&: TII, args: SG->getSGID())); |
2228 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2229 | |
2230 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2231 | Args: SchedGroupMask::MFMA, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2232 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2233 | } |
2234 | |
2235 | // Phase 2b: Loop carried dependency without V_PERM |
2236 | // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on. |
2237 | // Interleave MFMA to keep XDL unit busy throughout. |
2238 | for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) { |
2239 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2240 | Args: SchedGroupMask::DS_WRITE, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2241 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2242 | |
2243 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2244 | Args: SchedGroupMask::VMEM_READ, Args: 4, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2245 | SG->addRule(NewRule: std::make_shared<VMEMSize>(args&: TII, args: SG->getSGID())); |
2246 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2247 | |
2248 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2249 | Args: SchedGroupMask::MFMA, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2250 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2251 | } |
2252 | |
2253 | // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are |
2254 | // ultimately used by two DS_WRITEs |
2255 | // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they |
2256 | // depend on. Interleave MFMA to keep XDL unit busy throughout. |
2257 | |
2258 | for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) { |
2259 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2260 | Args: SchedGroupMask::VALU, Args: 4, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2261 | SG->addRule(NewRule: std::make_shared<IsPermForDSW>(args&: TII, args: SG->getSGID(), args: true)); |
2262 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2263 | |
2264 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2265 | Args: SchedGroupMask::DS_WRITE, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2266 | SG->addRule(NewRule: std::make_shared<IsSuccOfPrevGroup>(args&: TII, args: SG->getSGID())); |
2267 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2268 | |
2269 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2270 | Args: SchedGroupMask::MFMA, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2271 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2272 | |
2273 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2274 | Args: SchedGroupMask::VALU, Args: 4, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2275 | SG->addRule(NewRule: std::make_shared<IsPermForDSW>(args&: TII, args: SG->getSGID(), args: true)); |
2276 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2277 | |
2278 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2279 | Args: SchedGroupMask::DS_WRITE, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2280 | SG->addRule(NewRule: std::make_shared<IsSuccOfPrevGroup>(args&: TII, args: SG->getSGID())); |
2281 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2282 | |
2283 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2284 | Args: SchedGroupMask::MFMA, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2285 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2286 | |
2287 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2288 | Args: SchedGroupMask::VMEM_READ, Args: 4, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2289 | SG->addRule(NewRule: std::make_shared<SharesPredWithPrevNthGroup>( |
2290 | args: 2, args&: TII, args: SG->getSGID(), args: true)); |
2291 | SG->addRule(NewRule: std::make_shared<VMEMSize>(args&: TII, args: SG->getSGID())); |
2292 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2293 | |
2294 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2295 | Args: SchedGroupMask::MFMA, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2296 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2297 | |
2298 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2299 | Args: SchedGroupMask::VMEM_READ, Args: 4, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2300 | SG->addRule(NewRule: std::make_shared<SharesPredWithPrevNthGroup>( |
2301 | args: 4, args&: TII, args: SG->getSGID(), args: true)); |
2302 | SG->addRule(NewRule: std::make_shared<VMEMSize>(args&: TII, args: SG->getSGID())); |
2303 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2304 | |
2305 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
2306 | Args: SchedGroupMask::MFMA, Args: 1, Args&: PipelineSyncID, Args&: DAG, Args&: TII); |
2307 | SG->initSchedGroup(SyncedInstrs&: SyncedInstrs[SG->getSyncID()]); |
2308 | } |
2309 | |
2310 | return true; |
2311 | } |
2312 | |
2313 | static std::unique_ptr<IGLPStrategy> |
2314 | createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG, |
2315 | const SIInstrInfo *TII) { |
2316 | switch (ID) { |
2317 | case MFMASmallGemmOptID: |
2318 | return std::make_unique<MFMASmallGemmOpt>(args&: DAG, args&: TII); |
2319 | case MFMASmallGemmSingleWaveOptID: |
2320 | return std::make_unique<MFMASmallGemmSingleWaveOpt>(args&: DAG, args&: TII); |
2321 | case MFMAExpInterleave: |
2322 | return std::make_unique<MFMAExpInterleaveOpt>(args&: DAG, args&: TII); |
2323 | } |
2324 | |
2325 | llvm_unreachable("Unknown IGLPStrategyID" ); |
2326 | } |
2327 | |
2328 | class IGroupLPDAGMutation : public ScheduleDAGMutation { |
2329 | private: |
2330 | const SIInstrInfo *TII; |
2331 | |
2332 | ScheduleDAGMI *DAG; |
2333 | |
2334 | // Organize lists of SchedGroups by their SyncID. SchedGroups / |
2335 | // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added |
2336 | // between them. |
2337 | DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups; |
2338 | |
2339 | // Used to track instructions that can be mapped to multiple sched groups |
2340 | DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs; |
2341 | |
2342 | // Add DAG edges that enforce SCHED_BARRIER ordering. |
2343 | void addSchedBarrierEdges(SUnit &SU); |
2344 | |
2345 | // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should |
2346 | // not be reordered across the SCHED_BARRIER. This is used for the base |
2347 | // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that |
2348 | // SCHED_BARRIER will always block all instructions that can be classified |
2349 | // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size |
2350 | // and may only synchronize with some SchedGroups. Returns the inverse of |
2351 | // Mask. SCHED_BARRIER's mask describes which instruction types should be |
2352 | // allowed to be scheduled across it. Invert the mask to get the |
2353 | // SchedGroupMask of instructions that should be barred. |
2354 | SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const; |
2355 | |
2356 | // Create SchedGroups for a SCHED_GROUP_BARRIER. |
2357 | void initSchedGroupBarrierPipelineStage( |
2358 | std::vector<SUnit>::reverse_iterator RIter); |
2359 | |
2360 | bool initIGLPOpt(SUnit &SU); |
2361 | |
2362 | public: |
2363 | void apply(ScheduleDAGInstrs *DAGInstrs) override; |
2364 | |
2365 | // The order in which the PipelineSolver should process the candidate |
2366 | // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last |
2367 | // created SchedGroup first, and will consider that as the ultimate |
2368 | // predecessor group when linking. TOP_DOWN instead links and processes the |
2369 | // first created SchedGroup first. |
2370 | bool IsBottomUp = true; |
2371 | |
2372 | // The scheduling phase this application of IGLP corresponds with. |
2373 | AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial; |
2374 | |
2375 | IGroupLPDAGMutation() = default; |
2376 | IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {} |
2377 | }; |
2378 | |
2379 | unsigned SchedGroup::NumSchedGroups = 0; |
2380 | |
2381 | bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) { |
2382 | if (A != B && DAG->canAddEdge(SuccSU: B, PredSU: A)) { |
2383 | DAG->addEdge(SuccSU: B, PredDep: SDep(A, SDep::Artificial)); |
2384 | return true; |
2385 | } |
2386 | return false; |
2387 | } |
2388 | |
2389 | bool SchedGroup::canAddMI(const MachineInstr &MI) const { |
2390 | bool Result = false; |
2391 | if (MI.isMetaInstruction()) |
2392 | Result = false; |
2393 | |
2394 | else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) && |
2395 | (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) || |
2396 | TII->isTRANS(MI))) |
2397 | Result = true; |
2398 | |
2399 | else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) && |
2400 | TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) |
2401 | Result = true; |
2402 | |
2403 | else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) && |
2404 | TII->isSALU(MI)) |
2405 | Result = true; |
2406 | |
2407 | else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) && |
2408 | TII->isMFMAorWMMA(MI)) |
2409 | Result = true; |
2410 | |
2411 | else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) && |
2412 | (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) |
2413 | Result = true; |
2414 | |
2415 | else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) && |
2416 | MI.mayLoad() && |
2417 | (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) |
2418 | Result = true; |
2419 | |
2420 | else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) && |
2421 | MI.mayStore() && |
2422 | (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) |
2423 | Result = true; |
2424 | |
2425 | else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) && |
2426 | TII->isDS(MI)) |
2427 | Result = true; |
2428 | |
2429 | else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) && |
2430 | MI.mayLoad() && TII->isDS(MI)) |
2431 | Result = true; |
2432 | |
2433 | else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) && |
2434 | MI.mayStore() && TII->isDS(MI)) |
2435 | Result = true; |
2436 | |
2437 | else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) && |
2438 | TII->isTRANS(MI)) |
2439 | Result = true; |
2440 | |
2441 | LLVM_DEBUG( |
2442 | dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true) |
2443 | << (Result ? " could classify " : " unable to classify " ) << MI); |
2444 | |
2445 | return Result; |
2446 | } |
2447 | |
2448 | int SchedGroup::link(SUnit &SU, bool MakePred, |
2449 | std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { |
2450 | int MissedEdges = 0; |
2451 | for (auto *A : Collection) { |
2452 | SUnit *B = &SU; |
2453 | if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) |
2454 | continue; |
2455 | if (MakePred) |
2456 | std::swap(a&: A, b&: B); |
2457 | |
2458 | if (DAG->IsReachable(SU: B, TargetSU: A)) |
2459 | continue; |
2460 | |
2461 | // tryAddEdge returns false if there is a dependency that makes adding |
2462 | // the A->B edge impossible; otherwise it returns true. |
2463 | bool Added = tryAddEdge(A, B); |
2464 | if (Added) |
2465 | AddedEdges.emplace_back(args&: A, args&: B); |
2466 | else |
2467 | ++MissedEdges; |
2468 | } |
2469 | |
2470 | return MissedEdges; |
2471 | } |
2472 | |
2473 | void SchedGroup::link(SUnit &SU, bool MakePred) { |
2474 | for (auto *A : Collection) { |
2475 | SUnit *B = &SU; |
2476 | if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) |
2477 | continue; |
2478 | if (MakePred) |
2479 | std::swap(a&: A, b&: B); |
2480 | |
2481 | tryAddEdge(A, B); |
2482 | } |
2483 | } |
2484 | |
2485 | void SchedGroup::link(SUnit &SU, |
2486 | function_ref<bool(const SUnit *A, const SUnit *B)> P) { |
2487 | for (auto *A : Collection) { |
2488 | SUnit *B = &SU; |
2489 | if (P(A, B)) |
2490 | std::swap(a&: A, b&: B); |
2491 | |
2492 | tryAddEdge(A, B); |
2493 | } |
2494 | } |
2495 | |
2496 | void SchedGroup::link(SchedGroup &OtherGroup) { |
2497 | for (auto *B : OtherGroup.Collection) |
2498 | link(SU&: *B); |
2499 | } |
2500 | |
2501 | bool SchedGroup::canAddSU(SUnit &SU) const { |
2502 | MachineInstr &MI = *SU.getInstr(); |
2503 | if (MI.getOpcode() != TargetOpcode::BUNDLE) |
2504 | return canAddMI(MI); |
2505 | |
2506 | // Special case for bundled MIs. |
2507 | const MachineBasicBlock *MBB = MI.getParent(); |
2508 | MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B; |
2509 | while (E != MBB->end() && E->isBundledWithPred()) |
2510 | ++E; |
2511 | |
2512 | // Return true if all of the bundled MIs can be added to this group. |
2513 | return std::all_of(first: B, last: E, pred: [this](MachineInstr &MI) { return canAddMI(MI); }); |
2514 | } |
2515 | |
2516 | void SchedGroup::initSchedGroup() { |
2517 | for (auto &SU : DAG->SUnits) { |
2518 | if (isFull()) |
2519 | break; |
2520 | |
2521 | if (canAddSU(SU)) |
2522 | add(SU); |
2523 | } |
2524 | } |
2525 | |
2526 | void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter, |
2527 | SUnitsToCandidateSGsMap &SyncedInstrs) { |
2528 | SUnit &InitSU = *RIter; |
2529 | for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) { |
2530 | auto &SU = *RIter; |
2531 | if (isFull()) |
2532 | break; |
2533 | |
2534 | if (canAddSU(SU)) |
2535 | SyncedInstrs[&SU].push_back(Elt: SGID); |
2536 | } |
2537 | |
2538 | add(SU&: InitSU); |
2539 | assert(MaxSize); |
2540 | (*MaxSize)++; |
2541 | } |
2542 | |
2543 | void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) { |
2544 | auto I = DAG->SUnits.rbegin(); |
2545 | auto E = DAG->SUnits.rend(); |
2546 | for (; I != E; ++I) { |
2547 | auto &SU = *I; |
2548 | if (isFull()) |
2549 | break; |
2550 | if (canAddSU(SU)) |
2551 | SyncedInstrs[&SU].push_back(Elt: SGID); |
2552 | } |
2553 | } |
2554 | |
2555 | void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { |
2556 | const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); |
2557 | if (!TSchedModel || DAGInstrs->SUnits.empty()) |
2558 | return; |
2559 | |
2560 | LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n" ); |
2561 | const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); |
2562 | TII = ST.getInstrInfo(); |
2563 | DAG = static_cast<ScheduleDAGMI *>(DAGInstrs); |
2564 | SyncedSchedGroups.clear(); |
2565 | SyncedInstrs.clear(); |
2566 | bool FoundSB = false; |
2567 | bool FoundIGLP = false; |
2568 | bool ShouldApplyIGLP = false; |
2569 | for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) { |
2570 | unsigned Opc = R->getInstr()->getOpcode(); |
2571 | // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive. |
2572 | if (Opc == AMDGPU::SCHED_BARRIER) { |
2573 | addSchedBarrierEdges(SU&: *R); |
2574 | FoundSB = true; |
2575 | } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) { |
2576 | initSchedGroupBarrierPipelineStage(RIter: R); |
2577 | FoundSB = true; |
2578 | } else if (Opc == AMDGPU::IGLP_OPT) { |
2579 | resetEdges(SU&: *R, DAG); |
2580 | if (!FoundSB && !FoundIGLP) { |
2581 | FoundIGLP = true; |
2582 | ShouldApplyIGLP = initIGLPOpt(SU&: *R); |
2583 | } |
2584 | } |
2585 | } |
2586 | |
2587 | if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) { |
2588 | PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp); |
2589 | // PipelineSolver performs the mutation by adding the edges it |
2590 | // determined to be best |
2591 | PS.solve(); |
2592 | return; |
2593 | } |
2594 | } |
2595 | |
2596 | void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { |
2597 | MachineInstr &MI = *SchedBarrier.getInstr(); |
2598 | assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER); |
2599 | // Remove all existing edges from the SCHED_BARRIER that were added due to the |
2600 | // instruction having side effects. |
2601 | resetEdges(SU&: SchedBarrier, DAG); |
2602 | LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: " |
2603 | << MI.getOperand(0).getImm() << "\n" ); |
2604 | auto InvertedMask = |
2605 | invertSchedBarrierMask(Mask: (SchedGroupMask)MI.getOperand(i: 0).getImm()); |
2606 | SchedGroup SG(InvertedMask, std::nullopt, DAG, TII); |
2607 | SG.initSchedGroup(); |
2608 | |
2609 | // Preserve original instruction ordering relative to the SCHED_BARRIER. |
2610 | SG.link( |
2611 | SU&: SchedBarrier, |
2612 | P: (function_ref<bool(const SUnit *A, const SUnit *B)>)[]( |
2613 | const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; }); |
2614 | } |
2615 | |
2616 | SchedGroupMask |
2617 | IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const { |
2618 | // Invert mask and erase bits for types of instructions that are implied to be |
2619 | // allowed past the SCHED_BARRIER. |
2620 | SchedGroupMask InvertedMask = ~Mask; |
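// Example: a SCHED_BARRIER mask that only permits VALU to cross initially
// blocks everything else, including the ALU super-class; since letting VALU
// cross implies ALU may cross, the ALU bit is cleared from the inverted mask
// below so VALU instructions are not blocked via their ALU classification.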
2621 | |
2622 | // ALU implies VALU, SALU, MFMA, TRANS. |
2623 | if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE) |
2624 | InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & |
2625 | ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS; |
2626 | // VALU, SALU, MFMA, TRANS implies ALU. |
2627 | else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE || |
2628 | (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE || |
2629 | (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE || |
2630 | (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE) |
2631 | InvertedMask &= ~SchedGroupMask::ALU; |
2632 | |
2633 | // VMEM implies VMEM_READ, VMEM_WRITE. |
2634 | if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE) |
2635 | InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE; |
2636 | // VMEM_READ, VMEM_WRITE implies VMEM. |
2637 | else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE || |
2638 | (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE) |
2639 | InvertedMask &= ~SchedGroupMask::VMEM; |
2640 | |
2641 | // DS implies DS_READ, DS_WRITE. |
2642 | if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE) |
2643 | InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE; |
2644 | // DS_READ, DS_WRITE implies DS. |
2645 | else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE || |
2646 | (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE) |
2647 | InvertedMask &= ~SchedGroupMask::DS; |
2648 | |
2649 | LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask |
2650 | << "\n" ); |
2651 | |
2652 | return InvertedMask; |
2653 | } |
2654 | |
2655 | void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage( |
2656 | std::vector<SUnit>::reverse_iterator RIter) { |
2657 | // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due |
2658 | // to the instruction having side effects. |
2659 | resetEdges(SU&: *RIter, DAG); |
2660 | MachineInstr &SGB = *RIter->getInstr(); |
2661 | assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER); |
2662 | int32_t SGMask = SGB.getOperand(i: 0).getImm(); |
2663 | int32_t Size = SGB.getOperand(i: 1).getImm(); |
2664 | int32_t SyncID = SGB.getOperand(i: 2).getImm(); |
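// These immediates mirror the llvm.amdgcn.sched.group.barrier intrinsic
// operands (mask, size, sync id). For example, a SCHED_GROUP_BARRIER with
// mask 8 (MFMA), size 1 and sync id 0 requests a one-slot MFMA group in sync
// pipeline 0.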
2665 | |
2666 | auto &SG = SyncedSchedGroups[SyncID].emplace_back(Args: (SchedGroupMask)SGMask, |
2667 | Args&: Size, Args&: SyncID, Args&: DAG, Args&: TII); |
2668 | |
2669 | SG.initSchedGroup(RIter, SyncedInstrs&: SyncedInstrs[SG.getSyncID()]); |
2670 | } |
2671 | |
2672 | bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { |
2673 | IGLPStrategyID StrategyID = |
2674 | (IGLPStrategyID)SU.getInstr()->getOperand(i: 0).getImm(); |
2675 | auto S = createIGLPStrategy(ID: StrategyID, DAG, TII); |
2676 | if (!S->shouldApplyStrategy(DAG, Phase)) |
2677 | return false; |
2678 | |
2679 | IsBottomUp = S->IsBottomUp; |
2680 | return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase); |
2681 | } |
2682 | |
2683 | } // namespace |
2684 | |
2685 | namespace llvm { |
2686 | |
2687 | /// \p Phase specifies whether or not this is a reentry into the |
2688 | /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the |
2689 | /// same scheduling region (e.g. pre and post-RA scheduling / multiple |
2690 | /// scheduling "phases"), we can reenter this mutation framework more than once |
2691 | /// for a given region. |
2692 | std::unique_ptr<ScheduleDAGMutation> |
2693 | createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) { |
2694 | return std::make_unique<IGroupLPDAGMutation>(args&: Phase); |
2695 | } |
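// A typical registration (a sketch; the actual call sites live in the AMDGPU
// scheduler setup code) looks like:
//   DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));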
2696 | |
2697 | } // end namespace llvm |
2698 | |