AMDGPUCoExecSchedStrategy.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp]

1	//===- AMDGPUCoExecSchedStrategy.cpp - CoExec Scheduling Strategy ---------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// Coexecution-focused scheduling strategy for AMDGPU.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "AMDGPUCoExecSchedStrategy.h"
15	#include "llvm/Support/Debug.h"
16
17	using namespace llvm;
18	using namespace llvm::AMDGPU;
19
20	#define DEBUG_TYPE "machine-scheduler"
21
22	namespace {
23
24	// Used to disable post-RA scheduling with function level granularity.
25	class GCNNoopPostScheduleDAG final : public ScheduleDAGInstrs {
26	public:
27	explicit GCNNoopPostScheduleDAG(MachineSchedContext *C)
28	: ScheduleDAGInstrs (C->MF, C->MLI, /RemoveKillFlags=/*true) {}
29
30	// Do nothing.
31	void schedule() override {}
32	};
33
34	} // namespace
35
36	static SUnit *pickOnlyChoice(SchedBoundary &Zone) {
37	// pickOnlyChoice() releases pending instructions and checks for new hazards.
38	SUnit *OnlyChoice = Zone.pickOnlyChoice();
39	if (!Zone.Pending.empty())
40	return nullptr;
41
42	return OnlyChoice;
43	}
44
45	InstructionFlavor llvm::AMDGPU::classifyFlavor(const MachineInstr &MI,
46	const SIInstrInfo &SII) {
47	if (MI.isDebugInstr())
48	return InstructionFlavor::Other;
49
50	unsigned Opc = MI.getOpcode();
51
52	// Check for specific opcodes first.
53	if (Opc == AMDGPU::ATOMIC_FENCE \|\| Opc == AMDGPU::S_WAIT_ASYNCCNT \|\|
54	Opc == AMDGPU::S_WAIT_TENSORCNT \|\| Opc == AMDGPU::S_BARRIER_WAIT \|\|
55	Opc == AMDGPU::S_BARRIER_SIGNAL_IMM)
56	return InstructionFlavor::Fence;
57
58	if (SII.isLDSDMA(MI))
59	return InstructionFlavor::DMA;
60
61	if (SII.isMFMAorWMMA(MI))
62	return InstructionFlavor::WMMA;
63
64	if (SII.isTRANS(MI))
65	return InstructionFlavor::TRANS;
66
67	if (SII.isVALU(MI))
68	return InstructionFlavor::SingleCycleVALU;
69
70	if (SII.isDS(MI))
71	return InstructionFlavor::DS;
72
73	if (SII.isFLAT(MI) \|\| SII.isFLATGlobal(MI) \|\| SII.isFLATScratch(MI))
74	return InstructionFlavor::VMEM;
75
76	if (SII.isSALU(MI))
77	return InstructionFlavor::SALU;
78
79	return InstructionFlavor::Other;
80	}
81
82	SUnit HardwareUnitInfo::getNextTargetSU(bool* LookDeep) const {
83	for (auto *PrioritySU : PrioritySUs) {
84	if (!PrioritySU->isTopReady())
85	return PrioritySU;
86	}
87
88	if (!LookDeep)
89	return nullptr;
90
91	unsigned MinDepth = std::numeric_limits<unsigned int>::max();
92	SUnit TargetSU = nullptr*;
93	for (auto *SU : AllSUs) {
94	if (SU->isScheduled)
95	continue;
96
97	if (SU->isTopReady())
98	continue;
99
100	if (SU->getDepth() < MinDepth) {
101	MinDepth = SU->getDepth();
102	TargetSU = SU;
103	}
104	}
105	return TargetSU;
106	}
107
108	void HardwareUnitInfo::insert(SUnit SU, unsigned* BlockingCycles) {
109	#ifndef NDEBUG
110	bool Inserted = AllSUs.insert(SU);
111	assert(Inserted);
112	#else
113	AllSUs.insert(X: SU);
114	#endif
115
116	TotalCycles += BlockingCycles;
117
118	if (PrioritySUs.empty()) {
119	PrioritySUs.insert(X: SU);
120	return;
121	}
122	unsigned SUDepth = SU->getDepth();
123	unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
124	if (SUDepth > CurrDepth)
125	return;
126
127	if (SUDepth == CurrDepth) {
128	PrioritySUs.insert(X: SU);
129	return;
130	}
131
132	// SU is lower depth and should be prioritized.
133	PrioritySUs.clear();
134	PrioritySUs.insert(X: SU);
135	}
136
137	void HardwareUnitInfo::markScheduled(SUnit SU, unsigned* BlockingCycles) {
138	// We may want to ignore some HWUIs (e.g. InstructionFlavor::Other). To do so,
139	// we just clear the HWUI. However, we still have instructions which map to
140	// this HWUI. Don't bother managing the state for these HWUI.
141	if (TotalCycles == `0`)
142	return;
143
144	AllSUs.remove(X: SU);
145	PrioritySUs.remove(X: SU);
146
147	TotalCycles -= BlockingCycles;
148
149	if (AllSUs.empty())
150	return;
151	if (PrioritySUs.empty()) {
152	for (auto SU : AllSUs) {
153	if (PrioritySUs.empty()) {
154	PrioritySUs.insert(X: SU);
155	continue;
156	}
157	unsigned SUDepth = SU->getDepth();
158	unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
159	if (SUDepth > CurrDepth)
160	continue;
161
162	if (SUDepth == CurrDepth) {
163	PrioritySUs.insert(X: SU);
164	continue;
165	}
166
167	// SU is lower depth and should be prioritized.
168	PrioritySUs.clear();
169	PrioritySUs.insert(X: SU);
170	}
171	}
172	}
173
174	HardwareUnitInfo *
175	CandidateHeuristics::getHWUIFromFlavor(InstructionFlavor Flavor) {
176	for (auto &HWUICand : HWUInfo) {
177	if (HWUICand.getType() == Flavor) {
178	return &HWUICand;
179	}
180	}
181	return nullptr;
182	}
183
184	unsigned CandidateHeuristics::getHWUICyclesForInst(SUnit *SU) {
185	assert(SchedModel && SchedModel->hasInstrSchedModel());
186	unsigned ReleaseAtCycle = `0`;
187	const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
188	for (TargetSchedModel::ProcResIter PI = SchedModel->getWriteProcResBegin(SC),
189	PE = SchedModel->getWriteProcResEnd(SC);
190	PI != PE; ++PI) {
191	ReleaseAtCycle = std::max(a: ReleaseAtCycle, b: (unsigned)PI->ReleaseAtCycle);
192	}
193	return ReleaseAtCycle;
194	}
195
196	void CandidateHeuristics::updateForScheduling(SUnit *SU) {
197	HardwareUnitInfo *HWUI =
198	getHWUIFromFlavor(Flavor: classifyFlavor(MI: SU->getInstr(), SII: SII));
199	assert(HWUI);
200	HWUI->markScheduled(SU, BlockingCycles: getHWUICyclesForInst(SU));
201	}
202
203	void CandidateHeuristics::initialize(ScheduleDAGMI *SchedDAG,
204	const TargetSchedModel *TargetSchedModel,
205	const TargetRegisterInfo *TRI) {
206	DAG = SchedDAG;
207	SchedModel = TargetSchedModel;
208	assert(SchedModel && SchedModel->hasInstrSchedModel());
209
210	SRI = static_cast<const SIRegisterInfo *>(TRI);
211	SII = static_cast<const SIInstrInfo *>(DAG->TII);
212
213	HWUInfo.resize(N: (int)InstructionFlavor::NUM_FLAVORS);
214
215	for (unsigned I = `0`; I < HWUInfo.size(); I++) {
216	HWUInfo [I].reset();
217	HWUInfo [I].setType(I);
218	}
219
220	HWUInfo [(int)InstructionFlavor::WMMA].setProducesCoexecWindow(true);
221	HWUInfo [(int)InstructionFlavor::MultiCycleVALU].setProducesCoexecWindow(true);
222	HWUInfo [(int)InstructionFlavor::TRANS].setProducesCoexecWindow(true);
223
224	collectHWUIPressure();
225	}
226
227	void CandidateHeuristics::collectHWUIPressure() {
228	if (!SchedModel \|\| !SchedModel->hasInstrSchedModel())
229	return;
230
231	for (auto &SU : DAG->SUnits) {
232	const InstructionFlavor Flavor = classifyFlavor(MI: SU.getInstr(), SII: SII);
233	HWUInfo [(int)(Flavor)].insert(SU: &SU, BlockingCycles: getHWUICyclesForInst(SU: &SU));
234	}
235
236	LLVM_DEBUG(dumpRegionSummary());
237	}
238
239	void CandidateHeuristics::dumpRegionSummary() {
240	MachineBasicBlock *BB = DAG->begin()->getParent();
241	dbgs() << "\n=== Region: " << DAG->MF.getName() << " BB" << BB->getNumber()
242	<< " (" << DAG->SUnits.size() << " SUs) ===\n";
243
244	dbgs() << "\nHWUI Resource Pressure:\n";
245	for (auto &HWUI : HWUInfo) {
246	if (HWUI.getTotalCycles() == `0`)
247	continue;
248
249	StringRef Name = getFlavorName(F: HWUI.getType());
250	dbgs() << " " << Name << ": " << HWUI.getTotalCycles() << " cycles, "
251	<< HWUI.size() << " instrs\n";
252	}
253	dbgs() << "\n";
254	}
255
256	void CandidateHeuristics::sortHWUIResources() {
257	// Highest priority should be first.
258	llvm::sort(C&: HWUInfo, Comp: [](HardwareUnitInfo &A, HardwareUnitInfo &B) {
259	// Prefer CoexecWindow producers
260	if (A.producesCoexecWindow() != B.producesCoexecWindow())
261	return A.producesCoexecWindow();
262
263	// Prefer more demanded resources
264	if (A.getTotalCycles() != B.getTotalCycles())
265	return A.getTotalCycles() > B.getTotalCycles();
266
267	// In ties -- prefer the resource with more instructions
268	if (A.size() != B.size())
269	return A.size() < B.size();
270
271	// Default to Flavor order
272	return (unsigned)A.getType() < (unsigned)B.getType();
273	});
274	}
275
276	bool CandidateHeuristics::tryCriticalResourceDependency(
277	GenericSchedulerBase::SchedCandidate &TryCand,
278	GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary Zone) const* {
279
280	auto HasPrioritySU = [this, &Cand, &TryCand](unsigned ResourceIdx) {
281	const HardwareUnitInfo &HWUI = HWUInfo [ResourceIdx];
282
283	auto CandFlavor = classifyFlavor(MI: Cand.SU->getInstr(), SII: SII);
284	auto TryCandFlavor = classifyFlavor(MI: TryCand.SU->getInstr(), SII: SII);
285	bool LookDeep = (CandFlavor == InstructionFlavor::DS \|\|
286	TryCandFlavor == InstructionFlavor::DS) &&
287	HWUI.getType() == InstructionFlavor::WMMA;
288	auto *TargetSU = HWUI.getNextTargetSU(LookDeep);
289
290	// If we do not have a TargetSU for this resource, then it is not critical.
291	if (!TargetSU)
292	return false;
293
294	return true;
295	};
296
297	auto TryEnablesResource = [&Cand, &TryCand, this](unsigned ResourceIdx) {
298	const HardwareUnitInfo &HWUI = HWUInfo [ResourceIdx];
299	auto CandFlavor = classifyFlavor(MI: Cand.SU->getInstr(), SII: SII);
300
301	// We want to ensure our DS order matches WMMA order.
302	bool LookDeep = CandFlavor == InstructionFlavor::DS &&
303	HWUI.getType() == InstructionFlavor::WMMA;
304	auto *TargetSU = HWUI.getNextTargetSU(LookDeep);
305
306	bool CandEnables =
307	TargetSU != Cand.SU && DAG->IsReachable(SU: TargetSU, TargetSU: Cand.SU);
308	bool TryCandEnables =
309	TargetSU != TryCand.SU && DAG->IsReachable(SU: TargetSU, TargetSU: TryCand.SU);
310
311	if (!CandEnables && !TryCandEnables)
312	return false;
313
314	if (CandEnables && !TryCandEnables) {
315	if (Cand.Reason > GenericSchedulerBase::RegCritical)
316	Cand.Reason = GenericSchedulerBase::RegCritical;
317
318	return true;
319	}
320
321	if (!CandEnables && TryCandEnables) {
322	TryCand.Reason = GenericSchedulerBase::RegCritical;
323	return true;
324	}
325
326	// Both enable, prefer the critical path.
327	unsigned CandHeight = Cand.SU->getHeight();
328	unsigned TryCandHeight = TryCand.SU->getHeight();
329
330	if (CandHeight > TryCandHeight) {
331	if (Cand.Reason > GenericSchedulerBase::RegCritical)
332	Cand.Reason = GenericSchedulerBase::RegCritical;
333
334	return true;
335	}
336
337	if (CandHeight < TryCandHeight) {
338	TryCand.Reason = GenericSchedulerBase::RegCritical;
339	return true;
340	}
341
342	// Same critical path, just prefer original candidate.
343	if (Cand.Reason > GenericSchedulerBase::RegCritical)
344	Cand.Reason = GenericSchedulerBase::RegCritical;
345
346	return true;
347	};
348
349	for (unsigned I = `0`; I < HWUInfo.size(); I++) {
350	// If we have encountered a resource that is not critical, then neither
351	// candidate enables a critical resource
352	if (!HasPrioritySU (I))
353	continue;
354
355	bool Enabled = TryEnablesResource (I);
356	// If neither has enabled the resource, continue to the next resource
357	if (Enabled)
358	return true;
359	}
360	return false;
361	}
362
363	bool CandidateHeuristics::tryCriticalResource(
364	GenericSchedulerBase::SchedCandidate &TryCand,
365	GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary Zone) const* {
366	for (unsigned I = `0`; I < HWUInfo.size(); I++) {
367	const HardwareUnitInfo &HWUI = HWUInfo [I];
368
369	bool CandUsesCrit = HWUI.contains(SU: Cand.SU);
370	bool TryCandUsesCrit = HWUI.contains(SU: TryCand.SU);
371
372	if (!CandUsesCrit && !TryCandUsesCrit)
373	continue;
374
375	if (CandUsesCrit != TryCandUsesCrit) {
376	if (CandUsesCrit) {
377	if (Cand.Reason > GenericSchedulerBase::RegCritical)
378	Cand.Reason = GenericSchedulerBase::RegCritical;
379	return true;
380	}
381	TryCand.Reason = GenericSchedulerBase::RegCritical;
382	return true;
383	}
384
385	// Otherwise, both use the critical resource
386	// For longer latency InstructionFlavors, we should prioritize first by
387	// their enablement of critical resources
388	if (HWUI.getType() == InstructionFlavor::DS) {
389	if (tryCriticalResourceDependency(TryCand, Cand, Zone))
390	return true;
391	}
392
393	// Prioritize based on HWUI priorities.
394	SUnit *Match = HWUI.getHigherPriority(SU: Cand.SU, Other: TryCand.SU);
395	if (Match) {
396	if (Match == Cand.SU) {
397	if (Cand.Reason > GenericSchedulerBase::RegCritical)
398	Cand.Reason = GenericSchedulerBase::RegCritical;
399	return true;
400	}
401	TryCand.Reason = GenericSchedulerBase::RegCritical;
402	return true;
403	}
404	}
405
406	return false;
407	}
408
409	AMDGPUCoExecSchedStrategy::AMDGPUCoExecSchedStrategy(
410	const MachineSchedContext *C)
411	: GCNSchedStrategy (C) {
412	SchedStages.push_back(Elt: GCNSchedStageID::ILPInitialSchedule);
413	SchedStages.push_back(Elt: GCNSchedStageID::PreRARematerialize);
414	// Use more accurate GCN pressure trackers.
415	UseGCNTrackers = true;
416	}
417
418	void AMDGPUCoExecSchedStrategy::initPolicy(MachineBasicBlock::iterator Begin,
419	MachineBasicBlock::iterator End,
420	unsigned NumRegionInstrs) {
421	GCNSchedStrategy::initPolicy(Begin, End, NumRegionInstrs);
422	assert((PreRADirection == MISched::Unspecified \|\|
423	PreRADirection == MISched::TopDown) &&
424	"coexec scheduler only supports top-down scheduling");
425	RegionPolicy.OnlyTopDown = true;
426	RegionPolicy.OnlyBottomUp = false;
427	}
428
429	void AMDGPUCoExecSchedStrategy::initialize(ScheduleDAGMI *DAG) {
430	// Coexecution scheduling strategy is only done top-down to support new
431	// resource balancing heuristics.
432	RegionPolicy.OnlyTopDown = true;
433	RegionPolicy.OnlyBottomUp = false;
434
435	GCNSchedStrategy::initialize(DAG);
436	Heurs.initialize(SchedDAG: DAG, TargetSchedModel: SchedModel, TRI);
437	}
438
439	void AMDGPUCoExecSchedStrategy::schedNode(SUnit SU, bool* IsTopNode) {
440	Heurs.updateForScheduling(SU);
441	GCNSchedStrategy::schedNode(SU, IsTopNode);
442	}
443
444	SUnit AMDGPUCoExecSchedStrategy::pickNode(bool* &IsTopNode) {
445	assert(RegionPolicy.OnlyTopDown && !RegionPolicy.OnlyBottomUp &&
446	"coexec scheduler only supports top-down scheduling");
447
448	if (DAG->top() == DAG->bottom()) {
449	assert(Top.Available.empty() && Top.Pending.empty() &&
450	Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
451	return nullptr;
452	}
453
454	bool PickedPending = false;
455	SUnit SU = nullptr*;
456	#ifndef NDEBUG
457	SchedCandidate PickedCand = nullptr*;
458	#endif
459	do {
460	PickedPending = false;
461	SU = pickOnlyChoice(Zone&: Top);
462	if (!SU) {
463	CandPolicy NoPolicy;
464	TopCand.reset(NewPolicy: NoPolicy);
465	pickNodeFromQueue(Zone&: Top, ZonePolicy: NoPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand,
466	PickedPending, /IsBottomUp=/false);
467	assert(TopCand.Reason != NoCand && "failed to find a candidate");
468	SU = TopCand.SU;
469	#ifndef NDEBUG
470	PickedCand = &TopCand;
471	#endif
472	}
473	IsTopNode = true;
474	} while (SU->isScheduled);
475
476	LLVM_DEBUG(if (PickedCand) dumpPickSummary(SU, IsTopNode, *PickedCand));
477
478	if (PickedPending) {
479	unsigned ReadyCycle = SU->TopReadyCycle;
480	unsigned CurrentCycle = Top.getCurrCycle();
481	if (ReadyCycle > CurrentCycle)
482	Top.bumpCycle(NextCycle: ReadyCycle);
483
484	// checkHazard() does not expose the exact cycle where the hazard clears.
485	while (Top.checkHazard(SU))
486	Top.bumpCycle(NextCycle: Top.getCurrCycle() + `1`);
487
488	Top.releasePending();
489	}
490
491	if (SU->isTopReady())
492	Top.removeReady(SU);
493	if (SU->isBottomReady())
494	Bot.removeReady(SU);
495
496	LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
497	<< *SU->getInstr());
498
499	assert(IsTopNode && "coexec scheduler must only schedule from top boundary");
500	return SU;
501	}
502
503	void AMDGPUCoExecSchedStrategy::pickNodeFromQueue(
504	SchedBoundary &Zone, const CandPolicy &ZonePolicy,
505	const RegPressureTracker &RPTracker, SchedCandidate &Cand,
506	bool &PickedPending, bool IsBottomUp) {
507	assert(Zone.isTop() && "coexec scheduler only supports top boundary");
508	assert(!IsBottomUp && "coexec scheduler only supports top-down scheduling");
509
510	const SIRegisterInfo SRI = static_cast<const* SIRegisterInfo *>(TRI);
511	ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
512	unsigned SGPRPressure = `0`;
513	unsigned VGPRPressure = `0`;
514	PickedPending = false;
515	if (DAG->isTrackingPressure()) {
516	if (!useGCNTrackers()) {
517	SGPRPressure = Pressure [AMDGPU::RegisterPressureSets::SReg_32];
518	VGPRPressure = Pressure [AMDGPU::RegisterPressureSets::VGPR_32];
519	} else {
520	SGPRPressure = DownwardTracker.getPressure().getSGPRNum();
521	VGPRPressure = DownwardTracker.getPressure().getArchVGPRNum();
522	}
523	}
524
525	auto EvaluateQueue = [&](ReadyQueue &Q, bool FromPending) {
526	for (SUnit *SU : Q) {
527	SchedCandidate TryCand(ZonePolicy);
528	initCandidate(Cand&: TryCand, SU, AtTop: Zone.isTop(), RPTracker, SRI, SGPRPressure,
529	VGPRPressure, IsBottomUp);
530	SchedBoundary ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr*;
531	tryCandidateCoexec(Cand, TryCand, Zone: ZoneArg);
532	if (TryCand.Reason != NoCand) {
533	if (TryCand.ResDelta == SchedResourceDelta ())
534	TryCand.initResourceDelta(DAG: Zone.DAG, SchedModel);
535	LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
536	PickedPending = FromPending;
537	Cand.setBest(TryCand);
538	} else {
539	LLVM_DEBUG(printCandidateDecision(TryCand, Cand));
540	}
541	}
542	};
543
544	LLVM_DEBUG(dbgs() << "Available Q:\n");
545	EvaluateQueue (Zone.Available, /FromPending=/false);
546
547	LLVM_DEBUG(dbgs() << "Pending Q:\n");
548	EvaluateQueue (Zone.Pending, /FromPending=/true);
549	}
550
551	#ifndef NDEBUG
552	void AMDGPUCoExecSchedStrategy::dumpPickSummary(SUnit SU, bool* IsTopNode,
553	SchedCandidate &Cand) {
554	const SIInstrInfo SII = static_cast<const* SIInstrInfo *>(DAG->TII);
555	unsigned Cycle = IsTopNode ? Top.getCurrCycle() : Bot.getCurrCycle();
556
557	dbgs() << "=== Pick @ Cycle " << Cycle << " ===\n";
558
559	const InstructionFlavor Flavor = classifyFlavor(SU->getInstr(), SII);
560	dbgs() << "Picked: SU(" << SU->NodeNum << ") ";
561	SU->getInstr()->print(dbgs(), /IsStandalone=/true, /SkipOpers=/false,
562	/SkipDebugLoc=/true);
563	dbgs() << " [" << getFlavorName(Flavor) << "]\n";
564
565	dbgs() << " Reason: ";
566	if (LastAMDGPUReason != AMDGPUSchedReason::None)
567	dbgs() << getReasonName(LastAMDGPUReason);
568	else if (Cand.Reason != NoCand)
569	dbgs() << GenericSchedulerBase::getReasonStr(Cand.Reason);
570	else
571	dbgs() << "Unknown";
572	dbgs() << "\n\n";
573
574	LastAMDGPUReason = AMDGPUSchedReason::None;
575	}
576	#endif
577
578	bool AMDGPUCoExecSchedStrategy::tryCandidateCoexec(SchedCandidate &Cand,
579	SchedCandidate &TryCand,
580	SchedBoundary *Zone) {
581	// Initialize the candidate if needed.
582	if (!Cand.isValid()) {
583	TryCand.Reason = FirstValid;
584	return true;
585	}
586
587	// Bias PhysReg Defs and copies to their uses and defined respectively.
588	if (tryGreater(TryVal: biasPhysReg(SU: TryCand.SU, isTop: TryCand.AtTop),
589	CandVal: biasPhysReg(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: PhysReg))
590	return TryCand.Reason != NoCand;
591
592	// Avoid exceeding the target's limit.
593	if (DAG->isTrackingPressure() &&
594	tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand,
595	Reason: RegExcess, TRI, MF: DAG->MF))
596	return TryCand.Reason != NoCand;
597
598	// We only compare a subset of features when comparing nodes between
599	// Top and Bottom boundary. Some properties are simply incomparable, in many
600	// other instances we should only override the other boundary if something
601	// is a clear good pick on one boundary. Skip heuristics that are more
602	// "tie-breaking" in nature.
603	bool SameBoundary = Zone != nullptr;
604	if (SameBoundary) {
605	// Compare candidates by the stall they would introduce if
606	// scheduled in the current cycle.
607	if (tryEffectiveStall(Cand, TryCand, Zone&: *Zone))
608	return TryCand.Reason != NoCand;
609
610	Heurs.sortHWUIResources();
611	if (Heurs.tryCriticalResource(TryCand, Cand, Zone)) {
612	LastAMDGPUReason = AMDGPUSchedReason::CritResourceBalance;
613	return TryCand.Reason != NoCand;
614	}
615
616	if (Heurs.tryCriticalResourceDependency(TryCand, Cand, Zone)) {
617	LastAMDGPUReason = AMDGPUSchedReason::CritResourceDep;
618	return TryCand.Reason != NoCand;
619	}
620	}
621
622	// Keep clustered nodes together to encourage downstream peephole
623	// optimizations which may reduce resource requirements.
624	//
625	// This is a best effort to set things up for a post-RA pass. Optimizations
626	// like generating loads of multiple registers should ideally be done within
627	// the scheduler pass by combining the loads during DAG postprocessing.
628	unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
629	unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
630	bool CandIsClusterSucc =
631	isTheSameCluster(A: CandZoneCluster, B: Cand.SU->ParentClusterIdx);
632	bool TryCandIsClusterSucc =
633	isTheSameCluster(A: TryCandZoneCluster, B: TryCand.SU->ParentClusterIdx);
634
635	if (tryGreater(TryVal: TryCandIsClusterSucc, CandVal: CandIsClusterSucc, TryCand, Cand,
636	Reason: Cluster))
637	return TryCand.Reason != NoCand;
638
639	if (SameBoundary) {
640	// Weak edges are for clustering and other constraints.
641	if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop),
642	CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak))
643	return TryCand.Reason != NoCand;
644	}
645
646	// Avoid increasing the max pressure of the entire region.
647	if (DAG->isTrackingPressure() &&
648	tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand,
649	Cand, Reason: RegMax, TRI, MF: DAG->MF))
650	return TryCand.Reason != NoCand;
651
652	if (SameBoundary) {
653	// Avoid serializing long latency dependence chains.
654	// For acyclic path limited loops, latency was already checked above.
655	if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
656	!Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone&: *Zone))
657	return TryCand.Reason != NoCand;
658
659	// Fall through to original instruction order.
660	if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) \|\|
661	(!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
662	TryCand.Reason = NodeOrder;
663	return true;
664	}
665	}
666
667	return false;
668	}
669
670	bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
671	SchedCandidate &TryCand,
672	SchedBoundary &Zone) const {
673	// Treat structural and latency stalls as a single scheduling cost for the
674	// current cycle.
675	struct StallCosts {
676	unsigned Ready = `0`;
677	unsigned Structural = `0`;
678	unsigned Latency = `0`;
679	unsigned Effective = `0`;
680	};
681
682	unsigned CurrCycle = Zone.getCurrCycle();
683	auto GetStallCosts = [&](SUnit *SU) {
684	unsigned ReadyCycle = Zone.isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
685	StallCosts Costs;
686	Costs.Ready = ReadyCycle > CurrCycle ? ReadyCycle - CurrCycle : `0`;
687	Costs.Structural = getStructuralStallCycles(Zone, SU);
688	Costs.Latency = Zone.getLatencyStallCycles(SU);
689	Costs.Effective = std::max(l: {Costs.Ready, Costs.Structural, Costs.Latency});
690	return Costs;
691	};
692
693	StallCosts TryCosts = GetStallCosts (TryCand.SU);
694	StallCosts CandCosts = GetStallCosts (Cand.SU);
695
696	LLVM_DEBUG(if (TryCosts.Effective \|\| CandCosts.Effective) {
697	dbgs() << "Effective stalls: try=" << TryCosts.Effective
698	<< " (ready=" << TryCosts.Ready << ", struct=" << TryCosts.Structural
699	<< ", lat=" << TryCosts.Latency << ") cand=" << CandCosts.Effective
700	<< " (ready=" << CandCosts.Ready
701	<< ", struct=" << CandCosts.Structural
702	<< ", lat=" << CandCosts.Latency << ")\n";
703	});
704
705	return tryLess(TryVal: TryCosts.Effective, CandVal: CandCosts.Effective, TryCand, Cand, Reason: Stall);
706	}
707
708	ScheduleDAGInstrs *
709	llvm::createGCNCoExecMachineScheduler(MachineSchedContext *C) {
710	LLVM_DEBUG(dbgs() << "AMDGPU coexec preRA scheduler selected for "
711	<< C->MF->getName() << `'\n'`);
712	return new GCNScheduleDAGMILive (
713	C, std::make_unique<AMDGPUCoExecSchedStrategy>(args&: C));
714	}
715
716	ScheduleDAGInstrs *
717	llvm::createGCNNoopPostMachineScheduler(MachineSchedContext *C) {
718	LLVM_DEBUG(dbgs() << "AMDGPU nop postRA scheduler selected for "
719	<< C->MF->getName() << `'\n'`);
720	return new GCNNoopPostScheduleDAG (C);
721	}
722

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp