1//===- AMDGPUCoExecSchedStrategy.cpp - CoExec Scheduling Strategy ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Coexecution-focused scheduling strategy for AMDGPU.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUCoExecSchedStrategy.h"
15#include "AMDGPUIGroupLP.h"
16#include "llvm/Support/Debug.h"
17
18using namespace llvm;
19using namespace llvm::AMDGPU;
20
21#define DEBUG_TYPE "machine-scheduler"
22
23namespace {
24
25// Used to disable post-RA scheduling with function level granularity.
26class GCNNoopPostScheduleDAG final : public ScheduleDAGInstrs {
27public:
28 explicit GCNNoopPostScheduleDAG(MachineSchedContext *C)
29 : ScheduleDAGInstrs(*C->MF, C->MLI, /*RemoveKillFlags=*/true) {}
30
31 // Do nothing.
32 void schedule() override {}
33};
34
35} // namespace
36
37static SUnit *pickOnlyChoice(SchedBoundary &Zone) {
38 // pickOnlyChoice() releases pending instructions and checks for new hazards.
39 SUnit *OnlyChoice = Zone.pickOnlyChoice();
40 if (!Zone.Pending.empty())
41 return nullptr;
42
43 return OnlyChoice;
44}
45
46InstructionFlavor llvm::AMDGPU::classifyFlavor(const MachineInstr &MI,
47 const SIInstrInfo &SII) {
48 if (MI.isDebugInstr())
49 return InstructionFlavor::Other;
50
51 unsigned Opc = MI.getOpcode();
52
53 // Check for specific opcodes first.
54 if (Opc == AMDGPU::ATOMIC_FENCE || Opc == AMDGPU::S_WAIT_ASYNCCNT ||
55 Opc == AMDGPU::S_WAIT_TENSORCNT || Opc == AMDGPU::S_BARRIER_WAIT ||
56 Opc == AMDGPU::S_BARRIER_SIGNAL_IMM)
57 return InstructionFlavor::Fence;
58
59 if (SII.isLDSDMA(MI))
60 return InstructionFlavor::DMA;
61
62 if (SII.isMFMAorWMMA(MI))
63 return InstructionFlavor::WMMA;
64
65 if (SII.isTRANS(MI))
66 return InstructionFlavor::TRANS;
67
68 if (SII.isVALU(MI, /*AllowLDSDMA=*/true))
69 return InstructionFlavor::SingleCycleVALU;
70
71 if (SII.isDS(MI))
72 return InstructionFlavor::DS;
73
74 if (SII.isVMEM(MI))
75 return InstructionFlavor::VMEM;
76
77 if (SII.isSALU(MI))
78 return InstructionFlavor::SALU;
79
80 return InstructionFlavor::Other;
81}
82
83SUnit *HardwareUnitInfo::getNextTargetSU(bool LookDeep) const {
84 for (SUnit *PrioritySU : PrioritySUs) {
85 if (!PrioritySU->isTopReady())
86 return PrioritySU;
87 }
88
89 if (!LookDeep)
90 return nullptr;
91
92 unsigned MinDepth = std::numeric_limits<unsigned int>::max();
93 SUnit *TargetSU = nullptr;
94 for (auto *SU : AllSUs) {
95 if (SU->isScheduled)
96 continue;
97
98 if (SU->isTopReady())
99 continue;
100
101 if (SU->getDepth() < MinDepth) {
102 MinDepth = SU->getDepth();
103 TargetSU = SU;
104 }
105 }
106 return TargetSU;
107}
108
109void HardwareUnitInfo::insert(SUnit *SU, unsigned BlockingCycles) {
110 if (!AllSUs.insert(X: SU))
111 llvm_unreachable("HardwareUnit already contains SU!");
112
113 TotalCycles += BlockingCycles;
114
115 if (PrioritySUs.empty()) {
116 PrioritySUs.insert(X: SU);
117 return;
118 }
119 unsigned SUDepth = SU->getDepth();
120 unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
121 if (SUDepth > CurrDepth)
122 return;
123
124 if (SUDepth == CurrDepth) {
125 PrioritySUs.insert(X: SU);
126 return;
127 }
128
129 // SU is lower depth and should be prioritized.
130 PrioritySUs.clear();
131 PrioritySUs.insert(X: SU);
132}
133
134void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) {
135 // We may want to ignore some HWUIs (e.g. InstructionFlavor::Other). To do so,
136 // we just clear the HWUI. However, we still have instructions which map to
137 // this HWUI. Don't bother managing the state for these HWUI.
138 if (TotalCycles == 0)
139 return;
140
141 AllSUs.remove(X: SU);
142 PrioritySUs.remove(X: SU);
143
144 TotalCycles -= BlockingCycles;
145
146 if (AllSUs.empty())
147 return;
148 if (PrioritySUs.empty()) {
149 for (auto SU : AllSUs) {
150 if (PrioritySUs.empty()) {
151 PrioritySUs.insert(X: SU);
152 continue;
153 }
154 unsigned SUDepth = SU->getDepth();
155 unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
156 if (SUDepth > CurrDepth)
157 continue;
158
159 if (SUDepth == CurrDepth) {
160 PrioritySUs.insert(X: SU);
161 continue;
162 }
163
164 // SU is lower depth and should be prioritized.
165 PrioritySUs.clear();
166 PrioritySUs.insert(X: SU);
167 }
168 }
169}
170
171HardwareUnitInfo *
172CandidateHeuristics::getHWUIFromFlavor(InstructionFlavor Flavor) {
173 for (HardwareUnitInfo &HWUICand : HWUInfo) {
174 if (HWUICand.getType() == Flavor) {
175 return &HWUICand;
176 }
177 }
178 return nullptr;
179}
180
181unsigned CandidateHeuristics::getHWUICyclesForInst(SUnit *SU) {
182 assert(SchedModel && SchedModel->hasInstrSchedModel());
183 unsigned ReleaseAtCycle = 0;
184 const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
185 for (TargetSchedModel::ProcResIter PI = SchedModel->getWriteProcResBegin(SC),
186 PE = SchedModel->getWriteProcResEnd(SC);
187 PI != PE; ++PI) {
188 ReleaseAtCycle = std::max(a: ReleaseAtCycle, b: (unsigned)PI->ReleaseAtCycle);
189 }
190 return ReleaseAtCycle;
191}
192
193void CandidateHeuristics::updateForScheduling(SUnit *SU) {
194 HardwareUnitInfo *HWUI =
195 getHWUIFromFlavor(Flavor: classifyFlavor(MI: *SU->getInstr(), SII: *SII));
196 assert(HWUI);
197 HWUI->markScheduled(SU, BlockingCycles: getHWUICyclesForInst(SU));
198}
199
200void CandidateHeuristics::initialize(ScheduleDAGMI *SchedDAG,
201 const TargetSchedModel *TargetSchedModel,
202 const TargetRegisterInfo *TRI) {
203 DAG = SchedDAG;
204 SchedModel = TargetSchedModel;
205 assert(SchedModel && SchedModel->hasInstrSchedModel());
206
207 SRI = static_cast<const SIRegisterInfo *>(TRI);
208 SII = static_cast<const SIInstrInfo *>(DAG->TII);
209
210 HWUInfo.resize(N: (int)InstructionFlavor::NUM_FLAVORS);
211
212 for (unsigned I = 0; I < HWUInfo.size(); I++) {
213 HWUInfo[I].reset();
214 HWUInfo[I].setType(I);
215 }
216
217 HWUInfo[(int)InstructionFlavor::WMMA].setProducesCoexecWindow(true);
218 HWUInfo[(int)InstructionFlavor::MultiCycleVALU].setProducesCoexecWindow(true);
219 HWUInfo[(int)InstructionFlavor::TRANS].setProducesCoexecWindow(true);
220
221 collectHWUIPressure();
222}
223
224void CandidateHeuristics::collectHWUIPressure() {
225 if (!SchedModel || !SchedModel->hasInstrSchedModel())
226 return;
227
228 for (auto &SU : DAG->SUnits) {
229 const InstructionFlavor Flavor = classifyFlavor(MI: *SU.getInstr(), SII: *SII);
230 HWUInfo[(int)(Flavor)].insert(SU: &SU, BlockingCycles: getHWUICyclesForInst(SU: &SU));
231 }
232
233 LLVM_DEBUG(dumpRegionSummary());
234}
235
236void CandidateHeuristics::dumpRegionSummary() {
237 MachineBasicBlock *BB = DAG->begin()->getParent();
238 dbgs() << "\n=== Region: " << DAG->MF.getName() << " BB" << BB->getNumber()
239 << " (" << DAG->SUnits.size() << " SUs) ===\n";
240
241 dbgs() << "\nHWUI Resource Pressure:\n";
242 for (auto &HWUI : HWUInfo) {
243 if (HWUI.getTotalCycles() == 0)
244 continue;
245
246 StringRef Name = getFlavorName(F: HWUI.getType());
247 dbgs() << " " << Name << ": " << HWUI.getTotalCycles() << " cycles, "
248 << HWUI.size() << " instrs\n";
249 }
250 dbgs() << "\n";
251}
252
253void CandidateHeuristics::sortHWUIResources() {
254 // Highest priority should be first.
255 llvm::sort(C&: HWUInfo, Comp: [](HardwareUnitInfo &A, HardwareUnitInfo &B) {
256 // Prefer CoexecWindow producers
257 if (A.producesCoexecWindow() != B.producesCoexecWindow())
258 return A.producesCoexecWindow();
259
260 // Prefer more demanded resources
261 if (A.getTotalCycles() != B.getTotalCycles())
262 return A.getTotalCycles() > B.getTotalCycles();
263
264 // In ties -- prefer the resource with more instructions
265 if (A.size() != B.size())
266 return A.size() < B.size();
267
268 // Default to Flavor order
269 return static_cast<unsigned>(A.getType()) <
270 static_cast<unsigned>(B.getType());
271 });
272}
273
274bool CandidateHeuristics::tryCriticalResourceDependency(
275 GenericSchedulerBase::SchedCandidate &TryCand,
276 GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const {
277
278 auto HasPrioritySU = [this, &Cand, &TryCand](unsigned ResourceIdx) {
279 const HardwareUnitInfo &HWUI = HWUInfo[ResourceIdx];
280
281 auto CandFlavor = classifyFlavor(MI: *Cand.SU->getInstr(), SII: *SII);
282 auto TryCandFlavor = classifyFlavor(MI: *TryCand.SU->getInstr(), SII: *SII);
283 bool LookDeep = (CandFlavor == InstructionFlavor::DS ||
284 TryCandFlavor == InstructionFlavor::DS) &&
285 HWUI.getType() == InstructionFlavor::WMMA;
286 auto *TargetSU = HWUI.getNextTargetSU(LookDeep);
287
288 // If we do not have a TargetSU for this resource, then it is not critical.
289 if (!TargetSU)
290 return false;
291
292 return true;
293 };
294
295 auto TryEnablesResource = [&Cand, &TryCand, this](unsigned ResourceIdx) {
296 const HardwareUnitInfo &HWUI = HWUInfo[ResourceIdx];
297 auto CandFlavor = classifyFlavor(MI: *Cand.SU->getInstr(), SII: *SII);
298
299 // We want to ensure our DS order matches WMMA order.
300 bool LookDeep = CandFlavor == InstructionFlavor::DS &&
301 HWUI.getType() == InstructionFlavor::WMMA;
302 auto *TargetSU = HWUI.getNextTargetSU(LookDeep);
303
304 bool CandEnables =
305 TargetSU != Cand.SU && DAG->IsReachable(SU: TargetSU, TargetSU: Cand.SU);
306 bool TryCandEnables =
307 TargetSU != TryCand.SU && DAG->IsReachable(SU: TargetSU, TargetSU: TryCand.SU);
308
309 if (!CandEnables && !TryCandEnables)
310 return false;
311
312 if (CandEnables && !TryCandEnables) {
313 if (Cand.Reason > GenericSchedulerBase::RegCritical)
314 Cand.Reason = GenericSchedulerBase::RegCritical;
315
316 return true;
317 }
318
319 if (!CandEnables && TryCandEnables) {
320 TryCand.Reason = GenericSchedulerBase::RegCritical;
321 return true;
322 }
323
324 // Both enable, prefer the critical path.
325 unsigned CandHeight = Cand.SU->getHeight();
326 unsigned TryCandHeight = TryCand.SU->getHeight();
327
328 if (CandHeight > TryCandHeight) {
329 if (Cand.Reason > GenericSchedulerBase::RegCritical)
330 Cand.Reason = GenericSchedulerBase::RegCritical;
331
332 return true;
333 }
334
335 if (CandHeight < TryCandHeight) {
336 TryCand.Reason = GenericSchedulerBase::RegCritical;
337 return true;
338 }
339
340 // Same critical path, just prefer original candidate.
341 if (Cand.Reason > GenericSchedulerBase::RegCritical)
342 Cand.Reason = GenericSchedulerBase::RegCritical;
343
344 return true;
345 };
346
347 for (unsigned I = 0; I < HWUInfo.size(); I++) {
348 // If we have encountered a resource that is not critical, then neither
349 // candidate enables a critical resource
350 if (!HasPrioritySU(I))
351 continue;
352
353 bool Enabled = TryEnablesResource(I);
354 // If neither has enabled the resource, continue to the next resource
355 if (Enabled)
356 return true;
357 }
358 return false;
359}
360
361bool CandidateHeuristics::tryCriticalResource(
362 GenericSchedulerBase::SchedCandidate &TryCand,
363 GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const {
364 for (unsigned I = 0; I < HWUInfo.size(); I++) {
365 const HardwareUnitInfo &HWUI = HWUInfo[I];
366
367 bool CandUsesCrit = HWUI.contains(SU: Cand.SU);
368 bool TryCandUsesCrit = HWUI.contains(SU: TryCand.SU);
369
370 if (!CandUsesCrit && !TryCandUsesCrit)
371 continue;
372
373 if (CandUsesCrit != TryCandUsesCrit) {
374 if (CandUsesCrit) {
375 if (Cand.Reason > GenericSchedulerBase::RegCritical)
376 Cand.Reason = GenericSchedulerBase::RegCritical;
377 return true;
378 }
379 TryCand.Reason = GenericSchedulerBase::RegCritical;
380 return true;
381 }
382
383 // Otherwise, both use the critical resource
384 // For longer latency InstructionFlavors, we should prioritize first by
385 // their enablement of critical resources
386 if (HWUI.getType() == InstructionFlavor::DS) {
387 if (tryCriticalResourceDependency(TryCand, Cand, Zone))
388 return true;
389 }
390
391 // Prioritize based on HWUI priorities.
392 SUnit *Match = HWUI.getHigherPriority(SU: Cand.SU, Other: TryCand.SU);
393 if (Match) {
394 if (Match == Cand.SU) {
395 if (Cand.Reason > GenericSchedulerBase::RegCritical)
396 Cand.Reason = GenericSchedulerBase::RegCritical;
397 return true;
398 }
399 TryCand.Reason = GenericSchedulerBase::RegCritical;
400 return true;
401 }
402 }
403
404 return false;
405}
406
407AMDGPUCoExecSchedStrategy::AMDGPUCoExecSchedStrategy(
408 const MachineSchedContext *C)
409 : GCNSchedStrategy(C) {
410 SchedStages.push_back(Elt: GCNSchedStageID::ILPInitialSchedule);
411 SchedStages.push_back(Elt: GCNSchedStageID::RewriteMFMAForm);
412 SchedStages.push_back(Elt: GCNSchedStageID::PreRARematerialize);
413 // Use more accurate GCN pressure trackers.
414 UseGCNTrackers = true;
415}
416
417void AMDGPUCoExecSchedStrategy::initPolicy(MachineBasicBlock::iterator Begin,
418 MachineBasicBlock::iterator End,
419 unsigned NumRegionInstrs) {
420 GCNSchedStrategy::initPolicy(Begin, End, NumRegionInstrs);
421 assert((PreRADirection == MISched::Unspecified ||
422 PreRADirection == MISched::TopDown) &&
423 "coexec scheduler only supports top-down scheduling");
424 RegionPolicy.OnlyTopDown = true;
425 RegionPolicy.OnlyBottomUp = false;
426 RegionPolicy.ShouldTrackLaneMasks = true;
427}
428
429void AMDGPUCoExecSchedStrategy::initialize(ScheduleDAGMI *DAG) {
430 // Coexecution scheduling strategy is only done top-down to support new
431 // resource balancing heuristics.
432 RegionPolicy.OnlyTopDown = true;
433 RegionPolicy.OnlyBottomUp = false;
434
435 GCNSchedStrategy::initialize(DAG);
436 Heurs.initialize(SchedDAG: DAG, TargetSchedModel: SchedModel, TRI);
437}
438
439void AMDGPUCoExecSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
440 Heurs.updateForScheduling(SU);
441 GCNSchedStrategy::schedNode(SU, IsTopNode);
442}
443
444SUnit *AMDGPUCoExecSchedStrategy::pickNode(bool &IsTopNode) {
445 assert(RegionPolicy.OnlyTopDown && !RegionPolicy.OnlyBottomUp &&
446 "coexec scheduler only supports top-down scheduling");
447
448 if (DAG->top() == DAG->bottom()) {
449 assert(Top.Available.empty() && Top.Pending.empty() &&
450 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
451 return nullptr;
452 }
453
454 bool PickedPending = false;
455 SUnit *SU = nullptr;
456#ifndef NDEBUG
457 SchedCandidate *PickedCand = nullptr;
458#endif
459 do {
460 PickedPending = false;
461 SU = pickOnlyChoice(Zone&: Top);
462 if (!SU) {
463 CandPolicy NoPolicy;
464 TopCand.reset(NewPolicy: NoPolicy);
465 pickNodeFromQueue(Zone&: Top, ZonePolicy: NoPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand,
466 PickedPending, /*IsBottomUp=*/false);
467 assert(TopCand.Reason != NoCand && "failed to find a candidate");
468 SU = TopCand.SU;
469#ifndef NDEBUG
470 PickedCand = &TopCand;
471#endif
472 }
473 IsTopNode = true;
474 } while (SU->isScheduled);
475
476 LLVM_DEBUG(if (PickedCand) dumpPickSummary(SU, IsTopNode, *PickedCand));
477
478 if (PickedPending) {
479 unsigned ReadyCycle = SU->TopReadyCycle;
480 unsigned CurrentCycle = Top.getCurrCycle();
481 if (ReadyCycle > CurrentCycle)
482 Top.bumpCycle(NextCycle: ReadyCycle);
483
484 // checkHazard() does not expose the exact cycle where the hazard clears.
485 while (Top.checkHazard(SU))
486 Top.bumpCycle(NextCycle: Top.getCurrCycle() + 1);
487
488 Top.releasePending();
489 }
490
491 if (SU->isTopReady())
492 Top.removeReady(SU);
493 if (SU->isBottomReady())
494 Bot.removeReady(SU);
495
496 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
497 << *SU->getInstr());
498
499 assert(IsTopNode && "coexec scheduler must only schedule from top boundary");
500 return SU;
501}
502
503void AMDGPUCoExecSchedStrategy::pickNodeFromQueue(
504 SchedBoundary &Zone, const CandPolicy &ZonePolicy,
505 const RegPressureTracker &RPTracker, SchedCandidate &Cand,
506 bool &PickedPending, bool IsBottomUp) {
507 assert(Zone.isTop() && "coexec scheduler only supports top boundary");
508 assert(!IsBottomUp && "coexec scheduler only supports top-down scheduling");
509
510 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
511 ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
512 unsigned SGPRPressure = 0;
513 unsigned VGPRPressure = 0;
514 PickedPending = false;
515 if (DAG->isTrackingPressure()) {
516 if (!useGCNTrackers()) {
517 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
518 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
519 } else {
520 SGPRPressure = DownwardTracker.getPressure().getSGPRNum();
521 VGPRPressure = DownwardTracker.getPressure().getArchVGPRNum();
522 }
523 }
524
525 auto EvaluateQueue = [&](ReadyQueue &Q, bool FromPending) {
526 for (SUnit *SU : Q) {
527 SchedCandidate TryCand(ZonePolicy);
528 initCandidate(Cand&: TryCand, SU, AtTop: Zone.isTop(), RPTracker, SRI, SGPRPressure,
529 VGPRPressure, IsBottomUp);
530 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
531 tryCandidateCoexec(Cand, TryCand, Zone: ZoneArg);
532 if (TryCand.Reason != NoCand) {
533 if (TryCand.ResDelta == SchedResourceDelta())
534 TryCand.initResourceDelta(DAG: Zone.DAG, SchedModel);
535 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
536 PickedPending = FromPending;
537 Cand.setBest(TryCand);
538 } else {
539 LLVM_DEBUG(printCandidateDecision(TryCand, Cand));
540 }
541 }
542 };
543
544 LLVM_DEBUG(dbgs() << "Available Q:\n");
545 EvaluateQueue(Zone.Available, /*FromPending=*/false);
546
547 LLVM_DEBUG(dbgs() << "Pending Q:\n");
548 EvaluateQueue(Zone.Pending, /*FromPending=*/true);
549}
550
551#ifndef NDEBUG
552void AMDGPUCoExecSchedStrategy::dumpPickSummary(SUnit *SU, bool IsTopNode,
553 SchedCandidate &Cand) {
554 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
555 unsigned Cycle = IsTopNode ? Top.getCurrCycle() : Bot.getCurrCycle();
556
557 dbgs() << "=== Pick @ Cycle " << Cycle << " ===\n";
558
559 const InstructionFlavor Flavor = classifyFlavor(*SU->getInstr(), *SII);
560 dbgs() << "Picked: SU(" << SU->NodeNum << ") ";
561 SU->getInstr()->print(dbgs(), /*IsStandalone=*/true, /*SkipOpers=*/false,
562 /*SkipDebugLoc=*/true);
563 dbgs() << " [" << getFlavorName(Flavor) << "]\n";
564
565 dbgs() << " Reason: ";
566 if (LastAMDGPUReason != AMDGPUSchedReason::None)
567 dbgs() << getReasonName(LastAMDGPUReason);
568 else if (Cand.Reason != NoCand)
569 dbgs() << GenericSchedulerBase::getReasonStr(Cand.Reason);
570 else
571 dbgs() << "Unknown";
572 dbgs() << "\n\n";
573
574 LastAMDGPUReason = AMDGPUSchedReason::None;
575}
576#endif
577
578bool AMDGPUCoExecSchedStrategy::tryCandidateCoexec(SchedCandidate &Cand,
579 SchedCandidate &TryCand,
580 SchedBoundary *Zone) {
581 // Initialize the candidate if needed.
582 if (!Cand.isValid()) {
583 TryCand.Reason = FirstValid;
584 return true;
585 }
586
587 // Bias PhysReg Defs and copies to their uses and defined respectively.
588 if (tryGreater(TryVal: biasPhysReg(SU: TryCand.SU, isTop: TryCand.AtTop),
589 CandVal: biasPhysReg(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: PhysReg))
590 return TryCand.Reason != NoCand;
591
592 // Avoid exceeding the target's limit.
593 if (DAG->isTrackingPressure() &&
594 tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand,
595 Reason: RegExcess, TRI, MF: DAG->MF))
596 return TryCand.Reason != NoCand;
597
598 // We only compare a subset of features when comparing nodes between
599 // Top and Bottom boundary. Some properties are simply incomparable, in many
600 // other instances we should only override the other boundary if something
601 // is a clear good pick on one boundary. Skip heuristics that are more
602 // "tie-breaking" in nature.
603 bool SameBoundary = Zone != nullptr;
604 if (SameBoundary) {
605 // Compare candidates by the stall they would introduce if
606 // scheduled in the current cycle.
607 if (tryEffectiveStall(Cand, TryCand, Zone&: *Zone))
608 return TryCand.Reason != NoCand;
609
610 Heurs.sortHWUIResources();
611 if (Heurs.tryCriticalResource(TryCand, Cand, Zone)) {
612 LastAMDGPUReason = AMDGPUSchedReason::CritResourceBalance;
613 return TryCand.Reason != NoCand;
614 }
615
616 if (Heurs.tryCriticalResourceDependency(TryCand, Cand, Zone)) {
617 LastAMDGPUReason = AMDGPUSchedReason::CritResourceDep;
618 return TryCand.Reason != NoCand;
619 }
620 }
621
622 // Keep clustered nodes together to encourage downstream peephole
623 // optimizations which may reduce resource requirements.
624 //
625 // This is a best effort to set things up for a post-RA pass. Optimizations
626 // like generating loads of multiple registers should ideally be done within
627 // the scheduler pass by combining the loads during DAG postprocessing.
628 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
629 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
630 bool CandIsClusterSucc =
631 isTheSameCluster(A: CandZoneCluster, B: Cand.SU->ParentClusterIdx);
632 bool TryCandIsClusterSucc =
633 isTheSameCluster(A: TryCandZoneCluster, B: TryCand.SU->ParentClusterIdx);
634
635 if (tryGreater(TryVal: TryCandIsClusterSucc, CandVal: CandIsClusterSucc, TryCand, Cand,
636 Reason: Cluster))
637 return TryCand.Reason != NoCand;
638
639 if (SameBoundary) {
640 // Weak edges are for clustering and other constraints.
641 if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop),
642 CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak))
643 return TryCand.Reason != NoCand;
644 }
645
646 // Avoid increasing the max pressure of the entire region.
647 if (DAG->isTrackingPressure() &&
648 tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand,
649 Cand, Reason: RegMax, TRI, MF: DAG->MF))
650 return TryCand.Reason != NoCand;
651
652 if (SameBoundary) {
653 // Avoid serializing long latency dependence chains.
654 // For acyclic path limited loops, latency was already checked above.
655 if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
656 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone&: *Zone))
657 return TryCand.Reason != NoCand;
658
659 // Fall through to original instruction order.
660 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
661 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
662 TryCand.Reason = NodeOrder;
663 return true;
664 }
665 }
666
667 return false;
668}
669
670bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
671 SchedCandidate &TryCand,
672 SchedBoundary &Zone) const {
673 // Treat structural and latency stalls as a single scheduling cost for the
674 // current cycle.
675 struct StallCosts {
676 unsigned Ready = 0;
677 unsigned Structural = 0;
678 unsigned Latency = 0;
679 unsigned Effective = 0;
680 };
681
682 unsigned CurrCycle = Zone.getCurrCycle();
683 auto GetStallCosts = [&](SUnit *SU) {
684 unsigned ReadyCycle = Zone.isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
685 StallCosts Costs;
686 Costs.Ready = ReadyCycle > CurrCycle ? ReadyCycle - CurrCycle : 0;
687 Costs.Structural = getStructuralStallCycles(Zone, SU);
688 Costs.Latency = Zone.getLatencyStallCycles(SU);
689 Costs.Effective = std::max(l: {Costs.Ready, Costs.Structural, Costs.Latency});
690 return Costs;
691 };
692
693 StallCosts TryCosts = GetStallCosts(TryCand.SU);
694 StallCosts CandCosts = GetStallCosts(Cand.SU);
695
696 LLVM_DEBUG(if (TryCosts.Effective || CandCosts.Effective) {
697 dbgs() << "Effective stalls: try=" << TryCosts.Effective
698 << " (ready=" << TryCosts.Ready << ", struct=" << TryCosts.Structural
699 << ", lat=" << TryCosts.Latency << ") cand=" << CandCosts.Effective
700 << " (ready=" << CandCosts.Ready
701 << ", struct=" << CandCosts.Structural
702 << ", lat=" << CandCosts.Latency << ")\n";
703 });
704
705 return tryLess(TryVal: TryCosts.Effective, CandVal: CandCosts.Effective, TryCand, Cand, Reason: Stall);
706}
707
708ScheduleDAGInstrs *
709llvm::createGCNCoExecMachineScheduler(MachineSchedContext *C) {
710 LLVM_DEBUG(dbgs() << "AMDGPU coexec preRA scheduler selected for "
711 << C->MF->getName() << '\n');
712 ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
713 C, std::make_unique<AMDGPUCoExecSchedStrategy>(args&: C));
714 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
715 return DAG;
716}
717
718ScheduleDAGInstrs *
719llvm::createGCNNoopPostMachineScheduler(MachineSchedContext *C) {
720 LLVM_DEBUG(dbgs() << "AMDGPU nop postRA scheduler selected for "
721 << C->MF->getName() << '\n');
722 return new GCNNoopPostScheduleDAG(C);
723}
724