1//===- AMDGPUCoExecSchedStrategy.cpp - CoExec Scheduling Strategy ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Coexecution-focused scheduling strategy for AMDGPU.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUCoExecSchedStrategy.h"
15#include "llvm/Support/Debug.h"
16
17using namespace llvm;
18using namespace llvm::AMDGPU;
19
20#define DEBUG_TYPE "machine-scheduler"
21
namespace {

// Used to disable post-RA scheduling with function level granularity.
// Returned by createGCNNoopPostMachineScheduler(); schedule() is overridden
// to do nothing, so the instruction order produced by earlier passes is left
// untouched.
class GCNNoopPostScheduleDAG final : public ScheduleDAGInstrs {
public:
  explicit GCNNoopPostScheduleDAG(MachineSchedContext *C)
      : ScheduleDAGInstrs(*C->MF, C->MLI, /*RemoveKillFlags=*/true) {}

  // Do nothing.
  void schedule() override {}
};

} // namespace
35
36static SUnit *pickOnlyChoice(SchedBoundary &Zone) {
37 // pickOnlyChoice() releases pending instructions and checks for new hazards.
38 SUnit *OnlyChoice = Zone.pickOnlyChoice();
39 if (!Zone.Pending.empty())
40 return nullptr;
41
42 return OnlyChoice;
43}
44
45InstructionFlavor llvm::AMDGPU::classifyFlavor(const MachineInstr &MI,
46 const SIInstrInfo &SII) {
47 if (MI.isDebugInstr())
48 return InstructionFlavor::Other;
49
50 unsigned Opc = MI.getOpcode();
51
52 // Check for specific opcodes first.
53 if (Opc == AMDGPU::ATOMIC_FENCE || Opc == AMDGPU::S_WAIT_ASYNCCNT ||
54 Opc == AMDGPU::S_WAIT_TENSORCNT || Opc == AMDGPU::S_BARRIER_WAIT ||
55 Opc == AMDGPU::S_BARRIER_SIGNAL_IMM)
56 return InstructionFlavor::Fence;
57
58 if (SII.isLDSDMA(MI))
59 return InstructionFlavor::DMA;
60
61 if (SII.isMFMAorWMMA(MI))
62 return InstructionFlavor::WMMA;
63
64 if (SII.isTRANS(MI))
65 return InstructionFlavor::TRANS;
66
67 if (SII.isVALU(MI))
68 return InstructionFlavor::SingleCycleVALU;
69
70 if (SII.isDS(MI))
71 return InstructionFlavor::DS;
72
73 if (SII.isFLAT(MI) || SII.isFLATGlobal(MI) || SII.isFLATScratch(MI))
74 return InstructionFlavor::VMEM;
75
76 if (SII.isSALU(MI))
77 return InstructionFlavor::SALU;
78
79 return InstructionFlavor::Other;
80}
81
82SUnit *HardwareUnitInfo::getNextTargetSU(bool LookDeep) const {
83 for (auto *PrioritySU : PrioritySUs) {
84 if (!PrioritySU->isTopReady())
85 return PrioritySU;
86 }
87
88 if (!LookDeep)
89 return nullptr;
90
91 unsigned MinDepth = std::numeric_limits<unsigned int>::max();
92 SUnit *TargetSU = nullptr;
93 for (auto *SU : AllSUs) {
94 if (SU->isScheduled)
95 continue;
96
97 if (SU->isTopReady())
98 continue;
99
100 if (SU->getDepth() < MinDepth) {
101 MinDepth = SU->getDepth();
102 TargetSU = SU;
103 }
104 }
105 return TargetSU;
106}
107
108void HardwareUnitInfo::insert(SUnit *SU, unsigned BlockingCycles) {
109#ifndef NDEBUG
110 bool Inserted = AllSUs.insert(SU);
111 assert(Inserted);
112#else
113 AllSUs.insert(X: SU);
114#endif
115
116 TotalCycles += BlockingCycles;
117
118 if (PrioritySUs.empty()) {
119 PrioritySUs.insert(X: SU);
120 return;
121 }
122 unsigned SUDepth = SU->getDepth();
123 unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
124 if (SUDepth > CurrDepth)
125 return;
126
127 if (SUDepth == CurrDepth) {
128 PrioritySUs.insert(X: SU);
129 return;
130 }
131
132 // SU is lower depth and should be prioritized.
133 PrioritySUs.clear();
134 PrioritySUs.insert(X: SU);
135}
136
137void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) {
138 // We may want to ignore some HWUIs (e.g. InstructionFlavor::Other). To do so,
139 // we just clear the HWUI. However, we still have instructions which map to
140 // this HWUI. Don't bother managing the state for these HWUI.
141 if (TotalCycles == 0)
142 return;
143
144 AllSUs.remove(X: SU);
145 PrioritySUs.remove(X: SU);
146
147 TotalCycles -= BlockingCycles;
148
149 if (AllSUs.empty())
150 return;
151 if (PrioritySUs.empty()) {
152 for (auto SU : AllSUs) {
153 if (PrioritySUs.empty()) {
154 PrioritySUs.insert(X: SU);
155 continue;
156 }
157 unsigned SUDepth = SU->getDepth();
158 unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
159 if (SUDepth > CurrDepth)
160 continue;
161
162 if (SUDepth == CurrDepth) {
163 PrioritySUs.insert(X: SU);
164 continue;
165 }
166
167 // SU is lower depth and should be prioritized.
168 PrioritySUs.clear();
169 PrioritySUs.insert(X: SU);
170 }
171 }
172}
173
174HardwareUnitInfo *
175CandidateHeuristics::getHWUIFromFlavor(InstructionFlavor Flavor) {
176 for (auto &HWUICand : HWUInfo) {
177 if (HWUICand.getType() == Flavor) {
178 return &HWUICand;
179 }
180 }
181 return nullptr;
182}
183
184unsigned CandidateHeuristics::getHWUICyclesForInst(SUnit *SU) {
185 assert(SchedModel && SchedModel->hasInstrSchedModel());
186 unsigned ReleaseAtCycle = 0;
187 const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
188 for (TargetSchedModel::ProcResIter PI = SchedModel->getWriteProcResBegin(SC),
189 PE = SchedModel->getWriteProcResEnd(SC);
190 PI != PE; ++PI) {
191 ReleaseAtCycle = std::max(a: ReleaseAtCycle, b: (unsigned)PI->ReleaseAtCycle);
192 }
193 return ReleaseAtCycle;
194}
195
196void CandidateHeuristics::updateForScheduling(SUnit *SU) {
197 HardwareUnitInfo *HWUI =
198 getHWUIFromFlavor(Flavor: classifyFlavor(MI: *SU->getInstr(), SII: *SII));
199 assert(HWUI);
200 HWUI->markScheduled(SU, BlockingCycles: getHWUICyclesForInst(SU));
201}
202
203void CandidateHeuristics::initialize(ScheduleDAGMI *SchedDAG,
204 const TargetSchedModel *TargetSchedModel,
205 const TargetRegisterInfo *TRI) {
206 DAG = SchedDAG;
207 SchedModel = TargetSchedModel;
208 assert(SchedModel && SchedModel->hasInstrSchedModel());
209
210 SRI = static_cast<const SIRegisterInfo *>(TRI);
211 SII = static_cast<const SIInstrInfo *>(DAG->TII);
212
213 HWUInfo.resize(N: (int)InstructionFlavor::NUM_FLAVORS);
214
215 for (unsigned I = 0; I < HWUInfo.size(); I++) {
216 HWUInfo[I].reset();
217 HWUInfo[I].setType(I);
218 }
219
220 HWUInfo[(int)InstructionFlavor::WMMA].setProducesCoexecWindow(true);
221 HWUInfo[(int)InstructionFlavor::MultiCycleVALU].setProducesCoexecWindow(true);
222 HWUInfo[(int)InstructionFlavor::TRANS].setProducesCoexecWindow(true);
223
224 collectHWUIPressure();
225}
226
227void CandidateHeuristics::collectHWUIPressure() {
228 if (!SchedModel || !SchedModel->hasInstrSchedModel())
229 return;
230
231 for (auto &SU : DAG->SUnits) {
232 const InstructionFlavor Flavor = classifyFlavor(MI: *SU.getInstr(), SII: *SII);
233 HWUInfo[(int)(Flavor)].insert(SU: &SU, BlockingCycles: getHWUICyclesForInst(SU: &SU));
234 }
235
236 LLVM_DEBUG(dumpRegionSummary());
237}
238
239void CandidateHeuristics::dumpRegionSummary() {
240 MachineBasicBlock *BB = DAG->begin()->getParent();
241 dbgs() << "\n=== Region: " << DAG->MF.getName() << " BB" << BB->getNumber()
242 << " (" << DAG->SUnits.size() << " SUs) ===\n";
243
244 dbgs() << "\nHWUI Resource Pressure:\n";
245 for (auto &HWUI : HWUInfo) {
246 if (HWUI.getTotalCycles() == 0)
247 continue;
248
249 StringRef Name = getFlavorName(F: HWUI.getType());
250 dbgs() << " " << Name << ": " << HWUI.getTotalCycles() << " cycles, "
251 << HWUI.size() << " instrs\n";
252 }
253 dbgs() << "\n";
254}
255
256void CandidateHeuristics::sortHWUIResources() {
257 // Highest priority should be first.
258 llvm::sort(C&: HWUInfo, Comp: [](HardwareUnitInfo &A, HardwareUnitInfo &B) {
259 // Prefer CoexecWindow producers
260 if (A.producesCoexecWindow() != B.producesCoexecWindow())
261 return A.producesCoexecWindow();
262
263 // Prefer more demanded resources
264 if (A.getTotalCycles() != B.getTotalCycles())
265 return A.getTotalCycles() > B.getTotalCycles();
266
267 // In ties -- prefer the resource with more instructions
268 if (A.size() != B.size())
269 return A.size() < B.size();
270
271 // Default to Flavor order
272 return (unsigned)A.getType() < (unsigned)B.getType();
273 });
274}
275
276bool CandidateHeuristics::tryCriticalResourceDependency(
277 GenericSchedulerBase::SchedCandidate &TryCand,
278 GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const {
279
280 auto HasPrioritySU = [this, &Cand, &TryCand](unsigned ResourceIdx) {
281 const HardwareUnitInfo &HWUI = HWUInfo[ResourceIdx];
282
283 auto CandFlavor = classifyFlavor(MI: *Cand.SU->getInstr(), SII: *SII);
284 auto TryCandFlavor = classifyFlavor(MI: *TryCand.SU->getInstr(), SII: *SII);
285 bool LookDeep = (CandFlavor == InstructionFlavor::DS ||
286 TryCandFlavor == InstructionFlavor::DS) &&
287 HWUI.getType() == InstructionFlavor::WMMA;
288 auto *TargetSU = HWUI.getNextTargetSU(LookDeep);
289
290 // If we do not have a TargetSU for this resource, then it is not critical.
291 if (!TargetSU)
292 return false;
293
294 return true;
295 };
296
297 auto TryEnablesResource = [&Cand, &TryCand, this](unsigned ResourceIdx) {
298 const HardwareUnitInfo &HWUI = HWUInfo[ResourceIdx];
299 auto CandFlavor = classifyFlavor(MI: *Cand.SU->getInstr(), SII: *SII);
300
301 // We want to ensure our DS order matches WMMA order.
302 bool LookDeep = CandFlavor == InstructionFlavor::DS &&
303 HWUI.getType() == InstructionFlavor::WMMA;
304 auto *TargetSU = HWUI.getNextTargetSU(LookDeep);
305
306 bool CandEnables =
307 TargetSU != Cand.SU && DAG->IsReachable(SU: TargetSU, TargetSU: Cand.SU);
308 bool TryCandEnables =
309 TargetSU != TryCand.SU && DAG->IsReachable(SU: TargetSU, TargetSU: TryCand.SU);
310
311 if (!CandEnables && !TryCandEnables)
312 return false;
313
314 if (CandEnables && !TryCandEnables) {
315 if (Cand.Reason > GenericSchedulerBase::RegCritical)
316 Cand.Reason = GenericSchedulerBase::RegCritical;
317
318 return true;
319 }
320
321 if (!CandEnables && TryCandEnables) {
322 TryCand.Reason = GenericSchedulerBase::RegCritical;
323 return true;
324 }
325
326 // Both enable, prefer the critical path.
327 unsigned CandHeight = Cand.SU->getHeight();
328 unsigned TryCandHeight = TryCand.SU->getHeight();
329
330 if (CandHeight > TryCandHeight) {
331 if (Cand.Reason > GenericSchedulerBase::RegCritical)
332 Cand.Reason = GenericSchedulerBase::RegCritical;
333
334 return true;
335 }
336
337 if (CandHeight < TryCandHeight) {
338 TryCand.Reason = GenericSchedulerBase::RegCritical;
339 return true;
340 }
341
342 // Same critical path, just prefer original candidate.
343 if (Cand.Reason > GenericSchedulerBase::RegCritical)
344 Cand.Reason = GenericSchedulerBase::RegCritical;
345
346 return true;
347 };
348
349 for (unsigned I = 0; I < HWUInfo.size(); I++) {
350 // If we have encountered a resource that is not critical, then neither
351 // candidate enables a critical resource
352 if (!HasPrioritySU(I))
353 continue;
354
355 bool Enabled = TryEnablesResource(I);
356 // If neither has enabled the resource, continue to the next resource
357 if (Enabled)
358 return true;
359 }
360 return false;
361}
362
363bool CandidateHeuristics::tryCriticalResource(
364 GenericSchedulerBase::SchedCandidate &TryCand,
365 GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const {
366 for (unsigned I = 0; I < HWUInfo.size(); I++) {
367 const HardwareUnitInfo &HWUI = HWUInfo[I];
368
369 bool CandUsesCrit = HWUI.contains(SU: Cand.SU);
370 bool TryCandUsesCrit = HWUI.contains(SU: TryCand.SU);
371
372 if (!CandUsesCrit && !TryCandUsesCrit)
373 continue;
374
375 if (CandUsesCrit != TryCandUsesCrit) {
376 if (CandUsesCrit) {
377 if (Cand.Reason > GenericSchedulerBase::RegCritical)
378 Cand.Reason = GenericSchedulerBase::RegCritical;
379 return true;
380 }
381 TryCand.Reason = GenericSchedulerBase::RegCritical;
382 return true;
383 }
384
385 // Otherwise, both use the critical resource
386 // For longer latency InstructionFlavors, we should prioritize first by
387 // their enablement of critical resources
388 if (HWUI.getType() == InstructionFlavor::DS) {
389 if (tryCriticalResourceDependency(TryCand, Cand, Zone))
390 return true;
391 }
392
393 // Prioritize based on HWUI priorities.
394 SUnit *Match = HWUI.getHigherPriority(SU: Cand.SU, Other: TryCand.SU);
395 if (Match) {
396 if (Match == Cand.SU) {
397 if (Cand.Reason > GenericSchedulerBase::RegCritical)
398 Cand.Reason = GenericSchedulerBase::RegCritical;
399 return true;
400 }
401 TryCand.Reason = GenericSchedulerBase::RegCritical;
402 return true;
403 }
404 }
405
406 return false;
407}
408
409AMDGPUCoExecSchedStrategy::AMDGPUCoExecSchedStrategy(
410 const MachineSchedContext *C)
411 : GCNSchedStrategy(C) {
412 SchedStages.push_back(Elt: GCNSchedStageID::ILPInitialSchedule);
413 SchedStages.push_back(Elt: GCNSchedStageID::PreRARematerialize);
414 // Use more accurate GCN pressure trackers.
415 UseGCNTrackers = true;
416}
417
418void AMDGPUCoExecSchedStrategy::initPolicy(MachineBasicBlock::iterator Begin,
419 MachineBasicBlock::iterator End,
420 unsigned NumRegionInstrs) {
421 GCNSchedStrategy::initPolicy(Begin, End, NumRegionInstrs);
422 assert((PreRADirection == MISched::Unspecified ||
423 PreRADirection == MISched::TopDown) &&
424 "coexec scheduler only supports top-down scheduling");
425 RegionPolicy.OnlyTopDown = true;
426 RegionPolicy.OnlyBottomUp = false;
427}
428
429void AMDGPUCoExecSchedStrategy::initialize(ScheduleDAGMI *DAG) {
430 // Coexecution scheduling strategy is only done top-down to support new
431 // resource balancing heuristics.
432 RegionPolicy.OnlyTopDown = true;
433 RegionPolicy.OnlyBottomUp = false;
434
435 GCNSchedStrategy::initialize(DAG);
436 Heurs.initialize(SchedDAG: DAG, TargetSchedModel: SchedModel, TRI);
437}
438
439void AMDGPUCoExecSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
440 Heurs.updateForScheduling(SU);
441 GCNSchedStrategy::schedNode(SU, IsTopNode);
442}
443
/// Pick the next node to schedule. The coexec strategy is strictly top-down,
/// so \p IsTopNode is always set to true on a successful pick; returns
/// nullptr once the region is exhausted.
SUnit *AMDGPUCoExecSchedStrategy::pickNode(bool &IsTopNode) {
  assert(RegionPolicy.OnlyTopDown && !RegionPolicy.OnlyBottomUp &&
         "coexec scheduler only supports top-down scheduling");

  // Region exhausted: every boundary queue must already be drained.
  if (DAG->top() == DAG->bottom()) {
    assert(Top.Available.empty() && Top.Pending.empty() &&
           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
    return nullptr;
  }

  bool PickedPending = false;
  SUnit *SU = nullptr;
#ifndef NDEBUG
  SchedCandidate *PickedCand = nullptr;
#endif
  // Repeat until the pick is an SU that has not already been scheduled.
  do {
    PickedPending = false;
    // Fast path: take the boundary's sole viable candidate when one exists.
    SU = pickOnlyChoice(Zone&: Top);
    if (!SU) {
      // Slow path: full heuristic comparison over the available and pending
      // queues of the top boundary.
      CandPolicy NoPolicy;
      TopCand.reset(NewPolicy: NoPolicy);
      pickNodeFromQueue(Zone&: Top, ZonePolicy: NoPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand,
                        PickedPending, /*IsBottomUp=*/false);
      assert(TopCand.Reason != NoCand && "failed to find a candidate");
      SU = TopCand.SU;
#ifndef NDEBUG
      PickedCand = &TopCand;
#endif
    }
    IsTopNode = true;
  } while (SU->isScheduled);

  LLVM_DEBUG(if (PickedCand) dumpPickSummary(SU, IsTopNode, *PickedCand));

  // A winner from the pending queue is not issuable in the current cycle:
  // advance to its ready cycle, step past any remaining hazards, and then
  // re-release pending instructions at the new cycle.
  if (PickedPending) {
    unsigned ReadyCycle = SU->TopReadyCycle;
    unsigned CurrentCycle = Top.getCurrCycle();
    if (ReadyCycle > CurrentCycle)
      Top.bumpCycle(NextCycle: ReadyCycle);

    // checkHazard() does not expose the exact cycle where the hazard clears.
    while (Top.checkHazard(SU))
      Top.bumpCycle(NextCycle: Top.getCurrCycle() + 1);

    Top.releasePending();
  }

  // Drop the pick from whichever boundary queues still reference it.
  if (SU->isTopReady())
    Top.removeReady(SU);
  if (SU->isBottomReady())
    Bot.removeReady(SU);

  LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                    << *SU->getInstr());

  assert(IsTopNode && "coexec scheduler must only schedule from top boundary");
  return SU;
}
502
503void AMDGPUCoExecSchedStrategy::pickNodeFromQueue(
504 SchedBoundary &Zone, const CandPolicy &ZonePolicy,
505 const RegPressureTracker &RPTracker, SchedCandidate &Cand,
506 bool &PickedPending, bool IsBottomUp) {
507 assert(Zone.isTop() && "coexec scheduler only supports top boundary");
508 assert(!IsBottomUp && "coexec scheduler only supports top-down scheduling");
509
510 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
511 ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
512 unsigned SGPRPressure = 0;
513 unsigned VGPRPressure = 0;
514 PickedPending = false;
515 if (DAG->isTrackingPressure()) {
516 if (!useGCNTrackers()) {
517 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
518 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
519 } else {
520 SGPRPressure = DownwardTracker.getPressure().getSGPRNum();
521 VGPRPressure = DownwardTracker.getPressure().getArchVGPRNum();
522 }
523 }
524
525 auto EvaluateQueue = [&](ReadyQueue &Q, bool FromPending) {
526 for (SUnit *SU : Q) {
527 SchedCandidate TryCand(ZonePolicy);
528 initCandidate(Cand&: TryCand, SU, AtTop: Zone.isTop(), RPTracker, SRI, SGPRPressure,
529 VGPRPressure, IsBottomUp);
530 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
531 tryCandidateCoexec(Cand, TryCand, Zone: ZoneArg);
532 if (TryCand.Reason != NoCand) {
533 if (TryCand.ResDelta == SchedResourceDelta())
534 TryCand.initResourceDelta(DAG: Zone.DAG, SchedModel);
535 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
536 PickedPending = FromPending;
537 Cand.setBest(TryCand);
538 } else {
539 LLVM_DEBUG(printCandidateDecision(TryCand, Cand));
540 }
541 }
542 };
543
544 LLVM_DEBUG(dbgs() << "Available Q:\n");
545 EvaluateQueue(Zone.Available, /*FromPending=*/false);
546
547 LLVM_DEBUG(dbgs() << "Pending Q:\n");
548 EvaluateQueue(Zone.Pending, /*FromPending=*/true);
549}
550
551#ifndef NDEBUG
552void AMDGPUCoExecSchedStrategy::dumpPickSummary(SUnit *SU, bool IsTopNode,
553 SchedCandidate &Cand) {
554 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
555 unsigned Cycle = IsTopNode ? Top.getCurrCycle() : Bot.getCurrCycle();
556
557 dbgs() << "=== Pick @ Cycle " << Cycle << " ===\n";
558
559 const InstructionFlavor Flavor = classifyFlavor(*SU->getInstr(), *SII);
560 dbgs() << "Picked: SU(" << SU->NodeNum << ") ";
561 SU->getInstr()->print(dbgs(), /*IsStandalone=*/true, /*SkipOpers=*/false,
562 /*SkipDebugLoc=*/true);
563 dbgs() << " [" << getFlavorName(Flavor) << "]\n";
564
565 dbgs() << " Reason: ";
566 if (LastAMDGPUReason != AMDGPUSchedReason::None)
567 dbgs() << getReasonName(LastAMDGPUReason);
568 else if (Cand.Reason != NoCand)
569 dbgs() << GenericSchedulerBase::getReasonStr(Cand.Reason);
570 else
571 dbgs() << "Unknown";
572 dbgs() << "\n\n";
573
574 LastAMDGPUReason = AMDGPUSchedReason::None;
575}
576#endif
577
/// Coexec candidate comparison, mirroring the generic tryCandidate but with
/// the coexec resource-balancing heuristics inserted between the stall and
/// cluster checks. Returns true when a decision was reached; the heuristic
/// ordering below IS the policy, so it must not be reordered. When a coexec
/// heuristic decides, LastAMDGPUReason records which one (debug only).
bool AMDGPUCoExecSchedStrategy::tryCandidateCoexec(SchedCandidate &Cand,
                                                   SchedCandidate &TryCand,
                                                   SchedBoundary *Zone) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = FirstValid;
    return true;
  }

  // Bias PhysReg Defs and copies to their uses and defined respectively.
  if (tryGreater(TryVal: biasPhysReg(SU: TryCand.SU, isTop: TryCand.AtTop),
                 CandVal: biasPhysReg(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: PhysReg))
    return TryCand.Reason != NoCand;

  // Avoid exceeding the target's limit.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand,
                  Reason: RegExcess, TRI, MF: DAG->MF))
    return TryCand.Reason != NoCand;

  // We only compare a subset of features when comparing nodes between
  // Top and Bottom boundary. Some properties are simply incomparable, in many
  // other instances we should only override the other boundary if something
  // is a clear good pick on one boundary. Skip heuristics that are more
  // "tie-breaking" in nature.
  bool SameBoundary = Zone != nullptr;
  if (SameBoundary) {
    // Compare candidates by the stall they would introduce if
    // scheduled in the current cycle.
    if (tryEffectiveStall(Cand, TryCand, Zone&: *Zone))
      return TryCand.Reason != NoCand;

    // Re-rank the hardware units by current demand before consulting the
    // coexec heuristics.
    Heurs.sortHWUIResources();
    if (Heurs.tryCriticalResource(TryCand, Cand, Zone)) {
      LastAMDGPUReason = AMDGPUSchedReason::CritResourceBalance;
      return TryCand.Reason != NoCand;
    }

    if (Heurs.tryCriticalResourceDependency(TryCand, Cand, Zone)) {
      LastAMDGPUReason = AMDGPUSchedReason::CritResourceDep;
      return TryCand.Reason != NoCand;
    }
  }

  // Keep clustered nodes together to encourage downstream peephole
  // optimizations which may reduce resource requirements.
  //
  // This is a best effort to set things up for a post-RA pass. Optimizations
  // like generating loads of multiple registers should ideally be done within
  // the scheduler pass by combining the loads during DAG postprocessing.
  unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
  unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
  bool CandIsClusterSucc =
      isTheSameCluster(A: CandZoneCluster, B: Cand.SU->ParentClusterIdx);
  bool TryCandIsClusterSucc =
      isTheSameCluster(A: TryCandZoneCluster, B: TryCand.SU->ParentClusterIdx);

  if (tryGreater(TryVal: TryCandIsClusterSucc, CandVal: CandIsClusterSucc, TryCand, Cand,
                 Reason: Cluster))
    return TryCand.Reason != NoCand;

  if (SameBoundary) {
    // Weak edges are for clustering and other constraints.
    if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop),
                CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak))
      return TryCand.Reason != NoCand;
  }

  // Avoid increasing the max pressure of the entire region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand,
                  Cand, Reason: RegMax, TRI, MF: DAG->MF))
    return TryCand.Reason != NoCand;

  if (SameBoundary) {
    // Avoid serializing long latency dependence chains.
    // For acyclic path limited loops, latency was already checked above.
    if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
        !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone&: *Zone))
      return TryCand.Reason != NoCand;

    // Fall through to original instruction order.
    if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
        (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
      TryCand.Reason = NodeOrder;
      return true;
    }
  }

  // No heuristic distinguished the candidates; keep the incumbent.
  return false;
}
669
670bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
671 SchedCandidate &TryCand,
672 SchedBoundary &Zone) const {
673 // Treat structural and latency stalls as a single scheduling cost for the
674 // current cycle.
675 struct StallCosts {
676 unsigned Ready = 0;
677 unsigned Structural = 0;
678 unsigned Latency = 0;
679 unsigned Effective = 0;
680 };
681
682 unsigned CurrCycle = Zone.getCurrCycle();
683 auto GetStallCosts = [&](SUnit *SU) {
684 unsigned ReadyCycle = Zone.isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
685 StallCosts Costs;
686 Costs.Ready = ReadyCycle > CurrCycle ? ReadyCycle - CurrCycle : 0;
687 Costs.Structural = getStructuralStallCycles(Zone, SU);
688 Costs.Latency = Zone.getLatencyStallCycles(SU);
689 Costs.Effective = std::max(l: {Costs.Ready, Costs.Structural, Costs.Latency});
690 return Costs;
691 };
692
693 StallCosts TryCosts = GetStallCosts(TryCand.SU);
694 StallCosts CandCosts = GetStallCosts(Cand.SU);
695
696 LLVM_DEBUG(if (TryCosts.Effective || CandCosts.Effective) {
697 dbgs() << "Effective stalls: try=" << TryCosts.Effective
698 << " (ready=" << TryCosts.Ready << ", struct=" << TryCosts.Structural
699 << ", lat=" << TryCosts.Latency << ") cand=" << CandCosts.Effective
700 << " (ready=" << CandCosts.Ready
701 << ", struct=" << CandCosts.Structural
702 << ", lat=" << CandCosts.Latency << ")\n";
703 });
704
705 return tryLess(TryVal: TryCosts.Effective, CandVal: CandCosts.Effective, TryCand, Cand, Reason: Stall);
706}
707
708ScheduleDAGInstrs *
709llvm::createGCNCoExecMachineScheduler(MachineSchedContext *C) {
710 LLVM_DEBUG(dbgs() << "AMDGPU coexec preRA scheduler selected for "
711 << C->MF->getName() << '\n');
712 return new GCNScheduleDAGMILive(
713 C, std::make_unique<AMDGPUCoExecSchedStrategy>(args&: C));
714}
715
716ScheduleDAGInstrs *
717llvm::createGCNNoopPostMachineScheduler(MachineSchedContext *C) {
718 LLVM_DEBUG(dbgs() << "AMDGPU nop postRA scheduler selected for "
719 << C->MF->getName() << '\n');
720 return new GCNNoopPostScheduleDAG(C);
721}
722