1//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This contains a MachineSchedStrategy implementation for maximizing wave
11/// occupancy on GCN hardware.
12///
13/// This pass will apply multiple scheduling stages to the same function.
14/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15/// entry point for the scheduling of those regions is
16/// GCNScheduleDAGMILive::runSchedStages.
17
18/// Generally, the reason for having multiple scheduling stages is to account
19/// for the kernel-wide effect of register usage on occupancy. Usually, only a
20/// few scheduling regions will have register pressure high enough to limit
21/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22/// other regions.
23///
24//===----------------------------------------------------------------------===//
25
26#include "GCNSchedStrategy.h"
27#include "AMDGPUIGroupLP.h"
28#include "GCNHazardRecognizer.h"
29#include "GCNRegPressure.h"
30#include "SIMachineFunctionInfo.h"
31#include "Utils/AMDGPUBaseInfo.h"
32#include "llvm/ADT/BitVector.h"
33#include "llvm/ADT/STLExtras.h"
34#include "llvm/CodeGen/CalcSpillWeights.h"
35#include "llvm/CodeGen/MachineBasicBlock.h"
36#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
37#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
38#include "llvm/CodeGen/MachineCycleAnalysis.h"
39#include "llvm/CodeGen/MachineOperand.h"
40#include "llvm/CodeGen/RegisterClassInfo.h"
41#include "llvm/MC/LaneBitmask.h"
42#include "llvm/MC/MCInstrItineraries.h"
43#include "llvm/MC/MCSchedule.h"
44#include "llvm/MC/TargetRegistry.h"
45#include "llvm/Support/ErrorHandling.h"
46
47#define DEBUG_TYPE "machine-scheduler"
48
49using namespace llvm;
50
// Skip the "unclustered high register pressure" rescheduling stage.
static cl::opt<bool> DisableUnclusterHighRP(
    "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
    cl::desc("Disable unclustered high register pressure "
             "reduction scheduling stage."),
    cl::init(Val: false));

// Skip the "clustered low occupancy" ILP rescheduling stage.
static cl::opt<bool> DisableClusteredLowOccupancy(
    "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
    cl::desc("Disable clustered low occupancy "
             "rescheduling for ILP scheduling stage."),
    cl::init(Val: false));

// Weight of occupancy vs latency when comparing schedule metrics; 100 means
// occupancy only.
static cl::opt<unsigned> ScheduleMetricBias(
    "amdgpu-schedule-metric-bias", cl::Hidden,
    cl::desc(
        "Sets the bias which adds weight to occupancy vs latency. Set it to "
        "100 to chase the occupancy only."),
    cl::init(Val: 10));

// When set, initialize() targets the minimum allowed occupancy instead of
// the function's computed occupancy (see TargetOccupancy selection below).
static cl::opt<bool>
    RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
               cl::desc("Relax occupancy targets for kernels which are memory "
                        "bound (amdgpu-membound-threshold), or "
                        "Wave Limited (amdgpu-limit-wave-threshold)."),
               cl::init(Val: false));

// Opt into the AMDGPU-specific GCNDownwardRPTracker/GCNUpwardRPTracker
// instead of the generic RegPressureTracker (see useGCNTrackers() callers).
static cl::opt<bool> GCNTrackers(
    "amdgpu-use-amdgpu-trackers", cl::Hidden,
    cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
    cl::init(Val: false));

// Upper bound on Available+Pending queue size for which the pending queue is
// also inspected when picking a node; 0 effectively disables inspection
// (see shouldCheckPending()).
static cl::opt<unsigned> PendingQueueLimit(
    "amdgpu-scheduler-pending-queue-limit", cl::Hidden,
    cl::desc(
        "Max (Available+Pending) size to inspect pending queue (0 disables)"),
    cl::init(Val: 256));

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Gates compilation of the max-RP register-usage dumping support in this
// file (debug/dump builds only).
#define DUMP_MAX_REG_PRESSURE
static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler(
    "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden,
    cl::desc("Print a list of live registers along with their def/uses at the "
             "point of maximum register pressure before scheduling."),
    cl::init(false));

static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler(
    "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden,
    cl::desc("Print a list of live registers along with their def/uses at the "
             "point of maximum register pressure after scheduling."),
    cl::init(false));
#endif
102
103static cl::opt<bool> DisableRewriteMFMAFormSchedStage(
104 "amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden,
105 cl::desc("Disable rewrite mfma rewrite scheduling stage"), cl::init(Val: true));
106
// Fixed-point scale applied to ScheduleMetrics values (100 => values behave
// like integer percentages).
const unsigned ScheduleMetrics::ScaleFactor = 100;
108
// Construct the strategy. Both GCN RP trackers are bound to LiveIntervals up
// front; TargetOccupancy stays 0 until initialize() computes it per function.
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
    : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
      DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
  // Only honor -amdgpu-use-amdgpu-trackers when it was explicitly given on
  // the command line.
  if (GCNTrackers.getNumOccurrences() > 0)
    GCNTrackersOverride = GCNTrackers;
}
115
// Per-region initialization: caches the function, then derives the SGPR/VGPR
// "excess" limits (all allocatable registers — crossing them risks spilling)
// and "critical" limits (register counts that would reduce the target wave
// occupancy) used by the candidate-comparison heuristics.
void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
  GenericScheduler::initialize(dag: DAG);

  MF = &DAG->MF;

  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();

  // Excess limits: everything the allocator could hand out.
  SGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(RC: &AMDGPU::SGPR_32RegClass);
  VGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(RC: &AMDGPU::VGPR_32RegClass);

  SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  // Set the initial TargetOccupancy to the maximum occupancy that we can
  // achieve for this function. This effectively sets a lower bound on the
  // 'Critical' register limits in the scheduler.
  // Allow for lower occupancy targets if kernel is wave limited or memory
  // bound, and using the relaxed occupancy feature.
  TargetOccupancy =
      RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
  SGPRCriticalLimit =
      std::min(a: ST.getMaxNumSGPRs(WavesPerEU: TargetOccupancy, Addressable: true), b: SGPRExcessLimit);

  if (!KnownExcessRP) {
    VGPRCriticalLimit = std::min(
        a: ST.getMaxNumVGPRs(WavesPerEU: TargetOccupancy, DynamicVGPRBlockSize: MFI.getDynamicVGPRBlockSize()),
        b: VGPRExcessLimit);
  } else {
    // This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except
    // returns a reasonably small number for targets with lots of VGPRs, such
    // as GFX10 and GFX11.
    LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
                         "VGPRCriticalLimit calculation method.\n");
    unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
    unsigned Granule =
        AMDGPU::IsaInfo::getVGPRAllocGranule(STI: &ST, DynamicVGPRBlockSize);
    unsigned Addressable =
        AMDGPU::IsaInfo::getAddressableNumVGPRs(STI: &ST, DynamicVGPRBlockSize);
    // Budget: an even per-wave split of the addressable VGPRs, rounded down
    // to the allocation granule but never below one granule.
    unsigned VGPRBudget = alignDown(Value: Addressable / TargetOccupancy, Align: Granule);
    VGPRBudget = std::max(a: VGPRBudget, b: Granule);
    VGPRCriticalLimit = std::min(a: VGPRBudget, b: VGPRExcessLimit);
  }

  // Subtract error margin and bias from register limits and avoid overflow.
  SGPRCriticalLimit -= std::min(a: SGPRLimitBias + ErrorMargin, b: SGPRCriticalLimit);
  VGPRCriticalLimit -= std::min(a: VGPRLimitBias + ErrorMargin, b: VGPRCriticalLimit);
  SGPRExcessLimit -= std::min(a: SGPRLimitBias + ErrorMargin, b: SGPRExcessLimit);
  VGPRExcessLimit -= std::min(a: VGPRLimitBias + ErrorMargin, b: VGPRExcessLimit);
  LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
                    << ", VGPRExcessLimit = " << VGPRExcessLimit
                    << ", SGPRCriticalLimit = " << SGPRCriticalLimit
                    << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
}
169
170/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
171/// current register pressure.
172///
173/// This works for the common case, but it has a few exceptions that have been
174/// observed through trial and error:
175/// - Explicit physical register operands
176/// - Subregister definitions
177///
178/// In both of those cases, PressureDiff doesn't represent the actual pressure,
179/// and querying LiveIntervals through the RegPressureTracker is needed to get
180/// an accurate value.
181///
182/// We should eventually only use PressureDiff for maximum performance, but this
183/// already allows 80% of SUs to take the fast path without changing scheduling
184/// at all. Further changes would either change scheduling, or require a lot
185/// more logic to recover an accurate pressure estimate from the PressureDiffs.
186static bool canUsePressureDiffs(const SUnit &SU) {
187 if (!SU.isInstr())
188 return false;
189
190 // Cannot use pressure diffs for subregister defs or with physregs, it's
191 // imprecise in both cases.
192 for (const auto &Op : SU.getInstr()->operands()) {
193 if (!Op.isReg() || Op.isImplicit())
194 continue;
195 if (Op.getReg().isPhysical() ||
196 (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
197 return false;
198 }
199 return true;
200}
201
202void GCNSchedStrategy::getRegisterPressures(
203 bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
204 std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
205 GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
206 ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
207 // getDownwardPressure() and getUpwardPressure() make temporary changes to
208 // the tracker, so we need to pass those function a non-const copy.
209 RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
210 if (!useGCNTrackers()) {
211 AtTop
212 ? TempTracker.getDownwardPressure(MI: SU->getInstr(), PressureResult&: Pressure, MaxPressureResult&: MaxPressure)
213 : TempTracker.getUpwardPressure(MI: SU->getInstr(), PressureResult&: Pressure, MaxPressureResult&: MaxPressure);
214
215 return;
216 }
217
218 // GCNTrackers
219 Pressure.resize(new_size: 4, x: 0);
220 MachineInstr *MI = SU->getInstr();
221 GCNRegPressure NewPressure;
222 if (AtTop) {
223 GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
224 NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, TRI: SRI);
225 } else {
226 GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
227 TempUpwardTracker.recede(MI: *MI);
228 NewPressure = TempUpwardTracker.getPressure();
229 }
230 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
231 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
232 NewPressure.getArchVGPRNum();
233 Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
234}
235
// Estimate how many cycles \p SU would stall \p Zone for structural reasons:
// reserved processor resources that are not yet free again, plus
// sequence-dependent hazard wait states from the hazard recognizer.
// Returns 0 for a null SU or for bottom-up zones (not implemented).
unsigned GCNSchedStrategy::getStructuralStallCycles(SchedBoundary &Zone,
                                                    SUnit *SU) const {
  // Only implemented for top-down scheduling currently.
  if (!Zone.isTop() || !SU)
    return 0;

  MachineInstr *MI = SU->getInstr();
  unsigned CurrCycle = Zone.getCurrCycle();
  unsigned Stall = 0;

  // Query SchedModel for resource stalls (unbuffered resources).
  if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
    const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
    for (const MCWriteProcResEntry &PE :
         make_range(x: SchedModel->getWriteProcResBegin(SC),
                    y: SchedModel->getWriteProcResEnd(SC))) {
      // Stall until the latest cycle at which every resource this
      // instruction writes becomes available.
      unsigned NextAvail =
          Zone.getNextResourceCycle(SC, PIdx: PE.ProcResourceIdx, ReleaseAtCycle: PE.ReleaseAtCycle,
                                    AcquireAtCycle: PE.AcquireAtCycle)
              .first;
      if (NextAvail > CurrCycle)
        Stall = std::max(a: Stall, b: NextAvail - CurrCycle);
    }
  }

  // Query HazardRecognizer for sequence-dependent hazard penalties.
  // AMDGPU currently installs GCNHazardRecognizer for MI scheduling only in
  // the post-RA configuration without vreg liveness.
  if (!DAG->hasVRegLiveness() && Zone.HazardRec &&
      Zone.HazardRec->isEnabled()) {
    auto *HR = static_cast<GCNHazardRecognizer *>(Zone.HazardRec);
    Stall = std::max(a: Stall, b: HR->getHazardWaitStates(MI));
  }

  return Stall;
}
272
// Initialize \p Cand with \p SU and, when pressure tracking is enabled,
// compute the candidate's register-pressure deltas (RPDelta.Excess and
// RPDelta.CriticalMax) against the SGPR/VGPR excess and critical limits
// established in initialize().
void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
                                     bool AtTop,
                                     const RegPressureTracker &RPTracker,
                                     const SIRegisterInfo *SRI,
                                     unsigned SGPRPressure,
                                     unsigned VGPRPressure, bool IsBottomUp) {
  Cand.SU = SU;
  Cand.AtTop = AtTop;

  if (!DAG->isTrackingPressure())
    return;

  Pressure.clear();
  MaxPressure.clear();

  // We try to use the cached PressureDiffs in the ScheduleDAG whenever
  // possible over querying the RegPressureTracker.
  //
  // RegPressureTracker will make a lot of LIS queries which are very
  // expensive, it is considered a slow function in this context.
  //
  // PressureDiffs are precomputed and cached, and getPressureDiff is just a
  // trivial lookup into an array. It is pretty much free.
  //
  // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
  // PressureDiffs.
  if (AtTop || !canUsePressureDiffs(SU: *SU) || useGCNTrackers()) {
    getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
                         DownwardTracker, UpwardTracker, DAG, SRI);
  } else {
    // Reserve 4 slots.
    Pressure.resize(new_size: 4, x: 0);
    Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
    Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;

    for (const auto &Diff : DAG->getPressureDiff(SU)) {
      if (!Diff.isValid())
        continue;
      // PressureDiffs is always bottom-up so if we're working top-down we need
      // to invert its sign.
      Pressure[Diff.getPSet()] +=
          (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
    }

#ifdef EXPENSIVE_CHECKS
    std::vector<unsigned> CheckPressure, CheckMaxPressure;
    getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
                         DownwardTracker, UpwardTracker, DAG, SRI);
    if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
            CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
        Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
            CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
      errs() << "Register Pressure is inaccurate when calculated through "
                "PressureDiff\n"
             << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
             << ", expected "
             << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
             << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
             << ", expected "
             << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
      report_fatal_error("inaccurate register pressure calculation");
    }
#endif
  }

  unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
  unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];

  // If two instructions increase the pressure of different register sets
  // by the same amount, the generic scheduler will prefer to schedule the
  // instruction that increases the set with the least amount of registers,
  // which in our case would be SGPRs. This is rarely what we want, so
  // when we report excess/critical register pressure, we do it either
  // only for VGPRs or only for SGPRs.

  // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
  const unsigned MaxVGPRPressureInc = 16;
  bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
  bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;

  // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
  // to increase the likelihood we don't go over the limits. We should improve
  // the analysis to look through dependencies to find the path with the least
  // register pressure.

  // We only need to update the RPDelta for instructions that increase register
  // pressure. Instructions that decrease or keep reg pressure the same will be
  // marked as RegExcess in tryCandidate() when they are compared with
  // instructions that increase the register pressure.
  if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
    HasHighPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
    Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
  }

  if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
    HasHighPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
    Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
  }

  // Register pressure is considered 'CRITICAL' if it is approaching a value
  // that would reduce the wave occupancy for the execution unit. When
  // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
  // has the same cost, so we don't need to prefer one over the other.

  int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
  int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;

  if (SGPRDelta >= 0 || VGPRDelta >= 0) {
    HasHighPressure = true;
    // Report whichever register set is deeper into critical territory.
    if (SGPRDelta > VGPRDelta) {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
      Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
    } else {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
      Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
    }
  }
}
395
396static bool shouldCheckPending(SchedBoundary &Zone,
397 const TargetSchedModel *SchedModel) {
398 bool HasBufferedModel =
399 SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize();
400 unsigned Combined = Zone.Available.size() + Zone.Pending.size();
401 return Combined <= PendingQueueLimit && HasBufferedModel;
402}
403
404static SUnit *pickOnlyChoice(SchedBoundary &Zone,
405 const TargetSchedModel *SchedModel) {
406 // pickOnlyChoice() releases pending instructions and checks for new hazards.
407 SUnit *OnlyChoice = Zone.pickOnlyChoice();
408 if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty())
409 return OnlyChoice;
410
411 return nullptr;
412}
413
// Debug-only helper: dump which candidate was preferred, which one lost, and
// the deciding heuristic. The whole body is inside LLVM_DEBUG and compiles
// away in release builds.
void GCNSchedStrategy::printCandidateDecision(const SchedCandidate &Current,
                                              const SchedCandidate &Preferred) {
  LLVM_DEBUG({
    dbgs() << "Prefer:\t\t";
    DAG->dumpNode(*Preferred.SU);

    // Current.SU may be null when Preferred is the first valid candidate.
    if (Current.SU) {
      dbgs() << "Not:\t";
      DAG->dumpNode(*Current.SU);
    }

    dbgs() << "Reason:\t\t";
    traceCandidate(Preferred);
  });
}
429
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue()
//
// Scans \p Zone's Available queue (and, when shouldCheckPending() allows,
// its Pending queue) comparing every node against \p Cand and keeping the
// winner in \p Cand. \p IsPending is set to true when the current best
// candidate was taken from the pending queue.
void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                                         const CandPolicy &ZonePolicy,
                                         const RegPressureTracker &RPTracker,
                                         SchedCandidate &Cand, bool &IsPending,
                                         bool IsBottomUp) {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
  ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
  unsigned SGPRPressure = 0;
  unsigned VGPRPressure = 0;
  IsPending = false;
  if (DAG->isTrackingPressure()) {
    if (!useGCNTrackers()) {
      SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
      VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
    } else {
      // Read current pressure from the GCN tracker that matches the
      // scheduling direction.
      GCNRPTracker *T = IsBottomUp
                            ? static_cast<GCNRPTracker *>(&UpwardTracker)
                            : static_cast<GCNRPTracker *>(&DownwardTracker);
      SGPRPressure = T->getPressure().getSGPRNum();
      VGPRPressure = T->getPressure().getArchVGPRNum();
    }
  }
  LLVM_DEBUG(dbgs() << "Available Q:\n");
  ReadyQueue &AQ = Zone.Available;
  for (SUnit *SU : AQ) {

    SchedCandidate TryCand(ZonePolicy);
    initCandidate(Cand&: TryCand, SU, AtTop: Zone.isTop(), RPTracker, SRI, SGPRPressure,
                  VGPRPressure, IsBottomUp);
    // Pass SchedBoundary only when comparing nodes from the same boundary.
    SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
    tryCandidate(Cand, TryCand, Zone: ZoneArg);
    if (TryCand.Reason != NoCand) {
      // Initialize resource delta if needed in case future heuristics query it.
      if (TryCand.ResDelta == SchedResourceDelta())
        TryCand.initResourceDelta(DAG: Zone.DAG, SchedModel);
      LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
      Cand.setBest(TryCand);
    } else {
      printCandidateDecision(Current: TryCand, Preferred: Cand);
    }
  }

  // Only look at not-yet-ready instructions when the model is buffered and
  // the combined queues are small enough.
  if (!shouldCheckPending(Zone, SchedModel))
    return;

  LLVM_DEBUG(dbgs() << "Pending Q:\n");
  ReadyQueue &PQ = Zone.Pending;
  for (SUnit *SU : PQ) {

    SchedCandidate TryCand(ZonePolicy);
    initCandidate(Cand&: TryCand, SU, AtTop: Zone.isTop(), RPTracker, SRI, SGPRPressure,
                  VGPRPressure, IsBottomUp);
    // Pass SchedBoundary only when comparing nodes from the same boundary.
    SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
    // Pending nodes use the reduced heuristic set (no latency heuristics).
    tryPendingCandidate(Cand, TryCand, Zone: ZoneArg);
    if (TryCand.Reason != NoCand) {
      // Initialize resource delta if needed in case future heuristics query it.
      if (TryCand.ResDelta == SchedResourceDelta())
        TryCand.initResourceDelta(DAG: Zone.DAG, SchedModel);
      LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
      IsPending = true;
      Cand.setBest(TryCand);
    } else {
      printCandidateDecision(Current: TryCand, Preferred: Cand);
    }
  }
}
500
501// This function is mostly cut and pasted from
502// GenericScheduler::pickNodeBidirectional()
503SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode,
504 bool &PickedPending) {
505 // Schedule as far as possible in the direction of no choice. This is most
506 // efficient, but also provides the best heuristics for CriticalPSets.
507 if (SUnit *SU = pickOnlyChoice(Zone&: Bot, SchedModel)) {
508 IsTopNode = false;
509 return SU;
510 }
511 if (SUnit *SU = pickOnlyChoice(Zone&: Top, SchedModel)) {
512 IsTopNode = true;
513 return SU;
514 }
515 // Set the bottom-up policy based on the state of the current bottom zone
516 // and the instructions outside the zone, including the top zone.
517 CandPolicy BotPolicy;
518 setPolicy(Policy&: BotPolicy, /*IsPostRA=*/false, CurrZone&: Bot, OtherZone: &Top);
519 // Set the top-down policy based on the state of the current top zone and
520 // the instructions outside the zone, including the bottom zone.
521 CandPolicy TopPolicy;
522 setPolicy(Policy&: TopPolicy, /*IsPostRA=*/false, CurrZone&: Top, OtherZone: &Bot);
523
524 bool BotPending = false;
525 // See if BotCand is still valid (because we previously scheduled from Top).
526 LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
527 if (!BotCand.isValid() || BotCand.SU->isScheduled ||
528 BotCand.Policy != BotPolicy) {
529 BotCand.reset(NewPolicy: CandPolicy());
530 pickNodeFromQueue(Zone&: Bot, ZonePolicy: BotPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand,
531 IsPending&: BotPending,
532 /*IsBottomUp=*/true);
533 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
534 } else {
535 LLVM_DEBUG(traceCandidate(BotCand));
536#ifndef NDEBUG
537 if (VerifyScheduling) {
538 SchedCandidate TCand;
539 TCand.reset(CandPolicy());
540 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
541 BotPending,
542 /*IsBottomUp=*/true);
543 assert(TCand.SU == BotCand.SU &&
544 "Last pick result should correspond to re-picking right now");
545 }
546#endif
547 }
548
549 bool TopPending = false;
550 // Check if the top Q has a better candidate.
551 LLVM_DEBUG(dbgs() << "Picking from Top:\n");
552 if (!TopCand.isValid() || TopCand.SU->isScheduled ||
553 TopCand.Policy != TopPolicy) {
554 TopCand.reset(NewPolicy: CandPolicy());
555 pickNodeFromQueue(Zone&: Top, ZonePolicy: TopPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand,
556 IsPending&: TopPending,
557 /*IsBottomUp=*/false);
558 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
559 } else {
560 LLVM_DEBUG(traceCandidate(TopCand));
561#ifndef NDEBUG
562 if (VerifyScheduling) {
563 SchedCandidate TCand;
564 TCand.reset(CandPolicy());
565 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
566 TopPending,
567 /*IsBottomUp=*/false);
568 assert(TCand.SU == TopCand.SU &&
569 "Last pick result should correspond to re-picking right now");
570 }
571#endif
572 }
573
574 // Pick best from BotCand and TopCand.
575 LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
576 dbgs() << "Bot Cand: "; traceCandidate(BotCand););
577 SchedCandidate Cand = BotPending ? TopCand : BotCand;
578 SchedCandidate TryCand = BotPending ? BotCand : TopCand;
579 PickedPending = BotPending && TopPending;
580
581 TryCand.Reason = NoCand;
582 if (BotPending || TopPending) {
583 PickedPending |= tryPendingCandidate(Cand, TryCand&: TopCand, Zone: nullptr);
584 } else {
585 tryCandidate(Cand, TryCand, Zone: nullptr);
586 }
587
588 if (TryCand.Reason != NoCand) {
589 Cand.setBest(TryCand);
590 }
591
592 LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
593
594 IsTopNode = Cand.AtTop;
595 return Cand.SU;
596}
597
// This function is mostly cut and pasted from
// GenericScheduler::pickNode()
//
// Picks the next node to schedule according to the region policy (top-down
// only, bottom-up only, or bidirectional), handling picks that come from a
// pending queue by advancing the zone's cycle until the node is issuable.
SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
  if (DAG->top() == DAG->bottom()) {
    assert(Top.Available.empty() && Top.Pending.empty() &&
           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
    return nullptr;
  }
  bool PickedPending;
  SUnit *SU;
  do {
    PickedPending = false;
    if (RegionPolicy.OnlyTopDown) {
      SU = pickOnlyChoice(Zone&: Top, SchedModel);
      if (!SU) {
        CandPolicy NoPolicy;
        TopCand.reset(NewPolicy: NoPolicy);
        pickNodeFromQueue(Zone&: Top, ZonePolicy: NoPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand,
                          IsPending&: PickedPending,
                          /*IsBottomUp=*/false);
        assert(TopCand.Reason != NoCand && "failed to find a candidate");
        SU = TopCand.SU;
      }
      IsTopNode = true;
    } else if (RegionPolicy.OnlyBottomUp) {
      SU = pickOnlyChoice(Zone&: Bot, SchedModel);
      if (!SU) {
        CandPolicy NoPolicy;
        BotCand.reset(NewPolicy: NoPolicy);
        pickNodeFromQueue(Zone&: Bot, ZonePolicy: NoPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand,
                          IsPending&: PickedPending,
                          /*IsBottomUp=*/true);
        assert(BotCand.Reason != NoCand && "failed to find a candidate");
        SU = BotCand.SU;
      }
      IsTopNode = false;
    } else {
      SU = pickNodeBidirectional(IsTopNode, PickedPending);
    }
  } while (SU->isScheduled);

  // A pick from the pending queue is not ready at the current cycle: bump
  // the zone to the node's ready cycle, then keep bumping past any remaining
  // hazards so the node can legally issue.
  if (PickedPending) {
    unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
    SchedBoundary &Zone = IsTopNode ? Top : Bot;
    unsigned CurrentCycle = Zone.getCurrCycle();
    if (ReadyCycle > CurrentCycle)
      Zone.bumpCycle(NextCycle: ReadyCycle);

    // FIXME: checkHazard() doesn't give information about which cycle the
    // hazard will resolve so just keep bumping the cycle by 1. This could be
    // made more efficient if checkHazard() returned more details.
    while (Zone.checkHazard(SU))
      Zone.bumpCycle(NextCycle: Zone.getCurrCycle() + 1);

    Zone.releasePending();
  }

  if (SU->isTopReady())
    Top.removeReady(SU);
  if (SU->isBottomReady())
    Bot.removeReady(SU);

  LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                    << *SU->getInstr());
  return SU;
}
664
665void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
666 if (useGCNTrackers()) {
667 MachineInstr *MI = SU->getInstr();
668 IsTopNode ? (void)DownwardTracker.advance(MI, UseInternalIterator: false)
669 : UpwardTracker.recede(MI: *MI);
670 }
671
672 return GenericScheduler::schedNode(SU, IsTopNode);
673}
674
// Returns the stage currently being executed. Only valid after
// advanceStage() has positioned CurrentStage on a real stage.
GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
  assert(CurrentStage && CurrentStage != SchedStages.end());
  return *CurrentStage;
}
679
680bool GCNSchedStrategy::advanceStage() {
681 assert(CurrentStage != SchedStages.end());
682 if (!CurrentStage)
683 CurrentStage = SchedStages.begin();
684 else
685 CurrentStage++;
686
687 return CurrentStage != SchedStages.end();
688}
689
// True when at least one more stage follows the current one.
bool GCNSchedStrategy::hasNextStage() const {
  assert(CurrentStage);
  return std::next(x: CurrentStage) != SchedStages.end();
}
694
// Returns the stage after the current one; callers must check hasNextStage()
// first (asserted here).
GCNSchedStageID GCNSchedStrategy::getNextStage() const {
  assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
  return *std::next(x: CurrentStage);
}
699
// Evaluate \p TryCand (drawn from a pending queue) against the current best
// \p Cand. This is a trimmed-down tryCandidate(): it keeps only the
// heuristics that remain meaningful for instructions that are not ready yet
// (physreg bias, excess/critical pressure, and per-boundary resource
// balance); the latency heuristics from tryCandidate() are not applied here.
// Returns true as soon as one heuristic is decisive; TryCand.Reason then
// records whether TryCand won. Returns false when everything ties.
bool GCNSchedStrategy::tryPendingCandidate(SchedCandidate &Cand,
                                           SchedCandidate &TryCand,
                                           SchedBoundary *Zone) const {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Bias PhysReg Defs and copies to their uses and defined respectively.
  if (tryGreater(TryVal: biasPhysReg(SU: TryCand.SU, isTop: TryCand.AtTop),
                 CandVal: biasPhysReg(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: PhysReg))
    return TryCand.Reason != NoCand;

  // Avoid exceeding the target's limit.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand,
                  Reason: RegExcess, TRI, MF: DAG->MF))
    return TryCand.Reason != NoCand;

  // Avoid increasing the max critical pressure in the scheduled region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax,
                  TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF))
    return TryCand.Reason != NoCand;

  // Resource heuristics only make sense when both candidates come from the
  // same boundary (Zone is null for cross-boundary comparisons).
  bool SameBoundary = Zone != nullptr;
  if (SameBoundary) {
    TryCand.initResourceDelta(DAG, SchedModel);
    if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources,
                TryCand, Cand, Reason: ResourceReduce))
      return TryCand.Reason != NoCand;
    if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources,
                   CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand,
                   Reason: ResourceDemand))
      return TryCand.Reason != NoCand;
  }

  return false;
}
740
// Max-occupancy strategy: runs the full pipeline of scheduling stages, in
// this order.
GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
    const MachineSchedContext *C, bool IsLegacyScheduler)
    : GCNSchedStrategy(C) {
  SchedStages.push_back(Elt: GCNSchedStageID::OccInitialSchedule);
  // The MFMA-form rewrite stage is gated by a flag that defaults to disabled.
  if (!DisableRewriteMFMAFormSchedStage)
    SchedStages.push_back(Elt: GCNSchedStageID::RewriteMFMAForm);
  SchedStages.push_back(Elt: GCNSchedStageID::UnclusteredHighRPReschedule);
  SchedStages.push_back(Elt: GCNSchedStageID::ClusteredLowOccupancyReschedule);
  SchedStages.push_back(Elt: GCNSchedStageID::PreRARematerialize);
  // The legacy scheduler never uses the GCN-specific RP trackers, regardless
  // of the -amdgpu-use-amdgpu-trackers flag.
  if (IsLegacyScheduler)
    GCNTrackersOverride = std::nullopt;
}
753
// Max-ILP strategy: a single ILP-oriented initial scheduling stage.
GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
    : GCNSchedStrategy(C) {
  SchedStages.push_back(Elt: GCNSchedStageID::ILPInitialSchedule);
}
758
/// Compares \p TryCand against \p Cand for the Max-ILP strategy.
///
/// Heuristics are applied in priority order: register-excess pressure,
/// physical-register bias, stall cycles, resource balance, latency, weak
/// edges, clustering, critical/current max pressure, then original
/// instruction order. Note that latency is considered *before* clustering
/// and the region pressure maxima, which biases the schedule towards ILP.
///
/// \param Cand provides the policy and current best candidate.
/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
/// \param Zone describes the scheduled zone that we are extending, or nullptr
///        if Cand is from a different zone than TryCand.
/// \return \c true if TryCand is better than Cand (its Reason is set).
bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
                                          SchedCandidate &TryCand,
                                          SchedBoundary *Zone) const {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Avoid spilling by exceeding the register limit.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand,
                  Reason: RegExcess, TRI, MF: DAG->MF))
    return TryCand.Reason != NoCand;

  // Bias PhysReg Defs and copies to their uses and defined respectively.
  if (tryGreater(TryVal: biasPhysReg(SU: TryCand.SU, isTop: TryCand.AtTop),
                 CandVal: biasPhysReg(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: PhysReg))
    return TryCand.Reason != NoCand;

  // Heuristics below this point only make sense when both candidates come
  // from the same scheduling boundary.
  bool SameBoundary = Zone != nullptr;
  if (SameBoundary) {
    // Prioritize instructions that read unbuffered resources by stall cycles.
    if (tryLess(TryVal: Zone->getLatencyStallCycles(SU: TryCand.SU),
                CandVal: Zone->getLatencyStallCycles(SU: Cand.SU), TryCand, Cand, Reason: Stall))
      return TryCand.Reason != NoCand;

    // Avoid critical resource consumption and balance the schedule.
    TryCand.initResourceDelta(DAG, SchedModel);
    if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources,
                TryCand, Cand, Reason: ResourceReduce))
      return TryCand.Reason != NoCand;
    if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources,
                   CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand,
                   Reason: ResourceDemand))
      return TryCand.Reason != NoCand;

    // Unconditionally try to reduce latency.
    if (tryLatency(TryCand, Cand, Zone&: *Zone))
      return TryCand.Reason != NoCand;

    // Weak edges are for clustering and other constraints.
    if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop),
                CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak))
      return TryCand.Reason != NoCand;
  }

  // Keep clustered nodes together to encourage downstream peephole
  // optimizations which may reduce resource requirements.
  //
  // This is a best effort to set things up for a post-RA pass. Optimizations
  // like generating loads of multiple registers should ideally be done within
  // the scheduler pass by combining the loads during DAG postprocessing.
  unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
  unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
  bool CandIsClusterSucc =
      isTheSameCluster(A: CandZoneCluster, B: Cand.SU->ParentClusterIdx);
  bool TryCandIsClusterSucc =
      isTheSameCluster(A: TryCandZoneCluster, B: TryCand.SU->ParentClusterIdx);
  if (tryGreater(TryVal: TryCandIsClusterSucc, CandVal: CandIsClusterSucc, TryCand, Cand,
                 Reason: Cluster))
    return TryCand.Reason != NoCand;

  // Avoid increasing the max critical pressure in the scheduled region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax,
                  TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF))
    return TryCand.Reason != NoCand;

  // Avoid increasing the max pressure of the entire region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand,
                  Cand, Reason: RegMax, TRI, MF: DAG->MF))
    return TryCand.Reason != NoCand;

  if (SameBoundary) {
    // Fall through to original instruction order. Prefer lower NodeNum when
    // scheduling top-down, higher NodeNum when scheduling bottom-up.
    if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
        (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
      TryCand.Reason = NodeOrder;
      return true;
    }
  }
  return false;
}
844
845GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy(
846 const MachineSchedContext *C)
847 : GCNSchedStrategy(C) {
848 SchedStages.push_back(Elt: GCNSchedStageID::MemoryClauseInitialSchedule);
849}
850
851/// GCNMaxMemoryClauseSchedStrategy tries best to clause memory instructions as
852/// much as possible. This is achieved by:
/// 1. Prioritize clustered operations before stall latency heuristic.
/// 2. Prioritize long-latency-load before stall latency heuristic.
855///
856/// \param Cand provides the policy and current best candidate.
857/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
858/// \param Zone describes the scheduled zone that we are extending, or nullptr
859/// if Cand is from a different zone than TryCand.
860/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
                                                   SchedCandidate &TryCand,
                                                   SchedBoundary *Zone) const {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Bias PhysReg Defs and copies to their uses and defined respectively.
  if (tryGreater(TryVal: biasPhysReg(SU: TryCand.SU, isTop: TryCand.AtTop),
                 CandVal: biasPhysReg(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: PhysReg))
    return TryCand.Reason != NoCand;

  if (DAG->isTrackingPressure()) {
    // Avoid exceeding the target's limit.
    if (tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand,
                    Reason: RegExcess, TRI, MF: DAG->MF))
      return TryCand.Reason != NoCand;

    // Avoid increasing the max critical pressure in the scheduled region.
    if (tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax,
                    TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF))
      return TryCand.Reason != NoCand;
  }

  // MaxMemoryClause-specific: We prioritize clustered instructions as we would
  // get more benefit from clausing these memory instructions.
  unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
  unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
  bool CandIsClusterSucc =
      isTheSameCluster(A: CandZoneCluster, B: Cand.SU->ParentClusterIdx);
  bool TryCandIsClusterSucc =
      isTheSameCluster(A: TryCandZoneCluster, B: TryCand.SU->ParentClusterIdx);
  if (tryGreater(TryVal: TryCandIsClusterSucc, CandVal: CandIsClusterSucc, TryCand, Cand,
                 Reason: Cluster))
    return TryCand.Reason != NoCand;

  // We only compare a subset of features when comparing nodes between
  // Top and Bottom boundary. Some properties are simply incomparable, in many
  // other instances we should only override the other boundary if something
  // is a clear good pick on one boundary. Skip heuristics that are more
  // "tie-breaking" in nature.
  bool SameBoundary = Zone != nullptr;
  if (SameBoundary) {
    // For loops that are acyclic path limited, aggressively schedule for
    // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
    // heuristics to take precedence.
    if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
        tryLatency(TryCand, Cand, Zone&: *Zone))
      return TryCand.Reason != NoCand;

    // MaxMemoryClause-specific: Prioritize long latency memory load
    // instructions in top-bottom order to hide more latency. The mayLoad check
    // is used to exclude store-like instructions, which we do not want to
    // schedule too early.
    bool TryMayLoad =
        TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
    bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();

    if (TryMayLoad || CandMayLoad) {
      // A load counts as "long latency" only when it is an order of magnitude
      // (10x) slower than the competing candidate.
      bool TryLongLatency =
          TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
      bool CandLongLatency =
          10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;

      if (tryGreater(TryVal: Zone->isTop() ? TryLongLatency : CandLongLatency,
                     CandVal: Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
                     Cand, Reason: Stall))
        return TryCand.Reason != NoCand;
    }
    // Prioritize instructions that read unbuffered resources by stall cycles.
    if (tryLess(TryVal: Zone->getLatencyStallCycles(SU: TryCand.SU),
                CandVal: Zone->getLatencyStallCycles(SU: Cand.SU), TryCand, Cand, Reason: Stall))
      return TryCand.Reason != NoCand;
  }

  if (SameBoundary) {
    // Weak edges are for clustering and other constraints.
    if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop),
                CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak))
      return TryCand.Reason != NoCand;
  }

  // Avoid increasing the max pressure of the entire region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand,
                  Cand, Reason: RegMax, TRI, MF: DAG->MF))
    return TryCand.Reason != NoCand;

  if (SameBoundary) {
    // Avoid critical resource consumption and balance the schedule.
    TryCand.initResourceDelta(DAG, SchedModel);
    if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources,
                TryCand, Cand, Reason: ResourceReduce))
      return TryCand.Reason != NoCand;
    if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources,
                   CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand,
                   Reason: ResourceDemand))
      return TryCand.Reason != NoCand;

    // Avoid serializing long latency dependence chains.
    // For acyclic path limited loops, latency was already checked above.
    if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
        !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone&: *Zone))
      return TryCand.Reason != NoCand;

    // Fall through to original instruction order.
    if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
      assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
      TryCand.Reason = NodeOrder;
      return true;
    }
  }

  return false;
}
978
/// Caches subtarget/function info and records the starting occupancy; the
/// minimum occupancy may additionally be relaxed below the starting value.
GCNScheduleDAGMILive::GCNScheduleDAGMILive(
    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
    : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
      MFI(*MF.getInfo<SIMachineFunctionInfo>()),
      StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
      RegionLiveOuts(this, /*IsLiveOut=*/true) {

  // We want regions with a single MI to be scheduled so that we can reason
  // about them correctly during scheduling stages that move MIs between regions
  // (e.g., rematerialization).
  ScheduleSingleMIRegions = true;
  LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
  if (RelaxedOcc) {
    // Relaxed mode: allow occupancy to drop down to the function's minimum
    // allowed occupancy (never above the starting occupancy).
    MinOccupancy = std::min(a: MFI.getMinAllowedOccupancy(), b: StartingOccupancy);
    if (MinOccupancy != StartingOccupancy)
      LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
                        << ".\n");
  }
}
998
/// Factory that instantiates the GCNSchedStage subclass corresponding to
/// \p SchedStageID. Every enumerator must be handled; falling off the switch
/// is a programming error.
std::unique_ptr<GCNSchedStage>
GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
  switch (SchedStageID) {
  case GCNSchedStageID::OccInitialSchedule:
    return std::make_unique<OccInitialScheduleStage>(args&: SchedStageID, args&: *this);
  case GCNSchedStageID::RewriteMFMAForm:
    return std::make_unique<RewriteMFMAFormStage>(args&: SchedStageID, args&: *this);
  case GCNSchedStageID::UnclusteredHighRPReschedule:
    return std::make_unique<UnclusteredHighRPStage>(args&: SchedStageID, args&: *this);
  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
    return std::make_unique<ClusteredLowOccStage>(args&: SchedStageID, args&: *this);
  case GCNSchedStageID::PreRARematerialize:
    return std::make_unique<PreRARematStage>(args&: SchedStageID, args&: *this);
  case GCNSchedStageID::ILPInitialSchedule:
    return std::make_unique<ILPInitialScheduleStage>(args&: SchedStageID, args&: *this);
  case GCNSchedStageID::MemoryClauseInitialSchedule:
    return std::make_unique<MemoryClauseInitialScheduleStage>(args&: SchedStageID,
                                                              args&: *this);
  }

  llvm_unreachable("Unknown SchedStageID.");
}
1021
1022void GCNScheduleDAGMILive::schedule() {
1023 // Collect all scheduling regions. The actual scheduling is performed in
1024 // GCNScheduleDAGMILive::finalizeSchedule.
1025 Regions.push_back(Elt: std::pair(RegionBegin, RegionEnd));
1026}
1027
1028GCNRegPressure
1029GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
1030 if (Regions[RegionIdx].first == Regions[RegionIdx].second)
1031 return llvm::getRegPressure(MRI, LiveRegs: LiveIns[RegionIdx]);
1032 GCNDownwardRPTracker RPTracker(*LIS);
1033 RPTracker.advance(Begin: Regions[RegionIdx].first, End: Regions[RegionIdx].second,
1034 LiveRegsCopy: &LiveIns[RegionIdx]);
1035 return RPTracker.moveMaxPressure();
1036}
1037
1038static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin,
1039 MachineBasicBlock::iterator RegionEnd) {
1040 assert(RegionBegin != RegionEnd && "Region must not be empty");
1041 return &*skipDebugInstructionsBackward(It: std::prev(x: RegionEnd), Begin: RegionBegin);
1042}
1043
/// Computes live-ins and max pressure for every region of \p MBB, starting
/// from region \p RegionIdx (regions are visited in reverse order within the
/// block). Caches the block's live-outs for a unique successor so the next
/// block's computation can reuse them.
void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
                                                const MachineBasicBlock *MBB) {
  GCNDownwardRPTracker RPTracker(*LIS);

  // If the block has the only successor then live-ins of that successor are
  // live-outs of the current block. We can reuse calculated live set if the
  // successor will be sent to scheduling past current block.

  // However, due to the bug in LiveInterval analysis it may happen that two
  // predecessors of the same successor block have different lane bitmasks for
  // a live-out register. Workaround that by sticking to one-to-one relationship
  // i.e. one predecessor with one successor block.
  const MachineBasicBlock *OnlySucc = nullptr;
  if (MBB->succ_size() == 1) {
    auto *Candidate = *MBB->succ_begin();
    if (!Candidate->empty() && Candidate->pred_size() == 1) {
      SlotIndexes *Ind = LIS->getSlotIndexes();
      // Only cache for a successor laid out after this block, since blocks
      // are processed in layout order.
      if (Ind->getMBBStartIdx(mbb: MBB) < Ind->getMBBStartIdx(mbb: Candidate))
        OnlySucc = Candidate;
    }
  }

  // Scheduler sends regions from the end of the block upwards.
  size_t CurRegion = RegionIdx;
  for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
    if (Regions[CurRegion].first->getParent() != MBB)
      break;
  // Step back onto the last region that still belongs to MBB.
  --CurRegion;

  auto I = MBB->begin();
  auto LiveInIt = MBBLiveIns.find(Val: MBB);
  auto &Rgn = Regions[CurRegion];
  auto *NonDbgMI = &*skipDebugInstructionsForward(It: Rgn.first, End: Rgn.second);
  if (LiveInIt != MBBLiveIns.end()) {
    // Reuse the live-in set cached by the predecessor's sweep; it is
    // consumed (erased) here since each cache entry is used exactly once.
    auto LiveIn = std::move(LiveInIt->second);
    RPTracker.reset(MI: *MBB->begin(), LiveRegs: &LiveIn);
    MBBLiveIns.erase(I: LiveInIt);
  } else {
    // No cached live-ins: start the sweep at the first region's begin using
    // the precomputed per-region live-in map.
    I = Rgn.first;
    auto LRS = BBLiveInMap.lookup(Val: NonDbgMI);
#ifdef EXPENSIVE_CHECKS
    assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
#endif
    RPTracker.reset(MI: *I, LiveRegs: &LRS);
  }

  // Walk the block instruction by instruction. Record a region's live-ins
  // when reaching its first (non-debug) MI and its max pressure when
  // reaching its end, then move on to the previous region.
  for (;;) {
    I = RPTracker.getNext();

    if (Regions[CurRegion].first == I || NonDbgMI == I) {
      LiveIns[CurRegion] = RPTracker.getLiveRegs();
      RPTracker.clearMaxPressure();
    }

    if (Regions[CurRegion].second == I) {
      Pressure[CurRegion] = RPTracker.moveMaxPressure();
      if (CurRegion-- == RegionIdx)
        break;
      auto &Rgn = Regions[CurRegion];
      NonDbgMI = &*skipDebugInstructionsForward(It: Rgn.first, End: Rgn.second);
    }
    RPTracker.advanceBeforeNext();
    RPTracker.advanceToNext();
  }

  if (OnlySucc) {
    // Finish sweeping to the end of the block so the live set equals the
    // block's live-outs, then cache it as the successor's live-ins.
    if (I != MBB->end()) {
      RPTracker.advanceBeforeNext();
      RPTracker.advanceToNext();
      RPTracker.advance(End: MBB->end());
    }
    MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
  }
}
1118
1119DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
1120GCNScheduleDAGMILive::getRegionLiveInMap() const {
1121 assert(!Regions.empty());
1122 std::vector<MachineInstr *> RegionFirstMIs;
1123 RegionFirstMIs.reserve(n: Regions.size());
1124 for (auto &[RegionBegin, RegionEnd] : reverse(C: Regions))
1125 RegionFirstMIs.push_back(
1126 x: &*skipDebugInstructionsForward(It: RegionBegin, End: RegionEnd));
1127
1128 return getLiveRegMap(R&: RegionFirstMIs, /*After=*/false, LIS&: *LIS);
1129}
1130
1131DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
1132GCNScheduleDAGMILive::getRegionLiveOutMap() const {
1133 assert(!Regions.empty());
1134 std::vector<MachineInstr *> RegionLastMIs;
1135 RegionLastMIs.reserve(n: Regions.size());
1136 for (auto &[RegionBegin, RegionEnd] : reverse(C: Regions)) {
1137 // Skip empty regions.
1138 if (RegionBegin == RegionEnd)
1139 continue;
1140 RegionLastMIs.push_back(x: getLastMIForRegion(RegionBegin, RegionEnd));
1141 }
1142 return getLiveRegMap(R&: RegionLastMIs, /*After=*/true, LIS&: *LIS);
1143}
1144
1145void RegionPressureMap::buildLiveRegMap() {
1146 IdxToInstruction.clear();
1147
1148 RegionLiveRegMap =
1149 IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
1150 for (unsigned I = 0; I < DAG->Regions.size(); I++) {
1151 auto &[RegionBegin, RegionEnd] = DAG->Regions[I];
1152 // Skip empty regions.
1153 if (RegionBegin == RegionEnd)
1154 continue;
1155 MachineInstr *RegionKey =
1156 IsLiveOut ? getLastMIForRegion(RegionBegin, RegionEnd) : &*RegionBegin;
1157 IdxToInstruction[I] = RegionKey;
1158 }
1159}
1160
1161void GCNScheduleDAGMILive::finalizeSchedule() {
1162 // Start actual scheduling here. This function is called by the base
1163 // MachineScheduler after all regions have been recorded by
1164 // GCNScheduleDAGMILive::schedule().
1165 LiveIns.resize(N: Regions.size());
1166 Pressure.resize(N: Regions.size());
1167 RegionsWithHighRP.resize(N: Regions.size());
1168 RegionsWithExcessRP.resize(N: Regions.size());
1169 RegionsWithIGLPInstrs.resize(N: Regions.size());
1170 RegionsWithHighRP.reset();
1171 RegionsWithExcessRP.reset();
1172 RegionsWithIGLPInstrs.reset();
1173
1174 runSchedStages();
1175}
1176
/// Drives all scheduling stages over all recorded regions. Each stage is
/// created, initialized, run over every region (with per-region setup and
/// teardown), and finalized in turn.
void GCNScheduleDAGMILive::runSchedStages() {
  LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");

  GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
  if (!Regions.empty()) {
    // Precompute per-region liveness; live-outs are only needed when the
    // strategy uses the GCN RP trackers.
    BBLiveInMap = getRegionLiveInMap();
    if (S.useGCNTrackers())
      RegionLiveOuts.buildLiveRegMap();
  }

#ifdef DUMP_MAX_REG_PRESSURE
  if (PrintMaxRPRegUsageBeforeScheduler) {
    dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
    dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
    LIS->dump();
  }
#endif

  while (S.advanceStage()) {
    auto Stage = createSchedStage(SchedStageID: S.getCurrentStage());
    if (!Stage->initGCNSchedStage())
      continue;

    for (auto Region : Regions) {
      RegionBegin = Region.first;
      RegionEnd = Region.second;
      // Setup for scheduling the region and check whether it should be skipped.
      if (!Stage->initGCNRegion()) {
        Stage->advanceRegion();
        exitRegion();
        continue;
      }

      if (S.useGCNTrackers()) {
        // Seed both trackers with this region's boundary live sets.
        // NOTE(review): the reinterpret_casts assume GCNRPTracker is the
        // common base of both tracker types and that reset() is reached
        // through it — confirm against GCNRegPressure.h.
        GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
        GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
        GCNRPTracker::LiveRegSet *RegionLiveIns =
            &LiveIns[Stage->getRegionIdx()];

        reinterpret_cast<GCNRPTracker *>(DownwardTracker)
            ->reset(MRI_: MRI, LiveRegs_: *RegionLiveIns);
        reinterpret_cast<GCNRPTracker *>(UpwardTracker)
            ->reset(MRI_: MRI, LiveRegs_: RegionLiveOuts.getLiveRegsForRegionIdx(
                              RegionIdx: Stage->getRegionIdx()));
      }

      ScheduleDAGMILive::schedule();
      Stage->finalizeGCNRegion();
      Stage->advanceRegion();
      exitRegion();
    }

    Stage->finalizeGCNSchedStage();
  }

#ifdef DUMP_MAX_REG_PRESSURE
  if (PrintMaxRPRegUsageAfterScheduler) {
    dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
    dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
    LIS->dump();
  }
#endif
}
1240
1241#ifndef NDEBUG
1242raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
1243 switch (StageID) {
1244 case GCNSchedStageID::OccInitialSchedule:
1245 OS << "Max Occupancy Initial Schedule";
1246 break;
1247 case GCNSchedStageID::RewriteMFMAForm:
1248 OS << "Instruction Rewriting Reschedule";
1249 break;
1250 case GCNSchedStageID::UnclusteredHighRPReschedule:
1251 OS << "Unclustered High Register Pressure Reschedule";
1252 break;
1253 case GCNSchedStageID::ClusteredLowOccupancyReschedule:
1254 OS << "Clustered Low Occupancy Reschedule";
1255 break;
1256 case GCNSchedStageID::PreRARematerialize:
1257 OS << "Pre-RA Rematerialize";
1258 break;
1259 case GCNSchedStageID::ILPInitialSchedule:
1260 OS << "Max ILP Initial Schedule";
1261 break;
1262 case GCNSchedStageID::MemoryClauseInitialSchedule:
1263 OS << "Max memory clause Initial Schedule";
1264 break;
1265 }
1266
1267 return OS;
1268}
1269#endif
1270
/// Caches references to the DAG's shared state (strategy, function, function
/// info, subtarget) that every stage needs.
GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
    : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
      MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
1274
1275bool GCNSchedStage::initGCNSchedStage() {
1276 if (!DAG.LIS)
1277 return false;
1278
1279 LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
1280 return true;
1281}
1282
/// Collects into \p DefIdxs the SlotIndexes of all defs of \p UseMO's
/// register that can reach \p UseMO's instruction.
///
/// If the value live at the use is not a PHI, that single def is the only
/// reaching def. Otherwise the CFG is walked backwards from the use's block:
/// along each path, the first real (non-PHI) def found at a block end is
/// recorded as the reaching def of that path.
void RewriteMFMAFormStage::findReachingDefs(
    MachineOperand &UseMO, LiveIntervals *LIS,
    SmallVectorImpl<SlotIndex> &DefIdxs) {
  MachineInstr *UseMI = UseMO.getParent();
  LiveInterval &UseLI = LIS->getInterval(Reg: UseMO.getReg());
  VNInfo *VNI = UseLI.getVNInfoAt(Idx: LIS->getInstructionIndex(Instr: *UseMI));

  // If the def is not a PHI, then it must be the only reaching def.
  if (!VNI->isPHIDef()) {
    DefIdxs.push_back(Elt: VNI->def);
    return;
  }

  // Seed Visited with the use's own block to avoid re-entering it on a cycle.
  SmallPtrSet<MachineBasicBlock *, 8> Visited = {UseMI->getParent()};
  SmallVector<MachineBasicBlock *, 8> Worklist;

  // Mark the predecessor blocks for traversal
  for (MachineBasicBlock *PredMBB : UseMI->getParent()->predecessors()) {
    Worklist.push_back(Elt: PredMBB);
    Visited.insert(Ptr: PredMBB);
  }

  while (!Worklist.empty()) {
    MachineBasicBlock *CurrMBB = Worklist.pop_back_val();

    // Query the value live out of this block (just before its end slot).
    SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(mbb: CurrMBB);
    VNInfo *VNI = UseLI.getVNInfoAt(Idx: CurrMBBEnd.getPrevSlot());

    MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(index: VNI->def);

    // If there is a def in this block, then add it to the list. This is the
    // reaching def of this path.
    if (!VNI->isPHIDef()) {
      DefIdxs.push_back(Elt: VNI->def);
      continue;
    }

    // Still a PHI: continue the backwards walk from the block that holds the
    // PHI def rather than from CurrMBB, skipping pass-through blocks.
    for (MachineBasicBlock *PredMBB : DefMBB->predecessors()) {
      if (Visited.insert(Ptr: PredMBB).second)
        Worklist.push_back(Elt: PredMBB);
    }
  }
}
1326
1327void RewriteMFMAFormStage::findReachingUses(
1328 MachineInstr *DefMI, LiveIntervals *LIS,
1329 SmallVectorImpl<MachineOperand *> &ReachingUses) {
1330 SlotIndex DefIdx = LIS->getInstructionIndex(Instr: *DefMI);
1331 for (MachineOperand &UseMO :
1332 DAG.MRI.use_nodbg_operands(Reg: DefMI->getOperand(i: 0).getReg())) {
1333 SmallVector<SlotIndex, 8> ReachingDefIndexes;
1334 findReachingDefs(UseMO, LIS, DefIdxs&: ReachingDefIndexes);
1335
1336 // If we find a use that contains this DefMI in its reachingDefs, then it is
1337 // a reaching use.
1338 if (any_of(Range&: ReachingDefIndexes, P: [DefIdx](SlotIndex RDIdx) {
1339 return SlotIndex::isSameInstr(A: RDIdx, B: DefIdx);
1340 }))
1341 ReachingUses.push_back(Elt: &UseMO);
1342 }
1343}
1344
/// Decides whether the MFMA rewrite stage should run: only on subtargets with
/// GFX90A instructions, only at minimum occupancy 1, and only when at least
/// one region exceeds the addressable ArchVGPR budget and the estimated
/// rewrite cost is non-positive.
bool RewriteMFMAFormStage::initGCNSchedStage() {
  // We only need to run this pass if the architecture supports AGPRs.
  // Additionally, we don't use AGPRs at occupancy levels above 1 so there
  // is no need for this pass in that case, either.
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1)
    return false;

  // Flag every region whose ArchVGPR pressure exceeds the addressable limit.
  RegionsWithExcessArchVGPR.resize(N: DAG.Regions.size());
  RegionsWithExcessArchVGPR.reset();
  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
    GCNRegPressure PressureBefore = DAG.Pressure[Region];
    if (PressureBefore.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
      RegionsWithExcessArchVGPR[Region] = true;
  }

  if (RegionsWithExcessArchVGPR.none())
    return false;

  TII = ST.getInstrInfo();
  SRI = ST.getRegisterInfo();

  // Collect rewrite candidates plus the cross-RC copies the rewrite would
  // require on the use and def sides.
  std::vector<std::pair<MachineInstr *, unsigned>> RewriteCands;
  DenseMap<MachineBasicBlock *, std::set<Register>> CopyForUse;
  SmallPtrSet<MachineInstr *, 8> CopyForDef;

  if (!initHeuristics(RewriteCands, CopyForUse, CopyForDef))
    return false;

  int64_t Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef);

  // If we haven't found the beneficial conditions, prefer the VGPR form which
  // may result in less cross RC copies.
  if (Cost > 0)
    return false;

  return rewrite(RewriteCands);
}
1383
/// Initializes the unclustered high-register-pressure reschedule stage. Runs
/// only when some region has high or excess RP; swaps in the PreRAReentry
/// IGLP mutation set and biases the RP limits to aggressively reduce
/// pressure, aiming one wave above the current minimum occupancy when the
/// function's max-waves limit allows it.
bool UnclusteredHighRPStage::initGCNSchedStage() {
  if (DisableUnclusterHighRP)
    return false;

  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
    return false;

  // Replace the DAG's mutations for the duration of this stage; the original
  // set is restored from SavedMutations when the stage finishes.
  SavedMutations.swap(x&: DAG.Mutations);
  DAG.addMutation(
      Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PreRAReentry));

  InitialOccupancy = DAG.MinOccupancy;
  // Aggressively try to reduce register pressure in the unclustered high RP
  // stage. Temporarily increase occupancy target in the region.
  TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy
                            ? InitialOccupancy + 1
                            : InitialOccupancy;
  IsAnyRegionScheduled = false;
  S.SGPRLimitBias = S.HighRPSGPRBias;
  S.VGPRLimitBias = S.HighRPVGPRBias;

  LLVM_DEBUG(
      dbgs()
      << "Retrying function scheduling without clustering. "
         "Aggressively try to reduce register pressure to achieve occupancy "
      << TempTargetOccupancy << ".\n");

  return true;
}
1416
1417bool ClusteredLowOccStage::initGCNSchedStage() {
1418 if (DisableClusteredLowOccupancy)
1419 return false;
1420
1421 if (!GCNSchedStage::initGCNSchedStage())
1422 return false;
1423
1424 // Don't bother trying to improve ILP in lower RP regions if occupancy has not
1425 // been dropped. All regions will have already been scheduled with the ideal
1426 // occupancy targets.
1427 if (DAG.StartingOccupancy <= DAG.MinOccupancy)
1428 return false;
1429
1430 LLVM_DEBUG(
1431 dbgs() << "Retrying function scheduling with lowest recorded occupancy "
1432 << DAG.MinOccupancy << ".\n");
1433 return true;
1434}
1435
1436/// Allows to easily filter for this stage's debug output.
1437#define REMAT_PREFIX "[PreRARemat] "
1438#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
1439
1440#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1441Printable PreRARematStage::ScoredRemat::print() const {
1442 return Printable([&](raw_ostream &OS) {
1443 OS << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')';
1444 });
1445}
1446#endif
1447
1448bool PreRARematStage::initGCNSchedStage() {
  // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
  // regions in between the defs and the regions we sunk the defs to. Will need
  // to be fixed if there is another pass after this pass.
1452 assert(!S.hasNextStage());
1453
1454 if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() <= 1)
1455 return false;
1456
1457 // Maps all MIs (except lone terminators, which are not part of any region) to
1458 // their parent region. Non-lone terminators are considered part of the region
1459 // they delimitate.
1460 DenseMap<MachineInstr *, unsigned> MIRegion(MF.getInstructionCount());
1461
1462 // Before performing any IR modification record the parent region of each MI
1463 // and the parent MBB of each region.
1464 const unsigned NumRegions = DAG.Regions.size();
1465 for (unsigned I = 0; I < NumRegions; ++I) {
1466 RegionBoundaries Region = DAG.Regions[I];
1467 for (auto MI = Region.first; MI != Region.second; ++MI)
1468 MIRegion.insert(KV: {&*MI, I});
1469 MachineBasicBlock *ParentMBB = Region.first->getParent();
1470 if (Region.second != ParentMBB->end())
1471 MIRegion.insert(KV: {&*Region.second, I});
1472 RegionBB.push_back(Elt: ParentMBB);
1473 }
1474
1475#ifndef NDEBUG
1476 auto PrintTargetRegions = [&]() -> void {
1477 if (TargetRegions.none()) {
1478 dbgs() << REMAT_PREFIX << "No target regions\n";
1479 return;
1480 }
1481 dbgs() << REMAT_PREFIX << "Target regions:\n";
1482 for (unsigned I : TargetRegions.set_bits())
1483 dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n';
1484 };
1485 auto PrintRematReg = [&](const RematReg &Remat) -> Printable {
1486 return Printable([&, Remat](raw_ostream &OS) {
1487 // Concatenate all region numbers in which the register is unused and
1488 // live-through.
1489 bool HasLiveThroughRegion = false;
1490 OS << '[' << Remat.DefRegion << " -";
1491 for (unsigned I = 0; I < NumRegions; ++I) {
1492 if (Remat.isUnusedLiveThrough(I)) {
1493 if (HasLiveThroughRegion) {
1494 OS << ',';
1495 } else {
1496 OS << "- ";
1497 HasLiveThroughRegion = true;
1498 }
1499 OS << I;
1500 }
1501 }
1502 if (HasLiveThroughRegion)
1503 OS << " -";
1504 OS << "-> " << Remat.UseRegion << "] ";
1505 Remat.DefMI->print(OS, /*IsStandalone=*/true, /*SkipOpers=*/false,
1506 /*SkipDebugLoc=*/false, /*AddNewLine=*/false);
1507 });
1508 };
1509#endif
1510
1511 // Set an objective for the stage based on current RP in each region.
1512 REMAT_DEBUG({
1513 dbgs() << "Analyzing ";
1514 MF.getFunction().printAsOperand(dbgs(), false);
1515 dbgs() << ": ";
1516 });
1517 if (!setObjective()) {
1518 LLVM_DEBUG(dbgs() << "no objective to achieve, occupancy is maximal at "
1519 << MFI.getMaxWavesPerEU() << '\n');
1520 return false;
1521 }
1522 LLVM_DEBUG({
1523 if (TargetOcc) {
1524 dbgs() << "increase occupancy from " << *TargetOcc - 1 << '\n';
1525 } else {
1526 dbgs() << "reduce spilling (minimum target occupancy is "
1527 << MFI.getMinWavesPerEU() << ")\n";
1528 }
1529 PrintTargetRegions();
1530 });
1531
1532 if (!collectRematRegs(MIRegion)) {
1533 REMAT_DEBUG(dbgs() << "No rematerializable registers\n");
1534 return false;
1535 }
1536 const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
1537 REMAT_DEBUG({
1538 dbgs() << "Rematerializable registers:\n";
1539 for (const RematReg &Remat : RematRegs)
1540 dbgs() << REMAT_PREFIX << " " << PrintRematReg(Remat) << '\n';
1541 dbgs() << REMAT_PREFIX << "Region frequencies\n";
1542 for (auto [I, Freq] : enumerate(FreqInfo.Regions)) {
1543 dbgs() << REMAT_PREFIX << " [" << I << "] ";
1544 if (Freq)
1545 dbgs() << Freq;
1546 else
1547 dbgs() << "unknown ";
1548 dbgs() << " | " << *DAG.Regions[I].first;
1549 }
1550 });
1551
1552 SmallVector<ScoredRemat> ScoredRemats;
1553 for (RematReg &Remat : RematRegs)
1554 ScoredRemats.emplace_back(Args: &Remat, Args: FreqInfo, Args&: DAG);
1555
// Rematerialize registers in successive rounds until all RP targets are
// satisfied or until we run out of rematerialization candidates.
1558#ifndef NDEBUG
1559 unsigned RoundNum = 0;
1560#endif
1561 BitVector RecomputeRP(NumRegions);
1562 do {
1563 assert(!ScoredRemats.empty() && "no more remat candidates");
1564
1565 // (Re-)Score and (re-)sort all remats in increasing score order.
1566 for (ScoredRemat &Remat : ScoredRemats)
1567 Remat.update(TargetRegions, RPTargets, Freq: FreqInfo, ReduceSpill: !TargetOcc);
1568 sort(C&: ScoredRemats);
1569
1570 REMAT_DEBUG({
1571 dbgs() << "==== ROUND " << RoundNum++ << " ====\n"
1572 << REMAT_PREFIX
1573 << "Candidates with non-null score, in rematerialization order:\n";
1574 for (const ScoredRemat &RematDecision : reverse(ScoredRemats)) {
1575 if (RematDecision.hasNullScore())
1576 break;
1577 dbgs() << REMAT_PREFIX << " " << RematDecision.print() << " | "
1578 << *RematDecision.Remat->DefMI;
1579 }
1580 PrintTargetRegions();
1581 });
1582
1583 RecomputeRP.reset();
1584 unsigned RematIdx = ScoredRemats.size();
1585
1586 // Rematerialize registers in decreasing score order until we estimate
1587 // that all RP targets are satisfied or until rematerialization candidates
1588 // are no longer useful to decrease RP.
1589 for (; RematIdx && TargetRegions.any(); --RematIdx) {
1590 const ScoredRemat &Candidate = ScoredRemats[RematIdx - 1];
1591 // Stop rematerializing on encountering a null score. Since scores
1592 // monotonically decrease as we rematerialize, we know there is nothing
1593 // useful left to do in such cases, even if we were to re-score.
1594 if (Candidate.hasNullScore()) {
1595 RematIdx = 0;
1596 break;
1597 }
1598
1599 RematReg &Remat = *Candidate.Remat;
1600 // When previous rematerializations in this round have already satisfied
1601 // RP targets in all regions this rematerialization can impact, we have a
1602 // good indication that our scores have diverged significantly from
1603 // reality, in which case we interrupt this round and re-score. This also
1604 // ensures that every rematerialization we perform is possibly impactful
1605 // in at least one target region.
1606 if (!Remat.maybeBeneficial(TargetRegions, RPTargets))
1607 break;
1608
1609 REMAT_DEBUG(dbgs() << "** REMAT " << PrintRematReg(Remat) << '\n';);
1610 MachineInstr *RematMI =
1611 Candidate.rematerialize(RecomputeRP, RPTargets, DAG);
1612 RescheduleRegions |= Remat.Live;
1613
1614 // Every rematerialization we do here is likely to move the instruction
1615 // into a higher frequency region, increasing the total sum latency of the
1616 // instruction itself. This is acceptable if we are eliminating a spill in
1617 // the process, but when the goal is increasing occupancy we get nothing
1618 // out of rematerialization if occupancy is not increased in the end; in
1619 // such cases we want to roll back the rematerialization.
1620 if (TargetOcc) {
1621 RollbackInfo &Rollback = Rollbacks.emplace_back(Args: &Remat);
1622 Rollback.RematMI = RematMI;
1623 // Make the original MI a debug value so that it does not influence
1624 // scheduling and replace all read registers with a sentinel register to
1625 // prevent operands to appear in use-lists of other MIs during LIS
1626 // updates. Store mappings between operand indices and original
1627 // registers for potential rollback.
1628 Remat.DefMI->setDesc(DAG.TII->get(Opcode: TargetOpcode::DBG_VALUE));
1629 for (auto [Idx, MO] : enumerate(First: Remat.DefMI->operands())) {
1630 if (MO.isReg() && MO.readsReg()) {
1631 Rollback.RegMap.insert(KV: {Idx, MO.getReg()});
1632 MO.setReg(Register());
1633 }
1634 }
1635 } else {
1636 // Just delete the original instruction if it cannot be rolled back.
1637 DAG.deleteMI(RegionIdx: Remat.DefRegion, MI: Remat.DefMI);
1638 }
1639
1640 unsetSatisfiedRPTargets(Regions: Remat.Live);
1641 }
1642
1643 REMAT_DEBUG({
1644 if (!TargetRegions.any()) {
1645 dbgs() << "** Interrupt round on all targets achieved\n";
1646 } else if (RematIdx) {
1647 dbgs() << "** Interrupt round on stale score for "
1648 << *ScoredRemats[RematIdx - 1].Remat->DefMI;
1649 } else {
1650 dbgs() << "** Stop on exhausted rematerialization candidates\n";
1651 }
1652 });
1653
1654 // Peel off registers we already rematerialized from the vector's tail.
1655 ScoredRemats.truncate(N: RematIdx);
1656 } while ((updateAndVerifyRPTargets(Regions: RecomputeRP) || TargetRegions.any()) &&
1657 !ScoredRemats.empty());
1658 if (RescheduleRegions.none())
1659 return false;
1660
1661 // Commit all pressure changes to the DAG and compute minimum achieved
1662 // occupancy in impacted regions.
1663 REMAT_DEBUG(dbgs() << "==== REMAT RESULTS ====\n");
1664 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
1665 for (unsigned I : RescheduleRegions.set_bits()) {
1666 DAG.Pressure[I] = RPTargets[I].getCurrentRP();
1667 REMAT_DEBUG(dbgs() << '[' << I << "] Achieved occupancy "
1668 << DAG.Pressure[I].getOccupancy(ST, DynamicVGPRBlockSize)
1669 << " (" << RPTargets[I] << ")\n");
1670 }
1671 AchievedOcc = MFI.getMaxWavesPerEU();
1672 for (const GCNRegPressure &RP : DAG.Pressure) {
1673 AchievedOcc =
1674 std::min(a: AchievedOcc, b: RP.getOccupancy(ST, DynamicVGPRBlockSize));
1675 }
1676
1677 REMAT_DEBUG({
1678 dbgs() << "Retrying function scheduling with new min. occupancy of "
1679 << AchievedOcc << " from rematerializing (original was "
1680 << DAG.MinOccupancy;
1681 if (TargetOcc)
1682 dbgs() << ", target was " << *TargetOcc;
1683 dbgs() << ")\n";
1684 });
1685
1686 DAG.setTargetOccupancy(getStageTargetOccupancy());
1687 return true;
1688}
1689
void GCNSchedStage::finalizeGCNSchedStage() {
  // Close out the last block visited by this stage before reporting the end of
  // the stage.
  DAG.finishBlock();
  LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
}
1694
void UnclusteredHighRPStage::finalizeGCNSchedStage() {
  // Restore the DAG mutations that were swapped out when this stage started
  // and drop the temporary RP-limit biases this stage scheduled with.
  SavedMutations.swap(x&: DAG.Mutations);
  S.SGPRLimitBias = S.VGPRLimitBias = 0;
  if (DAG.MinOccupancy > InitialOccupancy) {
    // Occupancy can only have improved if this stage actually rescheduled at
    // least one region.
    assert(IsAnyRegionScheduled);
    LLVM_DEBUG(dbgs() << StageID
                      << " stage successfully increased occupancy to "
                      << DAG.MinOccupancy << '\n');
  } else if (!IsAnyRegionScheduled) {
    // Nothing was rescheduled, so the minimum occupancy must be untouched.
    assert(DAG.MinOccupancy == InitialOccupancy);
    LLVM_DEBUG(dbgs() << StageID
                      << ": No regions scheduled, min occupancy stays at "
                      << DAG.MinOccupancy << ", MFI occupancy stays at "
                      << MFI.getOccupancy() << ".\n");
  }

  GCNSchedStage::finalizeGCNSchedStage();
}
1713
bool GCNSchedStage::initGCNRegion() {
  // Skip empty scheduling region.
  if (DAG.begin() == DAG.end())
    return false;

  // Check whether this new region is also a new block.
  if (DAG.RegionBegin->getParent() != CurrentMBB)
    setupNewBlock();

  unsigned NumRegionInstrs = std::distance(first: DAG.begin(), last: DAG.end());
  DAG.enterRegion(bb: CurrentMBB, begin: DAG.begin(), end: DAG.end(), regioninstrs: NumRegionInstrs);

  // Skip regions with 1 schedulable instruction.
  if (DAG.begin() == std::prev(x: DAG.end()))
    return false;

  LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
  LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
                    << " " << CurrentMBB->getName()
                    << "\n From: " << *DAG.begin() << " To: ";
             if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
             else dbgs() << "End";
             dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');

  // Save original instruction order before scheduling for possible revert.
  Unsched.clear();
  Unsched.reserve(n: DAG.NumRegionInstrs);
  if (StageID == GCNSchedStageID::OccInitialSchedule ||
      StageID == GCNSchedStageID::ILPInitialSchedule) {
    // On the initial stages, also record which regions contain IGLP
    // scheduling-hint instructions so later stages can pick the right DAG
    // mutations for them.
    const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
    for (auto &I : DAG) {
      Unsched.push_back(x: &I);
      if (SII->isIGLPMutationOnly(Opcode: I.getOpcode()))
        DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
    }
  } else {
    for (auto &I : DAG)
      Unsched.push_back(x: &I);
  }

  // Snapshot the region's pressure so checkScheduling() can compare it against
  // the post-scheduling pressure.
  PressureBefore = DAG.Pressure[RegionIdx];

  LLVM_DEBUG(
      dbgs() << "Pressure before scheduling:\nRegion live-ins:"
             << print(DAG.LiveIns[RegionIdx], DAG.MRI)
             << "Region live-in pressure: "
             << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
             << "Region register pressure: " << print(PressureBefore));

  S.HasHighPressure = false;
  S.KnownExcessRP = isRegionWithExcessRP();

  // Regions with IGLP instructions get a phase-specific IGroupLP mutation
  // (restored by finalizeGCNRegion()); the unclustered high-RP reschedule
  // stage is excluded since it runs with its own mutation set.
  if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
      StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
    SavedMutations.clear();
    SavedMutations.swap(x&: DAG.Mutations);
    bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
                          StageID == GCNSchedStageID::ILPInitialSchedule;
    DAG.addMutation(Mutation: createIGroupLPDAGMutation(
        Phase: IsInitialStage ? AMDGPU::SchedulingPhase::Initial
                       : AMDGPU::SchedulingPhase::PreRAReentry));
  }

  return true;
}
1779
bool UnclusteredHighRPStage::initGCNRegion() {
  // Only reschedule regions that have excess register pressure (i.e. spilling)
  // or had minimum occupancy at the beginning of the stage (as long as
  // rescheduling of previous regions did not make occupancy drop back down to
  // the initial minimum).
  unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
  // If no region has been scheduled yet, the DAG has not yet been updated with
  // the occupancy target. So retrieve it from the temporary.
  unsigned CurrentTargetOccupancy =
      IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy;
  if (!DAG.RegionsWithExcessRP[RegionIdx] &&
      (CurrentTargetOccupancy <= InitialOccupancy ||
       DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
           InitialOccupancy))
    return false;

  bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion();
  // If this is the first region scheduled during this stage, make the target
  // occupancy changes in the DAG and MFI.
  if (!IsAnyRegionScheduled && IsSchedulingThisRegion) {
    IsAnyRegionScheduled = true;
    // Only raise the target when the function attribute cap leaves headroom.
    if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
      DAG.setTargetOccupancy(TempTargetOccupancy);
  }
  return IsSchedulingThisRegion;
}
1806
1807bool ClusteredLowOccStage::initGCNRegion() {
1808 // We may need to reschedule this region if it wasn't rescheduled in the last
1809 // stage, or if we found it was testing critical register pressure limits in
1810 // the unclustered reschedule stage. The later is because we may not have been
1811 // able to raise the min occupancy in the previous stage so the region may be
1812 // overly constrained even if it was already rescheduled.
1813 if (!DAG.RegionsWithHighRP[RegionIdx])
1814 return false;
1815
1816 return GCNSchedStage::initGCNRegion();
1817}
1818
1819bool PreRARematStage::initGCNRegion() {
1820 return RescheduleRegions[RegionIdx] && GCNSchedStage::initGCNRegion();
1821}
1822
1823void GCNSchedStage::setupNewBlock() {
1824 if (CurrentMBB)
1825 DAG.finishBlock();
1826
1827 CurrentMBB = DAG.RegionBegin->getParent();
1828 DAG.startBlock(bb: CurrentMBB);
1829 // Get real RP for the region if it hasn't be calculated before. After the
1830 // initial schedule stage real RP will be collected after scheduling.
1831 if (StageID == GCNSchedStageID::OccInitialSchedule ||
1832 StageID == GCNSchedStageID::ILPInitialSchedule ||
1833 StageID == GCNSchedStageID::MemoryClauseInitialSchedule)
1834 DAG.computeBlockPressure(RegionIdx, MBB: CurrentMBB);
1835}
1836
1837void GCNSchedStage::finalizeGCNRegion() {
1838 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1839 if (S.HasHighPressure)
1840 DAG.RegionsWithHighRP[RegionIdx] = true;
1841
1842 // Revert scheduling if we have dropped occupancy or there is some other
1843 // reason that the original schedule is better.
1844 checkScheduling();
1845
1846 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1847 StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
1848 SavedMutations.swap(x&: DAG.Mutations);
1849}
1850
1851void PreRARematStage::finalizeGCNRegion() {
1852 GCNSchedStage::finalizeGCNRegion();
1853 // When the goal is to increase occupancy, all regions must reach the target
1854 // occupancy for rematerializations to be possibly useful, otherwise we will
1855 // just hurt latency for no benefit. If minimum occupancy drops below the
1856 // target there is no point in trying to re-schedule further regions.
1857 if (!TargetOcc)
1858 return;
1859 RegionReverts.emplace_back(Args&: RegionIdx, Args&: Unsched, Args&: PressureBefore);
1860 if (DAG.MinOccupancy < *TargetOcc) {
1861 REMAT_DEBUG(dbgs() << "Region " << RegionIdx
1862 << " cannot meet occupancy target, interrupting "
1863 "re-scheduling in all regions\n");
1864 RescheduleRegions.reset();
1865 }
1866}
1867
void GCNSchedStage::checkScheduling() {
  // Check the results of scheduling.
  PressureAfter = DAG.getRealRegPressure(RegionIdx);

  LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
  LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");

  unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();

  // Fast path: pressure stayed within the critical limits, so the new schedule
  // is acceptable as-is.
  if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
      PressureAfter.getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
    DAG.Pressure[RegionIdx] = PressureAfter;

    // Early out if we have achieved the occupancy target.
    LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
    return;
  }

  // Compare occupancies before/after, both clamped to the stage's target.
  unsigned TargetOccupancy = std::min(
      a: S.getTargetOccupancy(), b: ST.getOccupancyWithWorkGroupSizes(MF).second);
  unsigned WavesAfter = std::min(
      a: TargetOccupancy, b: PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
  unsigned WavesBefore = std::min(
      a: TargetOccupancy, b: PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
  LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
                    << ", after " << WavesAfter << ".\n");

  // We may not be able to keep the current target occupancy because of the just
  // scheduled region. We might still be able to revert scheduling if the
  // occupancy before was higher, or if the current schedule has register
  // pressure higher than the excess limits which could lead to more spilling.
  unsigned NewOccupancy = std::max(a: WavesAfter, b: WavesBefore);

  // Allow memory bound functions to drop to 4 waves if not limited by an
  // attribute.
  if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
      WavesAfter >= MFI.getMinAllowedOccupancy()) {
    LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
                      << MFI.getMinAllowedOccupancy() << " waves\n");
    NewOccupancy = WavesAfter;
  }

  // Lower the kernel-wide minimum occupancy if this region forces it down.
  if (NewOccupancy < DAG.MinOccupancy) {
    DAG.MinOccupancy = NewOccupancy;
    MFI.limitOccupancy(Limit: DAG.MinOccupancy);
    LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
                      << DAG.MinOccupancy << ".\n");
  }
  // The maximum number of arch VGPR on non-unified register file, or the
  // maximum VGPR + AGPR in the unified register file case.
  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
  // The maximum number of arch VGPR for both unified and non-unified register
  // file.
  unsigned MaxArchVGPRs = std::min(a: MaxVGPRs, b: ST.getAddressableNumArchVGPRs());
  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);

  // Flag regions whose pressure exceeds the addressable limits; these will
  // spill and are prime candidates for later rescheduling stages.
  if (PressureAfter.getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts()) > MaxVGPRs ||
      PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
      PressureAfter.getAGPRNum() > MaxArchVGPRs ||
      PressureAfter.getSGPRNum() > MaxSGPRs) {
    DAG.RegionsWithHighRP[RegionIdx] = true;
    DAG.RegionsWithExcessRP[RegionIdx] = true;
  }

  // Revert if this region's schedule would cause a drop in occupancy or
  // spilling.
  if (shouldRevertScheduling(WavesAfter)) {
    modifyRegionSchedule(RegionIdx, MBB: DAG.BB, MIOrder: Unsched);
    std::tie(args&: DAG.RegionBegin, args&: DAG.RegionEnd) = DAG.Regions[RegionIdx];
  } else {
    DAG.Pressure[RegionIdx] = PressureAfter;
  }
}
1941
1942unsigned
1943GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1944 DenseMap<unsigned, unsigned> &ReadyCycles,
1945 const TargetSchedModel &SM) {
1946 unsigned ReadyCycle = CurrCycle;
1947 for (auto &D : SU.Preds) {
1948 if (D.isAssignedRegDep()) {
1949 MachineInstr *DefMI = D.getSUnit()->getInstr();
1950 unsigned Latency = SM.computeInstrLatency(MI: DefMI);
1951 unsigned DefReady = ReadyCycles[DAG.getSUnit(MI: DefMI)->NodeNum];
1952 ReadyCycle = std::max(a: ReadyCycle, b: DefReady + Latency);
1953 }
1954 }
1955 ReadyCycles[SU.NodeNum] = ReadyCycle;
1956 return ReadyCycle;
1957}
1958
#ifndef NDEBUG
// Orders (MI, ready-cycle) pairs by ascending issue cycle for debug printing.
struct EarlierIssuingCycle {
  bool operator()(std::pair<MachineInstr *, unsigned> A,
                  std::pair<MachineInstr *, unsigned> B) const {
    return A.second < B.second;
  }
};

// Dumps the modeled schedule, highlighting gaps of more than one cycle
// between consecutive instructions as pipeline bubbles.
static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
                                        EarlierIssuingCycle> &ReadyCycles) {
  if (ReadyCycles.empty())
    return;
  unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
  dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
         << " ##################\n# Cycle #\t\t\tInstruction          "
            "             "
            "                            \n";
  unsigned IPrev = 1;
  for (auto &I : ReadyCycles) {
    if (I.second > IPrev + 1)
      dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
             << " CYCLES DETECTED ******************************\n\n";
    dbgs() << "[ " << I.second << " ]  :  " << *I.first << "\n";
    IPrev = I.second;
  }
}
#endif
1986
ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
#ifndef NDEBUG
  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
      ReadyCyclesSorted;
#endif
  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
  unsigned SumBubbles = 0;
  DenseMap<unsigned, unsigned> ReadyCycles;
  unsigned CurrCycle = 0;
  // Walk the schedule in order, accumulating the total number of stall cycles
  // (the gap between the current cycle and each SU's earliest ready cycle).
  for (auto &SU : InputSchedule) {
    unsigned ReadyCycle =
        computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
    SumBubbles += ReadyCycle - CurrCycle;
#ifndef NDEBUG
    ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
#endif
    // The next instruction cannot issue before the cycle after this one.
    CurrCycle = ++ReadyCycle;
  }
#ifndef NDEBUG
  LLVM_DEBUG(
      printScheduleModel(ReadyCyclesSorted);
      dbgs() << "\n\t"
             << "Metric: "
             << (SumBubbles
                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                     : 1)
             << "\n\n");
#endif

  return ScheduleMetrics(CurrCycle, SumBubbles);
}
2019
ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
#ifndef NDEBUG
  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
      ReadyCyclesSorted;
#endif
  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
  unsigned SumBubbles = 0;
  DenseMap<unsigned, unsigned> ReadyCycles;
  unsigned CurrCycle = 0;
  // Same bubble accounting as the SUnit-vector overload, but walking the
  // region's instruction list; instructions without an SUnit (e.g. debug
  // instructions) are skipped.
  for (auto &MI : DAG) {
    SUnit *SU = DAG.getSUnit(MI: &MI);
    if (!SU)
      continue;
    unsigned ReadyCycle =
        computeSUnitReadyCycle(SU: *SU, CurrCycle, ReadyCycles, SM);
    SumBubbles += ReadyCycle - CurrCycle;
#ifndef NDEBUG
    ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
#endif
    // The next instruction cannot issue before the cycle after this one.
    CurrCycle = ++ReadyCycle;
  }
#ifndef NDEBUG
  LLVM_DEBUG(
      printScheduleModel(ReadyCyclesSorted);
      dbgs() << "\n\t"
             << "Metric: "
             << (SumBubbles
                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                     : 1)
             << "\n\n");
#endif

  return ScheduleMetrics(CurrCycle, SumBubbles);
}
2055
2056bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
2057 if (WavesAfter < DAG.MinOccupancy)
2058 return true;
2059
2060 // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
2061 if (DAG.MFI.isDynamicVGPREnabled()) {
2062 unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
2063 STI: &ST, NumVGPRs: DAG.MFI.getDynamicVGPRBlockSize(),
2064 DynamicVGPRBlockSize: PressureBefore.getVGPRNum(UnifiedVGPRFile: false));
2065 unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
2066 STI: &ST, NumVGPRs: DAG.MFI.getDynamicVGPRBlockSize(),
2067 DynamicVGPRBlockSize: PressureAfter.getVGPRNum(UnifiedVGPRFile: false));
2068 if (BlocksAfter > BlocksBefore)
2069 return true;
2070 }
2071
2072 return false;
2073}
2074
2075bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
2076 if (PressureAfter == PressureBefore)
2077 return false;
2078
2079 if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
2080 return true;
2081
2082 if (mayCauseSpilling(WavesAfter))
2083 return true;
2084
2085 return false;
2086}
2087
bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
  // If RP is not reduced in the unclustered reschedule stage, revert to the
  // old schedule.
  if ((WavesAfter <=
           PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()) &&
       mayCauseSpilling(WavesAfter)) ||
      GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
    LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
    return true;
  }

  // Do not attempt to relax schedule even more if we are already spilling.
  if (isRegionWithExcessRP())
    return false;

  LLVM_DEBUG(
      dbgs()
      << "\n\t      *** In shouldRevertScheduling ***\n"
      << "      *********** BEFORE UnclusteredHighRPStage ***********\n");
  ScheduleMetrics MBefore = getScheduleMetrics(InputSchedule: DAG.SUnits);
  LLVM_DEBUG(
      dbgs()
      << "\n      *********** AFTER UnclusteredHighRPStage ***********\n");
  ScheduleMetrics MAfter = getScheduleMetrics(DAG);
  unsigned OldMetric = MBefore.getMetric();
  unsigned NewMetric = MAfter.getMetric();
  unsigned WavesBefore = std::min(
      a: S.getTargetOccupancy(),
      b: PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()));
  // Keep the new schedule only if the occupancy gain outweighs the latency
  // (bubble-metric) loss. All ratios are computed in fixed point via
  // ScaleFactor; ScheduleMetricBias dampens the sensitivity to metric noise.
  unsigned Profit =
      ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
       ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
       NewMetric) /
      ScheduleMetrics::ScaleFactor;
  LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
                    << MAfter << "Profit: " << Profit << "\n");
  // Profit < ScaleFactor means the combined ratio is below 1, i.e. a net loss.
  return Profit < ScheduleMetrics::ScaleFactor;
}
2126
2127bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
2128 if (PressureAfter == PressureBefore)
2129 return false;
2130
2131 if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
2132 return true;
2133
2134 if (mayCauseSpilling(WavesAfter))
2135 return true;
2136
2137 return false;
2138}
2139
2140bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
2141 // When trying to increase occupancy (TargetOcc == true) the stage manages
2142 // region reverts globally (all or none), so we always return false here.
2143 return !TargetOcc && mayCauseSpilling(WavesAfter);
2144}
2145
2146bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
2147 if (mayCauseSpilling(WavesAfter))
2148 return true;
2149
2150 return false;
2151}
2152
2153bool MemoryClauseInitialScheduleStage::shouldRevertScheduling(
2154 unsigned WavesAfter) {
2155 return mayCauseSpilling(WavesAfter);
2156}
2157
2158bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
2159 if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
2160 !PressureAfter.less(MF, O: PressureBefore)) {
2161 LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
2162 return true;
2163 }
2164
2165 return false;
2166}
2167
void GCNSchedStage::modifyRegionSchedule(unsigned RegionIdx,
                                         MachineBasicBlock *MBB,
                                         ArrayRef<MachineInstr *> MIOrder) {
  assert(static_cast<size_t>(std::distance(DAG.Regions[RegionIdx].first,
                                           DAG.Regions[RegionIdx].second)) ==
             MIOrder.size() &&
         "instruction number mismatch");
  if (MIOrder.empty())
    return;

  LLVM_DEBUG(dbgs() << "Reverting scheduling for region " << RegionIdx << '\n');

  // Reconstruct MI sequence by moving instructions in desired order before
  // the current region's start.
  MachineBasicBlock::iterator RegionEnd = DAG.Regions[RegionIdx].first;
  for (MachineInstr *MI : MIOrder) {
    // Either move the next MI in order before the end of the region or move the
    // region end past the MI if it is at the correct position.
    MachineBasicBlock::iterator MII = MI->getIterator();
    if (MII != RegionEnd) {
      // Will subsequent splice move MI up past a non-debug instruction?
      bool NonDebugReordered =
          !MI->isDebugInstr() &&
          skipDebugInstructionsForward(It: RegionEnd, End: MII) != MII;
      MBB->splice(Where: RegionEnd, Other: MBB, From: MI);
      // Only update LiveIntervals information if non-debug instructions are
      // reordered. Otherwise debug instructions could cause code generation to
      // change.
      if (NonDebugReordered)
        DAG.LIS->handleMove(MI&: *MI, UpdateFlags: true);
    } else {
      ++RegionEnd;
    }
    // Debug instructions carry no liveness information; nothing more to do.
    if (MI->isDebugInstr()) {
      LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
      continue;
    }

    // Reset read-undef flags and update them later.
    for (MachineOperand &Op : MI->all_defs())
      Op.setIsUndef(false);
    RegisterOperands RegOpers;
    RegOpers.collect(MI: *MI, TRI: *DAG.TRI, MRI: DAG.MRI, TrackLaneMasks: DAG.ShouldTrackLaneMasks, IgnoreDead: false);
    if (DAG.ShouldTrackLaneMasks) {
      // Adjust liveness and add missing dead+read-undef flags.
      SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(Instr: *MI).getRegSlot();
      RegOpers.adjustLaneLiveness(LIS: *DAG.LIS, MRI: DAG.MRI, Pos: SlotIdx, AddFlagsMI: MI);
    } else {
      // Adjust for missing dead-def flags.
      RegOpers.detectDeadDefs(MI: *MI, LIS: *DAG.LIS);
    }
    LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
  }

  // The region end doesn't change throughout scheduling since it itself is
  // outside the region (whether that is a MBB end or a terminator MI).
  assert(RegionEnd == DAG.Regions[RegionIdx].second && "region end mismatch");
  // The first instruction in MIOrder is now the first instruction of the
  // region.
  DAG.Regions[RegionIdx].first = MIOrder.front();
}
2227
2228bool RewriteMFMAFormStage::isRewriteCandidate(MachineInstr *MI) const {
2229
2230 if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(MI: *MI))
2231 return false;
2232 return AMDGPU::getMFMASrcCVDstAGPROp(Opcode: MI->getOpcode()) != -1;
2233}
2234
bool RewriteMFMAFormStage::initHeuristics(
    std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
    DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
    SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
  bool Changed = false;

  // Prepare for the heuristics
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      if (!isRewriteCandidate(MI: &MI))
        continue;

      int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(Opcode: MI.getOpcode());
      assert(ReplacementOp != -1);

      // Remember the original opcode so the rewrite can be undone (see
      // getRewriteCost), then speculatively switch to the AGPR form.
      RewriteCands.push_back(x: {&MI, MI.getOpcode()});
      MI.setDesc(TII->get(Opcode: ReplacementOp));

      MachineOperand *Src2 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
      if (Src2->isReg()) {
        SmallVector<SlotIndex, 8> Src2ReachingDefs;
        findReachingDefs(UseMO&: *Src2, LIS: DAG.LIS, DefIdxs&: Src2ReachingDefs);

        // For any definition of the src2 register which is non-MFMA, we
        // insert a copy.
        for (SlotIndex RDIdx : Src2ReachingDefs) {
          MachineInstr *RD = DAG.LIS->getInstructionFromIndex(index: RDIdx);
          if (!TII->isMAI(MI: *RD))
            CopyForDef.insert(Ptr: RD);
        }
      }

      MachineOperand &Dst = MI.getOperand(i: 0);
      SmallVector<MachineOperand *, 8> DstReachingUses;

      findReachingUses(DefMI: &MI, LIS: DAG.LIS, ReachingUses&: DstReachingUses);

      for (MachineOperand *RUOp : DstReachingUses) {
        // MFMA users can consume the AGPR result directly; no copy needed.
        if (TII->isMAI(MI: *RUOp->getParent()))
          continue;

        // For any user of the result of the MFMA which is not an MFMA, we
        // insert a copy. For a given register, we will only insert one copy
        // per user block.
        CopyForUse[RUOp->getParent()->getParent()].insert(x: RUOp->getReg());

        SmallVector<SlotIndex, 8> DstUsesReachingDefs;
        findReachingDefs(UseMO&: *RUOp, LIS: DAG.LIS, DefIdxs&: DstUsesReachingDefs);

        for (SlotIndex RDIndex : DstUsesReachingDefs) {
          MachineInstr *RD = DAG.LIS->getInstructionFromIndex(index: RDIndex);
          if (TII->isMAI(MI: *RD))
            continue;

          // For any definition of the user of the MFMA which is not an MFMA,
          // we insert a copy. We do this to transform all the reaching defs
          // of this use to AGPR. By doing this, we can insert a copy from
          // AGPR to VGPR at the user rather than after the MFMA.
          CopyForDef.insert(Ptr: RD);
        }
      }

      // Do the rewrite to allow for updated RP calculation.
      const TargetRegisterClass *VDefRC = DAG.MRI.getRegClass(Reg: Dst.getReg());
      const TargetRegisterClass *ADefRC = SRI->getEquivalentAGPRClass(SRC: VDefRC);
      DAG.MRI.setRegClass(Reg: Dst.getReg(), RC: ADefRC);
      if (Src2->isReg()) {
        // Have to get src types separately since subregs may cause C and D
        // registers to be different types even though the actual operand is
        // the same size.
        const TargetRegisterClass *VUseRC = DAG.MRI.getRegClass(Reg: Src2->getReg());
        const TargetRegisterClass *AUseRC = SRI->getEquivalentAGPRClass(SRC: VUseRC);
        DAG.MRI.setRegClass(Reg: Src2->getReg(), RC: AUseRC);
      }
      Changed = true;
    }
  }

  return Changed;
}
2315
int64_t RewriteMFMAFormStage::getRewriteCost(
    const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
    const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
    const SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
  MachineBlockFrequencyInfo *MBFI = DAG.MBFI;

  int64_t BestSpillCost = 0;
  int64_t Cost = 0;
  uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();

  std::pair<unsigned, unsigned> MaxVectorRegs =
      ST.getMaxNumVectorRegs(F: MF.getFunction());
  unsigned ArchVGPRThreshold = MaxVectorRegs.first;
  unsigned AGPRThreshold = MaxVectorRegs.second;
  unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);

  // Estimate the spill-cost benefit of the rewrite (done speculatively in
  // initHeuristics) for every region that had excess ArchVGPR pressure.
  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
    if (!RegionsWithExcessArchVGPR[Region])
      continue;

    GCNRegPressure &PressureBefore = DAG.Pressure[Region];
    unsigned SpillCostBefore = PressureBefore.getVGPRSpills(
        MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);

    // For the cases we care about (i.e. ArchVGPR usage is greater than the
    // addressable limit), rewriting alone should bring pressure to manageable
    // level. If we find any such region, then the rewrite is potentially
    // beneficial.
    GCNRegPressure PressureAfter = DAG.getRealRegPressure(RegionIdx: Region);
    unsigned SpillCostAfter = PressureAfter.getVGPRSpills(
        MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);

    uint64_t BlockFreq =
        MBFI->getBlockFreq(MBB: DAG.Regions[Region].first->getParent())
            .getFrequency();

    // Weight the spill-cost delta by the region's execution frequency relative
    // to the function entry, using integer division in whichever direction
    // keeps the ratio >= 1.
    bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
    uint64_t RelativeFreq = EntryFreq && BlockFreq
                                ? (RelativeFreqIsDenom ? EntryFreq / BlockFreq
                                                       : BlockFreq / EntryFreq)
                                : 1;

    // This assumes perfect spilling / splitting -- using one spill / copy
    // instruction and one restoreFrom / copy for each excess register,
    int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2;

    // Also account for the block frequency.
    if (RelativeFreqIsDenom)
      SpillCost /= (int64_t)RelativeFreq;
    else
      SpillCost *= (int64_t)RelativeFreq;

    // If we have increased spilling in any block, just bail.
    if (SpillCost > 0)
      return SpillCost;

    if (SpillCost < BestSpillCost)
      BestSpillCost = SpillCost;
  }

  // Set the cost to the largest decrease in spill cost in order to not double
  // count spill reductions.
  Cost = BestSpillCost;
  assert(Cost <= 0);

  unsigned CopyCost = 0;

  // For each CopyForDef, increase the cost by the register size while
  // accounting for block frequency.
  for (MachineInstr *DefMI : CopyForDef) {
    Register DefReg = DefMI->getOperand(i: 0).getReg();
    uint64_t DefFreq =
        EntryFreq
            ? MBFI->getBlockFreq(MBB: DefMI->getParent()).getFrequency() / EntryFreq
            : 1;

    const TargetRegisterClass *RC = DAG.MRI.getRegClass(Reg: DefReg);
    CopyCost += RC->getCopyCost() * DefFreq;
  }

  // Account for CopyForUse copies in each block that the register is used.
  for (auto &[UseBlock, UseRegs] : CopyForUse) {
    uint64_t UseFreq =
        EntryFreq ? MBFI->getBlockFreq(MBB: UseBlock).getFrequency() / EntryFreq : 1;

    for (Register UseReg : UseRegs) {
      const TargetRegisterClass *RC = DAG.MRI.getRegClass(Reg: UseReg);
      CopyCost += RC->getCopyCost() * UseFreq;
    }
  }

  // Reset the classes that were changed to AGPR for better RB analysis.
  // We must do rewriting after copy-insertion, as some defs of the register
  // may require VGPR. Additionally, if we bail out and don't perform the
  // rewrite then these need to be restored anyway.
  for (auto &[MI, OriginalOpcode] : RewriteCands) {
    assert(TII->isMAI(*MI));
    const TargetRegisterClass *ADefRC =
        DAG.MRI.getRegClass(Reg: MI->getOperand(i: 0).getReg());
    const TargetRegisterClass *VDefRC = SRI->getEquivalentVGPRClass(SRC: ADefRC);
    DAG.MRI.setRegClass(Reg: MI->getOperand(i: 0).getReg(), RC: VDefRC);
    MI->setDesc(TII->get(Opcode: OriginalOpcode));

    MachineOperand *Src2 = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src2);
    assert(Src2);
    if (!Src2->isReg())
      continue;

    // Have to get src types separately since subregs may cause C and D
    // registers to be different types even though the actual operand is
    // the same size.
    const TargetRegisterClass *AUseRC = DAG.MRI.getRegClass(Reg: Src2->getReg());
    const TargetRegisterClass *VUseRC = SRI->getEquivalentVGPRClass(SRC: AUseRC);
    DAG.MRI.setRegClass(Reg: Src2->getReg(), RC: VUseRC);
  }

  return Cost + CopyCost;
}
2434
/// Rewrites the candidate MFMAs in \p RewriteCands to their AGPR form and
/// inserts any cross-register-bank copies required around their src2 and dst
/// operands, keeping region boundaries, region live-ins, LIS, and cached
/// pressure up to date. Returns false (bailing out) if a candidate's src2 or
/// dst register is not virtual.
bool RewriteMFMAFormStage::rewrite(
    const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
  // Map region-boundary instructions back to their region index so that the
  // boundaries can be updated when a copy is inserted at a region edge.
  DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
  DenseMap<MachineInstr *, unsigned> LastMIToRegion;

  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
    RegionBoundaries Entry = DAG.Regions[Region];
    if (Entry.first == Entry.second)
      continue;

    FirstMIToRegion[&*Entry.first] = Region;
    if (Entry.second != Entry.first->getParent()->end())
      LastMIToRegion[&*Entry.second] = Region;
  }

  // Rewrite the MFMAs to AGPR, and insert any copies as needed.
  // The general assumption of the algorithm (and the previous cost calculation)
  // is that it is better to insert the copies in the MBB of the def of the src2
  // operands, and in the MBB of the user of the dest operands. This is based on
  // the assumption that the MFMAs are likely to appear in loop bodies, while
  // the src2 and dest operands are live-in / live-out of the loop. Due to this
  // design, the algorithm for finding copy insertion points is more
  // complicated.
  //
  // There are three main cases to handle: 1. the reaching defs of the src2
  // operands, 2. the reaching uses of the dst operands, and 3. the reaching
  // defs of the reaching uses of the dst operand.
  //
  // In the first case, we simply insert copies after each of the reaching
  // definitions. In the second case, we collect all the uses of a given dest
  // and organize them by MBB. Then, we insert 1 copy for each MBB before the
  // earliest use. Since the use may have multiple reaching defs, and since we
  // want to replace the register it is using with the result of the copy, we
  // must handle case 3. In the third case, we simply insert a copy after each
  // of the reaching defs to connect to the copy of the reaching uses of the dst
  // reg. This allows us to avoid inserting copies next to the MFMAs.
  //
  // While inserting the copies, we maintain a map of operands which will use
  // different regs (i.e. the result of the copies). For example, a case 1 src2
  // operand will use the register result of the copies after the reaching defs,
  // as opposed to the original register. Now that we have completed our copy
  // analysis and placement, we can bulk update the registers. We do this
  // separately as to avoid complicating the reachingDef and reachingUse
  // queries.
  //
  // While inserting the copies, we also maintain a list of registers which we
  // will want to reclassify as AGPR. After doing the copy insertion and the
  // register replacement, we can finally do the reclassification. This uses the
  // redef map, as the registers we are interested in reclassifying may be
  // replaced by the result of a copy. We must do this after the copy analysis
  // and placement as we must have an accurate redef map -- otherwise we may end
  // up creating illegal instructions.

  // The original registers of the MFMA that need to be reclassified as AGPR.
  DenseSet<Register> RewriteRegs;
  // The map of an original register in the MFMA to a new register (result of a
  // copy) that it should be replaced with.
  DenseMap<Register, Register> RedefMap;
  // The map of the original MFMA registers to the relevant MFMA operands.
  DenseMap<Register, DenseSet<MachineOperand *>> ReplaceMap;
  // The map of reaching defs for a given register -- to avoid duplicate copies.
  DenseMap<Register, SmallPtrSet<MachineInstr *, 8>> ReachingDefCopyMap;
  // The map of reaching uses for a given register by basic block -- to avoid
  // duplicate copies and to calculate per MBB insert pts.
  DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>
      ReachingUseTracker;

  for (auto &[MI, OriginalOpcode] : RewriteCands) {
    int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(Opcode: MI->getOpcode());
    if (ReplacementOp == -1)
      continue;
    MI->setDesc(TII->get(Opcode: ReplacementOp));

    // Case 1: insert copies for the reaching defs of the Src2Reg.
    MachineOperand *Src2 = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src2);
    if (Src2->isReg()) {
      Register Src2Reg = Src2->getReg();
      if (!Src2Reg.isVirtual())
        return false;

      Register MappedReg = Src2->getReg();
      SmallVector<SlotIndex, 8> Src2ReachingDefs;
      findReachingDefs(UseMO&: *Src2, LIS: DAG.LIS, DefIdxs&: Src2ReachingDefs);
      SmallSetVector<MachineInstr *, 8> Src2DefsReplace;

      for (SlotIndex RDIndex : Src2ReachingDefs) {
        MachineInstr *RD = DAG.LIS->getInstructionFromIndex(index: RDIndex);
        if (TII->isMAI(MI: *RD))
          continue;

        // If there is a non mai reaching def, then we need a copy.
        Src2DefsReplace.insert(X: RD);
      }

      if (!Src2DefsReplace.empty()) {
        DenseMap<Register, Register>::iterator RI = RedefMap.find(Val: Src2Reg);
        if (RI != RedefMap.end()) {
          MappedReg = RI->second;
        } else {
          assert(!ReachingDefCopyMap.contains(Src2Reg));
          const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Reg: Src2Reg);
          const TargetRegisterClass *VGPRRC =
              SRI->getEquivalentVGPRClass(SRC: Src2RC);

          // Track the mapping of the original register to the new register.
          MappedReg = DAG.MRI.createVirtualRegister(RegClass: VGPRRC);
          RedefMap[Src2Reg] = MappedReg;
        }

        // If none exists, create a copy from this reaching def.
        // We may have inserted a copy already in an earlier iteration.
        for (MachineInstr *RD : Src2DefsReplace) {
          // Do not create redundant copies.
          if (ReachingDefCopyMap[Src2Reg].insert(Ptr: RD).second) {
            MachineInstrBuilder VGPRCopy =
                BuildMI(BB&: *RD->getParent(), I: std::next(x: RD->getIterator()),
                        MIMD: RD->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY))
                    .addDef(RegNo: MappedReg, Flags: {}, SubReg: 0)
                    .addUse(RegNo: Src2Reg, Flags: {}, SubReg: 0);
            DAG.LIS->InsertMachineInstrInMaps(MI&: *VGPRCopy);

            // If this reaching def was the last MI in the region, update the
            // region boundaries.
            if (LastMIToRegion.contains(Val: RD)) {
              unsigned UpdateRegion = LastMIToRegion[RD];
              DAG.Regions[UpdateRegion].second = VGPRCopy;
              LastMIToRegion.erase(Val: RD);
            }
          }
        }
      }

      // Track the register for reclassification
      RewriteRegs.insert(V: Src2Reg);

      // Always insert the operand for replacement. If this corresponds with a
      // chain of tied-def we may not see the VGPR requirement until later.
      ReplaceMap[Src2Reg].insert(V: Src2);
    }

    // Case 2 and Case 3: insert copies before the reaching uses of the dsts,
    // and after the reaching defs of the reaching uses of the dsts.

    MachineOperand *Dst = &MI->getOperand(i: 0);
    Register DstReg = Dst->getReg();
    if (!DstReg.isVirtual())
      return false;

    Register MappedReg = DstReg;
    SmallVector<MachineOperand *, 8> DstReachingUses;

    SmallVector<MachineOperand *, 8> DstReachingUseCopies;
    SmallVector<MachineInstr *, 8> DstUseDefsReplace;

    findReachingUses(DefMI: MI, LIS: DAG.LIS, ReachingUses&: DstReachingUses);

    for (MachineOperand *RUOp : DstReachingUses) {
      if (TII->isMAI(MI: *RUOp->getParent()))
        continue;

      // If there is a non mai reaching use, then we need a copy.
      if (find(Range&: DstReachingUseCopies, Val: RUOp) == DstReachingUseCopies.end())
        DstReachingUseCopies.push_back(Elt: RUOp);
      SmallVector<SlotIndex, 8> DstUsesReachingDefs;
      findReachingDefs(UseMO&: *RUOp, LIS: DAG.LIS, DefIdxs&: DstUsesReachingDefs);

      for (SlotIndex RDIndex : DstUsesReachingDefs) {
        MachineInstr *RD = DAG.LIS->getInstructionFromIndex(index: RDIndex);
        if (TII->isMAI(MI: *RD))
          continue;

        // If there is a non mai reaching def of this reaching use, then we will
        // need a copy.
        if (find(Range&: DstUseDefsReplace, Val: RD) == DstUseDefsReplace.end())
          DstUseDefsReplace.push_back(Elt: RD);
      }
    }

    if (!DstUseDefsReplace.empty()) {
      DenseMap<Register, Register>::iterator RI = RedefMap.find(Val: DstReg);
      if (RI != RedefMap.end()) {
        MappedReg = RI->second;
      } else {
        assert(!ReachingDefCopyMap.contains(DstReg));
        const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(Reg: DstReg);
        const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(SRC: DstRC);

        // Track the mapping of the original register to the new register.
        MappedReg = DAG.MRI.createVirtualRegister(RegClass: VGPRRC);
        RedefMap[DstReg] = MappedReg;
      }

      // If none exists, create a copy from this reaching def.
      // We may have inserted a copy already in an earlier iteration.
      for (MachineInstr *RD : DstUseDefsReplace) {
        // Do not create redundant copies.
        if (ReachingDefCopyMap[DstReg].insert(Ptr: RD).second) {
          MachineInstrBuilder VGPRCopy =
              BuildMI(BB&: *RD->getParent(), I: std::next(x: RD->getIterator()),
                      MIMD: RD->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY))
                  .addDef(RegNo: MappedReg, Flags: {}, SubReg: 0)
                  .addUse(RegNo: DstReg, Flags: {}, SubReg: 0);
          DAG.LIS->InsertMachineInstrInMaps(MI&: *VGPRCopy);

          // If this reaching def was the last MI in the region, update the
          // region boundaries.
          DenseMap<MachineInstr *, unsigned>::iterator LMI =
              LastMIToRegion.find(Val: RD);
          if (LMI != LastMIToRegion.end()) {
            unsigned UpdateRegion = LMI->second;
            DAG.Regions[UpdateRegion].second = VGPRCopy;
            LastMIToRegion.erase(Val: RD);
          }
        }
      }
    }

    DenseSet<MachineOperand *> &DstRegSet = ReplaceMap[DstReg];
    for (MachineOperand *RU : DstReachingUseCopies) {
      MachineBasicBlock *RUBlock = RU->getParent()->getParent();
      // Just keep track of the reaching use of this register by block. After we
      // have scanned all the MFMAs we can find optimal insert pts.
      if (RUBlock != MI->getParent()) {
        ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(Ptr: RU);
        continue;
      }

      // Special case, the use is in the same block as the MFMA. Insert the copy
      // just before the use.
      const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(Reg: DstReg);
      const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(SRC: DstRC);
      Register NewUseReg = DAG.MRI.createVirtualRegister(RegClass: VGPRRC);
      MachineInstr *UseInst = RU->getParent();
      MachineInstrBuilder VGPRCopy =
          BuildMI(BB&: *UseInst->getParent(), I: UseInst->getIterator(),
                  MIMD: UseInst->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY))
              .addDef(RegNo: NewUseReg, Flags: {}, SubReg: 0)
              .addUse(RegNo: DstReg, Flags: {}, SubReg: 0);
      DAG.LIS->InsertMachineInstrInMaps(MI&: *VGPRCopy);
      // Since we know this use has only one reaching def, we can replace the
      // use reg.
      RU->setReg(NewUseReg);
      // Track the copy source operand for replacement.
      DstRegSet.insert(V: &VGPRCopy->getOperand(i: 1));
    }

    // Track the register for reclassification
    RewriteRegs.insert(V: DstReg);

    // Insert the dst operand for replacement. If this dst is in a chain of
    // tied-def MFMAs, and the first src2 needs to be replaced with a new reg,
    // all the corresponding operands need to be replaced.
    DstRegSet.insert(V: Dst);
  }

  // Handle the copies for dst uses.
  using RUBType =
      std::pair<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>;
  for (RUBType RUBlockEntry : ReachingUseTracker) {
    using RUDType = std::pair<Register, SmallPtrSet<MachineOperand *, 8>>;
    for (RUDType RUDst : RUBlockEntry.second) {
      MachineOperand *OpBegin = *RUDst.second.begin();
      SlotIndex InstPt = DAG.LIS->getInstructionIndex(Instr: *OpBegin->getParent());

      // Find the earliest use in this block.
      for (MachineOperand *User : RUDst.second) {
        SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(Instr: *User->getParent());
        if (SlotIndex::isEarlierInstr(A: NewInstPt, B: InstPt))
          InstPt = NewInstPt;
      }

      const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(Reg: RUDst.first);
      const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(SRC: DstRC);
      Register NewUseReg = DAG.MRI.createVirtualRegister(RegClass: VGPRRC);
      MachineInstr *UseInst = DAG.LIS->getInstructionFromIndex(index: InstPt);

      MachineInstrBuilder VGPRCopy =
          BuildMI(BB&: *UseInst->getParent(), I: UseInst->getIterator(),
                  MIMD: UseInst->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY))
              .addDef(RegNo: NewUseReg, Flags: {}, SubReg: 0)
              .addUse(RegNo: RUDst.first, Flags: {}, SubReg: 0);
      DAG.LIS->InsertMachineInstrInMaps(MI&: *VGPRCopy);

      // If this UseInst was the first MI in the region, update the region
      // boundaries.
      DenseMap<MachineInstr *, unsigned>::iterator FI =
          FirstMIToRegion.find(Val: UseInst);
      if (FI != FirstMIToRegion.end()) {
        unsigned UpdateRegion = FI->second;
        DAG.Regions[UpdateRegion].first = VGPRCopy;
        FirstMIToRegion.erase(Val: UseInst);
      }

      // Replace the operand for all users.
      for (MachineOperand *User : RUDst.second) {
        User->setReg(NewUseReg);
      }

      // Track the copy source operand for replacement.
      ReplaceMap[RUDst.first].insert(V: &VGPRCopy->getOperand(i: 1));
    }
  }

  // We may have needed to insert copies after the reaching defs of the MFMAs.
  // Replace the original register with the result of the copy for all relevant
  // operands.
  for (std::pair<Register, Register> NewDef : RedefMap) {
    Register OldReg = NewDef.first;
    Register NewReg = NewDef.second;

    // Replace the register for any associated operand in the MFMA chain.
    for (MachineOperand *ReplaceOp : ReplaceMap[OldReg])
      ReplaceOp->setReg(NewReg);
  }

  // Finally, do the reclassification of the MFMA registers.
  for (Register RewriteReg : RewriteRegs) {
    Register RegToRewrite = RewriteReg;

    // Be sure to update the replacement register and not the original.
    DenseMap<Register, Register>::iterator RI = RedefMap.find(Val: RewriteReg);
    if (RI != RedefMap.end())
      RegToRewrite = RI->second;

    const TargetRegisterClass *CurrRC = DAG.MRI.getRegClass(Reg: RegToRewrite);
    const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(SRC: CurrRC);

    DAG.MRI.setRegClass(Reg: RegToRewrite, RC: AGPRRC);
  }

  // Bulk update the LIS.
  DAG.LIS->reanalyze(MF&: DAG.MF);
  // Liveins may have been modified for cross RC copies
  RegionPressureMap LiveInUpdater(&DAG, false);
  LiveInUpdater.buildLiveRegMap();

  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++)
    DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(RegionIdx: Region);

  DAG.Pressure[RegionIdx] = DAG.getRealRegPressure(RegionIdx);

  return true;
}
2778
2779unsigned PreRARematStage::getStageTargetOccupancy() const {
2780 return TargetOcc ? *TargetOcc : MFI.getMinWavesPerEU();
2781}
2782
2783bool PreRARematStage::setObjective() {
2784 const Function &F = MF.getFunction();
2785
2786 // Set up "spilling targets" for all regions.
2787 unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
2788 unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
2789 bool HasVectorRegisterExcess = false;
2790 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2791 const GCNRegPressure &RP = DAG.Pressure[I];
2792 GCNRPTarget &Target = RPTargets.emplace_back(Args&: MaxSGPRs, Args&: MaxVGPRs, Args&: MF, Args: RP);
2793 if (!Target.satisfied())
2794 TargetRegions.set(I);
2795 HasVectorRegisterExcess |= Target.hasVectorRegisterExcess();
2796 }
2797
2798 if (HasVectorRegisterExcess || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
2799 // In addition to register usage being above addressable limits, occupancy
2800 // below the minimum is considered like "spilling" as well.
2801 TargetOcc = std::nullopt;
2802 } else {
2803 // There is no spilling and room to improve occupancy; set up "increased
2804 // occupancy targets" for all regions.
2805 TargetOcc = DAG.MinOccupancy + 1;
2806 const unsigned VGPRBlockSize = MFI.getDynamicVGPRBlockSize();
2807 MaxSGPRs = ST.getMaxNumSGPRs(WavesPerEU: *TargetOcc, Addressable: false);
2808 MaxVGPRs = ST.getMaxNumVGPRs(WavesPerEU: *TargetOcc, DynamicVGPRBlockSize: VGPRBlockSize);
2809 for (auto [I, Target] : enumerate(First&: RPTargets)) {
2810 Target.setTarget(NumSGPRs: MaxSGPRs, NumVGPRs: MaxVGPRs);
2811 if (!Target.satisfied())
2812 TargetRegions.set(I);
2813 }
2814 }
2815
2816 return TargetRegions.any();
2817}
2818
/// Scans all scheduling regions for instructions that define a single-def
/// virtual register whose only non-debug user lives in a different region and
/// that can be rematerialized at that user without extending any live range.
/// Candidates are appended to RematRegs. Returns true if at least one
/// candidate was found.
bool PreRARematStage::collectRematRegs(
    const DenseMap<MachineInstr *, unsigned> &MIRegion) {
  // We need up-to-date live-out info. to query live-out register masks in
  // regions containing rematerializable instructions.
  DAG.RegionLiveOuts.buildLiveRegMap();

  // Set of registers already marked for potential rematerialization; used to
  // avoid rematerialization chains.
  SmallSet<Register, 4> MarkedRegs;
  auto IsMarkedForRemat = [&MarkedRegs](const MachineOperand &MO) -> bool {
    return MO.isReg() && MarkedRegs.contains(V: MO.getReg());
  };

  // Identify rematerializable instructions in the function.
  for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
    RegionBoundaries Bounds = DAG.Regions[I];
    for (auto MI = Bounds.first; MI != Bounds.second; ++MI) {
      // The instruction must be rematerializable.
      MachineInstr &DefMI = *MI;
      if (!isReMaterializable(MI: DefMI))
        continue;

      // We only support rematerializing virtual registers with one
      // definition.
      Register Reg = DefMI.getOperand(i: 0).getReg();
      if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(RegNo: Reg))
        continue;

      // We only care to rematerialize the instruction if it has a single
      // non-debug user in a different region.
      // FIXME: Allow rematerializations with multiple uses. This should be
      // relatively easy to support using the current cost model.
      MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(RegNo: Reg);
      if (!UseMI)
        continue;
      auto UseRegion = MIRegion.find(Val: UseMI);
      if (UseRegion == MIRegion.end() || UseRegion->second == I)
        continue;

      // Do not rematerialize an instruction if it uses or is used by an
      // instruction that we have designated for rematerialization.
      // FIXME: Allow for rematerialization chains: this requires 1. updating
      // remat points to account for uses that are rematerialized, and 2.
      // either rematerializing the candidates in careful ordering, or
      // deferring the MBB RP walk until the entire chain has been
      // rematerialized.
      const MachineOperand &UseMO = UseMI->getOperand(i: 0);
      if (IsMarkedForRemat(UseMO) ||
          llvm::any_of(Range: DefMI.operands(), P: IsMarkedForRemat))
        continue;

      // Do not rematerialize an instruction if it uses registers that aren't
      // available at its use. This ensures that we are not extending any live
      // range while rematerializing.
      SlotIndex UseIdx = DAG.LIS->getInstructionIndex(Instr: *UseMI).getRegSlot(EC: true);
      if (!VirtRegAuxInfo::allUsesAvailableAt(MI: &DefMI, UseIdx, LIS: *DAG.LIS, MRI: DAG.MRI,
                                              TII: *DAG.TII))
        continue;

      // Add the instruction to the rematerializable list.
      MarkedRegs.insert(V: Reg);
      RematRegs.emplace_back(Args: &DefMI, Args&: UseMI, Args&: DAG, Args: MIRegion);
    }
  }

  return !RematRegs.empty();
}
2886
2887PreRARematStage::RematReg::RematReg(
2888 MachineInstr *DefMI, MachineInstr *UseMI, GCNScheduleDAGMILive &DAG,
2889 const DenseMap<MachineInstr *, unsigned> &MIRegion)
2890 : DefMI(DefMI), UseMI(UseMI), LiveIn(DAG.Regions.size()),
2891 LiveOut(DAG.Regions.size()), Live(DAG.Regions.size()),
2892 DefRegion(MIRegion.at(Val: DefMI)), UseRegion(MIRegion.at(Val: UseMI)) {
2893
2894 // Mark regions in which the rematerializable register is live.
2895 Register Reg = getReg();
2896 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2897 auto LiveInIt = DAG.LiveIns[I].find(Val: Reg);
2898 if (LiveInIt != DAG.LiveIns[I].end())
2899 LiveIn.set(I);
2900 const auto &LiveOuts = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: I);
2901 if (auto LiveOutIt = LiveOuts.find(Val: Reg); LiveOutIt != LiveOuts.end())
2902 LiveOut.set(I);
2903 }
2904 Live |= LiveIn;
2905 Live |= LiveOut;
2906 Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: DefRegion).at(Val: Reg);
2907}
2908
2909bool PreRARematStage::RematReg::maybeBeneficial(
2910 const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets) const {
2911 Register Reg = getReg();
2912 for (unsigned I : TargetRegions.set_bits()) {
2913 if (Live[I] && RPTargets[I].isSaveBeneficial(Reg))
2914 return true;
2915 }
2916 return false;
2917}
2918
2919void PreRARematStage::RematReg::insertMI(unsigned RegionIdx,
2920 MachineInstr *RematMI,
2921 GCNScheduleDAGMILive &DAG) const {
2922 RegionBoundaries &Bounds = DAG.Regions[RegionIdx];
2923 if (Bounds.first == std::next(x: MachineBasicBlock::iterator(RematMI)))
2924 Bounds.first = RematMI;
2925 DAG.LIS->InsertMachineInstrInMaps(MI&: *RematMI);
2926 DAG.LIS->createAndComputeVirtRegInterval(Reg: RematMI->getOperand(i: 0).getReg());
2927}
2928
2929PreRARematStage::ScoredRemat::FreqInfo::FreqInfo(
2930 MachineFunction &MF, const GCNScheduleDAGMILive &DAG) {
2931 assert(DAG.MLI && "MLI not defined in DAG");
2932 MachineBranchProbabilityInfo MBPI;
2933 MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI);
2934
2935 const unsigned NumRegions = DAG.Regions.size();
2936 MinFreq = MBFI.getEntryFreq().getFrequency();
2937 MaxFreq = 0;
2938 Regions.reserve(N: NumRegions);
2939 for (unsigned I = 0; I < NumRegions; ++I) {
2940 MachineBasicBlock *MBB = DAG.Regions[I].first->getParent();
2941 uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency();
2942 Regions.push_back(Elt: BlockFreq);
2943 if (BlockFreq && BlockFreq < MinFreq)
2944 MinFreq = BlockFreq;
2945 else if (BlockFreq > MaxFreq)
2946 MaxFreq = BlockFreq;
2947 }
2948 if (!MinFreq)
2949 return;
2950
2951 // Scale everything down if frequencies are high.
2952 if (MinFreq >= ScaleFactor * ScaleFactor) {
2953 for (uint64_t &Freq : Regions)
2954 Freq /= ScaleFactor;
2955 MinFreq /= ScaleFactor;
2956 MaxFreq /= ScaleFactor;
2957 }
2958}
2959
/// Wraps rematerialization candidate \p Remat with scoring state: the
/// frequency delta between its defining and using regions, and the register
/// pressure attributable to the candidate's register (its full live mask in
/// the defining region), used later to evaluate per-region RP savings.
PreRARematStage::ScoredRemat::ScoredRemat(RematReg *Remat, const FreqInfo &Freq,
                                          const GCNScheduleDAGMILive &DAG)
    : Remat(Remat), FreqDiff(getFreqDiff(Freq)) {
  RPSave.inc(Reg: Remat->getReg(), PrevMask: LaneBitmask::getNone(), NewMask: Remat->Mask, MRI: DAG.MRI);
}
2965
2966int64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const {
2967 // Get frequencies of defining and using regions. A rematerialization from the
2968 // least frequent region to the most frequent region will yield the greatest
2969 // latency penalty and therefore should get minimum score. Reciprocally, a
2970 // rematerialization in the other direction should get maximum score. Default
2971 // to values that will yield the worst possible score given known frequencies
2972 // in order to penalize rematerializations from or into regions whose
2973 // frequency is unknown.
2974 int64_t DefOrMin = std::max(a: Freq.Regions[Remat->DefRegion], b: Freq.MinFreq);
2975 int64_t UseOrMax = Freq.Regions[Remat->UseRegion];
2976 if (!UseOrMax)
2977 UseOrMax = Freq.MaxFreq;
2978 return DefOrMin - UseOrMax;
2979}
2980
/// Recomputes this candidate's scoring accumulators against the current
/// target regions and RP targets. RegionImpact sums the number of registers
/// the remat would save per unsatisfied target region (doubled for regions
/// where the register is an unused live-through, i.e. where the save is
/// guaranteed). When \p ReduceSpill is set, MaxFreq records the highest
/// (penalized) frequency among impacted regions.
void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
                                          ArrayRef<GCNRPTarget> RPTargets,
                                          const FreqInfo &FreqInfo,
                                          bool ReduceSpill) {
  // Reset accumulators before re-scoring.
  MaxFreq = 0;
  RegionImpact = 0;
  for (unsigned I : TargetRegions.set_bits()) {
    if (!Remat->Live[I])
      continue;

    // The rematerialization must contribute positively in at least one
    // register class with usage above the RP target for this region to
    // contribute to the score.
    const GCNRPTarget &RegionTarget = RPTargets[I];
    const unsigned NumRegsBenefit = RegionTarget.getNumRegsBenefit(SaveRP: RPSave);
    if (!NumRegsBenefit)
      continue;

    bool UnusedLT = Remat->isUnusedLiveThrough(I);

    // Regions in which RP is guaranteed to decrease have more weight.
    RegionImpact += (UnusedLT ? 2 : 1) * NumRegsBenefit;

    if (ReduceSpill) {
      uint64_t Freq = FreqInfo.Regions[I];
      if (!UnusedLT) {
        // Apply a frequency penalty in regions in which we are not sure that RP
        // will decrease.
        Freq /= 2;
      }
      MaxFreq = std::max(a: MaxFreq, b: Freq);
    }
  }
}
3015
/// Rematerializes this candidate's defining instruction immediately before
/// its single user on a fresh virtual register, substitutes that register
/// into the user, and optimistically updates RP targets and region
/// live-in/live-out sets everywhere the original register was live. Regions
/// whose RP may not have decreased as assumed are flagged in \p RecomputeRP.
/// Returns the rematerialized instruction. Note the original def is not
/// deleted here; that happens on commit.
MachineInstr *PreRARematStage::ScoredRemat::rematerialize(
    BitVector &RecomputeRP, SmallVectorImpl<GCNRPTarget> &RPTargets,
    GCNScheduleDAGMILive &DAG) const {
  const SIInstrInfo *TII = DAG.MF.getSubtarget<GCNSubtarget>().getInstrInfo();
  MachineInstr &DefMI = *Remat->DefMI;
  Register Reg = DefMI.getOperand(i: 0).getReg();
  Register NewReg = DAG.MRI.cloneVirtualRegister(VReg: Reg);

  // Rematerialize the register in the region where it is used.
  MachineBasicBlock::iterator InsertPos = Remat->UseMI;
  TII->reMaterialize(MBB&: *InsertPos->getParent(), MI: InsertPos, DestReg: NewReg, SubIdx: 0, Orig: DefMI);
  MachineInstr *RematMI = &*std::prev(x: InsertPos);
  Remat->UseMI->substituteRegister(FromReg: Reg, ToReg: NewReg, SubIdx: 0, RegInfo: *DAG.TRI);
  Remat->insertMI(RegionIdx: Remat->UseRegion, RematMI, DAG);

#ifdef EXPENSIVE_CHECKS
  // All uses are known to be available / live at the remat point. Thus,
  // the uses should already be live in to the using region.
  for (MachineOperand &MO : DefMI.operands()) {
    if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
      continue;

    Register UseReg = MO.getReg();
    if (!UseReg.isVirtual())
      continue;

    LiveInterval &LI = DAG.LIS->getInterval(UseReg);
    LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
    if (LI.hasSubRanges() && MO.getSubReg())
      LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());

    LaneBitmask LiveInMask = DAG.LiveIns[Remat->UseRegion].at(UseReg);
    LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
    // If this register has lanes not covered by the LiveIns, be sure they
    // do not map to any subrange. ref:
    // machine-scheduler-sink-trivial-remats.mir::omitted_subrange
    if (UncoveredLanes.any()) {
      assert(LI.hasSubRanges());
      for (LiveInterval::SubRange &SR : LI.subranges())
        assert((SR.LaneMask & UncoveredLanes).none());
    }
  }
#endif

  // Remove the register from all regions where it is a live-in or live-out
  // and adjust RP targets. The save is guaranteed in regions in which the
  // register is live-through and unused but optimistic in all other regions
  // where the register is live.
  for (unsigned I : Remat->Live.set_bits()) {
    RPTargets[I].saveRP(SaveRP: RPSave);
    DAG.LiveIns[I].erase(Val: Reg);
    DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: I).erase(Val: Reg);
    if (!Remat->isUnusedLiveThrough(I))
      RecomputeRP.set(I);
  }

  return RematMI;
}
3074
3075void PreRARematStage::commitRematerializations() const {
3076 REMAT_DEBUG(dbgs() << "Commiting all rematerializations\n");
3077 for (const RollbackInfo &Rollback : Rollbacks)
3078 DAG.deleteMI(RegionIdx: Rollback.Remat->DefRegion, MI: Rollback.Remat->DefMI);
3079}
3080
3081void PreRARematStage::unsetSatisfiedRPTargets(const BitVector &Regions) {
3082 for (unsigned I : Regions.set_bits()) {
3083 if (TargetRegions[I] && RPTargets[I].satisfied()) {
3084 REMAT_DEBUG(dbgs() << " [" << I << "] Target reached!\n");
3085 TargetRegions.reset(Idx: I);
3086 }
3087 }
3088}
3089
3090bool PreRARematStage::updateAndVerifyRPTargets(const BitVector &Regions) {
3091 bool TooOptimistic = false;
3092 for (unsigned I : Regions.set_bits()) {
3093 GCNRPTarget &Target = RPTargets[I];
3094 Target.setRP(DAG.getRealRegPressure(RegionIdx: I));
3095
3096 // Since we were optimistic in assessing RP decreases in these regions, we
3097 // may need to remark the target as a target region if RP didn't decrease
3098 // as expected.
3099 if (!TargetRegions[I] && !Target.satisfied()) {
3100 REMAT_DEBUG(dbgs() << " [" << I << "] Incorrect RP estimation\n");
3101 TooOptimistic = true;
3102 TargetRegions.set(I);
3103 }
3104 }
3105 return TooOptimistic;
3106}
3107
3108// Copied from MachineLICM
3109bool PreRARematStage::isReMaterializable(const MachineInstr &MI) {
3110 if (!DAG.TII->isReMaterializable(MI))
3111 return false;
3112
3113 for (const MachineOperand &MO : MI.all_uses()) {
3114 // We can't remat physreg uses, unless it is a constant or an ignorable
3115 // use (e.g. implicit exec use on VALU instructions)
3116 if (MO.getReg().isPhysical()) {
3117 if (DAG.MRI.isConstantPhysReg(PhysReg: MO.getReg()) || DAG.TII->isIgnorableUse(MO))
3118 continue;
3119 return false;
3120 }
3121 }
3122
3123 return true;
3124}
3125
3126void PreRARematStage::finalizeGCNSchedStage() {
3127 // We consider that reducing spilling is always beneficial so we never
3128 // rollback rematerializations or revert scheduling in such cases.
3129 if (!TargetOcc)
3130 return;
3131
3132 // When increasing occupancy, it is possible that re-scheduling is not able to
3133 // achieve the target occupancy in all regions, in which case re-scheduling in
3134 // all regions should be reverted.
3135 if (DAG.MinOccupancy >= *TargetOcc) {
3136 commitRematerializations();
3137 return;
3138 }
3139
3140 // It is possible that re-scheduling lowers occupancy over the one achieved
3141 // just through rematerializations, in which case we revert re-scheduling in
3142 // all regions but do not roll back rematerializations.
3143 const bool ShouldRollbackRemats = AchievedOcc < *TargetOcc;
3144
3145 // When we both need to revert re-scheduling and rollback rematerializations,
3146 // restore rematerialized MIs' original state before reverting so that they
3147 // are treated as non-debug instructions by the revert logic.
3148 if (ShouldRollbackRemats) {
3149 for (const RollbackInfo &Rollback : Rollbacks) {
3150 const auto &[Remat, RematMI, RegMap] = Rollback;
3151 Remat->DefMI->setDesc(DAG.TII->get(Opcode: RematMI->getOpcode()));
3152 for (const auto &[MOIdx, Reg] : RegMap)
3153 Remat->DefMI->getOperand(i: MOIdx).setReg(Reg);
3154 }
3155 }
3156
3157 // Revert re-scheduling in all affected regions.
3158 for (const auto &[RegionIdx, OrigMIOrder, MaxPressure] : RegionReverts) {
3159 REMAT_DEBUG(dbgs() << "Reverting re-scheduling in region " << RegionIdx
3160 << '\n');
3161 DAG.Pressure[RegionIdx] = MaxPressure;
3162 modifyRegionSchedule(RegionIdx, MBB: RegionBB[RegionIdx], MIOrder: OrigMIOrder);
3163 }
3164
3165 if (!ShouldRollbackRemats) {
3166 commitRematerializations();
3167 DAG.setTargetOccupancy(AchievedOcc);
3168 return;
3169 }
3170
3171 // Reset the target occupancy to what it was pre-rematerialization.
3172 DAG.setTargetOccupancy(*TargetOcc - 1);
3173
3174 // Finish rolling back rematerializations, then recompute pressure in all
3175 // affected regions.
3176 REMAT_DEBUG(dbgs() << "==== ROLLBACK ====\n");
3177 BitVector RecomputeRP(DAG.Regions.size());
3178 DenseSet<Register> RecomputeLI;
3179 for (const RollbackInfo &Rollback : Rollbacks) {
3180 const auto &[Remat, RematMI, RegMap] = Rollback;
3181
3182 // Switch back to using the original register and delete the
3183 // rematerialization.
3184 Register Reg = RematMI->getOperand(i: 0).getReg();
3185 Register OriginalReg = Remat->DefMI->getOperand(i: 0).getReg();
3186 Remat->UseMI->substituteRegister(FromReg: Reg, ToReg: OriginalReg, SubIdx: 0, RegInfo: *DAG.TRI);
3187 REMAT_DEBUG(dbgs() << '[' << Remat->UseRegion
3188 << "] Deleting rematerialization " << *RematMI);
3189 DAG.deleteMI(RegionIdx: Remat->UseRegion, MI: RematMI);
3190
3191 // Re-add the defined register as a live-in/live-out in all regions it used
3192 // to be one in.
3193 std::pair<Register, LaneBitmask> LiveReg(OriginalReg, Remat->Mask);
3194 for (unsigned I : Remat->LiveIn.set_bits())
3195 DAG.LiveIns[I].insert(KV: LiveReg);
3196 for (unsigned I : Remat->LiveOut.set_bits())
3197 DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: I).insert(KV: LiveReg);
3198
3199 RecomputeRP |= Rollback.Remat->Live;
3200 // Regenerate intervals for all register operands of rematerialized MIs as
3201 // slot indices may have changed slightly from before re-scheduling.
3202 for (MachineOperand &MO : Rollback.Remat->DefMI->operands()) {
3203 if (MO.isReg() && MO.getReg().isVirtual())
3204 RecomputeLI.insert(V: MO.getReg());
3205 }
3206 }
3207 for (Register Reg : RecomputeLI) {
3208 DAG.LIS->removeInterval(Reg);
3209 DAG.LIS->createAndComputeVirtRegInterval(Reg);
3210 }
3211#ifdef EXPENSIVE_CHECKS
3212 // In particular, we want to check for coherent MI/slot order in regions in
3213 // which reverts and/or rollbacks may have happened.
3214 MF.verify();
3215#endif
3216 for (unsigned I : RecomputeRP.set_bits())
3217 DAG.Pressure[I] = DAG.getRealRegPressure(RegionIdx: I);
3218
3219 GCNSchedStage::finalizeGCNSchedStage();
3220}
3221
3222void GCNScheduleDAGMILive::deleteMI(unsigned RegionIdx, MachineInstr *MI) {
3223 // It's not possible for the deleted instruction to be upper region boundary
3224 // since we don't delete region terminators.
3225 if (Regions[RegionIdx].first == MI)
3226 Regions[RegionIdx].first = std::next(x: MachineBasicBlock::iterator(MI));
3227 LIS->removeInterval(Reg: MI->getOperand(i: 0).getReg());
3228 LIS->RemoveMachineInstrFromMaps(MI&: *MI);
3229 MI->eraseFromParent();
3230}
3231
3232void GCNScheduleDAGMILive::setTargetOccupancy(unsigned TargetOccupancy) {
3233 MinOccupancy = TargetOccupancy;
3234 if (MFI.getOccupancy() < TargetOccupancy)
3235 MFI.increaseOccupancy(MF, Limit: MinOccupancy);
3236 else
3237 MFI.limitOccupancy(Limit: MinOccupancy);
3238}
3239
3240static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
3241 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
3242 return any_of(Range&: *DAG, P: [SII](MachineBasicBlock::iterator MI) {
3243 return SII->isIGLPMutationOnly(Opcode: MI->getOpcode());
3244 });
3245}
3246
// Forwards all arguments to the ScheduleDAGMI base; no additional
// initialization is performed here.
GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
    bool RemoveKillFlags)
    : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
3251
3252void GCNPostScheduleDAGMILive::schedule() {
3253 HasIGLPInstrs = hasIGLPInstrs(DAG: this);
3254 if (HasIGLPInstrs) {
3255 SavedMutations.clear();
3256 SavedMutations.swap(x&: Mutations);
3257 addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PostRA));
3258 }
3259
3260 ScheduleDAGMI::schedule();
3261}
3262
3263void GCNPostScheduleDAGMILive::finalizeSchedule() {
3264 if (HasIGLPInstrs)
3265 SavedMutations.swap(x&: Mutations);
3266
3267 ScheduleDAGMI::finalizeSchedule();
3268}
3269