1//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This contains a MachineSchedStrategy implementation for maximizing wave
11/// occupancy on GCN hardware.
12///
13/// This pass will apply multiple scheduling stages to the same function.
14/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15/// entry point for the scheduling of those regions is
16/// GCNScheduleDAGMILive::runSchedStages.
17
18/// Generally, the reason for having multiple scheduling stages is to account
19/// for the kernel-wide effect of register usage on occupancy. Usually, only a
20/// few scheduling regions will have register pressure high enough to limit
21/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22/// other regions.
23///
24//===----------------------------------------------------------------------===//
25
26#include "GCNSchedStrategy.h"
27#include "AMDGPUIGroupLP.h"
28#include "GCNHazardRecognizer.h"
29#include "GCNRegPressure.h"
30#include "SIMachineFunctionInfo.h"
31#include "Utils/AMDGPUBaseInfo.h"
32#include "llvm/ADT/BitVector.h"
33#include "llvm/ADT/STLExtras.h"
34#include "llvm/CodeGen/CalcSpillWeights.h"
35#include "llvm/CodeGen/MachineBasicBlock.h"
36#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
37#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
38#include "llvm/CodeGen/MachineOperand.h"
39#include "llvm/CodeGen/RegisterClassInfo.h"
40#include "llvm/CodeGen/Rematerializer.h"
41#include "llvm/MC/LaneBitmask.h"
42#include "llvm/MC/MCSchedule.h"
43#include "llvm/MC/TargetRegistry.h"
44#include "llvm/Support/ErrorHandling.h"
45
46#define DEBUG_TYPE "machine-scheduler"
47
48using namespace llvm;
49
50static cl::opt<bool> DisableUnclusterHighRP(
51 "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
52 cl::desc("Disable unclustered high register pressure "
53 "reduction scheduling stage."),
54 cl::init(Val: false));
55
56static cl::opt<bool> DisableClusteredLowOccupancy(
57 "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
58 cl::desc("Disable clustered low occupancy "
59 "rescheduling for ILP scheduling stage."),
60 cl::init(Val: false));
61
62static cl::opt<unsigned> ScheduleMetricBias(
63 "amdgpu-schedule-metric-bias", cl::Hidden,
64 cl::desc(
65 "Sets the bias which adds weight to occupancy vs latency. Set it to "
66 "100 to chase the occupancy only."),
67 cl::init(Val: 10));
68
69static cl::opt<bool>
70 RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
71 cl::desc("Relax occupancy targets for kernels which are memory "
72 "bound (amdgpu-membound-threshold), or "
73 "Wave Limited (amdgpu-limit-wave-threshold)."),
74 cl::init(Val: false));
75
76static cl::opt<bool> GCNTrackers(
77 "amdgpu-use-amdgpu-trackers", cl::Hidden,
78 cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
79 cl::init(Val: false));
80
81static cl::opt<unsigned> PendingQueueLimit(
82 "amdgpu-scheduler-pending-queue-limit", cl::Hidden,
83 cl::desc(
84 "Max (Available+Pending) size to inspect pending queue (0 disables)"),
85 cl::init(Val: 256));
86
87#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
88#define DUMP_MAX_REG_PRESSURE
89static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler(
90 "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden,
91 cl::desc("Print a list of live registers along with their def/uses at the "
92 "point of maximum register pressure before scheduling."),
93 cl::init(false));
94
95static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler(
96 "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden,
97 cl::desc("Print a list of live registers along with their def/uses at the "
98 "point of maximum register pressure after scheduling."),
99 cl::init(false));
100#endif
101
102static cl::opt<bool> DisableRewriteMFMAFormSchedStage(
103 "amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden,
104 cl::desc("Disable rewrite mfma rewrite scheduling stage"), cl::init(Val: true));
105
106namespace {
107
108struct VGPRThresholdParser : public cl::parser<unsigned> {
109 VGPRThresholdParser(cl::Option &O) : cl::parser<unsigned>(O) {}
110
111 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
112 if (Arg.getAsInteger(Radix: 0, Result&: Value))
113 return O.error(Message: "'" + Arg + "' value invalid for uint argument!");
114
115 if (Value > 100)
116 return O.error(Message: "'" + Arg + "' value must be in the range [0, 100]!");
117
118 return false;
119 }
120};
121
122} // end anonymous namespace
123
124static cl::opt<unsigned, false, VGPRThresholdParser> VGPRThresholdPercentOpt(
125 "amdgpu-vgpr-threshold-percent", cl::Hidden,
126 cl::desc("Percent of VGPR limits that we should use as RP threshold "
127 "during scheduling. We have two limits relevant to scheduling: "
128 "Critical (avoid decreasing occupancy), Excess (avoid spilling). "
129 "This flag scales both limits back by an equal percent: (0 = use "
130 " default calculation, 1-100 = use percentage), default: 0"),
131 cl::init(Val: 0));
132
133const unsigned ScheduleMetrics::ScaleFactor = 100;
134
135GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
136 : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
137 DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
138 if (GCNTrackers.getNumOccurrences() > 0)
139 GCNTrackersOverride = GCNTrackers;
140}
141
142void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
143 GenericScheduler::initialize(dag: DAG);
144
145 MF = &DAG->MF;
146
147 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
148
149 SGPRExcessLimit =
150 Context->RegClassInfo->getNumAllocatableRegs(RC: &AMDGPU::SGPR_32RegClass);
151 VGPRExcessLimit =
152 Context->RegClassInfo->getNumAllocatableRegs(RC: &AMDGPU::VGPR_32RegClass);
153
154 SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
155 // Set the initial TargetOccupnacy to the maximum occupancy that we can
156 // achieve for this function. This effectively sets a lower bound on the
157 // 'Critical' register limits in the scheduler.
158 // Allow for lower occupancy targets if kernel is wave limited or memory
159 // bound, and using the relaxed occupancy feature.
160 TargetOccupancy =
161 RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
162 SGPRCriticalLimit =
163 std::min(a: ST.getMaxNumSGPRs(WavesPerEU: TargetOccupancy, Addressable: true), b: SGPRExcessLimit);
164
165 if (!KnownExcessRP) {
166 VGPRCriticalLimit = std::min(
167 a: ST.getMaxNumVGPRs(WavesPerEU: TargetOccupancy, DynamicVGPRBlockSize: MFI.getDynamicVGPRBlockSize()),
168 b: VGPRExcessLimit);
169 } else {
170 // This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except
171 // returns a reasonably small number for targets with lots of VGPRs, such
172 // as GFX10 and GFX11.
173 LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
174 "VGPRCriticalLimit calculation method.\n");
175 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
176 unsigned Granule =
177 AMDGPU::IsaInfo::getVGPRAllocGranule(STI: ST, DynamicVGPRBlockSize);
178 unsigned Addressable =
179 AMDGPU::IsaInfo::getAddressableNumVGPRs(STI: ST, DynamicVGPRBlockSize);
180 unsigned VGPRBudget = alignDown(Value: Addressable / TargetOccupancy, Align: Granule);
181 VGPRBudget = std::max(a: VGPRBudget, b: Granule);
182 VGPRCriticalLimit = std::min(a: VGPRBudget, b: VGPRExcessLimit);
183 }
184 // Apply VGPR excess threshold percentage if specified.
185 if (VGPRThresholdPercentOpt > 0) {
186 [[maybe_unused]] unsigned OriginalVGPRExcessLimit = VGPRExcessLimit;
187 [[maybe_unused]] unsigned OriginalVGPRCriticalLimit = VGPRCriticalLimit;
188 VGPRExcessLimit = (VGPRThresholdPercentOpt * VGPRExcessLimit + 99) / 100;
189 VGPRCriticalLimit =
190 (VGPRThresholdPercentOpt * VGPRCriticalLimit + 99) / 100;
191 LLVM_DEBUG(dbgs() << "Applied VGPR excess threshold "
192 << VGPRThresholdPercentOpt << "%, VGPRExcessLimit: "
193 << OriginalVGPRExcessLimit << " -> " << VGPRExcessLimit
194 << ". VGPRCriticalLimit: " << OriginalVGPRCriticalLimit
195 << " -> " << VGPRCriticalLimit << '\n');
196 } else {
197 VGPRExcessLimit -= std::min(a: VGPRLimitBias + ErrorMargin, b: VGPRExcessLimit);
198 VGPRCriticalLimit -=
199 std::min(a: VGPRLimitBias + ErrorMargin, b: VGPRCriticalLimit);
200 }
201
202 // Subtract error margin and bias from register limits and avoid overflow.
203 SGPRCriticalLimit -= std::min(a: SGPRLimitBias + ErrorMargin, b: SGPRCriticalLimit);
204 SGPRExcessLimit -= std::min(a: SGPRLimitBias + ErrorMargin, b: SGPRExcessLimit);
205 LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
206 << ", VGPRExcessLimit = " << VGPRExcessLimit
207 << ", SGPRCriticalLimit = " << SGPRCriticalLimit
208 << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
209}
210
211/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
212/// current register pressure.
213///
214/// This works for the common case, but it has a few exceptions that have been
215/// observed through trial and error:
216/// - Explicit physical register operands
217/// - Subregister definitions
218///
219/// In both of those cases, PressureDiff doesn't represent the actual pressure,
220/// and querying LiveIntervals through the RegPressureTracker is needed to get
221/// an accurate value.
222///
223/// We should eventually only use PressureDiff for maximum performance, but this
224/// already allows 80% of SUs to take the fast path without changing scheduling
225/// at all. Further changes would either change scheduling, or require a lot
226/// more logic to recover an accurate pressure estimate from the PressureDiffs.
227static bool canUsePressureDiffs(const SUnit &SU) {
228 if (!SU.isInstr())
229 return false;
230
231 // Cannot use pressure diffs for subregister defs or with physregs, it's
232 // imprecise in both cases.
233 for (const auto &Op : SU.getInstr()->operands()) {
234 if (!Op.isReg() || Op.isImplicit())
235 continue;
236 if (Op.getReg().isPhysical() ||
237 (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
238 return false;
239 }
240 return true;
241}
242
243void GCNSchedStrategy::getRegisterPressures(
244 bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
245 std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
246 GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
247 ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
248 // getDownwardPressure() and getUpwardPressure() make temporary changes to
249 // the tracker, so we need to pass those function a non-const copy.
250 RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
251 if (!useGCNTrackers()) {
252 AtTop
253 ? TempTracker.getDownwardPressure(MI: SU->getInstr(), PressureResult&: Pressure, MaxPressureResult&: MaxPressure)
254 : TempTracker.getUpwardPressure(MI: SU->getInstr(), PressureResult&: Pressure, MaxPressureResult&: MaxPressure);
255
256 return;
257 }
258
259 // GCNTrackers
260 Pressure.resize(new_size: 4, x: 0);
261 MachineInstr *MI = SU->getInstr();
262 GCNRegPressure NewPressure;
263 if (AtTop) {
264 GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
265 NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, TRI: SRI);
266 } else {
267 GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
268 TempUpwardTracker.recede(MI: *MI);
269 NewPressure = TempUpwardTracker.getPressure();
270 }
271 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
272 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
273 NewPressure.getArchVGPRNum();
274 Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
275}
276
277unsigned GCNSchedStrategy::getStructuralStallCycles(SchedBoundary &Zone,
278 SUnit *SU) const {
279 // Only implemented for top-down scheduling currently.
280 if (!Zone.isTop() || !SU)
281 return 0;
282
283 MachineInstr *MI = SU->getInstr();
284 unsigned CurrCycle = Zone.getCurrCycle();
285 unsigned Stall = 0;
286
287 // Query SchedModel for resource stalls (unbuffered resources).
288 if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
289 const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
290 for (const MCWriteProcResEntry &PE :
291 make_range(x: SchedModel->getWriteProcResBegin(SC),
292 y: SchedModel->getWriteProcResEnd(SC))) {
293 unsigned NextAvail =
294 Zone.getNextResourceCycle(SC, PIdx: PE.ProcResourceIdx, ReleaseAtCycle: PE.ReleaseAtCycle,
295 AcquireAtCycle: PE.AcquireAtCycle)
296 .first;
297 if (NextAvail > CurrCycle)
298 Stall = std::max(a: Stall, b: NextAvail - CurrCycle);
299 }
300 }
301
302 // Query HazardRecognizer for sequence-dependent hazard penalties.
303 // AMDGPU currently installs GCNHazardRecognizer for MI scheduling only in
304 // the post-RA configuration without vreg liveness.
305 if (!DAG->hasVRegLiveness() && Zone.HazardRec &&
306 Zone.HazardRec->isEnabled()) {
307 auto *HR = static_cast<GCNHazardRecognizer *>(Zone.HazardRec);
308 Stall = std::max(a: Stall, b: HR->getHazardWaitStates(MI));
309 }
310
311 return Stall;
312}
313
314void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
315 bool AtTop,
316 const RegPressureTracker &RPTracker,
317 const SIRegisterInfo *SRI,
318 unsigned SGPRPressure,
319 unsigned VGPRPressure, bool IsBottomUp) {
320 Cand.SU = SU;
321 Cand.AtTop = AtTop;
322
323 if (!DAG->isTrackingPressure())
324 return;
325
326 Pressure.clear();
327 MaxPressure.clear();
328
329 // We try to use the cached PressureDiffs in the ScheduleDAG whenever
330 // possible over querying the RegPressureTracker.
331 //
332 // RegPressureTracker will make a lot of LIS queries which are very
333 // expensive, it is considered a slow function in this context.
334 //
335 // PressureDiffs are precomputed and cached, and getPressureDiff is just a
336 // trivial lookup into an array. It is pretty much free.
337 //
338 // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
339 // PressureDiffs.
340 if (AtTop || !canUsePressureDiffs(SU: *SU) || useGCNTrackers()) {
341 getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
342 DownwardTracker, UpwardTracker, DAG, SRI);
343 } else {
344 // Reserve 4 slots.
345 Pressure.resize(new_size: 4, x: 0);
346 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
347 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
348
349 for (const auto &Diff : DAG->getPressureDiff(SU)) {
350 if (!Diff.isValid())
351 continue;
352 // PressureDiffs is always bottom-up so if we're working top-down we need
353 // to invert its sign.
354 Pressure[Diff.getPSet()] +=
355 (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
356 }
357
358#ifdef EXPENSIVE_CHECKS
359 std::vector<unsigned> CheckPressure, CheckMaxPressure;
360 getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
361 DownwardTracker, UpwardTracker, DAG, SRI);
362 if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
363 CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
364 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
365 CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
366 errs() << "Register Pressure is inaccurate when calculated through "
367 "PressureDiff\n"
368 << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
369 << ", expected "
370 << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
371 << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
372 << ", expected "
373 << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
374 report_fatal_error("inaccurate register pressure calculation");
375 }
376#endif
377 }
378
379 unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
380 unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
381
382 // If two instructions increase the pressure of different register sets
383 // by the same amount, the generic scheduler will prefer to schedule the
384 // instruction that increases the set with the least amount of registers,
385 // which in our case would be SGPRs. This is rarely what we want, so
386 // when we report excess/critical register pressure, we do it either
387 // only for VGPRs or only for SGPRs.
388
389 // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
390 const unsigned MaxVGPRPressureInc = 16;
391 bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
392 bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
393
394 // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
395 // to increase the likelihood we don't go over the limits. We should improve
396 // the analysis to look through dependencies to find the path with the least
397 // register pressure.
398
399 // We only need to update the RPDelta for instructions that increase register
400 // pressure. Instructions that decrease or keep reg pressure the same will be
401 // marked as RegExcess in tryCandidate() when they are compared with
402 // instructions that increase the register pressure.
403 if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
404 HasHighPressure = true;
405 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
406 Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
407 }
408
409 if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
410 HasHighPressure = true;
411 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
412 Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
413 }
414
415 // Register pressure is considered 'CRITICAL' if it is approaching a value
416 // that would reduce the wave occupancy for the execution unit. When
417 // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
418 // has the same cost, so we don't need to prefer one over the other.
419
420 int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
421 int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
422
423 if (SGPRDelta >= 0 || VGPRDelta >= 0) {
424 HasHighPressure = true;
425 if (SGPRDelta > VGPRDelta) {
426 Cand.RPDelta.CriticalMax =
427 PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
428 Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
429 } else {
430 Cand.RPDelta.CriticalMax =
431 PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
432 Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
433 }
434 }
435}
436
437static bool shouldCheckPending(SchedBoundary &Zone,
438 const TargetSchedModel *SchedModel) {
439 bool HasBufferedModel =
440 SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize();
441 unsigned Combined = Zone.Available.size() + Zone.Pending.size();
442 return Combined <= PendingQueueLimit && HasBufferedModel;
443}
444
445static SUnit *pickOnlyChoice(SchedBoundary &Zone,
446 const TargetSchedModel *SchedModel) {
447 // pickOnlyChoice() releases pending instructions and checks for new hazards.
448 SUnit *OnlyChoice = Zone.pickOnlyChoice();
449 if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty())
450 return OnlyChoice;
451
452 return nullptr;
453}
454
455void GCNSchedStrategy::printCandidateDecision(const SchedCandidate &Current,
456 const SchedCandidate &Preferred) {
457 LLVM_DEBUG({
458 dbgs() << "Prefer:\t\t";
459 DAG->dumpNode(*Preferred.SU);
460
461 if (Current.SU) {
462 dbgs() << "Not:\t";
463 DAG->dumpNode(*Current.SU);
464 }
465
466 dbgs() << "Reason:\t\t";
467 traceCandidate(Preferred);
468 });
469}
470
471// This function is mostly cut and pasted from
472// GenericScheduler::pickNodeFromQueue()
473void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
474 const CandPolicy &ZonePolicy,
475 const RegPressureTracker &RPTracker,
476 SchedCandidate &Cand, bool &IsPending,
477 bool IsBottomUp) {
478 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
479 ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
480 unsigned SGPRPressure = 0;
481 unsigned VGPRPressure = 0;
482 IsPending = false;
483 if (DAG->isTrackingPressure()) {
484 if (!useGCNTrackers()) {
485 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
486 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
487 } else {
488 GCNRPTracker *T = IsBottomUp
489 ? static_cast<GCNRPTracker *>(&UpwardTracker)
490 : static_cast<GCNRPTracker *>(&DownwardTracker);
491 SGPRPressure = T->getPressure().getSGPRNum();
492 VGPRPressure = T->getPressure().getArchVGPRNum();
493 }
494 }
495 LLVM_DEBUG(dbgs() << "Available Q:\n");
496 ReadyQueue &AQ = Zone.Available;
497 for (SUnit *SU : AQ) {
498
499 SchedCandidate TryCand(ZonePolicy);
500 initCandidate(Cand&: TryCand, SU, AtTop: Zone.isTop(), RPTracker, SRI, SGPRPressure,
501 VGPRPressure, IsBottomUp);
502 // Pass SchedBoundary only when comparing nodes from the same boundary.
503 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
504 tryCandidate(Cand, TryCand, Zone: ZoneArg);
505 if (TryCand.Reason != NoCand) {
506 // Initialize resource delta if needed in case future heuristics query it.
507 if (TryCand.ResDelta == SchedResourceDelta())
508 TryCand.initResourceDelta(DAG: Zone.DAG, SchedModel);
509 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
510 Cand.setBest(TryCand);
511 } else {
512 printCandidateDecision(Current: TryCand, Preferred: Cand);
513 }
514 }
515
516 if (!shouldCheckPending(Zone, SchedModel))
517 return;
518
519 LLVM_DEBUG(dbgs() << "Pending Q:\n");
520 ReadyQueue &PQ = Zone.Pending;
521 for (SUnit *SU : PQ) {
522
523 SchedCandidate TryCand(ZonePolicy);
524 initCandidate(Cand&: TryCand, SU, AtTop: Zone.isTop(), RPTracker, SRI, SGPRPressure,
525 VGPRPressure, IsBottomUp);
526 // Pass SchedBoundary only when comparing nodes from the same boundary.
527 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
528 tryPendingCandidate(Cand, TryCand, Zone: ZoneArg);
529 if (TryCand.Reason != NoCand) {
530 // Initialize resource delta if needed in case future heuristics query it.
531 if (TryCand.ResDelta == SchedResourceDelta())
532 TryCand.initResourceDelta(DAG: Zone.DAG, SchedModel);
533 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
534 IsPending = true;
535 Cand.setBest(TryCand);
536 } else {
537 printCandidateDecision(Current: TryCand, Preferred: Cand);
538 }
539 }
540}
541
542// This function is mostly cut and pasted from
543// GenericScheduler::pickNodeBidirectional()
544SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode,
545 bool &PickedPending) {
546 // Schedule as far as possible in the direction of no choice. This is most
547 // efficient, but also provides the best heuristics for CriticalPSets.
548 if (SUnit *SU = pickOnlyChoice(Zone&: Bot, SchedModel)) {
549 IsTopNode = false;
550 return SU;
551 }
552 if (SUnit *SU = pickOnlyChoice(Zone&: Top, SchedModel)) {
553 IsTopNode = true;
554 return SU;
555 }
556 // Set the bottom-up policy based on the state of the current bottom zone
557 // and the instructions outside the zone, including the top zone.
558 CandPolicy BotPolicy;
559 setPolicy(Policy&: BotPolicy, /*IsPostRA=*/false, CurrZone&: Bot, OtherZone: &Top);
560 // Set the top-down policy based on the state of the current top zone and
561 // the instructions outside the zone, including the bottom zone.
562 CandPolicy TopPolicy;
563 setPolicy(Policy&: TopPolicy, /*IsPostRA=*/false, CurrZone&: Top, OtherZone: &Bot);
564
565 bool BotPending = false;
566 // See if BotCand is still valid (because we previously scheduled from Top).
567 LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
568 if (!BotCand.isValid() || BotCand.SU->isScheduled ||
569 BotCand.Policy != BotPolicy) {
570 BotCand.reset(NewPolicy: CandPolicy());
571 pickNodeFromQueue(Zone&: Bot, ZonePolicy: BotPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand,
572 IsPending&: BotPending,
573 /*IsBottomUp=*/true);
574 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
575 } else {
576 LLVM_DEBUG(traceCandidate(BotCand));
577#ifndef NDEBUG
578 if (VerifyScheduling) {
579 SchedCandidate TCand;
580 TCand.reset(CandPolicy());
581 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
582 BotPending,
583 /*IsBottomUp=*/true);
584 assert(TCand.SU == BotCand.SU &&
585 "Last pick result should correspond to re-picking right now");
586 }
587#endif
588 }
589
590 bool TopPending = false;
591 // Check if the top Q has a better candidate.
592 LLVM_DEBUG(dbgs() << "Picking from Top:\n");
593 if (!TopCand.isValid() || TopCand.SU->isScheduled ||
594 TopCand.Policy != TopPolicy) {
595 TopCand.reset(NewPolicy: CandPolicy());
596 pickNodeFromQueue(Zone&: Top, ZonePolicy: TopPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand,
597 IsPending&: TopPending,
598 /*IsBottomUp=*/false);
599 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
600 } else {
601 LLVM_DEBUG(traceCandidate(TopCand));
602#ifndef NDEBUG
603 if (VerifyScheduling) {
604 SchedCandidate TCand;
605 TCand.reset(CandPolicy());
606 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
607 TopPending,
608 /*IsBottomUp=*/false);
609 assert(TCand.SU == TopCand.SU &&
610 "Last pick result should correspond to re-picking right now");
611 }
612#endif
613 }
614
615 // Pick best from BotCand and TopCand.
616 LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
617 dbgs() << "Bot Cand: "; traceCandidate(BotCand););
618 SchedCandidate Cand = BotPending ? TopCand : BotCand;
619 SchedCandidate TryCand = BotPending ? BotCand : TopCand;
620 PickedPending = BotPending && TopPending;
621
622 TryCand.Reason = NoCand;
623 if (BotPending || TopPending) {
624 PickedPending |= tryPendingCandidate(Cand, TryCand&: TopCand, Zone: nullptr);
625 } else {
626 tryCandidate(Cand, TryCand, Zone: nullptr);
627 }
628
629 if (TryCand.Reason != NoCand) {
630 Cand.setBest(TryCand);
631 }
632
633 LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
634
635 IsTopNode = Cand.AtTop;
636 return Cand.SU;
637}
638
639// This function is mostly cut and pasted from
640// GenericScheduler::pickNode()
641SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
642 if (DAG->top() == DAG->bottom()) {
643 assert(Top.Available.empty() && Top.Pending.empty() &&
644 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
645 return nullptr;
646 }
647 bool PickedPending;
648 SUnit *SU;
649 do {
650 PickedPending = false;
651 if (RegionPolicy.OnlyTopDown) {
652 SU = pickOnlyChoice(Zone&: Top, SchedModel);
653 if (!SU) {
654 CandPolicy NoPolicy;
655 TopCand.reset(NewPolicy: NoPolicy);
656 pickNodeFromQueue(Zone&: Top, ZonePolicy: NoPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand,
657 IsPending&: PickedPending,
658 /*IsBottomUp=*/false);
659 assert(TopCand.Reason != NoCand && "failed to find a candidate");
660 SU = TopCand.SU;
661 }
662 IsTopNode = true;
663 } else if (RegionPolicy.OnlyBottomUp) {
664 SU = pickOnlyChoice(Zone&: Bot, SchedModel);
665 if (!SU) {
666 CandPolicy NoPolicy;
667 BotCand.reset(NewPolicy: NoPolicy);
668 pickNodeFromQueue(Zone&: Bot, ZonePolicy: NoPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand,
669 IsPending&: PickedPending,
670 /*IsBottomUp=*/true);
671 assert(BotCand.Reason != NoCand && "failed to find a candidate");
672 SU = BotCand.SU;
673 }
674 IsTopNode = false;
675 } else {
676 SU = pickNodeBidirectional(IsTopNode, PickedPending);
677 }
678 } while (SU->isScheduled);
679
680 if (PickedPending) {
681 unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
682 SchedBoundary &Zone = IsTopNode ? Top : Bot;
683 unsigned CurrentCycle = Zone.getCurrCycle();
684 if (ReadyCycle > CurrentCycle)
685 Zone.bumpCycle(NextCycle: ReadyCycle);
686
687 // FIXME: checkHazard() doesn't give information about which cycle the
688 // hazard will resolve so just keep bumping the cycle by 1. This could be
689 // made more efficient if checkHazard() returned more details.
690 while (Zone.checkHazard(SU))
691 Zone.bumpCycle(NextCycle: Zone.getCurrCycle() + 1);
692
693 Zone.releasePending();
694 }
695
696 if (SU->isTopReady())
697 Top.removeReady(SU);
698 if (SU->isBottomReady())
699 Bot.removeReady(SU);
700
701 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
702 << *SU->getInstr());
703 return SU;
704}
705
706void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
707 if (useGCNTrackers()) {
708 MachineInstr *MI = SU->getInstr();
709 IsTopNode ? (void)DownwardTracker.advance(MI, UseInternalIterator: false)
710 : UpwardTracker.recede(MI: *MI);
711 }
712
713 return GenericScheduler::schedNode(SU, IsTopNode);
714}
715
716GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
717 assert(CurrentStage && CurrentStage != SchedStages.end());
718 return *CurrentStage;
719}
720
721bool GCNSchedStrategy::advanceStage() {
722 assert(CurrentStage != SchedStages.end());
723 if (!CurrentStage)
724 CurrentStage = SchedStages.begin();
725 else
726 CurrentStage++;
727
728 return CurrentStage != SchedStages.end();
729}
730
731bool GCNSchedStrategy::hasNextStage() const {
732 assert(CurrentStage);
733 return std::next(x: CurrentStage) != SchedStages.end();
734}
735
736GCNSchedStageID GCNSchedStrategy::getNextStage() const {
737 assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
738 return *std::next(x: CurrentStage);
739}
740
741bool GCNSchedStrategy::tryPendingCandidate(SchedCandidate &Cand,
742 SchedCandidate &TryCand,
743 SchedBoundary *Zone) const {
744 // Initialize the candidate if needed.
745 if (!Cand.isValid()) {
746 TryCand.Reason = NodeOrder;
747 return true;
748 }
749
750 // Bias PhysReg Defs and copies to their uses and defined respectively.
751 if (tryGreater(TryVal: biasPhysReg(SU: TryCand.SU, isTop: TryCand.AtTop),
752 CandVal: biasPhysReg(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: PhysReg))
753 return TryCand.Reason != NoCand;
754
755 // Avoid exceeding the target's limit.
756 if (DAG->isTrackingPressure() &&
757 tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand,
758 Reason: RegExcess, TRI, MF: DAG->MF))
759 return TryCand.Reason != NoCand;
760
761 // Avoid increasing the max critical pressure in the scheduled region.
762 if (DAG->isTrackingPressure() &&
763 tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax,
764 TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF))
765 return TryCand.Reason != NoCand;
766
767 bool SameBoundary = Zone != nullptr;
768 if (SameBoundary) {
769 TryCand.initResourceDelta(DAG, SchedModel);
770 if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources,
771 TryCand, Cand, Reason: ResourceReduce))
772 return TryCand.Reason != NoCand;
773 if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources,
774 CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand,
775 Reason: ResourceDemand))
776 return TryCand.Reason != NoCand;
777 }
778
779 return false;
780}
781
782GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
783 const MachineSchedContext *C, bool IsLegacyScheduler)
784 : GCNSchedStrategy(C) {
785 SchedStages.push_back(Elt: GCNSchedStageID::OccInitialSchedule);
786 if (!DisableRewriteMFMAFormSchedStage)
787 SchedStages.push_back(Elt: GCNSchedStageID::RewriteMFMAForm);
788 SchedStages.push_back(Elt: GCNSchedStageID::UnclusteredHighRPReschedule);
789 SchedStages.push_back(Elt: GCNSchedStageID::ClusteredLowOccupancyReschedule);
790 SchedStages.push_back(Elt: GCNSchedStageID::PreRARematerialize);
791 if (IsLegacyScheduler)
792 GCNTrackersOverride = std::nullopt;
793}
794
795GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
796 : GCNSchedStrategy(C) {
797 SchedStages.push_back(Elt: GCNSchedStageID::ILPInitialSchedule);
798}
799
800bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
801 SchedCandidate &TryCand,
802 SchedBoundary *Zone) const {
803 // Initialize the candidate if needed.
804 if (!Cand.isValid()) {
805 TryCand.Reason = NodeOrder;
806 return true;
807 }
808
809 // Avoid spilling by exceeding the register limit.
810 if (DAG->isTrackingPressure() &&
811 tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand,
812 Reason: RegExcess, TRI, MF: DAG->MF))
813 return TryCand.Reason != NoCand;
814
815 // Bias PhysReg Defs and copies to their uses and defined respectively.
816 if (tryGreater(TryVal: biasPhysReg(SU: TryCand.SU, isTop: TryCand.AtTop),
817 CandVal: biasPhysReg(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: PhysReg))
818 return TryCand.Reason != NoCand;
819
820 bool SameBoundary = Zone != nullptr;
821 if (SameBoundary) {
822 // Prioritize instructions that read unbuffered resources by stall cycles.
823 if (tryLess(TryVal: Zone->getLatencyStallCycles(SU: TryCand.SU),
824 CandVal: Zone->getLatencyStallCycles(SU: Cand.SU), TryCand, Cand, Reason: Stall))
825 return TryCand.Reason != NoCand;
826
827 // Avoid critical resource consumption and balance the schedule.
828 TryCand.initResourceDelta(DAG, SchedModel);
829 if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources,
830 TryCand, Cand, Reason: ResourceReduce))
831 return TryCand.Reason != NoCand;
832 if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources,
833 CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand,
834 Reason: ResourceDemand))
835 return TryCand.Reason != NoCand;
836
837 // Unconditionally try to reduce latency.
838 if (tryLatency(TryCand, Cand, Zone&: *Zone))
839 return TryCand.Reason != NoCand;
840
841 // Weak edges are for clustering and other constraints.
842 if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop),
843 CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak))
844 return TryCand.Reason != NoCand;
845 }
846
847 // Keep clustered nodes together to encourage downstream peephole
848 // optimizations which may reduce resource requirements.
849 //
850 // This is a best effort to set things up for a post-RA pass. Optimizations
851 // like generating loads of multiple registers should ideally be done within
852 // the scheduler pass by combining the loads during DAG postprocessing.
853 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
854 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
855 bool CandIsClusterSucc =
856 isTheSameCluster(A: CandZoneCluster, B: Cand.SU->ParentClusterIdx);
857 bool TryCandIsClusterSucc =
858 isTheSameCluster(A: TryCandZoneCluster, B: TryCand.SU->ParentClusterIdx);
859 if (tryGreater(TryVal: TryCandIsClusterSucc, CandVal: CandIsClusterSucc, TryCand, Cand,
860 Reason: Cluster))
861 return TryCand.Reason != NoCand;
862
863 // Avoid increasing the max critical pressure in the scheduled region.
864 if (DAG->isTrackingPressure() &&
865 tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax,
866 TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF))
867 return TryCand.Reason != NoCand;
868
869 // Avoid increasing the max pressure of the entire region.
870 if (DAG->isTrackingPressure() &&
871 tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand,
872 Cand, Reason: RegMax, TRI, MF: DAG->MF))
873 return TryCand.Reason != NoCand;
874
875 if (SameBoundary) {
876 // Fall through to original instruction order.
877 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
878 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
879 TryCand.Reason = NodeOrder;
880 return true;
881 }
882 }
883 return false;
884}
885
886GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy(
887 const MachineSchedContext *C)
888 : GCNSchedStrategy(C) {
889 SchedStages.push_back(Elt: GCNSchedStageID::MemoryClauseInitialSchedule);
890}
891
892/// GCNMaxMemoryClauseSchedStrategy tries best to clause memory instructions as
893/// much as possible. This is achieved by:
894// 1. Prioritize clustered operations before stall latency heuristic.
895// 2. Prioritize long-latency-load before stall latency heuristic.
896///
897/// \param Cand provides the policy and current best candidate.
898/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
899/// \param Zone describes the scheduled zone that we are extending, or nullptr
900/// if Cand is from a different zone than TryCand.
901/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
902bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
903 SchedCandidate &TryCand,
904 SchedBoundary *Zone) const {
905 // Initialize the candidate if needed.
906 if (!Cand.isValid()) {
907 TryCand.Reason = NodeOrder;
908 return true;
909 }
910
911 // Bias PhysReg Defs and copies to their uses and defined respectively.
912 if (tryGreater(TryVal: biasPhysReg(SU: TryCand.SU, isTop: TryCand.AtTop),
913 CandVal: biasPhysReg(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: PhysReg))
914 return TryCand.Reason != NoCand;
915
916 if (DAG->isTrackingPressure()) {
917 // Avoid exceeding the target's limit.
918 if (tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand,
919 Reason: RegExcess, TRI, MF: DAG->MF))
920 return TryCand.Reason != NoCand;
921
922 // Avoid increasing the max critical pressure in the scheduled region.
923 if (tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax,
924 TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF))
925 return TryCand.Reason != NoCand;
926 }
927
928 // MaxMemoryClause-specific: We prioritize clustered instructions as we would
929 // get more benefit from clausing these memory instructions.
930 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
931 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
932 bool CandIsClusterSucc =
933 isTheSameCluster(A: CandZoneCluster, B: Cand.SU->ParentClusterIdx);
934 bool TryCandIsClusterSucc =
935 isTheSameCluster(A: TryCandZoneCluster, B: TryCand.SU->ParentClusterIdx);
936 if (tryGreater(TryVal: TryCandIsClusterSucc, CandVal: CandIsClusterSucc, TryCand, Cand,
937 Reason: Cluster))
938 return TryCand.Reason != NoCand;
939
940 // We only compare a subset of features when comparing nodes between
941 // Top and Bottom boundary. Some properties are simply incomparable, in many
942 // other instances we should only override the other boundary if something
943 // is a clear good pick on one boundary. Skip heuristics that are more
944 // "tie-breaking" in nature.
945 bool SameBoundary = Zone != nullptr;
946 if (SameBoundary) {
947 // For loops that are acyclic path limited, aggressively schedule for
948 // latency. Within an single cycle, whenever CurrMOps > 0, allow normal
949 // heuristics to take precedence.
950 if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
951 tryLatency(TryCand, Cand, Zone&: *Zone))
952 return TryCand.Reason != NoCand;
953
954 // MaxMemoryClause-specific: Prioritize long latency memory load
955 // instructions in top-bottom order to hide more latency. The mayLoad check
956 // is used to exclude store-like instructions, which we do not want to
957 // scheduler them too early.
958 bool TryMayLoad =
959 TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
960 bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
961
962 if (TryMayLoad || CandMayLoad) {
963 bool TryLongLatency =
964 TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
965 bool CandLongLatency =
966 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
967
968 if (tryGreater(TryVal: Zone->isTop() ? TryLongLatency : CandLongLatency,
969 CandVal: Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
970 Cand, Reason: Stall))
971 return TryCand.Reason != NoCand;
972 }
973 // Prioritize instructions that read unbuffered resources by stall cycles.
974 if (tryLess(TryVal: Zone->getLatencyStallCycles(SU: TryCand.SU),
975 CandVal: Zone->getLatencyStallCycles(SU: Cand.SU), TryCand, Cand, Reason: Stall))
976 return TryCand.Reason != NoCand;
977 }
978
979 if (SameBoundary) {
980 // Weak edges are for clustering and other constraints.
981 if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop),
982 CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak))
983 return TryCand.Reason != NoCand;
984 }
985
986 // Avoid increasing the max pressure of the entire region.
987 if (DAG->isTrackingPressure() &&
988 tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand,
989 Cand, Reason: RegMax, TRI, MF: DAG->MF))
990 return TryCand.Reason != NoCand;
991
992 if (SameBoundary) {
993 // Avoid critical resource consumption and balance the schedule.
994 TryCand.initResourceDelta(DAG, SchedModel);
995 if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources,
996 TryCand, Cand, Reason: ResourceReduce))
997 return TryCand.Reason != NoCand;
998 if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources,
999 CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand,
1000 Reason: ResourceDemand))
1001 return TryCand.Reason != NoCand;
1002
1003 // Avoid serializing long latency dependence chains.
1004 // For acyclic path limited loops, latency was already checked above.
1005 if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
1006 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone&: *Zone))
1007 return TryCand.Reason != NoCand;
1008
1009 // Fall through to original instruction order.
1010 if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
1011 assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
1012 TryCand.Reason = NodeOrder;
1013 return true;
1014 }
1015 }
1016
1017 return false;
1018}
1019
1020GCNScheduleDAGMILive::GCNScheduleDAGMILive(
1021 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
1022 : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
1023 MFI(*MF.getInfo<SIMachineFunctionInfo>()),
1024 StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
1025 RegionLiveOuts(this, /*IsLiveOut=*/true) {
1026
1027 // We want regions with a single MI to be scheduled so that we can reason
1028 // about them correctly during scheduling stages that move MIs between regions
1029 // (e.g., rematerialization).
1030 ScheduleSingleMIRegions = true;
1031 LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
1032 if (RelaxedOcc) {
1033 MinOccupancy = std::min(a: MFI.getMinAllowedOccupancy(), b: StartingOccupancy);
1034 if (MinOccupancy != StartingOccupancy)
1035 LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
1036 << ".\n");
1037 }
1038}
1039
1040std::unique_ptr<GCNSchedStage>
1041GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
1042 switch (SchedStageID) {
1043 case GCNSchedStageID::OccInitialSchedule:
1044 return std::make_unique<OccInitialScheduleStage>(args&: SchedStageID, args&: *this);
1045 case GCNSchedStageID::RewriteMFMAForm:
1046 return std::make_unique<RewriteMFMAFormStage>(args&: SchedStageID, args&: *this);
1047 case GCNSchedStageID::UnclusteredHighRPReschedule:
1048 return std::make_unique<UnclusteredHighRPStage>(args&: SchedStageID, args&: *this);
1049 case GCNSchedStageID::ClusteredLowOccupancyReschedule:
1050 return std::make_unique<ClusteredLowOccStage>(args&: SchedStageID, args&: *this);
1051 case GCNSchedStageID::PreRARematerialize:
1052 return std::make_unique<PreRARematStage>(args&: SchedStageID, args&: *this);
1053 case GCNSchedStageID::ILPInitialSchedule:
1054 return std::make_unique<ILPInitialScheduleStage>(args&: SchedStageID, args&: *this);
1055 case GCNSchedStageID::MemoryClauseInitialSchedule:
1056 return std::make_unique<MemoryClauseInitialScheduleStage>(args&: SchedStageID,
1057 args&: *this);
1058 }
1059
1060 llvm_unreachable("Unknown SchedStageID.");
1061}
1062
1063void GCNScheduleDAGMILive::schedule() {
1064 // Collect all scheduling regions. The actual scheduling is performed in
1065 // GCNScheduleDAGMILive::finalizeSchedule.
1066 Regions.push_back(Elt: std::pair(RegionBegin, RegionEnd));
1067}
1068
1069GCNRegPressure
1070GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
1071 if (Regions[RegionIdx].first == Regions[RegionIdx].second)
1072 return llvm::getRegPressure(MRI, LiveRegs: LiveIns[RegionIdx]);
1073 GCNDownwardRPTracker RPTracker(*LIS);
1074 RPTracker.advance(Begin: Regions[RegionIdx].first, End: Regions[RegionIdx].second,
1075 LiveRegsCopy: &LiveIns[RegionIdx]);
1076 return RPTracker.moveMaxPressure();
1077}
1078
1079static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin,
1080 MachineBasicBlock::iterator RegionEnd) {
1081 assert(RegionBegin != RegionEnd && "Region must not be empty");
1082 return &*skipDebugInstructionsBackward(It: std::prev(x: RegionEnd), Begin: RegionBegin);
1083}
1084
1085void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
1086 const MachineBasicBlock *MBB) {
1087 GCNDownwardRPTracker RPTracker(*LIS);
1088
1089 // If the block has the only successor then live-ins of that successor are
1090 // live-outs of the current block. We can reuse calculated live set if the
1091 // successor will be sent to scheduling past current block.
1092
1093 // However, due to the bug in LiveInterval analysis it may happen that two
1094 // predecessors of the same successor block have different lane bitmasks for
1095 // a live-out register. Workaround that by sticking to one-to-one relationship
1096 // i.e. one predecessor with one successor block.
1097 const MachineBasicBlock *OnlySucc = nullptr;
1098 if (MBB->succ_size() == 1) {
1099 auto *Candidate = *MBB->succ_begin();
1100 if (!Candidate->empty() && Candidate->pred_size() == 1) {
1101 SlotIndexes *Ind = LIS->getSlotIndexes();
1102 if (Ind->getMBBStartIdx(mbb: MBB) < Ind->getMBBStartIdx(mbb: Candidate))
1103 OnlySucc = Candidate;
1104 }
1105 }
1106
1107 // Scheduler sends regions from the end of the block upwards.
1108 size_t CurRegion = RegionIdx;
1109 for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
1110 if (Regions[CurRegion].first->getParent() != MBB)
1111 break;
1112 --CurRegion;
1113
1114 auto I = MBB->begin();
1115 auto LiveInIt = MBBLiveIns.find(Val: MBB);
1116 auto &Rgn = Regions[CurRegion];
1117 auto *NonDbgMI = &*skipDebugInstructionsForward(It: Rgn.first, End: Rgn.second);
1118 if (LiveInIt != MBBLiveIns.end()) {
1119 auto LiveIn = std::move(LiveInIt->second);
1120 RPTracker.reset(MI: *MBB->begin(), End: MBB->end(), LiveRegs: &LiveIn);
1121 MBBLiveIns.erase(I: LiveInIt);
1122 } else {
1123 I = Rgn.first;
1124 auto LRS = BBLiveInMap.lookup(Val: NonDbgMI);
1125#ifdef EXPENSIVE_CHECKS
1126 assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
1127#endif
1128 RPTracker.reset(MI: *I, End: I->getParent()->end(), LiveRegs: &LRS);
1129 }
1130
1131 for (;;) {
1132 I = RPTracker.getNext();
1133
1134 if (Regions[CurRegion].first == I || NonDbgMI == I) {
1135 LiveIns[CurRegion] = RPTracker.getLiveRegs();
1136 RPTracker.clearMaxPressure();
1137 }
1138
1139 if (Regions[CurRegion].second == I) {
1140 Pressure[CurRegion] = RPTracker.moveMaxPressure();
1141 if (CurRegion-- == RegionIdx)
1142 break;
1143 auto &Rgn = Regions[CurRegion];
1144 NonDbgMI = &*skipDebugInstructionsForward(It: Rgn.first, End: Rgn.second);
1145 }
1146 RPTracker.advanceBeforeNext();
1147 RPTracker.advanceToNext();
1148 }
1149
1150 if (OnlySucc) {
1151 if (I != MBB->end()) {
1152 RPTracker.advanceBeforeNext();
1153 RPTracker.advanceToNext();
1154 RPTracker.advance(End: MBB->end());
1155 }
1156 MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
1157 }
1158}
1159
1160DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
1161GCNScheduleDAGMILive::getRegionLiveInMap() const {
1162 assert(!Regions.empty());
1163 std::vector<MachineInstr *> RegionFirstMIs;
1164 RegionFirstMIs.reserve(n: Regions.size());
1165 for (auto &[RegionBegin, RegionEnd] : reverse(C: Regions))
1166 RegionFirstMIs.push_back(
1167 x: &*skipDebugInstructionsForward(It: RegionBegin, End: RegionEnd));
1168
1169 return getLiveRegMap(R&: RegionFirstMIs, /*After=*/false, LIS&: *LIS);
1170}
1171
1172DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
1173GCNScheduleDAGMILive::getRegionLiveOutMap() const {
1174 assert(!Regions.empty());
1175 std::vector<MachineInstr *> RegionLastMIs;
1176 RegionLastMIs.reserve(n: Regions.size());
1177 for (auto &[RegionBegin, RegionEnd] : reverse(C: Regions)) {
1178 // Skip empty regions.
1179 if (RegionBegin == RegionEnd)
1180 continue;
1181 RegionLastMIs.push_back(x: getLastMIForRegion(RegionBegin, RegionEnd));
1182 }
1183 return getLiveRegMap(R&: RegionLastMIs, /*After=*/true, LIS&: *LIS);
1184}
1185
1186void RegionPressureMap::buildLiveRegMap() {
1187 IdxToInstruction.clear();
1188
1189 RegionLiveRegMap =
1190 IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
1191 for (unsigned I = 0; I < DAG->Regions.size(); I++) {
1192 auto &[RegionBegin, RegionEnd] = DAG->Regions[I];
1193 // Skip empty regions.
1194 if (RegionBegin == RegionEnd)
1195 continue;
1196 MachineInstr *RegionKey =
1197 IsLiveOut ? getLastMIForRegion(RegionBegin, RegionEnd) : &*RegionBegin;
1198 IdxToInstruction[I] = RegionKey;
1199 }
1200}
1201
1202void GCNScheduleDAGMILive::finalizeSchedule() {
1203 // Start actual scheduling here. This function is called by the base
1204 // MachineScheduler after all regions have been recorded by
1205 // GCNScheduleDAGMILive::schedule().
1206 LiveIns.resize(N: Regions.size());
1207 Pressure.resize(N: Regions.size());
1208 RegionsWithHighRP.resize(N: Regions.size());
1209 RegionsWithExcessRP.resize(N: Regions.size());
1210 RegionsWithIGLPInstrs.resize(N: Regions.size());
1211 RegionsWithHighRP.reset();
1212 RegionsWithExcessRP.reset();
1213 RegionsWithIGLPInstrs.reset();
1214
1215 runSchedStages();
1216}
1217
1218void GCNScheduleDAGMILive::runSchedStages() {
1219 LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
1220
1221 GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
1222 if (!Regions.empty()) {
1223 BBLiveInMap = getRegionLiveInMap();
1224 if (S.useGCNTrackers())
1225 RegionLiveOuts.buildLiveRegMap();
1226 }
1227
1228#ifdef DUMP_MAX_REG_PRESSURE
1229 if (PrintMaxRPRegUsageBeforeScheduler) {
1230 dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
1231 dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
1232 LIS->dump();
1233 }
1234#endif
1235
1236 while (S.advanceStage()) {
1237 auto Stage = createSchedStage(SchedStageID: S.getCurrentStage());
1238 if (!Stage->initGCNSchedStage())
1239 continue;
1240
1241 for (auto Region : Regions) {
1242 RegionBegin = Region.first;
1243 RegionEnd = Region.second;
1244 // Setup for scheduling the region and check whether it should be skipped.
1245 if (!Stage->initGCNRegion()) {
1246 Stage->advanceRegion();
1247 exitRegion();
1248 continue;
1249 }
1250
1251 if (S.useGCNTrackers()) {
1252 const unsigned RegionIdx = Stage->getRegionIdx();
1253 S.getDownwardTracker()->reset(MRI, LiveRegs: LiveIns[RegionIdx]);
1254 S.getUpwardTracker()->reset(
1255 MRI, LiveRegs: RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx));
1256 }
1257
1258 ScheduleDAGMILive::schedule();
1259 Stage->finalizeGCNRegion();
1260 Stage->advanceRegion();
1261 exitRegion();
1262 }
1263
1264 Stage->finalizeGCNSchedStage();
1265 }
1266
1267#ifdef DUMP_MAX_REG_PRESSURE
1268 if (PrintMaxRPRegUsageAfterScheduler) {
1269 dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
1270 dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
1271 LIS->dump();
1272 }
1273#endif
1274}
1275
1276#ifndef NDEBUG
1277raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
1278 switch (StageID) {
1279 case GCNSchedStageID::OccInitialSchedule:
1280 OS << "Max Occupancy Initial Schedule";
1281 break;
1282 case GCNSchedStageID::RewriteMFMAForm:
1283 OS << "Instruction Rewriting Reschedule";
1284 break;
1285 case GCNSchedStageID::UnclusteredHighRPReschedule:
1286 OS << "Unclustered High Register Pressure Reschedule";
1287 break;
1288 case GCNSchedStageID::ClusteredLowOccupancyReschedule:
1289 OS << "Clustered Low Occupancy Reschedule";
1290 break;
1291 case GCNSchedStageID::PreRARematerialize:
1292 OS << "Pre-RA Rematerialize";
1293 break;
1294 case GCNSchedStageID::ILPInitialSchedule:
1295 OS << "Max ILP Initial Schedule";
1296 break;
1297 case GCNSchedStageID::MemoryClauseInitialSchedule:
1298 OS << "Max memory clause Initial Schedule";
1299 break;
1300 }
1301
1302 return OS;
1303}
1304#endif
1305
1306GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
1307 : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
1308 MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
1309
1310bool GCNSchedStage::initGCNSchedStage() {
1311 if (!DAG.LIS)
1312 return false;
1313
1314 LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
1315 return true;
1316}
1317
1318void RewriteMFMAFormStage::findReachingDefs(
1319 MachineOperand &UseMO, LiveIntervals *LIS,
1320 SmallVectorImpl<SlotIndex> &DefIdxs) {
1321 MachineInstr *UseMI = UseMO.getParent();
1322 LiveInterval &UseLI = LIS->getInterval(Reg: UseMO.getReg());
1323 VNInfo *VNI = UseLI.getVNInfoAt(Idx: LIS->getInstructionIndex(Instr: *UseMI));
1324
1325 // If the def is not a PHI, then it must be the only reaching def.
1326 if (!VNI->isPHIDef()) {
1327 DefIdxs.push_back(Elt: VNI->def);
1328 return;
1329 }
1330
1331 SmallPtrSet<MachineBasicBlock *, 8> Visited = {UseMI->getParent()};
1332 SmallVector<MachineBasicBlock *, 8> Worklist;
1333
1334 // Mark the predecessor blocks for traversal
1335 for (MachineBasicBlock *PredMBB : UseMI->getParent()->predecessors()) {
1336 Worklist.push_back(Elt: PredMBB);
1337 Visited.insert(Ptr: PredMBB);
1338 }
1339
1340 while (!Worklist.empty()) {
1341 MachineBasicBlock *CurrMBB = Worklist.pop_back_val();
1342
1343 SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(mbb: CurrMBB);
1344 VNInfo *VNI = UseLI.getVNInfoAt(Idx: CurrMBBEnd.getPrevSlot());
1345
1346 MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(index: VNI->def);
1347
1348 // If there is a def in this block, then add it to the list. This is the
1349 // reaching def of this path.
1350 if (!VNI->isPHIDef()) {
1351 DefIdxs.push_back(Elt: VNI->def);
1352 continue;
1353 }
1354
1355 for (MachineBasicBlock *PredMBB : DefMBB->predecessors()) {
1356 if (Visited.insert(Ptr: PredMBB).second)
1357 Worklist.push_back(Elt: PredMBB);
1358 }
1359 }
1360}
1361
1362void RewriteMFMAFormStage::findReachingUses(
1363 const MachineInstr *DefMI, LiveIntervals *LIS,
1364 SmallVectorImpl<MachineOperand *> &ReachingUses) {
1365 SlotIndex DefIdx = LIS->getInstructionIndex(Instr: *DefMI);
1366 for (MachineOperand &UseMO :
1367 DAG.MRI.use_nodbg_operands(Reg: DefMI->getOperand(i: 0).getReg())) {
1368 SmallVector<SlotIndex, 8> ReachingDefIndexes;
1369 findReachingDefs(UseMO, LIS, DefIdxs&: ReachingDefIndexes);
1370
1371 // If we find a use that contains this DefMI in its reachingDefs, then it is
1372 // a reaching use.
1373 if (any_of(Range&: ReachingDefIndexes, P: [DefIdx](SlotIndex RDIdx) {
1374 return SlotIndex::isSameInstr(A: RDIdx, B: DefIdx);
1375 }))
1376 ReachingUses.push_back(Elt: &UseMO);
1377 }
1378}
1379
1380bool RewriteMFMAFormStage::initGCNSchedStage() {
1381 // We only need to run this pass if the architecture supports AGPRs.
1382 // Additionally, we don't use AGPRs at occupancy levels above 1 so there
1383 // is no need for this pass in that case, either.
1384 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1385 if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1)
1386 return false;
1387
1388 RegionsWithExcessArchVGPR.resize(N: DAG.Regions.size());
1389 RegionsWithExcessArchVGPR.reset();
1390 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
1391 GCNRegPressure PressureBefore = DAG.Pressure[Region];
1392 if (PressureBefore.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
1393 RegionsWithExcessArchVGPR[Region] = true;
1394 }
1395
1396 if (RegionsWithExcessArchVGPR.none())
1397 return false;
1398
1399 TII = ST.getInstrInfo();
1400 SRI = ST.getRegisterInfo();
1401
1402 std::vector<std::pair<MachineInstr *, unsigned>> RewriteCands;
1403 DenseMap<MachineBasicBlock *, std::set<Register>> CopyForUse;
1404 SmallPtrSet<MachineInstr *, 8> CopyForDef;
1405
1406 if (!initHeuristics(RewriteCands, CopyForUse, CopyForDef))
1407 return false;
1408
1409 int64_t Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef);
1410
1411 // If we haven't found the beneficial conditions, prefer the VGPR form which
1412 // may result in less cross RC copies.
1413 if (Cost > 0)
1414 return false;
1415
1416 return rewrite(RewriteCands);
1417}
1418
1419bool UnclusteredHighRPStage::initGCNSchedStage() {
1420 if (DisableUnclusterHighRP)
1421 return false;
1422
1423 if (!GCNSchedStage::initGCNSchedStage())
1424 return false;
1425
1426 if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
1427 return false;
1428
1429 SavedMutations.swap(x&: DAG.Mutations);
1430 DAG.addMutation(
1431 Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PreRAReentry));
1432
1433 InitialOccupancy = DAG.MinOccupancy;
1434 // Aggressively try to reduce register pressure in the unclustered high RP
1435 // stage. Temporarily increase occupancy target in the region.
1436 TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy
1437 ? InitialOccupancy + 1
1438 : InitialOccupancy;
1439 IsAnyRegionScheduled = false;
1440 S.SGPRLimitBias = S.HighRPSGPRBias;
1441 S.VGPRLimitBias = S.HighRPVGPRBias;
1442
1443 LLVM_DEBUG(
1444 dbgs()
1445 << "Retrying function scheduling without clustering. "
1446 "Aggressively try to reduce register pressure to achieve occupancy "
1447 << TempTargetOccupancy << ".\n");
1448
1449 return true;
1450}
1451
1452bool ClusteredLowOccStage::initGCNSchedStage() {
1453 if (DisableClusteredLowOccupancy)
1454 return false;
1455
1456 if (!GCNSchedStage::initGCNSchedStage())
1457 return false;
1458
1459 // Don't bother trying to improve ILP in lower RP regions if occupancy has not
1460 // been dropped. All regions will have already been scheduled with the ideal
1461 // occupancy targets.
1462 if (DAG.StartingOccupancy <= DAG.MinOccupancy)
1463 return false;
1464
1465 LLVM_DEBUG(
1466 dbgs() << "Retrying function scheduling with lowest recorded occupancy "
1467 << DAG.MinOccupancy << ".\n");
1468 return true;
1469}
1470
1471/// Allows to easily filter for this stage's debug output.
1472#define REMAT_PREFIX "[PreRARemat] "
1473#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
1474
1475#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1476Printable PreRARematStage::ScoredRemat::print() const {
1477 return Printable([&](raw_ostream &OS) {
1478 OS << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')';
1479 });
1480}
1481#endif
1482
1483bool PreRARematStage::initGCNSchedStage() {
1484 // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
1485 // regions inbetween the defs and region we sinked the def to. Will need to be
1486 // fixed if there is another pass after this pass.
1487 assert(!S.hasNextStage());
1488
1489 if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() <= 1)
1490 return false;
1491
1492#ifndef NDEBUG
1493 auto PrintTargetRegions = [&]() -> void {
1494 if (TargetRegions.none()) {
1495 dbgs() << REMAT_PREFIX << "No target regions\n";
1496 return;
1497 }
1498 dbgs() << REMAT_PREFIX << "Target regions:\n";
1499 for (unsigned I : TargetRegions.set_bits())
1500 dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n';
1501 };
1502#endif
1503
1504 // Set an objective for the stage based on current RP in each region.
1505 REMAT_DEBUG({
1506 dbgs() << "Analyzing ";
1507 MF.getFunction().printAsOperand(dbgs(), false);
1508 dbgs() << ": ";
1509 });
1510 if (!setObjective()) {
1511 LLVM_DEBUG(dbgs() << "no objective to achieve, occupancy is maximal at "
1512 << MFI.getMaxWavesPerEU() << '\n');
1513 return false;
1514 }
1515 LLVM_DEBUG({
1516 if (TargetOcc) {
1517 dbgs() << "increase occupancy from " << *TargetOcc - 1 << '\n';
1518 } else {
1519 dbgs() << "reduce spilling (minimum target occupancy is "
1520 << MFI.getMinWavesPerEU() << ")\n";
1521 }
1522 PrintTargetRegions();
1523 });
1524
1525 // We need up-to-date live-out info. to query live-out register masks in
1526 // regions containing rematerializable instructions.
1527 DAG.RegionLiveOuts.buildLiveRegMap();
1528
1529 if (!Remater.analyze()) {
1530 REMAT_DEBUG(dbgs() << "No rematerializable registers\n");
1531 return false;
1532 }
1533 const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
1534
1535 // Set of registers already marked for potential remterialization; used to
1536 // avoid rematerialization chains.
1537 SmallSet<Register, 4> MarkedRegs;
1538
1539 // Collect candidates. We have more restrictions on what we can track here
1540 // compared to the rematerializer.
1541 SmallVector<ScoredRemat, 8> Candidates;
1542 SmallVector<unsigned> CandidateOrder;
1543 for (unsigned RegIdx = 0, E = Remater.getNumRegs(); RegIdx < E; ++RegIdx) {
1544 const Rematerializer::Reg &CandReg = Remater.getReg(RegIdx);
1545
1546 // Single user only.
1547 unsigned NumUsers = 0;
1548 for (const auto &[_, RegionUses] : CandReg.Uses)
1549 NumUsers += RegionUses.size();
1550 if (NumUsers != 1)
1551 continue;
1552
1553 // We further filter the registers that we can rematerialize based on our
1554 // current tracking capabilities in the stage. The user cannot itself be
1555 // marked rematerializable, and no register operand of the defining MI can
1556 // be marked rematerializable.
1557 MachineInstr *UseMI = *CandReg.Uses.begin()->getSecond().begin();
1558 const MachineOperand &UseMO = UseMI->getOperand(i: 0);
1559 if (UseMO.isReg() && MarkedRegs.contains(V: UseMO.getReg()))
1560 continue;
1561 if (llvm::any_of(Range: CandReg.DefMI->all_uses(),
1562 P: [&MarkedRegs](const MachineOperand &MO) {
1563 return MarkedRegs.contains(V: MO.getReg());
1564 }))
1565 continue;
1566
1567 // Do not rematerialize an instruction if it uses registers that aren't
1568 // available at its use. This ensures that we are not extending any live
1569 // range while rematerializing.
1570 SlotIndex UseIdx = DAG.LIS->getInstructionIndex(Instr: *UseMI).getRegSlot(EC: true);
1571 if (!VirtRegAuxInfo::allUsesAvailableAt(MI: CandReg.DefMI, UseIdx, LIS: *DAG.LIS,
1572 MRI: DAG.MRI, TII: *DAG.TII))
1573 continue;
1574
1575 MarkedRegs.insert(V: CandReg.getDefReg());
1576 ScoredRemat &Cand = Candidates.emplace_back();
1577 Cand.init(RegIdx, Freq: FreqInfo, Remater, DAG);
1578 Cand.update(TargetRegions, RPTargets, Freq: FreqInfo, ReduceSpill: !TargetOcc);
1579 if (!Cand.hasNullScore())
1580 CandidateOrder.push_back(Elt: Candidates.size() - 1);
1581 }
1582
1583 if (TargetOcc) {
1584 // Every rematerialization we do here is likely to move the instruction
1585 // into a higher frequency region, increasing the total sum latency of the
1586 // instruction itself. This is acceptable if we are eliminating a spill in
1587 // the process, but when the goal is increasing occupancy we get nothing
1588 // out of rematerialization if occupancy is not increased in the end; in
1589 // such cases we want to roll back the rematerialization.
1590 Rollback = std::make_unique<RollbackSupport>(args&: Remater);
1591 }
1592
1593 // Rematerialize registers in successive rounds until all RP targets are
1594 // satisifed or until we run out of rematerialization candidates.
1595 BitVector RecomputeRP(DAG.Regions.size());
1596 for (;;) {
1597 RecomputeRP.reset();
1598
1599 // Sort candidates in increasing score order.
1600 sort(C&: CandidateOrder, Comp: [&](unsigned LHSIndex, unsigned RHSIndex) {
1601 return Candidates[LHSIndex] < Candidates[RHSIndex];
1602 });
1603
1604 REMAT_DEBUG({
1605 dbgs() << "==== NEW REMAT ROUND ====\n"
1606 << REMAT_PREFIX
1607 << "Candidates with non-null score, in rematerialization order:\n";
1608 for (const ScoredRemat &Cand : reverse(Candidates)) {
1609 dbgs() << REMAT_PREFIX << " " << Cand.print() << " | "
1610 << Remater.printRematReg(Cand.RegIdx) << '\n';
1611 }
1612 PrintTargetRegions();
1613 });
1614
1615 // Rematerialize registers in decreasing score order until we estimate
1616 // that all RP targets are satisfied or until rematerialization candidates
1617 // are no longer useful to decrease RP.
1618 while (!CandidateOrder.empty()) {
1619 const ScoredRemat &Cand = Candidates[CandidateOrder.back()];
1620 const Rematerializer::Reg &Reg = Remater.getReg(RegIdx: Cand.RegIdx);
1621
1622 // When previous rematerializations in this round have already satisfied
1623 // RP targets in all regions this rematerialization can impact, we have a
1624 // good indication that our scores have diverged significantly from
1625 // reality, in which case we interrupt this round and re-score. This also
1626 // ensures that every rematerialization we perform is possibly impactful
1627 // in at least one target region.
1628 if (!Cand.maybeBeneficial(TargetRegions, RPTargets)) {
1629 REMAT_DEBUG(dbgs() << "Interrupt round on stale score for "
1630 << Cand.print() << " | "
1631 << Remater.printRematReg(Cand.RegIdx));
1632 break;
1633 }
1634 CandidateOrder.pop_back();
1635
1636#ifdef EXPENSIVE_CHECKS
1637 // All uses are known to be available / live at the remat point. Thus,
1638 // the uses should already be live in to the using region.
1639 for (MachineOperand &MO : Reg.DefMI->operands()) {
1640 if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
1641 continue;
1642
1643 Register UseReg = MO.getReg();
1644 if (!UseReg.isVirtual())
1645 continue;
1646
1647 LiveInterval &LI = DAG.LIS->getInterval(UseReg);
1648 LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
1649 if (LI.hasSubRanges() && MO.getSubReg())
1650 LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
1651
1652 const unsigned UseRegion = Reg.Uses.begin()->first;
1653 LaneBitmask LiveInMask = DAG.LiveIns[UseRegion].at(UseReg);
1654 LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
1655 // If this register has lanes not covered by the LiveIns, be sure they
1656 // do not map to any subrange. ref:
1657 // machine-scheduler-sink-trivial-remats.mir::omitted_subrange
1658 if (UncoveredLanes.any()) {
1659 assert(LI.hasSubRanges());
1660 for (LiveInterval::SubRange &SR : LI.subranges())
1661 assert((SR.LaneMask & UncoveredLanes).none());
1662 }
1663 }
1664#endif
1665
1666 // Remove the register from all regions where it is a live-in or live-out,
1667 // then rematerialize the register.
1668 REMAT_DEBUG(dbgs() << "** REMAT " << Remater.printRematReg(Cand.RegIdx)
1669 << '\n');
1670 removeFromLiveMaps(Reg: Reg.getDefReg(), LiveIn: Cand.LiveIn, LiveOut: Cand.LiveOut);
1671 if (Rollback) {
1672 Rollback->LiveMapUpdates.emplace_back(Args: Cand.RegIdx, Args: Cand.LiveIn,
1673 Args: Cand.LiveOut);
1674 }
1675 Cand.rematerialize(Remater);
1676
1677 // Adjust RP targets. The save is guaranteed in regions in which the
1678 // register is live-through and unused but optimistic in all other regions
1679 // where the register is live.
1680 updateRPTargets(Regions: Cand.Live, RPSave: Cand.RPSave);
1681 RecomputeRP |= Cand.UnpredictableRPSave;
1682 RescheduleRegions |= Cand.Live;
1683 if (!TargetRegions.any()) {
1684 REMAT_DEBUG(dbgs() << "All targets cleared, verifying...\n");
1685 break;
1686 }
1687 }
1688
1689 if (!updateAndVerifyRPTargets(Regions: RecomputeRP) && !TargetRegions.any()) {
1690 REMAT_DEBUG(dbgs() << "Objectives achieved!\n");
1691 break;
1692 }
1693
1694 // Update the score of remaining candidates and filter out those that have
1695 // become useless from the vector. Candidates never become useful after
1696 // having been useless for a round, so we can freely drop them without
1697 // losing any future rematerialization opportunity.
1698 unsigned NumUsefulCandidates = 0;
1699 for (unsigned CandIdx : CandidateOrder) {
1700 ScoredRemat &Candidate = Candidates[CandIdx];
1701 Candidate.update(TargetRegions, RPTargets, Freq: FreqInfo, ReduceSpill: !TargetOcc);
1702 if (!Candidate.hasNullScore())
1703 CandidateOrder[NumUsefulCandidates++] = CandIdx;
1704 }
1705 if (NumUsefulCandidates == 0) {
1706 REMAT_DEBUG(dbgs() << "Stop on exhausted rematerialization candidates\n");
1707 break;
1708 }
1709 CandidateOrder.truncate(N: NumUsefulCandidates);
1710 }
1711
1712 if (RescheduleRegions.none())
1713 return false;
1714
1715 // Commit all pressure changes to the DAG and compute minimum achieved
1716 // occupancy in impacted regions.
1717 REMAT_DEBUG(dbgs() << "==== REMAT RESULTS ====\n");
1718 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
1719 for (unsigned I : RescheduleRegions.set_bits()) {
1720 DAG.Pressure[I] = RPTargets[I].getCurrentRP();
1721 REMAT_DEBUG(dbgs() << '[' << I << "] Achieved occupancy "
1722 << DAG.Pressure[I].getOccupancy(ST, DynamicVGPRBlockSize)
1723 << " (" << RPTargets[I] << ")\n");
1724 }
1725 AchievedOcc = MFI.getMaxWavesPerEU();
1726 for (const GCNRegPressure &RP : DAG.Pressure) {
1727 AchievedOcc =
1728 std::min(a: AchievedOcc, b: RP.getOccupancy(ST, DynamicVGPRBlockSize));
1729 }
1730
1731 REMAT_DEBUG({
1732 dbgs() << "Retrying function scheduling with new min. occupancy of "
1733 << AchievedOcc << " from rematerializing (original was "
1734 << DAG.MinOccupancy;
1735 if (TargetOcc)
1736 dbgs() << ", target was " << *TargetOcc;
1737 dbgs() << ")\n";
1738 });
1739
1740 DAG.setTargetOccupancy(getStageTargetOccupancy());
1741 return true;
1742}
1743
1744void GCNSchedStage::finalizeGCNSchedStage() {
1745 DAG.finishBlock();
1746 LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
1747}
1748
1749void UnclusteredHighRPStage::finalizeGCNSchedStage() {
1750 SavedMutations.swap(x&: DAG.Mutations);
1751 S.SGPRLimitBias = S.VGPRLimitBias = 0;
1752 if (DAG.MinOccupancy > InitialOccupancy) {
1753 assert(IsAnyRegionScheduled);
1754 LLVM_DEBUG(dbgs() << StageID
1755 << " stage successfully increased occupancy to "
1756 << DAG.MinOccupancy << '\n');
1757 } else if (!IsAnyRegionScheduled) {
1758 assert(DAG.MinOccupancy == InitialOccupancy);
1759 LLVM_DEBUG(dbgs() << StageID
1760 << ": No regions scheduled, min occupancy stays at "
1761 << DAG.MinOccupancy << ", MFI occupancy stays at "
1762 << MFI.getOccupancy() << ".\n");
1763 }
1764
1765 GCNSchedStage::finalizeGCNSchedStage();
1766}
1767
1768bool GCNSchedStage::initGCNRegion() {
1769 // Skip empty scheduling region.
1770 if (DAG.begin() == DAG.end())
1771 return false;
1772
1773 // Check whether this new region is also a new block.
1774 if (DAG.RegionBegin->getParent() != CurrentMBB)
1775 setupNewBlock();
1776
1777 unsigned NumRegionInstrs = std::distance(first: DAG.begin(), last: DAG.end());
1778 DAG.enterRegion(bb: CurrentMBB, begin: DAG.begin(), end: DAG.end(), regioninstrs: NumRegionInstrs);
1779
1780 // Skip regions with 1 schedulable instruction.
1781 if (DAG.begin() == std::prev(x: DAG.end()))
1782 return false;
1783
1784 LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
1785 LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
1786 << " " << CurrentMBB->getName()
1787 << "\n From: " << *DAG.begin() << " To: ";
1788 if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
1789 else dbgs() << "End";
1790 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
1791
1792 // Save original instruction order before scheduling for possible revert.
1793 Unsched.clear();
1794 Unsched.reserve(n: DAG.NumRegionInstrs);
1795 if (StageID == GCNSchedStageID::OccInitialSchedule ||
1796 StageID == GCNSchedStageID::ILPInitialSchedule) {
1797 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
1798 for (auto &I : DAG) {
1799 Unsched.push_back(x: &I);
1800 if (SII->isIGLPMutationOnly(Opcode: I.getOpcode()))
1801 DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
1802 }
1803 } else {
1804 for (auto &I : DAG)
1805 Unsched.push_back(x: &I);
1806 }
1807
1808 PressureBefore = DAG.Pressure[RegionIdx];
1809
1810 LLVM_DEBUG(
1811 dbgs() << "Pressure before scheduling:\nRegion live-ins:"
1812 << print(DAG.LiveIns[RegionIdx], DAG.MRI)
1813 << "Region live-in pressure: "
1814 << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
1815 << "Region register pressure: " << print(PressureBefore));
1816
1817 S.HasHighPressure = false;
1818 S.KnownExcessRP = isRegionWithExcessRP();
1819
1820 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1821 StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
1822 SavedMutations.clear();
1823 SavedMutations.swap(x&: DAG.Mutations);
1824 bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
1825 StageID == GCNSchedStageID::ILPInitialSchedule;
1826 DAG.addMutation(Mutation: createIGroupLPDAGMutation(
1827 Phase: IsInitialStage ? AMDGPU::SchedulingPhase::Initial
1828 : AMDGPU::SchedulingPhase::PreRAReentry));
1829 }
1830
1831 return true;
1832}
1833
1834bool UnclusteredHighRPStage::initGCNRegion() {
1835 // Only reschedule regions that have excess register pressure (i.e. spilling)
1836 // or had minimum occupancy at the beginning of the stage (as long as
1837 // rescheduling of previous regions did not make occupancy drop back down to
1838 // the initial minimum).
1839 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1840 // If no region has been scheduled yet, the DAG has not yet been updated with
1841 // the occupancy target. So retrieve it from the temporary.
1842 unsigned CurrentTargetOccupancy =
1843 IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy;
1844 if (!DAG.RegionsWithExcessRP[RegionIdx] &&
1845 (CurrentTargetOccupancy <= InitialOccupancy ||
1846 DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
1847 InitialOccupancy))
1848 return false;
1849
1850 bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion();
1851 // If this is the first region scheduled during this stage, make the target
1852 // occupancy changes in the DAG and MFI.
1853 if (!IsAnyRegionScheduled && IsSchedulingThisRegion) {
1854 IsAnyRegionScheduled = true;
1855 if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
1856 DAG.setTargetOccupancy(TempTargetOccupancy);
1857 }
1858 return IsSchedulingThisRegion;
1859}
1860
1861bool ClusteredLowOccStage::initGCNRegion() {
1862 // We may need to reschedule this region if it wasn't rescheduled in the last
1863 // stage, or if we found it was testing critical register pressure limits in
1864 // the unclustered reschedule stage. The later is because we may not have been
1865 // able to raise the min occupancy in the previous stage so the region may be
1866 // overly constrained even if it was already rescheduled.
1867 if (!DAG.RegionsWithHighRP[RegionIdx])
1868 return false;
1869
1870 return GCNSchedStage::initGCNRegion();
1871}
1872
1873bool PreRARematStage::initGCNRegion() {
1874 return !RevertAllRegions && RescheduleRegions[RegionIdx] &&
1875 GCNSchedStage::initGCNRegion();
1876}
1877
1878void GCNSchedStage::setupNewBlock() {
1879 if (CurrentMBB)
1880 DAG.finishBlock();
1881
1882 CurrentMBB = DAG.RegionBegin->getParent();
1883 DAG.startBlock(bb: CurrentMBB);
1884 // Get real RP for the region if it hasn't be calculated before. After the
1885 // initial schedule stage real RP will be collected after scheduling.
1886 if (StageID == GCNSchedStageID::OccInitialSchedule ||
1887 StageID == GCNSchedStageID::ILPInitialSchedule ||
1888 StageID == GCNSchedStageID::MemoryClauseInitialSchedule)
1889 DAG.computeBlockPressure(RegionIdx, MBB: CurrentMBB);
1890}
1891
1892void GCNSchedStage::finalizeGCNRegion() {
1893 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1894 if (S.HasHighPressure)
1895 DAG.RegionsWithHighRP[RegionIdx] = true;
1896
1897 // Revert scheduling if we have dropped occupancy or there is some other
1898 // reason that the original schedule is better.
1899 checkScheduling();
1900
1901 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1902 StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
1903 SavedMutations.swap(x&: DAG.Mutations);
1904}
1905
1906void PreRARematStage::finalizeGCNRegion() {
1907 GCNSchedStage::finalizeGCNRegion();
1908 // When the goal is to increase occupancy, all regions must reach the target
1909 // occupancy for rematerializations to be possibly useful, otherwise we will
1910 // just hurt latency for no benefit. If minimum occupancy drops below the
1911 // target there is no point in trying to re-schedule further regions.
1912 if (!TargetOcc)
1913 return;
1914 RegionReverts.emplace_back(Args&: RegionIdx, Args&: Unsched, Args&: PressureBefore);
1915 if (DAG.MinOccupancy < *TargetOcc) {
1916 REMAT_DEBUG(dbgs() << "Region " << RegionIdx
1917 << " cannot meet occupancy target, interrupting "
1918 "re-scheduling in all regions\n");
1919 RevertAllRegions = true;
1920 }
1921}
1922
1923void GCNSchedStage::checkScheduling() {
1924 // Check the results of scheduling.
1925 PressureAfter = DAG.getRealRegPressure(RegionIdx);
1926
1927 LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
1928 LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
1929
1930 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1931
1932 if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
1933 PressureAfter.getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
1934 DAG.Pressure[RegionIdx] = PressureAfter;
1935
1936 // Early out if we have achieved the occupancy target.
1937 LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
1938 return;
1939 }
1940
1941 unsigned TargetOccupancy = std::min(
1942 a: S.getTargetOccupancy(), b: ST.getOccupancyWithWorkGroupSizes(MF).second);
1943 unsigned WavesAfter = std::min(
1944 a: TargetOccupancy, b: PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
1945 unsigned WavesBefore = std::min(
1946 a: TargetOccupancy, b: PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
1947 LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
1948 << ", after " << WavesAfter << ".\n");
1949
1950 // We may not be able to keep the current target occupancy because of the just
1951 // scheduled region. We might still be able to revert scheduling if the
1952 // occupancy before was higher, or if the current schedule has register
1953 // pressure higher than the excess limits which could lead to more spilling.
1954 unsigned NewOccupancy = std::max(a: WavesAfter, b: WavesBefore);
1955
1956 // Allow memory bound functions to drop to 4 waves if not limited by an
1957 // attribute.
1958 if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
1959 WavesAfter >= MFI.getMinAllowedOccupancy()) {
1960 LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
1961 << MFI.getMinAllowedOccupancy() << " waves\n");
1962 NewOccupancy = WavesAfter;
1963 }
1964
1965 if (NewOccupancy < DAG.MinOccupancy) {
1966 DAG.MinOccupancy = NewOccupancy;
1967 MFI.limitOccupancy(Limit: DAG.MinOccupancy);
1968 LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
1969 << DAG.MinOccupancy << ".\n");
1970 }
1971 // The maximum number of arch VGPR on non-unified register file, or the
1972 // maximum VGPR + AGPR in the unified register file case.
1973 unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
1974 // The maximum number of arch VGPR for both unified and non-unified register
1975 // file.
1976 unsigned MaxArchVGPRs = std::min(a: MaxVGPRs, b: ST.getAddressableNumArchVGPRs());
1977 unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
1978
1979 if (PressureAfter.getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts()) > MaxVGPRs ||
1980 PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
1981 PressureAfter.getAGPRNum() > MaxArchVGPRs ||
1982 PressureAfter.getSGPRNum() > MaxSGPRs) {
1983 DAG.RegionsWithHighRP[RegionIdx] = true;
1984 DAG.RegionsWithExcessRP[RegionIdx] = true;
1985 }
1986
1987 // Revert if this region's schedule would cause a drop in occupancy or
1988 // spilling.
1989 if (shouldRevertScheduling(WavesAfter)) {
1990 modifyRegionSchedule(RegionIdx, MIOrder: Unsched);
1991 std::tie(args&: DAG.RegionBegin, args&: DAG.RegionEnd) = DAG.Regions[RegionIdx];
1992 } else {
1993 DAG.Pressure[RegionIdx] = PressureAfter;
1994 }
1995}
1996
1997unsigned
1998GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1999 DenseMap<unsigned, unsigned> &ReadyCycles,
2000 const TargetSchedModel &SM) {
2001 unsigned ReadyCycle = CurrCycle;
2002 for (auto &D : SU.Preds) {
2003 if (D.isAssignedRegDep()) {
2004 MachineInstr *DefMI = D.getSUnit()->getInstr();
2005 unsigned Latency = SM.computeInstrLatency(MI: DefMI);
2006 unsigned DefReady = ReadyCycles[DAG.getSUnit(MI: DefMI)->NodeNum];
2007 ReadyCycle = std::max(a: ReadyCycle, b: DefReady + Latency);
2008 }
2009 }
2010 ReadyCycles[SU.NodeNum] = ReadyCycle;
2011 return ReadyCycle;
2012}
2013
2014#ifndef NDEBUG
2015struct EarlierIssuingCycle {
2016 bool operator()(std::pair<MachineInstr *, unsigned> A,
2017 std::pair<MachineInstr *, unsigned> B) const {
2018 return A.second < B.second;
2019 }
2020};
2021
2022static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
2023 EarlierIssuingCycle> &ReadyCycles) {
2024 if (ReadyCycles.empty())
2025 return;
2026 unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
2027 dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
2028 << " ##################\n# Cycle #\t\t\tInstruction "
2029 " "
2030 " \n";
2031 unsigned IPrev = 1;
2032 for (auto &I : ReadyCycles) {
2033 if (I.second > IPrev + 1)
2034 dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
2035 << " CYCLES DETECTED ******************************\n\n";
2036 dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
2037 IPrev = I.second;
2038 }
2039}
2040#endif
2041
2042ScheduleMetrics
2043GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
2044#ifndef NDEBUG
2045 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
2046 ReadyCyclesSorted;
2047#endif
2048 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
2049 unsigned SumBubbles = 0;
2050 DenseMap<unsigned, unsigned> ReadyCycles;
2051 unsigned CurrCycle = 0;
2052 for (auto &SU : InputSchedule) {
2053 unsigned ReadyCycle =
2054 computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
2055 SumBubbles += ReadyCycle - CurrCycle;
2056#ifndef NDEBUG
2057 ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
2058#endif
2059 CurrCycle = ++ReadyCycle;
2060 }
2061#ifndef NDEBUG
2062 LLVM_DEBUG(
2063 printScheduleModel(ReadyCyclesSorted);
2064 dbgs() << "\n\t"
2065 << "Metric: "
2066 << (SumBubbles
2067 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
2068 : 1)
2069 << "\n\n");
2070#endif
2071
2072 return ScheduleMetrics(CurrCycle, SumBubbles);
2073}
2074
2075ScheduleMetrics
2076GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
2077#ifndef NDEBUG
2078 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
2079 ReadyCyclesSorted;
2080#endif
2081 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
2082 unsigned SumBubbles = 0;
2083 DenseMap<unsigned, unsigned> ReadyCycles;
2084 unsigned CurrCycle = 0;
2085 for (auto &MI : DAG) {
2086 SUnit *SU = DAG.getSUnit(MI: &MI);
2087 if (!SU)
2088 continue;
2089 unsigned ReadyCycle =
2090 computeSUnitReadyCycle(SU: *SU, CurrCycle, ReadyCycles, SM);
2091 SumBubbles += ReadyCycle - CurrCycle;
2092#ifndef NDEBUG
2093 ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
2094#endif
2095 CurrCycle = ++ReadyCycle;
2096 }
2097#ifndef NDEBUG
2098 LLVM_DEBUG(
2099 printScheduleModel(ReadyCyclesSorted);
2100 dbgs() << "\n\t"
2101 << "Metric: "
2102 << (SumBubbles
2103 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
2104 : 1)
2105 << "\n\n");
2106#endif
2107
2108 return ScheduleMetrics(CurrCycle, SumBubbles);
2109}
2110
2111bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
2112 if (WavesAfter < DAG.MinOccupancy)
2113 return true;
2114
2115 // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
2116 if (DAG.MFI.isDynamicVGPREnabled()) {
2117 unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
2118 STI: ST, NumVGPRs: DAG.MFI.getDynamicVGPRBlockSize(),
2119 DynamicVGPRBlockSize: PressureBefore.getVGPRNum(UnifiedVGPRFile: false));
2120 unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
2121 STI: ST, NumVGPRs: DAG.MFI.getDynamicVGPRBlockSize(), DynamicVGPRBlockSize: PressureAfter.getVGPRNum(UnifiedVGPRFile: false));
2122 if (BlocksAfter > BlocksBefore)
2123 return true;
2124 }
2125
2126 return false;
2127}
2128
2129bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
2130 if (PressureAfter == PressureBefore)
2131 return false;
2132
2133 if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
2134 return true;
2135
2136 if (mayCauseSpilling(WavesAfter))
2137 return true;
2138
2139 return false;
2140}
2141
2142bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
2143 // If RP is not reduced in the unclustered reschedule stage, revert to the
2144 // old schedule.
2145 if ((WavesAfter <=
2146 PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()) &&
2147 mayCauseSpilling(WavesAfter)) ||
2148 GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
2149 LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
2150 return true;
2151 }
2152
2153 // Do not attempt to relax schedule even more if we are already spilling.
2154 if (isRegionWithExcessRP())
2155 return false;
2156
2157 LLVM_DEBUG(
2158 dbgs()
2159 << "\n\t *** In shouldRevertScheduling ***\n"
2160 << " *********** BEFORE UnclusteredHighRPStage ***********\n");
2161 ScheduleMetrics MBefore = getScheduleMetrics(InputSchedule: DAG.SUnits);
2162 LLVM_DEBUG(
2163 dbgs()
2164 << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
2165 ScheduleMetrics MAfter = getScheduleMetrics(DAG);
2166 unsigned OldMetric = MBefore.getMetric();
2167 unsigned NewMetric = MAfter.getMetric();
2168 unsigned WavesBefore = std::min(
2169 a: S.getTargetOccupancy(),
2170 b: PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()));
2171 unsigned Profit =
2172 ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
2173 ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
2174 NewMetric) /
2175 ScheduleMetrics::ScaleFactor;
2176 LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
2177 << MAfter << "Profit: " << Profit << "\n");
2178 return Profit < ScheduleMetrics::ScaleFactor;
2179}
2180
2181bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
2182 if (PressureAfter == PressureBefore)
2183 return false;
2184
2185 if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
2186 return true;
2187
2188 if (mayCauseSpilling(WavesAfter))
2189 return true;
2190
2191 return false;
2192}
2193
2194bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
2195 // When trying to increase occupancy (TargetOcc == true) the stage manages
2196 // region reverts globally (all or none), so we always return false here.
2197 return !TargetOcc && mayCauseSpilling(WavesAfter);
2198}
2199
2200bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
2201 if (mayCauseSpilling(WavesAfter))
2202 return true;
2203
2204 return false;
2205}
2206
2207bool MemoryClauseInitialScheduleStage::shouldRevertScheduling(
2208 unsigned WavesAfter) {
2209 return mayCauseSpilling(WavesAfter);
2210}
2211
2212bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
2213 if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
2214 !PressureAfter.less(MF, O: PressureBefore)) {
2215 LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
2216 return true;
2217 }
2218
2219 return false;
2220}
2221
2222void GCNSchedStage::modifyRegionSchedule(unsigned RegionIdx,
2223 ArrayRef<MachineInstr *> MIOrder) {
2224 assert(static_cast<size_t>(std::distance(DAG.Regions[RegionIdx].first,
2225 DAG.Regions[RegionIdx].second)) ==
2226 MIOrder.size() &&
2227 "instruction number mismatch");
2228 if (MIOrder.empty())
2229 return;
2230
2231 LLVM_DEBUG(dbgs() << "Reverting scheduling for region " << RegionIdx << '\n');
2232
2233 // Reconstruct MI sequence by moving instructions in desired order before
2234 // the current region's start.
2235 MachineBasicBlock::iterator RegionEnd = DAG.Regions[RegionIdx].first;
2236 MachineBasicBlock *MBB = MIOrder.front()->getParent();
2237 for (MachineInstr *MI : MIOrder) {
2238 // Either move the next MI in order before the end of the region or move the
2239 // region end past the MI if it is at the correct position.
2240 MachineBasicBlock::iterator MII = MI->getIterator();
2241 if (MII != RegionEnd) {
2242 // Will subsequent splice move MI up past a non-debug instruction?
2243 bool NonDebugReordered =
2244 !MI->isDebugInstr() &&
2245 skipDebugInstructionsForward(It: RegionEnd, End: MII) != MII;
2246 MBB->splice(Where: RegionEnd, Other: MBB, From: MI);
2247 // Only update LiveIntervals information if non-debug instructions are
2248 // reordered. Otherwise debug instructions could cause code generation to
2249 // change.
2250 if (NonDebugReordered)
2251 DAG.LIS->handleMove(MI&: *MI, UpdateFlags: true);
2252 } else {
2253 // MI is already at the expected position. However, earlier splices in
2254 // this loop may have changed neighboring slot indices, so this MI's
2255 // slot index can become non-monotonic w.r.t. the physical MBB order.
2256 // Only re-seat when monotonicity is actually violated to avoid
2257 // unnecessary LiveInterval changes that could perturb scheduling.
2258 if (!MI->isDebugInstr()) {
2259 SlotIndex MIIdx = DAG.LIS->getInstructionIndex(Instr: *MI);
2260 SlotIndex PrevIdx = DAG.LIS->getSlotIndexes()->getIndexBefore(MI: *MI);
2261 if (PrevIdx >= MIIdx)
2262 DAG.LIS->handleMove(MI&: *MI, UpdateFlags: true);
2263 }
2264 ++RegionEnd;
2265 }
2266 if (MI->isDebugInstr()) {
2267 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
2268 continue;
2269 }
2270
2271 // Reset read-undef flags and update them later.
2272 for (MachineOperand &Op : MI->all_defs())
2273 Op.setIsUndef(false);
2274 RegisterOperands RegOpers;
2275 RegOpers.collect(MI: *MI, TRI: *DAG.TRI, MRI: DAG.MRI, TrackLaneMasks: DAG.ShouldTrackLaneMasks, IgnoreDead: false);
2276 if (DAG.ShouldTrackLaneMasks) {
2277 // Adjust liveness and add missing dead+read-undef flags.
2278 SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(Instr: *MI).getRegSlot();
2279 RegOpers.adjustLaneLiveness(LIS: *DAG.LIS, MRI: DAG.MRI, Pos: SlotIdx, AddFlagsMI: MI);
2280 } else {
2281 // Adjust for missing dead-def flags.
2282 RegOpers.detectDeadDefs(MI: *MI, LIS: *DAG.LIS);
2283 }
2284 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
2285 }
2286
2287 // The region end doesn't change throughout scheduling since it itself is
2288 // outside the region (whether that is a MBB end or a terminator MI).
2289 assert(RegionEnd == DAG.Regions[RegionIdx].second && "region end mismatch");
2290 DAG.Regions[RegionIdx].first = MIOrder.front();
2291}
2292
2293/// Returns true if reaching def \p RD will be in AGPR form after the rewrite
2294/// and so needs no bridge copy: a candidate MFMA in \p RewriteSet, an
2295/// AV_MOV_*_IMM_PSEUDO, or a copy from a candidate src2 reg in \p CandSrc2Regs.
2296/// A non-candidate MFMA stays in VGPR form and still needs a bridge.
2297static bool isReachingDefAGPRForm(
2298 MachineInstr *RD, const SmallPtrSetImpl<MachineInstr *> &RewriteSet,
2299 const DenseSet<Register> &CandSrc2Regs, const SIInstrInfo &TII) {
2300 if (TII.isMAI(MI: *RD))
2301 return RewriteSet.contains(Ptr: RD);
2302 if (RD->getOpcode() == AMDGPU::AV_MOV_B32_IMM_PSEUDO ||
2303 RD->getOpcode() == AMDGPU::AV_MOV_B64_IMM_PSEUDO)
2304 return true;
2305 if (RD->isCopy() && CandSrc2Regs.contains(V: RD->getOperand(i: 1).getReg()))
2306 return true;
2307 return false;
2308}
2309
2310bool RewriteMFMAFormStage::hasUseRequiringVGPR(
2311 ArrayRef<SlotIndex> Src2ReachingDefs,
2312 const SmallPtrSetImpl<MachineInstr *> &RewriteSet) {
2313 for (SlotIndex RDIdx : Src2ReachingDefs) {
2314 const MachineInstr *RD = DAG.LIS->getInstructionFromIndex(index: RDIdx);
2315 SmallVector<MachineOperand *, 8> ReachingUses;
2316 findReachingUses(DefMI: RD, LIS: DAG.LIS, ReachingUses);
2317 for (const MachineOperand *UseMO : ReachingUses) {
2318 const MachineInstr *UseMI = UseMO->getParent();
2319 if (UseMI->isCopy())
2320 continue;
2321 if (TII->isMAI(MI: *UseMI) && RewriteSet.contains(Ptr: UseMI))
2322 continue;
2323 return true;
2324 }
2325 }
2326 return false;
2327}
2328
2329void RewriteMFMAFormStage::resetRewriteCandsToVGPR(
2330 ArrayRef<std::pair<MachineInstr *, unsigned>> RewriteCands) {
2331 for (auto [MI, OriginalOpcode] : RewriteCands) {
2332 assert(TII->isMAI(*MI));
2333 const TargetRegisterClass *ADefRC =
2334 DAG.MRI.getRegClass(Reg: MI->getOperand(i: 0).getReg());
2335 const TargetRegisterClass *VDefRC = SRI->getEquivalentVGPRClass(SRC: ADefRC);
2336 DAG.MRI.setRegClass(Reg: MI->getOperand(i: 0).getReg(), RC: VDefRC);
2337 MI->setDesc(TII->get(Opcode: OriginalOpcode));
2338
2339 MachineOperand *Src2 = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src2);
2340 if (!Src2->isReg())
2341 continue;
2342
2343 // Have to get src types separately since subregs may cause C and D
2344 // registers to be different types even though the actual operand is
2345 // the same size.
2346 const TargetRegisterClass *AUseRC = DAG.MRI.getRegClass(Reg: Src2->getReg());
2347 const TargetRegisterClass *VUseRC = SRI->getEquivalentVGPRClass(SRC: AUseRC);
2348 DAG.MRI.setRegClass(Reg: Src2->getReg(), RC: VUseRC);
2349 }
2350}
2351
2352bool RewriteMFMAFormStage::isRewriteCandidate(MachineInstr *MI) const {
2353 if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(MI: *MI))
2354 return false;
2355 if (AMDGPU::getMFMASrcCVDstAGPROp(Opcode: MI->getOpcode()) == -1)
2356 return false;
2357 // Reject candidates whose users force an unavoidable bridge copy.
2358 Register DstReg = MI->getOperand(i: 0).getReg();
2359 for (const MachineOperand &Use : DAG.MRI.use_nodbg_operands(Reg: DstReg)) {
2360 if (!TII->isMAI(MI: *Use.getParent()) && !Use.getParent()->isCopy())
2361 return false;
2362 }
2363 return true;
2364}
2365
2366bool RewriteMFMAFormStage::initHeuristics(
2367 std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
2368 DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
2369 SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
2370 bool Changed = false;
2371
2372 // Collect the candidate group, its members share AGPR-form operands
2373 // post-rewrite, so reaching defs feeding any member don't need bridge copy.
2374 SmallPtrSet<MachineInstr *, 16> RewriteSet;
2375 DenseSet<Register> CandSrc2Regs;
2376 for (MachineBasicBlock &MBB : MF) {
2377 for (MachineInstr &MI : MBB) {
2378 if (!isRewriteCandidate(MI: &MI))
2379 continue;
2380 RewriteSet.insert(Ptr: &MI);
2381 MachineOperand *Src2 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
2382 if (Src2 && Src2->isReg())
2383 CandSrc2Regs.insert(V: Src2->getReg());
2384 }
2385 }
2386
2387 // Prepare for the heuristics
2388 for (MachineBasicBlock &MBB : MF) {
2389 for (MachineInstr &MI : MBB) {
2390 if (!isRewriteCandidate(MI: &MI))
2391 continue;
2392
2393 int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(Opcode: MI.getOpcode());
2394 assert(ReplacementOp != -1);
2395
2396 RewriteCands.push_back(x: {&MI, MI.getOpcode()});
2397 MI.setDesc(TII->get(Opcode: ReplacementOp));
2398
2399 MachineOperand *Src2 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
2400 if (Src2->isReg()) {
2401 SmallVector<SlotIndex, 8> Src2ReachingDefs;
2402 findReachingDefs(UseMO&: *Src2, LIS: DAG.LIS, DefIdxs&: Src2ReachingDefs);
2403
2404 // If src2 has a use that must remain VGPR, it cannot be reclassified to
2405 // AGPR.
2406 bool Src2NeedsVGPR = hasUseRequiringVGPR(Src2ReachingDefs, RewriteSet);
2407 Src2NeedsVGPRCache[&MI] = Src2NeedsVGPR;
2408
2409 for (SlotIndex RDIdx : Src2ReachingDefs) {
2410 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(index: RDIdx);
2411 if (!Src2NeedsVGPR &&
2412 isReachingDefAGPRForm(RD, RewriteSet, CandSrc2Regs, TII: *TII))
2413 continue;
2414 CopyForDef.insert(Ptr: RD);
2415 }
2416 }
2417
2418 MachineOperand &Dst = MI.getOperand(i: 0);
2419 SmallVector<MachineOperand *, 8> DstReachingUses;
2420
2421 findReachingUses(DefMI: &MI, LIS: DAG.LIS, ReachingUses&: DstReachingUses);
2422
2423 for (MachineOperand *RUOp : DstReachingUses) {
2424 MachineInstr *UserMI = RUOp->getParent();
2425 // Group members read the AGPR result directly.
2426 if (TII->isMAI(MI: *UserMI) && RewriteSet.contains(Ptr: UserMI))
2427 continue;
2428
2429 // For any user of the result of the MFMA which is not an MFMA, we
2430 // insert a copy. For a given register, we will only insert one copy
2431 // per user block.
2432 CopyForUse[UserMI->getParent()].insert(x: RUOp->getReg());
2433
2434 if (TII->isMAI(MI: *UserMI))
2435 continue;
2436
2437 SmallVector<SlotIndex, 8> DstUsesReachingDefs;
2438 findReachingDefs(UseMO&: *RUOp, LIS: DAG.LIS, DefIdxs&: DstUsesReachingDefs);
2439
2440 for (SlotIndex RDIndex : DstUsesReachingDefs) {
2441 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(index: RDIndex);
2442 if (TII->isMAI(MI: *RD))
2443 continue;
2444
2445 // For any definition of the user of the MFMA which is not an MFMA,
2446 // we insert a copy. We do this to transform all the reaching defs
2447 // of this use to AGPR. By doing this, we can insert a copy from
2448 // AGPR to VGPR at the user rather than after the MFMA.
2449 CopyForDef.insert(Ptr: RD);
2450 }
2451 }
2452
2453 // Do the rewrite to allow for updated RP calculation.
2454 const TargetRegisterClass *VDefRC = DAG.MRI.getRegClass(Reg: Dst.getReg());
2455 const TargetRegisterClass *ADefRC = SRI->getEquivalentAGPRClass(SRC: VDefRC);
2456 DAG.MRI.setRegClass(Reg: Dst.getReg(), RC: ADefRC);
2457 if (Src2->isReg()) {
2458 // Have to get src types separately since subregs may cause C and D
2459 // registers to be different types even though the actual operand is
2460 // the same size.
2461 const TargetRegisterClass *VUseRC = DAG.MRI.getRegClass(Reg: Src2->getReg());
2462 const TargetRegisterClass *AUseRC = SRI->getEquivalentAGPRClass(SRC: VUseRC);
2463 DAG.MRI.setRegClass(Reg: Src2->getReg(), RC: AUseRC);
2464 }
2465 Changed = true;
2466 }
2467 }
2468
2469 return Changed;
2470}
2471
2472int64_t RewriteMFMAFormStage::getRewriteCost(
2473 ArrayRef<std::pair<MachineInstr *, unsigned>> RewriteCands,
2474 const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
2475 const SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
2476 MachineBlockFrequencyInfo *MBFI = DAG.MBFI;
2477
2478 int64_t BestSpillCost = 0;
2479 int64_t Cost = 0;
2480 uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();
2481
2482 std::pair<unsigned, unsigned> MaxVectorRegs =
2483 ST.getMaxNumVectorRegs(F: MF.getFunction());
2484 unsigned ArchVGPRThreshold = MaxVectorRegs.first;
2485 unsigned AGPRThreshold = MaxVectorRegs.second;
2486 unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);
2487
2488 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
2489 if (!RegionsWithExcessArchVGPR[Region])
2490 continue;
2491
2492 GCNRegPressure &PressureBefore = DAG.Pressure[Region];
2493 unsigned SpillCostBefore = PressureBefore.getVGPRSpills(
2494 MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
2495
2496 // For the cases we care about (i.e. ArchVGPR usage is greater than the
2497 // addressable limit), rewriting alone should bring pressure to manageable
2498 // level. If we find any such region, then the rewrite is potentially
2499 // beneficial.
2500 GCNRegPressure PressureAfter = DAG.getRealRegPressure(RegionIdx: Region);
2501 unsigned SpillCostAfter = PressureAfter.getVGPRSpills(
2502 MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
2503
2504 uint64_t BlockFreq =
2505 MBFI->getBlockFreq(MBB: DAG.Regions[Region].first->getParent())
2506 .getFrequency();
2507
2508 bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
2509 uint64_t RelativeFreq = EntryFreq && BlockFreq
2510 ? (RelativeFreqIsDenom ? EntryFreq / BlockFreq
2511 : BlockFreq / EntryFreq)
2512 : 1;
2513
2514 // This assumes perfect spilling / splitting -- using one spill / copy
2515 // instruction and one restoreFrom / copy for each excess register,
2516 int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2;
2517
2518 // Also account for the block frequency.
2519 if (RelativeFreqIsDenom)
2520 SpillCost /= (int64_t)RelativeFreq;
2521 else
2522 SpillCost *= (int64_t)RelativeFreq;
2523
2524 // If we have increased spilling in any block, just bail.
2525 if (SpillCost > 0) {
2526 resetRewriteCandsToVGPR(RewriteCands);
2527 return SpillCost;
2528 }
2529
2530 if (SpillCost < BestSpillCost)
2531 BestSpillCost = SpillCost;
2532 }
2533
2534 // Set the cost to the largest decrease in spill cost in order to not double
2535 // count spill reductions.
2536 Cost = BestSpillCost;
2537 assert(Cost <= 0);
2538
2539 unsigned CopyCost = 0;
2540
2541 // For each CopyForDef, increase the cost by the register size while
2542 // accounting for block frequency.
2543 for (MachineInstr *DefMI : CopyForDef) {
2544 Register DefReg = DefMI->getOperand(i: 0).getReg();
2545 uint64_t DefFreq =
2546 EntryFreq
2547 ? MBFI->getBlockFreq(MBB: DefMI->getParent()).getFrequency() / EntryFreq
2548 : 1;
2549
2550 const TargetRegisterClass *RC = DAG.MRI.getRegClass(Reg: DefReg);
2551 CopyCost += RC->getCopyCost() * DefFreq;
2552 }
2553
2554 // Account for CopyForUse copies in each block that the register is used.
2555 for (auto &[UseBlock, UseRegs] : CopyForUse) {
2556 uint64_t UseFreq =
2557 EntryFreq ? MBFI->getBlockFreq(MBB: UseBlock).getFrequency() / EntryFreq : 1;
2558
2559 for (Register UseReg : UseRegs) {
2560 const TargetRegisterClass *RC = DAG.MRI.getRegClass(Reg: UseReg);
2561 CopyCost += RC->getCopyCost() * UseFreq;
2562 }
2563 }
2564
2565 // Reset the classes that were changed to AGPR for better register bank
2566 // analysis. We must do rewriting after copy-insertion, as some defs of the
2567 // register may require VGPR. Additionally, if we bail out and don't perform
2568 // the rewrite then these need to be restored anyway.
2569 resetRewriteCandsToVGPR(RewriteCands);
2570
2571 return Cost + CopyCost;
2572}
2573
2574bool RewriteMFMAFormStage::rewrite(
2575 ArrayRef<std::pair<MachineInstr *, unsigned>> RewriteCands) {
2576 DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
2577 DenseMap<MachineInstr *, unsigned> LastMIToRegion;
2578
2579 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
2580 RegionBoundaries Entry = DAG.Regions[Region];
2581 if (Entry.first == Entry.second)
2582 continue;
2583
2584 FirstMIToRegion[&*Entry.first] = Region;
2585 if (Entry.second != Entry.first->getParent()->end())
2586 LastMIToRegion[&*Entry.second] = Region;
2587 }
2588
2589 // Rewrite the MFMAs to AGPR, and insert any copies as needed.
2590 // The general assumption of the algorithm (and the previous cost calculation)
2591 // is that it is better to insert the copies in the MBB of the def of the src2
2592 // operands, and in the MBB of the user of the dest operands. This is based on
2593 // the assumption that the MFMAs are likely to appear in loop bodies, while
2594 // the src2 and dest operands are live-in / live-out of the loop. Due to this
2595 // design, the algorithm for finding copy insertion points is more
2596 // complicated.
2597 //
2598 // There are three main cases to handle: 1. the reaching defs of the src2
2599 // operands, 2. the reaching uses of the dst operands, and 3. the reaching
2600 // defs of the reaching uses of the dst operand.
2601 //
2602 // In the first case, we simply insert copies after each of the reaching
2603 // definitions. In the second case, we collect all the uses of a given dest
2604 // and organize them by MBB. Then, we insert 1 copy for each MBB before the
2605 // earliest use. Since the use may have multiple reaching defs, and since we
2606 // want to replace the register it is using with the result of the copy, we
2607 // must handle case 3. In the third case, we simply insert a copy after each
2608 // of the reaching defs to connect to the copy of the reaching uses of the dst
2609 // reg. This allows us to avoid inserting copies next to the MFMAs.
2610 //
2611 // While inserting the copies, we maintain a map of operands which will use
2612 // different regs (i.e. the result of the copies). For example, a case 1 src2
2613 // operand will use the register result of the copies after the reaching defs,
2614 // as opposed to the original register. Now that we have completed our copy
2615 // analysis and placement, we can bulk update the registers. We do this
2616 // separately as to avoid complicating the reachingDef and reachingUse
2617 // queries.
2618 //
2619 // While inserting the copies, we also maintain a list or registers which we
2620 // will want to reclassify as AGPR. After doing the copy insertion and the
2621 // register replacement, we can finally do the reclassification. This uses the
2622 // redef map, as the registers we are interested in reclassifying may be
2623 // replaced by the result of a copy. We must do this after the copy analysis
2624 // and placement as we must have an accurate redef map -- otherwise we may end
2625 // up creating illegal instructions.
2626
2627 // The original registers of the MFMA that need to be reclassified as AGPR.
2628 DenseSet<Register> RewriteRegs;
2629 // The map of an original register in the MFMA to a new register (result of a
2630 // copy) that it should be replaced with.
2631 DenseMap<Register, Register> RedefMap;
2632 // The map of the original MFMA registers to the relevant MFMA operands.
2633 DenseMap<Register, DenseSet<MachineOperand *>> ReplaceMap;
2634 // The map of reaching defs for a given register -- to avoid duplicate copies.
2635 DenseMap<Register, SmallPtrSet<MachineInstr *, 8>> ReachingDefCopyMap;
2636 // The map of reaching uses for a given register by basic block -- to avoid
2637 // duplicate copies and to calculate per MBB insert pts.
2638 DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>
2639 ReachingUseTracker;
2640
2641 // Collect the candidate group; its members share AGPR-form operands
2642 // post-rewrite, so reaching defs feeding any member need no bridge copy.
2643 SmallPtrSet<MachineInstr *, 16> RewriteCandsSet;
2644 DenseSet<Register> RewriteSrc2Regs;
2645 for (auto &[MI, OriginalOpcode] : RewriteCands) {
2646 RewriteCandsSet.insert(Ptr: MI);
2647 MachineOperand *Src2 = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src2);
2648 if (Src2 && Src2->isReg())
2649 RewriteSrc2Regs.insert(V: Src2->getReg());
2650 }
2651
2652 for (auto &[MI, OriginalOpcode] : RewriteCands) {
2653 int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(Opcode: MI->getOpcode());
2654 if (ReplacementOp == -1)
2655 continue;
2656 MI->setDesc(TII->get(Opcode: ReplacementOp));
2657
2658 // Case 1: insert copies for the reaching defs of the Src2Reg.
2659 MachineOperand *Src2 = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src2);
2660 if (Src2->isReg()) {
2661 Register Src2Reg = Src2->getReg();
2662 if (!Src2Reg.isVirtual())
2663 return false;
2664
2665 Register MappedReg = Src2->getReg();
2666 SmallVector<SlotIndex, 8> Src2ReachingDefs;
2667 findReachingDefs(UseMO&: *Src2, LIS: DAG.LIS, DefIdxs&: Src2ReachingDefs);
2668 SmallSetVector<MachineInstr *, 8> Src2DefsReplace;
2669
2670 // If src2 has a use that must remain VGPR, it cannot be reclassified to
2671 // AGPR.
2672 bool Src2NeedsVGPR = Src2NeedsVGPRCache.lookup(Val: MI);
2673
2674 for (SlotIndex RDIndex : Src2ReachingDefs) {
2675 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(index: RDIndex);
2676 if (!Src2NeedsVGPR &&
2677 isReachingDefAGPRForm(RD, RewriteSet: RewriteCandsSet, CandSrc2Regs: RewriteSrc2Regs, TII: *TII))
2678 continue;
2679
2680 Src2DefsReplace.insert(X: RD);
2681 }
2682
2683 if (!Src2DefsReplace.empty()) {
2684 auto RI = RedefMap.find(Val: Src2Reg);
2685 if (RI != RedefMap.end()) {
2686 MappedReg = RI->second;
2687 } else {
2688 assert(!ReachingDefCopyMap.contains(Src2Reg));
2689 const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Reg: Src2Reg);
2690 const TargetRegisterClass *VGPRRC =
2691 SRI->getEquivalentVGPRClass(SRC: Src2RC);
2692
2693 // Track the mapping of the original register to the new register.
2694 MappedReg = DAG.MRI.createVirtualRegister(RegClass: VGPRRC);
2695 RedefMap[Src2Reg] = MappedReg;
2696 }
2697
2698 // If none exists, create a copy from this reaching def.
2699 // We may have inserted a copy already in an earlier iteration.
2700 for (MachineInstr *RD : Src2DefsReplace) {
2701 // Do not create redundant copies.
2702 if (ReachingDefCopyMap[Src2Reg].insert(Ptr: RD).second) {
2703 MachineInstrBuilder VGPRCopy =
2704 BuildMI(BB&: *RD->getParent(), I: std::next(x: RD->getIterator()),
2705 MIMD: RD->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY))
2706 .addDef(RegNo: MappedReg, Flags: {}, SubReg: 0)
2707 .addUse(RegNo: Src2Reg, Flags: {}, SubReg: 0);
2708 DAG.LIS->InsertMachineInstrInMaps(MI&: *VGPRCopy);
2709
2710 // If this reaching def was the last MI in the region, update the
2711 // region boundaries.
2712 if (LastMIToRegion.contains(Val: RD)) {
2713 unsigned UpdateRegion = LastMIToRegion[RD];
2714 DAG.Regions[UpdateRegion].second = VGPRCopy;
2715 LastMIToRegion.erase(Val: RD);
2716 }
2717 }
2718 }
2719 }
2720
2721 // Track the register for reclassification
2722 RewriteRegs.insert(V: Src2Reg);
2723
2724 // Always insert the operand for replacement. If this corresponds with a
2725 // chain of tied-def we may not see the VGPR requirement until later.
2726 ReplaceMap[Src2Reg].insert(V: Src2);
2727 }
2728
2729 // Case 2 and Case 3: insert copies before the reaching uses of the dsts,
2730 // and after the reaching defs of the reaching uses of the dsts.
2731
2732 MachineOperand *Dst = &MI->getOperand(i: 0);
2733 Register DstReg = Dst->getReg();
2734 if (!DstReg.isVirtual())
2735 return false;
2736
2737 Register MappedReg = DstReg;
2738 SmallVector<MachineOperand *, 8> DstReachingUses;
2739
2740 SmallVector<MachineOperand *, 8> DstReachingUseCopies;
2741 SmallVector<MachineInstr *, 8> DstUseDefsReplace;
2742
2743 findReachingUses(DefMI: MI, LIS: DAG.LIS, ReachingUses&: DstReachingUses);
2744
2745 for (MachineOperand *RUOp : DstReachingUses) {
2746 MachineInstr *UserMI = RUOp->getParent();
2747 // Group members read the AGPR result directly.
2748 if (TII->isMAI(MI: *UserMI) && RewriteCandsSet.contains(Ptr: UserMI))
2749 continue;
2750
2751 // If there is a non mai reaching use, then we need a copy.
2752 if (find(Range&: DstReachingUseCopies, Val: RUOp) == DstReachingUseCopies.end())
2753 DstReachingUseCopies.push_back(Elt: RUOp);
2754
2755 // Non-rewritten MAI: its defs aren't being reclassified.
2756 if (TII->isMAI(MI: *UserMI))
2757 continue;
2758
2759 SmallVector<SlotIndex, 8> DstUsesReachingDefs;
2760 findReachingDefs(UseMO&: *RUOp, LIS: DAG.LIS, DefIdxs&: DstUsesReachingDefs);
2761
2762 for (SlotIndex RDIndex : DstUsesReachingDefs) {
2763 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(index: RDIndex);
2764 if (TII->isMAI(MI: *RD))
2765 continue;
2766
2767 // If there is a non mai reaching def of this reaching use, then we will
2768 // need a copy.
2769 if (find(Range&: DstUseDefsReplace, Val: RD) == DstUseDefsReplace.end())
2770 DstUseDefsReplace.push_back(Elt: RD);
2771 }
2772 }
2773
2774 if (!DstUseDefsReplace.empty()) {
2775 auto RI = RedefMap.find(Val: DstReg);
2776 if (RI != RedefMap.end()) {
2777 MappedReg = RI->second;
2778 } else {
2779 assert(!ReachingDefCopyMap.contains(DstReg));
2780 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(Reg: DstReg);
2781 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(SRC: DstRC);
2782
2783 // Track the mapping of the original register to the new register.
2784 MappedReg = DAG.MRI.createVirtualRegister(RegClass: VGPRRC);
2785 RedefMap[DstReg] = MappedReg;
2786 }
2787
2788 // If none exists, create a copy from this reaching def.
2789 // We may have inserted a copy already in an earlier iteration.
2790 for (MachineInstr *RD : DstUseDefsReplace) {
2791 // Do not create reundant copies.
2792 if (ReachingDefCopyMap[DstReg].insert(Ptr: RD).second) {
2793 MachineInstrBuilder VGPRCopy =
2794 BuildMI(BB&: *RD->getParent(), I: std::next(x: RD->getIterator()),
2795 MIMD: RD->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY))
2796 .addDef(RegNo: MappedReg, Flags: {}, SubReg: 0)
2797 .addUse(RegNo: DstReg, Flags: {}, SubReg: 0);
2798 DAG.LIS->InsertMachineInstrInMaps(MI&: *VGPRCopy);
2799
2800 // If this reaching def was the last MI in the region, update the
2801 // region boundaries.
2802 auto LMI = LastMIToRegion.find(Val: RD);
2803 if (LMI != LastMIToRegion.end()) {
2804 unsigned UpdateRegion = LMI->second;
2805 DAG.Regions[UpdateRegion].second = VGPRCopy;
2806 LastMIToRegion.erase(Val: RD);
2807 }
2808 }
2809 }
2810 }
2811
2812 DenseSet<MachineOperand *> &DstRegSet = ReplaceMap[DstReg];
2813 for (MachineOperand *RU : DstReachingUseCopies) {
2814 MachineBasicBlock *RUBlock = RU->getParent()->getParent();
2815 // Just keep track of the reaching use of this register by block. After we
2816 // have scanned all the MFMAs we can find optimal insert pts.
2817 if (RUBlock != MI->getParent()) {
2818 ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(Ptr: RU);
2819 continue;
2820 }
2821
2822 // Special case, the use is in the same block as the MFMA. Insert the copy
2823 // just before the use.
2824 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(Reg: DstReg);
2825 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(SRC: DstRC);
2826 Register NewUseReg = DAG.MRI.createVirtualRegister(RegClass: VGPRRC);
2827 MachineInstr *UseInst = RU->getParent();
2828 MachineInstrBuilder VGPRCopy =
2829 BuildMI(BB&: *UseInst->getParent(), I: UseInst->getIterator(),
2830 MIMD: UseInst->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY))
2831 .addDef(RegNo: NewUseReg, Flags: {}, SubReg: 0)
2832 .addUse(RegNo: DstReg, Flags: {}, SubReg: 0);
2833 DAG.LIS->InsertMachineInstrInMaps(MI&: *VGPRCopy);
2834 // Since we know this use has only one reaching def, we can replace the
2835 // use reg.
2836 RU->setReg(NewUseReg);
2837 // Track the copy source operand for r eplacement.
2838 DstRegSet.insert(V: &VGPRCopy->getOperand(i: 1));
2839 }
2840
2841 // Track the register for reclassification
2842 RewriteRegs.insert(V: DstReg);
2843
2844 // Insert the dst operand for replacement. If this dst is in a chain of
2845 // tied-def MFMAs, and the first src2 needs to be replaced with a new reg,
2846 // all the correspond operands need to be replaced.
2847 DstRegSet.insert(V: Dst);
2848 }
2849
2850 // Handle the copies for dst uses.
2851 using RUBType =
2852 std::pair<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>;
2853 for (RUBType RUBlockEntry : ReachingUseTracker) {
2854 using RUDType = std::pair<Register, SmallPtrSet<MachineOperand *, 8>>;
2855 for (RUDType RUDst : RUBlockEntry.second) {
2856 MachineOperand *OpBegin = *RUDst.second.begin();
2857 SlotIndex InstPt = DAG.LIS->getInstructionIndex(Instr: *OpBegin->getParent());
2858
2859 // Find the earliest use in this block.
2860 for (MachineOperand *User : RUDst.second) {
2861 SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(Instr: *User->getParent());
2862 if (SlotIndex::isEarlierInstr(A: NewInstPt, B: InstPt))
2863 InstPt = NewInstPt;
2864 }
2865
2866 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(Reg: RUDst.first);
2867 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(SRC: DstRC);
2868 Register NewUseReg = DAG.MRI.createVirtualRegister(RegClass: VGPRRC);
2869 MachineInstr *UseInst = DAG.LIS->getInstructionFromIndex(index: InstPt);
2870
2871 MachineInstrBuilder VGPRCopy =
2872 BuildMI(BB&: *UseInst->getParent(), I: UseInst->getIterator(),
2873 MIMD: UseInst->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY))
2874 .addDef(RegNo: NewUseReg, Flags: {}, SubReg: 0)
2875 .addUse(RegNo: RUDst.first, Flags: {}, SubReg: 0);
2876 DAG.LIS->InsertMachineInstrInMaps(MI&: *VGPRCopy);
2877
2878 // If this UseInst was the first MI in the region, update the region
2879 // boundaries.
2880 auto FI = FirstMIToRegion.find(Val: UseInst);
2881 if (FI != FirstMIToRegion.end()) {
2882 unsigned UpdateRegion = FI->second;
2883 DAG.Regions[UpdateRegion].first = VGPRCopy;
2884 FirstMIToRegion.erase(Val: UseInst);
2885 }
2886
2887 // Replace the operand for all users.
2888 for (MachineOperand *User : RUDst.second) {
2889 User->setReg(NewUseReg);
2890 }
2891
2892 // Track the copy source operand for replacement.
2893 ReplaceMap[RUDst.first].insert(V: &VGPRCopy->getOperand(i: 1));
2894 }
2895 }
2896
2897 // We may have needed to insert copies after the reaching defs of the MFMAs.
2898 // Replace the original register with the result of the copy for all relevant
2899 // operands.
2900 for (std::pair<Register, Register> NewDef : RedefMap) {
2901 Register OldReg = NewDef.first;
2902 Register NewReg = NewDef.second;
2903
2904 // Replace the register for any associated operand in the MFMA chain.
2905 for (MachineOperand *ReplaceOp : ReplaceMap[OldReg])
2906 ReplaceOp->setReg(NewReg);
2907 }
2908
2909 // Finally, do the reclassification of the MFMA registers.
2910 for (Register RewriteReg : RewriteRegs) {
2911 Register RegToRewrite = RewriteReg;
2912
2913 // Be sure to update the replacement register and not the original.
2914 auto RI = RedefMap.find(Val: RewriteReg);
2915 if (RI != RedefMap.end())
2916 RegToRewrite = RI->second;
2917
2918 const TargetRegisterClass *CurrRC = DAG.MRI.getRegClass(Reg: RegToRewrite);
2919 const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(SRC: CurrRC);
2920
2921 DAG.MRI.setRegClass(Reg: RegToRewrite, RC: AGPRRC);
2922 }
2923
2924 // Bulk update the LIS.
2925 DAG.LIS->reanalyze(MF&: DAG.MF);
2926 // Liveins may have been modified for cross RC copies
2927 RegionPressureMap LiveInUpdater(&DAG, false);
2928 LiveInUpdater.buildLiveRegMap();
2929
2930 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++)
2931 DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(RegionIdx: Region);
2932
2933 DAG.Pressure[RegionIdx] = DAG.getRealRegPressure(RegionIdx);
2934
2935 return true;
2936}
2937
2938unsigned PreRARematStage::getStageTargetOccupancy() const {
2939 return TargetOcc ? *TargetOcc : MFI.getMinWavesPerEU();
2940}
2941
2942bool PreRARematStage::setObjective() {
2943 const Function &F = MF.getFunction();
2944
2945 // Set up "spilling targets" for all regions.
2946 unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
2947 unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
2948 bool HasVectorRegisterExcess = false;
2949 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2950 const GCNRegPressure &RP = DAG.Pressure[I];
2951 GCNRPTarget &Target = RPTargets.emplace_back(Args&: MaxSGPRs, Args&: MaxVGPRs, Args&: MF, Args: RP);
2952 if (!Target.satisfied())
2953 TargetRegions.set(I);
2954 HasVectorRegisterExcess |= Target.hasVectorRegisterExcess();
2955 }
2956
2957 if (HasVectorRegisterExcess || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
2958 // In addition to register usage being above addressable limits, occupancy
2959 // below the minimum is considered like "spilling" as well.
2960 TargetOcc = std::nullopt;
2961 } else {
2962 // There is no spilling and room to improve occupancy; set up "increased
2963 // occupancy targets" for all regions.
2964 TargetOcc = DAG.MinOccupancy + 1;
2965 const unsigned VGPRBlockSize = MFI.getDynamicVGPRBlockSize();
2966 MaxSGPRs = ST.getMaxNumSGPRs(WavesPerEU: *TargetOcc, Addressable: false);
2967 MaxVGPRs = ST.getMaxNumVGPRs(WavesPerEU: *TargetOcc, DynamicVGPRBlockSize: VGPRBlockSize);
2968 for (auto [I, Target] : enumerate(First&: RPTargets)) {
2969 Target.setTarget(NumSGPRs: MaxSGPRs, NumVGPRs: MaxVGPRs);
2970 if (!Target.satisfied())
2971 TargetRegions.set(I);
2972 }
2973 }
2974
2975 return TargetRegions.any();
2976}
2977
2978bool PreRARematStage::ScoredRemat::maybeBeneficial(
2979 const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets) const {
2980 for (unsigned I : TargetRegions.set_bits()) {
2981 if (Live[I] && RPTargets[I].isSaveBeneficial(SaveRP: RPSave))
2982 return true;
2983 }
2984 return false;
2985}
2986
2987PreRARematStage::ScoredRemat::FreqInfo::FreqInfo(
2988 MachineFunction &MF, const GCNScheduleDAGMILive &DAG) {
2989 assert(DAG.MLI && "MLI not defined in DAG");
2990 MachineBranchProbabilityInfo MBPI;
2991 MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI);
2992
2993 const unsigned NumRegions = DAG.Regions.size();
2994 MinFreq = MBFI.getEntryFreq().getFrequency();
2995 MaxFreq = 0;
2996 Regions.reserve(N: NumRegions);
2997 for (unsigned I = 0; I < NumRegions; ++I) {
2998 MachineBasicBlock *MBB = DAG.Regions[I].first->getParent();
2999 uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency();
3000 Regions.push_back(Elt: BlockFreq);
3001 if (BlockFreq && BlockFreq < MinFreq)
3002 MinFreq = BlockFreq;
3003 else if (BlockFreq > MaxFreq)
3004 MaxFreq = BlockFreq;
3005 }
3006 if (!MinFreq)
3007 return;
3008
3009 // Scale everything down if frequencies are high.
3010 if (MinFreq >= ScaleFactor * ScaleFactor) {
3011 for (uint64_t &Freq : Regions)
3012 Freq /= ScaleFactor;
3013 MinFreq /= ScaleFactor;
3014 MaxFreq /= ScaleFactor;
3015 }
3016}
3017
3018void PreRARematStage::ScoredRemat::init(RegisterIdx RegIdx,
3019 const FreqInfo &Freq,
3020 const Rematerializer &Remater,
3021 GCNScheduleDAGMILive &DAG) {
3022 this->RegIdx = RegIdx;
3023 const unsigned NumRegions = DAG.Regions.size();
3024 LiveIn.resize(N: NumRegions);
3025 LiveOut.resize(N: NumRegions);
3026 Live.resize(N: NumRegions);
3027 UnpredictableRPSave.resize(N: NumRegions);
3028
3029 const Rematerializer::Reg &Reg = Remater.getReg(RegIdx);
3030 Register DefReg = Reg.getDefReg();
3031 assert(Reg.Uses.size() == 1 && "expected users in single region");
3032 const unsigned UseRegion = Reg.Uses.begin()->first;
3033
3034 // Mark regions in which the rematerializable register is live.
3035 for (unsigned I = 0, E = NumRegions; I != E; ++I) {
3036 if (DAG.LiveIns[I].contains(Val: DefReg))
3037 LiveIn.set(I);
3038 if (DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: I).contains(Val: DefReg))
3039 LiveOut.set(I);
3040
3041 // If the register is both unused and live-through in the region, the
3042 // latter's RP is guaranteed to decrease.
3043 if (!LiveIn[I] || !LiveOut[I] || I == UseRegion)
3044 UnpredictableRPSave.set(I);
3045 }
3046 Live |= LiveIn;
3047 Live |= LiveOut;
3048 RPSave.inc(Reg: DefReg, PrevMask: LaneBitmask::getNone(), NewMask: Reg.Mask, MRI: DAG.MRI);
3049
3050 // Get frequencies of defining and using regions. A rematerialization from the
3051 // least frequent region to the most frequent region will yield the greatest
3052 // in order to penalize rematerializations from or into regions whose
3053 int64_t DefOrMin = std::max(a: Freq.Regions[Reg.DefRegion], b: Freq.MinFreq);
3054 int64_t UseOrMax = Freq.Regions[UseRegion];
3055 if (!UseOrMax)
3056 UseOrMax = Freq.MaxFreq;
3057 FreqDiff = DefOrMin - UseOrMax;
3058}
3059
3060void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
3061 ArrayRef<GCNRPTarget> RPTargets,
3062 const FreqInfo &FreqInfo,
3063 bool ReduceSpill) {
3064 MaxFreq = 0;
3065 RegionImpact = 0;
3066 for (unsigned I : TargetRegions.set_bits()) {
3067 if (!Live[I])
3068 continue;
3069
3070 // The rematerialization must contribute positively in at least one
3071 // register class with usage above the RP target for this region to
3072 // contribute to the score.
3073 const GCNRPTarget &RegionTarget = RPTargets[I];
3074 const unsigned NumRegsBenefit = RegionTarget.getNumRegsBenefit(SaveRP: RPSave);
3075 if (!NumRegsBenefit)
3076 continue;
3077
3078 // Regions in which RP is guaranteed to decrease have more weight.
3079 RegionImpact += (UnpredictableRPSave[I] ? 1 : 2) * NumRegsBenefit;
3080
3081 if (ReduceSpill) {
3082 uint64_t Freq = FreqInfo.Regions[I];
3083 if (UnpredictableRPSave[I]) {
3084 // Apply a frequency penalty in regions in which we are not sure that RP
3085 // will decrease.
3086 Freq /= 2;
3087 }
3088 MaxFreq = std::max(a: MaxFreq, b: Freq);
3089 }
3090 }
3091}
3092
3093void PreRARematStage::ScoredRemat::rematerialize(
3094 Rematerializer &Remater) const {
3095 const Rematerializer::Reg &Reg = Remater.getReg(RegIdx);
3096 Rematerializer::DependencyReuseInfo DRI;
3097 for (const Rematerializer::Reg::Dependency &Dep : Reg.Dependencies)
3098 DRI.reuse(DepIdx: Dep.RegIdx);
3099 unsigned UseRegion = Reg.Uses.begin()->first;
3100 Remater.rematerializeToRegion(RootIdx: RegIdx, UseRegion, DRI);
3101}
3102
3103void PreRARematStage::updateRPTargets(const BitVector &Regions,
3104 const GCNRegPressure &RPSave) {
3105 for (unsigned I : Regions.set_bits()) {
3106 RPTargets[I].saveRP(SaveRP: RPSave);
3107 if (TargetRegions[I] && RPTargets[I].satisfied()) {
3108 REMAT_DEBUG(dbgs() << " [" << I << "] Target reached!\n");
3109 TargetRegions.reset(Idx: I);
3110 }
3111 }
3112}
3113
3114bool PreRARematStage::updateAndVerifyRPTargets(const BitVector &Regions) {
3115 bool TooOptimistic = false;
3116 for (unsigned I : Regions.set_bits()) {
3117 GCNRPTarget &Target = RPTargets[I];
3118 Target.setRP(DAG.getRealRegPressure(RegionIdx: I));
3119
3120 // Since we were optimistic in assessing RP decreases in these regions, we
3121 // may need to remark the target as a target region if RP didn't decrease
3122 // as expected.
3123 if (!TargetRegions[I] && !Target.satisfied()) {
3124 REMAT_DEBUG(dbgs() << " [" << I << "] Incorrect RP estimation\n");
3125 TooOptimistic = true;
3126 TargetRegions.set(I);
3127 }
3128 }
3129 return TooOptimistic;
3130}
3131
3132void PreRARematStage::removeFromLiveMaps(Register Reg, const BitVector &LiveIn,
3133 const BitVector &LiveOut) {
3134 assert(LiveIn.size() == DAG.Regions.size() &&
3135 LiveOut.size() == DAG.Regions.size() && "region num mismatch");
3136 for (unsigned I : LiveIn.set_bits())
3137 DAG.LiveIns[I].erase(Val: Reg);
3138 for (unsigned I : LiveOut.set_bits())
3139 DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: I).erase(Val: Reg);
3140}
3141
3142void PreRARematStage::addToLiveMaps(Register Reg, LaneBitmask Mask,
3143 const BitVector &LiveIn,
3144 const BitVector &LiveOut) {
3145 assert(LiveIn.size() == DAG.Regions.size() &&
3146 LiveOut.size() == DAG.Regions.size() && "region num mismatch");
3147 std::pair<Register, LaneBitmask> LiveReg(Reg, Mask);
3148 for (unsigned I : LiveIn.set_bits())
3149 DAG.LiveIns[I].insert(KV: LiveReg);
3150 for (unsigned I : LiveOut.set_bits())
3151 DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: I).insert(KV: LiveReg);
3152}
3153
3154void PreRARematStage::finalizeGCNSchedStage() {
3155 // We consider that reducing spilling is always beneficial so we never
3156 // rollback rematerializations or revert scheduling in such cases.
3157 if (!TargetOcc)
3158 return;
3159
3160 // When increasing occupancy, it is possible that re-scheduling is not able to
3161 // achieve the target occupancy in all regions, in which case re-scheduling in
3162 // all regions should be reverted.
3163 if (DAG.MinOccupancy >= *TargetOcc)
3164 return;
3165
3166 // Revert re-scheduling in all affected regions.
3167 for (const auto &[RegionIdx, OrigMIOrder, MaxPressure] : RegionReverts) {
3168 REMAT_DEBUG(dbgs() << "Reverting re-scheduling in region " << RegionIdx
3169 << '\n');
3170 DAG.Pressure[RegionIdx] = MaxPressure;
3171 modifyRegionSchedule(RegionIdx, MIOrder: OrigMIOrder);
3172 }
3173
3174 // It is possible that re-scheduling lowers occupancy over the one achieved
3175 // just through rematerializations, in which case we revert re-scheduling in
3176 // all regions but do not roll back rematerializations.
3177 if (AchievedOcc >= *TargetOcc) {
3178 DAG.setTargetOccupancy(AchievedOcc);
3179 return;
3180 }
3181
3182 // Reset the target occupancy to what it was pre-rematerialization.
3183 DAG.setTargetOccupancy(*TargetOcc - 1);
3184
3185 // Roll back changes made by the stage, then recompute pressure in all
3186 // affected regions.
3187 REMAT_DEBUG(dbgs() << "==== ROLLBACK ====\n");
3188 assert(Rollback && "rollbacker should be defined");
3189 Rollback->Listener.rollback(Remater);
3190 for (const auto &[RegIdx, LiveIn, LiveOut] : Rollback->LiveMapUpdates) {
3191 const Rematerializer::Reg &Reg = Remater.getReg(RegIdx);
3192 addToLiveMaps(Reg: Reg.getDefReg(), Mask: Reg.Mask, LiveIn, LiveOut);
3193 }
3194
3195#ifdef EXPENSIVE_CHECKS
3196 // In particular, we want to check for coherent MI/slot order in regions in
3197 // which reverts and/or rollbacks may have happened.
3198 MF.verify();
3199#endif
3200 for (unsigned I : RescheduleRegions.set_bits())
3201 DAG.Pressure[I] = DAG.getRealRegPressure(RegionIdx: I);
3202
3203 GCNSchedStage::finalizeGCNSchedStage();
3204}
3205
3206void GCNScheduleDAGMILive::setTargetOccupancy(unsigned TargetOccupancy) {
3207 MinOccupancy = TargetOccupancy;
3208 if (MFI.getOccupancy() < TargetOccupancy)
3209 MFI.increaseOccupancy(MF, Limit: MinOccupancy);
3210 else
3211 MFI.limitOccupancy(Limit: MinOccupancy);
3212}
3213
3214static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
3215 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
3216 return any_of(Range&: *DAG, P: [SII](MachineBasicBlock::iterator MI) {
3217 return SII->isIGLPMutationOnly(Opcode: MI->getOpcode());
3218 });
3219}
3220
3221GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
3222 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
3223 bool RemoveKillFlags)
3224 : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
3225
3226void GCNPostScheduleDAGMILive::schedule() {
3227 HasIGLPInstrs = hasIGLPInstrs(DAG: this);
3228 if (HasIGLPInstrs) {
3229 SavedMutations.clear();
3230 SavedMutations.swap(x&: Mutations);
3231 addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PostRA));
3232 }
3233
3234 ScheduleDAGMI::schedule();
3235}
3236
3237void GCNPostScheduleDAGMILive::finalizeSchedule() {
3238 if (HasIGLPInstrs)
3239 SavedMutations.swap(x&: Mutations);
3240
3241 ScheduleDAGMI::finalizeSchedule();
3242}
3243