1//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This contains a MachineSchedStrategy implementation for maximizing wave
11/// occupancy on GCN hardware.
12///
13/// This pass will apply multiple scheduling stages to the same function.
14/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15/// entry point for the scheduling of those regions is
16/// GCNScheduleDAGMILive::runSchedStages.
///
18/// Generally, the reason for having multiple scheduling stages is to account
19/// for the kernel-wide effect of register usage on occupancy. Usually, only a
20/// few scheduling regions will have register pressure high enough to limit
21/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22/// other regions.
23///
24//===----------------------------------------------------------------------===//
25
26#include "GCNSchedStrategy.h"
27#include "AMDGPUIGroupLP.h"
28#include "GCNRegPressure.h"
29#include "SIMachineFunctionInfo.h"
30#include "Utils/AMDGPUBaseInfo.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/STLExtras.h"
33#include "llvm/CodeGen/CalcSpillWeights.h"
34#include "llvm/CodeGen/MachineBasicBlock.h"
35#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
36#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
37#include "llvm/CodeGen/MachineCycleAnalysis.h"
38#include "llvm/CodeGen/MachineOperand.h"
39#include "llvm/CodeGen/RegisterClassInfo.h"
40#include "llvm/MC/LaneBitmask.h"
41#include "llvm/MC/MCInstrItineraries.h"
42#include "llvm/MC/MCSchedule.h"
43#include "llvm/MC/TargetRegistry.h"
44#include "llvm/Support/ErrorHandling.h"
45
46#define DEBUG_TYPE "machine-scheduler"
47
48using namespace llvm;
49
50static cl::opt<bool> DisableUnclusterHighRP(
51 "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
52 cl::desc("Disable unclustered high register pressure "
53 "reduction scheduling stage."),
54 cl::init(Val: false));
55
56static cl::opt<bool> DisableClusteredLowOccupancy(
57 "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
58 cl::desc("Disable clustered low occupancy "
59 "rescheduling for ILP scheduling stage."),
60 cl::init(Val: false));
61
static cl::opt<unsigned> ScheduleMetricBias(
    "amdgpu-schedule-metric-bias", cl::Hidden,
    cl::desc(
        "Sets the bias which adds weight to occupancy vs latency. Set it to "
        "100 to chase occupancy only."),
    cl::init(10));
68
69static cl::opt<bool>
70 RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
71 cl::desc("Relax occupancy targets for kernels which are memory "
72 "bound (amdgpu-membound-threshold), or "
73 "Wave Limited (amdgpu-limit-wave-threshold)."),
74 cl::init(Val: false));
75
76static cl::opt<bool> GCNTrackers(
77 "amdgpu-use-amdgpu-trackers", cl::Hidden,
78 cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
79 cl::init(Val: false));
80
81static cl::opt<unsigned> PendingQueueLimit(
82 "amdgpu-scheduler-pending-queue-limit", cl::Hidden,
83 cl::desc(
84 "Max (Available+Pending) size to inspect pending queue (0 disables)"),
85 cl::init(Val: 256));
86
87#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
88#define DUMP_MAX_REG_PRESSURE
89static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler(
90 "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden,
91 cl::desc("Print a list of live registers along with their def/uses at the "
92 "point of maximum register pressure before scheduling."),
93 cl::init(false));
94
95static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler(
96 "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden,
97 cl::desc("Print a list of live registers along with their def/uses at the "
98 "point of maximum register pressure after scheduling."),
99 cl::init(false));
100#endif
101
static cl::opt<bool> DisableRewriteMFMAFormSchedStage(
    "amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden,
    cl::desc("Disable the MFMA form rewrite scheduling stage"),
    cl::init(true));
105
106const unsigned ScheduleMetrics::ScaleFactor = 100;
107
108GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
109 : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
110 DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
111}
112
113void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
114 GenericScheduler::initialize(dag: DAG);
115
116 MF = &DAG->MF;
117
118 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
119
120 SGPRExcessLimit =
121 Context->RegClassInfo->getNumAllocatableRegs(RC: &AMDGPU::SGPR_32RegClass);
122 VGPRExcessLimit =
123 Context->RegClassInfo->getNumAllocatableRegs(RC: &AMDGPU::VGPR_32RegClass);
124
125 SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  // Set the initial TargetOccupancy to the maximum occupancy that we can
  // achieve for this function. This effectively sets a lower bound on the
  // 'Critical' register limits in the scheduler.
  // Allow for lower occupancy targets if the kernel is wave limited or memory
  // bound and the relaxed occupancy feature is enabled.
131 TargetOccupancy =
132 RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
133 SGPRCriticalLimit =
134 std::min(a: ST.getMaxNumSGPRs(WavesPerEU: TargetOccupancy, Addressable: true), b: SGPRExcessLimit);
135
136 if (!KnownExcessRP) {
137 VGPRCriticalLimit = std::min(
138 a: ST.getMaxNumVGPRs(WavesPerEU: TargetOccupancy, DynamicVGPRBlockSize: MFI.getDynamicVGPRBlockSize()),
139 b: VGPRExcessLimit);
140 } else {
    // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result, except
    // that it returns a reasonably small number for targets with lots of
    // VGPRs, such as GFX10 and GFX11.
144 LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
145 "VGPRCriticalLimit calculation method.\n");
146 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
147 unsigned Granule =
148 AMDGPU::IsaInfo::getVGPRAllocGranule(STI: &ST, DynamicVGPRBlockSize);
149 unsigned Addressable =
150 AMDGPU::IsaInfo::getAddressableNumVGPRs(STI: &ST, DynamicVGPRBlockSize);
151 unsigned VGPRBudget = alignDown(Value: Addressable / TargetOccupancy, Align: Granule);
152 VGPRBudget = std::max(a: VGPRBudget, b: Granule);
153 VGPRCriticalLimit = std::min(a: VGPRBudget, b: VGPRExcessLimit);
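    // For example, assuming 256 addressable VGPRs, an allocation granule of 4,
    // and TargetOccupancy = 8 (illustrative values), the budget would be
    // alignDown(256 / 8, 4) = 32 VGPRs, and it is never allowed to drop below
    // the granule.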
154 }
155
156 // Subtract error margin and bias from register limits and avoid overflow.
157 SGPRCriticalLimit -= std::min(a: SGPRLimitBias + ErrorMargin, b: SGPRCriticalLimit);
158 VGPRCriticalLimit -= std::min(a: VGPRLimitBias + ErrorMargin, b: VGPRCriticalLimit);
159 SGPRExcessLimit -= std::min(a: SGPRLimitBias + ErrorMargin, b: SGPRExcessLimit);
160 VGPRExcessLimit -= std::min(a: VGPRLimitBias + ErrorMargin, b: VGPRExcessLimit);
161 LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
162 << ", VGPRExcessLimit = " << VGPRExcessLimit
163 << ", SGPRCriticalLimit = " << SGPRCriticalLimit
164 << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
165}
166
167/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
168/// current register pressure.
169///
170/// This works for the common case, but it has a few exceptions that have been
171/// observed through trial and error:
172/// - Explicit physical register operands
173/// - Subregister definitions
174///
175/// In both of those cases, PressureDiff doesn't represent the actual pressure,
176/// and querying LiveIntervals through the RegPressureTracker is needed to get
177/// an accurate value.
178///
179/// We should eventually only use PressureDiff for maximum performance, but this
180/// already allows 80% of SUs to take the fast path without changing scheduling
181/// at all. Further changes would either change scheduling, or require a lot
182/// more logic to recover an accurate pressure estimate from the PressureDiffs.
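///
/// For example (illustrative MIR), a subregister def such as
///   %3.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
/// only writes part of %3, so the cached PressureDiff does not reflect the
/// actual pressure change and the RegPressureTracker must be queried instead.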
183static bool canUsePressureDiffs(const SUnit &SU) {
184 if (!SU.isInstr())
185 return false;
186
  // Cannot use pressure diffs for subregister defs or with physregs; it's
  // imprecise in both cases.
189 for (const auto &Op : SU.getInstr()->operands()) {
190 if (!Op.isReg() || Op.isImplicit())
191 continue;
192 if (Op.getReg().isPhysical() ||
193 (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
194 return false;
195 }
196 return true;
197}
198
199static void getRegisterPressures(
200 bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
201 std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
202 GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
203 ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
  // getDownwardPressure() and getUpwardPressure() make temporary changes to
  // the tracker, so we need to pass those functions a non-const reference.
206 RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
207 if (!GCNTrackers) {
208 AtTop
209 ? TempTracker.getDownwardPressure(MI: SU->getInstr(), PressureResult&: Pressure, MaxPressureResult&: MaxPressure)
210 : TempTracker.getUpwardPressure(MI: SU->getInstr(), PressureResult&: Pressure, MaxPressureResult&: MaxPressure);
211
212 return;
213 }
214
215 // GCNTrackers
216 Pressure.resize(new_size: 4, x: 0);
217 MachineInstr *MI = SU->getInstr();
218 GCNRegPressure NewPressure;
219 if (AtTop) {
220 GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
221 NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, TRI: SRI);
222 } else {
223 GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
224 TempUpwardTracker.recede(MI: *MI);
225 NewPressure = TempUpwardTracker.getPressure();
226 }
227 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
228 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
229 NewPressure.getArchVGPRNum();
230 Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
231}
232
233void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
234 bool AtTop,
235 const RegPressureTracker &RPTracker,
236 const SIRegisterInfo *SRI,
237 unsigned SGPRPressure,
238 unsigned VGPRPressure, bool IsBottomUp) {
239 Cand.SU = SU;
240 Cand.AtTop = AtTop;
241
242 if (!DAG->isTrackingPressure())
243 return;
244
245 Pressure.clear();
246 MaxPressure.clear();
247
248 // We try to use the cached PressureDiffs in the ScheduleDAG whenever
249 // possible over querying the RegPressureTracker.
250 //
251 // RegPressureTracker will make a lot of LIS queries which are very
252 // expensive, it is considered a slow function in this context.
253 //
254 // PressureDiffs are precomputed and cached, and getPressureDiff is just a
255 // trivial lookup into an array. It is pretty much free.
256 //
257 // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
258 // PressureDiffs.
259 if (AtTop || !canUsePressureDiffs(SU: *SU) || GCNTrackers) {
260 getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
261 DownwardTracker, UpwardTracker, DAG, SRI);
262 } else {
263 // Reserve 4 slots.
264 Pressure.resize(new_size: 4, x: 0);
265 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
266 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
267
268 for (const auto &Diff : DAG->getPressureDiff(SU)) {
269 if (!Diff.isValid())
270 continue;
271 // PressureDiffs is always bottom-up so if we're working top-down we need
272 // to invert its sign.
273 Pressure[Diff.getPSet()] +=
274 (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
275 }
276
277#ifdef EXPENSIVE_CHECKS
278 std::vector<unsigned> CheckPressure, CheckMaxPressure;
279 getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
280 DownwardTracker, UpwardTracker, DAG, SRI);
281 if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
282 CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
283 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
284 CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
285 errs() << "Register Pressure is inaccurate when calculated through "
286 "PressureDiff\n"
287 << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
288 << ", expected "
289 << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
290 << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
291 << ", expected "
292 << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
293 report_fatal_error("inaccurate register pressure calculation");
294 }
295#endif
296 }
297
298 unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
299 unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
300
301 // If two instructions increase the pressure of different register sets
302 // by the same amount, the generic scheduler will prefer to schedule the
303 // instruction that increases the set with the least amount of registers,
304 // which in our case would be SGPRs. This is rarely what we want, so
305 // when we report excess/critical register pressure, we do it either
306 // only for VGPRs or only for SGPRs.
307
308 // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
309 const unsigned MaxVGPRPressureInc = 16;
310 bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
311 bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
312
313 // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
314 // to increase the likelihood we don't go over the limits. We should improve
315 // the analysis to look through dependencies to find the path with the least
316 // register pressure.
317
318 // We only need to update the RPDelta for instructions that increase register
319 // pressure. Instructions that decrease or keep reg pressure the same will be
320 // marked as RegExcess in tryCandidate() when they are compared with
321 // instructions that increase the register pressure.
322 if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
323 HasHighPressure = true;
324 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
325 Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
326 }
327
328 if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
329 HasHighPressure = true;
330 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
331 Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
332 }
333
  // Register pressure is considered 'CRITICAL' if it is approaching a value
  // that would reduce the wave occupancy for the execution unit. When
  // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
  // have the same cost, so we don't need to prefer one over the other.
338
339 int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
340 int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
341
342 if (SGPRDelta >= 0 || VGPRDelta >= 0) {
343 HasHighPressure = true;
344 if (SGPRDelta > VGPRDelta) {
345 Cand.RPDelta.CriticalMax =
346 PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
347 Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
348 } else {
349 Cand.RPDelta.CriticalMax =
350 PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
351 Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
352 }
353 }
354}
355
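// Returns true when the Pending queue is worth inspecting in addition to the
// Available queue: the target must provide a buffered machine model
// (MicroOpBufferSize != 0) and the combined queue size must not exceed
// -amdgpu-scheduler-pending-queue-limit (256 by default, 0 disables it).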
356static bool shouldCheckPending(SchedBoundary &Zone,
357 const TargetSchedModel *SchedModel) {
358 bool HasBufferedModel =
359 SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize();
360 unsigned Combined = Zone.Available.size() + Zone.Pending.size();
361 return Combined <= PendingQueueLimit && HasBufferedModel;
362}
363
364static SUnit *pickOnlyChoice(SchedBoundary &Zone,
365 const TargetSchedModel *SchedModel) {
366 // pickOnlyChoice() releases pending instructions and checks for new hazards.
367 SUnit *OnlyChoice = Zone.pickOnlyChoice();
368 if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty())
369 return OnlyChoice;
370
371 return nullptr;
372}
373
374void GCNSchedStrategy::printCandidateDecision(const SchedCandidate &Current,
375 const SchedCandidate &Preferred) {
376 LLVM_DEBUG({
377 dbgs() << "Prefer:\t\t";
378 DAG->dumpNode(*Preferred.SU);
379
380 if (Current.SU) {
381 dbgs() << "Not:\t";
382 DAG->dumpNode(*Current.SU);
383 }
384
385 dbgs() << "Reason:\t\t";
386 traceCandidate(Preferred);
387 });
388}
389
390// This function is mostly cut and pasted from
391// GenericScheduler::pickNodeFromQueue()
392void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
393 const CandPolicy &ZonePolicy,
394 const RegPressureTracker &RPTracker,
395 SchedCandidate &Cand, bool &IsPending,
396 bool IsBottomUp) {
397 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
398 ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
399 unsigned SGPRPressure = 0;
400 unsigned VGPRPressure = 0;
401 IsPending = false;
402 if (DAG->isTrackingPressure()) {
403 if (!GCNTrackers) {
404 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
405 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
406 } else {
407 GCNRPTracker *T = IsBottomUp
408 ? static_cast<GCNRPTracker *>(&UpwardTracker)
409 : static_cast<GCNRPTracker *>(&DownwardTracker);
410 SGPRPressure = T->getPressure().getSGPRNum();
411 VGPRPressure = T->getPressure().getArchVGPRNum();
412 }
413 }
414 LLVM_DEBUG(dbgs() << "Available Q:\n");
415 ReadyQueue &AQ = Zone.Available;
416 for (SUnit *SU : AQ) {
417
418 SchedCandidate TryCand(ZonePolicy);
419 initCandidate(Cand&: TryCand, SU, AtTop: Zone.isTop(), RPTracker, SRI, SGPRPressure,
420 VGPRPressure, IsBottomUp);
421 // Pass SchedBoundary only when comparing nodes from the same boundary.
422 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
423 tryCandidate(Cand, TryCand, Zone: ZoneArg);
424 if (TryCand.Reason != NoCand) {
425 // Initialize resource delta if needed in case future heuristics query it.
426 if (TryCand.ResDelta == SchedResourceDelta())
427 TryCand.initResourceDelta(DAG: Zone.DAG, SchedModel);
428 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
429 Cand.setBest(TryCand);
430 } else {
431 printCandidateDecision(Current: TryCand, Preferred: Cand);
432 }
433 }
434
435 if (!shouldCheckPending(Zone, SchedModel))
436 return;
437
438 LLVM_DEBUG(dbgs() << "Pending Q:\n");
439 ReadyQueue &PQ = Zone.Pending;
440 for (SUnit *SU : PQ) {
441
442 SchedCandidate TryCand(ZonePolicy);
443 initCandidate(Cand&: TryCand, SU, AtTop: Zone.isTop(), RPTracker, SRI, SGPRPressure,
444 VGPRPressure, IsBottomUp);
445 // Pass SchedBoundary only when comparing nodes from the same boundary.
446 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
447 tryPendingCandidate(Cand, TryCand, Zone: ZoneArg);
448 if (TryCand.Reason != NoCand) {
449 // Initialize resource delta if needed in case future heuristics query it.
450 if (TryCand.ResDelta == SchedResourceDelta())
451 TryCand.initResourceDelta(DAG: Zone.DAG, SchedModel);
452 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
453 IsPending = true;
454 Cand.setBest(TryCand);
455 } else {
456 printCandidateDecision(Current: TryCand, Preferred: Cand);
457 }
458 }
459}
460
461// This function is mostly cut and pasted from
462// GenericScheduler::pickNodeBidirectional()
463SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode,
464 bool &PickedPending) {
465 // Schedule as far as possible in the direction of no choice. This is most
466 // efficient, but also provides the best heuristics for CriticalPSets.
467 if (SUnit *SU = pickOnlyChoice(Zone&: Bot, SchedModel)) {
468 IsTopNode = false;
469 return SU;
470 }
471 if (SUnit *SU = pickOnlyChoice(Zone&: Top, SchedModel)) {
472 IsTopNode = true;
473 return SU;
474 }
475 // Set the bottom-up policy based on the state of the current bottom zone
476 // and the instructions outside the zone, including the top zone.
477 CandPolicy BotPolicy;
478 setPolicy(Policy&: BotPolicy, /*IsPostRA=*/false, CurrZone&: Bot, OtherZone: &Top);
479 // Set the top-down policy based on the state of the current top zone and
480 // the instructions outside the zone, including the bottom zone.
481 CandPolicy TopPolicy;
482 setPolicy(Policy&: TopPolicy, /*IsPostRA=*/false, CurrZone&: Top, OtherZone: &Bot);
483
484 bool BotPending = false;
485 // See if BotCand is still valid (because we previously scheduled from Top).
486 LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
487 if (!BotCand.isValid() || BotCand.SU->isScheduled ||
488 BotCand.Policy != BotPolicy) {
489 BotCand.reset(NewPolicy: CandPolicy());
490 pickNodeFromQueue(Zone&: Bot, ZonePolicy: BotPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand,
491 IsPending&: BotPending,
492 /*IsBottomUp=*/true);
493 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
494 } else {
495 LLVM_DEBUG(traceCandidate(BotCand));
496#ifndef NDEBUG
497 if (VerifyScheduling) {
498 SchedCandidate TCand;
499 TCand.reset(CandPolicy());
500 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
501 BotPending,
502 /*IsBottomUp=*/true);
503 assert(TCand.SU == BotCand.SU &&
504 "Last pick result should correspond to re-picking right now");
505 }
506#endif
507 }
508
509 bool TopPending = false;
510 // Check if the top Q has a better candidate.
511 LLVM_DEBUG(dbgs() << "Picking from Top:\n");
512 if (!TopCand.isValid() || TopCand.SU->isScheduled ||
513 TopCand.Policy != TopPolicy) {
514 TopCand.reset(NewPolicy: CandPolicy());
515 pickNodeFromQueue(Zone&: Top, ZonePolicy: TopPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand,
516 IsPending&: TopPending,
517 /*IsBottomUp=*/false);
518 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
519 } else {
520 LLVM_DEBUG(traceCandidate(TopCand));
521#ifndef NDEBUG
522 if (VerifyScheduling) {
523 SchedCandidate TCand;
524 TCand.reset(CandPolicy());
525 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
526 TopPending,
527 /*IsBottomUp=*/false);
528 assert(TCand.SU == TopCand.SU &&
529 "Last pick result should correspond to re-picking right now");
530 }
531#endif
532 }
533
534 // Pick best from BotCand and TopCand.
535 LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
536 dbgs() << "Bot Cand: "; traceCandidate(BotCand););
537 SchedCandidate Cand = BotPending ? TopCand : BotCand;
538 SchedCandidate TryCand = BotPending ? BotCand : TopCand;
539 PickedPending = BotPending && TopPending;
540
541 TryCand.Reason = NoCand;
542 if (BotPending || TopPending) {
    PickedPending |= tryPendingCandidate(Cand, TryCand, nullptr);
  } else {
    tryCandidate(Cand, TryCand, nullptr);
546 }
547
548 if (TryCand.Reason != NoCand) {
549 Cand.setBest(TryCand);
550 }
551
552 LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
553
554 IsTopNode = Cand.AtTop;
555 return Cand.SU;
556}
557
558// This function is mostly cut and pasted from
559// GenericScheduler::pickNode()
560SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
561 if (DAG->top() == DAG->bottom()) {
562 assert(Top.Available.empty() && Top.Pending.empty() &&
563 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
564 return nullptr;
565 }
566 bool PickedPending;
567 SUnit *SU;
568 do {
569 PickedPending = false;
570 if (RegionPolicy.OnlyTopDown) {
571 SU = pickOnlyChoice(Zone&: Top, SchedModel);
572 if (!SU) {
573 CandPolicy NoPolicy;
574 TopCand.reset(NewPolicy: NoPolicy);
575 pickNodeFromQueue(Zone&: Top, ZonePolicy: NoPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand,
576 IsPending&: PickedPending,
577 /*IsBottomUp=*/false);
578 assert(TopCand.Reason != NoCand && "failed to find a candidate");
579 SU = TopCand.SU;
580 }
581 IsTopNode = true;
582 } else if (RegionPolicy.OnlyBottomUp) {
583 SU = pickOnlyChoice(Zone&: Bot, SchedModel);
584 if (!SU) {
585 CandPolicy NoPolicy;
586 BotCand.reset(NewPolicy: NoPolicy);
587 pickNodeFromQueue(Zone&: Bot, ZonePolicy: NoPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand,
588 IsPending&: PickedPending,
589 /*IsBottomUp=*/true);
590 assert(BotCand.Reason != NoCand && "failed to find a candidate");
591 SU = BotCand.SU;
592 }
593 IsTopNode = false;
594 } else {
595 SU = pickNodeBidirectional(IsTopNode, PickedPending);
596 }
597 } while (SU->isScheduled);
598
599 if (PickedPending) {
600 unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
601 SchedBoundary &Zone = IsTopNode ? Top : Bot;
602 unsigned CurrentCycle = Zone.getCurrCycle();
603 if (ReadyCycle > CurrentCycle)
604 Zone.bumpCycle(NextCycle: ReadyCycle);
605
606 // FIXME: checkHazard() doesn't give information about which cycle the
607 // hazard will resolve so just keep bumping the cycle by 1. This could be
608 // made more efficient if checkHazard() returned more details.
609 while (Zone.checkHazard(SU))
610 Zone.bumpCycle(NextCycle: Zone.getCurrCycle() + 1);
611
612 Zone.releasePending();
613 }
614
615 if (SU->isTopReady())
616 Top.removeReady(SU);
617 if (SU->isBottomReady())
618 Bot.removeReady(SU);
619
620 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
621 << *SU->getInstr());
622 return SU;
623}
624
625void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
626 if (GCNTrackers) {
627 MachineInstr *MI = SU->getInstr();
628 IsTopNode ? (void)DownwardTracker.advance(MI, UseInternalIterator: false)
629 : UpwardTracker.recede(MI: *MI);
630 }
631
632 return GenericScheduler::schedNode(SU, IsTopNode);
633}
634
635GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
636 assert(CurrentStage && CurrentStage != SchedStages.end());
637 return *CurrentStage;
638}
639
640bool GCNSchedStrategy::advanceStage() {
641 assert(CurrentStage != SchedStages.end());
642 if (!CurrentStage)
643 CurrentStage = SchedStages.begin();
644 else
645 CurrentStage++;
646
647 return CurrentStage != SchedStages.end();
648}
649
650bool GCNSchedStrategy::hasNextStage() const {
651 assert(CurrentStage);
652 return std::next(x: CurrentStage) != SchedStages.end();
653}
654
655GCNSchedStageID GCNSchedStrategy::getNextStage() const {
656 assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
657 return *std::next(x: CurrentStage);
658}
659
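// Candidate comparison used for nodes taken from the Pending queue. Compared
// to GenericScheduler::tryCandidate, only the PhysReg bias, register-pressure
// and resource-balance heuristics are applied; the stall, latency, weak-edge,
// cluster and node-order heuristics are not used for pending candidates.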
660bool GCNSchedStrategy::tryPendingCandidate(SchedCandidate &Cand,
661 SchedCandidate &TryCand,
662 SchedBoundary *Zone) const {
663 // Initialize the candidate if needed.
664 if (!Cand.isValid()) {
665 TryCand.Reason = NodeOrder;
666 return true;
667 }
668
  // Bias PhysReg defs and copies towards their uses and defs, respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
672 return TryCand.Reason != NoCand;
673
674 // Avoid exceeding the target's limit.
675 if (DAG->isTrackingPressure() &&
676 tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand,
677 Reason: RegExcess, TRI, MF: DAG->MF))
678 return TryCand.Reason != NoCand;
679
680 // Avoid increasing the max critical pressure in the scheduled region.
681 if (DAG->isTrackingPressure() &&
682 tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax,
683 TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF))
684 return TryCand.Reason != NoCand;
685
686 bool SameBoundary = Zone != nullptr;
687 if (SameBoundary) {
688 TryCand.initResourceDelta(DAG, SchedModel);
689 if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources,
690 TryCand, Cand, Reason: ResourceReduce))
691 return TryCand.Reason != NoCand;
692 if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources,
693 CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand,
694 Reason: ResourceDemand))
695 return TryCand.Reason != NoCand;
696 }
697
698 return false;
699}
700
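// The max-occupancy strategy runs OccInitialSchedule first, optionally
// followed by RewriteMFMAForm (only when
// -amdgpu-disable-rewrite-mfma-form-sched-stage is explicitly set to false),
// then UnclusteredHighRPReschedule, ClusteredLowOccupancyReschedule and
// PreRARematerialize.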
701GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
702 const MachineSchedContext *C, bool IsLegacyScheduler)
703 : GCNSchedStrategy(C) {
704 SchedStages.push_back(Elt: GCNSchedStageID::OccInitialSchedule);
705 if (!DisableRewriteMFMAFormSchedStage)
706 SchedStages.push_back(Elt: GCNSchedStageID::RewriteMFMAForm);
707 SchedStages.push_back(Elt: GCNSchedStageID::UnclusteredHighRPReschedule);
708 SchedStages.push_back(Elt: GCNSchedStageID::ClusteredLowOccupancyReschedule);
709 SchedStages.push_back(Elt: GCNSchedStageID::PreRARematerialize);
710 GCNTrackers = GCNTrackers & !IsLegacyScheduler;
711}
712
713GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
714 : GCNSchedStrategy(C) {
715 SchedStages.push_back(Elt: GCNSchedStageID::ILPInitialSchedule);
716}
717
718bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
719 SchedCandidate &TryCand,
720 SchedBoundary *Zone) const {
721 // Initialize the candidate if needed.
722 if (!Cand.isValid()) {
723 TryCand.Reason = NodeOrder;
724 return true;
725 }
726
727 // Avoid spilling by exceeding the register limit.
728 if (DAG->isTrackingPressure() &&
729 tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand,
730 Reason: RegExcess, TRI, MF: DAG->MF))
731 return TryCand.Reason != NoCand;
732
  // Bias PhysReg defs and copies towards their uses and defs, respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
736 return TryCand.Reason != NoCand;
737
738 bool SameBoundary = Zone != nullptr;
739 if (SameBoundary) {
740 // Prioritize instructions that read unbuffered resources by stall cycles.
741 if (tryLess(TryVal: Zone->getLatencyStallCycles(SU: TryCand.SU),
742 CandVal: Zone->getLatencyStallCycles(SU: Cand.SU), TryCand, Cand, Reason: Stall))
743 return TryCand.Reason != NoCand;
744
745 // Avoid critical resource consumption and balance the schedule.
746 TryCand.initResourceDelta(DAG, SchedModel);
747 if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources,
748 TryCand, Cand, Reason: ResourceReduce))
749 return TryCand.Reason != NoCand;
750 if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources,
751 CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand,
752 Reason: ResourceDemand))
753 return TryCand.Reason != NoCand;
754
755 // Unconditionally try to reduce latency.
756 if (tryLatency(TryCand, Cand, Zone&: *Zone))
757 return TryCand.Reason != NoCand;
758
759 // Weak edges are for clustering and other constraints.
760 if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop),
761 CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak))
762 return TryCand.Reason != NoCand;
763 }
764
765 // Keep clustered nodes together to encourage downstream peephole
766 // optimizations which may reduce resource requirements.
767 //
768 // This is a best effort to set things up for a post-RA pass. Optimizations
769 // like generating loads of multiple registers should ideally be done within
770 // the scheduler pass by combining the loads during DAG postprocessing.
771 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
772 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
773 bool CandIsClusterSucc =
774 isTheSameCluster(A: CandZoneCluster, B: Cand.SU->ParentClusterIdx);
775 bool TryCandIsClusterSucc =
776 isTheSameCluster(A: TryCandZoneCluster, B: TryCand.SU->ParentClusterIdx);
777 if (tryGreater(TryVal: TryCandIsClusterSucc, CandVal: CandIsClusterSucc, TryCand, Cand,
778 Reason: Cluster))
779 return TryCand.Reason != NoCand;
780
781 // Avoid increasing the max critical pressure in the scheduled region.
782 if (DAG->isTrackingPressure() &&
783 tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax,
784 TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF))
785 return TryCand.Reason != NoCand;
786
787 // Avoid increasing the max pressure of the entire region.
788 if (DAG->isTrackingPressure() &&
789 tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand,
790 Cand, Reason: RegMax, TRI, MF: DAG->MF))
791 return TryCand.Reason != NoCand;
792
793 if (SameBoundary) {
794 // Fall through to original instruction order.
795 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
796 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
797 TryCand.Reason = NodeOrder;
798 return true;
799 }
800 }
801 return false;
802}
803
804GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy(
805 const MachineSchedContext *C)
806 : GCNSchedStrategy(C) {
807 SchedStages.push_back(Elt: GCNSchedStageID::MemoryClauseInitialSchedule);
808}
809
/// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory
/// instructions as much as possible. This is achieved by:
/// 1. Prioritizing clustered operations before the stall latency heuristic.
/// 2. Prioritizing long-latency loads before the stall latency heuristic.
///
815/// \param Cand provides the policy and current best candidate.
816/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
817/// \param Zone describes the scheduled zone that we are extending, or nullptr
818/// if Cand is from a different zone than TryCand.
819/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
820bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
821 SchedCandidate &TryCand,
822 SchedBoundary *Zone) const {
823 // Initialize the candidate if needed.
824 if (!Cand.isValid()) {
825 TryCand.Reason = NodeOrder;
826 return true;
827 }
828
  // Bias PhysReg defs and copies towards their uses and defs, respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
832 return TryCand.Reason != NoCand;
833
834 if (DAG->isTrackingPressure()) {
835 // Avoid exceeding the target's limit.
836 if (tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand,
837 Reason: RegExcess, TRI, MF: DAG->MF))
838 return TryCand.Reason != NoCand;
839
840 // Avoid increasing the max critical pressure in the scheduled region.
841 if (tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax,
842 TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF))
843 return TryCand.Reason != NoCand;
844 }
845
846 // MaxMemoryClause-specific: We prioritize clustered instructions as we would
847 // get more benefit from clausing these memory instructions.
848 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
849 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
850 bool CandIsClusterSucc =
851 isTheSameCluster(A: CandZoneCluster, B: Cand.SU->ParentClusterIdx);
852 bool TryCandIsClusterSucc =
853 isTheSameCluster(A: TryCandZoneCluster, B: TryCand.SU->ParentClusterIdx);
854 if (tryGreater(TryVal: TryCandIsClusterSucc, CandVal: CandIsClusterSucc, TryCand, Cand,
855 Reason: Cluster))
856 return TryCand.Reason != NoCand;
857
858 // We only compare a subset of features when comparing nodes between
859 // Top and Bottom boundary. Some properties are simply incomparable, in many
860 // other instances we should only override the other boundary if something
861 // is a clear good pick on one boundary. Skip heuristics that are more
862 // "tie-breaking" in nature.
863 bool SameBoundary = Zone != nullptr;
864 if (SameBoundary) {
    // For loops that are acyclic path limited, aggressively schedule for
    // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
    // heuristics to take precedence.
868 if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
869 tryLatency(TryCand, Cand, Zone&: *Zone))
870 return TryCand.Reason != NoCand;
871
    // MaxMemoryClause-specific: Prioritize long-latency memory load
    // instructions in top-bottom order to hide more latency. The mayLoad check
    // is used to exclude store-like instructions, which we do not want to
    // schedule too early.
876 bool TryMayLoad =
877 TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
878 bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
879
880 if (TryMayLoad || CandMayLoad) {
881 bool TryLongLatency =
882 TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
883 bool CandLongLatency =
884 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
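      // For example (illustrative latencies), when scheduling top-down a load
      // candidate with Latency 40 wins over an ALU candidate with Latency 3,
      // since 40 > 10 * 3, and the pick is reported with reason Stall.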
885
886 if (tryGreater(TryVal: Zone->isTop() ? TryLongLatency : CandLongLatency,
887 CandVal: Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
888 Cand, Reason: Stall))
889 return TryCand.Reason != NoCand;
890 }
891 // Prioritize instructions that read unbuffered resources by stall cycles.
892 if (tryLess(TryVal: Zone->getLatencyStallCycles(SU: TryCand.SU),
893 CandVal: Zone->getLatencyStallCycles(SU: Cand.SU), TryCand, Cand, Reason: Stall))
894 return TryCand.Reason != NoCand;
895 }
896
897 if (SameBoundary) {
898 // Weak edges are for clustering and other constraints.
899 if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop),
900 CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak))
901 return TryCand.Reason != NoCand;
902 }
903
904 // Avoid increasing the max pressure of the entire region.
905 if (DAG->isTrackingPressure() &&
906 tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand,
907 Cand, Reason: RegMax, TRI, MF: DAG->MF))
908 return TryCand.Reason != NoCand;
909
910 if (SameBoundary) {
911 // Avoid critical resource consumption and balance the schedule.
912 TryCand.initResourceDelta(DAG, SchedModel);
913 if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources,
914 TryCand, Cand, Reason: ResourceReduce))
915 return TryCand.Reason != NoCand;
916 if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources,
917 CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand,
918 Reason: ResourceDemand))
919 return TryCand.Reason != NoCand;
920
921 // Avoid serializing long latency dependence chains.
922 // For acyclic path limited loops, latency was already checked above.
923 if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
924 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone&: *Zone))
925 return TryCand.Reason != NoCand;
926
927 // Fall through to original instruction order.
928 if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
929 assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
930 TryCand.Reason = NodeOrder;
931 return true;
932 }
933 }
934
935 return false;
936}
937
938GCNScheduleDAGMILive::GCNScheduleDAGMILive(
939 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
940 : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
941 MFI(*MF.getInfo<SIMachineFunctionInfo>()),
942 StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
943 RegionLiveOuts(this, /*IsLiveOut=*/true) {
944
945 // We want regions with a single MI to be scheduled so that we can reason
946 // about them correctly during scheduling stages that move MIs between regions
947 // (e.g., rematerialization).
948 ScheduleSingleMIRegions = true;
949 LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
950 if (RelaxedOcc) {
951 MinOccupancy = std::min(a: MFI.getMinAllowedOccupancy(), b: StartingOccupancy);
952 if (MinOccupancy != StartingOccupancy)
953 LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
954 << ".\n");
955 }
956}
957
958std::unique_ptr<GCNSchedStage>
959GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
960 switch (SchedStageID) {
961 case GCNSchedStageID::OccInitialSchedule:
962 return std::make_unique<OccInitialScheduleStage>(args&: SchedStageID, args&: *this);
963 case GCNSchedStageID::RewriteMFMAForm:
964 return std::make_unique<RewriteMFMAFormStage>(args&: SchedStageID, args&: *this);
965 case GCNSchedStageID::UnclusteredHighRPReschedule:
966 return std::make_unique<UnclusteredHighRPStage>(args&: SchedStageID, args&: *this);
967 case GCNSchedStageID::ClusteredLowOccupancyReschedule:
968 return std::make_unique<ClusteredLowOccStage>(args&: SchedStageID, args&: *this);
969 case GCNSchedStageID::PreRARematerialize:
970 return std::make_unique<PreRARematStage>(args&: SchedStageID, args&: *this);
971 case GCNSchedStageID::ILPInitialSchedule:
972 return std::make_unique<ILPInitialScheduleStage>(args&: SchedStageID, args&: *this);
973 case GCNSchedStageID::MemoryClauseInitialSchedule:
974 return std::make_unique<MemoryClauseInitialScheduleStage>(args&: SchedStageID,
975 args&: *this);
976 }
977
978 llvm_unreachable("Unknown SchedStageID.");
979}
980
981void GCNScheduleDAGMILive::schedule() {
982 // Collect all scheduling regions. The actual scheduling is performed in
983 // GCNScheduleDAGMILive::finalizeSchedule.
984 Regions.push_back(Elt: std::pair(RegionBegin, RegionEnd));
985}
986
987GCNRegPressure
988GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
989 if (Regions[RegionIdx].first == Regions[RegionIdx].second)
990 return llvm::getRegPressure(MRI, LiveRegs: LiveIns[RegionIdx]);
991 GCNDownwardRPTracker RPTracker(*LIS);
992 RPTracker.advance(Begin: Regions[RegionIdx].first, End: Regions[RegionIdx].second,
993 LiveRegsCopy: &LiveIns[RegionIdx]);
994 return RPTracker.moveMaxPressure();
995}
996
997static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin,
998 MachineBasicBlock::iterator RegionEnd) {
999 assert(RegionBegin != RegionEnd && "Region must not be empty");
1000 return &*skipDebugInstructionsBackward(It: std::prev(x: RegionEnd), Begin: RegionBegin);
1001}
1002
1003void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
1004 const MachineBasicBlock *MBB) {
1005 GCNDownwardRPTracker RPTracker(*LIS);
1006
  // If the block has only one successor then the live-ins of that successor
  // are the live-outs of the current block. We can reuse the calculated live
  // set if the successor will be scheduled after the current block.

  // However, due to a bug in LiveInterval analysis it may happen that two
  // predecessors of the same successor block have different lane bitmasks for
  // a live-out register. Work around that by sticking to a one-to-one
  // relationship, i.e. one predecessor with one successor block.
1015 const MachineBasicBlock *OnlySucc = nullptr;
1016 if (MBB->succ_size() == 1) {
1017 auto *Candidate = *MBB->succ_begin();
1018 if (!Candidate->empty() && Candidate->pred_size() == 1) {
1019 SlotIndexes *Ind = LIS->getSlotIndexes();
1020 if (Ind->getMBBStartIdx(mbb: MBB) < Ind->getMBBStartIdx(mbb: Candidate))
1021 OnlySucc = Candidate;
1022 }
1023 }
1024
1025 // Scheduler sends regions from the end of the block upwards.
1026 size_t CurRegion = RegionIdx;
1027 for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
1028 if (Regions[CurRegion].first->getParent() != MBB)
1029 break;
1030 --CurRegion;
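  // Regions are recorded bottom-up within each block, so CurRegion now indexes
  // the topmost region inside MBB; the downward tracker below visits them in
  // decreasing index order until RegionIdx is reached.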
1031
1032 auto I = MBB->begin();
1033 auto LiveInIt = MBBLiveIns.find(Val: MBB);
1034 auto &Rgn = Regions[CurRegion];
1035 auto *NonDbgMI = &*skipDebugInstructionsForward(It: Rgn.first, End: Rgn.second);
1036 if (LiveInIt != MBBLiveIns.end()) {
1037 auto LiveIn = std::move(LiveInIt->second);
1038 RPTracker.reset(MI: *MBB->begin(), LiveRegs: &LiveIn);
1039 MBBLiveIns.erase(I: LiveInIt);
1040 } else {
1041 I = Rgn.first;
1042 auto LRS = BBLiveInMap.lookup(Val: NonDbgMI);
1043#ifdef EXPENSIVE_CHECKS
1044 assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
1045#endif
1046 RPTracker.reset(MI: *I, LiveRegs: &LRS);
1047 }
1048
1049 for (;;) {
1050 I = RPTracker.getNext();
1051
1052 if (Regions[CurRegion].first == I || NonDbgMI == I) {
1053 LiveIns[CurRegion] = RPTracker.getLiveRegs();
1054 RPTracker.clearMaxPressure();
1055 }
1056
1057 if (Regions[CurRegion].second == I) {
1058 Pressure[CurRegion] = RPTracker.moveMaxPressure();
1059 if (CurRegion-- == RegionIdx)
1060 break;
1061 auto &Rgn = Regions[CurRegion];
1062 NonDbgMI = &*skipDebugInstructionsForward(It: Rgn.first, End: Rgn.second);
1063 }
1064 RPTracker.advanceToNext();
1065 RPTracker.advanceBeforeNext();
1066 }
1067
1068 if (OnlySucc) {
1069 if (I != MBB->end()) {
1070 RPTracker.advanceToNext();
1071 RPTracker.advance(End: MBB->end());
1072 }
1073 RPTracker.advanceBeforeNext();
1074 MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
1075 }
1076}
1077
1078DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
1079GCNScheduleDAGMILive::getRegionLiveInMap() const {
1080 assert(!Regions.empty());
1081 std::vector<MachineInstr *> RegionFirstMIs;
1082 RegionFirstMIs.reserve(n: Regions.size());
1083 for (auto &[RegionBegin, RegionEnd] : reverse(C: Regions))
1084 RegionFirstMIs.push_back(
1085 x: &*skipDebugInstructionsForward(It: RegionBegin, End: RegionEnd));
1086
1087 return getLiveRegMap(R&: RegionFirstMIs, /*After=*/false, LIS&: *LIS);
1088}
1089
1090DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
1091GCNScheduleDAGMILive::getRegionLiveOutMap() const {
1092 assert(!Regions.empty());
1093 std::vector<MachineInstr *> RegionLastMIs;
1094 RegionLastMIs.reserve(n: Regions.size());
1095 for (auto &[RegionBegin, RegionEnd] : reverse(C: Regions)) {
1096 // Skip empty regions.
1097 if (RegionBegin == RegionEnd)
1098 continue;
1099 RegionLastMIs.push_back(x: getLastMIForRegion(RegionBegin, RegionEnd));
1100 }
1101 return getLiveRegMap(R&: RegionLastMIs, /*After=*/true, LIS&: *LIS);
1102}
1103
1104void RegionPressureMap::buildLiveRegMap() {
1105 IdxToInstruction.clear();
1106
1107 RegionLiveRegMap =
1108 IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
1109 for (unsigned I = 0; I < DAG->Regions.size(); I++) {
1110 auto &[RegionBegin, RegionEnd] = DAG->Regions[I];
1111 // Skip empty regions.
1112 if (RegionBegin == RegionEnd)
1113 continue;
1114 MachineInstr *RegionKey =
1115 IsLiveOut ? getLastMIForRegion(RegionBegin, RegionEnd) : &*RegionBegin;
1116 IdxToInstruction[I] = RegionKey;
1117 }
1118}
1119
1120void GCNScheduleDAGMILive::finalizeSchedule() {
1121 // Start actual scheduling here. This function is called by the base
1122 // MachineScheduler after all regions have been recorded by
1123 // GCNScheduleDAGMILive::schedule().
1124 LiveIns.resize(N: Regions.size());
1125 Pressure.resize(N: Regions.size());
1126 RegionsWithHighRP.resize(N: Regions.size());
1127 RegionsWithExcessRP.resize(N: Regions.size());
1128 RegionsWithIGLPInstrs.resize(N: Regions.size());
1129 RegionsWithHighRP.reset();
1130 RegionsWithExcessRP.reset();
1131 RegionsWithIGLPInstrs.reset();
1132
1133 runSchedStages();
1134}
1135
1136void GCNScheduleDAGMILive::runSchedStages() {
1137 LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
1138
1139 if (!Regions.empty()) {
1140 BBLiveInMap = getRegionLiveInMap();
1141 if (GCNTrackers)
1142 RegionLiveOuts.buildLiveRegMap();
1143 }
1144
1145#ifdef DUMP_MAX_REG_PRESSURE
1146 if (PrintMaxRPRegUsageBeforeScheduler) {
1147 dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
1148 dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
1149 LIS->dump();
1150 }
1151#endif
1152
1153 GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
1154 while (S.advanceStage()) {
1155 auto Stage = createSchedStage(SchedStageID: S.getCurrentStage());
1156 if (!Stage->initGCNSchedStage())
1157 continue;
1158
1159 for (auto Region : Regions) {
1160 RegionBegin = Region.first;
1161 RegionEnd = Region.second;
1162 // Setup for scheduling the region and check whether it should be skipped.
1163 if (!Stage->initGCNRegion()) {
1164 Stage->advanceRegion();
1165 exitRegion();
1166 continue;
1167 }
1168
1169 if (GCNTrackers) {
1170 GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
1171 GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
1172 GCNRPTracker::LiveRegSet *RegionLiveIns =
1173 &LiveIns[Stage->getRegionIdx()];
1174
1175 reinterpret_cast<GCNRPTracker *>(DownwardTracker)
1176 ->reset(MRI_: MRI, LiveRegs_: *RegionLiveIns);
1177 reinterpret_cast<GCNRPTracker *>(UpwardTracker)
1178 ->reset(MRI_: MRI, LiveRegs_: RegionLiveOuts.getLiveRegsForRegionIdx(
1179 RegionIdx: Stage->getRegionIdx()));
1180 }
1181
1182 ScheduleDAGMILive::schedule();
1183 Stage->finalizeGCNRegion();
1184 Stage->advanceRegion();
1185 exitRegion();
1186 }
1187
1188 Stage->finalizeGCNSchedStage();
1189 }
1190
1191#ifdef DUMP_MAX_REG_PRESSURE
1192 if (PrintMaxRPRegUsageAfterScheduler) {
1193 dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
1194 dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
1195 LIS->dump();
1196 }
1197#endif
1198}
1199
1200#ifndef NDEBUG
1201raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
1202 switch (StageID) {
1203 case GCNSchedStageID::OccInitialSchedule:
1204 OS << "Max Occupancy Initial Schedule";
1205 break;
1206 case GCNSchedStageID::RewriteMFMAForm:
1207 OS << "Instruction Rewriting Reschedule";
1208 break;
1209 case GCNSchedStageID::UnclusteredHighRPReschedule:
1210 OS << "Unclustered High Register Pressure Reschedule";
1211 break;
1212 case GCNSchedStageID::ClusteredLowOccupancyReschedule:
1213 OS << "Clustered Low Occupancy Reschedule";
1214 break;
1215 case GCNSchedStageID::PreRARematerialize:
1216 OS << "Pre-RA Rematerialize";
1217 break;
1218 case GCNSchedStageID::ILPInitialSchedule:
1219 OS << "Max ILP Initial Schedule";
1220 break;
1221 case GCNSchedStageID::MemoryClauseInitialSchedule:
1222 OS << "Max memory clause Initial Schedule";
1223 break;
1224 }
1225
1226 return OS;
1227}
1228#endif
1229
1230GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
1231 : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
1232 MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
1233
1234bool GCNSchedStage::initGCNSchedStage() {
1235 if (!DAG.LIS)
1236 return false;
1237
1238 LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
1239 return true;
1240}
1241
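/// Collects in \p DefIdxs the slot index of every definition of \p UseMO's
/// register that can reach \p UseMO. If the value live at the use is not a
/// PHI join, that single def is the only reaching def; otherwise predecessor
/// blocks are walked backwards until a non-PHI value is found on each path.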
1242void RewriteMFMAFormStage::findReachingDefs(
1243 MachineOperand &UseMO, LiveIntervals *LIS,
1244 SmallVectorImpl<SlotIndex> &DefIdxs) {
1245 MachineInstr *UseMI = UseMO.getParent();
1246 LiveInterval &UseLI = LIS->getInterval(Reg: UseMO.getReg());
1247 VNInfo *VNI = UseLI.getVNInfoAt(Idx: LIS->getInstructionIndex(Instr: *UseMI));
1248
1249 // If the def is not a PHI, then it must be the only reaching def.
1250 if (!VNI->isPHIDef()) {
1251 DefIdxs.push_back(Elt: VNI->def);
1252 return;
1253 }
1254
1255 SmallPtrSet<MachineBasicBlock *, 8> Visited = {UseMI->getParent()};
1256 SmallVector<MachineBasicBlock *, 8> Worklist;
1257
1258 // Mark the predecessor blocks for traversal
1259 for (MachineBasicBlock *PredMBB : UseMI->getParent()->predecessors()) {
1260 Worklist.push_back(Elt: PredMBB);
1261 Visited.insert(Ptr: PredMBB);
1262 }
1263
1264 while (!Worklist.empty()) {
1265 MachineBasicBlock *CurrMBB = Worklist.pop_back_val();
1266
1267 SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(mbb: CurrMBB);
1268 VNInfo *VNI = UseLI.getVNInfoAt(Idx: CurrMBBEnd.getPrevSlot());
1269
1270 MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(index: VNI->def);
1271
1272 // If there is a def in this block, then add it to the list. This is the
1273 // reaching def of this path.
1274 if (!VNI->isPHIDef()) {
1275 DefIdxs.push_back(Elt: VNI->def);
1276 continue;
1277 }
1278
1279 for (MachineBasicBlock *PredMBB : DefMBB->predecessors()) {
1280 if (Visited.insert(Ptr: PredMBB).second)
1281 Worklist.push_back(Elt: PredMBB);
1282 }
1283 }
1284}
1285
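/// Collects in \p ReachingUses every use operand of \p DefMI's destination
/// register whose set of reaching definitions (as computed by
/// findReachingDefs) contains \p DefMI itself.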
1286void RewriteMFMAFormStage::findReachingUses(
1287 MachineInstr *DefMI, LiveIntervals *LIS,
1288 SmallVectorImpl<MachineOperand *> &ReachingUses) {
1289 SlotIndex DefIdx = LIS->getInstructionIndex(Instr: *DefMI);
1290 for (MachineOperand &UseMO :
1291 DAG.MRI.use_nodbg_operands(Reg: DefMI->getOperand(i: 0).getReg())) {
1292 SmallVector<SlotIndex, 8> ReachingDefIndexes;
1293 findReachingDefs(UseMO, LIS, DefIdxs&: ReachingDefIndexes);
1294
1295 // If we find a use that contains this DefMI in its reachingDefs, then it is
1296 // a reaching use.
1297 if (any_of(Range&: ReachingDefIndexes, P: [DefIdx](SlotIndex RDIdx) {
1298 return SlotIndex::isSameInstr(A: RDIdx, B: DefIdx);
1299 }))
1300 ReachingUses.push_back(Elt: &UseMO);
1301 }
1302}
1303
1304bool RewriteMFMAFormStage::initGCNSchedStage() {
1305 // We only need to run this pass if the architecture supports AGPRs.
1306 // Additionally, we don't use AGPRs at occupancy levels above 1 so there
1307 // is no need for this pass in that case, either.
1308 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1309 if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1)
1310 return false;
1311
1312 RegionsWithExcessArchVGPR.resize(N: DAG.Regions.size());
1313 RegionsWithExcessArchVGPR.reset();
1314 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
1315 GCNRegPressure PressureBefore = DAG.Pressure[Region];
1316 if (PressureBefore.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
1317 RegionsWithExcessArchVGPR[Region] = true;
1318 }
1319
1320 if (RegionsWithExcessArchVGPR.none())
1321 return false;
1322
1323 TII = ST.getInstrInfo();
1324 SRI = ST.getRegisterInfo();
1325
1326 std::vector<std::pair<MachineInstr *, unsigned>> RewriteCands;
1327 DenseMap<MachineBasicBlock *, std::set<Register>> CopyForUse;
1328 SmallPtrSet<MachineInstr *, 8> CopyForDef;
1329
1330 if (!initHeuristics(RewriteCands, CopyForUse, CopyForDef))
1331 return false;
1332
1333 int64_t Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef);
1334
  // If we haven't found the beneficial conditions, prefer the VGPR form, which
  // may result in fewer cross-RC copies.
1337 if (Cost > 0)
1338 return false;
1339
1340 return rewrite(RewriteCands);
1341}
1342
1343bool UnclusteredHighRPStage::initGCNSchedStage() {
1344 if (DisableUnclusterHighRP)
1345 return false;
1346
1347 if (!GCNSchedStage::initGCNSchedStage())
1348 return false;
1349
1350 if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
1351 return false;
1352
1353 SavedMutations.swap(x&: DAG.Mutations);
1354 DAG.addMutation(
1355 Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PreRAReentry));
1356
1357 InitialOccupancy = DAG.MinOccupancy;
1358 // Aggressively try to reduce register pressure in the unclustered high RP
1359 // stage. Temporarily increase occupancy target in the region.
1360 TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy
1361 ? InitialOccupancy + 1
1362 : InitialOccupancy;
1363 IsAnyRegionScheduled = false;
1364 S.SGPRLimitBias = S.HighRPSGPRBias;
1365 S.VGPRLimitBias = S.HighRPVGPRBias;
1366
1367 LLVM_DEBUG(
1368 dbgs()
1369 << "Retrying function scheduling without clustering. "
1370 "Aggressively try to reduce register pressure to achieve occupancy "
1371 << TempTargetOccupancy << ".\n");
1372
1373 return true;
1374}
1375
1376bool ClusteredLowOccStage::initGCNSchedStage() {
1377 if (DisableClusteredLowOccupancy)
1378 return false;
1379
1380 if (!GCNSchedStage::initGCNSchedStage())
1381 return false;
1382
1383 // Don't bother trying to improve ILP in lower RP regions if occupancy has not
1384 // been dropped. All regions will have already been scheduled with the ideal
1385 // occupancy targets.
1386 if (DAG.StartingOccupancy <= DAG.MinOccupancy)
1387 return false;
1388
1389 LLVM_DEBUG(
1390 dbgs() << "Retrying function scheduling with lowest recorded occupancy "
1391 << DAG.MinOccupancy << ".\n");
1392 return true;
1393}
1394
/// Allows easy filtering of this stage's debug output.
1396#define REMAT_PREFIX "[PreRARemat] "
1397#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
1398
1399#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1400Printable PreRARematStage::ScoredRemat::print() const {
1401 return Printable([&](raw_ostream &OS) {
1402 OS << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')';
1403 });
1404}
1405#endif
1406
1407bool PreRARematStage::initGCNSchedStage() {
  // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
  // regions in between the defs and the region we sank the def to. Will need
  // to be fixed if there is another pass after this pass.
1411 assert(!S.hasNextStage());
1412
1413 if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() <= 1)
1414 return false;
1415
  // Maps all MIs (except lone terminators, which are not part of any region)
  // to their parent region. Non-lone terminators are considered part of the
  // region they delimit.
1419 DenseMap<MachineInstr *, unsigned> MIRegion(MF.getInstructionCount());
1420
1421 // Before performing any IR modification record the parent region of each MI
1422 // and the parent MBB of each region.
1423 const unsigned NumRegions = DAG.Regions.size();
1424 for (unsigned I = 0; I < NumRegions; ++I) {
1425 RegionBoundaries Region = DAG.Regions[I];
1426 for (auto MI = Region.first; MI != Region.second; ++MI)
1427 MIRegion.insert(KV: {&*MI, I});
1428 MachineBasicBlock *ParentMBB = Region.first->getParent();
1429 if (Region.second != ParentMBB->end())
1430 MIRegion.insert(KV: {&*Region.second, I});
1431 RegionBB.push_back(Elt: ParentMBB);
1432 }
1433
1434#ifndef NDEBUG
1435 auto PrintTargetRegions = [&]() -> void {
1436 if (TargetRegions.none()) {
1437 dbgs() << REMAT_PREFIX << "No target regions\n";
1438 return;
1439 }
1440 dbgs() << REMAT_PREFIX << "Target regions:\n";
1441 for (unsigned I : TargetRegions.set_bits())
1442 dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n';
1443 };
1444 auto PrintRematReg = [&](const RematReg &Remat) -> Printable {
1445 return Printable([&, Remat](raw_ostream &OS) {
1446 // Concatenate all region numbers in which the register is unused and
1447 // live-through.
1448 bool HasLiveThroughRegion = false;
1449 OS << '[' << Remat.DefRegion << " -";
1450 for (unsigned I = 0; I < NumRegions; ++I) {
1451 if (Remat.isUnusedLiveThrough(I)) {
1452 if (HasLiveThroughRegion) {
1453 OS << ',';
1454 } else {
1455 OS << "- ";
1456 HasLiveThroughRegion = true;
1457 }
1458 OS << I;
1459 }
1460 }
1461 if (HasLiveThroughRegion)
1462 OS << " -";
1463 OS << "-> " << Remat.UseRegion << "] ";
1464 Remat.DefMI->print(OS, /*IsStandalone=*/true, /*SkipOpers=*/false,
1465 /*SkipDebugLoc=*/false, /*AddNewLine=*/false);
1466 });
1467 };
1468#endif
1469
1470 // Set an objective for the stage based on current RP in each region.
1471 REMAT_DEBUG({
1472 dbgs() << "Analyzing ";
1473 MF.getFunction().printAsOperand(dbgs(), false);
1474 dbgs() << ": ";
1475 });
1476 if (!setObjective()) {
1477 LLVM_DEBUG(dbgs() << "no objective to achieve, occupancy is maximal at "
1478 << MFI.getMaxWavesPerEU() << '\n');
1479 return false;
1480 }
1481 LLVM_DEBUG({
1482 if (TargetOcc) {
1483 dbgs() << "increase occupancy from " << *TargetOcc - 1 << '\n';
1484 } else {
1485 dbgs() << "reduce spilling (minimum target occupancy is "
1486 << MFI.getMinWavesPerEU() << ")\n";
1487 }
1488 PrintTargetRegions();
1489 });
1490
1491 if (!collectRematRegs(MIRegion)) {
1492 REMAT_DEBUG(dbgs() << "No rematerializable registers\n");
1493 return false;
1494 }
1495 const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
1496 REMAT_DEBUG({
1497 dbgs() << "Rematerializable registers:\n";
1498 for (const RematReg &Remat : RematRegs)
1499 dbgs() << REMAT_PREFIX << " " << PrintRematReg(Remat) << '\n';
1500 dbgs() << REMAT_PREFIX << "Region frequencies\n";
1501 for (auto [I, Freq] : enumerate(FreqInfo.Regions)) {
1502 dbgs() << REMAT_PREFIX << " [" << I << "] ";
1503 if (Freq)
1504 dbgs() << Freq;
1505 else
1506 dbgs() << "unknown ";
1507 dbgs() << " | " << *DAG.Regions[I].first;
1508 }
1509 });
1510
1511 SmallVector<ScoredRemat> ScoredRemats;
1512 for (RematReg &Remat : RematRegs)
1513 ScoredRemats.emplace_back(Args: &Remat, Args: FreqInfo, Args&: DAG);
1514
1515// Rematerialize registers in successive rounds until all RP targets are
1516// satisfied or until we run out of rematerialization candidates.
1517#ifndef NDEBUG
1518 unsigned RoundNum = 0;
1519#endif
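// Regions whose register pressure must be recomputed after this round's
// rematerializations before RP targets are re-checked at the end of the loop.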
1520 BitVector RecomputeRP(NumRegions);
1521 do {
1522 assert(!ScoredRemats.empty() && "no more remat candidates");
1523
1524 // (Re-)Score and (re-)sort all remats in increasing score order.
1525 for (ScoredRemat &Remat : ScoredRemats)
1526 Remat.update(TargetRegions, RPTargets, Freq: FreqInfo, ReduceSpill: !TargetOcc);
1527 sort(C&: ScoredRemats);
1528
1529 REMAT_DEBUG({
1530 dbgs() << "==== ROUND " << RoundNum++ << " ====\n"
1531 << REMAT_PREFIX
1532 << "Candidates with non-null score, in rematerialization order:\n";
1533 for (const ScoredRemat &RematDecision : reverse(ScoredRemats)) {
1534 if (RematDecision.hasNullScore())
1535 break;
1536 dbgs() << REMAT_PREFIX << " " << RematDecision.print() << " | "
1537 << *RematDecision.Remat->DefMI;
1538 }
1539 PrintTargetRegions();
1540 });
1541
1542 RecomputeRP.reset();
1543 unsigned RematIdx = ScoredRemats.size();
1544
1545 // Rematerialize registers in decreasing score order until we estimate
1546 // that all RP targets are satisfied or until rematerialization candidates
1547 // are no longer useful to decrease RP.
1548 for (; RematIdx && TargetRegions.any(); --RematIdx) {
1549 const ScoredRemat &Candidate = ScoredRemats[RematIdx - 1];
1550 // Stop rematerializing on encountering a null score. Since scores
1551 // monotonically decrease as we rematerialize, we know there is nothing
1552 // useful left to do in such cases, even if we were to re-score.
1553 if (Candidate.hasNullScore()) {
1554 RematIdx = 0;
1555 break;
1556 }
1557
1558 RematReg &Remat = *Candidate.Remat;
1559 // When previous rematerializations in this round have already satisfied
1560 // RP targets in all regions this rematerialization can impact, we have a
1561 // good indication that our scores have diverged significantly from
1562 // reality, in which case we interrupt this round and re-score. This also
1563 // ensures that every rematerialization we perform is possibly impactful
1564 // in at least one target region.
1565 if (!Remat.maybeBeneficial(TargetRegions, RPTargets))
1566 break;
1567
1568 REMAT_DEBUG(dbgs() << "** REMAT " << PrintRematReg(Remat) << '\n';);
1569 // Every rematerialization we do here is likely to move the instruction
1570 // into a higher frequency region, increasing the total latency of the
1571 // instruction itself. This is acceptable if we are eliminating a spill in
1572 // the process, but when the goal is increasing occupancy we get nothing
1573 // out of rematerialization if occupancy is not increased in the end; in
1574 // such cases we want to roll back the rematerialization.
1575 RollbackInfo *Rollback =
1576 TargetOcc ? &Rollbacks.emplace_back(Args: &Remat) : nullptr;
1577 rematerialize(Remat, RecomputeRP, Rollback);
1578 unsetSatisifedRPTargets(Regions: Remat.Live);
1579 }
1580
1581 REMAT_DEBUG({
1582 if (!TargetRegions.any()) {
1583 dbgs() << "** Interrupt round on all targets achieved\n";
1584 } else if (RematIdx) {
1585 dbgs() << "** Interrupt round on stale score for "
1586 << *ScoredRemats[RematIdx - 1].Remat->DefMI;
1587 } else {
1588 dbgs() << "** Stop on exhausted rematerialization candidates\n";
1589 }
1590 });
1591
1592 // Peel off registers we already rematerialized from the vector's tail.
1593 ScoredRemats.truncate(N: RematIdx);
1594 } while ((updateAndVerifyRPTargets(Regions: RecomputeRP) || TargetRegions.any()) &&
1595 !ScoredRemats.empty());
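// If rematerialization did not mark any region for rescheduling, the stage
// has nothing left to do.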
1596 if (RescheduleRegions.none())
1597 return false;
1598
1599 // Commit all pressure changes to the DAG and compute minimum achieved
1600 // occupancy in impacted regions.
1601 REMAT_DEBUG(dbgs() << "==== REMAT RESULTS ====\n");
1602 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
1603 for (unsigned I : RescheduleRegions.set_bits()) {
1604 DAG.Pressure[I] = RPTargets[I].getCurrentRP();
1605 REMAT_DEBUG(dbgs() << '[' << I << "] Achieved occupancy "
1606 << DAG.Pressure[I].getOccupancy(ST, DynamicVGPRBlockSize)
1607 << " (" << RPTargets[I] << ")\n");
1608 }
1609 AchievedOcc = MFI.getMaxWavesPerEU();
1610 for (const GCNRegPressure &RP : DAG.Pressure) {
1611 AchievedOcc =
1612 std::min(a: AchievedOcc, b: RP.getOccupancy(ST, DynamicVGPRBlockSize));
1613 }
1614
1615 REMAT_DEBUG({
1616 dbgs() << "Retrying function scheduling with new min. occupancy of "
1617 << AchievedOcc << " from rematerializing (original was "
1618 << DAG.MinOccupancy;
1619 if (TargetOcc)
1620 dbgs() << ", target was " << *TargetOcc;
1621 dbgs() << ")\n";
1622 });
1623
1624 DAG.setTargetOccupancy(getStageTargetOccupancy());
1625 return true;
1626}
1627
1628void GCNSchedStage::finalizeGCNSchedStage() {
1629 DAG.finishBlock();
1630 LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
1631}
1632
1633void UnclusteredHighRPStage::finalizeGCNSchedStage() {
1634 SavedMutations.swap(x&: DAG.Mutations);
1635 S.SGPRLimitBias = S.VGPRLimitBias = 0;
1636 if (DAG.MinOccupancy > InitialOccupancy) {
1637 assert(IsAnyRegionScheduled);
1638 LLVM_DEBUG(dbgs() << StageID
1639 << " stage successfully increased occupancy to "
1640 << DAG.MinOccupancy << '\n');
1641 } else if (!IsAnyRegionScheduled) {
1642 assert(DAG.MinOccupancy == InitialOccupancy);
1643 LLVM_DEBUG(dbgs() << StageID
1644 << ": No regions scheduled, min occupancy stays at "
1645 << DAG.MinOccupancy << ", MFI occupancy stays at "
1646 << MFI.getOccupancy() << ".\n");
1647 }
1648
1649 GCNSchedStage::finalizeGCNSchedStage();
1650}
1651
1652bool GCNSchedStage::initGCNRegion() {
1653 // Skip empty scheduling region.
1654 if (DAG.begin() == DAG.end())
1655 return false;
1656
1657 // Check whether this new region is also a new block.
1658 if (DAG.RegionBegin->getParent() != CurrentMBB)
1659 setupNewBlock();
1660
1661 unsigned NumRegionInstrs = std::distance(first: DAG.begin(), last: DAG.end());
1662 DAG.enterRegion(bb: CurrentMBB, begin: DAG.begin(), end: DAG.end(), regioninstrs: NumRegionInstrs);
1663
1664 // Skip regions with 1 schedulable instruction.
1665 if (DAG.begin() == std::prev(x: DAG.end()))
1666 return false;
1667
1668 LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
1669 LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
1670 << " " << CurrentMBB->getName()
1671 << "\n From: " << *DAG.begin() << " To: ";
1672 if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
1673 else dbgs() << "End";
1674 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
1675
1676 // Save original instruction order before scheduling for possible revert.
1677 Unsched.clear();
1678 Unsched.reserve(n: DAG.NumRegionInstrs);
1679 if (StageID == GCNSchedStageID::OccInitialSchedule ||
1680 StageID == GCNSchedStageID::ILPInitialSchedule) {
1681 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
1682 for (auto &I : DAG) {
1683 Unsched.push_back(x: &I);
1684 if (SII->isIGLPMutationOnly(Opcode: I.getOpcode()))
1685 DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
1686 }
1687 } else {
1688 for (auto &I : DAG)
1689 Unsched.push_back(x: &I);
1690 }
1691
1692 PressureBefore = DAG.Pressure[RegionIdx];
1693
1694 LLVM_DEBUG(
1695 dbgs() << "Pressure before scheduling:\nRegion live-ins:"
1696 << print(DAG.LiveIns[RegionIdx], DAG.MRI)
1697 << "Region live-in pressure: "
1698 << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
1699 << "Region register pressure: " << print(PressureBefore));
1700
1701 S.HasHighPressure = false;
1702 S.KnownExcessRP = isRegionWithExcessRP();
1703
1704 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1705 StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
1706 SavedMutations.clear();
1707 SavedMutations.swap(x&: DAG.Mutations);
1708 bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
1709 StageID == GCNSchedStageID::ILPInitialSchedule;
1710 DAG.addMutation(Mutation: createIGroupLPDAGMutation(
1711 Phase: IsInitialStage ? AMDGPU::SchedulingPhase::Initial
1712 : AMDGPU::SchedulingPhase::PreRAReentry));
1713 }
1714
1715 return true;
1716}
1717
1718bool UnclusteredHighRPStage::initGCNRegion() {
1719 // Only reschedule regions that have excess register pressure (i.e. spilling)
1720 // or had minimum occupancy at the beginning of the stage (as long as
1721 // rescheduling of previous regions did not make occupancy drop back down to
1722 // the initial minimum).
1723 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1724 // If no region has been scheduled yet, the DAG has not yet been updated with
1725 // the occupancy target. So retrieve it from the temporary.
1726 unsigned CurrentTargetOccupancy =
1727 IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy;
1728 if (!DAG.RegionsWithExcessRP[RegionIdx] &&
1729 (CurrentTargetOccupancy <= InitialOccupancy ||
1730 DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
1731 InitialOccupancy))
1732 return false;
1733
1734 bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion();
1735 // If this is the first region scheduled during this stage, make the target
1736 // occupancy changes in the DAG and MFI.
1737 if (!IsAnyRegionScheduled && IsSchedulingThisRegion) {
1738 IsAnyRegionScheduled = true;
1739 if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
1740 DAG.setTargetOccupancy(TempTargetOccupancy);
1741 }
1742 return IsSchedulingThisRegion;
1743}
1744
1745bool ClusteredLowOccStage::initGCNRegion() {
1746 // We may need to reschedule this region if it wasn't rescheduled in the last
1747 // stage, or if we found it was testing critical register pressure limits in
1748 // the unclustered reschedule stage. The latter is because we may not have been
1749 // able to raise the min occupancy in the previous stage, so the region may be
1750 // overly constrained even if it was already rescheduled.
1751 if (!DAG.RegionsWithHighRP[RegionIdx])
1752 return false;
1753
1754 return GCNSchedStage::initGCNRegion();
1755}
1756
1757bool PreRARematStage::initGCNRegion() {
1758 return RescheduleRegions[RegionIdx] && GCNSchedStage::initGCNRegion();
1759}
1760
1761void GCNSchedStage::setupNewBlock() {
1762 if (CurrentMBB)
1763 DAG.finishBlock();
1764
1765 CurrentMBB = DAG.RegionBegin->getParent();
1766 DAG.startBlock(bb: CurrentMBB);
1767 // Get real RP for the region if it hasn't been calculated before. After the
1768 // initial schedule stage real RP will be collected after scheduling.
1769 if (StageID == GCNSchedStageID::OccInitialSchedule ||
1770 StageID == GCNSchedStageID::ILPInitialSchedule ||
1771 StageID == GCNSchedStageID::MemoryClauseInitialSchedule)
1772 DAG.computeBlockPressure(RegionIdx, MBB: CurrentMBB);
1773}
1774
1775void GCNSchedStage::finalizeGCNRegion() {
1776 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1777 if (S.HasHighPressure)
1778 DAG.RegionsWithHighRP[RegionIdx] = true;
1779
1780 // Revert scheduling if we have dropped occupancy or there is some other
1781 // reason that the original schedule is better.
1782 checkScheduling();
1783
1784 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1785 StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
1786 SavedMutations.swap(x&: DAG.Mutations);
1787}
1788
1789void PreRARematStage::finalizeGCNRegion() {
1790 GCNSchedStage::finalizeGCNRegion();
1791 // When the goal is to increase occupancy, all regions must reach the target
1792 // occupancy for rematerializations to be of any use; otherwise we will
1793 // just hurt latency for no benefit. If minimum occupancy drops below the
1794 // target there is no point in trying to re-schedule further regions.
1795 if (!TargetOcc)
1796 return;
1797 RegionReverts.emplace_back(Args&: RegionIdx, Args&: Unsched, Args&: PressureBefore);
1798 if (DAG.MinOccupancy < *TargetOcc) {
1799 REMAT_DEBUG(dbgs() << "Region " << RegionIdx
1800 << " cannot meet occupancy target, interrupting "
1801 "re-scheduling in all regions\n");
1802 RescheduleRegions.reset();
1803 }
1804}
1805
1806void GCNSchedStage::checkScheduling() {
1807 // Check the results of scheduling.
1808 PressureAfter = DAG.getRealRegPressure(RegionIdx);
1809
1810 LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
1811 LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
1812
1813 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1814
1815 if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
1816 PressureAfter.getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
1817 DAG.Pressure[RegionIdx] = PressureAfter;
1818
1819 // Early out if we have achieved the occupancy target.
1820 LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
1821 return;
1822 }
1823
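 // Compare the occupancy achievable before and after scheduling, both capped
 // by the stage's target occupancy and by workgroup-size constraints.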
1824 unsigned TargetOccupancy = std::min(
1825 a: S.getTargetOccupancy(), b: ST.getOccupancyWithWorkGroupSizes(MF).second);
1826 unsigned WavesAfter = std::min(
1827 a: TargetOccupancy, b: PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
1828 unsigned WavesBefore = std::min(
1829 a: TargetOccupancy, b: PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
1830 LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
1831 << ", after " << WavesAfter << ".\n");
1832
1833 // We may not be able to keep the current target occupancy because of the just
1834 // scheduled region. We might still be able to revert scheduling if the
1835 // occupancy before was higher, or if the current schedule has register
1836 // pressure higher than the excess limits which could lead to more spilling.
1837 unsigned NewOccupancy = std::max(a: WavesAfter, b: WavesBefore);
1838
1839 // Allow memory bound functions to drop to 4 waves if not limited by an
1840 // attribute.
1841 if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
1842 WavesAfter >= MFI.getMinAllowedOccupancy()) {
1843 LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
1844 << MFI.getMinAllowedOccupancy() << " waves\n");
1845 NewOccupancy = WavesAfter;
1846 }
1847
1848 if (NewOccupancy < DAG.MinOccupancy) {
1849 DAG.MinOccupancy = NewOccupancy;
1850 MFI.limitOccupancy(Limit: DAG.MinOccupancy);
1851 LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
1852 << DAG.MinOccupancy << ".\n");
1853 }
1854 // The maximum number of arch VGPRs on a non-unified register file, or the
1855 // maximum VGPR + AGPR in the unified register file case.
1856 unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
1857 // The maximum number of arch VGPRs for both unified and non-unified register
1858 // files.
1859 unsigned MaxArchVGPRs = std::min(a: MaxVGPRs, b: ST.getAddressableNumArchVGPRs());
1860 unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
1861
1862 if (PressureAfter.getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts()) > MaxVGPRs ||
1863 PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
1864 PressureAfter.getAGPRNum() > MaxArchVGPRs ||
1865 PressureAfter.getSGPRNum() > MaxSGPRs) {
1866 DAG.RegionsWithHighRP[RegionIdx] = true;
1867 DAG.RegionsWithExcessRP[RegionIdx] = true;
1868 }
1869
1870 // Revert if this region's schedule would cause a drop in occupancy or
1871 // spilling.
1872 if (shouldRevertScheduling(WavesAfter)) {
1873 modifyRegionSchedule(RegionIdx, MBB: DAG.BB, MIOrder: Unsched);
1874 std::tie(args&: DAG.RegionBegin, args&: DAG.RegionEnd) = DAG.Regions[RegionIdx];
1875 } else {
1876 DAG.Pressure[RegionIdx] = PressureAfter;
1877 }
1878}
1879
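// Returns the earliest cycle at which \p SU can issue given the ready cycles
// of its register-dependence predecessors, and records it in \p ReadyCycles.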
1880unsigned
1881GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1882 DenseMap<unsigned, unsigned> &ReadyCycles,
1883 const TargetSchedModel &SM) {
1884 unsigned ReadyCycle = CurrCycle;
1885 for (auto &D : SU.Preds) {
1886 if (D.isAssignedRegDep()) {
1887 MachineInstr *DefMI = D.getSUnit()->getInstr();
1888 unsigned Latency = SM.computeInstrLatency(MI: DefMI);
1889 unsigned DefReady = ReadyCycles[DAG.getSUnit(MI: DefMI)->NodeNum];
1890 ReadyCycle = std::max(a: ReadyCycle, b: DefReady + Latency);
1891 }
1892 }
1893 ReadyCycles[SU.NodeNum] = ReadyCycle;
1894 return ReadyCycle;
1895}
1896
1897#ifndef NDEBUG
1898struct EarlierIssuingCycle {
1899 bool operator()(std::pair<MachineInstr *, unsigned> A,
1900 std::pair<MachineInstr *, unsigned> B) const {
1901 return A.second < B.second;
1902 }
1903};
1904
1905static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
1906 EarlierIssuingCycle> &ReadyCycles) {
1907 if (ReadyCycles.empty())
1908 return;
1909 unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
1910 dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
1911 << " ##################\n# Cycle #\t\t\tInstruction "
1912 " "
1913 " \n";
1914 unsigned IPrev = 1;
1915 for (auto &I : ReadyCycles) {
1916 if (I.second > IPrev + 1)
1917 dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
1918 << " CYCLES DETECTED ******************************\n\n";
1919 dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
1920 IPrev = I.second;
1921 }
1922}
1923#endif
1924
1925ScheduleMetrics
1926GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
1927#ifndef NDEBUG
1928 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1929 ReadyCyclesSorted;
1930#endif
1931 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1932 unsigned SumBubbles = 0;
1933 DenseMap<unsigned, unsigned> ReadyCycles;
1934 unsigned CurrCycle = 0;
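 // Walk the schedule in order, accumulating the stall ("bubble") cycles
 // between the current cycle and each SUnit's earliest ready cycle.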
1935 for (auto &SU : InputSchedule) {
1936 unsigned ReadyCycle =
1937 computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
1938 SumBubbles += ReadyCycle - CurrCycle;
1939#ifndef NDEBUG
1940 ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
1941#endif
1942 CurrCycle = ++ReadyCycle;
1943 }
1944#ifndef NDEBUG
1945 LLVM_DEBUG(
1946 printScheduleModel(ReadyCyclesSorted);
1947 dbgs() << "\n\t"
1948 << "Metric: "
1949 << (SumBubbles
1950 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1951 : 1)
1952 << "\n\n");
1953#endif
1954
1955 return ScheduleMetrics(CurrCycle, SumBubbles);
1956}
1957
1958ScheduleMetrics
1959GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
1960#ifndef NDEBUG
1961 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1962 ReadyCyclesSorted;
1963#endif
1964 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1965 unsigned SumBubbles = 0;
1966 DenseMap<unsigned, unsigned> ReadyCycles;
1967 unsigned CurrCycle = 0;
1968 for (auto &MI : DAG) {
1969 SUnit *SU = DAG.getSUnit(MI: &MI);
1970 if (!SU)
1971 continue;
1972 unsigned ReadyCycle =
1973 computeSUnitReadyCycle(SU: *SU, CurrCycle, ReadyCycles, SM);
1974 SumBubbles += ReadyCycle - CurrCycle;
1975#ifndef NDEBUG
1976 ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
1977#endif
1978 CurrCycle = ++ReadyCycle;
1979 }
1980#ifndef NDEBUG
1981 LLVM_DEBUG(
1982 printScheduleModel(ReadyCyclesSorted);
1983 dbgs() << "\n\t"
1984 << "Metric: "
1985 << (SumBubbles
1986 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1987 : 1)
1988 << "\n\n");
1989#endif
1990
1991 return ScheduleMetrics(CurrCycle, SumBubbles);
1992}
1993
1994bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
1995 if (WavesAfter < DAG.MinOccupancy)
1996 return true;
1997
1998 // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
1999 if (DAG.MFI.isDynamicVGPREnabled()) {
2000 unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
2001 STI: &ST, NumVGPRs: DAG.MFI.getDynamicVGPRBlockSize(),
2002 DynamicVGPRBlockSize: PressureBefore.getVGPRNum(UnifiedVGPRFile: false));
2003 unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
2004 STI: &ST, NumVGPRs: DAG.MFI.getDynamicVGPRBlockSize(),
2005 DynamicVGPRBlockSize: PressureAfter.getVGPRNum(UnifiedVGPRFile: false));
2006 if (BlocksAfter > BlocksBefore)
2007 return true;
2008 }
2009
2010 return false;
2011}
2012
2013bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
2014 if (PressureAfter == PressureBefore)
2015 return false;
2016
2017 if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
2018 return true;
2019
2020 if (mayCauseSpilling(WavesAfter))
2021 return true;
2022
2023 return false;
2024}
2025
2026bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
2027 // If RP is not reduced in the unclustered reschedule stage, revert to the
2028 // old schedule.
2029 if ((WavesAfter <=
2030 PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()) &&
2031 mayCauseSpilling(WavesAfter)) ||
2032 GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
2033 LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
2034 return true;
2035 }
2036
2037 // Do not attempt to relax the schedule even more if we are already spilling.
2038 if (isRegionWithExcessRP())
2039 return false;
2040
2041 LLVM_DEBUG(
2042 dbgs()
2043 << "\n\t *** In shouldRevertScheduling ***\n"
2044 << " *********** BEFORE UnclusteredHighRPStage ***********\n");
2045 ScheduleMetrics MBefore = getScheduleMetrics(InputSchedule: DAG.SUnits);
2046 LLVM_DEBUG(
2047 dbgs()
2048 << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
2049 ScheduleMetrics MAfter = getScheduleMetrics(DAG);
2050 unsigned OldMetric = MBefore.getMetric();
2051 unsigned NewMetric = MAfter.getMetric();
2052 unsigned WavesBefore = std::min(
2053 a: S.getTargetOccupancy(),
2054 b: PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()));
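 // Combine the occupancy ratio (WavesAfter / WavesBefore) with the biased
 // latency-metric ratio ((OldMetric + Bias) / NewMetric) in fixed-point
 // arithmetic; the new schedule is kept only when the combined ratio is at
 // least 1.0, i.e. Profit >= ScaleFactor.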
2055 unsigned Profit =
2056 ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
2057 ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
2058 NewMetric) /
2059 ScheduleMetrics::ScaleFactor;
2060 LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
2061 << MAfter << "Profit: " << Profit << "\n");
2062 return Profit < ScheduleMetrics::ScaleFactor;
2063}
2064
2065bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
2066 if (PressureAfter == PressureBefore)
2067 return false;
2068
2069 if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
2070 return true;
2071
2072 if (mayCauseSpilling(WavesAfter))
2073 return true;
2074
2075 return false;
2076}
2077
2078bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
2079 return mayCauseSpilling(WavesAfter);
2080}
2081
2082bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
2083 if (mayCauseSpilling(WavesAfter))
2084 return true;
2085
2086 return false;
2087}
2088
2089bool MemoryClauseInitialScheduleStage::shouldRevertScheduling(
2090 unsigned WavesAfter) {
2091 return mayCauseSpilling(WavesAfter);
2092}
2093
2094bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
2095 if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
2096 !PressureAfter.less(MF, O: PressureBefore)) {
2097 LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
2098 return true;
2099 }
2100
2101 return false;
2102}
2103
2104void GCNSchedStage::modifyRegionSchedule(unsigned RegionIdx,
2105 MachineBasicBlock *MBB,
2106 ArrayRef<MachineInstr *> MIOrder) {
2107 assert(static_cast<size_t>(std::distance(DAG.Regions[RegionIdx].first,
2108 DAG.Regions[RegionIdx].second)) ==
2109 MIOrder.size() &&
2110 "instruction number mismatch");
2111 if (MIOrder.empty())
2112 return;
2113
2114 LLVM_DEBUG(dbgs() << "Reverting scheduling for region " << RegionIdx << '\n');
2115
2116 // Reconstruct MI sequence by moving instructions in desired order before
2117 // the current region's start.
2118 MachineBasicBlock::iterator RegionEnd = DAG.Regions[RegionIdx].first;
2119 for (MachineInstr *MI : MIOrder) {
2120 // Either move the next MI in order before the end of the region or move the
2121 // region end past the MI if it is at the correct position.
2122 MachineBasicBlock::iterator MII = MI->getIterator();
2123 if (MII != RegionEnd) {
2124 // Will subsequent splice move MI up past a non-debug instruction?
2125 bool NonDebugReordered =
2126 !MI->isDebugInstr() &&
2127 skipDebugInstructionsForward(It: RegionEnd, End: MII) != MII;
2128 MBB->splice(Where: RegionEnd, Other: MBB, From: MI);
2129 // Only update LiveIntervals information if non-debug instructions are
2130 // reordered. Otherwise debug instructions could cause code generation to
2131 // change.
2132 if (NonDebugReordered)
2133 DAG.LIS->handleMove(MI&: *MI, UpdateFlags: true);
2134 } else {
2135 ++RegionEnd;
2136 }
2137 if (MI->isDebugInstr()) {
2138 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
2139 continue;
2140 }
2141
2142 // Reset read-undef flags and update them later.
2143 for (MachineOperand &Op : MI->all_defs())
2144 Op.setIsUndef(false);
2145 RegisterOperands RegOpers;
2146 RegOpers.collect(MI: *MI, TRI: *DAG.TRI, MRI: DAG.MRI, TrackLaneMasks: DAG.ShouldTrackLaneMasks, IgnoreDead: false);
2147 if (DAG.ShouldTrackLaneMasks) {
2148 // Adjust liveness and add missing dead+read-undef flags.
2149 SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(Instr: *MI).getRegSlot();
2150 RegOpers.adjustLaneLiveness(LIS: *DAG.LIS, MRI: DAG.MRI, Pos: SlotIdx, AddFlagsMI: MI);
2151 } else {
2152 // Adjust for missing dead-def flags.
2153 RegOpers.detectDeadDefs(MI: *MI, LIS: *DAG.LIS);
2154 }
2155 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
2156 }
2157
2158 // The region end doesn't change throughout scheduling since it itself is
2159 // outside the region (whether that is an MBB end or a terminator MI).
2160 assert(RegionEnd == DAG.Regions[RegionIdx].second && "region end mismatch");
2161 DAG.Regions[RegionIdx].first = MIOrder.front();
2162}
2163
2164bool RewriteMFMAFormStage::isRewriteCandidate(MachineInstr *MI) const {
2165
2166 if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(MI: *MI))
2167 return false;
2168 return AMDGPU::getMFMASrcCVDstAGPROp(Opcode: MI->getOpcode()) != -1;
2169}
2170
2171bool RewriteMFMAFormStage::initHeuristics(
2172 std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
2173 DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
2174 SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
2175 bool Changed = false;
2176
2177 // Prepare for the heuristics
2178 for (MachineBasicBlock &MBB : MF) {
2179 for (MachineInstr &MI : MBB) {
2180 if (!isRewriteCandidate(MI: &MI))
2181 continue;
2182
2183 int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(Opcode: MI.getOpcode());
2184 assert(ReplacementOp != -1);
2185
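 // Temporarily switch the MFMA to its AGPR-dst form; the original opcode is
 // saved so it can be restored once the rewrite cost has been estimated.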
2186 RewriteCands.push_back(x: {&MI, MI.getOpcode()});
2187 MI.setDesc(TII->get(Opcode: ReplacementOp));
2188
2189 MachineOperand *Src2 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
2190 if (Src2->isReg()) {
2191 SmallVector<SlotIndex, 8> Src2ReachingDefs;
2192 findReachingDefs(UseMO&: *Src2, LIS: DAG.LIS, DefIdxs&: Src2ReachingDefs);
2193
2194 // For any definition of the src2 register which is non-MFMA, we
2195 // insert a copy.
2196 for (SlotIndex RDIdx : Src2ReachingDefs) {
2197 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(index: RDIdx);
2198 if (!TII->isMAI(MI: *RD))
2199 CopyForDef.insert(Ptr: RD);
2200 }
2201 }
2202
2203 MachineOperand &Dst = MI.getOperand(i: 0);
2204 SmallVector<MachineOperand *, 8> DstReachingUses;
2205
2206 findReachingUses(DefMI: &MI, LIS: DAG.LIS, ReachingUses&: DstReachingUses);
2207
2208 for (MachineOperand *RUOp : DstReachingUses) {
2209 if (TII->isMAI(MI: *RUOp->getParent()))
2210 continue;
2211
2212 // For any user of the result of the MFMA which is not an MFMA, we
2213 // insert a copy. For a given register, we will only insert one copy
2214 // per user block.
2215 CopyForUse[RUOp->getParent()->getParent()].insert(x: RUOp->getReg());
2216
2217 SmallVector<SlotIndex, 8> DstUsesReachingDefs;
2218 findReachingDefs(UseMO&: *RUOp, LIS: DAG.LIS, DefIdxs&: DstUsesReachingDefs);
2219
2220 for (SlotIndex RDIndex : DstUsesReachingDefs) {
2221 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(index: RDIndex);
2222 if (TII->isMAI(MI: *RD))
2223 continue;
2224
2225 // For any definition of the user of the MFMA which is not an MFMA,
2226 // we insert a copy. We do this to transform all the reaching defs
2227 // of this use to AGPR. By doing this, we can insert a copy from
2228 // AGPR to VGPR at the user rather than after the MFMA.
2229 CopyForDef.insert(Ptr: RD);
2230 }
2231 }
2232
2233 // Do the rewrite to allow for updated RP calculation.
2234 const TargetRegisterClass *VGPRRC = DAG.MRI.getRegClass(Reg: Dst.getReg());
2235 const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(SRC: VGPRRC);
2236 DAG.MRI.setRegClass(Reg: Dst.getReg(), RC: AGPRRC);
2237 if (Src2->isReg())
2238 DAG.MRI.setRegClass(Reg: Src2->getReg(), RC: AGPRRC);
2239 Changed = true;
2240 }
2241 }
2242
2243 return Changed;
2244}
2245
2246int64_t RewriteMFMAFormStage::getRewriteCost(
2247 const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
2248 const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
2249 const SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
2250 MachineBlockFrequencyInfo *MBFI = DAG.MBFI;
2251
2252 int64_t BestSpillCost = 0;
2253 int64_t Cost = 0;
2254 uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();
2255
2256 std::pair<unsigned, unsigned> MaxVectorRegs =
2257 ST.getMaxNumVectorRegs(F: MF.getFunction());
2258 unsigned ArchVGPRThreshold = MaxVectorRegs.first;
2259 unsigned AGPRThreshold = MaxVectorRegs.second;
2260 unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);
2261
2262 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
2263 if (!RegionsWithExcessArchVGPR[Region])
2264 continue;
2265
2266 GCNRegPressure &PressureBefore = DAG.Pressure[Region];
2267 unsigned SpillCostBefore = PressureBefore.getVGPRSpills(
2268 MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
2269
2270 // For the cases we care about (i.e. ArchVGPR usage is greater than the
2271 // addressable limit), rewriting alone should bring pressure to manageable
2272 // level. If we find any such region, then the rewrite is potentially
2273 // beneficial.
2274 GCNRegPressure PressureAfter = DAG.getRealRegPressure(RegionIdx: Region);
2275 unsigned SpillCostAfter = PressureAfter.getVGPRSpills(
2276 MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
2277
2278 uint64_t BlockFreq =
2279 MBFI->getBlockFreq(MBB: DAG.Regions[Region].first->getParent())
2280 .getFrequency();
2281
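 // Express the region's frequency relative to the function entry as an
 // integer ratio, dividing the larger frequency by the smaller one;
 // RelativeFreqIsDenom records which way the ratio goes so the spill cost
 // below can be scaled in the right direction.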
2282 bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
2283 uint64_t RelativeFreq = EntryFreq && BlockFreq
2284 ? (RelativeFreqIsDenom ? EntryFreq / BlockFreq
2285 : BlockFreq / EntryFreq)
2286 : 1;
2287
2288 // This assumes perfect spilling / splitting -- using one spill / copy
2289 // instruction and one restoreFrom / copy for each excess register.
2290 int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2;
2291
2292 // Also account for the block frequency.
2293 if (RelativeFreqIsDenom)
2294 SpillCost /= (int64_t)RelativeFreq;
2295 else
2296 SpillCost *= (int64_t)RelativeFreq;
2297
2298 // If we have increased spilling in any block, just bail.
2299 if (SpillCost > 0)
2300 return SpillCost;
2301
2302 if (SpillCost < BestSpillCost)
2303 BestSpillCost = SpillCost;
2304 }
2305
2306 // Set the cost to the largest decrease in spill cost in order to not double
2307 // count spill reductions.
2308 Cost = BestSpillCost;
2309 assert(Cost <= 0);
2310
2311 unsigned CopyCost = 0;
2312
2313 // For each CopyForDef, increase the cost by the copy cost of the defined
2314 // register's class while accounting for block frequency.
2315 for (MachineInstr *DefMI : CopyForDef) {
2316 Register DefReg = DefMI->getOperand(i: 0).getReg();
2317 uint64_t DefFreq =
2318 EntryFreq
2319 ? MBFI->getBlockFreq(MBB: DefMI->getParent()).getFrequency() / EntryFreq
2320 : 1;
2321
2322 const TargetRegisterClass *RC = DAG.MRI.getRegClass(Reg: DefReg);
2323 CopyCost += RC->getCopyCost() * DefFreq;
2324 }
2325
2326 // Account for CopyForUse copies in each block where the register is used.
2327 for (auto &[UseBlock, UseRegs] : CopyForUse) {
2328 uint64_t UseFreq =
2329 EntryFreq ? MBFI->getBlockFreq(MBB: UseBlock).getFrequency() / EntryFreq : 1;
2330
2331 for (Register UseReg : UseRegs) {
2332 const TargetRegisterClass *RC = DAG.MRI.getRegClass(Reg: UseReg);
2333 CopyCost += RC->getCopyCost() * UseFreq;
2334 }
2335 }
2336
2337 // Reset the classes that were changed to AGPR for better RB analysis.
2338 // We must do rewriting after copy-insertion, as some defs of the register
2339 // may require VGPR. Additionally, if we bail out and don't perform the
2340 // rewrite then these need to be restored anyway.
2341 for (auto &[MI, OriginalOpcode] : RewriteCands) {
2342 assert(TII->isMAI(*MI));
2343 const TargetRegisterClass *AGPRRC =
2344 DAG.MRI.getRegClass(Reg: MI->getOperand(i: 0).getReg());
2345 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(SRC: AGPRRC);
2346
2347 MachineOperand *Src2 = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src2);
2348 assert(Src2);
2349
2350 if (Src2->isReg())
2351 DAG.MRI.setRegClass(Reg: Src2->getReg(), RC: VGPRRC);
2352 DAG.MRI.setRegClass(Reg: MI->getOperand(i: 0).getReg(), RC: VGPRRC);
2353 MI->setDesc(TII->get(Opcode: OriginalOpcode));
2354 }
2355
2356 return Cost + CopyCost;
2357}
2358
2359bool RewriteMFMAFormStage::rewrite(
2360 const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
2361 DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
2362 DenseMap<MachineInstr *, unsigned> LastMIToRegion;
2363
2364 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
2365 RegionBoundaries Entry = DAG.Regions[Region];
2366 if (Entry.first == Entry.second)
2367 continue;
2368
2369 FirstMIToRegion[&*Entry.first] = Region;
2370 if (Entry.second != Entry.first->getParent()->end())
2371 LastMIToRegion[&*Entry.second] = Region;
2372 }
2373
2374 // Rewrite the MFMAs to AGPR, and insert any copies as needed.
2375 // The general assumption of the algorithm (and the previous cost calculation)
2376 // is that it is better to insert the copies in the MBB of the def of the src2
2377 // operands, and in the MBB of the user of the dest operands. This is based on
2378 // the assumption that the MFMAs are likely to appear in loop bodies, while
2379 // the src2 and dest operands are live-in / live-out of the loop. Due to this
2380 // design, the algorithm for finding copy insertion points is more
2381 // complicated.
2382 //
2383 // There are three main cases to handle: 1. the reaching defs of the src2
2384 // operands, 2. the reaching uses of the dst operands, and 3. the reaching
2385 // defs of the reaching uses of the dst operand.
2386 //
2387 // In the first case, we simply insert copies after each of the reaching
2388 // definitions. In the second case, we collect all the uses of a given dest
2389 // and organize them by MBB. Then, we insert 1 copy for each MBB before the
2390 // earliest use. Since the use may have multiple reaching defs, and since we
2391 // want to replace the register it is using with the result of the copy, we
2392 // must handle case 3. In the third case, we simply insert a copy after each
2393 // of the reaching defs to connect to the copy of the reaching uses of the dst
2394 // reg. This allows us to avoid inserting copies next to the MFMAs.
2395 //
2396 // While inserting the copies, we maintain a map of operands which will use
2397 // different regs (i.e. the result of the copies). For example, a case 1 src2
2398 // operand will use the register result of the copies after the reaching defs,
2399 // as opposed to the original register. Now that we have completed our copy
2400 // analysis and placement, we can bulk update the registers. We do this
2401 // separately as to avoid complicating the reachingDef and reachingUse
2402 // queries.
2403 //
2404 // While inserting the copies, we also maintain a list of registers which we
2405 // will want to reclassify as AGPR. After doing the copy insertion and the
2406 // register replacement, we can finally do the reclassification. This uses the
2407 // redef map, as the registers we are interested in reclassifying may be
2408 // replaced by the result of a copy. We must do this after the copy analysis
2409 // and placement as we must have an accurate redef map -- otherwise we may end
2410 // up creating illegal instructions.
2411
2412 // The original registers of the MFMA that need to be reclassified as AGPR.
2413 DenseSet<Register> RewriteRegs;
2414 // The map of an original register in the MFMA to a new register (result of a
2415 // copy) that it should be replaced with.
2416 DenseMap<Register, Register> RedefMap;
2417 // The map of the original MFMA registers to the relevant MFMA operands.
2418 DenseMap<Register, DenseSet<MachineOperand *>> ReplaceMap;
2419 // The map of reaching defs for a given register -- to avoid duplicate copies.
2420 DenseMap<Register, SmallPtrSet<MachineInstr *, 8>> ReachingDefCopyMap;
2421 // The map of reaching uses for a given register by basic block -- to avoid
2422 // duplicate copies and to calculate per MBB insert pts.
2423 DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>
2424 ReachingUseTracker;
2425
2426 for (auto &[MI, OriginalOpcode] : RewriteCands) {
2427 int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(Opcode: MI->getOpcode());
2428 if (ReplacementOp == -1)
2429 continue;
2430 MI->setDesc(TII->get(Opcode: ReplacementOp));
2431
2432 // Case 1: insert copies for the reaching defs of the Src2Reg.
2433 MachineOperand *Src2 = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src2);
2434 if (Src2->isReg()) {
2435 Register Src2Reg = Src2->getReg();
2436 if (!Src2Reg.isVirtual())
2437 return false;
2438
2439 Register MappedReg = Src2->getReg();
2440 SmallVector<SlotIndex, 8> Src2ReachingDefs;
2441 findReachingDefs(UseMO&: *Src2, LIS: DAG.LIS, DefIdxs&: Src2ReachingDefs);
2442 SmallSetVector<MachineInstr *, 8> Src2DefsReplace;
2443
2444 for (SlotIndex RDIndex : Src2ReachingDefs) {
2445 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(index: RDIndex);
2446 if (TII->isMAI(MI: *RD))
2447 continue;
2448
2449 // If there is a non-MAI reaching def, then we need a copy.
2450 Src2DefsReplace.insert(X: RD);
2451 }
2452
2453 if (!Src2DefsReplace.empty()) {
2454 DenseMap<Register, Register>::iterator RI = RedefMap.find(Val: Src2Reg);
2455 if (RI != RedefMap.end()) {
2456 MappedReg = RI->second;
2457 } else {
2458 assert(!ReachingDefCopyMap.contains(Src2Reg));
2459 const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Reg: Src2Reg);
2460 const TargetRegisterClass *VGPRRC =
2461 SRI->getEquivalentVGPRClass(SRC: Src2RC);
2462
2463 // Track the mapping of the original register to the new register.
2464 MappedReg = DAG.MRI.createVirtualRegister(RegClass: VGPRRC);
2465 RedefMap[Src2Reg] = MappedReg;
2466 }
2467
2468 // If none exists, create a copy from this reaching def.
2469 // We may have inserted a copy already in an earlier iteration.
2470 for (MachineInstr *RD : Src2DefsReplace) {
2471 // Do not create redundant copies.
2472 if (ReachingDefCopyMap[Src2Reg].insert(Ptr: RD).second) {
2473 MachineInstrBuilder VGPRCopy =
2474 BuildMI(BB&: *RD->getParent(), I: std::next(x: RD->getIterator()),
2475 MIMD: RD->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY))
2476 .addDef(RegNo: MappedReg, Flags: {}, SubReg: 0)
2477 .addUse(RegNo: Src2Reg, Flags: {}, SubReg: 0);
2478 DAG.LIS->InsertMachineInstrInMaps(MI&: *VGPRCopy);
2479
2480 // If this reaching def was the last MI in the region, update the
2481 // region boundaries.
2482 if (LastMIToRegion.contains(Val: RD)) {
2483 unsigned UpdateRegion = LastMIToRegion[RD];
2484 DAG.Regions[UpdateRegion].second = VGPRCopy;
2485 LastMIToRegion.erase(Val: RD);
2486 }
2487 }
2488 }
2489 }
2490
2491 // Track the register for reclassification
2492 RewriteRegs.insert(V: Src2Reg);
2493
2494 // Always insert the operand for replacement. If this corresponds with a
2495 // chain of tied defs, we may not see the VGPR requirement until later.
2496 ReplaceMap[Src2Reg].insert(V: Src2);
2497 }
2498
2499 // Case 2 and Case 3: insert copies before the reaching uses of the dsts,
2500 // and after the reaching defs of the reaching uses of the dsts.
2501
2502 MachineOperand *Dst = &MI->getOperand(i: 0);
2503 Register DstReg = Dst->getReg();
2504 if (!DstReg.isVirtual())
2505 return false;
2506
2507 Register MappedReg = DstReg;
2508 SmallVector<MachineOperand *, 8> DstReachingUses;
2509
2510 SmallVector<MachineOperand *, 8> DstReachingUseCopies;
2511 SmallVector<MachineInstr *, 8> DstUseDefsReplace;
2512
2513 findReachingUses(DefMI: MI, LIS: DAG.LIS, ReachingUses&: DstReachingUses);
2514
2515 for (MachineOperand *RUOp : DstReachingUses) {
2516 if (TII->isMAI(MI: *RUOp->getParent()))
2517 continue;
2518
2519 // If there is a non-MAI reaching use, then we need a copy.
2520 if (find(Range&: DstReachingUseCopies, Val: RUOp) == DstReachingUseCopies.end())
2521 DstReachingUseCopies.push_back(Elt: RUOp);
2522 SmallVector<SlotIndex, 8> DstUsesReachingDefs;
2523 findReachingDefs(UseMO&: *RUOp, LIS: DAG.LIS, DefIdxs&: DstUsesReachingDefs);
2524
2525 for (SlotIndex RDIndex : DstUsesReachingDefs) {
2526 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(index: RDIndex);
2527 if (TII->isMAI(MI: *RD))
2528 continue;
2529
2530 // If there is a non-MAI reaching def of this reaching use, then we will
2531 // need a copy.
2532 if (find(Range&: DstUseDefsReplace, Val: RD) == DstUseDefsReplace.end())
2533 DstUseDefsReplace.push_back(Elt: RD);
2534 }
2535 }
2536
2537 if (!DstUseDefsReplace.empty()) {
2538 DenseMap<Register, Register>::iterator RI = RedefMap.find(Val: DstReg);
2539 if (RI != RedefMap.end()) {
2540 MappedReg = RI->second;
2541 } else {
2542 assert(!ReachingDefCopyMap.contains(DstReg));
2543 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(Reg: DstReg);
2544 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(SRC: DstRC);
2545
2546 // Track the mapping of the original register to the new register.
2547 MappedReg = DAG.MRI.createVirtualRegister(RegClass: VGPRRC);
2548 RedefMap[DstReg] = MappedReg;
2549 }
2550
2551 // If none exists, create a copy from this reaching def.
2552 // We may have inserted a copy already in an earlier iteration.
2553 for (MachineInstr *RD : DstUseDefsReplace) {
2554 // Do not create redundant copies.
2555 if (ReachingDefCopyMap[DstReg].insert(Ptr: RD).second) {
2556 MachineInstrBuilder VGPRCopy =
2557 BuildMI(BB&: *RD->getParent(), I: std::next(x: RD->getIterator()),
2558 MIMD: RD->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY))
2559 .addDef(RegNo: MappedReg, Flags: {}, SubReg: 0)
2560 .addUse(RegNo: DstReg, Flags: {}, SubReg: 0);
2561 DAG.LIS->InsertMachineInstrInMaps(MI&: *VGPRCopy);
2562
2563 // If this reaching def was the last MI in the region, update the
2564 // region boundaries.
2565 DenseMap<MachineInstr *, unsigned>::iterator LMI =
2566 LastMIToRegion.find(Val: RD);
2567 if (LMI != LastMIToRegion.end()) {
2568 unsigned UpdateRegion = LMI->second;
2569 DAG.Regions[UpdateRegion].second = VGPRCopy;
2570 LastMIToRegion.erase(Val: RD);
2571 }
2572 }
2573 }
2574 }
2575
2576 DenseSet<MachineOperand *> &DstRegSet = ReplaceMap[DstReg];
2577 for (MachineOperand *RU : DstReachingUseCopies) {
2578 MachineBasicBlock *RUBlock = RU->getParent()->getParent();
2579 // Just keep track of the reaching use of this register by block. After we
2580 // have scanned all the MFMAs we can find optimal insert pts.
2581 if (RUBlock != MI->getParent()) {
2582 ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(Ptr: RU);
2583 continue;
2584 }
2585
2586 // Special case: the use is in the same block as the MFMA. Insert the copy
2587 // just before the use.
2588 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(Reg: DstReg);
2589 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(SRC: DstRC);
2590 Register NewUseReg = DAG.MRI.createVirtualRegister(RegClass: VGPRRC);
2591 MachineInstr *UseInst = RU->getParent();
2592 MachineInstrBuilder VGPRCopy =
2593 BuildMI(BB&: *UseInst->getParent(), I: UseInst->getIterator(),
2594 MIMD: UseInst->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY))
2595 .addDef(RegNo: NewUseReg, Flags: {}, SubReg: 0)
2596 .addUse(RegNo: DstReg, Flags: {}, SubReg: 0);
2597 DAG.LIS->InsertMachineInstrInMaps(MI&: *VGPRCopy);
2598 // Since we know this use has only one reaching def, we can replace the
2599 // use reg.
2600 RU->setReg(NewUseReg);
2601 // Track the copy source operand for replacement.
2602 DstRegSet.insert(V: &VGPRCopy->getOperand(i: 1));
2603 }
2604
2605 // Track the register for reclassification
2606 RewriteRegs.insert(V: DstReg);
2607
2608 // Insert the dst operand for replacement. If this dst is in a chain of
2609 // tied-def MFMAs, and the first src2 needs to be replaced with a new reg,
2610 // all the corresponding operands need to be replaced.
2611 DstRegSet.insert(V: Dst);
2612 }
2613
2614 // Handle the copies for dst uses.
2615 using RUBType =
2616 std::pair<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>;
2617 for (RUBType RUBlockEntry : ReachingUseTracker) {
2618 using RUDType = std::pair<Register, SmallPtrSet<MachineOperand *, 8>>;
2619 for (RUDType RUDst : RUBlockEntry.second) {
2620 MachineOperand *OpBegin = *RUDst.second.begin();
2621 SlotIndex InstPt = DAG.LIS->getInstructionIndex(Instr: *OpBegin->getParent());
2622
2623 // Find the earliest use in this block.
2624 for (MachineOperand *User : RUDst.second) {
2625 SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(Instr: *User->getParent());
2626 if (SlotIndex::isEarlierInstr(A: NewInstPt, B: InstPt))
2627 InstPt = NewInstPt;
2628 }
2629
2630 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(Reg: RUDst.first);
2631 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(SRC: DstRC);
2632 Register NewUseReg = DAG.MRI.createVirtualRegister(RegClass: VGPRRC);
2633 MachineInstr *UseInst = DAG.LIS->getInstructionFromIndex(index: InstPt);
2634
2635 MachineInstrBuilder VGPRCopy =
2636 BuildMI(BB&: *UseInst->getParent(), I: UseInst->getIterator(),
2637 MIMD: UseInst->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY))
2638 .addDef(RegNo: NewUseReg, Flags: {}, SubReg: 0)
2639 .addUse(RegNo: RUDst.first, Flags: {}, SubReg: 0);
2640 DAG.LIS->InsertMachineInstrInMaps(MI&: *VGPRCopy);
2641
2642 // If this UseInst was the first MI in the region, update the region
2643 // boundaries.
2644 DenseMap<MachineInstr *, unsigned>::iterator FI =
2645 FirstMIToRegion.find(Val: UseInst);
2646 if (FI != FirstMIToRegion.end()) {
2647 unsigned UpdateRegion = FI->second;
2648 DAG.Regions[UpdateRegion].first = VGPRCopy;
2649 FirstMIToRegion.erase(Val: UseInst);
2650 }
2651
2652 // Replace the operand for all users.
2653 for (MachineOperand *User : RUDst.second) {
2654 User->setReg(NewUseReg);
2655 }
2656
2657 // Track the copy source operand for replacement.
2658 ReplaceMap[RUDst.first].insert(V: &VGPRCopy->getOperand(i: 1));
2659 }
2660 }
2661
2662 // We may have needed to insert copies after the reaching defs of the MFMAs.
2663 // Replace the original register with the result of the copy for all relevant
2664 // operands.
2665 for (std::pair<Register, Register> NewDef : RedefMap) {
2666 Register OldReg = NewDef.first;
2667 Register NewReg = NewDef.second;
2668
2669 // Replace the register for any associated operand in the MFMA chain.
2670 for (MachineOperand *ReplaceOp : ReplaceMap[OldReg])
2671 ReplaceOp->setReg(NewReg);
2672 }
2673
2674 // Finally, do the reclassification of the MFMA registers.
2675 for (Register RewriteReg : RewriteRegs) {
2676 Register RegToRewrite = RewriteReg;
2677
2678 // Be sure to update the replacement register and not the original.
2679 DenseMap<Register, Register>::iterator RI = RedefMap.find(Val: RewriteReg);
2680 if (RI != RedefMap.end())
2681 RegToRewrite = RI->second;
2682
2683 const TargetRegisterClass *CurrRC = DAG.MRI.getRegClass(Reg: RegToRewrite);
2684 const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(SRC: CurrRC);
2685
2686 DAG.MRI.setRegClass(Reg: RegToRewrite, RC: AGPRRC);
2687 }
2688
2689 // Bulk update the LIS.
2690 DAG.LIS->reanalyze(MF&: DAG.MF);
2691 // Live-ins may have been modified by cross-RC copies.
2692 RegionPressureMap LiveInUpdater(&DAG, false);
2693 LiveInUpdater.buildLiveRegMap();
2694
2695 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++)
2696 DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(RegionIdx: Region);
2697
2698 DAG.Pressure[RegionIdx] = DAG.getRealRegPressure(RegionIdx);
2699
2700 return true;
2701}
2702
2703unsigned PreRARematStage::getStageTargetOccupancy() const {
2704 return TargetOcc ? *TargetOcc : MFI.getMinWavesPerEU();
2705}
2706
2707bool PreRARematStage::setObjective() {
2708 const Function &F = MF.getFunction();
2709
2710 // Set up "spilling targets" for all regions.
2711 unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
2712 unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
2713 bool HasVectorRegisterExcess = false;
2714 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2715 const GCNRegPressure &RP = DAG.Pressure[I];
2716 GCNRPTarget &Target = RPTargets.emplace_back(Args&: MaxSGPRs, Args&: MaxVGPRs, Args&: MF, Args: RP);
2717 if (!Target.satisfied())
2718 TargetRegions.set(I);
2719 HasVectorRegisterExcess |= Target.hasVectorRegisterExcess();
2720 }
2721
2722 if (HasVectorRegisterExcess || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
2723 // In addition to register usage being above addressable limits, occupancy
2724 // below the minimum is considered like "spilling" as well.
2725 TargetOcc = std::nullopt;
2726 } else {
2727 // There is no spilling and room to improve occupancy; set up "increased
2728 // occupancy targets" for all regions.
2729 TargetOcc = DAG.MinOccupancy + 1;
2730 const unsigned VGPRBlockSize = MFI.getDynamicVGPRBlockSize();
2731 MaxSGPRs = ST.getMaxNumSGPRs(WavesPerEU: *TargetOcc, Addressable: false);
2732 MaxVGPRs = ST.getMaxNumVGPRs(WavesPerEU: *TargetOcc, DynamicVGPRBlockSize: VGPRBlockSize);
2733 for (auto [I, Target] : enumerate(First&: RPTargets)) {
2734 Target.setTarget(NumSGPRs: MaxSGPRs, NumVGPRs: MaxVGPRs);
2735 if (!Target.satisfied())
2736 TargetRegions.set(I);
2737 }
2738 }
2739
2740 return TargetRegions.any();
2741}
2742
2743bool PreRARematStage::collectRematRegs(
2744 const DenseMap<MachineInstr *, unsigned> &MIRegion) {
2745 // We need up-to-date live-out info to query live-out register masks in
2746 // regions containing rematerializable instructions.
2747 DAG.RegionLiveOuts.buildLiveRegMap();
2748
2749 // Set of registers already marked for potential rematerialization; used to
2750 // avoid rematerialization chains.
2751 SmallSet<Register, 4> MarkedRegs;
2752 auto IsMarkedForRemat = [&MarkedRegs](const MachineOperand &MO) -> bool {
2753 return MO.isReg() && MarkedRegs.contains(V: MO.getReg());
2754 };
2755
2756 // Identify rematerializable instructions in the function.
2757 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2758 RegionBoundaries Bounds = DAG.Regions[I];
2759 for (auto MI = Bounds.first; MI != Bounds.second; ++MI) {
2760 // The instruction must be rematerializable.
2761 MachineInstr &DefMI = *MI;
2762 if (!isReMaterializable(MI: DefMI))
2763 continue;
2764
2765 // We only support rematerializing virtual registers with one
2766 // definition.
2767 Register Reg = DefMI.getOperand(i: 0).getReg();
2768 if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(RegNo: Reg))
2769 continue;
2770
2771 // We only care to rematerialize the instruction if it has a single
2772 // non-debug user in a different region.
2773 // FIXME: Allow rematerializations with multiple uses. This should be
2774 // relatively easy to support using the current cost model.
2775 MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(RegNo: Reg);
2776 if (!UseMI)
2777 continue;
2778 auto UseRegion = MIRegion.find(Val: UseMI);
2779 if (UseRegion == MIRegion.end() || UseRegion->second == I)
2780 continue;
2781
2782 // Do not rematerialize an instruction if it uses or is used by an
2783 // instruction that we have designated for rematerialization.
2784 // FIXME: Allow for rematerialization chains: this requires 1. updating
2785 // remat points to account for uses that are rematerialized, and 2.
2786 // either rematerializing the candidates in careful ordering, or
2787 // deferring the MBB RP walk until the entire chain has been
2788 // rematerialized.
2789 const MachineOperand &UseMO = UseMI->getOperand(i: 0);
2790 if (IsMarkedForRemat(UseMO) ||
2791 llvm::any_of(Range: DefMI.operands(), P: IsMarkedForRemat))
2792 continue;
2793
2794 // Do not rematerialize an instruction if it uses registers that aren't
2795 // available at its use. This ensures that we are not extending any live
2796 // range while rematerializing.
2797 SlotIndex UseIdx = DAG.LIS->getInstructionIndex(Instr: *UseMI).getRegSlot(EC: true);
2798 if (!VirtRegAuxInfo::allUsesAvailableAt(MI: &DefMI, UseIdx, LIS: *DAG.LIS, MRI: DAG.MRI,
2799 TII: *DAG.TII))
2800 continue;
2801
2802 // Add the instruction to the rematerializable list.
2803 MarkedRegs.insert(V: Reg);
2804 RematRegs.emplace_back(Args: &DefMI, Args&: UseMI, Args&: DAG, Args: MIRegion);
2805 }
2806 }
2807
2808 return !RematRegs.empty();
2809}
2810
2811PreRARematStage::RematReg::RematReg(
2812 MachineInstr *DefMI, MachineInstr *UseMI, GCNScheduleDAGMILive &DAG,
2813 const DenseMap<MachineInstr *, unsigned> &MIRegion)
2814 : DefMI(DefMI), UseMI(UseMI), LiveIn(DAG.Regions.size()),
2815 LiveOut(DAG.Regions.size()), Live(DAG.Regions.size()),
2816 DefRegion(MIRegion.at(Val: DefMI)), UseRegion(MIRegion.at(Val: UseMI)) {
2817
2818 // Mark regions in which the rematerializable register is live.
2819 Register Reg = getReg();
2820 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2821 auto LiveInIt = DAG.LiveIns[I].find(Val: Reg);
2822 if (LiveInIt != DAG.LiveIns[I].end())
2823 LiveIn.set(I);
2824 const auto &LiveOuts = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: I);
2825 if (auto LiveOutIt = LiveOuts.find(Val: Reg); LiveOutIt != LiveOuts.end())
2826 LiveOut.set(I);
2827 }
2828 Live |= LiveIn;
2829 Live |= LiveOut;
2830 Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: DefRegion).at(Val: Reg);
2831}
2832
2833bool PreRARematStage::RematReg::maybeBeneficial(
2834 const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets) const {
2835 Register Reg = getReg();
2836 for (unsigned I : TargetRegions.set_bits()) {
2837 if (Live[I] && RPTargets[I].isSaveBeneficial(Reg))
2838 return true;
2839 }
2840 return false;
2841}
2842
2843void PreRARematStage::RematReg::insertMI(unsigned RegionIdx,
2844 MachineInstr *RematMI,
2845 GCNScheduleDAGMILive &DAG) const {
2846 RegionBoundaries &Bounds = DAG.Regions[RegionIdx];
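 // If the region previously started right after the insertion point, the
 // rematerialized instruction becomes the region's new first instruction.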
2847 if (Bounds.first == std::next(x: MachineBasicBlock::iterator(RematMI)))
2848 Bounds.first = RematMI;
2849 DAG.LIS->InsertMachineInstrInMaps(MI&: *RematMI);
2850 DAG.LIS->createAndComputeVirtRegInterval(Reg: RematMI->getOperand(i: 0).getReg());
2851}
2852
2853PreRARematStage::ScoredRemat::FreqInfo::FreqInfo(
2854 MachineFunction &MF, const GCNScheduleDAGMILive &DAG) {
2855 assert(DAG.MLI && "MLI not defined in DAG");
2856 MachineBranchProbabilityInfo MBPI;
2857 MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI);
2858
2859 const unsigned NumRegions = DAG.Regions.size();
2860 MinFreq = MBFI.getEntryFreq().getFrequency();
2861 MaxFreq = 0;
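 // Seed MinFreq with the entry block's frequency and MaxFreq with zero so
 // that any region with a known (non-zero) frequency can lower MinFreq or
 // raise MaxFreq in the loop below.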
2862 Regions.reserve(N: NumRegions);
2863 for (unsigned I = 0; I < NumRegions; ++I) {
2864 MachineBasicBlock *MBB = DAG.Regions[I].first->getParent();
2865 uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency();
2866 Regions.push_back(Elt: BlockFreq);
2867 if (BlockFreq && BlockFreq < MinFreq)
2868 MinFreq = BlockFreq;
2869 else if (BlockFreq > MaxFreq)
2870 MaxFreq = BlockFreq;
2871 }
2872 if (!MinFreq)
2873 return;
2874
2875 // Scale everything down if frequencies are high.
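 // Dividing everything by ScaleFactor preserves the relative ordering of
 // region frequencies while reducing their magnitude.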
2876 if (MinFreq >= ScaleFactor * ScaleFactor) {
2877 for (uint64_t &Freq : Regions)
2878 Freq /= ScaleFactor;
2879 MinFreq /= ScaleFactor;
2880 MaxFreq /= ScaleFactor;
2881 }
2882}
2883
2884PreRARematStage::ScoredRemat::ScoredRemat(RematReg *Remat, const FreqInfo &Freq,
2885 const GCNScheduleDAGMILive &DAG)
2886 : Remat(Remat), NumRegs(getNumRegs(DAG)), FreqDiff(getFreqDiff(Freq)) {}
2887
2888unsigned PreRARematStage::ScoredRemat::getNumRegs(
2889 const GCNScheduleDAGMILive &DAG) const {
2890 const TargetRegisterClass &RC = *DAG.MRI.getRegClass(Reg: Remat->getReg());
2891 unsigned RegSize = DAG.TRI->getRegSizeInBits(RC);
2892 if (unsigned SubIdx = Remat->DefMI->getOperand(i: 0).getSubReg()) {
2893 // The following may return -1 (i.e., a large unsigned number) for indices
2894 // that can be used to access subregisters of multiple sizes; in such cases,
2895 // fall back on the size derived from the register class.
2896 unsigned SubRegSize = DAG.TRI->getSubRegIdxSize(Idx: SubIdx);
2897 if (SubRegSize < RegSize)
2898 RegSize = SubRegSize;
2899 }
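 // Express the register's size in 32-bit units; e.g., a full 128-bit register
 // counts as 4 while a 16-bit subregister counts as 1.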
2900 return divideCeil(Numerator: RegSize, Denominator: 32);
2901}
2902
2903int64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const {
2904 // Get the frequencies of the defining and using regions. A rematerialization
2905 // from the least frequent region to the most frequent region yields the
2906 // greatest latency penalty and therefore should get the minimum score.
2907 // Conversely, a rematerialization in the other direction should get the
2908 // maximum score. Default to values that yield the worst possible score given
2909 // known frequencies, in order to penalize rematerializations from or into
2910 // regions whose frequency is unknown.
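 // For example, if the defining region has known frequency 4 and the using
 // region known frequency 32, moving the instruction to the hotter region
 // scores 4 - 32 = -28; the reverse direction scores +28.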
2911 int64_t DefOrMin = std::max(a: Freq.Regions[Remat->DefRegion], b: Freq.MinFreq);
2912 int64_t UseOrMax = Freq.Regions[Remat->UseRegion];
2913 if (!UseOrMax)
2914 UseOrMax = Freq.MaxFreq;
2915 return DefOrMin - UseOrMax;
2916}
2917
2918void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
2919 ArrayRef<GCNRPTarget> RPTargets,
2920 const FreqInfo &FreqInfo,
2921 bool ReduceSpill) {
2922 MaxFreq = 0;
2923 RegionImpact = 0;
2924 for (unsigned I : TargetRegions.set_bits()) {
2925 if (!Remat->Live[I] || !RPTargets[I].isSaveBeneficial(Reg: Remat->getReg()))
2926 continue;
2927 bool UnusedLT = Remat->isUnusedLiveThrough(I);
2928
2929 // Regions in which RP is guaranteed to decrease have more weight.
2930 RegionImpact += UnusedLT ? 2 : 1;
2931
2932 if (ReduceSpill) {
2933 uint64_t Freq = FreqInfo.Regions[I];
2934 if (!UnusedLT) {
2935 // Apply a frequency penalty in regions in which we are not sure that RP
2936 // will decrease.
2937 Freq /= 2;
2938 }
2939 MaxFreq = std::max(a: MaxFreq, b: Freq);
2940 }
2941 }
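 // Weight the per-region impact by the register's size in 32-bit units: e.g.,
 // a guaranteed save of a 128-bit register in one region scores 2 * 4 = 8,
 // more than guaranteed saves of a 32-bit register in two regions (4 * 1 = 4).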
2942 RegionImpact *= NumRegs;
2943}
2944
2945void PreRARematStage::rematerialize(const RematReg &Remat,
2946 BitVector &RecomputeRP,
2947 RollbackInfo *Rollback) {
2948 const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
2949 MachineInstr &DefMI = *Remat.DefMI;
2950 Register Reg = DefMI.getOperand(i: 0).getReg();
2951 Register NewReg = DAG.MRI.cloneVirtualRegister(VReg: Reg);
2952
2953 // Rematerialize the register in the region where it is used.
2954 MachineBasicBlock::iterator InsertPos = Remat.UseMI;
2955 TII->reMaterialize(MBB&: *InsertPos->getParent(), MI: InsertPos, DestReg: NewReg, SubIdx: 0, Orig: DefMI);
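 // reMaterialize inserts the clone immediately before InsertPos, so the newly
 // created instruction is the one just prior to the use.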
2956 MachineInstr *RematMI = &*std::prev(x: InsertPos);
2957 Remat.UseMI->substituteRegister(FromReg: Reg, ToReg: NewReg, SubIdx: 0, RegInfo: *DAG.TRI);
2958 Remat.insertMI(RegionIdx: Remat.UseRegion, RematMI, DAG);
2959 if (Rollback) {
2960 Rollback->RematMI = RematMI;
2961 // Make the original MI a debug value so that it does not influence
2962 // scheduling, and replace all read registers with a sentinel register to
2963 // prevent its operands from appearing in the use-lists of other MIs during
2964 // LIS updates. Store mappings between operand indices and original
2965 // registers for potential rollback.
2966 DefMI.setDesc(TII->get(Opcode: TargetOpcode::DBG_VALUE));
2967 for (auto [Idx, MO] : enumerate(First: Remat.DefMI->operands())) {
2968 if (MO.isReg() && MO.readsReg()) {
2969 Rollback->RegMap.insert(KV: {Idx, MO.getReg()});
2970 MO.setReg(Register());
2971 }
2972 }
2973 } else {
2974 // Just delete the original instruction if it cannot be rolled back.
2975 DAG.deleteMI(RegionIdx: Remat.DefRegion, MI: &DefMI);
2976 }
2977
2978#ifdef EXPENSIVE_CHECKS
2979 // All uses are known to be available / live at the remat point. Thus, the
2980 // uses should already be live-in to the using region.
2981 for (MachineOperand &MO : DefMI.operands()) {
2982 if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
2983 continue;
2984
2985 Register UseReg = MO.getReg();
2986 if (!UseReg.isVirtual())
2987 continue;
2988
2989 LiveInterval &LI = DAG.LIS->getInterval(UseReg);
2990 LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
2991 if (LI.hasSubRanges() && MO.getSubReg())
2992 LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
2993
2994 LaneBitmask LiveInMask = DAG.LiveIns[Remat.UseRegion].at(UseReg);
2995 LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
2996 // If this register has lanes not covered by the LiveIns, be sure they
2997 // do not map to any subrange. ref:
2998 // machine-scheduler-sink-trivial-remats.mir::omitted_subrange
2999 if (UncoveredLanes.any()) {
3000 assert(LI.hasSubRanges());
3001 for (LiveInterval::SubRange &SR : LI.subranges())
3002 assert((SR.LaneMask & UncoveredLanes).none());
3003 }
3004 }
3005#endif
3006
3007 // Remove the register from all regions where it is a live-in or live-out
3008 // and adjust RP targets. The save is guaranteed in regions in which the
3009 // register is live-through and unused but optimistic in all other regions
3010 // where the register is live.
3011 for (unsigned I : Remat.Live.set_bits()) {
3012 RPTargets[I].saveReg(Reg, Mask: Remat.Mask, MRI: DAG.MRI);
3013 DAG.LiveIns[I].erase(Val: Reg);
3014 DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: I).erase(Val: Reg);
3015 if (!Remat.isUnusedLiveThrough(I))
3016 RecomputeRP.set(I);
3017 }
3018
3019 RescheduleRegions |= Remat.Live;
3020}
3021
3022void PreRARematStage::rollback(const RollbackInfo &Rollback) const {
3023 const auto &[Remat, RematMI, RegMap] = Rollback;
3024
3025 // Restore the original defining instruction to its original state.
3026 Remat->DefMI->setDesc(DAG.TII->get(Opcode: RematMI->getOpcode()));
3027 for (const auto &[MOIdx, Reg] : RegMap)
3028 Remat->DefMI->getOperand(i: MOIdx).setReg(Reg);
3029
3030 // Switch back to using the original register and delete the
3031 // rematerialization.
3032 Register Reg = RematMI->getOperand(i: 0).getReg();
3033 Register OriginalReg = Remat->DefMI->getOperand(i: 0).getReg();
3034 Remat->UseMI->substituteRegister(FromReg: Reg, ToReg: OriginalReg, SubIdx: 0, RegInfo: *DAG.TRI);
3035 REMAT_DEBUG(dbgs() << '[' << Remat->UseRegion
3036 << "] Deleting rematerialization " << *RematMI);
3037 DAG.deleteMI(RegionIdx: Remat->UseRegion, MI: RematMI);
3038
3039 // Re-add the defined register as a live-in/live-out in all regions in which
3040 // it was one before rematerialization.
3041 std::pair<Register, LaneBitmask> LiveReg(OriginalReg, Remat->Mask);
3042 for (unsigned I : Remat->LiveIn.set_bits())
3043 DAG.LiveIns[I].insert(KV: LiveReg);
3044 for (unsigned I : Remat->LiveOut.set_bits())
3045 DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: I).insert(KV: LiveReg);
3046}
3047
3048void PreRARematStage::commitRematerializations() const {
3049 REMAT_DEBUG(dbgs() << "Committing all rematerializations\n");
3050 for (const RollbackInfo &Rollback : Rollbacks)
3051 DAG.deleteMI(RegionIdx: Rollback.Remat->DefRegion, MI: Rollback.Remat->DefMI);
3052}
3053
3054void PreRARematStage::unsetSatisifedRPTargets(const BitVector &Regions) {
3055 for (unsigned I : Regions.set_bits()) {
3056 if (TargetRegions[I] && RPTargets[I].satisfied()) {
3057 REMAT_DEBUG(dbgs() << " [" << I << "] Target reached!\n");
3058 TargetRegions.reset(Idx: I);
3059 }
3060 }
3061}
3062
3063bool PreRARematStage::updateAndVerifyRPTargets(const BitVector &Regions) {
3064 bool TooOptimistic = false;
3065 for (unsigned I : Regions.set_bits()) {
3066 GCNRPTarget &Target = RPTargets[I];
3067 Target.setRP(DAG.getRealRegPressure(RegionIdx: I));
3068
3069 // Since we were optimistic in assessing RP decreases in these regions, we
3070 // may need to re-mark the region as a target region if RP did not decrease
3071 // as expected.
3072 if (!TargetRegions[I] && !Target.satisfied()) {
3073 REMAT_DEBUG(dbgs() << " [" << I << "] Incorrect RP estimation\n");
3074 TooOptimistic = true;
3075 TargetRegions.set(I);
3076 }
3077 }
3078 return TooOptimistic;
3079}
3080
3081// Copied from MachineLICM
3082bool PreRARematStage::isReMaterializable(const MachineInstr &MI) {
3083 if (!DAG.TII->isReMaterializable(MI))
3084 return false;
3085
3086 for (const MachineOperand &MO : MI.all_uses()) {
3087 // We can't remat physreg uses, unless the register is constant or the use
3088 // is ignorable (e.g., the implicit exec use on VALU instructions).
3089 if (MO.getReg().isPhysical()) {
3090 if (DAG.MRI.isConstantPhysReg(PhysReg: MO.getReg()) || DAG.TII->isIgnorableUse(MO))
3091 continue;
3092 return false;
3093 }
3094 }
3095
3096 return true;
3097}
3098
3099void PreRARematStage::finalizeGCNSchedStage() {
3100 // We consider reducing spilling to always be beneficial, so we never roll
3101 // back rematerializations or revert scheduling in such cases.
3102 if (!TargetOcc)
3103 return;
3104
3105 // When increasing occupancy, re-scheduling may fail to achieve the target
3106 // occupancy in all regions, in which case re-scheduling must be reverted in
3107 // all regions.
3108 if (DAG.MinOccupancy >= *TargetOcc) {
3109 commitRematerializations();
3110 return;
3111 }
3112 for (const auto &[RegionIdx, OrigMIOrder, MaxPressure] : RegionReverts) {
3113 REMAT_DEBUG(dbgs() << "Reverting re-scheduling in region " << RegionIdx
3114 << '\n');
3115 DAG.Pressure[RegionIdx] = MaxPressure;
3116 modifyRegionSchedule(RegionIdx, MBB: RegionBB[RegionIdx], MIOrder: OrigMIOrder);
3117 }
3118
3119 // It is possible that re-scheduling lowers occupancy below the one achieved
3120 // through rematerializations alone, in which case we revert re-scheduling in
3121 // all regions but do not roll back rematerializations.
3122 if (AchievedOcc >= *TargetOcc) {
3123 commitRematerializations();
3124 DAG.setTargetOccupancy(AchievedOcc);
3125 return;
3126 }
3127 // Reset the target occupancy to what it was pre-rematerialization.
3128 DAG.setTargetOccupancy(*TargetOcc - 1);
3129
3130 // Rollback, then recompute pressure in all affected regions.
3131 REMAT_DEBUG(dbgs() << "==== ROLLBACK ====\n");
3132 BitVector RecomputeRP(DAG.Regions.size());
3133 DenseSet<Register> RecomputeLI;
3134 for (const RollbackInfo &Rollback : Rollbacks) {
3135 rollback(Rollback);
3136 RecomputeRP |= Rollback.Remat->Live;
3137 // Regenerate intervals for all register operands of rematerialized MIs as
3138 // slot indices may have changed slightly from before re-scheduling.
3139 for (MachineOperand &MO : Rollback.Remat->DefMI->operands()) {
3140 if (MO.isReg() && MO.getReg().isVirtual())
3141 RecomputeLI.insert(V: MO.getReg());
3142 }
3143 }
3144 for (Register Reg : RecomputeLI) {
3145 DAG.LIS->removeInterval(Reg);
3146 DAG.LIS->createAndComputeVirtRegInterval(Reg);
3147 }
3148 for (unsigned I : RecomputeRP.set_bits())
3149 DAG.Pressure[I] = DAG.getRealRegPressure(RegionIdx: I);
3150
3151 GCNSchedStage::finalizeGCNSchedStage();
3152}
3153
3154void GCNScheduleDAGMILive::deleteMI(unsigned RegionIdx, MachineInstr *MI) {
3155 // The deleted instruction cannot be the upper region boundary since we never
3156 // delete region terminators.
3157 if (Regions[RegionIdx].first == MI)
3158 Regions[RegionIdx].first = std::next(x: MachineBasicBlock::iterator(MI));
3159 LIS->removeInterval(Reg: MI->getOperand(i: 0).getReg());
3160 LIS->RemoveMachineInstrFromMaps(MI&: *MI);
3161 MI->eraseFromParent();
3162}
3163
3164void GCNScheduleDAGMILive::setTargetOccupancy(unsigned TargetOccupancy) {
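 // Record the new scheduler-wide minimum occupancy and adjust the function's
 // occupancy bound in the same direction.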
3165 MinOccupancy = TargetOccupancy;
3166 if (MFI.getOccupancy() < TargetOccupancy)
3167 MFI.increaseOccupancy(MF, Limit: MinOccupancy);
3168 else
3169 MFI.limitOccupancy(Limit: MinOccupancy);
3170}
3171
3172static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
3173 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
3174 return any_of(Range&: *DAG, P: [SII](MachineBasicBlock::iterator MI) {
3175 return SII->isIGLPMutationOnly(Opcode: MI->getOpcode());
3176 });
3177}
3178
3179GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
3180 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
3181 bool RemoveKillFlags)
3182 : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
3183
3184void GCNPostScheduleDAGMILive::schedule() {
3185 HasIGLPInstrs = hasIGLPInstrs(DAG: this);
3186 if (HasIGLPInstrs) {
3187 SavedMutations.clear();
3188 SavedMutations.swap(x&: Mutations);
3189 addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PostRA));
3190 }
3191
3192 ScheduleDAGMI::schedule();
3193}
3194
3195void GCNPostScheduleDAGMILive::finalizeSchedule() {
3196 if (HasIGLPInstrs)
3197 SavedMutations.swap(x&: Mutations);
3198
3199 ScheduleDAGMI::finalizeSchedule();
3200}
3201