1//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This contains a MachineSchedStrategy implementation for maximizing wave
11/// occupancy on GCN hardware.
12///
13/// This pass will apply multiple scheduling stages to the same function.
14/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15/// entry point for the scheduling of those regions is
16/// GCNScheduleDAGMILive::runSchedStages.
///
18/// Generally, the reason for having multiple scheduling stages is to account
19/// for the kernel-wide effect of register usage on occupancy. Usually, only a
20/// few scheduling regions will have register pressure high enough to limit
21/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22/// other regions.
23///
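/// For example, GCNMaxOccupancySchedStrategy (defined below) runs the
/// OccInitialSchedule, UnclusteredHighRPReschedule,
/// ClusteredLowOccupancyReschedule and PreRARematerialize stages in that
/// order, while GCNMaxILPSchedStrategy runs only ILPInitialSchedule.
///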
24//===----------------------------------------------------------------------===//
25
26#include "GCNSchedStrategy.h"
27#include "AMDGPUIGroupLP.h"
28#include "GCNRegPressure.h"
29#include "SIMachineFunctionInfo.h"
30#include "Utils/AMDGPUBaseInfo.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/CodeGen/RegisterClassInfo.h"
33#include "llvm/MC/LaneBitmask.h"
34#include "llvm/Support/ErrorHandling.h"
35
36#define DEBUG_TYPE "machine-scheduler"
37
38using namespace llvm;
39
40static cl::opt<bool> DisableUnclusterHighRP(
41 "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
42 cl::desc("Disable unclustered high register pressure "
43 "reduction scheduling stage."),
44 cl::init(Val: false));
45
46static cl::opt<bool> DisableClusteredLowOccupancy(
47 "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
48 cl::desc("Disable clustered low occupancy "
49 "rescheduling for ILP scheduling stage."),
50 cl::init(Val: false));
51
52static cl::opt<unsigned> ScheduleMetricBias(
53 "amdgpu-schedule-metric-bias", cl::Hidden,
54 cl::desc(
55 "Sets the bias which adds weight to occupancy vs latency. Set it to "
56 "100 to chase the occupancy only."),
57 cl::init(Val: 10));
58
59static cl::opt<bool>
60 RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
61 cl::desc("Relax occupancy targets for kernels which are memory "
62 "bound (amdgpu-membound-threshold), or "
63 "Wave Limited (amdgpu-limit-wave-threshold)."),
64 cl::init(Val: false));
65
66static cl::opt<bool> GCNTrackers(
67 "amdgpu-use-amdgpu-trackers", cl::Hidden,
68 cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
69 cl::init(Val: false));
70
71const unsigned ScheduleMetrics::ScaleFactor = 100;
72
73GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
74 : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
75 DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
76}
77
78void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
79 GenericScheduler::initialize(dag: DAG);
80
81 MF = &DAG->MF;
82
83 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
84
85 SGPRExcessLimit =
86 Context->RegClassInfo->getNumAllocatableRegs(RC: &AMDGPU::SGPR_32RegClass);
87 VGPRExcessLimit =
88 Context->RegClassInfo->getNumAllocatableRegs(RC: &AMDGPU::VGPR_32RegClass);
89
90 SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  // Set the initial TargetOccupancy to the maximum occupancy that we can
  // achieve for this function. This effectively sets a lower bound on the
  // 'Critical' register limits in the scheduler.
  // Allow lower occupancy targets if the kernel is wave limited or memory
  // bound and the relaxed occupancy feature is enabled.
96 TargetOccupancy =
97 RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
98 SGPRCriticalLimit =
99 std::min(a: ST.getMaxNumSGPRs(WavesPerEU: TargetOccupancy, Addressable: true), b: SGPRExcessLimit);
100
101 if (!KnownExcessRP) {
102 VGPRCriticalLimit = std::min(
103 a: ST.getMaxNumVGPRs(WavesPerEU: TargetOccupancy, DynamicVGPRBlockSize: MFI.getDynamicVGPRBlockSize()),
104 b: VGPRExcessLimit);
105 } else {
    // This is similar to the result of ST.getMaxNumVGPRs(TargetOccupancy),
    // except that it returns a reasonably small number for targets with lots
    // of VGPRs, such as GFX10 and GFX11.
109 LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
110 "VGPRCriticalLimit calculation method.\n");
111 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
112 unsigned Granule =
113 AMDGPU::IsaInfo::getVGPRAllocGranule(STI: &ST, DynamicVGPRBlockSize);
114 unsigned Addressable =
115 AMDGPU::IsaInfo::getAddressableNumVGPRs(STI: &ST, DynamicVGPRBlockSize);
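    // Worked example with hypothetical numbers: with 256 addressable VGPRs, a
    // target occupancy of 8 waves and an allocation granule of 4, the budget
    // below is alignDown(256 / 8, 4) = 32 VGPRs.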
116 unsigned VGPRBudget = alignDown(Value: Addressable / TargetOccupancy, Align: Granule);
117 VGPRBudget = std::max(a: VGPRBudget, b: Granule);
118 VGPRCriticalLimit = std::min(a: VGPRBudget, b: VGPRExcessLimit);
119 }
120
121 // Subtract error margin and bias from register limits and avoid overflow.
122 SGPRCriticalLimit -= std::min(a: SGPRLimitBias + ErrorMargin, b: SGPRCriticalLimit);
123 VGPRCriticalLimit -= std::min(a: VGPRLimitBias + ErrorMargin, b: VGPRCriticalLimit);
124 SGPRExcessLimit -= std::min(a: SGPRLimitBias + ErrorMargin, b: SGPRExcessLimit);
125 VGPRExcessLimit -= std::min(a: VGPRLimitBias + ErrorMargin, b: VGPRExcessLimit);
126
127 LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
128 << ", VGPRExcessLimit = " << VGPRExcessLimit
129 << ", SGPRCriticalLimit = " << SGPRCriticalLimit
130 << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
131}
132
133/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
134/// current register pressure.
135///
136/// This works for the common case, but it has a few exceptions that have been
137/// observed through trial and error:
138/// - Explicit physical register operands
139/// - Subregister definitions
140///
141/// In both of those cases, PressureDiff doesn't represent the actual pressure,
142/// and querying LiveIntervals through the RegPressureTracker is needed to get
143/// an accurate value.
144///
145/// We should eventually only use PressureDiff for maximum performance, but this
146/// already allows 80% of SUs to take the fast path without changing scheduling
147/// at all. Further changes would either change scheduling, or require a lot
148/// more logic to recover an accurate pressure estimate from the PressureDiffs.
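/// For instance, an instruction that defines only a sub-register of a virtual
/// register is rejected here, so the caller falls back to querying the
/// RegPressureTracker for it.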
149static bool canUsePressureDiffs(const SUnit &SU) {
150 if (!SU.isInstr())
151 return false;
152
  // Cannot use pressure diffs for subregister defs or with physregs; the
  // diffs are imprecise in both cases.
155 for (const auto &Op : SU.getInstr()->operands()) {
156 if (!Op.isReg() || Op.isImplicit())
157 continue;
158 if (Op.getReg().isPhysical() ||
159 (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
160 return false;
161 }
162 return true;
163}
164
165static void getRegisterPressures(
166 bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
167 std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
168 GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
169 ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
  // getDownwardPressure() and getUpwardPressure() make temporary changes to
  // the tracker, so we need to pass those functions a non-const reference.
172 RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
173 if (!GCNTrackers) {
174 AtTop
175 ? TempTracker.getDownwardPressure(MI: SU->getInstr(), PressureResult&: Pressure, MaxPressureResult&: MaxPressure)
176 : TempTracker.getUpwardPressure(MI: SU->getInstr(), PressureResult&: Pressure, MaxPressureResult&: MaxPressure);
177
178 return;
179 }
180
181 // GCNTrackers
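  // Query pressure through the AMDGPU-specific trackers. Both paths operate
  // on temporary copies so the strategy's own trackers are not perturbed.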
182 Pressure.resize(new_size: 4, x: 0);
183 MachineInstr *MI = SU->getInstr();
184 GCNRegPressure NewPressure;
185 if (AtTop) {
186 GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
187 NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, TRI: SRI);
188 } else {
189 GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
190 TempUpwardTracker.recede(MI: *MI);
191 NewPressure = TempUpwardTracker.getPressure();
192 }
193 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
194 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
195 NewPressure.getArchVGPRNum();
196 Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
197}
198
199void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
200 bool AtTop,
201 const RegPressureTracker &RPTracker,
202 const SIRegisterInfo *SRI,
203 unsigned SGPRPressure,
204 unsigned VGPRPressure, bool IsBottomUp) {
205 Cand.SU = SU;
206 Cand.AtTop = AtTop;
207
208 if (!DAG->isTrackingPressure())
209 return;
210
211 Pressure.clear();
212 MaxPressure.clear();
213
214 // We try to use the cached PressureDiffs in the ScheduleDAG whenever
215 // possible over querying the RegPressureTracker.
216 //
217 // RegPressureTracker will make a lot of LIS queries which are very
218 // expensive, it is considered a slow function in this context.
219 //
220 // PressureDiffs are precomputed and cached, and getPressureDiff is just a
221 // trivial lookup into an array. It is pretty much free.
222 //
223 // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
224 // PressureDiffs.
225 if (AtTop || !canUsePressureDiffs(SU: *SU) || GCNTrackers) {
226 getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
227 DownwardTracker, UpwardTracker, DAG, SRI);
228 } else {
229 // Reserve 4 slots.
230 Pressure.resize(new_size: 4, x: 0);
231 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
232 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
233
234 for (const auto &Diff : DAG->getPressureDiff(SU)) {
235 if (!Diff.isValid())
236 continue;
      // PressureDiffs are always computed bottom-up, so if we're scheduling
      // top-down we need to invert the sign.
239 Pressure[Diff.getPSet()] +=
240 (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
241 }
242
243#ifdef EXPENSIVE_CHECKS
244 std::vector<unsigned> CheckPressure, CheckMaxPressure;
245 getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
246 DownwardTracker, UpwardTracker, DAG, SRI);
247 if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
248 CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
249 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
250 CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
251 errs() << "Register Pressure is inaccurate when calculated through "
252 "PressureDiff\n"
253 << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
254 << ", expected "
255 << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
256 << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
257 << ", expected "
258 << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
259 report_fatal_error("inaccurate register pressure calculation");
260 }
261#endif
262 }
263
264 unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
265 unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
266
267 // If two instructions increase the pressure of different register sets
268 // by the same amount, the generic scheduler will prefer to schedule the
269 // instruction that increases the set with the least amount of registers,
270 // which in our case would be SGPRs. This is rarely what we want, so
271 // when we report excess/critical register pressure, we do it either
272 // only for VGPRs or only for SGPRs.
273
274 // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
275 const unsigned MaxVGPRPressureInc = 16;
276 bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
277 bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
278
279 // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
280 // to increase the likelihood we don't go over the limits. We should improve
281 // the analysis to look through dependencies to find the path with the least
282 // register pressure.
283
284 // We only need to update the RPDelta for instructions that increase register
285 // pressure. Instructions that decrease or keep reg pressure the same will be
286 // marked as RegExcess in tryCandidate() when they are compared with
287 // instructions that increase the register pressure.
288 if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
289 HasHighPressure = true;
290 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
291 Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
292 }
293
294 if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
295 HasHighPressure = true;
296 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
297 Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
298 }
299
  // Register pressure is considered 'CRITICAL' if it is approaching a value
  // that would reduce the wave occupancy for the execution unit. When
  // register pressure is 'CRITICAL', increasing SGPR pressure and increasing
  // VGPR pressure have the same cost, so we don't need to prefer one over the
  // other.
304
305 int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
306 int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
307
308 if (SGPRDelta >= 0 || VGPRDelta >= 0) {
309 HasHighPressure = true;
310 if (SGPRDelta > VGPRDelta) {
311 Cand.RPDelta.CriticalMax =
312 PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
313 Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
314 } else {
315 Cand.RPDelta.CriticalMax =
316 PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
317 Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
318 }
319 }
320}
321
322// This function is mostly cut and pasted from
323// GenericScheduler::pickNodeFromQueue()
324void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
325 const CandPolicy &ZonePolicy,
326 const RegPressureTracker &RPTracker,
327 SchedCandidate &Cand,
328 bool IsBottomUp) {
329 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
330 ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
331 unsigned SGPRPressure = 0;
332 unsigned VGPRPressure = 0;
333 if (DAG->isTrackingPressure()) {
334 if (!GCNTrackers) {
335 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
336 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
337 } else {
338 GCNRPTracker *T = IsBottomUp
339 ? static_cast<GCNRPTracker *>(&UpwardTracker)
340 : static_cast<GCNRPTracker *>(&DownwardTracker);
341 SGPRPressure = T->getPressure().getSGPRNum();
342 VGPRPressure = T->getPressure().getArchVGPRNum();
343 }
344 }
345 ReadyQueue &Q = Zone.Available;
346 for (SUnit *SU : Q) {
347
348 SchedCandidate TryCand(ZonePolicy);
349 initCandidate(Cand&: TryCand, SU, AtTop: Zone.isTop(), RPTracker, SRI, SGPRPressure,
350 VGPRPressure, IsBottomUp);
351 // Pass SchedBoundary only when comparing nodes from the same boundary.
352 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
353 tryCandidate(Cand, TryCand, Zone: ZoneArg);
354 if (TryCand.Reason != NoCand) {
355 // Initialize resource delta if needed in case future heuristics query it.
356 if (TryCand.ResDelta == SchedResourceDelta())
357 TryCand.initResourceDelta(DAG: Zone.DAG, SchedModel);
358 Cand.setBest(TryCand);
359 LLVM_DEBUG(traceCandidate(Cand));
360 }
361 }
362}
363
364// This function is mostly cut and pasted from
365// GenericScheduler::pickNodeBidirectional()
366SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
367 // Schedule as far as possible in the direction of no choice. This is most
368 // efficient, but also provides the best heuristics for CriticalPSets.
369 if (SUnit *SU = Bot.pickOnlyChoice()) {
370 IsTopNode = false;
371 return SU;
372 }
373 if (SUnit *SU = Top.pickOnlyChoice()) {
374 IsTopNode = true;
375 return SU;
376 }
377 // Set the bottom-up policy based on the state of the current bottom zone and
378 // the instructions outside the zone, including the top zone.
379 CandPolicy BotPolicy;
380 setPolicy(Policy&: BotPolicy, /*IsPostRA=*/false, CurrZone&: Bot, OtherZone: &Top);
381 // Set the top-down policy based on the state of the current top zone and
382 // the instructions outside the zone, including the bottom zone.
383 CandPolicy TopPolicy;
384 setPolicy(Policy&: TopPolicy, /*IsPostRA=*/false, CurrZone&: Top, OtherZone: &Bot);
385
386 // See if BotCand is still valid (because we previously scheduled from Top).
387 LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
388 if (!BotCand.isValid() || BotCand.SU->isScheduled ||
389 BotCand.Policy != BotPolicy) {
390 BotCand.reset(NewPolicy: CandPolicy());
391 pickNodeFromQueue(Zone&: Bot, ZonePolicy: BotPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand,
392 /*IsBottomUp=*/true);
393 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
394 } else {
395 LLVM_DEBUG(traceCandidate(BotCand));
396#ifndef NDEBUG
397 if (VerifyScheduling) {
398 SchedCandidate TCand;
399 TCand.reset(CandPolicy());
400 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
401 /*IsBottomUp=*/true);
402 assert(TCand.SU == BotCand.SU &&
403 "Last pick result should correspond to re-picking right now");
404 }
405#endif
406 }
407
408 // Check if the top Q has a better candidate.
409 LLVM_DEBUG(dbgs() << "Picking from Top:\n");
410 if (!TopCand.isValid() || TopCand.SU->isScheduled ||
411 TopCand.Policy != TopPolicy) {
412 TopCand.reset(NewPolicy: CandPolicy());
413 pickNodeFromQueue(Zone&: Top, ZonePolicy: TopPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand,
414 /*IsBottomUp=*/false);
415 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
416 } else {
417 LLVM_DEBUG(traceCandidate(TopCand));
418#ifndef NDEBUG
419 if (VerifyScheduling) {
420 SchedCandidate TCand;
421 TCand.reset(CandPolicy());
422 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
423 /*IsBottomUp=*/false);
424 assert(TCand.SU == TopCand.SU &&
425 "Last pick result should correspond to re-picking right now");
426 }
427#endif
428 }
429
430 // Pick best from BotCand and TopCand.
431 LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
432 dbgs() << "Bot Cand: "; traceCandidate(BotCand););
433 SchedCandidate Cand = BotCand;
434 TopCand.Reason = NoCand;
435 tryCandidate(Cand, TryCand&: TopCand, Zone: nullptr);
436 if (TopCand.Reason != NoCand) {
437 Cand.setBest(TopCand);
438 }
439 LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
440
441 IsTopNode = Cand.AtTop;
442 return Cand.SU;
443}
444
445// This function is mostly cut and pasted from
446// GenericScheduler::pickNode()
447SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
448 if (DAG->top() == DAG->bottom()) {
449 assert(Top.Available.empty() && Top.Pending.empty() &&
450 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
451 return nullptr;
452 }
453 SUnit *SU;
454 do {
455 if (RegionPolicy.OnlyTopDown) {
456 SU = Top.pickOnlyChoice();
457 if (!SU) {
458 CandPolicy NoPolicy;
459 TopCand.reset(NewPolicy: NoPolicy);
460 pickNodeFromQueue(Zone&: Top, ZonePolicy: NoPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand,
461 /*IsBottomUp=*/false);
462 assert(TopCand.Reason != NoCand && "failed to find a candidate");
463 SU = TopCand.SU;
464 }
465 IsTopNode = true;
466 } else if (RegionPolicy.OnlyBottomUp) {
467 SU = Bot.pickOnlyChoice();
468 if (!SU) {
469 CandPolicy NoPolicy;
470 BotCand.reset(NewPolicy: NoPolicy);
471 pickNodeFromQueue(Zone&: Bot, ZonePolicy: NoPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand,
472 /*IsBottomUp=*/true);
473 assert(BotCand.Reason != NoCand && "failed to find a candidate");
474 SU = BotCand.SU;
475 }
476 IsTopNode = false;
477 } else {
478 SU = pickNodeBidirectional(IsTopNode);
479 }
480 } while (SU->isScheduled);
481
482 if (SU->isTopReady())
483 Top.removeReady(SU);
484 if (SU->isBottomReady())
485 Bot.removeReady(SU);
486
487 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
488 << *SU->getInstr());
489 return SU;
490}
491
492void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
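  // Keep the AMDGPU RP trackers in sync with the node that was just scheduled
  // before delegating to the generic implementation.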
493 if (GCNTrackers) {
494 MachineInstr *MI = SU->getInstr();
495 IsTopNode ? (void)DownwardTracker.advance(MI, UseInternalIterator: false)
496 : UpwardTracker.recede(MI: *MI);
497 }
498
499 return GenericScheduler::schedNode(SU, IsTopNode);
500}
501
502GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
503 assert(CurrentStage && CurrentStage != SchedStages.end());
504 return *CurrentStage;
505}
506
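/// Advances to the next scheduling stage. The first call selects the first
/// stage; returns false once all stages have been exhausted.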
507bool GCNSchedStrategy::advanceStage() {
508 assert(CurrentStage != SchedStages.end());
509 if (!CurrentStage)
510 CurrentStage = SchedStages.begin();
511 else
512 CurrentStage++;
513
514 return CurrentStage != SchedStages.end();
515}
516
517bool GCNSchedStrategy::hasNextStage() const {
518 assert(CurrentStage);
519 return std::next(x: CurrentStage) != SchedStages.end();
520}
521
522GCNSchedStageID GCNSchedStrategy::getNextStage() const {
523 assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
524 return *std::next(x: CurrentStage);
525}
526
527GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
528 const MachineSchedContext *C, bool IsLegacyScheduler)
529 : GCNSchedStrategy(C) {
530 SchedStages.push_back(Elt: GCNSchedStageID::OccInitialSchedule);
531 SchedStages.push_back(Elt: GCNSchedStageID::UnclusteredHighRPReschedule);
532 SchedStages.push_back(Elt: GCNSchedStageID::ClusteredLowOccupancyReschedule);
533 SchedStages.push_back(Elt: GCNSchedStageID::PreRARematerialize);
534 GCNTrackers = GCNTrackers & !IsLegacyScheduler;
535}
536
537GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
538 : GCNSchedStrategy(C) {
539 SchedStages.push_back(Elt: GCNSchedStageID::ILPInitialSchedule);
540}
541
542bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
543 SchedCandidate &TryCand,
544 SchedBoundary *Zone) const {
545 // Initialize the candidate if needed.
546 if (!Cand.isValid()) {
547 TryCand.Reason = NodeOrder;
548 return true;
549 }
550
551 // Avoid spilling by exceeding the register limit.
552 if (DAG->isTrackingPressure() &&
553 tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand,
554 Reason: RegExcess, TRI, MF: DAG->MF))
555 return TryCand.Reason != NoCand;
556
  // Bias physical register defs and copies toward their uses and defs,
  // respectively.
558 if (tryGreater(TryVal: biasPhysReg(SU: TryCand.SU, isTop: TryCand.AtTop),
559 CandVal: biasPhysReg(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: PhysReg))
560 return TryCand.Reason != NoCand;
561
562 bool SameBoundary = Zone != nullptr;
563 if (SameBoundary) {
564 // Prioritize instructions that read unbuffered resources by stall cycles.
565 if (tryLess(TryVal: Zone->getLatencyStallCycles(SU: TryCand.SU),
566 CandVal: Zone->getLatencyStallCycles(SU: Cand.SU), TryCand, Cand, Reason: Stall))
567 return TryCand.Reason != NoCand;
568
569 // Avoid critical resource consumption and balance the schedule.
570 TryCand.initResourceDelta(DAG, SchedModel);
571 if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources,
572 TryCand, Cand, Reason: ResourceReduce))
573 return TryCand.Reason != NoCand;
574 if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources,
575 CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand,
576 Reason: ResourceDemand))
577 return TryCand.Reason != NoCand;
578
579 // Unconditionally try to reduce latency.
580 if (tryLatency(TryCand, Cand, Zone&: *Zone))
581 return TryCand.Reason != NoCand;
582
583 // Weak edges are for clustering and other constraints.
584 if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop),
585 CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak))
586 return TryCand.Reason != NoCand;
587 }
588
589 // Keep clustered nodes together to encourage downstream peephole
590 // optimizations which may reduce resource requirements.
591 //
592 // This is a best effort to set things up for a post-RA pass. Optimizations
593 // like generating loads of multiple registers should ideally be done within
594 // the scheduler pass by combining the loads during DAG postprocessing.
595 const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
596 const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
597 if (tryGreater(TryVal: TryCandCluster && TryCandCluster->contains(Ptr: TryCand.SU),
598 CandVal: CandCluster && CandCluster->contains(Ptr: Cand.SU), TryCand, Cand,
599 Reason: Cluster))
600 return TryCand.Reason != NoCand;
601
602 // Avoid increasing the max critical pressure in the scheduled region.
603 if (DAG->isTrackingPressure() &&
604 tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax,
605 TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF))
606 return TryCand.Reason != NoCand;
607
608 // Avoid increasing the max pressure of the entire region.
609 if (DAG->isTrackingPressure() &&
610 tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand,
611 Cand, Reason: RegMax, TRI, MF: DAG->MF))
612 return TryCand.Reason != NoCand;
613
614 if (SameBoundary) {
615 // Fall through to original instruction order.
616 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
617 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
618 TryCand.Reason = NodeOrder;
619 return true;
620 }
621 }
622 return false;
623}
624
625GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy(
626 const MachineSchedContext *C)
627 : GCNSchedStrategy(C) {
628 SchedStages.push_back(Elt: GCNSchedStageID::MemoryClauseInitialSchedule);
629}
630
/// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory
/// instructions as much as possible. This is achieved by:
/// 1. Prioritizing clustered operations before the stall latency heuristic.
/// 2. Prioritizing long-latency loads before the stall latency heuristic.
635///
636/// \param Cand provides the policy and current best candidate.
637/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
638/// \param Zone describes the scheduled zone that we are extending, or nullptr
639/// if Cand is from a different zone than TryCand.
640/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
641bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
642 SchedCandidate &TryCand,
643 SchedBoundary *Zone) const {
644 // Initialize the candidate if needed.
645 if (!Cand.isValid()) {
646 TryCand.Reason = NodeOrder;
647 return true;
648 }
649
  // Bias physical register defs and copies toward their uses and defs,
  // respectively.
651 if (tryGreater(TryVal: biasPhysReg(SU: TryCand.SU, isTop: TryCand.AtTop),
652 CandVal: biasPhysReg(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: PhysReg))
653 return TryCand.Reason != NoCand;
654
655 if (DAG->isTrackingPressure()) {
656 // Avoid exceeding the target's limit.
657 if (tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand,
658 Reason: RegExcess, TRI, MF: DAG->MF))
659 return TryCand.Reason != NoCand;
660
661 // Avoid increasing the max critical pressure in the scheduled region.
662 if (tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax,
663 TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF))
664 return TryCand.Reason != NoCand;
665 }
666
667 // MaxMemoryClause-specific: We prioritize clustered instructions as we would
668 // get more benefit from clausing these memory instructions.
669 const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
670 const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
671 if (tryGreater(TryVal: TryCandCluster && TryCandCluster->contains(Ptr: TryCand.SU),
672 CandVal: CandCluster && CandCluster->contains(Ptr: Cand.SU), TryCand, Cand,
673 Reason: Cluster))
674 return TryCand.Reason != NoCand;
675
676 // We only compare a subset of features when comparing nodes between
677 // Top and Bottom boundary. Some properties are simply incomparable, in many
678 // other instances we should only override the other boundary if something
679 // is a clear good pick on one boundary. Skip heuristics that are more
680 // "tie-breaking" in nature.
681 bool SameBoundary = Zone != nullptr;
682 if (SameBoundary) {
    // For loops that are acyclic path limited, aggressively schedule for
    // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
    // heuristics to take precedence.
686 if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
687 tryLatency(TryCand, Cand, Zone&: *Zone))
688 return TryCand.Reason != NoCand;
689
    // MaxMemoryClause-specific: Prioritize long-latency memory loads in
    // top-bottom order to hide more latency. The mayLoad check is used to
    // exclude store-like instructions, which we do not want to schedule too
    // early.
694 bool TryMayLoad =
695 TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
696 bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
697
698 if (TryMayLoad || CandMayLoad) {
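      // Treat a load as "long latency" only when its latency exceeds the
      // other candidate's by more than a factor of 10.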
699 bool TryLongLatency =
700 TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
701 bool CandLongLatency =
702 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
703
704 if (tryGreater(TryVal: Zone->isTop() ? TryLongLatency : CandLongLatency,
705 CandVal: Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
706 Cand, Reason: Stall))
707 return TryCand.Reason != NoCand;
708 }
709 // Prioritize instructions that read unbuffered resources by stall cycles.
710 if (tryLess(TryVal: Zone->getLatencyStallCycles(SU: TryCand.SU),
711 CandVal: Zone->getLatencyStallCycles(SU: Cand.SU), TryCand, Cand, Reason: Stall))
712 return TryCand.Reason != NoCand;
713 }
714
715 if (SameBoundary) {
716 // Weak edges are for clustering and other constraints.
717 if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop),
718 CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak))
719 return TryCand.Reason != NoCand;
720 }
721
722 // Avoid increasing the max pressure of the entire region.
723 if (DAG->isTrackingPressure() &&
724 tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand,
725 Cand, Reason: RegMax, TRI, MF: DAG->MF))
726 return TryCand.Reason != NoCand;
727
728 if (SameBoundary) {
729 // Avoid critical resource consumption and balance the schedule.
730 TryCand.initResourceDelta(DAG, SchedModel);
731 if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources,
732 TryCand, Cand, Reason: ResourceReduce))
733 return TryCand.Reason != NoCand;
734 if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources,
735 CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand,
736 Reason: ResourceDemand))
737 return TryCand.Reason != NoCand;
738
739 // Avoid serializing long latency dependence chains.
740 // For acyclic path limited loops, latency was already checked above.
741 if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
742 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone&: *Zone))
743 return TryCand.Reason != NoCand;
744
745 // Fall through to original instruction order.
746 if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
747 assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
748 TryCand.Reason = NodeOrder;
749 return true;
750 }
751 }
752
753 return false;
754}
755
756GCNScheduleDAGMILive::GCNScheduleDAGMILive(
757 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
758 : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
759 MFI(*MF.getInfo<SIMachineFunctionInfo>()),
760 StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
761 RegionLiveOuts(this, /*IsLiveOut=*/true) {
762
763 // We want regions with a single MI to be scheduled so that we can reason
764 // about them correctly during scheduling stages that move MIs between regions
765 // (e.g., rematerialization).
766 ScheduleSingleMIRegions = true;
767 LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
768 if (RelaxedOcc) {
769 MinOccupancy = std::min(a: MFI.getMinAllowedOccupancy(), b: StartingOccupancy);
770 if (MinOccupancy != StartingOccupancy)
771 LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
772 << ".\n");
773 }
774}
775
776std::unique_ptr<GCNSchedStage>
777GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
778 switch (SchedStageID) {
779 case GCNSchedStageID::OccInitialSchedule:
780 return std::make_unique<OccInitialScheduleStage>(args&: SchedStageID, args&: *this);
781 case GCNSchedStageID::UnclusteredHighRPReschedule:
782 return std::make_unique<UnclusteredHighRPStage>(args&: SchedStageID, args&: *this);
783 case GCNSchedStageID::ClusteredLowOccupancyReschedule:
784 return std::make_unique<ClusteredLowOccStage>(args&: SchedStageID, args&: *this);
785 case GCNSchedStageID::PreRARematerialize:
786 return std::make_unique<PreRARematStage>(args&: SchedStageID, args&: *this);
787 case GCNSchedStageID::ILPInitialSchedule:
788 return std::make_unique<ILPInitialScheduleStage>(args&: SchedStageID, args&: *this);
789 case GCNSchedStageID::MemoryClauseInitialSchedule:
790 return std::make_unique<MemoryClauseInitialScheduleStage>(args&: SchedStageID,
791 args&: *this);
792 }
793
794 llvm_unreachable("Unknown SchedStageID.");
795}
796
797void GCNScheduleDAGMILive::schedule() {
798 // Collect all scheduling regions. The actual scheduling is performed in
799 // GCNScheduleDAGMILive::finalizeSchedule.
800 Regions.push_back(Elt: std::pair(RegionBegin, RegionEnd));
801}
802
803GCNRegPressure
804GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
805 GCNDownwardRPTracker RPTracker(*LIS);
806 RPTracker.advance(Begin: begin(), End: end(), LiveRegsCopy: &LiveIns[RegionIdx]);
807 return RPTracker.moveMaxPressure();
808}
809
810static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin,
811 MachineBasicBlock::iterator RegionEnd) {
812 auto REnd = RegionEnd == RegionBegin->getParent()->end()
813 ? std::prev(x: RegionEnd)
814 : RegionEnd;
815 return &*skipDebugInstructionsBackward(It: REnd, Begin: RegionBegin);
816}
817
818void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
819 const MachineBasicBlock *MBB) {
820 GCNDownwardRPTracker RPTracker(*LIS);
821
  // If the block has only one successor then the live-ins of that successor
  // are the live-outs of the current block. We can reuse the calculated live
  // set if the successor will be sent for scheduling after the current block.

  // However, due to a bug in LiveInterval analysis it may happen that two
  // predecessors of the same successor block have different lane bitmasks for
  // a live-out register. Work around that by sticking to a one-to-one
  // relationship, i.e. one predecessor with one successor block.
830 const MachineBasicBlock *OnlySucc = nullptr;
831 if (MBB->succ_size() == 1) {
832 auto *Candidate = *MBB->succ_begin();
833 if (!Candidate->empty() && Candidate->pred_size() == 1) {
834 SlotIndexes *Ind = LIS->getSlotIndexes();
835 if (Ind->getMBBStartIdx(mbb: MBB) < Ind->getMBBStartIdx(mbb: Candidate))
836 OnlySucc = Candidate;
837 }
838 }
839
  // The scheduler sends regions from the end of the block upwards.
841 size_t CurRegion = RegionIdx;
842 for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
843 if (Regions[CurRegion].first->getParent() != MBB)
844 break;
845 --CurRegion;
846
847 auto I = MBB->begin();
848 auto LiveInIt = MBBLiveIns.find(Val: MBB);
849 auto &Rgn = Regions[CurRegion];
850 auto *NonDbgMI = &*skipDebugInstructionsForward(It: Rgn.first, End: Rgn.second);
851 if (LiveInIt != MBBLiveIns.end()) {
852 auto LiveIn = std::move(LiveInIt->second);
853 RPTracker.reset(MI: *MBB->begin(), LiveRegs: &LiveIn);
854 MBBLiveIns.erase(I: LiveInIt);
855 } else {
856 I = Rgn.first;
857 auto LRS = BBLiveInMap.lookup(Val: NonDbgMI);
858#ifdef EXPENSIVE_CHECKS
859 assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
860#endif
861 RPTracker.reset(MI: *I, LiveRegs: &LRS);
862 }
863
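  // Walk the block top-down with the tracker: record each region's live-ins
  // when reaching its first instruction and its max pressure when reaching
  // its end, stopping once the region at RegionIdx has been processed.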
864 for (;;) {
865 I = RPTracker.getNext();
866
867 if (Regions[CurRegion].first == I || NonDbgMI == I) {
868 LiveIns[CurRegion] = RPTracker.getLiveRegs();
869 RPTracker.clearMaxPressure();
870 }
871
872 if (Regions[CurRegion].second == I) {
873 Pressure[CurRegion] = RPTracker.moveMaxPressure();
874 if (CurRegion-- == RegionIdx)
875 break;
876 auto &Rgn = Regions[CurRegion];
877 NonDbgMI = &*skipDebugInstructionsForward(It: Rgn.first, End: Rgn.second);
878 }
879 RPTracker.advanceToNext();
880 RPTracker.advanceBeforeNext();
881 }
882
883 if (OnlySucc) {
884 if (I != MBB->end()) {
885 RPTracker.advanceToNext();
886 RPTracker.advance(End: MBB->end());
887 }
888 RPTracker.advanceBeforeNext();
889 MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
890 }
891}
892
893DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
894GCNScheduleDAGMILive::getRegionLiveInMap() const {
895 assert(!Regions.empty());
896 std::vector<MachineInstr *> RegionFirstMIs;
897 RegionFirstMIs.reserve(n: Regions.size());
898 auto I = Regions.rbegin(), E = Regions.rend();
899 do {
900 const MachineBasicBlock *MBB = I->first->getParent();
901 auto *MI = &*skipDebugInstructionsForward(It: I->first, End: I->second);
902 RegionFirstMIs.push_back(x: MI);
903 do {
904 ++I;
905 } while (I != E && I->first->getParent() == MBB);
906 } while (I != E);
907 return getLiveRegMap(R&: RegionFirstMIs, /*After=*/false, LIS&: *LIS);
908}
909
910DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
911GCNScheduleDAGMILive::getRegionLiveOutMap() const {
912 assert(!Regions.empty());
913 std::vector<MachineInstr *> RegionLastMIs;
914 RegionLastMIs.reserve(n: Regions.size());
915 for (auto &[RegionBegin, RegionEnd] : reverse(C: Regions))
916 RegionLastMIs.push_back(x: getLastMIForRegion(RegionBegin, RegionEnd));
917
918 return getLiveRegMap(R&: RegionLastMIs, /*After=*/true, LIS&: *LIS);
919}
920
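// Builds the region-index -> live register set mapping, keying each region on
// its first instruction for live-ins or its last instruction for live-outs.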
921void RegionPressureMap::buildLiveRegMap() {
922 IdxToInstruction.clear();
923
924 RegionLiveRegMap =
925 IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
926 for (unsigned I = 0; I < DAG->Regions.size(); I++) {
927 MachineInstr *RegionKey =
928 IsLiveOut
929 ? getLastMIForRegion(RegionBegin: DAG->Regions[I].first, RegionEnd: DAG->Regions[I].second)
930 : &*DAG->Regions[I].first;
931 IdxToInstruction[I] = RegionKey;
932 }
933}
934
935void GCNScheduleDAGMILive::finalizeSchedule() {
936 // Start actual scheduling here. This function is called by the base
937 // MachineScheduler after all regions have been recorded by
938 // GCNScheduleDAGMILive::schedule().
939 LiveIns.resize(N: Regions.size());
940 Pressure.resize(N: Regions.size());
941 RegionsWithHighRP.resize(N: Regions.size());
942 RegionsWithExcessRP.resize(N: Regions.size());
943 RegionsWithMinOcc.resize(N: Regions.size());
944 RegionsWithIGLPInstrs.resize(N: Regions.size());
945 RegionsWithHighRP.reset();
946 RegionsWithExcessRP.reset();
947 RegionsWithMinOcc.reset();
948 RegionsWithIGLPInstrs.reset();
949
950 runSchedStages();
951}
952
953void GCNScheduleDAGMILive::runSchedStages() {
954 LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
955
956 if (!Regions.empty()) {
957 BBLiveInMap = getRegionLiveInMap();
958 if (GCNTrackers)
959 RegionLiveOuts.buildLiveRegMap();
960 }
961
962 GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
963 while (S.advanceStage()) {
964 auto Stage = createSchedStage(SchedStageID: S.getCurrentStage());
965 if (!Stage->initGCNSchedStage())
966 continue;
967
968 for (auto Region : Regions) {
969 RegionBegin = Region.first;
970 RegionEnd = Region.second;
971 // Setup for scheduling the region and check whether it should be skipped.
972 if (!Stage->initGCNRegion()) {
973 Stage->advanceRegion();
974 exitRegion();
975 continue;
976 }
977
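      // Seed the AMDGPU trackers with the region's boundary live sets:
      // live-ins for the downward tracker, live-outs for the upward tracker.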
978 if (GCNTrackers) {
979 GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
980 GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
981 GCNRPTracker::LiveRegSet *RegionLiveIns =
982 &LiveIns[Stage->getRegionIdx()];
983
984 reinterpret_cast<GCNRPTracker *>(DownwardTracker)
985 ->reset(MRI_: MRI, LiveRegs_: *RegionLiveIns);
986 reinterpret_cast<GCNRPTracker *>(UpwardTracker)
987 ->reset(MRI_: MRI, LiveRegs_: RegionLiveOuts.getLiveRegsForRegionIdx(
988 RegionIdx: Stage->getRegionIdx()));
989 }
990
991 ScheduleDAGMILive::schedule();
992 Stage->finalizeGCNRegion();
993 }
994
995 Stage->finalizeGCNSchedStage();
996 }
997}
998
999#ifndef NDEBUG
1000raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
1001 switch (StageID) {
1002 case GCNSchedStageID::OccInitialSchedule:
1003 OS << "Max Occupancy Initial Schedule";
1004 break;
1005 case GCNSchedStageID::UnclusteredHighRPReschedule:
1006 OS << "Unclustered High Register Pressure Reschedule";
1007 break;
1008 case GCNSchedStageID::ClusteredLowOccupancyReschedule:
1009 OS << "Clustered Low Occupancy Reschedule";
1010 break;
1011 case GCNSchedStageID::PreRARematerialize:
1012 OS << "Pre-RA Rematerialize";
1013 break;
1014 case GCNSchedStageID::ILPInitialSchedule:
1015 OS << "Max ILP Initial Schedule";
1016 break;
1017 case GCNSchedStageID::MemoryClauseInitialSchedule:
1018 OS << "Max memory clause Initial Schedule";
1019 break;
1020 }
1021
1022 return OS;
1023}
1024#endif
1025
1026GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
1027 : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
1028 MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
1029
1030bool GCNSchedStage::initGCNSchedStage() {
1031 if (!DAG.LIS)
1032 return false;
1033
1034 LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
1035 return true;
1036}
1037
1038bool UnclusteredHighRPStage::initGCNSchedStage() {
1039 if (DisableUnclusterHighRP)
1040 return false;
1041
1042 if (!GCNSchedStage::initGCNSchedStage())
1043 return false;
1044
1045 if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
1046 return false;
1047
1048 SavedMutations.swap(x&: DAG.Mutations);
1049 DAG.addMutation(
1050 Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PreRAReentry));
1051
1052 InitialOccupancy = DAG.MinOccupancy;
  // Aggressively try to reduce register pressure in the unclustered high RP
  // stage. Temporarily increase the occupancy target in the region.
1055 S.SGPRLimitBias = S.HighRPSGPRBias;
1056 S.VGPRLimitBias = S.HighRPVGPRBias;
1057 if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
1058 MFI.increaseOccupancy(MF, Limit: ++DAG.MinOccupancy);
1059
  LLVM_DEBUG(
      dbgs()
      << "Retrying function scheduling without clustering. "
         "Aggressively try to reduce register pressure to achieve occupancy "
      << DAG.MinOccupancy << ".\n");
1065
1066 return true;
1067}
1068
1069bool ClusteredLowOccStage::initGCNSchedStage() {
1070 if (DisableClusteredLowOccupancy)
1071 return false;
1072
1073 if (!GCNSchedStage::initGCNSchedStage())
1074 return false;
1075
1076 // Don't bother trying to improve ILP in lower RP regions if occupancy has not
1077 // been dropped. All regions will have already been scheduled with the ideal
1078 // occupancy targets.
1079 if (DAG.StartingOccupancy <= DAG.MinOccupancy)
1080 return false;
1081
1082 LLVM_DEBUG(
1083 dbgs() << "Retrying function scheduling with lowest recorded occupancy "
1084 << DAG.MinOccupancy << ".\n");
1085 return true;
1086}
1087
/// Allows easy filtering of this stage's debug output.
1089#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
1090
1091bool PreRARematStage::initGCNSchedStage() {
  // FIXME: This pass will invalidate the cached BBLiveInMap and MBBLiveIns for
  // regions in between the defs and the region we sank the def to. This will
  // need to be fixed if there is another pass after this one.
1095 assert(!S.hasNextStage());
1096
1097 if (!GCNSchedStage::initGCNSchedStage() || DAG.RegionsWithMinOcc.none() ||
1098 DAG.Regions.size() == 1)
1099 return false;
1100
1101 // Before performing any IR modification record the parent region of each MI
1102 // and the parent MBB of each region.
1103 const unsigned NumRegions = DAG.Regions.size();
1104 RegionBB.reserve(N: NumRegions);
1105 for (unsigned I = 0; I < NumRegions; ++I) {
1106 RegionBoundaries Region = DAG.Regions[I];
1107 for (auto MI = Region.first; MI != Region.second; ++MI)
1108 MIRegion.insert(KV: {&*MI, I});
1109 RegionBB.push_back(Elt: Region.first->getParent());
1110 }
1111
1112 if (!canIncreaseOccupancyOrReduceSpill())
1113 return false;
1114
1115 // Rematerialize identified instructions and update scheduler's state.
1116 rematerialize();
1117 if (GCNTrackers)
1118 DAG.RegionLiveOuts.buildLiveRegMap();
1119 REMAT_DEBUG(
1120 dbgs() << "Retrying function scheduling with new min. occupancy of "
1121 << AchievedOcc << " from rematerializing (original was "
1122 << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n");
1123 if (AchievedOcc > DAG.MinOccupancy) {
1124 DAG.MinOccupancy = AchievedOcc;
1125 SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
1126 MFI.increaseOccupancy(MF, Limit: DAG.MinOccupancy);
1127 }
1128 return true;
1129}
1130
1131void GCNSchedStage::finalizeGCNSchedStage() {
1132 DAG.finishBlock();
1133 LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
1134}
1135
1136void UnclusteredHighRPStage::finalizeGCNSchedStage() {
1137 SavedMutations.swap(x&: DAG.Mutations);
1138 S.SGPRLimitBias = S.VGPRLimitBias = 0;
1139 if (DAG.MinOccupancy > InitialOccupancy) {
1140 for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
1141 DAG.RegionsWithMinOcc[IDX] =
1142 DAG.Pressure[IDX].getOccupancy(
1143 ST: DAG.ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()) == DAG.MinOccupancy;
1144
1145 LLVM_DEBUG(dbgs() << StageID
1146 << " stage successfully increased occupancy to "
1147 << DAG.MinOccupancy << '\n');
1148 }
1149
1150 GCNSchedStage::finalizeGCNSchedStage();
1151}
1152
1153bool GCNSchedStage::initGCNRegion() {
1154 // Check whether this new region is also a new block.
1155 if (DAG.RegionBegin->getParent() != CurrentMBB)
1156 setupNewBlock();
1157
1158 unsigned NumRegionInstrs = std::distance(first: DAG.begin(), last: DAG.end());
1159 DAG.enterRegion(bb: CurrentMBB, begin: DAG.begin(), end: DAG.end(), regioninstrs: NumRegionInstrs);
1160
1161 // Skip empty scheduling regions (0 or 1 schedulable instructions).
1162 if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(x: DAG.end()))
1163 return false;
1164
1165 LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
1166 LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
1167 << " " << CurrentMBB->getName()
1168 << "\n From: " << *DAG.begin() << " To: ";
1169 if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
1170 else dbgs() << "End";
1171 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
1172
1173 // Save original instruction order before scheduling for possible revert.
1174 Unsched.clear();
1175 Unsched.reserve(n: DAG.NumRegionInstrs);
1176 if (StageID == GCNSchedStageID::OccInitialSchedule ||
1177 StageID == GCNSchedStageID::ILPInitialSchedule) {
1178 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
1179 for (auto &I : DAG) {
1180 Unsched.push_back(x: &I);
1181 if (SII->isIGLPMutationOnly(Opcode: I.getOpcode()))
1182 DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
1183 }
1184 } else {
1185 for (auto &I : DAG)
1186 Unsched.push_back(x: &I);
1187 }
1188
1189 PressureBefore = DAG.Pressure[RegionIdx];
1190
1191 LLVM_DEBUG(
1192 dbgs() << "Pressure before scheduling:\nRegion live-ins:"
1193 << print(DAG.LiveIns[RegionIdx], DAG.MRI)
1194 << "Region live-in pressure: "
1195 << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
1196 << "Region register pressure: " << print(PressureBefore));
1197
1198 S.HasHighPressure = false;
1199 S.KnownExcessRP = isRegionWithExcessRP();
1200
1201 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1202 StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
1203 SavedMutations.clear();
1204 SavedMutations.swap(x&: DAG.Mutations);
1205 bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
1206 StageID == GCNSchedStageID::ILPInitialSchedule;
1207 DAG.addMutation(Mutation: createIGroupLPDAGMutation(
1208 Phase: IsInitialStage ? AMDGPU::SchedulingPhase::Initial
1209 : AMDGPU::SchedulingPhase::PreRAReentry));
1210 }
1211
1212 return true;
1213}
1214
1215bool UnclusteredHighRPStage::initGCNRegion() {
1216 // Only reschedule regions with the minimum occupancy or regions that may have
1217 // spilling (excess register pressure).
1218 if ((!DAG.RegionsWithMinOcc[RegionIdx] ||
1219 DAG.MinOccupancy <= InitialOccupancy) &&
1220 !DAG.RegionsWithExcessRP[RegionIdx])
1221 return false;
1222
1223 return GCNSchedStage::initGCNRegion();
1224}
1225
1226bool ClusteredLowOccStage::initGCNRegion() {
  // We may need to reschedule this region if it wasn't rescheduled in the last
  // stage, or if we found it was testing critical register pressure limits in
  // the unclustered reschedule stage. The latter is because we may not have
  // been able to raise the min occupancy in the previous stage, so the region
  // may be overly constrained even if it was already rescheduled.
1232 if (!DAG.RegionsWithHighRP[RegionIdx])
1233 return false;
1234
1235 return GCNSchedStage::initGCNRegion();
1236}
1237
1238bool PreRARematStage::initGCNRegion() {
1239 return RescheduleRegions[RegionIdx] && GCNSchedStage::initGCNRegion();
1240}
1241
1242void GCNSchedStage::setupNewBlock() {
1243 if (CurrentMBB)
1244 DAG.finishBlock();
1245
1246 CurrentMBB = DAG.RegionBegin->getParent();
1247 DAG.startBlock(bb: CurrentMBB);
  // Get the real RP for the region if it hasn't been calculated before. After
  // the initial schedule stage, real RP will be collected after scheduling.
1250 if (StageID == GCNSchedStageID::OccInitialSchedule ||
1251 StageID == GCNSchedStageID::ILPInitialSchedule ||
1252 StageID == GCNSchedStageID::MemoryClauseInitialSchedule)
1253 DAG.computeBlockPressure(RegionIdx, MBB: CurrentMBB);
1254}
1255
1256void GCNSchedStage::finalizeGCNRegion() {
1257 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1258 if (S.HasHighPressure)
1259 DAG.RegionsWithHighRP[RegionIdx] = true;
1260
1261 // Revert scheduling if we have dropped occupancy or there is some other
1262 // reason that the original schedule is better.
1263 checkScheduling();
1264
1265 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1266 StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
1267 SavedMutations.swap(x&: DAG.Mutations);
1268
1269 DAG.exitRegion();
1270 advanceRegion();
1271}
1272
1273void GCNSchedStage::checkScheduling() {
1274 // Check the results of scheduling.
1275 PressureAfter = DAG.getRealRegPressure(RegionIdx);
1276
1277 LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
1278 LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
1279
1280 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1281
1282 if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
1283 PressureAfter.getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
1284 DAG.Pressure[RegionIdx] = PressureAfter;
1285 DAG.RegionsWithMinOcc[RegionIdx] =
1286 PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
1287 DAG.MinOccupancy;
1288
1289 // Early out if we have achieved the occupancy target.
1290 LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
1291 return;
1292 }
1293
1294 unsigned TargetOccupancy = std::min(
1295 a: S.getTargetOccupancy(), b: ST.getOccupancyWithWorkGroupSizes(MF).second);
1296 unsigned WavesAfter = std::min(
1297 a: TargetOccupancy, b: PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
1298 unsigned WavesBefore = std::min(
1299 a: TargetOccupancy, b: PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
1300 LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
1301 << ", after " << WavesAfter << ".\n");
1302
1303 // We may not be able to keep the current target occupancy because of the just
1304 // scheduled region. We might still be able to revert scheduling if the
1305 // occupancy before was higher, or if the current schedule has register
1306 // pressure higher than the excess limits which could lead to more spilling.
1307 unsigned NewOccupancy = std::max(a: WavesAfter, b: WavesBefore);
1308
1309 // Allow memory bound functions to drop to 4 waves if not limited by an
1310 // attribute.
1311 if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
1312 WavesAfter >= MFI.getMinAllowedOccupancy()) {
1313 LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
1314 << MFI.getMinAllowedOccupancy() << " waves\n");
1315 NewOccupancy = WavesAfter;
1316 }
1317
1318 if (NewOccupancy < DAG.MinOccupancy) {
1319 DAG.MinOccupancy = NewOccupancy;
1320 MFI.limitOccupancy(Limit: DAG.MinOccupancy);
1321 DAG.RegionsWithMinOcc.reset();
1322 LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
1323 << DAG.MinOccupancy << ".\n");
1324 }
  // The maximum number of arch VGPRs on a non-unified register file, or the
  // maximum number of VGPRs + AGPRs in the unified register file case.
1327 unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
  // The maximum number of arch VGPRs for both the unified and non-unified
  // register file cases.
1330 unsigned MaxArchVGPRs = std::min(a: MaxVGPRs, b: ST.getAddressableNumArchVGPRs());
1331 unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
1332
1333 if (PressureAfter.getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts()) > MaxVGPRs ||
1334 PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
1335 PressureAfter.getAGPRNum() > MaxArchVGPRs ||
1336 PressureAfter.getSGPRNum() > MaxSGPRs) {
1337 DAG.RegionsWithHighRP[RegionIdx] = true;
1338 DAG.RegionsWithExcessRP[RegionIdx] = true;
1339 }
1340
1341 // Revert if this region's schedule would cause a drop in occupancy or
1342 // spilling.
1343 if (shouldRevertScheduling(WavesAfter)) {
1344 revertScheduling();
1345 } else {
1346 DAG.Pressure[RegionIdx] = PressureAfter;
1347 DAG.RegionsWithMinOcc[RegionIdx] =
1348 PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
1349 DAG.MinOccupancy;
1350 }
1351}
1352
1353unsigned
1354GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1355 DenseMap<unsigned, unsigned> &ReadyCycles,
1356 const TargetSchedModel &SM) {
1357 unsigned ReadyCycle = CurrCycle;
1358 for (auto &D : SU.Preds) {
1359 if (D.isAssignedRegDep()) {
1360 MachineInstr *DefMI = D.getSUnit()->getInstr();
1361 unsigned Latency = SM.computeInstrLatency(MI: DefMI);
1362 unsigned DefReady = ReadyCycles[DAG.getSUnit(MI: DefMI)->NodeNum];
1363 ReadyCycle = std::max(a: ReadyCycle, b: DefReady + Latency);
1364 }
1365 }
1366 ReadyCycles[SU.NodeNum] = ReadyCycle;
1367 return ReadyCycle;
1368}
1369
1370#ifndef NDEBUG
1371struct EarlierIssuingCycle {
1372 bool operator()(std::pair<MachineInstr *, unsigned> A,
1373 std::pair<MachineInstr *, unsigned> B) const {
1374 return A.second < B.second;
1375 }
1376};
1377
1378static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
1379 EarlierIssuingCycle> &ReadyCycles) {
1380 if (ReadyCycles.empty())
1381 return;
1382 unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
1383 dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
1384 << " ##################\n# Cycle #\t\t\tInstruction "
1385 " "
1386 " \n";
1387 unsigned IPrev = 1;
1388 for (auto &I : ReadyCycles) {
1389 if (I.second > IPrev + 1)
1390 dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
1391 << " CYCLES DETECTED ******************************\n\n";
1392 dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
1393 IPrev = I.second;
1394 }
1395}
1396#endif
1397
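/// Estimates schedule quality with a simple in-order model: each instruction
/// issues one cycle after the previous one, but no earlier than when all of
/// its register-dependence predecessors have completed their latency. The
/// accumulated stall cycles ("bubbles") relative to the total cycle count
/// form the metric.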
1398ScheduleMetrics
1399GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
1400#ifndef NDEBUG
1401 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1402 ReadyCyclesSorted;
1403#endif
1404 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1405 unsigned SumBubbles = 0;
1406 DenseMap<unsigned, unsigned> ReadyCycles;
1407 unsigned CurrCycle = 0;
1408 for (auto &SU : InputSchedule) {
1409 unsigned ReadyCycle =
1410 computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
1411 SumBubbles += ReadyCycle - CurrCycle;
1412#ifndef NDEBUG
1413 ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
1414#endif
1415 CurrCycle = ++ReadyCycle;
1416 }
1417#ifndef NDEBUG
1418 LLVM_DEBUG(
1419 printScheduleModel(ReadyCyclesSorted);
1420 dbgs() << "\n\t"
1421 << "Metric: "
1422 << (SumBubbles
1423 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1424 : 1)
1425 << "\n\n");
1426#endif
1427
1428 return ScheduleMetrics(CurrCycle, SumBubbles);
1429}
1430
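// Same computation as above, but walks the instructions currently emitted in
// the scheduling region (the new schedule) instead of an SUnit vector, and
// skips instructions that have no SUnit, such as debug instructions.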
1431ScheduleMetrics
1432GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
1433#ifndef NDEBUG
1434 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1435 ReadyCyclesSorted;
1436#endif
1437 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1438 unsigned SumBubbles = 0;
1439 DenseMap<unsigned, unsigned> ReadyCycles;
1440 unsigned CurrCycle = 0;
1441 for (auto &MI : DAG) {
1442 SUnit *SU = DAG.getSUnit(MI: &MI);
1443 if (!SU)
1444 continue;
1445 unsigned ReadyCycle =
1446 computeSUnitReadyCycle(SU: *SU, CurrCycle, ReadyCycles, SM);
1447 SumBubbles += ReadyCycle - CurrCycle;
1448#ifndef NDEBUG
1449 ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
1450#endif
1451 CurrCycle = ++ReadyCycle;
1452 }
1453#ifndef NDEBUG
1454 LLVM_DEBUG(
1455 printScheduleModel(ReadyCyclesSorted);
1456 dbgs() << "\n\t"
1457 << "Metric: "
1458 << (SumBubbles
1459 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1460 : 1)
1461 << "\n\n");
1462#endif
1463
1464 return ScheduleMetrics(CurrCycle, SumBubbles);
1465}
1466
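// Base revert policy shared by all stages: revert when the region's new
// schedule would drop occupancy below the function's current minimum, or, in
// dynamic VGPR mode, when it would require more VGPR blocks than before.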
1467bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
1468 if (WavesAfter < DAG.MinOccupancy)
1469 return true;
1470
1471 // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
1472 if (DAG.MFI.isDynamicVGPREnabled()) {
1473 unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
1474 STI: &ST, NumVGPRs: DAG.MFI.getDynamicVGPRBlockSize(),
1475 DynamicVGPRBlockSize: PressureBefore.getVGPRNum(UnifiedVGPRFile: false));
1476 unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
1477 STI: &ST, NumVGPRs: DAG.MFI.getDynamicVGPRBlockSize(),
1478 DynamicVGPRBlockSize: PressureAfter.getVGPRNum(UnifiedVGPRFile: false));
1479 if (BlocksAfter > BlocksBefore)
1480 return true;
1481 }
1482
1483 return false;
1484}
1485
1486bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1487 if (PressureAfter == PressureBefore)
1488 return false;
1489
1490 if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1491 return true;
1492
1493 if (mayCauseSpilling(WavesAfter))
1494 return true;
1495
1496 return false;
1497}
1498
1499bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
1500 // If RP is not reduced in the unclustered reschedule stage, revert to the
1501 // old schedule.
1502 if ((WavesAfter <=
1503 PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()) &&
1504 mayCauseSpilling(WavesAfter)) ||
1505 GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
1506 LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
1507 return true;
1508 }
1509
1510 // Do not attempt to relax schedule even more if we are already spilling.
1511 if (isRegionWithExcessRP())
1512 return false;
1513
1514 LLVM_DEBUG(
1515 dbgs()
1516 << "\n\t *** In shouldRevertScheduling ***\n"
1517 << " *********** BEFORE UnclusteredHighRPStage ***********\n");
1518 ScheduleMetrics MBefore = getScheduleMetrics(InputSchedule: DAG.SUnits);
1519 LLVM_DEBUG(
1520 dbgs()
1521 << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
1522 ScheduleMetrics MAfter = getScheduleMetrics(DAG);
1523 unsigned OldMetric = MBefore.getMetric();
1524 unsigned NewMetric = MAfter.getMetric();
1525 unsigned WavesBefore = std::min(
1526 a: S.getTargetOccupancy(),
1527 b: PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()));
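  // Profit is a fixed-point product of the occupancy ratio and the inverse
  // latency-metric ratio, scaled by ScaleFactor (100):
  //   Profit ~= (WavesAfter / WavesBefore) * (OldMetric + Bias) / NewMetric
  // We revert when Profit < ScaleFactor, i.e. when the combined ratio drops
  // below 1.0. Illustrative example (made-up numbers): WavesBefore = 4,
  // WavesAfter = 5, OldMetric = 20, NewMetric = 30, Bias = 10 (default) gives
  // ((5 * 100) / 4 * ((20 + 10) * 100) / 30) / 100 = 125 >= 100, so the
  // unclustered schedule is kept.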
1528 unsigned Profit =
1529 ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
1530 ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
1531 NewMetric) /
1532 ScheduleMetrics::ScaleFactor;
1533 LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
1534 << MAfter << "Profit: " << Profit << "\n");
1535 return Profit < ScheduleMetrics::ScaleFactor;
1536}
1537
1538bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
1539 if (PressureAfter == PressureBefore)
1540 return false;
1541
1542 if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1543 return true;
1544
1545 if (mayCauseSpilling(WavesAfter))
1546 return true;
1547
1548 return false;
1549}
1550
1551bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
1552 return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
1553 mayCauseSpilling(WavesAfter) ||
1554 (IncreaseOccupancy && WavesAfter < TargetOcc);
1555}
1556
1557bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1558 if (mayCauseSpilling(WavesAfter))
1559 return true;
1560
1561 return false;
1562}
1563
1564bool MemoryClauseInitialScheduleStage::shouldRevertScheduling(
1565 unsigned WavesAfter) {
1566 return mayCauseSpilling(WavesAfter);
1567}
1568
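// Heuristic: the new schedule is likely to spill when occupancy is already at
// or below the minimum waves per EU, the region exceeds its register limits,
// and the new pressure is not strictly lower than the old one.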
1569bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
1570 if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
1571 !PressureAfter.less(MF, O: PressureBefore)) {
1572 LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
1573 return true;
1574 }
1575
1576 return false;
1577}
1578
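// Restores the region to its pre-scheduling instruction order (saved in
// Unsched): instructions are spliced back one by one, LiveIntervals and
// undef/dead flags are updated, debug values are re-placed, and the region
// boundaries are recomputed.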
1579void GCNSchedStage::revertScheduling() {
1580 DAG.RegionsWithMinOcc[RegionIdx] =
1581 PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()) ==
1582 DAG.MinOccupancy;
1583 LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
1584 DAG.RegionEnd = DAG.RegionBegin;
1585 int SkippedDebugInstr = 0;
1586 for (MachineInstr *MI : Unsched) {
1587 if (MI->isDebugInstr()) {
1588 ++SkippedDebugInstr;
1589 continue;
1590 }
1591
1592 if (MI->getIterator() != DAG.RegionEnd) {
1593 DAG.BB->splice(Where: DAG.RegionEnd, Other: DAG.BB, From: MI);
1594 if (!MI->isDebugInstr())
1595 DAG.LIS->handleMove(MI&: *MI, UpdateFlags: true);
1596 }
1597
1598 // Reset read-undef flags and update them later.
1599 for (auto &Op : MI->all_defs())
1600 Op.setIsUndef(false);
1601 RegisterOperands RegOpers;
1602 RegOpers.collect(MI: *MI, TRI: *DAG.TRI, MRI: DAG.MRI, TrackLaneMasks: DAG.ShouldTrackLaneMasks, IgnoreDead: false);
1603 if (!MI->isDebugInstr()) {
1604 if (DAG.ShouldTrackLaneMasks) {
1605 // Adjust liveness and add missing dead+read-undef flags.
1606 SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(Instr: *MI).getRegSlot();
1607 RegOpers.adjustLaneLiveness(LIS: *DAG.LIS, MRI: DAG.MRI, Pos: SlotIdx, AddFlagsMI: MI);
1608 } else {
1609 // Adjust for missing dead-def flags.
1610 RegOpers.detectDeadDefs(MI: *MI, LIS: *DAG.LIS);
1611 }
1612 }
1613 DAG.RegionEnd = MI->getIterator();
1614 ++DAG.RegionEnd;
1615 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
1616 }
1617
1618 // After reverting the schedule, debug instructions will now be at the end of
1619 // the block and RegionEnd will point to the first debug instruction. Increment
1620 // RegionEnd past the debug instructions to the actual end of the region.
1621 while (SkippedDebugInstr-- > 0)
1622 ++DAG.RegionEnd;
1623
1624 // If Unsched.front() instruction is a debug instruction, this will actually
1625 // shrink the region since we moved all debug instructions to the end of the
1626 // block. Find the first instruction that is not a debug instruction.
1627 DAG.RegionBegin = Unsched.front()->getIterator();
1628 if (DAG.RegionBegin->isDebugInstr()) {
1629 for (MachineInstr *MI : Unsched) {
1630 if (MI->isDebugInstr())
1631 continue;
1632 DAG.RegionBegin = MI->getIterator();
1633 break;
1634 }
1635 }
1636
1637 // Then move the debug instructions back into their correct place and set
1638 // RegionBegin and RegionEnd if needed.
1639 DAG.placeDebugValues();
1640
1641 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1642}
1643
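// Returns true if every register read by InstToRemat holds the same value at
// RematIdx as it does at OriginalIdx (same main-range value number and live
// subranges), meaning the instruction can be rematerialized at RematIdx
// without extending any live range. Physical-register uses are only accepted
// when the register is constant or the use is ignorable.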
1644bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
1645 SlotIndex OriginalIdx,
1646 SlotIndex RematIdx) const {
1647
1648 LiveIntervals *LIS = DAG.LIS;
1649 MachineRegisterInfo &MRI = DAG.MRI;
1650 OriginalIdx = OriginalIdx.getRegSlot(EC: true);
1651 RematIdx = std::max(a: RematIdx, b: RematIdx.getRegSlot(EC: true));
1652 for (const MachineOperand &MO : InstToRemat->operands()) {
1653 if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
1654 continue;
1655
1656 if (!MO.getReg().isVirtual()) {
1657 // Do not attempt to reason about PhysRegs.
1658 // TODO: better analysis of PhysReg liveness
1659 if (!DAG.MRI.isConstantPhysReg(PhysReg: MO.getReg()) &&
1660 !DAG.TII->isIgnorableUse(MO))
1661 return false;
1662
1663 // Constant PhysRegs and IgnorableUses are okay
1664 continue;
1665 }
1666
1667 LiveInterval &LI = LIS->getInterval(Reg: MO.getReg());
1668 const VNInfo *OVNI = LI.getVNInfoAt(Idx: OriginalIdx);
1669 assert(OVNI);
1670
1671 // Don't allow rematerialization immediately after the original def.
1672 // It would be incorrect if InstToRemat redefines the register.
1673 // See PR14098.
1674 if (SlotIndex::isSameInstr(A: OriginalIdx, B: RematIdx))
1675 return false;
1676
1677 if (OVNI != LI.getVNInfoAt(Idx: RematIdx))
1678 return false;
1679
1680 // Check that subrange is live at RematIdx.
1681 if (LI.hasSubRanges()) {
1682 const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
1683 unsigned SubReg = MO.getSubReg();
1684 LaneBitmask LM = SubReg ? TRI->getSubRegIndexLaneMask(SubIdx: SubReg)
1685 : MRI.getMaxLaneMaskForVReg(Reg: MO.getReg());
1686 for (LiveInterval::SubRange &SR : LI.subranges()) {
1687 if ((SR.LaneMask & LM).none())
1688 continue;
1689 if (!SR.liveAt(index: RematIdx))
1690 return false;
1691
1692 // Early exit if all used lanes are checked. No need to continue.
1693 LM &= ~SR.LaneMask;
1694 if (LM.none())
1695 break;
1696 }
1697 }
1698 }
1699 return true;
1700}
1701
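// Builds a per-region register-pressure target (eliminate spilling if any
// region spills, otherwise reach the current minimum occupancy plus one) and
// then collects trivially rematerializable, single-def/single-use instructions
// whose sinking to their use is expected to lower pressure in those regions.
// Returns true when the collected candidates are estimated to satisfy every
// target (occupancy mode) or to reduce spilling at all (spill mode).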
1702bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
1703 REMAT_DEBUG({
1704 dbgs() << "Collecting rematerializable instructions in ";
1705 MF.getFunction().printAsOperand(dbgs(), false);
1706 dbgs() << '\n';
1707 });
1708
1709 // Maps optimizable regions (i.e., regions at minimum and register-limited
1710 // occupancy, or regions with spilling) to the target RP we would like to
1711 // reach.
1712 DenseMap<unsigned, GCNRPTarget> OptRegions;
1713 const Function &F = MF.getFunction();
1714 unsigned DynamicVGPRBlockSize =
1715 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
1716
1717 std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F);
1718 const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F);
1719 const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F);
1720 const unsigned MaxSGPRsIncOcc =
1721 ST.getMaxNumSGPRs(WavesPerEU: DAG.MinOccupancy + 1, Addressable: false);
1722 const unsigned MaxVGPRsIncOcc =
1723 ST.getMaxNumVGPRs(WavesPerEU: DAG.MinOccupancy + 1, DynamicVGPRBlockSize);
1724 IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
1725
1726 // Collect optimizable regions. If there is spilling in any region we will
1727 // just try to reduce spilling. Otherwise we will try to increase occupancy by
1728 // one in the whole function.
1729 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1730 GCNRegPressure &RP = DAG.Pressure[I];
1731 // We allow ArchVGPR or AGPR savings to count as savings of the other kind
1732 // of VGPR only when trying to eliminate spilling. We cannot do this when
1733 // trying to increase occupancy since VGPR class swaps only occur later in
1734 // the register allocator, i.e., the scheduler will not be able to reason
1735 // about these savings and will not report an increase in the achievable
1736 // occupancy, triggering rollbacks.
1737 GCNRPTarget Target(MaxSGPRsNoSpill, MaxVGPRsNoSpill, MF, RP,
1738 /*CombineVGPRSavings=*/true);
1739 if (!Target.satisfied() && IncreaseOccupancy) {
1740 // There is spilling in the region and we were so far trying to increase
1741 // occupancy. Stop trying that and focus on reducing spilling.
1742 IncreaseOccupancy = false;
1743 OptRegions.clear();
1744 } else if (IncreaseOccupancy) {
1745 // There is no spilling in the region, try to increase occupancy.
1746 Target = GCNRPTarget(MaxSGPRsIncOcc, MaxVGPRsIncOcc, MF, RP,
1747 /*CombineVGPRSavings=*/false);
1748 }
1749 if (!Target.satisfied())
1750 OptRegions.insert(KV: {I, Target});
1751 }
1752 if (OptRegions.empty())
1753 return false;
1754
1755#ifndef NDEBUG
1756 if (IncreaseOccupancy) {
1757 REMAT_DEBUG(dbgs() << "Occupancy minimal (" << DAG.MinOccupancy
1758 << ") in regions:\n");
1759 } else {
1760 REMAT_DEBUG(dbgs() << "Spilling w.r.t. minimum target occupancy ("
1761 << WavesPerEU.first << ") in regions:\n");
1762 }
1763 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1764 if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end())
1765 REMAT_DEBUG(dbgs() << " [" << I << "] " << OptIt->getSecond() << '\n');
1766 }
1767#endif
1768
1769 // When we are reducing spilling, the target is the minimum target number of
1770 // waves/EU determined by the subtarget. In cases where either one of
1771 // "amdgpu-num-sgpr" or "amdgpu-num-vgpr" is set on the function, the current
1772 // minimum region occupancy may be higher than that subtarget minimum.
1773 TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1
1774 : std::max(a: DAG.MinOccupancy, b: WavesPerEU.first);
1775
1776 // Accounts for a reduction in RP in an optimizable region. Returns whether we
1777 // estimate that we have identified enough rematerialization opportunities to
1778 // achieve our goal, and sets Progress to true when this particular reduction
1779 // in pressure was helpful toward that goal.
1780 auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask,
1781 bool &Progress) -> bool {
1782 GCNRPTarget &Target = OptIt->getSecond();
1783 if (!Target.isSaveBeneficial(Reg, MRI: DAG.MRI))
1784 return false;
1785 Progress = true;
1786 Target.saveReg(Reg, Mask, MRI: DAG.MRI);
1787 if (Target.satisfied())
1788 OptRegions.erase(OptIt->getFirst());
1789 return OptRegions.empty();
1790 };
1791
1792 // We need up-to-date live-out info to query live-out register masks in
1793 // regions containing rematerializable instructions.
1794 DAG.RegionLiveOuts.buildLiveRegMap();
1795
1796 // Cache set of registers that are going to be rematerialized.
1797 DenseSet<unsigned> RematRegs;
1798
1799 // Identify rematerializable instructions in the function.
1800 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1801 auto Region = DAG.Regions[I];
1802 for (auto MI = Region.first; MI != Region.second; ++MI) {
1803 // The instruction must be trivially rematerializable.
1804 MachineInstr &DefMI = *MI;
1805 if (!isTriviallyReMaterializable(MI: DefMI))
1806 continue;
1807
1808 // We only support rematerializing virtual registers with one definition.
1809 Register Reg = DefMI.getOperand(i: 0).getReg();
1810 if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(RegNo: Reg))
1811 continue;
1812
1813 // We only care to rematerialize the instruction if it has a single
1814 // non-debug user in a different region. The using MI may not belong to a
1815 // region if it is a lone region terminator.
1816 MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(RegNo: Reg);
1817 if (!UseMI)
1818 continue;
1819 auto UseRegion = MIRegion.find(Val: UseMI);
1820 if (UseRegion != MIRegion.end() && UseRegion->second == I)
1821 continue;
1822
1823 // Do not rematerialize an instruction if it uses or is used by an
1824 // instruction that we have designated for rematerialization.
1825 // FIXME: Allow for rematerialization chains: this requires 1. updating
1826 // remat points to account for uses that are rematerialized, and 2. either
1827 // rematerializing the candidates in careful ordering, or deferring the
1828 // MBB RP walk until the entire chain has been rematerialized.
1829 if (Rematerializations.contains(Key: UseMI) ||
1830 llvm::any_of(Range: DefMI.operands(), P: [&RematRegs](MachineOperand &MO) {
1831 return MO.isReg() && RematRegs.contains(V: MO.getReg());
1832 }))
1833 continue;
1834
1835 // Do not rematerialize an instruction if it uses registers that aren't
1836 // available at its use. This ensures that we are not extending any live
1837 // range while rematerializing.
1838 SlotIndex DefIdx = DAG.LIS->getInstructionIndex(Instr: DefMI);
1839 SlotIndex UseIdx = DAG.LIS->getInstructionIndex(Instr: *UseMI).getRegSlot(EC: true);
1840 if (!allUsesAvailableAt(InstToRemat: &DefMI, OriginalIdx: DefIdx, RematIdx: UseIdx))
1841 continue;
1842
1843 REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
1844 RematInstruction &Remat =
1845 Rematerializations.try_emplace(Key: &DefMI, Args&: UseMI).first->second;
1846
1847 bool RematUseful = false;
1848 if (auto It = OptRegions.find(Val: I); It != OptRegions.end()) {
1849 // Optimistically consider that moving the instruction out of its
1850 // defining region will reduce RP in the latter; this assumes that
1851 // maximum RP in the region is reached somewhere between the defining
1852 // instruction and the end of the region.
1853 REMAT_DEBUG(dbgs() << " Defining region is optimizable\n");
1854 LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: I)[Reg];
1855 if (ReduceRPInRegion(It, Reg, Mask, RematUseful))
1856 return true;
1857 }
1858
1859 for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
1860 // We are only collecting regions in which the register is a live-in
1861 // (and may be live-through).
1862 auto It = DAG.LiveIns[LIRegion].find(Val: Reg);
1863 if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
1864 continue;
1865 Remat.LiveInRegions.insert(V: LIRegion);
1866
1867 // Account for the reduction in RP due to the rematerialization in an
1868 // optimizable region in which the defined register is a live-in. This
1869 // is exact for live-through regions but optimistic in the using region,
1870 // where RP is actually reduced only if maximum RP is reached somewhere
1871 // between the beginning of the region and the rematerializable
1872 // instruction's use.
1873 if (auto It = OptRegions.find(Val: LIRegion); It != OptRegions.end()) {
1874 REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n');
1875 if (ReduceRPInRegion(It, Reg, DAG.LiveIns[LIRegion][Reg],
1876 RematUseful))
1877 return true;
1878 }
1879 }
1880
1881 // If the instruction is not a live-in or live-out in any optimizable
1882 // region then there is no point in rematerializing it.
1883 if (!RematUseful) {
1884 Rematerializations.pop_back();
1885 REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n");
1886 } else {
1887 RematRegs.insert(V: Reg);
1888 }
1889 }
1890 }
1891
1892 if (IncreaseOccupancy) {
1893 // We were trying to increase occupancy but failed, abort the stage.
1894 REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
1895 Rematerializations.clear();
1896 return false;
1897 }
1898 REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n");
1899 return !Rematerializations.empty();
1900}
1901
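// Performs the rematerializations collected by
// canIncreaseOccupancyOrReduceSpill: each candidate is re-emitted right before
// its single user, LiveIntervals and region boundaries are updated, and
// per-region live-ins and pressure are adjusted, recomputing pressure where
// its evolution cannot be predicted. The achieved occupancy is recorded for
// the stage's finalization step.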
1902void PreRARematStage::rematerialize() {
1903 const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
1904
1905 // Collect regions whose RP changes in an unpredictable way; we will have to
1906 // fully recompute their RP after all rematerializations.
1907 DenseSet<unsigned> RecomputeRP;
1908
1909 // Rematerialize all instructions.
1910 for (auto &[DefMI, Remat] : Rematerializations) {
1911 MachineBasicBlock::iterator InsertPos(Remat.UseMI);
1912 Register Reg = DefMI->getOperand(i: 0).getReg();
1913 unsigned SubReg = DefMI->getOperand(i: 0).getSubReg();
1914 unsigned DefRegion = MIRegion.at(Val: DefMI);
1915
1916 // Rematerialize DefMI to its use block.
1917 TII->reMaterialize(MBB&: *InsertPos->getParent(), MI: InsertPos, DestReg: Reg, SubIdx: SubReg, Orig: *DefMI,
1918 TRI: *DAG.TRI);
1919 Remat.RematMI = &*std::prev(x: InsertPos);
1920 Remat.RematMI->getOperand(i: 0).setSubReg(SubReg);
1921 DAG.LIS->InsertMachineInstrInMaps(MI&: *Remat.RematMI);
1922
1923 // Update region boundaries in regions we sank from (remove the defining MI)
1924 // and to (insert the MI rematerialized in the use block). Only then can we
1925 // erase the original MI.
1926 DAG.updateRegionBoundaries(RegionBounds&: DAG.Regions[DefRegion], MI: DefMI, NewMI: nullptr);
1927 auto UseRegion = MIRegion.find(Val: Remat.UseMI);
1928 if (UseRegion != MIRegion.end()) {
1929 DAG.updateRegionBoundaries(RegionBounds&: DAG.Regions[UseRegion->second], MI: InsertPos,
1930 NewMI: Remat.RematMI);
1931 }
1932 DAG.LIS->RemoveMachineInstrFromMaps(MI&: *DefMI);
1933 DefMI->eraseFromParent();
1934
1935 // Collect all regions impacted by the rematerialization and update their
1936 // live-in/RP information.
1937 for (unsigned I : Remat.LiveInRegions) {
1938 ImpactedRegions.insert(KV: {I, DAG.Pressure[I]});
1939 GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
1940
1941#ifdef EXPENSIVE_CHECKS
1942 // All uses are known to be available / live at the remat point. Thus, the
1943 // uses should already be live-in to the region.
1944 for (MachineOperand &MO : DefMI->operands()) {
1945 if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
1946 continue;
1947
1948 Register UseReg = MO.getReg();
1949 if (!UseReg.isVirtual())
1950 continue;
1951
1952 LiveInterval &LI = DAG.LIS->getInterval(UseReg);
1953 LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
1954 if (LI.hasSubRanges() && MO.getSubReg())
1955 LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
1956
1957 LaneBitmask LiveInMask = RegionLiveIns.at(UseReg);
1958 LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
1959 // If this register has lanes not covered by the LiveIns, be sure they
1960 // do not map to any subrange. ref:
1961 // machine-scheduler-sink-trivial-remats.mir::omitted_subrange
1962 if (UncoveredLanes.any()) {
1963 assert(LI.hasSubRanges());
1964 for (LiveInterval::SubRange &SR : LI.subranges())
1965 assert((SR.LaneMask & UncoveredLanes).none());
1966 }
1967 }
1968#endif
1969
1970 // The register is no longer a live-in in all regions but the one that
1971 // contains the single use. In live-through regions, maximum register
1972 // pressure decreases predictably so we can directly update it. In the
1973 // using region, maximum RP may or may not decrease, so we will mark it
1974 // for re-computation after all rematerializations have taken place.
1975 LaneBitmask PrevMask = RegionLiveIns[Reg];
1976 RegionLiveIns.erase(Val: Reg);
1977 RegMasks.insert(KV: {{I, Remat.RematMI->getOperand(i: 0).getReg()}, PrevMask});
1978 if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent())
1979 DAG.Pressure[I].inc(Reg, PrevMask, NewMask: LaneBitmask::getNone(), MRI: DAG.MRI);
1980 else
1981 RecomputeRP.insert(V: I);
1982 }
1983 // RP in the region from which the instruction was rematerialized may or may
1984 // not decrease.
1985 ImpactedRegions.insert(KV: {DefRegion, DAG.Pressure[DefRegion]});
1986 RecomputeRP.insert(V: DefRegion);
1987
1988 // Recompute live interval to reflect the register's rematerialization.
1989 Register RematReg = Remat.RematMI->getOperand(i: 0).getReg();
1990 DAG.LIS->removeInterval(Reg: RematReg);
1991 DAG.LIS->createAndComputeVirtRegInterval(Reg: RematReg);
1992 }
1993
1994 // All regions impacted by at least one rematerialization must be rescheduled.
1995 // Maximum pressure must also be recomputed for all regions where it changed
1996 // unpredictably and checked against the target occupancy.
1997 AchievedOcc = TargetOcc;
1998 for (auto &[I, OriginalRP] : ImpactedRegions) {
1999 bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
2000 RescheduleRegions[I] = !IsEmptyRegion;
2001 if (!RecomputeRP.contains(V: I))
2002 continue;
2003
2004 GCNRegPressure RP;
2005 if (IsEmptyRegion) {
2006 RP = getRegPressure(MRI: DAG.MRI, LiveRegs&: DAG.LiveIns[I]);
2007 } else {
2008 GCNDownwardRPTracker RPT(*DAG.LIS);
2009 auto *NonDbgMI = &*skipDebugInstructionsForward(It: DAG.Regions[I].first,
2010 End: DAG.Regions[I].second);
2011 if (NonDbgMI == DAG.Regions[I].second) {
2012 // Region is non-empty but contains only debug instructions.
2013 RP = getRegPressure(MRI: DAG.MRI, LiveRegs&: DAG.LiveIns[I]);
2014 } else {
2015 RPT.reset(MI: *NonDbgMI, LiveRegs: &DAG.LiveIns[I]);
2016 RPT.advance(End: DAG.Regions[I].second);
2017 RP = RPT.moveMaxPressure();
2018 }
2019 }
2020 DAG.Pressure[I] = RP;
2021 AchievedOcc = std::min(
2022 a: AchievedOcc, b: RP.getOccupancy(ST, DynamicVGPRBlockSize: MF.getInfo<SIMachineFunctionInfo>()
2023 ->getDynamicVGPRBlockSize()));
2024 }
2025 REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
2026}
2027
2028// Copied from MachineLICM
2029bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
2030 if (!DAG.TII->isTriviallyReMaterializable(MI))
2031 return false;
2032
2033 for (const MachineOperand &MO : MI.all_uses()) {
2034 // We can't remat physreg uses, unless the register is constant or the use
2035 // is ignorable (e.g., the implicit exec use on VALU instructions).
2036 if (MO.getReg().isPhysical()) {
2037 if (DAG.MRI.isConstantPhysReg(PhysReg: MO.getReg()) || DAG.TII->isIgnorableUse(MO))
2038 continue;
2039 return false;
2040 }
2041 }
2042
2043 return true;
2044}
2045
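// If the stage was trying to increase occupancy and neither the
// rematerializations nor the subsequent rescheduling reached the target, undo
// everything: re-rematerialize each instruction at the end of its original
// region and restore live-ins, live intervals, and the saved region pressures.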
2046void PreRARematStage::finalizeGCNSchedStage() {
2047 // We consider that reducing spilling is always beneficial, so we never
2048 // roll back rematerializations in such cases. It's also possible that
2049 // rescheduling lowers occupancy below the one achieved just through remats,
2050 // in which case we do not want to roll back either (the rescheduling was
2051 // already reverted in PreRARematStage::shouldRevertScheduling in such cases).
2052 unsigned MaxOcc = std::max(a: AchievedOcc, b: DAG.MinOccupancy);
2053 if (!IncreaseOccupancy || MaxOcc >= TargetOcc)
2054 return;
2055
2056 REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
2057 const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
2058
2059 // Rollback the rematerializations.
2060 for (const auto &[DefMI, Remat] : Rematerializations) {
2061 MachineInstr &RematMI = *Remat.RematMI;
2062 unsigned DefRegion = MIRegion.at(Val: DefMI);
2063 MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second);
2064 MachineBasicBlock *MBB = RegionBB[DefRegion];
2065 Register Reg = RematMI.getOperand(i: 0).getReg();
2066 unsigned SubReg = RematMI.getOperand(i: 0).getSubReg();
2067
2068 // Re-rematerialize MI at the end of its original region. Note that it may
2069 // not end up at exactly its original position within the region, but that
2070 // should not matter much.
2071 TII->reMaterialize(MBB&: *MBB, MI: InsertPos, DestReg: Reg, SubIdx: SubReg, Orig: RematMI, TRI: *DAG.TRI);
2072 MachineInstr *NewMI = &*std::prev(x: InsertPos);
2073 NewMI->getOperand(i: 0).setSubReg(SubReg);
2074 DAG.LIS->InsertMachineInstrInMaps(MI&: *NewMI);
2075
2076 auto UseRegion = MIRegion.find(Val: Remat.UseMI);
2077 if (UseRegion != MIRegion.end()) {
2078 DAG.updateRegionBoundaries(RegionBounds&: DAG.Regions[UseRegion->second], MI: RematMI,
2079 NewMI: nullptr);
2080 }
2081 DAG.updateRegionBoundaries(RegionBounds&: DAG.Regions[DefRegion], MI: InsertPos, NewMI);
2082
2083 // Erase rematerialized MI.
2084 DAG.LIS->RemoveMachineInstrFromMaps(MI&: RematMI);
2085 RematMI.eraseFromParent();
2086
2087 // Recompute live interval for the re-rematerialized register
2088 DAG.LIS->removeInterval(Reg);
2089 DAG.LIS->createAndComputeVirtRegInterval(Reg);
2090
2091 // Re-add the register as a live-in in all regions it used to be one in.
2092 for (unsigned LIRegion : Remat.LiveInRegions)
2093 DAG.LiveIns[LIRegion].insert(KV: {Reg, RegMasks.at(Val: {LIRegion, Reg})});
2094 }
2095
2096 // Reset RP in all impacted regions.
2097 for (auto &[I, OriginalRP] : ImpactedRegions)
2098 DAG.Pressure[I] = OriginalRP;
2099
2100 GCNSchedStage::finalizeGCNSchedStage();
2101}
2102
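// Keeps a region's [first, second) boundaries valid when MI is removed
// (NewMI == nullptr) or when NewMI is inserted at MI's position. Only the
// lower bound can ever change here since the upper bound is exclusive.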
2103void GCNScheduleDAGMILive::updateRegionBoundaries(
2104 RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI,
2105 MachineInstr *NewMI) {
2106 assert((!NewMI || NewMI != RegionBounds.second) &&
2107 "cannot remove at region end");
2108
2109 if (RegionBounds.first == RegionBounds.second) {
2110 assert(NewMI && "cannot remove from an empty region");
2111 RegionBounds.first = NewMI;
2112 return;
2113 }
2114
2115 // We only care about modifications at the beginning of a non-empty region
2116 // since the upper region boundary is exclusive.
2117 if (MI != RegionBounds.first)
2118 return;
2119 if (!NewMI)
2120 RegionBounds.first = std::next(x: MI); // Removal
2121 else
2122 RegionBounds.first = NewMI; // Insertion
2123}
2124
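// Returns true if any instruction in the scheduling region is an
// IGLP-mutation-only instruction, as reported by
// SIInstrInfo::isIGLPMutationOnly.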
2125static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
2126 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
2127 return any_of(Range&: *DAG, P: [SII](MachineBasicBlock::iterator MI) {
2128 return SII->isIGLPMutationOnly(Opcode: MI->getOpcode());
2129 });
2130}
2131
2132GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
2133 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
2134 bool RemoveKillFlags)
2135 : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
2136
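// For regions containing IGLP instructions, temporarily swap out the existing
// DAG mutations and apply the post-RA IGroupLP mutation before running the
// generic post-RA scheduler; finalizeSchedule() swaps the saved mutations
// back in.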
2137void GCNPostScheduleDAGMILive::schedule() {
2138 HasIGLPInstrs = hasIGLPInstrs(DAG: this);
2139 if (HasIGLPInstrs) {
2140 SavedMutations.clear();
2141 SavedMutations.swap(x&: Mutations);
2142 addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PostRA));
2143 }
2144
2145 ScheduleDAGMI::schedule();
2146}
2147
2148void GCNPostScheduleDAGMILive::finalizeSchedule() {
2149 if (HasIGLPInstrs)
2150 SavedMutations.swap(x&: Mutations);
2151
2152 ScheduleDAGMI::finalizeSchedule();
2153}
2154