1 | //===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// This contains a MachineSchedStrategy implementation for maximizing wave |
11 | /// occupancy on GCN hardware. |
12 | /// |
13 | /// This pass will apply multiple scheduling stages to the same function. |
14 | /// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual |
15 | /// entry point for the scheduling of those regions is |
16 | /// GCNScheduleDAGMILive::runSchedStages. |
///
18 | /// Generally, the reason for having multiple scheduling stages is to account |
19 | /// for the kernel-wide effect of register usage on occupancy. Usually, only a |
20 | /// few scheduling regions will have register pressure high enough to limit |
21 | /// occupancy for the kernel, so constraints can be relaxed to improve ILP in |
22 | /// other regions. |
23 | /// |
24 | //===----------------------------------------------------------------------===// |
25 | |
26 | #include "GCNSchedStrategy.h" |
27 | #include "AMDGPUIGroupLP.h" |
28 | #include "GCNRegPressure.h" |
29 | #include "SIMachineFunctionInfo.h" |
30 | #include "Utils/AMDGPUBaseInfo.h" |
31 | #include "llvm/ADT/STLExtras.h" |
32 | #include "llvm/CodeGen/RegisterClassInfo.h" |
33 | #include "llvm/MC/LaneBitmask.h" |
34 | #include "llvm/Support/ErrorHandling.h" |
35 | |
36 | #define DEBUG_TYPE "machine-scheduler" |
37 | |
38 | using namespace llvm; |
39 | |
static cl::opt<bool> DisableUnclusterHighRP(
"amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
cl::desc("Disable the unclustered high register pressure "
"reduction scheduling stage."),
cl::init(false));

static cl::opt<bool> DisableClusteredLowOccupancy(
"amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
cl::desc("Disable the clustered low occupancy "
"rescheduling for ILP scheduling stage."),
cl::init(false));

static cl::opt<unsigned> ScheduleMetricBias(
"amdgpu-schedule-metric-bias", cl::Hidden,
cl::desc(
"Sets the bias that adds weight to occupancy vs. latency. Set it to "
"100 to chase occupancy only."),
cl::init(10));

static cl::opt<bool>
RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
cl::desc("Relax occupancy targets for kernels which are memory "
"bound (amdgpu-membound-threshold) or "
"wave limited (amdgpu-limit-wave-threshold)."),
cl::init(false));

static cl::opt<bool> GCNTrackers(
"amdgpu-use-amdgpu-trackers", cl::Hidden,
cl::desc("Use the AMDGPU-specific RP trackers during scheduling"),
cl::init(false));
70 | |
71 | const unsigned ScheduleMetrics::ScaleFactor = 100; |
72 | |
73 | GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) |
74 | : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), |
75 | DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) { |
76 | } |
77 | |
78 | void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { |
79 | GenericScheduler::initialize(dag: DAG); |
80 | |
81 | MF = &DAG->MF; |
82 | |
83 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
84 | |
85 | SGPRExcessLimit = |
86 | Context->RegClassInfo->getNumAllocatableRegs(RC: &AMDGPU::SGPR_32RegClass); |
87 | VGPRExcessLimit = |
88 | Context->RegClassInfo->getNumAllocatableRegs(RC: &AMDGPU::VGPR_32RegClass); |
89 | |
90 | SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); |
// Set the initial TargetOccupancy to the maximum occupancy that we can
// achieve for this function. This effectively sets a lower bound on the
// 'Critical' register limits in the scheduler.
// Allow for lower occupancy targets if the kernel is wave limited or memory
// bound and the relaxed occupancy feature is in use.
96 | TargetOccupancy = |
97 | RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy(); |
98 | SGPRCriticalLimit = |
99 | std::min(a: ST.getMaxNumSGPRs(WavesPerEU: TargetOccupancy, Addressable: true), b: SGPRExcessLimit); |
100 | |
101 | if (!KnownExcessRP) { |
102 | VGPRCriticalLimit = std::min( |
103 | a: ST.getMaxNumVGPRs(WavesPerEU: TargetOccupancy, DynamicVGPRBlockSize: MFI.getDynamicVGPRBlockSize()), |
104 | b: VGPRExcessLimit); |
105 | } else { |
// This is similar to the result of ST.getMaxNumVGPRs(TargetOccupancy),
// except that it returns a reasonably small number for targets with lots
// of VGPRs, such as GFX10 and GFX11.
109 | LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative " |
110 | "VGPRCriticalLimit calculation method.\n" ); |
111 | unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize(); |
112 | unsigned Granule = |
113 | AMDGPU::IsaInfo::getVGPRAllocGranule(STI: &ST, DynamicVGPRBlockSize); |
114 | unsigned Addressable = |
115 | AMDGPU::IsaInfo::getAddressableNumVGPRs(STI: &ST, DynamicVGPRBlockSize); |
116 | unsigned VGPRBudget = alignDown(Value: Addressable / TargetOccupancy, Align: Granule); |
117 | VGPRBudget = std::max(a: VGPRBudget, b: Granule); |
118 | VGPRCriticalLimit = std::min(a: VGPRBudget, b: VGPRExcessLimit); |
119 | } |
120 | |
121 | // Subtract error margin and bias from register limits and avoid overflow. |
122 | SGPRCriticalLimit -= std::min(a: SGPRLimitBias + ErrorMargin, b: SGPRCriticalLimit); |
123 | VGPRCriticalLimit -= std::min(a: VGPRLimitBias + ErrorMargin, b: VGPRCriticalLimit); |
124 | SGPRExcessLimit -= std::min(a: SGPRLimitBias + ErrorMargin, b: SGPRExcessLimit); |
125 | VGPRExcessLimit -= std::min(a: VGPRLimitBias + ErrorMargin, b: VGPRExcessLimit); |
126 | |
127 | LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit |
128 | << ", VGPRExcessLimit = " << VGPRExcessLimit |
129 | << ", SGPRCriticalLimit = " << SGPRCriticalLimit |
130 | << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n" ); |
131 | } |
132 | |
133 | /// Checks whether \p SU can use the cached DAG pressure diffs to compute the |
134 | /// current register pressure. |
135 | /// |
136 | /// This works for the common case, but it has a few exceptions that have been |
137 | /// observed through trial and error: |
138 | /// - Explicit physical register operands |
139 | /// - Subregister definitions |
140 | /// |
141 | /// In both of those cases, PressureDiff doesn't represent the actual pressure, |
142 | /// and querying LiveIntervals through the RegPressureTracker is needed to get |
143 | /// an accurate value. |
144 | /// |
145 | /// We should eventually only use PressureDiff for maximum performance, but this |
146 | /// already allows 80% of SUs to take the fast path without changing scheduling |
147 | /// at all. Further changes would either change scheduling, or require a lot |
148 | /// more logic to recover an accurate pressure estimate from the PressureDiffs. |
149 | static bool canUsePressureDiffs(const SUnit &SU) { |
150 | if (!SU.isInstr()) |
151 | return false; |
152 | |
// Cannot use pressure diffs for subregister defs or with physregs; it is
// imprecise in both cases.
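// For example, an instruction that defines only a subregister of a wider
// virtual register, or one with an explicit physical register operand, must
// fall back to the (slower) RegPressureTracker query.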
155 | for (const auto &Op : SU.getInstr()->operands()) { |
156 | if (!Op.isReg() || Op.isImplicit()) |
157 | continue; |
158 | if (Op.getReg().isPhysical() || |
159 | (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister)) |
160 | return false; |
161 | } |
162 | return true; |
163 | } |
164 | |
165 | static void getRegisterPressures( |
166 | bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU, |
167 | std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure, |
168 | GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker, |
169 | ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) { |
// getDownwardPressure() and getUpwardPressure() make temporary changes to
// the tracker, so we need to pass them a non-const reference (hence the
// const_cast below).
172 | RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker); |
173 | if (!GCNTrackers) { |
174 | AtTop |
175 | ? TempTracker.getDownwardPressure(MI: SU->getInstr(), PressureResult&: Pressure, MaxPressureResult&: MaxPressure) |
176 | : TempTracker.getUpwardPressure(MI: SU->getInstr(), PressureResult&: Pressure, MaxPressureResult&: MaxPressure); |
177 | |
178 | return; |
179 | } |
180 | |
181 | // GCNTrackers |
182 | Pressure.resize(new_size: 4, x: 0); |
183 | MachineInstr *MI = SU->getInstr(); |
184 | GCNRegPressure NewPressure; |
185 | if (AtTop) { |
186 | GCNDownwardRPTracker TempDownwardTracker(DownwardTracker); |
187 | NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, TRI: SRI); |
188 | } else { |
189 | GCNUpwardRPTracker TempUpwardTracker(UpwardTracker); |
190 | TempUpwardTracker.recede(MI: *MI); |
191 | NewPressure = TempUpwardTracker.getPressure(); |
192 | } |
193 | Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum(); |
194 | Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = |
195 | NewPressure.getArchVGPRNum(); |
196 | Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum(); |
197 | } |
198 | |
199 | void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, |
200 | bool AtTop, |
201 | const RegPressureTracker &RPTracker, |
202 | const SIRegisterInfo *SRI, |
203 | unsigned SGPRPressure, |
204 | unsigned VGPRPressure, bool IsBottomUp) { |
205 | Cand.SU = SU; |
206 | Cand.AtTop = AtTop; |
207 | |
208 | if (!DAG->isTrackingPressure()) |
209 | return; |
210 | |
211 | Pressure.clear(); |
212 | MaxPressure.clear(); |
213 | |
// We try to use the cached PressureDiffs in the ScheduleDAG whenever
// possible over querying the RegPressureTracker.
//
// RegPressureTracker will make a lot of LIS queries, which are very
// expensive; it is considered a slow path in this context.
//
// PressureDiffs are precomputed and cached, and getPressureDiff is just a
// trivial lookup into an array. It is pretty much free.
//
// In EXPENSIVE_CHECKS builds, we always query the RPTracker to verify the
// results of PressureDiffs.
225 | if (AtTop || !canUsePressureDiffs(SU: *SU) || GCNTrackers) { |
226 | getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure, |
227 | DownwardTracker, UpwardTracker, DAG, SRI); |
228 | } else { |
229 | // Reserve 4 slots. |
230 | Pressure.resize(new_size: 4, x: 0); |
231 | Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure; |
232 | Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure; |
233 | |
234 | for (const auto &Diff : DAG->getPressureDiff(SU)) { |
235 | if (!Diff.isValid()) |
236 | continue; |
// PressureDiffs are always bottom-up, so if we're working top-down we need
// to invert the sign.
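// Illustrative example: a cached diff of "+1 VGPR_32" means scheduling the
// SU from the bottom adds one VGPR_32 pressure unit; when scheduling
// top-down the same diff is applied with its sign flipped.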
239 | Pressure[Diff.getPSet()] += |
240 | (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc()); |
241 | } |
242 | |
243 | #ifdef EXPENSIVE_CHECKS |
244 | std::vector<unsigned> CheckPressure, CheckMaxPressure; |
245 | getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure, |
246 | DownwardTracker, UpwardTracker, DAG, SRI); |
247 | if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] != |
248 | CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] || |
249 | Pressure[AMDGPU::RegisterPressureSets::VGPR_32] != |
250 | CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) { |
251 | errs() << "Register Pressure is inaccurate when calculated through " |
252 | "PressureDiff\n" |
253 | << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32] |
254 | << ", expected " |
255 | << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n" |
256 | << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32] |
257 | << ", expected " |
258 | << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n" ; |
259 | report_fatal_error("inaccurate register pressure calculation" ); |
260 | } |
261 | #endif |
262 | } |
263 | |
264 | unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; |
265 | unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; |
266 | |
// If two instructions increase the pressure of different register sets
// by the same amount, the generic scheduler will prefer to schedule the
// instruction that increases the set with the fewest registers, which in
// our case would be SGPRs. This is rarely what we want, so when we report
// excess/critical register pressure, we do it either only for VGPRs or
// only for SGPRs.
273 | |
274 | // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs. |
275 | const unsigned MaxVGPRPressureInc = 16; |
276 | bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit; |
277 | bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit; |
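// Illustrative example (hypothetical limit): with a VGPR excess limit of
// 256, VGPRs start being tracked once the current VGPR pressure reaches 240
// (256 - MaxVGPRPressureInc); SGPRs are only tracked when VGPRs are not and
// SGPR pressure already meets or exceeds its excess limit.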
278 | |
279 | // FIXME: We have to enter REG-EXCESS before we reach the actual threshold |
280 | // to increase the likelihood we don't go over the limits. We should improve |
281 | // the analysis to look through dependencies to find the path with the least |
282 | // register pressure. |
283 | |
284 | // We only need to update the RPDelta for instructions that increase register |
285 | // pressure. Instructions that decrease or keep reg pressure the same will be |
286 | // marked as RegExcess in tryCandidate() when they are compared with |
287 | // instructions that increase the register pressure. |
288 | if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) { |
289 | HasHighPressure = true; |
290 | Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); |
291 | Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit); |
292 | } |
293 | |
294 | if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) { |
295 | HasHighPressure = true; |
296 | Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32); |
297 | Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit); |
298 | } |
299 | |
// Register pressure is considered 'CRITICAL' if it is approaching a value
// that would reduce the wave occupancy for the execution unit. When
// register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
// carry the same cost, so we don't need to prefer one over the other.
304 | |
305 | int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit; |
306 | int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit; |
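// Illustrative example (hypothetical numbers): with critical limits of 102
// SGPRs / 128 VGPRs and new pressures of 100 SGPRs / 130 VGPRs, only
// VGPRDelta (+2) is non-negative, so the candidate is penalized on the
// VGPR_32 pressure set.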
307 | |
308 | if (SGPRDelta >= 0 || VGPRDelta >= 0) { |
309 | HasHighPressure = true; |
310 | if (SGPRDelta > VGPRDelta) { |
311 | Cand.RPDelta.CriticalMax = |
312 | PressureChange(AMDGPU::RegisterPressureSets::SReg_32); |
313 | Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta); |
314 | } else { |
315 | Cand.RPDelta.CriticalMax = |
316 | PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); |
317 | Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta); |
318 | } |
319 | } |
320 | } |
321 | |
322 | // This function is mostly cut and pasted from |
323 | // GenericScheduler::pickNodeFromQueue() |
324 | void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, |
325 | const CandPolicy &ZonePolicy, |
326 | const RegPressureTracker &RPTracker, |
327 | SchedCandidate &Cand, |
328 | bool IsBottomUp) { |
329 | const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); |
330 | ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos(); |
331 | unsigned SGPRPressure = 0; |
332 | unsigned VGPRPressure = 0; |
333 | if (DAG->isTrackingPressure()) { |
334 | if (!GCNTrackers) { |
335 | SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; |
336 | VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; |
337 | } else { |
338 | GCNRPTracker *T = IsBottomUp |
339 | ? static_cast<GCNRPTracker *>(&UpwardTracker) |
340 | : static_cast<GCNRPTracker *>(&DownwardTracker); |
341 | SGPRPressure = T->getPressure().getSGPRNum(); |
342 | VGPRPressure = T->getPressure().getArchVGPRNum(); |
343 | } |
344 | } |
345 | ReadyQueue &Q = Zone.Available; |
346 | for (SUnit *SU : Q) { |
347 | |
348 | SchedCandidate TryCand(ZonePolicy); |
349 | initCandidate(Cand&: TryCand, SU, AtTop: Zone.isTop(), RPTracker, SRI, SGPRPressure, |
350 | VGPRPressure, IsBottomUp); |
351 | // Pass SchedBoundary only when comparing nodes from the same boundary. |
352 | SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; |
353 | tryCandidate(Cand, TryCand, Zone: ZoneArg); |
354 | if (TryCand.Reason != NoCand) { |
355 | // Initialize resource delta if needed in case future heuristics query it. |
356 | if (TryCand.ResDelta == SchedResourceDelta()) |
357 | TryCand.initResourceDelta(DAG: Zone.DAG, SchedModel); |
358 | Cand.setBest(TryCand); |
359 | LLVM_DEBUG(traceCandidate(Cand)); |
360 | } |
361 | } |
362 | } |
363 | |
364 | // This function is mostly cut and pasted from |
365 | // GenericScheduler::pickNodeBidirectional() |
366 | SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { |
367 | // Schedule as far as possible in the direction of no choice. This is most |
368 | // efficient, but also provides the best heuristics for CriticalPSets. |
369 | if (SUnit *SU = Bot.pickOnlyChoice()) { |
370 | IsTopNode = false; |
371 | return SU; |
372 | } |
373 | if (SUnit *SU = Top.pickOnlyChoice()) { |
374 | IsTopNode = true; |
375 | return SU; |
376 | } |
377 | // Set the bottom-up policy based on the state of the current bottom zone and |
378 | // the instructions outside the zone, including the top zone. |
379 | CandPolicy BotPolicy; |
380 | setPolicy(Policy&: BotPolicy, /*IsPostRA=*/false, CurrZone&: Bot, OtherZone: &Top); |
381 | // Set the top-down policy based on the state of the current top zone and |
382 | // the instructions outside the zone, including the bottom zone. |
383 | CandPolicy TopPolicy; |
384 | setPolicy(Policy&: TopPolicy, /*IsPostRA=*/false, CurrZone&: Top, OtherZone: &Bot); |
385 | |
386 | // See if BotCand is still valid (because we previously scheduled from Top). |
387 | LLVM_DEBUG(dbgs() << "Picking from Bot:\n" ); |
388 | if (!BotCand.isValid() || BotCand.SU->isScheduled || |
389 | BotCand.Policy != BotPolicy) { |
390 | BotCand.reset(NewPolicy: CandPolicy()); |
391 | pickNodeFromQueue(Zone&: Bot, ZonePolicy: BotPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand, |
392 | /*IsBottomUp=*/true); |
393 | assert(BotCand.Reason != NoCand && "failed to find the first candidate" ); |
394 | } else { |
395 | LLVM_DEBUG(traceCandidate(BotCand)); |
396 | #ifndef NDEBUG |
397 | if (VerifyScheduling) { |
398 | SchedCandidate TCand; |
399 | TCand.reset(CandPolicy()); |
400 | pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand, |
401 | /*IsBottomUp=*/true); |
402 | assert(TCand.SU == BotCand.SU && |
403 | "Last pick result should correspond to re-picking right now" ); |
404 | } |
405 | #endif |
406 | } |
407 | |
408 | // Check if the top Q has a better candidate. |
409 | LLVM_DEBUG(dbgs() << "Picking from Top:\n" ); |
410 | if (!TopCand.isValid() || TopCand.SU->isScheduled || |
411 | TopCand.Policy != TopPolicy) { |
412 | TopCand.reset(NewPolicy: CandPolicy()); |
413 | pickNodeFromQueue(Zone&: Top, ZonePolicy: TopPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand, |
414 | /*IsBottomUp=*/false); |
415 | assert(TopCand.Reason != NoCand && "failed to find the first candidate" ); |
416 | } else { |
417 | LLVM_DEBUG(traceCandidate(TopCand)); |
418 | #ifndef NDEBUG |
419 | if (VerifyScheduling) { |
420 | SchedCandidate TCand; |
421 | TCand.reset(CandPolicy()); |
422 | pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand, |
423 | /*IsBottomUp=*/false); |
424 | assert(TCand.SU == TopCand.SU && |
425 | "Last pick result should correspond to re-picking right now" ); |
426 | } |
427 | #endif |
428 | } |
429 | |
430 | // Pick best from BotCand and TopCand. |
431 | LLVM_DEBUG(dbgs() << "Top Cand: " ; traceCandidate(TopCand); |
432 | dbgs() << "Bot Cand: " ; traceCandidate(BotCand);); |
433 | SchedCandidate Cand = BotCand; |
434 | TopCand.Reason = NoCand; |
435 | tryCandidate(Cand, TryCand&: TopCand, Zone: nullptr); |
436 | if (TopCand.Reason != NoCand) { |
437 | Cand.setBest(TopCand); |
438 | } |
439 | LLVM_DEBUG(dbgs() << "Picking: " ; traceCandidate(Cand);); |
440 | |
441 | IsTopNode = Cand.AtTop; |
442 | return Cand.SU; |
443 | } |
444 | |
445 | // This function is mostly cut and pasted from |
446 | // GenericScheduler::pickNode() |
447 | SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { |
448 | if (DAG->top() == DAG->bottom()) { |
449 | assert(Top.Available.empty() && Top.Pending.empty() && |
450 | Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage" ); |
451 | return nullptr; |
452 | } |
453 | SUnit *SU; |
454 | do { |
455 | if (RegionPolicy.OnlyTopDown) { |
456 | SU = Top.pickOnlyChoice(); |
457 | if (!SU) { |
458 | CandPolicy NoPolicy; |
459 | TopCand.reset(NewPolicy: NoPolicy); |
460 | pickNodeFromQueue(Zone&: Top, ZonePolicy: NoPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand, |
461 | /*IsBottomUp=*/false); |
462 | assert(TopCand.Reason != NoCand && "failed to find a candidate" ); |
463 | SU = TopCand.SU; |
464 | } |
465 | IsTopNode = true; |
466 | } else if (RegionPolicy.OnlyBottomUp) { |
467 | SU = Bot.pickOnlyChoice(); |
468 | if (!SU) { |
469 | CandPolicy NoPolicy; |
470 | BotCand.reset(NewPolicy: NoPolicy); |
471 | pickNodeFromQueue(Zone&: Bot, ZonePolicy: NoPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand, |
472 | /*IsBottomUp=*/true); |
473 | assert(BotCand.Reason != NoCand && "failed to find a candidate" ); |
474 | SU = BotCand.SU; |
475 | } |
476 | IsTopNode = false; |
477 | } else { |
478 | SU = pickNodeBidirectional(IsTopNode); |
479 | } |
480 | } while (SU->isScheduled); |
481 | |
482 | if (SU->isTopReady()) |
483 | Top.removeReady(SU); |
484 | if (SU->isBottomReady()) |
485 | Bot.removeReady(SU); |
486 | |
487 | LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " |
488 | << *SU->getInstr()); |
489 | return SU; |
490 | } |
491 | |
492 | void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { |
493 | if (GCNTrackers) { |
494 | MachineInstr *MI = SU->getInstr(); |
495 | IsTopNode ? (void)DownwardTracker.advance(MI, UseInternalIterator: false) |
496 | : UpwardTracker.recede(MI: *MI); |
497 | } |
498 | |
499 | return GenericScheduler::schedNode(SU, IsTopNode); |
500 | } |
501 | |
502 | GCNSchedStageID GCNSchedStrategy::getCurrentStage() { |
503 | assert(CurrentStage && CurrentStage != SchedStages.end()); |
504 | return *CurrentStage; |
505 | } |
506 | |
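// Advance to the next scheduling stage (or to the first one on the initial
// call); returns false once all stages have been exhausted. This is driven
// from GCNScheduleDAGMILive::runSchedStages as: while (S.advanceStage()).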
507 | bool GCNSchedStrategy::advanceStage() { |
508 | assert(CurrentStage != SchedStages.end()); |
509 | if (!CurrentStage) |
510 | CurrentStage = SchedStages.begin(); |
511 | else |
512 | CurrentStage++; |
513 | |
514 | return CurrentStage != SchedStages.end(); |
515 | } |
516 | |
517 | bool GCNSchedStrategy::hasNextStage() const { |
518 | assert(CurrentStage); |
519 | return std::next(x: CurrentStage) != SchedStages.end(); |
520 | } |
521 | |
522 | GCNSchedStageID GCNSchedStrategy::getNextStage() const { |
523 | assert(CurrentStage && std::next(CurrentStage) != SchedStages.end()); |
524 | return *std::next(x: CurrentStage); |
525 | } |
526 | |
527 | GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( |
528 | const MachineSchedContext *C, bool IsLegacyScheduler) |
529 | : GCNSchedStrategy(C) { |
530 | SchedStages.push_back(Elt: GCNSchedStageID::OccInitialSchedule); |
531 | SchedStages.push_back(Elt: GCNSchedStageID::UnclusteredHighRPReschedule); |
532 | SchedStages.push_back(Elt: GCNSchedStageID::ClusteredLowOccupancyReschedule); |
533 | SchedStages.push_back(Elt: GCNSchedStageID::PreRARematerialize); |
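// The AMDGPU-specific RP trackers are not used with the legacy scheduler;
// force-disable them in that case.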
534 | GCNTrackers = GCNTrackers & !IsLegacyScheduler; |
535 | } |
536 | |
537 | GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C) |
538 | : GCNSchedStrategy(C) { |
539 | SchedStages.push_back(Elt: GCNSchedStageID::ILPInitialSchedule); |
540 | } |
541 | |
542 | bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand, |
543 | SchedCandidate &TryCand, |
544 | SchedBoundary *Zone) const { |
545 | // Initialize the candidate if needed. |
546 | if (!Cand.isValid()) { |
547 | TryCand.Reason = NodeOrder; |
548 | return true; |
549 | } |
550 | |
551 | // Avoid spilling by exceeding the register limit. |
552 | if (DAG->isTrackingPressure() && |
553 | tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand, |
554 | Reason: RegExcess, TRI, MF: DAG->MF)) |
555 | return TryCand.Reason != NoCand; |
556 | |
// Bias PhysReg defs and copies towards their uses and definitions,
// respectively.
558 | if (tryGreater(TryVal: biasPhysReg(SU: TryCand.SU, isTop: TryCand.AtTop), |
559 | CandVal: biasPhysReg(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: PhysReg)) |
560 | return TryCand.Reason != NoCand; |
561 | |
562 | bool SameBoundary = Zone != nullptr; |
563 | if (SameBoundary) { |
564 | // Prioritize instructions that read unbuffered resources by stall cycles. |
565 | if (tryLess(TryVal: Zone->getLatencyStallCycles(SU: TryCand.SU), |
566 | CandVal: Zone->getLatencyStallCycles(SU: Cand.SU), TryCand, Cand, Reason: Stall)) |
567 | return TryCand.Reason != NoCand; |
568 | |
569 | // Avoid critical resource consumption and balance the schedule. |
570 | TryCand.initResourceDelta(DAG, SchedModel); |
571 | if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources, |
572 | TryCand, Cand, Reason: ResourceReduce)) |
573 | return TryCand.Reason != NoCand; |
574 | if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources, |
575 | CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand, |
576 | Reason: ResourceDemand)) |
577 | return TryCand.Reason != NoCand; |
578 | |
579 | // Unconditionally try to reduce latency. |
580 | if (tryLatency(TryCand, Cand, Zone&: *Zone)) |
581 | return TryCand.Reason != NoCand; |
582 | |
583 | // Weak edges are for clustering and other constraints. |
584 | if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop), |
585 | CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak)) |
586 | return TryCand.Reason != NoCand; |
587 | } |
588 | |
589 | // Keep clustered nodes together to encourage downstream peephole |
590 | // optimizations which may reduce resource requirements. |
591 | // |
592 | // This is a best effort to set things up for a post-RA pass. Optimizations |
593 | // like generating loads of multiple registers should ideally be done within |
594 | // the scheduler pass by combining the loads during DAG postprocessing. |
595 | const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; |
596 | const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; |
597 | if (tryGreater(TryVal: TryCandCluster && TryCandCluster->contains(Ptr: TryCand.SU), |
598 | CandVal: CandCluster && CandCluster->contains(Ptr: Cand.SU), TryCand, Cand, |
599 | Reason: Cluster)) |
600 | return TryCand.Reason != NoCand; |
601 | |
602 | // Avoid increasing the max critical pressure in the scheduled region. |
603 | if (DAG->isTrackingPressure() && |
604 | tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax, |
605 | TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF)) |
606 | return TryCand.Reason != NoCand; |
607 | |
608 | // Avoid increasing the max pressure of the entire region. |
609 | if (DAG->isTrackingPressure() && |
610 | tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand, |
611 | Cand, Reason: RegMax, TRI, MF: DAG->MF)) |
612 | return TryCand.Reason != NoCand; |
613 | |
614 | if (SameBoundary) { |
615 | // Fall through to original instruction order. |
616 | if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) || |
617 | (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { |
618 | TryCand.Reason = NodeOrder; |
619 | return true; |
620 | } |
621 | } |
622 | return false; |
623 | } |
624 | |
625 | GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy( |
626 | const MachineSchedContext *C) |
627 | : GCNSchedStrategy(C) { |
628 | SchedStages.push_back(Elt: GCNSchedStageID::MemoryClauseInitialSchedule); |
629 | } |
630 | |
/// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory
/// instructions as much as possible. This is achieved by:
///  1. Prioritizing clustered operations before the stall latency heuristic.
///  2. Prioritizing long-latency loads before the stall latency heuristic.
///
/// \param Cand provides the policy and current best candidate.
/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
/// \param Zone describes the scheduled zone that we are extending, or nullptr
///             if Cand is from a different zone than TryCand.
/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
641 | bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand, |
642 | SchedCandidate &TryCand, |
643 | SchedBoundary *Zone) const { |
644 | // Initialize the candidate if needed. |
645 | if (!Cand.isValid()) { |
646 | TryCand.Reason = NodeOrder; |
647 | return true; |
648 | } |
649 | |
// Bias PhysReg defs and copies towards their uses and definitions,
// respectively.
651 | if (tryGreater(TryVal: biasPhysReg(SU: TryCand.SU, isTop: TryCand.AtTop), |
652 | CandVal: biasPhysReg(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: PhysReg)) |
653 | return TryCand.Reason != NoCand; |
654 | |
655 | if (DAG->isTrackingPressure()) { |
656 | // Avoid exceeding the target's limit. |
657 | if (tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand, |
658 | Reason: RegExcess, TRI, MF: DAG->MF)) |
659 | return TryCand.Reason != NoCand; |
660 | |
661 | // Avoid increasing the max critical pressure in the scheduled region. |
662 | if (tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax, |
663 | TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF)) |
664 | return TryCand.Reason != NoCand; |
665 | } |
666 | |
667 | // MaxMemoryClause-specific: We prioritize clustered instructions as we would |
668 | // get more benefit from clausing these memory instructions. |
669 | const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; |
670 | const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; |
671 | if (tryGreater(TryVal: TryCandCluster && TryCandCluster->contains(Ptr: TryCand.SU), |
672 | CandVal: CandCluster && CandCluster->contains(Ptr: Cand.SU), TryCand, Cand, |
673 | Reason: Cluster)) |
674 | return TryCand.Reason != NoCand; |
675 | |
676 | // We only compare a subset of features when comparing nodes between |
677 | // Top and Bottom boundary. Some properties are simply incomparable, in many |
678 | // other instances we should only override the other boundary if something |
679 | // is a clear good pick on one boundary. Skip heuristics that are more |
680 | // "tie-breaking" in nature. |
681 | bool SameBoundary = Zone != nullptr; |
682 | if (SameBoundary) { |
// For loops that are acyclic path limited, aggressively schedule for
// latency. Within a single cycle, whenever CurrMOps > 0, allow normal
// heuristics to take precedence.
686 | if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() && |
687 | tryLatency(TryCand, Cand, Zone&: *Zone)) |
688 | return TryCand.Reason != NoCand; |
689 | |
// MaxMemoryClause-specific: Prioritize long-latency memory loads in
// top-bottom order to hide more latency. The mayLoad check is used to
// exclude store-like instructions, which we do not want to schedule too
// early.
694 | bool TryMayLoad = |
695 | TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad(); |
696 | bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad(); |
697 | |
698 | if (TryMayLoad || CandMayLoad) { |
699 | bool TryLongLatency = |
700 | TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad; |
701 | bool CandLongLatency = |
702 | 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad; |
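// A candidate only counts as long-latency if it may load and its latency is
// more than ten times the other candidate's latency.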
703 | |
704 | if (tryGreater(TryVal: Zone->isTop() ? TryLongLatency : CandLongLatency, |
705 | CandVal: Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand, |
706 | Cand, Reason: Stall)) |
707 | return TryCand.Reason != NoCand; |
708 | } |
709 | // Prioritize instructions that read unbuffered resources by stall cycles. |
710 | if (tryLess(TryVal: Zone->getLatencyStallCycles(SU: TryCand.SU), |
711 | CandVal: Zone->getLatencyStallCycles(SU: Cand.SU), TryCand, Cand, Reason: Stall)) |
712 | return TryCand.Reason != NoCand; |
713 | } |
714 | |
715 | if (SameBoundary) { |
716 | // Weak edges are for clustering and other constraints. |
717 | if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop), |
718 | CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak)) |
719 | return TryCand.Reason != NoCand; |
720 | } |
721 | |
722 | // Avoid increasing the max pressure of the entire region. |
723 | if (DAG->isTrackingPressure() && |
724 | tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand, |
725 | Cand, Reason: RegMax, TRI, MF: DAG->MF)) |
726 | return TryCand.Reason != NoCand; |
727 | |
728 | if (SameBoundary) { |
729 | // Avoid critical resource consumption and balance the schedule. |
730 | TryCand.initResourceDelta(DAG, SchedModel); |
731 | if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources, |
732 | TryCand, Cand, Reason: ResourceReduce)) |
733 | return TryCand.Reason != NoCand; |
734 | if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources, |
735 | CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand, |
736 | Reason: ResourceDemand)) |
737 | return TryCand.Reason != NoCand; |
738 | |
739 | // Avoid serializing long latency dependence chains. |
740 | // For acyclic path limited loops, latency was already checked above. |
741 | if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency && |
742 | !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone&: *Zone)) |
743 | return TryCand.Reason != NoCand; |
744 | |
745 | // Fall through to original instruction order. |
746 | if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) { |
747 | assert(TryCand.SU->NodeNum != Cand.SU->NodeNum); |
748 | TryCand.Reason = NodeOrder; |
749 | return true; |
750 | } |
751 | } |
752 | |
753 | return false; |
754 | } |
755 | |
756 | GCNScheduleDAGMILive::GCNScheduleDAGMILive( |
757 | MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S) |
758 | : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()), |
759 | MFI(*MF.getInfo<SIMachineFunctionInfo>()), |
760 | StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy), |
761 | RegionLiveOuts(this, /*IsLiveOut=*/true) { |
762 | |
763 | // We want regions with a single MI to be scheduled so that we can reason |
764 | // about them correctly during scheduling stages that move MIs between regions |
765 | // (e.g., rematerialization). |
766 | ScheduleSingleMIRegions = true; |
767 | LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n" ); |
768 | if (RelaxedOcc) { |
769 | MinOccupancy = std::min(a: MFI.getMinAllowedOccupancy(), b: StartingOccupancy); |
770 | if (MinOccupancy != StartingOccupancy) |
771 | LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy |
772 | << ".\n" ); |
773 | } |
774 | } |
775 | |
776 | std::unique_ptr<GCNSchedStage> |
777 | GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) { |
778 | switch (SchedStageID) { |
779 | case GCNSchedStageID::OccInitialSchedule: |
780 | return std::make_unique<OccInitialScheduleStage>(args&: SchedStageID, args&: *this); |
781 | case GCNSchedStageID::UnclusteredHighRPReschedule: |
782 | return std::make_unique<UnclusteredHighRPStage>(args&: SchedStageID, args&: *this); |
783 | case GCNSchedStageID::ClusteredLowOccupancyReschedule: |
784 | return std::make_unique<ClusteredLowOccStage>(args&: SchedStageID, args&: *this); |
785 | case GCNSchedStageID::PreRARematerialize: |
786 | return std::make_unique<PreRARematStage>(args&: SchedStageID, args&: *this); |
787 | case GCNSchedStageID::ILPInitialSchedule: |
788 | return std::make_unique<ILPInitialScheduleStage>(args&: SchedStageID, args&: *this); |
789 | case GCNSchedStageID::MemoryClauseInitialSchedule: |
790 | return std::make_unique<MemoryClauseInitialScheduleStage>(args&: SchedStageID, |
791 | args&: *this); |
792 | } |
793 | |
794 | llvm_unreachable("Unknown SchedStageID." ); |
795 | } |
796 | |
797 | void GCNScheduleDAGMILive::schedule() { |
798 | // Collect all scheduling regions. The actual scheduling is performed in |
799 | // GCNScheduleDAGMILive::finalizeSchedule. |
800 | Regions.push_back(Elt: std::pair(RegionBegin, RegionEnd)); |
801 | } |
802 | |
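// Recompute the region's maximum register pressure by replaying the region's
// instructions through a downward RP tracker seeded with its live-ins.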
803 | GCNRegPressure |
804 | GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { |
805 | GCNDownwardRPTracker RPTracker(*LIS); |
806 | RPTracker.advance(Begin: begin(), End: end(), LiveRegsCopy: &LiveIns[RegionIdx]); |
807 | return RPTracker.moveMaxPressure(); |
808 | } |
809 | |
810 | static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin, |
811 | MachineBasicBlock::iterator RegionEnd) { |
812 | auto REnd = RegionEnd == RegionBegin->getParent()->end() |
813 | ? std::prev(x: RegionEnd) |
814 | : RegionEnd; |
815 | return &*skipDebugInstructionsBackward(It: REnd, Begin: RegionBegin); |
816 | } |
817 | |
818 | void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, |
819 | const MachineBasicBlock *MBB) { |
820 | GCNDownwardRPTracker RPTracker(*LIS); |
821 | |
// If the block has only one successor then the live-ins of that successor
// are the live-outs of the current block. We can reuse the calculated live
// set if the successor will be sent to scheduling past the current block.

// However, due to a bug in LiveInterval analysis it may happen that two
// predecessors of the same successor block have different lane bitmasks for
// a live-out register. Work around that by sticking to a one-to-one
// relationship, i.e. one predecessor with one successor block.
830 | const MachineBasicBlock *OnlySucc = nullptr; |
831 | if (MBB->succ_size() == 1) { |
832 | auto *Candidate = *MBB->succ_begin(); |
833 | if (!Candidate->empty() && Candidate->pred_size() == 1) { |
834 | SlotIndexes *Ind = LIS->getSlotIndexes(); |
835 | if (Ind->getMBBStartIdx(mbb: MBB) < Ind->getMBBStartIdx(mbb: Candidate)) |
836 | OnlySucc = Candidate; |
837 | } |
838 | } |
839 | |
840 | // Scheduler sends regions from the end of the block upwards. |
841 | size_t CurRegion = RegionIdx; |
842 | for (size_t E = Regions.size(); CurRegion != E; ++CurRegion) |
843 | if (Regions[CurRegion].first->getParent() != MBB) |
844 | break; |
845 | --CurRegion; |
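// Regions are recorded from the end of the block upwards, so the highest
// index that still belongs to this block is its topmost region; pressure is
// then tracked downwards from there back to RegionIdx.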
846 | |
847 | auto I = MBB->begin(); |
848 | auto LiveInIt = MBBLiveIns.find(Val: MBB); |
849 | auto &Rgn = Regions[CurRegion]; |
850 | auto *NonDbgMI = &*skipDebugInstructionsForward(It: Rgn.first, End: Rgn.second); |
851 | if (LiveInIt != MBBLiveIns.end()) { |
852 | auto LiveIn = std::move(LiveInIt->second); |
853 | RPTracker.reset(MI: *MBB->begin(), LiveRegs: &LiveIn); |
854 | MBBLiveIns.erase(I: LiveInIt); |
855 | } else { |
856 | I = Rgn.first; |
857 | auto LRS = BBLiveInMap.lookup(Val: NonDbgMI); |
858 | #ifdef EXPENSIVE_CHECKS |
859 | assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS)); |
860 | #endif |
861 | RPTracker.reset(MI: *I, LiveRegs: &LRS); |
862 | } |
863 | |
864 | for (;;) { |
865 | I = RPTracker.getNext(); |
866 | |
867 | if (Regions[CurRegion].first == I || NonDbgMI == I) { |
868 | LiveIns[CurRegion] = RPTracker.getLiveRegs(); |
869 | RPTracker.clearMaxPressure(); |
870 | } |
871 | |
872 | if (Regions[CurRegion].second == I) { |
873 | Pressure[CurRegion] = RPTracker.moveMaxPressure(); |
874 | if (CurRegion-- == RegionIdx) |
875 | break; |
876 | auto &Rgn = Regions[CurRegion]; |
877 | NonDbgMI = &*skipDebugInstructionsForward(It: Rgn.first, End: Rgn.second); |
878 | } |
879 | RPTracker.advanceToNext(); |
880 | RPTracker.advanceBeforeNext(); |
881 | } |
882 | |
883 | if (OnlySucc) { |
884 | if (I != MBB->end()) { |
885 | RPTracker.advanceToNext(); |
886 | RPTracker.advance(End: MBB->end()); |
887 | } |
888 | RPTracker.advanceBeforeNext(); |
889 | MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs(); |
890 | } |
891 | } |
892 | |
893 | DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> |
894 | GCNScheduleDAGMILive::getRegionLiveInMap() const { |
895 | assert(!Regions.empty()); |
896 | std::vector<MachineInstr *> RegionFirstMIs; |
897 | RegionFirstMIs.reserve(n: Regions.size()); |
898 | auto I = Regions.rbegin(), E = Regions.rend(); |
899 | do { |
900 | const MachineBasicBlock *MBB = I->first->getParent(); |
901 | auto *MI = &*skipDebugInstructionsForward(It: I->first, End: I->second); |
902 | RegionFirstMIs.push_back(x: MI); |
903 | do { |
904 | ++I; |
905 | } while (I != E && I->first->getParent() == MBB); |
906 | } while (I != E); |
907 | return getLiveRegMap(R&: RegionFirstMIs, /*After=*/false, LIS&: *LIS); |
908 | } |
909 | |
910 | DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> |
911 | GCNScheduleDAGMILive::getRegionLiveOutMap() const { |
912 | assert(!Regions.empty()); |
913 | std::vector<MachineInstr *> RegionLastMIs; |
914 | RegionLastMIs.reserve(n: Regions.size()); |
915 | for (auto &[RegionBegin, RegionEnd] : reverse(C: Regions)) |
916 | RegionLastMIs.push_back(x: getLastMIForRegion(RegionBegin, RegionEnd)); |
917 | |
918 | return getLiveRegMap(R&: RegionLastMIs, /*After=*/true, LIS&: *LIS); |
919 | } |
920 | |
921 | void RegionPressureMap::buildLiveRegMap() { |
922 | IdxToInstruction.clear(); |
923 | |
924 | RegionLiveRegMap = |
925 | IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap(); |
926 | for (unsigned I = 0; I < DAG->Regions.size(); I++) { |
927 | MachineInstr *RegionKey = |
928 | IsLiveOut |
929 | ? getLastMIForRegion(RegionBegin: DAG->Regions[I].first, RegionEnd: DAG->Regions[I].second) |
930 | : &*DAG->Regions[I].first; |
931 | IdxToInstruction[I] = RegionKey; |
932 | } |
933 | } |
934 | |
935 | void GCNScheduleDAGMILive::finalizeSchedule() { |
936 | // Start actual scheduling here. This function is called by the base |
937 | // MachineScheduler after all regions have been recorded by |
938 | // GCNScheduleDAGMILive::schedule(). |
939 | LiveIns.resize(N: Regions.size()); |
940 | Pressure.resize(N: Regions.size()); |
941 | RegionsWithHighRP.resize(N: Regions.size()); |
942 | RegionsWithExcessRP.resize(N: Regions.size()); |
943 | RegionsWithMinOcc.resize(N: Regions.size()); |
944 | RegionsWithIGLPInstrs.resize(N: Regions.size()); |
945 | RegionsWithHighRP.reset(); |
946 | RegionsWithExcessRP.reset(); |
947 | RegionsWithMinOcc.reset(); |
948 | RegionsWithIGLPInstrs.reset(); |
949 | |
950 | runSchedStages(); |
951 | } |
952 | |
953 | void GCNScheduleDAGMILive::runSchedStages() { |
954 | LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n" ); |
955 | |
956 | if (!Regions.empty()) { |
957 | BBLiveInMap = getRegionLiveInMap(); |
958 | if (GCNTrackers) |
959 | RegionLiveOuts.buildLiveRegMap(); |
960 | } |
961 | |
962 | GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl); |
963 | while (S.advanceStage()) { |
964 | auto Stage = createSchedStage(SchedStageID: S.getCurrentStage()); |
965 | if (!Stage->initGCNSchedStage()) |
966 | continue; |
967 | |
968 | for (auto Region : Regions) { |
969 | RegionBegin = Region.first; |
970 | RegionEnd = Region.second; |
971 | // Setup for scheduling the region and check whether it should be skipped. |
972 | if (!Stage->initGCNRegion()) { |
973 | Stage->advanceRegion(); |
974 | exitRegion(); |
975 | continue; |
976 | } |
977 | |
978 | if (GCNTrackers) { |
979 | GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker(); |
980 | GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker(); |
981 | GCNRPTracker::LiveRegSet *RegionLiveIns = |
982 | &LiveIns[Stage->getRegionIdx()]; |
983 | |
984 | reinterpret_cast<GCNRPTracker *>(DownwardTracker) |
985 | ->reset(MRI_: MRI, LiveRegs_: *RegionLiveIns); |
986 | reinterpret_cast<GCNRPTracker *>(UpwardTracker) |
987 | ->reset(MRI_: MRI, LiveRegs_: RegionLiveOuts.getLiveRegsForRegionIdx( |
988 | RegionIdx: Stage->getRegionIdx())); |
989 | } |
990 | |
991 | ScheduleDAGMILive::schedule(); |
992 | Stage->finalizeGCNRegion(); |
993 | } |
994 | |
995 | Stage->finalizeGCNSchedStage(); |
996 | } |
997 | } |
998 | |
999 | #ifndef NDEBUG |
1000 | raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) { |
1001 | switch (StageID) { |
1002 | case GCNSchedStageID::OccInitialSchedule: |
1003 | OS << "Max Occupancy Initial Schedule" ; |
1004 | break; |
1005 | case GCNSchedStageID::UnclusteredHighRPReschedule: |
1006 | OS << "Unclustered High Register Pressure Reschedule" ; |
1007 | break; |
1008 | case GCNSchedStageID::ClusteredLowOccupancyReschedule: |
1009 | OS << "Clustered Low Occupancy Reschedule" ; |
1010 | break; |
1011 | case GCNSchedStageID::PreRARematerialize: |
1012 | OS << "Pre-RA Rematerialize" ; |
1013 | break; |
1014 | case GCNSchedStageID::ILPInitialSchedule: |
1015 | OS << "Max ILP Initial Schedule" ; |
1016 | break; |
1017 | case GCNSchedStageID::MemoryClauseInitialSchedule: |
1018 | OS << "Max memory clause Initial Schedule" ; |
1019 | break; |
1020 | } |
1021 | |
1022 | return OS; |
1023 | } |
1024 | #endif |
1025 | |
1026 | GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) |
1027 | : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF), |
1028 | MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {} |
1029 | |
1030 | bool GCNSchedStage::initGCNSchedStage() { |
1031 | if (!DAG.LIS) |
1032 | return false; |
1033 | |
1034 | LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n" ); |
1035 | return true; |
1036 | } |
1037 | |
1038 | bool UnclusteredHighRPStage::initGCNSchedStage() { |
1039 | if (DisableUnclusterHighRP) |
1040 | return false; |
1041 | |
1042 | if (!GCNSchedStage::initGCNSchedStage()) |
1043 | return false; |
1044 | |
1045 | if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none()) |
1046 | return false; |
1047 | |
1048 | SavedMutations.swap(x&: DAG.Mutations); |
1049 | DAG.addMutation( |
1050 | Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PreRAReentry)); |
1051 | |
1052 | InitialOccupancy = DAG.MinOccupancy; |
// Aggressively try to reduce register pressure in the unclustered high RP
// stage. Temporarily increase the occupancy target in the region.
1055 | S.SGPRLimitBias = S.HighRPSGPRBias; |
1056 | S.VGPRLimitBias = S.HighRPVGPRBias; |
1057 | if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) |
1058 | MFI.increaseOccupancy(MF, Limit: ++DAG.MinOccupancy); |
1059 | |
1060 | LLVM_DEBUG( |
1061 | dbgs() |
1062 | << "Retrying function scheduling without clustering. " |
1063 | "Aggressivly try to reduce register pressure to achieve occupancy " |
1064 | << DAG.MinOccupancy << ".\n" ); |
1065 | |
1066 | return true; |
1067 | } |
1068 | |
1069 | bool ClusteredLowOccStage::initGCNSchedStage() { |
1070 | if (DisableClusteredLowOccupancy) |
1071 | return false; |
1072 | |
1073 | if (!GCNSchedStage::initGCNSchedStage()) |
1074 | return false; |
1075 | |
1076 | // Don't bother trying to improve ILP in lower RP regions if occupancy has not |
1077 | // been dropped. All regions will have already been scheduled with the ideal |
1078 | // occupancy targets. |
1079 | if (DAG.StartingOccupancy <= DAG.MinOccupancy) |
1080 | return false; |
1081 | |
1082 | LLVM_DEBUG( |
1083 | dbgs() << "Retrying function scheduling with lowest recorded occupancy " |
1084 | << DAG.MinOccupancy << ".\n" ); |
1085 | return true; |
1086 | } |
1087 | |
/// Allows one to easily filter for this stage's debug output.
1089 | #define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;) |
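// Usage example: REMAT_DEBUG(dbgs() << "some message\n");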
1090 | |
1091 | bool PreRARematStage::initGCNSchedStage() { |
// FIXME: This pass will invalidate the cached BBLiveInMap and MBBLiveIns for
// regions in between the defs and the regions we sank the defs to. This will
// need to be fixed if there is another pass after this one.
1095 | assert(!S.hasNextStage()); |
1096 | |
1097 | if (!GCNSchedStage::initGCNSchedStage() || DAG.RegionsWithMinOcc.none() || |
1098 | DAG.Regions.size() == 1) |
1099 | return false; |
1100 | |
1101 | // Before performing any IR modification record the parent region of each MI |
1102 | // and the parent MBB of each region. |
1103 | const unsigned NumRegions = DAG.Regions.size(); |
1104 | RegionBB.reserve(N: NumRegions); |
1105 | for (unsigned I = 0; I < NumRegions; ++I) { |
1106 | RegionBoundaries Region = DAG.Regions[I]; |
1107 | for (auto MI = Region.first; MI != Region.second; ++MI) |
1108 | MIRegion.insert(KV: {&*MI, I}); |
1109 | RegionBB.push_back(Elt: Region.first->getParent()); |
1110 | } |
1111 | |
1112 | if (!canIncreaseOccupancyOrReduceSpill()) |
1113 | return false; |
1114 | |
1115 | // Rematerialize identified instructions and update scheduler's state. |
1116 | rematerialize(); |
1117 | if (GCNTrackers) |
1118 | DAG.RegionLiveOuts.buildLiveRegMap(); |
1119 | REMAT_DEBUG( |
1120 | dbgs() << "Retrying function scheduling with new min. occupancy of " |
1121 | << AchievedOcc << " from rematerializing (original was " |
1122 | << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n" ); |
1123 | if (AchievedOcc > DAG.MinOccupancy) { |
1124 | DAG.MinOccupancy = AchievedOcc; |
1125 | SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); |
1126 | MFI.increaseOccupancy(MF, Limit: DAG.MinOccupancy); |
1127 | } |
1128 | return true; |
1129 | } |
1130 | |
1131 | void GCNSchedStage::finalizeGCNSchedStage() { |
1132 | DAG.finishBlock(); |
1133 | LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n" ); |
1134 | } |
1135 | |
1136 | void UnclusteredHighRPStage::finalizeGCNSchedStage() { |
1137 | SavedMutations.swap(x&: DAG.Mutations); |
1138 | S.SGPRLimitBias = S.VGPRLimitBias = 0; |
1139 | if (DAG.MinOccupancy > InitialOccupancy) { |
1140 | for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX) |
1141 | DAG.RegionsWithMinOcc[IDX] = |
1142 | DAG.Pressure[IDX].getOccupancy( |
1143 | ST: DAG.ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()) == DAG.MinOccupancy; |
1144 | |
1145 | LLVM_DEBUG(dbgs() << StageID |
1146 | << " stage successfully increased occupancy to " |
1147 | << DAG.MinOccupancy << '\n'); |
1148 | } |
1149 | |
1150 | GCNSchedStage::finalizeGCNSchedStage(); |
1151 | } |
1152 | |
1153 | bool GCNSchedStage::initGCNRegion() { |
1154 | // Check whether this new region is also a new block. |
1155 | if (DAG.RegionBegin->getParent() != CurrentMBB) |
1156 | setupNewBlock(); |
1157 | |
1158 | unsigned NumRegionInstrs = std::distance(first: DAG.begin(), last: DAG.end()); |
1159 | DAG.enterRegion(bb: CurrentMBB, begin: DAG.begin(), end: DAG.end(), regioninstrs: NumRegionInstrs); |
1160 | |
1161 | // Skip empty scheduling regions (0 or 1 schedulable instructions). |
1162 | if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(x: DAG.end())) |
1163 | return false; |
1164 | |
1165 | LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n" ); |
1166 | LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB) |
1167 | << " " << CurrentMBB->getName() |
1168 | << "\n From: " << *DAG.begin() << " To: " ; |
1169 | if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd; |
1170 | else dbgs() << "End" ; |
1171 | dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); |
1172 | |
1173 | // Save original instruction order before scheduling for possible revert. |
1174 | Unsched.clear(); |
1175 | Unsched.reserve(n: DAG.NumRegionInstrs); |
1176 | if (StageID == GCNSchedStageID::OccInitialSchedule || |
1177 | StageID == GCNSchedStageID::ILPInitialSchedule) { |
1178 | const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII); |
1179 | for (auto &I : DAG) { |
1180 | Unsched.push_back(x: &I); |
1181 | if (SII->isIGLPMutationOnly(Opcode: I.getOpcode())) |
1182 | DAG.RegionsWithIGLPInstrs[RegionIdx] = true; |
1183 | } |
1184 | } else { |
1185 | for (auto &I : DAG) |
1186 | Unsched.push_back(x: &I); |
1187 | } |
1188 | |
1189 | PressureBefore = DAG.Pressure[RegionIdx]; |
1190 | |
1191 | LLVM_DEBUG( |
1192 | dbgs() << "Pressure before scheduling:\nRegion live-ins:" |
1193 | << print(DAG.LiveIns[RegionIdx], DAG.MRI) |
1194 | << "Region live-in pressure: " |
1195 | << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx])) |
1196 | << "Region register pressure: " << print(PressureBefore)); |
1197 | |
1198 | S.HasHighPressure = false; |
1199 | S.KnownExcessRP = isRegionWithExcessRP(); |
1200 | |
1201 | if (DAG.RegionsWithIGLPInstrs[RegionIdx] && |
1202 | StageID != GCNSchedStageID::UnclusteredHighRPReschedule) { |
1203 | SavedMutations.clear(); |
1204 | SavedMutations.swap(x&: DAG.Mutations); |
1205 | bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule || |
1206 | StageID == GCNSchedStageID::ILPInitialSchedule; |
1207 | DAG.addMutation(Mutation: createIGroupLPDAGMutation( |
1208 | Phase: IsInitialStage ? AMDGPU::SchedulingPhase::Initial |
1209 | : AMDGPU::SchedulingPhase::PreRAReentry)); |
1210 | } |
1211 | |
1212 | return true; |
1213 | } |
1214 | |
1215 | bool UnclusteredHighRPStage::initGCNRegion() { |
1216 | // Only reschedule regions with the minimum occupancy or regions that may have |
1217 | // spilling (excess register pressure). |
1218 | if ((!DAG.RegionsWithMinOcc[RegionIdx] || |
1219 | DAG.MinOccupancy <= InitialOccupancy) && |
1220 | !DAG.RegionsWithExcessRP[RegionIdx]) |
1221 | return false; |
1222 | |
1223 | return GCNSchedStage::initGCNRegion(); |
1224 | } |
1225 | |
1226 | bool ClusteredLowOccStage::initGCNRegion() { |
// We may need to reschedule this region if it wasn't rescheduled in the last
// stage, or if we found it was testing critical register pressure limits in
// the unclustered reschedule stage. The latter is because we may not have
// been able to raise the min occupancy in the previous stage, so the region
// may be overly constrained even if it was already rescheduled.
1232 | if (!DAG.RegionsWithHighRP[RegionIdx]) |
1233 | return false; |
1234 | |
1235 | return GCNSchedStage::initGCNRegion(); |
1236 | } |
1237 | |
1238 | bool PreRARematStage::initGCNRegion() { |
1239 | return RescheduleRegions[RegionIdx] && GCNSchedStage::initGCNRegion(); |
1240 | } |
1241 | |
1242 | void GCNSchedStage::setupNewBlock() { |
1243 | if (CurrentMBB) |
1244 | DAG.finishBlock(); |
1245 | |
1246 | CurrentMBB = DAG.RegionBegin->getParent(); |
1247 | DAG.startBlock(bb: CurrentMBB); |
// Get the real RP for the region if it hasn't been calculated before. After
// the initial schedule stage, the real RP will be collected after scheduling.
1250 | if (StageID == GCNSchedStageID::OccInitialSchedule || |
1251 | StageID == GCNSchedStageID::ILPInitialSchedule || |
1252 | StageID == GCNSchedStageID::MemoryClauseInitialSchedule) |
1253 | DAG.computeBlockPressure(RegionIdx, MBB: CurrentMBB); |
1254 | } |
1255 | |
1256 | void GCNSchedStage::finalizeGCNRegion() { |
1257 | DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); |
1258 | if (S.HasHighPressure) |
1259 | DAG.RegionsWithHighRP[RegionIdx] = true; |
1260 | |
1261 | // Revert scheduling if we have dropped occupancy or there is some other |
1262 | // reason that the original schedule is better. |
1263 | checkScheduling(); |
1264 | |
1265 | if (DAG.RegionsWithIGLPInstrs[RegionIdx] && |
1266 | StageID != GCNSchedStageID::UnclusteredHighRPReschedule) |
1267 | SavedMutations.swap(x&: DAG.Mutations); |
1268 | |
1269 | DAG.exitRegion(); |
1270 | advanceRegion(); |
1271 | } |
1272 | |
1273 | void GCNSchedStage::checkScheduling() { |
1274 | // Check the results of scheduling. |
1275 | PressureAfter = DAG.getRealRegPressure(RegionIdx); |
1276 | |
1277 | LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter)); |
1278 | LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n" ); |
1279 | |
1280 | unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize(); |
1281 | |
1282 | if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && |
1283 | PressureAfter.getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) { |
1284 | DAG.Pressure[RegionIdx] = PressureAfter; |
1285 | DAG.RegionsWithMinOcc[RegionIdx] = |
1286 | PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) == |
1287 | DAG.MinOccupancy; |
1288 | |
1289 | // Early out if we have achieved the occupancy target. |
1290 | LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n" ); |
1291 | return; |
1292 | } |
1293 | |
1294 | unsigned TargetOccupancy = std::min( |
1295 | a: S.getTargetOccupancy(), b: ST.getOccupancyWithWorkGroupSizes(MF).second); |
1296 | unsigned WavesAfter = std::min( |
1297 | a: TargetOccupancy, b: PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize)); |
1298 | unsigned WavesBefore = std::min( |
1299 | a: TargetOccupancy, b: PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize)); |
1300 | LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore |
1301 | << ", after " << WavesAfter << ".\n" ); |
1302 | |
1303 | // We may not be able to keep the current target occupancy because of the |
1304 | // region we just scheduled. We may still revert the schedule if the occupancy |
1305 | // before was higher, or if the current schedule has register pressure higher |
1306 | // than the excess limits, which could lead to more spilling. |
1307 | unsigned NewOccupancy = std::max(a: WavesAfter, b: WavesBefore); |
1308 | |
1309 | // Allow memory bound functions to drop to 4 waves if not limited by an |
1310 | // attribute. |
1311 | if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy && |
1312 | WavesAfter >= MFI.getMinAllowedOccupancy()) { |
1313 | LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to " |
1314 | << MFI.getMinAllowedOccupancy() << " waves\n" ); |
1315 | NewOccupancy = WavesAfter; |
1316 | } |
1317 | |
1318 | if (NewOccupancy < DAG.MinOccupancy) { |
1319 | DAG.MinOccupancy = NewOccupancy; |
1320 | MFI.limitOccupancy(Limit: DAG.MinOccupancy); |
1321 | DAG.RegionsWithMinOcc.reset(); |
1322 | LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " |
1323 | << DAG.MinOccupancy << ".\n" ); |
1324 | } |
1325 | // The maximum number of arch VGPRs with a non-unified register file, or the |
1326 | // maximum number of VGPRs + AGPRs in the unified register file case. |
1327 | unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); |
1328 | // The maximum number of arch VGPRs for both unified and non-unified register |
1329 | // files. |
1330 | unsigned MaxArchVGPRs = std::min(a: MaxVGPRs, b: ST.getAddressableNumArchVGPRs()); |
1331 | unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); |
1332 | |
1333 | if (PressureAfter.getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts()) > MaxVGPRs || |
1334 | PressureAfter.getArchVGPRNum() > MaxArchVGPRs || |
1335 | PressureAfter.getAGPRNum() > MaxArchVGPRs || |
1336 | PressureAfter.getSGPRNum() > MaxSGPRs) { |
1337 | DAG.RegionsWithHighRP[RegionIdx] = true; |
1338 | DAG.RegionsWithExcessRP[RegionIdx] = true; |
1339 | } |
1340 | |
1341 | // Revert if this region's schedule would cause a drop in occupancy or |
1342 | // spilling. |
1343 | if (shouldRevertScheduling(WavesAfter)) { |
1344 | revertScheduling(); |
1345 | } else { |
1346 | DAG.Pressure[RegionIdx] = PressureAfter; |
1347 | DAG.RegionsWithMinOcc[RegionIdx] = |
1348 | PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) == |
1349 | DAG.MinOccupancy; |
1350 | } |
1351 | } |
1352 | |
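// Models the earliest cycle at which SU could issue: the maximum, over all
// register-dependence predecessors, of the predecessor's ready cycle plus its
// instruction latency (and never earlier than CurrCycle). The result is also
// memoized in ReadyCycles, keyed by the SUnit's NodeNum.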
1353 | unsigned |
1354 | GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle, |
1355 | DenseMap<unsigned, unsigned> &ReadyCycles, |
1356 | const TargetSchedModel &SM) { |
1357 | unsigned ReadyCycle = CurrCycle; |
1358 | for (auto &D : SU.Preds) { |
1359 | if (D.isAssignedRegDep()) { |
1360 | MachineInstr *DefMI = D.getSUnit()->getInstr(); |
1361 | unsigned Latency = SM.computeInstrLatency(MI: DefMI); |
1362 | unsigned DefReady = ReadyCycles[DAG.getSUnit(MI: DefMI)->NodeNum]; |
1363 | ReadyCycle = std::max(a: ReadyCycle, b: DefReady + Latency); |
1364 | } |
1365 | } |
1366 | ReadyCycles[SU.NodeNum] = ReadyCycle; |
1367 | return ReadyCycle; |
1368 | } |
1369 | |
1370 | #ifndef NDEBUG |
1371 | struct EarlierIssuingCycle { |
1372 | bool operator()(std::pair<MachineInstr *, unsigned> A, |
1373 | std::pair<MachineInstr *, unsigned> B) const { |
1374 | return A.second < B.second; |
1375 | } |
1376 | }; |
1377 | |
1378 | static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>, |
1379 | EarlierIssuingCycle> &ReadyCycles) { |
1380 | if (ReadyCycles.empty()) |
1381 | return; |
1382 | unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber(); |
1383 | dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum |
1384 | << " ##################\n# Cycle #\t\t\tInstruction " |
1385 | " " |
1386 | " \n" ; |
1387 | unsigned IPrev = 1; |
1388 | for (auto &I : ReadyCycles) { |
1389 | if (I.second > IPrev + 1) |
1390 | dbgs() << "****************************** BUBBLE OF " << I.second - IPrev |
1391 | << " CYCLES DETECTED ******************************\n\n" ; |
1392 | dbgs() << "[ " << I.second << " ] : " << *I.first << "\n" ; |
1393 | IPrev = I.second; |
1394 | } |
1395 | } |
1396 | #endif |
1397 | |
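// Walks the scheduled SUnit sequence in order and accumulates "bubble" cycles,
// i.e. cycles where the next instruction is not yet ready because of latency
// from its register-dependence predecessors. The returned ScheduleMetrics is
// built from the total schedule length (CurrCycle) and the bubble count; the
// debug output below reports the bubbles scaled by ScheduleMetrics::ScaleFactor
// over the schedule length.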
1398 | ScheduleMetrics |
1399 | GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) { |
1400 | #ifndef NDEBUG |
1401 | std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> |
1402 | ReadyCyclesSorted; |
1403 | #endif |
1404 | const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); |
1405 | unsigned SumBubbles = 0; |
1406 | DenseMap<unsigned, unsigned> ReadyCycles; |
1407 | unsigned CurrCycle = 0; |
1408 | for (auto &SU : InputSchedule) { |
1409 | unsigned ReadyCycle = |
1410 | computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM); |
1411 | SumBubbles += ReadyCycle - CurrCycle; |
1412 | #ifndef NDEBUG |
1413 | ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle)); |
1414 | #endif |
1415 | CurrCycle = ++ReadyCycle; |
1416 | } |
1417 | #ifndef NDEBUG |
1418 | LLVM_DEBUG( |
1419 | printScheduleModel(ReadyCyclesSorted); |
1420 | dbgs() << "\n\t" |
1421 | << "Metric: " |
1422 | << (SumBubbles |
1423 | ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle |
1424 | : 1) |
1425 | << "\n\n" ); |
1426 | #endif |
1427 | |
1428 | return ScheduleMetrics(CurrCycle, SumBubbles); |
1429 | } |
1430 | |
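// Same bubble accounting as above, but walks the region's current MachineInstr
// order (skipping instructions without an associated SUnit) instead of an
// explicit SUnit schedule. This overload is used to evaluate the just-produced
// schedule in UnclusteredHighRPStage::shouldRevertScheduling.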
1431 | ScheduleMetrics |
1432 | GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) { |
1433 | #ifndef NDEBUG |
1434 | std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> |
1435 | ReadyCyclesSorted; |
1436 | #endif |
1437 | const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); |
1438 | unsigned SumBubbles = 0; |
1439 | DenseMap<unsigned, unsigned> ReadyCycles; |
1440 | unsigned CurrCycle = 0; |
1441 | for (auto &MI : DAG) { |
1442 | SUnit *SU = DAG.getSUnit(MI: &MI); |
1443 | if (!SU) |
1444 | continue; |
1445 | unsigned ReadyCycle = |
1446 | computeSUnitReadyCycle(SU: *SU, CurrCycle, ReadyCycles, SM); |
1447 | SumBubbles += ReadyCycle - CurrCycle; |
1448 | #ifndef NDEBUG |
1449 | ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle)); |
1450 | #endif |
1451 | CurrCycle = ++ReadyCycle; |
1452 | } |
1453 | #ifndef NDEBUG |
1454 | LLVM_DEBUG( |
1455 | printScheduleModel(ReadyCyclesSorted); |
1456 | dbgs() << "\n\t" |
1457 | << "Metric: " |
1458 | << (SumBubbles |
1459 | ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle |
1460 | : 1) |
1461 | << "\n\n" ); |
1462 | #endif |
1463 | |
1464 | return ScheduleMetrics(CurrCycle, SumBubbles); |
1465 | } |
1466 | |
1467 | bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { |
1468 | if (WavesAfter < DAG.MinOccupancy) |
1469 | return true; |
1470 | |
1471 | // For dynamic VGPR mode, we don't want to waste any VGPR blocks. |
1472 | if (DAG.MFI.isDynamicVGPREnabled()) { |
1473 | unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( |
1474 | STI: &ST, NumVGPRs: DAG.MFI.getDynamicVGPRBlockSize(), |
1475 | DynamicVGPRBlockSize: PressureBefore.getVGPRNum(UnifiedVGPRFile: false)); |
1476 | unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( |
1477 | STI: &ST, NumVGPRs: DAG.MFI.getDynamicVGPRBlockSize(), |
1478 | DynamicVGPRBlockSize: PressureAfter.getVGPRNum(UnifiedVGPRFile: false)); |
1479 | if (BlocksAfter > BlocksBefore) |
1480 | return true; |
1481 | } |
1482 | |
1483 | return false; |
1484 | } |
1485 | |
1486 | bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { |
1487 | if (PressureAfter == PressureBefore) |
1488 | return false; |
1489 | |
1490 | if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) |
1491 | return true; |
1492 | |
1493 | if (mayCauseSpilling(WavesAfter)) |
1494 | return true; |
1495 | |
1496 | return false; |
1497 | } |
1498 | |
1499 | bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { |
1500 | // If RP is not reduced in the unclustered reschedule stage, revert to the |
1501 | // old schedule. |
1502 | if ((WavesAfter <= |
1503 | PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()) && |
1504 | mayCauseSpilling(WavesAfter)) || |
1505 | GCNSchedStage::shouldRevertScheduling(WavesAfter)) { |
1506 | LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n" ); |
1507 | return true; |
1508 | } |
1509 | |
1510 | // Do not attempt to relax the schedule even more if we are already spilling. |
1511 | if (isRegionWithExcessRP()) |
1512 | return false; |
1513 | |
1514 | LLVM_DEBUG( |
1515 | dbgs() |
1516 | << "\n\t *** In shouldRevertScheduling ***\n" |
1517 | << " *********** BEFORE UnclusteredHighRPStage ***********\n" ); |
1518 | ScheduleMetrics MBefore = getScheduleMetrics(InputSchedule: DAG.SUnits); |
1519 | LLVM_DEBUG( |
1520 | dbgs() |
1521 | << "\n *********** AFTER UnclusteredHighRPStage ***********\n" ); |
1522 | ScheduleMetrics MAfter = getScheduleMetrics(DAG); |
1523 | unsigned OldMetric = MBefore.getMetric(); |
1524 | unsigned NewMetric = MAfter.getMetric(); |
1525 | unsigned WavesBefore = std::min( |
1526 | a: S.getTargetOccupancy(), |
1527 | b: PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize())); |
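// Heuristic profitability check, scaled by ScheduleMetrics::ScaleFactor (100):
// Profit ~= (WavesAfter / WavesBefore) *
//           ((OldMetric + ScheduleMetricBias) / NewMetric) * ScaleFactor.
// The unclustered schedule is kept only when Profit >= ScaleFactor, i.e. when
// the combined occupancy gain and latency-metric change at least break even.
// Illustrative numbers only: WavesBefore = 4, WavesAfter = 5, OldMetric = 20,
// NewMetric = 30 with the default bias of 10 gives
// Profit = ((5 * 100 / 4) * ((20 + 10) * 100) / 30) / 100 = 125 >= 100,
// so the new schedule would be kept.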
1528 | unsigned Profit = |
1529 | ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore * |
1530 | ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) / |
1531 | NewMetric) / |
1532 | ScheduleMetrics::ScaleFactor; |
1533 | LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after " |
1534 | << MAfter << "Profit: " << Profit << "\n" ); |
1535 | return Profit < ScheduleMetrics::ScaleFactor; |
1536 | } |
1537 | |
1538 | bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { |
1539 | if (PressureAfter == PressureBefore) |
1540 | return false; |
1541 | |
1542 | if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) |
1543 | return true; |
1544 | |
1545 | if (mayCauseSpilling(WavesAfter)) |
1546 | return true; |
1547 | |
1548 | return false; |
1549 | } |
1550 | |
1551 | bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { |
1552 | return GCNSchedStage::shouldRevertScheduling(WavesAfter) || |
1553 | mayCauseSpilling(WavesAfter) || |
1554 | (IncreaseOccupancy && WavesAfter < TargetOcc); |
1555 | } |
1556 | |
1557 | bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { |
1558 | if (mayCauseSpilling(WavesAfter)) |
1559 | return true; |
1560 | |
1561 | return false; |
1562 | } |
1563 | |
1564 | bool MemoryClauseInitialScheduleStage::shouldRevertScheduling( |
1565 | unsigned WavesAfter) { |
1566 | return mayCauseSpilling(WavesAfter); |
1567 | } |
1568 | |
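// A region is considered at risk of (more) spilling when the post-scheduling
// occupancy is at (or below) the function's minimum waves/EU, the region
// already exceeds the excess-RP limits, and the new pressure is not strictly
// lower than the old one.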
1569 | bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { |
1570 | if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() && |
1571 | !PressureAfter.less(MF, O: PressureBefore)) { |
1572 | LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n" ); |
1573 | return true; |
1574 | } |
1575 | |
1576 | return false; |
1577 | } |
1578 | |
1579 | void GCNSchedStage::revertScheduling() { |
1580 | DAG.RegionsWithMinOcc[RegionIdx] = |
1581 | PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()) == |
1582 | DAG.MinOccupancy; |
1583 | LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n" ); |
1584 | DAG.RegionEnd = DAG.RegionBegin; |
1585 | int SkippedDebugInstr = 0; |
1586 | for (MachineInstr *MI : Unsched) { |
1587 | if (MI->isDebugInstr()) { |
1588 | ++SkippedDebugInstr; |
1589 | continue; |
1590 | } |
1591 | |
1592 | if (MI->getIterator() != DAG.RegionEnd) { |
1593 | DAG.BB->splice(Where: DAG.RegionEnd, Other: DAG.BB, From: MI); |
1594 | if (!MI->isDebugInstr()) |
1595 | DAG.LIS->handleMove(MI&: *MI, UpdateFlags: true); |
1596 | } |
1597 | |
1598 | // Reset read-undef flags and update them later. |
1599 | for (auto &Op : MI->all_defs()) |
1600 | Op.setIsUndef(false); |
1601 | RegisterOperands RegOpers; |
1602 | RegOpers.collect(MI: *MI, TRI: *DAG.TRI, MRI: DAG.MRI, TrackLaneMasks: DAG.ShouldTrackLaneMasks, IgnoreDead: false); |
1603 | if (!MI->isDebugInstr()) { |
1604 | if (DAG.ShouldTrackLaneMasks) { |
1605 | // Adjust liveness and add missing dead+read-undef flags. |
1606 | SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(Instr: *MI).getRegSlot(); |
1607 | RegOpers.adjustLaneLiveness(LIS: *DAG.LIS, MRI: DAG.MRI, Pos: SlotIdx, AddFlagsMI: MI); |
1608 | } else { |
1609 | // Adjust for missing dead-def flags. |
1610 | RegOpers.detectDeadDefs(MI: *MI, LIS: *DAG.LIS); |
1611 | } |
1612 | } |
1613 | DAG.RegionEnd = MI->getIterator(); |
1614 | ++DAG.RegionEnd; |
1615 | LLVM_DEBUG(dbgs() << "Scheduling " << *MI); |
1616 | } |
1617 | |
1618 | // After reverting the schedule, debug instrs will now be at the end of the |
1619 | // block and RegionEnd will point to the first debug instr. Increment RegionEnd |
1620 | // past debug instrs to the actual end of the scheduling region. |
1621 | while (SkippedDebugInstr-- > 0) |
1622 | ++DAG.RegionEnd; |
1623 | |
1624 | // If Unsched.front() instruction is a debug instruction, this will actually |
1625 | // shrink the region since we moved all debug instructions to the end of the |
1626 | // block. Find the first instruction that is not a debug instruction. |
1627 | DAG.RegionBegin = Unsched.front()->getIterator(); |
1628 | if (DAG.RegionBegin->isDebugInstr()) { |
1629 | for (MachineInstr *MI : Unsched) { |
1630 | if (MI->isDebugInstr()) |
1631 | continue; |
1632 | DAG.RegionBegin = MI->getIterator(); |
1633 | break; |
1634 | } |
1635 | } |
1636 | |
1637 | // Then move the debug instructions back into their correct place and set |
1638 | // RegionBegin and RegionEnd if needed. |
1639 | DAG.placeDebugValues(); |
1640 | |
1641 | DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); |
1642 | } |
1643 | |
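// Conservatively checks that rematerializing InstToRemat at RematIdx would not
// extend any live range: every virtual register it reads must hold the same
// value (same VNInfo, with live subranges for the used lanes) at RematIdx as it
// does at the original definition point OriginalIdx. Physical register uses are
// only tolerated when they are constant or ignorable.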
1644 | bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat, |
1645 | SlotIndex OriginalIdx, |
1646 | SlotIndex RematIdx) const { |
1647 | |
1648 | LiveIntervals *LIS = DAG.LIS; |
1649 | MachineRegisterInfo &MRI = DAG.MRI; |
1650 | OriginalIdx = OriginalIdx.getRegSlot(EC: true); |
1651 | RematIdx = std::max(a: RematIdx, b: RematIdx.getRegSlot(EC: true)); |
1652 | for (const MachineOperand &MO : InstToRemat->operands()) { |
1653 | if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) |
1654 | continue; |
1655 | |
1656 | if (!MO.getReg().isVirtual()) { |
1657 | // Do not attempt to reason about PhysRegs. |
1658 | // TODO: better analysis of PhysReg liveness. |
1659 | if (!DAG.MRI.isConstantPhysReg(PhysReg: MO.getReg()) && |
1660 | !DAG.TII->isIgnorableUse(MO)) |
1661 | return false; |
1662 | |
1663 | // Constant PhysRegs and IgnorableUses are okay |
1664 | continue; |
1665 | } |
1666 | |
1667 | LiveInterval &LI = LIS->getInterval(Reg: MO.getReg()); |
1668 | const VNInfo *OVNI = LI.getVNInfoAt(Idx: OriginalIdx); |
1669 | assert(OVNI); |
1670 | |
1671 | // Don't allow rematerialization immediately after the original def. |
1672 | // It would be incorrect if InstToRemat redefines the register. |
1673 | // See PR14098. |
1674 | if (SlotIndex::isSameInstr(A: OriginalIdx, B: RematIdx)) |
1675 | return false; |
1676 | |
1677 | if (OVNI != LI.getVNInfoAt(Idx: RematIdx)) |
1678 | return false; |
1679 | |
1680 | // Check that subrange is live at RematIdx. |
1681 | if (LI.hasSubRanges()) { |
1682 | const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); |
1683 | unsigned SubReg = MO.getSubReg(); |
1684 | LaneBitmask LM = SubReg ? TRI->getSubRegIndexLaneMask(SubIdx: SubReg) |
1685 | : MRI.getMaxLaneMaskForVReg(Reg: MO.getReg()); |
1686 | for (LiveInterval::SubRange &SR : LI.subranges()) { |
1687 | if ((SR.LaneMask & LM).none()) |
1688 | continue; |
1689 | if (!SR.liveAt(index: RematIdx)) |
1690 | return false; |
1691 | |
1692 | // Early exit if all used lanes are checked. No need to continue. |
1693 | LM &= ~SR.LaneMask; |
1694 | if (LM.none()) |
1695 | break; |
1696 | } |
1697 | } |
1698 | } |
1699 | return true; |
1700 | } |
1701 | |
1702 | bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { |
1703 | REMAT_DEBUG({ |
1704 | dbgs() << "Collecting rematerializable instructions in " ; |
1705 | MF.getFunction().printAsOperand(dbgs(), false); |
1706 | dbgs() << '\n'; |
1707 | }); |
1708 | |
1709 | // Maps optimizable regions (i.e., regions at minimum and register-limited |
1710 | // occupancy, or regions with spilling) to the target RP we would like to |
1711 | // reach. |
1712 | DenseMap<unsigned, GCNRPTarget> OptRegions; |
1713 | const Function &F = MF.getFunction(); |
1714 | unsigned DynamicVGPRBlockSize = |
1715 | MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); |
1716 | |
1717 | std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F); |
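  // Register budgets for the two goals of this stage: the "NoSpill" limits are
  // the per-function maxima (exceeding them is treated by this stage as
  // spilling), while the "IncOcc" limits are what pressure must stay under to
  // gain one wave of occupancy over the current minimum.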
1718 | const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F); |
1719 | const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F); |
1720 | const unsigned MaxSGPRsIncOcc = |
1721 | ST.getMaxNumSGPRs(WavesPerEU: DAG.MinOccupancy + 1, Addressable: false); |
1722 | const unsigned MaxVGPRsIncOcc = |
1723 | ST.getMaxNumVGPRs(WavesPerEU: DAG.MinOccupancy + 1, DynamicVGPRBlockSize); |
1724 | IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy; |
1725 | |
1726 | // Collect optimizable regions. If there is spilling in any region we will |
1727 | // just try to reduce spilling. Otherwise we will try to increase occupancy by |
1728 | // one in the whole function. |
1729 | for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { |
1730 | GCNRegPressure &RP = DAG.Pressure[I]; |
1731 | // We allow ArchVGPR or AGPR savings to count as savings of the other kind |
1732 | // of VGPR only when trying to eliminate spilling. We cannot do this when |
1733 | // trying to increase occupancy since VGPR class swaps only occur later in |
1734 | // the register allocator, i.e., the scheduler will not be able to reason |
1735 | // about these savings and will not report an increase in the achievable |
1736 | // occupancy, triggering rollbacks. |
1737 | GCNRPTarget Target(MaxSGPRsNoSpill, MaxVGPRsNoSpill, MF, RP, |
1738 | /*CombineVGPRSavings=*/true); |
1739 | if (!Target.satisfied() && IncreaseOccupancy) { |
1740 | // There is spilling in the region and we were so far trying to increase |
1741 | // occupancy. Stop trying that and focus on reducing spilling. |
1742 | IncreaseOccupancy = false; |
1743 | OptRegions.clear(); |
1744 | } else if (IncreaseOccupancy) { |
1745 | // There is no spilling in the region, try to increase occupancy. |
1746 | Target = GCNRPTarget(MaxSGPRsIncOcc, MaxVGPRsIncOcc, MF, RP, |
1747 | /*CombineVGPRSavings=*/false); |
1748 | } |
1749 | if (!Target.satisfied()) |
1750 | OptRegions.insert(KV: {I, Target}); |
1751 | } |
1752 | if (OptRegions.empty()) |
1753 | return false; |
1754 | |
1755 | #ifndef NDEBUG |
1756 | if (IncreaseOccupancy) { |
1757 | REMAT_DEBUG(dbgs() << "Occupancy minimal (" << DAG.MinOccupancy |
1758 | << ") in regions:\n" ); |
1759 | } else { |
1760 | REMAT_DEBUG(dbgs() << "Spilling w.r.t. minimum target occupancy (" |
1761 | << WavesPerEU.first << ") in regions:\n" ); |
1762 | } |
1763 | for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { |
1764 | if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) |
1765 | REMAT_DEBUG(dbgs() << " [" << I << "] " << OptIt->getSecond() << '\n'); |
1766 | } |
1767 | #endif |
1768 | |
1769 | // When we are reducing spilling, the target is the minimum target number of |
1770 | // waves/EU determined by the subtarget. In cases where either "amdgpu-num-sgpr" |
1771 | // or "amdgpu-num-vgpr" is set on the function, the current minimum region |
1772 | // occupancy may be higher than that minimum target. |
1773 | TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 |
1774 | : std::max(a: DAG.MinOccupancy, b: WavesPerEU.first); |
1775 | |
1776 | // Accounts for a reduction in RP in an optimizable region. Returns whether we |
1777 | // estimate that we have identified enough rematerialization opportunities to |
1778 | // achieve our goal, and sets Progress to true when this particular reduction |
1779 | // in pressure was helpful toward that goal. |
1780 | auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask, |
1781 | bool &Progress) -> bool { |
1782 | GCNRPTarget &Target = OptIt->getSecond(); |
1783 | if (!Target.isSaveBeneficial(Reg, MRI: DAG.MRI)) |
1784 | return false; |
1785 | Progress = true; |
1786 | Target.saveReg(Reg, Mask, MRI: DAG.MRI); |
1787 | if (Target.satisfied()) |
1788 | OptRegions.erase(OptIt->getFirst()); |
1789 | return OptRegions.empty(); |
1790 | }; |
1791 | |
1792 | // We need up-to-date live-out info to query live-out register masks in |
1793 | // regions containing rematerializable instructions. |
1794 | DAG.RegionLiveOuts.buildLiveRegMap(); |
1795 | |
1796 | // Cache set of registers that are going to be rematerialized. |
1797 | DenseSet<unsigned> RematRegs; |
1798 | |
1799 | // Identify rematerializable instructions in the function. |
1800 | for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { |
1801 | auto Region = DAG.Regions[I]; |
1802 | for (auto MI = Region.first; MI != Region.second; ++MI) { |
1803 | // The instruction must be trivially rematerializable. |
1804 | MachineInstr &DefMI = *MI; |
1805 | if (!isTriviallyReMaterializable(MI: DefMI)) |
1806 | continue; |
1807 | |
1808 | // We only support rematerializing virtual registers with one definition. |
1809 | Register Reg = DefMI.getOperand(i: 0).getReg(); |
1810 | if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(RegNo: Reg)) |
1811 | continue; |
1812 | |
1813 | // We only care to rematerialize the instruction if it has a single |
1814 | // non-debug user in a different region. The using MI may not belong to a |
1815 | // region if it is a lone region terminator. |
1816 | MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(RegNo: Reg); |
1817 | if (!UseMI) |
1818 | continue; |
1819 | auto UseRegion = MIRegion.find(Val: UseMI); |
1820 | if (UseRegion != MIRegion.end() && UseRegion->second == I) |
1821 | continue; |
1822 | |
1823 | // Do not rematerialize an instruction if it uses or is used by an |
1824 | // instruction that we have designated for rematerialization. |
1825 | // FIXME: Allow for rematerialization chains: this requires 1. updating |
1826 | // remat points to account for uses that are rematerialized, and 2. either |
1827 | // rematerializing the candidates in careful ordering, or deferring the |
1828 | // MBB RP walk until the entire chain has been rematerialized. |
1829 | if (Rematerializations.contains(Key: UseMI) || |
1830 | llvm::any_of(Range: DefMI.operands(), P: [&RematRegs](MachineOperand &MO) { |
1831 | return MO.isReg() && RematRegs.contains(V: MO.getReg()); |
1832 | })) |
1833 | continue; |
1834 | |
1835 | // Do not rematerialize an instruction if it uses registers that aren't |
1836 | // available at its use. This ensures that we are not extending any live |
1837 | // range while rematerializing. |
1838 | SlotIndex DefIdx = DAG.LIS->getInstructionIndex(Instr: DefMI); |
1839 | SlotIndex UseIdx = DAG.LIS->getInstructionIndex(Instr: *UseMI).getRegSlot(EC: true); |
1840 | if (!allUsesAvailableAt(InstToRemat: &DefMI, OriginalIdx: DefIdx, RematIdx: UseIdx)) |
1841 | continue; |
1842 | |
1843 | REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI); |
1844 | RematInstruction &Remat = |
1845 | Rematerializations.try_emplace(Key: &DefMI, Args&: UseMI).first->second; |
1846 | |
1847 | bool RematUseful = false; |
1848 | if (auto It = OptRegions.find(Val: I); It != OptRegions.end()) { |
1849 | // Optimistically consider that moving the instruction out of its |
1850 | // defining region will reduce RP in the latter; this assumes that |
1851 | // maximum RP in the region is reached somewhere between the defining |
1852 | // instruction and the end of the region. |
1853 | REMAT_DEBUG(dbgs() << " Defining region is optimizable\n" ); |
1854 | LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: I)[Reg]; |
1855 | if (ReduceRPInRegion(It, Reg, Mask, RematUseful)) |
1856 | return true; |
1857 | } |
1858 | |
1859 | for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) { |
1860 | // We are only collecting regions in which the register is a live-in |
1861 | // (and may be live-through). |
1862 | auto It = DAG.LiveIns[LIRegion].find(Val: Reg); |
1863 | if (It == DAG.LiveIns[LIRegion].end() || It->second.none()) |
1864 | continue; |
1865 | Remat.LiveInRegions.insert(V: LIRegion); |
1866 | |
1867 | // Account for the reduction in RP due to the rematerialization in an |
1868 | // optimizable region in which the defined register is a live-in. This |
1869 | // is exact for live-through regions but optimistic in the using region, |
1870 | // where RP is actually reduced only if maximum RP is reached somewhere |
1871 | // between the beginning of the region and the rematerializable |
1872 | // instruction's use. |
1873 | if (auto It = OptRegions.find(Val: LIRegion); It != OptRegions.end()) { |
1874 | REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n'); |
1875 | if (ReduceRPInRegion(It, Reg, DAG.LiveIns[LIRegion][Reg], |
1876 | RematUseful)) |
1877 | return true; |
1878 | } |
1879 | } |
1880 | |
1881 | // If the defined register is not a live-in or live-out of any optimizable |
1882 | // region, then there is no point in rematerializing the instruction. |
1883 | if (!RematUseful) { |
1884 | Rematerializations.pop_back(); |
1885 | REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n" ); |
1886 | } else { |
1887 | RematRegs.insert(V: Reg); |
1888 | } |
1889 | } |
1890 | } |
1891 | |
1892 | if (IncreaseOccupancy) { |
1893 | // We were trying to increase occupancy but failed, abort the stage. |
1894 | REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n" ); |
1895 | Rematerializations.clear(); |
1896 | return false; |
1897 | } |
1898 | REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n" ); |
1899 | return !Rematerializations.empty(); |
1900 | } |
1901 | |
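// Performs the rematerializations selected in
// canIncreaseOccupancyOrReduceSpill(): each candidate is cloned immediately
// before its single use, region boundaries and LiveIntervals are updated, the
// original definition is erased, and live-in/pressure information is updated
// for every impacted region (with RP fully recomputed where it cannot be
// adjusted incrementally).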
1902 | void PreRARematStage::rematerialize() { |
1903 | const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo(); |
1904 | |
1905 | // Collect regions whose RP changes in an unpredictable way; we will have to |
1906 | // fully recompute their RP after all rematerializations. |
1907 | DenseSet<unsigned> RecomputeRP; |
1908 | |
1909 | // Rematerialize all instructions. |
1910 | for (auto &[DefMI, Remat] : Rematerializations) { |
1911 | MachineBasicBlock::iterator InsertPos(Remat.UseMI); |
1912 | Register Reg = DefMI->getOperand(i: 0).getReg(); |
1913 | unsigned SubReg = DefMI->getOperand(i: 0).getSubReg(); |
1914 | unsigned DefRegion = MIRegion.at(Val: DefMI); |
1915 | |
1916 | // Rematerialize DefMI to its use block. |
1917 | TII->reMaterialize(MBB&: *InsertPos->getParent(), MI: InsertPos, DestReg: Reg, SubIdx: SubReg, Orig: *DefMI, |
1918 | TRI: *DAG.TRI); |
1919 | Remat.RematMI = &*std::prev(x: InsertPos); |
1920 | Remat.RematMI->getOperand(i: 0).setSubReg(SubReg); |
1921 | DAG.LIS->InsertMachineInstrInMaps(MI&: *Remat.RematMI); |
1922 | |
1923 | // Update region boundaries in the regions we sank from (remove the defining |
1924 | // MI) and sank to (insert the MI rematerialized in the use block). Only then |
1925 | // can we erase the original MI. |
1926 | DAG.updateRegionBoundaries(RegionBounds&: DAG.Regions[DefRegion], MI: DefMI, NewMI: nullptr); |
1927 | auto UseRegion = MIRegion.find(Val: Remat.UseMI); |
1928 | if (UseRegion != MIRegion.end()) { |
1929 | DAG.updateRegionBoundaries(RegionBounds&: DAG.Regions[UseRegion->second], MI: InsertPos, |
1930 | NewMI: Remat.RematMI); |
1931 | } |
1932 | DAG.LIS->RemoveMachineInstrFromMaps(MI&: *DefMI); |
1933 | DefMI->eraseFromParent(); |
1934 | |
1935 | // Collect all regions impacted by the rematerialization and update their |
1936 | // live-in/RP information. |
1937 | for (unsigned I : Remat.LiveInRegions) { |
1938 | ImpactedRegions.insert(KV: {I, DAG.Pressure[I]}); |
1939 | GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I]; |
1940 | |
1941 | #ifdef EXPENSIVE_CHECKS |
1942 | // All uses are known to be available / live at the remat point. Thus, the |
1943 | // uses should already be live-in to the region. |
1944 | for (MachineOperand &MO : DefMI->operands()) { |
1945 | if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) |
1946 | continue; |
1947 | |
1948 | Register UseReg = MO.getReg(); |
1949 | if (!UseReg.isVirtual()) |
1950 | continue; |
1951 | |
1952 | LiveInterval &LI = DAG.LIS->getInterval(UseReg); |
1953 | LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg()); |
1954 | if (LI.hasSubRanges() && MO.getSubReg()) |
1955 | LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg()); |
1956 | |
1957 | LaneBitmask LiveInMask = RegionLiveIns.at(UseReg); |
1958 | LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM); |
1959 | // If this register has lanes not covered by the LiveIns, be sure they |
1960 | // do not map to any subrange. ref: |
1961 | // machine-scheduler-sink-trivial-remats.mir::omitted_subrange |
1962 | if (UncoveredLanes.any()) { |
1963 | assert(LI.hasSubRanges()); |
1964 | for (LiveInterval::SubRange &SR : LI.subranges()) |
1965 | assert((SR.LaneMask & UncoveredLanes).none()); |
1966 | } |
1967 | } |
1968 | #endif |
1969 | |
1970 | // The register is no longer a live-in in any region except the one that |
1971 | // contains the single use. In live-through regions, maximum register |
1972 | // pressure decreases predictably, so we can directly update it. In the |
1973 | // using region, maximum RP may or may not decrease, so we mark it for |
1974 | // re-computation after all rematerializations have taken place. |
1975 | LaneBitmask PrevMask = RegionLiveIns[Reg]; |
1976 | RegionLiveIns.erase(Val: Reg); |
1977 | RegMasks.insert(KV: {{I, Remat.RematMI->getOperand(i: 0).getReg()}, PrevMask}); |
1978 | if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent()) |
1979 | DAG.Pressure[I].inc(Reg, PrevMask, NewMask: LaneBitmask::getNone(), MRI: DAG.MRI); |
1980 | else |
1981 | RecomputeRP.insert(V: I); |
1982 | } |
1983 | // RP in the region from which the instruction was rematerialized may or may |
1984 | // not decrease. |
1985 | ImpactedRegions.insert(KV: {DefRegion, DAG.Pressure[DefRegion]}); |
1986 | RecomputeRP.insert(V: DefRegion); |
1987 | |
1988 | // Recompute live interval to reflect the register's rematerialization. |
1989 | Register RematReg = Remat.RematMI->getOperand(i: 0).getReg(); |
1990 | DAG.LIS->removeInterval(Reg: RematReg); |
1991 | DAG.LIS->createAndComputeVirtRegInterval(Reg: RematReg); |
1992 | } |
1993 | |
1994 | // All regions impacted by at least one rematerialization must be rescheduled. |
1995 | // Maximum pressure must also be recomputed for all regions where it changed |
1996 | // non-predictably and checked against the target occupancy. |
1997 | AchievedOcc = TargetOcc; |
1998 | for (auto &[I, OriginalRP] : ImpactedRegions) { |
1999 | bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second; |
2000 | RescheduleRegions[I] = !IsEmptyRegion; |
2001 | if (!RecomputeRP.contains(V: I)) |
2002 | continue; |
2003 | |
2004 | GCNRegPressure RP; |
2005 | if (IsEmptyRegion) { |
2006 | RP = getRegPressure(MRI: DAG.MRI, LiveRegs&: DAG.LiveIns[I]); |
2007 | } else { |
2008 | GCNDownwardRPTracker RPT(*DAG.LIS); |
2009 | auto *NonDbgMI = &*skipDebugInstructionsForward(It: DAG.Regions[I].first, |
2010 | End: DAG.Regions[I].second); |
2011 | if (NonDbgMI == DAG.Regions[I].second) { |
2012 | // Region is non-empty but contains only debug instructions. |
2013 | RP = getRegPressure(MRI: DAG.MRI, LiveRegs&: DAG.LiveIns[I]); |
2014 | } else { |
2015 | RPT.reset(MI: *NonDbgMI, LiveRegs: &DAG.LiveIns[I]); |
2016 | RPT.advance(End: DAG.Regions[I].second); |
2017 | RP = RPT.moveMaxPressure(); |
2018 | } |
2019 | } |
2020 | DAG.Pressure[I] = RP; |
2021 | AchievedOcc = std::min( |
2022 | a: AchievedOcc, b: RP.getOccupancy(ST, DynamicVGPRBlockSize: MF.getInfo<SIMachineFunctionInfo>() |
2023 | ->getDynamicVGPRBlockSize())); |
2024 | } |
2025 | REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n" ); |
2026 | } |
2027 | |
2028 | // Copied from MachineLICM |
2029 | bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) { |
2030 | if (!DAG.TII->isTriviallyReMaterializable(MI)) |
2031 | return false; |
2032 | |
2033 | for (const MachineOperand &MO : MI.all_uses()) { |
2034 | // We can't remat physreg uses, unless the register is constant or the use |
2035 | // is ignorable (e.g., the implicit exec use on VALU instructions). |
2036 | if (MO.getReg().isPhysical()) { |
2037 | if (DAG.MRI.isConstantPhysReg(PhysReg: MO.getReg()) || DAG.TII->isIgnorableUse(MO)) |
2038 | continue; |
2039 | return false; |
2040 | } |
2041 | } |
2042 | |
2043 | return true; |
2044 | } |
2045 | |
2046 | void PreRARematStage::finalizeGCNSchedStage() { |
2047 | // We consider that reducing spilling is always beneficial so we never |
2048 | // rollback rematerializations in such cases. It's also possible that |
2049 | // rescheduling lowers occupancy over the one achieved just through remats, in |
2050 | // which case we do not want to rollback either (the rescheduling was already |
2051 | // reverted in PreRARematStage::shouldRevertScheduling in such cases). |
2052 | unsigned MaxOcc = std::max(a: AchievedOcc, b: DAG.MinOccupancy); |
2053 | if (!IncreaseOccupancy || MaxOcc >= TargetOcc) |
2054 | return; |
2055 | |
2056 | REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n" ); |
2057 | const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo(); |
2058 | |
2059 | // Rollback the rematerializations. |
2060 | for (const auto &[DefMI, Remat] : Rematerializations) { |
2061 | MachineInstr &RematMI = *Remat.RematMI; |
2062 | unsigned DefRegion = MIRegion.at(Val: DefMI); |
2063 | MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second); |
2064 | MachineBasicBlock *MBB = RegionBB[DefRegion]; |
2065 | Register Reg = RematMI.getOperand(i: 0).getReg(); |
2066 | unsigned SubReg = RematMI.getOperand(i: 0).getSubReg(); |
2067 | |
2068 | // Re-rematerialize the MI at the end of its original region. Note that it |
2069 | // may not end up in exactly its original position within the region, but |
2070 | // that should not matter much. |
2071 | TII->reMaterialize(MBB&: *MBB, MI: InsertPos, DestReg: Reg, SubIdx: SubReg, Orig: RematMI, TRI: *DAG.TRI); |
2072 | MachineInstr *NewMI = &*std::prev(x: InsertPos); |
2073 | NewMI->getOperand(i: 0).setSubReg(SubReg); |
2074 | DAG.LIS->InsertMachineInstrInMaps(MI&: *NewMI); |
2075 | |
2076 | auto UseRegion = MIRegion.find(Val: Remat.UseMI); |
2077 | if (UseRegion != MIRegion.end()) { |
2078 | DAG.updateRegionBoundaries(RegionBounds&: DAG.Regions[UseRegion->second], MI: RematMI, |
2079 | NewMI: nullptr); |
2080 | } |
2081 | DAG.updateRegionBoundaries(RegionBounds&: DAG.Regions[DefRegion], MI: InsertPos, NewMI); |
2082 | |
2083 | // Erase rematerialized MI. |
2084 | DAG.LIS->RemoveMachineInstrFromMaps(MI&: RematMI); |
2085 | RematMI.eraseFromParent(); |
2086 | |
2087 | // Recompute the live interval for the re-rematerialized register. |
2088 | DAG.LIS->removeInterval(Reg); |
2089 | DAG.LIS->createAndComputeVirtRegInterval(Reg); |
2090 | |
2091 | // Re-add the register as a live-in in all regions it used to be one in. |
2092 | for (unsigned LIRegion : Remat.LiveInRegions) |
2093 | DAG.LiveIns[LIRegion].insert(KV: {Reg, RegMasks.at(Val: {LIRegion, Reg})}); |
2094 | } |
2095 | |
2096 | // Reset RP in all impacted regions. |
2097 | for (auto &[I, OriginalRP] : ImpactedRegions) |
2098 | DAG.Pressure[I] = OriginalRP; |
2099 | |
2100 | GCNSchedStage::finalizeGCNSchedStage(); |
2101 | } |
2102 | |
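// Keeps a region's half-open [RegionBegin, RegionEnd) boundaries valid when an
// instruction at the region's lower boundary is removed (NewMI == nullptr) or
// when a new instruction NewMI is inserted at position MI.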
2103 | void GCNScheduleDAGMILive::updateRegionBoundaries( |
2104 | RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI, |
2105 | MachineInstr *NewMI) { |
2106 | assert((!NewMI || NewMI != RegionBounds.second) && |
2107 | "cannot remove at region end" ); |
2108 | |
2109 | if (RegionBounds.first == RegionBounds.second) { |
2110 | assert(NewMI && "cannot remove from an empty region" ); |
2111 | RegionBounds.first = NewMI; |
2112 | return; |
2113 | } |
2114 | |
2115 | // We only care about modifications at the beginning of a non-empty region |
2116 | // since the upper region boundary is exclusive. |
2117 | if (MI != RegionBounds.first) |
2118 | return; |
2119 | if (!NewMI) |
2120 | RegionBounds.first = std::next(x: MI); // Removal |
2121 | else |
2122 | RegionBounds.first = NewMI; // Insertion |
2123 | } |
2124 | |
2125 | static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { |
2126 | const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII); |
2127 | return any_of(Range&: *DAG, P: [SII](MachineBasicBlock::iterator MI) { |
2128 | return SII->isIGLPMutationOnly(Opcode: MI->getOpcode()); |
2129 | }); |
2130 | } |
2131 | |
2132 | GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive( |
2133 | MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S, |
2134 | bool RemoveKillFlags) |
2135 | : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {} |
2136 | |
2137 | void GCNPostScheduleDAGMILive::schedule() { |
2138 | HasIGLPInstrs = hasIGLPInstrs(DAG: this); |
2139 | if (HasIGLPInstrs) { |
2140 | SavedMutations.clear(); |
2141 | SavedMutations.swap(x&: Mutations); |
2142 | addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PostRA)); |
2143 | } |
2144 | |
2145 | ScheduleDAGMI::schedule(); |
2146 | } |
2147 | |
2148 | void GCNPostScheduleDAGMILive::finalizeSchedule() { |
2149 | if (HasIGLPInstrs) |
2150 | SavedMutations.swap(x&: Mutations); |
2151 | |
2152 | ScheduleDAGMI::finalizeSchedule(); |
2153 | } |
2154 | |