GCNSchedStrategy.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp]

1	//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// This contains a MachineSchedStrategy implementation for maximizing wave
11	/// occupancy on GCN hardware.
12	///
13	/// This pass will apply multiple scheduling stages to the same function.
14	/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15	/// entry point for the scheduling of those regions is
16	/// GCNScheduleDAGMILive::runSchedStages.
17
18	/// Generally, the reason for having multiple scheduling stages is to account
19	/// for the kernel-wide effect of register usage on occupancy. Usually, only a
20	/// few scheduling regions will have register pressure high enough to limit
21	/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22	/// other regions.
23	///
24	//===----------------------------------------------------------------------===//
25
26	#include "GCNSchedStrategy.h"
27	#include "AMDGPUIGroupLP.h"
28	#include "SIMachineFunctionInfo.h"
29	#include "llvm/CodeGen/RegisterClassInfo.h"
30
31	#define DEBUG_TYPE "machine-scheduler"
32
33	using namespace llvm;
34
35	static cl::opt<bool> DisableUnclusterHighRP(
36	"amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
37	cl::desc ("Disable unclustered high register pressure "
38	"reduction scheduling stage."),
39	cl::init(Val: false));
40
41	static cl::opt<bool> DisableClusteredLowOccupancy(
42	"amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
43	cl::desc ("Disable clustered low occupancy "
44	"rescheduling for ILP scheduling stage."),
45	cl::init(Val: false));
46
47	static cl::opt<unsigned> ScheduleMetricBias(
48	"amdgpu-schedule-metric-bias", cl::Hidden,
49	cl::desc (
50	"Sets the bias which adds weight to occupancy vs latency. Set it to "
51	"100 to chase the occupancy only."),
52	cl::init(Val: `10`));
53
54	static cl::opt<bool>
55	RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
56	cl::desc ("Relax occupancy targets for kernels which are memory "
57	"bound (amdgpu-membound-threshold), or "
58	"Wave Limited (amdgpu-limit-wave-threshold)."),
59	cl::init(Val: false));
60
61	const unsigned ScheduleMetrics::ScaleFactor = `100`;
62
63	GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
64	: GenericScheduler (C), TargetOccupancy(`0`), MF(nullptr),
65	HasHighPressure(false) {}
66
67	void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
68	GenericScheduler::initialize(dag: DAG);
69
70	MF = &DAG->MF;
71
72	const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
73
74	SGPRExcessLimit =
75	Context->RegClassInfo->getNumAllocatableRegs(RC: &AMDGPU::SGPR_32RegClass);
76	VGPRExcessLimit =
77	Context->RegClassInfo->getNumAllocatableRegs(RC: &AMDGPU::VGPR_32RegClass);
78
79	SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
80	// Set the initial TargetOccupnacy to the maximum occupancy that we can
81	// achieve for this function. This effectively sets a lower bound on the
82	// 'Critical' register limits in the scheduler.
83	// Allow for lower occupancy targets if kernel is wave limited or memory
84	// bound, and using the relaxed occupancy feature.
85	TargetOccupancy =
86	RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
87	SGPRCriticalLimit =
88	std::min(a: ST.getMaxNumSGPRs(WavesPerEU: TargetOccupancy, Addressable: true), b: SGPRExcessLimit);
89
90	if (!KnownExcessRP) {
91	VGPRCriticalLimit =
92	std::min(a: ST.getMaxNumVGPRs(WavesPerEU: TargetOccupancy), b: VGPRExcessLimit);
93	} else {
94	// This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except
95	// returns a reasonably small number for targets with lots of VGPRs, such
96	// as GFX10 and GFX11.
97	LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
98	"VGPRCriticalLimit calculation method.\n");
99
100	unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(STI: &ST);
101	unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(STI: &ST);
102	unsigned VGPRBudget = alignDown(Value: Addressable / TargetOccupancy, Align: Granule);
103	VGPRBudget = std::max(a: VGPRBudget, b: Granule);
104	VGPRCriticalLimit = std::min(a: VGPRBudget, b: VGPRExcessLimit);
105	}
106
107	// Subtract error margin and bias from register limits and avoid overflow.
108	SGPRCriticalLimit -= std::min(a: SGPRLimitBias + ErrorMargin, b: SGPRCriticalLimit);
109	VGPRCriticalLimit -= std::min(a: VGPRLimitBias + ErrorMargin, b: VGPRCriticalLimit);
110	SGPRExcessLimit -= std::min(a: SGPRLimitBias + ErrorMargin, b: SGPRExcessLimit);
111	VGPRExcessLimit -= std::min(a: VGPRLimitBias + ErrorMargin, b: VGPRExcessLimit);
112
113	LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
114	<< ", VGPRExcessLimit = " << VGPRExcessLimit
115	<< ", SGPRCriticalLimit = " << SGPRCriticalLimit
116	<< ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
117	}
118
119	/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
120	/// current register pressure.
121	///
122	/// This works for the common case, but it has a few exceptions that have been
123	/// observed through trial and error:
124	/// - Explicit physical register operands
125	/// - Subregister definitions
126	///
127	/// In both of those cases, PressureDiff doesn't represent the actual pressure,
128	/// and querying LiveIntervals through the RegPressureTracker is needed to get
129	/// an accurate value.
130	///
131	/// We should eventually only use PressureDiff for maximum performance, but this
132	/// already allows 80% of SUs to take the fast path without changing scheduling
133	/// at all. Further changes would either change scheduling, or require a lot
134	/// more logic to recover an accurate pressure estimate from the PressureDiffs.
135	static bool canUsePressureDiffs(const SUnit &SU) {
136	if (!SU.isInstr())
137	return false;
138
139	// Cannot use pressure diffs for subregister defs or with physregs, it's
140	// imprecise in both cases.
141	for (const auto &Op : SU.getInstr()->operands()) {
142	if (!Op.isReg() \|\| Op.isImplicit())
143	continue;
144	if (Op.getReg().isPhysical() \|\|
145	(Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
146	return false;
147	}
148	return true;
149	}
150
151	static void getRegisterPressures(bool AtTop,
152	const RegPressureTracker &RPTracker, SUnit *SU,
153	std::vector<unsigned> &Pressure,
154	std::vector<unsigned> &MaxPressure) {
155	// getDownwardPressure() and getUpwardPressure() make temporary changes to
156	// the tracker, so we need to pass those function a non-const copy.
157	RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
158	if (AtTop)
159	TempTracker.getDownwardPressure(MI: SU->getInstr(), PressureResult&: Pressure, MaxPressureResult&: MaxPressure);
160	else
161	TempTracker.getUpwardPressure(MI: SU->getInstr(), PressureResult&: Pressure, MaxPressureResult&: MaxPressure);
162	}
163
164	void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
165	bool AtTop,
166	const RegPressureTracker &RPTracker,
167	const SIRegisterInfo *SRI,
168	unsigned SGPRPressure,
169	unsigned VGPRPressure, bool IsBottomUp) {
170	Cand.SU = SU;
171	Cand.AtTop = AtTop;
172
173	if (!DAG->isTrackingPressure())
174	return;
175
176	Pressure.clear();
177	MaxPressure.clear();
178
179	// We try to use the cached PressureDiffs in the ScheduleDAG whenever
180	// possible over querying the RegPressureTracker.
181	//
182	// RegPressureTracker will make a lot of LIS queries which are very
183	// expensive, it is considered a slow function in this context.
184	//
185	// PressureDiffs are precomputed and cached, and getPressureDiff is just a
186	// trivial lookup into an array. It is pretty much free.
187	//
188	// In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
189	// PressureDiffs.
190	if (AtTop \|\| !canUsePressureDiffs(SU: *SU)) {
191	getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure);
192	} else {
193	// Reserve 4 slots.
194	Pressure.resize(new_size: `4`, x: `0`);
195	Pressure [AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
196	Pressure [AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
197
198	for (const auto &Diff : DAG->getPressureDiff(SU)) {
199	if (!Diff.isValid())
200	continue;
201	// PressureDiffs is always bottom-up so if we're working top-down we need
202	// to invert its sign.
203	Pressure [Diff.getPSet()] +=
204	(IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
205	}
206
207	#ifdef EXPENSIVE_CHECKS
208	std::vector<unsigned> CheckPressure, CheckMaxPressure;
209	getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure);
210	if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
211	CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] \|\|
212	Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
213	CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
214	errs() << "Register Pressure is inaccurate when calculated through "
215	"PressureDiff\n"
216	<< "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
217	<< ", expected "
218	<< CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
219	<< "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
220	<< ", expected "
221	<< CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
222	report_fatal_error("inaccurate register pressure calculation");
223	}
224	#endif
225	}
226
227	unsigned NewSGPRPressure = Pressure [AMDGPU::RegisterPressureSets::SReg_32];
228	unsigned NewVGPRPressure = Pressure [AMDGPU::RegisterPressureSets::VGPR_32];
229
230	// If two instructions increase the pressure of different register sets
231	// by the same amount, the generic scheduler will prefer to schedule the
232	// instruction that increases the set with the least amount of registers,
233	// which in our case would be SGPRs. This is rarely what we want, so
234	// when we report excess/critical register pressure, we do it either
235	// only for VGPRs or only for SGPRs.
236
237	// FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
238	const unsigned MaxVGPRPressureInc = `16`;
239	bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
240	bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
241
242	// FIXME: We have to enter REG-EXCESS before we reach the actual threshold
243	// to increase the likelihood we don't go over the limits. We should improve
244	// the analysis to look through dependencies to find the path with the least
245	// register pressure.
246
247	// We only need to update the RPDelta for instructions that increase register
248	// pressure. Instructions that decrease or keep reg pressure the same will be
249	// marked as RegExcess in tryCandidate() when they are compared with
250	// instructions that increase the register pressure.
251	if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
252	HasHighPressure = true;
253	Cand.RPDelta.Excess = PressureChange (AMDGPU::RegisterPressureSets::VGPR_32);
254	Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
255	}
256
257	if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
258	HasHighPressure = true;
259	Cand.RPDelta.Excess = PressureChange (AMDGPU::RegisterPressureSets::SReg_32);
260	Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
261	}
262
263	// Register pressure is considered 'CRITICAL' if it is approaching a value
264	// that would reduce the wave occupancy for the execution unit. When
265	// register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
266	// has the same cost, so we don't need to prefer one over the other.
267
268	int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
269	int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
270
271	if (SGPRDelta >= `0` \|\| VGPRDelta >= `0`) {
272	HasHighPressure = true;
273	if (SGPRDelta > VGPRDelta) {
274	Cand.RPDelta.CriticalMax =
275	PressureChange (AMDGPU::RegisterPressureSets::SReg_32);
276	Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
277	} else {
278	Cand.RPDelta.CriticalMax =
279	PressureChange (AMDGPU::RegisterPressureSets::VGPR_32);
280	Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
281	}
282	}
283	}
284
285	// This function is mostly cut and pasted from
286	// GenericScheduler::pickNodeFromQueue()
287	void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
288	const CandPolicy &ZonePolicy,
289	const RegPressureTracker &RPTracker,
290	SchedCandidate &Cand,
291	bool IsBottomUp) {
292	const SIRegisterInfo SRI = static_cast<const* SIRegisterInfo*>(TRI);
293	ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
294	unsigned SGPRPressure = `0`;
295	unsigned VGPRPressure = `0`;
296	if (DAG->isTrackingPressure()) {
297	SGPRPressure = Pressure [AMDGPU::RegisterPressureSets::SReg_32];
298	VGPRPressure = Pressure [AMDGPU::RegisterPressureSets::VGPR_32];
299	}
300	ReadyQueue &Q = Zone.Available;
301	for (SUnit *SU : Q) {
302
303	SchedCandidate TryCand(ZonePolicy);
304	initCandidate(Cand&: TryCand, SU, AtTop: Zone.isTop(), RPTracker, SRI, SGPRPressure,
305	VGPRPressure, IsBottomUp);
306	// Pass SchedBoundary only when comparing nodes from the same boundary.
307	SchedBoundary ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr*;
308	tryCandidate(Cand, TryCand, Zone: ZoneArg);
309	if (TryCand.Reason != NoCand) {
310	// Initialize resource delta if needed in case future heuristics query it.
311	if (TryCand.ResDelta == SchedResourceDelta ())
312	TryCand.initResourceDelta(DAG: Zone.DAG, SchedModel);
313	Cand.setBest(TryCand);
314	LLVM_DEBUG(traceCandidate(Cand));
315	}
316	}
317	}
318
319	// This function is mostly cut and pasted from
320	// GenericScheduler::pickNodeBidirectional()
321	SUnit GCNSchedStrategy::pickNodeBidirectional(bool* &IsTopNode) {
322	// Schedule as far as possible in the direction of no choice. This is most
323	// efficient, but also provides the best heuristics for CriticalPSets.
324	if (SUnit *SU = Bot.pickOnlyChoice()) {
325	IsTopNode = false;
326	return SU;
327	}
328	if (SUnit *SU = Top.pickOnlyChoice()) {
329	IsTopNode = true;
330	return SU;
331	}
332	// Set the bottom-up policy based on the state of the current bottom zone and
333	// the instructions outside the zone, including the top zone.
334	CandPolicy BotPolicy;
335	setPolicy(Policy&: BotPolicy, /IsPostRA=/false, CurrZone&: Bot, OtherZone: &Top);
336	// Set the top-down policy based on the state of the current top zone and
337	// the instructions outside the zone, including the bottom zone.
338	CandPolicy TopPolicy;
339	setPolicy(Policy&: TopPolicy, /IsPostRA=/false, CurrZone&: Top, OtherZone: &Bot);
340
341	// See if BotCand is still valid (because we previously scheduled from Top).
342	LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
343	if (!BotCand.isValid() \|\| BotCand.SU->isScheduled \|\|
344	BotCand.Policy != BotPolicy) {
345	BotCand.reset(NewPolicy: CandPolicy ());
346	pickNodeFromQueue(Zone&: Bot, ZonePolicy: BotPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand,
347	/IsBottomUp=/true);
348	assert(BotCand.Reason != NoCand && "failed to find the first candidate");
349	} else {
350	LLVM_DEBUG(traceCandidate(BotCand));
351	#ifndef NDEBUG
352	if (VerifyScheduling) {
353	SchedCandidate TCand;
354	TCand.reset(CandPolicy());
355	pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
356	/IsBottomUp=/true);
357	assert(TCand.SU == BotCand.SU &&
358	"Last pick result should correspond to re-picking right now");
359	}
360	#endif
361	}
362
363	// Check if the top Q has a better candidate.
364	LLVM_DEBUG(dbgs() << "Picking from Top:\n");
365	if (!TopCand.isValid() \|\| TopCand.SU->isScheduled \|\|
366	TopCand.Policy != TopPolicy) {
367	TopCand.reset(NewPolicy: CandPolicy ());
368	pickNodeFromQueue(Zone&: Top, ZonePolicy: TopPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand,
369	/IsBottomUp=/false);
370	assert(TopCand.Reason != NoCand && "failed to find the first candidate");
371	} else {
372	LLVM_DEBUG(traceCandidate(TopCand));
373	#ifndef NDEBUG
374	if (VerifyScheduling) {
375	SchedCandidate TCand;
376	TCand.reset(CandPolicy());
377	pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
378	/IsBottomUp=/false);
379	assert(TCand.SU == TopCand.SU &&
380	"Last pick result should correspond to re-picking right now");
381	}
382	#endif
383	}
384
385	// Pick best from BotCand and TopCand.
386	LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
387	dbgs() << "Bot Cand: "; traceCandidate(BotCand););
388	SchedCandidate Cand = BotCand;
389	TopCand.Reason = NoCand;
390	tryCandidate(Cand, TryCand&: TopCand, Zone: nullptr);
391	if (TopCand.Reason != NoCand) {
392	Cand.setBest(TopCand);
393	}
394	LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
395
396	IsTopNode = Cand.AtTop;
397	return Cand.SU;
398	}
399
400	// This function is mostly cut and pasted from
401	// GenericScheduler::pickNode()
402	SUnit GCNSchedStrategy::pickNode(bool* &IsTopNode) {
403	if (DAG->top() == DAG->bottom()) {
404	assert(Top.Available.empty() && Top.Pending.empty() &&
405	Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
406	return nullptr;
407	}
408	SUnit *SU;
409	do {
410	if (RegionPolicy.OnlyTopDown) {
411	SU = Top.pickOnlyChoice();
412	if (!SU) {
413	CandPolicy NoPolicy;
414	TopCand.reset(NewPolicy: NoPolicy);
415	pickNodeFromQueue(Zone&: Top, ZonePolicy: NoPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand,
416	/IsBottomUp=/false);
417	assert(TopCand.Reason != NoCand && "failed to find a candidate");
418	SU = TopCand.SU;
419	}
420	IsTopNode = true;
421	} else if (RegionPolicy.OnlyBottomUp) {
422	SU = Bot.pickOnlyChoice();
423	if (!SU) {
424	CandPolicy NoPolicy;
425	BotCand.reset(NewPolicy: NoPolicy);
426	pickNodeFromQueue(Zone&: Bot, ZonePolicy: NoPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand,
427	/IsBottomUp=/true);
428	assert(BotCand.Reason != NoCand && "failed to find a candidate");
429	SU = BotCand.SU;
430	}
431	IsTopNode = false;
432	} else {
433	SU = pickNodeBidirectional(IsTopNode);
434	}
435	} while (SU->isScheduled);
436
437	if (SU->isTopReady())
438	Top.removeReady(SU);
439	if (SU->isBottomReady())
440	Bot.removeReady(SU);
441
442	LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
443	<< *SU->getInstr());
444	return SU;
445	}
446
447	GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
448	assert(CurrentStage && CurrentStage != SchedStages.end());
449	return *CurrentStage;
450	}
451
452	bool GCNSchedStrategy::advanceStage() {
453	assert(CurrentStage != SchedStages.end());
454	if (!CurrentStage)
455	CurrentStage = SchedStages.begin();
456	else
457	CurrentStage++;
458
459	return CurrentStage != SchedStages.end();
460	}
461
462	bool GCNSchedStrategy::hasNextStage() const {
463	assert(CurrentStage);
464	return std::next(x: CurrentStage) != SchedStages.end();
465	}
466
467	GCNSchedStageID GCNSchedStrategy::getNextStage() const {
468	assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
469	return *std::next(x: CurrentStage);
470	}
471
472	GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
473	const MachineSchedContext *C)
474	: GCNSchedStrategy (C) {
475	SchedStages.push_back(Elt: GCNSchedStageID::OccInitialSchedule);
476	SchedStages.push_back(Elt: GCNSchedStageID::UnclusteredHighRPReschedule);
477	SchedStages.push_back(Elt: GCNSchedStageID::ClusteredLowOccupancyReschedule);
478	SchedStages.push_back(Elt: GCNSchedStageID::PreRARematerialize);
479	}
480
481	GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
482	: GCNSchedStrategy (C) {
483	SchedStages.push_back(Elt: GCNSchedStageID::ILPInitialSchedule);
484	}
485
486	bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
487	SchedCandidate &TryCand,
488	SchedBoundary Zone) const* {
489	// Initialize the candidate if needed.
490	if (!Cand.isValid()) {
491	TryCand.Reason = NodeOrder;
492	return true;
493	}
494
495	// Avoid spilling by exceeding the register limit.
496	if (DAG->isTrackingPressure() &&
497	tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand,
498	Reason: RegExcess, TRI, MF: DAG->MF))
499	return TryCand.Reason != NoCand;
500
501	// Bias PhysReg Defs and copies to their uses and defined respectively.
502	if (tryGreater(TryVal: biasPhysReg(SU: TryCand.SU, isTop: TryCand.AtTop),
503	CandVal: biasPhysReg(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: PhysReg))
504	return TryCand.Reason != NoCand;
505
506	bool SameBoundary = Zone != nullptr;
507	if (SameBoundary) {
508	// Prioritize instructions that read unbuffered resources by stall cycles.
509	if (tryLess(TryVal: Zone->getLatencyStallCycles(SU: TryCand.SU),
510	CandVal: Zone->getLatencyStallCycles(SU: Cand.SU), TryCand, Cand, Reason: Stall))
511	return TryCand.Reason != NoCand;
512
513	// Avoid critical resource consumption and balance the schedule.
514	TryCand.initResourceDelta(DAG, SchedModel);
515	if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources,
516	TryCand, Cand, Reason: ResourceReduce))
517	return TryCand.Reason != NoCand;
518	if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources,
519	CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand,
520	Reason: ResourceDemand))
521	return TryCand.Reason != NoCand;
522
523	// Unconditionally try to reduce latency.
524	if (tryLatency(TryCand, Cand, Zone&: *Zone))
525	return TryCand.Reason != NoCand;
526
527	// Weak edges are for clustering and other constraints.
528	if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop),
529	CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak))
530	return TryCand.Reason != NoCand;
531	}
532
533	// Keep clustered nodes together to encourage downstream peephole
534	// optimizations which may reduce resource requirements.
535	//
536	// This is a best effort to set things up for a post-RA pass. Optimizations
537	// like generating loads of multiple registers should ideally be done within
538	// the scheduler pass by combining the loads during DAG postprocessing.
539	const SUnit *CandNextClusterSU =
540	Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
541	const SUnit *TryCandNextClusterSU =
542	TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
543	if (tryGreater(TryVal: TryCand.SU == TryCandNextClusterSU,
544	CandVal: Cand.SU == CandNextClusterSU, TryCand, Cand, Reason: Cluster))
545	return TryCand.Reason != NoCand;
546
547	// Avoid increasing the max critical pressure in the scheduled region.
548	if (DAG->isTrackingPressure() &&
549	tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax,
550	TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF))
551	return TryCand.Reason != NoCand;
552
553	// Avoid increasing the max pressure of the entire region.
554	if (DAG->isTrackingPressure() &&
555	tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand,
556	Cand, Reason: RegMax, TRI, MF: DAG->MF))
557	return TryCand.Reason != NoCand;
558
559	if (SameBoundary) {
560	// Fall through to original instruction order.
561	if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) \|\|
562	(!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
563	TryCand.Reason = NodeOrder;
564	return true;
565	}
566	}
567	return false;
568	}
569
570	GCNScheduleDAGMILive::GCNScheduleDAGMILive(
571	MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
572	: ScheduleDAGMILive (C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
573	MFI(*MF.getInfo<SIMachineFunctionInfo>()),
574	StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) {
575
576	LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
577	if (RelaxedOcc) {
578	MinOccupancy = std::min(a: MFI.getMinAllowedOccupancy(), b: StartingOccupancy);
579	if (MinOccupancy != StartingOccupancy)
580	LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
581	<< ".\n");
582	}
583	}
584
585	std::unique_ptr<GCNSchedStage>
586	GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
587	switch (SchedStageID) {
588	case GCNSchedStageID::OccInitialSchedule:
589	return std::make_unique<OccInitialScheduleStage>(args&: SchedStageID, args&: *this);
590	case GCNSchedStageID::UnclusteredHighRPReschedule:
591	return std::make_unique<UnclusteredHighRPStage>(args&: SchedStageID, args&: *this);
592	case GCNSchedStageID::ClusteredLowOccupancyReschedule:
593	return std::make_unique<ClusteredLowOccStage>(args&: SchedStageID, args&: *this);
594	case GCNSchedStageID::PreRARematerialize:
595	return std::make_unique<PreRARematStage>(args&: SchedStageID, args&: *this);
596	case GCNSchedStageID::ILPInitialSchedule:
597	return std::make_unique<ILPInitialScheduleStage>(args&: SchedStageID, args&: *this);
598	}
599
600	llvm_unreachable("Unknown SchedStageID.");
601	}
602
603	void GCNScheduleDAGMILive::schedule() {
604	// Collect all scheduling regions. The actual scheduling is performed in
605	// GCNScheduleDAGMILive::finalizeSchedule.
606	Regions.push_back(Elt: std::pair(RegionBegin, RegionEnd));
607	}
608
609	GCNRegPressure
610	GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
611	GCNDownwardRPTracker RPTracker(*LIS);
612	RPTracker.advance(Begin: begin(), End: end(), LiveRegsCopy: &LiveIns [RegionIdx]);
613	return RPTracker.moveMaxPressure();
614	}
615
616	void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
617	const MachineBasicBlock *MBB) {
618	GCNDownwardRPTracker RPTracker(*LIS);
619
620	// If the block has the only successor then live-ins of that successor are
621	// live-outs of the current block. We can reuse calculated live set if the
622	// successor will be sent to scheduling past current block.
623
624	// However, due to the bug in LiveInterval analysis it may happen that two
625	// predecessors of the same successor block have different lane bitmasks for
626	// a live-out register. Workaround that by sticking to one-to-one relationship
627	// i.e. one predecessor with one successor block.
628	const MachineBasicBlock OnlySucc = nullptr*;
629	if (MBB->succ_size() == `1`) {
630	auto Candidate = MBB->succ_begin();
631	if (!Candidate->empty() && Candidate->pred_size() == `1`) {
632	SlotIndexes *Ind = LIS->getSlotIndexes();
633	if (Ind->getMBBStartIdx(mbb: MBB) < Ind->getMBBStartIdx(mbb: Candidate))
634	OnlySucc = Candidate;
635	}
636	}
637
638	// Scheduler sends regions from the end of the block upwards.
639	size_t CurRegion = RegionIdx;
640	for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
641	if (Regions [CurRegion].first ->getParent() != MBB)
642	break;
643	--CurRegion;
644
645	auto I = MBB->begin();
646	auto LiveInIt = MBBLiveIns.find(Val: MBB);
647	auto &Rgn = Regions [CurRegion];
648	auto NonDbgMI = &skipDebugInstructionsForward(It: Rgn.first, End: Rgn.second);
649	if (LiveInIt != MBBLiveIns.end()) {
650	auto LiveIn = std::move(LiveInIt ->second);
651	RPTracker.reset(MI: *MBB->begin(), LiveRegs: &LiveIn);
652	MBBLiveIns.erase(I: LiveInIt);
653	} else {
654	I = Rgn.first;
655	auto LRS = BBLiveInMap.lookup(Val: NonDbgMI);
656	#ifdef EXPENSIVE_CHECKS
657	assert(isEqual(getLiveRegsBefore(NonDbgMI, LIS), LRS));
658	#endif
659	RPTracker.reset(MI: *I, LiveRegs: &LRS);
660	}
661
662	for (;;) {
663	I = RPTracker.getNext();
664
665	if (Regions [CurRegion].first == I \|\| NonDbgMI == I) {
666	LiveIns [CurRegion] = RPTracker.getLiveRegs();
667	RPTracker.clearMaxPressure();
668	}
669
670	if (Regions [CurRegion].second == I) {
671	Pressure [CurRegion] = RPTracker.moveMaxPressure();
672	if (CurRegion-- == RegionIdx)
673	break;
674	}
675	RPTracker.advanceToNext();
676	RPTracker.advanceBeforeNext();
677	}
678
679	if (OnlySucc) {
680	if (I != MBB->end()) {
681	RPTracker.advanceToNext();
682	RPTracker.advance(End: MBB->end());
683	}
684	RPTracker.advanceBeforeNext();
685	MBBLiveIns [OnlySucc] = RPTracker.moveLiveRegs();
686	}
687	}
688
689	DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
690	GCNScheduleDAGMILive::getBBLiveInMap() const {
691	assert(!Regions.empty());
692	std::vector<MachineInstr *> BBStarters;
693	BBStarters.reserve(n: Regions.size());
694	auto I = Regions.rbegin(), E = Regions.rend();
695	auto *BB = I ->first ->getParent();
696	do {
697	auto MI = &skipDebugInstructionsForward(It: I ->first, End: I ->second);
698	BBStarters.push_back(x: MI);
699	do {
700	++I;
701	} while (I != E && I ->first ->getParent() == BB);
702	} while (I != E);
703	return getLiveRegMap(R&: BBStarters, After: false /After/, LIS&: *LIS);
704	}
705
706	void GCNScheduleDAGMILive::finalizeSchedule() {
707	// Start actual scheduling here. This function is called by the base
708	// MachineScheduler after all regions have been recorded by
709	// GCNScheduleDAGMILive::schedule().
710	LiveIns.resize(N: Regions.size());
711	Pressure.resize(N: Regions.size());
712	RescheduleRegions.resize(N: Regions.size());
713	RegionsWithHighRP.resize(N: Regions.size());
714	RegionsWithExcessRP.resize(N: Regions.size());
715	RegionsWithMinOcc.resize(N: Regions.size());
716	RegionsWithIGLPInstrs.resize(N: Regions.size());
717	RescheduleRegions.set();
718	RegionsWithHighRP.reset();
719	RegionsWithExcessRP.reset();
720	RegionsWithMinOcc.reset();
721	RegionsWithIGLPInstrs.reset();
722
723	runSchedStages();
724	}
725
726	void GCNScheduleDAGMILive::runSchedStages() {
727	LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
728
729	if (!Regions.empty())
730	BBLiveInMap = getBBLiveInMap();
731
732	GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
733	while (S.advanceStage()) {
734	auto Stage = createSchedStage(SchedStageID: S.getCurrentStage());
735	if (!Stage ->initGCNSchedStage())
736	continue;
737
738	for (auto Region : Regions) {
739	RegionBegin = Region.first;
740	RegionEnd = Region.second;
741	// Setup for scheduling the region and check whether it should be skipped.
742	if (!Stage ->initGCNRegion()) {
743	Stage ->advanceRegion();
744	exitRegion();
745	continue;
746	}
747
748	ScheduleDAGMILive::schedule();
749	Stage ->finalizeGCNRegion();
750	}
751
752	Stage ->finalizeGCNSchedStage();
753	}
754	}
755
756	#ifndef NDEBUG
757	raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
758	switch (StageID) {
759	case GCNSchedStageID::OccInitialSchedule:
760	OS << "Max Occupancy Initial Schedule";
761	break;
762	case GCNSchedStageID::UnclusteredHighRPReschedule:
763	OS << "Unclustered High Register Pressure Reschedule";
764	break;
765	case GCNSchedStageID::ClusteredLowOccupancyReschedule:
766	OS << "Clustered Low Occupancy Reschedule";
767	break;
768	case GCNSchedStageID::PreRARematerialize:
769	OS << "Pre-RA Rematerialize";
770	break;
771	case GCNSchedStageID::ILPInitialSchedule:
772	OS << "Max ILP Initial Schedule";
773	break;
774	}
775
776	return OS;
777	}
778	#endif
779
780	GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
781	: DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
782	MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
783
784	bool GCNSchedStage::initGCNSchedStage() {
785	if (!DAG.LIS)
786	return false;
787
788	LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
789	return true;
790	}
791
792	bool UnclusteredHighRPStage::initGCNSchedStage() {
793	if (DisableUnclusterHighRP)
794	return false;
795
796	if (!GCNSchedStage::initGCNSchedStage())
797	return false;
798
799	if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
800	return false;
801
802	SavedMutations.swap(x&: DAG.Mutations);
803	DAG.addMutation(
804	Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PreRAReentry));
805
806	InitialOccupancy = DAG.MinOccupancy;
807	// Aggressivly try to reduce register pressure in the unclustered high RP
808	// stage. Temporarily increase occupancy target in the region.
809	S.SGPRLimitBias = S.HighRPSGPRBias;
810	S.VGPRLimitBias = S.HighRPVGPRBias;
811	if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
812	MFI.increaseOccupancy(MF, Limit: ++DAG.MinOccupancy);
813
814	LLVM_DEBUG(
815	dbgs()
816	<< "Retrying function scheduling without clustering. "
817	"Aggressivly try to reduce register pressure to achieve occupancy "
818	<< DAG.MinOccupancy << ".\n");
819
820	return true;
821	}
822
823	bool ClusteredLowOccStage::initGCNSchedStage() {
824	if (DisableClusteredLowOccupancy)
825	return false;
826
827	if (!GCNSchedStage::initGCNSchedStage())
828	return false;
829
830	// Don't bother trying to improve ILP in lower RP regions if occupancy has not
831	// been dropped. All regions will have already been scheduled with the ideal
832	// occupancy targets.
833	if (DAG.StartingOccupancy <= DAG.MinOccupancy)
834	return false;
835
836	LLVM_DEBUG(
837	dbgs() << "Retrying function scheduling with lowest recorded occupancy "
838	<< DAG.MinOccupancy << ".\n");
839	return true;
840	}
841
842	bool PreRARematStage::initGCNSchedStage() {
843	if (!GCNSchedStage::initGCNSchedStage())
844	return false;
845
846	if (DAG.RegionsWithMinOcc.none() \|\| DAG.Regions.size() == `1`)
847	return false;
848
849	const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
850	// Check maximum occupancy
851	if (ST.computeOccupancy(F: MF.getFunction(), LDSSize: MFI.getLDSSize()) ==
852	DAG.MinOccupancy)
853	return false;
854
855	// FIXME: This pass will invalidate cached MBBLiveIns for regions
856	// inbetween the defs and region we sinked the def to. Cached pressure
857	// for regions where a def is sinked from will also be invalidated. Will
858	// need to be fixed if there is another pass after this pass.
859	assert(!S.hasNextStage());
860
861	collectRematerializableInstructions();
862	if (RematerializableInsts.empty() \|\| !sinkTriviallyRematInsts(ST, TII))
863	return false;
864
865	LLVM_DEBUG(
866	dbgs() << "Retrying function scheduling with improved occupancy of "
867	<< DAG.MinOccupancy << " from rematerializing\n");
868	return true;
869	}
870
871	void GCNSchedStage::finalizeGCNSchedStage() {
872	DAG.finishBlock();
873	LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
874	}
875
876	void UnclusteredHighRPStage::finalizeGCNSchedStage() {
877	SavedMutations.swap(x&: DAG.Mutations);
878	S.SGPRLimitBias = S.VGPRLimitBias = `0`;
879	if (DAG.MinOccupancy > InitialOccupancy) {
880	for (unsigned IDX = `0`; IDX < DAG.Pressure.size(); ++IDX)
881	DAG.RegionsWithMinOcc [IDX] =
882	DAG.Pressure [IDX].getOccupancy(ST: DAG.ST) == DAG.MinOccupancy;
883
884	LLVM_DEBUG(dbgs() << StageID
885	<< " stage successfully increased occupancy to "
886	<< DAG.MinOccupancy << `'\n'`);
887	}
888
889	GCNSchedStage::finalizeGCNSchedStage();
890	}
891
892	bool GCNSchedStage::initGCNRegion() {
893	// Check whether this new region is also a new block.
894	if (DAG.RegionBegin ->getParent() != CurrentMBB)
895	setupNewBlock();
896
897	unsigned NumRegionInstrs = std::distance(first: DAG.begin(), last: DAG.end());
898	DAG.enterRegion(bb: CurrentMBB, begin: DAG.begin(), end: DAG.end(), regioninstrs: NumRegionInstrs);
899
900	// Skip empty scheduling regions (0 or 1 schedulable instructions).
901	if (DAG.begin() == DAG.end() \|\| DAG.begin() == std::prev(x: DAG.end()))
902	return false;
903
904	LLVM_DEBUG(dbgs() << "******** MI Scheduling ********\n");
905	LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
906	<< " " << CurrentMBB->getName()
907	<< "\n From: " << *DAG.begin() << " To: ";
908	if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
909	else dbgs() << "End";
910	dbgs() << " RegionInstrs: " << NumRegionInstrs << `'\n'`);
911
912	// Save original instruction order before scheduling for possible revert.
913	Unsched.clear();
914	Unsched.reserve(n: DAG.NumRegionInstrs);
915	if (StageID == GCNSchedStageID::OccInitialSchedule \|\|
916	StageID == GCNSchedStageID::ILPInitialSchedule) {
917	for (auto &I : DAG) {
918	Unsched.push_back(x: &I);
919	if (I.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER \|\|
920	I.getOpcode() == AMDGPU::IGLP_OPT)
921	DAG.RegionsWithIGLPInstrs [RegionIdx] = true;
922	}
923	} else {
924	for (auto &I : DAG)
925	Unsched.push_back(x: &I);
926	}
927
928	PressureBefore = DAG.Pressure [RegionIdx];
929
930	LLVM_DEBUG(
931	dbgs() << "Pressure before scheduling:\nRegion live-ins:"
932	<< print(DAG.LiveIns[RegionIdx], DAG.MRI)
933	<< "Region live-in pressure: "
934	<< print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
935	<< "Region register pressure: " << print(PressureBefore));
936
937	S.HasHighPressure = false;
938	S.KnownExcessRP = isRegionWithExcessRP();
939
940	if (DAG.RegionsWithIGLPInstrs [RegionIdx] &&
941	StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
942	SavedMutations.clear();
943	SavedMutations.swap(x&: DAG.Mutations);
944	bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule \|\|
945	StageID == GCNSchedStageID::ILPInitialSchedule;
946	DAG.addMutation(Mutation: createIGroupLPDAGMutation(
947	Phase: IsInitialStage ? AMDGPU::SchedulingPhase::Initial
948	: AMDGPU::SchedulingPhase::PreRAReentry));
949	}
950
951	return true;
952	}
953
954	bool UnclusteredHighRPStage::initGCNRegion() {
955	// Only reschedule regions with the minimum occupancy or regions that may have
956	// spilling (excess register pressure).
957	if ((!DAG.RegionsWithMinOcc [RegionIdx] \|\|
958	DAG.MinOccupancy <= InitialOccupancy) &&
959	!DAG.RegionsWithExcessRP [RegionIdx])
960	return false;
961
962	return GCNSchedStage::initGCNRegion();
963	}
964
965	bool ClusteredLowOccStage::initGCNRegion() {
966	// We may need to reschedule this region if it wasn't rescheduled in the last
967	// stage, or if we found it was testing critical register pressure limits in
968	// the unclustered reschedule stage. The later is because we may not have been
969	// able to raise the min occupancy in the previous stage so the region may be
970	// overly constrained even if it was already rescheduled.
971	if (!DAG.RegionsWithHighRP [RegionIdx])
972	return false;
973
974	return GCNSchedStage::initGCNRegion();
975	}
976
977	bool PreRARematStage::initGCNRegion() {
978	if (!DAG.RescheduleRegions [RegionIdx])
979	return false;
980
981	return GCNSchedStage::initGCNRegion();
982	}
983
984	void GCNSchedStage::setupNewBlock() {
985	if (CurrentMBB)
986	DAG.finishBlock();
987
988	CurrentMBB = DAG.RegionBegin ->getParent();
989	DAG.startBlock(bb: CurrentMBB);
990	// Get real RP for the region if it hasn't be calculated before. After the
991	// initial schedule stage real RP will be collected after scheduling.
992	if (StageID == GCNSchedStageID::OccInitialSchedule \|\|
993	StageID == GCNSchedStageID::ILPInitialSchedule)
994	DAG.computeBlockPressure(RegionIdx, MBB: CurrentMBB);
995	}
996
997	void GCNSchedStage::finalizeGCNRegion() {
998	DAG.Regions [RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
999	DAG.RescheduleRegions [RegionIdx] = false;
1000	if (S.HasHighPressure)
1001	DAG.RegionsWithHighRP [RegionIdx] = true;
1002
1003	// Revert scheduling if we have dropped occupancy or there is some other
1004	// reason that the original schedule is better.
1005	checkScheduling();
1006
1007	if (DAG.RegionsWithIGLPInstrs [RegionIdx] &&
1008	StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
1009	SavedMutations.swap(x&: DAG.Mutations);
1010
1011	DAG.exitRegion();
1012	RegionIdx++;
1013	}
1014
1015	void GCNSchedStage::checkScheduling() {
1016	// Check the results of scheduling.
1017	PressureAfter = DAG.getRealRegPressure(RegionIdx);
1018	LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
1019	LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
1020
1021	if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
1022	PressureAfter.getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
1023	DAG.Pressure [RegionIdx] = PressureAfter;
1024	DAG.RegionsWithMinOcc [RegionIdx] =
1025	PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
1026
1027	// Early out if we have achieved the occupancy target.
1028	LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
1029	return;
1030	}
1031
1032	unsigned TargetOccupancy =
1033	std::min(a: S.getTargetOccupancy(), b: ST.getOccupancyWithLocalMemSize(MF));
1034	unsigned WavesAfter =
1035	std::min(a: TargetOccupancy, b: PressureAfter.getOccupancy(ST));
1036	unsigned WavesBefore =
1037	std::min(a: TargetOccupancy, b: PressureBefore.getOccupancy(ST));
1038	LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
1039	<< ", after " << WavesAfter << ".\n");
1040
1041	// We may not be able to keep the current target occupancy because of the just
1042	// scheduled region. We might still be able to revert scheduling if the
1043	// occupancy before was higher, or if the current schedule has register
1044	// pressure higher than the excess limits which could lead to more spilling.
1045	unsigned NewOccupancy = std::max(a: WavesAfter, b: WavesBefore);
1046
1047	// Allow memory bound functions to drop to 4 waves if not limited by an
1048	// attribute.
1049	if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
1050	WavesAfter >= MFI.getMinAllowedOccupancy()) {
1051	LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
1052	<< MFI.getMinAllowedOccupancy() << " waves\n");
1053	NewOccupancy = WavesAfter;
1054	}
1055
1056	if (NewOccupancy < DAG.MinOccupancy) {
1057	DAG.MinOccupancy = NewOccupancy;
1058	MFI.limitOccupancy(Limit: DAG.MinOccupancy);
1059	DAG.RegionsWithMinOcc.reset();
1060	LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
1061	<< DAG.MinOccupancy << ".\n");
1062	}
1063	// The maximum number of arch VGPR on non-unified register file, or the
1064	// maximum VGPR + AGPR in the unified register file case.
1065	unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
1066	// The maximum number of arch VGPR for both unified and non-unified register
1067	// file.
1068	unsigned MaxArchVGPRs = std::min(a: MaxVGPRs, b: ST.getAddressableNumArchVGPRs());
1069	unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
1070
1071	if (PressureAfter.getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts()) > MaxVGPRs \|\|
1072	PressureAfter.getVGPRNum(UnifiedVGPRFile: false) > MaxArchVGPRs \|\|
1073	PressureAfter.getAGPRNum() > MaxArchVGPRs \|\|
1074	PressureAfter.getSGPRNum() > MaxSGPRs) {
1075	DAG.RescheduleRegions [RegionIdx] = true;
1076	DAG.RegionsWithHighRP [RegionIdx] = true;
1077	DAG.RegionsWithExcessRP [RegionIdx] = true;
1078	}
1079
1080	// Revert if this region's schedule would cause a drop in occupancy or
1081	// spilling.
1082	if (shouldRevertScheduling(WavesAfter)) {
1083	revertScheduling();
1084	} else {
1085	DAG.Pressure [RegionIdx] = PressureAfter;
1086	DAG.RegionsWithMinOcc [RegionIdx] =
1087	PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
1088	}
1089	}
1090
1091	unsigned
1092	GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1093	DenseMap<unsigned, unsigned> &ReadyCycles,
1094	const TargetSchedModel &SM) {
1095	unsigned ReadyCycle = CurrCycle;
1096	for (auto &D : SU.Preds) {
1097	if (D.isAssignedRegDep()) {
1098	MachineInstr *DefMI = D.getSUnit()->getInstr();
1099	unsigned Latency = SM.computeInstrLatency(MI: DefMI);
1100	unsigned DefReady = ReadyCycles [DAG.getSUnit(MI: DefMI)->NodeNum];
1101	ReadyCycle = std::max(a: ReadyCycle, b: DefReady + Latency);
1102	}
1103	}
1104	ReadyCycles [SU.NodeNum] = ReadyCycle;
1105	return ReadyCycle;
1106	}
1107
1108	#ifndef NDEBUG
1109	struct EarlierIssuingCycle {
1110	bool operator()(std::pair<MachineInstr , unsigned*> A,
1111	std::pair<MachineInstr , unsigned> B) const* {
1112	return A.second < B.second;
1113	}
1114	};
1115
1116	static void printScheduleModel(std::set<std::pair<MachineInstr , unsigned*>,
1117	EarlierIssuingCycle> &ReadyCycles) {
1118	if (ReadyCycles.empty())
1119	return;
1120	unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
1121	dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
1122	<< " ##################\n# Cycle #\t\t\tInstruction "
1123	" "
1124	" \n";
1125	unsigned IPrev = `1`;
1126	for (auto &I : ReadyCycles) {
1127	if (I.second > IPrev + `1`)
1128	dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
1129	<< " CYCLES DETECTED ******************************\n\n";
1130	dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
1131	IPrev = I.second;
1132	}
1133	}
1134	#endif
1135
1136	ScheduleMetrics
1137	GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
1138	#ifndef NDEBUG
1139	std::set<std::pair<MachineInstr , unsigned*>, EarlierIssuingCycle>
1140	ReadyCyclesSorted;
1141	#endif
1142	const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1143	unsigned SumBubbles = `0`;
1144	DenseMap<unsigned, unsigned> ReadyCycles;
1145	unsigned CurrCycle = `0`;
1146	for (auto &SU : InputSchedule) {
1147	unsigned ReadyCycle =
1148	computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
1149	SumBubbles += ReadyCycle - CurrCycle;
1150	#ifndef NDEBUG
1151	ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
1152	#endif
1153	CurrCycle = ++ReadyCycle;
1154	}
1155	#ifndef NDEBUG
1156	LLVM_DEBUG(
1157	printScheduleModel(ReadyCyclesSorted);
1158	dbgs() << "\n\t"
1159	<< "Metric: "
1160	<< (SumBubbles
1161	? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1162	: `1`)
1163	<< "\n\n");
1164	#endif
1165
1166	return ScheduleMetrics (CurrCycle, SumBubbles);
1167	}
1168
1169	ScheduleMetrics
1170	GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
1171	#ifndef NDEBUG
1172	std::set<std::pair<MachineInstr , unsigned*>, EarlierIssuingCycle>
1173	ReadyCyclesSorted;
1174	#endif
1175	const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1176	unsigned SumBubbles = `0`;
1177	DenseMap<unsigned, unsigned> ReadyCycles;
1178	unsigned CurrCycle = `0`;
1179	for (auto &MI : DAG) {
1180	SUnit *SU = DAG.getSUnit(MI: &MI);
1181	if (!SU)
1182	continue;
1183	unsigned ReadyCycle =
1184	computeSUnitReadyCycle(SU: *SU, CurrCycle, ReadyCycles, SM);
1185	SumBubbles += ReadyCycle - CurrCycle;
1186	#ifndef NDEBUG
1187	ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
1188	#endif
1189	CurrCycle = ++ReadyCycle;
1190	}
1191	#ifndef NDEBUG
1192	LLVM_DEBUG(
1193	printScheduleModel(ReadyCyclesSorted);
1194	dbgs() << "\n\t"
1195	<< "Metric: "
1196	<< (SumBubbles
1197	? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1198	: `1`)
1199	<< "\n\n");
1200	#endif
1201
1202	return ScheduleMetrics (CurrCycle, SumBubbles);
1203	}
1204
1205	bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
1206	if (WavesAfter < DAG.MinOccupancy)
1207	return true;
1208
1209	return false;
1210	}
1211
1212	bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1213	if (PressureAfter == PressureBefore)
1214	return false;
1215
1216	if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1217	return true;
1218
1219	if (mayCauseSpilling(WavesAfter))
1220	return true;
1221
1222	return false;
1223	}
1224
1225	bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
1226	// If RP is not reduced in the unclustered reschedule stage, revert to the
1227	// old schedule.
1228	if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
1229	mayCauseSpilling(WavesAfter)) \|\|
1230	GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
1231	LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
1232	return true;
1233	}
1234
1235	// Do not attempt to relax schedule even more if we are already spilling.
1236	if (isRegionWithExcessRP())
1237	return false;
1238
1239	LLVM_DEBUG(
1240	dbgs()
1241	<< "\n\t * In shouldRevertScheduling *\n"
1242	<< " ********* BEFORE UnclusteredHighRPStage *********\n");
1243	ScheduleMetrics MBefore =
1244	getScheduleMetrics(InputSchedule: DAG.SUnits);
1245	LLVM_DEBUG(
1246	dbgs()
1247	<< "\n ********* AFTER UnclusteredHighRPStage *********\n");
1248	ScheduleMetrics MAfter = getScheduleMetrics(DAG);
1249	unsigned OldMetric = MBefore.getMetric();
1250	unsigned NewMetric = MAfter.getMetric();
1251	unsigned WavesBefore =
1252	std::min(a: S.getTargetOccupancy(), b: PressureBefore.getOccupancy(ST));
1253	unsigned Profit =
1254	((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
1255	((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
1256	NewMetric) /
1257	ScheduleMetrics::ScaleFactor;
1258	LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
1259	<< MAfter << "Profit: " << Profit << "\n");
1260	return Profit < ScheduleMetrics::ScaleFactor;
1261	}
1262
1263	bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
1264	if (PressureAfter == PressureBefore)
1265	return false;
1266
1267	if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1268	return true;
1269
1270	if (mayCauseSpilling(WavesAfter))
1271	return true;
1272
1273	return false;
1274	}
1275
1276	bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
1277	if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1278	return true;
1279
1280	if (mayCauseSpilling(WavesAfter))
1281	return true;
1282
1283	return false;
1284	}
1285
1286	bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1287	if (mayCauseSpilling(WavesAfter))
1288	return true;
1289
1290	return false;
1291	}
1292
1293	bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
1294	if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
1295	!PressureAfter.less(MF, O: PressureBefore)) {
1296	LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
1297	return true;
1298	}
1299
1300	return false;
1301	}
1302
1303	void GCNSchedStage::revertScheduling() {
1304	DAG.RegionsWithMinOcc [RegionIdx] =
1305	PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
1306	LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
1307	DAG.RescheduleRegions [RegionIdx] =
1308	S.hasNextStage() &&
1309	S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule;
1310	DAG.RegionEnd = DAG.RegionBegin;
1311	int SkippedDebugInstr = `0`;
1312	for (MachineInstr *MI : Unsched) {
1313	if (MI->isDebugInstr()) {
1314	++SkippedDebugInstr;
1315	continue;
1316	}
1317
1318	if (MI->getIterator() != DAG.RegionEnd) {
1319	DAG.BB->remove(I: MI);
1320	DAG.BB->insert(I: DAG.RegionEnd, MI);
1321	if (!MI->isDebugInstr())
1322	DAG.LIS->handleMove(MI&: MI, UpdateFlags: true*);
1323	}
1324
1325	// Reset read-undef flags and update them later.
1326	for (auto &Op : MI->all_defs())
1327	Op.setIsUndef(false);
1328	RegisterOperands RegOpers;
1329	RegOpers.collect(MI: MI, TRI: DAG.TRI, MRI: DAG.MRI, TrackLaneMasks: DAG.ShouldTrackLaneMasks, IgnoreDead: false);
1330	if (!MI->isDebugInstr()) {
1331	if (DAG.ShouldTrackLaneMasks) {
1332	// Adjust liveness and add missing dead+read-undef flags.
1333	SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(Instr: *MI).getRegSlot();
1334	RegOpers.adjustLaneLiveness(LIS: *DAG.LIS, MRI: DAG.MRI, Pos: SlotIdx, AddFlagsMI: MI);
1335	} else {
1336	// Adjust for missing dead-def flags.
1337	RegOpers.detectDeadDefs(MI: MI, LIS: DAG.LIS);
1338	}
1339	}
1340	DAG.RegionEnd = MI->getIterator();
1341	++DAG.RegionEnd;
1342	LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
1343	}
1344
1345	// After reverting schedule, debug instrs will now be at the end of the block
1346	// and RegionEnd will point to the first debug instr. Increment RegionEnd
1347	// pass debug instrs to the actual end of the scheduling region.
1348	while (SkippedDebugInstr-- > `0`)
1349	++DAG.RegionEnd;
1350
1351	// If Unsched.front() instruction is a debug instruction, this will actually
1352	// shrink the region since we moved all debug instructions to the end of the
1353	// block. Find the first instruction that is not a debug instruction.
1354	DAG.RegionBegin = Unsched.front()->getIterator();
1355	if (DAG.RegionBegin ->isDebugInstr()) {
1356	for (MachineInstr *MI : Unsched) {
1357	if (MI->isDebugInstr())
1358	continue;
1359	DAG.RegionBegin = MI->getIterator();
1360	break;
1361	}
1362	}
1363
1364	// Then move the debug instructions back into their correct place and set
1365	// RegionBegin and RegionEnd if needed.
1366	DAG.placeDebugValues();
1367
1368	DAG.Regions [RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1369	}
1370
1371	void PreRARematStage::collectRematerializableInstructions() {
1372	const SIRegisterInfo SRI = static_cast<const* SIRegisterInfo *>(DAG.TRI);
1373	for (unsigned I = `0`, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
1374	Register Reg = Register::index2VirtReg(Index: I);
1375	if (!DAG.LIS->hasInterval(Reg))
1376	continue;
1377
1378	// TODO: Handle AGPR and SGPR rematerialization
1379	if (!SRI->isVGPRClass(RC: DAG.MRI.getRegClass(Reg)) \|\|
1380	!DAG.MRI.hasOneDef(RegNo: Reg) \|\| !DAG.MRI.hasOneNonDBGUse(RegNo: Reg))
1381	continue;
1382
1383	MachineOperand *Op = DAG.MRI.getOneDef(Reg);
1384	MachineInstr *Def = Op->getParent();
1385	if (Op->getSubReg() != `0` \|\| !isTriviallyReMaterializable(MI: *Def))
1386	continue;
1387
1388	MachineInstr UseI = &DAG.MRI.use_instr_nodbg_begin(RegNo: Reg);
1389	if (Def->getParent() == UseI->getParent())
1390	continue;
1391
1392	// We are only collecting defs that are defined in another block and are
1393	// live-through or used inside regions at MinOccupancy. This means that the
1394	// register must be in the live-in set for the region.
1395	bool AddedToRematList = false;
1396	for (unsigned I = `0`, E = DAG.Regions.size(); I != E; ++I) {
1397	auto It = DAG.LiveIns [I].find(Val: Reg);
1398	if (It != DAG.LiveIns [I].end() && !It ->second.none()) {
1399	if (DAG.RegionsWithMinOcc [I]) {
1400	RematerializableInsts [I][Def] = UseI;
1401	AddedToRematList = true;
1402	}
1403
1404	// Collect regions with rematerializable reg as live-in to avoid
1405	// searching later when updating RP.
1406	RematDefToLiveInRegions [Def].push_back(Elt: I);
1407	}
1408	}
1409	if (!AddedToRematList)
1410	RematDefToLiveInRegions.erase(Val: Def);
1411	}
1412	}
1413
1414	bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
1415	const TargetInstrInfo *TII) {
1416	// Temporary copies of cached variables we will be modifying and replacing if
1417	// sinking succeeds.
1418	SmallVector<
1419	std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, `32`>
1420	NewRegions;
1421	DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
1422	DenseMap<unsigned, GCNRegPressure> NewPressure;
1423	BitVector NewRescheduleRegions;
1424	LiveIntervals *LIS = DAG.LIS;
1425
1426	NewRegions.resize(N: DAG.Regions.size());
1427	NewRescheduleRegions.resize(N: DAG.Regions.size());
1428
1429	// Collect only regions that has a rematerializable def as a live-in.
1430	SmallSet<unsigned, `16`> ImpactedRegions;
1431	for (const auto &It : RematDefToLiveInRegions)
1432	ImpactedRegions.insert(I: It.second.begin(), E: It.second.end());
1433
1434	// Make copies of register pressure and live-ins cache that will be updated
1435	// as we rematerialize.
1436	for (auto Idx : ImpactedRegions) {
1437	NewPressure [Idx] = DAG.Pressure [Idx];
1438	NewLiveIns [Idx] = DAG.LiveIns [Idx];
1439	}
1440	NewRegions = DAG.Regions;
1441	NewRescheduleRegions.reset();
1442
1443	DenseMap<MachineInstr , MachineInstr > InsertedMIToOldDef;
1444	bool Improved = false;
1445	for (auto I : ImpactedRegions) {
1446	if (!DAG.RegionsWithMinOcc [I])
1447	continue;
1448
1449	Improved = false;
1450	int VGPRUsage = NewPressure [I].getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts());
1451	int SGPRUsage = NewPressure [I].getSGPRNum();
1452
1453	// TODO: Handle occupancy drop due to AGPR and SGPR.
1454	// Check if cause of occupancy drop is due to VGPR usage and not SGPR.
1455	if (ST.getOccupancyWithNumSGPRs(SGPRs: SGPRUsage) == DAG.MinOccupancy)
1456	break;
1457
1458	// The occupancy of this region could have been improved by a previous
1459	// iteration's sinking of defs.
1460	if (NewPressure [I].getOccupancy(ST) > DAG.MinOccupancy) {
1461	NewRescheduleRegions [I] = true;
1462	Improved = true;
1463	continue;
1464	}
1465
1466	// First check if we have enough trivially rematerializable instructions to
1467	// improve occupancy. Optimistically assume all instructions we are able to
1468	// sink decreased RP.
1469	int TotalSinkableRegs = `0`;
1470	for (const auto &It : RematerializableInsts [I]) {
1471	MachineInstr *Def = It.first;
1472	Register DefReg = Def->getOperand(i: `0`).getReg();
1473	TotalSinkableRegs +=
1474	SIRegisterInfo::getNumCoveredRegs(LM: NewLiveIns [I][DefReg]);
1475	}
1476	int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
1477	unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRs: VGPRsAfterSink);
1478	// If in the most optimistic scenario, we cannot improve occupancy, then do
1479	// not attempt to sink any instructions.
1480	if (OptimisticOccupancy <= DAG.MinOccupancy)
1481	break;
1482
1483	unsigned ImproveOccupancy = `0`;
1484	SmallVector<MachineInstr *, `4`> SinkedDefs;
1485	for (auto &It : RematerializableInsts [I]) {
1486	MachineInstr *Def = It.first;
1487	MachineBasicBlock::iterator InsertPos =
1488	MachineBasicBlock::iterator (It.second);
1489	Register Reg = Def->getOperand(i: `0`).getReg();
1490	// Rematerialize MI to its use block. Since we are only rematerializing
1491	// instructions that do not have any virtual reg uses, we do not need to
1492	// call LiveRangeEdit::allUsesAvailableAt() and
1493	// LiveRangeEdit::canRematerializeAt().
1494	TII->reMaterialize(MBB&: *InsertPos ->getParent(), MI: InsertPos, DestReg: Reg,
1495	SubIdx: Def->getOperand(i: `0`).getSubReg(), Orig: Def, TRI: DAG.TRI);
1496	MachineInstr NewMI = &std::prev(x: InsertPos);
1497	LIS->InsertMachineInstrInMaps(MI&: *NewMI);
1498	LIS->removeInterval(Reg);
1499	LIS->createAndComputeVirtRegInterval(Reg);
1500	InsertedMIToOldDef [NewMI] = Def;
1501
1502	// Update region boundaries in scheduling region we sinked from since we
1503	// may sink an instruction that was at the beginning or end of its region
1504	DAG.updateRegionBoundaries(RegionBoundaries&: NewRegions, MI: Def, /NewMI =/nullptr,
1505	/Removing =/true);
1506
1507	// Update region boundaries in region we sinked to.
1508	DAG.updateRegionBoundaries(RegionBoundaries&: NewRegions, MI: InsertPos, NewMI);
1509
1510	LaneBitmask PrevMask = NewLiveIns [I][Reg];
1511	// FIXME: Also update cached pressure for where the def was sinked from.
1512	// Update RP for all regions that has this reg as a live-in and remove
1513	// the reg from all regions as a live-in.
1514	for (auto Idx : RematDefToLiveInRegions [Def]) {
1515	NewLiveIns [Idx].erase(Val: Reg);
1516	if (InsertPos ->getParent() != DAG.Regions [Idx].first ->getParent()) {
1517	// Def is live-through and not used in this block.
1518	NewPressure [Idx].inc(Reg, PrevMask, NewMask: LaneBitmask::getNone(), MRI: DAG.MRI);
1519	} else {
1520	// Def is used and rematerialized into this block.
1521	GCNDownwardRPTracker RPT(*LIS);
1522	auto NonDbgMI = &skipDebugInstructionsForward(
1523	It: NewRegions [Idx].first, End: NewRegions [Idx].second);
1524	RPT.reset(MI: *NonDbgMI, LiveRegs: &NewLiveIns [Idx]);
1525	RPT.advance(End: NewRegions [Idx].second);
1526	NewPressure [Idx] = RPT.moveMaxPressure();
1527	}
1528	}
1529
1530	SinkedDefs.push_back(Elt: Def);
1531	ImproveOccupancy = NewPressure [I].getOccupancy(ST);
1532	if (ImproveOccupancy > DAG.MinOccupancy)
1533	break;
1534	}
1535
1536	// Remove defs we just sinked from all regions' list of sinkable defs
1537	for (auto &Def : SinkedDefs)
1538	for (auto TrackedIdx : RematDefToLiveInRegions [Def])
1539	RematerializableInsts [TrackedIdx].erase(Key: Def);
1540
1541	if (ImproveOccupancy <= DAG.MinOccupancy)
1542	break;
1543
1544	NewRescheduleRegions [I] = true;
1545	Improved = true;
1546	}
1547
1548	if (!Improved) {
1549	// Occupancy was not improved for all regions that were at MinOccupancy.
1550	// Undo sinking and remove newly rematerialized instructions.
1551	for (auto &Entry : InsertedMIToOldDef) {
1552	MachineInstr *MI = Entry.first;
1553	MachineInstr *OldMI = Entry.second;
1554	Register Reg = MI->getOperand(i: `0`).getReg();
1555	LIS->RemoveMachineInstrFromMaps(MI&: *MI);
1556	MI->eraseFromParent();
1557	OldMI->clearRegisterDeads(Reg);
1558	LIS->removeInterval(Reg);
1559	LIS->createAndComputeVirtRegInterval(Reg);
1560	}
1561	return false;
1562	}
1563
1564	// Occupancy was improved for all regions.
1565	for (auto &Entry : InsertedMIToOldDef) {
1566	MachineInstr *MI = Entry.first;
1567	MachineInstr *OldMI = Entry.second;
1568
1569	// Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
1570	DAG.BBLiveInMap.erase(Val: OldMI);
1571
1572	// Remove OldMI and update LIS
1573	Register Reg = MI->getOperand(i: `0`).getReg();
1574	LIS->RemoveMachineInstrFromMaps(MI&: *OldMI);
1575	OldMI->eraseFromParent();
1576	LIS->removeInterval(Reg);
1577	LIS->createAndComputeVirtRegInterval(Reg);
1578	}
1579
1580	// Update live-ins, register pressure, and regions caches.
1581	for (auto Idx : ImpactedRegions) {
1582	DAG.LiveIns [Idx] = NewLiveIns [Idx];
1583	DAG.Pressure [Idx] = NewPressure [Idx];
1584	DAG.MBBLiveIns.erase(Val: DAG.Regions [Idx].first ->getParent());
1585	}
1586	DAG.Regions = NewRegions;
1587	DAG.RescheduleRegions = NewRescheduleRegions;
1588
1589	SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
1590	MFI.increaseOccupancy(MF, Limit: ++DAG.MinOccupancy);
1591
1592	return true;
1593	}
1594
1595	// Copied from MachineLICM
1596	bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
1597	if (!DAG.TII->isTriviallyReMaterializable(MI))
1598	return false;
1599
1600	for (const MachineOperand &MO : MI.all_uses())
1601	if (MO.getReg().isVirtual())
1602	return false;
1603
1604	return true;
1605	}
1606
1607	// When removing, we will have to check both beginning and ending of the region.
1608	// When inserting, we will only have to check if we are inserting NewMI in front
1609	// of a scheduling region and do not need to check the ending since we will only
1610	// ever be inserting before an already existing MI.
1611	void GCNScheduleDAGMILive::updateRegionBoundaries(
1612	SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
1613	MachineBasicBlock::iterator>> &RegionBoundaries,
1614	MachineBasicBlock::iterator MI, MachineInstr NewMI, bool* Removing) {
1615	unsigned I = `0`, E = RegionBoundaries.size();
1616	// Search for first region of the block where MI is located
1617	while (I != E && MI ->getParent() != RegionBoundaries [I].first ->getParent())
1618	++I;
1619
1620	for (; I != E; ++I) {
1621	if (MI ->getParent() != RegionBoundaries [I].first ->getParent())
1622	return;
1623
1624	if (Removing && MI == RegionBoundaries [I].first &&
1625	MI == RegionBoundaries [I].second) {
1626	// MI is in a region with size 1, after removing, the region will be
1627	// size 0, set RegionBegin and RegionEnd to pass end of block iterator.
1628	RegionBoundaries [I] =
1629	std::pair(MI ->getParent()->end(), MI ->getParent()->end());
1630	return;
1631	}
1632	if (MI == RegionBoundaries [I].first) {
1633	if (Removing)
1634	RegionBoundaries [I] =
1635	std::pair(std::next(x: MI), RegionBoundaries [I].second);
1636	else
1637	// Inserted NewMI in front of region, set new RegionBegin to NewMI
1638	RegionBoundaries [I] = std::pair(MachineBasicBlock::iterator (NewMI),
1639	RegionBoundaries [I].second);
1640	return;
1641	}
1642	if (Removing && MI == RegionBoundaries [I].second) {
1643	RegionBoundaries [I] = std::pair(RegionBoundaries [I].first, std::prev(x: MI));
1644	return;
1645	}
1646	}
1647	}
1648
1649	static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
1650	return std::any_of(
1651	first: DAG->begin(), last: DAG->end(), pred: [](MachineBasicBlock::iterator MI) {
1652	unsigned Opc = MI ->getOpcode();
1653	return Opc == AMDGPU::SCHED_GROUP_BARRIER \|\| Opc == AMDGPU::IGLP_OPT;
1654	});
1655	}
1656
1657	GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
1658	MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
1659	bool RemoveKillFlags)
1660	: ScheduleDAGMI (C, std::move(S), RemoveKillFlags) {}
1661
1662	void GCNPostScheduleDAGMILive::schedule() {
1663	HasIGLPInstrs = hasIGLPInstrs(DAG: this);
1664	if (HasIGLPInstrs) {
1665	SavedMutations.clear();
1666	SavedMutations.swap(x&: Mutations);
1667	addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PostRA));
1668	}
1669
1670	ScheduleDAGMI::schedule();
1671	}
1672
1673	void GCNPostScheduleDAGMILive::finalizeSchedule() {
1674	if (HasIGLPInstrs)
1675	SavedMutations.swap(x&: Mutations);
1676
1677	ScheduleDAGMI::finalizeSchedule();
1678	}
1679

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp