1//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This contains a MachineSchedStrategy implementation for maximizing wave
11/// occupancy on GCN hardware.
12///
13/// This pass will apply multiple scheduling stages to the same function.
14/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15/// entry point for the scheduling of those regions is
16/// GCNScheduleDAGMILive::runSchedStages.
///
18/// Generally, the reason for having multiple scheduling stages is to account
19/// for the kernel-wide effect of register usage on occupancy. Usually, only a
20/// few scheduling regions will have register pressure high enough to limit
21/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22/// other regions.
23///
24//===----------------------------------------------------------------------===//
25
26#include "GCNSchedStrategy.h"
27#include "AMDGPUIGroupLP.h"
28#include "GCNRegPressure.h"
29#include "SIMachineFunctionInfo.h"
30#include "Utils/AMDGPUBaseInfo.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/STLExtras.h"
33#include "llvm/CodeGen/CalcSpillWeights.h"
34#include "llvm/CodeGen/MachineBasicBlock.h"
35#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
36#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
37#include "llvm/CodeGen/MachineCycleAnalysis.h"
38#include "llvm/CodeGen/MachineOperand.h"
39#include "llvm/CodeGen/RegisterClassInfo.h"
40#include "llvm/MC/LaneBitmask.h"
41#include "llvm/MC/MCInstrItineraries.h"
42#include "llvm/MC/MCSchedule.h"
43#include "llvm/MC/TargetRegistry.h"
44#include "llvm/Support/ErrorHandling.h"
45
46#define DEBUG_TYPE "machine-scheduler"
47
48using namespace llvm;
49
50static cl::opt<bool> DisableUnclusterHighRP(
51 "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
52 cl::desc("Disable unclustered high register pressure "
53 "reduction scheduling stage."),
54 cl::init(Val: false));
55
56static cl::opt<bool> DisableClusteredLowOccupancy(
57 "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
58 cl::desc("Disable clustered low occupancy "
59 "rescheduling for ILP scheduling stage."),
60 cl::init(Val: false));
61
static cl::opt<unsigned> ScheduleMetricBias(
    "amdgpu-schedule-metric-bias", cl::Hidden,
    cl::desc(
        "Sets the bias which adds weight to occupancy vs latency. Set it to "
        "100 to chase occupancy only."),
    cl::init(10));
68
69static cl::opt<bool>
70 RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
71 cl::desc("Relax occupancy targets for kernels which are memory "
72 "bound (amdgpu-membound-threshold), or "
73 "Wave Limited (amdgpu-limit-wave-threshold)."),
74 cl::init(Val: false));
75
76static cl::opt<bool> GCNTrackers(
77 "amdgpu-use-amdgpu-trackers", cl::Hidden,
78 cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
79 cl::init(Val: false));
80
81static cl::opt<unsigned> PendingQueueLimit(
82 "amdgpu-scheduler-pending-queue-limit", cl::Hidden,
83 cl::desc(
84 "Max (Available+Pending) size to inspect pending queue (0 disables)"),
85 cl::init(Val: 256));
86
87#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
88#define DUMP_MAX_REG_PRESSURE
89static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler(
90 "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden,
91 cl::desc("Print a list of live registers along with their def/uses at the "
92 "point of maximum register pressure before scheduling."),
93 cl::init(false));
94
95static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler(
96 "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden,
97 cl::desc("Print a list of live registers along with their def/uses at the "
98 "point of maximum register pressure after scheduling."),
99 cl::init(false));
100#endif
101
static cl::opt<bool> DisableRewriteMFMAFormSchedStage(
    "amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden,
    cl::desc("Disable the MFMA form rewrite scheduling stage"),
    cl::init(true));
105
106const unsigned ScheduleMetrics::ScaleFactor = 100;
107
108GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
109 : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
110 DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
111}
112
113void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
114 GenericScheduler::initialize(dag: DAG);
115
116 MF = &DAG->MF;
117
118 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
119
120 SGPRExcessLimit =
121 Context->RegClassInfo->getNumAllocatableRegs(RC: &AMDGPU::SGPR_32RegClass);
122 VGPRExcessLimit =
123 Context->RegClassInfo->getNumAllocatableRegs(RC: &AMDGPU::VGPR_32RegClass);
124
125 SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  // Set the initial TargetOccupancy to the maximum occupancy that we can
  // achieve for this function. This effectively sets a lower bound on the
  // 'Critical' register limits in the scheduler.
  // Allow for lower occupancy targets if the kernel is wave limited or memory
  // bound and the relaxed occupancy feature is enabled.
131 TargetOccupancy =
132 RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
133 SGPRCriticalLimit =
134 std::min(a: ST.getMaxNumSGPRs(WavesPerEU: TargetOccupancy, Addressable: true), b: SGPRExcessLimit);
135
136 if (!KnownExcessRP) {
137 VGPRCriticalLimit = std::min(
138 a: ST.getMaxNumVGPRs(WavesPerEU: TargetOccupancy, DynamicVGPRBlockSize: MFI.getDynamicVGPRBlockSize()),
139 b: VGPRExcessLimit);
140 } else {
    // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result, except
    // that it returns a reasonably small number for targets with lots of
    // VGPRs, such as GFX10 and GFX11.
144 LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
145 "VGPRCriticalLimit calculation method.\n");
146 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
147 unsigned Granule =
148 AMDGPU::IsaInfo::getVGPRAllocGranule(STI: &ST, DynamicVGPRBlockSize);
149 unsigned Addressable =
150 AMDGPU::IsaInfo::getAddressableNumVGPRs(STI: &ST, DynamicVGPRBlockSize);
151 unsigned VGPRBudget = alignDown(Value: Addressable / TargetOccupancy, Align: Granule);
152 VGPRBudget = std::max(a: VGPRBudget, b: Granule);
153 VGPRCriticalLimit = std::min(a: VGPRBudget, b: VGPRExcessLimit);
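    // For example, assuming 256 addressable VGPRs, an allocation granule of 4,
    // and TargetOccupancy = 8 (illustrative values), the budget would be
    // alignDown(256 / 8, 4) = 32 VGPRs, and it is never allowed to drop below
    // the granule.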
154 }
155
156 // Subtract error margin and bias from register limits and avoid overflow.
157 SGPRCriticalLimit -= std::min(a: SGPRLimitBias + ErrorMargin, b: SGPRCriticalLimit);
158 VGPRCriticalLimit -= std::min(a: VGPRLimitBias + ErrorMargin, b: VGPRCriticalLimit);
159 SGPRExcessLimit -= std::min(a: SGPRLimitBias + ErrorMargin, b: SGPRExcessLimit);
160 VGPRExcessLimit -= std::min(a: VGPRLimitBias + ErrorMargin, b: VGPRExcessLimit);
161 LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
162 << ", VGPRExcessLimit = " << VGPRExcessLimit
163 << ", SGPRCriticalLimit = " << SGPRCriticalLimit
164 << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
165}
166
167/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
168/// current register pressure.
169///
170/// This works for the common case, but it has a few exceptions that have been
171/// observed through trial and error:
172/// - Explicit physical register operands
173/// - Subregister definitions
174///
175/// In both of those cases, PressureDiff doesn't represent the actual pressure,
176/// and querying LiveIntervals through the RegPressureTracker is needed to get
177/// an accurate value.
178///
179/// We should eventually only use PressureDiff for maximum performance, but this
180/// already allows 80% of SUs to take the fast path without changing scheduling
181/// at all. Further changes would either change scheduling, or require a lot
182/// more logic to recover an accurate pressure estimate from the PressureDiffs.
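///
/// For example (illustrative MIR), a subregister def such as
///   %3.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
/// only writes part of %3, so the cached PressureDiff does not reflect the
/// actual pressure change and the RegPressureTracker must be queried instead.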
183static bool canUsePressureDiffs(const SUnit &SU) {
184 if (!SU.isInstr())
185 return false;
186
  // Cannot use pressure diffs for subregister defs or with physregs; it's
  // imprecise in both cases.
189 for (const auto &Op : SU.getInstr()->operands()) {
190 if (!Op.isReg() || Op.isImplicit())
191 continue;
192 if (Op.getReg().isPhysical() ||
193 (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
194 return false;
195 }
196 return true;
197}
198
199static void getRegisterPressures(
200 bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
201 std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
202 GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
203 ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
  // getDownwardPressure() and getUpwardPressure() make temporary changes to
  // the tracker, so we need to pass those functions a non-const reference.
206 RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
207 if (!GCNTrackers) {
208 AtTop
209 ? TempTracker.getDownwardPressure(MI: SU->getInstr(), PressureResult&: Pressure, MaxPressureResult&: MaxPressure)
210 : TempTracker.getUpwardPressure(MI: SU->getInstr(), PressureResult&: Pressure, MaxPressureResult&: MaxPressure);
211
212 return;
213 }
214
215 // GCNTrackers
216 Pressure.resize(new_size: 4, x: 0);
217 MachineInstr *MI = SU->getInstr();
218 GCNRegPressure NewPressure;
219 if (AtTop) {
220 GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
221 NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, TRI: SRI);
222 } else {
223 GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
224 TempUpwardTracker.recede(MI: *MI);
225 NewPressure = TempUpwardTracker.getPressure();
226 }
227 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
228 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
229 NewPressure.getArchVGPRNum();
230 Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
231}
232
233void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
234 bool AtTop,
235 const RegPressureTracker &RPTracker,
236 const SIRegisterInfo *SRI,
237 unsigned SGPRPressure,
238 unsigned VGPRPressure, bool IsBottomUp) {
239 Cand.SU = SU;
240 Cand.AtTop = AtTop;
241
242 if (!DAG->isTrackingPressure())
243 return;
244
245 Pressure.clear();
246 MaxPressure.clear();
247
248 // We try to use the cached PressureDiffs in the ScheduleDAG whenever
249 // possible over querying the RegPressureTracker.
250 //
251 // RegPressureTracker will make a lot of LIS queries which are very
252 // expensive, it is considered a slow function in this context.
253 //
254 // PressureDiffs are precomputed and cached, and getPressureDiff is just a
255 // trivial lookup into an array. It is pretty much free.
256 //
257 // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
258 // PressureDiffs.
259 if (AtTop || !canUsePressureDiffs(SU: *SU) || GCNTrackers) {
260 getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
261 DownwardTracker, UpwardTracker, DAG, SRI);
262 } else {
263 // Reserve 4 slots.
264 Pressure.resize(new_size: 4, x: 0);
265 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
266 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
267
268 for (const auto &Diff : DAG->getPressureDiff(SU)) {
269 if (!Diff.isValid())
270 continue;
271 // PressureDiffs is always bottom-up so if we're working top-down we need
272 // to invert its sign.
273 Pressure[Diff.getPSet()] +=
274 (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
275 }
276
277#ifdef EXPENSIVE_CHECKS
278 std::vector<unsigned> CheckPressure, CheckMaxPressure;
279 getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
280 DownwardTracker, UpwardTracker, DAG, SRI);
281 if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
282 CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
283 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
284 CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
285 errs() << "Register Pressure is inaccurate when calculated through "
286 "PressureDiff\n"
287 << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
288 << ", expected "
289 << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
290 << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
291 << ", expected "
292 << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
293 report_fatal_error("inaccurate register pressure calculation");
294 }
295#endif
296 }
297
298 unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
299 unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
300
301 // If two instructions increase the pressure of different register sets
302 // by the same amount, the generic scheduler will prefer to schedule the
303 // instruction that increases the set with the least amount of registers,
304 // which in our case would be SGPRs. This is rarely what we want, so
305 // when we report excess/critical register pressure, we do it either
306 // only for VGPRs or only for SGPRs.
307
308 // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
309 const unsigned MaxVGPRPressureInc = 16;
310 bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
311 bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
312
313 // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
314 // to increase the likelihood we don't go over the limits. We should improve
315 // the analysis to look through dependencies to find the path with the least
316 // register pressure.
317
318 // We only need to update the RPDelta for instructions that increase register
319 // pressure. Instructions that decrease or keep reg pressure the same will be
320 // marked as RegExcess in tryCandidate() when they are compared with
321 // instructions that increase the register pressure.
322 if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
323 HasHighPressure = true;
324 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
325 Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
326 }
327
328 if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
329 HasHighPressure = true;
330 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
331 Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
332 }
333
  // Register pressure is considered 'CRITICAL' if it is approaching a value
  // that would reduce the wave occupancy for the execution unit. When
  // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
  // have the same cost, so we don't need to prefer one over the other.
338
339 int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
340 int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
341
342 if (SGPRDelta >= 0 || VGPRDelta >= 0) {
343 HasHighPressure = true;
344 if (SGPRDelta > VGPRDelta) {
345 Cand.RPDelta.CriticalMax =
346 PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
347 Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
348 } else {
349 Cand.RPDelta.CriticalMax =
350 PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
351 Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
352 }
353 }
354}
355
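// Returns true when the Pending queue is worth inspecting in addition to the
// Available queue: the target must provide a buffered machine model
// (MicroOpBufferSize != 0) and the combined queue size must not exceed
// -amdgpu-scheduler-pending-queue-limit (256 by default, 0 disables it).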
356static bool shouldCheckPending(SchedBoundary &Zone,
357 const TargetSchedModel *SchedModel) {
358 bool HasBufferedModel =
359 SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize();
360 unsigned Combined = Zone.Available.size() + Zone.Pending.size();
361 return Combined <= PendingQueueLimit && HasBufferedModel;
362}
363
364static SUnit *pickOnlyChoice(SchedBoundary &Zone,
365 const TargetSchedModel *SchedModel) {
366 // pickOnlyChoice() releases pending instructions and checks for new hazards.
367 SUnit *OnlyChoice = Zone.pickOnlyChoice();
368 if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty())
369 return OnlyChoice;
370
371 return nullptr;
372}
373
374void GCNSchedStrategy::printCandidateDecision(const SchedCandidate &Current,
375 const SchedCandidate &Preferred) {
376 LLVM_DEBUG({
377 dbgs() << "Prefer:\t\t";
378 DAG->dumpNode(*Preferred.SU);
379
380 if (Current.SU) {
381 dbgs() << "Not:\t";
382 DAG->dumpNode(*Current.SU);
383 }
384
385 dbgs() << "Reason:\t\t";
386 traceCandidate(Preferred);
387 });
388}
389
390// This function is mostly cut and pasted from
391// GenericScheduler::pickNodeFromQueue()
392void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
393 const CandPolicy &ZonePolicy,
394 const RegPressureTracker &RPTracker,
395 SchedCandidate &Cand, bool &IsPending,
396 bool IsBottomUp) {
397 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
398 ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
399 unsigned SGPRPressure = 0;
400 unsigned VGPRPressure = 0;
401 IsPending = false;
402 if (DAG->isTrackingPressure()) {
403 if (!GCNTrackers) {
404 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
405 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
406 } else {
407 GCNRPTracker *T = IsBottomUp
408 ? static_cast<GCNRPTracker *>(&UpwardTracker)
409 : static_cast<GCNRPTracker *>(&DownwardTracker);
410 SGPRPressure = T->getPressure().getSGPRNum();
411 VGPRPressure = T->getPressure().getArchVGPRNum();
412 }
413 }
414 LLVM_DEBUG(dbgs() << "Available Q:\n");
415 ReadyQueue &AQ = Zone.Available;
416 for (SUnit *SU : AQ) {
417
418 SchedCandidate TryCand(ZonePolicy);
419 initCandidate(Cand&: TryCand, SU, AtTop: Zone.isTop(), RPTracker, SRI, SGPRPressure,
420 VGPRPressure, IsBottomUp);
421 // Pass SchedBoundary only when comparing nodes from the same boundary.
422 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
423 tryCandidate(Cand, TryCand, Zone: ZoneArg);
424 if (TryCand.Reason != NoCand) {
425 // Initialize resource delta if needed in case future heuristics query it.
426 if (TryCand.ResDelta == SchedResourceDelta())
427 TryCand.initResourceDelta(DAG: Zone.DAG, SchedModel);
428 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
429 Cand.setBest(TryCand);
430 } else {
431 printCandidateDecision(Current: TryCand, Preferred: Cand);
432 }
433 }
434
435 if (!shouldCheckPending(Zone, SchedModel))
436 return;
437
438 LLVM_DEBUG(dbgs() << "Pending Q:\n");
439 ReadyQueue &PQ = Zone.Pending;
440 for (SUnit *SU : PQ) {
441
442 SchedCandidate TryCand(ZonePolicy);
443 initCandidate(Cand&: TryCand, SU, AtTop: Zone.isTop(), RPTracker, SRI, SGPRPressure,
444 VGPRPressure, IsBottomUp);
445 // Pass SchedBoundary only when comparing nodes from the same boundary.
446 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
447 tryPendingCandidate(Cand, TryCand, Zone: ZoneArg);
448 if (TryCand.Reason != NoCand) {
449 // Initialize resource delta if needed in case future heuristics query it.
450 if (TryCand.ResDelta == SchedResourceDelta())
451 TryCand.initResourceDelta(DAG: Zone.DAG, SchedModel);
452 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
453 IsPending = true;
454 Cand.setBest(TryCand);
455 } else {
456 printCandidateDecision(Current: TryCand, Preferred: Cand);
457 }
458 }
459}
460
461// This function is mostly cut and pasted from
462// GenericScheduler::pickNodeBidirectional()
463SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode,
464 bool &PickedPending) {
465 // Schedule as far as possible in the direction of no choice. This is most
466 // efficient, but also provides the best heuristics for CriticalPSets.
467 if (SUnit *SU = pickOnlyChoice(Zone&: Bot, SchedModel)) {
468 IsTopNode = false;
469 return SU;
470 }
471 if (SUnit *SU = pickOnlyChoice(Zone&: Top, SchedModel)) {
472 IsTopNode = true;
473 return SU;
474 }
475 // Set the bottom-up policy based on the state of the current bottom zone
476 // and the instructions outside the zone, including the top zone.
477 CandPolicy BotPolicy;
478 setPolicy(Policy&: BotPolicy, /*IsPostRA=*/false, CurrZone&: Bot, OtherZone: &Top);
479 // Set the top-down policy based on the state of the current top zone and
480 // the instructions outside the zone, including the bottom zone.
481 CandPolicy TopPolicy;
482 setPolicy(Policy&: TopPolicy, /*IsPostRA=*/false, CurrZone&: Top, OtherZone: &Bot);
483
484 bool BotPending = false;
485 // See if BotCand is still valid (because we previously scheduled from Top).
486 LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
487 if (!BotCand.isValid() || BotCand.SU->isScheduled ||
488 BotCand.Policy != BotPolicy) {
489 BotCand.reset(NewPolicy: CandPolicy());
490 pickNodeFromQueue(Zone&: Bot, ZonePolicy: BotPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand,
491 IsPending&: BotPending,
492 /*IsBottomUp=*/true);
493 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
494 } else {
495 LLVM_DEBUG(traceCandidate(BotCand));
496#ifndef NDEBUG
497 if (VerifyScheduling) {
498 SchedCandidate TCand;
499 TCand.reset(CandPolicy());
500 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
501 BotPending,
502 /*IsBottomUp=*/true);
503 assert(TCand.SU == BotCand.SU &&
504 "Last pick result should correspond to re-picking right now");
505 }
506#endif
507 }
508
509 bool TopPending = false;
510 // Check if the top Q has a better candidate.
511 LLVM_DEBUG(dbgs() << "Picking from Top:\n");
512 if (!TopCand.isValid() || TopCand.SU->isScheduled ||
513 TopCand.Policy != TopPolicy) {
514 TopCand.reset(NewPolicy: CandPolicy());
515 pickNodeFromQueue(Zone&: Top, ZonePolicy: TopPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand,
516 IsPending&: TopPending,
517 /*IsBottomUp=*/false);
518 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
519 } else {
520 LLVM_DEBUG(traceCandidate(TopCand));
521#ifndef NDEBUG
522 if (VerifyScheduling) {
523 SchedCandidate TCand;
524 TCand.reset(CandPolicy());
525 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
526 TopPending,
527 /*IsBottomUp=*/false);
528 assert(TCand.SU == TopCand.SU &&
529 "Last pick result should correspond to re-picking right now");
530 }
531#endif
532 }
533
534 // Pick best from BotCand and TopCand.
535 LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
536 dbgs() << "Bot Cand: "; traceCandidate(BotCand););
537 SchedCandidate Cand = BotPending ? TopCand : BotCand;
538 SchedCandidate TryCand = BotPending ? BotCand : TopCand;
539 PickedPending = BotPending && TopPending;
540
541 TryCand.Reason = NoCand;
542 if (BotPending || TopPending) {
    PickedPending |= tryPendingCandidate(Cand, TryCand, nullptr);
  } else {
    tryCandidate(Cand, TryCand, nullptr);
546 }
547
548 if (TryCand.Reason != NoCand) {
549 Cand.setBest(TryCand);
550 }
551
552 LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
553
554 IsTopNode = Cand.AtTop;
555 return Cand.SU;
556}
557
558// This function is mostly cut and pasted from
559// GenericScheduler::pickNode()
560SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
561 if (DAG->top() == DAG->bottom()) {
562 assert(Top.Available.empty() && Top.Pending.empty() &&
563 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
564 return nullptr;
565 }
566 bool PickedPending;
567 SUnit *SU;
568 do {
569 PickedPending = false;
570 if (RegionPolicy.OnlyTopDown) {
571 SU = pickOnlyChoice(Zone&: Top, SchedModel);
572 if (!SU) {
573 CandPolicy NoPolicy;
574 TopCand.reset(NewPolicy: NoPolicy);
575 pickNodeFromQueue(Zone&: Top, ZonePolicy: NoPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand,
576 IsPending&: PickedPending,
577 /*IsBottomUp=*/false);
578 assert(TopCand.Reason != NoCand && "failed to find a candidate");
579 SU = TopCand.SU;
580 }
581 IsTopNode = true;
582 } else if (RegionPolicy.OnlyBottomUp) {
583 SU = pickOnlyChoice(Zone&: Bot, SchedModel);
584 if (!SU) {
585 CandPolicy NoPolicy;
586 BotCand.reset(NewPolicy: NoPolicy);
587 pickNodeFromQueue(Zone&: Bot, ZonePolicy: NoPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand,
588 IsPending&: PickedPending,
589 /*IsBottomUp=*/true);
590 assert(BotCand.Reason != NoCand && "failed to find a candidate");
591 SU = BotCand.SU;
592 }
593 IsTopNode = false;
594 } else {
595 SU = pickNodeBidirectional(IsTopNode, PickedPending);
596 }
597 } while (SU->isScheduled);
598
599 if (PickedPending) {
600 unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
601 SchedBoundary &Zone = IsTopNode ? Top : Bot;
602 unsigned CurrentCycle = Zone.getCurrCycle();
603 if (ReadyCycle > CurrentCycle)
604 Zone.bumpCycle(NextCycle: ReadyCycle);
605
606 // FIXME: checkHazard() doesn't give information about which cycle the
607 // hazard will resolve so just keep bumping the cycle by 1. This could be
608 // made more efficient if checkHazard() returned more details.
609 while (Zone.checkHazard(SU))
610 Zone.bumpCycle(NextCycle: Zone.getCurrCycle() + 1);
611
612 Zone.releasePending();
613 }
614
615 if (SU->isTopReady())
616 Top.removeReady(SU);
617 if (SU->isBottomReady())
618 Bot.removeReady(SU);
619
620 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
621 << *SU->getInstr());
622 return SU;
623}
624
625void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
626 if (GCNTrackers) {
627 MachineInstr *MI = SU->getInstr();
628 IsTopNode ? (void)DownwardTracker.advance(MI, UseInternalIterator: false)
629 : UpwardTracker.recede(MI: *MI);
630 }
631
632 return GenericScheduler::schedNode(SU, IsTopNode);
633}
634
635GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
636 assert(CurrentStage && CurrentStage != SchedStages.end());
637 return *CurrentStage;
638}
639
640bool GCNSchedStrategy::advanceStage() {
641 assert(CurrentStage != SchedStages.end());
642 if (!CurrentStage)
643 CurrentStage = SchedStages.begin();
644 else
645 CurrentStage++;
646
647 return CurrentStage != SchedStages.end();
648}
649
650bool GCNSchedStrategy::hasNextStage() const {
651 assert(CurrentStage);
652 return std::next(x: CurrentStage) != SchedStages.end();
653}
654
655GCNSchedStageID GCNSchedStrategy::getNextStage() const {
656 assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
657 return *std::next(x: CurrentStage);
658}
659
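// Candidate comparison used for nodes taken from the Pending queue. Compared
// to GenericScheduler::tryCandidate, only the PhysReg bias, register-pressure
// and resource-balance heuristics are applied; the stall, latency, weak-edge,
// cluster and node-order heuristics are not used for pending candidates.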
660bool GCNSchedStrategy::tryPendingCandidate(SchedCandidate &Cand,
661 SchedCandidate &TryCand,
662 SchedBoundary *Zone) const {
663 // Initialize the candidate if needed.
664 if (!Cand.isValid()) {
665 TryCand.Reason = NodeOrder;
666 return true;
667 }
668
  // Bias PhysReg defs and copies towards their uses and defs, respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
672 return TryCand.Reason != NoCand;
673
674 // Avoid exceeding the target's limit.
675 if (DAG->isTrackingPressure() &&
676 tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand,
677 Reason: RegExcess, TRI, MF: DAG->MF))
678 return TryCand.Reason != NoCand;
679
680 // Avoid increasing the max critical pressure in the scheduled region.
681 if (DAG->isTrackingPressure() &&
682 tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax,
683 TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF))
684 return TryCand.Reason != NoCand;
685
686 bool SameBoundary = Zone != nullptr;
687 if (SameBoundary) {
688 TryCand.initResourceDelta(DAG, SchedModel);
689 if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources,
690 TryCand, Cand, Reason: ResourceReduce))
691 return TryCand.Reason != NoCand;
692 if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources,
693 CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand,
694 Reason: ResourceDemand))
695 return TryCand.Reason != NoCand;
696 }
697
698 return false;
699}
700
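// The max-occupancy strategy runs OccInitialSchedule first, optionally
// followed by RewriteMFMAForm (only when
// -amdgpu-disable-rewrite-mfma-form-sched-stage is explicitly set to false),
// then UnclusteredHighRPReschedule, ClusteredLowOccupancyReschedule and
// PreRARematerialize.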
701GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
702 const MachineSchedContext *C, bool IsLegacyScheduler)
703 : GCNSchedStrategy(C) {
704 SchedStages.push_back(Elt: GCNSchedStageID::OccInitialSchedule);
705 if (!DisableRewriteMFMAFormSchedStage)
706 SchedStages.push_back(Elt: GCNSchedStageID::RewriteMFMAForm);
707 SchedStages.push_back(Elt: GCNSchedStageID::UnclusteredHighRPReschedule);
708 SchedStages.push_back(Elt: GCNSchedStageID::ClusteredLowOccupancyReschedule);
709 SchedStages.push_back(Elt: GCNSchedStageID::PreRARematerialize);
710 GCNTrackers = GCNTrackers & !IsLegacyScheduler;
711}
712
713GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
714 : GCNSchedStrategy(C) {
715 SchedStages.push_back(Elt: GCNSchedStageID::ILPInitialSchedule);
716}
717
718bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
719 SchedCandidate &TryCand,
720 SchedBoundary *Zone) const {
721 // Initialize the candidate if needed.
722 if (!Cand.isValid()) {
723 TryCand.Reason = NodeOrder;
724 return true;
725 }
726
727 // Avoid spilling by exceeding the register limit.
728 if (DAG->isTrackingPressure() &&
729 tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand,
730 Reason: RegExcess, TRI, MF: DAG->MF))
731 return TryCand.Reason != NoCand;
732
  // Bias PhysReg defs and copies towards their uses and defs, respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
736 return TryCand.Reason != NoCand;
737
738 bool SameBoundary = Zone != nullptr;
739 if (SameBoundary) {
740 // Prioritize instructions that read unbuffered resources by stall cycles.
741 if (tryLess(TryVal: Zone->getLatencyStallCycles(SU: TryCand.SU),
742 CandVal: Zone->getLatencyStallCycles(SU: Cand.SU), TryCand, Cand, Reason: Stall))
743 return TryCand.Reason != NoCand;
744
745 // Avoid critical resource consumption and balance the schedule.
746 TryCand.initResourceDelta(DAG, SchedModel);
747 if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources,
748 TryCand, Cand, Reason: ResourceReduce))
749 return TryCand.Reason != NoCand;
750 if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources,
751 CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand,
752 Reason: ResourceDemand))
753 return TryCand.Reason != NoCand;
754
755 // Unconditionally try to reduce latency.
756 if (tryLatency(TryCand, Cand, Zone&: *Zone))
757 return TryCand.Reason != NoCand;
758
759 // Weak edges are for clustering and other constraints.
760 if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop),
761 CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak))
762 return TryCand.Reason != NoCand;
763 }
764
765 // Keep clustered nodes together to encourage downstream peephole
766 // optimizations which may reduce resource requirements.
767 //
768 // This is a best effort to set things up for a post-RA pass. Optimizations
769 // like generating loads of multiple registers should ideally be done within
770 // the scheduler pass by combining the loads during DAG postprocessing.
771 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
772 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
773 bool CandIsClusterSucc =
774 isTheSameCluster(A: CandZoneCluster, B: Cand.SU->ParentClusterIdx);
775 bool TryCandIsClusterSucc =
776 isTheSameCluster(A: TryCandZoneCluster, B: TryCand.SU->ParentClusterIdx);
777 if (tryGreater(TryVal: TryCandIsClusterSucc, CandVal: CandIsClusterSucc, TryCand, Cand,
778 Reason: Cluster))
779 return TryCand.Reason != NoCand;
780
781 // Avoid increasing the max critical pressure in the scheduled region.
782 if (DAG->isTrackingPressure() &&
783 tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax,
784 TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF))
785 return TryCand.Reason != NoCand;
786
787 // Avoid increasing the max pressure of the entire region.
788 if (DAG->isTrackingPressure() &&
789 tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand,
790 Cand, Reason: RegMax, TRI, MF: DAG->MF))
791 return TryCand.Reason != NoCand;
792
793 if (SameBoundary) {
794 // Fall through to original instruction order.
795 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
796 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
797 TryCand.Reason = NodeOrder;
798 return true;
799 }
800 }
801 return false;
802}
803
804GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy(
805 const MachineSchedContext *C)
806 : GCNSchedStrategy(C) {
807 SchedStages.push_back(Elt: GCNSchedStageID::MemoryClauseInitialSchedule);
808}
809
/// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory
/// instructions as much as possible. This is achieved by:
/// 1. Prioritizing clustered operations before the stall latency heuristic.
/// 2. Prioritizing long-latency loads before the stall latency heuristic.
///
815/// \param Cand provides the policy and current best candidate.
816/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
817/// \param Zone describes the scheduled zone that we are extending, or nullptr
818/// if Cand is from a different zone than TryCand.
819/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
820bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
821 SchedCandidate &TryCand,
822 SchedBoundary *Zone) const {
823 // Initialize the candidate if needed.
824 if (!Cand.isValid()) {
825 TryCand.Reason = NodeOrder;
826 return true;
827 }
828
  // Bias PhysReg defs and copies towards their uses and defs, respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
832 return TryCand.Reason != NoCand;
833
834 if (DAG->isTrackingPressure()) {
835 // Avoid exceeding the target's limit.
836 if (tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand,
837 Reason: RegExcess, TRI, MF: DAG->MF))
838 return TryCand.Reason != NoCand;
839
840 // Avoid increasing the max critical pressure in the scheduled region.
841 if (tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax,
842 TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF))
843 return TryCand.Reason != NoCand;
844 }
845
846 // MaxMemoryClause-specific: We prioritize clustered instructions as we would
847 // get more benefit from clausing these memory instructions.
848 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
849 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
850 bool CandIsClusterSucc =
851 isTheSameCluster(A: CandZoneCluster, B: Cand.SU->ParentClusterIdx);
852 bool TryCandIsClusterSucc =
853 isTheSameCluster(A: TryCandZoneCluster, B: TryCand.SU->ParentClusterIdx);
854 if (tryGreater(TryVal: TryCandIsClusterSucc, CandVal: CandIsClusterSucc, TryCand, Cand,
855 Reason: Cluster))
856 return TryCand.Reason != NoCand;
857
858 // We only compare a subset of features when comparing nodes between
859 // Top and Bottom boundary. Some properties are simply incomparable, in many
860 // other instances we should only override the other boundary if something
861 // is a clear good pick on one boundary. Skip heuristics that are more
862 // "tie-breaking" in nature.
863 bool SameBoundary = Zone != nullptr;
864 if (SameBoundary) {
    // For loops that are acyclic path limited, aggressively schedule for
    // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
    // heuristics to take precedence.
868 if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
869 tryLatency(TryCand, Cand, Zone&: *Zone))
870 return TryCand.Reason != NoCand;
871
    // MaxMemoryClause-specific: Prioritize long-latency memory load
    // instructions in top-bottom order to hide more latency. The mayLoad check
    // is used to exclude store-like instructions, which we do not want to
    // schedule too early.
876 bool TryMayLoad =
877 TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
878 bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
879
880 if (TryMayLoad || CandMayLoad) {
881 bool TryLongLatency =
882 TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
883 bool CandLongLatency =
884 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
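      // For example (illustrative latencies), when scheduling top-down a load
      // candidate with Latency 40 wins over an ALU candidate with Latency 3,
      // since 40 > 10 * 3, and the pick is reported with reason Stall.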
885
886 if (tryGreater(TryVal: Zone->isTop() ? TryLongLatency : CandLongLatency,
887 CandVal: Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
888 Cand, Reason: Stall))
889 return TryCand.Reason != NoCand;
890 }
891 // Prioritize instructions that read unbuffered resources by stall cycles.
892 if (tryLess(TryVal: Zone->getLatencyStallCycles(SU: TryCand.SU),
893 CandVal: Zone->getLatencyStallCycles(SU: Cand.SU), TryCand, Cand, Reason: Stall))
894 return TryCand.Reason != NoCand;
895 }
896
897 if (SameBoundary) {
898 // Weak edges are for clustering and other constraints.
899 if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop),
900 CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak))
901 return TryCand.Reason != NoCand;
902 }
903
904 // Avoid increasing the max pressure of the entire region.
905 if (DAG->isTrackingPressure() &&
906 tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand,
907 Cand, Reason: RegMax, TRI, MF: DAG->MF))
908 return TryCand.Reason != NoCand;
909
910 if (SameBoundary) {
911 // Avoid critical resource consumption and balance the schedule.
912 TryCand.initResourceDelta(DAG, SchedModel);
913 if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources,
914 TryCand, Cand, Reason: ResourceReduce))
915 return TryCand.Reason != NoCand;
916 if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources,
917 CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand,
918 Reason: ResourceDemand))
919 return TryCand.Reason != NoCand;
920
921 // Avoid serializing long latency dependence chains.
922 // For acyclic path limited loops, latency was already checked above.
923 if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
924 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone&: *Zone))
925 return TryCand.Reason != NoCand;
926
927 // Fall through to original instruction order.
928 if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
929 assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
930 TryCand.Reason = NodeOrder;
931 return true;
932 }
933 }
934
935 return false;
936}
937
938GCNScheduleDAGMILive::GCNScheduleDAGMILive(
939 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
940 : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
941 MFI(*MF.getInfo<SIMachineFunctionInfo>()),
942 StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
943 RegionLiveOuts(this, /*IsLiveOut=*/true) {
944
945 // We want regions with a single MI to be scheduled so that we can reason
946 // about them correctly during scheduling stages that move MIs between regions
947 // (e.g., rematerialization).
948 ScheduleSingleMIRegions = true;
949 LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
950 if (RelaxedOcc) {
951 MinOccupancy = std::min(a: MFI.getMinAllowedOccupancy(), b: StartingOccupancy);
952 if (MinOccupancy != StartingOccupancy)
953 LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
954 << ".\n");
955 }
956}
957
958std::unique_ptr<GCNSchedStage>
959GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
960 switch (SchedStageID) {
961 case GCNSchedStageID::OccInitialSchedule:
962 return std::make_unique<OccInitialScheduleStage>(args&: SchedStageID, args&: *this);
963 case GCNSchedStageID::RewriteMFMAForm:
964 return std::make_unique<RewriteMFMAFormStage>(args&: SchedStageID, args&: *this);
965 case GCNSchedStageID::UnclusteredHighRPReschedule:
966 return std::make_unique<UnclusteredHighRPStage>(args&: SchedStageID, args&: *this);
967 case GCNSchedStageID::ClusteredLowOccupancyReschedule:
968 return std::make_unique<ClusteredLowOccStage>(args&: SchedStageID, args&: *this);
969 case GCNSchedStageID::PreRARematerialize:
970 return std::make_unique<PreRARematStage>(args&: SchedStageID, args&: *this);
971 case GCNSchedStageID::ILPInitialSchedule:
972 return std::make_unique<ILPInitialScheduleStage>(args&: SchedStageID, args&: *this);
973 case GCNSchedStageID::MemoryClauseInitialSchedule:
974 return std::make_unique<MemoryClauseInitialScheduleStage>(args&: SchedStageID,
975 args&: *this);
976 }
977
978 llvm_unreachable("Unknown SchedStageID.");
979}
980
981void GCNScheduleDAGMILive::schedule() {
982 // Collect all scheduling regions. The actual scheduling is performed in
983 // GCNScheduleDAGMILive::finalizeSchedule.
984 Regions.push_back(Elt: std::pair(RegionBegin, RegionEnd));
985}
986
987GCNRegPressure
988GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
989 if (Regions[RegionIdx].first == Regions[RegionIdx].second)
990 return llvm::getRegPressure(MRI, LiveRegs: LiveIns[RegionIdx]);
991 GCNDownwardRPTracker RPTracker(*LIS);
992 RPTracker.advance(Begin: Regions[RegionIdx].first, End: Regions[RegionIdx].second,
993 LiveRegsCopy: &LiveIns[RegionIdx]);
994 return RPTracker.moveMaxPressure();
995}
996
997static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin,
998 MachineBasicBlock::iterator RegionEnd) {
999 assert(RegionBegin != RegionEnd && "Region must not be empty");
1000 return &*skipDebugInstructionsBackward(It: std::prev(x: RegionEnd), Begin: RegionBegin);
1001}
1002
1003void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
1004 const MachineBasicBlock *MBB) {
1005 GCNDownwardRPTracker RPTracker(*LIS);
1006
  // If the block has only one successor then the live-ins of that successor
  // are the live-outs of the current block. We can reuse the calculated live
  // set if the successor will be scheduled after the current block.

  // However, due to a bug in LiveInterval analysis it may happen that two
  // predecessors of the same successor block have different lane bitmasks for
  // a live-out register. Work around that by sticking to a one-to-one
  // relationship, i.e. one predecessor with one successor block.
1015 const MachineBasicBlock *OnlySucc = nullptr;
1016 if (MBB->succ_size() == 1) {
1017 auto *Candidate = *MBB->succ_begin();
1018 if (!Candidate->empty() && Candidate->pred_size() == 1) {
1019 SlotIndexes *Ind = LIS->getSlotIndexes();
1020 if (Ind->getMBBStartIdx(mbb: MBB) < Ind->getMBBStartIdx(mbb: Candidate))
1021 OnlySucc = Candidate;
1022 }
1023 }
1024
1025 // Scheduler sends regions from the end of the block upwards.
1026 size_t CurRegion = RegionIdx;
1027 for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
1028 if (Regions[CurRegion].first->getParent() != MBB)
1029 break;
1030 --CurRegion;
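  // Regions are recorded bottom-up within each block, so CurRegion now indexes
  // the topmost region inside MBB; the downward tracker below visits them in
  // decreasing index order until RegionIdx is reached.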
1031
1032 auto I = MBB->begin();
1033 auto LiveInIt = MBBLiveIns.find(Val: MBB);
1034 auto &Rgn = Regions[CurRegion];
1035 auto *NonDbgMI = &*skipDebugInstructionsForward(It: Rgn.first, End: Rgn.second);
1036 if (LiveInIt != MBBLiveIns.end()) {
1037 auto LiveIn = std::move(LiveInIt->second);
1038 RPTracker.reset(MI: *MBB->begin(), LiveRegs: &LiveIn);
1039 MBBLiveIns.erase(I: LiveInIt);
1040 } else {
1041 I = Rgn.first;
1042 auto LRS = BBLiveInMap.lookup(Val: NonDbgMI);
1043#ifdef EXPENSIVE_CHECKS
1044 assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
1045#endif
1046 RPTracker.reset(MI: *I, LiveRegs: &LRS);
1047 }
1048
1049 for (;;) {
1050 I = RPTracker.getNext();
1051
1052 if (Regions[CurRegion].first == I || NonDbgMI == I) {
1053 LiveIns[CurRegion] = RPTracker.getLiveRegs();
1054 RPTracker.clearMaxPressure();
1055 }
1056
1057 if (Regions[CurRegion].second == I) {
1058 Pressure[CurRegion] = RPTracker.moveMaxPressure();
1059 if (CurRegion-- == RegionIdx)
1060 break;
1061 auto &Rgn = Regions[CurRegion];
1062 NonDbgMI = &*skipDebugInstructionsForward(It: Rgn.first, End: Rgn.second);
1063 }
1064 RPTracker.advanceToNext();
1065 RPTracker.advanceBeforeNext();
1066 }
1067
1068 if (OnlySucc) {
1069 if (I != MBB->end()) {
1070 RPTracker.advanceToNext();
1071 RPTracker.advance(End: MBB->end());
1072 }
1073 RPTracker.advanceBeforeNext();
1074 MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
1075 }
1076}
1077
1078DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
1079GCNScheduleDAGMILive::getRegionLiveInMap() const {
1080 assert(!Regions.empty());
1081 std::vector<MachineInstr *> RegionFirstMIs;
1082 RegionFirstMIs.reserve(n: Regions.size());
1083 for (auto &[RegionBegin, RegionEnd] : reverse(C: Regions))
1084 RegionFirstMIs.push_back(
1085 x: &*skipDebugInstructionsForward(It: RegionBegin, End: RegionEnd));
1086
1087 return getLiveRegMap(R&: RegionFirstMIs, /*After=*/false, LIS&: *LIS);
1088}
1089
1090DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
1091GCNScheduleDAGMILive::getRegionLiveOutMap() const {
1092 assert(!Regions.empty());
1093 std::vector<MachineInstr *> RegionLastMIs;
1094 RegionLastMIs.reserve(n: Regions.size());
1095 for (auto &[RegionBegin, RegionEnd] : reverse(C: Regions)) {
1096 // Skip empty regions.
1097 if (RegionBegin == RegionEnd)
1098 continue;
1099 RegionLastMIs.push_back(x: getLastMIForRegion(RegionBegin, RegionEnd));
1100 }
1101 return getLiveRegMap(R&: RegionLastMIs, /*After=*/true, LIS&: *LIS);
1102}
1103
1104void RegionPressureMap::buildLiveRegMap() {
1105 IdxToInstruction.clear();
1106
1107 RegionLiveRegMap =
1108 IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
1109 for (unsigned I = 0; I < DAG->Regions.size(); I++) {
1110 auto &[RegionBegin, RegionEnd] = DAG->Regions[I];
1111 // Skip empty regions.
1112 if (RegionBegin == RegionEnd)
1113 continue;
1114 MachineInstr *RegionKey =
1115 IsLiveOut ? getLastMIForRegion(RegionBegin, RegionEnd) : &*RegionBegin;
1116 IdxToInstruction[I] = RegionKey;
1117 }
1118}
1119
1120void GCNScheduleDAGMILive::finalizeSchedule() {
1121 // Start actual scheduling here. This function is called by the base
1122 // MachineScheduler after all regions have been recorded by
1123 // GCNScheduleDAGMILive::schedule().
1124 LiveIns.resize(N: Regions.size());
1125 Pressure.resize(N: Regions.size());
1126 RegionsWithHighRP.resize(N: Regions.size());
1127 RegionsWithExcessRP.resize(N: Regions.size());
1128 RegionsWithIGLPInstrs.resize(N: Regions.size());
1129 RegionsWithHighRP.reset();
1130 RegionsWithExcessRP.reset();
1131 RegionsWithIGLPInstrs.reset();
1132
1133 runSchedStages();
1134}
1135
1136void GCNScheduleDAGMILive::runSchedStages() {
1137 LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
1138
1139 if (!Regions.empty()) {
1140 BBLiveInMap = getRegionLiveInMap();
1141 if (GCNTrackers)
1142 RegionLiveOuts.buildLiveRegMap();
1143 }
1144
1145#ifdef DUMP_MAX_REG_PRESSURE
1146 if (PrintMaxRPRegUsageBeforeScheduler) {
1147 dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
1148 dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
1149 LIS->dump();
1150 }
1151#endif
1152
1153 GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
1154 while (S.advanceStage()) {
1155 auto Stage = createSchedStage(SchedStageID: S.getCurrentStage());
1156 if (!Stage->initGCNSchedStage())
1157 continue;
1158
1159 for (auto Region : Regions) {
1160 RegionBegin = Region.first;
1161 RegionEnd = Region.second;
1162 // Setup for scheduling the region and check whether it should be skipped.
1163 if (!Stage->initGCNRegion()) {
1164 Stage->advanceRegion();
1165 exitRegion();
1166 continue;
1167 }
1168
1169 if (GCNTrackers) {
1170 GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
1171 GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
1172 GCNRPTracker::LiveRegSet *RegionLiveIns =
1173 &LiveIns[Stage->getRegionIdx()];
1174
1175 reinterpret_cast<GCNRPTracker *>(DownwardTracker)
1176 ->reset(MRI_: MRI, LiveRegs_: *RegionLiveIns);
1177 reinterpret_cast<GCNRPTracker *>(UpwardTracker)
1178 ->reset(MRI_: MRI, LiveRegs_: RegionLiveOuts.getLiveRegsForRegionIdx(
1179 RegionIdx: Stage->getRegionIdx()));
1180 }
1181
1182 ScheduleDAGMILive::schedule();
1183 Stage->finalizeGCNRegion();
1184 Stage->advanceRegion();
1185 exitRegion();
1186 }
1187
1188 Stage->finalizeGCNSchedStage();
1189 }
1190
1191#ifdef DUMP_MAX_REG_PRESSURE
1192 if (PrintMaxRPRegUsageAfterScheduler) {
1193 dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
1194 dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
1195 LIS->dump();
1196 }
1197#endif
1198}
1199
1200#ifndef NDEBUG
1201raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
1202 switch (StageID) {
1203 case GCNSchedStageID::OccInitialSchedule:
1204 OS << "Max Occupancy Initial Schedule";
1205 break;
1206 case GCNSchedStageID::RewriteMFMAForm:
1207 OS << "Instruction Rewriting Reschedule";
1208 break;
1209 case GCNSchedStageID::UnclusteredHighRPReschedule:
1210 OS << "Unclustered High Register Pressure Reschedule";
1211 break;
1212 case GCNSchedStageID::ClusteredLowOccupancyReschedule:
1213 OS << "Clustered Low Occupancy Reschedule";
1214 break;
1215 case GCNSchedStageID::PreRARematerialize:
1216 OS << "Pre-RA Rematerialize";
1217 break;
1218 case GCNSchedStageID::ILPInitialSchedule:
1219 OS << "Max ILP Initial Schedule";
1220 break;
1221 case GCNSchedStageID::MemoryClauseInitialSchedule:
1222 OS << "Max memory clause Initial Schedule";
1223 break;
1224 }
1225
1226 return OS;
1227}
1228#endif
1229
1230GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
1231 : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
1232 MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
1233
1234bool GCNSchedStage::initGCNSchedStage() {
1235 if (!DAG.LIS)
1236 return false;
1237
1238 LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
1239 return true;
1240}
1241
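/// Collects in \p DefIdxs the slot index of every definition of \p UseMO's
/// register that can reach \p UseMO. If the value live at the use is not a
/// PHI join, that single def is the only reaching def; otherwise predecessor
/// blocks are walked backwards until a non-PHI value is found on each path.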
1242void RewriteMFMAFormStage::findReachingDefs(
1243 MachineOperand &UseMO, LiveIntervals *LIS,
1244 SmallVectorImpl<SlotIndex> &DefIdxs) {
1245 MachineInstr *UseMI = UseMO.getParent();
1246 LiveInterval &UseLI = LIS->getInterval(Reg: UseMO.getReg());
1247 VNInfo *VNI = UseLI.getVNInfoAt(Idx: LIS->getInstructionIndex(Instr: *UseMI));
1248
1249 // If the def is not a PHI, then it must be the only reaching def.
1250 if (!VNI->isPHIDef()) {
1251 DefIdxs.push_back(Elt: VNI->def);
1252 return;
1253 }
1254
1255 SmallPtrSet<MachineBasicBlock *, 8> Visited = {UseMI->getParent()};
1256 SmallVector<MachineBasicBlock *, 8> Worklist;
1257
1258 // Mark the predecessor blocks for traversal
1259 for (MachineBasicBlock *PredMBB : UseMI->getParent()->predecessors()) {
1260 Worklist.push_back(Elt: PredMBB);
1261 Visited.insert(Ptr: PredMBB);
1262 }
1263
1264 while (!Worklist.empty()) {
1265 MachineBasicBlock *CurrMBB = Worklist.pop_back_val();
1266
1267 SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(mbb: CurrMBB);
1268 VNInfo *VNI = UseLI.getVNInfoAt(Idx: CurrMBBEnd.getPrevSlot());
1269
1270 MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(index: VNI->def);
1271
1272 // If there is a def in this block, then add it to the list. This is the
1273 // reaching def of this path.
1274 if (!VNI->isPHIDef()) {
1275 DefIdxs.push_back(Elt: VNI->def);
1276 continue;
1277 }
1278
1279 for (MachineBasicBlock *PredMBB : DefMBB->predecessors()) {
1280 if (Visited.insert(Ptr: PredMBB).second)
1281 Worklist.push_back(Elt: PredMBB);
1282 }
1283 }
1284}
1285
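/// Collects in \p ReachingUses every use operand of \p DefMI's destination
/// register whose set of reaching definitions (as computed by
/// findReachingDefs) contains \p DefMI itself.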
1286void RewriteMFMAFormStage::findReachingUses(
1287 MachineInstr *DefMI, LiveIntervals *LIS,
1288 SmallVectorImpl<MachineOperand *> &ReachingUses) {
1289 SlotIndex DefIdx = LIS->getInstructionIndex(Instr: *DefMI);
1290 for (MachineOperand &UseMO :
1291 DAG.MRI.use_nodbg_operands(Reg: DefMI->getOperand(i: 0).getReg())) {
1292 SmallVector<SlotIndex, 8> ReachingDefIndexes;
1293 findReachingDefs(UseMO, LIS, DefIdxs&: ReachingDefIndexes);
1294
1295 // If we find a use that contains this DefMI in its reachingDefs, then it is
1296 // a reaching use.
1297 if (any_of(Range&: ReachingDefIndexes, P: [DefIdx](SlotIndex RDIdx) {
1298 return SlotIndex::isSameInstr(A: RDIdx, B: DefIdx);
1299 }))
1300 ReachingUses.push_back(Elt: &UseMO);
1301 }
1302}
1303
1304bool RewriteMFMAFormStage::initGCNSchedStage() {
1305 // We only need to run this pass if the architecture supports AGPRs.
1306 // Additionally, we don't use AGPRs at occupancy levels above 1 so there
1307 // is no need for this pass in that case, either.
1308 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1309 if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1)
1310 return false;
1311
1312 RegionsWithExcessArchVGPR.resize(N: DAG.Regions.size());
1313 RegionsWithExcessArchVGPR.reset();
1314 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
1315 GCNRegPressure PressureBefore = DAG.Pressure[Region];
1316 if (PressureBefore.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
1317 RegionsWithExcessArchVGPR[Region] = true;
1318 }
1319
1320 if (RegionsWithExcessArchVGPR.none())
1321 return false;
1322
1323 TII = ST.getInstrInfo();
1324 SRI = ST.getRegisterInfo();
1325
1326 std::vector<std::pair<MachineInstr *, unsigned>> RewriteCands;
1327 DenseMap<MachineBasicBlock *, std::set<Register>> CopyForUse;
1328 SmallPtrSet<MachineInstr *, 8> CopyForDef;
1329
1330 if (!initHeuristics(RewriteCands, CopyForUse, CopyForDef))
1331 return false;
1332
1333 int64_t Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef);
1334
  // If we haven't found the beneficial conditions, prefer the VGPR form, which
  // may result in fewer cross-RC copies.
1337 if (Cost > 0)
1338 return false;
1339
1340 return rewrite(RewriteCands);
1341}
1342
1343bool UnclusteredHighRPStage::initGCNSchedStage() {
1344 if (DisableUnclusterHighRP)
1345 return false;
1346
1347 if (!GCNSchedStage::initGCNSchedStage())
1348 return false;
1349
1350 if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
1351 return false;
1352
1353 SavedMutations.swap(x&: DAG.Mutations);
1354 DAG.addMutation(
1355 Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PreRAReentry));
1356
1357 InitialOccupancy = DAG.MinOccupancy;
1358 // Aggressively try to reduce register pressure in the unclustered high RP
1359 // stage. Temporarily increase occupancy target in the region.
1360 TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy
1361 ? InitialOccupancy + 1
1362 : InitialOccupancy;
1363 IsAnyRegionScheduled = false;
1364 S.SGPRLimitBias = S.HighRPSGPRBias;
1365 S.VGPRLimitBias = S.HighRPVGPRBias;
1366
1367 LLVM_DEBUG(
1368 dbgs()
1369 << "Retrying function scheduling without clustering. "
1370 "Aggressively try to reduce register pressure to achieve occupancy "
1371 << TempTargetOccupancy << ".\n");
1372
1373 return true;
1374}
1375
1376bool ClusteredLowOccStage::initGCNSchedStage() {
1377 if (DisableClusteredLowOccupancy)
1378 return false;
1379
1380 if (!GCNSchedStage::initGCNSchedStage())
1381 return false;
1382
1383 // Don't bother trying to improve ILP in lower RP regions if occupancy has not
1384 // been dropped. All regions will have already been scheduled with the ideal
1385 // occupancy targets.
1386 if (DAG.StartingOccupancy <= DAG.MinOccupancy)
1387 return false;
1388
1389 LLVM_DEBUG(
1390 dbgs() << "Retrying function scheduling with lowest recorded occupancy "
1391 << DAG.MinOccupancy << ".\n");
1392 return true;
1393}
1394
/// Allows easy filtering of this stage's debug output.
1396#define REMAT_PREFIX "[PreRARemat] "
1397#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
1398
1399#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1400Printable PreRARematStage::ScoredRemat::print() const {
1401 return Printable([&](raw_ostream &OS) {
1402 OS << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')';
1403 });
1404}
1405#endif
1406
1407bool PreRARematStage::initGCNSchedStage() {
  // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
  // regions in between the defs and the region we sank the def to. Will need
  // to be fixed if there is another pass after this pass.
1411 assert(!S.hasNextStage());
1412
1413 if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() <= 1)
1414 return false;
1415
  // Maps all MIs (except lone terminators, which are not part of any region)
  // to their parent region. Non-lone terminators are considered part of the
  // region they delimit.
1419 DenseMap<MachineInstr *, unsigned> MIRegion(MF.getInstructionCount());
1420
1421 // Before performing any IR modification record the parent region of each MI
1422 // and the parent MBB of each region.
1423 const unsigned NumRegions = DAG.Regions.size();
1424 for (unsigned I = 0; I < NumRegions; ++I) {
1425 RegionBoundaries Region = DAG.Regions[I];
1426 for (auto MI = Region.first; MI != Region.second; ++MI)
1427 MIRegion.insert(KV: {&*MI, I});
1428 MachineBasicBlock *ParentMBB = Region.first->getParent();
1429 if (Region.second != ParentMBB->end())
1430 MIRegion.insert(KV: {&*Region.second, I});
1431 RegionBB.push_back(Elt: ParentMBB);
1432 }
1433
1434#ifndef NDEBUG
1435 auto PrintTargetRegions = [&]() -> void {
1436 if (TargetRegions.none()) {
1437 dbgs() << REMAT_PREFIX << "No target regions\n";
1438 return;
1439 }
1440 dbgs() << REMAT_PREFIX << "Target regions:\n";
1441 for (unsigned I : TargetRegions.set_bits())
1442 dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n';
1443 };
1444 auto PrintRematReg = [&](const RematReg &Remat) -> Printable {
1445 return Printable([&, Remat](raw_ostream &OS) {
1446 // Concatenate all region numbers in which the register is unused and
1447 // live-through.
1448 bool HasLiveThroughRegion = false;
1449 OS << '[' << Remat.DefRegion << " -";
1450 for (unsigned I = 0; I < NumRegions; ++I) {
1451 if (Remat.isUnusedLiveThrough(I)) {
1452 if (HasLiveThroughRegion) {
1453 OS << ',';
1454 } else {
1455 OS << "- ";
1456 HasLiveThroughRegion = true;
1457 }
1458 OS << I;
1459 }
1460 }
1461 if (HasLiveThroughRegion)
1462 OS << " -";
1463 OS << "-> " << Remat.UseRegion << "] ";
1464 Remat.DefMI->print(OS, /*IsStandalone=*/true, /*SkipOpers=*/false,
1465 /*SkipDebugLoc=*/false, /*AddNewLine=*/false);
1466 });
1467 };
1468#endif
1469
1470 // Set an objective for the stage based on current RP in each region.
1471 REMAT_DEBUG({
1472 dbgs() << "Analyzing ";
1473 MF.getFunction().printAsOperand(dbgs(), false);
1474 dbgs() << ": ";
1475 });
1476 if (!setObjective()) {
1477 LLVM_DEBUG(dbgs() << "no objective to achieve, occupancy is maximal at "
1478 << MFI.getMaxWavesPerEU() << '\n');
1479 return false;
1480 }
1481 LLVM_DEBUG({
1482 if (TargetOcc) {
1483 dbgs() << "increase occupancy from " << *TargetOcc - 1 << '\n';
1484 } else {
1485 dbgs() << "reduce spilling (minimum target occupancy is "
1486 << MFI.getMinWavesPerEU() << ")\n";
1487 }
1488 PrintTargetRegions();
1489 });
1490
1491 if (!collectRematRegs(MIRegion)) {
1492 REMAT_DEBUG(dbgs() << "No rematerializable registers\n");
1493 return false;
1494 }
1495 const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
1496 REMAT_DEBUG({
1497 dbgs() << "Rematerializable registers:\n";
1498 for (const RematReg &Remat : RematRegs)
1499 dbgs() << REMAT_PREFIX << " " << PrintRematReg(Remat) << '\n';
1500 dbgs() << REMAT_PREFIX << "Region frequencies\n";
1501 for (auto [I, Freq] : enumerate(FreqInfo.Regions)) {
1502 dbgs() << REMAT_PREFIX << " [" << I << "] ";
1503 if (Freq)
1504 dbgs() << Freq;
1505 else
1506 dbgs() << "unknown ";
1507 dbgs() << " | " << *DAG.Regions[I].first;
1508 }
1509 });
1510
1511 SmallVector<ScoredRemat> ScoredRemats;
1512 for (RematReg &Remat : RematRegs)
1513 ScoredRemats.emplace_back(Args: &Remat, Args: FreqInfo, Args&: DAG);
1514
1515// Rematerialize registers in successive rounds until all RP targets are
1516// satisfied or until we run out of rematerialization candidates.
1517#ifndef NDEBUG
1518 unsigned RoundNum = 0;
1519#endif
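// Regions whose register pressure must be recomputed after this round's
// rematerializations before RP targets are re-checked at the end of the loop.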
1520 BitVector RecomputeRP(NumRegions);
1521 do {
1522 assert(!ScoredRemats.empty() && "no more remat candidates");
1523
1524 // (Re-)Score and (re-)sort all remats in increasing score order.
1525 for (ScoredRemat &Remat : ScoredRemats)
1526 Remat.update(TargetRegions, RPTargets, Freq: FreqInfo, ReduceSpill: !TargetOcc);
1527 sort(C&: ScoredRemats);
1528
1529 REMAT_DEBUG({
1530 dbgs() << "==== ROUND " << RoundNum++ << " ====\n"
1531 << REMAT_PREFIX
1532 << "Candidates with non-null score, in rematerialization order:\n";
1533 for (const ScoredRemat &RematDecision : reverse(ScoredRemats)) {
1534 if (RematDecision.hasNullScore())
1535 break;
1536 dbgs() << REMAT_PREFIX << " " << RematDecision.print() << " | "
1537 << *RematDecision.Remat->DefMI;
1538 }
1539 PrintTargetRegions();
1540 });
1541
1542 RecomputeRP.reset();
1543 unsigned RematIdx = ScoredRemats.size();
1544
1545 // Rematerialize registers in decreasing score order until we estimate
1546 // that all RP targets are satisfied or until rematerialization candidates
1547 // are no longer useful to decrease RP.
1548 for (; RematIdx && TargetRegions.any(); --RematIdx) {
1549 const ScoredRemat &Candidate = ScoredRemats[RematIdx - 1];
1550 // Stop rematerializing on encountering a null score. Since scores
1551 // monotonically decrease as we rematerialize, we know there is nothing
1552 // useful left to do in such cases, even if we were to re-score.
1553 if (Candidate.hasNullScore()) {
1554 RematIdx = 0;
1555 break;
1556 }
1557
1558 RematReg &Remat = *Candidate.Remat;
1559 // When previous rematerializations in this round have already satisfied
1560 // RP targets in all regions this rematerialization can impact, we have a
1561 // good indication that our scores have diverged significantly from
1562 // reality, in which case we interrupt this round and re-score. This also
1563 // ensures that every rematerialization we perform is possibly impactful
1564 // in at least one target region.
1565 if (!Remat.maybeBeneficial(TargetRegions, RPTargets))
1566 break;
1567
1568 REMAT_DEBUG(dbgs() << "** REMAT " << PrintRematReg(Remat) << '\n';);
1569 // Every rematerialization we do here is likely to move the instruction
1570 // into a higher frequency region, increasing the total latency of the
1571 // instruction itself. This is acceptable if we are eliminating a spill in
1572 // the process, but when the goal is increasing occupancy we get nothing
1573 // out of rematerialization if occupancy is not increased in the end; in
1574 // such cases we want to roll back the rematerialization.
1575 RollbackInfo *Rollback =
1576 TargetOcc ? &Rollbacks.emplace_back(Args: &Remat) : nullptr;
1577 rematerialize(Remat, RecomputeRP, Rollback);
1578 unsetSatisifedRPTargets(Regions: Remat.Live);
1579 }
1580
1581 REMAT_DEBUG({
1582 if (!TargetRegions.any()) {
1583 dbgs() << "** Interrupt round on all targets achieved\n";
1584 } else if (RematIdx) {
1585 dbgs() << "** Interrupt round on stale score for "
1586 << *ScoredRemats[RematIdx - 1].Remat->DefMI;
1587 } else {
1588 dbgs() << "** Stop on exhausted rematerialization candidates\n";
1589 }
1590 });
1591
1592 // Peel off registers we already rematerialized from the vector's tail.
1593 ScoredRemats.truncate(N: RematIdx);
1594 } while ((updateAndVerifyRPTargets(Regions: RecomputeRP) || TargetRegions.any()) &&
1595 !ScoredRemats.empty());
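// If rematerialization did not mark any region for rescheduling, the stage
// has nothing left to do.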
1596 if (RescheduleRegions.none())
1597 return false;
1598
1599 // Commit all pressure changes to the DAG and compute minimum achieved
1600 // occupancy in impacted regions.
1601 REMAT_DEBUG(dbgs() << "==== REMAT RESULTS ====\n");
1602 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
1603 for (unsigned I : RescheduleRegions.set_bits()) {
1604 DAG.Pressure[I] = RPTargets[I].getCurrentRP();
1605 REMAT_DEBUG(dbgs() << '[' << I << "] Achieved occupancy "
1606 << DAG.Pressure[I].getOccupancy(ST, DynamicVGPRBlockSize)
1607 << " (" << RPTargets[I] << ")\n");
1608 }
1609 AchievedOcc = MFI.getMaxWavesPerEU();
1610 for (const GCNRegPressure &RP : DAG.Pressure) {
1611 AchievedOcc =
1612 std::min(a: AchievedOcc, b: RP.getOccupancy(ST, DynamicVGPRBlockSize));
1613 }
1614
1615 REMAT_DEBUG({
1616 dbgs() << "Retrying function scheduling with new min. occupancy of "
1617 << AchievedOcc << " from rematerializing (original was "
1618 << DAG.MinOccupancy;
1619 if (TargetOcc)
1620 dbgs() << ", target was " << *TargetOcc;
1621 dbgs() << ")\n";
1622 });
1623
1624 DAG.setTargetOccupancy(getStageTargetOccupancy());
1625 return true;
1626}
1627
1628void GCNSchedStage::finalizeGCNSchedStage() {
1629 DAG.finishBlock();
1630 LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
1631}
1632
1633void UnclusteredHighRPStage::finalizeGCNSchedStage() {
1634 SavedMutations.swap(x&: DAG.Mutations);
1635 S.SGPRLimitBias = S.VGPRLimitBias = 0;
1636 if (DAG.MinOccupancy > InitialOccupancy) {
1637 assert(IsAnyRegionScheduled);
1638 LLVM_DEBUG(dbgs() << StageID
1639 << " stage successfully increased occupancy to "
1640 << DAG.MinOccupancy << '\n');
1641 } else if (!IsAnyRegionScheduled) {
1642 assert(DAG.MinOccupancy == InitialOccupancy);
1643 LLVM_DEBUG(dbgs() << StageID
1644 << ": No regions scheduled, min occupancy stays at "
1645 << DAG.MinOccupancy << ", MFI occupancy stays at "
1646 << MFI.getOccupancy() << ".\n");
1647 }
1648
1649 GCNSchedStage::finalizeGCNSchedStage();
1650}
1651
1652bool GCNSchedStage::initGCNRegion() {
1653 // Skip empty scheduling region.
1654 if (DAG.begin() == DAG.end())
1655 return false;
1656
1657 // Check whether this new region is also a new block.
1658 if (DAG.RegionBegin->getParent() != CurrentMBB)
1659 setupNewBlock();
1660
1661 unsigned NumRegionInstrs = std::distance(first: DAG.begin(), last: DAG.end());
1662 DAG.enterRegion(bb: CurrentMBB, begin: DAG.begin(), end: DAG.end(), regioninstrs: NumRegionInstrs);
1663
1664 // Skip regions with 1 schedulable instruction.
1665 if (DAG.begin() == std::prev(x: DAG.end()))
1666 return false;
1667
1668 LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
1669 LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
1670 << " " << CurrentMBB->getName()
1671 << "\n From: " << *DAG.begin() << " To: ";
1672 if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
1673 else dbgs() << "End";
1674 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
1675
1676 // Save original instruction order before scheduling for possible revert.
1677 Unsched.clear();
1678 Unsched.reserve(n: DAG.NumRegionInstrs);
1679 if (StageID == GCNSchedStageID::OccInitialSchedule ||
1680 StageID == GCNSchedStageID::ILPInitialSchedule) {
1681 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
1682 for (auto &I : DAG) {
1683 Unsched.push_back(x: &I);
1684 if (SII->isIGLPMutationOnly(Opcode: I.getOpcode()))
1685 DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
1686 }
1687 } else {
1688 for (auto &I : DAG)
1689 Unsched.push_back(x: &I);
1690 }
1691
1692 PressureBefore = DAG.Pressure[RegionIdx];
1693
1694 LLVM_DEBUG(
1695 dbgs() << "Pressure before scheduling:\nRegion live-ins:"
1696 << print(DAG.LiveIns[RegionIdx], DAG.MRI)
1697 << "Region live-in pressure: "
1698 << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
1699 << "Region register pressure: " << print(PressureBefore));
1700
1701 S.HasHighPressure = false;
1702 S.KnownExcessRP = isRegionWithExcessRP();
1703
1704 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1705 StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
1706 SavedMutations.clear();
1707 SavedMutations.swap(x&: DAG.Mutations);
1708 bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
1709 StageID == GCNSchedStageID::ILPInitialSchedule;
1710 DAG.addMutation(Mutation: createIGroupLPDAGMutation(
1711 Phase: IsInitialStage ? AMDGPU::SchedulingPhase::Initial
1712 : AMDGPU::SchedulingPhase::PreRAReentry));
1713 }
1714
1715 return true;
1716}
1717
1718bool UnclusteredHighRPStage::initGCNRegion() {
1719 // Only reschedule regions that have excess register pressure (i.e. spilling)
1720 // or had minimum occupancy at the beginning of the stage (as long as
1721 // rescheduling of previous regions did not make occupancy drop back down to
1722 // the initial minimum).
1723 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1724 // If no region has been scheduled yet, the DAG has not yet been updated with
1725 // the occupancy target. So retrieve it from the temporary.
1726 unsigned CurrentTargetOccupancy =
1727 IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy;
1728 if (!DAG.RegionsWithExcessRP[RegionIdx] &&
1729 (CurrentTargetOccupancy <= InitialOccupancy ||
1730 DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
1731 InitialOccupancy))
1732 return false;
1733
1734 bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion();
1735 // If this is the first region scheduled during this stage, make the target
1736 // occupancy changes in the DAG and MFI.
1737 if (!IsAnyRegionScheduled && IsSchedulingThisRegion) {
1738 IsAnyRegionScheduled = true;
1739 if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
1740 DAG.setTargetOccupancy(TempTargetOccupancy);
1741 }
1742 return IsSchedulingThisRegion;
1743}
1744
1745bool ClusteredLowOccStage::initGCNRegion() {
1746 // We may need to reschedule this region if it wasn't rescheduled in the last
1747 // stage, or if we found it was testing critical register pressure limits in
1748 // the unclustered reschedule stage. The latter is because we may not have been
1749 // able to raise the min occupancy in the previous stage, so the region may be
1750 // overly constrained even if it was already rescheduled.
1751 if (!DAG.RegionsWithHighRP[RegionIdx])
1752 return false;
1753
1754 return GCNSchedStage::initGCNRegion();
1755}
1756
1757bool PreRARematStage::initGCNRegion() {
1758 return RescheduleRegions[RegionIdx] && GCNSchedStage::initGCNRegion();
1759}
1760
1761void GCNSchedStage::setupNewBlock() {
1762 if (CurrentMBB)
1763 DAG.finishBlock();
1764
1765 CurrentMBB = DAG.RegionBegin->getParent();
1766 DAG.startBlock(bb: CurrentMBB);
1767 // Get real RP for the region if it hasn't been calculated before. After the
1768 // initial schedule stage real RP will be collected after scheduling.
1769 if (StageID == GCNSchedStageID::OccInitialSchedule ||
1770 StageID == GCNSchedStageID::ILPInitialSchedule ||
1771 StageID == GCNSchedStageID::MemoryClauseInitialSchedule)
1772 DAG.computeBlockPressure(RegionIdx, MBB: CurrentMBB);
1773}
1774
1775void GCNSchedStage::finalizeGCNRegion() {
1776 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1777 if (S.HasHighPressure)
1778 DAG.RegionsWithHighRP[RegionIdx] = true;
1779
1780 // Revert scheduling if we have dropped occupancy or there is some other
1781 // reason that the original schedule is better.
1782 checkScheduling();
1783
1784 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1785 StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
1786 SavedMutations.swap(x&: DAG.Mutations);
1787}
1788
1789void PreRARematStage::finalizeGCNRegion() {
1790 GCNSchedStage::finalizeGCNRegion();
1791 // When the goal is to increase occupancy, all regions must reach the target
1792 // occupancy for rematerializations to be of any use; otherwise we will
1793 // just hurt latency for no benefit. If minimum occupancy drops below the
1794 // target there is no point in trying to re-schedule further regions.
1795 if (!TargetOcc)
1796 return;
1797 RegionReverts.emplace_back(Args&: RegionIdx, Args&: Unsched, Args&: PressureBefore);
1798 if (DAG.MinOccupancy < *TargetOcc) {
1799 REMAT_DEBUG(dbgs() << "Region " << RegionIdx
1800 << " cannot meet occupancy target, interrupting "
1801 "re-scheduling in all regions\n");
1802 RescheduleRegions.reset();
1803 }
1804}
1805
1806void GCNSchedStage::checkScheduling() {
1807 // Check the results of scheduling.
1808 PressureAfter = DAG.getRealRegPressure(RegionIdx);
1809
1810 LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
1811 LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
1812
1813 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1814
1815 if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
1816 PressureAfter.getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
1817 DAG.Pressure[RegionIdx] = PressureAfter;
1818
1819 // Early out if we have achieved the occupancy target.
1820 LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
1821 return;
1822 }
1823
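 // Compare the occupancy achievable before and after scheduling, both capped
 // by the stage's target occupancy and by workgroup-size constraints.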
1824 unsigned TargetOccupancy = std::min(
1825 a: S.getTargetOccupancy(), b: ST.getOccupancyWithWorkGroupSizes(MF).second);
1826 unsigned WavesAfter = std::min(
1827 a: TargetOccupancy, b: PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
1828 unsigned WavesBefore = std::min(
1829 a: TargetOccupancy, b: PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
1830 LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
1831 << ", after " << WavesAfter << ".\n");
1832
1833 // We may not be able to keep the current target occupancy because of the just
1834 // scheduled region. We might still be able to revert scheduling if the
1835 // occupancy before was higher, or if the current schedule has register
1836 // pressure higher than the excess limits which could lead to more spilling.
1837 unsigned NewOccupancy = std::max(a: WavesAfter, b: WavesBefore);
1838
1839 // Allow memory bound functions to drop to 4 waves if not limited by an
1840 // attribute.
1841 if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
1842 WavesAfter >= MFI.getMinAllowedOccupancy()) {
1843 LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
1844 << MFI.getMinAllowedOccupancy() << " waves\n");
1845 NewOccupancy = WavesAfter;
1846 }
1847
1848 if (NewOccupancy < DAG.MinOccupancy) {
1849 DAG.MinOccupancy = NewOccupancy;
1850 MFI.limitOccupancy(Limit: DAG.MinOccupancy);
1851 LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
1852 << DAG.MinOccupancy << ".\n");
1853 }
1854 // The maximum number of arch VGPRs on a non-unified register file, or the
1855 // maximum VGPR + AGPR in the unified register file case.
1856 unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
1857 // The maximum number of arch VGPRs for both unified and non-unified register
1858 // files.
1859 unsigned MaxArchVGPRs = std::min(a: MaxVGPRs, b: ST.getAddressableNumArchVGPRs());
1860 unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
1861
1862 if (PressureAfter.getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts()) > MaxVGPRs ||
1863 PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
1864 PressureAfter.getAGPRNum() > MaxArchVGPRs ||
1865 PressureAfter.getSGPRNum() > MaxSGPRs) {
1866 DAG.RegionsWithHighRP[RegionIdx] = true;
1867 DAG.RegionsWithExcessRP[RegionIdx] = true;
1868 }
1869
1870 // Revert if this region's schedule would cause a drop in occupancy or
1871 // spilling.
1872 if (shouldRevertScheduling(WavesAfter)) {
1873 modifyRegionSchedule(RegionIdx, MBB: DAG.BB, MIOrder: Unsched);
1874 std::tie(args&: DAG.RegionBegin, args&: DAG.RegionEnd) = DAG.Regions[RegionIdx];
1875 } else {
1876 DAG.Pressure[RegionIdx] = PressureAfter;
1877 }
1878}
1879
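// Returns the earliest cycle at which \p SU can issue given the ready cycles
// of its register-dependence predecessors, and records it in \p ReadyCycles.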
1880unsigned
1881GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1882 DenseMap<unsigned, unsigned> &ReadyCycles,
1883 const TargetSchedModel &SM) {
1884 unsigned ReadyCycle = CurrCycle;
1885 for (auto &D : SU.Preds) {
1886 if (D.isAssignedRegDep()) {
1887 MachineInstr *DefMI = D.getSUnit()->getInstr();
1888 unsigned Latency = SM.computeInstrLatency(MI: DefMI);
1889 unsigned DefReady = ReadyCycles[DAG.getSUnit(MI: DefMI)->NodeNum];
1890 ReadyCycle = std::max(a: ReadyCycle, b: DefReady + Latency);
1891 }
1892 }
1893 ReadyCycles[SU.NodeNum] = ReadyCycle;
1894 return ReadyCycle;
1895}
1896
1897#ifndef NDEBUG
1898struct EarlierIssuingCycle {
1899 bool operator()(std::pair<MachineInstr *, unsigned> A,
1900 std::pair<MachineInstr *, unsigned> B) const {
1901 return A.second < B.second;
1902 }
1903};
1904
1905static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
1906 EarlierIssuingCycle> &ReadyCycles) {
1907 if (ReadyCycles.empty())
1908 return;
1909 unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
1910 dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
1911 << " ##################\n# Cycle #\t\t\tInstruction "
1912 " "
1913 " \n";
1914 unsigned IPrev = 1;
1915 for (auto &I : ReadyCycles) {
1916 if (I.second > IPrev + 1)
1917 dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
1918 << " CYCLES DETECTED ******************************\n\n";
1919 dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
1920 IPrev = I.second;
1921 }
1922}
1923#endif
1924
1925ScheduleMetrics
1926GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
1927#ifndef NDEBUG
1928 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1929 ReadyCyclesSorted;
1930#endif
1931 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1932 unsigned SumBubbles = 0;
1933 DenseMap<unsigned, unsigned> ReadyCycles;
1934 unsigned CurrCycle = 0;
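 // Walk the schedule in order, accumulating the stall ("bubble") cycles
 // between the current cycle and each SUnit's earliest ready cycle.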
1935 for (auto &SU : InputSchedule) {
1936 unsigned ReadyCycle =
1937 computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
1938 SumBubbles += ReadyCycle - CurrCycle;
1939#ifndef NDEBUG
1940 ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
1941#endif
1942 CurrCycle = ++ReadyCycle;
1943 }
1944#ifndef NDEBUG
1945 LLVM_DEBUG(
1946 printScheduleModel(ReadyCyclesSorted);
1947 dbgs() << "\n\t"
1948 << "Metric: "
1949 << (SumBubbles
1950 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1951 : 1)
1952 << "\n\n");
1953#endif
1954
1955 return ScheduleMetrics(CurrCycle, SumBubbles);
1956}
1957
1958ScheduleMetrics
1959GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
1960#ifndef NDEBUG
1961 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1962 ReadyCyclesSorted;
1963#endif
1964 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1965 unsigned SumBubbles = 0;
1966 DenseMap<unsigned, unsigned> ReadyCycles;
1967 unsigned CurrCycle = 0;
1968 for (auto &MI : DAG) {
1969 SUnit *SU = DAG.getSUnit(MI: &MI);
1970 if (!SU)
1971 continue;
1972 unsigned ReadyCycle =
1973 computeSUnitReadyCycle(SU: *SU, CurrCycle, ReadyCycles, SM);
1974 SumBubbles += ReadyCycle - CurrCycle;
1975#ifndef NDEBUG
1976 ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
1977#endif
1978 CurrCycle = ++ReadyCycle;
1979 }
1980#ifndef NDEBUG
1981 LLVM_DEBUG(
1982 printScheduleModel(ReadyCyclesSorted);
1983 dbgs() << "\n\t"
1984 << "Metric: "
1985 << (SumBubbles
1986 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1987 : 1)
1988 << "\n\n");
1989#endif
1990
1991 return ScheduleMetrics(CurrCycle, SumBubbles);
1992}
1993
1994bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
1995 if (WavesAfter < DAG.MinOccupancy)
1996 return true;
1997
1998 // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
1999 if (DAG.MFI.isDynamicVGPREnabled()) {
2000 unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
2001 STI: &ST, NumVGPRs: DAG.MFI.getDynamicVGPRBlockSize(),
2002 DynamicVGPRBlockSize: PressureBefore.getVGPRNum(UnifiedVGPRFile: false));
2003 unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
2004 STI: &ST, NumVGPRs: DAG.MFI.getDynamicVGPRBlockSize(),
2005 DynamicVGPRBlockSize: PressureAfter.getVGPRNum(UnifiedVGPRFile: false));
2006 if (BlocksAfter > BlocksBefore)
2007 return true;
2008 }
2009
2010 return false;
2011}
2012
2013bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
2014 if (PressureAfter == PressureBefore)
2015 return false;
2016
2017 if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
2018 return true;
2019
2020 if (mayCauseSpilling(WavesAfter))
2021 return true;
2022
2023 return false;
2024}
2025
2026bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
2027 // If RP is not reduced in the unclustered reschedule stage, revert to the
2028 // old schedule.
2029 if ((WavesAfter <=
2030 PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()) &&
2031 mayCauseSpilling(WavesAfter)) ||
2032 GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
2033 LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
2034 return true;
2035 }
2036
2037 // Do not attempt to relax the schedule even more if we are already spilling.
2038 if (isRegionWithExcessRP())
2039 return false;
2040
2041 LLVM_DEBUG(
2042 dbgs()
2043 << "\n\t *** In shouldRevertScheduling ***\n"
2044 << " *********** BEFORE UnclusteredHighRPStage ***********\n");
2045 ScheduleMetrics MBefore = getScheduleMetrics(InputSchedule: DAG.SUnits);
2046 LLVM_DEBUG(
2047 dbgs()
2048 << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
2049 ScheduleMetrics MAfter = getScheduleMetrics(DAG);
2050 unsigned OldMetric = MBefore.getMetric();
2051 unsigned NewMetric = MAfter.getMetric();
2052 unsigned WavesBefore = std::min(
2053 a: S.getTargetOccupancy(),
2054 b: PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()));
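 // Combine the occupancy ratio (WavesAfter / WavesBefore) with the biased
 // latency-metric ratio ((OldMetric + Bias) / NewMetric) in fixed-point
 // arithmetic; the new schedule is kept only when the combined ratio is at
 // least 1.0, i.e. Profit >= ScaleFactor.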
2055 unsigned Profit =
2056 ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
2057 ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
2058 NewMetric) /
2059 ScheduleMetrics::ScaleFactor;
2060 LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
2061 << MAfter << "Profit: " << Profit << "\n");
2062 return Profit < ScheduleMetrics::ScaleFactor;
2063}
2064
2065bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
2066 if (PressureAfter == PressureBefore)
2067 return false;
2068
2069 if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
2070 return true;
2071
2072 if (mayCauseSpilling(WavesAfter))
2073 return true;
2074
2075 return false;
2076}
2077
2078bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
2079 return mayCauseSpilling(WavesAfter);
2080}
2081
2082bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
2083 if (mayCauseSpilling(WavesAfter))
2084 return true;
2085
2086 return false;
2087}
2088
2089bool MemoryClauseInitialScheduleStage::shouldRevertScheduling(
2090 unsigned WavesAfter) {
2091 return mayCauseSpilling(WavesAfter);
2092}
2093
2094bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
2095 if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
2096 !PressureAfter.less(MF, O: PressureBefore)) {
2097 LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
2098 return true;
2099 }
2100
2101 return false;
2102}
2103
2104void GCNSchedStage::modifyRegionSchedule(unsigned RegionIdx,
2105 MachineBasicBlock *MBB,
2106 ArrayRef<MachineInstr *> MIOrder) {
2107 assert(static_cast<size_t>(std::distance(DAG.Regions[RegionIdx].first,
2108 DAG.Regions[RegionIdx].second)) ==
2109 MIOrder.size() &&
2110 "instruction number mismatch");
2111 if (MIOrder.empty())
2112 return;
2113
2114 LLVM_DEBUG(dbgs() << "Reverting scheduling for region " << RegionIdx << '\n');
2115
2116 // Reconstruct MI sequence by moving instructions in desired order before
2117 // the current region's start.
2118 MachineBasicBlock::iterator RegionEnd = DAG.Regions[RegionIdx].first;
2119 for (MachineInstr *MI : MIOrder) {
2120 // Either move the next MI in order before the end of the region or move the
2121 // region end past the MI if it is at the correct position.
2122 MachineBasicBlock::iterator MII = MI->getIterator();
2123 if (MII != RegionEnd) {
2124 // Will subsequent splice move MI up past a non-debug instruction?
2125 bool NonDebugReordered =
2126 !MI->isDebugInstr() &&
2127 skipDebugInstructionsForward(It: RegionEnd, End: MII) != MII;
2128 MBB->splice(Where: RegionEnd, Other: MBB, From: MI);
2129 // Only update LiveIntervals information if non-debug instructions are
2130 // reordered. Otherwise debug instructions could cause code generation to
2131 // change.
2132 if (NonDebugReordered)
2133 DAG.LIS->handleMove(MI&: *MI, UpdateFlags: true);
2134 } else {
2135 ++RegionEnd;
2136 }
2137 if (MI->isDebugInstr()) {
2138 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
2139 continue;
2140 }
2141
2142 // Reset read-undef flags and update them later.
2143 for (MachineOperand &Op : MI->all_defs())
2144 Op.setIsUndef(false);
2145 RegisterOperands RegOpers;
2146 RegOpers.collect(MI: *MI, TRI: *DAG.TRI, MRI: DAG.MRI, TrackLaneMasks: DAG.ShouldTrackLaneMasks, IgnoreDead: false);
2147 if (DAG.ShouldTrackLaneMasks) {
2148 // Adjust liveness and add missing dead+read-undef flags.
2149 SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(Instr: *MI).getRegSlot();
2150 RegOpers.adjustLaneLiveness(LIS: *DAG.LIS, MRI: DAG.MRI, Pos: SlotIdx, AddFlagsMI: MI);
2151 } else {
2152 // Adjust for missing dead-def flags.
2153 RegOpers.detectDeadDefs(MI: *MI, LIS: *DAG.LIS);
2154 }
2155 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
2156 }
2157
2158 // The region end doesn't change throughout scheduling since it itself is
2159 // outside the region (whether that is an MBB end or a terminator MI).
2160 assert(RegionEnd == DAG.Regions[RegionIdx].second && "region end mismatch");
2161 DAG.Regions[RegionIdx].first = MIOrder.front();
2162}
2163
2164bool RewriteMFMAFormStage::isRewriteCandidate(MachineInstr *MI) const {
2165
2166 if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(MI: *MI))
2167 return false;
2168 return AMDGPU::getMFMASrcCVDstAGPROp(Opcode: MI->getOpcode()) != -1;
2169}
2170
2171bool RewriteMFMAFormStage::initHeuristics(
2172 std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
2173 DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
2174 SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
2175 bool Changed = false;
2176
2177 // Prepare for the heuristics
2178 for (MachineBasicBlock &MBB : MF) {
2179 for (MachineInstr &MI : MBB) {
2180 if (!isRewriteCandidate(MI: &MI))
2181 continue;
2182
2183 int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(Opcode: MI.getOpcode());
2184 assert(ReplacementOp != -1);
2185
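 // Temporarily switch the MFMA to its AGPR-dst form; the original opcode is
 // saved so it can be restored once the rewrite cost has been estimated.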
2186 RewriteCands.push_back(x: {&MI, MI.getOpcode()});
2187 MI.setDesc(TII->get(Opcode: ReplacementOp));
2188
2189 MachineOperand *Src2 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
2190 if (Src2->isReg()) {
2191 SmallVector<SlotIndex, 8> Src2ReachingDefs;
2192 findReachingDefs(UseMO&: *Src2, LIS: DAG.LIS, DefIdxs&: Src2ReachingDefs);
2193
2194 // For any definition of the src2 register which is non-MFMA, we
2195 // insert a copy.
2196 for (SlotIndex RDIdx : Src2ReachingDefs) {
2197 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(index: RDIdx);
2198 if (!TII->isMAI(MI: *RD))
2199 CopyForDef.insert(Ptr: RD);
2200 }
2201 }
2202
2203 MachineOperand &Dst = MI.getOperand(i: 0);
2204 SmallVector<MachineOperand *, 8> DstReachingUses;
2205
2206 findReachingUses(DefMI: &MI, LIS: DAG.LIS, ReachingUses&: DstReachingUses);
2207
2208 for (MachineOperand *RUOp : DstReachingUses) {
2209 if (TII->isMAI(MI: *RUOp->getParent()))
2210 continue;
2211
2212 // For any user of the result of the MFMA which is not an MFMA, we
2213 // insert a copy. For a given register, we will only insert one copy
2214 // per user block.
2215 CopyForUse[RUOp->getParent()->getParent()].insert(x: RUOp->getReg());
2216
2217 SmallVector<SlotIndex, 8> DstUsesReachingDefs;
2218 findReachingDefs(UseMO&: *RUOp, LIS: DAG.LIS, DefIdxs&: DstUsesReachingDefs);
2219
2220 for (SlotIndex RDIndex : DstUsesReachingDefs) {
2221 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(index: RDIndex);
2222 if (TII->isMAI(MI: *RD))
2223 continue;
2224
2225 // For any definition of the user of the MFMA which is not an MFMA,
2226 // we insert a copy. We do this to transform all the reaching defs
2227 // of this use to AGPR. By doing this, we can insert a copy from
2228 // AGPR to VGPR at the user rather than after the MFMA.
2229 CopyForDef.insert(Ptr: RD);
2230 }
2231 }
2232
2233 // Do the rewrite to allow for updated RP calculation.
2234 const TargetRegisterClass *VGPRRC = DAG.MRI.getRegClass(Reg: Dst.getReg());
2235 const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(SRC: VGPRRC);
2236 DAG.MRI.setRegClass(Reg: Dst.getReg(), RC: AGPRRC);
2237 if (Src2->isReg())
2238 DAG.MRI.setRegClass(Reg: Src2->getReg(), RC: AGPRRC);
2239 Changed = true;
2240 }
2241 }
2242
2243 return Changed;
2244}
2245
2246int64_t RewriteMFMAFormStage::getRewriteCost(
2247 const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
2248 const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
2249 const SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
2250 MachineBlockFrequencyInfo *MBFI = DAG.MBFI;
2251
2252 int64_t BestSpillCost = 0;
2253 int64_t Cost = 0;
2254 uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();
2255
2256 std::pair<unsigned, unsigned> MaxVectorRegs =
2257 ST.getMaxNumVectorRegs(F: MF.getFunction());
2258 unsigned ArchVGPRThreshold = MaxVectorRegs.first;
2259 unsigned AGPRThreshold = MaxVectorRegs.second;
2260 unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);
2261
2262 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
2263 if (!RegionsWithExcessArchVGPR[Region])
2264 continue;
2265
2266 GCNRegPressure &PressureBefore = DAG.Pressure[Region];
2267 unsigned SpillCostBefore = PressureBefore.getVGPRSpills(
2268 MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
2269
2270 // For the cases we care about (i.e. ArchVGPR usage is greater than the
2271 // addressable limit), rewriting alone should bring pressure to manageable
2272 // level. If we find any such region, then the rewrite is potentially
2273 // beneficial.
2274 GCNRegPressure PressureAfter = DAG.getRealRegPressure(RegionIdx: Region);
2275 unsigned SpillCostAfter = PressureAfter.getVGPRSpills(
2276 MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
2277
2278 uint64_t BlockFreq =
2279 MBFI->getBlockFreq(MBB: DAG.Regions[Region].first->getParent())
2280 .getFrequency();
2281
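 // Express the region's frequency relative to the function entry as an
 // integer ratio, dividing the larger frequency by the smaller one;
 // RelativeFreqIsDenom records which way the ratio goes so the spill cost
 // below can be scaled in the right direction.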
2282 bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
2283 uint64_t RelativeFreq = EntryFreq && BlockFreq
2284 ? (RelativeFreqIsDenom ? EntryFreq / BlockFreq
2285 : BlockFreq / EntryFreq)
2286 : 1;
2287
2288 // This assumes perfect spilling / splitting -- using one spill / copy
2289 // instruction and one restoreFrom / copy for each excess register.
2290 int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2;
2291
2292 // Also account for the block frequency.
2293 if (RelativeFreqIsDenom)
2294 SpillCost /= (int64_t)RelativeFreq;
2295 else
2296 SpillCost *= (int64_t)RelativeFreq;
2297
2298 // If we have increased spilling in any block, just bail.
2299 if (SpillCost > 0)
2300 return SpillCost;
2301
2302 if (SpillCost < BestSpillCost)
2303 BestSpillCost = SpillCost;
2304 }
2305
2306 // Set the cost to the largest decrease in spill cost in order to not double
2307 // count spill reductions.
2308 Cost = BestSpillCost;
2309 assert(Cost <= 0);
2310
2311 unsigned CopyCost = 0;
2312
2313 // For each CopyForDef, increase the cost by the copy cost of the defined
2314 // register's class while accounting for block frequency.
2315 for (MachineInstr *DefMI : CopyForDef) {
2316 Register DefReg = DefMI->getOperand(i: 0).getReg();
2317 uint64_t DefFreq =
2318 EntryFreq
2319 ? MBFI->getBlockFreq(MBB: DefMI->getParent()).getFrequency() / EntryFreq
2320 : 1;
2321
2322 const TargetRegisterClass *RC = DAG.MRI.getRegClass(Reg: DefReg);
2323 CopyCost += RC->getCopyCost() * DefFreq;
2324 }
2325
2326 // Account for CopyForUse copies in each block where the register is used.
2327 for (auto &[UseBlock, UseRegs] : CopyForUse) {
2328 uint64_t UseFreq =
2329 EntryFreq ? MBFI->getBlockFreq(MBB: UseBlock).getFrequency() / EntryFreq : 1;
2330
2331 for (Register UseReg : UseRegs) {
2332 const TargetRegisterClass *RC = DAG.MRI.getRegClass(Reg: UseReg);
2333 CopyCost += RC->getCopyCost() * UseFreq;
2334 }
2335 }
2336
2337 // Reset the classes that were changed to AGPR for better RB analysis.
2338 // We must do rewriting after copy-insertion, as some defs of the register
2339 // may require VGPR. Additionally, if we bail out and don't perform the
2340 // rewrite then these need to be restored anyway.
2341 for (auto &[MI, OriginalOpcode] : RewriteCands) {
2342 assert(TII->isMAI(*MI));
2343 const TargetRegisterClass *AGPRRC =
2344 DAG.MRI.getRegClass(Reg: MI->getOperand(i: 0).getReg());
2345 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(SRC: AGPRRC);
2346
2347 MachineOperand *Src2 = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src2);
2348 assert(Src2);
2349
2350 if (Src2->isReg())
2351 DAG.MRI.setRegClass(Reg: Src2->getReg(), RC: VGPRRC);
2352 DAG.MRI.setRegClass(Reg: MI->getOperand(i: 0).getReg(), RC: VGPRRC);
2353 MI->setDesc(TII->get(Opcode: OriginalOpcode));
2354 }
2355
2356 return Cost + CopyCost;
2357}
2358
2359bool RewriteMFMAFormStage::rewrite(
2360 const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
2361 DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
2362 DenseMap<MachineInstr *, unsigned> LastMIToRegion;
2363
2364 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
2365 RegionBoundaries Entry = DAG.Regions[Region];
2366 if (Entry.first == Entry.second)
2367 continue;
2368
2369 FirstMIToRegion[&*Entry.first] = Region;
2370 if (Entry.second != Entry.first->getParent()->end())
2371 LastMIToRegion[&*Entry.second] = Region;
2372 }
2373
2374 // Rewrite the MFMAs to AGPR, and insert any copies as needed.
2375 // The general assumption of the algorithm (and the previous cost calculation)
2376 // is that it is better to insert the copies in the MBB of the def of the src2
2377 // operands, and in the MBB of the user of the dest operands. This is based on
2378 // the assumption that the MFMAs are likely to appear in loop bodies, while
2379 // the src2 and dest operands are live-in / live-out of the loop. Due to this
2380 // design, the algorithm for finding copy insertion points is more
2381 // complicated.
2382 //
2383 // There are three main cases to handle: 1. the reaching defs of the src2
2384 // operands, 2. the reaching uses of the dst operands, and 3. the reaching
2385 // defs of the reaching uses of the dst operand.
2386 //
2387 // In the first case, we simply insert copies after each of the reaching
2388 // definitions. In the second case, we collect all the uses of a given dest
2389 // and organize them by MBB. Then, we insert 1 copy for each MBB before the
2390 // earliest use. Since the use may have multiple reaching defs, and since we
2391 // want to replace the register it is using with the result of the copy, we
2392 // must handle case 3. In the third case, we simply insert a copy after each
2393 // of the reaching defs to connect to the copy of the reaching uses of the dst
2394 // reg. This allows us to avoid inserting copies next to the MFMAs.
2395 //
2396 // While inserting the copies, we maintain a map of operands which will use
2397 // different regs (i.e. the result of the copies). For example, a case 1 src2
2398 // operand will use the register result of the copies after the reaching defs,
2399 // as opposed to the original register. Now that we have completed our copy
2400 // analysis and placement, we can bulk update the registers. We do this
2401 // separately as to avoid complicating the reachingDef and reachingUse
2402 // queries.
2403 //
2404 // While inserting the copies, we also maintain a list of registers which we
2405 // will want to reclassify as AGPR. After doing the copy insertion and the
2406 // register replacement, we can finally do the reclassification. This uses the
2407 // redef map, as the registers we are interested in reclassifying may be
2408 // replaced by the result of a copy. We must do this after the copy analysis
2409 // and placement as we must have an accurate redef map -- otherwise we may end
2410 // up creating illegal instructions.
2411
2412 // The original registers of the MFMA that need to be reclassified as AGPR.
2413 DenseSet<Register> RewriteRegs;
2414 // The map of an original register in the MFMA to a new register (result of a
2415 // copy) that it should be replaced with.
2416 DenseMap<Register, Register> RedefMap;
2417 // The map of the original MFMA registers to the relevant MFMA operands.
2418 DenseMap<Register, DenseSet<MachineOperand *>> ReplaceMap;
2419 // The map of reaching defs for a given register -- to avoid duplicate copies.
2420 DenseMap<Register, SmallPtrSet<MachineInstr *, 8>> ReachingDefCopyMap;
2421 // The map of reaching uses for a given register by basic block -- to avoid
2422 // duplicate copies and to calculate per MBB insert pts.
2423 DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>
2424 ReachingUseTracker;
2425
2426 for (auto &[MI, OriginalOpcode] : RewriteCands) {
2427 int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(Opcode: MI->getOpcode());
2428 if (ReplacementOp == -1)
2429 continue;
2430 MI->setDesc(TII->get(Opcode: ReplacementOp));
2431
2432 // Case 1: insert copies for the reaching defs of the Src2Reg.
2433 MachineOperand *Src2 = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src2);
2434 if (Src2->isReg()) {
2435 Register Src2Reg = Src2->getReg();
2436 if (!Src2Reg.isVirtual())
2437 return false;
2438
2439 Register MappedReg = Src2->getReg();
2440 SmallVector<SlotIndex, 8> Src2ReachingDefs;
2441 findReachingDefs(UseMO&: *Src2, LIS: DAG.LIS, DefIdxs&: Src2ReachingDefs);
2442 SmallSetVector<MachineInstr *, 8> Src2DefsReplace;
2443
2444 for (SlotIndex RDIndex : Src2ReachingDefs) {
2445 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(index: RDIndex);
2446 if (TII->isMAI(MI: *RD))
2447 continue;
2448
2449 // If there is a non-MAI reaching def, then we need a copy.
2450 Src2DefsReplace.insert(X: RD);
2451 }
2452
2453 if (!Src2DefsReplace.empty()) {
2454 DenseMap<Register, Register>::iterator RI = RedefMap.find(Val: Src2Reg);
2455 if (RI != RedefMap.end()) {
2456 MappedReg = RI->second;
2457 } else {
2458 assert(!ReachingDefCopyMap.contains(Src2Reg));
2459 const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Reg: Src2Reg);
2460 const TargetRegisterClass *VGPRRC =
2461 SRI->getEquivalentVGPRClass(SRC: Src2RC);
2462
2463 // Track the mapping of the original register to the new register.
2464 MappedReg = DAG.MRI.createVirtualRegister(RegClass: VGPRRC);
2465 RedefMap[Src2Reg] = MappedReg;
2466 }
2467
2468 // If none exists, create a copy from this reaching def.
2469 // We may have inserted a copy already in an earlier iteration.
2470 for (MachineInstr *RD : Src2DefsReplace) {
2471 // Do not create redundant copies.
2472 if (ReachingDefCopyMap[Src2Reg].insert(Ptr: RD).second) {
2473 MachineInstrBuilder VGPRCopy =
2474 BuildMI(BB&: *RD->getParent(), I: std::next(x: RD->getIterator()),
2475 MIMD: RD->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY))
2476 .addDef(RegNo: MappedReg, Flags: {}, SubReg: 0)
2477 .addUse(RegNo: Src2Reg, Flags: {}, SubReg: 0);
2478 DAG.LIS->InsertMachineInstrInMaps(MI&: *VGPRCopy);
2479
2480 // If this reaching def was the last MI in the region, update the
2481 // region boundaries.
2482 if (LastMIToRegion.contains(Val: RD)) {
2483 unsigned UpdateRegion = LastMIToRegion[RD];
2484 DAG.Regions[UpdateRegion].second = VGPRCopy;
2485 LastMIToRegion.erase(Val: RD);
2486 }
2487 }
2488 }
2489 }
2490
2491 // Track the register for reclassification
2492 RewriteRegs.insert(V: Src2Reg);
2493
2494 // Always insert the operand for replacement. If this corresponds with a
2495 // chain of tied defs, we may not see the VGPR requirement until later.
2496 ReplaceMap[Src2Reg].insert(V: Src2);
2497 }
2498
2499 // Case 2 and Case 3: insert copies before the reaching uses of the dsts,
2500 // and after the reaching defs of the reaching uses of the dsts.
2501
2502 MachineOperand *Dst = &MI->getOperand(i: 0);
2503 Register DstReg = Dst->getReg();
2504 if (!DstReg.isVirtual())
2505 return false;
2506
2507 Register MappedReg = DstReg;
2508 SmallVector<MachineOperand *, 8> DstReachingUses;
2509
2510 SmallVector<MachineOperand *, 8> DstReachingUseCopies;
2511 SmallVector<MachineInstr *, 8> DstUseDefsReplace;
2512
2513 findReachingUses(DefMI: MI, LIS: DAG.LIS, ReachingUses&: DstReachingUses);
2514
2515 for (MachineOperand *RUOp : DstReachingUses) {
2516 if (TII->isMAI(MI: *RUOp->getParent()))
2517 continue;
2518
2519 // If there is a non-MAI reaching use, then we need a copy.
2520 if (find(Range&: DstReachingUseCopies, Val: RUOp) == DstReachingUseCopies.end())
2521 DstReachingUseCopies.push_back(Elt: RUOp);
2522 SmallVector<SlotIndex, 8> DstUsesReachingDefs;
2523 findReachingDefs(UseMO&: *RUOp, LIS: DAG.LIS, DefIdxs&: DstUsesReachingDefs);
2524
2525 for (SlotIndex RDIndex : DstUsesReachingDefs) {
2526 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(index: RDIndex);
2527 if (TII->isMAI(MI: *RD))
2528 continue;
2529
2530 // If there is a non-MAI reaching def of this reaching use, then we will
2531 // need a copy.
2532 if (find(Range&: DstUseDefsReplace, Val: RD) == DstUseDefsReplace.end())
2533 DstUseDefsReplace.push_back(Elt: RD);
2534 }
2535 }
2536
2537 if (!DstUseDefsReplace.empty()) {
2538 DenseMap<Register, Register>::iterator RI = RedefMap.find(Val: DstReg);
2539 if (RI != RedefMap.end()) {
2540 MappedReg = RI->second;
2541 } else {
2542 assert(!ReachingDefCopyMap.contains(DstReg));
2543 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(Reg: DstReg);
2544 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(SRC: DstRC);
2545
2546 // Track the mapping of the original register to the new register.
2547 MappedReg = DAG.MRI.createVirtualRegister(RegClass: VGPRRC);
2548 RedefMap[DstReg] = MappedReg;
2549 }
2550
2551 // If none exists, create a copy from this reaching def.
2552 // We may have inserted a copy already in an earlier iteration.
2553 for (MachineInstr *RD : DstUseDefsReplace) {
2554 // Do not create redundant copies.
2555 if (ReachingDefCopyMap[DstReg].insert(Ptr: RD).second) {
2556 MachineInstrBuilder VGPRCopy =
2557 BuildMI(BB&: *RD->getParent(), I: std::next(x: RD->getIterator()),
2558 MIMD: RD->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY))
2559 .addDef(RegNo: MappedReg, Flags: {}, SubReg: 0)
2560 .addUse(RegNo: DstReg, Flags: {}, SubReg: 0);
2561 DAG.LIS->InsertMachineInstrInMaps(MI&: *VGPRCopy);
2562
2563 // If this reaching def was the last MI in the region, update the
2564 // region boundaries.
2565 DenseMap<MachineInstr *, unsigned>::iterator LMI =
2566 LastMIToRegion.find(Val: RD);
2567 if (LMI != LastMIToRegion.end()) {
2568 unsigned UpdateRegion = LMI->second;
2569 DAG.Regions[UpdateRegion].second = VGPRCopy;
2570 LastMIToRegion.erase(Val: RD);
2571 }
2572 }
2573 }
2574 }
2575
2576 DenseSet<MachineOperand *> &DstRegSet = ReplaceMap[DstReg];
2577 for (MachineOperand *RU : DstReachingUseCopies) {
2578 MachineBasicBlock *RUBlock = RU->getParent()->getParent();
2579 // Just keep track of the reaching use of this register by block. After we
2580 // have scanned all the MFMAs we can find optimal insert pts.
2581 if (RUBlock != MI->getParent()) {
2582 ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(Ptr: RU);
2583 continue;
2584 }
2585
2586 // Special case: the use is in the same block as the MFMA. Insert the copy
2587 // just before the use.
2588 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(Reg: DstReg);
2589 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(SRC: DstRC);
2590 Register NewUseReg = DAG.MRI.createVirtualRegister(RegClass: VGPRRC);
2591 MachineInstr *UseInst = RU->getParent();
2592 MachineInstrBuilder VGPRCopy =
2593 BuildMI(BB&: *UseInst->getParent(), I: UseInst->getIterator(),
2594 MIMD: UseInst->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY))
2595 .addDef(RegNo: NewUseReg, Flags: {}, SubReg: 0)
2596 .addUse(RegNo: DstReg, Flags: {}, SubReg: 0);
2597 DAG.LIS->InsertMachineInstrInMaps(MI&: *VGPRCopy);
2598 // Since we know this use has only one reaching def, we can replace the
2599 // use reg.
2600 RU->setReg(NewUseReg);
2601 // Track the copy source operand for replacement.
2602 DstRegSet.insert(V: &VGPRCopy->getOperand(i: 1));
2603 }
2604
2605 // Track the register for reclassification
2606 RewriteRegs.insert(V: DstReg);
2607
2608 // Insert the dst operand for replacement. If this dst is in a chain of
2609 // tied-def MFMAs, and the first src2 needs to be replaced with a new reg,
2610 // all the corresponding operands need to be replaced.
2611 DstRegSet.insert(V: Dst);
2612 }
2613
2614 // Handle the copies for dst uses.
2615 using RUBType =
2616 std::pair<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>;
2617 for (RUBType RUBlockEntry : ReachingUseTracker) {
2618 using RUDType = std::pair<Register, SmallPtrSet<MachineOperand *, 8>>;
2619 for (RUDType RUDst : RUBlockEntry.second) {
2620 MachineOperand *OpBegin = *RUDst.second.begin();
2621 SlotIndex InstPt = DAG.LIS->getInstructionIndex(Instr: *OpBegin->getParent());
2622
2623 // Find the earliest use in this block.
2624 for (MachineOperand *User : RUDst.second) {
2625 SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(Instr: *User->getParent());
2626 if (SlotIndex::isEarlierInstr(A: NewInstPt, B: InstPt))
2627 InstPt = NewInstPt;
2628 }
2629
2630 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(Reg: RUDst.first);
2631 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(SRC: DstRC);
2632 Register NewUseReg = DAG.MRI.createVirtualRegister(RegClass: VGPRRC);
2633 MachineInstr *UseInst = DAG.LIS->getInstructionFromIndex(index: InstPt);
2634
2635 MachineInstrBuilder VGPRCopy =
2636 BuildMI(BB&: *UseInst->getParent(), I: UseInst->getIterator(),
2637 MIMD: UseInst->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY))
2638 .addDef(RegNo: NewUseReg, Flags: {}, SubReg: 0)
2639 .addUse(RegNo: RUDst.first, Flags: {}, SubReg: 0);
2640 DAG.LIS->InsertMachineInstrInMaps(MI&: *VGPRCopy);
2641
2642 // If this UseInst was the first MI in the region, update the region
2643 // boundaries.
2644 DenseMap<MachineInstr *, unsigned>::iterator FI =
2645 FirstMIToRegion.find(Val: UseInst);
2646 if (FI != FirstMIToRegion.end()) {
2647 unsigned UpdateRegion = FI->second;
2648 DAG.Regions[UpdateRegion].first = VGPRCopy;
2649 FirstMIToRegion.erase(Val: UseInst);
2650 }
2651
2652 // Replace the operand for all users.
2653 for (MachineOperand *User : RUDst.second) {
2654 User->setReg(NewUseReg);
2655 }
2656
2657 // Track the copy source operand for replacement.
2658 ReplaceMap[RUDst.first].insert(V: &VGPRCopy->getOperand(i: 1));
2659 }
2660 }
2661
2662 // We may have needed to insert copies after the reaching defs of the MFMAs.
2663 // Replace the original register with the result of the copy for all relevant
2664 // operands.
2665 for (std::pair<Register, Register> NewDef : RedefMap) {
2666 Register OldReg = NewDef.first;
2667 Register NewReg = NewDef.second;
2668
2669 // Replace the register for any associated operand in the MFMA chain.
2670 for (MachineOperand *ReplaceOp : ReplaceMap[OldReg])
2671 ReplaceOp->setReg(NewReg);
2672 }
2673
2674 // Finally, do the reclassification of the MFMA registers.
2675 for (Register RewriteReg : RewriteRegs) {
2676 Register RegToRewrite = RewriteReg;
2677
2678 // Be sure to update the replacement register and not the original.
2679 DenseMap<Register, Register>::iterator RI = RedefMap.find(Val: RewriteReg);
2680 if (RI != RedefMap.end())
2681 RegToRewrite = RI->second;
2682
2683 const TargetRegisterClass *CurrRC = DAG.MRI.getRegClass(Reg: RegToRewrite);
2684 const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(SRC: CurrRC);
2685
2686 DAG.MRI.setRegClass(Reg: RegToRewrite, RC: AGPRRC);
2687 }
2688
2689 // Bulk update the LIS.
2690 DAG.LIS->reanalyze(MF&: DAG.MF);
2691 // Live-ins may have been modified by cross-RC copies.
2692 RegionPressureMap LiveInUpdater(&DAG, false);
2693 LiveInUpdater.buildLiveRegMap();
2694
2695 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++)
2696 DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(RegionIdx: Region);
2697
2698 DAG.Pressure[RegionIdx] = DAG.getRealRegPressure(RegionIdx);
2699
2700 return true;
2701}
2702
2703unsigned PreRARematStage::getStageTargetOccupancy() const {
2704 return TargetOcc ? *TargetOcc : MFI.getMinWavesPerEU();
2705}
2706
2707bool PreRARematStage::setObjective() {
2708 const Function &F = MF.getFunction();
2709
2710 // Set up "spilling targets" for all regions.
2711 unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
2712 unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
2713 bool HasVectorRegisterExcess = false;
2714 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2715 const GCNRegPressure &RP = DAG.Pressure[I];
2716 GCNRPTarget &Target = RPTargets.emplace_back(Args&: MaxSGPRs, Args&: MaxVGPRs, Args&: MF, Args: RP);
2717 if (!Target.satisfied())
2718 TargetRegions.set(I);
2719 HasVectorRegisterExcess |= Target.hasVectorRegisterExcess();
2720 }
2721
2722 if (HasVectorRegisterExcess || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
2723 // In addition to register usage being above addressable limits, occupancy
2724 // below the minimum is considered like "spilling" as well.
2725 TargetOcc = std::nullopt;
2726 } else {
2727 // There is no spilling and room to improve occupancy; set up "increased
2728 // occupancy targets" for all regions.
2729 TargetOcc = DAG.MinOccupancy + 1;
2730 const unsigned VGPRBlockSize = MFI.getDynamicVGPRBlockSize();
2731 MaxSGPRs = ST.getMaxNumSGPRs(WavesPerEU: *TargetOcc, Addressable: false);
2732 MaxVGPRs = ST.getMaxNumVGPRs(WavesPerEU: *TargetOcc, DynamicVGPRBlockSize: VGPRBlockSize);
2733 for (auto [I, Target] : enumerate(First&: RPTargets)) {
2734 Target.setTarget(NumSGPRs: MaxSGPRs, NumVGPRs: MaxVGPRs);
2735 if (!Target.satisfied())
2736 TargetRegions.set(I);
2737 }
2738 }
2739
2740 return TargetRegions.any();
2741}
2742
2743bool PreRARematStage::collectRematRegs(
2744 const DenseMap<MachineInstr *, unsigned> &MIRegion) {
2745 // We need up-to-date live-out info to query live-out register masks in
2746 // regions containing rematerializable instructions.
2747 DAG.RegionLiveOuts.buildLiveRegMap();
2748
2749 // Set of registers already marked for potential rematerialization; used to
2750 // avoid rematerialization chains.
2751 SmallSet<Register, 4> MarkedRegs;
2752 auto IsMarkedForRemat = [&MarkedRegs](const MachineOperand &MO) -> bool {
2753 return MO.isReg() && MarkedRegs.contains(V: MO.getReg());
2754 };
2755
2756 // Identify rematerializable instructions in the function.
2757 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2758 RegionBoundaries Bounds = DAG.Regions[I];
2759 for (auto MI = Bounds.first; MI != Bounds.second; ++MI) {
2760 // The instruction must be rematerializable.
2761 MachineInstr &DefMI = *MI;
2762 if (!isReMaterializable(MI: DefMI))
2763 continue;
2764
2765 // We only support rematerializing virtual registers with one
2766 // definition.
2767 Register Reg = DefMI.getOperand(i: 0).getReg();
2768 if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(RegNo: Reg))
2769 continue;
2770
2771 // We only care to rematerialize the instruction if it has a single
2772 // non-debug user in a different region.
2773 // FIXME: Allow rematerializations with multiple uses. This should be
2774 // relatively easy to support using the current cost model.
2775 MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(RegNo: Reg);
2776 if (!UseMI)
2777 continue;
2778 auto UseRegion = MIRegion.find(Val: UseMI);
2779 if (UseRegion == MIRegion.end() || UseRegion->second == I)
2780 continue;
2781
2782 // Do not rematerialize an instruction if it uses or is used by an
2783 // instruction that we have designated for rematerialization.
2784 // FIXME: Allow for rematerialization chains: this requires 1. updating
2785 // remat points to account for uses that are rematerialized, and 2.
2786 // either rematerializing the candidates in careful ordering, or
2787 // deferring the MBB RP walk until the entire chain has been
2788 // rematerialized.
2789 const MachineOperand &UseMO = UseMI->getOperand(i: 0);
2790 if (IsMarkedForRemat(UseMO) ||
2791 llvm::any_of(Range: DefMI.operands(), P: IsMarkedForRemat))
2792 continue;
2793
2794 // Do not rematerialize an instruction if it uses registers that aren't
2795 // available at its use. This ensures that we are not extending any live
2796 // range while rematerializing.
2797 SlotIndex UseIdx = DAG.LIS->getInstructionIndex(Instr: *UseMI).getRegSlot(EC: true);
2798 if (!VirtRegAuxInfo::allUsesAvailableAt(MI: &DefMI, UseIdx, LIS: *DAG.LIS, MRI: DAG.MRI,
2799 TII: *DAG.TII))
2800 continue;
2801
2802 // Add the instruction to the rematerializable list.
2803 MarkedRegs.insert(V: Reg);
2804 RematRegs.emplace_back(Args: &DefMI, Args&: UseMI, Args&: DAG, Args: MIRegion);
2805 }
2806 }
2807
2808 return !RematRegs.empty();
2809}
2810
2811PreRARematStage::RematReg::RematReg(
2812 MachineInstr *DefMI, MachineInstr *UseMI, GCNScheduleDAGMILive &DAG,
2813 const DenseMap<MachineInstr *, unsigned> &MIRegion)
2814 : DefMI(DefMI), UseMI(UseMI), LiveIn(DAG.Regions.size()),
2815 LiveOut(DAG.Regions.size()), Live(DAG.Regions.size()),
2816 DefRegion(MIRegion.at(Val: DefMI)), UseRegion(MIRegion.at(Val: UseMI)) {
2817
2818 // Mark regions in which the rematerializable register is live.
2819 Register Reg = getReg();
2820 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2821 auto LiveInIt = DAG.LiveIns[I].find(Val: Reg);
2822 if (LiveInIt != DAG.LiveIns[I].end())
2823 LiveIn.set(I);
2824 const auto &LiveOuts = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: I);
2825 if (auto LiveOutIt = LiveOuts.find(Val: Reg); LiveOutIt != LiveOuts.end())
2826 LiveOut.set(I);
2827 }
2828 Live |= LiveIn;
2829 Live |= LiveOut;
2830 Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: DefRegion).at(Val: Reg);
2831}
2832
2833bool PreRARematStage::RematReg::maybeBeneficial(
2834 const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets) const {
2835 Register Reg = getReg();
2836 for (unsigned I : TargetRegions.set_bits()) {
2837 if (Live[I] && RPTargets[I].isSaveBeneficial(Reg))
2838 return true;
2839 }
2840 return false;
2841}
2842
2843void PreRARematStage::RematReg::insertMI(unsigned RegionIdx,
2844 MachineInstr *RematMI,
2845 GCNScheduleDAGMILive &DAG) const {
2846 RegionBoundaries &Bounds = DAG.Regions[RegionIdx];
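 // If the region previously started right after the insertion point, the
 // rematerialized instruction becomes the region's new first instruction.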
2847 if (Bounds.first == std::next(x: MachineBasicBlock::iterator(RematMI)))
2848 Bounds.first = RematMI;
2849 DAG.LIS->InsertMachineInstrInMaps(MI&: *RematMI);
2850 DAG.LIS->createAndComputeVirtRegInterval(Reg: RematMI->getOperand(i: 0).getReg());
2851}
2852
2853PreRARematStage::ScoredRemat::FreqInfo::FreqInfo(
2854 MachineFunction &MF, const GCNScheduleDAGMILive &DAG) {
2855 assert(DAG.MLI && "MLI not defined in DAG");
2856 MachineBranchProbabilityInfo MBPI;
2857 MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI);
2858
2859 const unsigned NumRegions = DAG.Regions.size();
2860 MinFreq = MBFI.getEntryFreq().getFrequency();
2861 MaxFreq = 0;
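 // Seed MinFreq with the entry block's frequency and MaxFreq with zero so
 // that any region with a known (non-zero) frequency can lower MinFreq or
 // raise MaxFreq in the loop below.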
2862 Regions.reserve(N: NumRegions);
2863 for (unsigned I = 0; I < NumRegions; ++I) {
2864 MachineBasicBlock *MBB = DAG.Regions[I].first->getParent();
2865 uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency();
2866 Regions.push_back(Elt: BlockFreq);
2867 if (BlockFreq && BlockFreq < MinFreq)
2868 MinFreq = BlockFreq;
2869 else if (BlockFreq > MaxFreq)
2870 MaxFreq = BlockFreq;
2871 }
2872 if (!MinFreq)
2873 return;
2874
2875 // Scale everything down if frequencies are high.
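 // Dividing everything by ScaleFactor preserves the relative ordering of
 // region frequencies while reducing their magnitude.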
2876 if (MinFreq >= ScaleFactor * ScaleFactor) {
2877 for (uint64_t &Freq : Regions)
2878 Freq /= ScaleFactor;
2879 MinFreq /= ScaleFactor;
2880 MaxFreq /= ScaleFactor;
2881 }
2882}
2883
2884PreRARematStage::ScoredRemat::ScoredRemat(RematReg *Remat, const FreqInfo &Freq,
2885 const GCNScheduleDAGMILive &DAG)
2886 : Remat(Remat), NumRegs(getNumRegs(DAG)), FreqDiff(getFreqDiff(Freq)) {}
2887
2888unsigned PreRARematStage::ScoredRemat::getNumRegs(
2889 const GCNScheduleDAGMILive &DAG) const {
2890 const TargetRegisterClass &RC = *DAG.MRI.getRegClass(Reg: Remat->getReg());
2891 unsigned RegSize = DAG.TRI->getRegSizeInBits(RC);
2892 if (unsigned SubIdx = Remat->DefMI->getOperand(i: 0).getSubReg()) {
2893 // The following may return -1 (i.e., a large unsigned number) for indices
2894 // that can be used to access subregisters of multiple sizes; in such cases,
2895 // fall back on the size derived from the register class.
2896 unsigned SubRegSize = DAG.TRI->getSubRegIdxSize(Idx: SubIdx);
2897 if (SubRegSize < RegSize)
2898 RegSize = SubRegSize;
2899 }
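 // Express the register's size in 32-bit units; e.g., a full 128-bit register
 // counts as 4 while a 16-bit subregister counts as 1.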
2900 return divideCeil(Numerator: RegSize, Denominator: 32);
2901}
2902
2903int64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const {
2904 // Get the frequencies of the defining and using regions. A rematerialization
2905 // from the least frequent region to the most frequent region yields the
2906 // greatest latency penalty and therefore should get the minimum score.
2907 // Conversely, a rematerialization in the other direction should get the
2908 // maximum score. Default to values that yield the worst possible score given
2909 // known frequencies, in order to penalize rematerializations from or into
2910 // regions whose frequency is unknown.
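 // For example, if the defining region has known frequency 4 and the using
 // region known frequency 32, moving the instruction to the hotter region
 // scores 4 - 32 = -28; the reverse direction scores +28.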
2911 int64_t DefOrMin = std::max(a: Freq.Regions[Remat->DefRegion], b: Freq.MinFreq);
2912 int64_t UseOrMax = Freq.Regions[Remat->UseRegion];
2913 if (!UseOrMax)
2914 UseOrMax = Freq.MaxFreq;
2915 return DefOrMin - UseOrMax;
2916}
2917
2918void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
2919 ArrayRef<GCNRPTarget> RPTargets,
2920 const FreqInfo &FreqInfo,
2921 bool ReduceSpill) {
2922 MaxFreq = 0;
2923 RegionImpact = 0;
2924 for (unsigned I : TargetRegions.set_bits()) {
2925 if (!Remat->Live[I] || !RPTargets[I].isSaveBeneficial(Reg: Remat->getReg()))
2926 continue;
2927 bool UnusedLT = Remat->isUnusedLiveThrough(I);
2928
2929 // Regions in which RP is guaranteed to decrease have more weight.
2930 RegionImpact += UnusedLT ? 2 : 1;
2931
2932 if (ReduceSpill) {
2933 uint64_t Freq = FreqInfo.Regions[I];
2934 if (!UnusedLT) {
2935 // Apply a frequency penalty in regions in which we are not sure that RP
2936 // will decrease.
2937 Freq /= 2;
2938 }
2939 MaxFreq = std::max(a: MaxFreq, b: Freq);
2940 }
2941 }
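 // Weight the per-region impact by the register's size in 32-bit units: e.g.,
 // a guaranteed save of a 128-bit register in one region scores 2 * 4 = 8,
 // more than guaranteed saves of a 32-bit register in two regions (4 * 1 = 4).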
2942 RegionImpact *= NumRegs;
2943}
2944
2945void PreRARematStage::rematerialize(const RematReg &Remat,
2946 BitVector &RecomputeRP,
2947 RollbackInfo *Rollback) {
2948 const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
2949 MachineInstr &DefMI = *Remat.DefMI;
2950 Register Reg = DefMI.getOperand(i: 0).getReg();
2951 Register NewReg = DAG.MRI.cloneVirtualRegister(VReg: Reg);
2952
2953 // Rematerialize the register in the region where it is used.
2954 MachineBasicBlock::iterator InsertPos = Remat.UseMI;
2955 TII->reMaterialize(MBB&: *InsertPos->getParent(), MI: InsertPos, DestReg: NewReg, SubIdx: 0, Orig: DefMI);
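 // reMaterialize inserts the clone immediately before InsertPos, so the newly
 // created instruction is the one just prior to the use.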
2956 MachineInstr *RematMI = &*std::prev(x: InsertPos);
2957 Remat.UseMI->substituteRegister(FromReg: Reg, ToReg: NewReg, SubIdx: 0, RegInfo: *DAG.TRI);
2958 Remat.insertMI(RegionIdx: Remat.UseRegion, RematMI, DAG);
2959 if (Rollback) {
2960 Rollback->RematMI = RematMI;
2961 // Make the original MI a debug value so that it does not influence
2962 // scheduling, and replace all read registers with a sentinel register to
2963 // prevent its operands from appearing in the use-lists of other MIs during
2964 // LIS updates. Store mappings between operand indices and original
2965 // registers for potential rollback.
2966 DefMI.setDesc(TII->get(Opcode: TargetOpcode::DBG_VALUE));
2967 for (auto [Idx, MO] : enumerate(First: Remat.DefMI->operands())) {
2968 if (MO.isReg() && MO.readsReg()) {
2969 Rollback->RegMap.insert(KV: {Idx, MO.getReg()});
2970 MO.setReg(Register());
2971 }
2972 }
2973 } else {
2974 // Just delete the original instruction if it cannot be rolled back.
2975 DAG.deleteMI(RegionIdx: Remat.DefRegion, MI: &DefMI);
2976 }
2977
2978#ifdef EXPENSIVE_CHECKS
2979 // All uses are known to be available / live at the remat point. Thus, the
2980 // uses should already be live-in to the using region.
2981 for (MachineOperand &MO : DefMI.operands()) {
2982 if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
2983 continue;
2984
2985 Register UseReg = MO.getReg();
2986 if (!UseReg.isVirtual())
2987 continue;
2988
2989 LiveInterval &LI = DAG.LIS->getInterval(UseReg);
2990 LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
2991 if (LI.hasSubRanges() && MO.getSubReg())
2992 LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
2993
2994 LaneBitmask LiveInMask = DAG.LiveIns[Remat.UseRegion].at(UseReg);
2995 LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
2996 // If this register has lanes not covered by the LiveIns, be sure they
2997 // do not map to any subrange. ref:
2998 // machine-scheduler-sink-trivial-remats.mir::omitted_subrange
2999 if (UncoveredLanes.any()) {
3000 assert(LI.hasSubRanges());
3001 for (LiveInterval::SubRange &SR : LI.subranges())
3002 assert((SR.LaneMask & UncoveredLanes).none());
3003 }
3004 }
3005#endif
3006
3007 // Remove the register from all regions where it is a live-in or live-out
3008 // and adjust RP targets. The save is guaranteed in regions in which the
3009 // register is live-through and unused but optimistic in all other regions
3010 // where the register is live.
3011 for (unsigned I : Remat.Live.set_bits()) {
3012 RPTargets[I].saveReg(Reg, Mask: Remat.Mask, MRI: DAG.MRI);
3013 DAG.LiveIns[I].erase(Val: Reg);
3014 DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: I).erase(Val: Reg);
3015 if (!Remat.isUnusedLiveThrough(I))
3016 RecomputeRP.set(I);
3017 }
3018
3019 RescheduleRegions |= Remat.Live;
3020}
3021
3022void PreRARematStage::rollback(const RollbackInfo &Rollback) const {
3023 const auto &[Remat, RematMI, RegMap] = Rollback;
3024
3025 // Restore the original defining instruction to its original state.
3026 Remat->DefMI->setDesc(DAG.TII->get(Opcode: RematMI->getOpcode()));
3027 for (const auto &[MOIdx, Reg] : RegMap)
3028 Remat->DefMI->getOperand(i: MOIdx).setReg(Reg);
3029
3030 // Switch back to using the original register and delete the
3031 // rematerialization.
3032 Register Reg = RematMI->getOperand(i: 0).getReg();
3033 Register OriginalReg = Remat->DefMI->getOperand(i: 0).getReg();
3034 Remat->UseMI->substituteRegister(FromReg: Reg, ToReg: OriginalReg, SubIdx: 0, RegInfo: *DAG.TRI);
3035 REMAT_DEBUG(dbgs() << '[' << Remat->UseRegion
3036 << "] Deleting rematerialization " << *RematMI);
3037 DAG.deleteMI(RegionIdx: Remat->UseRegion, MI: RematMI);
3038
3039 // Re-add the defined register as a live-in/live-out in all regions in which
3040 // it was one before rematerialization.
3041 std::pair<Register, LaneBitmask> LiveReg(OriginalReg, Remat->Mask);
3042 for (unsigned I : Remat->LiveIn.set_bits())
3043 DAG.LiveIns[I].insert(KV: LiveReg);
3044 for (unsigned I : Remat->LiveOut.set_bits())
3045 DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: I).insert(KV: LiveReg);
3046}
3047
3048void PreRARematStage::commitRematerializations() const {
3049 REMAT_DEBUG(dbgs() << "Committing all rematerializations\n");
3050 for (const RollbackInfo &Rollback : Rollbacks)
3051 DAG.deleteMI(RegionIdx: Rollback.Remat->DefRegion, MI: Rollback.Remat->DefMI);
3052}
3053
3054void PreRARematStage::unsetSatisifedRPTargets(const BitVector &Regions) {
3055 for (unsigned I : Regions.set_bits()) {
3056 if (TargetRegions[I] && RPTargets[I].satisfied()) {
3057 REMAT_DEBUG(dbgs() << " [" << I << "] Target reached!\n");
3058 TargetRegions.reset(Idx: I);
3059 }
3060 }
3061}
3062
3063bool PreRARematStage::updateAndVerifyRPTargets(const BitVector &Regions) {
3064 bool TooOptimistic = false;
3065 for (unsigned I : Regions.set_bits()) {
3066 GCNRPTarget &Target = RPTargets[I];
3067 Target.setRP(DAG.getRealRegPressure(RegionIdx: I));
3068
3069 // Since we were optimistic in assessing RP decreases in these regions, we
3070 // may need to re-mark the region as a target region if RP did not decrease
3071 // as expected.
3072 if (!TargetRegions[I] && !Target.satisfied()) {
3073 REMAT_DEBUG(dbgs() << " [" << I << "] Incorrect RP estimation\n");
3074 TooOptimistic = true;
3075 TargetRegions.set(I);
3076 }
3077 }
3078 return TooOptimistic;
3079}
3080
3081// Copied from MachineLICM
3082bool PreRARematStage::isReMaterializable(const MachineInstr &MI) {
3083 if (!DAG.TII->isReMaterializable(MI))
3084 return false;
3085
3086 for (const MachineOperand &MO : MI.all_uses()) {
3087 // We can't remat physreg uses, unless the register is constant or the use
3088 // is ignorable (e.g., the implicit exec use on VALU instructions).
3089 if (MO.getReg().isPhysical()) {
3090 if (DAG.MRI.isConstantPhysReg(PhysReg: MO.getReg()) || DAG.TII->isIgnorableUse(MO))
3091 continue;
3092 return false;
3093 }
3094 }
3095
3096 return true;
3097}
3098
3099void PreRARematStage::finalizeGCNSchedStage() {
3100 // We consider reducing spilling to always be beneficial, so we never roll
3101 // back rematerializations or revert scheduling in such cases.
3102 if (!TargetOcc)
3103 return;
3104
3105 // When increasing occupancy, re-scheduling may fail to achieve the target
3106 // occupancy in all regions, in which case re-scheduling must be reverted in
3107 // all regions.
3108 if (DAG.MinOccupancy >= *TargetOcc) {
3109 commitRematerializations();
3110 return;
3111 }
3112 for (const auto &[RegionIdx, OrigMIOrder, MaxPressure] : RegionReverts) {
3113 REMAT_DEBUG(dbgs() << "Reverting re-scheduling in region " << RegionIdx
3114 << '\n');
3115 DAG.Pressure[RegionIdx] = MaxPressure;
3116 modifyRegionSchedule(RegionIdx, MBB: RegionBB[RegionIdx], MIOrder: OrigMIOrder);
3117 }
3118
3119 // It is possible that re-scheduling lowers occupancy below the one achieved
3120 // through rematerializations alone, in which case we revert re-scheduling in
3121 // all regions but do not roll back rematerializations.
3122 if (AchievedOcc >= *TargetOcc) {
3123 commitRematerializations();
3124 DAG.setTargetOccupancy(AchievedOcc);
3125 return;
3126 }
3127 // Reset the target occupancy to what it was pre-rematerialization.
3128 DAG.setTargetOccupancy(*TargetOcc - 1);
3129
3130 // Rollback, then recompute pressure in all affected regions.
3131 REMAT_DEBUG(dbgs() << "==== ROLLBACK ====\n");
3132 BitVector RecomputeRP(DAG.Regions.size());
3133 DenseSet<Register> RecomputeLI;
3134 for (const RollbackInfo &Rollback : Rollbacks) {
3135 rollback(Rollback);
3136 RecomputeRP |= Rollback.Remat->Live;
3137 // Regenerate intervals for all register operands of rematerialized MIs as
3138 // slot indices may have changed slightly from before re-scheduling.
3139 for (MachineOperand &MO : Rollback.Remat->DefMI->operands()) {
3140 if (MO.isReg() && MO.getReg().isVirtual())
3141 RecomputeLI.insert(V: MO.getReg());
3142 }
3143 }
3144 for (Register Reg : RecomputeLI) {
3145 DAG.LIS->removeInterval(Reg);
3146 DAG.LIS->createAndComputeVirtRegInterval(Reg);
3147 }
3148 for (unsigned I : RecomputeRP.set_bits())
3149 DAG.Pressure[I] = DAG.getRealRegPressure(RegionIdx: I);
3150
3151 GCNSchedStage::finalizeGCNSchedStage();
3152}
3153
3154void GCNScheduleDAGMILive::deleteMI(unsigned RegionIdx, MachineInstr *MI) {
3155 // The deleted instruction cannot be the upper region boundary since we never
3156 // delete region terminators.
3157 if (Regions[RegionIdx].first == MI)
3158 Regions[RegionIdx].first = std::next(x: MachineBasicBlock::iterator(MI));
3159 LIS->removeInterval(Reg: MI->getOperand(i: 0).getReg());
3160 LIS->RemoveMachineInstrFromMaps(MI&: *MI);
3161 MI->eraseFromParent();
3162}
3163
3164void GCNScheduleDAGMILive::setTargetOccupancy(unsigned TargetOccupancy) {
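 // Record the new scheduler-wide minimum occupancy and adjust the function's
 // occupancy bound in the same direction.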
3165 MinOccupancy = TargetOccupancy;
3166 if (MFI.getOccupancy() < TargetOccupancy)
3167 MFI.increaseOccupancy(MF, Limit: MinOccupancy);
3168 else
3169 MFI.limitOccupancy(Limit: MinOccupancy);
3170}
3171
3172static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
3173 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
3174 return any_of(Range&: *DAG, P: [SII](MachineBasicBlock::iterator MI) {
3175 return SII->isIGLPMutationOnly(Opcode: MI->getOpcode());
3176 });
3177}
3178
3179GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
3180 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
3181 bool RemoveKillFlags)
3182 : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
3183
3184void GCNPostScheduleDAGMILive::schedule() {
3185 HasIGLPInstrs = hasIGLPInstrs(DAG: this);
3186 if (HasIGLPInstrs) {
3187 SavedMutations.clear();
3188 SavedMutations.swap(x&: Mutations);
3189 addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PostRA));
3190 }
3191
3192 ScheduleDAGMI::schedule();
3193}
3194
3195void GCNPostScheduleDAGMILive::finalizeSchedule() {
3196 if (HasIGLPInstrs)
3197 SavedMutations.swap(x&: Mutations);
3198
3199 ScheduleDAGMI::finalizeSchedule();
3200}
3201