//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "SIMachineScheduler.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"
// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// for GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
//   for all the SGPR load instructions before your current instruction to
//   have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
//   for all the VGPR load instructions previous to the VGPR load instruction
//   you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden.
//
// Moreover some specificities (like the fact that a lot of instructions in
// the shader have few dependencies) make the generic scheduler have some
// unpredictable behaviours. For example when register pressure becomes high,
// it can either manage to prevent register pressure from going too high, or
// it can increase register pressure even more than if it hadn't taken
// register pressure into account.
//
// Also some other bad behaviours are generated, like loading at the beginning
// of the shader a constant in a VGPR that you won't need until the end of the
// shader.
//
// The scheduling problem for SI can be divided into three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact the HW can issue several instructions from different
// wavefronts if they are of different types, etc)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks helps keep register usage low.
//
// First the instructions are put into blocks.
// We want the blocks to help control register usage and hide high latencies
// later. To help control register usage, we typically want all local
// computations, when for example you create a result that can be consumed
// right away, to be contained in a block. Block inputs and outputs would
// typically be important results that are needed in several locations of
// the shader. Since we do want blocks to help hide high latencies, we want
// the instructions inside the block to have a minimal set of dependencies
// on high latencies. It will make it easy to pick blocks to hide specific
// high latencies.
// The block creation algorithm is divided into several steps, and several
// variants can be tried during the scheduling process.
//
// Second the order of the instructions inside the blocks is chosen.
// At that step we take into account only register usage and hiding
// low latency instructions.
//
// Third the block order is chosen; there we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
//
// Actually when talking about 'low latency' or 'high latency', this includes
// both the latency for the cache (or global memory) data to reach the
// register, and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, which is why even if the wavefront count is high,
// we have to try to have as many instructions hiding high latencies as
// possible.
// The OpenCL doc gives for example a latency of 400 cycles for a global mem
// access, which is hidden by 10 instructions if the wavefront count is 10.

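// As a minimal sketch of the arithmetic above (assuming, per the comment,
// that each wavefront issues one instruction every 4 cycles and wavefronts
// are interleaved; illustrative only, not used by the scheduler):
namespace {
constexpr unsigned cyclesHidden(unsigned IndependentInsts, unsigned Waves) {
  // Each independent instruction of each interleaved wavefront covers one
  // 4-cycle issue slot while the memory latency elapses.
  return IndependentInsts * Waves * 4;
}
// 10 independent instructions at a wavefront count of 10 hide the ~400
// cycles quoted above for a global memory access.
static_assert(cyclesHidden(10, 10) == 400, "matches the OpenCL figure");
} // end anonymous namespace
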
// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64 bytes
// lines.
// Constant cache is shared with 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
// instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated to the 3 other VALUs
// of the CU do texture sampling too. (Don't take these figures too seriously,
// as I'm not 100% sure of the computation)
// Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Units".
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as an s_buffer_loadx2, and potentially an s_buffer_loadx8 if on the
// same cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can be
// 300-600 cycles. We do not specially take that into account when scheduling,
// as we expect the driver to be able to preload the constants soon.
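
// A back-of-the-envelope sketch of the claim above (assuming the quoted
// 16B/cycle of constant cache throughput per CU and one issued instruction
// per 4 cycles; illustrative only, not used by the scheduler):
namespace {
constexpr unsigned cacheCyclesNeeded(unsigned Bytes, unsigned BytesPerCycle) {
  return (Bytes + BytesPerCycle - 1) / BytesPerCycle; // round up
}
// A one-dword s_buffer_load fetches 4 bytes, an s_buffer_loadx2 8 bytes:
// both fit well within the 4-cycle slot of a single issued instruction.
static_assert(cacheCyclesNeeded(8, 16) <= 4, "hidden by one instruction");
// An s_buffer_loadx8 fetches 32 bytes: still within one slot, provided the
// data sits on a single 64-byte cache line.
static_assert(cacheCyclesNeeded(32, 16) <= 4, "potentially one instruction");
} // end anonymous namespace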

// common code //

#ifndef NDEBUG

static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand: return "NOCAND";
  case RegUsage: return "REGUSAGE";
  case Latency: return "LATENCY";
  case Successor: return "SUCCESSOR";
  case Depth: return "DEPTH";
  case NodeOrder: return "ORDER";
  }
  llvm_unreachable("Unknown reason!");
}

#endif

namespace llvm::SISched {
static bool tryLess(int TryVal, int CandVal,
                    SISchedulerCandidate &TryCand,
                    SISchedulerCandidate &Cand,
                    SIScheduleCandReason Reason) {
  if (TryVal < CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

static bool tryGreater(int TryVal, int CandVal,
                       SISchedulerCandidate &TryCand,
                       SISchedulerCandidate &Cand,
                       SIScheduleCandReason Reason) {
  if (TryVal > CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal < CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}
} // end namespace llvm::SISched
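
// A minimal usage sketch (hypothetical fields and reasons; the real chains
// appear in tryCandidateTopDown and tryCandidateLatency below): the helpers
// are meant to be chained so that the first criterion that discriminates
// wins, and ties fall through to the next criterion:
//
//   if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
//                        TryCand, Cand, RegUsage))
//     return; // decided on register usage
//   if (SISched::tryGreater(TryCand.Height, Cand.Height,
//                           TryCand, Cand, Depth))
//     return; // otherwise decided on height
//
// On a tie both helpers return false after recording the repeated reason
// via setRepeat(), letting the caller consult the next criterion.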

// SIScheduleBlock //

void SIScheduleBlock::addUnit(SUnit *SU) {
  NodeNum2Index[SU->NodeNum] = SUnits.size();
  SUnits.push_back(SU);
}

#ifndef NDEBUG
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {

  dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
#endif

void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  if (Cand.SGPRUsage > 60 &&
      SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as close to the top as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  //     - (eventually some more low latency instructions)
  //     - instructions that depend on the first low latency instructions.
  // If the block contains a lot of constant loads, the SGPR usage
  // could go quite high; the arbitrary limit of 60 above encourages using
  // the already loaded constants (in order to release some SGPRs) before
  // loading more.
  if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
                       Cand.HasLowLatencyNonWaitedParent,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                          TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (TryCand.IsLowLatency &&
      SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}

SUnit* SIScheduleBlock::pickNode() {
  SISchedCandidate TopCand;

  for (SUnit* SU : TopReadySUs) {
    SISchedCandidate TryCand;
    std::vector<unsigned> pressure;
    std::vector<unsigned> MaxPressure;
    // Predict register usage after this instruction.
    TryCand.SU = SU;
    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    TryCand.SGPRUsage = pressure[AMDGPU::RegisterPressureSets::SReg_32];
    TryCand.VGPRUsage = pressure[AMDGPU::RegisterPressureSets::VGPR_32];
    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    TryCand.HasLowLatencyNonWaitedParent =
      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    tryCandidateTopDown(TopCand, TryCand);
    if (TryCand.Reason != NoCand)
      TopCand.setBest(TryCand);
  }

  return TopCand.SU;
}


// Schedule something valid.
void SIScheduleBlock::fastSchedule() {
  TopReadySUs.clear();
  if (Scheduled)
    undoSchedule();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = TopReadySUs[0];
    ScheduledSUnits.push_back(SU);
    nodeScheduled(SU);
  }

  Scheduled = true;
}

// Returns true if Reg was defined between First and Last.
static bool isDefBetween(Register Reg, SlotIndex First, SlotIndex Last,
                         const MachineRegisterInfo *MRI,
                         const LiveIntervals *LIS) {
  for (const MachineInstr &MI : MRI->def_instructions(Reg)) {
    if (MI.isDebugValue())
      continue;
    SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot();
    if (InstSlot >= First && InstSlot <= Last)
      return true;
  }
  return false;
}

void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SUs. RPTracker captures what had to be alive for the SUs
  // to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not track physical registers, because it messes up.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (RegMaskPair.VRegOrUnit.isVirtualReg())
      LiveInRegs.insert(RegMaskPair.VRegOrUnit.asVirtualReg());
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situations 4, 5, 6 and 7.
  // We want LiveOutRegs to contain only Regs whose content will be read after
  // in another block, and whose content was written in the current block,
  // that is we want it to get 1, 3, 5 and 7.
  // Since we made the MIs of a block be packed all together before
  // scheduling, the LiveIntervals were correct, and the RPTracker was
  // able to correctly handle 5 vs 6, and 2 vs 3.
  // (Note: This is not sufficient for the RPTracker to avoid mistakes
  // for case 4.)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5
  // and 7.
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5 and 7.
  // The use of isDefBetween removes case 4.
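  // As an illustration (hypothetical vregs, using the case numbering above):
  // in a block containing
  //     %b = V_ADD_F32 %a, %x    ; %a and %x defined before the block
  //     %c = V_MUL_F32 %b, %y
  // %a is case 4 if another block also reads it: it has no def inside the
  // block, so the isDefBetween filter below drops it from LiveOutRegs.
  // %b is case 2 if %c is its only reader, and %c is case 1; both are
  // defined inside the block, so they pass the filter and stay in
  // LiveOutRegs if the RPTracker reports them live-out.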
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    VirtRegOrUnit VRegOrUnit = RegMaskPair.VRegOrUnit;
    if (VRegOrUnit.isVirtualReg() &&
        isDefBetween(VRegOrUnit.asVirtualReg(),
                     LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(VRegOrUnit.asVirtualReg());
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally llvm will represent some registers as big 128-bit registers
  // for example, but they actually correspond to 4 actual 32-bit registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}

void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionalPressure.
  InternalAdditionalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
         TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
           SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}

void SIScheduleBlock::undoSchedule() {
  for (SUnit* SU : SUnits) {
    SU->isScheduled = false;
    for (SDep& Succ : SU->Succs) {
      if (BC->isSUInBlock(Succ.getSUnit(), ID))
        undoReleaseSucc(SU, &Succ);
    }
  }
  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
  ScheduledSUnits.clear();
  Scheduled = false;
}

void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    ++SuccSU->WeakPredsLeft;
    return;
  }
  ++SuccSU->NumPredsLeft;
}

void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    DAG->dumpNode(*SuccSU);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}

/// Release the successors of SU that are inside the block
/// (InOrOutBlock == true) or outside of it (InOrOutBlock == false).
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
  for (SDep& Succ : SU->Succs) {
    SUnit *SuccSU = Succ.getSUnit();

    if (SuccSU->NodeNum >= DAG->SUnits.size())
      continue;

    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
      continue;

    releaseSucc(SU, &Succ);
    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
      TopReadySUs.push_back(SuccSU);
  }
}

void SIScheduleBlock::nodeScheduled(SUnit *SU) {
  // Is in TopReadySUs
  assert (!SU->NumPredsLeft);
  std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
  if (I == TopReadySUs.end()) {
    dbgs() << "Data Structure Bug in SI Scheduler\n";
    llvm_unreachable(nullptr);
  }
  TopReadySUs.erase(I);

  releaseSuccessors(SU, true);
  // Scheduling this node will trigger a wait,
  // thus propagate to other instructions that they do not need to wait either.
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);

  if (DAG->IsLowLatencySU[SU->NodeNum]) {
    for (SDep& Succ : SU->Succs) {
      std::map<unsigned, unsigned>::iterator I =
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
      if (I != NodeNum2Index.end())
        HasLowLatencyNonWaitedParent[I->second] = 1;
    }
  }
  SU->isScheduled = true;
}

void SIScheduleBlock::finalizeUnits() {
  // We remove links from outside blocks to enable scheduling inside the block.
  for (SUnit* SU : SUnits) {
    releaseSuccessors(SU, false);
    if (DAG->IsHighLatencySU[SU->NodeNum])
      HighLatencyBlock = true;
  }
  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
}

// We maintain ascending order of IDs.
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
  unsigned PredID = Pred->getID();

  // Check if not already a predecessor.
  for (SIScheduleBlock* P : Preds) {
    if (PredID == P->getID())
      return;
  }
  Preds.push_back(Pred);

  assert(none_of(Succs,
                 [=](std::pair<SIScheduleBlock*,
                     SIScheduleBlockLinkKind> S) {
                   return PredID == S.first->getID();
                 }) &&
         "Loop in the Block Graph!");
}

void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
                              SIScheduleBlockLinkKind Kind) {
  unsigned SuccID = Succ->getID();

  // Check if not already a successor.
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
    if (SuccID == S.first->getID()) {
      if (S.second == SIScheduleBlockLinkKind::NoData &&
          Kind == SIScheduleBlockLinkKind::Data)
        S.second = Kind;
      return;
    }
  }
  if (Succ->isHighLatencyBlock())
    ++NumHighLatencySuccessors;
  Succs.emplace_back(Succ, Kind);

  assert(none_of(Preds,
                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
         "Loop in the Block Graph!");
}

#ifndef NDEBUG
void SIScheduleBlock::printDebug(bool full) {
  dbgs() << "Block (" << ID << ")\n";
  if (!full)
    return;

  dbgs() << "\nContains High Latency Instruction: "
         << HighLatencyBlock << '\n';
  dbgs() << "\nDepends On:\n";
  for (SIScheduleBlock* P : Preds) {
    P->printDebug(false);
  }

  dbgs() << "\nSuccessors:\n";
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
    if (S.second == SIScheduleBlockLinkKind::Data)
      dbgs() << "(Data Dep) ";
    S.first->printDebug(false);
  }

  if (Scheduled) {
    dbgs() << "LiveInPressure "
           << LiveInPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveInPressure[AMDGPU::RegisterPressureSets::VGPR_32] << '\n';
    dbgs() << "LiveOutPressure "
           << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n";
    dbgs() << "LiveIns:\n";
    for (Register Reg : LiveInRegs)
      dbgs() << printReg(Reg, DAG->getTRI()) << ' ';

    dbgs() << "\nLiveOuts:\n";
    for (Register Reg : LiveOutRegs)
      dbgs() << printReg(Reg, DAG->getTRI()) << ' ';
  }

  dbgs() << "\nInstructions:\n";
  for (const SUnit* SU : SUnits)
    DAG->dumpNode(*SU);

  dbgs() << "///////////////////////\n";
}
#endif

// SIScheduleBlockCreator //

SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG)
    : DAG(DAG) {}

SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
    Blocks.find(BlockVariant);
  if (B == Blocks.end()) {
    SIScheduleBlocks Res;
    createBlocksForVariant(BlockVariant);
    topologicalSort();
    scheduleInsideBlocks();
    fillStats();
    Res.Blocks = CurrentBlocks;
    Res.TopDownIndex2Block = TopDownIndex2Block;
    Res.TopDownBlock2Index = TopDownBlock2Index;
    Blocks[BlockVariant] = Res;
    return Res;
  }
  return B->second;
}

bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
  if (SU->NodeNum >= DAG->SUnits.size())
    return false;
  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
}

void SIScheduleBlockCreator::colorHighLatenciesAlone() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      CurrentColoring[SU->NodeNum] = NextReservedID++;
    }
  }
}

static bool
hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
  for (const auto &PredDep : SU.Preds) {
    if (PredDep.getSUnit() == &FromSU &&
        PredDep.getKind() == llvm::SDep::Data)
      return true;
  }
  return false;
}

void SIScheduleBlockCreator::colorHighLatenciesGroups() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned NumHighLatencies = 0;
  unsigned GroupSize;
  int Color = NextReservedID;
  unsigned Count = 0;
  std::set<unsigned> FormingGroup;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum])
      ++NumHighLatencies;
  }

  if (NumHighLatencies == 0)
    return;

  if (NumHighLatencies <= 6)
    GroupSize = 2;
  else if (NumHighLatencies <= 12)
    GroupSize = 3;
  else
    GroupSize = 4;

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (DAG->IsHighLatencySU[SU.NodeNum]) {
      bool CompatibleGroup = true;
      int ProposedColor = Color;
      std::vector<int> AdditionalElements;

      // We don't want to put in the same block
      // two high latency instructions that depend
      // on each other.
      // One way would be to check canAddEdge
      // in both directions, but that currently is not
      // enough because there the high latency order is
      // enforced (via links).
      // Instead, look at the dependencies between the
      // high latency instructions and deduce if it is
      // a data dependency or not.
      for (unsigned j : FormingGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other
        if (SubGraph.size() > 5) {
          // Too many elements would be required to be added to the block.
          CompatibleGroup = false;
          break;
        }
        // Check the type of dependency
        for (unsigned k : SubGraph) {
          // If in the path to join the two instructions,
          // there is another high latency instruction,
          // or instructions colored for another block,
          // abort the merge.
          if (DAG->IsHighLatencySU[k] || (CurrentColoring[k] != ProposedColor &&
                                          CurrentColoring[k] != 0)) {
            CompatibleGroup = false;
            break;
          }
          // If one of the SUs in the subgraph depends on the result of SU j,
          // there'll be a data dependency.
          if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
            CompatibleGroup = false;
            break;
          }
        }
        if (!CompatibleGroup)
          break;
        // Same check for the SU
        if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
          CompatibleGroup = false;
          break;
        }
        // Add all the required instructions to the block.
        // These cannot live in another block (because they
        // depend (order dependency) on one of the
        // instructions in the block, and are required for the
        // high latency instruction we add).
        llvm::append_range(AdditionalElements, SubGraph);
      }
      if (CompatibleGroup) {
        FormingGroup.insert(SU.NodeNum);
        for (unsigned j : AdditionalElements)
          CurrentColoring[j] = ProposedColor;
        CurrentColoring[SU.NodeNum] = ProposedColor;
        ++Count;
      }
      // Found one incompatible instruction,
      // or has filled a big enough group.
      // -> start a new one.
      if (!CompatibleGroup) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        FormingGroup.insert(SU.NodeNum);
        CurrentColoring[SU.NodeNum] = ProposedColor;
        Count = 0;
      } else if (Count == GroupSize) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        Count = 0;
      }
    }
  }
}

void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      auto [Pos, Inserted] =
          ColorCombinations.try_emplace(SUColors, NextNonReservedID);
      if (Inserted)
        ++NextNonReservedID;
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}

void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;

  // Every combination of colors given by the top down
  // and bottom up Reserved node dependency gets its own color.

  for (const SUnit &SU : DAG->SUnits) {
    std::pair<unsigned, unsigned> SUColors;

    // High latency instructions: already given.
    if (CurrentColoring[SU.NodeNum])
      continue;

    SUColors.first = CurrentTopDownReservedDependencyColoring[SU.NodeNum];
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU.NodeNum];

    auto [Pos, Inserted] =
        ColorCombinations.try_emplace(SUColors, NextNonReservedID);
    CurrentColoring[SU.NodeNum] = Pos->second;
    if (Inserted)
      NextNonReservedID++;
  }
}

void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*llvm::max_element(CurrentBottomUpReservedDependencyColoring) == 0 &&
      *llvm::max_element(CurrentTopDownReservedDependencyColoring) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction into that block.
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = std::move(PendingColoring);
}


void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}

void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: VGPR constant loading.
    // Low latency instructions usually have a predecessor (the address).
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> ColorCount;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    ++ColorCount[color];
  }

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}

void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}

void SIScheduleBlockCreator::regroupNoUserInstructions() {
  unsigned DAGSize = DAG->SUnits.size();
  int GroupID = NextNonReservedID++;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    bool hasSuccessor = false;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      hasSuccessor = true;
    }
    if (!hasSuccessor)
      CurrentColoring[SU->NodeNum] = GroupID;
  }
}

void SIScheduleBlockCreator::colorExports() {
  unsigned ExportColor = NextNonReservedID++;
  SmallVector<unsigned, 8> ExpGroup;

  // Put all exports together in a block.
  // The block will naturally end up being scheduled last,
  // thus putting exports at the end of the schedule, which
  // is better for performance.
  // However we must ensure, for safety, the exports can be put
  // together in the same block without any other instruction.
  // This could happen, for example, when scheduling after regalloc
  // if reloading a spilled register from memory using the same
  // register as the one used in a previous export.
  // If that happens, do not regroup the exports.
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (SIInstrInfo::isEXP(*SU.getInstr())) {
      // SU is an export instruction. Check whether one of its successor
      // dependencies is a non-export, in which case we skip export grouping.
      for (const SDep &SuccDep : SU.Succs) {
        const SUnit *SuccSU = SuccDep.getSUnit();
        if (SuccDep.isWeak() || SuccSU->NodeNum >= DAG->SUnits.size()) {
          // Ignore these dependencies.
          continue;
        }
        assert(SuccSU->isInstr() &&
               "SUnit unexpectedly not representing an instruction!");

        if (!SIInstrInfo::isEXP(*SuccSU->getInstr())) {
          // A non-export depends on us. Skip export grouping.
          // Note that this is a bit pessimistic: We could still group all
          // other exports that are not depended on by non-exports, directly
          // or indirectly. Simply skipping this particular export but
          // grouping all others would not account for indirect dependencies.
          return;
        }
      }
      ExpGroup.push_back(SUNum);
    }
  }

  // The group can be formed. Give the color.
  for (unsigned j : ExpGroup)
    CurrentColoring[j] = ExportColor;
}

void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links that a previous scheduling variant may have overridden.
  DAG->restoreSULinksLeft();

  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  LLVM_DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();
  colorExports();

  // Put SUs of the same color into the same block.
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    auto [It, Inserted] = RealID.try_emplace(Color);
    if (Inserted) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      It->second = ID;
    }
    CurrentBlocks[It->second]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = It->second;
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free root and leafs of all blocks to enable scheduling inside them.
  for (SIScheduleBlock *Block : CurrentBlocks)
    Block->finalizeUnits();
  LLVM_DEBUG({
    dbgs() << "Blocks created:\n\n";
    for (SIScheduleBlock *Block : CurrentBlocks)
      Block->printDebug(true);
  });
}

// Helper function taken from CodeGen/MachineScheduler.cpp

/// Non-const version.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
    if (!I->isDebugInstr())
      break;
  }
  return I;
}

void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  LLVM_DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}

void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We first compute a valid schedule, such that a Block corresponds
  // to a contiguous range of instructions.
  LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring the previous position,
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  LLVM_DEBUG({
    for (SIScheduleBlock *Block : CurrentBlocks)
      Block->printDebug(true);
  });
}

void SIScheduleBlockCreator::fillStats() {
  unsigned DAGSize = CurrentBlocks.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getPreds().empty())
      Block->Depth = 0;
    else {
      unsigned Depth = 0;
      for (SIScheduleBlock *Pred : Block->getPreds()) {
        if (Depth < Pred->Depth + Pred->getCost())
          Depth = Pred->Depth + Pred->getCost();
      }
      Block->Depth = Depth;
    }
  }

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = BottomUpIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getSuccs().empty())
      Block->Height = 0;
    else {
      unsigned Height = 0;
      for (const auto &Succ : Block->getSuccs())
        Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
      Block->Height = Height;
    }
  }
}

// SIScheduleBlockScheduler //

SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output.
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an output
  // is not simply the number of child blocks having the same virtual register
  // as input.
  // Here is an example. A produces x and y. B eats x and produces x'.
  // C eats x' and y. The register coalescer may have attributed the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
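  // For the A/B/C example above (assuming x and x' were coalesced into the
  // same vreg %v): both A and B have %v among their outputs, and B and C
  // read it. B's read is attributed to A (its only producing parent), while
  // C's read is attributed to B, the producing parent with the highest
  // topological index, giving each output one recorded use instead of
  // charging A twice.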
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (SIScheduleBlock *Block : Blocks) {
    for (Register Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<Register> PredOutRegs = Pred->getOutRegs();
        std::set<Register>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      ++LiveOutRegsNumUsages[PredID][Reg];
    }
  }

  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  std::set<VirtRegOrUnit> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);

  // Increase LiveOutRegsNumUsages for blocks
  // producing registers consumed in another
  // scheduling region.
  for (VirtRegOrUnit VRegOrUnit : DAG->getOutRegs()) {
    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
      // Do reverse traversal.
      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
      SIScheduleBlock *Block = Blocks[ID];
      const std::set<Register> &OutRegs = Block->getOutRegs();

      if (!VRegOrUnit.isVirtualReg() ||
          OutRegs.find(VRegOrUnit.asVirtualReg()) == OutRegs.end())
        continue;

      ++LiveOutRegsNumUsages[ID][VRegOrUnit.asVirtualReg()];
      break;
    }
  }

  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (SIScheduleBlock *Block : Blocks) {
    for (Register Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<Register> PredOutRegs = Pred->getOutRegs();
        std::set<Register>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      if (!Found)
        ++LiveRegsConsumers[Reg];
    }
  }

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block
                                            : BlocksScheduled) {
    dbgs() << ' ' << Block->getID();
  } dbgs() << '\n';);
}

bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
                       Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                          TryCand, Cand, Latency))
    return true;
  if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
                                                   TryCand, Cand, Depth))
    return true;
  if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
                          Cand.NumHighLatencySuccessors,
                          TryCand, Cand, Successor))
    return true;
  return false;
}

bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
                       TryCand, Cand, RegUsage))
    return true;
  if (SISched::tryGreater(TryCand.NumSuccessors > 0,
                          Cand.NumSuccessors > 0,
                          TryCand, Cand, Successor))
    return true;
  if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    return true;
  if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
                       TryCand, Cand, RegUsage))
    return true;
  return false;
}

SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
  SIBlockSchedCandidate Cand;
  std::vector<SIScheduleBlock*>::iterator Best;
  SIScheduleBlock *Block;
  if (ReadyBlocks.empty())
    return nullptr;

  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
                        VregCurrentUsage, SregCurrentUsage);
  if (VregCurrentUsage > maxVregUsage)
    maxVregUsage = VregCurrentUsage;
  if (SregCurrentUsage > maxSregUsage)
    maxSregUsage = SregCurrentUsage;
  LLVM_DEBUG({
    dbgs() << "Picking New Blocks\n";
    dbgs() << "Available: ";
    for (SIScheduleBlock *Block : ReadyBlocks)
      dbgs() << Block->getID() << ' ';
    dbgs() << "\nCurrent Live:\n";
    for (Register Reg : LiveRegs)
      dbgs() << printReg(Reg, DAG->getTRI()) << ' ';
    dbgs() << '\n';
    dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
    dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
  });

  Cand.Block = nullptr;
  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
       E = ReadyBlocks.end(); I != E; ++I) {
    SIBlockSchedCandidate TryCand;
    TryCand.Block = *I;
    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
    TryCand.VGPRUsageDiff =
      checkRegUsageImpact(TryCand.Block->getInRegs(),
                          TryCand.Block->getOutRegs())[AMDGPU::RegisterPressureSets::VGPR_32];
    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
    TryCand.NumHighLatencySuccessors =
      TryCand.Block->getNumHighLatencySuccessors();
    TryCand.LastPosHighLatParentScheduled =
      (unsigned int) std::max<int> (0,
         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
           LastPosWaitedHighLatency);
    TryCand.Height = TryCand.Block->Height;
    // Try not to increase VGPR usage too much, else we may spill.
    if (VregCurrentUsage > 120 ||
        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
      if (!tryCandidateRegUsage(Cand, TryCand) &&
          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
        tryCandidateLatency(Cand, TryCand);
    } else {
      if (!tryCandidateLatency(Cand, TryCand))
        tryCandidateRegUsage(Cand, TryCand);
    }
    if (TryCand.Reason != NoCand) {
      Cand.setBest(TryCand);
      Best = I;
      LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
                        << getReasonStr(Cand.Reason) << '\n');
    }
  }

  LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
             dbgs() << "Is a block with high latency instruction: "
                    << (Cand.IsHighLatency ? "yes\n" : "no\n");
             dbgs() << "Position of last high latency dependency: "
                    << Cand.LastPosHighLatParentScheduled << '\n';
             dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
             dbgs() << '\n';);

  Block = Cand.Block;
  ReadyBlocks.erase(Best);
  return Block;
}
1630
1631// Tracking of currently alive registers to determine VGPR Usage.
1632
void SIScheduleBlockScheduler::addLiveRegs(std::set<VirtRegOrUnit> &Regs) {
  for (VirtRegOrUnit VRegOrUnit : Regs) {
    // For now only track virtual registers.
    if (!VRegOrUnit.isVirtualReg())
      continue;
    // If not already in the live set, then add it.
    (void)LiveRegs.insert(VRegOrUnit.asVirtualReg());
  }
}

void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
                                                std::set<Register> &Regs) {
  for (Register Reg : Regs) {
    // For now only track virtual registers.
    std::set<Register>::iterator Pos = LiveRegs.find(Reg);
    assert(Pos != LiveRegs.end() && // Reg must be live.
           LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
           LiveRegsConsumers[Reg] >= 1);
    --LiveRegsConsumers[Reg];
    if (LiveRegsConsumers[Reg] == 0)
      LiveRegs.erase(Pos);
  }
}

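// Notify the successors of Parent that one of their predecessors has been
// scheduled: a block whose last predecessor was just scheduled becomes
// ready, and data successors of a high latency block record the position at
// which that latency was issued.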
void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
  for (const auto &Block : Parent->getSuccs()) {
    if (--BlockNumPredsLeft[Block.first->getID()] == 0)
      ReadyBlocks.push_back(Block.first);

    if (Parent->isHighLatencyBlock() &&
        Block.second == SIScheduleBlockLinkKind::Data)
      LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
  }
}

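// Update the liveness and readiness bookkeeping after Block has been
// scheduled: consume its input registers, make its outputs live, release its
// successors, and advance the high latency wait position.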
void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
  decreaseLiveRegs(Block, Block->getInRegs());
  LiveRegs.insert(Block->getOutRegs().begin(), Block->getOutRegs().end());
  releaseBlockSuccs(Block);
  for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) {
    // Since we produce this register, it must not already be live.
    assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
           LiveRegsConsumers[RegP.first] == 0);
    LiveRegsConsumers[RegP.first] += RegP.second;
  }
  if (LastPosHighLatencyParentScheduled[Block->getID()] >
        (unsigned)LastPosWaitedHighLatency)
    LastPosWaitedHighLatency =
      LastPosHighLatencyParentScheduled[Block->getID()];
  ++NumBlockScheduled;
}

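// Compute, for every register pressure set, the pressure delta that would
// result from scheduling a block with the given input and output registers:
// inputs for which this block is the last consumer stop being live (negative
// contribution), while outputs become live (positive contribution).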
std::vector<int>
SIScheduleBlockScheduler::checkRegUsageImpact(std::set<Register> &InRegs,
                                              std::set<Register> &OutRegs) {
  std::vector<int> DiffSetPressure;
  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);

  for (Register Reg : InRegs) {
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    if (LiveRegsConsumers[Reg] > 1)
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(VirtRegOrUnit(Reg));
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] -= PSetI.getWeight();
    }
  }

  for (Register Reg : OutRegs) {
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(VirtRegOrUnit(Reg));
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] += PSetI.getWeight();
    }
  }

  return DiffSetPressure;
}

// SIScheduler //

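// Run one complete scheduling attempt: build the blocks with the requested
// block creation variant, order them with the requested block scheduling
// variant, and return the flattened SU order along with the maximal register
// usage observed.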
struct SIScheduleBlockResult
SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
  std::vector<SIScheduleBlock*> ScheduledBlocks;
  struct SIScheduleBlockResult Res;

  ScheduledBlocks = Scheduler.getBlocks();

  for (SIScheduleBlock *Block : ScheduledBlocks) {
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs)
      Res.SUs.push_back(SU->NodeNum);
  }

  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
  return Res;
}

// SIScheduleDAGMI //

SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) {
  SITII = static_cast<const SIInstrInfo*>(TII);
  SITRI = static_cast<const SIRegisterInfo*>(TRI);
}

SIScheduleDAGMI::~SIScheduleDAGMI() = default;

// Code adapted from ScheduleDAG.cpp.
// Performs a topological sort over the SUs, recording both the top-down and
// the bottom-up order.
void SIScheduleDAGMI::topologicalSort() {
  Topo.InitDAGTopologicalSorting();

  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
}

// Move low latency instructions further from their users without (in
// general) increasing SGPR usage. This is meant to be replaced by a better
// pass that would take SGPR usage into account (based on VGPR usage and the
// corresponding wavefront count), try to merge groups of loads when it makes
// sense, etc.
void SIScheduleDAGMI::moveLowLatencies() {
  unsigned DAGSize = SUnits.size();
  int LastLowLatencyUser = -1;
  int LastLowLatencyPos = -1;

  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[ScheduledSUnits[i]];
    bool IsLowLatencyUser = false;
    unsigned MinPos = 0;

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
        IsLowLatencyUser = true;
      }
      if (Pred->NodeNum >= DAGSize)
        continue;
      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
      if (PredPos >= MinPos)
        MinPos = PredPos + 1;
    }

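    // For a low latency instruction, hoist it as early as its dependencies
    // allow, but never before the last low latency instruction or the last
    // user of one, so consecutive low latency loads stay grouped and their
    // users are placed after them.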
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      unsigned BestPos = LastLowLatencyUser + 1;
      if ((int)BestPos <= LastLowLatencyPos)
        BestPos = LastLowLatencyPos + 1;
      if (BestPos < MinPos)
        BestPos = MinPos;
      if (BestPos < i) {
        for (unsigned u = i; u > BestPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[BestPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
      }
      LastLowLatencyPos = BestPos;
      if (IsLowLatencyUser)
        LastLowLatencyUser = BestPos;
    } else if (IsLowLatencyUser) {
      LastLowLatencyUser = i;
    // Also move the COPY instructions that the low latency instructions
    // depend on.
    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
      bool CopyForLowLat = false;
      for (SDep& SuccDep : SU->Succs) {
        SUnit *Succ = SuccDep.getSUnit();
        if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
          continue;
        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
          CopyForLowLat = true;
        }
      }
      if (!CopyForLowLat)
        continue;
      if (MinPos < i) {
        for (unsigned u = i; u > MinPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[MinPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
      }
    }
  }
}

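// Restore the dependency counters that scheduling consumed, using the backup
// taken before the first scheduling attempt, so that the SUnits can be
// scheduled again from scratch.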
void SIScheduleDAGMI::restoreSULinksLeft() {
  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    SUnits[i].isScheduled = false;
    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
  }
}

// Return the VGPR and SGPR usage corresponding to some virtual registers.
template<typename Iterator> void
SIScheduleDAGMI::fillVgprSgprCost(Iterator First, Iterator End,
                                  unsigned &VgprUsage, unsigned &SgprUsage) {
  VgprUsage = 0;
  SgprUsage = 0;
  for (Iterator RegI = First; RegI != End; ++RegI) {
    Register Reg = *RegI;
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    PSetIterator PSetI = MRI.getPressureSets(VirtRegOrUnit(Reg));
    for (; PSetI.isValid(); ++PSetI) {
      if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32)
        VgprUsage += PSetI.getWeight();
      else if (*PSetI == AMDGPU::RegisterPressureSets::SReg_32)
        SgprUsage += PSetI.getWeight();
    }
  }
}

void SIScheduleDAGMI::schedule() {
  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  SIScheduleBlockResult Best, Temp;
  LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");

  buildDAGWithRegPressure();
  postProcessDAG();

  LLVM_DEBUG(dump());
  if (PrintDAGs)
    dump();
  if (ViewMISchedDAGs)
    viewGraph();

  topologicalSort();
  findRootsAndBiasEdges(TopRoots, BotRoots);
  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
  // functions, but to make them happy we must initialize
  // the default Scheduler implementation (even if we do not
  // run it).
  SchedImpl->initialize(this);
  initQueues(TopRoots, BotRoots);

  // Fill some stats to help scheduling.

  SUnitsLinksBackup = SUnits;
  IsLowLatencySU.clear();
  LowLatencyOffset.clear();
  IsHighLatencySU.clear();

  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    const MachineOperand *BaseLatOp;
    int64_t OffLatReg;
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      IsLowLatencySU[i] = 1;
      bool OffsetIsScalable;
      if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
                                         OffsetIsScalable, TRI))
        LowLatencyOffset[i] = OffLatReg;
    } else if (SITII->isHighLatencyDef(SU->getInstr()->getOpcode()))
      IsHighLatencySU[i] = 1;
  }

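  // Start with the variant that usually performs best: isolate high latency
  // instructions in their own blocks and order blocks latency-first, with
  // register usage as a tie-breaker.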
  SIScheduler Scheduler(this);
  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);

  // If VGPR usage is extremely high, try other well-performing variants
  // which could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 180) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
      { LatenciesAlone, BlockRegUsageLatency },
//      { LatenciesAlone, BlockRegUsage },
      { LatenciesGrouped, BlockLatencyRegUsage },
//      { LatenciesGrouped, BlockRegUsageLatency },
//      { LatenciesGrouped, BlockRegUsage },
      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
//      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant,
                   SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
  // If VGPR usage is still extremely high, we may spill. Try other variants
  // which perform worse, but that could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 200) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
//      { LatenciesAlone, BlockRegUsageLatency },
      { LatenciesAlone, BlockRegUsage },
//      { LatenciesGrouped, BlockLatencyRegUsage },
      { LatenciesGrouped, BlockRegUsageLatency },
      { LatenciesGrouped, BlockRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant,
                   SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }

  ScheduledSUnits = Best.SUs;
  ScheduledSUnitsInv.resize(SUnits.size());

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
  }

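  // Post-pass on the flattened schedule: hoist low latency instructions away
  // from their users to give them time to complete.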
  moveLowLatencies();

  // Tell the outside world about the result of the scheduling.

  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
  TopRPTracker.setPos(CurrentTop);

  for (unsigned I : ScheduledSUnits) {
    SUnit *SU = &SUnits[I];

    scheduleMI(SU, true);

    LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                      << *SU->getInstr());
  }

  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  LLVM_DEBUG({
    dbgs() << "*** Final schedule for "
           << printMBBReference(*begin()->getParent()) << " ***\n";
    dumpSchedule();
    dbgs() << '\n';
  });
}