//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "SIMachineScheduler.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// for GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
//   for all the SGPR load instructions before your current instruction to
//   have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
//   for all the VGPR load instructions previous to the VGPR load instruction
//   you are interested in to finish.
// . The lower the register pressure, the better the load latencies are hidden.
//
// Moreover, some specificities (like the fact that a lot of instructions in
// the shader have few dependencies) make the generic scheduler behave
// unpredictably. For example, when register pressure becomes high, it can
// either manage to prevent register pressure from going too high, or it can
// increase register pressure even more than if it hadn't taken register
// pressure into account.
//
// It also produces other bad behaviours, like loading a constant into a VGPR
// at the beginning of the shader even though it won't be needed until the
// end of the shader.
//
// The scheduling problem for SI can be split into three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact that the HW can issue several instructions from
// different wavefronts if they are of different types, etc)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks also helps keep register usage low.
//
// First the instructions are put into blocks.
// We want the blocks to help control register usage and hide high latencies
// later. To help control register usage, we typically want all local
// computations, when for example you create a result that can be consumed
// right away, to be contained in a block. Block inputs and outputs would
// typically be important results that are needed in several locations of
// the shader. Since we do want blocks to help hide high latencies, we want
// the instructions inside the block to have a minimal set of dependencies
// on high latencies. It will make it easy to pick blocks to hide specific
// high latencies.
// The block creation algorithm is divided into several steps, and several
// variants can be tried during the scheduling process.
//
// Second the order of the instructions inside the blocks is chosen.
// At this step we take into account only register usage and the hiding of
// low latency instructions.
//
// Third the block order is chosen; there we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
//
// Note that 'low latency' and 'high latency' here cover both the latency for
// the data to go from the cache (or global memory) to the register, and the
// bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, which is why even if the wavefront count is high
// we still have to try to have as many instructions hiding high latencies as
// possible.
// The OpenCL doc says for example latency of 400 cycles for a global mem
// access, which is hidden by 10 instructions if the wavefront count is 10.
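//
// (A rough worked example of that figure, assuming the 4 cycles per wavefront
// instruction mentioned below: with 10 wavefronts in flight, each instruction
// issued covers about 10 * 4 = 40 cycles of the stalled wavefront's latency,
// so roughly 10 independent instructions hide the 400 cycles.)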

// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64-byte
// lines.
// Constant cache is shared with 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
// instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated to the 3 other VALUs
// of the CU do texture sampling too. (Don't take these figures too seriously,
// as I'm not 100% sure of the computation)
// Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit".
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as an s_buffer_loadx2 and potentially an s_buffer_loadx8 if on the
// same cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can be
// 300-600 cycles. We do not specially take that into account when scheduling,
// as we expect the driver to be able to preload the constants soon.

// common code //

#ifndef NDEBUG

static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand:    return "NOCAND";
  case RegUsage:  return "REGUSAGE";
  case Latency:   return "LATENCY";
  case Successor: return "SUCCESSOR";
  case Depth:     return "DEPTH";
  case NodeOrder: return "ORDER";
  }
  llvm_unreachable("Unknown reason!");
}

#endif

namespace llvm::SISched {
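// tryLess/tryGreater below implement one step of a prioritized comparison
// chain: they return true as soon as the two candidates differ on the current
// criterion (recording in TryCand.Reason why TryCand wins, or lowering
// Cand.Reason when Cand wins), and false on a tie, so the caller can fall
// through to the next, lower-priority criterion.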
static bool tryLess(int TryVal, int CandVal,
                    SISchedulerCandidate &TryCand,
                    SISchedulerCandidate &Cand,
                    SIScheduleCandReason Reason) {
  if (TryVal < CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

static bool tryGreater(int TryVal, int CandVal,
                       SISchedulerCandidate &TryCand,
                       SISchedulerCandidate &Cand,
                       SIScheduleCandReason Reason) {
  if (TryVal > CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal < CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}
} // end namespace llvm::SISched

// SIScheduleBlock //

void SIScheduleBlock::addUnit(SUnit *SU) {
  NodeNum2Index[SU->NodeNum] = SUnits.size();
  SUnits.push_back(SU);
}

#ifndef NDEBUG
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {

  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
#endif

void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  if (Cand.SGPRUsage > 60 &&
      SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as close to the top as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  //     - (possibly some more low latency instructions)
  //     - instructions that depend on the first low latency instructions.
  // If the block contains a lot of constant loads, SGPR usage can get quite
  // high; going above the arbitrary limit of 60 encourages using the already
  // loaded constants (in order to release some SGPRs) before loading more.
  if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
                       Cand.HasLowLatencyNonWaitedParent,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                          TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (TryCand.IsLowLatency &&
      SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}

SUnit* SIScheduleBlock::pickNode() {
  SISchedCandidate TopCand;

  for (SUnit* SU : TopReadySUs) {
    SISchedCandidate TryCand;
    std::vector<unsigned> pressure;
    std::vector<unsigned> MaxPressure;
    // Predict register usage after this instruction.
    TryCand.SU = SU;
    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    TryCand.SGPRUsage = pressure[AMDGPU::RegisterPressureSets::SReg_32];
    TryCand.VGPRUsage = pressure[AMDGPU::RegisterPressureSets::VGPR_32];
    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    TryCand.HasLowLatencyNonWaitedParent =
      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    tryCandidateTopDown(TopCand, TryCand);
    if (TryCand.Reason != NoCand)
      TopCand.setBest(TryCand);
  }

  return TopCand.SU;
}


// Schedule something valid: emit the SUnits in any topologically valid order.
void SIScheduleBlock::fastSchedule() {
  TopReadySUs.clear();
  if (Scheduled)
    undoSchedule();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = TopReadySUs[0];
    ScheduledSUnits.push_back(SU);
    nodeScheduled(SU);
  }

  Scheduled = true;
}

// Returns true if the register was defined between First and Last.
static bool isDefBetween(Register Reg, SlotIndex First, SlotIndex Last,
                         const MachineRegisterInfo *MRI,
                         const LiveIntervals *LIS) {
  for (const MachineInstr &MI : MRI->def_instructions(Reg)) {
    if (MI.isDebugValue())
      continue;
    SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot();
    if (InstSlot >= First && InstSlot <= Last)
      return true;
  }
  return false;
}

void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Go through all the SUs. RPTracker captures what had to be alive for the
  // SUs to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not track physical registers, because it messes things up.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (RegMaskPair.RegUnit.isVirtual())
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situations 4, 5, 6, 7.
  // We want LiveOutRegs to contain only the regs whose content will be read
  // after this block in another block, and whose content was written in the
  // current block, that is we want it to get 1, 3, 5, 7.
  // Since we made the MIs of a block be packed all together before
  // scheduling, the LiveIntervals are correct, and the RPTracker is able to
  // correctly handle 5 vs 6 and 2 vs 3.
  // (Note: this is not sufficient for the RPTracker to avoid mistakes for
  // case 4.)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5, 7.
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7.
  // The use of isDefBetween removes case 4.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    Register Reg = RegMaskPair.RegUnit;
    if (Reg.isVirtual() &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally LLVM will represent some registers as big 128-bit registers,
  // for example, but they actually correspond to 4 actual 32-bit registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}

void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionalPressure.
  InternalAdditionalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
         TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
           SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}

void SIScheduleBlock::undoSchedule() {
  for (SUnit* SU : SUnits) {
    SU->isScheduled = false;
    for (SDep& Succ : SU->Succs) {
      if (BC->isSUInBlock(Succ.getSUnit(), ID))
        undoReleaseSucc(SU, &Succ);
    }
  }
  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
  ScheduledSUnits.clear();
  Scheduled = false;
}

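// Exact inverse of releaseSucc below: re-increments the counters so that
// undoSchedule can restore the predecessor counts before a block is
// rescheduled with a different strategy.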
void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    ++SuccSU->WeakPredsLeft;
    return;
  }
  ++SuccSU->NumPredsLeft;
}

void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    DAG->dumpNode(*SuccSU);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}

/// Release the successors of SU that are in the block (if InOrOutBlock is
/// true) or outside the block (if InOrOutBlock is false).
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
  for (SDep& Succ : SU->Succs) {
    SUnit *SuccSU = Succ.getSUnit();

    if (SuccSU->NodeNum >= DAG->SUnits.size())
      continue;

    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
      continue;

    releaseSucc(SU, &Succ);
    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
      TopReadySUs.push_back(SuccSU);
  }
}

void SIScheduleBlock::nodeScheduled(SUnit *SU) {
  // Is in TopReadySUs
  assert(!SU->NumPredsLeft);
  std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
  if (I == TopReadySUs.end()) {
    dbgs() << "Data Structure Bug in SI Scheduler\n";
    llvm_unreachable(nullptr);
  }
  TopReadySUs.erase(I);

  releaseSuccessors(SU, true);
  // Scheduling this node will trigger a wait, so propagate to the other
  // instructions that they do not need to wait either.
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);

  if (DAG->IsLowLatencySU[SU->NodeNum]) {
    for (SDep& Succ : SU->Succs) {
      std::map<unsigned, unsigned>::iterator I =
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
      if (I != NodeNum2Index.end())
        HasLowLatencyNonWaitedParent[I->second] = 1;
    }
  }
  SU->isScheduled = true;
}

void SIScheduleBlock::finalizeUnits() {
  // We remove links from outside blocks to enable scheduling inside the block.
  for (SUnit* SU : SUnits) {
    releaseSuccessors(SU, false);
    if (DAG->IsHighLatencySU[SU->NodeNum])
      HighLatencyBlock = true;
  }
  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
}

// We maintain ascending order of IDs.
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
  unsigned PredID = Pred->getID();

  // Check if not already predecessor.
  for (SIScheduleBlock* P : Preds) {
    if (PredID == P->getID())
      return;
  }
  Preds.push_back(Pred);

  assert(none_of(Succs,
                 [=](std::pair<SIScheduleBlock*,
                     SIScheduleBlockLinkKind> S) {
                   return PredID == S.first->getID();
                 }) &&
         "Loop in the Block Graph!");
}

void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
                              SIScheduleBlockLinkKind Kind) {
  unsigned SuccID = Succ->getID();

  // Check if not already a successor.
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
    if (SuccID == S.first->getID()) {
      if (S.second == SIScheduleBlockLinkKind::NoData &&
          Kind == SIScheduleBlockLinkKind::Data)
        S.second = Kind;
      return;
    }
  }
  if (Succ->isHighLatencyBlock())
    ++NumHighLatencySuccessors;
  Succs.emplace_back(Succ, Kind);

  assert(none_of(Preds,
                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
         "Loop in the Block Graph!");
}

#ifndef NDEBUG
void SIScheduleBlock::printDebug(bool full) {
  dbgs() << "Block (" << ID << ")\n";
  if (!full)
    return;

  dbgs() << "\nContains High Latency Instruction: "
         << HighLatencyBlock << '\n';
  dbgs() << "\nDepends On:\n";
  for (SIScheduleBlock* P : Preds) {
    P->printDebug(false);
  }

  dbgs() << "\nSuccessors:\n";
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
    if (S.second == SIScheduleBlockLinkKind::Data)
      dbgs() << "(Data Dep) ";
    S.first->printDebug(false);
  }

  if (Scheduled) {
    dbgs() << "LiveInPressure "
           << LiveInPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveInPressure[AMDGPU::RegisterPressureSets::VGPR_32] << '\n';
    dbgs() << "LiveOutPressure "
           << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n";
    dbgs() << "LiveIns:\n";
    for (Register Reg : LiveInRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';

    dbgs() << "\nLiveOuts:\n";
    for (Register Reg : LiveOutRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
  }

  dbgs() << "\nInstructions:\n";
  for (const SUnit* SU : SUnits)
    DAG->dumpNode(*SU);

  dbgs() << "///////////////////////\n";
}
#endif

// SIScheduleBlockCreator //

SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG)
    : DAG(DAG) {}

SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
      Blocks.find(BlockVariant);
  if (B == Blocks.end()) {
    SIScheduleBlocks Res;
    createBlocksForVariant(BlockVariant);
    topologicalSort();
    scheduleInsideBlocks();
    fillStats();
    Res.Blocks = CurrentBlocks;
    Res.TopDownIndex2Block = TopDownIndex2Block;
    Res.TopDownBlock2Index = TopDownBlock2Index;
    Blocks[BlockVariant] = Res;
    return Res;
  }
  return B->second;
}

bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
  if (SU->NodeNum >= DAG->SUnits.size())
    return false;
  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
}

void SIScheduleBlockCreator::colorHighLatenciesAlone() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      CurrentColoring[SU->NodeNum] = NextReservedID++;
    }
  }
}

static bool
hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
  for (const auto &PredDep : SU.Preds) {
    if (PredDep.getSUnit() == &FromSU &&
        PredDep.getKind() == llvm::SDep::Data)
      return true;
  }
  return false;
}

void SIScheduleBlockCreator::colorHighLatenciesGroups() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned NumHighLatencies = 0;
  unsigned GroupSize;
  int Color = NextReservedID;
  unsigned Count = 0;
  std::set<unsigned> FormingGroup;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum])
      ++NumHighLatencies;
  }

  if (NumHighLatencies == 0)
    return;

  if (NumHighLatencies <= 6)
    GroupSize = 2;
  else if (NumHighLatencies <= 12)
    GroupSize = 3;
  else
    GroupSize = 4;
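  // For instance, with 9 high latency instructions this aims at groups of up
  // to 3 (fewer if an incompatible instruction forces a group to end early).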

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (DAG->IsHighLatencySU[SU.NodeNum]) {
      bool CompatibleGroup = true;
      int ProposedColor = Color;
      std::vector<int> AdditionalElements;

      // We don't want to put in the same block
      // two high latency instructions that depend
      // on each other.
      // One way would be to check canAddEdge
      // in both directions, but that currently is not
      // enough because there the high latency order is
      // enforced (via links).
      // Instead, look at the dependencies between the
      // high latency instructions and deduce if it is
      // a data dependency or not.
      for (unsigned j : FormingGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other
        if (SubGraph.size() > 5) {
          // Too many elements would be required to be added to the block.
          CompatibleGroup = false;
          break;
        }
        // Check the type of dependency
        for (unsigned k : SubGraph) {
          // If in the path to join the two instructions,
          // there is another high latency instruction,
          // or instructions colored for another block,
          // abort the merge.
          if (DAG->IsHighLatencySU[k] || (CurrentColoring[k] != ProposedColor &&
                                          CurrentColoring[k] != 0)) {
            CompatibleGroup = false;
            break;
          }
          // If one of the SUs in the subgraph depends on the result of SU j,
          // there'll be a data dependency.
          if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
            CompatibleGroup = false;
            break;
          }
        }
        if (!CompatibleGroup)
          break;
        // Same check for the SU
        if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
          CompatibleGroup = false;
          break;
        }
        // Add all the required instructions to the block.
        // These cannot live in another block (because they depend
        // (order dependency) on one of the instructions in the block
        // and are required for the high latency instruction we add).
        llvm::append_range(AdditionalElements, SubGraph);
      }
      if (CompatibleGroup) {
        FormingGroup.insert(SU.NodeNum);
        for (unsigned j : AdditionalElements)
          CurrentColoring[j] = ProposedColor;
        CurrentColoring[SU.NodeNum] = ProposedColor;
        ++Count;
      }
      // Found one incompatible instruction,
      // or has filled a big enough group.
      // -> start a new one.
      if (!CompatibleGroup) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        FormingGroup.insert(SU.NodeNum);
        CurrentColoring[SU.NodeNum] = ProposedColor;
        Count = 0;
      } else if (Count == GroupSize) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        Count = 0;
      }
    }
  }
}

void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.
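  // For example, two SUs whose transitive predecessors are exactly the
  // reserved (high latency) colors {2, 3} receive the same fresh color,
  // while an SU depending only on color {2} receives a different one.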

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      auto [Pos, Inserted] =
          ColorCombinations.try_emplace(SUColors, NextNonReservedID);
      if (Inserted)
        ++NextNonReservedID;
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as successors.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}

void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;

  // Every combination of colors given by the top down and bottom up
  // reserved node dependency colorings gets its own color.

  for (const SUnit &SU : DAG->SUnits) {
    std::pair<unsigned, unsigned> SUColors;

    // High latency instructions: already given.
    if (CurrentColoring[SU.NodeNum])
      continue;

    SUColors.first = CurrentTopDownReservedDependencyColoring[SU.NodeNum];
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU.NodeNum];

    auto [Pos, Inserted] =
        ColorCombinations.try_emplace(SUColors, NextNonReservedID);
    CurrentColoring[SU.NodeNum] = Pos->second;
    if (Inserted)
      NextNonReservedID++;
  }
}

void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*llvm::max_element(CurrentBottomUpReservedDependencyColoring) == 0 &&
      *llvm::max_element(CurrentTopDownReservedDependencyColoring) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction into that block.
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}


void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}

void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: VGPR constant loading.
    // Low latency instructions usually have a predecessor (the address).
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> ColorCount;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    ++ColorCount[color];
  }

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}

void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}

void SIScheduleBlockCreator::regroupNoUserInstructions() {
  unsigned DAGSize = DAG->SUnits.size();
  int GroupID = NextNonReservedID++;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    bool hasSuccessor = false;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      hasSuccessor = true;
    }
    if (!hasSuccessor)
      CurrentColoring[SU->NodeNum] = GroupID;
  }
}

void SIScheduleBlockCreator::colorExports() {
  unsigned ExportColor = NextNonReservedID++;
  SmallVector<unsigned, 8> ExpGroup;

  // Put all exports together in a block.
  // The block will naturally end up being scheduled last,
  // thus putting exports at the end of the schedule, which
  // is better for performance.
  // However we must ensure, for safety, the exports can be put
  // together in the same block without any other instruction.
  // This could happen, for example, when scheduling after regalloc
  // if reloading a spilled register from memory using the same
  // register as was used in a previous export.
  // If that happens, do not regroup the exports.
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (SIInstrInfo::isEXP(*SU.getInstr())) {
      // SU is an export instruction. Check whether one of its successor
      // dependencies is a non-export, in which case we skip export grouping.
      for (const SDep &SuccDep : SU.Succs) {
        const SUnit *SuccSU = SuccDep.getSUnit();
        if (SuccDep.isWeak() || SuccSU->NodeNum >= DAG->SUnits.size()) {
          // Ignore these dependencies.
          continue;
        }
        assert(SuccSU->isInstr() &&
               "SUnit unexpectedly not representing an instruction!");

        if (!SIInstrInfo::isEXP(*SuccSU->getInstr())) {
          // A non-export depends on us. Skip export grouping.
          // Note that this is a bit pessimistic: We could still group all other
          // exports that are not depended on by non-exports, directly or
          // indirectly. Simply skipping this particular export but grouping all
          // others would not account for indirect dependencies.
          return;
        }
      }
      ExpGroup.push_back(SUNum);
    }
  }

  // The group can be formed. Give the color.
  for (unsigned j : ExpGroup)
    CurrentColoring[j] = ExportColor;
}

void SIScheduleBlockCreator::createBlocksForVariant(
    SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links previous scheduling variant has overridden.
  DAG->restoreSULinksLeft();

  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  LLVM_DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant ==
      SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();
  colorExports();

  // Put SUs of same color into same block
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    auto [It, Inserted] = RealID.try_emplace(Color);
    if (Inserted) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      It->second = ID;
    }
    CurrentBlocks[It->second]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = It->second;
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free the roots and leaves of all blocks to enable scheduling inside them.
  for (SIScheduleBlock *Block : CurrentBlocks)
    Block->finalizeUnits();
  LLVM_DEBUG({
    dbgs() << "Blocks created:\n\n";
    for (SIScheduleBlock *Block : CurrentBlocks)
      Block->printDebug(true);
  });
}

// Helper function taken from CodeGen/MachineScheduler.cpp.

/// Non-const version.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
    if (!I->isDebugInstr())
      break;
  }
  return I;
}

void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  LLVM_DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
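    // TopDownBlock2Index temporarily holds the number of unscheduled
    // successors; it is overwritten with the real topological index once the
    // block is popped from the worklist below.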
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}

void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We first emit a valid schedule in which each Block corresponds
  // to a contiguous range of instructions.
  LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code and the part restoring the previous position
  // are by far the most expensive operations of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndex = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndex];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  LLVM_DEBUG({
    for (SIScheduleBlock *Block : CurrentBlocks)
      Block->printDebug(true);
  });
}

void SIScheduleBlockCreator::fillStats() {
  unsigned DAGSize = CurrentBlocks.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndex = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndex];
    if (Block->getPreds().empty())
      Block->Depth = 0;
    else {
      unsigned Depth = 0;
      for (SIScheduleBlock *Pred : Block->getPreds()) {
        if (Depth < Pred->Depth + Pred->getCost())
          Depth = Pred->Depth + Pred->getCost();
      }
      Block->Depth = Depth;
    }
  }

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndex = BottomUpIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndex];
    if (Block->getSuccs().empty())
      Block->Height = 0;
    else {
      unsigned Height = 0;
      for (const auto &Succ : Block->getSuccs())
        Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
      Block->Height = Height;
    }
  }
}

// SIScheduleBlockScheduler //

SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output.
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an output
  // is not the sum of child blocks having as input the same virtual register.
  // Here is an example. A produces x and y. B eats x and produces x'.
  // C eats x' and y. The register coalescer may have attributed the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (SIScheduleBlock *Block : Blocks) {
    for (Register Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<Register> PredOutRegs = Pred->getOutRegs();
        std::set<Register>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      ++LiveOutRegsNumUsages[PredID][Reg];
    }
  }

  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif
1448 | |
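  // Registers live into the scheduling region are alive before any block is
  // scheduled.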
1449 | std::set<Register> InRegs = DAG->getInRegs(); |
  addLiveRegs(InRegs);
1451 | |
1452 | // Increase LiveOutRegsNumUsages for blocks |
1453 | // producing registers consumed in another |
1454 | // scheduling region. |
1455 | for (Register Reg : DAG->getOutRegs()) { |
1456 | for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { |
1457 | // Do reverse traversal |
1458 | int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i]; |
1459 | SIScheduleBlock *Block = Blocks[ID]; |
1460 | const std::set<Register> &OutRegs = Block->getOutRegs(); |
1461 | |
      if (OutRegs.find(Reg) == OutRegs.end())
1463 | continue; |
1464 | |
1465 | ++LiveOutRegsNumUsages[ID][Reg]; |
1466 | break; |
1467 | } |
1468 | } |
1469 | |
1470 | // Fill LiveRegsConsumers for regs that were already |
1471 | // defined before scheduling. |
1472 | for (SIScheduleBlock *Block : Blocks) { |
1473 | for (Register Reg : Block->getInRegs()) { |
1474 | bool Found = false; |
1475 | for (SIScheduleBlock* Pred: Block->getPreds()) { |
1476 | std::set<Register> PredOutRegs = Pred->getOutRegs(); |
        std::set<Register>::iterator RegPos = PredOutRegs.find(Reg);
1478 | |
1479 | if (RegPos != PredOutRegs.end()) { |
1480 | Found = true; |
1481 | break; |
1482 | } |
1483 | } |
1484 | |
1485 | if (!Found) |
1486 | ++LiveRegsConsumers[Reg]; |
1487 | } |
1488 | } |
1489 | |
1490 | for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { |
1491 | SIScheduleBlock *Block = Blocks[i]; |
1492 | if (BlockNumPredsLeft[i] == 0) { |
      ReadyBlocks.push_back(Block);
1494 | } |
1495 | } |
1496 | |
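  // Greedily emit blocks: repeatedly pick the best ready block until none is
  // left.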
1497 | while (SIScheduleBlock *Block = pickBlock()) { |
    BlocksScheduled.push_back(Block);
1499 | blockScheduled(Block); |
1500 | } |
1501 | |
1502 | LLVM_DEBUG(dbgs() << "Block Order:" ; for (SIScheduleBlock *Block |
1503 | : BlocksScheduled) { |
1504 | dbgs() << ' ' << Block->getID(); |
1505 | } dbgs() << '\n';); |
1506 | } |
1507 | |
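// Decide whether TryCand is a better pick than Cand when scheduling for
// latency: prefer the block whose last high-latency parent was scheduled
// furthest in the past, then high-latency blocks themselves, then the
// highest block, then the block with the most high-latency successors.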
1508 | bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand, |
1509 | SIBlockSchedCandidate &TryCand) { |
1510 | if (!Cand.isValid()) { |
1511 | TryCand.Reason = NodeOrder; |
1512 | return true; |
1513 | } |
1514 | |
1515 | // Try to hide high latencies. |
  if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
                       Cand.LastPosHighLatParentScheduled, TryCand, Cand,
                       Latency))
1518 | return true; |
1519 | // Schedule high latencies early so you can hide them better. |
  if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                          TryCand, Cand, Latency))
1522 | return true; |
  if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
                                                   TryCand, Cand, Depth))
1525 | return true; |
  if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
                          Cand.NumHighLatencySuccessors,
                          TryCand, Cand, Successor))
1529 | return true; |
1530 | return false; |
1531 | } |
1532 | |
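// Decide whether TryCand is a better pick than Cand when scheduling for
// register usage: prefer blocks that do not increase VGPR pressure, then
// blocks that unlock successors, then the highest block, then the smallest
// VGPR usage delta.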
1533 | bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand, |
1534 | SIBlockSchedCandidate &TryCand) { |
1535 | if (!Cand.isValid()) { |
1536 | TryCand.Reason = NodeOrder; |
1537 | return true; |
1538 | } |
1539 | |
  if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
                       TryCand, Cand, RegUsage))
1542 | return true; |
  if (SISched::tryGreater(TryCand.NumSuccessors > 0,
                          Cand.NumSuccessors > 0,
                          TryCand, Cand, Successor))
1546 | return true; |
  if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
1548 | return true; |
  if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
                       TryCand, Cand, RegUsage))
1551 | return true; |
1552 | return false; |
1553 | } |
1554 | |
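// Pick the next block to schedule among ReadyBlocks, using either the
// latency or the register usage heuristic depending on the variant and on
// the current VGPR pressure. Returns nullptr when no block is ready.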
1555 | SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { |
1556 | SIBlockSchedCandidate Cand; |
1557 | std::vector<SIScheduleBlock*>::iterator Best; |
1558 | SIScheduleBlock *Block; |
1559 | if (ReadyBlocks.empty()) |
1560 | return nullptr; |
1561 | |
  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
                        VregCurrentUsage, SregCurrentUsage);
1564 | if (VregCurrentUsage > maxVregUsage) |
1565 | maxVregUsage = VregCurrentUsage; |
1566 | if (SregCurrentUsage > maxSregUsage) |
1567 | maxSregUsage = SregCurrentUsage; |
1568 | LLVM_DEBUG(dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: " ; |
1569 | for (SIScheduleBlock *Block : ReadyBlocks) |
1570 | dbgs() << Block->getID() << ' '; |
1571 | dbgs() << "\nCurrent Live:\n" ; |
1572 | for (Register Reg : LiveRegs) |
1573 | dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; |
1574 | dbgs() << '\n'; |
1575 | dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; |
1576 | dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';); |
1577 | |
1578 | Cand.Block = nullptr; |
1579 | for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(), |
1580 | E = ReadyBlocks.end(); I != E; ++I) { |
1581 | SIBlockSchedCandidate TryCand; |
1582 | TryCand.Block = *I; |
1583 | TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock(); |
1584 | TryCand.VGPRUsageDiff = |
      checkRegUsageImpact(TryCand.Block->getInRegs(),
                          TryCand.Block->getOutRegs())[AMDGPU::RegisterPressureSets::VGPR_32];
1587 | TryCand.NumSuccessors = TryCand.Block->getSuccs().size(); |
1588 | TryCand.NumHighLatencySuccessors = |
1589 | TryCand.Block->getNumHighLatencySuccessors(); |
1590 | TryCand.LastPosHighLatParentScheduled = |
      (unsigned int) std::max<int> (0,
         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
           LastPosWaitedHighLatency);
1594 | TryCand.Height = TryCand.Block->Height; |
1595 | // Try not to increase VGPR usage too much, else we may spill. |
1596 | if (VregCurrentUsage > 120 || |
1597 | Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) { |
1598 | if (!tryCandidateRegUsage(Cand, TryCand) && |
1599 | Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage) |
1600 | tryCandidateLatency(Cand, TryCand); |
1601 | } else { |
1602 | if (!tryCandidateLatency(Cand, TryCand)) |
1603 | tryCandidateRegUsage(Cand, TryCand); |
1604 | } |
1605 | if (TryCand.Reason != NoCand) { |
1606 | Cand.setBest(TryCand); |
1607 | Best = I; |
1608 | LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' ' |
1609 | << getReasonStr(Cand.Reason) << '\n'); |
1610 | } |
1611 | } |
1612 | |
1613 | LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n'; |
1614 | dbgs() << "Is a block with high latency instruction: " |
             << (Cand.IsHighLatency ? "yes\n" : "no\n");
1616 | dbgs() << "Position of last high latency dependency: " |
1617 | << Cand.LastPosHighLatParentScheduled << '\n'; |
1618 | dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n'; |
1619 | dbgs() << '\n';); |
1620 | |
1621 | Block = Cand.Block; |
  ReadyBlocks.erase(Best);
1623 | return Block; |
1624 | } |
1625 | |
// Tracking of currently live registers to determine VGPR usage.
1627 | |
1628 | void SIScheduleBlockScheduler::addLiveRegs(std::set<Register> &Regs) { |
1629 | for (Register Reg : Regs) { |
1630 | // For now only track virtual registers. |
1631 | if (!Reg.isVirtual()) |
1632 | continue; |
1633 | // If not already in the live set, then add it. |
    (void) LiveRegs.insert(Reg);
1635 | } |
1636 | } |
1637 | |
1638 | void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block, |
1639 | std::set<Register> &Regs) { |
1640 | for (Register Reg : Regs) { |
1641 | // For now only track virtual registers. |
    std::set<Register>::iterator Pos = LiveRegs.find(Reg);
1643 | assert (Pos != LiveRegs.end() && // Reg must be live. |
1644 | LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && |
1645 | LiveRegsConsumers[Reg] >= 1); |
1646 | --LiveRegsConsumers[Reg]; |
1647 | if (LiveRegsConsumers[Reg] == 0) |
      LiveRegs.erase(Pos);
1649 | } |
1650 | } |
1651 | |
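// Release the successors of a scheduled block: decrement their predecessor
// counts (moving them to ReadyBlocks when the count reaches zero) and record
// at which position a high-latency data parent was scheduled.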
1652 | void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) { |
1653 | for (const auto &Block : Parent->getSuccs()) { |
1654 | if (--BlockNumPredsLeft[Block.first->getID()] == 0) |
      ReadyBlocks.push_back(Block.first);
1656 | |
1657 | if (Parent->isHighLatencyBlock() && |
1658 | Block.second == SIScheduleBlockLinkKind::Data) |
1659 | LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled; |
1660 | } |
1661 | } |
1662 | |
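// Update the scheduler state once Block has been picked: live registers,
// ready successors, consumer counts, and high-latency bookkeeping.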
1663 | void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { |
  decreaseLiveRegs(Block, Block->getInRegs());
  addLiveRegs(Block->getOutRegs());
  releaseBlockSuccs(Block);
1667 | for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) { |
1668 | // We produce this register, thus it must not be previously alive. |
1669 | assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || |
1670 | LiveRegsConsumers[RegP.first] == 0); |
1671 | LiveRegsConsumers[RegP.first] += RegP.second; |
1672 | } |
1673 | if (LastPosHighLatencyParentScheduled[Block->getID()] > |
1674 | (unsigned)LastPosWaitedHighLatency) |
1675 | LastPosWaitedHighLatency = |
1676 | LastPosHighLatencyParentScheduled[Block->getID()]; |
1677 | ++NumBlockScheduled; |
1678 | } |
1679 | |
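// Estimate, for every register pressure set, the pressure change that
// scheduling a block would cause: input registers whose last consumer the
// block is are freed, and output registers become live.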
1680 | std::vector<int> |
1681 | SIScheduleBlockScheduler::checkRegUsageImpact(std::set<Register> &InRegs, |
1682 | std::set<Register> &OutRegs) { |
1683 | std::vector<int> DiffSetPressure; |
  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
1685 | |
1686 | for (Register Reg : InRegs) { |
1687 | // For now only track virtual registers. |
1688 | if (!Reg.isVirtual()) |
1689 | continue; |
1690 | if (LiveRegsConsumers[Reg] > 1) |
1691 | continue; |
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1693 | for (; PSetI.isValid(); ++PSetI) { |
1694 | DiffSetPressure[*PSetI] -= PSetI.getWeight(); |
1695 | } |
1696 | } |
1697 | |
1698 | for (Register Reg : OutRegs) { |
1699 | // For now only track virtual registers. |
1700 | if (!Reg.isVirtual()) |
1701 | continue; |
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1703 | for (; PSetI.isValid(); ++PSetI) { |
1704 | DiffSetPressure[*PSetI] += PSetI.getWeight(); |
1705 | } |
1706 | } |
1707 | |
1708 | return DiffSetPressure; |
1709 | } |
1710 | |
1711 | // SIScheduler // |
1712 | |
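// Run one (block creation, block scheduling) variant combination and return
// the resulting instruction order together with its register usage.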
1713 | struct SIScheduleBlockResult |
1714 | SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, |
1715 | SISchedulerBlockSchedulerVariant ScheduleVariant) { |
1716 | SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant); |
1717 | SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks); |
1718 | std::vector<SIScheduleBlock*> ScheduledBlocks; |
1719 | struct SIScheduleBlockResult Res; |
1720 | |
1721 | ScheduledBlocks = Scheduler.getBlocks(); |
1722 | |
1723 | for (SIScheduleBlock *Block : ScheduledBlocks) { |
1724 | std::vector<SUnit*> SUs = Block->getScheduledUnits(); |
1725 | |
1726 | for (SUnit* SU : SUs) |
      Res.SUs.push_back(SU->NodeNum);
1728 | } |
1729 | |
1730 | Res.MaxSGPRUsage = Scheduler.getSGPRUsage(); |
1731 | Res.MaxVGPRUsage = Scheduler.getVGPRUsage(); |
1732 | return Res; |
1733 | } |
1734 | |
1735 | // SIScheduleDAGMI // |
1736 | |
1737 | SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) : |
  ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) {
1739 | SITII = static_cast<const SIInstrInfo*>(TII); |
1740 | SITRI = static_cast<const SIRegisterInfo*>(TRI); |
1741 | } |
1742 | |
1743 | SIScheduleDAGMI::~SIScheduleDAGMI() = default; |
1744 | |
1745 | // Code adapted from scheduleDAG.cpp |
// Does a topological sort over the SUs,
// both top-down and bottom-up.
1748 | void SIScheduleDAGMI::topologicalSort() { |
1749 | Topo.InitDAGTopologicalSorting(); |
1750 | |
1751 | TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end()); |
1752 | BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend()); |
1753 | } |
1754 | |
// Move low latency instructions further from their users without
// increasing SGPR usage (in general).
// This is to be replaced by a better pass that would take SGPR usage
// into account (based on VGPR usage and the corresponding wavefront
// count), would try to merge groups of loads when it makes sense, etc.
1761 | void SIScheduleDAGMI::moveLowLatencies() { |
1762 | unsigned DAGSize = SUnits.size(); |
1763 | int LastLowLatencyUser = -1; |
1764 | int LastLowLatencyPos = -1; |
1765 | |
1766 | for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) { |
1767 | SUnit *SU = &SUnits[ScheduledSUnits[i]]; |
1768 | bool IsLowLatencyUser = false; |
1769 | unsigned MinPos = 0; |
1770 | |
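    // MinPos is the earliest position that keeps every predecessor of SU
    // before SU in the schedule.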
1771 | for (SDep& PredDep : SU->Preds) { |
1772 | SUnit *Pred = PredDep.getSUnit(); |
      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
1774 | IsLowLatencyUser = true; |
1775 | } |
1776 | if (Pred->NodeNum >= DAGSize) |
1777 | continue; |
1778 | unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum]; |
1779 | if (PredPos >= MinPos) |
1780 | MinPos = PredPos + 1; |
1781 | } |
1782 | |
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
1784 | unsigned BestPos = LastLowLatencyUser + 1; |
1785 | if ((int)BestPos <= LastLowLatencyPos) |
1786 | BestPos = LastLowLatencyPos + 1; |
1787 | if (BestPos < MinPos) |
1788 | BestPos = MinPos; |
1789 | if (BestPos < i) { |
1790 | for (unsigned u = i; u > BestPos; --u) { |
1791 | ++ScheduledSUnitsInv[ScheduledSUnits[u-1]]; |
1792 | ScheduledSUnits[u] = ScheduledSUnits[u-1]; |
1793 | } |
1794 | ScheduledSUnits[BestPos] = SU->NodeNum; |
1795 | ScheduledSUnitsInv[SU->NodeNum] = BestPos; |
1796 | } |
1797 | LastLowLatencyPos = BestPos; |
1798 | if (IsLowLatencyUser) |
1799 | LastLowLatencyUser = BestPos; |
1800 | } else if (IsLowLatencyUser) { |
1801 | LastLowLatencyUser = i; |
    // Also move the COPY instructions that the low latency instructions
    // depend on.
1804 | } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) { |
1805 | bool CopyForLowLat = false; |
1806 | for (SDep& SuccDep : SU->Succs) { |
1807 | SUnit *Succ = SuccDep.getSUnit(); |
1808 | if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) |
1809 | continue; |
        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
1811 | CopyForLowLat = true; |
1812 | } |
1813 | } |
1814 | if (!CopyForLowLat) |
1815 | continue; |
1816 | if (MinPos < i) { |
1817 | for (unsigned u = i; u > MinPos; --u) { |
1818 | ++ScheduledSUnitsInv[ScheduledSUnits[u-1]]; |
1819 | ScheduledSUnits[u] = ScheduledSUnits[u-1]; |
1820 | } |
1821 | ScheduledSUnits[MinPos] = SU->NodeNum; |
1822 | ScheduledSUnitsInv[SU->NodeNum] = MinPos; |
1823 | } |
1824 | } |
1825 | } |
1826 | } |
1827 | |
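// Restore the dependency counters of every SUnit from the backup taken in
// schedule(), so that several scheduling variants can be run on the same DAG.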
1828 | void SIScheduleDAGMI::restoreSULinksLeft() { |
1829 | for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { |
1830 | SUnits[i].isScheduled = false; |
1831 | SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft; |
1832 | SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft; |
1833 | SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft; |
1834 | SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft; |
1835 | } |
1836 | } |
1837 | |
1838 | // Return the Vgpr and Sgpr usage corresponding to some virtual registers. |
1839 | template<typename _Iterator> void |
1840 | SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End, |
1841 | unsigned &VgprUsage, unsigned &SgprUsage) { |
1842 | VgprUsage = 0; |
1843 | SgprUsage = 0; |
1844 | for (_Iterator RegI = First; RegI != End; ++RegI) { |
1845 | Register Reg = *RegI; |
1846 | // For now only track virtual registers |
1847 | if (!Reg.isVirtual()) |
1848 | continue; |
    PSetIterator PSetI = MRI.getPressureSets(Reg);
1850 | for (; PSetI.isValid(); ++PSetI) { |
1851 | if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32) |
1852 | VgprUsage += PSetI.getWeight(); |
1853 | else if (*PSetI == AMDGPU::RegisterPressureSets::SReg_32) |
1854 | SgprUsage += PSetI.getWeight(); |
1855 | } |
1856 | } |
1857 | } |
1858 | |
void SIScheduleDAGMI::schedule() {
1861 | SmallVector<SUnit*, 8> TopRoots, BotRoots; |
1862 | SIScheduleBlockResult Best, Temp; |
1863 | LLVM_DEBUG(dbgs() << "Preparing Scheduling\n" ); |
1864 | |
1865 | buildDAGWithRegPressure(); |
1866 | postProcessDAG(); |
1867 | |
1868 | LLVM_DEBUG(dump()); |
1869 | if (PrintDAGs) |
1870 | dump(); |
1871 | if (ViewMISchedDAGs) |
1872 | viewGraph(); |
1873 | |
1874 | topologicalSort(); |
1875 | findRootsAndBiasEdges(TopRoots, BotRoots); |
1876 | // We reuse several ScheduleDAGMI and ScheduleDAGMILive |
1877 | // functions, but to make them happy we must initialize |
1878 | // the default Scheduler implementation (even if we do not |
1879 | // run it) |
  SchedImpl->initialize(this);
1881 | initQueues(TopRoots, BotRoots); |
1882 | |
1883 | // Fill some stats to help scheduling. |
1884 | |
1885 | SUnitsLinksBackup = SUnits; |
1886 | IsLowLatencySU.clear(); |
1887 | LowLatencyOffset.clear(); |
1888 | IsHighLatencySU.clear(); |
1889 | |
  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);
1893 | |
1894 | for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { |
1895 | SUnit *SU = &SUnits[i]; |
1896 | const MachineOperand *BaseLatOp; |
1897 | int64_t OffLatReg; |
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
1899 | IsLowLatencySU[i] = 1; |
1900 | bool OffsetIsScalable; |
      if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
                                         OffsetIsScalable, TRI))
1903 | LowLatencyOffset[i] = OffLatReg; |
    } else if (SITII->isHighLatencyDef(SU->getInstr()->getOpcode()))
1905 | IsHighLatencySU[i] = 1; |
1906 | } |
1907 | |
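  // Try the variant that usually performs best first; the fallbacks below
  // are only attempted when its VGPR usage turns out to be too high.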
1908 | SIScheduler Scheduler(this); |
  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
1911 | |
  // If VGPR usage is extremely high, try other well performing variants
  // that could lead to lower VGPR usage.
1914 | if (Best.MaxVGPRUsage > 180) { |
1915 | static const std::pair<SISchedulerBlockCreatorVariant, |
1916 | SISchedulerBlockSchedulerVariant> |
1917 | Variants[] = { |
1918 | { LatenciesAlone, BlockRegUsageLatency }, |
1919 | // { LatenciesAlone, BlockRegUsage }, |
1920 | { LatenciesGrouped, BlockLatencyRegUsage }, |
1921 | // { LatenciesGrouped, BlockRegUsageLatency }, |
1922 | // { LatenciesGrouped, BlockRegUsage }, |
1923 | { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, |
1924 | // { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, |
1925 | // { LatenciesAlonePlusConsecutive, BlockRegUsage } |
1926 | }; |
1927 | for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { |
      Temp = Scheduler.scheduleVariant(v.first, v.second);
1929 | if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) |
1930 | Best = Temp; |
1931 | } |
1932 | } |
  // If VGPR usage is still extremely high, we may spill. Try other variants
  // that perform worse, but that could lead to lower VGPR usage.
1935 | if (Best.MaxVGPRUsage > 200) { |
1936 | static const std::pair<SISchedulerBlockCreatorVariant, |
1937 | SISchedulerBlockSchedulerVariant> |
1938 | Variants[] = { |
1939 | // { LatenciesAlone, BlockRegUsageLatency }, |
1940 | { LatenciesAlone, BlockRegUsage }, |
1941 | // { LatenciesGrouped, BlockLatencyRegUsage }, |
1942 | { LatenciesGrouped, BlockRegUsageLatency }, |
1943 | { LatenciesGrouped, BlockRegUsage }, |
1944 | // { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, |
1945 | { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, |
1946 | { LatenciesAlonePlusConsecutive, BlockRegUsage } |
1947 | }; |
1948 | for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { |
      Temp = Scheduler.scheduleVariant(v.first, v.second);
1950 | if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) |
1951 | Best = Temp; |
1952 | } |
1953 | } |
1954 | |
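  // Commit the best schedule found and build the inverse mapping
  // (SUnit NodeNum -> position in the schedule).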
1955 | ScheduledSUnits = Best.SUs; |
  ScheduledSUnitsInv.resize(SUnits.size());
1957 | |
1958 | for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { |
1959 | ScheduledSUnitsInv[ScheduledSUnits[i]] = i; |
1960 | } |
1961 | |
1962 | moveLowLatencies(); |
1963 | |
1964 | // Tell the outside world about the result of the scheduling. |
1965 | |
  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
1967 | TopRPTracker.setPos(CurrentTop); |
1968 | |
1969 | for (unsigned I : ScheduledSUnits) { |
1970 | SUnit *SU = &SUnits[I]; |
1971 | |
    scheduleMI(SU, true);
1973 | |
1974 | LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " |
1975 | << *SU->getInstr()); |
1976 | } |
1977 | |
  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
1979 | |
1980 | placeDebugValues(); |
1981 | |
1982 | LLVM_DEBUG({ |
1983 | dbgs() << "*** Final schedule for " |
           << printMBBReference(*begin()->getParent()) << " ***\n";
1985 | dumpSchedule(); |
1986 | dbgs() << '\n'; |
1987 | }); |
1988 | } |
1989 | |