//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "SIMachineScheduler.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// for GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
//   for all the SGPR load instructions before your current instruction to
//   have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
//   for all the VGPR load instructions previous to the VGPR load instruction
//   you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden.
//
// Moreover some specificities (like the fact that a lot of instructions in
// the shader have few dependencies) make the generic scheduler have some
// unpredictable behaviours. For example when register pressure becomes high,
// it can either manage to prevent register pressure from going too high, or
// it can increase register pressure even more than if it hadn't taken
// register pressure into account.
//
// Also some other bad behaviours are generated, like loading at the beginning
// of the shader a constant in VGPR you won't need until the end of the shader.
//
// The scheduling problem for SI can distinguish three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact the HW can issue several instructions from different
// wavefronts if of different types, etc)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks helps keep register usage low.
//
// First the instructions are put into blocks.
// We want the blocks to help control register usage and hide high latencies
// later. To help control register usage, we typically want all local
// computations, when for example you create a result that can be consumed
// right away, to be contained in a block. Block inputs and outputs would
// typically be important results that are needed in several locations of
// the shader. Since we do want blocks to help hide high latencies, we want
// the instructions inside the block to have a minimal set of dependencies
// on high latencies. It will make it easy to pick blocks to hide specific
// high latencies.
// The block creation algorithm is divided into several steps, and several
// variants can be tried during the scheduling process.
//
// Second the order of the instructions inside the blocks is chosen.
// At that step we take into account only register usage and hiding
// low latency instructions.
//
// Third the block order is chosen. There we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
//
// Actually when talking about 'low latency' or 'high latency' it includes
// both the latency to get the cache (or global mem) data to the register,
// and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, which is why even if the wavefront count is high,
// we have to try to have as many instructions hiding high latencies as
// possible.
// The OpenCL doc gives for example a latency of 400 cycles for a global mem
// access, which is hidden by 10 instructions if the wavefront count is 10.
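// (A rough sketch of that arithmetic, assuming one instruction issued every
// 4 cycles per wavefront and round-robin issue across wavefronts: with 10
// wavefronts, two consecutive instructions of the same wavefront are about
// 10 * 4 = 40 cycles apart, so 400 / 40 = 10 of its instructions span the
// 400 cycles.)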

// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64 bytes
// lines.
// Constant cache is shared with 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
// instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated to the 3 other VALUs
// of the CU do texture sampling too. (Don't take these figures too seriously,
// as I'm not 100% sure of the computation)
// Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as an s_buffer_loadx2 and potentially an s_buffer_loadx8 if on the
// same cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can be
// 300-600 cycles. We do not specially take that into account when scheduling,
// as we expect the driver to be able to preload the constants soon.

// common code //

#ifndef NDEBUG

static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand:    return "NOCAND";
  case RegUsage:  return "REGUSAGE";
  case Latency:   return "LATENCY";
  case Successor: return "SUCCESSOR";
  case Depth:     return "DEPTH";
  case NodeOrder: return "ORDER";
  }
  llvm_unreachable("Unknown reason!");
}

#endif

namespace llvm::SISched {
static bool tryLess(int TryVal, int CandVal,
                    SISchedulerCandidate &TryCand,
                    SISchedulerCandidate &Cand,
                    SIScheduleCandReason Reason) {
  if (TryVal < CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

static bool tryGreater(int TryVal, int CandVal,
                       SISchedulerCandidate &TryCand,
                       SISchedulerCandidate &Cand,
                       SIScheduleCandReason Reason) {
  if (TryVal > CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal < CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}
} // end namespace llvm::SISched
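
// Typical use of the two helpers above (a sketch mirroring
// tryCandidateTopDown below): criteria are tested in decreasing priority
// order, and each call records why the candidate won or lost, e.g.:
//   if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
//                        TryCand, Cand, RegUsage))
//     return; // Decided on register usage.
//   // On a tie, fall through to the next criterion.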

// SIScheduleBlock //

void SIScheduleBlock::addUnit(SUnit *SU) {
  NodeNum2Index[SU->NodeNum] = SUnits.size();
  SUnits.push_back(SU);
}

#ifndef NDEBUG
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {

  dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
#endif

void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  if (Cand.SGPRUsage > 60 &&
      SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as top as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  // - (eventually some more low latency instructions)
  // - instructions that depend on the first low latency instructions.
  // If the block contains a lot of constant loads, SGPR usage
  // could go quite high; thus the arbitrary limit of 60 above encourages
  // using the already loaded constants (in order to release some SGPRs)
  // before loading more.
  if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
                       Cand.HasLowLatencyNonWaitedParent,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                          TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (TryCand.IsLowLatency &&
      SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}

SUnit* SIScheduleBlock::pickNode() {
  SISchedCandidate TopCand;

  for (SUnit* SU : TopReadySUs) {
    SISchedCandidate TryCand;
    std::vector<unsigned> pressure;
    std::vector<unsigned> MaxPressure;
    // Predict register usage after this instruction.
    TryCand.SU = SU;
    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    TryCand.SGPRUsage = pressure[AMDGPU::RegisterPressureSets::SReg_32];
    TryCand.VGPRUsage = pressure[AMDGPU::RegisterPressureSets::VGPR_32];
    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    TryCand.HasLowLatencyNonWaitedParent =
      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    tryCandidateTopDown(TopCand, TryCand);
    if (TryCand.Reason != NoCand)
      TopCand.setBest(TryCand);
  }

  return TopCand.SU;
}


// Schedule something valid.
void SIScheduleBlock::fastSchedule() {
  TopReadySUs.clear();
  if (Scheduled)
    undoSchedule();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = TopReadySUs[0];
    ScheduledSUnits.push_back(SU);
    nodeScheduled(SU);
  }

  Scheduled = true;
}

// Returns whether Reg is defined between First and Last.
static bool isDefBetween(unsigned Reg,
                         SlotIndex First, SlotIndex Last,
                         const MachineRegisterInfo *MRI,
                         const LiveIntervals *LIS) {
  for (MachineRegisterInfo::def_instr_iterator
       UI = MRI->def_instr_begin(Reg),
       UE = MRI->def_instr_end(); UI != UE; ++UI) {
    const MachineInstr* MI = &*UI;
    if (MI->isDebugValue())
      continue;
    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
    if (InstSlot >= First && InstSlot <= Last)
      return true;
  }
  return false;
}

void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SUs. RPTracker captures what had to be alive for the
  // SUs to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not track physical registers, because it messes things up.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (RegMaskPair.RegUnit.isVirtual())
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situations 4, 5, 6, 7.
  // We want LiveOutRegs to contain only regs whose content will be read
  // after in another block, and whose content was written in the current
  // block, that is we want it to get 1, 3, 5, 7.
  // Since we made the MIs of a block be packed all together before
  // scheduling, the LiveIntervals were correct, and the RPTracker was
  // able to correctly handle 5 vs 6, and 2 vs 3.
  // (Note: This is not sufficient for the RPTracker to avoid mistakes
  // for case 4.)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5, 7.
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7.
  // The use of isDefBetween removes case 4.
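  // Example for case 5: a register is read by an instruction of the block
  // and then redefined inside the block, with the new value only read in
  // another block; that register is then in both LiveInRegs and LiveOutRegs.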
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    Register Reg = RegMaskPair.RegUnit;
    if (Reg.isVirtual() &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally llvm will represent some registers as big 128-bit registers
  // for example, but they actually correspond to 4 actual 32-bit registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}

void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionalPressure.
  InternalAdditionalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
         TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
           SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}

void SIScheduleBlock::undoSchedule() {
  for (SUnit* SU : SUnits) {
    SU->isScheduled = false;
    for (SDep& Succ : SU->Succs) {
      if (BC->isSUInBlock(Succ.getSUnit(), ID))
        undoReleaseSucc(SU, &Succ);
    }
  }
  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
  ScheduledSUnits.clear();
  Scheduled = false;
}

void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    ++SuccSU->WeakPredsLeft;
    return;
  }
  ++SuccSU->NumPredsLeft;
}

void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    DAG->dumpNode(*SuccSU);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}

/// Release the successors of SU that are inside the block (if InOrOutBlock
/// is true) or outside it (if InOrOutBlock is false).
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
  for (SDep& Succ : SU->Succs) {
    SUnit *SuccSU = Succ.getSUnit();

    if (SuccSU->NodeNum >= DAG->SUnits.size())
      continue;

    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
      continue;

    releaseSucc(SU, &Succ);
    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
      TopReadySUs.push_back(SuccSU);
  }
}

void SIScheduleBlock::nodeScheduled(SUnit *SU) {
  // Is in TopReadySUs
  assert(!SU->NumPredsLeft);
  std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
  if (I == TopReadySUs.end()) {
    dbgs() << "Data Structure Bug in SI Scheduler\n";
    llvm_unreachable(nullptr);
  }
  TopReadySUs.erase(I);

  releaseSuccessors(SU, true);
  // Scheduling this node will trigger a wait,
  // thus propagate to other instructions that they do not need to wait either.
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);

  if (DAG->IsLowLatencySU[SU->NodeNum]) {
    for (SDep& Succ : SU->Succs) {
      std::map<unsigned, unsigned>::iterator I =
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
      if (I != NodeNum2Index.end())
        HasLowLatencyNonWaitedParent[I->second] = 1;
    }
  }
  SU->isScheduled = true;
}

void SIScheduleBlock::finalizeUnits() {
  // We remove links from outside blocks to enable scheduling inside the block.
  for (SUnit* SU : SUnits) {
    releaseSuccessors(SU, false);
    if (DAG->IsHighLatencySU[SU->NodeNum])
      HighLatencyBlock = true;
  }
  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
}

// We maintain ascending order of IDs.
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
  unsigned PredID = Pred->getID();

  // Check if not already a predecessor.
  for (SIScheduleBlock* P : Preds) {
    if (PredID == P->getID())
      return;
  }
  Preds.push_back(Pred);

  assert(none_of(Succs,
                 [=](std::pair<SIScheduleBlock*,
                     SIScheduleBlockLinkKind> S) {
                   return PredID == S.first->getID();
                 }) &&
         "Loop in the Block Graph!");
}

void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
                              SIScheduleBlockLinkKind Kind) {
  unsigned SuccID = Succ->getID();

  // Check if not already a successor.
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
    if (SuccID == S.first->getID()) {
      if (S.second == SIScheduleBlockLinkKind::NoData &&
          Kind == SIScheduleBlockLinkKind::Data)
        S.second = Kind;
      return;
    }
  }
  if (Succ->isHighLatencyBlock())
    ++NumHighLatencySuccessors;
  Succs.emplace_back(Succ, Kind);

  assert(none_of(Preds,
                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
         "Loop in the Block Graph!");
}

#ifndef NDEBUG
void SIScheduleBlock::printDebug(bool full) {
  dbgs() << "Block (" << ID << ")\n";
  if (!full)
    return;

  dbgs() << "\nContains High Latency Instruction: "
         << HighLatencyBlock << '\n';
  dbgs() << "\nDepends On:\n";
  for (SIScheduleBlock* P : Preds) {
    P->printDebug(false);
  }

  dbgs() << "\nSuccessors:\n";
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
    if (S.second == SIScheduleBlockLinkKind::Data)
      dbgs() << "(Data Dep) ";
    S.first->printDebug(false);
  }

  if (Scheduled) {
    dbgs() << "LiveInPressure "
           << LiveInPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveInPressure[AMDGPU::RegisterPressureSets::VGPR_32] << '\n';
    dbgs() << "LiveOutPressure "
           << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n";
    dbgs() << "LiveIns:\n";
    for (unsigned Reg : LiveInRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';

    dbgs() << "\nLiveOuts:\n";
    for (unsigned Reg : LiveOutRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
  }

  dbgs() << "\nInstructions:\n";
  for (const SUnit* SU : SUnits)
    DAG->dumpNode(*SU);

  dbgs() << "///////////////////////\n";
}
#endif

// SIScheduleBlockCreator //

SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG)
    : DAG(DAG) {}

SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
    Blocks.find(BlockVariant);
  if (B == Blocks.end()) {
    SIScheduleBlocks Res;
    createBlocksForVariant(BlockVariant);
    topologicalSort();
    scheduleInsideBlocks();
    fillStats();
    Res.Blocks = CurrentBlocks;
    Res.TopDownIndex2Block = TopDownIndex2Block;
    Res.TopDownBlock2Index = TopDownBlock2Index;
    Blocks[BlockVariant] = Res;
    return Res;
  }
  return B->second;
}

bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
  if (SU->NodeNum >= DAG->SUnits.size())
    return false;
  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
}

void SIScheduleBlockCreator::colorHighLatenciesAlone() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      CurrentColoring[SU->NodeNum] = NextReservedID++;
    }
  }
}

static bool
hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
  for (const auto &PredDep : SU.Preds) {
    if (PredDep.getSUnit() == &FromSU &&
        PredDep.getKind() == llvm::SDep::Data)
      return true;
  }
  return false;
}

void SIScheduleBlockCreator::colorHighLatenciesGroups() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned NumHighLatencies = 0;
  unsigned GroupSize;
  int Color = NextReservedID;
  unsigned Count = 0;
  std::set<unsigned> FormingGroup;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum])
      ++NumHighLatencies;
  }

  if (NumHighLatencies == 0)
    return;

  if (NumHighLatencies <= 6)
    GroupSize = 2;
  else if (NumHighLatencies <= 12)
    GroupSize = 3;
  else
    GroupSize = 4;
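
  // For instance, with 8 high latency instructions, GroupSize is 3 and the
  // grouping loop below will try to form at most ceil(8 / 3) = 3 groups.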

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (DAG->IsHighLatencySU[SU.NodeNum]) {
      bool CompatibleGroup = true;
      int ProposedColor = Color;
      std::vector<int> AdditionalElements;

      // We don't want to put in the same block
      // two high latency instructions that depend
      // on each other.
      // One way would be to check canAddEdge
      // in both directions, but that currently is not
      // enough because there the high latency order is
      // enforced (via links).
      // Instead, look at the dependencies between the
      // high latency instructions and deduce if it is
      // a data dependency or not.
      for (unsigned j : FormingGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other
        if (SubGraph.size() > 5) {
          // Too many elements would be required to be added to the block.
          CompatibleGroup = false;
          break;
        }
        // Check the type of dependency
        for (unsigned k : SubGraph) {
          // If in the path to join the two instructions,
          // there is another high latency instruction,
          // or instructions colored for another block,
          // abort the merge.
          if (DAG->IsHighLatencySU[k] || (CurrentColoring[k] != ProposedColor &&
                                          CurrentColoring[k] != 0)) {
            CompatibleGroup = false;
            break;
          }
          // If one of the SUs in the subgraph depends on the result of SU j,
          // there will be a data dependency.
          if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
            CompatibleGroup = false;
            break;
          }
        }
        if (!CompatibleGroup)
          break;
        // Same check for the SU
        if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
          CompatibleGroup = false;
          break;
        }
        // Add all the required instructions to the block.
        // These cannot live in another block (because they
        // depend (order dependency) on one of the
        // instructions in the block, and are required for the
        // high latency instruction we add).
        llvm::append_range(AdditionalElements, SubGraph);
      }
      if (CompatibleGroup) {
        FormingGroup.insert(SU.NodeNum);
        for (unsigned j : AdditionalElements)
          CurrentColoring[j] = ProposedColor;
        CurrentColoring[SU.NodeNum] = ProposedColor;
        ++Count;
      }
      // Found one incompatible instruction,
      // or has filled a big enough group.
      // -> start a new one.
      if (!CompatibleGroup) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        FormingGroup.insert(SU.NodeNum);
        CurrentColoring[SU.NodeNum] = ProposedColor;
        Count = 0;
      } else if (Count == GroupSize) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        Count = 0;
      }
    }
  }
}

void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.
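  // For example, if SU A (transitively) depends only on the high latency
  // instruction colored 1, while SU B depends on those colored 1 and 2,
  // A and B get two different colors; a later SU depending on exactly
  // {1, 2} reuses B's color.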

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}

void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;

  // Every combination of colors given by the top down
  // and bottom up Reserved node dependency colorings
  // gets its own color.

  for (const SUnit &SU : DAG->SUnits) {
    std::pair<unsigned, unsigned> SUColors;

    // High latency instructions: already given.
    if (CurrentColoring[SU.NodeNum])
      continue;

    SUColors.first = CurrentTopDownReservedDependencyColoring[SU.NodeNum];
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU.NodeNum];

    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
      ColorCombinations.find(SUColors);
    if (Pos != ColorCombinations.end()) {
      CurrentColoring[SU.NodeNum] = Pos->second;
    } else {
      CurrentColoring[SU.NodeNum] = NextNonReservedID;
      ColorCombinations[SUColors] = NextNonReservedID++;
    }
  }
}

void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*llvm::max_element(CurrentBottomUpReservedDependencyColoring) == 0 &&
      *llvm::max_element(CurrentTopDownReservedDependencyColoring) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction into that block.
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}


void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}

void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: Vgpr constant loading.
    // Low latency instructions usually have a predecessor (the address).
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> ColorCount;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    ++ColorCount[color];
  }

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}

void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}

void SIScheduleBlockCreator::regroupNoUserInstructions() {
  unsigned DAGSize = DAG->SUnits.size();
  int GroupID = NextNonReservedID++;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    bool hasSuccessor = false;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      hasSuccessor = true;
    }
    if (!hasSuccessor)
      CurrentColoring[SU->NodeNum] = GroupID;
  }
}

void SIScheduleBlockCreator::colorExports() {
  unsigned ExportColor = NextNonReservedID++;
  SmallVector<unsigned, 8> ExpGroup;

  // Put all exports together in a block.
  // The block will naturally end up being scheduled last,
  // thus putting exports at the end of the schedule, which
  // is better for performance.
  // However we must ensure, for safety, the exports can be put
  // together in the same block without any other instruction.
  // This could happen, for example, when scheduling after regalloc
  // if reloading a spilled register from memory using the same
  // register as used in a previous export.
  // If that happens, do not regroup the exports.
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (SIInstrInfo::isEXP(*SU.getInstr())) {
      // SU is an export instruction. Check whether one of its successor
      // dependencies is a non-export, in which case we skip export grouping.
      for (const SDep &SuccDep : SU.Succs) {
        const SUnit *SuccSU = SuccDep.getSUnit();
        if (SuccDep.isWeak() || SuccSU->NodeNum >= DAG->SUnits.size()) {
          // Ignore these dependencies.
          continue;
        }
        assert(SuccSU->isInstr() &&
               "SUnit unexpectedly not representing an instruction!");

        if (!SIInstrInfo::isEXP(*SuccSU->getInstr())) {
          // A non-export depends on us. Skip export grouping.
          // Note that this is a bit pessimistic: We could still group all
          // other exports that are not depended on by non-exports, directly
          // or indirectly. Simply skipping this particular export but
          // grouping all others would not account for indirect dependencies.
          return;
        }
      }
      ExpGroup.push_back(SUNum);
    }
  }

  // The group can be formed. Give the color.
  for (unsigned j : ExpGroup)
    CurrentColoring[j] = ExportColor;
}

void SIScheduleBlockCreator::createBlocksForVariant(
    SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links previous scheduling variant has overridden.
  DAG->restoreSULinksLeft();

  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  LLVM_DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant ==
      SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();
  colorExports();

  // Put SUs of same color into the same block.
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    if (RealID.find(Color) == RealID.end()) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free root and leafs of all blocks to enable scheduling inside them.
  for (SIScheduleBlock *Block : CurrentBlocks)
    Block->finalizeUnits();
  LLVM_DEBUG({
    dbgs() << "Blocks created:\n\n";
    for (SIScheduleBlock *Block : CurrentBlocks)
      Block->printDebug(true);
  });
}

// Two functions taken from Codegen/MachineScheduler.cpp

/// Non-const version.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
    if (!I->isDebugInstr())
      break;
  }
  return I;
}

void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  LLVM_DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}

void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We first compute a valid schedule, such that each Block corresponds
  // to a contiguous range of instructions.
  LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring previous position,
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  LLVM_DEBUG({
    for (SIScheduleBlock *Block : CurrentBlocks)
      Block->printDebug(true);
  });
}

void SIScheduleBlockCreator::fillStats() {
  unsigned DAGSize = CurrentBlocks.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getPreds().empty())
      Block->Depth = 0;
    else {
      unsigned Depth = 0;
      for (SIScheduleBlock *Pred : Block->getPreds()) {
        if (Depth < Pred->Depth + Pred->getCost())
          Depth = Pred->Depth + Pred->getCost();
      }
      Block->Depth = Depth;
    }
  }

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = BottomUpIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getSuccs().empty())
      Block->Height = 0;
    else {
      unsigned Height = 0;
      for (const auto &Succ : Block->getSuccs())
        Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
      Block->Height = Height;
    }
  }
}
1397 | |
1398 | // SIScheduleBlockScheduler // |
1399 | |
1400 | SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, |
1401 | SISchedulerBlockSchedulerVariant Variant, |
1402 | SIScheduleBlocks BlocksStruct) : |
1403 | DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks), |
1404 | LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0), |
1405 | SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) { |
1406 | |
1407 | // Fill the usage of every output |
1408 | // Warning: while by construction we always have a link between two blocks |
1409 | // when one needs a result from the other, the number of users of an output |
1410 | // is not the sum of child blocks having as input the same virtual register. |
1411 | // Here is an example. A produces x and y. B eats x and produces x'. |
1412 | // C eats x' and y. The register coalescer may have attributed the same |
1413 | // virtual register to x and x'. |
1414 | // To count accurately, we do a topological sort. In case the register is |
1415 | // found for several parents, we increment the usage of the one with the |
1416 | // highest topological index. |
1417 | LiveOutRegsNumUsages.resize(new_size: Blocks.size()); |
1418 | for (SIScheduleBlock *Block : Blocks) { |
1419 | for (unsigned Reg : Block->getInRegs()) { |
1420 | bool Found = false; |
1421 | int topoInd = -1; |
1422 | for (SIScheduleBlock* Pred: Block->getPreds()) { |
1423 | std::set<unsigned> PredOutRegs = Pred->getOutRegs(); |
1424 | std::set<unsigned>::iterator RegPos = PredOutRegs.find(x: Reg); |
1425 | |
1426 | if (RegPos != PredOutRegs.end()) { |
1427 | Found = true; |
1428 | if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) { |
1429 | topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()]; |
1430 | } |
1431 | } |
1432 | } |
1433 | |
1434 | if (!Found) |
1435 | continue; |
1436 | |
1437 | int PredID = BlocksStruct.TopDownIndex2Block[topoInd]; |
1438 | ++LiveOutRegsNumUsages[PredID][Reg]; |
1439 | } |
1440 | } |
1441 | |
  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());
1445 | |
1446 | for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { |
1447 | SIScheduleBlock *Block = Blocks[i]; |
1448 | BlockNumPredsLeft[i] = Block->getPreds().size(); |
1449 | BlockNumSuccsLeft[i] = Block->getSuccs().size(); |
1450 | } |
1451 | |
1452 | #ifndef NDEBUG |
1453 | for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { |
1454 | SIScheduleBlock *Block = Blocks[i]; |
1455 | assert(Block->getID() == i); |
1456 | } |
1457 | #endif |
1458 | |
1459 | std::set<unsigned> InRegs = DAG->getInRegs(); |
  addLiveRegs(InRegs);
1461 | |
1462 | // Increase LiveOutRegsNumUsages for blocks |
1463 | // producing registers consumed in another |
1464 | // scheduling region. |
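  // After coalescing, a register escaping the region may be written by
  // several blocks; the reverse traversal below credits the topologically
  // last producer, whose value is the one actually live out.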
1465 | for (unsigned Reg : DAG->getOutRegs()) { |
1466 | for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { |
1467 | // Do reverse traversal |
1468 | int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i]; |
1469 | SIScheduleBlock *Block = Blocks[ID]; |
1470 | const std::set<unsigned> &OutRegs = Block->getOutRegs(); |
1471 | |
      if (OutRegs.find(Reg) == OutRegs.end())
1473 | continue; |
1474 | |
1475 | ++LiveOutRegsNumUsages[ID][Reg]; |
1476 | break; |
1477 | } |
1478 | } |
1479 | |
1480 | // Fill LiveRegsConsumers for regs that were already |
1481 | // defined before scheduling. |
1482 | for (SIScheduleBlock *Block : Blocks) { |
1483 | for (unsigned Reg : Block->getInRegs()) { |
1484 | bool Found = false; |
1485 | for (SIScheduleBlock* Pred: Block->getPreds()) { |
1486 | std::set<unsigned> PredOutRegs = Pred->getOutRegs(); |
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
1488 | |
1489 | if (RegPos != PredOutRegs.end()) { |
1490 | Found = true; |
1491 | break; |
1492 | } |
1493 | } |
1494 | |
1495 | if (!Found) |
1496 | ++LiveRegsConsumers[Reg]; |
1497 | } |
1498 | } |
1499 | |
1500 | for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { |
1501 | SIScheduleBlock *Block = Blocks[i]; |
1502 | if (BlockNumPredsLeft[i] == 0) { |
      ReadyBlocks.push_back(Block);
1504 | } |
1505 | } |
1506 | |
1507 | while (SIScheduleBlock *Block = pickBlock()) { |
    BlocksScheduled.push_back(Block);
1509 | blockScheduled(Block); |
1510 | } |
1511 | |
  LLVM_DEBUG(dbgs() << "Block Order:";
             for (SIScheduleBlock *Block : BlocksScheduled)
               dbgs() << ' ' << Block->getID();
             dbgs() << '\n';);
1516 | } |
1517 | |
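// Both candidate comparators below follow the usual MachineScheduler
// pattern: the criteria are tried in decreasing priority order via
// SISched::tryLess/tryGreater, which return true as soon as one candidate
// wins a criterion and record the deciding reason on that candidate.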
1518 | bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand, |
1519 | SIBlockSchedCandidate &TryCand) { |
1520 | if (!Cand.isValid()) { |
1521 | TryCand.Reason = NodeOrder; |
1522 | return true; |
1523 | } |
1524 | |
1525 | // Try to hide high latencies. |
  if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
                       Cand.LastPosHighLatParentScheduled, TryCand, Cand,
                       Latency))
1528 | return true; |
1529 | // Schedule high latencies early so you can hide them better. |
  if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                          TryCand, Cand, Latency))
1532 | return true; |
  if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
                                                   TryCand, Cand, Depth))
1535 | return true; |
  if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
                          Cand.NumHighLatencySuccessors,
                          TryCand, Cand, Successor))
1539 | return true; |
1540 | return false; |
1541 | } |
1542 | |
1543 | bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand, |
1544 | SIBlockSchedCandidate &TryCand) { |
1545 | if (!Cand.isValid()) { |
1546 | TryCand.Reason = NodeOrder; |
1547 | return true; |
1548 | } |
1549 | |
  if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
                       TryCand, Cand, RegUsage))
1552 | return true; |
  if (SISched::tryGreater(TryCand.NumSuccessors > 0,
                          Cand.NumSuccessors > 0,
                          TryCand, Cand, Successor))
1556 | return true; |
  if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
1558 | return true; |
  if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
                       TryCand, Cand, RegUsage))
1561 | return true; |
1562 | return false; |
1563 | } |
1564 | |
1565 | SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { |
1566 | SIBlockSchedCandidate Cand; |
1567 | std::vector<SIScheduleBlock*>::iterator Best; |
1568 | SIScheduleBlock *Block; |
1569 | if (ReadyBlocks.empty()) |
1570 | return nullptr; |
1571 | |
  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
                        VregCurrentUsage, SregCurrentUsage);
1574 | if (VregCurrentUsage > maxVregUsage) |
1575 | maxVregUsage = VregCurrentUsage; |
1576 | if (SregCurrentUsage > maxSregUsage) |
1577 | maxSregUsage = SregCurrentUsage; |
  LLVM_DEBUG(dbgs() << "Picking New Blocks\n";
             dbgs() << "Available: ";
             for (SIScheduleBlock *Block : ReadyBlocks)
               dbgs() << Block->getID() << ' ';
             dbgs() << "\nCurrent Live:\n";
             for (unsigned Reg : LiveRegs)
               dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
             dbgs() << '\n';
             dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
             dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);
1589 | |
1590 | Cand.Block = nullptr; |
1591 | for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(), |
1592 | E = ReadyBlocks.end(); I != E; ++I) { |
1593 | SIBlockSchedCandidate TryCand; |
1594 | TryCand.Block = *I; |
1595 | TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock(); |
1596 | TryCand.VGPRUsageDiff = |
      checkRegUsageImpact(TryCand.Block->getInRegs(),
                          TryCand.Block->getOutRegs())[AMDGPU::RegisterPressureSets::VGPR_32];
1599 | TryCand.NumSuccessors = TryCand.Block->getSuccs().size(); |
1600 | TryCand.NumHighLatencySuccessors = |
1601 | TryCand.Block->getNumHighLatencySuccessors(); |
1602 | TryCand.LastPosHighLatParentScheduled = |
      (unsigned int) std::max<int>(0,
        LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
          LastPosWaitedHighLatency);
1606 | TryCand.Height = TryCand.Block->Height; |
1607 | // Try not to increase VGPR usage too much, else we may spill. |
1608 | if (VregCurrentUsage > 120 || |
1609 | Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) { |
1610 | if (!tryCandidateRegUsage(Cand, TryCand) && |
1611 | Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage) |
1612 | tryCandidateLatency(Cand, TryCand); |
1613 | } else { |
1614 | if (!tryCandidateLatency(Cand, TryCand)) |
1615 | tryCandidateRegUsage(Cand, TryCand); |
1616 | } |
1617 | if (TryCand.Reason != NoCand) { |
1618 | Cand.setBest(TryCand); |
1619 | Best = I; |
1620 | LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' ' |
1621 | << getReasonStr(Cand.Reason) << '\n'); |
1622 | } |
1623 | } |
1624 | |
1625 | LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n'; |
1626 | dbgs() << "Is a block with high latency instruction: " |
                    << (Cand.IsHighLatency ? "yes\n" : "no\n");
1628 | dbgs() << "Position of last high latency dependency: " |
1629 | << Cand.LastPosHighLatParentScheduled << '\n'; |
1630 | dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n'; |
1631 | dbgs() << '\n';); |
1632 | |
1633 | Block = Cand.Block; |
  ReadyBlocks.erase(Best);
1635 | return Block; |
1636 | } |
1637 | |
1638 | // Tracking of currently alive registers to determine VGPR Usage. |
1639 | |
1640 | void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) { |
1641 | for (Register Reg : Regs) { |
1642 | // For now only track virtual registers. |
1643 | if (!Reg.isVirtual()) |
1644 | continue; |
1645 | // If not already in the live set, then add it. |
    (void) LiveRegs.insert(Reg);
1647 | } |
1648 | } |
1649 | |
1650 | void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block, |
1651 | std::set<unsigned> &Regs) { |
1652 | for (unsigned Reg : Regs) { |
1653 | // For now only track virtual registers. |
    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
1655 | assert (Pos != LiveRegs.end() && // Reg must be live. |
1656 | LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && |
1657 | LiveRegsConsumers[Reg] >= 1); |
1658 | --LiveRegsConsumers[Reg]; |
1659 | if (LiveRegsConsumers[Reg] == 0) |
      LiveRegs.erase(Pos);
1661 | } |
1662 | } |
1663 | |
1664 | void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) { |
1665 | for (const auto &Block : Parent->getSuccs()) { |
1666 | if (--BlockNumPredsLeft[Block.first->getID()] == 0) |
      ReadyBlocks.push_back(Block.first);
1668 | |
1669 | if (Parent->isHighLatencyBlock() && |
1670 | Block.second == SIScheduleBlockLinkKind::Data) |
1671 | LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled; |
1672 | } |
1673 | } |
1674 | |
1675 | void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { |
  decreaseLiveRegs(Block, Block->getInRegs());
  addLiveRegs(Block->getOutRegs());
  releaseBlockSuccs(Block);
1679 | for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) { |
1680 | // We produce this register, thus it must not be previously alive. |
1681 | assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || |
1682 | LiveRegsConsumers[RegP.first] == 0); |
1683 | LiveRegsConsumers[RegP.first] += RegP.second; |
1684 | } |
1685 | if (LastPosHighLatencyParentScheduled[Block->getID()] > |
1686 | (unsigned)LastPosWaitedHighLatency) |
1687 | LastPosWaitedHighLatency = |
1688 | LastPosHighLatencyParentScheduled[Block->getID()]; |
1689 | ++NumBlockScheduled; |
1690 | } |
1691 | |
1692 | std::vector<int> |
1693 | SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs, |
1694 | std::set<unsigned> &OutRegs) { |
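  // Compute the per-pressure-set delta that scheduling a block with these
  // input/output registers applies to the current live set: inputs whose
  // last remaining consumer is this block free their weight, outputs add
  // theirs.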
1695 | std::vector<int> DiffSetPressure; |
  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
1697 | |
1698 | for (Register Reg : InRegs) { |
1699 | // For now only track virtual registers. |
1700 | if (!Reg.isVirtual()) |
1701 | continue; |
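    // A register frees up only if this block is its last remaining consumer;
    // otherwise scheduling the block does not lower the pressure.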
1702 | if (LiveRegsConsumers[Reg] > 1) |
1703 | continue; |
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1705 | for (; PSetI.isValid(); ++PSetI) { |
1706 | DiffSetPressure[*PSetI] -= PSetI.getWeight(); |
1707 | } |
1708 | } |
1709 | |
1710 | for (Register Reg : OutRegs) { |
1711 | // For now only track virtual registers. |
1712 | if (!Reg.isVirtual()) |
1713 | continue; |
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1715 | for (; PSetI.isValid(); ++PSetI) { |
1716 | DiffSetPressure[*PSetI] += PSetI.getWeight(); |
1717 | } |
1718 | } |
1719 | |
1720 | return DiffSetPressure; |
1721 | } |
1722 | |
1723 | // SIScheduler // |
1724 | |
1725 | struct SIScheduleBlockResult |
1726 | SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, |
1727 | SISchedulerBlockSchedulerVariant ScheduleVariant) { |
1728 | SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant); |
1729 | SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks); |
1730 | std::vector<SIScheduleBlock*> ScheduledBlocks; |
1731 | struct SIScheduleBlockResult Res; |
1732 | |
1733 | ScheduledBlocks = Scheduler.getBlocks(); |
1734 | |
1735 | for (SIScheduleBlock *Block : ScheduledBlocks) { |
1736 | std::vector<SUnit*> SUs = Block->getScheduledUnits(); |
1737 | |
1738 | for (SUnit* SU : SUs) |
      Res.SUs.push_back(SU->NodeNum);
1740 | } |
1741 | |
1742 | Res.MaxSGPRUsage = Scheduler.getSGPRUsage(); |
1743 | Res.MaxVGPRUsage = Scheduler.getVGPRUsage(); |
1744 | return Res; |
1745 | } |
1746 | |
1747 | // SIScheduleDAGMI // |
1748 | |
1749 | SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) : |
  ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) {
1751 | SITII = static_cast<const SIInstrInfo*>(TII); |
1752 | SITRI = static_cast<const SIRegisterInfo*>(TRI); |
1753 | } |
1754 | |
1755 | SIScheduleDAGMI::~SIScheduleDAGMI() = default; |
1756 | |
1757 | // Code adapted from scheduleDAG.cpp |
1758 | // Does a topological sort over the SUs. |
1759 | // Both TopDown and BottomUp |
1760 | void SIScheduleDAGMI::topologicalSort() { |
1761 | Topo.InitDAGTopologicalSorting(); |
1762 | |
1763 | TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end()); |
1764 | BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend()); |
1765 | } |
1766 | |
// Move low latencies further from their user without
// increasing SGPR usage (in general).
// This is to be replaced by a better pass that would
// take into account SGPR usage (based on VGPR usage
// and the corresponding wavefront count), that would
// try to merge groups of loads if it makes sense, etc.
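// For example (hypothetical positions): if a low latency load sits at
// position 7, its last scheduled dependency at position 2, and the previous
// low latency user at position 4, the load is moved up to position 5, so
// the independent instructions that previously sat at positions 5 and 6 end
// up between the load and its user and help cover its latency.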
1773 | void SIScheduleDAGMI::moveLowLatencies() { |
1774 | unsigned DAGSize = SUnits.size(); |
1775 | int LastLowLatencyUser = -1; |
1776 | int LastLowLatencyPos = -1; |
1777 | |
1778 | for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) { |
1779 | SUnit *SU = &SUnits[ScheduledSUnits[i]]; |
1780 | bool IsLowLatencyUser = false; |
1781 | unsigned MinPos = 0; |
1782 | |
1783 | for (SDep& PredDep : SU->Preds) { |
1784 | SUnit *Pred = PredDep.getSUnit(); |
      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
1786 | IsLowLatencyUser = true; |
1787 | } |
1788 | if (Pred->NodeNum >= DAGSize) |
1789 | continue; |
1790 | unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum]; |
1791 | if (PredPos >= MinPos) |
1792 | MinPos = PredPos + 1; |
1793 | } |
1794 | |
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
1796 | unsigned BestPos = LastLowLatencyUser + 1; |
1797 | if ((int)BestPos <= LastLowLatencyPos) |
1798 | BestPos = LastLowLatencyPos + 1; |
1799 | if (BestPos < MinPos) |
1800 | BestPos = MinPos; |
1801 | if (BestPos < i) { |
1802 | for (unsigned u = i; u > BestPos; --u) { |
1803 | ++ScheduledSUnitsInv[ScheduledSUnits[u-1]]; |
1804 | ScheduledSUnits[u] = ScheduledSUnits[u-1]; |
1805 | } |
1806 | ScheduledSUnits[BestPos] = SU->NodeNum; |
1807 | ScheduledSUnitsInv[SU->NodeNum] = BestPos; |
1808 | } |
1809 | LastLowLatencyPos = BestPos; |
1810 | if (IsLowLatencyUser) |
1811 | LastLowLatencyUser = BestPos; |
1812 | } else if (IsLowLatencyUser) { |
1813 | LastLowLatencyUser = i; |
    // Also move the COPY instructions that the low latency
    // instructions depend on.
1816 | } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) { |
1817 | bool CopyForLowLat = false; |
1818 | for (SDep& SuccDep : SU->Succs) { |
1819 | SUnit *Succ = SuccDep.getSUnit(); |
1820 | if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) |
1821 | continue; |
      if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
1823 | CopyForLowLat = true; |
1824 | } |
1825 | } |
1826 | if (!CopyForLowLat) |
1827 | continue; |
1828 | if (MinPos < i) { |
1829 | for (unsigned u = i; u > MinPos; --u) { |
1830 | ++ScheduledSUnitsInv[ScheduledSUnits[u-1]]; |
1831 | ScheduledSUnits[u] = ScheduledSUnits[u-1]; |
1832 | } |
1833 | ScheduledSUnits[MinPos] = SU->NodeNum; |
1834 | ScheduledSUnitsInv[SU->NodeNum] = MinPos; |
1835 | } |
1836 | } |
1837 | } |
1838 | } |
1839 | |
1840 | void SIScheduleDAGMI::restoreSULinksLeft() { |
1841 | for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { |
1842 | SUnits[i].isScheduled = false; |
1843 | SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft; |
1844 | SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft; |
1845 | SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft; |
1846 | SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft; |
1847 | } |
1848 | } |
1849 | |
1850 | // Return the Vgpr and Sgpr usage corresponding to some virtual registers. |
1851 | template<typename _Iterator> void |
1852 | SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End, |
1853 | unsigned &VgprUsage, unsigned &SgprUsage) { |
1854 | VgprUsage = 0; |
1855 | SgprUsage = 0; |
1856 | for (_Iterator RegI = First; RegI != End; ++RegI) { |
1857 | Register Reg = *RegI; |
1858 | // For now only track virtual registers |
1859 | if (!Reg.isVirtual()) |
1860 | continue; |
    PSetIterator PSetI = MRI.getPressureSets(Reg);
1862 | for (; PSetI.isValid(); ++PSetI) { |
1863 | if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32) |
1864 | VgprUsage += PSetI.getWeight(); |
1865 | else if (*PSetI == AMDGPU::RegisterPressureSets::SReg_32) |
1866 | SgprUsage += PSetI.getWeight(); |
1867 | } |
1868 | } |
1869 | } |
1870 | |
1871 | void SIScheduleDAGMI::schedule() |
1872 | { |
1873 | SmallVector<SUnit*, 8> TopRoots, BotRoots; |
1874 | SIScheduleBlockResult Best, Temp; |
  LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");
1876 | |
1877 | buildDAGWithRegPressure(); |
1878 | postProcessDAG(); |
1879 | |
1880 | LLVM_DEBUG(dump()); |
1881 | if (PrintDAGs) |
1882 | dump(); |
1883 | if (ViewMISchedDAGs) |
1884 | viewGraph(); |
1885 | |
1886 | topologicalSort(); |
1887 | findRootsAndBiasEdges(TopRoots, BotRoots); |
1888 | // We reuse several ScheduleDAGMI and ScheduleDAGMILive |
1889 | // functions, but to make them happy we must initialize |
1890 | // the default Scheduler implementation (even if we do not |
1891 | // run it) |
  SchedImpl->initialize(this);
1893 | initQueues(TopRoots, BotRoots); |
1894 | |
1895 | // Fill some stats to help scheduling. |
1896 | |
1897 | SUnitsLinksBackup = SUnits; |
1898 | IsLowLatencySU.clear(); |
1899 | LowLatencyOffset.clear(); |
1900 | IsHighLatencySU.clear(); |
1901 | |
  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);
1905 | |
1906 | for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { |
1907 | SUnit *SU = &SUnits[i]; |
1908 | const MachineOperand *BaseLatOp; |
1909 | int64_t OffLatReg; |
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
1911 | IsLowLatencySU[i] = 1; |
1912 | bool OffsetIsScalable; |
      if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
                                         OffsetIsScalable, TRI))
1915 | LowLatencyOffset[i] = OffLatReg; |
    } else if (SITII->isHighLatencyDef(SU->getInstr()->getOpcode()))
1917 | IsHighLatencySU[i] = 1; |
1918 | } |
1919 | |
1920 | SIScheduler Scheduler(this); |
  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
1923 | |
  // If VGPR usage is extremely high, try other well-performing variants
  // that could lead to lower VGPR usage.
1926 | if (Best.MaxVGPRUsage > 180) { |
1927 | static const std::pair<SISchedulerBlockCreatorVariant, |
1928 | SISchedulerBlockSchedulerVariant> |
1929 | Variants[] = { |
1930 | { LatenciesAlone, BlockRegUsageLatency }, |
1931 | // { LatenciesAlone, BlockRegUsage }, |
1932 | { LatenciesGrouped, BlockLatencyRegUsage }, |
1933 | // { LatenciesGrouped, BlockRegUsageLatency }, |
1934 | // { LatenciesGrouped, BlockRegUsage }, |
1935 | { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, |
1936 | // { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, |
1937 | // { LatenciesAlonePlusConsecutive, BlockRegUsage } |
1938 | }; |
1939 | for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { |
      Temp = Scheduler.scheduleVariant(v.first, v.second);
1941 | if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) |
1942 | Best = Temp; |
1943 | } |
1944 | } |
  // If VGPR usage is still extremely high, we may spill. Try other variants
  // that perform worse, but that could lead to lower VGPR usage.
1947 | if (Best.MaxVGPRUsage > 200) { |
1948 | static const std::pair<SISchedulerBlockCreatorVariant, |
1949 | SISchedulerBlockSchedulerVariant> |
1950 | Variants[] = { |
1951 | // { LatenciesAlone, BlockRegUsageLatency }, |
1952 | { LatenciesAlone, BlockRegUsage }, |
1953 | // { LatenciesGrouped, BlockLatencyRegUsage }, |
1954 | { LatenciesGrouped, BlockRegUsageLatency }, |
1955 | { LatenciesGrouped, BlockRegUsage }, |
1956 | // { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, |
1957 | { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, |
1958 | { LatenciesAlonePlusConsecutive, BlockRegUsage } |
1959 | }; |
1960 | for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { |
      Temp = Scheduler.scheduleVariant(v.first, v.second);
1962 | if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) |
1963 | Best = Temp; |
1964 | } |
1965 | } |
1966 | |
1967 | ScheduledSUnits = Best.SUs; |
  ScheduledSUnitsInv.resize(SUnits.size());
1969 | |
1970 | for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { |
1971 | ScheduledSUnitsInv[ScheduledSUnits[i]] = i; |
1972 | } |
1973 | |
1974 | moveLowLatencies(); |
1975 | |
1976 | // Tell the outside world about the result of the scheduling. |
1977 | |
  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
1979 | TopRPTracker.setPos(CurrentTop); |
1980 | |
1981 | for (unsigned I : ScheduledSUnits) { |
1982 | SUnit *SU = &SUnits[I]; |
1983 | |
    scheduleMI(SU, true);
1985 | |
1986 | LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " |
1987 | << *SU->getInstr()); |
1988 | } |
1989 | |
  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
1991 | |
1992 | placeDebugValues(); |
1993 | |
1994 | LLVM_DEBUG({ |
1995 | dbgs() << "*** Final schedule for " |
           << printMBBReference(*begin()->getParent()) << " ***\n";
1997 | dumpSchedule(); |
1998 | dbgs() << '\n'; |
1999 | }); |
2000 | } |
2001 | |