WindowScheduler.cpp source code [llvm_projects/llvm/lib/CodeGen/WindowScheduler.cpp]

1	//======----------- WindowScheduler.cpp - window scheduler -------------======//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// An implementation of the Window Scheduling software pipelining algorithm.
10	//
11	// The fundamental concept of the window scheduling algorithm involves folding
12	// the original MBB at a specific position, followed by list scheduling on the
13	// folded MIs. The optimal scheduling result is then chosen from various folding
14	// positions as the final scheduling outcome.
15	//
16	// The primary challenge in this algorithm lies in generating the folded MIs and
17	// establishing their dependencies. We have innovatively employed a new MBB,
18	// created by copying the original MBB three times, known as TripleMBB. This
19	// TripleMBB enables the convenient implementation of MI folding and dependency
20	// establishment. To facilitate the algorithm's implementation, we have also
21	// devised data structures such as OriMIs, TriMIs, TriToOri, and OriToCycle.
22	//
23	// Another challenge in the algorithm is the scheduling of phis. Semantically,
24	// it is difficult to place the phis in the window and perform list scheduling.
25	// Therefore, we schedule these phis separately after each list scheduling.
26	//
27	// The provided implementation is designed for use before the Register Allocator
28	// (RA). If the target requires implementation after RA, it is recommended to
29	// reimplement analyseII(), schedulePhi(), and expand(). Additionally,
30	// target-specific logic can be added in initialize(), preProcess(), and
31	// postProcess().
32	//
33	// Lastly, it is worth mentioning that getSearchIndexes() is an important
34	// function. We have experimented with more complex heuristics on downstream
35	// target and achieved favorable results.
36	//
37	//===----------------------------------------------------------------------===//
38	#include "llvm/CodeGen/WindowScheduler.h"
39	#include "llvm/ADT/Statistic.h"
40	#include "llvm/CodeGen/LiveIntervals.h"
41	#include "llvm/CodeGen/MachineLoopInfo.h"
42	#include "llvm/CodeGen/MachinePipeliner.h"
43	#include "llvm/CodeGen/ModuloSchedule.h"
44	#include "llvm/CodeGen/TargetPassConfig.h"
45	#include "llvm/Support/CommandLine.h"
46	#include "llvm/Support/Debug.h"
47	#include "llvm/Support/TimeProfiler.h"
48
49	using namespace llvm;
50
51	#define DEBUG_TYPE "pipeliner"
52
53	namespace {
54	STATISTIC(NumTryWindowSchedule,
55	"Number of loops that we attempt to use window scheduling");
56	STATISTIC(NumTryWindowSearch,
57	"Number of times that we run list schedule in the window scheduling");
58	STATISTIC(NumWindowSchedule,
59	"Number of loops that we successfully use window scheduling");
60	STATISTIC(NumFailAnalyseII,
61	"Window scheduling abort due to the failure of the II analysis");
62
63	cl::opt<unsigned>
64	WindowSearchNum("window-search-num",
65	cl::desc ("The number of searches per loop in the window "
66	"algorithm. 0 means no search number limit."),
67	cl::Hidden, cl::init(Val: `6`));
68
69	cl::opt<unsigned> WindowSearchRatio(
70	"window-search-ratio",
71	cl::desc ("The ratio of searches per loop in the window algorithm. 100 "
72	"means search all positions in the loop, while 0 means not "
73	"performing any search."),
74	cl::Hidden, cl::init(Val: `40`));
75
76	cl::opt<unsigned> WindowIICoeff(
77	"window-ii-coeff",
78	cl::desc (
79	"The coefficient used when initializing II in the window algorithm."),
80	cl::Hidden, cl::init(Val: `5`));
81
82	cl::opt<unsigned> WindowRegionLimit(
83	"window-region-limit",
84	cl::desc (
85	"The lower limit of the scheduling region in the window algorithm."),
86	cl::Hidden, cl::init(Val: `3`));
87
88	cl::opt<unsigned> WindowDiffLimit(
89	"window-diff-limit",
90	cl::desc ("The lower limit of the difference between best II and base II in "
91	"the window algorithm. If the difference is smaller than "
92	"this lower limit, window scheduling will not be performed."),
93	cl::Hidden, cl::init(Val: `2`));
94	} // namespace
95
96	// WindowIILimit serves as an indicator of abnormal scheduling results and could
97	// potentially be referenced by the derived target window scheduler.
98	cl::opt<unsigned>
99	WindowIILimit("window-ii-limit",
100	cl::desc ("The upper limit of II in the window algorithm."),
101	cl::Hidden, cl::init(Val: `1000`));
102
103	WindowScheduler::WindowScheduler(MachineSchedContext *C, MachineLoop &ML)
104	: Context(C), MF(C->MF), MBB(ML.getHeader()), Loop(ML),
105	Subtarget(&MF->getSubtarget()), TII(Subtarget->getInstrInfo()),
106	TRI(Subtarget->getRegisterInfo()), MRI(&MF->getRegInfo()) {
107	TripleDAG = std::unique_ptr<ScheduleDAGInstrs>(
108	createMachineScheduler(/OnlyBuildGraph=/true));
109	}
110
111	bool WindowScheduler::run() {
112	if (!initialize()) {
113	LLVM_DEBUG(dbgs() << "The WindowScheduler failed to initialize!\n");
114	return false;
115	}
116	// The window algorithm is time-consuming, and its compilation time should be
117	// taken into consideration.
118	TimeTraceScope Scope("WindowSearch");
119	++NumTryWindowSchedule;
120	// Performing the relevant processing before window scheduling.
121	preProcess();
122	// The main window scheduling begins.
123	std::unique_ptr<ScheduleDAGInstrs> SchedDAG(createMachineScheduler());
124	auto SearchIndexes = getSearchIndexes(SearchNum: WindowSearchNum, SearchRatio: WindowSearchRatio);
125	for (unsigned Idx : SearchIndexes) {
126	OriToCycle.clear();
127	++NumTryWindowSearch;
128	// The scheduling starts with non-phi instruction, so SchedPhiNum needs to
129	// be added to Idx.
130	unsigned Offset = Idx + SchedPhiNum;
131	auto Range = getScheduleRange(Offset, Num: SchedInstrNum);
132	SchedDAG ->startBlock(BB: MBB);
133	SchedDAG ->enterRegion(bb: MBB, begin: Range.begin(), end: Range.end(), regioninstrs: SchedInstrNum);
134	SchedDAG ->schedule();
135	LLVM_DEBUG(SchedDAG->dump());
136	unsigned II = analyseII(DAG&: *SchedDAG, Offset);
137	if (II == WindowIILimit) {
138	restoreTripleMBB();
139	LLVM_DEBUG(dbgs() << "Can't find a valid II. Keep searching...\n");
140	++NumFailAnalyseII;
141	continue;
142	}
143	schedulePhi(Offset, II);
144	updateScheduleResult(Offset, II);
145	restoreTripleMBB();
146	LLVM_DEBUG(dbgs() << "Current window Offset is " << Offset << " and II is "
147	<< II << ".\n");
148	}
149	// Performing the relevant processing after window scheduling.
150	postProcess();
151	// Check whether the scheduling result is valid.
152	if (!isScheduleValid()) {
153	LLVM_DEBUG(dbgs() << "Window scheduling is not needed!\n");
154	return false;
155	}
156	LLVM_DEBUG(dbgs() << "\nBest window offset is " << BestOffset
157	<< " and Best II is " << BestII << ".\n");
158	// Expand the scheduling result to prologue, kernel, and epilogue.
159	expand();
160	++NumWindowSchedule;
161	return true;
162	}
163
164	ScheduleDAGInstrs *
165	WindowScheduler::createMachineScheduler(bool OnlyBuildGraph) {
166	return OnlyBuildGraph
167	? new ScheduleDAGMI (
168	Context, std::make_unique<PostGenericScheduler>(args&: Context),
169	true)
170	: Context->PassConfig->createMachineScheduler(C: Context);
171	}
172
173	bool WindowScheduler::initialize() {
174	if (!Subtarget->enableWindowScheduler()) {
175	LLVM_DEBUG(dbgs() << "Target disables the window scheduling!\n");
176	return false;
177	}
178	// Initialized the member variables used by window algorithm.
179	OriMIs.clear();
180	TriMIs.clear();
181	TriToOri.clear();
182	OriToCycle.clear();
183	SchedResult.clear();
184	SchedPhiNum = `0`;
185	SchedInstrNum = `0`;
186	BestII = UINT_MAX;
187	BestOffset = `0`;
188	BaseII = `0`;
189	// List scheduling used in the window algorithm depends on LiveIntervals.
190	if (!Context->LIS) {
191	LLVM_DEBUG(dbgs() << "There is no LiveIntervals information!\n");
192	return false;
193	}
194	// Check each MI in MBB.
195	SmallSet<Register, `8`> PrevDefs;
196	SmallSet<Register, `8`> PrevUses;
197	auto IsLoopCarried = [&](MachineInstr &Phi) {
198	// Two cases are checked here: (1)The virtual register defined by the
199	// preceding phi is used by the succeeding phi;(2)The preceding phi uses the
200	// virtual register defined by the succeeding phi.
201	if (PrevUses.count(V: Phi.getOperand(i: `0`).getReg()))
202	return true;
203	PrevDefs.insert(V: Phi.getOperand(i: `0`).getReg());
204	for (unsigned I = `1`, E = Phi.getNumOperands(); I != E; I += `2`) {
205	if (PrevDefs.count(V: Phi.getOperand(i: I).getReg()))
206	return true;
207	PrevUses.insert(V: Phi.getOperand(i: I).getReg());
208	}
209	return false;
210	};
211	auto PLI = TII->analyzeLoopForPipelining(LoopBB: MBB);
212	for (auto &MI : *MBB) {
213	if (MI.isMetaInstruction() \|\| MI.isTerminator())
214	continue;
215	if (MI.isPHI()) {
216	if (IsLoopCarried (MI)) {
217	LLVM_DEBUG(dbgs() << "Loop carried phis are not supported yet!\n");
218	return false;
219	}
220	++SchedPhiNum;
221	++BestOffset;
222	} else
223	++SchedInstrNum;
224	if (TII->isSchedulingBoundary(MI, MBB, MF: *MF)) {
225	LLVM_DEBUG(
226	dbgs() << "Boundary MI is not allowed in window scheduling!\n");
227	return false;
228	}
229	if (PLI ->shouldIgnoreForPipelining(MI: &MI)) {
230	LLVM_DEBUG(dbgs() << "Special MI defined by target is not allowed in "
231	"window scheduling!\n");
232	return false;
233	}
234	for (auto &Def : MI.all_defs())
235	if (Def.isReg() && Def.getReg().isPhysical()) {
236	LLVM_DEBUG(dbgs() << "Physical registers are not supported in "
237	"window scheduling!\n");
238	return false;
239	}
240	}
241	if (SchedInstrNum <= WindowRegionLimit) {
242	LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n");
243	return false;
244	}
245	return true;
246	}
247
248	void WindowScheduler::preProcess() {
249	// Prior to window scheduling, it's necessary to backup the original MBB,
250	// generate a new TripleMBB, and build a TripleDAG based on the TripleMBB.
251	backupMBB();
252	generateTripleMBB();
253	TripleDAG ->startBlock(BB: MBB);
254	TripleDAG ->enterRegion(
255	bb: MBB, begin: MBB->begin(), end: MBB->getFirstTerminator(),
256	regioninstrs: std::distance(first: MBB->begin(), last: MBB->getFirstTerminator()));
257	TripleDAG ->buildSchedGraph(AA: Context->AA);
258	}
259
260	void WindowScheduler::postProcess() {
261	// After window scheduling, it's necessary to clear the TripleDAG and restore
262	// to the original MBB.
263	TripleDAG ->exitRegion();
264	TripleDAG ->finishBlock();
265	restoreMBB();
266	}
267
268	void WindowScheduler::backupMBB() {
269	for (auto &MI : MBB->instrs())
270	OriMIs.push_back(Elt: &MI);
271	// Remove MIs and the corresponding live intervals.
272	for (auto &MI : make_early_inc_range(Range&: *MBB)) {
273	Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, AllowBundled: true);
274	MBB->remove(I: &MI);
275	}
276	}
277
278	void WindowScheduler::restoreMBB() {
279	// Erase MIs and the corresponding live intervals.
280	for (auto &MI : make_early_inc_range(Range&: *MBB)) {
281	Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, AllowBundled: true);
282	MI.eraseFromParent();
283	}
284	// Restore MBB to the state before window scheduling.
285	for (auto *MI : OriMIs)
286	MBB->push_back(MI);
287	updateLiveIntervals();
288	}
289
290	void WindowScheduler::generateTripleMBB() {
291	const unsigned DuplicateNum = `3`;
292	TriMIs.clear();
293	TriToOri.clear();
294	assert(OriMIs.size() > `0` && "The Original MIs were not backed up!");
295	// Step 1: Performing the first copy of MBB instructions, excluding
296	// terminators. At the same time, we back up the anti-register of phis.
297	// DefPairs hold the old and new define register pairs.
298	DenseMap<Register, Register> DefPairs;
299	for (auto *MI : OriMIs) {
300	if (MI->isMetaInstruction() \|\| MI->isTerminator())
301	continue;
302	if (MI->isPHI())
303	if (Register AntiReg = getAntiRegister(Phi: MI))
304	DefPairs [MI->getOperand(i: `0`).getReg()] = AntiReg;
305	auto *NewMI = MF->CloneMachineInstr(Orig: MI);
306	MBB->push_back(MI: NewMI);
307	TriMIs.push_back(Elt: NewMI);
308	TriToOri [NewMI] = MI;
309	}
310	// Step 2: Performing the remaining two copies of MBB instructions excluding
311	// phis, and the last one contains terminators. At the same time, registers
312	// are updated accordingly.
313	for (size_t Cnt = `1`; Cnt < DuplicateNum; ++Cnt) {
314	for (auto *MI : OriMIs) {
315	if (MI->isPHI() \|\| MI->isMetaInstruction() \|\|
316	(MI->isTerminator() && Cnt < DuplicateNum - `1`))
317	continue;
318	auto *NewMI = MF->CloneMachineInstr(Orig: MI);
319	DenseMap<Register, Register> NewDefs;
320	// New defines are updated.
321	for (auto MO : NewMI->all_defs())
322	if (MO.isReg() && MO.getReg().isVirtual()) {
323	Register NewDef =
324	MRI->createVirtualRegister(RegClass: MRI->getRegClass(Reg: MO.getReg()));
325	NewMI->substituteRegister(FromReg: MO.getReg(), ToReg: NewDef, SubIdx: `0`, RegInfo: *TRI);
326	NewDefs [MO.getReg()] = NewDef;
327	}
328	// New uses are updated.
329	for (auto DefRegPair : DefPairs)
330	if (NewMI->readsRegister(Reg: DefRegPair.first, TRI)) {
331	Register NewUse = DefRegPair.second;
332	// Note the update process for '%1 -> %9' in '%10 = sub i32 %9, %3':
333	//
334	// BB.3: DefPairs
335	// ==================================
336	// %1 = phi i32 [%2, %BB.1], [%7, %BB.3] (%1,%7)
337	// ...
338	// ==================================
339	// ...
340	// %4 = sub i32 %1, %3
341	// ...
342	// %7 = add i32 %5, %6
343	// ...
344	// ----------------------------------
345	// ...
346	// %8 = sub i32 %7, %3 (%1,%7),(%4,%8)
347	// ...
348	// %9 = add i32 %5, %6 (%1,%7),(%4,%8),(%7,%9)
349	// ...
350	// ----------------------------------
351	// ...
352	// %10 = sub i32 %9, %3 (%1,%7),(%4,%10),(%7,%9)
353	// ... ^
354	// %11 = add i32 %5, %6 (%1,%7),(%4,%10),(%7,%11)
355	// ...
356	// ==================================
357	// < Terminators >
358	// ==================================
359	if (DefPairs.count(Val: NewUse))
360	NewUse = DefPairs [NewUse];
361	NewMI->substituteRegister(FromReg: DefRegPair.first, ToReg: NewUse, SubIdx: `0`, RegInfo: *TRI);
362	}
363	// DefPairs is updated at last.
364	for (auto &NewDef : NewDefs)
365	DefPairs [NewDef.first] = NewDef.second;
366	MBB->push_back(MI: NewMI);
367	TriMIs.push_back(Elt: NewMI);
368	TriToOri [NewMI] = MI;
369	}
370	}
371	// Step 3: The registers used by phis are updated, and they are generated in
372	// the third copy of MBB.
373	// In the privious example, the old phi is:
374	// %1 = phi i32 [%2, %BB.1], [%7, %BB.3]
375	// The new phi is:
376	// %1 = phi i32 [%2, %BB.1], [%11, %BB.3]
377	for (auto &Phi : MBB->phis()) {
378	for (auto DefRegPair : DefPairs)
379	if (Phi.readsRegister(Reg: DefRegPair.first, TRI))
380	Phi.substituteRegister(FromReg: DefRegPair.first, ToReg: DefRegPair.second, SubIdx: `0`, RegInfo: *TRI);
381	}
382	updateLiveIntervals();
383	}
384
385	void WindowScheduler::restoreTripleMBB() {
386	// After list scheduling, the MBB is restored in one traversal.
387	for (size_t I = `0`; I < TriMIs.size(); ++I) {
388	auto *MI = TriMIs [I];
389	auto OldPos = MBB->begin();
390	std::advance(i&: OldPos, n: I);
391	auto CurPos = MI->getIterator();
392	if (CurPos != OldPos) {
393	MBB->splice(Where: OldPos, Other: MBB, From: CurPos);
394	Context->LIS->handleMove(MI&: MI, /UpdateFlags=/*false);
395	}
396	}
397	}
398
399	SmallVector<unsigned> WindowScheduler::getSearchIndexes(unsigned SearchNum,
400	unsigned SearchRatio) {
401	// We use SearchRatio to get the index range, and then evenly get the indexes
402	// according to the SearchNum. This is a simple huristic. Depending on the
403	// characteristics of the target, more complex algorithms can be used for both
404	// performance and compilation time.
405	assert(SearchRatio <= `100` && "SearchRatio should be equal or less than 100!");
406	unsigned MaxIdx = SchedInstrNum * SearchRatio / `100`;
407	unsigned Step = SearchNum > `0` && SearchNum <= MaxIdx ? MaxIdx / SearchNum : `1`;
408	SmallVector<unsigned> SearchIndexes;
409	for (unsigned Idx = `0`; Idx < MaxIdx; Idx += Step)
410	SearchIndexes.push_back(Elt: Idx);
411	return SearchIndexes;
412	}
413
414	int WindowScheduler::getEstimatedII(ScheduleDAGInstrs &DAG) {
415	// Sometimes MaxDepth is 0, so it should be limited to the minimum of 1.
416	unsigned MaxDepth = `1`;
417	for (auto &SU : DAG.SUnits)
418	MaxDepth = std::max(a: SU.getDepth() + SU.Latency, b: MaxDepth);
419	return MaxDepth * WindowIICoeff;
420	}
421
422	int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG,
423	unsigned Offset) {
424	int InitII = getEstimatedII(DAG);
425	ResourceManager RM(Subtarget, &DAG);
426	RM.init(II: InitII);
427	// ResourceManager and DAG are used to calculate the maximum cycle for the
428	// scheduled MIs. Since MIs in the Region have already been scheduled, the
429	// emit cycles can be estimated in order here.
430	int CurCycle = `0`;
431	auto Range = getScheduleRange(Offset, Num: SchedInstrNum);
432	for (auto &MI : Range) {
433	auto *SU = DAG.getSUnit(MI: &MI);
434	int ExpectCycle = CurCycle;
435	// The predecessors of current MI determine its earliest issue cycle.
436	for (auto &Pred : SU->Preds) {
437	if (Pred.isWeak())
438	continue;
439	auto *PredMI = Pred.getSUnit()->getInstr();
440	int PredCycle = getOriCycle(NewMI: PredMI);
441	ExpectCycle = std::max(a: ExpectCycle, b: PredCycle + (int)Pred.getLatency());
442	}
443	// Zero cost instructions do not need to check resource.
444	if (!TII->isZeroCost(Opcode: MI.getOpcode())) {
445	// ResourceManager can be used to detect resource conflicts between the
446	// current MI and the previously inserted MIs.
447	while (!RM.canReserveResources(SU&: *SU, Cycle: CurCycle) \|\| CurCycle < ExpectCycle) {
448	++CurCycle;
449	if (CurCycle == (int)WindowIILimit)
450	return CurCycle;
451	}
452	RM.reserveResources(SU&: *SU, Cycle: CurCycle);
453	}
454	OriToCycle [getOriMI(NewMI: &MI)] = CurCycle;
455	LLVM_DEBUG(dbgs() << "\tCycle " << CurCycle << " [S."
456	<< getOriStage(getOriMI(&MI), Offset) << "]: " << MI);
457	}
458	LLVM_DEBUG(dbgs() << "MaxCycle is " << CurCycle << ".\n");
459	return CurCycle;
460	}
461
462	// By utilizing TripleDAG, we can easily establish dependencies between A and B.
463	// Based on the MaxCycle and the issue cycle of A and B, we can determine
464	// whether it is necessary to add a stall cycle. This is because, without
465	// inserting the stall cycle, the latency constraint between A and B cannot be
466	// satisfied. The details are as follows:
467	//
468	// New MBB:
469	// ========================================
470	// < Phis >
471	// ======================================== (sliding direction)
472	// MBB copy 1 \|
473	// V
474	//
475	// ~~~~~~~~~~~~~~~~~~~\|~~~~~~~~~~~~~~~~~~~~ ----schedule window-----
476	// \| \|
477	// ===================V==================== \|
478	// MBB copy 2 < MI B > \|
479	// \|
480	// < MI A > V
481	// ~~~~~~~~~~~~~~~~~~~:~~~~~~~~~~~~~~~~~~~~ ------------------------
482	// :
483	// ===================V====================
484	// MBB copy 3 < MI B'>
485	//
486	//
487	//
488	//
489	// ========================================
490	// < Terminators >
491	// ========================================
492	int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
493	int MaxStallCycle = `0`;
494	int CurrentII = MaxCycle + `1`;
495	auto Range = getScheduleRange(Offset, Num: SchedInstrNum);
496	for (auto &MI : Range) {
497	auto *SU = TripleDAG ->getSUnit(MI: &MI);
498	int DefCycle = getOriCycle(NewMI: &MI);
499	for (auto &Succ : SU->Succs) {
500	if (Succ.isWeak() \|\| Succ.getSUnit() == &TripleDAG ->ExitSU)
501	continue;
502	// If the expected cycle does not exceed CurrentII, no check is needed.
503	if (DefCycle + (int)Succ.getLatency() <= CurrentII)
504	continue;
505	// If the cycle of the scheduled MI A is less than that of the scheduled
506	// MI B, the scheduling will fail because the lifetime of the
507	// corresponding register exceeds II.
508	auto *SuccMI = Succ.getSUnit()->getInstr();
509	int UseCycle = getOriCycle(NewMI: SuccMI);
510	if (DefCycle < UseCycle)
511	return WindowIILimit;
512	// Get the stall cycle introduced by the register between two trips.
513	int StallCycle = DefCycle + (int)Succ.getLatency() - CurrentII - UseCycle;
514	MaxStallCycle = std::max(a: MaxStallCycle, b: StallCycle);
515	}
516	}
517	LLVM_DEBUG(dbgs() << "MaxStallCycle is " << MaxStallCycle << ".\n");
518	return MaxStallCycle;
519	}
520
521	unsigned WindowScheduler::analyseII(ScheduleDAGInstrs &DAG, unsigned Offset) {
522	LLVM_DEBUG(dbgs() << "Start analyzing II:\n");
523	int MaxCycle = calculateMaxCycle(DAG, Offset);
524	if (MaxCycle == (int)WindowIILimit)
525	return MaxCycle;
526	int StallCycle = calculateStallCycle(Offset, MaxCycle);
527	if (StallCycle == (int)WindowIILimit)
528	return StallCycle;
529	// The value of II is equal to the maximum execution cycle plus 1.
530	return MaxCycle + StallCycle + `1`;
531	}
532
533	void WindowScheduler::schedulePhi(int Offset, unsigned &II) {
534	LLVM_DEBUG(dbgs() << "Start scheduling Phis:\n");
535	for (auto &Phi : MBB->phis()) {
536	int LateCycle = INT_MAX;
537	auto *SU = TripleDAG ->getSUnit(MI: &Phi);
538	for (auto &Succ : SU->Succs) {
539	// Phi doesn't have any Anti successors.
540	if (Succ.getKind() != SDep::Data)
541	continue;
542	// Phi is scheduled before the successor of stage 0. The issue cycle of
543	// phi is the latest cycle in this interval.
544	auto *SuccMI = Succ.getSUnit()->getInstr();
545	int Cycle = getOriCycle(NewMI: SuccMI);
546	if (getOriStage(OriMI: getOriMI(NewMI: SuccMI), Offset) == `0`)
547	LateCycle = std::min(a: LateCycle, b: Cycle);
548	}
549	// The anti-dependency of phi need to be handled separately in the same way.
550	if (Register AntiReg = getAntiRegister(Phi: &Phi)) {
551	auto *AntiMI = MRI->getVRegDef(Reg: AntiReg);
552	// AntiReg may be defined outside the kernel MBB.
553	if (AntiMI->getParent() == MBB) {
554	auto AntiCycle = getOriCycle(NewMI: AntiMI);
555	if (getOriStage(OriMI: getOriMI(NewMI: AntiMI), Offset) == `0`)
556	LateCycle = std::min(a: LateCycle, b: AntiCycle);
557	}
558	}
559	// If there is no limit to the late cycle, a default value is given.
560	if (LateCycle == INT_MAX)
561	LateCycle = (int)(II - `1`);
562	LLVM_DEBUG(dbgs() << "\tCycle range [0, " << LateCycle << "] " << Phi);
563	// The issue cycle of phi is set to the latest cycle in the interval.
564	auto *OriPhi = getOriMI(NewMI: &Phi);
565	OriToCycle [OriPhi] = LateCycle;
566	}
567	}
568
569	DenseMap<MachineInstr , int> WindowScheduler::getIssueOrder(unsigned* Offset,
570	unsigned II) {
571	// At each issue cycle, phi is placed before MIs in stage 0. So the simplest
572	// way is to put phi at the beginning of the current cycle.
573	DenseMap<int, SmallVector<MachineInstr *>> CycleToMIs;
574	auto Range = getScheduleRange(Offset, Num: SchedInstrNum);
575	for (auto &Phi : MBB->phis())
576	CycleToMIs [getOriCycle(NewMI: &Phi)].push_back(Elt: getOriMI(NewMI: &Phi));
577	for (auto &MI : Range)
578	CycleToMIs [getOriCycle(NewMI: &MI)].push_back(Elt: getOriMI(NewMI: &MI));
579	// Each MI is assigned a separate ordered Id, which is used as a sort marker
580	// in the following expand process.
581	DenseMap<MachineInstr , int*> IssueOrder;
582	int Id = `0`;
583	for (int Cycle = `0`; Cycle < (int)II; ++Cycle) {
584	if (!CycleToMIs.count(Val: Cycle))
585	continue;
586	for (auto *MI : CycleToMIs [Cycle])
587	IssueOrder [MI] = Id++;
588	}
589	return IssueOrder;
590	}
591
592	void WindowScheduler::updateScheduleResult(unsigned Offset, unsigned II) {
593	// At the first update, Offset is equal to SchedPhiNum. At this time, only
594	// BestII, BestOffset, and BaseII need to be updated.
595	if (Offset == SchedPhiNum) {
596	BestII = II;
597	BestOffset = SchedPhiNum;
598	BaseII = II;
599	return;
600	}
601	// The update will only continue if the II is smaller than BestII and the II
602	// is sufficiently small.
603	if ((II >= BestII) \|\| (II + WindowDiffLimit > BaseII))
604	return;
605	BestII = II;
606	BestOffset = Offset;
607	// Record the result of the current list scheduling, noting that each MI is
608	// stored unordered in SchedResult.
609	SchedResult.clear();
610	auto IssueOrder = getIssueOrder(Offset, II);
611	for (auto &Pair : OriToCycle) {
612	assert(IssueOrder.count(Pair.first) && "Cannot find original MI!");
613	SchedResult.push_back(Elt: std::make_tuple(args&: Pair.first, args&: Pair.second,
614	args: getOriStage(OriMI: Pair.first, Offset),
615	args&: IssueOrder [Pair.first]));
616	}
617	}
618
619	void WindowScheduler::expand() {
620	// The MIs in the SchedResult are sorted by the issue order ID.
621	llvm::stable_sort(Range&: SchedResult,
622	C: [](const std::tuple<MachineInstr , int, int, int*> &A,
623	const std::tuple<MachineInstr , int, int, int*> &B) {
624	return std::get<`3`>(t: A) < std::get<`3`>(t: B);
625	});
626	// Use the scheduling infrastructure for expansion, noting that InstrChanges
627	// is not supported here.
628	DenseMap<MachineInstr , int*> Cycles, Stages;
629	std::vector<MachineInstr *> OrderedInsts;
630	for (auto &Info : SchedResult) {
631	auto *MI = std::get<`0`>(t&: Info);
632	OrderedInsts.push_back(x: MI);
633	Cycles [MI] = std::get<`1`>(t&: Info);
634	Stages [MI] = std::get<`2`>(t&: Info);
635	LLVM_DEBUG(dbgs() << "\tCycle " << Cycles[MI] << " [S." << Stages[MI]
636	<< "]: " << *MI);
637	}
638	ModuloSchedule MS(*MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
639	std::move(Stages));
640	ModuloScheduleExpander MSE(MF, MS, Context->LIS,
641	ModuloScheduleExpander::InstrChangesTy ());
642	MSE.expand();
643	MSE.cleanup();
644	}
645
646	void WindowScheduler::updateLiveIntervals() {
647	SmallVector<Register, `128`> UsedRegs;
648	for (MachineInstr &MI : *MBB)
649	for (const MachineOperand &MO : MI.operands()) {
650	if (!MO.isReg() \|\| MO.getReg() == `0`)
651	continue;
652	Register Reg = MO.getReg();
653	if (!is_contained(Range&: UsedRegs, Element: Reg))
654	UsedRegs.push_back(Elt: Reg);
655	}
656	Context->LIS->repairIntervalsInRange(MBB, Begin: MBB->begin(), End: MBB->end(), OrigRegs: UsedRegs);
657	}
658
659	iterator_range<MachineBasicBlock::iterator>
660	WindowScheduler::getScheduleRange(unsigned Offset, unsigned Num) {
661	auto RegionBegin = MBB->begin();
662	std::advance(i&: RegionBegin, n: Offset);
663	auto RegionEnd = RegionBegin;
664	std::advance(i&: RegionEnd, n: Num);
665	return make_range(x: RegionBegin, y: RegionEnd);
666	}
667
668	int WindowScheduler::getOriCycle(MachineInstr *NewMI) {
669	assert(TriToOri.count(NewMI) && "Cannot find original MI!");
670	auto *OriMI = TriToOri [NewMI];
671	assert(OriToCycle.count(OriMI) && "Cannot find schedule cycle!");
672	return OriToCycle [OriMI];
673	}
674
675	MachineInstr WindowScheduler::getOriMI(MachineInstr NewMI) {
676	assert(TriToOri.count(NewMI) && "Cannot find original MI!");
677	return TriToOri [NewMI];
678	}
679
680	unsigned WindowScheduler::getOriStage(MachineInstr OriMI, unsigned* Offset) {
681	assert(llvm::find(OriMIs, OriMI) != OriMIs.end() &&
682	"Cannot find OriMI in OriMIs!");
683	// If there is no instruction fold, all MI stages are 0.
684	if (Offset == SchedPhiNum)
685	return `0`;
686	// For those MIs with an ID less than the Offset, their stages are set to 0,
687	// while the rest are set to 1.
688	unsigned Id = `0`;
689	for (auto *MI : OriMIs) {
690	if (MI->isMetaInstruction())
691	continue;
692	if (MI == OriMI)
693	break;
694	++Id;
695	}
696	return Id >= (size_t)Offset ? `1` : `0`;
697	}
698
699	Register WindowScheduler::getAntiRegister(MachineInstr *Phi) {
700	assert(Phi->isPHI() && "Expecting PHI!");
701	Register AntiReg;
702	for (auto MO : Phi->uses()) {
703	if (MO.isReg())
704	AntiReg = MO.getReg();
705	else if (MO.isMBB() && MO.getMBB() == MBB)
706	return AntiReg;
707	}
708	return `0`;
709	}
710

Browse the source code of llvm_projects/llvm/lib/CodeGen/WindowScheduler.cpp