WindowScheduler.cpp source code [llvm_projects/llvm/lib/CodeGen/WindowScheduler.cpp]

1	//======----------- WindowScheduler.cpp - window scheduler -------------======//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// An implementation of the Window Scheduling software pipelining algorithm.
10	//
11	// The fundamental concept of the window scheduling algorithm involves folding
12	// the original MBB at a specific position, followed by list scheduling on the
13	// folded MIs. The optimal scheduling result is then chosen from various folding
14	// positions as the final scheduling outcome.
15	//
16	// The primary challenge in this algorithm lies in generating the folded MIs and
17	// establishing their dependencies. We have innovatively employed a new MBB,
18	// created by copying the original MBB three times, known as TripleMBB. This
19	// TripleMBB enables the convenient implementation of MI folding and dependency
20	// establishment. To facilitate the algorithm's implementation, we have also
21	// devised data structures such as OriMIs, TriMIs, TriToOri, and OriToCycle.
22	//
23	// Another challenge in the algorithm is the scheduling of phis. Semantically,
24	// it is difficult to place the phis in the window and perform list scheduling.
25	// Therefore, we schedule these phis separately after each list scheduling.
26	//
27	// The provided implementation is designed for use before the Register Allocator
28	// (RA). If the target requires implementation after RA, it is recommended to
29	// reimplement analyseII(), schedulePhi(), and expand(). Additionally,
30	// target-specific logic can be added in initialize(), preProcess(), and
31	// postProcess().
32	//
33	// Lastly, it is worth mentioning that getSearchIndexes() is an important
34	// function. We have experimented with more complex heuristics on downstream
35	// target and achieved favorable results.
36	//
37	//===----------------------------------------------------------------------===//
38	#include "llvm/CodeGen/WindowScheduler.h"
39	#include "llvm/ADT/Statistic.h"
40	#include "llvm/CodeGen/LiveIntervals.h"
41	#include "llvm/CodeGen/MachineLoopInfo.h"
42	#include "llvm/CodeGen/MachinePipeliner.h"
43	#include "llvm/CodeGen/ModuloSchedule.h"
44	#include "llvm/CodeGen/TargetPassConfig.h"
45	#include "llvm/Support/CommandLine.h"
46	#include "llvm/Support/Debug.h"
47	#include "llvm/Support/TimeProfiler.h"
48	#include "llvm/Target/TargetMachine.h"
49
50	using namespace llvm;
51
52	#define DEBUG_TYPE "pipeliner"
53
54	namespace {
55	STATISTIC(NumTryWindowSchedule,
56	"Number of loops that we attempt to use window scheduling");
57	STATISTIC(NumTryWindowSearch,
58	"Number of times that we run list schedule in the window scheduling");
59	STATISTIC(NumWindowSchedule,
60	"Number of loops that we successfully use window scheduling");
61	STATISTIC(NumFailAnalyseII,
62	"Window scheduling abort due to the failure of the II analysis");
63
64	cl::opt<unsigned>
65	WindowSearchNum("window-search-num",
66	cl::desc ("The number of searches per loop in the window "
67	"algorithm. 0 means no search number limit."),
68	cl::Hidden, cl::init(Val: `6`));
69
70	cl::opt<unsigned> WindowSearchRatio(
71	"window-search-ratio",
72	cl::desc ("The ratio of searches per loop in the window algorithm. 100 "
73	"means search all positions in the loop, while 0 means not "
74	"performing any search."),
75	cl::Hidden, cl::init(Val: `40`));
76
77	cl::opt<unsigned> WindowIICoeff(
78	"window-ii-coeff",
79	cl::desc (
80	"The coefficient used when initializing II in the window algorithm."),
81	cl::Hidden, cl::init(Val: `5`));
82
83	cl::opt<unsigned> WindowRegionLimit(
84	"window-region-limit",
85	cl::desc (
86	"The lower limit of the scheduling region in the window algorithm."),
87	cl::Hidden, cl::init(Val: `3`));
88
89	cl::opt<unsigned> WindowDiffLimit(
90	"window-diff-limit",
91	cl::desc ("The lower limit of the difference between best II and base II in "
92	"the window algorithm. If the difference is smaller than "
93	"this lower limit, window scheduling will not be performed."),
94	cl::Hidden, cl::init(Val: `2`));
95	} // namespace
96
97	// WindowIILimit serves as an indicator of abnormal scheduling results and could
98	// potentially be referenced by the derived target window scheduler.
99	static cl::opt<unsigned>
100	WindowIILimit("window-ii-limit",
101	cl::desc ("The upper limit of II in the window algorithm."),
102	cl::Hidden, cl::init(Val: `1000`));
103
104	WindowScheduler::WindowScheduler(MachineSchedContext *C, MachineLoop &ML)
105	: Context(C), MF(C->MF), MBB(ML.getHeader()), Loop(ML),
106	Subtarget(&MF->getSubtarget()), TII(Subtarget->getInstrInfo()),
107	TRI(Subtarget->getRegisterInfo()), MRI(&MF->getRegInfo()) {
108	TripleDAG = std::unique_ptr<ScheduleDAGInstrs>(
109	createMachineScheduler(/OnlyBuildGraph=/true));
110	}
111
112	bool WindowScheduler::run() {
113	if (!initialize()) {
114	LLVM_DEBUG(dbgs() << "The WindowScheduler failed to initialize!\n");
115	return false;
116	}
117	// The window algorithm is time-consuming, and its compilation time should be
118	// taken into consideration.
119	TimeTraceScope Scope("WindowSearch");
120	++NumTryWindowSchedule;
121	// Performing the relevant processing before window scheduling.
122	preProcess();
123	// The main window scheduling begins.
124	std::unique_ptr<ScheduleDAGInstrs> SchedDAG(createMachineScheduler());
125	auto SearchIndexes = getSearchIndexes(SearchNum: WindowSearchNum, SearchRatio: WindowSearchRatio);
126	for (unsigned Idx : SearchIndexes) {
127	OriToCycle.clear();
128	++NumTryWindowSearch;
129	// The scheduling starts with non-phi instruction, so SchedPhiNum needs to
130	// be added to Idx.
131	unsigned Offset = Idx + SchedPhiNum;
132	auto Range = getScheduleRange(Offset, Num: SchedInstrNum);
133	SchedDAG ->startBlock(BB: MBB);
134	SchedDAG ->enterRegion(bb: MBB, begin: Range.begin(), end: Range.end(), regioninstrs: SchedInstrNum);
135	SchedDAG ->schedule();
136	LLVM_DEBUG(SchedDAG->dump());
137	unsigned II = analyseII(DAG&: *SchedDAG, Offset);
138	if (II == WindowIILimit) {
139	restoreTripleMBB();
140	LLVM_DEBUG(dbgs() << "Can't find a valid II. Keep searching...\n");
141	++NumFailAnalyseII;
142	continue;
143	}
144	schedulePhi(Offset, II);
145	updateScheduleResult(Offset, II);
146	restoreTripleMBB();
147	LLVM_DEBUG(dbgs() << "Current window Offset is " << Offset << " and II is "
148	<< II << ".\n");
149	}
150	// Performing the relevant processing after window scheduling.
151	postProcess();
152	// Check whether the scheduling result is valid.
153	if (!isScheduleValid()) {
154	LLVM_DEBUG(dbgs() << "Window scheduling is not needed!\n");
155	return false;
156	}
157	LLVM_DEBUG(dbgs() << "\nBest window offset is " << BestOffset
158	<< " and Best II is " << BestII << ".\n");
159	// Expand the scheduling result to prologue, kernel, and epilogue.
160	expand();
161	++NumWindowSchedule;
162	return true;
163	}
164
165	ScheduleDAGInstrs *
166	WindowScheduler::createMachineScheduler(bool OnlyBuildGraph) {
167	return OnlyBuildGraph
168	? new ScheduleDAGMI (
169	Context, std::make_unique<PostGenericScheduler>(args&: Context),
170	true)
171	: Context->TM->createMachineScheduler(C: Context);
172	}
173
174	bool WindowScheduler::initialize() {
175	if (!Subtarget->enableWindowScheduler()) {
176	LLVM_DEBUG(dbgs() << "Target disables the window scheduling!\n");
177	return false;
178	}
179	// Initialized the member variables used by window algorithm.
180	OriMIs.clear();
181	TriMIs.clear();
182	TriToOri.clear();
183	OriToCycle.clear();
184	SchedResult.clear();
185	SchedPhiNum = `0`;
186	SchedInstrNum = `0`;
187	BestII = UINT_MAX;
188	BestOffset = `0`;
189	BaseII = `0`;
190	// List scheduling used in the window algorithm depends on LiveIntervals.
191	if (!Context->LIS) {
192	LLVM_DEBUG(dbgs() << "There is no LiveIntervals information!\n");
193	return false;
194	}
195	// Check each MI in MBB.
196	SmallSet<Register, `8`> PrevDefs;
197	SmallSet<Register, `8`> PrevUses;
198	auto IsLoopCarried = [&](MachineInstr &Phi) {
199	// Two cases are checked here: (1)The virtual register defined by the
200	// preceding phi is used by the succeeding phi;(2)The preceding phi uses the
201	// virtual register defined by the succeeding phi.
202	if (PrevUses.count(V: Phi.getOperand(i: `0`).getReg()))
203	return true;
204	PrevDefs.insert(V: Phi.getOperand(i: `0`).getReg());
205	for (unsigned I = `1`, E = Phi.getNumOperands(); I != E; I += `2`) {
206	if (PrevDefs.count(V: Phi.getOperand(i: I).getReg()))
207	return true;
208	PrevUses.insert(V: Phi.getOperand(i: I).getReg());
209	}
210	return false;
211	};
212	auto PLI = TII->analyzeLoopForPipelining(LoopBB: MBB);
213	for (auto &MI : *MBB) {
214	if (MI.isMetaInstruction() \|\| MI.isTerminator())
215	continue;
216	if (MI.isPHI()) {
217	if (IsLoopCarried (MI)) {
218	LLVM_DEBUG(dbgs() << "Loop carried phis are not supported yet!\n");
219	return false;
220	}
221	++SchedPhiNum;
222	++BestOffset;
223	} else
224	++SchedInstrNum;
225	if (TII->isSchedulingBoundary(MI, MBB, MF: *MF)) {
226	LLVM_DEBUG(
227	dbgs() << "Boundary MI is not allowed in window scheduling!\n");
228	return false;
229	}
230	if (PLI ->shouldIgnoreForPipelining(MI: &MI)) {
231	LLVM_DEBUG(dbgs() << "Special MI defined by target is not allowed in "
232	"window scheduling!\n");
233	return false;
234	}
235	for (auto &Def : MI.all_defs())
236	if (Def.isReg() && Def.getReg().isPhysical()) {
237	LLVM_DEBUG(dbgs() << "Physical registers are not supported in "
238	"window scheduling!\n");
239	return false;
240	}
241	}
242	if (SchedInstrNum <= WindowRegionLimit) {
243	LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n");
244	return false;
245	}
246	return true;
247	}
248
249	void WindowScheduler::preProcess() {
250	// Prior to window scheduling, it's necessary to backup the original MBB,
251	// generate a new TripleMBB, and build a TripleDAG based on the TripleMBB.
252	backupMBB();
253	generateTripleMBB();
254	TripleDAG ->startBlock(BB: MBB);
255	TripleDAG ->enterRegion(
256	bb: MBB, begin: MBB->begin(), end: MBB->getFirstTerminator(),
257	regioninstrs: std::distance(first: MBB->begin(), last: MBB->getFirstTerminator()));
258	TripleDAG ->buildSchedGraph(AA: Context->AA);
259	}
260
261	void WindowScheduler::postProcess() {
262	// After window scheduling, it's necessary to clear the TripleDAG and restore
263	// to the original MBB.
264	TripleDAG ->exitRegion();
265	TripleDAG ->finishBlock();
266	restoreMBB();
267	}
268
269	void WindowScheduler::backupMBB() {
270	for (auto &MI : MBB->instrs())
271	OriMIs.push_back(Elt: &MI);
272	// Remove MIs and the corresponding live intervals.
273	for (auto &MI : make_early_inc_range(Range&: *MBB)) {
274	Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, AllowBundled: true);
275	MBB->remove(I: &MI);
276	}
277	}
278
279	void WindowScheduler::restoreMBB() {
280	// Erase MIs and the corresponding live intervals.
281	for (auto &MI : make_early_inc_range(Range&: *MBB)) {
282	Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, AllowBundled: true);
283	MI.eraseFromParent();
284	}
285	// Restore MBB to the state before window scheduling.
286	llvm::append_range(C&: *MBB, R&: OriMIs);
287	updateLiveIntervals();
288	}
289
290	void WindowScheduler::generateTripleMBB() {
291	const unsigned DuplicateNum = `3`;
292	TriMIs.clear();
293	TriToOri.clear();
294	assert(OriMIs.size() > `0` && "The Original MIs were not backed up!");
295	// Step 1: Performing the first copy of MBB instructions, excluding
296	// terminators. At the same time, we back up the anti-register of phis.
297	// DefPairs hold the old and new define register pairs.
298	DenseMap<Register, Register> DefPairs;
299	for (auto *MI : OriMIs) {
300	if (MI->isMetaInstruction() \|\| MI->isTerminator())
301	continue;
302	if (MI->isPHI())
303	if (Register AntiReg = getAntiRegister(Phi: MI))
304	DefPairs [MI->getOperand(i: `0`).getReg()] = AntiReg;
305	auto *NewMI = MF->CloneMachineInstr(Orig: MI);
306	MBB->push_back(MI: NewMI);
307	TriMIs.push_back(Elt: NewMI);
308	TriToOri [NewMI] = MI;
309	}
310	// Step 2: Performing the remaining two copies of MBB instructions excluding
311	// phis, and the last one contains terminators. At the same time, registers
312	// are updated accordingly.
313	for (size_t Cnt = `1`; Cnt < DuplicateNum; ++Cnt) {
314	for (auto *MI : OriMIs) {
315	if (MI->isPHI() \|\| MI->isMetaInstruction() \|\|
316	(MI->isTerminator() && Cnt < DuplicateNum - `1`))
317	continue;
318	auto *NewMI = MF->CloneMachineInstr(Orig: MI);
319	DenseMap<Register, Register> NewDefs;
320	// New defines are updated.
321	for (auto MO : NewMI->all_defs())
322	if (MO.isReg() && MO.getReg().isVirtual()) {
323	Register NewDef =
324	MRI->createVirtualRegister(RegClass: MRI->getRegClass(Reg: MO.getReg()));
325	NewMI->substituteRegister(FromReg: MO.getReg(), ToReg: NewDef, SubIdx: `0`, RegInfo: *TRI);
326	NewDefs [MO.getReg()] = NewDef;
327	}
328	// New uses are updated.
329	for (auto DefRegPair : DefPairs)
330	if (NewMI->readsRegister(Reg: DefRegPair.first, TRI)) {
331	Register NewUse = DefRegPair.second;
332	// Note the update process for '%1 -> %9' in '%10 = sub i32 %9, %3':
333	//
334	// BB.3: DefPairs
335	// ==================================
336	// %1 = phi i32 [%2, %BB.1], [%7, %BB.3] (%1,%7)
337	// ...
338	// ==================================
339	// ...
340	// %4 = sub i32 %1, %3
341	// ...
342	// %7 = add i32 %5, %6
343	// ...
344	// ----------------------------------
345	// ...
346	// %8 = sub i32 %7, %3 (%1,%7),(%4,%8)
347	// ...
348	// %9 = add i32 %5, %6 (%1,%7),(%4,%8),(%7,%9)
349	// ...
350	// ----------------------------------
351	// ...
352	// %10 = sub i32 %9, %3 (%1,%7),(%4,%10),(%7,%9)
353	// ... ^
354	// %11 = add i32 %5, %6 (%1,%7),(%4,%10),(%7,%11)
355	// ...
356	// ==================================
357	// < Terminators >
358	// ==================================
359	if (auto It = DefPairs.find(Val: NewUse); It != DefPairs.end())
360	NewUse = It ->second;
361	NewMI->substituteRegister(FromReg: DefRegPair.first, ToReg: NewUse, SubIdx: `0`, RegInfo: *TRI);
362	}
363	// DefPairs is updated at last.
364	for (auto &NewDef : NewDefs)
365	DefPairs [NewDef.first] = NewDef.second;
366	MBB->push_back(MI: NewMI);
367	TriMIs.push_back(Elt: NewMI);
368	TriToOri [NewMI] = MI;
369	}
370	}
371	// Step 3: The registers used by phis are updated, and they are generated in
372	// the third copy of MBB.
373	// In the privious example, the old phi is:
374	// %1 = phi i32 [%2, %BB.1], [%7, %BB.3]
375	// The new phi is:
376	// %1 = phi i32 [%2, %BB.1], [%11, %BB.3]
377	for (auto &Phi : MBB->phis()) {
378	for (auto DefRegPair : DefPairs)
379	if (Phi.readsRegister(Reg: DefRegPair.first, TRI))
380	Phi.substituteRegister(FromReg: DefRegPair.first, ToReg: DefRegPair.second, SubIdx: `0`, RegInfo: *TRI);
381	}
382	updateLiveIntervals();
383	}
384
385	void WindowScheduler::restoreTripleMBB() {
386	// After list scheduling, the MBB is restored in one traversal.
387	for (size_t I = `0`; I < TriMIs.size(); ++I) {
388	auto *MI = TriMIs [I];
389	auto OldPos = MBB->begin();
390	std::advance(i&: OldPos, n: I);
391	auto CurPos = MI->getIterator();
392	if (CurPos != OldPos) {
393	MBB->splice(Where: OldPos, Other: MBB, From: CurPos);
394	Context->LIS->handleMove(MI&: MI, /UpdateFlags=/*false);
395	}
396	}
397	}
398
399	SmallVector<unsigned> WindowScheduler::getSearchIndexes(unsigned SearchNum,
400	unsigned SearchRatio) {
401	// We use SearchRatio to get the index range, and then evenly get the indexes
402	// according to the SearchNum. This is a simple huristic. Depending on the
403	// characteristics of the target, more complex algorithms can be used for both
404	// performance and compilation time.
405	assert(SearchRatio <= `100` && "SearchRatio should be equal or less than 100!");
406	unsigned MaxIdx = SchedInstrNum * SearchRatio / `100`;
407	unsigned Step = SearchNum > `0` && SearchNum <= MaxIdx ? MaxIdx / SearchNum : `1`;
408	SmallVector<unsigned> SearchIndexes;
409	for (unsigned Idx = `0`; Idx < MaxIdx; Idx += Step)
410	SearchIndexes.push_back(Elt: Idx);
411	return SearchIndexes;
412	}
413
414	int WindowScheduler::getEstimatedII(ScheduleDAGInstrs &DAG) {
415	// Sometimes MaxDepth is 0, so it should be limited to the minimum of 1.
416	unsigned MaxDepth = `1`;
417	for (auto &SU : DAG.SUnits)
418	MaxDepth = std::max(a: SU.getDepth() + SU.Latency, b: MaxDepth);
419	return MaxDepth * WindowIICoeff;
420	}
421
422	int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG,
423	unsigned Offset) {
424	int InitII = getEstimatedII(DAG);
425	ResourceManager RM(Subtarget, &DAG);
426	RM.init(II: InitII);
427	// ResourceManager and DAG are used to calculate the maximum cycle for the
428	// scheduled MIs. Since MIs in the Region have already been scheduled, the
429	// emit cycles can be estimated in order here.
430	int CurCycle = `0`;
431	auto Range = getScheduleRange(Offset, Num: SchedInstrNum);
432	for (auto &MI : Range) {
433	auto *SU = DAG.getSUnit(MI: &MI);
434	int ExpectCycle = CurCycle;
435	// The predecessors of current MI determine its earliest issue cycle.
436	for (auto &Pred : SU->Preds) {
437	if (Pred.isWeak())
438	continue;
439	auto *PredMI = Pred.getSUnit()->getInstr();
440	int PredCycle = getOriCycle(NewMI: PredMI);
441	ExpectCycle = std::max(a: ExpectCycle, b: PredCycle + (int)Pred.getLatency());
442	}
443	// Zero cost instructions do not need to check resource.
444	if (!TII->isZeroCost(Opcode: MI.getOpcode())) {
445	// ResourceManager can be used to detect resource conflicts between the
446	// current MI and the previously inserted MIs.
447	while (!RM.canReserveResources(SU&: *SU, Cycle: CurCycle) \|\| CurCycle < ExpectCycle) {
448	++CurCycle;
449	if (CurCycle == (int)WindowIILimit)
450	return CurCycle;
451	}
452	RM.reserveResources(SU&: *SU, Cycle: CurCycle);
453	}
454	OriToCycle [getOriMI(NewMI: &MI)] = CurCycle;
455	LLVM_DEBUG(dbgs() << "\tCycle " << CurCycle << " [S."
456	<< getOriStage(getOriMI(&MI), Offset) << "]: " << MI);
457	}
458	LLVM_DEBUG(dbgs() << "MaxCycle is " << CurCycle << ".\n");
459	return CurCycle;
460	}
461
462	// By utilizing TripleDAG, we can easily establish dependencies between A and B.
463	// Based on the MaxCycle and the issue cycle of A and B, we can determine
464	// whether it is necessary to add a stall cycle. This is because, without
465	// inserting the stall cycle, the latency constraint between A and B cannot be
466	// satisfied. The details are as follows:
467	//
468	// New MBB:
469	// ========================================
470	// < Phis >
471	// ======================================== (sliding direction)
472	// MBB copy 1 \|
473	// V
474	//
475	// ~~~~~~~~~~~~~~~~~~~\|~~~~~~~~~~~~~~~~~~~~ ----schedule window-----
476	// \| \|
477	// ===================V==================== \|
478	// MBB copy 2 < MI B > \|
479	// \|
480	// < MI A > V
481	// ~~~~~~~~~~~~~~~~~~~:~~~~~~~~~~~~~~~~~~~~ ------------------------
482	// :
483	// ===================V====================
484	// MBB copy 3 < MI B'>
485	//
486	//
487	//
488	//
489	// ========================================
490	// < Terminators >
491	// ========================================
492	int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
493	int MaxStallCycle = `0`;
494	int CurrentII = MaxCycle + `1`;
495	auto Range = getScheduleRange(Offset, Num: SchedInstrNum);
496	for (auto &MI : Range) {
497	auto *SU = TripleDAG ->getSUnit(MI: &MI);
498	int DefCycle = getOriCycle(NewMI: &MI);
499	for (auto &Succ : SU->Succs) {
500	if (Succ.isWeak() \|\| Succ.getSUnit() == &TripleDAG ->ExitSU)
501	continue;
502	// If the expected cycle does not exceed CurrentII, no check is needed.
503	if (DefCycle + (int)Succ.getLatency() <= CurrentII)
504	continue;
505	// If the cycle of the scheduled MI A is less than that of the scheduled
506	// MI B, the scheduling will fail because the lifetime of the
507	// corresponding register exceeds II.
508	auto *SuccMI = Succ.getSUnit()->getInstr();
509	int UseCycle = getOriCycle(NewMI: SuccMI);
510	if (DefCycle < UseCycle)
511	return WindowIILimit;
512	// Get the stall cycle introduced by the register between two trips.
513	int StallCycle = DefCycle + (int)Succ.getLatency() - CurrentII - UseCycle;
514	MaxStallCycle = std::max(a: MaxStallCycle, b: StallCycle);
515	}
516	}
517	LLVM_DEBUG(dbgs() << "MaxStallCycle is " << MaxStallCycle << ".\n");
518	return MaxStallCycle;
519	}
520
521	unsigned WindowScheduler::analyseII(ScheduleDAGInstrs &DAG, unsigned Offset) {
522	LLVM_DEBUG(dbgs() << "Start analyzing II:\n");
523	int MaxCycle = calculateMaxCycle(DAG, Offset);
524	if (MaxCycle == (int)WindowIILimit)
525	return MaxCycle;
526	int StallCycle = calculateStallCycle(Offset, MaxCycle);
527	if (StallCycle == (int)WindowIILimit)
528	return StallCycle;
529	// The value of II is equal to the maximum execution cycle plus 1.
530	return MaxCycle + StallCycle + `1`;
531	}
532
533	void WindowScheduler::schedulePhi(int Offset, unsigned &II) {
534	LLVM_DEBUG(dbgs() << "Start scheduling Phis:\n");
535	for (auto &Phi : MBB->phis()) {
536	int LateCycle = INT_MAX;
537	auto *SU = TripleDAG ->getSUnit(MI: &Phi);
538	for (auto &Succ : SU->Succs) {
539	// Phi doesn't have any Anti successors.
540	if (Succ.getKind() != SDep::Data)
541	continue;
542	// Phi is scheduled before the successor of stage 0. The issue cycle of
543	// phi is the latest cycle in this interval.
544	auto *SuccMI = Succ.getSUnit()->getInstr();
545	int Cycle = getOriCycle(NewMI: SuccMI);
546	if (getOriStage(OriMI: getOriMI(NewMI: SuccMI), Offset) == `0`)
547	LateCycle = std::min(a: LateCycle, b: Cycle);
548	}
549	// The anti-dependency of phi need to be handled separately in the same way.
550	if (Register AntiReg = getAntiRegister(Phi: &Phi)) {
551	auto *AntiMI = MRI->getVRegDef(Reg: AntiReg);
552	// AntiReg may be defined outside the kernel MBB.
553	if (AntiMI->getParent() == MBB) {
554	auto AntiCycle = getOriCycle(NewMI: AntiMI);
555	if (getOriStage(OriMI: getOriMI(NewMI: AntiMI), Offset) == `0`)
556	LateCycle = std::min(a: LateCycle, b: AntiCycle);
557	}
558	}
559	// If there is no limit to the late cycle, a default value is given.
560	if (LateCycle == INT_MAX)
561	LateCycle = (int)(II - `1`);
562	LLVM_DEBUG(dbgs() << "\tCycle range [0, " << LateCycle << "] " << Phi);
563	// The issue cycle of phi is set to the latest cycle in the interval.
564	auto *OriPhi = getOriMI(NewMI: &Phi);
565	OriToCycle [OriPhi] = LateCycle;
566	}
567	}
568
569	DenseMap<MachineInstr , int> WindowScheduler::getIssueOrder(unsigned* Offset,
570	unsigned II) {
571	// At each issue cycle, phi is placed before MIs in stage 0. So the simplest
572	// way is to put phi at the beginning of the current cycle.
573	DenseMap<int, SmallVector<MachineInstr *>> CycleToMIs;
574	auto Range = getScheduleRange(Offset, Num: SchedInstrNum);
575	for (auto &Phi : MBB->phis())
576	CycleToMIs [getOriCycle(NewMI: &Phi)].push_back(Elt: getOriMI(NewMI: &Phi));
577	for (auto &MI : Range)
578	CycleToMIs [getOriCycle(NewMI: &MI)].push_back(Elt: getOriMI(NewMI: &MI));
579	// Each MI is assigned a separate ordered Id, which is used as a sort marker
580	// in the following expand process.
581	DenseMap<MachineInstr , int*> IssueOrder;
582	int Id = `0`;
583	for (int Cycle = `0`; Cycle < (int)II; ++Cycle) {
584	auto It = CycleToMIs.find(Val: Cycle);
585	if (It == CycleToMIs.end())
586	continue;
587	for (auto *MI : It ->second)
588	IssueOrder [MI] = Id++;
589	}
590	return IssueOrder;
591	}
592
593	void WindowScheduler::updateScheduleResult(unsigned Offset, unsigned II) {
594	// At the first update, Offset is equal to SchedPhiNum. At this time, only
595	// BestII, BestOffset, and BaseII need to be updated.
596	if (Offset == SchedPhiNum) {
597	BestII = II;
598	BestOffset = SchedPhiNum;
599	BaseII = II;
600	return;
601	}
602	// The update will only continue if the II is smaller than BestII and the II
603	// is sufficiently small.
604	if ((II >= BestII) \|\| (II + WindowDiffLimit > BaseII))
605	return;
606	BestII = II;
607	BestOffset = Offset;
608	// Record the result of the current list scheduling, noting that each MI is
609	// stored unordered in SchedResult.
610	SchedResult.clear();
611	auto IssueOrder = getIssueOrder(Offset, II);
612	for (auto &Pair : OriToCycle) {
613	assert(IssueOrder.count(Pair.first) && "Cannot find original MI!");
614	SchedResult.push_back(Elt: std::make_tuple(args&: Pair.first, args&: Pair.second,
615	args: getOriStage(OriMI: Pair.first, Offset),
616	args&: IssueOrder [Pair.first]));
617	}
618	}
619
620	void WindowScheduler::expand() {
621	// The MIs in the SchedResult are sorted by the issue order ID.
622	llvm::stable_sort(Range&: SchedResult,
623	C: [](const std::tuple<MachineInstr , int, int, int*> &A,
624	const std::tuple<MachineInstr , int, int, int*> &B) {
625	return std::get<`3`>(t: A) < std::get<`3`>(t: B);
626	});
627	// Use the scheduling infrastructure for expansion, noting that InstrChanges
628	// is not supported here.
629	DenseMap<MachineInstr , int*> Cycles, Stages;
630	std::vector<MachineInstr *> OrderedInsts;
631	for (auto &Info : SchedResult) {
632	auto *MI = std::get<`0`>(t&: Info);
633	OrderedInsts.push_back(x: MI);
634	Cycles [MI] = std::get<`1`>(t&: Info);
635	Stages [MI] = std::get<`2`>(t&: Info);
636	LLVM_DEBUG(dbgs() << "\tCycle " << Cycles[MI] << " [S." << Stages[MI]
637	<< "]: " << *MI);
638	}
639	ModuloSchedule MS(*MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
640	std::move(Stages));
641	ModuloScheduleExpander MSE(MF, MS, Context->LIS,
642	ModuloScheduleExpander::InstrChangesTy ());
643	MSE.expand();
644	MSE.cleanup();
645	}
646
647	void WindowScheduler::updateLiveIntervals() {
648	SmallVector<Register, `128`> UsedRegs;
649	for (MachineInstr &MI : *MBB)
650	for (const MachineOperand &MO : MI.operands()) {
651	if (!MO.isReg() \|\| MO.getReg() == `0`)
652	continue;
653	Register Reg = MO.getReg();
654	if (!is_contained(Range&: UsedRegs, Element: Reg))
655	UsedRegs.push_back(Elt: Reg);
656	}
657	Context->LIS->repairIntervalsInRange(MBB, Begin: MBB->begin(), End: MBB->end(), OrigRegs: UsedRegs);
658	}
659
660	iterator_range<MachineBasicBlock::iterator>
661	WindowScheduler::getScheduleRange(unsigned Offset, unsigned Num) {
662	auto RegionBegin = MBB->begin();
663	std::advance(i&: RegionBegin, n: Offset);
664	auto RegionEnd = RegionBegin;
665	std::advance(i&: RegionEnd, n: Num);
666	return make_range(x: RegionBegin, y: RegionEnd);
667	}
668
669	int WindowScheduler::getOriCycle(MachineInstr *NewMI) {
670	assert(TriToOri.count(NewMI) && "Cannot find original MI!");
671	auto *OriMI = TriToOri [NewMI];
672	assert(OriToCycle.count(OriMI) && "Cannot find schedule cycle!");
673	return OriToCycle [OriMI];
674	}
675
676	MachineInstr WindowScheduler::getOriMI(MachineInstr NewMI) {
677	assert(TriToOri.count(NewMI) && "Cannot find original MI!");
678	return TriToOri [NewMI];
679	}
680
681	unsigned WindowScheduler::getOriStage(MachineInstr OriMI, unsigned* Offset) {
682	assert(llvm::is_contained(OriMIs, OriMI) && "Cannot find OriMI in OriMIs!");
683	// If there is no instruction fold, all MI stages are 0.
684	if (Offset == SchedPhiNum)
685	return `0`;
686	// For those MIs with an ID less than the Offset, their stages are set to 0,
687	// while the rest are set to 1.
688	unsigned Id = `0`;
689	for (auto *MI : OriMIs) {
690	if (MI->isMetaInstruction())
691	continue;
692	if (MI == OriMI)
693	break;
694	++Id;
695	}
696	return Id >= (size_t)Offset ? `1` : `0`;
697	}
698
699	Register WindowScheduler::getAntiRegister(MachineInstr *Phi) {
700	assert(Phi->isPHI() && "Expecting PHI!");
701	Register AntiReg;
702	for (auto MO : Phi->uses()) {
703	if (MO.isReg())
704	AntiReg = MO.getReg();
705	else if (MO.isMBB() && MO.getMBB() == MBB)
706	return AntiReg;
707	}
708	return `0`;
709	}
710

Browse the source code of llvm_projects/llvm/lib/CodeGen/WindowScheduler.cpp