1//======----------- WindowScheduler.cpp - window scheduler -------------======//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// An implementation of the Window Scheduling software pipelining algorithm.
10//
11// The fundamental concept of the window scheduling algorithm involves folding
12// the original MBB at a specific position, followed by list scheduling on the
13// folded MIs. The optimal scheduling result is then chosen from various folding
14// positions as the final scheduling outcome.
15//
16// The primary challenge in this algorithm lies in generating the folded MIs and
17// establishing their dependencies. We have innovatively employed a new MBB,
18// created by copying the original MBB three times, known as TripleMBB. This
19// TripleMBB enables the convenient implementation of MI folding and dependency
20// establishment. To facilitate the algorithm's implementation, we have also
21// devised data structures such as OriMIs, TriMIs, TriToOri, and OriToCycle.
22//
23// Another challenge in the algorithm is the scheduling of phis. Semantically,
24// it is difficult to place the phis in the window and perform list scheduling.
25// Therefore, we schedule these phis separately after each list scheduling.
26//
27// The provided implementation is designed for use before the Register Allocator
28// (RA). If the target requires implementation after RA, it is recommended to
29// reimplement analyseII(), schedulePhi(), and expand(). Additionally,
30// target-specific logic can be added in initialize(), preProcess(), and
31// postProcess().
32//
33// Lastly, it is worth mentioning that getSearchIndexes() is an important
34// function. We have experimented with more complex heuristics on downstream
35// target and achieved favorable results.
36//
37//===----------------------------------------------------------------------===//
38#include "llvm/CodeGen/WindowScheduler.h"
39#include "llvm/ADT/Statistic.h"
40#include "llvm/CodeGen/LiveIntervals.h"
41#include "llvm/CodeGen/MachineLoopInfo.h"
42#include "llvm/CodeGen/MachinePipeliner.h"
43#include "llvm/CodeGen/ModuloSchedule.h"
44#include "llvm/CodeGen/TargetPassConfig.h"
45#include "llvm/Support/CommandLine.h"
46#include "llvm/Support/Debug.h"
47#include "llvm/Support/TimeProfiler.h"
48
49using namespace llvm;
50
51#define DEBUG_TYPE "pipeliner"
52
53namespace {
54STATISTIC(NumTryWindowSchedule,
55 "Number of loops that we attempt to use window scheduling");
56STATISTIC(NumTryWindowSearch,
57 "Number of times that we run list schedule in the window scheduling");
58STATISTIC(NumWindowSchedule,
59 "Number of loops that we successfully use window scheduling");
60STATISTIC(NumFailAnalyseII,
61 "Window scheduling abort due to the failure of the II analysis");
62
63cl::opt<unsigned>
64 WindowSearchNum("window-search-num",
65 cl::desc("The number of searches per loop in the window "
66 "algorithm. 0 means no search number limit."),
67 cl::Hidden, cl::init(Val: 6));
68
69cl::opt<unsigned> WindowSearchRatio(
70 "window-search-ratio",
71 cl::desc("The ratio of searches per loop in the window algorithm. 100 "
72 "means search all positions in the loop, while 0 means not "
73 "performing any search."),
74 cl::Hidden, cl::init(Val: 40));
75
76cl::opt<unsigned> WindowIICoeff(
77 "window-ii-coeff",
78 cl::desc(
79 "The coefficient used when initializing II in the window algorithm."),
80 cl::Hidden, cl::init(Val: 5));
81
82cl::opt<unsigned> WindowRegionLimit(
83 "window-region-limit",
84 cl::desc(
85 "The lower limit of the scheduling region in the window algorithm."),
86 cl::Hidden, cl::init(Val: 3));
87
88cl::opt<unsigned> WindowDiffLimit(
89 "window-diff-limit",
90 cl::desc("The lower limit of the difference between best II and base II in "
91 "the window algorithm. If the difference is smaller than "
92 "this lower limit, window scheduling will not be performed."),
93 cl::Hidden, cl::init(Val: 2));
94} // namespace
95
96// WindowIILimit serves as an indicator of abnormal scheduling results and could
97// potentially be referenced by the derived target window scheduler.
98cl::opt<unsigned>
99 WindowIILimit("window-ii-limit",
100 cl::desc("The upper limit of II in the window algorithm."),
101 cl::Hidden, cl::init(Val: 1000));
102
103WindowScheduler::WindowScheduler(MachineSchedContext *C, MachineLoop &ML)
104 : Context(C), MF(C->MF), MBB(ML.getHeader()), Loop(ML),
105 Subtarget(&MF->getSubtarget()), TII(Subtarget->getInstrInfo()),
106 TRI(Subtarget->getRegisterInfo()), MRI(&MF->getRegInfo()) {
107 TripleDAG = std::unique_ptr<ScheduleDAGInstrs>(
108 createMachineScheduler(/*OnlyBuildGraph=*/true));
109}
110
111bool WindowScheduler::run() {
112 if (!initialize()) {
113 LLVM_DEBUG(dbgs() << "The WindowScheduler failed to initialize!\n");
114 return false;
115 }
116 // The window algorithm is time-consuming, and its compilation time should be
117 // taken into consideration.
118 TimeTraceScope Scope("WindowSearch");
119 ++NumTryWindowSchedule;
120 // Performing the relevant processing before window scheduling.
121 preProcess();
122 // The main window scheduling begins.
123 std::unique_ptr<ScheduleDAGInstrs> SchedDAG(createMachineScheduler());
124 auto SearchIndexes = getSearchIndexes(SearchNum: WindowSearchNum, SearchRatio: WindowSearchRatio);
125 for (unsigned Idx : SearchIndexes) {
126 OriToCycle.clear();
127 ++NumTryWindowSearch;
128 // The scheduling starts with non-phi instruction, so SchedPhiNum needs to
129 // be added to Idx.
130 unsigned Offset = Idx + SchedPhiNum;
131 auto Range = getScheduleRange(Offset, Num: SchedInstrNum);
132 SchedDAG->startBlock(BB: MBB);
133 SchedDAG->enterRegion(bb: MBB, begin: Range.begin(), end: Range.end(), regioninstrs: SchedInstrNum);
134 SchedDAG->schedule();
135 LLVM_DEBUG(SchedDAG->dump());
136 unsigned II = analyseII(DAG&: *SchedDAG, Offset);
137 if (II == WindowIILimit) {
138 restoreTripleMBB();
139 LLVM_DEBUG(dbgs() << "Can't find a valid II. Keep searching...\n");
140 ++NumFailAnalyseII;
141 continue;
142 }
143 schedulePhi(Offset, II);
144 updateScheduleResult(Offset, II);
145 restoreTripleMBB();
146 LLVM_DEBUG(dbgs() << "Current window Offset is " << Offset << " and II is "
147 << II << ".\n");
148 }
149 // Performing the relevant processing after window scheduling.
150 postProcess();
151 // Check whether the scheduling result is valid.
152 if (!isScheduleValid()) {
153 LLVM_DEBUG(dbgs() << "Window scheduling is not needed!\n");
154 return false;
155 }
156 LLVM_DEBUG(dbgs() << "\nBest window offset is " << BestOffset
157 << " and Best II is " << BestII << ".\n");
158 // Expand the scheduling result to prologue, kernel, and epilogue.
159 expand();
160 ++NumWindowSchedule;
161 return true;
162}
163
164ScheduleDAGInstrs *
165WindowScheduler::createMachineScheduler(bool OnlyBuildGraph) {
166 return OnlyBuildGraph
167 ? new ScheduleDAGMI(
168 Context, std::make_unique<PostGenericScheduler>(args&: Context),
169 true)
170 : Context->PassConfig->createMachineScheduler(C: Context);
171}
172
173bool WindowScheduler::initialize() {
174 if (!Subtarget->enableWindowScheduler()) {
175 LLVM_DEBUG(dbgs() << "Target disables the window scheduling!\n");
176 return false;
177 }
178 // Initialized the member variables used by window algorithm.
179 OriMIs.clear();
180 TriMIs.clear();
181 TriToOri.clear();
182 OriToCycle.clear();
183 SchedResult.clear();
184 SchedPhiNum = 0;
185 SchedInstrNum = 0;
186 BestII = UINT_MAX;
187 BestOffset = 0;
188 BaseII = 0;
189 // List scheduling used in the window algorithm depends on LiveIntervals.
190 if (!Context->LIS) {
191 LLVM_DEBUG(dbgs() << "There is no LiveIntervals information!\n");
192 return false;
193 }
194 // Check each MI in MBB.
195 SmallSet<Register, 8> PrevDefs;
196 SmallSet<Register, 8> PrevUses;
197 auto IsLoopCarried = [&](MachineInstr &Phi) {
198 // Two cases are checked here: (1)The virtual register defined by the
199 // preceding phi is used by the succeeding phi;(2)The preceding phi uses the
200 // virtual register defined by the succeeding phi.
201 if (PrevUses.count(V: Phi.getOperand(i: 0).getReg()))
202 return true;
203 PrevDefs.insert(V: Phi.getOperand(i: 0).getReg());
204 for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) {
205 if (PrevDefs.count(V: Phi.getOperand(i: I).getReg()))
206 return true;
207 PrevUses.insert(V: Phi.getOperand(i: I).getReg());
208 }
209 return false;
210 };
211 auto PLI = TII->analyzeLoopForPipelining(LoopBB: MBB);
212 for (auto &MI : *MBB) {
213 if (MI.isMetaInstruction() || MI.isTerminator())
214 continue;
215 if (MI.isPHI()) {
216 if (IsLoopCarried(MI)) {
217 LLVM_DEBUG(dbgs() << "Loop carried phis are not supported yet!\n");
218 return false;
219 }
220 ++SchedPhiNum;
221 ++BestOffset;
222 } else
223 ++SchedInstrNum;
224 if (TII->isSchedulingBoundary(MI, MBB, MF: *MF)) {
225 LLVM_DEBUG(
226 dbgs() << "Boundary MI is not allowed in window scheduling!\n");
227 return false;
228 }
229 if (PLI->shouldIgnoreForPipelining(MI: &MI)) {
230 LLVM_DEBUG(dbgs() << "Special MI defined by target is not allowed in "
231 "window scheduling!\n");
232 return false;
233 }
234 for (auto &Def : MI.all_defs())
235 if (Def.isReg() && Def.getReg().isPhysical()) {
236 LLVM_DEBUG(dbgs() << "Physical registers are not supported in "
237 "window scheduling!\n");
238 return false;
239 }
240 }
241 if (SchedInstrNum <= WindowRegionLimit) {
242 LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n");
243 return false;
244 }
245 return true;
246}
247
248void WindowScheduler::preProcess() {
249 // Prior to window scheduling, it's necessary to backup the original MBB,
250 // generate a new TripleMBB, and build a TripleDAG based on the TripleMBB.
251 backupMBB();
252 generateTripleMBB();
253 TripleDAG->startBlock(BB: MBB);
254 TripleDAG->enterRegion(
255 bb: MBB, begin: MBB->begin(), end: MBB->getFirstTerminator(),
256 regioninstrs: std::distance(first: MBB->begin(), last: MBB->getFirstTerminator()));
257 TripleDAG->buildSchedGraph(AA: Context->AA);
258}
259
260void WindowScheduler::postProcess() {
261 // After window scheduling, it's necessary to clear the TripleDAG and restore
262 // to the original MBB.
263 TripleDAG->exitRegion();
264 TripleDAG->finishBlock();
265 restoreMBB();
266}
267
268void WindowScheduler::backupMBB() {
269 for (auto &MI : MBB->instrs())
270 OriMIs.push_back(Elt: &MI);
271 // Remove MIs and the corresponding live intervals.
272 for (auto &MI : make_early_inc_range(Range&: *MBB)) {
273 Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, AllowBundled: true);
274 MBB->remove(I: &MI);
275 }
276}
277
278void WindowScheduler::restoreMBB() {
279 // Erase MIs and the corresponding live intervals.
280 for (auto &MI : make_early_inc_range(Range&: *MBB)) {
281 Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, AllowBundled: true);
282 MI.eraseFromParent();
283 }
284 // Restore MBB to the state before window scheduling.
285 for (auto *MI : OriMIs)
286 MBB->push_back(MI);
287 updateLiveIntervals();
288}
289
290void WindowScheduler::generateTripleMBB() {
291 const unsigned DuplicateNum = 3;
292 TriMIs.clear();
293 TriToOri.clear();
294 assert(OriMIs.size() > 0 && "The Original MIs were not backed up!");
295 // Step 1: Performing the first copy of MBB instructions, excluding
296 // terminators. At the same time, we back up the anti-register of phis.
297 // DefPairs hold the old and new define register pairs.
298 DenseMap<Register, Register> DefPairs;
299 for (auto *MI : OriMIs) {
300 if (MI->isMetaInstruction() || MI->isTerminator())
301 continue;
302 if (MI->isPHI())
303 if (Register AntiReg = getAntiRegister(Phi: MI))
304 DefPairs[MI->getOperand(i: 0).getReg()] = AntiReg;
305 auto *NewMI = MF->CloneMachineInstr(Orig: MI);
306 MBB->push_back(MI: NewMI);
307 TriMIs.push_back(Elt: NewMI);
308 TriToOri[NewMI] = MI;
309 }
310 // Step 2: Performing the remaining two copies of MBB instructions excluding
311 // phis, and the last one contains terminators. At the same time, registers
312 // are updated accordingly.
313 for (size_t Cnt = 1; Cnt < DuplicateNum; ++Cnt) {
314 for (auto *MI : OriMIs) {
315 if (MI->isPHI() || MI->isMetaInstruction() ||
316 (MI->isTerminator() && Cnt < DuplicateNum - 1))
317 continue;
318 auto *NewMI = MF->CloneMachineInstr(Orig: MI);
319 DenseMap<Register, Register> NewDefs;
320 // New defines are updated.
321 for (auto MO : NewMI->all_defs())
322 if (MO.isReg() && MO.getReg().isVirtual()) {
323 Register NewDef =
324 MRI->createVirtualRegister(RegClass: MRI->getRegClass(Reg: MO.getReg()));
325 NewMI->substituteRegister(FromReg: MO.getReg(), ToReg: NewDef, SubIdx: 0, RegInfo: *TRI);
326 NewDefs[MO.getReg()] = NewDef;
327 }
328 // New uses are updated.
329 for (auto DefRegPair : DefPairs)
330 if (NewMI->readsRegister(Reg: DefRegPair.first, TRI)) {
331 Register NewUse = DefRegPair.second;
332 // Note the update process for '%1 -> %9' in '%10 = sub i32 %9, %3':
333 //
334 // BB.3: DefPairs
335 // ==================================
336 // %1 = phi i32 [%2, %BB.1], [%7, %BB.3] (%1,%7)
337 // ...
338 // ==================================
339 // ...
340 // %4 = sub i32 %1, %3
341 // ...
342 // %7 = add i32 %5, %6
343 // ...
344 // ----------------------------------
345 // ...
346 // %8 = sub i32 %7, %3 (%1,%7),(%4,%8)
347 // ...
348 // %9 = add i32 %5, %6 (%1,%7),(%4,%8),(%7,%9)
349 // ...
350 // ----------------------------------
351 // ...
352 // %10 = sub i32 %9, %3 (%1,%7),(%4,%10),(%7,%9)
353 // ... ^
354 // %11 = add i32 %5, %6 (%1,%7),(%4,%10),(%7,%11)
355 // ...
356 // ==================================
357 // < Terminators >
358 // ==================================
359 if (DefPairs.count(Val: NewUse))
360 NewUse = DefPairs[NewUse];
361 NewMI->substituteRegister(FromReg: DefRegPair.first, ToReg: NewUse, SubIdx: 0, RegInfo: *TRI);
362 }
363 // DefPairs is updated at last.
364 for (auto &NewDef : NewDefs)
365 DefPairs[NewDef.first] = NewDef.second;
366 MBB->push_back(MI: NewMI);
367 TriMIs.push_back(Elt: NewMI);
368 TriToOri[NewMI] = MI;
369 }
370 }
371 // Step 3: The registers used by phis are updated, and they are generated in
372 // the third copy of MBB.
373 // In the privious example, the old phi is:
374 // %1 = phi i32 [%2, %BB.1], [%7, %BB.3]
375 // The new phi is:
376 // %1 = phi i32 [%2, %BB.1], [%11, %BB.3]
377 for (auto &Phi : MBB->phis()) {
378 for (auto DefRegPair : DefPairs)
379 if (Phi.readsRegister(Reg: DefRegPair.first, TRI))
380 Phi.substituteRegister(FromReg: DefRegPair.first, ToReg: DefRegPair.second, SubIdx: 0, RegInfo: *TRI);
381 }
382 updateLiveIntervals();
383}
384
385void WindowScheduler::restoreTripleMBB() {
386 // After list scheduling, the MBB is restored in one traversal.
387 for (size_t I = 0; I < TriMIs.size(); ++I) {
388 auto *MI = TriMIs[I];
389 auto OldPos = MBB->begin();
390 std::advance(i&: OldPos, n: I);
391 auto CurPos = MI->getIterator();
392 if (CurPos != OldPos) {
393 MBB->splice(Where: OldPos, Other: MBB, From: CurPos);
394 Context->LIS->handleMove(MI&: *MI, /*UpdateFlags=*/false);
395 }
396 }
397}
398
399SmallVector<unsigned> WindowScheduler::getSearchIndexes(unsigned SearchNum,
400 unsigned SearchRatio) {
401 // We use SearchRatio to get the index range, and then evenly get the indexes
402 // according to the SearchNum. This is a simple huristic. Depending on the
403 // characteristics of the target, more complex algorithms can be used for both
404 // performance and compilation time.
405 assert(SearchRatio <= 100 && "SearchRatio should be equal or less than 100!");
406 unsigned MaxIdx = SchedInstrNum * SearchRatio / 100;
407 unsigned Step = SearchNum > 0 && SearchNum <= MaxIdx ? MaxIdx / SearchNum : 1;
408 SmallVector<unsigned> SearchIndexes;
409 for (unsigned Idx = 0; Idx < MaxIdx; Idx += Step)
410 SearchIndexes.push_back(Elt: Idx);
411 return SearchIndexes;
412}
413
414int WindowScheduler::getEstimatedII(ScheduleDAGInstrs &DAG) {
415 // Sometimes MaxDepth is 0, so it should be limited to the minimum of 1.
416 unsigned MaxDepth = 1;
417 for (auto &SU : DAG.SUnits)
418 MaxDepth = std::max(a: SU.getDepth() + SU.Latency, b: MaxDepth);
419 return MaxDepth * WindowIICoeff;
420}
421
422int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG,
423 unsigned Offset) {
424 int InitII = getEstimatedII(DAG);
425 ResourceManager RM(Subtarget, &DAG);
426 RM.init(II: InitII);
427 // ResourceManager and DAG are used to calculate the maximum cycle for the
428 // scheduled MIs. Since MIs in the Region have already been scheduled, the
429 // emit cycles can be estimated in order here.
430 int CurCycle = 0;
431 auto Range = getScheduleRange(Offset, Num: SchedInstrNum);
432 for (auto &MI : Range) {
433 auto *SU = DAG.getSUnit(MI: &MI);
434 int ExpectCycle = CurCycle;
435 // The predecessors of current MI determine its earliest issue cycle.
436 for (auto &Pred : SU->Preds) {
437 if (Pred.isWeak())
438 continue;
439 auto *PredMI = Pred.getSUnit()->getInstr();
440 int PredCycle = getOriCycle(NewMI: PredMI);
441 ExpectCycle = std::max(a: ExpectCycle, b: PredCycle + (int)Pred.getLatency());
442 }
443 // Zero cost instructions do not need to check resource.
444 if (!TII->isZeroCost(Opcode: MI.getOpcode())) {
445 // ResourceManager can be used to detect resource conflicts between the
446 // current MI and the previously inserted MIs.
447 while (!RM.canReserveResources(SU&: *SU, Cycle: CurCycle) || CurCycle < ExpectCycle) {
448 ++CurCycle;
449 if (CurCycle == (int)WindowIILimit)
450 return CurCycle;
451 }
452 RM.reserveResources(SU&: *SU, Cycle: CurCycle);
453 }
454 OriToCycle[getOriMI(NewMI: &MI)] = CurCycle;
455 LLVM_DEBUG(dbgs() << "\tCycle " << CurCycle << " [S."
456 << getOriStage(getOriMI(&MI), Offset) << "]: " << MI);
457 }
458 LLVM_DEBUG(dbgs() << "MaxCycle is " << CurCycle << ".\n");
459 return CurCycle;
460}
461
462// By utilizing TripleDAG, we can easily establish dependencies between A and B.
463// Based on the MaxCycle and the issue cycle of A and B, we can determine
464// whether it is necessary to add a stall cycle. This is because, without
465// inserting the stall cycle, the latency constraint between A and B cannot be
466// satisfied. The details are as follows:
467//
468// New MBB:
469// ========================================
470// < Phis >
471// ======================================== (sliding direction)
472// MBB copy 1 |
473// V
474//
475// ~~~~~~~~~~~~~~~~~~~|~~~~~~~~~~~~~~~~~~~~ ----schedule window-----
476// | |
477// ===================V==================== |
478// MBB copy 2 < MI B > |
479// |
480// < MI A > V
481// ~~~~~~~~~~~~~~~~~~~:~~~~~~~~~~~~~~~~~~~~ ------------------------
482// :
483// ===================V====================
484// MBB copy 3 < MI B'>
485//
486//
487//
488//
489// ========================================
490// < Terminators >
491// ========================================
492int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
493 int MaxStallCycle = 0;
494 int CurrentII = MaxCycle + 1;
495 auto Range = getScheduleRange(Offset, Num: SchedInstrNum);
496 for (auto &MI : Range) {
497 auto *SU = TripleDAG->getSUnit(MI: &MI);
498 int DefCycle = getOriCycle(NewMI: &MI);
499 for (auto &Succ : SU->Succs) {
500 if (Succ.isWeak() || Succ.getSUnit() == &TripleDAG->ExitSU)
501 continue;
502 // If the expected cycle does not exceed CurrentII, no check is needed.
503 if (DefCycle + (int)Succ.getLatency() <= CurrentII)
504 continue;
505 // If the cycle of the scheduled MI A is less than that of the scheduled
506 // MI B, the scheduling will fail because the lifetime of the
507 // corresponding register exceeds II.
508 auto *SuccMI = Succ.getSUnit()->getInstr();
509 int UseCycle = getOriCycle(NewMI: SuccMI);
510 if (DefCycle < UseCycle)
511 return WindowIILimit;
512 // Get the stall cycle introduced by the register between two trips.
513 int StallCycle = DefCycle + (int)Succ.getLatency() - CurrentII - UseCycle;
514 MaxStallCycle = std::max(a: MaxStallCycle, b: StallCycle);
515 }
516 }
517 LLVM_DEBUG(dbgs() << "MaxStallCycle is " << MaxStallCycle << ".\n");
518 return MaxStallCycle;
519}
520
521unsigned WindowScheduler::analyseII(ScheduleDAGInstrs &DAG, unsigned Offset) {
522 LLVM_DEBUG(dbgs() << "Start analyzing II:\n");
523 int MaxCycle = calculateMaxCycle(DAG, Offset);
524 if (MaxCycle == (int)WindowIILimit)
525 return MaxCycle;
526 int StallCycle = calculateStallCycle(Offset, MaxCycle);
527 if (StallCycle == (int)WindowIILimit)
528 return StallCycle;
529 // The value of II is equal to the maximum execution cycle plus 1.
530 return MaxCycle + StallCycle + 1;
531}
532
533void WindowScheduler::schedulePhi(int Offset, unsigned &II) {
534 LLVM_DEBUG(dbgs() << "Start scheduling Phis:\n");
535 for (auto &Phi : MBB->phis()) {
536 int LateCycle = INT_MAX;
537 auto *SU = TripleDAG->getSUnit(MI: &Phi);
538 for (auto &Succ : SU->Succs) {
539 // Phi doesn't have any Anti successors.
540 if (Succ.getKind() != SDep::Data)
541 continue;
542 // Phi is scheduled before the successor of stage 0. The issue cycle of
543 // phi is the latest cycle in this interval.
544 auto *SuccMI = Succ.getSUnit()->getInstr();
545 int Cycle = getOriCycle(NewMI: SuccMI);
546 if (getOriStage(OriMI: getOriMI(NewMI: SuccMI), Offset) == 0)
547 LateCycle = std::min(a: LateCycle, b: Cycle);
548 }
549 // The anti-dependency of phi need to be handled separately in the same way.
550 if (Register AntiReg = getAntiRegister(Phi: &Phi)) {
551 auto *AntiMI = MRI->getVRegDef(Reg: AntiReg);
552 // AntiReg may be defined outside the kernel MBB.
553 if (AntiMI->getParent() == MBB) {
554 auto AntiCycle = getOriCycle(NewMI: AntiMI);
555 if (getOriStage(OriMI: getOriMI(NewMI: AntiMI), Offset) == 0)
556 LateCycle = std::min(a: LateCycle, b: AntiCycle);
557 }
558 }
559 // If there is no limit to the late cycle, a default value is given.
560 if (LateCycle == INT_MAX)
561 LateCycle = (int)(II - 1);
562 LLVM_DEBUG(dbgs() << "\tCycle range [0, " << LateCycle << "] " << Phi);
563 // The issue cycle of phi is set to the latest cycle in the interval.
564 auto *OriPhi = getOriMI(NewMI: &Phi);
565 OriToCycle[OriPhi] = LateCycle;
566 }
567}
568
569DenseMap<MachineInstr *, int> WindowScheduler::getIssueOrder(unsigned Offset,
570 unsigned II) {
571 // At each issue cycle, phi is placed before MIs in stage 0. So the simplest
572 // way is to put phi at the beginning of the current cycle.
573 DenseMap<int, SmallVector<MachineInstr *>> CycleToMIs;
574 auto Range = getScheduleRange(Offset, Num: SchedInstrNum);
575 for (auto &Phi : MBB->phis())
576 CycleToMIs[getOriCycle(NewMI: &Phi)].push_back(Elt: getOriMI(NewMI: &Phi));
577 for (auto &MI : Range)
578 CycleToMIs[getOriCycle(NewMI: &MI)].push_back(Elt: getOriMI(NewMI: &MI));
579 // Each MI is assigned a separate ordered Id, which is used as a sort marker
580 // in the following expand process.
581 DenseMap<MachineInstr *, int> IssueOrder;
582 int Id = 0;
583 for (int Cycle = 0; Cycle < (int)II; ++Cycle) {
584 if (!CycleToMIs.count(Val: Cycle))
585 continue;
586 for (auto *MI : CycleToMIs[Cycle])
587 IssueOrder[MI] = Id++;
588 }
589 return IssueOrder;
590}
591
592void WindowScheduler::updateScheduleResult(unsigned Offset, unsigned II) {
593 // At the first update, Offset is equal to SchedPhiNum. At this time, only
594 // BestII, BestOffset, and BaseII need to be updated.
595 if (Offset == SchedPhiNum) {
596 BestII = II;
597 BestOffset = SchedPhiNum;
598 BaseII = II;
599 return;
600 }
601 // The update will only continue if the II is smaller than BestII and the II
602 // is sufficiently small.
603 if ((II >= BestII) || (II + WindowDiffLimit > BaseII))
604 return;
605 BestII = II;
606 BestOffset = Offset;
607 // Record the result of the current list scheduling, noting that each MI is
608 // stored unordered in SchedResult.
609 SchedResult.clear();
610 auto IssueOrder = getIssueOrder(Offset, II);
611 for (auto &Pair : OriToCycle) {
612 assert(IssueOrder.count(Pair.first) && "Cannot find original MI!");
613 SchedResult.push_back(Elt: std::make_tuple(args&: Pair.first, args&: Pair.second,
614 args: getOriStage(OriMI: Pair.first, Offset),
615 args&: IssueOrder[Pair.first]));
616 }
617}
618
619void WindowScheduler::expand() {
620 // The MIs in the SchedResult are sorted by the issue order ID.
621 llvm::stable_sort(Range&: SchedResult,
622 C: [](const std::tuple<MachineInstr *, int, int, int> &A,
623 const std::tuple<MachineInstr *, int, int, int> &B) {
624 return std::get<3>(t: A) < std::get<3>(t: B);
625 });
626 // Use the scheduling infrastructure for expansion, noting that InstrChanges
627 // is not supported here.
628 DenseMap<MachineInstr *, int> Cycles, Stages;
629 std::vector<MachineInstr *> OrderedInsts;
630 for (auto &Info : SchedResult) {
631 auto *MI = std::get<0>(t&: Info);
632 OrderedInsts.push_back(x: MI);
633 Cycles[MI] = std::get<1>(t&: Info);
634 Stages[MI] = std::get<2>(t&: Info);
635 LLVM_DEBUG(dbgs() << "\tCycle " << Cycles[MI] << " [S." << Stages[MI]
636 << "]: " << *MI);
637 }
638 ModuloSchedule MS(*MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
639 std::move(Stages));
640 ModuloScheduleExpander MSE(*MF, MS, *Context->LIS,
641 ModuloScheduleExpander::InstrChangesTy());
642 MSE.expand();
643 MSE.cleanup();
644}
645
646void WindowScheduler::updateLiveIntervals() {
647 SmallVector<Register, 128> UsedRegs;
648 for (MachineInstr &MI : *MBB)
649 for (const MachineOperand &MO : MI.operands()) {
650 if (!MO.isReg() || MO.getReg() == 0)
651 continue;
652 Register Reg = MO.getReg();
653 if (!is_contained(Range&: UsedRegs, Element: Reg))
654 UsedRegs.push_back(Elt: Reg);
655 }
656 Context->LIS->repairIntervalsInRange(MBB, Begin: MBB->begin(), End: MBB->end(), OrigRegs: UsedRegs);
657}
658
659iterator_range<MachineBasicBlock::iterator>
660WindowScheduler::getScheduleRange(unsigned Offset, unsigned Num) {
661 auto RegionBegin = MBB->begin();
662 std::advance(i&: RegionBegin, n: Offset);
663 auto RegionEnd = RegionBegin;
664 std::advance(i&: RegionEnd, n: Num);
665 return make_range(x: RegionBegin, y: RegionEnd);
666}
667
668int WindowScheduler::getOriCycle(MachineInstr *NewMI) {
669 assert(TriToOri.count(NewMI) && "Cannot find original MI!");
670 auto *OriMI = TriToOri[NewMI];
671 assert(OriToCycle.count(OriMI) && "Cannot find schedule cycle!");
672 return OriToCycle[OriMI];
673}
674
675MachineInstr *WindowScheduler::getOriMI(MachineInstr *NewMI) {
676 assert(TriToOri.count(NewMI) && "Cannot find original MI!");
677 return TriToOri[NewMI];
678}
679
680unsigned WindowScheduler::getOriStage(MachineInstr *OriMI, unsigned Offset) {
681 assert(llvm::find(OriMIs, OriMI) != OriMIs.end() &&
682 "Cannot find OriMI in OriMIs!");
683 // If there is no instruction fold, all MI stages are 0.
684 if (Offset == SchedPhiNum)
685 return 0;
686 // For those MIs with an ID less than the Offset, their stages are set to 0,
687 // while the rest are set to 1.
688 unsigned Id = 0;
689 for (auto *MI : OriMIs) {
690 if (MI->isMetaInstruction())
691 continue;
692 if (MI == OriMI)
693 break;
694 ++Id;
695 }
696 return Id >= (size_t)Offset ? 1 : 0;
697}
698
699Register WindowScheduler::getAntiRegister(MachineInstr *Phi) {
700 assert(Phi->isPHI() && "Expecting PHI!");
701 Register AntiReg;
702 for (auto MO : Phi->uses()) {
703 if (MO.isReg())
704 AntiReg = MO.getReg();
705 else if (MO.isMBB() && MO.getMBB() == MBB)
706 return AntiReg;
707 }
708 return 0;
709}
710