1 | //======----------- WindowScheduler.cpp - window scheduler -------------======// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // An implementation of the Window Scheduling software pipelining algorithm. |
10 | // |
11 | // The fundamental concept of the window scheduling algorithm involves folding |
12 | // the original MBB at a specific position, followed by list scheduling on the |
13 | // folded MIs. The optimal scheduling result is then chosen from various folding |
14 | // positions as the final scheduling outcome. |
15 | // |
16 | // The primary challenge in this algorithm lies in generating the folded MIs and |
17 | // establishing their dependencies. We have innovatively employed a new MBB, |
18 | // created by copying the original MBB three times, known as TripleMBB. This |
19 | // TripleMBB enables the convenient implementation of MI folding and dependency |
20 | // establishment. To facilitate the algorithm's implementation, we have also |
21 | // devised data structures such as OriMIs, TriMIs, TriToOri, and OriToCycle. |
22 | // |
23 | // Another challenge in the algorithm is the scheduling of phis. Semantically, |
24 | // it is difficult to place the phis in the window and perform list scheduling. |
25 | // Therefore, we schedule these phis separately after each list scheduling. |
26 | // |
27 | // The provided implementation is designed for use before the Register Allocator |
28 | // (RA). If the target requires implementation after RA, it is recommended to |
29 | // reimplement analyseII(), schedulePhi(), and expand(). Additionally, |
30 | // target-specific logic can be added in initialize(), preProcess(), and |
31 | // postProcess(). |
32 | // |
33 | // Lastly, it is worth mentioning that getSearchIndexes() is an important |
34 | // function. We have experimented with more complex heuristics on downstream |
35 | // target and achieved favorable results. |
36 | // |
37 | //===----------------------------------------------------------------------===// |
38 | #include "llvm/CodeGen/WindowScheduler.h" |
39 | #include "llvm/ADT/Statistic.h" |
40 | #include "llvm/CodeGen/LiveIntervals.h" |
41 | #include "llvm/CodeGen/MachineLoopInfo.h" |
42 | #include "llvm/CodeGen/MachinePipeliner.h" |
43 | #include "llvm/CodeGen/ModuloSchedule.h" |
44 | #include "llvm/CodeGen/TargetPassConfig.h" |
45 | #include "llvm/Support/CommandLine.h" |
46 | #include "llvm/Support/Debug.h" |
47 | #include "llvm/Support/TimeProfiler.h" |
48 | |
49 | using namespace llvm; |
50 | |
51 | #define DEBUG_TYPE "pipeliner" |
52 | |
53 | namespace { |
54 | STATISTIC(NumTryWindowSchedule, |
55 | "Number of loops that we attempt to use window scheduling" ); |
56 | STATISTIC(NumTryWindowSearch, |
57 | "Number of times that we run list schedule in the window scheduling" ); |
58 | STATISTIC(NumWindowSchedule, |
59 | "Number of loops that we successfully use window scheduling" ); |
60 | STATISTIC(NumFailAnalyseII, |
61 | "Window scheduling abort due to the failure of the II analysis" ); |
62 | |
63 | cl::opt<unsigned> |
64 | WindowSearchNum("window-search-num" , |
65 | cl::desc("The number of searches per loop in the window " |
66 | "algorithm. 0 means no search number limit." ), |
67 | cl::Hidden, cl::init(Val: 6)); |
68 | |
69 | cl::opt<unsigned> WindowSearchRatio( |
70 | "window-search-ratio" , |
71 | cl::desc("The ratio of searches per loop in the window algorithm. 100 " |
72 | "means search all positions in the loop, while 0 means not " |
73 | "performing any search." ), |
74 | cl::Hidden, cl::init(Val: 40)); |
75 | |
76 | cl::opt<unsigned> WindowIICoeff( |
77 | "window-ii-coeff" , |
78 | cl::desc( |
79 | "The coefficient used when initializing II in the window algorithm." ), |
80 | cl::Hidden, cl::init(Val: 5)); |
81 | |
82 | cl::opt<unsigned> WindowRegionLimit( |
83 | "window-region-limit" , |
84 | cl::desc( |
85 | "The lower limit of the scheduling region in the window algorithm." ), |
86 | cl::Hidden, cl::init(Val: 3)); |
87 | |
88 | cl::opt<unsigned> WindowDiffLimit( |
89 | "window-diff-limit" , |
90 | cl::desc("The lower limit of the difference between best II and base II in " |
91 | "the window algorithm. If the difference is smaller than " |
92 | "this lower limit, window scheduling will not be performed." ), |
93 | cl::Hidden, cl::init(Val: 2)); |
94 | } // namespace |
95 | |
96 | // WindowIILimit serves as an indicator of abnormal scheduling results and could |
97 | // potentially be referenced by the derived target window scheduler. |
98 | cl::opt<unsigned> |
99 | WindowIILimit("window-ii-limit" , |
100 | cl::desc("The upper limit of II in the window algorithm." ), |
101 | cl::Hidden, cl::init(Val: 1000)); |
102 | |
103 | WindowScheduler::WindowScheduler(MachineSchedContext *C, MachineLoop &ML) |
104 | : Context(C), MF(C->MF), MBB(ML.getHeader()), Loop(ML), |
105 | Subtarget(&MF->getSubtarget()), TII(Subtarget->getInstrInfo()), |
106 | TRI(Subtarget->getRegisterInfo()), MRI(&MF->getRegInfo()) { |
107 | TripleDAG = std::unique_ptr<ScheduleDAGInstrs>( |
108 | createMachineScheduler(/*OnlyBuildGraph=*/true)); |
109 | } |
110 | |
111 | bool WindowScheduler::run() { |
112 | if (!initialize()) { |
113 | LLVM_DEBUG(dbgs() << "The WindowScheduler failed to initialize!\n" ); |
114 | return false; |
115 | } |
116 | // The window algorithm is time-consuming, and its compilation time should be |
117 | // taken into consideration. |
118 | TimeTraceScope Scope("WindowSearch" ); |
119 | ++NumTryWindowSchedule; |
120 | // Performing the relevant processing before window scheduling. |
121 | preProcess(); |
122 | // The main window scheduling begins. |
123 | std::unique_ptr<ScheduleDAGInstrs> SchedDAG(createMachineScheduler()); |
124 | auto SearchIndexes = getSearchIndexes(SearchNum: WindowSearchNum, SearchRatio: WindowSearchRatio); |
125 | for (unsigned Idx : SearchIndexes) { |
126 | OriToCycle.clear(); |
127 | ++NumTryWindowSearch; |
128 | // The scheduling starts with non-phi instruction, so SchedPhiNum needs to |
129 | // be added to Idx. |
130 | unsigned Offset = Idx + SchedPhiNum; |
131 | auto Range = getScheduleRange(Offset, Num: SchedInstrNum); |
132 | SchedDAG->startBlock(BB: MBB); |
133 | SchedDAG->enterRegion(bb: MBB, begin: Range.begin(), end: Range.end(), regioninstrs: SchedInstrNum); |
134 | SchedDAG->schedule(); |
135 | LLVM_DEBUG(SchedDAG->dump()); |
136 | unsigned II = analyseII(DAG&: *SchedDAG, Offset); |
137 | if (II == WindowIILimit) { |
138 | restoreTripleMBB(); |
139 | LLVM_DEBUG(dbgs() << "Can't find a valid II. Keep searching...\n" ); |
140 | ++NumFailAnalyseII; |
141 | continue; |
142 | } |
143 | schedulePhi(Offset, II); |
144 | updateScheduleResult(Offset, II); |
145 | restoreTripleMBB(); |
146 | LLVM_DEBUG(dbgs() << "Current window Offset is " << Offset << " and II is " |
147 | << II << ".\n" ); |
148 | } |
149 | // Performing the relevant processing after window scheduling. |
150 | postProcess(); |
151 | // Check whether the scheduling result is valid. |
152 | if (!isScheduleValid()) { |
153 | LLVM_DEBUG(dbgs() << "Window scheduling is not needed!\n" ); |
154 | return false; |
155 | } |
156 | LLVM_DEBUG(dbgs() << "\nBest window offset is " << BestOffset |
157 | << " and Best II is " << BestII << ".\n" ); |
158 | // Expand the scheduling result to prologue, kernel, and epilogue. |
159 | expand(); |
160 | ++NumWindowSchedule; |
161 | return true; |
162 | } |
163 | |
164 | ScheduleDAGInstrs * |
165 | WindowScheduler::createMachineScheduler(bool OnlyBuildGraph) { |
166 | return OnlyBuildGraph |
167 | ? new ScheduleDAGMI( |
168 | Context, std::make_unique<PostGenericScheduler>(args&: Context), |
169 | true) |
170 | : Context->PassConfig->createMachineScheduler(C: Context); |
171 | } |
172 | |
173 | bool WindowScheduler::initialize() { |
174 | if (!Subtarget->enableWindowScheduler()) { |
175 | LLVM_DEBUG(dbgs() << "Target disables the window scheduling!\n" ); |
176 | return false; |
177 | } |
178 | // Initialized the member variables used by window algorithm. |
179 | OriMIs.clear(); |
180 | TriMIs.clear(); |
181 | TriToOri.clear(); |
182 | OriToCycle.clear(); |
183 | SchedResult.clear(); |
184 | SchedPhiNum = 0; |
185 | SchedInstrNum = 0; |
186 | BestII = UINT_MAX; |
187 | BestOffset = 0; |
188 | BaseII = 0; |
189 | // List scheduling used in the window algorithm depends on LiveIntervals. |
190 | if (!Context->LIS) { |
191 | LLVM_DEBUG(dbgs() << "There is no LiveIntervals information!\n" ); |
192 | return false; |
193 | } |
194 | // Check each MI in MBB. |
195 | SmallSet<Register, 8> PrevDefs; |
196 | SmallSet<Register, 8> PrevUses; |
197 | auto IsLoopCarried = [&](MachineInstr &Phi) { |
198 | // Two cases are checked here: (1)The virtual register defined by the |
199 | // preceding phi is used by the succeeding phi;(2)The preceding phi uses the |
200 | // virtual register defined by the succeeding phi. |
201 | if (PrevUses.count(V: Phi.getOperand(i: 0).getReg())) |
202 | return true; |
203 | PrevDefs.insert(V: Phi.getOperand(i: 0).getReg()); |
204 | for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) { |
205 | if (PrevDefs.count(V: Phi.getOperand(i: I).getReg())) |
206 | return true; |
207 | PrevUses.insert(V: Phi.getOperand(i: I).getReg()); |
208 | } |
209 | return false; |
210 | }; |
211 | auto PLI = TII->analyzeLoopForPipelining(LoopBB: MBB); |
212 | for (auto &MI : *MBB) { |
213 | if (MI.isMetaInstruction() || MI.isTerminator()) |
214 | continue; |
215 | if (MI.isPHI()) { |
216 | if (IsLoopCarried(MI)) { |
217 | LLVM_DEBUG(dbgs() << "Loop carried phis are not supported yet!\n" ); |
218 | return false; |
219 | } |
220 | ++SchedPhiNum; |
221 | ++BestOffset; |
222 | } else |
223 | ++SchedInstrNum; |
224 | if (TII->isSchedulingBoundary(MI, MBB, MF: *MF)) { |
225 | LLVM_DEBUG( |
226 | dbgs() << "Boundary MI is not allowed in window scheduling!\n" ); |
227 | return false; |
228 | } |
229 | if (PLI->shouldIgnoreForPipelining(MI: &MI)) { |
230 | LLVM_DEBUG(dbgs() << "Special MI defined by target is not allowed in " |
231 | "window scheduling!\n" ); |
232 | return false; |
233 | } |
234 | for (auto &Def : MI.all_defs()) |
235 | if (Def.isReg() && Def.getReg().isPhysical()) { |
236 | LLVM_DEBUG(dbgs() << "Physical registers are not supported in " |
237 | "window scheduling!\n" ); |
238 | return false; |
239 | } |
240 | } |
241 | if (SchedInstrNum <= WindowRegionLimit) { |
242 | LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n" ); |
243 | return false; |
244 | } |
245 | return true; |
246 | } |
247 | |
248 | void WindowScheduler::preProcess() { |
249 | // Prior to window scheduling, it's necessary to backup the original MBB, |
250 | // generate a new TripleMBB, and build a TripleDAG based on the TripleMBB. |
251 | backupMBB(); |
252 | generateTripleMBB(); |
253 | TripleDAG->startBlock(BB: MBB); |
254 | TripleDAG->enterRegion( |
255 | bb: MBB, begin: MBB->begin(), end: MBB->getFirstTerminator(), |
256 | regioninstrs: std::distance(first: MBB->begin(), last: MBB->getFirstTerminator())); |
257 | TripleDAG->buildSchedGraph(AA: Context->AA); |
258 | } |
259 | |
260 | void WindowScheduler::postProcess() { |
261 | // After window scheduling, it's necessary to clear the TripleDAG and restore |
262 | // to the original MBB. |
263 | TripleDAG->exitRegion(); |
264 | TripleDAG->finishBlock(); |
265 | restoreMBB(); |
266 | } |
267 | |
268 | void WindowScheduler::backupMBB() { |
269 | for (auto &MI : MBB->instrs()) |
270 | OriMIs.push_back(Elt: &MI); |
271 | // Remove MIs and the corresponding live intervals. |
272 | for (auto &MI : make_early_inc_range(Range&: *MBB)) { |
273 | Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, AllowBundled: true); |
274 | MBB->remove(I: &MI); |
275 | } |
276 | } |
277 | |
278 | void WindowScheduler::restoreMBB() { |
279 | // Erase MIs and the corresponding live intervals. |
280 | for (auto &MI : make_early_inc_range(Range&: *MBB)) { |
281 | Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, AllowBundled: true); |
282 | MI.eraseFromParent(); |
283 | } |
284 | // Restore MBB to the state before window scheduling. |
285 | for (auto *MI : OriMIs) |
286 | MBB->push_back(MI); |
287 | updateLiveIntervals(); |
288 | } |
289 | |
290 | void WindowScheduler::generateTripleMBB() { |
291 | const unsigned DuplicateNum = 3; |
292 | TriMIs.clear(); |
293 | TriToOri.clear(); |
294 | assert(OriMIs.size() > 0 && "The Original MIs were not backed up!" ); |
295 | // Step 1: Performing the first copy of MBB instructions, excluding |
296 | // terminators. At the same time, we back up the anti-register of phis. |
297 | // DefPairs hold the old and new define register pairs. |
298 | DenseMap<Register, Register> DefPairs; |
299 | for (auto *MI : OriMIs) { |
300 | if (MI->isMetaInstruction() || MI->isTerminator()) |
301 | continue; |
302 | if (MI->isPHI()) |
303 | if (Register AntiReg = getAntiRegister(Phi: MI)) |
304 | DefPairs[MI->getOperand(i: 0).getReg()] = AntiReg; |
305 | auto *NewMI = MF->CloneMachineInstr(Orig: MI); |
306 | MBB->push_back(MI: NewMI); |
307 | TriMIs.push_back(Elt: NewMI); |
308 | TriToOri[NewMI] = MI; |
309 | } |
310 | // Step 2: Performing the remaining two copies of MBB instructions excluding |
311 | // phis, and the last one contains terminators. At the same time, registers |
312 | // are updated accordingly. |
313 | for (size_t Cnt = 1; Cnt < DuplicateNum; ++Cnt) { |
314 | for (auto *MI : OriMIs) { |
315 | if (MI->isPHI() || MI->isMetaInstruction() || |
316 | (MI->isTerminator() && Cnt < DuplicateNum - 1)) |
317 | continue; |
318 | auto *NewMI = MF->CloneMachineInstr(Orig: MI); |
319 | DenseMap<Register, Register> NewDefs; |
320 | // New defines are updated. |
321 | for (auto MO : NewMI->all_defs()) |
322 | if (MO.isReg() && MO.getReg().isVirtual()) { |
323 | Register NewDef = |
324 | MRI->createVirtualRegister(RegClass: MRI->getRegClass(Reg: MO.getReg())); |
325 | NewMI->substituteRegister(FromReg: MO.getReg(), ToReg: NewDef, SubIdx: 0, RegInfo: *TRI); |
326 | NewDefs[MO.getReg()] = NewDef; |
327 | } |
328 | // New uses are updated. |
329 | for (auto DefRegPair : DefPairs) |
330 | if (NewMI->readsRegister(Reg: DefRegPair.first, TRI)) { |
331 | Register NewUse = DefRegPair.second; |
332 | // Note the update process for '%1 -> %9' in '%10 = sub i32 %9, %3': |
333 | // |
334 | // BB.3: DefPairs |
335 | // ================================== |
336 | // %1 = phi i32 [%2, %BB.1], [%7, %BB.3] (%1,%7) |
337 | // ... |
338 | // ================================== |
339 | // ... |
340 | // %4 = sub i32 %1, %3 |
341 | // ... |
342 | // %7 = add i32 %5, %6 |
343 | // ... |
344 | // ---------------------------------- |
345 | // ... |
346 | // %8 = sub i32 %7, %3 (%1,%7),(%4,%8) |
347 | // ... |
348 | // %9 = add i32 %5, %6 (%1,%7),(%4,%8),(%7,%9) |
349 | // ... |
350 | // ---------------------------------- |
351 | // ... |
352 | // %10 = sub i32 %9, %3 (%1,%7),(%4,%10),(%7,%9) |
353 | // ... ^ |
354 | // %11 = add i32 %5, %6 (%1,%7),(%4,%10),(%7,%11) |
355 | // ... |
356 | // ================================== |
357 | // < Terminators > |
358 | // ================================== |
359 | if (DefPairs.count(Val: NewUse)) |
360 | NewUse = DefPairs[NewUse]; |
361 | NewMI->substituteRegister(FromReg: DefRegPair.first, ToReg: NewUse, SubIdx: 0, RegInfo: *TRI); |
362 | } |
363 | // DefPairs is updated at last. |
364 | for (auto &NewDef : NewDefs) |
365 | DefPairs[NewDef.first] = NewDef.second; |
366 | MBB->push_back(MI: NewMI); |
367 | TriMIs.push_back(Elt: NewMI); |
368 | TriToOri[NewMI] = MI; |
369 | } |
370 | } |
371 | // Step 3: The registers used by phis are updated, and they are generated in |
372 | // the third copy of MBB. |
373 | // In the privious example, the old phi is: |
374 | // %1 = phi i32 [%2, %BB.1], [%7, %BB.3] |
375 | // The new phi is: |
376 | // %1 = phi i32 [%2, %BB.1], [%11, %BB.3] |
377 | for (auto &Phi : MBB->phis()) { |
378 | for (auto DefRegPair : DefPairs) |
379 | if (Phi.readsRegister(Reg: DefRegPair.first, TRI)) |
380 | Phi.substituteRegister(FromReg: DefRegPair.first, ToReg: DefRegPair.second, SubIdx: 0, RegInfo: *TRI); |
381 | } |
382 | updateLiveIntervals(); |
383 | } |
384 | |
385 | void WindowScheduler::restoreTripleMBB() { |
386 | // After list scheduling, the MBB is restored in one traversal. |
387 | for (size_t I = 0; I < TriMIs.size(); ++I) { |
388 | auto *MI = TriMIs[I]; |
389 | auto OldPos = MBB->begin(); |
390 | std::advance(i&: OldPos, n: I); |
391 | auto CurPos = MI->getIterator(); |
392 | if (CurPos != OldPos) { |
393 | MBB->splice(Where: OldPos, Other: MBB, From: CurPos); |
394 | Context->LIS->handleMove(MI&: *MI, /*UpdateFlags=*/false); |
395 | } |
396 | } |
397 | } |
398 | |
399 | SmallVector<unsigned> WindowScheduler::getSearchIndexes(unsigned SearchNum, |
400 | unsigned SearchRatio) { |
401 | // We use SearchRatio to get the index range, and then evenly get the indexes |
402 | // according to the SearchNum. This is a simple huristic. Depending on the |
403 | // characteristics of the target, more complex algorithms can be used for both |
404 | // performance and compilation time. |
405 | assert(SearchRatio <= 100 && "SearchRatio should be equal or less than 100!" ); |
406 | unsigned MaxIdx = SchedInstrNum * SearchRatio / 100; |
407 | unsigned Step = SearchNum > 0 && SearchNum <= MaxIdx ? MaxIdx / SearchNum : 1; |
408 | SmallVector<unsigned> SearchIndexes; |
409 | for (unsigned Idx = 0; Idx < MaxIdx; Idx += Step) |
410 | SearchIndexes.push_back(Elt: Idx); |
411 | return SearchIndexes; |
412 | } |
413 | |
414 | int WindowScheduler::getEstimatedII(ScheduleDAGInstrs &DAG) { |
415 | // Sometimes MaxDepth is 0, so it should be limited to the minimum of 1. |
416 | unsigned MaxDepth = 1; |
417 | for (auto &SU : DAG.SUnits) |
418 | MaxDepth = std::max(a: SU.getDepth() + SU.Latency, b: MaxDepth); |
419 | return MaxDepth * WindowIICoeff; |
420 | } |
421 | |
422 | int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG, |
423 | unsigned Offset) { |
424 | int InitII = getEstimatedII(DAG); |
425 | ResourceManager RM(Subtarget, &DAG); |
426 | RM.init(II: InitII); |
427 | // ResourceManager and DAG are used to calculate the maximum cycle for the |
428 | // scheduled MIs. Since MIs in the Region have already been scheduled, the |
429 | // emit cycles can be estimated in order here. |
430 | int CurCycle = 0; |
431 | auto Range = getScheduleRange(Offset, Num: SchedInstrNum); |
432 | for (auto &MI : Range) { |
433 | auto *SU = DAG.getSUnit(MI: &MI); |
434 | int ExpectCycle = CurCycle; |
435 | // The predecessors of current MI determine its earliest issue cycle. |
436 | for (auto &Pred : SU->Preds) { |
437 | if (Pred.isWeak()) |
438 | continue; |
439 | auto *PredMI = Pred.getSUnit()->getInstr(); |
440 | int PredCycle = getOriCycle(NewMI: PredMI); |
441 | ExpectCycle = std::max(a: ExpectCycle, b: PredCycle + (int)Pred.getLatency()); |
442 | } |
443 | // Zero cost instructions do not need to check resource. |
444 | if (!TII->isZeroCost(Opcode: MI.getOpcode())) { |
445 | // ResourceManager can be used to detect resource conflicts between the |
446 | // current MI and the previously inserted MIs. |
447 | while (!RM.canReserveResources(SU&: *SU, Cycle: CurCycle) || CurCycle < ExpectCycle) { |
448 | ++CurCycle; |
449 | if (CurCycle == (int)WindowIILimit) |
450 | return CurCycle; |
451 | } |
452 | RM.reserveResources(SU&: *SU, Cycle: CurCycle); |
453 | } |
454 | OriToCycle[getOriMI(NewMI: &MI)] = CurCycle; |
455 | LLVM_DEBUG(dbgs() << "\tCycle " << CurCycle << " [S." |
456 | << getOriStage(getOriMI(&MI), Offset) << "]: " << MI); |
457 | } |
458 | LLVM_DEBUG(dbgs() << "MaxCycle is " << CurCycle << ".\n" ); |
459 | return CurCycle; |
460 | } |
461 | |
462 | // By utilizing TripleDAG, we can easily establish dependencies between A and B. |
463 | // Based on the MaxCycle and the issue cycle of A and B, we can determine |
464 | // whether it is necessary to add a stall cycle. This is because, without |
465 | // inserting the stall cycle, the latency constraint between A and B cannot be |
466 | // satisfied. The details are as follows: |
467 | // |
468 | // New MBB: |
469 | // ======================================== |
470 | // < Phis > |
471 | // ======================================== (sliding direction) |
472 | // MBB copy 1 | |
473 | // V |
474 | // |
475 | // ~~~~~~~~~~~~~~~~~~~|~~~~~~~~~~~~~~~~~~~~ ----schedule window----- |
476 | // | | |
477 | // ===================V==================== | |
478 | // MBB copy 2 < MI B > | |
479 | // | |
480 | // < MI A > V |
481 | // ~~~~~~~~~~~~~~~~~~~:~~~~~~~~~~~~~~~~~~~~ ------------------------ |
482 | // : |
483 | // ===================V==================== |
484 | // MBB copy 3 < MI B'> |
485 | // |
486 | // |
487 | // |
488 | // |
489 | // ======================================== |
490 | // < Terminators > |
491 | // ======================================== |
492 | int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) { |
493 | int MaxStallCycle = 0; |
494 | int CurrentII = MaxCycle + 1; |
495 | auto Range = getScheduleRange(Offset, Num: SchedInstrNum); |
496 | for (auto &MI : Range) { |
497 | auto *SU = TripleDAG->getSUnit(MI: &MI); |
498 | int DefCycle = getOriCycle(NewMI: &MI); |
499 | for (auto &Succ : SU->Succs) { |
500 | if (Succ.isWeak() || Succ.getSUnit() == &TripleDAG->ExitSU) |
501 | continue; |
502 | // If the expected cycle does not exceed CurrentII, no check is needed. |
503 | if (DefCycle + (int)Succ.getLatency() <= CurrentII) |
504 | continue; |
505 | // If the cycle of the scheduled MI A is less than that of the scheduled |
506 | // MI B, the scheduling will fail because the lifetime of the |
507 | // corresponding register exceeds II. |
508 | auto *SuccMI = Succ.getSUnit()->getInstr(); |
509 | int UseCycle = getOriCycle(NewMI: SuccMI); |
510 | if (DefCycle < UseCycle) |
511 | return WindowIILimit; |
512 | // Get the stall cycle introduced by the register between two trips. |
513 | int StallCycle = DefCycle + (int)Succ.getLatency() - CurrentII - UseCycle; |
514 | MaxStallCycle = std::max(a: MaxStallCycle, b: StallCycle); |
515 | } |
516 | } |
517 | LLVM_DEBUG(dbgs() << "MaxStallCycle is " << MaxStallCycle << ".\n" ); |
518 | return MaxStallCycle; |
519 | } |
520 | |
521 | unsigned WindowScheduler::analyseII(ScheduleDAGInstrs &DAG, unsigned Offset) { |
522 | LLVM_DEBUG(dbgs() << "Start analyzing II:\n" ); |
523 | int MaxCycle = calculateMaxCycle(DAG, Offset); |
524 | if (MaxCycle == (int)WindowIILimit) |
525 | return MaxCycle; |
526 | int StallCycle = calculateStallCycle(Offset, MaxCycle); |
527 | if (StallCycle == (int)WindowIILimit) |
528 | return StallCycle; |
529 | // The value of II is equal to the maximum execution cycle plus 1. |
530 | return MaxCycle + StallCycle + 1; |
531 | } |
532 | |
533 | void WindowScheduler::schedulePhi(int Offset, unsigned &II) { |
534 | LLVM_DEBUG(dbgs() << "Start scheduling Phis:\n" ); |
535 | for (auto &Phi : MBB->phis()) { |
536 | int LateCycle = INT_MAX; |
537 | auto *SU = TripleDAG->getSUnit(MI: &Phi); |
538 | for (auto &Succ : SU->Succs) { |
539 | // Phi doesn't have any Anti successors. |
540 | if (Succ.getKind() != SDep::Data) |
541 | continue; |
542 | // Phi is scheduled before the successor of stage 0. The issue cycle of |
543 | // phi is the latest cycle in this interval. |
544 | auto *SuccMI = Succ.getSUnit()->getInstr(); |
545 | int Cycle = getOriCycle(NewMI: SuccMI); |
546 | if (getOriStage(OriMI: getOriMI(NewMI: SuccMI), Offset) == 0) |
547 | LateCycle = std::min(a: LateCycle, b: Cycle); |
548 | } |
549 | // The anti-dependency of phi need to be handled separately in the same way. |
550 | if (Register AntiReg = getAntiRegister(Phi: &Phi)) { |
551 | auto *AntiMI = MRI->getVRegDef(Reg: AntiReg); |
552 | // AntiReg may be defined outside the kernel MBB. |
553 | if (AntiMI->getParent() == MBB) { |
554 | auto AntiCycle = getOriCycle(NewMI: AntiMI); |
555 | if (getOriStage(OriMI: getOriMI(NewMI: AntiMI), Offset) == 0) |
556 | LateCycle = std::min(a: LateCycle, b: AntiCycle); |
557 | } |
558 | } |
559 | // If there is no limit to the late cycle, a default value is given. |
560 | if (LateCycle == INT_MAX) |
561 | LateCycle = (int)(II - 1); |
562 | LLVM_DEBUG(dbgs() << "\tCycle range [0, " << LateCycle << "] " << Phi); |
563 | // The issue cycle of phi is set to the latest cycle in the interval. |
564 | auto *OriPhi = getOriMI(NewMI: &Phi); |
565 | OriToCycle[OriPhi] = LateCycle; |
566 | } |
567 | } |
568 | |
569 | DenseMap<MachineInstr *, int> WindowScheduler::getIssueOrder(unsigned Offset, |
570 | unsigned II) { |
571 | // At each issue cycle, phi is placed before MIs in stage 0. So the simplest |
572 | // way is to put phi at the beginning of the current cycle. |
573 | DenseMap<int, SmallVector<MachineInstr *>> CycleToMIs; |
574 | auto Range = getScheduleRange(Offset, Num: SchedInstrNum); |
575 | for (auto &Phi : MBB->phis()) |
576 | CycleToMIs[getOriCycle(NewMI: &Phi)].push_back(Elt: getOriMI(NewMI: &Phi)); |
577 | for (auto &MI : Range) |
578 | CycleToMIs[getOriCycle(NewMI: &MI)].push_back(Elt: getOriMI(NewMI: &MI)); |
579 | // Each MI is assigned a separate ordered Id, which is used as a sort marker |
580 | // in the following expand process. |
581 | DenseMap<MachineInstr *, int> IssueOrder; |
582 | int Id = 0; |
583 | for (int Cycle = 0; Cycle < (int)II; ++Cycle) { |
584 | if (!CycleToMIs.count(Val: Cycle)) |
585 | continue; |
586 | for (auto *MI : CycleToMIs[Cycle]) |
587 | IssueOrder[MI] = Id++; |
588 | } |
589 | return IssueOrder; |
590 | } |
591 | |
592 | void WindowScheduler::updateScheduleResult(unsigned Offset, unsigned II) { |
593 | // At the first update, Offset is equal to SchedPhiNum. At this time, only |
594 | // BestII, BestOffset, and BaseII need to be updated. |
595 | if (Offset == SchedPhiNum) { |
596 | BestII = II; |
597 | BestOffset = SchedPhiNum; |
598 | BaseII = II; |
599 | return; |
600 | } |
601 | // The update will only continue if the II is smaller than BestII and the II |
602 | // is sufficiently small. |
603 | if ((II >= BestII) || (II + WindowDiffLimit > BaseII)) |
604 | return; |
605 | BestII = II; |
606 | BestOffset = Offset; |
607 | // Record the result of the current list scheduling, noting that each MI is |
608 | // stored unordered in SchedResult. |
609 | SchedResult.clear(); |
610 | auto IssueOrder = getIssueOrder(Offset, II); |
611 | for (auto &Pair : OriToCycle) { |
612 | assert(IssueOrder.count(Pair.first) && "Cannot find original MI!" ); |
613 | SchedResult.push_back(Elt: std::make_tuple(args&: Pair.first, args&: Pair.second, |
614 | args: getOriStage(OriMI: Pair.first, Offset), |
615 | args&: IssueOrder[Pair.first])); |
616 | } |
617 | } |
618 | |
619 | void WindowScheduler::expand() { |
620 | // The MIs in the SchedResult are sorted by the issue order ID. |
621 | llvm::stable_sort(Range&: SchedResult, |
622 | C: [](const std::tuple<MachineInstr *, int, int, int> &A, |
623 | const std::tuple<MachineInstr *, int, int, int> &B) { |
624 | return std::get<3>(t: A) < std::get<3>(t: B); |
625 | }); |
626 | // Use the scheduling infrastructure for expansion, noting that InstrChanges |
627 | // is not supported here. |
628 | DenseMap<MachineInstr *, int> Cycles, Stages; |
629 | std::vector<MachineInstr *> OrderedInsts; |
630 | for (auto &Info : SchedResult) { |
631 | auto *MI = std::get<0>(t&: Info); |
632 | OrderedInsts.push_back(x: MI); |
633 | Cycles[MI] = std::get<1>(t&: Info); |
634 | Stages[MI] = std::get<2>(t&: Info); |
635 | LLVM_DEBUG(dbgs() << "\tCycle " << Cycles[MI] << " [S." << Stages[MI] |
636 | << "]: " << *MI); |
637 | } |
638 | ModuloSchedule MS(*MF, &Loop, std::move(OrderedInsts), std::move(Cycles), |
639 | std::move(Stages)); |
640 | ModuloScheduleExpander MSE(*MF, MS, *Context->LIS, |
641 | ModuloScheduleExpander::InstrChangesTy()); |
642 | MSE.expand(); |
643 | MSE.cleanup(); |
644 | } |
645 | |
646 | void WindowScheduler::updateLiveIntervals() { |
647 | SmallVector<Register, 128> UsedRegs; |
648 | for (MachineInstr &MI : *MBB) |
649 | for (const MachineOperand &MO : MI.operands()) { |
650 | if (!MO.isReg() || MO.getReg() == 0) |
651 | continue; |
652 | Register Reg = MO.getReg(); |
653 | if (!is_contained(Range&: UsedRegs, Element: Reg)) |
654 | UsedRegs.push_back(Elt: Reg); |
655 | } |
656 | Context->LIS->repairIntervalsInRange(MBB, Begin: MBB->begin(), End: MBB->end(), OrigRegs: UsedRegs); |
657 | } |
658 | |
659 | iterator_range<MachineBasicBlock::iterator> |
660 | WindowScheduler::getScheduleRange(unsigned Offset, unsigned Num) { |
661 | auto RegionBegin = MBB->begin(); |
662 | std::advance(i&: RegionBegin, n: Offset); |
663 | auto RegionEnd = RegionBegin; |
664 | std::advance(i&: RegionEnd, n: Num); |
665 | return make_range(x: RegionBegin, y: RegionEnd); |
666 | } |
667 | |
668 | int WindowScheduler::getOriCycle(MachineInstr *NewMI) { |
669 | assert(TriToOri.count(NewMI) && "Cannot find original MI!" ); |
670 | auto *OriMI = TriToOri[NewMI]; |
671 | assert(OriToCycle.count(OriMI) && "Cannot find schedule cycle!" ); |
672 | return OriToCycle[OriMI]; |
673 | } |
674 | |
675 | MachineInstr *WindowScheduler::getOriMI(MachineInstr *NewMI) { |
676 | assert(TriToOri.count(NewMI) && "Cannot find original MI!" ); |
677 | return TriToOri[NewMI]; |
678 | } |
679 | |
680 | unsigned WindowScheduler::getOriStage(MachineInstr *OriMI, unsigned Offset) { |
681 | assert(llvm::find(OriMIs, OriMI) != OriMIs.end() && |
682 | "Cannot find OriMI in OriMIs!" ); |
683 | // If there is no instruction fold, all MI stages are 0. |
684 | if (Offset == SchedPhiNum) |
685 | return 0; |
686 | // For those MIs with an ID less than the Offset, their stages are set to 0, |
687 | // while the rest are set to 1. |
688 | unsigned Id = 0; |
689 | for (auto *MI : OriMIs) { |
690 | if (MI->isMetaInstruction()) |
691 | continue; |
692 | if (MI == OriMI) |
693 | break; |
694 | ++Id; |
695 | } |
696 | return Id >= (size_t)Offset ? 1 : 0; |
697 | } |
698 | |
699 | Register WindowScheduler::getAntiRegister(MachineInstr *Phi) { |
700 | assert(Phi->isPHI() && "Expecting PHI!" ); |
701 | Register AntiReg; |
702 | for (auto MO : Phi->uses()) { |
703 | if (MO.isReg()) |
704 | AntiReg = MO.getReg(); |
705 | else if (MO.isMBB() && MO.getMBB() == MBB) |
706 | return AntiReg; |
707 | } |
708 | return 0; |
709 | } |
710 | |