1 | //======----------- WindowScheduler.cpp - window scheduler -------------======// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // An implementation of the Window Scheduling software pipelining algorithm. |
10 | // |
11 | // The fundamental concept of the window scheduling algorithm involves folding |
12 | // the original MBB at a specific position, followed by list scheduling on the |
13 | // folded MIs. The optimal scheduling result is then chosen from various folding |
14 | // positions as the final scheduling outcome. |
15 | // |
16 | // The primary challenge in this algorithm lies in generating the folded MIs and |
17 | // establishing their dependencies. We have innovatively employed a new MBB, |
18 | // created by copying the original MBB three times, known as TripleMBB. This |
19 | // TripleMBB enables the convenient implementation of MI folding and dependency |
20 | // establishment. To facilitate the algorithm's implementation, we have also |
21 | // devised data structures such as OriMIs, TriMIs, TriToOri, and OriToCycle. |
22 | // |
23 | // Another challenge in the algorithm is the scheduling of phis. Semantically, |
24 | // it is difficult to place the phis in the window and perform list scheduling. |
25 | // Therefore, we schedule these phis separately after each list scheduling. |
26 | // |
27 | // The provided implementation is designed for use before the Register Allocator |
28 | // (RA). If the target requires implementation after RA, it is recommended to |
29 | // reimplement analyseII(), schedulePhi(), and expand(). Additionally, |
30 | // target-specific logic can be added in initialize(), preProcess(), and |
31 | // postProcess(). |
32 | // |
33 | // Lastly, it is worth mentioning that getSearchIndexes() is an important |
34 | // function. We have experimented with more complex heuristics on downstream |
35 | // target and achieved favorable results. |
36 | // |
37 | //===----------------------------------------------------------------------===// |
38 | #include "llvm/CodeGen/WindowScheduler.h" |
39 | #include "llvm/ADT/Statistic.h" |
40 | #include "llvm/CodeGen/LiveIntervals.h" |
41 | #include "llvm/CodeGen/MachineLoopInfo.h" |
42 | #include "llvm/CodeGen/MachinePipeliner.h" |
43 | #include "llvm/CodeGen/ModuloSchedule.h" |
44 | #include "llvm/CodeGen/TargetPassConfig.h" |
45 | #include "llvm/Support/CommandLine.h" |
46 | #include "llvm/Support/Debug.h" |
47 | #include "llvm/Support/TimeProfiler.h" |
48 | #include "llvm/Target/TargetMachine.h" |
49 | |
50 | using namespace llvm; |
51 | |
52 | #define DEBUG_TYPE "pipeliner" |
53 | |
54 | namespace { |
55 | STATISTIC(NumTryWindowSchedule, |
56 | "Number of loops that we attempt to use window scheduling" ); |
57 | STATISTIC(NumTryWindowSearch, |
58 | "Number of times that we run list schedule in the window scheduling" ); |
59 | STATISTIC(NumWindowSchedule, |
60 | "Number of loops that we successfully use window scheduling" ); |
61 | STATISTIC(NumFailAnalyseII, |
62 | "Window scheduling abort due to the failure of the II analysis" ); |
63 | |
64 | cl::opt<unsigned> |
65 | WindowSearchNum("window-search-num" , |
66 | cl::desc("The number of searches per loop in the window " |
67 | "algorithm. 0 means no search number limit." ), |
68 | cl::Hidden, cl::init(Val: 6)); |
69 | |
70 | cl::opt<unsigned> WindowSearchRatio( |
71 | "window-search-ratio" , |
72 | cl::desc("The ratio of searches per loop in the window algorithm. 100 " |
73 | "means search all positions in the loop, while 0 means not " |
74 | "performing any search." ), |
75 | cl::Hidden, cl::init(Val: 40)); |
76 | |
77 | cl::opt<unsigned> WindowIICoeff( |
78 | "window-ii-coeff" , |
79 | cl::desc( |
80 | "The coefficient used when initializing II in the window algorithm." ), |
81 | cl::Hidden, cl::init(Val: 5)); |
82 | |
83 | cl::opt<unsigned> WindowRegionLimit( |
84 | "window-region-limit" , |
85 | cl::desc( |
86 | "The lower limit of the scheduling region in the window algorithm." ), |
87 | cl::Hidden, cl::init(Val: 3)); |
88 | |
89 | cl::opt<unsigned> WindowDiffLimit( |
90 | "window-diff-limit" , |
91 | cl::desc("The lower limit of the difference between best II and base II in " |
92 | "the window algorithm. If the difference is smaller than " |
93 | "this lower limit, window scheduling will not be performed." ), |
94 | cl::Hidden, cl::init(Val: 2)); |
95 | } // namespace |
96 | |
97 | // WindowIILimit serves as an indicator of abnormal scheduling results and could |
98 | // potentially be referenced by the derived target window scheduler. |
99 | static cl::opt<unsigned> |
100 | WindowIILimit("window-ii-limit" , |
101 | cl::desc("The upper limit of II in the window algorithm." ), |
102 | cl::Hidden, cl::init(Val: 1000)); |
103 | |
104 | WindowScheduler::WindowScheduler(MachineSchedContext *C, MachineLoop &ML) |
105 | : Context(C), MF(C->MF), MBB(ML.getHeader()), Loop(ML), |
106 | Subtarget(&MF->getSubtarget()), TII(Subtarget->getInstrInfo()), |
107 | TRI(Subtarget->getRegisterInfo()), MRI(&MF->getRegInfo()) { |
108 | TripleDAG = std::unique_ptr<ScheduleDAGInstrs>( |
109 | createMachineScheduler(/*OnlyBuildGraph=*/true)); |
110 | } |
111 | |
112 | bool WindowScheduler::run() { |
113 | if (!initialize()) { |
114 | LLVM_DEBUG(dbgs() << "The WindowScheduler failed to initialize!\n" ); |
115 | return false; |
116 | } |
117 | // The window algorithm is time-consuming, and its compilation time should be |
118 | // taken into consideration. |
119 | TimeTraceScope Scope("WindowSearch" ); |
120 | ++NumTryWindowSchedule; |
121 | // Performing the relevant processing before window scheduling. |
122 | preProcess(); |
123 | // The main window scheduling begins. |
124 | std::unique_ptr<ScheduleDAGInstrs> SchedDAG(createMachineScheduler()); |
125 | auto SearchIndexes = getSearchIndexes(SearchNum: WindowSearchNum, SearchRatio: WindowSearchRatio); |
126 | for (unsigned Idx : SearchIndexes) { |
127 | OriToCycle.clear(); |
128 | ++NumTryWindowSearch; |
129 | // The scheduling starts with non-phi instruction, so SchedPhiNum needs to |
130 | // be added to Idx. |
131 | unsigned Offset = Idx + SchedPhiNum; |
132 | auto Range = getScheduleRange(Offset, Num: SchedInstrNum); |
133 | SchedDAG->startBlock(BB: MBB); |
134 | SchedDAG->enterRegion(bb: MBB, begin: Range.begin(), end: Range.end(), regioninstrs: SchedInstrNum); |
135 | SchedDAG->schedule(); |
136 | LLVM_DEBUG(SchedDAG->dump()); |
137 | unsigned II = analyseII(DAG&: *SchedDAG, Offset); |
138 | if (II == WindowIILimit) { |
139 | restoreTripleMBB(); |
140 | LLVM_DEBUG(dbgs() << "Can't find a valid II. Keep searching...\n" ); |
141 | ++NumFailAnalyseII; |
142 | continue; |
143 | } |
144 | schedulePhi(Offset, II); |
145 | updateScheduleResult(Offset, II); |
146 | restoreTripleMBB(); |
147 | LLVM_DEBUG(dbgs() << "Current window Offset is " << Offset << " and II is " |
148 | << II << ".\n" ); |
149 | } |
150 | // Performing the relevant processing after window scheduling. |
151 | postProcess(); |
152 | // Check whether the scheduling result is valid. |
153 | if (!isScheduleValid()) { |
154 | LLVM_DEBUG(dbgs() << "Window scheduling is not needed!\n" ); |
155 | return false; |
156 | } |
157 | LLVM_DEBUG(dbgs() << "\nBest window offset is " << BestOffset |
158 | << " and Best II is " << BestII << ".\n" ); |
159 | // Expand the scheduling result to prologue, kernel, and epilogue. |
160 | expand(); |
161 | ++NumWindowSchedule; |
162 | return true; |
163 | } |
164 | |
165 | ScheduleDAGInstrs * |
166 | WindowScheduler::createMachineScheduler(bool OnlyBuildGraph) { |
167 | return OnlyBuildGraph |
168 | ? new ScheduleDAGMI( |
169 | Context, std::make_unique<PostGenericScheduler>(args&: Context), |
170 | true) |
171 | : Context->TM->createMachineScheduler(C: Context); |
172 | } |
173 | |
174 | bool WindowScheduler::initialize() { |
175 | if (!Subtarget->enableWindowScheduler()) { |
176 | LLVM_DEBUG(dbgs() << "Target disables the window scheduling!\n" ); |
177 | return false; |
178 | } |
179 | // Initialized the member variables used by window algorithm. |
180 | OriMIs.clear(); |
181 | TriMIs.clear(); |
182 | TriToOri.clear(); |
183 | OriToCycle.clear(); |
184 | SchedResult.clear(); |
185 | SchedPhiNum = 0; |
186 | SchedInstrNum = 0; |
187 | BestII = UINT_MAX; |
188 | BestOffset = 0; |
189 | BaseII = 0; |
190 | // List scheduling used in the window algorithm depends on LiveIntervals. |
191 | if (!Context->LIS) { |
192 | LLVM_DEBUG(dbgs() << "There is no LiveIntervals information!\n" ); |
193 | return false; |
194 | } |
195 | // Check each MI in MBB. |
196 | SmallSet<Register, 8> PrevDefs; |
197 | SmallSet<Register, 8> PrevUses; |
198 | auto IsLoopCarried = [&](MachineInstr &Phi) { |
199 | // Two cases are checked here: (1)The virtual register defined by the |
200 | // preceding phi is used by the succeeding phi;(2)The preceding phi uses the |
201 | // virtual register defined by the succeeding phi. |
202 | if (PrevUses.count(V: Phi.getOperand(i: 0).getReg())) |
203 | return true; |
204 | PrevDefs.insert(V: Phi.getOperand(i: 0).getReg()); |
205 | for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) { |
206 | if (PrevDefs.count(V: Phi.getOperand(i: I).getReg())) |
207 | return true; |
208 | PrevUses.insert(V: Phi.getOperand(i: I).getReg()); |
209 | } |
210 | return false; |
211 | }; |
212 | auto PLI = TII->analyzeLoopForPipelining(LoopBB: MBB); |
213 | for (auto &MI : *MBB) { |
214 | if (MI.isMetaInstruction() || MI.isTerminator()) |
215 | continue; |
216 | if (MI.isPHI()) { |
217 | if (IsLoopCarried(MI)) { |
218 | LLVM_DEBUG(dbgs() << "Loop carried phis are not supported yet!\n" ); |
219 | return false; |
220 | } |
221 | ++SchedPhiNum; |
222 | ++BestOffset; |
223 | } else |
224 | ++SchedInstrNum; |
225 | if (TII->isSchedulingBoundary(MI, MBB, MF: *MF)) { |
226 | LLVM_DEBUG( |
227 | dbgs() << "Boundary MI is not allowed in window scheduling!\n" ); |
228 | return false; |
229 | } |
230 | if (PLI->shouldIgnoreForPipelining(MI: &MI)) { |
231 | LLVM_DEBUG(dbgs() << "Special MI defined by target is not allowed in " |
232 | "window scheduling!\n" ); |
233 | return false; |
234 | } |
235 | for (auto &Def : MI.all_defs()) |
236 | if (Def.isReg() && Def.getReg().isPhysical()) { |
237 | LLVM_DEBUG(dbgs() << "Physical registers are not supported in " |
238 | "window scheduling!\n" ); |
239 | return false; |
240 | } |
241 | } |
242 | if (SchedInstrNum <= WindowRegionLimit) { |
243 | LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n" ); |
244 | return false; |
245 | } |
246 | return true; |
247 | } |
248 | |
249 | void WindowScheduler::preProcess() { |
250 | // Prior to window scheduling, it's necessary to backup the original MBB, |
251 | // generate a new TripleMBB, and build a TripleDAG based on the TripleMBB. |
252 | backupMBB(); |
253 | generateTripleMBB(); |
254 | TripleDAG->startBlock(BB: MBB); |
255 | TripleDAG->enterRegion( |
256 | bb: MBB, begin: MBB->begin(), end: MBB->getFirstTerminator(), |
257 | regioninstrs: std::distance(first: MBB->begin(), last: MBB->getFirstTerminator())); |
258 | TripleDAG->buildSchedGraph(AA: Context->AA); |
259 | } |
260 | |
261 | void WindowScheduler::postProcess() { |
262 | // After window scheduling, it's necessary to clear the TripleDAG and restore |
263 | // to the original MBB. |
264 | TripleDAG->exitRegion(); |
265 | TripleDAG->finishBlock(); |
266 | restoreMBB(); |
267 | } |
268 | |
269 | void WindowScheduler::backupMBB() { |
270 | for (auto &MI : MBB->instrs()) |
271 | OriMIs.push_back(Elt: &MI); |
272 | // Remove MIs and the corresponding live intervals. |
273 | for (auto &MI : make_early_inc_range(Range&: *MBB)) { |
274 | Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, AllowBundled: true); |
275 | MBB->remove(I: &MI); |
276 | } |
277 | } |
278 | |
279 | void WindowScheduler::restoreMBB() { |
280 | // Erase MIs and the corresponding live intervals. |
281 | for (auto &MI : make_early_inc_range(Range&: *MBB)) { |
282 | Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, AllowBundled: true); |
283 | MI.eraseFromParent(); |
284 | } |
285 | // Restore MBB to the state before window scheduling. |
286 | llvm::append_range(C&: *MBB, R&: OriMIs); |
287 | updateLiveIntervals(); |
288 | } |
289 | |
290 | void WindowScheduler::generateTripleMBB() { |
291 | const unsigned DuplicateNum = 3; |
292 | TriMIs.clear(); |
293 | TriToOri.clear(); |
294 | assert(OriMIs.size() > 0 && "The Original MIs were not backed up!" ); |
295 | // Step 1: Performing the first copy of MBB instructions, excluding |
296 | // terminators. At the same time, we back up the anti-register of phis. |
297 | // DefPairs hold the old and new define register pairs. |
298 | DenseMap<Register, Register> DefPairs; |
299 | for (auto *MI : OriMIs) { |
300 | if (MI->isMetaInstruction() || MI->isTerminator()) |
301 | continue; |
302 | if (MI->isPHI()) |
303 | if (Register AntiReg = getAntiRegister(Phi: MI)) |
304 | DefPairs[MI->getOperand(i: 0).getReg()] = AntiReg; |
305 | auto *NewMI = MF->CloneMachineInstr(Orig: MI); |
306 | MBB->push_back(MI: NewMI); |
307 | TriMIs.push_back(Elt: NewMI); |
308 | TriToOri[NewMI] = MI; |
309 | } |
310 | // Step 2: Performing the remaining two copies of MBB instructions excluding |
311 | // phis, and the last one contains terminators. At the same time, registers |
312 | // are updated accordingly. |
313 | for (size_t Cnt = 1; Cnt < DuplicateNum; ++Cnt) { |
314 | for (auto *MI : OriMIs) { |
315 | if (MI->isPHI() || MI->isMetaInstruction() || |
316 | (MI->isTerminator() && Cnt < DuplicateNum - 1)) |
317 | continue; |
318 | auto *NewMI = MF->CloneMachineInstr(Orig: MI); |
319 | DenseMap<Register, Register> NewDefs; |
320 | // New defines are updated. |
321 | for (auto MO : NewMI->all_defs()) |
322 | if (MO.isReg() && MO.getReg().isVirtual()) { |
323 | Register NewDef = |
324 | MRI->createVirtualRegister(RegClass: MRI->getRegClass(Reg: MO.getReg())); |
325 | NewMI->substituteRegister(FromReg: MO.getReg(), ToReg: NewDef, SubIdx: 0, RegInfo: *TRI); |
326 | NewDefs[MO.getReg()] = NewDef; |
327 | } |
328 | // New uses are updated. |
329 | for (auto DefRegPair : DefPairs) |
330 | if (NewMI->readsRegister(Reg: DefRegPair.first, TRI)) { |
331 | Register NewUse = DefRegPair.second; |
332 | // Note the update process for '%1 -> %9' in '%10 = sub i32 %9, %3': |
333 | // |
334 | // BB.3: DefPairs |
335 | // ================================== |
336 | // %1 = phi i32 [%2, %BB.1], [%7, %BB.3] (%1,%7) |
337 | // ... |
338 | // ================================== |
339 | // ... |
340 | // %4 = sub i32 %1, %3 |
341 | // ... |
342 | // %7 = add i32 %5, %6 |
343 | // ... |
344 | // ---------------------------------- |
345 | // ... |
346 | // %8 = sub i32 %7, %3 (%1,%7),(%4,%8) |
347 | // ... |
348 | // %9 = add i32 %5, %6 (%1,%7),(%4,%8),(%7,%9) |
349 | // ... |
350 | // ---------------------------------- |
351 | // ... |
352 | // %10 = sub i32 %9, %3 (%1,%7),(%4,%10),(%7,%9) |
353 | // ... ^ |
354 | // %11 = add i32 %5, %6 (%1,%7),(%4,%10),(%7,%11) |
355 | // ... |
356 | // ================================== |
357 | // < Terminators > |
358 | // ================================== |
359 | if (auto It = DefPairs.find(Val: NewUse); It != DefPairs.end()) |
360 | NewUse = It->second; |
361 | NewMI->substituteRegister(FromReg: DefRegPair.first, ToReg: NewUse, SubIdx: 0, RegInfo: *TRI); |
362 | } |
363 | // DefPairs is updated at last. |
364 | for (auto &NewDef : NewDefs) |
365 | DefPairs[NewDef.first] = NewDef.second; |
366 | MBB->push_back(MI: NewMI); |
367 | TriMIs.push_back(Elt: NewMI); |
368 | TriToOri[NewMI] = MI; |
369 | } |
370 | } |
371 | // Step 3: The registers used by phis are updated, and they are generated in |
372 | // the third copy of MBB. |
373 | // In the privious example, the old phi is: |
374 | // %1 = phi i32 [%2, %BB.1], [%7, %BB.3] |
375 | // The new phi is: |
376 | // %1 = phi i32 [%2, %BB.1], [%11, %BB.3] |
377 | for (auto &Phi : MBB->phis()) { |
378 | for (auto DefRegPair : DefPairs) |
379 | if (Phi.readsRegister(Reg: DefRegPair.first, TRI)) |
380 | Phi.substituteRegister(FromReg: DefRegPair.first, ToReg: DefRegPair.second, SubIdx: 0, RegInfo: *TRI); |
381 | } |
382 | updateLiveIntervals(); |
383 | } |
384 | |
385 | void WindowScheduler::restoreTripleMBB() { |
386 | // After list scheduling, the MBB is restored in one traversal. |
387 | for (size_t I = 0; I < TriMIs.size(); ++I) { |
388 | auto *MI = TriMIs[I]; |
389 | auto OldPos = MBB->begin(); |
390 | std::advance(i&: OldPos, n: I); |
391 | auto CurPos = MI->getIterator(); |
392 | if (CurPos != OldPos) { |
393 | MBB->splice(Where: OldPos, Other: MBB, From: CurPos); |
394 | Context->LIS->handleMove(MI&: *MI, /*UpdateFlags=*/false); |
395 | } |
396 | } |
397 | } |
398 | |
399 | SmallVector<unsigned> WindowScheduler::getSearchIndexes(unsigned SearchNum, |
400 | unsigned SearchRatio) { |
401 | // We use SearchRatio to get the index range, and then evenly get the indexes |
402 | // according to the SearchNum. This is a simple huristic. Depending on the |
403 | // characteristics of the target, more complex algorithms can be used for both |
404 | // performance and compilation time. |
405 | assert(SearchRatio <= 100 && "SearchRatio should be equal or less than 100!" ); |
406 | unsigned MaxIdx = SchedInstrNum * SearchRatio / 100; |
407 | unsigned Step = SearchNum > 0 && SearchNum <= MaxIdx ? MaxIdx / SearchNum : 1; |
408 | SmallVector<unsigned> SearchIndexes; |
409 | for (unsigned Idx = 0; Idx < MaxIdx; Idx += Step) |
410 | SearchIndexes.push_back(Elt: Idx); |
411 | return SearchIndexes; |
412 | } |
413 | |
414 | int WindowScheduler::getEstimatedII(ScheduleDAGInstrs &DAG) { |
415 | // Sometimes MaxDepth is 0, so it should be limited to the minimum of 1. |
416 | unsigned MaxDepth = 1; |
417 | for (auto &SU : DAG.SUnits) |
418 | MaxDepth = std::max(a: SU.getDepth() + SU.Latency, b: MaxDepth); |
419 | return MaxDepth * WindowIICoeff; |
420 | } |
421 | |
422 | int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG, |
423 | unsigned Offset) { |
424 | int InitII = getEstimatedII(DAG); |
425 | ResourceManager RM(Subtarget, &DAG); |
426 | RM.init(II: InitII); |
427 | // ResourceManager and DAG are used to calculate the maximum cycle for the |
428 | // scheduled MIs. Since MIs in the Region have already been scheduled, the |
429 | // emit cycles can be estimated in order here. |
430 | int CurCycle = 0; |
431 | auto Range = getScheduleRange(Offset, Num: SchedInstrNum); |
432 | for (auto &MI : Range) { |
433 | auto *SU = DAG.getSUnit(MI: &MI); |
434 | int ExpectCycle = CurCycle; |
435 | // The predecessors of current MI determine its earliest issue cycle. |
436 | for (auto &Pred : SU->Preds) { |
437 | if (Pred.isWeak()) |
438 | continue; |
439 | auto *PredMI = Pred.getSUnit()->getInstr(); |
440 | int PredCycle = getOriCycle(NewMI: PredMI); |
441 | ExpectCycle = std::max(a: ExpectCycle, b: PredCycle + (int)Pred.getLatency()); |
442 | } |
443 | // Zero cost instructions do not need to check resource. |
444 | if (!TII->isZeroCost(Opcode: MI.getOpcode())) { |
445 | // ResourceManager can be used to detect resource conflicts between the |
446 | // current MI and the previously inserted MIs. |
447 | while (!RM.canReserveResources(SU&: *SU, Cycle: CurCycle) || CurCycle < ExpectCycle) { |
448 | ++CurCycle; |
449 | if (CurCycle == (int)WindowIILimit) |
450 | return CurCycle; |
451 | } |
452 | RM.reserveResources(SU&: *SU, Cycle: CurCycle); |
453 | } |
454 | OriToCycle[getOriMI(NewMI: &MI)] = CurCycle; |
455 | LLVM_DEBUG(dbgs() << "\tCycle " << CurCycle << " [S." |
456 | << getOriStage(getOriMI(&MI), Offset) << "]: " << MI); |
457 | } |
458 | LLVM_DEBUG(dbgs() << "MaxCycle is " << CurCycle << ".\n" ); |
459 | return CurCycle; |
460 | } |
461 | |
462 | // By utilizing TripleDAG, we can easily establish dependencies between A and B. |
463 | // Based on the MaxCycle and the issue cycle of A and B, we can determine |
464 | // whether it is necessary to add a stall cycle. This is because, without |
465 | // inserting the stall cycle, the latency constraint between A and B cannot be |
466 | // satisfied. The details are as follows: |
467 | // |
468 | // New MBB: |
469 | // ======================================== |
470 | // < Phis > |
471 | // ======================================== (sliding direction) |
472 | // MBB copy 1 | |
473 | // V |
474 | // |
475 | // ~~~~~~~~~~~~~~~~~~~|~~~~~~~~~~~~~~~~~~~~ ----schedule window----- |
476 | // | | |
477 | // ===================V==================== | |
478 | // MBB copy 2 < MI B > | |
479 | // | |
480 | // < MI A > V |
481 | // ~~~~~~~~~~~~~~~~~~~:~~~~~~~~~~~~~~~~~~~~ ------------------------ |
482 | // : |
483 | // ===================V==================== |
484 | // MBB copy 3 < MI B'> |
485 | // |
486 | // |
487 | // |
488 | // |
489 | // ======================================== |
490 | // < Terminators > |
491 | // ======================================== |
492 | int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) { |
493 | int MaxStallCycle = 0; |
494 | int CurrentII = MaxCycle + 1; |
495 | auto Range = getScheduleRange(Offset, Num: SchedInstrNum); |
496 | for (auto &MI : Range) { |
497 | auto *SU = TripleDAG->getSUnit(MI: &MI); |
498 | int DefCycle = getOriCycle(NewMI: &MI); |
499 | for (auto &Succ : SU->Succs) { |
500 | if (Succ.isWeak() || Succ.getSUnit() == &TripleDAG->ExitSU) |
501 | continue; |
502 | // If the expected cycle does not exceed CurrentII, no check is needed. |
503 | if (DefCycle + (int)Succ.getLatency() <= CurrentII) |
504 | continue; |
505 | // If the cycle of the scheduled MI A is less than that of the scheduled |
506 | // MI B, the scheduling will fail because the lifetime of the |
507 | // corresponding register exceeds II. |
508 | auto *SuccMI = Succ.getSUnit()->getInstr(); |
509 | int UseCycle = getOriCycle(NewMI: SuccMI); |
510 | if (DefCycle < UseCycle) |
511 | return WindowIILimit; |
512 | // Get the stall cycle introduced by the register between two trips. |
513 | int StallCycle = DefCycle + (int)Succ.getLatency() - CurrentII - UseCycle; |
514 | MaxStallCycle = std::max(a: MaxStallCycle, b: StallCycle); |
515 | } |
516 | } |
517 | LLVM_DEBUG(dbgs() << "MaxStallCycle is " << MaxStallCycle << ".\n" ); |
518 | return MaxStallCycle; |
519 | } |
520 | |
521 | unsigned WindowScheduler::analyseII(ScheduleDAGInstrs &DAG, unsigned Offset) { |
522 | LLVM_DEBUG(dbgs() << "Start analyzing II:\n" ); |
523 | int MaxCycle = calculateMaxCycle(DAG, Offset); |
524 | if (MaxCycle == (int)WindowIILimit) |
525 | return MaxCycle; |
526 | int StallCycle = calculateStallCycle(Offset, MaxCycle); |
527 | if (StallCycle == (int)WindowIILimit) |
528 | return StallCycle; |
529 | // The value of II is equal to the maximum execution cycle plus 1. |
530 | return MaxCycle + StallCycle + 1; |
531 | } |
532 | |
533 | void WindowScheduler::schedulePhi(int Offset, unsigned &II) { |
534 | LLVM_DEBUG(dbgs() << "Start scheduling Phis:\n" ); |
535 | for (auto &Phi : MBB->phis()) { |
536 | int LateCycle = INT_MAX; |
537 | auto *SU = TripleDAG->getSUnit(MI: &Phi); |
538 | for (auto &Succ : SU->Succs) { |
539 | // Phi doesn't have any Anti successors. |
540 | if (Succ.getKind() != SDep::Data) |
541 | continue; |
542 | // Phi is scheduled before the successor of stage 0. The issue cycle of |
543 | // phi is the latest cycle in this interval. |
544 | auto *SuccMI = Succ.getSUnit()->getInstr(); |
545 | int Cycle = getOriCycle(NewMI: SuccMI); |
546 | if (getOriStage(OriMI: getOriMI(NewMI: SuccMI), Offset) == 0) |
547 | LateCycle = std::min(a: LateCycle, b: Cycle); |
548 | } |
549 | // The anti-dependency of phi need to be handled separately in the same way. |
550 | if (Register AntiReg = getAntiRegister(Phi: &Phi)) { |
551 | auto *AntiMI = MRI->getVRegDef(Reg: AntiReg); |
552 | // AntiReg may be defined outside the kernel MBB. |
553 | if (AntiMI->getParent() == MBB) { |
554 | auto AntiCycle = getOriCycle(NewMI: AntiMI); |
555 | if (getOriStage(OriMI: getOriMI(NewMI: AntiMI), Offset) == 0) |
556 | LateCycle = std::min(a: LateCycle, b: AntiCycle); |
557 | } |
558 | } |
559 | // If there is no limit to the late cycle, a default value is given. |
560 | if (LateCycle == INT_MAX) |
561 | LateCycle = (int)(II - 1); |
562 | LLVM_DEBUG(dbgs() << "\tCycle range [0, " << LateCycle << "] " << Phi); |
563 | // The issue cycle of phi is set to the latest cycle in the interval. |
564 | auto *OriPhi = getOriMI(NewMI: &Phi); |
565 | OriToCycle[OriPhi] = LateCycle; |
566 | } |
567 | } |
568 | |
569 | DenseMap<MachineInstr *, int> WindowScheduler::getIssueOrder(unsigned Offset, |
570 | unsigned II) { |
571 | // At each issue cycle, phi is placed before MIs in stage 0. So the simplest |
572 | // way is to put phi at the beginning of the current cycle. |
573 | DenseMap<int, SmallVector<MachineInstr *>> CycleToMIs; |
574 | auto Range = getScheduleRange(Offset, Num: SchedInstrNum); |
575 | for (auto &Phi : MBB->phis()) |
576 | CycleToMIs[getOriCycle(NewMI: &Phi)].push_back(Elt: getOriMI(NewMI: &Phi)); |
577 | for (auto &MI : Range) |
578 | CycleToMIs[getOriCycle(NewMI: &MI)].push_back(Elt: getOriMI(NewMI: &MI)); |
579 | // Each MI is assigned a separate ordered Id, which is used as a sort marker |
580 | // in the following expand process. |
581 | DenseMap<MachineInstr *, int> IssueOrder; |
582 | int Id = 0; |
583 | for (int Cycle = 0; Cycle < (int)II; ++Cycle) { |
584 | auto It = CycleToMIs.find(Val: Cycle); |
585 | if (It == CycleToMIs.end()) |
586 | continue; |
587 | for (auto *MI : It->second) |
588 | IssueOrder[MI] = Id++; |
589 | } |
590 | return IssueOrder; |
591 | } |
592 | |
593 | void WindowScheduler::updateScheduleResult(unsigned Offset, unsigned II) { |
594 | // At the first update, Offset is equal to SchedPhiNum. At this time, only |
595 | // BestII, BestOffset, and BaseII need to be updated. |
596 | if (Offset == SchedPhiNum) { |
597 | BestII = II; |
598 | BestOffset = SchedPhiNum; |
599 | BaseII = II; |
600 | return; |
601 | } |
602 | // The update will only continue if the II is smaller than BestII and the II |
603 | // is sufficiently small. |
604 | if ((II >= BestII) || (II + WindowDiffLimit > BaseII)) |
605 | return; |
606 | BestII = II; |
607 | BestOffset = Offset; |
608 | // Record the result of the current list scheduling, noting that each MI is |
609 | // stored unordered in SchedResult. |
610 | SchedResult.clear(); |
611 | auto IssueOrder = getIssueOrder(Offset, II); |
612 | for (auto &Pair : OriToCycle) { |
613 | assert(IssueOrder.count(Pair.first) && "Cannot find original MI!" ); |
614 | SchedResult.push_back(Elt: std::make_tuple(args&: Pair.first, args&: Pair.second, |
615 | args: getOriStage(OriMI: Pair.first, Offset), |
616 | args&: IssueOrder[Pair.first])); |
617 | } |
618 | } |
619 | |
620 | void WindowScheduler::expand() { |
621 | // The MIs in the SchedResult are sorted by the issue order ID. |
622 | llvm::stable_sort(Range&: SchedResult, |
623 | C: [](const std::tuple<MachineInstr *, int, int, int> &A, |
624 | const std::tuple<MachineInstr *, int, int, int> &B) { |
625 | return std::get<3>(t: A) < std::get<3>(t: B); |
626 | }); |
627 | // Use the scheduling infrastructure for expansion, noting that InstrChanges |
628 | // is not supported here. |
629 | DenseMap<MachineInstr *, int> Cycles, Stages; |
630 | std::vector<MachineInstr *> OrderedInsts; |
631 | for (auto &Info : SchedResult) { |
632 | auto *MI = std::get<0>(t&: Info); |
633 | OrderedInsts.push_back(x: MI); |
634 | Cycles[MI] = std::get<1>(t&: Info); |
635 | Stages[MI] = std::get<2>(t&: Info); |
636 | LLVM_DEBUG(dbgs() << "\tCycle " << Cycles[MI] << " [S." << Stages[MI] |
637 | << "]: " << *MI); |
638 | } |
639 | ModuloSchedule MS(*MF, &Loop, std::move(OrderedInsts), std::move(Cycles), |
640 | std::move(Stages)); |
641 | ModuloScheduleExpander MSE(*MF, MS, *Context->LIS, |
642 | ModuloScheduleExpander::InstrChangesTy()); |
643 | MSE.expand(); |
644 | MSE.cleanup(); |
645 | } |
646 | |
647 | void WindowScheduler::updateLiveIntervals() { |
648 | SmallVector<Register, 128> UsedRegs; |
649 | for (MachineInstr &MI : *MBB) |
650 | for (const MachineOperand &MO : MI.operands()) { |
651 | if (!MO.isReg() || MO.getReg() == 0) |
652 | continue; |
653 | Register Reg = MO.getReg(); |
654 | if (!is_contained(Range&: UsedRegs, Element: Reg)) |
655 | UsedRegs.push_back(Elt: Reg); |
656 | } |
657 | Context->LIS->repairIntervalsInRange(MBB, Begin: MBB->begin(), End: MBB->end(), OrigRegs: UsedRegs); |
658 | } |
659 | |
660 | iterator_range<MachineBasicBlock::iterator> |
661 | WindowScheduler::getScheduleRange(unsigned Offset, unsigned Num) { |
662 | auto RegionBegin = MBB->begin(); |
663 | std::advance(i&: RegionBegin, n: Offset); |
664 | auto RegionEnd = RegionBegin; |
665 | std::advance(i&: RegionEnd, n: Num); |
666 | return make_range(x: RegionBegin, y: RegionEnd); |
667 | } |
668 | |
669 | int WindowScheduler::getOriCycle(MachineInstr *NewMI) { |
670 | assert(TriToOri.count(NewMI) && "Cannot find original MI!" ); |
671 | auto *OriMI = TriToOri[NewMI]; |
672 | assert(OriToCycle.count(OriMI) && "Cannot find schedule cycle!" ); |
673 | return OriToCycle[OriMI]; |
674 | } |
675 | |
676 | MachineInstr *WindowScheduler::getOriMI(MachineInstr *NewMI) { |
677 | assert(TriToOri.count(NewMI) && "Cannot find original MI!" ); |
678 | return TriToOri[NewMI]; |
679 | } |
680 | |
681 | unsigned WindowScheduler::getOriStage(MachineInstr *OriMI, unsigned Offset) { |
682 | assert(llvm::is_contained(OriMIs, OriMI) && "Cannot find OriMI in OriMIs!" ); |
683 | // If there is no instruction fold, all MI stages are 0. |
684 | if (Offset == SchedPhiNum) |
685 | return 0; |
686 | // For those MIs with an ID less than the Offset, their stages are set to 0, |
687 | // while the rest are set to 1. |
688 | unsigned Id = 0; |
689 | for (auto *MI : OriMIs) { |
690 | if (MI->isMetaInstruction()) |
691 | continue; |
692 | if (MI == OriMI) |
693 | break; |
694 | ++Id; |
695 | } |
696 | return Id >= (size_t)Offset ? 1 : 0; |
697 | } |
698 | |
699 | Register WindowScheduler::getAntiRegister(MachineInstr *Phi) { |
700 | assert(Phi->isPHI() && "Expecting PHI!" ); |
701 | Register AntiReg; |
702 | for (auto MO : Phi->uses()) { |
703 | if (MO.isReg()) |
704 | AntiReg = MO.getReg(); |
705 | else if (MO.isMBB() && MO.getMBB() == MBB) |
706 | return AntiReg; |
707 | } |
708 | return 0; |
709 | } |
710 | |