| 1 | //======----------- WindowScheduler.cpp - window scheduler -------------======// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // An implementation of the Window Scheduling software pipelining algorithm. |
| 10 | // |
| 11 | // The fundamental concept of the window scheduling algorithm involves folding |
| 12 | // the original MBB at a specific position, followed by list scheduling on the |
| 13 | // folded MIs. The optimal scheduling result is then chosen from various folding |
| 14 | // positions as the final scheduling outcome. |
| 15 | // |
| 16 | // The primary challenge in this algorithm lies in generating the folded MIs and |
| 17 | // establishing their dependencies. We have innovatively employed a new MBB, |
| 18 | // created by copying the original MBB three times, known as TripleMBB. This |
| 19 | // TripleMBB enables the convenient implementation of MI folding and dependency |
| 20 | // establishment. To facilitate the algorithm's implementation, we have also |
| 21 | // devised data structures such as OriMIs, TriMIs, TriToOri, and OriToCycle. |
| 22 | // |
| 23 | // Another challenge in the algorithm is the scheduling of phis. Semantically, |
| 24 | // it is difficult to place the phis in the window and perform list scheduling. |
| 25 | // Therefore, we schedule these phis separately after each list scheduling. |
| 26 | // |
| 27 | // The provided implementation is designed for use before the Register Allocator |
| 28 | // (RA). If the target requires implementation after RA, it is recommended to |
| 29 | // reimplement analyseII(), schedulePhi(), and expand(). Additionally, |
| 30 | // target-specific logic can be added in initialize(), preProcess(), and |
| 31 | // postProcess(). |
| 32 | // |
| 33 | // Lastly, it is worth mentioning that getSearchIndexes() is an important |
| 34 | // function. We have experimented with more complex heuristics on downstream |
| 35 | // target and achieved favorable results. |
| 36 | // |
| 37 | //===----------------------------------------------------------------------===// |
| 38 | #include "llvm/CodeGen/WindowScheduler.h" |
| 39 | #include "llvm/ADT/Statistic.h" |
| 40 | #include "llvm/CodeGen/LiveIntervals.h" |
| 41 | #include "llvm/CodeGen/MachineLoopInfo.h" |
| 42 | #include "llvm/CodeGen/MachinePipeliner.h" |
| 43 | #include "llvm/CodeGen/ModuloSchedule.h" |
| 44 | #include "llvm/CodeGen/TargetPassConfig.h" |
| 45 | #include "llvm/Support/CommandLine.h" |
| 46 | #include "llvm/Support/Debug.h" |
| 47 | #include "llvm/Support/TimeProfiler.h" |
| 48 | #include "llvm/Target/TargetMachine.h" |
| 49 | |
| 50 | using namespace llvm; |
| 51 | |
| 52 | #define DEBUG_TYPE "pipeliner" |
| 53 | |
| 54 | namespace { |
| 55 | STATISTIC(NumTryWindowSchedule, |
| 56 | "Number of loops that we attempt to use window scheduling" ); |
| 57 | STATISTIC(NumTryWindowSearch, |
| 58 | "Number of times that we run list schedule in the window scheduling" ); |
| 59 | STATISTIC(NumWindowSchedule, |
| 60 | "Number of loops that we successfully use window scheduling" ); |
| 61 | STATISTIC(NumFailAnalyseII, |
| 62 | "Window scheduling abort due to the failure of the II analysis" ); |
| 63 | |
| 64 | cl::opt<unsigned> |
| 65 | WindowSearchNum("window-search-num" , |
| 66 | cl::desc("The number of searches per loop in the window " |
| 67 | "algorithm. 0 means no search number limit." ), |
| 68 | cl::Hidden, cl::init(Val: 6)); |
| 69 | |
| 70 | cl::opt<unsigned> WindowSearchRatio( |
| 71 | "window-search-ratio" , |
| 72 | cl::desc("The ratio of searches per loop in the window algorithm. 100 " |
| 73 | "means search all positions in the loop, while 0 means not " |
| 74 | "performing any search." ), |
| 75 | cl::Hidden, cl::init(Val: 40)); |
| 76 | |
| 77 | cl::opt<unsigned> WindowIICoeff( |
| 78 | "window-ii-coeff" , |
| 79 | cl::desc( |
| 80 | "The coefficient used when initializing II in the window algorithm." ), |
| 81 | cl::Hidden, cl::init(Val: 5)); |
| 82 | |
| 83 | cl::opt<unsigned> WindowRegionLimit( |
| 84 | "window-region-limit" , |
| 85 | cl::desc( |
| 86 | "The lower limit of the scheduling region in the window algorithm." ), |
| 87 | cl::Hidden, cl::init(Val: 3)); |
| 88 | |
| 89 | cl::opt<unsigned> WindowDiffLimit( |
| 90 | "window-diff-limit" , |
| 91 | cl::desc("The lower limit of the difference between best II and base II in " |
| 92 | "the window algorithm. If the difference is smaller than " |
| 93 | "this lower limit, window scheduling will not be performed." ), |
| 94 | cl::Hidden, cl::init(Val: 2)); |
| 95 | } // namespace |
| 96 | |
| 97 | // WindowIILimit serves as an indicator of abnormal scheduling results and could |
| 98 | // potentially be referenced by the derived target window scheduler. |
| 99 | static cl::opt<unsigned> |
| 100 | WindowIILimit("window-ii-limit" , |
| 101 | cl::desc("The upper limit of II in the window algorithm." ), |
| 102 | cl::Hidden, cl::init(Val: 1000)); |
| 103 | |
| 104 | WindowScheduler::WindowScheduler(MachineSchedContext *C, MachineLoop &ML) |
| 105 | : Context(C), MF(C->MF), MBB(ML.getHeader()), Loop(ML), |
| 106 | Subtarget(&MF->getSubtarget()), TII(Subtarget->getInstrInfo()), |
| 107 | TRI(Subtarget->getRegisterInfo()), MRI(&MF->getRegInfo()) { |
| 108 | TripleDAG = std::unique_ptr<ScheduleDAGInstrs>( |
| 109 | createMachineScheduler(/*OnlyBuildGraph=*/true)); |
| 110 | } |
| 111 | |
| 112 | bool WindowScheduler::run() { |
| 113 | if (!initialize()) { |
| 114 | LLVM_DEBUG(dbgs() << "The WindowScheduler failed to initialize!\n" ); |
| 115 | return false; |
| 116 | } |
| 117 | // The window algorithm is time-consuming, and its compilation time should be |
| 118 | // taken into consideration. |
| 119 | TimeTraceScope Scope("WindowSearch" ); |
| 120 | ++NumTryWindowSchedule; |
| 121 | // Performing the relevant processing before window scheduling. |
| 122 | preProcess(); |
| 123 | // The main window scheduling begins. |
| 124 | std::unique_ptr<ScheduleDAGInstrs> SchedDAG(createMachineScheduler()); |
| 125 | auto SearchIndexes = getSearchIndexes(SearchNum: WindowSearchNum, SearchRatio: WindowSearchRatio); |
| 126 | for (unsigned Idx : SearchIndexes) { |
| 127 | OriToCycle.clear(); |
| 128 | ++NumTryWindowSearch; |
| 129 | // The scheduling starts with non-phi instruction, so SchedPhiNum needs to |
| 130 | // be added to Idx. |
| 131 | unsigned Offset = Idx + SchedPhiNum; |
| 132 | auto Range = getScheduleRange(Offset, Num: SchedInstrNum); |
| 133 | SchedDAG->startBlock(BB: MBB); |
| 134 | SchedDAG->enterRegion(bb: MBB, begin: Range.begin(), end: Range.end(), regioninstrs: SchedInstrNum); |
| 135 | SchedDAG->schedule(); |
| 136 | LLVM_DEBUG(SchedDAG->dump()); |
| 137 | unsigned II = analyseII(DAG&: *SchedDAG, Offset); |
| 138 | if (II == WindowIILimit) { |
| 139 | restoreTripleMBB(); |
| 140 | LLVM_DEBUG(dbgs() << "Can't find a valid II. Keep searching...\n" ); |
| 141 | ++NumFailAnalyseII; |
| 142 | continue; |
| 143 | } |
| 144 | schedulePhi(Offset, II); |
| 145 | updateScheduleResult(Offset, II); |
| 146 | restoreTripleMBB(); |
| 147 | LLVM_DEBUG(dbgs() << "Current window Offset is " << Offset << " and II is " |
| 148 | << II << ".\n" ); |
| 149 | } |
| 150 | // Performing the relevant processing after window scheduling. |
| 151 | postProcess(); |
| 152 | // Check whether the scheduling result is valid. |
| 153 | if (!isScheduleValid()) { |
| 154 | LLVM_DEBUG(dbgs() << "Window scheduling is not needed!\n" ); |
| 155 | return false; |
| 156 | } |
| 157 | LLVM_DEBUG(dbgs() << "\nBest window offset is " << BestOffset |
| 158 | << " and Best II is " << BestII << ".\n" ); |
| 159 | // Expand the scheduling result to prologue, kernel, and epilogue. |
| 160 | expand(); |
| 161 | ++NumWindowSchedule; |
| 162 | return true; |
| 163 | } |
| 164 | |
| 165 | ScheduleDAGInstrs * |
| 166 | WindowScheduler::createMachineScheduler(bool OnlyBuildGraph) { |
| 167 | return OnlyBuildGraph |
| 168 | ? new ScheduleDAGMI( |
| 169 | Context, std::make_unique<PostGenericScheduler>(args&: Context), |
| 170 | true) |
| 171 | : Context->TM->createMachineScheduler(C: Context); |
| 172 | } |
| 173 | |
| 174 | bool WindowScheduler::initialize() { |
| 175 | if (!Subtarget->enableWindowScheduler()) { |
| 176 | LLVM_DEBUG(dbgs() << "Target disables the window scheduling!\n" ); |
| 177 | return false; |
| 178 | } |
| 179 | // Initialized the member variables used by window algorithm. |
| 180 | OriMIs.clear(); |
| 181 | TriMIs.clear(); |
| 182 | TriToOri.clear(); |
| 183 | OriToCycle.clear(); |
| 184 | SchedResult.clear(); |
| 185 | SchedPhiNum = 0; |
| 186 | SchedInstrNum = 0; |
| 187 | BestII = UINT_MAX; |
| 188 | BestOffset = 0; |
| 189 | BaseII = 0; |
| 190 | // List scheduling used in the window algorithm depends on LiveIntervals. |
| 191 | if (!Context->LIS) { |
| 192 | LLVM_DEBUG(dbgs() << "There is no LiveIntervals information!\n" ); |
| 193 | return false; |
| 194 | } |
| 195 | // Check each MI in MBB. |
| 196 | SmallSet<Register, 8> PrevDefs; |
| 197 | SmallSet<Register, 8> PrevUses; |
| 198 | auto IsLoopCarried = [&](MachineInstr &Phi) { |
| 199 | // Two cases are checked here: (1)The virtual register defined by the |
| 200 | // preceding phi is used by the succeeding phi;(2)The preceding phi uses the |
| 201 | // virtual register defined by the succeeding phi. |
| 202 | if (PrevUses.count(V: Phi.getOperand(i: 0).getReg())) |
| 203 | return true; |
| 204 | PrevDefs.insert(V: Phi.getOperand(i: 0).getReg()); |
| 205 | for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) { |
| 206 | if (PrevDefs.count(V: Phi.getOperand(i: I).getReg())) |
| 207 | return true; |
| 208 | PrevUses.insert(V: Phi.getOperand(i: I).getReg()); |
| 209 | } |
| 210 | return false; |
| 211 | }; |
| 212 | auto PLI = TII->analyzeLoopForPipelining(LoopBB: MBB); |
| 213 | for (auto &MI : *MBB) { |
| 214 | if (MI.isMetaInstruction() || MI.isTerminator()) |
| 215 | continue; |
| 216 | if (MI.isPHI()) { |
| 217 | if (IsLoopCarried(MI)) { |
| 218 | LLVM_DEBUG(dbgs() << "Loop carried phis are not supported yet!\n" ); |
| 219 | return false; |
| 220 | } |
| 221 | ++SchedPhiNum; |
| 222 | ++BestOffset; |
| 223 | } else |
| 224 | ++SchedInstrNum; |
| 225 | if (TII->isSchedulingBoundary(MI, MBB, MF: *MF)) { |
| 226 | LLVM_DEBUG( |
| 227 | dbgs() << "Boundary MI is not allowed in window scheduling!\n" ); |
| 228 | return false; |
| 229 | } |
| 230 | if (PLI->shouldIgnoreForPipelining(MI: &MI)) { |
| 231 | LLVM_DEBUG(dbgs() << "Special MI defined by target is not allowed in " |
| 232 | "window scheduling!\n" ); |
| 233 | return false; |
| 234 | } |
| 235 | for (auto &Def : MI.all_defs()) |
| 236 | if (Def.isReg() && Def.getReg().isPhysical()) { |
| 237 | LLVM_DEBUG(dbgs() << "Physical registers are not supported in " |
| 238 | "window scheduling!\n" ); |
| 239 | return false; |
| 240 | } |
| 241 | } |
| 242 | if (SchedInstrNum <= WindowRegionLimit) { |
| 243 | LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n" ); |
| 244 | return false; |
| 245 | } |
| 246 | return true; |
| 247 | } |
| 248 | |
| 249 | void WindowScheduler::preProcess() { |
| 250 | // Prior to window scheduling, it's necessary to backup the original MBB, |
| 251 | // generate a new TripleMBB, and build a TripleDAG based on the TripleMBB. |
| 252 | backupMBB(); |
| 253 | generateTripleMBB(); |
| 254 | TripleDAG->startBlock(BB: MBB); |
| 255 | TripleDAG->enterRegion( |
| 256 | bb: MBB, begin: MBB->begin(), end: MBB->getFirstTerminator(), |
| 257 | regioninstrs: std::distance(first: MBB->begin(), last: MBB->getFirstTerminator())); |
| 258 | TripleDAG->buildSchedGraph(AA: Context->AA); |
| 259 | } |
| 260 | |
| 261 | void WindowScheduler::postProcess() { |
| 262 | // After window scheduling, it's necessary to clear the TripleDAG and restore |
| 263 | // to the original MBB. |
| 264 | TripleDAG->exitRegion(); |
| 265 | TripleDAG->finishBlock(); |
| 266 | restoreMBB(); |
| 267 | } |
| 268 | |
| 269 | void WindowScheduler::backupMBB() { |
| 270 | for (auto &MI : MBB->instrs()) |
| 271 | OriMIs.push_back(Elt: &MI); |
| 272 | // Remove MIs and the corresponding live intervals. |
| 273 | for (auto &MI : make_early_inc_range(Range&: *MBB)) { |
| 274 | Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, AllowBundled: true); |
| 275 | MBB->remove(I: &MI); |
| 276 | } |
| 277 | } |
| 278 | |
| 279 | void WindowScheduler::restoreMBB() { |
| 280 | // Erase MIs and the corresponding live intervals. |
| 281 | for (auto &MI : make_early_inc_range(Range&: *MBB)) { |
| 282 | Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, AllowBundled: true); |
| 283 | MI.eraseFromParent(); |
| 284 | } |
| 285 | // Restore MBB to the state before window scheduling. |
| 286 | llvm::append_range(C&: *MBB, R&: OriMIs); |
| 287 | updateLiveIntervals(); |
| 288 | } |
| 289 | |
| 290 | void WindowScheduler::generateTripleMBB() { |
| 291 | const unsigned DuplicateNum = 3; |
| 292 | TriMIs.clear(); |
| 293 | TriToOri.clear(); |
| 294 | assert(OriMIs.size() > 0 && "The Original MIs were not backed up!" ); |
| 295 | // Step 1: Performing the first copy of MBB instructions, excluding |
| 296 | // terminators. At the same time, we back up the anti-register of phis. |
| 297 | // DefPairs hold the old and new define register pairs. |
| 298 | DenseMap<Register, Register> DefPairs; |
| 299 | for (auto *MI : OriMIs) { |
| 300 | if (MI->isMetaInstruction() || MI->isTerminator()) |
| 301 | continue; |
| 302 | if (MI->isPHI()) |
| 303 | if (Register AntiReg = getAntiRegister(Phi: MI)) |
| 304 | DefPairs[MI->getOperand(i: 0).getReg()] = AntiReg; |
| 305 | auto *NewMI = MF->CloneMachineInstr(Orig: MI); |
| 306 | MBB->push_back(MI: NewMI); |
| 307 | TriMIs.push_back(Elt: NewMI); |
| 308 | TriToOri[NewMI] = MI; |
| 309 | } |
| 310 | // Step 2: Performing the remaining two copies of MBB instructions excluding |
| 311 | // phis, and the last one contains terminators. At the same time, registers |
| 312 | // are updated accordingly. |
| 313 | for (size_t Cnt = 1; Cnt < DuplicateNum; ++Cnt) { |
| 314 | for (auto *MI : OriMIs) { |
| 315 | if (MI->isPHI() || MI->isMetaInstruction() || |
| 316 | (MI->isTerminator() && Cnt < DuplicateNum - 1)) |
| 317 | continue; |
| 318 | auto *NewMI = MF->CloneMachineInstr(Orig: MI); |
| 319 | DenseMap<Register, Register> NewDefs; |
| 320 | // New defines are updated. |
| 321 | for (auto MO : NewMI->all_defs()) |
| 322 | if (MO.isReg() && MO.getReg().isVirtual()) { |
| 323 | Register NewDef = |
| 324 | MRI->createVirtualRegister(RegClass: MRI->getRegClass(Reg: MO.getReg())); |
| 325 | NewMI->substituteRegister(FromReg: MO.getReg(), ToReg: NewDef, SubIdx: 0, RegInfo: *TRI); |
| 326 | NewDefs[MO.getReg()] = NewDef; |
| 327 | } |
| 328 | // New uses are updated. |
| 329 | for (auto DefRegPair : DefPairs) |
| 330 | if (NewMI->readsRegister(Reg: DefRegPair.first, TRI)) { |
| 331 | Register NewUse = DefRegPair.second; |
| 332 | // Note the update process for '%1 -> %9' in '%10 = sub i32 %9, %3': |
| 333 | // |
| 334 | // BB.3: DefPairs |
| 335 | // ================================== |
| 336 | // %1 = phi i32 [%2, %BB.1], [%7, %BB.3] (%1,%7) |
| 337 | // ... |
| 338 | // ================================== |
| 339 | // ... |
| 340 | // %4 = sub i32 %1, %3 |
| 341 | // ... |
| 342 | // %7 = add i32 %5, %6 |
| 343 | // ... |
| 344 | // ---------------------------------- |
| 345 | // ... |
| 346 | // %8 = sub i32 %7, %3 (%1,%7),(%4,%8) |
| 347 | // ... |
| 348 | // %9 = add i32 %5, %6 (%1,%7),(%4,%8),(%7,%9) |
| 349 | // ... |
| 350 | // ---------------------------------- |
| 351 | // ... |
| 352 | // %10 = sub i32 %9, %3 (%1,%7),(%4,%10),(%7,%9) |
| 353 | // ... ^ |
| 354 | // %11 = add i32 %5, %6 (%1,%7),(%4,%10),(%7,%11) |
| 355 | // ... |
| 356 | // ================================== |
| 357 | // < Terminators > |
| 358 | // ================================== |
| 359 | if (auto It = DefPairs.find(Val: NewUse); It != DefPairs.end()) |
| 360 | NewUse = It->second; |
| 361 | NewMI->substituteRegister(FromReg: DefRegPair.first, ToReg: NewUse, SubIdx: 0, RegInfo: *TRI); |
| 362 | } |
| 363 | // DefPairs is updated at last. |
| 364 | for (auto &NewDef : NewDefs) |
| 365 | DefPairs[NewDef.first] = NewDef.second; |
| 366 | MBB->push_back(MI: NewMI); |
| 367 | TriMIs.push_back(Elt: NewMI); |
| 368 | TriToOri[NewMI] = MI; |
| 369 | } |
| 370 | } |
| 371 | // Step 3: The registers used by phis are updated, and they are generated in |
| 372 | // the third copy of MBB. |
| 373 | // In the privious example, the old phi is: |
| 374 | // %1 = phi i32 [%2, %BB.1], [%7, %BB.3] |
| 375 | // The new phi is: |
| 376 | // %1 = phi i32 [%2, %BB.1], [%11, %BB.3] |
| 377 | for (auto &Phi : MBB->phis()) { |
| 378 | for (auto DefRegPair : DefPairs) |
| 379 | if (Phi.readsRegister(Reg: DefRegPair.first, TRI)) |
| 380 | Phi.substituteRegister(FromReg: DefRegPair.first, ToReg: DefRegPair.second, SubIdx: 0, RegInfo: *TRI); |
| 381 | } |
| 382 | updateLiveIntervals(); |
| 383 | } |
| 384 | |
| 385 | void WindowScheduler::restoreTripleMBB() { |
| 386 | // After list scheduling, the MBB is restored in one traversal. |
| 387 | for (size_t I = 0; I < TriMIs.size(); ++I) { |
| 388 | auto *MI = TriMIs[I]; |
| 389 | auto OldPos = MBB->begin(); |
| 390 | std::advance(i&: OldPos, n: I); |
| 391 | auto CurPos = MI->getIterator(); |
| 392 | if (CurPos != OldPos) { |
| 393 | MBB->splice(Where: OldPos, Other: MBB, From: CurPos); |
| 394 | Context->LIS->handleMove(MI&: *MI, /*UpdateFlags=*/false); |
| 395 | } |
| 396 | } |
| 397 | } |
| 398 | |
| 399 | SmallVector<unsigned> WindowScheduler::getSearchIndexes(unsigned SearchNum, |
| 400 | unsigned SearchRatio) { |
| 401 | // We use SearchRatio to get the index range, and then evenly get the indexes |
| 402 | // according to the SearchNum. This is a simple huristic. Depending on the |
| 403 | // characteristics of the target, more complex algorithms can be used for both |
| 404 | // performance and compilation time. |
| 405 | assert(SearchRatio <= 100 && "SearchRatio should be equal or less than 100!" ); |
| 406 | unsigned MaxIdx = SchedInstrNum * SearchRatio / 100; |
| 407 | unsigned Step = SearchNum > 0 && SearchNum <= MaxIdx ? MaxIdx / SearchNum : 1; |
| 408 | SmallVector<unsigned> SearchIndexes; |
| 409 | for (unsigned Idx = 0; Idx < MaxIdx; Idx += Step) |
| 410 | SearchIndexes.push_back(Elt: Idx); |
| 411 | return SearchIndexes; |
| 412 | } |
| 413 | |
| 414 | int WindowScheduler::getEstimatedII(ScheduleDAGInstrs &DAG) { |
| 415 | // Sometimes MaxDepth is 0, so it should be limited to the minimum of 1. |
| 416 | unsigned MaxDepth = 1; |
| 417 | for (auto &SU : DAG.SUnits) |
| 418 | MaxDepth = std::max(a: SU.getDepth() + SU.Latency, b: MaxDepth); |
| 419 | return MaxDepth * WindowIICoeff; |
| 420 | } |
| 421 | |
| 422 | int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG, |
| 423 | unsigned Offset) { |
| 424 | int InitII = getEstimatedII(DAG); |
| 425 | ResourceManager RM(Subtarget, &DAG); |
| 426 | RM.init(II: InitII); |
| 427 | // ResourceManager and DAG are used to calculate the maximum cycle for the |
| 428 | // scheduled MIs. Since MIs in the Region have already been scheduled, the |
| 429 | // emit cycles can be estimated in order here. |
| 430 | int CurCycle = 0; |
| 431 | auto Range = getScheduleRange(Offset, Num: SchedInstrNum); |
| 432 | for (auto &MI : Range) { |
| 433 | auto *SU = DAG.getSUnit(MI: &MI); |
| 434 | int ExpectCycle = CurCycle; |
| 435 | // The predecessors of current MI determine its earliest issue cycle. |
| 436 | for (auto &Pred : SU->Preds) { |
| 437 | if (Pred.isWeak()) |
| 438 | continue; |
| 439 | auto *PredMI = Pred.getSUnit()->getInstr(); |
| 440 | int PredCycle = getOriCycle(NewMI: PredMI); |
| 441 | ExpectCycle = std::max(a: ExpectCycle, b: PredCycle + (int)Pred.getLatency()); |
| 442 | } |
| 443 | // Zero cost instructions do not need to check resource. |
| 444 | if (!TII->isZeroCost(Opcode: MI.getOpcode())) { |
| 445 | // ResourceManager can be used to detect resource conflicts between the |
| 446 | // current MI and the previously inserted MIs. |
| 447 | while (!RM.canReserveResources(SU&: *SU, Cycle: CurCycle) || CurCycle < ExpectCycle) { |
| 448 | ++CurCycle; |
| 449 | if (CurCycle == (int)WindowIILimit) |
| 450 | return CurCycle; |
| 451 | } |
| 452 | RM.reserveResources(SU&: *SU, Cycle: CurCycle); |
| 453 | } |
| 454 | OriToCycle[getOriMI(NewMI: &MI)] = CurCycle; |
| 455 | LLVM_DEBUG(dbgs() << "\tCycle " << CurCycle << " [S." |
| 456 | << getOriStage(getOriMI(&MI), Offset) << "]: " << MI); |
| 457 | } |
| 458 | LLVM_DEBUG(dbgs() << "MaxCycle is " << CurCycle << ".\n" ); |
| 459 | return CurCycle; |
| 460 | } |
| 461 | |
| 462 | // By utilizing TripleDAG, we can easily establish dependencies between A and B. |
| 463 | // Based on the MaxCycle and the issue cycle of A and B, we can determine |
| 464 | // whether it is necessary to add a stall cycle. This is because, without |
| 465 | // inserting the stall cycle, the latency constraint between A and B cannot be |
| 466 | // satisfied. The details are as follows: |
| 467 | // |
| 468 | // New MBB: |
| 469 | // ======================================== |
| 470 | // < Phis > |
| 471 | // ======================================== (sliding direction) |
| 472 | // MBB copy 1 | |
| 473 | // V |
| 474 | // |
| 475 | // ~~~~~~~~~~~~~~~~~~~|~~~~~~~~~~~~~~~~~~~~ ----schedule window----- |
| 476 | // | | |
| 477 | // ===================V==================== | |
| 478 | // MBB copy 2 < MI B > | |
| 479 | // | |
| 480 | // < MI A > V |
| 481 | // ~~~~~~~~~~~~~~~~~~~:~~~~~~~~~~~~~~~~~~~~ ------------------------ |
| 482 | // : |
| 483 | // ===================V==================== |
| 484 | // MBB copy 3 < MI B'> |
| 485 | // |
| 486 | // |
| 487 | // |
| 488 | // |
| 489 | // ======================================== |
| 490 | // < Terminators > |
| 491 | // ======================================== |
| 492 | int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) { |
| 493 | int MaxStallCycle = 0; |
| 494 | int CurrentII = MaxCycle + 1; |
| 495 | auto Range = getScheduleRange(Offset, Num: SchedInstrNum); |
| 496 | for (auto &MI : Range) { |
| 497 | auto *SU = TripleDAG->getSUnit(MI: &MI); |
| 498 | int DefCycle = getOriCycle(NewMI: &MI); |
| 499 | for (auto &Succ : SU->Succs) { |
| 500 | if (Succ.isWeak() || Succ.getSUnit() == &TripleDAG->ExitSU) |
| 501 | continue; |
| 502 | // If the expected cycle does not exceed CurrentII, no check is needed. |
| 503 | if (DefCycle + (int)Succ.getLatency() <= CurrentII) |
| 504 | continue; |
| 505 | // If the cycle of the scheduled MI A is less than that of the scheduled |
| 506 | // MI B, the scheduling will fail because the lifetime of the |
| 507 | // corresponding register exceeds II. |
| 508 | auto *SuccMI = Succ.getSUnit()->getInstr(); |
| 509 | int UseCycle = getOriCycle(NewMI: SuccMI); |
| 510 | if (DefCycle < UseCycle) |
| 511 | return WindowIILimit; |
| 512 | // Get the stall cycle introduced by the register between two trips. |
| 513 | int StallCycle = DefCycle + (int)Succ.getLatency() - CurrentII - UseCycle; |
| 514 | MaxStallCycle = std::max(a: MaxStallCycle, b: StallCycle); |
| 515 | } |
| 516 | } |
| 517 | LLVM_DEBUG(dbgs() << "MaxStallCycle is " << MaxStallCycle << ".\n" ); |
| 518 | return MaxStallCycle; |
| 519 | } |
| 520 | |
| 521 | unsigned WindowScheduler::analyseII(ScheduleDAGInstrs &DAG, unsigned Offset) { |
| 522 | LLVM_DEBUG(dbgs() << "Start analyzing II:\n" ); |
| 523 | int MaxCycle = calculateMaxCycle(DAG, Offset); |
| 524 | if (MaxCycle == (int)WindowIILimit) |
| 525 | return MaxCycle; |
| 526 | int StallCycle = calculateStallCycle(Offset, MaxCycle); |
| 527 | if (StallCycle == (int)WindowIILimit) |
| 528 | return StallCycle; |
| 529 | // The value of II is equal to the maximum execution cycle plus 1. |
| 530 | return MaxCycle + StallCycle + 1; |
| 531 | } |
| 532 | |
| 533 | void WindowScheduler::schedulePhi(int Offset, unsigned &II) { |
| 534 | LLVM_DEBUG(dbgs() << "Start scheduling Phis:\n" ); |
| 535 | for (auto &Phi : MBB->phis()) { |
| 536 | int LateCycle = INT_MAX; |
| 537 | auto *SU = TripleDAG->getSUnit(MI: &Phi); |
| 538 | for (auto &Succ : SU->Succs) { |
| 539 | // Phi doesn't have any Anti successors. |
| 540 | if (Succ.getKind() != SDep::Data) |
| 541 | continue; |
| 542 | // Phi is scheduled before the successor of stage 0. The issue cycle of |
| 543 | // phi is the latest cycle in this interval. |
| 544 | auto *SuccMI = Succ.getSUnit()->getInstr(); |
| 545 | int Cycle = getOriCycle(NewMI: SuccMI); |
| 546 | if (getOriStage(OriMI: getOriMI(NewMI: SuccMI), Offset) == 0) |
| 547 | LateCycle = std::min(a: LateCycle, b: Cycle); |
| 548 | } |
| 549 | // The anti-dependency of phi need to be handled separately in the same way. |
| 550 | if (Register AntiReg = getAntiRegister(Phi: &Phi)) { |
| 551 | auto *AntiMI = MRI->getVRegDef(Reg: AntiReg); |
| 552 | // AntiReg may be defined outside the kernel MBB. |
| 553 | if (AntiMI->getParent() == MBB) { |
| 554 | auto AntiCycle = getOriCycle(NewMI: AntiMI); |
| 555 | if (getOriStage(OriMI: getOriMI(NewMI: AntiMI), Offset) == 0) |
| 556 | LateCycle = std::min(a: LateCycle, b: AntiCycle); |
| 557 | } |
| 558 | } |
| 559 | // If there is no limit to the late cycle, a default value is given. |
| 560 | if (LateCycle == INT_MAX) |
| 561 | LateCycle = (int)(II - 1); |
| 562 | LLVM_DEBUG(dbgs() << "\tCycle range [0, " << LateCycle << "] " << Phi); |
| 563 | // The issue cycle of phi is set to the latest cycle in the interval. |
| 564 | auto *OriPhi = getOriMI(NewMI: &Phi); |
| 565 | OriToCycle[OriPhi] = LateCycle; |
| 566 | } |
| 567 | } |
| 568 | |
| 569 | DenseMap<MachineInstr *, int> WindowScheduler::getIssueOrder(unsigned Offset, |
| 570 | unsigned II) { |
| 571 | // At each issue cycle, phi is placed before MIs in stage 0. So the simplest |
| 572 | // way is to put phi at the beginning of the current cycle. |
| 573 | DenseMap<int, SmallVector<MachineInstr *>> CycleToMIs; |
| 574 | auto Range = getScheduleRange(Offset, Num: SchedInstrNum); |
| 575 | for (auto &Phi : MBB->phis()) |
| 576 | CycleToMIs[getOriCycle(NewMI: &Phi)].push_back(Elt: getOriMI(NewMI: &Phi)); |
| 577 | for (auto &MI : Range) |
| 578 | CycleToMIs[getOriCycle(NewMI: &MI)].push_back(Elt: getOriMI(NewMI: &MI)); |
| 579 | // Each MI is assigned a separate ordered Id, which is used as a sort marker |
| 580 | // in the following expand process. |
| 581 | DenseMap<MachineInstr *, int> IssueOrder; |
| 582 | int Id = 0; |
| 583 | for (int Cycle = 0; Cycle < (int)II; ++Cycle) { |
| 584 | auto It = CycleToMIs.find(Val: Cycle); |
| 585 | if (It == CycleToMIs.end()) |
| 586 | continue; |
| 587 | for (auto *MI : It->second) |
| 588 | IssueOrder[MI] = Id++; |
| 589 | } |
| 590 | return IssueOrder; |
| 591 | } |
| 592 | |
| 593 | void WindowScheduler::updateScheduleResult(unsigned Offset, unsigned II) { |
| 594 | // At the first update, Offset is equal to SchedPhiNum. At this time, only |
| 595 | // BestII, BestOffset, and BaseII need to be updated. |
| 596 | if (Offset == SchedPhiNum) { |
| 597 | BestII = II; |
| 598 | BestOffset = SchedPhiNum; |
| 599 | BaseII = II; |
| 600 | return; |
| 601 | } |
| 602 | // The update will only continue if the II is smaller than BestII and the II |
| 603 | // is sufficiently small. |
| 604 | if ((II >= BestII) || (II + WindowDiffLimit > BaseII)) |
| 605 | return; |
| 606 | BestII = II; |
| 607 | BestOffset = Offset; |
| 608 | // Record the result of the current list scheduling, noting that each MI is |
| 609 | // stored unordered in SchedResult. |
| 610 | SchedResult.clear(); |
| 611 | auto IssueOrder = getIssueOrder(Offset, II); |
| 612 | for (auto &Pair : OriToCycle) { |
| 613 | assert(IssueOrder.count(Pair.first) && "Cannot find original MI!" ); |
| 614 | SchedResult.push_back(Elt: std::make_tuple(args&: Pair.first, args&: Pair.second, |
| 615 | args: getOriStage(OriMI: Pair.first, Offset), |
| 616 | args&: IssueOrder[Pair.first])); |
| 617 | } |
| 618 | } |
| 619 | |
| 620 | void WindowScheduler::expand() { |
| 621 | // The MIs in the SchedResult are sorted by the issue order ID. |
| 622 | llvm::stable_sort(Range&: SchedResult, |
| 623 | C: [](const std::tuple<MachineInstr *, int, int, int> &A, |
| 624 | const std::tuple<MachineInstr *, int, int, int> &B) { |
| 625 | return std::get<3>(t: A) < std::get<3>(t: B); |
| 626 | }); |
| 627 | // Use the scheduling infrastructure for expansion, noting that InstrChanges |
| 628 | // is not supported here. |
| 629 | DenseMap<MachineInstr *, int> Cycles, Stages; |
| 630 | std::vector<MachineInstr *> OrderedInsts; |
| 631 | for (auto &Info : SchedResult) { |
| 632 | auto *MI = std::get<0>(t&: Info); |
| 633 | OrderedInsts.push_back(x: MI); |
| 634 | Cycles[MI] = std::get<1>(t&: Info); |
| 635 | Stages[MI] = std::get<2>(t&: Info); |
| 636 | LLVM_DEBUG(dbgs() << "\tCycle " << Cycles[MI] << " [S." << Stages[MI] |
| 637 | << "]: " << *MI); |
| 638 | } |
| 639 | ModuloSchedule MS(*MF, &Loop, std::move(OrderedInsts), std::move(Cycles), |
| 640 | std::move(Stages)); |
| 641 | ModuloScheduleExpander MSE(*MF, MS, *Context->LIS, |
| 642 | ModuloScheduleExpander::InstrChangesTy()); |
| 643 | MSE.expand(); |
| 644 | MSE.cleanup(); |
| 645 | } |
| 646 | |
| 647 | void WindowScheduler::updateLiveIntervals() { |
| 648 | SmallVector<Register, 128> UsedRegs; |
| 649 | for (MachineInstr &MI : *MBB) |
| 650 | for (const MachineOperand &MO : MI.operands()) { |
| 651 | if (!MO.isReg() || MO.getReg() == 0) |
| 652 | continue; |
| 653 | Register Reg = MO.getReg(); |
| 654 | if (!is_contained(Range&: UsedRegs, Element: Reg)) |
| 655 | UsedRegs.push_back(Elt: Reg); |
| 656 | } |
| 657 | Context->LIS->repairIntervalsInRange(MBB, Begin: MBB->begin(), End: MBB->end(), OrigRegs: UsedRegs); |
| 658 | } |
| 659 | |
| 660 | iterator_range<MachineBasicBlock::iterator> |
| 661 | WindowScheduler::getScheduleRange(unsigned Offset, unsigned Num) { |
| 662 | auto RegionBegin = MBB->begin(); |
| 663 | std::advance(i&: RegionBegin, n: Offset); |
| 664 | auto RegionEnd = RegionBegin; |
| 665 | std::advance(i&: RegionEnd, n: Num); |
| 666 | return make_range(x: RegionBegin, y: RegionEnd); |
| 667 | } |
| 668 | |
| 669 | int WindowScheduler::getOriCycle(MachineInstr *NewMI) { |
| 670 | assert(TriToOri.count(NewMI) && "Cannot find original MI!" ); |
| 671 | auto *OriMI = TriToOri[NewMI]; |
| 672 | assert(OriToCycle.count(OriMI) && "Cannot find schedule cycle!" ); |
| 673 | return OriToCycle[OriMI]; |
| 674 | } |
| 675 | |
| 676 | MachineInstr *WindowScheduler::getOriMI(MachineInstr *NewMI) { |
| 677 | assert(TriToOri.count(NewMI) && "Cannot find original MI!" ); |
| 678 | return TriToOri[NewMI]; |
| 679 | } |
| 680 | |
| 681 | unsigned WindowScheduler::getOriStage(MachineInstr *OriMI, unsigned Offset) { |
| 682 | assert(llvm::is_contained(OriMIs, OriMI) && "Cannot find OriMI in OriMIs!" ); |
| 683 | // If there is no instruction fold, all MI stages are 0. |
| 684 | if (Offset == SchedPhiNum) |
| 685 | return 0; |
| 686 | // For those MIs with an ID less than the Offset, their stages are set to 0, |
| 687 | // while the rest are set to 1. |
| 688 | unsigned Id = 0; |
| 689 | for (auto *MI : OriMIs) { |
| 690 | if (MI->isMetaInstruction()) |
| 691 | continue; |
| 692 | if (MI == OriMI) |
| 693 | break; |
| 694 | ++Id; |
| 695 | } |
| 696 | return Id >= (size_t)Offset ? 1 : 0; |
| 697 | } |
| 698 | |
| 699 | Register WindowScheduler::getAntiRegister(MachineInstr *Phi) { |
| 700 | assert(Phi->isPHI() && "Expecting PHI!" ); |
| 701 | Register AntiReg; |
| 702 | for (auto MO : Phi->uses()) { |
| 703 | if (MO.isReg()) |
| 704 | AntiReg = MO.getReg(); |
| 705 | else if (MO.isMBB() && MO.getMBB() == MBB) |
| 706 | return AntiReg; |
| 707 | } |
| 708 | return 0; |
| 709 | } |
| 710 | |