//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
//
// This SMS implementation is a target-independent back-end pass. When enabled,
// the pass runs just prior to the register allocation pass, while the machine
// IR is in SSA form. If software pipelining is successful, then the original
// loop is replaced by the optimized loop. The optimized loop contains one or
// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
// the instructions cannot be scheduled in a given MII, we increase the MII by
// one and try again.
//
// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
// represent loop carried dependences in the DAG as order edges to the Phi
// nodes. We also perform several passes over the DAG to eliminate unnecessary
// edges that inhibit the ability to pipeline. The implementation uses the
// DFAPacketizer class to compute the minimum initiation interval and to check
// whether an instruction may be inserted in the pipelined schedule.
//
// In order for the SMS pass to work, several target specific hooks need to be
// implemented to get information about the loop structure and to rewrite
// instructions.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/MachinePipeliner.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ModuloSchedule.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <climits>
#include <cstdint>
#include <deque>
#include <functional>
#include <iomanip>
#include <iterator>
#include <map>
#include <memory>
#include <sstream>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "pipeliner"

STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
STATISTIC(NumPipelined, "Number of loops software pipelined");
STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");

/// A command line option to turn software pipelining on or off.
static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
                               cl::desc("Enable Software Pipelining"));

/// A command line option to enable SWP at -Os.
static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
                                      cl::desc("Enable SWP at Os."), cl::Hidden,
                                      cl::init(false));

/// A command line argument to limit the minimum initiation interval for
/// pipelining.
static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
                              cl::desc("Size limit for the MII."),
                              cl::Hidden, cl::init(27));

/// A command line argument to force the pipeliner to use the specified
/// initiation interval.
static cl::opt<int> SwpForceII("pipeliner-force-ii",
                               cl::desc("Force pipeliner to use specified II."),
                               cl::Hidden, cl::init(-1));

/// A command line argument to limit the number of stages in the pipeline.
static cl::opt<int>
    SwpMaxStages("pipeliner-max-stages",
                 cl::desc("Maximum stages allowed in the generated schedule."),
                 cl::Hidden, cl::init(3));

/// A command line option to disable the pruning of chain dependences due to
/// an unrelated Phi.
static cl::opt<bool>
    SwpPruneDeps("pipeliner-prune-deps",
                 cl::desc("Prune dependences between unrelated Phi nodes."),
                 cl::Hidden, cl::init(true));

/// A command line option to disable the pruning of loop carried order
/// dependences.
static cl::opt<bool>
    SwpPruneLoopCarried("pipeliner-prune-loop-carried",
                        cl::desc("Prune loop carried order dependences."),
                        cl::Hidden, cl::init(true));

#ifndef NDEBUG
static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
#endif

static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
                                     cl::ReallyHidden,
                                     cl::desc("Ignore RecMII"));

static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
                                    cl::init(false));
static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
                                      cl::init(false));

static cl::opt<bool> EmitTestAnnotations(
    "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
    cl::desc("Instead of emitting the pipelined code, annotate instructions "
             "with the generated schedule for feeding into the "
             "-modulo-schedule-test pass"));

static cl::opt<bool> ExperimentalCodeGen(
    "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
    cl::desc(
        "Use the experimental peeling code generator for software pipelining"));

static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
                                     cl::desc("Range to search for II"),
                                     cl::Hidden, cl::init(10));

static cl::opt<bool>
    LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
                     cl::desc("Limit register pressure of scheduled loop"));

static cl::opt<int>
    RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
                      cl::init(5),
                      cl::desc("Margin representing the unused percentage of "
                               "the register pressure limit"));

static cl::opt<bool>
    MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
               cl::desc("Use the MVE code generator for software pipelining"));

namespace llvm {

// A command line option to enable the CopyToPhi DAG mutation.
cl::opt<bool> SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
                                 cl::init(true),
                                 cl::desc("Enable CopyToPhi DAG Mutation"));

/// A command line argument to force pipeliner to use specified issue
/// width.
cl::opt<int> SwpForceIssueWidth(
    "pipeliner-force-issue-width",
    cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
    cl::init(-1));

/// A command line argument to set the window scheduling option.
static cl::opt<WindowSchedulingFlag> WindowSchedulingOption(
    "window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On),
    cl::desc("Set how to use window scheduling algorithm."),
    cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off",
                          "Turn off window algorithm."),
               clEnumValN(WindowSchedulingFlag::WS_On, "on",
                          "Use window algorithm after SMS algorithm fails."),
               clEnumValN(WindowSchedulingFlag::WS_Force, "force",
                          "Use window algorithm instead of SMS algorithm.")));

} // end namespace llvm

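// Limits the number of paths explored per node when searching the dependence
// graph for circuits (recurrences).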
unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
char MachinePipeliner::ID = 0;
#ifndef NDEBUG
int MachinePipeliner::NumTries = 0;
#endif
char &llvm::MachinePipelinerID = MachinePipeliner::ID;

INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE,
                      "Modulo Software Pipelining", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
                    "Modulo Software Pipelining", false, false)

namespace {

/// This class holds an SUnit corresponding to a memory operation and other
/// information related to the instruction.
struct SUnitWithMemInfo {
  SUnit *SU;
  SmallVector<const Value *, 2> UnderlyingObjs;

  /// The value of a memory operand.
  const Value *MemOpValue = nullptr;

  /// The offset of a memory operand.
  int64_t MemOpOffset = 0;

  AAMDNodes AATags;

  /// True if all the underlying objects are identified.
  bool IsAllIdentified = false;

  SUnitWithMemInfo(SUnit *SU);

  bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const;

  bool isUnknown() const { return MemOpValue == nullptr; }

private:
  bool getUnderlyingObjects();
};

/// Add loop-carried chain dependencies. This class handles the same type of
/// dependencies added by `ScheduleDAGInstrs::buildSchedGraph`, but takes into
/// account dependencies across iterations.
class LoopCarriedOrderDepsTracker {
  // Type of instruction that is relevant to order-dependencies
  enum class InstrTag {
    Barrier = 0,      ///< A barrier event instruction.
    LoadOrStore = 1,  ///< An instruction that may load or store memory, but is
                      ///< not a barrier event.
    FPExceptions = 2, ///< An instruction that does not match above, but may
                      ///< raise floating-point exceptions.
  };

  struct TaggedSUnit : PointerIntPair<SUnit *, 2> {
    TaggedSUnit(SUnit *SU, InstrTag Tag)
        : PointerIntPair<SUnit *, 2>(SU, unsigned(Tag)) {}

    InstrTag getTag() const { return InstrTag(getInt()); }
  };

  /// Holds loads and stores with memory related information.
  struct LoadStoreChunk {
    SmallVector<SUnitWithMemInfo, 4> Loads;
    SmallVector<SUnitWithMemInfo, 4> Stores;

    void append(SUnit *SU);
  };

  SwingSchedulerDAG *DAG;
  BatchAAResults *BAA;
  std::vector<SUnit> &SUnits;

  /// The size of SUnits, for convenience.
  const unsigned N;

  /// Loop-carried Edges.
  std::vector<BitVector> LoopCarried;

  /// Instructions related to chain dependencies. They are one of the
  /// following:
  ///
  ///  1. A barrier event.
  ///  2. A load that is not a barrier event, not an invariant load, and may
  ///     not load a trap value.
  ///  3. A store that is not a barrier event.
  ///  4. None of the above, but may raise floating-point exceptions.
  ///
  /// This is used when analyzing loop-carried dependencies that access global
  /// barrier instructions.
  std::vector<TaggedSUnit> TaggedSUnits;

  const TargetInstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;

public:
  LoopCarriedOrderDepsTracker(SwingSchedulerDAG *SSD, BatchAAResults *BAA,
                              const TargetInstrInfo *TII,
                              const TargetRegisterInfo *TRI);

  /// The main function to compute loop-carried order-dependencies.
  void computeDependencies();

  const BitVector &getLoopCarried(unsigned Idx) const {
    return LoopCarried[Idx];
  }

private:
  /// Returns the tag for \p SU if the instruction may affect
  /// order-dependencies, or std::nullopt otherwise.
  std::optional<InstrTag> getInstrTag(SUnit *SU) const;

  void addLoopCarriedDepenenciesForChunks(const LoadStoreChunk &From,
                                          const LoadStoreChunk &To);

  void computeDependenciesAux();
};

} // end anonymous namespace

/// The "main" function for implementing Swing Modulo Scheduling.
bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
  if (skipFunction(mf.getFunction()))
    return false;

  if (!EnableSWP)
    return false;

  if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
      !EnableSWPOptSize.getPosition())
    return false;

  if (!mf.getSubtarget().enableMachinePipeliner())
    return false;

  // Cannot pipeline loops without instruction itineraries if we are using
  // DFA for the pipeliner.
  if (mf.getSubtarget().useDFAforSMS() &&
      (!mf.getSubtarget().getInstrItineraryData() ||
       mf.getSubtarget().getInstrItineraryData()->isEmpty()))
    return false;

  MF = &mf;
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
  TII = MF->getSubtarget().getInstrInfo();
  RegClassInfo.runOnMachineFunction(*MF);

  for (const auto &L : *MLI)
    scheduleLoop(*L);

  return false;
}

/// Attempt to perform the SMS algorithm on the specified loop. This function is
/// the main entry point for the algorithm. The function identifies candidate
/// loops, calculates the minimum initiation interval, and attempts to schedule
/// the loop.
bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
  bool Changed = false;
  for (const auto &InnerLoop : L)
    Changed |= scheduleLoop(*InnerLoop);

#ifndef NDEBUG
  // Stop trying after reaching the limit (if any).
  int Limit = SwpLoopLimit;
  if (Limit >= 0) {
    if (NumTries >= SwpLoopLimit)
      return Changed;
    NumTries++;
  }
#endif

  setPragmaPipelineOptions(L);
  if (!canPipelineLoop(L)) {
    LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
    ORE->emit([&]() {
      return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
                                             L.getStartLoc(), L.getHeader())
             << "Failed to pipeline loop";
    });

    LI.LoopPipelinerInfo.reset();
    return Changed;
  }

  ++NumTrytoPipeline;
  if (useSwingModuloScheduler())
    Changed = swingModuloScheduler(L);

  if (useWindowScheduler(Changed))
    Changed = runWindowScheduler(L);

  LI.LoopPipelinerInfo.reset();
  return Changed;
}

void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
  // Reset the pragma for the next loop in iteration.
  disabledByPragma = false;
  II_setByPragma = 0;

  MachineBasicBlock *LBLK = L.getTopBlock();

  if (LBLK == nullptr)
    return;

  const BasicBlock *BBLK = LBLK->getBasicBlock();
  if (BBLK == nullptr)
    return;

  const Instruction *TI = BBLK->getTerminator();
  if (TI == nullptr)
    return;

  MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
  if (LoopID == nullptr)
    return;

  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
  assert(LoopID->getOperand(0) == LoopID && "invalid loop");

  for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
    MDNode *MD = dyn_cast<MDNode>(MDO);

    if (MD == nullptr)
      continue;

    MDString *S = dyn_cast<MDString>(MD->getOperand(0));

    if (S == nullptr)
      continue;

    if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
      assert(MD->getNumOperands() == 2 &&
             "Pipeline initiation interval hint metadata should have two "
             "operands.");
      II_setByPragma =
          mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
      assert(II_setByPragma >= 1 &&
             "Pipeline initiation interval must be positive.");
    } else if (S->getString() == "llvm.loop.pipeline.disable") {
      disabledByPragma = true;
    }
  }
}

/// Return true if the loop can be software pipelined. The algorithm is
/// restricted to loops with a single basic block. Make sure that the
/// branch in the loop can be analyzed.
bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
  if (L.getNumBlocks() != 1) {
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "Not a single basic block: "
             << ore::NV("NumBlocks", L.getNumBlocks());
    });
    return false;
  }

  if (disabledByPragma) {
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "Disabled by Pragma.";
    });
    return false;
  }

  // Check that the branch can be analyzed; we cannot pipeline the loop
  // otherwise.
  LI.TBB = nullptr;
  LI.FBB = nullptr;
  LI.BrCond.clear();
  if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
    LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
    NumFailBranch++;
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "The branch can't be understood";
    });
    return false;
  }

  LI.LoopInductionVar = nullptr;
  LI.LoopCompare = nullptr;
  LI.LoopPipelinerInfo = TII->analyzeLoopForPipelining(L.getTopBlock());
  if (!LI.LoopPipelinerInfo) {
    LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
    NumFailLoop++;
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "The loop structure is not supported";
    });
    return false;
  }

  if (!L.getLoopPreheader()) {
    LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
    NumFailPreheader++;
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "No loop preheader found";
    });
    return false;
  }

  // Remove any subregisters from inputs to phi nodes.
  preprocessPhiNodes(*L.getHeader());
  return true;
}

void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
  MachineRegisterInfo &MRI = MF->getRegInfo();
  SlotIndexes &Slots =
      *getAnalysis<LiveIntervalsWrapperPass>().getLIS().getSlotIndexes();

  for (MachineInstr &PI : B.phis()) {
    MachineOperand &DefOp = PI.getOperand(0);
    assert(DefOp.getSubReg() == 0);
    auto *RC = MRI.getRegClass(DefOp.getReg());

    for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
      MachineOperand &RegOp = PI.getOperand(i);
      if (RegOp.getSubReg() == 0)
        continue;

      // If the operand uses a subregister, replace it with a new register
      // without subregisters, and generate a copy to the new register.
      Register NewReg = MRI.createVirtualRegister(RC);
      MachineBasicBlock &PredB = *PI.getOperand(i + 1).getMBB();
      MachineBasicBlock::iterator At = PredB.getFirstTerminator();
      const DebugLoc &DL = PredB.findDebugLoc(At);
      auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
                      .addReg(RegOp.getReg(), getRegState(RegOp),
                              RegOp.getSubReg());
      Slots.insertMachineInstrInMaps(*Copy);
      RegOp.setReg(NewReg);
      RegOp.setSubReg(0);
    }
  }
}

/// The SMS algorithm consists of the following main steps:
/// 1. Computation and analysis of the dependence graph.
/// 2. Ordering of the nodes (instructions).
/// 3. Attempt to schedule the loop.
bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
  assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");

  AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  SwingSchedulerDAG SMS(
      *this, L, getAnalysis<LiveIntervalsWrapperPass>().getLIS(), RegClassInfo,
      II_setByPragma, LI.LoopPipelinerInfo.get(), AA);

  MachineBasicBlock *MBB = L.getHeader();
  // The kernel should not include any terminator instructions. These
  // will be added back later.
  SMS.startBlock(MBB);

  // Compute the number of 'real' instructions in the basic block by
  // ignoring terminators.
  unsigned size = MBB->size();
  for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
                                   E = MBB->instr_end();
       I != E; ++I, --size)
    ;

  SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
  SMS.schedule();
  SMS.exitRegion();

  SMS.finishBlock();
  return SMS.hasNewSchedule();
}

void MachinePipeliner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AAResultsWrapperPass>();
  AU.addPreserved<AAResultsWrapperPass>();
  AU.addRequired<MachineLoopInfoWrapperPass>();
  AU.addRequired<MachineDominatorTreeWrapperPass>();
  AU.addRequired<LiveIntervalsWrapperPass>();
  AU.addRequired<MachineOptimizationRemarkEmitterPass>();
  AU.addRequired<TargetPassConfig>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

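/// Run the window scheduling algorithm on loop \p L. Window scheduling is an
/// alternative to SMS: depending on the -window-sched option it runs either
/// after SMS fails or instead of SMS (see useWindowScheduler).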
bool MachinePipeliner::runWindowScheduler(MachineLoop &L) {
  MachineSchedContext Context;
  Context.MF = MF;
  Context.MLI = MLI;
  Context.MDT = MDT;
  Context.TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
  Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  Context.LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
  Context.RegClassInfo->runOnMachineFunction(*MF);
  WindowScheduler WS(&Context, L);
  return WS.run();
}

bool MachinePipeliner::useSwingModuloScheduler() {
  // SwingModuloScheduler does not work when WindowScheduler is forced.
  return WindowSchedulingOption != WindowSchedulingFlag::WS_Force;
}

bool MachinePipeliner::useWindowScheduler(bool Changed) {
  // WindowScheduler does not work in the following cases:
  // 1. when it is off.
  // 2. when SwingModuloScheduler has already scheduled the loop successfully.
  // 3. when the initiation interval is set by pragma.
  if (II_setByPragma) {
    LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
                         "llvm.loop.pipeline.initiationinterval is set.\n");
    return false;
  }

  return WindowSchedulingOption == WindowSchedulingFlag::WS_Force ||
         (WindowSchedulingOption == WindowSchedulingFlag::WS_On && !Changed);
}

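/// Set the minimum initiation interval (MII), the lower bound on the number
/// of cycles between the starts of successive loop iterations. Unless the II
/// is forced by a flag or pragma, the MII is the maximum of the
/// resource-constrained ResMII and the recurrence-constrained RecMII.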
void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
  if (SwpForceII > 0)
    MII = SwpForceII;
  else if (II_setByPragma > 0)
    MII = II_setByPragma;
  else
    MII = std::max(ResMII, RecMII);
}

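/// Set the maximum initiation interval to try. When the II is not forced by a
/// flag or pragma, the scheduler searches for a feasible schedule with an II
/// in the range [MII, MII + SwpIISearchRange].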
void SwingSchedulerDAG::setMAX_II() {
  if (SwpForceII > 0)
    MAX_II = SwpForceII;
  else if (II_setByPragma > 0)
    MAX_II = II_setByPragma;
  else
    MAX_II = MII + SwpIISearchRange;
}

/// We override the schedule function in ScheduleDAGInstrs to implement the
/// scheduling part of the Swing Modulo Scheduling algorithm.
void SwingSchedulerDAG::schedule() {
  buildSchedGraph(AA);
  const LoopCarriedEdges LCE = addLoopCarriedDependences();
  updatePhiDependences();
  Topo.InitDAGTopologicalSorting();
  changeDependences();
  postProcessDAG();
  DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU);
  LLVM_DEBUG({
    dump();
    dbgs() << "===== Loop Carried Edges Begin =====\n";
    for (SUnit &SU : SUnits)
      LCE.dump(&SU, TRI, &MRI);
    dbgs() << "===== Loop Carried Edges End =====\n";
  });

  NodeSetType NodeSets;
  findCircuits(NodeSets);
  NodeSetType Circuits = NodeSets;
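  // The copy above preserves the recurrence node sets so that the final node
  // order can be validated against the original circuits
  // (see checkValidNodeOrder).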

  // Calculate the MII.
  unsigned ResMII = calculateResMII();
  unsigned RecMII = calculateRecMII(NodeSets);

  fuseRecs(NodeSets);

  // This flag is used for testing and can cause correctness problems.
  if (SwpIgnoreRecMII)
    RecMII = 0;

  setMII(ResMII, RecMII);
  setMAX_II();

  LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
                    << " (rec=" << RecMII << ", res=" << ResMII << ")\n");

  // Can't schedule a loop without a valid MII.
  if (MII == 0) {
    LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
    NumFailZeroMII++;
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Invalid Minimal Initiation Interval: 0";
    });
    return;
  }

  // Don't pipeline large loops.
  if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
    LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
                      << ", we don't pipeline large loops\n");
    NumFailLargeMaxMII++;
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Minimal Initiation Interval too large: "
             << ore::NV("MII", (int)MII) << " > "
             << ore::NV("SwpMaxMii", SwpMaxMii)
             << ". Refer to -pipeliner-max-mii.";
    });
    return;
  }

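  // Compute the per-node properties used to order the nodes (e.g. ASAP, ALAP,
  // mobility, depth, and height), then refine the node sets before ordering.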
  computeNodeFunctions(NodeSets);

  registerPressureFilter(NodeSets);

  colocateNodeSets(NodeSets);

  checkNodeSets(NodeSets);

  LLVM_DEBUG({
    for (auto &I : NodeSets) {
      dbgs() << "  Rec NodeSet ";
      I.dump();
    }
  });

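  // Sort the node sets in decreasing order of priority, so that the most
  // constrained recurrences are processed first.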
  llvm::stable_sort(NodeSets, std::greater<NodeSet>());

  groupRemainingNodes(NodeSets);

  removeDuplicateNodes(NodeSets);

  LLVM_DEBUG({
    for (auto &I : NodeSets) {
      dbgs() << "  NodeSet ";
      I.dump();
    }
  });

  computeNodeOrder(NodeSets);

  // Check for node order issues.
  checkValidNodeOrder(Circuits);

  SMSchedule Schedule(Pass.MF, this);
  Scheduled = schedulePipeline(Schedule);

  if (!Scheduled) {
    LLVM_DEBUG(dbgs() << "No schedule found, return\n");
    NumFailNoSchedule++;
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Unable to find schedule";
    });
    return;
  }

  unsigned numStages = Schedule.getMaxStageCount();
  // No need to generate a pipeline if there are no overlapped iterations.
  if (numStages == 0) {
    LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
    NumFailZeroStage++;
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "No need to pipeline - no overlapped iterations in schedule.";
    });
    return;
  }
  // Check that the maximum stage count is less than the user-defined limit.
  if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
    LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
                      << " : too many stages, abort\n");
    NumFailLargeMaxStage++;
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Too many stages in schedule: "
             << ore::NV("numStages", (int)numStages) << " > "
             << ore::NV("SwpMaxStages", SwpMaxStages)
             << ". Refer to -pipeliner-max-stages.";
    });
    return;
  }

  Pass.ORE->emit([&]() {
    return MachineOptimizationRemark(DEBUG_TYPE, "schedule", Loop.getStartLoc(),
                                     Loop.getHeader())
           << "Pipelined successfully!";
  });

  // Generate the schedule as a ModuloSchedule.
  DenseMap<MachineInstr *, int> Cycles, Stages;
  std::vector<MachineInstr *> OrderedInsts;
  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    for (SUnit *SU : Schedule.getInstructions(Cycle)) {
      OrderedInsts.push_back(SU->getInstr());
      Cycles[SU->getInstr()] = Cycle;
      Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
    }
  }
  DenseMap<MachineInstr *, std::pair<Register, int64_t>> NewInstrChanges;
  for (auto &KV : NewMIs) {
    Cycles[KV.first] = Cycles[KV.second];
    Stages[KV.first] = Stages[KV.second];
    NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
  }

  ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
                    std::move(Stages));
  if (EmitTestAnnotations) {
    assert(NewInstrChanges.empty() &&
           "Cannot serialize a schedule with InstrChanges!");
    ModuloScheduleTestAnnotater MSTI(MF, MS);
    MSTI.annotate();
    return;
  }
  // The experimental code generator can't work if there are InstrChanges.
  if (ExperimentalCodeGen && NewInstrChanges.empty()) {
    PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
    MSE.expand();
  } else if (MVECodeGen && NewInstrChanges.empty() &&
             LoopPipelinerInfo->isMVEExpanderSupported() &&
             ModuloScheduleExpanderMVE::canApply(Loop)) {
    ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
    MSE.expand();
  } else {
    ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
    MSE.expand();
    MSE.cleanup();
  }
  ++NumPipelined;
}

/// Clean up after the software pipeliner runs.
void SwingSchedulerDAG::finishBlock() {
  for (auto &KV : NewMIs)
    MF.deleteMachineInstr(KV.second);
  NewMIs.clear();

  // Call the superclass.
  ScheduleDAGInstrs::finishBlock();
}

/// Return the register values for the operands of a Phi instruction.
/// This function assumes the instruction is a Phi.
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
                       Register &InitVal, Register &LoopVal) {
  assert(Phi.isPHI() && "Expecting a Phi.");

  InitVal = Register();
  LoopVal = Register();
  for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
    if (Phi.getOperand(i + 1).getMBB() != Loop)
      InitVal = Phi.getOperand(i).getReg();
    else
      LoopVal = Phi.getOperand(i).getReg();

  assert(InitVal && LoopVal && "Unexpected Phi structure.");
}

/// Return the Phi register value that comes from the loop block.
static Register getLoopPhiReg(const MachineInstr &Phi,
                              const MachineBasicBlock *LoopBB) {
  for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
    if (Phi.getOperand(i + 1).getMBB() == LoopBB)
      return Phi.getOperand(i).getReg();
  return Register();
}

/// Return true if SUb can be reached from SUa following the chain edges.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
  SmallPtrSet<SUnit *, 8> Visited;
  SmallVector<SUnit *, 8> Worklist;
  Worklist.push_back(SUa);
  while (!Worklist.empty()) {
    const SUnit *SU = Worklist.pop_back_val();
    for (const auto &SI : SU->Succs) {
      SUnit *SuccSU = SI.getSUnit();
      if (SI.getKind() == SDep::Order) {
        if (Visited.count(SuccSU))
          continue;
        if (SuccSU == SUb)
          return true;
        Worklist.push_back(SuccSU);
        Visited.insert(SuccSU);
      }
    }
  }
  return false;
}

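/// Collect the underlying objects of the memory reference of \p SU and record
/// whether all of them are identified objects.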
SUnitWithMemInfo::SUnitWithMemInfo(SUnit *SU) : SU(SU) {
  if (!getUnderlyingObjects())
    return;
  // Assume all objects are identified until one is found that is not;
  // otherwise IsAllIdentified would remain false unconditionally.
  IsAllIdentified = true;
  for (const Value *Obj : UnderlyingObjs)
    if (!isIdentifiedObject(Obj)) {
      IsAllIdentified = false;
      break;
    }
}

bool SUnitWithMemInfo::isTriviallyDisjoint(
    const SUnitWithMemInfo &Other) const {
  // If all underlying objects are identified objects and there is no overlap
  // between them, then these two instructions are disjoint.
  if (!IsAllIdentified || !Other.IsAllIdentified)
    return false;
  for (const Value *Obj : UnderlyingObjs)
    if (llvm::is_contained(Other.UnderlyingObjs, Obj))
      return false;
  return true;
}

/// Collect the underlying objects for the memory references of an instruction.
/// This function calls the code in ValueTracking, but first checks that the
/// instruction has a memory operand.
/// Returns false if we cannot find the underlying objects.
bool SUnitWithMemInfo::getUnderlyingObjects() {
  const MachineInstr *MI = SU->getInstr();
  if (!MI->hasOneMemOperand())
    return false;
  MachineMemOperand *MM = *MI->memoperands_begin();
  if (!MM->getValue())
    return false;
  MemOpValue = MM->getValue();
  MemOpOffset = MM->getOffset();
  llvm::getUnderlyingObjects(MemOpValue, UnderlyingObjs);

  // TODO: A no alias scope may be valid only in a single iteration. In this
  // case we need to peel it off like LoopAccessAnalysis does.
  AATags = MM->getAAInfo();
  return true;
}

/// Returns true if there is a loop-carried order dependency from \p Src to \p
/// Dst.
static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src,
                                 const SUnitWithMemInfo &Dst,
                                 BatchAAResults &BAA,
                                 const TargetInstrInfo *TII,
                                 const TargetRegisterInfo *TRI) {
  if (Src.isTriviallyDisjoint(Dst))
    return false;
  if (isSuccOrder(Src.SU, Dst.SU))
    return false;

  MachineInstr &SrcMI = *Src.SU->getInstr();
  MachineInstr &DstMI = *Dst.SU->getInstr();
  // First, perform the cheaper check that compares the base register.
  // If they are the same and the load offset is less than the store
  // offset, then potentially mark the dependence as loop carried.
  const MachineOperand *BaseOp1, *BaseOp2;
  int64_t Offset1, Offset2;
  bool Offset1IsScalable, Offset2IsScalable;
  if (TII->getMemOperandWithOffset(SrcMI, BaseOp1, Offset1, Offset1IsScalable,
                                   TRI) &&
      TII->getMemOperandWithOffset(DstMI, BaseOp2, Offset2, Offset2IsScalable,
                                   TRI)) {
    if (BaseOp1->isIdenticalTo(*BaseOp2) &&
        Offset1IsScalable == Offset2IsScalable && (int)Offset1 < (int)Offset2) {
      assert(TII->areMemAccessesTriviallyDisjoint(SrcMI, DstMI) &&
             "What happened to the chain edge?");
      return true;
    }
  }

  // Second, the more expensive check that uses alias analysis on the
  // base registers. If they alias, and the load offset is less than
  // the store offset, then mark the dependence as loop carried.
  if (Src.isUnknown() || Dst.isUnknown())
    return true;
  if (Src.MemOpValue == Dst.MemOpValue && Src.MemOpOffset <= Dst.MemOpOffset)
    return true;

  if (BAA.isNoAlias(
          MemoryLocation::getBeforeOrAfter(Src.MemOpValue, Src.AATags),
          MemoryLocation::getBeforeOrAfter(Dst.MemOpValue, Dst.AATags)))
    return false;

  // AliasAnalysis sometimes gives up on following the underlying
  // object. In such a case, separate checks for underlying objects may
  // prove that there are no aliases between two accesses.
  for (const Value *SrcObj : Src.UnderlyingObjs)
    for (const Value *DstObj : Dst.UnderlyingObjs)
      if (!BAA.isNoAlias(MemoryLocation::getBeforeOrAfter(SrcObj, Src.AATags),
                         MemoryLocation::getBeforeOrAfter(DstObj, Dst.AATags)))
        return true;

  return false;
}

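/// Append \p SU to the chunk if its instruction may load or store memory.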
void LoopCarriedOrderDepsTracker::LoadStoreChunk::append(SUnit *SU) {
  const MachineInstr *MI = SU->getInstr();
  if (!MI->mayLoadOrStore())
    return;
  (MI->mayStore() ? Stores : Loads).emplace_back(SU);
}

LoopCarriedOrderDepsTracker::LoopCarriedOrderDepsTracker(
    SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII,
    const TargetRegisterInfo *TRI)
    : DAG(SSD), BAA(BAA), SUnits(DAG->SUnits), N(SUnits.size()),
      LoopCarried(N, BitVector(N)), TII(TII), TRI(TRI) {}

void LoopCarriedOrderDepsTracker::computeDependencies() {
  // Traverse all instructions and extract only what we are targeting.
  for (auto &SU : SUnits) {
    auto Tagged = getInstrTag(&SU);

    // This instruction has no loop-carried order-dependencies.
    if (!Tagged)
      continue;
    TaggedSUnits.emplace_back(&SU, *Tagged);
  }

  computeDependenciesAux();
}

std::optional<LoopCarriedOrderDepsTracker::InstrTag>
LoopCarriedOrderDepsTracker::getInstrTag(SUnit *SU) const {
  MachineInstr *MI = SU->getInstr();
  if (TII->isGlobalMemoryObject(MI))
    return InstrTag::Barrier;

  if (MI->mayStore() ||
      (MI->mayLoad() && !MI->isDereferenceableInvariantLoad()))
    return InstrTag::LoadOrStore;

  if (MI->mayRaiseFPException())
    return InstrTag::FPExceptions;

  return std::nullopt;
}

void LoopCarriedOrderDepsTracker::addLoopCarriedDepenenciesForChunks(
    const LoadStoreChunk &From, const LoadStoreChunk &To) {
  // Add dependencies for load-to-store (WAR) from top to bottom.
  for (const SUnitWithMemInfo &Src : From.Loads)
    for (const SUnitWithMemInfo &Dst : To.Stores)
      if (Src.SU->NodeNum < Dst.SU->NodeNum &&
          hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI))
        LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum);

  // TODO: The following dependencies are missed.
  //
  // - Dependencies for load-to-store from bottom to top.
  // - Dependencies for store-to-load (RAW).
  // - Dependencies for store-to-store (WAW).
}

void LoopCarriedOrderDepsTracker::computeDependenciesAux() {
  SmallVector<LoadStoreChunk, 2> Chunks(1);
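  // Partition the tagged instructions into chunks delimited by barrier events;
  // loop-carried order dependencies are then added only within each chunk.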
  for (const auto &TSU : TaggedSUnits) {
    InstrTag Tag = TSU.getTag();
    SUnit *SU = TSU.getPointer();
    switch (Tag) {
    case InstrTag::Barrier:
      Chunks.emplace_back();
      break;
    case InstrTag::LoadOrStore:
      Chunks.back().append(SU);
      break;
    case InstrTag::FPExceptions:
      // TODO: Handle this properly.
      break;
    }
  }

  // Add dependencies between memory operations. If there are one or more
  // barrier events between two memory instructions, we don't add a
  // loop-carried dependence for them.
  for (const LoadStoreChunk &Chunk : Chunks)
    addLoopCarriedDepenenciesForChunks(Chunk, Chunk);

  // TODO: If there are multiple barrier instructions, add dependencies from
  // the last barrier instruction (or load/store below it) to the first barrier
  // instruction (or load/store above it).
}

/// Add a chain edge between a load and store if the store can be an
/// alias of the load on a subsequent iteration, i.e., a loop carried
/// dependence. This code is very similar to the code in ScheduleDAGInstrs
/// but that code doesn't create loop carried dependences.
/// TODO: Also compute output-dependencies.
LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences() {
  LoopCarriedEdges LCE;

  // Add loop-carried order-dependencies.
  LoopCarriedOrderDepsTracker LCODTracker(this, &BAA, TII, TRI);
  LCODTracker.computeDependencies();
  for (unsigned I = 0; I != SUnits.size(); I++)
    for (const int Succ : LCODTracker.getLoopCarried(I).set_bits())
      LCE.OrderDeps[&SUnits[I]].insert(&SUnits[Succ]);

  LCE.modifySUnits(SUnits);
  return LCE;
}

/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
/// processes dependences for PHIs. This function adds true dependences
/// from a PHI to a use, and a loop carried dependence from the use to the
/// PHI. The loop carried dependence is represented as an anti dependence
/// edge. This function also removes chain dependences between unrelated
/// PHIs.
void SwingSchedulerDAG::updatePhiDependences() {
  SmallVector<SDep, 4> RemoveDeps;
  const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();

  // Iterate over each DAG node.
  for (SUnit &I : SUnits) {
    RemoveDeps.clear();
    // If this instruction is a Phi, record the register it defines that is
    // used by another Phi, and the register it uses that is defined by
    // another Phi.
    Register HasPhiUse;
    Register HasPhiDef;
    MachineInstr *MI = I.getInstr();
    // Iterate over each operand, and we process the definitions.
    for (const MachineOperand &MO : MI->operands()) {
      if (!MO.isReg())
        continue;
      Register Reg = MO.getReg();
      if (MO.isDef()) {
        // If the register is used by a Phi, then create an anti dependence.
        for (MachineRegisterInfo::use_instr_iterator
                 UI = MRI.use_instr_begin(Reg),
                 UE = MRI.use_instr_end();
             UI != UE; ++UI) {
          MachineInstr *UseMI = &*UI;
          SUnit *SU = getSUnit(UseMI);
          if (SU != nullptr && UseMI->isPHI()) {
            if (!MI->isPHI()) {
              SDep Dep(SU, SDep::Anti, Reg);
              Dep.setLatency(1);
              I.addPred(Dep);
            } else {
              HasPhiDef = Reg;
              // Add a chain edge to a dependent Phi that isn't an existing
              // predecessor.
              if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
                I.addPred(SDep(SU, SDep::Barrier));
            }
          }
        }
      } else if (MO.isUse()) {
        // If the register is defined by a Phi, then create a true dependence.
        MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
        if (DefMI == nullptr)
          continue;
        SUnit *SU = getSUnit(DefMI);
        if (SU != nullptr && DefMI->isPHI()) {
          if (!MI->isPHI()) {
            SDep Dep(SU, SDep::Data, Reg);
            Dep.setLatency(0);
            ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
                                     &SchedModel);
            I.addPred(Dep);
          } else {
            HasPhiUse = Reg;
            // Add a chain edge to a dependent Phi that isn't an existing
            // predecessor.
            if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
              I.addPred(SDep(SU, SDep::Barrier));
          }
        }
      }
    }
    // Remove order dependences from an unrelated Phi.
    if (!SwpPruneDeps)
      continue;
    for (auto &PI : I.Preds) {
      MachineInstr *PMI = PI.getSUnit()->getInstr();
      if (PMI->isPHI() && PI.getKind() == SDep::Order) {
        if (I.getInstr()->isPHI()) {
          if (PMI->getOperand(0).getReg() == HasPhiUse)
            continue;
          if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
            continue;
        }
        RemoveDeps.push_back(PI);
      }
    }
    for (const SDep &D : RemoveDeps)
      I.removePred(D);
  }
}

/// Iterate over each DAG node and see if we can change any dependences
/// in order to reduce the recurrence MII.
void SwingSchedulerDAG::changeDependences() {
  // See if an instruction can use a value from the previous iteration.
  // If so, we update the base and offset of the instruction and change
  // the dependences.
  for (SUnit &I : SUnits) {
    unsigned BasePos = 0, OffsetPos = 0;
    Register NewBase;
    int64_t NewOffset = 0;
    if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
                               NewOffset))
      continue;

    // Get the MI and SUnit for the instruction that defines the original base.
    Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
    MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
    if (!DefMI)
      continue;
    SUnit *DefSU = getSUnit(DefMI);
    if (!DefSU)
      continue;
    // Get the MI and SUnit for the instruction that defines the new base.
    MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
    if (!LastMI)
      continue;
    SUnit *LastSU = getSUnit(LastMI);
    if (!LastSU)
      continue;

    if (Topo.IsReachable(&I, LastSU))
      continue;

    // Remove the dependence. The value now depends on a prior iteration.
    SmallVector<SDep, 4> Deps;
    for (const SDep &P : I.Preds)
      if (P.getSUnit() == DefSU)
        Deps.push_back(P);
    for (const SDep &D : Deps) {
      Topo.RemovePred(&I, D.getSUnit());
      I.removePred(D);
    }
    // Remove the chain dependence between the instructions.
    Deps.clear();
    for (auto &P : LastSU->Preds)
      if (P.getSUnit() == &I && P.getKind() == SDep::Order)
        Deps.push_back(P);
    for (const SDep &D : Deps) {
      Topo.RemovePred(LastSU, D.getSUnit());
      LastSU->removePred(D);
    }

    // Add a dependence between the new instruction and the instruction
    // that defines the new base.
    SDep Dep(&I, SDep::Anti, NewBase);
    Topo.AddPred(LastSU, &I);
    LastSU->addPred(Dep);

    // Remember the base and offset information so that we can update the
    // instruction during code generation.
    InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
  }
}

/// Create an instruction stream that represents a single iteration and the
/// stage of each instruction. This function differs from
/// SMSchedule::finalizeSchedule in that it has no side effects on the
/// SwingSchedulerDAG. That is, this function is an approximation of
/// SMSchedule::finalizeSchedule with all non-const operations removed.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD,
                                  SMSchedule &Schedule,
                                  std::vector<MachineInstr *> &OrderedInsts,
                                  DenseMap<MachineInstr *, unsigned> &Stages) {
  DenseMap<int, std::deque<SUnit *>> Instrs;

  // Move all instructions to the first stage from the later stages.
  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
         Stage <= LastStage; ++Stage) {
      for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
               Cycle + Stage * Schedule.getInitiationInterval()))) {
        Instrs[Cycle].push_front(SU);
      }
    }
  }

  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
    CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
    for (SUnit *SU : CycleInstrs) {
      MachineInstr *MI = SU->getInstr();
      OrderedInsts.push_back(MI);
      Stages[MI] = Schedule.stageScheduled(SU);
    }
  }
}
| 1309 | namespace { |
| 1310 | |
| 1311 | // FuncUnitSorter - Comparison operator used to sort instructions by |
| 1312 | // the number of functional unit choices. |
| 1313 | struct FuncUnitSorter { |
| 1314 | const InstrItineraryData *InstrItins; |
| 1315 | const MCSubtargetInfo *STI; |
| 1316 | DenseMap<InstrStage::FuncUnits, unsigned> Resources; |
| 1317 | |
| 1318 | FuncUnitSorter(const TargetSubtargetInfo &TSI) |
| 1319 | : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {} |
| 1320 | |
| 1321 | // Compute the number of functional unit alternatives needed |
| 1322 | // at each stage, and take the minimum value. We prioritize the |
| 1323 | // instructions by the least number of choices first. |
| 1324 | unsigned minFuncUnits(const MachineInstr *Inst, |
| 1325 | InstrStage::FuncUnits &F) const { |
| 1326 | unsigned SchedClass = Inst->getDesc().getSchedClass(); |
| 1327 | unsigned min = UINT_MAX; |
| 1328 | if (InstrItins && !InstrItins->isEmpty()) { |
| 1329 | for (const InstrStage &IS : |
| 1330 | make_range(x: InstrItins->beginStage(ItinClassIndx: SchedClass), |
| 1331 | y: InstrItins->endStage(ItinClassIndx: SchedClass))) { |
| 1332 | InstrStage::FuncUnits funcUnits = IS.getUnits(); |
| 1333 | unsigned numAlternatives = llvm::popcount(Value: funcUnits); |
| 1334 | if (numAlternatives < min) { |
| 1335 | min = numAlternatives; |
| 1336 | F = funcUnits; |
| 1337 | } |
| 1338 | } |
| 1339 | return min; |
| 1340 | } |
| 1341 | if (STI && STI->getSchedModel().hasInstrSchedModel()) { |
| 1342 | const MCSchedClassDesc *SCDesc = |
| 1343 | STI->getSchedModel().getSchedClassDesc(SchedClassIdx: SchedClass); |
| 1344 | if (!SCDesc->isValid()) |
| 1345 | // No valid Schedule Class Desc for schedClass, should be |
| 1346 | // Pseudo/PostRAPseudo |
| 1347 | return min; |
| 1348 | |
| 1349 | for (const MCWriteProcResEntry &PRE : |
| 1350 | make_range(x: STI->getWriteProcResBegin(SC: SCDesc), |
| 1351 | y: STI->getWriteProcResEnd(SC: SCDesc))) { |
| 1352 | if (!PRE.ReleaseAtCycle) |
| 1353 | continue; |
| 1354 | const MCProcResourceDesc *ProcResource = |
| 1355 | STI->getSchedModel().getProcResource(ProcResourceIdx: PRE.ProcResourceIdx); |
| 1356 | unsigned NumUnits = ProcResource->NumUnits; |
| 1357 | if (NumUnits < min) { |
| 1358 | min = NumUnits; |
| 1359 | F = PRE.ProcResourceIdx; |
| 1360 | } |
| 1361 | } |
| 1362 | return min; |
| 1363 | } |
| 1364 | llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!"); |
| 1365 | } |
| 1366 | |
| 1367 | // Compute the critical resources needed by the instruction. This |
| 1368 | // function records the functional units needed by instructions that |
| 1369 | // must use only one functional unit. We use this as a tie breaker |
| 1370 | // for computing the resource MII. The instructions that require |
| 1371 | // the same, highly used, functional unit have high priority. |
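| | // Continuing the hypothetical itinerary above: only the single-unit {FU2} |
| | // stage contributes, so Resources[mask(FU2)] is incremented; stages with |
| | // multiple alternatives are not counted as critical. |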
| 1372 | void calcCriticalResources(MachineInstr &MI) { |
| 1373 | unsigned SchedClass = MI.getDesc().getSchedClass(); |
| 1374 | if (InstrItins && !InstrItins->isEmpty()) { |
| 1375 | for (const InstrStage &IS : |
| 1376 | make_range(x: InstrItins->beginStage(ItinClassIndx: SchedClass), |
| 1377 | y: InstrItins->endStage(ItinClassIndx: SchedClass))) { |
| 1378 | InstrStage::FuncUnits FuncUnits = IS.getUnits(); |
| 1379 | if (llvm::popcount(Value: FuncUnits) == 1) |
| 1380 | Resources[FuncUnits]++; |
| 1381 | } |
| 1382 | return; |
| 1383 | } |
| 1384 | if (STI && STI->getSchedModel().hasInstrSchedModel()) { |
| 1385 | const MCSchedClassDesc *SCDesc = |
| 1386 | STI->getSchedModel().getSchedClassDesc(SchedClassIdx: SchedClass); |
| 1387 | if (!SCDesc->isValid()) |
| 1388 | // No valid Schedule Class Desc for schedClass, should be |
| 1389 | // Pseudo/PostRAPseudo |
| 1390 | return; |
| 1391 | |
| 1392 | for (const MCWriteProcResEntry &PRE : |
| 1393 | make_range(x: STI->getWriteProcResBegin(SC: SCDesc), |
| 1394 | y: STI->getWriteProcResEnd(SC: SCDesc))) { |
| 1395 | if (!PRE.ReleaseAtCycle) |
| 1396 | continue; |
| 1397 | Resources[PRE.ProcResourceIdx]++; |
| 1398 | } |
| 1399 | return; |
| 1400 | } |
| 1401 | llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!"); |
| 1402 | } |
| 1403 | |
| 1404 | /// Return true if IS1 has less priority than IS2. |
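| | /// Intended for use as a max-style priority-queue comparator: an instruction |
| | /// with fewer functional-unit alternatives compares as higher priority, and |
| | /// on ties the instruction whose critical unit is more heavily used wins. |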
| 1405 | bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const { |
| 1406 | InstrStage::FuncUnits F1 = 0, F2 = 0; |
| 1407 | unsigned MFUs1 = minFuncUnits(Inst: IS1, F&: F1); |
| 1408 | unsigned MFUs2 = minFuncUnits(Inst: IS2, F&: F2); |
| 1409 | if (MFUs1 == MFUs2) |
| 1410 | return Resources.lookup(Val: F1) < Resources.lookup(Val: F2); |
| 1411 | return MFUs1 > MFUs2; |
| 1412 | } |
| 1413 | }; |
| 1414 | |
| 1415 | /// Calculate the maximum register pressure of the scheduled instructions stream |
| 1416 | class HighRegisterPressureDetector { |
| 1417 | MachineBasicBlock *OrigMBB; |
| 1418 | const MachineRegisterInfo &MRI; |
| 1419 | const TargetRegisterInfo *TRI; |
| 1420 | |
| 1421 | const unsigned PSetNum; |
| 1422 | |
| 1423 | // Indexed by PSet ID |
| 1424 | // InitSetPressure takes into account the register pressure of live-in |
| 1425 | // registers. It does not depend on how the loop is scheduled, so it is |
| 1426 | // enough to calculate it once at the beginning. |
| 1427 | std::vector<unsigned> InitSetPressure; |
| 1428 | |
| 1429 | // Indexed by PSet ID |
| 1430 | // Upper limit for each register pressure set |
| 1431 | std::vector<unsigned> PressureSetLimit; |
| 1432 | |
| 1433 | DenseMap<MachineInstr *, RegisterOperands> ROMap; |
| 1434 | |
| 1435 | using Instr2LastUsesTy = DenseMap<MachineInstr *, SmallDenseSet<Register, 4>>; |
| 1436 | |
| 1437 | public: |
| 1438 | using OrderedInstsTy = std::vector<MachineInstr *>; |
| 1439 | using Instr2StageTy = DenseMap<MachineInstr *, unsigned>; |
| 1440 | |
| 1441 | private: |
| 1442 | static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) { |
| 1443 | if (Pressures.size() == 0) { |
| 1444 | dbgs() << "[]"; |
| 1445 | } else { |
| 1446 | char Prefix = '['; |
| 1447 | for (unsigned P : Pressures) { |
| 1448 | dbgs() << Prefix << P; |
| 1449 | Prefix = ' '; |
| 1450 | } |
| 1451 | dbgs() << ']'; |
| 1452 | } |
| 1453 | } |
| 1454 | |
| 1455 | void dumpPSet(Register Reg) const { |
| 1456 | dbgs() << "Reg=" << printReg(Reg, TRI, SubIdx: 0, MRI: &MRI) << " PSet="; |
| 1457 | for (auto PSetIter = MRI.getPressureSets(RegUnit: Reg); PSetIter.isValid(); |
| 1458 | ++PSetIter) { |
| 1459 | dbgs() << *PSetIter << ' '; |
| 1460 | } |
| 1461 | dbgs() << '\n'; |
| 1462 | } |
| 1463 | |
| 1464 | void increaseRegisterPressure(std::vector<unsigned> &Pressure, |
| 1465 | Register Reg) const { |
| 1466 | auto PSetIter = MRI.getPressureSets(RegUnit: Reg); |
| 1467 | unsigned Weight = PSetIter.getWeight(); |
| 1468 | for (; PSetIter.isValid(); ++PSetIter) |
| 1469 | Pressure[*PSetIter] += Weight; |
| 1470 | } |
| 1471 | |
| 1472 | void decreaseRegisterPressure(std::vector<unsigned> &Pressure, |
| 1473 | Register Reg) const { |
| 1474 | auto PSetIter = MRI.getPressureSets(RegUnit: Reg); |
| 1475 | unsigned Weight = PSetIter.getWeight(); |
| 1476 | for (; PSetIter.isValid(); ++PSetIter) { |
| 1477 | auto &P = Pressure[*PSetIter]; |
| 1478 | assert(P >= Weight && |
| 1479 | "register pressure must be greater than or equal weight" ); |
| 1480 | P -= Weight; |
| 1481 | } |
| 1482 | } |
| 1483 | |
| 1484 | // Return true if Reg is a reserved register, for example, the stack pointer |
| 1485 | bool isReservedRegister(Register Reg) const { |
| 1486 | return Reg.isPhysical() && MRI.isReserved(PhysReg: Reg.asMCReg()); |
| 1487 | } |
| 1488 | |
| 1489 | bool isDefinedInThisLoop(Register Reg) const { |
| 1490 | return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB; |
| 1491 | } |
| 1492 | |
| 1493 | // Search for live-in variables. They are factored into the register pressure |
| 1494 | // from the beginning. Live-in variables used by every iteration should be |
| 1495 | // considered live throughout the loop. For example, the variable `c` in the |
| 1496 | // following code. \code |
| 1497 | // int c = ...; |
| 1498 | // for (int i = 0; i < n; i++) |
| 1499 | // a[i] += b[i] + c; |
| 1500 | // \endcode |
| 1501 | void computeLiveIn() { |
| 1502 | DenseSet<Register> Used; |
| 1503 | for (auto &MI : *OrigMBB) { |
| 1504 | if (MI.isDebugInstr()) |
| 1505 | continue; |
| 1506 | for (auto &Use : ROMap[&MI].Uses) { |
| 1507 | auto Reg = Use.RegUnit; |
| 1508 | // Ignore a variable that appears on only one side of a phi instruction |
| 1509 | // because it is used only in the first iteration. |
| 1510 | if (MI.isPHI() && Reg != getLoopPhiReg(Phi: MI, LoopBB: OrigMBB)) |
| 1511 | continue; |
| 1512 | if (isReservedRegister(Reg)) |
| 1513 | continue; |
| 1514 | if (isDefinedInThisLoop(Reg)) |
| 1515 | continue; |
| 1516 | Used.insert(V: Reg); |
| 1517 | } |
| 1518 | } |
| 1519 | |
| 1520 | for (auto LiveIn : Used) |
| 1521 | increaseRegisterPressure(Pressure&: InitSetPressure, Reg: LiveIn); |
| 1522 | } |
| 1523 | |
| 1524 | // Calculate the upper limit of each pressure set |
| 1525 | void computePressureSetLimit(const RegisterClassInfo &RCI) { |
| 1526 | for (unsigned PSet = 0; PSet < PSetNum; PSet++) |
| 1527 | PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(Idx: PSet); |
| 1528 | } |
| 1529 | |
| 1530 | // There are two patterns of last-use. |
| 1531 | // - by an instruction of the current iteration |
| 1532 | // - by a phi instruction of the next iteration (loop carried value) |
| 1533 | // |
| 1534 | // Furthermore, the following two groups of instructions are executed |
| 1535 | // simultaneously: |
| 1536 | // - next iteration's phi instructions in i-th stage |
| 1537 | // - current iteration's instructions in i+1-th stage |
| 1538 | // |
| 1539 | // This function calculates the last-use of each register while taking into |
| 1540 | // account the above two patterns. |
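| | // A minimal sketch of the second pattern (the register names are |
| | // hypothetical): |
| | // \code |
| | //   %a = PHI %init, %b   ; next iteration's PHI is the last use of %b |
| | //   %b = ADD %a, 1       ; %b is defined in the current iteration |
| | // \endcode |
| | // The PHI of iteration i+1 executes together with stage i+1 of iteration i, |
| | // which is why InstrScore below adds one extra stage for PHIs. |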
| 1541 | Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts, |
| 1542 | Instr2StageTy &Stages) const { |
| 1543 | // We only track virtual registers that are defined and used in this loop. |
| 1544 | // The following virtual registers are ignored: |
| 1545 | // - live-in registers |
| 1546 | // - registers defined but not used in the loop (potentially live-out) |
| 1547 | DenseSet<Register> TargetRegs; |
| 1548 | const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) { |
| 1549 | if (isDefinedInThisLoop(Reg)) |
| 1550 | TargetRegs.insert(V: Reg); |
| 1551 | }; |
| 1552 | for (MachineInstr *MI : OrderedInsts) { |
| 1553 | if (MI->isPHI()) { |
| 1554 | Register Reg = getLoopPhiReg(Phi: *MI, LoopBB: OrigMBB); |
| 1555 | UpdateTargetRegs(Reg); |
| 1556 | } else { |
| 1557 | for (auto &Use : ROMap.find(Val: MI)->getSecond().Uses) |
| 1558 | UpdateTargetRegs(Use.RegUnit); |
| 1559 | } |
| 1560 | } |
| 1561 | |
| 1562 | const auto InstrScore = [&Stages](MachineInstr *MI) { |
| 1563 | return Stages[MI] + MI->isPHI(); |
| 1564 | }; |
| 1565 | |
| 1566 | DenseMap<Register, MachineInstr *> LastUseMI; |
| 1567 | for (MachineInstr *MI : llvm::reverse(C: OrderedInsts)) { |
| 1568 | for (auto &Use : ROMap.find(Val: MI)->getSecond().Uses) { |
| 1569 | auto Reg = Use.RegUnit; |
| 1570 | if (!TargetRegs.contains(V: Reg)) |
| 1571 | continue; |
| 1572 | auto [Ite, Inserted] = LastUseMI.try_emplace(Key: Reg, Args&: MI); |
| 1573 | if (!Inserted) { |
| 1574 | MachineInstr *Orig = Ite->second; |
| 1575 | MachineInstr *New = MI; |
| 1576 | if (InstrScore(Orig) < InstrScore(New)) |
| 1577 | Ite->second = New; |
| 1578 | } |
| 1579 | } |
| 1580 | } |
| 1581 | |
| 1582 | Instr2LastUsesTy LastUses; |
| 1583 | for (auto &Entry : LastUseMI) |
| 1584 | LastUses[Entry.second].insert(V: Entry.first); |
| 1585 | return LastUses; |
| 1586 | } |
| 1587 | |
| 1588 | // Compute the maximum register pressure of the kernel. We'll simulate #Stage |
| 1589 | // iterations and check the register pressure at the point where all stages |
| 1590 | // overlap. |
| 1591 | // |
| 1592 | // An example of an unrolled loop where #Stage is 4: |
| 1593 | // Iter i+0 i+1 i+2 i+3 |
| 1594 | // ------------------------ |
| 1595 | // Stage 0 |
| 1596 | // Stage 1 0 |
| 1597 | // Stage 2 1 0 |
| 1598 | // Stage 3 2 1 0 <- All stages overlap |
| 1599 | // |
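| | // With #Stage = 4, iteration I executes the stage-S instructions of logical |
| | // iteration Iter = I - S; the pressure is sampled after every instruction, |
| | // so the recorded maximum also covers the fully overlapped point at |
| | // I = #Stage - 1. |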
| 1600 | std::vector<unsigned> |
| 1601 | computeMaxSetPressure(const OrderedInstsTy &OrderedInsts, |
| 1602 | Instr2StageTy &Stages, |
| 1603 | const unsigned StageCount) const { |
| 1604 | using RegSetTy = SmallDenseSet<Register, 16>; |
| 1605 | |
| 1606 | // Indexed by #Iter. To treat "local" variables of each stage separately, we |
| 1607 | // manage the liveness of the registers independently per iteration. |
| 1608 | SmallVector<RegSetTy> LiveRegSets(StageCount); |
| 1609 | |
| 1610 | auto CurSetPressure = InitSetPressure; |
| 1611 | auto MaxSetPressure = InitSetPressure; |
| 1612 | auto LastUses = computeLastUses(OrderedInsts, Stages); |
| 1613 | |
| 1614 | LLVM_DEBUG({ |
| 1615 | dbgs() << "Ordered instructions:\n"; |
| 1616 | for (MachineInstr *MI : OrderedInsts) { |
| 1617 | dbgs() << "Stage " << Stages[MI] << ": "; |
| 1618 | MI->dump(); |
| 1619 | } |
| 1620 | }); |
| 1621 | |
| 1622 | const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet, |
| 1623 | Register Reg) { |
| 1624 | if (!Reg.isValid() || isReservedRegister(Reg)) |
| 1625 | return; |
| 1626 | |
| 1627 | bool Inserted = RegSet.insert(V: Reg).second; |
| 1628 | if (!Inserted) |
| 1629 | return; |
| 1630 | |
| 1631 | LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n"); |
| 1632 | increaseRegisterPressure(Pressure&: CurSetPressure, Reg); |
| 1633 | LLVM_DEBUG(dumpPSet(Reg)); |
| 1634 | }; |
| 1635 | |
| 1636 | const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet, |
| 1637 | Register Reg) { |
| 1638 | if (!Reg.isValid() || isReservedRegister(Reg)) |
| 1639 | return; |
| 1640 | |
| 1641 | // live-in register |
| 1642 | if (!RegSet.contains(V: Reg)) |
| 1643 | return; |
| 1644 | |
| 1645 | LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n"); |
| 1646 | RegSet.erase(V: Reg); |
| 1647 | decreaseRegisterPressure(Pressure&: CurSetPressure, Reg); |
| 1648 | LLVM_DEBUG(dumpPSet(Reg)); |
| 1649 | }; |
| 1650 | |
| 1651 | for (unsigned I = 0; I < StageCount; I++) { |
| 1652 | for (MachineInstr *MI : OrderedInsts) { |
| 1653 | const auto Stage = Stages[MI]; |
| 1654 | if (I < Stage) |
| 1655 | continue; |
| 1656 | |
| 1657 | const unsigned Iter = I - Stage; |
| 1658 | |
| 1659 | for (auto &Def : ROMap.find(Val: MI)->getSecond().Defs) |
| 1660 | InsertReg(LiveRegSets[Iter], Def.RegUnit); |
| 1661 | |
| 1662 | for (auto LastUse : LastUses[MI]) { |
| 1663 | if (MI->isPHI()) { |
| 1664 | if (Iter != 0) |
| 1665 | EraseReg(LiveRegSets[Iter - 1], LastUse); |
| 1666 | } else { |
| 1667 | EraseReg(LiveRegSets[Iter], LastUse); |
| 1668 | } |
| 1669 | } |
| 1670 | |
| 1671 | for (unsigned PSet = 0; PSet < PSetNum; PSet++) |
| 1672 | MaxSetPressure[PSet] = |
| 1673 | std::max(a: MaxSetPressure[PSet], b: CurSetPressure[PSet]); |
| 1674 | |
| 1675 | LLVM_DEBUG({ |
| 1676 | dbgs() << "CurSetPressure="; |
| 1677 | dumpRegisterPressures(CurSetPressure); |
| 1678 | dbgs() << " iter=" << Iter << " stage=" << Stage << ":"; |
| 1679 | MI->dump(); |
| 1680 | }); |
| 1681 | } |
| 1682 | } |
| 1683 | |
| 1684 | return MaxSetPressure; |
| 1685 | } |
| 1686 | |
| 1687 | public: |
| 1688 | HighRegisterPressureDetector(MachineBasicBlock *OrigMBB, |
| 1689 | const MachineFunction &MF) |
| 1690 | : OrigMBB(OrigMBB), MRI(MF.getRegInfo()), |
| 1691 | TRI(MF.getSubtarget().getRegisterInfo()), |
| 1692 | PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0), |
| 1693 | PressureSetLimit(PSetNum, 0) {} |
| 1694 | |
| 1695 | // Used to calculate register pressure, which is independent of loop |
| 1696 | // scheduling. |
| 1697 | void init(const RegisterClassInfo &RCI) { |
| 1698 | for (MachineInstr &MI : *OrigMBB) { |
| 1699 | if (MI.isDebugInstr()) |
| 1700 | continue; |
| 1701 | ROMap[&MI].collect(MI, TRI: *TRI, MRI, TrackLaneMasks: false, IgnoreDead: true); |
| 1702 | } |
| 1703 | |
| 1704 | computeLiveIn(); |
| 1705 | computePressureSetLimit(RCI); |
| 1706 | } |
| 1707 | |
| 1708 | // Calculate the maximum register pressures of the loop and check if they |
| 1709 | // exceed the limit |
| 1710 | bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule, |
| 1711 | const unsigned MaxStage) const { |
| 1712 | assert(0 <= RegPressureMargin && RegPressureMargin <= 100 && |
| 1713 | "the percentage of the margin must be between 0 to 100" ); |
| 1714 | |
| 1715 | OrderedInstsTy OrderedInsts; |
| 1716 | Instr2StageTy Stages; |
| 1717 | computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages); |
| 1718 | const auto MaxSetPressure = |
| 1719 | computeMaxSetPressure(OrderedInsts, Stages, StageCount: MaxStage + 1); |
| 1720 | |
| 1721 | LLVM_DEBUG({ |
| 1722 | dbgs() << "Dump MaxSetPressure:\n"; |
| 1723 | for (unsigned I = 0; I < MaxSetPressure.size(); I++) { |
| 1724 | dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]); |
| 1725 | } |
| 1726 | dbgs() << '\n'; |
| 1727 | }); |
| 1728 | |
| 1729 | for (unsigned PSet = 0; PSet < PSetNum; PSet++) { |
| 1730 | unsigned Limit = PressureSetLimit[PSet]; |
| 1731 | unsigned Margin = Limit * RegPressureMargin / 100; |
| 1732 | LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit |
| 1733 | << " Margin=" << Margin << "\n" ); |
| 1734 | if (Limit < MaxSetPressure[PSet] + Margin) { |
| 1735 | LLVM_DEBUG( |
| 1736 | dbgs() |
| 1737 | << "Rejected the schedule because of too high register pressure\n" ); |
| 1738 | return true; |
| 1739 | } |
| 1740 | } |
| 1741 | return false; |
| 1742 | } |
| 1743 | }; |
| 1744 | |
| 1745 | } // end anonymous namespace |
| 1746 | |
| 1747 | /// Calculate the resource constrained minimum initiation interval for the |
| 1748 | /// specified loop. We use the DFA to model the resources needed for |
| 1749 | /// each instruction, and we ignore dependences. A different DFA is created |
| 1750 | /// for each cycle that is required. When adding a new instruction, we attempt |
| 1751 | /// to add it to each existing DFA, until a legal space is found. If the |
| 1752 | /// instruction cannot be reserved in an existing DFA, we create a new one. |
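| | /// For example, on a hypothetical machine with a single multiply unit, three |
| | /// multiplies in the loop body cannot share one cycle's DFA, so three DFAs |
| | /// are created and the resulting ResMII is at least 3. |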
| 1753 | unsigned SwingSchedulerDAG::calculateResMII() { |
| 1754 | LLVM_DEBUG(dbgs() << "calculateResMII:\n"); |
| 1755 | ResourceManager RM(&MF.getSubtarget(), this); |
| 1756 | return RM.calculateResMII(); |
| 1757 | } |
| 1758 | |
| 1759 | /// Calculate the recurrence-constrained minimum initiation interval. |
| 1760 | /// Iterate over each circuit. Compute the delay(c) and distance(c) |
| 1761 | /// for each circuit. The II needs to satisfy the inequality |
| 1762 | /// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest |
| 1763 | /// II that satisfies the inequality, and the RecMII is the maximum |
| 1764 | /// of those values. |
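| | /// As an illustration with made-up numbers: a circuit with delay(c) = 7 and |
| | /// distance(c) = 2 needs the smallest II such that 7 - II * 2 <= 0, i.e. |
| | /// II = ceil(7 / 2) = 4. Note that the implementation below currently |
| | /// assumes distance(c) = 1, so each circuit contributes its total latency. |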
| 1765 | unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) { |
| 1766 | unsigned RecMII = 0; |
| 1767 | |
| 1768 | for (NodeSet &Nodes : NodeSets) { |
| 1769 | if (Nodes.empty()) |
| 1770 | continue; |
| 1771 | |
| 1772 | unsigned Delay = Nodes.getLatency(); |
| 1773 | unsigned Distance = 1; |
| 1774 | |
| 1775 | // ii = ceil(delay / distance) |
| 1776 | unsigned CurMII = (Delay + Distance - 1) / Distance; |
| 1777 | Nodes.setRecMII(CurMII); |
| 1778 | if (CurMII > RecMII) |
| 1779 | RecMII = CurMII; |
| 1780 | } |
| 1781 | |
| 1782 | return RecMII; |
| 1783 | } |
| 1784 | |
| 1785 | /// Create the adjacency structure of the nodes in the graph. |
| 1786 | void SwingSchedulerDAG::Circuits::createAdjacencyStructure( |
| 1787 | SwingSchedulerDAG *DAG) { |
| 1788 | BitVector Added(SUnits.size()); |
| 1789 | DenseMap<int, int> OutputDeps; |
| 1790 | for (int i = 0, e = SUnits.size(); i != e; ++i) { |
| 1791 | Added.reset(); |
| 1792 | // Add any successor to the adjacency matrix and exclude duplicates. |
| 1793 | for (auto &OE : DAG->DDG->getOutEdges(SU: &SUnits[i])) { |
| 1794 | // Only create a back-edge on the first and last nodes of a dependence |
| 1795 | // chain. This records any chains and adds them later. |
| 1796 | if (OE.isOutputDep()) { |
| 1797 | int N = OE.getDst()->NodeNum; |
| 1798 | int BackEdge = i; |
| 1799 | auto Dep = OutputDeps.find(Val: BackEdge); |
| 1800 | if (Dep != OutputDeps.end()) { |
| 1801 | BackEdge = Dep->second; |
| 1802 | OutputDeps.erase(I: Dep); |
| 1803 | } |
| 1804 | OutputDeps[N] = BackEdge; |
| 1805 | } |
| 1806 | // Do not process a boundary node, an artificial node. |
| 1807 | if (OE.getDst()->isBoundaryNode() || OE.isArtificial()) |
| 1808 | continue; |
| 1809 | |
| 1810 | // This code is retained to preserve previous behavior and prevent |
| 1811 | // regression. This condition means that anti-dependencies within an |
| 1812 | // iteration are ignored when searching circuits. Therefore it's natural |
| 1813 | // to consider this dependence as well. |
| 1814 | // FIXME: Remove this code if it doesn't have significant impact on |
| 1815 | // performance. |
| 1816 | if (OE.isAntiDep()) |
| 1817 | continue; |
| 1818 | |
| 1819 | int N = OE.getDst()->NodeNum; |
| 1820 | if (!Added.test(Idx: N)) { |
| 1821 | AdjK[i].push_back(Elt: N); |
| 1822 | Added.set(N); |
| 1823 | } |
| 1824 | } |
| 1825 | // A chain edge between a store and a load is treated as a back-edge in the |
| 1826 | // adjacency matrix. |
| 1827 | for (auto &IE : DAG->DDG->getInEdges(SU: &SUnits[i])) { |
| 1828 | SUnit *Src = IE.getSrc(); |
| 1829 | SUnit *Dst = IE.getDst(); |
| 1830 | if (!Dst->getInstr()->mayStore() || !DAG->isLoopCarriedDep(Edge: IE)) |
| 1831 | continue; |
| 1832 | if (IE.isOrderDep() && Src->getInstr()->mayLoad()) { |
| 1833 | int N = Src->NodeNum; |
| 1834 | if (!Added.test(Idx: N)) { |
| 1835 | AdjK[i].push_back(Elt: N); |
| 1836 | Added.set(N); |
| 1837 | } |
| 1838 | } |
| 1839 | } |
| 1840 | } |
| 1841 | // Add back-edges in the adjacency matrix for the output dependences. |
| 1842 | for (auto &OD : OutputDeps) |
| 1843 | if (!Added.test(Idx: OD.second)) { |
| 1844 | AdjK[OD.first].push_back(Elt: OD.second); |
| 1845 | Added.set(OD.second); |
| 1846 | } |
| 1847 | } |
| 1848 | |
| 1849 | /// Identify an elementary circuit in the dependence graph starting at the |
| 1850 | /// specified node. |
| 1851 | bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets, |
| 1852 | const SwingSchedulerDAG *DAG, |
| 1853 | bool HasBackedge) { |
| 1854 | SUnit *SV = &SUnits[V]; |
| 1855 | bool F = false; |
| 1856 | Stack.insert(X: SV); |
| 1857 | Blocked.set(V); |
| 1858 | |
| 1859 | for (auto W : AdjK[V]) { |
| 1860 | if (NumPaths > MaxPaths) |
| 1861 | break; |
| 1862 | if (W < S) |
| 1863 | continue; |
| 1864 | if (W == S) { |
| 1865 | if (!HasBackedge) |
| 1866 | NodeSets.push_back(Elt: NodeSet(Stack.begin(), Stack.end(), DAG)); |
| 1867 | F = true; |
| 1868 | ++NumPaths; |
| 1869 | break; |
| 1870 | } |
| 1871 | if (!Blocked.test(Idx: W)) { |
| 1872 | if (circuit(V: W, S, NodeSets, DAG, |
| 1873 | HasBackedge: Node2Idx->at(n: W) < Node2Idx->at(n: V) ? true : HasBackedge)) |
| 1874 | F = true; |
| 1875 | } |
| 1876 | } |
| 1877 | |
| 1878 | if (F) |
| 1879 | unblock(U: V); |
| 1880 | else { |
| 1881 | for (auto W : AdjK[V]) { |
| 1882 | if (W < S) |
| 1883 | continue; |
| 1884 | B[W].insert(Ptr: SV); |
| 1885 | } |
| 1886 | } |
| 1887 | Stack.pop_back(); |
| 1888 | return F; |
| 1889 | } |
| 1890 | |
| 1891 | /// Unblock a node in the circuit finding algorithm. |
| 1892 | void SwingSchedulerDAG::Circuits::unblock(int U) { |
| 1893 | Blocked.reset(Idx: U); |
| 1894 | SmallPtrSet<SUnit *, 4> &BU = B[U]; |
| 1895 | while (!BU.empty()) { |
| 1896 | SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin(); |
| 1897 | assert(SI != BU.end() && "Invalid B set."); |
| 1898 | SUnit *W = *SI; |
| 1899 | BU.erase(Ptr: W); |
| 1900 | if (Blocked.test(Idx: W->NodeNum)) |
| 1901 | unblock(U: W->NodeNum); |
| 1902 | } |
| 1903 | } |
| 1904 | |
| 1905 | /// Identify all the elementary circuits in the dependence graph using |
| 1906 | /// Johnson's circuit algorithm. |
| 1907 | void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) { |
| 1908 | Circuits Cir(SUnits, Topo); |
| 1909 | // Create the adjacency structure. |
| 1910 | Cir.createAdjacencyStructure(DAG: this); |
| 1911 | for (int I = 0, E = SUnits.size(); I != E; ++I) { |
| 1912 | Cir.reset(); |
| 1913 | Cir.circuit(V: I, S: I, NodeSets, DAG: this); |
| 1914 | } |
| 1915 | } |
| 1916 | |
| 1917 | // Create artificial dependencies between the source of COPY/REG_SEQUENCE that |
| 1918 | // is loop-carried to the USE in the next iteration. This helps the pipeliner |
| 1919 | // avoid additional copies that are needed across iterations. An artificial |
| 1920 | // dependence edge is added from USE to SOURCE of COPY/REG_SEQUENCE. |
| 1921 | |
| 1922 | // PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried) |
| 1923 | // SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE |
| 1924 | // PHI-------True-Dep------> USEOfPhi |
| 1925 | |
| 1926 | // The mutation creates |
| 1927 | // USEOfPHI -------Artificial-Dep---> SRCOfCopy |
| 1928 | |
| 1929 | // Overall, this ensures that USEOfPHI is scheduled before SRCOfCopy |
| 1930 | // (since USE is a predecessor), which implies that the COPY/REG_SEQUENCE is |
| 1931 | // scheduled late enough to avoid additional copies across iterations. The |
| 1932 | // resulting scheduling order would be |
| 1933 | // USEOfPHI --- SRCOfCopy --- COPY/REG_SEQUENCE. |
| 1934 | |
| 1935 | void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) { |
| 1936 | for (SUnit &SU : DAG->SUnits) { |
| 1937 | // Find the COPY/REG_SEQUENCE instruction. |
| 1938 | if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence()) |
| 1939 | continue; |
| 1940 | |
| 1941 | // Record the loop carried PHIs. |
| 1942 | SmallVector<SUnit *, 4> PHISUs; |
| 1943 | // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions. |
| 1944 | SmallVector<SUnit *, 4> SrcSUs; |
| 1945 | |
| 1946 | for (auto &Dep : SU.Preds) { |
| 1947 | SUnit *TmpSU = Dep.getSUnit(); |
| 1948 | MachineInstr *TmpMI = TmpSU->getInstr(); |
| 1949 | SDep::Kind DepKind = Dep.getKind(); |
| 1950 | // Save the loop carried PHI. |
| 1951 | if (DepKind == SDep::Anti && TmpMI->isPHI()) |
| 1952 | PHISUs.push_back(Elt: TmpSU); |
| 1953 | // Save the source of COPY/REG_SEQUENCE. |
| 1954 | // If the source has no predecessors, we will end up creating cycles. |
| 1955 | else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0) |
| 1956 | SrcSUs.push_back(Elt: TmpSU); |
| 1957 | } |
| 1958 | |
| 1959 | if (PHISUs.size() == 0 || SrcSUs.size() == 0) |
| 1960 | continue; |
| 1961 | |
| 1962 | // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this |
| 1963 | // SUnit to the container. |
| 1964 | SmallVector<SUnit *, 8> UseSUs; |
| 1965 | // Do not use iterator based loop here as we are updating the container. |
| 1966 | for (size_t Index = 0; Index < PHISUs.size(); ++Index) { |
| 1967 | for (auto &Dep : PHISUs[Index]->Succs) { |
| 1968 | if (Dep.getKind() != SDep::Data) |
| 1969 | continue; |
| 1970 | |
| 1971 | SUnit *TmpSU = Dep.getSUnit(); |
| 1972 | MachineInstr *TmpMI = TmpSU->getInstr(); |
| 1973 | if (TmpMI->isPHI() || TmpMI->isRegSequence()) { |
| 1974 | PHISUs.push_back(Elt: TmpSU); |
| 1975 | continue; |
| 1976 | } |
| 1977 | UseSUs.push_back(Elt: TmpSU); |
| 1978 | } |
| 1979 | } |
| 1980 | |
| 1981 | if (UseSUs.size() == 0) |
| 1982 | continue; |
| 1983 | |
| 1984 | SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(Val: DAG); |
| 1985 | // Add the artificial dependencies if it does not form a cycle. |
| 1986 | for (auto *I : UseSUs) { |
| 1987 | for (auto *Src : SrcSUs) { |
| 1988 | if (!SDAG->Topo.IsReachable(SU: I, TargetSU: Src) && Src != I) { |
| 1989 | Src->addPred(D: SDep(I, SDep::Artificial)); |
| 1990 | SDAG->Topo.AddPred(Y: Src, X: I); |
| 1991 | } |
| 1992 | } |
| 1993 | } |
| 1994 | } |
| 1995 | } |
| 1996 | |
| 1997 | /// Compute several functions needed to order the nodes for scheduling. |
| 1998 | /// ASAP - Earliest time to schedule a node. |
| 1999 | /// ALAP - Latest time to schedule a node. |
| 2000 | /// MOV - Mobility function, difference between ALAP and ASAP. |
| 2001 | /// D - Depth of each node. |
| 2002 | /// H - Height of each node. |
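| | /// |
| | /// For a simple chain A -2-> B -1-> C (hypothetical latencies, no |
| | /// recurrence): ASAP = {A:0, B:2, C:3} and maxASAP = 3; the reverse |
| | /// traversal gives ALAP = {C:3, B:2, A:0}, so MOV = ALAP - ASAP = 0 for all |
| | /// three nodes, i.e. the whole chain is on the critical path and has no |
| | /// scheduling slack. |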
| 2003 | void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) { |
| 2004 | ScheduleInfo.resize(new_size: SUnits.size()); |
| 2005 | |
| 2006 | LLVM_DEBUG({ |
| 2007 | for (int I : Topo) { |
| 2008 | const SUnit &SU = SUnits[I]; |
| 2009 | dumpNode(SU); |
| 2010 | } |
| 2011 | }); |
| 2012 | |
| 2013 | int maxASAP = 0; |
| 2014 | // Compute ASAP and ZeroLatencyDepth. |
| 2015 | for (int I : Topo) { |
| 2016 | int asap = 0; |
| 2017 | int zeroLatencyDepth = 0; |
| 2018 | SUnit *SU = &SUnits[I]; |
| 2019 | for (const auto &IE : DDG->getInEdges(SU)) { |
| 2020 | SUnit *Pred = IE.getSrc(); |
| 2021 | if (IE.getLatency() == 0) |
| 2022 | zeroLatencyDepth = |
| 2023 | std::max(a: zeroLatencyDepth, b: getZeroLatencyDepth(Node: Pred) + 1); |
| 2024 | if (IE.ignoreDependence(IgnoreAnti: true)) |
| 2025 | continue; |
| 2026 | asap = std::max(a: asap, b: (int)(getASAP(Node: Pred) + IE.getLatency() - |
| 2027 | IE.getDistance() * MII)); |
| 2028 | } |
| 2029 | maxASAP = std::max(a: maxASAP, b: asap); |
| 2030 | ScheduleInfo[I].ASAP = asap; |
| 2031 | ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth; |
| 2032 | } |
| 2033 | |
| 2034 | // Compute ALAP, ZeroLatencyHeight, and MOV. |
| 2035 | for (int I : llvm::reverse(C&: Topo)) { |
| 2036 | int alap = maxASAP; |
| 2037 | int zeroLatencyHeight = 0; |
| 2038 | SUnit *SU = &SUnits[I]; |
| 2039 | for (const auto &OE : DDG->getOutEdges(SU)) { |
| 2040 | SUnit *Succ = OE.getDst(); |
| 2041 | if (Succ->isBoundaryNode()) |
| 2042 | continue; |
| 2043 | if (OE.getLatency() == 0) |
| 2044 | zeroLatencyHeight = |
| 2045 | std::max(a: zeroLatencyHeight, b: getZeroLatencyHeight(Node: Succ) + 1); |
| 2046 | if (OE.ignoreDependence(IgnoreAnti: true)) |
| 2047 | continue; |
| 2048 | alap = std::min(a: alap, b: (int)(getALAP(Node: Succ) - OE.getLatency() + |
| 2049 | OE.getDistance() * MII)); |
| 2050 | } |
| 2051 | |
| 2052 | ScheduleInfo[I].ALAP = alap; |
| 2053 | ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight; |
| 2054 | } |
| 2055 | |
| 2056 | // After computing the node functions, compute the summary for each node set. |
| 2057 | for (NodeSet &I : NodeSets) |
| 2058 | I.computeNodeSetInfo(SSD: this); |
| 2059 | |
| 2060 | LLVM_DEBUG({ |
| 2061 | for (unsigned i = 0; i < SUnits.size(); i++) { |
| 2062 | dbgs() << "\tNode " << i << ":\n"; |
| 2063 | dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n"; |
| 2064 | dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n"; |
| 2065 | dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n"; |
| 2066 | dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n"; |
| 2067 | dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n"; |
| 2068 | dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n"; |
| 2069 | dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n"; |
| 2070 | } |
| 2071 | }); |
| 2072 | } |
| 2073 | |
| 2074 | /// Compute the Pred_L(O) set, as defined in the paper. The set is defined |
| 2075 | /// as the predecessors of the elements of NodeOrder that are not also in |
| 2076 | /// NodeOrder. |
| 2077 | static bool pred_L(SetVector<SUnit *> &NodeOrder, |
| 2078 | SmallSetVector<SUnit *, 8> &Preds, SwingSchedulerDDG *DDG, |
| 2079 | const NodeSet *S = nullptr) { |
| 2080 | Preds.clear(); |
| 2081 | |
| 2082 | for (SUnit *SU : NodeOrder) { |
| 2083 | for (const auto &IE : DDG->getInEdges(SU)) { |
| 2084 | SUnit *PredSU = IE.getSrc(); |
| 2085 | if (S && S->count(SU: PredSU) == 0) |
| 2086 | continue; |
| 2087 | if (IE.ignoreDependence(IgnoreAnti: true)) |
| 2088 | continue; |
| 2089 | if (NodeOrder.count(key: PredSU) == 0) |
| 2090 | Preds.insert(X: PredSU); |
| 2091 | } |
| 2092 | |
| 2093 | // FIXME: The following loop-carried dependencies may also need to be |
| 2094 | // considered. |
| 2095 | // - Physical register dependencies (true-dependence and WAW). |
| 2096 | // - Memory dependencies. |
| 2097 | for (const auto &OE : DDG->getOutEdges(SU)) { |
| 2098 | SUnit *SuccSU = OE.getDst(); |
| 2099 | if (!OE.isAntiDep()) |
| 2100 | continue; |
| 2101 | if (S && S->count(SU: SuccSU) == 0) |
| 2102 | continue; |
| 2103 | if (NodeOrder.count(key: SuccSU) == 0) |
| 2104 | Preds.insert(X: SuccSU); |
| 2105 | } |
| 2106 | } |
| 2107 | return !Preds.empty(); |
| 2108 | } |
| 2109 | |
| 2110 | /// Compute the Succ_L(O) set, as defined in the paper. The set is defined |
| 2111 | /// as the successors of the elements of NodeOrder that are not also in |
| 2112 | /// NodeOrder. |
| 2113 | static bool succ_L(SetVector<SUnit *> &NodeOrder, |
| 2114 | SmallSetVector<SUnit *, 8> &Succs, SwingSchedulerDDG *DDG, |
| 2115 | const NodeSet *S = nullptr) { |
| 2116 | Succs.clear(); |
| 2117 | |
| 2118 | for (SUnit *SU : NodeOrder) { |
| 2119 | for (const auto &OE : DDG->getOutEdges(SU)) { |
| 2120 | SUnit *SuccSU = OE.getDst(); |
| 2121 | if (S && S->count(SU: SuccSU) == 0) |
| 2122 | continue; |
| 2123 | if (OE.ignoreDependence(IgnoreAnti: false)) |
| 2124 | continue; |
| 2125 | if (NodeOrder.count(key: SuccSU) == 0) |
| 2126 | Succs.insert(X: SuccSU); |
| 2127 | } |
| 2128 | |
| 2129 | // FIXME: The following loop-carried dependencies may also need to be |
| 2130 | // considered. |
| 2131 | // - Physical register dependencies (true-dependence and WAW). |
| 2132 | // - Memory dependencies. |
| 2133 | for (const auto &IE : DDG->getInEdges(SU)) { |
| 2134 | SUnit *PredSU = IE.getSrc(); |
| 2135 | if (!IE.isAntiDep()) |
| 2136 | continue; |
| 2137 | if (S && S->count(SU: PredSU) == 0) |
| 2138 | continue; |
| 2139 | if (NodeOrder.count(key: PredSU) == 0) |
| 2140 | Succs.insert(X: PredSU); |
| 2141 | } |
| 2142 | } |
| 2143 | return !Succs.empty(); |
| 2144 | } |
| 2145 | |
| 2146 | /// Return true if there is a path from the specified node to any of the nodes |
| 2147 | /// in DestNodes. Keep track and return the nodes in any path. |
| 2148 | static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path, |
| 2149 | SetVector<SUnit *> &DestNodes, |
| 2150 | SetVector<SUnit *> &Exclude, |
| 2151 | SmallPtrSet<SUnit *, 8> &Visited, |
| 2152 | SwingSchedulerDDG *DDG) { |
| 2153 | if (Cur->isBoundaryNode()) |
| 2154 | return false; |
| 2155 | if (Exclude.contains(key: Cur)) |
| 2156 | return false; |
| 2157 | if (DestNodes.contains(key: Cur)) |
| 2158 | return true; |
| 2159 | if (!Visited.insert(Ptr: Cur).second) |
| 2160 | return Path.contains(key: Cur); |
| 2161 | bool FoundPath = false; |
| 2162 | for (const auto &OE : DDG->getOutEdges(SU: Cur)) |
| 2163 | if (!OE.ignoreDependence(IgnoreAnti: false)) |
| 2164 | FoundPath |= |
| 2165 | computePath(Cur: OE.getDst(), Path, DestNodes, Exclude, Visited, DDG); |
| 2166 | for (const auto &IE : DDG->getInEdges(SU: Cur)) |
| 2167 | if (IE.isAntiDep() && IE.getDistance() == 0) |
| 2168 | FoundPath |= |
| 2169 | computePath(Cur: IE.getSrc(), Path, DestNodes, Exclude, Visited, DDG); |
| 2170 | if (FoundPath) |
| 2171 | Path.insert(X: Cur); |
| 2172 | return FoundPath; |
| 2173 | } |
| 2174 | |
| 2175 | /// Compute the live-out registers for the instructions in a node-set. |
| 2176 | /// The live-out registers are those that are defined in the node-set, |
| 2177 | /// but not used. Except for use operands of Phis. |
| 2178 | static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, |
| 2179 | NodeSet &NS) { |
| 2180 | const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); |
| 2181 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
| 2182 | SmallVector<VRegMaskOrUnit, 8> LiveOutRegs; |
| 2183 | SmallSet<Register, 4> Uses; |
| 2184 | for (SUnit *SU : NS) { |
| 2185 | const MachineInstr *MI = SU->getInstr(); |
| 2186 | if (MI->isPHI()) |
| 2187 | continue; |
| 2188 | for (const MachineOperand &MO : MI->all_uses()) { |
| 2189 | Register Reg = MO.getReg(); |
| 2190 | if (Reg.isVirtual()) |
| 2191 | Uses.insert(V: Reg); |
| 2192 | else if (MRI.isAllocatable(PhysReg: Reg)) |
| 2193 | Uses.insert_range(R: TRI->regunits(Reg: Reg.asMCReg())); |
| 2194 | } |
| 2195 | } |
| 2196 | for (SUnit *SU : NS) |
| 2197 | for (const MachineOperand &MO : SU->getInstr()->all_defs()) |
| 2198 | if (!MO.isDead()) { |
| 2199 | Register Reg = MO.getReg(); |
| 2200 | if (Reg.isVirtual()) { |
| 2201 | if (!Uses.count(V: Reg)) |
| 2202 | LiveOutRegs.emplace_back(Args&: Reg, Args: LaneBitmask::getNone()); |
| 2203 | } else if (MRI.isAllocatable(PhysReg: Reg)) { |
| 2204 | for (MCRegUnit Unit : TRI->regunits(Reg: Reg.asMCReg())) |
| 2205 | if (!Uses.count(V: Unit)) |
| 2206 | LiveOutRegs.emplace_back(Args&: Unit, Args: LaneBitmask::getNone()); |
| 2207 | } |
| 2208 | } |
| 2209 | RPTracker.addLiveRegs(Regs: LiveOutRegs); |
| 2210 | } |
| 2211 | |
| 2212 | /// A heuristic to filter nodes in recurrent node-sets if the register |
| 2213 | /// pressure of a set is too high. |
| 2214 | void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) { |
| 2215 | for (auto &NS : NodeSets) { |
| 2216 | // Skip small node-sets since they won't cause register pressure problems. |
| 2217 | if (NS.size() <= 2) |
| 2218 | continue; |
| 2219 | IntervalPressure RecRegPressure; |
| 2220 | RegPressureTracker RecRPTracker(RecRegPressure); |
| 2221 | RecRPTracker.init(mf: &MF, rci: &RegClassInfo, lis: &LIS, mbb: BB, pos: BB->end(), TrackLaneMasks: false, TrackUntiedDefs: true); |
| 2222 | computeLiveOuts(MF, RPTracker&: RecRPTracker, NS); |
| 2223 | RecRPTracker.closeBottom(); |
| 2224 | |
| 2225 | std::vector<SUnit *> SUnits(NS.begin(), NS.end()); |
| 2226 | llvm::sort(C&: SUnits, Comp: [](const SUnit *A, const SUnit *B) { |
| 2227 | return A->NodeNum > B->NodeNum; |
| 2228 | }); |
| 2229 | |
| 2230 | for (auto &SU : SUnits) { |
| 2231 | // Since we're computing the register pressure for a subset of the |
| 2232 | // instructions in a block, we need to set the tracker for each |
| 2233 | // instruction in the node-set. The tracker is set to the instruction |
| 2234 | // just after the one we're interested in. |
| 2235 | MachineBasicBlock::const_iterator CurInstI = SU->getInstr(); |
| 2236 | RecRPTracker.setPos(std::next(x: CurInstI)); |
| 2237 | |
| 2238 | RegPressureDelta RPDelta; |
| 2239 | ArrayRef<PressureChange> CriticalPSets; |
| 2240 | RecRPTracker.getMaxUpwardPressureDelta(MI: SU->getInstr(), PDiff: nullptr, Delta&: RPDelta, |
| 2241 | CriticalPSets, |
| 2242 | MaxPressureLimit: RecRegPressure.MaxSetPressure); |
| 2243 | if (RPDelta.Excess.isValid()) { |
| 2244 | LLVM_DEBUG( |
| 2245 | dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") " |
| 2246 | << TRI->getRegPressureSetName(RPDelta.Excess.getPSet()) |
| 2247 | << ":" << RPDelta.Excess.getUnitInc() << "\n" ); |
| 2248 | NS.setExceedPressure(SU); |
| 2249 | break; |
| 2250 | } |
| 2251 | RecRPTracker.recede(); |
| 2252 | } |
| 2253 | } |
| 2254 | } |
| 2255 | |
| 2256 | /// A heuristic to colocate node sets that have the same set of |
| 2257 | /// successors. |
| 2258 | void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) { |
| 2259 | unsigned Colocate = 0; |
| 2260 | for (int i = 0, e = NodeSets.size(); i < e; ++i) { |
| 2261 | NodeSet &N1 = NodeSets[i]; |
| 2262 | SmallSetVector<SUnit *, 8> S1; |
| 2263 | if (N1.empty() || !succ_L(NodeOrder&: N1, Succs&: S1, DDG: DDG.get())) |
| 2264 | continue; |
| 2265 | for (int j = i + 1; j < e; ++j) { |
| 2266 | NodeSet &N2 = NodeSets[j]; |
| 2267 | if (N1.compareRecMII(RHS&: N2) != 0) |
| 2268 | continue; |
| 2269 | SmallSetVector<SUnit *, 8> S2; |
| 2270 | if (N2.empty() || !succ_L(NodeOrder&: N2, Succs&: S2, DDG: DDG.get())) |
| 2271 | continue; |
| 2272 | if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) { |
| 2273 | N1.setColocate(++Colocate); |
| 2274 | N2.setColocate(Colocate); |
| 2275 | break; |
| 2276 | } |
| 2277 | } |
| 2278 | } |
| 2279 | } |
| 2280 | |
| 2281 | /// Check if the existing node-sets are profitable. If not, then ignore the |
| 2282 | /// recurrent node-sets, and attempt to schedule all nodes together. This is |
| 2283 | /// a heuristic. If the MII is large and all the recurrent node-sets are small, |
| 2284 | /// then it's best to try to schedule all instructions together instead of |
| 2285 | /// starting with the recurrent node-sets. |
| 2286 | void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) { |
| 2287 | // Look for loops with a large MII. |
| 2288 | if (MII < 17) |
| 2289 | return; |
| 2290 | // Check if the node-set contains only a simple add recurrence. |
| 2291 | for (auto &NS : NodeSets) { |
| 2292 | if (NS.getRecMII() > 2) |
| 2293 | return; |
| 2294 | if (NS.getMaxDepth() > MII) |
| 2295 | return; |
| 2296 | } |
| 2297 | NodeSets.clear(); |
| 2298 | LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n"); |
| 2299 | } |
| 2300 | |
| 2301 | /// Add the nodes that do not belong to a recurrence set into groups |
| 2302 | /// based upon connected components. |
| 2303 | void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) { |
| 2304 | SetVector<SUnit *> NodesAdded; |
| 2305 | SmallPtrSet<SUnit *, 8> Visited; |
| 2306 | // Add the nodes that are on a path between the previous node sets and |
| 2307 | // the current node set. |
| 2308 | for (NodeSet &I : NodeSets) { |
| 2309 | SmallSetVector<SUnit *, 8> N; |
| 2310 | // Add the nodes from the current node set to the previous node set. |
| 2311 | if (succ_L(NodeOrder&: I, Succs&: N, DDG: DDG.get())) { |
| 2312 | SetVector<SUnit *> Path; |
| 2313 | for (SUnit *NI : N) { |
| 2314 | Visited.clear(); |
| 2315 | computePath(Cur: NI, Path, DestNodes&: NodesAdded, Exclude&: I, Visited, DDG: DDG.get()); |
| 2316 | } |
| 2317 | if (!Path.empty()) |
| 2318 | I.insert(S: Path.begin(), E: Path.end()); |
| 2319 | } |
| 2320 | // Add the nodes from the previous node set to the current node set. |
| 2321 | N.clear(); |
| 2322 | if (succ_L(NodeOrder&: NodesAdded, Succs&: N, DDG: DDG.get())) { |
| 2323 | SetVector<SUnit *> Path; |
| 2324 | for (SUnit *NI : N) { |
| 2325 | Visited.clear(); |
| 2326 | computePath(Cur: NI, Path, DestNodes&: I, Exclude&: NodesAdded, Visited, DDG: DDG.get()); |
| 2327 | } |
| 2328 | if (!Path.empty()) |
| 2329 | I.insert(S: Path.begin(), E: Path.end()); |
| 2330 | } |
| 2331 | NodesAdded.insert_range(R&: I); |
| 2332 | } |
| 2333 | |
| 2334 | // Create a new node set with the connected nodes of any successor of a node |
| 2335 | // in a recurrent set. |
| 2336 | NodeSet NewSet; |
| 2337 | SmallSetVector<SUnit *, 8> N; |
| 2338 | if (succ_L(NodeOrder&: NodesAdded, Succs&: N, DDG: DDG.get())) |
| 2339 | for (SUnit *I : N) |
| 2340 | addConnectedNodes(SU: I, NewSet, NodesAdded); |
| 2341 | if (!NewSet.empty()) |
| 2342 | NodeSets.push_back(Elt: NewSet); |
| 2343 | |
| 2344 | // Create a new node set with the connected nodes of any predecessor of a node |
| 2345 | // in a recurrent set. |
| 2346 | NewSet.clear(); |
| 2347 | if (pred_L(NodeOrder&: NodesAdded, Preds&: N, DDG: DDG.get())) |
| 2348 | for (SUnit *I : N) |
| 2349 | addConnectedNodes(SU: I, NewSet, NodesAdded); |
| 2350 | if (!NewSet.empty()) |
| 2351 | NodeSets.push_back(Elt: NewSet); |
| 2352 | |
| 2353 | // Create new node sets with the connected nodes of any remaining node that |
| 2354 | // has no predecessor. |
| 2355 | for (SUnit &SU : SUnits) { |
| 2356 | if (NodesAdded.count(key: &SU) == 0) { |
| 2357 | NewSet.clear(); |
| 2358 | addConnectedNodes(SU: &SU, NewSet, NodesAdded); |
| 2359 | if (!NewSet.empty()) |
| 2360 | NodeSets.push_back(Elt: NewSet); |
| 2361 | } |
| 2362 | } |
| 2363 | } |
| 2364 | |
| 2365 | /// Add the node to the set, and add all of its connected nodes to the set. |
| 2366 | void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet, |
| 2367 | SetVector<SUnit *> &NodesAdded) { |
| 2368 | NewSet.insert(SU); |
| 2369 | NodesAdded.insert(X: SU); |
| 2370 | for (auto &OE : DDG->getOutEdges(SU)) { |
| 2371 | SUnit *Successor = OE.getDst(); |
| 2372 | if (!OE.isArtificial() && !Successor->isBoundaryNode() && |
| 2373 | NodesAdded.count(key: Successor) == 0) |
| 2374 | addConnectedNodes(SU: Successor, NewSet, NodesAdded); |
| 2375 | } |
| 2376 | for (auto &IE : DDG->getInEdges(SU)) { |
| 2377 | SUnit *Predecessor = IE.getSrc(); |
| 2378 | if (!IE.isArtificial() && NodesAdded.count(key: Predecessor) == 0) |
| 2379 | addConnectedNodes(SU: Predecessor, NewSet, NodesAdded); |
| 2380 | } |
| 2381 | } |
| 2382 | |
| 2383 | /// Return true if Set1 contains elements in Set2. The elements in common |
| 2384 | /// are returned in Result. |
| 2385 | static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2, |
| 2386 | SmallSetVector<SUnit *, 8> &Result) { |
| 2387 | Result.clear(); |
| 2388 | for (SUnit *SU : Set1) { |
| 2389 | if (Set2.count(SU) != 0) |
| 2390 | Result.insert(X: SU); |
| 2391 | } |
| 2392 | return !Result.empty(); |
| 2393 | } |
| 2394 | |
| 2395 | /// Merge the recurrence node sets that have the same initial node. |
| 2396 | void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) { |
| 2397 | for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E; |
| 2398 | ++I) { |
| 2399 | NodeSet &NI = *I; |
| 2400 | for (NodeSetType::iterator J = I + 1; J != E;) { |
| 2401 | NodeSet &NJ = *J; |
| 2402 | if (NI.getNode(i: 0)->NodeNum == NJ.getNode(i: 0)->NodeNum) { |
| 2403 | if (NJ.compareRecMII(RHS&: NI) > 0) |
| 2404 | NI.setRecMII(NJ.getRecMII()); |
| 2405 | for (SUnit *SU : *J) |
| 2406 | I->insert(SU); |
| 2407 | NodeSets.erase(CI: J); |
| 2408 | E = NodeSets.end(); |
| 2409 | } else { |
| 2410 | ++J; |
| 2411 | } |
| 2412 | } |
| 2413 | } |
| 2414 | } |
| 2415 | |
| 2416 | /// Remove nodes that have been scheduled in previous NodeSets. |
| 2417 | void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) { |
| 2418 | for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E; |
| 2419 | ++I) |
| 2420 | for (NodeSetType::iterator J = I + 1; J != E;) { |
| 2421 | J->remove_if(P: [&](SUnit *SUJ) { return I->count(SU: SUJ); }); |
| 2422 | |
| 2423 | if (J->empty()) { |
| 2424 | NodeSets.erase(CI: J); |
| 2425 | E = NodeSets.end(); |
| 2426 | } else { |
| 2427 | ++J; |
| 2428 | } |
| 2429 | } |
| 2430 | } |
| 2431 | |
| 2432 | /// Compute an ordered list of the dependence graph nodes, which |
| 2433 | /// indicates the order that the nodes will be scheduled. This is a |
| 2434 | /// two-level algorithm. First, a partial order is created, which |
| 2435 | /// consists of a list of sets ordered from highest to lowest priority. |
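| | /// Second, within each set the nodes are emitted one at a time, alternating |
| | /// between top-down passes (pick the ready node with maximum height, then |
| | /// maximum ZeroLatencyHeight, then minimum MOV) and bottom-up passes (the |
| | /// same with depth and ZeroLatencyDepth), until every node has been placed. |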
| 2436 | void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) { |
| 2437 | SmallSetVector<SUnit *, 8> R; |
| 2438 | NodeOrder.clear(); |
| 2439 | |
| 2440 | for (auto &Nodes : NodeSets) { |
| 2441 | LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n"); |
| 2442 | OrderKind Order; |
| 2443 | SmallSetVector<SUnit *, 8> N; |
| 2444 | if (pred_L(NodeOrder, Preds&: N, DDG: DDG.get()) && llvm::set_is_subset(S1: N, S2: Nodes)) { |
| 2445 | R.insert_range(R&: N); |
| 2446 | Order = BottomUp; |
| 2447 | LLVM_DEBUG(dbgs() << " Bottom up (preds) "); |
| 2448 | } else if (succ_L(NodeOrder, Succs&: N, DDG: DDG.get()) && |
| 2449 | llvm::set_is_subset(S1: N, S2: Nodes)) { |
| 2450 | R.insert_range(R&: N); |
| 2451 | Order = TopDown; |
| 2452 | LLVM_DEBUG(dbgs() << " Top down (succs) "); |
| 2453 | } else if (isIntersect(Set1&: N, Set2: Nodes, Result&: R)) { |
| 2454 | // If some of the successors are in the existing node-set, then use the |
| 2455 | // top-down ordering. |
| 2456 | Order = TopDown; |
| 2457 | LLVM_DEBUG(dbgs() << " Top down (intersect) "); |
| 2458 | } else if (NodeSets.size() == 1) { |
| 2459 | for (const auto &N : Nodes) |
| 2460 | if (N->Succs.size() == 0) |
| 2461 | R.insert(X: N); |
| 2462 | Order = BottomUp; |
| 2463 | LLVM_DEBUG(dbgs() << " Bottom up (all) "); |
| 2464 | } else { |
| 2465 | // Find the node with the highest ASAP. |
| 2466 | SUnit *maxASAP = nullptr; |
| 2467 | for (SUnit *SU : Nodes) { |
| 2468 | if (maxASAP == nullptr || getASAP(Node: SU) > getASAP(Node: maxASAP) || |
| 2469 | (getASAP(Node: SU) == getASAP(Node: maxASAP) && SU->NodeNum > maxASAP->NodeNum)) |
| 2470 | maxASAP = SU; |
| 2471 | } |
| 2472 | R.insert(X: maxASAP); |
| 2473 | Order = BottomUp; |
| 2474 | LLVM_DEBUG(dbgs() << " Bottom up (default) "); |
| 2475 | } |
| 2476 | |
| 2477 | while (!R.empty()) { |
| 2478 | if (Order == TopDown) { |
| 2479 | // Choose the node with the maximum height. If more than one, choose |
| 2480 | // the node with the maximum ZeroLatencyHeight. If still more than one, |
| 2481 | // choose the node with the lowest MOV. |
| 2482 | while (!R.empty()) { |
| 2483 | SUnit *maxHeight = nullptr; |
| 2484 | for (SUnit *I : R) { |
| 2485 | if (maxHeight == nullptr || getHeight(Node: I) > getHeight(Node: maxHeight)) |
| 2486 | maxHeight = I; |
| 2487 | else if (getHeight(Node: I) == getHeight(Node: maxHeight) && |
| 2488 | getZeroLatencyHeight(Node: I) > getZeroLatencyHeight(Node: maxHeight)) |
| 2489 | maxHeight = I; |
| 2490 | else if (getHeight(Node: I) == getHeight(Node: maxHeight) && |
| 2491 | getZeroLatencyHeight(Node: I) == |
| 2492 | getZeroLatencyHeight(Node: maxHeight) && |
| 2493 | getMOV(Node: I) < getMOV(Node: maxHeight)) |
| 2494 | maxHeight = I; |
| 2495 | } |
| 2496 | NodeOrder.insert(X: maxHeight); |
| 2497 | LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " "); |
| 2498 | R.remove(X: maxHeight); |
| 2499 | for (const auto &OE : DDG->getOutEdges(SU: maxHeight)) { |
| 2500 | SUnit *SU = OE.getDst(); |
| 2501 | if (Nodes.count(SU) == 0) |
| 2502 | continue; |
| 2503 | if (NodeOrder.contains(key: SU)) |
| 2504 | continue; |
| 2505 | if (OE.ignoreDependence(IgnoreAnti: false)) |
| 2506 | continue; |
| 2507 | R.insert(X: SU); |
| 2508 | } |
| 2509 | |
| 2510 | // FIXME: The following loop-carried dependencies may also need to be |
| 2511 | // considered. |
| 2512 | // - Physical register dependencies (true-dependence and WAW). |
| 2513 | // - Memory dependencies. |
| 2514 | for (const auto &IE : DDG->getInEdges(SU: maxHeight)) { |
| 2515 | SUnit *SU = IE.getSrc(); |
| 2516 | if (!IE.isAntiDep()) |
| 2517 | continue; |
| 2518 | if (Nodes.count(SU) == 0) |
| 2519 | continue; |
| 2520 | if (NodeOrder.contains(key: SU)) |
| 2521 | continue; |
| 2522 | R.insert(X: SU); |
| 2523 | } |
| 2524 | } |
| 2525 | Order = BottomUp; |
| 2526 | LLVM_DEBUG(dbgs() << "\n Switching order to bottom up "); |
| 2527 | SmallSetVector<SUnit *, 8> N; |
| 2528 | if (pred_L(NodeOrder, Preds&: N, DDG: DDG.get(), S: &Nodes)) |
| 2529 | R.insert_range(R&: N); |
| 2530 | } else { |
| 2531 | // Choose the node with the maximum depth. If more than one, choose |
| 2532 | // the node with the maximum ZeroLatencyDepth. If still more than one, |
| 2533 | // choose the node with the lowest MOV. |
| 2534 | while (!R.empty()) { |
| 2535 | SUnit *maxDepth = nullptr; |
| 2536 | for (SUnit *I : R) { |
| 2537 | if (maxDepth == nullptr || getDepth(Node: I) > getDepth(Node: maxDepth)) |
| 2538 | maxDepth = I; |
| 2539 | else if (getDepth(Node: I) == getDepth(Node: maxDepth) && |
| 2540 | getZeroLatencyDepth(Node: I) > getZeroLatencyDepth(Node: maxDepth)) |
| 2541 | maxDepth = I; |
| 2542 | else if (getDepth(Node: I) == getDepth(Node: maxDepth) && |
| 2543 | getZeroLatencyDepth(Node: I) == getZeroLatencyDepth(Node: maxDepth) && |
| 2544 | getMOV(Node: I) < getMOV(Node: maxDepth)) |
| 2545 | maxDepth = I; |
| 2546 | } |
| 2547 | NodeOrder.insert(X: maxDepth); |
| 2548 | LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " "); |
| 2549 | R.remove(X: maxDepth); |
| 2550 | if (Nodes.isExceedSU(SU: maxDepth)) { |
| 2551 | Order = TopDown; |
| 2552 | R.clear(); |
| 2553 | R.insert(X: Nodes.getNode(i: 0)); |
| 2554 | break; |
| 2555 | } |
| 2556 | for (const auto &IE : DDG->getInEdges(SU: maxDepth)) { |
| 2557 | SUnit *SU = IE.getSrc(); |
| 2558 | if (Nodes.count(SU) == 0) |
| 2559 | continue; |
| 2560 | if (NodeOrder.contains(key: SU)) |
| 2561 | continue; |
| 2562 | R.insert(X: SU); |
| 2563 | } |
| 2564 | |
| 2565 | // FIXME: The following loop-carried dependencies may also need to be |
| 2566 | // considered. |
| 2567 | // - Physical register dependencies (true-dependence and WAW). |
| 2568 | // - Memory dependencies. |
| 2569 | for (const auto &OE : DDG->getOutEdges(SU: maxDepth)) { |
| 2570 | SUnit *SU = OE.getDst(); |
| 2571 | if (!OE.isAntiDep()) |
| 2572 | continue; |
| 2573 | if (Nodes.count(SU) == 0) |
| 2574 | continue; |
| 2575 | if (NodeOrder.contains(key: SU)) |
| 2576 | continue; |
| 2577 | R.insert(X: SU); |
| 2578 | } |
| 2579 | } |
| 2580 | Order = TopDown; |
| 2581 | LLVM_DEBUG(dbgs() << "\n Switching order to top down "); |
| 2582 | SmallSetVector<SUnit *, 8> N; |
| 2583 | if (succ_L(NodeOrder, Succs&: N, DDG: DDG.get(), S: &Nodes)) |
| 2584 | R.insert_range(R&: N); |
| 2585 | } |
| 2586 | } |
| 2587 | LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n"); |
| 2588 | } |
| 2589 | |
| 2590 | LLVM_DEBUG({ |
| 2591 | dbgs() << "Node order: "; |
| 2592 | for (SUnit *I : NodeOrder) |
| 2593 | dbgs() << " " << I->NodeNum << " "; |
| 2594 | dbgs() << "\n"; |
| 2595 | }); |
| 2596 | } |
| 2597 | |
| 2598 | /// Process the nodes in the computed order and create the pipelined schedule |
| 2599 | /// of the instructions, if possible. Return true if a schedule is found. |
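| | /// A sketch of the insertion-window logic with illustrative values: at |
| | /// II = 4, a node whose scheduled predecessors force EarlyStart = 6 is tried |
| | /// at cycles 6..9 (EarlyStart + II - 1); if no cycle in that window has a |
| | /// free resource slot, this II fails and the outer loop retries with II + 1. |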
| 2600 | bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) { |
| 2601 | |
| 2602 | if (NodeOrder.empty()) { |
| 2603 | LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n"); |
| 2604 | return false; |
| 2605 | } |
| 2606 | |
| 2607 | bool scheduleFound = false; |
| 2608 | std::unique_ptr<HighRegisterPressureDetector> HRPDetector; |
| 2609 | if (LimitRegPressure) { |
| 2610 | HRPDetector = |
| 2611 | std::make_unique<HighRegisterPressureDetector>(args: Loop.getHeader(), args&: MF); |
| 2612 | HRPDetector->init(RCI: RegClassInfo); |
| 2613 | } |
| 2614 | // Keep increasing II until a valid schedule is found. |
| 2615 | for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) { |
| 2616 | Schedule.reset(); |
| 2617 | Schedule.setInitiationInterval(II); |
| 2618 | LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n"); |
| 2619 | |
| 2620 | SetVector<SUnit *>::iterator NI = NodeOrder.begin(); |
| 2621 | SetVector<SUnit *>::iterator NE = NodeOrder.end(); |
| 2622 | do { |
| 2623 | SUnit *SU = *NI; |
| 2624 | |
| 2625 | // Compute the schedule time for the instruction, which is based |
| 2626 | // upon the scheduled time for any predecessors/successors. |
| 2627 | int EarlyStart = INT_MIN; |
| 2628 | int LateStart = INT_MAX; |
| 2629 | Schedule.computeStart(SU, MaxEarlyStart: &EarlyStart, MinLateStart: &LateStart, II, DAG: this); |
| 2630 | LLVM_DEBUG({ |
| 2631 | dbgs() << "\n"; |
| 2632 | dbgs() << "Inst (" << SU->NodeNum << ") "; |
| 2633 | SU->getInstr()->dump(); |
| 2634 | dbgs() << "\n"; |
| 2635 | }); |
| 2636 | LLVM_DEBUG( |
| 2637 | dbgs() << format("\tes: %8x ls: %8x\n", EarlyStart, LateStart)); |
| 2638 | |
| 2639 | if (EarlyStart > LateStart) |
| 2640 | scheduleFound = false; |
| 2641 | else if (EarlyStart != INT_MIN && LateStart == INT_MAX) |
| 2642 | scheduleFound = |
| 2643 | Schedule.insert(SU, StartCycle: EarlyStart, EndCycle: EarlyStart + (int)II - 1, II); |
| 2644 | else if (EarlyStart == INT_MIN && LateStart != INT_MAX) |
| 2645 | scheduleFound = |
| 2646 | Schedule.insert(SU, StartCycle: LateStart, EndCycle: LateStart - (int)II + 1, II); |
| 2647 | else if (EarlyStart != INT_MIN && LateStart != INT_MAX) { |
| 2648 | LateStart = std::min(a: LateStart, b: EarlyStart + (int)II - 1); |
| 2649 | // When scheduling a Phi it is better to start at the late cycle and |
| 2650 | // go backwards. The default order may insert the Phi too far away |
| 2651 | // from its first dependence. |
| 2652 | // Also, do backward search when all scheduled predecessors are |
| 2653 | // loop-carried output/order dependencies. Empirically, there are also |
| 2654 | // cases where scheduling becomes possible with backward search. |
| 2655 | if (SU->getInstr()->isPHI() || |
| 2656 | Schedule.onlyHasLoopCarriedOutputOrOrderPreds(SU, DDG: this->getDDG())) |
| 2657 | scheduleFound = Schedule.insert(SU, StartCycle: LateStart, EndCycle: EarlyStart, II); |
| 2658 | else |
| 2659 | scheduleFound = Schedule.insert(SU, StartCycle: EarlyStart, EndCycle: LateStart, II); |
| 2660 | } else { |
| 2661 | int FirstCycle = Schedule.getFirstCycle(); |
| 2662 | scheduleFound = Schedule.insert(SU, StartCycle: FirstCycle + getASAP(Node: SU), |
| 2663 | EndCycle: FirstCycle + getASAP(Node: SU) + II - 1, II); |
| 2664 | } |
| 2665 | |
| 2666 | // Even if we find a schedule, make sure the schedule doesn't exceed the |
| 2667 | // allowable number of stages. We keep trying if this happens. |
| 2668 | if (scheduleFound) |
| 2669 | if (SwpMaxStages > -1 && |
| 2670 | Schedule.getMaxStageCount() > (unsigned)SwpMaxStages) |
| 2671 | scheduleFound = false; |
| 2672 | |
| 2673 | LLVM_DEBUG({ |
| 2674 | if (!scheduleFound) |
| 2675 | dbgs() << "\tCan't schedule\n"; |
| 2676 | }); |
| 2677 | } while (++NI != NE && scheduleFound); |
| 2678 | |
| 2679 | // If a schedule is found, ensure non-pipelined instructions are in stage 0 |
| 2680 | if (scheduleFound) |
| 2681 | scheduleFound = |
| 2682 | Schedule.normalizeNonPipelinedInstructions(SSD: this, PLI: LoopPipelinerInfo); |
| 2683 | |
| 2684 | // If a schedule is found, check if it is a valid schedule too. |
| 2685 | if (scheduleFound) |
| 2686 | scheduleFound = Schedule.isValidSchedule(SSD: this); |
| 2687 | |
| 2688 | // If a schedule was found and the option is enabled, check if the schedule |
| 2689 | // might generate additional register spills/fills. |
| 2690 | if (scheduleFound && LimitRegPressure) |
| 2691 | scheduleFound = |
| 2692 | !HRPDetector->detect(SSD: this, Schedule, MaxStage: Schedule.getMaxStageCount()); |
| 2693 | } |
| 2694 | |
| 2695 | LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound |
| 2696 | << " (II=" << Schedule.getInitiationInterval() |
| 2697 | << ")\n" ); |
| 2698 | |
| 2699 | if (scheduleFound) { |
| 2700 | scheduleFound = LoopPipelinerInfo->shouldUseSchedule(SSD&: *this, SMS&: Schedule); |
| 2701 | if (!scheduleFound) |
| 2702 | LLVM_DEBUG(dbgs() << "Target rejected schedule\n" ); |
| 2703 | } |
| 2704 | |
| 2705 | if (scheduleFound) { |
| 2706 | Schedule.finalizeSchedule(SSD: this); |
| 2707 | Pass.ORE->emit(RemarkBuilder: [&]() { |
| 2708 | return MachineOptimizationRemarkAnalysis( |
| 2709 | DEBUG_TYPE, "schedule" , Loop.getStartLoc(), Loop.getHeader()) |
| 2710 | << "Schedule found with Initiation Interval: " |
| 2711 | << ore::NV("II" , Schedule.getInitiationInterval()) |
| 2712 | << ", MaxStageCount: " |
| 2713 | << ore::NV("MaxStageCount" , Schedule.getMaxStageCount()); |
| 2714 | }); |
| 2715 | } else |
| 2716 | Schedule.reset(); |
| 2717 | |
| 2718 | return scheduleFound && Schedule.getMaxStageCount() > 0; |
| 2719 | } |
| 2720 | |
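/// Return the unique virtual register that MI uses and that is defined
/// within MI's own basic block. Return an invalid register if any use is a
/// physical register or if more than one operand qualifies.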
static Register findUniqueOperandDefinedInLoop(const MachineInstr &MI) {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  Register Result;
  for (const MachineOperand &Use : MI.all_uses()) {
    Register Reg = Use.getReg();
    if (!Reg.isVirtual())
      return Register();
    if (MRI.getVRegDef(Reg)->getParent() != MI.getParent())
      continue;
    if (Result)
      return Register();
    Result = Reg;
  }
  return Result;
}

/// When Op is a value that is incremented recursively in a loop and there is
/// a unique instruction that increments it, return true and set Value to the
/// increment amount.
static bool findLoopIncrementValue(const MachineOperand &Op, int &Value) {
  if (!Op.isReg() || !Op.getReg().isVirtual())
    return false;

  Register OrgReg = Op.getReg();
  Register CurReg = OrgReg;
  const MachineBasicBlock *LoopBB = Op.getParent()->getParent();
  const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();

  const TargetInstrInfo *TII =
      LoopBB->getParent()->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI =
      LoopBB->getParent()->getSubtarget().getRegisterInfo();

  MachineInstr *Phi = nullptr;
  MachineInstr *Increment = nullptr;

  // Traverse the chain of definitions until we return to Op or reach an
  // instruction that does not satisfy the conditions.
  // Acceptable example:
  //   bb.0:
  //     %0 = PHI %3, %bb.0, ...
  //     %2 = ADD %0, Value
  //     ... = LOAD %2(Op)
  //     %3 = COPY %2
  while (true) {
    if (!CurReg.isValid() || !CurReg.isVirtual())
      return false;
    MachineInstr *Def = MRI.getVRegDef(CurReg);
    if (Def->getParent() != LoopBB)
      return false;

    if (Def->isCopy()) {
      // Ignore copy instructions unless they contain subregisters.
      if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
        return false;
      CurReg = Def->getOperand(1).getReg();
    } else if (Def->isPHI()) {
      // There must be just one Phi.
      if (Phi)
        return false;
      Phi = Def;
      CurReg = getLoopPhiReg(*Def, LoopBB);
    } else if (TII->getIncrementValue(*Def, Value)) {
      // Potentially a unique increment.
      if (Increment)
        // Multiple increments exist.
        return false;

      const MachineOperand *BaseOp;
      int64_t Offset;
      bool OffsetIsScalable;
      if (TII->getMemOperandWithOffset(*Def, BaseOp, Offset, OffsetIsScalable,
                                       TRI)) {
        // Pre/post increment instruction.
        CurReg = BaseOp->getReg();
      } else {
        // If only one of the operands is defined within the loop, it is
        // assumed to be the incremented value.
        CurReg = findUniqueOperandDefinedInLoop(*Def);
        if (!CurReg.isValid())
          return false;
      }
      Increment = Def;
    } else {
      return false;
    }
    if (CurReg == OrgReg)
      break;
  }

  if (!Phi || !Increment)
    return false;

  return true;
}

/// Return true if we can compute the amount the instruction changes
/// during each iteration. Set Delta to the amount of the change.
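/// For example, if the base register of a load advances by 4 each iteration
/// (e.g. via a post-increment access, or an ADD feeding back through a Phi),
/// this returns true with Delta = 4.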
bool SwingSchedulerDAG::computeDelta(const MachineInstr &MI, int &Delta) const {
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  const MachineOperand *BaseOp;
  int64_t Offset;
  bool OffsetIsScalable;
  if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
    return false;

  // FIXME: This algorithm assumes instructions have fixed-size offsets.
  if (OffsetIsScalable)
    return false;

  if (!BaseOp->isReg())
    return false;

  return findLoopIncrementValue(*BaseOp, Delta);
}

/// Check if we can change the instruction to use an offset value from the
/// previous iteration. If so, return true and set the base and offset values
/// so that we can rewrite the load, if necessary.
///   v1 = Phi(v0, v3)
///   v2 = load v1, 0
///   v3 = post_store v1, 4, x
/// This function enables the load to be rewritten as v2 = load v3, 4.
bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
                                              unsigned &BasePos,
                                              unsigned &OffsetPos,
                                              Register &NewBase,
                                              int64_t &Offset) {
  // Get the load instruction.
  if (TII->isPostIncrement(*MI))
    return false;
  unsigned BasePosLd, OffsetPosLd;
  if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
    return false;
  Register BaseReg = MI->getOperand(BasePosLd).getReg();

  // Look for the Phi instruction.
  MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
  MachineInstr *Phi = MRI.getVRegDef(BaseReg);
  if (!Phi || !Phi->isPHI())
    return false;
  // Get the register defined in the loop block.
  Register PrevReg = getLoopPhiReg(*Phi, MI->getParent());
  if (!PrevReg)
    return false;

  // Check for the post-increment load/store instruction.
  MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
  if (!PrevDef || PrevDef == MI)
    return false;

  if (!TII->isPostIncrement(*PrevDef))
    return false;

  unsigned BasePos1 = 0, OffsetPos1 = 0;
  if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
    return false;

  // Make sure that the instructions do not access the same memory location in
  // the next iteration.
  int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
  int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
  MachineInstr *NewMI = MF.CloneMachineInstr(MI);
  NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
  bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
  MF.deleteMachineInstr(NewMI);
  if (!Disjoint)
    return false;

  // Set the return values once we have determined that we will return true.
  BasePos = BasePosLd;
  OffsetPos = OffsetPosLd;
  NewBase = PrevReg;
  Offset = StoreOffset;
  return true;
}

/// Apply changes to the instruction if needed. The changes are needed to
/// improve the scheduling and depend upon the final schedule.
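/// For instance, if the instruction's base register is defined by an
/// instruction scheduled in a later stage, the (NewBase, Delta) pair recorded
/// in InstrChanges is applied: the base register may be replaced and the
/// immediate offset is adjusted by Delta for each stage of separation.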
void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
                                         SMSchedule &Schedule) {
  SUnit *SU = getSUnit(MI);
  DenseMap<SUnit *, std::pair<Register, int64_t>>::iterator It =
      InstrChanges.find(SU);
  if (It != InstrChanges.end()) {
    std::pair<Register, int64_t> RegAndOffset = It->second;
    unsigned BasePos, OffsetPos;
    if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
      return;
    Register BaseReg = MI->getOperand(BasePos).getReg();
    MachineInstr *LoopDef = findDefInLoop(BaseReg);
    int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
    int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
    int BaseStageNum = Schedule.stageScheduled(SU);
    int BaseCycleNum = Schedule.cycleScheduled(SU);
    if (BaseStageNum < DefStageNum) {
      MachineInstr *NewMI = MF.CloneMachineInstr(MI);
      int OffsetDiff = DefStageNum - BaseStageNum;
      if (DefCycleNum < BaseCycleNum) {
        NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
        if (OffsetDiff > 0)
          --OffsetDiff;
      }
      int64_t NewOffset =
          MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
      NewMI->getOperand(OffsetPos).setImm(NewOffset);
      SU->setInstr(NewMI);
      MISUnitMap[NewMI] = SU;
      NewMIs[MI] = NewMI;
    }
  }
}

/// Return the instruction in the loop that defines the register.
/// If the definition is a Phi, then follow the Phi operand to
/// the instruction in the loop.
MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
  SmallPtrSet<MachineInstr *, 8> Visited;
  MachineInstr *Def = MRI.getVRegDef(Reg);
  while (Def->isPHI()) {
    if (!Visited.insert(Def).second)
      break;
    for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
      if (Def->getOperand(i + 1).getMBB() == BB) {
        Def = MRI.getVRegDef(Def->getOperand(i).getReg());
        break;
      }
  }
  return Def;
}

/// Return false if there is no overlap between the region accessed by BaseMI
/// in an iteration and the region accessed by OtherMI in subsequent
/// iterations.
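/// Both accesses must advance by the same per-iteration Delta for the check
/// to apply. The test then reduces to an interval comparison: BaseMI touches
/// [OffsetB, OffsetB + SizeB) in iteration i, while OtherMI touches
/// [OffsetO + Delta, OffsetO + Delta + SizeO) in iteration i + 1; if those
/// intervals are disjoint, the function reports no overlap.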
bool SwingSchedulerDAG::mayOverlapInLaterIter(
    const MachineInstr *BaseMI, const MachineInstr *OtherMI) const {
  int DeltaB, DeltaO, Delta;
  if (!computeDelta(*BaseMI, DeltaB) || !computeDelta(*OtherMI, DeltaO) ||
      DeltaB != DeltaO)
    return true;
  Delta = DeltaB;

  const MachineOperand *BaseOpB, *BaseOpO;
  int64_t OffsetB, OffsetO;
  bool OffsetBIsScalable, OffsetOIsScalable;
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  if (!TII->getMemOperandWithOffset(*BaseMI, BaseOpB, OffsetB,
                                    OffsetBIsScalable, TRI) ||
      !TII->getMemOperandWithOffset(*OtherMI, BaseOpO, OffsetO,
                                    OffsetOIsScalable, TRI))
    return true;

  if (OffsetBIsScalable || OffsetOIsScalable)
    return true;

  if (!BaseOpB->isIdenticalTo(*BaseOpO)) {
    // Accept cases with different base operands but the same initial values.
    // Typically this happens when pre/post increment is used.

    if (!BaseOpB->isReg() || !BaseOpO->isReg())
      return true;
    Register RegB = BaseOpB->getReg(), RegO = BaseOpO->getReg();
    if (!RegB.isVirtual() || !RegO.isVirtual())
      return true;

    MachineInstr *DefB = MRI.getVRegDef(BaseOpB->getReg());
    MachineInstr *DefO = MRI.getVRegDef(BaseOpO->getReg());
    if (!DefB || !DefO || !DefB->isPHI() || !DefO->isPHI())
      return true;

    Register InitValB;
    Register LoopValB;
    Register InitValO;
    Register LoopValO;
    getPhiRegs(*DefB, BB, InitValB, LoopValB);
    getPhiRegs(*DefO, BB, InitValO, LoopValO);
    MachineInstr *InitDefB = MRI.getVRegDef(InitValB);
    MachineInstr *InitDefO = MRI.getVRegDef(InitValO);

    if (!InitDefB->isIdenticalTo(*InitDefO))
      return true;
  }

  LocationSize AccessSizeB = (*BaseMI->memoperands_begin())->getSize();
  LocationSize AccessSizeO = (*OtherMI->memoperands_begin())->getSize();

  // This is the main test, which checks the offset values and the loop
  // increment value to determine if the accesses may be loop carried.
  if (!AccessSizeB.hasValue() || !AccessSizeO.hasValue())
    return true;

  LLVM_DEBUG({
    dbgs() << "Overlap check:\n";
    dbgs() << "  BaseMI: ";
    BaseMI->dump();
    dbgs() << "    Base + " << OffsetB << " + I * " << Delta
           << ", Len: " << AccessSizeB.getValue() << "\n";
    dbgs() << "  OtherMI: ";
    OtherMI->dump();
    dbgs() << "    Base + " << OffsetO << " + I * " << Delta
           << ", Len: " << AccessSizeO.getValue() << "\n";
  });

  // Excessive overlap may be detected in strided patterns.
  // For example, the memory addresses of the store and the load in
  //   for (i = 0; i < n; i += 2) a[i + 1] = a[i];
  // are assumed to overlap.
  if (Delta < 0) {
    int64_t BaseMinAddr = OffsetB;
    int64_t OtherNextIterMaxAddr = OffsetO + Delta + AccessSizeO.getValue() - 1;
    if (BaseMinAddr > OtherNextIterMaxAddr) {
      LLVM_DEBUG(dbgs() << "  Result: No overlap\n");
      return false;
    }
  } else {
    int64_t BaseMaxAddr = OffsetB + AccessSizeB.getValue() - 1;
    int64_t OtherNextIterMinAddr = OffsetO + Delta;
    if (BaseMaxAddr < OtherNextIterMinAddr) {
      LLVM_DEBUG(dbgs() << "  Result: No overlap\n");
      return false;
    }
  }
  LLVM_DEBUG(dbgs() << "  Result: Overlap\n");
  return true;
}

/// Return true for an order or output dependence that is potentially loop
/// carried. A dependence is loop carried if the destination defines a value
/// that may be used or defined by the source in a subsequent iteration.
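/// For example, with accesses of the form
///   ... = LOAD %base, 0        ; reads  [base, base + 4)
///   STORE %base, 4, ...        ; writes [base + 4, base + 8)
/// where %base advances by 8 each iteration, the regions touched in
/// different iterations never intersect, so the order dependence between the
/// two is not loop carried. (Pseudo opcodes and 4-byte accesses are for
/// illustration only.)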
bool SwingSchedulerDAG::isLoopCarriedDep(
    const SwingSchedulerDDGEdge &Edge) const {
  if ((!Edge.isOrderDep() && !Edge.isOutputDep()) || Edge.isArtificial() ||
      Edge.getDst()->isBoundaryNode())
    return false;

  if (!SwpPruneLoopCarried)
    return true;

  if (Edge.isOutputDep())
    return true;

  MachineInstr *SI = Edge.getSrc()->getInstr();
  MachineInstr *DI = Edge.getDst()->getInstr();
  assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");

  // Assume ordered loads and stores may have a loop carried dependence.
  if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
      SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
      SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
    return true;

  if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
    return false;

  // The conservative assumption is that a dependence between memory
  // operations may be loop carried. The following code checks when it can be
  // proved that there is no loop carried dependence.
  return mayOverlapInLaterIter(DI, SI);
}

void SwingSchedulerDAG::postProcessDAG() {
  for (auto &M : Mutations)
    M->apply(this);
}

/// Try to schedule the node at the specified StartCycle and continue
/// until the node is scheduled or the EndCycle is reached. This function
/// returns true if the node is scheduled. This routine may search either
/// forward or backward for a place to insert the instruction based upon
/// the relative values of StartCycle and EndCycle.
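/// For example, insert(SU, 2, 5, II) tries cycles 2, 3, 4, 5 in that order,
/// while insert(SU, 5, 2, II) tries 5, 4, 3, 2; the first cycle whose
/// resources can be reserved (or any cycle, for zero-cost instructions) is
/// taken.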
bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
  bool forward = true;
  LLVM_DEBUG({
    dbgs() << "Trying to insert node between " << StartCycle << " and "
           << EndCycle << " II: " << II << "\n";
  });
  if (StartCycle > EndCycle)
    forward = false;

  // The terminating condition depends on the direction.
  int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
  for (int curCycle = StartCycle; curCycle != termCycle;
       forward ? ++curCycle : --curCycle) {

    if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
        ProcItinResources.canReserveResources(*SU, curCycle)) {
      LLVM_DEBUG({
        dbgs() << "\tinsert at cycle " << curCycle << " ";
        SU->getInstr()->dump();
      });

      if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
        ProcItinResources.reserveResources(*SU, curCycle);
      ScheduledInstrs[curCycle].push_back(SU);
      InstrToCycle.insert(std::make_pair(SU, curCycle));
      if (curCycle > LastCycle)
        LastCycle = curCycle;
      if (curCycle < FirstCycle)
        FirstCycle = curCycle;
      return true;
    }
    LLVM_DEBUG({
      dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
      SU->getInstr()->dump();
    });
  }
  return false;
}

// Return the cycle of the earliest scheduled instruction in the chain.
int SMSchedule::earliestCycleInChain(const SwingSchedulerDDGEdge &Dep,
                                     const SwingSchedulerDDG *DDG) {
  SmallPtrSet<SUnit *, 8> Visited;
  SmallVector<SwingSchedulerDDGEdge, 8> Worklist;
  Worklist.push_back(Dep);
  int EarlyCycle = INT_MAX;
  while (!Worklist.empty()) {
    const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
    SUnit *PrevSU = Cur.getSrc();
    if (Visited.count(PrevSU))
      continue;
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
    if (it == InstrToCycle.end())
      continue;
    EarlyCycle = std::min(EarlyCycle, it->second);
    for (const auto &IE : DDG->getInEdges(PrevSU))
      if (IE.isOrderDep() || IE.isOutputDep())
        Worklist.push_back(IE);
    Visited.insert(PrevSU);
  }
  return EarlyCycle;
}

// Return the cycle of the latest scheduled instruction in the chain.
int SMSchedule::latestCycleInChain(const SwingSchedulerDDGEdge &Dep,
                                   const SwingSchedulerDDG *DDG) {
  SmallPtrSet<SUnit *, 8> Visited;
  SmallVector<SwingSchedulerDDGEdge, 8> Worklist;
  Worklist.push_back(Dep);
  int LateCycle = INT_MIN;
  while (!Worklist.empty()) {
    const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
    SUnit *SuccSU = Cur.getDst();
    if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
      continue;
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
    if (it == InstrToCycle.end())
      continue;
    LateCycle = std::max(LateCycle, it->second);
    for (const auto &OE : DDG->getOutEdges(SuccSU))
      if (OE.isOrderDep() || OE.isOutputDep())
        Worklist.push_back(OE);
    Visited.insert(SuccSU);
  }
  return LateCycle;
}

/// If an instruction has a use that spans multiple iterations, then return
/// the Phi node it reaches; otherwise return nullptr. These instructions are
/// characterized by having a back-edge to a Phi, which contains a reference
/// to another Phi.
static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) {
  for (auto &P : SU->Preds)
    if (P.getKind() == SDep::Anti && P.getSUnit()->getInstr()->isPHI())
      for (auto &S : P.getSUnit()->Succs)
        if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
          return P.getSUnit();
  return nullptr;
}

/// Compute the scheduling start slot for the instruction. The start slot
/// depends on any predecessor or successor nodes scheduled already.
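/// For a predecessor scheduled at cycle C, connected by an edge with latency
/// L and loop distance D, the node cannot start before C + L - D * II; for a
/// scheduled successor it cannot start after C - L + D * II. MaxEarlyStart
/// and MinLateStart accumulate the tightest such bounds.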
void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
                              int II, SwingSchedulerDAG *DAG) {
  const SwingSchedulerDDG *DDG = DAG->getDDG();

  // Iterate over each instruction that has been scheduled already. The start
  // slot computation depends on whether the previously scheduled instruction
  // is a predecessor or successor of the specified instruction.
  for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
    for (SUnit *I : getInstructions(cycle)) {
      for (const auto &IE : DDG->getInEdges(SU)) {
        if (IE.getSrc() == I) {
          // FIXME: Add reverse edge to `DDG` instead of calling
          // `isLoopCarriedDep`
          if (DAG->isLoopCarriedDep(IE)) {
            int End = earliestCycleInChain(IE, DDG) + (II - 1);
            *MinLateStart = std::min(*MinLateStart, End);
          }
          int EarlyStart = cycle + IE.getLatency() - IE.getDistance() * II;
          *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
        }
      }

      for (const auto &OE : DDG->getOutEdges(SU)) {
        if (OE.getDst() == I) {
          // FIXME: Add reverse edge to `DDG` instead of calling
          // `isLoopCarriedDep`
          if (DAG->isLoopCarriedDep(OE)) {
            int Start = latestCycleInChain(OE, DDG) + 1 - II;
            *MaxEarlyStart = std::max(*MaxEarlyStart, Start);
          }
          int LateStart = cycle - OE.getLatency() + OE.getDistance() * II;
          *MinLateStart = std::min(*MinLateStart, LateStart);
        }
      }

      SUnit *BE = multipleIterations(I, DAG);
      for (const auto &Dep : SU->Preds) {
        // For an instruction that requires multiple iterations, make sure
        // that the dependent instruction is not scheduled past the
        // definition.
        if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
            !SU->isPred(I))
          *MinLateStart = std::min(*MinLateStart, cycle);
      }
    }
  }
}

/// Order the instructions within a cycle so that the definitions occur
/// before the uses. The instruction is pushed to the front of the list if it
/// must precede a use already in the list, and to the end otherwise.
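/// For example, if an instruction already in the list reads a register this
/// instruction defines (and is in the same or an earlier stage), the new
/// instruction is pushed to the front; when both a def and a use constrain
/// the position, the conflicting entries are erased and all three
/// instructions are re-inserted recursively.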
void SMSchedule::orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU,
                                 std::deque<SUnit *> &Insts) const {
  MachineInstr *MI = SU->getInstr();
  bool OrderBeforeUse = false;
  bool OrderAfterDef = false;
  bool OrderBeforeDef = false;
  unsigned MoveDef = 0;
  unsigned MoveUse = 0;
  int StageInst1 = stageScheduled(SU);
  const SwingSchedulerDDG *DDG = SSD->getDDG();

  unsigned Pos = 0;
  for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
       ++I, ++Pos) {
    for (MachineOperand &MO : MI->operands()) {
      if (!MO.isReg() || !MO.getReg().isVirtual())
        continue;

      Register Reg = MO.getReg();
      unsigned BasePos, OffsetPos;
      if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
        if (MI->getOperand(BasePos).getReg() == Reg)
          if (Register NewReg = SSD->getInstrBaseReg(SU))
            Reg = NewReg;
      bool Reads, Writes;
      std::tie(Reads, Writes) =
          (*I)->getInstr()->readsWritesVirtualRegister(Reg);
      if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
        // Add the instruction after the scheduled instruction.
        OrderAfterDef = true;
        MoveDef = Pos;
      } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
        if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
          OrderBeforeUse = true;
          if (MoveUse == 0)
            MoveUse = Pos;
        } else {
          OrderAfterDef = true;
          MoveDef = Pos;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
        if (MoveUse != 0) {
          OrderAfterDef = true;
          MoveDef = Pos - 1;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
        // Add the instruction before the scheduled instruction.
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
                 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
        if (MoveUse == 0) {
          OrderBeforeDef = true;
          MoveUse = Pos;
        }
      }
    }
    // Check for order dependences between instructions. Make sure the source
    // is ordered before the destination.
    for (auto &OE : DDG->getOutEdges(SU)) {
      if (OE.getDst() != *I)
        continue;
      if (OE.isOrderDep() && stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if (Pos < MoveUse)
          MoveUse = Pos;
      }
      // We did not handle HW dependences in the previous loop, and we
      // normally set Latency = 0 for Anti/Output deps, so there may be nodes
      // in the same cycle with an Anti/Output dependence on HW regs.
      else if ((OE.isAntiDep() || OE.isOutputDep()) &&
               stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if ((MoveUse == 0) || (Pos < MoveUse))
          MoveUse = Pos;
      }
    }
    for (auto &IE : DDG->getInEdges(SU)) {
      if (IE.getSrc() != *I)
        continue;
      if ((IE.isAntiDep() || IE.isOutputDep() || IE.isOrderDep()) &&
          stageScheduled(*I) == StageInst1) {
        OrderAfterDef = true;
        MoveDef = Pos;
      }
    }
  }

  // A circular dependence.
  if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
    OrderBeforeUse = false;

  // OrderAfterDef takes precedence over OrderBeforeDef. The latter is due
  // to a loop-carried dependence.
  if (OrderBeforeDef)
    OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);

  // The uncommon case when the instruction order needs to be updated because
  // there is both a use and def.
  if (OrderBeforeUse && OrderAfterDef) {
    SUnit *UseSU = Insts.at(MoveUse);
    SUnit *DefSU = Insts.at(MoveDef);
    if (MoveUse > MoveDef) {
      Insts.erase(Insts.begin() + MoveUse);
      Insts.erase(Insts.begin() + MoveDef);
    } else {
      Insts.erase(Insts.begin() + MoveDef);
      Insts.erase(Insts.begin() + MoveUse);
    }
    orderDependence(SSD, UseSU, Insts);
    orderDependence(SSD, SU, Insts);
    orderDependence(SSD, DefSU, Insts);
    return;
  }
  // Put the new instruction first if there is a use in the list. Otherwise,
  // put it at the end of the list.
  if (OrderBeforeUse)
    Insts.push_front(SU);
  else
    Insts.push_back(SU);
}

/// Return true if the scheduled Phi has a loop carried operand.
bool SMSchedule::isLoopCarried(const SwingSchedulerDAG *SSD,
                               MachineInstr &Phi) const {
  if (!Phi.isPHI())
    return false;
  SUnit *DefSU = SSD->getSUnit(&Phi);
  unsigned DefCycle = cycleScheduled(DefSU);
  int DefStage = stageScheduled(DefSU);

  Register InitVal;
  Register LoopVal;
  getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
  SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
  if (!UseSU)
    return true;
  if (UseSU->getInstr()->isPHI())
    return true;
  unsigned LoopCycle = cycleScheduled(UseSU);
  int LoopStage = stageScheduled(UseSU);
  return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
}

/// Return true if the instruction is a definition that is loop carried
/// and defines the use on the next iteration.
///   v1 = phi(v2, v3)
///   (Def) v3 = op v1
///   (MO)     = v1
/// If MO appears before Def, then v1 and v3 may get assigned to the same
/// register.
bool SMSchedule::isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD,
                                       MachineInstr *Def,
                                       MachineOperand &MO) const {
  if (!MO.isReg())
    return false;
  if (Def->isPHI())
    return false;
  MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
  if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
    return false;
  if (!isLoopCarried(SSD, *Phi))
    return false;
  Register LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
  for (MachineOperand &DMO : Def->all_defs()) {
    if (DMO.getReg() == LoopReg)
      return true;
  }
  return false;
}

/// Return true if all scheduled predecessors are loop-carried output/order
/// dependencies.
bool SMSchedule::onlyHasLoopCarriedOutputOrOrderPreds(
    SUnit *SU, const SwingSchedulerDDG *DDG) const {
  for (const auto &IE : DDG->getInEdges(SU))
    if (InstrToCycle.count(IE.getSrc()))
      return false;
  return true;
}

/// Determine transitive dependences of unpipelineable instructions.
SmallSet<SUnit *, 8> SMSchedule::computeUnpipelineableNodes(
    SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
  SmallSet<SUnit *, 8> DoNotPipeline;
  SmallVector<SUnit *, 8> Worklist;

  for (auto &SU : SSD->SUnits)
    if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
      Worklist.push_back(&SU);

  const SwingSchedulerDDG *DDG = SSD->getDDG();
  while (!Worklist.empty()) {
    auto SU = Worklist.pop_back_val();
    if (DoNotPipeline.count(SU))
      continue;
    LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
    DoNotPipeline.insert(SU);
    for (const auto &IE : DDG->getInEdges(SU))
      Worklist.push_back(IE.getSrc());

    // To preserve previous behavior and prevent regression
    // FIXME: Remove if this doesn't have significant impact on performance
    for (const auto &OE : DDG->getOutEdges(SU))
      if (OE.getDistance() == 1)
        Worklist.push_back(OE.getDst());
  }
  return DoNotPipeline;
}

// Determine all instructions upon which any unpipelineable instruction
// depends and ensure that they are in stage 0. If unable to do so, return
// false.
bool SMSchedule::normalizeNonPipelinedInstructions(
    SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
  SmallSet<SUnit *, 8> DNP = computeUnpipelineableNodes(SSD, PLI);

  int NewLastCycle = INT_MIN;
  for (SUnit &SU : SSD->SUnits) {
    if (!SU.isInstr())
      continue;
    if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
      NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
      continue;
    }

    // Put the non-pipelined instruction as early as possible in the schedule.
    int NewCycle = getFirstCycle();
    for (const auto &IE : SSD->getDDG()->getInEdges(&SU))
      if (IE.getDistance() == 0)
        NewCycle = std::max(InstrToCycle[IE.getSrc()], NewCycle);

    // To preserve previous behavior and prevent regression
    // FIXME: Remove if this doesn't have significant impact on performance
    for (auto &OE : SSD->getDDG()->getOutEdges(&SU))
      if (OE.getDistance() == 1)
        NewCycle = std::max(InstrToCycle[OE.getDst()], NewCycle);

    int OldCycle = InstrToCycle[&SU];
    if (OldCycle != NewCycle) {
      InstrToCycle[&SU] = NewCycle;
      auto &OldS = getInstructions(OldCycle);
      llvm::erase(OldS, &SU);
      getInstructions(NewCycle).emplace_back(&SU);
      LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
                        << ") is not pipelined; moving from cycle " << OldCycle
                        << " to " << NewCycle << " Instr:" << *SU.getInstr());
    }

    // We traverse the SUs in the order of the original basic block. Computing
    // NewCycle in this order normally works fine because all dependencies
    // (except for loop-carried dependencies) don't violate the original
    // order. However, an artificial dependency (e.g., added by
    // CopyToPhiMutation) can break it. That is, there may exist an artificial
    // dependency from bottom to top. In such a case, NewCycle may become too
    // large to be scheduled in Stage 0. For example, assume that Inst0 is in
    // DNP in the following case:
    //
    //          | Inst0 <-+
    // SU order |         | artificial dep
    //          | Inst1 --+
    //          v
    //
    // If Inst1 is scheduled at cycle N and is not at Stage 0, then NewCycle
    // of Inst0 must be greater than or equal to N so that Inst0 is not
    // scheduled at Stage 0. In such cases, we reject this schedule at this
    // time.
    // FIXME: The reason for this is the existence of artificial dependencies
    // that contradict the original SU order. If ignoring artificial
    // dependencies does not affect correctness, then it is better to ignore
    // them.
    if (FirstCycle + InitiationInterval <= NewCycle)
      return false;

    NewLastCycle = std::max(NewLastCycle, NewCycle);
  }
  LastCycle = NewLastCycle;
  return true;
}

// Check if the generated schedule is valid. This function checks if
// an instruction that uses a physical register is scheduled in a
// different stage than the definition. The pipeliner does not handle
// physical register values that may cross a basic block boundary.
// Furthermore, if a physical def/use pair is assigned to the same
// cycle, orderDependence does not guarantee def/use ordering, so that
// case should be considered invalid. (The test checks for both
// earlier and same-cycle use to be more robust.)
bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
  for (SUnit &SU : SSD->SUnits) {
    if (!SU.hasPhysRegDefs)
      continue;
    int StageDef = stageScheduled(&SU);
    int CycleDef = InstrToCycle[&SU];
    assert(StageDef != -1 && "Instruction should have been scheduled.");
    for (auto &OE : SSD->getDDG()->getOutEdges(&SU)) {
      SUnit *Dst = OE.getDst();
      if (OE.isAssignedRegDep() && !Dst->isBoundaryNode())
        if (OE.getReg().isPhysical()) {
          if (stageScheduled(Dst) != StageDef)
            return false;
          if (InstrToCycle[Dst] <= CycleDef)
            return false;
        }
    }
  }
  return true;
}

/// A property of the node order in swing-modulo-scheduling is
/// that for nodes outside circuits the following holds:
/// none of them is scheduled after both a successor and a
/// predecessor.
/// The method below checks whether the property is met.
/// If not, debug information is printed and statistics information updated.
/// Note that we do not use an assert statement.
/// The reason is that although an invalid node order may prevent
/// the pipeliner from finding a pipelined schedule for arbitrary II,
/// it does not lead to the generation of incorrect code.
void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {

  // A sorted vector that maps each SUnit to its index in the NodeOrder.
  typedef std::pair<SUnit *, unsigned> UnitIndex;
  std::vector<UnitIndex> Indices;
  Indices.reserve(NodeOrder.size());

  for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
    Indices.push_back(std::make_pair(NodeOrder[i], i));

  auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
    return std::get<0>(i1) < std::get<0>(i2);
  };

  // Sort, so that we can perform a binary search.
  llvm::sort(Indices, CompareKey);

  bool Valid = true;
  (void)Valid;
  // For each SUnit in the NodeOrder, check whether it appears after both a
  // successor and a predecessor of the SUnit. If this is the case, and the
  // SUnit is not part of a circuit, then the NodeOrder is not valid.
  for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
    SUnit *SU = NodeOrder[i];
    unsigned Index = i;

    bool PredBefore = false;
    bool SuccBefore = false;

    SUnit *Succ;
    SUnit *Pred;
    (void)Succ;
    (void)Pred;

    for (const auto &IE : DDG->getInEdges(SU)) {
      SUnit *PredSU = IE.getSrc();
      unsigned PredIndex = std::get<1>(
          *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
      if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
        PredBefore = true;
        Pred = PredSU;
        break;
      }
    }

    for (const auto &OE : DDG->getOutEdges(SU)) {
      SUnit *SuccSU = OE.getDst();
      // Do not process a boundary node, it was not included in NodeOrder,
      // hence not in Indices either, call to std::lower_bound() below will
      // return Indices.end().
      if (SuccSU->isBoundaryNode())
        continue;
      unsigned SuccIndex = std::get<1>(
          *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
      if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
        SuccBefore = true;
        Succ = SuccSU;
        break;
      }
    }

    if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
      // Instructions in circuits are allowed to be scheduled
      // after both a successor and predecessor.
      bool InCircuit = llvm::any_of(
          Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
      if (InCircuit)
        LLVM_DEBUG(dbgs() << "In a circuit, predecessor ");
      else {
        Valid = false;
        NumNodeOrderIssues++;
        LLVM_DEBUG(dbgs() << "Predecessor ");
      }
      LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
                        << " are scheduled before node " << SU->NodeNum
                        << "\n");
    }
  }

  LLVM_DEBUG({
    if (!Valid)
      dbgs() << "Invalid node order found!\n";
  });
}

/// Attempt to fix the degenerate cases when the instruction serialization
/// causes the register lifetimes to overlap. For example,
///   p' = store_pi(p, b)
///      = load p, offset
/// In this case p and p' overlap, which means that two registers are needed.
/// Instead, this function changes the load to use p' and updates the offset.
void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
  Register OverlapReg;
  Register NewBaseReg;
  for (SUnit *SU : Instrs) {
    MachineInstr *MI = SU->getInstr();
    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
      const MachineOperand &MO = MI->getOperand(i);
      // Look for an instruction that uses p. The instruction occurs in the
      // same cycle but occurs later in the serialized order.
      if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
        // Check that the instruction appears in the InstrChanges structure,
        // which contains instructions that can have the offset updated.
        DenseMap<SUnit *, std::pair<Register, int64_t>>::iterator It =
            InstrChanges.find(SU);
        if (It != InstrChanges.end()) {
          unsigned BasePos, OffsetPos;
          // Update the base register and adjust the offset.
          if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
            MachineInstr *NewMI = MF.CloneMachineInstr(MI);
            NewMI->getOperand(BasePos).setReg(NewBaseReg);
            int64_t NewOffset =
                MI->getOperand(OffsetPos).getImm() - It->second.second;
            NewMI->getOperand(OffsetPos).setImm(NewOffset);
            SU->setInstr(NewMI);
            MISUnitMap[NewMI] = SU;
            NewMIs[MI] = NewMI;
          }
        }
        OverlapReg = Register();
        NewBaseReg = Register();
        break;
      }
      // Look for an instruction of the form p' = op(p), which uses and
      // defines two virtual registers that get allocated to the same
      // physical register.
      unsigned TiedUseIdx = 0;
      if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
        // OverlapReg is p in the example above.
        OverlapReg = MI->getOperand(TiedUseIdx).getReg();
        // NewBaseReg is p' in the example above.
        NewBaseReg = MI->getOperand(i).getReg();
        break;
      }
    }
  }
}

std::deque<SUnit *>
SMSchedule::reorderInstructions(const SwingSchedulerDAG *SSD,
                                const std::deque<SUnit *> &Instrs) const {
  std::deque<SUnit *> NewOrderPhi;
  for (SUnit *SU : Instrs) {
    if (SU->getInstr()->isPHI())
      NewOrderPhi.push_back(SU);
  }
  std::deque<SUnit *> NewOrderI;
  for (SUnit *SU : Instrs) {
    if (!SU->getInstr()->isPHI())
      orderDependence(SSD, SU, NewOrderI);
  }
  llvm::append_range(NewOrderPhi, NewOrderI);
  return NewOrderPhi;
}

/// After the schedule has been formed, call this function to combine
/// the instructions from the different stages/cycles. That is, this
/// function creates a schedule that represents a single iteration.
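/// For example, with II = 2 and a maximum stage count of 1, the instructions
/// scheduled at cycle c + 2 (stage 1) are folded into cycle c, so each
/// remaining cycle lists the kernel's contents for a single iteration.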
void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
  // Move all instructions to the first stage from later stages.
  for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
    for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
         ++stage) {
      std::deque<SUnit *> &cycleInstrs =
          ScheduledInstrs[cycle + (stage * InitiationInterval)];
      for (SUnit *SU : llvm::reverse(cycleInstrs))
        ScheduledInstrs[cycle].push_front(SU);
    }
  }

  // Erase all the elements in the later stages. Only one iteration should
  // remain in the scheduled list, and it contains all the instructions.
  for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
    ScheduledInstrs.erase(cycle);

  // Change the registers in instruction as specified in the InstrChanges
  // map. We need to use the new registers to create the correct order.
  for (const SUnit &SU : SSD->SUnits)
    SSD->applyInstrChange(SU.getInstr(), *this);

  // Reorder the instructions in each cycle to fix and improve the
  // generated code.
  for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
    std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
    cycleInstrs = reorderInstructions(SSD, cycleInstrs);
    SSD->fixupRegisterOverlaps(cycleInstrs);
  }

  LLVM_DEBUG(dump());
}

void NodeSet::print(raw_ostream &os) const {
  os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
     << " depth " << MaxDepth << " col " << Colocate << "\n";
  for (const auto &I : Nodes)
    os << "   SU(" << I->NodeNum << ") " << *(I->getInstr());
  os << "\n";
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the schedule information to the given output.
void SMSchedule::print(raw_ostream &os) const {
  // Iterate over each cycle.
  for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
    // Iterate over each instruction in the cycle.
    const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
    for (SUnit *CI : cycleInstrs->second) {
      os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
      os << "(" << CI->NodeNum << ") ";
      CI->getInstr()->print(os);
      os << "\n";
    }
  }
}

/// Utility function used for debugging to print the schedule.
LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
LLVM_DUMP_METHOD void NodeSet::dump() const { print(dbgs()); }

void ResourceManager::dumpMRT() const {
  LLVM_DEBUG({
    if (UseDFA)
      return;
    std::stringstream SS;
    SS << "MRT:\n";
    SS << std::setw(4) << "Slot";
    for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
      SS << std::setw(3) << I;
    SS << std::setw(7) << "#Mops"
       << "\n";
    for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
      SS << std::setw(4) << Slot;
      for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
        SS << std::setw(3) << MRT[Slot][I];
      SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
    }
    dbgs() << SS.str();
  });
}
#endif

void ResourceManager::initProcResourceVectors(
    const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
  unsigned ProcResourceID = 0;

  // We currently limit the resource kinds to 64 and below so that we can use
  // uint64_t for Masks.
  assert(SM.getNumProcResourceKinds() < 64 &&
         "Too many kinds of resources, unsupported");
  // Create a unique bitmask for every processor resource unit.
  // Skip resource at index 0, since it always references 'InvalidUnit'.
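  // For example, given unit resources U1 and U2 and a group G = {U1, U2},
  // the first loop below assigns Masks[U1] = 0b001 and Masks[U2] = 0b010,
  // and the second assigns Masks[G] = 0b100 | Masks[U1] | Masks[U2], so a
  // group's mask covers its own bit plus the bits of all its sub-units.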
  Masks.resize(SM.getNumProcResourceKinds());
  for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
    const MCProcResourceDesc &Desc = *SM.getProcResource(I);
    if (Desc.SubUnitsIdxBegin)
      continue;
    Masks[I] = 1ULL << ProcResourceID;
    ProcResourceID++;
  }
  // Create a unique bitmask for every processor resource group.
  for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
    const MCProcResourceDesc &Desc = *SM.getProcResource(I);
    if (!Desc.SubUnitsIdxBegin)
      continue;
    Masks[I] = 1ULL << ProcResourceID;
    for (unsigned U = 0; U < Desc.NumUnits; ++U)
      Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
    ProcResourceID++;
  }
  LLVM_DEBUG({
    if (SwpShowResMask) {
      dbgs() << "ProcResourceDesc:\n";
      for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
        const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
        dbgs() << format("  %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
                         ProcResource->Name, I, Masks[I],
                         ProcResource->NumUnits);
      }
      dbgs() << " -----------------\n";
    }
  });
}

bool ResourceManager::canReserveResources(SUnit &SU, int Cycle) {
  LLVM_DEBUG({
    if (SwpDebugResource)
      dbgs() << "canReserveResources:\n";
  });
  if (UseDFA)
    return DFAResources[positiveModulo(Cycle, InitiationInterval)]
        ->canReserveResources(&SU.getInstr()->getDesc());

  const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
  if (!SCDesc->isValid()) {
    LLVM_DEBUG({
      dbgs() << "No valid Schedule Class Desc for schedClass!\n";
      dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
    });
    return true;
  }

  reserveResources(SCDesc, Cycle);
  bool Result = !isOverbooked();
  unreserveResources(SCDesc, Cycle);

  LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n");
  return Result;
}

void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
  LLVM_DEBUG({
    if (SwpDebugResource)
      dbgs() << "reserveResources:\n";
  });
  if (UseDFA)
    return DFAResources[positiveModulo(Cycle, InitiationInterval)]
        ->reserveResources(&SU.getInstr()->getDesc());

  const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
  if (!SCDesc->isValid()) {
    LLVM_DEBUG({
      dbgs() << "No valid Schedule Class Desc for schedClass!\n";
      dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
    });
    return;
  }

  reserveResources(SCDesc, Cycle);

  LLVM_DEBUG({
    if (SwpDebugResource) {
      dumpMRT();
      dbgs() << "reserveResources: done!\n\n";
    }
  });
}
| 3903 | |
| 3904 | void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc, |
| 3905 | int Cycle) { |
| 3906 | assert(!UseDFA); |
| 3907 | for (const MCWriteProcResEntry &PRE : make_range( |
| 3908 | x: STI->getWriteProcResBegin(SC: SCDesc), y: STI->getWriteProcResEnd(SC: SCDesc))) |
| 3909 | for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C) |
| 3910 | ++MRT[positiveModulo(Dividend: C, Divisor: InitiationInterval)][PRE.ProcResourceIdx]; |
| 3911 | |
| 3912 | for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C) |
| 3913 | ++NumScheduledMops[positiveModulo(Dividend: C, Divisor: InitiationInterval)]; |
| 3914 | } |
| 3915 | |
| 3916 | void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc, |
| 3917 | int Cycle) { |
| 3918 | assert(!UseDFA); |
| 3919 | for (const MCWriteProcResEntry &PRE : make_range( |
| 3920 | x: STI->getWriteProcResBegin(SC: SCDesc), y: STI->getWriteProcResEnd(SC: SCDesc))) |
| 3921 | for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C) |
| 3922 | --MRT[positiveModulo(Dividend: C, Divisor: InitiationInterval)][PRE.ProcResourceIdx]; |
| 3923 | |
| 3924 | for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C) |
| 3925 | --NumScheduledMops[positiveModulo(Dividend: C, Divisor: InitiationInterval)]; |
| 3926 | } |
| 3927 | |
| 3928 | bool ResourceManager::isOverbooked() const { |
| 3929 | assert(!UseDFA); |
| 3930 | for (int Slot = 0; Slot < InitiationInterval; ++Slot) { |
| 3931 | for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) { |
| 3932 | const MCProcResourceDesc *Desc = SM.getProcResource(ProcResourceIdx: I); |
| 3933 | if (MRT[Slot][I] > Desc->NumUnits) |
| 3934 | return true; |
| 3935 | } |
| 3936 | if (NumScheduledMops[Slot] > IssueWidth) |
| 3937 | return true; |
| 3938 | } |
| 3939 | return false; |
| 3940 | } |
| 3941 | |
int ResourceManager::calculateResMIIDFA() const {
  assert(UseDFA);

  // Sort the instructions by the number of available choices for scheduling,
  // least to most. Use the number of critical resources as the tie breaker.
  FuncUnitSorter FUS = FuncUnitSorter(*ST);
  for (SUnit &SU : DAG->SUnits)
    FUS.calcCriticalResources(*SU.getInstr());
  PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
      FuncUnitOrder(FUS);

  for (SUnit &SU : DAG->SUnits)
    FuncUnitOrder.push(SU.getInstr());

  SmallVector<std::unique_ptr<DFAPacketizer>, 8> Resources;
  Resources.push_back(
      std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));

  while (!FuncUnitOrder.empty()) {
    MachineInstr *MI = FuncUnitOrder.top();
    FuncUnitOrder.pop();
    if (TII->isZeroCost(MI->getOpcode()))
      continue;

    // Attempt to reserve the instruction in an existing DFA. At least one
    // DFA is needed for each cycle.
    unsigned NumCycles = DAG->getSUnit(MI)->Latency;
    unsigned ReservedCycles = 0;
    auto *RI = Resources.begin();
    auto *RE = Resources.end();
    LLVM_DEBUG({
      dbgs() << "Trying to reserve resource for " << NumCycles
             << " cycles for \n";
      MI->dump();
    });
    for (unsigned C = 0; C < NumCycles; ++C)
      while (RI != RE) {
        if ((*RI)->canReserveResources(*MI)) {
          (*RI)->reserveResources(*MI);
          ++ReservedCycles;
          break;
        }
        RI++;
      }
    LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
                      << ", NumCycles:" << NumCycles << "\n");
    // Add new DFAs, if needed, to reserve resources.
    for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
      LLVM_DEBUG(if (SwpDebugResource) dbgs()
                 << "NewResource created to reserve resources"
                 << "\n");
      auto *NewResource = TII->CreateTargetScheduleState(*ST);
      assert(NewResource->canReserveResources(*MI) && "Reserve error.");
      NewResource->reserveResources(*MI);
      Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
    }
  }

  int Resmii = Resources.size();
  LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
  return Resmii;
}

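// Worked example (hypothetical numbers): with NumMops = 7 and IssueWidth = 2,
// the micro-op bound is (7 + 2 - 1) / 2 = ceil(7 / 2) = 4 cycles. If one
// resource is consumed for 9 cycles in total and has NumUnits = 2, its bound
// is ceil(9 / 2) = 5, so ResMII = max(4, 5) = 5.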
int ResourceManager::calculateResMII() const {
  if (UseDFA)
    return calculateResMIIDFA();

  // Count each resource consumption and divide it by the number of units.
  // ResMII is the max value among them.

  int NumMops = 0;
  SmallVector<uint64_t> ResourceCount(SM.getNumProcResourceKinds());
  for (SUnit &SU : DAG->SUnits) {
    if (TII->isZeroCost(SU.getInstr()->getOpcode()))
      continue;

    const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
    if (!SCDesc->isValid())
      continue;

    LLVM_DEBUG({
      if (SwpDebugResource) {
        DAG->dumpNode(SU);
        dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
               << " WriteProcRes: ";
      }
    });
    NumMops += SCDesc->NumMicroOps;
    for (const MCWriteProcResEntry &PRE :
         make_range(STI->getWriteProcResBegin(SCDesc),
                    STI->getWriteProcResEnd(SCDesc))) {
      LLVM_DEBUG({
        if (SwpDebugResource) {
          const MCProcResourceDesc *Desc =
              SM.getProcResource(PRE.ProcResourceIdx);
          dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
        }
      });
      ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
    }
    LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
  }

  int Result = (NumMops + IssueWidth - 1) / IssueWidth;
  LLVM_DEBUG({
    if (SwpDebugResource)
      dbgs() << "#Mops: " << NumMops << ", "
             << "IssueWidth: " << IssueWidth << ", "
             << "Cycles: " << Result << "\n";
  });

  LLVM_DEBUG({
    if (SwpDebugResource) {
      std::stringstream SS;
      SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
         << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
         << "\n";
      dbgs() << SS.str();
    }
  });
  for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
    const MCProcResourceDesc *Desc = SM.getProcResource(I);
    int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
    LLVM_DEBUG({
      if (SwpDebugResource) {
        std::stringstream SS;
        SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
           << Desc->NumUnits << std::setw(10) << ResourceCount[I]
           << std::setw(10) << Cycles << "\n";
        dbgs() << SS.str();
      }
    });
    if (Cycles > Result)
      Result = Cycles;
  }
  return Result;
}

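// For example, init(3) creates one DFA per modulo slot (three in total) for
// the DFA path, and for the non-DFA path sizes the modulo reservation table
// to three rows, one per slot, each holding a counter per processor resource
// kind, along with a per-slot micro-op counter.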
void ResourceManager::init(int II) {
  InitiationInterval = II;
  DFAResources.clear();
  DFAResources.resize(II);
  for (auto &I : DFAResources)
    I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
  MRT.clear();
  MRT.resize(II, SmallVector<uint64_t>(SM.getNumProcResourceKinds()));
  NumScheduledMops.clear();
  NumScheduledMops.resize(II);
}

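// For example, an anti-dependence within a single iteration, such as a use
// of %0 followed by a redefinition of %0, is ignored when IgnoreAnti is
// true, and so is any loop-carried edge (Distance != 0).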
bool SwingSchedulerDDGEdge::ignoreDependence(bool IgnoreAnti) const {
  if (Pred.isArtificial() || Dst->isBoundaryNode())
    return true;
  // Currently, a dependence that is an anti-dependence but not loop-carried
  // is also ignored. This behavior is preserved to prevent regressions.
  // FIXME: Remove this if it doesn't have a significant impact on performance.
  return IgnoreAnti && (Pred.getKind() == SDep::Kind::Anti || Distance != 0);
}

SwingSchedulerDDG::SwingSchedulerDDGEdges &
SwingSchedulerDDG::getEdges(const SUnit *SU) {
  if (SU == EntrySU)
    return EntrySUEdges;
  if (SU == ExitSU)
    return ExitSUEdges;
  return EdgesVec[SU->NodeNum];
}

const SwingSchedulerDDG::SwingSchedulerDDGEdges &
SwingSchedulerDDG::getEdges(const SUnit *SU) const {
  if (SU == EntrySU)
    return EntrySUEdges;
  if (SU == ExitSU)
    return ExitSUEdges;
  return EdgesVec[SU->NodeNum];
}

void SwingSchedulerDDG::addEdge(const SUnit *SU,
                                const SwingSchedulerDDGEdge &Edge) {
  auto &Edges = getEdges(SU);
  if (Edge.getSrc() == SU)
    Edges.Succs.push_back(Edge);
  else
    Edges.Preds.push_back(Edge);
}

void SwingSchedulerDDG::initEdges(SUnit *SU) {
  for (const auto &PI : SU->Preds) {
    SwingSchedulerDDGEdge Edge(SU, PI, false);
    addEdge(SU, Edge);
  }

  for (const auto &SI : SU->Succs) {
    SwingSchedulerDDGEdge Edge(SU, SI, true);
    addEdge(SU, Edge);
  }
}

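// Note that each underlying SDep is visited twice, once via the Preds list
// of its destination and once via the Succs list of its source, so every
// dependence ends up recorded in both directions in the DDG.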
SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
                                     SUnit *ExitSU)
    : EntrySU(EntrySU), ExitSU(ExitSU) {
  EdgesVec.resize(SUnits.size());

  initEdges(EntrySU);
  initEdges(ExitSU);
  for (auto &SU : SUnits)
    initEdges(&SU);
}

const SwingSchedulerDDG::EdgesType &
SwingSchedulerDDG::getInEdges(const SUnit *SU) const {
  return getEdges(SU).Preds;
}

const SwingSchedulerDDG::EdgesType &
SwingSchedulerDDG::getOutEdges(const SUnit *SU) const {
  return getEdges(SU).Succs;
}

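// Illustrative scenario (hypothetical): if a store must stay ahead of a load
// in the next iteration, the load's SUnit receives a Barrier order
// predecessor with latency 1 pointing at the store, which prevents the
// scheduler from reordering the pair across the loop-carried boundary.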
void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits) {
  // Currently this function simply adds all dependences represented by this
  // object. After missed dependences are handled properly, the logic here
  // will be more complex, because edges that are currently missed should not
  // be added to the DAG.
  for (SUnit &SU : SUnits) {
    SUnit *Src = &SU;
    if (auto *OrderDep = getOrderDepOrNull(Src)) {
      SDep Dep(Src, SDep::Barrier);
      Dep.setLatency(1);
      for (SUnit *Dst : *OrderDep)
        Dst->addPred(Dep);
    }
  }
}

void LoopCarriedEdges::dump(SUnit *SU, const TargetRegisterInfo *TRI,
                            const MachineRegisterInfo *MRI) const {
  const auto *Order = getOrderDepOrNull(SU);

  if (!Order)
    return;

  const auto DumpSU = [](const SUnit *SU) {
    std::ostringstream OSS;
    OSS << "SU(" << SU->NodeNum << ")";
    return OSS.str();
  };

  dbgs() << " Loop carried edges from " << DumpSU(SU) << "\n"
         << "  Order\n";
  for (SUnit *Dst : *Order)
    dbgs() << "    " << DumpSU(Dst) << "\n";
}