//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
//
// This SMS implementation is a target-independent back-end pass. When enabled,
// the pass runs just prior to the register allocation pass, while the machine
// IR is in SSA form. If software pipelining is successful, then the original
// loop is replaced by the optimized loop. The optimized loop contains one or
// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
// the instructions cannot be scheduled in a given MII, we increase the MII by
// one and try again.
//
// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
// represent loop carried dependences in the DAG as order edges to the Phi
// nodes. We also perform several passes over the DAG to eliminate unnecessary
// edges that inhibit the ability to pipeline. The implementation uses the
// DFAPacketizer class to compute the minimum initiation interval and to check
// where an instruction may be inserted in the pipelined schedule.
//
// In order for the SMS pass to work, several target specific hooks need to be
// implemented to get information about the loop structure and to rewrite
// instructions.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/MachinePipeliner.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ModuloSchedule.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <climits>
#include <cstdint>
#include <deque>
#include <functional>
#include <iomanip>
#include <iterator>
#include <map>
#include <memory>
#include <sstream>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "pipeliner"

STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
STATISTIC(NumPipelined, "Number of loops software pipelined");
STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");

/// A command line option to turn software pipelining on or off.
static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
                               cl::desc("Enable Software Pipelining"));

/// A command line option to enable SWP at -Os.
static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
                                      cl::desc("Enable SWP at Os."), cl::Hidden,
                                      cl::init(false));

/// A command line argument to limit the maximum initiation interval (MII)
/// allowed for pipelining.
static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
                              cl::desc("Size limit for the MII."),
                              cl::Hidden, cl::init(27));

/// A command line argument to force the pipeliner to use a specified
/// initiation interval.
static cl::opt<int> SwpForceII("pipeliner-force-ii",
                               cl::desc("Force pipeliner to use specified II."),
                               cl::Hidden, cl::init(-1));

/// A command line argument to limit the number of stages in the pipeline.
static cl::opt<int>
    SwpMaxStages("pipeliner-max-stages",
                 cl::desc("Maximum stages allowed in the generated schedule."),
                 cl::Hidden, cl::init(3));

/// A command line option to disable the pruning of chain dependences due to
/// an unrelated Phi.
static cl::opt<bool>
    SwpPruneDeps("pipeliner-prune-deps",
                 cl::desc("Prune dependences between unrelated Phi nodes."),
                 cl::Hidden, cl::init(true));

/// A command line option to disable the pruning of loop carried order
/// dependences.
static cl::opt<bool>
    SwpPruneLoopCarried("pipeliner-prune-loop-carried",
                        cl::desc("Prune loop carried order dependences."),
                        cl::Hidden, cl::init(true));

#ifndef NDEBUG
static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
#endif

static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
                                     cl::ReallyHidden,
                                     cl::desc("Ignore RecMII"));

static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
                                    cl::init(false));
static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
                                      cl::init(false));

static cl::opt<bool> EmitTestAnnotations(
    "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
    cl::desc("Instead of emitting the pipelined code, annotate instructions "
             "with the generated schedule for feeding into the "
             "-modulo-schedule-test pass"));

static cl::opt<bool> ExperimentalCodeGen(
    "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
    cl::desc(
        "Use the experimental peeling code generator for software pipelining"));

static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
                                     cl::desc("Range to search for II"),
                                     cl::Hidden, cl::init(10));

static cl::opt<bool>
    LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
                     cl::desc("Limit register pressure of scheduled loop"));

static cl::opt<int>
    RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
                      cl::init(5),
                      cl::desc("Margin representing the unused percentage of "
                               "the register pressure limit"));

static cl::opt<bool>
    MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
               cl::desc("Use the MVE code generator for software pipelining"));

namespace llvm {

// A command line option to enable the CopyToPhi DAG mutation.
cl::opt<bool> SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
                                 cl::init(true),
                                 cl::desc("Enable CopyToPhi DAG Mutation"));

/// A command line argument to force pipeliner to use specified issue
/// width.
cl::opt<int> SwpForceIssueWidth(
    "pipeliner-force-issue-width",
    cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
    cl::init(-1));

/// A command line argument to set the window scheduling option.
static cl::opt<WindowSchedulingFlag> WindowSchedulingOption(
    "window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On),
    cl::desc("Set how to use window scheduling algorithm."),
    cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off",
                          "Turn off window algorithm."),
               clEnumValN(WindowSchedulingFlag::WS_On, "on",
                          "Use window algorithm after SMS algorithm fails."),
               clEnumValN(WindowSchedulingFlag::WS_Force, "force",
                          "Use window algorithm instead of SMS algorithm.")));

} // end namespace llvm

unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
char MachinePipeliner::ID = 0;
#ifndef NDEBUG
int MachinePipeliner::NumTries = 0;
#endif
char &llvm::MachinePipelinerID = MachinePipeliner::ID;

INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE,
                      "Modulo Software Pipelining", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
                    "Modulo Software Pipelining", false, false)

namespace {

/// This class holds an SUnit corresponding to a memory operation and other
/// information related to the instruction.
struct SUnitWithMemInfo {
  SUnit *SU;
  SmallVector<const Value *, 2> UnderlyingObjs;

  /// The value of a memory operand.
  const Value *MemOpValue = nullptr;

  /// The offset of a memory operand.
  int64_t MemOpOffset = 0;

  AAMDNodes AATags;

  /// True if all the underlying objects are identified.
  bool IsAllIdentified = false;

  SUnitWithMemInfo(SUnit *SU);

  bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const;

  bool isUnknown() const { return MemOpValue == nullptr; }

private:
  bool getUnderlyingObjects();
};

/// Add loop-carried chain dependencies. This class handles the same type of
/// dependencies added by `ScheduleDAGInstrs::buildSchedGraph`, but takes into
/// account dependencies across iterations.
class LoopCarriedOrderDepsTracker {
  // Type of instruction that is relevant to order-dependencies.
  enum class InstrTag {
    Barrier = 0,      ///< A barrier event instruction.
    LoadOrStore = 1,  ///< An instruction that may load or store memory, but is
                      ///< not a barrier event.
    FPExceptions = 2, ///< An instruction that does not match above, but may
                      ///< raise floating-point exceptions.
  };

  struct TaggedSUnit : PointerIntPair<SUnit *, 2> {
    TaggedSUnit(SUnit *SU, InstrTag Tag)
        : PointerIntPair<SUnit *, 2>(SU, unsigned(Tag)) {}

    InstrTag getTag() const { return InstrTag(getInt()); }
  };

  /// Holds loads and stores with memory related information.
  struct LoadStoreChunk {
    SmallVector<SUnitWithMemInfo, 4> Loads;
    SmallVector<SUnitWithMemInfo, 4> Stores;

    void append(SUnit *SU);
  };

  SwingSchedulerDAG *DAG;
  BatchAAResults *BAA;
  std::vector<SUnit> &SUnits;

  /// The size of SUnits, for convenience.
  const unsigned N;

  /// Loop-carried Edges.
  std::vector<BitVector> LoopCarried;

  /// Instructions related to chain dependencies. They are one of the
  /// following:
  ///
  ///  1. Barrier event.
  ///  2. Load that is not a barrier event and not a dereferenceable
  ///     invariant load.
  ///  3. Store that is not a barrier event.
  ///  4. None of the above, but may raise floating-point exceptions.
  ///
  /// This is used when analyzing loop-carried order-dependencies, which may
  /// be affected by global barrier instructions.
  std::vector<TaggedSUnit> TaggedSUnits;

  const TargetInstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;

public:
  LoopCarriedOrderDepsTracker(SwingSchedulerDAG *SSD, BatchAAResults *BAA,
                              const TargetInstrInfo *TII,
                              const TargetRegisterInfo *TRI);

  /// The main function to compute loop-carried order-dependencies.
  void computeDependencies();

  const BitVector &getLoopCarried(unsigned Idx) const {
    return LoopCarried[Idx];
  }

private:
  /// Returns the tag of \p SU if the instruction may affect
  /// order-dependencies; otherwise returns std::nullopt.
  std::optional<InstrTag> getInstrTag(SUnit *SU) const;

  void addLoopCarriedDepenenciesForChunks(const LoadStoreChunk &From,
                                          const LoadStoreChunk &To);

  void computeDependenciesAux();
};

} // end anonymous namespace

/// The "main" function for implementing Swing Modulo Scheduling.
bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
  if (skipFunction(mf.getFunction()))
    return false;

  if (!EnableSWP)
    return false;

  if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
      !EnableSWPOptSize.getPosition())
    return false;

  if (!mf.getSubtarget().enableMachinePipeliner())
    return false;

  // Cannot pipeline loops without instruction itineraries if we are using
  // DFA for the pipeliner.
  if (mf.getSubtarget().useDFAforSMS() &&
      (!mf.getSubtarget().getInstrItineraryData() ||
       mf.getSubtarget().getInstrItineraryData()->isEmpty()))
    return false;

  MF = &mf;
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
  TII = MF->getSubtarget().getInstrInfo();
  RegClassInfo.runOnMachineFunction(*MF);

  for (const auto &L : *MLI)
    scheduleLoop(*L);

  return false;
}

/// Attempt to perform the SMS algorithm on the specified loop. This function
/// is the main entry point for the algorithm. The function identifies
/// candidate loops, calculates the minimum initiation interval, and attempts
/// to schedule the loop.
bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
  bool Changed = false;
  for (const auto &InnerLoop : L)
    Changed |= scheduleLoop(*InnerLoop);

#ifndef NDEBUG
  // Stop trying after reaching the limit (if any).
  int Limit = SwpLoopLimit;
  if (Limit >= 0) {
    if (NumTries >= SwpLoopLimit)
      return Changed;
    NumTries++;
  }
#endif

  setPragmaPipelineOptions(L);
  if (!canPipelineLoop(L)) {
    LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
    ORE->emit([&]() {
      return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
                                             L.getStartLoc(), L.getHeader())
             << "Failed to pipeline loop";
    });

    LI.LoopPipelinerInfo.reset();
    return Changed;
  }

  ++NumTrytoPipeline;
  if (useSwingModuloScheduler())
    Changed = swingModuloScheduler(L);

  if (useWindowScheduler(Changed))
    Changed = runWindowScheduler(L);

  LI.LoopPipelinerInfo.reset();
  return Changed;
}

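/// Parse the pipeliner-related hints attached to the loop's !llvm.loop
/// metadata. For illustration (sketch, not taken from this file), source code
/// annotated with `#pragma clang loop pipeline_initiation_interval(10)` is
/// lowered to loop metadata of roughly the following shape:
/// \code
///   br i1 %cond, label %header, label %exit, !llvm.loop !0
///   !0 = distinct !{!0, !1}
///   !1 = !{!"llvm.loop.pipeline.initiationinterval", i32 10}
/// \endcode
/// Similarly, `#pragma clang loop pipeline(disable)` produces the
/// "llvm.loop.pipeline.disable" metadata that is checked below.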
void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
  // Reset the pragma for the next loop in iteration.
  disabledByPragma = false;
  II_setByPragma = 0;

  MachineBasicBlock *LBLK = L.getTopBlock();

  if (LBLK == nullptr)
    return;

  const BasicBlock *BBLK = LBLK->getBasicBlock();
  if (BBLK == nullptr)
    return;

  const Instruction *TI = BBLK->getTerminator();
  if (TI == nullptr)
    return;

  MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
  if (LoopID == nullptr)
    return;

  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
  assert(LoopID->getOperand(0) == LoopID && "invalid loop");

  for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
    MDNode *MD = dyn_cast<MDNode>(MDO);

    if (MD == nullptr)
      continue;

    MDString *S = dyn_cast<MDString>(MD->getOperand(0));

    if (S == nullptr)
      continue;

    if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
      assert(MD->getNumOperands() == 2 &&
             "Pipeline initiation interval hint metadata should have two "
             "operands.");
      II_setByPragma =
          mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
      assert(II_setByPragma >= 1 &&
             "Pipeline initiation interval must be positive.");
    } else if (S->getString() == "llvm.loop.pipeline.disable") {
      disabledByPragma = true;
    }
  }
}

/// Return true if the loop can be software pipelined. The algorithm is
/// restricted to loops with a single basic block. Make sure that the
/// branch in the loop can be analyzed.
bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
  if (L.getNumBlocks() != 1) {
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "Not a single basic block: "
             << ore::NV("NumBlocks", L.getNumBlocks());
    });
    return false;
  }

  if (disabledByPragma) {
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "Disabled by Pragma.";
    });
    return false;
  }

  // If the branch can't be analyzed, then we can't pipeline the loop.
  LI.TBB = nullptr;
  LI.FBB = nullptr;
  LI.BrCond.clear();
  if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
    LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
    NumFailBranch++;
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "The branch can't be understood";
    });
    return false;
  }

  LI.LoopInductionVar = nullptr;
  LI.LoopCompare = nullptr;
  LI.LoopPipelinerInfo = TII->analyzeLoopForPipelining(L.getTopBlock());
  if (!LI.LoopPipelinerInfo) {
    LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
    NumFailLoop++;
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "The loop structure is not supported";
    });
    return false;
  }

  if (!L.getLoopPreheader()) {
    LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
    NumFailPreheader++;
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "No loop preheader found";
    });
    return false;
  }

  // Remove any subregisters from inputs to phi nodes.
  preprocessPhiNodes(*L.getHeader());
  return true;
}

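/// Rewrite each phi input that carries a subregister index so that it refers
/// to a full register defined by a COPY placed in the predecessor block. A
/// sketch of the transformation on illustrative (not actual) MIR, where
/// sub_lo is a hypothetical subregister index:
/// \code
///   ; before, in the loop header:
///   %p:gpr32 = PHI %x.sub_lo:gpr64, %bb.preheader, %v:gpr32, %bb.loop
///   ; after, with the COPY inserted into the predecessor:
///   %t:gpr32 = COPY %x.sub_lo                       ; in %bb.preheader
///   %p:gpr32 = PHI %t:gpr32, %bb.preheader, %v:gpr32, %bb.loop
/// \endcode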
void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
  MachineRegisterInfo &MRI = MF->getRegInfo();
  SlotIndexes &Slots =
      *getAnalysis<LiveIntervalsWrapperPass>().getLIS().getSlotIndexes();

  for (MachineInstr &PI : B.phis()) {
    MachineOperand &DefOp = PI.getOperand(0);
    assert(DefOp.getSubReg() == 0);
    auto *RC = MRI.getRegClass(DefOp.getReg());

    for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
      MachineOperand &RegOp = PI.getOperand(i);
      if (RegOp.getSubReg() == 0)
        continue;

      // If the operand uses a subregister, replace it with a new register
      // without subregisters, and generate a copy to the new register.
      Register NewReg = MRI.createVirtualRegister(RC);
      MachineBasicBlock &PredB = *PI.getOperand(i + 1).getMBB();
      MachineBasicBlock::iterator At = PredB.getFirstTerminator();
      const DebugLoc &DL = PredB.findDebugLoc(At);
      auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
                      .addReg(RegOp.getReg(), getRegState(RegOp),
                              RegOp.getSubReg());
      Slots.insertMachineInstrInMaps(*Copy);
      RegOp.setReg(NewReg);
      RegOp.setSubReg(0);
    }
  }
}

/// The SMS algorithm consists of the following main steps:
/// 1. Computation and analysis of the dependence graph.
/// 2. Ordering of the nodes (instructions).
/// 3. Attempt to schedule the loop.
bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
  assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");

  AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  SwingSchedulerDAG SMS(
      *this, L, getAnalysis<LiveIntervalsWrapperPass>().getLIS(), RegClassInfo,
      II_setByPragma, LI.LoopPipelinerInfo.get(), AA);

  MachineBasicBlock *MBB = L.getHeader();
  // The kernel should not include any terminator instructions. These
  // will be added back later.
  SMS.startBlock(MBB);

  // Compute the number of 'real' instructions in the basic block by
  // ignoring terminators.
  unsigned size = MBB->size();
  for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
                                   E = MBB->instr_end();
       I != E; ++I, --size)
    ;

  SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
  SMS.schedule();
  SMS.exitRegion();

  SMS.finishBlock();
  return SMS.hasNewSchedule();
}

void MachinePipeliner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AAResultsWrapperPass>();
  AU.addPreserved<AAResultsWrapperPass>();
  AU.addRequired<MachineLoopInfoWrapperPass>();
  AU.addRequired<MachineDominatorTreeWrapperPass>();
  AU.addRequired<LiveIntervalsWrapperPass>();
  AU.addRequired<MachineOptimizationRemarkEmitterPass>();
  AU.addRequired<TargetPassConfig>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

bool MachinePipeliner::runWindowScheduler(MachineLoop &L) {
  MachineSchedContext Context;
  Context.MF = MF;
  Context.MLI = MLI;
  Context.MDT = MDT;
  Context.TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
  Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  Context.LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
  Context.RegClassInfo->runOnMachineFunction(*MF);
  WindowScheduler WS(&Context, L);
  return WS.run();
}

bool MachinePipeliner::useSwingModuloScheduler() {
  // SwingModuloScheduler does not work when WindowScheduler is forced.
  return WindowSchedulingOption != WindowSchedulingFlag::WS_Force;
}

bool MachinePipeliner::useWindowScheduler(bool Changed) {
  // WindowScheduler does not work in the following cases:
  // 1. when it is off.
  // 2. when SwingModuloScheduler has already succeeded.
  // 3. when pragma II is enabled.
  if (II_setByPragma) {
    LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
                         "llvm.loop.pipeline.initiationinterval is set.\n");
    return false;
  }

  return WindowSchedulingOption == WindowSchedulingFlag::WS_Force ||
         (WindowSchedulingOption == WindowSchedulingFlag::WS_On && !Changed);
}

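// Example with illustrative numbers: given ResMII = 3 and RecMII = 5 (and no
// forced or pragma-set II), the initial MII is max(3, 5) = 5, and with the
// default -pipeliner-ii-search-range of 10 the scheduler tries candidate IIs
// from MII = 5 up to MAX_II = 15 before giving up.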
void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
  if (SwpForceII > 0)
    MII = SwpForceII;
  else if (II_setByPragma > 0)
    MII = II_setByPragma;
  else
    MII = std::max(ResMII, RecMII);
}

void SwingSchedulerDAG::setMAX_II() {
  if (SwpForceII > 0)
    MAX_II = SwpForceII;
  else if (II_setByPragma > 0)
    MAX_II = II_setByPragma;
  else
    MAX_II = MII + SwpIISearchRange;
}

/// We override the schedule function in ScheduleDAGInstrs to implement the
/// scheduling part of the Swing Modulo Scheduling algorithm.
void SwingSchedulerDAG::schedule() {
  buildSchedGraph(AA);
  const LoopCarriedEdges LCE = addLoopCarriedDependences();
  updatePhiDependences();
  Topo.InitDAGTopologicalSorting();
  changeDependences();
  postProcessDAG();
  DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU);
  LLVM_DEBUG({
    dump();
    dbgs() << "===== Loop Carried Edges Begin =====\n";
    for (SUnit &SU : SUnits)
      LCE.dump(&SU, TRI, &MRI);
    dbgs() << "===== Loop Carried Edges End =====\n";
  });

  NodeSetType NodeSets;
  findCircuits(NodeSets);
  NodeSetType Circuits = NodeSets;

  // Calculate the MII.
  unsigned ResMII = calculateResMII();
  unsigned RecMII = calculateRecMII(NodeSets);

  fuseRecs(NodeSets);

  // This flag is used for testing and can cause correctness problems.
  if (SwpIgnoreRecMII)
    RecMII = 0;

  setMII(ResMII, RecMII);
  setMAX_II();

  LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
                    << " (rec=" << RecMII << ", res=" << ResMII << ")\n");

  // Can't schedule a loop without a valid MII.
  if (MII == 0) {
    LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
    NumFailZeroMII++;
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Invalid Minimal Initiation Interval: 0";
    });
    return;
  }

  // Don't pipeline large loops.
  if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
    LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
                      << ", we don't pipeline large loops\n");
    NumFailLargeMaxMII++;
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Minimal Initiation Interval too large: "
             << ore::NV("MII", (int)MII) << " > "
             << ore::NV("SwpMaxMii", SwpMaxMii)
             << ". Refer to -pipeliner-max-mii.";
    });
    return;
  }

  computeNodeFunctions(NodeSets);

  registerPressureFilter(NodeSets);

  colocateNodeSets(NodeSets);

  checkNodeSets(NodeSets);

  LLVM_DEBUG({
    for (auto &I : NodeSets) {
      dbgs() << "  Rec NodeSet ";
      I.dump();
    }
  });

  llvm::stable_sort(NodeSets, std::greater<NodeSet>());

  groupRemainingNodes(NodeSets);

  removeDuplicateNodes(NodeSets);

  LLVM_DEBUG({
    for (auto &I : NodeSets) {
      dbgs() << "  NodeSet ";
      I.dump();
    }
  });

  computeNodeOrder(NodeSets);

  // Check for node order issues.
  checkValidNodeOrder(Circuits);

  SMSchedule Schedule(Pass.MF, this);
  Scheduled = schedulePipeline(Schedule);

  if (!Scheduled) {
    LLVM_DEBUG(dbgs() << "No schedule found, return\n");
    NumFailNoSchedule++;
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Unable to find schedule";
    });
    return;
  }

  unsigned numStages = Schedule.getMaxStageCount();
  // No need to generate a pipeline if there are no overlapped iterations.
  if (numStages == 0) {
    LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
    NumFailZeroStage++;
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "No need to pipeline - no overlapped iterations in schedule.";
    });
    return;
  }
  // Check that the maximum stage count is less than the user-defined limit.
  if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
    LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
                      << " : too many stages, abort\n");
    NumFailLargeMaxStage++;
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Too many stages in schedule: "
             << ore::NV("numStages", (int)numStages) << " > "
             << ore::NV("SwpMaxStages", SwpMaxStages)
             << ". Refer to -pipeliner-max-stages.";
    });
    return;
  }

  Pass.ORE->emit([&]() {
    return MachineOptimizationRemark(DEBUG_TYPE, "schedule", Loop.getStartLoc(),
                                     Loop.getHeader())
           << "Pipelined successfully!";
  });

  // Generate the schedule as a ModuloSchedule.
  DenseMap<MachineInstr *, int> Cycles, Stages;
  std::vector<MachineInstr *> OrderedInsts;
  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    for (SUnit *SU : Schedule.getInstructions(Cycle)) {
      OrderedInsts.push_back(SU->getInstr());
      Cycles[SU->getInstr()] = Cycle;
      Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
    }
  }
  DenseMap<MachineInstr *, std::pair<Register, int64_t>> NewInstrChanges;
  for (auto &KV : NewMIs) {
    Cycles[KV.first] = Cycles[KV.second];
    Stages[KV.first] = Stages[KV.second];
    NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
  }

  ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
                    std::move(Stages));
  if (EmitTestAnnotations) {
    assert(NewInstrChanges.empty() &&
           "Cannot serialize a schedule with InstrChanges!");
    ModuloScheduleTestAnnotater MSTI(MF, MS);
    MSTI.annotate();
    return;
  }
  // The experimental code generator can't work if there are InstrChanges.
  if (ExperimentalCodeGen && NewInstrChanges.empty()) {
    PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
    MSE.expand();
  } else if (MVECodeGen && NewInstrChanges.empty() &&
             LoopPipelinerInfo->isMVEExpanderSupported() &&
             ModuloScheduleExpanderMVE::canApply(Loop)) {
    ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
    MSE.expand();
  } else {
    ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
    MSE.expand();
    MSE.cleanup();
  }
  ++NumPipelined;
}

/// Clean up after the software pipeliner runs.
void SwingSchedulerDAG::finishBlock() {
  for (auto &KV : NewMIs)
    MF.deleteMachineInstr(KV.second);
  NewMIs.clear();

  // Call the superclass.
  ScheduleDAGInstrs::finishBlock();
}

/// Return the register values for the operands of a Phi instruction.
/// This function assumes the instruction is a Phi.
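/// For example (illustrative MIR), given a phi in the loop header %bb.loop
/// with preheader %bb.ph:
/// \code
///   %v:gpr32 = PHI %init:gpr32, %bb.ph, %next:gpr32, %bb.loop
/// \endcode
/// InitVal is set to %init and LoopVal is set to %next.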
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
                       Register &InitVal, Register &LoopVal) {
  assert(Phi.isPHI() && "Expecting a Phi.");

  InitVal = Register();
  LoopVal = Register();
  for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
    if (Phi.getOperand(i + 1).getMBB() != Loop)
      InitVal = Phi.getOperand(i).getReg();
    else
      LoopVal = Phi.getOperand(i).getReg();

  assert(InitVal && LoopVal && "Unexpected Phi structure.");
}

/// Return the Phi register value that comes from the loop block.
static Register getLoopPhiReg(const MachineInstr &Phi,
                              const MachineBasicBlock *LoopBB) {
  for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
    if (Phi.getOperand(i + 1).getMBB() == LoopBB)
      return Phi.getOperand(i).getReg();
  return Register();
}

/// Return true if SUb can be reached from SUa following the chain edges.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
  SmallPtrSet<SUnit *, 8> Visited;
  SmallVector<SUnit *, 8> Worklist;
  Worklist.push_back(SUa);
  while (!Worklist.empty()) {
    const SUnit *SU = Worklist.pop_back_val();
    for (const auto &SI : SU->Succs) {
      SUnit *SuccSU = SI.getSUnit();
      if (SI.getKind() == SDep::Order) {
        if (Visited.count(SuccSU))
          continue;
        if (SuccSU == SUb)
          return true;
        Worklist.push_back(SuccSU);
        Visited.insert(SuccSU);
      }
    }
  }
  return false;
}

SUnitWithMemInfo::SUnitWithMemInfo(SUnit *SU) : SU(SU) {
  if (!getUnderlyingObjects())
    return;
  // Assume all objects are identified until one is found that isn't.
  IsAllIdentified = true;
  for (const Value *Obj : UnderlyingObjs)
    if (!isIdentifiedObject(Obj)) {
      IsAllIdentified = false;
      break;
    }
}

bool SUnitWithMemInfo::isTriviallyDisjoint(
    const SUnitWithMemInfo &Other) const {
  // If all underlying objects are identified objects and there is no overlap
  // between them, then these two instructions are disjoint.
  if (!IsAllIdentified || !Other.IsAllIdentified)
    return false;
  for (const Value *Obj : UnderlyingObjs)
    if (llvm::is_contained(Other.UnderlyingObjs, Obj))
      return false;
  return true;
}

/// Collect the underlying objects for the memory references of an instruction.
/// This function calls the code in ValueTracking, but first checks that the
/// instruction has a memory operand.
/// Returns false if we cannot find the underlying objects.
bool SUnitWithMemInfo::getUnderlyingObjects() {
  const MachineInstr *MI = SU->getInstr();
  if (!MI->hasOneMemOperand())
    return false;
  MachineMemOperand *MM = *MI->memoperands_begin();
  if (!MM->getValue())
    return false;
  MemOpValue = MM->getValue();
  MemOpOffset = MM->getOffset();
  llvm::getUnderlyingObjects(MemOpValue, UnderlyingObjs);

  // TODO: A no-alias scope may be valid only in a single iteration. In this
  // case we need to peel it off like LoopAccessAnalysis does.
  AATags = MM->getAAInfo();
  return true;
}

/// Returns true if there is a loop-carried order dependency from \p Src to \p
/// Dst.
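/// For example (illustrative C source), in
/// \code
///   for (int i = 0; i < n; i++)
///     a[i + 1] = a[i] + 1; // load a[i], then store to a[i + 1]
/// \endcode
/// the store to a[i + 1] in iteration i writes the location that the load
/// reads in iteration i + 1, so there is a loop-carried order dependency from
/// the load to the store.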
static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src,
                                 const SUnitWithMemInfo &Dst,
                                 BatchAAResults &BAA,
                                 const TargetInstrInfo *TII,
                                 const TargetRegisterInfo *TRI) {
  if (Src.isTriviallyDisjoint(Dst))
    return false;
  if (isSuccOrder(Src.SU, Dst.SU))
    return false;

  MachineInstr &SrcMI = *Src.SU->getInstr();
  MachineInstr &DstMI = *Dst.SU->getInstr();
  // First, perform the cheaper check that compares the base registers.
  // If they are the same and the load offset is less than the store
  // offset, then mark the dependence as potentially loop carried.
  const MachineOperand *BaseOp1, *BaseOp2;
  int64_t Offset1, Offset2;
  bool Offset1IsScalable, Offset2IsScalable;
  if (TII->getMemOperandWithOffset(SrcMI, BaseOp1, Offset1, Offset1IsScalable,
                                   TRI) &&
      TII->getMemOperandWithOffset(DstMI, BaseOp2, Offset2, Offset2IsScalable,
                                   TRI)) {
    if (BaseOp1->isIdenticalTo(*BaseOp2) &&
        Offset1IsScalable == Offset2IsScalable && (int)Offset1 < (int)Offset2) {
      assert(TII->areMemAccessesTriviallyDisjoint(SrcMI, DstMI) &&
             "What happened to the chain edge?");
      return true;
    }
  }

  // Second, the more expensive check that uses alias analysis on the
  // base registers. If they alias, and the load offset is less than
  // the store offset, then mark the dependence as loop carried.
  if (Src.isUnknown() || Dst.isUnknown())
    return true;
  if (Src.MemOpValue == Dst.MemOpValue && Src.MemOpOffset <= Dst.MemOpOffset)
    return true;

  if (BAA.isNoAlias(
          MemoryLocation::getBeforeOrAfter(Src.MemOpValue, Src.AATags),
          MemoryLocation::getBeforeOrAfter(Dst.MemOpValue, Dst.AATags)))
    return false;

  // AliasAnalysis sometimes gives up on following the underlying
  // object. In such a case, separate checks for underlying objects may
  // prove that there are no aliases between two accesses.
  for (const Value *SrcObj : Src.UnderlyingObjs)
    for (const Value *DstObj : Dst.UnderlyingObjs)
      if (!BAA.isNoAlias(MemoryLocation::getBeforeOrAfter(SrcObj, Src.AATags),
                         MemoryLocation::getBeforeOrAfter(DstObj, Dst.AATags)))
        return true;

  return false;
}

void LoopCarriedOrderDepsTracker::LoadStoreChunk::append(SUnit *SU) {
  const MachineInstr *MI = SU->getInstr();
  if (!MI->mayLoadOrStore())
    return;
  (MI->mayStore() ? Stores : Loads).emplace_back(SU);
}

LoopCarriedOrderDepsTracker::LoopCarriedOrderDepsTracker(
    SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII,
    const TargetRegisterInfo *TRI)
    : DAG(SSD), BAA(BAA), SUnits(DAG->SUnits), N(SUnits.size()),
      LoopCarried(N, BitVector(N)), TII(TII), TRI(TRI) {}

void LoopCarriedOrderDepsTracker::computeDependencies() {
  // Traverse all instructions and extract only what we are targeting.
  for (auto &SU : SUnits) {
    auto Tagged = getInstrTag(&SU);

    // This instruction has no loop-carried order-dependencies.
    if (!Tagged)
      continue;
    TaggedSUnits.emplace_back(&SU, *Tagged);
  }

  computeDependenciesAux();
}

std::optional<LoopCarriedOrderDepsTracker::InstrTag>
LoopCarriedOrderDepsTracker::getInstrTag(SUnit *SU) const {
  MachineInstr *MI = SU->getInstr();
  if (TII->isGlobalMemoryObject(MI))
    return InstrTag::Barrier;

  if (MI->mayStore() ||
      (MI->mayLoad() && !MI->isDereferenceableInvariantLoad()))
    return InstrTag::LoadOrStore;

  if (MI->mayRaiseFPException())
    return InstrTag::FPExceptions;

  return std::nullopt;
}

void LoopCarriedOrderDepsTracker::addLoopCarriedDepenenciesForChunks(
    const LoadStoreChunk &From, const LoadStoreChunk &To) {
  // Add dependencies for load-to-store (WAR) from top to bottom.
  for (const SUnitWithMemInfo &Src : From.Loads)
    for (const SUnitWithMemInfo &Dst : To.Stores)
      if (Src.SU->NodeNum < Dst.SU->NodeNum &&
          hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI))
        LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum);

  // TODO: The following dependencies are missed.
  //
  // - Dependencies for load-to-store from bottom to top.
  // - Dependencies for store-to-load (RAW).
  // - Dependencies for store-to-store (WAW).
}

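// The instruction stream is split into chunks at barrier events, and
// loop-carried order-dependencies are added only within each chunk. For
// example (illustrative), the sequence
//   load A; store B; <barrier>; load C; store D
// forms the chunks {load A, store B} and {load C, store D}; no loop-carried
// edge is added between B and C because the barrier already orders them.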
void LoopCarriedOrderDepsTracker::computeDependenciesAux() {
  SmallVector<LoadStoreChunk, 2> Chunks(1);
  for (const auto &TSU : TaggedSUnits) {
    InstrTag Tag = TSU.getTag();
    SUnit *SU = TSU.getPointer();
    switch (Tag) {
    case InstrTag::Barrier:
      Chunks.emplace_back();
      break;
    case InstrTag::LoadOrStore:
      Chunks.back().append(SU);
      break;
    case InstrTag::FPExceptions:
      // TODO: Handle this properly.
      break;
    }
  }

  // Add dependencies between memory operations. If there are one or more
  // barrier events between two memory instructions, we don't add a
  // loop-carried dependence for them.
  for (const LoadStoreChunk &Chunk : Chunks)
    addLoopCarriedDepenenciesForChunks(Chunk, Chunk);

  // TODO: If there are multiple barrier instructions, add dependencies from
  // the last barrier instruction (or load/store below it) to the first
  // barrier instruction (or load/store above it).
}

/// Add a chain edge between a load and store if the store can be an
/// alias of the load on a subsequent iteration, i.e., a loop carried
/// dependence. This code is very similar to the code in ScheduleDAGInstrs
/// but that code doesn't create loop carried dependences.
/// TODO: Also compute output-dependencies.
LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences() {
  LoopCarriedEdges LCE;

  // Add loop-carried order-dependencies.
  LoopCarriedOrderDepsTracker LCODTracker(this, &BAA, TII, TRI);
  LCODTracker.computeDependencies();
  for (unsigned I = 0; I != SUnits.size(); I++)
    for (const int Succ : LCODTracker.getLoopCarried(I).set_bits())
      LCE.OrderDeps[&SUnits[I]].insert(&SUnits[Succ]);

  LCE.modifySUnits(SUnits);
  return LCE;
}

/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
/// processes dependences for PHIs. This function adds true dependences
/// from a PHI to a use, and a loop carried dependence from the use to the
/// PHI. The loop carried dependence is represented as an anti dependence
/// edge. This function also removes chain dependences between unrelated
/// PHIs.
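/// For example (illustrative MIR), for the loop body
/// \code
///   %p = PHI %init, %bb.ph, %v, %bb.loop
///   %v = ADD %p, 1
/// \endcode
/// a true dependence edge is added from the PHI defining %p to its use in the
/// ADD, and a loop-carried anti dependence edge is added from the ADD (which
/// defines %v) back to the PHI that consumes %v on the next iteration.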
void SwingSchedulerDAG::updatePhiDependences() {
  SmallVector<SDep, 4> RemoveDeps;
  const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();

  // Iterate over each DAG node.
  for (SUnit &I : SUnits) {
    RemoveDeps.clear();
    // Records a register of the instruction that is used or defined by a Phi.
    Register HasPhiUse;
    Register HasPhiDef;
    MachineInstr *MI = I.getInstr();
    // Iterate over each operand, and we process the definitions.
    for (const MachineOperand &MO : MI->operands()) {
      if (!MO.isReg())
        continue;
      Register Reg = MO.getReg();
      if (MO.isDef()) {
        // If the register is used by a Phi, then create an anti dependence.
        for (MachineRegisterInfo::use_instr_iterator
                 UI = MRI.use_instr_begin(Reg),
                 UE = MRI.use_instr_end();
             UI != UE; ++UI) {
          MachineInstr *UseMI = &*UI;
          SUnit *SU = getSUnit(UseMI);
          if (SU != nullptr && UseMI->isPHI()) {
            if (!MI->isPHI()) {
              SDep Dep(SU, SDep::Anti, Reg);
              Dep.setLatency(1);
              I.addPred(Dep);
            } else {
              HasPhiDef = Reg;
              // Add a chain edge to a dependent Phi that isn't an existing
              // predecessor.
              if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
                I.addPred(SDep(SU, SDep::Barrier));
            }
          }
        }
      } else if (MO.isUse()) {
        // If the register is defined by a Phi, then create a true dependence.
        MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
        if (DefMI == nullptr)
          continue;
        SUnit *SU = getSUnit(DefMI);
        if (SU != nullptr && DefMI->isPHI()) {
          if (!MI->isPHI()) {
            SDep Dep(SU, SDep::Data, Reg);
            Dep.setLatency(0);
            ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
                                     &SchedModel);
            I.addPred(Dep);
          } else {
            HasPhiUse = Reg;
            // Add a chain edge to a dependent Phi that isn't an existing
            // predecessor.
            if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
              I.addPred(SDep(SU, SDep::Barrier));
          }
        }
      }
    }
    // Remove order dependences from an unrelated Phi.
    if (!SwpPruneDeps)
      continue;
    for (auto &PI : I.Preds) {
      MachineInstr *PMI = PI.getSUnit()->getInstr();
      if (PMI->isPHI() && PI.getKind() == SDep::Order) {
        if (I.getInstr()->isPHI()) {
          if (PMI->getOperand(0).getReg() == HasPhiUse)
            continue;
          if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
            continue;
        }
        RemoveDeps.push_back(PI);
      }
    }
    for (const SDep &D : RemoveDeps)
      I.removePred(D);
  }
}

/// Iterate over each DAG node and see if we can change any dependences
/// in order to reduce the recurrence MII.
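/// For example (illustrative), for a target with a post-increment load
/// \code
///   %b1 = POSTINC_LOAD %b0, 4   ; loads [%b0], then defines %b1 = %b0 + 4
/// \endcode
/// %b1 is the next iteration's base. If an instruction can instead use %b0
/// from the previous iteration with its offset adjusted by 4, the dependence
/// on the increment disappears, which can shorten a recurrence.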
void SwingSchedulerDAG::changeDependences() {
  // See if an instruction can use a value from the previous iteration.
  // If so, we update the base and offset of the instruction and change
  // the dependences.
  for (SUnit &I : SUnits) {
    unsigned BasePos = 0, OffsetPos = 0;
    Register NewBase;
    int64_t NewOffset = 0;
    if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
                               NewOffset))
      continue;

    // Get the MI and SUnit for the instruction that defines the original base.
    Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
    MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
    if (!DefMI)
      continue;
    SUnit *DefSU = getSUnit(DefMI);
    if (!DefSU)
      continue;
    // Get the MI and SUnit for the instruction that defines the new base.
    MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
    if (!LastMI)
      continue;
    SUnit *LastSU = getSUnit(LastMI);
    if (!LastSU)
      continue;

    if (Topo.IsReachable(&I, LastSU))
      continue;

    // Remove the dependence. The value now depends on a prior iteration.
    SmallVector<SDep, 4> Deps;
    for (const SDep &P : I.Preds)
      if (P.getSUnit() == DefSU)
        Deps.push_back(P);
    for (const SDep &D : Deps) {
      Topo.RemovePred(&I, D.getSUnit());
      I.removePred(D);
    }
    // Remove the chain dependence between the instructions.
    Deps.clear();
    for (auto &P : LastSU->Preds)
      if (P.getSUnit() == &I && P.getKind() == SDep::Order)
        Deps.push_back(P);
    for (const SDep &D : Deps) {
      Topo.RemovePred(LastSU, D.getSUnit());
      LastSU->removePred(D);
    }

    // Add a dependence between the new instruction and the instruction
    // that defines the new base.
    SDep Dep(&I, SDep::Anti, NewBase);
    Topo.AddPred(LastSU, &I);
    LastSU->addPred(Dep);

    // Remember the base and offset information so that we can update the
    // instruction during code generation.
    InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
  }
}

/// Create an instruction stream that represents a single iteration and stage
/// of each instruction. This function differs from
/// SMSchedule::finalizeSchedule in that it has no side effects on
/// SwingSchedulerDAG. That is, this function is an approximation of
/// SMSchedule::finalizeSchedule with all non-const operations removed.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD,
                                  SMSchedule &Schedule,
                                  std::vector<MachineInstr *> &OrderedInsts,
                                  DenseMap<MachineInstr *, unsigned> &Stages) {
  DenseMap<int, std::deque<SUnit *>> Instrs;

  // Move all instructions to the first stage from the later stages.
  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
         Stage <= LastStage; ++Stage) {
      for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
               Cycle + Stage * Schedule.getInitiationInterval()))) {
        Instrs[Cycle].push_front(SU);
      }
    }
  }

  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
    CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
    for (SUnit *SU : CycleInstrs) {
      MachineInstr *MI = SU->getInstr();
      OrderedInsts.push_back(MI);
      Stages[MI] = Schedule.stageScheduled(SU);
    }
  }
}

namespace {

// FuncUnitSorter - Comparison operator used to sort instructions by
// the number of functional unit choices.
struct FuncUnitSorter {
  const InstrItineraryData *InstrItins;
  const MCSubtargetInfo *STI;
  DenseMap<InstrStage::FuncUnits, unsigned> Resources;

  FuncUnitSorter(const TargetSubtargetInfo &TSI)
      : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}

  // Compute the number of functional unit alternatives needed
  // at each stage, and take the minimum value. We prioritize the
  // instructions by the least number of choices first.
  unsigned minFuncUnits(const MachineInstr *Inst,
                        InstrStage::FuncUnits &F) const {
    unsigned SchedClass = Inst->getDesc().getSchedClass();
    unsigned min = UINT_MAX;
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits funcUnits = IS.getUnits();
        unsigned numAlternatives = llvm::popcount(funcUnits);
        if (numAlternatives < min) {
          min = numAlternatives;
          F = funcUnits;
        }
      }
      return min;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo.
        return min;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        const MCProcResourceDesc *ProcResource =
            STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
        unsigned NumUnits = ProcResource->NumUnits;
        if (NumUnits < min) {
          min = NumUnits;
          F = PRE.ProcResourceIdx;
        }
      }
      return min;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  // Compute the critical resources needed by the instruction. This
  // function records the functional units needed by instructions that
  // must use only one functional unit. We use this as a tie breaker
  // for computing the resource MII. The instructions that require
  // the same, highly used, functional unit have high priority.
  void calcCriticalResources(MachineInstr &MI) {
    unsigned SchedClass = MI.getDesc().getSchedClass();
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits FuncUnits = IS.getUnits();
        if (llvm::popcount(FuncUnits) == 1)
          Resources[FuncUnits]++;
      }
      return;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo.
        return;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        Resources[PRE.ProcResourceIdx]++;
      }
      return;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  /// Return true if IS1 has lower priority than IS2.
  bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
    InstrStage::FuncUnits F1 = 0, F2 = 0;
    unsigned MFUs1 = minFuncUnits(IS1, F1);
    unsigned MFUs2 = minFuncUnits(IS2, F2);
    if (MFUs1 == MFUs2)
      return Resources.lookup(F1) < Resources.lookup(F2);
    return MFUs1 > MFUs2;
  }
};

/// Calculate the maximum register pressure of the scheduled instruction
/// stream.
class HighRegisterPressureDetector {
  MachineBasicBlock *OrigMBB;
  const MachineRegisterInfo &MRI;
  const TargetRegisterInfo *TRI;

  const unsigned PSetNum;

  // Indexed by PSet ID.
  // InitSetPressure takes into account the register pressure of live-in
  // registers. It does not depend on how the loop is scheduled, so it is
  // enough to calculate it once at the beginning.
  std::vector<unsigned> InitSetPressure;

  // Indexed by PSet ID.
  // Upper limit for each register pressure set.
  std::vector<unsigned> PressureSetLimit;

  DenseMap<MachineInstr *, RegisterOperands> ROMap;

  using Instr2LastUsesTy = DenseMap<MachineInstr *, SmallDenseSet<Register, 4>>;

public:
  using OrderedInstsTy = std::vector<MachineInstr *>;
  using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;

private:
  static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
    if (Pressures.size() == 0) {
      dbgs() << "[]";
    } else {
      char Prefix = '[';
      for (unsigned P : Pressures) {
        dbgs() << Prefix << P;
        Prefix = ' ';
      }
      dbgs() << ']';
    }
  }

  void dumpPSet(Register Reg) const {
    dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
    for (auto PSetIter = MRI.getPressureSets(Reg); PSetIter.isValid();
         ++PSetIter) {
      dbgs() << *PSetIter << ' ';
    }
    dbgs() << '\n';
  }

  void increaseRegisterPressure(std::vector<unsigned> &Pressure,
                                Register Reg) const {
    auto PSetIter = MRI.getPressureSets(Reg);
    unsigned Weight = PSetIter.getWeight();
    for (; PSetIter.isValid(); ++PSetIter)
      Pressure[*PSetIter] += Weight;
  }

  void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
                                Register Reg) const {
    auto PSetIter = MRI.getPressureSets(Reg);
    unsigned Weight = PSetIter.getWeight();
    for (; PSetIter.isValid(); ++PSetIter) {
      auto &P = Pressure[*PSetIter];
      assert(P >= Weight &&
             "register pressure must be greater than or equal weight");
      P -= Weight;
    }
  }

  // Return true if Reg is a reserved register, for example, the stack pointer.
  bool isReservedRegister(Register Reg) const {
    return Reg.isPhysical() && MRI.isReserved(Reg.asMCReg());
  }

  bool isDefinedInThisLoop(Register Reg) const {
    return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
  }

  // Search for live-in variables. They are factored into the register pressure
  // from the beginning. Live-in variables used by every iteration should be
  // considered as alive throughout the loop. For example, the variable `c` in
  // the following code. \code
  //   int c = ...;
  //   for (int i = 0; i < n; i++)
  //     a[i] += b[i] + c;
  // \endcode
  void computeLiveIn() {
    DenseSet<Register> Used;
    for (auto &MI : *OrigMBB) {
      if (MI.isDebugInstr())
        continue;
      for (auto &Use : ROMap[&MI].Uses) {
        auto Reg = Use.RegUnit;
        // Ignore a variable that appears only on one side of a phi
        // instruction because it's used only at the first iteration.
        if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
          continue;
        if (isReservedRegister(Reg))
          continue;
        if (isDefinedInThisLoop(Reg))
          continue;
        Used.insert(Reg);
      }
    }

    for (auto LiveIn : Used)
      increaseRegisterPressure(InitSetPressure, LiveIn);
  }

  // Calculate the upper limit of each pressure set.
  void computePressureSetLimit(const RegisterClassInfo &RCI) {
    for (unsigned PSet = 0; PSet < PSetNum; PSet++)
      PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
  }
1529
1530 // There are two patterns of last-use.
1531 // - by an instruction of the current iteration
1532 // - by a phi instruction of the next iteration (loop carried value)
1533 //
1534 // Furthermore, following two groups of instructions are executed
1535 // simultaneously
1536 // - next iteration's phi instructions in i-th stage
1537 // - current iteration's instructions in i+1-th stage
1538 //
1539 // This function calculates the last-use of each register while taking into
1540 // account the above two patterns.
1541 Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
1542 Instr2StageTy &Stages) const {
1543 // We treat virtual registers that are defined and used in this loop.
1544 // Following virtual register will be ignored
1545 // - live-in one
1546 // - defined but not used in the loop (potentially live-out)
1547 DenseSet<Register> TargetRegs;
1548 const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
1549 if (isDefinedInThisLoop(Reg))
1550 TargetRegs.insert(V: Reg);
1551 };
1552 for (MachineInstr *MI : OrderedInsts) {
1553 if (MI->isPHI()) {
1554 Register Reg = getLoopPhiReg(Phi: *MI, LoopBB: OrigMBB);
1555 UpdateTargetRegs(Reg);
1556 } else {
1557 for (auto &Use : ROMap.find(Val: MI)->getSecond().Uses)
1558 UpdateTargetRegs(Use.RegUnit);
1559 }
1560 }
1561
1562 const auto InstrScore = [&Stages](MachineInstr *MI) {
1563 return Stages[MI] + MI->isPHI();
1564 };
1565
1566 DenseMap<Register, MachineInstr *> LastUseMI;
1567 for (MachineInstr *MI : llvm::reverse(C: OrderedInsts)) {
1568 for (auto &Use : ROMap.find(Val: MI)->getSecond().Uses) {
1569 auto Reg = Use.RegUnit;
1570 if (!TargetRegs.contains(V: Reg))
1571 continue;
1572 auto [Ite, Inserted] = LastUseMI.try_emplace(Key: Reg, Args&: MI);
1573 if (!Inserted) {
1574 MachineInstr *Orig = Ite->second;
1575 MachineInstr *New = MI;
1576 if (InstrScore(Orig) < InstrScore(New))
1577 Ite->second = New;
1578 }
1579 }
1580 }
1581
1582 Instr2LastUsesTy LastUses;
1583 for (auto &Entry : LastUseMI)
1584 LastUses[Entry.second].insert(V: Entry.first);
1585 return LastUses;
1586 }
1587
1588 // Compute the maximum register pressure of the kernel. We'll simulate #Stage
1589 // iterations and check the register pressure at the point where all stages
1590 // overlap.
1591 //
1592 // An example of an unrolled loop where #Stage is 4:
1593 // Iter i+0 i+1 i+2 i+3
1594 // ------------------------
1595 // Stage 0
1596 // Stage 1 0
1597 // Stage 2 1 0
1598 // Stage 3 2 1 0 <- All stages overlap
1599 //
1600 std::vector<unsigned>
1601 computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
1602 Instr2StageTy &Stages,
1603 const unsigned StageCount) const {
1604 using RegSetTy = SmallDenseSet<Register, 16>;
1605
1606 // Indexed by #Iter. To treat "local" variables of each stage separately, we
1607 // track the liveness of the registers independently per iteration.
1608 SmallVector<RegSetTy> LiveRegSets(StageCount);
1609
1610 auto CurSetPressure = InitSetPressure;
1611 auto MaxSetPressure = InitSetPressure;
1612 auto LastUses = computeLastUses(OrderedInsts, Stages);
1613
1614 LLVM_DEBUG({
1615 dbgs() << "Ordered instructions:\n";
1616 for (MachineInstr *MI : OrderedInsts) {
1617 dbgs() << "Stage " << Stages[MI] << ": ";
1618 MI->dump();
1619 }
1620 });
1621
1622 const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
1623 Register Reg) {
1624 if (!Reg.isValid() || isReservedRegister(Reg))
1625 return;
1626
1627 bool Inserted = RegSet.insert(V: Reg).second;
1628 if (!Inserted)
1629 return;
1630
1631 LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
1632 increaseRegisterPressure(Pressure&: CurSetPressure, Reg);
1633 LLVM_DEBUG(dumpPSet(Reg));
1634 };
1635
1636 const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
1637 Register Reg) {
1638 if (!Reg.isValid() || isReservedRegister(Reg))
1639 return;
1640
1641 // live-in register
1642 if (!RegSet.contains(V: Reg))
1643 return;
1644
1645 LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
1646 RegSet.erase(V: Reg);
1647 decreaseRegisterPressure(Pressure&: CurSetPressure, Reg);
1648 LLVM_DEBUG(dumpPSet(Reg));
1649 };
1650
1651 for (unsigned I = 0; I < StageCount; I++) {
1652 for (MachineInstr *MI : OrderedInsts) {
1653 const auto Stage = Stages[MI];
1654 if (I < Stage)
1655 continue;
1656
1657 const unsigned Iter = I - Stage;
1658
1659 for (auto &Def : ROMap.find(Val: MI)->getSecond().Defs)
1660 InsertReg(LiveRegSets[Iter], Def.RegUnit);
1661
1662 for (auto LastUse : LastUses[MI]) {
1663 if (MI->isPHI()) {
1664 if (Iter != 0)
1665 EraseReg(LiveRegSets[Iter - 1], LastUse);
1666 } else {
1667 EraseReg(LiveRegSets[Iter], LastUse);
1668 }
1669 }
1670
1671 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1672 MaxSetPressure[PSet] =
1673 std::max(a: MaxSetPressure[PSet], b: CurSetPressure[PSet]);
1674
1675 LLVM_DEBUG({
1676 dbgs() << "CurSetPressure=";
1677 dumpRegisterPressures(CurSetPressure);
1678 dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
1679 MI->dump();
1680 });
1681 }
1682 }
1683
1684 return MaxSetPressure;
1685 }
1686
1687public:
1688 HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
1689 const MachineFunction &MF)
1690 : OrigMBB(OrigMBB), MRI(MF.getRegInfo()),
1691 TRI(MF.getSubtarget().getRegisterInfo()),
1692 PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
1693 PressureSetLimit(PSetNum, 0) {}
1694
1695 // Perform the initialization for calculating register pressure that is
1696 // independent of the loop schedule.
1697 void init(const RegisterClassInfo &RCI) {
1698 for (MachineInstr &MI : *OrigMBB) {
1699 if (MI.isDebugInstr())
1700 continue;
1701 ROMap[&MI].collect(MI, TRI: *TRI, MRI, TrackLaneMasks: false, IgnoreDead: true);
1702 }
1703
1704 computeLiveIn();
1705 computePressureSetLimit(RCI);
1706 }
1707
1708 // Calculate the maximum register pressures of the loop and check if they
1709 // exceed the limit
1710 bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
1711 const unsigned MaxStage) const {
1712 assert(0 <= RegPressureMargin && RegPressureMargin <= 100 &&
1713 "the percentage of the margin must be between 0 to 100");
1714
1715 OrderedInstsTy OrderedInsts;
1716 Instr2StageTy Stages;
1717 computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
1718 const auto MaxSetPressure =
1719 computeMaxSetPressure(OrderedInsts, Stages, StageCount: MaxStage + 1);
1720
1721 LLVM_DEBUG({
1722 dbgs() << "Dump MaxSetPressure:\n";
1723 for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
1724 dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
1725 }
1726 dbgs() << '\n';
1727 });
1728
1729 for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
1730 unsigned Limit = PressureSetLimit[PSet];
1731 unsigned Margin = Limit * RegPressureMargin / 100;
1732 LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
1733 << " Margin=" << Margin << "\n");
1734 if (Limit < MaxSetPressure[PSet] + Margin) {
1735 LLVM_DEBUG(
1736 dbgs()
1737 << "Rejected the schedule because of too high register pressure\n");
1738 return true;
1739 }
1740 }
1741 return false;
1742 }
1743};
1744
1745} // end anonymous namespace
1746
1747/// Calculate the resource constrained minimum initiation interval for the
1748/// specified loop. We use the DFA to model the resources needed for
1749/// each instruction, and we ignore dependences. A different DFA is created
1750/// for each cycle that is required. When adding a new instruction, we attempt
1751/// to add it to each existing DFA, until a legal space is found. If the
1752/// instruction cannot be reserved in an existing DFA, we create a new one.
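/// As an illustrative sketch (the resource counts are hypothetical): if a
/// loop contains four memory operations and the target's DFA admits at most
/// two memory operations per cycle, at least two cycles are needed, so the
/// resource-constrained MII is 2.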
1753unsigned SwingSchedulerDAG::calculateResMII() {
1754 LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1755 ResourceManager RM(&MF.getSubtarget(), this);
1756 return RM.calculateResMII();
1757}
1758
1759 /// Calculate the recurrence-constrained minimum initiation interval.
1760/// Iterate over each circuit. Compute the delay(c) and distance(c)
1761/// for each circuit. The II needs to satisfy the inequality
1762/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1763/// II that satisfies the inequality, and the RecMII is the maximum
1764/// of those values.
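/// For example, a circuit with delay(c) = 7 and distance(c) = 2 requires
/// II >= ceil(7 / 2) = 4, since 7 - 4 * 2 <= 0 but 7 - 3 * 2 > 0.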
1765unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1766 unsigned RecMII = 0;
1767
1768 for (NodeSet &Nodes : NodeSets) {
1769 if (Nodes.empty())
1770 continue;
1771
1772 unsigned Delay = Nodes.getLatency();
1773 unsigned Distance = 1;
1774
1775 // ii = ceil(delay / distance)
1776 unsigned CurMII = (Delay + Distance - 1) / Distance;
1777 Nodes.setRecMII(CurMII);
1778 if (CurMII > RecMII)
1779 RecMII = CurMII;
1780 }
1781
1782 return RecMII;
1783}
1784
1785/// Create the adjacency structure of the nodes in the graph.
1786void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1787 SwingSchedulerDAG *DAG) {
1788 BitVector Added(SUnits.size());
1789 DenseMap<int, int> OutputDeps;
1790 for (int i = 0, e = SUnits.size(); i != e; ++i) {
1791 Added.reset();
1792 // Add any successor to the adjacency matrix and exclude duplicates.
1793 for (auto &OE : DAG->DDG->getOutEdges(SU: &SUnits[i])) {
1794 // Only create a back-edge on the first and last nodes of a dependence
1795 // chain. This records the chains here; the back-edges are added later.
1796 if (OE.isOutputDep()) {
1797 int N = OE.getDst()->NodeNum;
1798 int BackEdge = i;
1799 auto Dep = OutputDeps.find(Val: BackEdge);
1800 if (Dep != OutputDeps.end()) {
1801 BackEdge = Dep->second;
1802 OutputDeps.erase(I: Dep);
1803 }
1804 OutputDeps[N] = BackEdge;
1805 }
1806 // Do not process a boundary node, an artificial node.
1807 if (OE.getDst()->isBoundaryNode() || OE.isArtificial())
1808 continue;
1809
1810 // This code is retained to preserve previous behavior and prevent a
1811 // regression. This condition means that anti-dependencies within an
1812 // iteration are ignored when searching for circuits. It would be natural
1813 // to consider such dependences as well.
1814 // FIXME: Remove this code if it doesn't have significant impact on
1815 // performance.
1816 if (OE.isAntiDep())
1817 continue;
1818
1819 int N = OE.getDst()->NodeNum;
1820 if (!Added.test(Idx: N)) {
1821 AdjK[i].push_back(Elt: N);
1822 Added.set(N);
1823 }
1824 }
1825 // A chain edge between a store and a load is treated as a back-edge in the
1826 // adjacency matrix.
1827 for (auto &IE : DAG->DDG->getInEdges(SU: &SUnits[i])) {
1828 SUnit *Src = IE.getSrc();
1829 SUnit *Dst = IE.getDst();
1830 if (!Dst->getInstr()->mayStore() || !DAG->isLoopCarriedDep(Edge: IE))
1831 continue;
1832 if (IE.isOrderDep() && Src->getInstr()->mayLoad()) {
1833 int N = Src->NodeNum;
1834 if (!Added.test(Idx: N)) {
1835 AdjK[i].push_back(Elt: N);
1836 Added.set(N);
1837 }
1838 }
1839 }
1840 }
1841 // Add back-edges in the adjacency matrix for the output dependences.
1842 for (auto &OD : OutputDeps)
1843 if (!Added.test(Idx: OD.second)) {
1844 AdjK[OD.first].push_back(Elt: OD.second);
1845 Added.set(OD.second);
1846 }
1847}
1848
1849/// Identify an elementary circuit in the dependence graph starting at the
1850/// specified node.
1851bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1852 const SwingSchedulerDAG *DAG,
1853 bool HasBackedge) {
1854 SUnit *SV = &SUnits[V];
1855 bool F = false;
1856 Stack.insert(X: SV);
1857 Blocked.set(V);
1858
1859 for (auto W : AdjK[V]) {
1860 if (NumPaths > MaxPaths)
1861 break;
1862 if (W < S)
1863 continue;
1864 if (W == S) {
1865 if (!HasBackedge)
1866 NodeSets.push_back(Elt: NodeSet(Stack.begin(), Stack.end(), DAG));
1867 F = true;
1868 ++NumPaths;
1869 break;
1870 }
1871 if (!Blocked.test(Idx: W)) {
1872 if (circuit(V: W, S, NodeSets, DAG,
1873 HasBackedge: Node2Idx->at(n: W) < Node2Idx->at(n: V) ? true : HasBackedge))
1874 F = true;
1875 }
1876 }
1877
1878 if (F)
1879 unblock(U: V);
1880 else {
1881 for (auto W : AdjK[V]) {
1882 if (W < S)
1883 continue;
1884 B[W].insert(Ptr: SV);
1885 }
1886 }
1887 Stack.pop_back();
1888 return F;
1889}
1890
1891/// Unblock a node in the circuit finding algorithm.
1892void SwingSchedulerDAG::Circuits::unblock(int U) {
1893 Blocked.reset(Idx: U);
1894 SmallPtrSet<SUnit *, 4> &BU = B[U];
1895 while (!BU.empty()) {
1896 SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
1897 assert(SI != BU.end() && "Invalid B set.");
1898 SUnit *W = *SI;
1899 BU.erase(Ptr: W);
1900 if (Blocked.test(Idx: W->NodeNum))
1901 unblock(U: W->NodeNum);
1902 }
1903}
1904
1905/// Identify all the elementary circuits in the dependence graph using
1906/// Johnson's circuit algorithm.
1907void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1908 Circuits Cir(SUnits, Topo);
1909 // Create the adjacency structure.
1910 Cir.createAdjacencyStructure(DAG: this);
1911 for (int I = 0, E = SUnits.size(); I != E; ++I) {
1912 Cir.reset();
1913 Cir.circuit(V: I, S: I, NodeSets, DAG: this);
1914 }
1915}
1916
1917 // Create artificial dependencies between the source of a COPY/REG_SEQUENCE
1918 // that is loop-carried to the USE in the next iteration. This helps the
1919 // pipeliner avoid additional copies that are needed across iterations. An
1920 // artificial dependence edge is added from the USE to the SOURCE of the
// COPY/REG_SEQUENCE.
1921
1922 // PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
1923 // SRCOfCopy------True-Dep---> COPY/REG_SEQUENCE
1924 // PHI-------True-Dep------> USEOfPhi
1925
1926 // The mutation creates
1927 // USEOfPHI -------Artificial-Dep---> SRCOfCopy
1928
1929 // Overall this ensures that USEOfPHI is scheduled before SRCOfCopy (since
1930 // USE is a predecessor), which implies that the COPY/REG_SEQUENCE is
1931 // scheduled late enough to avoid additional copies across iterations. The
1932 // possible scheduling order would be
1933 // USEOfPHI --- SRCOfCopy --- COPY/REG_SEQUENCE.
1934
1935void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
1936 for (SUnit &SU : DAG->SUnits) {
1937 // Find the COPY/REG_SEQUENCE instruction.
1938 if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
1939 continue;
1940
1941 // Record the loop carried PHIs.
1942 SmallVector<SUnit *, 4> PHISUs;
1943 // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
1944 SmallVector<SUnit *, 4> SrcSUs;
1945
1946 for (auto &Dep : SU.Preds) {
1947 SUnit *TmpSU = Dep.getSUnit();
1948 MachineInstr *TmpMI = TmpSU->getInstr();
1949 SDep::Kind DepKind = Dep.getKind();
1950 // Save the loop carried PHI.
1951 if (DepKind == SDep::Anti && TmpMI->isPHI())
1952 PHISUs.push_back(Elt: TmpSU);
1953 // Save the source of COPY/REG_SEQUENCE.
1954 // If the source has no predecessors, we will end up creating cycles.
1955 else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
1956 SrcSUs.push_back(Elt: TmpSU);
1957 }
1958
1959 if (PHISUs.size() == 0 || SrcSUs.size() == 0)
1960 continue;
1961
1962 // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
1963 // SUnit to the container.
1964 SmallVector<SUnit *, 8> UseSUs;
1965 // Do not use iterator based loop here as we are updating the container.
1966 for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
1967 for (auto &Dep : PHISUs[Index]->Succs) {
1968 if (Dep.getKind() != SDep::Data)
1969 continue;
1970
1971 SUnit *TmpSU = Dep.getSUnit();
1972 MachineInstr *TmpMI = TmpSU->getInstr();
1973 if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
1974 PHISUs.push_back(Elt: TmpSU);
1975 continue;
1976 }
1977 UseSUs.push_back(Elt: TmpSU);
1978 }
1979 }
1980
1981 if (UseSUs.size() == 0)
1982 continue;
1983
1984 SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(Val: DAG);
1985 // Add the artificial dependencies if it does not form a cycle.
1986 for (auto *I : UseSUs) {
1987 for (auto *Src : SrcSUs) {
1988 if (!SDAG->Topo.IsReachable(SU: I, TargetSU: Src) && Src != I) {
1989 Src->addPred(D: SDep(I, SDep::Artificial));
1990 SDAG->Topo.AddPred(Y: Src, X: I);
1991 }
1992 }
1993 }
1994 }
1995}
1996
1997 /// Compute several functions needed to order the nodes for scheduling.
1998/// ASAP - Earliest time to schedule a node.
1999/// ALAP - Latest time to schedule a node.
2000/// MOV - Mobility function, difference between ALAP and ASAP.
2001/// D - Depth of each node.
2002/// H - Height of each node.
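/// A sketch of the recurrences computed below, using the edge latency,
/// edge distance, and MII as defined in this file:
/// \code
///   ASAP(v) = max over edges (u,v): ASAP(u) + latency(u,v) - distance(u,v) * MII
///   ALAP(v) = min over edges (v,w): ALAP(w) - latency(v,w) + distance(v,w) * MII
///   MOV(v)  = ALAP(v) - ASAP(v)
/// \endcode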
2003void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
2004 ScheduleInfo.resize(new_size: SUnits.size());
2005
2006 LLVM_DEBUG({
2007 for (int I : Topo) {
2008 const SUnit &SU = SUnits[I];
2009 dumpNode(SU);
2010 }
2011 });
2012
2013 int maxASAP = 0;
2014 // Compute ASAP and ZeroLatencyDepth.
2015 for (int I : Topo) {
2016 int asap = 0;
2017 int zeroLatencyDepth = 0;
2018 SUnit *SU = &SUnits[I];
2019 for (const auto &IE : DDG->getInEdges(SU)) {
2020 SUnit *Pred = IE.getSrc();
2021 if (IE.getLatency() == 0)
2022 zeroLatencyDepth =
2023 std::max(a: zeroLatencyDepth, b: getZeroLatencyDepth(Node: Pred) + 1);
2024 if (IE.ignoreDependence(IgnoreAnti: true))
2025 continue;
2026 asap = std::max(a: asap, b: (int)(getASAP(Node: Pred) + IE.getLatency() -
2027 IE.getDistance() * MII));
2028 }
2029 maxASAP = std::max(a: maxASAP, b: asap);
2030 ScheduleInfo[I].ASAP = asap;
2031 ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
2032 }
2033
2034 // Compute ALAP, ZeroLatencyHeight, and MOV.
2035 for (int I : llvm::reverse(C&: Topo)) {
2036 int alap = maxASAP;
2037 int zeroLatencyHeight = 0;
2038 SUnit *SU = &SUnits[I];
2039 for (const auto &OE : DDG->getOutEdges(SU)) {
2040 SUnit *Succ = OE.getDst();
2041 if (Succ->isBoundaryNode())
2042 continue;
2043 if (OE.getLatency() == 0)
2044 zeroLatencyHeight =
2045 std::max(a: zeroLatencyHeight, b: getZeroLatencyHeight(Node: Succ) + 1);
2046 if (OE.ignoreDependence(IgnoreAnti: true))
2047 continue;
2048 alap = std::min(a: alap, b: (int)(getALAP(Node: Succ) - OE.getLatency() +
2049 OE.getDistance() * MII));
2050 }
2051
2052 ScheduleInfo[I].ALAP = alap;
2053 ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
2054 }
2055
2056 // After computing the node functions, compute the summary for each node set.
2057 for (NodeSet &I : NodeSets)
2058 I.computeNodeSetInfo(SSD: this);
2059
2060 LLVM_DEBUG({
2061 for (unsigned i = 0; i < SUnits.size(); i++) {
2062 dbgs() << "\tNode " << i << ":\n";
2063 dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
2064 dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
2065 dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
2066 dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
2067 dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
2068 dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
2069 dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
2070 }
2071 });
2072}
2073
2074/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
2075/// as the predecessors of the elements of NodeOrder that are not also in
2076/// NodeOrder.
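/// For example, if the graph has edges a->b and c->b and NodeOrder = {b},
/// then Pred_L({b}) = {a, c}.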
2077static bool pred_L(SetVector<SUnit *> &NodeOrder,
2078 SmallSetVector<SUnit *, 8> &Preds, SwingSchedulerDDG *DDG,
2079 const NodeSet *S = nullptr) {
2080 Preds.clear();
2081
2082 for (SUnit *SU : NodeOrder) {
2083 for (const auto &IE : DDG->getInEdges(SU)) {
2084 SUnit *PredSU = IE.getSrc();
2085 if (S && S->count(SU: PredSU) == 0)
2086 continue;
2087 if (IE.ignoreDependence(IgnoreAnti: true))
2088 continue;
2089 if (NodeOrder.count(key: PredSU) == 0)
2090 Preds.insert(X: PredSU);
2091 }
2092
2093 // FIXME: The following loop-carried dependencies may also need to be
2094 // considered.
2095 // - Physical register dependencies (true-dependence and WAW).
2096 // - Memory dependencies.
2097 for (const auto &OE : DDG->getOutEdges(SU)) {
2098 SUnit *SuccSU = OE.getDst();
2099 if (!OE.isAntiDep())
2100 continue;
2101 if (S && S->count(SU: SuccSU) == 0)
2102 continue;
2103 if (NodeOrder.count(key: SuccSU) == 0)
2104 Preds.insert(X: SuccSU);
2105 }
2106 }
2107 return !Preds.empty();
2108}
2109
2110/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
2111/// as the successors of the elements of NodeOrder that are not also in
2112/// NodeOrder.
2113static bool succ_L(SetVector<SUnit *> &NodeOrder,
2114 SmallSetVector<SUnit *, 8> &Succs, SwingSchedulerDDG *DDG,
2115 const NodeSet *S = nullptr) {
2116 Succs.clear();
2117
2118 for (SUnit *SU : NodeOrder) {
2119 for (const auto &OE : DDG->getOutEdges(SU)) {
2120 SUnit *SuccSU = OE.getDst();
2121 if (S && S->count(SU: SuccSU) == 0)
2122 continue;
2123 if (OE.ignoreDependence(IgnoreAnti: false))
2124 continue;
2125 if (NodeOrder.count(key: SuccSU) == 0)
2126 Succs.insert(X: SuccSU);
2127 }
2128
2129 // FIXME: The following loop-carried dependencies may also need to be
2130 // considered.
2131 // - Physical register dependencies (true-dependence and WAW).
2132 // - Memory dependencies.
2133 for (const auto &IE : DDG->getInEdges(SU)) {
2134 SUnit *PredSU = IE.getSrc();
2135 if (!IE.isAntiDep())
2136 continue;
2137 if (S && S->count(SU: PredSU) == 0)
2138 continue;
2139 if (NodeOrder.count(key: PredSU) == 0)
2140 Succs.insert(X: PredSU);
2141 }
2142 }
2143 return !Succs.empty();
2144}
2145
2146/// Return true if there is a path from the specified node to any of the nodes
2147/// in DestNodes. Keep track and return the nodes in any path.
2148static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
2149 SetVector<SUnit *> &DestNodes,
2150 SetVector<SUnit *> &Exclude,
2151 SmallPtrSet<SUnit *, 8> &Visited,
2152 SwingSchedulerDDG *DDG) {
2153 if (Cur->isBoundaryNode())
2154 return false;
2155 if (Exclude.contains(key: Cur))
2156 return false;
2157 if (DestNodes.contains(key: Cur))
2158 return true;
2159 if (!Visited.insert(Ptr: Cur).second)
2160 return Path.contains(key: Cur);
2161 bool FoundPath = false;
2162 for (const auto &OE : DDG->getOutEdges(SU: Cur))
2163 if (!OE.ignoreDependence(IgnoreAnti: false))
2164 FoundPath |=
2165 computePath(Cur: OE.getDst(), Path, DestNodes, Exclude, Visited, DDG);
2166 for (const auto &IE : DDG->getInEdges(SU: Cur))
2167 if (IE.isAntiDep() && IE.getDistance() == 0)
2168 FoundPath |=
2169 computePath(Cur: IE.getSrc(), Path, DestNodes, Exclude, Visited, DDG);
2170 if (FoundPath)
2171 Path.insert(X: Cur);
2172 return FoundPath;
2173}
2174
2175/// Compute the live-out registers for the instructions in a node-set.
2176 /// The live-out registers are those that are defined in the node-set but not
2177 /// used, where use operands of Phis do not count as uses.
2178static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
2179 NodeSet &NS) {
2180 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2181 MachineRegisterInfo &MRI = MF.getRegInfo();
2182 SmallVector<VRegMaskOrUnit, 8> LiveOutRegs;
2183 SmallSet<Register, 4> Uses;
2184 for (SUnit *SU : NS) {
2185 const MachineInstr *MI = SU->getInstr();
2186 if (MI->isPHI())
2187 continue;
2188 for (const MachineOperand &MO : MI->all_uses()) {
2189 Register Reg = MO.getReg();
2190 if (Reg.isVirtual())
2191 Uses.insert(V: Reg);
2192 else if (MRI.isAllocatable(PhysReg: Reg))
2193 Uses.insert_range(R: TRI->regunits(Reg: Reg.asMCReg()));
2194 }
2195 }
2196 for (SUnit *SU : NS)
2197 for (const MachineOperand &MO : SU->getInstr()->all_defs())
2198 if (!MO.isDead()) {
2199 Register Reg = MO.getReg();
2200 if (Reg.isVirtual()) {
2201 if (!Uses.count(V: Reg))
2202 LiveOutRegs.emplace_back(Args&: Reg, Args: LaneBitmask::getNone());
2203 } else if (MRI.isAllocatable(PhysReg: Reg)) {
2204 for (MCRegUnit Unit : TRI->regunits(Reg: Reg.asMCReg()))
2205 if (!Uses.count(V: Unit))
2206 LiveOutRegs.emplace_back(Args&: Unit, Args: LaneBitmask::getNone());
2207 }
2208 }
2209 RPTracker.addLiveRegs(Regs: LiveOutRegs);
2210}
2211
2212/// A heuristic to filter nodes in recurrent node-sets if the register
2213/// pressure of a set is too high.
2214void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
2215 for (auto &NS : NodeSets) {
2216 // Skip small node-sets since they won't cause register pressure problems.
2217 if (NS.size() <= 2)
2218 continue;
2219 IntervalPressure RecRegPressure;
2220 RegPressureTracker RecRPTracker(RecRegPressure);
2221 RecRPTracker.init(mf: &MF, rci: &RegClassInfo, lis: &LIS, mbb: BB, pos: BB->end(), TrackLaneMasks: false, TrackUntiedDefs: true);
2222 computeLiveOuts(MF, RPTracker&: RecRPTracker, NS);
2223 RecRPTracker.closeBottom();
2224
2225 std::vector<SUnit *> SUnits(NS.begin(), NS.end());
2226 llvm::sort(C&: SUnits, Comp: [](const SUnit *A, const SUnit *B) {
2227 return A->NodeNum > B->NodeNum;
2228 });
2229
2230 for (auto &SU : SUnits) {
2231 // Since we're computing the register pressure for a subset of the
2232 // instructions in a block, we need to set the tracker for each
2233 // instruction in the node-set. The tracker is set to the instruction
2234 // just after the one we're interested in.
2235 MachineBasicBlock::const_iterator CurInstI = SU->getInstr();
2236 RecRPTracker.setPos(std::next(x: CurInstI));
2237
2238 RegPressureDelta RPDelta;
2239 ArrayRef<PressureChange> CriticalPSets;
2240 RecRPTracker.getMaxUpwardPressureDelta(MI: SU->getInstr(), PDiff: nullptr, Delta&: RPDelta,
2241 CriticalPSets,
2242 MaxPressureLimit: RecRegPressure.MaxSetPressure);
2243 if (RPDelta.Excess.isValid()) {
2244 LLVM_DEBUG(
2245 dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
2246 << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
2247 << ":" << RPDelta.Excess.getUnitInc() << "\n");
2248 NS.setExceedPressure(SU);
2249 break;
2250 }
2251 RecRPTracker.recede();
2252 }
2253 }
2254}
2255
2256/// A heuristic to colocate node sets that have the same set of
2257/// successors.
2258void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
2259 unsigned Colocate = 0;
2260 for (int i = 0, e = NodeSets.size(); i < e; ++i) {
2261 NodeSet &N1 = NodeSets[i];
2262 SmallSetVector<SUnit *, 8> S1;
2263 if (N1.empty() || !succ_L(NodeOrder&: N1, Succs&: S1, DDG: DDG.get()))
2264 continue;
2265 for (int j = i + 1; j < e; ++j) {
2266 NodeSet &N2 = NodeSets[j];
2267 if (N1.compareRecMII(RHS&: N2) != 0)
2268 continue;
2269 SmallSetVector<SUnit *, 8> S2;
2270 if (N2.empty() || !succ_L(NodeOrder&: N2, Succs&: S2, DDG: DDG.get()))
2271 continue;
2272 if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
2273 N1.setColocate(++Colocate);
2274 N2.setColocate(Colocate);
2275 break;
2276 }
2277 }
2278 }
2279}
2280
2281/// Check if the existing node-sets are profitable. If not, then ignore the
2282/// recurrent node-sets, and attempt to schedule all nodes together. This is
2283/// a heuristic. If the MII is large and all the recurrent node-sets are small,
2284/// then it's best to try to schedule all instructions together instead of
2285/// starting with the recurrent node-sets.
2286void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2287 // Look for loops with a large MII.
2288 if (MII < 17)
2289 return;
2290 // Check if the node-set contains only a simple add recurrence.
2291 for (auto &NS : NodeSets) {
2292 if (NS.getRecMII() > 2)
2293 return;
2294 if (NS.getMaxDepth() > MII)
2295 return;
2296 }
2297 NodeSets.clear();
2298 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2299}
2300
2301/// Add the nodes that do not belong to a recurrence set into groups
2302/// based upon connected components.
2303void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
2304 SetVector<SUnit *> NodesAdded;
2305 SmallPtrSet<SUnit *, 8> Visited;
2306 // Add the nodes that are on a path between the previous node sets and
2307 // the current node set.
2308 for (NodeSet &I : NodeSets) {
2309 SmallSetVector<SUnit *, 8> N;
2310 // Add the nodes from the current node set to the previous node set.
2311 if (succ_L(NodeOrder&: I, Succs&: N, DDG: DDG.get())) {
2312 SetVector<SUnit *> Path;
2313 for (SUnit *NI : N) {
2314 Visited.clear();
2315 computePath(Cur: NI, Path, DestNodes&: NodesAdded, Exclude&: I, Visited, DDG: DDG.get());
2316 }
2317 if (!Path.empty())
2318 I.insert(S: Path.begin(), E: Path.end());
2319 }
2320 // Add the nodes from the previous node set to the current node set.
2321 N.clear();
2322 if (succ_L(NodeOrder&: NodesAdded, Succs&: N, DDG: DDG.get())) {
2323 SetVector<SUnit *> Path;
2324 for (SUnit *NI : N) {
2325 Visited.clear();
2326 computePath(Cur: NI, Path, DestNodes&: I, Exclude&: NodesAdded, Visited, DDG: DDG.get());
2327 }
2328 if (!Path.empty())
2329 I.insert(S: Path.begin(), E: Path.end());
2330 }
2331 NodesAdded.insert_range(R&: I);
2332 }
2333
2334 // Create a new node set with the connected nodes of any successor of a node
2335 // in a recurrent set.
2336 NodeSet NewSet;
2337 SmallSetVector<SUnit *, 8> N;
2338 if (succ_L(NodeOrder&: NodesAdded, Succs&: N, DDG: DDG.get()))
2339 for (SUnit *I : N)
2340 addConnectedNodes(SU: I, NewSet, NodesAdded);
2341 if (!NewSet.empty())
2342 NodeSets.push_back(Elt: NewSet);
2343
2344 // Create a new node set with the connected nodes of any predecessor of a node
2345 // in a recurrent set.
2346 NewSet.clear();
2347 if (pred_L(NodeOrder&: NodesAdded, Preds&: N, DDG: DDG.get()))
2348 for (SUnit *I : N)
2349 addConnectedNodes(SU: I, NewSet, NodesAdded);
2350 if (!NewSet.empty())
2351 NodeSets.push_back(Elt: NewSet);
2352
2353 // Create new node sets with the connected nodes of any remaining node that
2354 // has no predecessor.
2355 for (SUnit &SU : SUnits) {
2356 if (NodesAdded.count(key: &SU) == 0) {
2357 NewSet.clear();
2358 addConnectedNodes(SU: &SU, NewSet, NodesAdded);
2359 if (!NewSet.empty())
2360 NodeSets.push_back(Elt: NewSet);
2361 }
2362 }
2363}
2364
2365/// Add the node to the set, and add all of its connected nodes to the set.
2366void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2367 SetVector<SUnit *> &NodesAdded) {
2368 NewSet.insert(SU);
2369 NodesAdded.insert(X: SU);
2370 for (auto &OE : DDG->getOutEdges(SU)) {
2371 SUnit *Successor = OE.getDst();
2372 if (!OE.isArtificial() && !Successor->isBoundaryNode() &&
2373 NodesAdded.count(key: Successor) == 0)
2374 addConnectedNodes(SU: Successor, NewSet, NodesAdded);
2375 }
2376 for (auto &IE : DDG->getInEdges(SU)) {
2377 SUnit *Predecessor = IE.getSrc();
2378 if (!IE.isArtificial() && NodesAdded.count(key: Predecessor) == 0)
2379 addConnectedNodes(SU: Predecessor, NewSet, NodesAdded);
2380 }
2381}
2382
2383 /// Return true if Set1 contains elements that are also in Set2. The common
2384 /// elements are returned in a separate container.
2385static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
2386 SmallSetVector<SUnit *, 8> &Result) {
2387 Result.clear();
2388 for (SUnit *SU : Set1) {
2389 if (Set2.count(SU) != 0)
2390 Result.insert(X: SU);
2391 }
2392 return !Result.empty();
2393}
2394
2395/// Merge the recurrence node sets that have the same initial node.
2396void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
2397 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2398 ++I) {
2399 NodeSet &NI = *I;
2400 for (NodeSetType::iterator J = I + 1; J != E;) {
2401 NodeSet &NJ = *J;
2402 if (NI.getNode(i: 0)->NodeNum == NJ.getNode(i: 0)->NodeNum) {
2403 if (NJ.compareRecMII(RHS&: NI) > 0)
2404 NI.setRecMII(NJ.getRecMII());
2405 for (SUnit *SU : *J)
2406 I->insert(SU);
2407 NodeSets.erase(CI: J);
2408 E = NodeSets.end();
2409 } else {
2410 ++J;
2411 }
2412 }
2413 }
2414}
2415
2416/// Remove nodes that have been scheduled in previous NodeSets.
2417void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
2418 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2419 ++I)
2420 for (NodeSetType::iterator J = I + 1; J != E;) {
2421 J->remove_if(P: [&](SUnit *SUJ) { return I->count(SU: SUJ); });
2422
2423 if (J->empty()) {
2424 NodeSets.erase(CI: J);
2425 E = NodeSets.end();
2426 } else {
2427 ++J;
2428 }
2429 }
2430}
2431
2432 /// Compute an ordered list of the dependence graph nodes, which indicates
2433 /// the order in which the nodes will be scheduled. This is a two-level
2434 /// algorithm: first, a partial order is created as a list of node sets from
2435 /// highest to lowest priority; then, each set is traversed to order its nodes.
2436void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
2437 SmallSetVector<SUnit *, 8> R;
2438 NodeOrder.clear();
2439
2440 for (auto &Nodes : NodeSets) {
2441 LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
2442 OrderKind Order;
2443 SmallSetVector<SUnit *, 8> N;
2444 if (pred_L(NodeOrder, Preds&: N, DDG: DDG.get()) && llvm::set_is_subset(S1: N, S2: Nodes)) {
2445 R.insert_range(R&: N);
2446 Order = BottomUp;
2447 LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
2448 } else if (succ_L(NodeOrder, Succs&: N, DDG: DDG.get()) &&
2449 llvm::set_is_subset(S1: N, S2: Nodes)) {
2450 R.insert_range(R&: N);
2451 Order = TopDown;
2452 LLVM_DEBUG(dbgs() << " Top down (succs) ");
2453 } else if (isIntersect(Set1&: N, Set2: Nodes, Result&: R)) {
2454 // If some of the successors are in the existing node-set, then use the
2455 // top-down ordering.
2456 Order = TopDown;
2457 LLVM_DEBUG(dbgs() << " Top down (intersect) ");
2458 } else if (NodeSets.size() == 1) {
2459 for (const auto &N : Nodes)
2460 if (N->Succs.size() == 0)
2461 R.insert(X: N);
2462 Order = BottomUp;
2463 LLVM_DEBUG(dbgs() << " Bottom up (all) ");
2464 } else {
2465 // Find the node with the highest ASAP.
2466 SUnit *maxASAP = nullptr;
2467 for (SUnit *SU : Nodes) {
2468 if (maxASAP == nullptr || getASAP(Node: SU) > getASAP(Node: maxASAP) ||
2469 (getASAP(Node: SU) == getASAP(Node: maxASAP) && SU->NodeNum > maxASAP->NodeNum))
2470 maxASAP = SU;
2471 }
2472 R.insert(X: maxASAP);
2473 Order = BottomUp;
2474 LLVM_DEBUG(dbgs() << " Bottom up (default) ");
2475 }
2476
2477 while (!R.empty()) {
2478 if (Order == TopDown) {
2479 // Choose the node with the maximum height. If more than one, choose
2480 // the node with the maximum ZeroLatencyHeight. If still more than one,
2481 // choose the node with the lowest MOV.
2482 while (!R.empty()) {
2483 SUnit *maxHeight = nullptr;
2484 for (SUnit *I : R) {
2485 if (maxHeight == nullptr || getHeight(Node: I) > getHeight(Node: maxHeight))
2486 maxHeight = I;
2487 else if (getHeight(Node: I) == getHeight(Node: maxHeight) &&
2488 getZeroLatencyHeight(Node: I) > getZeroLatencyHeight(Node: maxHeight))
2489 maxHeight = I;
2490 else if (getHeight(Node: I) == getHeight(Node: maxHeight) &&
2491 getZeroLatencyHeight(Node: I) ==
2492 getZeroLatencyHeight(Node: maxHeight) &&
2493 getMOV(Node: I) < getMOV(Node: maxHeight))
2494 maxHeight = I;
2495 }
2496 NodeOrder.insert(X: maxHeight);
2497 LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
2498 R.remove(X: maxHeight);
2499 for (const auto &OE : DDG->getOutEdges(SU: maxHeight)) {
2500 SUnit *SU = OE.getDst();
2501 if (Nodes.count(SU) == 0)
2502 continue;
2503 if (NodeOrder.contains(key: SU))
2504 continue;
2505 if (OE.ignoreDependence(IgnoreAnti: false))
2506 continue;
2507 R.insert(X: SU);
2508 }
2509
2510 // FIXME: The following loop-carried dependencies may also need to be
2511 // considered.
2512 // - Physical register dependencies (true-dependence and WAW).
2513 // - Memory dependencies.
2514 for (const auto &IE : DDG->getInEdges(SU: maxHeight)) {
2515 SUnit *SU = IE.getSrc();
2516 if (!IE.isAntiDep())
2517 continue;
2518 if (Nodes.count(SU) == 0)
2519 continue;
2520 if (NodeOrder.contains(key: SU))
2521 continue;
2522 R.insert(X: SU);
2523 }
2524 }
2525 Order = BottomUp;
2526 LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
2527 SmallSetVector<SUnit *, 8> N;
2528 if (pred_L(NodeOrder, Preds&: N, DDG: DDG.get(), S: &Nodes))
2529 R.insert_range(R&: N);
2530 } else {
2531 // Choose the node with the maximum depth. If more than one, choose
2532 // the node with the maximum ZeroLatencyDepth. If still more than one,
2533 // choose the node with the lowest MOV.
2534 while (!R.empty()) {
2535 SUnit *maxDepth = nullptr;
2536 for (SUnit *I : R) {
2537 if (maxDepth == nullptr || getDepth(Node: I) > getDepth(Node: maxDepth))
2538 maxDepth = I;
2539 else if (getDepth(Node: I) == getDepth(Node: maxDepth) &&
2540 getZeroLatencyDepth(Node: I) > getZeroLatencyDepth(Node: maxDepth))
2541 maxDepth = I;
2542 else if (getDepth(Node: I) == getDepth(Node: maxDepth) &&
2543 getZeroLatencyDepth(Node: I) == getZeroLatencyDepth(Node: maxDepth) &&
2544 getMOV(Node: I) < getMOV(Node: maxDepth))
2545 maxDepth = I;
2546 }
2547 NodeOrder.insert(X: maxDepth);
2548 LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
2549 R.remove(X: maxDepth);
2550 if (Nodes.isExceedSU(SU: maxDepth)) {
2551 Order = TopDown;
2552 R.clear();
2553 R.insert(X: Nodes.getNode(i: 0));
2554 break;
2555 }
2556 for (const auto &IE : DDG->getInEdges(SU: maxDepth)) {
2557 SUnit *SU = IE.getSrc();
2558 if (Nodes.count(SU) == 0)
2559 continue;
2560 if (NodeOrder.contains(key: SU))
2561 continue;
2562 R.insert(X: SU);
2563 }
2564
2565 // FIXME: The following loop-carried dependencies may also need to be
2566 // considered.
2567 // - Physical register dependencies (true-dependence and WAW).
2568 // - Memory dependencies.
2569 for (const auto &OE : DDG->getOutEdges(SU: maxDepth)) {
2570 SUnit *SU = OE.getDst();
2571 if (!OE.isAntiDep())
2572 continue;
2573 if (Nodes.count(SU) == 0)
2574 continue;
2575 if (NodeOrder.contains(key: SU))
2576 continue;
2577 R.insert(X: SU);
2578 }
2579 }
2580 Order = TopDown;
2581 LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
2582 SmallSetVector<SUnit *, 8> N;
2583 if (succ_L(NodeOrder, Succs&: N, DDG: DDG.get(), S: &Nodes))
2584 R.insert_range(R&: N);
2585 }
2586 }
2587 LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
2588 }
2589
2590 LLVM_DEBUG({
2591 dbgs() << "Node order: ";
2592 for (SUnit *I : NodeOrder)
2593 dbgs() << " " << I->NodeNum << " ";
2594 dbgs() << "\n";
2595 });
2596}
2597
2598/// Process the nodes in the computed order and create the pipelined schedule
2599/// of the instructions, if possible. Return true if a schedule is found.
2600bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2601
2602 if (NodeOrder.empty()) {
2603 LLVM_DEBUG(dbgs() << "NodeOrder is empty! Abort scheduling.\n");
2604 return false;
2605 }
2606
2607 bool scheduleFound = false;
2608 std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
2609 if (LimitRegPressure) {
2610 HRPDetector =
2611 std::make_unique<HighRegisterPressureDetector>(args: Loop.getHeader(), args&: MF);
2612 HRPDetector->init(RCI: RegClassInfo);
2613 }
2614 // Keep increasing II until a valid schedule is found.
2615 for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
2616 Schedule.reset();
2617 Schedule.setInitiationInterval(II);
2618 LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2619
2620 SetVector<SUnit *>::iterator NI = NodeOrder.begin();
2621 SetVector<SUnit *>::iterator NE = NodeOrder.end();
2622 do {
2623 SUnit *SU = *NI;
2624
2625 // Compute the schedule time for the instruction, which is based
2626 // upon the scheduled time for any predecessors/successors.
2627 int EarlyStart = INT_MIN;
2628 int LateStart = INT_MAX;
2629 Schedule.computeStart(SU, MaxEarlyStart: &EarlyStart, MinLateStart: &LateStart, II, DAG: this);
2630 LLVM_DEBUG({
2631 dbgs() << "\n";
2632 dbgs() << "Inst (" << SU->NodeNum << ") ";
2633 SU->getInstr()->dump();
2634 dbgs() << "\n";
2635 });
2636 LLVM_DEBUG(
2637 dbgs() << format("\tes: %8x ls: %8x\n", EarlyStart, LateStart));
2638
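// Decide how to search for a slot, based on which bounds are known: if
// both bounds are known but inconsistent, there is no slot for this II;
// if only one bound is known, search away from it over a window of II
// cycles; if both are known, search within [EarlyStart, LateStart];
// otherwise fall back to ASAP relative to the first cycle.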
2639 if (EarlyStart > LateStart)
2640 scheduleFound = false;
2641 else if (EarlyStart != INT_MIN && LateStart == INT_MAX)
2642 scheduleFound =
2643 Schedule.insert(SU, StartCycle: EarlyStart, EndCycle: EarlyStart + (int)II - 1, II);
2644 else if (EarlyStart == INT_MIN && LateStart != INT_MAX)
2645 scheduleFound =
2646 Schedule.insert(SU, StartCycle: LateStart, EndCycle: LateStart - (int)II + 1, II);
2647 else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2648 LateStart = std::min(a: LateStart, b: EarlyStart + (int)II - 1);
2649 // When scheduling a Phi it is better to start at the late cycle and
2650 // go backwards. The default order may insert the Phi too far away
2651 // from its first dependence.
2652 // Also, do a backward search when all scheduled predecessors are
2653 // reached via loop-carried output/order dependences. Empirically, there
2654 // are also cases where scheduling becomes possible with a backward search.
2655 if (SU->getInstr()->isPHI() ||
2656 Schedule.onlyHasLoopCarriedOutputOrOrderPreds(SU, DDG: this->getDDG()))
2657 scheduleFound = Schedule.insert(SU, StartCycle: LateStart, EndCycle: EarlyStart, II);
2658 else
2659 scheduleFound = Schedule.insert(SU, StartCycle: EarlyStart, EndCycle: LateStart, II);
2660 } else {
2661 int FirstCycle = Schedule.getFirstCycle();
2662 scheduleFound = Schedule.insert(SU, StartCycle: FirstCycle + getASAP(Node: SU),
2663 EndCycle: FirstCycle + getASAP(Node: SU) + II - 1, II);
2664 }
2665
2666 // Even if we find a schedule, make sure the schedule doesn't exceed the
2667 // allowable number of stages. We keep trying if this happens.
2668 if (scheduleFound)
2669 if (SwpMaxStages > -1 &&
2670 Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2671 scheduleFound = false;
2672
2673 LLVM_DEBUG({
2674 if (!scheduleFound)
2675 dbgs() << "\tCan't schedule\n";
2676 });
2677 } while (++NI != NE && scheduleFound);
2678
2679 // If a schedule is found, ensure non-pipelined instructions are in stage 0
2680 if (scheduleFound)
2681 scheduleFound =
2682 Schedule.normalizeNonPipelinedInstructions(SSD: this, PLI: LoopPipelinerInfo);
2683
2684 // If a schedule is found, check if it is a valid schedule too.
2685 if (scheduleFound)
2686 scheduleFound = Schedule.isValidSchedule(SSD: this);
2687
2688 // If a schedule was found and the option is enabled, check if the schedule
2689 // might generate additional register spills/fills.
2690 if (scheduleFound && LimitRegPressure)
2691 scheduleFound =
2692 !HRPDetector->detect(SSD: this, Schedule, MaxStage: Schedule.getMaxStageCount());
2693 }
2694
2695 LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
2696 << " (II=" << Schedule.getInitiationInterval()
2697 << ")\n");
2698
2699 if (scheduleFound) {
2700 scheduleFound = LoopPipelinerInfo->shouldUseSchedule(SSD&: *this, SMS&: Schedule);
2701 if (!scheduleFound)
2702 LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
2703 }
2704
2705 if (scheduleFound) {
2706 Schedule.finalizeSchedule(SSD: this);
2707 Pass.ORE->emit(RemarkBuilder: [&]() {
2708 return MachineOptimizationRemarkAnalysis(
2709 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
2710 << "Schedule found with Initiation Interval: "
2711 << ore::NV("II", Schedule.getInitiationInterval())
2712 << ", MaxStageCount: "
2713 << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
2714 });
2715 } else
2716 Schedule.reset();
2717
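// A schedule whose maximum stage count is zero has no overlapped
// iterations, so pipelining it would not be profitable.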
2718 return scheduleFound && Schedule.getMaxStageCount() > 0;
2719}
2720
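/// Return the register of the unique use operand of MI that is defined inside
/// MI's parent block. Return an invalid register if any use is a physical
/// register or if more than one such operand exists.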
2721static Register findUniqueOperandDefinedInLoop(const MachineInstr &MI) {
2722 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2723 Register Result;
2724 for (const MachineOperand &Use : MI.all_uses()) {
2725 Register Reg = Use.getReg();
2726 if (!Reg.isVirtual())
2727 return Register();
2728 if (MRI.getVRegDef(Reg)->getParent() != MI.getParent())
2729 continue;
2730 if (Result)
2731 return Register();
2732 Result = Reg;
2733 }
2734 return Result;
2735}
2736
2737 /// If Op is a value that is incremented on each loop iteration by a unique
2738 /// instruction, return true and set Value to the amount of the increment.
2739static bool findLoopIncrementValue(const MachineOperand &Op, int &Value) {
2740 if (!Op.isReg() || !Op.getReg().isVirtual())
2741 return false;
2742
2743 Register OrgReg = Op.getReg();
2744 Register CurReg = OrgReg;
2745 const MachineBasicBlock *LoopBB = Op.getParent()->getParent();
2746 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
2747
2748 const TargetInstrInfo *TII =
2749 LoopBB->getParent()->getSubtarget().getInstrInfo();
2750 const TargetRegisterInfo *TRI =
2751 LoopBB->getParent()->getSubtarget().getRegisterInfo();
2752
2753 MachineInstr *Phi = nullptr;
2754 MachineInstr *Increment = nullptr;
2755
2756 // Traverse the definition chain until it reaches Op again or an instruction
2757 // that does not satisfy the conditions.
2758 // Acceptable example:
2759 // bb.0:
2760 // %0 = PHI %3, %bb.0, ...
2761 // %2 = ADD %0, Value
2762 // ... = LOAD %2(Op)
2763 // %3 = COPY %2
2764 while (true) {
2765 if (!CurReg.isValid() || !CurReg.isVirtual())
2766 return false;
2767 MachineInstr *Def = MRI.getVRegDef(Reg: CurReg);
2768 if (Def->getParent() != LoopBB)
2769 return false;
2770
2771 if (Def->isCopy()) {
2772 // Ignore copy instructions unless they contain subregisters
2773 if (Def->getOperand(i: 0).getSubReg() || Def->getOperand(i: 1).getSubReg())
2774 return false;
2775 CurReg = Def->getOperand(i: 1).getReg();
2776 } else if (Def->isPHI()) {
2777 // There must be just one Phi
2778 if (Phi)
2779 return false;
2780 Phi = Def;
2781 CurReg = getLoopPhiReg(Phi: *Def, LoopBB);
2782 } else if (TII->getIncrementValue(MI: *Def, Value)) {
2783 // Potentially a unique increment
2784 if (Increment)
2785 // Multiple increments exist
2786 return false;
2787
2788 const MachineOperand *BaseOp;
2789 int64_t Offset;
2790 bool OffsetIsScalable;
2791 if (TII->getMemOperandWithOffset(MI: *Def, BaseOp, Offset, OffsetIsScalable,
2792 TRI)) {
2793 // Pre/post increment instruction
2794 CurReg = BaseOp->getReg();
2795 } else {
2796 // If only one of the operands is defined within the loop, it is assumed
2797 // to be an incremented value.
2798 CurReg = findUniqueOperandDefinedInLoop(MI: *Def);
2799 if (!CurReg.isValid())
2800 return false;
2801 }
2802 Increment = Def;
2803 } else {
2804 return false;
2805 }
2806 if (CurReg == OrgReg)
2807 break;
2808 }
2809
2810 if (!Phi || !Increment)
2811 return false;
2812
2813 return true;
2814}
2815
2816 /// Return true if we can compute the amount by which the instruction's
2817 /// address changes during each iteration. If so, set Delta to that amount.
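/// For example (an illustrative MIR-like sketch; opcodes and register names
/// are hypothetical), the load below advances its address by 4 on every
/// iteration, so Delta = 4:
/// \code
///   %0 = PHI %2, %bb.loop, ...
///   ... = LOAD %0, 0
///   %2 = ADD %0, 4
/// \endcode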
2818bool SwingSchedulerDAG::computeDelta(const MachineInstr &MI, int &Delta) const {
2819 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2820 const MachineOperand *BaseOp;
2821 int64_t Offset;
2822 bool OffsetIsScalable;
2823 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2824 return false;
2825
2826 // FIXME: This algorithm assumes instructions have fixed-size offsets.
2827 if (OffsetIsScalable)
2828 return false;
2829
2830 if (!BaseOp->isReg())
2831 return false;
2832
2833 return findLoopIncrementValue(Op: *BaseOp, Value&: Delta);
2834}
2835
2836/// Check if we can change the instruction to use an offset value from the
2837/// previous iteration. If so, return true and set the base and offset values
2838/// so that we can rewrite the load, if necessary.
2839/// v1 = Phi(v0, v3)
2840/// v2 = load v1, 0
2841/// v3 = post_store v1, 4, x
2842/// This function enables the load to be rewritten as v2 = load v3, 4.
2843bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
2844 unsigned &BasePos,
2845 unsigned &OffsetPos,
2846 Register &NewBase,
2847 int64_t &Offset) {
2848 // Get the load instruction.
2849 if (TII->isPostIncrement(MI: *MI))
2850 return false;
2851 unsigned BasePosLd, OffsetPosLd;
2852 if (!TII->getBaseAndOffsetPosition(MI: *MI, BasePos&: BasePosLd, OffsetPos&: OffsetPosLd))
2853 return false;
2854 Register BaseReg = MI->getOperand(i: BasePosLd).getReg();
2855
2856 // Look for the Phi instruction.
2857 MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2858 MachineInstr *Phi = MRI.getVRegDef(Reg: BaseReg);
2859 if (!Phi || !Phi->isPHI())
2860 return false;
2861 // Get the register defined in the loop block.
2862 Register PrevReg = getLoopPhiReg(Phi: *Phi, LoopBB: MI->getParent());
2863 if (!PrevReg)
2864 return false;
2865
2866 // Check for the post-increment load/store instruction.
2867 MachineInstr *PrevDef = MRI.getVRegDef(Reg: PrevReg);
2868 if (!PrevDef || PrevDef == MI)
2869 return false;
2870
2871 if (!TII->isPostIncrement(MI: *PrevDef))
2872 return false;
2873
2874 unsigned BasePos1 = 0, OffsetPos1 = 0;
2875 if (!TII->getBaseAndOffsetPosition(MI: *PrevDef, BasePos&: BasePos1, OffsetPos&: OffsetPos1))
2876 return false;
2877
2878 // Make sure that the instructions do not access the same memory location in
2879 // the next iteration.
2880 int64_t LoadOffset = MI->getOperand(i: OffsetPosLd).getImm();
2881 int64_t StoreOffset = PrevDef->getOperand(i: OffsetPos1).getImm();
2882 MachineInstr *NewMI = MF.CloneMachineInstr(Orig: MI);
2883 NewMI->getOperand(i: OffsetPosLd).setImm(LoadOffset + StoreOffset);
2884 bool Disjoint = TII->areMemAccessesTriviallyDisjoint(MIa: *NewMI, MIb: *PrevDef);
2885 MF.deleteMachineInstr(MI: NewMI);
2886 if (!Disjoint)
2887 return false;
2888
2889 // Set the return values once we have determined that we will return true.
2890 BasePos = BasePosLd;
2891 OffsetPos = OffsetPosLd;
2892 NewBase = PrevReg;
2893 Offset = StoreOffset;
2894 return true;
2895}
2896
2897 /// Apply changes to the instruction if needed. The changes are needed
2898 /// to improve the scheduling and depend upon the final schedule.
2899void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
2900 SMSchedule &Schedule) {
2901 SUnit *SU = getSUnit(MI);
2902 DenseMap<SUnit *, std::pair<Register, int64_t>>::iterator It =
2903 InstrChanges.find(Val: SU);
2904 if (It != InstrChanges.end()) {
2905 std::pair<Register, int64_t> RegAndOffset = It->second;
2906 unsigned BasePos, OffsetPos;
2907 if (!TII->getBaseAndOffsetPosition(MI: *MI, BasePos, OffsetPos))
2908 return;
2909 Register BaseReg = MI->getOperand(i: BasePos).getReg();
2910 MachineInstr *LoopDef = findDefInLoop(Reg: BaseReg);
2911 int DefStageNum = Schedule.stageScheduled(SU: getSUnit(MI: LoopDef));
2912 int DefCycleNum = Schedule.cycleScheduled(SU: getSUnit(MI: LoopDef));
2913 int BaseStageNum = Schedule.stageScheduled(SU);
2914 int BaseCycleNum = Schedule.cycleScheduled(SU);
2915 if (BaseStageNum < DefStageNum) {
2916 MachineInstr *NewMI = MF.CloneMachineInstr(Orig: MI);
2917 int OffsetDiff = DefStageNum - BaseStageNum;
2918 if (DefCycleNum < BaseCycleNum) {
2919 NewMI->getOperand(i: BasePos).setReg(RegAndOffset.first);
2920 if (OffsetDiff > 0)
2921 --OffsetDiff;
2922 }
2923 int64_t NewOffset =
2924 MI->getOperand(i: OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
2925 NewMI->getOperand(i: OffsetPos).setImm(NewOffset);
2926 SU->setInstr(NewMI);
2927 MISUnitMap[NewMI] = SU;
2928 NewMIs[MI] = NewMI;
2929 }
2930 }
2931}
2932
2933/// Return the instruction in the loop that defines the register.
2934/// If the definition is a Phi, then follow the Phi operand to
2935/// the instruction in the loop.
2936MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
2937 SmallPtrSet<MachineInstr *, 8> Visited;
2938 MachineInstr *Def = MRI.getVRegDef(Reg);
2939 while (Def->isPHI()) {
2940 if (!Visited.insert(Ptr: Def).second)
2941 break;
2942 for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
2943 if (Def->getOperand(i: i + 1).getMBB() == BB) {
2944 Def = MRI.getVRegDef(Reg: Def->getOperand(i).getReg());
2945 break;
2946 }
2947 }
2948 return Def;
2949}
2950
2951/// Return false if there is no overlap between the region accessed by BaseMI in
2952/// an iteration and the region accessed by OtherMI in subsequent iterations.
2953bool SwingSchedulerDAG::mayOverlapInLaterIter(
2954 const MachineInstr *BaseMI, const MachineInstr *OtherMI) const {
2955 int DeltaB, DeltaO, Delta;
2956 if (!computeDelta(MI: *BaseMI, Delta&: DeltaB) || !computeDelta(MI: *OtherMI, Delta&: DeltaO) ||
2957 DeltaB != DeltaO)
2958 return true;
2959 Delta = DeltaB;
2960
2961 const MachineOperand *BaseOpB, *BaseOpO;
2962 int64_t OffsetB, OffsetO;
2963 bool OffsetBIsScalable, OffsetOIsScalable;
2964 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2965 if (!TII->getMemOperandWithOffset(MI: *BaseMI, BaseOp&: BaseOpB, Offset&: OffsetB,
2966 OffsetIsScalable&: OffsetBIsScalable, TRI) ||
2967 !TII->getMemOperandWithOffset(MI: *OtherMI, BaseOp&: BaseOpO, Offset&: OffsetO,
2968 OffsetIsScalable&: OffsetOIsScalable, TRI))
2969 return true;
2970
2971 if (OffsetBIsScalable || OffsetOIsScalable)
2972 return true;
2973
2974 if (!BaseOpB->isIdenticalTo(Other: *BaseOpO)) {
2975 // Handle cases with different base operands but the same initial values,
2976 // typically when pre/post increment is used.
2977
2978 if (!BaseOpB->isReg() || !BaseOpO->isReg())
2979 return true;
2980 Register RegB = BaseOpB->getReg(), RegO = BaseOpO->getReg();
2981 if (!RegB.isVirtual() || !RegO.isVirtual())
2982 return true;
2983
2984 MachineInstr *DefB = MRI.getVRegDef(Reg: BaseOpB->getReg());
2985 MachineInstr *DefO = MRI.getVRegDef(Reg: BaseOpO->getReg());
2986 if (!DefB || !DefO || !DefB->isPHI() || !DefO->isPHI())
2987 return true;
2988
2989 Register InitValB;
2990 Register LoopValB;
2991 Register InitValO;
2992 Register LoopValO;
2993 getPhiRegs(Phi&: *DefB, Loop: BB, InitVal&: InitValB, LoopVal&: LoopValB);
2994 getPhiRegs(Phi&: *DefO, Loop: BB, InitVal&: InitValO, LoopVal&: LoopValO);
2995 MachineInstr *InitDefB = MRI.getVRegDef(Reg: InitValB);
2996 MachineInstr *InitDefO = MRI.getVRegDef(Reg: InitValO);
2997
2998 if (!InitDefB->isIdenticalTo(Other: *InitDefO))
2999 return true;
3000 }
3001
3002 LocationSize AccessSizeB = (*BaseMI->memoperands_begin())->getSize();
3003 LocationSize AccessSizeO = (*OtherMI->memoperands_begin())->getSize();
3004
3005 // This is the main test, which checks the offset values and the loop
3006 // increment value to determine if the accesses may be loop carried.
3007 if (!AccessSizeB.hasValue() || !AccessSizeO.hasValue())
3008 return true;
3009
3010 LLVM_DEBUG({
3011 dbgs() << "Overlap check:\n";
3012 dbgs() << " BaseMI: ";
3013 BaseMI->dump();
3014 dbgs() << " Base + " << OffsetB << " + I * " << Delta
3015 << ", Len: " << AccessSizeB.getValue() << "\n";
3016 dbgs() << " OtherMI: ";
3017 OtherMI->dump();
3018 dbgs() << " Base + " << OffsetO << " + I * " << Delta
3019 << ", Len: " << AccessSizeO.getValue() << "\n";
3020 });
3021
3022 // Overlap may be reported conservatively for strided patterns.
3023 // For example, the memory addresses of the store and the load in
3024 // for (i=0; i<n; i+=2) a[i+1] = a[i];
3025 // are assumed to overlap.
3026 if (Delta < 0) {
3027 int64_t BaseMinAddr = OffsetB;
3028 int64_t OtherNextIterMaxAddr = OffsetO + Delta + AccessSizeO.getValue() - 1;
3029 if (BaseMinAddr > OtherNextIterMaxAddr) {
3030 LLVM_DEBUG(dbgs() << " Result: No overlap\n");
3031 return false;
3032 }
3033 } else {
3034 int64_t BaseMaxAddr = OffsetB + AccessSizeB.getValue() - 1;
3035 int64_t OtherNextIterMinAddr = OffsetO + Delta;
3036 if (BaseMaxAddr < OtherNextIterMinAddr) {
3037 LLVM_DEBUG(dbgs() << " Result: No overlap\n");
3038 return false;
3039 }
3040 }
3041 LLVM_DEBUG(dbgs() << " Result: Overlap\n");
3042 return true;
3043}
3044
3045 /// Return true for an order or output dependence that is potentially loop
3046 /// carried. A dependence is loop carried if the destination defines a value
3047 /// that may be used or defined by the source in a subsequent iteration.
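/// For example, in
/// \code
///   for (i = 0; i < n; i++)
///     a[i + 1] = a[i];
/// \endcode
/// the store in one iteration writes the location that the load reads in the
/// next iteration, so the order dependence between them is loop carried.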
3048bool SwingSchedulerDAG::isLoopCarriedDep(
3049 const SwingSchedulerDDGEdge &Edge) const {
3050 if ((!Edge.isOrderDep() && !Edge.isOutputDep()) || Edge.isArtificial() ||
3051 Edge.getDst()->isBoundaryNode())
3052 return false;
3053
3054 if (!SwpPruneLoopCarried)
3055 return true;
3056
3057 if (Edge.isOutputDep())
3058 return true;
3059
3060 MachineInstr *SI = Edge.getSrc()->getInstr();
3061 MachineInstr *DI = Edge.getDst()->getInstr();
3062 assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
3063
3064 // Assume ordered loads and stores may have a loop carried dependence.
3065 if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
3066 SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
3067 SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
3068 return true;
3069
3070 if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
3071 return false;
3072
3073 // The conservative assumption is that a dependence between memory operations
3074 // may be loop carried. The following code checks when it can be proved that
3075 // there is no loop carried dependence.
3076 return mayOverlapInLaterIter(BaseMI: DI, OtherMI: SI);
3077}
3078
3079void SwingSchedulerDAG::postProcessDAG() {
3080 for (auto &M : Mutations)
3081 M->apply(DAG: this);
3082}
3083
3084/// Try to schedule the node at the specified StartCycle and continue
3085 /// until the node is scheduled or the EndCycle is reached. This function
3086/// returns true if the node is scheduled. This routine may search either
3087/// forward or backward for a place to insert the instruction based upon
3088/// the relative values of StartCycle and EndCycle.
3089bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
3090 bool forward = true;
3091 LLVM_DEBUG({
3092 dbgs() << "Trying to insert node between " << StartCycle << " and "
3093 << EndCycle << " II: " << II << "\n";
3094 });
3095 if (StartCycle > EndCycle)
3096 forward = false;
3097
3098 // The terminating condition depends on the direction.
3099 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
3100 for (int curCycle = StartCycle; curCycle != termCycle;
3101 forward ? ++curCycle : --curCycle) {
3102
3103 if (ST.getInstrInfo()->isZeroCost(Opcode: SU->getInstr()->getOpcode()) ||
3104 ProcItinResources.canReserveResources(SU&: *SU, Cycle: curCycle)) {
3105 LLVM_DEBUG({
3106 dbgs() << "\tinsert at cycle " << curCycle << " ";
3107 SU->getInstr()->dump();
3108 });
3109
3110 if (!ST.getInstrInfo()->isZeroCost(Opcode: SU->getInstr()->getOpcode()))
3111 ProcItinResources.reserveResources(SU&: *SU, Cycle: curCycle);
3112 ScheduledInstrs[curCycle].push_back(x: SU);
3113 InstrToCycle.insert(x: std::make_pair(x&: SU, y&: curCycle));
3114 if (curCycle > LastCycle)
3115 LastCycle = curCycle;
3116 if (curCycle < FirstCycle)
3117 FirstCycle = curCycle;
3118 return true;
3119 }
3120 LLVM_DEBUG({
3121 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
3122 SU->getInstr()->dump();
3123 });
3124 }
3125 return false;
3126}

// Return the cycle of the earliest scheduled instruction in the chain.
int SMSchedule::earliestCycleInChain(const SwingSchedulerDDGEdge &Dep,
                                     const SwingSchedulerDDG *DDG) {
  SmallPtrSet<SUnit *, 8> Visited;
  SmallVector<SwingSchedulerDDGEdge, 8> Worklist;
  Worklist.push_back(Dep);
  int EarlyCycle = INT_MAX;
  while (!Worklist.empty()) {
    const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
    SUnit *PrevSU = Cur.getSrc();
    if (Visited.count(PrevSU))
      continue;
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
    if (it == InstrToCycle.end())
      continue;
    EarlyCycle = std::min(EarlyCycle, it->second);
    for (const auto &IE : DDG->getInEdges(PrevSU))
      if (IE.isOrderDep() || IE.isOutputDep())
        Worklist.push_back(IE);
    Visited.insert(PrevSU);
  }
  return EarlyCycle;
}

// Return the cycle of the latest scheduled instruction in the chain.
int SMSchedule::latestCycleInChain(const SwingSchedulerDDGEdge &Dep,
                                   const SwingSchedulerDDG *DDG) {
  SmallPtrSet<SUnit *, 8> Visited;
  SmallVector<SwingSchedulerDDGEdge, 8> Worklist;
  Worklist.push_back(Dep);
  int LateCycle = INT_MIN;
  while (!Worklist.empty()) {
    const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
    SUnit *SuccSU = Cur.getDst();
    if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
      continue;
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
    if (it == InstrToCycle.end())
      continue;
    LateCycle = std::max(LateCycle, it->second);
    for (const auto &OE : DDG->getOutEdges(SuccSU))
      if (OE.isOrderDep() || OE.isOutputDep())
        Worklist.push_back(OE);
    Visited.insert(SuccSU);
  }
  return LateCycle;
}

/// If an instruction has a use that spans multiple iterations, then
/// return true. These instructions are characterized by having a back-edge
/// to a Phi, which contains a reference to another Phi.
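///
/// A hypothetical shape that matches (SSA Phis at the top of the loop):
///   v1 = phi(v0, v2)   ; anti edge to the instruction defining v2
///   v5 = phi(v4, v1)   ; a Phi that is a data successor of the first Phi
///   ...
///   v2 = op v1         ; SU whose def of v2 is used on the next iteration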
static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) {
  for (auto &P : SU->Preds)
    if (P.getKind() == SDep::Anti && P.getSUnit()->getInstr()->isPHI())
      for (auto &S : P.getSUnit()->Succs)
        if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
          return P.getSUnit();
  return nullptr;
}

/// Compute the scheduling start slot for the instruction. The start slot
/// depends on any predecessor or successor nodes scheduled already.
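///
/// For a scheduled predecessor at cycle C with edge latency L and iteration
/// distance D, the early start is bounded by C + L - D * II; for a scheduled
/// successor, the late start is bounded by C - L + D * II. For example
/// (hypothetical values), C = 4, L = 2, D = 1, and II = 3 give an early
/// start of 4 + 2 - 1 * 3 = 3.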
void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
                              int II, SwingSchedulerDAG *DAG) {
  const SwingSchedulerDDG *DDG = DAG->getDDG();

  // Iterate over each instruction that has been scheduled already. The start
  // slot computation depends on whether the previously scheduled instruction
  // is a predecessor or successor of the specified instruction.
  for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
    for (SUnit *I : getInstructions(cycle)) {
      for (const auto &IE : DDG->getInEdges(SU)) {
        if (IE.getSrc() == I) {
          // FIXME: Add reverse edge to `DDG` instead of calling
          // `isLoopCarriedDep`
          if (DAG->isLoopCarriedDep(IE)) {
            int End = earliestCycleInChain(IE, DDG) + (II - 1);
            *MinLateStart = std::min(*MinLateStart, End);
          }
          int EarlyStart = cycle + IE.getLatency() - IE.getDistance() * II;
          *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
        }
      }

      for (const auto &OE : DDG->getOutEdges(SU)) {
        if (OE.getDst() == I) {
          // FIXME: Add reverse edge to `DDG` instead of calling
          // `isLoopCarriedDep`
          if (DAG->isLoopCarriedDep(OE)) {
            int Start = latestCycleInChain(OE, DDG) + 1 - II;
            *MaxEarlyStart = std::max(*MaxEarlyStart, Start);
          }
          int LateStart = cycle - OE.getLatency() + OE.getDistance() * II;
          *MinLateStart = std::min(*MinLateStart, LateStart);
        }
      }

      SUnit *BE = multipleIterations(I, DAG);
      for (const auto &Dep : SU->Preds) {
        // For an instruction that requires multiple iterations, make sure
        // that the dependent instruction is not scheduled past the
        // definition.
        if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
            !SU->isPred(I))
          *MinLateStart = std::min(*MinLateStart, cycle);
      }
    }
  }
}

/// Order the instructions within a cycle so that the definitions occur
/// before the uses. The instruction is added to the front of the list if
/// it must be ordered before a use, and to the end of the list otherwise.
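///
/// For example (hypothetical instructions in one cycle): if the list already
/// holds "  = use v1" at the same or an earlier stage and the node being
/// inserted is "v1 = def", the def is pushed to the front so that, within
/// the cycle, v1 is defined before it is used.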
void SMSchedule::orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU,
                                 std::deque<SUnit *> &Insts) const {
  MachineInstr *MI = SU->getInstr();
  bool OrderBeforeUse = false;
  bool OrderAfterDef = false;
  bool OrderBeforeDef = false;
  unsigned MoveDef = 0;
  unsigned MoveUse = 0;
  int StageInst1 = stageScheduled(SU);
  const SwingSchedulerDDG *DDG = SSD->getDDG();

  unsigned Pos = 0;
  for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
       ++I, ++Pos) {
    for (MachineOperand &MO : MI->operands()) {
      if (!MO.isReg() || !MO.getReg().isVirtual())
        continue;

      Register Reg = MO.getReg();
      unsigned BasePos, OffsetPos;
      if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
        if (MI->getOperand(BasePos).getReg() == Reg)
          if (Register NewReg = SSD->getInstrBaseReg(SU))
            Reg = NewReg;
      bool Reads, Writes;
      std::tie(Reads, Writes) =
          (*I)->getInstr()->readsWritesVirtualRegister(Reg);
      if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
        // Add the instruction after the scheduled instruction.
        OrderAfterDef = true;
        MoveDef = Pos;
      } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
        if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
          OrderBeforeUse = true;
          if (MoveUse == 0)
            MoveUse = Pos;
        } else {
          OrderAfterDef = true;
          MoveDef = Pos;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
        if (MoveUse != 0) {
          OrderAfterDef = true;
          MoveDef = Pos - 1;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
        // Add the instruction before the scheduled instruction.
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
                 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
        if (MoveUse == 0) {
          OrderBeforeDef = true;
          MoveUse = Pos;
        }
      }
    }
    // Check for order dependences between instructions. Make sure the source
    // is ordered before the destination.
    for (auto &OE : DDG->getOutEdges(SU)) {
      if (OE.getDst() != *I)
        continue;
      if (OE.isOrderDep() && stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if (Pos < MoveUse)
          MoveUse = Pos;
      }
      // The loop above did not handle dependences on hardware registers, and
      // anti/output dependences normally have latency 0, so nodes in the
      // same cycle may still have anti/output dependences on physical
      // registers that must be ordered here.
      else if ((OE.isAntiDep() || OE.isOutputDep()) &&
               stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if ((MoveUse == 0) || (Pos < MoveUse))
          MoveUse = Pos;
      }
    }
    for (auto &IE : DDG->getInEdges(SU)) {
      if (IE.getSrc() != *I)
        continue;
      if ((IE.isAntiDep() || IE.isOutputDep() || IE.isOrderDep()) &&
          stageScheduled(*I) == StageInst1) {
        OrderAfterDef = true;
        MoveDef = Pos;
      }
    }
  }

  // A circular dependence.
  if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
    OrderBeforeUse = false;

  // OrderAfterDef takes precedence over OrderBeforeDef. The latter is due
  // to a loop-carried dependence.
  if (OrderBeforeDef)
    OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);

  // The uncommon case when the instruction order needs to be updated because
  // there is both a use and def.
  if (OrderBeforeUse && OrderAfterDef) {
    SUnit *UseSU = Insts.at(MoveUse);
    SUnit *DefSU = Insts.at(MoveDef);
    if (MoveUse > MoveDef) {
      Insts.erase(Insts.begin() + MoveUse);
      Insts.erase(Insts.begin() + MoveDef);
    } else {
      Insts.erase(Insts.begin() + MoveDef);
      Insts.erase(Insts.begin() + MoveUse);
    }
    orderDependence(SSD, UseSU, Insts);
    orderDependence(SSD, SU, Insts);
    orderDependence(SSD, DefSU, Insts);
    return;
  }
  // Put the new instruction first if there is a use in the list. Otherwise,
  // put it at the end of the list.
  if (OrderBeforeUse)
    Insts.push_front(SU);
  else
    Insts.push_back(SU);
}

/// Return true if the scheduled Phi has a loop carried operand.
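///
/// Illustrative check (hypothetical cycles and stages): if the Phi is
/// scheduled at cycle 2 and the instruction producing its loop operand at
/// cycle 5 in the same stage, then LoopCycle > DefCycle and the Phi's
/// operand is carried from the previous iteration.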
bool SMSchedule::isLoopCarried(const SwingSchedulerDAG *SSD,
                               MachineInstr &Phi) const {
  if (!Phi.isPHI())
    return false;
  SUnit *DefSU = SSD->getSUnit(&Phi);
  unsigned DefCycle = cycleScheduled(DefSU);
  int DefStage = stageScheduled(DefSU);

  Register InitVal;
  Register LoopVal;
  getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
  SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
  if (!UseSU)
    return true;
  if (UseSU->getInstr()->isPHI())
    return true;
  unsigned LoopCycle = cycleScheduled(UseSU);
  int LoopStage = stageScheduled(UseSU);
  return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
}

/// Return true if the instruction is a definition that is loop carried
/// and defines the use on the next iteration.
///   v1 = phi(v2, v3)
///   (Def) v3 = op v1
///   (MO)     = v1
/// If MO appears before Def, then v1 and v3 may get assigned to the same
/// register.
bool SMSchedule::isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD,
                                       MachineInstr *Def,
                                       MachineOperand &MO) const {
  if (!MO.isReg())
    return false;
  if (Def->isPHI())
    return false;
  MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
  if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
    return false;
  if (!isLoopCarried(SSD, *Phi))
    return false;
  Register LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
  for (MachineOperand &DMO : Def->all_defs()) {
    if (DMO.getReg() == LoopReg)
      return true;
  }
  return false;
}

/// Return true if all scheduled predecessors are loop-carried output/order
/// dependencies.
bool SMSchedule::onlyHasLoopCarriedOutputOrOrderPreds(
    SUnit *SU, const SwingSchedulerDDG *DDG) const {
  for (const auto &IE : DDG->getInEdges(SU))
    if (InstrToCycle.count(IE.getSrc()))
      return false;
  return true;
}

/// Determine transitive dependences of unpipelineable instructions.
SmallSet<SUnit *, 8> SMSchedule::computeUnpipelineableNodes(
    SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
  SmallSet<SUnit *, 8> DoNotPipeline;
  SmallVector<SUnit *, 8> Worklist;

  for (auto &SU : SSD->SUnits)
    if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
      Worklist.push_back(&SU);

  const SwingSchedulerDDG *DDG = SSD->getDDG();
  while (!Worklist.empty()) {
    auto SU = Worklist.pop_back_val();
    if (DoNotPipeline.count(SU))
      continue;
    LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
    DoNotPipeline.insert(SU);
    for (const auto &IE : DDG->getInEdges(SU))
      Worklist.push_back(IE.getSrc());

    // To preserve previous behavior and prevent regression
    // FIXME: Remove if this doesn't have significant impact on performance
    for (const auto &OE : DDG->getOutEdges(SU))
      if (OE.getDistance() == 1)
        Worklist.push_back(OE.getDst());
  }
  return DoNotPipeline;
}

// Determine all instructions upon which any unpipelineable instruction depends
// and ensure that they are in stage 0. If unable to do so, return false.
bool SMSchedule::normalizeNonPipelinedInstructions(
    SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
  SmallSet<SUnit *, 8> DNP = computeUnpipelineableNodes(SSD, PLI);

  int NewLastCycle = INT_MIN;
  for (SUnit &SU : SSD->SUnits) {
    if (!SU.isInstr())
      continue;
    if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
      NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
      continue;
    }

    // Put the non-pipelined instruction as early as possible in the schedule.
    int NewCycle = getFirstCycle();
    for (const auto &IE : SSD->getDDG()->getInEdges(&SU))
      if (IE.getDistance() == 0)
        NewCycle = std::max(InstrToCycle[IE.getSrc()], NewCycle);

    // To preserve previous behavior and prevent regression
    // FIXME: Remove if this doesn't have significant impact on performance
    for (auto &OE : SSD->getDDG()->getOutEdges(&SU))
      if (OE.getDistance() == 1)
        NewCycle = std::max(InstrToCycle[OE.getDst()], NewCycle);

    int OldCycle = InstrToCycle[&SU];
    if (OldCycle != NewCycle) {
      InstrToCycle[&SU] = NewCycle;
      auto &OldS = getInstructions(OldCycle);
      llvm::erase(OldS, &SU);
      getInstructions(NewCycle).emplace_back(&SU);
      LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
                        << ") is not pipelined; moving from cycle " << OldCycle
                        << " to " << NewCycle << " Instr:" << *SU.getInstr());
    }

    // We traverse the SUs in the order of the original basic block. Computing
    // NewCycle in this order normally works fine because all dependencies
    // (except for loop-carried dependencies) don't violate the original
    // order. However, an artificial dependency (e.g., added by
    // CopyToPhiMutation) can break it. That is, there may exist an artificial
    // dependency from bottom to top. In such a case, NewCycle may become too
    // large to be scheduled in Stage 0. For example, assume that Inst0 is in
    // DNP in the following case:
    //
    //             | Inst0 <-+
    //    SU order |         | artificial dep
    //             | Inst1 --+
    //             v
    //
    // If Inst1 is scheduled at cycle N and is not at Stage 0, then NewCycle
    // of Inst0 must be greater than or equal to N, so Inst0 cannot be
    // scheduled at Stage 0. In such cases, we reject this schedule at this
    // time.
    // FIXME: The reason for this is the existence of artificial dependencies
    // that contradict the original SU order. If ignoring artificial
    // dependencies does not affect correctness, then it is better to ignore
    // them.
    if (FirstCycle + InitiationInterval <= NewCycle)
      return false;

    NewLastCycle = std::max(NewLastCycle, NewCycle);
  }
  LastCycle = NewLastCycle;
  return true;
}

// Check if the generated schedule is valid. This function checks if
// an instruction that uses a physical register is scheduled in a
// different stage than the definition. The pipeliner does not handle
// physical register values that may cross a basic block boundary.
// Furthermore, if a physical def/use pair is assigned to the same
// cycle, orderDependence does not guarantee def/use ordering, so that
// case should be considered invalid. (The test checks for both
// earlier and same-cycle use to be more robust.)
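//
// For example (hypothetical schedule): if a physical-register def is placed
// in stage 0 at cycle 3, then a use placed in stage 1, or in stage 0 at
// cycle 3 or earlier, fails the checks below.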
bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
  for (SUnit &SU : SSD->SUnits) {
    if (!SU.hasPhysRegDefs)
      continue;
    int StageDef = stageScheduled(&SU);
    int CycleDef = InstrToCycle[&SU];
    assert(StageDef != -1 && "Instruction should have been scheduled.");
    for (auto &OE : SSD->getDDG()->getOutEdges(&SU)) {
      SUnit *Dst = OE.getDst();
      if (OE.isAssignedRegDep() && !Dst->isBoundaryNode())
        if (OE.getReg().isPhysical()) {
          if (stageScheduled(Dst) != StageDef)
            return false;
          if (InstrToCycle[Dst] <= CycleDef)
            return false;
        }
    }
  }
  return true;
}

/// A property of the node order in swing-modulo-scheduling is
/// that for nodes outside circuits the following holds:
/// none of them is scheduled after both a successor and a
/// predecessor.
/// The method below checks whether the property is met.
/// If not, debug information is printed and statistics information updated.
/// Note that we do not use an assert statement.
/// The reason is that although an invalid node order may prevent
/// the pipeliner from finding a pipelined schedule for arbitrary II,
/// it does not lead to the generation of incorrect code.
void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {

  // A sorted vector that maps each SUnit to its index in the NodeOrder.
  typedef std::pair<SUnit *, unsigned> UnitIndex;
  std::vector<UnitIndex> Indices;
  Indices.reserve(NodeOrder.size());

  for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
    Indices.push_back(std::make_pair(NodeOrder[i], i));

  auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
    return std::get<0>(i1) < std::get<0>(i2);
  };

  // Sort, so that we can perform a binary search.
  llvm::sort(Indices, CompareKey);

  bool Valid = true;
  (void)Valid;
  // For each SUnit in the NodeOrder, check whether it appears after both a
  // successor and a predecessor of the SUnit. If this is the case, and the
  // SUnit is not part of a circuit, then the NodeOrder is not valid.
  for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
    SUnit *SU = NodeOrder[i];
    unsigned Index = i;

    bool PredBefore = false;
    bool SuccBefore = false;

    SUnit *Succ;
    SUnit *Pred;
    (void)Succ;
    (void)Pred;

    for (const auto &IE : DDG->getInEdges(SU)) {
      SUnit *PredSU = IE.getSrc();
      unsigned PredIndex = std::get<1>(
          *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
      if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
        PredBefore = true;
        Pred = PredSU;
        break;
      }
    }

    for (const auto &OE : DDG->getOutEdges(SU)) {
      SUnit *SuccSU = OE.getDst();
      // Do not process a boundary node, it was not included in NodeOrder,
      // hence not in Indices either; the call to std::lower_bound() below
      // would return Indices.end().
      if (SuccSU->isBoundaryNode())
        continue;
      unsigned SuccIndex = std::get<1>(
          *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
      if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
        SuccBefore = true;
        Succ = SuccSU;
        break;
      }
    }

    if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
      // Instructions in circuits are allowed to be scheduled
      // after both a successor and predecessor.
      bool InCircuit = llvm::any_of(
          Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
      if (InCircuit)
        LLVM_DEBUG(dbgs() << "In a circuit, predecessor ");
      else {
        Valid = false;
        NumNodeOrderIssues++;
        LLVM_DEBUG(dbgs() << "Predecessor ");
      }
      LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
                        << " are scheduled before node " << SU->NodeNum
                        << "\n");
    }
  }

  LLVM_DEBUG({
    if (!Valid)
      dbgs() << "Invalid node order found!\n";
  });
}

/// Attempt to fix the degenerate cases when the instruction serialization
/// causes the register lifetimes to overlap. For example,
///   p' = store_pi(p, b)
///      = load p, offset
/// In this case p and p' overlap, which means that two registers are needed.
/// Instead, this function changes the load to use p' and updates the offset.
void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
  Register OverlapReg;
  Register NewBaseReg;
  for (SUnit *SU : Instrs) {
    MachineInstr *MI = SU->getInstr();
    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
      const MachineOperand &MO = MI->getOperand(i);
      // Look for an instruction that uses p. The instruction occurs in the
      // same cycle but occurs later in the serialized order.
      if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
        // Check that the instruction appears in the InstrChanges structure,
        // which contains instructions that can have the offset updated.
        DenseMap<SUnit *, std::pair<Register, int64_t>>::iterator It =
            InstrChanges.find(SU);
        if (It != InstrChanges.end()) {
          unsigned BasePos, OffsetPos;
          // Update the base register and adjust the offset.
          if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
            MachineInstr *NewMI = MF.CloneMachineInstr(MI);
            NewMI->getOperand(BasePos).setReg(NewBaseReg);
            int64_t NewOffset =
                MI->getOperand(OffsetPos).getImm() - It->second.second;
            NewMI->getOperand(OffsetPos).setImm(NewOffset);
            SU->setInstr(NewMI);
            MISUnitMap[NewMI] = SU;
            NewMIs[MI] = NewMI;
          }
        }
        OverlapReg = Register();
        NewBaseReg = Register();
        break;
      }
      // Look for an instruction of the form p' = op(p), which uses and
      // defines two virtual registers that get allocated to the same
      // physical register.
      unsigned TiedUseIdx = 0;
      if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
        // OverlapReg is p in the example above.
        OverlapReg = MI->getOperand(TiedUseIdx).getReg();
        // NewBaseReg is p' in the example above.
        NewBaseReg = MI->getOperand(i).getReg();
        break;
      }
    }
  }
}

std::deque<SUnit *>
SMSchedule::reorderInstructions(const SwingSchedulerDAG *SSD,
                                const std::deque<SUnit *> &Instrs) const {
  std::deque<SUnit *> NewOrderPhi;
  for (SUnit *SU : Instrs) {
    if (SU->getInstr()->isPHI())
      NewOrderPhi.push_back(SU);
  }
  std::deque<SUnit *> NewOrderI;
  for (SUnit *SU : Instrs) {
    if (!SU->getInstr()->isPHI())
      orderDependence(SSD, SU, NewOrderI);
  }
  llvm::append_range(NewOrderPhi, NewOrderI);
  return NewOrderPhi;
}

/// After the schedule has been formed, call this function to combine
/// the instructions from the different stages/cycles. That is, this
/// function creates a schedule that represents a single iteration.
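///
/// For example (hypothetical schedule, II = 2): an instruction placed at
/// cycle 5 in stage 1 is folded into cycle 5 - 1 * 2 = 3 of the single
/// remaining iteration.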
void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
  // Move all instructions to the first stage from later stages.
  for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
    for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
         ++stage) {
      std::deque<SUnit *> &cycleInstrs =
          ScheduledInstrs[cycle + (stage * InitiationInterval)];
      for (SUnit *SU : llvm::reverse(cycleInstrs))
        ScheduledInstrs[cycle].push_front(SU);
    }
  }

  // Erase all the elements in the later stages. Only one iteration should
  // remain in the scheduled list, and it contains all the instructions.
  for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
    ScheduledInstrs.erase(cycle);

  // Change the registers in each instruction as specified in the InstrChanges
  // map. We need to use the new registers to create the correct order.
  for (const SUnit &SU : SSD->SUnits)
    SSD->applyInstrChange(SU.getInstr(), *this);

  // Reorder the instructions in each cycle to fix and improve the
  // generated code.
  for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
    std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
    cycleInstrs = reorderInstructions(SSD, cycleInstrs);
    SSD->fixupRegisterOverlaps(cycleInstrs);
  }

  LLVM_DEBUG(dump(););
}

void NodeSet::print(raw_ostream &os) const {
  os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
     << " depth " << MaxDepth << " col " << Colocate << "\n";
  for (const auto &I : Nodes)
    os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
  os << "\n";
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the schedule information to the given output.
void SMSchedule::print(raw_ostream &os) const {
  // Iterate over each cycle.
  for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
    // Iterate over each instruction in the cycle.
    const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
    for (SUnit *CI : cycleInstrs->second) {
      os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
      os << "(" << CI->NodeNum << ") ";
      CI->getInstr()->print(os);
      os << "\n";
    }
  }
}

/// Utility function used for debugging to print the schedule.
LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
LLVM_DUMP_METHOD void NodeSet::dump() const { print(dbgs()); }

void ResourceManager::dumpMRT() const {
  LLVM_DEBUG({
    if (UseDFA)
      return;
    std::stringstream SS;
    SS << "MRT:\n";
    SS << std::setw(4) << "Slot";
    for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
      SS << std::setw(3) << I;
    SS << std::setw(7) << "#Mops"
       << "\n";
    for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
      SS << std::setw(4) << Slot;
      for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
        SS << std::setw(3) << MRT[Slot][I];
      SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
    }
    dbgs() << SS.str();
  });
}
#endif

void ResourceManager::initProcResourceVectors(
    const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
  unsigned ProcResourceID = 0;

  // We currently limit the resource kinds to 64 and below so that we can use
  // uint64_t for Masks.
  assert(SM.getNumProcResourceKinds() < 64 &&
         "Too many kinds of resources, unsupported");
  // Create a unique bitmask for every processor resource unit.
  // Skip resource at index 0, since it always references 'InvalidUnit'.
  Masks.resize(SM.getNumProcResourceKinds());
  for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
    const MCProcResourceDesc &Desc = *SM.getProcResource(I);
    if (Desc.SubUnitsIdxBegin)
      continue;
    Masks[I] = 1ULL << ProcResourceID;
    ProcResourceID++;
  }
  // Create a unique bitmask for every processor resource group.
  for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
    const MCProcResourceDesc &Desc = *SM.getProcResource(I);
    if (!Desc.SubUnitsIdxBegin)
      continue;
    Masks[I] = 1ULL << ProcResourceID;
    for (unsigned U = 0; U < Desc.NumUnits; ++U)
      Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
    ProcResourceID++;
  }
  LLVM_DEBUG({
    if (SwpShowResMask) {
      dbgs() << "ProcResourceDesc:\n";
      for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
        const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
        dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
                         ProcResource->Name, I, Masks[I],
                         ProcResource->NumUnits);
      }
      dbgs() << " -----------------\n";
    }
  });
}

bool ResourceManager::canReserveResources(SUnit &SU, int Cycle) {
  LLVM_DEBUG({
    if (SwpDebugResource)
      dbgs() << "canReserveResources:\n";
  });
  if (UseDFA)
    return DFAResources[positiveModulo(Cycle, InitiationInterval)]
        ->canReserveResources(&SU.getInstr()->getDesc());

  const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
  if (!SCDesc->isValid()) {
    LLVM_DEBUG({
      dbgs() << "No valid Schedule Class Desc for schedClass!\n";
      dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
    });
    return true;
  }

  reserveResources(SCDesc, Cycle);
  bool Result = !isOverbooked();
  unreserveResources(SCDesc, Cycle);

  LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n");
  return Result;
}

void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
  LLVM_DEBUG({
    if (SwpDebugResource)
      dbgs() << "reserveResources:\n";
  });
  if (UseDFA)
    return DFAResources[positiveModulo(Cycle, InitiationInterval)]
        ->reserveResources(&SU.getInstr()->getDesc());

  const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
  if (!SCDesc->isValid()) {
    LLVM_DEBUG({
      dbgs() << "No valid Schedule Class Desc for schedClass!\n";
      dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
    });
    return;
  }

  reserveResources(SCDesc, Cycle);

  LLVM_DEBUG({
    if (SwpDebugResource) {
      dumpMRT();
      dbgs() << "reserveResources: done!\n\n";
    }
  });
}

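// The MRT acts as a modulo reservation table: a resource held starting at
// cycle C for R cycles occupies the rows positiveModulo(C, II) through
// positiveModulo(C + R - 1, II). For example (hypothetical values), C = 4,
// R = 2, and II = 3 touch rows 1 and 2.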
void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
                                       int Cycle) {
  assert(!UseDFA);
  for (const MCWriteProcResEntry &PRE : make_range(
           STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
    for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
      ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];

  for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
    ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
}

void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
                                         int Cycle) {
  assert(!UseDFA);
  for (const MCWriteProcResEntry &PRE : make_range(
           STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
    for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
      --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];

  for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
    --NumScheduledMops[positiveModulo(C, InitiationInterval)];
}

bool ResourceManager::isOverbooked() const {
  assert(!UseDFA);
  for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
    for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
      const MCProcResourceDesc *Desc = SM.getProcResource(I);
      if (MRT[Slot][I] > Desc->NumUnits)
        return true;
    }
    if (NumScheduledMops[Slot] > IssueWidth)
      return true;
  }
  return false;
}

int ResourceManager::calculateResMIIDFA() const {
  assert(UseDFA);

  // Sort the instructions by the number of available choices for scheduling,
  // least to most. Use the number of critical resources as the tie breaker.
  FuncUnitSorter FUS = FuncUnitSorter(*ST);
  for (SUnit &SU : DAG->SUnits)
    FUS.calcCriticalResources(*SU.getInstr());
  PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
      FuncUnitOrder(FUS);

  for (SUnit &SU : DAG->SUnits)
    FuncUnitOrder.push(SU.getInstr());

  SmallVector<std::unique_ptr<DFAPacketizer>, 8> Resources;
  Resources.push_back(
      std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));

  while (!FuncUnitOrder.empty()) {
    MachineInstr *MI = FuncUnitOrder.top();
    FuncUnitOrder.pop();
    if (TII->isZeroCost(MI->getOpcode()))
      continue;

    // Attempt to reserve the instruction in an existing DFA. At least one
    // DFA is needed for each cycle.
    unsigned NumCycles = DAG->getSUnit(MI)->Latency;
    unsigned ReservedCycles = 0;
    auto *RI = Resources.begin();
    auto *RE = Resources.end();
    LLVM_DEBUG({
      dbgs() << "Trying to reserve resource for " << NumCycles
             << " cycles for \n";
      MI->dump();
    });
    for (unsigned C = 0; C < NumCycles; ++C)
      while (RI != RE) {
        if ((*RI)->canReserveResources(*MI)) {
          (*RI)->reserveResources(*MI);
          ++ReservedCycles;
          break;
        }
        RI++;
      }
    LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
                      << ", NumCycles:" << NumCycles << "\n");
    // Add new DFAs, if needed, to reserve resources.
    for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
      LLVM_DEBUG(if (SwpDebugResource) dbgs()
                 << "NewResource created to reserve resources"
                 << "\n");
      auto *NewResource = TII->CreateTargetScheduleState(*ST);
      assert(NewResource->canReserveResources(*MI) && "Reserve error.");
      NewResource->reserveResources(*MI);
      Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
    }
  }

  int Resmii = Resources.size();
  LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
  return Resmii;
}

int ResourceManager::calculateResMII() const {
  if (UseDFA)
    return calculateResMIIDFA();

  // Count each resource consumption and divide it by the number of units.
  // ResMII is the max value among them.

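  // Worked example (hypothetical numbers): 10 micro-ops at IssueWidth = 4
  // give ceil(10 / 4) = 3 cycles, and a resource consumed for 6 cycles that
  // has 2 units gives ceil(6 / 2) = 3, so ResMII = max(3, 3) = 3.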
  int NumMops = 0;
  SmallVector<uint64_t> ResourceCount(SM.getNumProcResourceKinds());
  for (SUnit &SU : DAG->SUnits) {
    if (TII->isZeroCost(SU.getInstr()->getOpcode()))
      continue;

    const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
    if (!SCDesc->isValid())
      continue;

    LLVM_DEBUG({
      if (SwpDebugResource) {
        DAG->dumpNode(SU);
        dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
               << " WriteProcRes: ";
      }
    });
    NumMops += SCDesc->NumMicroOps;
    for (const MCWriteProcResEntry &PRE :
         make_range(STI->getWriteProcResBegin(SCDesc),
                    STI->getWriteProcResEnd(SCDesc))) {
      LLVM_DEBUG({
        if (SwpDebugResource) {
          const MCProcResourceDesc *Desc =
              SM.getProcResource(PRE.ProcResourceIdx);
          dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
        }
      });
      ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
    }
    LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
  }

  int Result = (NumMops + IssueWidth - 1) / IssueWidth;
  LLVM_DEBUG({
    if (SwpDebugResource)
      dbgs() << "#Mops: " << NumMops << ", "
             << "IssueWidth: " << IssueWidth << ", "
             << "Cycles: " << Result << "\n";
  });

  LLVM_DEBUG({
    if (SwpDebugResource) {
      std::stringstream SS;
      SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
         << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
         << "\n";
      dbgs() << SS.str();
    }
  });
  for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
    const MCProcResourceDesc *Desc = SM.getProcResource(I);
    int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
    LLVM_DEBUG({
      if (SwpDebugResource) {
        std::stringstream SS;
        SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
           << Desc->NumUnits << std::setw(10) << ResourceCount[I]
           << std::setw(10) << Cycles << "\n";
        dbgs() << SS.str();
      }
    });
    if (Cycles > Result)
      Result = Cycles;
  }
  return Result;
}

void ResourceManager::init(int II) {
  InitiationInterval = II;
  DFAResources.clear();
  DFAResources.resize(II);
  for (auto &I : DFAResources)
    I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
  MRT.clear();
  MRT.resize(II, SmallVector<uint64_t>(SM.getNumProcResourceKinds()));
  NumScheduledMops.clear();
  NumScheduledMops.resize(II);
}

bool SwingSchedulerDDGEdge::ignoreDependence(bool IgnoreAnti) const {
  if (Pred.isArtificial() || Dst->isBoundaryNode())
    return true;
  // Currently, a dependence that is an anti-dependence but not loop-carried
  // is also ignored. This behavior is preserved to prevent regression.
  // FIXME: Remove if this doesn't have significant impact on performance
  return IgnoreAnti && (Pred.getKind() == SDep::Kind::Anti || Distance != 0);
}

SwingSchedulerDDG::SwingSchedulerDDGEdges &
SwingSchedulerDDG::getEdges(const SUnit *SU) {
  if (SU == EntrySU)
    return EntrySUEdges;
  if (SU == ExitSU)
    return ExitSUEdges;
  return EdgesVec[SU->NodeNum];
}

const SwingSchedulerDDG::SwingSchedulerDDGEdges &
SwingSchedulerDDG::getEdges(const SUnit *SU) const {
  if (SU == EntrySU)
    return EntrySUEdges;
  if (SU == ExitSU)
    return ExitSUEdges;
  return EdgesVec[SU->NodeNum];
}

void SwingSchedulerDDG::addEdge(const SUnit *SU,
                                const SwingSchedulerDDGEdge &Edge) {
  auto &Edges = getEdges(SU);
  if (Edge.getSrc() == SU)
    Edges.Succs.push_back(Edge);
  else
    Edges.Preds.push_back(Edge);
}

void SwingSchedulerDDG::initEdges(SUnit *SU) {
  for (const auto &PI : SU->Preds) {
    SwingSchedulerDDGEdge Edge(SU, PI, false);
    addEdge(SU, Edge);
  }

  for (const auto &SI : SU->Succs) {
    SwingSchedulerDDGEdge Edge(SU, SI, true);
    addEdge(SU, Edge);
  }
}

SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
                                     SUnit *ExitSU)
    : EntrySU(EntrySU), ExitSU(ExitSU) {
  EdgesVec.resize(SUnits.size());

  initEdges(EntrySU);
  initEdges(ExitSU);
  for (auto &SU : SUnits)
    initEdges(&SU);
}

const SwingSchedulerDDG::EdgesType &
SwingSchedulerDDG::getInEdges(const SUnit *SU) const {
  return getEdges(SU).Preds;
}

const SwingSchedulerDDG::EdgesType &
SwingSchedulerDDG::getOutEdges(const SUnit *SU) const {
  return getEdges(SU).Succs;
}

void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits) {
  // Currently this function simply adds all dependencies represented by this
  // object. After we properly handle missed dependencies, the logic here will
  // be more complex, as currently missed edges should not be added to the DAG.
  for (SUnit &SU : SUnits) {
    SUnit *Src = &SU;
    if (auto *OrderDep = getOrderDepOrNull(Src)) {
      SDep Dep(Src, SDep::Barrier);
      Dep.setLatency(1);
      for (SUnit *Dst : *OrderDep)
        Dst->addPred(Dep);
    }
  }
}

void LoopCarriedEdges::dump(SUnit *SU, const TargetRegisterInfo *TRI,
                            const MachineRegisterInfo *MRI) const {
  const auto *Order = getOrderDepOrNull(SU);

  if (!Order)
    return;

  const auto DumpSU = [](const SUnit *SU) {
    std::ostringstream OSS;
    OSS << "SU(" << SU->NodeNum << ")";
    return OSS.str();
  };

  dbgs() << " Loop carried edges from " << DumpSU(SU) << "\n"
         << " Order\n";
  for (SUnit *Dst : *Order)
    dbgs() << " " << DumpSU(Dst) << "\n";
}