1 | //===- MachineScheduler.cpp - Machine Instruction Scheduler ---------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // MachineScheduler schedules machine instructions after phi elimination. It |
10 | // preserves LiveIntervals so it can be invoked before register allocation. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "llvm/CodeGen/MachineScheduler.h" |
15 | #include "llvm/ADT/ArrayRef.h" |
16 | #include "llvm/ADT/BitVector.h" |
17 | #include "llvm/ADT/DenseMap.h" |
18 | #include "llvm/ADT/PriorityQueue.h" |
19 | #include "llvm/ADT/STLExtras.h" |
20 | #include "llvm/ADT/SmallVector.h" |
21 | #include "llvm/ADT/Statistic.h" |
22 | #include "llvm/ADT/iterator_range.h" |
23 | #include "llvm/Analysis/AliasAnalysis.h" |
24 | #include "llvm/CodeGen/LiveInterval.h" |
25 | #include "llvm/CodeGen/LiveIntervals.h" |
26 | #include "llvm/CodeGen/MachineBasicBlock.h" |
27 | #include "llvm/CodeGen/MachineDominators.h" |
28 | #include "llvm/CodeGen/MachineFunction.h" |
29 | #include "llvm/CodeGen/MachineFunctionPass.h" |
30 | #include "llvm/CodeGen/MachineInstr.h" |
31 | #include "llvm/CodeGen/MachineLoopInfo.h" |
32 | #include "llvm/CodeGen/MachineOperand.h" |
33 | #include "llvm/CodeGen/MachinePassRegistry.h" |
34 | #include "llvm/CodeGen/MachineRegisterInfo.h" |
35 | #include "llvm/CodeGen/RegisterClassInfo.h" |
36 | #include "llvm/CodeGen/RegisterPressure.h" |
37 | #include "llvm/CodeGen/ScheduleDAG.h" |
38 | #include "llvm/CodeGen/ScheduleDAGInstrs.h" |
39 | #include "llvm/CodeGen/ScheduleDAGMutation.h" |
40 | #include "llvm/CodeGen/ScheduleDFS.h" |
41 | #include "llvm/CodeGen/ScheduleHazardRecognizer.h" |
42 | #include "llvm/CodeGen/SlotIndexes.h" |
43 | #include "llvm/CodeGen/TargetFrameLowering.h" |
44 | #include "llvm/CodeGen/TargetInstrInfo.h" |
45 | #include "llvm/CodeGen/TargetLowering.h" |
46 | #include "llvm/CodeGen/TargetPassConfig.h" |
47 | #include "llvm/CodeGen/TargetRegisterInfo.h" |
48 | #include "llvm/CodeGen/TargetSchedule.h" |
49 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
50 | #include "llvm/CodeGenTypes/MachineValueType.h" |
51 | #include "llvm/Config/llvm-config.h" |
52 | #include "llvm/InitializePasses.h" |
53 | #include "llvm/MC/LaneBitmask.h" |
54 | #include "llvm/Pass.h" |
55 | #include "llvm/Support/CommandLine.h" |
56 | #include "llvm/Support/Compiler.h" |
57 | #include "llvm/Support/Debug.h" |
58 | #include "llvm/Support/ErrorHandling.h" |
59 | #include "llvm/Support/GraphWriter.h" |
60 | #include "llvm/Support/raw_ostream.h" |
61 | #include <algorithm> |
62 | #include <cassert> |
63 | #include <cstdint> |
64 | #include <iterator> |
65 | #include <limits> |
66 | #include <memory> |
67 | #include <string> |
68 | #include <tuple> |
69 | #include <utility> |
70 | #include <vector> |
71 | |
72 | using namespace llvm; |
73 | |
74 | #define DEBUG_TYPE "machine-scheduler" |
75 | |
76 | STATISTIC(NumClustered, "Number of load/store pairs clustered" ); |
77 | |
78 | namespace llvm { |
79 | |
80 | cl::opt<bool> ForceTopDown("misched-topdown" , cl::Hidden, |
81 | cl::desc("Force top-down list scheduling" )); |
82 | cl::opt<bool> ForceBottomUp("misched-bottomup" , cl::Hidden, |
83 | cl::desc("Force bottom-up list scheduling" )); |
84 | namespace MISchedPostRASched { |
85 | enum Direction { |
86 | TopDown, |
87 | BottomUp, |
88 | Bidirectional, |
89 | }; |
90 | } // end namespace MISchedPostRASched |
91 | cl::opt<MISchedPostRASched::Direction> PostRADirection( |
92 | "misched-postra-direction" , cl::Hidden, |
93 | cl::desc("Post reg-alloc list scheduling direction" ), |
94 | // Default to top-down because it was implemented first and existing targets |
95 | // expect that behavior by default. |
96 | cl::init(Val: MISchedPostRASched::TopDown), |
97 | cl::values( |
98 | clEnumValN(MISchedPostRASched::TopDown, "topdown" , |
99 | "Force top-down post reg-alloc list scheduling" ), |
100 | clEnumValN(MISchedPostRASched::BottomUp, "bottomup" , |
101 | "Force bottom-up post reg-alloc list scheduling" ), |
102 | clEnumValN(MISchedPostRASched::Bidirectional, "bidirectional" , |
103 | "Force bidirectional post reg-alloc list scheduling" ))); |
104 | cl::opt<bool> |
105 | DumpCriticalPathLength("misched-dcpl" , cl::Hidden, |
106 | cl::desc("Print critical path length to stdout" )); |
107 | |
108 | cl::opt<bool> VerifyScheduling( |
109 | "verify-misched" , cl::Hidden, |
110 | cl::desc("Verify machine instrs before and after machine scheduling" )); |
111 | |
112 | #ifndef NDEBUG |
113 | cl::opt<bool> ViewMISchedDAGs( |
114 | "view-misched-dags" , cl::Hidden, |
115 | cl::desc("Pop up a window to show MISched dags after they are processed" )); |
116 | cl::opt<bool> PrintDAGs("misched-print-dags" , cl::Hidden, |
117 | cl::desc("Print schedule DAGs" )); |
118 | cl::opt<bool> MISchedDumpReservedCycles( |
119 | "misched-dump-reserved-cycles" , cl::Hidden, cl::init(false), |
120 | cl::desc("Dump resource usage at schedule boundary." )); |
121 | cl::opt<bool> MischedDetailResourceBooking( |
122 | "misched-detail-resource-booking" , cl::Hidden, cl::init(false), |
123 | cl::desc("Show details of invoking getNextResoufceCycle." )); |
124 | #else |
125 | const bool ViewMISchedDAGs = false; |
126 | const bool PrintDAGs = false; |
127 | const bool MischedDetailResourceBooking = false; |
128 | #ifdef LLVM_ENABLE_DUMP |
129 | const bool MISchedDumpReservedCycles = false; |
130 | #endif // LLVM_ENABLE_DUMP |
131 | #endif // NDEBUG |
132 | |
133 | } // end namespace llvm |
134 | |
135 | #ifndef NDEBUG |
136 | /// In some situations a few uninteresting nodes depend on nearly all other |
137 | /// nodes in the graph, provide a cutoff to hide them. |
138 | static cl::opt<unsigned> ViewMISchedCutoff("view-misched-cutoff" , cl::Hidden, |
139 | cl::desc("Hide nodes with more predecessor/successor than cutoff" )); |
140 | |
141 | static cl::opt<unsigned> MISchedCutoff("misched-cutoff" , cl::Hidden, |
142 | cl::desc("Stop scheduling after N instructions" ), cl::init(~0U)); |
143 | |
144 | static cl::opt<std::string> SchedOnlyFunc("misched-only-func" , cl::Hidden, |
145 | cl::desc("Only schedule this function" )); |
146 | static cl::opt<unsigned> SchedOnlyBlock("misched-only-block" , cl::Hidden, |
147 | cl::desc("Only schedule this MBB#" )); |
148 | #endif // NDEBUG |
149 | |
150 | /// Avoid quadratic complexity in unusually large basic blocks by limiting the |
151 | /// size of the ready lists. |
152 | static cl::opt<unsigned> ReadyListLimit("misched-limit" , cl::Hidden, |
153 | cl::desc("Limit ready list to N instructions" ), cl::init(Val: 256)); |
154 | |
155 | static cl::opt<bool> EnableRegPressure("misched-regpressure" , cl::Hidden, |
156 | cl::desc("Enable register pressure scheduling." ), cl::init(Val: true)); |
157 | |
158 | static cl::opt<bool> EnableCyclicPath("misched-cyclicpath" , cl::Hidden, |
159 | cl::desc("Enable cyclic critical path analysis." ), cl::init(Val: true)); |
160 | |
161 | static cl::opt<bool> EnableMemOpCluster("misched-cluster" , cl::Hidden, |
162 | cl::desc("Enable memop clustering." ), |
163 | cl::init(Val: true)); |
164 | static cl::opt<bool> |
165 | ForceFastCluster("force-fast-cluster" , cl::Hidden, |
166 | cl::desc("Switch to fast cluster algorithm with the lost " |
167 | "of some fusion opportunities" ), |
168 | cl::init(Val: false)); |
169 | static cl::opt<unsigned> |
170 | FastClusterThreshold("fast-cluster-threshold" , cl::Hidden, |
171 | cl::desc("The threshold for fast cluster" ), |
172 | cl::init(Val: 1000)); |
173 | |
174 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
175 | static cl::opt<bool> MISchedDumpScheduleTrace( |
176 | "misched-dump-schedule-trace" , cl::Hidden, cl::init(false), |
177 | cl::desc("Dump resource usage at schedule boundary." )); |
178 | static cl::opt<unsigned> |
179 | HeaderColWidth("misched-dump-schedule-trace-col-header-width" , cl::Hidden, |
180 | cl::desc("Set width of the columns with " |
181 | "the resources and schedule units" ), |
182 | cl::init(19)); |
183 | static cl::opt<unsigned> |
184 | ColWidth("misched-dump-schedule-trace-col-width" , cl::Hidden, |
185 | cl::desc("Set width of the columns showing resource booking." ), |
186 | cl::init(5)); |
187 | static cl::opt<bool> MISchedSortResourcesInTrace( |
188 | "misched-sort-resources-in-trace" , cl::Hidden, cl::init(true), |
189 | cl::desc("Sort the resources printed in the dump trace" )); |
190 | #endif |
191 | |
192 | static cl::opt<unsigned> |
193 | MIResourceCutOff("misched-resource-cutoff" , cl::Hidden, |
194 | cl::desc("Number of intervals to track" ), cl::init(Val: 10)); |
195 | |
196 | // DAG subtrees must have at least this many nodes. |
197 | static const unsigned MinSubtreeSize = 8; |
198 | |
199 | // Pin the vtables to this file. |
200 | void MachineSchedStrategy::anchor() {} |
201 | |
202 | void ScheduleDAGMutation::anchor() {} |
203 | |
204 | //===----------------------------------------------------------------------===// |
205 | // Machine Instruction Scheduling Pass and Registry |
206 | //===----------------------------------------------------------------------===// |
207 | |
208 | MachineSchedContext::MachineSchedContext() { |
209 | RegClassInfo = new RegisterClassInfo(); |
210 | } |
211 | |
212 | MachineSchedContext::~MachineSchedContext() { |
213 | delete RegClassInfo; |
214 | } |
215 | |
216 | namespace { |
217 | |
218 | /// Base class for a machine scheduler class that can run at any point. |
219 | class MachineSchedulerBase : public MachineSchedContext, |
220 | public MachineFunctionPass { |
221 | public: |
222 | MachineSchedulerBase(char &ID): MachineFunctionPass(ID) {} |
223 | |
224 | void print(raw_ostream &O, const Module* = nullptr) const override; |
225 | |
226 | protected: |
227 | void scheduleRegions(ScheduleDAGInstrs &Scheduler, bool FixKillFlags); |
228 | }; |
229 | |
230 | /// MachineScheduler runs after coalescing and before register allocation. |
231 | class MachineScheduler : public MachineSchedulerBase { |
232 | public: |
233 | MachineScheduler(); |
234 | |
235 | void getAnalysisUsage(AnalysisUsage &AU) const override; |
236 | |
237 | bool runOnMachineFunction(MachineFunction&) override; |
238 | |
239 | static char ID; // Class identification, replacement for typeinfo |
240 | |
241 | protected: |
242 | ScheduleDAGInstrs *createMachineScheduler(); |
243 | }; |
244 | |
245 | /// PostMachineScheduler runs after shortly before code emission. |
246 | class PostMachineScheduler : public MachineSchedulerBase { |
247 | public: |
248 | PostMachineScheduler(); |
249 | |
250 | void getAnalysisUsage(AnalysisUsage &AU) const override; |
251 | |
252 | bool runOnMachineFunction(MachineFunction&) override; |
253 | |
254 | static char ID; // Class identification, replacement for typeinfo |
255 | |
256 | protected: |
257 | ScheduleDAGInstrs *createPostMachineScheduler(); |
258 | }; |
259 | |
260 | } // end anonymous namespace |
261 | |
262 | char MachineScheduler::ID = 0; |
263 | |
264 | char &llvm::MachineSchedulerID = MachineScheduler::ID; |
265 | |
266 | INITIALIZE_PASS_BEGIN(MachineScheduler, DEBUG_TYPE, |
267 | "Machine Instruction Scheduler" , false, false) |
268 | INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) |
269 | INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) |
270 | INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) |
271 | INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) |
272 | INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) |
273 | INITIALIZE_PASS_END(MachineScheduler, DEBUG_TYPE, |
274 | "Machine Instruction Scheduler" , false, false) |
275 | |
276 | MachineScheduler::MachineScheduler() : MachineSchedulerBase(ID) { |
277 | initializeMachineSchedulerPass(Registry&: *PassRegistry::getPassRegistry()); |
278 | } |
279 | |
280 | void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { |
281 | AU.setPreservesCFG(); |
282 | AU.addRequired<MachineDominatorTreeWrapperPass>(); |
283 | AU.addRequired<MachineLoopInfoWrapperPass>(); |
284 | AU.addRequired<AAResultsWrapperPass>(); |
285 | AU.addRequired<TargetPassConfig>(); |
286 | AU.addRequired<SlotIndexesWrapperPass>(); |
287 | AU.addPreserved<SlotIndexesWrapperPass>(); |
288 | AU.addRequired<LiveIntervalsWrapperPass>(); |
289 | AU.addPreserved<LiveIntervalsWrapperPass>(); |
290 | MachineFunctionPass::getAnalysisUsage(AU); |
291 | } |
292 | |
293 | char PostMachineScheduler::ID = 0; |
294 | |
295 | char &llvm::PostMachineSchedulerID = PostMachineScheduler::ID; |
296 | |
297 | INITIALIZE_PASS_BEGIN(PostMachineScheduler, "postmisched" , |
298 | "PostRA Machine Instruction Scheduler" , false, false) |
299 | INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) |
300 | INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) |
301 | INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) |
302 | INITIALIZE_PASS_END(PostMachineScheduler, "postmisched" , |
303 | "PostRA Machine Instruction Scheduler" , false, false) |
304 | |
305 | PostMachineScheduler::PostMachineScheduler() : MachineSchedulerBase(ID) { |
306 | initializePostMachineSchedulerPass(Registry&: *PassRegistry::getPassRegistry()); |
307 | } |
308 | |
309 | void PostMachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { |
310 | AU.setPreservesCFG(); |
311 | AU.addRequired<MachineDominatorTreeWrapperPass>(); |
312 | AU.addRequired<MachineLoopInfoWrapperPass>(); |
313 | AU.addRequired<AAResultsWrapperPass>(); |
314 | AU.addRequired<TargetPassConfig>(); |
315 | MachineFunctionPass::getAnalysisUsage(AU); |
316 | } |
317 | |
318 | MachinePassRegistry<MachineSchedRegistry::ScheduleDAGCtor> |
319 | MachineSchedRegistry::Registry; |
320 | |
321 | /// A dummy default scheduler factory indicates whether the scheduler |
322 | /// is overridden on the command line. |
323 | static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) { |
324 | return nullptr; |
325 | } |
326 | |
327 | /// MachineSchedOpt allows command line selection of the scheduler. |
328 | static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false, |
329 | RegisterPassParser<MachineSchedRegistry>> |
330 | MachineSchedOpt("misched" , |
331 | cl::init(Val: &useDefaultMachineSched), cl::Hidden, |
332 | cl::desc("Machine instruction scheduler to use" )); |
333 | |
334 | static MachineSchedRegistry |
335 | DefaultSchedRegistry("default" , "Use the target's default scheduler choice." , |
336 | useDefaultMachineSched); |
337 | |
338 | static cl::opt<bool> EnableMachineSched( |
339 | "enable-misched" , |
340 | cl::desc("Enable the machine instruction scheduling pass." ), cl::init(Val: true), |
341 | cl::Hidden); |
342 | |
343 | static cl::opt<bool> EnablePostRAMachineSched( |
344 | "enable-post-misched" , |
345 | cl::desc("Enable the post-ra machine instruction scheduling pass." ), |
346 | cl::init(Val: true), cl::Hidden); |
347 | |
348 | /// Decrement this iterator until reaching the top or a non-debug instr. |
349 | static MachineBasicBlock::const_iterator |
350 | priorNonDebug(MachineBasicBlock::const_iterator I, |
351 | MachineBasicBlock::const_iterator Beg) { |
352 | assert(I != Beg && "reached the top of the region, cannot decrement" ); |
353 | while (--I != Beg) { |
354 | if (!I->isDebugOrPseudoInstr()) |
355 | break; |
356 | } |
357 | return I; |
358 | } |
359 | |
360 | /// Non-const version. |
361 | static MachineBasicBlock::iterator |
362 | priorNonDebug(MachineBasicBlock::iterator I, |
363 | MachineBasicBlock::const_iterator Beg) { |
364 | return priorNonDebug(I: MachineBasicBlock::const_iterator(I), Beg) |
365 | .getNonConstIterator(); |
366 | } |
367 | |
368 | /// If this iterator is a debug value, increment until reaching the End or a |
369 | /// non-debug instruction. |
370 | static MachineBasicBlock::const_iterator |
371 | nextIfDebug(MachineBasicBlock::const_iterator I, |
372 | MachineBasicBlock::const_iterator End) { |
373 | for(; I != End; ++I) { |
374 | if (!I->isDebugOrPseudoInstr()) |
375 | break; |
376 | } |
377 | return I; |
378 | } |
379 | |
380 | /// Non-const version. |
381 | static MachineBasicBlock::iterator |
382 | nextIfDebug(MachineBasicBlock::iterator I, |
383 | MachineBasicBlock::const_iterator End) { |
384 | return nextIfDebug(I: MachineBasicBlock::const_iterator(I), End) |
385 | .getNonConstIterator(); |
386 | } |
387 | |
388 | /// Instantiate a ScheduleDAGInstrs that will be owned by the caller. |
389 | ScheduleDAGInstrs *MachineScheduler::createMachineScheduler() { |
390 | // Select the scheduler, or set the default. |
391 | MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt; |
392 | if (Ctor != useDefaultMachineSched) |
393 | return Ctor(this); |
394 | |
395 | // Get the default scheduler set by the target for this function. |
396 | ScheduleDAGInstrs *Scheduler = PassConfig->createMachineScheduler(C: this); |
397 | if (Scheduler) |
398 | return Scheduler; |
399 | |
400 | // Default to GenericScheduler. |
401 | return createGenericSchedLive(C: this); |
402 | } |
403 | |
404 | /// Instantiate a ScheduleDAGInstrs for PostRA scheduling that will be owned by |
405 | /// the caller. We don't have a command line option to override the postRA |
406 | /// scheduler. The Target must configure it. |
407 | ScheduleDAGInstrs *PostMachineScheduler::createPostMachineScheduler() { |
408 | // Get the postRA scheduler set by the target for this function. |
409 | ScheduleDAGInstrs *Scheduler = PassConfig->createPostMachineScheduler(C: this); |
410 | if (Scheduler) |
411 | return Scheduler; |
412 | |
413 | // Default to GenericScheduler. |
414 | return createGenericSchedPostRA(C: this); |
415 | } |
416 | |
417 | /// Top-level MachineScheduler pass driver. |
418 | /// |
419 | /// Visit blocks in function order. Divide each block into scheduling regions |
420 | /// and visit them bottom-up. Visiting regions bottom-up is not required, but is |
421 | /// consistent with the DAG builder, which traverses the interior of the |
422 | /// scheduling regions bottom-up. |
423 | /// |
424 | /// This design avoids exposing scheduling boundaries to the DAG builder, |
425 | /// simplifying the DAG builder's support for "special" target instructions. |
426 | /// At the same time the design allows target schedulers to operate across |
427 | /// scheduling boundaries, for example to bundle the boundary instructions |
428 | /// without reordering them. This creates complexity, because the target |
429 | /// scheduler must update the RegionBegin and RegionEnd positions cached by |
430 | /// ScheduleDAGInstrs whenever adding or removing instructions. A much simpler |
431 | /// design would be to split blocks at scheduling boundaries, but LLVM has a |
432 | /// general bias against block splitting purely for implementation simplicity. |
433 | bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { |
434 | if (skipFunction(F: mf.getFunction())) |
435 | return false; |
436 | |
437 | if (EnableMachineSched.getNumOccurrences()) { |
438 | if (!EnableMachineSched) |
439 | return false; |
440 | } else if (!mf.getSubtarget().enableMachineScheduler()) |
441 | return false; |
442 | |
443 | LLVM_DEBUG(dbgs() << "Before MISched:\n" ; mf.print(dbgs())); |
444 | |
445 | // Initialize the context of the pass. |
446 | MF = &mf; |
447 | MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); |
448 | MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); |
449 | PassConfig = &getAnalysis<TargetPassConfig>(); |
450 | AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); |
451 | |
452 | LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS(); |
453 | |
454 | if (VerifyScheduling) { |
455 | LLVM_DEBUG(LIS->dump()); |
456 | MF->verify(p: this, Banner: "Before machine scheduling." ); |
457 | } |
458 | RegClassInfo->runOnMachineFunction(MF: *MF); |
459 | |
460 | // Instantiate the selected scheduler for this target, function, and |
461 | // optimization level. |
462 | std::unique_ptr<ScheduleDAGInstrs> Scheduler(createMachineScheduler()); |
463 | ScheduleDAGMI::DumpDirection D; |
464 | if (ForceTopDown) |
465 | D = ScheduleDAGMI::DumpDirection::TopDown; |
466 | else if (ForceBottomUp) |
467 | D = ScheduleDAGMI::DumpDirection::BottomUp; |
468 | else |
469 | D = ScheduleDAGMI::DumpDirection::Bidirectional; |
470 | Scheduler->setDumpDirection(D); |
471 | scheduleRegions(Scheduler&: *Scheduler, FixKillFlags: false); |
472 | |
473 | LLVM_DEBUG(LIS->dump()); |
474 | if (VerifyScheduling) |
475 | MF->verify(p: this, Banner: "After machine scheduling." ); |
476 | return true; |
477 | } |
478 | |
479 | bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) { |
480 | if (skipFunction(F: mf.getFunction())) |
481 | return false; |
482 | |
483 | if (EnablePostRAMachineSched.getNumOccurrences()) { |
484 | if (!EnablePostRAMachineSched) |
485 | return false; |
486 | } else if (!mf.getSubtarget().enablePostRAMachineScheduler()) { |
487 | LLVM_DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n" ); |
488 | return false; |
489 | } |
490 | LLVM_DEBUG(dbgs() << "Before post-MI-sched:\n" ; mf.print(dbgs())); |
491 | |
492 | // Initialize the context of the pass. |
493 | MF = &mf; |
494 | MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); |
495 | PassConfig = &getAnalysis<TargetPassConfig>(); |
496 | AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); |
497 | |
498 | if (VerifyScheduling) |
499 | MF->verify(p: this, Banner: "Before post machine scheduling." ); |
500 | |
501 | // Instantiate the selected scheduler for this target, function, and |
502 | // optimization level. |
503 | std::unique_ptr<ScheduleDAGInstrs> Scheduler(createPostMachineScheduler()); |
504 | ScheduleDAGMI::DumpDirection D; |
505 | if (PostRADirection == MISchedPostRASched::TopDown) |
506 | D = ScheduleDAGMI::DumpDirection::TopDown; |
507 | else if (PostRADirection == MISchedPostRASched::BottomUp) |
508 | D = ScheduleDAGMI::DumpDirection::BottomUp; |
509 | else |
510 | D = ScheduleDAGMI::DumpDirection::Bidirectional; |
511 | Scheduler->setDumpDirection(D); |
512 | scheduleRegions(Scheduler&: *Scheduler, FixKillFlags: true); |
513 | |
514 | if (VerifyScheduling) |
515 | MF->verify(p: this, Banner: "After post machine scheduling." ); |
516 | return true; |
517 | } |
518 | |
519 | /// Return true of the given instruction should not be included in a scheduling |
520 | /// region. |
521 | /// |
522 | /// MachineScheduler does not currently support scheduling across calls. To |
523 | /// handle calls, the DAG builder needs to be modified to create register |
524 | /// anti/output dependencies on the registers clobbered by the call's regmask |
525 | /// operand. In PreRA scheduling, the stack pointer adjustment already prevents |
526 | /// scheduling across calls. In PostRA scheduling, we need the isCall to enforce |
527 | /// the boundary, but there would be no benefit to postRA scheduling across |
528 | /// calls this late anyway. |
529 | static bool isSchedBoundary(MachineBasicBlock::iterator MI, |
530 | MachineBasicBlock *MBB, |
531 | MachineFunction *MF, |
532 | const TargetInstrInfo *TII) { |
533 | return MI->isCall() || TII->isSchedulingBoundary(MI: *MI, MBB, MF: *MF); |
534 | } |
535 | |
536 | /// A region of an MBB for scheduling. |
537 | namespace { |
538 | struct SchedRegion { |
539 | /// RegionBegin is the first instruction in the scheduling region, and |
540 | /// RegionEnd is either MBB->end() or the scheduling boundary after the |
541 | /// last instruction in the scheduling region. These iterators cannot refer |
542 | /// to instructions outside of the identified scheduling region because |
543 | /// those may be reordered before scheduling this region. |
544 | MachineBasicBlock::iterator RegionBegin; |
545 | MachineBasicBlock::iterator RegionEnd; |
546 | unsigned NumRegionInstrs; |
547 | |
548 | SchedRegion(MachineBasicBlock::iterator B, MachineBasicBlock::iterator E, |
549 | unsigned N) : |
550 | RegionBegin(B), RegionEnd(E), NumRegionInstrs(N) {} |
551 | }; |
552 | } // end anonymous namespace |
553 | |
554 | using MBBRegionsVector = SmallVector<SchedRegion, 16>; |
555 | |
556 | static void |
557 | getSchedRegions(MachineBasicBlock *MBB, |
558 | MBBRegionsVector &Regions, |
559 | bool RegionsTopDown) { |
560 | MachineFunction *MF = MBB->getParent(); |
561 | const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); |
562 | |
563 | MachineBasicBlock::iterator I = nullptr; |
564 | for(MachineBasicBlock::iterator RegionEnd = MBB->end(); |
565 | RegionEnd != MBB->begin(); RegionEnd = I) { |
566 | |
567 | // Avoid decrementing RegionEnd for blocks with no terminator. |
568 | if (RegionEnd != MBB->end() || |
569 | isSchedBoundary(MI: &*std::prev(x: RegionEnd), MBB: &*MBB, MF, TII)) { |
570 | --RegionEnd; |
571 | } |
572 | |
573 | // The next region starts above the previous region. Look backward in the |
574 | // instruction stream until we find the nearest boundary. |
575 | unsigned NumRegionInstrs = 0; |
576 | I = RegionEnd; |
577 | for (;I != MBB->begin(); --I) { |
578 | MachineInstr &MI = *std::prev(x: I); |
579 | if (isSchedBoundary(MI: &MI, MBB: &*MBB, MF, TII)) |
580 | break; |
581 | if (!MI.isDebugOrPseudoInstr()) { |
582 | // MBB::size() uses instr_iterator to count. Here we need a bundle to |
583 | // count as a single instruction. |
584 | ++NumRegionInstrs; |
585 | } |
586 | } |
587 | |
588 | // It's possible we found a scheduling region that only has debug |
589 | // instructions. Don't bother scheduling these. |
590 | if (NumRegionInstrs != 0) |
591 | Regions.push_back(Elt: SchedRegion(I, RegionEnd, NumRegionInstrs)); |
592 | } |
593 | |
594 | if (RegionsTopDown) |
595 | std::reverse(first: Regions.begin(), last: Regions.end()); |
596 | } |
597 | |
598 | /// Main driver for both MachineScheduler and PostMachineScheduler. |
599 | void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler, |
600 | bool FixKillFlags) { |
601 | // Visit all machine basic blocks. |
602 | // |
603 | // TODO: Visit blocks in global postorder or postorder within the bottom-up |
604 | // loop tree. Then we can optionally compute global RegPressure. |
605 | for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end(); |
606 | MBB != MBBEnd; ++MBB) { |
607 | |
608 | Scheduler.startBlock(BB: &*MBB); |
609 | |
610 | #ifndef NDEBUG |
611 | if (SchedOnlyFunc.getNumOccurrences() && SchedOnlyFunc != MF->getName()) |
612 | continue; |
613 | if (SchedOnlyBlock.getNumOccurrences() |
614 | && (int)SchedOnlyBlock != MBB->getNumber()) |
615 | continue; |
616 | #endif |
617 | |
618 | // Break the block into scheduling regions [I, RegionEnd). RegionEnd |
619 | // points to the scheduling boundary at the bottom of the region. The DAG |
620 | // does not include RegionEnd, but the region does (i.e. the next |
621 | // RegionEnd is above the previous RegionBegin). If the current block has |
622 | // no terminator then RegionEnd == MBB->end() for the bottom region. |
623 | // |
624 | // All the regions of MBB are first found and stored in MBBRegions, which |
625 | // will be processed (MBB) top-down if initialized with true. |
626 | // |
627 | // The Scheduler may insert instructions during either schedule() or |
628 | // exitRegion(), even for empty regions. So the local iterators 'I' and |
629 | // 'RegionEnd' are invalid across these calls. Instructions must not be |
630 | // added to other regions than the current one without updating MBBRegions. |
631 | |
632 | MBBRegionsVector MBBRegions; |
633 | getSchedRegions(MBB: &*MBB, Regions&: MBBRegions, RegionsTopDown: Scheduler.doMBBSchedRegionsTopDown()); |
634 | for (const SchedRegion &R : MBBRegions) { |
635 | MachineBasicBlock::iterator I = R.RegionBegin; |
636 | MachineBasicBlock::iterator RegionEnd = R.RegionEnd; |
637 | unsigned NumRegionInstrs = R.NumRegionInstrs; |
638 | |
639 | // Notify the scheduler of the region, even if we may skip scheduling |
640 | // it. Perhaps it still needs to be bundled. |
641 | Scheduler.enterRegion(bb: &*MBB, begin: I, end: RegionEnd, regioninstrs: NumRegionInstrs); |
642 | |
643 | // Skip empty scheduling regions (0 or 1 schedulable instructions). |
644 | if (I == RegionEnd || I == std::prev(x: RegionEnd)) { |
645 | // Close the current region. Bundle the terminator if needed. |
646 | // This invalidates 'RegionEnd' and 'I'. |
647 | Scheduler.exitRegion(); |
648 | continue; |
649 | } |
650 | LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n" ); |
651 | LLVM_DEBUG(dbgs() << MF->getName() << ":" << printMBBReference(*MBB) |
652 | << " " << MBB->getName() << "\n From: " << *I |
653 | << " To: " ; |
654 | if (RegionEnd != MBB->end()) dbgs() << *RegionEnd; |
655 | else dbgs() << "End\n" ; |
656 | dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); |
657 | if (DumpCriticalPathLength) { |
658 | errs() << MF->getName(); |
659 | errs() << ":%bb. " << MBB->getNumber(); |
660 | errs() << " " << MBB->getName() << " \n" ; |
661 | } |
662 | |
663 | // Schedule a region: possibly reorder instructions. |
664 | // This invalidates the original region iterators. |
665 | Scheduler.schedule(); |
666 | |
667 | // Close the current region. |
668 | Scheduler.exitRegion(); |
669 | } |
670 | Scheduler.finishBlock(); |
671 | // FIXME: Ideally, no further passes should rely on kill flags. However, |
672 | // thumb2 size reduction is currently an exception, so the PostMIScheduler |
673 | // needs to do this. |
674 | if (FixKillFlags) |
675 | Scheduler.fixupKills(MBB&: *MBB); |
676 | } |
677 | Scheduler.finalizeSchedule(); |
678 | } |
679 | |
680 | void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const { |
681 | // unimplemented |
682 | } |
683 | |
684 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
685 | LLVM_DUMP_METHOD void ReadyQueue::dump() const { |
686 | dbgs() << "Queue " << Name << ": " ; |
687 | for (const SUnit *SU : Queue) |
688 | dbgs() << SU->NodeNum << " " ; |
689 | dbgs() << "\n" ; |
690 | } |
691 | #endif |
692 | |
693 | //===----------------------------------------------------------------------===// |
694 | // ScheduleDAGMI - Basic machine instruction scheduling. This is |
695 | // independent of PreRA/PostRA scheduling and involves no extra book-keeping for |
696 | // virtual registers. |
697 | // ===----------------------------------------------------------------------===/ |
698 | |
699 | // Provide a vtable anchor. |
700 | ScheduleDAGMI::~ScheduleDAGMI() = default; |
701 | |
702 | /// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When |
703 | /// NumPredsLeft reaches zero, release the successor node. |
704 | /// |
705 | /// FIXME: Adjust SuccSU height based on MinLatency. |
706 | void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) { |
707 | SUnit *SuccSU = SuccEdge->getSUnit(); |
708 | |
709 | if (SuccEdge->isWeak()) { |
710 | --SuccSU->WeakPredsLeft; |
711 | if (SuccEdge->isCluster()) |
712 | NextClusterSucc = SuccSU; |
713 | return; |
714 | } |
715 | #ifndef NDEBUG |
716 | if (SuccSU->NumPredsLeft == 0) { |
717 | dbgs() << "*** Scheduling failed! ***\n" ; |
718 | dumpNode(*SuccSU); |
719 | dbgs() << " has been released too many times!\n" ; |
720 | llvm_unreachable(nullptr); |
721 | } |
722 | #endif |
723 | // SU->TopReadyCycle was set to CurrCycle when it was scheduled. However, |
724 | // CurrCycle may have advanced since then. |
725 | if (SuccSU->TopReadyCycle < SU->TopReadyCycle + SuccEdge->getLatency()) |
726 | SuccSU->TopReadyCycle = SU->TopReadyCycle + SuccEdge->getLatency(); |
727 | |
728 | --SuccSU->NumPredsLeft; |
729 | if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU) |
730 | SchedImpl->releaseTopNode(SU: SuccSU); |
731 | } |
732 | |
733 | /// releaseSuccessors - Call releaseSucc on each of SU's successors. |
734 | void ScheduleDAGMI::releaseSuccessors(SUnit *SU) { |
735 | for (SDep &Succ : SU->Succs) |
736 | releaseSucc(SU, SuccEdge: &Succ); |
737 | } |
738 | |
739 | /// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When |
740 | /// NumSuccsLeft reaches zero, release the predecessor node. |
741 | /// |
742 | /// FIXME: Adjust PredSU height based on MinLatency. |
743 | void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) { |
744 | SUnit *PredSU = PredEdge->getSUnit(); |
745 | |
746 | if (PredEdge->isWeak()) { |
747 | --PredSU->WeakSuccsLeft; |
748 | if (PredEdge->isCluster()) |
749 | NextClusterPred = PredSU; |
750 | return; |
751 | } |
752 | #ifndef NDEBUG |
753 | if (PredSU->NumSuccsLeft == 0) { |
754 | dbgs() << "*** Scheduling failed! ***\n" ; |
755 | dumpNode(*PredSU); |
756 | dbgs() << " has been released too many times!\n" ; |
757 | llvm_unreachable(nullptr); |
758 | } |
759 | #endif |
760 | // SU->BotReadyCycle was set to CurrCycle when it was scheduled. However, |
761 | // CurrCycle may have advanced since then. |
762 | if (PredSU->BotReadyCycle < SU->BotReadyCycle + PredEdge->getLatency()) |
763 | PredSU->BotReadyCycle = SU->BotReadyCycle + PredEdge->getLatency(); |
764 | |
765 | --PredSU->NumSuccsLeft; |
766 | if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU) |
767 | SchedImpl->releaseBottomNode(SU: PredSU); |
768 | } |
769 | |
770 | /// releasePredecessors - Call releasePred on each of SU's predecessors. |
771 | void ScheduleDAGMI::releasePredecessors(SUnit *SU) { |
772 | for (SDep &Pred : SU->Preds) |
773 | releasePred(SU, PredEdge: &Pred); |
774 | } |
775 | |
776 | void ScheduleDAGMI::startBlock(MachineBasicBlock *bb) { |
777 | ScheduleDAGInstrs::startBlock(BB: bb); |
778 | SchedImpl->enterMBB(MBB: bb); |
779 | } |
780 | |
781 | void ScheduleDAGMI::finishBlock() { |
782 | SchedImpl->leaveMBB(); |
783 | ScheduleDAGInstrs::finishBlock(); |
784 | } |
785 | |
786 | /// enterRegion - Called back from PostMachineScheduler::runOnMachineFunction |
787 | /// after crossing a scheduling boundary. [begin, end) includes all instructions |
788 | /// in the region, including the boundary itself and single-instruction regions |
789 | /// that don't get scheduled. |
790 | void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb, |
791 | MachineBasicBlock::iterator begin, |
792 | MachineBasicBlock::iterator end, |
793 | unsigned regioninstrs) |
794 | { |
795 | ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs); |
796 | |
797 | SchedImpl->initPolicy(Begin: begin, End: end, NumRegionInstrs: regioninstrs); |
798 | } |
799 | |
800 | /// This is normally called from the main scheduler loop but may also be invoked |
801 | /// by the scheduling strategy to perform additional code motion. |
802 | void ScheduleDAGMI::moveInstruction( |
803 | MachineInstr *MI, MachineBasicBlock::iterator InsertPos) { |
804 | // Advance RegionBegin if the first instruction moves down. |
805 | if (&*RegionBegin == MI) |
806 | ++RegionBegin; |
807 | |
808 | // Update the instruction stream. |
809 | BB->splice(Where: InsertPos, Other: BB, From: MI); |
810 | |
811 | // Update LiveIntervals |
812 | if (LIS) |
813 | LIS->handleMove(MI&: *MI, /*UpdateFlags=*/true); |
814 | |
815 | // Recede RegionBegin if an instruction moves above the first. |
816 | if (RegionBegin == InsertPos) |
817 | RegionBegin = MI; |
818 | } |
819 | |
820 | bool ScheduleDAGMI::checkSchedLimit() { |
821 | #if LLVM_ENABLE_ABI_BREAKING_CHECKS && !defined(NDEBUG) |
822 | if (NumInstrsScheduled == MISchedCutoff && MISchedCutoff != ~0U) { |
823 | CurrentTop = CurrentBottom; |
824 | return false; |
825 | } |
826 | ++NumInstrsScheduled; |
827 | #endif |
828 | return true; |
829 | } |
830 | |
831 | /// Per-region scheduling driver, called back from |
832 | /// PostMachineScheduler::runOnMachineFunction. This is a simplified driver |
833 | /// that does not consider liveness or register pressure. It is useful for |
834 | /// PostRA scheduling and potentially other custom schedulers. |
835 | void ScheduleDAGMI::schedule() { |
836 | LLVM_DEBUG(dbgs() << "ScheduleDAGMI::schedule starting\n" ); |
837 | LLVM_DEBUG(SchedImpl->dumpPolicy()); |
838 | |
839 | // Build the DAG. |
840 | buildSchedGraph(AA); |
841 | |
842 | postProcessDAG(); |
843 | |
844 | SmallVector<SUnit*, 8> TopRoots, BotRoots; |
845 | findRootsAndBiasEdges(TopRoots, BotRoots); |
846 | |
847 | LLVM_DEBUG(dump()); |
848 | if (PrintDAGs) dump(); |
849 | if (ViewMISchedDAGs) viewGraph(); |
850 | |
851 | // Initialize the strategy before modifying the DAG. |
852 | // This may initialize a DFSResult to be used for queue priority. |
853 | SchedImpl->initialize(DAG: this); |
854 | |
855 | // Initialize ready queues now that the DAG and priority data are finalized. |
856 | initQueues(TopRoots, BotRoots); |
857 | |
858 | bool IsTopNode = false; |
859 | while (true) { |
860 | LLVM_DEBUG(dbgs() << "** ScheduleDAGMI::schedule picking next node\n" ); |
861 | SUnit *SU = SchedImpl->pickNode(IsTopNode); |
862 | if (!SU) break; |
863 | |
864 | assert(!SU->isScheduled && "Node already scheduled" ); |
865 | if (!checkSchedLimit()) |
866 | break; |
867 | |
868 | MachineInstr *MI = SU->getInstr(); |
869 | if (IsTopNode) { |
870 | assert(SU->isTopReady() && "node still has unscheduled dependencies" ); |
871 | if (&*CurrentTop == MI) |
872 | CurrentTop = nextIfDebug(I: ++CurrentTop, End: CurrentBottom); |
873 | else |
874 | moveInstruction(MI, InsertPos: CurrentTop); |
875 | } else { |
876 | assert(SU->isBottomReady() && "node still has unscheduled dependencies" ); |
877 | MachineBasicBlock::iterator priorII = |
878 | priorNonDebug(I: CurrentBottom, Beg: CurrentTop); |
879 | if (&*priorII == MI) |
880 | CurrentBottom = priorII; |
881 | else { |
882 | if (&*CurrentTop == MI) |
883 | CurrentTop = nextIfDebug(I: ++CurrentTop, End: priorII); |
884 | moveInstruction(MI, InsertPos: CurrentBottom); |
885 | CurrentBottom = MI; |
886 | } |
887 | } |
888 | // Notify the scheduling strategy before updating the DAG. |
889 | // This sets the scheduled node's ReadyCycle to CurrCycle. When updateQueues |
890 | // runs, it can then use the accurate ReadyCycle time to determine whether |
891 | // newly released nodes can move to the readyQ. |
892 | SchedImpl->schedNode(SU, IsTopNode); |
893 | |
894 | updateQueues(SU, IsTopNode); |
895 | } |
896 | assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone." ); |
897 | |
898 | placeDebugValues(); |
899 | |
900 | LLVM_DEBUG({ |
901 | dbgs() << "*** Final schedule for " |
902 | << printMBBReference(*begin()->getParent()) << " ***\n" ; |
903 | dumpSchedule(); |
904 | dbgs() << '\n'; |
905 | }); |
906 | } |
907 | |
908 | /// Apply each ScheduleDAGMutation step in order. |
909 | void ScheduleDAGMI::postProcessDAG() { |
910 | for (auto &m : Mutations) |
911 | m->apply(DAG: this); |
912 | } |
913 | |
914 | void ScheduleDAGMI:: |
915 | findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots, |
916 | SmallVectorImpl<SUnit*> &BotRoots) { |
917 | for (SUnit &SU : SUnits) { |
918 | assert(!SU.isBoundaryNode() && "Boundary node should not be in SUnits" ); |
919 | |
920 | // Order predecessors so DFSResult follows the critical path. |
921 | SU.biasCriticalPath(); |
922 | |
923 | // A SUnit is ready to top schedule if it has no predecessors. |
924 | if (!SU.NumPredsLeft) |
925 | TopRoots.push_back(Elt: &SU); |
926 | // A SUnit is ready to bottom schedule if it has no successors. |
927 | if (!SU.NumSuccsLeft) |
928 | BotRoots.push_back(Elt: &SU); |
929 | } |
930 | ExitSU.biasCriticalPath(); |
931 | } |
932 | |
933 | /// Identify DAG roots and setup scheduler queues. |
934 | void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots, |
935 | ArrayRef<SUnit*> BotRoots) { |
936 | NextClusterSucc = nullptr; |
937 | NextClusterPred = nullptr; |
938 | |
939 | // Release all DAG roots for scheduling, not including EntrySU/ExitSU. |
940 | // |
941 | // Nodes with unreleased weak edges can still be roots. |
942 | // Release top roots in forward order. |
943 | for (SUnit *SU : TopRoots) |
944 | SchedImpl->releaseTopNode(SU); |
945 | |
946 | // Release bottom roots in reverse order so the higher priority nodes appear |
947 | // first. This is more natural and slightly more efficient. |
948 | for (SmallVectorImpl<SUnit*>::const_reverse_iterator |
949 | I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I) { |
950 | SchedImpl->releaseBottomNode(SU: *I); |
951 | } |
952 | |
953 | releaseSuccessors(SU: &EntrySU); |
954 | releasePredecessors(SU: &ExitSU); |
955 | |
956 | SchedImpl->registerRoots(); |
957 | |
958 | // Advance past initial DebugValues. |
959 | CurrentTop = nextIfDebug(I: RegionBegin, End: RegionEnd); |
960 | CurrentBottom = RegionEnd; |
961 | } |
962 | |
963 | /// Update scheduler queues after scheduling an instruction. |
964 | void ScheduleDAGMI::updateQueues(SUnit *SU, bool IsTopNode) { |
965 | // Release dependent instructions for scheduling. |
966 | if (IsTopNode) |
967 | releaseSuccessors(SU); |
968 | else |
969 | releasePredecessors(SU); |
970 | |
971 | SU->isScheduled = true; |
972 | } |
973 | |
974 | /// Reinsert any remaining debug_values, just like the PostRA scheduler. |
975 | void ScheduleDAGMI::placeDebugValues() { |
976 | // If first instruction was a DBG_VALUE then put it back. |
977 | if (FirstDbgValue) { |
978 | BB->splice(Where: RegionBegin, Other: BB, From: FirstDbgValue); |
979 | RegionBegin = FirstDbgValue; |
980 | } |
981 | |
982 | for (std::vector<std::pair<MachineInstr *, MachineInstr *>>::iterator |
983 | DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) { |
984 | std::pair<MachineInstr *, MachineInstr *> P = *std::prev(x: DI); |
985 | MachineInstr *DbgValue = P.first; |
986 | MachineBasicBlock::iterator OrigPrevMI = P.second; |
987 | if (&*RegionBegin == DbgValue) |
988 | ++RegionBegin; |
989 | BB->splice(Where: std::next(x: OrigPrevMI), Other: BB, From: DbgValue); |
990 | if (RegionEnd != BB->end() && OrigPrevMI == &*RegionEnd) |
991 | RegionEnd = DbgValue; |
992 | } |
993 | } |
994 | |
995 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
996 | static const char *scheduleTableLegend = " i: issue\n x: resource booked" ; |
997 | |
998 | LLVM_DUMP_METHOD void ScheduleDAGMI::dumpScheduleTraceTopDown() const { |
999 | // Bail off when there is no schedule model to query. |
1000 | if (!SchedModel.hasInstrSchedModel()) |
1001 | return; |
1002 | |
1003 | // Nothing to show if there is no or just one instruction. |
1004 | if (BB->size() < 2) |
1005 | return; |
1006 | |
1007 | dbgs() << " * Schedule table (TopDown):\n" ; |
1008 | dbgs() << scheduleTableLegend << "\n" ; |
1009 | const unsigned FirstCycle = getSUnit(&*(std::begin(*this)))->TopReadyCycle; |
1010 | unsigned LastCycle = getSUnit(&*(std::prev(std::end(*this))))->TopReadyCycle; |
1011 | for (MachineInstr &MI : *this) { |
1012 | SUnit *SU = getSUnit(&MI); |
1013 | if (!SU) |
1014 | continue; |
1015 | const MCSchedClassDesc *SC = getSchedClass(SU); |
1016 | for (TargetSchedModel::ProcResIter PI = SchedModel.getWriteProcResBegin(SC), |
1017 | PE = SchedModel.getWriteProcResEnd(SC); |
1018 | PI != PE; ++PI) { |
1019 | if (SU->TopReadyCycle + PI->ReleaseAtCycle - 1 > LastCycle) |
1020 | LastCycle = SU->TopReadyCycle + PI->ReleaseAtCycle - 1; |
1021 | } |
1022 | } |
1023 | // Print the header with the cycles |
1024 | dbgs() << llvm::left_justify("Cycle" , HeaderColWidth); |
1025 | for (unsigned C = FirstCycle; C <= LastCycle; ++C) |
1026 | dbgs() << llvm::left_justify("| " + std::to_string(C), ColWidth); |
1027 | dbgs() << "|\n" ; |
1028 | |
1029 | for (MachineInstr &MI : *this) { |
1030 | SUnit *SU = getSUnit(&MI); |
1031 | if (!SU) { |
1032 | dbgs() << "Missing SUnit\n" ; |
1033 | continue; |
1034 | } |
1035 | std::string NodeName("SU(" ); |
1036 | NodeName += std::to_string(SU->NodeNum) + ")" ; |
1037 | dbgs() << llvm::left_justify(NodeName, HeaderColWidth); |
1038 | unsigned C = FirstCycle; |
1039 | for (; C <= LastCycle; ++C) { |
1040 | if (C == SU->TopReadyCycle) |
1041 | dbgs() << llvm::left_justify("| i" , ColWidth); |
1042 | else |
1043 | dbgs() << llvm::left_justify("|" , ColWidth); |
1044 | } |
1045 | dbgs() << "|\n" ; |
1046 | const MCSchedClassDesc *SC = getSchedClass(SU); |
1047 | |
1048 | SmallVector<MCWriteProcResEntry, 4> ResourcesIt( |
1049 | make_range(SchedModel.getWriteProcResBegin(SC), |
1050 | SchedModel.getWriteProcResEnd(SC))); |
1051 | |
1052 | if (MISchedSortResourcesInTrace) |
1053 | llvm::stable_sort(ResourcesIt, |
1054 | [](const MCWriteProcResEntry &LHS, |
1055 | const MCWriteProcResEntry &RHS) -> bool { |
1056 | return LHS.AcquireAtCycle < RHS.AcquireAtCycle || |
1057 | (LHS.AcquireAtCycle == RHS.AcquireAtCycle && |
1058 | LHS.ReleaseAtCycle < RHS.ReleaseAtCycle); |
1059 | }); |
1060 | for (const MCWriteProcResEntry &PI : ResourcesIt) { |
1061 | C = FirstCycle; |
1062 | const std::string ResName = |
1063 | SchedModel.getResourceName(PI.ProcResourceIdx); |
1064 | dbgs() << llvm::right_justify(ResName + " " , HeaderColWidth); |
1065 | for (; C < SU->TopReadyCycle + PI.AcquireAtCycle; ++C) { |
1066 | dbgs() << llvm::left_justify("|" , ColWidth); |
1067 | } |
1068 | for (unsigned I = 0, E = PI.ReleaseAtCycle - PI.AcquireAtCycle; I != E; |
1069 | ++I, ++C) |
1070 | dbgs() << llvm::left_justify("| x" , ColWidth); |
1071 | while (C++ <= LastCycle) |
1072 | dbgs() << llvm::left_justify("|" , ColWidth); |
1073 | // Place end char |
1074 | dbgs() << "| \n" ; |
1075 | } |
1076 | } |
1077 | } |
1078 | |
1079 | LLVM_DUMP_METHOD void ScheduleDAGMI::dumpScheduleTraceBottomUp() const { |
1080 | // Bail off when there is no schedule model to query. |
1081 | if (!SchedModel.hasInstrSchedModel()) |
1082 | return; |
1083 | |
1084 | // Nothing to show if there is no or just one instruction. |
1085 | if (BB->size() < 2) |
1086 | return; |
1087 | |
1088 | dbgs() << " * Schedule table (BottomUp):\n" ; |
1089 | dbgs() << scheduleTableLegend << "\n" ; |
1090 | |
1091 | const int FirstCycle = getSUnit(&*(std::begin(*this)))->BotReadyCycle; |
1092 | int LastCycle = getSUnit(&*(std::prev(std::end(*this))))->BotReadyCycle; |
1093 | for (MachineInstr &MI : *this) { |
1094 | SUnit *SU = getSUnit(&MI); |
1095 | if (!SU) |
1096 | continue; |
1097 | const MCSchedClassDesc *SC = getSchedClass(SU); |
1098 | for (TargetSchedModel::ProcResIter PI = SchedModel.getWriteProcResBegin(SC), |
1099 | PE = SchedModel.getWriteProcResEnd(SC); |
1100 | PI != PE; ++PI) { |
1101 | if ((int)SU->BotReadyCycle - PI->ReleaseAtCycle + 1 < LastCycle) |
1102 | LastCycle = (int)SU->BotReadyCycle - PI->ReleaseAtCycle + 1; |
1103 | } |
1104 | } |
1105 | // Print the header with the cycles |
1106 | dbgs() << llvm::left_justify("Cycle" , HeaderColWidth); |
1107 | for (int C = FirstCycle; C >= LastCycle; --C) |
1108 | dbgs() << llvm::left_justify("| " + std::to_string(C), ColWidth); |
1109 | dbgs() << "|\n" ; |
1110 | |
1111 | for (MachineInstr &MI : *this) { |
1112 | SUnit *SU = getSUnit(&MI); |
1113 | if (!SU) { |
1114 | dbgs() << "Missing SUnit\n" ; |
1115 | continue; |
1116 | } |
1117 | std::string NodeName("SU(" ); |
1118 | NodeName += std::to_string(SU->NodeNum) + ")" ; |
1119 | dbgs() << llvm::left_justify(NodeName, HeaderColWidth); |
1120 | int C = FirstCycle; |
1121 | for (; C >= LastCycle; --C) { |
1122 | if (C == (int)SU->BotReadyCycle) |
1123 | dbgs() << llvm::left_justify("| i" , ColWidth); |
1124 | else |
1125 | dbgs() << llvm::left_justify("|" , ColWidth); |
1126 | } |
1127 | dbgs() << "|\n" ; |
1128 | const MCSchedClassDesc *SC = getSchedClass(SU); |
1129 | SmallVector<MCWriteProcResEntry, 4> ResourcesIt( |
1130 | make_range(SchedModel.getWriteProcResBegin(SC), |
1131 | SchedModel.getWriteProcResEnd(SC))); |
1132 | |
1133 | if (MISchedSortResourcesInTrace) |
1134 | llvm::stable_sort(ResourcesIt, |
1135 | [](const MCWriteProcResEntry &LHS, |
1136 | const MCWriteProcResEntry &RHS) -> bool { |
1137 | return LHS.AcquireAtCycle < RHS.AcquireAtCycle || |
1138 | (LHS.AcquireAtCycle == RHS.AcquireAtCycle && |
1139 | LHS.ReleaseAtCycle < RHS.ReleaseAtCycle); |
1140 | }); |
1141 | for (const MCWriteProcResEntry &PI : ResourcesIt) { |
1142 | C = FirstCycle; |
1143 | const std::string ResName = |
1144 | SchedModel.getResourceName(PI.ProcResourceIdx); |
1145 | dbgs() << llvm::right_justify(ResName + " " , HeaderColWidth); |
1146 | for (; C > ((int)SU->BotReadyCycle - (int)PI.AcquireAtCycle); --C) { |
1147 | dbgs() << llvm::left_justify("|" , ColWidth); |
1148 | } |
1149 | for (unsigned I = 0, E = PI.ReleaseAtCycle - PI.AcquireAtCycle; I != E; |
1150 | ++I, --C) |
1151 | dbgs() << llvm::left_justify("| x" , ColWidth); |
1152 | while (C-- >= LastCycle) |
1153 | dbgs() << llvm::left_justify("|" , ColWidth); |
1154 | // Place end char |
1155 | dbgs() << "| \n" ; |
1156 | } |
1157 | } |
1158 | } |
1159 | #endif |
1160 | |
1161 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1162 | LLVM_DUMP_METHOD void ScheduleDAGMI::dumpSchedule() const { |
1163 | if (MISchedDumpScheduleTrace) { |
1164 | if (DumpDir == DumpDirection::TopDown) |
1165 | dumpScheduleTraceTopDown(); |
1166 | else if (DumpDir == DumpDirection::BottomUp) |
1167 | dumpScheduleTraceBottomUp(); |
1168 | else if (DumpDir == DumpDirection::Bidirectional) { |
1169 | dbgs() << "* Schedule table (Bidirectional): not implemented\n" ; |
1170 | } else { |
1171 | dbgs() << "* Schedule table: DumpDirection not set.\n" ; |
1172 | } |
1173 | } |
1174 | |
1175 | for (MachineInstr &MI : *this) { |
1176 | if (SUnit *SU = getSUnit(&MI)) |
1177 | dumpNode(*SU); |
1178 | else |
1179 | dbgs() << "Missing SUnit\n" ; |
1180 | } |
1181 | } |
1182 | #endif |
1183 | |
1184 | //===----------------------------------------------------------------------===// |
1185 | // ScheduleDAGMILive - Base class for MachineInstr scheduling with LiveIntervals |
1186 | // preservation. |
1187 | //===----------------------------------------------------------------------===// |
1188 | |
1189 | ScheduleDAGMILive::~ScheduleDAGMILive() { |
1190 | delete DFSResult; |
1191 | } |
1192 | |
1193 | void ScheduleDAGMILive::collectVRegUses(SUnit &SU) { |
1194 | const MachineInstr &MI = *SU.getInstr(); |
1195 | for (const MachineOperand &MO : MI.operands()) { |
1196 | if (!MO.isReg()) |
1197 | continue; |
1198 | if (!MO.readsReg()) |
1199 | continue; |
1200 | if (TrackLaneMasks && !MO.isUse()) |
1201 | continue; |
1202 | |
1203 | Register Reg = MO.getReg(); |
1204 | if (!Reg.isVirtual()) |
1205 | continue; |
1206 | |
1207 | // Ignore re-defs. |
1208 | if (TrackLaneMasks) { |
1209 | bool FoundDef = false; |
1210 | for (const MachineOperand &MO2 : MI.all_defs()) { |
1211 | if (MO2.getReg() == Reg && !MO2.isDead()) { |
1212 | FoundDef = true; |
1213 | break; |
1214 | } |
1215 | } |
1216 | if (FoundDef) |
1217 | continue; |
1218 | } |
1219 | |
1220 | // Record this local VReg use. |
1221 | VReg2SUnitMultiMap::iterator UI = VRegUses.find(Key: Reg); |
1222 | for (; UI != VRegUses.end(); ++UI) { |
1223 | if (UI->SU == &SU) |
1224 | break; |
1225 | } |
1226 | if (UI == VRegUses.end()) |
1227 | VRegUses.insert(Val: VReg2SUnit(Reg, LaneBitmask::getNone(), &SU)); |
1228 | } |
1229 | } |
1230 | |
1231 | /// enterRegion - Called back from MachineScheduler::runOnMachineFunction after |
1232 | /// crossing a scheduling boundary. [begin, end) includes all instructions in |
1233 | /// the region, including the boundary itself and single-instruction regions |
1234 | /// that don't get scheduled. |
1235 | void ScheduleDAGMILive::enterRegion(MachineBasicBlock *bb, |
1236 | MachineBasicBlock::iterator begin, |
1237 | MachineBasicBlock::iterator end, |
1238 | unsigned regioninstrs) |
1239 | { |
1240 | // ScheduleDAGMI initializes SchedImpl's per-region policy. |
1241 | ScheduleDAGMI::enterRegion(bb, begin, end, regioninstrs); |
1242 | |
1243 | // For convenience remember the end of the liveness region. |
1244 | LiveRegionEnd = (RegionEnd == bb->end()) ? RegionEnd : std::next(x: RegionEnd); |
1245 | |
1246 | SUPressureDiffs.clear(); |
1247 | |
1248 | ShouldTrackPressure = SchedImpl->shouldTrackPressure(); |
1249 | ShouldTrackLaneMasks = SchedImpl->shouldTrackLaneMasks(); |
1250 | |
1251 | assert((!ShouldTrackLaneMasks || ShouldTrackPressure) && |
1252 | "ShouldTrackLaneMasks requires ShouldTrackPressure" ); |
1253 | } |
1254 | |
1255 | // Setup the register pressure trackers for the top scheduled and bottom |
1256 | // scheduled regions. |
1257 | void ScheduleDAGMILive::initRegPressure() { |
1258 | VRegUses.clear(); |
1259 | VRegUses.setUniverse(MRI.getNumVirtRegs()); |
1260 | for (SUnit &SU : SUnits) |
1261 | collectVRegUses(SU); |
1262 | |
1263 | TopRPTracker.init(mf: &MF, rci: RegClassInfo, lis: LIS, mbb: BB, pos: RegionBegin, |
1264 | TrackLaneMasks: ShouldTrackLaneMasks, TrackUntiedDefs: false); |
1265 | BotRPTracker.init(mf: &MF, rci: RegClassInfo, lis: LIS, mbb: BB, pos: LiveRegionEnd, |
1266 | TrackLaneMasks: ShouldTrackLaneMasks, TrackUntiedDefs: false); |
1267 | |
1268 | // Close the RPTracker to finalize live ins. |
1269 | RPTracker.closeRegion(); |
1270 | |
1271 | LLVM_DEBUG(RPTracker.dump()); |
1272 | |
1273 | // Initialize the live ins and live outs. |
1274 | TopRPTracker.addLiveRegs(Regs: RPTracker.getPressure().LiveInRegs); |
1275 | BotRPTracker.addLiveRegs(Regs: RPTracker.getPressure().LiveOutRegs); |
1276 | |
1277 | // Close one end of the tracker so we can call |
1278 | // getMaxUpward/DownwardPressureDelta before advancing across any |
1279 | // instructions. This converts currently live regs into live ins/outs. |
1280 | TopRPTracker.closeTop(); |
1281 | BotRPTracker.closeBottom(); |
1282 | |
1283 | BotRPTracker.initLiveThru(RPTracker); |
1284 | if (!BotRPTracker.getLiveThru().empty()) { |
1285 | TopRPTracker.initLiveThru(PressureSet: BotRPTracker.getLiveThru()); |
1286 | LLVM_DEBUG(dbgs() << "Live Thru: " ; |
1287 | dumpRegSetPressure(BotRPTracker.getLiveThru(), TRI)); |
1288 | }; |
1289 | |
1290 | // For each live out vreg reduce the pressure change associated with other |
1291 | // uses of the same vreg below the live-out reaching def. |
1292 | updatePressureDiffs(LiveUses: RPTracker.getPressure().LiveOutRegs); |
1293 | |
1294 | // Account for liveness generated by the region boundary. |
1295 | if (LiveRegionEnd != RegionEnd) { |
1296 | SmallVector<RegisterMaskPair, 8> LiveUses; |
1297 | BotRPTracker.recede(LiveUses: &LiveUses); |
1298 | updatePressureDiffs(LiveUses); |
1299 | } |
1300 | |
1301 | LLVM_DEBUG(dbgs() << "Top Pressure:\n" ; |
1302 | dumpRegSetPressure(TopRPTracker.getRegSetPressureAtPos(), TRI); |
1303 | dbgs() << "Bottom Pressure:\n" ; |
1304 | dumpRegSetPressure(BotRPTracker.getRegSetPressureAtPos(), TRI);); |
1305 | |
1306 | assert((BotRPTracker.getPos() == RegionEnd || |
1307 | (RegionEnd->isDebugInstr() && |
1308 | BotRPTracker.getPos() == priorNonDebug(RegionEnd, RegionBegin))) && |
1309 | "Can't find the region bottom" ); |
1310 | |
1311 | // Cache the list of excess pressure sets in this region. This will also track |
1312 | // the max pressure in the scheduled code for these sets. |
1313 | RegionCriticalPSets.clear(); |
1314 | const std::vector<unsigned> &RegionPressure = |
1315 | RPTracker.getPressure().MaxSetPressure; |
1316 | for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) { |
1317 | unsigned Limit = RegClassInfo->getRegPressureSetLimit(Idx: i); |
1318 | if (RegionPressure[i] > Limit) { |
1319 | LLVM_DEBUG(dbgs() << TRI->getRegPressureSetName(i) << " Limit " << Limit |
1320 | << " Actual " << RegionPressure[i] << "\n" ); |
1321 | RegionCriticalPSets.push_back(x: PressureChange(i)); |
1322 | } |
1323 | } |
1324 | LLVM_DEBUG(dbgs() << "Excess PSets: " ; |
1325 | for (const PressureChange &RCPS |
1326 | : RegionCriticalPSets) dbgs() |
1327 | << TRI->getRegPressureSetName(RCPS.getPSet()) << " " ; |
1328 | dbgs() << "\n" ); |
1329 | } |
1330 | |
1331 | void ScheduleDAGMILive:: |
1332 | updateScheduledPressure(const SUnit *SU, |
1333 | const std::vector<unsigned> &NewMaxPressure) { |
1334 | const PressureDiff &PDiff = getPressureDiff(SU); |
1335 | unsigned CritIdx = 0, CritEnd = RegionCriticalPSets.size(); |
1336 | for (const PressureChange &PC : PDiff) { |
1337 | if (!PC.isValid()) |
1338 | break; |
1339 | unsigned ID = PC.getPSet(); |
1340 | while (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() < ID) |
1341 | ++CritIdx; |
1342 | if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) { |
1343 | if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc() |
1344 | && NewMaxPressure[ID] <= (unsigned)std::numeric_limits<int16_t>::max()) |
1345 | RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]); |
1346 | } |
1347 | unsigned Limit = RegClassInfo->getRegPressureSetLimit(Idx: ID); |
1348 | if (NewMaxPressure[ID] >= Limit - 2) { |
1349 | LLVM_DEBUG(dbgs() << " " << TRI->getRegPressureSetName(ID) << ": " |
1350 | << NewMaxPressure[ID] |
1351 | << ((NewMaxPressure[ID] > Limit) ? " > " : " <= " ) |
1352 | << Limit << "(+ " << BotRPTracker.getLiveThru()[ID] |
1353 | << " livethru)\n" ); |
1354 | } |
1355 | } |
1356 | } |
1357 | |
1358 | /// Update the PressureDiff array for liveness after scheduling this |
1359 | /// instruction. |
1360 | void ScheduleDAGMILive::updatePressureDiffs( |
1361 | ArrayRef<RegisterMaskPair> LiveUses) { |
1362 | for (const RegisterMaskPair &P : LiveUses) { |
1363 | Register Reg = P.RegUnit; |
1364 | /// FIXME: Currently assuming single-use physregs. |
1365 | if (!Reg.isVirtual()) |
1366 | continue; |
1367 | |
1368 | if (ShouldTrackLaneMasks) { |
1369 | // If the register has just become live then other uses won't change |
1370 | // this fact anymore => decrement pressure. |
1371 | // If the register has just become dead then other uses make it come |
1372 | // back to life => increment pressure. |
1373 | bool Decrement = P.LaneMask.any(); |
1374 | |
1375 | for (const VReg2SUnit &V2SU |
1376 | : make_range(x: VRegUses.find(Key: Reg), y: VRegUses.end())) { |
1377 | SUnit &SU = *V2SU.SU; |
1378 | if (SU.isScheduled || &SU == &ExitSU) |
1379 | continue; |
1380 | |
1381 | PressureDiff &PDiff = getPressureDiff(SU: &SU); |
1382 | PDiff.addPressureChange(RegUnit: Reg, IsDec: Decrement, MRI: &MRI); |
1383 | LLVM_DEBUG(dbgs() << " UpdateRegP: SU(" << SU.NodeNum << ") " |
1384 | << printReg(Reg, TRI) << ':' |
1385 | << PrintLaneMask(P.LaneMask) << ' ' << *SU.getInstr(); |
1386 | dbgs() << " to " ; PDiff.dump(*TRI);); |
1387 | } |
1388 | } else { |
1389 | assert(P.LaneMask.any()); |
1390 | LLVM_DEBUG(dbgs() << " LiveReg: " << printVRegOrUnit(Reg, TRI) << "\n" ); |
1391 | // This may be called before CurrentBottom has been initialized. However, |
1392 | // BotRPTracker must have a valid position. We want the value live into the |
1393 | // instruction or live out of the block, so ask for the previous |
1394 | // instruction's live-out. |
1395 | const LiveInterval &LI = LIS->getInterval(Reg); |
1396 | VNInfo *VNI; |
1397 | MachineBasicBlock::const_iterator I = |
1398 | nextIfDebug(I: BotRPTracker.getPos(), End: BB->end()); |
1399 | if (I == BB->end()) |
1400 | VNI = LI.getVNInfoBefore(Idx: LIS->getMBBEndIdx(mbb: BB)); |
1401 | else { |
1402 | LiveQueryResult LRQ = LI.Query(Idx: LIS->getInstructionIndex(Instr: *I)); |
1403 | VNI = LRQ.valueIn(); |
1404 | } |
1405 | // RegisterPressureTracker guarantees that readsReg is true for LiveUses. |
1406 | assert(VNI && "No live value at use." ); |
1407 | for (const VReg2SUnit &V2SU |
1408 | : make_range(x: VRegUses.find(Key: Reg), y: VRegUses.end())) { |
1409 | SUnit *SU = V2SU.SU; |
1410 | // If this use comes before the reaching def, it cannot be a last use, |
1411 | // so decrease its pressure change. |
1412 | if (!SU->isScheduled && SU != &ExitSU) { |
1413 | LiveQueryResult LRQ = |
1414 | LI.Query(Idx: LIS->getInstructionIndex(Instr: *SU->getInstr())); |
1415 | if (LRQ.valueIn() == VNI) { |
1416 | PressureDiff &PDiff = getPressureDiff(SU); |
1417 | PDiff.addPressureChange(RegUnit: Reg, IsDec: true, MRI: &MRI); |
1418 | LLVM_DEBUG(dbgs() << " UpdateRegP: SU(" << SU->NodeNum << ") " |
1419 | << *SU->getInstr(); |
1420 | dbgs() << " to " ; PDiff.dump(*TRI);); |
1421 | } |
1422 | } |
1423 | } |
1424 | } |
1425 | } |
1426 | } |
1427 | |
1428 | void ScheduleDAGMILive::dump() const { |
1429 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1430 | if (EntrySU.getInstr() != nullptr) |
1431 | dumpNodeAll(EntrySU); |
1432 | for (const SUnit &SU : SUnits) { |
1433 | dumpNodeAll(SU); |
1434 | if (ShouldTrackPressure) { |
1435 | dbgs() << " Pressure Diff : " ; |
1436 | getPressureDiff(&SU).dump(*TRI); |
1437 | } |
1438 | dbgs() << " Single Issue : " ; |
1439 | if (SchedModel.mustBeginGroup(SU.getInstr()) && |
1440 | SchedModel.mustEndGroup(SU.getInstr())) |
1441 | dbgs() << "true;" ; |
1442 | else |
1443 | dbgs() << "false;" ; |
1444 | dbgs() << '\n'; |
1445 | } |
1446 | if (ExitSU.getInstr() != nullptr) |
1447 | dumpNodeAll(ExitSU); |
1448 | #endif |
1449 | } |
1450 | |
1451 | /// schedule - Called back from MachineScheduler::runOnMachineFunction |
1452 | /// after setting up the current scheduling region. [RegionBegin, RegionEnd) |
1453 | /// only includes instructions that have DAG nodes, not scheduling boundaries. |
1454 | /// |
1455 | /// This is a skeletal driver, with all the functionality pushed into helpers, |
1456 | /// so that it can be easily extended by experimental schedulers. Generally, |
1457 | /// implementing MachineSchedStrategy should be sufficient to implement a new |
1458 | /// scheduling algorithm. However, if a scheduler further subclasses |
1459 | /// ScheduleDAGMILive then it will want to override this virtual method in order |
1460 | /// to update any specialized state. |
1461 | void ScheduleDAGMILive::schedule() { |
1462 | LLVM_DEBUG(dbgs() << "ScheduleDAGMILive::schedule starting\n" ); |
1463 | LLVM_DEBUG(SchedImpl->dumpPolicy()); |
1464 | buildDAGWithRegPressure(); |
1465 | |
1466 | postProcessDAG(); |
1467 | |
1468 | SmallVector<SUnit*, 8> TopRoots, BotRoots; |
1469 | findRootsAndBiasEdges(TopRoots, BotRoots); |
1470 | |
1471 | // Initialize the strategy before modifying the DAG. |
1472 | // This may initialize a DFSResult to be used for queue priority. |
1473 | SchedImpl->initialize(DAG: this); |
1474 | |
1475 | LLVM_DEBUG(dump()); |
1476 | if (PrintDAGs) dump(); |
1477 | if (ViewMISchedDAGs) viewGraph(); |
1478 | |
1479 | // Initialize ready queues now that the DAG and priority data are finalized. |
1480 | initQueues(TopRoots, BotRoots); |
1481 | |
1482 | bool IsTopNode = false; |
1483 | while (true) { |
1484 | LLVM_DEBUG(dbgs() << "** ScheduleDAGMILive::schedule picking next node\n" ); |
1485 | SUnit *SU = SchedImpl->pickNode(IsTopNode); |
1486 | if (!SU) break; |
1487 | |
1488 | assert(!SU->isScheduled && "Node already scheduled" ); |
1489 | if (!checkSchedLimit()) |
1490 | break; |
1491 | |
1492 | scheduleMI(SU, IsTopNode); |
1493 | |
1494 | if (DFSResult) { |
1495 | unsigned SubtreeID = DFSResult->getSubtreeID(SU); |
1496 | if (!ScheduledTrees.test(Idx: SubtreeID)) { |
1497 | ScheduledTrees.set(SubtreeID); |
1498 | DFSResult->scheduleTree(SubtreeID); |
1499 | SchedImpl->scheduleTree(SubtreeID); |
1500 | } |
1501 | } |
1502 | |
1503 | // Notify the scheduling strategy after updating the DAG. |
1504 | SchedImpl->schedNode(SU, IsTopNode); |
1505 | |
1506 | updateQueues(SU, IsTopNode); |
1507 | } |
1508 | assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone." ); |
1509 | |
1510 | placeDebugValues(); |
1511 | |
1512 | LLVM_DEBUG({ |
1513 | dbgs() << "*** Final schedule for " |
1514 | << printMBBReference(*begin()->getParent()) << " ***\n" ; |
1515 | dumpSchedule(); |
1516 | dbgs() << '\n'; |
1517 | }); |
1518 | } |
1519 | |
1520 | /// Build the DAG and setup three register pressure trackers. |
1521 | void ScheduleDAGMILive::buildDAGWithRegPressure() { |
1522 | if (!ShouldTrackPressure) { |
1523 | RPTracker.reset(); |
1524 | RegionCriticalPSets.clear(); |
1525 | buildSchedGraph(AA); |
1526 | return; |
1527 | } |
1528 | |
1529 | // Initialize the register pressure tracker used by buildSchedGraph. |
1530 | RPTracker.init(mf: &MF, rci: RegClassInfo, lis: LIS, mbb: BB, pos: LiveRegionEnd, |
1531 | TrackLaneMasks: ShouldTrackLaneMasks, /*TrackUntiedDefs=*/true); |
1532 | |
1533 | // Account for liveness generate by the region boundary. |
1534 | if (LiveRegionEnd != RegionEnd) |
1535 | RPTracker.recede(); |
1536 | |
1537 | // Build the DAG, and compute current register pressure. |
1538 | buildSchedGraph(AA, RPTracker: &RPTracker, PDiffs: &SUPressureDiffs, LIS, TrackLaneMasks: ShouldTrackLaneMasks); |
1539 | |
1540 | // Initialize top/bottom trackers after computing region pressure. |
1541 | initRegPressure(); |
1542 | } |
1543 | |
1544 | void ScheduleDAGMILive::computeDFSResult() { |
1545 | if (!DFSResult) |
1546 | DFSResult = new SchedDFSResult(/*BottomU*/true, MinSubtreeSize); |
1547 | DFSResult->clear(); |
1548 | ScheduledTrees.clear(); |
1549 | DFSResult->resize(NumSUnits: SUnits.size()); |
1550 | DFSResult->compute(SUnits); |
1551 | ScheduledTrees.resize(N: DFSResult->getNumSubtrees()); |
1552 | } |
1553 | |
1554 | /// Compute the max cyclic critical path through the DAG. The scheduling DAG |
1555 | /// only provides the critical path for single block loops. To handle loops that |
1556 | /// span blocks, we could use the vreg path latencies provided by |
1557 | /// MachineTraceMetrics instead. However, MachineTraceMetrics is not currently |
1558 | /// available for use in the scheduler. |
1559 | /// |
1560 | /// The cyclic path estimation identifies a def-use pair that crosses the back |
1561 | /// edge and considers the depth and height of the nodes. For example, consider |
1562 | /// the following instruction sequence where each instruction has unit latency |
1563 | /// and defines an eponymous virtual register: |
1564 | /// |
1565 | /// a->b(a,c)->c(b)->d(c)->exit |
1566 | /// |
1567 | /// The cyclic critical path is a two cycles: b->c->b |
1568 | /// The acyclic critical path is four cycles: a->b->c->d->exit |
1569 | /// LiveOutHeight = height(c) = len(c->d->exit) = 2 |
1570 | /// LiveOutDepth = depth(c) + 1 = len(a->b->c) + 1 = 3 |
1571 | /// LiveInHeight = height(b) + 1 = len(b->c->d->exit) + 1 = 4 |
1572 | /// LiveInDepth = depth(b) = len(a->b) = 1 |
1573 | /// |
1574 | /// LiveOutDepth - LiveInDepth = 3 - 1 = 2 |
1575 | /// LiveInHeight - LiveOutHeight = 4 - 2 = 2 |
1576 | /// CyclicCriticalPath = min(2, 2) = 2 |
1577 | /// |
1578 | /// This could be relevant to PostRA scheduling, but is currently implemented |
1579 | /// assuming LiveIntervals. |
1580 | unsigned ScheduleDAGMILive::computeCyclicCriticalPath() { |
1581 | // This only applies to single block loop. |
1582 | if (!BB->isSuccessor(MBB: BB)) |
1583 | return 0; |
1584 | |
1585 | unsigned MaxCyclicLatency = 0; |
1586 | // Visit each live out vreg def to find def/use pairs that cross iterations. |
1587 | for (const RegisterMaskPair &P : RPTracker.getPressure().LiveOutRegs) { |
1588 | Register Reg = P.RegUnit; |
1589 | if (!Reg.isVirtual()) |
1590 | continue; |
1591 | const LiveInterval &LI = LIS->getInterval(Reg); |
1592 | const VNInfo *DefVNI = LI.getVNInfoBefore(Idx: LIS->getMBBEndIdx(mbb: BB)); |
1593 | if (!DefVNI) |
1594 | continue; |
1595 | |
1596 | MachineInstr *DefMI = LIS->getInstructionFromIndex(index: DefVNI->def); |
1597 | const SUnit *DefSU = getSUnit(MI: DefMI); |
1598 | if (!DefSU) |
1599 | continue; |
1600 | |
1601 | unsigned LiveOutHeight = DefSU->getHeight(); |
1602 | unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency; |
1603 | // Visit all local users of the vreg def. |
1604 | for (const VReg2SUnit &V2SU |
1605 | : make_range(x: VRegUses.find(Key: Reg), y: VRegUses.end())) { |
1606 | SUnit *SU = V2SU.SU; |
1607 | if (SU == &ExitSU) |
1608 | continue; |
1609 | |
1610 | // Only consider uses of the phi. |
1611 | LiveQueryResult LRQ = LI.Query(Idx: LIS->getInstructionIndex(Instr: *SU->getInstr())); |
1612 | if (!LRQ.valueIn()->isPHIDef()) |
1613 | continue; |
1614 | |
1615 | // Assume that a path spanning two iterations is a cycle, which could |
1616 | // overestimate in strange cases. This allows cyclic latency to be |
1617 | // estimated as the minimum slack of the vreg's depth or height. |
1618 | unsigned CyclicLatency = 0; |
1619 | if (LiveOutDepth > SU->getDepth()) |
1620 | CyclicLatency = LiveOutDepth - SU->getDepth(); |
1621 | |
1622 | unsigned LiveInHeight = SU->getHeight() + DefSU->Latency; |
1623 | if (LiveInHeight > LiveOutHeight) { |
1624 | if (LiveInHeight - LiveOutHeight < CyclicLatency) |
1625 | CyclicLatency = LiveInHeight - LiveOutHeight; |
1626 | } else |
1627 | CyclicLatency = 0; |
1628 | |
1629 | LLVM_DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU(" |
1630 | << SU->NodeNum << ") = " << CyclicLatency << "c\n" ); |
1631 | if (CyclicLatency > MaxCyclicLatency) |
1632 | MaxCyclicLatency = CyclicLatency; |
1633 | } |
1634 | } |
1635 | LLVM_DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n" ); |
1636 | return MaxCyclicLatency; |
1637 | } |
1638 | |
1639 | /// Release ExitSU predecessors and setup scheduler queues. Re-position |
1640 | /// the Top RP tracker in case the region beginning has changed. |
1641 | void ScheduleDAGMILive::initQueues(ArrayRef<SUnit*> TopRoots, |
1642 | ArrayRef<SUnit*> BotRoots) { |
1643 | ScheduleDAGMI::initQueues(TopRoots, BotRoots); |
1644 | if (ShouldTrackPressure) { |
1645 | assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker" ); |
1646 | TopRPTracker.setPos(CurrentTop); |
1647 | } |
1648 | } |
1649 | |
1650 | /// Move an instruction and update register pressure. |
1651 | void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) { |
1652 | // Move the instruction to its new location in the instruction stream. |
1653 | MachineInstr *MI = SU->getInstr(); |
1654 | |
1655 | if (IsTopNode) { |
1656 | assert(SU->isTopReady() && "node still has unscheduled dependencies" ); |
1657 | if (&*CurrentTop == MI) |
1658 | CurrentTop = nextIfDebug(I: ++CurrentTop, End: CurrentBottom); |
1659 | else { |
1660 | moveInstruction(MI, InsertPos: CurrentTop); |
1661 | TopRPTracker.setPos(MI); |
1662 | } |
1663 | |
1664 | if (ShouldTrackPressure) { |
1665 | // Update top scheduled pressure. |
1666 | RegisterOperands RegOpers; |
1667 | RegOpers.collect(MI: *MI, TRI: *TRI, MRI, TrackLaneMasks: ShouldTrackLaneMasks, |
1668 | /*IgnoreDead=*/false); |
1669 | if (ShouldTrackLaneMasks) { |
1670 | // Adjust liveness and add missing dead+read-undef flags. |
1671 | SlotIndex SlotIdx = LIS->getInstructionIndex(Instr: *MI).getRegSlot(); |
1672 | RegOpers.adjustLaneLiveness(LIS: *LIS, MRI, Pos: SlotIdx, AddFlagsMI: MI); |
1673 | } else { |
1674 | // Adjust for missing dead-def flags. |
1675 | RegOpers.detectDeadDefs(MI: *MI, LIS: *LIS); |
1676 | } |
1677 | |
1678 | TopRPTracker.advance(RegOpers); |
1679 | assert(TopRPTracker.getPos() == CurrentTop && "out of sync" ); |
1680 | LLVM_DEBUG(dbgs() << "Top Pressure:\n" ; dumpRegSetPressure( |
1681 | TopRPTracker.getRegSetPressureAtPos(), TRI);); |
1682 | |
1683 | updateScheduledPressure(SU, NewMaxPressure: TopRPTracker.getPressure().MaxSetPressure); |
1684 | } |
1685 | } else { |
1686 | assert(SU->isBottomReady() && "node still has unscheduled dependencies" ); |
1687 | MachineBasicBlock::iterator priorII = |
1688 | priorNonDebug(I: CurrentBottom, Beg: CurrentTop); |
1689 | if (&*priorII == MI) |
1690 | CurrentBottom = priorII; |
1691 | else { |
1692 | if (&*CurrentTop == MI) { |
1693 | CurrentTop = nextIfDebug(I: ++CurrentTop, End: priorII); |
1694 | TopRPTracker.setPos(CurrentTop); |
1695 | } |
1696 | moveInstruction(MI, InsertPos: CurrentBottom); |
1697 | CurrentBottom = MI; |
1698 | BotRPTracker.setPos(CurrentBottom); |
1699 | } |
1700 | if (ShouldTrackPressure) { |
1701 | RegisterOperands RegOpers; |
1702 | RegOpers.collect(MI: *MI, TRI: *TRI, MRI, TrackLaneMasks: ShouldTrackLaneMasks, |
1703 | /*IgnoreDead=*/false); |
1704 | if (ShouldTrackLaneMasks) { |
1705 | // Adjust liveness and add missing dead+read-undef flags. |
1706 | SlotIndex SlotIdx = LIS->getInstructionIndex(Instr: *MI).getRegSlot(); |
1707 | RegOpers.adjustLaneLiveness(LIS: *LIS, MRI, Pos: SlotIdx, AddFlagsMI: MI); |
1708 | } else { |
1709 | // Adjust for missing dead-def flags. |
1710 | RegOpers.detectDeadDefs(MI: *MI, LIS: *LIS); |
1711 | } |
1712 | |
1713 | if (BotRPTracker.getPos() != CurrentBottom) |
1714 | BotRPTracker.recedeSkipDebugValues(); |
1715 | SmallVector<RegisterMaskPair, 8> LiveUses; |
1716 | BotRPTracker.recede(RegOpers, LiveUses: &LiveUses); |
1717 | assert(BotRPTracker.getPos() == CurrentBottom && "out of sync" ); |
1718 | LLVM_DEBUG(dbgs() << "Bottom Pressure:\n" ; dumpRegSetPressure( |
1719 | BotRPTracker.getRegSetPressureAtPos(), TRI);); |
1720 | |
1721 | updateScheduledPressure(SU, NewMaxPressure: BotRPTracker.getPressure().MaxSetPressure); |
1722 | updatePressureDiffs(LiveUses); |
1723 | } |
1724 | } |
1725 | } |
1726 | |
1727 | //===----------------------------------------------------------------------===// |
1728 | // BaseMemOpClusterMutation - DAG post-processing to cluster loads or stores. |
1729 | //===----------------------------------------------------------------------===// |
1730 | |
1731 | namespace { |
1732 | |
1733 | /// Post-process the DAG to create cluster edges between neighboring |
1734 | /// loads or between neighboring stores. |
1735 | class BaseMemOpClusterMutation : public ScheduleDAGMutation { |
1736 | struct MemOpInfo { |
1737 | SUnit *SU; |
1738 | SmallVector<const MachineOperand *, 4> BaseOps; |
1739 | int64_t Offset; |
1740 | LocationSize Width; |
1741 | bool OffsetIsScalable; |
1742 | |
1743 | MemOpInfo(SUnit *SU, ArrayRef<const MachineOperand *> BaseOps, |
1744 | int64_t Offset, bool OffsetIsScalable, LocationSize Width) |
1745 | : SU(SU), BaseOps(BaseOps.begin(), BaseOps.end()), Offset(Offset), |
1746 | Width(Width), OffsetIsScalable(OffsetIsScalable) {} |
1747 | |
1748 | static bool Compare(const MachineOperand *const &A, |
1749 | const MachineOperand *const &B) { |
1750 | if (A->getType() != B->getType()) |
1751 | return A->getType() < B->getType(); |
1752 | if (A->isReg()) |
1753 | return A->getReg() < B->getReg(); |
1754 | if (A->isFI()) { |
1755 | const MachineFunction &MF = *A->getParent()->getParent()->getParent(); |
1756 | const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering(); |
1757 | bool StackGrowsDown = TFI.getStackGrowthDirection() == |
1758 | TargetFrameLowering::StackGrowsDown; |
1759 | return StackGrowsDown ? A->getIndex() > B->getIndex() |
1760 | : A->getIndex() < B->getIndex(); |
1761 | } |
1762 | |
1763 | llvm_unreachable("MemOpClusterMutation only supports register or frame " |
1764 | "index bases." ); |
1765 | } |
1766 | |
1767 | bool operator<(const MemOpInfo &RHS) const { |
1768 | // FIXME: Don't compare everything twice. Maybe use C++20 three way |
1769 | // comparison instead when it's available. |
1770 | if (std::lexicographical_compare(first1: BaseOps.begin(), last1: BaseOps.end(), |
1771 | first2: RHS.BaseOps.begin(), last2: RHS.BaseOps.end(), |
1772 | comp: Compare)) |
1773 | return true; |
1774 | if (std::lexicographical_compare(first1: RHS.BaseOps.begin(), last1: RHS.BaseOps.end(), |
1775 | first2: BaseOps.begin(), last2: BaseOps.end(), comp: Compare)) |
1776 | return false; |
1777 | if (Offset != RHS.Offset) |
1778 | return Offset < RHS.Offset; |
1779 | return SU->NodeNum < RHS.SU->NodeNum; |
1780 | } |
1781 | }; |
1782 | |
1783 | const TargetInstrInfo *TII; |
1784 | const TargetRegisterInfo *TRI; |
1785 | bool IsLoad; |
1786 | bool ReorderWhileClustering; |
1787 | |
1788 | public: |
1789 | BaseMemOpClusterMutation(const TargetInstrInfo *tii, |
1790 | const TargetRegisterInfo *tri, bool IsLoad, |
1791 | bool ReorderWhileClustering) |
1792 | : TII(tii), TRI(tri), IsLoad(IsLoad), |
1793 | ReorderWhileClustering(ReorderWhileClustering) {} |
1794 | |
1795 | void apply(ScheduleDAGInstrs *DAGInstrs) override; |
1796 | |
1797 | protected: |
1798 | void clusterNeighboringMemOps(ArrayRef<MemOpInfo> MemOps, bool FastCluster, |
1799 | ScheduleDAGInstrs *DAG); |
1800 | void collectMemOpRecords(std::vector<SUnit> &SUnits, |
1801 | SmallVectorImpl<MemOpInfo> &MemOpRecords); |
1802 | bool groupMemOps(ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG, |
1803 | DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups); |
1804 | }; |
1805 | |
1806 | class StoreClusterMutation : public BaseMemOpClusterMutation { |
1807 | public: |
1808 | StoreClusterMutation(const TargetInstrInfo *tii, |
1809 | const TargetRegisterInfo *tri, |
1810 | bool ReorderWhileClustering) |
1811 | : BaseMemOpClusterMutation(tii, tri, false, ReorderWhileClustering) {} |
1812 | }; |
1813 | |
1814 | class LoadClusterMutation : public BaseMemOpClusterMutation { |
1815 | public: |
1816 | LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri, |
1817 | bool ReorderWhileClustering) |
1818 | : BaseMemOpClusterMutation(tii, tri, true, ReorderWhileClustering) {} |
1819 | }; |
1820 | |
1821 | } // end anonymous namespace |
1822 | |
1823 | namespace llvm { |
1824 | |
1825 | std::unique_ptr<ScheduleDAGMutation> |
1826 | createLoadClusterDAGMutation(const TargetInstrInfo *TII, |
1827 | const TargetRegisterInfo *TRI, |
1828 | bool ReorderWhileClustering) { |
1829 | return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>( |
1830 | args&: TII, args&: TRI, args&: ReorderWhileClustering) |
1831 | : nullptr; |
1832 | } |
1833 | |
1834 | std::unique_ptr<ScheduleDAGMutation> |
1835 | createStoreClusterDAGMutation(const TargetInstrInfo *TII, |
1836 | const TargetRegisterInfo *TRI, |
1837 | bool ReorderWhileClustering) { |
1838 | return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>( |
1839 | args&: TII, args&: TRI, args&: ReorderWhileClustering) |
1840 | : nullptr; |
1841 | } |
1842 | |
1843 | } // end namespace llvm |
1844 | |
1845 | // Sorting all the loads/stores first, then for each load/store, checking the |
1846 | // following load/store one by one, until reach the first non-dependent one and |
1847 | // call target hook to see if they can cluster. |
1848 | // If FastCluster is enabled, we assume that, all the loads/stores have been |
1849 | // preprocessed and now, they didn't have dependencies on each other. |
1850 | void BaseMemOpClusterMutation::clusterNeighboringMemOps( |
1851 | ArrayRef<MemOpInfo> MemOpRecords, bool FastCluster, |
1852 | ScheduleDAGInstrs *DAG) { |
1853 | // Keep track of the current cluster length and bytes for each SUnit. |
1854 | DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo; |
1855 | |
1856 | // At this point, `MemOpRecords` array must hold atleast two mem ops. Try to |
1857 | // cluster mem ops collected within `MemOpRecords` array. |
1858 | for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) { |
1859 | // Decision to cluster mem ops is taken based on target dependent logic |
1860 | auto MemOpa = MemOpRecords[Idx]; |
1861 | |
1862 | // Seek for the next load/store to do the cluster. |
1863 | unsigned NextIdx = Idx + 1; |
1864 | for (; NextIdx < End; ++NextIdx) |
1865 | // Skip if MemOpb has been clustered already or has dependency with |
1866 | // MemOpa. |
1867 | if (!SUnit2ClusterInfo.count(Val: MemOpRecords[NextIdx].SU->NodeNum) && |
1868 | (FastCluster || |
1869 | (!DAG->IsReachable(SU: MemOpRecords[NextIdx].SU, TargetSU: MemOpa.SU) && |
1870 | !DAG->IsReachable(SU: MemOpa.SU, TargetSU: MemOpRecords[NextIdx].SU)))) |
1871 | break; |
1872 | if (NextIdx == End) |
1873 | continue; |
1874 | |
1875 | auto MemOpb = MemOpRecords[NextIdx]; |
1876 | unsigned ClusterLength = 2; |
1877 | unsigned CurrentClusterBytes = MemOpa.Width.getValue().getKnownMinValue() + |
1878 | MemOpb.Width.getValue().getKnownMinValue(); |
1879 | if (SUnit2ClusterInfo.count(Val: MemOpa.SU->NodeNum)) { |
1880 | ClusterLength = SUnit2ClusterInfo[MemOpa.SU->NodeNum].first + 1; |
1881 | CurrentClusterBytes = SUnit2ClusterInfo[MemOpa.SU->NodeNum].second + |
1882 | MemOpb.Width.getValue().getKnownMinValue(); |
1883 | } |
1884 | |
1885 | if (!TII->shouldClusterMemOps(BaseOps1: MemOpa.BaseOps, Offset1: MemOpa.Offset, |
1886 | OffsetIsScalable1: MemOpa.OffsetIsScalable, BaseOps2: MemOpb.BaseOps, |
1887 | Offset2: MemOpb.Offset, OffsetIsScalable2: MemOpb.OffsetIsScalable, |
1888 | ClusterSize: ClusterLength, NumBytes: CurrentClusterBytes)) |
1889 | continue; |
1890 | |
1891 | SUnit *SUa = MemOpa.SU; |
1892 | SUnit *SUb = MemOpb.SU; |
1893 | if (!ReorderWhileClustering && SUa->NodeNum > SUb->NodeNum) |
1894 | std::swap(a&: SUa, b&: SUb); |
1895 | |
1896 | // FIXME: Is this check really required? |
1897 | if (!DAG->addEdge(SuccSU: SUb, PredDep: SDep(SUa, SDep::Cluster))) |
1898 | continue; |
1899 | |
1900 | LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU(" |
1901 | << SUb->NodeNum << ")\n" ); |
1902 | ++NumClustered; |
1903 | |
1904 | if (IsLoad) { |
1905 | // Copy successor edges from SUa to SUb. Interleaving computation |
1906 | // dependent on SUa can prevent load combining due to register reuse. |
1907 | // Predecessor edges do not need to be copied from SUb to SUa since |
1908 | // nearby loads should have effectively the same inputs. |
1909 | for (const SDep &Succ : SUa->Succs) { |
1910 | if (Succ.getSUnit() == SUb) |
1911 | continue; |
1912 | LLVM_DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum |
1913 | << ")\n" ); |
1914 | DAG->addEdge(SuccSU: Succ.getSUnit(), PredDep: SDep(SUb, SDep::Artificial)); |
1915 | } |
1916 | } else { |
1917 | // Copy predecessor edges from SUb to SUa to avoid the SUnits that |
1918 | // SUb dependent on scheduled in-between SUb and SUa. Successor edges |
1919 | // do not need to be copied from SUa to SUb since no one will depend |
1920 | // on stores. |
1921 | // Notice that, we don't need to care about the memory dependency as |
1922 | // we won't try to cluster them if they have any memory dependency. |
1923 | for (const SDep &Pred : SUb->Preds) { |
1924 | if (Pred.getSUnit() == SUa) |
1925 | continue; |
1926 | LLVM_DEBUG(dbgs() << " Copy Pred SU(" << Pred.getSUnit()->NodeNum |
1927 | << ")\n" ); |
1928 | DAG->addEdge(SuccSU: SUa, PredDep: SDep(Pred.getSUnit(), SDep::Artificial)); |
1929 | } |
1930 | } |
1931 | |
1932 | SUnit2ClusterInfo[MemOpb.SU->NodeNum] = {ClusterLength, |
1933 | CurrentClusterBytes}; |
1934 | |
1935 | LLVM_DEBUG(dbgs() << " Curr cluster length: " << ClusterLength |
1936 | << ", Curr cluster bytes: " << CurrentClusterBytes |
1937 | << "\n" ); |
1938 | } |
1939 | } |
1940 | |
1941 | void BaseMemOpClusterMutation::collectMemOpRecords( |
1942 | std::vector<SUnit> &SUnits, SmallVectorImpl<MemOpInfo> &MemOpRecords) { |
1943 | for (auto &SU : SUnits) { |
1944 | if ((IsLoad && !SU.getInstr()->mayLoad()) || |
1945 | (!IsLoad && !SU.getInstr()->mayStore())) |
1946 | continue; |
1947 | |
1948 | const MachineInstr &MI = *SU.getInstr(); |
1949 | SmallVector<const MachineOperand *, 4> BaseOps; |
1950 | int64_t Offset; |
1951 | bool OffsetIsScalable; |
1952 | LocationSize Width = 0; |
1953 | if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset, |
1954 | OffsetIsScalable, Width, TRI)) { |
1955 | MemOpRecords.push_back( |
1956 | Elt: MemOpInfo(&SU, BaseOps, Offset, OffsetIsScalable, Width)); |
1957 | |
1958 | LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: " |
1959 | << Offset << ", OffsetIsScalable: " << OffsetIsScalable |
1960 | << ", Width: " << Width << "\n" ); |
1961 | } |
1962 | #ifndef NDEBUG |
1963 | for (const auto *Op : BaseOps) |
1964 | assert(Op); |
1965 | #endif |
1966 | } |
1967 | } |
1968 | |
1969 | bool BaseMemOpClusterMutation::groupMemOps( |
1970 | ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG, |
1971 | DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups) { |
1972 | bool FastCluster = |
1973 | ForceFastCluster || |
1974 | MemOps.size() * DAG->SUnits.size() / 1000 > FastClusterThreshold; |
1975 | |
1976 | for (const auto &MemOp : MemOps) { |
1977 | unsigned ChainPredID = DAG->SUnits.size(); |
1978 | if (FastCluster) { |
1979 | for (const SDep &Pred : MemOp.SU->Preds) { |
1980 | // We only want to cluster the mem ops that have the same ctrl(non-data) |
1981 | // pred so that they didn't have ctrl dependency for each other. But for |
1982 | // store instrs, we can still cluster them if the pred is load instr. |
1983 | if ((Pred.isCtrl() && |
1984 | (IsLoad || |
1985 | (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) && |
1986 | !Pred.isArtificial()) { |
1987 | ChainPredID = Pred.getSUnit()->NodeNum; |
1988 | break; |
1989 | } |
1990 | } |
1991 | } else |
1992 | ChainPredID = 0; |
1993 | |
1994 | Groups[ChainPredID].push_back(Elt: MemOp); |
1995 | } |
1996 | return FastCluster; |
1997 | } |
1998 | |
1999 | /// Callback from DAG postProcessing to create cluster edges for loads/stores. |
2000 | void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) { |
2001 | // Collect all the clusterable loads/stores |
2002 | SmallVector<MemOpInfo, 32> MemOpRecords; |
2003 | collectMemOpRecords(SUnits&: DAG->SUnits, MemOpRecords); |
2004 | |
2005 | if (MemOpRecords.size() < 2) |
2006 | return; |
2007 | |
2008 | // Put the loads/stores without dependency into the same group with some |
2009 | // heuristic if the DAG is too complex to avoid compiling time blow up. |
2010 | // Notice that, some fusion pair could be lost with this. |
2011 | DenseMap<unsigned, SmallVector<MemOpInfo, 32>> Groups; |
2012 | bool FastCluster = groupMemOps(MemOps: MemOpRecords, DAG, Groups); |
2013 | |
2014 | for (auto &Group : Groups) { |
2015 | // Sorting the loads/stores, so that, we can stop the cluster as early as |
2016 | // possible. |
2017 | llvm::sort(C&: Group.second); |
2018 | |
2019 | // Trying to cluster all the neighboring loads/stores. |
2020 | clusterNeighboringMemOps(MemOpRecords: Group.second, FastCluster, DAG); |
2021 | } |
2022 | } |
2023 | |
2024 | //===----------------------------------------------------------------------===// |
2025 | // CopyConstrain - DAG post-processing to encourage copy elimination. |
2026 | //===----------------------------------------------------------------------===// |
2027 | |
2028 | namespace { |
2029 | |
2030 | /// Post-process the DAG to create weak edges from all uses of a copy to |
2031 | /// the one use that defines the copy's source vreg, most likely an induction |
2032 | /// variable increment. |
2033 | class CopyConstrain : public ScheduleDAGMutation { |
2034 | // Transient state. |
2035 | SlotIndex RegionBeginIdx; |
2036 | |
2037 | // RegionEndIdx is the slot index of the last non-debug instruction in the |
2038 | // scheduling region. So we may have RegionBeginIdx == RegionEndIdx. |
2039 | SlotIndex RegionEndIdx; |
2040 | |
2041 | public: |
2042 | CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {} |
2043 | |
2044 | void apply(ScheduleDAGInstrs *DAGInstrs) override; |
2045 | |
2046 | protected: |
2047 | void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG); |
2048 | }; |
2049 | |
2050 | } // end anonymous namespace |
2051 | |
2052 | namespace llvm { |
2053 | |
2054 | std::unique_ptr<ScheduleDAGMutation> |
2055 | createCopyConstrainDAGMutation(const TargetInstrInfo *TII, |
2056 | const TargetRegisterInfo *TRI) { |
2057 | return std::make_unique<CopyConstrain>(args&: TII, args&: TRI); |
2058 | } |
2059 | |
2060 | } // end namespace llvm |
2061 | |
2062 | /// constrainLocalCopy handles two possibilities: |
2063 | /// 1) Local src: |
2064 | /// I0: = dst |
2065 | /// I1: src = ... |
2066 | /// I2: = dst |
2067 | /// I3: dst = src (copy) |
2068 | /// (create pred->succ edges I0->I1, I2->I1) |
2069 | /// |
2070 | /// 2) Local copy: |
2071 | /// I0: dst = src (copy) |
2072 | /// I1: = dst |
2073 | /// I2: src = ... |
2074 | /// I3: = dst |
2075 | /// (create pred->succ edges I1->I2, I3->I2) |
2076 | /// |
2077 | /// Although the MachineScheduler is currently constrained to single blocks, |
2078 | /// this algorithm should handle extended blocks. An EBB is a set of |
2079 | /// contiguously numbered blocks such that the previous block in the EBB is |
2080 | /// always the single predecessor. |
2081 | void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) { |
2082 | LiveIntervals *LIS = DAG->getLIS(); |
2083 | MachineInstr *Copy = CopySU->getInstr(); |
2084 | |
2085 | // Check for pure vreg copies. |
2086 | const MachineOperand &SrcOp = Copy->getOperand(i: 1); |
2087 | Register SrcReg = SrcOp.getReg(); |
2088 | if (!SrcReg.isVirtual() || !SrcOp.readsReg()) |
2089 | return; |
2090 | |
2091 | const MachineOperand &DstOp = Copy->getOperand(i: 0); |
2092 | Register DstReg = DstOp.getReg(); |
2093 | if (!DstReg.isVirtual() || DstOp.isDead()) |
2094 | return; |
2095 | |
2096 | // Check if either the dest or source is local. If it's live across a back |
2097 | // edge, it's not local. Note that if both vregs are live across the back |
2098 | // edge, we cannot successfully contrain the copy without cyclic scheduling. |
2099 | // If both the copy's source and dest are local live intervals, then we |
2100 | // should treat the dest as the global for the purpose of adding |
2101 | // constraints. This adds edges from source's other uses to the copy. |
2102 | unsigned LocalReg = SrcReg; |
2103 | unsigned GlobalReg = DstReg; |
2104 | LiveInterval *LocalLI = &LIS->getInterval(Reg: LocalReg); |
2105 | if (!LocalLI->isLocal(Start: RegionBeginIdx, End: RegionEndIdx)) { |
2106 | LocalReg = DstReg; |
2107 | GlobalReg = SrcReg; |
2108 | LocalLI = &LIS->getInterval(Reg: LocalReg); |
2109 | if (!LocalLI->isLocal(Start: RegionBeginIdx, End: RegionEndIdx)) |
2110 | return; |
2111 | } |
2112 | LiveInterval *GlobalLI = &LIS->getInterval(Reg: GlobalReg); |
2113 | |
2114 | // Find the global segment after the start of the local LI. |
2115 | LiveInterval::iterator GlobalSegment = GlobalLI->find(Pos: LocalLI->beginIndex()); |
2116 | // If GlobalLI does not overlap LocalLI->start, then a copy directly feeds a |
2117 | // local live range. We could create edges from other global uses to the local |
2118 | // start, but the coalescer should have already eliminated these cases, so |
2119 | // don't bother dealing with it. |
2120 | if (GlobalSegment == GlobalLI->end()) |
2121 | return; |
2122 | |
2123 | // If GlobalSegment is killed at the LocalLI->start, the call to find() |
2124 | // returned the next global segment. But if GlobalSegment overlaps with |
2125 | // LocalLI->start, then advance to the next segment. If a hole in GlobalLI |
2126 | // exists in LocalLI's vicinity, GlobalSegment will be the end of the hole. |
2127 | if (GlobalSegment->contains(I: LocalLI->beginIndex())) |
2128 | ++GlobalSegment; |
2129 | |
2130 | if (GlobalSegment == GlobalLI->end()) |
2131 | return; |
2132 | |
2133 | // Check if GlobalLI contains a hole in the vicinity of LocalLI. |
2134 | if (GlobalSegment != GlobalLI->begin()) { |
2135 | // Two address defs have no hole. |
2136 | if (SlotIndex::isSameInstr(A: std::prev(x: GlobalSegment)->end, |
2137 | B: GlobalSegment->start)) { |
2138 | return; |
2139 | } |
2140 | // If the prior global segment may be defined by the same two-address |
2141 | // instruction that also defines LocalLI, then can't make a hole here. |
2142 | if (SlotIndex::isSameInstr(A: std::prev(x: GlobalSegment)->start, |
2143 | B: LocalLI->beginIndex())) { |
2144 | return; |
2145 | } |
2146 | // If GlobalLI has a prior segment, it must be live into the EBB. Otherwise |
2147 | // it would be a disconnected component in the live range. |
2148 | assert(std::prev(GlobalSegment)->start < LocalLI->beginIndex() && |
2149 | "Disconnected LRG within the scheduling region." ); |
2150 | } |
2151 | MachineInstr *GlobalDef = LIS->getInstructionFromIndex(index: GlobalSegment->start); |
2152 | if (!GlobalDef) |
2153 | return; |
2154 | |
2155 | SUnit *GlobalSU = DAG->getSUnit(MI: GlobalDef); |
2156 | if (!GlobalSU) |
2157 | return; |
2158 | |
2159 | // GlobalDef is the bottom of the GlobalLI hole. Open the hole by |
2160 | // constraining the uses of the last local def to precede GlobalDef. |
2161 | SmallVector<SUnit*,8> LocalUses; |
2162 | const VNInfo *LastLocalVN = LocalLI->getVNInfoBefore(Idx: LocalLI->endIndex()); |
2163 | MachineInstr *LastLocalDef = LIS->getInstructionFromIndex(index: LastLocalVN->def); |
2164 | SUnit *LastLocalSU = DAG->getSUnit(MI: LastLocalDef); |
2165 | for (const SDep &Succ : LastLocalSU->Succs) { |
2166 | if (Succ.getKind() != SDep::Data || Succ.getReg() != LocalReg) |
2167 | continue; |
2168 | if (Succ.getSUnit() == GlobalSU) |
2169 | continue; |
2170 | if (!DAG->canAddEdge(SuccSU: GlobalSU, PredSU: Succ.getSUnit())) |
2171 | return; |
2172 | LocalUses.push_back(Elt: Succ.getSUnit()); |
2173 | } |
2174 | // Open the top of the GlobalLI hole by constraining any earlier global uses |
2175 | // to precede the start of LocalLI. |
2176 | SmallVector<SUnit*,8> GlobalUses; |
2177 | MachineInstr *FirstLocalDef = |
2178 | LIS->getInstructionFromIndex(index: LocalLI->beginIndex()); |
2179 | SUnit *FirstLocalSU = DAG->getSUnit(MI: FirstLocalDef); |
2180 | for (const SDep &Pred : GlobalSU->Preds) { |
2181 | if (Pred.getKind() != SDep::Anti || Pred.getReg() != GlobalReg) |
2182 | continue; |
2183 | if (Pred.getSUnit() == FirstLocalSU) |
2184 | continue; |
2185 | if (!DAG->canAddEdge(SuccSU: FirstLocalSU, PredSU: Pred.getSUnit())) |
2186 | return; |
2187 | GlobalUses.push_back(Elt: Pred.getSUnit()); |
2188 | } |
2189 | LLVM_DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n" ); |
2190 | // Add the weak edges. |
2191 | for (SUnit *LU : LocalUses) { |
2192 | LLVM_DEBUG(dbgs() << " Local use SU(" << LU->NodeNum << ") -> SU(" |
2193 | << GlobalSU->NodeNum << ")\n" ); |
2194 | DAG->addEdge(SuccSU: GlobalSU, PredDep: SDep(LU, SDep::Weak)); |
2195 | } |
2196 | for (SUnit *GU : GlobalUses) { |
2197 | LLVM_DEBUG(dbgs() << " Global use SU(" << GU->NodeNum << ") -> SU(" |
2198 | << FirstLocalSU->NodeNum << ")\n" ); |
2199 | DAG->addEdge(SuccSU: FirstLocalSU, PredDep: SDep(GU, SDep::Weak)); |
2200 | } |
2201 | } |
2202 | |
2203 | /// Callback from DAG postProcessing to create weak edges to encourage |
2204 | /// copy elimination. |
2205 | void CopyConstrain::apply(ScheduleDAGInstrs *DAGInstrs) { |
2206 | ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); |
2207 | assert(DAG->hasVRegLiveness() && "Expect VRegs with LiveIntervals" ); |
2208 | |
2209 | MachineBasicBlock::iterator FirstPos = nextIfDebug(I: DAG->begin(), End: DAG->end()); |
2210 | if (FirstPos == DAG->end()) |
2211 | return; |
2212 | RegionBeginIdx = DAG->getLIS()->getInstructionIndex(Instr: *FirstPos); |
2213 | RegionEndIdx = DAG->getLIS()->getInstructionIndex( |
2214 | Instr: *priorNonDebug(I: DAG->end(), Beg: DAG->begin())); |
2215 | |
2216 | for (SUnit &SU : DAG->SUnits) { |
2217 | if (!SU.getInstr()->isCopy()) |
2218 | continue; |
2219 | |
2220 | constrainLocalCopy(CopySU: &SU, DAG: static_cast<ScheduleDAGMILive*>(DAG)); |
2221 | } |
2222 | } |
2223 | |
2224 | //===----------------------------------------------------------------------===// |
2225 | // MachineSchedStrategy helpers used by GenericScheduler, GenericPostScheduler |
2226 | // and possibly other custom schedulers. |
2227 | //===----------------------------------------------------------------------===// |
2228 | |
2229 | static const unsigned InvalidCycle = ~0U; |
2230 | |
2231 | SchedBoundary::~SchedBoundary() { delete HazardRec; } |
2232 | |
2233 | /// Given a Count of resource usage and a Latency value, return true if a |
2234 | /// SchedBoundary becomes resource limited. |
2235 | /// If we are checking after scheduling a node, we should return true when |
2236 | /// we just reach the resource limit. |
2237 | static bool checkResourceLimit(unsigned LFactor, unsigned Count, |
2238 | unsigned Latency, bool AfterSchedNode) { |
2239 | int ResCntFactor = (int)(Count - (Latency * LFactor)); |
2240 | if (AfterSchedNode) |
2241 | return ResCntFactor >= (int)LFactor; |
2242 | else |
2243 | return ResCntFactor > (int)LFactor; |
2244 | } |
2245 | |
2246 | void SchedBoundary::reset() { |
2247 | // A new HazardRec is created for each DAG and owned by SchedBoundary. |
2248 | // Destroying and reconstructing it is very expensive though. So keep |
2249 | // invalid, placeholder HazardRecs. |
2250 | if (HazardRec && HazardRec->isEnabled()) { |
2251 | delete HazardRec; |
2252 | HazardRec = nullptr; |
2253 | } |
2254 | Available.clear(); |
2255 | Pending.clear(); |
2256 | CheckPending = false; |
2257 | CurrCycle = 0; |
2258 | CurrMOps = 0; |
2259 | MinReadyCycle = std::numeric_limits<unsigned>::max(); |
2260 | ExpectedLatency = 0; |
2261 | DependentLatency = 0; |
2262 | RetiredMOps = 0; |
2263 | MaxExecutedResCount = 0; |
2264 | ZoneCritResIdx = 0; |
2265 | IsResourceLimited = false; |
2266 | ReservedCycles.clear(); |
2267 | ReservedResourceSegments.clear(); |
2268 | ReservedCyclesIndex.clear(); |
2269 | ResourceGroupSubUnitMasks.clear(); |
2270 | #if LLVM_ENABLE_ABI_BREAKING_CHECKS |
2271 | // Track the maximum number of stall cycles that could arise either from the |
2272 | // latency of a DAG edge or the number of cycles that a processor resource is |
2273 | // reserved (SchedBoundary::ReservedCycles). |
2274 | MaxObservedStall = 0; |
2275 | #endif |
2276 | // Reserve a zero-count for invalid CritResIdx. |
2277 | ExecutedResCounts.resize(N: 1); |
2278 | assert(!ExecutedResCounts[0] && "nonzero count for bad resource" ); |
2279 | } |
2280 | |
2281 | void SchedRemainder:: |
2282 | init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) { |
2283 | reset(); |
2284 | if (!SchedModel->hasInstrSchedModel()) |
2285 | return; |
2286 | RemainingCounts.resize(N: SchedModel->getNumProcResourceKinds()); |
2287 | for (SUnit &SU : DAG->SUnits) { |
2288 | const MCSchedClassDesc *SC = DAG->getSchedClass(SU: &SU); |
2289 | RemIssueCount += SchedModel->getNumMicroOps(MI: SU.getInstr(), SC) |
2290 | * SchedModel->getMicroOpFactor(); |
2291 | for (TargetSchedModel::ProcResIter |
2292 | PI = SchedModel->getWriteProcResBegin(SC), |
2293 | PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { |
2294 | unsigned PIdx = PI->ProcResourceIdx; |
2295 | unsigned Factor = SchedModel->getResourceFactor(ResIdx: PIdx); |
2296 | assert(PI->ReleaseAtCycle >= PI->AcquireAtCycle); |
2297 | RemainingCounts[PIdx] += |
2298 | (Factor * (PI->ReleaseAtCycle - PI->AcquireAtCycle)); |
2299 | } |
2300 | } |
2301 | } |
2302 | |
2303 | void SchedBoundary:: |
2304 | init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) { |
2305 | reset(); |
2306 | DAG = dag; |
2307 | SchedModel = smodel; |
2308 | Rem = rem; |
2309 | if (SchedModel->hasInstrSchedModel()) { |
2310 | unsigned ResourceCount = SchedModel->getNumProcResourceKinds(); |
2311 | ReservedCyclesIndex.resize(N: ResourceCount); |
2312 | ExecutedResCounts.resize(N: ResourceCount); |
2313 | ResourceGroupSubUnitMasks.resize(N: ResourceCount, NV: APInt(ResourceCount, 0)); |
2314 | unsigned NumUnits = 0; |
2315 | |
2316 | for (unsigned i = 0; i < ResourceCount; ++i) { |
2317 | ReservedCyclesIndex[i] = NumUnits; |
2318 | NumUnits += SchedModel->getProcResource(PIdx: i)->NumUnits; |
2319 | if (isUnbufferedGroup(PIdx: i)) { |
2320 | auto SubUnits = SchedModel->getProcResource(PIdx: i)->SubUnitsIdxBegin; |
2321 | for (unsigned U = 0, UE = SchedModel->getProcResource(PIdx: i)->NumUnits; |
2322 | U != UE; ++U) |
2323 | ResourceGroupSubUnitMasks[i].setBit(SubUnits[U]); |
2324 | } |
2325 | } |
2326 | |
2327 | ReservedCycles.resize(new_size: NumUnits, x: InvalidCycle); |
2328 | } |
2329 | } |
2330 | |
2331 | /// Compute the stall cycles based on this SUnit's ready time. Heuristics treat |
2332 | /// these "soft stalls" differently than the hard stall cycles based on CPU |
2333 | /// resources and computed by checkHazard(). A fully in-order model |
2334 | /// (MicroOpBufferSize==0) will not make use of this since instructions are not |
2335 | /// available for scheduling until they are ready. However, a weaker in-order |
2336 | /// model may use this for heuristics. For example, if a processor has in-order |
2337 | /// behavior when reading certain resources, this may come into play. |
2338 | unsigned SchedBoundary::getLatencyStallCycles(SUnit *SU) { |
2339 | if (!SU->isUnbuffered) |
2340 | return 0; |
2341 | |
2342 | unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle); |
2343 | if (ReadyCycle > CurrCycle) |
2344 | return ReadyCycle - CurrCycle; |
2345 | return 0; |
2346 | } |
2347 | |
2348 | /// Compute the next cycle at which the given processor resource unit |
2349 | /// can be scheduled. |
2350 | unsigned SchedBoundary::getNextResourceCycleByInstance(unsigned InstanceIdx, |
2351 | unsigned ReleaseAtCycle, |
2352 | unsigned AcquireAtCycle) { |
2353 | if (SchedModel && SchedModel->enableIntervals()) { |
2354 | if (isTop()) |
2355 | return ReservedResourceSegments[InstanceIdx].getFirstAvailableAtFromTop( |
2356 | CurrCycle, AcquireAtCycle, ReleaseAtCycle); |
2357 | |
2358 | return ReservedResourceSegments[InstanceIdx].getFirstAvailableAtFromBottom( |
2359 | CurrCycle, AcquireAtCycle, ReleaseAtCycle); |
2360 | } |
2361 | |
2362 | unsigned NextUnreserved = ReservedCycles[InstanceIdx]; |
2363 | // If this resource has never been used, always return cycle zero. |
2364 | if (NextUnreserved == InvalidCycle) |
2365 | return CurrCycle; |
2366 | // For bottom-up scheduling add the cycles needed for the current operation. |
2367 | if (!isTop()) |
2368 | NextUnreserved = std::max(a: CurrCycle, b: NextUnreserved + ReleaseAtCycle); |
2369 | return NextUnreserved; |
2370 | } |
2371 | |
2372 | /// Compute the next cycle at which the given processor resource can be |
2373 | /// scheduled. Returns the next cycle and the index of the processor resource |
2374 | /// instance in the reserved cycles vector. |
2375 | std::pair<unsigned, unsigned> |
2376 | SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx, |
2377 | unsigned ReleaseAtCycle, |
2378 | unsigned AcquireAtCycle) { |
2379 | if (MischedDetailResourceBooking) { |
2380 | LLVM_DEBUG(dbgs() << " Resource booking (@" << CurrCycle << "c): \n" ); |
2381 | LLVM_DEBUG(dumpReservedCycles()); |
2382 | LLVM_DEBUG(dbgs() << " getNextResourceCycle (@" << CurrCycle << "c): \n" ); |
2383 | } |
2384 | unsigned MinNextUnreserved = InvalidCycle; |
2385 | unsigned InstanceIdx = 0; |
2386 | unsigned StartIndex = ReservedCyclesIndex[PIdx]; |
2387 | unsigned NumberOfInstances = SchedModel->getProcResource(PIdx)->NumUnits; |
2388 | assert(NumberOfInstances > 0 && |
2389 | "Cannot have zero instances of a ProcResource" ); |
2390 | |
2391 | if (isUnbufferedGroup(PIdx)) { |
2392 | // If any subunits are used by the instruction, report that the |
2393 | // subunits of the resource group are available at the first cycle |
2394 | // in which the unit is available, effectively removing the group |
2395 | // record from hazarding and basing the hazarding decisions on the |
2396 | // subunit records. Otherwise, choose the first available instance |
2397 | // from among the subunits. Specifications which assign cycles to |
2398 | // both the subunits and the group or which use an unbuffered |
2399 | // group with buffered subunits will appear to schedule |
2400 | // strangely. In the first case, the additional cycles for the |
2401 | // group will be ignored. In the second, the group will be |
2402 | // ignored entirely. |
2403 | for (const MCWriteProcResEntry &PE : |
2404 | make_range(x: SchedModel->getWriteProcResBegin(SC), |
2405 | y: SchedModel->getWriteProcResEnd(SC))) |
2406 | if (ResourceGroupSubUnitMasks[PIdx][PE.ProcResourceIdx]) |
2407 | return std::make_pair(x: getNextResourceCycleByInstance( |
2408 | InstanceIdx: StartIndex, ReleaseAtCycle, AcquireAtCycle), |
2409 | y&: StartIndex); |
2410 | |
2411 | auto SubUnits = SchedModel->getProcResource(PIdx)->SubUnitsIdxBegin; |
2412 | for (unsigned I = 0, End = NumberOfInstances; I < End; ++I) { |
2413 | unsigned NextUnreserved, NextInstanceIdx; |
2414 | std::tie(args&: NextUnreserved, args&: NextInstanceIdx) = |
2415 | getNextResourceCycle(SC, PIdx: SubUnits[I], ReleaseAtCycle, AcquireAtCycle); |
2416 | if (MinNextUnreserved > NextUnreserved) { |
2417 | InstanceIdx = NextInstanceIdx; |
2418 | MinNextUnreserved = NextUnreserved; |
2419 | } |
2420 | } |
2421 | return std::make_pair(x&: MinNextUnreserved, y&: InstanceIdx); |
2422 | } |
2423 | |
2424 | for (unsigned I = StartIndex, End = StartIndex + NumberOfInstances; I < End; |
2425 | ++I) { |
2426 | unsigned NextUnreserved = |
2427 | getNextResourceCycleByInstance(InstanceIdx: I, ReleaseAtCycle, AcquireAtCycle); |
2428 | if (MischedDetailResourceBooking) |
2429 | LLVM_DEBUG(dbgs() << " Instance " << I - StartIndex << " available @" |
2430 | << NextUnreserved << "c\n" ); |
2431 | if (MinNextUnreserved > NextUnreserved) { |
2432 | InstanceIdx = I; |
2433 | MinNextUnreserved = NextUnreserved; |
2434 | } |
2435 | } |
2436 | if (MischedDetailResourceBooking) |
2437 | LLVM_DEBUG(dbgs() << " selecting " << SchedModel->getResourceName(PIdx) |
2438 | << "[" << InstanceIdx - StartIndex << "]" |
2439 | << " available @" << MinNextUnreserved << "c" |
2440 | << "\n" ); |
2441 | return std::make_pair(x&: MinNextUnreserved, y&: InstanceIdx); |
2442 | } |
2443 | |
2444 | /// Does this SU have a hazard within the current instruction group. |
2445 | /// |
2446 | /// The scheduler supports two modes of hazard recognition. The first is the |
2447 | /// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that |
2448 | /// supports highly complicated in-order reservation tables |
2449 | /// (ScoreboardHazardRecognizer) and arbitrary target-specific logic. |
2450 | /// |
2451 | /// The second is a streamlined mechanism that checks for hazards based on |
2452 | /// simple counters that the scheduler itself maintains. It explicitly checks |
2453 | /// for instruction dispatch limitations, including the number of micro-ops that |
2454 | /// can dispatch per cycle. |
2455 | /// |
2456 | /// TODO: Also check whether the SU must start a new group. |
2457 | bool SchedBoundary::checkHazard(SUnit *SU) { |
2458 | if (HazardRec->isEnabled() |
2459 | && HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) { |
2460 | return true; |
2461 | } |
2462 | |
2463 | unsigned uops = SchedModel->getNumMicroOps(MI: SU->getInstr()); |
2464 | if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) { |
2465 | LLVM_DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops=" |
2466 | << SchedModel->getNumMicroOps(SU->getInstr()) << '\n'); |
2467 | return true; |
2468 | } |
2469 | |
2470 | if (CurrMOps > 0 && |
2471 | ((isTop() && SchedModel->mustBeginGroup(MI: SU->getInstr())) || |
2472 | (!isTop() && SchedModel->mustEndGroup(MI: SU->getInstr())))) { |
2473 | LLVM_DEBUG(dbgs() << " hazard: SU(" << SU->NodeNum << ") must " |
2474 | << (isTop() ? "begin" : "end" ) << " group\n" ); |
2475 | return true; |
2476 | } |
2477 | |
2478 | if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) { |
2479 | const MCSchedClassDesc *SC = DAG->getSchedClass(SU); |
2480 | for (const MCWriteProcResEntry &PE : |
2481 | make_range(x: SchedModel->getWriteProcResBegin(SC), |
2482 | y: SchedModel->getWriteProcResEnd(SC))) { |
2483 | unsigned ResIdx = PE.ProcResourceIdx; |
2484 | unsigned ReleaseAtCycle = PE.ReleaseAtCycle; |
2485 | unsigned AcquireAtCycle = PE.AcquireAtCycle; |
2486 | unsigned NRCycle, InstanceIdx; |
2487 | std::tie(args&: NRCycle, args&: InstanceIdx) = |
2488 | getNextResourceCycle(SC, PIdx: ResIdx, ReleaseAtCycle, AcquireAtCycle); |
2489 | if (NRCycle > CurrCycle) { |
2490 | #if LLVM_ENABLE_ABI_BREAKING_CHECKS |
2491 | MaxObservedStall = std::max(ReleaseAtCycle, MaxObservedStall); |
2492 | #endif |
2493 | LLVM_DEBUG(dbgs() << " SU(" << SU->NodeNum << ") " |
2494 | << SchedModel->getResourceName(ResIdx) |
2495 | << '[' << InstanceIdx - ReservedCyclesIndex[ResIdx] << ']' |
2496 | << "=" << NRCycle << "c\n" ); |
2497 | return true; |
2498 | } |
2499 | } |
2500 | } |
2501 | return false; |
2502 | } |
2503 | |
2504 | // Find the unscheduled node in ReadySUs with the highest latency. |
2505 | unsigned SchedBoundary:: |
2506 | findMaxLatency(ArrayRef<SUnit*> ReadySUs) { |
2507 | SUnit *LateSU = nullptr; |
2508 | unsigned RemLatency = 0; |
2509 | for (SUnit *SU : ReadySUs) { |
2510 | unsigned L = getUnscheduledLatency(SU); |
2511 | if (L > RemLatency) { |
2512 | RemLatency = L; |
2513 | LateSU = SU; |
2514 | } |
2515 | } |
2516 | if (LateSU) { |
2517 | LLVM_DEBUG(dbgs() << Available.getName() << " RemLatency SU(" |
2518 | << LateSU->NodeNum << ") " << RemLatency << "c\n" ); |
2519 | } |
2520 | return RemLatency; |
2521 | } |
2522 | |
2523 | // Count resources in this zone and the remaining unscheduled |
2524 | // instruction. Return the max count, scaled. Set OtherCritIdx to the critical |
2525 | // resource index, or zero if the zone is issue limited. |
2526 | unsigned SchedBoundary:: |
2527 | getOtherResourceCount(unsigned &OtherCritIdx) { |
2528 | OtherCritIdx = 0; |
2529 | if (!SchedModel->hasInstrSchedModel()) |
2530 | return 0; |
2531 | |
2532 | unsigned OtherCritCount = Rem->RemIssueCount |
2533 | + (RetiredMOps * SchedModel->getMicroOpFactor()); |
2534 | LLVM_DEBUG(dbgs() << " " << Available.getName() << " + Remain MOps: " |
2535 | << OtherCritCount / SchedModel->getMicroOpFactor() << '\n'); |
2536 | for (unsigned PIdx = 1, PEnd = SchedModel->getNumProcResourceKinds(); |
2537 | PIdx != PEnd; ++PIdx) { |
2538 | unsigned OtherCount = getResourceCount(ResIdx: PIdx) + Rem->RemainingCounts[PIdx]; |
2539 | if (OtherCount > OtherCritCount) { |
2540 | OtherCritCount = OtherCount; |
2541 | OtherCritIdx = PIdx; |
2542 | } |
2543 | } |
2544 | if (OtherCritIdx) { |
2545 | LLVM_DEBUG( |
2546 | dbgs() << " " << Available.getName() << " + Remain CritRes: " |
2547 | << OtherCritCount / SchedModel->getResourceFactor(OtherCritIdx) |
2548 | << " " << SchedModel->getResourceName(OtherCritIdx) << "\n" ); |
2549 | } |
2550 | return OtherCritCount; |
2551 | } |
2552 | |
2553 | void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle, bool InPQueue, |
2554 | unsigned Idx) { |
2555 | assert(SU->getInstr() && "Scheduled SUnit must have instr" ); |
2556 | |
2557 | #if LLVM_ENABLE_ABI_BREAKING_CHECKS |
2558 | // ReadyCycle was been bumped up to the CurrCycle when this node was |
2559 | // scheduled, but CurrCycle may have been eagerly advanced immediately after |
2560 | // scheduling, so may now be greater than ReadyCycle. |
2561 | if (ReadyCycle > CurrCycle) |
2562 | MaxObservedStall = std::max(ReadyCycle - CurrCycle, MaxObservedStall); |
2563 | #endif |
2564 | |
2565 | if (ReadyCycle < MinReadyCycle) |
2566 | MinReadyCycle = ReadyCycle; |
2567 | |
2568 | // Check for interlocks first. For the purpose of other heuristics, an |
2569 | // instruction that cannot issue appears as if it's not in the ReadyQueue. |
2570 | bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0; |
2571 | bool HazardDetected = (!IsBuffered && ReadyCycle > CurrCycle) || |
2572 | checkHazard(SU) || (Available.size() >= ReadyListLimit); |
2573 | |
2574 | if (!HazardDetected) { |
2575 | Available.push(SU); |
2576 | |
2577 | if (InPQueue) |
2578 | Pending.remove(I: Pending.begin() + Idx); |
2579 | return; |
2580 | } |
2581 | |
2582 | if (!InPQueue) |
2583 | Pending.push(SU); |
2584 | } |
2585 | |
2586 | /// Move the boundary of scheduled code by one cycle. |
2587 | void SchedBoundary::bumpCycle(unsigned NextCycle) { |
2588 | if (SchedModel->getMicroOpBufferSize() == 0) { |
2589 | assert(MinReadyCycle < std::numeric_limits<unsigned>::max() && |
2590 | "MinReadyCycle uninitialized" ); |
2591 | if (MinReadyCycle > NextCycle) |
2592 | NextCycle = MinReadyCycle; |
2593 | } |
2594 | // Update the current micro-ops, which will issue in the next cycle. |
2595 | unsigned DecMOps = SchedModel->getIssueWidth() * (NextCycle - CurrCycle); |
2596 | CurrMOps = (CurrMOps <= DecMOps) ? 0 : CurrMOps - DecMOps; |
2597 | |
2598 | // Decrement DependentLatency based on the next cycle. |
2599 | if ((NextCycle - CurrCycle) > DependentLatency) |
2600 | DependentLatency = 0; |
2601 | else |
2602 | DependentLatency -= (NextCycle - CurrCycle); |
2603 | |
2604 | if (!HazardRec->isEnabled()) { |
2605 | // Bypass HazardRec virtual calls. |
2606 | CurrCycle = NextCycle; |
2607 | } else { |
2608 | // Bypass getHazardType calls in case of long latency. |
2609 | for (; CurrCycle != NextCycle; ++CurrCycle) { |
2610 | if (isTop()) |
2611 | HazardRec->AdvanceCycle(); |
2612 | else |
2613 | HazardRec->RecedeCycle(); |
2614 | } |
2615 | } |
2616 | CheckPending = true; |
2617 | IsResourceLimited = |
2618 | checkResourceLimit(LFactor: SchedModel->getLatencyFactor(), Count: getCriticalCount(), |
2619 | Latency: getScheduledLatency(), AfterSchedNode: true); |
2620 | |
2621 | LLVM_DEBUG(dbgs() << "Cycle: " << CurrCycle << ' ' << Available.getName() |
2622 | << '\n'); |
2623 | } |
2624 | |
2625 | void SchedBoundary::incExecutedResources(unsigned PIdx, unsigned Count) { |
2626 | ExecutedResCounts[PIdx] += Count; |
2627 | if (ExecutedResCounts[PIdx] > MaxExecutedResCount) |
2628 | MaxExecutedResCount = ExecutedResCounts[PIdx]; |
2629 | } |
2630 | |
2631 | /// Add the given processor resource to this scheduled zone. |
2632 | /// |
2633 | /// \param ReleaseAtCycle indicates the number of consecutive (non-pipelined) |
2634 | /// cycles during which this resource is released. |
2635 | /// |
2636 | /// \param AcquireAtCycle indicates the number of consecutive (non-pipelined) |
2637 | /// cycles at which the resource is aquired after issue (assuming no stalls). |
2638 | /// |
2639 | /// \return the next cycle at which the instruction may execute without |
2640 | /// oversubscribing resources. |
2641 | unsigned SchedBoundary::countResource(const MCSchedClassDesc *SC, unsigned PIdx, |
2642 | unsigned ReleaseAtCycle, |
2643 | unsigned NextCycle, |
2644 | unsigned AcquireAtCycle) { |
2645 | unsigned Factor = SchedModel->getResourceFactor(ResIdx: PIdx); |
2646 | unsigned Count = Factor * (ReleaseAtCycle- AcquireAtCycle); |
2647 | LLVM_DEBUG(dbgs() << " " << SchedModel->getResourceName(PIdx) << " +" |
2648 | << ReleaseAtCycle << "x" << Factor << "u\n" ); |
2649 | |
2650 | // Update Executed resources counts. |
2651 | incExecutedResources(PIdx, Count); |
2652 | assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted" ); |
2653 | Rem->RemainingCounts[PIdx] -= Count; |
2654 | |
2655 | // Check if this resource exceeds the current critical resource. If so, it |
2656 | // becomes the critical resource. |
2657 | if (ZoneCritResIdx != PIdx && (getResourceCount(ResIdx: PIdx) > getCriticalCount())) { |
2658 | ZoneCritResIdx = PIdx; |
2659 | LLVM_DEBUG(dbgs() << " *** Critical resource " |
2660 | << SchedModel->getResourceName(PIdx) << ": " |
2661 | << getResourceCount(PIdx) / SchedModel->getLatencyFactor() |
2662 | << "c\n" ); |
2663 | } |
2664 | // For reserved resources, record the highest cycle using the resource. |
2665 | unsigned NextAvailable, InstanceIdx; |
2666 | std::tie(args&: NextAvailable, args&: InstanceIdx) = |
2667 | getNextResourceCycle(SC, PIdx, ReleaseAtCycle, AcquireAtCycle); |
2668 | if (NextAvailable > CurrCycle) { |
2669 | LLVM_DEBUG(dbgs() << " Resource conflict: " |
2670 | << SchedModel->getResourceName(PIdx) |
2671 | << '[' << InstanceIdx - ReservedCyclesIndex[PIdx] << ']' |
2672 | << " reserved until @" << NextAvailable << "\n" ); |
2673 | } |
2674 | return NextAvailable; |
2675 | } |
2676 | |
2677 | /// Move the boundary of scheduled code by one SUnit. |
2678 | void SchedBoundary::bumpNode(SUnit *SU) { |
2679 | // Update the reservation table. |
2680 | if (HazardRec->isEnabled()) { |
2681 | if (!isTop() && SU->isCall) { |
2682 | // Calls are scheduled with their preceding instructions. For bottom-up |
2683 | // scheduling, clear the pipeline state before emitting. |
2684 | HazardRec->Reset(); |
2685 | } |
2686 | HazardRec->EmitInstruction(SU); |
2687 | // Scheduling an instruction may have made pending instructions available. |
2688 | CheckPending = true; |
2689 | } |
2690 | // checkHazard should prevent scheduling multiple instructions per cycle that |
2691 | // exceed the issue width. |
2692 | const MCSchedClassDesc *SC = DAG->getSchedClass(SU); |
2693 | unsigned IncMOps = SchedModel->getNumMicroOps(MI: SU->getInstr()); |
2694 | assert( |
2695 | (CurrMOps == 0 || (CurrMOps + IncMOps) <= SchedModel->getIssueWidth()) && |
2696 | "Cannot schedule this instruction's MicroOps in the current cycle." ); |
2697 | |
2698 | unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle); |
2699 | LLVM_DEBUG(dbgs() << " Ready @" << ReadyCycle << "c\n" ); |
2700 | |
2701 | unsigned NextCycle = CurrCycle; |
2702 | switch (SchedModel->getMicroOpBufferSize()) { |
2703 | case 0: |
2704 | assert(ReadyCycle <= CurrCycle && "Broken PendingQueue" ); |
2705 | break; |
2706 | case 1: |
2707 | if (ReadyCycle > NextCycle) { |
2708 | NextCycle = ReadyCycle; |
2709 | LLVM_DEBUG(dbgs() << " *** Stall until: " << ReadyCycle << "\n" ); |
2710 | } |
2711 | break; |
2712 | default: |
2713 | // We don't currently model the OOO reorder buffer, so consider all |
2714 | // scheduled MOps to be "retired". We do loosely model in-order resource |
2715 | // latency. If this instruction uses an in-order resource, account for any |
2716 | // likely stall cycles. |
2717 | if (SU->isUnbuffered && ReadyCycle > NextCycle) |
2718 | NextCycle = ReadyCycle; |
2719 | break; |
2720 | } |
2721 | RetiredMOps += IncMOps; |
2722 | |
2723 | // Update resource counts and critical resource. |
2724 | if (SchedModel->hasInstrSchedModel()) { |
2725 | unsigned DecRemIssue = IncMOps * SchedModel->getMicroOpFactor(); |
2726 | assert(Rem->RemIssueCount >= DecRemIssue && "MOps double counted" ); |
2727 | Rem->RemIssueCount -= DecRemIssue; |
2728 | if (ZoneCritResIdx) { |
2729 | // Scale scheduled micro-ops for comparing with the critical resource. |
2730 | unsigned ScaledMOps = |
2731 | RetiredMOps * SchedModel->getMicroOpFactor(); |
2732 | |
2733 | // If scaled micro-ops are now more than the previous critical resource by |
2734 | // a full cycle, then micro-ops issue becomes critical. |
2735 | if ((int)(ScaledMOps - getResourceCount(ResIdx: ZoneCritResIdx)) |
2736 | >= (int)SchedModel->getLatencyFactor()) { |
2737 | ZoneCritResIdx = 0; |
2738 | LLVM_DEBUG(dbgs() << " *** Critical resource NumMicroOps: " |
2739 | << ScaledMOps / SchedModel->getLatencyFactor() |
2740 | << "c\n" ); |
2741 | } |
2742 | } |
2743 | for (TargetSchedModel::ProcResIter |
2744 | PI = SchedModel->getWriteProcResBegin(SC), |
2745 | PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { |
2746 | unsigned RCycle = |
2747 | countResource(SC, PIdx: PI->ProcResourceIdx, ReleaseAtCycle: PI->ReleaseAtCycle, NextCycle, |
2748 | AcquireAtCycle: PI->AcquireAtCycle); |
2749 | if (RCycle > NextCycle) |
2750 | NextCycle = RCycle; |
2751 | } |
2752 | if (SU->hasReservedResource) { |
2753 | // For reserved resources, record the highest cycle using the resource. |
2754 | // For top-down scheduling, this is the cycle in which we schedule this |
2755 | // instruction plus the number of cycles the operations reserves the |
2756 | // resource. For bottom-up is it simply the instruction's cycle. |
2757 | for (TargetSchedModel::ProcResIter |
2758 | PI = SchedModel->getWriteProcResBegin(SC), |
2759 | PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { |
2760 | unsigned PIdx = PI->ProcResourceIdx; |
2761 | if (SchedModel->getProcResource(PIdx)->BufferSize == 0) { |
2762 | |
2763 | if (SchedModel && SchedModel->enableIntervals()) { |
2764 | unsigned ReservedUntil, InstanceIdx; |
2765 | std::tie(args&: ReservedUntil, args&: InstanceIdx) = getNextResourceCycle( |
2766 | SC, PIdx, ReleaseAtCycle: PI->ReleaseAtCycle, AcquireAtCycle: PI->AcquireAtCycle); |
2767 | if (isTop()) { |
2768 | ReservedResourceSegments[InstanceIdx].add( |
2769 | A: ResourceSegments::getResourceIntervalTop( |
2770 | C: NextCycle, AcquireAtCycle: PI->AcquireAtCycle, ReleaseAtCycle: PI->ReleaseAtCycle), |
2771 | CutOff: MIResourceCutOff); |
2772 | } else { |
2773 | ReservedResourceSegments[InstanceIdx].add( |
2774 | A: ResourceSegments::getResourceIntervalBottom( |
2775 | C: NextCycle, AcquireAtCycle: PI->AcquireAtCycle, ReleaseAtCycle: PI->ReleaseAtCycle), |
2776 | CutOff: MIResourceCutOff); |
2777 | } |
2778 | } else { |
2779 | |
2780 | unsigned ReservedUntil, InstanceIdx; |
2781 | std::tie(args&: ReservedUntil, args&: InstanceIdx) = getNextResourceCycle( |
2782 | SC, PIdx, ReleaseAtCycle: PI->ReleaseAtCycle, AcquireAtCycle: PI->AcquireAtCycle); |
2783 | if (isTop()) { |
2784 | ReservedCycles[InstanceIdx] = |
2785 | std::max(a: ReservedUntil, b: NextCycle + PI->ReleaseAtCycle); |
2786 | } else |
2787 | ReservedCycles[InstanceIdx] = NextCycle; |
2788 | } |
2789 | } |
2790 | } |
2791 | } |
2792 | } |
2793 | // Update ExpectedLatency and DependentLatency. |
2794 | unsigned &TopLatency = isTop() ? ExpectedLatency : DependentLatency; |
2795 | unsigned &BotLatency = isTop() ? DependentLatency : ExpectedLatency; |
2796 | if (SU->getDepth() > TopLatency) { |
2797 | TopLatency = SU->getDepth(); |
2798 | LLVM_DEBUG(dbgs() << " " << Available.getName() << " TopLatency SU(" |
2799 | << SU->NodeNum << ") " << TopLatency << "c\n" ); |
2800 | } |
2801 | if (SU->getHeight() > BotLatency) { |
2802 | BotLatency = SU->getHeight(); |
2803 | LLVM_DEBUG(dbgs() << " " << Available.getName() << " BotLatency SU(" |
2804 | << SU->NodeNum << ") " << BotLatency << "c\n" ); |
2805 | } |
2806 | // If we stall for any reason, bump the cycle. |
2807 | if (NextCycle > CurrCycle) |
2808 | bumpCycle(NextCycle); |
2809 | else |
2810 | // After updating ZoneCritResIdx and ExpectedLatency, check if we're |
2811 | // resource limited. If a stall occurred, bumpCycle does this. |
2812 | IsResourceLimited = |
2813 | checkResourceLimit(LFactor: SchedModel->getLatencyFactor(), Count: getCriticalCount(), |
2814 | Latency: getScheduledLatency(), AfterSchedNode: true); |
2815 | |
2816 | // Update CurrMOps after calling bumpCycle to handle stalls, since bumpCycle |
2817 | // resets CurrMOps. Loop to handle instructions with more MOps than issue in |
2818 | // one cycle. Since we commonly reach the max MOps here, opportunistically |
2819 | // bump the cycle to avoid uselessly checking everything in the readyQ. |
2820 | CurrMOps += IncMOps; |
2821 | |
2822 | // Bump the cycle count for issue group constraints. |
2823 | // This must be done after NextCycle has been adjust for all other stalls. |
2824 | // Calling bumpCycle(X) will reduce CurrMOps by one issue group and set |
2825 | // currCycle to X. |
2826 | if ((isTop() && SchedModel->mustEndGroup(MI: SU->getInstr())) || |
2827 | (!isTop() && SchedModel->mustBeginGroup(MI: SU->getInstr()))) { |
2828 | LLVM_DEBUG(dbgs() << " Bump cycle to " << (isTop() ? "end" : "begin" ) |
2829 | << " group\n" ); |
2830 | bumpCycle(NextCycle: ++NextCycle); |
2831 | } |
2832 | |
2833 | while (CurrMOps >= SchedModel->getIssueWidth()) { |
2834 | LLVM_DEBUG(dbgs() << " *** Max MOps " << CurrMOps << " at cycle " |
2835 | << CurrCycle << '\n'); |
2836 | bumpCycle(NextCycle: ++NextCycle); |
2837 | } |
2838 | LLVM_DEBUG(dumpScheduledState()); |
2839 | } |
2840 | |
2841 | /// Release pending ready nodes in to the available queue. This makes them |
2842 | /// visible to heuristics. |
2843 | void SchedBoundary::releasePending() { |
2844 | // If the available queue is empty, it is safe to reset MinReadyCycle. |
2845 | if (Available.empty()) |
2846 | MinReadyCycle = std::numeric_limits<unsigned>::max(); |
2847 | |
2848 | // Check to see if any of the pending instructions are ready to issue. If |
2849 | // so, add them to the available queue. |
2850 | for (unsigned I = 0, E = Pending.size(); I < E; ++I) { |
2851 | SUnit *SU = *(Pending.begin() + I); |
2852 | unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle; |
2853 | |
2854 | if (ReadyCycle < MinReadyCycle) |
2855 | MinReadyCycle = ReadyCycle; |
2856 | |
2857 | if (Available.size() >= ReadyListLimit) |
2858 | break; |
2859 | |
2860 | releaseNode(SU, ReadyCycle, InPQueue: true, Idx: I); |
2861 | if (E != Pending.size()) { |
2862 | --I; |
2863 | --E; |
2864 | } |
2865 | } |
2866 | CheckPending = false; |
2867 | } |
2868 | |
2869 | /// Remove SU from the ready set for this boundary. |
2870 | void SchedBoundary::removeReady(SUnit *SU) { |
2871 | if (Available.isInQueue(SU)) |
2872 | Available.remove(I: Available.find(SU)); |
2873 | else { |
2874 | assert(Pending.isInQueue(SU) && "bad ready count" ); |
2875 | Pending.remove(I: Pending.find(SU)); |
2876 | } |
2877 | } |
2878 | |
2879 | /// If this queue only has one ready candidate, return it. As a side effect, |
2880 | /// defer any nodes that now hit a hazard, and advance the cycle until at least |
2881 | /// one node is ready. If multiple instructions are ready, return NULL. |
2882 | SUnit *SchedBoundary::pickOnlyChoice() { |
2883 | if (CheckPending) |
2884 | releasePending(); |
2885 | |
2886 | // Defer any ready instrs that now have a hazard. |
2887 | for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) { |
2888 | if (checkHazard(SU: *I)) { |
2889 | Pending.push(SU: *I); |
2890 | I = Available.remove(I); |
2891 | continue; |
2892 | } |
2893 | ++I; |
2894 | } |
2895 | for (unsigned i = 0; Available.empty(); ++i) { |
2896 | // FIXME: Re-enable assert once PR20057 is resolved. |
2897 | // assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedStall) && |
2898 | // "permanent hazard"); |
2899 | (void)i; |
2900 | bumpCycle(NextCycle: CurrCycle + 1); |
2901 | releasePending(); |
2902 | } |
2903 | |
2904 | LLVM_DEBUG(Pending.dump()); |
2905 | LLVM_DEBUG(Available.dump()); |
2906 | |
2907 | if (Available.size() == 1) |
2908 | return *Available.begin(); |
2909 | return nullptr; |
2910 | } |
2911 | |
2912 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2913 | |
2914 | /// Dump the content of the \ref ReservedCycles vector for the |
2915 | /// resources that are used in the basic block. |
2916 | /// |
2917 | LLVM_DUMP_METHOD void SchedBoundary::dumpReservedCycles() const { |
2918 | if (!SchedModel->hasInstrSchedModel()) |
2919 | return; |
2920 | |
2921 | unsigned ResourceCount = SchedModel->getNumProcResourceKinds(); |
2922 | unsigned StartIdx = 0; |
2923 | |
2924 | for (unsigned ResIdx = 0; ResIdx < ResourceCount; ++ResIdx) { |
2925 | const unsigned NumUnits = SchedModel->getProcResource(ResIdx)->NumUnits; |
2926 | std::string ResName = SchedModel->getResourceName(ResIdx); |
2927 | for (unsigned UnitIdx = 0; UnitIdx < NumUnits; ++UnitIdx) { |
2928 | dbgs() << ResName << "(" << UnitIdx << ") = " ; |
2929 | if (SchedModel && SchedModel->enableIntervals()) { |
2930 | if (ReservedResourceSegments.count(StartIdx + UnitIdx)) |
2931 | dbgs() << ReservedResourceSegments.at(StartIdx + UnitIdx); |
2932 | else |
2933 | dbgs() << "{ }\n" ; |
2934 | } else |
2935 | dbgs() << ReservedCycles[StartIdx + UnitIdx] << "\n" ; |
2936 | } |
2937 | StartIdx += NumUnits; |
2938 | } |
2939 | } |
2940 | |
2941 | // This is useful information to dump after bumpNode. |
2942 | // Note that the Queue contents are more useful before pickNodeFromQueue. |
2943 | LLVM_DUMP_METHOD void SchedBoundary::dumpScheduledState() const { |
2944 | unsigned ResFactor; |
2945 | unsigned ResCount; |
2946 | if (ZoneCritResIdx) { |
2947 | ResFactor = SchedModel->getResourceFactor(ZoneCritResIdx); |
2948 | ResCount = getResourceCount(ZoneCritResIdx); |
2949 | } else { |
2950 | ResFactor = SchedModel->getMicroOpFactor(); |
2951 | ResCount = RetiredMOps * ResFactor; |
2952 | } |
2953 | unsigned LFactor = SchedModel->getLatencyFactor(); |
2954 | dbgs() << Available.getName() << " @" << CurrCycle << "c\n" |
2955 | << " Retired: " << RetiredMOps; |
2956 | dbgs() << "\n Executed: " << getExecutedCount() / LFactor << "c" ; |
2957 | dbgs() << "\n Critical: " << ResCount / LFactor << "c, " |
2958 | << ResCount / ResFactor << " " |
2959 | << SchedModel->getResourceName(ZoneCritResIdx) |
2960 | << "\n ExpectedLatency: " << ExpectedLatency << "c\n" |
2961 | << (IsResourceLimited ? " - Resource" : " - Latency" ) |
2962 | << " limited.\n" ; |
2963 | if (MISchedDumpReservedCycles) |
2964 | dumpReservedCycles(); |
2965 | } |
2966 | #endif |
2967 | |
2968 | //===----------------------------------------------------------------------===// |
2969 | // GenericScheduler - Generic implementation of MachineSchedStrategy. |
2970 | //===----------------------------------------------------------------------===// |
2971 | |
2972 | void GenericSchedulerBase::SchedCandidate:: |
2973 | initResourceDelta(const ScheduleDAGMI *DAG, |
2974 | const TargetSchedModel *SchedModel) { |
2975 | if (!Policy.ReduceResIdx && !Policy.DemandResIdx) |
2976 | return; |
2977 | |
2978 | const MCSchedClassDesc *SC = DAG->getSchedClass(SU); |
2979 | for (TargetSchedModel::ProcResIter |
2980 | PI = SchedModel->getWriteProcResBegin(SC), |
2981 | PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { |
2982 | if (PI->ProcResourceIdx == Policy.ReduceResIdx) |
2983 | ResDelta.CritResources += PI->ReleaseAtCycle; |
2984 | if (PI->ProcResourceIdx == Policy.DemandResIdx) |
2985 | ResDelta.DemandedResources += PI->ReleaseAtCycle; |
2986 | } |
2987 | } |
2988 | |
2989 | /// Compute remaining latency. We need this both to determine whether the |
2990 | /// overall schedule has become latency-limited and whether the instructions |
2991 | /// outside this zone are resource or latency limited. |
2992 | /// |
2993 | /// The "dependent" latency is updated incrementally during scheduling as the |
2994 | /// max height/depth of scheduled nodes minus the cycles since it was |
2995 | /// scheduled: |
2996 | /// DLat = max (N.depth - (CurrCycle - N.ReadyCycle) for N in Zone |
2997 | /// |
2998 | /// The "independent" latency is the max ready queue depth: |
2999 | /// ILat = max N.depth for N in Available|Pending |
3000 | /// |
3001 | /// RemainingLatency is the greater of independent and dependent latency. |
3002 | /// |
3003 | /// These computations are expensive, especially in DAGs with many edges, so |
3004 | /// only do them if necessary. |
3005 | static unsigned computeRemLatency(SchedBoundary &CurrZone) { |
3006 | unsigned RemLatency = CurrZone.getDependentLatency(); |
3007 | RemLatency = std::max(a: RemLatency, |
3008 | b: CurrZone.findMaxLatency(ReadySUs: CurrZone.Available.elements())); |
3009 | RemLatency = std::max(a: RemLatency, |
3010 | b: CurrZone.findMaxLatency(ReadySUs: CurrZone.Pending.elements())); |
3011 | return RemLatency; |
3012 | } |
3013 | |
3014 | /// Returns true if the current cycle plus remaning latency is greater than |
3015 | /// the critical path in the scheduling region. |
3016 | bool GenericSchedulerBase::shouldReduceLatency(const CandPolicy &Policy, |
3017 | SchedBoundary &CurrZone, |
3018 | bool ComputeRemLatency, |
3019 | unsigned &RemLatency) const { |
3020 | // The current cycle is already greater than the critical path, so we are |
3021 | // already latency limited and don't need to compute the remaining latency. |
3022 | if (CurrZone.getCurrCycle() > Rem.CriticalPath) |
3023 | return true; |
3024 | |
3025 | // If we haven't scheduled anything yet, then we aren't latency limited. |
3026 | if (CurrZone.getCurrCycle() == 0) |
3027 | return false; |
3028 | |
3029 | if (ComputeRemLatency) |
3030 | RemLatency = computeRemLatency(CurrZone); |
3031 | |
3032 | return RemLatency + CurrZone.getCurrCycle() > Rem.CriticalPath; |
3033 | } |
3034 | |
3035 | /// Set the CandPolicy given a scheduling zone given the current resources and |
3036 | /// latencies inside and outside the zone. |
3037 | void GenericSchedulerBase::setPolicy(CandPolicy &Policy, bool IsPostRA, |
3038 | SchedBoundary &CurrZone, |
3039 | SchedBoundary *OtherZone) { |
3040 | // Apply preemptive heuristics based on the total latency and resources |
3041 | // inside and outside this zone. Potential stalls should be considered before |
3042 | // following this policy. |
3043 | |
3044 | // Compute the critical resource outside the zone. |
3045 | unsigned OtherCritIdx = 0; |
3046 | unsigned OtherCount = |
3047 | OtherZone ? OtherZone->getOtherResourceCount(OtherCritIdx) : 0; |
3048 | |
3049 | bool OtherResLimited = false; |
3050 | unsigned RemLatency = 0; |
3051 | bool RemLatencyComputed = false; |
3052 | if (SchedModel->hasInstrSchedModel() && OtherCount != 0) { |
3053 | RemLatency = computeRemLatency(CurrZone); |
3054 | RemLatencyComputed = true; |
3055 | OtherResLimited = checkResourceLimit(LFactor: SchedModel->getLatencyFactor(), |
3056 | Count: OtherCount, Latency: RemLatency, AfterSchedNode: false); |
3057 | } |
3058 | |
3059 | // Schedule aggressively for latency in PostRA mode. We don't check for |
3060 | // acyclic latency during PostRA, and highly out-of-order processors will |
3061 | // skip PostRA scheduling. |
3062 | if (!OtherResLimited && |
3063 | (IsPostRA || shouldReduceLatency(Policy, CurrZone, ComputeRemLatency: !RemLatencyComputed, |
3064 | RemLatency))) { |
3065 | Policy.ReduceLatency |= true; |
3066 | LLVM_DEBUG(dbgs() << " " << CurrZone.Available.getName() |
3067 | << " RemainingLatency " << RemLatency << " + " |
3068 | << CurrZone.getCurrCycle() << "c > CritPath " |
3069 | << Rem.CriticalPath << "\n" ); |
3070 | } |
3071 | // If the same resource is limiting inside and outside the zone, do nothing. |
3072 | if (CurrZone.getZoneCritResIdx() == OtherCritIdx) |
3073 | return; |
3074 | |
3075 | LLVM_DEBUG(if (CurrZone.isResourceLimited()) { |
3076 | dbgs() << " " << CurrZone.Available.getName() << " ResourceLimited: " |
3077 | << SchedModel->getResourceName(CurrZone.getZoneCritResIdx()) << "\n" ; |
3078 | } if (OtherResLimited) dbgs() |
3079 | << " RemainingLimit: " |
3080 | << SchedModel->getResourceName(OtherCritIdx) << "\n" ; |
3081 | if (!CurrZone.isResourceLimited() && !OtherResLimited) dbgs() |
3082 | << " Latency limited both directions.\n" ); |
3083 | |
3084 | if (CurrZone.isResourceLimited() && !Policy.ReduceResIdx) |
3085 | Policy.ReduceResIdx = CurrZone.getZoneCritResIdx(); |
3086 | |
3087 | if (OtherResLimited) |
3088 | Policy.DemandResIdx = OtherCritIdx; |
3089 | } |
3090 | |
3091 | #ifndef NDEBUG |
3092 | const char *GenericSchedulerBase::getReasonStr( |
3093 | GenericSchedulerBase::CandReason Reason) { |
3094 | switch (Reason) { |
3095 | case NoCand: return "NOCAND " ; |
3096 | case Only1: return "ONLY1 " ; |
3097 | case PhysReg: return "PHYS-REG " ; |
3098 | case RegExcess: return "REG-EXCESS" ; |
3099 | case RegCritical: return "REG-CRIT " ; |
3100 | case Stall: return "STALL " ; |
3101 | case Cluster: return "CLUSTER " ; |
3102 | case Weak: return "WEAK " ; |
3103 | case RegMax: return "REG-MAX " ; |
3104 | case ResourceReduce: return "RES-REDUCE" ; |
3105 | case ResourceDemand: return "RES-DEMAND" ; |
3106 | case TopDepthReduce: return "TOP-DEPTH " ; |
3107 | case TopPathReduce: return "TOP-PATH " ; |
3108 | case BotHeightReduce:return "BOT-HEIGHT" ; |
3109 | case BotPathReduce: return "BOT-PATH " ; |
3110 | case NextDefUse: return "DEF-USE " ; |
3111 | case NodeOrder: return "ORDER " ; |
3112 | }; |
3113 | llvm_unreachable("Unknown reason!" ); |
3114 | } |
3115 | |
3116 | void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) { |
3117 | PressureChange P; |
3118 | unsigned ResIdx = 0; |
3119 | unsigned Latency = 0; |
3120 | switch (Cand.Reason) { |
3121 | default: |
3122 | break; |
3123 | case RegExcess: |
3124 | P = Cand.RPDelta.Excess; |
3125 | break; |
3126 | case RegCritical: |
3127 | P = Cand.RPDelta.CriticalMax; |
3128 | break; |
3129 | case RegMax: |
3130 | P = Cand.RPDelta.CurrentMax; |
3131 | break; |
3132 | case ResourceReduce: |
3133 | ResIdx = Cand.Policy.ReduceResIdx; |
3134 | break; |
3135 | case ResourceDemand: |
3136 | ResIdx = Cand.Policy.DemandResIdx; |
3137 | break; |
3138 | case TopDepthReduce: |
3139 | Latency = Cand.SU->getDepth(); |
3140 | break; |
3141 | case TopPathReduce: |
3142 | Latency = Cand.SU->getHeight(); |
3143 | break; |
3144 | case BotHeightReduce: |
3145 | Latency = Cand.SU->getHeight(); |
3146 | break; |
3147 | case BotPathReduce: |
3148 | Latency = Cand.SU->getDepth(); |
3149 | break; |
3150 | } |
3151 | dbgs() << " Cand SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); |
3152 | if (P.isValid()) |
3153 | dbgs() << " " << TRI->getRegPressureSetName(P.getPSet()) |
3154 | << ":" << P.getUnitInc() << " " ; |
3155 | else |
3156 | dbgs() << " " ; |
3157 | if (ResIdx) |
3158 | dbgs() << " " << SchedModel->getProcResource(ResIdx)->Name << " " ; |
3159 | else |
3160 | dbgs() << " " ; |
3161 | if (Latency) |
3162 | dbgs() << " " << Latency << " cycles " ; |
3163 | else |
3164 | dbgs() << " " ; |
3165 | dbgs() << '\n'; |
3166 | } |
3167 | #endif |
3168 | |
3169 | namespace llvm { |
3170 | /// Return true if this heuristic determines order. |
3171 | /// TODO: Consider refactor return type of these functions as integer or enum, |
3172 | /// as we may need to differentiate whether TryCand is better than Cand. |
3173 | bool tryLess(int TryVal, int CandVal, |
3174 | GenericSchedulerBase::SchedCandidate &TryCand, |
3175 | GenericSchedulerBase::SchedCandidate &Cand, |
3176 | GenericSchedulerBase::CandReason Reason) { |
3177 | if (TryVal < CandVal) { |
3178 | TryCand.Reason = Reason; |
3179 | return true; |
3180 | } |
3181 | if (TryVal > CandVal) { |
3182 | if (Cand.Reason > Reason) |
3183 | Cand.Reason = Reason; |
3184 | return true; |
3185 | } |
3186 | return false; |
3187 | } |
3188 | |
3189 | bool tryGreater(int TryVal, int CandVal, |
3190 | GenericSchedulerBase::SchedCandidate &TryCand, |
3191 | GenericSchedulerBase::SchedCandidate &Cand, |
3192 | GenericSchedulerBase::CandReason Reason) { |
3193 | if (TryVal > CandVal) { |
3194 | TryCand.Reason = Reason; |
3195 | return true; |
3196 | } |
3197 | if (TryVal < CandVal) { |
3198 | if (Cand.Reason > Reason) |
3199 | Cand.Reason = Reason; |
3200 | return true; |
3201 | } |
3202 | return false; |
3203 | } |
3204 | |
3205 | bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand, |
3206 | GenericSchedulerBase::SchedCandidate &Cand, |
3207 | SchedBoundary &Zone) { |
3208 | if (Zone.isTop()) { |
3209 | // Prefer the candidate with the lesser depth, but only if one of them has |
3210 | // depth greater than the total latency scheduled so far, otherwise either |
3211 | // of them could be scheduled now with no stall. |
3212 | if (std::max(a: TryCand.SU->getDepth(), b: Cand.SU->getDepth()) > |
3213 | Zone.getScheduledLatency()) { |
3214 | if (tryLess(TryVal: TryCand.SU->getDepth(), CandVal: Cand.SU->getDepth(), |
3215 | TryCand, Cand, Reason: GenericSchedulerBase::TopDepthReduce)) |
3216 | return true; |
3217 | } |
3218 | if (tryGreater(TryVal: TryCand.SU->getHeight(), CandVal: Cand.SU->getHeight(), |
3219 | TryCand, Cand, Reason: GenericSchedulerBase::TopPathReduce)) |
3220 | return true; |
3221 | } else { |
3222 | // Prefer the candidate with the lesser height, but only if one of them has |
3223 | // height greater than the total latency scheduled so far, otherwise either |
3224 | // of them could be scheduled now with no stall. |
3225 | if (std::max(a: TryCand.SU->getHeight(), b: Cand.SU->getHeight()) > |
3226 | Zone.getScheduledLatency()) { |
3227 | if (tryLess(TryVal: TryCand.SU->getHeight(), CandVal: Cand.SU->getHeight(), |
3228 | TryCand, Cand, Reason: GenericSchedulerBase::BotHeightReduce)) |
3229 | return true; |
3230 | } |
3231 | if (tryGreater(TryVal: TryCand.SU->getDepth(), CandVal: Cand.SU->getDepth(), |
3232 | TryCand, Cand, Reason: GenericSchedulerBase::BotPathReduce)) |
3233 | return true; |
3234 | } |
3235 | return false; |
3236 | } |
3237 | } // end namespace llvm |
3238 | |
3239 | static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop) { |
3240 | LLVM_DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot " ) |
3241 | << GenericSchedulerBase::getReasonStr(Reason) << '\n'); |
3242 | } |
3243 | |
3244 | static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand) { |
3245 | tracePick(Reason: Cand.Reason, IsTop: Cand.AtTop); |
3246 | } |
3247 | |
3248 | void GenericScheduler::initialize(ScheduleDAGMI *dag) { |
3249 | assert(dag->hasVRegLiveness() && |
3250 | "(PreRA)GenericScheduler needs vreg liveness" ); |
3251 | DAG = static_cast<ScheduleDAGMILive*>(dag); |
3252 | SchedModel = DAG->getSchedModel(); |
3253 | TRI = DAG->TRI; |
3254 | |
3255 | if (RegionPolicy.ComputeDFSResult) |
3256 | DAG->computeDFSResult(); |
3257 | |
3258 | Rem.init(DAG, SchedModel); |
3259 | Top.init(dag: DAG, smodel: SchedModel, rem: &Rem); |
3260 | Bot.init(dag: DAG, smodel: SchedModel, rem: &Rem); |
3261 | |
3262 | // Initialize resource counts. |
3263 | |
3264 | // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or |
3265 | // are disabled, then these HazardRecs will be disabled. |
3266 | const InstrItineraryData *Itin = SchedModel->getInstrItineraries(); |
3267 | if (!Top.HazardRec) { |
3268 | Top.HazardRec = DAG->TII->CreateTargetMIHazardRecognizer(Itin, DAG); |
3269 | } |
3270 | if (!Bot.HazardRec) { |
3271 | Bot.HazardRec = DAG->TII->CreateTargetMIHazardRecognizer(Itin, DAG); |
3272 | } |
3273 | TopCand.SU = nullptr; |
3274 | BotCand.SU = nullptr; |
3275 | } |
3276 | |
3277 | /// Initialize the per-region scheduling policy. |
3278 | void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, |
3279 | MachineBasicBlock::iterator End, |
3280 | unsigned NumRegionInstrs) { |
3281 | const MachineFunction &MF = *Begin->getMF(); |
3282 | const TargetLowering *TLI = MF.getSubtarget().getTargetLowering(); |
3283 | |
3284 | // Avoid setting up the register pressure tracker for small regions to save |
3285 | // compile time. As a rough heuristic, only track pressure when the number of |
3286 | // schedulable instructions exceeds half the allocatable integer register file |
3287 | // that is the largest legal integer regiser type. |
3288 | RegionPolicy.ShouldTrackPressure = true; |
3289 | for (unsigned VT = MVT::i64; VT > (unsigned)MVT::i1; --VT) { |
3290 | MVT::SimpleValueType LegalIntVT = (MVT::SimpleValueType)VT; |
3291 | if (TLI->isTypeLegal(VT: LegalIntVT)) { |
3292 | unsigned NIntRegs = Context->RegClassInfo->getNumAllocatableRegs( |
3293 | RC: TLI->getRegClassFor(VT: LegalIntVT)); |
3294 | RegionPolicy.ShouldTrackPressure = NumRegionInstrs > (NIntRegs / 2); |
3295 | break; |
3296 | } |
3297 | } |
3298 | |
3299 | // For generic targets, we default to bottom-up, because it's simpler and more |
3300 | // compile-time optimizations have been implemented in that direction. |
3301 | RegionPolicy.OnlyBottomUp = true; |
3302 | |
3303 | // Allow the subtarget to override default policy. |
3304 | MF.getSubtarget().overrideSchedPolicy(Policy&: RegionPolicy, NumRegionInstrs); |
3305 | |
3306 | // After subtarget overrides, apply command line options. |
3307 | if (!EnableRegPressure) { |
3308 | RegionPolicy.ShouldTrackPressure = false; |
3309 | RegionPolicy.ShouldTrackLaneMasks = false; |
3310 | } |
3311 | |
3312 | // Check -misched-topdown/bottomup can force or unforce scheduling direction. |
3313 | // e.g. -misched-bottomup=false allows scheduling in both directions. |
3314 | assert((!ForceTopDown || !ForceBottomUp) && |
3315 | "-misched-topdown incompatible with -misched-bottomup" ); |
3316 | if (ForceBottomUp.getNumOccurrences() > 0) { |
3317 | RegionPolicy.OnlyBottomUp = ForceBottomUp; |
3318 | if (RegionPolicy.OnlyBottomUp) |
3319 | RegionPolicy.OnlyTopDown = false; |
3320 | } |
3321 | if (ForceTopDown.getNumOccurrences() > 0) { |
3322 | RegionPolicy.OnlyTopDown = ForceTopDown; |
3323 | if (RegionPolicy.OnlyTopDown) |
3324 | RegionPolicy.OnlyBottomUp = false; |
3325 | } |
3326 | } |
3327 | |
3328 | void GenericScheduler::dumpPolicy() const { |
3329 | // Cannot completely remove virtual function even in release mode. |
3330 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
3331 | dbgs() << "GenericScheduler RegionPolicy: " |
3332 | << " ShouldTrackPressure=" << RegionPolicy.ShouldTrackPressure |
3333 | << " OnlyTopDown=" << RegionPolicy.OnlyTopDown |
3334 | << " OnlyBottomUp=" << RegionPolicy.OnlyBottomUp |
3335 | << "\n" ; |
3336 | #endif |
3337 | } |
3338 | |
3339 | /// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic |
3340 | /// critical path by more cycles than it takes to drain the instruction buffer. |
3341 | /// We estimate an upper bounds on in-flight instructions as: |
3342 | /// |
3343 | /// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height ) |
3344 | /// InFlightIterations = AcyclicPath / CyclesPerIteration |
3345 | /// InFlightResources = InFlightIterations * LoopResources |
3346 | /// |
3347 | /// TODO: Check execution resources in addition to IssueCount. |
3348 | void GenericScheduler::checkAcyclicLatency() { |
3349 | if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath) |
3350 | return; |
3351 | |
3352 | // Scaled number of cycles per loop iteration. |
3353 | unsigned IterCount = |
3354 | std::max(a: Rem.CyclicCritPath * SchedModel->getLatencyFactor(), |
3355 | b: Rem.RemIssueCount); |
3356 | // Scaled acyclic critical path. |
3357 | unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor(); |
3358 | // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop |
3359 | unsigned InFlightCount = |
3360 | (AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount; |
3361 | unsigned BufferLimit = |
3362 | SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor(); |
3363 | |
3364 | Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit; |
3365 | |
3366 | LLVM_DEBUG( |
3367 | dbgs() << "IssueCycles=" |
3368 | << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c " |
3369 | << "IterCycles=" << IterCount / SchedModel->getLatencyFactor() |
3370 | << "c NumIters=" << (AcyclicCount + IterCount - 1) / IterCount |
3371 | << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor() |
3372 | << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n" ; |
3373 | if (Rem.IsAcyclicLatencyLimited) dbgs() << " ACYCLIC LATENCY LIMIT\n" ); |
3374 | } |
3375 | |
3376 | void GenericScheduler::registerRoots() { |
3377 | Rem.CriticalPath = DAG->ExitSU.getDepth(); |
3378 | |
3379 | // Some roots may not feed into ExitSU. Check all of them in case. |
3380 | for (const SUnit *SU : Bot.Available) { |
3381 | if (SU->getDepth() > Rem.CriticalPath) |
3382 | Rem.CriticalPath = SU->getDepth(); |
3383 | } |
3384 | LLVM_DEBUG(dbgs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << '\n'); |
3385 | if (DumpCriticalPathLength) { |
3386 | errs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << " \n" ; |
3387 | } |
3388 | |
3389 | if (EnableCyclicPath && SchedModel->getMicroOpBufferSize() > 0) { |
3390 | Rem.CyclicCritPath = DAG->computeCyclicCriticalPath(); |
3391 | checkAcyclicLatency(); |
3392 | } |
3393 | } |
3394 | |
3395 | namespace llvm { |
3396 | bool tryPressure(const PressureChange &TryP, |
3397 | const PressureChange &CandP, |
3398 | GenericSchedulerBase::SchedCandidate &TryCand, |
3399 | GenericSchedulerBase::SchedCandidate &Cand, |
3400 | GenericSchedulerBase::CandReason Reason, |
3401 | const TargetRegisterInfo *TRI, |
3402 | const MachineFunction &MF) { |
3403 | // If one candidate decreases and the other increases, go with it. |
3404 | // Invalid candidates have UnitInc==0. |
3405 | if (tryGreater(TryVal: TryP.getUnitInc() < 0, CandVal: CandP.getUnitInc() < 0, TryCand, Cand, |
3406 | Reason)) { |
3407 | return true; |
3408 | } |
3409 | // Do not compare the magnitude of pressure changes between top and bottom |
3410 | // boundary. |
3411 | if (Cand.AtTop != TryCand.AtTop) |
3412 | return false; |
3413 | |
3414 | // If both candidates affect the same set in the same boundary, go with the |
3415 | // smallest increase. |
3416 | unsigned TryPSet = TryP.getPSetOrMax(); |
3417 | unsigned CandPSet = CandP.getPSetOrMax(); |
3418 | if (TryPSet == CandPSet) { |
3419 | return tryLess(TryVal: TryP.getUnitInc(), CandVal: CandP.getUnitInc(), TryCand, Cand, |
3420 | Reason); |
3421 | } |
3422 | |
3423 | int TryRank = TryP.isValid() ? TRI->getRegPressureSetScore(MF, PSetID: TryPSet) : |
3424 | std::numeric_limits<int>::max(); |
3425 | |
3426 | int CandRank = CandP.isValid() ? TRI->getRegPressureSetScore(MF, PSetID: CandPSet) : |
3427 | std::numeric_limits<int>::max(); |
3428 | |
3429 | // If the candidates are decreasing pressure, reverse priority. |
3430 | if (TryP.getUnitInc() < 0) |
3431 | std::swap(a&: TryRank, b&: CandRank); |
3432 | return tryGreater(TryVal: TryRank, CandVal: CandRank, TryCand, Cand, Reason); |
3433 | } |
3434 | |
3435 | unsigned getWeakLeft(const SUnit *SU, bool isTop) { |
3436 | return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft; |
3437 | } |
3438 | |
3439 | /// Minimize physical register live ranges. Regalloc wants them adjacent to |
3440 | /// their physreg def/use. |
3441 | /// |
3442 | /// FIXME: This is an unnecessary check on the critical path. Most are root/leaf |
3443 | /// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled |
3444 | /// with the operation that produces or consumes the physreg. We'll do this when |
3445 | /// regalloc has support for parallel copies. |
3446 | int biasPhysReg(const SUnit *SU, bool isTop) { |
3447 | const MachineInstr *MI = SU->getInstr(); |
3448 | |
3449 | if (MI->isCopy()) { |
3450 | unsigned ScheduledOper = isTop ? 1 : 0; |
3451 | unsigned UnscheduledOper = isTop ? 0 : 1; |
3452 | // If we have already scheduled the physreg produce/consumer, immediately |
3453 | // schedule the copy. |
3454 | if (MI->getOperand(i: ScheduledOper).getReg().isPhysical()) |
3455 | return 1; |
3456 | // If the physreg is at the boundary, defer it. Otherwise schedule it |
3457 | // immediately to free the dependent. We can hoist the copy later. |
3458 | bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft; |
3459 | if (MI->getOperand(i: UnscheduledOper).getReg().isPhysical()) |
3460 | return AtBoundary ? -1 : 1; |
3461 | } |
3462 | |
3463 | if (MI->isMoveImmediate()) { |
3464 | // If we have a move immediate and all successors have been assigned, bias |
3465 | // towards scheduling this later. Make sure all register defs are to |
3466 | // physical registers. |
3467 | bool DoBias = true; |
3468 | for (const MachineOperand &Op : MI->defs()) { |
3469 | if (Op.isReg() && !Op.getReg().isPhysical()) { |
3470 | DoBias = false; |
3471 | break; |
3472 | } |
3473 | } |
3474 | |
3475 | if (DoBias) |
3476 | return isTop ? -1 : 1; |
3477 | } |
3478 | |
3479 | return 0; |
3480 | } |
3481 | } // end namespace llvm |
3482 | |
3483 | void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU, |
3484 | bool AtTop, |
3485 | const RegPressureTracker &RPTracker, |
3486 | RegPressureTracker &TempTracker) { |
3487 | Cand.SU = SU; |
3488 | Cand.AtTop = AtTop; |
3489 | if (DAG->isTrackingPressure()) { |
3490 | if (AtTop) { |
3491 | TempTracker.getMaxDownwardPressureDelta( |
3492 | MI: Cand.SU->getInstr(), |
3493 | Delta&: Cand.RPDelta, |
3494 | CriticalPSets: DAG->getRegionCriticalPSets(), |
3495 | MaxPressureLimit: DAG->getRegPressure().MaxSetPressure); |
3496 | } else { |
3497 | if (VerifyScheduling) { |
3498 | TempTracker.getMaxUpwardPressureDelta( |
3499 | MI: Cand.SU->getInstr(), |
3500 | PDiff: &DAG->getPressureDiff(SU: Cand.SU), |
3501 | Delta&: Cand.RPDelta, |
3502 | CriticalPSets: DAG->getRegionCriticalPSets(), |
3503 | MaxPressureLimit: DAG->getRegPressure().MaxSetPressure); |
3504 | } else { |
3505 | RPTracker.getUpwardPressureDelta( |
3506 | MI: Cand.SU->getInstr(), |
3507 | PDiff&: DAG->getPressureDiff(SU: Cand.SU), |
3508 | Delta&: Cand.RPDelta, |
3509 | CriticalPSets: DAG->getRegionCriticalPSets(), |
3510 | MaxPressureLimit: DAG->getRegPressure().MaxSetPressure); |
3511 | } |
3512 | } |
3513 | } |
3514 | LLVM_DEBUG(if (Cand.RPDelta.Excess.isValid()) dbgs() |
3515 | << " Try SU(" << Cand.SU->NodeNum << ") " |
3516 | << TRI->getRegPressureSetName(Cand.RPDelta.Excess.getPSet()) << ":" |
3517 | << Cand.RPDelta.Excess.getUnitInc() << "\n" ); |
3518 | } |
3519 | |
3520 | /// Apply a set of heuristics to a new candidate. Heuristics are currently |
3521 | /// hierarchical. This may be more efficient than a graduated cost model because |
3522 | /// we don't need to evaluate all aspects of the model for each node in the |
3523 | /// queue. But it's really done to make the heuristics easier to debug and |
3524 | /// statistically analyze. |
3525 | /// |
3526 | /// \param Cand provides the policy and current best candidate. |
3527 | /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. |
3528 | /// \param Zone describes the scheduled zone that we are extending, or nullptr |
3529 | /// if Cand is from a different zone than TryCand. |
3530 | /// \return \c true if TryCand is better than Cand (Reason is NOT NoCand) |
3531 | bool GenericScheduler::tryCandidate(SchedCandidate &Cand, |
3532 | SchedCandidate &TryCand, |
3533 | SchedBoundary *Zone) const { |
3534 | // Initialize the candidate if needed. |
3535 | if (!Cand.isValid()) { |
3536 | TryCand.Reason = NodeOrder; |
3537 | return true; |
3538 | } |
3539 | |
3540 | // Bias PhysReg Defs and copies to their uses and defined respectively. |
3541 | if (tryGreater(TryVal: biasPhysReg(SU: TryCand.SU, isTop: TryCand.AtTop), |
3542 | CandVal: biasPhysReg(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: PhysReg)) |
3543 | return TryCand.Reason != NoCand; |
3544 | |
3545 | // Avoid exceeding the target's limit. |
3546 | if (DAG->isTrackingPressure() && tryPressure(TryP: TryCand.RPDelta.Excess, |
3547 | CandP: Cand.RPDelta.Excess, |
3548 | TryCand, Cand, Reason: RegExcess, TRI, |
3549 | MF: DAG->MF)) |
3550 | return TryCand.Reason != NoCand; |
3551 | |
3552 | // Avoid increasing the max critical pressure in the scheduled region. |
3553 | if (DAG->isTrackingPressure() && tryPressure(TryP: TryCand.RPDelta.CriticalMax, |
3554 | CandP: Cand.RPDelta.CriticalMax, |
3555 | TryCand, Cand, Reason: RegCritical, TRI, |
3556 | MF: DAG->MF)) |
3557 | return TryCand.Reason != NoCand; |
3558 | |
3559 | // We only compare a subset of features when comparing nodes between |
3560 | // Top and Bottom boundary. Some properties are simply incomparable, in many |
3561 | // other instances we should only override the other boundary if something |
3562 | // is a clear good pick on one boundary. Skip heuristics that are more |
3563 | // "tie-breaking" in nature. |
3564 | bool SameBoundary = Zone != nullptr; |
3565 | if (SameBoundary) { |
3566 | // For loops that are acyclic path limited, aggressively schedule for |
3567 | // latency. Within an single cycle, whenever CurrMOps > 0, allow normal |
3568 | // heuristics to take precedence. |
3569 | if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() && |
3570 | tryLatency(TryCand, Cand, Zone&: *Zone)) |
3571 | return TryCand.Reason != NoCand; |
3572 | |
3573 | // Prioritize instructions that read unbuffered resources by stall cycles. |
3574 | if (tryLess(TryVal: Zone->getLatencyStallCycles(SU: TryCand.SU), |
3575 | CandVal: Zone->getLatencyStallCycles(SU: Cand.SU), TryCand, Cand, Reason: Stall)) |
3576 | return TryCand.Reason != NoCand; |
3577 | } |
3578 | |
3579 | // Keep clustered nodes together to encourage downstream peephole |
3580 | // optimizations which may reduce resource requirements. |
3581 | // |
3582 | // This is a best effort to set things up for a post-RA pass. Optimizations |
3583 | // like generating loads of multiple registers should ideally be done within |
3584 | // the scheduler pass by combining the loads during DAG postprocessing. |
3585 | const SUnit *CandNextClusterSU = |
3586 | Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); |
3587 | const SUnit *TryCandNextClusterSU = |
3588 | TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); |
3589 | if (tryGreater(TryVal: TryCand.SU == TryCandNextClusterSU, |
3590 | CandVal: Cand.SU == CandNextClusterSU, |
3591 | TryCand, Cand, Reason: Cluster)) |
3592 | return TryCand.Reason != NoCand; |
3593 | |
3594 | if (SameBoundary) { |
3595 | // Weak edges are for clustering and other constraints. |
3596 | if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop), |
3597 | CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), |
3598 | TryCand, Cand, Reason: Weak)) |
3599 | return TryCand.Reason != NoCand; |
3600 | } |
3601 | |
3602 | // Avoid increasing the max pressure of the entire region. |
3603 | if (DAG->isTrackingPressure() && tryPressure(TryP: TryCand.RPDelta.CurrentMax, |
3604 | CandP: Cand.RPDelta.CurrentMax, |
3605 | TryCand, Cand, Reason: RegMax, TRI, |
3606 | MF: DAG->MF)) |
3607 | return TryCand.Reason != NoCand; |
3608 | |
3609 | if (SameBoundary) { |
3610 | // Avoid critical resource consumption and balance the schedule. |
3611 | TryCand.initResourceDelta(DAG, SchedModel); |
3612 | if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources, |
3613 | TryCand, Cand, Reason: ResourceReduce)) |
3614 | return TryCand.Reason != NoCand; |
3615 | if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources, |
3616 | CandVal: Cand.ResDelta.DemandedResources, |
3617 | TryCand, Cand, Reason: ResourceDemand)) |
3618 | return TryCand.Reason != NoCand; |
3619 | |
3620 | // Avoid serializing long latency dependence chains. |
3621 | // For acyclic path limited loops, latency was already checked above. |
3622 | if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency && |
3623 | !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone&: *Zone)) |
3624 | return TryCand.Reason != NoCand; |
3625 | |
3626 | // Fall through to original instruction order. |
3627 | if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) |
3628 | || (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { |
3629 | TryCand.Reason = NodeOrder; |
3630 | return true; |
3631 | } |
3632 | } |
3633 | |
3634 | return false; |
3635 | } |
3636 | |
3637 | /// Pick the best candidate from the queue. |
3638 | /// |
3639 | /// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during |
3640 | /// DAG building. To adjust for the current scheduling location we need to |
3641 | /// maintain the number of vreg uses remaining to be top-scheduled. |
3642 | void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone, |
3643 | const CandPolicy &ZonePolicy, |
3644 | const RegPressureTracker &RPTracker, |
3645 | SchedCandidate &Cand) { |
3646 | // getMaxPressureDelta temporarily modifies the tracker. |
3647 | RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); |
3648 | |
3649 | ReadyQueue &Q = Zone.Available; |
3650 | for (SUnit *SU : Q) { |
3651 | |
3652 | SchedCandidate TryCand(ZonePolicy); |
3653 | initCandidate(Cand&: TryCand, SU, AtTop: Zone.isTop(), RPTracker, TempTracker); |
3654 | // Pass SchedBoundary only when comparing nodes from the same boundary. |
3655 | SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; |
3656 | if (tryCandidate(Cand, TryCand, Zone: ZoneArg)) { |
3657 | // Initialize resource delta if needed in case future heuristics query it. |
3658 | if (TryCand.ResDelta == SchedResourceDelta()) |
3659 | TryCand.initResourceDelta(DAG, SchedModel); |
3660 | Cand.setBest(TryCand); |
3661 | LLVM_DEBUG(traceCandidate(Cand)); |
3662 | } |
3663 | } |
3664 | } |
3665 | |
3666 | /// Pick the best candidate node from either the top or bottom queue. |
3667 | SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) { |
3668 | // Schedule as far as possible in the direction of no choice. This is most |
3669 | // efficient, but also provides the best heuristics for CriticalPSets. |
3670 | if (SUnit *SU = Bot.pickOnlyChoice()) { |
3671 | IsTopNode = false; |
3672 | tracePick(Reason: Only1, IsTop: false); |
3673 | return SU; |
3674 | } |
3675 | if (SUnit *SU = Top.pickOnlyChoice()) { |
3676 | IsTopNode = true; |
3677 | tracePick(Reason: Only1, IsTop: true); |
3678 | return SU; |
3679 | } |
3680 | // Set the bottom-up policy based on the state of the current bottom zone and |
3681 | // the instructions outside the zone, including the top zone. |
3682 | CandPolicy BotPolicy; |
3683 | setPolicy(Policy&: BotPolicy, /*IsPostRA=*/false, CurrZone&: Bot, OtherZone: &Top); |
3684 | // Set the top-down policy based on the state of the current top zone and |
3685 | // the instructions outside the zone, including the bottom zone. |
3686 | CandPolicy TopPolicy; |
3687 | setPolicy(Policy&: TopPolicy, /*IsPostRA=*/false, CurrZone&: Top, OtherZone: &Bot); |
3688 | |
3689 | // See if BotCand is still valid (because we previously scheduled from Top). |
3690 | LLVM_DEBUG(dbgs() << "Picking from Bot:\n" ); |
3691 | if (!BotCand.isValid() || BotCand.SU->isScheduled || |
3692 | BotCand.Policy != BotPolicy) { |
3693 | BotCand.reset(NewPolicy: CandPolicy()); |
3694 | pickNodeFromQueue(Zone&: Bot, ZonePolicy: BotPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand); |
3695 | assert(BotCand.Reason != NoCand && "failed to find the first candidate" ); |
3696 | } else { |
3697 | LLVM_DEBUG(traceCandidate(BotCand)); |
3698 | #ifndef NDEBUG |
3699 | if (VerifyScheduling) { |
3700 | SchedCandidate TCand; |
3701 | TCand.reset(CandPolicy()); |
3702 | pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand); |
3703 | assert(TCand.SU == BotCand.SU && |
3704 | "Last pick result should correspond to re-picking right now" ); |
3705 | } |
3706 | #endif |
3707 | } |
3708 | |
3709 | // Check if the top Q has a better candidate. |
3710 | LLVM_DEBUG(dbgs() << "Picking from Top:\n" ); |
3711 | if (!TopCand.isValid() || TopCand.SU->isScheduled || |
3712 | TopCand.Policy != TopPolicy) { |
3713 | TopCand.reset(NewPolicy: CandPolicy()); |
3714 | pickNodeFromQueue(Zone&: Top, ZonePolicy: TopPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand); |
3715 | assert(TopCand.Reason != NoCand && "failed to find the first candidate" ); |
3716 | } else { |
3717 | LLVM_DEBUG(traceCandidate(TopCand)); |
3718 | #ifndef NDEBUG |
3719 | if (VerifyScheduling) { |
3720 | SchedCandidate TCand; |
3721 | TCand.reset(CandPolicy()); |
3722 | pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand); |
3723 | assert(TCand.SU == TopCand.SU && |
3724 | "Last pick result should correspond to re-picking right now" ); |
3725 | } |
3726 | #endif |
3727 | } |
3728 | |
3729 | // Pick best from BotCand and TopCand. |
3730 | assert(BotCand.isValid()); |
3731 | assert(TopCand.isValid()); |
3732 | SchedCandidate Cand = BotCand; |
3733 | TopCand.Reason = NoCand; |
3734 | if (tryCandidate(Cand, TryCand&: TopCand, Zone: nullptr)) { |
3735 | Cand.setBest(TopCand); |
3736 | LLVM_DEBUG(traceCandidate(Cand)); |
3737 | } |
3738 | |
3739 | IsTopNode = Cand.AtTop; |
3740 | tracePick(Cand); |
3741 | return Cand.SU; |
3742 | } |
3743 | |
3744 | /// Pick the best node to balance the schedule. Implements MachineSchedStrategy. |
3745 | SUnit *GenericScheduler::pickNode(bool &IsTopNode) { |
3746 | if (DAG->top() == DAG->bottom()) { |
3747 | assert(Top.Available.empty() && Top.Pending.empty() && |
3748 | Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage" ); |
3749 | return nullptr; |
3750 | } |
3751 | SUnit *SU; |
3752 | do { |
3753 | if (RegionPolicy.OnlyTopDown) { |
3754 | SU = Top.pickOnlyChoice(); |
3755 | if (!SU) { |
3756 | CandPolicy NoPolicy; |
3757 | TopCand.reset(NewPolicy: NoPolicy); |
3758 | pickNodeFromQueue(Zone&: Top, ZonePolicy: NoPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand); |
3759 | assert(TopCand.Reason != NoCand && "failed to find a candidate" ); |
3760 | tracePick(Cand: TopCand); |
3761 | SU = TopCand.SU; |
3762 | } |
3763 | IsTopNode = true; |
3764 | } else if (RegionPolicy.OnlyBottomUp) { |
3765 | SU = Bot.pickOnlyChoice(); |
3766 | if (!SU) { |
3767 | CandPolicy NoPolicy; |
3768 | BotCand.reset(NewPolicy: NoPolicy); |
3769 | pickNodeFromQueue(Zone&: Bot, ZonePolicy: NoPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand); |
3770 | assert(BotCand.Reason != NoCand && "failed to find a candidate" ); |
3771 | tracePick(Cand: BotCand); |
3772 | SU = BotCand.SU; |
3773 | } |
3774 | IsTopNode = false; |
3775 | } else { |
3776 | SU = pickNodeBidirectional(IsTopNode); |
3777 | } |
3778 | } while (SU->isScheduled); |
3779 | |
3780 | // If IsTopNode, then SU is in Top.Available and must be removed. Otherwise, |
3781 | // if isTopReady(), then SU is in either Top.Available or Top.Pending. |
3782 | // If !IsTopNode, then SU is in Bot.Available and must be removed. Otherwise, |
3783 | // if isBottomReady(), then SU is in either Bot.Available or Bot.Pending. |
3784 | // |
3785 | // It is coincidental when !IsTopNode && isTopReady or when IsTopNode && |
3786 | // isBottomReady. That is, it didn't factor into the decision to choose SU |
3787 | // because it isTopReady or isBottomReady, respectively. In fact, if the |
3788 | // RegionPolicy is OnlyTopDown or OnlyBottomUp, then the Bot queues and Top |
3789 | // queues respectivley contain the original roots and don't get updated when |
3790 | // picking a node. So if SU isTopReady on a OnlyBottomUp pick, then it was |
3791 | // because we schduled everything but the top roots. Conversley, if SU |
3792 | // isBottomReady on OnlyTopDown, then it was because we scheduled everything |
3793 | // but the bottom roots. If its in a queue even coincidentally, it should be |
3794 | // removed so it does not get re-picked in a subsequent pickNode call. |
3795 | if (SU->isTopReady()) |
3796 | Top.removeReady(SU); |
3797 | if (SU->isBottomReady()) |
3798 | Bot.removeReady(SU); |
3799 | |
3800 | LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " |
3801 | << *SU->getInstr()); |
3802 | return SU; |
3803 | } |
3804 | |
3805 | void GenericScheduler::reschedulePhysReg(SUnit *SU, bool isTop) { |
3806 | MachineBasicBlock::iterator InsertPos = SU->getInstr(); |
3807 | if (!isTop) |
3808 | ++InsertPos; |
3809 | SmallVectorImpl<SDep> &Deps = isTop ? SU->Preds : SU->Succs; |
3810 | |
3811 | // Find already scheduled copies with a single physreg dependence and move |
3812 | // them just above the scheduled instruction. |
3813 | for (SDep &Dep : Deps) { |
3814 | if (Dep.getKind() != SDep::Data || |
3815 | !Register::isPhysicalRegister(Reg: Dep.getReg())) |
3816 | continue; |
3817 | SUnit *DepSU = Dep.getSUnit(); |
3818 | if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1) |
3819 | continue; |
3820 | MachineInstr *Copy = DepSU->getInstr(); |
3821 | if (!Copy->isCopy() && !Copy->isMoveImmediate()) |
3822 | continue; |
3823 | LLVM_DEBUG(dbgs() << " Rescheduling physreg copy " ; |
3824 | DAG->dumpNode(*Dep.getSUnit())); |
3825 | DAG->moveInstruction(MI: Copy, InsertPos); |
3826 | } |
3827 | } |
3828 | |
3829 | /// Update the scheduler's state after scheduling a node. This is the same node |
3830 | /// that was just returned by pickNode(). However, ScheduleDAGMILive needs to |
3831 | /// update it's state based on the current cycle before MachineSchedStrategy |
3832 | /// does. |
3833 | /// |
3834 | /// FIXME: Eventually, we may bundle physreg copies rather than rescheduling |
3835 | /// them here. See comments in biasPhysReg. |
3836 | void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { |
3837 | if (IsTopNode) { |
3838 | SU->TopReadyCycle = std::max(a: SU->TopReadyCycle, b: Top.getCurrCycle()); |
3839 | Top.bumpNode(SU); |
3840 | if (SU->hasPhysRegUses) |
3841 | reschedulePhysReg(SU, isTop: true); |
3842 | } else { |
3843 | SU->BotReadyCycle = std::max(a: SU->BotReadyCycle, b: Bot.getCurrCycle()); |
3844 | Bot.bumpNode(SU); |
3845 | if (SU->hasPhysRegDefs) |
3846 | reschedulePhysReg(SU, isTop: false); |
3847 | } |
3848 | } |
3849 | |
3850 | /// Create the standard converging machine scheduler. This will be used as the |
3851 | /// default scheduler if the target does not set a default. |
3852 | ScheduleDAGMILive *llvm::createGenericSchedLive(MachineSchedContext *C) { |
3853 | ScheduleDAGMILive *DAG = |
3854 | new ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(args&: C)); |
3855 | // Register DAG post-processors. |
3856 | // |
3857 | // FIXME: extend the mutation API to allow earlier mutations to instantiate |
3858 | // data and pass it to later mutations. Have a single mutation that gathers |
3859 | // the interesting nodes in one pass. |
3860 | DAG->addMutation(Mutation: createCopyConstrainDAGMutation(TII: DAG->TII, TRI: DAG->TRI)); |
3861 | |
3862 | const TargetSubtargetInfo &STI = C->MF->getSubtarget(); |
3863 | // Add MacroFusion mutation if fusions are not empty. |
3864 | const auto &MacroFusions = STI.getMacroFusions(); |
3865 | if (!MacroFusions.empty()) |
3866 | DAG->addMutation(Mutation: createMacroFusionDAGMutation(Predicates: MacroFusions)); |
3867 | return DAG; |
3868 | } |
3869 | |
3870 | static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) { |
3871 | return createGenericSchedLive(C); |
3872 | } |
3873 | |
3874 | static MachineSchedRegistry |
3875 | GenericSchedRegistry("converge" , "Standard converging scheduler." , |
3876 | createConvergingSched); |
3877 | |
3878 | //===----------------------------------------------------------------------===// |
3879 | // PostGenericScheduler - Generic PostRA implementation of MachineSchedStrategy. |
3880 | //===----------------------------------------------------------------------===// |
3881 | |
3882 | void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) { |
3883 | DAG = Dag; |
3884 | SchedModel = DAG->getSchedModel(); |
3885 | TRI = DAG->TRI; |
3886 | |
3887 | Rem.init(DAG, SchedModel); |
3888 | Top.init(dag: DAG, smodel: SchedModel, rem: &Rem); |
3889 | Bot.init(dag: DAG, smodel: SchedModel, rem: &Rem); |
3890 | |
3891 | // Initialize the HazardRecognizers. If itineraries don't exist, are empty, |
3892 | // or are disabled, then these HazardRecs will be disabled. |
3893 | const InstrItineraryData *Itin = SchedModel->getInstrItineraries(); |
3894 | if (!Top.HazardRec) { |
3895 | Top.HazardRec = DAG->TII->CreateTargetMIHazardRecognizer(Itin, DAG); |
3896 | } |
3897 | if (!Bot.HazardRec) { |
3898 | Bot.HazardRec = DAG->TII->CreateTargetMIHazardRecognizer(Itin, DAG); |
3899 | } |
3900 | } |
3901 | |
3902 | void PostGenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, |
3903 | MachineBasicBlock::iterator End, |
3904 | unsigned NumRegionInstrs) { |
3905 | if (PostRADirection == MISchedPostRASched::TopDown) { |
3906 | RegionPolicy.OnlyTopDown = true; |
3907 | RegionPolicy.OnlyBottomUp = false; |
3908 | } else if (PostRADirection == MISchedPostRASched::BottomUp) { |
3909 | RegionPolicy.OnlyTopDown = false; |
3910 | RegionPolicy.OnlyBottomUp = true; |
3911 | } else if (PostRADirection == MISchedPostRASched::Bidirectional) { |
3912 | RegionPolicy.OnlyBottomUp = false; |
3913 | RegionPolicy.OnlyTopDown = false; |
3914 | } |
3915 | } |
3916 | |
3917 | void PostGenericScheduler::registerRoots() { |
3918 | Rem.CriticalPath = DAG->ExitSU.getDepth(); |
3919 | |
3920 | // Some roots may not feed into ExitSU. Check all of them in case. |
3921 | for (const SUnit *SU : Bot.Available) { |
3922 | if (SU->getDepth() > Rem.CriticalPath) |
3923 | Rem.CriticalPath = SU->getDepth(); |
3924 | } |
3925 | LLVM_DEBUG(dbgs() << "Critical Path: (PGS-RR) " << Rem.CriticalPath << '\n'); |
3926 | if (DumpCriticalPathLength) { |
3927 | errs() << "Critical Path(PGS-RR ): " << Rem.CriticalPath << " \n" ; |
3928 | } |
3929 | } |
3930 | |
3931 | /// Apply a set of heuristics to a new candidate for PostRA scheduling. |
3932 | /// |
3933 | /// \param Cand provides the policy and current best candidate. |
3934 | /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. |
3935 | /// \return \c true if TryCand is better than Cand (Reason is NOT NoCand) |
3936 | bool PostGenericScheduler::tryCandidate(SchedCandidate &Cand, |
3937 | SchedCandidate &TryCand) { |
3938 | // Initialize the candidate if needed. |
3939 | if (!Cand.isValid()) { |
3940 | TryCand.Reason = NodeOrder; |
3941 | return true; |
3942 | } |
3943 | |
3944 | // Prioritize instructions that read unbuffered resources by stall cycles. |
3945 | if (tryLess(TryVal: Top.getLatencyStallCycles(SU: TryCand.SU), |
3946 | CandVal: Top.getLatencyStallCycles(SU: Cand.SU), TryCand, Cand, Reason: Stall)) |
3947 | return TryCand.Reason != NoCand; |
3948 | |
3949 | // Keep clustered nodes together. |
3950 | if (tryGreater(TryVal: TryCand.SU == DAG->getNextClusterSucc(), |
3951 | CandVal: Cand.SU == DAG->getNextClusterSucc(), |
3952 | TryCand, Cand, Reason: Cluster)) |
3953 | return TryCand.Reason != NoCand; |
3954 | |
3955 | // Avoid critical resource consumption and balance the schedule. |
3956 | if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources, |
3957 | TryCand, Cand, Reason: ResourceReduce)) |
3958 | return TryCand.Reason != NoCand; |
3959 | if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources, |
3960 | CandVal: Cand.ResDelta.DemandedResources, |
3961 | TryCand, Cand, Reason: ResourceDemand)) |
3962 | return TryCand.Reason != NoCand; |
3963 | |
3964 | // Avoid serializing long latency dependence chains. |
3965 | if (Cand.Policy.ReduceLatency && tryLatency(TryCand, Cand, Zone&: Top)) { |
3966 | return TryCand.Reason != NoCand; |
3967 | } |
3968 | |
3969 | // Fall through to original instruction order. |
3970 | if (TryCand.SU->NodeNum < Cand.SU->NodeNum) { |
3971 | TryCand.Reason = NodeOrder; |
3972 | return true; |
3973 | } |
3974 | |
3975 | return false; |
3976 | } |
3977 | |
3978 | void PostGenericScheduler::pickNodeFromQueue(SchedBoundary &Zone, |
3979 | SchedCandidate &Cand) { |
3980 | ReadyQueue &Q = Zone.Available; |
3981 | for (SUnit *SU : Q) { |
3982 | SchedCandidate TryCand(Cand.Policy); |
3983 | TryCand.SU = SU; |
3984 | TryCand.AtTop = Zone.isTop(); |
3985 | TryCand.initResourceDelta(DAG, SchedModel); |
3986 | if (tryCandidate(Cand, TryCand)) { |
3987 | Cand.setBest(TryCand); |
3988 | LLVM_DEBUG(traceCandidate(Cand)); |
3989 | } |
3990 | } |
3991 | } |
3992 | |
3993 | /// Pick the best candidate node from either the top or bottom queue. |
3994 | SUnit *PostGenericScheduler::pickNodeBidirectional(bool &IsTopNode) { |
3995 | // FIXME: This is similiar to GenericScheduler::pickNodeBidirectional. Factor |
3996 | // out common parts. |
3997 | |
3998 | // Schedule as far as possible in the direction of no choice. This is most |
3999 | // efficient, but also provides the best heuristics for CriticalPSets. |
4000 | if (SUnit *SU = Bot.pickOnlyChoice()) { |
4001 | IsTopNode = false; |
4002 | tracePick(Reason: Only1, IsTop: false); |
4003 | return SU; |
4004 | } |
4005 | if (SUnit *SU = Top.pickOnlyChoice()) { |
4006 | IsTopNode = true; |
4007 | tracePick(Reason: Only1, IsTop: true); |
4008 | return SU; |
4009 | } |
4010 | // Set the bottom-up policy based on the state of the current bottom zone and |
4011 | // the instructions outside the zone, including the top zone. |
4012 | CandPolicy BotPolicy; |
4013 | setPolicy(Policy&: BotPolicy, /*IsPostRA=*/true, CurrZone&: Bot, OtherZone: &Top); |
4014 | // Set the top-down policy based on the state of the current top zone and |
4015 | // the instructions outside the zone, including the bottom zone. |
4016 | CandPolicy TopPolicy; |
4017 | setPolicy(Policy&: TopPolicy, /*IsPostRA=*/true, CurrZone&: Top, OtherZone: &Bot); |
4018 | |
4019 | // See if BotCand is still valid (because we previously scheduled from Top). |
4020 | LLVM_DEBUG(dbgs() << "Picking from Bot:\n" ); |
4021 | if (!BotCand.isValid() || BotCand.SU->isScheduled || |
4022 | BotCand.Policy != BotPolicy) { |
4023 | BotCand.reset(NewPolicy: CandPolicy()); |
4024 | pickNodeFromQueue(Zone&: Bot, Cand&: BotCand); |
4025 | assert(BotCand.Reason != NoCand && "failed to find the first candidate" ); |
4026 | } else { |
4027 | LLVM_DEBUG(traceCandidate(BotCand)); |
4028 | #ifndef NDEBUG |
4029 | if (VerifyScheduling) { |
4030 | SchedCandidate TCand; |
4031 | TCand.reset(CandPolicy()); |
4032 | pickNodeFromQueue(Bot, BotCand); |
4033 | assert(TCand.SU == BotCand.SU && |
4034 | "Last pick result should correspond to re-picking right now" ); |
4035 | } |
4036 | #endif |
4037 | } |
4038 | |
4039 | // Check if the top Q has a better candidate. |
4040 | LLVM_DEBUG(dbgs() << "Picking from Top:\n" ); |
4041 | if (!TopCand.isValid() || TopCand.SU->isScheduled || |
4042 | TopCand.Policy != TopPolicy) { |
4043 | TopCand.reset(NewPolicy: CandPolicy()); |
4044 | pickNodeFromQueue(Zone&: Top, Cand&: TopCand); |
4045 | assert(TopCand.Reason != NoCand && "failed to find the first candidate" ); |
4046 | } else { |
4047 | LLVM_DEBUG(traceCandidate(TopCand)); |
4048 | #ifndef NDEBUG |
4049 | if (VerifyScheduling) { |
4050 | SchedCandidate TCand; |
4051 | TCand.reset(CandPolicy()); |
4052 | pickNodeFromQueue(Top, TopCand); |
4053 | assert(TCand.SU == TopCand.SU && |
4054 | "Last pick result should correspond to re-picking right now" ); |
4055 | } |
4056 | #endif |
4057 | } |
4058 | |
4059 | // Pick best from BotCand and TopCand. |
4060 | assert(BotCand.isValid()); |
4061 | assert(TopCand.isValid()); |
4062 | SchedCandidate Cand = BotCand; |
4063 | TopCand.Reason = NoCand; |
4064 | if (tryCandidate(Cand, TryCand&: TopCand)) { |
4065 | Cand.setBest(TopCand); |
4066 | LLVM_DEBUG(traceCandidate(Cand)); |
4067 | } |
4068 | |
4069 | IsTopNode = Cand.AtTop; |
4070 | tracePick(Cand); |
4071 | return Cand.SU; |
4072 | } |
4073 | |
4074 | /// Pick the next node to schedule. |
4075 | SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) { |
4076 | if (DAG->top() == DAG->bottom()) { |
4077 | assert(Top.Available.empty() && Top.Pending.empty() && |
4078 | Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage" ); |
4079 | return nullptr; |
4080 | } |
4081 | SUnit *SU; |
4082 | do { |
4083 | if (RegionPolicy.OnlyBottomUp) { |
4084 | SU = Bot.pickOnlyChoice(); |
4085 | if (SU) { |
4086 | tracePick(Reason: Only1, IsTop: true); |
4087 | } else { |
4088 | CandPolicy NoPolicy; |
4089 | BotCand.reset(NewPolicy: NoPolicy); |
4090 | // Set the bottom-up policy based on the state of the current bottom |
4091 | // zone and the instructions outside the zone, including the top zone. |
4092 | setPolicy(Policy&: BotCand.Policy, /*IsPostRA=*/true, CurrZone&: Bot, OtherZone: nullptr); |
4093 | pickNodeFromQueue(Zone&: Bot, Cand&: BotCand); |
4094 | assert(BotCand.Reason != NoCand && "failed to find a candidate" ); |
4095 | tracePick(Cand: BotCand); |
4096 | SU = BotCand.SU; |
4097 | } |
4098 | IsTopNode = false; |
4099 | } else if (RegionPolicy.OnlyTopDown) { |
4100 | SU = Top.pickOnlyChoice(); |
4101 | if (SU) { |
4102 | tracePick(Reason: Only1, IsTop: true); |
4103 | } else { |
4104 | CandPolicy NoPolicy; |
4105 | TopCand.reset(NewPolicy: NoPolicy); |
4106 | // Set the top-down policy based on the state of the current top zone |
4107 | // and the instructions outside the zone, including the bottom zone. |
4108 | setPolicy(Policy&: TopCand.Policy, /*IsPostRA=*/true, CurrZone&: Top, OtherZone: nullptr); |
4109 | pickNodeFromQueue(Zone&: Top, Cand&: TopCand); |
4110 | assert(TopCand.Reason != NoCand && "failed to find a candidate" ); |
4111 | tracePick(Cand: TopCand); |
4112 | SU = TopCand.SU; |
4113 | } |
4114 | IsTopNode = true; |
4115 | } else { |
4116 | SU = pickNodeBidirectional(IsTopNode); |
4117 | } |
4118 | } while (SU->isScheduled); |
4119 | |
4120 | if (SU->isTopReady()) |
4121 | Top.removeReady(SU); |
4122 | if (SU->isBottomReady()) |
4123 | Bot.removeReady(SU); |
4124 | |
4125 | LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " |
4126 | << *SU->getInstr()); |
4127 | return SU; |
4128 | } |
4129 | |
4130 | /// Called after ScheduleDAGMI has scheduled an instruction and updated |
4131 | /// scheduled/remaining flags in the DAG nodes. |
4132 | void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { |
4133 | if (IsTopNode) { |
4134 | SU->TopReadyCycle = std::max(a: SU->TopReadyCycle, b: Top.getCurrCycle()); |
4135 | Top.bumpNode(SU); |
4136 | } else { |
4137 | SU->BotReadyCycle = std::max(a: SU->BotReadyCycle, b: Bot.getCurrCycle()); |
4138 | Bot.bumpNode(SU); |
4139 | } |
4140 | } |
4141 | |
4142 | ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) { |
4143 | ScheduleDAGMI *DAG = |
4144 | new ScheduleDAGMI(C, std::make_unique<PostGenericScheduler>(args&: C), |
4145 | /*RemoveKillFlags=*/true); |
4146 | const TargetSubtargetInfo &STI = C->MF->getSubtarget(); |
4147 | // Add MacroFusion mutation if fusions are not empty. |
4148 | const auto &MacroFusions = STI.getMacroFusions(); |
4149 | if (!MacroFusions.empty()) |
4150 | DAG->addMutation(Mutation: createMacroFusionDAGMutation(Predicates: MacroFusions)); |
4151 | return DAG; |
4152 | } |
4153 | |
4154 | //===----------------------------------------------------------------------===// |
4155 | // ILP Scheduler. Currently for experimental analysis of heuristics. |
4156 | //===----------------------------------------------------------------------===// |
4157 | |
4158 | namespace { |
4159 | |
4160 | /// Order nodes by the ILP metric. |
4161 | struct ILPOrder { |
4162 | const SchedDFSResult *DFSResult = nullptr; |
4163 | const BitVector *ScheduledTrees = nullptr; |
4164 | bool MaximizeILP; |
4165 | |
4166 | ILPOrder(bool MaxILP) : MaximizeILP(MaxILP) {} |
4167 | |
4168 | /// Apply a less-than relation on node priority. |
4169 | /// |
4170 | /// (Return true if A comes after B in the Q.) |
4171 | bool operator()(const SUnit *A, const SUnit *B) const { |
4172 | unsigned SchedTreeA = DFSResult->getSubtreeID(SU: A); |
4173 | unsigned SchedTreeB = DFSResult->getSubtreeID(SU: B); |
4174 | if (SchedTreeA != SchedTreeB) { |
4175 | // Unscheduled trees have lower priority. |
4176 | if (ScheduledTrees->test(Idx: SchedTreeA) != ScheduledTrees->test(Idx: SchedTreeB)) |
4177 | return ScheduledTrees->test(Idx: SchedTreeB); |
4178 | |
4179 | // Trees with shallower connections have lower priority. |
4180 | if (DFSResult->getSubtreeLevel(SubtreeID: SchedTreeA) |
4181 | != DFSResult->getSubtreeLevel(SubtreeID: SchedTreeB)) { |
4182 | return DFSResult->getSubtreeLevel(SubtreeID: SchedTreeA) |
4183 | < DFSResult->getSubtreeLevel(SubtreeID: SchedTreeB); |
4184 | } |
4185 | } |
4186 | if (MaximizeILP) |
4187 | return DFSResult->getILP(SU: A) < DFSResult->getILP(SU: B); |
4188 | else |
4189 | return DFSResult->getILP(SU: A) > DFSResult->getILP(SU: B); |
4190 | } |
4191 | }; |
4192 | |
4193 | /// Schedule based on the ILP metric. |
4194 | class ILPScheduler : public MachineSchedStrategy { |
4195 | ScheduleDAGMILive *DAG = nullptr; |
4196 | ILPOrder Cmp; |
4197 | |
4198 | std::vector<SUnit*> ReadyQ; |
4199 | |
4200 | public: |
4201 | ILPScheduler(bool MaximizeILP) : Cmp(MaximizeILP) {} |
4202 | |
4203 | void initialize(ScheduleDAGMI *dag) override { |
4204 | assert(dag->hasVRegLiveness() && "ILPScheduler needs vreg liveness" ); |
4205 | DAG = static_cast<ScheduleDAGMILive*>(dag); |
4206 | DAG->computeDFSResult(); |
4207 | Cmp.DFSResult = DAG->getDFSResult(); |
4208 | Cmp.ScheduledTrees = &DAG->getScheduledTrees(); |
4209 | ReadyQ.clear(); |
4210 | } |
4211 | |
4212 | void registerRoots() override { |
4213 | // Restore the heap in ReadyQ with the updated DFS results. |
4214 | std::make_heap(first: ReadyQ.begin(), last: ReadyQ.end(), comp: Cmp); |
4215 | } |
4216 | |
4217 | /// Implement MachineSchedStrategy interface. |
4218 | /// ----------------------------------------- |
4219 | |
4220 | /// Callback to select the highest priority node from the ready Q. |
4221 | SUnit *pickNode(bool &IsTopNode) override { |
4222 | if (ReadyQ.empty()) return nullptr; |
4223 | std::pop_heap(first: ReadyQ.begin(), last: ReadyQ.end(), comp: Cmp); |
4224 | SUnit *SU = ReadyQ.back(); |
4225 | ReadyQ.pop_back(); |
4226 | IsTopNode = false; |
4227 | LLVM_DEBUG(dbgs() << "Pick node " |
4228 | << "SU(" << SU->NodeNum << ") " |
4229 | << " ILP: " << DAG->getDFSResult()->getILP(SU) |
4230 | << " Tree: " << DAG->getDFSResult()->getSubtreeID(SU) |
4231 | << " @" |
4232 | << DAG->getDFSResult()->getSubtreeLevel( |
4233 | DAG->getDFSResult()->getSubtreeID(SU)) |
4234 | << '\n' |
4235 | << "Scheduling " << *SU->getInstr()); |
4236 | return SU; |
4237 | } |
4238 | |
4239 | /// Scheduler callback to notify that a new subtree is scheduled. |
4240 | void scheduleTree(unsigned SubtreeID) override { |
4241 | std::make_heap(first: ReadyQ.begin(), last: ReadyQ.end(), comp: Cmp); |
4242 | } |
4243 | |
4244 | /// Callback after a node is scheduled. Mark a newly scheduled tree, notify |
4245 | /// DFSResults, and resort the priority Q. |
4246 | void schedNode(SUnit *SU, bool IsTopNode) override { |
4247 | assert(!IsTopNode && "SchedDFSResult needs bottom-up" ); |
4248 | } |
4249 | |
4250 | void releaseTopNode(SUnit *) override { /*only called for top roots*/ } |
4251 | |
4252 | void releaseBottomNode(SUnit *SU) override { |
4253 | ReadyQ.push_back(x: SU); |
4254 | std::push_heap(first: ReadyQ.begin(), last: ReadyQ.end(), comp: Cmp); |
4255 | } |
4256 | }; |
4257 | |
4258 | } // end anonymous namespace |
4259 | |
4260 | static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) { |
4261 | return new ScheduleDAGMILive(C, std::make_unique<ILPScheduler>(args: true)); |
4262 | } |
4263 | static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) { |
4264 | return new ScheduleDAGMILive(C, std::make_unique<ILPScheduler>(args: false)); |
4265 | } |
4266 | |
4267 | static MachineSchedRegistry ILPMaxRegistry( |
4268 | "ilpmax" , "Schedule bottom-up for max ILP" , createILPMaxScheduler); |
4269 | static MachineSchedRegistry ILPMinRegistry( |
4270 | "ilpmin" , "Schedule bottom-up for min ILP" , createILPMinScheduler); |
4271 | |
4272 | //===----------------------------------------------------------------------===// |
4273 | // Machine Instruction Shuffler for Correctness Testing |
4274 | //===----------------------------------------------------------------------===// |
4275 | |
4276 | #ifndef NDEBUG |
4277 | namespace { |
4278 | |
4279 | /// Apply a less-than relation on the node order, which corresponds to the |
4280 | /// instruction order prior to scheduling. IsReverse implements greater-than. |
4281 | template<bool IsReverse> |
4282 | struct SUnitOrder { |
4283 | bool operator()(SUnit *A, SUnit *B) const { |
4284 | if (IsReverse) |
4285 | return A->NodeNum > B->NodeNum; |
4286 | else |
4287 | return A->NodeNum < B->NodeNum; |
4288 | } |
4289 | }; |
4290 | |
4291 | /// Reorder instructions as much as possible. |
4292 | class InstructionShuffler : public MachineSchedStrategy { |
4293 | bool IsAlternating; |
4294 | bool IsTopDown; |
4295 | |
4296 | // Using a less-than relation (SUnitOrder<false>) for the TopQ priority |
4297 | // gives nodes with a higher number higher priority causing the latest |
4298 | // instructions to be scheduled first. |
4299 | PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false>> |
4300 | TopQ; |
4301 | |
4302 | // When scheduling bottom-up, use greater-than as the queue priority. |
4303 | PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true>> |
4304 | BottomQ; |
4305 | |
4306 | public: |
4307 | InstructionShuffler(bool alternate, bool topdown) |
4308 | : IsAlternating(alternate), IsTopDown(topdown) {} |
4309 | |
4310 | void initialize(ScheduleDAGMI*) override { |
4311 | TopQ.clear(); |
4312 | BottomQ.clear(); |
4313 | } |
4314 | |
4315 | /// Implement MachineSchedStrategy interface. |
4316 | /// ----------------------------------------- |
4317 | |
4318 | SUnit *pickNode(bool &IsTopNode) override { |
4319 | SUnit *SU; |
4320 | if (IsTopDown) { |
4321 | do { |
4322 | if (TopQ.empty()) return nullptr; |
4323 | SU = TopQ.top(); |
4324 | TopQ.pop(); |
4325 | } while (SU->isScheduled); |
4326 | IsTopNode = true; |
4327 | } else { |
4328 | do { |
4329 | if (BottomQ.empty()) return nullptr; |
4330 | SU = BottomQ.top(); |
4331 | BottomQ.pop(); |
4332 | } while (SU->isScheduled); |
4333 | IsTopNode = false; |
4334 | } |
4335 | if (IsAlternating) |
4336 | IsTopDown = !IsTopDown; |
4337 | return SU; |
4338 | } |
4339 | |
4340 | void schedNode(SUnit *SU, bool IsTopNode) override {} |
4341 | |
4342 | void releaseTopNode(SUnit *SU) override { |
4343 | TopQ.push(SU); |
4344 | } |
4345 | void releaseBottomNode(SUnit *SU) override { |
4346 | BottomQ.push(SU); |
4347 | } |
4348 | }; |
4349 | |
4350 | } // end anonymous namespace |
4351 | |
4352 | static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) { |
4353 | bool Alternate = !ForceTopDown && !ForceBottomUp; |
4354 | bool TopDown = !ForceBottomUp; |
4355 | assert((TopDown || !ForceTopDown) && |
4356 | "-misched-topdown incompatible with -misched-bottomup" ); |
4357 | return new ScheduleDAGMILive( |
4358 | C, std::make_unique<InstructionShuffler>(Alternate, TopDown)); |
4359 | } |
4360 | |
4361 | static MachineSchedRegistry ShufflerRegistry( |
4362 | "shuffle" , "Shuffle machine instructions alternating directions" , |
4363 | createInstructionShuffler); |
4364 | #endif // !NDEBUG |
4365 | |
4366 | //===----------------------------------------------------------------------===// |
4367 | // GraphWriter support for ScheduleDAGMILive. |
4368 | //===----------------------------------------------------------------------===// |
4369 | |
4370 | #ifndef NDEBUG |
4371 | namespace llvm { |
4372 | |
4373 | template<> struct GraphTraits< |
4374 | ScheduleDAGMI*> : public GraphTraits<ScheduleDAG*> {}; |
4375 | |
4376 | template<> |
4377 | struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits { |
4378 | DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} |
4379 | |
4380 | static std::string getGraphName(const ScheduleDAG *G) { |
4381 | return std::string(G->MF.getName()); |
4382 | } |
4383 | |
4384 | static bool renderGraphFromBottomUp() { |
4385 | return true; |
4386 | } |
4387 | |
4388 | static bool isNodeHidden(const SUnit *Node, const ScheduleDAG *G) { |
4389 | if (ViewMISchedCutoff == 0) |
4390 | return false; |
4391 | return (Node->Preds.size() > ViewMISchedCutoff |
4392 | || Node->Succs.size() > ViewMISchedCutoff); |
4393 | } |
4394 | |
4395 | /// If you want to override the dot attributes printed for a particular |
4396 | /// edge, override this method. |
4397 | static std::string getEdgeAttributes(const SUnit *Node, |
4398 | SUnitIterator EI, |
4399 | const ScheduleDAG *Graph) { |
4400 | if (EI.isArtificialDep()) |
4401 | return "color=cyan,style=dashed" ; |
4402 | if (EI.isCtrlDep()) |
4403 | return "color=blue,style=dashed" ; |
4404 | return "" ; |
4405 | } |
4406 | |
4407 | static std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *G) { |
4408 | std::string Str; |
4409 | raw_string_ostream SS(Str); |
4410 | const ScheduleDAGMI *DAG = static_cast<const ScheduleDAGMI*>(G); |
4411 | const SchedDFSResult *DFS = DAG->hasVRegLiveness() ? |
4412 | static_cast<const ScheduleDAGMILive*>(G)->getDFSResult() : nullptr; |
4413 | SS << "SU:" << SU->NodeNum; |
4414 | if (DFS) |
4415 | SS << " I:" << DFS->getNumInstrs(SU); |
4416 | return Str; |
4417 | } |
4418 | |
4419 | static std::string getNodeDescription(const SUnit *SU, const ScheduleDAG *G) { |
4420 | return G->getGraphNodeLabel(SU); |
4421 | } |
4422 | |
4423 | static std::string getNodeAttributes(const SUnit *N, const ScheduleDAG *G) { |
4424 | std::string Str("shape=Mrecord" ); |
4425 | const ScheduleDAGMI *DAG = static_cast<const ScheduleDAGMI*>(G); |
4426 | const SchedDFSResult *DFS = DAG->hasVRegLiveness() ? |
4427 | static_cast<const ScheduleDAGMILive*>(G)->getDFSResult() : nullptr; |
4428 | if (DFS) { |
4429 | Str += ",style=filled,fillcolor=\"#" ; |
4430 | Str += DOT::getColorString(DFS->getSubtreeID(N)); |
4431 | Str += '"'; |
4432 | } |
4433 | return Str; |
4434 | } |
4435 | }; |
4436 | |
4437 | } // end namespace llvm |
4438 | #endif // NDEBUG |
4439 | |
4440 | /// viewGraph - Pop up a ghostview window with the reachable parts of the DAG |
4441 | /// rendered using 'dot'. |
4442 | void ScheduleDAGMI::viewGraph(const Twine &Name, const Twine &Title) { |
4443 | #ifndef NDEBUG |
4444 | ViewGraph(this, Name, false, Title); |
4445 | #else |
4446 | errs() << "ScheduleDAGMI::viewGraph is only available in debug builds on " |
4447 | << "systems with Graphviz or gv!\n" ; |
4448 | #endif // NDEBUG |
4449 | } |
4450 | |
4451 | /// Out-of-line implementation with no arguments is handy for gdb. |
4452 | void ScheduleDAGMI::viewGraph() { |
4453 | viewGraph(Name: getDAGName(), Title: "Scheduling-Units Graph for " + getDAGName()); |
4454 | } |
4455 | |
4456 | /// Sort predicate for the intervals stored in an instance of |
4457 | /// ResourceSegments. Intervals are always disjoint (no intersection |
4458 | /// for any pairs of intervals), therefore we can sort the totality of |
4459 | /// the intervals by looking only at the left boundary. |
4460 | static bool sortIntervals(const ResourceSegments::IntervalTy &A, |
4461 | const ResourceSegments::IntervalTy &B) { |
4462 | return A.first < B.first; |
4463 | } |
4464 | |
4465 | unsigned ResourceSegments::getFirstAvailableAt( |
4466 | unsigned CurrCycle, unsigned AcquireAtCycle, unsigned ReleaseAtCycle, |
4467 | std::function<ResourceSegments::IntervalTy(unsigned, unsigned, unsigned)> |
4468 | IntervalBuilder) const { |
4469 | assert(std::is_sorted(std::begin(_Intervals), std::end(_Intervals), |
4470 | sortIntervals) && |
4471 | "Cannot execute on an un-sorted set of intervals." ); |
4472 | |
4473 | // Zero resource usage is allowed by TargetSchedule.td but we do not construct |
4474 | // a ResourceSegment interval for that situation. |
4475 | if (AcquireAtCycle == ReleaseAtCycle) |
4476 | return CurrCycle; |
4477 | |
4478 | unsigned RetCycle = CurrCycle; |
4479 | ResourceSegments::IntervalTy NewInterval = |
4480 | IntervalBuilder(RetCycle, AcquireAtCycle, ReleaseAtCycle); |
4481 | for (auto &Interval : _Intervals) { |
4482 | if (!intersects(A: NewInterval, B: Interval)) |
4483 | continue; |
4484 | |
4485 | // Move the interval right next to the top of the one it |
4486 | // intersects. |
4487 | assert(Interval.second > NewInterval.first && |
4488 | "Invalid intervals configuration." ); |
4489 | RetCycle += (unsigned)Interval.second - (unsigned)NewInterval.first; |
4490 | NewInterval = IntervalBuilder(RetCycle, AcquireAtCycle, ReleaseAtCycle); |
4491 | } |
4492 | return RetCycle; |
4493 | } |
4494 | |
4495 | void ResourceSegments::add(ResourceSegments::IntervalTy A, |
4496 | const unsigned CutOff) { |
4497 | assert(A.first <= A.second && "Cannot add negative resource usage" ); |
4498 | assert(CutOff > 0 && "0-size interval history has no use." ); |
4499 | // Zero resource usage is allowed by TargetSchedule.td, in the case that the |
4500 | // instruction needed the resource to be available but does not use it. |
4501 | // However, ResourceSegment represents an interval that is closed on the left |
4502 | // and open on the right. It is impossible to represent an empty interval when |
4503 | // the left is closed. Do not add it to Intervals. |
4504 | if (A.first == A.second) |
4505 | return; |
4506 | |
4507 | assert(all_of(_Intervals, |
4508 | [&A](const ResourceSegments::IntervalTy &Interval) -> bool { |
4509 | return !intersects(A, Interval); |
4510 | }) && |
4511 | "A resource is being overwritten" ); |
4512 | _Intervals.push_back(x: A); |
4513 | |
4514 | sortAndMerge(); |
4515 | |
4516 | // Do not keep the full history of the intervals, just the |
4517 | // latest #CutOff. |
4518 | while (_Intervals.size() > CutOff) |
4519 | _Intervals.pop_front(); |
4520 | } |
4521 | |
4522 | bool ResourceSegments::intersects(ResourceSegments::IntervalTy A, |
4523 | ResourceSegments::IntervalTy B) { |
4524 | assert(A.first <= A.second && "Invalid interval" ); |
4525 | assert(B.first <= B.second && "Invalid interval" ); |
4526 | |
4527 | // Share one boundary. |
4528 | if ((A.first == B.first) || (A.second == B.second)) |
4529 | return true; |
4530 | |
4531 | // full intersersect: [ *** ) B |
4532 | // [***) A |
4533 | if ((A.first > B.first) && (A.second < B.second)) |
4534 | return true; |
4535 | |
4536 | // right intersect: [ ***) B |
4537 | // [*** ) A |
4538 | if ((A.first > B.first) && (A.first < B.second) && (A.second > B.second)) |
4539 | return true; |
4540 | |
4541 | // left intersect: [*** ) B |
4542 | // [ ***) A |
4543 | if ((A.first < B.first) && (B.first < A.second) && (B.second > B.first)) |
4544 | return true; |
4545 | |
4546 | return false; |
4547 | } |
4548 | |
4549 | void ResourceSegments::sortAndMerge() { |
4550 | if (_Intervals.size() <= 1) |
4551 | return; |
4552 | |
4553 | // First sort the collection. |
4554 | _Intervals.sort(comp: sortIntervals); |
4555 | |
4556 | // can use next because I have at least 2 elements in the list |
4557 | auto next = std::next(x: std::begin(cont&: _Intervals)); |
4558 | auto E = std::end(cont&: _Intervals); |
4559 | for (; next != E; ++next) { |
4560 | if (std::prev(x: next)->second >= next->first) { |
4561 | next->first = std::prev(x: next)->first; |
4562 | _Intervals.erase(position: std::prev(x: next)); |
4563 | continue; |
4564 | } |
4565 | } |
4566 | } |
4567 | |