//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "SIMachineScheduler.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"
// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// for GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
//   for all the SGPR load instructions before your current instruction to
//   have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
//   for all the VGPR load instructions previous to the VGPR load instruction
//   you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden.
//
// Moreover some specificities (like the fact that a lot of instructions in
// the shader have few dependencies) make the generic scheduler have some
// unpredictable behaviours. For example when register pressure becomes high,
// it can either manage to prevent register pressure from going too high, or
// it can increase register pressure even more than if it hadn't taken
// register pressure into account.
//
// Also some other bad behaviours are generated, like loading at the beginning
// of the shader a constant in a VGPR that you won't need until the end of the
// shader.
//
// The scheduling problem for SI can be divided into three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact the HW can issue several instructions from different
// wavefronts if they are of different types, etc)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks helps keep register usage low.
//
// First the instructions are put into blocks.
// We want the blocks to help control register usage and hide high latencies
// later. To help control register usage, we typically want all local
// computations, when for example you create a result that can be consumed
// right away, to be contained in a block. Block inputs and outputs would
// typically be important results that are needed in several locations of
// the shader. Since we do want blocks to help hide high latencies, we want
// the instructions inside the block to have a minimal set of dependencies
// on high latencies. It will make it easy to pick blocks to hide specific
// high latencies.
// The block creation algorithm is divided into several steps, and several
// variants can be tried during the scheduling process.
//
// Second the order of the instructions inside the blocks is chosen.
// At that step we take into account only register usage and hiding
// low latency instructions.
//
// Third the block order is chosen; there we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
//
// Actually when talking about 'low latency' or 'high latency', this includes
// both the latency for the cache (or global memory) data to reach the
// register, and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, which is why even if the wavefront count is high,
// we have to try to have as many instructions hiding high latencies as
// possible.
// The OpenCL doc gives for example a latency of 400 cycles for a global mem
// access, which is hidden by 10 instructions if the wavefront count is 10.

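// As a minimal sketch of the arithmetic above (assuming, per the comment,
// that each wavefront issues one instruction every 4 cycles and wavefronts
// are interleaved; illustrative only, not used by the scheduler):
namespace {
constexpr unsigned cyclesHidden(unsigned IndependentInsts, unsigned Waves) {
  // Each independent instruction of each interleaved wavefront covers one
  // 4-cycle issue slot while the memory latency elapses.
  return IndependentInsts * Waves * 4;
}
// 10 independent instructions at a wavefront count of 10 hide the ~400
// cycles quoted above for a global memory access.
static_assert(cyclesHidden(10, 10) == 400, "matches the OpenCL figure");
} // end anonymous namespace
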
// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64 bytes
// lines.
// Constant cache is shared with 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
// instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated to the 3 other VALUs
// of the CU do texture sampling too. (Don't take these figures too seriously,
// as I'm not 100% sure of the computation)
// Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Units".
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as an s_buffer_loadx2, and potentially an s_buffer_loadx8 if on the
// same cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can be
// 300-600 cycles. We do not specially take that into account when scheduling,
// as we expect the driver to be able to preload the constants soon.
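
// A back-of-the-envelope sketch of the claim above (assuming the quoted
// 16B/cycle of constant cache throughput per CU and one issued instruction
// per 4 cycles; illustrative only, not used by the scheduler):
namespace {
constexpr unsigned cacheCyclesNeeded(unsigned Bytes, unsigned BytesPerCycle) {
  return (Bytes + BytesPerCycle - 1) / BytesPerCycle; // round up
}
// A one-dword s_buffer_load fetches 4 bytes, an s_buffer_loadx2 8 bytes:
// both fit well within the 4-cycle slot of a single issued instruction.
static_assert(cacheCyclesNeeded(8, 16) <= 4, "hidden by one instruction");
// An s_buffer_loadx8 fetches 32 bytes: still within one slot, provided the
// data sits on a single 64-byte cache line.
static_assert(cacheCyclesNeeded(32, 16) <= 4, "potentially one instruction");
} // end anonymous namespace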

// common code //

#ifndef NDEBUG

static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand: return "NOCAND";
  case RegUsage: return "REGUSAGE";
  case Latency: return "LATENCY";
  case Successor: return "SUCCESSOR";
  case Depth: return "DEPTH";
  case NodeOrder: return "ORDER";
  }
  llvm_unreachable("Unknown reason!");
}

#endif

namespace llvm::SISched {
static bool tryLess(int TryVal, int CandVal,
                    SISchedulerCandidate &TryCand,
                    SISchedulerCandidate &Cand,
                    SIScheduleCandReason Reason) {
  if (TryVal < CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

static bool tryGreater(int TryVal, int CandVal,
                       SISchedulerCandidate &TryCand,
                       SISchedulerCandidate &Cand,
                       SIScheduleCandReason Reason) {
  if (TryVal > CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal < CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}
} // end namespace llvm::SISched
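
// A minimal usage sketch (hypothetical fields and reasons; the real chains
// appear in tryCandidateTopDown and tryCandidateLatency below): the helpers
// are meant to be chained so that the first criterion that discriminates
// wins, and ties fall through to the next criterion:
//
//   if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
//                        TryCand, Cand, RegUsage))
//     return; // decided on register usage
//   if (SISched::tryGreater(TryCand.Height, Cand.Height,
//                           TryCand, Cand, Depth))
//     return; // otherwise decided on height
//
// On a tie both helpers return false after recording the repeated reason
// via setRepeat(), letting the caller consult the next criterion.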

// SIScheduleBlock //

void SIScheduleBlock::addUnit(SUnit *SU) {
  NodeNum2Index[SU->NodeNum] = SUnits.size();
  SUnits.push_back(SU);
}

#ifndef NDEBUG
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {

  dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
#endif

void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  if (Cand.SGPRUsage > 60 &&
      SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as close to the top as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  //     - (eventually some more low latency instructions)
  //     - instructions that depend on the first low latency instructions.
  // If the block contains a lot of constant loads, the SGPR usage
  // could go quite high; the arbitrary limit of 60 above encourages using
  // the already loaded constants (in order to release some SGPRs) before
  // loading more.
  if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
                       Cand.HasLowLatencyNonWaitedParent,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                          TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (TryCand.IsLowLatency &&
      SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}

SUnit* SIScheduleBlock::pickNode() {
  SISchedCandidate TopCand;

  for (SUnit* SU : TopReadySUs) {
    SISchedCandidate TryCand;
    std::vector<unsigned> pressure;
    std::vector<unsigned> MaxPressure;
    // Predict register usage after this instruction.
    TryCand.SU = SU;
    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    TryCand.SGPRUsage = pressure[AMDGPU::RegisterPressureSets::SReg_32];
    TryCand.VGPRUsage = pressure[AMDGPU::RegisterPressureSets::VGPR_32];
    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    TryCand.HasLowLatencyNonWaitedParent =
      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    tryCandidateTopDown(TopCand, TryCand);
    if (TryCand.Reason != NoCand)
      TopCand.setBest(TryCand);
  }

  return TopCand.SU;
}


// Schedule something valid.
void SIScheduleBlock::fastSchedule() {
  TopReadySUs.clear();
  if (Scheduled)
    undoSchedule();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = TopReadySUs[0];
    ScheduledSUnits.push_back(SU);
    nodeScheduled(SU);
  }

  Scheduled = true;
}

// Returns true if Reg was defined between First and Last.
static bool isDefBetween(Register Reg, SlotIndex First, SlotIndex Last,
                         const MachineRegisterInfo *MRI,
                         const LiveIntervals *LIS) {
  for (const MachineInstr &MI : MRI->def_instructions(Reg)) {
    if (MI.isDebugValue())
      continue;
    SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot();
    if (InstSlot >= First && InstSlot <= Last)
      return true;
  }
  return false;
}

void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SUs. RPTracker captures what had to be alive for the SUs
  // to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not track physical registers, because it messes up.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (RegMaskPair.VRegOrUnit.isVirtualReg())
      LiveInRegs.insert(RegMaskPair.VRegOrUnit.asVirtualReg());
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situations 4, 5, 6 and 7.
  // We want LiveOutRegs to contain only Regs whose content will be read after
  // in another block, and whose content was written in the current block,
  // that is we want it to get 1, 3, 5 and 7.
  // Since we made the MIs of a block be packed all together before
  // scheduling, the LiveIntervals were correct, and the RPTracker was
  // able to correctly handle 5 vs 6, and 2 vs 3.
  // (Note: This is not sufficient for the RPTracker to avoid mistakes
  // for case 4.)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5
  // and 7.
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5 and 7.
  // The use of isDefBetween removes case 4.
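  // As an illustration (hypothetical vregs, using the case numbering above):
  // in a block containing
  //     %b = V_ADD_F32 %a, %x    ; %a and %x defined before the block
  //     %c = V_MUL_F32 %b, %y
  // %a is case 4 if another block also reads it: it has no def inside the
  // block, so the isDefBetween filter below drops it from LiveOutRegs.
  // %b is case 2 if %c is its only reader, and %c is case 1; both are
  // defined inside the block, so they pass the filter and stay in
  // LiveOutRegs if the RPTracker reports them live-out.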
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    VirtRegOrUnit VRegOrUnit = RegMaskPair.VRegOrUnit;
    if (VRegOrUnit.isVirtualReg() &&
        isDefBetween(VRegOrUnit.asVirtualReg(),
                     LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(VRegOrUnit.asVirtualReg());
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally llvm will represent some registers as big 128-bit registers
  // for example, but they actually correspond to 4 actual 32-bit registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}

void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionalPressure.
  InternalAdditionalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
         TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
           SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}

void SIScheduleBlock::undoSchedule() {
  for (SUnit* SU : SUnits) {
    SU->isScheduled = false;
    for (SDep& Succ : SU->Succs) {
      if (BC->isSUInBlock(Succ.getSUnit(), ID))
        undoReleaseSucc(SU, &Succ);
    }
  }
  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
  ScheduledSUnits.clear();
  Scheduled = false;
}

void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    ++SuccSU->WeakPredsLeft;
    return;
  }
  ++SuccSU->NumPredsLeft;
}

void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    DAG->dumpNode(*SuccSU);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}

/// Release the successors of SU that are inside the block
/// (InOrOutBlock == true) or outside of it (InOrOutBlock == false).
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
  for (SDep& Succ : SU->Succs) {
    SUnit *SuccSU = Succ.getSUnit();

    if (SuccSU->NodeNum >= DAG->SUnits.size())
      continue;

    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
      continue;

    releaseSucc(SU, &Succ);
    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
      TopReadySUs.push_back(SuccSU);
  }
}

void SIScheduleBlock::nodeScheduled(SUnit *SU) {
  // Is in TopReadySUs
  assert (!SU->NumPredsLeft);
  std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
  if (I == TopReadySUs.end()) {
    dbgs() << "Data Structure Bug in SI Scheduler\n";
    llvm_unreachable(nullptr);
  }
  TopReadySUs.erase(I);

  releaseSuccessors(SU, true);
  // Scheduling this node will trigger a wait,
  // thus propagate to other instructions that they do not need to wait either.
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);

  if (DAG->IsLowLatencySU[SU->NodeNum]) {
    for (SDep& Succ : SU->Succs) {
      std::map<unsigned, unsigned>::iterator I =
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
      if (I != NodeNum2Index.end())
        HasLowLatencyNonWaitedParent[I->second] = 1;
    }
  }
  SU->isScheduled = true;
}

void SIScheduleBlock::finalizeUnits() {
  // We remove links from outside blocks to enable scheduling inside the block.
  for (SUnit* SU : SUnits) {
    releaseSuccessors(SU, false);
    if (DAG->IsHighLatencySU[SU->NodeNum])
      HighLatencyBlock = true;
  }
  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
}

// We maintain ascending order of IDs.
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
  unsigned PredID = Pred->getID();

  // Check if not already a predecessor.
  for (SIScheduleBlock* P : Preds) {
    if (PredID == P->getID())
      return;
  }
  Preds.push_back(Pred);

  assert(none_of(Succs,
                 [=](std::pair<SIScheduleBlock*,
                     SIScheduleBlockLinkKind> S) {
                   return PredID == S.first->getID();
                 }) &&
         "Loop in the Block Graph!");
}

void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
                              SIScheduleBlockLinkKind Kind) {
  unsigned SuccID = Succ->getID();

  // Check if not already a successor.
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
    if (SuccID == S.first->getID()) {
      if (S.second == SIScheduleBlockLinkKind::NoData &&
          Kind == SIScheduleBlockLinkKind::Data)
        S.second = Kind;
      return;
    }
  }
  if (Succ->isHighLatencyBlock())
    ++NumHighLatencySuccessors;
  Succs.emplace_back(Succ, Kind);

  assert(none_of(Preds,
                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
         "Loop in the Block Graph!");
}

#ifndef NDEBUG
void SIScheduleBlock::printDebug(bool full) {
  dbgs() << "Block (" << ID << ")\n";
  if (!full)
    return;

  dbgs() << "\nContains High Latency Instruction: "
         << HighLatencyBlock << '\n';
  dbgs() << "\nDepends On:\n";
  for (SIScheduleBlock* P : Preds) {
    P->printDebug(false);
  }

  dbgs() << "\nSuccessors:\n";
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
    if (S.second == SIScheduleBlockLinkKind::Data)
      dbgs() << "(Data Dep) ";
    S.first->printDebug(false);
  }

  if (Scheduled) {
    dbgs() << "LiveInPressure "
           << LiveInPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveInPressure[AMDGPU::RegisterPressureSets::VGPR_32] << '\n';
    dbgs() << "LiveOutPressure "
           << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n";
    dbgs() << "LiveIns:\n";
    for (Register Reg : LiveInRegs)
      dbgs() << printReg(Reg, DAG->getTRI()) << ' ';

    dbgs() << "\nLiveOuts:\n";
    for (Register Reg : LiveOutRegs)
      dbgs() << printReg(Reg, DAG->getTRI()) << ' ';
  }

  dbgs() << "\nInstructions:\n";
  for (const SUnit* SU : SUnits)
    DAG->dumpNode(*SU);

  dbgs() << "///////////////////////\n";
}
#endif

// SIScheduleBlockCreator //

SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG)
    : DAG(DAG) {}

SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
    Blocks.find(BlockVariant);
  if (B == Blocks.end()) {
    SIScheduleBlocks Res;
    createBlocksForVariant(BlockVariant);
    topologicalSort();
    scheduleInsideBlocks();
    fillStats();
    Res.Blocks = CurrentBlocks;
    Res.TopDownIndex2Block = TopDownIndex2Block;
    Res.TopDownBlock2Index = TopDownBlock2Index;
    Blocks[BlockVariant] = Res;
    return Res;
  }
  return B->second;
}

bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
  if (SU->NodeNum >= DAG->SUnits.size())
    return false;
  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
}

void SIScheduleBlockCreator::colorHighLatenciesAlone() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      CurrentColoring[SU->NodeNum] = NextReservedID++;
    }
  }
}

static bool
hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
  for (const auto &PredDep : SU.Preds) {
    if (PredDep.getSUnit() == &FromSU &&
        PredDep.getKind() == llvm::SDep::Data)
      return true;
  }
  return false;
}

void SIScheduleBlockCreator::colorHighLatenciesGroups() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned NumHighLatencies = 0;
  unsigned GroupSize;
  int Color = NextReservedID;
  unsigned Count = 0;
  std::set<unsigned> FormingGroup;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum])
      ++NumHighLatencies;
  }

  if (NumHighLatencies == 0)
    return;

  if (NumHighLatencies <= 6)
    GroupSize = 2;
  else if (NumHighLatencies <= 12)
    GroupSize = 3;
  else
    GroupSize = 4;

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (DAG->IsHighLatencySU[SU.NodeNum]) {
      bool CompatibleGroup = true;
      int ProposedColor = Color;
      std::vector<int> AdditionalElements;

      // We don't want to put in the same block
      // two high latency instructions that depend
      // on each other.
      // One way would be to check canAddEdge
      // in both directions, but that currently is not
      // enough because there the high latency order is
      // enforced (via links).
      // Instead, look at the dependencies between the
      // high latency instructions and deduce if it is
      // a data dependency or not.
      for (unsigned j : FormingGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other
        if (SubGraph.size() > 5) {
          // Too many elements would be required to be added to the block.
          CompatibleGroup = false;
          break;
        }
        // Check the type of dependency
        for (unsigned k : SubGraph) {
          // If in the path to join the two instructions,
          // there is another high latency instruction,
          // or instructions colored for another block,
          // abort the merge.
          if (DAG->IsHighLatencySU[k] || (CurrentColoring[k] != ProposedColor &&
                                          CurrentColoring[k] != 0)) {
            CompatibleGroup = false;
            break;
          }
          // If one of the SUs in the subgraph depends on the result of SU j,
          // there'll be a data dependency.
          if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
            CompatibleGroup = false;
            break;
          }
        }
        if (!CompatibleGroup)
          break;
        // Same check for the SU
        if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
          CompatibleGroup = false;
          break;
        }
        // Add all the required instructions to the block.
        // These cannot live in another block (because they
        // depend (order dependency) on one of the
        // instructions in the block, and are required for the
        // high latency instruction we add).
        llvm::append_range(AdditionalElements, SubGraph);
      }
      if (CompatibleGroup) {
        FormingGroup.insert(SU.NodeNum);
        for (unsigned j : AdditionalElements)
          CurrentColoring[j] = ProposedColor;
        CurrentColoring[SU.NodeNum] = ProposedColor;
        ++Count;
      }
      // Found one incompatible instruction,
      // or has filled a big enough group.
      // -> start a new one.
      if (!CompatibleGroup) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        FormingGroup.insert(SU.NodeNum);
        CurrentColoring[SU.NodeNum] = ProposedColor;
        Count = 0;
      } else if (Count == GroupSize) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        Count = 0;
      }
    }
  }
}

void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      auto [Pos, Inserted] =
          ColorCombinations.try_emplace(SUColors, NextNonReservedID);
      if (Inserted)
        ++NextNonReservedID;
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}

void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;

  // Every combination of colors given by the top down
  // and bottom up Reserved node dependency gets its own color.

  for (const SUnit &SU : DAG->SUnits) {
    std::pair<unsigned, unsigned> SUColors;

    // High latency instructions: already given.
    if (CurrentColoring[SU.NodeNum])
      continue;

    SUColors.first = CurrentTopDownReservedDependencyColoring[SU.NodeNum];
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU.NodeNum];

    auto [Pos, Inserted] =
        ColorCombinations.try_emplace(SUColors, NextNonReservedID);
    CurrentColoring[SU.NodeNum] = Pos->second;
    if (Inserted)
      NextNonReservedID++;
  }
}

void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*llvm::max_element(CurrentBottomUpReservedDependencyColoring) == 0 &&
      *llvm::max_element(CurrentTopDownReservedDependencyColoring) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction into that block.
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = std::move(PendingColoring);
}


void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}

void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: VGPR constant loading.
    // Low latency instructions usually have a predecessor (the address).
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> ColorCount;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    ++ColorCount[color];
  }

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}

void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}

void SIScheduleBlockCreator::regroupNoUserInstructions() {
  unsigned DAGSize = DAG->SUnits.size();
  int GroupID = NextNonReservedID++;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    bool hasSuccessor = false;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      hasSuccessor = true;
    }
    if (!hasSuccessor)
      CurrentColoring[SU->NodeNum] = GroupID;
  }
}

void SIScheduleBlockCreator::colorExports() {
  unsigned ExportColor = NextNonReservedID++;
  SmallVector<unsigned, 8> ExpGroup;

  // Put all exports together in a block.
  // The block will naturally end up being scheduled last,
  // thus putting exports at the end of the schedule, which
  // is better for performance.
  // However we must ensure, for safety, the exports can be put
  // together in the same block without any other instruction.
  // This could happen, for example, when scheduling after regalloc
  // if reloading a spilled register from memory using the same
  // register as the one used in a previous export.
  // If that happens, do not regroup the exports.
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (SIInstrInfo::isEXP(*SU.getInstr())) {
      // SU is an export instruction. Check whether one of its successor
      // dependencies is a non-export, in which case we skip export grouping.
      for (const SDep &SuccDep : SU.Succs) {
        const SUnit *SuccSU = SuccDep.getSUnit();
        if (SuccDep.isWeak() || SuccSU->NodeNum >= DAG->SUnits.size()) {
          // Ignore these dependencies.
          continue;
        }
        assert(SuccSU->isInstr() &&
               "SUnit unexpectedly not representing an instruction!");

        if (!SIInstrInfo::isEXP(*SuccSU->getInstr())) {
          // A non-export depends on us. Skip export grouping.
          // Note that this is a bit pessimistic: We could still group all
          // other exports that are not depended on by non-exports, directly
          // or indirectly. Simply skipping this particular export but
          // grouping all others would not account for indirect dependencies.
          return;
        }
      }
      ExpGroup.push_back(SUNum);
    }
  }

  // The group can be formed. Give the color.
  for (unsigned j : ExpGroup)
    CurrentColoring[j] = ExportColor;
}

void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links that a previous scheduling variant may have overridden.
  DAG->restoreSULinksLeft();

  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  LLVM_DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();
  colorExports();

  // Put SUs of the same color into the same block.
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    auto [It, Inserted] = RealID.try_emplace(Color);
    if (Inserted) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      It->second = ID;
    }
    CurrentBlocks[It->second]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = It->second;
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free root and leafs of all blocks to enable scheduling inside them.
  for (SIScheduleBlock *Block : CurrentBlocks)
    Block->finalizeUnits();
  LLVM_DEBUG({
    dbgs() << "Blocks created:\n\n";
    for (SIScheduleBlock *Block : CurrentBlocks)
      Block->printDebug(true);
  });
}

// Helper function taken from CodeGen/MachineScheduler.cpp

/// Non-const version.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
    if (!I->isDebugInstr())
      break;
  }
  return I;
}

void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  LLVM_DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}

void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We first compute a valid schedule, such that a Block corresponds
  // to a contiguous range of instructions.
  LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring the previous position,
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  LLVM_DEBUG({
    for (SIScheduleBlock *Block : CurrentBlocks)
      Block->printDebug(true);
  });
}

void SIScheduleBlockCreator::fillStats() {
  unsigned DAGSize = CurrentBlocks.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getPreds().empty())
      Block->Depth = 0;
    else {
      unsigned Depth = 0;
      for (SIScheduleBlock *Pred : Block->getPreds()) {
        if (Depth < Pred->Depth + Pred->getCost())
          Depth = Pred->Depth + Pred->getCost();
      }
      Block->Depth = Depth;
    }
  }

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = BottomUpIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getSuccs().empty())
      Block->Height = 0;
    else {
      unsigned Height = 0;
      for (const auto &Succ : Block->getSuccs())
        Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
      Block->Height = Height;
    }
  }
}

// SIScheduleBlockScheduler //

SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output.
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an output
  // is not simply the number of child blocks having the same virtual register
  // as input.
  // Here is an example. A produces x and y. B eats x and produces x'.
  // C eats x' and y. The register coalescer may have attributed the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
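  // For the A/B/C example above (assuming x and x' were coalesced into the
  // same vreg %v): both A and B have %v among their outputs, and B and C
  // read it. B's read is attributed to A (its only producing parent), while
  // C's read is attributed to B, the producing parent with the highest
  // topological index, giving each output one recorded use instead of
  // charging A twice.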
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (SIScheduleBlock *Block : Blocks) {
    for (Register Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<Register> PredOutRegs = Pred->getOutRegs();
        std::set<Register>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      ++LiveOutRegsNumUsages[PredID][Reg];
    }
  }

  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  std::set<VirtRegOrUnit> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);

  // Increase LiveOutRegsNumUsages for blocks
  // producing registers consumed in another
  // scheduling region.
  for (VirtRegOrUnit VRegOrUnit : DAG->getOutRegs()) {
    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
      // Do reverse traversal.
      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
      SIScheduleBlock *Block = Blocks[ID];
      const std::set<Register> &OutRegs = Block->getOutRegs();

      if (!VRegOrUnit.isVirtualReg() ||
          OutRegs.find(VRegOrUnit.asVirtualReg()) == OutRegs.end())
        continue;

      ++LiveOutRegsNumUsages[ID][VRegOrUnit.asVirtualReg()];
      break;
    }
  }

  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (SIScheduleBlock *Block : Blocks) {
    for (Register Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<Register> PredOutRegs = Pred->getOutRegs();
        std::set<Register>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      if (!Found)
        ++LiveRegsConsumers[Reg];
    }
  }

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block
                                            : BlocksScheduled) {
    dbgs() << ' ' << Block->getID();
  } dbgs() << '\n';);
}

bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
                       Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                          TryCand, Cand, Latency))
    return true;
  if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
                                                   TryCand, Cand, Depth))
    return true;
  if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
                          Cand.NumHighLatencySuccessors,
                          TryCand, Cand, Successor))
    return true;
  return false;
}

bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
                       TryCand, Cand, RegUsage))
    return true;
  if (SISched::tryGreater(TryCand.NumSuccessors > 0,
                          Cand.NumSuccessors > 0,
                          TryCand, Cand, Successor))
    return true;
  if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    return true;
  if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
                       TryCand, Cand, RegUsage))
    return true;
  return false;
}

SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
  SIBlockSchedCandidate Cand;
  std::vector<SIScheduleBlock*>::iterator Best;
  SIScheduleBlock *Block;
  if (ReadyBlocks.empty())
    return nullptr;

  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
                        VregCurrentUsage, SregCurrentUsage);
  if (VregCurrentUsage > maxVregUsage)
    maxVregUsage = VregCurrentUsage;
  if (SregCurrentUsage > maxSregUsage)
    maxSregUsage = SregCurrentUsage;
  LLVM_DEBUG({
    dbgs() << "Picking New Blocks\n";
    dbgs() << "Available: ";
    for (SIScheduleBlock *Block : ReadyBlocks)
      dbgs() << Block->getID() << ' ';
    dbgs() << "\nCurrent Live:\n";
    for (Register Reg : LiveRegs)
      dbgs() << printReg(Reg, DAG->getTRI()) << ' ';
    dbgs() << '\n';
    dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
    dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
  });

  Cand.Block = nullptr;
  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
       E = ReadyBlocks.end(); I != E; ++I) {
    SIBlockSchedCandidate TryCand;
    TryCand.Block = *I;
    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
    TryCand.VGPRUsageDiff =
      checkRegUsageImpact(TryCand.Block->getInRegs(),
                          TryCand.Block->getOutRegs())[AMDGPU::RegisterPressureSets::VGPR_32];
    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
    TryCand.NumHighLatencySuccessors =
      TryCand.Block->getNumHighLatencySuccessors();
    TryCand.LastPosHighLatParentScheduled =
      (unsigned int) std::max<int> (0,
         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
           LastPosWaitedHighLatency);
    TryCand.Height = TryCand.Block->Height;
    // Try not to increase VGPR usage too much, else we may spill.
    if (VregCurrentUsage > 120 ||
        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
      if (!tryCandidateRegUsage(Cand, TryCand) &&
          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
        tryCandidateLatency(Cand, TryCand);
    } else {
      if (!tryCandidateLatency(Cand, TryCand))
        tryCandidateRegUsage(Cand, TryCand);
    }
    if (TryCand.Reason != NoCand) {
      Cand.setBest(TryCand);
      Best = I;
      LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
                        << getReasonStr(Cand.Reason) << '\n');
    }
  }

  LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
             dbgs() << "Is a block with high latency instruction: "
                    << (Cand.IsHighLatency ? "yes\n" : "no\n");
             dbgs() << "Position of last high latency dependency: "
                    << Cand.LastPosHighLatParentScheduled << '\n';
             dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
             dbgs() << '\n';);

  Block = Cand.Block;
  ReadyBlocks.erase(Best);
  return Block;
}
1630
1631// Tracking of currently alive registers to determine VGPR Usage.
1632
void SIScheduleBlockScheduler::addLiveRegs(std::set<VirtRegOrUnit> &Regs) {
  for (VirtRegOrUnit VRegOrUnit : Regs) {
    // For now only track virtual registers.
    if (!VRegOrUnit.isVirtualReg())
      continue;
    // If not already in the live set, then add it.
    (void)LiveRegs.insert(VRegOrUnit.asVirtualReg());
  }
}

void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
                                                std::set<Register> &Regs) {
  for (Register Reg : Regs) {
    // For now only track virtual registers.
    std::set<Register>::iterator Pos = LiveRegs.find(Reg);
    assert(Pos != LiveRegs.end() && // Reg must be live.
           LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
           LiveRegsConsumers[Reg] >= 1);
    --LiveRegsConsumers[Reg];
    if (LiveRegsConsumers[Reg] == 0)
      LiveRegs.erase(Pos);
  }
}

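// Notify the successors of Parent that one of their predecessors has been
// scheduled: a block whose last predecessor was just scheduled becomes
// ready, and data successors of a high latency block record the position at
// which that latency was issued.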
void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
  for (const auto &Block : Parent->getSuccs()) {
    if (--BlockNumPredsLeft[Block.first->getID()] == 0)
      ReadyBlocks.push_back(Block.first);

    if (Parent->isHighLatencyBlock() &&
        Block.second == SIScheduleBlockLinkKind::Data)
      LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
  }
}

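// Update the liveness and readiness bookkeeping after Block has been
// scheduled: consume its input registers, make its outputs live, release its
// successors, and advance the high latency wait position.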
void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
  decreaseLiveRegs(Block, Block->getInRegs());
  LiveRegs.insert(Block->getOutRegs().begin(), Block->getOutRegs().end());
  releaseBlockSuccs(Block);
  for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) {
    // Since we produce this register, it must not already be live.
    assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
           LiveRegsConsumers[RegP.first] == 0);
    LiveRegsConsumers[RegP.first] += RegP.second;
  }
  if (LastPosHighLatencyParentScheduled[Block->getID()] >
        (unsigned)LastPosWaitedHighLatency)
    LastPosWaitedHighLatency =
      LastPosHighLatencyParentScheduled[Block->getID()];
  ++NumBlockScheduled;
}

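// Compute, for every register pressure set, the pressure delta that would
// result from scheduling a block with the given input and output registers:
// inputs for which this block is the last consumer stop being live (negative
// contribution), while outputs become live (positive contribution).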
std::vector<int>
SIScheduleBlockScheduler::checkRegUsageImpact(std::set<Register> &InRegs,
                                              std::set<Register> &OutRegs) {
  std::vector<int> DiffSetPressure;
  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);

  for (Register Reg : InRegs) {
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    if (LiveRegsConsumers[Reg] > 1)
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(VirtRegOrUnit(Reg));
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] -= PSetI.getWeight();
    }
  }

  for (Register Reg : OutRegs) {
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(VirtRegOrUnit(Reg));
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] += PSetI.getWeight();
    }
  }

  return DiffSetPressure;
}

// SIScheduler //

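// Run one complete scheduling attempt: build the blocks with the requested
// block creation variant, order them with the requested block scheduling
// variant, and return the flattened SU order along with the maximal register
// usage observed.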
struct SIScheduleBlockResult
SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
  std::vector<SIScheduleBlock*> ScheduledBlocks;
  struct SIScheduleBlockResult Res;

  ScheduledBlocks = Scheduler.getBlocks();

  for (SIScheduleBlock *Block : ScheduledBlocks) {
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs)
      Res.SUs.push_back(SU->NodeNum);
  }

  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
  return Res;
}

// SIScheduleDAGMI //

SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) {
  SITII = static_cast<const SIInstrInfo*>(TII);
  SITRI = static_cast<const SIRegisterInfo*>(TRI);
}

SIScheduleDAGMI::~SIScheduleDAGMI() = default;

// Code adapted from ScheduleDAG.cpp.
// Performs a topological sort over the SUs, recording both the top-down and
// the bottom-up order.
void SIScheduleDAGMI::topologicalSort() {
  Topo.InitDAGTopologicalSorting();

  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
}

// Move low latency instructions further from their users without (in
// general) increasing SGPR usage. This is meant to be replaced by a better
// pass that would take SGPR usage into account (based on VGPR usage and the
// corresponding wavefront count), try to merge groups of loads when it makes
// sense, etc.
void SIScheduleDAGMI::moveLowLatencies() {
  unsigned DAGSize = SUnits.size();
  int LastLowLatencyUser = -1;
  int LastLowLatencyPos = -1;

  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[ScheduledSUnits[i]];
    bool IsLowLatencyUser = false;
    unsigned MinPos = 0;

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
        IsLowLatencyUser = true;
      }
      if (Pred->NodeNum >= DAGSize)
        continue;
      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
      if (PredPos >= MinPos)
        MinPos = PredPos + 1;
    }

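    // For a low latency instruction, hoist it as early as its dependencies
    // allow, but never before the last low latency instruction or the last
    // user of one, so consecutive low latency loads stay grouped and their
    // users are placed after them.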
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      unsigned BestPos = LastLowLatencyUser + 1;
      if ((int)BestPos <= LastLowLatencyPos)
        BestPos = LastLowLatencyPos + 1;
      if (BestPos < MinPos)
        BestPos = MinPos;
      if (BestPos < i) {
        for (unsigned u = i; u > BestPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[BestPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
      }
      LastLowLatencyPos = BestPos;
      if (IsLowLatencyUser)
        LastLowLatencyUser = BestPos;
    } else if (IsLowLatencyUser) {
      LastLowLatencyUser = i;
    // Also move the COPY instructions that the low latency instructions
    // depend on.
    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
      bool CopyForLowLat = false;
      for (SDep& SuccDep : SU->Succs) {
        SUnit *Succ = SuccDep.getSUnit();
        if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
          continue;
        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
          CopyForLowLat = true;
        }
      }
      if (!CopyForLowLat)
        continue;
      if (MinPos < i) {
        for (unsigned u = i; u > MinPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[MinPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
      }
    }
  }
}

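// Restore the dependency counters that scheduling consumed, using the backup
// taken before the first scheduling attempt, so that the SUnits can be
// scheduled again from scratch.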
void SIScheduleDAGMI::restoreSULinksLeft() {
  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    SUnits[i].isScheduled = false;
    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
  }
}

// Return the VGPR and SGPR usage corresponding to some virtual registers.
template<typename Iterator> void
SIScheduleDAGMI::fillVgprSgprCost(Iterator First, Iterator End,
                                  unsigned &VgprUsage, unsigned &SgprUsage) {
  VgprUsage = 0;
  SgprUsage = 0;
  for (Iterator RegI = First; RegI != End; ++RegI) {
    Register Reg = *RegI;
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    PSetIterator PSetI = MRI.getPressureSets(VirtRegOrUnit(Reg));
    for (; PSetI.isValid(); ++PSetI) {
      if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32)
        VgprUsage += PSetI.getWeight();
      else if (*PSetI == AMDGPU::RegisterPressureSets::SReg_32)
        SgprUsage += PSetI.getWeight();
    }
  }
}

void SIScheduleDAGMI::schedule() {
  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  SIScheduleBlockResult Best, Temp;
  LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");

  buildDAGWithRegPressure();
  postProcessDAG();

  LLVM_DEBUG(dump());
  if (PrintDAGs)
    dump();
  if (ViewMISchedDAGs)
    viewGraph();

  topologicalSort();
  findRootsAndBiasEdges(TopRoots, BotRoots);
  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
  // functions, but to make them happy we must initialize
  // the default Scheduler implementation (even if we do not
  // run it).
  SchedImpl->initialize(this);
  initQueues(TopRoots, BotRoots);

  // Fill some stats to help scheduling.

  SUnitsLinksBackup = SUnits;
  IsLowLatencySU.clear();
  LowLatencyOffset.clear();
  IsHighLatencySU.clear();

  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    const MachineOperand *BaseLatOp;
    int64_t OffLatReg;
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      IsLowLatencySU[i] = 1;
      bool OffsetIsScalable;
      if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
                                         OffsetIsScalable, TRI))
        LowLatencyOffset[i] = OffLatReg;
    } else if (SITII->isHighLatencyDef(SU->getInstr()->getOpcode()))
      IsHighLatencySU[i] = 1;
  }

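  // Start with the variant that usually performs best: isolate high latency
  // instructions in their own blocks and order blocks latency-first, with
  // register usage as a tie-breaker.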
  SIScheduler Scheduler(this);
  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);

  // If VGPR usage is extremely high, try other well-performing variants
  // which could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 180) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
      { LatenciesAlone, BlockRegUsageLatency },
//      { LatenciesAlone, BlockRegUsage },
      { LatenciesGrouped, BlockLatencyRegUsage },
//      { LatenciesGrouped, BlockRegUsageLatency },
//      { LatenciesGrouped, BlockRegUsage },
      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
//      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant,
                   SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
  // If VGPR usage is still extremely high, we may spill. Try other variants
  // which perform worse, but that could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 200) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
//      { LatenciesAlone, BlockRegUsageLatency },
      { LatenciesAlone, BlockRegUsage },
//      { LatenciesGrouped, BlockLatencyRegUsage },
      { LatenciesGrouped, BlockRegUsageLatency },
      { LatenciesGrouped, BlockRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant,
                   SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }

  ScheduledSUnits = Best.SUs;
  ScheduledSUnitsInv.resize(SUnits.size());

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
  }

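  // Post-pass on the flattened schedule: hoist low latency instructions away
  // from their users to give them time to complete.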
  moveLowLatencies();

  // Tell the outside world about the result of the scheduling.

  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
  TopRPTracker.setPos(CurrentTop);

  for (unsigned I : ScheduledSUnits) {
    SUnit *SU = &SUnits[I];

    scheduleMI(SU, true);

    LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                      << *SU->getInstr());
  }

  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  LLVM_DEBUG({
    dbgs() << "*** Final schedule for "
           << printMBBReference(*begin()->getParent()) << " ***\n";
    dumpSchedule();
    dbgs() << '\n';
  });
}