1 | //===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// This contains a MachineSchedStrategy implementation for maximizing wave |
11 | /// occupancy on GCN hardware. |
12 | /// |
13 | /// This pass will apply multiple scheduling stages to the same function. |
14 | /// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual |
15 | /// entry point for the scheduling of those regions is |
16 | /// GCNScheduleDAGMILive::runSchedStages. |
///
18 | /// Generally, the reason for having multiple scheduling stages is to account |
19 | /// for the kernel-wide effect of register usage on occupancy. Usually, only a |
20 | /// few scheduling regions will have register pressure high enough to limit |
21 | /// occupancy for the kernel, so constraints can be relaxed to improve ILP in |
22 | /// other regions. |
23 | /// |
24 | //===----------------------------------------------------------------------===// |
25 | |
26 | #include "GCNSchedStrategy.h" |
27 | #include "AMDGPUIGroupLP.h" |
28 | #include "SIMachineFunctionInfo.h" |
29 | #include "llvm/CodeGen/RegisterClassInfo.h" |
30 | |
31 | #define DEBUG_TYPE "machine-scheduler" |
32 | |
33 | using namespace llvm; |
34 | |
35 | static cl::opt<bool> DisableUnclusterHighRP( |
    "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
    cl::desc("Disable unclustered high register pressure "
             "reduction scheduling stage."),
    cl::init(false));
40 | |
41 | static cl::opt<bool> DisableClusteredLowOccupancy( |
    "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
    cl::desc("Disable clustered low occupancy "
             "rescheduling for ILP scheduling stage."),
    cl::init(false));
46 | |
47 | static cl::opt<unsigned> ScheduleMetricBias( |
    "amdgpu-schedule-metric-bias", cl::Hidden,
    cl::desc(
        "Sets the bias which adds weight to occupancy vs latency. Set it to "
        "100 to chase the occupancy only."),
    cl::init(10));
53 | |
54 | static cl::opt<bool> |
    RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
               cl::desc("Relax occupancy targets for kernels which are memory "
                        "bound (amdgpu-membound-threshold), or "
                        "Wave Limited (amdgpu-limit-wave-threshold)."),
               cl::init(false));
60 | |
61 | const unsigned ScheduleMetrics::ScaleFactor = 100; |
62 | |
63 | GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) |
64 | : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), |
65 | HasHighPressure(false) {} |
66 | |
67 | void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { |
  GenericScheduler::initialize(DAG);
69 | |
70 | MF = &DAG->MF; |
71 | |
72 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
73 | |
74 | SGPRExcessLimit = |
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
  VGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
78 | |
79 | SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); |
  // Set the initial TargetOccupancy to the maximum occupancy that we can
81 | // achieve for this function. This effectively sets a lower bound on the |
82 | // 'Critical' register limits in the scheduler. |
83 | // Allow for lower occupancy targets if kernel is wave limited or memory |
84 | // bound, and using the relaxed occupancy feature. |
85 | TargetOccupancy = |
86 | RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy(); |
87 | SGPRCriticalLimit = |
      std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
89 | |
90 | if (!KnownExcessRP) { |
91 | VGPRCriticalLimit = |
        std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
93 | } else { |
    // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result, except
    // it returns a reasonably small number for targets with lots of VGPRs,
    // such as GFX10 and GFX11.
    LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
                         "VGPRCriticalLimit calculation method.\n");
99 | |
    unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST);
    unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
    unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
    VGPRBudget = std::max(VGPRBudget, Granule);
    VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
105 | } |
106 | |
107 | // Subtract error margin and bias from register limits and avoid overflow. |
  SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit);
  VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit);
  SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit);
  VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit);
112 | |
113 | LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit |
114 | << ", VGPRExcessLimit = " << VGPRExcessLimit |
115 | << ", SGPRCriticalLimit = " << SGPRCriticalLimit |
116 | << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n" ); |
117 | } |
118 | |
119 | /// Checks whether \p SU can use the cached DAG pressure diffs to compute the |
120 | /// current register pressure. |
121 | /// |
122 | /// This works for the common case, but it has a few exceptions that have been |
123 | /// observed through trial and error: |
124 | /// - Explicit physical register operands |
125 | /// - Subregister definitions |
126 | /// |
127 | /// In both of those cases, PressureDiff doesn't represent the actual pressure, |
128 | /// and querying LiveIntervals through the RegPressureTracker is needed to get |
129 | /// an accurate value. |
130 | /// |
131 | /// We should eventually only use PressureDiff for maximum performance, but this |
132 | /// already allows 80% of SUs to take the fast path without changing scheduling |
133 | /// at all. Further changes would either change scheduling, or require a lot |
134 | /// more logic to recover an accurate pressure estimate from the PressureDiffs. |
135 | static bool canUsePressureDiffs(const SUnit &SU) { |
136 | if (!SU.isInstr()) |
137 | return false; |
138 | |
139 | // Cannot use pressure diffs for subregister defs or with physregs, it's |
140 | // imprecise in both cases. |
141 | for (const auto &Op : SU.getInstr()->operands()) { |
142 | if (!Op.isReg() || Op.isImplicit()) |
143 | continue; |
144 | if (Op.getReg().isPhysical() || |
145 | (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister)) |
146 | return false; |
147 | } |
148 | return true; |
149 | } |
150 | |
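// Query the given RegPressureTracker for the pressure that would result from
// scheduling \p SU at the top (downward) or bottom (upward) of the current
// region.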
151 | static void getRegisterPressures(bool AtTop, |
152 | const RegPressureTracker &RPTracker, SUnit *SU, |
153 | std::vector<unsigned> &Pressure, |
154 | std::vector<unsigned> &MaxPressure) { |
155 | // getDownwardPressure() and getUpwardPressure() make temporary changes to |
156 | // the tracker, so we need to pass those function a non-const copy. |
157 | RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker); |
158 | if (AtTop) |
    TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
  else
    TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
162 | } |
163 | |
164 | void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, |
165 | bool AtTop, |
166 | const RegPressureTracker &RPTracker, |
167 | const SIRegisterInfo *SRI, |
168 | unsigned SGPRPressure, |
169 | unsigned VGPRPressure, bool IsBottomUp) { |
170 | Cand.SU = SU; |
171 | Cand.AtTop = AtTop; |
172 | |
173 | if (!DAG->isTrackingPressure()) |
174 | return; |
175 | |
176 | Pressure.clear(); |
177 | MaxPressure.clear(); |
178 | |
179 | // We try to use the cached PressureDiffs in the ScheduleDAG whenever |
180 | // possible over querying the RegPressureTracker. |
181 | // |
182 | // RegPressureTracker will make a lot of LIS queries which are very |
183 | // expensive, it is considered a slow function in this context. |
184 | // |
185 | // PressureDiffs are precomputed and cached, and getPressureDiff is just a |
186 | // trivial lookup into an array. It is pretty much free. |
187 | // |
188 | // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of |
189 | // PressureDiffs. |
  if (AtTop || !canUsePressureDiffs(*SU)) {
191 | getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure); |
192 | } else { |
193 | // Reserve 4 slots. |
    Pressure.resize(4, 0);
195 | Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure; |
196 | Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure; |
197 | |
198 | for (const auto &Diff : DAG->getPressureDiff(SU)) { |
199 | if (!Diff.isValid()) |
200 | continue; |
201 | // PressureDiffs is always bottom-up so if we're working top-down we need |
202 | // to invert its sign. |
203 | Pressure[Diff.getPSet()] += |
204 | (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc()); |
205 | } |
206 | |
207 | #ifdef EXPENSIVE_CHECKS |
208 | std::vector<unsigned> CheckPressure, CheckMaxPressure; |
209 | getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure); |
210 | if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] != |
211 | CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] || |
212 | Pressure[AMDGPU::RegisterPressureSets::VGPR_32] != |
213 | CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) { |
214 | errs() << "Register Pressure is inaccurate when calculated through " |
215 | "PressureDiff\n" |
216 | << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32] |
217 | << ", expected " |
218 | << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n" |
219 | << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32] |
220 | << ", expected " |
221 | << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n" ; |
222 | report_fatal_error("inaccurate register pressure calculation" ); |
223 | } |
224 | #endif |
225 | } |
226 | |
227 | unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; |
228 | unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; |
229 | |
230 | // If two instructions increase the pressure of different register sets |
231 | // by the same amount, the generic scheduler will prefer to schedule the |
232 | // instruction that increases the set with the least amount of registers, |
233 | // which in our case would be SGPRs. This is rarely what we want, so |
234 | // when we report excess/critical register pressure, we do it either |
235 | // only for VGPRs or only for SGPRs. |
236 | |
237 | // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs. |
238 | const unsigned MaxVGPRPressureInc = 16; |
239 | bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit; |
240 | bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit; |
241 | |
242 | // FIXME: We have to enter REG-EXCESS before we reach the actual threshold |
243 | // to increase the likelihood we don't go over the limits. We should improve |
244 | // the analysis to look through dependencies to find the path with the least |
245 | // register pressure. |
246 | |
247 | // We only need to update the RPDelta for instructions that increase register |
248 | // pressure. Instructions that decrease or keep reg pressure the same will be |
249 | // marked as RegExcess in tryCandidate() when they are compared with |
250 | // instructions that increase the register pressure. |
251 | if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) { |
252 | HasHighPressure = true; |
253 | Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); |
254 | Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit); |
255 | } |
256 | |
257 | if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) { |
258 | HasHighPressure = true; |
259 | Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32); |
260 | Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit); |
261 | } |
262 | |
263 | // Register pressure is considered 'CRITICAL' if it is approaching a value |
264 | // that would reduce the wave occupancy for the execution unit. When |
265 | // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both |
266 | // has the same cost, so we don't need to prefer one over the other. |
267 | |
268 | int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit; |
269 | int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit; |
270 | |
271 | if (SGPRDelta >= 0 || VGPRDelta >= 0) { |
272 | HasHighPressure = true; |
273 | if (SGPRDelta > VGPRDelta) { |
274 | Cand.RPDelta.CriticalMax = |
275 | PressureChange(AMDGPU::RegisterPressureSets::SReg_32); |
276 | Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta); |
277 | } else { |
278 | Cand.RPDelta.CriticalMax = |
279 | PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); |
280 | Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta); |
281 | } |
282 | } |
283 | } |
284 | |
285 | // This function is mostly cut and pasted from |
286 | // GenericScheduler::pickNodeFromQueue() |
287 | void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, |
288 | const CandPolicy &ZonePolicy, |
289 | const RegPressureTracker &RPTracker, |
290 | SchedCandidate &Cand, |
291 | bool IsBottomUp) { |
292 | const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI); |
293 | ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos(); |
294 | unsigned SGPRPressure = 0; |
295 | unsigned VGPRPressure = 0; |
296 | if (DAG->isTrackingPressure()) { |
297 | SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; |
298 | VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; |
299 | } |
300 | ReadyQueue &Q = Zone.Available; |
301 | for (SUnit *SU : Q) { |
302 | |
303 | SchedCandidate TryCand(ZonePolicy); |
    initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
                  VGPRPressure, IsBottomUp);
306 | // Pass SchedBoundary only when comparing nodes from the same boundary. |
307 | SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; |
    tryCandidate(Cand, TryCand, ZoneArg);
    if (TryCand.Reason != NoCand) {
      // Initialize resource delta if needed in case future heuristics query it.
      if (TryCand.ResDelta == SchedResourceDelta())
        TryCand.initResourceDelta(Zone.DAG, SchedModel);
313 | Cand.setBest(TryCand); |
314 | LLVM_DEBUG(traceCandidate(Cand)); |
315 | } |
316 | } |
317 | } |
318 | |
319 | // This function is mostly cut and pasted from |
320 | // GenericScheduler::pickNodeBidirectional() |
321 | SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { |
322 | // Schedule as far as possible in the direction of no choice. This is most |
323 | // efficient, but also provides the best heuristics for CriticalPSets. |
324 | if (SUnit *SU = Bot.pickOnlyChoice()) { |
325 | IsTopNode = false; |
326 | return SU; |
327 | } |
328 | if (SUnit *SU = Top.pickOnlyChoice()) { |
329 | IsTopNode = true; |
330 | return SU; |
331 | } |
332 | // Set the bottom-up policy based on the state of the current bottom zone and |
333 | // the instructions outside the zone, including the top zone. |
334 | CandPolicy BotPolicy; |
  setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
  // Set the top-down policy based on the state of the current top zone and
  // the instructions outside the zone, including the bottom zone.
  CandPolicy TopPolicy;
  setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
340 | |
341 | // See if BotCand is still valid (because we previously scheduled from Top). |
342 | LLVM_DEBUG(dbgs() << "Picking from Bot:\n" ); |
343 | if (!BotCand.isValid() || BotCand.SU->isScheduled || |
344 | BotCand.Policy != BotPolicy) { |
    BotCand.reset(CandPolicy());
    pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
                      /*IsBottomUp=*/true);
    assert(BotCand.Reason != NoCand && "failed to find the first candidate");
349 | } else { |
350 | LLVM_DEBUG(traceCandidate(BotCand)); |
351 | #ifndef NDEBUG |
352 | if (VerifyScheduling) { |
353 | SchedCandidate TCand; |
354 | TCand.reset(CandPolicy()); |
355 | pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand, |
356 | /*IsBottomUp=*/true); |
357 | assert(TCand.SU == BotCand.SU && |
358 | "Last pick result should correspond to re-picking right now" ); |
359 | } |
360 | #endif |
361 | } |
362 | |
363 | // Check if the top Q has a better candidate. |
364 | LLVM_DEBUG(dbgs() << "Picking from Top:\n" ); |
365 | if (!TopCand.isValid() || TopCand.SU->isScheduled || |
366 | TopCand.Policy != TopPolicy) { |
    TopCand.reset(CandPolicy());
    pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
                      /*IsBottomUp=*/false);
    assert(TopCand.Reason != NoCand && "failed to find the first candidate");
371 | } else { |
372 | LLVM_DEBUG(traceCandidate(TopCand)); |
373 | #ifndef NDEBUG |
374 | if (VerifyScheduling) { |
375 | SchedCandidate TCand; |
376 | TCand.reset(CandPolicy()); |
377 | pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand, |
378 | /*IsBottomUp=*/false); |
379 | assert(TCand.SU == TopCand.SU && |
380 | "Last pick result should correspond to re-picking right now" ); |
381 | } |
382 | #endif |
383 | } |
384 | |
385 | // Pick best from BotCand and TopCand. |
386 | LLVM_DEBUG(dbgs() << "Top Cand: " ; traceCandidate(TopCand); |
387 | dbgs() << "Bot Cand: " ; traceCandidate(BotCand);); |
388 | SchedCandidate Cand = BotCand; |
389 | TopCand.Reason = NoCand; |
  tryCandidate(Cand, TopCand, nullptr);
391 | if (TopCand.Reason != NoCand) { |
392 | Cand.setBest(TopCand); |
393 | } |
394 | LLVM_DEBUG(dbgs() << "Picking: " ; traceCandidate(Cand);); |
395 | |
396 | IsTopNode = Cand.AtTop; |
397 | return Cand.SU; |
398 | } |
399 | |
400 | // This function is mostly cut and pasted from |
401 | // GenericScheduler::pickNode() |
402 | SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { |
403 | if (DAG->top() == DAG->bottom()) { |
404 | assert(Top.Available.empty() && Top.Pending.empty() && |
405 | Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage" ); |
406 | return nullptr; |
407 | } |
408 | SUnit *SU; |
409 | do { |
410 | if (RegionPolicy.OnlyTopDown) { |
411 | SU = Top.pickOnlyChoice(); |
412 | if (!SU) { |
413 | CandPolicy NoPolicy; |
        TopCand.reset(NoPolicy);
        pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
                          /*IsBottomUp=*/false);
        assert(TopCand.Reason != NoCand && "failed to find a candidate");
418 | SU = TopCand.SU; |
419 | } |
420 | IsTopNode = true; |
421 | } else if (RegionPolicy.OnlyBottomUp) { |
422 | SU = Bot.pickOnlyChoice(); |
423 | if (!SU) { |
424 | CandPolicy NoPolicy; |
        BotCand.reset(NoPolicy);
        pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
                          /*IsBottomUp=*/true);
        assert(BotCand.Reason != NoCand && "failed to find a candidate");
429 | SU = BotCand.SU; |
430 | } |
431 | IsTopNode = false; |
432 | } else { |
433 | SU = pickNodeBidirectional(IsTopNode); |
434 | } |
435 | } while (SU->isScheduled); |
436 | |
437 | if (SU->isTopReady()) |
438 | Top.removeReady(SU); |
439 | if (SU->isBottomReady()) |
440 | Bot.removeReady(SU); |
441 | |
442 | LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " |
443 | << *SU->getInstr()); |
444 | return SU; |
445 | } |
446 | |
447 | GCNSchedStageID GCNSchedStrategy::getCurrentStage() { |
448 | assert(CurrentStage && CurrentStage != SchedStages.end()); |
449 | return *CurrentStage; |
450 | } |
451 | |
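// Advance to the next scheduling stage, or to the first stage on the initial
// call. Returns false once all stages have been exhausted.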
452 | bool GCNSchedStrategy::advanceStage() { |
453 | assert(CurrentStage != SchedStages.end()); |
454 | if (!CurrentStage) |
455 | CurrentStage = SchedStages.begin(); |
456 | else |
457 | CurrentStage++; |
458 | |
459 | return CurrentStage != SchedStages.end(); |
460 | } |
461 | |
462 | bool GCNSchedStrategy::hasNextStage() const { |
463 | assert(CurrentStage); |
  return std::next(CurrentStage) != SchedStages.end();
465 | } |
466 | |
467 | GCNSchedStageID GCNSchedStrategy::getNextStage() const { |
468 | assert(CurrentStage && std::next(CurrentStage) != SchedStages.end()); |
  return *std::next(CurrentStage);
470 | } |
471 | |
472 | GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( |
473 | const MachineSchedContext *C) |
474 | : GCNSchedStrategy(C) { |
  SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
  SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
  SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
  SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
479 | } |
480 | |
481 | GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C) |
482 | : GCNSchedStrategy(C) { |
  SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
484 | } |
485 | |
486 | bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand, |
487 | SchedCandidate &TryCand, |
488 | SchedBoundary *Zone) const { |
489 | // Initialize the candidate if needed. |
490 | if (!Cand.isValid()) { |
491 | TryCand.Reason = NodeOrder; |
492 | return true; |
493 | } |
494 | |
495 | // Avoid spilling by exceeding the register limit. |
496 | if (DAG->isTrackingPressure() && |
      tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
                  RegExcess, TRI, DAG->MF))
499 | return TryCand.Reason != NoCand; |
500 | |
  // Bias physical register defs and copies towards their uses and definitions,
  // respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
504 | return TryCand.Reason != NoCand; |
505 | |
506 | bool SameBoundary = Zone != nullptr; |
507 | if (SameBoundary) { |
508 | // Prioritize instructions that read unbuffered resources by stall cycles. |
    if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
                Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
511 | return TryCand.Reason != NoCand; |
512 | |
513 | // Avoid critical resource consumption and balance the schedule. |
514 | TryCand.initResourceDelta(DAG, SchedModel); |
    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
                TryCand, Cand, ResourceReduce))
      return TryCand.Reason != NoCand;
    if (tryGreater(TryCand.ResDelta.DemandedResources,
                   Cand.ResDelta.DemandedResources, TryCand, Cand,
                   ResourceDemand))
521 | return TryCand.Reason != NoCand; |
522 | |
523 | // Unconditionally try to reduce latency. |
    if (tryLatency(TryCand, Cand, *Zone))
525 | return TryCand.Reason != NoCand; |
526 | |
527 | // Weak edges are for clustering and other constraints. |
    if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
                getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
530 | return TryCand.Reason != NoCand; |
531 | } |
532 | |
533 | // Keep clustered nodes together to encourage downstream peephole |
534 | // optimizations which may reduce resource requirements. |
535 | // |
536 | // This is a best effort to set things up for a post-RA pass. Optimizations |
537 | // like generating loads of multiple registers should ideally be done within |
538 | // the scheduler pass by combining the loads during DAG postprocessing. |
539 | const SUnit *CandNextClusterSU = |
540 | Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); |
541 | const SUnit *TryCandNextClusterSU = |
542 | TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); |
  if (tryGreater(TryCand.SU == TryCandNextClusterSU,
                 Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
545 | return TryCand.Reason != NoCand; |
546 | |
547 | // Avoid increasing the max critical pressure in the scheduled region. |
548 | if (DAG->isTrackingPressure() && |
      tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
                  TryCand, Cand, RegCritical, TRI, DAG->MF))
551 | return TryCand.Reason != NoCand; |
552 | |
553 | // Avoid increasing the max pressure of the entire region. |
554 | if (DAG->isTrackingPressure() && |
      tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
                  Cand, RegMax, TRI, DAG->MF))
557 | return TryCand.Reason != NoCand; |
558 | |
559 | if (SameBoundary) { |
560 | // Fall through to original instruction order. |
561 | if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) || |
562 | (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { |
563 | TryCand.Reason = NodeOrder; |
564 | return true; |
565 | } |
566 | } |
567 | return false; |
568 | } |
569 | |
570 | GCNScheduleDAGMILive::GCNScheduleDAGMILive( |
571 | MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S) |
572 | : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()), |
573 | MFI(*MF.getInfo<SIMachineFunctionInfo>()), |
574 | StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) { |
575 | |
576 | LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n" ); |
577 | if (RelaxedOcc) { |
    MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
579 | if (MinOccupancy != StartingOccupancy) |
580 | LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy |
581 | << ".\n" ); |
582 | } |
583 | } |
584 | |
585 | std::unique_ptr<GCNSchedStage> |
586 | GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) { |
587 | switch (SchedStageID) { |
588 | case GCNSchedStageID::OccInitialSchedule: |
    return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
  case GCNSchedStageID::UnclusteredHighRPReschedule:
    return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
    return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
  case GCNSchedStageID::PreRARematerialize:
    return std::make_unique<PreRARematStage>(SchedStageID, *this);
  case GCNSchedStageID::ILPInitialSchedule:
    return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
598 | } |
599 | |
600 | llvm_unreachable("Unknown SchedStageID." ); |
601 | } |
602 | |
603 | void GCNScheduleDAGMILive::schedule() { |
604 | // Collect all scheduling regions. The actual scheduling is performed in |
605 | // GCNScheduleDAGMILive::finalizeSchedule. |
  Regions.push_back(std::pair(RegionBegin, RegionEnd));
607 | } |
608 | |
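// Recompute the maximum register pressure for region \p RegionIdx from its
// cached live-in set using a downward RP tracker.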
609 | GCNRegPressure |
610 | GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { |
611 | GCNDownwardRPTracker RPTracker(*LIS); |
  RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
613 | return RPTracker.moveMaxPressure(); |
614 | } |
615 | |
616 | void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, |
617 | const MachineBasicBlock *MBB) { |
618 | GCNDownwardRPTracker RPTracker(*LIS); |
619 | |
  // If the block has only one successor, then the live-ins of that successor
  // are the live-outs of the current block. We can reuse the calculated live
  // set if the successor will be scheduled after the current block.

  // However, due to a bug in LiveInterval analysis, two predecessors of the
  // same successor block may end up with different lane bitmasks for a
  // live-out register. Work around that by sticking to a one-to-one
  // relationship, i.e. one predecessor with one successor block.
628 | const MachineBasicBlock *OnlySucc = nullptr; |
629 | if (MBB->succ_size() == 1) { |
630 | auto *Candidate = *MBB->succ_begin(); |
631 | if (!Candidate->empty() && Candidate->pred_size() == 1) { |
632 | SlotIndexes *Ind = LIS->getSlotIndexes(); |
      if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
634 | OnlySucc = Candidate; |
635 | } |
636 | } |
637 | |
638 | // Scheduler sends regions from the end of the block upwards. |
639 | size_t CurRegion = RegionIdx; |
640 | for (size_t E = Regions.size(); CurRegion != E; ++CurRegion) |
641 | if (Regions[CurRegion].first->getParent() != MBB) |
642 | break; |
643 | --CurRegion; |
644 | |
645 | auto I = MBB->begin(); |
  auto LiveInIt = MBBLiveIns.find(MBB);
  auto &Rgn = Regions[CurRegion];
  auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
649 | if (LiveInIt != MBBLiveIns.end()) { |
650 | auto LiveIn = std::move(LiveInIt->second); |
    RPTracker.reset(*MBB->begin(), &LiveIn);
    MBBLiveIns.erase(LiveInIt);
  } else {
    I = Rgn.first;
    auto LRS = BBLiveInMap.lookup(NonDbgMI);
#ifdef EXPENSIVE_CHECKS
    assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
#endif
    RPTracker.reset(*I, &LRS);
660 | } |
661 | |
662 | for (;;) { |
663 | I = RPTracker.getNext(); |
664 | |
665 | if (Regions[CurRegion].first == I || NonDbgMI == I) { |
666 | LiveIns[CurRegion] = RPTracker.getLiveRegs(); |
667 | RPTracker.clearMaxPressure(); |
668 | } |
669 | |
670 | if (Regions[CurRegion].second == I) { |
671 | Pressure[CurRegion] = RPTracker.moveMaxPressure(); |
672 | if (CurRegion-- == RegionIdx) |
673 | break; |
674 | } |
675 | RPTracker.advanceToNext(); |
676 | RPTracker.advanceBeforeNext(); |
677 | } |
678 | |
679 | if (OnlySucc) { |
680 | if (I != MBB->end()) { |
681 | RPTracker.advanceToNext(); |
      RPTracker.advance(MBB->end());
683 | } |
684 | RPTracker.advanceBeforeNext(); |
685 | MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs(); |
686 | } |
687 | } |
688 | |
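// Compute, for each block that contains scheduling regions, the set of
// registers live before the block's first scheduling region.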
689 | DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> |
690 | GCNScheduleDAGMILive::getBBLiveInMap() const { |
691 | assert(!Regions.empty()); |
692 | std::vector<MachineInstr *> BBStarters; |
  BBStarters.reserve(Regions.size());
694 | auto I = Regions.rbegin(), E = Regions.rend(); |
695 | auto *BB = I->first->getParent(); |
696 | do { |
    auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
    BBStarters.push_back(MI);
699 | do { |
700 | ++I; |
701 | } while (I != E && I->first->getParent() == BB); |
702 | } while (I != E); |
  return getLiveRegMap(BBStarters, false /*After*/, *LIS);
704 | } |
705 | |
706 | void GCNScheduleDAGMILive::finalizeSchedule() { |
707 | // Start actual scheduling here. This function is called by the base |
708 | // MachineScheduler after all regions have been recorded by |
709 | // GCNScheduleDAGMILive::schedule(). |
  LiveIns.resize(Regions.size());
  Pressure.resize(Regions.size());
  RescheduleRegions.resize(Regions.size());
  RegionsWithHighRP.resize(Regions.size());
  RegionsWithExcessRP.resize(Regions.size());
  RegionsWithMinOcc.resize(Regions.size());
  RegionsWithIGLPInstrs.resize(Regions.size());
717 | RescheduleRegions.set(); |
718 | RegionsWithHighRP.reset(); |
719 | RegionsWithExcessRP.reset(); |
720 | RegionsWithMinOcc.reset(); |
721 | RegionsWithIGLPInstrs.reset(); |
722 | |
723 | runSchedStages(); |
724 | } |
725 | |
726 | void GCNScheduleDAGMILive::runSchedStages() { |
727 | LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n" ); |
728 | |
729 | if (!Regions.empty()) |
730 | BBLiveInMap = getBBLiveInMap(); |
731 | |
732 | GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl); |
733 | while (S.advanceStage()) { |
    auto Stage = createSchedStage(S.getCurrentStage());
735 | if (!Stage->initGCNSchedStage()) |
736 | continue; |
737 | |
738 | for (auto Region : Regions) { |
739 | RegionBegin = Region.first; |
740 | RegionEnd = Region.second; |
741 | // Setup for scheduling the region and check whether it should be skipped. |
742 | if (!Stage->initGCNRegion()) { |
743 | Stage->advanceRegion(); |
744 | exitRegion(); |
745 | continue; |
746 | } |
747 | |
748 | ScheduleDAGMILive::schedule(); |
749 | Stage->finalizeGCNRegion(); |
750 | } |
751 | |
752 | Stage->finalizeGCNSchedStage(); |
753 | } |
754 | } |
755 | |
756 | #ifndef NDEBUG |
757 | raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) { |
758 | switch (StageID) { |
759 | case GCNSchedStageID::OccInitialSchedule: |
760 | OS << "Max Occupancy Initial Schedule" ; |
761 | break; |
762 | case GCNSchedStageID::UnclusteredHighRPReschedule: |
763 | OS << "Unclustered High Register Pressure Reschedule" ; |
764 | break; |
765 | case GCNSchedStageID::ClusteredLowOccupancyReschedule: |
766 | OS << "Clustered Low Occupancy Reschedule" ; |
767 | break; |
768 | case GCNSchedStageID::PreRARematerialize: |
769 | OS << "Pre-RA Rematerialize" ; |
770 | break; |
771 | case GCNSchedStageID::ILPInitialSchedule: |
772 | OS << "Max ILP Initial Schedule" ; |
773 | break; |
774 | } |
775 | |
776 | return OS; |
777 | } |
778 | #endif |
779 | |
780 | GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) |
781 | : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF), |
782 | MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {} |
783 | |
784 | bool GCNSchedStage::initGCNSchedStage() { |
785 | if (!DAG.LIS) |
786 | return false; |
787 | |
788 | LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n" ); |
789 | return true; |
790 | } |
791 | |
792 | bool UnclusteredHighRPStage::initGCNSchedStage() { |
793 | if (DisableUnclusterHighRP) |
794 | return false; |
795 | |
796 | if (!GCNSchedStage::initGCNSchedStage()) |
797 | return false; |
798 | |
799 | if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none()) |
800 | return false; |
801 | |
  SavedMutations.swap(DAG.Mutations);
  DAG.addMutation(
      createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));
805 | |
806 | InitialOccupancy = DAG.MinOccupancy; |
  // Aggressively try to reduce register pressure in the unclustered high RP
  // stage. Temporarily increase occupancy target in the region.
  S.SGPRLimitBias = S.HighRPSGPRBias;
  S.VGPRLimitBias = S.HighRPVGPRBias;
  if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
    MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);

  LLVM_DEBUG(
      dbgs()
      << "Retrying function scheduling without clustering. "
         "Aggressively try to reduce register pressure to achieve occupancy "
      << DAG.MinOccupancy << ".\n");
819 | |
820 | return true; |
821 | } |
822 | |
823 | bool ClusteredLowOccStage::initGCNSchedStage() { |
824 | if (DisableClusteredLowOccupancy) |
825 | return false; |
826 | |
827 | if (!GCNSchedStage::initGCNSchedStage()) |
828 | return false; |
829 | |
830 | // Don't bother trying to improve ILP in lower RP regions if occupancy has not |
831 | // been dropped. All regions will have already been scheduled with the ideal |
832 | // occupancy targets. |
833 | if (DAG.StartingOccupancy <= DAG.MinOccupancy) |
834 | return false; |
835 | |
836 | LLVM_DEBUG( |
837 | dbgs() << "Retrying function scheduling with lowest recorded occupancy " |
838 | << DAG.MinOccupancy << ".\n" ); |
839 | return true; |
840 | } |
841 | |
842 | bool PreRARematStage::initGCNSchedStage() { |
843 | if (!GCNSchedStage::initGCNSchedStage()) |
844 | return false; |
845 | |
846 | if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1) |
847 | return false; |
848 | |
849 | const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); |
850 | // Check maximum occupancy |
  if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
852 | DAG.MinOccupancy) |
853 | return false; |
854 | |
  // FIXME: This pass will invalidate cached MBBLiveIns for regions in between
  // the defs and the regions we sink the defs to. Cached pressure for regions
  // where a def is sunk from will also be invalidated. This will need to be
  // fixed if there is another pass after this pass.
859 | assert(!S.hasNextStage()); |
860 | |
861 | collectRematerializableInstructions(); |
862 | if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII)) |
863 | return false; |
864 | |
865 | LLVM_DEBUG( |
866 | dbgs() << "Retrying function scheduling with improved occupancy of " |
867 | << DAG.MinOccupancy << " from rematerializing\n" ); |
868 | return true; |
869 | } |
870 | |
871 | void GCNSchedStage::finalizeGCNSchedStage() { |
872 | DAG.finishBlock(); |
873 | LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n" ); |
874 | } |
875 | |
876 | void UnclusteredHighRPStage::finalizeGCNSchedStage() { |
  SavedMutations.swap(DAG.Mutations);
878 | S.SGPRLimitBias = S.VGPRLimitBias = 0; |
879 | if (DAG.MinOccupancy > InitialOccupancy) { |
880 | for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX) |
881 | DAG.RegionsWithMinOcc[IDX] = |
          DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;
883 | |
884 | LLVM_DEBUG(dbgs() << StageID |
885 | << " stage successfully increased occupancy to " |
886 | << DAG.MinOccupancy << '\n'); |
887 | } |
888 | |
889 | GCNSchedStage::finalizeGCNSchedStage(); |
890 | } |
891 | |
892 | bool GCNSchedStage::initGCNRegion() { |
893 | // Check whether this new region is also a new block. |
894 | if (DAG.RegionBegin->getParent() != CurrentMBB) |
895 | setupNewBlock(); |
896 | |
  unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
  DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
899 | |
900 | // Skip empty scheduling regions (0 or 1 schedulable instructions). |
  if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
902 | return false; |
903 | |
904 | LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n" ); |
905 | LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB) |
906 | << " " << CurrentMBB->getName() |
907 | << "\n From: " << *DAG.begin() << " To: " ; |
908 | if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd; |
909 | else dbgs() << "End" ; |
910 | dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); |
911 | |
912 | // Save original instruction order before scheduling for possible revert. |
913 | Unsched.clear(); |
  Unsched.reserve(DAG.NumRegionInstrs);
915 | if (StageID == GCNSchedStageID::OccInitialSchedule || |
916 | StageID == GCNSchedStageID::ILPInitialSchedule) { |
917 | for (auto &I : DAG) { |
      Unsched.push_back(&I);
919 | if (I.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER || |
920 | I.getOpcode() == AMDGPU::IGLP_OPT) |
921 | DAG.RegionsWithIGLPInstrs[RegionIdx] = true; |
922 | } |
923 | } else { |
924 | for (auto &I : DAG) |
      Unsched.push_back(&I);
926 | } |
927 | |
928 | PressureBefore = DAG.Pressure[RegionIdx]; |
929 | |
930 | LLVM_DEBUG( |
931 | dbgs() << "Pressure before scheduling:\nRegion live-ins:" |
932 | << print(DAG.LiveIns[RegionIdx], DAG.MRI) |
933 | << "Region live-in pressure: " |
934 | << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx])) |
935 | << "Region register pressure: " << print(PressureBefore)); |
936 | |
937 | S.HasHighPressure = false; |
938 | S.KnownExcessRP = isRegionWithExcessRP(); |
939 | |
940 | if (DAG.RegionsWithIGLPInstrs[RegionIdx] && |
941 | StageID != GCNSchedStageID::UnclusteredHighRPReschedule) { |
942 | SavedMutations.clear(); |
    SavedMutations.swap(DAG.Mutations);
    bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
                          StageID == GCNSchedStageID::ILPInitialSchedule;
    DAG.addMutation(createIGroupLPDAGMutation(
        IsInitialStage ? AMDGPU::SchedulingPhase::Initial
                       : AMDGPU::SchedulingPhase::PreRAReentry));
949 | } |
950 | |
951 | return true; |
952 | } |
953 | |
954 | bool UnclusteredHighRPStage::initGCNRegion() { |
955 | // Only reschedule regions with the minimum occupancy or regions that may have |
956 | // spilling (excess register pressure). |
957 | if ((!DAG.RegionsWithMinOcc[RegionIdx] || |
958 | DAG.MinOccupancy <= InitialOccupancy) && |
959 | !DAG.RegionsWithExcessRP[RegionIdx]) |
960 | return false; |
961 | |
962 | return GCNSchedStage::initGCNRegion(); |
963 | } |
964 | |
965 | bool ClusteredLowOccStage::initGCNRegion() { |
  // We may need to reschedule this region if it wasn't rescheduled in the last
  // stage, or if we found it was testing critical register pressure limits in
  // the unclustered reschedule stage. The latter is because we may not have
  // been able to raise the min occupancy in the previous stage, so the region
  // may be overly constrained even if it was already rescheduled.
971 | if (!DAG.RegionsWithHighRP[RegionIdx]) |
972 | return false; |
973 | |
974 | return GCNSchedStage::initGCNRegion(); |
975 | } |
976 | |
977 | bool PreRARematStage::initGCNRegion() { |
978 | if (!DAG.RescheduleRegions[RegionIdx]) |
979 | return false; |
980 | |
981 | return GCNSchedStage::initGCNRegion(); |
982 | } |
983 | |
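// Finish the previously tracked block, if any, and start liveness tracking for
// the block containing the current region.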
984 | void GCNSchedStage::setupNewBlock() { |
985 | if (CurrentMBB) |
986 | DAG.finishBlock(); |
987 | |
988 | CurrentMBB = DAG.RegionBegin->getParent(); |
  DAG.startBlock(CurrentMBB);
  // Get real RP for the region if it hasn't been calculated before. After the
  // initial schedule stage, real RP will be collected after scheduling.
  if (StageID == GCNSchedStageID::OccInitialSchedule ||
      StageID == GCNSchedStageID::ILPInitialSchedule)
    DAG.computeBlockPressure(RegionIdx, CurrentMBB);
995 | } |
996 | |
997 | void GCNSchedStage::finalizeGCNRegion() { |
998 | DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); |
999 | DAG.RescheduleRegions[RegionIdx] = false; |
1000 | if (S.HasHighPressure) |
1001 | DAG.RegionsWithHighRP[RegionIdx] = true; |
1002 | |
1003 | // Revert scheduling if we have dropped occupancy or there is some other |
1004 | // reason that the original schedule is better. |
1005 | checkScheduling(); |
1006 | |
1007 | if (DAG.RegionsWithIGLPInstrs[RegionIdx] && |
1008 | StageID != GCNSchedStageID::UnclusteredHighRPReschedule) |
    SavedMutations.swap(DAG.Mutations);
1010 | |
1011 | DAG.exitRegion(); |
1012 | RegionIdx++; |
1013 | } |
1014 | |
1015 | void GCNSchedStage::checkScheduling() { |
1016 | // Check the results of scheduling. |
1017 | PressureAfter = DAG.getRealRegPressure(RegionIdx); |
1018 | LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter)); |
1019 | LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n" ); |
1020 | |
1021 | if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && |
      PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
1023 | DAG.Pressure[RegionIdx] = PressureAfter; |
1024 | DAG.RegionsWithMinOcc[RegionIdx] = |
1025 | PressureAfter.getOccupancy(ST) == DAG.MinOccupancy; |
1026 | |
1027 | // Early out if we have achieved the occupancy target. |
1028 | LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n" ); |
1029 | return; |
1030 | } |
1031 | |
1032 | unsigned TargetOccupancy = |
      std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF));
  unsigned WavesAfter =
      std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
  unsigned WavesBefore =
      std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
1038 | LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore |
1039 | << ", after " << WavesAfter << ".\n" ); |
1040 | |
1041 | // We may not be able to keep the current target occupancy because of the just |
1042 | // scheduled region. We might still be able to revert scheduling if the |
1043 | // occupancy before was higher, or if the current schedule has register |
1044 | // pressure higher than the excess limits which could lead to more spilling. |
  unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
1046 | |
1047 | // Allow memory bound functions to drop to 4 waves if not limited by an |
1048 | // attribute. |
1049 | if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy && |
1050 | WavesAfter >= MFI.getMinAllowedOccupancy()) { |
1051 | LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to " |
1052 | << MFI.getMinAllowedOccupancy() << " waves\n" ); |
1053 | NewOccupancy = WavesAfter; |
1054 | } |
1055 | |
1056 | if (NewOccupancy < DAG.MinOccupancy) { |
1057 | DAG.MinOccupancy = NewOccupancy; |
    MFI.limitOccupancy(DAG.MinOccupancy);
1059 | DAG.RegionsWithMinOcc.reset(); |
1060 | LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " |
1061 | << DAG.MinOccupancy << ".\n" ); |
1062 | } |
1063 | // The maximum number of arch VGPR on non-unified register file, or the |
1064 | // maximum VGPR + AGPR in the unified register file case. |
1065 | unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); |
1066 | // The maximum number of arch VGPR for both unified and non-unified register |
1067 | // file. |
  unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
1069 | unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); |
1070 | |
  if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
      PressureAfter.getVGPRNum(false) > MaxArchVGPRs ||
1073 | PressureAfter.getAGPRNum() > MaxArchVGPRs || |
1074 | PressureAfter.getSGPRNum() > MaxSGPRs) { |
1075 | DAG.RescheduleRegions[RegionIdx] = true; |
1076 | DAG.RegionsWithHighRP[RegionIdx] = true; |
1077 | DAG.RegionsWithExcessRP[RegionIdx] = true; |
1078 | } |
1079 | |
1080 | // Revert if this region's schedule would cause a drop in occupancy or |
1081 | // spilling. |
1082 | if (shouldRevertScheduling(WavesAfter)) { |
1083 | revertScheduling(); |
1084 | } else { |
1085 | DAG.Pressure[RegionIdx] = PressureAfter; |
1086 | DAG.RegionsWithMinOcc[RegionIdx] = |
1087 | PressureAfter.getOccupancy(ST) == DAG.MinOccupancy; |
1088 | } |
1089 | } |
1090 | |
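// Compute the earliest cycle at which \p SU can issue, based on the ready
// cycles of its register-dependence predecessors plus their latencies, and
// record it in \p ReadyCycles.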
1091 | unsigned |
1092 | GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle, |
1093 | DenseMap<unsigned, unsigned> &ReadyCycles, |
1094 | const TargetSchedModel &SM) { |
1095 | unsigned ReadyCycle = CurrCycle; |
1096 | for (auto &D : SU.Preds) { |
1097 | if (D.isAssignedRegDep()) { |
1098 | MachineInstr *DefMI = D.getSUnit()->getInstr(); |
      unsigned Latency = SM.computeInstrLatency(DefMI);
      unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
      ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
1102 | } |
1103 | } |
1104 | ReadyCycles[SU.NodeNum] = ReadyCycle; |
1105 | return ReadyCycle; |
1106 | } |
1107 | |
1108 | #ifndef NDEBUG |
1109 | struct EarlierIssuingCycle { |
1110 | bool operator()(std::pair<MachineInstr *, unsigned> A, |
1111 | std::pair<MachineInstr *, unsigned> B) const { |
1112 | return A.second < B.second; |
1113 | } |
1114 | }; |
1115 | |
1116 | static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>, |
1117 | EarlierIssuingCycle> &ReadyCycles) { |
1118 | if (ReadyCycles.empty()) |
1119 | return; |
1120 | unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber(); |
1121 | dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum |
1122 | << " ##################\n# Cycle #\t\t\tInstruction " |
1123 | " " |
1124 | " \n" ; |
1125 | unsigned IPrev = 1; |
1126 | for (auto &I : ReadyCycles) { |
1127 | if (I.second > IPrev + 1) |
1128 | dbgs() << "****************************** BUBBLE OF " << I.second - IPrev |
1129 | << " CYCLES DETECTED ******************************\n\n" ; |
1130 | dbgs() << "[ " << I.second << " ] : " << *I.first << "\n" ; |
1131 | IPrev = I.second; |
1132 | } |
1133 | } |
1134 | #endif |
1135 | |
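// Estimate schedule quality by issuing the instructions of \p InputSchedule in
// order, one per cycle, and summing the stall (bubble) cycles spent waiting
// for operands to become ready.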
1136 | ScheduleMetrics |
1137 | GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) { |
1138 | #ifndef NDEBUG |
1139 | std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> |
1140 | ReadyCyclesSorted; |
1141 | #endif |
1142 | const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); |
1143 | unsigned SumBubbles = 0; |
1144 | DenseMap<unsigned, unsigned> ReadyCycles; |
1145 | unsigned CurrCycle = 0; |
1146 | for (auto &SU : InputSchedule) { |
1147 | unsigned ReadyCycle = |
1148 | computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM); |
1149 | SumBubbles += ReadyCycle - CurrCycle; |
1150 | #ifndef NDEBUG |
1151 | ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle)); |
1152 | #endif |
1153 | CurrCycle = ++ReadyCycle; |
1154 | } |
1155 | #ifndef NDEBUG |
1156 | LLVM_DEBUG( |
1157 | printScheduleModel(ReadyCyclesSorted); |
1158 | dbgs() << "\n\t" |
1159 | << "Metric: " |
1160 | << (SumBubbles |
1161 | ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle |
1162 | : 1) |
1163 | << "\n\n" ); |
1164 | #endif |
1165 | |
1166 | return ScheduleMetrics(CurrCycle, SumBubbles); |
1167 | } |
1168 | |
1169 | ScheduleMetrics |
1170 | GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) { |
1171 | #ifndef NDEBUG |
1172 | std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> |
1173 | ReadyCyclesSorted; |
1174 | #endif |
1175 | const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); |
1176 | unsigned SumBubbles = 0; |
1177 | DenseMap<unsigned, unsigned> ReadyCycles; |
1178 | unsigned CurrCycle = 0; |
1179 | for (auto &MI : DAG) { |
    SUnit *SU = DAG.getSUnit(&MI);
    if (!SU)
      continue;
    unsigned ReadyCycle =
        computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
1185 | SumBubbles += ReadyCycle - CurrCycle; |
1186 | #ifndef NDEBUG |
1187 | ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle)); |
1188 | #endif |
1189 | CurrCycle = ++ReadyCycle; |
1190 | } |
1191 | #ifndef NDEBUG |
1192 | LLVM_DEBUG( |
1193 | printScheduleModel(ReadyCyclesSorted); |
1194 | dbgs() << "\n\t" |
1195 | << "Metric: " |
1196 | << (SumBubbles |
1197 | ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle |
1198 | : 1) |
1199 | << "\n\n" ); |
1200 | #endif |
1201 | |
1202 | return ScheduleMetrics(CurrCycle, SumBubbles); |
1203 | } |
1204 | |
1205 | bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { |
1206 | if (WavesAfter < DAG.MinOccupancy) |
1207 | return true; |
1208 | |
1209 | return false; |
1210 | } |
1211 | |
1212 | bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { |
1213 | if (PressureAfter == PressureBefore) |
1214 | return false; |
1215 | |
1216 | if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) |
1217 | return true; |
1218 | |
1219 | if (mayCauseSpilling(WavesAfter)) |
1220 | return true; |
1221 | |
1222 | return false; |
1223 | } |
1224 | |
1225 | bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { |
1226 | // If RP is not reduced in the unclustered reschedule stage, revert to the |
1227 | // old schedule. |
1228 | if ((WavesAfter <= PressureBefore.getOccupancy(ST) && |
1229 | mayCauseSpilling(WavesAfter)) || |
1230 | GCNSchedStage::shouldRevertScheduling(WavesAfter)) { |
1231 | LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n" ); |
1232 | return true; |
1233 | } |
1234 | |
1235 | // Do not attempt to relax schedule even more if we are already spilling. |
1236 | if (isRegionWithExcessRP()) |
1237 | return false; |
1238 | |
1239 | LLVM_DEBUG( |
1240 | dbgs() |
1241 | << "\n\t *** In shouldRevertScheduling ***\n" |
1242 | << " *********** BEFORE UnclusteredHighRPStage ***********\n" ); |
  ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
1245 | LLVM_DEBUG( |
1246 | dbgs() |
1247 | << "\n *********** AFTER UnclusteredHighRPStage ***********\n" ); |
1248 | ScheduleMetrics MAfter = getScheduleMetrics(DAG); |
1249 | unsigned OldMetric = MBefore.getMetric(); |
1250 | unsigned NewMetric = MAfter.getMetric(); |
1251 | unsigned WavesBefore = |
      std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
1253 | unsigned Profit = |
1254 | ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore * |
1255 | ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) / |
1256 | NewMetric) / |
1257 | ScheduleMetrics::ScaleFactor; |
1258 | LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after " |
1259 | << MAfter << "Profit: " << Profit << "\n" ); |
1260 | return Profit < ScheduleMetrics::ScaleFactor; |
1261 | } |
1262 | |
1263 | bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { |
1264 | if (PressureAfter == PressureBefore) |
1265 | return false; |
1266 | |
1267 | if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) |
1268 | return true; |
1269 | |
1270 | if (mayCauseSpilling(WavesAfter)) |
1271 | return true; |
1272 | |
1273 | return false; |
1274 | } |
1275 | |
1276 | bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { |
1277 | if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) |
1278 | return true; |
1279 | |
1280 | if (mayCauseSpilling(WavesAfter)) |
1281 | return true; |
1282 | |
1283 | return false; |
1284 | } |
1285 | |
1286 | bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { |
1287 | if (mayCauseSpilling(WavesAfter)) |
1288 | return true; |
1289 | |
1290 | return false; |
1291 | } |
1292 | |
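// Conservatively assume the new schedule may cause spilling if occupancy is
// already at the minimum, the region exceeds a register pressure limit, and
// the new pressure is not lower than the old one.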
1293 | bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { |
1294 | if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() && |
      !PressureAfter.less(MF, PressureBefore)) {
1296 | LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n" ); |
1297 | return true; |
1298 | } |
1299 | |
1300 | return false; |
1301 | } |
1302 | |
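// Restore the original (pre-scheduling) instruction order for the region and
// update live intervals and the recorded region boundaries accordingly.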
1303 | void GCNSchedStage::revertScheduling() { |
1304 | DAG.RegionsWithMinOcc[RegionIdx] = |
1305 | PressureBefore.getOccupancy(ST) == DAG.MinOccupancy; |
1306 | LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n" ); |
1307 | DAG.RescheduleRegions[RegionIdx] = |
1308 | S.hasNextStage() && |
1309 | S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule; |
1310 | DAG.RegionEnd = DAG.RegionBegin; |
1311 | int SkippedDebugInstr = 0; |
1312 | for (MachineInstr *MI : Unsched) { |
1313 | if (MI->isDebugInstr()) { |
1314 | ++SkippedDebugInstr; |
1315 | continue; |
1316 | } |
1317 | |
1318 | if (MI->getIterator() != DAG.RegionEnd) { |
      DAG.BB->remove(MI);
      DAG.BB->insert(DAG.RegionEnd, MI);
      if (!MI->isDebugInstr())
        DAG.LIS->handleMove(*MI, true);
1323 | } |
1324 | |
1325 | // Reset read-undef flags and update them later. |
1326 | for (auto &Op : MI->all_defs()) |
1327 | Op.setIsUndef(false); |
1328 | RegisterOperands RegOpers; |
    RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
1330 | if (!MI->isDebugInstr()) { |
1331 | if (DAG.ShouldTrackLaneMasks) { |
1332 | // Adjust liveness and add missing dead+read-undef flags. |
        SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
        RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
      } else {
        // Adjust for missing dead-def flags.
        RegOpers.detectDeadDefs(*MI, *DAG.LIS);
1338 | } |
1339 | } |
1340 | DAG.RegionEnd = MI->getIterator(); |
1341 | ++DAG.RegionEnd; |
1342 | LLVM_DEBUG(dbgs() << "Scheduling " << *MI); |
1343 | } |
1344 | |
  // After reverting the schedule, debug instrs will now be at the end of the
  // block and RegionEnd will point to the first debug instr. Increment
  // RegionEnd past the debug instrs to the actual end of the scheduling region.
1348 | while (SkippedDebugInstr-- > 0) |
1349 | ++DAG.RegionEnd; |
1350 | |
// If the instruction at Unsched.front() is a debug instruction, this will
// actually shrink the region since we moved all debug instructions to the end
// of the block. Find the first instruction that is not a debug instruction.
1354 | DAG.RegionBegin = Unsched.front()->getIterator(); |
1355 | if (DAG.RegionBegin->isDebugInstr()) { |
1356 | for (MachineInstr *MI : Unsched) { |
1357 | if (MI->isDebugInstr()) |
1358 | continue; |
1359 | DAG.RegionBegin = MI->getIterator(); |
1360 | break; |
1361 | } |
1362 | } |
1363 | |
1364 | // Then move the debug instructions back into their correct place and set |
1365 | // RegionBegin and RegionEnd if needed. |
1366 | DAG.placeDebugValues(); |
1367 | |
1368 | DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); |
1369 | } |
1370 | |
1371 | void PreRARematStage::collectRematerializableInstructions() { |
1372 | const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI); |
1373 | for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) { |
1374 | Register Reg = Register::index2VirtReg(Index: I); |
1375 | if (!DAG.LIS->hasInterval(Reg)) |
1376 | continue; |
1377 | |
1378 | // TODO: Handle AGPR and SGPR rematerialization |
1379 | if (!SRI->isVGPRClass(RC: DAG.MRI.getRegClass(Reg)) || |
1380 | !DAG.MRI.hasOneDef(RegNo: Reg) || !DAG.MRI.hasOneNonDBGUse(RegNo: Reg)) |
1381 | continue; |
1382 | |
1383 | MachineOperand *Op = DAG.MRI.getOneDef(Reg); |
1384 | MachineInstr *Def = Op->getParent(); |
1385 | if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(MI: *Def)) |
1386 | continue; |
1387 | |
1388 | MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(RegNo: Reg); |
1389 | if (Def->getParent() == UseI->getParent()) |
1390 | continue; |
1391 | |
// We only collect defs that are defined in a different block than their single
// use and are live-through or used inside regions at MinOccupancy. This means
// that the register must be in the live-in set for the region.
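// Illustrative outcome (hypothetical register and region numbers): for a def
// such as %42:vgpr_32 = V_MOV_B32_e32 1 whose single non-debug use sits in a
// different block, and which is live into regions 3 and 4 where only region 3
// is at MinOccupancy, the loop below records
//   RematerializableInsts[3][Def] = UseI;
//   RematDefToLiveInRegions[Def]  = {3, 4};
// so the sinking stage knows both where rematerializing can help occupancy and
// which cached live-in sets must later be updated.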
1395 | bool AddedToRematList = false; |
1396 | for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { |
1397 | auto It = DAG.LiveIns[I].find(Val: Reg); |
1398 | if (It != DAG.LiveIns[I].end() && !It->second.none()) { |
1399 | if (DAG.RegionsWithMinOcc[I]) { |
1400 | RematerializableInsts[I][Def] = UseI; |
1401 | AddedToRematList = true; |
1402 | } |
1403 | |
1404 | // Collect regions with rematerializable reg as live-in to avoid |
1405 | // searching later when updating RP. |
1406 | RematDefToLiveInRegions[Def].push_back(Elt: I); |
1407 | } |
1408 | } |
1409 | if (!AddedToRematList) |
1410 | RematDefToLiveInRegions.erase(Val: Def); |
1411 | } |
1412 | } |
1413 | |
1414 | bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST, |
1415 | const TargetInstrInfo *TII) { |
1416 | // Temporary copies of cached variables we will be modifying and replacing if |
1417 | // sinking succeeds. |
1418 | SmallVector< |
1419 | std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32> |
1420 | NewRegions; |
1421 | DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns; |
1422 | DenseMap<unsigned, GCNRegPressure> NewPressure; |
1423 | BitVector NewRescheduleRegions; |
1424 | LiveIntervals *LIS = DAG.LIS; |
1425 | |
1426 | NewRegions.resize(N: DAG.Regions.size()); |
1427 | NewRescheduleRegions.resize(N: DAG.Regions.size()); |
1428 | |
// Collect only regions that have a rematerializable def as a live-in.
1430 | SmallSet<unsigned, 16> ImpactedRegions; |
1431 | for (const auto &It : RematDefToLiveInRegions) |
1432 | ImpactedRegions.insert(I: It.second.begin(), E: It.second.end()); |
1433 | |
1434 | // Make copies of register pressure and live-ins cache that will be updated |
1435 | // as we rematerialize. |
1436 | for (auto Idx : ImpactedRegions) { |
1437 | NewPressure[Idx] = DAG.Pressure[Idx]; |
1438 | NewLiveIns[Idx] = DAG.LiveIns[Idx]; |
1439 | } |
1440 | NewRegions = DAG.Regions; |
1441 | NewRescheduleRegions.reset(); |
1442 | |
1443 | DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef; |
1444 | bool Improved = false; |
1445 | for (auto I : ImpactedRegions) { |
1446 | if (!DAG.RegionsWithMinOcc[I]) |
1447 | continue; |
1448 | |
1449 | Improved = false; |
1450 | int VGPRUsage = NewPressure[I].getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts()); |
1451 | int SGPRUsage = NewPressure[I].getSGPRNum(); |
1452 | |
// TODO: Handle occupancy drop due to AGPR and SGPR.
// Check whether the occupancy drop is caused by VGPR usage rather than SGPRs.
1455 | if (ST.getOccupancyWithNumSGPRs(SGPRs: SGPRUsage) == DAG.MinOccupancy) |
1456 | break; |
1457 | |
1458 | // The occupancy of this region could have been improved by a previous |
1459 | // iteration's sinking of defs. |
1460 | if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) { |
1461 | NewRescheduleRegions[I] = true; |
1462 | Improved = true; |
1463 | continue; |
1464 | } |
1465 | |
// First check if we have enough trivially rematerializable instructions to
// improve occupancy. Optimistically assume every instruction we are able to
// sink decreases RP.
1469 | int TotalSinkableRegs = 0; |
1470 | for (const auto &It : RematerializableInsts[I]) { |
1471 | MachineInstr *Def = It.first; |
1472 | Register DefReg = Def->getOperand(i: 0).getReg(); |
1473 | TotalSinkableRegs += |
1474 | SIRegisterInfo::getNumCoveredRegs(LM: NewLiveIns[I][DefReg]); |
1475 | } |
1476 | int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs; |
1477 | unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRs: VGPRsAfterSink); |
// If we cannot improve occupancy even in the most optimistic scenario, do not
// attempt to sink any instructions.
1480 | if (OptimisticOccupancy <= DAG.MinOccupancy) |
1481 | break; |
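// Illustrative check (hypothetical figures): if this region currently uses 132
// VGPRs and the rematerializable live-ins cover 10 VGPR registers, then
// VGPRsAfterSink = 122. Only if the subtarget maps 122 VGPRs to an occupancy
// strictly greater than DAG.MinOccupancy does the sinking loop below run;
// otherwise the whole sinking attempt is abandoned.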
1482 | |
1483 | unsigned ImproveOccupancy = 0; |
1484 | SmallVector<MachineInstr *, 4> SinkedDefs; |
1485 | for (auto &It : RematerializableInsts[I]) { |
1486 | MachineInstr *Def = It.first; |
1487 | MachineBasicBlock::iterator InsertPos = |
1488 | MachineBasicBlock::iterator(It.second); |
1489 | Register Reg = Def->getOperand(i: 0).getReg(); |
1490 | // Rematerialize MI to its use block. Since we are only rematerializing |
1491 | // instructions that do not have any virtual reg uses, we do not need to |
1492 | // call LiveRangeEdit::allUsesAvailableAt() and |
1493 | // LiveRangeEdit::canRematerializeAt(). |
1494 | TII->reMaterialize(MBB&: *InsertPos->getParent(), MI: InsertPos, DestReg: Reg, |
1495 | SubIdx: Def->getOperand(i: 0).getSubReg(), Orig: *Def, TRI: *DAG.TRI); |
1496 | MachineInstr *NewMI = &*std::prev(x: InsertPos); |
1497 | LIS->InsertMachineInstrInMaps(MI&: *NewMI); |
1498 | LIS->removeInterval(Reg); |
1499 | LIS->createAndComputeVirtRegInterval(Reg); |
1500 | InsertedMIToOldDef[NewMI] = Def; |
1501 | |
// Update region boundaries in the scheduling region we sank from, since we may
// sink an instruction that was at the beginning or end of its region.
1504 | DAG.updateRegionBoundaries(RegionBoundaries&: NewRegions, MI: Def, /*NewMI =*/nullptr, |
1505 | /*Removing =*/true); |
1506 | |
// Update region boundaries in the region we sank to.
1508 | DAG.updateRegionBoundaries(RegionBoundaries&: NewRegions, MI: InsertPos, NewMI); |
1509 | |
1510 | LaneBitmask PrevMask = NewLiveIns[I][Reg]; |
// FIXME: Also update cached pressure for where the def was sunk from.
// Update RP for all regions that have this reg as a live-in, and remove the
// reg from those regions' live-in sets.
1514 | for (auto Idx : RematDefToLiveInRegions[Def]) { |
1515 | NewLiveIns[Idx].erase(Val: Reg); |
1516 | if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) { |
1517 | // Def is live-through and not used in this block. |
1518 | NewPressure[Idx].inc(Reg, PrevMask, NewMask: LaneBitmask::getNone(), MRI: DAG.MRI); |
1519 | } else { |
1520 | // Def is used and rematerialized into this block. |
1521 | GCNDownwardRPTracker RPT(*LIS); |
1522 | auto *NonDbgMI = &*skipDebugInstructionsForward( |
1523 | It: NewRegions[Idx].first, End: NewRegions[Idx].second); |
1524 | RPT.reset(MI: *NonDbgMI, LiveRegs: &NewLiveIns[Idx]); |
1525 | RPT.advance(End: NewRegions[Idx].second); |
1526 | NewPressure[Idx] = RPT.moveMaxPressure(); |
1527 | } |
1528 | } |
1529 | |
1530 | SinkedDefs.push_back(Elt: Def); |
1531 | ImproveOccupancy = NewPressure[I].getOccupancy(ST); |
1532 | if (ImproveOccupancy > DAG.MinOccupancy) |
1533 | break; |
1534 | } |
1535 | |
// Remove defs we just sank from all regions' lists of sinkable defs.
1537 | for (auto &Def : SinkedDefs) |
1538 | for (auto TrackedIdx : RematDefToLiveInRegions[Def]) |
1539 | RematerializableInsts[TrackedIdx].erase(Key: Def); |
1540 | |
1541 | if (ImproveOccupancy <= DAG.MinOccupancy) |
1542 | break; |
1543 | |
1544 | NewRescheduleRegions[I] = true; |
1545 | Improved = true; |
1546 | } |
1547 | |
1548 | if (!Improved) { |
// Occupancy was not improved for every region that was at MinOccupancy.
1550 | // Undo sinking and remove newly rematerialized instructions. |
1551 | for (auto &Entry : InsertedMIToOldDef) { |
1552 | MachineInstr *MI = Entry.first; |
1553 | MachineInstr *OldMI = Entry.second; |
1554 | Register Reg = MI->getOperand(i: 0).getReg(); |
1555 | LIS->RemoveMachineInstrFromMaps(MI&: *MI); |
1556 | MI->eraseFromParent(); |
1557 | OldMI->clearRegisterDeads(Reg); |
1558 | LIS->removeInterval(Reg); |
1559 | LIS->createAndComputeVirtRegInterval(Reg); |
1560 | } |
1561 | return false; |
1562 | } |
1563 | |
// Occupancy was improved for all regions that were at MinOccupancy.
1565 | for (auto &Entry : InsertedMIToOldDef) { |
1566 | MachineInstr *MI = Entry.first; |
1567 | MachineInstr *OldMI = Entry.second; |
1568 | |
1569 | // Remove OldMI from BBLiveInMap since we are sinking it from its MBB. |
1570 | DAG.BBLiveInMap.erase(Val: OldMI); |
1571 | |
1572 | // Remove OldMI and update LIS |
1573 | Register Reg = MI->getOperand(i: 0).getReg(); |
1574 | LIS->RemoveMachineInstrFromMaps(MI&: *OldMI); |
1575 | OldMI->eraseFromParent(); |
1576 | LIS->removeInterval(Reg); |
1577 | LIS->createAndComputeVirtRegInterval(Reg); |
1578 | } |
1579 | |
1580 | // Update live-ins, register pressure, and regions caches. |
1581 | for (auto Idx : ImpactedRegions) { |
1582 | DAG.LiveIns[Idx] = NewLiveIns[Idx]; |
1583 | DAG.Pressure[Idx] = NewPressure[Idx]; |
1584 | DAG.MBBLiveIns.erase(Val: DAG.Regions[Idx].first->getParent()); |
1585 | } |
1586 | DAG.Regions = NewRegions; |
1587 | DAG.RescheduleRegions = NewRescheduleRegions; |
1588 | |
1589 | SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); |
1590 | MFI.increaseOccupancy(MF, Limit: ++DAG.MinOccupancy); |
1591 | |
1592 | return true; |
1593 | } |
1594 | |
1595 | // Copied from MachineLICM |
1596 | bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) { |
1597 | if (!DAG.TII->isTriviallyReMaterializable(MI)) |
1598 | return false; |
1599 | |
1600 | for (const MachineOperand &MO : MI.all_uses()) |
1601 | if (MO.getReg().isVirtual()) |
1602 | return false; |
1603 | |
1604 | return true; |
1605 | } |
1606 | |
// When removing, we will have to check both the beginning and the end of the
// region. When inserting, we only have to check whether we are inserting NewMI
// in front of a scheduling region; we do not need to check the end, since we
// will only ever insert before an already existing MI.
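// Illustrative walk-through (hypothetical region I): if MI is both the stored
// begin and, when removing, the stored end of region I, both bounds become the
// end-of-block iterator. If MI only matches the stored begin, the begin moves
// to std::next(MI) on removal or to NewMI on insertion. If, when removing, MI
// matches only the stored end, the end moves to std::prev(MI).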
1611 | void GCNScheduleDAGMILive::updateRegionBoundaries( |
1612 | SmallVectorImpl<std::pair<MachineBasicBlock::iterator, |
1613 | MachineBasicBlock::iterator>> &RegionBoundaries, |
1614 | MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) { |
1615 | unsigned I = 0, E = RegionBoundaries.size(); |
1616 | // Search for first region of the block where MI is located |
1617 | while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent()) |
1618 | ++I; |
1619 | |
1620 | for (; I != E; ++I) { |
1621 | if (MI->getParent() != RegionBoundaries[I].first->getParent()) |
1622 | return; |
1623 | |
1624 | if (Removing && MI == RegionBoundaries[I].first && |
1625 | MI == RegionBoundaries[I].second) { |
// MI is in a region with size 1; after removing it, the region will have
// size 0, so set RegionBegin and RegionEnd to the end-of-block iterator.
1628 | RegionBoundaries[I] = |
1629 | std::pair(MI->getParent()->end(), MI->getParent()->end()); |
1630 | return; |
1631 | } |
1632 | if (MI == RegionBoundaries[I].first) { |
1633 | if (Removing) |
1634 | RegionBoundaries[I] = |
1635 | std::pair(std::next(x: MI), RegionBoundaries[I].second); |
1636 | else |
1637 | // Inserted NewMI in front of region, set new RegionBegin to NewMI |
1638 | RegionBoundaries[I] = std::pair(MachineBasicBlock::iterator(NewMI), |
1639 | RegionBoundaries[I].second); |
1640 | return; |
1641 | } |
1642 | if (Removing && MI == RegionBoundaries[I].second) { |
1643 | RegionBoundaries[I] = std::pair(RegionBoundaries[I].first, std::prev(x: MI)); |
1644 | return; |
1645 | } |
1646 | } |
1647 | } |
1648 | |
1649 | static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { |
1650 | return std::any_of( |
1651 | first: DAG->begin(), last: DAG->end(), pred: [](MachineBasicBlock::iterator MI) { |
1652 | unsigned Opc = MI->getOpcode(); |
1653 | return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT; |
1654 | }); |
1655 | } |
1656 | |
1657 | GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive( |
1658 | MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S, |
1659 | bool RemoveKillFlags) |
1660 | : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {} |
1661 | |
1662 | void GCNPostScheduleDAGMILive::schedule() { |
1663 | HasIGLPInstrs = hasIGLPInstrs(DAG: this); |
1664 | if (HasIGLPInstrs) { |
1665 | SavedMutations.clear(); |
1666 | SavedMutations.swap(x&: Mutations); |
1667 | addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PostRA)); |
1668 | } |
1669 | |
1670 | ScheduleDAGMI::schedule(); |
1671 | } |
1672 | |
1673 | void GCNPostScheduleDAGMILive::finalizeSchedule() { |
1674 | if (HasIGLPInstrs) |
1675 | SavedMutations.swap(x&: Mutations); |
1676 | |
1677 | ScheduleDAGMI::finalizeSchedule(); |
1678 | } |
1679 | |