1 | //===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// This contains a MachineSchedStrategy implementation for maximizing wave |
11 | /// occupancy on GCN hardware. |
12 | /// |
13 | /// This pass will apply multiple scheduling stages to the same function. |
14 | /// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual |
15 | /// entry point for the scheduling of those regions is |
16 | /// GCNScheduleDAGMILive::runSchedStages. |
///
18 | /// Generally, the reason for having multiple scheduling stages is to account |
19 | /// for the kernel-wide effect of register usage on occupancy. Usually, only a |
20 | /// few scheduling regions will have register pressure high enough to limit |
21 | /// occupancy for the kernel, so constraints can be relaxed to improve ILP in |
22 | /// other regions. |
23 | /// |
24 | //===----------------------------------------------------------------------===// |
25 | |
26 | #include "GCNSchedStrategy.h" |
27 | #include "AMDGPUIGroupLP.h" |
28 | #include "GCNRegPressure.h" |
29 | #include "SIMachineFunctionInfo.h" |
30 | #include "Utils/AMDGPUBaseInfo.h" |
31 | #include "llvm/ADT/STLExtras.h" |
32 | #include "llvm/CodeGen/RegisterClassInfo.h" |
33 | #include "llvm/MC/LaneBitmask.h" |
34 | #include "llvm/Support/ErrorHandling.h" |
35 | |
36 | #define DEBUG_TYPE "machine-scheduler" |
37 | |
38 | using namespace llvm; |
39 | |
static cl::opt<bool> DisableUnclusterHighRP(
"amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
cl::desc("Disable the unclustered high register pressure "
"reduction scheduling stage."),
cl::init(false));

static cl::opt<bool> DisableClusteredLowOccupancy(
"amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
cl::desc("Disable the clustered low occupancy "
"rescheduling for ILP scheduling stage."),
cl::init(false));

static cl::opt<unsigned> ScheduleMetricBias(
"amdgpu-schedule-metric-bias", cl::Hidden,
cl::desc(
"Sets the bias that adds weight to occupancy vs. latency. Set it to "
"100 to chase occupancy only."),
cl::init(10));

static cl::opt<bool>
RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
cl::desc("Relax occupancy targets for kernels which are memory "
"bound (amdgpu-membound-threshold) or "
"wave limited (amdgpu-limit-wave-threshold)."),
cl::init(false));

static cl::opt<bool> GCNTrackers(
"amdgpu-use-amdgpu-trackers", cl::Hidden,
cl::desc("Use the AMDGPU-specific RP trackers during scheduling"),
cl::init(false));
70 | |
71 | const unsigned ScheduleMetrics::ScaleFactor = 100; |
72 | |
73 | GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) |
74 | : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), |
75 | DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) { |
76 | } |
77 | |
78 | void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { |
79 | GenericScheduler::initialize(dag: DAG); |
80 | |
81 | MF = &DAG->MF; |
82 | |
83 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
84 | |
85 | SGPRExcessLimit = |
86 | Context->RegClassInfo->getNumAllocatableRegs(RC: &AMDGPU::SGPR_32RegClass); |
87 | VGPRExcessLimit = |
88 | Context->RegClassInfo->getNumAllocatableRegs(RC: &AMDGPU::VGPR_32RegClass); |
89 | |
90 | SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); |
// Set the initial TargetOccupancy to the maximum occupancy that we can
// achieve for this function. This effectively sets a lower bound on the
// 'Critical' register limits in the scheduler.
// Allow for lower occupancy targets if the kernel is wave limited or memory
// bound and the relaxed occupancy feature is in use.
96 | TargetOccupancy = |
97 | RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy(); |
98 | SGPRCriticalLimit = |
99 | std::min(a: ST.getMaxNumSGPRs(WavesPerEU: TargetOccupancy, Addressable: true), b: SGPRExcessLimit); |
100 | |
101 | if (!KnownExcessRP) { |
102 | VGPRCriticalLimit = std::min( |
103 | a: ST.getMaxNumVGPRs(WavesPerEU: TargetOccupancy, DynamicVGPRBlockSize: MFI.getDynamicVGPRBlockSize()), |
104 | b: VGPRExcessLimit); |
105 | } else { |
// This is similar to the result of ST.getMaxNumVGPRs(TargetOccupancy),
// except that it returns a reasonably small number for targets with lots
// of VGPRs, such as GFX10 and GFX11.
109 | LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative " |
110 | "VGPRCriticalLimit calculation method.\n" ); |
111 | unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize(); |
112 | unsigned Granule = |
113 | AMDGPU::IsaInfo::getVGPRAllocGranule(STI: &ST, DynamicVGPRBlockSize); |
114 | unsigned Addressable = |
115 | AMDGPU::IsaInfo::getAddressableNumVGPRs(STI: &ST, DynamicVGPRBlockSize); |
116 | unsigned VGPRBudget = alignDown(Value: Addressable / TargetOccupancy, Align: Granule); |
117 | VGPRBudget = std::max(a: VGPRBudget, b: Granule); |
118 | VGPRCriticalLimit = std::min(a: VGPRBudget, b: VGPRExcessLimit); |
119 | } |
120 | |
121 | // Subtract error margin and bias from register limits and avoid overflow. |
122 | SGPRCriticalLimit -= std::min(a: SGPRLimitBias + ErrorMargin, b: SGPRCriticalLimit); |
123 | VGPRCriticalLimit -= std::min(a: VGPRLimitBias + ErrorMargin, b: VGPRCriticalLimit); |
124 | SGPRExcessLimit -= std::min(a: SGPRLimitBias + ErrorMargin, b: SGPRExcessLimit); |
125 | VGPRExcessLimit -= std::min(a: VGPRLimitBias + ErrorMargin, b: VGPRExcessLimit); |
126 | |
127 | LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit |
128 | << ", VGPRExcessLimit = " << VGPRExcessLimit |
129 | << ", SGPRCriticalLimit = " << SGPRCriticalLimit |
130 | << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n" ); |
131 | } |
132 | |
133 | /// Checks whether \p SU can use the cached DAG pressure diffs to compute the |
134 | /// current register pressure. |
135 | /// |
136 | /// This works for the common case, but it has a few exceptions that have been |
137 | /// observed through trial and error: |
138 | /// - Explicit physical register operands |
139 | /// - Subregister definitions |
140 | /// |
141 | /// In both of those cases, PressureDiff doesn't represent the actual pressure, |
142 | /// and querying LiveIntervals through the RegPressureTracker is needed to get |
143 | /// an accurate value. |
144 | /// |
145 | /// We should eventually only use PressureDiff for maximum performance, but this |
146 | /// already allows 80% of SUs to take the fast path without changing scheduling |
147 | /// at all. Further changes would either change scheduling, or require a lot |
148 | /// more logic to recover an accurate pressure estimate from the PressureDiffs. |
149 | static bool canUsePressureDiffs(const SUnit &SU) { |
150 | if (!SU.isInstr()) |
151 | return false; |
152 | |
// Cannot use pressure diffs for subregister defs or with physregs; it is
// imprecise in both cases.
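// For example, an instruction that defines only a subregister of a wider
// virtual register, or one with an explicit physical register operand, must
// fall back to the (slower) RegPressureTracker query.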
155 | for (const auto &Op : SU.getInstr()->operands()) { |
156 | if (!Op.isReg() || Op.isImplicit()) |
157 | continue; |
158 | if (Op.getReg().isPhysical() || |
159 | (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister)) |
160 | return false; |
161 | } |
162 | return true; |
163 | } |
164 | |
165 | static void getRegisterPressures( |
166 | bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU, |
167 | std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure, |
168 | GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker, |
169 | ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) { |
// getDownwardPressure() and getUpwardPressure() make temporary changes to
// the tracker, so we need to pass them a non-const reference (hence the
// const_cast below).
172 | RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker); |
173 | if (!GCNTrackers) { |
174 | AtTop |
175 | ? TempTracker.getDownwardPressure(MI: SU->getInstr(), PressureResult&: Pressure, MaxPressureResult&: MaxPressure) |
176 | : TempTracker.getUpwardPressure(MI: SU->getInstr(), PressureResult&: Pressure, MaxPressureResult&: MaxPressure); |
177 | |
178 | return; |
179 | } |
180 | |
181 | // GCNTrackers |
182 | Pressure.resize(new_size: 4, x: 0); |
183 | MachineInstr *MI = SU->getInstr(); |
184 | GCNRegPressure NewPressure; |
185 | if (AtTop) { |
186 | GCNDownwardRPTracker TempDownwardTracker(DownwardTracker); |
187 | NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, TRI: SRI); |
188 | } else { |
189 | GCNUpwardRPTracker TempUpwardTracker(UpwardTracker); |
190 | TempUpwardTracker.recede(MI: *MI); |
191 | NewPressure = TempUpwardTracker.getPressure(); |
192 | } |
193 | Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum(); |
194 | Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = |
195 | NewPressure.getArchVGPRNum(); |
196 | Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum(); |
197 | } |
198 | |
199 | void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, |
200 | bool AtTop, |
201 | const RegPressureTracker &RPTracker, |
202 | const SIRegisterInfo *SRI, |
203 | unsigned SGPRPressure, |
204 | unsigned VGPRPressure, bool IsBottomUp) { |
205 | Cand.SU = SU; |
206 | Cand.AtTop = AtTop; |
207 | |
208 | if (!DAG->isTrackingPressure()) |
209 | return; |
210 | |
211 | Pressure.clear(); |
212 | MaxPressure.clear(); |
213 | |
// We try to use the cached PressureDiffs in the ScheduleDAG whenever
// possible over querying the RegPressureTracker.
//
// RegPressureTracker will make a lot of LIS queries, which are very
// expensive; it is considered a slow path in this context.
//
// PressureDiffs are precomputed and cached, and getPressureDiff is just a
// trivial lookup into an array. It is pretty much free.
//
// In EXPENSIVE_CHECKS builds, we always query the RPTracker to verify the
// results of PressureDiffs.
225 | if (AtTop || !canUsePressureDiffs(SU: *SU) || GCNTrackers) { |
226 | getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure, |
227 | DownwardTracker, UpwardTracker, DAG, SRI); |
228 | } else { |
229 | // Reserve 4 slots. |
230 | Pressure.resize(new_size: 4, x: 0); |
231 | Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure; |
232 | Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure; |
233 | |
234 | for (const auto &Diff : DAG->getPressureDiff(SU)) { |
235 | if (!Diff.isValid()) |
236 | continue; |
// PressureDiffs are always bottom-up, so if we're working top-down we need
// to invert the sign.
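// Illustrative example: a cached diff of "+1 VGPR_32" means scheduling the
// SU from the bottom adds one VGPR_32 pressure unit; when scheduling
// top-down the same diff is applied with its sign flipped.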
239 | Pressure[Diff.getPSet()] += |
240 | (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc()); |
241 | } |
242 | |
243 | #ifdef EXPENSIVE_CHECKS |
244 | std::vector<unsigned> CheckPressure, CheckMaxPressure; |
245 | getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure, |
246 | DownwardTracker, UpwardTracker, DAG, SRI); |
247 | if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] != |
248 | CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] || |
249 | Pressure[AMDGPU::RegisterPressureSets::VGPR_32] != |
250 | CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) { |
251 | errs() << "Register Pressure is inaccurate when calculated through " |
252 | "PressureDiff\n" |
253 | << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32] |
254 | << ", expected " |
255 | << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n" |
256 | << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32] |
257 | << ", expected " |
258 | << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n" ; |
259 | report_fatal_error("inaccurate register pressure calculation" ); |
260 | } |
261 | #endif |
262 | } |
263 | |
264 | unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; |
265 | unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; |
266 | |
// If two instructions increase the pressure of different register sets
// by the same amount, the generic scheduler will prefer to schedule the
// instruction that increases the set with the fewest registers, which in
// our case would be SGPRs. This is rarely what we want, so when we report
// excess/critical register pressure, we do it either only for VGPRs or
// only for SGPRs.
273 | |
274 | // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs. |
275 | const unsigned MaxVGPRPressureInc = 16; |
276 | bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit; |
277 | bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit; |
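// Illustrative example (hypothetical limit): with a VGPR excess limit of
// 256, VGPRs start being tracked once the current VGPR pressure reaches 240
// (256 - MaxVGPRPressureInc); SGPRs are only tracked when VGPRs are not and
// SGPR pressure already meets or exceeds its excess limit.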
278 | |
279 | // FIXME: We have to enter REG-EXCESS before we reach the actual threshold |
280 | // to increase the likelihood we don't go over the limits. We should improve |
281 | // the analysis to look through dependencies to find the path with the least |
282 | // register pressure. |
283 | |
284 | // We only need to update the RPDelta for instructions that increase register |
285 | // pressure. Instructions that decrease or keep reg pressure the same will be |
286 | // marked as RegExcess in tryCandidate() when they are compared with |
287 | // instructions that increase the register pressure. |
288 | if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) { |
289 | HasHighPressure = true; |
290 | Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); |
291 | Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit); |
292 | } |
293 | |
294 | if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) { |
295 | HasHighPressure = true; |
296 | Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32); |
297 | Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit); |
298 | } |
299 | |
// Register pressure is considered 'CRITICAL' if it is approaching a value
// that would reduce the wave occupancy for the execution unit. When
// register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
// carry the same cost, so we don't need to prefer one over the other.
304 | |
305 | int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit; |
306 | int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit; |
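// Illustrative example (hypothetical numbers): with critical limits of 102
// SGPRs / 128 VGPRs and new pressures of 100 SGPRs / 130 VGPRs, only
// VGPRDelta (+2) is non-negative, so the candidate is penalized on the
// VGPR_32 pressure set.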
307 | |
308 | if (SGPRDelta >= 0 || VGPRDelta >= 0) { |
309 | HasHighPressure = true; |
310 | if (SGPRDelta > VGPRDelta) { |
311 | Cand.RPDelta.CriticalMax = |
312 | PressureChange(AMDGPU::RegisterPressureSets::SReg_32); |
313 | Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta); |
314 | } else { |
315 | Cand.RPDelta.CriticalMax = |
316 | PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); |
317 | Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta); |
318 | } |
319 | } |
320 | } |
321 | |
322 | // This function is mostly cut and pasted from |
323 | // GenericScheduler::pickNodeFromQueue() |
324 | void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, |
325 | const CandPolicy &ZonePolicy, |
326 | const RegPressureTracker &RPTracker, |
327 | SchedCandidate &Cand, |
328 | bool IsBottomUp) { |
329 | const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); |
330 | ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos(); |
331 | unsigned SGPRPressure = 0; |
332 | unsigned VGPRPressure = 0; |
333 | if (DAG->isTrackingPressure()) { |
334 | if (!GCNTrackers) { |
335 | SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; |
336 | VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; |
337 | } else { |
338 | GCNRPTracker *T = IsBottomUp |
339 | ? static_cast<GCNRPTracker *>(&UpwardTracker) |
340 | : static_cast<GCNRPTracker *>(&DownwardTracker); |
341 | SGPRPressure = T->getPressure().getSGPRNum(); |
342 | VGPRPressure = T->getPressure().getArchVGPRNum(); |
343 | } |
344 | } |
345 | ReadyQueue &Q = Zone.Available; |
346 | for (SUnit *SU : Q) { |
347 | |
348 | SchedCandidate TryCand(ZonePolicy); |
349 | initCandidate(Cand&: TryCand, SU, AtTop: Zone.isTop(), RPTracker, SRI, SGPRPressure, |
350 | VGPRPressure, IsBottomUp); |
351 | // Pass SchedBoundary only when comparing nodes from the same boundary. |
352 | SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; |
353 | tryCandidate(Cand, TryCand, Zone: ZoneArg); |
354 | if (TryCand.Reason != NoCand) { |
355 | // Initialize resource delta if needed in case future heuristics query it. |
356 | if (TryCand.ResDelta == SchedResourceDelta()) |
357 | TryCand.initResourceDelta(DAG: Zone.DAG, SchedModel); |
358 | Cand.setBest(TryCand); |
359 | LLVM_DEBUG(traceCandidate(Cand)); |
360 | } |
361 | } |
362 | } |
363 | |
364 | // This function is mostly cut and pasted from |
365 | // GenericScheduler::pickNodeBidirectional() |
366 | SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { |
367 | // Schedule as far as possible in the direction of no choice. This is most |
368 | // efficient, but also provides the best heuristics for CriticalPSets. |
369 | if (SUnit *SU = Bot.pickOnlyChoice()) { |
370 | IsTopNode = false; |
371 | return SU; |
372 | } |
373 | if (SUnit *SU = Top.pickOnlyChoice()) { |
374 | IsTopNode = true; |
375 | return SU; |
376 | } |
377 | // Set the bottom-up policy based on the state of the current bottom zone and |
378 | // the instructions outside the zone, including the top zone. |
379 | CandPolicy BotPolicy; |
380 | setPolicy(Policy&: BotPolicy, /*IsPostRA=*/false, CurrZone&: Bot, OtherZone: &Top); |
381 | // Set the top-down policy based on the state of the current top zone and |
382 | // the instructions outside the zone, including the bottom zone. |
383 | CandPolicy TopPolicy; |
384 | setPolicy(Policy&: TopPolicy, /*IsPostRA=*/false, CurrZone&: Top, OtherZone: &Bot); |
385 | |
386 | // See if BotCand is still valid (because we previously scheduled from Top). |
387 | LLVM_DEBUG(dbgs() << "Picking from Bot:\n" ); |
388 | if (!BotCand.isValid() || BotCand.SU->isScheduled || |
389 | BotCand.Policy != BotPolicy) { |
390 | BotCand.reset(NewPolicy: CandPolicy()); |
391 | pickNodeFromQueue(Zone&: Bot, ZonePolicy: BotPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand, |
392 | /*IsBottomUp=*/true); |
393 | assert(BotCand.Reason != NoCand && "failed to find the first candidate" ); |
394 | } else { |
395 | LLVM_DEBUG(traceCandidate(BotCand)); |
396 | #ifndef NDEBUG |
397 | if (VerifyScheduling) { |
398 | SchedCandidate TCand; |
399 | TCand.reset(CandPolicy()); |
400 | pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand, |
401 | /*IsBottomUp=*/true); |
402 | assert(TCand.SU == BotCand.SU && |
403 | "Last pick result should correspond to re-picking right now" ); |
404 | } |
405 | #endif |
406 | } |
407 | |
408 | // Check if the top Q has a better candidate. |
409 | LLVM_DEBUG(dbgs() << "Picking from Top:\n" ); |
410 | if (!TopCand.isValid() || TopCand.SU->isScheduled || |
411 | TopCand.Policy != TopPolicy) { |
412 | TopCand.reset(NewPolicy: CandPolicy()); |
413 | pickNodeFromQueue(Zone&: Top, ZonePolicy: TopPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand, |
414 | /*IsBottomUp=*/false); |
415 | assert(TopCand.Reason != NoCand && "failed to find the first candidate" ); |
416 | } else { |
417 | LLVM_DEBUG(traceCandidate(TopCand)); |
418 | #ifndef NDEBUG |
419 | if (VerifyScheduling) { |
420 | SchedCandidate TCand; |
421 | TCand.reset(CandPolicy()); |
422 | pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand, |
423 | /*IsBottomUp=*/false); |
424 | assert(TCand.SU == TopCand.SU && |
425 | "Last pick result should correspond to re-picking right now" ); |
426 | } |
427 | #endif |
428 | } |
429 | |
430 | // Pick best from BotCand and TopCand. |
431 | LLVM_DEBUG(dbgs() << "Top Cand: " ; traceCandidate(TopCand); |
432 | dbgs() << "Bot Cand: " ; traceCandidate(BotCand);); |
433 | SchedCandidate Cand = BotCand; |
434 | TopCand.Reason = NoCand; |
435 | tryCandidate(Cand, TryCand&: TopCand, Zone: nullptr); |
436 | if (TopCand.Reason != NoCand) { |
437 | Cand.setBest(TopCand); |
438 | } |
439 | LLVM_DEBUG(dbgs() << "Picking: " ; traceCandidate(Cand);); |
440 | |
441 | IsTopNode = Cand.AtTop; |
442 | return Cand.SU; |
443 | } |
444 | |
445 | // This function is mostly cut and pasted from |
446 | // GenericScheduler::pickNode() |
447 | SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { |
448 | if (DAG->top() == DAG->bottom()) { |
449 | assert(Top.Available.empty() && Top.Pending.empty() && |
450 | Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage" ); |
451 | return nullptr; |
452 | } |
453 | SUnit *SU; |
454 | do { |
455 | if (RegionPolicy.OnlyTopDown) { |
456 | SU = Top.pickOnlyChoice(); |
457 | if (!SU) { |
458 | CandPolicy NoPolicy; |
459 | TopCand.reset(NewPolicy: NoPolicy); |
460 | pickNodeFromQueue(Zone&: Top, ZonePolicy: NoPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand, |
461 | /*IsBottomUp=*/false); |
462 | assert(TopCand.Reason != NoCand && "failed to find a candidate" ); |
463 | SU = TopCand.SU; |
464 | } |
465 | IsTopNode = true; |
466 | } else if (RegionPolicy.OnlyBottomUp) { |
467 | SU = Bot.pickOnlyChoice(); |
468 | if (!SU) { |
469 | CandPolicy NoPolicy; |
470 | BotCand.reset(NewPolicy: NoPolicy); |
471 | pickNodeFromQueue(Zone&: Bot, ZonePolicy: NoPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand, |
472 | /*IsBottomUp=*/true); |
473 | assert(BotCand.Reason != NoCand && "failed to find a candidate" ); |
474 | SU = BotCand.SU; |
475 | } |
476 | IsTopNode = false; |
477 | } else { |
478 | SU = pickNodeBidirectional(IsTopNode); |
479 | } |
480 | } while (SU->isScheduled); |
481 | |
482 | if (SU->isTopReady()) |
483 | Top.removeReady(SU); |
484 | if (SU->isBottomReady()) |
485 | Bot.removeReady(SU); |
486 | |
487 | LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " |
488 | << *SU->getInstr()); |
489 | return SU; |
490 | } |
491 | |
492 | void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { |
493 | if (GCNTrackers) { |
494 | MachineInstr *MI = SU->getInstr(); |
495 | IsTopNode ? (void)DownwardTracker.advance(MI, UseInternalIterator: false) |
496 | : UpwardTracker.recede(MI: *MI); |
497 | } |
498 | |
499 | return GenericScheduler::schedNode(SU, IsTopNode); |
500 | } |
501 | |
502 | GCNSchedStageID GCNSchedStrategy::getCurrentStage() { |
503 | assert(CurrentStage && CurrentStage != SchedStages.end()); |
504 | return *CurrentStage; |
505 | } |
506 | |
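// Advance to the next scheduling stage (or to the first one on the initial
// call); returns false once all stages have been exhausted. This is driven
// from GCNScheduleDAGMILive::runSchedStages as: while (S.advanceStage()).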
507 | bool GCNSchedStrategy::advanceStage() { |
508 | assert(CurrentStage != SchedStages.end()); |
509 | if (!CurrentStage) |
510 | CurrentStage = SchedStages.begin(); |
511 | else |
512 | CurrentStage++; |
513 | |
514 | return CurrentStage != SchedStages.end(); |
515 | } |
516 | |
517 | bool GCNSchedStrategy::hasNextStage() const { |
518 | assert(CurrentStage); |
519 | return std::next(x: CurrentStage) != SchedStages.end(); |
520 | } |
521 | |
522 | GCNSchedStageID GCNSchedStrategy::getNextStage() const { |
523 | assert(CurrentStage && std::next(CurrentStage) != SchedStages.end()); |
524 | return *std::next(x: CurrentStage); |
525 | } |
526 | |
527 | GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( |
528 | const MachineSchedContext *C, bool IsLegacyScheduler) |
529 | : GCNSchedStrategy(C) { |
530 | SchedStages.push_back(Elt: GCNSchedStageID::OccInitialSchedule); |
531 | SchedStages.push_back(Elt: GCNSchedStageID::UnclusteredHighRPReschedule); |
532 | SchedStages.push_back(Elt: GCNSchedStageID::ClusteredLowOccupancyReschedule); |
533 | SchedStages.push_back(Elt: GCNSchedStageID::PreRARematerialize); |
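// The AMDGPU-specific RP trackers are not used with the legacy scheduler;
// force-disable them in that case.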
534 | GCNTrackers = GCNTrackers & !IsLegacyScheduler; |
535 | } |
536 | |
537 | GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C) |
538 | : GCNSchedStrategy(C) { |
539 | SchedStages.push_back(Elt: GCNSchedStageID::ILPInitialSchedule); |
540 | } |
541 | |
542 | bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand, |
543 | SchedCandidate &TryCand, |
544 | SchedBoundary *Zone) const { |
545 | // Initialize the candidate if needed. |
546 | if (!Cand.isValid()) { |
547 | TryCand.Reason = NodeOrder; |
548 | return true; |
549 | } |
550 | |
551 | // Avoid spilling by exceeding the register limit. |
552 | if (DAG->isTrackingPressure() && |
553 | tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand, |
554 | Reason: RegExcess, TRI, MF: DAG->MF)) |
555 | return TryCand.Reason != NoCand; |
556 | |
// Bias PhysReg defs and copies towards their uses and definitions,
// respectively.
558 | if (tryGreater(TryVal: biasPhysReg(SU: TryCand.SU, isTop: TryCand.AtTop), |
559 | CandVal: biasPhysReg(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: PhysReg)) |
560 | return TryCand.Reason != NoCand; |
561 | |
562 | bool SameBoundary = Zone != nullptr; |
563 | if (SameBoundary) { |
564 | // Prioritize instructions that read unbuffered resources by stall cycles. |
565 | if (tryLess(TryVal: Zone->getLatencyStallCycles(SU: TryCand.SU), |
566 | CandVal: Zone->getLatencyStallCycles(SU: Cand.SU), TryCand, Cand, Reason: Stall)) |
567 | return TryCand.Reason != NoCand; |
568 | |
569 | // Avoid critical resource consumption and balance the schedule. |
570 | TryCand.initResourceDelta(DAG, SchedModel); |
571 | if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources, |
572 | TryCand, Cand, Reason: ResourceReduce)) |
573 | return TryCand.Reason != NoCand; |
574 | if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources, |
575 | CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand, |
576 | Reason: ResourceDemand)) |
577 | return TryCand.Reason != NoCand; |
578 | |
579 | // Unconditionally try to reduce latency. |
580 | if (tryLatency(TryCand, Cand, Zone&: *Zone)) |
581 | return TryCand.Reason != NoCand; |
582 | |
583 | // Weak edges are for clustering and other constraints. |
584 | if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop), |
585 | CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak)) |
586 | return TryCand.Reason != NoCand; |
587 | } |
588 | |
589 | // Keep clustered nodes together to encourage downstream peephole |
590 | // optimizations which may reduce resource requirements. |
591 | // |
592 | // This is a best effort to set things up for a post-RA pass. Optimizations |
593 | // like generating loads of multiple registers should ideally be done within |
594 | // the scheduler pass by combining the loads during DAG postprocessing. |
595 | const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; |
596 | const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; |
597 | if (tryGreater(TryVal: TryCandCluster && TryCandCluster->contains(Ptr: TryCand.SU), |
598 | CandVal: CandCluster && CandCluster->contains(Ptr: Cand.SU), TryCand, Cand, |
599 | Reason: Cluster)) |
600 | return TryCand.Reason != NoCand; |
601 | |
602 | // Avoid increasing the max critical pressure in the scheduled region. |
603 | if (DAG->isTrackingPressure() && |
604 | tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax, |
605 | TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF)) |
606 | return TryCand.Reason != NoCand; |
607 | |
608 | // Avoid increasing the max pressure of the entire region. |
609 | if (DAG->isTrackingPressure() && |
610 | tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand, |
611 | Cand, Reason: RegMax, TRI, MF: DAG->MF)) |
612 | return TryCand.Reason != NoCand; |
613 | |
614 | if (SameBoundary) { |
615 | // Fall through to original instruction order. |
616 | if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) || |
617 | (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { |
618 | TryCand.Reason = NodeOrder; |
619 | return true; |
620 | } |
621 | } |
622 | return false; |
623 | } |
624 | |
625 | GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy( |
626 | const MachineSchedContext *C) |
627 | : GCNSchedStrategy(C) { |
628 | SchedStages.push_back(Elt: GCNSchedStageID::MemoryClauseInitialSchedule); |
629 | } |
630 | |
/// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory
/// instructions as much as possible. This is achieved by:
///  1. Prioritizing clustered operations before the stall latency heuristic.
///  2. Prioritizing long-latency loads before the stall latency heuristic.
///
/// \param Cand provides the policy and current best candidate.
/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
/// \param Zone describes the scheduled zone that we are extending, or nullptr
///             if Cand is from a different zone than TryCand.
/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
641 | bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand, |
642 | SchedCandidate &TryCand, |
643 | SchedBoundary *Zone) const { |
644 | // Initialize the candidate if needed. |
645 | if (!Cand.isValid()) { |
646 | TryCand.Reason = NodeOrder; |
647 | return true; |
648 | } |
649 | |
// Bias PhysReg defs and copies towards their uses and definitions,
// respectively.
651 | if (tryGreater(TryVal: biasPhysReg(SU: TryCand.SU, isTop: TryCand.AtTop), |
652 | CandVal: biasPhysReg(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: PhysReg)) |
653 | return TryCand.Reason != NoCand; |
654 | |
655 | if (DAG->isTrackingPressure()) { |
656 | // Avoid exceeding the target's limit. |
657 | if (tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand, |
658 | Reason: RegExcess, TRI, MF: DAG->MF)) |
659 | return TryCand.Reason != NoCand; |
660 | |
661 | // Avoid increasing the max critical pressure in the scheduled region. |
662 | if (tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax, |
663 | TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF)) |
664 | return TryCand.Reason != NoCand; |
665 | } |
666 | |
667 | // MaxMemoryClause-specific: We prioritize clustered instructions as we would |
668 | // get more benefit from clausing these memory instructions. |
669 | const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; |
670 | const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; |
671 | if (tryGreater(TryVal: TryCandCluster && TryCandCluster->contains(Ptr: TryCand.SU), |
672 | CandVal: CandCluster && CandCluster->contains(Ptr: Cand.SU), TryCand, Cand, |
673 | Reason: Cluster)) |
674 | return TryCand.Reason != NoCand; |
675 | |
676 | // We only compare a subset of features when comparing nodes between |
677 | // Top and Bottom boundary. Some properties are simply incomparable, in many |
678 | // other instances we should only override the other boundary if something |
679 | // is a clear good pick on one boundary. Skip heuristics that are more |
680 | // "tie-breaking" in nature. |
681 | bool SameBoundary = Zone != nullptr; |
682 | if (SameBoundary) { |
// For loops that are acyclic path limited, aggressively schedule for
// latency. Within a single cycle, whenever CurrMOps > 0, allow normal
// heuristics to take precedence.
686 | if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() && |
687 | tryLatency(TryCand, Cand, Zone&: *Zone)) |
688 | return TryCand.Reason != NoCand; |
689 | |
// MaxMemoryClause-specific: Prioritize long-latency memory loads in
// top-bottom order to hide more latency. The mayLoad check is used to
// exclude store-like instructions, which we do not want to schedule too
// early.
694 | bool TryMayLoad = |
695 | TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad(); |
696 | bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad(); |
697 | |
698 | if (TryMayLoad || CandMayLoad) { |
699 | bool TryLongLatency = |
700 | TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad; |
701 | bool CandLongLatency = |
702 | 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad; |
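// A candidate only counts as long-latency if it may load and its latency is
// more than ten times the other candidate's latency.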
703 | |
704 | if (tryGreater(TryVal: Zone->isTop() ? TryLongLatency : CandLongLatency, |
705 | CandVal: Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand, |
706 | Cand, Reason: Stall)) |
707 | return TryCand.Reason != NoCand; |
708 | } |
709 | // Prioritize instructions that read unbuffered resources by stall cycles. |
710 | if (tryLess(TryVal: Zone->getLatencyStallCycles(SU: TryCand.SU), |
711 | CandVal: Zone->getLatencyStallCycles(SU: Cand.SU), TryCand, Cand, Reason: Stall)) |
712 | return TryCand.Reason != NoCand; |
713 | } |
714 | |
715 | if (SameBoundary) { |
716 | // Weak edges are for clustering and other constraints. |
717 | if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop), |
718 | CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak)) |
719 | return TryCand.Reason != NoCand; |
720 | } |
721 | |
722 | // Avoid increasing the max pressure of the entire region. |
723 | if (DAG->isTrackingPressure() && |
724 | tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand, |
725 | Cand, Reason: RegMax, TRI, MF: DAG->MF)) |
726 | return TryCand.Reason != NoCand; |
727 | |
728 | if (SameBoundary) { |
729 | // Avoid critical resource consumption and balance the schedule. |
730 | TryCand.initResourceDelta(DAG, SchedModel); |
731 | if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources, |
732 | TryCand, Cand, Reason: ResourceReduce)) |
733 | return TryCand.Reason != NoCand; |
734 | if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources, |
735 | CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand, |
736 | Reason: ResourceDemand)) |
737 | return TryCand.Reason != NoCand; |
738 | |
739 | // Avoid serializing long latency dependence chains. |
740 | // For acyclic path limited loops, latency was already checked above. |
741 | if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency && |
742 | !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone&: *Zone)) |
743 | return TryCand.Reason != NoCand; |
744 | |
745 | // Fall through to original instruction order. |
746 | if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) { |
747 | assert(TryCand.SU->NodeNum != Cand.SU->NodeNum); |
748 | TryCand.Reason = NodeOrder; |
749 | return true; |
750 | } |
751 | } |
752 | |
753 | return false; |
754 | } |
755 | |
756 | GCNScheduleDAGMILive::GCNScheduleDAGMILive( |
757 | MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S) |
758 | : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()), |
759 | MFI(*MF.getInfo<SIMachineFunctionInfo>()), |
760 | StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy), |
761 | RegionLiveOuts(this, /*IsLiveOut=*/true) { |
762 | |
763 | // We want regions with a single MI to be scheduled so that we can reason |
764 | // about them correctly during scheduling stages that move MIs between regions |
765 | // (e.g., rematerialization). |
766 | ScheduleSingleMIRegions = true; |
767 | LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n" ); |
768 | if (RelaxedOcc) { |
769 | MinOccupancy = std::min(a: MFI.getMinAllowedOccupancy(), b: StartingOccupancy); |
770 | if (MinOccupancy != StartingOccupancy) |
771 | LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy |
772 | << ".\n" ); |
773 | } |
774 | } |
775 | |
776 | std::unique_ptr<GCNSchedStage> |
777 | GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) { |
778 | switch (SchedStageID) { |
779 | case GCNSchedStageID::OccInitialSchedule: |
780 | return std::make_unique<OccInitialScheduleStage>(args&: SchedStageID, args&: *this); |
781 | case GCNSchedStageID::UnclusteredHighRPReschedule: |
782 | return std::make_unique<UnclusteredHighRPStage>(args&: SchedStageID, args&: *this); |
783 | case GCNSchedStageID::ClusteredLowOccupancyReschedule: |
784 | return std::make_unique<ClusteredLowOccStage>(args&: SchedStageID, args&: *this); |
785 | case GCNSchedStageID::PreRARematerialize: |
786 | return std::make_unique<PreRARematStage>(args&: SchedStageID, args&: *this); |
787 | case GCNSchedStageID::ILPInitialSchedule: |
788 | return std::make_unique<ILPInitialScheduleStage>(args&: SchedStageID, args&: *this); |
789 | case GCNSchedStageID::MemoryClauseInitialSchedule: |
790 | return std::make_unique<MemoryClauseInitialScheduleStage>(args&: SchedStageID, |
791 | args&: *this); |
792 | } |
793 | |
794 | llvm_unreachable("Unknown SchedStageID." ); |
795 | } |
796 | |
797 | void GCNScheduleDAGMILive::schedule() { |
798 | // Collect all scheduling regions. The actual scheduling is performed in |
799 | // GCNScheduleDAGMILive::finalizeSchedule. |
800 | Regions.push_back(Elt: std::pair(RegionBegin, RegionEnd)); |
801 | } |
802 | |
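// Recompute the region's maximum register pressure by replaying the region's
// instructions through a downward RP tracker seeded with its live-ins.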
803 | GCNRegPressure |
804 | GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { |
805 | GCNDownwardRPTracker RPTracker(*LIS); |
806 | RPTracker.advance(Begin: begin(), End: end(), LiveRegsCopy: &LiveIns[RegionIdx]); |
807 | return RPTracker.moveMaxPressure(); |
808 | } |
809 | |
810 | static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin, |
811 | MachineBasicBlock::iterator RegionEnd) { |
812 | auto REnd = RegionEnd == RegionBegin->getParent()->end() |
813 | ? std::prev(x: RegionEnd) |
814 | : RegionEnd; |
815 | return &*skipDebugInstructionsBackward(It: REnd, Begin: RegionBegin); |
816 | } |
817 | |
818 | void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, |
819 | const MachineBasicBlock *MBB) { |
820 | GCNDownwardRPTracker RPTracker(*LIS); |
821 | |
// If the block has only one successor then the live-ins of that successor
// are the live-outs of the current block. We can reuse the calculated live
// set if the successor will be sent to scheduling past the current block.

// However, due to a bug in LiveInterval analysis it may happen that two
// predecessors of the same successor block have different lane bitmasks for
// a live-out register. Work around that by sticking to a one-to-one
// relationship, i.e. one predecessor with one successor block.
830 | const MachineBasicBlock *OnlySucc = nullptr; |
831 | if (MBB->succ_size() == 1) { |
832 | auto *Candidate = *MBB->succ_begin(); |
833 | if (!Candidate->empty() && Candidate->pred_size() == 1) { |
834 | SlotIndexes *Ind = LIS->getSlotIndexes(); |
835 | if (Ind->getMBBStartIdx(mbb: MBB) < Ind->getMBBStartIdx(mbb: Candidate)) |
836 | OnlySucc = Candidate; |
837 | } |
838 | } |
839 | |
840 | // Scheduler sends regions from the end of the block upwards. |
841 | size_t CurRegion = RegionIdx; |
842 | for (size_t E = Regions.size(); CurRegion != E; ++CurRegion) |
843 | if (Regions[CurRegion].first->getParent() != MBB) |
844 | break; |
845 | --CurRegion; |
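// Regions are recorded from the end of the block upwards, so the highest
// index that still belongs to this block is its topmost region; pressure is
// then tracked downwards from there back to RegionIdx.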
846 | |
847 | auto I = MBB->begin(); |
848 | auto LiveInIt = MBBLiveIns.find(Val: MBB); |
849 | auto &Rgn = Regions[CurRegion]; |
850 | auto *NonDbgMI = &*skipDebugInstructionsForward(It: Rgn.first, End: Rgn.second); |
851 | if (LiveInIt != MBBLiveIns.end()) { |
852 | auto LiveIn = std::move(LiveInIt->second); |
853 | RPTracker.reset(MI: *MBB->begin(), LiveRegs: &LiveIn); |
854 | MBBLiveIns.erase(I: LiveInIt); |
855 | } else { |
856 | I = Rgn.first; |
857 | auto LRS = BBLiveInMap.lookup(Val: NonDbgMI); |
858 | #ifdef EXPENSIVE_CHECKS |
859 | assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS)); |
860 | #endif |
861 | RPTracker.reset(MI: *I, LiveRegs: &LRS); |
862 | } |
863 | |
864 | for (;;) { |
865 | I = RPTracker.getNext(); |
866 | |
867 | if (Regions[CurRegion].first == I || NonDbgMI == I) { |
868 | LiveIns[CurRegion] = RPTracker.getLiveRegs(); |
869 | RPTracker.clearMaxPressure(); |
870 | } |
871 | |
872 | if (Regions[CurRegion].second == I) { |
873 | Pressure[CurRegion] = RPTracker.moveMaxPressure(); |
874 | if (CurRegion-- == RegionIdx) |
875 | break; |
876 | auto &Rgn = Regions[CurRegion]; |
877 | NonDbgMI = &*skipDebugInstructionsForward(It: Rgn.first, End: Rgn.second); |
878 | } |
879 | RPTracker.advanceToNext(); |
880 | RPTracker.advanceBeforeNext(); |
881 | } |
882 | |
883 | if (OnlySucc) { |
884 | if (I != MBB->end()) { |
885 | RPTracker.advanceToNext(); |
886 | RPTracker.advance(End: MBB->end()); |
887 | } |
888 | RPTracker.advanceBeforeNext(); |
889 | MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs(); |
890 | } |
891 | } |
892 | |
893 | DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> |
894 | GCNScheduleDAGMILive::getRegionLiveInMap() const { |
895 | assert(!Regions.empty()); |
896 | std::vector<MachineInstr *> RegionFirstMIs; |
897 | RegionFirstMIs.reserve(n: Regions.size()); |
898 | auto I = Regions.rbegin(), E = Regions.rend(); |
899 | do { |
900 | const MachineBasicBlock *MBB = I->first->getParent(); |
901 | auto *MI = &*skipDebugInstructionsForward(It: I->first, End: I->second); |
902 | RegionFirstMIs.push_back(x: MI); |
903 | do { |
904 | ++I; |
905 | } while (I != E && I->first->getParent() == MBB); |
906 | } while (I != E); |
907 | return getLiveRegMap(R&: RegionFirstMIs, /*After=*/false, LIS&: *LIS); |
908 | } |
909 | |
910 | DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> |
911 | GCNScheduleDAGMILive::getRegionLiveOutMap() const { |
912 | assert(!Regions.empty()); |
913 | std::vector<MachineInstr *> RegionLastMIs; |
914 | RegionLastMIs.reserve(n: Regions.size()); |
915 | for (auto &[RegionBegin, RegionEnd] : reverse(C: Regions)) |
916 | RegionLastMIs.push_back(x: getLastMIForRegion(RegionBegin, RegionEnd)); |
917 | |
918 | return getLiveRegMap(R&: RegionLastMIs, /*After=*/true, LIS&: *LIS); |
919 | } |
920 | |
921 | void RegionPressureMap::buildLiveRegMap() { |
922 | IdxToInstruction.clear(); |
923 | |
924 | RegionLiveRegMap = |
925 | IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap(); |
926 | for (unsigned I = 0; I < DAG->Regions.size(); I++) { |
927 | MachineInstr *RegionKey = |
928 | IsLiveOut |
929 | ? getLastMIForRegion(RegionBegin: DAG->Regions[I].first, RegionEnd: DAG->Regions[I].second) |
930 | : &*DAG->Regions[I].first; |
931 | IdxToInstruction[I] = RegionKey; |
932 | } |
933 | } |
934 | |
935 | void GCNScheduleDAGMILive::finalizeSchedule() { |
936 | // Start actual scheduling here. This function is called by the base |
937 | // MachineScheduler after all regions have been recorded by |
938 | // GCNScheduleDAGMILive::schedule(). |
939 | LiveIns.resize(N: Regions.size()); |
940 | Pressure.resize(N: Regions.size()); |
941 | RegionsWithHighRP.resize(N: Regions.size()); |
942 | RegionsWithExcessRP.resize(N: Regions.size()); |
943 | RegionsWithMinOcc.resize(N: Regions.size()); |
944 | RegionsWithIGLPInstrs.resize(N: Regions.size()); |
945 | RegionsWithHighRP.reset(); |
946 | RegionsWithExcessRP.reset(); |
947 | RegionsWithMinOcc.reset(); |
948 | RegionsWithIGLPInstrs.reset(); |
949 | |
950 | runSchedStages(); |
951 | } |
952 | |
953 | void GCNScheduleDAGMILive::runSchedStages() { |
954 | LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n" ); |
955 | |
956 | if (!Regions.empty()) { |
957 | BBLiveInMap = getRegionLiveInMap(); |
958 | if (GCNTrackers) |
959 | RegionLiveOuts.buildLiveRegMap(); |
960 | } |
961 | |
962 | GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl); |
963 | while (S.advanceStage()) { |
964 | auto Stage = createSchedStage(SchedStageID: S.getCurrentStage()); |
965 | if (!Stage->initGCNSchedStage()) |
966 | continue; |
967 | |
968 | for (auto Region : Regions) { |
969 | RegionBegin = Region.first; |
970 | RegionEnd = Region.second; |
971 | // Setup for scheduling the region and check whether it should be skipped. |
972 | if (!Stage->initGCNRegion()) { |
973 | Stage->advanceRegion(); |
974 | exitRegion(); |
975 | continue; |
976 | } |
977 | |
978 | if (GCNTrackers) { |
979 | GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker(); |
980 | GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker(); |
981 | GCNRPTracker::LiveRegSet *RegionLiveIns = |
982 | &LiveIns[Stage->getRegionIdx()]; |
983 | |
984 | reinterpret_cast<GCNRPTracker *>(DownwardTracker) |
985 | ->reset(MRI_: MRI, LiveRegs_: *RegionLiveIns); |
986 | reinterpret_cast<GCNRPTracker *>(UpwardTracker) |
987 | ->reset(MRI_: MRI, LiveRegs_: RegionLiveOuts.getLiveRegsForRegionIdx( |
988 | RegionIdx: Stage->getRegionIdx())); |
989 | } |
990 | |
991 | ScheduleDAGMILive::schedule(); |
992 | Stage->finalizeGCNRegion(); |
993 | } |
994 | |
995 | Stage->finalizeGCNSchedStage(); |
996 | } |
997 | } |
998 | |
999 | #ifndef NDEBUG |
1000 | raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) { |
1001 | switch (StageID) { |
1002 | case GCNSchedStageID::OccInitialSchedule: |
1003 | OS << "Max Occupancy Initial Schedule" ; |
1004 | break; |
1005 | case GCNSchedStageID::UnclusteredHighRPReschedule: |
1006 | OS << "Unclustered High Register Pressure Reschedule" ; |
1007 | break; |
1008 | case GCNSchedStageID::ClusteredLowOccupancyReschedule: |
1009 | OS << "Clustered Low Occupancy Reschedule" ; |
1010 | break; |
1011 | case GCNSchedStageID::PreRARematerialize: |
1012 | OS << "Pre-RA Rematerialize" ; |
1013 | break; |
1014 | case GCNSchedStageID::ILPInitialSchedule: |
1015 | OS << "Max ILP Initial Schedule" ; |
1016 | break; |
1017 | case GCNSchedStageID::MemoryClauseInitialSchedule: |
1018 | OS << "Max memory clause Initial Schedule" ; |
1019 | break; |
1020 | } |
1021 | |
1022 | return OS; |
1023 | } |
1024 | #endif |
1025 | |
1026 | GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) |
1027 | : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF), |
1028 | MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {} |
1029 | |
1030 | bool GCNSchedStage::initGCNSchedStage() { |
1031 | if (!DAG.LIS) |
1032 | return false; |
1033 | |
1034 | LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n" ); |
1035 | return true; |
1036 | } |
1037 | |
1038 | bool UnclusteredHighRPStage::initGCNSchedStage() { |
1039 | if (DisableUnclusterHighRP) |
1040 | return false; |
1041 | |
1042 | if (!GCNSchedStage::initGCNSchedStage()) |
1043 | return false; |
1044 | |
1045 | if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none()) |
1046 | return false; |
1047 | |
1048 | SavedMutations.swap(x&: DAG.Mutations); |
1049 | DAG.addMutation( |
1050 | Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PreRAReentry)); |
1051 | |
1052 | InitialOccupancy = DAG.MinOccupancy; |
// Aggressively try to reduce register pressure in the unclustered high RP
// stage. Temporarily increase the occupancy target in the region.
1055 | S.SGPRLimitBias = S.HighRPSGPRBias; |
1056 | S.VGPRLimitBias = S.HighRPVGPRBias; |
1057 | if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) |
1058 | MFI.increaseOccupancy(MF, Limit: ++DAG.MinOccupancy); |
1059 | |
1060 | LLVM_DEBUG( |
1061 | dbgs() |
1062 | << "Retrying function scheduling without clustering. " |
1063 | "Aggressivly try to reduce register pressure to achieve occupancy " |
1064 | << DAG.MinOccupancy << ".\n" ); |
1065 | |
1066 | return true; |
1067 | } |
1068 | |
1069 | bool ClusteredLowOccStage::initGCNSchedStage() { |
1070 | if (DisableClusteredLowOccupancy) |
1071 | return false; |
1072 | |
1073 | if (!GCNSchedStage::initGCNSchedStage()) |
1074 | return false; |
1075 | |
1076 | // Don't bother trying to improve ILP in lower RP regions if occupancy has not |
1077 | // been dropped. All regions will have already been scheduled with the ideal |
1078 | // occupancy targets. |
1079 | if (DAG.StartingOccupancy <= DAG.MinOccupancy) |
1080 | return false; |
1081 | |
1082 | LLVM_DEBUG( |
1083 | dbgs() << "Retrying function scheduling with lowest recorded occupancy " |
1084 | << DAG.MinOccupancy << ".\n" ); |
1085 | return true; |
1086 | } |
1087 | |
/// Allows one to easily filter for this stage's debug output.
1089 | #define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;) |
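// Usage example: REMAT_DEBUG(dbgs() << "some message\n");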
1090 | |
1091 | bool PreRARematStage::initGCNSchedStage() { |
// FIXME: This pass will invalidate the cached BBLiveInMap and MBBLiveIns for
// regions in between the defs and the regions we sank the defs to. This will
// need to be fixed if there is another pass after this one.
1095 | assert(!S.hasNextStage()); |
1096 | |
1097 | if (!GCNSchedStage::initGCNSchedStage() || DAG.RegionsWithMinOcc.none() || |
1098 | DAG.Regions.size() == 1) |
1099 | return false; |
1100 | |
1101 | // Before performing any IR modification record the parent region of each MI |
1102 | // and the parent MBB of each region. |
1103 | const unsigned NumRegions = DAG.Regions.size(); |
1104 | RegionBB.reserve(N: NumRegions); |
1105 | for (unsigned I = 0; I < NumRegions; ++I) { |
1106 | RegionBoundaries Region = DAG.Regions[I]; |
1107 | for (auto MI = Region.first; MI != Region.second; ++MI) |
1108 | MIRegion.insert(KV: {&*MI, I}); |
1109 | RegionBB.push_back(Elt: Region.first->getParent()); |
1110 | } |
1111 | |
1112 | if (!canIncreaseOccupancyOrReduceSpill()) |
1113 | return false; |
1114 | |
1115 | // Rematerialize identified instructions and update scheduler's state. |
1116 | rematerialize(); |
1117 | if (GCNTrackers) |
1118 | DAG.RegionLiveOuts.buildLiveRegMap(); |
1119 | REMAT_DEBUG( |
1120 | dbgs() << "Retrying function scheduling with new min. occupancy of " |
1121 | << AchievedOcc << " from rematerializing (original was " |
1122 | << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n" ); |
1123 | if (AchievedOcc > DAG.MinOccupancy) { |
1124 | DAG.MinOccupancy = AchievedOcc; |
1125 | SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); |
1126 | MFI.increaseOccupancy(MF, Limit: DAG.MinOccupancy); |
1127 | } |
1128 | return true; |
1129 | } |
1130 | |
1131 | void GCNSchedStage::finalizeGCNSchedStage() { |
1132 | DAG.finishBlock(); |
1133 | LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n" ); |
1134 | } |
1135 | |
1136 | void UnclusteredHighRPStage::finalizeGCNSchedStage() { |
1137 | SavedMutations.swap(x&: DAG.Mutations); |
1138 | S.SGPRLimitBias = S.VGPRLimitBias = 0; |
1139 | if (DAG.MinOccupancy > InitialOccupancy) { |
1140 | for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX) |
1141 | DAG.RegionsWithMinOcc[IDX] = |
1142 | DAG.Pressure[IDX].getOccupancy( |
1143 | ST: DAG.ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()) == DAG.MinOccupancy; |
1144 | |
1145 | LLVM_DEBUG(dbgs() << StageID |
1146 | << " stage successfully increased occupancy to " |
1147 | << DAG.MinOccupancy << '\n'); |
1148 | } |
1149 | |
1150 | GCNSchedStage::finalizeGCNSchedStage(); |
1151 | } |
1152 | |
1153 | bool GCNSchedStage::initGCNRegion() { |
1154 | // Check whether this new region is also a new block. |
1155 | if (DAG.RegionBegin->getParent() != CurrentMBB) |
1156 | setupNewBlock(); |
1157 | |
1158 | unsigned NumRegionInstrs = std::distance(first: DAG.begin(), last: DAG.end()); |
1159 | DAG.enterRegion(bb: CurrentMBB, begin: DAG.begin(), end: DAG.end(), regioninstrs: NumRegionInstrs); |
1160 | |
1161 | // Skip empty scheduling regions (0 or 1 schedulable instructions). |
1162 | if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(x: DAG.end())) |
1163 | return false; |
1164 | |
1165 | LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n" ); |
1166 | LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB) |
1167 | << " " << CurrentMBB->getName() |
1168 | << "\n From: " << *DAG.begin() << " To: " ; |
1169 | if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd; |
1170 | else dbgs() << "End" ; |
1171 | dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); |
1172 | |
1173 | // Save original instruction order before scheduling for possible revert. |
1174 | Unsched.clear(); |
1175 | Unsched.reserve(n: DAG.NumRegionInstrs); |
1176 | if (StageID == GCNSchedStageID::OccInitialSchedule || |
1177 | StageID == GCNSchedStageID::ILPInitialSchedule) { |
1178 | const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII); |
1179 | for (auto &I : DAG) { |
1180 | Unsched.push_back(x: &I); |
1181 | if (SII->isIGLPMutationOnly(Opcode: I.getOpcode())) |
1182 | DAG.RegionsWithIGLPInstrs[RegionIdx] = true; |
1183 | } |
1184 | } else { |
1185 | for (auto &I : DAG) |
1186 | Unsched.push_back(x: &I); |
1187 | } |
1188 | |
1189 | PressureBefore = DAG.Pressure[RegionIdx]; |
1190 | |
1191 | LLVM_DEBUG( |
1192 | dbgs() << "Pressure before scheduling:\nRegion live-ins:" |
1193 | << print(DAG.LiveIns[RegionIdx], DAG.MRI) |
1194 | << "Region live-in pressure: " |
1195 | << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx])) |
1196 | << "Region register pressure: " << print(PressureBefore)); |
1197 | |
1198 | S.HasHighPressure = false; |
1199 | S.KnownExcessRP = isRegionWithExcessRP(); |
1200 | |
1201 | if (DAG.RegionsWithIGLPInstrs[RegionIdx] && |
1202 | StageID != GCNSchedStageID::UnclusteredHighRPReschedule) { |
1203 | SavedMutations.clear(); |
1204 | SavedMutations.swap(x&: DAG.Mutations); |
1205 | bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule || |
1206 | StageID == GCNSchedStageID::ILPInitialSchedule; |
1207 | DAG.addMutation(Mutation: createIGroupLPDAGMutation( |
1208 | Phase: IsInitialStage ? AMDGPU::SchedulingPhase::Initial |
1209 | : AMDGPU::SchedulingPhase::PreRAReentry)); |
1210 | } |
1211 | |
1212 | return true; |
1213 | } |
1214 | |
1215 | bool UnclusteredHighRPStage::initGCNRegion() { |
1216 | // Only reschedule regions with the minimum occupancy or regions that may have |
1217 | // spilling (excess register pressure). |
1218 | if ((!DAG.RegionsWithMinOcc[RegionIdx] || |
1219 | DAG.MinOccupancy <= InitialOccupancy) && |
1220 | !DAG.RegionsWithExcessRP[RegionIdx]) |
1221 | return false; |
1222 | |
1223 | return GCNSchedStage::initGCNRegion(); |
1224 | } |
1225 | |
1226 | bool ClusteredLowOccStage::initGCNRegion() { |
// We may need to reschedule this region if it wasn't rescheduled in the last
// stage, or if we found it was testing critical register pressure limits in
// the unclustered reschedule stage. The latter is because we may not have
// been able to raise the min occupancy in the previous stage, so the region
// may be overly constrained even if it was already rescheduled.
1232 | if (!DAG.RegionsWithHighRP[RegionIdx]) |
1233 | return false; |
1234 | |
1235 | return GCNSchedStage::initGCNRegion(); |
1236 | } |
1237 | |
1238 | bool PreRARematStage::initGCNRegion() { |
1239 | return RescheduleRegions[RegionIdx] && GCNSchedStage::initGCNRegion(); |
1240 | } |
1241 | |
1242 | void GCNSchedStage::setupNewBlock() { |
1243 | if (CurrentMBB) |
1244 | DAG.finishBlock(); |
1245 | |
1246 | CurrentMBB = DAG.RegionBegin->getParent(); |
1247 | DAG.startBlock(bb: CurrentMBB); |
// Get the real RP for the region if it hasn't been calculated before. After
// the initial schedule stage, the real RP will be collected after scheduling.
1250 | if (StageID == GCNSchedStageID::OccInitialSchedule || |
1251 | StageID == GCNSchedStageID::ILPInitialSchedule || |
1252 | StageID == GCNSchedStageID::MemoryClauseInitialSchedule) |
1253 | DAG.computeBlockPressure(RegionIdx, MBB: CurrentMBB); |
1254 | } |
1255 | |
1256 | void GCNSchedStage::finalizeGCNRegion() { |
1257 | DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); |
1258 | if (S.HasHighPressure) |
1259 | DAG.RegionsWithHighRP[RegionIdx] = true; |
1260 | |
1261 | // Revert scheduling if we have dropped occupancy or there is some other |
1262 | // reason that the original schedule is better. |
1263 | checkScheduling(); |
1264 | |
1265 | if (DAG.RegionsWithIGLPInstrs[RegionIdx] && |
1266 | StageID != GCNSchedStageID::UnclusteredHighRPReschedule) |
1267 | SavedMutations.swap(x&: DAG.Mutations); |
1268 | |
1269 | DAG.exitRegion(); |
1270 | advanceRegion(); |
1271 | } |
1272 | |
1273 | void GCNSchedStage::checkScheduling() { |
1274 | // Check the results of scheduling. |
1275 | PressureAfter = DAG.getRealRegPressure(RegionIdx); |
1276 | |
1277 | LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter)); |
1278 | LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n" ); |
1279 | |
1280 | unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize(); |
1281 | |
1282 | if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && |
1283 | PressureAfter.getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) { |
1284 | DAG.Pressure[RegionIdx] = PressureAfter; |
1285 | DAG.RegionsWithMinOcc[RegionIdx] = |
1286 | PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) == |
1287 | DAG.MinOccupancy; |
1288 | |
1289 | // Early out if we have achieved the occupancy target. |
1290 | LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n" ); |
1291 | return; |
1292 | } |
1293 | |
1294 | unsigned TargetOccupancy = std::min( |
1295 | a: S.getTargetOccupancy(), b: ST.getOccupancyWithWorkGroupSizes(MF).second); |
1296 | unsigned WavesAfter = std::min( |
1297 | a: TargetOccupancy, b: PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize)); |
1298 | unsigned WavesBefore = std::min( |
1299 | a: TargetOccupancy, b: PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize)); |
1300 | LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore |
1301 | << ", after " << WavesAfter << ".\n" ); |
1302 | |
1303 | // We may not be able to keep the current target occupancy because of the |
1304 | // region we just scheduled. We may still revert the schedule if the occupancy |
1305 | // before was higher, or if the current schedule has register pressure higher |
1306 | // than the excess limits, which could lead to more spilling. |
1307 | unsigned NewOccupancy = std::max(a: WavesAfter, b: WavesBefore); |
1308 | |
1309 | // Allow memory bound functions to drop to 4 waves if not limited by an |
1310 | // attribute. |
1311 | if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy && |
1312 | WavesAfter >= MFI.getMinAllowedOccupancy()) { |
1313 | LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to " |
1314 | << MFI.getMinAllowedOccupancy() << " waves\n" ); |
1315 | NewOccupancy = WavesAfter; |
1316 | } |
1317 | |
1318 | if (NewOccupancy < DAG.MinOccupancy) { |
1319 | DAG.MinOccupancy = NewOccupancy; |
1320 | MFI.limitOccupancy(Limit: DAG.MinOccupancy); |
1321 | DAG.RegionsWithMinOcc.reset(); |
1322 | LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " |
1323 | << DAG.MinOccupancy << ".\n" ); |
1324 | } |
1325 | // The maximum number of arch VGPRs with a non-unified register file, or the |
1326 | // maximum number of VGPRs + AGPRs in the unified register file case. |
1327 | unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); |
1328 | // The maximum number of arch VGPRs for both unified and non-unified register |
1329 | // files. |
1330 | unsigned MaxArchVGPRs = std::min(a: MaxVGPRs, b: ST.getAddressableNumArchVGPRs()); |
1331 | unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); |
1332 | |
1333 | if (PressureAfter.getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts()) > MaxVGPRs || |
1334 | PressureAfter.getArchVGPRNum() > MaxArchVGPRs || |
1335 | PressureAfter.getAGPRNum() > MaxArchVGPRs || |
1336 | PressureAfter.getSGPRNum() > MaxSGPRs) { |
1337 | DAG.RegionsWithHighRP[RegionIdx] = true; |
1338 | DAG.RegionsWithExcessRP[RegionIdx] = true; |
1339 | } |
1340 | |
1341 | // Revert if this region's schedule would cause a drop in occupancy or |
1342 | // spilling. |
1343 | if (shouldRevertScheduling(WavesAfter)) { |
1344 | revertScheduling(); |
1345 | } else { |
1346 | DAG.Pressure[RegionIdx] = PressureAfter; |
1347 | DAG.RegionsWithMinOcc[RegionIdx] = |
1348 | PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) == |
1349 | DAG.MinOccupancy; |
1350 | } |
1351 | } |
1352 | |
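// Models the earliest cycle at which SU could issue: the maximum, over all
// register-dependence predecessors, of the predecessor's ready cycle plus its
// instruction latency (and never earlier than CurrCycle). The result is also
// memoized in ReadyCycles, keyed by the SUnit's NodeNum.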
1353 | unsigned |
1354 | GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle, |
1355 | DenseMap<unsigned, unsigned> &ReadyCycles, |
1356 | const TargetSchedModel &SM) { |
1357 | unsigned ReadyCycle = CurrCycle; |
1358 | for (auto &D : SU.Preds) { |
1359 | if (D.isAssignedRegDep()) { |
1360 | MachineInstr *DefMI = D.getSUnit()->getInstr(); |
1361 | unsigned Latency = SM.computeInstrLatency(MI: DefMI); |
1362 | unsigned DefReady = ReadyCycles[DAG.getSUnit(MI: DefMI)->NodeNum]; |
1363 | ReadyCycle = std::max(a: ReadyCycle, b: DefReady + Latency); |
1364 | } |
1365 | } |
1366 | ReadyCycles[SU.NodeNum] = ReadyCycle; |
1367 | return ReadyCycle; |
1368 | } |
1369 | |
1370 | #ifndef NDEBUG |
1371 | struct EarlierIssuingCycle { |
1372 | bool operator()(std::pair<MachineInstr *, unsigned> A, |
1373 | std::pair<MachineInstr *, unsigned> B) const { |
1374 | return A.second < B.second; |
1375 | } |
1376 | }; |
1377 | |
1378 | static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>, |
1379 | EarlierIssuingCycle> &ReadyCycles) { |
1380 | if (ReadyCycles.empty()) |
1381 | return; |
1382 | unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber(); |
1383 | dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum |
1384 | << " ##################\n# Cycle #\t\t\tInstruction " |
1385 | " " |
1386 | " \n" ; |
1387 | unsigned IPrev = 1; |
1388 | for (auto &I : ReadyCycles) { |
1389 | if (I.second > IPrev + 1) |
1390 | dbgs() << "****************************** BUBBLE OF " << I.second - IPrev |
1391 | << " CYCLES DETECTED ******************************\n\n" ; |
1392 | dbgs() << "[ " << I.second << " ] : " << *I.first << "\n" ; |
1393 | IPrev = I.second; |
1394 | } |
1395 | } |
1396 | #endif |
1397 | |
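// Walks the scheduled SUnit sequence in order and accumulates "bubble" cycles,
// i.e. cycles where the next instruction is not yet ready because of latency
// from its register-dependence predecessors. The returned ScheduleMetrics is
// built from the total schedule length (CurrCycle) and the bubble count; the
// debug output below reports the bubbles scaled by ScheduleMetrics::ScaleFactor
// over the schedule length.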
1398 | ScheduleMetrics |
1399 | GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) { |
1400 | #ifndef NDEBUG |
1401 | std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> |
1402 | ReadyCyclesSorted; |
1403 | #endif |
1404 | const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); |
1405 | unsigned SumBubbles = 0; |
1406 | DenseMap<unsigned, unsigned> ReadyCycles; |
1407 | unsigned CurrCycle = 0; |
1408 | for (auto &SU : InputSchedule) { |
1409 | unsigned ReadyCycle = |
1410 | computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM); |
1411 | SumBubbles += ReadyCycle - CurrCycle; |
1412 | #ifndef NDEBUG |
1413 | ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle)); |
1414 | #endif |
1415 | CurrCycle = ++ReadyCycle; |
1416 | } |
1417 | #ifndef NDEBUG |
1418 | LLVM_DEBUG( |
1419 | printScheduleModel(ReadyCyclesSorted); |
1420 | dbgs() << "\n\t" |
1421 | << "Metric: " |
1422 | << (SumBubbles |
1423 | ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle |
1424 | : 1) |
1425 | << "\n\n" ); |
1426 | #endif |
1427 | |
1428 | return ScheduleMetrics(CurrCycle, SumBubbles); |
1429 | } |
1430 | |
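// Same bubble accounting as above, but walks the region's current MachineInstr
// order (skipping instructions without an associated SUnit) instead of an
// explicit SUnit schedule. This overload is used to evaluate the just-produced
// schedule in UnclusteredHighRPStage::shouldRevertScheduling.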
1431 | ScheduleMetrics |
1432 | GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) { |
1433 | #ifndef NDEBUG |
1434 | std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> |
1435 | ReadyCyclesSorted; |
1436 | #endif |
1437 | const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); |
1438 | unsigned SumBubbles = 0; |
1439 | DenseMap<unsigned, unsigned> ReadyCycles; |
1440 | unsigned CurrCycle = 0; |
1441 | for (auto &MI : DAG) { |
1442 | SUnit *SU = DAG.getSUnit(MI: &MI); |
1443 | if (!SU) |
1444 | continue; |
1445 | unsigned ReadyCycle = |
1446 | computeSUnitReadyCycle(SU: *SU, CurrCycle, ReadyCycles, SM); |
1447 | SumBubbles += ReadyCycle - CurrCycle; |
1448 | #ifndef NDEBUG |
1449 | ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle)); |
1450 | #endif |
1451 | CurrCycle = ++ReadyCycle; |
1452 | } |
1453 | #ifndef NDEBUG |
1454 | LLVM_DEBUG( |
1455 | printScheduleModel(ReadyCyclesSorted); |
1456 | dbgs() << "\n\t" |
1457 | << "Metric: " |
1458 | << (SumBubbles |
1459 | ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle |
1460 | : 1) |
1461 | << "\n\n" ); |
1462 | #endif |
1463 | |
1464 | return ScheduleMetrics(CurrCycle, SumBubbles); |
1465 | } |
1466 | |
1467 | bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { |
1468 | if (WavesAfter < DAG.MinOccupancy) |
1469 | return true; |
1470 | |
1471 | // For dynamic VGPR mode, we don't want to waste any VGPR blocks. |
1472 | if (DAG.MFI.isDynamicVGPREnabled()) { |
1473 | unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( |
1474 | STI: &ST, NumVGPRs: DAG.MFI.getDynamicVGPRBlockSize(), |
1475 | DynamicVGPRBlockSize: PressureBefore.getVGPRNum(UnifiedVGPRFile: false)); |
1476 | unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( |
1477 | STI: &ST, NumVGPRs: DAG.MFI.getDynamicVGPRBlockSize(), |
1478 | DynamicVGPRBlockSize: PressureAfter.getVGPRNum(UnifiedVGPRFile: false)); |
1479 | if (BlocksAfter > BlocksBefore) |
1480 | return true; |
1481 | } |
1482 | |
1483 | return false; |
1484 | } |
1485 | |
1486 | bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { |
1487 | if (PressureAfter == PressureBefore) |
1488 | return false; |
1489 | |
1490 | if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) |
1491 | return true; |
1492 | |
1493 | if (mayCauseSpilling(WavesAfter)) |
1494 | return true; |
1495 | |
1496 | return false; |
1497 | } |
1498 | |
1499 | bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { |
1500 | // If RP is not reduced in the unclustered reschedule stage, revert to the |
1501 | // old schedule. |
1502 | if ((WavesAfter <= |
1503 | PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()) && |
1504 | mayCauseSpilling(WavesAfter)) || |
1505 | GCNSchedStage::shouldRevertScheduling(WavesAfter)) { |
1506 | LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n" ); |
1507 | return true; |
1508 | } |
1509 | |
1510 | // Do not attempt to relax the schedule even more if we are already spilling. |
1511 | if (isRegionWithExcessRP()) |
1512 | return false; |
1513 | |
1514 | LLVM_DEBUG( |
1515 | dbgs() |
1516 | << "\n\t *** In shouldRevertScheduling ***\n" |
1517 | << " *********** BEFORE UnclusteredHighRPStage ***********\n" ); |
1518 | ScheduleMetrics MBefore = getScheduleMetrics(InputSchedule: DAG.SUnits); |
1519 | LLVM_DEBUG( |
1520 | dbgs() |
1521 | << "\n *********** AFTER UnclusteredHighRPStage ***********\n" ); |
1522 | ScheduleMetrics MAfter = getScheduleMetrics(DAG); |
1523 | unsigned OldMetric = MBefore.getMetric(); |
1524 | unsigned NewMetric = MAfter.getMetric(); |
1525 | unsigned WavesBefore = std::min( |
1526 | a: S.getTargetOccupancy(), |
1527 | b: PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize())); |
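// Heuristic profitability check, scaled by ScheduleMetrics::ScaleFactor (100):
// Profit ~= (WavesAfter / WavesBefore) *
//           ((OldMetric + ScheduleMetricBias) / NewMetric) * ScaleFactor.
// The unclustered schedule is kept only when Profit >= ScaleFactor, i.e. when
// the combined occupancy gain and latency-metric change at least break even.
// Illustrative numbers only: WavesBefore = 4, WavesAfter = 5, OldMetric = 20,
// NewMetric = 30 with the default bias of 10 gives
// Profit = ((5 * 100 / 4) * ((20 + 10) * 100) / 30) / 100 = 125 >= 100,
// so the new schedule would be kept.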
1528 | unsigned Profit = |
1529 | ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore * |
1530 | ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) / |
1531 | NewMetric) / |
1532 | ScheduleMetrics::ScaleFactor; |
1533 | LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after " |
1534 | << MAfter << "Profit: " << Profit << "\n" ); |
1535 | return Profit < ScheduleMetrics::ScaleFactor; |
1536 | } |
1537 | |
1538 | bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { |
1539 | if (PressureAfter == PressureBefore) |
1540 | return false; |
1541 | |
1542 | if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) |
1543 | return true; |
1544 | |
1545 | if (mayCauseSpilling(WavesAfter)) |
1546 | return true; |
1547 | |
1548 | return false; |
1549 | } |
1550 | |
1551 | bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { |
1552 | return GCNSchedStage::shouldRevertScheduling(WavesAfter) || |
1553 | mayCauseSpilling(WavesAfter) || |
1554 | (IncreaseOccupancy && WavesAfter < TargetOcc); |
1555 | } |
1556 | |
1557 | bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { |
1558 | if (mayCauseSpilling(WavesAfter)) |
1559 | return true; |
1560 | |
1561 | return false; |
1562 | } |
1563 | |
1564 | bool MemoryClauseInitialScheduleStage::shouldRevertScheduling( |
1565 | unsigned WavesAfter) { |
1566 | return mayCauseSpilling(WavesAfter); |
1567 | } |
1568 | |
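// A region is considered at risk of (more) spilling when the post-scheduling
// occupancy is at (or below) the function's minimum waves/EU, the region
// already exceeds the excess-RP limits, and the new pressure is not strictly
// lower than the old one.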
1569 | bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { |
1570 | if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() && |
1571 | !PressureAfter.less(MF, O: PressureBefore)) { |
1572 | LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n" ); |
1573 | return true; |
1574 | } |
1575 | |
1576 | return false; |
1577 | } |
1578 | |
1579 | void GCNSchedStage::revertScheduling() { |
1580 | DAG.RegionsWithMinOcc[RegionIdx] = |
1581 | PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()) == |
1582 | DAG.MinOccupancy; |
1583 | LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n" ); |
1584 | DAG.RegionEnd = DAG.RegionBegin; |
1585 | int SkippedDebugInstr = 0; |
1586 | for (MachineInstr *MI : Unsched) { |
1587 | if (MI->isDebugInstr()) { |
1588 | ++SkippedDebugInstr; |
1589 | continue; |
1590 | } |
1591 | |
1592 | if (MI->getIterator() != DAG.RegionEnd) { |
1593 | DAG.BB->splice(Where: DAG.RegionEnd, Other: DAG.BB, From: MI); |
1594 | if (!MI->isDebugInstr()) |
1595 | DAG.LIS->handleMove(MI&: *MI, UpdateFlags: true); |
1596 | } |
1597 | |
1598 | // Reset read-undef flags and update them later. |
1599 | for (auto &Op : MI->all_defs()) |
1600 | Op.setIsUndef(false); |
1601 | RegisterOperands RegOpers; |
1602 | RegOpers.collect(MI: *MI, TRI: *DAG.TRI, MRI: DAG.MRI, TrackLaneMasks: DAG.ShouldTrackLaneMasks, IgnoreDead: false); |
1603 | if (!MI->isDebugInstr()) { |
1604 | if (DAG.ShouldTrackLaneMasks) { |
1605 | // Adjust liveness and add missing dead+read-undef flags. |
1606 | SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(Instr: *MI).getRegSlot(); |
1607 | RegOpers.adjustLaneLiveness(LIS: *DAG.LIS, MRI: DAG.MRI, Pos: SlotIdx, AddFlagsMI: MI); |
1608 | } else { |
1609 | // Adjust for missing dead-def flags. |
1610 | RegOpers.detectDeadDefs(MI: *MI, LIS: *DAG.LIS); |
1611 | } |
1612 | } |
1613 | DAG.RegionEnd = MI->getIterator(); |
1614 | ++DAG.RegionEnd; |
1615 | LLVM_DEBUG(dbgs() << "Scheduling " << *MI); |
1616 | } |
1617 | |
1618 | // After reverting the schedule, debug instrs will now be at the end of the |
1619 | // block and RegionEnd will point to the first debug instr. Increment RegionEnd |
1620 | // past debug instrs to the actual end of the scheduling region. |
1621 | while (SkippedDebugInstr-- > 0) |
1622 | ++DAG.RegionEnd; |
1623 | |
1624 | // If Unsched.front() instruction is a debug instruction, this will actually |
1625 | // shrink the region since we moved all debug instructions to the end of the |
1626 | // block. Find the first instruction that is not a debug instruction. |
1627 | DAG.RegionBegin = Unsched.front()->getIterator(); |
1628 | if (DAG.RegionBegin->isDebugInstr()) { |
1629 | for (MachineInstr *MI : Unsched) { |
1630 | if (MI->isDebugInstr()) |
1631 | continue; |
1632 | DAG.RegionBegin = MI->getIterator(); |
1633 | break; |
1634 | } |
1635 | } |
1636 | |
1637 | // Then move the debug instructions back into their correct place and set |
1638 | // RegionBegin and RegionEnd if needed. |
1639 | DAG.placeDebugValues(); |
1640 | |
1641 | DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); |
1642 | } |
1643 | |
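// Conservatively checks that rematerializing InstToRemat at RematIdx would not
// extend any live range: every virtual register it reads must hold the same
// value (same VNInfo, with live subranges for the used lanes) at RematIdx as it
// does at the original definition point OriginalIdx. Physical register uses are
// only tolerated when they are constant or ignorable.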
1644 | bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat, |
1645 | SlotIndex OriginalIdx, |
1646 | SlotIndex RematIdx) const { |
1647 | |
1648 | LiveIntervals *LIS = DAG.LIS; |
1649 | MachineRegisterInfo &MRI = DAG.MRI; |
1650 | OriginalIdx = OriginalIdx.getRegSlot(EC: true); |
1651 | RematIdx = std::max(a: RematIdx, b: RematIdx.getRegSlot(EC: true)); |
1652 | for (const MachineOperand &MO : InstToRemat->operands()) { |
1653 | if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) |
1654 | continue; |
1655 | |
1656 | if (!MO.getReg().isVirtual()) { |
1657 | // Do not attempt to reason about PhysRegs. |
1658 | // TODO: better analysis of PhysReg liveness. |
1659 | if (!DAG.MRI.isConstantPhysReg(PhysReg: MO.getReg()) && |
1660 | !DAG.TII->isIgnorableUse(MO)) |
1661 | return false; |
1662 | |
1663 | // Constant PhysRegs and IgnorableUses are okay |
1664 | continue; |
1665 | } |
1666 | |
1667 | LiveInterval &LI = LIS->getInterval(Reg: MO.getReg()); |
1668 | const VNInfo *OVNI = LI.getVNInfoAt(Idx: OriginalIdx); |
1669 | assert(OVNI); |
1670 | |
1671 | // Don't allow rematerialization immediately after the original def. |
1672 | // It would be incorrect if InstToRemat redefines the register. |
1673 | // See PR14098. |
1674 | if (SlotIndex::isSameInstr(A: OriginalIdx, B: RematIdx)) |
1675 | return false; |
1676 | |
1677 | if (OVNI != LI.getVNInfoAt(Idx: RematIdx)) |
1678 | return false; |
1679 | |
1680 | // Check that subrange is live at RematIdx. |
1681 | if (LI.hasSubRanges()) { |
1682 | const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); |
1683 | unsigned SubReg = MO.getSubReg(); |
1684 | LaneBitmask LM = SubReg ? TRI->getSubRegIndexLaneMask(SubIdx: SubReg) |
1685 | : MRI.getMaxLaneMaskForVReg(Reg: MO.getReg()); |
1686 | for (LiveInterval::SubRange &SR : LI.subranges()) { |
1687 | if ((SR.LaneMask & LM).none()) |
1688 | continue; |
1689 | if (!SR.liveAt(index: RematIdx)) |
1690 | return false; |
1691 | |
1692 | // Early exit if all used lanes are checked. No need to continue. |
1693 | LM &= ~SR.LaneMask; |
1694 | if (LM.none()) |
1695 | break; |
1696 | } |
1697 | } |
1698 | } |
1699 | return true; |
1700 | } |
1701 | |
1702 | bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { |
1703 | REMAT_DEBUG({ |
1704 | dbgs() << "Collecting rematerializable instructions in " ; |
1705 | MF.getFunction().printAsOperand(dbgs(), false); |
1706 | dbgs() << '\n'; |
1707 | }); |
1708 | |
1709 | // Maps optimizable regions (i.e., regions at minimum and register-limited |
1710 | // occupancy, or regions with spilling) to the target RP we would like to |
1711 | // reach. |
1712 | DenseMap<unsigned, GCNRPTarget> OptRegions; |
1713 | const Function &F = MF.getFunction(); |
1714 | unsigned DynamicVGPRBlockSize = |
1715 | MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); |
1716 | |
1717 | std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F); |
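  // Register budgets for the two goals of this stage: the "NoSpill" limits are
  // the per-function maxima (exceeding them is treated by this stage as
  // spilling), while the "IncOcc" limits are what pressure must stay under to
  // gain one wave of occupancy over the current minimum.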
1718 | const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F); |
1719 | const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F); |
1720 | const unsigned MaxSGPRsIncOcc = |
1721 | ST.getMaxNumSGPRs(WavesPerEU: DAG.MinOccupancy + 1, Addressable: false); |
1722 | const unsigned MaxVGPRsIncOcc = |
1723 | ST.getMaxNumVGPRs(WavesPerEU: DAG.MinOccupancy + 1, DynamicVGPRBlockSize); |
1724 | IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy; |
1725 | |
1726 | // Collect optimizable regions. If there is spilling in any region we will |
1727 | // just try to reduce spilling. Otherwise we will try to increase occupancy by |
1728 | // one in the whole function. |
1729 | for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { |
1730 | GCNRegPressure &RP = DAG.Pressure[I]; |
1731 | // We allow ArchVGPR or AGPR savings to count as savings of the other kind |
1732 | // of VGPR only when trying to eliminate spilling. We cannot do this when |
1733 | // trying to increase occupancy since VGPR class swaps only occur later in |
1734 | // the register allocator, i.e., the scheduler will not be able to reason |
1735 | // about these savings and will not report an increase in the achievable |
1736 | // occupancy, triggering rollbacks. |
1737 | GCNRPTarget Target(MaxSGPRsNoSpill, MaxVGPRsNoSpill, MF, RP, |
1738 | /*CombineVGPRSavings=*/true); |
1739 | if (!Target.satisfied() && IncreaseOccupancy) { |
1740 | // There is spilling in the region and we were so far trying to increase |
1741 | // occupancy. Stop trying that and focus on reducing spilling. |
1742 | IncreaseOccupancy = false; |
1743 | OptRegions.clear(); |
1744 | } else if (IncreaseOccupancy) { |
1745 | // There is no spilling in the region, try to increase occupancy. |
1746 | Target = GCNRPTarget(MaxSGPRsIncOcc, MaxVGPRsIncOcc, MF, RP, |
1747 | /*CombineVGPRSavings=*/false); |
1748 | } |
1749 | if (!Target.satisfied()) |
1750 | OptRegions.insert(KV: {I, Target}); |
1751 | } |
1752 | if (OptRegions.empty()) |
1753 | return false; |
1754 | |
1755 | #ifndef NDEBUG |
1756 | if (IncreaseOccupancy) { |
1757 | REMAT_DEBUG(dbgs() << "Occupancy minimal (" << DAG.MinOccupancy |
1758 | << ") in regions:\n" ); |
1759 | } else { |
1760 | REMAT_DEBUG(dbgs() << "Spilling w.r.t. minimum target occupancy (" |
1761 | << WavesPerEU.first << ") in regions:\n" ); |
1762 | } |
1763 | for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { |
1764 | if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) |
1765 | REMAT_DEBUG(dbgs() << " [" << I << "] " << OptIt->getSecond() << '\n'); |
1766 | } |
1767 | #endif |
1768 | |
1769 | // When we are reducing spilling, the target is the minimum target number of |
1770 | // waves/EU determined by the subtarget. In cases where either "amdgpu-num-sgpr" |
1771 | // or "amdgpu-num-vgpr" is set on the function, the current minimum region |
1772 | // occupancy may be higher than that minimum target. |
1773 | TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 |
1774 | : std::max(a: DAG.MinOccupancy, b: WavesPerEU.first); |
1775 | |
1776 | // Accounts for a reduction in RP in an optimizable region. Returns whether we |
1777 | // estimate that we have identified enough rematerialization opportunities to |
1778 | // achieve our goal, and sets Progress to true when this particular reduction |
1779 | // in pressure was helpful toward that goal. |
1780 | auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask, |
1781 | bool &Progress) -> bool { |
1782 | GCNRPTarget &Target = OptIt->getSecond(); |
1783 | if (!Target.isSaveBeneficial(Reg, MRI: DAG.MRI)) |
1784 | return false; |
1785 | Progress = true; |
1786 | Target.saveReg(Reg, Mask, MRI: DAG.MRI); |
1787 | if (Target.satisfied()) |
1788 | OptRegions.erase(OptIt->getFirst()); |
1789 | return OptRegions.empty(); |
1790 | }; |
1791 | |
1792 | // We need up-to-date live-out info to query live-out register masks in |
1793 | // regions containing rematerializable instructions. |
1794 | DAG.RegionLiveOuts.buildLiveRegMap(); |
1795 | |
1796 | // Cache set of registers that are going to be rematerialized. |
1797 | DenseSet<unsigned> RematRegs; |
1798 | |
1799 | // Identify rematerializable instructions in the function. |
1800 | for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { |
1801 | auto Region = DAG.Regions[I]; |
1802 | for (auto MI = Region.first; MI != Region.second; ++MI) { |
1803 | // The instruction must be trivially rematerializable. |
1804 | MachineInstr &DefMI = *MI; |
1805 | if (!isTriviallyReMaterializable(MI: DefMI)) |
1806 | continue; |
1807 | |
1808 | // We only support rematerializing virtual registers with one definition. |
1809 | Register Reg = DefMI.getOperand(i: 0).getReg(); |
1810 | if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(RegNo: Reg)) |
1811 | continue; |
1812 | |
1813 | // We only care to rematerialize the instruction if it has a single |
1814 | // non-debug user in a different region. The using MI may not belong to a |
1815 | // region if it is a lone region terminator. |
1816 | MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(RegNo: Reg); |
1817 | if (!UseMI) |
1818 | continue; |
1819 | auto UseRegion = MIRegion.find(Val: UseMI); |
1820 | if (UseRegion != MIRegion.end() && UseRegion->second == I) |
1821 | continue; |
1822 | |
1823 | // Do not rematerialize an instruction if it uses or is used by an |
1824 | // instruction that we have designated for rematerialization. |
1825 | // FIXME: Allow for rematerialization chains: this requires 1. updating |
1826 | // remat points to account for uses that are rematerialized, and 2. either |
1827 | // rematerializing the candidates in careful ordering, or deferring the |
1828 | // MBB RP walk until the entire chain has been rematerialized. |
1829 | if (Rematerializations.contains(Key: UseMI) || |
1830 | llvm::any_of(Range: DefMI.operands(), P: [&RematRegs](MachineOperand &MO) { |
1831 | return MO.isReg() && RematRegs.contains(V: MO.getReg()); |
1832 | })) |
1833 | continue; |
1834 | |
1835 | // Do not rematerialize an instruction if it uses registers that aren't |
1836 | // available at its use. This ensures that we are not extending any live |
1837 | // range while rematerializing. |
1838 | SlotIndex DefIdx = DAG.LIS->getInstructionIndex(Instr: DefMI); |
1839 | SlotIndex UseIdx = DAG.LIS->getInstructionIndex(Instr: *UseMI).getRegSlot(EC: true); |
1840 | if (!allUsesAvailableAt(InstToRemat: &DefMI, OriginalIdx: DefIdx, RematIdx: UseIdx)) |
1841 | continue; |
1842 | |
1843 | REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI); |
1844 | RematInstruction &Remat = |
1845 | Rematerializations.try_emplace(Key: &DefMI, Args&: UseMI).first->second; |
1846 | |
1847 | bool RematUseful = false; |
1848 | if (auto It = OptRegions.find(Val: I); It != OptRegions.end()) { |
1849 | // Optimistically consider that moving the instruction out of its |
1850 | // defining region will reduce RP in the latter; this assumes that |
1851 | // maximum RP in the region is reached somewhere between the defining |
1852 | // instruction and the end of the region. |
1853 | REMAT_DEBUG(dbgs() << " Defining region is optimizable\n" ); |
1854 | LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: I)[Reg]; |
1855 | if (ReduceRPInRegion(It, Reg, Mask, RematUseful)) |
1856 | return true; |
1857 | } |
1858 | |
1859 | for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) { |
1860 | // We are only collecting regions in which the register is a live-in |
1861 | // (and may be live-through). |
1862 | auto It = DAG.LiveIns[LIRegion].find(Val: Reg); |
1863 | if (It == DAG.LiveIns[LIRegion].end() || It->second.none()) |
1864 | continue; |
1865 | Remat.LiveInRegions.insert(V: LIRegion); |
1866 | |
1867 | // Account for the reduction in RP due to the rematerialization in an |
1868 | // optimizable region in which the defined register is a live-in. This |
1869 | // is exact for live-through regions but optimistic in the using region, |
1870 | // where RP is actually reduced only if maximum RP is reached somewhere |
1871 | // between the beginning of the region and the rematerializable |
1872 | // instruction's use. |
1873 | if (auto It = OptRegions.find(Val: LIRegion); It != OptRegions.end()) { |
1874 | REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n'); |
1875 | if (ReduceRPInRegion(It, Reg, DAG.LiveIns[LIRegion][Reg], |
1876 | RematUseful)) |
1877 | return true; |
1878 | } |
1879 | } |
1880 | |
1881 | // If the defined register is not a live-in or live-out of any optimizable |
1882 | // region, then there is no point in rematerializing the instruction. |
1883 | if (!RematUseful) { |
1884 | Rematerializations.pop_back(); |
1885 | REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n" ); |
1886 | } else { |
1887 | RematRegs.insert(V: Reg); |
1888 | } |
1889 | } |
1890 | } |
1891 | |
1892 | if (IncreaseOccupancy) { |
1893 | // We were trying to increase occupancy but failed, abort the stage. |
1894 | REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n" ); |
1895 | Rematerializations.clear(); |
1896 | return false; |
1897 | } |
1898 | REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n" ); |
1899 | return !Rematerializations.empty(); |
1900 | } |
1901 | |
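// Performs the rematerializations selected in
// canIncreaseOccupancyOrReduceSpill(): each candidate is cloned immediately
// before its single use, region boundaries and LiveIntervals are updated, the
// original definition is erased, and live-in/pressure information is updated
// for every impacted region (with RP fully recomputed where it cannot be
// adjusted incrementally).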
1902 | void PreRARematStage::rematerialize() { |
1903 | const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo(); |
1904 | |
1905 | // Collect regions whose RP changes in an unpredictable way; we will have to |
1906 | // fully recompute their RP after all rematerializations. |
1907 | DenseSet<unsigned> RecomputeRP; |
1908 | |
1909 | // Rematerialize all instructions. |
1910 | for (auto &[DefMI, Remat] : Rematerializations) { |
1911 | MachineBasicBlock::iterator InsertPos(Remat.UseMI); |
1912 | Register Reg = DefMI->getOperand(i: 0).getReg(); |
1913 | unsigned SubReg = DefMI->getOperand(i: 0).getSubReg(); |
1914 | unsigned DefRegion = MIRegion.at(Val: DefMI); |
1915 | |
1916 | // Rematerialize DefMI to its use block. |
1917 | TII->reMaterialize(MBB&: *InsertPos->getParent(), MI: InsertPos, DestReg: Reg, SubIdx: SubReg, Orig: *DefMI, |
1918 | TRI: *DAG.TRI); |
1919 | Remat.RematMI = &*std::prev(x: InsertPos); |
1920 | Remat.RematMI->getOperand(i: 0).setSubReg(SubReg); |
1921 | DAG.LIS->InsertMachineInstrInMaps(MI&: *Remat.RematMI); |
1922 | |
1923 | // Update region boundaries in the regions we sank from (remove the defining |
1924 | // MI) and sank to (insert the MI rematerialized in the use block). Only then |
1925 | // can we erase the original MI. |
1926 | DAG.updateRegionBoundaries(RegionBounds&: DAG.Regions[DefRegion], MI: DefMI, NewMI: nullptr); |
1927 | auto UseRegion = MIRegion.find(Val: Remat.UseMI); |
1928 | if (UseRegion != MIRegion.end()) { |
1929 | DAG.updateRegionBoundaries(RegionBounds&: DAG.Regions[UseRegion->second], MI: InsertPos, |
1930 | NewMI: Remat.RematMI); |
1931 | } |
1932 | DAG.LIS->RemoveMachineInstrFromMaps(MI&: *DefMI); |
1933 | DefMI->eraseFromParent(); |
1934 | |
1935 | // Collect all regions impacted by the rematerialization and update their |
1936 | // live-in/RP information. |
1937 | for (unsigned I : Remat.LiveInRegions) { |
1938 | ImpactedRegions.insert(KV: {I, DAG.Pressure[I]}); |
1939 | GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I]; |
1940 | |
1941 | #ifdef EXPENSIVE_CHECKS |
1942 | // All uses are known to be available / live at the remat point. Thus, the |
1943 | // uses should already be live-in to the region. |
1944 | for (MachineOperand &MO : DefMI->operands()) { |
1945 | if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) |
1946 | continue; |
1947 | |
1948 | Register UseReg = MO.getReg(); |
1949 | if (!UseReg.isVirtual()) |
1950 | continue; |
1951 | |
1952 | LiveInterval &LI = DAG.LIS->getInterval(UseReg); |
1953 | LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg()); |
1954 | if (LI.hasSubRanges() && MO.getSubReg()) |
1955 | LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg()); |
1956 | |
1957 | LaneBitmask LiveInMask = RegionLiveIns.at(UseReg); |
1958 | LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM); |
1959 | // If this register has lanes not covered by the LiveIns, be sure they |
1960 | // do not map to any subrange. ref: |
1961 | // machine-scheduler-sink-trivial-remats.mir::omitted_subrange |
1962 | if (UncoveredLanes.any()) { |
1963 | assert(LI.hasSubRanges()); |
1964 | for (LiveInterval::SubRange &SR : LI.subranges()) |
1965 | assert((SR.LaneMask & UncoveredLanes).none()); |
1966 | } |
1967 | } |
1968 | #endif |
1969 | |
1970 | // The register is no longer a live-in in any region except the one that |
1971 | // contains the single use. In live-through regions, maximum register |
1972 | // pressure decreases predictably, so we can directly update it. In the |
1973 | // using region, maximum RP may or may not decrease, so we mark it for |
1974 | // re-computation after all rematerializations have taken place. |
1975 | LaneBitmask PrevMask = RegionLiveIns[Reg]; |
1976 | RegionLiveIns.erase(Val: Reg); |
1977 | RegMasks.insert(KV: {{I, Remat.RematMI->getOperand(i: 0).getReg()}, PrevMask}); |
1978 | if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent()) |
1979 | DAG.Pressure[I].inc(Reg, PrevMask, NewMask: LaneBitmask::getNone(), MRI: DAG.MRI); |
1980 | else |
1981 | RecomputeRP.insert(V: I); |
1982 | } |
1983 | // RP in the region from which the instruction was rematerialized may or may |
1984 | // not decrease. |
1985 | ImpactedRegions.insert(KV: {DefRegion, DAG.Pressure[DefRegion]}); |
1986 | RecomputeRP.insert(V: DefRegion); |
1987 | |
1988 | // Recompute live interval to reflect the register's rematerialization. |
1989 | Register RematReg = Remat.RematMI->getOperand(i: 0).getReg(); |
1990 | DAG.LIS->removeInterval(Reg: RematReg); |
1991 | DAG.LIS->createAndComputeVirtRegInterval(Reg: RematReg); |
1992 | } |
1993 | |
1994 | // All regions impacted by at least one rematerialization must be rescheduled. |
1995 | // Maximum pressure must also be recomputed for all regions where it changed |
1996 | // non-predictably and checked against the target occupancy. |
1997 | AchievedOcc = TargetOcc; |
1998 | for (auto &[I, OriginalRP] : ImpactedRegions) { |
1999 | bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second; |
2000 | RescheduleRegions[I] = !IsEmptyRegion; |
2001 | if (!RecomputeRP.contains(V: I)) |
2002 | continue; |
2003 | |
2004 | GCNRegPressure RP; |
2005 | if (IsEmptyRegion) { |
2006 | RP = getRegPressure(MRI: DAG.MRI, LiveRegs&: DAG.LiveIns[I]); |
2007 | } else { |
2008 | GCNDownwardRPTracker RPT(*DAG.LIS); |
2009 | auto *NonDbgMI = &*skipDebugInstructionsForward(It: DAG.Regions[I].first, |
2010 | End: DAG.Regions[I].second); |
2011 | if (NonDbgMI == DAG.Regions[I].second) { |
2012 | // Region is non-empty but contains only debug instructions. |
2013 | RP = getRegPressure(MRI: DAG.MRI, LiveRegs&: DAG.LiveIns[I]); |
2014 | } else { |
2015 | RPT.reset(MI: *NonDbgMI, LiveRegs: &DAG.LiveIns[I]); |
2016 | RPT.advance(End: DAG.Regions[I].second); |
2017 | RP = RPT.moveMaxPressure(); |
2018 | } |
2019 | } |
2020 | DAG.Pressure[I] = RP; |
2021 | AchievedOcc = std::min( |
2022 | a: AchievedOcc, b: RP.getOccupancy(ST, DynamicVGPRBlockSize: MF.getInfo<SIMachineFunctionInfo>() |
2023 | ->getDynamicVGPRBlockSize())); |
2024 | } |
2025 | REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n" ); |
2026 | } |
2027 | |
2028 | // Copied from MachineLICM |
2029 | bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) { |
2030 | if (!DAG.TII->isTriviallyReMaterializable(MI)) |
2031 | return false; |
2032 | |
2033 | for (const MachineOperand &MO : MI.all_uses()) { |
2034 | // We can't remat physreg uses, unless the register is constant or the use |
2035 | // is ignorable (e.g., the implicit exec use on VALU instructions). |
2036 | if (MO.getReg().isPhysical()) { |
2037 | if (DAG.MRI.isConstantPhysReg(PhysReg: MO.getReg()) || DAG.TII->isIgnorableUse(MO)) |
2038 | continue; |
2039 | return false; |
2040 | } |
2041 | } |
2042 | |
2043 | return true; |
2044 | } |
2045 | |
2046 | void PreRARematStage::finalizeGCNSchedStage() { |
2047 | // We consider that reducing spilling is always beneficial so we never |
2048 | // rollback rematerializations in such cases. It's also possible that |
2049 | // rescheduling lowers occupancy over the one achieved just through remats, in |
2050 | // which case we do not want to rollback either (the rescheduling was already |
2051 | // reverted in PreRARematStage::shouldRevertScheduling in such cases). |
2052 | unsigned MaxOcc = std::max(a: AchievedOcc, b: DAG.MinOccupancy); |
2053 | if (!IncreaseOccupancy || MaxOcc >= TargetOcc) |
2054 | return; |
2055 | |
2056 | REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n" ); |
2057 | const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo(); |
2058 | |
2059 | // Rollback the rematerializations. |
2060 | for (const auto &[DefMI, Remat] : Rematerializations) { |
2061 | MachineInstr &RematMI = *Remat.RematMI; |
2062 | unsigned DefRegion = MIRegion.at(Val: DefMI); |
2063 | MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second); |
2064 | MachineBasicBlock *MBB = RegionBB[DefRegion]; |
2065 | Register Reg = RematMI.getOperand(i: 0).getReg(); |
2066 | unsigned SubReg = RematMI.getOperand(i: 0).getSubReg(); |
2067 | |
2068 | // Re-rematerialize the MI at the end of its original region. Note that it |
2069 | // may not end up in exactly its original position within the region, but |
2070 | // that should not matter much. |
2071 | TII->reMaterialize(MBB&: *MBB, MI: InsertPos, DestReg: Reg, SubIdx: SubReg, Orig: RematMI, TRI: *DAG.TRI); |
2072 | MachineInstr *NewMI = &*std::prev(x: InsertPos); |
2073 | NewMI->getOperand(i: 0).setSubReg(SubReg); |
2074 | DAG.LIS->InsertMachineInstrInMaps(MI&: *NewMI); |
2075 | |
2076 | auto UseRegion = MIRegion.find(Val: Remat.UseMI); |
2077 | if (UseRegion != MIRegion.end()) { |
2078 | DAG.updateRegionBoundaries(RegionBounds&: DAG.Regions[UseRegion->second], MI: RematMI, |
2079 | NewMI: nullptr); |
2080 | } |
2081 | DAG.updateRegionBoundaries(RegionBounds&: DAG.Regions[DefRegion], MI: InsertPos, NewMI); |
2082 | |
2083 | // Erase rematerialized MI. |
2084 | DAG.LIS->RemoveMachineInstrFromMaps(MI&: RematMI); |
2085 | RematMI.eraseFromParent(); |
2086 | |
2087 | // Recompute the live interval for the re-rematerialized register. |
2088 | DAG.LIS->removeInterval(Reg); |
2089 | DAG.LIS->createAndComputeVirtRegInterval(Reg); |
2090 | |
2091 | // Re-add the register as a live-in in all regions it used to be one in. |
2092 | for (unsigned LIRegion : Remat.LiveInRegions) |
2093 | DAG.LiveIns[LIRegion].insert(KV: {Reg, RegMasks.at(Val: {LIRegion, Reg})}); |
2094 | } |
2095 | |
2096 | // Reset RP in all impacted regions. |
2097 | for (auto &[I, OriginalRP] : ImpactedRegions) |
2098 | DAG.Pressure[I] = OriginalRP; |
2099 | |
2100 | GCNSchedStage::finalizeGCNSchedStage(); |
2101 | } |
2102 | |
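// Keeps a region's half-open [RegionBegin, RegionEnd) boundaries valid when an
// instruction at the region's lower boundary is removed (NewMI == nullptr) or
// when a new instruction NewMI is inserted at position MI.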
2103 | void GCNScheduleDAGMILive::updateRegionBoundaries( |
2104 | RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI, |
2105 | MachineInstr *NewMI) { |
2106 | assert((!NewMI || NewMI != RegionBounds.second) && |
2107 | "cannot remove at region end" ); |
2108 | |
2109 | if (RegionBounds.first == RegionBounds.second) { |
2110 | assert(NewMI && "cannot remove from an empty region" ); |
2111 | RegionBounds.first = NewMI; |
2112 | return; |
2113 | } |
2114 | |
2115 | // We only care about modifications at the beginning of a non-empty region |
2116 | // since the upper region boundary is exclusive. |
2117 | if (MI != RegionBounds.first) |
2118 | return; |
2119 | if (!NewMI) |
2120 | RegionBounds.first = std::next(x: MI); // Removal |
2121 | else |
2122 | RegionBounds.first = NewMI; // Insertion |
2123 | } |
2124 | |
2125 | static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { |
2126 | const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII); |
2127 | return any_of(Range&: *DAG, P: [SII](MachineBasicBlock::iterator MI) { |
2128 | return SII->isIGLPMutationOnly(Opcode: MI->getOpcode()); |
2129 | }); |
2130 | } |
2131 | |
2132 | GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive( |
2133 | MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S, |
2134 | bool RemoveKillFlags) |
2135 | : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {} |
2136 | |
2137 | void GCNPostScheduleDAGMILive::schedule() { |
2138 | HasIGLPInstrs = hasIGLPInstrs(DAG: this); |
2139 | if (HasIGLPInstrs) { |
2140 | SavedMutations.clear(); |
2141 | SavedMutations.swap(x&: Mutations); |
2142 | addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PostRA)); |
2143 | } |
2144 | |
2145 | ScheduleDAGMI::schedule(); |
2146 | } |
2147 | |
2148 | void GCNPostScheduleDAGMILive::finalizeSchedule() { |
2149 | if (HasIGLPInstrs) |
2150 | SavedMutations.swap(x&: Mutations); |
2151 | |
2152 | ScheduleDAGMI::finalizeSchedule(); |
2153 | } |
2154 | |