| 1 | //===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | /// \file |
| 10 | /// This contains a MachineSchedStrategy implementation for maximizing wave |
| 11 | /// occupancy on GCN hardware. |
| 12 | /// |
| 13 | /// This pass will apply multiple scheduling stages to the same function. |
| 14 | /// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual |
| 15 | /// entry point for the scheduling of those regions is |
| 16 | /// GCNScheduleDAGMILive::runSchedStages. |
| 17 | |
| 18 | /// Generally, the reason for having multiple scheduling stages is to account |
| 19 | /// for the kernel-wide effect of register usage on occupancy. Usually, only a |
| 20 | /// few scheduling regions will have register pressure high enough to limit |
| 21 | /// occupancy for the kernel, so constraints can be relaxed to improve ILP in |
| 22 | /// other regions. |
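/// With the default max-occupancy strategy, the stages are the ones pushed in
/// GCNMaxOccupancySchedStrategy's constructor further down in this file: an
/// initial occupancy-targeted schedule, an unclustered high register pressure
/// reschedule, a clustered low occupancy reschedule, and a pre-RA
/// rematerialization stage.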
| 23 | /// |
| 24 | //===----------------------------------------------------------------------===// |
| 25 | |
| 26 | #include "GCNSchedStrategy.h" |
| 27 | #include "AMDGPUIGroupLP.h" |
| 28 | #include "GCNRegPressure.h" |
| 29 | #include "SIMachineFunctionInfo.h" |
| 30 | #include "Utils/AMDGPUBaseInfo.h" |
| 31 | #include "llvm/ADT/STLExtras.h" |
| 32 | #include "llvm/CodeGen/RegisterClassInfo.h" |
| 33 | #include "llvm/MC/LaneBitmask.h" |
| 34 | #include "llvm/Support/ErrorHandling.h" |
| 35 | |
| 36 | #define DEBUG_TYPE "machine-scheduler" |
| 37 | |
| 38 | using namespace llvm; |
| 39 | |
static cl::opt<bool> DisableUnclusterHighRP(
    "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
    cl::desc("Disable unclustered high register pressure "
             "reduction scheduling stage."),
    cl::init(false));

static cl::opt<bool> DisableClusteredLowOccupancy(
    "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
    cl::desc("Disable clustered low occupancy "
             "rescheduling for ILP scheduling stage."),
    cl::init(false));

static cl::opt<unsigned> ScheduleMetricBias(
    "amdgpu-schedule-metric-bias", cl::Hidden,
    cl::desc(
        "Sets the bias which adds weight to occupancy vs latency. Set it to "
        "100 to chase the occupancy only."),
    cl::init(10));

static cl::opt<bool>
    RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
               cl::desc("Relax occupancy targets for kernels which are memory "
                        "bound (amdgpu-membound-threshold), or "
                        "Wave Limited (amdgpu-limit-wave-threshold)."),
               cl::init(false));

static cl::opt<bool> GCNTrackers(
    "amdgpu-use-amdgpu-trackers", cl::Hidden,
    cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
    cl::init(false));
| 70 | |
| 71 | const unsigned ScheduleMetrics::ScaleFactor = 100; |
| 72 | |
| 73 | GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) |
| 74 | : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), |
| 75 | DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) { |
| 76 | } |
| 77 | |
| 78 | void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { |
| 79 | GenericScheduler::initialize(dag: DAG); |
| 80 | |
| 81 | MF = &DAG->MF; |
| 82 | |
| 83 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
| 84 | |
  SGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
  VGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);

  SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  // Set the initial TargetOccupancy to the maximum occupancy that we can
  // achieve for this function. This effectively sets a lower bound on the
  // 'Critical' register limits in the scheduler.
  // Allow for lower occupancy targets if the kernel is wave limited or memory
  // bound, and the relaxed occupancy feature is enabled.
  TargetOccupancy =
      RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
  SGPRCriticalLimit =
      std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
| 100 | |
  if (!KnownExcessRP) {
    VGPRCriticalLimit = std::min(
        ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
        VGPRExcessLimit);
  } else {
    // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result, except
    // that it returns a reasonably small number for targets with lots of
    // VGPRs, such as GFX10 and GFX11.
    LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
                         "VGPRCriticalLimit calculation method.\n");
    unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
    unsigned Granule =
        AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, DynamicVGPRBlockSize);
    unsigned Addressable =
        AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST, DynamicVGPRBlockSize);
    unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
    VGPRBudget = std::max(VGPRBudget, Granule);
    VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
  }
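  // Illustrative numbers for the budget above: with 256 addressable VGPRs, a
  // target occupancy of 6 waves, and an allocation granule of 4, the budget is
  // alignDown(256 / 6, 4) = alignDown(42, 4) = 40 VGPRs.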

  // Subtract error margin and bias from register limits and avoid overflow.
  SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit);
  VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit);
  SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit);
  VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit);

  LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
                    << ", VGPRExcessLimit = " << VGPRExcessLimit
                    << ", SGPRCriticalLimit = " << SGPRCriticalLimit
                    << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
| 131 | } |
| 132 | |
| 133 | /// Checks whether \p SU can use the cached DAG pressure diffs to compute the |
| 134 | /// current register pressure. |
| 135 | /// |
| 136 | /// This works for the common case, but it has a few exceptions that have been |
| 137 | /// observed through trial and error: |
| 138 | /// - Explicit physical register operands |
| 139 | /// - Subregister definitions |
| 140 | /// |
| 141 | /// In both of those cases, PressureDiff doesn't represent the actual pressure, |
| 142 | /// and querying LiveIntervals through the RegPressureTracker is needed to get |
| 143 | /// an accurate value. |
| 144 | /// |
| 145 | /// We should eventually only use PressureDiff for maximum performance, but this |
| 146 | /// already allows 80% of SUs to take the fast path without changing scheduling |
| 147 | /// at all. Further changes would either change scheduling, or require a lot |
| 148 | /// more logic to recover an accurate pressure estimate from the PressureDiffs. |
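/// As an illustration (not an exhaustive list), a COPY that reads a physical
/// register live-in such as $vgpr0, or a partial definition of the form
/// "%0.sub1:vreg_64 = ...", both fall back to the slow RegPressureTracker
/// path.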
| 149 | static bool canUsePressureDiffs(const SUnit &SU) { |
| 150 | if (!SU.isInstr()) |
| 151 | return false; |
| 152 | |
  // Cannot use pressure diffs for subregister defs or with physregs; it's
  // imprecise in both cases.
| 155 | for (const auto &Op : SU.getInstr()->operands()) { |
| 156 | if (!Op.isReg() || Op.isImplicit()) |
| 157 | continue; |
| 158 | if (Op.getReg().isPhysical() || |
| 159 | (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister)) |
| 160 | return false; |
| 161 | } |
| 162 | return true; |
| 163 | } |
| 164 | |
| 165 | static void getRegisterPressures( |
| 166 | bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU, |
| 167 | std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure, |
| 168 | GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker, |
| 169 | ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) { |
  // getDownwardPressure() and getUpwardPressure() make temporary changes to
  // the tracker, so we need to pass those functions a non-const copy.
  RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
  if (!GCNTrackers) {
    AtTop
        ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
        : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);

    return;
  }

  // GCNTrackers
  Pressure.resize(4, 0);
  MachineInstr *MI = SU->getInstr();
  GCNRegPressure NewPressure;
  if (AtTop) {
    GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
    NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
  } else {
    GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
    TempUpwardTracker.recede(*MI);
    NewPressure = TempUpwardTracker.getPressure();
  }
  Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
  Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
      NewPressure.getArchVGPRNum();
| 197 | } |
| 198 | |
| 199 | void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, |
| 200 | bool AtTop, |
| 201 | const RegPressureTracker &RPTracker, |
| 202 | const SIRegisterInfo *SRI, |
| 203 | unsigned SGPRPressure, |
| 204 | unsigned VGPRPressure, bool IsBottomUp) { |
| 205 | Cand.SU = SU; |
| 206 | Cand.AtTop = AtTop; |
| 207 | |
| 208 | if (!DAG->isTrackingPressure()) |
| 209 | return; |
| 210 | |
| 211 | Pressure.clear(); |
| 212 | MaxPressure.clear(); |
| 213 | |
| 214 | // We try to use the cached PressureDiffs in the ScheduleDAG whenever |
| 215 | // possible over querying the RegPressureTracker. |
| 216 | // |
| 217 | // RegPressureTracker will make a lot of LIS queries which are very |
| 218 | // expensive, it is considered a slow function in this context. |
| 219 | // |
| 220 | // PressureDiffs are precomputed and cached, and getPressureDiff is just a |
| 221 | // trivial lookup into an array. It is pretty much free. |
| 222 | // |
| 223 | // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of |
| 224 | // PressureDiffs. |
| 225 | if (AtTop || !canUsePressureDiffs(SU: *SU) || GCNTrackers) { |
| 226 | getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure, |
| 227 | DownwardTracker, UpwardTracker, DAG, SRI); |
| 228 | } else { |
| 229 | // Reserve 4 slots. |
| 230 | Pressure.resize(new_size: 4, x: 0); |
| 231 | Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure; |
| 232 | Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure; |
| 233 | |
| 234 | for (const auto &Diff : DAG->getPressureDiff(SU)) { |
| 235 | if (!Diff.isValid()) |
| 236 | continue; |
      // PressureDiffs are always computed bottom-up, so if we're working
      // top-down we need to invert the sign.
| 239 | Pressure[Diff.getPSet()] += |
| 240 | (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc()); |
| 241 | } |
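    // Illustrative numbers: if the cached diff for this SU is +2 SReg_32,
    // scheduling it bottom-up raises the tracked SGPR pressure from, say, 40
    // to 42, while placing the same SU when scheduling top-down lowers it from
    // 40 to 38; hence the sign flip above.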
| 242 | |
| 243 | #ifdef EXPENSIVE_CHECKS |
| 244 | std::vector<unsigned> CheckPressure, CheckMaxPressure; |
| 245 | getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure, |
| 246 | DownwardTracker, UpwardTracker, DAG, SRI); |
| 247 | if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] != |
| 248 | CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] || |
| 249 | Pressure[AMDGPU::RegisterPressureSets::VGPR_32] != |
| 250 | CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) { |
| 251 | errs() << "Register Pressure is inaccurate when calculated through " |
| 252 | "PressureDiff\n" |
| 253 | << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32] |
| 254 | << ", expected " |
| 255 | << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n" |
| 256 | << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32] |
| 257 | << ", expected " |
| 258 | << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n" ; |
| 259 | report_fatal_error("inaccurate register pressure calculation" ); |
| 260 | } |
| 261 | #endif |
| 262 | } |
| 263 | |
| 264 | unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; |
| 265 | unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; |
| 266 | |
| 267 | // If two instructions increase the pressure of different register sets |
| 268 | // by the same amount, the generic scheduler will prefer to schedule the |
| 269 | // instruction that increases the set with the least amount of registers, |
| 270 | // which in our case would be SGPRs. This is rarely what we want, so |
| 271 | // when we report excess/critical register pressure, we do it either |
| 272 | // only for VGPRs or only for SGPRs. |
| 273 | |
| 274 | // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs. |
| 275 | const unsigned MaxVGPRPressureInc = 16; |
| 276 | bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit; |
| 277 | bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit; |
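  // For instance (illustrative numbers), with VGPRExcessLimit = 250 and a
  // current VGPR pressure of 240, 240 + 16 >= 250 holds, so VGPR excess
  // tracking kicks in before the limit is actually reached.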
| 278 | |
| 279 | // FIXME: We have to enter REG-EXCESS before we reach the actual threshold |
| 280 | // to increase the likelihood we don't go over the limits. We should improve |
| 281 | // the analysis to look through dependencies to find the path with the least |
| 282 | // register pressure. |
| 283 | |
| 284 | // We only need to update the RPDelta for instructions that increase register |
| 285 | // pressure. Instructions that decrease or keep reg pressure the same will be |
| 286 | // marked as RegExcess in tryCandidate() when they are compared with |
| 287 | // instructions that increase the register pressure. |
| 288 | if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) { |
| 289 | HasHighPressure = true; |
| 290 | Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); |
| 291 | Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit); |
| 292 | } |
| 293 | |
| 294 | if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) { |
| 295 | HasHighPressure = true; |
| 296 | Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32); |
| 297 | Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit); |
| 298 | } |
| 299 | |
  // Register pressure is considered 'CRITICAL' if it is approaching a value
  // that would reduce the wave occupancy for the execution unit. When
  // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure have
  // the same cost, so we don't need to prefer one over the other.
| 304 | |
| 305 | int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit; |
| 306 | int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit; |
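  // Illustrative numbers: NewVGPRPressure = 130 against a VGPRCriticalLimit of
  // 128 gives VGPRDelta = +2, which marks the candidate as critical below.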
| 307 | |
| 308 | if (SGPRDelta >= 0 || VGPRDelta >= 0) { |
| 309 | HasHighPressure = true; |
| 310 | if (SGPRDelta > VGPRDelta) { |
| 311 | Cand.RPDelta.CriticalMax = |
| 312 | PressureChange(AMDGPU::RegisterPressureSets::SReg_32); |
| 313 | Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta); |
| 314 | } else { |
| 315 | Cand.RPDelta.CriticalMax = |
| 316 | PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); |
| 317 | Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta); |
| 318 | } |
| 319 | } |
| 320 | } |
| 321 | |
| 322 | // This function is mostly cut and pasted from |
| 323 | // GenericScheduler::pickNodeFromQueue() |
| 324 | void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, |
| 325 | const CandPolicy &ZonePolicy, |
| 326 | const RegPressureTracker &RPTracker, |
| 327 | SchedCandidate &Cand, |
| 328 | bool IsBottomUp) { |
| 329 | const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); |
| 330 | ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos(); |
| 331 | unsigned SGPRPressure = 0; |
| 332 | unsigned VGPRPressure = 0; |
| 333 | if (DAG->isTrackingPressure()) { |
| 334 | if (!GCNTrackers) { |
| 335 | SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; |
| 336 | VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; |
| 337 | } else { |
| 338 | GCNRPTracker *T = IsBottomUp |
| 339 | ? static_cast<GCNRPTracker *>(&UpwardTracker) |
| 340 | : static_cast<GCNRPTracker *>(&DownwardTracker); |
| 341 | SGPRPressure = T->getPressure().getSGPRNum(); |
| 342 | VGPRPressure = T->getPressure().getArchVGPRNum(); |
| 343 | } |
| 344 | } |
| 345 | ReadyQueue &Q = Zone.Available; |
| 346 | for (SUnit *SU : Q) { |
| 347 | |
| 348 | SchedCandidate TryCand(ZonePolicy); |
| 349 | initCandidate(Cand&: TryCand, SU, AtTop: Zone.isTop(), RPTracker, SRI, SGPRPressure, |
| 350 | VGPRPressure, IsBottomUp); |
| 351 | // Pass SchedBoundary only when comparing nodes from the same boundary. |
| 352 | SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; |
| 353 | tryCandidate(Cand, TryCand, Zone: ZoneArg); |
| 354 | if (TryCand.Reason != NoCand) { |
| 355 | // Initialize resource delta if needed in case future heuristics query it. |
| 356 | if (TryCand.ResDelta == SchedResourceDelta()) |
| 357 | TryCand.initResourceDelta(DAG: Zone.DAG, SchedModel); |
| 358 | Cand.setBest(TryCand); |
| 359 | LLVM_DEBUG(traceCandidate(Cand)); |
| 360 | } |
| 361 | } |
| 362 | } |
| 363 | |
| 364 | // This function is mostly cut and pasted from |
| 365 | // GenericScheduler::pickNodeBidirectional() |
| 366 | SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { |
| 367 | // Schedule as far as possible in the direction of no choice. This is most |
| 368 | // efficient, but also provides the best heuristics for CriticalPSets. |
| 369 | if (SUnit *SU = Bot.pickOnlyChoice()) { |
| 370 | IsTopNode = false; |
| 371 | return SU; |
| 372 | } |
| 373 | if (SUnit *SU = Top.pickOnlyChoice()) { |
| 374 | IsTopNode = true; |
| 375 | return SU; |
| 376 | } |
| 377 | // Set the bottom-up policy based on the state of the current bottom zone and |
| 378 | // the instructions outside the zone, including the top zone. |
| 379 | CandPolicy BotPolicy; |
| 380 | setPolicy(Policy&: BotPolicy, /*IsPostRA=*/false, CurrZone&: Bot, OtherZone: &Top); |
| 381 | // Set the top-down policy based on the state of the current top zone and |
| 382 | // the instructions outside the zone, including the bottom zone. |
| 383 | CandPolicy TopPolicy; |
| 384 | setPolicy(Policy&: TopPolicy, /*IsPostRA=*/false, CurrZone&: Top, OtherZone: &Bot); |
| 385 | |
| 386 | // See if BotCand is still valid (because we previously scheduled from Top). |
| 387 | LLVM_DEBUG(dbgs() << "Picking from Bot:\n" ); |
| 388 | if (!BotCand.isValid() || BotCand.SU->isScheduled || |
| 389 | BotCand.Policy != BotPolicy) { |
| 390 | BotCand.reset(NewPolicy: CandPolicy()); |
| 391 | pickNodeFromQueue(Zone&: Bot, ZonePolicy: BotPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand, |
| 392 | /*IsBottomUp=*/true); |
| 393 | assert(BotCand.Reason != NoCand && "failed to find the first candidate" ); |
| 394 | } else { |
| 395 | LLVM_DEBUG(traceCandidate(BotCand)); |
| 396 | #ifndef NDEBUG |
| 397 | if (VerifyScheduling) { |
| 398 | SchedCandidate TCand; |
| 399 | TCand.reset(CandPolicy()); |
| 400 | pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand, |
| 401 | /*IsBottomUp=*/true); |
| 402 | assert(TCand.SU == BotCand.SU && |
| 403 | "Last pick result should correspond to re-picking right now" ); |
| 404 | } |
| 405 | #endif |
| 406 | } |
| 407 | |
| 408 | // Check if the top Q has a better candidate. |
| 409 | LLVM_DEBUG(dbgs() << "Picking from Top:\n" ); |
| 410 | if (!TopCand.isValid() || TopCand.SU->isScheduled || |
| 411 | TopCand.Policy != TopPolicy) { |
| 412 | TopCand.reset(NewPolicy: CandPolicy()); |
| 413 | pickNodeFromQueue(Zone&: Top, ZonePolicy: TopPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand, |
| 414 | /*IsBottomUp=*/false); |
| 415 | assert(TopCand.Reason != NoCand && "failed to find the first candidate" ); |
| 416 | } else { |
| 417 | LLVM_DEBUG(traceCandidate(TopCand)); |
| 418 | #ifndef NDEBUG |
| 419 | if (VerifyScheduling) { |
| 420 | SchedCandidate TCand; |
| 421 | TCand.reset(CandPolicy()); |
| 422 | pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand, |
| 423 | /*IsBottomUp=*/false); |
| 424 | assert(TCand.SU == TopCand.SU && |
| 425 | "Last pick result should correspond to re-picking right now" ); |
| 426 | } |
| 427 | #endif |
| 428 | } |
| 429 | |
| 430 | // Pick best from BotCand and TopCand. |
| 431 | LLVM_DEBUG(dbgs() << "Top Cand: " ; traceCandidate(TopCand); |
| 432 | dbgs() << "Bot Cand: " ; traceCandidate(BotCand);); |
| 433 | SchedCandidate Cand = BotCand; |
| 434 | TopCand.Reason = NoCand; |
| 435 | tryCandidate(Cand, TryCand&: TopCand, Zone: nullptr); |
| 436 | if (TopCand.Reason != NoCand) { |
| 437 | Cand.setBest(TopCand); |
| 438 | } |
| 439 | LLVM_DEBUG(dbgs() << "Picking: " ; traceCandidate(Cand);); |
| 440 | |
| 441 | IsTopNode = Cand.AtTop; |
| 442 | return Cand.SU; |
| 443 | } |
| 444 | |
| 445 | // This function is mostly cut and pasted from |
| 446 | // GenericScheduler::pickNode() |
| 447 | SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { |
| 448 | if (DAG->top() == DAG->bottom()) { |
| 449 | assert(Top.Available.empty() && Top.Pending.empty() && |
| 450 | Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage" ); |
| 451 | return nullptr; |
| 452 | } |
| 453 | SUnit *SU; |
| 454 | do { |
| 455 | if (RegionPolicy.OnlyTopDown) { |
| 456 | SU = Top.pickOnlyChoice(); |
| 457 | if (!SU) { |
| 458 | CandPolicy NoPolicy; |
| 459 | TopCand.reset(NewPolicy: NoPolicy); |
| 460 | pickNodeFromQueue(Zone&: Top, ZonePolicy: NoPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand, |
| 461 | /*IsBottomUp=*/false); |
| 462 | assert(TopCand.Reason != NoCand && "failed to find a candidate" ); |
| 463 | SU = TopCand.SU; |
| 464 | } |
| 465 | IsTopNode = true; |
| 466 | } else if (RegionPolicy.OnlyBottomUp) { |
| 467 | SU = Bot.pickOnlyChoice(); |
| 468 | if (!SU) { |
| 469 | CandPolicy NoPolicy; |
| 470 | BotCand.reset(NewPolicy: NoPolicy); |
| 471 | pickNodeFromQueue(Zone&: Bot, ZonePolicy: NoPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand, |
| 472 | /*IsBottomUp=*/true); |
| 473 | assert(BotCand.Reason != NoCand && "failed to find a candidate" ); |
| 474 | SU = BotCand.SU; |
| 475 | } |
| 476 | IsTopNode = false; |
| 477 | } else { |
| 478 | SU = pickNodeBidirectional(IsTopNode); |
| 479 | } |
| 480 | } while (SU->isScheduled); |
| 481 | |
| 482 | if (SU->isTopReady()) |
| 483 | Top.removeReady(SU); |
| 484 | if (SU->isBottomReady()) |
| 485 | Bot.removeReady(SU); |
| 486 | |
| 487 | LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " |
| 488 | << *SU->getInstr()); |
| 489 | return SU; |
| 490 | } |
| 491 | |
| 492 | void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { |
| 493 | if (GCNTrackers) { |
| 494 | MachineInstr *MI = SU->getInstr(); |
| 495 | IsTopNode ? (void)DownwardTracker.advance(MI, UseInternalIterator: false) |
| 496 | : UpwardTracker.recede(MI: *MI); |
| 497 | } |
| 498 | |
| 499 | return GenericScheduler::schedNode(SU, IsTopNode); |
| 500 | } |
| 501 | |
| 502 | GCNSchedStageID GCNSchedStrategy::getCurrentStage() { |
| 503 | assert(CurrentStage && CurrentStage != SchedStages.end()); |
| 504 | return *CurrentStage; |
| 505 | } |
| 506 | |
| 507 | bool GCNSchedStrategy::advanceStage() { |
| 508 | assert(CurrentStage != SchedStages.end()); |
| 509 | if (!CurrentStage) |
| 510 | CurrentStage = SchedStages.begin(); |
| 511 | else |
| 512 | CurrentStage++; |
| 513 | |
| 514 | return CurrentStage != SchedStages.end(); |
| 515 | } |
| 516 | |
| 517 | bool GCNSchedStrategy::hasNextStage() const { |
| 518 | assert(CurrentStage); |
| 519 | return std::next(x: CurrentStage) != SchedStages.end(); |
| 520 | } |
| 521 | |
| 522 | GCNSchedStageID GCNSchedStrategy::getNextStage() const { |
| 523 | assert(CurrentStage && std::next(CurrentStage) != SchedStages.end()); |
| 524 | return *std::next(x: CurrentStage); |
| 525 | } |
| 526 | |
| 527 | GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( |
| 528 | const MachineSchedContext *C, bool IsLegacyScheduler) |
| 529 | : GCNSchedStrategy(C) { |
| 530 | SchedStages.push_back(Elt: GCNSchedStageID::OccInitialSchedule); |
| 531 | SchedStages.push_back(Elt: GCNSchedStageID::UnclusteredHighRPReschedule); |
| 532 | SchedStages.push_back(Elt: GCNSchedStageID::ClusteredLowOccupancyReschedule); |
| 533 | SchedStages.push_back(Elt: GCNSchedStageID::PreRARematerialize); |
| 534 | GCNTrackers = GCNTrackers & !IsLegacyScheduler; |
| 535 | } |
| 536 | |
| 537 | GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C) |
| 538 | : GCNSchedStrategy(C) { |
| 539 | SchedStages.push_back(Elt: GCNSchedStageID::ILPInitialSchedule); |
| 540 | } |
| 541 | |
| 542 | bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand, |
| 543 | SchedCandidate &TryCand, |
| 544 | SchedBoundary *Zone) const { |
| 545 | // Initialize the candidate if needed. |
| 546 | if (!Cand.isValid()) { |
| 547 | TryCand.Reason = NodeOrder; |
| 548 | return true; |
| 549 | } |
| 550 | |
| 551 | // Avoid spilling by exceeding the register limit. |
| 552 | if (DAG->isTrackingPressure() && |
| 553 | tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand, |
| 554 | Reason: RegExcess, TRI, MF: DAG->MF)) |
| 555 | return TryCand.Reason != NoCand; |
| 556 | |
  // Bias physreg defs and copies toward their uses and definitions,
  // respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
    return TryCand.Reason != NoCand;
| 561 | |
| 562 | bool SameBoundary = Zone != nullptr; |
| 563 | if (SameBoundary) { |
| 564 | // Prioritize instructions that read unbuffered resources by stall cycles. |
| 565 | if (tryLess(TryVal: Zone->getLatencyStallCycles(SU: TryCand.SU), |
| 566 | CandVal: Zone->getLatencyStallCycles(SU: Cand.SU), TryCand, Cand, Reason: Stall)) |
| 567 | return TryCand.Reason != NoCand; |
| 568 | |
| 569 | // Avoid critical resource consumption and balance the schedule. |
| 570 | TryCand.initResourceDelta(DAG, SchedModel); |
| 571 | if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources, |
| 572 | TryCand, Cand, Reason: ResourceReduce)) |
| 573 | return TryCand.Reason != NoCand; |
| 574 | if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources, |
| 575 | CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand, |
| 576 | Reason: ResourceDemand)) |
| 577 | return TryCand.Reason != NoCand; |
| 578 | |
| 579 | // Unconditionally try to reduce latency. |
| 580 | if (tryLatency(TryCand, Cand, Zone&: *Zone)) |
| 581 | return TryCand.Reason != NoCand; |
| 582 | |
| 583 | // Weak edges are for clustering and other constraints. |
| 584 | if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop), |
| 585 | CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak)) |
| 586 | return TryCand.Reason != NoCand; |
| 587 | } |
| 588 | |
| 589 | // Keep clustered nodes together to encourage downstream peephole |
| 590 | // optimizations which may reduce resource requirements. |
| 591 | // |
| 592 | // This is a best effort to set things up for a post-RA pass. Optimizations |
| 593 | // like generating loads of multiple registers should ideally be done within |
| 594 | // the scheduler pass by combining the loads during DAG postprocessing. |
| 595 | const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; |
| 596 | const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; |
| 597 | if (tryGreater(TryVal: TryCandCluster && TryCandCluster->contains(Ptr: TryCand.SU), |
| 598 | CandVal: CandCluster && CandCluster->contains(Ptr: Cand.SU), TryCand, Cand, |
| 599 | Reason: Cluster)) |
| 600 | return TryCand.Reason != NoCand; |
| 601 | |
| 602 | // Avoid increasing the max critical pressure in the scheduled region. |
| 603 | if (DAG->isTrackingPressure() && |
| 604 | tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax, |
| 605 | TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF)) |
| 606 | return TryCand.Reason != NoCand; |
| 607 | |
| 608 | // Avoid increasing the max pressure of the entire region. |
| 609 | if (DAG->isTrackingPressure() && |
| 610 | tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand, |
| 611 | Cand, Reason: RegMax, TRI, MF: DAG->MF)) |
| 612 | return TryCand.Reason != NoCand; |
| 613 | |
| 614 | if (SameBoundary) { |
| 615 | // Fall through to original instruction order. |
| 616 | if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) || |
| 617 | (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { |
| 618 | TryCand.Reason = NodeOrder; |
| 619 | return true; |
| 620 | } |
| 621 | } |
| 622 | return false; |
| 623 | } |
| 624 | |
| 625 | GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy( |
| 626 | const MachineSchedContext *C) |
| 627 | : GCNSchedStrategy(C) { |
| 628 | SchedStages.push_back(Elt: GCNSchedStageID::MemoryClauseInitialSchedule); |
| 629 | } |
| 630 | |
/// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory
/// instructions as much as possible. This is achieved by:
///  1. Prioritizing clustered operations before the stall latency heuristic.
///  2. Prioritizing long-latency loads before the stall latency heuristic.
///
| 636 | /// \param Cand provides the policy and current best candidate. |
| 637 | /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. |
| 638 | /// \param Zone describes the scheduled zone that we are extending, or nullptr |
| 639 | /// if Cand is from a different zone than TryCand. |
| 640 | /// \return \c true if TryCand is better than Cand (Reason is NOT NoCand) |
| 641 | bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand, |
| 642 | SchedCandidate &TryCand, |
| 643 | SchedBoundary *Zone) const { |
| 644 | // Initialize the candidate if needed. |
| 645 | if (!Cand.isValid()) { |
| 646 | TryCand.Reason = NodeOrder; |
| 647 | return true; |
| 648 | } |
| 649 | |
  // Bias physreg defs and copies toward their uses and definitions,
  // respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
    return TryCand.Reason != NoCand;
| 654 | |
| 655 | if (DAG->isTrackingPressure()) { |
| 656 | // Avoid exceeding the target's limit. |
| 657 | if (tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand, |
| 658 | Reason: RegExcess, TRI, MF: DAG->MF)) |
| 659 | return TryCand.Reason != NoCand; |
| 660 | |
| 661 | // Avoid increasing the max critical pressure in the scheduled region. |
| 662 | if (tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax, |
| 663 | TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF)) |
| 664 | return TryCand.Reason != NoCand; |
| 665 | } |
| 666 | |
| 667 | // MaxMemoryClause-specific: We prioritize clustered instructions as we would |
| 668 | // get more benefit from clausing these memory instructions. |
| 669 | const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster; |
| 670 | const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster; |
| 671 | if (tryGreater(TryVal: TryCandCluster && TryCandCluster->contains(Ptr: TryCand.SU), |
| 672 | CandVal: CandCluster && CandCluster->contains(Ptr: Cand.SU), TryCand, Cand, |
| 673 | Reason: Cluster)) |
| 674 | return TryCand.Reason != NoCand; |
| 675 | |
| 676 | // We only compare a subset of features when comparing nodes between |
| 677 | // Top and Bottom boundary. Some properties are simply incomparable, in many |
| 678 | // other instances we should only override the other boundary if something |
| 679 | // is a clear good pick on one boundary. Skip heuristics that are more |
| 680 | // "tie-breaking" in nature. |
| 681 | bool SameBoundary = Zone != nullptr; |
| 682 | if (SameBoundary) { |
    // For loops that are acyclic path limited, aggressively schedule for
    // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
    // heuristics to take precedence.
| 686 | if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() && |
| 687 | tryLatency(TryCand, Cand, Zone&: *Zone)) |
| 688 | return TryCand.Reason != NoCand; |
| 689 | |
    // MaxMemoryClause-specific: Prioritize long-latency memory load
    // instructions in top-bottom order to hide more latency. The mayLoad check
    // is used to exclude store-like instructions, which we do not want to
    // schedule too early.
| 694 | bool TryMayLoad = |
| 695 | TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad(); |
| 696 | bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad(); |
| 697 | |
| 698 | if (TryMayLoad || CandMayLoad) { |
| 699 | bool TryLongLatency = |
| 700 | TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad; |
| 701 | bool CandLongLatency = |
| 702 | 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad; |
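      // Illustrative numbers: a load modeled with Latency 100 compared against
      // an ALU op with Latency 4 satisfies 100 > 10 * 4, so the load is
      // treated as the long-latency candidate for its boundary.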
| 703 | |
| 704 | if (tryGreater(TryVal: Zone->isTop() ? TryLongLatency : CandLongLatency, |
| 705 | CandVal: Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand, |
| 706 | Cand, Reason: Stall)) |
| 707 | return TryCand.Reason != NoCand; |
| 708 | } |
| 709 | // Prioritize instructions that read unbuffered resources by stall cycles. |
| 710 | if (tryLess(TryVal: Zone->getLatencyStallCycles(SU: TryCand.SU), |
| 711 | CandVal: Zone->getLatencyStallCycles(SU: Cand.SU), TryCand, Cand, Reason: Stall)) |
| 712 | return TryCand.Reason != NoCand; |
| 713 | } |
| 714 | |
| 715 | if (SameBoundary) { |
| 716 | // Weak edges are for clustering and other constraints. |
| 717 | if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop), |
| 718 | CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak)) |
| 719 | return TryCand.Reason != NoCand; |
| 720 | } |
| 721 | |
| 722 | // Avoid increasing the max pressure of the entire region. |
| 723 | if (DAG->isTrackingPressure() && |
| 724 | tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand, |
| 725 | Cand, Reason: RegMax, TRI, MF: DAG->MF)) |
| 726 | return TryCand.Reason != NoCand; |
| 727 | |
| 728 | if (SameBoundary) { |
| 729 | // Avoid critical resource consumption and balance the schedule. |
| 730 | TryCand.initResourceDelta(DAG, SchedModel); |
| 731 | if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources, |
| 732 | TryCand, Cand, Reason: ResourceReduce)) |
| 733 | return TryCand.Reason != NoCand; |
| 734 | if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources, |
| 735 | CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand, |
| 736 | Reason: ResourceDemand)) |
| 737 | return TryCand.Reason != NoCand; |
| 738 | |
| 739 | // Avoid serializing long latency dependence chains. |
| 740 | // For acyclic path limited loops, latency was already checked above. |
| 741 | if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency && |
| 742 | !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone&: *Zone)) |
| 743 | return TryCand.Reason != NoCand; |
| 744 | |
| 745 | // Fall through to original instruction order. |
| 746 | if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) { |
| 747 | assert(TryCand.SU->NodeNum != Cand.SU->NodeNum); |
| 748 | TryCand.Reason = NodeOrder; |
| 749 | return true; |
| 750 | } |
| 751 | } |
| 752 | |
| 753 | return false; |
| 754 | } |
| 755 | |
| 756 | GCNScheduleDAGMILive::GCNScheduleDAGMILive( |
| 757 | MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S) |
| 758 | : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()), |
| 759 | MFI(*MF.getInfo<SIMachineFunctionInfo>()), |
| 760 | StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy), |
| 761 | RegionLiveOuts(this, /*IsLiveOut=*/true) { |
| 762 | |
| 763 | // We want regions with a single MI to be scheduled so that we can reason |
| 764 | // about them correctly during scheduling stages that move MIs between regions |
| 765 | // (e.g., rematerialization). |
| 766 | ScheduleSingleMIRegions = true; |
| 767 | LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n" ); |
| 768 | if (RelaxedOcc) { |
| 769 | MinOccupancy = std::min(a: MFI.getMinAllowedOccupancy(), b: StartingOccupancy); |
| 770 | if (MinOccupancy != StartingOccupancy) |
| 771 | LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy |
| 772 | << ".\n" ); |
| 773 | } |
| 774 | } |
| 775 | |
| 776 | std::unique_ptr<GCNSchedStage> |
| 777 | GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) { |
| 778 | switch (SchedStageID) { |
| 779 | case GCNSchedStageID::OccInitialSchedule: |
| 780 | return std::make_unique<OccInitialScheduleStage>(args&: SchedStageID, args&: *this); |
| 781 | case GCNSchedStageID::UnclusteredHighRPReschedule: |
| 782 | return std::make_unique<UnclusteredHighRPStage>(args&: SchedStageID, args&: *this); |
| 783 | case GCNSchedStageID::ClusteredLowOccupancyReschedule: |
| 784 | return std::make_unique<ClusteredLowOccStage>(args&: SchedStageID, args&: *this); |
| 785 | case GCNSchedStageID::PreRARematerialize: |
| 786 | return std::make_unique<PreRARematStage>(args&: SchedStageID, args&: *this); |
| 787 | case GCNSchedStageID::ILPInitialSchedule: |
| 788 | return std::make_unique<ILPInitialScheduleStage>(args&: SchedStageID, args&: *this); |
| 789 | case GCNSchedStageID::MemoryClauseInitialSchedule: |
| 790 | return std::make_unique<MemoryClauseInitialScheduleStage>(args&: SchedStageID, |
| 791 | args&: *this); |
| 792 | } |
| 793 | |
| 794 | llvm_unreachable("Unknown SchedStageID." ); |
| 795 | } |
| 796 | |
| 797 | void GCNScheduleDAGMILive::schedule() { |
| 798 | // Collect all scheduling regions. The actual scheduling is performed in |
| 799 | // GCNScheduleDAGMILive::finalizeSchedule. |
| 800 | Regions.push_back(Elt: std::pair(RegionBegin, RegionEnd)); |
| 801 | } |
| 802 | |
| 803 | GCNRegPressure |
| 804 | GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { |
| 805 | GCNDownwardRPTracker RPTracker(*LIS); |
| 806 | RPTracker.advance(Begin: begin(), End: end(), LiveRegsCopy: &LiveIns[RegionIdx]); |
| 807 | return RPTracker.moveMaxPressure(); |
| 808 | } |
| 809 | |
| 810 | static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin, |
| 811 | MachineBasicBlock::iterator RegionEnd) { |
| 812 | auto REnd = RegionEnd == RegionBegin->getParent()->end() |
| 813 | ? std::prev(x: RegionEnd) |
| 814 | : RegionEnd; |
| 815 | return &*skipDebugInstructionsBackward(It: REnd, Begin: RegionBegin); |
| 816 | } |
| 817 | |
| 818 | void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, |
| 819 | const MachineBasicBlock *MBB) { |
| 820 | GCNDownwardRPTracker RPTracker(*LIS); |
| 821 | |
  // If the block has a single successor then the live-ins of that successor
  // are the live-outs of the current block. We can reuse the calculated live
  // set if the successor will be sent to scheduling past the current block.

  // However, due to a bug in LiveInterval analysis it may happen that two
  // predecessors of the same successor block have different lane bitmasks for
  // a live-out register. Work around that by sticking to a one-to-one
  // relationship, i.e. one predecessor with one successor block.
| 830 | const MachineBasicBlock *OnlySucc = nullptr; |
| 831 | if (MBB->succ_size() == 1) { |
| 832 | auto *Candidate = *MBB->succ_begin(); |
| 833 | if (!Candidate->empty() && Candidate->pred_size() == 1) { |
| 834 | SlotIndexes *Ind = LIS->getSlotIndexes(); |
| 835 | if (Ind->getMBBStartIdx(mbb: MBB) < Ind->getMBBStartIdx(mbb: Candidate)) |
| 836 | OnlySucc = Candidate; |
| 837 | } |
| 838 | } |
| 839 | |
| 840 | // Scheduler sends regions from the end of the block upwards. |
| 841 | size_t CurRegion = RegionIdx; |
| 842 | for (size_t E = Regions.size(); CurRegion != E; ++CurRegion) |
| 843 | if (Regions[CurRegion].first->getParent() != MBB) |
| 844 | break; |
| 845 | --CurRegion; |
| 846 | |
| 847 | auto I = MBB->begin(); |
| 848 | auto LiveInIt = MBBLiveIns.find(Val: MBB); |
| 849 | auto &Rgn = Regions[CurRegion]; |
| 850 | auto *NonDbgMI = &*skipDebugInstructionsForward(It: Rgn.first, End: Rgn.second); |
| 851 | if (LiveInIt != MBBLiveIns.end()) { |
| 852 | auto LiveIn = std::move(LiveInIt->second); |
| 853 | RPTracker.reset(MI: *MBB->begin(), LiveRegs: &LiveIn); |
| 854 | MBBLiveIns.erase(I: LiveInIt); |
| 855 | } else { |
| 856 | I = Rgn.first; |
| 857 | auto LRS = BBLiveInMap.lookup(Val: NonDbgMI); |
| 858 | #ifdef EXPENSIVE_CHECKS |
| 859 | assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS)); |
| 860 | #endif |
| 861 | RPTracker.reset(MI: *I, LiveRegs: &LRS); |
| 862 | } |
| 863 | |
| 864 | for (;;) { |
| 865 | I = RPTracker.getNext(); |
| 866 | |
| 867 | if (Regions[CurRegion].first == I || NonDbgMI == I) { |
| 868 | LiveIns[CurRegion] = RPTracker.getLiveRegs(); |
| 869 | RPTracker.clearMaxPressure(); |
| 870 | } |
| 871 | |
| 872 | if (Regions[CurRegion].second == I) { |
| 873 | Pressure[CurRegion] = RPTracker.moveMaxPressure(); |
| 874 | if (CurRegion-- == RegionIdx) |
| 875 | break; |
| 876 | auto &Rgn = Regions[CurRegion]; |
| 877 | NonDbgMI = &*skipDebugInstructionsForward(It: Rgn.first, End: Rgn.second); |
| 878 | } |
| 879 | RPTracker.advanceToNext(); |
| 880 | RPTracker.advanceBeforeNext(); |
| 881 | } |
| 882 | |
| 883 | if (OnlySucc) { |
| 884 | if (I != MBB->end()) { |
| 885 | RPTracker.advanceToNext(); |
| 886 | RPTracker.advance(End: MBB->end()); |
| 887 | } |
| 888 | RPTracker.advanceBeforeNext(); |
| 889 | MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs(); |
| 890 | } |
| 891 | } |
| 892 | |
| 893 | DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> |
| 894 | GCNScheduleDAGMILive::getRegionLiveInMap() const { |
| 895 | assert(!Regions.empty()); |
| 896 | std::vector<MachineInstr *> RegionFirstMIs; |
| 897 | RegionFirstMIs.reserve(n: Regions.size()); |
| 898 | auto I = Regions.rbegin(), E = Regions.rend(); |
| 899 | do { |
| 900 | const MachineBasicBlock *MBB = I->first->getParent(); |
| 901 | auto *MI = &*skipDebugInstructionsForward(It: I->first, End: I->second); |
| 902 | RegionFirstMIs.push_back(x: MI); |
| 903 | do { |
| 904 | ++I; |
| 905 | } while (I != E && I->first->getParent() == MBB); |
| 906 | } while (I != E); |
| 907 | return getLiveRegMap(R&: RegionFirstMIs, /*After=*/false, LIS&: *LIS); |
| 908 | } |
| 909 | |
| 910 | DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> |
| 911 | GCNScheduleDAGMILive::getRegionLiveOutMap() const { |
| 912 | assert(!Regions.empty()); |
| 913 | std::vector<MachineInstr *> RegionLastMIs; |
| 914 | RegionLastMIs.reserve(n: Regions.size()); |
| 915 | for (auto &[RegionBegin, RegionEnd] : reverse(C: Regions)) |
| 916 | RegionLastMIs.push_back(x: getLastMIForRegion(RegionBegin, RegionEnd)); |
| 917 | |
| 918 | return getLiveRegMap(R&: RegionLastMIs, /*After=*/true, LIS&: *LIS); |
| 919 | } |
| 920 | |
| 921 | void RegionPressureMap::buildLiveRegMap() { |
| 922 | IdxToInstruction.clear(); |
| 923 | |
| 924 | RegionLiveRegMap = |
| 925 | IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap(); |
| 926 | for (unsigned I = 0; I < DAG->Regions.size(); I++) { |
| 927 | MachineInstr *RegionKey = |
| 928 | IsLiveOut |
| 929 | ? getLastMIForRegion(RegionBegin: DAG->Regions[I].first, RegionEnd: DAG->Regions[I].second) |
| 930 | : &*DAG->Regions[I].first; |
| 931 | IdxToInstruction[I] = RegionKey; |
| 932 | } |
| 933 | } |
| 934 | |
| 935 | void GCNScheduleDAGMILive::finalizeSchedule() { |
| 936 | // Start actual scheduling here. This function is called by the base |
| 937 | // MachineScheduler after all regions have been recorded by |
| 938 | // GCNScheduleDAGMILive::schedule(). |
| 939 | LiveIns.resize(N: Regions.size()); |
| 940 | Pressure.resize(N: Regions.size()); |
| 941 | RegionsWithHighRP.resize(N: Regions.size()); |
| 942 | RegionsWithExcessRP.resize(N: Regions.size()); |
| 943 | RegionsWithMinOcc.resize(N: Regions.size()); |
| 944 | RegionsWithIGLPInstrs.resize(N: Regions.size()); |
| 945 | RegionsWithHighRP.reset(); |
| 946 | RegionsWithExcessRP.reset(); |
| 947 | RegionsWithMinOcc.reset(); |
| 948 | RegionsWithIGLPInstrs.reset(); |
| 949 | |
| 950 | runSchedStages(); |
| 951 | } |
| 952 | |
| 953 | void GCNScheduleDAGMILive::runSchedStages() { |
| 954 | LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n" ); |
| 955 | |
| 956 | if (!Regions.empty()) { |
| 957 | BBLiveInMap = getRegionLiveInMap(); |
| 958 | if (GCNTrackers) |
| 959 | RegionLiveOuts.buildLiveRegMap(); |
| 960 | } |
| 961 | |
| 962 | GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl); |
| 963 | while (S.advanceStage()) { |
| 964 | auto Stage = createSchedStage(SchedStageID: S.getCurrentStage()); |
| 965 | if (!Stage->initGCNSchedStage()) |
| 966 | continue; |
| 967 | |
| 968 | for (auto Region : Regions) { |
| 969 | RegionBegin = Region.first; |
| 970 | RegionEnd = Region.second; |
| 971 | // Setup for scheduling the region and check whether it should be skipped. |
| 972 | if (!Stage->initGCNRegion()) { |
| 973 | Stage->advanceRegion(); |
| 974 | exitRegion(); |
| 975 | continue; |
| 976 | } |
| 977 | |
| 978 | if (GCNTrackers) { |
| 979 | GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker(); |
| 980 | GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker(); |
| 981 | GCNRPTracker::LiveRegSet *RegionLiveIns = |
| 982 | &LiveIns[Stage->getRegionIdx()]; |
| 983 | |
| 984 | reinterpret_cast<GCNRPTracker *>(DownwardTracker) |
| 985 | ->reset(MRI_: MRI, LiveRegs_: *RegionLiveIns); |
| 986 | reinterpret_cast<GCNRPTracker *>(UpwardTracker) |
| 987 | ->reset(MRI_: MRI, LiveRegs_: RegionLiveOuts.getLiveRegsForRegionIdx( |
| 988 | RegionIdx: Stage->getRegionIdx())); |
| 989 | } |
| 990 | |
| 991 | ScheduleDAGMILive::schedule(); |
| 992 | Stage->finalizeGCNRegion(); |
| 993 | } |
| 994 | |
| 995 | Stage->finalizeGCNSchedStage(); |
| 996 | } |
| 997 | } |
| 998 | |
| 999 | #ifndef NDEBUG |
| 1000 | raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) { |
| 1001 | switch (StageID) { |
| 1002 | case GCNSchedStageID::OccInitialSchedule: |
| 1003 | OS << "Max Occupancy Initial Schedule" ; |
| 1004 | break; |
| 1005 | case GCNSchedStageID::UnclusteredHighRPReschedule: |
| 1006 | OS << "Unclustered High Register Pressure Reschedule" ; |
| 1007 | break; |
| 1008 | case GCNSchedStageID::ClusteredLowOccupancyReschedule: |
| 1009 | OS << "Clustered Low Occupancy Reschedule" ; |
| 1010 | break; |
| 1011 | case GCNSchedStageID::PreRARematerialize: |
| 1012 | OS << "Pre-RA Rematerialize" ; |
| 1013 | break; |
| 1014 | case GCNSchedStageID::ILPInitialSchedule: |
| 1015 | OS << "Max ILP Initial Schedule" ; |
| 1016 | break; |
| 1017 | case GCNSchedStageID::MemoryClauseInitialSchedule: |
| 1018 | OS << "Max memory clause Initial Schedule" ; |
| 1019 | break; |
| 1020 | } |
| 1021 | |
| 1022 | return OS; |
| 1023 | } |
| 1024 | #endif |
| 1025 | |
| 1026 | GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) |
| 1027 | : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF), |
| 1028 | MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {} |
| 1029 | |
| 1030 | bool GCNSchedStage::initGCNSchedStage() { |
| 1031 | if (!DAG.LIS) |
| 1032 | return false; |
| 1033 | |
| 1034 | LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n" ); |
| 1035 | return true; |
| 1036 | } |
| 1037 | |
| 1038 | bool UnclusteredHighRPStage::initGCNSchedStage() { |
| 1039 | if (DisableUnclusterHighRP) |
| 1040 | return false; |
| 1041 | |
| 1042 | if (!GCNSchedStage::initGCNSchedStage()) |
| 1043 | return false; |
| 1044 | |
| 1045 | if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none()) |
| 1046 | return false; |
| 1047 | |
| 1048 | SavedMutations.swap(x&: DAG.Mutations); |
| 1049 | DAG.addMutation( |
| 1050 | Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PreRAReentry)); |
| 1051 | |
| 1052 | InitialOccupancy = DAG.MinOccupancy; |
  // Aggressively try to reduce register pressure in the unclustered high RP
  // stage. Temporarily increase the occupancy target in the region.
| 1055 | S.SGPRLimitBias = S.HighRPSGPRBias; |
| 1056 | S.VGPRLimitBias = S.HighRPVGPRBias; |
| 1057 | if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) |
| 1058 | MFI.increaseOccupancy(MF, Limit: ++DAG.MinOccupancy); |
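  // Raising the occupancy target here (say from 4 to 5 waves) makes the
  // per-wave SGPR/VGPR limits computed in GCNSchedStrategy::initialize()
  // tighter for the regions revisited in this stage, and the limit biases set
  // above are additionally subtracted from those limits there.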
| 1059 | |
  LLVM_DEBUG(
      dbgs()
      << "Retrying function scheduling without clustering. "
         "Aggressively try to reduce register pressure to achieve occupancy "
      << DAG.MinOccupancy << ".\n");
| 1065 | |
| 1066 | return true; |
| 1067 | } |
| 1068 | |
| 1069 | bool ClusteredLowOccStage::initGCNSchedStage() { |
| 1070 | if (DisableClusteredLowOccupancy) |
| 1071 | return false; |
| 1072 | |
| 1073 | if (!GCNSchedStage::initGCNSchedStage()) |
| 1074 | return false; |
| 1075 | |
  // Don't bother trying to improve ILP in lower RP regions if occupancy has
  // not dropped. All regions will have already been scheduled with the ideal
  // occupancy targets.
| 1079 | if (DAG.StartingOccupancy <= DAG.MinOccupancy) |
| 1080 | return false; |
| 1081 | |
| 1082 | LLVM_DEBUG( |
| 1083 | dbgs() << "Retrying function scheduling with lowest recorded occupancy " |
| 1084 | << DAG.MinOccupancy << ".\n" ); |
| 1085 | return true; |
| 1086 | } |
| 1087 | |
| 1088 | /// Allows to easily filter for this stage's debug output. |
| 1089 | #define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;) |
| 1090 | |
| 1091 | bool PreRARematStage::initGCNSchedStage() { |
  // FIXME: This pass will invalidate the cached BBLiveInMap and MBBLiveIns for
  // regions in between the defs and the region we sank the def to. Will need
  // to be fixed if there is another pass after this pass.
| 1095 | assert(!S.hasNextStage()); |
| 1096 | |
| 1097 | if (!GCNSchedStage::initGCNSchedStage() || DAG.RegionsWithMinOcc.none() || |
| 1098 | DAG.Regions.size() == 1) |
| 1099 | return false; |
| 1100 | |
| 1101 | // Before performing any IR modification record the parent region of each MI |
| 1102 | // and the parent MBB of each region. |
| 1103 | const unsigned NumRegions = DAG.Regions.size(); |
| 1104 | RegionBB.reserve(N: NumRegions); |
| 1105 | for (unsigned I = 0; I < NumRegions; ++I) { |
| 1106 | RegionBoundaries Region = DAG.Regions[I]; |
| 1107 | for (auto MI = Region.first; MI != Region.second; ++MI) |
| 1108 | MIRegion.insert(KV: {&*MI, I}); |
| 1109 | RegionBB.push_back(Elt: Region.first->getParent()); |
| 1110 | } |
| 1111 | |
| 1112 | if (!canIncreaseOccupancyOrReduceSpill()) |
| 1113 | return false; |
| 1114 | |
| 1115 | // Rematerialize identified instructions and update scheduler's state. |
| 1116 | rematerialize(); |
| 1117 | if (GCNTrackers) |
| 1118 | DAG.RegionLiveOuts.buildLiveRegMap(); |
| 1119 | REMAT_DEBUG( |
| 1120 | dbgs() << "Retrying function scheduling with new min. occupancy of " |
| 1121 | << AchievedOcc << " from rematerializing (original was " |
| 1122 | << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n" ); |
| 1123 | if (AchievedOcc > DAG.MinOccupancy) { |
| 1124 | DAG.MinOccupancy = AchievedOcc; |
| 1125 | SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); |
| 1126 | MFI.increaseOccupancy(MF, Limit: DAG.MinOccupancy); |
| 1127 | } |
| 1128 | return true; |
| 1129 | } |
| 1130 | |
| 1131 | void GCNSchedStage::finalizeGCNSchedStage() { |
| 1132 | DAG.finishBlock(); |
| 1133 | LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n" ); |
| 1134 | } |
| 1135 | |
| 1136 | void UnclusteredHighRPStage::finalizeGCNSchedStage() { |
| 1137 | SavedMutations.swap(x&: DAG.Mutations); |
| 1138 | S.SGPRLimitBias = S.VGPRLimitBias = 0; |
| 1139 | if (DAG.MinOccupancy > InitialOccupancy) { |
| 1140 | for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX) |
| 1141 | DAG.RegionsWithMinOcc[IDX] = |
| 1142 | DAG.Pressure[IDX].getOccupancy( |
| 1143 | ST: DAG.ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()) == DAG.MinOccupancy; |
| 1144 | |
| 1145 | LLVM_DEBUG(dbgs() << StageID |
| 1146 | << " stage successfully increased occupancy to " |
| 1147 | << DAG.MinOccupancy << '\n'); |
| 1148 | } |
| 1149 | |
| 1150 | GCNSchedStage::finalizeGCNSchedStage(); |
| 1151 | } |
| 1152 | |
| 1153 | bool GCNSchedStage::initGCNRegion() { |
| 1154 | // Check whether this new region is also a new block. |
| 1155 | if (DAG.RegionBegin->getParent() != CurrentMBB) |
| 1156 | setupNewBlock(); |
| 1157 | |
| 1158 | unsigned NumRegionInstrs = std::distance(first: DAG.begin(), last: DAG.end()); |
| 1159 | DAG.enterRegion(bb: CurrentMBB, begin: DAG.begin(), end: DAG.end(), regioninstrs: NumRegionInstrs); |
| 1160 | |
| 1161 | // Skip empty scheduling regions (0 or 1 schedulable instructions). |
| 1162 | if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(x: DAG.end())) |
| 1163 | return false; |
| 1164 | |
| 1165 | LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n" ); |
| 1166 | LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB) |
| 1167 | << " " << CurrentMBB->getName() |
| 1168 | << "\n From: " << *DAG.begin() << " To: " ; |
| 1169 | if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd; |
| 1170 | else dbgs() << "End" ; |
| 1171 | dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); |
| 1172 | |
| 1173 | // Save original instruction order before scheduling for possible revert. |
| 1174 | Unsched.clear(); |
| 1175 | Unsched.reserve(n: DAG.NumRegionInstrs); |
| 1176 | if (StageID == GCNSchedStageID::OccInitialSchedule || |
| 1177 | StageID == GCNSchedStageID::ILPInitialSchedule) { |
| 1178 | const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII); |
| 1179 | for (auto &I : DAG) { |
| 1180 | Unsched.push_back(x: &I); |
| 1181 | if (SII->isIGLPMutationOnly(Opcode: I.getOpcode())) |
| 1182 | DAG.RegionsWithIGLPInstrs[RegionIdx] = true; |
| 1183 | } |
| 1184 | } else { |
| 1185 | for (auto &I : DAG) |
| 1186 | Unsched.push_back(x: &I); |
| 1187 | } |
| 1188 | |
| 1189 | PressureBefore = DAG.Pressure[RegionIdx]; |
| 1190 | |
| 1191 | LLVM_DEBUG( |
| 1192 | dbgs() << "Pressure before scheduling:\nRegion live-ins:" |
| 1193 | << print(DAG.LiveIns[RegionIdx], DAG.MRI) |
| 1194 | << "Region live-in pressure: " |
| 1195 | << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx])) |
| 1196 | << "Region register pressure: " << print(PressureBefore)); |
| 1197 | |
| 1198 | S.HasHighPressure = false; |
| 1199 | S.KnownExcessRP = isRegionWithExcessRP(); |
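  // Note: KnownExcessRP steers GCNSchedStrategy::initialize() toward its
  // alternative VGPRCriticalLimit computation (which typically yields a
  // smaller limit on targets with many VGPRs) for this region.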
| 1200 | |
| 1201 | if (DAG.RegionsWithIGLPInstrs[RegionIdx] && |
| 1202 | StageID != GCNSchedStageID::UnclusteredHighRPReschedule) { |
| 1203 | SavedMutations.clear(); |
| 1204 | SavedMutations.swap(x&: DAG.Mutations); |
| 1205 | bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule || |
| 1206 | StageID == GCNSchedStageID::ILPInitialSchedule; |
| 1207 | DAG.addMutation(Mutation: createIGroupLPDAGMutation( |
| 1208 | Phase: IsInitialStage ? AMDGPU::SchedulingPhase::Initial |
| 1209 | : AMDGPU::SchedulingPhase::PreRAReentry)); |
| 1210 | } |
| 1211 | |
| 1212 | return true; |
| 1213 | } |
| 1214 | |
| 1215 | bool UnclusteredHighRPStage::initGCNRegion() { |
| 1216 | // Only reschedule regions with the minimum occupancy or regions that may have |
| 1217 | // spilling (excess register pressure). |
| 1218 | if ((!DAG.RegionsWithMinOcc[RegionIdx] || |
| 1219 | DAG.MinOccupancy <= InitialOccupancy) && |
| 1220 | !DAG.RegionsWithExcessRP[RegionIdx]) |
| 1221 | return false; |
| 1222 | |
| 1223 | return GCNSchedStage::initGCNRegion(); |
| 1224 | } |
| 1225 | |
| 1226 | bool ClusteredLowOccStage::initGCNRegion() { |
| 1227 | // We may need to reschedule this region if it wasn't rescheduled in the last |
| 1228 | // stage, or if we found it was testing critical register pressure limits in |
| 1229 | // the unclustered reschedule stage. The latter is because we may not have been |
| 1230 | // able to raise the min occupancy in the previous stage so the region may be |
| 1231 | // overly constrained even if it was already rescheduled. |
| 1232 | if (!DAG.RegionsWithHighRP[RegionIdx]) |
| 1233 | return false; |
| 1234 | |
| 1235 | return GCNSchedStage::initGCNRegion(); |
| 1236 | } |
| 1237 | |
| 1238 | bool PreRARematStage::initGCNRegion() { |
| 1239 | return RescheduleRegions[RegionIdx] && GCNSchedStage::initGCNRegion(); |
| 1240 | } |
| 1241 | |
| 1242 | void GCNSchedStage::setupNewBlock() { |
| 1243 | if (CurrentMBB) |
| 1244 | DAG.finishBlock(); |
| 1245 | |
| 1246 | CurrentMBB = DAG.RegionBegin->getParent(); |
| 1247 | DAG.startBlock(bb: CurrentMBB); |
| 1248 | // Get the real RP for the region if it hasn't been calculated before. After |
| 1249 | // the initial schedule stage, real RP will be collected after scheduling. |
| 1250 | if (StageID == GCNSchedStageID::OccInitialSchedule || |
| 1251 | StageID == GCNSchedStageID::ILPInitialSchedule || |
| 1252 | StageID == GCNSchedStageID::MemoryClauseInitialSchedule) |
| 1253 | DAG.computeBlockPressure(RegionIdx, MBB: CurrentMBB); |
| 1254 | } |
| 1255 | |
| 1256 | void GCNSchedStage::finalizeGCNRegion() { |
| 1257 | DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); |
| 1258 | if (S.HasHighPressure) |
| 1259 | DAG.RegionsWithHighRP[RegionIdx] = true; |
| 1260 | |
| 1261 | // Revert scheduling if we have dropped occupancy or there is some other |
| 1262 | // reason that the original schedule is better. |
| 1263 | checkScheduling(); |
| 1264 | |
| 1265 | if (DAG.RegionsWithIGLPInstrs[RegionIdx] && |
| 1266 | StageID != GCNSchedStageID::UnclusteredHighRPReschedule) |
| 1267 | SavedMutations.swap(x&: DAG.Mutations); |
| 1268 | |
| 1269 | DAG.exitRegion(); |
| 1270 | advanceRegion(); |
| 1271 | } |
| 1272 | |
| 1273 | void GCNSchedStage::checkScheduling() { |
| 1274 | // Check the results of scheduling. |
| 1275 | PressureAfter = DAG.getRealRegPressure(RegionIdx); |
| 1276 | |
| 1277 | LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter)); |
| 1278 | LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n" ); |
| 1279 | |
| 1280 | unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize(); |
| 1281 | |
| 1282 | if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && |
| 1283 | PressureAfter.getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) { |
| 1284 | DAG.Pressure[RegionIdx] = PressureAfter; |
| 1285 | DAG.RegionsWithMinOcc[RegionIdx] = |
| 1286 | PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) == |
| 1287 | DAG.MinOccupancy; |
| 1288 | |
| 1289 | // Early out if we have achieved the occupancy target. |
| 1290 | LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n" ); |
| 1291 | return; |
| 1292 | } |
| 1293 | |
| 1294 | unsigned TargetOccupancy = std::min( |
| 1295 | a: S.getTargetOccupancy(), b: ST.getOccupancyWithWorkGroupSizes(MF).second); |
| 1296 | unsigned WavesAfter = std::min( |
| 1297 | a: TargetOccupancy, b: PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize)); |
| 1298 | unsigned WavesBefore = std::min( |
| 1299 | a: TargetOccupancy, b: PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize)); |
| 1300 | LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore |
| 1301 | << ", after " << WavesAfter << ".\n" ); |
| 1302 | |
| 1303 | // We may not be able to keep the current target occupancy because of the just |
| 1304 | // scheduled region. We might still be able to revert scheduling if the |
| 1305 | // occupancy before was higher, or if the current schedule has register |
| 1306 | // pressure higher than the excess limits which could lead to more spilling. |
| 1307 | unsigned NewOccupancy = std::max(a: WavesAfter, b: WavesBefore); |
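|  | // For example (hypothetical numbers): if the region supported 8 waves before |
|  | // scheduling but only 7 after, NewOccupancy starts out as 8; the drop to 7 is |
|  | // only accepted below when 7 is below the current DAG.MinOccupancy yet still |
|  | // at or above MFI.getMinAllowedOccupancy() (the memory-bound relaxation). |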
| 1308 | |
| 1309 | // Allow memory bound functions to drop to 4 waves if not limited by an |
| 1310 | // attribute. |
| 1311 | if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy && |
| 1312 | WavesAfter >= MFI.getMinAllowedOccupancy()) { |
| 1313 | LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to " |
| 1314 | << MFI.getMinAllowedOccupancy() << " waves\n" ); |
| 1315 | NewOccupancy = WavesAfter; |
| 1316 | } |
| 1317 | |
| 1318 | if (NewOccupancy < DAG.MinOccupancy) { |
| 1319 | DAG.MinOccupancy = NewOccupancy; |
| 1320 | MFI.limitOccupancy(Limit: DAG.MinOccupancy); |
| 1321 | DAG.RegionsWithMinOcc.reset(); |
| 1322 | LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " |
| 1323 | << DAG.MinOccupancy << ".\n" ); |
| 1324 | } |
| 1325 | // The maximum number of arch VGPRs on a non-unified register file, or the |
| 1326 | // maximum VGPR + AGPR count in the unified register file case. |
| 1327 | unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); |
| 1328 | // The maximum number of arch VGPRs for both unified and non-unified register |
| 1329 | // files. |
| 1330 | unsigned MaxArchVGPRs = std::min(a: MaxVGPRs, b: ST.getAddressableNumArchVGPRs()); |
| 1331 | unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); |
| 1332 | |
| 1333 | if (PressureAfter.getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts()) > MaxVGPRs || |
| 1334 | PressureAfter.getArchVGPRNum() > MaxArchVGPRs || |
| 1335 | PressureAfter.getAGPRNum() > MaxArchVGPRs || |
| 1336 | PressureAfter.getSGPRNum() > MaxSGPRs) { |
| 1337 | DAG.RegionsWithHighRP[RegionIdx] = true; |
| 1338 | DAG.RegionsWithExcessRP[RegionIdx] = true; |
| 1339 | } |
| 1340 | |
| 1341 | // Revert if this region's schedule would cause a drop in occupancy or |
| 1342 | // spilling. |
| 1343 | if (shouldRevertScheduling(WavesAfter)) { |
| 1344 | revertScheduling(); |
| 1345 | } else { |
| 1346 | DAG.Pressure[RegionIdx] = PressureAfter; |
| 1347 | DAG.RegionsWithMinOcc[RegionIdx] = |
| 1348 | PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) == |
| 1349 | DAG.MinOccupancy; |
| 1350 | } |
| 1351 | } |
| 1352 | |
| 1353 | unsigned |
| 1354 | GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle, |
| 1355 | DenseMap<unsigned, unsigned> &ReadyCycles, |
| 1356 | const TargetSchedModel &SM) { |
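|  | // A unit's ready cycle is the earliest cycle at which all of its |
|  | // register-defining predecessors have produced their results. For example |
|  | // (hypothetical numbers), a def that becomes ready at cycle 2 and has a |
|  | // 4-cycle latency makes its user ready no earlier than cycle 6, even if the |
|  | // user could otherwise issue at CurrCycle. |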
| 1357 | unsigned ReadyCycle = CurrCycle; |
| 1358 | for (auto &D : SU.Preds) { |
| 1359 | if (D.isAssignedRegDep()) { |
| 1360 | MachineInstr *DefMI = D.getSUnit()->getInstr(); |
| 1361 | unsigned Latency = SM.computeInstrLatency(MI: DefMI); |
| 1362 | unsigned DefReady = ReadyCycles[DAG.getSUnit(MI: DefMI)->NodeNum]; |
| 1363 | ReadyCycle = std::max(a: ReadyCycle, b: DefReady + Latency); |
| 1364 | } |
| 1365 | } |
| 1366 | ReadyCycles[SU.NodeNum] = ReadyCycle; |
| 1367 | return ReadyCycle; |
| 1368 | } |
| 1369 | |
| 1370 | #ifndef NDEBUG |
| 1371 | struct EarlierIssuingCycle { |
| 1372 | bool operator()(std::pair<MachineInstr *, unsigned> A, |
| 1373 | std::pair<MachineInstr *, unsigned> B) const { |
| 1374 | return A.second < B.second; |
| 1375 | } |
| 1376 | }; |
| 1377 | |
| 1378 | static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>, |
| 1379 | EarlierIssuingCycle> &ReadyCycles) { |
| 1380 | if (ReadyCycles.empty()) |
| 1381 | return; |
| 1382 | unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber(); |
| 1383 | dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum |
| 1384 | << " ##################\n# Cycle #\t\t\tInstruction " |
| 1385 | " " |
| 1386 | " \n" ; |
| 1387 | unsigned IPrev = 1; |
| 1388 | for (auto &I : ReadyCycles) { |
| 1389 | if (I.second > IPrev + 1) |
| 1390 | dbgs() << "****************************** BUBBLE OF " << I.second - IPrev |
| 1391 | << " CYCLES DETECTED ******************************\n\n" ; |
| 1392 | dbgs() << "[ " << I.second << " ] : " << *I.first << "\n" ; |
| 1393 | IPrev = I.second; |
| 1394 | } |
| 1395 | } |
| 1396 | #endif |
| 1397 | |
| 1398 | ScheduleMetrics |
| 1399 | GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) { |
| 1400 | #ifndef NDEBUG |
| 1401 | std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> |
| 1402 | ReadyCyclesSorted; |
| 1403 | #endif |
| 1404 | const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); |
| 1405 | unsigned SumBubbles = 0; |
| 1406 | DenseMap<unsigned, unsigned> ReadyCycles; |
| 1407 | unsigned CurrCycle = 0; |
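|  | // The metric models issue bubbles: whenever a unit becomes ready later than |
|  | // the cycle at which we could have issued it, the gap is accumulated in |
|  | // SumBubbles. Illustration with purely hypothetical latencies: if three units |
|  | // become ready at cycles 0, 4 and 5, the bubbles are 0 + 3 + 0 = 3 over 6 |
|  | // issue cycles, giving a metric of (3 * 100) / 6 = 50 (lower is better). |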
| 1408 | for (auto &SU : InputSchedule) { |
| 1409 | unsigned ReadyCycle = |
| 1410 | computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM); |
| 1411 | SumBubbles += ReadyCycle - CurrCycle; |
| 1412 | #ifndef NDEBUG |
| 1413 | ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle)); |
| 1414 | #endif |
| 1415 | CurrCycle = ++ReadyCycle; |
| 1416 | } |
| 1417 | #ifndef NDEBUG |
| 1418 | LLVM_DEBUG( |
| 1419 | printScheduleModel(ReadyCyclesSorted); |
| 1420 | dbgs() << "\n\t" |
| 1421 | << "Metric: " |
| 1422 | << (SumBubbles |
| 1423 | ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle |
| 1424 | : 1) |
| 1425 | << "\n\n" ); |
| 1426 | #endif |
| 1427 | |
| 1428 | return ScheduleMetrics(CurrCycle, SumBubbles); |
| 1429 | } |
| 1430 | |
| 1431 | ScheduleMetrics |
| 1432 | GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) { |
| 1433 | #ifndef NDEBUG |
| 1434 | std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> |
| 1435 | ReadyCyclesSorted; |
| 1436 | #endif |
| 1437 | const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); |
| 1438 | unsigned SumBubbles = 0; |
| 1439 | DenseMap<unsigned, unsigned> ReadyCycles; |
| 1440 | unsigned CurrCycle = 0; |
| 1441 | for (auto &MI : DAG) { |
| 1442 | SUnit *SU = DAG.getSUnit(MI: &MI); |
| 1443 | if (!SU) |
| 1444 | continue; |
| 1445 | unsigned ReadyCycle = |
| 1446 | computeSUnitReadyCycle(SU: *SU, CurrCycle, ReadyCycles, SM); |
| 1447 | SumBubbles += ReadyCycle - CurrCycle; |
| 1448 | #ifndef NDEBUG |
| 1449 | ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle)); |
| 1450 | #endif |
| 1451 | CurrCycle = ++ReadyCycle; |
| 1452 | } |
| 1453 | #ifndef NDEBUG |
| 1454 | LLVM_DEBUG( |
| 1455 | printScheduleModel(ReadyCyclesSorted); |
| 1456 | dbgs() << "\n\t" |
| 1457 | << "Metric: " |
| 1458 | << (SumBubbles |
| 1459 | ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle |
| 1460 | : 1) |
| 1461 | << "\n\n" ); |
| 1462 | #endif |
| 1463 | |
| 1464 | return ScheduleMetrics(CurrCycle, SumBubbles); |
| 1465 | } |
| 1466 | |
| 1467 | bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { |
| 1468 | if (WavesAfter < DAG.MinOccupancy) |
| 1469 | return true; |
| 1470 | |
| 1471 | // For dynamic VGPR mode, we don't want to waste any VGPR blocks. |
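|  | // Rough illustration, assuming a hypothetical block size of 16 registers: |
|  | // going from 31 to 33 live VGPRs crosses a block boundary (2 -> 3 blocks), |
|  | // so the new schedule would be reverted even if occupancy is unchanged. |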
| 1472 | if (DAG.MFI.isDynamicVGPREnabled()) { |
| 1473 | unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( |
| 1474 | STI: &ST, NumVGPRs: DAG.MFI.getDynamicVGPRBlockSize(), |
| 1475 | DynamicVGPRBlockSize: PressureBefore.getVGPRNum(UnifiedVGPRFile: false)); |
| 1476 | unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( |
| 1477 | STI: &ST, NumVGPRs: DAG.MFI.getDynamicVGPRBlockSize(), |
| 1478 | DynamicVGPRBlockSize: PressureAfter.getVGPRNum(UnifiedVGPRFile: false)); |
| 1479 | if (BlocksAfter > BlocksBefore) |
| 1480 | return true; |
| 1481 | } |
| 1482 | |
| 1483 | return false; |
| 1484 | } |
| 1485 | |
| 1486 | bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { |
| 1487 | if (PressureAfter == PressureBefore) |
| 1488 | return false; |
| 1489 | |
| 1490 | if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) |
| 1491 | return true; |
| 1492 | |
| 1493 | if (mayCauseSpilling(WavesAfter)) |
| 1494 | return true; |
| 1495 | |
| 1496 | return false; |
| 1497 | } |
| 1498 | |
| 1499 | bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { |
| 1500 | // If RP is not reduced in the unclustered reschedule stage, revert to the |
| 1501 | // old schedule. |
| 1502 | if ((WavesAfter <= |
| 1503 | PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()) && |
| 1504 | mayCauseSpilling(WavesAfter)) || |
| 1505 | GCNSchedStage::shouldRevertScheduling(WavesAfter)) { |
| 1506 | LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n" ); |
| 1507 | return true; |
| 1508 | } |
| 1509 | |
| 1510 | // Do not attempt to relax the schedule even more if we are already spilling. |
| 1511 | if (isRegionWithExcessRP()) |
| 1512 | return false; |
| 1513 | |
| 1514 | LLVM_DEBUG( |
| 1515 | dbgs() |
| 1516 | << "\n\t *** In shouldRevertScheduling ***\n" |
| 1517 | << " *********** BEFORE UnclusteredHighRPStage ***********\n" ); |
| 1518 | ScheduleMetrics MBefore = getScheduleMetrics(InputSchedule: DAG.SUnits); |
| 1519 | LLVM_DEBUG( |
| 1520 | dbgs() |
| 1521 | << "\n *********** AFTER UnclusteredHighRPStage ***********\n" ); |
| 1522 | ScheduleMetrics MAfter = getScheduleMetrics(DAG); |
| 1523 | unsigned OldMetric = MBefore.getMetric(); |
| 1524 | unsigned NewMetric = MAfter.getMetric(); |
| 1525 | unsigned WavesBefore = std::min( |
| 1526 | a: S.getTargetOccupancy(), |
| 1527 | b: PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize())); |
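|  | // Heuristic sketch: the reschedule is kept only if the (biased) latency |
|  | // metric did not regress more than occupancy improved. With purely |
|  | // hypothetical values WavesBefore = WavesAfter = 4, OldMetric = 20, |
|  | // NewMetric = 40 and the default bias of 10, Profit evaluates to |
|  | // ((400 / 4) * (30 * 100) / 40) / 100 = 75 < 100, so we revert. |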
| 1528 | unsigned Profit = |
| 1529 | ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore * |
| 1530 | ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) / |
| 1531 | NewMetric) / |
| 1532 | ScheduleMetrics::ScaleFactor; |
| 1533 | LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after " |
| 1534 | << MAfter << "Profit: " << Profit << "\n" ); |
| 1535 | return Profit < ScheduleMetrics::ScaleFactor; |
| 1536 | } |
| 1537 | |
| 1538 | bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { |
| 1539 | if (PressureAfter == PressureBefore) |
| 1540 | return false; |
| 1541 | |
| 1542 | if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) |
| 1543 | return true; |
| 1544 | |
| 1545 | if (mayCauseSpilling(WavesAfter)) |
| 1546 | return true; |
| 1547 | |
| 1548 | return false; |
| 1549 | } |
| 1550 | |
| 1551 | bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { |
| 1552 | return GCNSchedStage::shouldRevertScheduling(WavesAfter) || |
| 1553 | mayCauseSpilling(WavesAfter) || |
| 1554 | (IncreaseOccupancy && WavesAfter < TargetOcc); |
| 1555 | } |
| 1556 | |
| 1557 | bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { |
| 1558 | if (mayCauseSpilling(WavesAfter)) |
| 1559 | return true; |
| 1560 | |
| 1561 | return false; |
| 1562 | } |
| 1563 | |
| 1564 | bool MemoryClauseInitialScheduleStage::shouldRevertScheduling( |
| 1565 | unsigned WavesAfter) { |
| 1566 | return mayCauseSpilling(WavesAfter); |
| 1567 | } |
| 1568 | |
| 1569 | bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { |
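|  | // Heuristic, not a guarantee: spilling is deemed likely when we are already |
|  | // at (or below) the minimum waves/EU, the region exceeds its register budget |
|  | // (excess RP), and the new schedule did not lower pressure compared to the |
|  | // old one. |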
| 1570 | if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() && |
| 1571 | !PressureAfter.less(MF, O: PressureBefore)) { |
| 1572 | LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n" ); |
| 1573 | return true; |
| 1574 | } |
| 1575 | |
| 1576 | return false; |
| 1577 | } |
| 1578 | |
| 1579 | void GCNSchedStage::revertScheduling() { |
| 1580 | DAG.RegionsWithMinOcc[RegionIdx] = |
| 1581 | PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()) == |
| 1582 | DAG.MinOccupancy; |
| 1583 | LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n" ); |
| 1584 | DAG.RegionEnd = DAG.RegionBegin; |
| 1585 | int SkippedDebugInstr = 0; |
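|  | // Walk the saved pre-scheduling order and splice each instruction back into |
|  | // place, letting LiveIntervals::handleMove keep liveness information |
|  | // consistent. Debug instructions are skipped here and re-placed by |
|  | // placeDebugValues() at the end. |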
| 1586 | for (MachineInstr *MI : Unsched) { |
| 1587 | if (MI->isDebugInstr()) { |
| 1588 | ++SkippedDebugInstr; |
| 1589 | continue; |
| 1590 | } |
| 1591 | |
| 1592 | if (MI->getIterator() != DAG.RegionEnd) { |
| 1593 | DAG.BB->splice(Where: DAG.RegionEnd, Other: DAG.BB, From: MI); |
| 1594 | if (!MI->isDebugInstr()) |
| 1595 | DAG.LIS->handleMove(MI&: *MI, UpdateFlags: true); |
| 1596 | } |
| 1597 | |
| 1598 | // Reset read-undef flags and update them later. |
| 1599 | for (auto &Op : MI->all_defs()) |
| 1600 | Op.setIsUndef(false); |
| 1601 | RegisterOperands RegOpers; |
| 1602 | RegOpers.collect(MI: *MI, TRI: *DAG.TRI, MRI: DAG.MRI, TrackLaneMasks: DAG.ShouldTrackLaneMasks, IgnoreDead: false); |
| 1603 | if (!MI->isDebugInstr()) { |
| 1604 | if (DAG.ShouldTrackLaneMasks) { |
| 1605 | // Adjust liveness and add missing dead+read-undef flags. |
| 1606 | SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(Instr: *MI).getRegSlot(); |
| 1607 | RegOpers.adjustLaneLiveness(LIS: *DAG.LIS, MRI: DAG.MRI, Pos: SlotIdx, AddFlagsMI: MI); |
| 1608 | } else { |
| 1609 | // Adjust for missing dead-def flags. |
| 1610 | RegOpers.detectDeadDefs(MI: *MI, LIS: *DAG.LIS); |
| 1611 | } |
| 1612 | } |
| 1613 | DAG.RegionEnd = MI->getIterator(); |
| 1614 | ++DAG.RegionEnd; |
| 1615 | LLVM_DEBUG(dbgs() << "Scheduling " << *MI); |
| 1616 | } |
| 1617 | |
| 1618 | // After reverting the schedule, debug instrs will now be at the end of the |
| 1619 | // block and RegionEnd will point to the first debug instr. Increment RegionEnd |
| 1620 | // past debug instrs to the actual end of the scheduling region. |
| 1621 | while (SkippedDebugInstr-- > 0) |
| 1622 | ++DAG.RegionEnd; |
| 1623 | |
| 1624 | // If Unsched.front() instruction is a debug instruction, this will actually |
| 1625 | // shrink the region since we moved all debug instructions to the end of the |
| 1626 | // block. Find the first instruction that is not a debug instruction. |
| 1627 | DAG.RegionBegin = Unsched.front()->getIterator(); |
| 1628 | if (DAG.RegionBegin->isDebugInstr()) { |
| 1629 | for (MachineInstr *MI : Unsched) { |
| 1630 | if (MI->isDebugInstr()) |
| 1631 | continue; |
| 1632 | DAG.RegionBegin = MI->getIterator(); |
| 1633 | break; |
| 1634 | } |
| 1635 | } |
| 1636 | |
| 1637 | // Then move the debug instructions back into their correct place and set |
| 1638 | // RegionBegin and RegionEnd if needed. |
| 1639 | DAG.placeDebugValues(); |
| 1640 | |
| 1641 | DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); |
| 1642 | } |
| 1643 | |
| 1644 | bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat, |
| 1645 | SlotIndex OriginalIdx, |
| 1646 | SlotIndex RematIdx) const { |
| 1647 | |
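|  | // Conservative legality check, similar in spirit to |
|  | // LiveRangeEdit::allUsesAvailableAt: every register the candidate reads must |
|  | // be live with the same value number at the rematerialization point as at |
|  | // the original definition, so rematerializing never extends a live range. |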
| 1648 | LiveIntervals *LIS = DAG.LIS; |
| 1649 | MachineRegisterInfo &MRI = DAG.MRI; |
| 1650 | OriginalIdx = OriginalIdx.getRegSlot(EC: true); |
| 1651 | RematIdx = std::max(a: RematIdx, b: RematIdx.getRegSlot(EC: true)); |
| 1652 | for (const MachineOperand &MO : InstToRemat->operands()) { |
| 1653 | if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) |
| 1654 | continue; |
| 1655 | |
| 1656 | if (!MO.getReg().isVirtual()) { |
| 1657 | // Do not attempt to reason about PhysRegs |
| 1658 | // TODO: better analysis of PhysReg liveness |
| 1659 | if (!DAG.MRI.isConstantPhysReg(PhysReg: MO.getReg()) && |
| 1660 | !DAG.TII->isIgnorableUse(MO)) |
| 1661 | return false; |
| 1662 | |
| 1663 | // Constant PhysRegs and IgnorableUses are okay |
| 1664 | continue; |
| 1665 | } |
| 1666 | |
| 1667 | LiveInterval &LI = LIS->getInterval(Reg: MO.getReg()); |
| 1668 | const VNInfo *OVNI = LI.getVNInfoAt(Idx: OriginalIdx); |
| 1669 | assert(OVNI); |
| 1670 | |
| 1671 | // Don't allow rematerialization immediately after the original def. |
| 1672 | // It would be incorrect if InstToRemat redefines the register. |
| 1673 | // See PR14098. |
| 1674 | if (SlotIndex::isSameInstr(A: OriginalIdx, B: RematIdx)) |
| 1675 | return false; |
| 1676 | |
| 1677 | if (OVNI != LI.getVNInfoAt(Idx: RematIdx)) |
| 1678 | return false; |
| 1679 | |
| 1680 | // Check that subrange is live at RematIdx. |
| 1681 | if (LI.hasSubRanges()) { |
| 1682 | const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); |
| 1683 | unsigned SubReg = MO.getSubReg(); |
| 1684 | LaneBitmask LM = SubReg ? TRI->getSubRegIndexLaneMask(SubIdx: SubReg) |
| 1685 | : MRI.getMaxLaneMaskForVReg(Reg: MO.getReg()); |
| 1686 | for (LiveInterval::SubRange &SR : LI.subranges()) { |
| 1687 | if ((SR.LaneMask & LM).none()) |
| 1688 | continue; |
| 1689 | if (!SR.liveAt(index: RematIdx)) |
| 1690 | return false; |
| 1691 | |
| 1692 | // Early exit if all used lanes are checked. No need to continue. |
| 1693 | LM &= ~SR.LaneMask; |
| 1694 | if (LM.none()) |
| 1695 | break; |
| 1696 | } |
| 1697 | } |
| 1698 | } |
| 1699 | return true; |
| 1700 | } |
| 1701 | |
| 1702 | bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { |
| 1703 | REMAT_DEBUG({ |
| 1704 | dbgs() << "Collecting rematerializable instructions in " ; |
| 1705 | MF.getFunction().printAsOperand(dbgs(), false); |
| 1706 | dbgs() << '\n'; |
| 1707 | }); |
| 1708 | |
| 1709 | // Maps optimizable regions (i.e., regions at minimum and register-limited |
| 1710 | // occupancy, or regions with spilling) to the target RP we would like to |
| 1711 | // reach. |
| 1712 | DenseMap<unsigned, GCNRPTarget> OptRegions; |
| 1713 | const Function &F = MF.getFunction(); |
| 1714 | unsigned DynamicVGPRBlockSize = |
| 1715 | MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); |
| 1716 | |
| 1717 | std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F); |
| 1718 | const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F); |
| 1719 | const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F); |
| 1720 | const unsigned MaxSGPRsIncOcc = |
| 1721 | ST.getMaxNumSGPRs(WavesPerEU: DAG.MinOccupancy + 1, Addressable: false); |
| 1722 | const unsigned MaxVGPRsIncOcc = |
| 1723 | ST.getMaxNumVGPRs(WavesPerEU: DAG.MinOccupancy + 1, DynamicVGPRBlockSize); |
| 1724 | IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy; |
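|  | // Example with purely hypothetical numbers: if DAG.MinOccupancy is 4, the |
|  | // *IncOcc limits are the SGPR/VGPR budgets that would allow 5 waves; a region |
|  | // becomes an optimization target below only if its pressure exceeds the |
|  | // relevant limits (the NoSpill limits when eliminating spilling, the IncOcc |
|  | // limits when chasing one extra wave). |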
| 1725 | |
| 1726 | // Collect optimizable regions. If there is spilling in any region we will |
| 1727 | // just try to reduce spilling. Otherwise we will try to increase occupancy by |
| 1728 | // one in the whole function. |
| 1729 | for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { |
| 1730 | GCNRegPressure &RP = DAG.Pressure[I]; |
| 1731 | // We allow ArchVGPR or AGPR savings to count as savings of the other kind |
| 1732 | // of VGPR only when trying to eliminate spilling. We cannot do this when |
| 1733 | // trying to increase occupancy since VGPR class swaps only occur later in |
| 1734 | // the register allocator i.e., the scheduler will not be able to reason |
| 1735 | // about these savings and will not report an increase in the achievable |
| 1736 | // occupancy, triggering rollbacks. |
| 1737 | GCNRPTarget Target(MaxSGPRsNoSpill, MaxVGPRsNoSpill, MF, RP, |
| 1738 | /*CombineVGPRSavings=*/true); |
| 1739 | if (!Target.satisfied() && IncreaseOccupancy) { |
| 1740 | // There is spilling in the region and we were so far trying to increase |
| 1741 | // occupancy. Stop trying that and focus on reducing spilling. |
| 1742 | IncreaseOccupancy = false; |
| 1743 | OptRegions.clear(); |
| 1744 | } else if (IncreaseOccupancy) { |
| 1745 | // There is no spilling in the region, try to increase occupancy. |
| 1746 | Target = GCNRPTarget(MaxSGPRsIncOcc, MaxVGPRsIncOcc, MF, RP, |
| 1747 | /*CombineVGPRSavings=*/false); |
| 1748 | } |
| 1749 | if (!Target.satisfied()) |
| 1750 | OptRegions.insert(KV: {I, Target}); |
| 1751 | } |
| 1752 | if (OptRegions.empty()) |
| 1753 | return false; |
| 1754 | |
| 1755 | #ifndef NDEBUG |
| 1756 | if (IncreaseOccupancy) { |
| 1757 | REMAT_DEBUG(dbgs() << "Occupancy minimal (" << DAG.MinOccupancy |
| 1758 | << ") in regions:\n" ); |
| 1759 | } else { |
| 1760 | REMAT_DEBUG(dbgs() << "Spilling w.r.t. minimum target occupancy (" |
| 1761 | << WavesPerEU.first << ") in regions:\n" ); |
| 1762 | } |
| 1763 | for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { |
| 1764 | if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) |
| 1765 | REMAT_DEBUG(dbgs() << " [" << I << "] " << OptIt->getSecond() << '\n'); |
| 1766 | } |
| 1767 | #endif |
| 1768 | |
| 1769 | // When we are reducing spilling, the target is the minimum target number of |
| 1770 | // waves/EU determined by the subtarget. In cases where either one of |
| 1771 | // "amdgpu-num-sgpr" or "amdgpu-num-vgpr" are set on the function, the current |
| 1772 | // minimum region occupancy may be higher than the latter. |
| 1773 | TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 |
| 1774 | : std::max(a: DAG.MinOccupancy, b: WavesPerEU.first); |
| 1775 | |
| 1776 | // Accounts for a reduction in RP in an optimizable region. Returns whether we |
| 1777 | // estimate that we have identified enough rematerialization opportunities to |
| 1778 | // achieve our goal, and sets Progress to true when this particular reduction |
| 1779 | // in pressure was helpful toward that goal. |
| 1780 | auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask, |
| 1781 | bool &Progress) -> bool { |
| 1782 | GCNRPTarget &Target = OptIt->getSecond(); |
| 1783 | if (!Target.isSaveBeneficial(Reg, MRI: DAG.MRI)) |
| 1784 | return false; |
| 1785 | Progress = true; |
| 1786 | Target.saveReg(Reg, Mask, MRI: DAG.MRI); |
| 1787 | if (Target.satisfied()) |
| 1788 | OptRegions.erase(OptIt->getFirst()); |
| 1789 | return OptRegions.empty(); |
| 1790 | }; |
| 1791 | |
| 1792 | // We need up-to-date live-out info to query live-out register masks in |
| 1793 | // regions containing rematerializable instructions. |
| 1794 | DAG.RegionLiveOuts.buildLiveRegMap(); |
| 1795 | |
| 1796 | // Cache set of registers that are going to be rematerialized. |
| 1797 | DenseSet<unsigned> RematRegs; |
| 1798 | |
| 1799 | // Identify rematerializable instructions in the function. |
| 1800 | for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { |
| 1801 | auto Region = DAG.Regions[I]; |
| 1802 | for (auto MI = Region.first; MI != Region.second; ++MI) { |
| 1803 | // The instruction must be trivially rematerializable. |
| 1804 | MachineInstr &DefMI = *MI; |
| 1805 | if (!isTriviallyReMaterializable(MI: DefMI)) |
| 1806 | continue; |
| 1807 | |
| 1808 | // We only support rematerializing virtual registers with one definition. |
| 1809 | Register Reg = DefMI.getOperand(i: 0).getReg(); |
| 1810 | if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(RegNo: Reg)) |
| 1811 | continue; |
| 1812 | |
| 1813 | // We only care to rematerialize the instruction if it has a single |
| 1814 | // non-debug user in a different region. The using MI may not belong to a |
| 1815 | // region if it is a lone region terminator. |
| 1816 | MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(RegNo: Reg); |
| 1817 | if (!UseMI) |
| 1818 | continue; |
| 1819 | auto UseRegion = MIRegion.find(Val: UseMI); |
| 1820 | if (UseRegion != MIRegion.end() && UseRegion->second == I) |
| 1821 | continue; |
| 1822 | |
| 1823 | // Do not rematerialize an instruction if it uses or is used by an |
| 1824 | // instruction that we have designated for rematerialization. |
| 1825 | // FIXME: Allow for rematerialization chains: this requires 1. updating |
| 1826 | // remat points to account for uses that are rematerialized, and 2. either |
| 1827 | // rematerializing the candidates in careful ordering, or deferring the |
| 1828 | // MBB RP walk until the entire chain has been rematerialized. |
| 1829 | if (Rematerializations.contains(Key: UseMI) || |
| 1830 | llvm::any_of(Range: DefMI.operands(), P: [&RematRegs](MachineOperand &MO) { |
| 1831 | return MO.isReg() && RematRegs.contains(V: MO.getReg()); |
| 1832 | })) |
| 1833 | continue; |
| 1834 | |
| 1835 | // Do not rematerialize an instruction if it uses registers that aren't |
| 1836 | // available at its use. This ensures that we are not extending any live |
| 1837 | // range while rematerializing. |
| 1838 | SlotIndex DefIdx = DAG.LIS->getInstructionIndex(Instr: DefMI); |
| 1839 | SlotIndex UseIdx = DAG.LIS->getInstructionIndex(Instr: *UseMI).getRegSlot(EC: true); |
| 1840 | if (!allUsesAvailableAt(InstToRemat: &DefMI, OriginalIdx: DefIdx, RematIdx: UseIdx)) |
| 1841 | continue; |
| 1842 | |
| 1843 | REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI); |
| 1844 | RematInstruction &Remat = |
| 1845 | Rematerializations.try_emplace(Key: &DefMI, Args&: UseMI).first->second; |
| 1846 | |
| 1847 | bool RematUseful = false; |
| 1848 | if (auto It = OptRegions.find(Val: I); It != OptRegions.end()) { |
| 1849 | // Optimistically consider that moving the instruction out of its |
| 1850 | // defining region will reduce RP in the latter; this assumes that |
| 1851 | // maximum RP in the region is reached somewhere between the defining |
| 1852 | // instruction and the end of the region. |
| 1853 | REMAT_DEBUG(dbgs() << " Defining region is optimizable\n" ); |
| 1854 | LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: I)[Reg]; |
| 1855 | if (ReduceRPInRegion(It, Reg, Mask, RematUseful)) |
| 1856 | return true; |
| 1857 | } |
| 1858 | |
| 1859 | for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) { |
| 1860 | // We are only collecting regions in which the register is a live-in |
| 1861 | // (and may be live-through). |
| 1862 | auto It = DAG.LiveIns[LIRegion].find(Val: Reg); |
| 1863 | if (It == DAG.LiveIns[LIRegion].end() || It->second.none()) |
| 1864 | continue; |
| 1865 | Remat.LiveInRegions.insert(V: LIRegion); |
| 1866 | |
| 1867 | // Account for the reduction in RP due to the rematerialization in an |
| 1868 | // optimizable region in which the defined register is a live-in. This |
| 1869 | // is exact for live-through regions but optimistic in the using region, |
| 1870 | // where RP is actually reduced only if maximum RP is reached somewhere |
| 1871 | // between the beginning of the region and the rematerializable |
| 1872 | // instruction's use. |
| 1873 | if (auto It = OptRegions.find(Val: LIRegion); It != OptRegions.end()) { |
| 1874 | REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n'); |
| 1875 | if (ReduceRPInRegion(It, Reg, DAG.LiveIns[LIRegion][Reg], |
| 1876 | RematUseful)) |
| 1877 | return true; |
| 1878 | } |
| 1879 | } |
| 1880 | |
| 1881 | // If the instruction is not a live-in or live-out in any optimizable |
| 1882 | // region then there is no point in rematerializing it. |
| 1883 | if (!RematUseful) { |
| 1884 | Rematerializations.pop_back(); |
| 1885 | REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n" ); |
| 1886 | } else { |
| 1887 | RematRegs.insert(V: Reg); |
| 1888 | } |
| 1889 | } |
| 1890 | } |
| 1891 | |
| 1892 | if (IncreaseOccupancy) { |
| 1893 | // We were trying to increase occupancy but failed, abort the stage. |
| 1894 | REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n" ); |
| 1895 | Rematerializations.clear(); |
| 1896 | return false; |
| 1897 | } |
| 1898 | REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n" ); |
| 1899 | return !Rematerializations.empty(); |
| 1900 | } |
| 1901 | |
| 1902 | void PreRARematStage::rematerialize() { |
| 1903 | const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo(); |
| 1904 | |
| 1905 | // Collect regions whose RP changes in an unpredictable way; we will have to |
| 1906 | // fully recompute their RP after all rematerializations. |
| 1907 | DenseSet<unsigned> RecomputeRP; |
| 1908 | |
| 1909 | // Rematerialize all instructions. |
| 1910 | for (auto &[DefMI, Remat] : Rematerializations) { |
| 1911 | MachineBasicBlock::iterator InsertPos(Remat.UseMI); |
| 1912 | Register Reg = DefMI->getOperand(i: 0).getReg(); |
| 1913 | unsigned SubReg = DefMI->getOperand(i: 0).getSubReg(); |
| 1914 | unsigned DefRegion = MIRegion.at(Val: DefMI); |
| 1915 | |
| 1916 | // Rematerialize DefMI to its use block. |
| 1917 | TII->reMaterialize(MBB&: *InsertPos->getParent(), MI: InsertPos, DestReg: Reg, SubIdx: SubReg, Orig: *DefMI, |
| 1918 | TRI: *DAG.TRI); |
| 1919 | Remat.RematMI = &*std::prev(x: InsertPos); |
| 1920 | Remat.RematMI->getOperand(i: 0).setSubReg(SubReg); |
| 1921 | DAG.LIS->InsertMachineInstrInMaps(MI&: *Remat.RematMI); |
| 1922 | |
| 1923 | // Update region boundaries in the regions we sank from (remove the defining |
| 1924 | // MI) and into (insert the MI rematerialized in the use block). Only then can |
| 1925 | // we erase the original MI. |
| 1926 | DAG.updateRegionBoundaries(RegionBounds&: DAG.Regions[DefRegion], MI: DefMI, NewMI: nullptr); |
| 1927 | auto UseRegion = MIRegion.find(Val: Remat.UseMI); |
| 1928 | if (UseRegion != MIRegion.end()) { |
| 1929 | DAG.updateRegionBoundaries(RegionBounds&: DAG.Regions[UseRegion->second], MI: InsertPos, |
| 1930 | NewMI: Remat.RematMI); |
| 1931 | } |
| 1932 | DAG.LIS->RemoveMachineInstrFromMaps(MI&: *DefMI); |
| 1933 | DefMI->eraseFromParent(); |
| 1934 | |
| 1935 | // Collect all regions impacted by the rematerialization and update their |
| 1936 | // live-in/RP information. |
| 1937 | for (unsigned I : Remat.LiveInRegions) { |
| 1938 | ImpactedRegions.insert(KV: {I, DAG.Pressure[I]}); |
| 1939 | GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I]; |
| 1940 | |
| 1941 | #ifdef EXPENSIVE_CHECKS |
| 1942 | // All uses are known to be available / live at the remat point. Thus, the |
| 1943 | // uses should already be live-in to the region. |
| 1944 | for (MachineOperand &MO : DefMI->operands()) { |
| 1945 | if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) |
| 1946 | continue; |
| 1947 | |
| 1948 | Register UseReg = MO.getReg(); |
| 1949 | if (!UseReg.isVirtual()) |
| 1950 | continue; |
| 1951 | |
| 1952 | LiveInterval &LI = DAG.LIS->getInterval(UseReg); |
| 1953 | LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg()); |
| 1954 | if (LI.hasSubRanges() && MO.getSubReg()) |
| 1955 | LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg()); |
| 1956 | |
| 1957 | LaneBitmask LiveInMask = RegionLiveIns.at(UseReg); |
| 1958 | LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM); |
| 1959 | // If this register has lanes not covered by the LiveIns, be sure they |
| 1960 | // do not map to any subrange. ref: |
| 1961 | // machine-scheduler-sink-trivial-remats.mir::omitted_subrange |
| 1962 | if (UncoveredLanes.any()) { |
| 1963 | assert(LI.hasSubRanges()); |
| 1964 | for (LiveInterval::SubRange &SR : LI.subranges()) |
| 1965 | assert((SR.LaneMask & UncoveredLanes).none()); |
| 1966 | } |
| 1967 | } |
| 1968 | #endif |
| 1969 | |
| 1970 | // The register is no longer a live-in in all regions but the one that |
| 1971 | // contains the single use. In live-through regions, maximum register |
| 1972 | // pressure decreases predictably so we can directly update it. In the |
| 1973 | // using region, maximum RP may or may not decrease, so we will mark it |
| 1974 | // for re-computation after all materializations have taken place. |
| 1975 | LaneBitmask PrevMask = RegionLiveIns[Reg]; |
| 1976 | RegionLiveIns.erase(Val: Reg); |
| 1977 | RegMasks.insert(KV: {{I, Remat.RematMI->getOperand(i: 0).getReg()}, PrevMask}); |
| 1978 | if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent()) |
| 1979 | DAG.Pressure[I].inc(Reg, PrevMask, NewMask: LaneBitmask::getNone(), MRI: DAG.MRI); |
| 1980 | else |
| 1981 | RecomputeRP.insert(V: I); |
| 1982 | } |
| 1983 | // RP in the region from which the instruction was rematerialized may or may |
| 1984 | // not decrease. |
| 1985 | ImpactedRegions.insert(KV: {DefRegion, DAG.Pressure[DefRegion]}); |
| 1986 | RecomputeRP.insert(V: DefRegion); |
| 1987 | |
| 1988 | // Recompute live interval to reflect the register's rematerialization. |
| 1989 | Register RematReg = Remat.RematMI->getOperand(i: 0).getReg(); |
| 1990 | DAG.LIS->removeInterval(Reg: RematReg); |
| 1991 | DAG.LIS->createAndComputeVirtRegInterval(Reg: RematReg); |
| 1992 | } |
| 1993 | |
| 1994 | // All regions impacted by at least one rematerialization must be rescheduled. |
| 1995 | // Maximum pressure must also be recomputed for all regions where it changed |
| 1996 | // non-predictably and checked against the target occupancy. |
| 1997 | AchievedOcc = TargetOcc; |
| 1998 | for (auto &[I, OriginalRP] : ImpactedRegions) { |
| 1999 | bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second; |
| 2000 | RescheduleRegions[I] = !IsEmptyRegion; |
| 2001 | if (!RecomputeRP.contains(V: I)) |
| 2002 | continue; |
| 2003 | |
| 2004 | GCNRegPressure RP; |
| 2005 | if (IsEmptyRegion) { |
| 2006 | RP = getRegPressure(MRI: DAG.MRI, LiveRegs&: DAG.LiveIns[I]); |
| 2007 | } else { |
| 2008 | GCNDownwardRPTracker RPT(*DAG.LIS); |
| 2009 | auto *NonDbgMI = &*skipDebugInstructionsForward(It: DAG.Regions[I].first, |
| 2010 | End: DAG.Regions[I].second); |
| 2011 | if (NonDbgMI == DAG.Regions[I].second) { |
| 2012 | // Region is non-empty but contains only debug instructions. |
| 2013 | RP = getRegPressure(MRI: DAG.MRI, LiveRegs&: DAG.LiveIns[I]); |
| 2014 | } else { |
| 2015 | RPT.reset(MI: *NonDbgMI, LiveRegs: &DAG.LiveIns[I]); |
| 2016 | RPT.advance(End: DAG.Regions[I].second); |
| 2017 | RP = RPT.moveMaxPressure(); |
| 2018 | } |
| 2019 | } |
| 2020 | DAG.Pressure[I] = RP; |
| 2021 | AchievedOcc = std::min( |
| 2022 | a: AchievedOcc, b: RP.getOccupancy(ST, DynamicVGPRBlockSize: MF.getInfo<SIMachineFunctionInfo>() |
| 2023 | ->getDynamicVGPRBlockSize())); |
| 2024 | } |
| 2025 | REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n" ); |
| 2026 | } |
| 2027 | |
| 2028 | // Copied from MachineLICM |
| 2029 | bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) { |
| 2030 | if (!DAG.TII->isTriviallyReMaterializable(MI)) |
| 2031 | return false; |
| 2032 | |
| 2033 | for (const MachineOperand &MO : MI.all_uses()) { |
| 2034 | // We can't remat physreg uses, unless the physreg is constant or the use is |
| 2035 | // ignorable (e.g. the implicit exec use on VALU instructions). |
| 2036 | if (MO.getReg().isPhysical()) { |
| 2037 | if (DAG.MRI.isConstantPhysReg(PhysReg: MO.getReg()) || DAG.TII->isIgnorableUse(MO)) |
| 2038 | continue; |
| 2039 | return false; |
| 2040 | } |
| 2041 | } |
| 2042 | |
| 2043 | return true; |
| 2044 | } |
| 2045 | |
| 2046 | void PreRARematStage::finalizeGCNSchedStage() { |
| 2047 | // We consider that reducing spilling is always beneficial, so we never roll |
| 2048 | // back rematerializations in such cases. It's also possible that rescheduling |
| 2049 | // lowers occupancy below the one achieved just through remats, in which case |
| 2050 | // we do not want to roll back either (the rescheduling was already reverted |
| 2051 | // in PreRARematStage::shouldRevertScheduling in such cases). |
| 2052 | unsigned MaxOcc = std::max(a: AchievedOcc, b: DAG.MinOccupancy); |
| 2053 | if (!IncreaseOccupancy || MaxOcc >= TargetOcc) |
| 2054 | return; |
| 2055 | |
| 2056 | REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n" ); |
| 2057 | const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo(); |
| 2058 | |
| 2059 | // Rollback the rematerializations. |
| 2060 | for (const auto &[DefMI, Remat] : Rematerializations) { |
| 2061 | MachineInstr &RematMI = *Remat.RematMI; |
| 2062 | unsigned DefRegion = MIRegion.at(Val: DefMI); |
| 2063 | MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second); |
| 2064 | MachineBasicBlock *MBB = RegionBB[DefRegion]; |
| 2065 | Register Reg = RematMI.getOperand(i: 0).getReg(); |
| 2066 | unsigned SubReg = RematMI.getOperand(i: 0).getSubReg(); |
| 2067 | |
| 2068 | // Re-rematerialize MI at the end of its original region. Note that it may |
| 2069 | // not be rematerialized exactly in the same position as originally within |
| 2070 | // the region, but it should not matter much. |
| 2071 | TII->reMaterialize(MBB&: *MBB, MI: InsertPos, DestReg: Reg, SubIdx: SubReg, Orig: RematMI, TRI: *DAG.TRI); |
| 2072 | MachineInstr *NewMI = &*std::prev(x: InsertPos); |
| 2073 | NewMI->getOperand(i: 0).setSubReg(SubReg); |
| 2074 | DAG.LIS->InsertMachineInstrInMaps(MI&: *NewMI); |
| 2075 | |
| 2076 | auto UseRegion = MIRegion.find(Val: Remat.UseMI); |
| 2077 | if (UseRegion != MIRegion.end()) { |
| 2078 | DAG.updateRegionBoundaries(RegionBounds&: DAG.Regions[UseRegion->second], MI: RematMI, |
| 2079 | NewMI: nullptr); |
| 2080 | } |
| 2081 | DAG.updateRegionBoundaries(RegionBounds&: DAG.Regions[DefRegion], MI: InsertPos, NewMI); |
| 2082 | |
| 2083 | // Erase rematerialized MI. |
| 2084 | DAG.LIS->RemoveMachineInstrFromMaps(MI&: RematMI); |
| 2085 | RematMI.eraseFromParent(); |
| 2086 | |
| 2087 | // Recompute live interval for the re-rematerialized register |
| 2088 | DAG.LIS->removeInterval(Reg); |
| 2089 | DAG.LIS->createAndComputeVirtRegInterval(Reg); |
| 2090 | |
| 2091 | // Re-add the register as a live-in in all regions it used to be one in. |
| 2092 | for (unsigned LIRegion : Remat.LiveInRegions) |
| 2093 | DAG.LiveIns[LIRegion].insert(KV: {Reg, RegMasks.at(Val: {LIRegion, Reg})}); |
| 2094 | } |
| 2095 | |
| 2096 | // Reset RP in all impacted regions. |
| 2097 | for (auto &[I, OriginalRP] : ImpactedRegions) |
| 2098 | DAG.Pressure[I] = OriginalRP; |
| 2099 | |
| 2100 | GCNSchedStage::finalizeGCNSchedStage(); |
| 2101 | } |
| 2102 | |
| 2103 | void GCNScheduleDAGMILive::updateRegionBoundaries( |
| 2104 | RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI, |
| 2105 | MachineInstr *NewMI) { |
| 2106 | assert((!NewMI || NewMI != RegionBounds.second) && |
| 2107 | "cannot remove at region end" ); |
| 2108 | |
| 2109 | if (RegionBounds.first == RegionBounds.second) { |
| 2110 | assert(NewMI && "cannot remove from an empty region" ); |
| 2111 | RegionBounds.first = NewMI; |
| 2112 | return; |
| 2113 | } |
| 2114 | |
| 2115 | // We only care about modifications at the beginning of a non-empty region |
| 2116 | // since the upper region boundary is exclusive. |
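|  | // For example, removing the first instruction of region [A, B, C) leaves |
|  | // [B, C), while inserting NewMI immediately before A makes NewMI the new |
|  | // region begin, i.e. [NewMI, A, B, C). |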
| 2117 | if (MI != RegionBounds.first) |
| 2118 | return; |
| 2119 | if (!NewMI) |
| 2120 | RegionBounds.first = std::next(x: MI); // Removal |
| 2121 | else |
| 2122 | RegionBounds.first = NewMI; // Insertion |
| 2123 | } |
| 2124 | |
| 2125 | static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { |
| 2126 | const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII); |
| 2127 | return any_of(Range&: *DAG, P: [SII](MachineBasicBlock::iterator MI) { |
| 2128 | return SII->isIGLPMutationOnly(Opcode: MI->getOpcode()); |
| 2129 | }); |
| 2130 | } |
| 2131 | |
| 2132 | GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive( |
| 2133 | MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S, |
| 2134 | bool RemoveKillFlags) |
| 2135 | : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {} |
| 2136 | |
| 2137 | void GCNPostScheduleDAGMILive::schedule() { |
| 2138 | HasIGLPInstrs = hasIGLPInstrs(DAG: this); |
| 2139 | if (HasIGLPInstrs) { |
| 2140 | SavedMutations.clear(); |
| 2141 | SavedMutations.swap(x&: Mutations); |
| 2142 | addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PostRA)); |
| 2143 | } |
| 2144 | |
| 2145 | ScheduleDAGMI::schedule(); |
| 2146 | } |
| 2147 | |
| 2148 | void GCNPostScheduleDAGMILive::finalizeSchedule() { |
| 2149 | if (HasIGLPInstrs) |
| 2150 | SavedMutations.swap(x&: Mutations); |
| 2151 | |
| 2152 | ScheduleDAGMI::finalizeSchedule(); |
| 2153 | } |
| 2154 | |