| 1 | //===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | /// \file |
| 10 | /// This contains a MachineSchedStrategy implementation for maximizing wave |
| 11 | /// occupancy on GCN hardware. |
| 12 | /// |
| 13 | /// This pass will apply multiple scheduling stages to the same function. |
| 14 | /// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual |
| 15 | /// entry point for the scheduling of those regions is |
| 16 | /// GCNScheduleDAGMILive::runSchedStages. |
| 17 | /// |
| 18 | /// Generally, the reason for having multiple scheduling stages is to account |
| 19 | /// for the kernel-wide effect of register usage on occupancy. Usually, only a |
| 20 | /// few scheduling regions will have register pressure high enough to limit |
| 21 | /// occupancy for the kernel, so constraints can be relaxed to improve ILP in |
| 22 | /// other regions. |
| 23 | /// |
| 24 | //===----------------------------------------------------------------------===// |
| 25 | |
| 26 | #include "GCNSchedStrategy.h" |
| 27 | #include "AMDGPUIGroupLP.h" |
| 28 | #include "GCNRegPressure.h" |
| 29 | #include "SIMachineFunctionInfo.h" |
| 30 | #include "Utils/AMDGPUBaseInfo.h" |
| 31 | #include "llvm/ADT/BitVector.h" |
| 32 | #include "llvm/ADT/STLExtras.h" |
| 33 | #include "llvm/CodeGen/CalcSpillWeights.h" |
| 34 | #include "llvm/CodeGen/MachineBasicBlock.h" |
| 35 | #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" |
| 36 | #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" |
| 37 | #include "llvm/CodeGen/MachineCycleAnalysis.h" |
| 38 | #include "llvm/CodeGen/MachineOperand.h" |
| 39 | #include "llvm/CodeGen/RegisterClassInfo.h" |
| 40 | #include "llvm/MC/LaneBitmask.h" |
| 41 | #include "llvm/MC/MCInstrItineraries.h" |
| 42 | #include "llvm/MC/MCSchedule.h" |
| 43 | #include "llvm/MC/TargetRegistry.h" |
| 44 | #include "llvm/Support/ErrorHandling.h" |
| 45 | |
| 46 | #define DEBUG_TYPE "machine-scheduler" |
| 47 | |
| 48 | using namespace llvm; |
| 49 | |
| 50 | static cl::opt<bool> DisableUnclusterHighRP( |
| 51 | "amdgpu-disable-unclustered-high-rp-reschedule" , cl::Hidden, |
| 52 | cl::desc("Disable unclustered high register pressure " |
| 53 | "reduction scheduling stage." ), |
| 54 | cl::init(Val: false)); |
| 55 | |
| 56 | static cl::opt<bool> DisableClusteredLowOccupancy( |
| 57 | "amdgpu-disable-clustered-low-occupancy-reschedule" , cl::Hidden, |
| 58 | cl::desc("Disable clustered low occupancy " |
| 59 | "rescheduling for ILP scheduling stage." ), |
| 60 | cl::init(Val: false)); |
| 61 | |
| 62 | static cl::opt<unsigned> ScheduleMetricBias( |
| 63 | "amdgpu-schedule-metric-bias" , cl::Hidden, |
| 64 | cl::desc( |
| 65 | "Sets the bias which adds weight to occupancy vs latency. Set it to " |
| 66 | "100 to chase the occupancy only." ), |
| 67 | cl::init(Val: 10)); |
| 68 | |
| 69 | static cl::opt<bool> |
| 70 | RelaxedOcc("amdgpu-schedule-relaxed-occupancy" , cl::Hidden, |
| 71 | cl::desc("Relax occupancy targets for kernels which are memory " |
| 72 | "bound (amdgpu-membound-threshold), or " |
| 73 | "Wave Limited (amdgpu-limit-wave-threshold)." ), |
| 74 | cl::init(Val: false)); |
| 75 | |
| 76 | static cl::opt<bool> GCNTrackers( |
| 77 | "amdgpu-use-amdgpu-trackers" , cl::Hidden, |
| 78 | cl::desc("Use the AMDGPU specific RPTrackers during scheduling" ), |
| 79 | cl::init(Val: false)); |
| 80 | |
| 81 | static cl::opt<unsigned> PendingQueueLimit( |
| 82 | "amdgpu-scheduler-pending-queue-limit" , cl::Hidden, |
| 83 | cl::desc( |
| 84 | "Max (Available+Pending) size to inspect pending queue (0 disables)" ), |
| 85 | cl::init(Val: 256)); |
| 86 | |
| 87 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 88 | #define DUMP_MAX_REG_PRESSURE |
| 89 | static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler( |
| 90 | "amdgpu-print-max-reg-pressure-regusage-before-scheduler" , cl::Hidden, |
| 91 | cl::desc("Print a list of live registers along with their def/uses at the " |
| 92 | "point of maximum register pressure before scheduling." ), |
| 93 | cl::init(false)); |
| 94 | |
| 95 | static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler( |
| 96 | "amdgpu-print-max-reg-pressure-regusage-after-scheduler" , cl::Hidden, |
| 97 | cl::desc("Print a list of live registers along with their def/uses at the " |
| 98 | "point of maximum register pressure after scheduling." ), |
| 99 | cl::init(false)); |
| 100 | #endif |
| 101 | |
| 102 | static cl::opt<bool> DisableRewriteMFMAFormSchedStage( |
| 103 | "amdgpu-disable-rewrite-mfma-form-sched-stage" , cl::Hidden, |
| 104 | cl::desc("Disable rewrie mfma rewrite scheduling stage" ), cl::init(Val: true)); |
| 105 | |
| 106 | const unsigned ScheduleMetrics::ScaleFactor = 100; |
| 107 | |
| 108 | GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) |
| 109 | : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), |
| 110 | DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) { |
| 111 | } |
| 112 | |
| 113 | void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { |
| 114 | GenericScheduler::initialize(dag: DAG); |
| 115 | |
| 116 | MF = &DAG->MF; |
| 117 | |
| 118 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
| 119 | |
| 120 | SGPRExcessLimit = |
| 121 | Context->RegClassInfo->getNumAllocatableRegs(RC: &AMDGPU::SGPR_32RegClass); |
| 122 | VGPRExcessLimit = |
| 123 | Context->RegClassInfo->getNumAllocatableRegs(RC: &AMDGPU::VGPR_32RegClass); |
| 124 | |
| 125 | SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); |
| 126 | // Set the initial TargetOccupancy to the maximum occupancy that we can |
| 127 | // achieve for this function. This effectively sets a lower bound on the |
| 128 | // 'Critical' register limits in the scheduler. |
| 129 | // Allow for lower occupancy targets if kernel is wave limited or memory |
| 130 | // bound, and using the relaxed occupancy feature. |
| 131 | TargetOccupancy = |
| 132 | RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy(); |
| 133 | SGPRCriticalLimit = |
| 134 | std::min(a: ST.getMaxNumSGPRs(WavesPerEU: TargetOccupancy, Addressable: true), b: SGPRExcessLimit); |
| 135 | |
| 136 | if (!KnownExcessRP) { |
| 137 | VGPRCriticalLimit = std::min( |
| 138 | a: ST.getMaxNumVGPRs(WavesPerEU: TargetOccupancy, DynamicVGPRBlockSize: MFI.getDynamicVGPRBlockSize()), |
| 139 | b: VGPRExcessLimit); |
| 140 | } else { |
| 141 | // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result, except |
| 142 | // it returns a reasonably small number for targets with lots of VGPRs, such |
| 143 | // as GFX10 and GFX11. |
| 144 | LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative " |
| 145 | "VGPRCriticalLimit calculation method.\n" ); |
| 146 | unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize(); |
| 147 | unsigned Granule = |
| 148 | AMDGPU::IsaInfo::getVGPRAllocGranule(STI: &ST, DynamicVGPRBlockSize); |
| 149 | unsigned Addressable = |
| 150 | AMDGPU::IsaInfo::getAddressableNumVGPRs(STI: &ST, DynamicVGPRBlockSize); |
| 151 | unsigned VGPRBudget = alignDown(Value: Addressable / TargetOccupancy, Align: Granule); |
| 152 | VGPRBudget = std::max(a: VGPRBudget, b: Granule); |
| 153 | VGPRCriticalLimit = std::min(a: VGPRBudget, b: VGPRExcessLimit); |
| 154 | } |
| 155 | |
| 156 | // Subtract error margin and bias from register limits and avoid overflow. |
| 157 | SGPRCriticalLimit -= std::min(a: SGPRLimitBias + ErrorMargin, b: SGPRCriticalLimit); |
| 158 | VGPRCriticalLimit -= std::min(a: VGPRLimitBias + ErrorMargin, b: VGPRCriticalLimit); |
| 159 | SGPRExcessLimit -= std::min(a: SGPRLimitBias + ErrorMargin, b: SGPRExcessLimit); |
| 160 | VGPRExcessLimit -= std::min(a: VGPRLimitBias + ErrorMargin, b: VGPRExcessLimit); |
| 161 | LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit |
| 162 | << ", VGPRExcessLimit = " << VGPRExcessLimit |
| 163 | << ", SGPRCriticalLimit = " << SGPRCriticalLimit |
| 164 | << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n" ); |
| 165 | } |
| 166 | |
| 167 | /// Checks whether \p SU can use the cached DAG pressure diffs to compute the |
| 168 | /// current register pressure. |
| 169 | /// |
| 170 | /// This works for the common case, but it has a few exceptions that have been |
| 171 | /// observed through trial and error: |
| 172 | /// - Explicit physical register operands |
| 173 | /// - Subregister definitions |
| 174 | /// |
| 175 | /// In both of those cases, PressureDiff doesn't represent the actual pressure, |
| 176 | /// and querying LiveIntervals through the RegPressureTracker is needed to get |
| 177 | /// an accurate value. |
| 178 | /// |
| 179 | /// We should eventually only use PressureDiff for maximum performance, but this |
| 180 | /// already allows 80% of SUs to take the fast path without changing scheduling |
| 181 | /// at all. Further changes would either change scheduling, or require a lot |
| 182 | /// more logic to recover an accurate pressure estimate from the PressureDiffs. |
| 183 | static bool canUsePressureDiffs(const SUnit &SU) { |
| 184 | if (!SU.isInstr()) |
| 185 | return false; |
| 186 | |
| 187 | // Cannot use pressure diffs for subregister defs or with physregs, it's |
| 188 | // imprecise in both cases. |
| 189 | for (const auto &Op : SU.getInstr()->operands()) { |
| 190 | if (!Op.isReg() || Op.isImplicit()) |
| 191 | continue; |
| 192 | if (Op.getReg().isPhysical() || |
| 193 | (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister)) |
| 194 | return false; |
| 195 | } |
| 196 | return true; |
| 197 | } |
| 198 | |
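// Compute the register pressure that would result from scheduling SU at the
// top (AtTop) or bottom of the current region. Uses the generic
// RegPressureTracker by default, or the AMDGPU-specific downward/upward
// trackers when -amdgpu-use-amdgpu-trackers is enabled.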
| 199 | static void getRegisterPressures( |
| 200 | bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU, |
| 201 | std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure, |
| 202 | GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker, |
| 203 | ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) { |
| 204 | // getDownwardPressure() and getUpwardPressure() make temporary changes to |
| 205 | // the tracker, so we need to pass those functions a non-const reference. |
| 206 | RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker); |
| 207 | if (!GCNTrackers) { |
| 208 | AtTop |
| 209 | ? TempTracker.getDownwardPressure(MI: SU->getInstr(), PressureResult&: Pressure, MaxPressureResult&: MaxPressure) |
| 210 | : TempTracker.getUpwardPressure(MI: SU->getInstr(), PressureResult&: Pressure, MaxPressureResult&: MaxPressure); |
| 211 | |
| 212 | return; |
| 213 | } |
| 214 | |
| 215 | // GCNTrackers |
| 216 | Pressure.resize(new_size: 4, x: 0); |
| 217 | MachineInstr *MI = SU->getInstr(); |
| 218 | GCNRegPressure NewPressure; |
| 219 | if (AtTop) { |
| 220 | GCNDownwardRPTracker TempDownwardTracker(DownwardTracker); |
| 221 | NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, TRI: SRI); |
| 222 | } else { |
| 223 | GCNUpwardRPTracker TempUpwardTracker(UpwardTracker); |
| 224 | TempUpwardTracker.recede(MI: *MI); |
| 225 | NewPressure = TempUpwardTracker.getPressure(); |
| 226 | } |
| 227 | Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum(); |
| 228 | Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = |
| 229 | NewPressure.getArchVGPRNum(); |
| 230 | Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum(); |
| 231 | } |
| 232 | |
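// Initialize Cand for SU and, when pressure tracking is enabled, record the
// excess/critical register pressure deltas that scheduling SU would cause so
// that tryCandidate() can compare candidates on register pressure.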
| 233 | void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, |
| 234 | bool AtTop, |
| 235 | const RegPressureTracker &RPTracker, |
| 236 | const SIRegisterInfo *SRI, |
| 237 | unsigned SGPRPressure, |
| 238 | unsigned VGPRPressure, bool IsBottomUp) { |
| 239 | Cand.SU = SU; |
| 240 | Cand.AtTop = AtTop; |
| 241 | |
| 242 | if (!DAG->isTrackingPressure()) |
| 243 | return; |
| 244 | |
| 245 | Pressure.clear(); |
| 246 | MaxPressure.clear(); |
| 247 | |
| 248 | // We try to use the cached PressureDiffs in the ScheduleDAG whenever |
| 249 | // possible over querying the RegPressureTracker. |
| 250 | // |
| 251 | // RegPressureTracker will make a lot of LIS queries which are very |
| 252 | // expensive, it is considered a slow function in this context. |
| 253 | // |
| 254 | // PressureDiffs are precomputed and cached, and getPressureDiff is just a |
| 255 | // trivial lookup into an array. It is pretty much free. |
| 256 | // |
| 257 | // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of |
| 258 | // PressureDiffs. |
| 259 | if (AtTop || !canUsePressureDiffs(SU: *SU) || GCNTrackers) { |
| 260 | getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure, |
| 261 | DownwardTracker, UpwardTracker, DAG, SRI); |
| 262 | } else { |
| 263 | // Reserve 4 slots. |
| 264 | Pressure.resize(new_size: 4, x: 0); |
| 265 | Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure; |
| 266 | Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure; |
| 267 | |
| 268 | for (const auto &Diff : DAG->getPressureDiff(SU)) { |
| 269 | if (!Diff.isValid()) |
| 270 | continue; |
| 271 | // PressureDiffs is always bottom-up so if we're working top-down we need |
| 272 | // to invert its sign. |
| 273 | Pressure[Diff.getPSet()] += |
| 274 | (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc()); |
| 275 | } |
| 276 | |
| 277 | #ifdef EXPENSIVE_CHECKS |
| 278 | std::vector<unsigned> CheckPressure, CheckMaxPressure; |
| 279 | getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure, |
| 280 | DownwardTracker, UpwardTracker, DAG, SRI); |
| 281 | if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] != |
| 282 | CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] || |
| 283 | Pressure[AMDGPU::RegisterPressureSets::VGPR_32] != |
| 284 | CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) { |
| 285 | errs() << "Register Pressure is inaccurate when calculated through " |
| 286 | "PressureDiff\n" |
| 287 | << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32] |
| 288 | << ", expected " |
| 289 | << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n" |
| 290 | << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32] |
| 291 | << ", expected " |
| 292 | << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n" ; |
| 293 | report_fatal_error("inaccurate register pressure calculation" ); |
| 294 | } |
| 295 | #endif |
| 296 | } |
| 297 | |
| 298 | unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; |
| 299 | unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; |
| 300 | |
| 301 | // If two instructions increase the pressure of different register sets |
| 302 | // by the same amount, the generic scheduler will prefer to schedule the |
| 303 | // instruction that increases the set with the least amount of registers, |
| 304 | // which in our case would be SGPRs. This is rarely what we want, so |
| 305 | // when we report excess/critical register pressure, we do it either |
| 306 | // only for VGPRs or only for SGPRs. |
| 307 | |
| 308 | // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs. |
| 309 | const unsigned MaxVGPRPressureInc = 16; |
| 310 | bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit; |
| 311 | bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit; |
| 312 | |
| 313 | // FIXME: We have to enter REG-EXCESS before we reach the actual threshold |
| 314 | // to increase the likelihood we don't go over the limits. We should improve |
| 315 | // the analysis to look through dependencies to find the path with the least |
| 316 | // register pressure. |
| 317 | |
| 318 | // We only need to update the RPDelta for instructions that increase register |
| 319 | // pressure. Instructions that decrease or keep reg pressure the same will be |
| 320 | // marked as RegExcess in tryCandidate() when they are compared with |
| 321 | // instructions that increase the register pressure. |
| 322 | if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) { |
| 323 | HasHighPressure = true; |
| 324 | Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); |
| 325 | Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit); |
| 326 | } |
| 327 | |
| 328 | if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) { |
| 329 | HasHighPressure = true; |
| 330 | Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32); |
| 331 | Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit); |
| 332 | } |
| 333 | |
| 334 | // Register pressure is considered 'CRITICAL' if it is approaching a value |
| 335 | // that would reduce the wave occupancy for the execution unit. When |
| 336 | // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both |
| 337 | // has the same cost, so we don't need to prefer one over the other. |
| 338 | |
| 339 | int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit; |
| 340 | int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit; |
| 341 | |
| 342 | if (SGPRDelta >= 0 || VGPRDelta >= 0) { |
| 343 | HasHighPressure = true; |
| 344 | if (SGPRDelta > VGPRDelta) { |
| 345 | Cand.RPDelta.CriticalMax = |
| 346 | PressureChange(AMDGPU::RegisterPressureSets::SReg_32); |
| 347 | Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta); |
| 348 | } else { |
| 349 | Cand.RPDelta.CriticalMax = |
| 350 | PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); |
| 351 | Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta); |
| 352 | } |
| 353 | } |
| 354 | } |
| 355 | |
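// Returns true if the pending queue should also be inspected when picking a
// node from Zone: the scheduling model must buffer micro-ops and the combined
// size of the available and pending queues must not exceed the
// amdgpu-scheduler-pending-queue-limit threshold.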
| 356 | static bool shouldCheckPending(SchedBoundary &Zone, |
| 357 | const TargetSchedModel *SchedModel) { |
| 358 | bool HasBufferedModel = |
| 359 | SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize(); |
| 360 | unsigned Combined = Zone.Available.size() + Zone.Pending.size(); |
| 361 | return Combined <= PendingQueueLimit && HasBufferedModel; |
| 362 | } |
| 363 | |
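// Wrapper around SchedBoundary::pickOnlyChoice() that declines to return an
// "only choice" when the pending queue is non-empty and will be inspected, so
// that pending instructions still get compared against available ones.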
| 364 | static SUnit *pickOnlyChoice(SchedBoundary &Zone, |
| 365 | const TargetSchedModel *SchedModel) { |
| 366 | // pickOnlyChoice() releases pending instructions and checks for new hazards. |
| 367 | SUnit *OnlyChoice = Zone.pickOnlyChoice(); |
| 368 | if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty()) |
| 369 | return OnlyChoice; |
| 370 | |
| 371 | return nullptr; |
| 372 | } |
| 373 | |
| 374 | void GCNSchedStrategy::printCandidateDecision(const SchedCandidate &Current, |
| 375 | const SchedCandidate &Preferred) { |
| 376 | LLVM_DEBUG({ |
| 377 | dbgs() << "Prefer:\t\t" ; |
| 378 | DAG->dumpNode(*Preferred.SU); |
| 379 | |
| 380 | if (Current.SU) { |
| 381 | dbgs() << "Not:\t" ; |
| 382 | DAG->dumpNode(*Current.SU); |
| 383 | } |
| 384 | |
| 385 | dbgs() << "Reason:\t\t" ; |
| 386 | traceCandidate(Preferred); |
| 387 | }); |
| 388 | } |
| 389 | |
| 390 | // This function is mostly cut and pasted from |
| 391 | // GenericScheduler::pickNodeFromQueue() |
| 392 | void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, |
| 393 | const CandPolicy &ZonePolicy, |
| 394 | const RegPressureTracker &RPTracker, |
| 395 | SchedCandidate &Cand, bool &IsPending, |
| 396 | bool IsBottomUp) { |
| 397 | const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); |
| 398 | ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos(); |
| 399 | unsigned SGPRPressure = 0; |
| 400 | unsigned VGPRPressure = 0; |
| 401 | IsPending = false; |
| 402 | if (DAG->isTrackingPressure()) { |
| 403 | if (!GCNTrackers) { |
| 404 | SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; |
| 405 | VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; |
| 406 | } else { |
| 407 | GCNRPTracker *T = IsBottomUp |
| 408 | ? static_cast<GCNRPTracker *>(&UpwardTracker) |
| 409 | : static_cast<GCNRPTracker *>(&DownwardTracker); |
| 410 | SGPRPressure = T->getPressure().getSGPRNum(); |
| 411 | VGPRPressure = T->getPressure().getArchVGPRNum(); |
| 412 | } |
| 413 | } |
| 414 | LLVM_DEBUG(dbgs() << "Available Q:\n" ); |
| 415 | ReadyQueue &AQ = Zone.Available; |
| 416 | for (SUnit *SU : AQ) { |
| 417 | |
| 418 | SchedCandidate TryCand(ZonePolicy); |
| 419 | initCandidate(Cand&: TryCand, SU, AtTop: Zone.isTop(), RPTracker, SRI, SGPRPressure, |
| 420 | VGPRPressure, IsBottomUp); |
| 421 | // Pass SchedBoundary only when comparing nodes from the same boundary. |
| 422 | SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; |
| 423 | tryCandidate(Cand, TryCand, Zone: ZoneArg); |
| 424 | if (TryCand.Reason != NoCand) { |
| 425 | // Initialize resource delta if needed in case future heuristics query it. |
| 426 | if (TryCand.ResDelta == SchedResourceDelta()) |
| 427 | TryCand.initResourceDelta(DAG: Zone.DAG, SchedModel); |
| 428 | LLVM_DEBUG(printCandidateDecision(Cand, TryCand)); |
| 429 | Cand.setBest(TryCand); |
| 430 | } else { |
| 431 | printCandidateDecision(Current: TryCand, Preferred: Cand); |
| 432 | } |
| 433 | } |
| 434 | |
| 435 | if (!shouldCheckPending(Zone, SchedModel)) |
| 436 | return; |
| 437 | |
| 438 | LLVM_DEBUG(dbgs() << "Pending Q:\n" ); |
| 439 | ReadyQueue &PQ = Zone.Pending; |
| 440 | for (SUnit *SU : PQ) { |
| 441 | |
| 442 | SchedCandidate TryCand(ZonePolicy); |
| 443 | initCandidate(Cand&: TryCand, SU, AtTop: Zone.isTop(), RPTracker, SRI, SGPRPressure, |
| 444 | VGPRPressure, IsBottomUp); |
| 445 | // Pass SchedBoundary only when comparing nodes from the same boundary. |
| 446 | SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; |
| 447 | tryPendingCandidate(Cand, TryCand, Zone: ZoneArg); |
| 448 | if (TryCand.Reason != NoCand) { |
| 449 | // Initialize resource delta if needed in case future heuristics query it. |
| 450 | if (TryCand.ResDelta == SchedResourceDelta()) |
| 451 | TryCand.initResourceDelta(DAG: Zone.DAG, SchedModel); |
| 452 | LLVM_DEBUG(printCandidateDecision(Cand, TryCand)); |
| 453 | IsPending = true; |
| 454 | Cand.setBest(TryCand); |
| 455 | } else { |
| 456 | printCandidateDecision(Current: TryCand, Preferred: Cand); |
| 457 | } |
| 458 | } |
| 459 | } |
| 460 | |
| 461 | // This function is mostly cut and pasted from |
| 462 | // GenericScheduler::pickNodeBidirectional() |
| 463 | SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode, |
| 464 | bool &PickedPending) { |
| 465 | // Schedule as far as possible in the direction of no choice. This is most |
| 466 | // efficient, but also provides the best heuristics for CriticalPSets. |
| 467 | if (SUnit *SU = pickOnlyChoice(Zone&: Bot, SchedModel)) { |
| 468 | IsTopNode = false; |
| 469 | return SU; |
| 470 | } |
| 471 | if (SUnit *SU = pickOnlyChoice(Zone&: Top, SchedModel)) { |
| 472 | IsTopNode = true; |
| 473 | return SU; |
| 474 | } |
| 475 | // Set the bottom-up policy based on the state of the current bottom zone |
| 476 | // and the instructions outside the zone, including the top zone. |
| 477 | CandPolicy BotPolicy; |
| 478 | setPolicy(Policy&: BotPolicy, /*IsPostRA=*/false, CurrZone&: Bot, OtherZone: &Top); |
| 479 | // Set the top-down policy based on the state of the current top zone and |
| 480 | // the instructions outside the zone, including the bottom zone. |
| 481 | CandPolicy TopPolicy; |
| 482 | setPolicy(Policy&: TopPolicy, /*IsPostRA=*/false, CurrZone&: Top, OtherZone: &Bot); |
| 483 | |
| 484 | bool BotPending = false; |
| 485 | // See if BotCand is still valid (because we previously scheduled from Top). |
| 486 | LLVM_DEBUG(dbgs() << "Picking from Bot:\n" ); |
| 487 | if (!BotCand.isValid() || BotCand.SU->isScheduled || |
| 488 | BotCand.Policy != BotPolicy) { |
| 489 | BotCand.reset(NewPolicy: CandPolicy()); |
| 490 | pickNodeFromQueue(Zone&: Bot, ZonePolicy: BotPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand, |
| 491 | IsPending&: BotPending, |
| 492 | /*IsBottomUp=*/true); |
| 493 | assert(BotCand.Reason != NoCand && "failed to find the first candidate" ); |
| 494 | } else { |
| 495 | LLVM_DEBUG(traceCandidate(BotCand)); |
| 496 | #ifndef NDEBUG |
| 497 | if (VerifyScheduling) { |
| 498 | SchedCandidate TCand; |
| 499 | TCand.reset(CandPolicy()); |
| 500 | pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand, |
| 501 | BotPending, |
| 502 | /*IsBottomUp=*/true); |
| 503 | assert(TCand.SU == BotCand.SU && |
| 504 | "Last pick result should correspond to re-picking right now" ); |
| 505 | } |
| 506 | #endif |
| 507 | } |
| 508 | |
| 509 | bool TopPending = false; |
| 510 | // Check if the top Q has a better candidate. |
| 511 | LLVM_DEBUG(dbgs() << "Picking from Top:\n" ); |
| 512 | if (!TopCand.isValid() || TopCand.SU->isScheduled || |
| 513 | TopCand.Policy != TopPolicy) { |
| 514 | TopCand.reset(NewPolicy: CandPolicy()); |
| 515 | pickNodeFromQueue(Zone&: Top, ZonePolicy: TopPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand, |
| 516 | IsPending&: TopPending, |
| 517 | /*IsBottomUp=*/false); |
| 518 | assert(TopCand.Reason != NoCand && "failed to find the first candidate" ); |
| 519 | } else { |
| 520 | LLVM_DEBUG(traceCandidate(TopCand)); |
| 521 | #ifndef NDEBUG |
| 522 | if (VerifyScheduling) { |
| 523 | SchedCandidate TCand; |
| 524 | TCand.reset(CandPolicy()); |
| 525 | pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand, |
| 526 | TopPending, |
| 527 | /*IsBottomUp=*/false); |
| 528 | assert(TCand.SU == TopCand.SU && |
| 529 | "Last pick result should correspond to re-picking right now" ); |
| 530 | } |
| 531 | #endif |
| 532 | } |
| 533 | |
| 534 | // Pick best from BotCand and TopCand. |
| 535 | LLVM_DEBUG(dbgs() << "Top Cand: " ; traceCandidate(TopCand); |
| 536 | dbgs() << "Bot Cand: " ; traceCandidate(BotCand);); |
| 537 | SchedCandidate Cand = BotPending ? TopCand : BotCand; |
| 538 | SchedCandidate TryCand = BotPending ? BotCand : TopCand; |
| 539 | PickedPending = BotPending && TopPending; |
| 540 | |
| 541 | TryCand.Reason = NoCand; |
| 542 | if (BotPending || TopPending) { |
| 543 | PickedPending |= tryPendingCandidate(Cand, TryCand, Zone: nullptr); |
| 544 | } else { |
| 545 | tryCandidate(Cand, TryCand, Zone: nullptr); |
| 546 | } |
| 547 | |
| 548 | if (TryCand.Reason != NoCand) { |
| 549 | Cand.setBest(TryCand); |
| 550 | } |
| 551 | |
| 552 | LLVM_DEBUG(dbgs() << "Picking: " ; traceCandidate(Cand);); |
| 553 | |
| 554 | IsTopNode = Cand.AtTop; |
| 555 | return Cand.SU; |
| 556 | } |
| 557 | |
| 558 | // This function is mostly cut and pasted from |
| 559 | // GenericScheduler::pickNode() |
| 560 | SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { |
| 561 | if (DAG->top() == DAG->bottom()) { |
| 562 | assert(Top.Available.empty() && Top.Pending.empty() && |
| 563 | Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage" ); |
| 564 | return nullptr; |
| 565 | } |
| 566 | bool PickedPending; |
| 567 | SUnit *SU; |
| 568 | do { |
| 569 | PickedPending = false; |
| 570 | if (RegionPolicy.OnlyTopDown) { |
| 571 | SU = pickOnlyChoice(Zone&: Top, SchedModel); |
| 572 | if (!SU) { |
| 573 | CandPolicy NoPolicy; |
| 574 | TopCand.reset(NewPolicy: NoPolicy); |
| 575 | pickNodeFromQueue(Zone&: Top, ZonePolicy: NoPolicy, RPTracker: DAG->getTopRPTracker(), Cand&: TopCand, |
| 576 | IsPending&: PickedPending, |
| 577 | /*IsBottomUp=*/false); |
| 578 | assert(TopCand.Reason != NoCand && "failed to find a candidate" ); |
| 579 | SU = TopCand.SU; |
| 580 | } |
| 581 | IsTopNode = true; |
| 582 | } else if (RegionPolicy.OnlyBottomUp) { |
| 583 | SU = pickOnlyChoice(Zone&: Bot, SchedModel); |
| 584 | if (!SU) { |
| 585 | CandPolicy NoPolicy; |
| 586 | BotCand.reset(NewPolicy: NoPolicy); |
| 587 | pickNodeFromQueue(Zone&: Bot, ZonePolicy: NoPolicy, RPTracker: DAG->getBotRPTracker(), Cand&: BotCand, |
| 588 | IsPending&: PickedPending, |
| 589 | /*IsBottomUp=*/true); |
| 590 | assert(BotCand.Reason != NoCand && "failed to find a candidate" ); |
| 591 | SU = BotCand.SU; |
| 592 | } |
| 593 | IsTopNode = false; |
| 594 | } else { |
| 595 | SU = pickNodeBidirectional(IsTopNode, PickedPending); |
| 596 | } |
| 597 | } while (SU->isScheduled); |
| 598 | |
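// If the pick came from the pending queue, advance the zone to the picked
// node's ready cycle, bump past any remaining hazards, and release newly
// ready pending instructions before scheduling it.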
| 599 | if (PickedPending) { |
| 600 | unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle; |
| 601 | SchedBoundary &Zone = IsTopNode ? Top : Bot; |
| 602 | unsigned CurrentCycle = Zone.getCurrCycle(); |
| 603 | if (ReadyCycle > CurrentCycle) |
| 604 | Zone.bumpCycle(NextCycle: ReadyCycle); |
| 605 | |
| 606 | // FIXME: checkHazard() doesn't give information about which cycle the |
| 607 | // hazard will resolve so just keep bumping the cycle by 1. This could be |
| 608 | // made more efficient if checkHazard() returned more details. |
| 609 | while (Zone.checkHazard(SU)) |
| 610 | Zone.bumpCycle(NextCycle: Zone.getCurrCycle() + 1); |
| 611 | |
| 612 | Zone.releasePending(); |
| 613 | } |
| 614 | |
| 615 | if (SU->isTopReady()) |
| 616 | Top.removeReady(SU); |
| 617 | if (SU->isBottomReady()) |
| 618 | Bot.removeReady(SU); |
| 619 | |
| 620 | LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " |
| 621 | << *SU->getInstr()); |
| 622 | return SU; |
| 623 | } |
| 624 | |
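// Feed the scheduled instruction to the AMDGPU RP trackers (when enabled),
// then defer to the generic implementation.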
| 625 | void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { |
| 626 | if (GCNTrackers) { |
| 627 | MachineInstr *MI = SU->getInstr(); |
| 628 | IsTopNode ? (void)DownwardTracker.advance(MI, UseInternalIterator: false) |
| 629 | : UpwardTracker.recede(MI: *MI); |
| 630 | } |
| 631 | |
| 632 | return GenericScheduler::schedNode(SU, IsTopNode); |
| 633 | } |
| 634 | |
| 635 | GCNSchedStageID GCNSchedStrategy::getCurrentStage() { |
| 636 | assert(CurrentStage && CurrentStage != SchedStages.end()); |
| 637 | return *CurrentStage; |
| 638 | } |
| 639 | |
| 640 | bool GCNSchedStrategy::advanceStage() { |
| 641 | assert(CurrentStage != SchedStages.end()); |
| 642 | if (!CurrentStage) |
| 643 | CurrentStage = SchedStages.begin(); |
| 644 | else |
| 645 | CurrentStage++; |
| 646 | |
| 647 | return CurrentStage != SchedStages.end(); |
| 648 | } |
| 649 | |
| 650 | bool GCNSchedStrategy::hasNextStage() const { |
| 651 | assert(CurrentStage); |
| 652 | return std::next(x: CurrentStage) != SchedStages.end(); |
| 653 | } |
| 654 | |
| 655 | GCNSchedStageID GCNSchedStrategy::getNextStage() const { |
| 656 | assert(CurrentStage && std::next(CurrentStage) != SchedStages.end()); |
| 657 | return *std::next(x: CurrentStage); |
| 658 | } |
| 659 | |
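// Compare TryCand, taken from the pending queue, against the current best
// candidate Cand using a reduced set of GenericScheduler heuristics: physreg
// bias and register pressure limits first, then resource balance when both
// candidates come from the same boundary. Latency heuristics are skipped
// since pending instructions are not ready yet anyway.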
| 660 | bool GCNSchedStrategy::tryPendingCandidate(SchedCandidate &Cand, |
| 661 | SchedCandidate &TryCand, |
| 662 | SchedBoundary *Zone) const { |
| 663 | // Initialize the candidate if needed. |
| 664 | if (!Cand.isValid()) { |
| 665 | TryCand.Reason = NodeOrder; |
| 666 | return true; |
| 667 | } |
| 668 | |
| 669 | // Bias PhysReg defs and copies toward their uses and definitions, respectively. |
| 670 | if (tryGreater(TryVal: biasPhysReg(SU: TryCand.SU, isTop: TryCand.AtTop), |
| 671 | CandVal: biasPhysReg(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: PhysReg)) |
| 672 | return TryCand.Reason != NoCand; |
| 673 | |
| 674 | // Avoid exceeding the target's limit. |
| 675 | if (DAG->isTrackingPressure() && |
| 676 | tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand, |
| 677 | Reason: RegExcess, TRI, MF: DAG->MF)) |
| 678 | return TryCand.Reason != NoCand; |
| 679 | |
| 680 | // Avoid increasing the max critical pressure in the scheduled region. |
| 681 | if (DAG->isTrackingPressure() && |
| 682 | tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax, |
| 683 | TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF)) |
| 684 | return TryCand.Reason != NoCand; |
| 685 | |
| 686 | bool SameBoundary = Zone != nullptr; |
| 687 | if (SameBoundary) { |
| 688 | TryCand.initResourceDelta(DAG, SchedModel); |
| 689 | if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources, |
| 690 | TryCand, Cand, Reason: ResourceReduce)) |
| 691 | return TryCand.Reason != NoCand; |
| 692 | if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources, |
| 693 | CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand, |
| 694 | Reason: ResourceDemand)) |
| 695 | return TryCand.Reason != NoCand; |
| 696 | } |
| 697 | |
| 698 | return false; |
| 699 | } |
| 700 | |
| 701 | GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( |
| 702 | const MachineSchedContext *C, bool IsLegacyScheduler) |
| 703 | : GCNSchedStrategy(C) { |
| 704 | SchedStages.push_back(Elt: GCNSchedStageID::OccInitialSchedule); |
| 705 | if (!DisableRewriteMFMAFormSchedStage) |
| 706 | SchedStages.push_back(Elt: GCNSchedStageID::RewriteMFMAForm); |
| 707 | SchedStages.push_back(Elt: GCNSchedStageID::UnclusteredHighRPReschedule); |
| 708 | SchedStages.push_back(Elt: GCNSchedStageID::ClusteredLowOccupancyReschedule); |
| 709 | SchedStages.push_back(Elt: GCNSchedStageID::PreRARematerialize); |
| 710 | GCNTrackers = GCNTrackers & !IsLegacyScheduler; |
| 711 | } |
| 712 | |
| 713 | GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C) |
| 714 | : GCNSchedStrategy(C) { |
| 715 | SchedStages.push_back(Elt: GCNSchedStageID::ILPInitialSchedule); |
| 716 | } |
| 717 | |
| 718 | bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand, |
| 719 | SchedCandidate &TryCand, |
| 720 | SchedBoundary *Zone) const { |
| 721 | // Initialize the candidate if needed. |
| 722 | if (!Cand.isValid()) { |
| 723 | TryCand.Reason = NodeOrder; |
| 724 | return true; |
| 725 | } |
| 726 | |
| 727 | // Avoid spilling by exceeding the register limit. |
| 728 | if (DAG->isTrackingPressure() && |
| 729 | tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand, |
| 730 | Reason: RegExcess, TRI, MF: DAG->MF)) |
| 731 | return TryCand.Reason != NoCand; |
| 732 | |
| 733 | // Bias PhysReg defs and copies toward their uses and definitions, respectively. |
| 734 | if (tryGreater(TryVal: biasPhysReg(SU: TryCand.SU, isTop: TryCand.AtTop), |
| 735 | CandVal: biasPhysReg(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: PhysReg)) |
| 736 | return TryCand.Reason != NoCand; |
| 737 | |
| 738 | bool SameBoundary = Zone != nullptr; |
| 739 | if (SameBoundary) { |
| 740 | // Prioritize instructions that read unbuffered resources by stall cycles. |
| 741 | if (tryLess(TryVal: Zone->getLatencyStallCycles(SU: TryCand.SU), |
| 742 | CandVal: Zone->getLatencyStallCycles(SU: Cand.SU), TryCand, Cand, Reason: Stall)) |
| 743 | return TryCand.Reason != NoCand; |
| 744 | |
| 745 | // Avoid critical resource consumption and balance the schedule. |
| 746 | TryCand.initResourceDelta(DAG, SchedModel); |
| 747 | if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources, |
| 748 | TryCand, Cand, Reason: ResourceReduce)) |
| 749 | return TryCand.Reason != NoCand; |
| 750 | if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources, |
| 751 | CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand, |
| 752 | Reason: ResourceDemand)) |
| 753 | return TryCand.Reason != NoCand; |
| 754 | |
| 755 | // Unconditionally try to reduce latency. |
| 756 | if (tryLatency(TryCand, Cand, Zone&: *Zone)) |
| 757 | return TryCand.Reason != NoCand; |
| 758 | |
| 759 | // Weak edges are for clustering and other constraints. |
| 760 | if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop), |
| 761 | CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak)) |
| 762 | return TryCand.Reason != NoCand; |
| 763 | } |
| 764 | |
| 765 | // Keep clustered nodes together to encourage downstream peephole |
| 766 | // optimizations which may reduce resource requirements. |
| 767 | // |
| 768 | // This is a best effort to set things up for a post-RA pass. Optimizations |
| 769 | // like generating loads of multiple registers should ideally be done within |
| 770 | // the scheduler pass by combining the loads during DAG postprocessing. |
| 771 | unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID; |
| 772 | unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID; |
| 773 | bool CandIsClusterSucc = |
| 774 | isTheSameCluster(A: CandZoneCluster, B: Cand.SU->ParentClusterIdx); |
| 775 | bool TryCandIsClusterSucc = |
| 776 | isTheSameCluster(A: TryCandZoneCluster, B: TryCand.SU->ParentClusterIdx); |
| 777 | if (tryGreater(TryVal: TryCandIsClusterSucc, CandVal: CandIsClusterSucc, TryCand, Cand, |
| 778 | Reason: Cluster)) |
| 779 | return TryCand.Reason != NoCand; |
| 780 | |
| 781 | // Avoid increasing the max critical pressure in the scheduled region. |
| 782 | if (DAG->isTrackingPressure() && |
| 783 | tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax, |
| 784 | TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF)) |
| 785 | return TryCand.Reason != NoCand; |
| 786 | |
| 787 | // Avoid increasing the max pressure of the entire region. |
| 788 | if (DAG->isTrackingPressure() && |
| 789 | tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand, |
| 790 | Cand, Reason: RegMax, TRI, MF: DAG->MF)) |
| 791 | return TryCand.Reason != NoCand; |
| 792 | |
| 793 | if (SameBoundary) { |
| 794 | // Fall through to original instruction order. |
| 795 | if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) || |
| 796 | (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { |
| 797 | TryCand.Reason = NodeOrder; |
| 798 | return true; |
| 799 | } |
| 800 | } |
| 801 | return false; |
| 802 | } |
| 803 | |
| 804 | GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy( |
| 805 | const MachineSchedContext *C) |
| 806 | : GCNSchedStrategy(C) { |
| 807 | SchedStages.push_back(Elt: GCNSchedStageID::MemoryClauseInitialSchedule); |
| 808 | } |
| 809 | |
| 810 | /// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory |
| 811 | /// instructions as much as possible. This is achieved by: |
| 812 | /// 1. Prioritizing clustered operations before the stall latency heuristic. |
| 813 | /// 2. Prioritizing long-latency loads before the stall latency heuristic. |
| 814 | /// |
| 815 | /// \param Cand provides the policy and current best candidate. |
| 816 | /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. |
| 817 | /// \param Zone describes the scheduled zone that we are extending, or nullptr |
| 818 | /// if Cand is from a different zone than TryCand. |
| 819 | /// \return \c true if TryCand is better than Cand (Reason is NOT NoCand) |
| 820 | bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand, |
| 821 | SchedCandidate &TryCand, |
| 822 | SchedBoundary *Zone) const { |
| 823 | // Initialize the candidate if needed. |
| 824 | if (!Cand.isValid()) { |
| 825 | TryCand.Reason = NodeOrder; |
| 826 | return true; |
| 827 | } |
| 828 | |
| 829 | // Bias PhysReg defs and copies toward their uses and definitions, respectively. |
| 830 | if (tryGreater(TryVal: biasPhysReg(SU: TryCand.SU, isTop: TryCand.AtTop), |
| 831 | CandVal: biasPhysReg(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: PhysReg)) |
| 832 | return TryCand.Reason != NoCand; |
| 833 | |
| 834 | if (DAG->isTrackingPressure()) { |
| 835 | // Avoid exceeding the target's limit. |
| 836 | if (tryPressure(TryP: TryCand.RPDelta.Excess, CandP: Cand.RPDelta.Excess, TryCand, Cand, |
| 837 | Reason: RegExcess, TRI, MF: DAG->MF)) |
| 838 | return TryCand.Reason != NoCand; |
| 839 | |
| 840 | // Avoid increasing the max critical pressure in the scheduled region. |
| 841 | if (tryPressure(TryP: TryCand.RPDelta.CriticalMax, CandP: Cand.RPDelta.CriticalMax, |
| 842 | TryCand, Cand, Reason: RegCritical, TRI, MF: DAG->MF)) |
| 843 | return TryCand.Reason != NoCand; |
| 844 | } |
| 845 | |
| 846 | // MaxMemoryClause-specific: We prioritize clustered instructions as we would |
| 847 | // get more benefit from clausing these memory instructions. |
| 848 | unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID; |
| 849 | unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID; |
| 850 | bool CandIsClusterSucc = |
| 851 | isTheSameCluster(A: CandZoneCluster, B: Cand.SU->ParentClusterIdx); |
| 852 | bool TryCandIsClusterSucc = |
| 853 | isTheSameCluster(A: TryCandZoneCluster, B: TryCand.SU->ParentClusterIdx); |
| 854 | if (tryGreater(TryVal: TryCandIsClusterSucc, CandVal: CandIsClusterSucc, TryCand, Cand, |
| 855 | Reason: Cluster)) |
| 856 | return TryCand.Reason != NoCand; |
| 857 | |
| 858 | // We only compare a subset of features when comparing nodes between |
| 859 | // Top and Bottom boundary. Some properties are simply incomparable, in many |
| 860 | // other instances we should only override the other boundary if something |
| 861 | // is a clear good pick on one boundary. Skip heuristics that are more |
| 862 | // "tie-breaking" in nature. |
| 863 | bool SameBoundary = Zone != nullptr; |
| 864 | if (SameBoundary) { |
| 865 | // For loops that are acyclic path limited, aggressively schedule for |
| 866 | // latency. Within a single cycle, whenever CurrMOps > 0, allow normal |
| 867 | // heuristics to take precedence. |
| 868 | if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() && |
| 869 | tryLatency(TryCand, Cand, Zone&: *Zone)) |
| 870 | return TryCand.Reason != NoCand; |
| 871 | |
| 872 | // MaxMemoryClause-specific: Prioritize long latency memory load |
| 873 | // instructions in top-bottom order to hide more latency. The mayLoad check |
| 874 | // is used to exclude store-like instructions, which we do not want to |
| 875 | // schedule too early. |
| 876 | bool TryMayLoad = |
| 877 | TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad(); |
| 878 | bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad(); |
| 879 | |
| 880 | if (TryMayLoad || CandMayLoad) { |
| 881 | bool TryLongLatency = |
| 882 | TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad; |
| 883 | bool CandLongLatency = |
| 884 | 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad; |
| 885 | |
| 886 | if (tryGreater(TryVal: Zone->isTop() ? TryLongLatency : CandLongLatency, |
| 887 | CandVal: Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand, |
| 888 | Cand, Reason: Stall)) |
| 889 | return TryCand.Reason != NoCand; |
| 890 | } |
| 891 | // Prioritize instructions that read unbuffered resources by stall cycles. |
| 892 | if (tryLess(TryVal: Zone->getLatencyStallCycles(SU: TryCand.SU), |
| 893 | CandVal: Zone->getLatencyStallCycles(SU: Cand.SU), TryCand, Cand, Reason: Stall)) |
| 894 | return TryCand.Reason != NoCand; |
| 895 | } |
| 896 | |
| 897 | if (SameBoundary) { |
| 898 | // Weak edges are for clustering and other constraints. |
| 899 | if (tryLess(TryVal: getWeakLeft(SU: TryCand.SU, isTop: TryCand.AtTop), |
| 900 | CandVal: getWeakLeft(SU: Cand.SU, isTop: Cand.AtTop), TryCand, Cand, Reason: Weak)) |
| 901 | return TryCand.Reason != NoCand; |
| 902 | } |
| 903 | |
| 904 | // Avoid increasing the max pressure of the entire region. |
| 905 | if (DAG->isTrackingPressure() && |
| 906 | tryPressure(TryP: TryCand.RPDelta.CurrentMax, CandP: Cand.RPDelta.CurrentMax, TryCand, |
| 907 | Cand, Reason: RegMax, TRI, MF: DAG->MF)) |
| 908 | return TryCand.Reason != NoCand; |
| 909 | |
| 910 | if (SameBoundary) { |
| 911 | // Avoid critical resource consumption and balance the schedule. |
| 912 | TryCand.initResourceDelta(DAG, SchedModel); |
| 913 | if (tryLess(TryVal: TryCand.ResDelta.CritResources, CandVal: Cand.ResDelta.CritResources, |
| 914 | TryCand, Cand, Reason: ResourceReduce)) |
| 915 | return TryCand.Reason != NoCand; |
| 916 | if (tryGreater(TryVal: TryCand.ResDelta.DemandedResources, |
| 917 | CandVal: Cand.ResDelta.DemandedResources, TryCand, Cand, |
| 918 | Reason: ResourceDemand)) |
| 919 | return TryCand.Reason != NoCand; |
| 920 | |
| 921 | // Avoid serializing long latency dependence chains. |
| 922 | // For acyclic path limited loops, latency was already checked above. |
| 923 | if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency && |
| 924 | !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone&: *Zone)) |
| 925 | return TryCand.Reason != NoCand; |
| 926 | |
| 927 | // Fall through to original instruction order. |
| 928 | if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) { |
| 929 | assert(TryCand.SU->NodeNum != Cand.SU->NodeNum); |
| 930 | TryCand.Reason = NodeOrder; |
| 931 | return true; |
| 932 | } |
| 933 | } |
| 934 | |
| 935 | return false; |
| 936 | } |
| 937 | |
| 938 | GCNScheduleDAGMILive::GCNScheduleDAGMILive( |
| 939 | MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S) |
| 940 | : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()), |
| 941 | MFI(*MF.getInfo<SIMachineFunctionInfo>()), |
| 942 | StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy), |
| 943 | RegionLiveOuts(this, /*IsLiveOut=*/true) { |
| 944 | |
| 945 | // We want regions with a single MI to be scheduled so that we can reason |
| 946 | // about them correctly during scheduling stages that move MIs between regions |
| 947 | // (e.g., rematerialization). |
| 948 | ScheduleSingleMIRegions = true; |
| 949 | LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n" ); |
| 950 | if (RelaxedOcc) { |
| 951 | MinOccupancy = std::min(a: MFI.getMinAllowedOccupancy(), b: StartingOccupancy); |
| 952 | if (MinOccupancy != StartingOccupancy) |
| 953 | LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy |
| 954 | << ".\n" ); |
| 955 | } |
| 956 | } |
| 957 | |
| 958 | std::unique_ptr<GCNSchedStage> |
| 959 | GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) { |
| 960 | switch (SchedStageID) { |
| 961 | case GCNSchedStageID::OccInitialSchedule: |
| 962 | return std::make_unique<OccInitialScheduleStage>(args&: SchedStageID, args&: *this); |
| 963 | case GCNSchedStageID::RewriteMFMAForm: |
| 964 | return std::make_unique<RewriteMFMAFormStage>(args&: SchedStageID, args&: *this); |
| 965 | case GCNSchedStageID::UnclusteredHighRPReschedule: |
| 966 | return std::make_unique<UnclusteredHighRPStage>(args&: SchedStageID, args&: *this); |
| 967 | case GCNSchedStageID::ClusteredLowOccupancyReschedule: |
| 968 | return std::make_unique<ClusteredLowOccStage>(args&: SchedStageID, args&: *this); |
| 969 | case GCNSchedStageID::PreRARematerialize: |
| 970 | return std::make_unique<PreRARematStage>(args&: SchedStageID, args&: *this); |
| 971 | case GCNSchedStageID::ILPInitialSchedule: |
| 972 | return std::make_unique<ILPInitialScheduleStage>(args&: SchedStageID, args&: *this); |
| 973 | case GCNSchedStageID::MemoryClauseInitialSchedule: |
| 974 | return std::make_unique<MemoryClauseInitialScheduleStage>(args&: SchedStageID, |
| 975 | args&: *this); |
| 976 | } |
| 977 | |
| 978 | llvm_unreachable("Unknown SchedStageID." ); |
| 979 | } |
| 980 | |
| 981 | void GCNScheduleDAGMILive::schedule() { |
| 982 | // Collect all scheduling regions. The actual scheduling is performed in |
| 983 | // GCNScheduleDAGMILive::finalizeSchedule. |
| 984 | Regions.push_back(Elt: std::pair(RegionBegin, RegionEnd)); |
| 985 | } |
| 986 | |
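// Recompute the maximum register pressure over region RegionIdx from its
// cached live-in set; for an empty region this is just the pressure of the
// live-ins themselves.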
| 987 | GCNRegPressure |
| 988 | GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { |
| 989 | if (Regions[RegionIdx].first == Regions[RegionIdx].second) |
| 990 | return llvm::getRegPressure(MRI, LiveRegs: LiveIns[RegionIdx]); |
| 991 | GCNDownwardRPTracker RPTracker(*LIS); |
| 992 | RPTracker.advance(Begin: Regions[RegionIdx].first, End: Regions[RegionIdx].second, |
| 993 | LiveRegsCopy: &LiveIns[RegionIdx]); |
| 994 | return RPTracker.moveMaxPressure(); |
| 995 | } |
| 996 | |
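// Returns the last non-debug instruction of a non-empty region.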
| 997 | static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin, |
| 998 | MachineBasicBlock::iterator RegionEnd) { |
| 999 | assert(RegionBegin != RegionEnd && "Region must not be empty" ); |
| 1000 | return &*skipDebugInstructionsBackward(It: std::prev(x: RegionEnd), Begin: RegionBegin); |
| 1001 | } |
| 1002 | |
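// Compute the live-in set and cached max pressure for every region of MBB
// starting at RegionIdx in a single downward walk over the block and, when
// the block has a single eligible successor, save the block's live-outs for
// reuse as that successor's live-ins.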
| 1003 | void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, |
| 1004 | const MachineBasicBlock *MBB) { |
| 1005 | GCNDownwardRPTracker RPTracker(*LIS); |
| 1006 | |
| 1007 | // If the block has only one successor then the live-ins of that successor |
| 1008 | // are live-outs of the current block. We can reuse the calculated live set |
| 1009 | // if the successor will be sent to scheduling past the current block. |
| 1010 | |
| 1011 | // However, due to a bug in LiveInterval analysis it may happen that two |
| 1012 | // predecessors of the same successor block have different lane bitmasks for |
| 1013 | // a live-out register. Work around that by sticking to a one-to-one |
| 1014 | // relationship, i.e. one predecessor with one successor block. |
| 1015 | const MachineBasicBlock *OnlySucc = nullptr; |
| 1016 | if (MBB->succ_size() == 1) { |
| 1017 | auto *Candidate = *MBB->succ_begin(); |
| 1018 | if (!Candidate->empty() && Candidate->pred_size() == 1) { |
| 1019 | SlotIndexes *Ind = LIS->getSlotIndexes(); |
| 1020 | if (Ind->getMBBStartIdx(mbb: MBB) < Ind->getMBBStartIdx(mbb: Candidate)) |
| 1021 | OnlySucc = Candidate; |
| 1022 | } |
| 1023 | } |
| 1024 | |
| 1025 | // The scheduler sends regions from the end of the block upwards. |
| 1026 | size_t CurRegion = RegionIdx; |
| 1027 | for (size_t E = Regions.size(); CurRegion != E; ++CurRegion) |
| 1028 | if (Regions[CurRegion].first->getParent() != MBB) |
| 1029 | break; |
| 1030 | --CurRegion; |
| 1031 | |
| 1032 | auto I = MBB->begin(); |
| 1033 | auto LiveInIt = MBBLiveIns.find(Val: MBB); |
| 1034 | auto &Rgn = Regions[CurRegion]; |
| 1035 | auto *NonDbgMI = &*skipDebugInstructionsForward(It: Rgn.first, End: Rgn.second); |
| 1036 | if (LiveInIt != MBBLiveIns.end()) { |
| 1037 | auto LiveIn = std::move(LiveInIt->second); |
| 1038 | RPTracker.reset(MI: *MBB->begin(), LiveRegs: &LiveIn); |
| 1039 | MBBLiveIns.erase(I: LiveInIt); |
| 1040 | } else { |
| 1041 | I = Rgn.first; |
| 1042 | auto LRS = BBLiveInMap.lookup(Val: NonDbgMI); |
| 1043 | #ifdef EXPENSIVE_CHECKS |
| 1044 | assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS)); |
| 1045 | #endif |
| 1046 | RPTracker.reset(MI: *I, LiveRegs: &LRS); |
| 1047 | } |
| 1048 | |
| 1049 | for (;;) { |
| 1050 | I = RPTracker.getNext(); |
| 1051 | |
| 1052 | if (Regions[CurRegion].first == I || NonDbgMI == I) { |
| 1053 | LiveIns[CurRegion] = RPTracker.getLiveRegs(); |
| 1054 | RPTracker.clearMaxPressure(); |
| 1055 | } |
| 1056 | |
| 1057 | if (Regions[CurRegion].second == I) { |
| 1058 | Pressure[CurRegion] = RPTracker.moveMaxPressure(); |
| 1059 | if (CurRegion-- == RegionIdx) |
| 1060 | break; |
| 1061 | auto &Rgn = Regions[CurRegion]; |
| 1062 | NonDbgMI = &*skipDebugInstructionsForward(It: Rgn.first, End: Rgn.second); |
| 1063 | } |
| 1064 | RPTracker.advanceToNext(); |
| 1065 | RPTracker.advanceBeforeNext(); |
| 1066 | } |
| 1067 | |
| 1068 | if (OnlySucc) { |
| 1069 | if (I != MBB->end()) { |
| 1070 | RPTracker.advanceToNext(); |
| 1071 | RPTracker.advance(End: MBB->end()); |
| 1072 | } |
| 1073 | RPTracker.advanceBeforeNext(); |
| 1074 | MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs(); |
| 1075 | } |
| 1076 | } |
| 1077 | |
| 1078 | DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> |
| 1079 | GCNScheduleDAGMILive::getRegionLiveInMap() const { |
| 1080 | assert(!Regions.empty()); |
| 1081 | std::vector<MachineInstr *> RegionFirstMIs; |
| 1082 | RegionFirstMIs.reserve(n: Regions.size()); |
| 1083 | for (auto &[RegionBegin, RegionEnd] : reverse(C: Regions)) |
| 1084 | RegionFirstMIs.push_back( |
| 1085 | x: &*skipDebugInstructionsForward(It: RegionBegin, End: RegionEnd)); |
| 1086 | |
| 1087 | return getLiveRegMap(R&: RegionFirstMIs, /*After=*/false, LIS&: *LIS); |
| 1088 | } |
| 1089 | |
| 1090 | DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> |
| 1091 | GCNScheduleDAGMILive::getRegionLiveOutMap() const { |
| 1092 | assert(!Regions.empty()); |
| 1093 | std::vector<MachineInstr *> RegionLastMIs; |
| 1094 | RegionLastMIs.reserve(n: Regions.size()); |
| 1095 | for (auto &[RegionBegin, RegionEnd] : reverse(C: Regions)) { |
| 1096 | // Skip empty regions. |
| 1097 | if (RegionBegin == RegionEnd) |
| 1098 | continue; |
| 1099 | RegionLastMIs.push_back(x: getLastMIForRegion(RegionBegin, RegionEnd)); |
| 1100 | } |
| 1101 | return getLiveRegMap(R&: RegionLastMIs, /*After=*/true, LIS&: *LIS); |
| 1102 | } |
| 1103 | |
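// Rebuild the mapping from region index to the instruction whose live
// register set (live-ins of the region's first MI, or live-outs of its last
// MI, depending on IsLiveOut) represents that region. Empty regions are
// skipped.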
| 1104 | void RegionPressureMap::buildLiveRegMap() { |
| 1105 | IdxToInstruction.clear(); |
| 1106 | |
| 1107 | RegionLiveRegMap = |
| 1108 | IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap(); |
| 1109 | for (unsigned I = 0; I < DAG->Regions.size(); I++) { |
| 1110 | auto &[RegionBegin, RegionEnd] = DAG->Regions[I]; |
| 1111 | // Skip empty regions. |
| 1112 | if (RegionBegin == RegionEnd) |
| 1113 | continue; |
| 1114 | MachineInstr *RegionKey = |
| 1115 | IsLiveOut ? getLastMIForRegion(RegionBegin, RegionEnd) : &*RegionBegin; |
| 1116 | IdxToInstruction[I] = RegionKey; |
| 1117 | } |
| 1118 | } |
| 1119 | |
| 1120 | void GCNScheduleDAGMILive::finalizeSchedule() { |
| 1121 | // Start actual scheduling here. This function is called by the base |
| 1122 | // MachineScheduler after all regions have been recorded by |
| 1123 | // GCNScheduleDAGMILive::schedule(). |
| 1124 | LiveIns.resize(N: Regions.size()); |
| 1125 | Pressure.resize(N: Regions.size()); |
| 1126 | RegionsWithHighRP.resize(N: Regions.size()); |
| 1127 | RegionsWithExcessRP.resize(N: Regions.size()); |
| 1128 | RegionsWithIGLPInstrs.resize(N: Regions.size()); |
| 1129 | RegionsWithHighRP.reset(); |
| 1130 | RegionsWithExcessRP.reset(); |
| 1131 | RegionsWithIGLPInstrs.reset(); |
| 1132 | |
| 1133 | runSchedStages(); |
| 1134 | } |
| 1135 | |
| 1136 | void GCNScheduleDAGMILive::runSchedStages() { |
| 1137 | LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n" ); |
| 1138 | |
| 1139 | if (!Regions.empty()) { |
| 1140 | BBLiveInMap = getRegionLiveInMap(); |
| 1141 | if (GCNTrackers) |
| 1142 | RegionLiveOuts.buildLiveRegMap(); |
| 1143 | } |
| 1144 | |
| 1145 | #ifdef DUMP_MAX_REG_PRESSURE |
| 1146 | if (PrintMaxRPRegUsageBeforeScheduler) { |
| 1147 | dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI); |
| 1148 | dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI); |
| 1149 | LIS->dump(); |
| 1150 | } |
| 1151 | #endif |
| 1152 | |
| 1153 | GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl); |
| 1154 | while (S.advanceStage()) { |
| 1155 | auto Stage = createSchedStage(SchedStageID: S.getCurrentStage()); |
| 1156 | if (!Stage->initGCNSchedStage()) |
| 1157 | continue; |
| 1158 | |
| 1159 | for (auto Region : Regions) { |
| 1160 | RegionBegin = Region.first; |
| 1161 | RegionEnd = Region.second; |
| 1162 | // Set up for scheduling the region and check whether it should be skipped. |
| 1163 | if (!Stage->initGCNRegion()) { |
| 1164 | Stage->advanceRegion(); |
| 1165 | exitRegion(); |
| 1166 | continue; |
| 1167 | } |
| 1168 | |
| 1169 | if (GCNTrackers) { |
| 1170 | GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker(); |
| 1171 | GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker(); |
| 1172 | GCNRPTracker::LiveRegSet *RegionLiveIns = |
| 1173 | &LiveIns[Stage->getRegionIdx()]; |
| 1174 | |
| 1175 | reinterpret_cast<GCNRPTracker *>(DownwardTracker) |
| 1176 | ->reset(MRI_: MRI, LiveRegs_: *RegionLiveIns); |
| 1177 | reinterpret_cast<GCNRPTracker *>(UpwardTracker) |
| 1178 | ->reset(MRI_: MRI, LiveRegs_: RegionLiveOuts.getLiveRegsForRegionIdx( |
| 1179 | RegionIdx: Stage->getRegionIdx())); |
| 1180 | } |
| 1181 | |
| 1182 | ScheduleDAGMILive::schedule(); |
| 1183 | Stage->finalizeGCNRegion(); |
| 1184 | Stage->advanceRegion(); |
| 1185 | exitRegion(); |
| 1186 | } |
| 1187 | |
| 1188 | Stage->finalizeGCNSchedStage(); |
| 1189 | } |
| 1190 | |
| 1191 | #ifdef DUMP_MAX_REG_PRESSURE |
| 1192 | if (PrintMaxRPRegUsageAfterScheduler) { |
| 1193 | dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI); |
| 1194 | dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI); |
| 1195 | LIS->dump(); |
| 1196 | } |
| 1197 | #endif |
| 1198 | } |
| 1199 | |
| 1200 | #ifndef NDEBUG |
| 1201 | raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) { |
| 1202 | switch (StageID) { |
| 1203 | case GCNSchedStageID::OccInitialSchedule: |
| 1204 | OS << "Max Occupancy Initial Schedule" ; |
| 1205 | break; |
| 1206 | case GCNSchedStageID::RewriteMFMAForm: |
| 1207 | OS << "Instruction Rewriting Reschedule" ; |
| 1208 | break; |
| 1209 | case GCNSchedStageID::UnclusteredHighRPReschedule: |
| 1210 | OS << "Unclustered High Register Pressure Reschedule" ; |
| 1211 | break; |
| 1212 | case GCNSchedStageID::ClusteredLowOccupancyReschedule: |
| 1213 | OS << "Clustered Low Occupancy Reschedule" ; |
| 1214 | break; |
| 1215 | case GCNSchedStageID::PreRARematerialize: |
| 1216 | OS << "Pre-RA Rematerialize" ; |
| 1217 | break; |
| 1218 | case GCNSchedStageID::ILPInitialSchedule: |
| 1219 | OS << "Max ILP Initial Schedule" ; |
| 1220 | break; |
| 1221 | case GCNSchedStageID::MemoryClauseInitialSchedule: |
| 1222 | OS << "Max memory clause Initial Schedule" ; |
| 1223 | break; |
| 1224 | } |
| 1225 | |
| 1226 | return OS; |
| 1227 | } |
| 1228 | #endif |
| 1229 | |
| 1230 | GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) |
| 1231 | : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF), |
| 1232 | MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {} |
| 1233 | |
| 1234 | bool GCNSchedStage::initGCNSchedStage() { |
| 1235 | if (!DAG.LIS) |
| 1236 | return false; |
| 1237 | |
| 1238 | LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n" ); |
| 1239 | return true; |
| 1240 | } |
| 1241 | |
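|      | // Collects the slot indexes of all definitions of UseMO's register that can
|      | // reach UseMO. A non-PHI value number is a reaching def directly; a
|      | // PHI-defined value is resolved by walking CFG predecessors until concrete
|      | // (non-PHI) defs are found on every incoming path.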
| 1242 | void RewriteMFMAFormStage::findReachingDefs( |
| 1243 | MachineOperand &UseMO, LiveIntervals *LIS, |
| 1244 | SmallVectorImpl<SlotIndex> &DefIdxs) { |
| 1245 | MachineInstr *UseMI = UseMO.getParent(); |
| 1246 | LiveInterval &UseLI = LIS->getInterval(Reg: UseMO.getReg()); |
| 1247 | VNInfo *VNI = UseLI.getVNInfoAt(Idx: LIS->getInstructionIndex(Instr: *UseMI)); |
| 1248 | |
| 1249 | // If the def is not a PHI, then it must be the only reaching def. |
| 1250 | if (!VNI->isPHIDef()) { |
| 1251 | DefIdxs.push_back(Elt: VNI->def); |
| 1252 | return; |
| 1253 | } |
| 1254 | |
| 1255 | SmallPtrSet<MachineBasicBlock *, 8> Visited = {UseMI->getParent()}; |
| 1256 | SmallVector<MachineBasicBlock *, 8> Worklist; |
| 1257 | |
| 1258 | // Mark the predecessor blocks for traversal |
| 1259 | for (MachineBasicBlock *PredMBB : UseMI->getParent()->predecessors()) { |
| 1260 | Worklist.push_back(Elt: PredMBB); |
| 1261 | Visited.insert(Ptr: PredMBB); |
| 1262 | } |
| 1263 | |
| 1264 | while (!Worklist.empty()) { |
| 1265 | MachineBasicBlock *CurrMBB = Worklist.pop_back_val(); |
| 1266 | |
| 1267 | SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(mbb: CurrMBB); |
| 1268 | VNInfo *VNI = UseLI.getVNInfoAt(Idx: CurrMBBEnd.getPrevSlot()); |
| 1269 | |
| 1270 | MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(index: VNI->def); |
| 1271 | |
| 1272 | // If there is a def in this block, then add it to the list. This is the |
| 1273 | // reaching def of this path. |
| 1274 | if (!VNI->isPHIDef()) { |
| 1275 | DefIdxs.push_back(Elt: VNI->def); |
| 1276 | continue; |
| 1277 | } |
| 1278 | |
| 1279 | for (MachineBasicBlock *PredMBB : DefMBB->predecessors()) { |
| 1280 | if (Visited.insert(Ptr: PredMBB).second) |
| 1281 | Worklist.push_back(Elt: PredMBB); |
| 1282 | } |
| 1283 | } |
| 1284 | } |
| 1285 | |
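|      | // Collects all non-debug uses of DefMI's result register for which DefMI is
|      | // one of the reaching definitions, as computed by findReachingDefs.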
| 1286 | void RewriteMFMAFormStage::findReachingUses( |
| 1287 | MachineInstr *DefMI, LiveIntervals *LIS, |
| 1288 | SmallVectorImpl<MachineOperand *> &ReachingUses) { |
| 1289 | SlotIndex DefIdx = LIS->getInstructionIndex(Instr: *DefMI); |
| 1290 | for (MachineOperand &UseMO : |
| 1291 | DAG.MRI.use_nodbg_operands(Reg: DefMI->getOperand(i: 0).getReg())) { |
| 1292 | SmallVector<SlotIndex, 8> ReachingDefIndexes; |
| 1293 | findReachingDefs(UseMO, LIS, DefIdxs&: ReachingDefIndexes); |
| 1294 | |
| 1295 | // If we find a use that has this DefMI among its reaching defs, then it is
| 1296 | // a reaching use. |
| 1297 | if (any_of(Range&: ReachingDefIndexes, P: [DefIdx](SlotIndex RDIdx) { |
| 1298 | return SlotIndex::isSameInstr(A: RDIdx, B: DefIdx); |
| 1299 | })) |
| 1300 | ReachingUses.push_back(Elt: &UseMO); |
| 1301 | } |
| 1302 | } |
| 1303 | |
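|      | // Stage driver: bail out early when AGPRs cannot help, estimate the cost of
|      | // switching eligible MFMAs to their AGPR-destination form (including the
|      | // cross-class copies that would be required), and only perform the rewrite
|      | // when the estimated cost is not positive.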
| 1304 | bool RewriteMFMAFormStage::initGCNSchedStage() { |
| 1305 | // We only need to run this pass if the architecture supports AGPRs. |
| 1306 | // Additionally, we don't use AGPRs at occupancy levels above 1, so there
| 1307 | // is no need for this pass in that case either.
| 1308 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
| 1309 | if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1) |
| 1310 | return false; |
| 1311 | |
| 1312 | RegionsWithExcessArchVGPR.resize(N: DAG.Regions.size()); |
| 1313 | RegionsWithExcessArchVGPR.reset(); |
| 1314 | for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) { |
| 1315 | GCNRegPressure PressureBefore = DAG.Pressure[Region]; |
| 1316 | if (PressureBefore.getArchVGPRNum() > ST.getAddressableNumArchVGPRs()) |
| 1317 | RegionsWithExcessArchVGPR[Region] = true; |
| 1318 | } |
| 1319 | |
| 1320 | if (RegionsWithExcessArchVGPR.none()) |
| 1321 | return false; |
| 1322 | |
| 1323 | TII = ST.getInstrInfo(); |
| 1324 | SRI = ST.getRegisterInfo(); |
| 1325 | |
| 1326 | std::vector<std::pair<MachineInstr *, unsigned>> RewriteCands; |
| 1327 | DenseMap<MachineBasicBlock *, std::set<Register>> CopyForUse; |
| 1328 | SmallPtrSet<MachineInstr *, 8> CopyForDef; |
| 1329 | |
| 1330 | if (!initHeuristics(RewriteCands, CopyForUse, CopyForDef)) |
| 1331 | return false; |
| 1332 | |
| 1333 | int64_t Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef); |
| 1334 | |
| 1335 | // If we haven't found the beneficial conditions, prefer the VGPR form, which
| 1336 | // may result in fewer cross-RC copies.
| 1337 | if (Cost > 0) |
| 1338 | return false; |
| 1339 | |
| 1340 | return rewrite(RewriteCands); |
| 1341 | } |
| 1342 | |
| 1343 | bool UnclusteredHighRPStage::initGCNSchedStage() { |
| 1344 | if (DisableUnclusterHighRP) |
| 1345 | return false; |
| 1346 | |
| 1347 | if (!GCNSchedStage::initGCNSchedStage()) |
| 1348 | return false; |
| 1349 | |
| 1350 | if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none()) |
| 1351 | return false; |
| 1352 | |
| 1353 | SavedMutations.swap(x&: DAG.Mutations); |
| 1354 | DAG.addMutation( |
| 1355 | Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PreRAReentry)); |
| 1356 | |
| 1357 | InitialOccupancy = DAG.MinOccupancy; |
| 1358 | // Aggressively try to reduce register pressure in the unclustered high RP |
| 1359 | // stage. Temporarily increase occupancy target in the region. |
| 1360 | TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy |
| 1361 | ? InitialOccupancy + 1 |
| 1362 | : InitialOccupancy; |
| 1363 | IsAnyRegionScheduled = false; |
| 1364 | S.SGPRLimitBias = S.HighRPSGPRBias; |
| 1365 | S.VGPRLimitBias = S.HighRPVGPRBias; |
| 1366 | |
| 1367 | LLVM_DEBUG( |
| 1368 | dbgs() |
| 1369 | << "Retrying function scheduling without clustering. " |
| 1370 | "Aggressively try to reduce register pressure to achieve occupancy " |
| 1371 | << TempTargetOccupancy << ".\n" ); |
| 1372 | |
| 1373 | return true; |
| 1374 | } |
| 1375 | |
| 1376 | bool ClusteredLowOccStage::initGCNSchedStage() { |
| 1377 | if (DisableClusteredLowOccupancy) |
| 1378 | return false; |
| 1379 | |
| 1380 | if (!GCNSchedStage::initGCNSchedStage()) |
| 1381 | return false; |
| 1382 | |
| 1383 | // Don't bother trying to improve ILP in lower RP regions if occupancy has not |
| 1384 | // been dropped. All regions will have already been scheduled with the ideal |
| 1385 | // occupancy targets. |
| 1386 | if (DAG.StartingOccupancy <= DAG.MinOccupancy) |
| 1387 | return false; |
| 1388 | |
| 1389 | LLVM_DEBUG( |
| 1390 | dbgs() << "Retrying function scheduling with lowest recorded occupancy " |
| 1391 | << DAG.MinOccupancy << ".\n" ); |
| 1392 | return true; |
| 1393 | } |
| 1394 | |
| 1395 | /// Allows easy filtering of this stage's debug output.
| 1396 | #define REMAT_PREFIX "[PreRARemat] " |
| 1397 | #define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;) |
| 1398 | |
| 1399 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 1400 | Printable PreRARematStage::ScoredRemat::print() const { |
| 1401 | return Printable([&](raw_ostream &OS) { |
| 1402 | OS << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')'; |
| 1403 | }); |
| 1404 | } |
| 1405 | #endif |
| 1406 | |
| 1407 | bool PreRARematStage::initGCNSchedStage() { |
| 1408 | // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
| 1409 | // regions in between the defs and the region we sank the def to. This will
| 1410 | // need to be fixed if there is another pass after this pass.
| 1411 | assert(!S.hasNextStage()); |
| 1412 | |
| 1413 | if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() <= 1) |
| 1414 | return false; |
| 1415 | |
| 1416 | // Maps all MIs (except lone terminators, which are not part of any region) to |
| 1417 | // their parent region. Non-lone terminators are considered part of the region |
| 1418 | // they delimit.
| 1419 | DenseMap<MachineInstr *, unsigned> MIRegion(MF.getInstructionCount()); |
| 1420 | |
| 1421 | // Before performing any IR modification record the parent region of each MI |
| 1422 | // and the parent MBB of each region. |
| 1423 | const unsigned NumRegions = DAG.Regions.size(); |
| 1424 | for (unsigned I = 0; I < NumRegions; ++I) { |
| 1425 | RegionBoundaries Region = DAG.Regions[I]; |
| 1426 | for (auto MI = Region.first; MI != Region.second; ++MI) |
| 1427 | MIRegion.insert(KV: {&*MI, I}); |
| 1428 | MachineBasicBlock *ParentMBB = Region.first->getParent(); |
| 1429 | if (Region.second != ParentMBB->end()) |
| 1430 | MIRegion.insert(KV: {&*Region.second, I}); |
| 1431 | RegionBB.push_back(Elt: ParentMBB); |
| 1432 | } |
| 1433 | |
| 1434 | #ifndef NDEBUG |
| 1435 | auto PrintTargetRegions = [&]() -> void { |
| 1436 | if (TargetRegions.none()) { |
| 1437 | dbgs() << REMAT_PREFIX << "No target regions\n" ; |
| 1438 | return; |
| 1439 | } |
| 1440 | dbgs() << REMAT_PREFIX << "Target regions:\n" ; |
| 1441 | for (unsigned I : TargetRegions.set_bits()) |
| 1442 | dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n'; |
| 1443 | }; |
| 1444 | auto PrintRematReg = [&](const RematReg &Remat) -> Printable { |
| 1445 | return Printable([&, Remat](raw_ostream &OS) { |
| 1446 | // Concatenate all region numbers in which the register is unused and |
| 1447 | // live-through. |
| 1448 | bool HasLiveThroughRegion = false; |
| 1449 | OS << '[' << Remat.DefRegion << " -" ; |
| 1450 | for (unsigned I = 0; I < NumRegions; ++I) { |
| 1451 | if (Remat.isUnusedLiveThrough(I)) { |
| 1452 | if (HasLiveThroughRegion) { |
| 1453 | OS << ','; |
| 1454 | } else { |
| 1455 | OS << "- " ; |
| 1456 | HasLiveThroughRegion = true; |
| 1457 | } |
| 1458 | OS << I; |
| 1459 | } |
| 1460 | } |
| 1461 | if (HasLiveThroughRegion) |
| 1462 | OS << " -" ; |
| 1463 | OS << "-> " << Remat.UseRegion << "] " ; |
| 1464 | Remat.DefMI->print(OS, /*IsStandalone=*/true, /*SkipOpers=*/false, |
| 1465 | /*SkipDebugLoc=*/false, /*AddNewLine=*/false); |
| 1466 | }); |
| 1467 | }; |
| 1468 | #endif |
| 1469 | |
| 1470 | // Set an objective for the stage based on current RP in each region. |
| 1471 | REMAT_DEBUG({ |
| 1472 | dbgs() << "Analyzing " ; |
| 1473 | MF.getFunction().printAsOperand(dbgs(), false); |
| 1474 | dbgs() << ": " ; |
| 1475 | }); |
| 1476 | if (!setObjective()) { |
| 1477 | LLVM_DEBUG(dbgs() << "no objective to achieve, occupancy is maximal at " |
| 1478 | << MFI.getMaxWavesPerEU() << '\n'); |
| 1479 | return false; |
| 1480 | } |
| 1481 | LLVM_DEBUG({ |
| 1482 | if (TargetOcc) { |
| 1483 | dbgs() << "increase occupancy from " << *TargetOcc - 1 << '\n'; |
| 1484 | } else { |
| 1485 | dbgs() << "reduce spilling (minimum target occupancy is " |
| 1486 | << MFI.getMinWavesPerEU() << ")\n" ; |
| 1487 | } |
| 1488 | PrintTargetRegions(); |
| 1489 | }); |
| 1490 | |
| 1491 | if (!collectRematRegs(MIRegion)) { |
| 1492 | REMAT_DEBUG(dbgs() << "No rematerializable registers\n" ); |
| 1493 | return false; |
| 1494 | } |
| 1495 | const ScoredRemat::FreqInfo FreqInfo(MF, DAG); |
| 1496 | REMAT_DEBUG({ |
| 1497 | dbgs() << "Rematerializable registers:\n" ; |
| 1498 | for (const RematReg &Remat : RematRegs) |
| 1499 | dbgs() << REMAT_PREFIX << " " << PrintRematReg(Remat) << '\n'; |
| 1500 | dbgs() << REMAT_PREFIX << "Region frequencies\n" ; |
| 1501 | for (auto [I, Freq] : enumerate(FreqInfo.Regions)) { |
| 1502 | dbgs() << REMAT_PREFIX << " [" << I << "] " ; |
| 1503 | if (Freq) |
| 1504 | dbgs() << Freq; |
| 1505 | else |
| 1506 | dbgs() << "unknown " ; |
| 1507 | dbgs() << " | " << *DAG.Regions[I].first; |
| 1508 | } |
| 1509 | }); |
| 1510 | |
| 1511 | SmallVector<ScoredRemat> ScoredRemats; |
| 1512 | for (RematReg &Remat : RematRegs) |
| 1513 | ScoredRemats.emplace_back(Args: &Remat, Args: FreqInfo, Args&: DAG); |
| 1514 | |
| 1515 | // Rematerialize registers in successive rounds until all RP targets are |
| 1516 | // satisfied or until we run out of rematerialization candidates.
| 1517 | #ifndef NDEBUG |
| 1518 | unsigned RoundNum = 0; |
| 1519 | #endif |
| 1520 | BitVector RecomputeRP(NumRegions); |
| 1521 | do { |
| 1522 | assert(!ScoredRemats.empty() && "no more remat candidates" ); |
| 1523 | |
| 1524 | // (Re-)Score and (re-)sort all remats in increasing score order. |
| 1525 | for (ScoredRemat &Remat : ScoredRemats) |
| 1526 | Remat.update(TargetRegions, RPTargets, Freq: FreqInfo, ReduceSpill: !TargetOcc); |
| 1527 | sort(C&: ScoredRemats); |
| 1528 | |
| 1529 | REMAT_DEBUG({ |
| 1530 | dbgs() << "==== ROUND " << RoundNum++ << " ====\n" |
| 1531 | << REMAT_PREFIX |
| 1532 | << "Candidates with non-null score, in rematerialization order:\n" ; |
| 1533 | for (const ScoredRemat &RematDecision : reverse(ScoredRemats)) { |
| 1534 | if (RematDecision.hasNullScore()) |
| 1535 | break; |
| 1536 | dbgs() << REMAT_PREFIX << " " << RematDecision.print() << " | " |
| 1537 | << *RematDecision.Remat->DefMI; |
| 1538 | } |
| 1539 | PrintTargetRegions(); |
| 1540 | }); |
| 1541 | |
| 1542 | RecomputeRP.reset(); |
| 1543 | unsigned RematIdx = ScoredRemats.size(); |
| 1544 | |
| 1545 | // Rematerialize registers in decreasing score order until we estimate |
| 1546 | // that all RP targets are satisfied or until rematerialization candidates |
| 1547 | // are no longer useful to decrease RP. |
| 1548 | for (; RematIdx && TargetRegions.any(); --RematIdx) { |
| 1549 | const ScoredRemat &Candidate = ScoredRemats[RematIdx - 1]; |
| 1550 | // Stop rematerializing on encountering a null score. Since scores |
| 1551 | // monotonically decrease as we rematerialize, we know there is nothing |
| 1552 | // useful left to do in such cases, even if we were to re-score. |
| 1553 | if (Candidate.hasNullScore()) { |
| 1554 | RematIdx = 0; |
| 1555 | break; |
| 1556 | } |
| 1557 | |
| 1558 | RematReg &Remat = *Candidate.Remat; |
| 1559 | // When previous rematerializations in this round have already satisfied |
| 1560 | // RP targets in all regions this rematerialization can impact, we have a |
| 1561 | // good indication that our scores have diverged significantly from |
| 1562 | // reality, in which case we interrupt this round and re-score. This also |
| 1563 | // ensures that every rematerialization we perform is possibly impactful |
| 1564 | // in at least one target region. |
| 1565 | if (!Remat.maybeBeneficial(TargetRegions, RPTargets)) |
| 1566 | break; |
| 1567 | |
| 1568 | REMAT_DEBUG(dbgs() << "** REMAT " << PrintRematReg(Remat) << '\n';); |
| 1569 | // Every rematerialization we do here is likely to move the instruction |
| 1570 | // into a higher frequency region, increasing the total sum latency of the |
| 1571 | // instruction itself. This is acceptable if we are eliminating a spill in |
| 1572 | // the process, but when the goal is increasing occupancy we get nothing |
| 1573 | // out of rematerialization if occupancy is not increased in the end; in |
| 1574 | // such cases we want to roll back the rematerialization. |
| 1575 | RollbackInfo *Rollback = |
| 1576 | TargetOcc ? &Rollbacks.emplace_back(Args: &Remat) : nullptr; |
| 1577 | rematerialize(Remat, RecomputeRP, Rollback); |
| 1578 | unsetSatisifedRPTargets(Regions: Remat.Live); |
| 1579 | } |
| 1580 | |
| 1581 | REMAT_DEBUG({ |
| 1582 | if (!TargetRegions.any()) { |
| 1583 | dbgs() << "** Interrupt round on all targets achieved\n" ; |
| 1584 | } else if (RematIdx) { |
| 1585 | dbgs() << "** Interrupt round on stale score for " |
| 1586 | << *ScoredRemats[RematIdx - 1].Remat->DefMI; |
| 1587 | } else { |
| 1588 | dbgs() << "** Stop on exhausted rematerialization candidates\n" ; |
| 1589 | } |
| 1590 | }); |
| 1591 | |
| 1592 | // Peel off registers we already rematerialized from the vector's tail. |
| 1593 | ScoredRemats.truncate(N: RematIdx); |
| 1594 | } while ((updateAndVerifyRPTargets(Regions: RecomputeRP) || TargetRegions.any()) && |
| 1595 | !ScoredRemats.empty()); |
| 1596 | if (RescheduleRegions.none()) |
| 1597 | return false; |
| 1598 | |
| 1599 | // Commit all pressure changes to the DAG and compute minimum achieved |
| 1600 | // occupancy in impacted regions. |
| 1601 | REMAT_DEBUG(dbgs() << "==== REMAT RESULTS ====\n" ); |
| 1602 | unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize(); |
| 1603 | for (unsigned I : RescheduleRegions.set_bits()) { |
| 1604 | DAG.Pressure[I] = RPTargets[I].getCurrentRP(); |
| 1605 | REMAT_DEBUG(dbgs() << '[' << I << "] Achieved occupancy " |
| 1606 | << DAG.Pressure[I].getOccupancy(ST, DynamicVGPRBlockSize) |
| 1607 | << " (" << RPTargets[I] << ")\n" ); |
| 1608 | } |
| 1609 | AchievedOcc = MFI.getMaxWavesPerEU(); |
| 1610 | for (const GCNRegPressure &RP : DAG.Pressure) { |
| 1611 | AchievedOcc = |
| 1612 | std::min(a: AchievedOcc, b: RP.getOccupancy(ST, DynamicVGPRBlockSize)); |
| 1613 | } |
| 1614 | |
| 1615 | REMAT_DEBUG({ |
| 1616 | dbgs() << "Retrying function scheduling with new min. occupancy of " |
| 1617 | << AchievedOcc << " from rematerializing (original was " |
| 1618 | << DAG.MinOccupancy; |
| 1619 | if (TargetOcc) |
| 1620 | dbgs() << ", target was " << *TargetOcc; |
| 1621 | dbgs() << ")\n" ; |
| 1622 | }); |
| 1623 | |
| 1624 | DAG.setTargetOccupancy(getStageTargetOccupancy()); |
| 1625 | return true; |
| 1626 | } |
| 1627 | |
| 1628 | void GCNSchedStage::finalizeGCNSchedStage() { |
| 1629 | DAG.finishBlock(); |
| 1630 | LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n" ); |
| 1631 | } |
| 1632 | |
| 1633 | void UnclusteredHighRPStage::finalizeGCNSchedStage() { |
| 1634 | SavedMutations.swap(x&: DAG.Mutations); |
| 1635 | S.SGPRLimitBias = S.VGPRLimitBias = 0; |
| 1636 | if (DAG.MinOccupancy > InitialOccupancy) { |
| 1637 | assert(IsAnyRegionScheduled); |
| 1638 | LLVM_DEBUG(dbgs() << StageID |
| 1639 | << " stage successfully increased occupancy to " |
| 1640 | << DAG.MinOccupancy << '\n'); |
| 1641 | } else if (!IsAnyRegionScheduled) { |
| 1642 | assert(DAG.MinOccupancy == InitialOccupancy); |
| 1643 | LLVM_DEBUG(dbgs() << StageID |
| 1644 | << ": No regions scheduled, min occupancy stays at " |
| 1645 | << DAG.MinOccupancy << ", MFI occupancy stays at " |
| 1646 | << MFI.getOccupancy() << ".\n" ); |
| 1647 | } |
| 1648 | |
| 1649 | GCNSchedStage::finalizeGCNSchedStage(); |
| 1650 | } |
| 1651 | |
| 1652 | bool GCNSchedStage::initGCNRegion() { |
| 1653 | // Skip empty scheduling region. |
| 1654 | if (DAG.begin() == DAG.end()) |
| 1655 | return false; |
| 1656 | |
| 1657 | // Check whether this new region is also a new block. |
| 1658 | if (DAG.RegionBegin->getParent() != CurrentMBB) |
| 1659 | setupNewBlock(); |
| 1660 | |
| 1661 | unsigned NumRegionInstrs = std::distance(first: DAG.begin(), last: DAG.end()); |
| 1662 | DAG.enterRegion(bb: CurrentMBB, begin: DAG.begin(), end: DAG.end(), regioninstrs: NumRegionInstrs); |
| 1663 | |
| 1664 | // Skip regions with 1 schedulable instruction. |
| 1665 | if (DAG.begin() == std::prev(x: DAG.end())) |
| 1666 | return false; |
| 1667 | |
| 1668 | LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n" ); |
| 1669 | LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB) |
| 1670 | << " " << CurrentMBB->getName() |
| 1671 | << "\n From: " << *DAG.begin() << " To: " ; |
| 1672 | if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd; |
| 1673 | else dbgs() << "End" ; |
| 1674 | dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); |
| 1675 | |
| 1676 | // Save original instruction order before scheduling for possible revert. |
| 1677 | Unsched.clear(); |
| 1678 | Unsched.reserve(n: DAG.NumRegionInstrs); |
| 1679 | if (StageID == GCNSchedStageID::OccInitialSchedule || |
| 1680 | StageID == GCNSchedStageID::ILPInitialSchedule) { |
| 1681 | const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII); |
| 1682 | for (auto &I : DAG) { |
| 1683 | Unsched.push_back(x: &I); |
| 1684 | if (SII->isIGLPMutationOnly(Opcode: I.getOpcode())) |
| 1685 | DAG.RegionsWithIGLPInstrs[RegionIdx] = true; |
| 1686 | } |
| 1687 | } else { |
| 1688 | for (auto &I : DAG) |
| 1689 | Unsched.push_back(x: &I); |
| 1690 | } |
| 1691 | |
| 1692 | PressureBefore = DAG.Pressure[RegionIdx]; |
| 1693 | |
| 1694 | LLVM_DEBUG( |
| 1695 | dbgs() << "Pressure before scheduling:\nRegion live-ins:" |
| 1696 | << print(DAG.LiveIns[RegionIdx], DAG.MRI) |
| 1697 | << "Region live-in pressure: " |
| 1698 | << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx])) |
| 1699 | << "Region register pressure: " << print(PressureBefore)); |
| 1700 | |
| 1701 | S.HasHighPressure = false; |
| 1702 | S.KnownExcessRP = isRegionWithExcessRP(); |
| 1703 | |
| 1704 | if (DAG.RegionsWithIGLPInstrs[RegionIdx] && |
| 1705 | StageID != GCNSchedStageID::UnclusteredHighRPReschedule) { |
| 1706 | SavedMutations.clear(); |
| 1707 | SavedMutations.swap(x&: DAG.Mutations); |
| 1708 | bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule || |
| 1709 | StageID == GCNSchedStageID::ILPInitialSchedule; |
| 1710 | DAG.addMutation(Mutation: createIGroupLPDAGMutation( |
| 1711 | Phase: IsInitialStage ? AMDGPU::SchedulingPhase::Initial |
| 1712 | : AMDGPU::SchedulingPhase::PreRAReentry)); |
| 1713 | } |
| 1714 | |
| 1715 | return true; |
| 1716 | } |
| 1717 | |
| 1718 | bool UnclusteredHighRPStage::initGCNRegion() { |
| 1719 | // Only reschedule regions that have excess register pressure (i.e. spilling) |
| 1720 | // or had minimum occupancy at the beginning of the stage (as long as |
| 1721 | // rescheduling of previous regions did not make occupancy drop back down to |
| 1722 | // the initial minimum). |
| 1723 | unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize(); |
| 1724 | // If no region has been scheduled yet, the DAG has not yet been updated with |
| 1725 | // the occupancy target. So retrieve it from the temporary. |
| 1726 | unsigned CurrentTargetOccupancy = |
| 1727 | IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy; |
| 1728 | if (!DAG.RegionsWithExcessRP[RegionIdx] && |
| 1729 | (CurrentTargetOccupancy <= InitialOccupancy || |
| 1730 | DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) != |
| 1731 | InitialOccupancy)) |
| 1732 | return false; |
| 1733 | |
| 1734 | bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion(); |
| 1735 | // If this is the first region scheduled during this stage, make the target |
| 1736 | // occupancy changes in the DAG and MFI. |
| 1737 | if (!IsAnyRegionScheduled && IsSchedulingThisRegion) { |
| 1738 | IsAnyRegionScheduled = true; |
| 1739 | if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) |
| 1740 | DAG.setTargetOccupancy(TempTargetOccupancy); |
| 1741 | } |
| 1742 | return IsSchedulingThisRegion; |
| 1743 | } |
| 1744 | |
| 1745 | bool ClusteredLowOccStage::initGCNRegion() { |
| 1746 | // We may need to reschedule this region if it wasn't rescheduled in the last |
| 1747 | // stage, or if we found it was testing critical register pressure limits in |
| 1748 | // the unclustered reschedule stage. The latter is because we may not have been
| 1749 | // able to raise the min occupancy in the previous stage so the region may be |
| 1750 | // overly constrained even if it was already rescheduled. |
| 1751 | if (!DAG.RegionsWithHighRP[RegionIdx]) |
| 1752 | return false; |
| 1753 | |
| 1754 | return GCNSchedStage::initGCNRegion(); |
| 1755 | } |
| 1756 | |
| 1757 | bool PreRARematStage::initGCNRegion() { |
| 1758 | return RescheduleRegions[RegionIdx] && GCNSchedStage::initGCNRegion(); |
| 1759 | } |
| 1760 | |
| 1761 | void GCNSchedStage::setupNewBlock() { |
| 1762 | if (CurrentMBB) |
| 1763 | DAG.finishBlock(); |
| 1764 | |
| 1765 | CurrentMBB = DAG.RegionBegin->getParent(); |
| 1766 | DAG.startBlock(bb: CurrentMBB); |
| 1767 | // Get real RP for the region if it hasn't been calculated before. After the
| 1768 | // initial schedule stage, real RP will be collected after scheduling.
| 1769 | if (StageID == GCNSchedStageID::OccInitialSchedule || |
| 1770 | StageID == GCNSchedStageID::ILPInitialSchedule || |
| 1771 | StageID == GCNSchedStageID::MemoryClauseInitialSchedule) |
| 1772 | DAG.computeBlockPressure(RegionIdx, MBB: CurrentMBB); |
| 1773 | } |
| 1774 | |
| 1775 | void GCNSchedStage::finalizeGCNRegion() { |
| 1776 | DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); |
| 1777 | if (S.HasHighPressure) |
| 1778 | DAG.RegionsWithHighRP[RegionIdx] = true; |
| 1779 | |
| 1780 | // Revert scheduling if we have dropped occupancy or there is some other |
| 1781 | // reason that the original schedule is better. |
| 1782 | checkScheduling(); |
| 1783 | |
| 1784 | if (DAG.RegionsWithIGLPInstrs[RegionIdx] && |
| 1785 | StageID != GCNSchedStageID::UnclusteredHighRPReschedule) |
| 1786 | SavedMutations.swap(x&: DAG.Mutations); |
| 1787 | } |
| 1788 | |
| 1789 | void PreRARematStage::finalizeGCNRegion() { |
| 1790 | GCNSchedStage::finalizeGCNRegion(); |
| 1791 | // When the goal is to increase occupancy, all regions must reach the target |
| 1792 | // occupancy for rematerializations to be possibly useful; otherwise we will
| 1793 | // just hurt latency for no benefit. If minimum occupancy drops below the |
| 1794 | // target there is no point in trying to re-schedule further regions. |
| 1795 | if (!TargetOcc) |
| 1796 | return; |
| 1797 | RegionReverts.emplace_back(Args&: RegionIdx, Args&: Unsched, Args&: PressureBefore); |
| 1798 | if (DAG.MinOccupancy < *TargetOcc) { |
| 1799 | REMAT_DEBUG(dbgs() << "Region " << RegionIdx |
| 1800 | << " cannot meet occupancy target, interrupting " |
| 1801 | "re-scheduling in all regions\n" ); |
| 1802 | RescheduleRegions.reset(); |
| 1803 | } |
| 1804 | } |
| 1805 | |
| 1806 | void GCNSchedStage::checkScheduling() { |
| 1807 | // Check the results of scheduling. |
| 1808 | PressureAfter = DAG.getRealRegPressure(RegionIdx); |
| 1809 | |
| 1810 | LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter)); |
| 1811 | LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n" ); |
| 1812 | |
| 1813 | unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize(); |
| 1814 | |
| 1815 | if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && |
| 1816 | PressureAfter.getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) { |
| 1817 | DAG.Pressure[RegionIdx] = PressureAfter; |
| 1818 | |
| 1819 | // Early out if we have achieved the occupancy target. |
| 1820 | LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n" ); |
| 1821 | return; |
| 1822 | } |
| 1823 | |
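|      | // Compare occupancy before and after scheduling, clamping both to the
|      | // stage's target occupancy (itself capped by the workgroup-size limit).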
| 1824 | unsigned TargetOccupancy = std::min( |
| 1825 | a: S.getTargetOccupancy(), b: ST.getOccupancyWithWorkGroupSizes(MF).second); |
| 1826 | unsigned WavesAfter = std::min( |
| 1827 | a: TargetOccupancy, b: PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize)); |
| 1828 | unsigned WavesBefore = std::min( |
| 1829 | a: TargetOccupancy, b: PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize)); |
| 1830 | LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore |
| 1831 | << ", after " << WavesAfter << ".\n" ); |
| 1832 | |
| 1833 | // We may not be able to keep the current target occupancy because of the just |
| 1834 | // scheduled region. We might still be able to revert scheduling if the |
| 1835 | // occupancy before was higher, or if the current schedule has register |
| 1836 | // pressure higher than the excess limits which could lead to more spilling. |
| 1837 | unsigned NewOccupancy = std::max(a: WavesAfter, b: WavesBefore); |
| 1838 | |
| 1839 | // Allow memory-bound functions to drop to the minimum allowed occupancy (4
| 1840 | // waves) if not limited by an attribute.
| 1841 | if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy && |
| 1842 | WavesAfter >= MFI.getMinAllowedOccupancy()) { |
| 1843 | LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to " |
| 1844 | << MFI.getMinAllowedOccupancy() << " waves\n" ); |
| 1845 | NewOccupancy = WavesAfter; |
| 1846 | } |
| 1847 | |
| 1848 | if (NewOccupancy < DAG.MinOccupancy) { |
| 1849 | DAG.MinOccupancy = NewOccupancy; |
| 1850 | MFI.limitOccupancy(Limit: DAG.MinOccupancy); |
| 1851 | LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " |
| 1852 | << DAG.MinOccupancy << ".\n" ); |
| 1853 | } |
| 1854 | // The maximum number of arch VGPR on non-unified register file, or the |
| 1855 | // maximum VGPR + AGPR in the unified register file case. |
| 1856 | unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); |
| 1857 | // The maximum number of arch VGPR for both unified and non-unified register |
| 1858 | // file. |
| 1859 | unsigned MaxArchVGPRs = std::min(a: MaxVGPRs, b: ST.getAddressableNumArchVGPRs()); |
| 1860 | unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); |
| 1861 | |
| 1862 | if (PressureAfter.getVGPRNum(UnifiedVGPRFile: ST.hasGFX90AInsts()) > MaxVGPRs || |
| 1863 | PressureAfter.getArchVGPRNum() > MaxArchVGPRs || |
| 1864 | PressureAfter.getAGPRNum() > MaxArchVGPRs || |
| 1865 | PressureAfter.getSGPRNum() > MaxSGPRs) { |
| 1866 | DAG.RegionsWithHighRP[RegionIdx] = true; |
| 1867 | DAG.RegionsWithExcessRP[RegionIdx] = true; |
| 1868 | } |
| 1869 | |
| 1870 | // Revert if this region's schedule would cause a drop in occupancy or |
| 1871 | // spilling. |
| 1872 | if (shouldRevertScheduling(WavesAfter)) { |
| 1873 | modifyRegionSchedule(RegionIdx, MBB: DAG.BB, MIOrder: Unsched); |
| 1874 | std::tie(args&: DAG.RegionBegin, args&: DAG.RegionEnd) = DAG.Regions[RegionIdx]; |
| 1875 | } else { |
| 1876 | DAG.Pressure[RegionIdx] = PressureAfter; |
| 1877 | } |
| 1878 | } |
| 1879 | |
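|      | // Returns the earliest cycle at which SU can issue given the latencies of
|      | // its register-dependence predecessors, and records it in ReadyCycles so
|      | // later SUnits can build on it.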
| 1880 | unsigned |
| 1881 | GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle, |
| 1882 | DenseMap<unsigned, unsigned> &ReadyCycles, |
| 1883 | const TargetSchedModel &SM) { |
| 1884 | unsigned ReadyCycle = CurrCycle; |
| 1885 | for (auto &D : SU.Preds) { |
| 1886 | if (D.isAssignedRegDep()) { |
| 1887 | MachineInstr *DefMI = D.getSUnit()->getInstr(); |
| 1888 | unsigned Latency = SM.computeInstrLatency(MI: DefMI); |
| 1889 | unsigned DefReady = ReadyCycles[DAG.getSUnit(MI: DefMI)->NodeNum]; |
| 1890 | ReadyCycle = std::max(a: ReadyCycle, b: DefReady + Latency); |
| 1891 | } |
| 1892 | } |
| 1893 | ReadyCycles[SU.NodeNum] = ReadyCycle; |
| 1894 | return ReadyCycle; |
| 1895 | } |
| 1896 | |
| 1897 | #ifndef NDEBUG |
| 1898 | struct EarlierIssuingCycle { |
| 1899 | bool operator()(std::pair<MachineInstr *, unsigned> A, |
| 1900 | std::pair<MachineInstr *, unsigned> B) const { |
| 1901 | return A.second < B.second; |
| 1902 | } |
| 1903 | }; |
| 1904 | |
| 1905 | static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>, |
| 1906 | EarlierIssuingCycle> &ReadyCycles) { |
| 1907 | if (ReadyCycles.empty()) |
| 1908 | return; |
| 1909 | unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber(); |
| 1910 | dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum |
| 1911 | << " ##################\n# Cycle #\t\t\tInstruction " |
| 1912 | " " |
| 1913 | " \n" ; |
| 1914 | unsigned IPrev = 1; |
| 1915 | for (auto &I : ReadyCycles) { |
| 1916 | if (I.second > IPrev + 1) |
| 1917 | dbgs() << "****************************** BUBBLE OF " << I.second - IPrev |
| 1918 | << " CYCLES DETECTED ******************************\n\n" ; |
| 1919 | dbgs() << "[ " << I.second << " ] : " << *I.first << "\n" ; |
| 1920 | IPrev = I.second; |
| 1921 | } |
| 1922 | } |
| 1923 | #endif |
| 1924 | |
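|      | // Walks the schedule in order and accumulates stall ("bubble") cycles, i.e.
|      | // cycles in which no instruction is ready to issue. The metric reported in
|      | // the debug output is SumBubbles * ScaleFactor / CurrCycle, so lower values
|      | // correspond to denser schedules.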
| 1925 | ScheduleMetrics |
| 1926 | GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) { |
| 1927 | #ifndef NDEBUG |
| 1928 | std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> |
| 1929 | ReadyCyclesSorted; |
| 1930 | #endif |
| 1931 | const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); |
| 1932 | unsigned SumBubbles = 0; |
| 1933 | DenseMap<unsigned, unsigned> ReadyCycles; |
| 1934 | unsigned CurrCycle = 0; |
| 1935 | for (auto &SU : InputSchedule) { |
| 1936 | unsigned ReadyCycle = |
| 1937 | computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM); |
| 1938 | SumBubbles += ReadyCycle - CurrCycle; |
| 1939 | #ifndef NDEBUG |
| 1940 | ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle)); |
| 1941 | #endif |
| 1942 | CurrCycle = ++ReadyCycle; |
| 1943 | } |
| 1944 | #ifndef NDEBUG |
| 1945 | LLVM_DEBUG( |
| 1946 | printScheduleModel(ReadyCyclesSorted); |
| 1947 | dbgs() << "\n\t" |
| 1948 | << "Metric: " |
| 1949 | << (SumBubbles |
| 1950 | ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle |
| 1951 | : 1) |
| 1952 | << "\n\n" ); |
| 1953 | #endif |
| 1954 | |
| 1955 | return ScheduleMetrics(CurrCycle, SumBubbles); |
| 1956 | } |
| 1957 | |
| 1958 | ScheduleMetrics |
| 1959 | GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) { |
| 1960 | #ifndef NDEBUG |
| 1961 | std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> |
| 1962 | ReadyCyclesSorted; |
| 1963 | #endif |
| 1964 | const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); |
| 1965 | unsigned SumBubbles = 0; |
| 1966 | DenseMap<unsigned, unsigned> ReadyCycles; |
| 1967 | unsigned CurrCycle = 0; |
| 1968 | for (auto &MI : DAG) { |
| 1969 | SUnit *SU = DAG.getSUnit(MI: &MI); |
| 1970 | if (!SU) |
| 1971 | continue; |
| 1972 | unsigned ReadyCycle = |
| 1973 | computeSUnitReadyCycle(SU: *SU, CurrCycle, ReadyCycles, SM); |
| 1974 | SumBubbles += ReadyCycle - CurrCycle; |
| 1975 | #ifndef NDEBUG |
| 1976 | ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle)); |
| 1977 | #endif |
| 1978 | CurrCycle = ++ReadyCycle; |
| 1979 | } |
| 1980 | #ifndef NDEBUG |
| 1981 | LLVM_DEBUG( |
| 1982 | printScheduleModel(ReadyCyclesSorted); |
| 1983 | dbgs() << "\n\t" |
| 1984 | << "Metric: " |
| 1985 | << (SumBubbles |
| 1986 | ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle |
| 1987 | : 1) |
| 1988 | << "\n\n" ); |
| 1989 | #endif |
| 1990 | |
| 1991 | return ScheduleMetrics(CurrCycle, SumBubbles); |
| 1992 | } |
| 1993 | |
| 1994 | bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { |
| 1995 | if (WavesAfter < DAG.MinOccupancy) |
| 1996 | return true; |
| 1997 | |
| 1998 | // For dynamic VGPR mode, we don't want to waste any VGPR blocks. |
| 1999 | if (DAG.MFI.isDynamicVGPREnabled()) { |
| 2000 | unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( |
| 2001 | STI: &ST, NumVGPRs: DAG.MFI.getDynamicVGPRBlockSize(), |
| 2002 | DynamicVGPRBlockSize: PressureBefore.getVGPRNum(UnifiedVGPRFile: false)); |
| 2003 | unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( |
| 2004 | STI: &ST, NumVGPRs: DAG.MFI.getDynamicVGPRBlockSize(), |
| 2005 | DynamicVGPRBlockSize: PressureAfter.getVGPRNum(UnifiedVGPRFile: false)); |
| 2006 | if (BlocksAfter > BlocksBefore) |
| 2007 | return true; |
| 2008 | } |
| 2009 | |
| 2010 | return false; |
| 2011 | } |
| 2012 | |
| 2013 | bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { |
| 2014 | if (PressureAfter == PressureBefore) |
| 2015 | return false; |
| 2016 | |
| 2017 | if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) |
| 2018 | return true; |
| 2019 | |
| 2020 | if (mayCauseSpilling(WavesAfter)) |
| 2021 | return true; |
| 2022 | |
| 2023 | return false; |
| 2024 | } |
| 2025 | |
| 2026 | bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { |
| 2027 | // If RP is not reduced in the unclustered reschedule stage, revert to the |
| 2028 | // old schedule. |
| 2029 | if ((WavesAfter <= |
| 2030 | PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize()) && |
| 2031 | mayCauseSpilling(WavesAfter)) || |
| 2032 | GCNSchedStage::shouldRevertScheduling(WavesAfter)) { |
| 2033 | LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n" ); |
| 2034 | return true; |
| 2035 | } |
| 2036 | |
| 2037 | // Do not attempt to relax schedule even more if we are already spilling. |
| 2038 | if (isRegionWithExcessRP()) |
| 2039 | return false; |
| 2040 | |
| 2041 | LLVM_DEBUG( |
| 2042 | dbgs() |
| 2043 | << "\n\t *** In shouldRevertScheduling ***\n" |
| 2044 | << " *********** BEFORE UnclusteredHighRPStage ***********\n" ); |
| 2045 | ScheduleMetrics MBefore = getScheduleMetrics(InputSchedule: DAG.SUnits); |
| 2046 | LLVM_DEBUG( |
| 2047 | dbgs() |
| 2048 | << "\n *********** AFTER UnclusteredHighRPStage ***********\n" ); |
| 2049 | ScheduleMetrics MAfter = getScheduleMetrics(DAG); |
| 2050 | unsigned OldMetric = MBefore.getMetric(); |
| 2051 | unsigned NewMetric = MAfter.getMetric(); |
| 2052 | unsigned WavesBefore = std::min( |
| 2053 | a: S.getTargetOccupancy(), |
| 2054 | b: PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize: DAG.MFI.getDynamicVGPRBlockSize())); |
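|      | // Keep the unclustered schedule only if the product of the occupancy gain
|      | // and the latency metric change reaches break-even. In fixed-point form:
|      | //   Profit = (WavesAfter / WavesBefore) * ((OldMetric + Bias) / NewMetric)
|      | // scaled by ScheduleMetrics::ScaleFactor; a result below ScaleFactor means
|      | // the old schedule is restored. For example, assuming ScaleFactor is 100,
|      | // with equal occupancy, OldMetric + Bias == 60, and NewMetric == 70, Profit
|      | // evaluates to 85 < 100, so the unclustered schedule is reverted.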
| 2055 | unsigned Profit = |
| 2056 | ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore * |
| 2057 | ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) / |
| 2058 | NewMetric) / |
| 2059 | ScheduleMetrics::ScaleFactor; |
| 2060 | LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after " |
| 2061 | << MAfter << "Profit: " << Profit << "\n" ); |
| 2062 | return Profit < ScheduleMetrics::ScaleFactor; |
| 2063 | } |
| 2064 | |
| 2065 | bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { |
| 2066 | if (PressureAfter == PressureBefore) |
| 2067 | return false; |
| 2068 | |
| 2069 | if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) |
| 2070 | return true; |
| 2071 | |
| 2072 | if (mayCauseSpilling(WavesAfter)) |
| 2073 | return true; |
| 2074 | |
| 2075 | return false; |
| 2076 | } |
| 2077 | |
| 2078 | bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { |
| 2079 | return mayCauseSpilling(WavesAfter); |
| 2080 | } |
| 2081 | |
| 2082 | bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { |
| 2083 | if (mayCauseSpilling(WavesAfter)) |
| 2084 | return true; |
| 2085 | |
| 2086 | return false; |
| 2087 | } |
| 2088 | |
| 2089 | bool MemoryClauseInitialScheduleStage::shouldRevertScheduling( |
| 2090 | unsigned WavesAfter) { |
| 2091 | return mayCauseSpilling(WavesAfter); |
| 2092 | } |
| 2093 | |
| 2094 | bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { |
| 2095 | if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() && |
| 2096 | !PressureAfter.less(MF, O: PressureBefore)) { |
| 2097 | LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n" ); |
| 2098 | return true; |
| 2099 | } |
| 2100 | |
| 2101 | return false; |
| 2102 | } |
| 2103 | |
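|      | // Reorders the region's instructions to match MIOrder (typically the
|      | // pre-scheduling order saved in Unsched), splicing instructions into place,
|      | // updating LiveIntervals whenever non-debug instructions are reordered, and
|      | // refreshing the cached region boundaries.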
| 2104 | void GCNSchedStage::modifyRegionSchedule(unsigned RegionIdx, |
| 2105 | MachineBasicBlock *MBB, |
| 2106 | ArrayRef<MachineInstr *> MIOrder) { |
| 2107 | assert(static_cast<size_t>(std::distance(DAG.Regions[RegionIdx].first, |
| 2108 | DAG.Regions[RegionIdx].second)) == |
| 2109 | MIOrder.size() && |
| 2110 | "instruction number mismatch" ); |
| 2111 | if (MIOrder.empty()) |
| 2112 | return; |
| 2113 | |
| 2114 | LLVM_DEBUG(dbgs() << "Reverting scheduling for region " << RegionIdx << '\n'); |
| 2115 | |
| 2116 | // Reconstruct MI sequence by moving instructions in desired order before |
| 2117 | // the current region's start. |
| 2118 | MachineBasicBlock::iterator RegionEnd = DAG.Regions[RegionIdx].first; |
| 2119 | for (MachineInstr *MI : MIOrder) { |
| 2120 | // Either move the next MI in order before the end of the region or move the |
| 2121 | // region end past the MI if it is at the correct position. |
| 2122 | MachineBasicBlock::iterator MII = MI->getIterator(); |
| 2123 | if (MII != RegionEnd) { |
| 2124 | // Will subsequent splice move MI up past a non-debug instruction? |
| 2125 | bool NonDebugReordered = |
| 2126 | !MI->isDebugInstr() && |
| 2127 | skipDebugInstructionsForward(It: RegionEnd, End: MII) != MII; |
| 2128 | MBB->splice(Where: RegionEnd, Other: MBB, From: MI); |
| 2129 | // Only update LiveIntervals information if non-debug instructions are |
| 2130 | // reordered. Otherwise debug instructions could cause code generation to |
| 2131 | // change. |
| 2132 | if (NonDebugReordered) |
| 2133 | DAG.LIS->handleMove(MI&: *MI, UpdateFlags: true); |
| 2134 | } else { |
| 2135 | ++RegionEnd; |
| 2136 | } |
| 2137 | if (MI->isDebugInstr()) { |
| 2138 | LLVM_DEBUG(dbgs() << "Scheduling " << *MI); |
| 2139 | continue; |
| 2140 | } |
| 2141 | |
| 2142 | // Reset read-undef flags and update them later. |
| 2143 | for (MachineOperand &Op : MI->all_defs()) |
| 2144 | Op.setIsUndef(false); |
| 2145 | RegisterOperands RegOpers; |
| 2146 | RegOpers.collect(MI: *MI, TRI: *DAG.TRI, MRI: DAG.MRI, TrackLaneMasks: DAG.ShouldTrackLaneMasks, IgnoreDead: false); |
| 2147 | if (DAG.ShouldTrackLaneMasks) { |
| 2148 | // Adjust liveness and add missing dead+read-undef flags. |
| 2149 | SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(Instr: *MI).getRegSlot(); |
| 2150 | RegOpers.adjustLaneLiveness(LIS: *DAG.LIS, MRI: DAG.MRI, Pos: SlotIdx, AddFlagsMI: MI); |
| 2151 | } else { |
| 2152 | // Adjust for missing dead-def flags. |
| 2153 | RegOpers.detectDeadDefs(MI: *MI, LIS: *DAG.LIS); |
| 2154 | } |
| 2155 | LLVM_DEBUG(dbgs() << "Scheduling " << *MI); |
| 2156 | } |
| 2157 | |
| 2158 | // The region end doesn't change throughout scheduling since it itself is |
| 2159 | // outside the region (whether that is an MBB end or a terminator MI).
| 2160 | assert(RegionEnd == DAG.Regions[RegionIdx].second && "region end mismatch" ); |
| 2161 | DAG.Regions[RegionIdx].first = MIOrder.front(); |
| 2162 | } |
| 2163 | |
| 2164 | bool RewriteMFMAFormStage::isRewriteCandidate(MachineInstr *MI) const { |
| 2165 | |
| 2166 | if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(MI: *MI)) |
| 2167 | return false; |
| 2168 | return AMDGPU::getMFMASrcCVDstAGPROp(Opcode: MI->getOpcode()) != -1; |
| 2169 | } |
| 2170 | |
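|      | // Temporarily switches every rewrite candidate to its AGPR-destination form
|      | // and reclassifies the affected registers so register pressure can be
|      | // recomputed as if the rewrite had happened. Along the way, records which
|      | // non-MFMA defs and uses would require cross-class copies (CopyForDef /
|      | // CopyForUse); getRewriteCost undoes these temporary changes.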
| 2171 | bool RewriteMFMAFormStage::initHeuristics( |
| 2172 | std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands, |
| 2173 | DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse, |
| 2174 | SmallPtrSetImpl<MachineInstr *> &CopyForDef) { |
| 2175 | bool Changed = false; |
| 2176 | |
| 2177 | // Prepare for the heuristics |
| 2178 | for (MachineBasicBlock &MBB : MF) { |
| 2179 | for (MachineInstr &MI : MBB) { |
| 2180 | if (!isRewriteCandidate(MI: &MI)) |
| 2181 | continue; |
| 2182 | |
| 2183 | int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(Opcode: MI.getOpcode()); |
| 2184 | assert(ReplacementOp != -1); |
| 2185 | |
| 2186 | RewriteCands.push_back(x: {&MI, MI.getOpcode()}); |
| 2187 | MI.setDesc(TII->get(Opcode: ReplacementOp)); |
| 2188 | |
| 2189 | MachineOperand *Src2 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src2); |
| 2190 | if (Src2->isReg()) { |
| 2191 | SmallVector<SlotIndex, 8> Src2ReachingDefs; |
| 2192 | findReachingDefs(UseMO&: *Src2, LIS: DAG.LIS, DefIdxs&: Src2ReachingDefs); |
| 2193 | |
| 2194 | // For any definition of the src2 register which is non-MFMA, we |
| 2195 | // insert a copy. |
| 2196 | for (SlotIndex RDIdx : Src2ReachingDefs) { |
| 2197 | MachineInstr *RD = DAG.LIS->getInstructionFromIndex(index: RDIdx); |
| 2198 | if (!TII->isMAI(MI: *RD)) |
| 2199 | CopyForDef.insert(Ptr: RD); |
| 2200 | } |
| 2201 | } |
| 2202 | |
| 2203 | MachineOperand &Dst = MI.getOperand(i: 0); |
| 2204 | SmallVector<MachineOperand *, 8> DstReachingUses; |
| 2205 | |
| 2206 | findReachingUses(DefMI: &MI, LIS: DAG.LIS, ReachingUses&: DstReachingUses); |
| 2207 | |
| 2208 | for (MachineOperand *RUOp : DstReachingUses) { |
| 2209 | if (TII->isMAI(MI: *RUOp->getParent())) |
| 2210 | continue; |
| 2211 | |
| 2212 | // For any user of the result of the MFMA which is not an MFMA, we |
| 2213 | // insert a copy. For a given register, we will only insert one copy |
| 2214 | // per user block. |
| 2215 | CopyForUse[RUOp->getParent()->getParent()].insert(x: RUOp->getReg()); |
| 2216 | |
| 2217 | SmallVector<SlotIndex, 8> DstUsesReachingDefs; |
| 2218 | findReachingDefs(UseMO&: *RUOp, LIS: DAG.LIS, DefIdxs&: DstUsesReachingDefs); |
| 2219 | |
| 2220 | for (SlotIndex RDIndex : DstUsesReachingDefs) { |
| 2221 | MachineInstr *RD = DAG.LIS->getInstructionFromIndex(index: RDIndex); |
| 2222 | if (TII->isMAI(MI: *RD)) |
| 2223 | continue; |
| 2224 | |
| 2225 | // For any definition of the user of the MFMA which is not an MFMA, |
| 2226 | // we insert a copy. We do this to transform all the reaching defs |
| 2227 | // of this use to AGPR. By doing this, we can insert a copy from |
| 2228 | // AGPR to VGPR at the user rather than after the MFMA. |
| 2229 | CopyForDef.insert(Ptr: RD); |
| 2230 | } |
| 2231 | } |
| 2232 | |
| 2233 | // Do the rewrite to allow for updated RP calculation. |
| 2234 | const TargetRegisterClass *VGPRRC = DAG.MRI.getRegClass(Reg: Dst.getReg()); |
| 2235 | const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(SRC: VGPRRC); |
| 2236 | DAG.MRI.setRegClass(Reg: Dst.getReg(), RC: AGPRRC); |
| 2237 | if (Src2->isReg()) |
| 2238 | DAG.MRI.setRegClass(Reg: Src2->getReg(), RC: AGPRRC); |
| 2239 | Changed = true; |
| 2240 | } |
| 2241 | } |
| 2242 | |
| 2243 | return Changed; |
| 2244 | } |
| 2245 | |
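|      | // Estimates the profitability of the rewrite: the largest per-region spill
|      | // cost reduction (a non-positive value, weighted by block frequency relative
|      | // to the function entry) plus the frequency-weighted cost of the cross-class
|      | // copies that would have to be inserted. A positive return value means the
|      | // rewrite is not worthwhile.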
| 2246 | int64_t RewriteMFMAFormStage::getRewriteCost( |
| 2247 | const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands, |
| 2248 | const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse, |
| 2249 | const SmallPtrSetImpl<MachineInstr *> &CopyForDef) { |
| 2250 | MachineBlockFrequencyInfo *MBFI = DAG.MBFI; |
| 2251 | |
| 2252 | int64_t BestSpillCost = 0; |
| 2253 | int64_t Cost = 0; |
| 2254 | uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency(); |
| 2255 | |
| 2256 | std::pair<unsigned, unsigned> MaxVectorRegs = |
| 2257 | ST.getMaxNumVectorRegs(F: MF.getFunction()); |
| 2258 | unsigned ArchVGPRThreshold = MaxVectorRegs.first; |
| 2259 | unsigned AGPRThreshold = MaxVectorRegs.second; |
| 2260 | unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF); |
| 2261 | |
| 2262 | for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) { |
| 2263 | if (!RegionsWithExcessArchVGPR[Region]) |
| 2264 | continue; |
| 2265 | |
| 2266 | GCNRegPressure &PressureBefore = DAG.Pressure[Region]; |
| 2267 | unsigned SpillCostBefore = PressureBefore.getVGPRSpills( |
| 2268 | MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold); |
| 2269 | |
| 2270 | // For the cases we care about (i.e. ArchVGPR usage is greater than the |
| 2271 | // addressable limit), rewriting alone should bring pressure to a manageable
| 2272 | // level. If we find any such region, then the rewrite is potentially |
| 2273 | // beneficial. |
| 2274 | GCNRegPressure PressureAfter = DAG.getRealRegPressure(RegionIdx: Region); |
| 2275 | unsigned SpillCostAfter = PressureAfter.getVGPRSpills( |
| 2276 | MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold); |
| 2277 | |
| 2278 | uint64_t BlockFreq = |
| 2279 | MBFI->getBlockFreq(MBB: DAG.Regions[Region].first->getParent()) |
| 2280 | .getFrequency(); |
| 2281 | |
| 2282 | bool RelativeFreqIsDenom = EntryFreq > BlockFreq; |
| 2283 | uint64_t RelativeFreq = EntryFreq && BlockFreq |
| 2284 | ? (RelativeFreqIsDenom ? EntryFreq / BlockFreq |
| 2285 | : BlockFreq / EntryFreq) |
| 2286 | : 1; |
| 2287 | |
| 2288 | // This assumes perfect spilling / splitting -- using one spill / copy |
| 2289 | // instruction and one restoreFrom / copy for each excess register.
| 2290 | int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2; |
| 2291 | |
| 2292 | // Also account for the block frequency. |
| 2293 | if (RelativeFreqIsDenom) |
| 2294 | SpillCost /= (int64_t)RelativeFreq; |
| 2295 | else |
| 2296 | SpillCost *= (int64_t)RelativeFreq; |
| 2297 | |
| 2298 | // If we have increased spilling in any block, just bail. |
| 2299 | if (SpillCost > 0) |
| 2300 | return SpillCost; |
| 2301 | |
| 2302 | if (SpillCost < BestSpillCost) |
| 2303 | BestSpillCost = SpillCost; |
| 2304 | } |
| 2305 | |
| 2306 | // Set the cost to the largest decrease in spill cost so that spill
| 2307 | // reductions are not double-counted.
| 2308 | Cost = BestSpillCost; |
| 2309 | assert(Cost <= 0); |
| 2310 | |
| 2311 | unsigned CopyCost = 0; |
| 2312 | |
| 2313 | // For each CopyForDef, increase the cost by the register class's copy cost,
| 2314 | // weighted by the defining block's frequency relative to the entry block.
| 2315 | for (MachineInstr *DefMI : CopyForDef) { |
| 2316 | Register DefReg = DefMI->getOperand(i: 0).getReg(); |
| 2317 | uint64_t DefFreq = |
| 2318 | EntryFreq |
| 2319 | ? MBFI->getBlockFreq(MBB: DefMI->getParent()).getFrequency() / EntryFreq |
| 2320 | : 1; |
| 2321 | |
| 2322 | const TargetRegisterClass *RC = DAG.MRI.getRegClass(Reg: DefReg); |
| 2323 | CopyCost += RC->getCopyCost() * DefFreq; |
| 2324 | } |
| 2325 | |
| 2326 | // Account for CopyForUse copies in each block in which the register is used.
| 2327 | for (auto &[UseBlock, UseRegs] : CopyForUse) { |
| 2328 | uint64_t UseFreq = |
| 2329 | EntryFreq ? MBFI->getBlockFreq(MBB: UseBlock).getFrequency() / EntryFreq : 1; |
| 2330 | |
| 2331 | for (Register UseReg : UseRegs) { |
| 2332 | const TargetRegisterClass *RC = DAG.MRI.getRegClass(Reg: UseReg); |
| 2333 | CopyCost += RC->getCopyCost() * UseFreq; |
| 2334 | } |
| 2335 | } |
| 2336 | |
| 2337 | // Reset the classes that were changed to AGPR for better RP analysis.
| 2338 | // We must do rewriting after copy-insertion, as some defs of the register |
| 2339 | // may require VGPR. Additionally, if we bail out and don't perform the |
| 2340 | // rewrite then these need to be restored anyway. |
| 2341 | for (auto &[MI, OriginalOpcode] : RewriteCands) { |
| 2342 | assert(TII->isMAI(*MI)); |
| 2343 | const TargetRegisterClass *AGPRRC = |
| 2344 | DAG.MRI.getRegClass(Reg: MI->getOperand(i: 0).getReg()); |
| 2345 | const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(SRC: AGPRRC); |
| 2346 | |
| 2347 | MachineOperand *Src2 = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src2); |
| 2348 | assert(Src2); |
| 2349 | |
| 2350 | if (Src2->isReg()) |
| 2351 | DAG.MRI.setRegClass(Reg: Src2->getReg(), RC: VGPRRC); |
| 2352 | DAG.MRI.setRegClass(Reg: MI->getOperand(i: 0).getReg(), RC: VGPRRC); |
| 2353 | MI->setDesc(TII->get(Opcode: OriginalOpcode)); |
| 2354 | } |
| 2355 | |
| 2356 | return Cost + CopyCost; |
| 2357 | } |
| 2358 | |
| 2359 | bool RewriteMFMAFormStage::rewrite( |
| 2360 | const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) { |
| 2361 | DenseMap<MachineInstr *, unsigned> FirstMIToRegion; |
| 2362 | DenseMap<MachineInstr *, unsigned> LastMIToRegion; |
| 2363 | |
| 2364 | for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) { |
| 2365 | RegionBoundaries Entry = DAG.Regions[Region]; |
| 2366 | if (Entry.first == Entry.second) |
| 2367 | continue; |
| 2368 | |
| 2369 | FirstMIToRegion[&*Entry.first] = Region; |
| 2370 | if (Entry.second != Entry.first->getParent()->end()) |
| 2371 | LastMIToRegion[&*Entry.second] = Region; |
| 2372 | } |
| 2373 | |
| 2374 | // Rewrite the MFMAs to AGPR, and insert any copies as needed. |
| 2375 | // The general assumption of the algorithm (and the previous cost calculation) |
| 2376 | // is that it is better to insert the copies in the MBB of the def of the src2 |
| 2377 | // operands, and in the MBB of the user of the dest operands. This is based on |
| 2378 | // the assumption that the MFMAs are likely to appear in loop bodies, while |
| 2379 | // the src2 and dest operands are live-in / live-out of the loop. Due to this |
| 2380 | // design, the algorithm for finding copy insertion points is more |
| 2381 | // complicated. |
| 2382 | // |
| 2383 | // There are three main cases to handle: 1. the reaching defs of the src2 |
| 2384 | // operands, 2. the reaching uses of the dst operands, and 3. the reaching |
| 2385 | // defs of the reaching uses of the dst operand. |
| 2386 | // |
| 2387 | // In the first case, we simply insert copies after each of the reaching |
| 2388 | // definitions. In the second case, we collect all the uses of a given dest |
| 2389 | // and organize them by MBB. Then, we insert 1 copy for each MBB before the |
| 2390 | // earliest use. Since the use may have multiple reaching defs, and since we |
| 2391 | // want to replace the register it is using with the result of the copy, we |
| 2392 | // must handle case 3. In the third case, we simply insert a copy after each |
| 2393 | // of the reaching defs to connect to the copy of the reaching uses of the dst |
| 2394 | // reg. This allows us to avoid inserting copies next to the MFMAs. |
| 2395 | // |
| 2396 | // While inserting the copies, we maintain a map of operands which will use |
| 2397 | // different regs (i.e. the result of the copies). For example, a case 1 src2 |
| 2398 | // operand will use the register result of the copies after the reaching defs, |
| 2399 | // as opposed to the original register. Now that we have completed our copy |
| 2400 | // analysis and placement, we can bulk update the registers. We do this |
| 2401 | // separately as to avoid complicating the reachingDef and reachingUse |
| 2402 | // queries. |
| 2403 | // |
| 2404 | // While inserting the copies, we also maintain a list of registers which we
| 2405 | // will want to reclassify as AGPR. After doing the copy insertion and the |
| 2406 | // register replacement, we can finally do the reclassification. This uses the |
| 2407 | // redef map, as the registers we are interested in reclassifying may be |
| 2408 | // replaced by the result of a copy. We must do this after the copy analysis |
| 2409 | // and placement as we must have an accurate redef map -- otherwise we may end |
| 2410 | // up creating illegal instructions. |
| 2411 | |
| 2412 | // The original registers of the MFMA that need to be reclassified as AGPR. |
| 2413 | DenseSet<Register> RewriteRegs; |
| 2414 | // The map of an original register in the MFMA to a new register (result of a |
| 2415 | // copy) that it should be replaced with. |
| 2416 | DenseMap<Register, Register> RedefMap; |
| 2417 | // The map of the original MFMA registers to the relevant MFMA operands. |
| 2418 | DenseMap<Register, DenseSet<MachineOperand *>> ReplaceMap; |
| 2419 | // The map of reaching defs for a given register -- to avoid duplicate copies. |
| 2420 | DenseMap<Register, SmallPtrSet<MachineInstr *, 8>> ReachingDefCopyMap; |
| 2421 | // The map of reaching uses for a given register by basic block -- to avoid |
| 2422 | // duplicate copies and to calculate per MBB insert pts. |
| 2423 | DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>> |
| 2424 | ReachingUseTracker; |
| 2425 | |
| 2426 | for (auto &[MI, OriginalOpcode] : RewriteCands) { |
| 2427 | int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(Opcode: MI->getOpcode()); |
| 2428 | if (ReplacementOp == -1) |
| 2429 | continue; |
| 2430 | MI->setDesc(TII->get(Opcode: ReplacementOp)); |
| 2431 | |
| 2432 | // Case 1: insert copies for the reaching defs of the Src2Reg. |
| 2433 | MachineOperand *Src2 = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src2); |
| 2434 | if (Src2->isReg()) { |
| 2435 | Register Src2Reg = Src2->getReg(); |
| 2436 | if (!Src2Reg.isVirtual()) |
| 2437 | return false; |
| 2438 | |
| 2439 | Register MappedReg = Src2->getReg(); |
| 2440 | SmallVector<SlotIndex, 8> Src2ReachingDefs; |
| 2441 | findReachingDefs(UseMO&: *Src2, LIS: DAG.LIS, DefIdxs&: Src2ReachingDefs); |
| 2442 | SmallSetVector<MachineInstr *, 8> Src2DefsReplace; |
| 2443 | |
| 2444 | for (SlotIndex RDIndex : Src2ReachingDefs) { |
| 2445 | MachineInstr *RD = DAG.LIS->getInstructionFromIndex(index: RDIndex); |
| 2446 | if (TII->isMAI(MI: *RD)) |
| 2447 | continue; |
| 2448 | |
| 2449 | // If there is a non-MAI reaching def, then we need a copy.
| 2450 | Src2DefsReplace.insert(X: RD); |
| 2451 | } |
| 2452 | |
| 2453 | if (!Src2DefsReplace.empty()) { |
| 2454 | DenseMap<Register, Register>::iterator RI = RedefMap.find(Val: Src2Reg); |
| 2455 | if (RI != RedefMap.end()) { |
| 2456 | MappedReg = RI->second; |
| 2457 | } else { |
| 2458 | assert(!ReachingDefCopyMap.contains(Src2Reg)); |
| 2459 | const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Reg: Src2Reg); |
| 2460 | const TargetRegisterClass *VGPRRC = |
| 2461 | SRI->getEquivalentVGPRClass(SRC: Src2RC); |
| 2462 | |
| 2463 | // Track the mapping of the original register to the new register. |
| 2464 | MappedReg = DAG.MRI.createVirtualRegister(RegClass: VGPRRC); |
| 2465 | RedefMap[Src2Reg] = MappedReg; |
| 2466 | } |
| 2467 | |
| 2468 | // If none exists, create a copy from this reaching def. |
| 2469 | // We may have inserted a copy already in an earlier iteration. |
| 2470 | for (MachineInstr *RD : Src2DefsReplace) { |
| 2471 | // Do not create redundant copies. |
| 2472 | if (ReachingDefCopyMap[Src2Reg].insert(Ptr: RD).second) { |
| 2473 | MachineInstrBuilder VGPRCopy = |
| 2474 | BuildMI(BB&: *RD->getParent(), I: std::next(x: RD->getIterator()), |
| 2475 | MIMD: RD->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY)) |
| 2476 | .addDef(RegNo: MappedReg, Flags: {}, SubReg: 0) |
| 2477 | .addUse(RegNo: Src2Reg, Flags: {}, SubReg: 0); |
| 2478 | DAG.LIS->InsertMachineInstrInMaps(MI&: *VGPRCopy); |
| 2479 | |
| 2480 | // If this reaching def was the last MI in the region, update the |
| 2481 | // region boundaries. |
| 2482 | if (LastMIToRegion.contains(Val: RD)) { |
| 2483 | unsigned UpdateRegion = LastMIToRegion[RD]; |
| 2484 | DAG.Regions[UpdateRegion].second = VGPRCopy; |
| 2485 | LastMIToRegion.erase(Val: RD); |
| 2486 | } |
| 2487 | } |
| 2488 | } |
| 2489 | } |
| 2490 | |
| 2491 | // Track the register for reclassification |
| 2492 | RewriteRegs.insert(V: Src2Reg); |
| 2493 | |
| 2494 | // Always insert the operand for replacement. If this is part of a chain of
| 2495 | // tied-def MFMAs, we may not see the VGPR requirement until later.
| 2496 | ReplaceMap[Src2Reg].insert(V: Src2); |
| 2497 | } |
| 2498 | |
| 2499 | // Case 2 and Case 3: insert copies before the reaching uses of the dsts, |
| 2500 | // and after the reaching defs of the reaching uses of the dsts. |
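| | //
| | // Again as an illustrative sketch only (hypothetical registers):
| | //
| | //   %dst = V_MFMA ...           ; %dst will be reclassified as AGPR
| | //   ...
| | //   <non-MAI use of %dst>       ; Case 2: needs a VGPR copy before it
| | //
| | // becomes
| | //
| | //   %dst = V_MFMA ...
| | //   ...
| | //   %vcp = COPY %dst            ; VGPR copy before the (earliest) use
| | //   <non-MAI use of %vcp>
| | //
| | // Case 3 mirrors Case 1: any non-MAI reaching def of %dst that also reaches
| | // such a use gets a copy inserted right after it.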
| 2501 | |
| 2502 | MachineOperand *Dst = &MI->getOperand(i: 0); |
| 2503 | Register DstReg = Dst->getReg(); |
| 2504 | if (!DstReg.isVirtual()) |
| 2505 | return false; |
| 2506 | |
| 2507 | Register MappedReg = DstReg; |
| 2508 | SmallVector<MachineOperand *, 8> DstReachingUses; |
| 2509 | |
| 2510 | SmallVector<MachineOperand *, 8> DstReachingUseCopies; |
| 2511 | SmallVector<MachineInstr *, 8> DstUseDefsReplace; |
| 2512 | |
| 2513 | findReachingUses(DefMI: MI, LIS: DAG.LIS, ReachingUses&: DstReachingUses); |
| 2514 | |
| 2515 | for (MachineOperand *RUOp : DstReachingUses) { |
| 2516 | if (TII->isMAI(MI: *RUOp->getParent())) |
| 2517 | continue; |
| 2518 | |
| 2519 | // If there is a non-MAI reaching use, then we need a copy.
| 2520 | if (find(Range&: DstReachingUseCopies, Val: RUOp) == DstReachingUseCopies.end()) |
| 2521 | DstReachingUseCopies.push_back(Elt: RUOp); |
| 2522 | SmallVector<SlotIndex, 8> DstUsesReachingDefs; |
| 2523 | findReachingDefs(UseMO&: *RUOp, LIS: DAG.LIS, DefIdxs&: DstUsesReachingDefs); |
| 2524 | |
| 2525 | for (SlotIndex RDIndex : DstUsesReachingDefs) { |
| 2526 | MachineInstr *RD = DAG.LIS->getInstructionFromIndex(index: RDIndex); |
| 2527 | if (TII->isMAI(MI: *RD)) |
| 2528 | continue; |
| 2529 | |
| 2530 | // If there is a non-MAI reaching def of this reaching use, then we will
| 2531 | // need a copy. |
| 2532 | if (find(Range&: DstUseDefsReplace, Val: RD) == DstUseDefsReplace.end()) |
| 2533 | DstUseDefsReplace.push_back(Elt: RD); |
| 2534 | } |
| 2535 | } |
| 2536 | |
| 2537 | if (!DstUseDefsReplace.empty()) { |
| 2538 | DenseMap<Register, Register>::iterator RI = RedefMap.find(Val: DstReg); |
| 2539 | if (RI != RedefMap.end()) { |
| 2540 | MappedReg = RI->second; |
| 2541 | } else { |
| 2542 | assert(!ReachingDefCopyMap.contains(DstReg)); |
| 2543 | const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(Reg: DstReg); |
| 2544 | const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(SRC: DstRC); |
| 2545 | |
| 2546 | // Track the mapping of the original register to the new register. |
| 2547 | MappedReg = DAG.MRI.createVirtualRegister(RegClass: VGPRRC); |
| 2548 | RedefMap[DstReg] = MappedReg; |
| 2549 | } |
| 2550 | |
| 2551 | // If none exists, create a copy from this reaching def. |
| 2552 | // We may have inserted a copy already in an earlier iteration. |
| 2553 | for (MachineInstr *RD : DstUseDefsReplace) { |
| 2554 | // Do not create redundant copies.
| 2555 | if (ReachingDefCopyMap[DstReg].insert(Ptr: RD).second) { |
| 2556 | MachineInstrBuilder VGPRCopy = |
| 2557 | BuildMI(BB&: *RD->getParent(), I: std::next(x: RD->getIterator()), |
| 2558 | MIMD: RD->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY)) |
| 2559 | .addDef(RegNo: MappedReg, Flags: {}, SubReg: 0) |
| 2560 | .addUse(RegNo: DstReg, Flags: {}, SubReg: 0); |
| 2561 | DAG.LIS->InsertMachineInstrInMaps(MI&: *VGPRCopy); |
| 2562 | |
| 2563 | // If this reaching def was the last MI in the region, update the |
| 2564 | // region boundaries. |
| 2565 | DenseMap<MachineInstr *, unsigned>::iterator LMI = |
| 2566 | LastMIToRegion.find(Val: RD); |
| 2567 | if (LMI != LastMIToRegion.end()) { |
| 2568 | unsigned UpdateRegion = LMI->second; |
| 2569 | DAG.Regions[UpdateRegion].second = VGPRCopy; |
| 2570 | LastMIToRegion.erase(Val: RD); |
| 2571 | } |
| 2572 | } |
| 2573 | } |
| 2574 | } |
| 2575 | |
| 2576 | DenseSet<MachineOperand *> &DstRegSet = ReplaceMap[DstReg]; |
| 2577 | for (MachineOperand *RU : DstReachingUseCopies) { |
| 2578 | MachineBasicBlock *RUBlock = RU->getParent()->getParent(); |
| 2579 | // Just keep track of the reaching use of this register by block. After we |
| 2580 | // have scanned all the MFMAs, we can find optimal insertion points.
| 2581 | if (RUBlock != MI->getParent()) { |
| 2582 | ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(Ptr: RU); |
| 2583 | continue; |
| 2584 | } |
| 2585 | |
| 2586 | // Special case: the use is in the same block as the MFMA. Insert the copy
| 2587 | // just before the use. |
| 2588 | const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(Reg: DstReg); |
| 2589 | const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(SRC: DstRC); |
| 2590 | Register NewUseReg = DAG.MRI.createVirtualRegister(RegClass: VGPRRC); |
| 2591 | MachineInstr *UseInst = RU->getParent(); |
| 2592 | MachineInstrBuilder VGPRCopy = |
| 2593 | BuildMI(BB&: *UseInst->getParent(), I: UseInst->getIterator(), |
| 2594 | MIMD: UseInst->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY)) |
| 2595 | .addDef(RegNo: NewUseReg, Flags: {}, SubReg: 0) |
| 2596 | .addUse(RegNo: DstReg, Flags: {}, SubReg: 0); |
| 2597 | DAG.LIS->InsertMachineInstrInMaps(MI&: *VGPRCopy); |
| 2598 | // Since we know this use has only one reaching def, we can replace the |
| 2599 | // use reg. |
| 2600 | RU->setReg(NewUseReg); |
| 2601 | // Track the copy source operand for replacement.
| 2602 | DstRegSet.insert(V: &VGPRCopy->getOperand(i: 1)); |
| 2603 | } |
| 2604 | |
| 2605 | // Track the register for reclassification |
| 2606 | RewriteRegs.insert(V: DstReg); |
| 2607 | |
| 2608 | // Insert the dst operand for replacement. If this dst is in a chain of |
| 2609 | // tied-def MFMAs, and the first src2 needs to be replaced with a new reg, |
| 2610 | // all the corresponding operands need to be replaced.
| 2611 | DstRegSet.insert(V: Dst); |
| 2612 | } |
| 2613 | |
| 2614 | // Handle the copies for dst uses. |
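| | // For each (block, register) pair recorded above, insert a single VGPR copy
| | // before the earliest reaching use in that block and rewrite every recorded
| | // use in the block to the copy's result.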
| 2615 | using RUBType = |
| 2616 | std::pair<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>; |
| 2617 | for (const RUBType &RUBlockEntry : ReachingUseTracker) {
| 2618 | using RUDType = std::pair<Register, SmallPtrSet<MachineOperand *, 8>>; |
| 2619 | for (const RUDType &RUDst : RUBlockEntry.second) {
| 2620 | MachineOperand *OpBegin = *RUDst.second.begin(); |
| 2621 | SlotIndex InstPt = DAG.LIS->getInstructionIndex(Instr: *OpBegin->getParent()); |
| 2622 | |
| 2623 | // Find the earliest use in this block. |
| 2624 | for (MachineOperand *User : RUDst.second) { |
| 2625 | SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(Instr: *User->getParent()); |
| 2626 | if (SlotIndex::isEarlierInstr(A: NewInstPt, B: InstPt)) |
| 2627 | InstPt = NewInstPt; |
| 2628 | } |
| 2629 | |
| 2630 | const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(Reg: RUDst.first); |
| 2631 | const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(SRC: DstRC); |
| 2632 | Register NewUseReg = DAG.MRI.createVirtualRegister(RegClass: VGPRRC); |
| 2633 | MachineInstr *UseInst = DAG.LIS->getInstructionFromIndex(index: InstPt); |
| 2634 | |
| 2635 | MachineInstrBuilder VGPRCopy = |
| 2636 | BuildMI(BB&: *UseInst->getParent(), I: UseInst->getIterator(), |
| 2637 | MIMD: UseInst->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY)) |
| 2638 | .addDef(RegNo: NewUseReg, Flags: {}, SubReg: 0) |
| 2639 | .addUse(RegNo: RUDst.first, Flags: {}, SubReg: 0); |
| 2640 | DAG.LIS->InsertMachineInstrInMaps(MI&: *VGPRCopy); |
| 2641 | |
| 2642 | // If this UseInst was the first MI in the region, update the region |
| 2643 | // boundaries. |
| 2644 | DenseMap<MachineInstr *, unsigned>::iterator FI = |
| 2645 | FirstMIToRegion.find(Val: UseInst); |
| 2646 | if (FI != FirstMIToRegion.end()) { |
| 2647 | unsigned UpdateRegion = FI->second; |
| 2648 | DAG.Regions[UpdateRegion].first = VGPRCopy; |
| 2649 | FirstMIToRegion.erase(Val: UseInst); |
| 2650 | } |
| 2651 | |
| 2652 | // Replace the operand for all users. |
| 2653 | for (MachineOperand *User : RUDst.second) { |
| 2654 | User->setReg(NewUseReg); |
| 2655 | } |
| 2656 | |
| 2657 | // Track the copy source operand for replacement. |
| 2658 | ReplaceMap[RUDst.first].insert(V: &VGPRCopy->getOperand(i: 1)); |
| 2659 | } |
| 2660 | } |
| 2661 | |
| 2662 | // We may have needed to insert copies after the reaching defs of the MFMAs. |
| 2663 | // Replace the original register with the result of the copy for all relevant |
| 2664 | // operands. |
| 2665 | for (std::pair<Register, Register> NewDef : RedefMap) { |
| 2666 | Register OldReg = NewDef.first; |
| 2667 | Register NewReg = NewDef.second; |
| 2668 | |
| 2669 | // Replace the register for any associated operand in the MFMA chain. |
| 2670 | for (MachineOperand *ReplaceOp : ReplaceMap[OldReg]) |
| 2671 | ReplaceOp->setReg(NewReg); |
| 2672 | } |
| 2673 | |
| 2674 | // Finally, do the reclassification of the MFMA registers. |
| 2675 | for (Register RewriteReg : RewriteRegs) { |
| 2676 | Register RegToRewrite = RewriteReg; |
| 2677 | |
| 2678 | // Be sure to update the replacement register and not the original. |
| 2679 | DenseMap<Register, Register>::iterator RI = RedefMap.find(Val: RewriteReg); |
| 2680 | if (RI != RedefMap.end()) |
| 2681 | RegToRewrite = RI->second; |
| 2682 | |
| 2683 | const TargetRegisterClass *CurrRC = DAG.MRI.getRegClass(Reg: RegToRewrite); |
| 2684 | const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(SRC: CurrRC); |
| 2685 | |
| 2686 | DAG.MRI.setRegClass(Reg: RegToRewrite, RC: AGPRRC); |
| 2687 | } |
| 2688 | |
| 2689 | // Bulk update the LIS. |
| 2690 | DAG.LIS->reanalyze(MF&: DAG.MF); |
| 2691 | // Live-ins may have been modified due to the cross-RC copies.
| 2692 | RegionPressureMap LiveInUpdater(&DAG, false); |
| 2693 | LiveInUpdater.buildLiveRegMap(); |
| 2694 | |
| 2695 | for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) |
| 2696 | DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(RegionIdx: Region); |
| 2697 | |
| 2698 | DAG.Pressure[RegionIdx] = DAG.getRealRegPressure(RegionIdx); |
| 2699 | |
| 2700 | return true; |
| 2701 | } |
| 2702 | |
| 2703 | unsigned PreRARematStage::getStageTargetOccupancy() const { |
| 2704 | return TargetOcc ? *TargetOcc : MFI.getMinWavesPerEU(); |
| 2705 | } |
| 2706 | |
| 2707 | bool PreRARematStage::setObjective() { |
| 2708 | const Function &F = MF.getFunction(); |
| 2709 | |
| 2710 | // Set up "spilling targets" for all regions. |
| 2711 | unsigned MaxSGPRs = ST.getMaxNumSGPRs(F); |
| 2712 | unsigned MaxVGPRs = ST.getMaxNumVGPRs(F); |
| 2713 | bool HasVectorRegisterExcess = false; |
| 2714 | for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { |
| 2715 | const GCNRegPressure &RP = DAG.Pressure[I]; |
| 2716 | GCNRPTarget &Target = RPTargets.emplace_back(Args&: MaxSGPRs, Args&: MaxVGPRs, Args&: MF, Args: RP); |
| 2717 | if (!Target.satisfied()) |
| 2718 | TargetRegions.set(I); |
| 2719 | HasVectorRegisterExcess |= Target.hasVectorRegisterExcess(); |
| 2720 | } |
| 2721 | |
| 2722 | if (HasVectorRegisterExcess || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) { |
| 2723 | // In addition to register usage being above addressable limits, occupancy |
| 2724 | // below the minimum is considered like "spilling" as well. |
| 2725 | TargetOcc = std::nullopt; |
| 2726 | } else { |
| 2727 | // There is no spilling and room to improve occupancy; set up "increased |
| 2728 | // occupancy targets" for all regions. |
| 2729 | TargetOcc = DAG.MinOccupancy + 1; |
| 2730 | const unsigned VGPRBlockSize = MFI.getDynamicVGPRBlockSize(); |
| 2731 | MaxSGPRs = ST.getMaxNumSGPRs(WavesPerEU: *TargetOcc, Addressable: false); |
| 2732 | MaxVGPRs = ST.getMaxNumVGPRs(WavesPerEU: *TargetOcc, DynamicVGPRBlockSize: VGPRBlockSize); |
| 2733 | for (auto [I, Target] : enumerate(First&: RPTargets)) { |
| 2734 | Target.setTarget(NumSGPRs: MaxSGPRs, NumVGPRs: MaxVGPRs); |
| 2735 | if (!Target.satisfied()) |
| 2736 | TargetRegions.set(I); |
| 2737 | } |
| 2738 | } |
| 2739 | |
| 2740 | return TargetRegions.any(); |
| 2741 | } |
| 2742 | |
| 2743 | bool PreRARematStage::collectRematRegs( |
| 2744 | const DenseMap<MachineInstr *, unsigned> &MIRegion) { |
| 2745 | // We need up-to-date live-out info to query live-out register masks in
| 2746 | // regions containing rematerializable instructions. |
| 2747 | DAG.RegionLiveOuts.buildLiveRegMap(); |
| 2748 | |
| 2749 | // Set of registers already marked for potential rematerialization; used to
| 2750 | // avoid rematerialization chains. |
| 2751 | SmallSet<Register, 4> MarkedRegs; |
| 2752 | auto IsMarkedForRemat = [&MarkedRegs](const MachineOperand &MO) -> bool { |
| 2753 | return MO.isReg() && MarkedRegs.contains(V: MO.getReg()); |
| 2754 | }; |
| 2755 | |
| 2756 | // Identify rematerializable instructions in the function. |
| 2757 | for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { |
| 2758 | RegionBoundaries Bounds = DAG.Regions[I]; |
| 2759 | for (auto MI = Bounds.first; MI != Bounds.second; ++MI) { |
| 2760 | // The instruction must be rematerializable. |
| 2761 | MachineInstr &DefMI = *MI; |
| 2762 | if (!isReMaterializable(MI: DefMI)) |
| 2763 | continue; |
| 2764 | |
| 2765 | // We only support rematerializing virtual registers with one |
| 2766 | // definition. |
| 2767 | Register Reg = DefMI.getOperand(i: 0).getReg(); |
| 2768 | if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(RegNo: Reg)) |
| 2769 | continue; |
| 2770 | |
| 2771 | // We only care to rematerialize the instruction if it has a single |
| 2772 | // non-debug user in a different region. |
| 2773 | // FIXME: Allow rematerializations with multiple uses. This should be |
| 2774 | // relatively easy to support using the current cost model. |
| 2775 | MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(RegNo: Reg); |
| 2776 | if (!UseMI) |
| 2777 | continue; |
| 2778 | auto UseRegion = MIRegion.find(Val: UseMI); |
| 2779 | if (UseRegion == MIRegion.end() || UseRegion->second == I) |
| 2780 | continue; |
| 2781 | |
| 2782 | // Do not rematerialize an instruction if it uses or is used by an |
| 2783 | // instruction that we have designated for rematerialization. |
| 2784 | // FIXME: Allow for rematerialization chains: this requires 1. updating |
| 2785 | // remat points to account for uses that are rematerialized, and 2. |
| 2786 | // either rematerializing the candidates in careful ordering, or |
| 2787 | // deferring the MBB RP walk until the entire chain has been |
| 2788 | // rematerialized. |
| 2789 | const MachineOperand &UseMO = UseMI->getOperand(i: 0); |
| 2790 | if (IsMarkedForRemat(UseMO) || |
| 2791 | llvm::any_of(Range: DefMI.operands(), P: IsMarkedForRemat)) |
| 2792 | continue; |
| 2793 | |
| 2794 | // Do not rematerialize an instruction if it uses registers that aren't
| 2795 | // available at its use. This ensures that we are not extending any live |
| 2796 | // range while rematerializing. |
| 2797 | SlotIndex UseIdx = DAG.LIS->getInstructionIndex(Instr: *UseMI).getRegSlot(EC: true); |
| 2798 | if (!VirtRegAuxInfo::allUsesAvailableAt(MI: &DefMI, UseIdx, LIS: *DAG.LIS, MRI: DAG.MRI, |
| 2799 | TII: *DAG.TII)) |
| 2800 | continue; |
| 2801 | |
| 2802 | // Add the instruction to the rematerializable list. |
| 2803 | MarkedRegs.insert(V: Reg); |
| 2804 | RematRegs.emplace_back(Args: &DefMI, Args&: UseMI, Args&: DAG, Args: MIRegion); |
| 2805 | } |
| 2806 | } |
| 2807 | |
| 2808 | return !RematRegs.empty(); |
| 2809 | } |
| 2810 | |
| 2811 | PreRARematStage::RematReg::RematReg( |
| 2812 | MachineInstr *DefMI, MachineInstr *UseMI, GCNScheduleDAGMILive &DAG, |
| 2813 | const DenseMap<MachineInstr *, unsigned> &MIRegion) |
| 2814 | : DefMI(DefMI), UseMI(UseMI), LiveIn(DAG.Regions.size()), |
| 2815 | LiveOut(DAG.Regions.size()), Live(DAG.Regions.size()), |
| 2816 | DefRegion(MIRegion.at(Val: DefMI)), UseRegion(MIRegion.at(Val: UseMI)) { |
| 2817 | |
| 2818 | // Mark regions in which the rematerializable register is live. |
| 2819 | Register Reg = getReg(); |
| 2820 | for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { |
| 2821 | auto LiveInIt = DAG.LiveIns[I].find(Val: Reg); |
| 2822 | if (LiveInIt != DAG.LiveIns[I].end()) |
| 2823 | LiveIn.set(I); |
| 2824 | const auto &LiveOuts = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: I); |
| 2825 | if (auto LiveOutIt = LiveOuts.find(Val: Reg); LiveOutIt != LiveOuts.end()) |
| 2826 | LiveOut.set(I); |
| 2827 | } |
| 2828 | Live |= LiveIn; |
| 2829 | Live |= LiveOut; |
| 2830 | Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: DefRegion).at(Val: Reg); |
| 2831 | } |
| 2832 | |
| 2833 | bool PreRARematStage::RematReg::maybeBeneficial( |
| 2834 | const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets) const { |
| 2835 | Register Reg = getReg(); |
| 2836 | for (unsigned I : TargetRegions.set_bits()) { |
| 2837 | if (Live[I] && RPTargets[I].isSaveBeneficial(Reg)) |
| 2838 | return true; |
| 2839 | } |
| 2840 | return false; |
| 2841 | } |
| 2842 | |
| 2843 | void PreRARematStage::RematReg::insertMI(unsigned RegionIdx, |
| 2844 | MachineInstr *RematMI, |
| 2845 | GCNScheduleDAGMILive &DAG) const { |
| 2846 | RegionBoundaries &Bounds = DAG.Regions[RegionIdx]; |
| 2847 | if (Bounds.first == std::next(x: MachineBasicBlock::iterator(RematMI))) |
| 2848 | Bounds.first = RematMI; |
| 2849 | DAG.LIS->InsertMachineInstrInMaps(MI&: *RematMI); |
| 2850 | DAG.LIS->createAndComputeVirtRegInterval(Reg: RematMI->getOperand(i: 0).getReg()); |
| 2851 | } |
| 2852 | |
| 2853 | PreRARematStage::ScoredRemat::FreqInfo::FreqInfo( |
| 2854 | MachineFunction &MF, const GCNScheduleDAGMILive &DAG) { |
| 2855 | assert(DAG.MLI && "MLI not defined in DAG" ); |
| 2856 | MachineBranchProbabilityInfo MBPI; |
| 2857 | MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI); |
| 2858 | |
| 2859 | const unsigned NumRegions = DAG.Regions.size(); |
| 2860 | MinFreq = MBFI.getEntryFreq().getFrequency(); |
| 2861 | MaxFreq = 0; |
| 2862 | Regions.reserve(N: NumRegions); |
| 2863 | for (unsigned I = 0; I < NumRegions; ++I) { |
| 2864 | MachineBasicBlock *MBB = DAG.Regions[I].first->getParent(); |
| 2865 | uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency(); |
| 2866 | Regions.push_back(Elt: BlockFreq); |
| 2867 | if (BlockFreq && BlockFreq < MinFreq) |
| 2868 | MinFreq = BlockFreq; |
| 2869 | else if (BlockFreq > MaxFreq) |
| 2870 | MaxFreq = BlockFreq; |
| 2871 | } |
| 2872 | if (!MinFreq) |
| 2873 | return; |
| 2874 | |
| 2875 | // Scale everything down if frequencies are high. |
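| | // For example, with a hypothetical ScaleFactor of 1000, frequencies of
| | // 5'000'000 and 2'000'000'000 would be scaled down to 5'000 and 2'000'000
| | // once the smallest non-zero frequency reaches 1'000'000.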
| 2876 | if (MinFreq >= ScaleFactor * ScaleFactor) { |
| 2877 | for (uint64_t &Freq : Regions) |
| 2878 | Freq /= ScaleFactor; |
| 2879 | MinFreq /= ScaleFactor; |
| 2880 | MaxFreq /= ScaleFactor; |
| 2881 | } |
| 2882 | } |
| 2883 | |
| 2884 | PreRARematStage::ScoredRemat::ScoredRemat(RematReg *Remat, const FreqInfo &Freq, |
| 2885 | const GCNScheduleDAGMILive &DAG) |
| 2886 | : Remat(Remat), NumRegs(getNumRegs(DAG)), FreqDiff(getFreqDiff(Freq)) {} |
| 2887 | |
| 2888 | unsigned PreRARematStage::ScoredRemat::getNumRegs( |
| 2889 | const GCNScheduleDAGMILive &DAG) const { |
| 2890 | const TargetRegisterClass &RC = *DAG.MRI.getRegClass(Reg: Remat->getReg()); |
| 2891 | unsigned RegSize = DAG.TRI->getRegSizeInBits(RC); |
| 2892 | if (unsigned SubIdx = Remat->DefMI->getOperand(i: 0).getSubReg()) { |
| 2893 | // The following may return -1 (i.e., a large unsigned number) on indices |
| 2894 | // that may be used to access subregisters of multiple sizes; in such cases |
| 2895 | // fall back on the size derived from the register class.
| 2896 | unsigned SubRegSize = DAG.TRI->getSubRegIdxSize(Idx: SubIdx); |
| 2897 | if (SubRegSize < RegSize) |
| 2898 | RegSize = SubRegSize; |
| 2899 | } |
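| | // Express the size in units of 32-bit registers, e.g. a 128-bit
| | // (sub)register counts as 4.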
| 2900 | return divideCeil(Numerator: RegSize, Denominator: 32); |
| 2901 | } |
| 2902 | |
| 2903 | int64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const { |
| 2904 | // Get frequencies of defining and using regions. A rematerialization from the |
| 2905 | // least frequent region to the most frequent region will yield the greatest |
| 2906 | // latency penalty and therefore should get the minimum score. Conversely, a
| 2907 | // rematerialization in the other direction should get maximum score. Default |
| 2908 | // to values that will yield the worst possible score given known frequencies |
| 2909 | // in order to penalize rematerializations from or into regions whose |
| 2910 | // frequency is unknown. |
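| | // For example, with hypothetical frequencies of 8 for the defining region
| | // and 1 for the using region, the score is 8 - 1 = 7 (work moves out of hot
| | // code); the opposite direction scores 1 - 8 = -7.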
| 2911 | int64_t DefOrMin = std::max(a: Freq.Regions[Remat->DefRegion], b: Freq.MinFreq); |
| 2912 | int64_t UseOrMax = Freq.Regions[Remat->UseRegion]; |
| 2913 | if (!UseOrMax) |
| 2914 | UseOrMax = Freq.MaxFreq; |
| 2915 | return DefOrMin - UseOrMax; |
| 2916 | } |
| 2917 | |
| 2918 | void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions, |
| 2919 | ArrayRef<GCNRPTarget> RPTargets, |
| 2920 | const FreqInfo &FreqInfo, |
| 2921 | bool ReduceSpill) { |
| 2922 | MaxFreq = 0; |
| 2923 | RegionImpact = 0; |
| 2924 | for (unsigned I : TargetRegions.set_bits()) { |
| 2925 | if (!Remat->Live[I] || !RPTargets[I].isSaveBeneficial(Reg: Remat->getReg())) |
| 2926 | continue; |
| 2927 | bool UnusedLT = Remat->isUnusedLiveThrough(I); |
| 2928 | |
| 2929 | // Regions in which RP is guaranteed to decrease have more weight. |
| 2930 | RegionImpact += UnusedLT ? 2 : 1; |
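| | // For example (hypothetical counts): a target region where the register is
| | // an unused live-through adds 2 and one where it is merely live adds 1; the
| | // sum is scaled by NumRegs below.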
| 2931 | |
| 2932 | if (ReduceSpill) { |
| 2933 | uint64_t Freq = FreqInfo.Regions[I]; |
| 2934 | if (!UnusedLT) { |
| 2935 | // Apply a frequency penalty in regions in which we are not sure that RP |
| 2936 | // will decrease. |
| 2937 | Freq /= 2; |
| 2938 | } |
| 2939 | MaxFreq = std::max(a: MaxFreq, b: Freq); |
| 2940 | } |
| 2941 | } |
| 2942 | RegionImpact *= NumRegs; |
| 2943 | } |
| 2944 | |
| 2945 | void PreRARematStage::rematerialize(const RematReg &Remat, |
| 2946 | BitVector &RecomputeRP, |
| 2947 | RollbackInfo *Rollback) { |
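| | // Illustrative sketch (hypothetical virtual registers): given
| | //   [DefRegion]  %0 = <rematerializable def>
| | //   [UseRegion]  ... = use of %0
| | // the def is re-emitted as a clone %1 right before the use and the use is
| | // rewritten to %1. The original def is then either deleted outright or, when
| | // a rollback may be needed, turned into a DBG_VALUE placeholder whose read
| | // register operands are cleared.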
| 2948 | const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo(); |
| 2949 | MachineInstr &DefMI = *Remat.DefMI; |
| 2950 | Register Reg = DefMI.getOperand(i: 0).getReg(); |
| 2951 | Register NewReg = DAG.MRI.cloneVirtualRegister(VReg: Reg); |
| 2952 | |
| 2953 | // Rematerialize the register in the region where it is used. |
| 2954 | MachineBasicBlock::iterator InsertPos = Remat.UseMI; |
| 2955 | TII->reMaterialize(MBB&: *InsertPos->getParent(), MI: InsertPos, DestReg: NewReg, SubIdx: 0, Orig: DefMI); |
| 2956 | MachineInstr *RematMI = &*std::prev(x: InsertPos); |
| 2957 | Remat.UseMI->substituteRegister(FromReg: Reg, ToReg: NewReg, SubIdx: 0, RegInfo: *DAG.TRI); |
| 2958 | Remat.insertMI(RegionIdx: Remat.UseRegion, RematMI, DAG); |
| 2959 | if (Rollback) { |
| 2960 | Rollback->RematMI = RematMI; |
| 2961 | // Make the original MI a debug value so that it does not influence |
| 2962 | // scheduling and replace all read registers with a sentinel register to |
| 2963 | // prevent operands from appearing in use-lists of other MIs during LIS
| 2964 | // updates. Store mappings between operand indices and original registers |
| 2965 | // for potential rollback. |
| 2966 | DefMI.setDesc(TII->get(Opcode: TargetOpcode::DBG_VALUE)); |
| 2967 | for (auto [Idx, MO] : enumerate(First: Remat.DefMI->operands())) { |
| 2968 | if (MO.isReg() && MO.readsReg()) { |
| 2969 | Rollback->RegMap.insert(KV: {Idx, MO.getReg()}); |
| 2970 | MO.setReg(Register()); |
| 2971 | } |
| 2972 | } |
| 2973 | } else { |
| 2974 | // Just delete the original instruction if it cannot be rolled back. |
| 2975 | DAG.deleteMI(RegionIdx: Remat.DefRegion, MI: &DefMI); |
| 2976 | } |
| 2977 | |
| 2978 | #ifdef EXPENSIVE_CHECKS |
| 2979 | // All uses are known to be available / live at the remat point. Thus, |
| 2980 | // the uses should already be live in to the using region. |
| 2981 | for (MachineOperand &MO : DefMI.operands()) { |
| 2982 | if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) |
| 2983 | continue; |
| 2984 | |
| 2985 | Register UseReg = MO.getReg(); |
| 2986 | if (!UseReg.isVirtual()) |
| 2987 | continue; |
| 2988 | |
| 2989 | LiveInterval &LI = DAG.LIS->getInterval(UseReg); |
| 2990 | LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg()); |
| 2991 | if (LI.hasSubRanges() && MO.getSubReg()) |
| 2992 | LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg()); |
| 2993 | |
| 2994 | LaneBitmask LiveInMask = DAG.LiveIns[Remat.UseRegion].at(UseReg); |
| 2995 | LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM); |
| 2996 | // If this register has lanes not covered by the LiveIns, be sure they |
| 2997 | // do not map to any subrange. ref: |
| 2998 | // machine-scheduler-sink-trivial-remats.mir::omitted_subrange |
| 2999 | if (UncoveredLanes.any()) { |
| 3000 | assert(LI.hasSubRanges()); |
| 3001 | for (LiveInterval::SubRange &SR : LI.subranges()) |
| 3002 | assert((SR.LaneMask & UncoveredLanes).none()); |
| 3003 | } |
| 3004 | } |
| 3005 | #endif |
| 3006 | |
| 3007 | // Remove the register from all regions where it is a live-in or live-out |
| 3008 | // and adjust RP targets. The save is guaranteed in regions in which the |
| 3009 | // register is live-through and unused but optimistic in all other regions |
| 3010 | // where the register is live. |
| 3011 | for (unsigned I : Remat.Live.set_bits()) { |
| 3012 | RPTargets[I].saveReg(Reg, Mask: Remat.Mask, MRI: DAG.MRI); |
| 3013 | DAG.LiveIns[I].erase(Val: Reg); |
| 3014 | DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: I).erase(Val: Reg); |
| 3015 | if (!Remat.isUnusedLiveThrough(I)) |
| 3016 | RecomputeRP.set(I); |
| 3017 | } |
| 3018 | |
| 3019 | RescheduleRegions |= Remat.Live; |
| 3020 | } |
| 3021 | |
| 3022 | void PreRARematStage::rollback(const RollbackInfo &Rollback) const { |
| 3023 | const auto &[Remat, RematMI, RegMap] = Rollback; |
| 3024 | |
| 3025 | // Restore the original defining instruction to its original state. |
| 3026 | Remat->DefMI->setDesc(DAG.TII->get(Opcode: RematMI->getOpcode())); |
| 3027 | for (const auto &[MOIdx, Reg] : RegMap) |
| 3028 | Remat->DefMI->getOperand(i: MOIdx).setReg(Reg); |
| 3029 | |
| 3030 | // Switch back to using the original register and delete the |
| 3031 | // rematerialization. |
| 3032 | Register Reg = RematMI->getOperand(i: 0).getReg(); |
| 3033 | Register OriginalReg = Remat->DefMI->getOperand(i: 0).getReg(); |
| 3034 | Remat->UseMI->substituteRegister(FromReg: Reg, ToReg: OriginalReg, SubIdx: 0, RegInfo: *DAG.TRI); |
| 3035 | REMAT_DEBUG(dbgs() << '[' << Remat->UseRegion |
| 3036 | << "] Deleting rematerialization " << *RematMI); |
| 3037 | DAG.deleteMI(RegionIdx: Remat->UseRegion, MI: RematMI); |
| 3038 | |
| 3039 | // Re-add the defined register as a live-in/live-out in all regions it used to |
| 3040 | // be one in. |
| 3041 | std::pair<Register, LaneBitmask> LiveReg(OriginalReg, Remat->Mask); |
| 3042 | for (unsigned I : Remat->LiveIn.set_bits()) |
| 3043 | DAG.LiveIns[I].insert(KV: LiveReg); |
| 3044 | for (unsigned I : Remat->LiveOut.set_bits()) |
| 3045 | DAG.RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx: I).insert(KV: LiveReg); |
| 3046 | } |
| 3047 | |
| 3048 | void PreRARematStage::commitRematerializations() const { |
| 3049 | REMAT_DEBUG(dbgs() << "Committing all rematerializations\n" );
| 3050 | for (const RollbackInfo &Rollback : Rollbacks) |
| 3051 | DAG.deleteMI(RegionIdx: Rollback.Remat->DefRegion, MI: Rollback.Remat->DefMI); |
| 3052 | } |
| 3053 | |
| 3054 | void PreRARematStage::unsetSatisifedRPTargets(const BitVector &Regions) { |
| 3055 | for (unsigned I : Regions.set_bits()) { |
| 3056 | if (TargetRegions[I] && RPTargets[I].satisfied()) { |
| 3057 | REMAT_DEBUG(dbgs() << " [" << I << "] Target reached!\n" ); |
| 3058 | TargetRegions.reset(Idx: I); |
| 3059 | } |
| 3060 | } |
| 3061 | } |
| 3062 | |
| 3063 | bool PreRARematStage::updateAndVerifyRPTargets(const BitVector &Regions) { |
| 3064 | bool TooOptimistic = false; |
| 3065 | for (unsigned I : Regions.set_bits()) { |
| 3066 | GCNRPTarget &Target = RPTargets[I]; |
| 3067 | Target.setRP(DAG.getRealRegPressure(RegionIdx: I)); |
| 3068 | |
| 3069 | // Since we were optimistic in assessing RP decreases in these regions, we |
| 3070 | // may need to re-mark the region as a target region if RP didn't decrease
| 3071 | // as expected. |
| 3072 | if (!TargetRegions[I] && !Target.satisfied()) { |
| 3073 | REMAT_DEBUG(dbgs() << " [" << I << "] Incorrect RP estimation\n" ); |
| 3074 | TooOptimistic = true; |
| 3075 | TargetRegions.set(I); |
| 3076 | } |
| 3077 | } |
| 3078 | return TooOptimistic; |
| 3079 | } |
| 3080 | |
| 3081 | // Copied from MachineLICM |
| 3082 | bool PreRARematStage::isReMaterializable(const MachineInstr &MI) { |
| 3083 | if (!DAG.TII->isReMaterializable(MI)) |
| 3084 | return false; |
| 3085 | |
| 3086 | for (const MachineOperand &MO : MI.all_uses()) { |
| 3087 | // We can't remat physreg uses, unless it is a constant or an ignorable |
| 3088 | // use (e.g. implicit exec use on VALU instructions) |
| 3089 | if (MO.getReg().isPhysical()) { |
| 3090 | if (DAG.MRI.isConstantPhysReg(PhysReg: MO.getReg()) || DAG.TII->isIgnorableUse(MO)) |
| 3091 | continue; |
| 3092 | return false; |
| 3093 | } |
| 3094 | } |
| 3095 | |
| 3096 | return true; |
| 3097 | } |
| 3098 | |
| 3099 | void PreRARematStage::finalizeGCNSchedStage() { |
| 3100 | // We consider that reducing spilling is always beneficial, so we never
| 3101 | // roll back rematerializations or revert scheduling in such cases.
| 3102 | if (!TargetOcc) |
| 3103 | return; |
| 3104 | |
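| | // From here on the stage targeted an occupancy increase: decide whether to
| | // keep the rescheduling, keep only the rematerializations, or roll
| | // everything back.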
| 3105 | // When increasing occupancy, it is possible that re-scheduling is not able to |
| 3106 | // achieve the target occupancy in all regions, in which case re-scheduling in |
| 3107 | // all regions should be reverted. |
| 3108 | if (DAG.MinOccupancy >= *TargetOcc) { |
| 3109 | commitRematerializations(); |
| 3110 | return; |
| 3111 | } |
| 3112 | for (const auto &[RegionIdx, OrigMIOrder, MaxPressure] : RegionReverts) { |
| 3113 | REMAT_DEBUG(dbgs() << "Reverting re-scheduling in region " << RegionIdx |
| 3114 | << '\n'); |
| 3115 | DAG.Pressure[RegionIdx] = MaxPressure; |
| 3116 | modifyRegionSchedule(RegionIdx, MBB: RegionBB[RegionIdx], MIOrder: OrigMIOrder); |
| 3117 | } |
| 3118 | |
| 3119 | // It is possible that re-scheduling lowers occupancy below the one achieved
| 3120 | // through rematerializations alone, in which case we revert re-scheduling in
| 3121 | // all regions but do not roll back rematerializations. |
| 3122 | if (AchievedOcc >= *TargetOcc) { |
| 3123 | commitRematerializations(); |
| 3124 | DAG.setTargetOccupancy(AchievedOcc); |
| 3125 | return; |
| 3126 | } |
| 3127 | // Reset the target occupancy to what it was pre-rematerialization. |
| 3128 | DAG.setTargetOccupancy(*TargetOcc - 1); |
| 3129 | |
| 3130 | // Rollback, then recompute pressure in all affected regions. |
| 3131 | REMAT_DEBUG(dbgs() << "==== ROLLBACK ====\n" ); |
| 3132 | BitVector RecomputeRP(DAG.Regions.size()); |
| 3133 | DenseSet<Register> RecomputeLI; |
| 3134 | for (const RollbackInfo &Rollback : Rollbacks) { |
| 3135 | rollback(Rollback); |
| 3136 | RecomputeRP |= Rollback.Remat->Live; |
| 3137 | // Regenerate intervals for all register operands of rematerialized MIs as |
| 3138 | // slot indices may have changed slightly from before re-scheduling. |
| 3139 | for (MachineOperand &MO : Rollback.Remat->DefMI->operands()) { |
| 3140 | if (MO.isReg() && MO.getReg().isVirtual()) |
| 3141 | RecomputeLI.insert(V: MO.getReg()); |
| 3142 | } |
| 3143 | } |
| 3144 | for (Register Reg : RecomputeLI) { |
| 3145 | DAG.LIS->removeInterval(Reg); |
| 3146 | DAG.LIS->createAndComputeVirtRegInterval(Reg); |
| 3147 | } |
| 3148 | for (unsigned I : RecomputeRP.set_bits()) |
| 3149 | DAG.Pressure[I] = DAG.getRealRegPressure(RegionIdx: I); |
| 3150 | |
| 3151 | GCNSchedStage::finalizeGCNSchedStage(); |
| 3152 | } |
| 3153 | |
| 3154 | void GCNScheduleDAGMILive::deleteMI(unsigned RegionIdx, MachineInstr *MI) { |
| 3155 | // It's not possible for the deleted instruction to be the upper region
| 3156 | // boundary since we don't delete region terminators.
| 3157 | if (Regions[RegionIdx].first == MI) |
| 3158 | Regions[RegionIdx].first = std::next(x: MachineBasicBlock::iterator(MI)); |
| 3159 | LIS->removeInterval(Reg: MI->getOperand(i: 0).getReg()); |
| 3160 | LIS->RemoveMachineInstrFromMaps(MI&: *MI); |
| 3161 | MI->eraseFromParent(); |
| 3162 | } |
| 3163 | |
| 3164 | void GCNScheduleDAGMILive::setTargetOccupancy(unsigned TargetOccupancy) { |
| 3165 | MinOccupancy = TargetOccupancy; |
| 3166 | if (MFI.getOccupancy() < TargetOccupancy) |
| 3167 | MFI.increaseOccupancy(MF, Limit: MinOccupancy); |
| 3168 | else |
| 3169 | MFI.limitOccupancy(Limit: MinOccupancy); |
| 3170 | } |
| 3171 | |
| 3172 | static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { |
| 3173 | const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII); |
| 3174 | return any_of(Range&: *DAG, P: [SII](MachineBasicBlock::iterator MI) { |
| 3175 | return SII->isIGLPMutationOnly(Opcode: MI->getOpcode()); |
| 3176 | }); |
| 3177 | } |
| 3178 | |
| 3179 | GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive( |
| 3180 | MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S, |
| 3181 | bool RemoveKillFlags) |
| 3182 | : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {} |
| 3183 | |
| 3184 | void GCNPostScheduleDAGMILive::schedule() { |
| 3185 | HasIGLPInstrs = hasIGLPInstrs(DAG: this); |
| 3186 | if (HasIGLPInstrs) { |
| 3187 | SavedMutations.clear(); |
| 3188 | SavedMutations.swap(x&: Mutations); |
| 3189 | addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PostRA)); |
| 3190 | } |
| 3191 | |
| 3192 | ScheduleDAGMI::schedule(); |
| 3193 | } |
| 3194 | |
| 3195 | void GCNPostScheduleDAGMILive::finalizeSchedule() { |
| 3196 | if (HasIGLPInstrs) |
| 3197 | SavedMutations.swap(x&: Mutations); |
| 3198 | |
| 3199 | ScheduleDAGMI::finalizeSchedule(); |
| 3200 | } |
| 3201 | |