| 1 | //===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
/// \file This file contains a DAG scheduling mutation that adjusts latency on:
///  1. Barrier edges between ATOMIC_FENCE instructions and preceding
///     memory accesses potentially affected by the fence.
///     This encourages the scheduling of more instructions before
///     ATOMIC_FENCE instructions. ATOMIC_FENCE instructions may
///     introduce wait counting or indicate an impending S_BARRIER
///     wait. Having more instructions in-flight across these
///     constructs improves latency hiding.
///  2. Barrier edges from S_BARRIER_SIGNAL to S_BARRIER_WAIT.
///     This encourages independent work to be scheduled between
///     signal and wait, hiding barrier synchronization latency.
///  3. Data edges on TENSORcnt / ASYNCcnt from instructions flagged
///     TENSOR_CNT / ASYNC_CNT to S_WAIT_TENSORCNT / S_WAIT_ASYNCCNT.
///     When the wait's count operand shows that the wait does not have
///     to cover a given predecessor, the edge latency is set to 1.
| 20 | // |
| 21 | //===----------------------------------------------------------------------===// |
| 22 | |
| 23 | #include "AMDGPUBarrierLatency.h" |
| 24 | #include "GCNSubtarget.h" |
| 25 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
| 26 | #include "SIInstrInfo.h" |
| 27 | #include "llvm/CodeGen/ScheduleDAGInstrs.h" |
| 28 | #include "llvm/Support/CommandLine.h" |
| 29 | |
| 30 | using namespace llvm; |
| 31 | |
| 32 | static cl::opt<unsigned> BarrierSignalWaitLatencyOpt( |
| 33 | "amdgpu-barrier-signal-wait-latency" , |
| 34 | cl::desc("Synthetic latency between S_BARRIER_SIGNAL and S_BARRIER_WAIT " |
| 35 | "to encourage scheduling independent work between them" ), |
| 36 | cl::init(Val: 16), cl::Hidden); |
| 37 | |
| 38 | namespace { |
| 39 | |
| 40 | class BarrierLatency : public ScheduleDAGMutation { |
| 41 | private: |
| 42 | SmallSet<SyncScope::ID, 4> IgnoredScopes; |
| 43 | |
| 44 | public: |
| 45 | BarrierLatency(MachineFunction *MF) { |
| 46 | LLVMContext &Context = MF->getFunction().getContext(); |
| 47 | IgnoredScopes.insert(V: SyncScope::SingleThread); |
| 48 | IgnoredScopes.insert(V: Context.getOrInsertSyncScopeID(SSN: "wavefront" )); |
| 49 | IgnoredScopes.insert(V: Context.getOrInsertSyncScopeID(SSN: "wavefront-one-as" )); |
| 50 | IgnoredScopes.insert(V: Context.getOrInsertSyncScopeID(SSN: "singlethread-one-as" )); |
| 51 | |
| 52 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
| 53 | if (!ST.requiresWaitOnWorkgroupReleaseFence()) { |
| 54 | // Prior to GFX10 workgroup scope does not normally require waitcnts |
| 55 | IgnoredScopes.insert(V: Context.getOrInsertSyncScopeID(SSN: "workgroup" )); |
| 56 | } |
| 57 | } |
| 58 | void apply(ScheduleDAGInstrs *DAG) override; |
| 59 | }; |
| 60 | |
| 61 | void addLatencyToEdge(SDep &PredDep, SUnit &SU, unsigned Latency) { |
| 62 | SUnit *PredSU = PredDep.getSUnit(); |
| 63 | SDep ForwardD = PredDep; |
| 64 | ForwardD.setSUnit(&SU); |
| 65 | for (SDep &SuccDep : PredSU->Succs) { |
| 66 | if (SuccDep == ForwardD) { |
| 67 | SuccDep.setLatency(SuccDep.getLatency() + Latency); |
| 68 | break; |
| 69 | } |
| 70 | } |
| 71 | PredDep.setLatency(PredDep.getLatency() + Latency); |
| 72 | PredSU->setDepthDirty(); |
| 73 | SU.setDepthDirty(); |
| 74 | } |
| 75 | |
| 76 | void setLatencyForEdge(SDep &PredDep, SUnit &SU, unsigned Latency) { |
| 77 | SUnit *PredSU = PredDep.getSUnit(); |
| 78 | SDep ForwardD = PredDep; |
| 79 | ForwardD.setSUnit(&SU); |
| 80 | for (SDep &SuccDep : PredSU->Succs) { |
| 81 | if (SuccDep == ForwardD) { |
| 82 | SuccDep.setLatency(Latency); |
| 83 | break; |
| 84 | } |
| 85 | } |
| 86 | PredDep.setLatency(Latency); |
| 87 | PredSU->setDepthDirty(); |
| 88 | SU.setDepthDirty(); |
| 89 | } |
| 90 | |
| 91 | void BarrierLatency::apply(ScheduleDAGInstrs *DAG) { |
| 92 | const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(DAG->TII); |
| 93 | constexpr unsigned FenceLatency = 2000; |
| 94 | const unsigned BarrierSignalWaitLatency = BarrierSignalWaitLatencyOpt; |
| 95 | SmallVector<SUnit *, 8> RegionTDM; |
| 96 | SmallVector<SUnit *, 8> RegionAsync; |
| 97 | const TargetSchedModel *SchedModel = DAG->getSchedModel(); |
| 98 | |
| 99 | for (SUnit &SU : DAG->SUnits) { |
| 100 | const MachineInstr *MI = SU.getInstr(); |
| 101 | unsigned Op = MI->getOpcode(); |
| 102 | |
| 103 | if (Op == AMDGPU::ATOMIC_FENCE) { |
| 104 | // Update latency on barrier edges of ATOMIC_FENCE. |
| 105 | // Ignore scopes not expected to have any latency. |
| 106 | SyncScope::ID SSID = |
| 107 | static_cast<SyncScope::ID>(MI->getOperand(i: 1).getImm()); |
| 108 | if (IgnoredScopes.contains(V: SSID)) |
| 109 | continue; |
| 110 | |
| 111 | for (SDep &PredDep : SU.Preds) { |
| 112 | if (!PredDep.isBarrier()) |
| 113 | continue; |
| 114 | SUnit *PredSU = PredDep.getSUnit(); |
| 115 | MachineInstr *MI = PredSU->getInstr(); |
| 116 | // Only consider memory loads |
| 117 | if (!MI->mayLoad() || MI->mayStore()) |
| 118 | continue; |
| 119 | |
| 120 | addLatencyToEdge(PredDep, SU, |
| 121 | Latency: SchedModel ? SchedModel->computeInstrLatency(MI, UseDefaultDefLatency: false) |
| 122 | : FenceLatency); |
| 123 | } |
| 124 | } else if (Op == AMDGPU::S_BARRIER_WAIT) { |
| 125 | for (SDep &PredDep : SU.Preds) { |
| 126 | SUnit *PredSU = PredDep.getSUnit(); |
| 127 | const MachineInstr *PredMI = PredSU->getInstr(); |
| 128 | if (TII->isBarrierStart(Opcode: PredMI->getOpcode())) { |
| 129 | addLatencyToEdge(PredDep, SU, Latency: BarrierSignalWaitLatency); |
| 130 | } |
| 131 | } |
| 132 | } else if (TII->isLDSDMA(MI: *MI)) { |
| 133 | if (MI->getDesc().TSFlags & SIInstrFlags::TENSOR_CNT) |
| 134 | RegionTDM.push_back(Elt: &SU); |
| 135 | else if (MI->getDesc().TSFlags & SIInstrFlags::ASYNC_CNT) |
| 136 | RegionAsync.push_back(Elt: &SU); |
| 137 | } else if (Op == AMDGPU::S_WAIT_TENSORCNT || |
| 138 | Op == AMDGPU::S_WAIT_ASYNCCNT) { |
| 139 | auto needWaitFor = [&](SmallVectorImpl<SUnit *> &RegionLDSDMA, SUnit *SU, |
| 140 | int64_t Count) { |
| 141 | if (RegionLDSDMA.size() <= static_cast<uint64_t>(Count)) { |
| 142 | return false; |
| 143 | } |
| 144 | |
| 145 | int64_t Counter = 0; |
| 146 | auto I = RegionLDSDMA.rbegin(), E = RegionLDSDMA.rend(); |
| 147 | for (; I != E; I++) { |
| 148 | if (Counter >= Count) |
| 149 | return true; |
| 150 | |
| 151 | if (SU->NodeNum == (*I)->NodeNum) |
| 152 | return false; |
| 153 | |
| 154 | ++Counter; |
| 155 | } |
| 156 | llvm_unreachable("Malformed RegionLDSDMA" ); |
| 157 | }; |
| 158 | |
| 159 | int64_t WaitVal = MI->getOperand(i: 0).getImm(); |
| 160 | for (SDep &PredDep : SU.Preds) { |
| 161 | if (PredDep.getKind() != SDep::Kind::Data) |
| 162 | continue; |
| 163 | |
| 164 | Register DepReg = PredDep.getReg(); |
| 165 | Register LDSDMACnt = AMDGPU::TENSORcnt; |
| 166 | uint64_t LDSDMAFlags = SIInstrFlags::TENSOR_CNT; |
| 167 | if (Op == AMDGPU::S_WAIT_ASYNCCNT) { |
| 168 | LDSDMACnt = AMDGPU::ASYNCcnt; |
| 169 | LDSDMAFlags = SIInstrFlags::ASYNC_CNT; |
| 170 | } |
| 171 | |
| 172 | if (DepReg != LDSDMACnt) |
| 173 | continue; |
| 174 | |
| 175 | SUnit *PredSU = PredDep.getSUnit(); |
| 176 | |
| 177 | // The data dep can be carried by a non-LDSDMA SU |
| 178 | // (e.g. an intervening COPY or pseudo). Such predecessors are not |
| 179 | // tracked, so needWaitFor cannot reason about them. |
| 180 | if (!(PredSU->getInstr()->getDesc().TSFlags & LDSDMAFlags)) |
| 181 | continue; |
| 182 | |
| 183 | if (!needWaitFor(Op == AMDGPU::S_WAIT_ASYNCCNT ? RegionAsync |
| 184 | : RegionTDM, |
| 185 | PredSU, WaitVal)) { |
| 186 | setLatencyForEdge(PredDep, SU, Latency: 1); |
| 187 | } |
| 188 | } |
| 189 | } |
| 190 | } |
| 191 | } |
| 192 | |
| 193 | } // end namespace |
| 194 | |
| 195 | std::unique_ptr<ScheduleDAGMutation> |
| 196 | llvm::createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF) { |
| 197 | return std::make_unique<BarrierLatency>(args&: MF); |
| 198 | } |
| 199 | |