| 1 | //===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | /// \file This file contains a DAG scheduling mutation to add latency to: |
| 10 | /// 1. Barrier edges between ATOMIC_FENCE instructions and preceding |
| 11 | /// memory accesses potentially affected by the fence. |
| 12 | /// This encourages the scheduling of more instructions before |
| 13 | /// ATOMIC_FENCE instructions. ATOMIC_FENCE instructions may |
| 14 | /// introduce wait counting or indicate an impending S_BARRIER |
| 15 | /// wait. Having more instructions in-flight across these |
| 16 | /// constructs improves latency hiding. |
| 17 | /// 2. Barrier edges from S_BARRIER_SIGNAL to S_BARRIER_WAIT. |
| 18 | /// This encourages independent work to be scheduled between |
| 19 | /// signal and wait, hiding barrier synchronization latency. |
| 20 | // |
| 21 | //===----------------------------------------------------------------------===// |
| 22 | |
| 23 | #include "AMDGPUBarrierLatency.h" |
| 24 | #include "GCNSubtarget.h" |
| 25 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
| 26 | #include "SIInstrInfo.h" |
| 27 | #include "llvm/CodeGen/ScheduleDAGInstrs.h" |
| 28 | #include "llvm/Support/CommandLine.h" |
| 29 | |
| 30 | using namespace llvm; |
| 31 | |
| 32 | static cl::opt<unsigned> BarrierSignalWaitLatencyOpt( |
| 33 | "amdgpu-barrier-signal-wait-latency" , |
| 34 | cl::desc("Synthetic latency between S_BARRIER_SIGNAL and S_BARRIER_WAIT " |
| 35 | "to encourage scheduling independent work between them" ), |
| 36 | cl::init(Val: 16), cl::Hidden); |
| 37 | |
| 38 | namespace { |
| 39 | |
| 40 | class BarrierLatency : public ScheduleDAGMutation { |
| 41 | private: |
| 42 | SmallSet<SyncScope::ID, 4> IgnoredScopes; |
| 43 | |
| 44 | public: |
| 45 | BarrierLatency(MachineFunction *MF) { |
| 46 | LLVMContext &Context = MF->getFunction().getContext(); |
| 47 | IgnoredScopes.insert(V: SyncScope::SingleThread); |
| 48 | IgnoredScopes.insert(V: Context.getOrInsertSyncScopeID(SSN: "wavefront" )); |
| 49 | IgnoredScopes.insert(V: Context.getOrInsertSyncScopeID(SSN: "wavefront-one-as" )); |
| 50 | IgnoredScopes.insert(V: Context.getOrInsertSyncScopeID(SSN: "singlethread-one-as" )); |
| 51 | |
| 52 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
| 53 | if (!ST.requiresWaitOnWorkgroupReleaseFence()) { |
| 54 | // Prior to GFX10 workgroup scope does not normally require waitcnts |
| 55 | IgnoredScopes.insert(V: Context.getOrInsertSyncScopeID(SSN: "workgroup" )); |
| 56 | } |
| 57 | } |
| 58 | void apply(ScheduleDAGInstrs *DAG) override; |
| 59 | }; |
| 60 | |
| 61 | void addLatencyToEdge(SDep &PredDep, SUnit &SU, unsigned Latency) { |
| 62 | SUnit *PredSU = PredDep.getSUnit(); |
| 63 | SDep ForwardD = PredDep; |
| 64 | ForwardD.setSUnit(&SU); |
| 65 | for (SDep &SuccDep : PredSU->Succs) { |
| 66 | if (SuccDep == ForwardD) { |
| 67 | SuccDep.setLatency(SuccDep.getLatency() + Latency); |
| 68 | break; |
| 69 | } |
| 70 | } |
| 71 | PredDep.setLatency(PredDep.getLatency() + Latency); |
| 72 | PredSU->setDepthDirty(); |
| 73 | SU.setDepthDirty(); |
| 74 | } |
| 75 | |
| 76 | void BarrierLatency::apply(ScheduleDAGInstrs *DAG) { |
| 77 | const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(DAG->TII); |
| 78 | constexpr unsigned FenceLatency = 2000; |
| 79 | const unsigned BarrierSignalWaitLatency = BarrierSignalWaitLatencyOpt; |
| 80 | |
| 81 | for (SUnit &SU : DAG->SUnits) { |
| 82 | const MachineInstr *MI = SU.getInstr(); |
| 83 | unsigned Op = MI->getOpcode(); |
| 84 | |
| 85 | if (Op == AMDGPU::ATOMIC_FENCE) { |
| 86 | // Update latency on barrier edges of ATOMIC_FENCE. |
| 87 | // Ignore scopes not expected to have any latency. |
| 88 | SyncScope::ID SSID = |
| 89 | static_cast<SyncScope::ID>(MI->getOperand(i: 1).getImm()); |
| 90 | if (IgnoredScopes.contains(V: SSID)) |
| 91 | continue; |
| 92 | |
| 93 | for (SDep &PredDep : SU.Preds) { |
| 94 | if (!PredDep.isBarrier()) |
| 95 | continue; |
| 96 | SUnit *PredSU = PredDep.getSUnit(); |
| 97 | MachineInstr *MI = PredSU->getInstr(); |
| 98 | // Only consider memory loads |
| 99 | if (!MI->mayLoad() || MI->mayStore()) |
| 100 | continue; |
| 101 | addLatencyToEdge(PredDep, SU, Latency: FenceLatency); |
| 102 | } |
| 103 | } else if (Op == AMDGPU::S_BARRIER_WAIT) { |
| 104 | for (SDep &PredDep : SU.Preds) { |
| 105 | SUnit *PredSU = PredDep.getSUnit(); |
| 106 | const MachineInstr *PredMI = PredSU->getInstr(); |
| 107 | if (TII->isBarrierStart(Opcode: PredMI->getOpcode())) { |
| 108 | addLatencyToEdge(PredDep, SU, Latency: BarrierSignalWaitLatency); |
| 109 | } |
| 110 | } |
| 111 | } |
| 112 | } |
| 113 | } |
| 114 | |
| 115 | } // end namespace |
| 116 | |
| 117 | std::unique_ptr<ScheduleDAGMutation> |
| 118 | llvm::createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF) { |
| 119 | return std::make_unique<BarrierLatency>(args&: MF); |
| 120 | } |
| 121 | |