1 | //===--------------------- BottleneckAnalysis.h -----------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// \file |
9 | /// |
10 | /// This file implements the bottleneck analysis view. |
11 | /// |
12 | /// This view internally observes backend pressure increase events in order to |
13 | /// identify problematic data dependencies and processor resource interferences. |
14 | /// |
15 | /// Example of bottleneck analysis report for a dot-product on X86 btver2: |
16 | /// |
17 | /// Cycles with backend pressure increase [ 40.76% ] |
18 | /// Throughput Bottlenecks: |
19 | /// Resource Pressure [ 39.34% ] |
20 | /// - JFPA [ 39.34% ] |
21 | /// - JFPU0 [ 39.34% ] |
22 | /// Data Dependencies: [ 1.42% ] |
23 | /// - Register Dependencies [ 1.42% ] |
24 | /// - Memory Dependencies [ 0.00% ] |
25 | /// |
26 | /// According to the example, backend pressure increased during the 40.76% of |
27 | /// the simulated cycles. In particular, the major cause of backend pressure |
28 | /// increases was the contention on floating point adder JFPA accessible from |
29 | /// pipeline resource JFPU0. |
30 | /// |
31 | /// At the end of each cycle, if pressure on the simulated out-of-order buffers |
32 | /// has increased, a backend pressure event is reported. |
33 | /// In particular, this occurs when there is a delta between the number of uOps |
34 | /// dispatched and the number of uOps issued to the underlying pipelines. |
35 | /// |
36 | /// The bottleneck analysis view is also responsible for identifying and |
37 | /// printing the most "critical" sequence of dependent instructions according to |
38 | /// the simulated run. |
39 | /// |
40 | /// Below is the critical sequence computed for the dot-product example on |
41 | /// btver2: |
42 | /// |
43 | /// Instruction Dependency Information |
44 | /// +----< 2. vhaddps %xmm3, %xmm3, %xmm4 |
45 | /// | |
46 | /// | < loop carried > |
47 | /// | |
48 | /// | 0. vmulps %xmm0, %xmm0, %xmm2 |
49 | /// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ] |
50 | /// +----> 2. vhaddps %xmm3, %xmm3, %xmm4 ## REGISTER dependency: %xmm3 |
51 | /// | |
52 | /// | < loop carried > |
53 | /// | |
54 | /// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ] |
55 | /// |
56 | /// |
57 | /// The algorithm that computes the critical sequence is very similar to a |
58 | /// critical path analysis. |
59 | /// |
60 | /// A dependency graph is used internally to track dependencies between nodes. |
61 | /// Nodes of the graph represent instructions from the input assembly sequence, |
62 | /// and edges of the graph represent data dependencies or processor resource |
63 | /// interferences. |
64 | /// |
65 | /// Edges are dynamically 'discovered' by observing instruction state |
66 | /// transitions and backend pressure increase events. Edges are internally |
67 | /// ranked based on their "criticality". A dependency is considered to be |
68 | /// critical if it takes a long time to execute, and if it contributes to |
69 | /// backend pressure increases. Criticality is internally measured in terms of |
70 | /// cycles; it is computed for every edge in the graph as a function of the edge |
71 | /// latency and the number of backend pressure increase cycles contributed by |
72 | /// that edge. |
73 | /// |
74 | /// At the end of simulation, costs are propagated to nodes through the edges of |
75 | /// the graph, and the most expensive path connecting the root-set (a |
76 | /// set of nodes with no predecessors) to a leaf node is reported as critical |
77 | /// sequence. |
78 | // |
79 | //===----------------------------------------------------------------------===// |
80 | |
81 | #ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H |
82 | #define LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H |
83 | |
84 | #include "Views/InstructionView.h" |
85 | #include "llvm/ADT/DenseMap.h" |
86 | #include "llvm/ADT/SmallVector.h" |
87 | #include "llvm/MC/MCInstPrinter.h" |
88 | #include "llvm/MC/MCSchedule.h" |
89 | #include "llvm/MC/MCSubtargetInfo.h" |
90 | #include "llvm/Support/FormattedStream.h" |
91 | #include "llvm/Support/raw_ostream.h" |
92 | |
93 | namespace llvm { |
94 | namespace mca { |
95 | |
96 | class PressureTracker { |
97 | const MCSchedModel &SM; |
98 | |
99 | // Resource pressure distribution. There is an element for every processor |
100 | // resource declared by the scheduling model. Quantities are number of cycles. |
101 | SmallVector<unsigned, 4> ResourcePressureDistribution; |
102 | |
103 | // Each processor resource is associated with a so-called processor resource |
104 | // mask. This vector allows to correlate processor resource IDs with processor |
105 | // resource masks. There is exactly one element per each processor resource |
106 | // declared by the scheduling model. |
107 | SmallVector<uint64_t, 4> ProcResID2Mask; |
108 | |
109 | // Maps processor resource state indices (returned by calls to |
110 | // `getResourceStateIndex(Mask)` to processor resource identifiers. |
111 | SmallVector<unsigned, 4> ResIdx2ProcResID; |
112 | |
113 | // Maps Processor Resource identifiers to ResourceUsers indices. |
114 | SmallVector<unsigned, 4> ProcResID2ResourceUsersIndex; |
115 | |
116 | // Identifies the last user of a processor resource unit. |
117 | // This vector is updated on every instruction issued event. |
118 | // There is one entry for every processor resource unit declared by the |
119 | // processor model. An all_ones value is treated like an invalid instruction |
120 | // identifier. |
121 | using User = std::pair<unsigned, unsigned>; |
122 | SmallVector<User, 4> ResourceUsers; |
123 | |
124 | struct InstructionPressureInfo { |
125 | unsigned RegisterPressureCycles; |
126 | unsigned MemoryPressureCycles; |
127 | unsigned ResourcePressureCycles; |
128 | }; |
129 | DenseMap<unsigned, InstructionPressureInfo> IPI; |
130 | |
131 | void updateResourcePressureDistribution(uint64_t CumulativeMask); |
132 | |
133 | User getResourceUser(unsigned ProcResID, unsigned UnitID) const { |
134 | unsigned Index = ProcResID2ResourceUsersIndex[ProcResID]; |
135 | return ResourceUsers[Index + UnitID]; |
136 | } |
137 | |
138 | public: |
139 | PressureTracker(const MCSchedModel &Model); |
140 | |
141 | ArrayRef<unsigned> getResourcePressureDistribution() const { |
142 | return ResourcePressureDistribution; |
143 | } |
144 | |
145 | void getResourceUsers(uint64_t ResourceMask, |
146 | SmallVectorImpl<User> &Users) const; |
147 | |
148 | unsigned getRegisterPressureCycles(unsigned IID) const { |
149 | assert(IPI.contains(IID) && "Instruction is not tracked!" ); |
150 | const InstructionPressureInfo &Info = IPI.find(Val: IID)->second; |
151 | return Info.RegisterPressureCycles; |
152 | } |
153 | |
154 | unsigned getMemoryPressureCycles(unsigned IID) const { |
155 | assert(IPI.contains(IID) && "Instruction is not tracked!" ); |
156 | const InstructionPressureInfo &Info = IPI.find(Val: IID)->second; |
157 | return Info.MemoryPressureCycles; |
158 | } |
159 | |
160 | unsigned getResourcePressureCycles(unsigned IID) const { |
161 | assert(IPI.contains(IID) && "Instruction is not tracked!" ); |
162 | const InstructionPressureInfo &Info = IPI.find(Val: IID)->second; |
163 | return Info.ResourcePressureCycles; |
164 | } |
165 | |
166 | const char *resolveResourceName(uint64_t ResourceMask) const { |
167 | unsigned Index = getResourceStateIndex(Mask: ResourceMask); |
168 | unsigned ProcResID = ResIdx2ProcResID[Index]; |
169 | const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResourceIdx: ProcResID); |
170 | return PRDesc.Name; |
171 | } |
172 | |
173 | void onInstructionDispatched(unsigned IID); |
174 | void onInstructionExecuted(unsigned IID); |
175 | |
176 | void handlePressureEvent(const HWPressureEvent &Event); |
177 | void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event); |
178 | }; |
179 | |
180 | // A dependency edge. |
181 | struct DependencyEdge { |
182 | enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE }; |
183 | |
184 | // Dependency edge descriptor. |
185 | // |
186 | // It specifies the dependency type, as well as the edge cost in cycles. |
187 | struct Dependency { |
188 | DependencyType Type; |
189 | uint64_t ResourceOrRegID; |
190 | uint64_t Cost; |
191 | }; |
192 | Dependency Dep; |
193 | |
194 | unsigned FromIID; |
195 | unsigned ToIID; |
196 | |
197 | // Used by the bottleneck analysis to compute the interference |
198 | // probability for processor resources. |
199 | unsigned Frequency; |
200 | }; |
201 | |
202 | // A dependency graph used by the bottleneck analysis to describe data |
203 | // dependencies and processor resource interferences between instructions. |
204 | // |
205 | // There is a node (an instance of struct DGNode) for every instruction in the |
206 | // input assembly sequence. Edges of the graph represent dependencies between |
207 | // instructions. |
208 | // |
209 | // Each edge of the graph is associated with a cost value which is used |
210 | // internally to rank dependency based on their impact on the runtime |
211 | // performance (see field DependencyEdge::Dependency::Cost). In general, the |
212 | // higher the cost of an edge, the higher the impact on performance. |
213 | // |
214 | // The cost of a dependency is a function of both the latency and the number of |
215 | // cycles where the dependency has been seen as critical (i.e. contributing to |
216 | // back-pressure increases). |
217 | // |
218 | // Loop carried dependencies are carefully expanded by the bottleneck analysis |
219 | // to guarantee that the graph stays acyclic. To this end, extra nodes are |
220 | // pre-allocated at construction time to describe instructions from "past and |
221 | // future" iterations. The graph is kept acyclic mainly because it simplifies |
222 | // the complexity of the algorithm that computes the critical sequence. |
223 | class DependencyGraph { |
224 | struct DGNode { |
225 | unsigned NumPredecessors; |
226 | unsigned NumVisitedPredecessors; |
227 | uint64_t Cost; |
228 | unsigned Depth; |
229 | |
230 | DependencyEdge CriticalPredecessor; |
231 | SmallVector<DependencyEdge, 8> OutgoingEdges; |
232 | }; |
233 | SmallVector<DGNode, 16> Nodes; |
234 | |
235 | DependencyGraph(const DependencyGraph &) = delete; |
236 | DependencyGraph &operator=(const DependencyGraph &) = delete; |
237 | |
238 | void addDependency(unsigned From, unsigned To, |
239 | DependencyEdge::Dependency &&DE); |
240 | |
241 | void pruneEdges(unsigned Iterations); |
242 | void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const; |
243 | void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet, |
244 | unsigned Iterations); |
245 | |
246 | #ifndef NDEBUG |
247 | void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE, |
248 | MCInstPrinter &MCIP) const; |
249 | #endif |
250 | |
251 | public: |
252 | DependencyGraph(unsigned Size) : Nodes(Size) {} |
253 | |
254 | void addRegisterDep(unsigned From, unsigned To, unsigned RegID, |
255 | unsigned Cost) { |
256 | addDependency(From, To, DE: {.Type: DependencyEdge::DT_REGISTER, .ResourceOrRegID: RegID, .Cost: Cost}); |
257 | } |
258 | |
259 | void addMemoryDep(unsigned From, unsigned To, unsigned Cost) { |
260 | addDependency(From, To, DE: {.Type: DependencyEdge::DT_MEMORY, /* unused */ .ResourceOrRegID: 0, .Cost: Cost}); |
261 | } |
262 | |
263 | void addResourceDep(unsigned From, unsigned To, uint64_t Mask, |
264 | unsigned Cost) { |
265 | addDependency(From, To, DE: {.Type: DependencyEdge::DT_RESOURCE, .ResourceOrRegID: Mask, .Cost: Cost}); |
266 | } |
267 | |
268 | // Called by the bottleneck analysis at the end of simulation to propagate |
269 | // costs through the edges of the graph, and compute a critical path. |
270 | void finalizeGraph(unsigned Iterations) { |
271 | SmallVector<unsigned, 16> RootSet; |
272 | pruneEdges(Iterations); |
273 | initializeRootSet(RootSet); |
274 | propagateThroughEdges(RootSet, Iterations); |
275 | } |
276 | |
277 | // Returns a sequence of edges representing the critical sequence based on the |
278 | // simulated run. It assumes that the graph has already been finalized (i.e. |
279 | // method `finalizeGraph()` has already been called on this graph). |
280 | void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const; |
281 | |
282 | #ifndef NDEBUG |
283 | void dump(raw_ostream &OS, MCInstPrinter &MCIP) const; |
284 | #endif |
285 | }; |
286 | |
287 | /// A view that collects and prints a few performance numbers. |
288 | class BottleneckAnalysis : public InstructionView { |
289 | PressureTracker Tracker; |
290 | DependencyGraph DG; |
291 | |
292 | unsigned Iterations; |
293 | unsigned TotalCycles; |
294 | |
295 | bool PressureIncreasedBecauseOfResources; |
296 | bool PressureIncreasedBecauseOfRegisterDependencies; |
297 | bool PressureIncreasedBecauseOfMemoryDependencies; |
298 | // True if throughput was affected by dispatch stalls. |
299 | bool SeenStallCycles; |
300 | |
301 | struct BackPressureInfo { |
302 | // Cycles where backpressure increased. |
303 | unsigned PressureIncreaseCycles; |
304 | // Cycles where backpressure increased because of pipeline pressure. |
305 | unsigned ResourcePressureCycles; |
306 | // Cycles where backpressure increased because of data dependencies. |
307 | unsigned DataDependencyCycles; |
308 | // Cycles where backpressure increased because of register dependencies. |
309 | unsigned RegisterDependencyCycles; |
310 | // Cycles where backpressure increased because of memory dependencies. |
311 | unsigned MemoryDependencyCycles; |
312 | }; |
313 | BackPressureInfo BPI; |
314 | |
315 | // Used to populate the dependency graph DG. |
316 | void addRegisterDep(unsigned From, unsigned To, unsigned RegID, unsigned Cy); |
317 | void addMemoryDep(unsigned From, unsigned To, unsigned Cy); |
318 | void addResourceDep(unsigned From, unsigned To, uint64_t Mask, unsigned Cy); |
319 | |
320 | void printInstruction(formatted_raw_ostream &FOS, const MCInst &MCI, |
321 | bool UseDifferentColor = false) const; |
322 | |
323 | // Prints a bottleneck message to OS. |
324 | void printBottleneckHints(raw_ostream &OS) const; |
325 | void printCriticalSequence(raw_ostream &OS) const; |
326 | |
327 | public: |
328 | BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP, |
329 | ArrayRef<MCInst> Sequence, unsigned Iterations); |
330 | |
331 | void onCycleEnd() override; |
332 | void onEvent(const HWStallEvent &Event) override { SeenStallCycles = true; } |
333 | void onEvent(const HWPressureEvent &Event) override; |
334 | void onEvent(const HWInstructionEvent &Event) override; |
335 | |
336 | void printView(raw_ostream &OS) const override; |
337 | StringRef getNameAsString() const override { return "BottleneckAnalysis" ; } |
338 | bool isSerializable() const override { return false; } |
339 | |
340 | #ifndef NDEBUG |
341 | void dump(raw_ostream &OS, MCInstPrinter &MCIP) const { DG.dump(OS, MCIP); } |
342 | #endif |
343 | }; |
344 | |
345 | } // namespace mca |
346 | } // namespace llvm |
347 | |
348 | #endif |
349 | |