1//===--------------------- BottleneckAnalysis.h -----------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements the bottleneck analysis view.
11///
12/// This view internally observes backend pressure increase events in order to
13/// identify problematic data dependencies and processor resource interferences.
14///
15/// Example of bottleneck analysis report for a dot-product on X86 btver2:
16///
17/// Cycles with backend pressure increase [ 40.76% ]
18/// Throughput Bottlenecks:
19/// Resource Pressure [ 39.34% ]
20/// - JFPA [ 39.34% ]
21/// - JFPU0 [ 39.34% ]
22/// Data Dependencies: [ 1.42% ]
23/// - Register Dependencies [ 1.42% ]
24/// - Memory Dependencies [ 0.00% ]
25///
26/// According to the example, backend pressure increased during the 40.76% of
27/// the simulated cycles. In particular, the major cause of backend pressure
28/// increases was the contention on floating point adder JFPA accessible from
29/// pipeline resource JFPU0.
30///
31/// At the end of each cycle, if pressure on the simulated out-of-order buffers
32/// has increased, a backend pressure event is reported.
33/// In particular, this occurs when there is a delta between the number of uOps
34/// dispatched and the number of uOps issued to the underlying pipelines.
35///
36/// The bottleneck analysis view is also responsible for identifying and
37/// printing the most "critical" sequence of dependent instructions according to
38/// the simulated run.
39///
40/// Below is the critical sequence computed for the dot-product example on
41/// btver2:
42///
43/// Instruction Dependency Information
44/// +----< 2. vhaddps %xmm3, %xmm3, %xmm4
45/// |
46/// | < loop carried >
47/// |
48/// | 0. vmulps %xmm0, %xmm0, %xmm2
49/// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ]
50/// +----> 2. vhaddps %xmm3, %xmm3, %xmm4 ## REGISTER dependency: %xmm3
51/// |
52/// | < loop carried >
53/// |
54/// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ]
55///
56///
57/// The algorithm that computes the critical sequence is very similar to a
58/// critical path analysis.
59///
60/// A dependency graph is used internally to track dependencies between nodes.
61/// Nodes of the graph represent instructions from the input assembly sequence,
62/// and edges of the graph represent data dependencies or processor resource
63/// interferences.
64///
65/// Edges are dynamically 'discovered' by observing instruction state
66/// transitions and backend pressure increase events. Edges are internally
67/// ranked based on their "criticality". A dependency is considered to be
68/// critical if it takes a long time to execute, and if it contributes to
69/// backend pressure increases. Criticality is internally measured in terms of
70/// cycles; it is computed for every edge in the graph as a function of the edge
71/// latency and the number of backend pressure increase cycles contributed by
72/// that edge.
73///
74/// At the end of simulation, costs are propagated to nodes through the edges of
75/// the graph, and the most expensive path connecting the root-set (a
76/// set of nodes with no predecessors) to a leaf node is reported as critical
77/// sequence.
78//
79//===----------------------------------------------------------------------===//
80
81#ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
82#define LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
83
84#include "Views/InstructionView.h"
85#include "llvm/ADT/DenseMap.h"
86#include "llvm/ADT/SmallVector.h"
87#include "llvm/MC/MCInstPrinter.h"
88#include "llvm/MC/MCSchedule.h"
89#include "llvm/MC/MCSubtargetInfo.h"
90#include "llvm/Support/FormattedStream.h"
91#include "llvm/Support/raw_ostream.h"
92
93namespace llvm {
94namespace mca {
95
96class PressureTracker {
97 const MCSchedModel &SM;
98
99 // Resource pressure distribution. There is an element for every processor
100 // resource declared by the scheduling model. Quantities are number of cycles.
101 SmallVector<unsigned, 4> ResourcePressureDistribution;
102
103 // Each processor resource is associated with a so-called processor resource
104 // mask. This vector allows to correlate processor resource IDs with processor
105 // resource masks. There is exactly one element per each processor resource
106 // declared by the scheduling model.
107 SmallVector<uint64_t, 4> ProcResID2Mask;
108
109 // Maps processor resource state indices (returned by calls to
110 // `getResourceStateIndex(Mask)` to processor resource identifiers.
111 SmallVector<unsigned, 4> ResIdx2ProcResID;
112
113 // Maps Processor Resource identifiers to ResourceUsers indices.
114 SmallVector<unsigned, 4> ProcResID2ResourceUsersIndex;
115
116 // Identifies the last user of a processor resource unit.
117 // This vector is updated on every instruction issued event.
118 // There is one entry for every processor resource unit declared by the
119 // processor model. An all_ones value is treated like an invalid instruction
120 // identifier.
121 using User = std::pair<unsigned, unsigned>;
122 SmallVector<User, 4> ResourceUsers;
123
124 struct InstructionPressureInfo {
125 unsigned RegisterPressureCycles;
126 unsigned MemoryPressureCycles;
127 unsigned ResourcePressureCycles;
128 };
129 DenseMap<unsigned, InstructionPressureInfo> IPI;
130
131 void updateResourcePressureDistribution(uint64_t CumulativeMask);
132
133 User getResourceUser(unsigned ProcResID, unsigned UnitID) const {
134 unsigned Index = ProcResID2ResourceUsersIndex[ProcResID];
135 return ResourceUsers[Index + UnitID];
136 }
137
138public:
139 PressureTracker(const MCSchedModel &Model);
140
141 ArrayRef<unsigned> getResourcePressureDistribution() const {
142 return ResourcePressureDistribution;
143 }
144
145 void getResourceUsers(uint64_t ResourceMask,
146 SmallVectorImpl<User> &Users) const;
147
148 unsigned getRegisterPressureCycles(unsigned IID) const {
149 assert(IPI.contains(IID) && "Instruction is not tracked!");
150 const InstructionPressureInfo &Info = IPI.find(Val: IID)->second;
151 return Info.RegisterPressureCycles;
152 }
153
154 unsigned getMemoryPressureCycles(unsigned IID) const {
155 assert(IPI.contains(IID) && "Instruction is not tracked!");
156 const InstructionPressureInfo &Info = IPI.find(Val: IID)->second;
157 return Info.MemoryPressureCycles;
158 }
159
160 unsigned getResourcePressureCycles(unsigned IID) const {
161 assert(IPI.contains(IID) && "Instruction is not tracked!");
162 const InstructionPressureInfo &Info = IPI.find(Val: IID)->second;
163 return Info.ResourcePressureCycles;
164 }
165
166 const char *resolveResourceName(uint64_t ResourceMask) const {
167 unsigned Index = getResourceStateIndex(Mask: ResourceMask);
168 unsigned ProcResID = ResIdx2ProcResID[Index];
169 const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResourceIdx: ProcResID);
170 return PRDesc.Name;
171 }
172
173 void onInstructionDispatched(unsigned IID);
174 void onInstructionExecuted(unsigned IID);
175
176 void handlePressureEvent(const HWPressureEvent &Event);
177 void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event);
178};
179
180// A dependency edge.
181struct DependencyEdge {
182 enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE };
183
184 // Dependency edge descriptor.
185 //
186 // It specifies the dependency type, as well as the edge cost in cycles.
187 struct Dependency {
188 DependencyType Type;
189 uint64_t ResourceOrRegID;
190 uint64_t Cost;
191 };
192 Dependency Dep;
193
194 unsigned FromIID;
195 unsigned ToIID;
196
197 // Used by the bottleneck analysis to compute the interference
198 // probability for processor resources.
199 unsigned Frequency;
200};
201
202// A dependency graph used by the bottleneck analysis to describe data
203// dependencies and processor resource interferences between instructions.
204//
205// There is a node (an instance of struct DGNode) for every instruction in the
206// input assembly sequence. Edges of the graph represent dependencies between
207// instructions.
208//
209// Each edge of the graph is associated with a cost value which is used
210// internally to rank dependency based on their impact on the runtime
211// performance (see field DependencyEdge::Dependency::Cost). In general, the
212// higher the cost of an edge, the higher the impact on performance.
213//
214// The cost of a dependency is a function of both the latency and the number of
215// cycles where the dependency has been seen as critical (i.e. contributing to
216// back-pressure increases).
217//
218// Loop carried dependencies are carefully expanded by the bottleneck analysis
219// to guarantee that the graph stays acyclic. To this end, extra nodes are
220// pre-allocated at construction time to describe instructions from "past and
221// future" iterations. The graph is kept acyclic mainly because it simplifies
222// the complexity of the algorithm that computes the critical sequence.
223class DependencyGraph {
224 struct DGNode {
225 unsigned NumPredecessors;
226 unsigned NumVisitedPredecessors;
227 uint64_t Cost;
228 unsigned Depth;
229
230 DependencyEdge CriticalPredecessor;
231 SmallVector<DependencyEdge, 8> OutgoingEdges;
232 };
233 SmallVector<DGNode, 16> Nodes;
234
235 DependencyGraph(const DependencyGraph &) = delete;
236 DependencyGraph &operator=(const DependencyGraph &) = delete;
237
238 void addDependency(unsigned From, unsigned To,
239 DependencyEdge::Dependency &&DE);
240
241 void pruneEdges(unsigned Iterations);
242 void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;
243 void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet,
244 unsigned Iterations);
245
246#ifndef NDEBUG
247 void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,
248 MCInstPrinter &MCIP) const;
249#endif
250
251public:
252 DependencyGraph(unsigned Size) : Nodes(Size) {}
253
254 void addRegisterDep(unsigned From, unsigned To, unsigned RegID,
255 unsigned Cost) {
256 addDependency(From, To, DE: {.Type: DependencyEdge::DT_REGISTER, .ResourceOrRegID: RegID, .Cost: Cost});
257 }
258
259 void addMemoryDep(unsigned From, unsigned To, unsigned Cost) {
260 addDependency(From, To, DE: {.Type: DependencyEdge::DT_MEMORY, /* unused */ .ResourceOrRegID: 0, .Cost: Cost});
261 }
262
263 void addResourceDep(unsigned From, unsigned To, uint64_t Mask,
264 unsigned Cost) {
265 addDependency(From, To, DE: {.Type: DependencyEdge::DT_RESOURCE, .ResourceOrRegID: Mask, .Cost: Cost});
266 }
267
268 // Called by the bottleneck analysis at the end of simulation to propagate
269 // costs through the edges of the graph, and compute a critical path.
270 void finalizeGraph(unsigned Iterations) {
271 SmallVector<unsigned, 16> RootSet;
272 pruneEdges(Iterations);
273 initializeRootSet(RootSet);
274 propagateThroughEdges(RootSet, Iterations);
275 }
276
277 // Returns a sequence of edges representing the critical sequence based on the
278 // simulated run. It assumes that the graph has already been finalized (i.e.
279 // method `finalizeGraph()` has already been called on this graph).
280 void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const;
281
282#ifndef NDEBUG
283 void dump(raw_ostream &OS, MCInstPrinter &MCIP) const;
284#endif
285};
286
287/// A view that collects and prints a few performance numbers.
288class BottleneckAnalysis : public InstructionView {
289 PressureTracker Tracker;
290 DependencyGraph DG;
291
292 unsigned Iterations;
293 unsigned TotalCycles;
294
295 bool PressureIncreasedBecauseOfResources;
296 bool PressureIncreasedBecauseOfRegisterDependencies;
297 bool PressureIncreasedBecauseOfMemoryDependencies;
298 // True if throughput was affected by dispatch stalls.
299 bool SeenStallCycles;
300
301 struct BackPressureInfo {
302 // Cycles where backpressure increased.
303 unsigned PressureIncreaseCycles;
304 // Cycles where backpressure increased because of pipeline pressure.
305 unsigned ResourcePressureCycles;
306 // Cycles where backpressure increased because of data dependencies.
307 unsigned DataDependencyCycles;
308 // Cycles where backpressure increased because of register dependencies.
309 unsigned RegisterDependencyCycles;
310 // Cycles where backpressure increased because of memory dependencies.
311 unsigned MemoryDependencyCycles;
312 };
313 BackPressureInfo BPI;
314
315 // Used to populate the dependency graph DG.
316 void addRegisterDep(unsigned From, unsigned To, unsigned RegID, unsigned Cy);
317 void addMemoryDep(unsigned From, unsigned To, unsigned Cy);
318 void addResourceDep(unsigned From, unsigned To, uint64_t Mask, unsigned Cy);
319
320 void printInstruction(formatted_raw_ostream &FOS, const MCInst &MCI,
321 bool UseDifferentColor = false) const;
322
323 // Prints a bottleneck message to OS.
324 void printBottleneckHints(raw_ostream &OS) const;
325 void printCriticalSequence(raw_ostream &OS) const;
326
327public:
328 BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP,
329 ArrayRef<MCInst> Sequence, unsigned Iterations);
330
331 void onCycleEnd() override;
332 void onEvent(const HWStallEvent &Event) override { SeenStallCycles = true; }
333 void onEvent(const HWPressureEvent &Event) override;
334 void onEvent(const HWInstructionEvent &Event) override;
335
336 void printView(raw_ostream &OS) const override;
337 StringRef getNameAsString() const override { return "BottleneckAnalysis"; }
338 bool isSerializable() const override { return false; }
339
340#ifndef NDEBUG
341 void dump(raw_ostream &OS, MCInstPrinter &MCIP) const { DG.dump(OS, MCIP); }
342#endif
343};
344
345} // namespace mca
346} // namespace llvm
347
348#endif
349