1 | //===- lib/CodeGen/MachineTraceMetrics.h - Super-scalar metrics -*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file defines the interface for the MachineTraceMetrics analysis pass |
10 | // that estimates CPU resource usage and critical data dependency paths through |
11 | // preferred traces. This is useful for super-scalar CPUs where execution speed |
12 | // can be limited both by data dependencies and by limited execution resources. |
13 | // |
14 | // Out-of-order CPUs will often be executing instructions from multiple basic |
15 | // blocks at the same time. This makes it difficult to estimate the resource |
16 | // usage accurately in a single basic block. Resources can be estimated better |
17 | // by looking at a trace through the current basic block. |
18 | // |
19 | // For every block, the MachineTraceMetrics pass will pick a preferred trace |
20 | // that passes through the block. The trace is chosen based on loop structure, |
21 | // branch probabilities, and resource usage. The intention is to pick likely |
22 | // traces that would be the most affected by code transformations. |
23 | // |
24 | // It is expensive to compute a full arbitrary trace for every block, so to |
25 | // save some computations, traces are chosen to be convergent. This means that |
26 | // if the traces through basic blocks A and B ever cross when moving away from |
27 | // A and B, they never diverge again. This applies in both directions - If the |
28 | // traces meet above A and B, they won't diverge when going further back. |
29 | // |
30 | // Traces tend to align with loops. The trace through a block in an inner loop |
31 | // will begin at the loop entry block and end at a back edge. If there are |
32 | // nested loops, the trace may begin and end at those instead. |
33 | // |
34 | // For each trace, we compute the critical path length, which is the number of |
35 | // cycles required to execute the trace when execution is limited by data |
36 | // dependencies only. We also compute the resource height, which is the number |
37 | // of cycles required to execute all instructions in the trace when ignoring |
38 | // data dependencies. |
39 | // |
40 | // Every instruction in the current block has a slack - the number of cycles |
41 | // execution of the instruction can be delayed without extending the critical |
42 | // path. |
43 | // |
44 | //===----------------------------------------------------------------------===// |
45 | |
46 | #ifndef LLVM_CODEGEN_MACHINETRACEMETRICS_H |
47 | #define LLVM_CODEGEN_MACHINETRACEMETRICS_H |
48 | |
49 | #include "llvm/ADT/SparseSet.h" |
50 | #include "llvm/ADT/ArrayRef.h" |
51 | #include "llvm/ADT/DenseMap.h" |
52 | #include "llvm/ADT/SmallVector.h" |
53 | #include "llvm/CodeGen/MachineBasicBlock.h" |
54 | #include "llvm/CodeGen/MachineFunctionPass.h" |
55 | #include "llvm/CodeGen/TargetSchedule.h" |
56 | |
57 | namespace llvm { |
58 | |
59 | class AnalysisUsage; |
60 | class MachineFunction; |
61 | class MachineInstr; |
62 | class MachineLoop; |
63 | class MachineLoopInfo; |
64 | class MachineRegisterInfo; |
65 | struct MCSchedClassDesc; |
66 | class raw_ostream; |
67 | class TargetInstrInfo; |
68 | class TargetRegisterInfo; |
69 | |
70 | // Keep track of physreg data dependencies by recording each live register unit. |
71 | // Associate each regunit with an instruction operand. Depending on the |
72 | // direction instructions are scanned, it could be the operand that defined the |
73 | // regunit, or the highest operand to read the regunit. |
74 | struct LiveRegUnit { |
75 | unsigned RegUnit; |
76 | unsigned Cycle = 0; |
77 | const MachineInstr *MI = nullptr; |
78 | unsigned Op = 0; |
79 | |
80 | unsigned getSparseSetIndex() const { return RegUnit; } |
81 | |
82 | LiveRegUnit(unsigned RU) : RegUnit(RU) {} |
83 | }; |
84 | |
85 | /// Strategies for selecting traces. |
86 | enum class MachineTraceStrategy { |
87 | /// Select the trace through a block that has the fewest instructions. |
88 | TS_MinInstrCount, |
89 | /// Select the trace that contains only the current basic block. For instance, |
90 | /// this strategy can be used by MachineCombiner to make better decisions when |
91 | /// we estimate critical path for in-order cores. |
92 | TS_Local, |
93 | TS_NumStrategies |
94 | }; |
95 | |
96 | class MachineTraceMetrics : public MachineFunctionPass { |
97 | const MachineFunction *MF = nullptr; |
98 | const TargetInstrInfo *TII = nullptr; |
99 | const TargetRegisterInfo *TRI = nullptr; |
100 | const MachineRegisterInfo *MRI = nullptr; |
101 | const MachineLoopInfo *Loops = nullptr; |
102 | TargetSchedModel SchedModel; |
103 | |
104 | public: |
105 | friend class Ensemble; |
106 | friend class Trace; |
107 | |
108 | class Ensemble; |
109 | |
110 | static char ID; |
111 | |
112 | MachineTraceMetrics(); |
113 | |
114 | void getAnalysisUsage(AnalysisUsage&) const override; |
115 | bool runOnMachineFunction(MachineFunction&) override; |
116 | void releaseMemory() override; |
117 | void verifyAnalysis() const override; |
118 | |
119 | /// Per-basic block information that doesn't depend on the trace through the |
120 | /// block. |
121 | struct FixedBlockInfo { |
122 | /// The number of non-trivial instructions in the block. |
123 | /// Doesn't count PHI and COPY instructions that are likely to be removed. |
124 | unsigned InstrCount = ~0u; |
125 | |
126 | /// True when the block contains calls. |
127 | bool HasCalls = false; |
128 | |
129 | FixedBlockInfo() = default; |
130 | |
131 | /// Returns true when resource information for this block has been computed. |
132 | bool hasResources() const { return InstrCount != ~0u; } |
133 | |
134 | /// Invalidate resource information. |
135 | void invalidate() { InstrCount = ~0u; } |
136 | }; |
137 | |
138 | /// Get the fixed resource information about MBB. Compute it on demand. |
139 | const FixedBlockInfo *getResources(const MachineBasicBlock*); |
140 | |
141 | /// Get the scaled number of cycles used per processor resource in MBB. |
142 | /// This is an array with SchedModel.getNumProcResourceKinds() entries. |
143 | /// The getResources() function above must have been called first. |
144 | /// |
145 | /// These numbers have already been scaled by SchedModel.getResourceFactor(). |
146 | ArrayRef<unsigned> getProcReleaseAtCycles(unsigned MBBNum) const; |
147 | |
148 | /// A virtual register or regunit required by a basic block or its trace |
149 | /// successors. |
150 | struct LiveInReg { |
151 | /// The virtual register required, or a register unit. |
152 | Register Reg; |
153 | |
154 | /// For virtual registers: Minimum height of the defining instruction. |
155 | /// For regunits: Height of the highest user in the trace. |
156 | unsigned Height; |
157 | |
158 | LiveInReg(Register Reg, unsigned Height = 0) : Reg(Reg), Height(Height) {} |
159 | }; |
160 | |
161 | /// Per-basic block information that relates to a specific trace through the |
162 | /// block. Convergent traces means that only one of these is required per |
163 | /// block in a trace ensemble. |
164 | struct TraceBlockInfo { |
165 | /// Trace predecessor, or NULL for the first block in the trace. |
166 | /// Valid when hasValidDepth(). |
167 | const MachineBasicBlock *Pred = nullptr; |
168 | |
169 | /// Trace successor, or NULL for the last block in the trace. |
170 | /// Valid when hasValidHeight(). |
171 | const MachineBasicBlock *Succ = nullptr; |
172 | |
173 | /// The block number of the head of the trace. (When hasValidDepth()). |
174 | unsigned Head; |
175 | |
176 | /// The block number of the tail of the trace. (When hasValidHeight()). |
177 | unsigned Tail; |
178 | |
179 | /// Accumulated number of instructions in the trace above this block. |
180 | /// Does not include instructions in this block. |
181 | unsigned InstrDepth = ~0u; |
182 | |
183 | /// Accumulated number of instructions in the trace below this block. |
184 | /// Includes instructions in this block. |
185 | unsigned InstrHeight = ~0u; |
186 | |
187 | TraceBlockInfo() = default; |
188 | |
189 | /// Returns true if the depth resources have been computed from the trace |
190 | /// above this block. |
191 | bool hasValidDepth() const { return InstrDepth != ~0u; } |
192 | |
193 | /// Returns true if the height resources have been computed from the trace |
194 | /// below this block. |
195 | bool hasValidHeight() const { return InstrHeight != ~0u; } |
196 | |
197 | /// Invalidate depth resources when some block above this one has changed. |
198 | void invalidateDepth() { InstrDepth = ~0u; HasValidInstrDepths = false; } |
199 | |
200 | /// Invalidate height resources when a block below this one has changed. |
201 | void invalidateHeight() { InstrHeight = ~0u; HasValidInstrHeights = false; } |
202 | |
203 | /// Assuming that this is a dominator of TBI, determine if it contains |
204 | /// useful instruction depths. A dominating block can be above the current |
205 | /// trace head, and any dependencies from such a far away dominator are not |
206 | /// expected to affect the critical path. |
207 | /// |
208 | /// Also returns true when TBI == this. |
209 | bool isUsefulDominator(const TraceBlockInfo &TBI) const { |
210 | // The trace for TBI may not even be calculated yet. |
211 | if (!hasValidDepth() || !TBI.hasValidDepth()) |
212 | return false; |
213 | // Instruction depths are only comparable if the traces share a head. |
214 | if (Head != TBI.Head) |
215 | return false; |
216 | // It is almost always the case that TBI belongs to the same trace as |
217 | // this block, but rare convoluted cases involving irreducible control |
218 | // flow, a dominator may share a trace head without actually being on the |
219 | // same trace as TBI. This is not a big problem as long as it doesn't |
220 | // increase the instruction depth. |
221 | return HasValidInstrDepths && InstrDepth <= TBI.InstrDepth; |
222 | } |
223 | |
224 | // Data-dependency-related information. Per-instruction depth and height |
225 | // are computed from data dependencies in the current trace, using |
226 | // itinerary data. |
227 | |
228 | /// Instruction depths have been computed. This implies hasValidDepth(). |
229 | bool HasValidInstrDepths = false; |
230 | |
231 | /// Instruction heights have been computed. This implies hasValidHeight(). |
232 | bool HasValidInstrHeights = false; |
233 | |
234 | /// Critical path length. This is the number of cycles in the longest data |
235 | /// dependency chain through the trace. This is only valid when both |
236 | /// HasValidInstrDepths and HasValidInstrHeights are set. |
237 | unsigned CriticalPath; |
238 | |
239 | /// Live-in registers. These registers are defined above the current block |
240 | /// and used by this block or a block below it. |
241 | /// This does not include PHI uses in the current block, but it does |
242 | /// include PHI uses in deeper blocks. |
243 | SmallVector<LiveInReg, 4> LiveIns; |
244 | |
245 | void print(raw_ostream&) const; |
246 | void dump() const { print(dbgs()); } |
247 | }; |
248 | |
249 | /// InstrCycles represents the cycle height and depth of an instruction in a |
250 | /// trace. |
251 | struct InstrCycles { |
252 | /// Earliest issue cycle as determined by data dependencies and instruction |
253 | /// latencies from the beginning of the trace. Data dependencies from |
254 | /// before the trace are not included. |
255 | unsigned Depth; |
256 | |
257 | /// Minimum number of cycles from this instruction is issued to the of the |
258 | /// trace, as determined by data dependencies and instruction latencies. |
259 | unsigned Height; |
260 | }; |
261 | |
262 | /// A trace represents a plausible sequence of executed basic blocks that |
263 | /// passes through the current basic block one. The Trace class serves as a |
264 | /// handle to internal cached data structures. |
265 | class Trace { |
266 | Ensemble &TE; |
267 | TraceBlockInfo &TBI; |
268 | |
269 | unsigned getBlockNum() const { return &TBI - &TE.BlockInfo[0]; } |
270 | |
271 | public: |
272 | explicit Trace(Ensemble &te, TraceBlockInfo &tbi) : TE(te), TBI(tbi) {} |
273 | |
274 | void print(raw_ostream&) const; |
275 | void dump() const { print(dbgs()); } |
276 | |
277 | /// Compute the total number of instructions in the trace. |
278 | unsigned getInstrCount() const { |
279 | return TBI.InstrDepth + TBI.InstrHeight; |
280 | } |
281 | |
282 | /// Return the resource depth of the top/bottom of the trace center block. |
283 | /// This is the number of cycles required to execute all instructions from |
284 | /// the trace head to the trace center block. The resource depth only |
285 | /// considers execution resources, it ignores data dependencies. |
286 | /// When Bottom is set, instructions in the trace center block are included. |
287 | unsigned getResourceDepth(bool Bottom) const; |
288 | |
289 | /// Return the resource length of the trace. This is the number of cycles |
290 | /// required to execute the instructions in the trace if they were all |
291 | /// independent, exposing the maximum instruction-level parallelism. |
292 | /// |
293 | /// Any blocks in Extrablocks are included as if they were part of the |
294 | /// trace. Likewise, extra resources required by the specified scheduling |
295 | /// classes are included. For the caller to account for extra machine |
296 | /// instructions, it must first resolve each instruction's scheduling class. |
297 | unsigned getResourceLength( |
298 | ArrayRef<const MachineBasicBlock *> = std::nullopt, |
299 | ArrayRef<const MCSchedClassDesc *> = std::nullopt, |
300 | ArrayRef<const MCSchedClassDesc *> RemoveInstrs = std::nullopt) const; |
301 | |
302 | /// Return the length of the (data dependency) critical path through the |
303 | /// trace. |
304 | unsigned getCriticalPath() const { return TBI.CriticalPath; } |
305 | |
306 | /// Return the depth and height of MI. The depth is only valid for |
307 | /// instructions in or above the trace center block. The height is only |
308 | /// valid for instructions in or below the trace center block. |
309 | InstrCycles getInstrCycles(const MachineInstr &MI) const { |
310 | return TE.Cycles.lookup(Val: &MI); |
311 | } |
312 | |
313 | /// Return the slack of MI. This is the number of cycles MI can be delayed |
314 | /// before the critical path becomes longer. |
315 | /// MI must be an instruction in the trace center block. |
316 | unsigned getInstrSlack(const MachineInstr &MI) const; |
317 | |
318 | /// Return the Depth of a PHI instruction in a trace center block successor. |
319 | /// The PHI does not have to be part of the trace. |
320 | unsigned getPHIDepth(const MachineInstr &PHI) const; |
321 | |
322 | /// A dependence is useful if the basic block of the defining instruction |
323 | /// is part of the trace of the user instruction. It is assumed that DefMI |
324 | /// dominates UseMI (see also isUsefulDominator). |
325 | bool isDepInTrace(const MachineInstr &DefMI, |
326 | const MachineInstr &UseMI) const; |
327 | }; |
328 | |
329 | /// A trace ensemble is a collection of traces selected using the same |
330 | /// strategy, for example 'minimum resource height'. There is one trace for |
331 | /// every block in the function. |
332 | class Ensemble { |
333 | friend class Trace; |
334 | |
335 | SmallVector<TraceBlockInfo, 4> BlockInfo; |
336 | DenseMap<const MachineInstr*, InstrCycles> Cycles; |
337 | SmallVector<unsigned, 0> ProcResourceDepths; |
338 | SmallVector<unsigned, 0> ProcResourceHeights; |
339 | |
340 | void computeTrace(const MachineBasicBlock*); |
341 | void computeDepthResources(const MachineBasicBlock*); |
342 | void computeHeightResources(const MachineBasicBlock*); |
343 | unsigned computeCrossBlockCriticalPath(const TraceBlockInfo&); |
344 | void computeInstrDepths(const MachineBasicBlock*); |
345 | void computeInstrHeights(const MachineBasicBlock*); |
346 | void addLiveIns(const MachineInstr *DefMI, unsigned DefOp, |
347 | ArrayRef<const MachineBasicBlock*> Trace); |
348 | |
349 | protected: |
350 | MachineTraceMetrics &MTM; |
351 | |
352 | explicit Ensemble(MachineTraceMetrics*); |
353 | |
354 | virtual const MachineBasicBlock *pickTracePred(const MachineBasicBlock*) =0; |
355 | virtual const MachineBasicBlock *pickTraceSucc(const MachineBasicBlock*) =0; |
356 | const MachineLoop *getLoopFor(const MachineBasicBlock*) const; |
357 | const TraceBlockInfo *getDepthResources(const MachineBasicBlock*) const; |
358 | const TraceBlockInfo *getHeightResources(const MachineBasicBlock*) const; |
359 | ArrayRef<unsigned> getProcResourceDepths(unsigned MBBNum) const; |
360 | ArrayRef<unsigned> getProcResourceHeights(unsigned MBBNum) const; |
361 | |
362 | public: |
363 | virtual ~Ensemble(); |
364 | |
365 | virtual const char *getName() const = 0; |
366 | void print(raw_ostream &) const; |
367 | void dump() const { print(dbgs()); } |
368 | void invalidate(const MachineBasicBlock *MBB); |
369 | void verify() const; |
370 | |
371 | /// Get the trace that passes through MBB. |
372 | /// The trace is computed on demand. |
373 | Trace getTrace(const MachineBasicBlock *MBB); |
374 | |
375 | /// Updates the depth of an machine instruction, given RegUnits. |
376 | void updateDepth(TraceBlockInfo &TBI, const MachineInstr&, |
377 | SparseSet<LiveRegUnit> &RegUnits); |
378 | void updateDepth(const MachineBasicBlock *, const MachineInstr&, |
379 | SparseSet<LiveRegUnit> &RegUnits); |
380 | |
381 | /// Updates the depth of the instructions from Start to End. |
382 | void updateDepths(MachineBasicBlock::iterator Start, |
383 | MachineBasicBlock::iterator End, |
384 | SparseSet<LiveRegUnit> &RegUnits); |
385 | |
386 | }; |
387 | |
388 | /// Get the trace ensemble representing the given trace selection strategy. |
389 | /// The returned Ensemble object is owned by the MachineTraceMetrics analysis, |
390 | /// and valid for the lifetime of the analysis pass. |
391 | Ensemble *getEnsemble(MachineTraceStrategy); |
392 | |
393 | /// Invalidate cached information about MBB. This must be called *before* MBB |
394 | /// is erased, or the CFG is otherwise changed. |
395 | /// |
396 | /// This invalidates per-block information about resource usage for MBB only, |
397 | /// and it invalidates per-trace information for any trace that passes |
398 | /// through MBB. |
399 | /// |
400 | /// Call Ensemble::getTrace() again to update any trace handles. |
401 | void invalidate(const MachineBasicBlock *MBB); |
402 | |
403 | private: |
404 | // One entry per basic block, indexed by block number. |
405 | SmallVector<FixedBlockInfo, 4> BlockInfo; |
406 | |
407 | // Cycles consumed on each processor resource per block. |
408 | // The number of processor resource kinds is constant for a given subtarget, |
409 | // but it is not known at compile time. The number of cycles consumed by |
410 | // block B on processor resource R is at ProcReleaseAtCycles[B*Kinds + R] |
411 | // where Kinds = SchedModel.getNumProcResourceKinds(). |
412 | SmallVector<unsigned, 0> ProcReleaseAtCycles; |
413 | |
414 | // One ensemble per strategy. |
415 | Ensemble |
416 | *Ensembles[static_cast<size_t>(MachineTraceStrategy::TS_NumStrategies)]; |
417 | |
418 | // Convert scaled resource usage to a cycle count that can be compared with |
419 | // latencies. |
420 | unsigned getCycles(unsigned Scaled) { |
421 | unsigned Factor = SchedModel.getLatencyFactor(); |
422 | return (Scaled + Factor - 1) / Factor; |
423 | } |
424 | }; |
425 | |
426 | inline raw_ostream &operator<<(raw_ostream &OS, |
427 | const MachineTraceMetrics::Trace &Tr) { |
428 | Tr.print(OS); |
429 | return OS; |
430 | } |
431 | |
432 | inline raw_ostream &operator<<(raw_ostream &OS, |
433 | const MachineTraceMetrics::Ensemble &En) { |
434 | En.print(OS); |
435 | return OS; |
436 | } |
437 | |
438 | } // end namespace llvm |
439 | |
440 | #endif // LLVM_CODEGEN_MACHINETRACEMETRICS_H |
441 | |