1 | //===- xray-stacks.cpp: XRay Function Call Stack Accounting ---------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements stack-based accounting. It takes XRay traces, and |
10 | // collates statistics across these traces to show a breakdown of time spent |
11 | // at various points of the stack to provide insight into which functions |
12 | // spend the most time in terms of a call stack. We provide a few |
13 | // sorting/filtering options for zero'ing in on the useful stacks. |
14 | // |
15 | //===----------------------------------------------------------------------===// |
16 | |
17 | #include <forward_list> |
18 | #include <numeric> |
19 | |
20 | #include "func-id-helper.h" |
21 | #include "trie-node.h" |
22 | #include "xray-registry.h" |
23 | #include "llvm/ADT/StringExtras.h" |
24 | #include "llvm/Support/CommandLine.h" |
25 | #include "llvm/Support/Errc.h" |
26 | #include "llvm/Support/ErrorHandling.h" |
27 | #include "llvm/Support/FormatAdapters.h" |
28 | #include "llvm/Support/FormatVariadic.h" |
29 | #include "llvm/XRay/Graph.h" |
30 | #include "llvm/XRay/InstrumentationMap.h" |
31 | #include "llvm/XRay/Trace.h" |
32 | |
33 | using namespace llvm; |
34 | using namespace llvm::xray; |
35 | |
36 | static cl::SubCommand Stack("stack" , "Call stack accounting" ); |
37 | static cl::list<std::string> StackInputs(cl::Positional, |
38 | cl::desc("<xray trace>" ), cl::Required, |
39 | cl::sub(Stack), cl::OneOrMore); |
40 | |
41 | static cl::opt<bool> |
42 | StackKeepGoing("keep-going" , cl::desc("Keep going on errors encountered" ), |
43 | cl::sub(Stack), cl::init(Val: false)); |
44 | static cl::alias StackKeepGoing2("k" , cl::aliasopt(StackKeepGoing), |
45 | cl::desc("Alias for -keep-going" )); |
46 | |
47 | // TODO: Does there need to be an option to deduce tail or sibling calls? |
48 | |
49 | static cl::opt<std::string> StacksInstrMap( |
50 | "instr_map" , |
51 | cl::desc("instrumentation map used to identify function ids. " |
52 | "Currently supports elf file instrumentation maps." ), |
53 | cl::sub(Stack), cl::init(Val: "" )); |
54 | static cl::alias StacksInstrMap2("m" , cl::aliasopt(StacksInstrMap), |
55 | cl::desc("Alias for -instr_map" )); |
56 | |
57 | static cl::opt<bool> |
58 | SeparateThreadStacks("per-thread-stacks" , |
59 | cl::desc("Report top stacks within each thread id" ), |
60 | cl::sub(Stack), cl::init(Val: false)); |
61 | |
62 | static cl::opt<bool> |
63 | AggregateThreads("aggregate-threads" , |
64 | cl::desc("Aggregate stack times across threads" ), |
65 | cl::sub(Stack), cl::init(Val: false)); |
66 | |
67 | static cl::opt<bool> |
68 | DumpAllStacks("all-stacks" , |
69 | cl::desc("Dump sum of timings for all stacks. " |
70 | "By default separates stacks per-thread." ), |
71 | cl::sub(Stack), cl::init(Val: false)); |
72 | static cl::alias DumpAllStacksShort("all" , cl::aliasopt(DumpAllStacks), |
73 | cl::desc("Alias for -all-stacks" )); |
74 | |
75 | // TODO(kpw): Add other interesting formats. Perhaps chrome trace viewer format |
76 | // possibly with aggregations or just a linear trace of timings. |
77 | enum StackOutputFormat { HUMAN, FLAMETOOL }; |
78 | |
79 | static cl::opt<StackOutputFormat> StacksOutputFormat( |
80 | "stack-format" , |
81 | cl::desc("The format that output stacks should be " |
82 | "output in. Only applies with all-stacks." ), |
83 | cl::values( |
84 | clEnumValN(HUMAN, "human" , |
85 | "Human readable output. Only valid without -all-stacks." ), |
86 | clEnumValN(FLAMETOOL, "flame" , |
87 | "Format consumable by Brendan Gregg's FlameGraph tool. " |
88 | "Only valid with -all-stacks." )), |
89 | cl::sub(Stack), cl::init(Val: HUMAN)); |
90 | |
91 | // Types of values for each stack in a CallTrie. |
92 | enum class AggregationType { |
93 | TOTAL_TIME, // The total time spent in a stack and its callees. |
94 | INVOCATION_COUNT // The number of times the stack was invoked. |
95 | }; |
96 | |
97 | static cl::opt<AggregationType> RequestedAggregation( |
98 | "aggregation-type" , |
99 | cl::desc("The type of aggregation to do on call stacks." ), |
100 | cl::values( |
101 | clEnumValN( |
102 | AggregationType::TOTAL_TIME, "time" , |
103 | "Capture the total time spent in an all invocations of a stack." ), |
104 | clEnumValN(AggregationType::INVOCATION_COUNT, "count" , |
105 | "Capture the number of times a stack was invoked. " |
106 | "In flamegraph mode, this count also includes invocations " |
107 | "of all callees." )), |
108 | cl::sub(Stack), cl::init(Val: AggregationType::TOTAL_TIME)); |
109 | |
110 | /// A helper struct to work with formatv and XRayRecords. Makes it easier to |
111 | /// use instrumentation map names or addresses in formatted output. |
112 | struct format_xray_record : public FormatAdapter<XRayRecord> { |
113 | explicit format_xray_record(XRayRecord record, |
114 | const FuncIdConversionHelper &conv) |
115 | : FormatAdapter<XRayRecord>(std::move(record)), Converter(&conv) {} |
116 | void format(raw_ostream &Stream, StringRef Style) override { |
117 | Stream << formatv( |
118 | Fmt: "{FuncId: \"{0}\", ThreadId: \"{1}\", RecordType: \"{2}\"}" , |
119 | Vals: Converter->SymbolOrNumber(FuncId: Item.FuncId), Vals&: Item.TId, |
120 | Vals: DecodeRecordType(recordType: Item.RecordType)); |
121 | } |
122 | |
123 | private: |
124 | Twine DecodeRecordType(uint16_t recordType) { |
125 | switch (recordType) { |
126 | case 0: |
127 | return Twine("Fn Entry" ); |
128 | case 1: |
129 | return Twine("Fn Exit" ); |
130 | default: |
131 | // TODO: Add Tail exit when it is added to llvm/XRay/XRayRecord.h |
132 | return Twine("Unknown" ); |
133 | } |
134 | } |
135 | |
136 | const FuncIdConversionHelper *Converter; |
137 | }; |
138 | |
139 | /// The stack command will take a set of XRay traces as arguments, and collects |
140 | /// information about the stacks of instrumented functions that appear in the |
141 | /// traces. We track the following pieces of information: |
142 | /// |
143 | /// - Total time: amount of time/cycles accounted for in the traces. |
144 | /// - Stack count: number of times a specific stack appears in the |
145 | /// traces. Only instrumented functions show up in stacks. |
146 | /// - Cumulative stack time: amount of time spent in a stack accumulated |
147 | /// across the invocations in the traces. |
148 | /// - Cumulative local time: amount of time spent in each instrumented |
149 | /// function showing up in a specific stack, accumulated across the traces. |
150 | /// |
151 | /// Example output for the kind of data we'd like to provide looks like the |
152 | /// following: |
153 | /// |
154 | /// Total time: 3.33234 s |
155 | /// Stack ID: ... |
156 | /// Stack Count: 2093 |
157 | /// # Function Local Time (%) Stack Time (%) |
158 | /// 0 main 2.34 ms 0.07% 3.33234 s 100% |
159 | /// 1 foo() 3.30000 s 99.02% 3.33 s 99.92% |
160 | /// 2 bar() 30 ms 0.90% 30 ms 0.90% |
161 | /// |
162 | /// We can also show distributions of the function call durations with |
163 | /// statistics at each level of the stack. This works by doing the following |
164 | /// algorithm: |
165 | /// |
166 | /// 1. When unwinding, record the duration of each unwound function associated |
167 | /// with the path up to which the unwinding stops. For example: |
168 | /// |
169 | /// Step Duration (? means has start time) |
170 | /// |
171 | /// push a <start time> a = ? |
172 | /// push b <start time> a = ?, a->b = ? |
173 | /// push c <start time> a = ?, a->b = ?, a->b->c = ? |
174 | /// pop c <end time> a = ?, a->b = ?, emit duration(a->b->c) |
175 | /// pop b <end time> a = ?, emit duration(a->b) |
176 | /// push c <start time> a = ?, a->c = ? |
177 | /// pop c <end time> a = ?, emit duration(a->c) |
178 | /// pop a <end time> emit duration(a) |
179 | /// |
180 | /// 2. We then account for the various stacks we've collected, and for each of |
181 | /// them will have measurements that look like the following (continuing |
182 | /// with the above simple example): |
183 | /// |
184 | /// c : [<id("a->b->c"), [durations]>, <id("a->c"), [durations]>] |
185 | /// b : [<id("a->b"), [durations]>] |
186 | /// a : [<id("a"), [durations]>] |
187 | /// |
188 | /// This allows us to compute, for each stack id, and each function that |
189 | /// shows up in the stack, some important statistics like: |
190 | /// |
191 | /// - median |
192 | /// - 99th percentile |
193 | /// - mean + stddev |
194 | /// - count |
195 | /// |
196 | /// 3. For cases where we don't have durations for some of the higher levels |
197 | /// of the stack (perhaps instrumentation wasn't activated when the stack was |
198 | /// entered), we can mark them appropriately. |
199 | /// |
200 | /// Computing this data also allows us to implement lookup by call stack nodes, |
201 | /// so that we can find functions that show up in multiple stack traces and |
202 | /// show the statistical properties of that function in various contexts. We |
203 | /// can compute information similar to the following: |
204 | /// |
205 | /// Function: 'c' |
206 | /// Stacks: 2 / 2 |
207 | /// Stack ID: ... |
208 | /// Stack Count: ... |
209 | /// # Function ... |
210 | /// 0 a ... |
211 | /// 1 b ... |
212 | /// 2 c ... |
213 | /// |
214 | /// Stack ID: ... |
215 | /// Stack Count: ... |
216 | /// # Function ... |
217 | /// 0 a ... |
218 | /// 1 c ... |
219 | /// ----------------... |
220 | /// |
221 | /// Function: 'b' |
222 | /// Stacks: 1 / 2 |
223 | /// Stack ID: ... |
224 | /// Stack Count: ... |
225 | /// # Function ... |
226 | /// 0 a ... |
227 | /// 1 b ... |
228 | /// 2 c ... |
229 | /// |
230 | /// |
231 | /// To do this we require a Trie data structure that will allow us to represent |
232 | /// all the call stacks of instrumented functions in an easily traversible |
233 | /// manner when we do the aggregations and lookups. For instrumented call |
234 | /// sequences like the following: |
235 | /// |
236 | /// a() |
237 | /// b() |
238 | /// c() |
239 | /// d() |
240 | /// c() |
241 | /// |
242 | /// We will have a representation like so: |
243 | /// |
244 | /// a -> b -> c |
245 | /// | | |
246 | /// | +--> d |
247 | /// | |
248 | /// +--> c |
249 | /// |
250 | /// We maintain a sequence of durations on the leaves and in the internal nodes |
251 | /// as we go through and process every record from the XRay trace. We also |
252 | /// maintain an index of unique functions, and provide a means of iterating |
253 | /// through all the instrumented call stacks which we know about. |
254 | |
255 | namespace { |
256 | struct StackDuration { |
257 | llvm::SmallVector<int64_t, 4> TerminalDurations; |
258 | llvm::SmallVector<int64_t, 4> IntermediateDurations; |
259 | }; |
260 | } // namespace |
261 | |
262 | static StackDuration mergeStackDuration(const StackDuration &Left, |
263 | const StackDuration &Right) { |
264 | StackDuration Data{}; |
265 | Data.TerminalDurations.reserve(N: Left.TerminalDurations.size() + |
266 | Right.TerminalDurations.size()); |
267 | Data.IntermediateDurations.reserve(N: Left.IntermediateDurations.size() + |
268 | Right.IntermediateDurations.size()); |
269 | // Aggregate the durations. |
270 | for (auto duration : Left.TerminalDurations) |
271 | Data.TerminalDurations.push_back(Elt: duration); |
272 | for (auto duration : Right.TerminalDurations) |
273 | Data.TerminalDurations.push_back(Elt: duration); |
274 | |
275 | for (auto duration : Left.IntermediateDurations) |
276 | Data.IntermediateDurations.push_back(Elt: duration); |
277 | for (auto duration : Right.IntermediateDurations) |
278 | Data.IntermediateDurations.push_back(Elt: duration); |
279 | return Data; |
280 | } |
281 | |
282 | using StackTrieNode = TrieNode<StackDuration>; |
283 | |
284 | template <AggregationType AggType> |
285 | static std::size_t GetValueForStack(const StackTrieNode *Node); |
286 | |
287 | // When computing total time spent in a stack, we're adding the timings from |
288 | // its callees and the timings from when it was a leaf. |
289 | template <> |
290 | std::size_t |
291 | GetValueForStack<AggregationType::TOTAL_TIME>(const StackTrieNode *Node) { |
292 | auto TopSum = std::accumulate(first: Node->ExtraData.TerminalDurations.begin(), |
293 | last: Node->ExtraData.TerminalDurations.end(), init: 0uLL); |
294 | return std::accumulate(first: Node->ExtraData.IntermediateDurations.begin(), |
295 | last: Node->ExtraData.IntermediateDurations.end(), init: TopSum); |
296 | } |
297 | |
298 | // Calculates how many times a function was invoked. |
299 | // TODO: Hook up option to produce stacks |
300 | template <> |
301 | std::size_t |
302 | GetValueForStack<AggregationType::INVOCATION_COUNT>(const StackTrieNode *Node) { |
303 | return Node->ExtraData.TerminalDurations.size() + |
304 | Node->ExtraData.IntermediateDurations.size(); |
305 | } |
306 | |
307 | // Make sure there are implementations for each enum value. |
308 | template <AggregationType T> struct DependentFalseType : std::false_type {}; |
309 | |
310 | template <AggregationType AggType> |
311 | std::size_t GetValueForStack(const StackTrieNode *Node) { |
312 | static_assert(DependentFalseType<AggType>::value, |
313 | "No implementation found for aggregation type provided." ); |
314 | return 0; |
315 | } |
316 | |
317 | class StackTrie { |
318 | // Avoid the magic number of 4 propagated through the code with an alias. |
319 | // We use this SmallVector to track the root nodes in a call graph. |
320 | using RootVector = SmallVector<StackTrieNode *, 4>; |
321 | |
322 | // We maintain pointers to the roots of the tries we see. |
323 | DenseMap<uint32_t, RootVector> Roots; |
324 | |
325 | // We make sure all the nodes are accounted for in this list. |
326 | std::forward_list<StackTrieNode> NodeStore; |
327 | |
328 | // A map of thread ids to pairs call stack trie nodes and their start times. |
329 | DenseMap<uint32_t, SmallVector<std::pair<StackTrieNode *, uint64_t>, 8>> |
330 | ThreadStackMap; |
331 | |
332 | StackTrieNode *createTrieNode(uint32_t ThreadId, int32_t FuncId, |
333 | StackTrieNode *Parent) { |
334 | NodeStore.push_front(val: StackTrieNode{.FuncId: FuncId, .Parent: Parent, .Callees: {}, .ExtraData: {.TerminalDurations: {}, .IntermediateDurations: {}}}); |
335 | auto I = NodeStore.begin(); |
336 | auto *Node = &*I; |
337 | if (!Parent) |
338 | Roots[ThreadId].push_back(Elt: Node); |
339 | return Node; |
340 | } |
341 | |
342 | StackTrieNode *findRootNode(uint32_t ThreadId, int32_t FuncId) { |
343 | const auto &RootsByThread = Roots[ThreadId]; |
344 | auto I = find_if(Range: RootsByThread, |
345 | P: [&](StackTrieNode *N) { return N->FuncId == FuncId; }); |
346 | return (I == RootsByThread.end()) ? nullptr : *I; |
347 | } |
348 | |
349 | public: |
350 | enum class AccountRecordStatus { |
351 | OK, // Successfully processed |
352 | ENTRY_NOT_FOUND, // An exit record had no matching call stack entry |
353 | UNKNOWN_RECORD_TYPE |
354 | }; |
355 | |
356 | struct AccountRecordState { |
357 | // We keep track of whether the call stack is currently unwinding. |
358 | bool wasLastRecordExit; |
359 | |
360 | static AccountRecordState CreateInitialState() { return {.wasLastRecordExit: false}; } |
361 | }; |
362 | |
363 | AccountRecordStatus accountRecord(const XRayRecord &R, |
364 | AccountRecordState *state) { |
365 | auto &TS = ThreadStackMap[R.TId]; |
366 | switch (R.Type) { |
367 | case RecordTypes::CUSTOM_EVENT: |
368 | case RecordTypes::TYPED_EVENT: |
369 | return AccountRecordStatus::OK; |
370 | case RecordTypes::ENTER: |
371 | case RecordTypes::ENTER_ARG: { |
372 | state->wasLastRecordExit = false; |
373 | // When we encounter a new function entry, we want to record the TSC for |
374 | // that entry, and the function id. Before doing so we check the top of |
375 | // the stack to see if there are callees that already represent this |
376 | // function. |
377 | if (TS.empty()) { |
378 | auto *Root = findRootNode(ThreadId: R.TId, FuncId: R.FuncId); |
379 | TS.emplace_back(Args: Root ? Root : createTrieNode(ThreadId: R.TId, FuncId: R.FuncId, Parent: nullptr), |
380 | Args: R.TSC); |
381 | return AccountRecordStatus::OK; |
382 | } |
383 | |
384 | auto &Top = TS.back(); |
385 | auto I = find_if(Range&: Top.first->Callees, |
386 | P: [&](StackTrieNode *N) { return N->FuncId == R.FuncId; }); |
387 | if (I == Top.first->Callees.end()) { |
388 | // We didn't find the callee in the stack trie, so we're going to |
389 | // add to the stack then set up the pointers properly. |
390 | auto N = createTrieNode(ThreadId: R.TId, FuncId: R.FuncId, Parent: Top.first); |
391 | Top.first->Callees.emplace_back(Args&: N); |
392 | |
393 | // Top may be invalidated after this statement. |
394 | TS.emplace_back(Args&: N, Args: R.TSC); |
395 | } else { |
396 | // We found the callee in the stack trie, so we'll use that pointer |
397 | // instead, add it to the stack associated with the TSC. |
398 | TS.emplace_back(Args&: *I, Args: R.TSC); |
399 | } |
400 | return AccountRecordStatus::OK; |
401 | } |
402 | case RecordTypes::EXIT: |
403 | case RecordTypes::TAIL_EXIT: { |
404 | bool wasLastRecordExit = state->wasLastRecordExit; |
405 | state->wasLastRecordExit = true; |
406 | // The exit case is more interesting, since we want to be able to deduce |
407 | // missing exit records. To do that properly, we need to look up the stack |
408 | // and see whether the exit record matches any of the entry records. If it |
409 | // does match, we attempt to record the durations as we pop the stack to |
410 | // where we see the parent. |
411 | if (TS.empty()) { |
412 | // Short circuit, and say we can't find it. |
413 | |
414 | return AccountRecordStatus::ENTRY_NOT_FOUND; |
415 | } |
416 | |
417 | auto FunctionEntryMatch = find_if( |
418 | Range: reverse(C&: TS), P: [&](const std::pair<StackTrieNode *, uint64_t> &E) { |
419 | return E.first->FuncId == R.FuncId; |
420 | }); |
421 | auto status = AccountRecordStatus::OK; |
422 | if (FunctionEntryMatch == TS.rend()) { |
423 | status = AccountRecordStatus::ENTRY_NOT_FOUND; |
424 | } else { |
425 | // Account for offset of 1 between reverse and forward iterators. We |
426 | // want the forward iterator to include the function that is exited. |
427 | ++FunctionEntryMatch; |
428 | } |
429 | auto I = FunctionEntryMatch.base(); |
430 | for (auto &E : make_range(x: I, y: TS.end() - 1)) |
431 | E.first->ExtraData.IntermediateDurations.push_back( |
432 | Elt: std::max(a: E.second, b: R.TSC) - std::min(a: E.second, b: R.TSC)); |
433 | auto &Deepest = TS.back(); |
434 | if (wasLastRecordExit) |
435 | Deepest.first->ExtraData.IntermediateDurations.push_back( |
436 | Elt: std::max(a: Deepest.second, b: R.TSC) - std::min(a: Deepest.second, b: R.TSC)); |
437 | else |
438 | Deepest.first->ExtraData.TerminalDurations.push_back( |
439 | Elt: std::max(a: Deepest.second, b: R.TSC) - std::min(a: Deepest.second, b: R.TSC)); |
440 | TS.erase(CS: I, CE: TS.end()); |
441 | return status; |
442 | } |
443 | } |
444 | return AccountRecordStatus::UNKNOWN_RECORD_TYPE; |
445 | } |
446 | |
447 | bool isEmpty() const { return Roots.empty(); } |
448 | |
449 | void printStack(raw_ostream &OS, const StackTrieNode *Top, |
450 | FuncIdConversionHelper &FN) { |
451 | // Traverse the pointers up to the parent, noting the sums, then print |
452 | // in reverse order (callers at top, callees down bottom). |
453 | SmallVector<const StackTrieNode *, 8> CurrentStack; |
454 | for (auto *F = Top; F != nullptr; F = F->Parent) |
455 | CurrentStack.push_back(Elt: F); |
456 | int Level = 0; |
457 | OS << formatv(Fmt: "{0,-5} {1,-60} {2,+12} {3,+16}\n" , Vals: "lvl" , Vals: "function" , |
458 | Vals: "count" , Vals: "sum" ); |
459 | for (auto *F : reverse(C: drop_begin(RangeOrContainer&: CurrentStack))) { |
460 | auto Sum = std::accumulate(first: F->ExtraData.IntermediateDurations.begin(), |
461 | last: F->ExtraData.IntermediateDurations.end(), init: 0LL); |
462 | auto FuncId = FN.SymbolOrNumber(FuncId: F->FuncId); |
463 | OS << formatv(Fmt: "#{0,-4} {1,-60} {2,+12} {3,+16}\n" , Vals: Level++, |
464 | Vals: FuncId.size() > 60 ? FuncId.substr(pos: 0, n: 57) + "..." : FuncId, |
465 | Vals: F->ExtraData.IntermediateDurations.size(), Vals&: Sum); |
466 | } |
467 | auto *Leaf = *CurrentStack.begin(); |
468 | auto LeafSum = |
469 | std::accumulate(first: Leaf->ExtraData.TerminalDurations.begin(), |
470 | last: Leaf->ExtraData.TerminalDurations.end(), init: 0LL); |
471 | auto LeafFuncId = FN.SymbolOrNumber(FuncId: Leaf->FuncId); |
472 | OS << formatv(Fmt: "#{0,-4} {1,-60} {2,+12} {3,+16}\n" , Vals: Level++, |
473 | Vals: LeafFuncId.size() > 60 ? LeafFuncId.substr(pos: 0, n: 57) + "..." |
474 | : LeafFuncId, |
475 | Vals: Leaf->ExtraData.TerminalDurations.size(), Vals&: LeafSum); |
476 | OS << "\n" ; |
477 | } |
478 | |
479 | /// Prints top stacks for each thread. |
480 | void printPerThread(raw_ostream &OS, FuncIdConversionHelper &FN) { |
481 | for (const auto &iter : Roots) { |
482 | OS << "Thread " << iter.first << ":\n" ; |
483 | print(OS, FN, RootValues: iter.second); |
484 | OS << "\n" ; |
485 | } |
486 | } |
487 | |
488 | /// Prints timing sums for each stack in each threads. |
489 | template <AggregationType AggType> |
490 | void printAllPerThread(raw_ostream &OS, FuncIdConversionHelper &FN, |
491 | StackOutputFormat format) { |
492 | for (const auto &iter : Roots) |
493 | printAll<AggType>(OS, FN, iter.second, iter.first, true); |
494 | } |
495 | |
496 | /// Prints top stacks from looking at all the leaves and ignoring thread IDs. |
497 | /// Stacks that consist of the same function IDs but were called in different |
498 | /// thread IDs are not considered unique in this printout. |
499 | void printIgnoringThreads(raw_ostream &OS, FuncIdConversionHelper &FN) { |
500 | RootVector RootValues; |
501 | |
502 | // Function to pull the values out of a map iterator. |
503 | using RootsType = decltype(Roots.begin())::value_type; |
504 | auto MapValueFn = [](const RootsType &Value) { return Value.second; }; |
505 | |
506 | for (const auto &RootNodeRange : |
507 | make_range(x: map_iterator(I: Roots.begin(), F: MapValueFn), |
508 | y: map_iterator(I: Roots.end(), F: MapValueFn))) { |
509 | for (auto *RootNode : RootNodeRange) |
510 | RootValues.push_back(Elt: RootNode); |
511 | } |
512 | |
513 | print(OS, FN, RootValues); |
514 | } |
515 | |
516 | /// Creates a merged list of Tries for unique stacks that disregards their |
517 | /// thread IDs. |
518 | RootVector mergeAcrossThreads(std::forward_list<StackTrieNode> &NodeStore) { |
519 | RootVector MergedByThreadRoots; |
520 | for (const auto &MapIter : Roots) { |
521 | const auto &RootNodeVector = MapIter.second; |
522 | for (auto *Node : RootNodeVector) { |
523 | auto MaybeFoundIter = |
524 | find_if(Range&: MergedByThreadRoots, P: [Node](StackTrieNode *elem) { |
525 | return Node->FuncId == elem->FuncId; |
526 | }); |
527 | if (MaybeFoundIter == MergedByThreadRoots.end()) { |
528 | MergedByThreadRoots.push_back(Elt: Node); |
529 | } else { |
530 | MergedByThreadRoots.push_back(Elt: mergeTrieNodes( |
531 | Left: **MaybeFoundIter, Right: *Node, NewParent: nullptr, NodeStore, MergeCallable&: mergeStackDuration)); |
532 | MergedByThreadRoots.erase(CI: MaybeFoundIter); |
533 | } |
534 | } |
535 | } |
536 | return MergedByThreadRoots; |
537 | } |
538 | |
539 | /// Print timing sums for all stacks merged by Thread ID. |
540 | template <AggregationType AggType> |
541 | void printAllAggregatingThreads(raw_ostream &OS, FuncIdConversionHelper &FN, |
542 | StackOutputFormat format) { |
543 | std::forward_list<StackTrieNode> AggregatedNodeStore; |
544 | RootVector MergedByThreadRoots = mergeAcrossThreads(NodeStore&: AggregatedNodeStore); |
545 | bool reportThreadId = false; |
546 | printAll<AggType>(OS, FN, MergedByThreadRoots, |
547 | /*threadId*/ 0, reportThreadId); |
548 | } |
549 | |
550 | /// Merges the trie by thread id before printing top stacks. |
551 | void printAggregatingThreads(raw_ostream &OS, FuncIdConversionHelper &FN) { |
552 | std::forward_list<StackTrieNode> AggregatedNodeStore; |
553 | RootVector MergedByThreadRoots = mergeAcrossThreads(NodeStore&: AggregatedNodeStore); |
554 | print(OS, FN, RootValues: MergedByThreadRoots); |
555 | } |
556 | |
557 | // TODO: Add a format option when more than one are supported. |
558 | template <AggregationType AggType> |
559 | void printAll(raw_ostream &OS, FuncIdConversionHelper &FN, |
560 | RootVector RootValues, uint32_t ThreadId, bool ReportThread) { |
561 | SmallVector<const StackTrieNode *, 16> S; |
562 | for (const auto *N : RootValues) { |
563 | S.clear(); |
564 | S.push_back(Elt: N); |
565 | while (!S.empty()) { |
566 | auto *Top = S.pop_back_val(); |
567 | printSingleStack<AggType>(OS, FN, ReportThread, ThreadId, Top); |
568 | for (const auto *C : Top->Callees) |
569 | S.push_back(Elt: C); |
570 | } |
571 | } |
572 | } |
573 | |
574 | /// Prints values for stacks in a format consumable for the flamegraph.pl |
575 | /// tool. This is a line based format that lists each level in the stack |
576 | /// hierarchy in a semicolon delimited form followed by a space and a numeric |
577 | /// value. If breaking down by thread, the thread ID will be added as the |
578 | /// root level of the stack. |
579 | template <AggregationType AggType> |
580 | void printSingleStack(raw_ostream &OS, FuncIdConversionHelper &Converter, |
581 | bool ReportThread, uint32_t ThreadId, |
582 | const StackTrieNode *Node) { |
583 | if (ReportThread) |
584 | OS << "thread_" << ThreadId << ";" ; |
585 | SmallVector<const StackTrieNode *, 5> lineage{}; |
586 | lineage.push_back(Elt: Node); |
587 | while (lineage.back()->Parent != nullptr) |
588 | lineage.push_back(Elt: lineage.back()->Parent); |
589 | while (!lineage.empty()) { |
590 | OS << Converter.SymbolOrNumber(FuncId: lineage.back()->FuncId) << ";" ; |
591 | lineage.pop_back(); |
592 | } |
593 | OS << " " << GetValueForStack<AggType>(Node) << "\n" ; |
594 | } |
595 | |
596 | void print(raw_ostream &OS, FuncIdConversionHelper &FN, |
597 | RootVector RootValues) { |
598 | // Go through each of the roots, and traverse the call stack, producing the |
599 | // aggregates as you go along. Remember these aggregates and stacks, and |
600 | // show summary statistics about: |
601 | // |
602 | // - Total number of unique stacks |
603 | // - Top 10 stacks by count |
604 | // - Top 10 stacks by aggregate duration |
605 | SmallVector<std::pair<const StackTrieNode *, uint64_t>, 11> |
606 | TopStacksByCount; |
607 | SmallVector<std::pair<const StackTrieNode *, uint64_t>, 11> TopStacksBySum; |
608 | auto greater_second = |
609 | [](const std::pair<const StackTrieNode *, uint64_t> &A, |
610 | const std::pair<const StackTrieNode *, uint64_t> &B) { |
611 | return A.second > B.second; |
612 | }; |
613 | uint64_t UniqueStacks = 0; |
614 | for (const auto *N : RootValues) { |
615 | SmallVector<const StackTrieNode *, 16> S; |
616 | S.emplace_back(Args&: N); |
617 | |
618 | while (!S.empty()) { |
619 | auto *Top = S.pop_back_val(); |
620 | |
621 | // We only start printing the stack (by walking up the parent pointers) |
622 | // when we get to a leaf function. |
623 | if (!Top->ExtraData.TerminalDurations.empty()) { |
624 | ++UniqueStacks; |
625 | auto TopSum = |
626 | std::accumulate(first: Top->ExtraData.TerminalDurations.begin(), |
627 | last: Top->ExtraData.TerminalDurations.end(), init: 0uLL); |
628 | { |
629 | auto E = std::make_pair(x&: Top, y&: TopSum); |
630 | TopStacksBySum.insert( |
631 | I: llvm::lower_bound(Range&: TopStacksBySum, Value&: E, C: greater_second), Elt: E); |
632 | if (TopStacksBySum.size() == 11) |
633 | TopStacksBySum.pop_back(); |
634 | } |
635 | { |
636 | auto E = |
637 | std::make_pair(x&: Top, y: Top->ExtraData.TerminalDurations.size()); |
638 | TopStacksByCount.insert( |
639 | I: llvm::lower_bound(Range&: TopStacksByCount, Value&: E, C: greater_second), Elt: E); |
640 | if (TopStacksByCount.size() == 11) |
641 | TopStacksByCount.pop_back(); |
642 | } |
643 | } |
644 | for (const auto *C : Top->Callees) |
645 | S.push_back(Elt: C); |
646 | } |
647 | } |
648 | |
649 | // Now print the statistics in the end. |
650 | OS << "\n" ; |
651 | OS << "Unique Stacks: " << UniqueStacks << "\n" ; |
652 | OS << "Top 10 Stacks by leaf sum:\n\n" ; |
653 | for (const auto &P : TopStacksBySum) { |
654 | OS << "Sum: " << P.second << "\n" ; |
655 | printStack(OS, Top: P.first, FN); |
656 | } |
657 | OS << "\n" ; |
658 | OS << "Top 10 Stacks by leaf count:\n\n" ; |
659 | for (const auto &P : TopStacksByCount) { |
660 | OS << "Count: " << P.second << "\n" ; |
661 | printStack(OS, Top: P.first, FN); |
662 | } |
663 | OS << "\n" ; |
664 | } |
665 | }; |
666 | |
667 | static std::string CreateErrorMessage(StackTrie::AccountRecordStatus Error, |
668 | const XRayRecord &Record, |
669 | const FuncIdConversionHelper &Converter) { |
670 | switch (Error) { |
671 | case StackTrie::AccountRecordStatus::ENTRY_NOT_FOUND: |
672 | return std::string( |
673 | formatv(Fmt: "Found record {0} with no matching function entry\n" , |
674 | Vals: format_xray_record(Record, Converter))); |
675 | default: |
676 | return std::string(formatv(Fmt: "Unknown error type for record {0}\n" , |
677 | Vals: format_xray_record(Record, Converter))); |
678 | } |
679 | } |
680 | |
681 | static CommandRegistration Unused(&Stack, []() -> Error { |
682 | // Load each file provided as a command-line argument. For each one of them |
683 | // account to a single StackTrie, and just print the whole trie for now. |
684 | StackTrie ST; |
685 | InstrumentationMap Map; |
686 | if (!StacksInstrMap.empty()) { |
687 | auto InstrumentationMapOrError = loadInstrumentationMap(Filename: StacksInstrMap); |
688 | if (!InstrumentationMapOrError) |
689 | return joinErrors( |
690 | E1: make_error<StringError>( |
691 | Args: Twine("Cannot open instrumentation map: " ) + StacksInstrMap, |
692 | Args: std::make_error_code(e: std::errc::invalid_argument)), |
693 | E2: InstrumentationMapOrError.takeError()); |
694 | Map = std::move(*InstrumentationMapOrError); |
695 | } |
696 | |
697 | if (SeparateThreadStacks && AggregateThreads) |
698 | return make_error<StringError>( |
699 | Args: Twine("Can't specify options for per thread reporting and reporting " |
700 | "that aggregates threads." ), |
701 | Args: std::make_error_code(e: std::errc::invalid_argument)); |
702 | |
703 | if (!DumpAllStacks && StacksOutputFormat != HUMAN) |
704 | return make_error<StringError>( |
705 | Args: Twine("Can't specify a non-human format without -all-stacks." ), |
706 | Args: std::make_error_code(e: std::errc::invalid_argument)); |
707 | |
708 | if (DumpAllStacks && StacksOutputFormat == HUMAN) |
709 | return make_error<StringError>( |
710 | Args: Twine("You must specify a non-human format when reporting with " |
711 | "-all-stacks." ), |
712 | Args: std::make_error_code(e: std::errc::invalid_argument)); |
713 | |
714 | symbolize::LLVMSymbolizer Symbolizer; |
715 | FuncIdConversionHelper FuncIdHelper(StacksInstrMap, Symbolizer, |
716 | Map.getFunctionAddresses()); |
717 | // TODO: Someday, support output to files instead of just directly to |
718 | // standard output. |
719 | for (const auto &Filename : StackInputs) { |
720 | auto TraceOrErr = loadTraceFile(Filename); |
721 | if (!TraceOrErr) { |
722 | if (!StackKeepGoing) |
723 | return joinErrors( |
724 | E1: make_error<StringError>( |
725 | Args: Twine("Failed loading input file '" ) + Filename + "'" , |
726 | Args: std::make_error_code(e: std::errc::invalid_argument)), |
727 | E2: TraceOrErr.takeError()); |
728 | logAllUnhandledErrors(E: TraceOrErr.takeError(), OS&: errs()); |
729 | continue; |
730 | } |
731 | auto &T = *TraceOrErr; |
732 | StackTrie::AccountRecordState AccountRecordState = |
733 | StackTrie::AccountRecordState::CreateInitialState(); |
734 | for (const auto &Record : T) { |
735 | auto error = ST.accountRecord(R: Record, state: &AccountRecordState); |
736 | if (error != StackTrie::AccountRecordStatus::OK) { |
737 | if (!StackKeepGoing) |
738 | return make_error<StringError>( |
739 | Args: CreateErrorMessage(Error: error, Record, Converter: FuncIdHelper), |
740 | Args: make_error_code(E: errc::illegal_byte_sequence)); |
741 | errs() << CreateErrorMessage(Error: error, Record, Converter: FuncIdHelper); |
742 | } |
743 | } |
744 | } |
745 | if (ST.isEmpty()) { |
746 | return make_error<StringError>( |
747 | Args: "No instrumented calls were accounted in the input file." , |
748 | Args: make_error_code(E: errc::result_out_of_range)); |
749 | } |
750 | |
751 | // Report the stacks in a long form mode for another tool to analyze. |
752 | if (DumpAllStacks) { |
753 | if (AggregateThreads) { |
754 | switch (RequestedAggregation) { |
755 | case AggregationType::TOTAL_TIME: |
756 | ST.printAllAggregatingThreads<AggregationType::TOTAL_TIME>( |
757 | OS&: outs(), FN&: FuncIdHelper, format: StacksOutputFormat); |
758 | break; |
759 | case AggregationType::INVOCATION_COUNT: |
760 | ST.printAllAggregatingThreads<AggregationType::INVOCATION_COUNT>( |
761 | OS&: outs(), FN&: FuncIdHelper, format: StacksOutputFormat); |
762 | break; |
763 | } |
764 | } else { |
765 | switch (RequestedAggregation) { |
766 | case AggregationType::TOTAL_TIME: |
767 | ST.printAllPerThread<AggregationType::TOTAL_TIME>(OS&: outs(), FN&: FuncIdHelper, |
768 | format: StacksOutputFormat); |
769 | break; |
770 | case AggregationType::INVOCATION_COUNT: |
771 | ST.printAllPerThread<AggregationType::INVOCATION_COUNT>( |
772 | OS&: outs(), FN&: FuncIdHelper, format: StacksOutputFormat); |
773 | break; |
774 | } |
775 | } |
776 | return Error::success(); |
777 | } |
778 | |
779 | // We're only outputting top stacks. |
780 | if (AggregateThreads) { |
781 | ST.printAggregatingThreads(OS&: outs(), FN&: FuncIdHelper); |
782 | } else if (SeparateThreadStacks) { |
783 | ST.printPerThread(OS&: outs(), FN&: FuncIdHelper); |
784 | } else { |
785 | ST.printIgnoringThreads(OS&: outs(), FN&: FuncIdHelper); |
786 | } |
787 | return Error::success(); |
788 | }); |
789 | |