1 | //===- xray-stacks.cpp: XRay Function Call Stack Accounting ---------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements stack-based accounting. It takes XRay traces, and |
10 | // collates statistics across these traces to show a breakdown of time spent |
11 | // at various points of the stack to provide insight into which functions |
12 | // spend the most time in terms of a call stack. We provide a few |
13 | // sorting/filtering options for zero'ing in on the useful stacks. |
14 | // |
15 | //===----------------------------------------------------------------------===// |
16 | |
17 | #include <forward_list> |
18 | #include <numeric> |
19 | |
20 | #include "func-id-helper.h" |
21 | #include "trie-node.h" |
22 | #include "xray-registry.h" |
23 | #include "llvm/ADT/StringExtras.h" |
24 | #include "llvm/Support/CommandLine.h" |
25 | #include "llvm/Support/Errc.h" |
26 | #include "llvm/Support/ErrorHandling.h" |
27 | #include "llvm/Support/FormatAdapters.h" |
28 | #include "llvm/Support/FormatVariadic.h" |
29 | #include "llvm/XRay/Graph.h" |
30 | #include "llvm/XRay/InstrumentationMap.h" |
31 | #include "llvm/XRay/Trace.h" |
32 | |
33 | using namespace llvm; |
34 | using namespace llvm::xray; |
35 | |
36 | static cl::SubCommand Stack("stack" , "Call stack accounting" ); |
37 | static cl::list<std::string> StackInputs(cl::Positional, |
38 | cl::desc("<xray trace>" ), cl::Required, |
39 | cl::sub(Stack), cl::OneOrMore); |
40 | |
41 | static cl::opt<bool> |
42 | StackKeepGoing("keep-going" , cl::desc("Keep going on errors encountered" ), |
43 | cl::sub(Stack), cl::init(Val: false)); |
44 | static cl::alias StackKeepGoing2("k" , cl::aliasopt(StackKeepGoing), |
45 | cl::desc("Alias for -keep-going" )); |
46 | |
47 | // TODO: Does there need to be an option to deduce tail or sibling calls? |
48 | |
49 | static cl::opt<std::string> StacksInstrMap( |
50 | "instr_map" , |
51 | cl::desc("instrumentation map used to identify function ids. " |
52 | "Currently supports elf file instrumentation maps." ), |
53 | cl::sub(Stack), cl::init(Val: "" )); |
54 | static cl::alias StacksInstrMap2("m" , cl::aliasopt(StacksInstrMap), |
55 | cl::desc("Alias for -instr_map" )); |
56 | |
57 | static cl::opt<bool> |
58 | SeparateThreadStacks("per-thread-stacks" , |
59 | cl::desc("Report top stacks within each thread id" ), |
60 | cl::sub(Stack), cl::init(Val: false)); |
61 | |
62 | static cl::opt<bool> |
63 | AggregateThreads("aggregate-threads" , |
64 | cl::desc("Aggregate stack times across threads" ), |
65 | cl::sub(Stack), cl::init(Val: false)); |
66 | |
67 | static cl::opt<bool> |
68 | DumpAllStacks("all-stacks" , |
69 | cl::desc("Dump sum of timings for all stacks. " |
70 | "By default separates stacks per-thread." ), |
71 | cl::sub(Stack), cl::init(Val: false)); |
72 | static cl::alias DumpAllStacksShort("all" , cl::aliasopt(DumpAllStacks), |
73 | cl::desc("Alias for -all-stacks" )); |
74 | |
75 | // TODO(kpw): Add other interesting formats. Perhaps chrome trace viewer format |
76 | // possibly with aggregations or just a linear trace of timings. |
77 | enum StackOutputFormat { HUMAN, FLAMETOOL }; |
78 | |
79 | static cl::opt<StackOutputFormat> StacksOutputFormat( |
80 | "stack-format" , |
81 | cl::desc("The format that output stacks should be " |
82 | "output in. Only applies with all-stacks." ), |
83 | cl::values( |
84 | clEnumValN(HUMAN, "human" , |
85 | "Human readable output. Only valid without -all-stacks." ), |
86 | clEnumValN(FLAMETOOL, "flame" , |
87 | "Format consumable by Brendan Gregg's FlameGraph tool. " |
88 | "Only valid with -all-stacks." )), |
89 | cl::sub(Stack), cl::init(Val: HUMAN)); |
90 | |
91 | // Types of values for each stack in a CallTrie. |
92 | enum class AggregationType { |
93 | TOTAL_TIME, // The total time spent in a stack and its callees. |
94 | INVOCATION_COUNT // The number of times the stack was invoked. |
95 | }; |
96 | |
97 | static cl::opt<AggregationType> RequestedAggregation( |
98 | "aggregation-type" , |
99 | cl::desc("The type of aggregation to do on call stacks." ), |
100 | cl::values( |
101 | clEnumValN( |
102 | AggregationType::TOTAL_TIME, "time" , |
103 | "Capture the total time spent in an all invocations of a stack." ), |
104 | clEnumValN(AggregationType::INVOCATION_COUNT, "count" , |
105 | "Capture the number of times a stack was invoked. " |
106 | "In flamegraph mode, this count also includes invocations " |
107 | "of all callees." )), |
108 | cl::sub(Stack), cl::init(Val: AggregationType::TOTAL_TIME)); |
109 | |
110 | /// A helper struct to work with formatv and XRayRecords. Makes it easier to |
111 | /// use instrumentation map names or addresses in formatted output. |
112 | struct format_xray_record : public FormatAdapter<XRayRecord> { |
113 | explicit format_xray_record(XRayRecord record, |
114 | const FuncIdConversionHelper &conv) |
115 | : FormatAdapter<XRayRecord>(std::move(record)), Converter(&conv) {} |
116 | void format(raw_ostream &Stream, StringRef Style) override { |
117 | Stream << formatv( |
118 | Fmt: "{FuncId: \"{0}\", ThreadId: \"{1}\", RecordType: \"{2}\"}" , |
119 | Vals: Converter->SymbolOrNumber(FuncId: Item.FuncId), Vals&: Item.TId, |
120 | Vals: DecodeRecordType(recordType: Item.RecordType)); |
121 | } |
122 | |
123 | private: |
124 | Twine DecodeRecordType(uint16_t recordType) { |
125 | switch (recordType) { |
126 | case 0: |
127 | return Twine("Fn Entry" ); |
128 | case 1: |
129 | return Twine("Fn Exit" ); |
130 | default: |
131 | // TODO: Add Tail exit when it is added to llvm/XRay/XRayRecord.h |
132 | return Twine("Unknown" ); |
133 | } |
134 | } |
135 | |
136 | const FuncIdConversionHelper *Converter; |
137 | }; |
138 | |
139 | /// The stack command will take a set of XRay traces as arguments, and collects |
140 | /// information about the stacks of instrumented functions that appear in the |
141 | /// traces. We track the following pieces of information: |
142 | /// |
143 | /// - Total time: amount of time/cycles accounted for in the traces. |
144 | /// - Stack count: number of times a specific stack appears in the |
145 | /// traces. Only instrumented functions show up in stacks. |
146 | /// - Cumulative stack time: amount of time spent in a stack accumulated |
147 | /// across the invocations in the traces. |
148 | /// - Cumulative local time: amount of time spent in each instrumented |
149 | /// function showing up in a specific stack, accumulated across the traces. |
150 | /// |
151 | /// Example output for the kind of data we'd like to provide looks like the |
152 | /// following: |
153 | /// |
154 | /// Total time: 3.33234 s |
155 | /// Stack ID: ... |
156 | /// Stack Count: 2093 |
157 | /// # Function Local Time (%) Stack Time (%) |
158 | /// 0 main 2.34 ms 0.07% 3.33234 s 100% |
159 | /// 1 foo() 3.30000 s 99.02% 3.33 s 99.92% |
160 | /// 2 bar() 30 ms 0.90% 30 ms 0.90% |
161 | /// |
162 | /// We can also show distributions of the function call durations with |
163 | /// statistics at each level of the stack. This works by doing the following |
164 | /// algorithm: |
165 | /// |
166 | /// 1. When unwinding, record the duration of each unwound function associated |
167 | /// with the path up to which the unwinding stops. For example: |
168 | /// |
169 | /// Step Duration (? means has start time) |
170 | /// |
171 | /// push a <start time> a = ? |
172 | /// push b <start time> a = ?, a->b = ? |
173 | /// push c <start time> a = ?, a->b = ?, a->b->c = ? |
174 | /// pop c <end time> a = ?, a->b = ?, emit duration(a->b->c) |
175 | /// pop b <end time> a = ?, emit duration(a->b) |
176 | /// push c <start time> a = ?, a->c = ? |
177 | /// pop c <end time> a = ?, emit duration(a->c) |
178 | /// pop a <end time> emit duration(a) |
179 | /// |
180 | /// 2. We then account for the various stacks we've collected, and for each of |
181 | /// them will have measurements that look like the following (continuing |
182 | /// with the above simple example): |
183 | /// |
184 | /// c : [<id("a->b->c"), [durations]>, <id("a->c"), [durations]>] |
185 | /// b : [<id("a->b"), [durations]>] |
186 | /// a : [<id("a"), [durations]>] |
187 | /// |
188 | /// This allows us to compute, for each stack id, and each function that |
189 | /// shows up in the stack, some important statistics like: |
190 | /// |
191 | /// - median |
192 | /// - 99th percentile |
193 | /// - mean + stddev |
194 | /// - count |
195 | /// |
196 | /// 3. For cases where we don't have durations for some of the higher levels |
197 | /// of the stack (perhaps instrumentation wasn't activated when the stack was |
198 | /// entered), we can mark them appropriately. |
199 | /// |
200 | /// Computing this data also allows us to implement lookup by call stack nodes, |
201 | /// so that we can find functions that show up in multiple stack traces and |
202 | /// show the statistical properties of that function in various contexts. We |
203 | /// can compute information similar to the following: |
204 | /// |
205 | /// Function: 'c' |
206 | /// Stacks: 2 / 2 |
207 | /// Stack ID: ... |
208 | /// Stack Count: ... |
209 | /// # Function ... |
210 | /// 0 a ... |
211 | /// 1 b ... |
212 | /// 2 c ... |
213 | /// |
214 | /// Stack ID: ... |
215 | /// Stack Count: ... |
216 | /// # Function ... |
217 | /// 0 a ... |
218 | /// 1 c ... |
219 | /// ----------------... |
220 | /// |
221 | /// Function: 'b' |
222 | /// Stacks: 1 / 2 |
223 | /// Stack ID: ... |
224 | /// Stack Count: ... |
225 | /// # Function ... |
226 | /// 0 a ... |
227 | /// 1 b ... |
228 | /// 2 c ... |
229 | /// |
230 | /// |
231 | /// To do this we require a Trie data structure that will allow us to represent |
232 | /// all the call stacks of instrumented functions in an easily traversible |
233 | /// manner when we do the aggregations and lookups. For instrumented call |
234 | /// sequences like the following: |
235 | /// |
236 | /// a() |
237 | /// b() |
238 | /// c() |
239 | /// d() |
240 | /// c() |
241 | /// |
242 | /// We will have a representation like so: |
243 | /// |
244 | /// a -> b -> c |
245 | /// | | |
246 | /// | +--> d |
247 | /// | |
248 | /// +--> c |
249 | /// |
250 | /// We maintain a sequence of durations on the leaves and in the internal nodes |
251 | /// as we go through and process every record from the XRay trace. We also |
252 | /// maintain an index of unique functions, and provide a means of iterating |
253 | /// through all the instrumented call stacks which we know about. |
254 | |
255 | namespace { |
256 | struct StackDuration { |
257 | llvm::SmallVector<int64_t, 4> TerminalDurations; |
258 | llvm::SmallVector<int64_t, 4> IntermediateDurations; |
259 | }; |
260 | } // namespace |
261 | |
262 | static StackDuration mergeStackDuration(const StackDuration &Left, |
263 | const StackDuration &Right) { |
264 | StackDuration Data{}; |
265 | Data.TerminalDurations.reserve(N: Left.TerminalDurations.size() + |
266 | Right.TerminalDurations.size()); |
267 | Data.IntermediateDurations.reserve(N: Left.IntermediateDurations.size() + |
268 | Right.IntermediateDurations.size()); |
269 | // Aggregate the durations. |
270 | llvm::append_range(C&: Data.TerminalDurations, R: Left.TerminalDurations); |
271 | llvm::append_range(C&: Data.TerminalDurations, R: Right.TerminalDurations); |
272 | |
273 | llvm::append_range(C&: Data.IntermediateDurations, R: Left.IntermediateDurations); |
274 | llvm::append_range(C&: Data.IntermediateDurations, R: Right.IntermediateDurations); |
275 | return Data; |
276 | } |
277 | |
278 | using StackTrieNode = TrieNode<StackDuration>; |
279 | |
280 | template <AggregationType AggType> |
281 | static std::size_t GetValueForStack(const StackTrieNode *Node); |
282 | |
283 | // When computing total time spent in a stack, we're adding the timings from |
284 | // its callees and the timings from when it was a leaf. |
285 | template <> |
286 | std::size_t |
287 | GetValueForStack<AggregationType::TOTAL_TIME>(const StackTrieNode *Node) { |
288 | auto TopSum = std::accumulate(first: Node->ExtraData.TerminalDurations.begin(), |
289 | last: Node->ExtraData.TerminalDurations.end(), init: 0uLL); |
290 | return std::accumulate(first: Node->ExtraData.IntermediateDurations.begin(), |
291 | last: Node->ExtraData.IntermediateDurations.end(), init: TopSum); |
292 | } |
293 | |
294 | // Calculates how many times a function was invoked. |
295 | // TODO: Hook up option to produce stacks |
296 | template <> |
297 | std::size_t |
298 | GetValueForStack<AggregationType::INVOCATION_COUNT>(const StackTrieNode *Node) { |
299 | return Node->ExtraData.TerminalDurations.size() + |
300 | Node->ExtraData.IntermediateDurations.size(); |
301 | } |
302 | |
303 | // Make sure there are implementations for each enum value. |
304 | template <AggregationType T> struct DependentFalseType : std::false_type {}; |
305 | |
306 | template <AggregationType AggType> |
307 | std::size_t GetValueForStack(const StackTrieNode *Node) { |
308 | static_assert(DependentFalseType<AggType>::value, |
309 | "No implementation found for aggregation type provided." ); |
310 | return 0; |
311 | } |
312 | |
313 | class StackTrie { |
314 | // Avoid the magic number of 4 propagated through the code with an alias. |
315 | // We use this SmallVector to track the root nodes in a call graph. |
316 | using RootVector = SmallVector<StackTrieNode *, 4>; |
317 | |
318 | // We maintain pointers to the roots of the tries we see. |
319 | DenseMap<uint32_t, RootVector> Roots; |
320 | |
321 | // We make sure all the nodes are accounted for in this list. |
322 | std::forward_list<StackTrieNode> NodeStore; |
323 | |
324 | // A map of thread ids to pairs call stack trie nodes and their start times. |
325 | DenseMap<uint32_t, SmallVector<std::pair<StackTrieNode *, uint64_t>, 8>> |
326 | ThreadStackMap; |
327 | |
328 | StackTrieNode *createTrieNode(uint32_t ThreadId, int32_t FuncId, |
329 | StackTrieNode *Parent) { |
330 | NodeStore.push_front(val: StackTrieNode{.FuncId: FuncId, .Parent: Parent, .Callees: {}, .ExtraData: {.TerminalDurations: {}, .IntermediateDurations: {}}}); |
331 | auto I = NodeStore.begin(); |
332 | auto *Node = &*I; |
333 | if (!Parent) |
334 | Roots[ThreadId].push_back(Elt: Node); |
335 | return Node; |
336 | } |
337 | |
338 | StackTrieNode *findRootNode(uint32_t ThreadId, int32_t FuncId) { |
339 | const auto &RootsByThread = Roots[ThreadId]; |
340 | auto I = find_if(Range: RootsByThread, |
341 | P: [&](StackTrieNode *N) { return N->FuncId == FuncId; }); |
342 | return (I == RootsByThread.end()) ? nullptr : *I; |
343 | } |
344 | |
345 | public: |
346 | enum class AccountRecordStatus { |
347 | OK, // Successfully processed |
348 | ENTRY_NOT_FOUND, // An exit record had no matching call stack entry |
349 | UNKNOWN_RECORD_TYPE |
350 | }; |
351 | |
352 | struct AccountRecordState { |
353 | // We keep track of whether the call stack is currently unwinding. |
354 | bool wasLastRecordExit; |
355 | |
356 | static AccountRecordState CreateInitialState() { return {.wasLastRecordExit: false}; } |
357 | }; |
358 | |
359 | AccountRecordStatus accountRecord(const XRayRecord &R, |
360 | AccountRecordState *state) { |
361 | auto &TS = ThreadStackMap[R.TId]; |
362 | switch (R.Type) { |
363 | case RecordTypes::CUSTOM_EVENT: |
364 | case RecordTypes::TYPED_EVENT: |
365 | return AccountRecordStatus::OK; |
366 | case RecordTypes::ENTER: |
367 | case RecordTypes::ENTER_ARG: { |
368 | state->wasLastRecordExit = false; |
369 | // When we encounter a new function entry, we want to record the TSC for |
370 | // that entry, and the function id. Before doing so we check the top of |
371 | // the stack to see if there are callees that already represent this |
372 | // function. |
373 | if (TS.empty()) { |
374 | auto *Root = findRootNode(ThreadId: R.TId, FuncId: R.FuncId); |
375 | TS.emplace_back(Args: Root ? Root : createTrieNode(ThreadId: R.TId, FuncId: R.FuncId, Parent: nullptr), |
376 | Args: R.TSC); |
377 | return AccountRecordStatus::OK; |
378 | } |
379 | |
380 | auto &Top = TS.back(); |
381 | auto I = find_if(Range&: Top.first->Callees, |
382 | P: [&](StackTrieNode *N) { return N->FuncId == R.FuncId; }); |
383 | if (I == Top.first->Callees.end()) { |
384 | // We didn't find the callee in the stack trie, so we're going to |
385 | // add to the stack then set up the pointers properly. |
386 | auto N = createTrieNode(ThreadId: R.TId, FuncId: R.FuncId, Parent: Top.first); |
387 | Top.first->Callees.emplace_back(Args&: N); |
388 | |
389 | // Top may be invalidated after this statement. |
390 | TS.emplace_back(Args&: N, Args: R.TSC); |
391 | } else { |
392 | // We found the callee in the stack trie, so we'll use that pointer |
393 | // instead, add it to the stack associated with the TSC. |
394 | TS.emplace_back(Args&: *I, Args: R.TSC); |
395 | } |
396 | return AccountRecordStatus::OK; |
397 | } |
398 | case RecordTypes::EXIT: |
399 | case RecordTypes::TAIL_EXIT: { |
400 | bool wasLastRecordExit = state->wasLastRecordExit; |
401 | state->wasLastRecordExit = true; |
402 | // The exit case is more interesting, since we want to be able to deduce |
403 | // missing exit records. To do that properly, we need to look up the stack |
404 | // and see whether the exit record matches any of the entry records. If it |
405 | // does match, we attempt to record the durations as we pop the stack to |
406 | // where we see the parent. |
407 | if (TS.empty()) { |
408 | // Short circuit, and say we can't find it. |
409 | |
410 | return AccountRecordStatus::ENTRY_NOT_FOUND; |
411 | } |
412 | |
413 | auto FunctionEntryMatch = find_if( |
414 | Range: reverse(C&: TS), P: [&](const std::pair<StackTrieNode *, uint64_t> &E) { |
415 | return E.first->FuncId == R.FuncId; |
416 | }); |
417 | auto status = AccountRecordStatus::OK; |
418 | if (FunctionEntryMatch == TS.rend()) { |
419 | status = AccountRecordStatus::ENTRY_NOT_FOUND; |
420 | } else { |
421 | // Account for offset of 1 between reverse and forward iterators. We |
422 | // want the forward iterator to include the function that is exited. |
423 | ++FunctionEntryMatch; |
424 | } |
425 | auto I = FunctionEntryMatch.base(); |
426 | for (auto &E : make_range(x: I, y: TS.end() - 1)) |
427 | E.first->ExtraData.IntermediateDurations.push_back( |
428 | Elt: std::max(a: E.second, b: R.TSC) - std::min(a: E.second, b: R.TSC)); |
429 | auto &Deepest = TS.back(); |
430 | if (wasLastRecordExit) |
431 | Deepest.first->ExtraData.IntermediateDurations.push_back( |
432 | Elt: std::max(a: Deepest.second, b: R.TSC) - std::min(a: Deepest.second, b: R.TSC)); |
433 | else |
434 | Deepest.first->ExtraData.TerminalDurations.push_back( |
435 | Elt: std::max(a: Deepest.second, b: R.TSC) - std::min(a: Deepest.second, b: R.TSC)); |
436 | TS.erase(CS: I, CE: TS.end()); |
437 | return status; |
438 | } |
439 | } |
440 | return AccountRecordStatus::UNKNOWN_RECORD_TYPE; |
441 | } |
442 | |
443 | bool isEmpty() const { return Roots.empty(); } |
444 | |
445 | void printStack(raw_ostream &OS, const StackTrieNode *Top, |
446 | FuncIdConversionHelper &FN) { |
447 | // Traverse the pointers up to the parent, noting the sums, then print |
448 | // in reverse order (callers at top, callees down bottom). |
449 | SmallVector<const StackTrieNode *, 8> CurrentStack; |
450 | for (auto *F = Top; F != nullptr; F = F->Parent) |
451 | CurrentStack.push_back(Elt: F); |
452 | int Level = 0; |
453 | OS << formatv(Fmt: "{0,-5} {1,-60} {2,+12} {3,+16}\n" , Vals: "lvl" , Vals: "function" , |
454 | Vals: "count" , Vals: "sum" ); |
455 | for (auto *F : reverse(C: drop_begin(RangeOrContainer&: CurrentStack))) { |
456 | auto Sum = std::accumulate(first: F->ExtraData.IntermediateDurations.begin(), |
457 | last: F->ExtraData.IntermediateDurations.end(), init: 0LL); |
458 | auto FuncId = FN.SymbolOrNumber(FuncId: F->FuncId); |
459 | OS << formatv(Fmt: "#{0,-4} {1,-60} {2,+12} {3,+16}\n" , Vals: Level++, |
460 | Vals: FuncId.size() > 60 ? FuncId.substr(pos: 0, n: 57) + "..." : FuncId, |
461 | Vals: F->ExtraData.IntermediateDurations.size(), Vals&: Sum); |
462 | } |
463 | auto *Leaf = *CurrentStack.begin(); |
464 | auto LeafSum = |
465 | std::accumulate(first: Leaf->ExtraData.TerminalDurations.begin(), |
466 | last: Leaf->ExtraData.TerminalDurations.end(), init: 0LL); |
467 | auto LeafFuncId = FN.SymbolOrNumber(FuncId: Leaf->FuncId); |
468 | OS << formatv(Fmt: "#{0,-4} {1,-60} {2,+12} {3,+16}\n" , Vals: Level++, |
469 | Vals: LeafFuncId.size() > 60 ? LeafFuncId.substr(pos: 0, n: 57) + "..." |
470 | : LeafFuncId, |
471 | Vals: Leaf->ExtraData.TerminalDurations.size(), Vals&: LeafSum); |
472 | OS << "\n" ; |
473 | } |
474 | |
475 | /// Prints top stacks for each thread. |
476 | void printPerThread(raw_ostream &OS, FuncIdConversionHelper &FN) { |
477 | for (const auto &iter : Roots) { |
478 | OS << "Thread " << iter.first << ":\n" ; |
479 | print(OS, FN, RootValues: iter.second); |
480 | OS << "\n" ; |
481 | } |
482 | } |
483 | |
484 | /// Prints timing sums for each stack in each threads. |
485 | template <AggregationType AggType> |
486 | void printAllPerThread(raw_ostream &OS, FuncIdConversionHelper &FN, |
487 | StackOutputFormat format) { |
488 | for (const auto &iter : Roots) |
489 | printAll<AggType>(OS, FN, iter.second, iter.first, true); |
490 | } |
491 | |
492 | /// Prints top stacks from looking at all the leaves and ignoring thread IDs. |
493 | /// Stacks that consist of the same function IDs but were called in different |
494 | /// thread IDs are not considered unique in this printout. |
495 | void printIgnoringThreads(raw_ostream &OS, FuncIdConversionHelper &FN) { |
496 | RootVector RootValues; |
497 | |
498 | for (const auto &RootNodeRange : make_second_range(c&: Roots)) |
499 | llvm::append_range(C&: RootValues, R: RootNodeRange); |
500 | |
501 | print(OS, FN, RootValues); |
502 | } |
503 | |
504 | /// Creates a merged list of Tries for unique stacks that disregards their |
505 | /// thread IDs. |
506 | RootVector mergeAcrossThreads(std::forward_list<StackTrieNode> &NodeStore) { |
507 | RootVector MergedByThreadRoots; |
508 | for (const auto &MapIter : Roots) { |
509 | const auto &RootNodeVector = MapIter.second; |
510 | for (auto *Node : RootNodeVector) { |
511 | auto MaybeFoundIter = |
512 | find_if(Range&: MergedByThreadRoots, P: [Node](StackTrieNode *elem) { |
513 | return Node->FuncId == elem->FuncId; |
514 | }); |
515 | if (MaybeFoundIter == MergedByThreadRoots.end()) { |
516 | MergedByThreadRoots.push_back(Elt: Node); |
517 | } else { |
518 | MergedByThreadRoots.push_back(Elt: mergeTrieNodes( |
519 | Left: **MaybeFoundIter, Right: *Node, NewParent: nullptr, NodeStore, MergeCallable&: mergeStackDuration)); |
520 | MergedByThreadRoots.erase(CI: MaybeFoundIter); |
521 | } |
522 | } |
523 | } |
524 | return MergedByThreadRoots; |
525 | } |
526 | |
527 | /// Print timing sums for all stacks merged by Thread ID. |
528 | template <AggregationType AggType> |
529 | void printAllAggregatingThreads(raw_ostream &OS, FuncIdConversionHelper &FN, |
530 | StackOutputFormat format) { |
531 | std::forward_list<StackTrieNode> AggregatedNodeStore; |
532 | RootVector MergedByThreadRoots = mergeAcrossThreads(NodeStore&: AggregatedNodeStore); |
533 | bool reportThreadId = false; |
534 | printAll<AggType>(OS, FN, MergedByThreadRoots, |
535 | /*threadId*/ 0, reportThreadId); |
536 | } |
537 | |
538 | /// Merges the trie by thread id before printing top stacks. |
539 | void printAggregatingThreads(raw_ostream &OS, FuncIdConversionHelper &FN) { |
540 | std::forward_list<StackTrieNode> AggregatedNodeStore; |
541 | RootVector MergedByThreadRoots = mergeAcrossThreads(NodeStore&: AggregatedNodeStore); |
542 | print(OS, FN, RootValues: MergedByThreadRoots); |
543 | } |
544 | |
545 | // TODO: Add a format option when more than one are supported. |
546 | template <AggregationType AggType> |
547 | void printAll(raw_ostream &OS, FuncIdConversionHelper &FN, |
548 | RootVector RootValues, uint32_t ThreadId, bool ReportThread) { |
549 | SmallVector<const StackTrieNode *, 16> S; |
550 | for (const auto *N : RootValues) { |
551 | S.clear(); |
552 | S.push_back(Elt: N); |
553 | while (!S.empty()) { |
554 | auto *Top = S.pop_back_val(); |
555 | printSingleStack<AggType>(OS, FN, ReportThread, ThreadId, Top); |
556 | llvm::append_range(C&: S, R: Top->Callees); |
557 | } |
558 | } |
559 | } |
560 | |
561 | /// Prints values for stacks in a format consumable for the flamegraph.pl |
562 | /// tool. This is a line based format that lists each level in the stack |
563 | /// hierarchy in a semicolon delimited form followed by a space and a numeric |
564 | /// value. If breaking down by thread, the thread ID will be added as the |
565 | /// root level of the stack. |
566 | template <AggregationType AggType> |
567 | void printSingleStack(raw_ostream &OS, FuncIdConversionHelper &Converter, |
568 | bool ReportThread, uint32_t ThreadId, |
569 | const StackTrieNode *Node) { |
570 | if (ReportThread) |
571 | OS << "thread_" << ThreadId << ";" ; |
572 | SmallVector<const StackTrieNode *, 5> lineage{}; |
573 | lineage.push_back(Elt: Node); |
574 | while (lineage.back()->Parent != nullptr) |
575 | lineage.push_back(Elt: lineage.back()->Parent); |
576 | while (!lineage.empty()) { |
577 | OS << Converter.SymbolOrNumber(FuncId: lineage.back()->FuncId) << ";" ; |
578 | lineage.pop_back(); |
579 | } |
580 | OS << " " << GetValueForStack<AggType>(Node) << "\n" ; |
581 | } |
582 | |
583 | void print(raw_ostream &OS, FuncIdConversionHelper &FN, |
584 | RootVector RootValues) { |
585 | // Go through each of the roots, and traverse the call stack, producing the |
586 | // aggregates as you go along. Remember these aggregates and stacks, and |
587 | // show summary statistics about: |
588 | // |
589 | // - Total number of unique stacks |
590 | // - Top 10 stacks by count |
591 | // - Top 10 stacks by aggregate duration |
592 | SmallVector<std::pair<const StackTrieNode *, uint64_t>, 11> |
593 | TopStacksByCount; |
594 | SmallVector<std::pair<const StackTrieNode *, uint64_t>, 11> TopStacksBySum; |
595 | auto greater_second = |
596 | [](const std::pair<const StackTrieNode *, uint64_t> &A, |
597 | const std::pair<const StackTrieNode *, uint64_t> &B) { |
598 | return A.second > B.second; |
599 | }; |
600 | uint64_t UniqueStacks = 0; |
601 | for (const auto *N : RootValues) { |
602 | SmallVector<const StackTrieNode *, 16> S; |
603 | S.emplace_back(Args&: N); |
604 | |
605 | while (!S.empty()) { |
606 | auto *Top = S.pop_back_val(); |
607 | |
608 | // We only start printing the stack (by walking up the parent pointers) |
609 | // when we get to a leaf function. |
610 | if (!Top->ExtraData.TerminalDurations.empty()) { |
611 | ++UniqueStacks; |
612 | auto TopSum = |
613 | std::accumulate(first: Top->ExtraData.TerminalDurations.begin(), |
614 | last: Top->ExtraData.TerminalDurations.end(), init: 0uLL); |
615 | { |
616 | auto E = std::make_pair(x&: Top, y&: TopSum); |
617 | TopStacksBySum.insert( |
618 | I: llvm::lower_bound(Range&: TopStacksBySum, Value&: E, C: greater_second), Elt: E); |
619 | if (TopStacksBySum.size() == 11) |
620 | TopStacksBySum.pop_back(); |
621 | } |
622 | { |
623 | auto E = |
624 | std::make_pair(x&: Top, y: Top->ExtraData.TerminalDurations.size()); |
625 | TopStacksByCount.insert( |
626 | I: llvm::lower_bound(Range&: TopStacksByCount, Value&: E, C: greater_second), Elt: E); |
627 | if (TopStacksByCount.size() == 11) |
628 | TopStacksByCount.pop_back(); |
629 | } |
630 | } |
631 | llvm::append_range(C&: S, R: Top->Callees); |
632 | } |
633 | } |
634 | |
635 | // Now print the statistics in the end. |
636 | OS << "\n" ; |
637 | OS << "Unique Stacks: " << UniqueStacks << "\n" ; |
638 | OS << "Top 10 Stacks by leaf sum:\n\n" ; |
639 | for (const auto &P : TopStacksBySum) { |
640 | OS << "Sum: " << P.second << "\n" ; |
641 | printStack(OS, Top: P.first, FN); |
642 | } |
643 | OS << "\n" ; |
644 | OS << "Top 10 Stacks by leaf count:\n\n" ; |
645 | for (const auto &P : TopStacksByCount) { |
646 | OS << "Count: " << P.second << "\n" ; |
647 | printStack(OS, Top: P.first, FN); |
648 | } |
649 | OS << "\n" ; |
650 | } |
651 | }; |
652 | |
653 | static std::string CreateErrorMessage(StackTrie::AccountRecordStatus Error, |
654 | const XRayRecord &Record, |
655 | const FuncIdConversionHelper &Converter) { |
656 | switch (Error) { |
657 | case StackTrie::AccountRecordStatus::ENTRY_NOT_FOUND: |
658 | return std::string( |
659 | formatv(Fmt: "Found record {0} with no matching function entry\n" , |
660 | Vals: format_xray_record(Record, Converter))); |
661 | default: |
662 | return std::string(formatv(Fmt: "Unknown error type for record {0}\n" , |
663 | Vals: format_xray_record(Record, Converter))); |
664 | } |
665 | } |
666 | |
667 | static CommandRegistration Unused(&Stack, []() -> Error { |
668 | // Load each file provided as a command-line argument. For each one of them |
669 | // account to a single StackTrie, and just print the whole trie for now. |
670 | StackTrie ST; |
671 | InstrumentationMap Map; |
672 | if (!StacksInstrMap.empty()) { |
673 | auto InstrumentationMapOrError = loadInstrumentationMap(Filename: StacksInstrMap); |
674 | if (!InstrumentationMapOrError) |
675 | return joinErrors( |
676 | E1: make_error<StringError>( |
677 | Args: Twine("Cannot open instrumentation map: " ) + StacksInstrMap, |
678 | Args: std::make_error_code(e: std::errc::invalid_argument)), |
679 | E2: InstrumentationMapOrError.takeError()); |
680 | Map = std::move(*InstrumentationMapOrError); |
681 | } |
682 | |
683 | if (SeparateThreadStacks && AggregateThreads) |
684 | return make_error<StringError>( |
685 | Args: Twine("Can't specify options for per thread reporting and reporting " |
686 | "that aggregates threads." ), |
687 | Args: std::make_error_code(e: std::errc::invalid_argument)); |
688 | |
689 | if (!DumpAllStacks && StacksOutputFormat != HUMAN) |
690 | return make_error<StringError>( |
691 | Args: Twine("Can't specify a non-human format without -all-stacks." ), |
692 | Args: std::make_error_code(e: std::errc::invalid_argument)); |
693 | |
694 | if (DumpAllStacks && StacksOutputFormat == HUMAN) |
695 | return make_error<StringError>( |
696 | Args: Twine("You must specify a non-human format when reporting with " |
697 | "-all-stacks." ), |
698 | Args: std::make_error_code(e: std::errc::invalid_argument)); |
699 | |
700 | symbolize::LLVMSymbolizer Symbolizer; |
701 | FuncIdConversionHelper FuncIdHelper(StacksInstrMap, Symbolizer, |
702 | Map.getFunctionAddresses()); |
703 | // TODO: Someday, support output to files instead of just directly to |
704 | // standard output. |
705 | for (const auto &Filename : StackInputs) { |
706 | auto TraceOrErr = loadTraceFile(Filename); |
707 | if (!TraceOrErr) { |
708 | if (!StackKeepGoing) |
709 | return joinErrors( |
710 | E1: make_error<StringError>( |
711 | Args: Twine("Failed loading input file '" ) + Filename + "'" , |
712 | Args: std::make_error_code(e: std::errc::invalid_argument)), |
713 | E2: TraceOrErr.takeError()); |
714 | logAllUnhandledErrors(E: TraceOrErr.takeError(), OS&: errs()); |
715 | continue; |
716 | } |
717 | auto &T = *TraceOrErr; |
718 | StackTrie::AccountRecordState AccountRecordState = |
719 | StackTrie::AccountRecordState::CreateInitialState(); |
720 | for (const auto &Record : T) { |
721 | auto error = ST.accountRecord(R: Record, state: &AccountRecordState); |
722 | if (error != StackTrie::AccountRecordStatus::OK) { |
723 | if (!StackKeepGoing) |
724 | return make_error<StringError>( |
725 | Args: CreateErrorMessage(Error: error, Record, Converter: FuncIdHelper), |
726 | Args: make_error_code(E: errc::illegal_byte_sequence)); |
727 | errs() << CreateErrorMessage(Error: error, Record, Converter: FuncIdHelper); |
728 | } |
729 | } |
730 | } |
731 | if (ST.isEmpty()) { |
732 | return make_error<StringError>( |
733 | Args: "No instrumented calls were accounted in the input file." , |
734 | Args: make_error_code(E: errc::result_out_of_range)); |
735 | } |
736 | |
737 | // Report the stacks in a long form mode for another tool to analyze. |
738 | if (DumpAllStacks) { |
739 | if (AggregateThreads) { |
740 | switch (RequestedAggregation) { |
741 | case AggregationType::TOTAL_TIME: |
742 | ST.printAllAggregatingThreads<AggregationType::TOTAL_TIME>( |
743 | OS&: outs(), FN&: FuncIdHelper, format: StacksOutputFormat); |
744 | break; |
745 | case AggregationType::INVOCATION_COUNT: |
746 | ST.printAllAggregatingThreads<AggregationType::INVOCATION_COUNT>( |
747 | OS&: outs(), FN&: FuncIdHelper, format: StacksOutputFormat); |
748 | break; |
749 | } |
750 | } else { |
751 | switch (RequestedAggregation) { |
752 | case AggregationType::TOTAL_TIME: |
753 | ST.printAllPerThread<AggregationType::TOTAL_TIME>(OS&: outs(), FN&: FuncIdHelper, |
754 | format: StacksOutputFormat); |
755 | break; |
756 | case AggregationType::INVOCATION_COUNT: |
757 | ST.printAllPerThread<AggregationType::INVOCATION_COUNT>( |
758 | OS&: outs(), FN&: FuncIdHelper, format: StacksOutputFormat); |
759 | break; |
760 | } |
761 | } |
762 | return Error::success(); |
763 | } |
764 | |
765 | // We're only outputting top stacks. |
766 | if (AggregateThreads) { |
767 | ST.printAggregatingThreads(OS&: outs(), FN&: FuncIdHelper); |
768 | } else if (SeparateThreadStacks) { |
769 | ST.printPerThread(OS&: outs(), FN&: FuncIdHelper); |
770 | } else { |
771 | ST.printIgnoringThreads(OS&: outs(), FN&: FuncIdHelper); |
772 | } |
773 | return Error::success(); |
774 | }); |
775 | |