1 | //===- MemProfRadixTree.cpp - Radix tree encoded callstacks ---------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // This file contains logic that implements a space efficient radix tree |
9 | // encoding for callstacks used by MemProf. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "llvm/ProfileData/MemProfRadixTree.h" |
14 | |
15 | namespace llvm { |
16 | namespace memprof { |
17 | // Encode a call stack into RadixArray. Return the starting index within |
18 | // RadixArray. For each call stack we encode, we emit two or three components |
19 | // into RadixArray. If a given call stack doesn't have a common prefix relative |
20 | // to the previous one, we emit: |
21 | // |
22 | // - the frames in the given call stack in the root-to-leaf order |
23 | // |
24 | // - the length of the given call stack |
25 | // |
26 | // If a given call stack has a non-empty common prefix relative to the previous |
27 | // one, we emit: |
28 | // |
29 | // - the relative location of the common prefix, encoded as a negative number. |
30 | // |
31 | // - a portion of the given call stack that's beyond the common prefix |
32 | // |
33 | // - the length of the given call stack, including the length of the common |
34 | // prefix. |
35 | // |
36 | // The resulting RadixArray requires a somewhat unintuitive backward traversal |
37 | // to reconstruct a call stack -- read the call stack length and scan backward |
38 | // while collecting frames in the leaf to root order. build, the caller of this |
39 | // function, reverses RadixArray in place so that we can reconstruct a call |
40 | // stack as if we were deserializing an array in a typical way -- the call stack |
41 | // length followed by the frames in the leaf-to-root order except that we need |
42 | // to handle pointers to parents along the way. |
43 | // |
44 | // To quickly determine the location of the common prefix within RadixArray, |
45 | // Indexes caches the indexes of the previous call stack's frames within |
46 | // RadixArray. |
47 | template <typename FrameIdTy> |
48 | LinearCallStackId CallStackRadixTreeBuilder<FrameIdTy>::encodeCallStack( |
49 | const llvm::SmallVector<FrameIdTy> *CallStack, |
50 | const llvm::SmallVector<FrameIdTy> *Prev, |
51 | const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes) { |
52 | // Compute the length of the common root prefix between Prev and CallStack. |
53 | uint32_t CommonLen = 0; |
54 | if (Prev) { |
55 | auto Pos = std::mismatch(Prev->rbegin(), Prev->rend(), CallStack->rbegin(), |
56 | CallStack->rend()); |
57 | CommonLen = std::distance(CallStack->rbegin(), Pos.second); |
58 | } |
59 | |
60 | // Drop the portion beyond CommonLen. |
61 | assert(CommonLen <= Indexes.size()); |
62 | Indexes.resize(new_size: CommonLen); |
63 | |
64 | // Append a pointer to the parent. |
65 | if (CommonLen) { |
66 | uint32_t CurrentIndex = RadixArray.size(); |
67 | uint32_t ParentIndex = Indexes.back(); |
68 | // The offset to the parent must be negative because we are pointing to an |
69 | // element we've already added to RadixArray. |
70 | assert(ParentIndex < CurrentIndex); |
71 | RadixArray.push_back(x: ParentIndex - CurrentIndex); |
72 | } |
73 | |
74 | // Copy the part of the call stack beyond the common prefix to RadixArray. |
75 | assert(CommonLen <= CallStack->size()); |
76 | for (FrameIdTy F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) { |
77 | // Remember the index of F in RadixArray. |
78 | Indexes.push_back(x: RadixArray.size()); |
79 | RadixArray.push_back( |
80 | MemProfFrameIndexes ? MemProfFrameIndexes->find(F)->second : F); |
81 | } |
82 | assert(CallStack->size() == Indexes.size()); |
83 | |
84 | // End with the call stack length. |
85 | RadixArray.push_back(CallStack->size()); |
86 | |
87 | // Return the index within RadixArray where we can start reconstructing a |
88 | // given call stack from. |
89 | return RadixArray.size() - 1; |
90 | } |
91 | |
92 | template <typename FrameIdTy> |
93 | void CallStackRadixTreeBuilder<FrameIdTy>::build( |
94 | llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>> |
95 | &&MemProfCallStackData, |
96 | const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes, |
97 | llvm::DenseMap<FrameIdTy, FrameStat> &FrameHistogram) { |
98 | // Take the vector portion of MemProfCallStackData. The vector is exactly |
99 | // what we need to sort. Also, we no longer need its lookup capability. |
100 | llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector(); |
101 | |
102 | // Return early if we have no work to do. |
103 | if (CallStacks.empty()) { |
104 | RadixArray.clear(); |
105 | CallStackPos.clear(); |
106 | return; |
107 | } |
108 | |
109 | // Sorting the list of call stacks in the dictionary order is sufficient to |
110 | // maximize the length of the common prefix between two adjacent call stacks |
111 | // and thus minimize the length of RadixArray. However, we go one step |
112 | // further and try to reduce the number of times we follow pointers to parents |
113 | // during deserilization. Consider a poorly encoded radix tree: |
114 | // |
115 | // CallStackId 1: f1 -> f2 -> f3 |
116 | // | |
117 | // CallStackId 2: +--- f4 -> f5 |
118 | // | |
119 | // CallStackId 3: +--> f6 |
120 | // |
121 | // Here, f2 and f4 appear once and twice, respectively, in the call stacks. |
122 | // Once we encode CallStackId 1 into RadixArray, every other call stack with |
123 | // common prefix f1 ends up pointing to CallStackId 1. Since CallStackId 3 |
124 | // share "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers to |
125 | // parents twice. |
126 | // |
127 | // We try to alleviate the situation by sorting the list of call stacks by |
128 | // comparing the popularity of frames rather than the integer values of |
129 | // FrameIds. In the example above, f4 is more popular than f2, so we sort the |
130 | // call stacks and encode them as: |
131 | // |
132 | // CallStackId 2: f1 -- f4 -> f5 |
133 | // | | |
134 | // CallStackId 3: | +--> f6 |
135 | // | |
136 | // CallStackId 1: +--> f2 -> f3 |
137 | // |
138 | // Notice that CallStackId 3 follows a pointer to a parent only once. |
139 | // |
140 | // All this is a quick-n-dirty trick to reduce the number of jumps. The |
141 | // proper way would be to compute the weight of each radix tree node -- how |
142 | // many call stacks use a given radix tree node, and encode a radix tree from |
143 | // the heaviest node first. We do not do so because that's a lot of work. |
144 | llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) { |
145 | // Call stacks are stored from leaf to root. Perform comparisons from the |
146 | // root. |
147 | return std::lexicographical_compare( |
148 | L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(), |
149 | [&](FrameIdTy F1, FrameIdTy F2) { |
150 | uint64_t H1 = FrameHistogram[F1].Count; |
151 | uint64_t H2 = FrameHistogram[F2].Count; |
152 | // Popular frames should come later because we encode call stacks from |
153 | // the last one in the list. |
154 | if (H1 != H2) |
155 | return H1 < H2; |
156 | // For sort stability. |
157 | return F1 < F2; |
158 | }); |
159 | }); |
160 | |
161 | // Reserve some reasonable amount of storage. |
162 | RadixArray.clear(); |
163 | RadixArray.reserve(n: CallStacks.size() * 8); |
164 | |
165 | // Indexes will grow as long as the longest call stack. |
166 | Indexes.clear(); |
167 | Indexes.reserve(n: 512); |
168 | |
169 | // CallStackPos will grow to exactly CallStacks.size() entries. |
170 | CallStackPos.clear(); |
171 | CallStackPos.reserve(NumEntries: CallStacks.size()); |
172 | |
173 | // Compute the radix array. We encode one call stack at a time, computing the |
174 | // longest prefix that's shared with the previous call stack we encode. For |
175 | // each call stack we encode, we remember a mapping from CallStackId to its |
176 | // position within RadixArray. |
177 | // |
178 | // As an optimization, we encode from the last call stack in CallStacks to |
179 | // reduce the number of times we follow pointers to the parents. Consider the |
180 | // list of call stacks that has been sorted in the dictionary order: |
181 | // |
182 | // Call Stack 1: F1 |
183 | // Call Stack 2: F1 -> F2 |
184 | // Call Stack 3: F1 -> F2 -> F3 |
185 | // |
186 | // If we traversed CallStacks in the forward order, we would end up with a |
187 | // radix tree like: |
188 | // |
189 | // Call Stack 1: F1 |
190 | // | |
191 | // Call Stack 2: +---> F2 |
192 | // | |
193 | // Call Stack 3: +---> F3 |
194 | // |
195 | // Notice that each call stack jumps to the previous one. However, if we |
196 | // traverse CallStacks in the reverse order, then Call Stack 3 has the |
197 | // complete call stack encoded without any pointers. Call Stack 1 and 2 point |
198 | // to appropriate prefixes of Call Stack 3. |
199 | const llvm::SmallVector<FrameIdTy> *Prev = nullptr; |
200 | for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) { |
201 | LinearCallStackId Pos = |
202 | encodeCallStack(CallStack: &CallStack, Prev, MemProfFrameIndexes); |
203 | CallStackPos.insert({CSId, Pos}); |
204 | Prev = &CallStack; |
205 | } |
206 | |
207 | // "RadixArray.size() - 1" below is problematic if RadixArray is empty. |
208 | assert(!RadixArray.empty()); |
209 | |
210 | // Reverse the radix array in place. We do so mostly for intuitive |
211 | // deserialization where we would read the length field and then the call |
212 | // stack frames proper just like any other array deserialization, except |
213 | // that we have occasional jumps to take advantage of prefixes. |
214 | for (size_t I = 0, J = RadixArray.size() - 1; I < J; ++I, --J) |
215 | std::swap(a&: RadixArray[I], b&: RadixArray[J]); |
216 | |
217 | // "Reverse" the indexes stored in CallStackPos. |
218 | for (auto &[K, V] : CallStackPos) |
219 | V = RadixArray.size() - 1 - V; |
220 | } |
221 | |
222 | // Explicitly instantiate class with the utilized FrameIdTy. |
223 | template class LLVM_EXPORT_TEMPLATE CallStackRadixTreeBuilder<FrameId>; |
224 | template class LLVM_EXPORT_TEMPLATE CallStackRadixTreeBuilder<LinearFrameId>; |
225 | |
226 | template <typename FrameIdTy> |
227 | llvm::DenseMap<FrameIdTy, FrameStat> |
228 | computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>> |
229 | &MemProfCallStackData) { |
230 | llvm::DenseMap<FrameIdTy, FrameStat> Histogram; |
231 | |
232 | for (const auto &KV : MemProfCallStackData) { |
233 | const auto &CS = KV.second; |
234 | for (unsigned I = 0, E = CS.size(); I != E; ++I) { |
235 | auto &S = Histogram[CS[I]]; |
236 | ++S.Count; |
237 | S.PositionSum += I; |
238 | } |
239 | } |
240 | return Histogram; |
241 | } |
242 | |
243 | // Explicitly instantiate function with the utilized FrameIdTy. |
244 | template LLVM_ABI llvm::DenseMap<FrameId, FrameStat> |
245 | computeFrameHistogram<FrameId>( |
246 | llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> |
247 | &MemProfCallStackData); |
248 | template LLVM_ABI llvm::DenseMap<LinearFrameId, FrameStat> |
249 | computeFrameHistogram<LinearFrameId>( |
250 | llvm::MapVector<CallStackId, llvm::SmallVector<LinearFrameId>> |
251 | &MemProfCallStackData); |
252 | } // namespace memprof |
253 | } // namespace llvm |
254 | |