1//===- MemProfRadixTree.cpp - Radix tree encoded callstacks ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8// This file contains logic that implements a space efficient radix tree
9// encoding for callstacks used by MemProf.
10//
11//===----------------------------------------------------------------------===//
12
13#include "llvm/ProfileData/MemProfRadixTree.h"
14
15namespace llvm {
16namespace memprof {
17// Encode a call stack into RadixArray. Return the starting index within
18// RadixArray. For each call stack we encode, we emit two or three components
19// into RadixArray. If a given call stack doesn't have a common prefix relative
20// to the previous one, we emit:
21//
22// - the frames in the given call stack in the root-to-leaf order
23//
24// - the length of the given call stack
25//
26// If a given call stack has a non-empty common prefix relative to the previous
27// one, we emit:
28//
29// - the relative location of the common prefix, encoded as a negative number.
30//
31// - a portion of the given call stack that's beyond the common prefix
32//
33// - the length of the given call stack, including the length of the common
34// prefix.
35//
36// The resulting RadixArray requires a somewhat unintuitive backward traversal
37// to reconstruct a call stack -- read the call stack length and scan backward
38// while collecting frames in the leaf to root order. build, the caller of this
39// function, reverses RadixArray in place so that we can reconstruct a call
40// stack as if we were deserializing an array in a typical way -- the call stack
41// length followed by the frames in the leaf-to-root order except that we need
42// to handle pointers to parents along the way.
43//
44// To quickly determine the location of the common prefix within RadixArray,
45// Indexes caches the indexes of the previous call stack's frames within
46// RadixArray.
47template <typename FrameIdTy>
48LinearCallStackId CallStackRadixTreeBuilder<FrameIdTy>::encodeCallStack(
49 const llvm::SmallVector<FrameIdTy> *CallStack,
50 const llvm::SmallVector<FrameIdTy> *Prev,
51 const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes) {
52 // Compute the length of the common root prefix between Prev and CallStack.
53 uint32_t CommonLen = 0;
54 if (Prev) {
55 auto Pos = std::mismatch(Prev->rbegin(), Prev->rend(), CallStack->rbegin(),
56 CallStack->rend());
57 CommonLen = std::distance(CallStack->rbegin(), Pos.second);
58 }
59
60 // Drop the portion beyond CommonLen.
61 assert(CommonLen <= Indexes.size());
62 Indexes.resize(new_size: CommonLen);
63
64 // Append a pointer to the parent.
65 if (CommonLen) {
66 uint32_t CurrentIndex = RadixArray.size();
67 uint32_t ParentIndex = Indexes.back();
68 // The offset to the parent must be negative because we are pointing to an
69 // element we've already added to RadixArray.
70 assert(ParentIndex < CurrentIndex);
71 RadixArray.push_back(x: ParentIndex - CurrentIndex);
72 }
73
74 // Copy the part of the call stack beyond the common prefix to RadixArray.
75 assert(CommonLen <= CallStack->size());
76 for (FrameIdTy F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) {
77 // Remember the index of F in RadixArray.
78 Indexes.push_back(x: RadixArray.size());
79 RadixArray.push_back(
80 MemProfFrameIndexes ? MemProfFrameIndexes->find(F)->second : F);
81 }
82 assert(CallStack->size() == Indexes.size());
83
84 // End with the call stack length.
85 RadixArray.push_back(CallStack->size());
86
87 // Return the index within RadixArray where we can start reconstructing a
88 // given call stack from.
89 return RadixArray.size() - 1;
90}
91
92template <typename FrameIdTy>
93void CallStackRadixTreeBuilder<FrameIdTy>::build(
94 llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
95 &&MemProfCallStackData,
96 const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes,
97 llvm::DenseMap<FrameIdTy, FrameStat> &FrameHistogram) {
98 // Take the vector portion of MemProfCallStackData. The vector is exactly
99 // what we need to sort. Also, we no longer need its lookup capability.
100 llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector();
101
102 // Return early if we have no work to do.
103 if (CallStacks.empty()) {
104 RadixArray.clear();
105 CallStackPos.clear();
106 return;
107 }
108
109 // Sorting the list of call stacks in the dictionary order is sufficient to
110 // maximize the length of the common prefix between two adjacent call stacks
111 // and thus minimize the length of RadixArray. However, we go one step
112 // further and try to reduce the number of times we follow pointers to parents
113 // during deserilization. Consider a poorly encoded radix tree:
114 //
115 // CallStackId 1: f1 -> f2 -> f3
116 // |
117 // CallStackId 2: +--- f4 -> f5
118 // |
119 // CallStackId 3: +--> f6
120 //
121 // Here, f2 and f4 appear once and twice, respectively, in the call stacks.
122 // Once we encode CallStackId 1 into RadixArray, every other call stack with
123 // common prefix f1 ends up pointing to CallStackId 1. Since CallStackId 3
124 // share "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers to
125 // parents twice.
126 //
127 // We try to alleviate the situation by sorting the list of call stacks by
128 // comparing the popularity of frames rather than the integer values of
129 // FrameIds. In the example above, f4 is more popular than f2, so we sort the
130 // call stacks and encode them as:
131 //
132 // CallStackId 2: f1 -- f4 -> f5
133 // | |
134 // CallStackId 3: | +--> f6
135 // |
136 // CallStackId 1: +--> f2 -> f3
137 //
138 // Notice that CallStackId 3 follows a pointer to a parent only once.
139 //
140 // All this is a quick-n-dirty trick to reduce the number of jumps. The
141 // proper way would be to compute the weight of each radix tree node -- how
142 // many call stacks use a given radix tree node, and encode a radix tree from
143 // the heaviest node first. We do not do so because that's a lot of work.
144 llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) {
145 // Call stacks are stored from leaf to root. Perform comparisons from the
146 // root.
147 return std::lexicographical_compare(
148 L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(),
149 [&](FrameIdTy F1, FrameIdTy F2) {
150 uint64_t H1 = FrameHistogram[F1].Count;
151 uint64_t H2 = FrameHistogram[F2].Count;
152 // Popular frames should come later because we encode call stacks from
153 // the last one in the list.
154 if (H1 != H2)
155 return H1 < H2;
156 // For sort stability.
157 return F1 < F2;
158 });
159 });
160
161 // Reserve some reasonable amount of storage.
162 RadixArray.clear();
163 RadixArray.reserve(n: CallStacks.size() * 8);
164
165 // Indexes will grow as long as the longest call stack.
166 Indexes.clear();
167 Indexes.reserve(n: 512);
168
169 // CallStackPos will grow to exactly CallStacks.size() entries.
170 CallStackPos.clear();
171 CallStackPos.reserve(NumEntries: CallStacks.size());
172
173 // Compute the radix array. We encode one call stack at a time, computing the
174 // longest prefix that's shared with the previous call stack we encode. For
175 // each call stack we encode, we remember a mapping from CallStackId to its
176 // position within RadixArray.
177 //
178 // As an optimization, we encode from the last call stack in CallStacks to
179 // reduce the number of times we follow pointers to the parents. Consider the
180 // list of call stacks that has been sorted in the dictionary order:
181 //
182 // Call Stack 1: F1
183 // Call Stack 2: F1 -> F2
184 // Call Stack 3: F1 -> F2 -> F3
185 //
186 // If we traversed CallStacks in the forward order, we would end up with a
187 // radix tree like:
188 //
189 // Call Stack 1: F1
190 // |
191 // Call Stack 2: +---> F2
192 // |
193 // Call Stack 3: +---> F3
194 //
195 // Notice that each call stack jumps to the previous one. However, if we
196 // traverse CallStacks in the reverse order, then Call Stack 3 has the
197 // complete call stack encoded without any pointers. Call Stack 1 and 2 point
198 // to appropriate prefixes of Call Stack 3.
199 const llvm::SmallVector<FrameIdTy> *Prev = nullptr;
200 for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) {
201 LinearCallStackId Pos =
202 encodeCallStack(CallStack: &CallStack, Prev, MemProfFrameIndexes);
203 CallStackPos.insert({CSId, Pos});
204 Prev = &CallStack;
205 }
206
207 // "RadixArray.size() - 1" below is problematic if RadixArray is empty.
208 assert(!RadixArray.empty());
209
210 // Reverse the radix array in place. We do so mostly for intuitive
211 // deserialization where we would read the length field and then the call
212 // stack frames proper just like any other array deserialization, except
213 // that we have occasional jumps to take advantage of prefixes.
214 for (size_t I = 0, J = RadixArray.size() - 1; I < J; ++I, --J)
215 std::swap(a&: RadixArray[I], b&: RadixArray[J]);
216
217 // "Reverse" the indexes stored in CallStackPos.
218 for (auto &[K, V] : CallStackPos)
219 V = RadixArray.size() - 1 - V;
220}
221
222// Explicitly instantiate class with the utilized FrameIdTy.
223template class LLVM_EXPORT_TEMPLATE CallStackRadixTreeBuilder<FrameId>;
224template class LLVM_EXPORT_TEMPLATE CallStackRadixTreeBuilder<LinearFrameId>;
225
226template <typename FrameIdTy>
227llvm::DenseMap<FrameIdTy, FrameStat>
228computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
229 &MemProfCallStackData) {
230 llvm::DenseMap<FrameIdTy, FrameStat> Histogram;
231
232 for (const auto &KV : MemProfCallStackData) {
233 const auto &CS = KV.second;
234 for (unsigned I = 0, E = CS.size(); I != E; ++I) {
235 auto &S = Histogram[CS[I]];
236 ++S.Count;
237 S.PositionSum += I;
238 }
239 }
240 return Histogram;
241}
242
243// Explicitly instantiate function with the utilized FrameIdTy.
244template LLVM_ABI llvm::DenseMap<FrameId, FrameStat>
245computeFrameHistogram<FrameId>(
246 llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
247 &MemProfCallStackData);
248template LLVM_ABI llvm::DenseMap<LinearFrameId, FrameStat>
249computeFrameHistogram<LinearFrameId>(
250 llvm::MapVector<CallStackId, llvm::SmallVector<LinearFrameId>>
251 &MemProfCallStackData);
252} // namespace memprof
253} // namespace llvm
254