MemProfRadixTree.cpp source code [llvm_projects/llvm/lib/ProfileData/MemProfRadixTree.cpp]

1	//===- MemProfRadixTree.cpp - Radix tree encoded callstacks ---------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	// This file contains logic that implements a space efficient radix tree
9	// encoding for callstacks used by MemProf.
10	//
11	//===----------------------------------------------------------------------===//
12
13	#include "llvm/ProfileData/MemProfRadixTree.h"
14
15	namespace llvm {
16	namespace memprof {
17	// Encode a call stack into RadixArray. Return the starting index within
18	// RadixArray. For each call stack we encode, we emit two or three components
19	// into RadixArray. If a given call stack doesn't have a common prefix relative
20	// to the previous one, we emit:
21	//
22	// - the frames in the given call stack in the root-to-leaf order
23	//
24	// - the length of the given call stack
25	//
26	// If a given call stack has a non-empty common prefix relative to the previous
27	// one, we emit:
28	//
29	// - the relative location of the common prefix, encoded as a negative number.
30	//
31	// - a portion of the given call stack that's beyond the common prefix
32	//
33	// - the length of the given call stack, including the length of the common
34	// prefix.
35	//
36	// The resulting RadixArray requires a somewhat unintuitive backward traversal
37	// to reconstruct a call stack -- read the call stack length and scan backward
38	// while collecting frames in the leaf to root order. build, the caller of this
39	// function, reverses RadixArray in place so that we can reconstruct a call
40	// stack as if we were deserializing an array in a typical way -- the call stack
41	// length followed by the frames in the leaf-to-root order except that we need
42	// to handle pointers to parents along the way.
43	//
44	// To quickly determine the location of the common prefix within RadixArray,
45	// Indexes caches the indexes of the previous call stack's frames within
46	// RadixArray.
47	template <typename FrameIdTy>
48	LinearCallStackId CallStackRadixTreeBuilder<FrameIdTy>::encodeCallStack(
49	const llvm::SmallVector<FrameIdTy> *CallStack,
50	const llvm::SmallVector<FrameIdTy> *Prev,
51	const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes) {
52	// Compute the length of the common root prefix between Prev and CallStack.
53	uint32_t CommonLen = `0`;
54	if (Prev) {
55	auto Pos = std::mismatch(Prev->rbegin(), Prev->rend(), CallStack->rbegin(),
56	CallStack->rend());
57	CommonLen = std::distance(CallStack->rbegin(), Pos.second);
58	}
59
60	// Drop the portion beyond CommonLen.
61	assert(CommonLen <= Indexes.size());
62	Indexes.resize(new_size: CommonLen);
63
64	// Append a pointer to the parent.
65	if (CommonLen) {
66	uint32_t CurrentIndex = RadixArray.size();
67	uint32_t ParentIndex = Indexes.back();
68	// The offset to the parent must be negative because we are pointing to an
69	// element we've already added to RadixArray.
70	assert(ParentIndex < CurrentIndex);
71	RadixArray.push_back(x: ParentIndex - CurrentIndex);
72	}
73
74	// Copy the part of the call stack beyond the common prefix to RadixArray.
75	assert(CommonLen <= CallStack->size());
76	for (FrameIdTy F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) {
77	// Remember the index of F in RadixArray.
78	Indexes.push_back(x: RadixArray.size());
79	RadixArray.push_back(
80	MemProfFrameIndexes ? MemProfFrameIndexes->find(F)->second : F);
81	}
82	assert(CallStack->size() == Indexes.size());
83
84	// End with the call stack length.
85	RadixArray.push_back(CallStack->size());
86
87	// Return the index within RadixArray where we can start reconstructing a
88	// given call stack from.
89	return RadixArray.size() - `1`;
90	}
91
92	template <typename FrameIdTy>
93	void CallStackRadixTreeBuilder<FrameIdTy>::build(
94	llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
95	&&MemProfCallStackData,
96	const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes,
97	llvm::DenseMap<FrameIdTy, FrameStat> &FrameHistogram) {
98	// Take the vector portion of MemProfCallStackData. The vector is exactly
99	// what we need to sort. Also, we no longer need its lookup capability.
100	llvm::SmallVector<CSIdPair, `0`> CallStacks = MemProfCallStackData.takeVector();
101
102	// Return early if we have no work to do.
103	if (CallStacks.empty()) {
104	RadixArray.clear();
105	CallStackPos.clear();
106	return;
107	}
108
109	// Sorting the list of call stacks in the dictionary order is sufficient to
110	// maximize the length of the common prefix between two adjacent call stacks
111	// and thus minimize the length of RadixArray. However, we go one step
112	// further and try to reduce the number of times we follow pointers to parents
113	// during deserilization. Consider a poorly encoded radix tree:
114	//
115	// CallStackId 1: f1 -> f2 -> f3
116	// \|
117	// CallStackId 2: +--- f4 -> f5
118	// \|
119	// CallStackId 3: +--> f6
120	//
121	// Here, f2 and f4 appear once and twice, respectively, in the call stacks.
122	// Once we encode CallStackId 1 into RadixArray, every other call stack with
123	// common prefix f1 ends up pointing to CallStackId 1. Since CallStackId 3
124	// share "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers to
125	// parents twice.
126	//
127	// We try to alleviate the situation by sorting the list of call stacks by
128	// comparing the popularity of frames rather than the integer values of
129	// FrameIds. In the example above, f4 is more popular than f2, so we sort the
130	// call stacks and encode them as:
131	//
132	// CallStackId 2: f1 -- f4 -> f5
133	// \| \|
134	// CallStackId 3: \| +--> f6
135	// \|
136	// CallStackId 1: +--> f2 -> f3
137	//
138	// Notice that CallStackId 3 follows a pointer to a parent only once.
139	//
140	// All this is a quick-n-dirty trick to reduce the number of jumps. The
141	// proper way would be to compute the weight of each radix tree node -- how
142	// many call stacks use a given radix tree node, and encode a radix tree from
143	// the heaviest node first. We do not do so because that's a lot of work.
144	llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) {
145	// Call stacks are stored from leaf to root. Perform comparisons from the
146	// root.
147	return std::lexicographical_compare(
148	L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(),
149	[&](FrameIdTy F1, FrameIdTy F2) {
150	uint64_t H1 = FrameHistogram[F1].Count;
151	uint64_t H2 = FrameHistogram[F2].Count;
152	// Popular frames should come later because we encode call stacks from
153	// the last one in the list.
154	if (H1 != H2)
155	return H1 < H2;
156	// For sort stability.
157	return F1 < F2;
158	});
159	});
160
161	// Reserve some reasonable amount of storage.
162	RadixArray.clear();
163	RadixArray.reserve(n: CallStacks.size() * `8`);
164
165	// Indexes will grow as long as the longest call stack.
166	Indexes.clear();
167	Indexes.reserve(n: `512`);
168
169	// CallStackPos will grow to exactly CallStacks.size() entries.
170	CallStackPos.clear();
171	CallStackPos.reserve(NumEntries: CallStacks.size());
172
173	// Compute the radix array. We encode one call stack at a time, computing the
174	// longest prefix that's shared with the previous call stack we encode. For
175	// each call stack we encode, we remember a mapping from CallStackId to its
176	// position within RadixArray.
177	//
178	// As an optimization, we encode from the last call stack in CallStacks to
179	// reduce the number of times we follow pointers to the parents. Consider the
180	// list of call stacks that has been sorted in the dictionary order:
181	//
182	// Call Stack 1: F1
183	// Call Stack 2: F1 -> F2
184	// Call Stack 3: F1 -> F2 -> F3
185	//
186	// If we traversed CallStacks in the forward order, we would end up with a
187	// radix tree like:
188	//
189	// Call Stack 1: F1
190	// \|
191	// Call Stack 2: +---> F2
192	// \|
193	// Call Stack 3: +---> F3
194	//
195	// Notice that each call stack jumps to the previous one. However, if we
196	// traverse CallStacks in the reverse order, then Call Stack 3 has the
197	// complete call stack encoded without any pointers. Call Stack 1 and 2 point
198	// to appropriate prefixes of Call Stack 3.
199	const llvm::SmallVector<FrameIdTy> Prev = nullptr*;
200	for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) {
201	LinearCallStackId Pos =
202	encodeCallStack(CallStack: &CallStack, Prev, MemProfFrameIndexes);
203	CallStackPos.insert({CSId, Pos});
204	Prev = &CallStack;
205	}
206
207	// "RadixArray.size() - 1" below is problematic if RadixArray is empty.
208	assert(!RadixArray.empty());
209
210	// Reverse the radix array in place. We do so mostly for intuitive
211	// deserialization where we would read the length field and then the call
212	// stack frames proper just like any other array deserialization, except
213	// that we have occasional jumps to take advantage of prefixes.
214	for (size_t I = `0`, J = RadixArray.size() - `1`; I < J; ++I, --J)
215	std::swap(a&: RadixArray [I], b&: RadixArray [J]);
216
217	// "Reverse" the indexes stored in CallStackPos.
218	for (auto &[K, V] : CallStackPos)
219	V = RadixArray.size() - `1` - V;
220	}
221
222	// Explicitly instantiate class with the utilized FrameIdTy.
223	template class LLVM_EXPORT_TEMPLATE CallStackRadixTreeBuilder<FrameId>;
224	template class LLVM_EXPORT_TEMPLATE CallStackRadixTreeBuilder<LinearFrameId>;
225
226	template <typename FrameIdTy>
227	llvm::DenseMap<FrameIdTy, FrameStat>
228	computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
229	&MemProfCallStackData) {
230	llvm::DenseMap<FrameIdTy, FrameStat> Histogram;
231
232	for (const auto &KV : MemProfCallStackData) {
233	const auto &CS = KV.second;
234	for (unsigned I = `0`, E = CS.size(); I != E; ++I) {
235	auto &S = Histogram[CS[I]];
236	++S.Count;
237	S.PositionSum += I;
238	}
239	}
240	return Histogram;
241	}
242
243	// Explicitly instantiate function with the utilized FrameIdTy.
244	template LLVM_ABI llvm::DenseMap<FrameId, FrameStat>
245	computeFrameHistogram<FrameId>(
246	llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
247	&MemProfCallStackData);
248	template LLVM_ABI llvm::DenseMap<LinearFrameId, FrameStat>
249	computeFrameHistogram<LinearFrameId>(
250	llvm::MapVector<CallStackId, llvm::SmallVector<LinearFrameId>>
251	&MemProfCallStackData);
252	} // namespace memprof
253	} // namespace llvm
254

Browse the source code of llvm_projects/llvm/lib/ProfileData/MemProfRadixTree.cpp