#include "llvm/ProfileData/MemProf.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/ProfileData/SampleProf.h"
#include "llvm/Support/BLAKE3.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/HashBuilder.h"

namespace llvm {
namespace memprof {
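// Return a schema listing every field defined in MIBEntryDef.inc, in
// definition order.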
MemProfSchema getFullSchema() {
  MemProfSchema List;
#define MIBEntryDef(NameTag, Name, Type) List.push_back(Meta::Name);
#include "llvm/ProfileData/MIBEntryDef.inc"
#undef MIBEntryDef
  return List;
}
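
// Return the reduced schema carrying only the fields needed for hot/cold
// hinting.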
MemProfSchema getHotColdSchema() {
  return {Meta::AllocCount, Meta::TotalSize, Meta::TotalLifetime,
          Meta::TotalLifetimeAccessDensity};
}

static size_t serializedSizeV0(const IndexedAllocationInfo &IAI,
                               const MemProfSchema &Schema) {
  size_t Size = 0;
  // The number of frames to serialize.
  Size += sizeof(uint64_t);
  // The callstack frame ids.
  Size += sizeof(FrameId) * IAI.CallStack.size();
  // The size of the payload.
  Size += PortableMemInfoBlock::serializedSize(Schema);
  return Size;
}

static size_t serializedSizeV2(const IndexedAllocationInfo &IAI,
                               const MemProfSchema &Schema) {
  size_t Size = 0;
  // The CallStackId
  Size += sizeof(CallStackId);
  // The size of the payload.
  Size += PortableMemInfoBlock::serializedSize(Schema);
  return Size;
}

static size_t serializedSizeV3(const IndexedAllocationInfo &IAI,
                               const MemProfSchema &Schema) {
  size_t Size = 0;
  // The linear call stack ID.
  Size += sizeof(LinearCallStackId);
  // The size of the payload.
  Size += PortableMemInfoBlock::serializedSize(Schema);
  return Size;
}

size_t IndexedAllocationInfo::serializedSize(const MemProfSchema &Schema,
                                             IndexedVersion Version) const {
  switch (Version) {
  case Version0:
  case Version1:
    return serializedSizeV0(*this, Schema);
  case Version2:
    return serializedSizeV2(*this, Schema);
  case Version3:
    return serializedSizeV3(*this, Schema);
  }
  llvm_unreachable("unsupported MemProf version");
}

static size_t serializedSizeV0(const IndexedMemProfRecord &Record,
                               const MemProfSchema &Schema) {
  // The number of alloc sites to serialize.
  size_t Result = sizeof(uint64_t);
  for (const IndexedAllocationInfo &N : Record.AllocSites)
    Result += N.serializedSize(Schema, Version0);

  // The number of callsites we have information for.
  Result += sizeof(uint64_t);
  for (const auto &Frames : Record.CallSites) {
    // The number of frame ids to serialize.
    Result += sizeof(uint64_t);
    Result += Frames.size() * sizeof(FrameId);
  }
  return Result;
}

static size_t serializedSizeV2(const IndexedMemProfRecord &Record,
                               const MemProfSchema &Schema) {
  // The number of alloc sites to serialize.
  size_t Result = sizeof(uint64_t);
  for (const IndexedAllocationInfo &N : Record.AllocSites)
    Result += N.serializedSize(Schema, Version2);

  // The number of callsites we have information for.
  Result += sizeof(uint64_t);
  // The CallStackId
  Result += Record.CallSiteIds.size() * sizeof(CallStackId);
  return Result;
}

static size_t serializedSizeV3(const IndexedMemProfRecord &Record,
                               const MemProfSchema &Schema) {
  // The number of alloc sites to serialize.
  size_t Result = sizeof(uint64_t);
  for (const IndexedAllocationInfo &N : Record.AllocSites)
    Result += N.serializedSize(Schema, Version3);

  // The number of callsites we have information for.
  Result += sizeof(uint64_t);
  // The linear call stack ID.
  Result += Record.CallSiteIds.size() * sizeof(LinearCallStackId);
  return Result;
}

size_t IndexedMemProfRecord::serializedSize(const MemProfSchema &Schema,
                                            IndexedVersion Version) const {
  switch (Version) {
  case Version0:
  case Version1:
    return serializedSizeV0(*this, Schema);
  case Version2:
    return serializedSizeV2(*this, Schema);
  case Version3:
    return serializedSizeV3(*this, Schema);
  }
  llvm_unreachable("unsupported MemProf version");
}

static void serializeV0(const IndexedMemProfRecord &Record,
                        const MemProfSchema &Schema, raw_ostream &OS) {
  using namespace support;

  endian::Writer LE(OS, llvm::endianness::little);

  LE.write<uint64_t>(Record.AllocSites.size());
  for (const IndexedAllocationInfo &N : Record.AllocSites) {
    LE.write<uint64_t>(N.CallStack.size());
    for (const FrameId &Id : N.CallStack)
      LE.write<FrameId>(Id);
    N.Info.serialize(Schema, OS);
  }

  // Related contexts.
  LE.write<uint64_t>(Record.CallSites.size());
  for (const auto &Frames : Record.CallSites) {
    LE.write<uint64_t>(Frames.size());
    for (const FrameId &Id : Frames)
      LE.write<FrameId>(Id);
  }
}

static void serializeV2(const IndexedMemProfRecord &Record,
                        const MemProfSchema &Schema, raw_ostream &OS) {
  using namespace support;

  endian::Writer LE(OS, llvm::endianness::little);

  LE.write<uint64_t>(Record.AllocSites.size());
  for (const IndexedAllocationInfo &N : Record.AllocSites) {
    LE.write<CallStackId>(N.CSId);
    N.Info.serialize(Schema, OS);
  }

  // Related contexts.
  LE.write<uint64_t>(Record.CallSiteIds.size());
  for (const auto &CSId : Record.CallSiteIds)
    LE.write<CallStackId>(CSId);
}
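
// Serialize Record in the V3 format. MemProfCallStackIndexes must map every
// CallStackId referenced by Record to its LinearCallStackId.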
static void serializeV3(
    const IndexedMemProfRecord &Record, const MemProfSchema &Schema,
    raw_ostream &OS,
    llvm::DenseMap<CallStackId, LinearCallStackId> &MemProfCallStackIndexes) {
  using namespace support;

  endian::Writer LE(OS, llvm::endianness::little);

  LE.write<uint64_t>(Record.AllocSites.size());
  for (const IndexedAllocationInfo &N : Record.AllocSites) {
    assert(MemProfCallStackIndexes.contains(N.CSId));
    LE.write<LinearCallStackId>(MemProfCallStackIndexes[N.CSId]);
    N.Info.serialize(Schema, OS);
  }

  // Related contexts.
  LE.write<uint64_t>(Record.CallSiteIds.size());
  for (const auto &CSId : Record.CallSiteIds) {
    assert(MemProfCallStackIndexes.contains(CSId));
    LE.write<LinearCallStackId>(MemProfCallStackIndexes[CSId]);
  }
}

void IndexedMemProfRecord::serialize(
    const MemProfSchema &Schema, raw_ostream &OS, IndexedVersion Version,
    llvm::DenseMap<CallStackId, LinearCallStackId> *MemProfCallStackIndexes)
    const {
  switch (Version) {
  case Version0:
  case Version1:
    serializeV0(*this, Schema, OS);
    return;
  case Version2:
    serializeV2(*this, Schema, OS);
    return;
  case Version3:
    serializeV3(*this, Schema, OS, *MemProfCallStackIndexes);
    return;
  }
  llvm_unreachable("unsupported MemProf version");
}

static IndexedMemProfRecord deserializeV0(const MemProfSchema &Schema,
                                          const unsigned char *Ptr) {
  using namespace support;

  IndexedMemProfRecord Record;

  // Read the meminfo nodes.
  const uint64_t NumNodes =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  for (uint64_t I = 0; I < NumNodes; I++) {
    IndexedAllocationInfo Node;
    const uint64_t NumFrames =
        endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
    for (uint64_t J = 0; J < NumFrames; J++) {
      const FrameId Id =
          endian::readNext<FrameId, llvm::endianness::little>(Ptr);
      Node.CallStack.push_back(Id);
    }
    Node.CSId = hashCallStack(Node.CallStack);
    Node.Info.deserialize(Schema, Ptr);
    Ptr += PortableMemInfoBlock::serializedSize(Schema);
    Record.AllocSites.push_back(Node);
  }

  // Read the callsite information.
  const uint64_t NumCtxs =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  for (uint64_t J = 0; J < NumCtxs; J++) {
    const uint64_t NumFrames =
        endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
    llvm::SmallVector<FrameId> Frames;
    Frames.reserve(NumFrames);
    for (uint64_t K = 0; K < NumFrames; K++) {
      const FrameId Id =
          endian::readNext<FrameId, llvm::endianness::little>(Ptr);
      Frames.push_back(Id);
    }
    Record.CallSites.push_back(Frames);
    Record.CallSiteIds.push_back(hashCallStack(Frames));
  }

  return Record;
}

static IndexedMemProfRecord deserializeV2(const MemProfSchema &Schema,
                                          const unsigned char *Ptr) {
  using namespace support;

  IndexedMemProfRecord Record;

  // Read the meminfo nodes.
  const uint64_t NumNodes =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.AllocSites.reserve(NumNodes);
  for (uint64_t I = 0; I < NumNodes; I++) {
    IndexedAllocationInfo Node;
    Node.CSId = endian::readNext<CallStackId, llvm::endianness::little>(Ptr);
    Node.Info.deserialize(Schema, Ptr);
    Ptr += PortableMemInfoBlock::serializedSize(Schema);
    Record.AllocSites.push_back(Node);
  }

  // Read the callsite information.
  const uint64_t NumCtxs =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.CallSiteIds.reserve(NumCtxs);
  for (uint64_t J = 0; J < NumCtxs; J++) {
    CallStackId CSId =
        endian::readNext<CallStackId, llvm::endianness::little>(Ptr);
    Record.CallSiteIds.push_back(CSId);
  }

  return Record;
}
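
// Deserialize a V3 record. In V3, call stacks are referenced by
// LinearCallStackId rather than by hash-based CallStackId; the linear IDs are
// read into the CSId and CallSiteIds fields (see the static_assert below).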
static IndexedMemProfRecord deserializeV3(const MemProfSchema &Schema,
                                          const unsigned char *Ptr) {
  using namespace support;

  IndexedMemProfRecord Record;

  // Read the meminfo nodes.
  const uint64_t NumNodes =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.AllocSites.reserve(NumNodes);
  for (uint64_t I = 0; I < NumNodes; I++) {
    IndexedAllocationInfo Node;
    Node.CSId =
        endian::readNext<LinearCallStackId, llvm::endianness::little>(Ptr);
    Node.Info.deserialize(Schema, Ptr);
    Ptr += PortableMemInfoBlock::serializedSize(Schema);
    Record.AllocSites.push_back(Node);
  }

  // Read the callsite information.
  const uint64_t NumCtxs =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.CallSiteIds.reserve(NumCtxs);
  for (uint64_t J = 0; J < NumCtxs; J++) {
    // We are storing LinearCallStackId in CallSiteIds, which is a vector of
    // CallStackId. Assert that CallStackId is no smaller than
    // LinearCallStackId.
    static_assert(sizeof(LinearCallStackId) <= sizeof(CallStackId));
    LinearCallStackId CSId =
        endian::readNext<LinearCallStackId, llvm::endianness::little>(Ptr);
    Record.CallSiteIds.push_back(CSId);
  }

  return Record;
}

IndexedMemProfRecord
IndexedMemProfRecord::deserialize(const MemProfSchema &Schema,
                                  const unsigned char *Ptr,
                                  IndexedVersion Version) {
  switch (Version) {
  case Version0:
  case Version1:
    return deserializeV0(Schema, Ptr);
  case Version2:
    return deserializeV2(Schema, Ptr);
  case Version3:
    return deserializeV3(Schema, Ptr);
  }
  llvm_unreachable("unsupported MemProf version");
}
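
// Materialize a MemProfRecord from this indexed record by resolving each
// CallStackId into its sequence of Frames via Callback.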
MemProfRecord IndexedMemProfRecord::toMemProfRecord(
    llvm::function_ref<std::vector<Frame>(const CallStackId)> Callback) const {
  MemProfRecord Record;

  Record.AllocSites.reserve(AllocSites.size());
  for (const IndexedAllocationInfo &IndexedAI : AllocSites) {
    AllocationInfo AI;
    AI.Info = IndexedAI.Info;
    AI.CallStack = Callback(IndexedAI.CSId);
    Record.AllocSites.push_back(std::move(AI));
  }

  Record.CallSites.reserve(CallSiteIds.size());
  for (CallStackId CSId : CallSiteIds)
    Record.CallSites.push_back(Callback(CSId));

  return Record;
}

GlobalValue::GUID IndexedMemProfRecord::getGUID(const StringRef FunctionName) {
  // Canonicalize the function name to drop suffixes such as ".llvm.". Note
  // we do not drop any ".__uniq." suffixes, as getCanonicalFnName does not drop
  // those by default. This is by design to differentiate internal linkage
  // functions during matching. By dropping the other suffixes we can then match
  // functions in the profile use phase prior to their addition. Note that this
  // applies to both instrumented and sampled function names.
  StringRef CanonicalName =
      sampleprof::FunctionSamples::getCanonicalFnName(FunctionName);

  // We use the function guid which we expect to be a uint64_t. At
  // this time, it is the lower 64 bits of the md5 of the canonical
  // function name.
  return Function::getGUID(CanonicalName);
}
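
// Read a schema from Buffer: a count of fields followed by that many field
// tags. On success, advance Buffer to just past the schema.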
Expected<MemProfSchema> readMemProfSchema(const unsigned char *&Buffer) {
  using namespace support;

  const unsigned char *Ptr = Buffer;
  const uint64_t NumSchemaIds =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  if (NumSchemaIds > static_cast<uint64_t>(Meta::Size)) {
    return make_error<InstrProfError>(instrprof_error::malformed,
                                      "memprof schema invalid");
  }

  MemProfSchema Result;
  for (size_t I = 0; I < NumSchemaIds; I++) {
    const uint64_t Tag =
        endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
    if (Tag >= static_cast<uint64_t>(Meta::Size)) {
      return make_error<InstrProfError>(instrprof_error::malformed,
                                        "memprof schema invalid");
    }
    Result.push_back(static_cast<Meta>(Tag));
  }
  // Advance the buffer to one past the schema if we succeeded.
  Buffer = Ptr;
  return Result;
}
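
// Compute a CallStackId for CS by hashing its frame IDs with truncated BLAKE3
// and taking the first 8 bytes of the digest.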
CallStackId hashCallStack(ArrayRef<FrameId> CS) {
  llvm::HashBuilder<llvm::TruncatedBLAKE3<8>, llvm::endianness::little>
      HashBuilder;
  for (FrameId F : CS)
    HashBuilder.add(F);
  llvm::BLAKE3Result<8> Hash = HashBuilder.final();
  CallStackId CSId;
  std::memcpy(&CSId, Hash.data(), sizeof(Hash));
  return CSId;
}

// Encode a call stack into RadixArray. Return the starting index within
// RadixArray. For each call stack we encode, we emit two or three components
// into RadixArray. If a given call stack doesn't have a common prefix relative
// to the previous one, we emit:
//
// - the frames in the given call stack in the root-to-leaf order
//
// - the length of the given call stack
//
// If a given call stack has a non-empty common prefix relative to the previous
// one, we emit:
//
// - the relative location of the common prefix, encoded as a negative number
//
// - the portion of the given call stack beyond the common prefix
//
// - the length of the given call stack, including the length of the common
//   prefix
//
// The resulting RadixArray requires a somewhat unintuitive backward traversal
// to reconstruct a call stack -- read the call stack length and scan backward
// while collecting frames in the leaf-to-root order. build, the caller of this
// function, reverses RadixArray in place so that we can reconstruct a call
// stack as if we were deserializing an array in a typical way -- the call
// stack length followed by the frames in the leaf-to-root order, except that
// we need to handle pointers to parents along the way.
//
// To quickly determine the location of the common prefix within RadixArray,
// Indexes caches the indexes of the previous call stack's frames within
// RadixArray.
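//
// For illustration only (frames and linear frame IDs below are made up):
// suppose we encode call stack B = {f4, f2, f1} and then A = {f3, f2, f1},
// both stored leaf-to-root, where Ln denotes the linear frame ID of fn.
// Before the reversal performed by build, RadixArray would look like:
//
//   after encoding B:  L1 L2 L4 3
//   after encoding A:  L1 L2 L4 3 -3 L3 3
//
// The -3 is the offset from index 4 back to index 1, where L2 -- the deepest
// frame of the common prefix f1 f2 -- was recorded. The returned starting
// indexes are 3 for B and 6 for A (the slots holding the lengths).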
LinearCallStackId CallStackRadixTreeBuilder::encodeCallStack(
    const llvm::SmallVector<FrameId> *CallStack,
    const llvm::SmallVector<FrameId> *Prev,
    const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes) {
  // Compute the length of the common root prefix between Prev and CallStack.
  uint32_t CommonLen = 0;
  if (Prev) {
    auto Pos = std::mismatch(Prev->rbegin(), Prev->rend(), CallStack->rbegin(),
                             CallStack->rend());
    CommonLen = std::distance(CallStack->rbegin(), Pos.second);
  }

  // Drop the portion beyond CommonLen.
  assert(CommonLen <= Indexes.size());
  Indexes.resize(CommonLen);

  // Append a pointer to the parent.
  if (CommonLen) {
    uint32_t CurrentIndex = RadixArray.size();
    uint32_t ParentIndex = Indexes.back();
    // The offset to the parent must be negative because we are pointing to an
    // element we've already added to RadixArray.
    assert(ParentIndex < CurrentIndex);
    RadixArray.push_back(ParentIndex - CurrentIndex);
  }

  // Copy the part of the call stack beyond the common prefix to RadixArray.
  assert(CommonLen <= CallStack->size());
  for (FrameId F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) {
    // Remember the index of F in RadixArray.
    Indexes.push_back(RadixArray.size());
    RadixArray.push_back(MemProfFrameIndexes.find(F)->second);
  }
  assert(CallStack->size() == Indexes.size());

  // End with the call stack length.
  RadixArray.push_back(CallStack->size());

  // Return the index within RadixArray where we can start reconstructing a
  // given call stack from.
  return RadixArray.size() - 1;
}

void CallStackRadixTreeBuilder::build(
    llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
        &&MemProfCallStackData,
    const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes,
    llvm::DenseMap<FrameId, FrameStat> &FrameHistogram) {
  // Take the vector portion of MemProfCallStackData. The vector is exactly
  // what we need to sort. Also, we no longer need its lookup capability.
  llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector();

  // Return early if we have no work to do.
  if (CallStacks.empty()) {
    RadixArray.clear();
    CallStackPos.clear();
    return;
  }

  // Sorting the list of call stacks in the dictionary order is sufficient to
  // maximize the length of the common prefix between two adjacent call stacks
  // and thus minimize the length of RadixArray. However, we go one step
  // further and try to reduce the number of times we follow pointers to
  // parents during deserialization. Consider a poorly encoded radix tree:
  //
  //   CallStackId 1:  f1 -> f2 -> f3
  //                    |
  //   CallStackId 2:   +--- f4 -> f5
  //                         |
  //   CallStackId 3:        +--> f6
  //
  // Here, f2 and f4 appear once and twice, respectively, in the call stacks.
  // Once we encode CallStackId 1 into RadixArray, every other call stack with
  // common prefix f1 ends up pointing to CallStackId 1. Since CallStackId 3
  // shares "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers
  // to parents twice.
  //
  // We try to alleviate the situation by sorting the list of call stacks by
  // comparing the popularity of frames rather than the integer values of
  // FrameIds. In the example above, f4 is more popular than f2, so we sort
  // the call stacks and encode them as:
  //
  //   CallStackId 2:  f1 -- f4 -> f5
  //                    |     |
  //   CallStackId 3:   |     +--> f6
  //                    |
  //   CallStackId 1:   +--> f2 -> f3
  //
  // Notice that CallStackId 3 follows a pointer to a parent only once.
  //
  // All this is a quick-n-dirty trick to reduce the number of jumps. The
  // proper way would be to compute the weight of each radix tree node -- how
  // many call stacks use a given radix tree node, and encode a radix tree
  // from the heaviest node first. We do not do so because that's a lot of
  // work.
  llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) {
    // Call stacks are stored from leaf to root. Perform comparisons from the
    // root.
    return std::lexicographical_compare(
        L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(),
        [&](FrameId F1, FrameId F2) {
          uint64_t H1 = FrameHistogram[F1].Count;
          uint64_t H2 = FrameHistogram[F2].Count;
          // Popular frames should come later because we encode call stacks
          // from the last one in the list.
          if (H1 != H2)
            return H1 < H2;
          // For sort stability.
          return F1 < F2;
        });
  });

  // Reserve some reasonable amount of storage.
  RadixArray.clear();
  RadixArray.reserve(CallStacks.size() * 8);

  // Indexes will grow as long as the longest call stack.
  Indexes.clear();
  Indexes.reserve(512);

  // CallStackPos will grow to exactly CallStacks.size() entries.
  CallStackPos.clear();
  CallStackPos.reserve(CallStacks.size());

  // Compute the radix array. We encode one call stack at a time, computing
  // the longest prefix that's shared with the previous call stack we encode.
  // For each call stack we encode, we remember a mapping from CallStackId to
  // its position within RadixArray.
  //
  // As an optimization, we encode from the last call stack in CallStacks to
  // reduce the number of times we follow pointers to the parents. Consider
  // the list of call stacks that has been sorted in the dictionary order:
  //
  //   Call Stack 1: F1
  //   Call Stack 2: F1 -> F2
  //   Call Stack 3: F1 -> F2 -> F3
  //
  // If we traversed CallStacks in the forward order, we would end up with a
  // radix tree like:
  //
  //   Call Stack 1:  F1
  //                   |
  //   Call Stack 2:   +---> F2
  //                          |
  //   Call Stack 3:           +---> F3
  //
  // Notice that each call stack jumps to the previous one. However, if we
  // traverse CallStacks in the reverse order, then Call Stack 3 has the
  // complete call stack encoded without any pointers. Call Stack 1 and 2
  // point to appropriate prefixes of Call Stack 3.
  const llvm::SmallVector<FrameId> *Prev = nullptr;
  for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) {
    LinearCallStackId Pos =
        encodeCallStack(&CallStack, Prev, MemProfFrameIndexes);
    CallStackPos.insert({CSId, Pos});
    Prev = &CallStack;
  }

  // "RadixArray.size() - 1" below is problematic if RadixArray is empty.
  assert(!RadixArray.empty());

  // Reverse the radix array in place. We do so mostly for intuitive
  // deserialization where we would read the length field and then the call
  // stack frames proper just like any other array deserialization, except
  // that we have occasional jumps to take advantage of prefixes.
  for (size_t I = 0, J = RadixArray.size() - 1; I < J; ++I, --J)
    std::swap(RadixArray[I], RadixArray[J]);

  // "Reverse" the indexes stored in CallStackPos.
  for (auto &[K, V] : CallStackPos)
    V = RadixArray.size() - 1 - V;
}
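
// Build a histogram over frames: for each frame, count its occurrences across
// all call stacks and accumulate the sum of its positions within them
// (position 0 being the leaf).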
llvm::DenseMap<FrameId, FrameStat>
computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
                          &MemProfCallStackData) {
  llvm::DenseMap<FrameId, FrameStat> Histogram;

  for (const auto &KV : MemProfCallStackData) {
    const auto &CS = KV.second;
    for (unsigned I = 0, E = CS.size(); I != E; ++I) {
      auto &S = Histogram[CS[I]];
      ++S.Count;
      S.PositionSum += I;
    }
  }
  return Histogram;
}
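
// Check that each allocation site's CSId matches the hash of its call stack.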
void verifyIndexedMemProfRecord(const IndexedMemProfRecord &Record) {
  for (const auto &AS : Record.AllocSites) {
    assert(AS.CSId == hashCallStack(AS.CallStack));
    (void)AS;
  }
}

void verifyFunctionProfileData(
    const llvm::MapVector<GlobalValue::GUID, IndexedMemProfRecord>
        &FunctionProfileData) {
  for (const auto &[GUID, Record] : FunctionProfileData) {
    (void)GUID;
    verifyIndexedMemProfRecord(Record);
  }
}

} // namespace memprof
} // namespace llvm