| 1 | //===- MemProfRadixTree.cpp - Radix tree encoded callstacks ---------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // This file contains logic that implements a space efficient radix tree |
| 9 | // encoding for callstacks used by MemProf. |
| 10 | // |
| 11 | //===----------------------------------------------------------------------===// |
| 12 | |
| 13 | #include "llvm/ProfileData/MemProfRadixTree.h" |
| 14 | |
| 15 | namespace llvm { |
| 16 | namespace memprof { |
| 17 | // Encode a call stack into RadixArray. Return the starting index within |
| 18 | // RadixArray. For each call stack we encode, we emit two or three components |
| 19 | // into RadixArray. If a given call stack doesn't have a common prefix relative |
| 20 | // to the previous one, we emit: |
| 21 | // |
| 22 | // - the frames in the given call stack in the root-to-leaf order |
| 23 | // |
| 24 | // - the length of the given call stack |
| 25 | // |
| 26 | // If a given call stack has a non-empty common prefix relative to the previous |
| 27 | // one, we emit: |
| 28 | // |
| 29 | // - the relative location of the common prefix, encoded as a negative number. |
| 30 | // |
| 31 | // - a portion of the given call stack that's beyond the common prefix |
| 32 | // |
| 33 | // - the length of the given call stack, including the length of the common |
| 34 | // prefix. |
| 35 | // |
| 36 | // The resulting RadixArray requires a somewhat unintuitive backward traversal |
| 37 | // to reconstruct a call stack -- read the call stack length and scan backward |
| 38 | // while collecting frames in the leaf to root order. build, the caller of this |
| 39 | // function, reverses RadixArray in place so that we can reconstruct a call |
| 40 | // stack as if we were deserializing an array in a typical way -- the call stack |
| 41 | // length followed by the frames in the leaf-to-root order except that we need |
| 42 | // to handle pointers to parents along the way. |
| 43 | // |
| 44 | // To quickly determine the location of the common prefix within RadixArray, |
| 45 | // Indexes caches the indexes of the previous call stack's frames within |
| 46 | // RadixArray. |
| 47 | template <typename FrameIdTy> |
| 48 | LinearCallStackId CallStackRadixTreeBuilder<FrameIdTy>::encodeCallStack( |
| 49 | const llvm::SmallVector<FrameIdTy> *CallStack, |
| 50 | const llvm::SmallVector<FrameIdTy> *Prev, |
| 51 | const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes) { |
| 52 | // Compute the length of the common root prefix between Prev and CallStack. |
| 53 | uint32_t CommonLen = 0; |
| 54 | if (Prev) { |
| 55 | auto Pos = std::mismatch(Prev->rbegin(), Prev->rend(), CallStack->rbegin(), |
| 56 | CallStack->rend()); |
| 57 | CommonLen = std::distance(CallStack->rbegin(), Pos.second); |
| 58 | } |
| 59 | |
| 60 | // Drop the portion beyond CommonLen. |
| 61 | assert(CommonLen <= Indexes.size()); |
| 62 | Indexes.resize(new_size: CommonLen); |
| 63 | |
| 64 | // Append a pointer to the parent. |
| 65 | if (CommonLen) { |
| 66 | uint32_t CurrentIndex = RadixArray.size(); |
| 67 | uint32_t ParentIndex = Indexes.back(); |
| 68 | // The offset to the parent must be negative because we are pointing to an |
| 69 | // element we've already added to RadixArray. |
| 70 | assert(ParentIndex < CurrentIndex); |
| 71 | RadixArray.push_back(x: ParentIndex - CurrentIndex); |
| 72 | } |
| 73 | |
| 74 | // Copy the part of the call stack beyond the common prefix to RadixArray. |
| 75 | assert(CommonLen <= CallStack->size()); |
| 76 | for (FrameIdTy F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) { |
| 77 | // Remember the index of F in RadixArray. |
| 78 | Indexes.push_back(x: RadixArray.size()); |
| 79 | RadixArray.push_back( |
| 80 | MemProfFrameIndexes ? MemProfFrameIndexes->find(F)->second : F); |
| 81 | } |
| 82 | assert(CallStack->size() == Indexes.size()); |
| 83 | |
| 84 | // End with the call stack length. |
| 85 | RadixArray.push_back(CallStack->size()); |
| 86 | |
| 87 | // Return the index within RadixArray where we can start reconstructing a |
| 88 | // given call stack from. |
| 89 | return RadixArray.size() - 1; |
| 90 | } |
| 91 | |
| 92 | template <typename FrameIdTy> |
| 93 | void CallStackRadixTreeBuilder<FrameIdTy>::build( |
| 94 | llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>> |
| 95 | &&MemProfCallStackData, |
| 96 | const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes, |
| 97 | llvm::DenseMap<FrameIdTy, FrameStat> &FrameHistogram) { |
| 98 | // Take the vector portion of MemProfCallStackData. The vector is exactly |
| 99 | // what we need to sort. Also, we no longer need its lookup capability. |
| 100 | llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector(); |
| 101 | |
| 102 | // Return early if we have no work to do. |
| 103 | if (CallStacks.empty()) { |
| 104 | RadixArray.clear(); |
| 105 | CallStackPos.clear(); |
| 106 | return; |
| 107 | } |
| 108 | |
| 109 | // Sorting the list of call stacks in the dictionary order is sufficient to |
| 110 | // maximize the length of the common prefix between two adjacent call stacks |
| 111 | // and thus minimize the length of RadixArray. However, we go one step |
| 112 | // further and try to reduce the number of times we follow pointers to parents |
| 113 | // during deserilization. Consider a poorly encoded radix tree: |
| 114 | // |
| 115 | // CallStackId 1: f1 -> f2 -> f3 |
| 116 | // | |
| 117 | // CallStackId 2: +--- f4 -> f5 |
| 118 | // | |
| 119 | // CallStackId 3: +--> f6 |
| 120 | // |
| 121 | // Here, f2 and f4 appear once and twice, respectively, in the call stacks. |
| 122 | // Once we encode CallStackId 1 into RadixArray, every other call stack with |
| 123 | // common prefix f1 ends up pointing to CallStackId 1. Since CallStackId 3 |
| 124 | // share "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers to |
| 125 | // parents twice. |
| 126 | // |
| 127 | // We try to alleviate the situation by sorting the list of call stacks by |
| 128 | // comparing the popularity of frames rather than the integer values of |
| 129 | // FrameIds. In the example above, f4 is more popular than f2, so we sort the |
| 130 | // call stacks and encode them as: |
| 131 | // |
| 132 | // CallStackId 2: f1 -- f4 -> f5 |
| 133 | // | | |
| 134 | // CallStackId 3: | +--> f6 |
| 135 | // | |
| 136 | // CallStackId 1: +--> f2 -> f3 |
| 137 | // |
| 138 | // Notice that CallStackId 3 follows a pointer to a parent only once. |
| 139 | // |
| 140 | // All this is a quick-n-dirty trick to reduce the number of jumps. The |
| 141 | // proper way would be to compute the weight of each radix tree node -- how |
| 142 | // many call stacks use a given radix tree node, and encode a radix tree from |
| 143 | // the heaviest node first. We do not do so because that's a lot of work. |
| 144 | llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) { |
| 145 | // Call stacks are stored from leaf to root. Perform comparisons from the |
| 146 | // root. |
| 147 | return std::lexicographical_compare( |
| 148 | L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(), |
| 149 | [&](FrameIdTy F1, FrameIdTy F2) { |
| 150 | uint64_t H1 = FrameHistogram[F1].Count; |
| 151 | uint64_t H2 = FrameHistogram[F2].Count; |
| 152 | // Popular frames should come later because we encode call stacks from |
| 153 | // the last one in the list. |
| 154 | if (H1 != H2) |
| 155 | return H1 < H2; |
| 156 | // For sort stability. |
| 157 | return F1 < F2; |
| 158 | }); |
| 159 | }); |
| 160 | |
| 161 | // Reserve some reasonable amount of storage. |
| 162 | RadixArray.clear(); |
| 163 | RadixArray.reserve(n: CallStacks.size() * 8); |
| 164 | |
| 165 | // Indexes will grow as long as the longest call stack. |
| 166 | Indexes.clear(); |
| 167 | Indexes.reserve(n: 512); |
| 168 | |
| 169 | // CallStackPos will grow to exactly CallStacks.size() entries. |
| 170 | CallStackPos.clear(); |
| 171 | CallStackPos.reserve(NumEntries: CallStacks.size()); |
| 172 | |
| 173 | // Compute the radix array. We encode one call stack at a time, computing the |
| 174 | // longest prefix that's shared with the previous call stack we encode. For |
| 175 | // each call stack we encode, we remember a mapping from CallStackId to its |
| 176 | // position within RadixArray. |
| 177 | // |
| 178 | // As an optimization, we encode from the last call stack in CallStacks to |
| 179 | // reduce the number of times we follow pointers to the parents. Consider the |
| 180 | // list of call stacks that has been sorted in the dictionary order: |
| 181 | // |
| 182 | // Call Stack 1: F1 |
| 183 | // Call Stack 2: F1 -> F2 |
| 184 | // Call Stack 3: F1 -> F2 -> F3 |
| 185 | // |
| 186 | // If we traversed CallStacks in the forward order, we would end up with a |
| 187 | // radix tree like: |
| 188 | // |
| 189 | // Call Stack 1: F1 |
| 190 | // | |
| 191 | // Call Stack 2: +---> F2 |
| 192 | // | |
| 193 | // Call Stack 3: +---> F3 |
| 194 | // |
| 195 | // Notice that each call stack jumps to the previous one. However, if we |
| 196 | // traverse CallStacks in the reverse order, then Call Stack 3 has the |
| 197 | // complete call stack encoded without any pointers. Call Stack 1 and 2 point |
| 198 | // to appropriate prefixes of Call Stack 3. |
| 199 | const llvm::SmallVector<FrameIdTy> *Prev = nullptr; |
| 200 | for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) { |
| 201 | LinearCallStackId Pos = |
| 202 | encodeCallStack(CallStack: &CallStack, Prev, MemProfFrameIndexes); |
| 203 | CallStackPos.insert({CSId, Pos}); |
| 204 | Prev = &CallStack; |
| 205 | } |
| 206 | |
| 207 | // "RadixArray.size() - 1" below is problematic if RadixArray is empty. |
| 208 | assert(!RadixArray.empty()); |
| 209 | |
| 210 | // Reverse the radix array in place. We do so mostly for intuitive |
| 211 | // deserialization where we would read the length field and then the call |
| 212 | // stack frames proper just like any other array deserialization, except |
| 213 | // that we have occasional jumps to take advantage of prefixes. |
| 214 | for (size_t I = 0, J = RadixArray.size() - 1; I < J; ++I, --J) |
| 215 | std::swap(a&: RadixArray[I], b&: RadixArray[J]); |
| 216 | |
| 217 | // "Reverse" the indexes stored in CallStackPos. |
| 218 | for (auto &[K, V] : CallStackPos) |
| 219 | V = RadixArray.size() - 1 - V; |
| 220 | } |
| 221 | |
| 222 | // Explicitly instantiate class with the utilized FrameIdTy. |
| 223 | template class LLVM_EXPORT_TEMPLATE CallStackRadixTreeBuilder<FrameId>; |
| 224 | template class LLVM_EXPORT_TEMPLATE CallStackRadixTreeBuilder<LinearFrameId>; |
| 225 | |
| 226 | template <typename FrameIdTy> |
| 227 | llvm::DenseMap<FrameIdTy, FrameStat> |
| 228 | computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>> |
| 229 | &MemProfCallStackData) { |
| 230 | llvm::DenseMap<FrameIdTy, FrameStat> Histogram; |
| 231 | |
| 232 | for (const auto &KV : MemProfCallStackData) { |
| 233 | const auto &CS = KV.second; |
| 234 | for (unsigned I = 0, E = CS.size(); I != E; ++I) { |
| 235 | auto &S = Histogram[CS[I]]; |
| 236 | ++S.Count; |
| 237 | S.PositionSum += I; |
| 238 | } |
| 239 | } |
| 240 | return Histogram; |
| 241 | } |
| 242 | |
| 243 | // Explicitly instantiate function with the utilized FrameIdTy. |
| 244 | template LLVM_ABI llvm::DenseMap<FrameId, FrameStat> |
| 245 | computeFrameHistogram<FrameId>( |
| 246 | llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> |
| 247 | &MemProfCallStackData); |
| 248 | template LLVM_ABI llvm::DenseMap<LinearFrameId, FrameStat> |
| 249 | computeFrameHistogram<LinearFrameId>( |
| 250 | llvm::MapVector<CallStackId, llvm::SmallVector<LinearFrameId>> |
| 251 | &MemProfCallStackData); |
| 252 | } // namespace memprof |
| 253 | } // namespace llvm |
| 254 | |