/*===- CtxInstrProfiling.h- Contextual instrumentation-based PGO ---------===*\
|*
|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|* See https://llvm.org/LICENSE.txt for license information.
|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|*
\*===----------------------------------------------------------------------===*/

#ifndef CTX_PROFILE_CTXINSTRPROFILING_H_
#define CTX_PROFILE_CTXINSTRPROFILING_H_

#include "CtxInstrContextNode.h"
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include <sanitizer/common_interface_defs.h>

using namespace llvm::ctx_profile;

// Forward-declare for the one unittest checking Arena construction zeroes out
// its allocatable space.
class ArenaTest_ZeroInit_Test;
namespace __ctx_profile {

static constexpr size_t ExpectedAlignment = 8;
// We really depend on this, see further below. We currently support x86_64.
// When we want to support other archs, we need to trace the places
// ExpectedAlignment is used and adjust accordingly.
static_assert(sizeof(void *) == ExpectedAlignment);

/// Arena (bump allocator) forming a linked list. Intentionally not thread safe.
/// Allocation and de-allocation happen using sanitizer APIs. We make that
/// explicit.
class Arena final {
public:
  // When allocating a new Arena, optionally specify an existing one to append
  // to, assumed to be the last in the Arena list. We only need to support
  // appending to the arena list.
  static Arena *allocateNewArena(size_t Size, Arena *Prev = nullptr);
  static void freeArenaList(Arena *&A);

  uint64_t size() const { return Size; }

  // Allocate S bytes or return nullptr if we don't have that many available.
  char *tryBumpAllocate(size_t S) {
    if (Pos + S > Size)
      return nullptr;
    Pos += S;
    return start() + (Pos - S);
  }

  Arena *next() const { return Next; }

  // The beginning of the allocatable memory.
  const char *start() const { return const_cast<Arena *>(this)->start(); }
  const char *pos() const { return start() + Pos; }

private:
  friend class ::ArenaTest_ZeroInit_Test;
  explicit Arena(uint32_t Size);
  ~Arena() = delete;

  char *start() { return reinterpret_cast<char *>(&this[1]); }

  Arena *Next = nullptr;
  uint64_t Pos = 0;
  const uint64_t Size;
};
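
// Illustrative only - a usage sketch for the Arena list above. The 1 << 16
// size and the 64-byte request are arbitrary; per the unittest referenced
// above, freshly allocated arena space is zero-initialized.
//
//   Arena *First = Arena::allocateNewArena(1 << 16);
//   if (char *Mem = First->tryBumpAllocate(64)) {
//     // 64 zero-initialized bytes are usable at Mem.
//   }
//   // When the current arena can't satisfy a request, append a new one:
//   Arena *Current = Arena::allocateNewArena(1 << 16, /*Prev=*/First);
//   (void)Current;
//   // Eventually release the whole list, starting from its head:
//   Arena::freeArenaList(First);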

// The memory available for allocation follows the Arena header, and we expect
// it to be thus aligned.
static_assert(alignof(Arena) == ExpectedAlignment);

// Verify maintenance to ContextNode doesn't change this invariant, which makes
// sure the inlined vectors are appropriately aligned.
static_assert(alignof(ContextNode) == ExpectedAlignment);

/// ContextRoots hold memory and the start of the contextual profile tree for a
/// root function.
struct ContextRoot {
  ContextNode *FirstNode = nullptr;
  Arena *FirstMemBlock = nullptr;
  Arena *CurrentMem = nullptr;

  // Count the number of entries, regardless of whether we could take the
  // `Taken` mutex.
  ::__sanitizer::atomic_uint64_t TotalEntries = {};

  // Profiles for functions we encounter when collecting a contextual profile,
  // that are not associated with a callsite. This is expected to happen for
  // signal handlers, but it also - problematically - currently happens for
  // call sites generated after profile instrumentation, primarily
  // mem{cpy|move|set}.
  // `Unhandled` serves 2 purposes:
  //   1. identifying such cases (like the memops)
  //   2. collecting a profile for them, which can be at least used as a flat
  //      profile
  ::__sanitizer::DenseMap<GUID, ContextNode *> Unhandled;
  // Keep the unhandled contexts in a list, as we allocate them, as it makes it
  // simpler to send to the writer when the profile is fetched.
  ContextNode *FirstUnhandledCalleeNode = nullptr;

  // Taken is used to ensure only one thread traverses the contextual graph -
  // either to read it or to write it. On the server side, the same entrypoint
  // will be entered by numerous threads, but over time, the profile aggregated
  // by collecting sequentially on one thread at a time is expected to converge
  // to the aggregate profile that would have been observable across all the
  // threads. Note that this is node-by-node aggregation, i.e. summing counters
  // of nodes at the same position in the graph, not flattening.
  // Threads that cannot lock Taken (fail TryLock) are given a "scratch
  // context" - a buffer they can clobber, safely from a memory access
  // perspective.
  //
  // Note about "scratch"-ness: we currently ignore the data written in them
  // (which is anyway clobbered). The design allows for that not to be the case
  // - because "scratch"-ness is first and foremost about not trying to build
  // subcontexts, and is captured by tainting the pointer value (the pointer to
  // the memory treated as context) - but right now, we drop that info.
  //
  // We could relax the one-thread-at-a-time requirement by holding a few
  // context trees per entrypoint and then aggregating them (as explained
  // above) at the end of the profile collection - it's a tradeoff between
  // collection time and memory use: higher precision can be obtained either
  // with fewer concurrent collections but more collection time, or with more
  // concurrent collections (== more memory) and less collection time. Note
  // that concurrent collection does happen for different entrypoints,
  // regardless.
  ::__sanitizer::SpinMutex Taken;
};
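
// Illustrative only - not the actual implementation: the gist of how the
// runtime is expected to use `Taken` and scratch contexts when a root is
// entered. `Root` is hypothetical; the real logic lives in the runtime
// implementation.
//
//   if (Root->Taken.TryLock()) {
//     // This thread owns the tree: counters and subcontexts are updated in
//     // the real ContextNodes, and Taken is released when the root exits.
//   } else {
//     // Another thread (or the profile writer) holds Taken: hand out a
//     // "scratch" context - a clobberable buffer whose pointer is tainted
//     // (LSB set to 1) - so counting stays memory-safe, but no subcontexts
//     // are built underneath it.
//   }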

// This is allocated and zero-initialized by the compiler; the in-place
// initialization serves mostly as self-documentation and for testing.
// The design is influenced by the observation that typically (at least for
// datacenter binaries, which are the motivating target of this profiler) less
// than 10% of the functions in a binary even appear in a profile (of any
// kind).
//
// 1) We could pre-allocate the flat profile storage in the compiler, just like
// flat instrumented profiling does. But that penalizes the static size of the
// binary for little reason.
//
// 2) We could do the above but zero-initialize the buffers (which should place
// them in .bss), and dynamically populate them. This, though, would page in
// more memory upfront at runtime.
//
// The current design trades a bit of overhead the first time a function is
// encountered *for flat profiling* in exchange for avoiding those size
// penalties. An illustrative sketch of the resulting lazy-initialization
// pattern follows the struct.
struct FunctionData {
#define _PTRDECL(T, N) T *N = nullptr;
#define _VOLATILE_PTRDECL(T, N) T *volatile N = nullptr;
#define _MUTEXDECL(N) ::__sanitizer::SpinMutex N;
#define _CONTEXT_PTR ContextRoot *CtxRoot = nullptr;
  CTXPROF_FUNCTION_DATA(_PTRDECL, _CONTEXT_PTR, _VOLATILE_PTRDECL, _MUTEXDECL)
#undef _CONTEXT_PTR
#undef _PTRDECL
#undef _VOLATILE_PTRDECL
#undef _MUTEXDECL

  // Constructor for tests only - since this is expected to be
  // initialized by the compiler.
  FunctionData() = default;
  ContextRoot *getOrAllocateContextRoot();

  // If (unlikely) StaticSpinMutex internals change, we need to modify the LLVM
  // instrumentation lowering side because it is responsible for allocating and
  // zero-initializing ContextRoots.
  static_assert(sizeof(Mutex) == 1);
};
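
// Illustrative only - a sketch of the lazy initialization enabled by the
// volatile pointer + SpinMutex pair above. The member names `FlatCtx` and
// `Mutex`, and the helper `getOrAllocFlat`, are assumptions for illustration;
// the real member list comes from CTXPROF_FUNCTION_DATA and the real logic
// lives in the runtime implementation.
//
//   ContextNode *getOrAllocFlat(FunctionData &D, GUID Guid,
//                               uint32_t NumCounters) {
//     if (ContextNode *Existing = D.FlatCtx)    // fast path: already set up
//       return Existing;
//     __sanitizer::SpinMutexLock L(&D.Mutex);   // first encounter only
//     if (ContextNode *Existing = D.FlatCtx)    // re-check under the lock
//       return Existing;
//     ContextNode *Flat = /* allocate & zero-initialize storage for Guid */;
//     D.FlatCtx = Flat;                         // publish
//     return Flat;
//   }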

/// This API is exposed for testing. See the APIs below about the contract with
/// LLVM.
inline bool isScratch(const void *Ctx) {
  return (reinterpret_cast<uint64_t>(Ctx) & 1);
}

// True if Ctx is either nullptr or not the 0x1 sentinel value.
inline bool canBeRoot(const ContextRoot *Ctx) {
  return reinterpret_cast<uintptr_t>(Ctx) != 1U;
}
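
// Illustrative only - not part of the contract with LLVM: how the LSB
// "tainting" checked by isScratch composes with an ordinary, suitably aligned
// buffer. `Buf` is a hypothetical scratch area.
//
//   alignas(ExpectedAlignment) static char Buf[1 << 12];
//   auto *Scratch = reinterpret_cast<ContextNode *>(
//       reinterpret_cast<uintptr_t>(Buf) | 1);
//   assert(isScratch(Scratch));    // tainted pointer => scratch
//   assert(!isScratch(Buf));       // plain, aligned pointer => not scratch
//   assert(!canBeRoot(reinterpret_cast<ContextRoot *>(1)));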

} // namespace __ctx_profile

extern "C" {

// LLVM fills these in when lowering an llvm.instrprof.callsite intrinsic.
// Position 0 is used when the current context isn't scratch, 1 when it is.
// They are volatile because of signal handlers - we mean to specifically
// control when the data is loaded. See the lowering sketch after
// __llvm_ctx_profile_get_context below.
//
/// TLS where LLVM stores the pointer of the called value, as part of lowering
/// an llvm.instrprof.callsite
extern __thread void *volatile __llvm_ctx_profile_expected_callee[2];
/// TLS where LLVM stores the pointer inside a caller's subcontexts vector that
/// corresponds to the callsite being lowered.
extern __thread ContextNode **volatile __llvm_ctx_profile_callsite[2];

// __llvm_ctx_profile_current_context_root is exposed for unit testing,
// otherwise it's only used internally by compiler-rt/ctx_profile.
extern __thread __ctx_profile::ContextRoot
    *volatile __llvm_ctx_profile_current_context_root;

/// Called by LLVM in the entry BB of an "entry point" function. The returned
/// pointer may be "tainted" - its LSB set to 1 - to indicate it's scratch.
ContextNode *
__llvm_ctx_profile_start_context(__ctx_profile::FunctionData *FData, GUID Guid,
                                 uint32_t Counters, uint32_t Callsites);

/// Paired with __llvm_ctx_profile_start_context, and called at the exit of the
/// entry point function.
void __llvm_ctx_profile_release_context(__ctx_profile::FunctionData *FData);

/// Called, in its entry BB, for any function other than entry points. The same
/// consideration about the LSB of the returned value applies as for
/// ..._start_context.
ContextNode *__llvm_ctx_profile_get_context(__ctx_profile::FunctionData *FData,
                                            void *Callee, GUID Guid,
                                            uint32_t NumCounters,
                                            uint32_t NumCallsites);
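
// Illustrative only - a simplified, C++-level sketch of the code LLVM emits
// around a callsite and in the callee's entry BB (the real contract is IR
// lowering, not C++). `Caller`, `Callee`, `CallerCtx`, `CalleeData`, the GUID
// and the counts are hypothetical, and ContextNode::subContexts() is assumed
// from CtxInstrContextNode.h. Index 0 of the TLS pairs is shown; index 1 is
// used when the caller's own context is scratch.
//
//   // In Caller, just before callsite number I:
//   __llvm_ctx_profile_expected_callee[0] = reinterpret_cast<void *>(&Callee);
//   __llvm_ctx_profile_callsite[0] = &CallerCtx->subContexts()[I];
//   Callee(...);
//
//   // In Callee (not an entry point), in its entry BB:
//   ContextNode *CalleeCtx = __llvm_ctx_profile_get_context(
//       &CalleeData, reinterpret_cast<void *>(&Callee), CalleeGuid,
//       NumCounters, NumCallsites);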

/// Prepares for collection. Currently this resets counter values but preserves
/// internal context tree structure.
void __llvm_ctx_profile_start_collection(unsigned AutodetectDuration = 0);

/// Completely free allocated memory.
void __llvm_ctx_profile_free();

/// Used to obtain the profile. The Writer is called for each root ContextNode,
/// with that root's ContextRoot::Taken mutex held; the Writer is responsible
/// for traversing the structure underneath. The ProfileWriter passed in by the
/// caller acts as the sink for the profile data.
bool __llvm_ctx_profile_fetch(ProfileWriter &);
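
// Illustrative only - a typical collection cycle driven by a tool. Everything
// named here except the __llvm_ctx_profile_* entry points is hypothetical:
// `MyProfileWriter` stands for any concrete implementation of the
// ProfileWriter interface declared in CtxInstrContextNode.h, and
// `runWorkload()` for exercising the instrumented binary.
//
//   __llvm_ctx_profile_start_collection(); // reset counters, keep the trees
//   runWorkload();
//   MyProfileWriter Writer;                 // e.g. serializes to a file
//   if (!__llvm_ctx_profile_fetch(Writer)) {
//     // handle a failed fetch
//   }
//   // Optionally, when done profiling for good:
//   __llvm_ctx_profile_free();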
}
#endif // CTX_PROFILE_CTXINSTRPROFILING_H_
| 231 | |