/*===- CtxInstrProfiling.h- Contextual instrumentation-based PGO ---------===*\
|*
|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|* See https://llvm.org/LICENSE.txt for license information.
|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|*
\*===----------------------------------------------------------------------===*/

#ifndef CTX_PROFILE_CTXINSTRPROFILING_H_
#define CTX_PROFILE_CTXINSTRPROFILING_H_

#include "CtxInstrContextNode.h"
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include <sanitizer/common_interface_defs.h>

using namespace llvm::ctx_profile;

// Forward-declare for the one unittest checking Arena construction zeroes out
// its allocatable space.
class ArenaTest_ZeroInit_Test;
namespace __ctx_profile {

static constexpr size_t ExpectedAlignment = 8;
// We really depend on this, see further below. We currently support x86_64.
// When we want to support other archs, we need to trace the places
// ExpectedAlignment is used and adjust accordingly.
static_assert(sizeof(void *) == ExpectedAlignment);

/// Arena (bump allocator) forming a linked list. Intentionally not thread safe.
/// Allocation and de-allocation happen using sanitizer APIs. We make that
/// explicit.
class Arena final {
public:
  // When allocating a new Arena, optionally specify an existing one to append
  // to, assumed to be the last in the Arena list. We only need to support
  // appending to the arena list.
  static Arena *allocateNewArena(size_t Size, Arena *Prev = nullptr);
  static void freeArenaList(Arena *&A);

  uint64_t size() const { return Size; }

  // Allocate S bytes or return nullptr if we don't have that many available.
  char *tryBumpAllocate(size_t S) {
    if (Pos + S > Size)
      return nullptr;
    Pos += S;
    return start() + (Pos - S);
  }

  Arena *next() const { return Next; }

  // The beginning of allocatable memory.
  const char *start() const { return const_cast<Arena *>(this)->start(); }
  const char *pos() const { return start() + Pos; }

private:
  friend class ::ArenaTest_ZeroInit_Test;
  explicit Arena(uint32_t Size);
  ~Arena() = delete;

  char *start() { return reinterpret_cast<char *>(&this[1]); }

  Arena *Next = nullptr;
  uint64_t Pos = 0;
  const uint64_t Size;
};
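
// A minimal usage sketch of the Arena chain, for illustration only. The 4096
// size and the names `Head`, `A`, and `Needed` are placeholders, not part of
// this API's contract:
//
//   Arena *Head = Arena::allocateNewArena(4096);
//   Arena *A = Head;
//   char *Mem = A->tryBumpAllocate(Needed);
//   if (!Mem) {
//     // The current arena is full: chain a new one after it and retry there.
//     A = Arena::allocateNewArena(4096, A);
//     Mem = A->tryBumpAllocate(Needed);
//   }
//   ...
//   Arena::freeArenaList(Head); // eventually release the whole chain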

// The memory available for allocation follows the Arena header, and we expect
// it to be thus aligned.
static_assert(alignof(Arena) == ExpectedAlignment);

// Verify maintenance to ContextNode doesn't change this invariant, which makes
// sure the inlined vectors are appropriately aligned.
static_assert(alignof(ContextNode) == ExpectedAlignment);

/// ContextRoots hold memory and the start of the contextual profile tree for a
/// root function.
struct ContextRoot {
  ContextNode *FirstNode = nullptr;
  Arena *FirstMemBlock = nullptr;
  Arena *CurrentMem = nullptr;

  // Count the number of entries, regardless of whether we could take the
  // `Taken` mutex.
  ::__sanitizer::atomic_uint64_t TotalEntries = {};

  // Profiles for functions we encounter when collecting a contextual profile,
  // that are not associated with a callsite. This is expected to happen for
  // signal handlers, but it also - problematically - currently happens for
  // call sites generated after profile instrumentation, primarily
  // memset/memcpy/memmove.
  // `Unhandled` serves 2 purposes:
  //   1. identifying such cases (like the memops)
  //   2. collecting a profile for them, which can at least be used as a flat
  //      profile
  ::__sanitizer::DenseMap<GUID, ContextNode *> Unhandled;
  // Keep the unhandled contexts in a list, in the order we allocate them, as
  // that makes it simpler to send them to the writer when the profile is
  // fetched.
  ContextNode *FirstUnhandledCalleeNode = nullptr;

  // Taken is used to ensure only one thread traverses the contextual graph -
  // either to read it or to write it. On the server side, the same entrypoint
  // will be entered by numerous threads, but over time, the profile aggregated
  // by collecting sequentially on one thread at a time is expected to converge
  // to the aggregate profile that would have been observable across all the
  // threads. Note that this is node-by-node aggregation, i.e. summing counters
  // of nodes at the same position in the graph, not flattening.
  // Threads that cannot lock Taken (fail TryLock) are given a "scratch
  // context" - a buffer they can clobber, safely from a memory access
  // perspective. (A sketch of this try-lock pattern follows the struct.)
  //
  // Note about "scratch"-ness: we currently ignore the data written in them
  // (which is anyway clobbered). The design allows for that not to be the case
  // - because "scratch"-ness is first and foremost about not trying to build
  // subcontexts, and is captured by tainting the pointer value (the pointer to
  // the memory treated as context) - but right now, we drop that info.
  //
  // We could consider relaxing the one-traverser-at-a-time restriction by
  // holding a few context trees per entrypoint and then aggregating them (as
  // explained above) at the end of the profile collection - it's a tradeoff
  // between collection time and memory use: higher precision can be obtained
  // either with fewer concurrent collections but more collection time, or with
  // more concurrent collections (== more memory) and less collection time.
  // Note that concurrent collection does happen for different entrypoints,
  // regardless.
  ::__sanitizer::SpinMutex Taken;
};
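
// Illustrative sketch of the try-lock pattern ContextRoot::Taken enables. The
// actual logic lives in the runtime implementation, not here; `Root` is a
// placeholder for a ContextRoot the calling code already holds:
//
//   if (Root->Taken.TryLock()) {
//     // Sole traverser/updater of the real context tree for this root.
//     ...
//     Root->Taken.Unlock();
//   } else {
//     // Someone else owns the tree: hand out a scratch buffer instead, and
//     // taint the returned pointer's LSB so callees don't build subcontexts.
//   }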

// This is allocated and zero-initialized by the compiler; the in-place
// initialization serves mostly as self-documentation and for testing.
// The design is influenced by the observation that typically (at least for
// datacenter binaries, which are the motivating target of this profiler) less
// than 10% of functions in a binary even appear in a profile (of any kind).
//
// 1) We could pre-allocate the flat profile storage in the compiler, just like
// flat instrumented profiling does. But that penalizes the static size of the
// binary for little reason.
//
// 2) We could do the above but zero-initialize the buffers (which should place
// them in .bss), and dynamically populate them. This, though, would page in
// more memory upfront for the binary's runtime.
//
// The current design trades a bit of overhead the first time a function is
// encountered *for flat profiling* against avoiding those size penalties.
struct FunctionData {
#define _PTRDECL(T, N) T *N = nullptr;
#define _VOLATILE_PTRDECL(T, N) T *volatile N = nullptr;
#define _MUTEXDECL(N) ::__sanitizer::SpinMutex N;
#define _CONTEXT_PTR ContextRoot *CtxRoot = nullptr;
  CTXPROF_FUNCTION_DATA(_PTRDECL, _CONTEXT_PTR, _VOLATILE_PTRDECL, _MUTEXDECL)
#undef _CONTEXT_PTR
#undef _PTRDECL
#undef _VOLATILE_PTRDECL
#undef _MUTEXDECL

  // Constructor for test only - since this is expected to be
  // initialized by the compiler.
  FunctionData() = default;
  ContextRoot *getOrAllocateContextRoot();

  // If (unlikely) StaticSpinMutex internals change, we need to modify the LLVM
  // instrumentation lowering side because it is responsible for allocating and
  // zero-initializing ContextRoots.
  static_assert(sizeof(Mutex) == 1);
};

/// This API is exposed for testing. See the APIs below for the contract with
/// LLVM.
inline bool isScratch(const void *Ctx) {
  return (reinterpret_cast<uint64_t>(Ctx) & 1);
}

// True if Ctx is either nullptr or not the sentinel 0x1 value.
inline bool canBeRoot(const ContextRoot *Ctx) {
  return reinterpret_cast<uintptr_t>(Ctx) != 1U;
}
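
// Illustrative example of the LSB-tainting convention checked by isScratch
// above; `Mem` is a placeholder for memory handed out as a scratch context:
//
//   auto *Scratch = reinterpret_cast<ContextNode *>(
//       reinterpret_cast<uintptr_t>(Mem) | 1);
//   assert(isScratch(Scratch)); // tainted pointers are treated as scratch
//   assert(!isScratch(Mem));    // assuming Mem itself is 8-byte aligned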

} // namespace __ctx_profile

extern "C" {

// LLVM fills these in when lowering an llvm.instrprof.callsite intrinsic.
// Position 0 is used when the current context isn't scratch, 1 when it is.
// They are volatile because of signal handlers - we mean to specifically
// control when the data is loaded.
//
/// TLS where LLVM stores the pointer of the called value, as part of lowering
/// an llvm.instrprof.callsite.
extern __thread void *volatile __llvm_ctx_profile_expected_callee[2];
/// TLS where LLVM stores the pointer inside a caller's subcontexts vector that
/// corresponds to the callsite being lowered.
extern __thread ContextNode **volatile __llvm_ctx_profile_callsite[2];
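
// Hedged sketch of the store sequence the instrumentation lowering emits right
// before a call, expressed as C++ for illustration (the real sequence is IR
// generated by LLVM; `Target`, `Ctx`, and `CallsiteIdx` are placeholders, and
// index 0 assumes the current context is not scratch - see above):
//
//   __llvm_ctx_profile_expected_callee[0] = reinterpret_cast<void *>(Target);
//   __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[CallsiteIdx];
//   Target(...); // the callee picks these up in __llvm_ctx_profile_get_context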

// __llvm_ctx_profile_current_context_root is exposed for unit testing,
// otherwise it's only used internally by compiler-rt/ctx_profile.
extern __thread __ctx_profile::ContextRoot
    *volatile __llvm_ctx_profile_current_context_root;

/// Called by LLVM in the entry BB of an "entry point" function. The returned
/// pointer may be "tainted" - its LSB set to 1 - to indicate it's scratch.
ContextNode *
__llvm_ctx_profile_start_context(__ctx_profile::FunctionData *FData, GUID Guid,
                                 uint32_t Counters, uint32_t Callsites);

/// Paired with __llvm_ctx_profile_start_context, and called at the exit of the
/// entry point function.
void __llvm_ctx_profile_release_context(__ctx_profile::FunctionData *FData);

/// Called in the entry BB of functions that are not entry points. The same
/// consideration about the LSB of the returned value applies as for
/// ..._start_context.
ContextNode *__llvm_ctx_profile_get_context(__ctx_profile::FunctionData *FData,
                                            void *Callee, GUID Guid,
                                            uint32_t NumCounters,
                                            uint32_t NumCallsites);

/// Prepares for collection. Currently this resets counter values but preserves
/// internal context tree structure.
void __llvm_ctx_profile_start_collection(unsigned AutodetectDuration = 0);

/// Completely free allocated memory.
void __llvm_ctx_profile_free();

/// Used to obtain the profile. The ProfileWriter is given each root
/// ContextNode, with the corresponding ContextRoot::Taken held, and is
/// responsible for traversing the structure underneath.
bool __llvm_ctx_profile_fetch(ProfileWriter &);
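
// Hedged end-to-end sketch of one collection cycle, using only the APIs
// declared above; `RunInstrumentedWorkload` and `MyProfileWriter` (some
// ProfileWriter implementation) are placeholders:
//
//   __llvm_ctx_profile_start_collection(); // reset counters, keep tree shape
//   RunInstrumentedWorkload();              // entry points run and collect
//   MyProfileWriter W;
//   if (__llvm_ctx_profile_fetch(W))
//     ...                                   // W received the roots, one at a time
//   __llvm_ctx_profile_free();              // release all arenas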
}
#endif // CTX_PROFILE_CTXINSTRPROFILING_H_