//===- CtxInstrProfiling.cpp - contextual instrumented PGO ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CtxInstrProfiling.h"
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_libc.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include "sanitizer_common/sanitizer_placement_new.h"
#include "sanitizer_common/sanitizer_thread_safety.h"
#include "sanitizer_common/sanitizer_vector.h"

#include <assert.h>

using namespace __ctx_profile;

namespace {
// Keep track of all the context roots we actually saw, so we can then traverse
// them when the user asks for the profile in __llvm_ctx_profile_fetch
__sanitizer::SpinMutex AllContextsMutex;
SANITIZER_GUARDED_BY(AllContextsMutex)
__sanitizer::Vector<ContextRoot *> AllContextRoots;

// utility to taint a pointer by setting the LSB. There is an assumption
// throughout that the addresses of contexts are even (really, they should be
// align(8), but "even"-ness is the minimum assumption)
// "scratch contexts" are buffers that we return in certain cases - they are
// large enough to allow for memory safe counter access, but they don't link
// subcontexts below them (the runtime recognizes them and enforces that)
ContextNode *markAsScratch(const ContextNode *Ctx) {
  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
}
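// For illustration (a sketch, not part of this file): with 8-aligned context
// addresses, a context at, say, 0x1000 reads as 0x1001 once marked as scratch.
// The isScratch() check used below (declared in the header) is assumed to just
// test that low bit, roughly:
//   bool isScratch(const void *Ctx) {
//     return reinterpret_cast<uint64_t>(Ctx) & 1;
//   }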

// Used when getting the data from TLS. We don't *really* need to reset, but
// it's a simpler system if we do.
template <typename T> inline T consume(T &V) {
  auto R = V;
  V = {0};
  return R;
}
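// E.g. (sketch): after `auto *V = consume(__llvm_ctx_profile_expected_callee[0]);`
// the TLS slot reads as null again, so a stale expected-callee value can't be
// matched by a later, unrelated __llvm_ctx_profile_get_context call.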

// We allocate at least kBuffSize Arena pages. The scratch buffer is also that
// large.
constexpr size_t kPower = 20;
constexpr size_t kBuffSize = 1 << kPower;

// Highly unlikely we need more than kBuffSize for a context.
size_t getArenaAllocSize(size_t Needed) {
  if (Needed >= kBuffSize)
    return 2 * Needed;
  return kBuffSize;
}
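// For example: with kBuffSize = 1 << 20 (1 MiB), a request for a few hundred
// bytes still gets a 1 MiB arena, while a (very unlikely) 3 MiB request gets a
// 6 MiB arena, leaving headroom for subsequent bump allocations.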

// verify the structural integrity of the context
bool validate(const ContextRoot *Root) {
  // all contexts should be laid out in some arena page. Go over each arena
  // allocated for this Root, and jump over contained contexts based on
  // self-reported sizes.
  __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
               .second)
        return false;
      Pos += Ctx->size();
    }
  }

  // Now traverse the contexts again the same way, but validate that all
  // non-null subcontext addresses appear in the set computed above.
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
        for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
          if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
            return false;

      Pos += Ctx->size();
    }
  }
  return true;
}

inline ContextNode *allocContextNode(char *Place, GUID Guid,
                                     uint32_t NrCounters, uint32_t NrCallsites,
                                     ContextNode *Next = nullptr) {
  assert(reinterpret_cast<uint64_t>(Place) % ExpectedAlignment == 0);
  return new (Place) ContextNode(Guid, NrCounters, NrCallsites, Next);
}

void resetContextNode(ContextNode &Node) {
  // FIXME(mtrofin): this is std::memset, which we can probably use if we
  // drop/reduce the dependency on sanitizer_common.
  for (uint32_t I = 0; I < Node.counters_size(); ++I)
    Node.counters()[I] = 0;
  for (uint32_t I = 0; I < Node.callsites_size(); ++I)
    for (auto *Next = Node.subContexts()[I]; Next; Next = Next->next())
      resetContextNode(*Next);
}

void onContextEnter(ContextNode &Node) { ++Node.counters()[0]; }

} // namespace

// the scratch buffer - what we give when we can't produce a real context (the
// scratch isn't "real" in that it's expected to be clobbered carelessly - we
// don't read it). The other important thing is that the callees from a scratch
// context also get a scratch context.
// Eventually this can be replaced with per-function buffers, à la the typical
// (flat) instrumented FDO buffers. The clobbering aspect won't apply there, but
// the part about determining the nature of the subcontexts does.
__thread char __Buffer[kBuffSize] = {0};

#define TheScratchContext                                                      \
  markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))

// init the TLSes
__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
                                                                 nullptr};
__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};

__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
    nullptr;

Arena::Arena(uint32_t Size) : Size(Size) {
  __sanitizer::internal_memset(start(), 0, Size);
}

// FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
// the dependency on the latter.
Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
  assert(!Prev || Prev->Next == nullptr);
  Arena *NewArena = new (__sanitizer::InternalAlloc(
      Size + sizeof(Arena), /*cache=*/nullptr, /*alignment=*/ExpectedAlignment))
      Arena(Size);
  if (Prev)
    Prev->Next = NewArena;
  return NewArena;
}

void Arena::freeArenaList(Arena *&A) {
  assert(A);
  for (auto *I = A; I != nullptr;) {
    auto *Current = I;
    I = I->Next;
    __sanitizer::InternalFree(Current);
  }
  A = nullptr;
}

// If this is the first time we hit a callsite with this (Guid) particular
// callee, we need to allocate.
ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
                             uint32_t NrCounters, uint32_t NrCallsites) {
  auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites);
  auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
  char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
  if (!AllocPlace) {
    // if we failed to allocate on the current arena, allocate a new arena,
    // and place it on __llvm_ctx_profile_current_context_root->CurrentMem so we
    // find it from now on for other cases when we need to getCallsiteSlow.
    // Note that allocateNewArena will link the allocated memory in the list of
    // Arenas.
    __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
        Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
    AllocPlace = Mem->tryBumpAllocate(AllocSize);
  }
  auto *Ret = allocContextNode(AllocPlace, Guid, NrCounters, NrCallsites,
                               *InsertionPoint);
  *InsertionPoint = Ret;
  return Ret;
}
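// Design observation: the new node is linked at the head of the per-callsite
// list (via the Next parameter above), so for indirect calls the most recently
// first-seen target ends up in front; __llvm_ctx_profile_get_context walks the
// list by GUID either way.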

ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
                                            uint32_t NrCounters,
                                            uint32_t NrCallsites) {
  // fast "out" if we're not even doing contextual collection.
  if (!__llvm_ctx_profile_current_context_root)
    return TheScratchContext;

  // also fast "out" if the caller is scratch. We can tell by looking at the
  // interior pointer into the subcontexts vector that the caller provided: if
  // the caller's context is scratch, so is that interior pointer (all the
  // address calculations use even - or, more precisely, 8-aligned - values,
  // so the taint bit survives).
  auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
  if (!CallsiteContext || isScratch(CallsiteContext))
    return TheScratchContext;

  // if the callee isn't the expected one, return scratch.
  // Signal handler(s) could have been invoked at any point in the execution.
  // Should that have happened, and had it (the handler) been built with
  // instrumentation, its __llvm_ctx_profile_get_context would have failed
  // here. Its sub call graph would have then populated
  // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
  // The normal call graph may be impacted in that, if the signal handler
  // happened somewhere before we read the TLS here, we'd see the TLS reset and
  // we'd also fail here. That would just mean we'd lose counter values for the
  // normal subgraph this time around. That should be very unlikely, but if it
  // happens too frequently, we should be able to detect discrepancies in entry
  // counts (caller-callee). At the moment, the design goes on the assumption
  // that this is infrequent enough that it's not worth doing more for that
  // case.
  auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
  if (ExpectedCallee != Callee)
    return TheScratchContext;

  auto *Callsite = *CallsiteContext;
  // in the case of indirect calls, we will have all seen targets forming a
  // linked list here. Find the one corresponding to this callee.
  while (Callsite && Callsite->guid() != Guid) {
    Callsite = Callsite->next();
  }
  auto *Ret = Callsite ? Callsite
                       : getCallsiteSlow(Guid, CallsiteContext, NrCounters,
                                         NrCallsites);
  if (Ret->callsites_size() != NrCallsites ||
      Ret->counters_size() != NrCounters)
    __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
                        "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
                        reinterpret_cast<void *>(Ret), Guid, NrCallsites,
                        NrCounters, Ret->guid(), Ret->callsites_size(),
                        Ret->counters_size());
  onContextEnter(*Ret);
  return Ret;
}
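// Rough sketch of the caller-side protocol this function relies on (an
// illustration inferred from the TLS reads above, not code emitted here): just
// before a call to Callee, the instrumentation is expected to do roughly
//   __llvm_ctx_profile_expected_callee[0] = Callee;
//   __llvm_ctx_profile_callsite[0] = &CallerCtx->subContexts()[CallsiteIdx];
// (CallerCtx / CallsiteIdx are hypothetical names), and the instrumented
// callee then calls __llvm_ctx_profile_get_context in its prologue, which
// consumes - and resets - both slots.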

// This should be called once for a Root. Allocate the first arena, set up the
// first context.
void setupContext(ContextRoot *Root, GUID Guid, uint32_t NrCounters,
                  uint32_t NrCallsites) {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  // Re-check - we got here without having taken the lock.
  if (Root->FirstMemBlock)
    return;
  const auto Needed = ContextNode::getAllocSize(NrCounters, NrCallsites);
  auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
  Root->FirstMemBlock = M;
  Root->CurrentMem = M;
  Root->FirstNode = allocContextNode(M->tryBumpAllocate(Needed), Guid,
                                     NrCounters, NrCallsites);
  AllContextRoots.PushBack(Root);
}

ContextNode *__llvm_ctx_profile_start_context(
    ContextRoot *Root, GUID Guid, uint32_t Counters,
    uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  if (!Root->FirstMemBlock) {
    setupContext(Root, Guid, Counters, Callsites);
  }
  if (Root->Taken.TryLock()) {
    __llvm_ctx_profile_current_context_root = Root;
    onContextEnter(*Root->FirstNode);
    return Root->FirstNode;
  }
  // If this thread couldn't take the lock, return scratch context.
  __llvm_ctx_profile_current_context_root = nullptr;
  return TheScratchContext;
}

void __llvm_ctx_profile_release_context(ContextRoot *Root)
    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  if (__llvm_ctx_profile_current_context_root) {
    __llvm_ctx_profile_current_context_root = nullptr;
    Root->Taken.Unlock();
  }
}
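// Usage sketch (an illustration of how an instrumented root function is
// expected to bracket its body; names like MyRootCtx are hypothetical):
//   auto *Ctx = __llvm_ctx_profile_start_context(&MyRootCtx, MyGuid,
//                                                NumCounters, NumCallsites);
//   ... function body, recording into Ctx's counters / callsites ...
//   __llvm_ctx_profile_release_context(&MyRootCtx);
// If another thread already holds MyRootCtx.Taken, the body records into the
// scratch context instead.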

void __llvm_ctx_profile_start_collection() {
  size_t NrMemUnits = 0;
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
        &Root->Taken);
    for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
      ++NrMemUnits;

    resetContextNode(*Root->FirstNode);
  }
  __sanitizer::Printf("[ctxprof] Initial NrMemUnits: %zu \n", NrMemUnits);
}

bool __llvm_ctx_profile_fetch(void *Data,
                              bool (*Writer)(void *W, const ContextNode &)) {
  assert(Writer);
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);

  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
        &Root->Taken);
    if (!validate(Root)) {
      __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
      return false;
    }
    if (!Writer(Data, *Root->FirstNode))
      return false;
  }
  return true;
}
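// Sketch of a Writer callback (a hypothetical example, not the real
// serializer):
//   bool DumpRoot(void *State, const ContextNode &Root) {
//     // walk Root and, via subContexts(), its tree, appending to *State;
//     // return false to abort the fetch.
//     return true;
//   }
//   ... __llvm_ctx_profile_fetch(&MyState, DumpRoot);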

void __llvm_ctx_profile_free() {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
    for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
      auto *C = A;
      A = A->next();
      __sanitizer::InternalFree(C);
    }
  AllContextRoots.Reset();
}