//===- CtxInstrProfiling.cpp - contextual instrumented PGO ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CtxInstrProfiling.h"
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_libc.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include "sanitizer_common/sanitizer_placement_new.h"
#include "sanitizer_common/sanitizer_thread_safety.h"
#include "sanitizer_common/sanitizer_vector.h"

#include <assert.h>

using namespace __ctx_profile;

namespace {
// Keep track of all the context roots we actually saw, so we can then traverse
// them when the user asks for the profile in __llvm_ctx_profile_fetch.
__sanitizer::SpinMutex AllContextsMutex;
SANITIZER_GUARDED_BY(AllContextsMutex)
__sanitizer::Vector<ContextRoot *> AllContextRoots;

// Utility to taint a pointer by setting the LSB. There is an assumption
// throughout that the addresses of contexts are even (really, they should be
// align(8), but "even"-ness is the minimum assumption).
// "Scratch contexts" are buffers we return in certain cases - they are large
// enough to allow memory-safe counter access, but they don't link subcontexts
// below them (the runtime recognizes them and enforces that).
ContextNode *markAsScratch(const ContextNode *Ctx) {
  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
}
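// Example with assumed addresses: a ContextNode laid out at the 8-aligned
// address 0x1000 is handed out as 0x1001 once tainted. The isScratch check
// used in __llvm_ctx_profile_get_context is the counterpart of this tainting,
// keeping scratch pointers from being mistaken for real contexts.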

// Used when getting the data from TLS. We don't *really* need to reset, but
// it's a simpler system if we do.
template <typename T> inline T consume(T &V) {
  auto R = V;
  V = {0};
  return R;
}
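// For example, the TLS reads in __llvm_ctx_profile_get_context go through
// consume(), so an expected-callee or callsite slot value is observed at most
// once.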

// We allocate at least kBuffSize Arena pages. The scratch buffer is also that
// large.
constexpr size_t kPower = 20;
constexpr size_t kBuffSize = 1 << kPower;

// Highly unlikely we need more than kBuffSize for a context.
size_t getArenaAllocSize(size_t Needed) {
  if (Needed >= kBuffSize)
    return 2 * Needed;
  return kBuffSize;
}
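// Concretely: kBuffSize is 1 << 20 bytes (1MB), so a request for, say, 64KB
// gets a 1MB arena, while an (unlikely) 3MB request gets a 6MB arena.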

// Verify the structural integrity of the context.
bool validate(const ContextRoot *Root) {
  // All contexts should be laid out in some arena page. Go over each arena
  // allocated for this Root, and jump over contained contexts based on
  // self-reported sizes.
  __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
               .second)
        return false;
      Pos += Ctx->size();
    }
  }

  // Now traverse the contexts again the same way, but validate that all
  // non-null subcontext addresses appear in the set computed above.
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
        for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
          if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
            return false;

      Pos += Ctx->size();
    }
  }
  return true;
}

inline ContextNode *allocContextNode(char *Place, GUID Guid,
                                     uint32_t NrCounters, uint32_t NrCallsites,
                                     ContextNode *Next = nullptr) {
  assert(reinterpret_cast<uint64_t>(Place) % ExpectedAlignment == 0);
  return new (Place) ContextNode(Guid, NrCounters, NrCallsites, Next);
}

void resetContextNode(ContextNode &Node) {
  // FIXME(mtrofin): this is std::memset, which we can probably use if we
  // drop/reduce the dependency on sanitizer_common.
  for (uint32_t I = 0; I < Node.counters_size(); ++I)
    Node.counters()[I] = 0;
  for (uint32_t I = 0; I < Node.callsites_size(); ++I)
    for (auto *Next = Node.subContexts()[I]; Next; Next = Next->next())
      resetContextNode(*Next);
}

void onContextEnter(ContextNode &Node) { ++Node.counters()[0]; }
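// Note: counters()[0], bumped above, serves as the context's entry count; the
// runtime increments it whenever a context is entered (see
// __llvm_ctx_profile_start_context and __llvm_ctx_profile_get_context).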

} // namespace

// The scratch buffer - what we give when we can't produce a real context (the
// scratch isn't "real" in that it's expected to be clobbered carelessly - we
// don't read it). The other important thing is that the callees from a scratch
// context also get a scratch context.
// Eventually this can be replaced with per-function buffers, a la the typical
// (flat) instrumented FDO buffers. The clobbering aspect won't apply there, but
// the part about determining the nature of the subcontexts does.
__thread char __Buffer[kBuffSize] = {0};

#define TheScratchContext                                                      \
  markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))

// Init the TLSes.
__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
                                                                 nullptr};
__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};

__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
    nullptr;

Arena::Arena(uint32_t Size) : Size(Size) {
  __sanitizer::internal_memset(start(), 0, Size);
}

// FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
// the dependency on the latter.
Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
  assert(!Prev || Prev->Next == nullptr);
  Arena *NewArena = new (__sanitizer::InternalAlloc(
      Size + sizeof(Arena), /*cache=*/nullptr, /*alignment=*/ExpectedAlignment))
      Arena(Size);
  if (Prev)
    Prev->Next = NewArena;
  return NewArena;
}

void Arena::freeArenaList(Arena *&A) {
  assert(A);
  for (auto *I = A; I != nullptr;) {
    auto *Current = I;
    I = I->Next;
    __sanitizer::InternalFree(Current);
  }
  A = nullptr;
}

// If this is the first time we hit a callsite with this (Guid) particular
// callee, we need to allocate.
ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
                             uint32_t NrCounters, uint32_t NrCallsites) {
  auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites);
  auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
  char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
  if (!AllocPlace) {
    // If we failed to allocate on the current arena, allocate a new arena,
    // and place it on __llvm_ctx_profile_current_context_root->CurrentMem so we
    // find it from now on for other cases when we need to getCallsiteSlow.
    // Note that allocateNewArena will link the allocated memory in the list of
    // Arenas.
    __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
        Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
    AllocPlace = Mem->tryBumpAllocate(AllocSize);
  }
  auto *Ret = allocContextNode(AllocPlace, Guid, NrCounters, NrCallsites,
                               *InsertionPoint);
  *InsertionPoint = Ret;
  return Ret;
}
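// The new node is linked in at the head of the callsite's list of targets: its
// Next is the previous head, and *InsertionPoint now points at it. For
// indirect calls, __llvm_ctx_profile_get_context walks this list by guid() to
// find the node for a particular target.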

ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
                                            uint32_t NrCounters,
                                            uint32_t NrCallsites) {
  // Fast "out" if we're not even doing contextual collection.
  if (!__llvm_ctx_profile_current_context_root)
    return TheScratchContext;

  // Also fast "out" if the caller is scratch. We can tell by looking at the
  // interior pointer into the subcontexts vector that the caller provided; if
  // the caller's context is scratch, so is that interior pointer (all the
  // address calculations use even - more precisely, 8-aligned - values).
  auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
  if (!CallsiteContext || isScratch(CallsiteContext))
    return TheScratchContext;

  // If the callee isn't the expected one, return scratch.
  // Signal handler(s) could have been invoked at any point in the execution.
  // Should that have happened, and had it (the handler) been built with
  // instrumentation, its __llvm_ctx_profile_get_context would have failed here.
  // Its sub call graph would have then populated
  // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
  // The normal call graph may be impacted in that, if the signal handler
  // happened somewhere before we read the TLS here, we'd see the TLS reset and
  // we'd also fail here. That would just mean we would lose counter values for
  // the normal subgraph, this time around. That should be very unlikely, but if
  // it happens too frequently, we should be able to detect discrepancies in
  // entry counts (caller-callee). At the moment, the design goes on the
  // assumption that this is so infrequent, though, that it's not worth doing
  // more for that case.
  auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
  if (ExpectedCallee != Callee)
    return TheScratchContext;

  auto *Callsite = *CallsiteContext;
  // In the case of indirect calls, we will have all seen targets forming a
  // linked list here. Find the one corresponding to this callee.
  while (Callsite && Callsite->guid() != Guid) {
    Callsite = Callsite->next();
  }
  auto *Ret = Callsite ? Callsite
                       : getCallsiteSlow(Guid, CallsiteContext, NrCounters,
                                         NrCallsites);
  if (Ret->callsites_size() != NrCallsites ||
      Ret->counters_size() != NrCounters)
    __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
                        "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
                        reinterpret_cast<void *>(Ret), Guid, NrCallsites,
                        NrCounters, Ret->guid(), Ret->callsites_size(),
                        Ret->counters_size());
  onContextEnter(*Ret);
  return Ret;
}

// This should be called once for a Root. Allocate the first arena, set up the
// first context.
void setupContext(ContextRoot *Root, GUID Guid, uint32_t NrCounters,
                  uint32_t NrCallsites) {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  // Re-check - we got here without having taken the lock.
  if (Root->FirstMemBlock)
    return;
  const auto Needed = ContextNode::getAllocSize(NrCounters, NrCallsites);
  auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
  Root->FirstMemBlock = M;
  Root->CurrentMem = M;
  Root->FirstNode = allocContextNode(M->tryBumpAllocate(Needed), Guid,
                                     NrCounters, NrCallsites);
  AllContextRoots.PushBack(Root);
}
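// Note: the first node's tryBumpAllocate(Needed) draws from a fresh arena
// sized via getArenaAllocSize(Needed), which is always at least Needed bytes
// (and at least kBuffSize).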

ContextNode *__llvm_ctx_profile_start_context(
    ContextRoot *Root, GUID Guid, uint32_t Counters,
    uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  if (!Root->FirstMemBlock) {
    setupContext(Root, Guid, Counters, Callsites);
  }
  if (Root->Taken.TryLock()) {
    __llvm_ctx_profile_current_context_root = Root;
    onContextEnter(*Root->FirstNode);
    return Root->FirstNode;
  }
  // If this thread couldn't take the lock, return scratch context.
  __llvm_ctx_profile_current_context_root = nullptr;
  return TheScratchContext;
}
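// A thread that got the scratch context here also left
// __llvm_ctx_profile_current_context_root null, so all of its callees take the
// "not collecting" fast path in __llvm_ctx_profile_get_context, and the
// matching __llvm_ctx_profile_release_context below is a no-op for it.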

void __llvm_ctx_profile_release_context(ContextRoot *Root)
    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  if (__llvm_ctx_profile_current_context_root) {
    __llvm_ctx_profile_current_context_root = nullptr;
    Root->Taken.Unlock();
  }
}

void __llvm_ctx_profile_start_collection() {
  size_t NrMemUnits = 0;
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
        &Root->Taken);
    for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
      ++NrMemUnits;

    resetContextNode(*Root->FirstNode);
  }
  __sanitizer::Printf("[ctxprof] Initial NrMemUnits: %zu \n", NrMemUnits);
}

bool __llvm_ctx_profile_fetch(void *Data,
                              bool (*Writer)(void *W, const ContextNode &)) {
  assert(Writer);
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);

  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
        &Root->Taken);
    if (!validate(Root)) {
      __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
      return false;
    }
    if (!Writer(Data, *Root->FirstNode))
      return false;
  }
  return true;
}

void __llvm_ctx_profile_free() {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
    for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
      auto *C = A;
      A = A->next();
      __sanitizer::InternalFree(C);
    }
  AllContextRoots.Reset();
}