//===- CtxInstrProfiling.cpp - contextual instrumented PGO ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CtxInstrProfiling.h"
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_libc.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include "sanitizer_common/sanitizer_placement_new.h"
#include "sanitizer_common/sanitizer_thread_safety.h"
#include "sanitizer_common/sanitizer_vector.h"

#include <assert.h>

using namespace __ctx_profile;

namespace {
// Keep track of all the context roots we actually saw, so we can then traverse
// them when the user asks for the profile in __llvm_ctx_profile_fetch
__sanitizer::SpinMutex AllContextsMutex;
SANITIZER_GUARDED_BY(AllContextsMutex)
__sanitizer::Vector<ContextRoot *> AllContextRoots;

// utility to taint a pointer by setting the LSB. There is an assumption
// throughout that the addresses of contexts are even (really, they should be
// align(8), but "even"-ness is the minimum assumption)
// "scratch contexts" are buffers that we return in certain cases - they are
// large enough to allow for memory safe counter access, but they don't link
// subcontexts below them (the runtime recognizes them and enforces that)
ContextNode *markAsScratch(const ContextNode *Ctx) {
  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
}
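// For illustration (a sketch, not part of this file): with 8-aligned context
// addresses, a context at, say, 0x1000 reads as 0x1001 once marked as scratch.
// The isScratch() check used below (declared in the header) is assumed to just
// test that low bit, roughly:
//   bool isScratch(const void *Ctx) {
//     return reinterpret_cast<uint64_t>(Ctx) & 1;
//   }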

// Used when getting the data from TLS. We don't *really* need to reset, but
// it's a simpler system if we do.
template <typename T> inline T consume(T &V) {
  auto R = V;
  V = {0};
  return R;
}
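// E.g. (sketch): after `auto *V = consume(__llvm_ctx_profile_expected_callee[0]);`
// the TLS slot reads as null again, so a stale expected-callee value can't be
// matched by a later, unrelated __llvm_ctx_profile_get_context call.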

// We allocate at least kBuffSize Arena pages. The scratch buffer is also that
// large.
constexpr size_t kPower = 20;
constexpr size_t kBuffSize = 1 << kPower;

// Highly unlikely we need more than kBuffSize for a context.
size_t getArenaAllocSize(size_t Needed) {
  if (Needed >= kBuffSize)
    return 2 * Needed;
  return kBuffSize;
}
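// For example: with kBuffSize = 1 << 20 (1 MiB), a request for a few hundred
// bytes still gets a 1 MiB arena, while a (very unlikely) 3 MiB request gets a
// 6 MiB arena, leaving headroom for subsequent bump allocations.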

// verify the structural integrity of the context
bool validate(const ContextRoot *Root) {
  // all contexts should be laid out in some arena page. Go over each arena
  // allocated for this Root, and jump over contained contexts based on
  // self-reported sizes.
  __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
               .second)
        return false;
      Pos += Ctx->size();
    }
  }

  // Now traverse the contexts again the same way, but validate that all
  // non-null subcontext addresses appear in the set computed above.
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
        for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
          if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
            return false;

      Pos += Ctx->size();
    }
  }
  return true;
}

inline ContextNode *allocContextNode(char *Place, GUID Guid,
                                     uint32_t NrCounters, uint32_t NrCallsites,
                                     ContextNode *Next = nullptr) {
  assert(reinterpret_cast<uint64_t>(Place) % ExpectedAlignment == 0);
  return new (Place) ContextNode(Guid, NrCounters, NrCallsites, Next);
}

void resetContextNode(ContextNode &Node) {
  // FIXME(mtrofin): this is std::memset, which we can probably use if we
  // drop/reduce the dependency on sanitizer_common.
  for (uint32_t I = 0; I < Node.counters_size(); ++I)
    Node.counters()[I] = 0;
  for (uint32_t I = 0; I < Node.callsites_size(); ++I)
    for (auto *Next = Node.subContexts()[I]; Next; Next = Next->next())
      resetContextNode(*Next);
}

void onContextEnter(ContextNode &Node) { ++Node.counters()[0]; }

} // namespace

// the scratch buffer - what we give when we can't produce a real context (the
// scratch isn't "real" in that it's expected to be clobbered carelessly - we
// don't read it). The other important thing is that the callees from a scratch
// context also get a scratch context.
// Eventually this can be replaced with per-function buffers, à la the typical
// (flat) instrumented FDO buffers. The clobbering aspect won't apply there, but
// the part about determining the nature of the subcontexts does.
__thread char __Buffer[kBuffSize] = {0};

#define TheScratchContext                                                      \
  markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))

// init the TLSes
__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
                                                                 nullptr};
__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};

__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
    nullptr;

Arena::Arena(uint32_t Size) : Size(Size) {
  __sanitizer::internal_memset(start(), 0, Size);
}

// FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
// the dependency on the latter.
Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
  assert(!Prev || Prev->Next == nullptr);
  Arena *NewArena = new (__sanitizer::InternalAlloc(
      Size + sizeof(Arena), /*cache=*/nullptr, /*alignment=*/ExpectedAlignment))
      Arena(Size);
  if (Prev)
    Prev->Next = NewArena;
  return NewArena;
}

void Arena::freeArenaList(Arena *&A) {
  assert(A);
  for (auto *I = A; I != nullptr;) {
    auto *Current = I;
    I = I->Next;
    __sanitizer::InternalFree(Current);
  }
  A = nullptr;
}

// If this is the first time we hit a callsite with this (Guid) particular
// callee, we need to allocate.
ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
                             uint32_t NrCounters, uint32_t NrCallsites) {
  auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites);
  auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
  char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
  if (!AllocPlace) {
    // if we failed to allocate on the current arena, allocate a new arena,
    // and place it on __llvm_ctx_profile_current_context_root->CurrentMem so we
    // find it from now on for other cases when we need to getCallsiteSlow.
    // Note that allocateNewArena will link the allocated memory in the list of
    // Arenas.
    __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
        Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
    AllocPlace = Mem->tryBumpAllocate(AllocSize);
  }
  auto *Ret = allocContextNode(AllocPlace, Guid, NrCounters, NrCallsites,
                               *InsertionPoint);
  *InsertionPoint = Ret;
  return Ret;
}
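// Design observation: the new node is linked at the head of the per-callsite
// list (via the Next parameter above), so for indirect calls the most recently
// first-seen target ends up in front; __llvm_ctx_profile_get_context walks the
// list by GUID either way.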

ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
                                            uint32_t NrCounters,
                                            uint32_t NrCallsites) {
  // fast "out" if we're not even doing contextual collection.
  if (!__llvm_ctx_profile_current_context_root)
    return TheScratchContext;

  // also fast "out" if the caller is scratch. We can tell by looking at the
  // interior pointer into the subcontexts vector that the caller provided: if
  // the caller's context is scratch, so is that interior pointer (all the
  // address calculations use even - or, more precisely, 8-aligned - values,
  // so the taint bit survives).
  auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
  if (!CallsiteContext || isScratch(CallsiteContext))
    return TheScratchContext;

  // if the callee isn't the expected one, return scratch.
  // Signal handler(s) could have been invoked at any point in the execution.
  // Should that have happened, and had it (the handler) been built with
  // instrumentation, its __llvm_ctx_profile_get_context would have failed
  // here. Its sub call graph would have then populated
  // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
  // The normal call graph may be impacted in that, if the signal handler
  // happened somewhere before we read the TLS here, we'd see the TLS reset and
  // we'd also fail here. That would just mean we'd lose counter values for the
  // normal subgraph this time around. That should be very unlikely, but if it
  // happens too frequently, we should be able to detect discrepancies in entry
  // counts (caller-callee). At the moment, the design goes on the assumption
  // that this is infrequent enough that it's not worth doing more for that
  // case.
  auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
  if (ExpectedCallee != Callee)
    return TheScratchContext;

  auto *Callsite = *CallsiteContext;
  // in the case of indirect calls, we will have all seen targets forming a
  // linked list here. Find the one corresponding to this callee.
  while (Callsite && Callsite->guid() != Guid) {
    Callsite = Callsite->next();
  }
  auto *Ret = Callsite ? Callsite
                       : getCallsiteSlow(Guid, CallsiteContext, NrCounters,
                                         NrCallsites);
  if (Ret->callsites_size() != NrCallsites ||
      Ret->counters_size() != NrCounters)
    __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
                        "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
                        reinterpret_cast<void *>(Ret), Guid, NrCallsites,
                        NrCounters, Ret->guid(), Ret->callsites_size(),
                        Ret->counters_size());
  onContextEnter(*Ret);
  return Ret;
}
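// Rough sketch of the caller-side protocol this function relies on (an
// illustration inferred from the TLS reads above, not code emitted here): just
// before a call to Callee, the instrumentation is expected to do roughly
//   __llvm_ctx_profile_expected_callee[0] = Callee;
//   __llvm_ctx_profile_callsite[0] = &CallerCtx->subContexts()[CallsiteIdx];
// (CallerCtx / CallsiteIdx are hypothetical names), and the instrumented
// callee then calls __llvm_ctx_profile_get_context in its prologue, which
// consumes - and resets - both slots.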

// This should be called once for a Root. Allocate the first arena, set up the
// first context.
void setupContext(ContextRoot *Root, GUID Guid, uint32_t NrCounters,
                  uint32_t NrCallsites) {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  // Re-check - we got here without having taken the lock.
  if (Root->FirstMemBlock)
    return;
  const auto Needed = ContextNode::getAllocSize(NrCounters, NrCallsites);
  auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
  Root->FirstMemBlock = M;
  Root->CurrentMem = M;
  Root->FirstNode = allocContextNode(M->tryBumpAllocate(Needed), Guid,
                                     NrCounters, NrCallsites);
  AllContextRoots.PushBack(Root);
}

ContextNode *__llvm_ctx_profile_start_context(
    ContextRoot *Root, GUID Guid, uint32_t Counters,
    uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  if (!Root->FirstMemBlock) {
    setupContext(Root, Guid, Counters, Callsites);
  }
  if (Root->Taken.TryLock()) {
    __llvm_ctx_profile_current_context_root = Root;
    onContextEnter(*Root->FirstNode);
    return Root->FirstNode;
  }
  // If this thread couldn't take the lock, return scratch context.
  __llvm_ctx_profile_current_context_root = nullptr;
  return TheScratchContext;
}

void __llvm_ctx_profile_release_context(ContextRoot *Root)
    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  if (__llvm_ctx_profile_current_context_root) {
    __llvm_ctx_profile_current_context_root = nullptr;
    Root->Taken.Unlock();
  }
}
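// Usage sketch (an illustration of how an instrumented root function is
// expected to bracket its body; names like MyRootCtx are hypothetical):
//   auto *Ctx = __llvm_ctx_profile_start_context(&MyRootCtx, MyGuid,
//                                                NumCounters, NumCallsites);
//   ... function body, recording into Ctx's counters / callsites ...
//   __llvm_ctx_profile_release_context(&MyRootCtx);
// If another thread already holds MyRootCtx.Taken, the body records into the
// scratch context instead.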

void __llvm_ctx_profile_start_collection() {
  size_t NrMemUnits = 0;
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
        &Root->Taken);
    for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
      ++NrMemUnits;

    resetContextNode(*Root->FirstNode);
  }
  __sanitizer::Printf("[ctxprof] Initial NrMemUnits: %zu \n", NrMemUnits);
}

bool __llvm_ctx_profile_fetch(void *Data,
                              bool (*Writer)(void *W, const ContextNode &)) {
  assert(Writer);
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);

  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
        &Root->Taken);
    if (!validate(Root)) {
      __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
      return false;
    }
    if (!Writer(Data, *Root->FirstNode))
      return false;
  }
  return true;
}
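// Sketch of a Writer callback (a hypothetical example, not the real
// serializer):
//   bool DumpRoot(void *State, const ContextNode &Root) {
//     // walk Root and, via subContexts(), its tree, appending to *State;
//     // return false to abort the fetch.
//     return true;
//   }
//   ... __llvm_ctx_profile_fetch(&MyState, DumpRoot);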

void __llvm_ctx_profile_free() {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
    for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
      auto *C = A;
      A = A->next();
      __sanitizer::InternalFree(C);
    }
  AllContextRoots.Reset();
}