/*===- CtxInstrProfiling.h- Contextual instrumentation-based PGO ---------===*\
|*
|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|* See https://llvm.org/LICENSE.txt for license information.
|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|*
\*===----------------------------------------------------------------------===*/

#ifndef CTX_PROFILE_CTXINSTRPROFILING_H_
#define CTX_PROFILE_CTXINSTRPROFILING_H_

#include "CtxInstrContextNode.h"
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include <sanitizer/common_interface_defs.h>

using namespace llvm::ctx_profile;

// Forward-declare for the one unittest checking Arena construction zeroes out
// its allocatable space.
class ArenaTest_ZeroInit_Test;
namespace __ctx_profile {

static constexpr size_t ExpectedAlignment = 8;
// We really depend on this, see further below. We currently support x86_64.
// When we want to support other archs, we need to trace the places
// ExpectedAlignment is used and adjust accordingly.
static_assert(sizeof(void *) == ExpectedAlignment);

/// Arena (bump allocator) forming a linked list. Intentionally not thread safe.
/// Allocation and de-allocation happen using sanitizer APIs. We make that
/// explicit.
class Arena final {
public:
  // When allocating a new Arena, optionally specify an existing one to append
  // to, assumed to be the last in the Arena list. We only need to support
  // appending to the arena list.
  static Arena *allocateNewArena(size_t Size, Arena *Prev = nullptr);
  static void freeArenaList(Arena *&A);

  uint64_t size() const { return Size; }

  // Allocate S bytes or return nullptr if we don't have that many available.
  char *tryBumpAllocate(size_t S) {
    if (Pos + S > Size)
      return nullptr;
    Pos += S;
    return start() + (Pos - S);
  }

  Arena *next() const { return Next; }

  // The beginning of the allocatable memory.
  const char *start() const { return const_cast<Arena *>(this)->start(); }
  const char *pos() const { return start() + Pos; }

private:
  friend class ::ArenaTest_ZeroInit_Test;
  explicit Arena(uint32_t Size);
  ~Arena() = delete;

  char *start() { return reinterpret_cast<char *>(&this[1]); }

  Arena *Next = nullptr;
  uint64_t Pos = 0;
  const uint64_t Size;
};
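
// Illustrative only - a usage sketch for the Arena list above. The 1 << 16
// size and the 64-byte request are arbitrary; per the unittest referenced
// above, freshly allocated arena space is zero-initialized.
//
//   Arena *First = Arena::allocateNewArena(1 << 16);
//   if (char *Mem = First->tryBumpAllocate(64)) {
//     // 64 zero-initialized bytes are usable at Mem.
//   }
//   // When the current arena can't satisfy a request, append a new one:
//   Arena *Current = Arena::allocateNewArena(1 << 16, /*Prev=*/First);
//   (void)Current;
//   // Eventually release the whole list, starting from its head:
//   Arena::freeArenaList(First);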

// The memory available for allocation follows the Arena header, and we expect
// it to be thus aligned.
static_assert(alignof(Arena) == ExpectedAlignment);

// Verify maintenance to ContextNode doesn't change this invariant, which makes
// sure the inlined vectors are appropriately aligned.
static_assert(alignof(ContextNode) == ExpectedAlignment);

/// ContextRoots hold memory and the start of the contextual profile tree for a
/// root function.
struct ContextRoot {
  ContextNode *FirstNode = nullptr;
  Arena *FirstMemBlock = nullptr;
  Arena *CurrentMem = nullptr;

  // Count the number of entries, regardless of whether we could take the
  // `Taken` mutex.
  ::__sanitizer::atomic_uint64_t TotalEntries = {};

  // Profiles for functions we encounter when collecting a contextual profile,
  // that are not associated with a callsite. This is expected to happen for
  // signal handlers, but it also - problematically - currently happens for
  // call sites generated after profile instrumentation, primarily
  // mem{cpy|move|set}.
  // `Unhandled` serves 2 purposes:
  //   1. identifying such cases (like the memops)
  //   2. collecting a profile for them, which can be at least used as a flat
  //      profile
  ::__sanitizer::DenseMap<GUID, ContextNode *> Unhandled;
  // Keep the unhandled contexts in a list, as we allocate them, as it makes it
  // simpler to send to the writer when the profile is fetched.
  ContextNode *FirstUnhandledCalleeNode = nullptr;

  // Taken is used to ensure only one thread traverses the contextual graph -
  // either to read it or to write it. On the server side, the same entrypoint
  // will be entered by numerous threads, but over time, the profile aggregated
  // by collecting sequentially on one thread at a time is expected to converge
  // to the aggregate profile that would have been observable across all the
  // threads. Note that this is node-by-node aggregation, i.e. summing counters
  // of nodes at the same position in the graph, not flattening.
  // Threads that cannot lock Taken (fail TryLock) are given a "scratch
  // context" - a buffer they can clobber, safely from a memory access
  // perspective.
  //
  // Note about "scratch"-ness: we currently ignore the data written in them
  // (which is anyway clobbered). The design allows for that not to be the case
  // - because "scratch"-ness is first and foremost about not trying to build
  // subcontexts, and is captured by tainting the pointer value (the pointer to
  // the memory treated as context) - but right now, we drop that info.
  //
  // We could relax the one-thread-at-a-time requirement by holding a few
  // context trees per entrypoint and then aggregating them (as explained
  // above) at the end of the profile collection - it's a tradeoff between
  // collection time and memory use: higher precision can be obtained either
  // with fewer concurrent collections but more collection time, or with more
  // concurrent collections (== more memory) and less collection time. Note
  // that concurrent collection does happen for different entrypoints,
  // regardless.
  ::__sanitizer::SpinMutex Taken;
};
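
// Illustrative only - not the actual implementation: the gist of how the
// runtime is expected to use `Taken` and scratch contexts when a root is
// entered. `Root` is hypothetical; the real logic lives in the runtime
// implementation.
//
//   if (Root->Taken.TryLock()) {
//     // This thread owns the tree: counters and subcontexts are updated in
//     // the real ContextNodes, and Taken is released when the root exits.
//   } else {
//     // Another thread (or the profile writer) holds Taken: hand out a
//     // "scratch" context - a clobberable buffer whose pointer is tainted
//     // (LSB set to 1) - so counting stays memory-safe, but no subcontexts
//     // are built underneath it.
//   }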

// This is allocated and zero-initialized by the compiler; the in-place
// initialization serves mostly as self-documentation and for testing.
// The design is influenced by the observation that typically (at least for
// datacenter binaries, which are the motivating target of this profiler) less
// than 10% of the functions in a binary even appear in a profile (of any
// kind).
//
// 1) We could pre-allocate the flat profile storage in the compiler, just like
// flat instrumented profiling does. But that penalizes the static size of the
// binary for little reason.
//
// 2) We could do the above but zero-initialize the buffers (which should place
// them in .bss), and dynamically populate them. This, though, would page in
// more memory upfront at runtime.
//
// The current design trades a bit of overhead the first time a function is
// encountered *for flat profiling* in exchange for avoiding those size
// penalties. An illustrative sketch of the resulting lazy-initialization
// pattern follows the struct.
struct FunctionData {
#define _PTRDECL(T, N) T *N = nullptr;
#define _VOLATILE_PTRDECL(T, N) T *volatile N = nullptr;
#define _MUTEXDECL(N) ::__sanitizer::SpinMutex N;
#define _CONTEXT_PTR ContextRoot *CtxRoot = nullptr;
  CTXPROF_FUNCTION_DATA(_PTRDECL, _CONTEXT_PTR, _VOLATILE_PTRDECL, _MUTEXDECL)
#undef _CONTEXT_PTR
#undef _PTRDECL
#undef _VOLATILE_PTRDECL
#undef _MUTEXDECL

  // Constructor for tests only - since this is expected to be
  // initialized by the compiler.
  FunctionData() = default;
  ContextRoot *getOrAllocateContextRoot();

  // If (unlikely) StaticSpinMutex internals change, we need to modify the LLVM
  // instrumentation lowering side because it is responsible for allocating and
  // zero-initializing ContextRoots.
  static_assert(sizeof(Mutex) == 1);
};
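
// Illustrative only - a sketch of the lazy initialization enabled by the
// volatile pointer + SpinMutex pair above. The member names `FlatCtx` and
// `Mutex`, and the helper `getOrAllocFlat`, are assumptions for illustration;
// the real member list comes from CTXPROF_FUNCTION_DATA and the real logic
// lives in the runtime implementation.
//
//   ContextNode *getOrAllocFlat(FunctionData &D, GUID Guid,
//                               uint32_t NumCounters) {
//     if (ContextNode *Existing = D.FlatCtx)    // fast path: already set up
//       return Existing;
//     __sanitizer::SpinMutexLock L(&D.Mutex);   // first encounter only
//     if (ContextNode *Existing = D.FlatCtx)    // re-check under the lock
//       return Existing;
//     ContextNode *Flat = /* allocate & zero-initialize storage for Guid */;
//     D.FlatCtx = Flat;                         // publish
//     return Flat;
//   }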

/// This API is exposed for testing. See the APIs below about the contract with
/// LLVM.
inline bool isScratch(const void *Ctx) {
  return (reinterpret_cast<uint64_t>(Ctx) & 1);
}

// True if Ctx is either nullptr or not the 0x1 sentinel value.
inline bool canBeRoot(const ContextRoot *Ctx) {
  return reinterpret_cast<uintptr_t>(Ctx) != 1U;
}
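
// Illustrative only - not part of the contract with LLVM: how the LSB
// "tainting" checked by isScratch composes with an ordinary, suitably aligned
// buffer. `Buf` is a hypothetical scratch area.
//
//   alignas(ExpectedAlignment) static char Buf[1 << 12];
//   auto *Scratch = reinterpret_cast<ContextNode *>(
//       reinterpret_cast<uintptr_t>(Buf) | 1);
//   assert(isScratch(Scratch));    // tainted pointer => scratch
//   assert(!isScratch(Buf));       // plain, aligned pointer => not scratch
//   assert(!canBeRoot(reinterpret_cast<ContextRoot *>(1)));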

} // namespace __ctx_profile

extern "C" {

// LLVM fills these in when lowering an llvm.instrprof.callsite intrinsic.
// Position 0 is used when the current context isn't scratch, 1 when it is.
// They are volatile because of signal handlers - we mean to specifically
// control when the data is loaded. See the lowering sketch after
// __llvm_ctx_profile_get_context below.
//
/// TLS where LLVM stores the pointer of the called value, as part of lowering
/// an llvm.instrprof.callsite
extern __thread void *volatile __llvm_ctx_profile_expected_callee[2];
/// TLS where LLVM stores the pointer inside a caller's subcontexts vector that
/// corresponds to the callsite being lowered.
extern __thread ContextNode **volatile __llvm_ctx_profile_callsite[2];

// __llvm_ctx_profile_current_context_root is exposed for unit testing,
// otherwise it's only used internally by compiler-rt/ctx_profile.
extern __thread __ctx_profile::ContextRoot
    *volatile __llvm_ctx_profile_current_context_root;

/// Called by LLVM in the entry BB of an "entry point" function. The returned
/// pointer may be "tainted" - its LSB set to 1 - to indicate it's scratch.
ContextNode *
__llvm_ctx_profile_start_context(__ctx_profile::FunctionData *FData, GUID Guid,
                                 uint32_t Counters, uint32_t Callsites);

/// Paired with __llvm_ctx_profile_start_context, and called at the exit of the
/// entry point function.
void __llvm_ctx_profile_release_context(__ctx_profile::FunctionData *FData);

/// Called, in its entry BB, for any function other than entry points. The same
/// consideration about the LSB of the returned value applies as for
/// ..._start_context.
ContextNode *__llvm_ctx_profile_get_context(__ctx_profile::FunctionData *FData,
                                            void *Callee, GUID Guid,
                                            uint32_t NumCounters,
                                            uint32_t NumCallsites);
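
// Illustrative only - a simplified, C++-level sketch of the code LLVM emits
// around a callsite and in the callee's entry BB (the real contract is IR
// lowering, not C++). `Caller`, `Callee`, `CallerCtx`, `CalleeData`, the GUID
// and the counts are hypothetical, and ContextNode::subContexts() is assumed
// from CtxInstrContextNode.h. Index 0 of the TLS pairs is shown; index 1 is
// used when the caller's own context is scratch.
//
//   // In Caller, just before callsite number I:
//   __llvm_ctx_profile_expected_callee[0] = reinterpret_cast<void *>(&Callee);
//   __llvm_ctx_profile_callsite[0] = &CallerCtx->subContexts()[I];
//   Callee(...);
//
//   // In Callee (not an entry point), in its entry BB:
//   ContextNode *CalleeCtx = __llvm_ctx_profile_get_context(
//       &CalleeData, reinterpret_cast<void *>(&Callee), CalleeGuid,
//       NumCounters, NumCallsites);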

/// Prepares for collection. Currently this resets counter values but preserves
/// internal context tree structure.
void __llvm_ctx_profile_start_collection(unsigned AutodetectDuration = 0);

/// Completely free allocated memory.
void __llvm_ctx_profile_free();

/// Used to obtain the profile. The Writer is called for each root ContextNode,
/// with that root's ContextRoot::Taken mutex held; the Writer is responsible
/// for traversing the structure underneath. The ProfileWriter passed in by the
/// caller acts as the sink for the profile data.
bool __llvm_ctx_profile_fetch(ProfileWriter &);
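
// Illustrative only - a typical collection cycle driven by a tool. Everything
// named here except the __llvm_ctx_profile_* entry points is hypothetical:
// `MyProfileWriter` stands for any concrete implementation of the
// ProfileWriter interface declared in CtxInstrContextNode.h, and
// `runWorkload()` for exercising the instrumented binary.
//
//   __llvm_ctx_profile_start_collection(); // reset counters, keep the trees
//   runWorkload();
//   MyProfileWriter Writer;                 // e.g. serializes to a file
//   if (!__llvm_ctx_profile_fetch(Writer)) {
//     // handle a failed fetch
//   }
//   // Optionally, when done profiling for good:
//   __llvm_ctx_profile_free();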
}
#endif // CTX_PROFILE_CTXINSTRPROFILING_H_
| 231 | |