1 | /*===- CtxInstrProfiling.h- Contextual instrumentation-based PGO ---------===*\ |
2 | |* |
3 | |* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | |* See https://llvm.org/LICENSE.txt for license information. |
5 | |* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | |* |
7 | \*===----------------------------------------------------------------------===*/ |
8 | |
9 | #ifndef CTX_PROFILE_CTXINSTRPROFILING_H_ |
10 | #define CTX_PROFILE_CTXINSTRPROFILING_H_ |
11 | |
12 | #include "CtxInstrContextNode.h" |
13 | #include "sanitizer_common/sanitizer_mutex.h" |
14 | #include <sanitizer/common_interface_defs.h> |
15 | |
16 | using namespace llvm::ctx_profile; |
17 | |
18 | // Forward-declare for the one unittest checking Arena construction zeroes out |
19 | // its allocatable space. |
20 | class ArenaTest_ZeroInit_Test; |
21 | namespace __ctx_profile { |
22 | |
23 | static constexpr size_t ExpectedAlignment = 8; |
24 | // We really depend on this, see further below. We currently support x86_64. |
25 | // When we want to support other archs, we need to trace the places Alignment is |
26 | // used and adjust accordingly. |
27 | static_assert(sizeof(void *) == ExpectedAlignment); |
28 | |
29 | /// Arena (bump allocator) forming a linked list. Intentionally not thread safe. |
30 | /// Allocation and de-allocation happen using sanitizer APIs. We make that |
31 | /// explicit. |
32 | class Arena final { |
33 | public: |
34 | // When allocating a new Arena, optionally specify an existing one to append |
35 | // to, assumed to be the last in the Arena list. We only need to support |
36 | // appending to the arena list. |
37 | static Arena *allocateNewArena(size_t Size, Arena *Prev = nullptr); |
38 | static void freeArenaList(Arena *&A); |
39 | |
40 | uint64_t size() const { return Size; } |
41 | |
42 | // Allocate S bytes or return nullptr if we don't have that many available. |
43 | char *tryBumpAllocate(size_t S) { |
44 | if (Pos + S > Size) |
45 | return nullptr; |
46 | Pos += S; |
47 | return start() + (Pos - S); |
48 | } |
49 | |
50 | Arena *next() const { return Next; } |
51 | |
52 | // the beginning of allocatable memory. |
53 | const char *start() const { return const_cast<Arena *>(this)->start(); } |
54 | const char *pos() const { return start() + Pos; } |
55 | |
56 | private: |
57 | friend class ::ArenaTest_ZeroInit_Test; |
58 | explicit Arena(uint32_t Size); |
59 | ~Arena() = delete; |
60 | |
61 | char *start() { return reinterpret_cast<char *>(&this[1]); } |
62 | |
63 | Arena *Next = nullptr; |
64 | uint64_t Pos = 0; |
65 | const uint64_t Size; |
66 | }; |
67 | |
68 | // The memory available for allocation follows the Arena header, and we expect |
69 | // it to be thus aligned. |
70 | static_assert(alignof(Arena) == ExpectedAlignment); |
71 | |
72 | // Verify maintenance to ContextNode doesn't change this invariant, which makes |
73 | // sure the inlined vectors are appropriately aligned. |
74 | static_assert(alignof(ContextNode) == ExpectedAlignment); |
75 | |
76 | /// ContextRoots are allocated by LLVM for entrypoints. LLVM is only concerned |
77 | /// with allocating and zero-initializing the global value (as in, GlobalValue) |
78 | /// for it. |
struct ContextRoot {
  // Root of the context tree for this entrypoint. NOTE(review): presumably set
  // lazily by the runtime on first entry — confirm against the .cpp side.
  ContextNode *FirstNode = nullptr;
  // Head of the arena list backing this tree's allocations.
  Arena *FirstMemBlock = nullptr;
  // Arena currently used for bump allocation (tail of the list).
  Arena *CurrentMem = nullptr;
  // This is init-ed by the static zero initializer in LLVM.
  // Taken is used to ensure only one thread traverses the contextual graph -
  // either to read it or to write it. On server side, the same entrypoint will
  // be entered by numerous threads, but over time, the profile aggregated by
  // collecting sequentially on one thread at a time is expected to converge to
  // the aggregate profile that may have been observable on all the threads.
  // Note that this is node-by-node aggregation, i.e. summing counters of nodes
  // at the same position in the graph, not flattening.
  // Threads that cannot lock Taken (fail TryLock) are given a "scratch context"
  // - a buffer they can clobber, safely from a memory access perspective.
  //
  // Note about "scratch"-ness: we currently ignore the data written in them
  // (which is anyway clobbered). The design allows for that not to be the case
  // - because "scratch"-ness is first and foremost about not trying to build
  // subcontexts, and is captured by tainting the pointer value (pointer to the
  // memory treated as context), but right now, we drop that info.
  //
  // We could consider relaxing the requirement of more than one thread
  // entering by holding a few context trees per entrypoint and then aggregating
  // them (as explained above) at the end of the profile collection - it's a
  // tradeoff between collection time and memory use: higher precision can be
  // obtained with either less concurrent collections but more collection time,
  // or with more concurrent collections (==more memory) and less collection
  // time. Note that concurrent collection does happen for different
  // entrypoints, regardless.
  ::__sanitizer::StaticSpinMutex Taken;

  // If (unlikely) StaticSpinMutex internals change, we need to modify the LLVM
  // instrumentation lowering side because it is responsible for allocating and
  // zero-initializing ContextRoots.
  static_assert(sizeof(Taken) == 1);
};
115 | |
116 | /// This API is exposed for testing. See the APIs below about the contract with |
117 | /// LLVM. |
/// This API is exposed for testing. A context pointer is "scratch" when its
/// least significant bit is tainted (set). See the APIs below about the
/// contract with LLVM.
inline bool isScratch(const void *Ctx) {
  const auto Bits = reinterpret_cast<uint64_t>(Ctx);
  return (Bits & 1) != 0;
}
121 | |
122 | } // namespace __ctx_profile |
123 | |
extern "C" {

// LLVM fills these in when lowering a llvm.instrprof.callsite intrinsic.
// position 0 is used when the current context isn't scratch, 1 when it is. They
// are volatile because of signal handlers - we mean to specifically control
// when the data is loaded.
//
/// TLS where LLVM stores the pointer of the called value, as part of lowering a
/// llvm.instrprof.callsite
extern __thread void *volatile __llvm_ctx_profile_expected_callee[2];
/// TLS where LLVM stores the pointer inside a caller's subcontexts vector that
/// corresponds to the callsite being lowered.
extern __thread ContextNode **volatile __llvm_ctx_profile_callsite[2];

// __llvm_ctx_profile_current_context_root is exposed for unit testing,
// otherwise it's only used internally by compiler-rt/ctx_profile.
extern __thread __ctx_profile::ContextRoot
    *volatile __llvm_ctx_profile_current_context_root;

/// called by LLVM in the entry BB of an "entry point" function. The returned
/// pointer may be "tainted" - its LSB set to 1 - to indicate it's scratch.
ContextNode *__llvm_ctx_profile_start_context(__ctx_profile::ContextRoot *Root,
                                              GUID Guid, uint32_t Counters,
                                              uint32_t Callsites);

/// paired with __llvm_ctx_profile_start_context, and called at the exit of the
/// entry point function.
void __llvm_ctx_profile_release_context(__ctx_profile::ContextRoot *Root);

/// called for any other function than entry points, in the entry BB of such
/// function. Same consideration about LSB of returned value as .._start_context
ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
                                            uint32_t NrCounters,
                                            uint32_t NrCallsites);

/// Prepares for collection. Currently this resets counter values but preserves
/// internal context tree structure.
void __llvm_ctx_profile_start_collection();

/// Completely free allocated memory.
void __llvm_ctx_profile_free();

/// Used to obtain the profile. The Writer is called for each root ContextNode,
/// with the ContextRoot::Taken taken. The Writer is responsible for traversing
/// the structure underneath.
/// The Writer's first parameter plays the role of closure for Writer, and is
/// what the caller of __llvm_ctx_profile_fetch passes as the Data parameter.
/// The second parameter is the root of a context tree.
/// Returns whether the fetch succeeded. NOTE(review): the failure conditions
/// are defined by the implementation (not visible here) - confirm in the .cpp.
bool __llvm_ctx_profile_fetch(void *Data,
                              bool (*Writer)(void *, const ContextNode &));
}
175 | #endif // CTX_PROFILE_CTXINSTRPROFILING_H_ |
176 | |