//===-- xray_profile_collector.cpp -----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
// This implements the interface for the profileCollectorService.
//
//===----------------------------------------------------------------------===//
#include "xray_profile_collector.h"
#include "sanitizer_common/sanitizer_common.h"
#include "xray_allocator.h"
#include "xray_defs.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
#include <cstddef> // for std::byte
#include <memory>
#include <pthread.h>
#include <utility>

namespace __xray {
namespace profileCollectorService {

namespace {

SpinMutex GlobalMutex;
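// Pairs a thread id with aligned storage for that thread's FunctionCallTrie.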
struct ThreadTrie {
  tid_t TId;
  alignas(FunctionCallTrie) std::byte TrieStorage[sizeof(FunctionCallTrie)];
};

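// A single serialized profile block: heap-allocated bytes and their size.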
struct ProfileBuffer {
  void *Data;
  size_t Size;
};

// Current version of the profile format.
constexpr u64 XRayProfilingVersion = 0x20180424;

// Identifier for XRay profiling files 'xrayprof' in hex.
constexpr u64 XRayMagicBytes = 0x7872617970726f66;

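// The file header written at the start of every serialized profile.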
struct XRayProfilingFileHeader {
  const u64 MagicBytes = XRayMagicBytes;
  const u64 Version = XRayProfilingVersion;
  u64 Timestamp = 0; // System time in nanoseconds.
  u64 PID = 0;       // Process ID.
};

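// Header written in front of each per-thread block of serialized records.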
struct BlockHeader {
  u32 BlockSize;
  u32 BlockNum;
  u64 ThreadId;
};
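
// serialize() below accounts for this header as exactly 16 bytes (4-byte
// block size, 4-byte block number, 8-byte thread id); check the layout here.
static_assert(sizeof(BlockHeader) == 16, "BlockHeader must be 16 bytes");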
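// Everything moved in from a thread at exit: its buffer queue, the buffers
// and allocators backing its trie, the trie itself, and the thread id.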
struct ThreadData {
  BufferQueue *BQ;
  FunctionCallTrie::Allocators::Buffers Buffers;
  FunctionCallTrie::Allocators Allocators;
  FunctionCallTrie FCT;
  tid_t TId;
};

using ThreadDataArray = Array<ThreadData>;
using ThreadDataAllocator = ThreadDataArray::AllocatorType;

// We use a separate buffer queue for the backing store for the allocator used
// by the ThreadData array. This lets us host the buffers, allocators, and
// tries associated with a thread by moving the data into the array instead of
// attempting to copy the data to a separately backed set of tries.
alignas(BufferQueue) static std::byte BufferQueueStorage[sizeof(BufferQueue)];
static BufferQueue *BQ = nullptr;
static BufferQueue::Buffer Buffer;
alignas(ThreadDataAllocator) static std::byte
    ThreadDataAllocatorStorage[sizeof(ThreadDataAllocator)];
alignas(ThreadDataArray) static std::byte
    ThreadDataArrayStorage[sizeof(ThreadDataArray)];

static ThreadDataAllocator *TDAllocator = nullptr;
static ThreadDataArray *TDArray = nullptr;

using ProfileBufferArray = Array<ProfileBuffer>;
using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;

// These need to be global aligned storage to avoid dynamic initialization. We
// need these to be aligned to allow us to placement new objects into the
// storage, and have pointers to those objects be appropriately aligned.
alignas(ProfileBufferArray) static std::byte
    ProfileBuffersStorage[sizeof(ProfileBufferArray)];
alignas(ProfileBufferArrayAllocator) static std::byte
    ProfileBufferArrayAllocatorStorage[sizeof(ProfileBufferArrayAllocator)];

static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
static ProfileBufferArray *ProfileBuffers = nullptr;

// Use a global flag to determine whether the collector implementation has been
// initialized.
static atomic_uint8_t CollectorInitialized{0};

} // namespace

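// Takes ownership of a thread's trie together with its allocators and
// buffers. If the collector is not initialized, or the data cannot be
// appended to the global array, the objects are destroyed and their buffers
// released back to the queue instead.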
void post(BufferQueue *Q, FunctionCallTrie &&T,
          FunctionCallTrie::Allocators &&A,
          FunctionCallTrie::Allocators::Buffers &&B,
          tid_t TId) XRAY_NEVER_INSTRUMENT {
  DCHECK_NE(Q, nullptr);

  // Bail out early if the collector has not been initialized.
  if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
    T.~FunctionCallTrie();
    A.~Allocators();
    Q->releaseBuffer(B.NodeBuffer);
    Q->releaseBuffer(B.RootsBuffer);
    Q->releaseBuffer(B.ShadowStackBuffer);
    Q->releaseBuffer(B.NodeIdPairBuffer);
    B.~Buffers();
    return;
  }

  {
    SpinMutexLock Lock(&GlobalMutex);
    DCHECK_NE(TDAllocator, nullptr);
    DCHECK_NE(TDArray, nullptr);

    if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
                               TId) == nullptr) {
      // If we fail to add the data to the array, destroy the objects handed
      // to us.
      T.~FunctionCallTrie();
      A.~Allocators();
      Q->releaseBuffer(B.NodeBuffer);
      Q->releaseBuffer(B.RootsBuffer);
      Q->releaseBuffer(B.ShadowStackBuffer);
      Q->releaseBuffer(B.NodeIdPairBuffer);
      B.~Buffers();
    }
  }
}

// A PathArray represents the function ids comprising a stack trace. In this
// context a path is almost always represented from the leaf function in a
// call stack to a root of the call trie.
using PathArray = Array<int32_t>;

struct ProfileRecord {
  using PathAllocator = typename PathArray::AllocatorType;

  // The Path in this record is the list of function ids from the leaf to the
  // root of the function call stack, as represented in a FunctionCallTrie.
  PathArray Path;
  const FunctionCallTrie::Node *Node;
};

namespace {

using ProfileRecordArray = Array<ProfileRecord>;

// Performs a depth-first traversal from each root of the FunctionCallTrie,
// generating a ProfileRecord (path plus node data) for every node visited.
static void
populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA,
                const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT {
  using StackArray = Array<const FunctionCallTrie::Node *>;
  using StackAllocator = typename StackArray::AllocatorType;
  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
  StackArray DFSStack(StackAlloc);
  for (const auto *R : Trie.getRoots()) {
    DFSStack.Append(R);
    while (!DFSStack.empty()) {
      auto *Node = DFSStack.back();
      DFSStack.trim(1);
      if (Node == nullptr)
        continue;
      auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
      if (Record == nullptr)
        return;
      DCHECK_NE(Record, nullptr);

      // Traverse the Node's parents and as we're doing so, get the FIds in
      // the order they appear.
      for (auto N = Node; N != nullptr; N = N->Parent)
        Record->Path.Append(N->FId);
      DCHECK(!Record->Path.empty());

      for (const auto C : Node->Callees)
        DFSStack.Append(C.NodePtr);
    }
  }
}

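// Copies the block header into Buffer->Data, then for each record writes the
// path's function ids, a 4-byte zero sentinel, and finally the node's call
// count and cumulative local time.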
static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
                             const ProfileRecordArray &ProfileRecords)
    XRAY_NEVER_INSTRUMENT {
  auto NextPtr = static_cast<uint8_t *>(
                     internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
                 sizeof(Header);
  for (const auto &Record : ProfileRecords) {
    // The list of function ids in the path follows:
    for (const auto FId : Record.Path)
      NextPtr =
          static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
          sizeof(FId);

    // Add the sentinel here.
    constexpr int32_t SentinelFId = 0;
    NextPtr = static_cast<uint8_t *>(
                  internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
              sizeof(SentinelFId);

    // Add the node data here.
    NextPtr =
        static_cast<uint8_t *>(internal_memcpy(
            NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) +
        sizeof(Record.Node->CallCount);
    NextPtr = static_cast<uint8_t *>(
                  internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
                                  sizeof(Record.Node->CumulativeLocalTime))) +
              sizeof(Record.Node->CumulativeLocalTime);
  }

  DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size);
}

} // namespace

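// Rebuilds the global ProfileBuffers from the tries collected so far,
// emitting one serialized block per thread, each sized up front from its
// records.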
void serialize() XRAY_NEVER_INSTRUMENT {
  if (!atomic_load(&CollectorInitialized, memory_order_acquire))
    return;

  SpinMutexLock Lock(&GlobalMutex);

  // Clear out the global ProfileBuffers, if it's not empty.
  for (auto &B : *ProfileBuffers)
    deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
  ProfileBuffers->trim(ProfileBuffers->size());

  DCHECK_NE(TDArray, nullptr);
  if (TDArray->empty())
    return;

  // Then repopulate the global ProfileBuffers.
  u32 I = 0;
  auto MaxSize = profilingFlags()->global_allocator_max;
  auto ProfileArena = allocateBuffer(MaxSize);
  if (ProfileArena == nullptr)
    return;

  auto ProfileArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });

  auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max);
  if (PathArena == nullptr)
    return;

  auto PathArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });

  for (const auto &ThreadTrie : *TDArray) {
    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
    ProfileRecordAllocator PRAlloc(ProfileArena,
                                   profilingFlags()->global_allocator_max);
    ProfileRecord::PathAllocator PathAlloc(
        PathArena, profilingFlags()->global_allocator_max);
    ProfileRecordArray ProfileRecords(PRAlloc);

    // First, we want to compute the amount of space we're going to need. We
    // use a local allocator and an __xray::Array<...> to store the
    // intermediary data, computing the size as we go. Then we allocate the
    // contiguous space to contain the thread buffer data.
    if (ThreadTrie.FCT.getRoots().empty())
      continue;

    populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
    DCHECK(!ThreadTrie.FCT.getRoots().empty());
    DCHECK(!ProfileRecords.empty());

    // Go through each record, to compute the sizes.
    //
    // header size = block size (4 bytes)
    //               + block number (4 bytes)
    //               + thread id (8 bytes)
    // record size = path ids (4 bytes * number of ids)
    //               + sentinel (4 bytes)
    //               + call count (8 bytes)
    //               + local time (8 bytes)
    //
    // A record with an N-deep path therefore takes 20 + 4 * N bytes.
    u32 CumulativeSizes = 0;
    for (const auto &Record : ProfileRecords)
      CumulativeSizes += 20 + (4 * Record.Path.size());

    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
    auto B = ProfileBuffers->Append({});
    B->Size = sizeof(Header) + CumulativeSizes;
    B->Data = allocateBuffer(B->Size);
    DCHECK_NE(B->Data, nullptr);
    serializeRecords(B, Header, ProfileRecords);
  }
}

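// Tears down all collector state, then re-initializes the backing storage in
// place so the collector can be reused for another profiling session.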
void reset() XRAY_NEVER_INSTRUMENT {
  atomic_store(&CollectorInitialized, 0, memory_order_release);
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers != nullptr) {
    // Clear out the profile buffers that have been serialized.
    for (auto &B : *ProfileBuffers)
      deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
    ProfileBuffers->trim(ProfileBuffers->size());
    ProfileBuffers = nullptr;
  }

  if (TDArray != nullptr) {
    // Release the resources as required.
    for (auto &TD : *TDArray) {
      TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
    }
    // We don't bother destroying the array here because we've already
    // potentially freed the backing store for the array. Instead we reset the
    // pointer to nullptr and re-use the storage later, placement-new'ing into
    // the storage as-is.
    TDArray = nullptr;
  }

  if (TDAllocator != nullptr) {
    TDAllocator->~Allocator();
    TDAllocator = nullptr;
  }

  if (Buffer.Data != nullptr) {
    BQ->releaseBuffer(Buffer);
  }

  if (BQ == nullptr) {
    bool Success = false;
    new (&BufferQueueStorage)
        BufferQueue(profilingFlags()->global_allocator_max, 1, Success);
    if (!Success)
      return;
    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
  } else {
    BQ->finalize();

    if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
        BufferQueue::ErrorCode::Ok)
      return;
  }

  if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
    return;

  new (&ProfileBufferArrayAllocatorStorage)
      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
      &ProfileBufferArrayAllocatorStorage);

  new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
  ProfileBuffers =
      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);

  new (&ThreadDataAllocatorStorage)
      ThreadDataAllocator(Buffer.Data, Buffer.Size);
  TDAllocator =
      reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage);
  new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator);
  TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage);

  atomic_store(&CollectorInitialized, 1, memory_order_release);
}

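// Iterator-style interface for consuming the serialized profile: passing a
// zeroed XRayBuffer yields the file header, and each subsequent call yields
// the next per-thread block until {nullptr, 0} signals the end.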
XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
    return {nullptr, 0};

  static pthread_once_t Once = PTHREAD_ONCE_INIT;
  alignas(XRayProfilingFileHeader) static std::byte
      FileHeaderStorage[sizeof(XRayProfilingFileHeader)];
  pthread_once(
      &Once, +[]() XRAY_NEVER_INSTRUMENT {
        new (&FileHeaderStorage) XRayProfilingFileHeader{};
      });

  if (UNLIKELY(B.Data == nullptr)) {
    // The first buffer should always contain the file header information.
    auto &FileHeader =
        *reinterpret_cast<XRayProfilingFileHeader *>(&FileHeaderStorage);
    FileHeader.Timestamp = NanoTime();
    FileHeader.PID = internal_getpid();
    return {&FileHeaderStorage, sizeof(XRayProfilingFileHeader)};
  }

  if (UNLIKELY(B.Data == &FileHeaderStorage))
    return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};

  BlockHeader Header;
  internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
  auto NextBlock = Header.BlockNum + 1;
  if (NextBlock < ProfileBuffers->size())
    return {(*ProfileBuffers)[NextBlock].Data,
            (*ProfileBuffers)[NextBlock].Size};
  return {nullptr, 0};
}

} // namespace profileCollectorService
} // namespace __xray