//===- CtxInstrProfiling.cpp - contextual instrumented PGO ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CtxInstrProfiling.h"
#include "RootAutoDetector.h"
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_atomic.h"
#include "sanitizer_common/sanitizer_atomic_clang.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_libc.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include "sanitizer_common/sanitizer_placement_new.h"
#include "sanitizer_common/sanitizer_thread_safety.h"
#include "sanitizer_common/sanitizer_vector.h"

#include <assert.h>

using namespace __ctx_profile;

namespace {
// Keep track of all the context roots we actually saw, so we can then traverse
// them when the user asks for the profile in __llvm_ctx_profile_fetch
__sanitizer::SpinMutex AllContextsMutex;
SANITIZER_GUARDED_BY(AllContextsMutex)
__sanitizer::Vector<ContextRoot *> AllContextRoots;

__sanitizer::atomic_uintptr_t AllFunctionsData = {};

// Keep all the functions for which we collect a flat profile in a linked list.
__sanitizer::SpinMutex FlatCtxArenaMutex;
SANITIZER_GUARDED_BY(FlatCtxArenaMutex)
Arena *FlatCtxArenaHead = nullptr;
SANITIZER_GUARDED_BY(FlatCtxArenaMutex)
Arena *FlatCtxArena = nullptr;
// Set to true when we enter a root, and false when we exit - regardless of
// whether this thread collects a contextual profile for that root.
__thread bool IsUnderContext = false;
__sanitizer::atomic_uint8_t ProfilingStarted = {};

__sanitizer::atomic_uintptr_t RootDetector = {};
RootAutoDetector *getRootDetector() {
  return reinterpret_cast<RootAutoDetector *>(
      __sanitizer::atomic_load_relaxed(&RootDetector));
}
// Utility to taint a pointer by setting the LSB. There is an assumption
// throughout that the addresses of contexts are even (really, they should be
// 8-byte aligned, but "even"-ness is the minimum assumption).
// "Scratch contexts" are buffers that we return in certain cases - they are
// large enough to allow for memory-safe counter access, but they don't link
// subcontexts below them (the runtime recognizes them and enforces that).
ContextNode *markAsScratch(const ContextNode *Ctx) {
  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
}
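// Note: the paired isScratch() check recognizes such pointers by testing this
// LSB.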

// Used when getting the data from TLS. We don't *really* need to reset, but
// it's a simpler system if we do.
template <typename T> inline T consume(T &V) {
  auto R = V;
  V = {0};
  return R;
}

// We allocate Arenas of at least kBuffSize bytes (1 MB). The scratch buffer is
// also that large.
constexpr size_t kPower = 20;
constexpr size_t kBuffSize = 1 << kPower;

// Highly unlikely we need more than kBuffSize for a context.
size_t getArenaAllocSize(size_t Needed) {
  if (Needed >= kBuffSize)
    return 2 * Needed;
  return kBuffSize;
}

// verify the structural integrity of the context
bool validate(const ContextRoot *Root) {
  // all contexts should be laid out in some arena page. Go over each arena
  // allocated for this Root, and jump over contained contexts based on
  // self-reported sizes.
  __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
               .second)
        return false;
      Pos += Ctx->size();
    }
  }

  // Now traverse the contexts again the same way, but validate all non-null
  // subcontext addresses appear in the set computed above.
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
        for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
          if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
            return false;

      Pos += Ctx->size();
    }
  }
  return true;
}

inline ContextNode *allocContextNode(char *Place, GUID Guid,
                                     uint32_t NumCounters,
                                     uint32_t NumCallsites,
                                     ContextNode *Next = nullptr) {
  assert(reinterpret_cast<uint64_t>(Place) % ExpectedAlignment == 0);
  return new (Place) ContextNode(Guid, NumCounters, NumCallsites, Next);
}

void resetContextNode(ContextNode &Node) {
  // FIXME(mtrofin): this is std::memset, which we can probably use if we
  // drop/reduce the dependency on sanitizer_common.
  for (uint32_t I = 0; I < Node.counters_size(); ++I)
    Node.counters()[I] = 0;
  for (uint32_t I = 0; I < Node.callsites_size(); ++I)
    for (auto *Next = Node.subContexts()[I]; Next; Next = Next->next())
      resetContextNode(*Next);
}

ContextNode *onContextEnter(ContextNode &Node) {
  ++Node.counters()[0];
  return &Node;
}

} // namespace

// the scratch buffer - what we give when we can't produce a real context (the
// scratch isn't "real" in that it's expected to be clobbered carelessly - we
// don't read it). The other important thing is that the callees from a scratch
// context also get a scratch context.
// Eventually this can be replaced with per-function buffers, a la the typical
// (flat) instrumented FDO buffers. The clobbering aspect won't apply there, but
// the part about determining the nature of the subcontexts does.
__thread char __Buffer[kBuffSize] = {0};

#define TheScratchContext                                                      \
  markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))

// init the TLSes
__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
                                                                  nullptr};
__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};
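// Two slots are provided so that a call graph entered from a signal handler
// can use index 1 without clobbering index 0 (see the discussion in
// __llvm_ctx_profile_get_context below).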

__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
    nullptr;

Arena::Arena(uint32_t Size) : Size(Size) {
  __sanitizer::internal_memset(start(), 0, Size);
}

// FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
// the dependency on the latter.
Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
  assert(!Prev || Prev->Next == nullptr);
  Arena *NewArena = new (__sanitizer::InternalAlloc(
      Size + sizeof(Arena), /*cache=*/nullptr, /*alignment=*/ExpectedAlignment))
      Arena(Size);
  if (Prev)
    Prev->Next = NewArena;
  return NewArena;
}

void Arena::freeArenaList(Arena *&A) {
  assert(A);
  for (auto *I = A; I != nullptr;) {
    auto *Current = I;
    I = I->Next;
    __sanitizer::InternalFree(Current);
  }
  A = nullptr;
}

// If this is the first time we hit a callsite with this particular callee
// (Guid), we need to allocate.
ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
                             uint32_t NumCounters, uint32_t NumCallsites) {
  auto AllocSize = ContextNode::getAllocSize(NumCounters, NumCallsites);
  auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
  char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
  if (!AllocPlace) {
    // If we failed to allocate on the current arena, allocate a new arena and
    // place it on __llvm_ctx_profile_current_context_root->CurrentMem so we
    // find it from now on for other cases when we need to getCallsiteSlow.
    // Note that allocateNewArena will link the allocated memory in the list of
    // Arenas.
    __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
        Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
    AllocPlace = Mem->tryBumpAllocate(AllocSize);
  }
  auto *Ret = allocContextNode(AllocPlace, Guid, NumCounters, NumCallsites,
                               *InsertionPoint);
  *InsertionPoint = Ret;
  return Ret;
}

ContextNode *getFlatProfile(FunctionData &Data, void *Callee, GUID Guid,
                            uint32_t NumCounters) {
  if (ContextNode *Existing = Data.FlatCtx)
    return Existing;
  {
    // We could instead try to take the lock and, if that fails, return
    // TheScratchContext. But that could leave message pump loops more sparsely
    // profiled than everything else. Maybe that doesn't matter, and we can
    // optimize this later.
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> L(&Data.Mutex);
    if (ContextNode *Existing = Data.FlatCtx)
      return Existing;

    auto NeededSize = ContextNode::getAllocSize(NumCounters, 0);
    char *AllocBuff = nullptr;
    {
      __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> FL(
          &FlatCtxArenaMutex);
      if (FlatCtxArena)
        AllocBuff = FlatCtxArena->tryBumpAllocate(NeededSize);
      if (!AllocBuff) {
        FlatCtxArena = Arena::allocateNewArena(getArenaAllocSize(NeededSize),
                                               FlatCtxArena);
        AllocBuff = FlatCtxArena->tryBumpAllocate(NeededSize);
      }
      if (!FlatCtxArenaHead)
        FlatCtxArenaHead = FlatCtxArena;
    }
    auto *Ret = allocContextNode(AllocBuff, Guid, NumCounters, 0);
    Data.FlatCtx = Ret;

    Data.EntryAddress = Callee;
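    // Publish the new FunctionData by prepending it to the global
    // AllFunctionsData list. On CAS failure, Data.Next is refreshed with the
    // current head, so we simply retry until the prepend succeeds.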
    Data.Next = reinterpret_cast<FunctionData *>(
        __sanitizer::atomic_load_relaxed(&AllFunctionsData));
    while (!__sanitizer::atomic_compare_exchange_strong(
        &AllFunctionsData, reinterpret_cast<uintptr_t *>(&Data.Next),
        reinterpret_cast<uintptr_t>(&Data),
        __sanitizer::memory_order_release)) {
    }
  }

  return Data.FlatCtx;
}

// This should be called once for a Root. Allocate the first arena, set up the
// first context.
void setupContext(ContextRoot *Root, GUID Guid, uint32_t NumCounters,
                  uint32_t NumCallsites) {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  // Re-check - we got here without having taken the lock.
  if (Root->FirstMemBlock)
    return;
  const auto Needed = ContextNode::getAllocSize(NumCounters, NumCallsites);
  auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
  Root->FirstMemBlock = M;
  Root->CurrentMem = M;
  Root->FirstNode = allocContextNode(M->tryBumpAllocate(Needed), Guid,
                                     NumCounters, NumCallsites);
  AllContextRoots.PushBack(Root);
}

ContextRoot *FunctionData::getOrAllocateContextRoot() {
  auto *Root = CtxRoot;
  if (!canBeRoot(Root))
    return Root;
  if (Root)
    return Root;
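  // Slow path: take the lock and re-check, so that only one thread allocates
  // the ContextRoot for this function.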
  __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> L(&Mutex);
  Root = CtxRoot;
  if (!Root) {
    Root = new (__sanitizer::InternalAlloc(sizeof(ContextRoot))) ContextRoot();
    CtxRoot = Root;
  }

  assert(Root);
  return Root;
}

ContextNode *tryStartContextGivenRoot(ContextRoot *Root, GUID Guid,
                                      uint32_t Counters, uint32_t Callsites)
    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  IsUnderContext = true;
  __sanitizer::atomic_fetch_add(&Root->TotalEntries, 1,
                                __sanitizer::memory_order_relaxed);
  if (!Root->FirstMemBlock) {
    setupContext(Root, Guid, Counters, Callsites);
  }
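  // Only one thread at a time collects the contextual profile for a given
  // root - the one that manages to take `Taken`.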
  if (Root->Taken.TryLock()) {
    __llvm_ctx_profile_current_context_root = Root;
    onContextEnter(*Root->FirstNode);
    return Root->FirstNode;
  }
  // If this thread couldn't take the lock, return scratch context.
  __llvm_ctx_profile_current_context_root = nullptr;
  return TheScratchContext;
}

ContextNode *getUnhandledContext(FunctionData &Data, void *Callee, GUID Guid,
                                 uint32_t NumCounters, uint32_t NumCallsites,
                                 ContextRoot *CtxRoot) {

  // 1) if we are currently collecting a contextual profile, fetch a ContextNode
  // in the `Unhandled` set. We want to do this regardless of `ProfilingStarted`
  // to (hopefully) offset the penalty of creating these contexts to before
  // profiling.
  //
  // 2) if we are under a root (regardless of whether this thread is collecting
  // a contextual profile for that root), do not collect a flat profile. We want
  // to keep flat profiles only for activations that can't happen under a root,
  // to avoid confusing profiles. We can, for example, combine flattened and
  // flat profiles meaningfully, as we wouldn't double-count anything.
  //
  // 3) to avoid lengthy startup, don't bother with flat profiles until the
  // profiling has started. We would reset them anyway when profiling starts.
  // HOWEVER. This does lose profiling for message pumps: those functions are
  // entered once and never exit. They should be assumed to be entered before
  // profiling starts - because profiling should start after the server is up
  // and running (which is equivalent to "message pumps are set up").
  if (!CtxRoot) {
    if (auto *RAD = getRootDetector())
      RAD->sample();
    else if (auto *CR = Data.CtxRoot) {
      if (canBeRoot(CR))
        return tryStartContextGivenRoot(CR, Guid, NumCounters, NumCallsites);
    }
    if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
      return TheScratchContext;
    else
      return markAsScratch(
          onContextEnter(*getFlatProfile(Data, Callee, Guid, NumCounters)));
  }
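  // We are under a root, but this activation couldn't be attributed to a
  // specific callsite of an instrumented caller. Record it in the root's
  // `Unhandled` set (one node per GUID) and return it marked as scratch so
  // callees don't link subcontexts under it.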
  auto [Iter, Ins] = CtxRoot->Unhandled.insert({Guid, nullptr});
  if (Ins)
    Iter->second = getCallsiteSlow(Guid, &CtxRoot->FirstUnhandledCalleeNode,
                                   NumCounters, 0);
  return markAsScratch(onContextEnter(*Iter->second));
}

ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
                                            GUID Guid, uint32_t NumCounters,
                                            uint32_t NumCallsites) {
  auto *CtxRoot = __llvm_ctx_profile_current_context_root;
  // fast "out" if we're not even doing contextual collection.
  if (!CtxRoot)
    return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
                               nullptr);

  // also fast "out" if the caller is scratch. We can see if it's scratch by
  // looking at the interior pointer into the subcontexts vector that the caller
  // provided, which, if the context is scratch, is also scratch (because all
  // the address calculations use even values - or, more precisely, values
  // aligned to 8 bytes).
  auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
  if (!CallsiteContext || isScratch(CallsiteContext))
    return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
                               CtxRoot);

  // if the callee isn't the expected one, return scratch.
  // Signal handler(s) could have been invoked at any point in the execution.
  // Should that have happened, and had it (the handler) been built with
  // instrumentation, its __llvm_ctx_profile_get_context would have failed here.
  // Its sub call graph would have then populated
  // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
  // The normal call graph may be impacted in that, if the signal handler
  // happened somewhere before we read the TLS here, we'd see the TLS reset and
  // we'd also fail here. That would just mean we would lose counter values for
  // the normal subgraph, this time around. That should be very unlikely, but if
  // it happens too frequently, we should be able to detect discrepancies in
  // entry counts (caller-callee). At the moment, the design goes on the
  // assumption that this is so infrequent, though, that it's not worth doing
  // more for that case.
  auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
  if (ExpectedCallee != Callee)
    return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
                               CtxRoot);

  auto *Callsite = *CallsiteContext;
  // In the case of indirect calls, all the targets seen so far form a linked
  // list here. Find the one corresponding to this callee.
  while (Callsite && Callsite->guid() != Guid) {
    Callsite = Callsite->next();
  }
  auto *Ret = Callsite ? Callsite
                       : getCallsiteSlow(Guid, CallsiteContext, NumCounters,
                                         NumCallsites);
  if (Ret->callsites_size() != NumCallsites ||
      Ret->counters_size() != NumCounters)
    __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
                        "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
                        reinterpret_cast<void *>(Ret), Guid, NumCallsites,
                        NumCounters, Ret->guid(), Ret->callsites_size(),
                        Ret->counters_size());
  onContextEnter(*Ret);
  return Ret;
}

ContextNode *__llvm_ctx_profile_start_context(FunctionData *FData, GUID Guid,
                                              uint32_t Counters,
                                              uint32_t Callsites) {
  auto *Root = FData->getOrAllocateContextRoot();
  assert(canBeRoot(Root));
  return tryStartContextGivenRoot(Root, Guid, Counters, Callsites);
}

void __llvm_ctx_profile_release_context(FunctionData *FData)
    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  const auto *CurrentRoot = __llvm_ctx_profile_current_context_root;
  auto *CR = FData->CtxRoot;
  if (!CurrentRoot || CR != CurrentRoot)
    return;
  IsUnderContext = false;
  assert(CR && canBeRoot(CR));
  __llvm_ctx_profile_current_context_root = nullptr;
  CR->Taken.Unlock();
}

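// Reset any counters accumulated so far, optionally kick off the root
// auto-detector, then flag profiling as started.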
void __llvm_ctx_profile_start_collection(unsigned AutodetectDuration) {
  size_t NumMemUnits = 0;
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
        &Root->Taken);
    for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
      ++NumMemUnits;

    resetContextNode(*Root->FirstNode);
    if (Root->FirstUnhandledCalleeNode)
      resetContextNode(*Root->FirstUnhandledCalleeNode);
    __sanitizer::atomic_store_relaxed(&Root->TotalEntries, 0);
  }
  if (AutodetectDuration) {
    // We leak RD intentionally. Knowing when to free it is tricky: there's a
    // race condition with functions observing `RootDetector` as non-null.
    // This can be addressed, but the alternatives have some added complexity
    // and it's not (yet) worth it.
    auto *RD = new (__sanitizer::InternalAlloc(sizeof(RootAutoDetector)))
        RootAutoDetector(AllFunctionsData, RootDetector, AutodetectDuration);
    RD->start();
  } else {
    __sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
  }
  __sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
}

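// Stop profiling, then hand the collected contextual and flat profiles to the
// Writer. Returns false if any context root fails structural validation.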
bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
  __sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
  if (auto *RD = getRootDetector()) {
    __sanitizer::Printf("[ctxprof] Expected the root autodetector to have "
                        "finished well before attempting to fetch a context");
    RD->join();
  }

  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);

  Writer.startContextSection();
  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
        &Root->Taken);
    if (!validate(Root)) {
      __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
      return false;
    }
    Writer.writeContextual(
        *Root->FirstNode, Root->FirstUnhandledCalleeNode,
        __sanitizer::atomic_load_relaxed(&Root->TotalEntries));
  }
  Writer.endContextSection();
  Writer.startFlatSection();
  // New entries are prepended at the head of the list, so taking this snapshot
  // lets the list grow concurrently without causing a race condition with our
  // traversing it.
  const auto *Pos = reinterpret_cast<const FunctionData *>(
      __sanitizer::atomic_load_relaxed(&AllFunctionsData));
  for (; Pos; Pos = Pos->Next) {
    const auto *CR = Pos->CtxRoot;
    if (!CR && canBeRoot(CR)) {
      const auto *FP = Pos->FlatCtx;
      Writer.writeFlat(FP->guid(), FP->counters(), FP->counters_size());
    }
  }
  Writer.endFlatSection();
  return true;
}

void __llvm_ctx_profile_free() {
  __sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
  {
    __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
        &AllContextsMutex);
    for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
      for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
        auto *C = A;
        A = A->next();
        __sanitizer::InternalFree(C);
      }
    AllContextRoots.Reset();
  }
  __sanitizer::atomic_store_relaxed(&AllFunctionsData, 0U);
  {
    __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
        &FlatCtxArenaMutex);
    FlatCtxArena = nullptr;
    for (auto *A = FlatCtxArenaHead; A;) {
      auto *C = A;
      A = C->next();
      __sanitizer::InternalFree(C);
    }

    FlatCtxArenaHead = nullptr;
  }
}