//===- RootAutodetector.cpp - detect contextual profiling roots -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

| 9 | #include "RootAutoDetector.h" |
| 10 | |
| 11 | #include "CtxInstrProfiling.h" |
| 12 | #include "sanitizer_common/sanitizer_common.h" |
| 13 | #include "sanitizer_common/sanitizer_placement_new.h" // IWYU pragma: keep (DenseMap) |
| 14 | #include <assert.h> |
| 15 | #include <dlfcn.h> |
| 16 | #include <pthread.h> |
| 17 | |
using namespace __ctx_profile;
// Minimal "set" abstraction on top of the sanitizer DenseMap: the keys are
// the set members, the bool mapped value is ignored.
template <typename T> using Set = DenseMap<T, bool>;
| 20 | |
namespace __sanitizer {
// Unwinder hook required by sanitizer_common's BufferedStackTrace. The
// `request_fast` flag is deliberately ignored: only the slow unwinder is safe
// here (see comment below); `bp` and `context` are therefore unused.
void BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, void *context,
                                    bool request_fast, u32 max_depth) {
  // We can't implement the fast variant. The fast variant ends up invoking an
  // external allocator, because of pthread_attr_getstack. If this happens
  // during an allocation of the program being instrumented, a non-reentrant
  // lock may be taken (this was observed). The allocator called by
  // pthread_attr_getstack will also try to take that lock.
  UnwindSlow(pc, max_depth);
}
} // namespace __sanitizer
| 32 | |
| 33 | RootAutoDetector::PerThreadSamples::PerThreadSamples(RootAutoDetector &Parent) { |
| 34 | GenericScopedLock<SpinMutex> L(&Parent.AllSamplesMutex); |
| 35 | Parent.AllSamples.PushBack(v: this); |
| 36 | } |
| 37 | |
| 38 | void RootAutoDetector::start() { |
| 39 | atomic_store_relaxed(a: &Self, v: reinterpret_cast<uintptr_t>(this)); |
| 40 | pthread_create( |
| 41 | newthread: &WorkerThread, attr: nullptr, |
| 42 | start_routine: +[](void *Ctx) -> void * { |
| 43 | RootAutoDetector *RAD = reinterpret_cast<RootAutoDetector *>(Ctx); |
| 44 | SleepForSeconds(seconds: RAD->WaitSeconds); |
| 45 | // To avoid holding the AllSamplesMutex, make a snapshot of all the |
| 46 | // thread samples collected so far |
| 47 | Vector<PerThreadSamples *> SamplesSnapshot; |
| 48 | { |
| 49 | GenericScopedLock<SpinMutex> M(&RAD->AllSamplesMutex); |
| 50 | SamplesSnapshot.Resize(size: RAD->AllSamples.Size()); |
| 51 | for (uptr I = 0; I < RAD->AllSamples.Size(); ++I) |
| 52 | SamplesSnapshot[I] = RAD->AllSamples[I]; |
| 53 | } |
| 54 | DenseMap<uptr, uint64_t> AllRoots; |
| 55 | for (uptr I = 0; I < SamplesSnapshot.Size(); ++I) { |
| 56 | GenericScopedLock<SpinMutex>(&SamplesSnapshot[I]->M); |
| 57 | SamplesSnapshot[I]->TrieRoot.determineRoots().forEach(fn: [&](auto &KVP) { |
| 58 | auto [FAddr, Count] = KVP; |
| 59 | AllRoots[FAddr] += Count; |
| 60 | return true; |
| 61 | }); |
| 62 | } |
| 63 | // FIXME: as a next step, establish a minimum relative nr of samples |
| 64 | // per root that would qualify it as a root. |
| 65 | for (auto *FD = reinterpret_cast<FunctionData *>( |
| 66 | atomic_load_relaxed(a: &RAD->FunctionDataListHead)); |
| 67 | FD; FD = FD->Next) { |
| 68 | if (AllRoots.contains(Key: reinterpret_cast<uptr>(FD->EntryAddress))) { |
| 69 | if (canBeRoot(Ctx: FD->CtxRoot)) { |
| 70 | FD->getOrAllocateContextRoot(); |
| 71 | } else { |
| 72 | // FIXME: address this by informing the root detection algorithm |
| 73 | // to skip over such functions and pick the next down in the |
| 74 | // stack. At that point, this becomes an assert. |
| 75 | Printf(format: "[ctxprof] Root auto-detector selected a musttail " |
| 76 | "function for root (%p). Ignoring\n" , |
| 77 | FD->EntryAddress); |
| 78 | } |
| 79 | } |
| 80 | } |
| 81 | atomic_store_relaxed(a: &RAD->Self, v: 0); |
| 82 | return nullptr; |
| 83 | }, |
| 84 | arg: this); |
| 85 | } |
| 86 | |
| 87 | void RootAutoDetector::join() { pthread_join(th: WorkerThread, thread_return: nullptr); } |
| 88 | |
| 89 | void RootAutoDetector::sample() { |
| 90 | // tracking reentry in case we want to re-explore fast stack unwind - which |
| 91 | // does potentially re-enter the runtime because it calls the instrumented |
| 92 | // allocator because of pthread_attr_getstack. See the notes also on |
| 93 | // UnwindImpl above. |
| 94 | static thread_local bool Entered = false; |
| 95 | static thread_local uint64_t Entries = 0; |
| 96 | if (Entered || (++Entries % SampleRate)) |
| 97 | return; |
| 98 | Entered = true; |
| 99 | collectStack(); |
| 100 | Entered = false; |
| 101 | } |
| 102 | |
| 103 | void RootAutoDetector::collectStack() { |
| 104 | GET_CALLER_PC_BP; |
| 105 | BufferedStackTrace CurrentStack; |
| 106 | CurrentStack.Unwind(pc, bp, /*context=*/nullptr, /*request_fast=*/false); |
| 107 | // 2 stack frames would be very unlikely to mean anything, since at least the |
| 108 | // compiler-rt frame - which can't be inlined - should be observable, which |
| 109 | // counts as 1; we can be even more aggressive with this number. |
| 110 | if (CurrentStack.size <= 2) |
| 111 | return; |
| 112 | static thread_local PerThreadSamples *ThisThreadSamples = |
| 113 | new (__sanitizer::InternalAlloc(size: sizeof(PerThreadSamples))) |
| 114 | PerThreadSamples(*this); |
| 115 | |
| 116 | if (!ThisThreadSamples->M.TryLock()) |
| 117 | return; |
| 118 | |
| 119 | ThisThreadSamples->TrieRoot.insertStack(ST: CurrentStack); |
| 120 | ThisThreadSamples->M.Unlock(); |
| 121 | } |
| 122 | |
| 123 | uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const { |
| 124 | // this requires --linkopt=-Wl,--export-dynamic |
| 125 | Dl_info Info; |
| 126 | if (dladdr(address: reinterpret_cast<const void *>(CallsiteAddress), info: &Info) != 0) |
| 127 | return reinterpret_cast<uptr>(Info.dli_saddr); |
| 128 | return 0; |
| 129 | } |
| 130 | |
| 131 | void PerThreadCallsiteTrie::insertStack(const StackTrace &ST) { |
| 132 | ++TheTrie.Count; |
| 133 | auto *Current = &TheTrie; |
| 134 | // the stack is backwards - the first callsite is at the top. |
| 135 | for (int I = ST.size - 1; I >= 0; --I) { |
| 136 | uptr ChildAddr = ST.trace[I]; |
| 137 | auto [Iter, _] = Current->Children.insert(KV: {ChildAddr, Trie(ChildAddr)}); |
| 138 | ++Iter->second.Count; |
| 139 | Current = &Iter->second; |
| 140 | } |
| 141 | } |
| 142 | |
| 143 | DenseMap<uptr, uint64_t> PerThreadCallsiteTrie::determineRoots() const { |
| 144 | // Assuming a message pump design, roots are those functions called by the |
| 145 | // message pump. The message pump is an infinite loop (for all practical |
| 146 | // considerations) fetching data from a queue. The root functions return - |
| 147 | // otherwise the message pump doesn't work. This function detects roots as the |
| 148 | // first place in the trie (starting from the root) where a function calls 2 |
| 149 | // or more functions. |
| 150 | // |
| 151 | // We start with a callsite trie - the nodes are callsites. Different child |
| 152 | // nodes may actually correspond to the same function. |
| 153 | // |
| 154 | // For example: using function(callsite) |
| 155 | // f1(csf1_1) -> f2(csf2_1) -> f3 |
| 156 | // -> f2(csf2_2) -> f4 |
| 157 | // |
| 158 | // would be represented in our trie as: |
| 159 | // csf1_1 -> csf2_1 -> f3 |
| 160 | // -> csf2_2 -> f4 |
| 161 | // |
| 162 | // While we can assert the control flow returns to f2, we don't know if it |
| 163 | // ever returns to f1. f2 could be the message pump. |
| 164 | // |
| 165 | // We need to convert our callsite tree into a function tree. We can also, |
| 166 | // more economically, just see how many distinct functions there are at a |
| 167 | // certain depth. When that count is greater than 1, we got to potential roots |
| 168 | // and everything above should be considered as non-roots. |
| 169 | DenseMap<uptr, uint64_t> Result; |
| 170 | Set<const Trie *> Worklist; |
| 171 | Worklist.insert(KV: {&TheTrie, {}}); |
| 172 | |
| 173 | while (!Worklist.empty()) { |
| 174 | Set<const Trie *> NextWorklist; |
| 175 | DenseMap<uptr, uint64_t> Candidates; |
| 176 | Worklist.forEach(fn: [&](const auto &KVP) { |
| 177 | auto [Node, _] = KVP; |
| 178 | auto SA = getFctStartAddr(CallsiteAddress: Node->CallsiteAddress); |
| 179 | Candidates[SA] += Node->Count; |
| 180 | Node->Children.forEach([&](auto &ChildKVP) { |
| 181 | NextWorklist.insert({&ChildKVP.second, true}); |
| 182 | return true; |
| 183 | }); |
| 184 | return true; |
| 185 | }); |
| 186 | if (Candidates.size() > 1) { |
| 187 | Result.swap(RHS&: Candidates); |
| 188 | break; |
| 189 | } |
| 190 | Worklist.swap(RHS&: NextWorklist); |
| 191 | } |
| 192 | return Result; |
| 193 | } |
| 194 | |