| //===- RootAutodetector.cpp - detect contextual profiling roots -----------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "RootAutoDetector.h" |
| |
| #include "CtxInstrProfiling.h" |
| #include "sanitizer_common/sanitizer_common.h" |
| #include "sanitizer_common/sanitizer_placement_new.h" // IWYU pragma: keep (DenseMap) |
| #include <assert.h> |
| #include <dlfcn.h> |
| #include <pthread.h> |
| |
| using namespace __ctx_profile; |
| template <typename T> using Set = DenseMap<T, bool>; |
| |
| namespace __sanitizer { |
| void BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, void *context, |
| bool request_fast, u32 max_depth) { |
| // We can't implement the fast variant. The fast variant ends up invoking an |
| // external allocator, because of pthread_attr_getstack. If this happens |
| // during an allocation of the program being instrumented, a non-reentrant |
| // lock may be taken (this was observed). The allocator called by |
| // pthread_attr_getstack will also try to take that lock. |
| UnwindSlow(pc, max_depth); |
| } |
| } // namespace __sanitizer |
| |
| RootAutoDetector::PerThreadSamples::PerThreadSamples(RootAutoDetector &Parent) { |
| GenericScopedLock<SpinMutex> L(&Parent.AllSamplesMutex); |
| Parent.AllSamples.PushBack(this); |
| } |
| |
| void RootAutoDetector::start() { |
| atomic_store_relaxed(&Self, reinterpret_cast<uintptr_t>(this)); |
| pthread_create( |
| &WorkerThread, nullptr, |
| +[](void *Ctx) -> void * { |
| RootAutoDetector *RAD = reinterpret_cast<RootAutoDetector *>(Ctx); |
| SleepForSeconds(RAD->WaitSeconds); |
| // To avoid holding the AllSamplesMutex, make a snapshot of all the |
| // thread samples collected so far |
| Vector<PerThreadSamples *> SamplesSnapshot; |
| { |
| GenericScopedLock<SpinMutex> M(&RAD->AllSamplesMutex); |
| SamplesSnapshot.Resize(RAD->AllSamples.Size()); |
| for (uptr I = 0; I < RAD->AllSamples.Size(); ++I) |
| SamplesSnapshot[I] = RAD->AllSamples[I]; |
| } |
| DenseMap<uptr, uint64_t> AllRoots; |
| for (uptr I = 0; I < SamplesSnapshot.Size(); ++I) { |
| GenericScopedLock<SpinMutex>(&SamplesSnapshot[I]->M); |
| SamplesSnapshot[I]->TrieRoot.determineRoots().forEach([&](auto &KVP) { |
| auto [FAddr, Count] = KVP; |
| AllRoots[FAddr] += Count; |
| return true; |
| }); |
| } |
| // FIXME: as a next step, establish a minimum relative nr of samples |
| // per root that would qualify it as a root. |
| for (auto *FD = reinterpret_cast<FunctionData *>( |
| atomic_load_relaxed(&RAD->FunctionDataListHead)); |
| FD; FD = FD->Next) { |
| if (AllRoots.contains(reinterpret_cast<uptr>(FD->EntryAddress))) { |
| if (canBeRoot(FD->CtxRoot)) { |
| FD->getOrAllocateContextRoot(); |
| } else { |
| // FIXME: address this by informing the root detection algorithm |
| // to skip over such functions and pick the next down in the |
| // stack. At that point, this becomes an assert. |
| Printf("[ctxprof] Root auto-detector selected a musttail " |
| "function for root (%p). Ignoring\n", |
| FD->EntryAddress); |
| } |
| } |
| } |
| atomic_store_relaxed(&RAD->Self, 0); |
| return nullptr; |
| }, |
| this); |
| } |
| |
| void RootAutoDetector::join() { pthread_join(WorkerThread, nullptr); } |
| |
| void RootAutoDetector::sample() { |
| // tracking reentry in case we want to re-explore fast stack unwind - which |
| // does potentially re-enter the runtime because it calls the instrumented |
| // allocator because of pthread_attr_getstack. See the notes also on |
| // UnwindImpl above. |
| static thread_local bool Entered = false; |
| static thread_local uint64_t Entries = 0; |
| if (Entered || (++Entries % SampleRate)) |
| return; |
| Entered = true; |
| collectStack(); |
| Entered = false; |
| } |
| |
| void RootAutoDetector::collectStack() { |
| GET_CALLER_PC_BP; |
| BufferedStackTrace CurrentStack; |
| CurrentStack.Unwind(pc, bp, /*context=*/nullptr, /*request_fast=*/false); |
| // 2 stack frames would be very unlikely to mean anything, since at least the |
| // compiler-rt frame - which can't be inlined - should be observable, which |
| // counts as 1; we can be even more aggressive with this number. |
| if (CurrentStack.size <= 2) |
| return; |
| static thread_local PerThreadSamples *ThisThreadSamples = |
| new (__sanitizer::InternalAlloc(sizeof(PerThreadSamples))) |
| PerThreadSamples(*this); |
| |
| if (!ThisThreadSamples->M.TryLock()) |
| return; |
| |
| ThisThreadSamples->TrieRoot.insertStack(CurrentStack); |
| ThisThreadSamples->M.Unlock(); |
| } |
| |
| uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const { |
| // this requires --linkopt=-Wl,--export-dynamic |
| Dl_info Info; |
| if (dladdr(reinterpret_cast<const void *>(CallsiteAddress), &Info) != 0) |
| return reinterpret_cast<uptr>(Info.dli_saddr); |
| return 0; |
| } |
| |
| void PerThreadCallsiteTrie::insertStack(const StackTrace &ST) { |
| ++TheTrie.Count; |
| auto *Current = &TheTrie; |
| // the stack is backwards - the first callsite is at the top. |
| for (int I = ST.size - 1; I >= 0; --I) { |
| uptr ChildAddr = ST.trace[I]; |
| auto [Iter, _] = Current->Children.insert({ChildAddr, Trie(ChildAddr)}); |
| ++Iter->second.Count; |
| Current = &Iter->second; |
| } |
| } |
| |
| DenseMap<uptr, uint64_t> PerThreadCallsiteTrie::determineRoots() const { |
| // Assuming a message pump design, roots are those functions called by the |
| // message pump. The message pump is an infinite loop (for all practical |
| // considerations) fetching data from a queue. The root functions return - |
| // otherwise the message pump doesn't work. This function detects roots as the |
| // first place in the trie (starting from the root) where a function calls 2 |
| // or more functions. |
| // |
| // We start with a callsite trie - the nodes are callsites. Different child |
| // nodes may actually correspond to the same function. |
| // |
| // For example: using function(callsite) |
| // f1(csf1_1) -> f2(csf2_1) -> f3 |
| // -> f2(csf2_2) -> f4 |
| // |
| // would be represented in our trie as: |
| // csf1_1 -> csf2_1 -> f3 |
| // -> csf2_2 -> f4 |
| // |
| // While we can assert the control flow returns to f2, we don't know if it |
| // ever returns to f1. f2 could be the message pump. |
| // |
| // We need to convert our callsite tree into a function tree. We can also, |
| // more economically, just see how many distinct functions there are at a |
| // certain depth. When that count is greater than 1, we got to potential roots |
| // and everything above should be considered as non-roots. |
| DenseMap<uptr, uint64_t> Result; |
| Set<const Trie *> Worklist; |
| Worklist.insert({&TheTrie, {}}); |
| |
| while (!Worklist.empty()) { |
| Set<const Trie *> NextWorklist; |
| DenseMap<uptr, uint64_t> Candidates; |
| Worklist.forEach([&](const auto &KVP) { |
| auto [Node, _] = KVP; |
| auto SA = getFctStartAddr(Node->CallsiteAddress); |
| Candidates[SA] += Node->Count; |
| Node->Children.forEach([&](auto &ChildKVP) { |
| NextWorklist.insert({&ChildKVP.second, true}); |
| return true; |
| }); |
| return true; |
| }); |
| if (Candidates.size() > 1) { |
| Result.swap(Candidates); |
| break; |
| } |
| Worklist.swap(NextWorklist); |
| } |
| return Result; |
| } |