|  | //===- RootAutodetector.cpp - detect contextual profiling roots -----------===// | 
|  | // | 
|  | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
|  | // See https://llvm.org/LICENSE.txt for license information. | 
|  | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
|  | // | 
|  | //===----------------------------------------------------------------------===// | 
|  |  | 
|  | #include "RootAutoDetector.h" | 
|  |  | 
|  | #include "CtxInstrProfiling.h" | 
|  | #include "sanitizer_common/sanitizer_common.h" | 
|  | #include "sanitizer_common/sanitizer_placement_new.h" // IWYU pragma: keep (DenseMap) | 
|  | #include <assert.h> | 
|  | #include <dlfcn.h> | 
|  | #include <pthread.h> | 
|  |  | 
|  | using namespace __ctx_profile; | 
|  | template <typename T> using Set = DenseMap<T, bool>; | 
|  |  | 
|  | namespace __sanitizer { | 
|  | void BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, void *context, | 
|  | bool request_fast, u32 max_depth) { | 
|  | // We can't implement the fast variant. The fast variant ends up invoking an | 
|  | // external allocator, because of pthread_attr_getstack. If this happens | 
|  | // during an allocation of the program being instrumented, a non-reentrant | 
|  | // lock may be taken (this was observed). The allocator called by | 
|  | // pthread_attr_getstack will also try to take that lock. | 
|  | UnwindSlow(pc, max_depth); | 
|  | } | 
|  | } // namespace __sanitizer | 
|  |  | 
|  | RootAutoDetector::PerThreadSamples::PerThreadSamples(RootAutoDetector &Parent) { | 
|  | GenericScopedLock<SpinMutex> L(&Parent.AllSamplesMutex); | 
|  | Parent.AllSamples.PushBack(this); | 
|  | } | 
|  |  | 
|  | void RootAutoDetector::start() { | 
|  | atomic_store_relaxed(&Self, reinterpret_cast<uintptr_t>(this)); | 
|  | pthread_create( | 
|  | &WorkerThread, nullptr, | 
|  | +[](void *Ctx) -> void * { | 
|  | RootAutoDetector *RAD = reinterpret_cast<RootAutoDetector *>(Ctx); | 
|  | SleepForSeconds(RAD->WaitSeconds); | 
|  | // To avoid holding the AllSamplesMutex, make a snapshot of all the | 
|  | // thread samples collected so far | 
|  | Vector<PerThreadSamples *> SamplesSnapshot; | 
|  | { | 
|  | GenericScopedLock<SpinMutex> M(&RAD->AllSamplesMutex); | 
|  | SamplesSnapshot.Resize(RAD->AllSamples.Size()); | 
|  | for (uptr I = 0; I < RAD->AllSamples.Size(); ++I) | 
|  | SamplesSnapshot[I] = RAD->AllSamples[I]; | 
|  | } | 
|  | DenseMap<uptr, uint64_t> AllRoots; | 
|  | for (uptr I = 0; I < SamplesSnapshot.Size(); ++I) { | 
|  | GenericScopedLock<SpinMutex>(&SamplesSnapshot[I]->M); | 
|  | SamplesSnapshot[I]->TrieRoot.determineRoots().forEach([&](auto &KVP) { | 
|  | auto [FAddr, Count] = KVP; | 
|  | AllRoots[FAddr] += Count; | 
|  | return true; | 
|  | }); | 
|  | } | 
|  | // FIXME: as a next step, establish a minimum relative nr of samples | 
|  | // per root that would qualify it as a root. | 
|  | for (auto *FD = reinterpret_cast<FunctionData *>( | 
|  | atomic_load_relaxed(&RAD->FunctionDataListHead)); | 
|  | FD; FD = FD->Next) { | 
|  | if (AllRoots.contains(reinterpret_cast<uptr>(FD->EntryAddress))) { | 
|  | if (canBeRoot(FD->CtxRoot)) { | 
|  | FD->getOrAllocateContextRoot(); | 
|  | } else { | 
|  | // FIXME: address this by informing the root detection algorithm | 
|  | // to skip over such functions and pick the next down in the | 
|  | // stack. At that point, this becomes an assert. | 
|  | Printf("[ctxprof] Root auto-detector selected a musttail " | 
|  | "function for root (%p). Ignoring\n", | 
|  | FD->EntryAddress); | 
|  | } | 
|  | } | 
|  | } | 
|  | atomic_store_relaxed(&RAD->Self, 0); | 
|  | return nullptr; | 
|  | }, | 
|  | this); | 
|  | } | 
|  |  | 
|  | void RootAutoDetector::join() { pthread_join(WorkerThread, nullptr); } | 
|  |  | 
|  | void RootAutoDetector::sample() { | 
|  | // tracking reentry in case we want to re-explore fast stack unwind - which | 
|  | // does potentially re-enter the runtime because it calls the instrumented | 
|  | // allocator because of pthread_attr_getstack. See the notes also on | 
|  | // UnwindImpl above. | 
|  | static thread_local bool Entered = false; | 
|  | static thread_local uint64_t Entries = 0; | 
|  | if (Entered || (++Entries % SampleRate)) | 
|  | return; | 
|  | Entered = true; | 
|  | collectStack(); | 
|  | Entered = false; | 
|  | } | 
|  |  | 
// Unwind the current thread's stack and record it in this thread's callsite
// trie. Best-effort: if the worker thread is concurrently aggregating this
// thread's trie, the sample is dropped rather than blocking.
void RootAutoDetector::collectStack() {
  // Expands to local `pc` and `bp` for the caller's frame (sanitizer macro).
  GET_CALLER_PC_BP;
  BufferedStackTrace CurrentStack;
  // request_fast=false: only the slow unwinder is implemented; see UnwindImpl.
  CurrentStack.Unwind(pc, bp, /*context=*/nullptr, /*request_fast=*/false);
  // 2 stack frames would be very unlikely to mean anything, since at least the
  // compiler-rt frame - which can't be inlined - should be observable, which
  // counts as 1; we can be even more aggressive with this number.
  if (CurrentStack.size <= 2)
    return;
  // Lazily create this thread's sample container via the sanitizer-internal
  // allocator (placement new), avoiding the instrumented program allocator.
  // Never explicitly freed — presumably lives for the process lifetime; the
  // constructor registers it with the detector (see PerThreadSamples ctor).
  static thread_local PerThreadSamples *ThisThreadSamples =
      new (__sanitizer::InternalAlloc(sizeof(PerThreadSamples)))
          PerThreadSamples(*this);

  // TryLock (not Lock): if the aggregating worker holds the mutex, skip this
  // sample instead of stalling the sampled thread.
  if (!ThisThreadSamples->M.TryLock())
    return;

  ThisThreadSamples->TrieRoot.insertStack(CurrentStack);
  ThisThreadSamples->M.Unlock();
}
|  |  | 
|  | uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const { | 
|  | // this requires --linkopt=-Wl,--export-dynamic | 
|  | Dl_info Info; | 
|  | if (dladdr(reinterpret_cast<const void *>(CallsiteAddress), &Info) != 0) | 
|  | return reinterpret_cast<uptr>(Info.dli_saddr); | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | void PerThreadCallsiteTrie::insertStack(const StackTrace &ST) { | 
|  | ++TheTrie.Count; | 
|  | auto *Current = &TheTrie; | 
|  | // the stack is backwards - the first callsite is at the top. | 
|  | for (int I = ST.size - 1; I >= 0; --I) { | 
|  | uptr ChildAddr = ST.trace[I]; | 
|  | auto [Iter, _] = Current->Children.insert({ChildAddr, Trie(ChildAddr)}); | 
|  | ++Iter->second.Count; | 
|  | Current = &Iter->second; | 
|  | } | 
|  | } | 
|  |  | 
|  | DenseMap<uptr, uint64_t> PerThreadCallsiteTrie::determineRoots() const { | 
|  | // Assuming a message pump design, roots are those functions called by the | 
|  | // message pump. The message pump is an infinite loop (for all practical | 
|  | // considerations) fetching data from a queue. The root functions return - | 
|  | // otherwise the message pump doesn't work. This function detects roots as the | 
|  | // first place in the trie (starting from the root) where a function calls 2 | 
|  | // or more functions. | 
|  | // | 
|  | // We start with a callsite trie - the nodes are callsites. Different child | 
|  | // nodes may actually correspond to the same function. | 
|  | // | 
|  | // For example: using function(callsite) | 
|  | // f1(csf1_1) -> f2(csf2_1) -> f3 | 
|  | //            -> f2(csf2_2) -> f4 | 
|  | // | 
|  | // would be represented in our trie as: | 
|  | // csf1_1 -> csf2_1 -> f3 | 
|  | //        -> csf2_2 -> f4 | 
|  | // | 
|  | // While we can assert the control flow returns to f2, we don't know if it | 
|  | // ever returns to f1. f2 could be the message pump. | 
|  | // | 
|  | // We need to convert our callsite tree into a function tree. We can also, | 
|  | // more economically, just see how many distinct functions there are at a | 
|  | // certain depth. When that count is greater than 1, we got to potential roots | 
|  | // and everything above should be considered as non-roots. | 
|  | DenseMap<uptr, uint64_t> Result; | 
|  | Set<const Trie *> Worklist; | 
|  | Worklist.insert({&TheTrie, {}}); | 
|  |  | 
|  | while (!Worklist.empty()) { | 
|  | Set<const Trie *> NextWorklist; | 
|  | DenseMap<uptr, uint64_t> Candidates; | 
|  | Worklist.forEach([&](const auto &KVP) { | 
|  | auto [Node, _] = KVP; | 
|  | auto SA = getFctStartAddr(Node->CallsiteAddress); | 
|  | Candidates[SA] += Node->Count; | 
|  | Node->Children.forEach([&](auto &ChildKVP) { | 
|  | NextWorklist.insert({&ChildKVP.second, true}); | 
|  | return true; | 
|  | }); | 
|  | return true; | 
|  | }); | 
|  | if (Candidates.size() > 1) { | 
|  | Result.swap(Candidates); | 
|  | break; | 
|  | } | 
|  | Worklist.swap(NextWorklist); | 
|  | } | 
|  | return Result; | 
|  | } |