//===- CtxInstrProfiling.cpp - contextual instrumented PGO ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CtxInstrProfiling.h"
#include "RootAutoDetector.h"
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_atomic.h"
#include "sanitizer_common/sanitizer_atomic_clang.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_libc.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include "sanitizer_common/sanitizer_placement_new.h"
#include "sanitizer_common/sanitizer_thread_safety.h"
#include "sanitizer_common/sanitizer_vector.h"

#include <assert.h>

using namespace __ctx_profile;

namespace {
// Keep track of all the context roots we actually saw, so we can then traverse
// them when the user asks for the profile in __llvm_ctx_profile_fetch
__sanitizer::SpinMutex AllContextsMutex;
SANITIZER_GUARDED_BY(AllContextsMutex)
__sanitizer::Vector<ContextRoot *> AllContextRoots;

__sanitizer::atomic_uintptr_t AllFunctionsData = {};

// Keep all the functions for which we collect a flat profile in a linked list.
__sanitizer::SpinMutex FlatCtxArenaMutex;
SANITIZER_GUARDED_BY(FlatCtxArenaMutex)
Arena *FlatCtxArenaHead = nullptr;
SANITIZER_GUARDED_BY(FlatCtxArenaMutex)
Arena *FlatCtxArena = nullptr;
// Set to true when we enter a root, and false when we exit - regardless of
// whether this thread collects a contextual profile for that root.
__thread bool IsUnderContext = false;
__sanitizer::atomic_uint8_t ProfilingStarted = {};

__sanitizer::atomic_uintptr_t RootDetector = {};
RootAutoDetector *getRootDetector() {
  return reinterpret_cast<RootAutoDetector *>(
      __sanitizer::atomic_load_relaxed(&RootDetector));
}

// Utility to taint a pointer by setting the LSB. There is an assumption
// throughout that the addresses of contexts are even (really, they should be
// align(8), but "even"-ness is the minimum assumption).
// "Scratch contexts" are buffers that we return in certain cases - they are
// large enough to allow for memory-safe counter access, but they don't link
// subcontexts below them (the runtime recognizes them and enforces that).
ContextNode *markAsScratch(const ContextNode *Ctx) {
  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
}
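// Illustrative sketch of the invariant this relies on (isScratch, used further
// below and presumably declared in CtxInstrProfiling.h, is assumed to just
// test that LSB):
//   ContextNode *Real = ...;                   // 8-aligned, so LSB == 0
//   ContextNode *Tainted = markAsScratch(Real);
//   assert(!isScratch(Real) && isScratch(Tainted));
// Counter updates issued through a tainted scratch pointer still land inside
// the (sufficiently large) scratch buffer, which the runtime never reads.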

// Used when getting the data from TLS. We don't *really* need to reset, but
// it's a simpler system if we do.
template <typename T> inline T consume(T &V) {
  auto R = V;
  V = {0};
  return R;
}
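// Typical use (see __llvm_ctx_profile_get_context below):
//   auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
// reads the TLS slot and zeroes it in one step, so a stale value can't be
// observed twice.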

// We allocate at least kBuffSize Arena pages. The scratch buffer is also that
// large.
constexpr size_t kPower = 20;
constexpr size_t kBuffSize = 1 << kPower;

// Highly unlikely we need more than kBuffSize for a context.
size_t getArenaAllocSize(size_t Needed) {
  if (Needed >= kBuffSize)
    return 2 * Needed;
  return kBuffSize;
}
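// For example (with kBuffSize == 1MB): a context needing 300KB gets a 1MB
// arena, while one needing 1.5MB gets a 3MB (2 * Needed) arena.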

// Verify the structural integrity of the context.
bool validate(const ContextRoot *Root) {
  // All contexts should be laid out in some arena page. Go over each arena
  // allocated for this Root, and jump over contained contexts based on
  // self-reported sizes.
  __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
               .second)
        return false;
      Pos += Ctx->size();
    }
  }

  // Now traverse the contexts again the same way, but validate that all
  // non-null subcontext addresses appear in the set computed above.
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
        for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
          if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
            return false;

      Pos += Ctx->size();
    }
  }
  return true;
}

inline ContextNode *allocContextNode(char *Place, GUID Guid,
                                     uint32_t NumCounters,
                                     uint32_t NumCallsites,
                                     ContextNode *Next = nullptr) {
  assert(reinterpret_cast<uint64_t>(Place) % ExpectedAlignment == 0);
  return new (Place) ContextNode(Guid, NumCounters, NumCallsites, Next);
}

void resetContextNode(ContextNode &Node) {
  // FIXME(mtrofin): this is std::memset, which we can probably use if we
  // drop/reduce the dependency on sanitizer_common.
  for (uint32_t I = 0; I < Node.counters_size(); ++I)
    Node.counters()[I] = 0;
  for (uint32_t I = 0; I < Node.callsites_size(); ++I)
    for (auto *Next = Node.subContexts()[I]; Next; Next = Next->next())
      resetContextNode(*Next);
}

ContextNode *onContextEnter(ContextNode &Node) {
  ++Node.counters()[0];
  return &Node;
}

} // namespace

// The scratch buffer - what we give when we can't produce a real context (the
// scratch isn't "real" in that it's expected to be clobbered carelessly - we
// don't read it). The other important thing is that the callees from a scratch
// context also get a scratch context.
// Eventually this can be replaced with per-function buffers, a la the typical
// (flat) instrumented FDO buffers. The clobbering aspect won't apply there, but
// the part about determining the nature of the subcontexts does.
__thread char __Buffer[kBuffSize] = {0};

#define TheScratchContext                                                      \
  markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))
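// In other words, TheScratchContext is just the tainted (LSB-set) view of this
// thread's __Buffer: instrumented counter increments through it land
// harmlessly in the buffer, and the runtime never links subcontexts under it
// nor reads it.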

// Initialize the TLS slots.
__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
                                                                  nullptr};
__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};

__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
    nullptr;

Arena::Arena(uint32_t Size) : Size(Size) {
  __sanitizer::internal_memset(start(), 0, Size);
}

// FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
// the dependency on the latter.
Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
  assert(!Prev || Prev->Next == nullptr);
  Arena *NewArena = new (__sanitizer::InternalAlloc(
      Size + sizeof(Arena), /*cache=*/nullptr, /*alignment=*/ExpectedAlignment))
      Arena(Size);
  if (Prev)
    Prev->Next = NewArena;
  return NewArena;
}
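// Layout note (a sketch, based on the allocation above): each arena occupies a
// single InternalAlloc block of sizeof(Arena) + Size bytes - the Arena header
// followed by Size bytes of bump-allocatable payload, which is what start(),
// pos() and tryBumpAllocate operate on.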

void Arena::freeArenaList(Arena *&A) {
  assert(A);
  for (auto *I = A; I != nullptr;) {
    auto *Current = I;
    I = I->Next;
    __sanitizer::InternalFree(Current);
  }
  A = nullptr;
}

// If this is the first time we hit a callsite with this particular callee
// (identified by Guid), we need to allocate.
ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
                             uint32_t NumCounters, uint32_t NumCallsites) {
  auto AllocSize = ContextNode::getAllocSize(NumCounters, NumCallsites);
  auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
  char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
  if (!AllocPlace) {
    // If we failed to allocate on the current arena, allocate a new arena and
    // place it on __llvm_ctx_profile_current_context_root->CurrentMem so we
    // find it from now on for other cases when we need to getCallsiteSlow.
    // Note that allocateNewArena will link the allocated memory in the list of
    // Arenas.
    __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
        Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
    AllocPlace = Mem->tryBumpAllocate(AllocSize);
  }
  auto *Ret = allocContextNode(AllocPlace, Guid, NumCounters, NumCallsites,
                               *InsertionPoint);
  *InsertionPoint = Ret;
  return Ret;
}
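// Illustrative effect of the insertion above: if *InsertionPoint previously
// headed the sibling list [A -> B], it now heads [New -> A -> B], i.e. the new
// node is prepended and its next() points at the old head.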

ContextNode *getFlatProfile(FunctionData &Data, void *Callee, GUID Guid,
                            uint32_t NumCounters) {
  if (ContextNode *Existing = Data.FlatCtx)
    return Existing;
  {
    // We could instead try to take the lock and, if that fails, return
    // TheScratchContext. But that could leave message pump loops more sparsely
    // profiled than everything else. Maybe that doesn't matter, and we can
    // optimize this later.
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> L(&Data.Mutex);
    if (ContextNode *Existing = Data.FlatCtx)
      return Existing;

    auto NeededSize = ContextNode::getAllocSize(NumCounters, 0);
    char *AllocBuff = nullptr;
    {
      __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> FL(
          &FlatCtxArenaMutex);
      if (FlatCtxArena)
        AllocBuff = FlatCtxArena->tryBumpAllocate(NeededSize);
      if (!AllocBuff) {
        FlatCtxArena = Arena::allocateNewArena(getArenaAllocSize(NeededSize),
                                               FlatCtxArena);
        AllocBuff = FlatCtxArena->tryBumpAllocate(NeededSize);
      }
      if (!FlatCtxArenaHead)
        FlatCtxArenaHead = FlatCtxArena;
    }
    auto *Ret = allocContextNode(AllocBuff, Guid, NumCounters, 0);
    Data.FlatCtx = Ret;

    Data.EntryAddress = Callee;
    Data.Next = reinterpret_cast<FunctionData *>(
        __sanitizer::atomic_load_relaxed(&AllFunctionsData));
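    // The loop below is a standard lock-free push of &Data onto the head of
    // the AllFunctionsData list: on failure, the compare-exchange reloads the
    // current head into Data.Next (its "expected" out-parameter), so we retry
    // with an up-to-date link until the swap succeeds.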
    while (!__sanitizer::atomic_compare_exchange_strong(
        &AllFunctionsData, reinterpret_cast<uintptr_t *>(&Data.Next),
        reinterpret_cast<uintptr_t>(&Data),
        __sanitizer::memory_order_release)) {
    }
  }

  return Data.FlatCtx;
}

// This should be called once for a Root. Allocate the first arena, set up the
// first context.
void setupContext(ContextRoot *Root, GUID Guid, uint32_t NumCounters,
                  uint32_t NumCallsites) {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  // Re-check - we got here without having taken the lock.
  if (Root->FirstMemBlock)
    return;
  const auto Needed = ContextNode::getAllocSize(NumCounters, NumCallsites);
  auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
  Root->FirstMemBlock = M;
  Root->CurrentMem = M;
  Root->FirstNode = allocContextNode(M->tryBumpAllocate(Needed), Guid,
                                     NumCounters, NumCallsites);
  AllContextRoots.PushBack(Root);
}

ContextRoot *FunctionData::getOrAllocateContextRoot() {
  auto *Root = CtxRoot;
  if (Root)
    return Root;
  __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> L(&Mutex);
  Root = CtxRoot;
  if (!Root) {
    Root = new (__sanitizer::InternalAlloc(sizeof(ContextRoot))) ContextRoot();
    CtxRoot = Root;
  }

  assert(Root);
  return Root;
}

ContextNode *tryStartContextGivenRoot(ContextRoot *Root, GUID Guid,
                                      uint32_t Counters, uint32_t Callsites)
    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  IsUnderContext = true;
  __sanitizer::atomic_fetch_add(&Root->TotalEntries, 1,
                                __sanitizer::memory_order_relaxed);
  if (!Root->FirstMemBlock) {
    setupContext(Root, Guid, Counters, Callsites);
  }
  if (Root->Taken.TryLock()) {
    __llvm_ctx_profile_current_context_root = Root;
    onContextEnter(*Root->FirstNode);
    return Root->FirstNode;
  }
  // If this thread couldn't take the lock, return scratch context.
  __llvm_ctx_profile_current_context_root = nullptr;
  return TheScratchContext;
}
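// Note on the above: Root->TotalEntries is bumped on every entry, including
// the ones that end up with the scratch context because another thread holds
// Taken, so it counts root activations rather than only profiled activations.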

ContextNode *getUnhandledContext(FunctionData &Data, void *Callee, GUID Guid,
                                 uint32_t NumCounters, uint32_t NumCallsites,
                                 ContextRoot *CtxRoot) {

  // 1) if we are currently collecting a contextual profile, fetch a
  // ContextNode in the `Unhandled` set. We want to do this regardless of
  // `ProfilingStarted` to (hopefully) offset the penalty of creating these
  // contexts to before profiling.
  //
  // 2) if we are under a root (regardless of whether this thread is collecting
  // a contextual profile for that root), do not collect a flat profile. We
  // want to keep flat profiles only for activations that can't happen under a
  // root, to avoid confusing profiles. We can then, for example, combine
  // flattened and flat profiles meaningfully, as we wouldn't double-count
  // anything.
  //
  // 3) to avoid lengthy startup, don't bother with flat profiles until the
  // profiling has started. We would reset them anyway when profiling starts.
  // HOWEVER. This does lose profiling for message pumps: those functions are
  // entered once and never exit. They should be assumed to be entered before
  // profiling starts - because profiling should start after the server is up
  // and running (which is equivalent to "message pumps are set up").
  if (!CtxRoot) {
    if (auto *RAD = getRootDetector())
      RAD->sample();
    else if (auto *CR = Data.CtxRoot)
      return tryStartContextGivenRoot(CR, Guid, NumCounters, NumCallsites);
    if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
      return TheScratchContext;
    else
      return markAsScratch(
          onContextEnter(*getFlatProfile(Data, Callee, Guid, NumCounters)));
  }
  auto [Iter, Ins] = CtxRoot->Unhandled.insert({Guid, nullptr});
  if (Ins)
    Iter->second = getCallsiteSlow(Guid, &CtxRoot->FirstUnhandledCalleeNode,
                                   NumCounters, 0);
  return markAsScratch(onContextEnter(*Iter->second));
}

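// A sketch of the caller/callee handshake the next function relies on (the
// exact lowering is emitted by the compiler; the names below are the TLS slots
// defined in this file): before a call, the instrumented caller stores the
// expected callee address in __llvm_ctx_profile_expected_callee[0] and the
// address of the callsite's subcontext slot in __llvm_ctx_profile_callsite[0].
// The callee then consume()s both here and either finds/creates its
// ContextNode under that slot or falls back to getUnhandledContext.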
ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
                                            GUID Guid, uint32_t NumCounters,
                                            uint32_t NumCallsites) {
  auto *CtxRoot = __llvm_ctx_profile_current_context_root;
  // Fast "out" if we're not even doing contextual collection.
  if (!CtxRoot)
    return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
                               nullptr);

  // Also fast "out" if the caller is scratch. We can tell by looking at the
  // interior pointer into the subcontexts vector that the caller provided: if
  // the caller's context is scratch, so is that interior pointer, because all
  // the address calculations use even (more precisely, 8-aligned) values.
  auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
  if (!CallsiteContext || isScratch(CallsiteContext))
    return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
                               CtxRoot);

  // If the callee isn't the expected one, return scratch.
  // Signal handler(s) could have been invoked at any point in the execution.
  // Should that have happened, and had it (the handler) been built with
  // instrumentation, its __llvm_ctx_profile_get_context would have failed
  // here. Its sub call graph would have then populated
  // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
  // The normal call graph may be impacted in that, if the signal handler
  // happened somewhere before we read the TLS here, we'd see the TLS reset and
  // we'd also fail here. That would just mean we would lose counter values for
  // the normal subgraph, this time around. That should be very unlikely, but if
  // it happens too frequently, we should be able to detect discrepancies in
  // entry counts (caller-callee). At the moment, the design goes on the
  // assumption that this is so infrequent that it's not worth doing more for
  // that case.
  auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
  if (ExpectedCallee != Callee)
    return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
                               CtxRoot);

  auto *Callsite = *CallsiteContext;
  // In the case of indirect calls, all the targets seen so far form a linked
  // list here. Find the one corresponding to this callee.
  while (Callsite && Callsite->guid() != Guid) {
    Callsite = Callsite->next();
  }
  auto *Ret = Callsite ? Callsite
                       : getCallsiteSlow(Guid, CallsiteContext, NumCounters,
                                         NumCallsites);
  if (Ret->callsites_size() != NumCallsites ||
      Ret->counters_size() != NumCounters)
    __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
                        "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
                        reinterpret_cast<void *>(Ret), Guid, NumCallsites,
                        NumCounters, Ret->guid(), Ret->callsites_size(),
                        Ret->counters_size());
  onContextEnter(*Ret);
  return Ret;
}

ContextNode *__llvm_ctx_profile_start_context(FunctionData *FData, GUID Guid,
                                              uint32_t Counters,
                                              uint32_t Callsites) {

  return tryStartContextGivenRoot(FData->getOrAllocateContextRoot(), Guid,
                                  Counters, Callsites);
}

void __llvm_ctx_profile_release_context(FunctionData *FData)
    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  const auto *CurrentRoot = __llvm_ctx_profile_current_context_root;
  if (!CurrentRoot || FData->CtxRoot != CurrentRoot)
    return;
  IsUnderContext = false;
  assert(FData->CtxRoot);
  __llvm_ctx_profile_current_context_root = nullptr;
  FData->CtxRoot->Taken.Unlock();
}

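// Summary of the behavior below (see CtxInstrProfiling.h for the declaration):
// reset the counters of every context root created so far, then mark profiling
// as started. If AutodetectDuration is non-zero, also start a RootAutoDetector
// (see RootAutoDetector.h), which, as the name suggests, samples over that
// duration to detect roots automatically; it is intentionally leaked (see the
// comment inside).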
void __llvm_ctx_profile_start_collection(unsigned AutodetectDuration) {
  size_t NumMemUnits = 0;
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
        &Root->Taken);
    for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
      ++NumMemUnits;

    resetContextNode(*Root->FirstNode);
    if (Root->FirstUnhandledCalleeNode)
      resetContextNode(*Root->FirstUnhandledCalleeNode);
    __sanitizer::atomic_store_relaxed(&Root->TotalEntries, 0);
  }
  if (AutodetectDuration) {
    // We leak RD intentionally. Knowing when to free it is tricky: there's a
    // race condition with functions observing the `RootDetector` as non-null.
    // This can be addressed, but the alternatives have some added complexity
    // and it's not (yet) worth it.
    auto *RD = new (__sanitizer::InternalAlloc(sizeof(RootAutoDetector)))
        RootAutoDetector(AllFunctionsData, RootDetector, AutodetectDuration);
    RD->start();
  } else {
    __sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
  }
  __sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
}

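// Summary of the behavior below: stop profiling, wait for a running root
// auto-detector (if any) to finish, then, under the all-contexts lock, validate
// each root and hand it to the ProfileWriter (contextual section), followed by
// the flat profiles of functions that never became roots (flat section).
// Returns false if any root fails validation.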
bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
  __sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
  if (auto *RD = getRootDetector()) {
    __sanitizer::Printf("[ctxprof] Expected the root autodetector to have "
                        "finished well before attempting to fetch a context");
    RD->join();
  }

  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);

  Writer.startContextSection();
  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
        &Root->Taken);
    if (!validate(Root)) {
      __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
      return false;
    }
    Writer.writeContextual(
        *Root->FirstNode, Root->FirstUnhandledCalleeNode,
        __sanitizer::atomic_load_relaxed(&Root->TotalEntries));
  }
  Writer.endContextSection();
  Writer.startFlatSection();
  // New elements are only ever prepended at the head, so taking this snapshot
  // of the head lets the list keep growing concurrently without racing with
  // our traversal.
  const auto *Pos = reinterpret_cast<const FunctionData *>(
      __sanitizer::atomic_load_relaxed(&AllFunctionsData));
  for (; Pos; Pos = Pos->Next)
    if (!Pos->CtxRoot)
      Writer.writeFlat(Pos->FlatCtx->guid(), Pos->FlatCtx->counters(),
                       Pos->FlatCtx->counters_size());
  Writer.endFlatSection();
  return true;
}

void __llvm_ctx_profile_free() {
  __sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
  {
    __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
        &AllContextsMutex);
    for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
      for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
        auto *C = A;
        A = A->next();
        __sanitizer::InternalFree(C);
      }
    AllContextRoots.Reset();
  }
  __sanitizer::atomic_store_relaxed(&AllFunctionsData, 0U);
  {
    __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
        &FlatCtxArenaMutex);
    FlatCtxArena = nullptr;
    for (auto *A = FlatCtxArenaHead; A;) {
      auto *C = A;
      A = C->next();
      __sanitizer::InternalFree(C);
    }

    FlatCtxArenaHead = nullptr;
  }
}