//===-- xray_profile_collector.cpp -----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
// This implements the interface for the profileCollectorService.
//
//===----------------------------------------------------------------------===//
#include "xray_profile_collector.h"
#include "sanitizer_common/sanitizer_common.h"
#include "xray_allocator.h"
#include "xray_defs.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
#include <memory>
#include <pthread.h>
#include <utility>

namespace __xray {
namespace profileCollectorService {

namespace {

SpinMutex GlobalMutex;
struct ThreadTrie {
  tid_t TId;
  typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
};

struct ProfileBuffer {
  void *Data;
  size_t Size;
};

// Current version of the profile format.
constexpr u64 XRayProfilingVersion = 0x20180424;

// Identifier for XRay profiling files 'xrayprof' in hex.
constexpr u64 XRayMagicBytes = 0x7872617970726f66;

struct XRayProfilingFileHeader {
  const u64 MagicBytes = XRayMagicBytes;
  const u64 Version = XRayProfilingVersion;
  u64 Timestamp = 0; // System time in nanoseconds.
  u64 PID = 0;       // Process ID.
};
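
// An instance of the header above is the first buffer handed out by
// nextBuffer() below, ahead of the per-thread profile blocks.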

struct BlockHeader {
  u32 BlockSize;
  u32 BlockNum;
  u64 ThreadId;
};
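
// Sketch of the serialized layout of one per-thread block, as produced by
// serializeRecords() below:
//
//   BlockHeader                       (4 + 4 + 8 bytes)
//   repeated records, each holding:
//     function IDs, leaf to root      (4 bytes each)
//     sentinel 0                      (4 bytes)
//     CallCount                       (8 bytes, u64)
//     CumulativeLocalTime             (8 bytes, u64)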

struct ThreadData {
  BufferQueue *BQ;
  FunctionCallTrie::Allocators::Buffers Buffers;
  FunctionCallTrie::Allocators Allocators;
  FunctionCallTrie FCT;
  tid_t TId;
};
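
// A ThreadData entry takes ownership of the trie, allocators, and buffer-queue
// buffers a thread hands over through post(); keeping them together lets
// reset() later release those buffers back to the originating BufferQueue.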

using ThreadDataArray = Array<ThreadData>;
using ThreadDataAllocator = ThreadDataArray::AllocatorType;

// We use a separate buffer queue for the backing store for the allocator used
// by the ThreadData array. This lets us host the buffers, allocators, and tries
// associated with a thread by moving the data into the array instead of
// attempting to copy the data to a separately backed set of tries.
static typename std::aligned_storage<
    sizeof(BufferQueue), alignof(BufferQueue)>::type BufferQueueStorage;
static BufferQueue *BQ = nullptr;
static BufferQueue::Buffer Buffer;
static typename std::aligned_storage<sizeof(ThreadDataAllocator),
                                     alignof(ThreadDataAllocator)>::type
    ThreadDataAllocatorStorage;
static typename std::aligned_storage<sizeof(ThreadDataArray),
                                     alignof(ThreadDataArray)>::type
    ThreadDataArrayStorage;

static ThreadDataAllocator *TDAllocator = nullptr;
static ThreadDataArray *TDArray = nullptr;

using ProfileBufferArray = Array<ProfileBuffer>;
using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;

// These need to be global aligned storage to avoid dynamic initialization. We
// need these to be aligned to allow us to placement new objects into the
// storage, and have pointers to those objects be appropriately aligned.
static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
    ProfileBuffersStorage;
static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
    ProfileBufferArrayAllocatorStorage;

static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
static ProfileBufferArray *ProfileBuffers = nullptr;

// Use a global flag to determine whether the collector implementation has been
// initialized.
static atomic_uint8_t CollectorInitialized{0};

} // namespace

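// Takes ownership of a thread's FunctionCallTrie together with the allocators
// and buffers backing it, and stashes them in the global ThreadData array for
// later serialization. If the collector is not initialized, or the append
// fails, the objects are destroyed and their buffers released back to the
// queue instead.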
void post(BufferQueue *Q, FunctionCallTrie &&T,
          FunctionCallTrie::Allocators &&A,
          FunctionCallTrie::Allocators::Buffers &&B,
          tid_t TId) XRAY_NEVER_INSTRUMENT {
  DCHECK_NE(Q, nullptr);

  // Bail out early if the collector has not been initialized.
  if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
    T.~FunctionCallTrie();
    A.~Allocators();
    Q->releaseBuffer(B.NodeBuffer);
    Q->releaseBuffer(B.RootsBuffer);
    Q->releaseBuffer(B.ShadowStackBuffer);
    Q->releaseBuffer(B.NodeIdPairBuffer);
    B.~Buffers();
    return;
  }

  {
    SpinMutexLock Lock(&GlobalMutex);
    DCHECK_NE(TDAllocator, nullptr);
    DCHECK_NE(TDArray, nullptr);

    if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
                               TId) == nullptr) {
      // If we fail to add the data to the array, we should destroy the objects
      // handed to us.
      T.~FunctionCallTrie();
      A.~Allocators();
      Q->releaseBuffer(B.NodeBuffer);
      Q->releaseBuffer(B.RootsBuffer);
      Q->releaseBuffer(B.ShadowStackBuffer);
      Q->releaseBuffer(B.NodeIdPairBuffer);
      B.~Buffers();
    }
  }
}

// A PathArray represents the function IDs that make up a stack trace. In this
// context a path is almost always represented from the leaf function in a
// call stack to a root of the call trie.
using PathArray = Array<int32_t>;

struct ProfileRecord {
  using PathAllocator = typename PathArray::AllocatorType;

  // The Path in this record is the list of function IDs from the leaf to the
  // root of the function call stack, as represented in a FunctionCallTrie.
  PathArray Path;
  const FunctionCallTrie::Node *Node;
};
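
// For instance, given a hypothetical call chain f0() -> f1() -> f2() recorded
// in the trie, the ProfileRecord for f2's node carries
// Path = {FId(f2), FId(f1), FId(f0)}, i.e. the leaf's function ID first (see
// populateRecords below).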

namespace {

using ProfileRecordArray = Array<ProfileRecord>;

// Perform a depth-first traversal from each root of the FunctionCallTrie to
// generate the paths and the data associated with each path.
static void
populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA,
                const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT {
  using StackArray = Array<const FunctionCallTrie::Node *>;
  using StackAllocator = typename StackArray::AllocatorType;
  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
  StackArray DFSStack(StackAlloc);
  for (const auto *R : Trie.getRoots()) {
    DFSStack.Append(R);
    while (!DFSStack.empty()) {
      auto *Node = DFSStack.back();
      DFSStack.trim(1);
      if (Node == nullptr)
        continue;
      auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
      if (Record == nullptr)
        return;
      DCHECK_NE(Record, nullptr);

      // Traverse the Node's parents and as we're doing so, get the FIds in
      // the order they appear.
      for (auto N = Node; N != nullptr; N = N->Parent)
        Record->Path.Append(N->FId);
      DCHECK(!Record->Path.empty());

      for (const auto C : Node->Callees)
        DFSStack.Append(C.NodePtr);
    }
  }
}

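// Writes one per-thread block into Buffer->Data: the BlockHeader first, then
// for each record its path (function IDs from leaf to root), a sentinel 0,
// the call count, and the cumulative local time.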
static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
                             const ProfileRecordArray &ProfileRecords)
    XRAY_NEVER_INSTRUMENT {
  auto NextPtr = static_cast<uint8_t *>(
                     internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
                 sizeof(Header);
  for (const auto &Record : ProfileRecords) {
    // The list of function IDs follows:
    for (const auto FId : Record.Path)
      NextPtr =
          static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
          sizeof(FId);

    // Add the sentinel here.
    constexpr int32_t SentinelFId = 0;
    NextPtr = static_cast<uint8_t *>(
                  internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
              sizeof(SentinelFId);

    // Add the node data here.
    NextPtr =
        static_cast<uint8_t *>(internal_memcpy(
            NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) +
        sizeof(Record.Node->CallCount);
    NextPtr = static_cast<uint8_t *>(
                  internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
                                  sizeof(Record.Node->CumulativeLocalTime))) +
              sizeof(Record.Node->CumulativeLocalTime);
  }

  DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size);
}

} // namespace

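// Rebuilds the global ProfileBuffers from the current per-thread tries: each
// trie is flattened into ProfileRecords and then serialized into one
// contiguous block per thread.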
void serialize() XRAY_NEVER_INSTRUMENT {
  if (!atomic_load(&CollectorInitialized, memory_order_acquire))
    return;

  SpinMutexLock Lock(&GlobalMutex);

  // Clear out the global ProfileBuffers, if it's not empty.
  for (auto &B : *ProfileBuffers)
    deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
  ProfileBuffers->trim(ProfileBuffers->size());

  DCHECK_NE(TDArray, nullptr);
  if (TDArray->empty())
    return;

  // Then repopulate the global ProfileBuffers.
  u32 I = 0;
  auto MaxSize = profilingFlags()->global_allocator_max;
  auto ProfileArena = allocateBuffer(MaxSize);
  if (ProfileArena == nullptr)
    return;

  auto ProfileArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });

  auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max);
  if (PathArena == nullptr)
    return;

  auto PathArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });

  for (const auto &ThreadTrie : *TDArray) {
    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
    ProfileRecordAllocator PRAlloc(ProfileArena,
                                   profilingFlags()->global_allocator_max);
    ProfileRecord::PathAllocator PathAlloc(
        PathArena, profilingFlags()->global_allocator_max);
    ProfileRecordArray ProfileRecords(PRAlloc);

    // First, we want to compute the amount of space we're going to need. We
    // use a local allocator and an __xray::Array<...> to store the
    // intermediate data, computing the size as we go. Then we allocate the
    // contiguous space to contain the thread buffer data.
    if (ThreadTrie.FCT.getRoots().empty())
      continue;

    populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
    DCHECK(!ThreadTrie.FCT.getRoots().empty());
    DCHECK(!ProfileRecords.empty());

    // Go through each record, to compute the sizes.
    //
    // header size = block size (4 bytes)
    //             + block number (4 bytes)
    //             + thread id (8 bytes)
    // record size = path ids (4 bytes * number of ids)
    //             + sentinel (4 bytes)
    //             + call count (8 bytes)
    //             + local time (8 bytes)
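    //
    // For example, a record whose path holds three function IDs takes
    // 3 * 4 + 4 + 8 + 8 = 32 bytes, i.e. 20 + 4 * Path.size().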
    u32 CumulativeSizes = 0;
    for (const auto &Record : ProfileRecords)
      CumulativeSizes += 20 + (4 * Record.Path.size());

    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
    auto B = ProfileBuffers->Append({});
    B->Size = sizeof(Header) + CumulativeSizes;
    B->Data = allocateBuffer(B->Size);
    DCHECK_NE(B->Data, nullptr);
    serializeRecords(B, Header, ProfileRecords);
  }
}

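// Drops the per-thread data collected so far, releasing its buffers back to
// the owning queues, then re-initializes the backing BufferQueue, the
// ProfileBuffers array, and the ThreadData array before marking the collector
// initialized again.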
void reset() XRAY_NEVER_INSTRUMENT {
  atomic_store(&CollectorInitialized, 0, memory_order_release);
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers != nullptr) {
    // Clear out the profile buffers that have been serialized.
    for (auto &B : *ProfileBuffers)
      deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
    ProfileBuffers->trim(ProfileBuffers->size());
    ProfileBuffers = nullptr;
  }

  if (TDArray != nullptr) {
    // Release the resources as required.
    for (auto &TD : *TDArray) {
      TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
    }
    // We don't bother destroying the array here because we may have already
    // freed the backing store for the array. Instead we reset the pointer to
    // nullptr and re-use the storage later (placement-new'ing into the storage
    // as-is).
    TDArray = nullptr;
  }

  if (TDAllocator != nullptr) {
    TDAllocator->~Allocator();
    TDAllocator = nullptr;
  }

  if (Buffer.Data != nullptr) {
    BQ->releaseBuffer(Buffer);
  }

  if (BQ == nullptr) {
    bool Success = false;
    new (&BufferQueueStorage)
        BufferQueue(profilingFlags()->global_allocator_max, 1, Success);
    if (!Success)
      return;
    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
  } else {
    BQ->finalize();

    if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
        BufferQueue::ErrorCode::Ok)
      return;
  }

  if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
    return;

  new (&ProfileBufferArrayAllocatorStorage)
      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
      &ProfileBufferArrayAllocatorStorage);

  new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
  ProfileBuffers =
      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);

  new (&ThreadDataAllocatorStorage)
      ThreadDataAllocator(Buffer.Data, Buffer.Size);
  TDAllocator =
      reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage);
  new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator);
  TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage);

  atomic_store(&CollectorInitialized, 1, memory_order_release);
}

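// Buffer iteration interface for the serialized profile: calling this with an
// empty XRayBuffer ({nullptr, 0}) yields the file header; passing back the
// previously returned buffer yields the next per-thread block, located through
// the BlockNum stored at the start of that block; a {nullptr, 0} result means
// there are no more buffers.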
XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
    return {nullptr, 0};

  static pthread_once_t Once = PTHREAD_ONCE_INIT;
  static typename std::aligned_storage<sizeof(XRayProfilingFileHeader)>::type
      FileHeaderStorage;
  pthread_once(
      &Once, +[]() XRAY_NEVER_INSTRUMENT {
        new (&FileHeaderStorage) XRayProfilingFileHeader{};
      });

  if (UNLIKELY(B.Data == nullptr)) {
    // The first buffer should always contain the file header information.
    auto &FileHeader =
        *reinterpret_cast<XRayProfilingFileHeader *>(&FileHeaderStorage);
    FileHeader.Timestamp = NanoTime();
    FileHeader.PID = internal_getpid();
    return {&FileHeaderStorage, sizeof(XRayProfilingFileHeader)};
  }

  if (UNLIKELY(B.Data == &FileHeaderStorage))
    return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};

  BlockHeader Header;
  internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
  auto NextBlock = Header.BlockNum + 1;
  if (NextBlock < ProfileBuffers->size())
    return {(*ProfileBuffers)[NextBlock].Data,
            (*ProfileBuffers)[NextBlock].Size};
  return {nullptr, 0};
}

} // namespace profileCollectorService
} // namespace __xray