[XRay] Simplify FDR buffer management

Summary:
This change makes XRay FDR mode use a single backing store for the
buffer queue, with each buffer referring to a region within that
backing store instead of owning its own allocation. We also remove the
FDR mode logging implementation's reliance on the sanitizer-internal
allocator.
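
Concretely, the BufferQueue constructor now mmaps one contiguous
region and hands out pointers into it; the sketch below summarizes the
relevant lines from the diff (error handling elided):

  BackingStore = allocateBuffer(BufferSize * BufferCount);
  for (size_t i = 0; i < BufferCount; ++i) {
    auto &Buf = Buffers[i].Buff;
    // Each buffer is a fixed-size slice of the single backing store.
    Buf.Data = reinterpret_cast<char *>(BackingStore) + (BufferSize * i);
    Buf.Size = BufferSize;
  }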

In the process of making this change we found an inconsistency between
the way we return buffers to the queue and the way we set the extents.
We take the chance to simplify how the extents of each buffer are
managed: the extra indirection for the extents turns out to be
unnecessary, so we co-host the atomic 64-bit integer with the buffer
object. We also discovered that the buffer owned by the thread running
the flush functionality was not being returned to the queue before
writing out the files, which meant we could end up missing its data.
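
The Buffer type now carries its extents directly as an atomic member
instead of pointing at a separately allocated BufferExtents object;
the header change boils down to:

  struct Buffer {
    atomic_uint64_t Extents{0};
    void *Data = nullptr;
    size_t Size = 0;
  };

Readers and writers then operate on the buffer itself, e.g.
atomic_load(&B.Extents, memory_order_acquire) when serializing a
buffer's records.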

We also consolidate all the allocation routines into xray_allocator.h,
replacing the ones that used to be defined in xray_buffer_queue.cc.
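
The consolidated helpers are thin wrappers over internal_mmap and
internal_munmap, plus a placement-new array initializer, roughly:

  template <class T = uint8_t>
  T *allocateBuffer(size_t S);               // mmap S * sizeof(T) bytes
  template <class T>
  void deallocateBuffer(T *B, size_t S);     // munmap, nullptr-safe
  template <class T, class... U>
  T *initArray(size_t N, U &&... Us);        // placement-new N Ts

Both xray_buffer_queue.cc and xray_fdr_logging.cc now use these
instead of the local allocRaw/deallocRaw helpers and
InternalAlloc/InternalFree.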

Reviewers: mboerger, eizan

Subscribers: jfb, llvm-commits

Differential Revision: https://reviews.llvm.org/D52077

git-svn-id: https://llvm.org/svn/llvm-project/compiler-rt/trunk@342356 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/xray/xray_allocator.h b/lib/xray/xray_allocator.h
index 5a78273..aea1280 100644
--- a/lib/xray/xray_allocator.h
+++ b/lib/xray/xray_allocator.h
@@ -50,23 +50,33 @@
   internal_munmap(B, sizeof(T));
 }
 
-inline void *allocateBuffer(size_t S) XRAY_NEVER_INSTRUMENT {
-  auto B = reinterpret_cast<void *>(internal_mmap(
-      NULL, S, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
+template <class T = uint8_t> T *allocateBuffer(size_t S) XRAY_NEVER_INSTRUMENT {
+  auto B = reinterpret_cast<void *>(
+      internal_mmap(NULL, S * sizeof(T), PROT_READ | PROT_WRITE,
+                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
   if (B == MAP_FAILED) {
     if (Verbosity())
       Report("XRay Profiling: Failed to allocate memory of size %d.\n", S);
     return nullptr;
   }
-  return B;
+  return reinterpret_cast<T *>(B);
 }
 
-inline void deallocateBuffer(void *B, size_t S) XRAY_NEVER_INSTRUMENT {
+template <class T> void deallocateBuffer(T *B, size_t S) XRAY_NEVER_INSTRUMENT {
   if (B == nullptr)
     return;
   internal_munmap(B, S);
 }
 
+template <class T, class... U>
+T *initArray(size_t N, U &&... Us) XRAY_NEVER_INSTRUMENT {
+  auto A = allocateBuffer<T>(N);
+  if (A != nullptr)
+    while (N > 0)
+      new (A + (--N)) T(std::forward<U>(Us)...);
+  return A;
+}
+
 /// The Allocator type hands out fixed-sized chunks of memory that are
 /// cache-line aligned and sized. This is useful for placement of
 /// performance-sensitive data in memory that's frequently accessed. The
diff --git a/lib/xray/xray_buffer_queue.cc b/lib/xray/xray_buffer_queue.cc
index 5bdd914..5a88ecd 100644
--- a/lib/xray/xray_buffer_queue.cc
+++ b/lib/xray/xray_buffer_queue.cc
@@ -16,68 +16,42 @@
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_libc.h"
 #include "sanitizer_common/sanitizer_posix.h"
+#include "xray_allocator.h"
+#include "xray_defs.h"
 #include <memory>
 #include <sys/mman.h>
 
 using namespace __xray;
 using namespace __sanitizer;
 
-template <class T> static T *allocRaw(size_t N) {
-  // TODO: Report errors?
-  void *A = reinterpret_cast<void *>(
-      internal_mmap(NULL, N * sizeof(T), PROT_WRITE | PROT_READ,
-                    MAP_ANONYMOUS | MAP_PRIVATE, -1, 0));
-  return (A == MAP_FAILED) ? nullptr : reinterpret_cast<T *>(A);
-}
-
-template <class T> static void deallocRaw(T *ptr, size_t N) {
-  // TODO: Report errors?
-  if (ptr != nullptr)
-    internal_munmap(ptr, N);
-}
-
-template <class T> static T *initArray(size_t N) {
-  auto A = allocRaw<T>(N);
-  if (A != nullptr)
-    while (N > 0)
-      new (A + (--N)) T();
-  return A;
-}
-
-BufferQueue::BufferQueue(size_t B, size_t N, bool &Success)
-    : BufferSize(B), Buffers(initArray<BufferQueue::BufferRep>(N)),
-      BufferCount(N), Finalizing{0}, OwnedBuffers(initArray<void *>(N)),
-      Next(Buffers), First(Buffers), LiveBuffers(0) {
-  if (Buffers == nullptr) {
+BufferQueue::BufferQueue(size_t B, size_t N,
+                         bool &Success) XRAY_NEVER_INSTRUMENT
+    : BufferSize(B),
+      BufferCount(N),
+      Mutex(),
+      Finalizing{0},
+      BackingStore(allocateBuffer(B * N)),
+      Buffers(initArray<BufferQueue::BufferRep>(N)),
+      Next(Buffers),
+      First(Buffers),
+      LiveBuffers(0) {
+  if (BackingStore == nullptr) {
     Success = false;
     return;
   }
-  if (OwnedBuffers == nullptr) {
-    // Clean up the buffers we've already allocated.
-    for (auto B = Buffers, E = Buffers + BufferCount; B != E; ++B)
-      B->~BufferRep();
-    deallocRaw(Buffers, N);
+  if (Buffers == nullptr) {
+    deallocateBuffer(BackingStore, BufferSize * BufferCount);
     Success = false;
     return;
-  };
+  }
 
   for (size_t i = 0; i < N; ++i) {
     auto &T = Buffers[i];
-    void *Tmp = allocRaw<char>(BufferSize);
-    if (Tmp == nullptr) {
-      Success = false;
-      return;
-    }
-    auto *Extents = allocRaw<BufferExtents>(1);
-    if (Extents == nullptr) {
-      Success = false;
-      return;
-    }
     auto &Buf = T.Buff;
-    Buf.Data = Tmp;
+    Buf.Data = reinterpret_cast<char *>(BackingStore) + (BufferSize * i);
     Buf.Size = B;
-    Buf.Extents = Extents;
-    OwnedBuffers[i] = Tmp;
+    atomic_store(&Buf.Extents, 0, memory_order_release);
+    T.Used = false;
   }
   Success = true;
 }
@@ -85,13 +59,17 @@
 BufferQueue::ErrorCode BufferQueue::getBuffer(Buffer &Buf) {
   if (atomic_load(&Finalizing, memory_order_acquire))
     return ErrorCode::QueueFinalizing;
+
   SpinMutexLock Guard(&Mutex);
   if (LiveBuffers == BufferCount)
     return ErrorCode::NotEnoughMemory;
 
   auto &T = *Next;
   auto &B = T.Buff;
-  Buf = B;
+  auto Extents = atomic_load(&B.Extents, memory_order_acquire);
+  atomic_store(&Buf.Extents, Extents, memory_order_release);
+  Buf.Data = B.Data;
+  Buf.Size = B.Size;
   T.Used = true;
   ++LiveBuffers;
 
@@ -102,15 +80,11 @@
 }
 
 BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) {
-  // Blitz through the buffers array to find the buffer.
-  bool Found = false;
-  for (auto I = OwnedBuffers, E = OwnedBuffers + BufferCount; I != E; ++I) {
-    if (*I == Buf.Data) {
-      Found = true;
-      break;
-    }
-  }
-  if (!Found)
+  // Check whether the buffer being referred to is within the bounds of the
+  // backing store's range.
+  if (Buf.Data < BackingStore ||
+      Buf.Data >
+          reinterpret_cast<char *>(BackingStore) + (BufferCount * BufferSize))
     return ErrorCode::UnrecognizedBuffer;
 
   SpinMutexLock Guard(&Mutex);
@@ -121,10 +95,14 @@
     return ErrorCode::NotEnoughMemory;
 
   // Now that the buffer has been released, we mark it as "used".
-  First->Buff = Buf;
+  auto Extents = atomic_load(&Buf.Extents, memory_order_acquire);
+  atomic_store(&First->Buff.Extents, Extents, memory_order_release);
+  First->Buff.Data = Buf.Data;
+  First->Buff.Size = Buf.Size;
   First->Used = true;
   Buf.Data = nullptr;
   Buf.Size = 0;
+  atomic_store(&Buf.Extents, 0, memory_order_release);
   --LiveBuffers;
   if (++First == (Buffers + BufferCount))
     First = Buffers;
@@ -139,14 +117,8 @@
 }
 
 BufferQueue::~BufferQueue() {
-  for (auto I = Buffers, E = Buffers + BufferCount; I != E; ++I) {
-    auto &T = *I;
-    auto &Buf = T.Buff;
-    deallocRaw(Buf.Data, Buf.Size);
-    deallocRaw(Buf.Extents, 1);
-  }
   for (auto B = Buffers, E = Buffers + BufferCount; B != E; ++B)
     B->~BufferRep();
-  deallocRaw(Buffers, BufferCount);
-  deallocRaw(OwnedBuffers, BufferCount);
+  deallocateBuffer(Buffers, BufferCount);
+  deallocateBuffer(BackingStore, BufferSize * BufferCount);
 }
diff --git a/lib/xray/xray_buffer_queue.h b/lib/xray/xray_buffer_queue.h
index e76fa79..bfb091e 100644
--- a/lib/xray/xray_buffer_queue.h
+++ b/lib/xray/xray_buffer_queue.h
@@ -18,6 +18,7 @@
 #include "sanitizer_common/sanitizer_atomic.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_mutex.h"
+#include "xray_defs.h"
 #include <cstddef>
 
 namespace __xray {
@@ -29,14 +30,10 @@
 /// trace collection.
 class BufferQueue {
 public:
-  struct alignas(64) BufferExtents {
-    atomic_uint64_t Size;
-  };
-
   struct Buffer {
+    atomic_uint64_t Extents{0};
     void *Data = nullptr;
     size_t Size = 0;
-    BufferExtents *Extents;
   };
 
   struct BufferRep {
@@ -76,8 +73,10 @@
 
     T *operator->() const { return &(Buffers[Offset].Buff); }
 
-    Iterator(BufferRep *Root, size_t O, size_t M)
-        : Buffers(Root), Offset(O), Max(M) {
+    Iterator(BufferRep *Root, size_t O, size_t M) XRAY_NEVER_INSTRUMENT
+        : Buffers(Root),
+          Offset(O),
+          Max(M) {
       // We want to advance to the first Offset where the 'Used' property is
       // true, or to the end of the list/queue.
       while (!Buffers[Offset].Used && Offset != Max) {
@@ -107,16 +106,18 @@
   // Size of each individual Buffer.
   size_t BufferSize;
 
-  BufferRep *Buffers;
-
   // Amount of pre-allocated buffers.
   size_t BufferCount;
 
   SpinMutex Mutex;
   atomic_uint8_t Finalizing;
 
-  // Pointers to buffers managed/owned by the BufferQueue.
-  void **OwnedBuffers;
+  // A pointer to a contiguous block of memory to serve as the backing store for
+  // all the individual buffers handed out.
+  void *BackingStore;
+
+  // A dynamically allocated array of BufferRep instances.
+  BufferRep *Buffers;
 
   // Pointer to the next buffer to be handed out.
   BufferRep *Next;
@@ -198,7 +199,7 @@
   /// Applies the provided function F to each Buffer in the queue, only if the
   /// Buffer is marked 'used' (i.e. has been the result of getBuffer(...) and a
   /// releaseBuffer(...) operation).
-  template <class F> void apply(F Fn) {
+  template <class F> void apply(F Fn) XRAY_NEVER_INSTRUMENT {
     SpinMutexLock G(&Mutex);
     for (auto I = begin(), E = end(); I != E; ++I)
       Fn(*I);
diff --git a/lib/xray/xray_fdr_logging.cc b/lib/xray/xray_fdr_logging.cc
index 4b308b2..131c137 100644
--- a/lib/xray/xray_fdr_logging.cc
+++ b/lib/xray/xray_fdr_logging.cc
@@ -30,6 +30,7 @@
 #include "sanitizer_common/sanitizer_common.h"
 #include "xray/xray_interface.h"
 #include "xray/xray_records.h"
+#include "xray_allocator.h"
 #include "xray_buffer_queue.h"
 #include "xray_defs.h"
 #include "xray_fdr_flags.h"
@@ -47,7 +48,7 @@
 // force the alignment to 64-bytes for x86 cache line alignment, as this
 // structure is used in the hot path of implementation.
 struct alignas(64) ThreadLocalData {
-  BufferQueue::Buffer Buffer;
+  BufferQueue::Buffer Buffer{};
   char *RecordPtr = nullptr;
   // The number of FunctionEntry records immediately preceding RecordPtr.
   uint8_t NumConsecutiveFnEnters = 0;
@@ -81,6 +82,7 @@
 static pthread_key_t Key;
 
 // Global BufferQueue.
+static std::aligned_storage<sizeof(BufferQueue)>::type BufferQueueStorage;
 static BufferQueue *BQ = nullptr;
 
 static atomic_sint32_t LogFlushStatus = {
@@ -184,11 +186,10 @@
   TLD.NumTailCalls = 0;
   if (TLD.BQ == nullptr || TLD.BQ->finalizing())
     return;
-  internal_memcpy(TLD.RecordPtr, Metadata, sizeof(Metadata));
-  TLD.RecordPtr += sizeof(Metadata);
-  // Since we write out the extents as the first metadata record of the
-  // buffer, we need to write out the extents including the extents record.
-  atomic_store(&TLD.Buffer.Extents->Size, sizeof(Metadata),
+  internal_memcpy(TLD.RecordPtr, Metadata,
+                  sizeof(MetadataRecord) * InitRecordsCount);
+  TLD.RecordPtr += sizeof(MetadataRecord) * InitRecordsCount;
+  atomic_store(&TLD.Buffer.Extents, sizeof(MetadataRecord) * InitRecordsCount,
                memory_order_release);
 }
 
@@ -209,12 +210,12 @@
 
 static void incrementExtents(size_t Add) {
   auto &TLD = getThreadLocalData();
-  atomic_fetch_add(&TLD.Buffer.Extents->Size, Add, memory_order_acq_rel);
+  atomic_fetch_add(&TLD.Buffer.Extents, Add, memory_order_acq_rel);
 }
 
 static void decrementExtents(size_t Subtract) {
   auto &TLD = getThreadLocalData();
-  atomic_fetch_sub(&TLD.Buffer.Extents->Size, Subtract, memory_order_acq_rel);
+  atomic_fetch_sub(&TLD.Buffer.Extents, Subtract, memory_order_acq_rel);
 }
 
 static void writeNewCPUIdMetadata(uint16_t CPU,
@@ -583,7 +584,7 @@
 
   auto &TLD = getThreadLocalData();
 
-  if (TLD.BQ == nullptr)
+  if (TLD.BQ == nullptr && BQ != nullptr)
     TLD.BQ = BQ;
 
   if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, wall_clock_reader))
@@ -758,6 +759,7 @@
   static BufferQueue::const_iterator It{};
   static BufferQueue::const_iterator End{};
   static void *CurrentBuffer{nullptr};
+  static size_t SerializedBufferSize = 0;
   if (B.Data == static_cast<void *>(&Header) && B.Size == sizeof(Header)) {
     // From this point on, we provide raw access to the raw buffer we're getting
     // from the BufferQueue. We're relying on the iterators from the current
@@ -767,7 +769,7 @@
   }
 
   if (CurrentBuffer != nullptr) {
-    InternalFree(CurrentBuffer);
+    deallocateBuffer(CurrentBuffer, SerializedBufferSize);
     CurrentBuffer = nullptr;
   }
 
@@ -778,9 +780,9 @@
   // out to disk. The difference here would be that we still write "empty"
   // buffers, or at least go through the iterators faithfully to let the
   // handlers see the empty buffers in the queue.
-  auto BufferSize = atomic_load(&It->Extents->Size, memory_order_acquire);
-  auto SerializedBufferSize = BufferSize + sizeof(MetadataRecord);
-  CurrentBuffer = InternalAlloc(SerializedBufferSize);
+  auto BufferSize = atomic_load(&It->Extents, memory_order_acquire);
+  SerializedBufferSize = BufferSize + sizeof(MetadataRecord);
+  CurrentBuffer = allocateBuffer(SerializedBufferSize);
   if (CurrentBuffer == nullptr)
     return {nullptr, 0};
 
@@ -848,7 +850,6 @@
       if (TLD.RecordPtr != nullptr && TLD.BQ != nullptr)
         releaseThreadLocalBuffer(*TLD.BQ);
       BQ->~BufferQueue();
-      InternalFree(BQ);
       BQ = nullptr;
     }
   });
@@ -883,6 +884,13 @@
   retryingWriteAll(Fd, reinterpret_cast<char *>(&Header),
                    reinterpret_cast<char *>(&Header) + sizeof(Header));
 
+  // Release the current thread's buffer before we attempt to write out all the
+  // buffers. This ensures that in case we had only a single thread going, that
+  // we are able to capture the data nonetheless.
+  auto &TLD = getThreadLocalData();
+  if (TLD.RecordPtr != nullptr && TLD.BQ != nullptr)
+    releaseThreadLocalBuffer(*TLD.BQ);
+
   BQ->apply([&](const BufferQueue::Buffer &B) {
     // Starting at version 2 of the FDR logging implementation, we only write
     // the records identified by the extents of the buffer. We use the Extents
@@ -890,7 +898,7 @@
     // still use a Metadata record, but fill in the extents instead for the
     // data.
     MetadataRecord ExtentsRecord;
-    auto BufferExtents = atomic_load(&B.Extents->Size, memory_order_acquire);
+    auto BufferExtents = atomic_load(&B.Extents, memory_order_acquire);
     DCHECK(BufferExtents <= B.Size);
     ExtentsRecord.Type = uint8_t(RecordType::Metadata);
     ExtentsRecord.RecordKind =
@@ -922,7 +930,12 @@
 
   // Do special things to make the log finalize itself, and not allow any more
   // operations to be performed until re-initialized.
-  BQ->finalize();
+  if (BQ == nullptr) {
+    if (Verbosity())
+      Report("Attempting to finalize an uninitialized global buffer!\n");
+  } else {
+    BQ->finalize();
+  }
 
   atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_FINALIZED,
                memory_order_release);
@@ -1132,13 +1145,11 @@
 
   if (BQ != nullptr) {
     BQ->~BufferQueue();
-    InternalFree(BQ);
     BQ = nullptr;
   }
 
   if (BQ == nullptr) {
-    BQ = reinterpret_cast<BufferQueue *>(
-        InternalAlloc(sizeof(BufferQueue), nullptr, 64));
+    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
     new (BQ) BufferQueue(BufferSize, BufferMax, Success);
   }
 
@@ -1146,7 +1157,6 @@
     Report("BufferQueue init failed.\n");
     if (BQ != nullptr) {
       BQ->~BufferQueue();
-      InternalFree(BQ);
       BQ = nullptr;
     }
     return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;