tsan: mmap shadow stack

We used to mmap the C++ shadow stack as part of the trace region
before ed7f3f5bc9 ("tsan: move shadow stack into ThreadState"),
which moved the shadow stack into TLS. The move to TLS started
causing timeouts and OOMs on some of our internal tests that
repeatedly create and destroy thousands of threads.
Allocate the C++ shadow stack with mmap and small pages again.
This prevents the observed timeouts and OOMs.
But we now need to be more careful with interceptors that run
after thread finalization, because FuncEntry/Exit and
TraceAddEvent all need the shadow stack.
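
For illustration only, a minimal, hypothetical sketch of the lifetime
this change assumes (ThreadStateSketch, ThreadStartSketch,
ThreadFinishSketch and the size constant below are made up, not part
of the patch): the shadow stack is mmap-ed when the thread starts and
unmapped when it finishes, so any interceptor that can still run
afterwards must either skip the shadow-stack path (via is_inited) or
assert on the pointer.

  #include <sys/mman.h>
  #include <cstdint>

  using uptr = uintptr_t;
  // Illustrative slot count only; not the real kShadowStackSize.
  constexpr uptr kStackSlots = 64 * 1024;

  struct ThreadStateSketch {
    uptr *shadow_stack = nullptr;
    bool is_inited = false;
  };

  void ThreadStartSketch(ThreadStateSketch *thr) {
    // mmap with MAP_NORESERVE keeps the per-thread footprint lazy:
    // pages are committed only when the shadow stack grows into them.
    thr->shadow_stack = static_cast<uptr *>(
        mmap(nullptr, kStackSlots * sizeof(uptr), PROT_READ | PROT_WRITE,
             MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0));
    thr->is_inited = true;
  }

  void ThreadFinishSketch(ThreadStateSketch *thr) {
    thr->is_inited = false;  // allocation hooks stop imitating writes
    munmap(thr->shadow_stack, kStackSlots * sizeof(uptr));
    thr->shadow_stack = nullptr;  // trace paths can assert on the pointer
  }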

Reviewed By: vitalybuka

Differential Revision: https://reviews.llvm.org/D113786

GitOrigin-RevId: b5ff187b7b51dd76f881e10c1c2b4033e672fb12
diff --git a/lib/tsan/rtl/tsan_mman.cpp b/lib/tsan/rtl/tsan_mman.cpp
index f1b6768..ef97ad0 100644
--- a/lib/tsan/rtl/tsan_mman.cpp
+++ b/lib/tsan/rtl/tsan_mman.cpp
@@ -220,7 +220,7 @@
 void OnUserAlloc(ThreadState *thr, uptr pc, uptr p, uptr sz, bool write) {
   DPrintf("#%d: alloc(%zu) = 0x%zx\n", thr->tid, sz, p);
   ctx->metamap.AllocBlock(thr, pc, p, sz);
-  if (write && thr->ignore_reads_and_writes == 0)
+  if (write && thr->ignore_reads_and_writes == 0 && thr->is_inited)
     MemoryRangeImitateWrite(thr, pc, (uptr)p, sz);
   else
     MemoryResetRange(thr, pc, (uptr)p, sz);
@@ -230,7 +230,7 @@
   CHECK_NE(p, (void*)0);
   uptr sz = ctx->metamap.FreeBlock(thr->proc(), p);
   DPrintf("#%d: free(0x%zx, %zu)\n", thr->tid, p, sz);
-  if (write && thr->ignore_reads_and_writes == 0)
+  if (write && thr->ignore_reads_and_writes == 0 && thr->is_inited)
     MemoryRangeFreed(thr, pc, (uptr)p, sz);
 }
 
diff --git a/lib/tsan/rtl/tsan_rtl.cpp b/lib/tsan/rtl/tsan_rtl.cpp
index 46dec04..ff7726e 100644
--- a/lib/tsan/rtl/tsan_rtl.cpp
+++ b/lib/tsan/rtl/tsan_rtl.cpp
@@ -148,15 +148,19 @@
 {
   CHECK_EQ(reinterpret_cast<uptr>(this) % SANITIZER_CACHE_LINE_SIZE, 0);
 #if !SANITIZER_GO
-  shadow_stack_pos = shadow_stack;
-  shadow_stack_end = shadow_stack + kShadowStackSize;
+  // C/C++ uses fixed size shadow stack.
+  const int kInitStackSize = kShadowStackSize;
+  shadow_stack = static_cast<uptr *>(
+      MmapNoReserveOrDie(kInitStackSize * sizeof(uptr), "shadow stack"));
+  SetShadowRegionHugePageMode(reinterpret_cast<uptr>(shadow_stack),
+                              kInitStackSize * sizeof(uptr));
 #else
-  // Setup dynamic shadow stack.
+  // Go uses malloc-allocated shadow stack with dynamic size.
   const int kInitStackSize = 8;
-  shadow_stack = (uptr *)Alloc(kInitStackSize * sizeof(uptr));
+  shadow_stack = static_cast<uptr *>(Alloc(kInitStackSize * sizeof(uptr)));
+#endif
   shadow_stack_pos = shadow_stack;
   shadow_stack_end = shadow_stack + kInitStackSize;
-#endif
 }
 
 #if !SANITIZER_GO
diff --git a/lib/tsan/rtl/tsan_rtl.h b/lib/tsan/rtl/tsan_rtl.h
index eab8370..c71b27e 100644
--- a/lib/tsan/rtl/tsan_rtl.h
+++ b/lib/tsan/rtl/tsan_rtl.h
@@ -159,12 +159,8 @@
 #if !SANITIZER_GO
   IgnoreSet mop_ignore_set;
   IgnoreSet sync_ignore_set;
-  // C/C++ uses fixed size shadow stack.
-  uptr shadow_stack[kShadowStackSize];
-#else
-  // Go uses malloc-allocated shadow stack with dynamic size.
-  uptr *shadow_stack;
 #endif
+  uptr *shadow_stack;
   uptr *shadow_stack_end;
   uptr *shadow_stack_pos;
   RawShadow *racy_shadow_addr;
@@ -616,6 +612,9 @@
                                         EventType typ, u64 addr) {
   if (!kCollectHistory)
     return;
+  // TraceSwitch accesses shadow_stack, but it's called infrequently,
+  // so we check it here proactively.
+  DCHECK(thr->shadow_stack);
   DCHECK_GE((int)typ, 0);
   DCHECK_LE((int)typ, 7);
   DCHECK_EQ(GetLsb(addr, kEventPCBits), addr);
diff --git a/lib/tsan/rtl/tsan_rtl_thread.cpp b/lib/tsan/rtl/tsan_rtl_thread.cpp
index 6e652ee..8532f5d 100644
--- a/lib/tsan/rtl/tsan_rtl_thread.cpp
+++ b/lib/tsan/rtl/tsan_rtl_thread.cpp
@@ -227,15 +227,11 @@
   if (thr->tls_addr && thr->tls_size)
     DontNeedShadowFor(thr->tls_addr, thr->tls_size);
   thr->is_dead = true;
+  thr->is_inited = false;
   ctx->thread_registry.FinishThread(thr->tid);
 }
 
 void ThreadContext::OnFinished() {
-#if SANITIZER_GO
-  Free(thr->shadow_stack);
-  thr->shadow_stack_pos = nullptr;
-  thr->shadow_stack_end = nullptr;
-#endif
   if (!detached) {
     thr->fast_state.IncrementEpoch();
     // Can't increment epoch w/o writing to the trace as well.
@@ -244,6 +240,15 @@
   }
   epoch1 = thr->fast_state.epoch();
 
+#if !SANITIZER_GO
+  UnmapOrDie(thr->shadow_stack, kShadowStackSize * sizeof(uptr));
+#else
+  Free(thr->shadow_stack);
+#endif
+  thr->shadow_stack = nullptr;
+  thr->shadow_stack_pos = nullptr;
+  thr->shadow_stack_end = nullptr;
+
   if (common_flags()->detect_deadlocks)
     ctx->dd->DestroyLogicalThread(thr->dd_lt);
   thr->clock.ResetCached(&thr->proc()->clock_cache);