[dfsan] Add origin chain utils

This is a part of https://reviews.llvm.org/D95835.

The design is based on MSan origin chains.

An 4-byte origin is a hash of an origin chain. An origin chain is a
pair of a stack hash id and a hash to its previous origin chain. 0 means
no previous origin chains exist. We limit the length of a chain to be
16. With origin_history_size = 0, the limit is removed.

The change does not have any test cases yet. The following change
will be adding test cases when the APIs are used.

Reviewed-by: morehouse

Differential Revision: https://reviews.llvm.org/D96160

GitOrigin-RevId: 2d9c6e10e92e983fb3ad497f29bea697a94c935a
diff --git a/lib/dfsan/CMakeLists.txt b/lib/dfsan/CMakeLists.txt
index b4e589b..fa6823c 100644
--- a/lib/dfsan/CMakeLists.txt
+++ b/lib/dfsan/CMakeLists.txt
@@ -3,6 +3,7 @@
 # Runtime library sources and build flags.
 set(DFSAN_RTL_SOURCES
   dfsan.cpp
+  dfsan_chained_origin_depot.cpp
   dfsan_custom.cpp
   dfsan_interceptors.cpp
   dfsan_thread.cpp
@@ -10,7 +11,9 @@
 
 set(DFSAN_RTL_HEADERS
   dfsan.h
+  dfsan_chained_origin_depot.h
   dfsan_flags.inc
+  dfsan_flags.h
   dfsan_platform.h
   dfsan_thread.h
   )
diff --git a/lib/dfsan/dfsan.cpp b/lib/dfsan/dfsan.cpp
index 31c176c..4f02c49 100644
--- a/lib/dfsan/dfsan.cpp
+++ b/lib/dfsan/dfsan.cpp
@@ -20,6 +20,9 @@
 
 #include "dfsan/dfsan.h"
 
+#include "dfsan/dfsan_chained_origin_depot.h"
+#include "dfsan/dfsan_flags.h"
+#include "dfsan/dfsan_origin.h"
 #include "dfsan/dfsan_thread.h"
 #include "sanitizer_common/sanitizer_atomic.h"
 #include "sanitizer_common/sanitizer_common.h"
@@ -282,6 +285,46 @@
   return label;
 }
 
+// For platforms which support slow unwinder only, we need to restrict the store
+// context size to 1, basically only storing the current pc, because the slow
+// unwinder which is based on libunwind is not async signal safe and causes
+// random freezes in forking applications as well as in signal handlers.
+// DFSan supports only Linux. So we do not restrict the store context size.
+#define GET_STORE_STACK_TRACE_PC_BP(pc, bp) \
+  BufferedStackTrace stack;                 \
+  stack.Unwind(pc, bp, nullptr, true, flags().store_context_size);
+
+#define PRINT_CALLER_STACK_TRACE        \
+  {                                     \
+    GET_CALLER_PC_BP_SP;                \
+    (void)sp;                           \
+    GET_STORE_STACK_TRACE_PC_BP(pc, bp) \
+    stack.Print();                      \
+  }
+
+static u32 ChainOrigin(u32 id, StackTrace *stack, bool from_init = false) {
+  // StackDepot is not async signal safe. Do not create new chains in a signal
+  // handler.
+  DFsanThread *t = GetCurrentThread();
+  if (t && t->InSignalHandler())
+    return id;
+
+  // As an optimization the origin of an application byte is updated only when
+  // its shadow is non-zero. Because we are only interested in the origins of
+  // taint labels, it does not matter what origin a zero label has. This reduces
+  // memory write cost. MSan does similar optimization. The following invariant
+  // may not hold because of some bugs. We check the invariant to help debug.
+  if (!from_init && id == 0 && flags().check_origin_invariant) {
+    Printf("  DFSan found invalid origin invariant\n");
+    PRINT_CALLER_STACK_TRACE
+  }
+
+  Origin o = Origin::FromRawId(id);
+  stack->tag = StackTrace::TAG_UNKNOWN;
+  Origin chained = Origin::CreateChainedOrigin(o, stack);
+  return chained.raw_id();
+}
+
 static void WriteShadowIfDifferent(dfsan_label label, uptr shadow_addr,
                                    uptr size) {
   dfsan_label *labelp = (dfsan_label *)shadow_addr;
diff --git a/lib/dfsan/dfsan.h b/lib/dfsan/dfsan.h
index e03716f..62eda73 100644
--- a/lib/dfsan/dfsan.h
+++ b/lib/dfsan/dfsan.h
@@ -15,6 +15,8 @@
 #define DFSAN_H
 
 #include "sanitizer_common/sanitizer_internal_defs.h"
+
+#include "dfsan_flags.h"
 #include "dfsan_platform.h"
 
 using __sanitizer::uptr;
@@ -58,19 +60,6 @@
   return shadow_for(const_cast<void *>(ptr));
 }
 
-struct Flags {
-#define DFSAN_FLAG(Type, Name, DefaultValue, Description) Type Name;
-#include "dfsan_flags.inc"
-#undef DFSAN_FLAG
-
-  void SetDefaults();
-};
-
-extern Flags flags_data;
-inline Flags &flags() {
-  return flags_data;
-}
-
 }  // namespace __dfsan
 
 #endif  // DFSAN_H
diff --git a/lib/dfsan/dfsan_chained_origin_depot.cpp b/lib/dfsan/dfsan_chained_origin_depot.cpp
new file mode 100644
index 0000000..9ec598b
--- /dev/null
+++ b/lib/dfsan/dfsan_chained_origin_depot.cpp
@@ -0,0 +1,22 @@
+//===-- dfsan_chained_origin_depot.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of DataFlowSanitizer.
+//
+// A storage for chained origins.
+//===----------------------------------------------------------------------===//
+
+#include "dfsan_chained_origin_depot.h"
+
+namespace __dfsan {
+
+static ChainedOriginDepot chainedOriginDepot;
+
+ChainedOriginDepot* GetChainedOriginDepot() { return &chainedOriginDepot; }
+
+}  // namespace __dfsan
diff --git a/lib/dfsan/dfsan_chained_origin_depot.h b/lib/dfsan/dfsan_chained_origin_depot.h
new file mode 100644
index 0000000..d715ef7
--- /dev/null
+++ b/lib/dfsan/dfsan_chained_origin_depot.h
@@ -0,0 +1,26 @@
+//===-- dfsan_chained_origin_depot.h ----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of DataFlowSanitizer.
+//
+// A storage for chained origins.
+//===----------------------------------------------------------------------===//
+
+#ifndef DFSAN_CHAINED_ORIGIN_DEPOT_H
+#define DFSAN_CHAINED_ORIGIN_DEPOT_H
+
+#include "sanitizer_common/sanitizer_chained_origin_depot.h"
+#include "sanitizer_common/sanitizer_common.h"
+
+namespace __dfsan {
+
+ChainedOriginDepot* GetChainedOriginDepot();
+
+}  // namespace __dfsan
+
+#endif  // DFSAN_CHAINED_ORIGIN_DEPOT_H
diff --git a/lib/dfsan/dfsan_flags.h b/lib/dfsan/dfsan_flags.h
new file mode 100644
index 0000000..ec7edf6
--- /dev/null
+++ b/lib/dfsan/dfsan_flags.h
@@ -0,0 +1,32 @@
+//===-- dfsan_flags.h -------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of DataFlowSanitizer.
+//
+// DFSan flags.
+//===----------------------------------------------------------------------===//
+
+#ifndef DFSAN_FLAGS_H
+#define DFSAN_FLAGS_H
+
+namespace __dfsan {
+
+struct Flags {
+#define DFSAN_FLAG(Type, Name, DefaultValue, Description) Type Name;
+#include "dfsan_flags.inc"
+#undef DFSAN_FLAG
+
+  void SetDefaults();
+};
+
+extern Flags flags_data;
+inline Flags &flags() { return flags_data; }
+
+}  // namespace __dfsan
+
+#endif  // DFSAN_FLAGS_H
diff --git a/lib/dfsan/dfsan_flags.inc b/lib/dfsan/dfsan_flags.inc
index cdd0035..4ac4ef7 100644
--- a/lib/dfsan/dfsan_flags.inc
+++ b/lib/dfsan/dfsan_flags.inc
@@ -29,3 +29,14 @@
 DFSAN_FLAG(const char *, dump_labels_at_exit, "", "The path of the file where "
                                                   "to dump the labels when the "
                                                   "program terminates.")
+DFSAN_FLAG(
+     int, origin_history_size, Origin::kMaxDepth,
+    "The limit of origin chain length. Non-positive values mean unlimited.")
+DFSAN_FLAG(
+     int, origin_history_per_stack_limit, 20000,
+    "The limit of origin node's references count. "
+    "Non-positive values mean unlimited.")
+DFSAN_FLAG(int, store_context_size, 20,
+           "The depth limit of origin tracking stack traces.")
+DFSAN_FLAG(bool, check_origin_invariant, false,
+           "Whether to check if the origin invariant holds.")
diff --git a/lib/dfsan/dfsan_origin.h b/lib/dfsan/dfsan_origin.h
new file mode 100644
index 0000000..89fd7f9
--- /dev/null
+++ b/lib/dfsan/dfsan_origin.h
@@ -0,0 +1,127 @@
+//===-- dfsan_origin.h ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of DataFlowSanitizer.
+//
+// Origin id utils.
+//===----------------------------------------------------------------------===//
+
+#ifndef DFSAN_ORIGIN_H
+#define DFSAN_ORIGIN_H
+
+#include "dfsan_chained_origin_depot.h"
+#include "dfsan_flags.h"
+#include "sanitizer_common/sanitizer_stackdepot.h"
+
+namespace __dfsan {
+
+// Origin handling.
+//
+// Origin is a 32-bit identifier that is attached to any taint value in the
+// program and describes how this memory came to be tainted.
+//
+// Chained origin id is like:
+// zzzz xxxx xxxx xxxx
+//
+// Chained origin id describes an event of storing a taint value to
+// memory. The xxx part is a value of ChainedOriginDepot, which is a mapping of
+// (stack_id, prev_id) -> id, where
+//  * stack_id describes the event.
+//    StackDepot keeps a mapping between those and corresponding stack traces.
+//  * prev_id is another origin id that describes the earlier part of the
+//    taint value history. 0 prev_id indicates the start of a chain.
+// Following a chain of prev_id provides the full recorded history of a taint
+// value.
+//
+// This, effectively, defines a forest where nodes are points in value history
+// marked with origin ids, and edges are events that are marked with stack_id.
+//
+// The "zzzz" bits of chained origin id are used to store the length of the
+// origin chain.
+
+class Origin {
+ public:
+  static bool isValidId(u32 id) { return id != 0; }
+
+  u32 raw_id() const { return raw_id_; }
+
+  bool isChainedOrigin() const { return Origin::isValidId(raw_id_); }
+
+  u32 getChainedId() const {
+    CHECK(Origin::isValidId(raw_id_));
+    return raw_id_ & kChainedIdMask;
+  }
+
+  // Returns the next origin in the chain and the current stack trace.
+  //
+  // It scans a partition of StackDepot linearly, and is used only by origin
+  // tracking report.
+  Origin getNextChainedOrigin(StackTrace *stack) const {
+    CHECK(Origin::isValidId(raw_id_));
+    u32 prev_id;
+    u32 stack_id = GetChainedOriginDepot()->Get(getChainedId(), &prev_id);
+    if (stack)
+      *stack = StackDepotGet(stack_id);
+    return Origin(prev_id);
+  }
+
+  static Origin CreateChainedOrigin(Origin prev, StackTrace *stack) {
+    int depth = prev.isChainedOrigin() ? prev.depth() : -1;
+    // depth is the length of the chain minus 1.
+    // origin_history_size of 0 means unlimited depth.
+    if (flags().origin_history_size > 0) {
+      ++depth;
+      if (depth >= flags().origin_history_size || depth > kMaxDepth)
+        return prev;
+    }
+
+    StackDepotHandle h = StackDepotPut_WithHandle(*stack);
+    if (!h.valid())
+      return prev;
+
+    if (flags().origin_history_per_stack_limit > 0) {
+      int use_count = h.use_count();
+      if (use_count > flags().origin_history_per_stack_limit)
+        return prev;
+    }
+
+    u32 chained_id;
+    bool inserted =
+        GetChainedOriginDepot()->Put(h.id(), prev.raw_id(), &chained_id);
+    CHECK((chained_id & kChainedIdMask) == chained_id);
+
+    if (inserted && flags().origin_history_per_stack_limit > 0)
+      h.inc_use_count_unsafe();
+
+    return Origin((depth << kDepthShift) | chained_id);
+  }
+
+  static Origin FromRawId(u32 id) { return Origin(id); }
+
+ private:
+  static const int kDepthBits = 4;
+  static const int kDepthShift = 32 - kDepthBits;
+
+  static const u32 kChainedIdMask = ((u32)-1) >> kDepthBits;
+
+  u32 raw_id_;
+
+  explicit Origin(u32 raw_id) : raw_id_(raw_id) {}
+
+  int depth() const {
+    CHECK(isChainedOrigin());
+    return (raw_id_ >> kDepthShift) & ((1 << kDepthBits) - 1);
+  }
+
+ public:
+  static const int kMaxDepth = (1 << kDepthBits) - 1;
+};
+
+}  // namespace __dfsan
+
+#endif  // DFSAN_ORIGIN_H