[libFuzzer] initial implementation of -data_flow_trace. It parses the data flow trace and prints the summary, but doesn't use the information in any other way yet

llvm-svn: 334058
GitOrigin-RevId: 1fd005f552595ceb2a10f2deacc6b64a50019afb
diff --git a/FuzzerDataFlowTrace.cpp b/FuzzerDataFlowTrace.cpp
new file mode 100644
index 0000000..69efd6f
--- /dev/null
+++ b/FuzzerDataFlowTrace.cpp
@@ -0,0 +1,90 @@
+//===- FuzzerDataFlowTrace.cpp - DataFlowTrace                ---*- C++ -* ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// fuzzer::DataFlowTrace
+//===----------------------------------------------------------------------===//
+
+#include "FuzzerDataFlowTrace.h"
+#include "FuzzerIO.h"
+
+#include <cstdlib>
+#include <fstream>
+#include <string>
+#include <vector>
+
+namespace fuzzer {
+
+// Parses the data flow trace in DirPath (functions.txt plus trace files) and
+// prints a summary for FocusFunction. No-op if DirPath is empty; returns early
+// if FocusFunction is not in functions.txt or on the first malformed line.
+void DataFlowTrace::Init(const std::string &DirPath,
+                         const std::string &FocusFunction) {
+  if (DirPath.empty()) return;
+  const char *kFunctionsTxt = "functions.txt";
+  Printf("INFO: DataFlowTrace: reading from '%s'\n", DirPath.c_str());
+  Vector<SizedFile> Files;
+  GetSizedFilesFromDir(DirPath, &Files);
+  std::string L;
+
+  // Read functions.txt; the focus function's index is its 0-based line number.
+  std::ifstream IF(DirPlusFile(DirPath, kFunctionsTxt));
+  size_t FocusFuncIdx = SIZE_MAX;
+  size_t NumFunctions = 0;
+  while (std::getline(IF, L, '\n')) {
+    NumFunctions++;
+    if (FocusFunction == L) FocusFuncIdx = NumFunctions - 1;
+  }
+  if (!NumFunctions || FocusFuncIdx == SIZE_MAX || Files.size() <= 1) return;
+  // Read traces.
+  size_t NumTraceFiles = 0;
+  size_t NumTracesWithFocusFunction = 0;
+  for (auto &SF : Files) {
+    auto Name = Basename(SF.File);
+    if (Name == kFunctionsTxt) continue;
+    auto ParseError = [&](const char *Err) {
+      Printf("DataFlowTrace: parse error: %s\n  File: %s\n  Line: %s\n", Err,
+             Name.c_str(), L.c_str());
+    };
+    NumTraceFiles++;
+    // Each line is expected to look like "F<N> <string of '0' and '1'>".
+    std::ifstream IF(SF.File);
+    while (std::getline(IF, L, '\n')) {
+      size_t SpacePos = L.find(' ');
+      if (SpacePos == std::string::npos)
+        return ParseError("no space in the trace line");
+      if (L.empty() || L[0] != 'F')
+        return ParseError("the trace line doesn't start with 'F'");
+      size_t N = std::atol(L.c_str() + 1);
+      if (N >= NumFunctions)
+        return ParseError("N is greater than the number of functions");
+      if (N == FocusFuncIdx) {
+        NumTracesWithFocusFunction++;
+        const char *Beg = L.c_str() + SpacePos + 1;
+        size_t Len = L.size() - SpacePos - 1;  // SpacePos < L.size() here.
+        if (!Len) return ParseError("the trace is empty");
+        Vector<bool> V(Len);
+        for (size_t I = 0; I < Len; I++) {
+          if (Beg[I] != '0' && Beg[I] != '1')
+            return ParseError("the trace should contain only 0 or 1");
+          V[I] = Beg[I] == '1';
+        }
+        // Print just a few small traces.
+        if (NumTracesWithFocusFunction <= 3 && Len <= 16)
+          Printf("%s => |%s|\n", Name.c_str(), Beg);
+        break;  // No need to parse the following lines.
+      }
+    }
+  }
+  assert(NumTraceFiles == Files.size() - 1);
+  Printf("INFO: DataFlowTrace: %zd trace files, %zd functions, "
+         "%zd traces with focus function\n",
+         NumTraceFiles, NumFunctions, NumTracesWithFocusFunction);
+}
+
+}  // namespace fuzzer
+
diff --git a/FuzzerDataFlowTrace.h b/FuzzerDataFlowTrace.h
new file mode 100644
index 0000000..2b7b71f
--- /dev/null
+++ b/FuzzerDataFlowTrace.h
@@ -0,0 +1,40 @@
+//===- FuzzerDataFlowTrace.h - Internal header for the Fuzzer ---*- C++ -* ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// fuzzer::DataFlowTrace; reads and handles a data-flow trace.
+//
+// A data flow trace is generated by e.g. dataflow/DataFlow.cpp
+// and is stored on disk in a separate directory.
+//
+// The trace dir contains a file 'functions.txt' which lists function names,
+// one per line, e.g.
+// ==> functions.txt <==
+// Func2
+// LLVMFuzzerTestOneInput
+// Func1
+//
+// All other files in the dir are the traces, see dataflow/DataFlow.cpp.
+// The name of the file is sha1 of the input used to generate the trace.
+//
+// Current status:
+//   the data is parsed and the summary is printed, but the data is not yet
+//   used in any other way.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_FUZZER_DATA_FLOW_TRACE
+#define LLVM_FUZZER_DATA_FLOW_TRACE
+
+#include "FuzzerDefs.h"
+
+namespace fuzzer {
+struct DataFlowTrace {  // Reads a trace dir; currently only prints a summary.
+  void Init(const std::string &DirPath, const std::string &FocusFunction);
+};
+}  // namespace fuzzer
+
+#endif // LLVM_FUZZER_DATA_FLOW_TRACE
diff --git a/FuzzerDriver.cpp b/FuzzerDriver.cpp
index dfb3d49..d7b9573 100644
--- a/FuzzerDriver.cpp
+++ b/FuzzerDriver.cpp
@@ -623,6 +623,8 @@
     Options.ExitOnItem = Flags.exit_on_item;
   if (Flags.focus_function)
     Options.FocusFunction = Flags.focus_function;
+  if (Flags.data_flow_trace)
+    Options.DataFlowTrace = Flags.data_flow_trace;
 
   unsigned Seed = Flags.seed;
   // Initialize Seed.
diff --git a/FuzzerFlags.def b/FuzzerFlags.def
index 139e618..5be6d26 100644
--- a/FuzzerFlags.def
+++ b/FuzzerFlags.def
@@ -153,3 +153,5 @@
 FUZZER_FLAG_INT(analyze_dict, 0, "Experimental")
 FUZZER_DEPRECATED_FLAG(use_clang_coverage)
 FUZZER_FLAG_INT(use_feature_frequency, 0, "Experimental/internal")
+
+FUZZER_FLAG_STRING(data_flow_trace, "Experimental: use the data flow trace")
diff --git a/FuzzerIO.cpp b/FuzzerIO.cpp
index dac5ec6..f3ead0e 100644
--- a/FuzzerIO.cpp
+++ b/FuzzerIO.cpp
@@ -100,6 +100,14 @@
   return DirPath + GetSeparator() + FileName;
 }
 
+// Returns the part of Path that follows the last Separator. A Path without
+// any Separator is returned unchanged; a Path that ends with Separator
+// yields an empty string.
+std::string Basename(const std::string &Path, char Separator) {
+  const size_t LastSep = Path.rfind(Separator);
+  return LastSep == std::string::npos ? Path : Path.substr(LastSep + 1);
+}
+
 void DupAndCloseStderr() {
   int OutputFd = DuplicateFile(2);
   if (OutputFd > 0) {
diff --git a/FuzzerIO.h b/FuzzerIO.h
index ea9f0d5..6d77574 100644
--- a/FuzzerIO.h
+++ b/FuzzerIO.h
@@ -67,6 +67,8 @@
 void GetSizedFilesFromDir(const std::string &Dir, Vector<SizedFile> *V);
 
 char GetSeparator();
+// Similar to the basename utility: returns the file name w/o the dir prefix.
+std::string Basename(const std::string &Path, char Separator = GetSeparator());
 
 FILE* OpenFile(int Fd, const char *Mode);
 
diff --git a/FuzzerInternal.h b/FuzzerInternal.h
index 2b2638f..ec098a7 100644
--- a/FuzzerInternal.h
+++ b/FuzzerInternal.h
@@ -12,6 +12,7 @@
 #ifndef LLVM_FUZZER_INTERNAL_H
 #define LLVM_FUZZER_INTERNAL_H
 
+#include "FuzzerDataFlowTrace.h"
 #include "FuzzerDefs.h"
 #include "FuzzerExtFunctions.h"
 #include "FuzzerInterface.h"
@@ -134,6 +135,7 @@
   InputCorpus &Corpus;
   MutationDispatcher &MD;
   FuzzingOptions Options;
+  DataFlowTrace DFT;
 
   system_clock::time_point ProcessStartTime = system_clock::now();
   system_clock::time_point UnitStartTime, UnitStopTime;
diff --git a/FuzzerLoop.cpp b/FuzzerLoop.cpp
index 9c19ba9..27bd5ee 100644
--- a/FuzzerLoop.cpp
+++ b/FuzzerLoop.cpp
@@ -160,6 +160,7 @@
   CurrentUnitSize = 0;
   memset(BaseSha1, 0, sizeof(BaseSha1));
   TPC.SetFocusFunction(Options.FocusFunction);
+  DFT.Init(Options.DataFlowTrace, Options.FocusFunction);
 }
 
 Fuzzer::~Fuzzer() {}
diff --git a/FuzzerOptions.h b/FuzzerOptions.h
index 946f0b9..7a52d36 100644
--- a/FuzzerOptions.h
+++ b/FuzzerOptions.h
@@ -46,6 +46,7 @@
   std::string ExitOnSrcPos;
   std::string ExitOnItem;
   std::string FocusFunction;
+  std::string DataFlowTrace;
   bool SaveArtifacts = true;
   bool PrintNEW = true; // Print a status line when new units are found;
   bool PrintNewCovPcs = false;
diff --git a/dataflow/DataFlow.cpp b/dataflow/DataFlow.cpp
index 9986307..a79c796 100644
--- a/dataflow/DataFlow.cpp
+++ b/dataflow/DataFlow.cpp
@@ -69,6 +69,7 @@
 static __thread size_t CurrentFunc;
 static dfsan_label *FuncLabels;  // Array of NumFuncs elements.
 static char *PrintableStringForLabel;  // InputLen + 2 bytes.
+static bool LabelSeen[1 << 8 * sizeof(dfsan_label)];  // One per label value.
 
 // Prints all instrumented functions.
 static int PrintFunctions() {
@@ -89,7 +90,11 @@
   return 0;
 }
 
-static void SetBytesForLabel(dfsan_label L, char *Bytes) {
+extern "C"
+void SetBytesForLabel(dfsan_label L, char *Bytes) {
+  if (LabelSeen[L])
+    return;
+  LabelSeen[L] = true;
   assert(L);
   if (L <= InputLen + 1) {
     Bytes[L - 1] = '1';
@@ -103,6 +108,7 @@
 static char *GetPrintableStringForLabel(dfsan_label L) {
   memset(PrintableStringForLabel, '0', InputLen + 1);
   PrintableStringForLabel[InputLen + 1] = 0;
+  memset(LabelSeen, 0, sizeof(LabelSeen));  // Forget labels seen earlier.
   SetBytesForLabel(L, PrintableStringForLabel);
   return PrintableStringForLabel;
 }
diff --git a/scripts/collect_data_flow.py b/scripts/collect_data_flow.py
index d13f6dc..c3faf71 100755
--- a/scripts/collect_data_flow.py
+++ b/scripts/collect_data_flow.py
@@ -11,9 +11,15 @@
 # the complete trace for all input bytes (running it on all bytes at once
 # may fail if DFSan runs out of labels).
 # Usage:
-#   collect_data_flow.py BINARY INPUT [RESULT]
+#
+#   # Collect dataflow for one input, store it in OUTPUT (default is stdout)
+#   collect_data_flow.py BINARY INPUT [OUTPUT]
+#
+#   # Collect dataflow for all inputs in CORPUS_DIR, store them in OUTPUT_DIR
+#   collect_data_flow.py BINARY CORPUS_DIR OUTPUT_DIR
 #===------------------------------------------------------------------------===#
 import atexit
+import hashlib
 import sys
 import os
 import subprocess
@@ -26,9 +32,26 @@
   print "removing: ", d
   shutil.rmtree(d)
 
+# For every file under corpus_dir, runs `self exe FILE OUTPUT` where OUTPUT is
+# output_dir/<sha1 of FILE>, then stores the stdout of `exe` in functions.txt.
+def collect_dataflow_for_corpus(self, exe, corpus_dir, output_dir):
+  print "Collecting dataflow for corpus:", corpus_dir, "output_dir:", output_dir
+  assert not os.path.exists(output_dir)
+  os.mkdir(output_dir)
+  for root, dirs, files in os.walk(corpus_dir):
+    for f in files:
+      path = os.path.join(root, f)
+      with open(path, "rb") as corpus_file:  # Hash raw bytes, close promptly.
+        sha1 = hashlib.sha1(corpus_file.read()).hexdigest()
+      subprocess.call([self, exe, path, os.path.join(output_dir, sha1)])
+  with open(os.path.join(output_dir, "functions.txt"), "w") as functions_txt:
+    subprocess.call([exe], stdout=functions_txt)
+
 def main(argv):
   exe = argv[1]
   inp = argv[2]
+  if os.path.isdir(inp):
+    return collect_dataflow_for_corpus(argv[0], exe, inp, argv[3])
   size = os.path.getsize(inp)
   q = [[0, size]]
   tmpdir = tempfile.mkdtemp(prefix="libfuzzer-tmp-")
diff --git a/tests/FuzzerUnittest.cpp b/tests/FuzzerUnittest.cpp
index a38a453..0b86738 100644
--- a/tests/FuzzerUnittest.cpp
+++ b/tests/FuzzerUnittest.cpp
@@ -28,6 +28,14 @@
   abort();
 }
 
+TEST(Fuzzer, Basename) {  // Uses the default separator; assumes it is '/'.
+  EXPECT_EQ(Basename("foo/bar"), "bar");
+  EXPECT_EQ(Basename("bar"), "bar");  // No separator: path returned as is.
+  EXPECT_EQ(Basename("/bar"), "bar");
+  EXPECT_EQ(Basename("foo/x"), "x");
+  EXPECT_EQ(Basename("foo/"), "");  // Trailing separator: empty name.
+}
+
 TEST(Fuzzer, CrossOver) {
   std::unique_ptr<ExternalFunctions> t(new ExternalFunctions());
   fuzzer::EF = t.get();