[libFuzzer] simplify the DFT trace collection using the new faster DFSan mode that traces up to 16 labels at a time and never runs out of labels.

llvm-svn: 363326
GitOrigin-RevId: 2fa83cb7ee0cef9baebe4b36128a19b7f01602d7
diff --git a/FuzzerDataFlowTrace.cpp b/FuzzerDataFlowTrace.cpp
index 1fba391..311f53a 100644
--- a/FuzzerDataFlowTrace.cpp
+++ b/FuzzerDataFlowTrace.cpp
@@ -120,12 +120,6 @@
   return DFT;
 }
 
-static std::ostream &operator<<(std::ostream &OS, const Vector<uint8_t> &DFT) {
-  for (auto B : DFT)
-    OS << (B ? "1" : "0");
-  return OS;
-}
-
 static bool ParseError(const char *Err, const std::string &Line) {
   Printf("DataFlowTrace: parse error: %s: Line: %s\n", Err, Line.c_str());
   return false;
@@ -246,74 +240,24 @@
                     const Vector<SizedFile> &CorporaFiles) {
   Printf("INFO: collecting data flow: bin: %s dir: %s files: %zd\n",
          DFTBinary.c_str(), DirPath.c_str(), CorporaFiles.size());
+  setenv("DFSAN_OPTIONS", "fast16labels=1:warn_unimplemented=0", 1);
   MkDir(DirPath);
-  auto Temp = TempPath(".dft");
   for (auto &F : CorporaFiles) {
     // For every input F we need to collect the data flow and the coverage.
     // Data flow collection may fail if we request too many DFSan tags at once.
     // So, we start from requesting all tags in range [0,Size) and if that fails
     // we then request tags in [0,Size/2) and [Size/2, Size), and so on.
     // Function number => DFT.
+    auto OutPath = DirPlusFile(DirPath, Hash(FileToVector(F.File)));
     std::unordered_map<size_t, Vector<uint8_t>> DFTMap;
     std::unordered_set<std::string> Cov;
-    std::queue<std::pair<size_t, size_t>> Q;
-    Q.push({0, F.Size});
-    while (!Q.empty()) {
-      auto R = Q.front();
-      Printf("\n\n\n********* Trying: [%zd, %zd)\n", R.first, R.second);
-      Q.pop();
-      Command Cmd;
-      Cmd.addArgument(DFTBinary);
-      Cmd.addArgument(std::to_string(R.first));
-      Cmd.addArgument(std::to_string(R.second));
-      Cmd.addArgument(F.File);
-      Cmd.addArgument(Temp);
-      Printf("CMD: %s\n", Cmd.toString().c_str());
-      if (ExecuteCommand(Cmd)) {
-        // DFSan has failed, collect tags for two subsets.
-        if (R.second - R.first >= 2) {
-          size_t Mid = (R.second + R.first) / 2;
-          Q.push({R.first, Mid});
-          Q.push({Mid, R.second});
-        }
-      } else {
-        Printf("********* Success: [%zd, %zd)\n", R.first, R.second);
-        std::ifstream IF(Temp);
-        std::string L;
-        while (std::getline(IF, L, '\n')) {
-          // Data flow collection has succeeded.
-          // Merge the results with the other runs.
-          if (L.empty()) continue;
-          if (L[0] == 'C') {
-            // Take coverage lines as is, they will be the same in all attempts.
-            Cov.insert(L);
-          } else if (L[0] == 'F') {
-            size_t FunctionNum = 0;
-            std::string DFTString;
-            if (ParseDFTLine(L, &FunctionNum, &DFTString)) {
-              auto &DFT = DFTMap[FunctionNum];
-              if (DFT.empty()) {
-                // Haven't seen this function before, take DFT as is.
-                DFT = DFTStringToVector(DFTString);
-              } else if (DFT.size() == DFTString.size()) {
-                // Have seen this function already, merge DFTs.
-                DFTStringAppendToVector(&DFT, DFTString);
-              }
-            }
-          }
-        }
-      }
-    }
-    auto OutPath = DirPlusFile(DirPath, Hash(FileToVector(F.File)));
-    // Dump combined DFT to disk.
-    Printf("Producing DFT for %s\n", OutPath.c_str());
-    std::ofstream OF(OutPath);
-    for (auto &DFT: DFTMap)
-      OF << "F" << DFT.first << " " << DFT.second << std::endl;
-    for (auto &C : Cov)
-      OF << C << std::endl;
+    Command Cmd;
+    Cmd.addArgument(DFTBinary);
+    Cmd.addArgument(F.File);
+    Cmd.addArgument(OutPath);
+    Printf("CMD: %s\n", Cmd.toString().c_str());
+    ExecuteCommand(Cmd);
   }
-  RemoveFile(Temp);
   // Write functions.txt if it's currently empty or doesn't exist.
   auto FunctionsTxtPath = DirPlusFile(DirPath, kFunctionsTxt);
   if (FileToString(FunctionsTxtPath).empty()) {
diff --git a/FuzzerFork.cpp b/FuzzerFork.cpp
index 870a224..5c4855f 100644
--- a/FuzzerFork.cpp
+++ b/FuzzerFork.cpp
@@ -89,6 +89,7 @@
   std::string DFTDir;
   std::string DataFlowBinary;
   Set<uint32_t> Features, Cov;
+  Set<std::string> FilesWithDFT;
   Vector<std::string> Files;
   Random *Rand;
   std::chrono::system_clock::time_point ProcessStartTime;
@@ -126,10 +127,13 @@
     auto Job = new FuzzJob;
     std::string Seeds;
     if (size_t CorpusSubsetSize =
-            std::min(Files.size(), (size_t)sqrt(Files.size() + 2)))
-      for (size_t i = 0; i < CorpusSubsetSize; i++)
-        Seeds += (Seeds.empty() ? "" : ",") +
-                 Files[Rand->SkewTowardsLast(Files.size())];
+            std::min(Files.size(), (size_t)sqrt(Files.size() + 2))) {
+      for (size_t i = 0; i < CorpusSubsetSize; i++) {
+        auto &SF = Files[Rand->SkewTowardsLast(Files.size())];
+        Seeds += (Seeds.empty() ? "" : ",") + SF;
+        CollectDFT(SF);
+      }
+    }
     if (!Seeds.empty()) {
       Job->SeedListPath =
           DirPlusFile(TempDir, std::to_string(JobId) + ".seeds");
@@ -196,7 +200,6 @@
       auto NewPath = DirPlusFile(MainCorpusDir, Hash(U));
       WriteToFile(U, NewPath);
       Files.push_back(NewPath);
-      CollectDFT(NewPath);
     }
     Features.insert(NewFeatures.begin(), NewFeatures.end());
     Cov.insert(NewCov.begin(), NewCov.end());
@@ -217,6 +220,7 @@
 
   void CollectDFT(const std::string &InputPath) {
     if (DataFlowBinary.empty()) return;
+    if (!FilesWithDFT.insert(InputPath).second) return;
     Command Cmd(Args);
     Cmd.removeFlag("fork");
     Cmd.removeFlag("runs");
@@ -226,7 +230,7 @@
       Cmd.removeArgument(C);
     Cmd.setOutputFile(DirPlusFile(TempDir, "dft.log"));
     Cmd.combineOutAndErr();
-    // Printf("CollectDFT: %s %s\n", InputPath.c_str(), Cmd.toString().c_str());
+    // Printf("CollectDFT: %s\n", Cmd.toString().c_str());
     ExecuteCommand(Cmd);
   }
 
@@ -296,9 +300,6 @@
   CrashResistantMerge(Env.Args, {}, SeedFiles, &Env.Files, {}, &Env.Features,
                       {}, &Env.Cov,
                       CFPath, false);
-  for (auto &F : Env.Files)
-    Env.CollectDFT(F);
-
   RemoveFile(CFPath);
   Printf("INFO: -fork=%d: %zd seed inputs, starting to fuzz in %s\n", NumJobs,
          Env.Files.size(), Env.TempDir.c_str());
diff --git a/dataflow/DataFlow.cpp b/dataflow/DataFlow.cpp
index 989675e..8a5d695 100644
--- a/dataflow/DataFlow.cpp
+++ b/dataflow/DataFlow.cpp
@@ -35,7 +35,8 @@
 // Run:
 //   # Collect data flow and coverage for INPUT_FILE
 //   # write to OUTPUT_FILE (default: stdout)
-//   ./a.out FIRST_LABEL LAST_LABEL INPUT_FILE [OUTPUT_FILE]
+//   export DFSAN_OPTIONS=fast16labels=1:warn_unimplemented=0
+//   ./a.out INPUT_FILE [OUTPUT_FILE]
 //
 //   # Print all instrumented functions. llvm-symbolizer must be present in PATH
 //   ./a.out
@@ -48,8 +49,6 @@
 //  C1 8
 //  ===============
 // "FN xxxxxxxxxx": tells what bytes of the input does the function N depend on.
-//    The byte string is LEN+1 bytes. The last byte is set if the function
-//    depends on the input length.
 // "CN X Y Z T": tells that a function N has basic blocks X, Y, and Z covered
 //    in addition to the function's entry block, out of T total instrumented
 //    blocks.
@@ -72,22 +71,20 @@
 } // extern "C"
 
 static size_t InputLen;
-static size_t InputLabelBeg;
-static size_t InputLabelEnd;
-static size_t InputSizeLabel;
+static size_t NumIterations;
 static size_t NumFuncs, NumGuards;
 static uint32_t *GuardsBeg, *GuardsEnd;
 static const uintptr_t *PCsBeg, *PCsEnd;
-static __thread size_t CurrentFunc;
-static dfsan_label *FuncLabels;  // Array of NumFuncs elements.
+static __thread size_t CurrentFunc, CurrentIteration;
+static dfsan_label **FuncLabels;  // NumFuncs x NumIterations.
 static bool *BBExecuted;  // Array of NumGuards elements.
-static char *PrintableStringForLabel;  // InputLen + 2 bytes.
-static bool LabelSeen[1 << 8 * sizeof(dfsan_label)];
 
 enum {
   PCFLAG_FUNC_ENTRY = 1,
 };
 
+const int kNumLabels = 16;
+
 static inline bool BlockIsEntry(size_t BlockIdx) {
   return PCsBeg[BlockIdx * 2 + 1] & PCFLAG_FUNC_ENTRY;
 }
@@ -112,35 +109,32 @@
   return 0;
 }
 
-extern "C"
-void SetBytesForLabel(dfsan_label L, char *Bytes) {
-  if (LabelSeen[L])
-    return;
-  LabelSeen[L] = true;
-  assert(L);
-  if (L < InputSizeLabel) {
-    Bytes[L + InputLabelBeg - 1] = '1';
-  } else if (L == InputSizeLabel) {
-    Bytes[InputLen] = '1';
-  } else {
-    auto *DLI = dfsan_get_label_info(L);
-    SetBytesForLabel(DLI->l1, Bytes);
-    SetBytesForLabel(DLI->l2, Bytes);
-  }
-}
-
-static char *GetPrintableStringForLabel(dfsan_label L) {
-  memset(PrintableStringForLabel, '0', InputLen + 1);
-  PrintableStringForLabel[InputLen + 1] = 0;
-  memset(LabelSeen, 0, sizeof(LabelSeen));
-  SetBytesForLabel(L, PrintableStringForLabel);
-  return PrintableStringForLabel;
+static void PrintBinary(FILE *Out, dfsan_label L, size_t Len) {
+  char buf[kNumLabels + 1];
+  assert(Len <= kNumLabels);
+  for (int i = 0; i < kNumLabels; i++)
+    buf[i] = (L & (1 << i)) ? '1' : '0';
+  buf[Len] = 0;
+  fprintf(Out, "%s", buf);
 }
 
 static void PrintDataFlow(FILE *Out) {
-  for (size_t I = 0; I < NumFuncs; I++)
-    if (FuncLabels[I])
-      fprintf(Out, "F%zd %s\n", I, GetPrintableStringForLabel(FuncLabels[I]));
+  for (size_t Func = 0; Func < NumFuncs; Func++) {
+    bool HasAny = false;
+    for (size_t Iter = 0; Iter < NumIterations; Iter++)
+      if (FuncLabels[Func][Iter])
+        HasAny = true;
+    if (!HasAny)
+      continue;
+    fprintf(Out, "F%zd ", Func);
+    size_t LenOfLastIteration = kNumLabels;
+    if (auto Tail = InputLen % kNumLabels)
+        LenOfLastIteration = Tail;
+    for (size_t Iter = 0; Iter < NumIterations; Iter++)
+      PrintBinary(Out, FuncLabels[Func][Iter],
+                  Iter == NumIterations - 1 ? LenOfLastIteration : kNumLabels);
+    fprintf(Out, "\n");
+  }
 }
 
 static void PrintCoverage(FILE *Out) {
@@ -169,12 +163,9 @@
     LLVMFuzzerInitialize(&argc, &argv);
   if (argc == 1)
     return PrintFunctions();
-  assert(argc == 4 || argc == 5);
-  InputLabelBeg = atoi(argv[1]);
-  InputLabelEnd = atoi(argv[2]);
-  assert(InputLabelBeg < InputLabelEnd);
+  assert(argc == 2 || argc == 3);
 
-  const char *Input = argv[3];
+  const char *Input = argv[1];
   fprintf(stderr, "INFO: reading '%s'\n", Input);
   FILE *In = fopen(Input, "r");
   assert(In);
@@ -184,30 +175,35 @@
   unsigned char *Buf = (unsigned char*)malloc(InputLen);
   size_t NumBytesRead = fread(Buf, 1, InputLen, In);
   assert(NumBytesRead == InputLen);
-  PrintableStringForLabel = (char*)malloc(InputLen + 2);
   fclose(In);
 
-  fprintf(stderr, "INFO: running '%s'\n", Input);
-  for (size_t I = 1; I <= InputLen; I++) {
-    size_t Idx = I - 1;
-    if (Idx >= InputLabelBeg && Idx < InputLabelEnd) {
-      dfsan_label L = dfsan_create_label("", nullptr);
-      assert(L == I - InputLabelBeg);
-      dfsan_set_label(L, Buf + Idx, 1);
-    }
-  }
-  dfsan_label SizeL = dfsan_create_label("", nullptr);
-  InputSizeLabel = SizeL;
-  assert(InputSizeLabel == InputLabelEnd - InputLabelBeg + 1);
-  dfsan_set_label(SizeL, &InputLen, sizeof(InputLen));
+  NumIterations = (NumBytesRead + kNumLabels - 1) / kNumLabels;
+  FuncLabels = (dfsan_label**)calloc(NumFuncs, sizeof(dfsan_label*));
+  for (size_t Func = 0; Func < NumFuncs; Func++)
+    FuncLabels[Func] =
+        (dfsan_label *)calloc(NumIterations, sizeof(dfsan_label));
 
-  LLVMFuzzerTestOneInput(Buf, InputLen);
+  for (CurrentIteration = 0; CurrentIteration < NumIterations;
+       CurrentIteration++) {
+    fprintf(stderr, "INFO: running '%s' %zd/%zd\n", Input, CurrentIteration,
+            NumIterations);
+    dfsan_flush();
+    dfsan_set_label(0, Buf, InputLen);
+
+    size_t BaseIdx = CurrentIteration * kNumLabels;
+    size_t LastIdx = BaseIdx + kNumLabels < NumBytesRead ? BaseIdx + kNumLabels
+                                                         : NumBytesRead;
+    assert(BaseIdx < LastIdx);
+    for (size_t Idx = BaseIdx; Idx < LastIdx; Idx++)
+      dfsan_set_label(1 << (Idx - BaseIdx), Buf + Idx, 1);
+    LLVMFuzzerTestOneInput(Buf, InputLen);
+  }
   free(Buf);
 
-  bool OutIsStdout = argc == 4;
+  bool OutIsStdout = argc == 2;
   fprintf(stderr, "INFO: writing dataflow to %s\n",
-          OutIsStdout ? "<stdout>" : argv[4]);
-  FILE *Out = OutIsStdout ? stdout : fopen(argv[4], "w");
+          OutIsStdout ? "<stdout>" : argv[2]);
+  FILE *Out = OutIsStdout ? stdout : fopen(argv[2], "w");
   PrintDataFlow(Out);
   PrintCoverage(Out);
   if (!OutIsStdout) fclose(Out);
@@ -237,7 +233,6 @@
       GuardsBeg[i] = NumFuncs;
     }
   }
-  FuncLabels = (dfsan_label*)calloc(NumFuncs, sizeof(dfsan_label));
   BBExecuted = (bool*)calloc(NumGuards, sizeof(bool));
   fprintf(stderr, "INFO: %zd instrumented function(s) observed "
           "and %zd basic blocks\n", NumFuncs, NumGuards);
@@ -258,14 +253,13 @@
 void __dfsw___sanitizer_cov_trace_switch(uint64_t Val, uint64_t *Cases,
                                          dfsan_label L1, dfsan_label UnusedL) {
   assert(CurrentFunc < NumFuncs);
-  FuncLabels[CurrentFunc] = dfsan_union(FuncLabels[CurrentFunc], L1);
+  FuncLabels[CurrentFunc][CurrentIteration] |= L1;
 }
 
 #define HOOK(Name, Type)                                                       \
   void Name(Type Arg1, Type Arg2, dfsan_label L1, dfsan_label L2) {            \
     assert(CurrentFunc < NumFuncs);                                            \
-    FuncLabels[CurrentFunc] =                                                  \
-        dfsan_union(FuncLabels[CurrentFunc], dfsan_union(L1, L2));             \
+    FuncLabels[CurrentFunc][CurrentIteration] |= L1 | L2;                      \
   }
 
 HOOK(__dfsw___sanitizer_cov_trace_const_cmp1, uint8_t)