Greedy set cover implementation of `Merger::Merge`

Extend the existing single-pass algorithm for `Merger::Merge` with an algorithm that gives better results. This new implementation can be used with a new **set_cover_merge=1** flag.

This greedy set cover implementation gives a substantially smaller final corpus (40%-80% less testcases) while preserving the same features/coverage. At the same time, the execution time penalty is not that significant (+50% for ~1M corpus files and far less for smaller corpora). These results were obtained by comparing several targets with varying size corpora.

Change `Merger::CrashResistantMergeInternalStep` to collect all features from each file and not just unique ones. This is needed for the set cover algorithm to work correctly. The implementation of the algorithm in `Merger::SetCoverMerge` uses a bitvector to store features that are covered by a file while performing the pass. Collisions while indexing the bitvector are ignored similarly to the fuzzer.

Reviewed By: morehouse

Differential Revision: https://reviews.llvm.org/D105284

GitOrigin-RevId: e6597dbae84034ce8ef97802584039b723adb526
diff --git a/FuzzerDriver.cpp b/FuzzerDriver.cpp
index 653c6ca..c1a7e1b 100644
--- a/FuzzerDriver.cpp
+++ b/FuzzerDriver.cpp
@@ -523,7 +523,7 @@
   std::vector<std::string> NewFiles;
   std::set<uint32_t> NewFeatures, NewCov;
   CrashResistantMerge(Args, OldCorpus, NewCorpus, &NewFiles, {}, &NewFeatures,
-                      {}, &NewCov, CFPath, true);
+                      {}, &NewCov, CFPath, true, Flags.set_cover_merge);
   for (auto &Path : NewFiles)
     F->WriteToOutputCorpus(FileToVector(Path, Options.MaxLen));
   // We are done, delete the control file if it was a temporary one.
@@ -797,7 +797,8 @@
   if (Flags.verbosity)
     Printf("INFO: Seed: %u\n", Seed);
 
-  if (Flags.collect_data_flow && !Flags.fork && !Flags.merge) {
+  if (Flags.collect_data_flow && !Flags.fork &&
+      !(Flags.merge || Flags.set_cover_merge)) {
     if (RunIndividualFiles)
       return CollectDataFlow(Flags.collect_data_flow, Flags.data_flow_trace,
                         ReadCorpora({}, *Inputs));
@@ -872,7 +873,7 @@
   if (Flags.fork)
     FuzzWithFork(F->GetMD().GetRand(), Options, Args, *Inputs, Flags.fork);
 
-  if (Flags.merge)
+  if (Flags.merge || Flags.set_cover_merge)
     Merge(F, Options, Args, *Inputs, Flags.merge_control_file);
 
   if (Flags.merge_inner) {
@@ -880,7 +881,8 @@
     if (Options.MaxLen == 0)
       F->SetMaxInputLen(kDefaultMaxMergeLen);
     assert(Flags.merge_control_file);
-    F->CrashResistantMergeInternalStep(Flags.merge_control_file);
+    F->CrashResistantMergeInternalStep(Flags.merge_control_file,
+                                       !strncmp(Flags.merge_inner, "2", 1));
     exit(0);
   }
 
diff --git a/FuzzerFlags.def b/FuzzerFlags.def
index ab31da0..d0cf0e9 100644
--- a/FuzzerFlags.def
+++ b/FuzzerFlags.def
@@ -64,6 +64,11 @@
 FUZZER_FLAG_INT(merge, 0, "If 1, the 2-nd, 3-rd, etc corpora will be "
   "merged into the 1-st corpus. Only interesting units will be taken. "
   "This flag can be used to minimize a corpus.")
+FUZZER_FLAG_INT(set_cover_merge, 0, "If 1, the 2-nd, 3-rd, etc corpora will be "
+  "merged into the 1-st corpus. Same as the 'merge' flag, but uses the "
+  "standard greedy algorithm for the set cover problem to "
+  "compute an approximation of the minimum set of testcases that "
+  "provide the same coverage as the initial corpora")
 FUZZER_FLAG_STRING(stop_file, "Stop fuzzing ASAP if this file exists")
 FUZZER_FLAG_STRING(merge_inner, "internal flag")
 FUZZER_FLAG_STRING(merge_control_file,
diff --git a/FuzzerFork.cpp b/FuzzerFork.cpp
index f7a17fd..8771ab3 100644
--- a/FuzzerFork.cpp
+++ b/FuzzerFork.cpp
@@ -213,8 +213,11 @@
 
     std::vector<std::string> FilesToAdd;
     std::set<uint32_t> NewFeatures, NewCov;
+    bool IsSetCoverMerge =
+        !Job->Cmd.getFlagValue("set_cover_merge").compare("1");
     CrashResistantMerge(Args, {}, MergeCandidates, &FilesToAdd, Features,
-                        &NewFeatures, Cov, &NewCov, Job->CFPath, false);
+                        &NewFeatures, Cov, &NewCov, Job->CFPath, false,
+                        IsSetCoverMerge);
     for (auto &Path : FilesToAdd) {
       auto U = FileToVector(Path);
       auto NewPath = DirPlusFile(MainCorpusDir, Hash(U));
@@ -318,7 +321,8 @@
     auto CFPath = DirPlusFile(Env.TempDir, "merge.txt");
     std::set<uint32_t> NewFeatures, NewCov;
     CrashResistantMerge(Env.Args, {}, SeedFiles, &Env.Files, Env.Features,
-                        &NewFeatures, Env.Cov, &NewCov, CFPath, false);
+                        &NewFeatures, Env.Cov, &NewCov, CFPath,
+                        /*Verbose=*/false, /*IsSetCoverMerge=*/false);
     Env.Features.insert(NewFeatures.begin(), NewFeatures.end());
     Env.Cov.insert(NewFeatures.begin(), NewFeatures.end());
     RemoveFile(CFPath);
diff --git a/FuzzerInternal.h b/FuzzerInternal.h
index bfce1df..6637b00 100644
--- a/FuzzerInternal.h
+++ b/FuzzerInternal.h
@@ -73,7 +73,8 @@
 
   // Merge Corpora[1:] into Corpora[0].
   void Merge(const std::vector<std::string> &Corpora);
-  void CrashResistantMergeInternalStep(const std::string &ControlFilePath);
+  void CrashResistantMergeInternalStep(const std::string &ControlFilePath,
+                                       bool IsSetCoverMerge);
   MutationDispatcher &GetMD() { return MD; }
   void PrintFinalStats();
   void SetMaxInputLen(size_t MaxInputLen);
diff --git a/FuzzerMerge.cpp b/FuzzerMerge.cpp
index 4bae782..24bd119 100644
--- a/FuzzerMerge.cpp
+++ b/FuzzerMerge.cpp
@@ -197,7 +197,8 @@
 }
 
 // Inner process. May crash if the target crashes.
-void Fuzzer::CrashResistantMergeInternalStep(const std::string &CFPath) {
+void Fuzzer::CrashResistantMergeInternalStep(const std::string &CFPath,
+                                             bool IsSetCoverMerge) {
   Printf("MERGE-INNER: using the control file '%s'\n", CFPath.c_str());
   Merger M;
   std::ifstream IF(CFPath);
@@ -235,13 +236,14 @@
     // Collect coverage. We are iterating over the files in this order:
     // * First, files in the initial corpus ordered by size, smallest first.
     // * Then, all other files, smallest first.
-    // So it makes no sense to record all features for all files, instead we
-    // only record features that were not seen before.
-    std::set<size_t> UniqFeatures;
-    TPC.CollectFeatures([&](size_t Feature) {
-      if (AllFeatures.insert(Feature).second)
-        UniqFeatures.insert(Feature);
-    });
+    std::set<size_t> Features;
+    if (IsSetCoverMerge)
+      TPC.CollectFeatures([&](size_t Feature) { Features.insert(Feature); });
+    else
+      TPC.CollectFeatures([&](size_t Feature) {
+        if (AllFeatures.insert(Feature).second)
+          Features.insert(Feature);
+      });
     TPC.UpdateObservedPCs();
     // Show stats.
     if (!(TotalNumberOfRuns & (TotalNumberOfRuns - 1)))
@@ -250,7 +252,7 @@
       PrintStatsWrapper("LOADED");
     // Write the post-run marker and the coverage.
     OF << "FT " << i;
-    for (size_t F : UniqFeatures)
+    for (size_t F : Features)
       OF << " " << F;
     OF << "\n";
     OF << "COV " << i;
@@ -264,6 +266,127 @@
   PrintStatsWrapper("DONE  ");
 }
 
+// Merges all corpora into the first corpus. A file is added into
+// the first corpus only if it adds new features. Unlike `Merger::Merge`,
+// this implementation calculates an approximation of the minimum set
+// of corpora files, that cover all known features (set cover problem).
+// Generally, this means that files with more features are preferred for
+// merge into the first corpus. When two files have the same number of
+// features, the smaller one is preferred.
+size_t Merger::SetCoverMerge(const std::set<uint32_t> &InitialFeatures,
+                             std::set<uint32_t> *NewFeatures,
+                             const std::set<uint32_t> &InitialCov,
+                             std::set<uint32_t> *NewCov,
+                             std::vector<std::string> *NewFiles) {
+  assert(NumFilesInFirstCorpus <= Files.size());
+  NewFiles->clear();
+  NewFeatures->clear();
+  NewCov->clear();
+  std::set<uint32_t> AllFeatures;
+  // 1 << 21 - 1 is the maximum feature index.
+  // See 'kFeatureSetSize' in 'FuzzerCorpus.h'.
+  const uint32_t kFeatureSetSize = 1 << 21;
+  std::vector<bool> Covered(kFeatureSetSize, false);
+  size_t NumCovered = 0;
+
+  std::set<uint32_t> ExistingFeatures = InitialFeatures;
+  for (size_t i = 0; i < NumFilesInFirstCorpus; ++i)
+    ExistingFeatures.insert(Files[i].Features.begin(), Files[i].Features.end());
+
+  // Mark the existing features as covered.
+  for (const auto &F : ExistingFeatures) {
+    if (!Covered[F % kFeatureSetSize]) {
+      ++NumCovered;
+      Covered[F % kFeatureSetSize] = true;
+    }
+    // Calculate an underestimation of the set of covered features
+    // since the `Covered` bitvector is smaller than the feature range.
+    AllFeatures.insert(F % kFeatureSetSize);
+  }
+
+  std::set<size_t> RemainingFiles;
+  for (size_t i = NumFilesInFirstCorpus; i < Files.size(); ++i) {
+    // Construct an incremental sequence which represents the
+    // indices to all files (excluding those in the initial corpus).
+    // RemainingFiles = range(NumFilesInFirstCorpus..Files.size()).
+    RemainingFiles.insert(i);
+    // Insert this file's unique features to all features.
+    for (const auto &F : Files[i].Features)
+      AllFeatures.insert(F % kFeatureSetSize);
+  }
+
+  // Integrate files into Covered until set is complete.
+  while (NumCovered != AllFeatures.size()) {
+    // Index to file with largest number of unique features.
+    size_t MaxFeaturesIndex = NumFilesInFirstCorpus;
+    // Indices to remove from RemainingFiles.
+    std::set<size_t> RemoveIndices;
+    // Running max unique feature count.
+    // Updated upon finding a file with more features.
+    size_t MaxNumFeatures = 0;
+
+    // Iterate over all files not yet integrated into Covered,
+    // to find the file which has the largest number of
+    // features that are not already in Covered.
+    for (const auto &i : RemainingFiles) {
+      const auto &File = Files[i];
+      size_t CurrentUnique = 0;
+      // Count number of features in this file
+      // which are not yet in Covered.
+      for (const auto &F : File.Features)
+        if (!Covered[F % kFeatureSetSize])
+          ++CurrentUnique;
+
+      if (CurrentUnique == 0) {
+        // All features in this file are already in Covered: skip next time.
+        RemoveIndices.insert(i);
+      } else if (CurrentUnique > MaxNumFeatures ||
+                 (CurrentUnique == MaxNumFeatures &&
+                  File.Size < Files[MaxFeaturesIndex].Size)) {
+        // Update the max features file based on unique features
+        // Break ties by selecting smaller files.
+        MaxNumFeatures = CurrentUnique;
+        MaxFeaturesIndex = i;
+      }
+    }
+    // Must be a valid index.
+    assert(MaxFeaturesIndex < Files.size());
+    // Remove any feature-less files found.
+    for (const auto &i : RemoveIndices)
+      RemainingFiles.erase(i);
+    if (MaxNumFeatures == 0) {
+      // Did not find a file that adds unique features.
+      // This means that we should have no remaining files.
+      assert(RemainingFiles.size() == 0);
+      assert(NumCovered == AllFeatures.size());
+      break;
+    }
+
+    // MaxFeaturesIndex must be an element of Remaining.
+    assert(RemainingFiles.find(MaxFeaturesIndex) != RemainingFiles.end());
+    // Remove the file with the most features from Remaining.
+    RemainingFiles.erase(MaxFeaturesIndex);
+    const auto &MaxFeatureFile = Files[MaxFeaturesIndex];
+    // Add the features of the max feature file to Covered.
+    for (const auto &F : MaxFeatureFile.Features) {
+      if (!Covered[F % kFeatureSetSize]) {
+        ++NumCovered;
+        Covered[F % kFeatureSetSize] = true;
+        NewFeatures->insert(F);
+      }
+    }
+    // Add the index to this file to the result.
+    NewFiles->push_back(MaxFeatureFile.Name);
+    // Update NewCov with the additional coverage
+    // that MaxFeatureFile provides.
+    for (const auto &C : MaxFeatureFile.Cov)
+      if (InitialCov.find(C) == InitialCov.end())
+        NewCov->insert(C);
+  }
+
+  return NewFeatures->size();
+}
+
 static size_t
 WriteNewControlFile(const std::string &CFPath,
                     const std::vector<SizedFile> &OldCorpus,
@@ -309,7 +432,8 @@
                          std::set<uint32_t> *NewFeatures,
                          const std::set<uint32_t> &InitialCov,
                          std::set<uint32_t> *NewCov, const std::string &CFPath,
-                         bool V /*Verbose*/) {
+                         bool V, /*Verbose*/
+                         bool IsSetCoverMerge) {
   if (NewCorpus.empty() && OldCorpus.empty()) return;  // Nothing to merge.
   size_t NumAttempts = 0;
   std::vector<MergeFileInfo> KnownFiles;
@@ -364,6 +488,7 @@
   // Every inner process should execute at least one input.
   Command BaseCmd(Args);
   BaseCmd.removeFlag("merge");
+  BaseCmd.removeFlag("set_cover_merge");
   BaseCmd.removeFlag("fork");
   BaseCmd.removeFlag("collect_data_flow");
   for (size_t Attempt = 1; Attempt <= NumAttempts; Attempt++) {
@@ -371,7 +496,9 @@
     VPrintf(V, "MERGE-OUTER: attempt %zd\n", Attempt);
     Command Cmd(BaseCmd);
     Cmd.addFlag("merge_control_file", CFPath);
-    Cmd.addFlag("merge_inner", "1");
+    // If we are going to use the set cover implementation for
+    // minimization add the merge_inner=2 internal flag.
+    Cmd.addFlag("merge_inner", IsSetCoverMerge ? "2" : "1");
     if (!V) {
       Cmd.setOutputFile(getDevNull());
       Cmd.combineOutAndErr();
@@ -396,7 +523,10 @@
           M.ApproximateMemoryConsumption() >> 20, GetPeakRSSMb());
 
   M.Files.insert(M.Files.end(), KnownFiles.begin(), KnownFiles.end());
-  M.Merge(InitialFeatures, NewFeatures, InitialCov, NewCov, NewFiles);
+  if (IsSetCoverMerge)
+    M.SetCoverMerge(InitialFeatures, NewFeatures, InitialCov, NewCov, NewFiles);
+  else
+    M.Merge(InitialFeatures, NewFeatures, InitialCov, NewCov, NewFiles);
   VPrintf(V, "MERGE-OUTER: %zd new files with %zd new features added; "
           "%zd new coverage edges\n",
          NewFiles->size(), NewFeatures->size(), NewCov->size());
diff --git a/FuzzerMerge.h b/FuzzerMerge.h
index de21301..42f798e 100644
--- a/FuzzerMerge.h
+++ b/FuzzerMerge.h
@@ -69,6 +69,11 @@
                std::set<uint32_t> *NewFeatures,
                const std::set<uint32_t> &InitialCov, std::set<uint32_t> *NewCov,
                std::vector<std::string> *NewFiles);
+  size_t SetCoverMerge(const std::set<uint32_t> &InitialFeatures,
+                       std::set<uint32_t> *NewFeatures,
+                       const std::set<uint32_t> &InitialCov,
+                       std::set<uint32_t> *NewCov,
+                       std::vector<std::string> *NewFiles);
   size_t ApproximateMemoryConsumption() const;
   std::set<uint32_t> AllFeatures() const;
 };
@@ -81,7 +86,7 @@
                          std::set<uint32_t> *NewFeatures,
                          const std::set<uint32_t> &InitialCov,
                          std::set<uint32_t> *NewCov, const std::string &CFPath,
-                         bool Verbose);
+                         bool Verbose, bool IsSetCoverMerge);
 
 }  // namespace fuzzer
 
diff --git a/tests/FuzzerUnittest.cpp b/tests/FuzzerUnittest.cpp
index 99cf79c..1b0e734 100644
--- a/tests/FuzzerUnittest.cpp
+++ b/tests/FuzzerUnittest.cpp
@@ -865,6 +865,137 @@
   TRACED_EQ(NewFeatures, {1, 2, 3});
 }
 
+TEST(Merger, SetCoverMerge) {
+  Merger M;
+  std::set<uint32_t> Features, NewFeatures;
+  std::set<uint32_t> Cov, NewCov;
+  std::vector<std::string> NewFiles;
+
+  // Adds new files and features
+  EXPECT_TRUE(M.Parse("3\n0\nA\nB\nC\n"
+                      "STARTED 0 1000\n"
+                      "FT 0 1 2 3\n"
+                      "STARTED 1 1001\n"
+                      "FT 1 4 5 6 \n"
+                      "STARTED 2 1002\n"
+                      "FT 2 6 1 3\n"
+                      "",
+                      true));
+  EXPECT_EQ(M.SetCoverMerge(Features, &NewFeatures, Cov, &NewCov, &NewFiles),
+            6U);
+  TRACED_EQ(M.Files, {"A", "B", "C"});
+  TRACED_EQ(NewFiles, {"A", "B"});
+  TRACED_EQ(NewFeatures, {1, 2, 3, 4, 5, 6});
+
+  // Doesn't return features or files in the initial corpus.
+  EXPECT_TRUE(M.Parse("3\n1\nA\nB\nC\n"
+                      "STARTED 0 1000\n"
+                      "FT 0 1 2 3\n"
+                      "STARTED 1 1001\n"
+                      "FT 1 4 5 6 \n"
+                      "STARTED 2 1002\n"
+                      "FT 2 6 1 3\n"
+                      "",
+                      true));
+  EXPECT_EQ(M.SetCoverMerge(Features, &NewFeatures, Cov, &NewCov, &NewFiles),
+            3U);
+  TRACED_EQ(M.Files, {"A", "B", "C"});
+  TRACED_EQ(NewFiles, {"B"});
+  TRACED_EQ(NewFeatures, {4, 5, 6});
+
+  // No new features, so no new files
+  EXPECT_TRUE(M.Parse("3\n2\nA\nB\nC\n"
+                      "STARTED 0 1000\n"
+                      "FT 0 1 2 3\n"
+                      "STARTED 1 1001\n"
+                      "FT 1 4 5 6 \n"
+                      "STARTED 2 1002\n"
+                      "FT 2 6 1 3\n"
+                      "",
+                      true));
+  EXPECT_EQ(M.SetCoverMerge(Features, &NewFeatures, Cov, &NewCov, &NewFiles),
+            0U);
+  TRACED_EQ(M.Files, {"A", "B", "C"});
+  TRACED_EQ(NewFiles, {});
+  TRACED_EQ(NewFeatures, {});
+
+  // Can pass initial features and coverage.
+  Features = {1, 2, 3};
+  Cov = {};
+  EXPECT_TRUE(M.Parse("2\n0\nA\nB\n"
+                      "STARTED 0 1000\n"
+                      "FT 0 1 2 3\n"
+                      "STARTED 1 1001\n"
+                      "FT 1 4 5 6\n"
+                      "",
+                      true));
+  EXPECT_EQ(M.SetCoverMerge(Features, &NewFeatures, Cov, &NewCov, &NewFiles),
+            3U);
+  TRACED_EQ(M.Files, {"A", "B"});
+  TRACED_EQ(NewFiles, {"B"});
+  TRACED_EQ(NewFeatures, {4, 5, 6});
+  Features.clear();
+  Cov.clear();
+
+  // Prefer files with a lot of features first (C has 4 features)
+  // Then prefer B over A due to the smaller size. After choosing C and B,
+  // A and D have no new features to contribute.
+  EXPECT_TRUE(M.Parse("4\n0\nA\nB\nC\nD\n"
+                      "STARTED 0 2000\n"
+                      "FT 0 3 5 6\n"
+                      "STARTED 1 1000\n"
+                      "FT 1 4 5 6 \n"
+                      "STARTED 2 1000\n"
+                      "FT 2 1 2 3 4 \n"
+                      "STARTED 3 500\n"
+                      "FT 3 1  \n"
+                      "",
+                      true));
+  EXPECT_EQ(M.SetCoverMerge(Features, &NewFeatures, Cov, &NewCov, &NewFiles),
+            6U);
+  TRACED_EQ(M.Files, {"A", "B", "C", "D"});
+  TRACED_EQ(NewFiles, {"C", "B"});
+  TRACED_EQ(NewFeatures, {1, 2, 3, 4, 5, 6});
+
+  // Only 1 file covers all features.
+  EXPECT_TRUE(M.Parse("4\n1\nA\nB\nC\nD\n"
+                      "STARTED 0 2000\n"
+                      "FT 0 4 5 6 7 8\n"
+                      "STARTED 1 1100\n"
+                      "FT 1 1 2 3 \n"
+                      "STARTED 2 1100\n"
+                      "FT 2 2 3 \n"
+                      "STARTED 3 1000\n"
+                      "FT 3 1  \n"
+                      "",
+                      true));
+  EXPECT_EQ(M.SetCoverMerge(Features, &NewFeatures, Cov, &NewCov, &NewFiles),
+            3U);
+  TRACED_EQ(M.Files, {"A", "B", "C", "D"});
+  TRACED_EQ(NewFiles, {"B"});
+  TRACED_EQ(NewFeatures, {1, 2, 3});
+
+  // A Feature has a value greater than (1 << 21) and hence
+  // there are collisions in the underlying `covered features`
+  // bitvector.
+  EXPECT_TRUE(M.Parse("3\n0\nA\nB\nC\n"
+                      "STARTED 0 2000\n"
+                      "FT 0 1 2 3\n"
+                      "STARTED 1 1000\n"
+                      "FT 1 3 4 5 \n"
+                      "STARTED 2 1000\n"
+                      "FT 2 3 2097153 \n" // Last feature is (2^21 + 1).
+                      "",
+                      true));
+  EXPECT_EQ(M.SetCoverMerge(Features, &NewFeatures, Cov, &NewCov, &NewFiles),
+            5U);
+  TRACED_EQ(M.Files, {"A", "B", "C"});
+  // File 'C' is not added because its last feature is considered
+  // covered due to collision with feature 1.
+  TRACED_EQ(NewFiles, {"B", "A"});
+  TRACED_EQ(NewFeatures, {1, 2, 3, 4, 5});
+}
+
 #undef TRACED_EQ
 
 TEST(DFT, BlockCoverage) {