Redistribute energy for Corpus I found that the initial corpus allocation of fork mode has certain defects. I designed a new initial corpus allocation strategy based on size grouping. This method can give more energy to the small seeds in the corpus and increase the throughput of the test. Fuzzbench data (glibfuzzer is -fork_corpus_groups=1): https://www.fuzzbench.com/reports/experimental/2021-08-05-parallel/index.html Reviewed By: morehouse Differential Revision: https://reviews.llvm.org/D105084 GitOrigin-RevId: a30dbbe9241fa72580638eb08468d3e6a39cf2b0

commit: bc98df024fff5a9c4b85a84b8571b2a597b636d9 [log] [tgz]
author: gtt1995 <1271159369@qq.com> Wed Sep 08 06:18:19 2021 -0700
committer: Copybara-Service <copybara-worker@google.com> Wed Sep 08 09:25:44 2021 -0700
tree: 2d15e12d8b6f6a8e05908a714481c047a3f0c7b0
parent: 3e55f4356f9d4754b13ac7b1f72e5e6e7164d833 [diff]
diff --git a/FuzzerDriver.cpp b/FuzzerDriver.cpp
index c1a7e1b..6b007f2 100644
--- a/FuzzerDriver.cpp
+++ b/FuzzerDriver.cpp

@@ -870,6 +870,7 @@
     exit(0);
   }
 
+  Options.ForkCorpusGroups = Flags.fork_corpus_groups;
   if (Flags.fork)
     FuzzWithFork(F->GetMD().GetRand(), Options, Args, *Inputs, Flags.fork);
 

diff --git a/FuzzerFlags.def b/FuzzerFlags.def
index d0cf0e9..1181534 100644
--- a/FuzzerFlags.def
+++ b/FuzzerFlags.def

@@ -58,6 +58,10 @@
 FUZZER_FLAG_INT(help, 0, "Print help.")
 FUZZER_FLAG_INT(fork, 0, "Experimental mode where fuzzing happens "
                 "in a subprocess")
+FUZZER_FLAG_INT(fork_corpus_groups, 0, "For fork mode, enable the corpus-group "
+		"strategy, The main corpus will be grouped according to size, "
+		"and each sub-process will randomly select seeds from different "
+		"groups as the sub-corpus.")
 FUZZER_FLAG_INT(ignore_timeouts, 1, "Ignore timeouts in fork mode")
 FUZZER_FLAG_INT(ignore_ooms, 1, "Ignore OOMs in fork mode")
 FUZZER_FLAG_INT(ignore_crashes, 0, "Ignore crashes in fork mode")

diff --git a/FuzzerFork.cpp b/FuzzerFork.cpp
index 8771ab3..eb68b72 100644
--- a/FuzzerFork.cpp
+++ b/FuzzerFork.cpp

@@ -95,9 +95,12 @@
   std::set<uint32_t> Features, Cov;
   std::set<std::string> FilesWithDFT;
   std::vector<std::string> Files;
+  std::vector<std::size_t> FilesSizes;
   Random *Rand;
   std::chrono::system_clock::time_point ProcessStartTime;
   int Verbosity = 0;
+  int Group = 0;
+  int NumCorpuses = 8;
 
   size_t NumTimeouts = 0;
   size_t NumOOMs = 0;
@@ -136,10 +139,24 @@
     if (size_t CorpusSubsetSize =
             std::min(Files.size(), (size_t)sqrt(Files.size() + 2))) {
       auto Time1 = std::chrono::system_clock::now();
-      for (size_t i = 0; i < CorpusSubsetSize; i++) {
-        auto &SF = Files[Rand->SkewTowardsLast(Files.size())];
-        Seeds += (Seeds.empty() ? "" : ",") + SF;
-        CollectDFT(SF);
+      if (Group) { // whether to group the corpus.
+        size_t AverageCorpusSize = Files.size() / NumCorpuses + 1;
+        size_t StartIndex = ((JobId - 1) % NumCorpuses) * AverageCorpusSize;
+        for (size_t i = 0; i < CorpusSubsetSize; i++) {
+          size_t RandNum = (*Rand)(AverageCorpusSize);
+          size_t Index = RandNum + StartIndex;
+          Index = Index < Files.size() ? Index
+                                       : Rand->SkewTowardsLast(Files.size());
+          auto &SF = Files[Index];
+          Seeds += (Seeds.empty() ? "" : ",") + SF;
+          CollectDFT(SF);
+        }
+      } else {
+        for (size_t i = 0; i < CorpusSubsetSize; i++) {
+          auto &SF = Files[Rand->SkewTowardsLast(Files.size())];
+          Seeds += (Seeds.empty() ? "" : ",") + SF;
+          CollectDFT(SF);
+        }
       }
       auto Time2 = std::chrono::system_clock::now();
       auto DftTimeInSeconds = duration_cast<seconds>(Time2 - Time1).count();
@@ -222,7 +239,16 @@
       auto U = FileToVector(Path);
       auto NewPath = DirPlusFile(MainCorpusDir, Hash(U));
       WriteToFile(U, NewPath);
-      Files.push_back(NewPath);
+      if (Group) { // Insert the queue according to the size of the seed.
+        size_t UnitSize = U.size();
+        auto Idx =
+            std::upper_bound(FilesSizes.begin(), FilesSizes.end(), UnitSize) -
+            FilesSizes.begin();
+        FilesSizes.insert(FilesSizes.begin() + Idx, UnitSize);
+        Files.insert(Files.begin() + Idx, NewPath);
+      } else {
+        Files.push_back(NewPath);
+      }
     }
     Features.insert(NewFeatures.begin(), NewFeatures.end());
     Cov.insert(NewCov.begin(), NewCov.end());
@@ -231,10 +257,8 @@
         if (TPC.PcIsFuncEntry(TE))
           PrintPC("  NEW_FUNC: %p %F %L\n", "",
                   TPC.GetNextInstructionPc(TE->PC));
-
   }
 
-
   void CollectDFT(const std::string &InputPath) {
     if (DataFlowBinary.empty()) return;
     if (!FilesWithDFT.insert(InputPath).second) return;
@@ -297,6 +321,7 @@
   Env.Verbosity = Options.Verbosity;
   Env.ProcessStartTime = std::chrono::system_clock::now();
   Env.DataFlowBinary = Options.CollectDataFlow;
+  Env.Group = Options.ForkCorpusGroups;
 
   std::vector<SizedFile> SeedFiles;
   for (auto &Dir : CorpusDirs)
@@ -327,6 +352,12 @@
     Env.Cov.insert(NewFeatures.begin(), NewFeatures.end());
     RemoveFile(CFPath);
   }
+
+  if (Env.Group) {
+    for (auto &path : Env.Files)
+      Env.FilesSizes.push_back(FileSize(path));
+  }
+
   Printf("INFO: -fork=%d: %zd seed inputs, starting to fuzz in %s\n", NumJobs,
          Env.Files.size(), Env.TempDir.c_str());
 
@@ -341,6 +372,8 @@
     WriteToFile(Unit({1}), Env.StopFile());
   };
 
+  size_t MergeCycle = 20;
+  size_t JobExecuted = 0;
   size_t JobId = 1;
   std::vector<std::thread> Threads;
   for (int t = 0; t < NumJobs; t++) {
@@ -362,6 +395,45 @@
 
     Env.RunOneMergeJob(Job.get());
 
+    // merge the corpus .
+    JobExecuted++;
+    if (Env.Group && JobExecuted >= MergeCycle) {
+      std::vector<SizedFile> CurrentSeedFiles;
+      for (auto &Dir : CorpusDirs)
+        GetSizedFilesFromDir(Dir, &CurrentSeedFiles);
+      std::sort(CurrentSeedFiles.begin(), CurrentSeedFiles.end());
+
+      auto CFPath = DirPlusFile(Env.TempDir, "merge.txt");
+      std::set<uint32_t> TmpNewFeatures, TmpNewCov;
+      std::set<uint32_t> TmpFeatures, TmpCov;
+      Env.Files.clear();
+      Env.FilesSizes.clear();
+      CrashResistantMerge(Env.Args, {}, CurrentSeedFiles, &Env.Files,
+                          TmpFeatures, &TmpNewFeatures, TmpCov, &TmpNewCov,
+                          CFPath, false);
+      for (auto &path : Env.Files)
+        Env.FilesSizes.push_back(FileSize(path));
+      RemoveFile(CFPath);
+      JobExecuted = 0;
+      MergeCycle += 5;
+    }
+
+    // Since the number of corpus seeds will gradually increase, in order to
+    // control the number in each group to be about three times the number of
+    // seeds selected each time, the number of groups is dynamically adjusted.
+    if (Env.Files.size() < 2000)
+      Env.NumCorpuses = 12;
+    else if (Env.Files.size() < 6000)
+      Env.NumCorpuses = 20;
+    else if (Env.Files.size() < 12000)
+      Env.NumCorpuses = 32;
+    else if (Env.Files.size() < 16000)
+      Env.NumCorpuses = 40;
+    else if (Env.Files.size() < 24000)
+      Env.NumCorpuses = 60;
+    else
+      Env.NumCorpuses = 80;
+
     // Continue if our crash is one of the ignored ones.
     if (Options.IgnoreTimeouts && ExitCode == Options.TimeoutExitCode)
       Env.NumTimeouts++;

diff --git a/FuzzerOptions.h b/FuzzerOptions.h
index d0c285a..72e2561 100644
--- a/FuzzerOptions.h
+++ b/FuzzerOptions.h

@@ -47,6 +47,7 @@
   int ReportSlowUnits = 10;
   bool OnlyASCII = false;
   bool Entropic = true;
+  bool ForkCorpusGroups = false;
   size_t EntropicFeatureFrequencyThreshold = 0xFF;
   size_t EntropicNumberOfRarestFeatures = 100;
   bool EntropicScalePerExecTime = false;
commit	bc98df024fff5a9c4b85a84b8571b2a597b636d9	[log] [tgz]
author	gtt1995 <1271159369@qq.com>	Wed Sep 08 06:18:19 2021 -0700
committer	Copybara-Service <copybara-worker@google.com>	Wed Sep 08 09:25:44 2021 -0700
tree	2d15e12d8b6f6a8e05908a714481c047a3f0c7b0
parent	3e55f4356f9d4754b13ac7b1f72e5e6e7164d833 [diff]