Redistribute energy for Corpus

I found that the initial corpus allocation of fork mode has certain defects.
I designed a new initial corpus allocation strategy based on size grouping.
This method can give more energy to the small seeds in the corpus and
increase the throughput of the test.

Fuzzbench data (glibfuzzer is -fork_corpus_groups=1):
https://www.fuzzbench.com/reports/experimental/2021-08-05-parallel/index.html

Reviewed By: morehouse

Differential Revision: https://reviews.llvm.org/D105084

GitOrigin-RevId: a30dbbe9241fa72580638eb08468d3e6a39cf2b0
diff --git a/FuzzerDriver.cpp b/FuzzerDriver.cpp
index c1a7e1b..6b007f2 100644
--- a/FuzzerDriver.cpp
+++ b/FuzzerDriver.cpp
@@ -870,6 +870,7 @@
     exit(0);
   }
 
+  Options.ForkCorpusGroups = Flags.fork_corpus_groups;
   if (Flags.fork)
     FuzzWithFork(F->GetMD().GetRand(), Options, Args, *Inputs, Flags.fork);
 
diff --git a/FuzzerFlags.def b/FuzzerFlags.def
index d0cf0e9..1181534 100644
--- a/FuzzerFlags.def
+++ b/FuzzerFlags.def
@@ -58,6 +58,10 @@
 FUZZER_FLAG_INT(help, 0, "Print help.")
 FUZZER_FLAG_INT(fork, 0, "Experimental mode where fuzzing happens "
                 "in a subprocess")
+FUZZER_FLAG_INT(fork_corpus_groups, 0, "For fork mode, enable the corpus-group "
+		"strategy, The main corpus will be grouped according to size, "
+		"and each sub-process will randomly select seeds from different "
+		"groups as the sub-corpus.")
 FUZZER_FLAG_INT(ignore_timeouts, 1, "Ignore timeouts in fork mode")
 FUZZER_FLAG_INT(ignore_ooms, 1, "Ignore OOMs in fork mode")
 FUZZER_FLAG_INT(ignore_crashes, 0, "Ignore crashes in fork mode")
diff --git a/FuzzerFork.cpp b/FuzzerFork.cpp
index 8771ab3..eb68b72 100644
--- a/FuzzerFork.cpp
+++ b/FuzzerFork.cpp
@@ -95,9 +95,12 @@
   std::set<uint32_t> Features, Cov;
   std::set<std::string> FilesWithDFT;
   std::vector<std::string> Files;
+  std::vector<std::size_t> FilesSizes;
   Random *Rand;
   std::chrono::system_clock::time_point ProcessStartTime;
   int Verbosity = 0;
+  int Group = 0;
+  int NumCorpuses = 8;
 
   size_t NumTimeouts = 0;
   size_t NumOOMs = 0;
@@ -136,10 +139,24 @@
     if (size_t CorpusSubsetSize =
             std::min(Files.size(), (size_t)sqrt(Files.size() + 2))) {
       auto Time1 = std::chrono::system_clock::now();
-      for (size_t i = 0; i < CorpusSubsetSize; i++) {
-        auto &SF = Files[Rand->SkewTowardsLast(Files.size())];
-        Seeds += (Seeds.empty() ? "" : ",") + SF;
-        CollectDFT(SF);
+      if (Group) { // whether to group the corpus.
+        size_t AverageCorpusSize = Files.size() / NumCorpuses + 1;
+        size_t StartIndex = ((JobId - 1) % NumCorpuses) * AverageCorpusSize;
+        for (size_t i = 0; i < CorpusSubsetSize; i++) {
+          size_t RandNum = (*Rand)(AverageCorpusSize);
+          size_t Index = RandNum + StartIndex;
+          Index = Index < Files.size() ? Index
+                                       : Rand->SkewTowardsLast(Files.size());
+          auto &SF = Files[Index];
+          Seeds += (Seeds.empty() ? "" : ",") + SF;
+          CollectDFT(SF);
+        }
+      } else {
+        for (size_t i = 0; i < CorpusSubsetSize; i++) {
+          auto &SF = Files[Rand->SkewTowardsLast(Files.size())];
+          Seeds += (Seeds.empty() ? "" : ",") + SF;
+          CollectDFT(SF);
+        }
       }
       auto Time2 = std::chrono::system_clock::now();
       auto DftTimeInSeconds = duration_cast<seconds>(Time2 - Time1).count();
@@ -222,7 +239,16 @@
       auto U = FileToVector(Path);
       auto NewPath = DirPlusFile(MainCorpusDir, Hash(U));
       WriteToFile(U, NewPath);
-      Files.push_back(NewPath);
+      if (Group) { // Insert the queue according to the size of the seed.
+        size_t UnitSize = U.size();
+        auto Idx =
+            std::upper_bound(FilesSizes.begin(), FilesSizes.end(), UnitSize) -
+            FilesSizes.begin();
+        FilesSizes.insert(FilesSizes.begin() + Idx, UnitSize);
+        Files.insert(Files.begin() + Idx, NewPath);
+      } else {
+        Files.push_back(NewPath);
+      }
     }
     Features.insert(NewFeatures.begin(), NewFeatures.end());
     Cov.insert(NewCov.begin(), NewCov.end());
@@ -231,10 +257,8 @@
         if (TPC.PcIsFuncEntry(TE))
           PrintPC("  NEW_FUNC: %p %F %L\n", "",
                   TPC.GetNextInstructionPc(TE->PC));
-
   }
 
-
   void CollectDFT(const std::string &InputPath) {
     if (DataFlowBinary.empty()) return;
     if (!FilesWithDFT.insert(InputPath).second) return;
@@ -297,6 +321,7 @@
   Env.Verbosity = Options.Verbosity;
   Env.ProcessStartTime = std::chrono::system_clock::now();
   Env.DataFlowBinary = Options.CollectDataFlow;
+  Env.Group = Options.ForkCorpusGroups;
 
   std::vector<SizedFile> SeedFiles;
   for (auto &Dir : CorpusDirs)
@@ -327,6 +352,12 @@
     Env.Cov.insert(NewFeatures.begin(), NewFeatures.end());
     RemoveFile(CFPath);
   }
+
+  if (Env.Group) {
+    for (auto &path : Env.Files)
+      Env.FilesSizes.push_back(FileSize(path));
+  }
+
   Printf("INFO: -fork=%d: %zd seed inputs, starting to fuzz in %s\n", NumJobs,
          Env.Files.size(), Env.TempDir.c_str());
 
@@ -341,6 +372,8 @@
     WriteToFile(Unit({1}), Env.StopFile());
   };
 
+  size_t MergeCycle = 20;
+  size_t JobExecuted = 0;
   size_t JobId = 1;
   std::vector<std::thread> Threads;
   for (int t = 0; t < NumJobs; t++) {
@@ -362,6 +395,45 @@
 
     Env.RunOneMergeJob(Job.get());
 
+    // merge the corpus .
+    JobExecuted++;
+    if (Env.Group && JobExecuted >= MergeCycle) {
+      std::vector<SizedFile> CurrentSeedFiles;
+      for (auto &Dir : CorpusDirs)
+        GetSizedFilesFromDir(Dir, &CurrentSeedFiles);
+      std::sort(CurrentSeedFiles.begin(), CurrentSeedFiles.end());
+
+      auto CFPath = DirPlusFile(Env.TempDir, "merge.txt");
+      std::set<uint32_t> TmpNewFeatures, TmpNewCov;
+      std::set<uint32_t> TmpFeatures, TmpCov;
+      Env.Files.clear();
+      Env.FilesSizes.clear();
+      CrashResistantMerge(Env.Args, {}, CurrentSeedFiles, &Env.Files,
+                          TmpFeatures, &TmpNewFeatures, TmpCov, &TmpNewCov,
+                          CFPath, false);
+      for (auto &path : Env.Files)
+        Env.FilesSizes.push_back(FileSize(path));
+      RemoveFile(CFPath);
+      JobExecuted = 0;
+      MergeCycle += 5;
+    }
+
+    // Since the number of corpus seeds will gradually increase, in order to
+    // control the number in each group to be about three times the number of
+    // seeds selected each time, the number of groups is dynamically adjusted.
+    if (Env.Files.size() < 2000)
+      Env.NumCorpuses = 12;
+    else if (Env.Files.size() < 6000)
+      Env.NumCorpuses = 20;
+    else if (Env.Files.size() < 12000)
+      Env.NumCorpuses = 32;
+    else if (Env.Files.size() < 16000)
+      Env.NumCorpuses = 40;
+    else if (Env.Files.size() < 24000)
+      Env.NumCorpuses = 60;
+    else
+      Env.NumCorpuses = 80;
+
     // Continue if our crash is one of the ignored ones.
     if (Options.IgnoreTimeouts && ExitCode == Options.TimeoutExitCode)
       Env.NumTimeouts++;
diff --git a/FuzzerOptions.h b/FuzzerOptions.h
index d0c285a..72e2561 100644
--- a/FuzzerOptions.h
+++ b/FuzzerOptions.h
@@ -47,6 +47,7 @@
   int ReportSlowUnits = 10;
   bool OnlyASCII = false;
   bool Entropic = true;
+  bool ForkCorpusGroups = false;
   size_t EntropicFeatureFrequencyThreshold = 0xFF;
   size_t EntropicNumberOfRarestFeatures = 100;
   bool EntropicScalePerExecTime = false;