[scudo][standalone] Allow the release of smaller sizes

Initially we were avoiding the release of smaller size classes because
it was an expensive operation, particularly on 32-bit platforms: with a
lot of batches, and a lot of blocks per page, it was a lengthy operation
with little to show for it.

There have been some improvements to the 32-bit release since then, and
we still have criteria preventing us from wasting time (eg, 9x% of the
blocks in the size class must be free, etc).
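
For illustration, the gating that keeps us from wasting time looks
roughly like the sketch below; the names, the page-worth minimum and the
90% threshold are placeholder assumptions mirroring the description
above, not the exact code in `releaseToOSMaybe`.

```cpp
#include <cstdint>

// Rough sketch only: the real gating lives in releaseToOSMaybe() and
// differs in detail. The names and thresholds here are assumptions.
static bool worthReleasing(uint64_t BytesInFreeList, uint64_t AllocatedUser,
                           uint64_t BlockSize, uint64_t PageSize, bool Force) {
  if (AllocatedUser == 0 || BytesInFreeList < PageSize)
    return false; // Not even a page's worth of free bytes to give back.
  if (Force)
    return true; // A forced release bypasses the heuristics.
  if (BlockSize < PageSize / 16) {
    // Small size classes: only pay for the page scan once most of the
    // class is free (the "9x%" criterion mentioned above).
    const uint64_t FreePercentage = (BytesInFreeList * 100) / AllocatedUser;
    if (FreePercentage < 90)
      return false;
  }
  return true;
}
```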

Allowing the release of blocks < 128 bytes helps in situations where a
lot of small chunks would not have been reclaimed if not for a forced
reclaim.
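
As context for the forced reclaim: on Android the usual entry point is
`mallopt(M_PURGE, 0)`, which I assume ends up in the primary's
`releaseToOS()` changed below; the exact plumbing is sketched here for
illustration only.

```cpp
#include <malloc.h>

// Illustrative sketch, assuming Android's M_PURGE mallopt is available
// and wired to Scudo's releaseToOS(): a caller that wants small chunks
// back right away forces a reclaim instead of waiting for the
// interval-based one.
void forceReclaim() {
#if defined(__ANDROID__) && defined(M_PURGE)
  mallopt(M_PURGE, 0); // Walks all size classes, now including small ones.
#endif
}
```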

Additionally, change some `CHECK`s to `DCHECK`s and rearrange the code
a bit.
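
For context, the `DCHECK` variants only cost anything in debug builds; a
simplified sketch of the convention I assume the Scudo headers follow:

```cpp
#include <cstdio>
#include <cstdlib>

// CHECK_* is always compiled in and aborts on failure, while DCHECK_*
// only expands to a real check when SCUDO_DEBUG is set, so release
// builds of hot paths such as releaseToOSMaybe() skip the comparison.
#define CHECK_GE(A, B)                                                        \
  do {                                                                        \
    if (!((A) >= (B))) {                                                      \
      fprintf(stderr, "CHECK failed: %s >= %s\n", #A, #B);                    \
      abort();                                                                \
    }                                                                         \
  } while (0)

#if SCUDO_DEBUG
#define DCHECK_GE(A, B) CHECK_GE(A, B)
#else
#define DCHECK_GE(A, B) ((void)0) // Compiled out in release builds.
#endif
```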

I didn't experience any regressions in my benchmarks.

Differential Revision: https://reviews.llvm.org/D93141

GitOrigin-RevId: 1dbf2c96bce93e0a954806d9bdcafcb702a06672
diff --git a/primary32.h b/primary32.h
index 016a96b..0db95d2 100644
--- a/primary32.h
+++ b/primary32.h
@@ -76,17 +76,12 @@
     if (UNLIKELY(!getRandom(reinterpret_cast<void *>(&Seed), sizeof(Seed))))
       Seed = static_cast<u32>(
           Time ^ (reinterpret_cast<uptr>(SizeClassInfoArray) >> 6));
-    const uptr PageSize = getPageSizeCached();
     for (uptr I = 0; I < NumClasses; I++) {
       SizeClassInfo *Sci = getSizeClassInfo(I);
       Sci->RandState = getRandomU32(&Seed);
       // Sci->MaxRegionIndex is already initialized to 0.
       Sci->MinRegionIndex = NumRegions;
-      // See comment in the 64-bit primary about releasing smaller size classes.
-      Sci->CanRelease = (I != SizeClassMap::BatchClassId) &&
-                        (getSizeByClassId(I) >= (PageSize / 32));
-      if (Sci->CanRelease)
-        Sci->ReleaseInfo.LastReleaseAtNs = Time;
+      Sci->ReleaseInfo.LastReleaseAtNs = Time;
     }
     setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval));
   }
@@ -137,7 +132,7 @@
     ScopedLock L(Sci->Mutex);
     Sci->FreeList.push_front(B);
     Sci->Stats.PushedBlocks += B->getCount();
-    if (Sci->CanRelease)
+    if (ClassId != SizeClassMap::BatchClassId)
       releaseToOSMaybe(Sci, ClassId);
   }
 
@@ -217,6 +212,8 @@
   uptr releaseToOS() {
     uptr TotalReleasedBytes = 0;
     for (uptr I = 0; I < NumClasses; I++) {
+      if (I == SizeClassMap::BatchClassId)
+        continue;
       SizeClassInfo *Sci = getSizeClassInfo(I);
       ScopedLock L(Sci->Mutex);
       TotalReleasedBytes += releaseToOSMaybe(Sci, I, /*Force=*/true);
@@ -262,7 +259,6 @@
     uptr CurrentRegion;
     uptr CurrentRegionAllocated;
     SizeClassStats Stats;
-    bool CanRelease;
     u32 RandState;
     uptr AllocatedUser;
     // Lowest & highest region index allocated for this size class, to avoid
diff --git a/primary64.h b/primary64.h
index f6c4d8c..f9854cb 100644
--- a/primary64.h
+++ b/primary64.h
@@ -80,17 +80,7 @@
       Region->RegionBeg =
           getRegionBaseByClassId(I) + (getRandomModN(&Seed, 16) + 1) * PageSize;
       Region->RandState = getRandomU32(&Seed);
-      // Releasing smaller size classes doesn't necessarily yield to a
-      // meaningful RSS impact: there are more blocks per page, they are
-      // randomized around, and thus pages are less likely to be entirely empty.
-      // On top of this, attempting to release those require more iterations and
-      // memory accesses which ends up being fairly costly. The current lower
-      // limit is mostly arbitrary and based on empirical observations.
-      // TODO(kostyak): make the lower limit a runtime option
-      Region->CanRelease = (I != SizeClassMap::BatchClassId) &&
-                           (getSizeByClassId(I) >= (PageSize / 32));
-      if (Region->CanRelease)
-        Region->ReleaseInfo.LastReleaseAtNs = Time;
+      Region->ReleaseInfo.LastReleaseAtNs = Time;
     }
     setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval));
 
@@ -129,7 +119,7 @@
     ScopedLock L(Region->Mutex);
     Region->FreeList.push_front(B);
     Region->Stats.PushedBlocks += B->getCount();
-    if (Region->CanRelease)
+    if (ClassId != SizeClassMap::BatchClassId)
       releaseToOSMaybe(Region, ClassId);
   }
 
@@ -201,6 +191,8 @@
   uptr releaseToOS() {
     uptr TotalReleasedBytes = 0;
     for (uptr I = 0; I < NumClasses; I++) {
+      if (I == SizeClassMap::BatchClassId)
+        continue;
       RegionInfo *Region = getRegionInfo(I);
       ScopedLock L(Region->Mutex);
       TotalReleasedBytes += releaseToOSMaybe(Region, I, /*Force=*/true);
@@ -291,7 +283,6 @@
     HybridMutex Mutex;
     SinglyLinkedList<TransferBatch> FreeList;
     RegionStats Stats;
-    bool CanRelease;
     bool Exhausted;
     u32 RandState;
     uptr RegionBeg;
@@ -417,7 +408,7 @@
     const uptr BlockSize = getSizeByClassId(ClassId);
     const uptr PageSize = getPageSizeCached();
 
-    CHECK_GE(Region->Stats.PoppedBlocks, Region->Stats.PushedBlocks);
+    DCHECK_GE(Region->Stats.PoppedBlocks, Region->Stats.PushedBlocks);
     const uptr BytesInFreeList =
         Region->AllocatedUser -
         (Region->Stats.PoppedBlocks - Region->Stats.PushedBlocks) * BlockSize;
diff --git a/release.h b/release.h
index cd9e66d..5c11da2 100644
--- a/release.h
+++ b/release.h
@@ -52,20 +52,20 @@
   PackedCounterArray(uptr NumberOfRegions, uptr CountersPerRegion,
                      uptr MaxValue)
       : Regions(NumberOfRegions), NumCounters(CountersPerRegion) {
-    CHECK_GT(Regions, 0);
-    CHECK_GT(NumCounters, 0);
-    CHECK_GT(MaxValue, 0);
+    DCHECK_GT(Regions, 0);
+    DCHECK_GT(NumCounters, 0);
+    DCHECK_GT(MaxValue, 0);
     constexpr uptr MaxCounterBits = sizeof(*Buffer) * 8UL;
     // Rounding counter storage size up to the power of two allows for using
     // bit shifts calculating particular counter's Index and offset.
     const uptr CounterSizeBits =
         roundUpToPowerOfTwo(getMostSignificantSetBitIndex(MaxValue) + 1);
-    CHECK_LE(CounterSizeBits, MaxCounterBits);
+    DCHECK_LE(CounterSizeBits, MaxCounterBits);
     CounterSizeBitsLog = getLog2(CounterSizeBits);
     CounterMask = ~(static_cast<uptr>(0)) >> (MaxCounterBits - CounterSizeBits);
 
     const uptr PackingRatio = MaxCounterBits >> CounterSizeBitsLog;
-    CHECK_GT(PackingRatio, 0);
+    DCHECK_GT(PackingRatio, 0);
     PackingRatioLog = getLog2(PackingRatio);
     BitOffsetMask = PackingRatio - 1;
 
@@ -235,20 +235,14 @@
   if (BlockSize <= PageSize && PageSize % BlockSize == 0) {
     // Each chunk affects one page only.
     for (const auto &It : FreeList) {
-      // If dealing with a TransferBatch, the first pointer of the batch will
-      // point to the batch itself, we do not want to mark this for release as
-      // the batch is in use, so skip the first entry.
-      const bool IsTransferBatch =
-          (It.getCount() != 0) &&
-          (reinterpret_cast<uptr>(It.get(0)) == reinterpret_cast<uptr>(&It));
-      for (u32 I = IsTransferBatch ? 1 : 0; I < It.getCount(); I++) {
+      for (u32 I = 0; I < It.getCount(); I++) {
         const uptr P = reinterpret_cast<uptr>(It.get(I)) - Base;
         // This takes care of P < Base and P >= Base + RoundedSize.
-        if (P < RoundedSize) {
-          const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize;
-          const uptr PInRegion = P - RegionIndex * RegionSize;
-          Counters.inc(RegionIndex, PInRegion >> PageSizeLog);
-        }
+        if (UNLIKELY(P >= RoundedSize))
+          continue;
+        const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize;
+        const uptr PInRegion = P - RegionIndex * RegionSize;
+        Counters.inc(RegionIndex, PInRegion >> PageSizeLog);
       }
     }
   } else {
@@ -256,27 +250,23 @@
     DCHECK_GE(RegionSize, BlockSize);
     const uptr LastBlockInRegion = ((RegionSize / BlockSize) - 1U) * BlockSize;
     for (const auto &It : FreeList) {
-      // See TransferBatch comment above.
-      const bool IsTransferBatch =
-          (It.getCount() != 0) &&
-          (reinterpret_cast<uptr>(It.get(0)) == reinterpret_cast<uptr>(&It));
-      for (u32 I = IsTransferBatch ? 1 : 0; I < It.getCount(); I++) {
+      for (u32 I = 0; I < It.getCount(); I++) {
         const uptr P = reinterpret_cast<uptr>(It.get(I)) - Base;
         // This takes care of P < Base and P >= Base + RoundedSize.
-        if (P < RoundedSize) {
-          const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize;
-          uptr PInRegion = P - RegionIndex * RegionSize;
-          Counters.incRange(RegionIndex, PInRegion >> PageSizeLog,
-                            (PInRegion + BlockSize - 1) >> PageSizeLog);
-          // The last block in a region might straddle a page, so if it's
-          // free, we mark the following "pretend" memory block(s) as free.
-          if (PInRegion == LastBlockInRegion) {
+        if (UNLIKELY(P >= RoundedSize))
+          continue;
+        const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize;
+        uptr PInRegion = P - RegionIndex * RegionSize;
+        Counters.incRange(RegionIndex, PInRegion >> PageSizeLog,
+                          (PInRegion + BlockSize - 1) >> PageSizeLog);
+        // The last block in a region might straddle a page, so if it's
+        // free, we mark the following "pretend" memory block(s) as free.
+        if (PInRegion == LastBlockInRegion) {
+          PInRegion += BlockSize;
+          while (PInRegion < RoundedRegionSize) {
+            Counters.incRange(RegionIndex, PInRegion >> PageSizeLog,
+                              (PInRegion + BlockSize - 1) >> PageSizeLog);
             PInRegion += BlockSize;
-            while (PInRegion < RoundedRegionSize) {
-              Counters.incRange(RegionIndex, PInRegion >> PageSizeLog,
-                                (PInRegion + BlockSize - 1) >> PageSizeLog);
-              PInRegion += BlockSize;
-            }
           }
         }
       }
@@ -327,7 +317,6 @@
           }
         }
         PrevPageBoundary = PageBoundary;
-
         RangeTracker.processNextPage(Counters.get(I, J) == BlocksPerPage);
       }
     }