[libc][NFC] Allow memset (and bzero) to be inlined

This allows shipping individual functions without also having to provide
memset or bzero at the expense of bigger functions.

Similar to D113097.

Differential Revision: https://reviews.llvm.org/D113108

GitOrigin-RevId: c02aa15438459a2b148c1c84267fff3e926c2632
diff --git a/src/string/CMakeLists.txt b/src/string/CMakeLists.txt
index 4631818..5a130fc 100644
--- a/src/string/CMakeLists.txt
+++ b/src/string/CMakeLists.txt
@@ -389,7 +389,7 @@
 
 function(add_memset memset_name)
   add_implementation(memset ${memset_name}
-    SRCS ${MEMSET_SRC}
+    SRCS ${LIBC_SOURCE_DIR}/src/string/memset.cpp
     HDRS ${LIBC_SOURCE_DIR}/src/string/memset.h
     DEPENDS
       .memory_utils.memory_utils
@@ -401,7 +401,6 @@
 endfunction()
 
 if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
-  set(MEMSET_SRC ${LIBC_SOURCE_DIR}/src/string/memset.cpp)
   add_memset(memset_x86_64_opt_sse2   COMPILE_OPTIONS -march=k8             REQUIRE SSE2)
   add_memset(memset_x86_64_opt_sse4   COMPILE_OPTIONS -march=nehalem        REQUIRE SSE4_2)
   add_memset(memset_x86_64_opt_avx2   COMPILE_OPTIONS -march=haswell        REQUIRE AVX2)
@@ -409,12 +408,10 @@
   add_memset(memset_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
   add_memset(memset)
 elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
-  set(MEMSET_SRC ${LIBC_SOURCE_DIR}/src/string/aarch64/memset.cpp)
   add_memset(memset_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}
                                       COMPILE_OPTIONS "SHELL:-mllvm --tail-merge-threshold=0")
   add_memset(memset                   COMPILE_OPTIONS "SHELL:-mllvm --tail-merge-threshold=0")
 else()
-  set(MEMSET_SRC ${LIBC_SOURCE_DIR}/src/string/memset.cpp)
   add_memset(memset_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
   add_memset(memset)
 endif()
diff --git a/src/string/aarch64/memset.cpp b/src/string/aarch64/memset.cpp
deleted file mode 100644
index fa66ffe..0000000
--- a/src/string/aarch64/memset.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-//===-- Implementation of memset ------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/string/memset.h"
-#include "src/__support/common.h"
-#include "src/string/memory_utils/memset_utils.h"
-
-namespace __llvm_libc {
-
-using namespace __llvm_libc::aarch64_memset;
-
-inline static void AArch64Memset(char *dst, int value, size_t count) {
-  if (count == 0)
-    return;
-  if (count <= 3) {
-    SplatSet<_1>(dst, value);
-    if (count > 1)
-      SplatSet<Tail<_2>>(dst, value, count);
-    return;
-  }
-  if (count <= 8)
-    return SplatSet<HeadTail<_4>>(dst, value, count);
-  if (count <= 16)
-    return SplatSet<HeadTail<_8>>(dst, value, count);
-  if (count <= 32)
-    return SplatSet<HeadTail<_16>>(dst, value, count);
-  if (count <= 96) {
-    SplatSet<_32>(dst, value);
-    if (count <= 64)
-      return SplatSet<Tail<_32>>(dst, value, count);
-    SplatSet<Skip<32>::Then<_32>>(dst, value);
-    SplatSet<Tail<_32>>(dst, value, count);
-    return;
-  }
-  if (count < 448 || value != 0 || !AArch64ZVA(dst, count))
-    return SplatSet<Align<_16, Arg::_1>::Then<Loop<_64>>>(dst, value, count);
-}
-
-LLVM_LIBC_FUNCTION(void *, memset, (void *dst, int value, size_t count)) {
-  AArch64Memset((char *)dst, value, count);
-  return dst;
-}
-
-} // namespace __llvm_libc
diff --git a/src/string/bzero.cpp b/src/string/bzero.cpp
index 3c76ef6..c57c922 100644
--- a/src/string/bzero.cpp
+++ b/src/string/bzero.cpp
@@ -8,12 +8,12 @@
 
 #include "src/string/bzero.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/memset_utils.h"
+#include "src/string/memory_utils/memset_implementations.h"
 
 namespace __llvm_libc {
 
 LLVM_LIBC_FUNCTION(void, bzero, (void *ptr, size_t count)) {
-  GeneralPurposeMemset(reinterpret_cast<char *>(ptr), 0, count);
+  inline_memset(reinterpret_cast<char *>(ptr), 0, count);
 }
 
 } // namespace __llvm_libc
diff --git a/src/string/memory_utils/memset_implementations.h b/src/string/memory_utils/memset_implementations.h
new file mode 100644
index 0000000..e34b13a
--- /dev/null
+++ b/src/string/memory_utils/memset_implementations.h
@@ -0,0 +1,135 @@
+//===-- Implementation of memset and bzero --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H
+
+#include "src/__support/architectures.h"
+#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/utils.h"
+
+#include <stddef.h> // size_t
+
+namespace __llvm_libc {
+
+// A general purpose implementation assuming cheap unaligned writes for sizes:
+// 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architectures can't store 32
+// or 64 Bytes at a time, the compiler will expand them as needed.
+//
+// This implementation is subject to change as we benchmark more processors. We
+// may also want to customize it for processors with specialized instructions
+// that perform better (e.g. `rep stosb`).
+//
+// A note on the apparent discrepancy in the use of 32 vs 64 Bytes writes.
+// We want to balance two things here:
+//  - The number of redundant writes (when using `SetBlockOverlap`),
+//  - The number of conditionals for sizes <=128 (~90% of memset calls are for
+//    such sizes).
+//
+// For the range 64-128:
+//  - SetBlockOverlap<64> uses no conditionals but always writes 128 Bytes; this
+//  is wasteful near 65 but efficient toward 128.
+//  - SetAlignedBlocks<32> would consume between 3 and 4 conditionals and write
+//  96 or 128 Bytes.
+//  - Another approach could be to use a hybrid approach Copy<64>+Overlap<32>
+//  for 65-96 and Copy<96>+Overlap<32> for 97-128
+//
+// Benchmarks showed that redundant writes were cheap (for Intel X86) but
+// conditionals were expensive, even on processors that do not support writing 64B
+// at a time (pre-AVX512F). We also want to favor short functions that allow
+// more hot code to fit in the iL1 cache.
+//
+// Above 128 we have to use conditionals since we don't know the upper bound in
+// advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32>
+// may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not
+// superior for sizes that mattered.
+inline static void inline_memset(char *dst, unsigned char value, size_t count) {
+#if defined(LLVM_LIBC_ARCH_X86)
+  /////////////////////////////////////////////////////////////////////////////
+  // LLVM_LIBC_ARCH_X86
+  /////////////////////////////////////////////////////////////////////////////
+  using namespace __llvm_libc::x86;
+  if (count == 0)
+    return;
+  if (count == 1)
+    return SplatSet<_1>(dst, value);
+  if (count == 2)
+    return SplatSet<_2>(dst, value);
+  if (count == 3)
+    return SplatSet<_3>(dst, value);
+  if (count <= 8)
+    return SplatSet<HeadTail<_4>>(dst, value, count);
+  if (count <= 16)
+    return SplatSet<HeadTail<_8>>(dst, value, count);
+  if (count <= 32)
+    return SplatSet<HeadTail<_16>>(dst, value, count);
+  if (count <= 64)
+    return SplatSet<HeadTail<_32>>(dst, value, count);
+  if (count <= 128)
+    return SplatSet<HeadTail<_64>>(dst, value, count);
+  return SplatSet<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+  /////////////////////////////////////////////////////////////////////////////
+  // LLVM_LIBC_ARCH_AARCH64
+  /////////////////////////////////////////////////////////////////////////////
+  using namespace __llvm_libc::aarch64_memset;
+  if (count == 0)
+    return;
+  if (count <= 3) {
+    SplatSet<_1>(dst, value);
+    if (count > 1)
+      SplatSet<Tail<_2>>(dst, value, count);
+    return;
+  }
+  if (count <= 8)
+    return SplatSet<HeadTail<_4>>(dst, value, count);
+  if (count <= 16)
+    return SplatSet<HeadTail<_8>>(dst, value, count);
+  if (count <= 32)
+    return SplatSet<HeadTail<_16>>(dst, value, count);
+  if (count <= 96) {
+    SplatSet<_32>(dst, value);
+    if (count <= 64)
+      return SplatSet<Tail<_32>>(dst, value, count);
+    SplatSet<Skip<32>::Then<_32>>(dst, value);
+    SplatSet<Tail<_32>>(dst, value, count);
+    return;
+  }
+  if (count < 448 || value != 0 || !AArch64ZVA(dst, count))
+    return SplatSet<Align<_16, Arg::_1>::Then<Loop<_64>>>(dst, value, count);
+#else
+  /////////////////////////////////////////////////////////////////////////////
+  // Default
+  /////////////////////////////////////////////////////////////////////////////
+  using namespace ::__llvm_libc::scalar;
+
+  if (count == 0)
+    return;
+  if (count == 1)
+    return SplatSet<_1>(dst, value);
+  if (count == 2)
+    return SplatSet<_2>(dst, value);
+  if (count == 3)
+    return SplatSet<_3>(dst, value);
+  if (count <= 8)
+    return SplatSet<HeadTail<_4>>(dst, value, count);
+  if (count <= 16)
+    return SplatSet<HeadTail<_8>>(dst, value, count);
+  if (count <= 32)
+    return SplatSet<HeadTail<_16>>(dst, value, count);
+  if (count <= 64)
+    return SplatSet<HeadTail<_32>>(dst, value, count);
+  if (count <= 128)
+    return SplatSet<HeadTail<_64>>(dst, value, count);
+  return SplatSet<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
+#endif
+}
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H
diff --git a/src/string/memory_utils/memset_utils.h b/src/string/memory_utils/memset_utils.h
deleted file mode 100644
index 666d649..0000000
--- a/src/string/memory_utils/memset_utils.h
+++ /dev/null
@@ -1,82 +0,0 @@
-//===-- Memset utils --------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H
-#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H
-
-#include "src/__support/architectures.h"
-#include "src/string/memory_utils/elements.h"
-#include "src/string/memory_utils/utils.h"
-
-#include <stddef.h> // size_t
-
-namespace __llvm_libc {
-
-// A general purpose implementation assuming cheap unaligned writes for sizes:
-// 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architecture can't store 32
-// or 64 Bytes at a time, the compiler will expand them as needed.
-//
-// This implementation is subject to change as we benchmark more processors. We
-// may also want to customize it for processors with specialized instructions
-// that performs better (e.g. `rep stosb`).
-//
-// A note on the apparent discrepancy in the use of 32 vs 64 Bytes writes.
-// We want to balance two things here:
-//  - The number of redundant writes (when using `SetBlockOverlap`),
-//  - The number of conditionals for sizes <=128 (~90% of memset calls are for
-//    such sizes).
-//
-// For the range 64-128:
-//  - SetBlockOverlap<64> uses no conditionals but always writes 128 Bytes this
-//  is wasteful near 65 but efficient toward 128.
-//  - SetAlignedBlocks<32> would consume between 3 and 4 conditionals and write
-//  96 or 128 Bytes.
-//  - Another approach could be to use an hybrid approach Copy<64>+Overlap<32>
-//  for 65-96 and Copy<96>+Overlap<32> for 97-128
-//
-// Benchmarks showed that redundant writes were cheap (for Intel X86) but
-// conditional were expensive, even on processor that do not support writing 64B
-// at a time (pre-AVX512F). We also want to favor short functions that allow
-// more hot code to fit in the iL1 cache.
-//
-// Above 128 we have to use conditionals since we don't know the upper bound in
-// advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32>
-// may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not
-// superior for sizes that mattered.
-inline static void GeneralPurposeMemset(char *dst, unsigned char value,
-                                        size_t count) {
-#if defined(LLVM_LIBC_ARCH_X86)
-  using namespace ::__llvm_libc::x86;
-#else
-  using namespace ::__llvm_libc::scalar;
-#endif
-
-  if (count == 0)
-    return;
-  if (count == 1)
-    return SplatSet<_1>(dst, value);
-  if (count == 2)
-    return SplatSet<_2>(dst, value);
-  if (count == 3)
-    return SplatSet<_3>(dst, value);
-  if (count <= 8)
-    return SplatSet<HeadTail<_4>>(dst, value, count);
-  if (count <= 16)
-    return SplatSet<HeadTail<_8>>(dst, value, count);
-  if (count <= 32)
-    return SplatSet<HeadTail<_16>>(dst, value, count);
-  if (count <= 64)
-    return SplatSet<HeadTail<_32>>(dst, value, count);
-  if (count <= 128)
-    return SplatSet<HeadTail<_64>>(dst, value, count);
-  return SplatSet<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
-}
-
-} // namespace __llvm_libc
-
-#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H
diff --git a/src/string/memset.cpp b/src/string/memset.cpp
index 945aeda..549c074 100644
--- a/src/string/memset.cpp
+++ b/src/string/memset.cpp
@@ -8,13 +8,13 @@
 
 #include "src/string/memset.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/memset_utils.h"
+#include "src/string/memory_utils/memset_implementations.h"
 
 namespace __llvm_libc {
 
 LLVM_LIBC_FUNCTION(void *, memset, (void *dst, int value, size_t count)) {
-  GeneralPurposeMemset(reinterpret_cast<char *>(dst),
-                       static_cast<unsigned char>(value), count);
+  inline_memset(reinterpret_cast<char *>(dst),
+                static_cast<unsigned char>(value), count);
   return dst;
 }