[libc] Allow customization of memcpy via flags.

 - Adds LLVM_LIBC_IS_DEFINED macro to libc/src/__support/common.h
 - Adds a few knobs to memcpy to help with experimentations:
   - LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB replaces the implementation with a single call to rep;movsb
 - LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE customizes the size threshold from which rep;movsb is used

Differential Revision: https://reviews.llvm.org/D94692

GitOrigin-RevId: a10300a2b27c426556f9266364337d5d546a3c14
diff --git a/src/__support/common.h b/src/__support/common.h
index 208c8bd..53a63fc 100644
--- a/src/__support/common.h
+++ b/src/__support/common.h
@@ -29,4 +29,27 @@
 #define LLVM_LIBC_FUNCTION(type, name, arglist) type name arglist
 #endif
 
+namespace __llvm_libc {
+namespace internal {
+constexpr bool same_string(char const *lhs, char const *rhs) {
+  for (; *lhs || *rhs; ++lhs, ++rhs)
+    if (*lhs != *rhs)
+      return false;
+  return true;
+}
+} // namespace internal
+} // namespace __llvm_libc
+
+// LLVM_LIBC_IS_DEFINED checks whether a particular macro is defined.
+// Usage: constexpr bool kUseAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
+//
+// This works by comparing the stringified version of the macro with and without
+// evaluation. If FOO is not defined both stringifications yield "FOO". If FOO
+// is defined, one stringification yields "FOO" while the other yields its
+// stringified value "1".
+#define LLVM_LIBC_IS_DEFINED(macro)                                            \
+  !__llvm_libc::internal::same_string(                                         \
+      LLVM_LIBC_IS_DEFINED__EVAL_AND_STRINGIZE(macro), #macro)
+#define LLVM_LIBC_IS_DEFINED__EVAL_AND_STRINGIZE(s) #s
+
 #endif // LLVM_LIBC_SUPPORT_COMMON_H
diff --git a/src/string/x86/memcpy.cpp b/src/string/x86/memcpy.cpp
index 7c5740b..b9163d9 100644
--- a/src/string/x86/memcpy.cpp
+++ b/src/string/x86/memcpy.cpp
@@ -12,6 +12,26 @@
 
 namespace __llvm_libc {
 
+// Whether to use only rep;movsb.
+constexpr bool kUseOnlyRepMovsb =
+    LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);
+
+// kRepMovsBSize == -1 : Only CopyAligned is used.
+// kRepMovsBSize ==  0 : Only RepMovsb is used.
+// else CopyAligned is used up to kRepMovsBSize and then RepMovsb.
+constexpr size_t kRepMovsBSize =
+#ifdef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
+    LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
+#else
+    -1;
+#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
+
+// Whether target supports AVX instructions.
+constexpr bool kHasAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
+
+// The chunk size used for the loop copy strategy.
+constexpr size_t kLoopCopyBlockSize = kHasAvx ? 64 : 32;
+
 static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
                          size_t count) {
   // FIXME: Add MSVC support with
@@ -21,12 +41,6 @@
   asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
 }
 
-#if defined(__AVX__)
-#define BEST_SIZE 64
-#else
-#define BEST_SIZE 32
-#endif
-
 // Design rationale
 // ================
 //
@@ -47,6 +61,9 @@
 //   with little change on the code side.
 static void memcpy_x86(char *__restrict dst, const char *__restrict src,
                        size_t count) {
+  if (kUseOnlyRepMovsb)
+    return CopyRepMovsb(dst, src, count);
+
   if (count == 0)
     return;
   if (count == 1)
@@ -67,16 +84,10 @@
     return CopyBlockOverlap<32>(dst, src, count);
   if (count < 128)
     return CopyBlockOverlap<64>(dst, src, count);
-#if defined(__AVX__)
-  if (count < 256)
+  if (kHasAvx && count < 256)
     return CopyBlockOverlap<128>(dst, src, count);
-#endif
-  // kRepMovsBSize == -1 : Only CopyAligned is used.
-  // kRepMovsBSize ==  0 : Only RepMovsb is used.
-  // else CopyAligned is used to to kRepMovsBSize and then RepMovsb.
-  constexpr size_t kRepMovsBSize = -1;
   if (count <= kRepMovsBSize)
-    return CopyAlignedBlocks<BEST_SIZE>(dst, src, count);
+    return CopyAlignedBlocks<kLoopCopyBlockSize>(dst, src, count);
   return CopyRepMovsb(dst, src, count);
 }