[LIBC] Add optimized memcpy routine for AArch64

This patch adds an optimized memcpy routine for AArch64, tuned and
benchmarked on Neoverse-N1.
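
The routine dispatches on size: counts of 0-4 bytes are handled with
fixed-size block copies, counts of 5-127 with two potentially
overlapping fixed-size copies, and counts of 128 and above with an
aligned block loop. As a minimal sketch of the overlap trick, reusing
the CopyBlock helper from memcpy_utils (copy_overlap_8 itself is
illustrative, not part of the patch):

    // Copies count bytes for count in [8, 16): the two 8-byte copies
    // overlap in the middle and together cover the whole range.
    static void copy_overlap_8(char *__restrict dst,
                               const char *__restrict src, size_t count) {
      CopyBlock<8>(dst, src);                         // first 8 bytes
      CopyBlock<8>(dst + count - 8, src + count - 8); // last 8 bytes
    }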

Differential Revision: https://reviews.llvm.org/D92235

GitOrigin-RevId: 369f7de3135a517a69c45084d4b175f7b0d5e6f5
diff --git a/src/string/CMakeLists.txt b/src/string/CMakeLists.txt
index 19eab39..9dd8b6b 100644
--- a/src/string/CMakeLists.txt
+++ b/src/string/CMakeLists.txt
@@ -214,6 +214,11 @@
 if(${LIBC_TARGET_MACHINE} STREQUAL "x86_64")
   set(LIBC_STRING_TARGET_ARCH "x86")
   set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/x86/memcpy.cpp)
+elseif(${LIBC_TARGET_MACHINE} STREQUAL "aarch64")
+  set(LIBC_STRING_TARGET_ARCH "aarch64")
+  set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/aarch64/memcpy.cpp)
+  # Disable tail merging as it leads to lower performance.
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mllvm --tail-merge-threshold=0")
 else()
   set(LIBC_STRING_TARGET_ARCH ${LIBC_TARGET_MACHINE})
   set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/memcpy.cpp)
diff --git a/src/string/aarch64/CMakeLists.txt b/src/string/aarch64/CMakeLists.txt
new file mode 100644
index 0000000..8430891
--- /dev/null
+++ b/src/string/aarch64/CMakeLists.txt
@@ -0,0 +1 @@
+add_memcpy("memcpy_${LIBC_TARGET_MACHINE}")
diff --git a/src/string/aarch64/memcpy.cpp b/src/string/aarch64/memcpy.cpp
new file mode 100644
index 0000000..63ed5fd
--- /dev/null
+++ b/src/string/aarch64/memcpy.cpp
@@ -0,0 +1,77 @@
+//===-- Implementation of memcpy ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memcpy.h"
+#include "src/__support/common.h"
+#include "src/string/memory_utils/memcpy_utils.h"
+
+namespace __llvm_libc {
+
+// Design rationale
+// ================
+//
+// Using a profiler to observe size distributions for calls into libc
+// functions, it was found that most operations act on a small number of
+// bytes. This makes it important to favor small sizes.
+//
+// We use __builtin_expect to tell the compiler to favor lower sizes, as
+// this reduces the branching overhead where it would hurt most, in
+// proportion to the total cost of copying.
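+// For instance (illustrative only, not the exact shape of the code below):
+//   if (__builtin_expect(count < 16, 1))
+//     return CopyBlockOverlap<8>(dst, src, count);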
+//
+// The function is written in C++ for several reasons:
+// - The compiler can __see__ the code, which is useful when performing
+//   Profile Guided Optimization, as the optimized code can take advantage
+//   of branching probabilities.
+// - It also allows for easier customization and makes it easy to test
+//   multiple implementation parameters.
+// - As compilers and processors improve, the generated code improves with
+//   little or no change on our side.
+// This implementation has been tuned for Neoverse-N1.
+static void memcpy_aarch64(char *__restrict dst, const char *__restrict src,
+                           size_t count) {
+  if (count == 0)
+    return;
+  if (count == 1)
+    return CopyBlock<1>(dst, src);
+  if (count == 2)
+    return CopyBlock<2>(dst, src);
+  if (count == 3)
+    return CopyBlock<3>(dst, src);
+  if (count == 4)
+    return CopyBlock<4>(dst, src);
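+  // CopyBlockOverlap<N> copies the first N and the last N bytes of the range;
+  // for counts in [N, 2 * N) the two copies overlap in the middle, covering
+  // the whole range with two fixed-size accesses and no loop.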
+  if (count < 8)
+    return CopyBlockOverlap<4>(dst, src, count);
+  if (count < 16)
+    return CopyBlockOverlap<8>(dst, src, count);
+  if (count < 32)
+    return CopyBlockOverlap<16>(dst, src, count);
+  if (count < 64)
+    return CopyBlockOverlap<32>(dst, src, count);
+  if (count < 128)
+    return CopyBlockOverlap<64>(dst, src, count);
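+  // CopyAlignedBlocks<64, 16> (see memcpy_utils.h) handles the bulk: it
+  // copies a first block, proceeds with destination-aligned 64-byte blocks,
+  // and finishes with a possibly overlapping copy of the last block.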
+  return CopyAlignedBlocks<64, 16>(dst, src, count);
+}
+
+LLVM_LIBC_FUNCTION(void *, memcpy,
+                   (void *__restrict dst, const void *__restrict src,
+                    size_t size)) {
+  memcpy_aarch64(reinterpret_cast<char *>(dst),
+                 reinterpret_cast<const char *>(src), size);
+  return dst;
+}
+
+} // namespace __llvm_libc