| //===-- Memcpy implementation for arm ---------------------------*- C++ -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
// The code sizes quoted for the functions defined in this file are
// approximate. These sizes assume the following configuration options:
| // - LIBC_CONF_KEEP_FRAME_POINTER = false |
| // - LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR = false |
| // - LIBC_ADD_NULL_CHECKS = false |
| #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H |
| #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H |
| |
| #include "src/__support/CPP/type_traits.h" // always_false |
| #include "src/__support/macros/attributes.h" // LIBC_INLINE |
| #include "src/__support/macros/optimization.h" // LIBC_LOOP_NOUNROLL |
| #include "src/string/memory_utils/arm/common.h" // LIBC_ATTR_LIKELY, LIBC_ATTR_UNLIKELY |
| #include "src/string/memory_utils/utils.h" // memcpy_inline, distance_to_align |
| |
| #include <stddef.h> // size_t |
| |
| namespace LIBC_NAMESPACE_DECL { |
| |
| namespace { |
| |
// Performs a copy of `bytes` bytes from `src` to `dst`. This function has the
| // semantics of `memcpy` where `src` and `dst` are `__restrict`. The compiler is |
| // free to use whatever instruction is best for the size and assumed access. |
| template <size_t bytes, AssumeAccess access> |
| LIBC_INLINE void copy(void *dst, const void *src) { |
| if constexpr (access == AssumeAccess::kAligned) { |
| constexpr size_t alignment = bytes > kWordSize ? kWordSize : bytes; |
| memcpy_inline<bytes>(assume_aligned<alignment>(dst), |
| assume_aligned<alignment>(src)); |
| } else if constexpr (access == AssumeAccess::kUnknown) { |
| memcpy_inline<bytes>(dst, src); |
| } else { |
| static_assert(cpp::always_false<decltype(access)>, "Invalid AssumeAccess"); |
| } |
| } |
| |
| template <size_t bytes, BlockOp block_op = BlockOp::kFull, |
| AssumeAccess access = AssumeAccess::kUnknown> |
| LIBC_INLINE void copy_block_and_bump_pointers(Ptr &dst, CPtr &src) { |
| if constexpr (block_op == BlockOp::kFull) { |
| copy<bytes, access>(dst, src); |
| } else if constexpr (block_op == BlockOp::kByWord) { |
// We restrict loads/stores to 4 bytes to prevent the use of load/store
| // multiple (LDM, STM) and load/store double (LDRD, STRD). |
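// For example (illustrative), an 8-byte block is emitted as two independent
// word copies
//   ldr r3, [r1]      / str r3, [r0]
//   ldr r3, [r1, #4]  / str r3, [r0, #4]
// rather than a single `ldrd r2, r3, [r1]` / `strd r2, r3, [r0]` pair.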
| static_assert((bytes % kWordSize == 0) && (bytes >= kWordSize)); |
| LIBC_LOOP_UNROLL |
| for (size_t offset = 0; offset < bytes; offset += kWordSize) { |
| copy<kWordSize, access>(dst + offset, src + offset); |
| } |
| } else { |
| static_assert(cpp::always_false<decltype(block_op)>, "Invalid BlockOp"); |
| } |
// For 1-, 2-, and 4-byte copies, the compiler can fold the pointer bump into
// the load/store instructions via post-indexed addressing.
| // e.g., |
| // ldrb r3, [r1], #1 |
| // strb r3, [r0], #1 |
| dst += bytes; |
| src += bytes; |
| } |
| |
| template <size_t bytes, BlockOp block_op, AssumeAccess access> |
| LIBC_INLINE void consume_by_block(Ptr &dst, CPtr &src, size_t &size) { |
| LIBC_LOOP_NOUNROLL |
| for (size_t i = 0; i < size / bytes; ++i) |
| copy_block_and_bump_pointers<bytes, block_op, access>(dst, src); |
| size %= bytes; |
| } |
| |
| [[maybe_unused]] LIBC_INLINE void |
| copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) { |
| LIBC_LOOP_NOUNROLL |
| for (size_t i = 0; i < size; ++i) |
| *dst++ = *src++; |
| } |
| |
| } // namespace |
| |
// Implementation for Cortex-M0, M0+, M1 cores that do not support unaligned
// loads/stores. It compiles down to 208 bytes when used through a `memcpy`
// function that also needs to return the `dst` pointer.
| // Note: |
| // - When `src` and `dst` are coaligned, we start by aligning them and perform |
| // bulk copies. We let the compiler know the pointers are aligned so it can |
//   use load/store multiple (LDM, STM). This significantly increases
//   throughput but it also requires more registers and push/pop instructions,
//   which impacts latency for small copies.
| // - When `src` and `dst` are misaligned, we align `dst` and recompose words |
| // using multiple aligned loads. `load_aligned` takes care of endianness |
| // issues. |
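// As a worked example, a coaligned copy of 100 bytes (after the alignment
// prologue) is consumed as one 64-byte block, two 16-byte blocks, and one
// word, leaving 0 trailing bytes for the byte-per-byte loop.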
| [[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_low_end(Ptr dst, CPtr src, |
| size_t size) { |
| if (size >= 8) { |
| if (const size_t offset = distance_to_align_up<kWordSize>(dst)) |
| LIBC_ATTR_UNLIKELY { |
| copy_bytes_and_bump_pointers(dst, src, offset); |
| size -= offset; |
| } |
| constexpr AssumeAccess kAligned = AssumeAccess::kAligned; |
| const auto src_alignment = distance_to_align_down<kWordSize>(src); |
| if (src_alignment == 0) |
| LIBC_ATTR_LIKELY { |
| // Both `src` and `dst` are now word-aligned. |
// We first copy blocks of 64 bytes; the compiler will emit 4 load/store
// multiple (LDM, STM) pairs, each moving 4 words. This requires more
// registers, so additional push/pop instructions are needed, but the
// speedup is worth it.
| consume_by_block<64, BlockOp::kFull, kAligned>(dst, src, size); |
// Then we copy 16-byte blocks, one word at a time.
| consume_by_block<16, BlockOp::kByWord, kAligned>(dst, src, size); |
// Finally, we copy the remaining words one at a time.
| consume_by_block<4, BlockOp::kByWord, kAligned>(dst, src, size); |
| } |
| else { |
| // `dst` is aligned but `src` is not. |
| LIBC_LOOP_NOUNROLL |
| while (size >= kWordSize) { |
| // Recompose word from multiple loads depending on the alignment. |
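// For example, with `src_alignment == 2` on a little-endian core, the word is
// rebuilt from two aligned 16-bit loads as `lo | (hi << 16)`; `load_aligned`
// abstracts away the endianness details.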
| const uint32_t value = |
| src_alignment == 2 |
| ? load_aligned<uint32_t, uint16_t, uint16_t>(src) |
| : load_aligned<uint32_t, uint8_t, uint16_t, uint8_t>(src); |
| copy<kWordSize, kAligned>(dst, &value); |
| dst += kWordSize; |
| src += kWordSize; |
| size -= kWordSize; |
| } |
| } |
// Up to 3 bytes may still need to be copied; they are handled by the slow
// byte-per-byte loop below.
| } |
| copy_bytes_and_bump_pointers(dst, src, size); |
| } |
| |
| // Implementation for Cortex-M3, M4, M7, M23, M33, M35P, M52 with hardware |
| // support for unaligned loads and stores. It compiles down to 272 bytes when |
// used through a `memcpy` function that also needs to return the `dst`
// pointer.
| [[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_mid_end(Ptr dst, CPtr src, |
| size_t size) { |
| if (misaligned(bitwise_or(src, dst))) |
| LIBC_ATTR_UNLIKELY { |
| if (size < 8) |
| LIBC_ATTR_UNLIKELY { |
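// Sizes below 8 are copied by bit decomposition: e.g., `size == 7`
// performs a 1-byte, then a 2-byte, then a 4-byte copy.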
| if (size & 1) |
| copy_block_and_bump_pointers<1>(dst, src); |
| if (size & 2) |
| copy_block_and_bump_pointers<2>(dst, src); |
| if (size & 4) |
| copy_block_and_bump_pointers<4>(dst, src); |
| return; |
| } |
| if (misaligned(src)) |
| LIBC_ATTR_UNLIKELY { |
| const size_t offset = distance_to_align_up<kWordSize>(dst); |
| if (offset & 1) |
| copy_block_and_bump_pointers<1>(dst, src); |
| if (offset & 2) |
| copy_block_and_bump_pointers<2>(dst, src); |
| size -= offset; |
| } |
| } |
// `dst` and `src` are not necessarily both aligned at this point but this
// implementation assumes hardware support for unaligned loads and stores so
// it is still fast to perform an unrolled word-by-word copy. Note that wider
// accesses through load/store multiple (LDM, STM) and load/store double
// (LDRD, STRD) instructions do not support unaligned addresses and can fault.
// By forcing the decomposition of the 64-byte copy into word-by-word copies,
// the compiler uses a load to prefetch the next cache line:
| // ldr r3, [r1, #64]! <- prefetch next cache line |
| // str r3, [r0] |
| // ldr r3, [r1, #0x4] |
| // str r3, [r0, #0x4] |
| // ... |
| // ldr r3, [r1, #0x3c] |
| // str r3, [r0, #0x3c] |
// This is a bit detrimental for sizes between 64 and 256 bytes (less than 10%
// penalty) but the prefetch yields better throughput for larger copies.
| constexpr AssumeAccess kUnknown = AssumeAccess::kUnknown; |
| consume_by_block<64, BlockOp::kByWord, kUnknown>(dst, src, size); |
| consume_by_block<16, BlockOp::kByWord, kUnknown>(dst, src, size); |
| consume_by_block<4, BlockOp::kByWord, kUnknown>(dst, src, size); |
| if (size & 1) |
| copy_block_and_bump_pointers<1>(dst, src); |
| if (size & 2) |
| copy_block_and_bump_pointers<2>(dst, src); |
| } |
| |
| [[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(Ptr dst, CPtr src, |
| size_t size) { |
| // The compiler performs alias analysis and is able to prove that `dst` and |
| // `src` do not alias by propagating the `__restrict` keyword from the |
| // `memcpy` prototype. This allows the compiler to merge consecutive |
| // load/store (LDR, STR) instructions generated in |
| // `copy_block_and_bump_pointers` with `BlockOp::kByWord` into load/store |
// double (LDRD, STRD) instructions. This is undesirable, so we prevent the
// compiler from inferring `__restrict` with the following line.
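// For example (illustrative), without this barrier two consecutive word
// copies could be fused into
//   ldrd r2, r3, [r1]
//   strd r2, r3, [r0]
// which can fault on unaligned addresses.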
| asm volatile("" : "+r"(dst), "+r"(src)); |
| #ifdef __ARM_FEATURE_UNALIGNED |
| return inline_memcpy_arm_mid_end(dst, src, size); |
| #else |
| return inline_memcpy_arm_low_end(dst, src, size); |
| #endif |
| } |
| |
| } // namespace LIBC_NAMESPACE_DECL |
| |
| #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H |