| //===-- Memcpy implementation for arm ---------------------------*- C++ -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
// The code sizes quoted for the functions defined in this file are
// approximate. These sizes assume the following configuration options:
| // - LIBC_CONF_KEEP_FRAME_POINTER = false |
| // - LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR = false |
| // - LIBC_ADD_NULL_CHECKS = false |
| #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H |
| #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H |
| |
| #include "src/__support/CPP/type_traits.h" // always_false |
| #include "src/__support/macros/attributes.h" // LIBC_INLINE |
| #include "src/__support/macros/optimization.h" // LIBC_LOOP_NOUNROLL |
| #include "src/string/memory_utils/arm/common.h" // LIBC_ATTR_LIKELY, LIBC_ATTR_UNLIKELY |
| #include "src/string/memory_utils/utils.h" // memcpy_inline, distance_to_align |
| |
| #include <stddef.h> // size_t |
| |
| namespace LIBC_NAMESPACE_DECL { |
| |
| namespace { |
| |
// Performs a copy of `bytes` bytes from `src` to `dst`. This function has the
| // semantics of `memcpy` where `src` and `dst` are `__restrict`. The compiler is |
| // free to use whatever instruction is best for the size and assumed access. |
| template <size_t bytes, AssumeAccess access> |
| LIBC_INLINE void copy(void *dst, const void *src) { |
| if constexpr (access == AssumeAccess::kAligned) { |
| constexpr size_t alignment = bytes > kWordSize ? kWordSize : bytes; |
| memcpy_inline<bytes>(assume_aligned<alignment>(dst), |
| assume_aligned<alignment>(src)); |
| } else if constexpr (access == AssumeAccess::kUnknown) { |
| memcpy_inline<bytes>(dst, src); |
| } else { |
| static_assert(cpp::always_false<decltype(access)>, "Invalid AssumeAccess"); |
| } |
| } |
| |
| template <size_t bytes, BlockOp block_op = BlockOp::kFull, |
| AssumeAccess access = AssumeAccess::kUnknown> |
| LIBC_INLINE void copy_block_and_bump_pointers(Ptr &dst, CPtr &src) { |
| if constexpr (block_op == BlockOp::kFull) { |
| copy<bytes, access>(dst, src); |
| } else if constexpr (block_op == BlockOp::kByWord) { |
// We restrict loads/stores to 4 bytes to prevent the use of load/store
| // multiple (LDM, STM) and load/store double (LDRD, STRD). |
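// For example (illustrative), an 8-byte block is emitted as two independent
// word copies
//   ldr r3, [r1]      / str r3, [r0]
//   ldr r3, [r1, #4]  / str r3, [r0, #4]
// rather than a single `ldrd r2, r3, [r1]` / `strd r2, r3, [r0]` pair.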
| static_assert((bytes % kWordSize == 0) && (bytes >= kWordSize)); |
| LIBC_LOOP_UNROLL |
| for (size_t offset = 0; offset < bytes; offset += kWordSize) { |
| copy<kWordSize, access>(dst + offset, src + offset); |
| } |
| } else { |
| static_assert(cpp::always_false<decltype(block_op)>, "Invalid BlockOp"); |
| } |
// For 1-, 2-, and 4-byte copies, the compiler can fold the pointer bump into
// the load/store instructions via post-indexed addressing.
| // e.g., |
| // ldrb r3, [r1], #1 |
| // strb r3, [r0], #1 |
| dst += bytes; |
| src += bytes; |
| } |
| |
| template <size_t bytes, BlockOp block_op, AssumeAccess access> |
| LIBC_INLINE void consume_by_block(Ptr &dst, CPtr &src, size_t &size) { |
| LIBC_LOOP_NOUNROLL |
| for (size_t i = 0; i < size / bytes; ++i) |
| copy_block_and_bump_pointers<bytes, block_op, access>(dst, src); |
| size %= bytes; |
| } |
| |
| [[maybe_unused]] LIBC_INLINE void |
| copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) { |
| LIBC_LOOP_NOUNROLL |
| for (size_t i = 0; i < size; ++i) |
| *dst++ = *src++; |
| } |
| |
| } // namespace |
| |
// Implementation for Cortex-M0, M0+, M1 cores that do not support unaligned
// loads/stores. It compiles down to 208 bytes when used through a `memcpy`
// function that also needs to return the `dst` pointer.
| // Note: |
| // - When `src` and `dst` are coaligned, we start by aligning them and perform |
| // bulk copies. We let the compiler know the pointers are aligned so it can |
//   use load/store multiple (LDM, STM). This significantly increases
//   throughput but it also requires more registers and push/pop instructions,
//   which impacts latency for small copies.
| // - When `src` and `dst` are misaligned, we align `dst` and recompose words |
| // using multiple aligned loads. `load_aligned` takes care of endianness |
| // issues. |
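// As a worked example, a coaligned copy of 100 bytes (after the alignment
// prologue) is consumed as one 64-byte block, two 16-byte blocks, and one
// word, leaving 0 trailing bytes for the byte-per-byte loop.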
| [[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_low_end(Ptr dst, CPtr src, |
| size_t size) { |
| if (size >= 8) { |
| if (const size_t offset = distance_to_align_up<kWordSize>(dst)) |
| LIBC_ATTR_UNLIKELY { |
| copy_bytes_and_bump_pointers(dst, src, offset); |
| size -= offset; |
| } |
| constexpr AssumeAccess kAligned = AssumeAccess::kAligned; |
| const auto src_alignment = distance_to_align_down<kWordSize>(src); |
| if (src_alignment == 0) |
| LIBC_ATTR_LIKELY { |
| // Both `src` and `dst` are now word-aligned. |
// We first copy blocks of 64 bytes; the compiler will emit 4 load/store
// multiple (LDM, STM) pairs, each moving 4 words. This requires more
// registers, so additional push/pop instructions are needed, but the
// speedup is worth it.
| consume_by_block<64, BlockOp::kFull, kAligned>(dst, src, size); |
// Then we copy 16-byte blocks, one word at a time.
| consume_by_block<16, BlockOp::kByWord, kAligned>(dst, src, size); |
// Finally, we copy the remaining words one at a time.
| consume_by_block<4, BlockOp::kByWord, kAligned>(dst, src, size); |
| } |
| else { |
| // `dst` is aligned but `src` is not. |
| LIBC_LOOP_NOUNROLL |
| while (size >= kWordSize) { |
| // Recompose word from multiple loads depending on the alignment. |
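// For example, with `src_alignment == 2` on a little-endian core, the word is
// rebuilt from two aligned 16-bit loads as `lo | (hi << 16)`; `load_aligned`
// abstracts away the endianness details.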
| const uint32_t value = |
| src_alignment == 2 |
| ? load_aligned<uint32_t, uint16_t, uint16_t>(src) |
| : load_aligned<uint32_t, uint8_t, uint16_t, uint8_t>(src); |
| copy<kWordSize, kAligned>(dst, &value); |
| dst += kWordSize; |
| src += kWordSize; |
| size -= kWordSize; |
| } |
| } |
// Up to 3 bytes may still need to be copied; they are handled by the slow
// byte-per-byte loop below.
| } |
| copy_bytes_and_bump_pointers(dst, src, size); |
| } |
| |
| // Implementation for Cortex-M3, M4, M7, M23, M33, M35P, M52 with hardware |
| // support for unaligned loads and stores. It compiles down to 272 bytes when |
// used through a `memcpy` function that also needs to return the `dst`
// pointer.
| [[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_mid_end(Ptr dst, CPtr src, |
| size_t size) { |
| if (misaligned(bitwise_or(src, dst))) |
| LIBC_ATTR_UNLIKELY { |
| if (size < 8) |
| LIBC_ATTR_UNLIKELY { |
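// Sizes below 8 are copied by bit decomposition: e.g., `size == 7`
// performs a 1-byte, then a 2-byte, then a 4-byte copy.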
| if (size & 1) |
| copy_block_and_bump_pointers<1>(dst, src); |
| if (size & 2) |
| copy_block_and_bump_pointers<2>(dst, src); |
| if (size & 4) |
| copy_block_and_bump_pointers<4>(dst, src); |
| return; |
| } |
| if (misaligned(src)) |
| LIBC_ATTR_UNLIKELY { |
| const size_t offset = distance_to_align_up<kWordSize>(dst); |
| if (offset & 1) |
| copy_block_and_bump_pointers<1>(dst, src); |
| if (offset & 2) |
| copy_block_and_bump_pointers<2>(dst, src); |
| size -= offset; |
| } |
| } |
// `dst` and `src` are not necessarily both aligned at this point but this
// implementation assumes hardware support for unaligned loads and stores so
// it is still fast to perform an unrolled word-by-word copy. Note that wider
// accesses through load/store multiple (LDM, STM) and load/store double
// (LDRD, STRD) instructions do not support unaligned addresses and can fault.
// By forcing the decomposition of the 64-byte copy into word-by-word copies,
// the compiler uses a load to prefetch the next cache line:
| // ldr r3, [r1, #64]! <- prefetch next cache line |
| // str r3, [r0] |
| // ldr r3, [r1, #0x4] |
| // str r3, [r0, #0x4] |
| // ... |
| // ldr r3, [r1, #0x3c] |
| // str r3, [r0, #0x3c] |
// This is a bit detrimental for sizes between 64 and 256 bytes (less than 10%
// penalty) but the prefetch yields better throughput for larger copies.
| constexpr AssumeAccess kUnknown = AssumeAccess::kUnknown; |
| consume_by_block<64, BlockOp::kByWord, kUnknown>(dst, src, size); |
| consume_by_block<16, BlockOp::kByWord, kUnknown>(dst, src, size); |
| consume_by_block<4, BlockOp::kByWord, kUnknown>(dst, src, size); |
| if (size & 1) |
| copy_block_and_bump_pointers<1>(dst, src); |
| if (size & 2) |
| copy_block_and_bump_pointers<2>(dst, src); |
| } |
| |
| [[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(Ptr dst, CPtr src, |
| size_t size) { |
| // The compiler performs alias analysis and is able to prove that `dst` and |
| // `src` do not alias by propagating the `__restrict` keyword from the |
| // `memcpy` prototype. This allows the compiler to merge consecutive |
| // load/store (LDR, STR) instructions generated in |
| // `copy_block_and_bump_pointers` with `BlockOp::kByWord` into load/store |
// double (LDRD, STRD) instructions. This is undesirable, so we prevent the
// compiler from inferring `__restrict` with the following line.
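// For example (illustrative), without this barrier two consecutive word
// copies could be fused into
//   ldrd r2, r3, [r1]
//   strd r2, r3, [r0]
// which can fault on unaligned addresses.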
| asm volatile("" : "+r"(dst), "+r"(src)); |
| #ifdef __ARM_FEATURE_UNALIGNED |
| return inline_memcpy_arm_mid_end(dst, src, size); |
| #else |
| return inline_memcpy_arm_low_end(dst, src, size); |
| #endif |
| } |
| |
| } // namespace LIBC_NAMESPACE_DECL |
| |
| #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H |