| //===-- x86 implementation of memory function building blocks -------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file provides x86 specific building blocks to compose memory functions. |
| // |
| //===----------------------------------------------------------------------===// |
| #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H |
| #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H |
| |
| #include "src/__support/macros/properties/architectures.h" |
| |
| #if defined(LIBC_TARGET_ARCH_IS_X86_64) |
| |
| #include "src/__support/common.h" |
| #include "src/string/memory_utils/op_builtin.h" |
| #include "src/string/memory_utils/op_generic.h" |
| |
| #if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__AVX2__) || \ |
| defined(__SSE2__) |
| #include <immintrin.h> |
| #endif |
| |
| // Define fake functions to prevent the compiler from failing on undefined |
| // functions in case the CPU extension is not present. |
| #if !defined(__AVX512BW__) && (defined(_MSC_VER) || defined(__SCE__)) |
| #define _mm512_cmpneq_epi8_mask(A, B) 0 |
| #endif |
| #if !defined(__AVX2__) && (defined(_MSC_VER) || defined(__SCE__)) |
| #define _mm256_movemask_epi8(A) 0 |
| #endif |
| #if !defined(__SSE2__) && (defined(_MSC_VER) || defined(__SCE__)) |
| #define _mm_movemask_epi8(A) 0 |
| #endif |
| |
| namespace __llvm_libc::x86 { |
| |
// A set of constants to check compile time features.
// Each flag is true iff the corresponding ISA-extension macro was defined by
// the compiler for this translation unit; they let dispatch code use
// `if constexpr` / template logic instead of scattering preprocessor checks.
static inline constexpr bool kSse2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
static inline constexpr bool kAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
static inline constexpr bool kAvx2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
static inline constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
| |
| /////////////////////////////////////////////////////////////////////////////// |
| // Memcpy repmovsb implementation |
struct Memcpy {
  // Copies `count` bytes from `src` to `dst` using the x86 string-move
  // instruction. The "+D"/"+S"/"+c" read-write constraints pin dst, src and
  // count to rdi, rsi and rcx as `rep movsb` requires, and mark all three as
  // clobbered; the "memory" clobber prevents the compiler from reordering
  // other memory accesses across the copy. Assumes a forward copy, i.e. the
  // direction flag is clear (guaranteed by the ABI at function entry).
  static void repmovsb(void *dst, const void *src, size_t count) {
    asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
  }
};
| |
| /////////////////////////////////////////////////////////////////////////////// |
| // Bcmp |
| |
| // Base implementation for the Bcmp specializations. |
| // - BlockSize is either 16, 32 or 64 depending on the available compile time |
| // features, it is used to switch between "single native operation" or a |
| // "sequence of native operations". |
| // - BlockBcmp is the function that implements the bcmp logic. |
| template <size_t Size, size_t BlockSize, auto BlockBcmp> struct BcmpImpl { |
| static constexpr size_t SIZE = Size; |
| LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) { |
| if constexpr (Size == BlockSize) { |
| return BlockBcmp(p1, p2); |
| } else if constexpr (Size % BlockSize == 0) { |
| for (size_t offset = 0; offset < Size; offset += BlockSize) |
| if (auto value = BlockBcmp(p1 + offset, p2 + offset)) |
| return value; |
| } else { |
| deferred_static_assert("SIZE not implemented"); |
| } |
| return BcmpReturnType::ZERO(); |
| } |
| |
| LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { |
| return block(p1 + count - Size, p2 + count - Size); |
| } |
| |
| LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { |
| return block(p1, p2) | tail(p1, p2, count); |
| } |
| |
| LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, |
| size_t count) { |
| static_assert(Size > 1, "a loop of size 1 does not need tail"); |
| size_t offset = 0; |
| do { |
| if (auto value = block(p1 + offset, p2 + offset)) |
| return value; |
| offset += Size; |
| } while (offset < count - Size); |
| return tail(p1, p2, count); |
| } |
| }; |
| |
| namespace sse2 { |
| LIBC_INLINE BcmpReturnType bcmp16(CPtr p1, CPtr p2) { |
| #if defined(__SSE2__) |
| using T = char __attribute__((__vector_size__(16))); |
| // A mask indicating which bytes differ after loading 16 bytes from p1 and p2. |
| const int mask = |
| _mm_movemask_epi8(cpp::bit_cast<__m128i>(load<T>(p1) != load<T>(p2))); |
| return static_cast<uint32_t>(mask); |
| #else |
| (void)p1; |
| (void)p2; |
| return BcmpReturnType::ZERO(); |
| #endif // defined(__SSE2__) |
| } |
| template <size_t Size> using Bcmp = BcmpImpl<Size, 16, bcmp16>; |
| } // namespace sse2 |
| |
| namespace avx2 { |
| LIBC_INLINE BcmpReturnType bcmp32(CPtr p1, CPtr p2) { |
| #if defined(__AVX2__) |
| using T = char __attribute__((__vector_size__(32))); |
| // A mask indicating which bytes differ after loading 32 bytes from p1 and p2. |
| const int mask = |
| _mm256_movemask_epi8(cpp::bit_cast<__m256i>(load<T>(p1) != load<T>(p2))); |
| // _mm256_movemask_epi8 returns an int but it is to be interpreted as a 32-bit |
| // mask. |
| return static_cast<uint32_t>(mask); |
| #else |
| (void)p1; |
| (void)p2; |
| return BcmpReturnType::ZERO(); |
| #endif // defined(__AVX2__) |
| } |
| template <size_t Size> using Bcmp = BcmpImpl<Size, 32, bcmp32>; |
| } // namespace avx2 |
| |
| namespace avx512bw { |
| LIBC_INLINE BcmpReturnType bcmp64(CPtr p1, CPtr p2) { |
| #if defined(__AVX512BW__) |
| using T = char __attribute__((__vector_size__(64))); |
| // A mask indicating which bytes differ after loading 64 bytes from p1 and p2. |
| const uint64_t mask = _mm512_cmpneq_epi8_mask( |
| cpp::bit_cast<__m512i>(load<T>(p1)), cpp::bit_cast<__m512i>(load<T>(p2))); |
| const bool mask_is_set = mask != 0; |
| return static_cast<uint32_t>(mask_is_set); |
| #else |
| (void)p1; |
| (void)p2; |
| return BcmpReturnType::ZERO(); |
| #endif // defined(__AVX512BW__) |
| } |
| template <size_t Size> using Bcmp = BcmpImpl<Size, 64, bcmp64>; |
| } // namespace avx512bw |
| |
| // Assuming that the mask is non zero, the index of the first mismatching byte |
| // is the number of trailing zeros in the mask. Trailing zeros and not leading |
| // zeros because the x86 architecture is little endian. |
| LIBC_INLINE MemcmpReturnType char_diff_no_zero(CPtr p1, CPtr p2, |
| uint64_t mask) { |
| const size_t diff_index = __builtin_ctzll(mask); |
| const int16_t ca = cpp::to_integer<uint8_t>(p1[diff_index]); |
| const int16_t cb = cpp::to_integer<uint8_t>(p2[diff_index]); |
| return ca - cb; |
| } |
| |
| /////////////////////////////////////////////////////////////////////////////// |
| // Memcmp |
| |
| // Base implementation for the Memcmp specializations. |
| // - BlockSize is either 16, 32 or 64 depending on the available compile time |
| // features, it is used to switch between "single native operation" or a |
| // "sequence of native operations". |
| // - BlockMemcmp is the function that implements the memcmp logic. |
| // - BlockBcmp is the function that implements the bcmp logic. |
| template <size_t Size, size_t BlockSize, auto BlockMemcmp, auto BlockBcmp> |
| struct MemcmpImpl { |
| static constexpr size_t SIZE = Size; |
| LIBC_INLINE static MemcmpReturnType block(CPtr p1, CPtr p2) { |
| if constexpr (Size == BlockSize) { |
| return BlockMemcmp(p1, p2); |
| } else if constexpr (Size % BlockSize == 0) { |
| for (size_t offset = 0; offset < Size; offset += BlockSize) |
| if (auto value = BlockBcmp(p1 + offset, p2 + offset)) |
| return BlockMemcmp(p1 + offset, p2 + offset); |
| } else { |
| deferred_static_assert("SIZE not implemented"); |
| } |
| return MemcmpReturnType::ZERO(); |
| } |
| |
| LIBC_INLINE static MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { |
| return block(p1 + count - Size, p2 + count - Size); |
| } |
| |
| LIBC_INLINE static MemcmpReturnType head_tail(CPtr p1, CPtr p2, |
| size_t count) { |
| if (auto value = block(p1, p2)) |
| return value; |
| return tail(p1, p2, count); |
| } |
| |
| LIBC_INLINE static MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, |
| size_t count) { |
| static_assert(Size > 1, "a loop of size 1 does not need tail"); |
| size_t offset = 0; |
| do { |
| if (auto value = block(p1 + offset, p2 + offset)) |
| return value; |
| offset += Size; |
| } while (offset < count - Size); |
| return tail(p1, p2, count); |
| } |
| }; |
| |
| namespace sse2 { |
| LIBC_INLINE MemcmpReturnType memcmp16(CPtr p1, CPtr p2) { |
| #if defined(__SSE2__) |
| using T = char __attribute__((__vector_size__(16))); |
| // A mask indicating which bytes differ after loading 16 bytes from p1 and p2. |
| if (int mask = |
| _mm_movemask_epi8(cpp::bit_cast<__m128i>(load<T>(p1) != load<T>(p2)))) |
| return char_diff_no_zero(p1, p2, mask); |
| return MemcmpReturnType::ZERO(); |
| #else |
| (void)p1; |
| (void)p2; |
| return MemcmpReturnType::ZERO(); |
| #endif // defined(__SSE2__) |
| } |
| template <size_t Size> using Memcmp = MemcmpImpl<Size, 16, memcmp16, bcmp16>; |
| } // namespace sse2 |
| |
| namespace avx2 { |
| LIBC_INLINE MemcmpReturnType memcmp32(CPtr p1, CPtr p2) { |
| #if defined(__AVX2__) |
| using T = char __attribute__((__vector_size__(32))); |
| // A mask indicating which bytes differ after loading 32 bytes from p1 and p2. |
| if (int mask = _mm256_movemask_epi8( |
| cpp::bit_cast<__m256i>(load<T>(p1) != load<T>(p2)))) |
| return char_diff_no_zero(p1, p2, mask); |
| return MemcmpReturnType::ZERO(); |
| #else |
| (void)p1; |
| (void)p2; |
| return MemcmpReturnType::ZERO(); |
| #endif // defined(__AVX2__) |
| } |
| template <size_t Size> using Memcmp = MemcmpImpl<Size, 32, memcmp32, bcmp32>; |
| } // namespace avx2 |
| |
| namespace avx512bw { |
| LIBC_INLINE MemcmpReturnType memcmp64(CPtr p1, CPtr p2) { |
| #if defined(__AVX512BW__) |
| using T = char __attribute__((__vector_size__(64))); |
| // A mask indicating which bytes differ after loading 64 bytes from p1 and p2. |
| if (uint64_t mask = |
| _mm512_cmpneq_epi8_mask(cpp::bit_cast<__m512i>(load<T>(p1)), |
| cpp::bit_cast<__m512i>(load<T>(p2)))) |
| return char_diff_no_zero(p1, p2, mask); |
| return MemcmpReturnType::ZERO(); |
| #else |
| (void)p1; |
| (void)p2; |
| return MemcmpReturnType::ZERO(); |
| #endif // defined(__AVX512BW__) |
| } |
| template <size_t Size> using Memcmp = MemcmpImpl<Size, 64, memcmp64, bcmp64>; |
| } // namespace avx512bw |
| |
| } // namespace __llvm_libc::x86 |
| |
| #endif // LIBC_TARGET_ARCH_IS_X86_64 |
| |
| #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H |