// blob: cec8e969b123edd52913344909c07aaf42a9e144 [file] [log] [blame]
//===-- Memmove implementation for x86_64 -----------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMMOVE_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMMOVE_H
#include "src/__support/macros/attributes.h" // LIBC_INLINE
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"
#include "src/string/memory_utils/op_x86.h"
#include "src/string/memory_utils/utils.h"
#include <stddef.h> // size_t
namespace LIBC_NAMESPACE_DECL {
// Performs the move of `count` bytes from `src` to `dst` when `count` is at
// most 128. Returns true when the request was handled here (this includes the
// trivial `count == 0` case, for which nothing is done) and false, without
// issuing any move, when `count` is larger than 128.
LIBC_INLINE bool inline_memmove_small_size_x86(Ptr dst, CPtr src,
                                               size_t count) {
  // Select the 128/256/512-bit block types from the widest vector extension
  // enabled at compile time. Widths that are not available as a single vector
  // are expressed as arrays of the widest available element.
#if defined(__AVX512F__)
  constexpr size_t native_bytes = 64;
  using block128_t = generic_v128;
  using block256_t = generic_v256;
  using block512_t = generic_v512;
#elif defined(__AVX__)
  constexpr size_t native_bytes = 32;
  using block128_t = generic_v128;
  using block256_t = generic_v256;
  using block512_t = cpp::array<generic_v256, 2>;
#elif defined(__SSE2__)
  constexpr size_t native_bytes = 16;
  using block128_t = generic_v128;
  using block256_t = cpp::array<generic_v128, 2>;
  using block512_t = cpp::array<generic_v128, 4>;
#else
  constexpr size_t native_bytes = 8;
  using block128_t = cpp::array<uint64_t, 2>;
  using block256_t = cpp::array<uint64_t, 4>;
  using block512_t = cpp::array<uint64_t, 8>;
#endif

  // A power-of-two count can be served as a head-tail pair of either the
  // smaller block type (head and tail do not overlap) or the larger one (head
  // and tail collapse onto each other). The larger type is preferred only
  // when it is natively supported (e.g. two collapsed 32-byte moves for
  // count=64 with AVX2); otherwise it would cost more instructions and leave
  // fewer sizes to the earlier branches. The inclusive limits below encode
  // that choice: the boundary value stays with the smaller type unless the
  // next type is native.
  constexpr size_t u64_limit = native_bytes >= 16 ? 15 : 16;
  constexpr size_t u128_limit = native_bytes >= 32 ? 31 : 32;
  constexpr size_t u256_limit = native_bytes >= 64 ? 63 : 64;

  if (count == 0) {
    // Nothing to move.
  } else if (count == 1) {
    generic::Memmove<uint8_t>::block(dst, src);
  } else if (count == 2) {
    generic::Memmove<uint16_t>::block(dst, src);
  } else if (count == 3) {
    generic::Memmove<cpp::array<uint8_t, 3>>::block(dst, src);
  } else if (count == 4) {
    generic::Memmove<uint32_t>::block(dst, src);
  } else if (count < 8) {
    // 5..7 bytes.
    generic::Memmove<uint32_t>::head_tail(dst, src, count);
  } else if (count <= u64_limit) {
    generic::Memmove<uint64_t>::head_tail(dst, src, count);
  } else if (count <= u128_limit) {
    generic::Memmove<block128_t>::head_tail(dst, src, count);
  } else if (count <= u256_limit) {
    generic::Memmove<block256_t>::head_tail(dst, src, count);
  } else if (count <= 128) {
    generic::Memmove<block512_t>::head_tail(dst, src, count);
  } else {
    // Too large for the small-size path.
    return false;
  }
  return true;
}
// Moves `count` bytes from `src` to `dst` in two steps: an alignment step on
// the source pointer using the 256-bit block type, followed by a
// loop-and-tail step using the 512-bit block type. The forward variants are
// used when `dst` is below `src`, the backward variants otherwise.
LIBC_INLINE void inline_memmove_follow_up_x86(Ptr dst, CPtr src, size_t count) {
  // Block types follow the widest vector extension enabled at compile time.
#if defined(__AVX512F__)
  using align_block_t = generic_v256;
  using loop_block_t = generic_v512;
#elif defined(__AVX__)
  using align_block_t = generic_v256;
  using loop_block_t = cpp::array<generic_v256, 2>;
#elif defined(__SSE2__)
  using align_block_t = cpp::array<generic_v128, 2>;
  using loop_block_t = cpp::array<generic_v128, 4>;
#else
  using align_block_t = cpp::array<uint64_t, 4>;
  using loop_block_t = cpp::array<uint64_t, 8>;
#endif

  const bool move_forward = dst < src;
  if (!move_forward) {
    // Destination is at or above the source: go backward.
    generic::Memmove<align_block_t>::align_backward<Arg::Src>(dst, src, count);
    generic::Memmove<loop_block_t>::loop_and_tail_backward(dst, src, count);
    return;
  }

  // Destination is below the source: go forward.
  generic::Memmove<align_block_t>::align_forward<Arg::Src>(dst, src, count);
  generic::Memmove<loop_block_t>::loop_and_tail_forward(dst, src, count);
}
} // namespace LIBC_NAMESPACE_DECL
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMMOVE_H