// Source blob: e34b13a872fcace6b154028f6e3b2e47a9ab116c
//===-- Implementation of memset and bzero --------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H
#include "src/__support/architectures.h"
#include "src/string/memory_utils/elements.h"
#include "src/string/memory_utils/utils.h"
#include <stddef.h> // size_t
namespace __llvm_libc {
// A general purpose implementation assuming cheap unaligned writes for sizes:
// 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architectures can't store 32
// or 64 Bytes at a time; the compiler will expand them as needed.
//
// This implementation is subject to change as we benchmark more processors. We
// may also want to customize it for processors with specialized instructions
// that perform better (e.g. `rep stosb`).
//
// A note on the apparent discrepancy in the use of 32 vs 64 Bytes writes.
// We want to balance two things here:
// - The number of redundant writes (when using `SetBlockOverlap`),
// - The number of conditionals for sizes <=128 (~90% of memset calls are for
// such sizes).
//
// For the range 64-128:
// - SetBlockOverlap<64> uses no conditionals but always writes 128 Bytes; this
// is wasteful near 65 but efficient toward 128.
// - SetAlignedBlocks<32> would consume between 3 and 4 conditionals and write
// 96 or 128 Bytes.
// - Another option would be a hybrid approach: Copy<64>+Overlap<32> for 65-96
// and Copy<96>+Overlap<32> for 97-128.
//
// Benchmarks showed that redundant writes were cheap (for Intel X86) but
// conditionals were expensive, even on processors that do not support writing
// 64B at a time (pre-AVX512F). We also want to favor short functions that allow
// more hot code to fit in the iL1 cache.
//
// Above 128 we have to use conditionals since we don't know the upper bound in
// advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32>
// may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not
// superior for sizes that mattered.
//
// inline_memset stores `value` into the `count` bytes starting at `dst`.
//
// Exactly one of three strategies is compiled in, selected by the
// LLVM_LIBC_ARCH_* macros: x86, AArch64, or a scalar fallback. Every strategy
// compares `count` against increasing size classes and hands the request to
// the matching `SplatSet` element; a `count` of zero never touches `dst`.
//
// NOTE(review): what each element type does (e.g. HeadTail<T> storing one T at
// each end of the range, possibly overlapping in the middle) is assumed from
// the names and the discussion above -- confirm against the element
// definitions (presumably elements.h).
inline static void inline_memset(char *dst, unsigned char value, size_t count) {
#if defined(LLVM_LIBC_ARCH_X86)
  /////////////////////////////////////////////////////////////////////////////
  // LLVM_LIBC_ARCH_X86
  /////////////////////////////////////////////////////////////////////////////
  using namespace __llvm_libc::x86;
  if (count == 0) {
    // Empty range: nothing to store.
  } else if (count == 1) {
    SplatSet<_1>(dst, value);
  } else if (count == 2) {
    SplatSet<_2>(dst, value);
  } else if (count == 3) {
    SplatSet<_3>(dst, value);
  } else if (count <= 8) {
    SplatSet<HeadTail<_4>>(dst, value, count);
  } else if (count <= 16) {
    SplatSet<HeadTail<_8>>(dst, value, count);
  } else if (count <= 32) {
    SplatSet<HeadTail<_16>>(dst, value, count);
  } else if (count <= 64) {
    SplatSet<HeadTail<_32>>(dst, value, count);
  } else if (count <= 128) {
    SplatSet<HeadTail<_64>>(dst, value, count);
  } else {
    // No upper bound on the size from here on: align on `dst`, then loop.
    SplatSet<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
  }
#elif defined(LLVM_LIBC_ARCH_AARCH64)
  /////////////////////////////////////////////////////////////////////////////
  // LLVM_LIBC_ARCH_AARCH64
  /////////////////////////////////////////////////////////////////////////////
  using namespace __llvm_libc::aarch64_memset;
  if (count == 0)
    return;
  if (count <= 3) {
    // 1..3 bytes: always store the first byte, then a 2-byte tail if needed.
    SplatSet<_1>(dst, value);
    if (count > 1)
      SplatSet<Tail<_2>>(dst, value, count);
  } else if (count <= 8) {
    SplatSet<HeadTail<_4>>(dst, value, count);
  } else if (count <= 16) {
    SplatSet<HeadTail<_8>>(dst, value, count);
  } else if (count <= 32) {
    SplatSet<HeadTail<_16>>(dst, value, count);
  } else if (count <= 96) {
    // 33..96 bytes: a 32-byte head, a second 32-byte block only when the range
    // is longer than 64 bytes, and finally a 32-byte tail.
    SplatSet<_32>(dst, value);
    if (count > 64)
      SplatSet<Skip<32>::Then<_32>>(dst, value);
    SplatSet<Tail<_32>>(dst, value, count);
  } else {
    // AArch64ZVA is only attempted for zero fills of at least 448 bytes. When
    // it is not attempted, or reports failure, use the aligned 64-byte loop.
    // NOTE(review): a true result is taken to mean the whole range was
    // handled -- confirm against the AArch64ZVA definition.
    const bool zva_eligible = count >= 448 && value == 0;
    if (!(zva_eligible && AArch64ZVA(dst, count)))
      SplatSet<Align<_16, Arg::_1>::Then<Loop<_64>>>(dst, value, count);
  }
#else
  /////////////////////////////////////////////////////////////////////////////
  // Default
  /////////////////////////////////////////////////////////////////////////////
  using namespace ::__llvm_libc::scalar;
  if (count == 0) {
    // Empty range: nothing to store.
  } else if (count == 1) {
    SplatSet<_1>(dst, value);
  } else if (count == 2) {
    SplatSet<_2>(dst, value);
  } else if (count == 3) {
    SplatSet<_3>(dst, value);
  } else if (count <= 8) {
    SplatSet<HeadTail<_4>>(dst, value, count);
  } else if (count <= 16) {
    SplatSet<HeadTail<_8>>(dst, value, count);
  } else if (count <= 32) {
    SplatSet<HeadTail<_16>>(dst, value, count);
  } else if (count <= 64) {
    SplatSet<HeadTail<_32>>(dst, value, count);
  } else if (count <= 128) {
    SplatSet<HeadTail<_64>>(dst, value, count);
  } else {
    // No upper bound on the size from here on: align on `dst`, then loop.
    SplatSet<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
  }
#endif
}
} // namespace __llvm_libc
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H