src/string/memory_utils/op_aarch64.h - llvm-project/libc - Git at Google

 //===-- aarch64 implementation of memory function building blocks ---------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file provides aarch64 specific building blocks to compose memory
 // functions.
 //
 //===----------------------------------------------------------------------===//
 #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H

 #include "src/__support/macros/properties/architectures.h"

 #if defined(LIBC_TARGET_ARCH_IS_AARCH64)

 #include "src/__support/common.h"
 #include "src/string/memory_utils/op_generic.h"

 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif //__ARM_NEON

 namespace __llvm_libc::aarch64 {

 static inline constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);

 namespace neon {

 struct BzeroCacheLine {
   static constexpr size_t SIZE = 64;

   LIBC_INLINE static void block(Ptr dst, uint8_t) {
 #if __SIZEOF_POINTER__ == 4
     asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
 #else
     asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
 #endif
   }

   LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
     size_t offset = 0;
     do {
       block(dst + offset, value);
       offset += SIZE;
     } while (offset < count - SIZE);
     // Unaligned store, we can't use 'dc zva' here.
     generic::Memset<uint8x64_t>::tail(dst, value, count);
   }
 };

 LIBC_INLINE static bool hasZva() {
   uint64_t zva_val;
   asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
   // DC ZVA is permitted if DZP, bit [4] is zero.
   // BS, bits [3:0] is log2 of the block count in words.
   // So the next line checks whether the instruction is permitted and block
   // count is 16 words (i.e. 64 bytes).
   return (zva_val & 0b11111) == 0b00100;
 }

 } // namespace neon

 ///////////////////////////////////////////////////////////////////////////////
 // Bcmp
 template <size_t Size> struct Bcmp {
   static constexpr size_t SIZE = Size;
   static constexpr size_t BlockSize = 32;

   LIBC_INLINE static const unsigned char *as_u8(CPtr ptr) {
     return reinterpret_cast<const unsigned char *>(ptr);
   }

   LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
     if constexpr (Size == 16) {
       auto _p1 = as_u8(p1);
       auto _p2 = as_u8(p2);
       uint8x16_t a = vld1q_u8(_p1);
       uint8x16_t n = vld1q_u8(_p2);
       uint8x16_t an = veorq_u8(a, n);
       uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an));
       return vmaxv_u32(an_reduced);
     } else if constexpr (Size == 32) {
       auto _p1 = as_u8(p1);
       auto _p2 = as_u8(p2);
       uint8x16_t a = vld1q_u8(_p1);
       uint8x16_t b = vld1q_u8(_p1 + 16);
       uint8x16_t n = vld1q_u8(_p2);
       uint8x16_t o = vld1q_u8(_p2 + 16);
       uint8x16_t an = veorq_u8(a, n);
       uint8x16_t bo = veorq_u8(b, o);
       // anbo = (a ^ n) | (b ^ o).  At least one byte is nonzero if there is
       // a difference between the two buffers.  We reduce this value down to 4
       // bytes in two steps. First, calculate the saturated move value when
       // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get
       // a single 32 bit nonzero value if a mismatch occurred.
       uint8x16_t anbo = vorrq_u8(an, bo);
       uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
       return vmaxv_u32(anbo_reduced);
     } else if constexpr ((Size % BlockSize) == 0) {
       for (size_t offset = 0; offset < Size; offset += BlockSize)
         if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
           return value;
     } else {
       deferred_static_assert("SIZE not implemented");
     }
     return BcmpReturnType::ZERO();
   }

   LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
     return block(p1 + count - SIZE, p2 + count - SIZE);
   }

   LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
     if constexpr (Size == 16) {
       auto _p1 = as_u8(p1);
       auto _p2 = as_u8(p2);
       uint8x16_t a = vld1q_u8(_p1);
       uint8x16_t b = vld1q_u8(_p1 + count - 16);
       uint8x16_t n = vld1q_u8(_p2);
       uint8x16_t o = vld1q_u8(_p2 + count - 16);
       uint8x16_t an = veorq_u8(a, n);
       uint8x16_t bo = veorq_u8(b, o);
       // anbo = (a ^ n) | (b ^ o)
       uint8x16_t anbo = vorrq_u8(an, bo);
       uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
       return vmaxv_u32(anbo_reduced);
     } else if constexpr (Size == 32) {
       auto _p1 = as_u8(p1);
       auto _p2 = as_u8(p2);
       uint8x16_t a = vld1q_u8(_p1);
       uint8x16_t b = vld1q_u8(_p1 + 16);
       uint8x16_t c = vld1q_u8(_p1 + count - 16);
       uint8x16_t d = vld1q_u8(_p1 + count - 32);
       uint8x16_t n = vld1q_u8(_p2);
       uint8x16_t o = vld1q_u8(_p2 + 16);
       uint8x16_t p = vld1q_u8(_p2 + count - 16);
       uint8x16_t q = vld1q_u8(_p2 + count - 32);
       uint8x16_t an = veorq_u8(a, n);
       uint8x16_t bo = veorq_u8(b, o);
       uint8x16_t cp = veorq_u8(c, p);
       uint8x16_t dq = veorq_u8(d, q);
       uint8x16_t anbo = vorrq_u8(an, bo);
       uint8x16_t cpdq = vorrq_u8(cp, dq);
       // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)).  Reduce this to
       // a nonzero 32 bit value if a mismatch occurred.
       uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
       uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
       return vmaxv_u32(abnocpdq_reduced);
     } else {
       deferred_static_assert("SIZE not implemented");
     }
     return BcmpReturnType::ZERO();
   }

   LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
                                                   size_t count) {
     static_assert(Size > 1, "a loop of size 1 does not need tail");
     size_t offset = 0;
     do {
       if (auto value = block(p1 + offset, p2 + offset))
         return value;
       offset += SIZE;
     } while (offset < count - SIZE);
     return tail(p1, p2, count);
   }
 };

 } // namespace __llvm_libc::aarch64

 #endif // LIBC_TARGET_ARCH_IS_AARCH64

 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
	//===-- aarch64 implementation of memory function building blocks ---------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file provides aarch64 specific building blocks to compose memory
	// functions.
	//
	//===----------------------------------------------------------------------===//
	#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
	#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H

	#include "src/__support/macros/properties/architectures.h"

	#if defined(LIBC_TARGET_ARCH_IS_AARCH64)

	#include "src/__support/common.h"
	#include "src/string/memory_utils/op_generic.h"

	#ifdef __ARM_NEON
	#include <arm_neon.h>
	#endif //__ARM_NEON

	namespace __llvm_libc::aarch64 {

	static inline constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);

	namespace neon {

	struct BzeroCacheLine {
	static constexpr size_t SIZE = 64;

	LIBC_INLINE static void block(Ptr dst, uint8_t) {
	#if __SIZEOF_POINTER__ == 4
	asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
	#else
	asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
	#endif
	}

	LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
	size_t offset = 0;
	do {
	block(dst + offset, value);
	offset += SIZE;
	} while (offset < count - SIZE);
	// Unaligned store, we can't use 'dc zva' here.
	generic::Memset<uint8x64_t>::tail(dst, value, count);
	}
	};

	LIBC_INLINE static bool hasZva() {
	uint64_t zva_val;
	asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
	// DC ZVA is permitted if DZP, bit [4] is zero.
	// BS, bits [3:0] is log2 of the block count in words.
	// So the next line checks whether the instruction is permitted and block
	// count is 16 words (i.e. 64 bytes).
	return (zva_val & 0b11111) == 0b00100;
	}

	} // namespace neon

	///////////////////////////////////////////////////////////////////////////////
	// Bcmp
	template <size_t Size> struct Bcmp {
	static constexpr size_t SIZE = Size;
	static constexpr size_t BlockSize = 32;

	LIBC_INLINE static const unsigned char *as_u8(CPtr ptr) {
	return reinterpret_cast<const unsigned char *>(ptr);
	}

	LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
	if constexpr (Size == 16) {
	auto _p1 = as_u8(p1);
	auto _p2 = as_u8(p2);
	uint8x16_t a = vld1q_u8(_p1);
	uint8x16_t n = vld1q_u8(_p2);
	uint8x16_t an = veorq_u8(a, n);
	uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an));
	return vmaxv_u32(an_reduced);
	} else if constexpr (Size == 32) {
	auto _p1 = as_u8(p1);
	auto _p2 = as_u8(p2);
	uint8x16_t a = vld1q_u8(_p1);
	uint8x16_t b = vld1q_u8(_p1 + 16);
	uint8x16_t n = vld1q_u8(_p2);
	uint8x16_t o = vld1q_u8(_p2 + 16);
	uint8x16_t an = veorq_u8(a, n);
	uint8x16_t bo = veorq_u8(b, o);
	// anbo = (a ^ n) \| (b ^ o). At least one byte is nonzero if there is
	// a difference between the two buffers. We reduce this value down to 4
	// bytes in two steps. First, calculate the saturated move value when
	// going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get
	// a single 32 bit nonzero value if a mismatch occurred.
	uint8x16_t anbo = vorrq_u8(an, bo);
	uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
	return vmaxv_u32(anbo_reduced);
	} else if constexpr ((Size % BlockSize) == 0) {
	for (size_t offset = 0; offset < Size; offset += BlockSize)
	if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
	return value;
	} else {
	deferred_static_assert("SIZE not implemented");
	}
	return BcmpReturnType::ZERO();
	}

	LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
	return block(p1 + count - SIZE, p2 + count - SIZE);
	}

	LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
	if constexpr (Size == 16) {
	auto _p1 = as_u8(p1);
	auto _p2 = as_u8(p2);
	uint8x16_t a = vld1q_u8(_p1);
	uint8x16_t b = vld1q_u8(_p1 + count - 16);
	uint8x16_t n = vld1q_u8(_p2);
	uint8x16_t o = vld1q_u8(_p2 + count - 16);
	uint8x16_t an = veorq_u8(a, n);
	uint8x16_t bo = veorq_u8(b, o);
	// anbo = (a ^ n) \| (b ^ o)
	uint8x16_t anbo = vorrq_u8(an, bo);
	uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
	return vmaxv_u32(anbo_reduced);
	} else if constexpr (Size == 32) {
	auto _p1 = as_u8(p1);
	auto _p2 = as_u8(p2);
	uint8x16_t a = vld1q_u8(_p1);
	uint8x16_t b = vld1q_u8(_p1 + 16);
	uint8x16_t c = vld1q_u8(_p1 + count - 16);
	uint8x16_t d = vld1q_u8(_p1 + count - 32);
	uint8x16_t n = vld1q_u8(_p2);
	uint8x16_t o = vld1q_u8(_p2 + 16);
	uint8x16_t p = vld1q_u8(_p2 + count - 16);
	uint8x16_t q = vld1q_u8(_p2 + count - 32);
	uint8x16_t an = veorq_u8(a, n);
	uint8x16_t bo = veorq_u8(b, o);
	uint8x16_t cp = veorq_u8(c, p);
	uint8x16_t dq = veorq_u8(d, q);
	uint8x16_t anbo = vorrq_u8(an, bo);
	uint8x16_t cpdq = vorrq_u8(cp, dq);
	// abnocpdq = ((a ^ n) \| (b ^ o)) \| ((c ^ p) \| (d ^ q)). Reduce this to
	// a nonzero 32 bit value if a mismatch occurred.
	uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo \| cpdq);
	uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
	return vmaxv_u32(abnocpdq_reduced);
	} else {
	deferred_static_assert("SIZE not implemented");
	}
	return BcmpReturnType::ZERO();
	}

	LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
	size_t count) {
	static_assert(Size > 1, "a loop of size 1 does not need tail");
	size_t offset = 0;
	do {
	if (auto value = block(p1 + offset, p2 + offset))
	return value;
	offset += SIZE;
	} while (offset < count - SIZE);
	return tail(p1, p2, count);
	}
	};

	} // namespace __llvm_libc::aarch64

	#endif // LIBC_TARGET_ARCH_IS_AARCH64

	#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H