//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains assembly-optimized implementations of Scalable Matrix
/// Extension (SME) compatible memset and memchr functions.
///
/// These implementations depend on unaligned access and floating-point support.
///
/// Routines taken from libc/AOR_v20.02/string/aarch64.
///
//===----------------------------------------------------------------------===//

#include "../assembly.h"

//
// __arm_sc_memset
//

#define dstin   x0
#define val     x1
#define valw    w1
#define count   x2
#define dst     x3
#define dstend2 x4
#define zva_val x5

DEFINE_COMPILERRT_FUNCTION(__arm_sc_memset)
#ifdef __ARM_FEATURE_SVE
        mov     z0.b, valw              // SVE: broadcast the fill byte to every lane.
#else
        bfi     valw, valw, #8, #8      // Replicate the fill byte into bits 15:8,
        bfi     valw, valw, #16, #16    // then into bits 31:16,
        bfi     val, val, #32, #32      // then into the upper 32 bits.
        fmov    d0, val
        fmov    v0.d[1], val            // v0 now holds the byte in all 16 lanes.
#endif
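        /*
         * Rough C equivalent of the non-SVE broadcast above (illustrative
         * only, not assembled):
         *
         *   uint64_t v = (uint64_t)(valw & 0xff) * 0x0101010101010101ull;
         *   // v0.d[0] == v0.d[1] == v
         */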
        add     dstend2, dstin, count

        cmp     count, 96
        b.hi    7f  // set_long
        cmp     count, 16
        b.hs    4f  // set_medium
        mov     val, v0.d[0]

        /* Set 0..15 bytes. */
        tbz     count, 3, 1f
        str     val, [dstin]
        str     val, [dstend2, -8]
        ret
        nop                             // Padding to align the branch target below.
1:      tbz     count, 2, 2f
        str     valw, [dstin]
        str     valw, [dstend2, -4]
        ret
2:      cbz     count, 3f
        strb    valw, [dstin]
        tbz     count, 1, 3f
        strh    valw, [dstend2, -2]
3:      ret
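        /*
         * The 0..15 byte cases above rely on overlapping stores: write a
         * power-of-two-sized chunk at the start and again at the end, so any
         * length in range is covered without a byte loop. Rough C sketch
         * (illustrative only; store8/store4/store2/store1 are hypothetical
         * helpers standing in for the str/strh/strb instructions):
         *
         *   if (count & 8) { store8(dst, v); store8(dstend - 8, v); return; }
         *   if (count & 4) { store4(dst, v); store4(dstend - 4, v); return; }
         *   if (count) {
         *       store1(dst, v);
         *       if (count & 2) store2(dstend - 2, v);
         *   }
         */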

        /* Set 16..96 bytes. */
4:      // set_medium
        str     q0, [dstin]
        tbnz    count, 6, 6f  // set96
        str     q0, [dstend2, -16]
        tbz     count, 5, 5f
        str     q0, [dstin, 16]
        str     q0, [dstend2, -32]
5:      ret

        .p2align 4
        /* Set 64..96 bytes. Write 64 bytes from the start and
           32 bytes from the end. */
6:      // set96
        str     q0, [dstin, 16]
        stp     q0, q0, [dstin, 32]
        stp     q0, q0, [dstend2, -32]
        ret

        .p2align 4
7:      // set_long
        and     valw, valw, 255         /* Isolate the fill byte. */
        bic     dst, dstin, 15          /* Align dst down to 16 bytes. */
        str     q0, [dstin]
        cmp     count, 160
        ccmp    valw, 0, 0, hs          /* ZVA requires count >= 160 and val == 0. */
        b.ne    9f  // no_zva

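        /*
         * "dc zva" zeroes an entire block (64 bytes here) per instruction and
         * only writes zeroes, hence the val == 0 check above. DCZID_EL0 bits
         * [3:0] hold log2 of the block size in 4-byte words and bit 4 (DZP)
         * is set when zeroing is prohibited, so masking with 31 and comparing
         * against 4 accepts only a permitted 64-byte block size.
         */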
#ifndef SKIP_ZVA_CHECK
        mrs     zva_val, dczid_el0
        and     zva_val, zva_val, 31
        cmp     zva_val, 4              /* ZVA size is 64 bytes. */
        b.ne    9f  // no_zva
#endif
        str     q0, [dst, 16]
        stp     q0, q0, [dst, 32]
        bic     dst, dst, 63
        sub     count, dstend2, dst     /* Count is now 64 too large. */
        sub     count, count, 128       /* Adjust count and bias for loop. */
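        /*
         * Each iteration below zeroes one aligned 64-byte block at dst + 64;
         * the unaligned head was filled by the stores above and the final 64
         * bytes are filled by the two stp stores after the loop.
         */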

        .p2align 4
8:      // zva_loop
        add     dst, dst, 64
        dc      zva, dst
        subs    count, count, 64
        b.hi    8b  // zva_loop
        stp     q0, q0, [dstend2, -64]
        stp     q0, q0, [dstend2, -32]
        ret

9:      // no_zva
        sub     count, dstend2, dst     /* Count is 16 too large. */
        sub     dst, dst, 16            /* Dst is biased by -32. */
        sub     count, count, 64 + 16   /* Adjust count and bias for loop. */
10:     // no_zva_loop
        stp     q0, q0, [dst, 32]
        stp     q0, q0, [dst, 64]!
        subs    count, count, 64
        b.hi    10b  // no_zva_loop
        stp     q0, q0, [dstend2, -64]
        stp     q0, q0, [dstend2, -32]
        ret
END_COMPILERRT_FUNCTION(__arm_sc_memset)

//
// __arm_sc_memchr
//

#define srcin   x0
#define chrin   w1
#define cntin   x2

#define result  x0

#define src     x3
#define tmp     x4
#define wtmp2   w5
#define synd    x6
#define soff    x9
#define cntrem  x10

#define vrepchr   v0
#define vdata1    v1
#define vdata2    v2
#define vhas_chr1 v3
#define vhas_chr2 v4
#define vrepmask  v5
#define vend      v6

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. In each two-bit tuple, bit 0 is set if the corresponding byte
 * matched the requested character and bit 1 is unused (this is faster than
 * using a 32-bit syndrome). Since the bits in the syndrome reflect exactly
 * the order in which things occur in the original string, counting trailing
 * zeros identifies exactly which byte matched.
 */
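
/*
 * Rough C sketch of the per-chunk step (illustrative only, not assembled;
 * `chunk` points at the 32-byte block and `c` is the requested character):
 *
 *   uint64_t syndrome = 0;
 *   for (int i = 0; i < 32; i++)
 *       if (chunk[i] == c)
 *           syndrome |= 1ull << (2 * i);  // bit 0 of byte i's 2-bit tuple
 *   if (syndrome)
 *       return chunk + __builtin_ctzll(syndrome) / 2;
 */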

DEFINE_COMPILERRT_FUNCTION(__arm_sc_memchr)
        /* Do not dereference srcin if no bytes to compare. */
        cbz     cntin, 4f
        /*
         * Magic constant 0x40100401 gives each byte within a 32-bit lane a
         * distinct weight (0x01, 0x04, 0x10, 0x40), allowing us to identify
         * which lane matches the requested byte.
         */
        mov     wtmp2, #0x0401
        movk    wtmp2, #0x4010, lsl #16
        dup     vrepchr.16b, chrin
        /* Work with aligned 32-byte chunks */
        bic     src, srcin, #31
        dup     vrepmask.4s, wtmp2
        ands    soff, srcin, #31        /* soff = srcin's offset within its chunk. */
        and     cntrem, cntin, #31      /* cntrem = cntin % 32. */
        b.eq    0f                      // Already aligned: go straight to the loop.

        /*
         * Input string is not 32-byte aligned. We calculate the syndrome
         * value for the aligned 32-byte block containing the first bytes and
         * mask off the irrelevant part.
         */

        ld1     {vdata1.16b, vdata2.16b}, [src], #32
        sub     tmp, soff, #32
        adds    cntin, cntin, tmp       /* cntin += soff - 32 (flags used by b.ls below). */
        cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
        cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
        and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
        and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
        addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b  /* 256->128 */
        addp    vend.16b, vend.16b, vend.16b            /* 128->64 */
        mov     synd, vend.d[0]
        /* Clear the soff*2 lower bits */
        lsl     tmp, soff, #1
        lsr     synd, synd, tmp
        lsl     synd, synd, tmp
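        /*
         * Worked example: for soff = 3 the first three bytes of the block
         * precede srcin, so the low 6 syndrome bits (two per byte) are
         * shifted out and back, clearing them.
         */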
        /* The first block can also be the last */
        b.ls    2f
        /* Have we found something already? */
        cbnz    synd, 3f

0:      // loop
        ld1     {vdata1.16b, vdata2.16b}, [src], #32
        subs    cntin, cntin, #32
        cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
        cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
        /* If we're out of data we finish regardless of the result */
        b.ls    1f
        /* Use a fast check for the termination condition */
        orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b
        addp    vend.2d, vend.2d, vend.2d
        mov     synd, vend.d[0]
        /* We're not out of data, loop if we haven't found the character */
        cbz     synd, 0b

1:      // end
        /* Termination condition found, let's calculate the syndrome value */
        and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
        and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
        addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b  /* 256->128 */
        addp    vend.16b, vend.16b, vend.16b            /* 128->64 */
        mov     synd, vend.d[0]
        /* Only do the clear for the last possible block */
        b.hi    3f

2:      // masklast
        /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
        add     tmp, cntrem, soff
        and     tmp, tmp, #31
        sub     tmp, tmp, #32
        neg     tmp, tmp, lsl #1
        lsl     synd, synd, tmp
        lsr     synd, synd, tmp
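        /*
         * Worked example: if (cntrem + soff) % 32 == 8, only the first 8
         * bytes of the final block are valid, so tmp = (32 - 8) * 2 = 48 and
         * the shifts clear the top 48 syndrome bits. A fully valid final
         * block yields a shift amount of 64, which AArch64 variable shifts
         * reduce modulo 64 to 0, so nothing is cleared.
         */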

3:      // tail
        /* Count the trailing zeros using bit reversing */
        rbit    synd, synd
        /* Compensate for the last post-increment */
        sub     src, src, #32
        /* Check that we have found a character */
        cmp     synd, #0
        /* And count the leading zeros */
        clz     synd, synd
        /* Compute the potential result: block base + bit index / 2 */
        add     result, src, synd, lsr #1
        /* Select result or NULL */
        csel    result, xzr, result, eq
        ret

4:      // zero_length
        mov     result, #0
        ret
END_COMPILERRT_FUNCTION(__arm_sc_memchr)