/* AOR_v20.02/string/aarch64/strchr-mte.S - llvm-project/libc - Git at Google */

 /*
  * strchr - find a character in a string
  *
  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  * See https://llvm.org/LICENSE.txt for license information.
  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  */

 /* Assumptions:
  *
  * ARMv8-a, AArch64
  * Neon Available.
  */

 #include "../asmdefs.h"

 /* Arguments and results.  */
 /* srcin: address of the string to scan.
    chrin: character to search for; it is broadcast with a .16b DUP, so
           only its low byte takes part in the comparison.  */
 #define srcin		x0
 #define chrin		w1

 /* result shares x0 with srcin; it is written only on the way out.  */
 #define result		x0

 /* General-purpose scratch registers.
    src:   16-byte-aligned cursor stepping through the string in chunks.
    tmp1:  misalignment of srcin, then the padding mask, then the
           syndrome, and finally the bit index of the first hit.
    wtmp2: holds the 0x10011001 magic constant while it is broadcast.
    tmp3:  all-ones source for the padding mask, then the syndrome of
           the first (unaligned) chunk.  */
 #define src		x2
 #define	tmp1		x3
 #define wtmp2		w4
 #define tmp3		x5

 /* Vector scratch registers.
    vrepchr:      chrin replicated into all 16 byte lanes.
    qdata, vdata: two names (q1 / v1) for the same register, which
                  holds the 16-byte chunk being examined.
    vhas_nul:     per-byte result of comparing the chunk against 0.
    vhas_chr:     per-byte result of comparing the chunk against vrepchr.
    vrepmask_0:   0x20022002 in every 32-bit lane (syndrome bit for NUL).
    vrepmask_c:   0x10011001 in every 32-bit lane (syndrome bit for a
                  character match).
    vend:         combined NUL/match information, narrowed to 64 bits.  */
 #define vrepchr		v0
 #define qdata		q1
 #define vdata		v1
 #define vhas_nul	v2
 #define vhas_chr	v3
 #define vrepmask_0	v4
 #define vrepmask_c	v5
 #define vend		v6

 /* L(name) pastes a ".L" prefix onto name to form the label used for
    branch targets inside the routine.  */
 #define L(l) .L ## l

 /* Core algorithm.

    For each 16-byte chunk we calculate a 64-bit syndrome value, with
    four bits per byte (LSB is always in bits 0 and 1, for both big
    and little-endian systems).  For each tuple, bit 0 is set if
    the relevant byte matched the requested character; bit 1 is set
    if the relevant byte matched the NUL end of string (we trigger
    off bit0 for the special case of looking for NUL) and bits 2 and 3
    are not used.
    Since the bits in the syndrome reflect exactly the order in which
    things occur in the original string a count_trailing_zeros()
    operation will identify exactly which byte is causing the termination,
    and why. */

 /* Locals and temporaries. */

 /* __strchr_aarch64_mte - locate the first occurrence of a byte in a
    NUL-terminated string.

    In:    x0 (srcin) = address of the string.
           w1 (chrin) = character to look for; only the low byte is used.
    Out:   x0 (result) = address of the first matching byte, or 0 when
           the terminating NUL is reached first.  When the character
           searched for is NUL itself, the address of the terminator is
           returned.
    Clobb: x2-x5, v0-v6 and the condition flags.  No stack is used.

    Every load reads exactly 16 bytes from a 16-byte-aligned address, and
    scanning stops at the first chunk that contains a match or the NUL.

    Chunks are loaded with LD1 {Vt.16B} rather than LDR Qt.  LD1 places
    the byte at the lowest address in lane 0 on both little- and
    big-endian systems, which is what the syndrome layout relies on;
    LDR Qt reverses the byte order within the register on big-endian, so
    the syndrome would then identify the last hit in a chunk instead of
    the first.  */
 ENTRY(__strchr_aarch64_mte)
 	/* Magic constant 0x10011001 to allow us to identify which lane
 	   matches the requested byte.  Magic constant 0x20022002 used
 	   similarly for NUL termination. */
 	mov	wtmp2, #0x1001
 	movk	wtmp2, #0x1001, lsl #16
 	dup	vrepchr.16b, chrin
 	bic	src, srcin, #15		/* Work with aligned 16-byte chunks. */
 	dup	vrepmask_c.4s, wtmp2
 	ands	tmp1, srcin, #15	/* tmp1 = bytes of padding before srcin. */
 	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
 	b.eq	L(loop)			/* Already aligned: no padding to mask. */

 	/* Input string is not 16-byte aligned.  Rather than forcing
 	   the padding bytes to a safe value, we calculate the syndrome
 	   for all the bytes, but then mask off those bits of the
 	   syndrome that are related to the padding.  */
 	ld1	{vdata.16b}, [src], #16
 	cmeq	vhas_nul.16b, vdata.16b, #0
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	and	vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b
 	and	vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b
 	lsl	tmp1, tmp1, #2		/* Four syndrome bits per byte. */
 	orr	vend.16b, vhas_nul.16b, vhas_chr.16b
 	mov	tmp3, #~0
 	addp	vend.16b, vend.16b, vend.16b		/* 128->64 */
 	lsl	tmp1, tmp3, tmp1	/* Ones above the padding nibbles. */

 	mov	tmp3, vend.d[0]
 	ands	tmp1, tmp3, tmp1	/* Mask padding bits. */
 	b.ne	L(tail)

 	/* Main loop, two chunks per iteration.  On entry src addresses the
 	   next chunk to examine; each load steps it on by 16, so whenever a
 	   chunk stops the scan src is 16 bytes past the start of that
 	   chunk.  */
 L(loop):
 	ld1	{vdata.16b}, [src], #16
 	cmeq	vhas_nul.16b, vdata.16b, #0
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	/* Use a fast check for the termination condition.  */
 	orr	vend.16b, vhas_nul.16b, vhas_chr.16b
 	addp	vend.16b, vend.16b, vend.16b		/* 128->64 */
 	mov	tmp1, vend.d[0]
 	cbnz	tmp1, L(end)

 	ld1	{vdata.16b}, [src], #16
 	cmeq	vhas_nul.16b, vdata.16b, #0
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	/* Use a fast check for the termination condition.  */
 	orr	vend.16b, vhas_nul.16b, vhas_chr.16b
 	addp	vend.16b, vend.16b, vend.16b		/* 128->64 */
 	mov	tmp1, vend.d[0]
 	cbz	tmp1, L(loop)

 L(end):
 	/* Termination condition found.  Now need to establish exactly why
 	   we terminated: rebuild the syndrome with one distinct bit per
 	   condition so that the first hit and its kind can be read off.  */
 	and	vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b
 	and	vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b
 	orr	vend.16b, vhas_nul.16b, vhas_chr.16b
 	addp	vend.16b, vend.16b, vend.16b		/* 128->64 */

 	mov	tmp1, vend.d[0]
 L(tail):
 	/* Here tmp1 is a non-zero syndrome and src is 16 bytes past the
 	   start of the chunk it describes.  */
 	/* Count the trailing zeros, by bit reversing...  */
 	rbit	tmp1, tmp1
 	/* Re-bias source.  */
 	sub	src, src, #16
 	clz	tmp1, tmp1	/* And counting the leading zeros.  */
 	/* Tmp1 is even if the target character was found first.  Otherwise
 	   we've found the end of string and we weren't looking for NUL.  */
 	tst	tmp1, #1
 	add	result, src, tmp1, lsr #2	/* Byte index = bit index / 4. */
 	csel	result, result, xzr, eq
 	ret

 END(__strchr_aarch64_mte)
	/*
	* strchr - find a character in a string
	*
	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	* See https://llvm.org/LICENSE.txt for license information.
	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	*/

	/* Assumptions:
	*
	* ARMv8-a, AArch64
	* Neon Available.
	*/

	#include "../asmdefs.h"

	/* Arguments and results. */
	/* srcin: address of the string to scan.
	   chrin: character to search for; it is broadcast with a .16b DUP,
	          so only its low byte takes part in the comparison. */
	#define srcin x0
	#define chrin w1

	/* result shares x0 with srcin; it is written only on the way out. */
	#define result x0

	/* General-purpose scratch registers.
	   src:   16-byte-aligned cursor stepping through the string in chunks.
	   tmp1:  misalignment of srcin, then the padding mask, then the
	          syndrome, and finally the bit index of the first hit.
	   wtmp2: holds the 0x10011001 magic constant while it is broadcast.
	   tmp3:  all-ones source for the padding mask, then the syndrome of
	          the first (unaligned) chunk. */
	#define src x2
	#define tmp1 x3
	#define wtmp2 w4
	#define tmp3 x5

	/* Vector scratch registers.
	   vrepchr:      chrin replicated into all 16 byte lanes.
	   qdata, vdata: two names (q1 / v1) for the same register, which
	                 holds the 16-byte chunk being examined.
	   vhas_nul:     per-byte result of comparing the chunk against 0.
	   vhas_chr:     per-byte result of comparing the chunk against vrepchr.
	   vrepmask_0:   0x20022002 in every 32-bit lane (syndrome bit for NUL).
	   vrepmask_c:   0x10011001 in every 32-bit lane (syndrome bit for a
	                 character match).
	   vend:         combined NUL/match information, narrowed to 64 bits. */
	#define vrepchr v0
	#define qdata q1
	#define vdata v1
	#define vhas_nul v2
	#define vhas_chr v3
	#define vrepmask_0 v4
	#define vrepmask_c v5
	#define vend v6

	/* L(name) pastes a ".L" prefix onto name to form the label used for
	   branch targets inside the routine. */
	#define L(l) .L ## l

	/* Core algorithm.

	For each 16-byte chunk we calculate a 64-bit syndrome value, with
	four bits per byte (LSB is always in bits 0 and 1, for both big
	and little-endian systems). For each tuple, bit 0 is set if
	the relevant byte matched the requested character; bit 1 is set
	if the relevant byte matched the NUL end of string (we trigger
	off bit0 for the special case of looking for NUL) and bits 2 and 3
	are not used.
	Since the bits in the syndrome reflect exactly the order in which
	things occur in the original string a count_trailing_zeros()
	operation will identify exactly which byte is causing the termination,
	and why. */

	/* Locals and temporaries. */

	/* __strchr_aarch64_mte - locate the first occurrence of a byte in a
	   NUL-terminated string.

	   In:    x0 (srcin) = address of the string.
	          w1 (chrin) = character to look for; only the low byte is used.
	   Out:   x0 (result) = address of the first matching byte, or 0 when
	          the terminating NUL is reached first.  When the character
	          searched for is NUL itself, the address of the terminator is
	          returned.
	   Clobb: x2-x5, v0-v6 and the condition flags.  No stack is used.

	   Every load reads exactly 16 bytes from a 16-byte-aligned address,
	   and scanning stops at the first chunk that contains a match or the
	   NUL.

	   Chunks are loaded with LD1 {Vt.16B} rather than LDR Qt.  LD1 places
	   the byte at the lowest address in lane 0 on both little- and
	   big-endian systems, which is what the syndrome layout relies on;
	   LDR Qt reverses the byte order within the register on big-endian,
	   so the syndrome would then identify the last hit in a chunk instead
	   of the first. */
	ENTRY(__strchr_aarch64_mte)
	/* Magic constant 0x10011001 to allow us to identify which lane
	   matches the requested byte. Magic constant 0x20022002 used
	   similarly for NUL termination. */
	mov	wtmp2, #0x1001
	movk	wtmp2, #0x1001, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #15		/* Work with aligned 16-byte chunks. */
	dup	vrepmask_c.4s, wtmp2
	ands	tmp1, srcin, #15	/* tmp1 = bytes of padding before srcin. */
	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
	b.eq	L(loop)			/* Already aligned: no padding to mask. */

	/* Input string is not 16-byte aligned. Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding. */
	ld1	{vdata.16b}, [src], #16
	cmeq	vhas_nul.16b, vdata.16b, #0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	and	vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b
	and	vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b
	lsl	tmp1, tmp1, #2		/* Four syndrome bits per byte. */
	orr	vend.16b, vhas_nul.16b, vhas_chr.16b
	mov	tmp3, #~0
	addp	vend.16b, vend.16b, vend.16b /* 128->64 */
	lsl	tmp1, tmp3, tmp1	/* Ones above the padding nibbles. */

	mov	tmp3, vend.d[0]
	ands	tmp1, tmp3, tmp1 /* Mask padding bits. */
	b.ne	L(tail)

	/* Main loop, two chunks per iteration. On entry src addresses the
	   next chunk to examine; each load steps it on by 16, so whenever a
	   chunk stops the scan src is 16 bytes past the start of that
	   chunk. */
	L(loop):
	ld1	{vdata.16b}, [src], #16
	cmeq	vhas_nul.16b, vdata.16b, #0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* Use a fast check for the termination condition. */
	orr	vend.16b, vhas_nul.16b, vhas_chr.16b
	addp	vend.16b, vend.16b, vend.16b /* 128->64 */
	mov	tmp1, vend.d[0]
	cbnz	tmp1, L(end)

	ld1	{vdata.16b}, [src], #16
	cmeq	vhas_nul.16b, vdata.16b, #0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* Use a fast check for the termination condition. */
	orr	vend.16b, vhas_nul.16b, vhas_chr.16b
	addp	vend.16b, vend.16b, vend.16b /* 128->64 */
	mov	tmp1, vend.d[0]
	cbz	tmp1, L(loop)

	L(end):
	/* Termination condition found. Now need to establish exactly why
	   we terminated: rebuild the syndrome with one distinct bit per
	   condition so that the first hit and its kind can be read off. */
	and	vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b
	and	vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b
	orr	vend.16b, vhas_nul.16b, vhas_chr.16b
	addp	vend.16b, vend.16b, vend.16b /* 128->64 */

	mov	tmp1, vend.d[0]
	L(tail):
	/* Here tmp1 is a non-zero syndrome and src is 16 bytes past the
	   start of the chunk it describes. */
	/* Count the trailing zeros, by bit reversing... */
	rbit	tmp1, tmp1
	/* Re-bias source. */
	sub	src, src, #16
	clz	tmp1, tmp1 /* And counting the leading zeros. */
	/* Tmp1 is even if the target character was found first. Otherwise
	   we've found the end of string and we weren't looking for NUL. */
	tst	tmp1, #1
	add	result, src, tmp1, lsr #2	/* Byte index = bit index / 4. */
	csel	result, result, xzr, eq
	ret

	END(__strchr_aarch64_mte)