Reapply "[libc] Enable wide-read memory operations by default on Linux (#154602)" (#154640)

Reland afterr the sanitizer and arm32 builds complained.

GitOrigin-RevId: 6ac01d12d630d534f61d43578b4d7c2c35526874
diff --git a/config/linux/arm/config.json b/config/linux/arm/config.json
new file mode 100644
index 0000000..e7ad454
--- /dev/null
+++ b/config/linux/arm/config.json
@@ -0,0 +1,7 @@
+{
+  "string": {
+    "LIBC_CONF_STRING_UNSAFE_WIDE_READ": {
+      "value": false
+    }
+  }
+}
diff --git a/config/linux/config.json b/config/linux/config.json
new file mode 100644
index 0000000..30e8b2c
--- /dev/null
+++ b/config/linux/config.json
@@ -0,0 +1,7 @@
+{
+  "string": {
+    "LIBC_CONF_STRING_UNSAFE_WIDE_READ": {
+      "value": true
+    }
+  }
+}
diff --git a/src/string/memory_utils/aarch64/inline_strlen.h b/src/string/memory_utils/aarch64/inline_strlen.h
index 79487f4..36fd1aa 100644
--- a/src/string/memory_utils/aarch64/inline_strlen.h
+++ b/src/string/memory_utils/aarch64/inline_strlen.h
@@ -17,14 +17,16 @@
 namespace LIBC_NAMESPACE_DECL {
 
 namespace neon {
-[[maybe_unused]] LIBC_INLINE size_t string_length(const char *src) {
+[[gnu::no_sanitize_address]] [[maybe_unused]] LIBC_INLINE static size_t
+string_length(const char *src) {
   using Vector __attribute__((may_alias)) = uint8x8_t;
 
   uintptr_t misalign_bytes = reinterpret_cast<uintptr_t>(src) % sizeof(Vector);
-  Vector *block_ptr = reinterpret_cast<Vector *>(src - misalign_bytes);
+  const Vector *block_ptr =
+      reinterpret_cast<const Vector *>(src - misalign_bytes);
   Vector v = *block_ptr;
   Vector vcmp = vceqz_u8(v);
-  uint64x1_t cmp_mask = vreinterpret_u64_s8(vcmp);
+  uint64x1_t cmp_mask = vreinterpret_u64_u8(vcmp);
   uint64_t cmp = vget_lane_u64(cmp_mask, 0);
   cmp = cmp >> (misalign_bytes << 3);
   if (cmp)
@@ -34,7 +36,7 @@
     ++block_ptr;
     v = *block_ptr;
     vcmp = vceqz_u8(v);
-    cmp_mask = vreinterpret_u64_s8(vcmp);
+    cmp_mask = vreinterpret_u64_u8(vcmp);
     cmp = vget_lane_u64(cmp_mask, 0);
     if (cmp)
       return static_cast<size_t>(reinterpret_cast<uintptr_t>(block_ptr) -
diff --git a/src/string/memory_utils/x86_64/inline_strlen.h b/src/string/memory_utils/x86_64/inline_strlen.h
index 5eb184c..6dad6ac 100644
--- a/src/string/memory_utils/x86_64/inline_strlen.h
+++ b/src/string/memory_utils/x86_64/inline_strlen.h
@@ -18,22 +18,23 @@
 namespace string_length_internal {
 // Return a bit-mask with the nth bit set if the nth-byte in block_ptr is zero.
 template <typename Vector, typename Mask>
-Mask CompareAndMask(const Vector *block_ptr);
+LIBC_INLINE static Mask compare_and_mask(const Vector *block_ptr);
 
 template <typename Vector, typename Mask,
-          decltype(CompareAndMask<Vector, Mask>)>
-size_t string_length_vector(const char *src) {
+          decltype(compare_and_mask<Vector, Mask>)>
+[[gnu::no_sanitize_address]] LIBC_INLINE static size_t
+string_length_vector(const char *src) {
   uintptr_t misalign_bytes = reinterpret_cast<uintptr_t>(src) % sizeof(Vector);
 
   const Vector *block_ptr =
       reinterpret_cast<const Vector *>(src - misalign_bytes);
-  auto cmp = CompareAndMask<Vector, Mask>(block_ptr) >> misalign_bytes;
+  auto cmp = compare_and_mask<Vector, Mask>(block_ptr) >> misalign_bytes;
   if (cmp)
     return cpp::countr_zero(cmp);
 
   while (true) {
     block_ptr++;
-    cmp = CompareAndMask<Vector, Mask>(block_ptr);
+    cmp = compare_and_mask<Vector, Mask>(block_ptr);
     if (cmp)
       return static_cast<size_t>(reinterpret_cast<uintptr_t>(block_ptr) -
                                  reinterpret_cast<uintptr_t>(src) +
@@ -42,7 +43,8 @@
 }
 
 template <>
-uint32_t CompareAndMask<__m128i, uint32_t>(const __m128i *block_ptr) {
+LIBC_INLINE uint32_t
+compare_and_mask<__m128i, uint32_t>(const __m128i *block_ptr) {
   __m128i v = _mm_load_si128(block_ptr);
   __m128i z = _mm_setzero_si128();
   __m128i c = _mm_cmpeq_epi8(z, v);
@@ -52,13 +54,14 @@
 namespace sse2 {
 [[maybe_unused]] LIBC_INLINE size_t string_length(const char *src) {
   return string_length_vector<__m128i, uint32_t,
-                              CompareAndMask<__m128i, uint32_t>>(src);
+                              compare_and_mask<__m128i, uint32_t>>(src);
 }
 } // namespace sse2
 
 #if defined(__AVX2__)
 template <>
-uint32_t CompareAndMask<__m256i, uint32_t>(const __m256i *block_ptr) {
+LIBC_INLINE uint32_t
+compare_and_mask<__m256i, uint32_t>(const __m256i *block_ptr) {
   __m256i v = _mm256_load_si256(block_ptr);
   __m256i z = _mm256_setzero_si256();
   __m256i c = _mm256_cmpeq_epi8(z, v);
@@ -68,14 +71,15 @@
 namespace avx2 {
 [[maybe_unused]] LIBC_INLINE size_t string_length(const char *src) {
   return string_length_vector<__m256i, uint32_t,
-                              CompareAndMask<__m256i, uint32_t>>(src);
+                              compare_and_mask<__m256i, uint32_t>>(src);
 }
 } // namespace avx2
 #endif
 
 #if defined(__AVX512F__)
 template <>
-__mmask64 CompareAndMask<__m512i, __mmask64>(const __m512i *block_ptr) {
+LIBC_INLINE __mmask64
+compare_and_mask<__m512i, __mmask64>(const __m512i *block_ptr) {
   __m512i v = _mm512_load_si512(block_ptr);
   __m512i z = _mm512_setzero_si512();
   return _mm512_cmp_epu8_mask(z, v, _MM_CMPINT_EQ);
@@ -83,7 +87,7 @@
 namespace avx512 {
 [[maybe_unused]] LIBC_INLINE size_t string_length(const char *src) {
   return string_length_vector<__m512i, __mmask64,
-                              CompareAndMask<__m512i, __mmask64>>(src);
+                              compare_and_mask<__m512i, __mmask64>>(src);
 }
 } // namespace avx512
 #endif