[libc] Extend fputil::sqrt to use floating point instructions for arm32. (#134499)

diff --git a/libc/src/__support/FPUtil/aarch64/sqrt.h b/libc/src/__support/FPUtil/arm/sqrt.h
similarity index 61%
rename from libc/src/__support/FPUtil/aarch64/sqrt.h
rename to libc/src/__support/FPUtil/arm/sqrt.h
index b69267f..39ac539 100644
--- a/libc/src/__support/FPUtil/aarch64/sqrt.h
+++ b/libc/src/__support/FPUtil/arm/sqrt.h
@@ -6,35 +6,38 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_AARCH64_SQRT_H
-#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_AARCH64_SQRT_H
+#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_ARM_SQRT_H
+#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_ARM_SQRT_H
 
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
-#if !defined(LIBC_TARGET_ARCH_IS_AARCH64)
+#if !defined(LIBC_TARGET_ARCH_IS_ANY_ARM)
 #error "Invalid include"
 #endif
 
-#include "src/__support/FPUtil/generic/sqrt.h"
-
 namespace LIBC_NAMESPACE_DECL {
 namespace fputil {
 
+#ifdef LIBC_TARGET_CPU_HAS_FPU_FLOAT
 template <> LIBC_INLINE float sqrt<float>(float x) {
   float y;
-  __asm__ __volatile__("fsqrt %s0, %s1\n\t" : "=w"(y) : "w"(x));
+  asm("fsqrt %s0, %s1\n\t" : "=w"(y) : "w"(x));
   return y;
 }
+#endif // LIBC_TARGET_CPU_HAS_FPU_FLOAT
 
+#ifdef LIBC_TARGET_CPU_HAS_FPU_DOUBLE
 template <> LIBC_INLINE double sqrt<double>(double x) {
   double y;
-  __asm__ __volatile__("fsqrt %d0, %d1\n\t" : "=w"(y) : "w"(x));
+  asm("fsqrt %d0, %d1\n\t" : "=w"(y) : "w"(x));
   return y;
 }
+#endif // LIBC_TARGET_CPU_HAS_FPU_DOUBLE
 
 } // namespace fputil
 } // namespace LIBC_NAMESPACE_DECL
 
-#endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_AARCH64_SQRT_H
+#endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_ARM_SQRT_H
diff --git a/libc/src/__support/FPUtil/riscv/sqrt.h b/libc/src/__support/FPUtil/riscv/sqrt.h
index 0363822..694451d 100644
--- a/libc/src/__support/FPUtil/riscv/sqrt.h
+++ b/libc/src/__support/FPUtil/riscv/sqrt.h
@@ -12,31 +12,30 @@
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 #if !defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
 #error "Invalid include"
 #endif
 
-#include "src/__support/FPUtil/generic/sqrt.h"
-
 namespace LIBC_NAMESPACE_DECL {
 namespace fputil {
 
-#ifdef __riscv_flen
+#ifdef LIBC_TARGET_CPU_HAS_FPU_FLOAT
 template <> LIBC_INLINE float sqrt<float>(float x) {
   float result;
-  __asm__ __volatile__("fsqrt.s %0, %1\n\t" : "=f"(result) : "f"(x));
+  asm("fsqrt.s %0, %1\n\t" : "=f"(result) : "f"(x));
   return result;
 }
+#endif // LIBC_TARGET_CPU_HAS_FPU_FLOAT
 
-#if __riscv_flen >= 64
+#if LIBC_TARGET_CPU_HAS_FPU_DOUBLE
 template <> LIBC_INLINE double sqrt<double>(double x) {
   double result;
-  __asm__ __volatile__("fsqrt.d %0, %1\n\t" : "=f"(result) : "f"(x));
+  asm("fsqrt.d %0, %1\n\t" : "=f"(result) : "f"(x));
   return result;
 }
-#endif // __riscv_flen >= 64
-#endif // __riscv_flen
+#endif // LIBC_TARGET_CPU_HAS_FPU_FLOAT
 
 } // namespace fputil
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/FPUtil/sqrt.h b/libc/src/__support/FPUtil/sqrt.h
index eb86ddf..9b151c4 100644
--- a/libc/src/__support/FPUtil/sqrt.h
+++ b/libc/src/__support/FPUtil/sqrt.h
@@ -12,14 +12,44 @@
 #include "src/__support/macros/properties/architectures.h"
 #include "src/__support/macros/properties/cpu_features.h"
 
-#if defined(LIBC_TARGET_ARCH_IS_X86_64) && defined(LIBC_TARGET_CPU_HAS_SSE2)
+#include "src/__support/FPUtil/generic/sqrt.h"
+
+// Generic instruction specializations with __builtin_elementwise_sqrt.
+#if defined(LIBC_TARGET_CPU_HAS_FPU_FLOAT) ||                                  \
+    defined(LIBC_TARGET_CPU_HAS_FPU_DOUBLE)
+
+#if __has_builtin(__builtin_elementwise_sqrt)
+
+namespace LIBC_NAMESPACE_DECL {
+namespace fputil {
+
+#ifdef LIBC_TARGET_CPU_HAS_FPU_FLOAT
+template <> LIBC_INLINE float sqrt<float>(float x) {
+  return __builtin_elementwise_sqrt(x);
+}
+#endif // LIBC_TARGET_CPU_HAS_FPU_FLOAT
+
+#ifdef LIBC_TARGET_CPU_HAS_FPU_DOUBLE
+template <> LIBC_INLINE double sqrt<double>(double x) {
+  return __builtin_elementwise_sqrt(x);
+}
+#endif // LIBC_TARGET_CPU_HAS_FPU_DOUBLE
+
+} // namespace fputil
+} // namespace LIBC_NAMESPACE_DECL
+
+#else // __builtin_elementwise_sqrt
+// Use inline assembly when __builtin_elementwise_sqrt is not available.
+#if defined(LIBC_TARGET_CPU_HAS_SSE2)
 #include "x86_64/sqrt.h"
-#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
-#include "aarch64/sqrt.h"
+#elif defined(LIBC_TARGET_ARCH_IS_ANY_ARM)
+#include "arm/sqrt.h"
 #elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
 #include "riscv/sqrt.h"
-#else
-#include "generic/sqrt.h"
+#endif // Target specific header of inline asm.
 
-#endif
+#endif // __builtin_elementwise_sqrt
+
+#endif // LIBC_TARGET_CPU_HAS_FPU_FLOAT or DOUBLE
+
 #endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_SQRT_H
diff --git a/libc/src/__support/FPUtil/x86_64/sqrt.h b/libc/src/__support/FPUtil/x86_64/sqrt.h
index e104477..eae40cc 100644
--- a/libc/src/__support/FPUtil/x86_64/sqrt.h
+++ b/libc/src/__support/FPUtil/x86_64/sqrt.h
@@ -18,8 +18,6 @@
 #error "sqrtss / sqrtsd need SSE2"
 #endif
 
-#include "src/__support/FPUtil/generic/sqrt.h"
-
 namespace LIBC_NAMESPACE_DECL {
 namespace fputil {
 
diff --git a/libc/src/__support/macros/properties/cpu_features.h b/libc/src/__support/macros/properties/cpu_features.h
index 1714775..3677e1f 100644
--- a/libc/src/__support/macros/properties/cpu_features.h
+++ b/libc/src/__support/macros/properties/cpu_features.h
@@ -20,6 +20,8 @@
 
 #if defined(__SSE2__)
 #define LIBC_TARGET_CPU_HAS_SSE2
+#define LIBC_TARGET_CPU_HAS_FPU_FLOAT
+#define LIBC_TARGET_CPU_HAS_FPU_DOUBLE
 #endif
 
 #if defined(__SSE4_2__)
@@ -42,24 +44,55 @@
 #define LIBC_TARGET_CPU_HAS_AVX512BW
 #endif
 
+#if defined(__ARM_FP)
+#if (__ARM_FP & 0x2)
+#define LIBC_TARGET_CPU_HAS_ARM_FPU_HALF
+#define LIBC_TARGET_CPU_HAS_FPU_HALF
+#endif // LIBC_TARGET_CPU_HAS_ARM_FPU_HALF
+#if (__ARM_FP & 0x4)
+#define LIBC_TARGET_CPU_HAS_ARM_FPU_FLOAT
+#define LIBC_TARGET_CPU_HAS_FPU_FLOAT
+#endif // LIBC_TARGET_CPU_HAS_ARM_FPU_FLOAT
+#if (__ARM_FP & 0x8)
+#define LIBC_TARGET_CPU_HAS_ARM_FPU_DOUBLE
+#define LIBC_TARGET_CPU_HAS_FPU_DOUBLE
+#endif // LIBC_TARGET_CPU_HAS_ARM_FPU_DOUBLE
+#endif // __ARM_FP
+
+#if defined(__riscv_flen)
+// https://github.com/riscv-non-isa/riscv-c-api-doc/blob/main/src/c-api.adoc
+#if (__riscv_flen & 0x10)
+#define LIBC_TARGET_CPU_HAS_RISCV_FPU_HALF
+#define LIBC_TARGET_CPU_HAS_FPU_HALF
+#endif // LIBC_TARGET_CPU_HAS_RISCV_FPU_HALF
+#if (__riscv_flen & 0x20)
+#define LIBC_TARGET_CPU_HAS_RISCV_FPU_FLOAT
+#define LIBC_TARGET_CPU_HAS_FPU_FLOAT
+#endif // LIBC_TARGET_CPU_HAS_RISCV_FPU_FLOAT
+#if (__riscv_flen & 0x40)
+#define LIBC_TARGET_CPU_HAS_RISCV_FPU_DOUBLE
+#define LIBC_TARGET_CPU_HAS_FPU_DOUBLE
+#endif // LIBC_TARGET_CPU_HAS_RISCV_FPU_DOUBLE
+#endif // __riscv_flen
+
+#if defined(__NVPTX__) || defined(__AMDGPU__)
+#define LIBC_TARGET_CPU_HAS_FPU_FLOAT
+#define LIBC_TARGET_CPU_HAS_FPU_DOUBLE
+#endif
+
 #if defined(__ARM_FEATURE_FMA) || (defined(__AVX2__) && defined(__FMA__)) ||   \
     defined(__NVPTX__) || defined(__AMDGPU__) || defined(__LIBC_RISCV_USE_FMA)
 #define LIBC_TARGET_CPU_HAS_FMA
 // Provide a more fine-grained control of FMA instruction for ARM targets.
-#if defined(__ARM_FP)
-#if (__ARM_FP & 0x2)
+#if defined(LIBC_TARGET_CPU_HAS_FPU_HALF)
 #define LIBC_TARGET_CPU_HAS_FMA_HALF
 #endif // LIBC_TARGET_CPU_HAS_FMA_HALF
-#if (__ARM_FP & 0x4)
+#if defined(LIBC_TARGET_CPU_HAS_FPU_FLOAT)
 #define LIBC_TARGET_CPU_HAS_FMA_FLOAT
 #endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
-#if (__ARM_FP & 0x8)
+#if defined(LIBC_TARGET_CPU_HAS_FPU_DOUBLE)
 #define LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 #endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
-#else
-#define LIBC_TARGET_CPU_HAS_FMA_FLOAT
-#define LIBC_TARGET_CPU_HAS_FMA_DOUBLE
-#endif
 #endif
 
 #if defined(LIBC_TARGET_ARCH_IS_AARCH64) ||                                    \
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index a935748..dcb4d53 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -1179,7 +1179,7 @@
         "src/__support/FPUtil/x86_64/sqrt.h",
     ],
     PLATFORM_CPU_ARM64: sqrt_common_hdrs + [
-        "src/__support/FPUtil/aarch64/sqrt.h",
+        "src/__support/FPUtil/arm/sqrt.h",
     ],
 })
 
@@ -1195,6 +1195,7 @@
         ":__support_fputil_fenv_impl",
         ":__support_fputil_fp_bits",
         ":__support_fputil_rounding_mode",
+        ":__support_macros_properties_cpu_features",
         ":__support_uint128",
     ],
 )