[libc][math] Improve the performance of sin/cos for small inputs |x| < 2^-4. (#201748)

- Use a degree-9 polynomial for fast path for sin(x), generated by Sollya,
with errors bounded by `|x| * 2^-68 + 2* ulp(x^3 / 6)`.
- Use a degree-8 polynomial for fast path for cos(x), generated by Sollya,
with errors bounded by `2^-69 + ulp(x^2/ 2 )`.

GitOrigin-RevId: 5d2cc4dc1742d7373f0c52a0a29dcc5ab97a4446
diff --git a/src/__support/math/cos.h b/src/__support/math/cos.h
index f968a99..4271324 100644
--- a/src/__support/math/cos.h
+++ b/src/__support/math/cos.h
@@ -30,6 +30,47 @@
 
 namespace math {
 
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+LIBC_INLINE double
+cos_accurate(double x, uint16_t x_e, unsigned k,
+             const range_reduction_double_internal::LargeRangeReduction
+                 &range_reduction_large) {
+  using namespace math::range_reduction_double_internal;
+  using FPBits = typename fputil::FPBits<double>;
+
+  DFloat128 u_f128, sin_u, cos_u;
+  if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT))
+    u_f128 = range_reduction_small_f128(x);
+  else
+    u_f128 = range_reduction_large.accurate();
+
+  math::sincos_eval_internal::sincos_eval(u_f128, sin_u, cos_u);
+
+  auto get_sin_k = [](unsigned kk) -> DFloat128 {
+    unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63);
+    DFloat128 ans = SIN_K_PI_OVER_128_F128[idx];
+    if (kk & 128)
+      ans.sign = Sign::NEG;
+    return ans;
+  };
+
+  // -sin(k * pi/128) = sin((k + 128) * pi/128)
+  // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128).
+  DFloat128 msin_k_f128 = get_sin_k(k + 128);
+  DFloat128 cos_k_f128 = get_sin_k(k + 64);
+
+  // cos(x) = cos((k * pi/128 + u)
+  //        = cos(u) * cos(k*pi/128) - sin(u) * sin(k*pi/128)
+  DFloat128 r = fputil::quick_add(fputil::quick_mul(cos_k_f128, cos_u),
+                                  fputil::quick_mul(msin_k_f128, sin_u));
+
+  // TODO: Add assertion if Ziv's accuracy tests fail in debug mode.
+  // https://github.com/llvm/llvm-project/issues/96452.
+
+  return static_cast<double>(r);
+}
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
 LIBC_INLINE double cos(double x) {
   using namespace range_reduction_double_internal;
   using FPBits = typename fputil::FPBits<double>;
@@ -43,8 +84,8 @@
 
   // |x| < 2^16.
   if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) {
-    // |x| < 2^-7
-    if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 7)) {
+    // |x| < 2^-4
+    if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 4)) {
       // |x| < 2^-27
       if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 27)) {
         // Signed zeros.
@@ -55,9 +96,39 @@
         return fputil::round_result_slightly_down(1.0);
       }
       // No range reduction needed.
-      k = 0;
-      y.lo = 0.0;
-      y.hi = x;
+
+      // Use degree-8 polynomial approximation:
+      //   cos(x) ~ 1 + a1 * x^2 + a2 * x^4 + a3 * x^6 + a4 * x^8
+      //          ~ 1 + x^2 * Q(x^2).
+      // > P = fpminimax(cos(x), [|0, 2, 4, 6, 8|], [|1, D...|], [0, 2^-4]);
+      // > dirtyinfnorm(cos(x) - P, [-2^-4, 2^-4]);
+      // 0x1.3cfe...p-70
+      // > P;
+      constexpr double COEFFS[] = {-0x1p-1, 0x1.5555555555262p-5,
+                                   -0x1.6c16c1508bff1p-10,
+                                   0x1.a00ffd769159ap-16};
+      double x_sq = x * x;
+      double c0 = fputil::multiply_add(x_sq, COEFFS[1], COEFFS[0]);
+      double c1 = fputil::multiply_add(x_sq, COEFFS[3], COEFFS[2]);
+      double x4 = x_sq * x_sq;
+      double r_lo = fputil::multiply_add(x4, c1, c0) * x_sq;
+
+#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+      return 1.0 + r_lo;
+#else
+      // Overall errors <= ulp(x^2/2) + 2^-69.
+      double err = fputil::multiply_add(x_sq, 0x1.0p-53, 0x1.0p-69);
+      double r_lo_u = r_lo + err;
+      double r_lo_l = r_lo - err;
+      double r_upper = 1.0 + r_lo_u;
+      double r_lower = 1.0 + r_lo_l;
+
+      if (LIBC_LIKELY(r_upper == r_lower))
+        return r_upper;
+
+      k = range_reduction_small(x, y);
+      return cos_accurate(x, x_e, k, range_reduction_large);
+#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
     } else {
       // Small range reduction.
       k = range_reduction_small(x, y);
@@ -114,8 +185,11 @@
   //             = cos(y) * cos(k*pi/128) - sin(y) * sin(k*pi/128)
   DoubleDouble cos_k_cos_y = fputil::quick_mult(cos_y, cos_k);
   DoubleDouble msin_k_sin_y = fputil::quick_mult(sin_y, msin_k);
-
-  DoubleDouble rr = fputil::exact_add<false>(cos_k_cos_y.hi, msin_k_sin_y.hi);
+  // When k != 64 mod 128,
+  //   |cos( k * pi/128 )| > pi/128 - epsilon > |y| >= |sin(y)|,
+  // and cos(y) > 1 - pi/128.  So we can use Fast2Sum for the subtraction:
+  //   cos(y) * cos(k*pi/128) - sin(y) * sin(k*pi/128).
+  DoubleDouble rr = fputil::exact_add(cos_k_cos_y.hi, msin_k_sin_y.hi);
   rr.lo += msin_k_sin_y.lo + cos_k_cos_y.lo;
 
 #ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
@@ -131,36 +205,7 @@
   if (LIBC_LIKELY(r_upper == r_lower))
     return r_upper;
 
-  DFloat128 u_f128, sin_u, cos_u;
-  if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT))
-    u_f128 = range_reduction_small_f128(x);
-  else
-    u_f128 = range_reduction_large.accurate();
-
-  math::sincos_eval_internal::sincos_eval(u_f128, sin_u, cos_u);
-
-  auto get_sin_k = [](unsigned kk) -> DFloat128 {
-    unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63);
-    DFloat128 ans = SIN_K_PI_OVER_128_F128[idx];
-    if (kk & 128)
-      ans.sign = Sign::NEG;
-    return ans;
-  };
-
-  // -sin(k * pi/128) = sin((k + 128) * pi/128)
-  // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128).
-  DFloat128 msin_k_f128 = get_sin_k(k + 128);
-  DFloat128 cos_k_f128 = get_sin_k(k + 64);
-
-  // cos(x) = cos((k * pi/128 + u)
-  //        = cos(u) * cos(k*pi/128) - sin(u) * sin(k*pi/128)
-  DFloat128 r = fputil::quick_add(fputil::quick_mul(cos_k_f128, cos_u),
-                                  fputil::quick_mul(msin_k_f128, sin_u));
-
-  // TODO: Add assertion if Ziv's accuracy tests fail in debug mode.
-  // https://github.com/llvm/llvm-project/issues/96452.
-
-  return static_cast<double>(r);
+  return cos_accurate(x, x_e, k, range_reduction_large);
 #endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
 }
 
diff --git a/src/__support/math/sin.h b/src/__support/math/sin.h
index cd0c15a..bd53ec6 100644
--- a/src/__support/math/sin.h
+++ b/src/__support/math/sin.h
@@ -29,6 +29,46 @@
 
 namespace math {
 
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+LIBC_INLINE double
+sin_accurate(double x, uint16_t x_e, unsigned k,
+             const range_reduction_double_internal::LargeRangeReduction
+                 &range_reduction_large) {
+  using namespace math::range_reduction_double_internal;
+  using FPBits = typename fputil::FPBits<double>;
+
+  DFloat128 u_f128, sin_u, cos_u;
+  if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT))
+    u_f128 = range_reduction_small_f128(x);
+  else
+    u_f128 = range_reduction_large.accurate();
+
+  math::sincos_eval_internal::sincos_eval(u_f128, sin_u, cos_u);
+
+  auto get_sin_k = [](unsigned kk) -> DFloat128 {
+    unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63);
+    DFloat128 ans = SIN_K_PI_OVER_128_F128[idx];
+    if (kk & 128)
+      ans.sign = Sign::NEG;
+    return ans;
+  };
+
+  // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128).
+  DFloat128 sin_k_f128 = get_sin_k(k);
+  DFloat128 cos_k_f128 = get_sin_k(k + 64);
+
+  // sin(x) = sin(k * pi/128 + u)
+  //        = sin(u) * cos(k*pi/128) + cos(u) * sin(k*pi/128)
+  DFloat128 r = fputil::quick_add(fputil::quick_mul(sin_k_f128, cos_u),
+                                  fputil::quick_mul(cos_k_f128, sin_u));
+
+  // TODO: Add assertion if Ziv's accuracy tests fail in debug mode.
+  // https://github.com/llvm/llvm-project/issues/96452.
+
+  return static_cast<double>(r);
+}
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
 LIBC_INLINE double sin(double x) {
   using namespace math::range_reduction_double_internal;
   using FPBits = typename fputil::FPBits<double>;
@@ -42,8 +82,8 @@
 
   // |x| < 2^16
   if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) {
-    // |x| < 2^-7
-    if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 7)) {
+    // |x| < 2^-4
+    if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 4)) {
       // |x| < 2^-26, |sin(x) - x| < ulp(x)/2.
       if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 26)) {
         // Signed zeros.
@@ -64,9 +104,40 @@
 #endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
       }
       // No range reduction needed.
-      k = 0;
-      y.lo = 0.0;
-      y.hi = x;
+
+      // Use degree-9 polynomial approximation:
+      //   sin(x) ~ x + a1 * x^3 + a2 * x^5 + a3 * x^7 + a4 * x^9
+      //          ~ x + x^3 * Q(x^2).
+      // > P = fpminimax(sin(x)/x, [|0, 2, 4, 6, 8|], [|1, D...|], [0, 2^-4]);
+      // > dirtyinfnorm((sin(x) - x*P)/sin(x), [-2^-4, 2^-4]);
+      // 0x1.3c2e...p-69
+      // > P;
+      constexpr double COEFFS[] = {-0x1.5555555555555p-3, 0x1.111111110f491p-7,
+                                   -0x1.a01a00e16af3ep-13,
+                                   0x1.71c24233f1bafp-19};
+      double x_sq = x * x;
+      double c0 = fputil::multiply_add(x_sq, COEFFS[1], COEFFS[0]);
+      double c1 = fputil::multiply_add(x_sq, COEFFS[3], COEFFS[2]);
+      double x4 = x_sq * x_sq;
+      double x3 = x * x_sq;
+      double r_lo = fputil::multiply_add(x4, c1, c0) * x3;
+
+#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+      return x + r_lo;
+#else
+      // Overall errors <= 2 * ulp(x^3/6) + |x| * 2^-68.
+      double err = fputil::multiply_add(x_sq, 0x1.0p-53, 0x1.0p-68);
+      double r_lo_u = fputil::multiply_add(x, err, r_lo);
+      double r_lo_l = fputil::multiply_add(-x, err, r_lo);
+      double r_upper = x + r_lo_u;
+      double r_lower = x + r_lo_l;
+
+      if (LIBC_LIKELY(r_upper == r_lower))
+        return r_upper;
+
+      k = range_reduction_small(x, y);
+      return sin_accurate(x, x_e, k, range_reduction_large);
+#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
     } else {
       // Small range reduction.
       k = range_reduction_small(x, y);
@@ -123,8 +194,11 @@
   //             = sin(y) * cos(k*pi/128) + cos(y) * sin(k*pi/128)
   DoubleDouble sin_k_cos_y = fputil::quick_mult(cos_y, sin_k);
   DoubleDouble cos_k_sin_y = fputil::quick_mult(sin_y, cos_k);
-
-  DoubleDouble rr = fputil::exact_add<false>(sin_k_cos_y.hi, cos_k_sin_y.hi);
+  // When k != 0 mod 128,
+  //   |sin( k * pi/128 )| > pi/128 - epsilon > |y| >= |sin(y)|,
+  // and cos(y) > 1 - pi/128.  So we can use Fast2Sum for the addition:
+  //   sin(y) * cos(k*pi/128) + cos(y) * sin(k*pi/128).
+  DoubleDouble rr = fputil::exact_add(sin_k_cos_y.hi, cos_k_sin_y.hi);
   rr.lo += sin_k_cos_y.lo + cos_k_sin_y.lo;
 
 #ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
@@ -142,35 +216,7 @@
   if (LIBC_LIKELY(r_upper == r_lower))
     return r_upper;
 
-  DFloat128 u_f128, sin_u, cos_u;
-  if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT))
-    u_f128 = range_reduction_small_f128(x);
-  else
-    u_f128 = range_reduction_large.accurate();
-
-  math::sincos_eval_internal::sincos_eval(u_f128, sin_u, cos_u);
-
-  auto get_sin_k = [](unsigned kk) -> DFloat128 {
-    unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63);
-    DFloat128 ans = SIN_K_PI_OVER_128_F128[idx];
-    if (kk & 128)
-      ans.sign = Sign::NEG;
-    return ans;
-  };
-
-  // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128).
-  DFloat128 sin_k_f128 = get_sin_k(k);
-  DFloat128 cos_k_f128 = get_sin_k(k + 64);
-
-  // sin(x) = sin(k * pi/128 + u)
-  //        = sin(u) * cos(k*pi/128) + cos(u) * sin(k*pi/128)
-  DFloat128 r = fputil::quick_add(fputil::quick_mul(sin_k_f128, cos_u),
-                                  fputil::quick_mul(cos_k_f128, sin_u));
-
-  // TODO: Add assertion if Ziv's accuracy tests fail in debug mode.
-  // https://github.com/llvm/llvm-project/issues/96452.
-
-  return static_cast<double>(r);
+  return sin_accurate(x, x_e, k, range_reduction_large);
 #endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
 }