[libc][math] Improve the performance of sin/cos for small inputs |x| < 2^-4. (#201748) - Use a degree-9 polynomial for fast path for sin(x), generated by Sollya, with errors bounded by `|x| * 2^-68 + 2* ulp(x^3 / 6)`. - Use a degree-8 polynomial for fast path for cos(x), generated by Sollya, with errors bounded by `2^-69 + ulp(x^2/ 2 )`. GitOrigin-RevId: 5d2cc4dc1742d7373f0c52a0a29dcc5ab97a4446
diff --git a/src/__support/math/cos.h b/src/__support/math/cos.h index f968a99..4271324 100644 --- a/src/__support/math/cos.h +++ b/src/__support/math/cos.h
@@ -30,6 +30,47 @@ namespace math { +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +LIBC_INLINE double +cos_accurate(double x, uint16_t x_e, unsigned k, + const range_reduction_double_internal::LargeRangeReduction + &range_reduction_large) { + using namespace math::range_reduction_double_internal; + using FPBits = typename fputil::FPBits<double>; + + DFloat128 u_f128, sin_u, cos_u; + if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) + u_f128 = range_reduction_small_f128(x); + else + u_f128 = range_reduction_large.accurate(); + + math::sincos_eval_internal::sincos_eval(u_f128, sin_u, cos_u); + + auto get_sin_k = [](unsigned kk) -> DFloat128 { + unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); + DFloat128 ans = SIN_K_PI_OVER_128_F128[idx]; + if (kk & 128) + ans.sign = Sign::NEG; + return ans; + }; + + // -sin(k * pi/128) = sin((k + 128) * pi/128) + // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128). + DFloat128 msin_k_f128 = get_sin_k(k + 128); + DFloat128 cos_k_f128 = get_sin_k(k + 64); + + // cos(x) = cos((k * pi/128 + u) + // = cos(u) * cos(k*pi/128) - sin(u) * sin(k*pi/128) + DFloat128 r = fputil::quick_add(fputil::quick_mul(cos_k_f128, cos_u), + fputil::quick_mul(msin_k_f128, sin_u)); + + // TODO: Add assertion if Ziv's accuracy tests fail in debug mode. + // https://github.com/llvm/llvm-project/issues/96452. + + return static_cast<double>(r); +} +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + LIBC_INLINE double cos(double x) { using namespace range_reduction_double_internal; using FPBits = typename fputil::FPBits<double>; @@ -43,8 +84,8 @@ // |x| < 2^16. if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) { - // |x| < 2^-7 - if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 7)) { + // |x| < 2^-4 + if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 4)) { // |x| < 2^-27 if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 27)) { // Signed zeros. @@ -55,9 +96,39 @@ return fputil::round_result_slightly_down(1.0); } // No range reduction needed. - k = 0; - y.lo = 0.0; - y.hi = x; + + // Use degree-8 polynomial approximation: + // cos(x) ~ 1 + a1 * x^2 + a2 * x^4 + a3 * x^6 + a4 * x^8 + // ~ 1 + x^2 * Q(x^2). + // > P = fpminimax(cos(x), [|0, 2, 4, 6, 8|], [|1, D...|], [0, 2^-4]); + // > dirtyinfnorm(cos(x) - P, [-2^-4, 2^-4]); + // 0x1.3cfe...p-70 + // > P; + constexpr double COEFFS[] = {-0x1p-1, 0x1.5555555555262p-5, + -0x1.6c16c1508bff1p-10, + 0x1.a00ffd769159ap-16}; + double x_sq = x * x; + double c0 = fputil::multiply_add(x_sq, COEFFS[1], COEFFS[0]); + double c1 = fputil::multiply_add(x_sq, COEFFS[3], COEFFS[2]); + double x4 = x_sq * x_sq; + double r_lo = fputil::multiply_add(x4, c1, c0) * x_sq; + +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + return 1.0 + r_lo; +#else + // Overall errors <= ulp(x^2/2) + 2^-69. + double err = fputil::multiply_add(x_sq, 0x1.0p-53, 0x1.0p-69); + double r_lo_u = r_lo + err; + double r_lo_l = r_lo - err; + double r_upper = 1.0 + r_lo_u; + double r_lower = 1.0 + r_lo_l; + + if (LIBC_LIKELY(r_upper == r_lower)) + return r_upper; + + k = range_reduction_small(x, y); + return cos_accurate(x, x_e, k, range_reduction_large); +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS } else { // Small range reduction. k = range_reduction_small(x, y); @@ -114,8 +185,11 @@ // = cos(y) * cos(k*pi/128) - sin(y) * sin(k*pi/128) DoubleDouble cos_k_cos_y = fputil::quick_mult(cos_y, cos_k); DoubleDouble msin_k_sin_y = fputil::quick_mult(sin_y, msin_k); - - DoubleDouble rr = fputil::exact_add<false>(cos_k_cos_y.hi, msin_k_sin_y.hi); + // When k != 64 mod 128, + // |cos( k * pi/128 )| > pi/128 - epsilon > |y| >= |sin(y)|, + // and cos(y) > 1 - pi/128. So we can use Fast2Sum for the subtraction: + // cos(y) * cos(k*pi/128) - sin(y) * sin(k*pi/128). + DoubleDouble rr = fputil::exact_add(cos_k_cos_y.hi, msin_k_sin_y.hi); rr.lo += msin_k_sin_y.lo + cos_k_cos_y.lo; #ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS @@ -131,36 +205,7 @@ if (LIBC_LIKELY(r_upper == r_lower)) return r_upper; - DFloat128 u_f128, sin_u, cos_u; - if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) - u_f128 = range_reduction_small_f128(x); - else - u_f128 = range_reduction_large.accurate(); - - math::sincos_eval_internal::sincos_eval(u_f128, sin_u, cos_u); - - auto get_sin_k = [](unsigned kk) -> DFloat128 { - unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); - DFloat128 ans = SIN_K_PI_OVER_128_F128[idx]; - if (kk & 128) - ans.sign = Sign::NEG; - return ans; - }; - - // -sin(k * pi/128) = sin((k + 128) * pi/128) - // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128). - DFloat128 msin_k_f128 = get_sin_k(k + 128); - DFloat128 cos_k_f128 = get_sin_k(k + 64); - - // cos(x) = cos((k * pi/128 + u) - // = cos(u) * cos(k*pi/128) - sin(u) * sin(k*pi/128) - DFloat128 r = fputil::quick_add(fputil::quick_mul(cos_k_f128, cos_u), - fputil::quick_mul(msin_k_f128, sin_u)); - - // TODO: Add assertion if Ziv's accuracy tests fail in debug mode. - // https://github.com/llvm/llvm-project/issues/96452. - - return static_cast<double>(r); + return cos_accurate(x, x_e, k, range_reduction_large); #endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS }
diff --git a/src/__support/math/sin.h b/src/__support/math/sin.h index cd0c15a..bd53ec6 100644 --- a/src/__support/math/sin.h +++ b/src/__support/math/sin.h
@@ -29,6 +29,46 @@ namespace math { +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +LIBC_INLINE double +sin_accurate(double x, uint16_t x_e, unsigned k, + const range_reduction_double_internal::LargeRangeReduction + &range_reduction_large) { + using namespace math::range_reduction_double_internal; + using FPBits = typename fputil::FPBits<double>; + + DFloat128 u_f128, sin_u, cos_u; + if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) + u_f128 = range_reduction_small_f128(x); + else + u_f128 = range_reduction_large.accurate(); + + math::sincos_eval_internal::sincos_eval(u_f128, sin_u, cos_u); + + auto get_sin_k = [](unsigned kk) -> DFloat128 { + unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); + DFloat128 ans = SIN_K_PI_OVER_128_F128[idx]; + if (kk & 128) + ans.sign = Sign::NEG; + return ans; + }; + + // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128). + DFloat128 sin_k_f128 = get_sin_k(k); + DFloat128 cos_k_f128 = get_sin_k(k + 64); + + // sin(x) = sin(k * pi/128 + u) + // = sin(u) * cos(k*pi/128) + cos(u) * sin(k*pi/128) + DFloat128 r = fputil::quick_add(fputil::quick_mul(sin_k_f128, cos_u), + fputil::quick_mul(cos_k_f128, sin_u)); + + // TODO: Add assertion if Ziv's accuracy tests fail in debug mode. + // https://github.com/llvm/llvm-project/issues/96452. + + return static_cast<double>(r); +} +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + LIBC_INLINE double sin(double x) { using namespace math::range_reduction_double_internal; using FPBits = typename fputil::FPBits<double>; @@ -42,8 +82,8 @@ // |x| < 2^16 if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) { - // |x| < 2^-7 - if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 7)) { + // |x| < 2^-4 + if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 4)) { // |x| < 2^-26, |sin(x) - x| < ulp(x)/2. if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 26)) { // Signed zeros. @@ -64,9 +104,40 @@ #endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE } // No range reduction needed. - k = 0; - y.lo = 0.0; - y.hi = x; + + // Use degree-9 polynomial approximation: + // sin(x) ~ x + a1 * x^3 + a2 * x^5 + a3 * x^7 + a4 * x^9 + // ~ x + x^3 * Q(x^2). + // > P = fpminimax(sin(x)/x, [|0, 2, 4, 6, 8|], [|1, D...|], [0, 2^-4]); + // > dirtyinfnorm((sin(x) - x*P)/sin(x), [-2^-4, 2^-4]); + // 0x1.3c2e...p-69 + // > P; + constexpr double COEFFS[] = {-0x1.5555555555555p-3, 0x1.111111110f491p-7, + -0x1.a01a00e16af3ep-13, + 0x1.71c24233f1bafp-19}; + double x_sq = x * x; + double c0 = fputil::multiply_add(x_sq, COEFFS[1], COEFFS[0]); + double c1 = fputil::multiply_add(x_sq, COEFFS[3], COEFFS[2]); + double x4 = x_sq * x_sq; + double x3 = x * x_sq; + double r_lo = fputil::multiply_add(x4, c1, c0) * x3; + +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + return x + r_lo; +#else + // Overall errors <= 2 * ulp(x^3/6) + |x| * 2^-68. + double err = fputil::multiply_add(x_sq, 0x1.0p-53, 0x1.0p-68); + double r_lo_u = fputil::multiply_add(x, err, r_lo); + double r_lo_l = fputil::multiply_add(-x, err, r_lo); + double r_upper = x + r_lo_u; + double r_lower = x + r_lo_l; + + if (LIBC_LIKELY(r_upper == r_lower)) + return r_upper; + + k = range_reduction_small(x, y); + return sin_accurate(x, x_e, k, range_reduction_large); +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS } else { // Small range reduction. k = range_reduction_small(x, y); @@ -123,8 +194,11 @@ // = sin(y) * cos(k*pi/128) + cos(y) * sin(k*pi/128) DoubleDouble sin_k_cos_y = fputil::quick_mult(cos_y, sin_k); DoubleDouble cos_k_sin_y = fputil::quick_mult(sin_y, cos_k); - - DoubleDouble rr = fputil::exact_add<false>(sin_k_cos_y.hi, cos_k_sin_y.hi); + // When k != 0 mod 128, + // |sin( k * pi/128 )| > pi/128 - epsilon > |y| >= |sin(y)|, + // and cos(y) > 1 - pi/128. So we can use Fast2Sum for the addition: + // sin(y) * cos(k*pi/128) + cos(y) * sin(k*pi/128). + DoubleDouble rr = fputil::exact_add(sin_k_cos_y.hi, cos_k_sin_y.hi); rr.lo += sin_k_cos_y.lo + cos_k_sin_y.lo; #ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS @@ -142,35 +216,7 @@ if (LIBC_LIKELY(r_upper == r_lower)) return r_upper; - DFloat128 u_f128, sin_u, cos_u; - if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) - u_f128 = range_reduction_small_f128(x); - else - u_f128 = range_reduction_large.accurate(); - - math::sincos_eval_internal::sincos_eval(u_f128, sin_u, cos_u); - - auto get_sin_k = [](unsigned kk) -> DFloat128 { - unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); - DFloat128 ans = SIN_K_PI_OVER_128_F128[idx]; - if (kk & 128) - ans.sign = Sign::NEG; - return ans; - }; - - // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128). - DFloat128 sin_k_f128 = get_sin_k(k); - DFloat128 cos_k_f128 = get_sin_k(k + 64); - - // sin(x) = sin(k * pi/128 + u) - // = sin(u) * cos(k*pi/128) + cos(u) * sin(k*pi/128) - DFloat128 r = fputil::quick_add(fputil::quick_mul(sin_k_f128, cos_u), - fputil::quick_mul(cos_k_f128, sin_u)); - - // TODO: Add assertion if Ziv's accuracy tests fail in debug mode. - // https://github.com/llvm/llvm-project/issues/96452. - - return static_cast<double>(r); + return sin_accurate(x, x_e, k, range_reduction_large); #endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS }