| //===-- Implementation of cbrt function -----------------------------------===// | 
 | // | 
 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
 | // See https://llvm.org/LICENSE.txt for license information. | 
 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
 | // | 
 | //===----------------------------------------------------------------------===// | 
 |  | 
 | #include "src/math/cbrt.h" | 
 | #include "hdr/fenv_macros.h" | 
 | #include "src/__support/FPUtil/FEnvImpl.h" | 
 | #include "src/__support/FPUtil/FPBits.h" | 
 | #include "src/__support/FPUtil/PolyEval.h" | 
 | #include "src/__support/FPUtil/double_double.h" | 
 | #include "src/__support/FPUtil/dyadic_float.h" | 
 | #include "src/__support/FPUtil/multiply_add.h" | 
 | #include "src/__support/common.h" | 
 | #include "src/__support/integer_literals.h" | 
 | #include "src/__support/macros/config.h" | 
 | #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY | 
 |  | 
 | #if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0) | 
 | #define LIBC_MATH_CBRT_SKIP_ACCURATE_PASS | 
 | #endif | 
 |  | 
 | namespace LIBC_NAMESPACE_DECL { | 
 |  | 
 | using DoubleDouble = fputil::DoubleDouble; | 
 | using Float128 = fputil::DyadicFloat<128>; | 
 |  | 
 | namespace { | 
 |  | 
 | // Initial approximation of x^(-2/3) for 1 <= x < 2. | 
 | // Polynomial generated by Sollya with: | 
 | // > P = fpminimax(x^(-2/3), 7, [|D...|], [1, 2]); | 
 | // > dirtyinfnorm(P/x^(-2/3) - 1, [1, 2]); | 
 | // 0x1.28...p-21 | 
 | double intial_approximation(double x) { | 
 |   constexpr double COEFFS[8] = { | 
 |       0x1.bc52aedead5c6p1,  -0x1.b52bfebf110b3p2,  0x1.1d8d71d53d126p3, | 
 |       -0x1.de2db9e81cf87p2, 0x1.0154ca06153bdp2,   -0x1.5973c66ee6da7p0, | 
 |       0x1.07bf6ac832552p-2, -0x1.5e53d9ce41cb8p-6, | 
 |   }; | 
 |  | 
 |   double x_sq = x * x; | 
 |  | 
 |   double c0 = fputil::multiply_add(x, COEFFS[1], COEFFS[0]); | 
 |   double c1 = fputil::multiply_add(x, COEFFS[3], COEFFS[2]); | 
 |   double c2 = fputil::multiply_add(x, COEFFS[5], COEFFS[4]); | 
 |   double c3 = fputil::multiply_add(x, COEFFS[7], COEFFS[6]); | 
 |  | 
 |   double x_4 = x_sq * x_sq; | 
 |   double d0 = fputil::multiply_add(x_sq, c1, c0); | 
 |   double d1 = fputil::multiply_add(x_sq, c3, c2); | 
 |  | 
 |   return fputil::multiply_add(x_4, d1, d0); | 
 | } | 
 |  | 
 | // Get the error term for Newton iteration: | 
 | //   h(x) = x^3 * a^2 - 1, | 
 | #ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE | 
 | double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) { | 
 |   return fputil::multiply_add(x_3.hi, a_sq.hi, -1.0) + | 
 |          fputil::multiply_add(x_3.lo, a_sq.hi, x_3.hi * a_sq.lo); | 
 | } | 
 | #else | 
 | double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) { | 
 |   DoubleDouble x_3_a_sq = fputil::quick_mult(a_sq, x_3); | 
 |   return (x_3_a_sq.hi - 1.0) + x_3_a_sq.lo; | 
 | } | 
 | #endif | 
 |  | 
 | } // anonymous namespace | 
 |  | 
 | // Correctly rounded cbrt algorithm: | 
 | // | 
 | // === Step 1 - Range reduction === | 
 | // For x = (-1)^s * 2^e * (1.m), we get 2 reduced arguments x_r and a as: | 
 | //   x_r = 1.m | 
 | //   a   = (-1)^s * 2^(e % 3) * (1.m) | 
 | // Then cbrt(x) = x^(1/3) can be computed as: | 
 | //   x^(1/3) = 2^(e / 3) * a^(1/3). | 
 | // | 
 | // In order to avoid division, we compute a^(-2/3) using Newton method and then | 
 | // multiply the results by a: | 
 | //   a^(1/3) = a * a^(-2/3). | 
 | // | 
 | // === Step 2 - First approximation to a^(-2/3) === | 
 | // First, we use a degree-7 minimax polynomial generated by Sollya to | 
 | // approximate x_r^(-2/3) for 1 <= x_r < 2. | 
 | //   p = P(x_r) ~ x_r^(-2/3), | 
 | // with relative errors bounded by: | 
 | //   | p / x_r^(-2/3) - 1 | < 1.16 * 2^-21. | 
 | // | 
 | // Then we multiply with 2^(e % 3) from a small lookup table to get: | 
 | //   x_0 = 2^(-2*(e % 3)/3) * p | 
 | //       ~ 2^(-2*(e % 3)/3) * x_r^(-2/3) | 
 | //       = a^(-2/3) | 
 | // With relative errors: | 
 | //   | x_0 / a^(-2/3) - 1 | < 1.16 * 2^-21. | 
 | // This step is done in double precision. | 
 | // | 
 | // === Step 3 - First Newton iteration === | 
 | // We follow the method described in: | 
 | //   Sibidanov, A. and Zimmermann, P., "Correctly rounded cubic root evaluation | 
 | //   in double precision", https://core-math.gitlabpages.inria.fr/cbrt64.pdf | 
 | // to derive multiplicative Newton iterations as below: | 
 | // Let x_n be the nth approximation to a^(-2/3).  Define the n^th error as: | 
 | //   h_n = x_n^3 * a^2 - 1 | 
 | // Then: | 
 | //   a^(-2/3) = x_n / (1 + h_n)^(1/3) | 
 | //            = x_n * (1 - (1/3) * h_n + (2/9) * h_n^2 - (14/81) * h_n^3 + ...) | 
 | // using the Taylor series expansion of (1 + h_n)^(-1/3). | 
 | // | 
 | // Apply to x_0 above: | 
 | //   h_0 = x_0^3 * a^2 - 1 | 
 | //       = a^2 * (x_0 - a^(-2/3)) * (x_0^2 + x_0 * a^(-2/3) + a^(-4/3)), | 
 | // it's bounded by: | 
 | //   |h_0| < 4 * 3 * 1.16 * 2^-21 * 4 < 2^-17. | 
 | // So in the first iteration step, we use: | 
 | //   x_1 = x_0 * (1 - (1/3) * h_n + (2/9) * h_n^2 - (14/81) * h_n^3) | 
 | // Its relative error is bounded by: | 
 | //   | x_1 / a^(-2/3) - 1 | < 35/242 * |h_0|^4 < 2^-70. | 
 | // Then we perform Ziv's rounding test and check if the answer is exact. | 
 | // This step is done in double-double precision. | 
 | // | 
 | // === Step 4 - Second Newton iteration === | 
 | // If the Ziv's rounding test from the previous step fails, we define the error | 
 | // term: | 
 | //   h_1 = x_1^3 * a^2 - 1, | 
 | // And perform another iteration: | 
 | //   x_2 = x_1 * (1 - h_1 / 3) | 
 | // with the relative errors exceed the precision of double-double. | 
 | // We then check the Ziv's accuracy test with relative errors < 2^-102 to | 
 | // compensate for rounding errors. | 
 | // | 
 | // === Step 5 - Final iteration === | 
 | // If the Ziv's accuracy test from the previous step fails, we perform another | 
 | // iteration in 128-bit precision and check for exact outputs. | 
 | // | 
 | // TODO: It is possible to replace this costly computation step with special | 
 | // exceptional handling, similar to what was done in the CORE-MATH project: | 
 | // https://gitlab.inria.fr/core-math/core-math/-/blob/master/src/binary64/cbrt/cbrt.c | 
 |  | 
 | LLVM_LIBC_FUNCTION(double, cbrt, (double x)) { | 
 |   using FPBits = fputil::FPBits<double>; | 
 |  | 
 |   uint64_t x_abs = FPBits(x).abs().uintval(); | 
 |  | 
 |   unsigned exp_bias_correction = 682; // 1023 * 2/3 | 
 |  | 
 |   if (LIBC_UNLIKELY(x_abs < FPBits::min_normal().uintval() || | 
 |                     x_abs >= FPBits::inf().uintval())) { | 
 |     if (x == 0.0 || x_abs >= FPBits::inf().uintval()) | 
 |       // x is 0, Inf, or NaN. | 
 |       // Make sure it works for FTZ/DAZ modes. | 
 |       return static_cast<double>(x + x); | 
 |  | 
 |     // x is non-zero denormal number. | 
 |     // Normalize x. | 
 |     x *= 0x1.0p60; | 
 |     exp_bias_correction -= 20; | 
 |   } | 
 |  | 
 |   FPBits x_bits(x); | 
 |  | 
 |   // When using biased exponent of x in double precision, | 
 |   //   x_e = real_exponent_of_x + 1023 | 
 |   // Then: | 
 |   //   x_e / 3 = real_exponent_of_x / 3 + 1023/3 | 
 |   //           = real_exponent_of_x / 3 + 341 | 
 |   // So to make it the correct biased exponent of x^(1/3), we add | 
 |   //   1023 - 341 = 682 | 
 |   // to the quotient x_e / 3. | 
 |   unsigned x_e = static_cast<unsigned>(x_bits.get_biased_exponent()); | 
 |   unsigned out_e = (x_e / 3 + exp_bias_correction); | 
 |   unsigned shift_e = x_e % 3; | 
 |  | 
 |   // Set x_r = 1.mantissa | 
 |   double x_r = | 
 |       FPBits(x_bits.get_mantissa() | | 
 |              (static_cast<uint64_t>(FPBits::EXP_BIAS) << FPBits::FRACTION_LEN)) | 
 |           .get_val(); | 
 |  | 
 |   // Set a = (-1)^x_sign * 2^(x_e % 3) * (1.mantissa) | 
 |   uint64_t a_bits = x_bits.uintval() & 0x800F'FFFF'FFFF'FFFF; | 
 |   a_bits |= | 
 |       (static_cast<uint64_t>(shift_e + static_cast<unsigned>(FPBits::EXP_BIAS)) | 
 |        << FPBits::FRACTION_LEN); | 
 |   double a = FPBits(a_bits).get_val(); | 
 |  | 
 |   // Initial approximation of x_r^(-2/3). | 
 |   double p = intial_approximation(x_r); | 
 |  | 
 |   // Look up for 2^(-2*n/3) used for first approximation step. | 
 |   constexpr double EXP2_M2_OVER_3[3] = {1.0, 0x1.428a2f98d728bp-1, | 
 |                                         0x1.965fea53d6e3dp-2}; | 
 |  | 
 |   // x0 is an initial approximation of a^(-2/3) for 1 <= |a| < 8. | 
 |   // Relative error: < 1.16 * 2^(-21). | 
 |   double x0 = static_cast<double>(EXP2_M2_OVER_3[shift_e] * p); | 
 |  | 
 |   // First iteration in double precision. | 
 |   DoubleDouble a_sq = fputil::exact_mult(a, a); | 
 |  | 
 |   // h0 = x0^3 * a^2 - 1 | 
 |   DoubleDouble x0_sq = fputil::exact_mult(x0, x0); | 
 |   DoubleDouble x0_3 = fputil::quick_mult(x0, x0_sq); | 
 |  | 
 |   double h0 = get_error(x0_3, a_sq); | 
 |  | 
 | #ifdef LIBC_MATH_CBRT_SKIP_ACCURATE_PASS | 
 |   constexpr double REL_ERROR = 0; | 
 | #else | 
 |   constexpr double REL_ERROR = 0x1.0p-51; | 
 | #endif // LIBC_MATH_CBRT_SKIP_ACCURATE_PASS | 
 |  | 
 |   // Taylor polynomial of (1 + h)^(-1/3): | 
 |   //   (1 + h)^(-1/3) = 1 - h/3 + 2 h^2 / 9 - 14 h^3 / 81 + ... | 
 |   constexpr double ERR_COEFFS[3] = { | 
 |       -0x1.5555555555555p-2 - REL_ERROR, // -1/3 - relative_error | 
 |       0x1.c71c71c71c71cp-3,              // 2/9 | 
 |       -0x1.61f9add3c0ca4p-3,             // -14/81 | 
 |   }; | 
 |   // e0 = -14 * h^2 / 81 + 2 * h / 9 - 1/3 - relative_error. | 
 |   double e0 = fputil::polyeval(h0, ERR_COEFFS[0], ERR_COEFFS[1], ERR_COEFFS[2]); | 
 |   double x0_h0 = x0 * h0; | 
 |  | 
 |   // x1 = x0 (1 - h0/3 + 2 h0^2 / 9 - 14 h0^3 / 81) | 
 |   // x1 approximate a^(-2/3) with relative errors bounded by: | 
 |   //   | x1 / a^(-2/3) - 1 | < (34/243) h0^4 < h0 * REL_ERROR | 
 |   DoubleDouble x1_dd{x0_h0 * e0, x0}; | 
 |  | 
 |   // r1 = x1 * a ~ a^(-2/3) * a = a^(1/3). | 
 |   DoubleDouble r1 = fputil::quick_mult(a, x1_dd); | 
 |  | 
 |   // Lambda function to update the exponent of the result. | 
 |   auto update_exponent = [=](double r) -> double { | 
 |     uint64_t r_m = FPBits(r).uintval() - 0x3FF0'0000'0000'0000; | 
 |     // Adjust exponent and sign. | 
 |     uint64_t r_bits = | 
 |         r_m + (static_cast<uint64_t>(out_e) << FPBits::FRACTION_LEN); | 
 |     return FPBits(r_bits).get_val(); | 
 |   }; | 
 |  | 
 | #ifdef LIBC_MATH_CBRT_SKIP_ACCURATE_PASS | 
 |   // TODO: We probably don't need to use double-double if accurate tests and | 
 |   // passes are skipped. | 
 |   return update_exponent(r1.hi + r1.lo); | 
 | #else | 
 |   // Accurate checks and passes. | 
 |   double r1_lower = r1.hi + r1.lo; | 
 |   double r1_upper = | 
 |       r1.hi + fputil::multiply_add(x0_h0, 2.0 * REL_ERROR * a, r1.lo); | 
 |  | 
 |   // Ziv's accuracy test. | 
 |   if (LIBC_LIKELY(r1_upper == r1_lower)) { | 
 |     // Test for exact outputs. | 
 |     // Check if lower (52 - 17 = 35) bits are 0's. | 
 |     if (LIBC_UNLIKELY((FPBits(r1_lower).uintval() & 0x0000'0007'FFFF'FFFF) == | 
 |                       0)) { | 
 |       double r1_err = (r1_lower - r1.hi) - r1.lo; | 
 |       if (FPBits(r1_err).abs().get_val() < 0x1.0p69) | 
 |         fputil::clear_except_if_required(FE_INEXACT); | 
 |     } | 
 |  | 
 |     return update_exponent(r1_lower); | 
 |   } | 
 |  | 
 |   // Accuracy test failed, perform another Newton iteration. | 
 |   double x1 = x1_dd.hi + (e0 + REL_ERROR) * x0_h0; | 
 |  | 
 |   // Second iteration in double-double precision. | 
 |   // h1 = x1^3 * a^2 - 1. | 
 |   DoubleDouble x1_sq = fputil::exact_mult(x1, x1); | 
 |   DoubleDouble x1_3 = fputil::quick_mult(x1, x1_sq); | 
 |   double h1 = get_error(x1_3, a_sq); | 
 |  | 
 |   // e1 = -x1*h1/3. | 
 |   double e1 = h1 * (x1 * -0x1.5555555555555p-2); | 
 |   // x2 = x1*(1 - h1/3) = x1 + e1 ~ a^(-2/3) with relative errors < 2^-101. | 
 |   DoubleDouble x2 = fputil::exact_add(x1, e1); | 
 |   // r2 = a * x2 ~ a * a^(-2/3) = a^(1/3) with relative errors < 2^-100. | 
 |   DoubleDouble r2 = fputil::quick_mult(a, x2); | 
 |  | 
 |   double r2_upper = r2.hi + fputil::multiply_add(a, 0x1.0p-102, r2.lo); | 
 |   double r2_lower = r2.hi + fputil::multiply_add(a, -0x1.0p-102, r2.lo); | 
 |  | 
 |   // Ziv's accuracy test. | 
 |   if (LIBC_LIKELY(r2_upper == r2_lower)) | 
 |     return update_exponent(r2_upper); | 
 |  | 
 |   // TODO: Investigate removing float128 and just list exceptional cases. | 
 |   // Apply another Newton iteration with ~126-bit accuracy. | 
 |   Float128 x2_f128 = fputil::quick_add(Float128(x2.hi), Float128(x2.lo)); | 
 |   // x2^3 | 
 |   Float128 x2_3 = | 
 |       fputil::quick_mul(fputil::quick_mul(x2_f128, x2_f128), x2_f128); | 
 |   // a^2 | 
 |   Float128 a_sq_f128 = fputil::quick_mul(Float128(a), Float128(a)); | 
 |   // x2^3 * a^2 | 
 |   Float128 x2_3_a_sq = fputil::quick_mul(x2_3, a_sq_f128); | 
 |   // h2 = x2^3 * a^2 - 1 | 
 |   Float128 h2_f128 = fputil::quick_add(x2_3_a_sq, Float128(-1.0)); | 
 |   double h2 = static_cast<double>(h2_f128); | 
 |   // t2 = 1 - h2 / 3 | 
 |   Float128 t2 = | 
 |       fputil::quick_add(Float128(1.0), Float128(h2 * (-0x1.5555555555555p-2))); | 
 |   // x3 = x2 * (1 - h2 / 3) ~ a^(-2/3) | 
 |   Float128 x3 = fputil::quick_mul(x2_f128, t2); | 
 |   // r3 = a * x3 ~ a * a^(-2/3) = a^(1/3) | 
 |   Float128 r3 = fputil::quick_mul(Float128(a), x3); | 
 |  | 
 |   // Check for exact cases: | 
 |   Float128::MantissaType rounding_bits = | 
 |       r3.mantissa & 0x0000'0000'0000'03FF'FFFF'FFFF'FFFF'FFFF_u128; | 
 |  | 
 |   double result = static_cast<double>(r3); | 
 |   if ((rounding_bits < 0x0000'0000'0000'0000'0000'0000'0000'000F_u128) || | 
 |       (rounding_bits >= 0x0000'0000'0000'03FF'FFFF'FFFF'FFFF'FFF0_u128)) { | 
 |     // Output is exact. | 
 |     r3.mantissa &= 0xFFFF'FFFF'FFFF'FFFF'FFFF'FFFF'FFFF'FFF0_u128; | 
 |  | 
 |     if (rounding_bits >= 0x0000'0000'0000'03FF'FFFF'FFFF'FFFF'FFF0_u128) { | 
 |       Float128 tmp{r3.sign, r3.exponent - 123, | 
 |                    0x8000'0000'0000'0000'0000'0000'0000'0000_u128}; | 
 |       Float128 r4 = fputil::quick_add(r3, tmp); | 
 |       result = static_cast<double>(r4); | 
 |     } else { | 
 |       result = static_cast<double>(r3); | 
 |     } | 
 |  | 
 |     fputil::clear_except_if_required(FE_INEXACT); | 
 |   } | 
 |  | 
 |   return update_exponent(result); | 
 | #endif // LIBC_MATH_CBRT_SKIP_ACCURATE_PASS | 
 | } | 
 |  | 
 | } // namespace LIBC_NAMESPACE_DECL |