src/math/generic/cbrt.cpp - llvm-project/libc - Git at Google

 //===-- Implementation of cbrt function -----------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

 #include "src/math/cbrt.h"
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/FEnvImpl.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/PolyEval.h"
 #include "src/__support/FPUtil/double_double.h"
 #include "src/__support/FPUtil/dyadic_float.h"
 #include "src/__support/FPUtil/multiply_add.h"
 #include "src/__support/common.h"
 #include "src/__support/integer_literals.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY

 #if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0)
 #define LIBC_MATH_CBRT_SKIP_ACCURATE_PASS
 #endif

 namespace LIBC_NAMESPACE_DECL {

 using DoubleDouble = fputil::DoubleDouble;
 using Float128 = fputil::DyadicFloat<128>;

 namespace {

 // Initial approximation of x^(-2/3) for 1 <= x < 2.
 // Polynomial generated by Sollya with:
 // > P = fpminimax(x^(-2/3), 7, [|D...|], [1, 2]);
 // > dirtyinfnorm(P/x^(-2/3) - 1, [1, 2]);
 // 0x1.28...p-21
 double intial_approximation(double x) {
   constexpr double COEFFS[8] = {
       0x1.bc52aedead5c6p1,  -0x1.b52bfebf110b3p2,  0x1.1d8d71d53d126p3,
       -0x1.de2db9e81cf87p2, 0x1.0154ca06153bdp2,   -0x1.5973c66ee6da7p0,
       0x1.07bf6ac832552p-2, -0x1.5e53d9ce41cb8p-6,
   };

   double x_sq = x * x;

   double c0 = fputil::multiply_add(x, COEFFS[1], COEFFS[0]);
   double c1 = fputil::multiply_add(x, COEFFS[3], COEFFS[2]);
   double c2 = fputil::multiply_add(x, COEFFS[5], COEFFS[4]);
   double c3 = fputil::multiply_add(x, COEFFS[7], COEFFS[6]);

   double x_4 = x_sq * x_sq;
   double d0 = fputil::multiply_add(x_sq, c1, c0);
   double d1 = fputil::multiply_add(x_sq, c3, c2);

   return fputil::multiply_add(x_4, d1, d0);
 }

 // Get the error term for Newton iteration:
 //   h(x) = x^3 * a^2 - 1,
 #ifdef LIBC_TARGET_CPU_HAS_FMA
 double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) {
   return fputil::multiply_add(x_3.hi, a_sq.hi, -1.0) +
          fputil::multiply_add(x_3.lo, a_sq.hi, x_3.hi * a_sq.lo);
 }
 #else
 double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) {
   DoubleDouble x_3_a_sq = fputil::quick_mult(a_sq, x_3);
   return (x_3_a_sq.hi - 1.0) + x_3_a_sq.lo;
 }
 #endif

 } // anonymous namespace

 // Correctly rounded cbrt algorithm:
 //
 // === Step 1 - Range reduction ===
 // For x = (-1)^s * 2^e * (1.m), we get 2 reduced arguments x_r and a as:
 //   x_r = 1.m
 //   a   = (-1)^s * 2^(e % 3) * (1.m)
 // Then cbrt(x) = x^(1/3) can be computed as:
 //   x^(1/3) = 2^(e / 3) * a^(1/3).
 //
 // In order to avoid division, we compute a^(-2/3) using Newton method and then
 // multiply the results by a:
 //   a^(1/3) = a * a^(-2/3).
 //
 // === Step 2 - First approximation to a^(-2/3) ===
 // First, we use a degree-7 minimax polynomial generated by Sollya to
 // approximate x_r^(-2/3) for 1 <= x_r < 2.
 //   p = P(x_r) ~ x_r^(-2/3),
 // with relative errors bounded by:
 //   | p / x_r^(-2/3) - 1 | < 1.16 * 2^-21.
 //
 // Then we multiply with 2^(e % 3) from a small lookup table to get:
 //   x_0 = 2^(-2*(e % 3)/3) * p
 //       ~ 2^(-2*(e % 3)/3) * x_r^(-2/3)
 //       = a^(-2/3)
 // With relative errors:
 //   | x_0 / a^(-2/3) - 1 | < 1.16 * 2^-21.
 // This step is done in double precision.
 //
 // === Step 3 - First Newton iteration ===
 // We follow the method described in:
 //   Sibidanov, A. and Zimmermann, P., "Correctly rounded cubic root evaluation
 //   in double precision", https://core-math.gitlabpages.inria.fr/cbrt64.pdf
 // to derive multiplicative Newton iterations as below:
 // Let x_n be the nth approximation to a^(-2/3).  Define the n^th error as:
 //   h_n = x_n^3 * a^2 - 1
 // Then:
 //   a^(-2/3) = x_n / (1 + h_n)^(1/3)
 //            = x_n * (1 - (1/3) * h_n + (2/9) * h_n^2 - (14/81) * h_n^3 + ...)
 // using the Taylor series expansion of (1 + h_n)^(-1/3).
 //
 // Apply to x_0 above:
 //   h_0 = x_0^3 * a^2 - 1
 //       = a^2 * (x_0 - a^(-2/3)) * (x_0^2 + x_0 * a^(-2/3) + a^(-4/3)),
 // it's bounded by:
 //   |h_0| < 4 * 3 * 1.16 * 2^-21 * 4 < 2^-17.
 // So in the first iteration step, we use:
 //   x_1 = x_0 * (1 - (1/3) * h_n + (2/9) * h_n^2 - (14/81) * h_n^3)
 // Its relative error is bounded by:
 //   | x_1 / a^(-2/3) - 1 | < 35/242 * |h_0|^4 < 2^-70.
 // Then we perform Ziv's rounding test and check if the answer is exact.
 // This step is done in double-double precision.
 //
 // === Step 4 - Second Newton iteration ===
 // If the Ziv's rounding test from the previous step fails, we define the error
 // term:
 //   h_1 = x_1^3 * a^2 - 1,
 // And perform another iteration:
 //   x_2 = x_1 * (1 - h_1 / 3)
 // with the relative errors exceed the precision of double-double.
 // We then check the Ziv's accuracy test with relative errors < 2^-102 to
 // compensate for rounding errors.
 //
 // === Step 5 - Final iteration ===
 // If the Ziv's accuracy test from the previous step fails, we perform another
 // iteration in 128-bit precision and check for exact outputs.
 //
 // TODO: It is possible to replace this costly computation step with special
 // exceptional handling, similar to what was done in the CORE-MATH project:
 // https://gitlab.inria.fr/core-math/core-math/-/blob/master/src/binary64/cbrt/cbrt.c

 LLVM_LIBC_FUNCTION(double, cbrt, (double x)) {
   using FPBits = fputil::FPBits<double>;

   uint64_t x_abs = FPBits(x).abs().uintval();

   unsigned exp_bias_correction = 682; // 1023 * 2/3

   if (LIBC_UNLIKELY(x_abs < FPBits::min_normal().uintval() ||
                     x_abs >= FPBits::inf().uintval())) {
     if (x_abs == 0 || x_abs >= FPBits::inf().uintval())
       // x is 0, Inf, or NaN.
       return x;

     // x is non-zero denormal number.
     // Normalize x.
     x *= 0x1.0p60;
     exp_bias_correction -= 20;
   }

   FPBits x_bits(x);

   // When using biased exponent of x in double precision,
   //   x_e = real_exponent_of_x + 1023
   // Then:
   //   x_e / 3 = real_exponent_of_x / 3 + 1023/3
   //           = real_exponent_of_x / 3 + 341
   // So to make it the correct biased exponent of x^(1/3), we add
   //   1023 - 341 = 682
   // to the quotient x_e / 3.
   unsigned x_e = static_cast<unsigned>(x_bits.get_biased_exponent());
   unsigned out_e = (x_e / 3 + exp_bias_correction);
   unsigned shift_e = x_e % 3;

   // Set x_r = 1.mantissa
   double x_r =
       FPBits(x_bits.get_mantissa() |
              (static_cast<uint64_t>(FPBits::EXP_BIAS) << FPBits::FRACTION_LEN))
           .get_val();

   // Set a = (-1)^x_sign * 2^(x_e % 3) * (1.mantissa)
   uint64_t a_bits = x_bits.uintval() & 0x800F'FFFF'FFFF'FFFF;
   a_bits |=
       (static_cast<uint64_t>(shift_e + static_cast<unsigned>(FPBits::EXP_BIAS))
        << FPBits::FRACTION_LEN);
   double a = FPBits(a_bits).get_val();

   // Initial approximation of x_r^(-2/3).
   double p = intial_approximation(x_r);

   // Look up for 2^(-2*n/3) used for first approximation step.
   constexpr double EXP2_M2_OVER_3[3] = {1.0, 0x1.428a2f98d728bp-1,
                                         0x1.965fea53d6e3dp-2};

   // x0 is an initial approximation of a^(-2/3) for 1 <= |a| < 8.
   // Relative error: < 1.16 * 2^(-21).
   double x0 = static_cast<double>(EXP2_M2_OVER_3[shift_e] * p);

   // First iteration in double precision.
   DoubleDouble a_sq = fputil::exact_mult(a, a);

   // h0 = x0^3 * a^2 - 1
   DoubleDouble x0_sq = fputil::exact_mult(x0, x0);
   DoubleDouble x0_3 = fputil::quick_mult(x0, x0_sq);

   double h0 = get_error(x0_3, a_sq);

 #ifdef LIBC_MATH_CBRT_SKIP_ACCURATE_PASS
   constexpr double REL_ERROR = 0;
 #else
   constexpr double REL_ERROR = 0x1.0p-51;
 #endif // LIBC_MATH_CBRT_SKIP_ACCURATE_PASS

   // Taylor polynomial of (1 + h)^(-1/3):
   //   (1 + h)^(-1/3) = 1 - h/3 + 2 h^2 / 9 - 14 h^3 / 81 + ...
   constexpr double ERR_COEFFS[3] = {
       -0x1.5555555555555p-2 - REL_ERROR, // -1/3 - relative_error
       0x1.c71c71c71c71cp-3,              // 2/9
       -0x1.61f9add3c0ca4p-3,             // -14/81
   };
   // e0 = -14 * h^2 / 81 + 2 * h / 9 - 1/3 - relative_error.
   double e0 = fputil::polyeval(h0, ERR_COEFFS[0], ERR_COEFFS[1], ERR_COEFFS[2]);
   double x0_h0 = x0 * h0;

   // x1 = x0 (1 - h0/3 + 2 h0^2 / 9 - 14 h0^3 / 81)
   // x1 approximate a^(-2/3) with relative errors bounded by:
   //   | x1 / a^(-2/3) - 1 | < (34/243) h0^4 < h0 * REL_ERROR
   DoubleDouble x1_dd{x0_h0 * e0, x0};

   // r1 = x1 * a ~ a^(-2/3) * a = a^(1/3).
   DoubleDouble r1 = fputil::quick_mult(a, x1_dd);

   // Lambda function to update the exponent of the result.
   auto update_exponent = [=](double r) -> double {
     uint64_t r_m = FPBits(r).uintval() - 0x3FF0'0000'0000'0000;
     // Adjust exponent and sign.
     uint64_t r_bits =
         r_m + (static_cast<uint64_t>(out_e) << FPBits::FRACTION_LEN);
     return FPBits(r_bits).get_val();
   };

 #ifdef LIBC_MATH_CBRT_SKIP_ACCURATE_PASS
   // TODO: We probably don't need to use double-double if accurate tests and
   // passes are skipped.
   return update_exponent(r1.hi + r1.lo);
 #else
   // Accurate checks and passes.
   double r1_lower = r1.hi + r1.lo;
   double r1_upper =
       r1.hi + fputil::multiply_add(x0_h0, 2.0 * REL_ERROR * a, r1.lo);

   // Ziv's accuracy test.
   if (LIBC_LIKELY(r1_upper == r1_lower)) {
     // Test for exact outputs.
     // Check if lower (52 - 17 = 35) bits are 0's.
     if (LIBC_UNLIKELY((FPBits(r1_lower).uintval() & 0x0000'0007'FFFF'FFFF) ==
                       0)) {
       double r1_err = (r1_lower - r1.hi) - r1.lo;
       if (FPBits(r1_err).abs().get_val() < 0x1.0p69)
         fputil::clear_except_if_required(FE_INEXACT);
     }

     return update_exponent(r1_lower);
   }

   // Accuracy test failed, perform another Newton iteration.
   double x1 = x1_dd.hi + (e0 + REL_ERROR) * x0_h0;

   // Second iteration in double-double precision.
   // h1 = x1^3 * a^2 - 1.
   DoubleDouble x1_sq = fputil::exact_mult(x1, x1);
   DoubleDouble x1_3 = fputil::quick_mult(x1, x1_sq);
   double h1 = get_error(x1_3, a_sq);

   // e1 = -x1*h1/3.
   double e1 = h1 * (x1 * -0x1.5555555555555p-2);
   // x2 = x1*(1 - h1/3) = x1 + e1 ~ a^(-2/3) with relative errors < 2^-101.
   DoubleDouble x2 = fputil::exact_add(x1, e1);
   // r2 = a * x2 ~ a * a^(-2/3) = a^(1/3) with relative errors < 2^-100.
   DoubleDouble r2 = fputil::quick_mult(a, x2);

   double r2_upper = r2.hi + fputil::multiply_add(a, 0x1.0p-102, r2.lo);
   double r2_lower = r2.hi + fputil::multiply_add(a, -0x1.0p-102, r2.lo);

   // Ziv's accuracy test.
   if (LIBC_LIKELY(r2_upper == r2_lower))
     return update_exponent(r2_upper);

   // TODO: Investigate removing float128 and just list exceptional cases.
   // Apply another Newton iteration with ~126-bit accuracy.
   Float128 x2_f128 = fputil::quick_add(Float128(x2.hi), Float128(x2.lo));
   // x2^3
   Float128 x2_3 =
       fputil::quick_mul(fputil::quick_mul(x2_f128, x2_f128), x2_f128);
   // a^2
   Float128 a_sq_f128 = fputil::quick_mul(Float128(a), Float128(a));
   // x2^3 * a^2
   Float128 x2_3_a_sq = fputil::quick_mul(x2_3, a_sq_f128);
   // h2 = x2^3 * a^2 - 1
   Float128 h2_f128 = fputil::quick_add(x2_3_a_sq, Float128(-1.0));
   double h2 = static_cast<double>(h2_f128);
   // t2 = 1 - h2 / 3
   Float128 t2 =
       fputil::quick_add(Float128(1.0), Float128(h2 * (-0x1.5555555555555p-2)));
   // x3 = x2 * (1 - h2 / 3) ~ a^(-2/3)
   Float128 x3 = fputil::quick_mul(x2_f128, t2);
   // r3 = a * x3 ~ a * a^(-2/3) = a^(1/3)
   Float128 r3 = fputil::quick_mul(Float128(a), x3);

   // Check for exact cases:
   Float128::MantissaType rounding_bits =
       r3.mantissa & 0x0000'0000'0000'03FF'FFFF'FFFF'FFFF'FFFF_u128;

   double result = static_cast<double>(r3);
   if ((rounding_bits < 0x0000'0000'0000'0000'0000'0000'0000'000F_u128) ||
       (rounding_bits >= 0x0000'0000'0000'03FF'FFFF'FFFF'FFFF'FFF0_u128)) {
     // Output is exact.
     r3.mantissa &= 0xFFFF'FFFF'FFFF'FFFF'FFFF'FFFF'FFFF'FFF0_u128;

     if (rounding_bits >= 0x0000'0000'0000'03FF'FFFF'FFFF'FFFF'FFF0_u128) {
       Float128 tmp{r3.sign, r3.exponent - 123,
                    0x8000'0000'0000'0000'0000'0000'0000'0000_u128};
       Float128 r4 = fputil::quick_add(r3, tmp);
       result = static_cast<double>(r4);
     } else {
       result = static_cast<double>(r3);
     }

     fputil::clear_except_if_required(FE_INEXACT);
   }

   return update_exponent(result);
 #endif // LIBC_MATH_CBRT_SKIP_ACCURATE_PASS
 }

 } // namespace LIBC_NAMESPACE_DECL
	//===-- Implementation of cbrt function -----------------------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#include "src/math/cbrt.h"
	#include "hdr/fenv_macros.h"
	#include "src/__support/FPUtil/FEnvImpl.h"
	#include "src/__support/FPUtil/FPBits.h"
	#include "src/__support/FPUtil/PolyEval.h"
	#include "src/__support/FPUtil/double_double.h"
	#include "src/__support/FPUtil/dyadic_float.h"
	#include "src/__support/FPUtil/multiply_add.h"
	#include "src/__support/common.h"
	#include "src/__support/integer_literals.h"
	#include "src/__support/macros/config.h"
	#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY

	#if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0)
	#define LIBC_MATH_CBRT_SKIP_ACCURATE_PASS
	#endif

	namespace LIBC_NAMESPACE_DECL {

	using DoubleDouble = fputil::DoubleDouble;
	using Float128 = fputil::DyadicFloat<128>;

	namespace {

	// Initial approximation of x^(-2/3) for 1 <= x < 2.
	// Polynomial generated by Sollya with:
	// > P = fpminimax(x^(-2/3), 7, [\|D...\|], [1, 2]);
	// > dirtyinfnorm(P/x^(-2/3) - 1, [1, 2]);
	// 0x1.28...p-21
	double intial_approximation(double x) {
	constexpr double COEFFS[8] = {
	0x1.bc52aedead5c6p1, -0x1.b52bfebf110b3p2, 0x1.1d8d71d53d126p3,
	-0x1.de2db9e81cf87p2, 0x1.0154ca06153bdp2, -0x1.5973c66ee6da7p0,
	0x1.07bf6ac832552p-2, -0x1.5e53d9ce41cb8p-6,
	};

	double x_sq = x * x;

	double c0 = fputil::multiply_add(x, COEFFS[1], COEFFS[0]);
	double c1 = fputil::multiply_add(x, COEFFS[3], COEFFS[2]);
	double c2 = fputil::multiply_add(x, COEFFS[5], COEFFS[4]);
	double c3 = fputil::multiply_add(x, COEFFS[7], COEFFS[6]);

	double x_4 = x_sq * x_sq;
	double d0 = fputil::multiply_add(x_sq, c1, c0);
	double d1 = fputil::multiply_add(x_sq, c3, c2);

	return fputil::multiply_add(x_4, d1, d0);
	}

	// Get the error term for Newton iteration:
	// h(x) = x^3 * a^2 - 1,
	#ifdef LIBC_TARGET_CPU_HAS_FMA
	double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) {
	return fputil::multiply_add(x_3.hi, a_sq.hi, -1.0) +
	fputil::multiply_add(x_3.lo, a_sq.hi, x_3.hi * a_sq.lo);
	}
	#else
	double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) {
	DoubleDouble x_3_a_sq = fputil::quick_mult(a_sq, x_3);
	return (x_3_a_sq.hi - 1.0) + x_3_a_sq.lo;
	}
	#endif

	} // anonymous namespace

	// Correctly rounded cbrt algorithm:
	//
	// === Step 1 - Range reduction ===
	// For x = (-1)^s * 2^e * (1.m), we get 2 reduced arguments x_r and a as:
	// x_r = 1.m
	// a = (-1)^s * 2^(e % 3) * (1.m)
	// Then cbrt(x) = x^(1/3) can be computed as:
	// x^(1/3) = 2^(e / 3) * a^(1/3).
	//
	// In order to avoid division, we compute a^(-2/3) using Newton method and then
	// multiply the results by a:
	// a^(1/3) = a * a^(-2/3).
	//
	// === Step 2 - First approximation to a^(-2/3) ===
	// First, we use a degree-7 minimax polynomial generated by Sollya to
	// approximate x_r^(-2/3) for 1 <= x_r < 2.
	// p = P(x_r) ~ x_r^(-2/3),
	// with relative errors bounded by:
	// \| p / x_r^(-2/3) - 1 \| < 1.16 * 2^-21.
	//
	// Then we multiply with 2^(e % 3) from a small lookup table to get:
	// x_0 = 2^(-2(e % 3)/3) p
	// ~ 2^(-2(e % 3)/3) x_r^(-2/3)
	// = a^(-2/3)
	// With relative errors:
	// \| x_0 / a^(-2/3) - 1 \| < 1.16 * 2^-21.
	// This step is done in double precision.
	//
	// === Step 3 - First Newton iteration ===
	// We follow the method described in:
	// Sibidanov, A. and Zimmermann, P., "Correctly rounded cubic root evaluation
	// in double precision", https://core-math.gitlabpages.inria.fr/cbrt64.pdf
	// to derive multiplicative Newton iterations as below:
	// Let x_n be the nth approximation to a^(-2/3). Define the n^th error as:
	// h_n = x_n^3 * a^2 - 1
	// Then:
	// a^(-2/3) = x_n / (1 + h_n)^(1/3)
	// = x_n * (1 - (1/3) * h_n + (2/9) * h_n^2 - (14/81) * h_n^3 + ...)
	// using the Taylor series expansion of (1 + h_n)^(-1/3).
	//
	// Apply to x_0 above:
	// h_0 = x_0^3 * a^2 - 1
	// = a^2 * (x_0 - a^(-2/3)) * (x_0^2 + x_0 * a^(-2/3) + a^(-4/3)),
	// it's bounded by:
	// \|h_0\| < 4 * 3 * 1.16 * 2^-21 * 4 < 2^-17.
	// So in the first iteration step, we use:
	// x_1 = x_0 * (1 - (1/3) * h_n + (2/9) * h_n^2 - (14/81) * h_n^3)
	// Its relative error is bounded by:
	// \| x_1 / a^(-2/3) - 1 \| < 35/242 * \|h_0\|^4 < 2^-70.
	// Then we perform Ziv's rounding test and check if the answer is exact.
	// This step is done in double-double precision.
	//
	// === Step 4 - Second Newton iteration ===
	// If the Ziv's rounding test from the previous step fails, we define the error
	// term:
	// h_1 = x_1^3 * a^2 - 1,
	// And perform another iteration:
	// x_2 = x_1 * (1 - h_1 / 3)
	// with the relative errors exceed the precision of double-double.
	// We then check the Ziv's accuracy test with relative errors < 2^-102 to
	// compensate for rounding errors.
	//
	// === Step 5 - Final iteration ===
	// If the Ziv's accuracy test from the previous step fails, we perform another
	// iteration in 128-bit precision and check for exact outputs.
	//
	// TODO: It is possible to replace this costly computation step with special
	// exceptional handling, similar to what was done in the CORE-MATH project:
	// https://gitlab.inria.fr/core-math/core-math/-/blob/master/src/binary64/cbrt/cbrt.c

	LLVM_LIBC_FUNCTION(double, cbrt, (double x)) {
	using FPBits = fputil::FPBits<double>;

	uint64_t x_abs = FPBits(x).abs().uintval();

	unsigned exp_bias_correction = 682; // 1023 * 2/3

	if (LIBC_UNLIKELY(x_abs < FPBits::min_normal().uintval() \|\|
	x_abs >= FPBits::inf().uintval())) {
	if (x_abs == 0 \|\| x_abs >= FPBits::inf().uintval())
	// x is 0, Inf, or NaN.
	return x;

	// x is non-zero denormal number.
	// Normalize x.
	x *= 0x1.0p60;
	exp_bias_correction -= 20;
	}

	FPBits x_bits(x);

	// When using biased exponent of x in double precision,
	// x_e = real_exponent_of_x + 1023
	// Then:
	// x_e / 3 = real_exponent_of_x / 3 + 1023/3
	// = real_exponent_of_x / 3 + 341
	// So to make it the correct biased exponent of x^(1/3), we add
	// 1023 - 341 = 682
	// to the quotient x_e / 3.
	unsigned x_e = static_cast<unsigned>(x_bits.get_biased_exponent());
	unsigned out_e = (x_e / 3 + exp_bias_correction);
	unsigned shift_e = x_e % 3;

	// Set x_r = 1.mantissa
	double x_r =
	FPBits(x_bits.get_mantissa() \|
	(static_cast<uint64_t>(FPBits::EXP_BIAS) << FPBits::FRACTION_LEN))
	.get_val();

	// Set a = (-1)^x_sign * 2^(x_e % 3) * (1.mantissa)
	uint64_t a_bits = x_bits.uintval() & 0x800F'FFFF'FFFF'FFFF;
	a_bits \|=
	(static_cast<uint64_t>(shift_e + static_cast<unsigned>(FPBits::EXP_BIAS))
	<< FPBits::FRACTION_LEN);
	double a = FPBits(a_bits).get_val();

	// Initial approximation of x_r^(-2/3).
	double p = intial_approximation(x_r);

	// Look up for 2^(-2*n/3) used for first approximation step.
	constexpr double EXP2_M2_OVER_3[3] = {1.0, 0x1.428a2f98d728bp-1,
	0x1.965fea53d6e3dp-2};

	// x0 is an initial approximation of a^(-2/3) for 1 <= \|a\| < 8.
	// Relative error: < 1.16 * 2^(-21).
	double x0 = static_cast<double>(EXP2_M2_OVER_3[shift_e] * p);

	// First iteration in double precision.
	DoubleDouble a_sq = fputil::exact_mult(a, a);

	// h0 = x0^3 * a^2 - 1
	DoubleDouble x0_sq = fputil::exact_mult(x0, x0);
	DoubleDouble x0_3 = fputil::quick_mult(x0, x0_sq);

	double h0 = get_error(x0_3, a_sq);

	#ifdef LIBC_MATH_CBRT_SKIP_ACCURATE_PASS
	constexpr double REL_ERROR = 0;
	#else
	constexpr double REL_ERROR = 0x1.0p-51;
	#endif // LIBC_MATH_CBRT_SKIP_ACCURATE_PASS

	// Taylor polynomial of (1 + h)^(-1/3):
	// (1 + h)^(-1/3) = 1 - h/3 + 2 h^2 / 9 - 14 h^3 / 81 + ...
	constexpr double ERR_COEFFS[3] = {
	-0x1.5555555555555p-2 - REL_ERROR, // -1/3 - relative_error
	0x1.c71c71c71c71cp-3, // 2/9
	-0x1.61f9add3c0ca4p-3, // -14/81
	};
	// e0 = -14 * h^2 / 81 + 2 * h / 9 - 1/3 - relative_error.
	double e0 = fputil::polyeval(h0, ERR_COEFFS[0], ERR_COEFFS[1], ERR_COEFFS[2]);
	double x0_h0 = x0 * h0;

	// x1 = x0 (1 - h0/3 + 2 h0^2 / 9 - 14 h0^3 / 81)
	// x1 approximate a^(-2/3) with relative errors bounded by:
	// \| x1 / a^(-2/3) - 1 \| < (34/243) h0^4 < h0 * REL_ERROR
	DoubleDouble x1_dd{x0_h0 * e0, x0};

	// r1 = x1 * a ~ a^(-2/3) * a = a^(1/3).
	DoubleDouble r1 = fputil::quick_mult(a, x1_dd);

	// Lambda function to update the exponent of the result.
	auto update_exponent = [=](double r) -> double {
	uint64_t r_m = FPBits(r).uintval() - 0x3FF0'0000'0000'0000;
	// Adjust exponent and sign.
	uint64_t r_bits =
	r_m + (static_cast<uint64_t>(out_e) << FPBits::FRACTION_LEN);
	return FPBits(r_bits).get_val();
	};

	#ifdef LIBC_MATH_CBRT_SKIP_ACCURATE_PASS
	// TODO: We probably don't need to use double-double if accurate tests and
	// passes are skipped.
	return update_exponent(r1.hi + r1.lo);
	#else
	// Accurate checks and passes.
	double r1_lower = r1.hi + r1.lo;
	double r1_upper =
	r1.hi + fputil::multiply_add(x0_h0, 2.0 * REL_ERROR * a, r1.lo);

	// Ziv's accuracy test.
	if (LIBC_LIKELY(r1_upper == r1_lower)) {
	// Test for exact outputs.
	// Check if lower (52 - 17 = 35) bits are 0's.
	if (LIBC_UNLIKELY((FPBits(r1_lower).uintval() & 0x0000'0007'FFFF'FFFF) ==
	0)) {
	double r1_err = (r1_lower - r1.hi) - r1.lo;
	if (FPBits(r1_err).abs().get_val() < 0x1.0p69)
	fputil::clear_except_if_required(FE_INEXACT);
	}

	return update_exponent(r1_lower);
	}

	// Accuracy test failed, perform another Newton iteration.
	double x1 = x1_dd.hi + (e0 + REL_ERROR) * x0_h0;

	// Second iteration in double-double precision.
	// h1 = x1^3 * a^2 - 1.
	DoubleDouble x1_sq = fputil::exact_mult(x1, x1);
	DoubleDouble x1_3 = fputil::quick_mult(x1, x1_sq);
	double h1 = get_error(x1_3, a_sq);

	// e1 = -x1*h1/3.
	double e1 = h1 * (x1 * -0x1.5555555555555p-2);
	// x2 = x1*(1 - h1/3) = x1 + e1 ~ a^(-2/3) with relative errors < 2^-101.
	DoubleDouble x2 = fputil::exact_add(x1, e1);
	// r2 = a * x2 ~ a * a^(-2/3) = a^(1/3) with relative errors < 2^-100.
	DoubleDouble r2 = fputil::quick_mult(a, x2);

	double r2_upper = r2.hi + fputil::multiply_add(a, 0x1.0p-102, r2.lo);
	double r2_lower = r2.hi + fputil::multiply_add(a, -0x1.0p-102, r2.lo);

	// Ziv's accuracy test.
	if (LIBC_LIKELY(r2_upper == r2_lower))
	return update_exponent(r2_upper);

	// TODO: Investigate removing float128 and just list exceptional cases.
	// Apply another Newton iteration with ~126-bit accuracy.
	Float128 x2_f128 = fputil::quick_add(Float128(x2.hi), Float128(x2.lo));
	// x2^3
	Float128 x2_3 =
	fputil::quick_mul(fputil::quick_mul(x2_f128, x2_f128), x2_f128);
	// a^2
	Float128 a_sq_f128 = fputil::quick_mul(Float128(a), Float128(a));
	// x2^3 * a^2
	Float128 x2_3_a_sq = fputil::quick_mul(x2_3, a_sq_f128);
	// h2 = x2^3 * a^2 - 1
	Float128 h2_f128 = fputil::quick_add(x2_3_a_sq, Float128(-1.0));
	double h2 = static_cast<double>(h2_f128);
	// t2 = 1 - h2 / 3
	Float128 t2 =
	fputil::quick_add(Float128(1.0), Float128(h2 * (-0x1.5555555555555p-2)));
	// x3 = x2 * (1 - h2 / 3) ~ a^(-2/3)
	Float128 x3 = fputil::quick_mul(x2_f128, t2);
	// r3 = a * x3 ~ a * a^(-2/3) = a^(1/3)
	Float128 r3 = fputil::quick_mul(Float128(a), x3);

	// Check for exact cases:
	Float128::MantissaType rounding_bits =
	r3.mantissa & 0x0000'0000'0000'03FF'FFFF'FFFF'FFFF'FFFF_u128;

	double result = static_cast<double>(r3);
	if ((rounding_bits < 0x0000'0000'0000'0000'0000'0000'0000'000F_u128) \|\|
	(rounding_bits >= 0x0000'0000'0000'03FF'FFFF'FFFF'FFFF'FFF0_u128)) {
	// Output is exact.
	r3.mantissa &= 0xFFFF'FFFF'FFFF'FFFF'FFFF'FFFF'FFFF'FFF0_u128;

	if (rounding_bits >= 0x0000'0000'0000'03FF'FFFF'FFFF'FFFF'FFF0_u128) {
	Float128 tmp{r3.sign, r3.exponent - 123,
	0x8000'0000'0000'0000'0000'0000'0000'0000_u128};
	Float128 r4 = fputil::quick_add(r3, tmp);
	result = static_cast<double>(r4);
	} else {
	result = static_cast<double>(r3);
	}

	fputil::clear_except_if_required(FE_INEXACT);
	}

	return update_exponent(result);
	#endif // LIBC_MATH_CBRT_SKIP_ACCURATE_PASS
	}

	} // namespace LIBC_NAMESPACE_DECL