[libc] Implement sincosf function correctly rounded to all rounding modes.

Refactor common range reductions and evaluations for sinf, cosf, and
sincosf.  Added exhaustive tests for sincosf.

Performance before the patch:
```
System LIBC reciprocal throughput : 30.205
LIBC reciprocal throughput        : 30.533

System LIBC latency : 67.961
LIBC latency        : 61.564
```
Performance after the patch:
```
System LIBC reciprocal throughput : 30.409
LIBC reciprocal throughput        : 20.273

System LIBC latency : 67.527
LIBC latency        : 61.959
```

Reviewed By: orex

Differential Revision: https://reviews.llvm.org/D130901

GitOrigin-RevId: 131dda9acc6978aba4740d75ca45f411fbabd726
diff --git a/docs/math.rst b/docs/math.rst
index 4445c1a..f3bab26 100644
--- a/docs/math.rst
+++ b/docs/math.rst
@@ -166,7 +166,7 @@
 log1p          |check|
 log2           |check|
 sin            |check|          large
-sincos         0.776 ULPs       large
+sincos         |check|          large
 sinh           |check|
 sqrt           |check|          |check|         |check|
 tanh           |check|
@@ -232,6 +232,8 @@
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
 | sinf         |        13 |                25 |        54 |                57 | :math:`[-\pi, \pi]`                 | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA           |
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
+| sincosf      |        20 |                30 |        62 |                68 | :math:`[-\pi, \pi]`                 | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA           |
++--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
 | sinhf        |        23 |                64 |        73 |               141 | :math:`[-10, 10]`                   | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA           |
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
 | tanhf        |        25 |                59 |        95 |               125 | :math:`[-10, 10]`                   | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA           |
diff --git a/src/math/generic/CMakeLists.txt b/src/math/generic/CMakeLists.txt
index b10af04..cd027a0 100644
--- a/src/math/generic/CMakeLists.txt
+++ b/src/math/generic/CMakeLists.txt
@@ -46,14 +46,26 @@
     libc.src.errno.errno
 )
 
-add_object_library(
+add_header_library(
+  range_reduction
+  HDRS
+    range_reduction.h
+    range_reduction_fma.h
+  DEPENDS
+    libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.fma
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.nearest_integer
+)
+
+add_header_library(
   sincosf_utils
   HDRS
     sincosf_utils.h
-  SRCS
-    sincosf_data.cpp
   DEPENDS
-    .math_utils
+    .range_reduction
+    libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.polyeval
 )
 
 add_entrypoint_object(
@@ -62,16 +74,13 @@
     cosf.cpp
   HDRS
     ../cosf.h
-    range_reduction.h
-    range_reduction_fma.h
   DEPENDS
-    .common_constants
+    .sincosf_utils
     libc.include.math
     libc.src.errno.errno
     libc.src.__support.FPUtil.fputil
     libc.src.__support.FPUtil.fma
     libc.src.__support.FPUtil.multiply_add
-    libc.src.__support.FPUtil.nearest_integer
     libc.src.__support.FPUtil.polyeval
   COMPILE_OPTIONS
     -O3
@@ -83,16 +92,14 @@
     sinf.cpp
   HDRS
     ../sinf.h
-    range_reduction.h
-    range_reduction_fma.h
   DEPENDS
-    .common_constants
+    .range_reduction
+    .sincosf_utils
     libc.include.math
     libc.src.errno.errno
     libc.src.__support.FPUtil.fputil
     libc.src.__support.FPUtil.fma
     libc.src.__support.FPUtil.multiply_add
-    libc.src.__support.FPUtil.nearest_integer
     libc.src.__support.FPUtil.polyeval
   COMPILE_OPTIONS
     -O3
@@ -105,9 +112,14 @@
   HDRS
     ../sincosf.h
   DEPENDS
+    .range_reduction
     .sincosf_utils
     libc.include.math
     libc.src.errno.errno
+    libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.fma
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.polyeval
   COMPILE_OPTIONS
     -O3
 )
diff --git a/src/math/generic/common_constants.cpp b/src/math/generic/common_constants.cpp
index 1bbca8f..5f7fe56 100644
--- a/src/math/generic/common_constants.cpp
+++ b/src/math/generic/common_constants.cpp
@@ -240,21 +240,4 @@
     0x1.13821818624b4p-2,  0x1.2ff6b54d8a89cp-2,  0x1.4d0ad5a753e07p-2,
     0x1.6ac1f752150a5p-2,  0x1.891fac0e95613p-2};
 
-// Lookup table for sin(k * pi / 16) with k = 0, ..., 31.
-// Table is generated with Sollya as follow:
-// > display = hexadecimal;
-// > for k from 0 to 31 do { D(sin(k * pi/16)); };
-const double SIN_K_PI_OVER_16[32] = {
-    0x0.0000000000000p+0,  0x1.8f8b83c69a60bp-3,  0x1.87de2a6aea963p-2,
-    0x1.1c73b39ae68c8p-1,  0x1.6a09e667f3bcdp-1,  0x1.a9b66290ea1a3p-1,
-    0x1.d906bcf328d46p-1,  0x1.f6297cff75cb0p-1,  0x1.0000000000000p+0,
-    0x1.f6297cff75cb0p-1,  0x1.d906bcf328d46p-1,  0x1.a9b66290ea1a3p-1,
-    0x1.6a09e667f3bcdp-1,  0x1.1c73b39ae68c8p-1,  0x1.87de2a6aea963p-2,
-    0x1.8f8b83c69a60bp-3,  0x0.0000000000000p+0,  -0x1.8f8b83c69a60bp-3,
-    -0x1.87de2a6aea963p-2, -0x1.1c73b39ae68c8p-1, -0x1.6a09e667f3bcdp-1,
-    -0x1.a9b66290ea1a3p-1, -0x1.d906bcf328d46p-1, -0x1.f6297cff75cb0p-1,
-    -0x1.0000000000000p+0, -0x1.f6297cff75cb0p-1, -0x1.d906bcf328d46p-1,
-    -0x1.a9b66290ea1a3p-1, -0x1.6a09e667f3bcdp-1, -0x1.1c73b39ae68c8p-1,
-    -0x1.87de2a6aea963p-2, -0x1.8f8b83c69a60bp-3};
-
 } // namespace __llvm_libc
diff --git a/src/math/generic/common_constants.h b/src/math/generic/common_constants.h
index 1c0d685..998d1eb 100644
--- a/src/math/generic/common_constants.h
+++ b/src/math/generic/common_constants.h
@@ -31,18 +31,13 @@
 // > for i from 0 to 127 do { D(exp(i / 128)); };
 extern const double EXP_M2[128];
 
-// Lookup table for sin(k * pi / 16) with k = 0, ..., 31.
-// Table is generated with Sollya as follow:
-// > display = hexadecimal;
-// > for k from 0 to 31 do { D(sin(k * pi/16)); };
-extern const double SIN_K_PI_OVER_16[32];
-
 static constexpr int EXP_bits_p = 5;
 static constexpr int EXP_num_p = 1 << EXP_bits_p;
 
 // Wolfram alpha: N[Table[2^x-1,{x,-16/32,15/32,1/32}],27]
 // printf("%.13a,\n", d[i]);
 extern const double EXP_2_POW[EXP_num_p];
+
 } // namespace __llvm_libc
 
 #endif // LLVM_LIBC_SRC_MATH_GENERIC_COMMON_CONSTANTS_H
diff --git a/src/math/generic/cosf.cpp b/src/math/generic/cosf.cpp
index 5298b2b..e082400 100644
--- a/src/math/generic/cosf.cpp
+++ b/src/math/generic/cosf.cpp
@@ -7,31 +7,16 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/cosf.h"
-#include "common_constants.h"
+#include "sincosf_utils.h"
 #include "src/__support/FPUtil/BasicOperations.h"
 #include "src/__support/FPUtil/FEnvImpl.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/PolyEval.h"
 #include "src/__support/FPUtil/except_value_utils.h"
 #include "src/__support/FPUtil/multiply_add.h"
 #include "src/__support/common.h"
 
 #include <errno.h>
 
-#if defined(LIBC_TARGET_HAS_FMA)
-#include "range_reduction_fma.h"
-// using namespace __llvm_libc::fma;
-using __llvm_libc::fma::FAST_PASS_BOUND;
-using __llvm_libc::fma::large_range_reduction;
-using __llvm_libc::fma::small_range_reduction;
-#else
-#include "range_reduction.h"
-// using namespace __llvm_libc::generic;
-using __llvm_libc::generic::FAST_PASS_BOUND;
-using __llvm_libc::generic::large_range_reduction;
-using __llvm_libc::generic::small_range_reduction;
-#endif
-
 namespace __llvm_libc {
 
 // Exceptional cases for cosf.
@@ -132,58 +117,26 @@
       return result;
   }
 
-  // TODO(lntue): refactor range reduction and most of polynomial approximation
-  // to share between sinf, cosf, and sincosf.
-  int k;
-  double y;
-
-  if (likely(x_abs < FAST_PASS_BOUND)) {
-    k = small_range_reduction(xd, y);
-  } else {
-    // x is inf or nan.
-    if (unlikely(x_abs >= 0x7f80'0000U)) {
-      if (x_abs == 0x7f80'0000U) {
-        errno = EDOM;
-        fputil::set_except(FE_INVALID);
-      }
-      return x +
-             FPBits::build_nan(1 << (fputil::MantissaWidth<float>::VALUE - 1));
+  // x is inf or nan.
+  if (unlikely(x_abs >= 0x7f80'0000U)) {
+    if (x_abs == 0x7f80'0000U) {
+      errno = EDOM;
+      fputil::set_except(FE_INVALID);
     }
-
-    k = large_range_reduction(xd, xbits.get_exponent(), y);
+    return x +
+           FPBits::build_nan(1 << (fputil::MantissaWidth<float>::VALUE - 1));
   }
 
-  // After range reduction, k = round(x * 16 / pi) and y = (x * 16 / pi) - k.
-  // So k is an integer and -0.5 <= y <= 0.5.
-  // Then cos(x) = cos((k + y)*pi/16)
-  //             = cos(y*pi/16) * cos(k*pi/16) - sin(y*pi/16) * sin(k*pi/16)
-
-  double ysq = y * y;
-
-  // Degree-6 minimax even polynomial for sin(y*pi/16)/y generated by Sollya
-  // with:
-  // > Q = fpminimax(sin(y*pi/16)/y, [|0, 2, 4, 6|], [|D...|], [0, 0.5]);
-  double sin_y =
-      y * fputil::polyeval(ysq, 0x1.921fb54442d17p-3, -0x1.4abbce6256adp-10,
-                           0x1.466bc5a5ac6b3p-19, -0x1.32bdcb4207562p-29);
-  // Degree-8 minimax even polynomial for cos(y*pi/16) generated by Sollya with:
-  // > P = fpminimax(cos(x*pi/16), [|0, 2, 4, 6, 8|], [|1, D...|], [0, 0.5]);
-  // Note that cosm1_y = cos(y*pi/16) - 1.
-  double cosm1_y =
-      ysq * fputil::polyeval(ysq, -0x1.3bd3cc9be45dcp-6, 0x1.03c1f081b08ap-14,
-                             -0x1.55d3c6fb0fb6ep-24, 0x1.e1d3d60f58873p-35);
-
-  double sin_k = -SIN_K_PI_OVER_16[k & 31];
-  // cos(k * pi/16) = sin(k * pi/16 + pi/2) = sin((k + 8) * pi/16).
-  // cos_k = y * cos(k * pi/16)
-  double cos_k = SIN_K_PI_OVER_16[(k + 8) & 31];
-
   // Combine the results with the sine of sum formula:
   //   cos(x) = cos((k + y)*pi/16)
   //          = cos(y*pi/16) * cos(k*pi/16) - sin(y*pi/16) * sin(k*pi/16)
   //          = cosm1_y * cos_k + sin_y * sin_k
   //          = (cosm1_y * cos_k + cos_k) + sin_y * sin_k
-  return fputil::multiply_add(sin_y, sin_k,
+  double sin_k, cos_k, sin_y, cosm1_y;
+
+  sincosf_eval(xd, x_abs, sin_k, cos_k, sin_y, cosm1_y);
+
+  return fputil::multiply_add(sin_y, -sin_k,
                               fputil::multiply_add(cosm1_y, cos_k, cos_k));
 }
 
diff --git a/src/math/generic/sincosf.cpp b/src/math/generic/sincosf.cpp
index fa05cd2..3a1149f 100644
--- a/src/math/generic/sincosf.cpp
+++ b/src/math/generic/sincosf.cpp
@@ -7,71 +7,210 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/sincosf.h"
-#include "math_utils.h"
 #include "sincosf_utils.h"
-
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/multiply_add.h"
 #include "src/__support/common.h"
-#include <math.h>
 
-#include <stdint.h>
+#include <errno.h>
 
 namespace __llvm_libc {
 
+// Exceptional values
+static constexpr int N_EXCEPTS = 10;
+
+static constexpr uint32_t EXCEPT_INPUTS[N_EXCEPTS] = {
+    0x3b5637f5, // x = 0x1.ac6feap-9
+    0x3fa7832a, // x = 0x1.4f0654p0
+    0x46199998, // x = 0x1.33333p13
+    0x55325019, // x = 0x1.64a032p43
+    0x55cafb2a, // x = 0x1.95f654p44
+    0x5922aa80, // x = 0x1.4555p51
+    0x5aa4542c, // x = 0x1.48a858p54
+    0x5f18b878, // x = 0x1.3170fp63
+    0x6115cb11, // x = 0x1.2b9622p67
+    0x7beef5ef, // x = 0x1.ddebdep120
+};
+
+static constexpr uint32_t EXCEPT_OUTPUTS_SIN[N_EXCEPTS][4] = {
+    {0x3b5637dc, 1, 0, 0}, // x = 0x1.ac6feap-9, sin(x) = 0x1.ac6fb8p-9 (RZ)
+    {0x3f7741b5, 1, 0, 1}, // x = 0x1.4f0654p0, sin(x) = 0x1.ee836ap-1 (RZ)
+    {0xbeb1fa5d, 0, 1, 0}, // x = 0x1.33333p13, sin(x) = -0x1.63f4bap-2 (RZ)
+    {0xbf171adf, 0, 1, 1}, // x = 0x1.64a032p43, sin(x) = -0x1.2e35bep-1 (RZ)
+    {0xbf7e7a16, 0, 1, 1}, // x = 0x1.95f654p44, sin(x) = -0x1.fcf42cp-1 (RZ)
+    {0xbf587521, 0, 1, 1}, // x = 0x1.4555p51, sin(x) = -0x1.b0ea42p-1 (RZ)
+    {0x3f5f5646, 1, 0, 0}, // x = 0x1.48a858p54, sin(x) = 0x1.beac8cp-1 (RZ)
+    {0x3dad60f6, 1, 0, 1}, // x = 0x1.3170fp63, sin(x) = 0x1.5ac1ecp-4 (RZ)
+    {0xbe7cc1e0, 0, 1, 1}, // x = 0x1.2b9622p67, sin(x) = -0x1.f983cp-3 (RZ)
+    {0xbf587d1b, 0, 1, 1}, // x = 0x1.ddebdep120, sin(x) = -0x1.b0fa36p-1 (RZ)
+};
+
+static constexpr uint32_t EXCEPT_OUTPUTS_COS[N_EXCEPTS][4] = {
+    {0x3f7fffa6, 1, 0, 0}, // x = 0x1.ac6feap-9, cos(x) = 0x1.ffff4cp-1 (RZ)
+    {0x3e84aabf, 1, 0, 1}, // x = 0x1.4f0654p0, cos(x) = 0x1.09557ep-2 (RZ)
+    {0xbf70090b, 0, 1, 0}, // x = 0x1.33333p13, cos(x) = -0x1.e01216p-1 (RZ)
+    {0x3f4ea5d2, 1, 0, 0}, // x = 0x1.64a032p43, cos(x) = 0x1.9d4ba4p-1 (RZ)
+    {0x3ddf11f3, 1, 0, 1}, // x = 0x1.95f654p44, cos(x) = 0x1.be23e6p-4 (RZ)
+    {0x3f08aebe, 1, 0, 1}, // x = 0x1.4555p51, cos(x) = 0x1.115d7cp-1 (RZ)
+    {0x3efa40a4, 1, 0, 0}, // x = 0x1.48a858p54, cos(x) = 0x1.f48148p-2 (RZ)
+    {0x3f7f14bb, 1, 0, 0}, // x = 0x1.3170fp63, cos(x) = 0x1.fe2976p-1 (RZ)
+    {0x3f78142e, 1, 0, 1}, // x = 0x1.2b9622p67, cos(x) = 0x1.f0285cp-1 (RZ)
+    {0x3f08a21c, 1, 0, 0}, // x = 0x1.ddebdep120, cos(x) = 0x1.114438p-1 (RZ)
+};
+
 // Fast sincosf implementation. Worst-case ULP is 0.5607, maximum relative
 // error is 0.5303 * 2^-23. A single-step range reduction is used for
 // small values. Large inputs have their range reduced using fast integer
 // arithmetic.
-LLVM_LIBC_FUNCTION(void, sincosf, (float y, float *sinp, float *cosp)) {
-  double x = y;
-  double s;
-  int n;
-  const sincos_t *p = &SINCOSF_TABLE[0];
+LLVM_LIBC_FUNCTION(void, sincosf, (float x, float *sinp, float *cosp)) {
+  using FPBits = typename fputil::FPBits<float>;
+  FPBits xbits(x);
 
-  if (abstop12(y) < abstop12(PIO4)) {
-    double x2 = x * x;
+  uint32_t x_abs = xbits.uintval() & 0x7fff'ffffU;
+  double xd = static_cast<double>(x);
 
-    if (unlikely(abstop12(y) < abstop12(as_float(0x39800000)))) {
-      if (unlikely(abstop12(y) < abstop12(as_float(0x800000))))
-        // Force underflow for tiny y.
-        force_eval<float>(x2);
-      *sinp = y;
+  // Range reduction:
+  // For |x| > pi/16, we perform range reduction as follows:
+  // Find k and y such that:
+  //   x = (k + y) * pi/16
+  //   k is an integer
+  //   |y| < 0.5
+  // For small range (|x| < 2^46 when FMA instructions are available, 2^22
+  // otherwise), this is done by performing:
+  //   k = round(x * 16/pi)
+  //   y = x * 16/pi - k
+  // For large range, we will omit all the higher parts of 16/pi such that the
+  // least significant bits of their full products with x are larger than 31,
+  // since:
+  //     sin((k + y + 32*i) * pi/16) = sin(x + i * 2pi) = sin(x), and
+  //     cos((k + y + 32*i) * pi/16) = cos(x + i * 2pi) = cos(x).
+  //
+  // When FMA instructions are not available, we store the digits of 16/pi in
+  // chunks of 28-bit precision.  This will make sure that the products:
+  //   x * SIXTEEN_OVER_PI_28[i] are all exact.
+  // When FMA instructions are available, we simply store the digits of 16/pi in
+  // chunks of doubles (53-bit of precision).
+  // So when multiplying by the largest values of single precision, the
+  // resulting output should be correct up to 2^(-208 + 128) ~ 2^-80.  By the
+  // worst-case analysis of range reduction, |y| >= 2^-38, so this should give
+  // us more than 40 bits of accuracy. For the worst-case estimation of range
+  // reduction, see for instances:
+  //   Elementary Functions by J-M. Muller, Chapter 11,
+  //   Handbook of Floating-Point Arithmetic by J-M. Muller et. al.,
+  //   Chapter 10.2.
+  //
+  // Once k and y are computed, we then deduce the answer by the sine and cosine
+  // of sum formulas:
+  //   sin(x) = sin((k + y)*pi/16)
+  //          = sin(y*pi/16) * cos(k*pi/16) + cos(y*pi/16) * sin(k*pi/16)
+  //   cos(x) = cos((k + y)*pi/16)
+  //          = cos(y*pi/16) * cos(k*pi/16) - sin(y*pi/16) * sin(k*pi/16)
+  // The values of sin(k*pi/16) and cos(k*pi/16) for k = 0..31 are precomputed
+  // and stored using a vector of 32 doubles. Sin(y*pi/16) and cos(y*pi/16) are
+  // computed using degree-7 and degree-8 minimax polynomials generated by
+  // Sollya respectively.
+
+  // |x| < 0x1.0p-12f
+  if (unlikely(x_abs < 0x3980'0000U)) {
+    if (unlikely(x_abs == 0U)) {
+      // For signed zeros.
+      *sinp = x;
       *cosp = 1.0f;
       return;
     }
-
-    sincosf_poly(x, x2, p, 0, sinp, cosp);
-  } else if (abstop12(y) < abstop12(120.0f)) {
-    x = reduce_fast(x, p, &n);
-
-    // Setup the signs for sin and cos.
-    s = p->sign[n & 3];
-
-    if (n & 2)
-      p = &SINCOSF_TABLE[1];
-
-    sincosf_poly(x * s, x * x, p, n, sinp, cosp);
-  } else if (likely(abstop12(y) < abstop12(INFINITY))) {
-    uint32_t xi = as_uint32_bits(y);
-    int sign = xi >> 31;
-
-    x = reduce_large(xi, &n);
-
-    // Setup signs for sin and cos - include original sign.
-    s = p->sign[(n + sign) & 3];
-
-    if ((n + sign) & 2)
-      p = &SINCOSF_TABLE[1];
-
-    sincosf_poly(x * s, x * x, p, n, sinp, cosp);
-  } else {
-    // Return NaN if Inf or NaN for both sin and cos.
-    *sinp = *cosp = y - y;
-
-    // Needed to set errno for +-Inf, the add is a hack to work
-    // around a gcc register allocation issue: just passing y
-    // affects code generation in the fast path.
-    invalid(y + y);
+    // When |x| < 2^-12, the relative errors of the approximations
+    //   sin(x) ~ x, cos(x) ~ 1
+    // are:
+    //   |sin(x) - x| / |sin(x)| < |x^3| / (6|x|)
+    //                           = x^2 / 6
+    //                           < 2^-25
+    //                           < epsilon(1)/2.
+    //   |cos(x) - 1| < |x^2 / 2| = 2^-25 < epsilon(1)/2.
+    // So the correctly rounded values of sin(x) and cos(x) are:
+    //   sin(x) = x - sign(x)*eps(x) if rounding mode = FE_TOWARDZERO,
+    //                        or (rounding mode = FE_UPWARD and x is
+    //                        negative),
+    //          = x otherwise.
+    //   cos(x) = 1 - eps(x) if rounding mode = FE_TOWARDZERO or FE_DOWWARD,
+    //          = 1 otherwise.
+    // To simplify the rounding decision and make it more efficient and to
+    // prevent compiler to perform constant folding, we use
+    //   sin(x) = fma(x, -2^-25, x),
+    //   cos(x) = fma(x*0.5f, -x, 1)
+    // instead.
+    // Note: to use the formula x - 2^-25*x to decide the correct rounding, we
+    // do need fma(x, -2^-25, x) to prevent underflow caused by -2^-25*x when
+    // |x| < 2^-125. For targets without FMA instructions, we simply use
+    // double for intermediate results as it is more efficient than using an
+    // emulated version of FMA.
+#if defined(LIBC_TARGET_HAS_FMA)
+    *sinp = fputil::multiply_add(x, -0x1.0p-25f, x);
+    *cosp = fputil::multiply_add(FPBits(x_abs).get_val(), -0x1.0p-25f, 1.0f);
+#else
+    *sinp = static_cast<float>(fputil::multiply_add(xd, -0x1.0p-25, xd));
+    *cosp = static_cast<float>(fputil::multiply_add(
+        static_cast<double>(FPBits(x_abs).get_val()), -0x1.0p-25, 1.0));
+#endif // LIBC_TARGET_HAS_FMA
+    return;
   }
+
+  // x is inf or nan.
+  if (unlikely(x_abs >= 0x7f80'0000U)) {
+    if (x_abs == 0x7f80'0000U) {
+      errno = EDOM;
+      fputil::set_except(FE_INVALID);
+    }
+    *sinp =
+        x + FPBits::build_nan(1 << (fputil::MantissaWidth<float>::VALUE - 1));
+    *cosp = *sinp;
+    return;
+  }
+
+  // Check exceptional values.
+  for (int i = 0; i < N_EXCEPTS; ++i) {
+    if (unlikely(x_abs == EXCEPT_INPUTS[i])) {
+      uint32_t s = EXCEPT_OUTPUTS_SIN[i][0]; // FE_TOWARDZERO
+      uint32_t c = EXCEPT_OUTPUTS_COS[i][0]; // FE_TOWARDZERO
+      bool x_sign = x < 0;
+      switch (fputil::get_round()) {
+      case FE_UPWARD:
+        s += x_sign ? EXCEPT_OUTPUTS_SIN[i][2] : EXCEPT_OUTPUTS_SIN[i][1];
+        c += EXCEPT_OUTPUTS_COS[i][1];
+        break;
+      case FE_DOWNWARD:
+        s += x_sign ? EXCEPT_OUTPUTS_SIN[i][1] : EXCEPT_OUTPUTS_SIN[i][2];
+        c += EXCEPT_OUTPUTS_COS[i][2];
+        break;
+      case FE_TONEAREST:
+        s += EXCEPT_OUTPUTS_SIN[i][3];
+        c += EXCEPT_OUTPUTS_COS[i][3];
+        break;
+      }
+      *sinp = x_sign ? -FPBits(s).get_val() : FPBits(s).get_val();
+      *cosp = FPBits(c).get_val();
+
+      return;
+    }
+  }
+
+  // Combine the results with the sine and cosine of sum formulas:
+  //   sin(x) = sin((k + y)*pi/16)
+  //          = sin(y*pi/16) * cos(k*pi/16) + cos(y*pi/16) * sin(k*pi/16)
+  //          = sin_y * cos_k + (1 + cosm1_y) * sin_k
+  //          = sin_y * cos_k + (cosm1_y * sin_k + sin_k)
+  //   cos(x) = cos((k + y)*pi/16)
+  //          = cos(y*pi/16) * cos(k*pi/16) - sin(y*pi/16) * sin(k*pi/16)
+  //          = cosm1_y * cos_k + sin_y * sin_k
+  //          = (cosm1_y * cos_k + cos_k) + sin_y * sin_k
+  double sin_k, cos_k, sin_y, cosm1_y;
+
+  sincosf_eval(xd, x_abs, sin_k, cos_k, sin_y, cosm1_y);
+
+  *sinp = fputil::multiply_add(sin_y, cos_k,
+                               fputil::multiply_add(cosm1_y, sin_k, sin_k));
+  *cosp = fputil::multiply_add(sin_y, -sin_k,
+                               fputil::multiply_add(cosm1_y, cos_k, cos_k));
 }
 
 } // namespace __llvm_libc
diff --git a/src/math/generic/sincosf_data.cpp b/src/math/generic/sincosf_data.cpp
deleted file mode 100644
index ac63265..0000000
--- a/src/math/generic/sincosf_data.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-//===-- sinf/cosf data tables ---------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "math_utils.h"
-#include "sincosf_utils.h"
-
-#include <stdint.h>
-
-namespace __llvm_libc {
-
-// The constants and polynomials for sine and cosine.  The 2nd entry
-// computes -cos (x) rather than cos (x) to get negation for free.
-constexpr sincos_t SINCOSF_TABLE[2] = {
-    {{1.0, -1.0, -1.0, 1.0},
-     0x1.45f306dc9c883p+23,
-     0x1.921fb54442d18p+0,
-     0x1p+0,
-     -0x1.ffffffd0c621cp-2,
-     0x1.55553e1068f19p-5,
-     -0x1.6c087e89a359dp-10,
-     0x1.99343027bf8c3p-16,
-     -0x1.555545995a603p-3,
-     0x1.1107605230bc4p-7,
-     -0x1.994eb3774cf24p-13},
-    {{1.0, -1.0, -1.0, 1.0},
-     0x1.45f306dc9c883p+23,
-     0x1.921fb54442d18p+0,
-     -0x1p+0,
-     0x1.ffffffd0c621cp-2,
-     -0x1.55553e1068f19p-5,
-     0x1.6c087e89a359dp-10,
-     -0x1.99343027bf8c3p-16,
-     -0x1.555545995a603p-3,
-     0x1.1107605230bc4p-7,
-     -0x1.994eb3774cf24p-13},
-};
-
-// Table with 4/PI to 192 bit precision.  To avoid unaligned accesses
-// only 8 new bits are added per entry, making the table 4 times larger.
-constexpr uint32_t INV_PIO4[24] = {
-    0xa2,       0xa2f9,     0xa2f983,   0xa2f9836e, 0xf9836e4e, 0x836e4e44,
-    0x6e4e4415, 0x4e441529, 0x441529fc, 0x1529fc27, 0x29fc2757, 0xfc2757d1,
-    0x2757d1f5, 0x57d1f534, 0xd1f534dd, 0xf534ddc0, 0x34ddc0db, 0xddc0db62,
-    0xc0db6295, 0xdb629599, 0x6295993c, 0x95993c43, 0x993c4390, 0x3c439041};
-
-} // namespace __llvm_libc
diff --git a/src/math/generic/sincosf_utils.h b/src/math/generic/sincosf_utils.h
index 5ddcc9c..06064c8 100644
--- a/src/math/generic/sincosf_utils.h
+++ b/src/math/generic/sincosf_utils.h
@@ -1,4 +1,4 @@
-//===-- Collection of utils for cosf/sinf/sincosf ---------------*- C++ -*-===//
+//===-- Collection of utils for sinf/cosf/sincosf ---------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -9,132 +9,78 @@
 #ifndef LLVM_LIBC_SRC_MATH_SINCOSF_UTILS_H
 #define LLVM_LIBC_SRC_MATH_SINCOSF_UTILS_H
 
-#include "math_utils.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/common.h"
 
-#include <stdint.h>
+#if defined(LIBC_TARGET_HAS_FMA)
+#include "range_reduction_fma.h"
+// using namespace __llvm_libc::fma;
+using __llvm_libc::fma::FAST_PASS_BOUND;
+using __llvm_libc::fma::large_range_reduction;
+using __llvm_libc::fma::small_range_reduction;
+#else
+#include "range_reduction.h"
+// using namespace __llvm_libc::generic;
+using __llvm_libc::generic::FAST_PASS_BOUND;
+using __llvm_libc::generic::large_range_reduction;
+using __llvm_libc::generic::small_range_reduction;
+#endif // LIBC_TARGET_HAS_FMA
 
 namespace __llvm_libc {
 
-// 2PI * 2^-64.
-static constexpr double PI63 = 0x1.921fb54442d18p-62;
-// PI / 4.
-static constexpr double PIO4 = 0x1.921fb54442d18p-1;
+// Lookup table for sin(k * pi / 16) with k = 0, ..., 31.
+// Table is generated with Sollya as follow:
+// > display = hexadecimal;
+// > for k from 0 to 31 do { D(sin(k * pi/16)); };
+const double SIN_K_PI_OVER_16[32] = {
+    0x0.0000000000000p+0,  0x1.8f8b83c69a60bp-3,  0x1.87de2a6aea963p-2,
+    0x1.1c73b39ae68c8p-1,  0x1.6a09e667f3bcdp-1,  0x1.a9b66290ea1a3p-1,
+    0x1.d906bcf328d46p-1,  0x1.f6297cff75cb0p-1,  0x1.0000000000000p+0,
+    0x1.f6297cff75cb0p-1,  0x1.d906bcf328d46p-1,  0x1.a9b66290ea1a3p-1,
+    0x1.6a09e667f3bcdp-1,  0x1.1c73b39ae68c8p-1,  0x1.87de2a6aea963p-2,
+    0x1.8f8b83c69a60bp-3,  0x0.0000000000000p+0,  -0x1.8f8b83c69a60bp-3,
+    -0x1.87de2a6aea963p-2, -0x1.1c73b39ae68c8p-1, -0x1.6a09e667f3bcdp-1,
+    -0x1.a9b66290ea1a3p-1, -0x1.d906bcf328d46p-1, -0x1.f6297cff75cb0p-1,
+    -0x1.0000000000000p+0, -0x1.f6297cff75cb0p-1, -0x1.d906bcf328d46p-1,
+    -0x1.a9b66290ea1a3p-1, -0x1.6a09e667f3bcdp-1, -0x1.1c73b39ae68c8p-1,
+    -0x1.87de2a6aea963p-2, -0x1.8f8b83c69a60bp-3};
 
-// The constants and polynomials for sine and cosine.
-typedef struct {
-  double sign[4];            // Sign of sine in quadrants 0..3.
-  double hpi_inv;            // 2 / PI ( * 2^24 ).
-  double hpi;                // PI / 2.
-  double c0, c1, c2, c3, c4; // Cosine polynomial.
-  double s1, s2, s3;         // Sine polynomial.
-} sincos_t;
+static inline void sincosf_eval(double xd, uint32_t x_abs, double &sin_k,
+                                double &cos_k, double &sin_y, double &cosm1_y) {
+  int64_t k;
+  double y;
 
-// Polynomial data (the cosine polynomial is negated in the 2nd entry).
-extern const sincos_t SINCOSF_TABLE[2];
-
-// Table with 4/PI to 192 bit precision.
-extern const uint32_t INV_PIO4[];
-
-// Top 12 bits of the float representation with the sign bit cleared.
-static inline uint32_t abstop12(float x) {
-  return (as_uint32_bits(x) >> 20) & 0x7ff;
-}
-
-// Compute the sine and cosine of inputs X and X2 (X squared), using the
-// polynomial P and store the results in SINP and COSP. N is the quadrant,
-// if odd the cosine and sine polynomials are swapped.
-static inline void sincosf_poly(double x, double x2, const sincos_t *p, int n,
-                                float *sinp, float *cosp) {
-  double x3, x4, x5, x6, s, c, c1, c2, s1;
-
-  x4 = x2 * x2;
-  x3 = x2 * x;
-  c2 = p->c3 + x2 * p->c4;
-  s1 = p->s2 + x2 * p->s3;
-
-  // Swap sin/cos result based on quadrant.
-  float *tmp = (n & 1 ? cosp : sinp);
-  cosp = (n & 1 ? sinp : cosp);
-  sinp = tmp;
-
-  c1 = p->c0 + x2 * p->c1;
-  x5 = x3 * x2;
-  x6 = x4 * x2;
-
-  s = x + x3 * p->s1;
-  c = c1 + x4 * p->c2;
-
-  *sinp = s + x5 * s1;
-  *cosp = c + x6 * c2;
-}
-
-// Return the sine of inputs X and X2 (X squared) using the polynomial P.
-// N is the quadrant, and if odd the cosine polynomial is used.
-static inline float sinf_poly(double x, double x2, const sincos_t *p, int n) {
-  double x3, x4, x6, x7, s, c, c1, c2, s1;
-
-  if ((n & 1) == 0) {
-    x3 = x * x2;
-    s1 = p->s2 + x2 * p->s3;
-
-    x7 = x3 * x2;
-    s = x + x3 * p->s1;
-
-    return s + x7 * s1;
+  if (likely(x_abs < FAST_PASS_BOUND)) {
+    k = small_range_reduction(xd, y);
   } else {
-    x4 = x2 * x2;
-    c2 = p->c3 + x2 * p->c4;
-    c1 = p->c0 + x2 * p->c1;
-
-    x6 = x4 * x2;
-    c = c1 + x4 * p->c2;
-
-    return c + x6 * c2;
+    fputil::FPBits<float> x_bits(x_abs);
+    k = large_range_reduction(xd, x_bits.get_exponent(), y);
   }
-}
 
-// Fast range reduction using single multiply-subtract. Return the modulo of
-// X as a value between -PI/4 and PI/4 and store the quadrant in NP.
-// The values for PI/2 and 2/PI are accessed via P. Since PI/2 as a double
-// is accurate to 55 bits and the worst-case cancellation happens at 6 * PI/4,
-// the result is accurate for |X| <= 120.0.
-static inline double reduce_fast(double x, const sincos_t *p, int *np) {
-  double r;
-  // Use scaled float to int conversion with explicit rounding.
-  // hpi_inv is prescaled by 2^24 so the quadrant ends up in bits 24..31.
-  // This avoids inaccuracies introduced by truncating negative values.
-  r = x * p->hpi_inv;
-  int n = ((int32_t)r + 0x800000) >> 24;
-  *np = n;
-  return x - n * p->hpi;
-}
+  // After range reduction, k = round(x * 16 / pi) and y = (x * 16 / pi) - k.
+  // So k is an integer and -0.5 <= y <= 0.5.
+  // Then sin(x) = sin((k + y)*pi/16)
+  //             = sin(y*pi/16) * cos(k*pi/16) + cos(y*pi/16) * sin(k*pi/16)
 
-// Reduce the range of XI to a multiple of PI/2 using fast integer arithmetic.
-// XI is a reinterpreted float and must be >= 2.0f (the sign bit is ignored).
-// Return the modulo between -PI/4 and PI/4 and store the quadrant in NP.
-// Reduction uses a table of 4/PI with 192 bits of precision. A 32x96->128 bit
-// multiply computes the exact 2.62-bit fixed-point modulo. Since the result
-// can have at most 29 leading zeros after the binary point, the double
-// precision result is accurate to 33 bits.
-static inline double reduce_large(uint32_t xi, int *np) {
-  const uint32_t *arr = &INV_PIO4[(xi >> 26) & 15];
-  int shift = (xi >> 23) & 7;
-  uint64_t n, res0, res1, res2;
+  sin_k = SIN_K_PI_OVER_16[k & 31];
+  // cos(k * pi/16) = sin(k * pi/16 + pi/2) = sin((k + 8) * pi/16).
+  // cos_k = y * cos(k * pi/16)
+  cos_k = SIN_K_PI_OVER_16[(k + 8) & 31];
 
-  xi = (xi & 0xffffff) | 0x800000;
-  xi <<= shift;
+  double ysq = y * y;
 
-  res0 = xi * arr[0];
-  res1 = (uint64_t)xi * arr[4];
-  res2 = (uint64_t)xi * arr[8];
-  res0 = (res2 >> 32) | (res0 << 32);
-  res0 += res1;
-
-  n = (res0 + (1ULL << 61)) >> 62;
-  res0 -= n << 62;
-  double x = (int64_t)res0;
-  *np = n;
-  return x * PI63;
+  // Degree-6 minimax even polynomial for sin(y*pi/16)/y generated by Sollya
+  // with:
+  // > Q = fpminimax(sin(y*pi/16)/y, [|0, 2, 4, 6|], [|D...|], [0, 0.5]);
+  sin_y = y * fputil::polyeval(ysq, 0x1.921fb54442d17p-3, -0x1.4abbce6256adp-10,
+                               0x1.466bc5a5ac6b3p-19, -0x1.32bdcb4207562p-29);
+  // Degree-8 minimax even polynomial for cos(y*pi/16) generated by Sollya with:
+  // > P = fpminimax(cos(x*pi/16), [|0, 2, 4, 6, 8|], [|1, D...|], [0, 0.5]);
+  // Note that cosm1_y = cos(y*pi/16) - 1.
+  cosm1_y =
+      ysq * fputil::polyeval(ysq, -0x1.3bd3cc9be45dcp-6, 0x1.03c1f081b08ap-14,
+                             -0x1.55d3c6fb0fb6ep-24, 0x1.e1d3d60f58873p-35);
 }
 
 } // namespace __llvm_libc
diff --git a/src/math/generic/sinf.cpp b/src/math/generic/sinf.cpp
index 4449403..bc725ea 100644
--- a/src/math/generic/sinf.cpp
+++ b/src/math/generic/sinf.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/sinf.h"
-#include "common_constants.h"
+#include "sincosf_utils.h"
 #include "src/__support/FPUtil/BasicOperations.h"
 #include "src/__support/FPUtil/FEnvImpl.h"
 #include "src/__support/FPUtil/FPBits.h"
@@ -21,19 +21,13 @@
 #if defined(LIBC_TARGET_HAS_FMA)
 #include "range_reduction_fma.h"
 // using namespace __llvm_libc::fma;
-using __llvm_libc::fma::FAST_PASS_BOUND;
-using __llvm_libc::fma::large_range_reduction;
 using __llvm_libc::fma::N_EXCEPTS;
 using __llvm_libc::fma::SinfExcepts;
-using __llvm_libc::fma::small_range_reduction;
 #else
 #include "range_reduction.h"
 // using namespace __llvm_libc::generic;
-using __llvm_libc::generic::FAST_PASS_BOUND;
-using __llvm_libc::generic::large_range_reduction;
 using __llvm_libc::generic::N_EXCEPTS;
 using __llvm_libc::generic::SinfExcepts;
-using __llvm_libc::generic::small_range_reduction;
 #endif
 
 namespace __llvm_libc {
@@ -143,55 +137,24 @@
       return result;
   }
 
-  int k;
-  double y;
-
-  if (likely(x_abs < FAST_PASS_BOUND)) {
-    k = small_range_reduction(xd, y);
-  } else {
-    // x is inf or nan.
-    if (unlikely(x_abs >= 0x7f80'0000U)) {
-      if (x_abs == 0x7f80'0000U) {
-        errno = EDOM;
-        fputil::set_except(FE_INVALID);
-      }
-      return x +
-             FPBits::build_nan(1 << (fputil::MantissaWidth<float>::VALUE - 1));
+  if (unlikely(x_abs >= 0x7f80'0000U)) {
+    if (x_abs == 0x7f80'0000U) {
+      errno = EDOM;
+      fputil::set_except(FE_INVALID);
     }
-
-    k = large_range_reduction(xd, xbits.get_exponent(), y);
+    return x +
+           FPBits::build_nan(1 << (fputil::MantissaWidth<float>::VALUE - 1));
   }
 
-  // After range reduction, k = round(x * 16 / pi) and y = (x * 16 / pi) - k.
-  // So k is an integer and -0.5 <= y <= 0.5.
-  // Then sin(x) = sin((k + y)*pi/16)
-  //             = sin(y*pi/16) * cos(k*pi/16) + cos(y*pi/16) * sin(k*pi/16)
-
-  double ysq = y * y;
-
-  // Degree-6 minimax even polynomial for sin(y*pi/16)/y generated by Sollya
-  // with:
-  // > Q = fpminimax(sin(y*pi/16)/y, [|0, 2, 4, 6|], [|D...|], [0, 0.5]);
-  double sin_y =
-      fputil::polyeval(ysq, 0x1.921fb54442d17p-3, -0x1.4abbce6256adp-10,
-                       0x1.466bc5a5ac6b3p-19, -0x1.32bdcb4207562p-29);
-  // Degree-8 minimax even polynomial for cos(y*pi/16) generated by Sollya with:
-  // > P = fpminimax(cos(x*pi/16), [|0, 2, 4, 6, 8|], [|1, D...|], [0, 0.5]);
-  // Note that cosm1_y = cos(y*pi/16) - 1.
-  double cosm1_y =
-      ysq * fputil::polyeval(ysq, -0x1.3bd3cc9be45dcp-6, 0x1.03c1f081b08ap-14,
-                             -0x1.55d3c6fb0fb6ep-24, 0x1.e1d3d60f58873p-35);
-
-  double sin_k = SIN_K_PI_OVER_16[k & 31];
-  // cos(k * pi/16) = sin(k * pi/16 + pi/2) = sin((k + 8) * pi/16).
-  // cos_k = y * cos(k * pi/16)
-  double cos_k = y * SIN_K_PI_OVER_16[(k + 8) & 31];
-
   // Combine the results with the sine of sum formula:
   //   sin(x) = sin((k + y)*pi/16)
   //          = sin(y*pi/16) * cos(k*pi/16) + cos(y*pi/16) * sin(k*pi/16)
   //          = sin_y * cos_k + (1 + cosm1_y) * sin_k
   //          = sin_y * cos_k + (cosm1_y * sin_k + sin_k)
+  double sin_k, cos_k, sin_y, cosm1_y;
+
+  sincosf_eval(xd, x_abs, sin_k, cos_k, sin_y, cosm1_y);
+
   return fputil::multiply_add(sin_y, cos_k,
                               fputil::multiply_add(cosm1_y, sin_k, sin_k));
 }
diff --git a/test/src/math/exhaustive/CMakeLists.txt b/test/src/math/exhaustive/CMakeLists.txt
index 5505222..c7e073e 100644
--- a/test/src/math/exhaustive/CMakeLists.txt
+++ b/test/src/math/exhaustive/CMakeLists.txt
@@ -56,6 +56,23 @@
 )
 
 add_fp_unittest(
+  sincosf_test
+  NO_RUN_POSTBUILD
+  NEED_MPFR
+  SUITE
+    libc_math_exhaustive_tests
+  SRCS
+    sincosf_test.cpp
+  DEPENDS
+    .exhaustive_test
+    libc.include.math
+    libc.src.math.sincosf
+    libc.src.__support.FPUtil.fputil
+  LINK_LIBRARIES
+    -lpthread
+)
+
+add_fp_unittest(
   expf_test
   NO_RUN_POSTBUILD
   NEED_MPFR
diff --git a/test/src/math/exhaustive/sincosf_test.cpp b/test/src/math/exhaustive/sincosf_test.cpp
new file mode 100644
index 0000000..bbd14e6
--- /dev/null
+++ b/test/src/math/exhaustive/sincosf_test.cpp
@@ -0,0 +1,77 @@
+//===-- Exhaustive test for sincosf ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "exhaustive_test.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/sincosf.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+#include "utils/UnitTest/FPMatcher.h"
+
+#include <thread>
+
+using FPBits = __llvm_libc::fputil::FPBits<float>;
+
+namespace mpfr = __llvm_libc::testing::mpfr;
+
+struct LlvmLibcSinCosfExhaustiveTest : public LlvmLibcExhaustiveTest<uint32_t> {
+  bool check(uint32_t start, uint32_t stop,
+             mpfr::RoundingMode rounding) override {
+    mpfr::ForceRoundingMode r(rounding);
+    uint32_t bits = start;
+    bool result = true;
+    do {
+      FPBits xbits(bits);
+      float x = float(xbits);
+      float sinx, cosx;
+      __llvm_libc::sincosf(x, &sinx, &cosx);
+      result &= EXPECT_MPFR_MATCH(mpfr::Operation::Sin, x, sinx, 0.5, rounding);
+      result &= EXPECT_MPFR_MATCH(mpfr::Operation::Cos, x, cosx, 0.5, rounding);
+    } while (++bits < stop);
+    return result;
+  }
+};
+
+// Range: [0, +Inf);
+static constexpr uint32_t POS_START = 0x0000'0000U;
+static constexpr uint32_t POS_STOP = 0x7f80'0000U;
+
+TEST_F(LlvmLibcSinCosfExhaustiveTest, PostiveRangeRoundNearestTieToEven) {
+  test_full_range(POS_START, POS_STOP, mpfr::RoundingMode::Nearest);
+}
+
+TEST_F(LlvmLibcSinCosfExhaustiveTest, PostiveRangeRoundUp) {
+  test_full_range(POS_START, POS_STOP, mpfr::RoundingMode::Upward);
+}
+
+TEST_F(LlvmLibcSinCosfExhaustiveTest, PostiveRangeRoundDown) {
+  test_full_range(POS_START, POS_STOP, mpfr::RoundingMode::Downward);
+}
+
+TEST_F(LlvmLibcSinCosfExhaustiveTest, PostiveRangeRoundTowardZero) {
+  test_full_range(POS_START, POS_STOP, mpfr::RoundingMode::TowardZero);
+}
+
+// Range: (-Inf, 0];
+static constexpr uint32_t NEG_START = 0x8000'0000U;
+static constexpr uint32_t NEG_STOP = 0xff80'0000U;
+
+TEST_F(LlvmLibcSinCosfExhaustiveTest, NegativeRangeRoundNearestTieToEven) {
+  test_full_range(NEG_START, NEG_STOP, mpfr::RoundingMode::Nearest);
+}
+
+TEST_F(LlvmLibcSinCosfExhaustiveTest, NegativeRangeRoundUp) {
+  test_full_range(NEG_START, NEG_STOP, mpfr::RoundingMode::Upward);
+}
+
+TEST_F(LlvmLibcSinCosfExhaustiveTest, NegativeRangeRoundDown) {
+  test_full_range(NEG_START, NEG_STOP, mpfr::RoundingMode::Downward);
+}
+
+TEST_F(LlvmLibcSinCosfExhaustiveTest, NegativeRangeRoundTowardZero) {
+  test_full_range(NEG_START, NEG_STOP, mpfr::RoundingMode::TowardZero);
+}
diff --git a/test/src/math/sincosf_test.cpp b/test/src/math/sincosf_test.cpp
index f23d719..559dec0 100644
--- a/test/src/math/sincosf_test.cpp
+++ b/test/src/math/sincosf_test.cpp
@@ -54,6 +54,40 @@
   EXPECT_MATH_ERRNO(EDOM);
 }
 
+#define EXPECT_SINCOS_MATCH_ALL_ROUNDING(input)                                \
+  {                                                                            \
+    float sin, cos;                                                            \
+    namespace mpfr = __llvm_libc::testing::mpfr;                               \
+                                                                               \
+    mpfr::ForceRoundingMode __r1(mpfr::RoundingMode::Nearest);                 \
+    __llvm_libc::sincosf(input, &sin, &cos);                                   \
+    EXPECT_MPFR_MATCH(mpfr::Operation::Sin, input, sin, 0.5,                   \
+                      mpfr::RoundingMode::Nearest);                            \
+    EXPECT_MPFR_MATCH(mpfr::Operation::Cos, input, cos, 0.5,                   \
+                      mpfr::RoundingMode::Nearest);                            \
+                                                                               \
+    mpfr::ForceRoundingMode __r2(mpfr::RoundingMode::Upward);                  \
+    __llvm_libc::sincosf(input, &sin, &cos);                                   \
+    EXPECT_MPFR_MATCH(mpfr::Operation::Sin, input, sin, 0.5,                   \
+                      mpfr::RoundingMode::Upward);                             \
+    EXPECT_MPFR_MATCH(mpfr::Operation::Cos, input, cos, 0.5,                   \
+                      mpfr::RoundingMode::Upward);                             \
+                                                                               \
+    mpfr::ForceRoundingMode __r3(mpfr::RoundingMode::Downward);                \
+    __llvm_libc::sincosf(input, &sin, &cos);                                   \
+    EXPECT_MPFR_MATCH(mpfr::Operation::Sin, input, sin, 0.5,                   \
+                      mpfr::RoundingMode::Downward);                           \
+    EXPECT_MPFR_MATCH(mpfr::Operation::Cos, input, cos, 0.5,                   \
+                      mpfr::RoundingMode::Downward);                           \
+                                                                               \
+    mpfr::ForceRoundingMode __r4(mpfr::RoundingMode::TowardZero);              \
+    __llvm_libc::sincosf(input, &sin, &cos);                                   \
+    EXPECT_MPFR_MATCH(mpfr::Operation::Sin, input, sin, 0.5,                   \
+                      mpfr::RoundingMode::TowardZero);                         \
+    EXPECT_MPFR_MATCH(mpfr::Operation::Cos, input, cos, 0.5,                   \
+                      mpfr::RoundingMode::TowardZero);                         \
+  }
+
 TEST(LlvmLibcSinCosfTest, InFloatRange) {
   constexpr uint32_t COUNT = 1000000;
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
@@ -62,31 +96,65 @@
     if (isnan(x) || isinf(x))
       continue;
 
-    float sin, cos;
-    __llvm_libc::sincosf(x, &sin, &cos);
-    ASSERT_MPFR_MATCH(mpfr::Operation::Cos, x, cos, 1.0);
-    ASSERT_MPFR_MATCH(mpfr::Operation::Sin, x, sin, 1.0);
+    EXPECT_SINCOS_MATCH_ALL_ROUNDING(x);
+    EXPECT_SINCOS_MATCH_ALL_ROUNDING(-x);
   }
 }
 
-// For small values, cos(x) is 1 and sin(x) is x.
-TEST(LlvmLibcSinCosfTest, SmallValues) {
-  uint32_t bits = 0x17800000;
-  float x = float(FPBits((bits)));
-  float result_cos, result_sin;
-  __llvm_libc::sincosf(x, &result_sin, &result_cos);
-  EXPECT_MPFR_MATCH(mpfr::Operation::Cos, x, result_cos, 1.0);
-  EXPECT_MPFR_MATCH(mpfr::Operation::Sin, x, result_sin, 1.0);
-  EXPECT_FP_EQ(1.0f, result_cos);
-  EXPECT_FP_EQ(x, result_sin);
+// For hard to round inputs.
+TEST(LlvmLibcSinCosfTest, SpecialValues) {
+  constexpr int N = 43;
+  constexpr uint32_t INPUTS[N] = {
+      0x3b56'37f5U, // x = 0x1.ac6feap-9f
+      0x3f06'0a92U, // x = pi/6
+      0x3f3a'dc51U, // x = 0x1.75b8a2p-1f
+      0x3f49'0fdbU, // x = pi/4
+      0x3f86'0a92U, // x = pi/3
+      0x3fa7'832aU, // x = 0x1.4f0654p+0f
+      0x3fc9'0fdbU, // x = pi/2
+      0x4017'1973U, // x = 0x1.2e32e6p+1f
+      0x4049'0fdbU, // x = pi
+      0x4096'cbe4U, // x = 0x1.2d97c8p+2f
+      0x40c9'0fdbU, // x = 2*pi
+      0x433b'7490U, // x = 0x1.76e92p+7f
+      0x437c'e5f1U, // x = 0x1.f9cbe2p+7f
+      0x4619'9998U, // x = 0x1.33333p+13f
+      0x474d'246fU, // x = 0x1.9a48dep+15f
+      0x4afd'ece4U, // x = 0x1.fbd9c8p+22f
+      0x4c23'32e9U, // x = 0x1.4665d2p+25f
+      0x50a3'e87fU, // x = 0x1.47d0fep+34f
+      0x5239'47f6U, // x = 0x1.728fecp+37f
+      0x53b1'46a6U, // x = 0x1.628d4cp+40f
+      0x5532'5019U, // x = 0x1.64a032p+43f
+      0x55ca'fb2aU, // x = 0x1.95f654p+44f
+      0x588e'f060U, // x = 0x1.1de0cp+50f
+      0x5922'aa80U, // x = 0x1.4555p+51f
+      0x5aa4'542cU, // x = 0x1.48a858p+54f
+      0x5c07'bcd0U, // x = 0x1.0f79ap+57f
+      0x5ebc'fddeU, // x = 0x1.79fbbcp+62f
+      0x5f18'b878U, // x = 0x1.3170fp+63f
+      0x5fa6'eba7U, // x = 0x1.4dd74ep+64f
+      0x6115'cb11U, // x = 0x1.2b9622p+67f
+      0x61a4'0b40U, // x = 0x1.48168p+68f
+      0x6386'134eU, // x = 0x1.0c269cp+72f
+      0x6589'8498U, // x = 0x1.13093p+76f
+      0x6600'0001U, // x = 0x1.000002p+77f
+      0x664e'46e4U, // x = 0x1.9c8dc8p+77f
+      0x66b0'14aaU, // x = 0x1.602954p+78f
+      0x67a9'242bU, // x = 0x1.524856p+80f
+      0x6a19'76f1U, // x = 0x1.32ede2p+85f
+      0x6c55'da58U, // x = 0x1.abb4bp+89f
+      0x6f79'be45U, // x = 0x1.f37c8ap+95f
+      0x7276'69d4U, // x = 0x1.ecd3a8p+101f
+      0x7758'4625U, // x = 0x1.b08c4ap+111f
+      0x7bee'f5efU, // x = 0x1.ddebdep+120f
+  };
 
-  bits = 0x00400000;
-  x = float(FPBits((bits)));
-  __llvm_libc::sincosf(x, &result_sin, &result_cos);
-  EXPECT_MPFR_MATCH(mpfr::Operation::Cos, x, result_cos, 1.0);
-  EXPECT_MPFR_MATCH(mpfr::Operation::Sin, x, result_sin, 1.0);
-  EXPECT_FP_EQ(1.0f, result_cos);
-  EXPECT_FP_EQ(x, result_sin);
+  for (int i = 0; i < N; ++i) {
+    float x = float(FPBits(INPUTS[i]));
+    EXPECT_SINCOS_MATCH_ALL_ROUNDING(x);
+    EXPECT_SINCOS_MATCH_ALL_ROUNDING(-x);
+  }
 }
 
 // SDCOMP-26094: check sinf in the cases for which the range reducer
@@ -94,9 +162,7 @@
 TEST(LlvmLibcSinCosfTest, SDCOMP_26094) {
   for (uint32_t v : SDCOMP26094_VALUES) {
     float x = float(FPBits((v)));
-    float sin, cos;
-    __llvm_libc::sincosf(x, &sin, &cos);
-    EXPECT_MPFR_MATCH(mpfr::Operation::Cos, x, cos, 1.0);
-    EXPECT_MPFR_MATCH(mpfr::Operation::Sin, x, sin, 1.0);
+    EXPECT_SINCOS_MATCH_ALL_ROUNDING(x);
+    EXPECT_SINCOS_MATCH_ALL_ROUNDING(-x);
   }
 }