blob: 8e3a92c092e223f928e6b9f208358087ada44fc6 [file] [log] [blame] [edit]
//===-- Implementation header for erff16 ------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ERFF16_H
#define LLVM_LIBC_SRC___SUPPORT_MATH_ERFF16_H
#include "include/llvm-libc-macros/float16-macros.h"
#ifdef LIBC_TYPES_HAS_FLOAT16
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/FPUtil/except_value_utils.h"
#include "src/__support/FPUtil/multiply_add.h"
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
namespace LIBC_NAMESPACE_DECL {
namespace math {
LIBC_INLINE float16 erff16(float16 x) {
// Polynomials approximating erf(x)/x on ( k/8, (k + 1)/8 ) generated by
// Sollya with: > P = fpminimax(erf(x)/x, [|0, 2, 4, 6, 8, 10, 12, 14|],
// [|D...|],
// [k/8, (k + 1)/8]);
// for k = 0..31.
constexpr float ERFF16_COEFFS[32][8] = {
{0x1.20dd76p+0f, -0x1.812746p-2f, 0x1.ce2f22p-4f, -0x1.b82cdap-6f,
0x1.564792p-8f, -0x1.8b3ac6p-11f, -0x1.126fcap-8f, 0x1.2d0bdcp-4f},
{0x1.20dd76p+0f, -0x1.812746p-2f, 0x1.ce2f22p-4f, -0x1.b82ce4p-6f,
0x1.565bcap-8f, -0x1.c02c66p-11f, 0x1.f92f68p-14f, -0x1.def402p-17f},
{0x1.20dd76p+0f, -0x1.812746p-2f, 0x1.ce2f22p-4f, -0x1.b82ce2p-6f,
0x1.565b9ep-8f, -0x1.c021f2p-11f, 0x1.f7c6d2p-14f, -0x1.c9e442p-17f},
{0x1.20dd76p+0f, -0x1.812746p-2f, 0x1.ce2f22p-4f, -0x1.b82cccp-6f,
0x1.56597p-8f, -0x1.bfde2p-11f, 0x1.f31a9ep-14f, -0x1.a5a436p-17f},
{0x1.20dd76p+0f, -0x1.812746p-2f, 0x1.ce2f18p-4f, -0x1.b82be4p-6f,
0x1.564becp-8f, -0x1.bee87p-11f, 0x1.e94436p-14f, -0x1.79c0f2p-17f},
{0x1.20dd74p+0f, -0x1.812744p-2f, 0x1.ce2ecp-4f, -0x1.b8265cp-6f,
0x1.5615a2p-8f, -0x1.bc63aep-11f, 0x1.d87c22p-14f, -0x1.49584cp-17f},
{0x1.20dd74p+0f, -0x1.81272ap-2f, 0x1.ce2cbp-4f, -0x1.b80ecap-6f,
0x1.5572e2p-8f, -0x1.b715e6p-11f, 0x1.bfbb1ap-14f, -0x1.177a56p-17f},
{0x1.20dd7p+0f, -0x1.812692p-2f, 0x1.ce23ap-4f, -0x1.b7c1dcp-6f,
0x1.53e93p-8f, -0x1.ad97ccp-11f, 0x1.9f028cp-14f, -0x1.cdc4dap-18f},
{0x1.20dd58p+0f, -0x1.8123e6p-2f, 0x1.ce0458p-4f, -0x1.b6f52ep-6f,
0x1.50c292p-8f, -0x1.9ea246p-11f, 0x1.776546p-14f, -0x1.737c12p-18f},
{0x1.20dce6p+0f, -0x1.811a5ap-2f, 0x1.cdab54p-4f, -0x1.b526d2p-6f,
0x1.4b1d32p-8f, -0x1.896314p-11f, 0x1.4ad57p-14f, -0x1.231e1p-18f},
{0x1.20db48p+0f, -0x1.80fdd8p-2f, 0x1.ccd34p-4f, -0x1.b196a2p-6f,
0x1.4210c2p-8f, -0x1.6dbdfcp-11f, 0x1.1bca2ep-14f, -0x1.bca37p-19f},
{0x1.20d64cp+0f, -0x1.80b4d4p-2f, 0x1.cb0882p-4f, -0x1.ab51fep-6f,
0x1.34e1e6p-8f, -0x1.4c6638p-11f, 0x1.d9ad26p-15f, -0x1.4b0df8p-19f},
{0x1.20c8fcp+0f, -0x1.8010ccp-2f, 0x1.c7a47ep-4f, -0x1.a155bep-6f,
0x1.233502p-8f, -0x1.26c94cp-11f, 0x1.8094f2p-15f, -0x1.e0e3d8p-20f},
{0x1.20a9bep+0f, -0x1.7ec7fcp-2f, 0x1.c1d758p-4f, -0x1.92c16p-6f,
0x1.0d3072p-8f, -0x1.fda5bp-12f, 0x1.2fdd7cp-15f, -0x1.54eed4p-20f},
{0x1.206828p+0f, -0x1.7c73f8p-2f, 0x1.b8c2dcp-4f, -0x1.7f0e5p-6f,
0x1.e7061ep-9f, -0x1.ad36e8p-12f, 0x1.d39222p-16f, -0x1.d83dacp-21f},
{0x1.1feb8ep+0f, -0x1.789834p-2f, 0x1.aba346p-4f, -0x1.663adcp-6f,
0x1.ae99fcp-9f, -0x1.602f96p-12f, 0x1.5e9718p-16f, -0x1.3fca1p-21f},
{0x1.1f12fep+0f, -0x1.72b1d2p-2f, 0x1.99fc0ep-4f, -0x1.48db0ap-6f,
0x1.73e368p-9f, -0x1.19b35ep-12f, 0x1.007988p-16f, -0x1.a7edcep-22f},
{0x1.1db7bp+0f, -0x1.6a4e4ap-2f, 0x1.83bbdep-4f, -0x1.2809b4p-6f,
0x1.39c08cp-9f, -0x1.b7b45ap-13f, 0x1.6e99b4p-17f, -0x1.13619cp-22f},
{0x1.1bb1c8p+0f, -0x1.5f23bap-2f, 0x1.694c92p-4f, -0x1.053e1cp-6f,
0x1.02bf72p-9f, -0x1.4f479p-13f, 0x1.005f8p-17f, -0x1.5f2446p-23f},
{0x1.18dec4p+0f, -0x1.5123f6p-2f, 0x1.4b8a1cp-4f, -0x1.c4243p-7f,
0x1.a1a8ap-10f, -0x1.f466b4p-14f, 0x1.5f835ep-18f, -0x1.b83166p-24f},
{0x1.152804p+0f, -0x1.4084cep-2f, 0x1.2ba2e8p-4f, -0x1.800f2ep-7f,
0x1.4a6dbp-10f, -0x1.6e326ap-14f, 0x1.d9761ap-19f, -0x1.0fca34p-24f},
{0x1.1087aep+0f, -0x1.2dbb04p-2f, 0x1.0aea8cp-4f, -0x1.40b516p-7f,
0x1.00c9ep-10f, -0x1.076afcp-14f, 0x1.39fadep-19f, -0x1.4b5762p-25f},
{0x1.0b0a7ap+0f, -0x1.19699p-2f, 0x1.d5551ep-5f, -0x1.07cce2p-7f,
0x1.890348p-11f, -0x1.757ecap-15f, 0x1.9b258ap-20f, -0x1.8fc6d2p-26f},
{0x1.04ce2cp+0f, -0x1.0449e4p-2f, 0x1.97f742p-5f, -0x1.ac8254p-8f,
0x1.28f5f6p-11f, -0x1.05b69ap-15f, 0x1.0a888ep-20f, -0x1.deace2p-27f},
{0x1.fbf9fcp-1f, -0x1.de264p-3f, 0x1.5f5b2p-5f, -0x1.588bc8p-8f,
0x1.bc6a0ap-12f, -0x1.6b9faep-16f, 0x1.573204p-21f, -0x1.1d3806p-27f},
{0x1.ed8f18p-1f, -0x1.b4cb6ap-3f, 0x1.2c7f3ep-5f, -0x1.13052p-8f,
0x1.4a5028p-12f, -0x1.f672bap-17f, 0x1.b83c76p-22f, -0x1.534f1ap-28f},
{0x1.debd34p-1f, -0x1.8d7cdap-3f, 0x1.ff9958p-6f, -0x1.b50be6p-9f,
0x1.e92c8ep-13f, -0x1.5a4b88p-17f, 0x1.1a2774p-22f, -0x1.942ae6p-29f},
{0x1.cfdbfp-1f, -0x1.68e33ep-3f, 0x1.b2683ep-6f, -0x1.5a9174p-9f,
0x1.69ddd4p-13f, -0x1.dd8f3ap-18f, 0x1.6a755p-23f, -0x1.e366ep-30f},
{0x1.c132aep-1f, -0x1.475a8ap-3f, 0x1.70a432p-6f, -0x1.12e3d4p-9f,
0x1.0c16bp-13f, -0x1.4a47f8p-18f, 0x1.d3d494p-24f, -0x1.2302c6p-30f},
{0x1.b2f5fep-1f, -0x1.28fefp-3f, 0x1.3923acp-6f, -0x1.b4ff7ap-10f,
0x1.8ea0ecp-14f, -0x1.cb31ecp-19f, 0x1.30011ep-24f, -0x1.61771p-31f},
{0x1.a54854p-1f, -0x1.0dbdbap-3f, 0x1.0a93e2p-6f, -0x1.5c96ap-10f,
0x1.29e0ccp-14f, -0x1.4160d8p-19f, 0x1.8e7b68p-25f, -0x1.b1cf2cp-32f},
{0x1.983ceep-1f, -0x1.eacc78p-4f, 0x1.c74418p-7f, -0x1.1756ap-10f,
0x1.bff366p-15f, -0x1.c56c02p-20f, 0x1.07b492p-25f, -0x1.0d4be8p-32f},
};
#ifndef LIBC_TARGET_CPU_HAS_FMA
constexpr size_t N_ERFF16_EXCEPTS = 5;
#else
constexpr size_t N_ERFF16_EXCEPTS = 4;
#endif
constexpr fputil::ExceptValues<float16, N_ERFF16_EXCEPTS> ERFF16_EXCEPTS{{
{0x1612, 0x16D9, 1, 0, 0},
{0x165C, 0x172C, 1, 0, 1},
{0x16F0, 0x17D3, 1, 0, 1},
{0x3BF2, 0x3AB7, 1, 0, 1},
#ifndef LIBC_TARGET_CPU_HAS_FMA
{0x3FF6, 0x3BF5, 1, 0, 1},
#endif
}};
using FPBits = typename fputil::FPBits<float16>;
FPBits xbits(x);
uint16_t x_abs = xbits.abs().uintval();
bool is_neg = xbits.is_neg();
float xf = fputil::cast<float16>(x);
if (auto r = ERFF16_EXCEPTS.lookup_odd(x_abs, is_neg);
LIBC_UNLIKELY(r.has_value()))
return r.value();
// |x| >= 4.0
if (LIBC_UNLIKELY(x_abs >= 0x4200U)) {
// Check for NaN or Inf
if (LIBC_UNLIKELY(x_abs >= 0x7c00U)) {
if (x_abs > 0x7c00U) {
if (xbits.is_signaling_nan()) {
fputil::raise_except_if_required(FE_INVALID);
return FPBits::quiet_nan().get_val();
}
return x;
}
// Inf -> returns 1.0 or -1.0
return is_neg ? -1.0f16 : 1.0f16;
}
return fputil::cast<float16>(is_neg ? -1.0f - xf * 0x1.0p-28f
: 1.0f - xf * 0x1.0p-28f);
}
// Polynomial approximation:
// erf(x) ~ x * (c0 + c1 * x^2 + c2 * x^4 + ... + c7 * x^14)
int idx = static_cast<int>(xbits.abs().get_val() * 8.0f16);
float xsq = xf * xf;
float x4 = xsq * xsq;
float c0 = fputil::multiply_add(xsq, (ERFF16_COEFFS[idx][1]),
(ERFF16_COEFFS[idx][0]));
float c1 = fputil::multiply_add(xsq, (ERFF16_COEFFS[idx][3]),
(ERFF16_COEFFS[idx][2]));
float c2 = fputil::multiply_add(xsq, (ERFF16_COEFFS[idx][5]),
(ERFF16_COEFFS[idx][4]));
float c3 = fputil::multiply_add(xsq, (ERFF16_COEFFS[idx][7]),
(ERFF16_COEFFS[idx][6]));
float x8 = x4 * x4;
float p0 = fputil::multiply_add(x4, c1, c0);
float p1 = fputil::multiply_add(x4, c3, c2);
float result = xf * fputil::multiply_add(x8, p1, p0);
// Clamp result to just inside [-1, 1]
if (LIBC_UNLIKELY(result > 1.0f))
return fputil::cast<float16>(1.0f - xf * 0x1.0p-28f);
else if (LIBC_UNLIKELY(result < -1.0f))
return fputil::cast<float16>(-1.0f - xf * 0x1.0p-28f);
return fputil::cast<float16>(result);
}
} // namespace math
} // namespace LIBC_NAMESPACE_DECL
#endif // LIBC_TYPES_HAS_FLOAT16
#endif // LLVM_LIBC_SRC___SUPPORT_MATH_ERFF16_H