| 1 | //===-- Implementation header for erff16 ------------------------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ERFF16_H |
| 10 | #define LLVM_LIBC_SRC___SUPPORT_MATH_ERFF16_H |
| 11 | |
| 12 | #include "include/llvm-libc-macros/float16-macros.h" |
| 13 | |
| 14 | #ifdef LIBC_TYPES_HAS_FLOAT16 |
| 15 | |
| 16 | #include "src/__support/FPUtil/FEnvImpl.h" |
| 17 | #include "src/__support/FPUtil/FPBits.h" |
| 18 | #include "src/__support/FPUtil/except_value_utils.h" |
| 19 | #include "src/__support/FPUtil/multiply_add.h" |
| 20 | #include "src/__support/macros/config.h" |
| 21 | #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY |
| 22 | |
| 23 | namespace LIBC_NAMESPACE_DECL { |
| 24 | |
| 25 | namespace math { |
| 26 | |
| 27 | LIBC_INLINE float16 erff16(float16 x) { |
| 28 | |
| 29 | // Polynomials approximating erf(x)/x on ( k/8, (k + 1)/8 ) generated by |
| 30 | // Sollya with: > P = fpminimax(erf(x)/x, [|0, 2, 4, 6, 8, 10, 12, 14|], |
| 31 | // [|D...|], |
| 32 | // [k/8, (k + 1)/8]); |
| 33 | // for k = 0..31. |
| 34 | constexpr float ERFF16_COEFFS[32][8] = { |
| 35 | {0x1.20dd76p+0f, -0x1.812746p-2f, 0x1.ce2f22p-4f, -0x1.b82cdap-6f, |
| 36 | 0x1.564792p-8f, -0x1.8b3ac6p-11f, -0x1.126fcap-8f, 0x1.2d0bdcp-4f}, |
| 37 | {0x1.20dd76p+0f, -0x1.812746p-2f, 0x1.ce2f22p-4f, -0x1.b82ce4p-6f, |
| 38 | 0x1.565bcap-8f, -0x1.c02c66p-11f, 0x1.f92f68p-14f, -0x1.def402p-17f}, |
| 39 | {0x1.20dd76p+0f, -0x1.812746p-2f, 0x1.ce2f22p-4f, -0x1.b82ce2p-6f, |
| 40 | 0x1.565b9ep-8f, -0x1.c021f2p-11f, 0x1.f7c6d2p-14f, -0x1.c9e442p-17f}, |
| 41 | {0x1.20dd76p+0f, -0x1.812746p-2f, 0x1.ce2f22p-4f, -0x1.b82cccp-6f, |
| 42 | 0x1.56597p-8f, -0x1.bfde2p-11f, 0x1.f31a9ep-14f, -0x1.a5a436p-17f}, |
| 43 | {0x1.20dd76p+0f, -0x1.812746p-2f, 0x1.ce2f18p-4f, -0x1.b82be4p-6f, |
| 44 | 0x1.564becp-8f, -0x1.bee87p-11f, 0x1.e94436p-14f, -0x1.79c0f2p-17f}, |
| 45 | {0x1.20dd74p+0f, -0x1.812744p-2f, 0x1.ce2ecp-4f, -0x1.b8265cp-6f, |
| 46 | 0x1.5615a2p-8f, -0x1.bc63aep-11f, 0x1.d87c22p-14f, -0x1.49584cp-17f}, |
| 47 | {0x1.20dd74p+0f, -0x1.81272ap-2f, 0x1.ce2cbp-4f, -0x1.b80ecap-6f, |
| 48 | 0x1.5572e2p-8f, -0x1.b715e6p-11f, 0x1.bfbb1ap-14f, -0x1.177a56p-17f}, |
| 49 | {0x1.20dd7p+0f, -0x1.812692p-2f, 0x1.ce23ap-4f, -0x1.b7c1dcp-6f, |
| 50 | 0x1.53e93p-8f, -0x1.ad97ccp-11f, 0x1.9f028cp-14f, -0x1.cdc4dap-18f}, |
| 51 | {0x1.20dd58p+0f, -0x1.8123e6p-2f, 0x1.ce0458p-4f, -0x1.b6f52ep-6f, |
| 52 | 0x1.50c292p-8f, -0x1.9ea246p-11f, 0x1.776546p-14f, -0x1.737c12p-18f}, |
| 53 | {0x1.20dce6p+0f, -0x1.811a5ap-2f, 0x1.cdab54p-4f, -0x1.b526d2p-6f, |
| 54 | 0x1.4b1d32p-8f, -0x1.896314p-11f, 0x1.4ad57p-14f, -0x1.231e1p-18f}, |
| 55 | {0x1.20db48p+0f, -0x1.80fdd8p-2f, 0x1.ccd34p-4f, -0x1.b196a2p-6f, |
| 56 | 0x1.4210c2p-8f, -0x1.6dbdfcp-11f, 0x1.1bca2ep-14f, -0x1.bca37p-19f}, |
| 57 | {0x1.20d64cp+0f, -0x1.80b4d4p-2f, 0x1.cb0882p-4f, -0x1.ab51fep-6f, |
| 58 | 0x1.34e1e6p-8f, -0x1.4c6638p-11f, 0x1.d9ad26p-15f, -0x1.4b0df8p-19f}, |
| 59 | {0x1.20c8fcp+0f, -0x1.8010ccp-2f, 0x1.c7a47ep-4f, -0x1.a155bep-6f, |
| 60 | 0x1.233502p-8f, -0x1.26c94cp-11f, 0x1.8094f2p-15f, -0x1.e0e3d8p-20f}, |
| 61 | {0x1.20a9bep+0f, -0x1.7ec7fcp-2f, 0x1.c1d758p-4f, -0x1.92c16p-6f, |
| 62 | 0x1.0d3072p-8f, -0x1.fda5bp-12f, 0x1.2fdd7cp-15f, -0x1.54eed4p-20f}, |
| 63 | {0x1.206828p+0f, -0x1.7c73f8p-2f, 0x1.b8c2dcp-4f, -0x1.7f0e5p-6f, |
| 64 | 0x1.e7061ep-9f, -0x1.ad36e8p-12f, 0x1.d39222p-16f, -0x1.d83dacp-21f}, |
| 65 | {0x1.1feb8ep+0f, -0x1.789834p-2f, 0x1.aba346p-4f, -0x1.663adcp-6f, |
| 66 | 0x1.ae99fcp-9f, -0x1.602f96p-12f, 0x1.5e9718p-16f, -0x1.3fca1p-21f}, |
| 67 | {0x1.1f12fep+0f, -0x1.72b1d2p-2f, 0x1.99fc0ep-4f, -0x1.48db0ap-6f, |
| 68 | 0x1.73e368p-9f, -0x1.19b35ep-12f, 0x1.007988p-16f, -0x1.a7edcep-22f}, |
| 69 | {0x1.1db7bp+0f, -0x1.6a4e4ap-2f, 0x1.83bbdep-4f, -0x1.2809b4p-6f, |
| 70 | 0x1.39c08cp-9f, -0x1.b7b45ap-13f, 0x1.6e99b4p-17f, -0x1.13619cp-22f}, |
| 71 | {0x1.1bb1c8p+0f, -0x1.5f23bap-2f, 0x1.694c92p-4f, -0x1.053e1cp-6f, |
| 72 | 0x1.02bf72p-9f, -0x1.4f479p-13f, 0x1.005f8p-17f, -0x1.5f2446p-23f}, |
| 73 | {0x1.18dec4p+0f, -0x1.5123f6p-2f, 0x1.4b8a1cp-4f, -0x1.c4243p-7f, |
| 74 | 0x1.a1a8ap-10f, -0x1.f466b4p-14f, 0x1.5f835ep-18f, -0x1.b83166p-24f}, |
| 75 | {0x1.152804p+0f, -0x1.4084cep-2f, 0x1.2ba2e8p-4f, -0x1.800f2ep-7f, |
| 76 | 0x1.4a6dbp-10f, -0x1.6e326ap-14f, 0x1.d9761ap-19f, -0x1.0fca34p-24f}, |
| 77 | {0x1.1087aep+0f, -0x1.2dbb04p-2f, 0x1.0aea8cp-4f, -0x1.40b516p-7f, |
| 78 | 0x1.00c9ep-10f, -0x1.076afcp-14f, 0x1.39fadep-19f, -0x1.4b5762p-25f}, |
| 79 | {0x1.0b0a7ap+0f, -0x1.19699p-2f, 0x1.d5551ep-5f, -0x1.07cce2p-7f, |
| 80 | 0x1.890348p-11f, -0x1.757ecap-15f, 0x1.9b258ap-20f, -0x1.8fc6d2p-26f}, |
| 81 | {0x1.04ce2cp+0f, -0x1.0449e4p-2f, 0x1.97f742p-5f, -0x1.ac8254p-8f, |
| 82 | 0x1.28f5f6p-11f, -0x1.05b69ap-15f, 0x1.0a888ep-20f, -0x1.deace2p-27f}, |
| 83 | {0x1.fbf9fcp-1f, -0x1.de264p-3f, 0x1.5f5b2p-5f, -0x1.588bc8p-8f, |
| 84 | 0x1.bc6a0ap-12f, -0x1.6b9faep-16f, 0x1.573204p-21f, -0x1.1d3806p-27f}, |
| 85 | {0x1.ed8f18p-1f, -0x1.b4cb6ap-3f, 0x1.2c7f3ep-5f, -0x1.13052p-8f, |
| 86 | 0x1.4a5028p-12f, -0x1.f672bap-17f, 0x1.b83c76p-22f, -0x1.534f1ap-28f}, |
| 87 | {0x1.debd34p-1f, -0x1.8d7cdap-3f, 0x1.ff9958p-6f, -0x1.b50be6p-9f, |
| 88 | 0x1.e92c8ep-13f, -0x1.5a4b88p-17f, 0x1.1a2774p-22f, -0x1.942ae6p-29f}, |
| 89 | {0x1.cfdbfp-1f, -0x1.68e33ep-3f, 0x1.b2683ep-6f, -0x1.5a9174p-9f, |
| 90 | 0x1.69ddd4p-13f, -0x1.dd8f3ap-18f, 0x1.6a755p-23f, -0x1.e366ep-30f}, |
| 91 | {0x1.c132aep-1f, -0x1.475a8ap-3f, 0x1.70a432p-6f, -0x1.12e3d4p-9f, |
| 92 | 0x1.0c16bp-13f, -0x1.4a47f8p-18f, 0x1.d3d494p-24f, -0x1.2302c6p-30f}, |
| 93 | {0x1.b2f5fep-1f, -0x1.28fefp-3f, 0x1.3923acp-6f, -0x1.b4ff7ap-10f, |
| 94 | 0x1.8ea0ecp-14f, -0x1.cb31ecp-19f, 0x1.30011ep-24f, -0x1.61771p-31f}, |
| 95 | {0x1.a54854p-1f, -0x1.0dbdbap-3f, 0x1.0a93e2p-6f, -0x1.5c96ap-10f, |
| 96 | 0x1.29e0ccp-14f, -0x1.4160d8p-19f, 0x1.8e7b68p-25f, -0x1.b1cf2cp-32f}, |
| 97 | {0x1.983ceep-1f, -0x1.eacc78p-4f, 0x1.c74418p-7f, -0x1.1756ap-10f, |
| 98 | 0x1.bff366p-15f, -0x1.c56c02p-20f, 0x1.07b492p-25f, -0x1.0d4be8p-32f}, |
| 99 | }; |
| 100 | #ifndef LIBC_TARGET_CPU_HAS_FMA |
| 101 | constexpr size_t N_ERFF16_EXCEPTS = 5; |
| 102 | #else |
| 103 | constexpr size_t N_ERFF16_EXCEPTS = 4; |
| 104 | #endif |
| 105 | |
| 106 | constexpr fputil::ExceptValues<float16, N_ERFF16_EXCEPTS> ERFF16_EXCEPTS{.values: { |
| 107 | {.input: 0x1612, .rnd_towardzero_result: 0x16D9, .rnd_upward_offset: 1, .rnd_downward_offset: 0, .rnd_tonearest_offset: 0}, |
| 108 | {.input: 0x165C, .rnd_towardzero_result: 0x172C, .rnd_upward_offset: 1, .rnd_downward_offset: 0, .rnd_tonearest_offset: 1}, |
| 109 | {.input: 0x16F0, .rnd_towardzero_result: 0x17D3, .rnd_upward_offset: 1, .rnd_downward_offset: 0, .rnd_tonearest_offset: 1}, |
| 110 | {.input: 0x3BF2, .rnd_towardzero_result: 0x3AB7, .rnd_upward_offset: 1, .rnd_downward_offset: 0, .rnd_tonearest_offset: 1}, |
| 111 | #ifndef LIBC_TARGET_CPU_HAS_FMA |
| 112 | {.input: 0x3FF6, .rnd_towardzero_result: 0x3BF5, .rnd_upward_offset: 1, .rnd_downward_offset: 0, .rnd_tonearest_offset: 1}, |
| 113 | #endif |
| 114 | }}; |
| 115 | using FPBits = typename fputil::FPBits<float16>; |
| 116 | FPBits xbits(x); |
| 117 | uint16_t x_abs = xbits.abs().uintval(); |
| 118 | bool is_neg = xbits.is_neg(); |
| 119 | |
| 120 | float xf = fputil::cast<float16>(x); |
| 121 | if (auto r = ERFF16_EXCEPTS.lookup_odd(x_abs, sign: is_neg); |
| 122 | LIBC_UNLIKELY(r.has_value())) |
| 123 | return r.value(); |
| 124 | |
| 125 | // |x| >= 4.0 |
| 126 | if (LIBC_UNLIKELY(x_abs >= 0x4200U)) { |
| 127 | // Check for NaN or Inf |
| 128 | if (LIBC_UNLIKELY(x_abs >= 0x7c00U)) { |
| 129 | if (x_abs > 0x7c00U) { |
| 130 | if (xbits.is_signaling_nan()) { |
| 131 | fputil::raise_except_if_required(FE_INVALID); |
| 132 | return FPBits::quiet_nan().get_val(); |
| 133 | } |
| 134 | return x; |
| 135 | } |
| 136 | // Inf -> returns 1.0 or -1.0 |
| 137 | return is_neg ? -1.0f16 : 1.0f16; |
| 138 | } |
| 139 | |
| 140 | return fputil::cast<float16>(x: is_neg ? -1.0f - xf * 0x1.0p-28f |
| 141 | : 1.0f - xf * 0x1.0p-28f); |
| 142 | } |
| 143 | |
| 144 | // Polynomial approximation: |
| 145 | // erf(x) ~ x * (c0 + c1 * x^2 + c2 * x^4 + ... + c7 * x^14) |
| 146 | |
| 147 | int idx = static_cast<int>(xbits.abs().get_val() * 8.0f16); |
| 148 | |
| 149 | float xsq = xf * xf; |
| 150 | float x4 = xsq * xsq; |
| 151 | float c0 = fputil::multiply_add(x: xsq, y: (ERFF16_COEFFS[idx][1]), |
| 152 | z: (ERFF16_COEFFS[idx][0])); |
| 153 | float c1 = fputil::multiply_add(x: xsq, y: (ERFF16_COEFFS[idx][3]), |
| 154 | z: (ERFF16_COEFFS[idx][2])); |
| 155 | float c2 = fputil::multiply_add(x: xsq, y: (ERFF16_COEFFS[idx][5]), |
| 156 | z: (ERFF16_COEFFS[idx][4])); |
| 157 | float c3 = fputil::multiply_add(x: xsq, y: (ERFF16_COEFFS[idx][7]), |
| 158 | z: (ERFF16_COEFFS[idx][6])); |
| 159 | |
| 160 | float x8 = x4 * x4; |
| 161 | float p0 = fputil::multiply_add(x: x4, y: c1, z: c0); |
| 162 | float p1 = fputil::multiply_add(x: x4, y: c3, z: c2); |
| 163 | |
| 164 | float result = xf * fputil::multiply_add(x: x8, y: p1, z: p0); |
| 165 | |
| 166 | // Clamp result to just inside [-1, 1] |
| 167 | if (LIBC_UNLIKELY(result > 1.0f)) |
| 168 | return fputil::cast<float16>(x: 1.0f - xf * 0x1.0p-28f); |
| 169 | else if (LIBC_UNLIKELY(result < -1.0f)) |
| 170 | return fputil::cast<float16>(x: -1.0f - xf * 0x1.0p-28f); |
| 171 | |
| 172 | return fputil::cast<float16>(x: result); |
| 173 | } |
| 174 | |
| 175 | } // namespace math |
| 176 | |
| 177 | } // namespace LIBC_NAMESPACE_DECL |
| 178 | |
| 179 | #endif // LIBC_TYPES_HAS_FLOAT16 |
| 180 | |
| 181 | #endif // LLVM_LIBC_SRC___SUPPORT_MATH_ERFF16_H |
| 182 | |