1//===-- Implementation header for erff16 ------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ERFF16_H
10#define LLVM_LIBC_SRC___SUPPORT_MATH_ERFF16_H
11
12#include "include/llvm-libc-macros/float16-macros.h"
13
14#ifdef LIBC_TYPES_HAS_FLOAT16
15
16#include "src/__support/FPUtil/FEnvImpl.h"
17#include "src/__support/FPUtil/FPBits.h"
18#include "src/__support/FPUtil/except_value_utils.h"
19#include "src/__support/FPUtil/multiply_add.h"
20#include "src/__support/macros/config.h"
21#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
22
23namespace LIBC_NAMESPACE_DECL {
24
25namespace math {
26
27LIBC_INLINE float16 erff16(float16 x) {
28
29 // Polynomials approximating erf(x)/x on ( k/8, (k + 1)/8 ) generated by
30 // Sollya with: > P = fpminimax(erf(x)/x, [|0, 2, 4, 6, 8, 10, 12, 14|],
31 // [|D...|],
32 // [k/8, (k + 1)/8]);
33 // for k = 0..31.
34 constexpr float ERFF16_COEFFS[32][8] = {
35 {0x1.20dd76p+0f, -0x1.812746p-2f, 0x1.ce2f22p-4f, -0x1.b82cdap-6f,
36 0x1.564792p-8f, -0x1.8b3ac6p-11f, -0x1.126fcap-8f, 0x1.2d0bdcp-4f},
37 {0x1.20dd76p+0f, -0x1.812746p-2f, 0x1.ce2f22p-4f, -0x1.b82ce4p-6f,
38 0x1.565bcap-8f, -0x1.c02c66p-11f, 0x1.f92f68p-14f, -0x1.def402p-17f},
39 {0x1.20dd76p+0f, -0x1.812746p-2f, 0x1.ce2f22p-4f, -0x1.b82ce2p-6f,
40 0x1.565b9ep-8f, -0x1.c021f2p-11f, 0x1.f7c6d2p-14f, -0x1.c9e442p-17f},
41 {0x1.20dd76p+0f, -0x1.812746p-2f, 0x1.ce2f22p-4f, -0x1.b82cccp-6f,
42 0x1.56597p-8f, -0x1.bfde2p-11f, 0x1.f31a9ep-14f, -0x1.a5a436p-17f},
43 {0x1.20dd76p+0f, -0x1.812746p-2f, 0x1.ce2f18p-4f, -0x1.b82be4p-6f,
44 0x1.564becp-8f, -0x1.bee87p-11f, 0x1.e94436p-14f, -0x1.79c0f2p-17f},
45 {0x1.20dd74p+0f, -0x1.812744p-2f, 0x1.ce2ecp-4f, -0x1.b8265cp-6f,
46 0x1.5615a2p-8f, -0x1.bc63aep-11f, 0x1.d87c22p-14f, -0x1.49584cp-17f},
47 {0x1.20dd74p+0f, -0x1.81272ap-2f, 0x1.ce2cbp-4f, -0x1.b80ecap-6f,
48 0x1.5572e2p-8f, -0x1.b715e6p-11f, 0x1.bfbb1ap-14f, -0x1.177a56p-17f},
49 {0x1.20dd7p+0f, -0x1.812692p-2f, 0x1.ce23ap-4f, -0x1.b7c1dcp-6f,
50 0x1.53e93p-8f, -0x1.ad97ccp-11f, 0x1.9f028cp-14f, -0x1.cdc4dap-18f},
51 {0x1.20dd58p+0f, -0x1.8123e6p-2f, 0x1.ce0458p-4f, -0x1.b6f52ep-6f,
52 0x1.50c292p-8f, -0x1.9ea246p-11f, 0x1.776546p-14f, -0x1.737c12p-18f},
53 {0x1.20dce6p+0f, -0x1.811a5ap-2f, 0x1.cdab54p-4f, -0x1.b526d2p-6f,
54 0x1.4b1d32p-8f, -0x1.896314p-11f, 0x1.4ad57p-14f, -0x1.231e1p-18f},
55 {0x1.20db48p+0f, -0x1.80fdd8p-2f, 0x1.ccd34p-4f, -0x1.b196a2p-6f,
56 0x1.4210c2p-8f, -0x1.6dbdfcp-11f, 0x1.1bca2ep-14f, -0x1.bca37p-19f},
57 {0x1.20d64cp+0f, -0x1.80b4d4p-2f, 0x1.cb0882p-4f, -0x1.ab51fep-6f,
58 0x1.34e1e6p-8f, -0x1.4c6638p-11f, 0x1.d9ad26p-15f, -0x1.4b0df8p-19f},
59 {0x1.20c8fcp+0f, -0x1.8010ccp-2f, 0x1.c7a47ep-4f, -0x1.a155bep-6f,
60 0x1.233502p-8f, -0x1.26c94cp-11f, 0x1.8094f2p-15f, -0x1.e0e3d8p-20f},
61 {0x1.20a9bep+0f, -0x1.7ec7fcp-2f, 0x1.c1d758p-4f, -0x1.92c16p-6f,
62 0x1.0d3072p-8f, -0x1.fda5bp-12f, 0x1.2fdd7cp-15f, -0x1.54eed4p-20f},
63 {0x1.206828p+0f, -0x1.7c73f8p-2f, 0x1.b8c2dcp-4f, -0x1.7f0e5p-6f,
64 0x1.e7061ep-9f, -0x1.ad36e8p-12f, 0x1.d39222p-16f, -0x1.d83dacp-21f},
65 {0x1.1feb8ep+0f, -0x1.789834p-2f, 0x1.aba346p-4f, -0x1.663adcp-6f,
66 0x1.ae99fcp-9f, -0x1.602f96p-12f, 0x1.5e9718p-16f, -0x1.3fca1p-21f},
67 {0x1.1f12fep+0f, -0x1.72b1d2p-2f, 0x1.99fc0ep-4f, -0x1.48db0ap-6f,
68 0x1.73e368p-9f, -0x1.19b35ep-12f, 0x1.007988p-16f, -0x1.a7edcep-22f},
69 {0x1.1db7bp+0f, -0x1.6a4e4ap-2f, 0x1.83bbdep-4f, -0x1.2809b4p-6f,
70 0x1.39c08cp-9f, -0x1.b7b45ap-13f, 0x1.6e99b4p-17f, -0x1.13619cp-22f},
71 {0x1.1bb1c8p+0f, -0x1.5f23bap-2f, 0x1.694c92p-4f, -0x1.053e1cp-6f,
72 0x1.02bf72p-9f, -0x1.4f479p-13f, 0x1.005f8p-17f, -0x1.5f2446p-23f},
73 {0x1.18dec4p+0f, -0x1.5123f6p-2f, 0x1.4b8a1cp-4f, -0x1.c4243p-7f,
74 0x1.a1a8ap-10f, -0x1.f466b4p-14f, 0x1.5f835ep-18f, -0x1.b83166p-24f},
75 {0x1.152804p+0f, -0x1.4084cep-2f, 0x1.2ba2e8p-4f, -0x1.800f2ep-7f,
76 0x1.4a6dbp-10f, -0x1.6e326ap-14f, 0x1.d9761ap-19f, -0x1.0fca34p-24f},
77 {0x1.1087aep+0f, -0x1.2dbb04p-2f, 0x1.0aea8cp-4f, -0x1.40b516p-7f,
78 0x1.00c9ep-10f, -0x1.076afcp-14f, 0x1.39fadep-19f, -0x1.4b5762p-25f},
79 {0x1.0b0a7ap+0f, -0x1.19699p-2f, 0x1.d5551ep-5f, -0x1.07cce2p-7f,
80 0x1.890348p-11f, -0x1.757ecap-15f, 0x1.9b258ap-20f, -0x1.8fc6d2p-26f},
81 {0x1.04ce2cp+0f, -0x1.0449e4p-2f, 0x1.97f742p-5f, -0x1.ac8254p-8f,
82 0x1.28f5f6p-11f, -0x1.05b69ap-15f, 0x1.0a888ep-20f, -0x1.deace2p-27f},
83 {0x1.fbf9fcp-1f, -0x1.de264p-3f, 0x1.5f5b2p-5f, -0x1.588bc8p-8f,
84 0x1.bc6a0ap-12f, -0x1.6b9faep-16f, 0x1.573204p-21f, -0x1.1d3806p-27f},
85 {0x1.ed8f18p-1f, -0x1.b4cb6ap-3f, 0x1.2c7f3ep-5f, -0x1.13052p-8f,
86 0x1.4a5028p-12f, -0x1.f672bap-17f, 0x1.b83c76p-22f, -0x1.534f1ap-28f},
87 {0x1.debd34p-1f, -0x1.8d7cdap-3f, 0x1.ff9958p-6f, -0x1.b50be6p-9f,
88 0x1.e92c8ep-13f, -0x1.5a4b88p-17f, 0x1.1a2774p-22f, -0x1.942ae6p-29f},
89 {0x1.cfdbfp-1f, -0x1.68e33ep-3f, 0x1.b2683ep-6f, -0x1.5a9174p-9f,
90 0x1.69ddd4p-13f, -0x1.dd8f3ap-18f, 0x1.6a755p-23f, -0x1.e366ep-30f},
91 {0x1.c132aep-1f, -0x1.475a8ap-3f, 0x1.70a432p-6f, -0x1.12e3d4p-9f,
92 0x1.0c16bp-13f, -0x1.4a47f8p-18f, 0x1.d3d494p-24f, -0x1.2302c6p-30f},
93 {0x1.b2f5fep-1f, -0x1.28fefp-3f, 0x1.3923acp-6f, -0x1.b4ff7ap-10f,
94 0x1.8ea0ecp-14f, -0x1.cb31ecp-19f, 0x1.30011ep-24f, -0x1.61771p-31f},
95 {0x1.a54854p-1f, -0x1.0dbdbap-3f, 0x1.0a93e2p-6f, -0x1.5c96ap-10f,
96 0x1.29e0ccp-14f, -0x1.4160d8p-19f, 0x1.8e7b68p-25f, -0x1.b1cf2cp-32f},
97 {0x1.983ceep-1f, -0x1.eacc78p-4f, 0x1.c74418p-7f, -0x1.1756ap-10f,
98 0x1.bff366p-15f, -0x1.c56c02p-20f, 0x1.07b492p-25f, -0x1.0d4be8p-32f},
99 };
100#ifndef LIBC_TARGET_CPU_HAS_FMA
101 constexpr size_t N_ERFF16_EXCEPTS = 5;
102#else
103 constexpr size_t N_ERFF16_EXCEPTS = 4;
104#endif
105
106 constexpr fputil::ExceptValues<float16, N_ERFF16_EXCEPTS> ERFF16_EXCEPTS{.values: {
107 {.input: 0x1612, .rnd_towardzero_result: 0x16D9, .rnd_upward_offset: 1, .rnd_downward_offset: 0, .rnd_tonearest_offset: 0},
108 {.input: 0x165C, .rnd_towardzero_result: 0x172C, .rnd_upward_offset: 1, .rnd_downward_offset: 0, .rnd_tonearest_offset: 1},
109 {.input: 0x16F0, .rnd_towardzero_result: 0x17D3, .rnd_upward_offset: 1, .rnd_downward_offset: 0, .rnd_tonearest_offset: 1},
110 {.input: 0x3BF2, .rnd_towardzero_result: 0x3AB7, .rnd_upward_offset: 1, .rnd_downward_offset: 0, .rnd_tonearest_offset: 1},
111#ifndef LIBC_TARGET_CPU_HAS_FMA
112 {.input: 0x3FF6, .rnd_towardzero_result: 0x3BF5, .rnd_upward_offset: 1, .rnd_downward_offset: 0, .rnd_tonearest_offset: 1},
113#endif
114 }};
115 using FPBits = typename fputil::FPBits<float16>;
116 FPBits xbits(x);
117 uint16_t x_abs = xbits.abs().uintval();
118 bool is_neg = xbits.is_neg();
119
120 float xf = fputil::cast<float16>(x);
121 if (auto r = ERFF16_EXCEPTS.lookup_odd(x_abs, sign: is_neg);
122 LIBC_UNLIKELY(r.has_value()))
123 return r.value();
124
125 // |x| >= 4.0
126 if (LIBC_UNLIKELY(x_abs >= 0x4200U)) {
127 // Check for NaN or Inf
128 if (LIBC_UNLIKELY(x_abs >= 0x7c00U)) {
129 if (x_abs > 0x7c00U) {
130 if (xbits.is_signaling_nan()) {
131 fputil::raise_except_if_required(FE_INVALID);
132 return FPBits::quiet_nan().get_val();
133 }
134 return x;
135 }
136 // Inf -> returns 1.0 or -1.0
137 return is_neg ? -1.0f16 : 1.0f16;
138 }
139
140 return fputil::cast<float16>(x: is_neg ? -1.0f - xf * 0x1.0p-28f
141 : 1.0f - xf * 0x1.0p-28f);
142 }
143
144 // Polynomial approximation:
145 // erf(x) ~ x * (c0 + c1 * x^2 + c2 * x^4 + ... + c7 * x^14)
146
147 int idx = static_cast<int>(xbits.abs().get_val() * 8.0f16);
148
149 float xsq = xf * xf;
150 float x4 = xsq * xsq;
151 float c0 = fputil::multiply_add(x: xsq, y: (ERFF16_COEFFS[idx][1]),
152 z: (ERFF16_COEFFS[idx][0]));
153 float c1 = fputil::multiply_add(x: xsq, y: (ERFF16_COEFFS[idx][3]),
154 z: (ERFF16_COEFFS[idx][2]));
155 float c2 = fputil::multiply_add(x: xsq, y: (ERFF16_COEFFS[idx][5]),
156 z: (ERFF16_COEFFS[idx][4]));
157 float c3 = fputil::multiply_add(x: xsq, y: (ERFF16_COEFFS[idx][7]),
158 z: (ERFF16_COEFFS[idx][6]));
159
160 float x8 = x4 * x4;
161 float p0 = fputil::multiply_add(x: x4, y: c1, z: c0);
162 float p1 = fputil::multiply_add(x: x4, y: c3, z: c2);
163
164 float result = xf * fputil::multiply_add(x: x8, y: p1, z: p0);
165
166 // Clamp result to just inside [-1, 1]
167 if (LIBC_UNLIKELY(result > 1.0f))
168 return fputil::cast<float16>(x: 1.0f - xf * 0x1.0p-28f);
169 else if (LIBC_UNLIKELY(result < -1.0f))
170 return fputil::cast<float16>(x: -1.0f - xf * 0x1.0p-28f);
171
172 return fputil::cast<float16>(x: result);
173}
174
175} // namespace math
176
177} // namespace LIBC_NAMESPACE_DECL
178
179#endif // LIBC_TYPES_HAS_FLOAT16
180
181#endif // LLVM_LIBC_SRC___SUPPORT_MATH_ERFF16_H
182