1//===-- Single-precision general inverse trigonometric functions ----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_INV_TRIGF_UTILS_H
10#define LLVM_LIBC_SRC___SUPPORT_MATH_INV_TRIGF_UTILS_H
11
12#include "src/__support/FPUtil/PolyEval.h"
13#include "src/__support/FPUtil/multiply_add.h"
14#include "src/__support/common.h"
15#include "src/__support/macros/config.h"
16
17namespace LIBC_NAMESPACE_DECL {
18
19namespace inv_trigf_utils_internal {
20
21// PI and PI / 2
22LIBC_INLINE_VAR constexpr double M_MATH_PI = 0x1.921fb54442d18p+1;
23LIBC_INLINE_VAR constexpr double M_MATH_PI_2 = 0x1.921fb54442d18p+0;
24
25// Polynomial approximation for 0 <= x <= 1:
26// atan(x) ~ atan((i/16) + (x - (i/16)) * Q(x - i/16)
27// = P(x - i/16)
28// Generated by Sollya with:
29// > for i from 1 to 16 do {
30// mid_point = i/16;
31// P = fpminimax(atan(mid_point + x), 8, [|D...|], [-1/32, 1/32]);
32// print("{", coeff(P, 0), ",", coeff(P, 1), ",", coeff(P, 2), ",",
33// coeff(P, 3), ",", coeff(P, 4), ",", coeff(P, 5), ",", coeff(P, 6),
34// ",", coeff(P, 7), ",", coeff(P, 8), "},");
35// };
36// For i = 0, the polynomial is generated by:
37// > P = fpminimax(atan(x)/x, 7, [|1, D...|], [0, 1/32]);
38// > dirtyinfnorm((atan(x) - x*P)/x, [0, 1/32]);
39// 0x1.feb2fcdba66447ccbe28a1a0f935b51678a718fb1p-59
40// Notice that degree-7 is good enough for atanf, but degree-8 helps reduce the
41// error bounds for atan2f's fast pass 16 times, and it does not affect the
42// performance of atanf much.
43LIBC_INLINE_VAR constexpr double ATAN_COEFFS[17][9] = {
44 {0.0, 1.0, 0x1.3f8d76d26d61bp-47, -0x1.5555555574cd8p-2,
45 0x1.0dde5d06878eap-29, 0x1.99997738acc77p-3, 0x1.2c43eac9797cap-16,
46 -0x1.25fb020007dbdp-3, 0x1.c1b6c31d7b0aep-7},
47 {0x1.ff55bb72cfde9p-5, 0x1.fe01fe01fe007p-1, -0x1.fc05f809ed8dap-5,
48 -0x1.4d69303afe04ep-2, 0x1.f61bc3e8349cp-5, 0x1.820839278756bp-3,
49 -0x1.eda4de1c6bf3fp-5, -0x1.0514d42d64a63p-3, 0x1.db3746a442dcbp-5},
50 {0x1.fd5ba9aac2f6ep-4, 0x1.f81f81f81f813p-1, -0x1.f05e09d0dc378p-4,
51 -0x1.368c3aa719215p-2, 0x1.d9b16b33ff9c9p-4, 0x1.40488f9c6262ap-3,
52 -0x1.ba55933e62ea5p-4, -0x1.64c6a15cd9116p-4, 0x1.9273d5939a75ap-4},
53 {0x1.7b97b4bce5b02p-3, 0x1.ee9c7f8458e05p-1, -0x1.665c226d6961p-3,
54 -0x1.1344bb7391703p-2, 0x1.42aca8b0081b9p-3, 0x1.c32d9381d7c03p-4,
55 -0x1.13e970672e246p-3, -0x1.181ed934dd733p-5, 0x1.bad81ea190c08p-4},
56 {0x1.f5b75f92c80ddp-3, 0x1.e1e1e1e1e1e2cp-1, -0x1.c5894d10d363dp-3,
57 -0x1.ce6de025f9f5ep-3, 0x1.78a3a07c8dd7fp-3, 0x1.dd5f5180f386ep-5,
58 -0x1.1b1f513c4536bp-3, 0x1.0df852e58c43cp-6, 0x1.722e7a7e42505p-4},
59 {0x1.362773707ebccp-2, 0x1.d272ca3fc5b2ep-1, -0x1.0997e8aeca8fbp-2,
60 -0x1.6cf6666e5e693p-3, 0x1.8dd1e907e88adp-3, 0x1.24849ac0caa5dp-7,
61 -0x1.f496be486229dp-4, 0x1.b7d54b8e759ecp-5, 0x1.d39c0d39c3922p-5},
62 {0x1.6f61941e4def1p-2, 0x1.c0e070381c0f2p-1, -0x1.2726dd135d9eep-2,
63 -0x1.09f37b39b70e4p-3, 0x1.85eacdaadd712p-3, -0x1.04d66340d5b9p-5,
64 -0x1.8056b15a22b98p-4, 0x1.29baf494ad3ddp-4, 0x1.52d5881322a7ap-6},
65 {0x1.a64eec3cc23fdp-2, 0x1.adbe87f94906ap-1, -0x1.3b9d8eab55addp-2,
66 -0x1.57c09646eb7p-4, 0x1.6795319e3b8dfp-3, -0x1.f2d89b5ef31bep-5,
67 -0x1.f38aac26203cap-5, 0x1.3262802235e3fp-4, -0x1.2afd6b9a57d66p-7},
68 {0x1.dac670561bb4fp-2, 0x1.99999999999ap-1, -0x1.47ae147adff11p-2,
69 -0x1.5d867c40188b7p-5, 0x1.3a92a2df85e7ap-3, -0x1.3ec457c46e851p-4,
70 -0x1.ec1b9777e2e5bp-6, 0x1.0a542992a821ep-4, -0x1.ccffbe2f0d945p-6},
71 {0x1.0657e94db30dp-1, 0x1.84f00c2780615p-1, -0x1.4c62cb562defap-2,
72 -0x1.e6495b3c14e03p-8, 0x1.063c2fa617bfcp-3, -0x1.58b782d9907aap-4,
73 -0x1.41e6ff524b7fp-8, 0x1.937dfff3205a7p-5, -0x1.0fb1fd1c729dp-5},
74 {0x1.1e00babdefeb4p-1, 0x1.702e05c0b816ep-1, -0x1.4af2b78215fbep-2,
75 0x1.5d0b7e9f36997p-6, 0x1.a1247cb978debp-4, -0x1.519e1457734cap-4,
76 0x1.a755cf86b5bfbp-7, 0x1.096d174284564p-5, -0x1.081adf539ad58p-5},
77 {0x1.345f01cce37bbp-1, 0x1.5babcc647fa8ep-1, -0x1.449db09426a6dp-2,
78 0x1.655caac5896dap-5, 0x1.3bbbd22d05a61p-4, -0x1.34a2febee042fp-4,
79 0x1.84df9c8269e34p-6, 0x1.200e8176c899ap-6, -0x1.c00b23c3ce222p-6},
80 {0x1.4978fa3269ee1p-1, 0x1.47ae147ae1477p-1, -0x1.3a92a3055231ap-2,
81 0x1.ec21b515a4a2p-5, 0x1.c2f8b81f9a0d2p-5, -0x1.0ba9964125453p-4,
82 0x1.d7b5614777a05p-6, 0x1.971e91ed73595p-8, -0x1.3fc375a78dc74p-6},
83 {0x1.5d58987169b18p-1, 0x1.34679ace01343p-1, -0x1.2ddfb039136e5p-2,
84 0x1.2491307b9fb73p-4, 0x1.29c7e4886dc22p-5, -0x1.bca78bcca83ap-5,
85 0x1.e63efd7cbe1ddp-6, -0x1.8ea6c4f03b42dp-10, -0x1.9385b5c3a6997p-7},
86 {0x1.700a7c5784634p-1, 0x1.21fb78121fb76p-1, -0x1.1f6a8499e5d1ap-2,
87 0x1.41b15e5e29423p-4, 0x1.59bc953163345p-6, -0x1.63b54b13184ddp-5,
88 0x1.c9086666d213p-6, -0x1.90c3b4ad8d4bcp-8, -0x1.80f08ed9f6f57p-8},
89 {0x1.819d0b7158a4dp-1, 0x1.107fbbe01107ep-1, -0x1.0feeb4089670ep-2,
90 0x1.50e5afb93f5cbp-4, 0x1.2a7c2adffeffbp-7, -0x1.12bd29b4f1b43p-5,
91 0x1.93f71f0eb00eap-6, -0x1.10ece5ad30e28p-7, -0x1.db1a76bcd2b9cp-10},
92 {0x1.921fb54442d18p-1, 0x1.ffffffffffffep-2, -0x1.fffffffffc51cp-3,
93 0x1.555555557002ep-4, -0x1.a88260c338e75p-30, -0x1.99999f9a7614fp-6,
94 0x1.555e31a1e15e9p-6, -0x1.245240d65e629p-7, -0x1.fa9ba66478903p-11},
95};
96
97// Look-up table for atan(k/16) with k = 0..16.
98LIBC_INLINE_VAR constexpr double ATAN_K_OVER_16[17] = {
99 0.0,
100 0x1.ff55bb72cfdeap-5,
101 0x1.fd5ba9aac2f6ep-4,
102 0x1.7b97b4bce5b02p-3,
103 0x1.f5b75f92c80ddp-3,
104 0x1.362773707ebccp-2,
105 0x1.6f61941e4def1p-2,
106 0x1.a64eec3cc23fdp-2,
107 0x1.dac670561bb4fp-2,
108 0x1.0657e94db30dp-1,
109 0x1.1e00babdefeb4p-1,
110 0x1.345f01cce37bbp-1,
111 0x1.4978fa3269ee1p-1,
112 0x1.5d58987169b18p-1,
113 0x1.700a7c5784634p-1,
114 0x1.819d0b7158a4dp-1,
115 0x1.921fb54442d18p-1,
116};
117
118// For |x| <= 1/32 and 0 <= i <= 16, return Q(x) such that:
119// Q(x) ~ (atan(x + i/16) - atan(i/16)) / x.
120LIBC_INLINE double atan_eval(double x, unsigned i) {
121 double x2 = x * x;
122
123 double c0 = fputil::multiply_add(x, y: ATAN_COEFFS[i][2], z: ATAN_COEFFS[i][1]);
124 double c1 = fputil::multiply_add(x, y: ATAN_COEFFS[i][4], z: ATAN_COEFFS[i][3]);
125 double c2 = fputil::multiply_add(x, y: ATAN_COEFFS[i][6], z: ATAN_COEFFS[i][5]);
126 double c3 = fputil::multiply_add(x, y: ATAN_COEFFS[i][8], z: ATAN_COEFFS[i][7]);
127
128 double x4 = x2 * x2;
129 double d1 = fputil::multiply_add(x: x2, y: c1, z: c0);
130 double d2 = fputil::multiply_add(x: x2, y: c3, z: c2);
131 double p = fputil::multiply_add(x: x4, y: d2, z: d1);
132 return p;
133}
134
135// Evaluate atan without big lookup table.
136// atan(n/d) - atan(k/16) = atan((n/d - k/16) / (1 + (n/d) * (k/16)))
137// = atan((n - d * k/16)) / (d + n * k/16))
138// So we let q = (n - d * k/16) / (d + n * k/16),
139// and approximate with Taylor polynomial:
140// atan(q) ~ q - q^3/3 + q^5/5 - q^7/7 + q^9/9
141LIBC_INLINE double atan_eval_no_table(double num, double den,
142 double k_over_16) {
143 double num_r = fputil::multiply_add(x: den, y: -k_over_16, z: num);
144 double den_r = fputil::multiply_add(x: num, y: k_over_16, z: den);
145 double q = num_r / den_r;
146
147 constexpr double ATAN_TAYLOR[] = {
148 -0x1.5555555555555p-2,
149 0x1.999999999999ap-3,
150 -0x1.2492492492492p-3,
151 0x1.c71c71c71c71cp-4,
152 };
153 double q2 = q * q;
154 double q3 = q2 * q;
155 double q4 = q2 * q2;
156 double c0 = fputil::multiply_add(x: q2, y: ATAN_TAYLOR[1], z: ATAN_TAYLOR[0]);
157 double c1 = fputil::multiply_add(x: q2, y: ATAN_TAYLOR[3], z: ATAN_TAYLOR[2]);
158 double d = fputil::multiply_add(x: q4, y: c1, z: c0);
159 return fputil::multiply_add(x: q3, y: d, z: q);
160}
161
162// > Q = fpminimax(asin(x)/x, [|0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24|],
163// [|1, D...|], [0, 0.5]);
164// > dirtyinfnorm((asin(x) - x*Q)/asin(x), [0, 0.5]);
165// 0x1.1ff...p-56
166LIBC_INLINE_VAR constexpr double ASIN_COEFFS[12] = {
167 0x1.555555555538p-3, 0x1.333333336fd5bp-4, 0x1.6db6db41ce4bcp-5,
168 0x1.f1c72c66896dep-6, 0x1.6e89f0a0ac64bp-6, 0x1.1c6c111de4074p-6,
169 0x1.c6fa84b5699acp-7, 0x1.8ed60a3e6dd19p-7, 0x1.ab3a090750049p-8,
170 0x1.405213cd1ef46p-6, -0x1.0a5a381f73f65p-6, 0x1.05985a32a9045p-5,
171};
172
173// Evaluate P(x^2) - 1, where P(x^2) ~ asin(x)/x
174LIBC_INLINE LIBC_CONSTEXPR double asin_eval(double xsq) {
175 double x4 = xsq * xsq;
176 double c0 = fputil::multiply_add(x: xsq, y: ASIN_COEFFS[1], z: ASIN_COEFFS[0]);
177 double c1 = fputil::multiply_add(x: xsq, y: ASIN_COEFFS[3], z: ASIN_COEFFS[2]);
178 double c2 = fputil::multiply_add(x: xsq, y: ASIN_COEFFS[5], z: ASIN_COEFFS[4]);
179 double c3 = fputil::multiply_add(x: xsq, y: ASIN_COEFFS[7], z: ASIN_COEFFS[6]);
180 double c4 = fputil::multiply_add(x: xsq, y: ASIN_COEFFS[9], z: ASIN_COEFFS[8]);
181 double c5 = fputil::multiply_add(x: xsq, y: ASIN_COEFFS[11], z: ASIN_COEFFS[10]);
182 double x8 = x4 * x4;
183 double d0 = fputil::multiply_add(x: x4, y: c1, z: c0);
184 double d1 = fputil::multiply_add(x: x4, y: c3, z: c2);
185 double d2 = fputil::multiply_add(x: x4, y: c5, z: c4);
186 return fputil::polyeval(x: x8, a0: d0, a: d1, a: d2);
187}
188
189// the coefficients for the polynomial approximation of asin(x)/(pi*x) in the
190// range [0, 0.5] extracted using Sollya.
191//
192// Sollya code:
193// > prec = 200;
194// > display = hexadecimal;
195// > g = asin(x) / (pi * x);
196// > P = fpminimax(g, [|0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22|],
197// > [|D...|], [0, 0.5]);
198// > for i from 0 to degree(P) do coeff(P, i);
199// > print("Error:", dirtyinfnorm(P - g, [1e-30; 0.25]));
200// Error : 0x1.6b01ec54170565911f924eb53361de37df00d74e2a10a21d5p-56 ~ 2^−55.496
201//
202// Non-zero coefficients (even powers only):
203constexpr double ASINPI_COEFFS[13] = {
204 0x1.45f306dc9c883p-2, // x^0
205 0x1.b2995e7b7af0fp-5, // x^2
206 0x1.8723a1d61d2e9p-6, // x^4
207 0x1.d1a4529a30a69p-7, // x^6
208 0x1.3ce53861f8f1fp-7, // x^8
209 0x1.d2b076c914efep-8, // x^10
210 0x1.6a2b36f9aed68p-8, // x^12
211 0x1.21604ae2879a2p-8, // x^14
212 0x1.ff0549b4fd0d6p-9, // x^16
213 0x1.035d343508f72p-9, // x^18
214 0x1.a7b91f72b1592p-8, // x^20
215 -0x1.6a3fb073e97aep-8, // x^22
216 0x1.547a51d51664ap-7 // x^24
217};
218
219// Evaluates P1(v2) = c1 + c2*v2 + c3*v2^2 + ... + c12*v2^11 (tail of P
220// without c0) using Estrin's scheme for instruction-level parallelism.
221//
222// The tail polynomial has 12 coefficients (ASINPI_COEFFS[1] through
223// ASINPI_COEFFS[12]) in powers of v2:
224// P1(v2) = c1 + c2*v2 + c3*v2^2 + c4*v2^3 + ... + c12*v2^11
225//
226// Estrin pairs them bottom-up:
227// Level 0 (6 pairs, using v2):
228// p0 = c1 + c2*v2 p1 = c3 + c4*v2
229// p2 = c5 + c6*v2 p3 = c7 + c8*v2
230// p4 = c9 + c10*v2 p5 = c11 + c12*v2
231// Level 1 (3 pairs, using v4):
232// q0 = p0 + p1*v4 q1 = p2 + p3*v4
233// q2 = p4 + p5*v4
234// Level 2 (using v8):
235// r0 = q0 + q1*v8 r1 = q2
236// Level 3 (using v16):
237// result = r0 + r1*v16
238LIBC_INLINE double asinpi_eval(double v2) {
239 double v4 = v2 * v2;
240 double v8 = v4 * v4;
241 double v16 = v8 * v8;
242
243 double p0 = fputil::multiply_add(x: v2, y: ASINPI_COEFFS[2], z: ASINPI_COEFFS[1]);
244 double p1 = fputil::multiply_add(x: v2, y: ASINPI_COEFFS[4], z: ASINPI_COEFFS[3]);
245 double p2 = fputil::multiply_add(x: v2, y: ASINPI_COEFFS[6], z: ASINPI_COEFFS[5]);
246 double p3 = fputil::multiply_add(x: v2, y: ASINPI_COEFFS[8], z: ASINPI_COEFFS[7]);
247 double p4 = fputil::multiply_add(x: v2, y: ASINPI_COEFFS[10], z: ASINPI_COEFFS[9]);
248 double p5 = fputil::multiply_add(x: v2, y: ASINPI_COEFFS[12], z: ASINPI_COEFFS[11]);
249
250 double q0 = fputil::multiply_add(x: v4, y: p1, z: p0);
251 double q1 = fputil::multiply_add(x: v4, y: p3, z: p2);
252 double q2 = fputil::multiply_add(x: v4, y: p5, z: p4);
253
254 double r0 = fputil::multiply_add(x: v8, y: q1, z: q0);
255
256 return fputil::multiply_add(x: v16, y: q2, z: r0);
257}
258
259} // namespace inv_trigf_utils_internal
260
261} // namespace LIBC_NAMESPACE_DECL
262
263#endif // LLVM_LIBC_SRC___SUPPORT_MATH_INV_TRIGF_UTILS_H
264