| 1 | //===-- Single-precision general inverse trigonometric functions ----------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #ifndef LLVM_LIBC_SRC___SUPPORT_MATH_INV_TRIGF_UTILS_H |
| 10 | #define LLVM_LIBC_SRC___SUPPORT_MATH_INV_TRIGF_UTILS_H |
| 11 | |
| 12 | #include "src/__support/FPUtil/PolyEval.h" |
| 13 | #include "src/__support/FPUtil/multiply_add.h" |
| 14 | #include "src/__support/common.h" |
| 15 | #include "src/__support/macros/config.h" |
| 16 | |
| 17 | namespace LIBC_NAMESPACE_DECL { |
| 18 | |
| 19 | namespace inv_trigf_utils_internal { |
| 20 | |
| 21 | // PI and PI / 2 |
| 22 | LIBC_INLINE_VAR constexpr double M_MATH_PI = 0x1.921fb54442d18p+1; |
| 23 | LIBC_INLINE_VAR constexpr double M_MATH_PI_2 = 0x1.921fb54442d18p+0; |
| 24 | |
| 25 | // Polynomial approximation for 0 <= x <= 1: |
| 26 | // atan(x) ~ atan((i/16) + (x - (i/16)) * Q(x - i/16) |
| 27 | // = P(x - i/16) |
| 28 | // Generated by Sollya with: |
| 29 | // > for i from 1 to 16 do { |
| 30 | // mid_point = i/16; |
| 31 | // P = fpminimax(atan(mid_point + x), 8, [|D...|], [-1/32, 1/32]); |
| 32 | // print("{", coeff(P, 0), ",", coeff(P, 1), ",", coeff(P, 2), ",", |
| 33 | // coeff(P, 3), ",", coeff(P, 4), ",", coeff(P, 5), ",", coeff(P, 6), |
| 34 | // ",", coeff(P, 7), ",", coeff(P, 8), "},"); |
| 35 | // }; |
| 36 | // For i = 0, the polynomial is generated by: |
| 37 | // > P = fpminimax(atan(x)/x, 7, [|1, D...|], [0, 1/32]); |
| 38 | // > dirtyinfnorm((atan(x) - x*P)/x, [0, 1/32]); |
| 39 | // 0x1.feb2fcdba66447ccbe28a1a0f935b51678a718fb1p-59 |
| 40 | // Notice that degree-7 is good enough for atanf, but degree-8 helps reduce the |
| 41 | // error bounds for atan2f's fast pass 16 times, and it does not affect the |
| 42 | // performance of atanf much. |
| 43 | LIBC_INLINE_VAR constexpr double ATAN_COEFFS[17][9] = { |
| 44 | {0.0, 1.0, 0x1.3f8d76d26d61bp-47, -0x1.5555555574cd8p-2, |
| 45 | 0x1.0dde5d06878eap-29, 0x1.99997738acc77p-3, 0x1.2c43eac9797cap-16, |
| 46 | -0x1.25fb020007dbdp-3, 0x1.c1b6c31d7b0aep-7}, |
| 47 | {0x1.ff55bb72cfde9p-5, 0x1.fe01fe01fe007p-1, -0x1.fc05f809ed8dap-5, |
| 48 | -0x1.4d69303afe04ep-2, 0x1.f61bc3e8349cp-5, 0x1.820839278756bp-3, |
| 49 | -0x1.eda4de1c6bf3fp-5, -0x1.0514d42d64a63p-3, 0x1.db3746a442dcbp-5}, |
| 50 | {0x1.fd5ba9aac2f6ep-4, 0x1.f81f81f81f813p-1, -0x1.f05e09d0dc378p-4, |
| 51 | -0x1.368c3aa719215p-2, 0x1.d9b16b33ff9c9p-4, 0x1.40488f9c6262ap-3, |
| 52 | -0x1.ba55933e62ea5p-4, -0x1.64c6a15cd9116p-4, 0x1.9273d5939a75ap-4}, |
| 53 | {0x1.7b97b4bce5b02p-3, 0x1.ee9c7f8458e05p-1, -0x1.665c226d6961p-3, |
| 54 | -0x1.1344bb7391703p-2, 0x1.42aca8b0081b9p-3, 0x1.c32d9381d7c03p-4, |
| 55 | -0x1.13e970672e246p-3, -0x1.181ed934dd733p-5, 0x1.bad81ea190c08p-4}, |
| 56 | {0x1.f5b75f92c80ddp-3, 0x1.e1e1e1e1e1e2cp-1, -0x1.c5894d10d363dp-3, |
| 57 | -0x1.ce6de025f9f5ep-3, 0x1.78a3a07c8dd7fp-3, 0x1.dd5f5180f386ep-5, |
| 58 | -0x1.1b1f513c4536bp-3, 0x1.0df852e58c43cp-6, 0x1.722e7a7e42505p-4}, |
| 59 | {0x1.362773707ebccp-2, 0x1.d272ca3fc5b2ep-1, -0x1.0997e8aeca8fbp-2, |
| 60 | -0x1.6cf6666e5e693p-3, 0x1.8dd1e907e88adp-3, 0x1.24849ac0caa5dp-7, |
| 61 | -0x1.f496be486229dp-4, 0x1.b7d54b8e759ecp-5, 0x1.d39c0d39c3922p-5}, |
| 62 | {0x1.6f61941e4def1p-2, 0x1.c0e070381c0f2p-1, -0x1.2726dd135d9eep-2, |
| 63 | -0x1.09f37b39b70e4p-3, 0x1.85eacdaadd712p-3, -0x1.04d66340d5b9p-5, |
| 64 | -0x1.8056b15a22b98p-4, 0x1.29baf494ad3ddp-4, 0x1.52d5881322a7ap-6}, |
| 65 | {0x1.a64eec3cc23fdp-2, 0x1.adbe87f94906ap-1, -0x1.3b9d8eab55addp-2, |
| 66 | -0x1.57c09646eb7p-4, 0x1.6795319e3b8dfp-3, -0x1.f2d89b5ef31bep-5, |
| 67 | -0x1.f38aac26203cap-5, 0x1.3262802235e3fp-4, -0x1.2afd6b9a57d66p-7}, |
| 68 | {0x1.dac670561bb4fp-2, 0x1.99999999999ap-1, -0x1.47ae147adff11p-2, |
| 69 | -0x1.5d867c40188b7p-5, 0x1.3a92a2df85e7ap-3, -0x1.3ec457c46e851p-4, |
| 70 | -0x1.ec1b9777e2e5bp-6, 0x1.0a542992a821ep-4, -0x1.ccffbe2f0d945p-6}, |
| 71 | {0x1.0657e94db30dp-1, 0x1.84f00c2780615p-1, -0x1.4c62cb562defap-2, |
| 72 | -0x1.e6495b3c14e03p-8, 0x1.063c2fa617bfcp-3, -0x1.58b782d9907aap-4, |
| 73 | -0x1.41e6ff524b7fp-8, 0x1.937dfff3205a7p-5, -0x1.0fb1fd1c729dp-5}, |
| 74 | {0x1.1e00babdefeb4p-1, 0x1.702e05c0b816ep-1, -0x1.4af2b78215fbep-2, |
| 75 | 0x1.5d0b7e9f36997p-6, 0x1.a1247cb978debp-4, -0x1.519e1457734cap-4, |
| 76 | 0x1.a755cf86b5bfbp-7, 0x1.096d174284564p-5, -0x1.081adf539ad58p-5}, |
| 77 | {0x1.345f01cce37bbp-1, 0x1.5babcc647fa8ep-1, -0x1.449db09426a6dp-2, |
| 78 | 0x1.655caac5896dap-5, 0x1.3bbbd22d05a61p-4, -0x1.34a2febee042fp-4, |
| 79 | 0x1.84df9c8269e34p-6, 0x1.200e8176c899ap-6, -0x1.c00b23c3ce222p-6}, |
| 80 | {0x1.4978fa3269ee1p-1, 0x1.47ae147ae1477p-1, -0x1.3a92a3055231ap-2, |
| 81 | 0x1.ec21b515a4a2p-5, 0x1.c2f8b81f9a0d2p-5, -0x1.0ba9964125453p-4, |
| 82 | 0x1.d7b5614777a05p-6, 0x1.971e91ed73595p-8, -0x1.3fc375a78dc74p-6}, |
| 83 | {0x1.5d58987169b18p-1, 0x1.34679ace01343p-1, -0x1.2ddfb039136e5p-2, |
| 84 | 0x1.2491307b9fb73p-4, 0x1.29c7e4886dc22p-5, -0x1.bca78bcca83ap-5, |
| 85 | 0x1.e63efd7cbe1ddp-6, -0x1.8ea6c4f03b42dp-10, -0x1.9385b5c3a6997p-7}, |
| 86 | {0x1.700a7c5784634p-1, 0x1.21fb78121fb76p-1, -0x1.1f6a8499e5d1ap-2, |
| 87 | 0x1.41b15e5e29423p-4, 0x1.59bc953163345p-6, -0x1.63b54b13184ddp-5, |
| 88 | 0x1.c9086666d213p-6, -0x1.90c3b4ad8d4bcp-8, -0x1.80f08ed9f6f57p-8}, |
| 89 | {0x1.819d0b7158a4dp-1, 0x1.107fbbe01107ep-1, -0x1.0feeb4089670ep-2, |
| 90 | 0x1.50e5afb93f5cbp-4, 0x1.2a7c2adffeffbp-7, -0x1.12bd29b4f1b43p-5, |
| 91 | 0x1.93f71f0eb00eap-6, -0x1.10ece5ad30e28p-7, -0x1.db1a76bcd2b9cp-10}, |
| 92 | {0x1.921fb54442d18p-1, 0x1.ffffffffffffep-2, -0x1.fffffffffc51cp-3, |
| 93 | 0x1.555555557002ep-4, -0x1.a88260c338e75p-30, -0x1.99999f9a7614fp-6, |
| 94 | 0x1.555e31a1e15e9p-6, -0x1.245240d65e629p-7, -0x1.fa9ba66478903p-11}, |
| 95 | }; |
| 96 | |
| 97 | // Look-up table for atan(k/16) with k = 0..16. |
| 98 | LIBC_INLINE_VAR constexpr double ATAN_K_OVER_16[17] = { |
| 99 | 0.0, |
| 100 | 0x1.ff55bb72cfdeap-5, |
| 101 | 0x1.fd5ba9aac2f6ep-4, |
| 102 | 0x1.7b97b4bce5b02p-3, |
| 103 | 0x1.f5b75f92c80ddp-3, |
| 104 | 0x1.362773707ebccp-2, |
| 105 | 0x1.6f61941e4def1p-2, |
| 106 | 0x1.a64eec3cc23fdp-2, |
| 107 | 0x1.dac670561bb4fp-2, |
| 108 | 0x1.0657e94db30dp-1, |
| 109 | 0x1.1e00babdefeb4p-1, |
| 110 | 0x1.345f01cce37bbp-1, |
| 111 | 0x1.4978fa3269ee1p-1, |
| 112 | 0x1.5d58987169b18p-1, |
| 113 | 0x1.700a7c5784634p-1, |
| 114 | 0x1.819d0b7158a4dp-1, |
| 115 | 0x1.921fb54442d18p-1, |
| 116 | }; |
| 117 | |
| 118 | // For |x| <= 1/32 and 0 <= i <= 16, return Q(x) such that: |
| 119 | // Q(x) ~ (atan(x + i/16) - atan(i/16)) / x. |
| 120 | LIBC_INLINE double atan_eval(double x, unsigned i) { |
| 121 | double x2 = x * x; |
| 122 | |
| 123 | double c0 = fputil::multiply_add(x, y: ATAN_COEFFS[i][2], z: ATAN_COEFFS[i][1]); |
| 124 | double c1 = fputil::multiply_add(x, y: ATAN_COEFFS[i][4], z: ATAN_COEFFS[i][3]); |
| 125 | double c2 = fputil::multiply_add(x, y: ATAN_COEFFS[i][6], z: ATAN_COEFFS[i][5]); |
| 126 | double c3 = fputil::multiply_add(x, y: ATAN_COEFFS[i][8], z: ATAN_COEFFS[i][7]); |
| 127 | |
| 128 | double x4 = x2 * x2; |
| 129 | double d1 = fputil::multiply_add(x: x2, y: c1, z: c0); |
| 130 | double d2 = fputil::multiply_add(x: x2, y: c3, z: c2); |
| 131 | double p = fputil::multiply_add(x: x4, y: d2, z: d1); |
| 132 | return p; |
| 133 | } |
| 134 | |
| 135 | // Evaluate atan without big lookup table. |
| 136 | // atan(n/d) - atan(k/16) = atan((n/d - k/16) / (1 + (n/d) * (k/16))) |
| 137 | // = atan((n - d * k/16)) / (d + n * k/16)) |
| 138 | // So we let q = (n - d * k/16) / (d + n * k/16), |
| 139 | // and approximate with Taylor polynomial: |
| 140 | // atan(q) ~ q - q^3/3 + q^5/5 - q^7/7 + q^9/9 |
| 141 | LIBC_INLINE double atan_eval_no_table(double num, double den, |
| 142 | double k_over_16) { |
| 143 | double num_r = fputil::multiply_add(x: den, y: -k_over_16, z: num); |
| 144 | double den_r = fputil::multiply_add(x: num, y: k_over_16, z: den); |
| 145 | double q = num_r / den_r; |
| 146 | |
| 147 | constexpr double ATAN_TAYLOR[] = { |
| 148 | -0x1.5555555555555p-2, |
| 149 | 0x1.999999999999ap-3, |
| 150 | -0x1.2492492492492p-3, |
| 151 | 0x1.c71c71c71c71cp-4, |
| 152 | }; |
| 153 | double q2 = q * q; |
| 154 | double q3 = q2 * q; |
| 155 | double q4 = q2 * q2; |
| 156 | double c0 = fputil::multiply_add(x: q2, y: ATAN_TAYLOR[1], z: ATAN_TAYLOR[0]); |
| 157 | double c1 = fputil::multiply_add(x: q2, y: ATAN_TAYLOR[3], z: ATAN_TAYLOR[2]); |
| 158 | double d = fputil::multiply_add(x: q4, y: c1, z: c0); |
| 159 | return fputil::multiply_add(x: q3, y: d, z: q); |
| 160 | } |
| 161 | |
| 162 | // > Q = fpminimax(asin(x)/x, [|0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24|], |
| 163 | // [|1, D...|], [0, 0.5]); |
| 164 | // > dirtyinfnorm((asin(x) - x*Q)/asin(x), [0, 0.5]); |
| 165 | // 0x1.1ff...p-56 |
| 166 | LIBC_INLINE_VAR constexpr double ASIN_COEFFS[12] = { |
| 167 | 0x1.555555555538p-3, 0x1.333333336fd5bp-4, 0x1.6db6db41ce4bcp-5, |
| 168 | 0x1.f1c72c66896dep-6, 0x1.6e89f0a0ac64bp-6, 0x1.1c6c111de4074p-6, |
| 169 | 0x1.c6fa84b5699acp-7, 0x1.8ed60a3e6dd19p-7, 0x1.ab3a090750049p-8, |
| 170 | 0x1.405213cd1ef46p-6, -0x1.0a5a381f73f65p-6, 0x1.05985a32a9045p-5, |
| 171 | }; |
| 172 | |
| 173 | // Evaluate P(x^2) - 1, where P(x^2) ~ asin(x)/x |
| 174 | LIBC_INLINE LIBC_CONSTEXPR double asin_eval(double xsq) { |
| 175 | double x4 = xsq * xsq; |
| 176 | double c0 = fputil::multiply_add(x: xsq, y: ASIN_COEFFS[1], z: ASIN_COEFFS[0]); |
| 177 | double c1 = fputil::multiply_add(x: xsq, y: ASIN_COEFFS[3], z: ASIN_COEFFS[2]); |
| 178 | double c2 = fputil::multiply_add(x: xsq, y: ASIN_COEFFS[5], z: ASIN_COEFFS[4]); |
| 179 | double c3 = fputil::multiply_add(x: xsq, y: ASIN_COEFFS[7], z: ASIN_COEFFS[6]); |
| 180 | double c4 = fputil::multiply_add(x: xsq, y: ASIN_COEFFS[9], z: ASIN_COEFFS[8]); |
| 181 | double c5 = fputil::multiply_add(x: xsq, y: ASIN_COEFFS[11], z: ASIN_COEFFS[10]); |
| 182 | double x8 = x4 * x4; |
| 183 | double d0 = fputil::multiply_add(x: x4, y: c1, z: c0); |
| 184 | double d1 = fputil::multiply_add(x: x4, y: c3, z: c2); |
| 185 | double d2 = fputil::multiply_add(x: x4, y: c5, z: c4); |
| 186 | return fputil::polyeval(x: x8, a0: d0, a: d1, a: d2); |
| 187 | } |
| 188 | |
| 189 | // the coefficients for the polynomial approximation of asin(x)/(pi*x) in the |
| 190 | // range [0, 0.5] extracted using Sollya. |
| 191 | // |
| 192 | // Sollya code: |
| 193 | // > prec = 200; |
| 194 | // > display = hexadecimal; |
| 195 | // > g = asin(x) / (pi * x); |
| 196 | // > P = fpminimax(g, [|0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22|], |
| 197 | // > [|D...|], [0, 0.5]); |
| 198 | // > for i from 0 to degree(P) do coeff(P, i); |
| 199 | // > print("Error:", dirtyinfnorm(P - g, [1e-30; 0.25])); |
| 200 | // Error : 0x1.6b01ec54170565911f924eb53361de37df00d74e2a10a21d5p-56 ~ 2^−55.496 |
| 201 | // |
| 202 | // Non-zero coefficients (even powers only): |
| 203 | constexpr double ASINPI_COEFFS[13] = { |
| 204 | 0x1.45f306dc9c883p-2, // x^0 |
| 205 | 0x1.b2995e7b7af0fp-5, // x^2 |
| 206 | 0x1.8723a1d61d2e9p-6, // x^4 |
| 207 | 0x1.d1a4529a30a69p-7, // x^6 |
| 208 | 0x1.3ce53861f8f1fp-7, // x^8 |
| 209 | 0x1.d2b076c914efep-8, // x^10 |
| 210 | 0x1.6a2b36f9aed68p-8, // x^12 |
| 211 | 0x1.21604ae2879a2p-8, // x^14 |
| 212 | 0x1.ff0549b4fd0d6p-9, // x^16 |
| 213 | 0x1.035d343508f72p-9, // x^18 |
| 214 | 0x1.a7b91f72b1592p-8, // x^20 |
| 215 | -0x1.6a3fb073e97aep-8, // x^22 |
| 216 | 0x1.547a51d51664ap-7 // x^24 |
| 217 | }; |
| 218 | |
| 219 | // Evaluates P1(v2) = c1 + c2*v2 + c3*v2^2 + ... + c12*v2^11 (tail of P |
| 220 | // without c0) using Estrin's scheme for instruction-level parallelism. |
| 221 | // |
| 222 | // The tail polynomial has 12 coefficients (ASINPI_COEFFS[1] through |
| 223 | // ASINPI_COEFFS[12]) in powers of v2: |
| 224 | // P1(v2) = c1 + c2*v2 + c3*v2^2 + c4*v2^3 + ... + c12*v2^11 |
| 225 | // |
| 226 | // Estrin pairs them bottom-up: |
| 227 | // Level 0 (6 pairs, using v2): |
| 228 | // p0 = c1 + c2*v2 p1 = c3 + c4*v2 |
| 229 | // p2 = c5 + c6*v2 p3 = c7 + c8*v2 |
| 230 | // p4 = c9 + c10*v2 p5 = c11 + c12*v2 |
| 231 | // Level 1 (3 pairs, using v4): |
| 232 | // q0 = p0 + p1*v4 q1 = p2 + p3*v4 |
| 233 | // q2 = p4 + p5*v4 |
| 234 | // Level 2 (using v8): |
| 235 | // r0 = q0 + q1*v8 r1 = q2 |
| 236 | // Level 3 (using v16): |
| 237 | // result = r0 + r1*v16 |
| 238 | LIBC_INLINE double asinpi_eval(double v2) { |
| 239 | double v4 = v2 * v2; |
| 240 | double v8 = v4 * v4; |
| 241 | double v16 = v8 * v8; |
| 242 | |
| 243 | double p0 = fputil::multiply_add(x: v2, y: ASINPI_COEFFS[2], z: ASINPI_COEFFS[1]); |
| 244 | double p1 = fputil::multiply_add(x: v2, y: ASINPI_COEFFS[4], z: ASINPI_COEFFS[3]); |
| 245 | double p2 = fputil::multiply_add(x: v2, y: ASINPI_COEFFS[6], z: ASINPI_COEFFS[5]); |
| 246 | double p3 = fputil::multiply_add(x: v2, y: ASINPI_COEFFS[8], z: ASINPI_COEFFS[7]); |
| 247 | double p4 = fputil::multiply_add(x: v2, y: ASINPI_COEFFS[10], z: ASINPI_COEFFS[9]); |
| 248 | double p5 = fputil::multiply_add(x: v2, y: ASINPI_COEFFS[12], z: ASINPI_COEFFS[11]); |
| 249 | |
| 250 | double q0 = fputil::multiply_add(x: v4, y: p1, z: p0); |
| 251 | double q1 = fputil::multiply_add(x: v4, y: p3, z: p2); |
| 252 | double q2 = fputil::multiply_add(x: v4, y: p5, z: p4); |
| 253 | |
| 254 | double r0 = fputil::multiply_add(x: v8, y: q1, z: q0); |
| 255 | |
| 256 | return fputil::multiply_add(x: v16, y: q2, z: r0); |
| 257 | } |
| 258 | |
| 259 | } // namespace inv_trigf_utils_internal |
| 260 | |
| 261 | } // namespace LIBC_NAMESPACE_DECL |
| 262 | |
| 263 | #endif // LLVM_LIBC_SRC___SUPPORT_MATH_INV_TRIGF_UTILS_H |
| 264 | |