1//===-- Implementation header for pow ---------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_POW_H
10#define LLVM_LIBC_SRC___SUPPORT_MATH_POW_H
11
12#include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2.
13#include "exp_constants.h" // Lookup tables EXP_M1 and EXP_M2.
14#include "hdr/errno_macros.h"
15#include "hdr/fenv_macros.h"
16#include "src/__support/CPP/bit.h"
17#include "src/__support/FPUtil/FEnvImpl.h"
18#include "src/__support/FPUtil/FPBits.h"
19#include "src/__support/FPUtil/PolyEval.h"
20#include "src/__support/FPUtil/double_double.h"
21#include "src/__support/FPUtil/multiply_add.h"
22#include "src/__support/FPUtil/nearest_integer.h"
23#include "src/__support/FPUtil/sqrt.h" // Speedup for pow(x, 1/2) = sqrt(x)
24#include "src/__support/common.h"
25#include "src/__support/macros/config.h"
26#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
27
28namespace LIBC_NAMESPACE_DECL {
29
30namespace math {
31
32namespace pow_internal {
33
34using fputil::DoubleDouble;
35
36using namespace common_constants_internal;
37
38// Constants for log2(x) range reduction, generated by Sollya with:
39// > for i from 0 to 127 do {
40// r = 2^-8 * ceil( 2^8 * (1 - 2^(-8)) / (1 + i*2^-7) );
41// b = nearestint(log2(r) * 2^41) * 2^-41;
42// c = round(log2(r) - b, D, RN);
43// print("{", -c, ",", -b, "},");
44// };
45// This is the same as -log2(RD[i]), with the least significant bits of the
46// high part set to be 2^-41, so that the sum of high parts + e_x is exact in
47// double precision.
48// We also replace the first and the last ones to be 0.
49LIBC_INLINE_VAR constexpr DoubleDouble LOG2_R_DD[128] = {
50 {.lo: 0.0, .hi: 0.0},
51 {.lo: -0x1.19b14945cf6bap-44, .hi: 0x1.72c7ba21p-7},
52 {.lo: -0x1.95539356f93dcp-43, .hi: 0x1.743ee862p-6},
53 {.lo: 0x1.abe0a48f83604p-43, .hi: 0x1.184b8e4c5p-5},
54 {.lo: 0x1.635577970e04p-43, .hi: 0x1.77394c9d9p-5},
55 {.lo: -0x1.401fbaaa67e3cp-45, .hi: 0x1.d6ebd1f2p-5},
56 {.lo: -0x1.5b1799ceaeb51p-43, .hi: 0x1.1bb32a6008p-4},
57 {.lo: 0x1.7c407050799bfp-43, .hi: 0x1.4c560fe688p-4},
58 {.lo: 0x1.da6339da288fcp-43, .hi: 0x1.7d60496cf8p-4},
59 {.lo: 0x1.be4f6f22dbbadp-43, .hi: 0x1.960caf9ab8p-4},
60 {.lo: -0x1.c760bc9b188c4p-45, .hi: 0x1.c7b528b71p-4},
61 {.lo: 0x1.164e932b2d51cp-44, .hi: 0x1.f9c95dc1dp-4},
62 {.lo: 0x1.924ae921f7ecap-45, .hi: 0x1.097e38ce6p-3},
63 {.lo: -0x1.6d25a5b8a19b2p-44, .hi: 0x1.22dadc2ab4p-3},
64 {.lo: 0x1.e50a1644ac794p-43, .hi: 0x1.3c6fb650ccp-3},
65 {.lo: 0x1.f34baa74a7942p-43, .hi: 0x1.494f863b8cp-3},
66 {.lo: -0x1.8f7aac147fdc1p-46, .hi: 0x1.633a8bf438p-3},
67 {.lo: 0x1.f84be19cb9578p-43, .hi: 0x1.7046031c78p-3},
68 {.lo: -0x1.66cccab240e9p-46, .hi: 0x1.8a8980abfcp-3},
69 {.lo: -0x1.3f7a55cd2af4cp-47, .hi: 0x1.97c1cb13c8p-3},
70 {.lo: 0x1.3458cde69308cp-43, .hi: 0x1.b2602497d4p-3},
71 {.lo: -0x1.667f21fa8423fp-44, .hi: 0x1.bfc67a8p-3},
72 {.lo: 0x1.d2fe4574e09b9p-47, .hi: 0x1.dac22d3e44p-3},
73 {.lo: 0x1.367bde40c5e6dp-43, .hi: 0x1.e857d3d36p-3},
74 {.lo: 0x1.d45da26510033p-46, .hi: 0x1.01d9bbcfa6p-2},
75 {.lo: -0x1.7204f55bbf90dp-44, .hi: 0x1.08bce0d96p-2},
76 {.lo: -0x1.d4f1b95e0ff45p-43, .hi: 0x1.169c05364p-2},
77 {.lo: 0x1.c20d74c0211bfp-44, .hi: 0x1.1d982c9d52p-2},
78 {.lo: 0x1.ad89a083e072ap-43, .hi: 0x1.249cd2b13cp-2},
79 {.lo: 0x1.cd0cb4492f1bcp-43, .hi: 0x1.32bfee370ep-2},
80 {.lo: -0x1.2101a9685c779p-47, .hi: 0x1.39de8e155ap-2},
81 {.lo: 0x1.9451cd394fe8dp-43, .hi: 0x1.4106017c3ep-2},
82 {.lo: 0x1.661e393a16b95p-44, .hi: 0x1.4f6fbb2cecp-2},
83 {.lo: -0x1.c6d8d86531d56p-44, .hi: 0x1.56b22e6b58p-2},
84 {.lo: 0x1.c1c885adb21d3p-43, .hi: 0x1.5dfdcf1eeap-2},
85 {.lo: 0x1.3bb5921006679p-45, .hi: 0x1.6552b49986p-2},
86 {.lo: 0x1.1d406db502403p-43, .hi: 0x1.6cb0f6865cp-2},
87 {.lo: 0x1.55a63e278bad5p-43, .hi: 0x1.7b89f02cf2p-2},
88 {.lo: -0x1.66ae2a7ada553p-49, .hi: 0x1.8304d90c12p-2},
89 {.lo: -0x1.66cccab240e9p-45, .hi: 0x1.8a8980abfcp-2},
90 {.lo: -0x1.62404772a151dp-45, .hi: 0x1.921800924ep-2},
91 {.lo: 0x1.ac9bca36fd02ep-44, .hi: 0x1.99b072a96cp-2},
92 {.lo: 0x1.4bc302ffa76fbp-43, .hi: 0x1.a8ff97181p-2},
93 {.lo: 0x1.01fea1ec47c71p-43, .hi: 0x1.b0b67f4f46p-2},
94 {.lo: -0x1.f20203b3186a6p-43, .hi: 0x1.b877c57b1cp-2},
95 {.lo: -0x1.2642415d47384p-45, .hi: 0x1.c043859e3p-2},
96 {.lo: -0x1.bc76a2753b99bp-50, .hi: 0x1.c819dc2d46p-2},
97 {.lo: -0x1.da93ae3a5f451p-43, .hi: 0x1.cffae611aep-2},
98 {.lo: -0x1.50e785694a8c6p-43, .hi: 0x1.d7e6c0abc4p-2},
99 {.lo: 0x1.c56138c894641p-43, .hi: 0x1.dfdd89d586p-2},
100 {.lo: 0x1.5669df6a2b592p-43, .hi: 0x1.e7df5fe538p-2},
101 {.lo: -0x1.ea92d9e0e8ac2p-48, .hi: 0x1.efec61b012p-2},
102 {.lo: 0x1.a0331af2e6feap-43, .hi: 0x1.f804ae8d0cp-2},
103 {.lo: 0x1.9518ce032f41dp-48, .hi: 0x1.0014332bep-1},
104 {.lo: -0x1.b3b3864c60011p-44, .hi: 0x1.042bd4b9a8p-1},
105 {.lo: -0x1.103e8f00d41c8p-45, .hi: 0x1.08494c66b9p-1},
106 {.lo: 0x1.65be75cc3da17p-43, .hi: 0x1.0c6caaf0c5p-1},
107 {.lo: 0x1.3676289cd3dd4p-43, .hi: 0x1.1096015deep-1},
108 {.lo: -0x1.41dfc7d7c3321p-43, .hi: 0x1.14c560fe69p-1},
109 {.lo: 0x1.e0cda8bd74461p-44, .hi: 0x1.18fadb6e2dp-1},
110 {.lo: 0x1.2a606046ad444p-44, .hi: 0x1.1d368296b5p-1},
111 {.lo: 0x1.f9ea977a639cp-43, .hi: 0x1.217868b0c3p-1},
112 {.lo: -0x1.50520a377c7ecp-45, .hi: 0x1.25c0a0463cp-1},
113 {.lo: 0x1.6e3cb71b554e7p-47, .hi: 0x1.2a0f3c3407p-1},
114 {.lo: -0x1.4275f1035e5e8p-48, .hi: 0x1.2e644fac05p-1},
115 {.lo: -0x1.4275f1035e5e8p-48, .hi: 0x1.2e644fac05p-1},
116 {.lo: -0x1.979a5db68721dp-45, .hi: 0x1.32bfee370fp-1},
117 {.lo: 0x1.1ee969a95f529p-43, .hi: 0x1.37222bb707p-1},
118 {.lo: 0x1.bb4b69336b66ep-43, .hi: 0x1.3b8b1c68fap-1},
119 {.lo: 0x1.d5e6a8a4fb059p-45, .hi: 0x1.3ffad4e74fp-1},
120 {.lo: 0x1.3106e404cabb7p-44, .hi: 0x1.44716a2c08p-1},
121 {.lo: 0x1.3106e404cabb7p-44, .hi: 0x1.44716a2c08p-1},
122 {.lo: -0x1.9bcaf1aa4168ap-43, .hi: 0x1.48eef19318p-1},
123 {.lo: 0x1.1646b761c48dep-44, .hi: 0x1.4d7380dcc4p-1},
124 {.lo: 0x1.2f0c0bfe9dbecp-43, .hi: 0x1.51ff2e3021p-1},
125 {.lo: 0x1.29904613e33cp-43, .hi: 0x1.5692101d9bp-1},
126 {.lo: 0x1.1d406db502403p-44, .hi: 0x1.5b2c3da197p-1},
127 {.lo: 0x1.1d406db502403p-44, .hi: 0x1.5b2c3da197p-1},
128 {.lo: -0x1.125d6cbcd1095p-44, .hi: 0x1.5fcdce2728p-1},
129 {.lo: -0x1.bd9b32266d92cp-43, .hi: 0x1.6476d98adap-1},
130 {.lo: 0x1.54243b21709cep-44, .hi: 0x1.6927781d93p-1},
131 {.lo: 0x1.54243b21709cep-44, .hi: 0x1.6927781d93p-1},
132 {.lo: -0x1.ce60916e52e91p-44, .hi: 0x1.6ddfc2a79p-1},
133 {.lo: 0x1.f1f5ae718f241p-43, .hi: 0x1.729fd26b7p-1},
134 {.lo: -0x1.6eb9612e0b4f3p-43, .hi: 0x1.7767c12968p-1},
135 {.lo: -0x1.6eb9612e0b4f3p-43, .hi: 0x1.7767c12968p-1},
136 {.lo: 0x1.fed21f9cb2cc5p-43, .hi: 0x1.7c37a9227ep-1},
137 {.lo: 0x1.7f5dc57266758p-43, .hi: 0x1.810fa51bf6p-1},
138 {.lo: 0x1.7f5dc57266758p-43, .hi: 0x1.810fa51bf6p-1},
139 {.lo: 0x1.5b338360c2ae2p-43, .hi: 0x1.85efd062c6p-1},
140 {.lo: -0x1.96fc8f4b56502p-43, .hi: 0x1.8ad846cf37p-1},
141 {.lo: -0x1.96fc8f4b56502p-43, .hi: 0x1.8ad846cf37p-1},
142 {.lo: -0x1.bdc81c4db3134p-44, .hi: 0x1.8fc924c89bp-1},
143 {.lo: 0x1.36c101ee1344p-43, .hi: 0x1.94c287492cp-1},
144 {.lo: 0x1.36c101ee1344p-43, .hi: 0x1.94c287492cp-1},
145 {.lo: 0x1.e41fa0a62e6aep-44, .hi: 0x1.99c48be206p-1},
146 {.lo: -0x1.d97ee9124773bp-46, .hi: 0x1.9ecf50bf44p-1},
147 {.lo: -0x1.d97ee9124773bp-46, .hi: 0x1.9ecf50bf44p-1},
148 {.lo: -0x1.3f94e00e7d6bcp-46, .hi: 0x1.a3e2f4ac44p-1},
149 {.lo: -0x1.6879fa00b120ap-43, .hi: 0x1.a8ff971811p-1},
150 {.lo: -0x1.6879fa00b120ap-43, .hi: 0x1.a8ff971811p-1},
151 {.lo: 0x1.1659d8e2d7d38p-44, .hi: 0x1.ae255819fp-1},
152 {.lo: 0x1.1e5e0ae0d3f8ap-43, .hi: 0x1.b35458761dp-1},
153 {.lo: 0x1.1e5e0ae0d3f8ap-43, .hi: 0x1.b35458761dp-1},
154 {.lo: 0x1.484a15babcf88p-43, .hi: 0x1.b88cb9a2abp-1},
155 {.lo: 0x1.484a15babcf88p-43, .hi: 0x1.b88cb9a2abp-1},
156 {.lo: 0x1.871a7610e40bdp-45, .hi: 0x1.bdce9dcc96p-1},
157 {.lo: -0x1.2d90e5edaeceep-43, .hi: 0x1.c31a27dd01p-1},
158 {.lo: -0x1.2d90e5edaeceep-43, .hi: 0x1.c31a27dd01p-1},
159 {.lo: -0x1.5dd31d962d373p-43, .hi: 0x1.c86f7b7ea5p-1},
160 {.lo: -0x1.5dd31d962d373p-43, .hi: 0x1.c86f7b7ea5p-1},
161 {.lo: -0x1.9ad57391924a7p-43, .hi: 0x1.cdcebd2374p-1},
162 {.lo: -0x1.3167ccc538261p-44, .hi: 0x1.d338120a6ep-1},
163 {.lo: -0x1.3167ccc538261p-44, .hi: 0x1.d338120a6ep-1},
164 {.lo: 0x1.c7a4ff65ddbc9p-45, .hi: 0x1.d8aba045bp-1},
165 {.lo: 0x1.c7a4ff65ddbc9p-45, .hi: 0x1.d8aba045bp-1},
166 {.lo: -0x1.f9ab3cf74babap-44, .hi: 0x1.de298ec0bbp-1},
167 {.lo: -0x1.f9ab3cf74babap-44, .hi: 0x1.de298ec0bbp-1},
168 {.lo: 0x1.52842c1c1e586p-43, .hi: 0x1.e3b20546f5p-1},
169 {.lo: 0x1.52842c1c1e586p-43, .hi: 0x1.e3b20546f5p-1},
170 {.lo: 0x1.3c6764fc87b4ap-48, .hi: 0x1.e9452c8a71p-1},
171 {.lo: 0x1.3c6764fc87b4ap-48, .hi: 0x1.e9452c8a71p-1},
172 {.lo: -0x1.a0976c0a2827dp-44, .hi: 0x1.eee32e2aedp-1},
173 {.lo: -0x1.a0976c0a2827dp-44, .hi: 0x1.eee32e2aedp-1},
174 {.lo: -0x1.a45314dc4fc42p-43, .hi: 0x1.f48c34bd1fp-1},
175 {.lo: -0x1.a45314dc4fc42p-43, .hi: 0x1.f48c34bd1fp-1},
176 {.lo: 0x1.ef5d00e390ap-44, .hi: 0x1.fa406bd244p-1},
177 {.lo: 0.0, .hi: 1.0},
178};
179
180LIBC_INLINE bool is_odd_integer(double x) {
181 using FPBits = fputil::FPBits<double>;
182 FPBits xbits(x);
183 uint64_t x_u = xbits.uintval();
184 unsigned x_e = static_cast<unsigned>(xbits.get_biased_exponent());
185 unsigned lsb =
186 static_cast<unsigned>(cpp::countr_zero(value: x_u | FPBits::EXP_MASK));
187 constexpr unsigned UNIT_EXPONENT =
188 static_cast<unsigned>(FPBits::EXP_BIAS + FPBits::FRACTION_LEN);
189 return (x_e + lsb == UNIT_EXPONENT);
190}
191
192LIBC_INLINE bool is_integer(double x) {
193 using FPBits = fputil::FPBits<double>;
194 FPBits xbits(x);
195 uint64_t x_u = xbits.uintval();
196 unsigned x_e = static_cast<unsigned>(xbits.get_biased_exponent());
197 unsigned lsb =
198 static_cast<unsigned>(cpp::countr_zero(value: x_u | FPBits::EXP_MASK));
199 constexpr unsigned UNIT_EXPONENT =
200 static_cast<unsigned>(FPBits::EXP_BIAS + FPBits::FRACTION_LEN);
201 return (x_e + lsb >= UNIT_EXPONENT);
202}
203
204} // namespace pow_internal
205
206LIBC_INLINE double pow(double x, double y) {
207 using namespace pow_internal;
208 using FPBits = fputil::FPBits<double>;
209
210 FPBits xbits(x), ybits(y);
211
212 bool x_sign = xbits.sign() == Sign::NEG;
213 bool y_sign = ybits.sign() == Sign::NEG;
214
215 FPBits x_abs = xbits.abs();
216 FPBits y_abs = ybits.abs();
217
218 uint64_t x_mant = xbits.get_mantissa();
219 uint64_t y_mant = ybits.get_mantissa();
220 uint64_t x_u = xbits.uintval();
221 uint64_t x_a = x_abs.uintval();
222 uint64_t y_a = y_abs.uintval();
223
224 double e_x = static_cast<double>(xbits.get_exponent());
225 uint64_t sign = 0;
226
227 ///////// BEGIN - Check exceptional cases ////////////////////////////////////
228 // If x or y is signaling NaN
229 if (x_abs.is_signaling_nan() || y_abs.is_signaling_nan()) {
230 fputil::raise_except_if_required(FE_INVALID);
231 return FPBits::quiet_nan().get_val();
232 }
233
234 // The double precision number that is closest to 1 is (1 - 2^-53), which has
235 // log2(1 - 2^-53) ~ -1.715...p-53.
236 // So if |y| > |1075 / log2(1 - 2^-53)|, and x is finite:
237 // |y * log2(x)| = 0 or > 1075.
238 // Hence x^y will either overflow or underflow if x is not zero.
239 if (LIBC_UNLIKELY(y_mant == 0 || y_a > 0x43d7'4910'd52d'3052 ||
240 x_u == FPBits::one().uintval() ||
241 x_u >= FPBits::inf().uintval() ||
242 x_u < FPBits::min_normal().uintval())) {
243 // Exceptional exponents.
244 if (y == 0.0)
245 return 1.0;
246
247 switch (y_a) {
248 case 0x3fe0'0000'0000'0000: { // y = +-0.5
249 // TODO: speed up x^(-1/2) with rsqrt(x) when available.
250 if (LIBC_UNLIKELY(
251 (x == 0.0 || x_u == FPBits::inf(Sign::NEG).uintval()))) {
252 // pow(-0, 1/2) = +0
253 // pow(-inf, 1/2) = +inf
254 // Make sure it works correctly for FTZ/DAZ.
255 return y_sign ? 1.0 / (x * x) : (x * x);
256 }
257 return y_sign ? (1.0 / fputil::sqrt<double>(x)) : fputil::sqrt<double>(x);
258 }
259 case 0x3ff0'0000'0000'0000: // y = +-1.0
260 return y_sign ? (1.0 / x) : x;
261 case 0x4000'0000'0000'0000: // y = +-2.0;
262 return y_sign ? (1.0 / (x * x)) : (x * x);
263 }
264
265 // |y| > |1075 / log2(1 - 2^-53)|.
266 if (y_a > 0x43d7'4910'd52d'3052) {
267 if (y_a >= 0x7ff0'0000'0000'0000) {
268 // y is inf or nan
269 if (y_mant != 0) {
270 // y is NaN
271 // pow(1, NaN) = 1
272 // pow(x, NaN) = NaN
273 return (x_u == FPBits::one().uintval()) ? 1.0 : y;
274 }
275
276 // Now y is +-Inf
277 if (x_abs.is_nan()) {
278 // pow(NaN, +-Inf) = NaN
279 return x;
280 }
281
282 if (x_a == 0x3ff0'0000'0000'0000) {
283 // pow(+-1, +-Inf) = 1.0
284 return 1.0;
285 }
286
287 if (x == 0.0 && y_sign) {
288 // pow(+-0, -Inf) = +inf and raise FE_DIVBYZERO
289 fputil::set_errno_if_required(EDOM);
290 fputil::raise_except_if_required(FE_DIVBYZERO);
291 return FPBits::inf().get_val();
292 }
293 // pow (|x| < 1, -inf) = +inf
294 // pow (|x| < 1, +inf) = 0.0
295 // pow (|x| > 1, -inf) = 0.0
296 // pow (|x| > 1, +inf) = +inf
297 return ((x_a < FPBits::one().uintval()) == y_sign)
298 ? FPBits::inf().get_val()
299 : 0.0;
300 }
301 // x^y will overflow / underflow in double precision. Set y to a
302 // large enough exponent but not too large, so that the computations
303 // won't overflow in double precision.
304 y = y_sign ? -0x1.0p100 : 0x1.0p100;
305 }
306
307 // y is finite and non-zero.
308
309 if (x_u == FPBits::one().uintval()) {
310 // pow(1, y) = 1
311 return 1.0;
312 }
313
314 // TODO: Speed things up with pow(2, y) = exp2(y) and pow(10, y) = exp10(y).
315
316 if (x == 0.0) {
317 bool out_is_neg = x_sign && is_odd_integer(x: y);
318 if (y_sign) {
319 // pow(0, negative number) = inf
320 fputil::set_errno_if_required(EDOM);
321 fputil::raise_except_if_required(FE_DIVBYZERO);
322 return FPBits::inf(sign: out_is_neg ? Sign::NEG : Sign::POS).get_val();
323 }
324 // pow(0, positive number) = 0
325 return out_is_neg ? -0.0 : 0.0;
326 }
327
328 if (x_a == FPBits::inf().uintval()) {
329 bool out_is_neg = x_sign && is_odd_integer(x: y);
330 if (y_sign)
331 return out_is_neg ? -0.0 : 0.0;
332 return FPBits::inf(sign: out_is_neg ? Sign::NEG : Sign::POS).get_val();
333 }
334
335 if (x_a > FPBits::inf().uintval()) {
336 // x is NaN.
337 // pow (aNaN, 0) is already taken care above.
338 return x;
339 }
340
341 // Normalize denormal inputs.
342 if (x_a < FPBits::min_normal().uintval()) {
343 FPBits x_norm(x * 0x1.0p64);
344 e_x = static_cast<double>(x_norm.get_exponent()) - 64.0;
345 x_mant = x_norm.get_mantissa();
346 }
347
348 // x is finite and negative, and y is a finite integer.
349 if (x_sign) {
350 if (is_integer(x: y)) {
351 x = -x;
352 if (is_odd_integer(x: y))
353 // sign = -1.0;
354 sign = 0x8000'0000'0000'0000;
355 } else {
356 // pow( negative, non-integer ) = NaN
357 fputil::set_errno_if_required(EDOM);
358 fputil::raise_except_if_required(FE_INVALID);
359 return FPBits::quiet_nan().get_val();
360 }
361 }
362 }
363
364 ///////// END - Check exceptional cases //////////////////////////////////////
365
366 // x^y = 2^( y * log2(x) )
367 // = 2^( y * ( e_x + log2(m_x) ) )
368 // First we compute log2(x) = e_x + log2(m_x)
369
370 // Extract exponent field of x.
371
372 // Use the highest 7 fractional bits of m_x as the index for look up tables.
373 unsigned idx_x = static_cast<unsigned>(x_mant >> (FPBits::FRACTION_LEN - 7));
374 // Add the hidden bit to the mantissa.
375 // 1 <= m_x < 2
376 FPBits m_x = FPBits(x_mant | 0x3ff0'0000'0000'0000);
377
378 // Reduced argument for log2(m_x):
379 // dx = r * m_x - 1.
380 // The computation is exact, and -2^-8 <= dx < 2^-7.
381 // Then m_x = (1 + dx) / r, and
382 // log2(m_x) = log2( (1 + dx) / r )
383 // = log2(1 + dx) - log2(r).
384
385 // In order for the overall computations x^y = 2^(y * log2(x)) to have the
386 // relative errors < 2^-52 (1ULP), we will need to evaluate the exponent part
387 // y * log2(x) with absolute errors < 2^-52 (or better, 2^-53). Since the
388 // whole exponent range for double precision is bounded by
389 // |y * log2(x)| < 1076 ~ 2^10, we need to evaluate log2(x) with absolute
390 // errors < 2^-53 * 2^-10 = 2^-63.
391
392 // With that requirement, we use the following degree-6 polynomial
393 // approximation:
394 // P(dx) ~ log2(1 + dx) / dx
395 // Generated by Sollya with:
396 // > P = fpminimax(log2(1 + x)/x, 6, [|D...|], [-2^-8, 2^-7]); P;
397 // > dirtyinfnorm(log2(1 + x) - x*P, [-2^-8, 2^-7]);
398 // 0x1.d03cc...p-66
399 constexpr double COEFFS[] = {0x1.71547652b82fep0, -0x1.71547652b82e7p-1,
400 0x1.ec709dc3b1fd5p-2, -0x1.7154766124215p-2,
401 0x1.2776bd90259d8p-2, -0x1.ec586c6f3d311p-3,
402 0x1.9c4775eccf524p-3};
403 // Error: ulp(dx^2) <= (2^-7)^2 * 2^-52 = 2^-66
404 // Extra errors from various computations and rounding directions, the overall
405 // errors we can be bounded by 2^-65.
406
407 DoubleDouble dx_c0;
408
409 // Perform exact range reduction and exact product dx * c0.
410#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
411 double dx = fputil::multiply_add(RD[idx_x], m_x.get_val(), -1.0); // Exact
412 dx_c0 = fputil::exact_mult(COEFFS[0], dx);
413#else
414 double c = FPBits(m_x.uintval() & 0x3fff'e000'0000'0000).get_val();
415 double dx =
416 fputil::multiply_add(x: RD[idx_x], y: m_x.get_val() - c, z: CD[idx_x]); // Exact
417 dx_c0 = fputil::exact_mult<double, 28>(a: dx, b: COEFFS[0]); // Exact
418#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
419
420 double dx2 = dx * dx;
421 double c0 = fputil::multiply_add(x: dx, y: COEFFS[2], z: COEFFS[1]);
422 double c1 = fputil::multiply_add(x: dx, y: COEFFS[4], z: COEFFS[3]);
423 double c2 = fputil::multiply_add(x: dx, y: COEFFS[6], z: COEFFS[5]);
424
425 double p = fputil::polyeval(x: dx2, a0: c0, a: c1, a: c2);
426
427 // s = e_x - log2(r) + dx * P(dx)
428 // Absolute error bound:
429 // |log2(x) - log2_x.hi - log2_x.lo| < 2^-65.
430
431 // Notice that e_x - log2(r).hi is exact, so we perform an exact sum of
432 // e_x - log2(r).hi and the high part of the product dx * c0:
433 // log2_x_hi.hi + log2_x_hi.lo = e_x - log2(r).hi + (dx * c0).hi
434 DoubleDouble log2_x_hi =
435 fputil::exact_add(a: e_x + LOG2_R_DD[idx_x].hi, b: dx_c0.hi);
436 // The low part is dx^2 * p + low part of (dx * c0) + low part of -log2(r).
437 double log2_x_lo =
438 fputil::multiply_add(x: dx2, y: p, z: dx_c0.lo + LOG2_R_DD[idx_x].lo);
439 // Perform accurate sums.
440 DoubleDouble log2_x = fputil::exact_add(a: log2_x_hi.hi, b: log2_x_lo);
441 log2_x.lo += log2_x_hi.lo;
442
443 // To compute 2^(y * log2(x)), we break the exponent into 3 parts:
444 // y * log(2) = hi + mid + lo, where
445 // hi is an integer
446 // mid * 2^6 is an integer
447 // |lo| <= 2^-7
448 // Then:
449 // x^y = 2^(y * log2(x)) = 2^hi * 2^mid * 2^lo,
450 // In which 2^mid is obtained from a look-up table of size 2^6 = 64 elements,
451 // and 2^lo ~ 1 + lo * P(lo).
452 // Thus, we have:
453 // hi + mid = 2^-6 * round( 2^6 * y * log2(x) )
454 // If we restrict the output such that |hi| < 150, (hi + mid) uses (8 + 6)
455 // bits, hence, if we use double precision to perform
456 // round( 2^6 * y * log2(x))
457 // the lo part is bounded by 2^-7 + 2^(-(52 - 14)) = 2^-7 + 2^-38
458
459 // In the following computations:
460 // y6 = 2^6 * y
461 // hm = 2^6 * (hi + mid) = round(2^6 * y * log2(x)) ~ round(y6 * s)
462 // lo6 = 2^6 * lo = 2^6 * (y - (hi + mid)) = y6 * log2(x) - hm.
463 double y6 = y * 0x1.0p6; // Exact.
464
465 DoubleDouble y6_log2_x = fputil::exact_mult(a: y6, b: log2_x.hi);
466 y6_log2_x.lo = fputil::multiply_add(x: y6, y: log2_x.lo, z: y6_log2_x.lo);
467
468 // Check overflow/underflow.
469 double scale = 1.0;
470
471 // |2^(hi + mid) - exp2_hi_mid| <= ulp(exp2_hi_mid) / 2
472 // Clamp the exponent part into smaller range that fits double precision.
473 // For those exponents that are out of range, the final conversion will round
474 // them correctly to inf/max float or 0/min float accordingly.
475 constexpr double UPPER_EXP_BOUND = 512.0 * 0x1.0p6;
476 if (LIBC_UNLIKELY(FPBits(y6_log2_x.hi).abs().get_val() >= UPPER_EXP_BOUND)) {
477 if (FPBits(y6_log2_x.hi).sign() == Sign::POS) {
478 scale = 0x1.0p512;
479 y6_log2_x.hi -= 512.0 * 64.0;
480 if (y6_log2_x.hi > 513.0 * 64.0)
481 y6_log2_x.hi = 513.0 * 64.0;
482 } else {
483 scale = 0x1.0p-512;
484 y6_log2_x.hi += 512.0 * 64.0;
485 if (y6_log2_x.hi < (-1076.0 + 512.0) * 64.0)
486 y6_log2_x.hi = -564.0 * 64.0;
487 }
488 }
489
490 double hm = fputil::nearest_integer(x: y6_log2_x.hi);
491
492 // lo6 = 2^6 * lo.
493 double lo6_hi = y6_log2_x.hi - hm;
494 double lo6 = lo6_hi + y6_log2_x.lo;
495
496 int hm_i = static_cast<int>(hm);
497 unsigned idx_y = static_cast<unsigned>(hm_i) & 0x3f;
498
499 // 2^hi
500 int64_t exp2_hi_i = static_cast<int64_t>(
501 static_cast<uint64_t>(static_cast<int64_t>(hm_i >> 6))
502 << FPBits::FRACTION_LEN);
503 // 2^mid
504 int64_t exp2_mid_hi_i =
505 static_cast<int64_t>(FPBits(EXP2_MID1[idx_y].hi).uintval());
506 int64_t exp2_mid_lo_i =
507 static_cast<int64_t>(FPBits(EXP2_MID1[idx_y].mid).uintval());
508 // (-1)^sign * 2^hi * 2^mid
509 // Error <= 2^hi * 2^-53
510 uint64_t exp2_hm_hi_i =
511 static_cast<uint64_t>(exp2_hi_i + exp2_mid_hi_i) + sign;
512 // The low part could be 0.
513 uint64_t exp2_hm_lo_i =
514 idx_y != 0 ? static_cast<uint64_t>(exp2_hi_i + exp2_mid_lo_i) + sign
515 : sign;
516 double exp2_hm_hi = FPBits(exp2_hm_hi_i).get_val();
517 double exp2_hm_lo = FPBits(exp2_hm_lo_i).get_val();
518
519 // Degree-5 polynomial approximation P(lo6) ~ 2^(lo6 / 2^6) = 2^(lo).
520 // Generated by Sollya with:
521 // > P = fpminimax(2^(x/64), 5, [|1, D...|], [-2^-1, 2^-1]);
522 // > dirtyinfnorm(2^(x/64) - P, [-0.5, 0.5]);
523 // 0x1.a2b77e618f5c4c176fd11b7659016cde5de83cb72p-60
524 constexpr double EXP2_COEFFS[] = {0x1p0,
525 0x1.62e42fefa39efp-7,
526 0x1.ebfbdff82a23ap-15,
527 0x1.c6b08d7076268p-23,
528 0x1.3b2ad33f8b48bp-31,
529 0x1.5d870c4d84445p-40};
530
531 double lo6_sqr = lo6 * lo6;
532
533 double d0 = fputil::multiply_add(x: lo6, y: EXP2_COEFFS[2], z: EXP2_COEFFS[1]);
534 double d1 = fputil::multiply_add(x: lo6, y: EXP2_COEFFS[4], z: EXP2_COEFFS[3]);
535 double pp = fputil::polyeval(x: lo6_sqr, a0: d0, a: d1, a: EXP2_COEFFS[5]);
536
537 double r = fputil::multiply_add(x: exp2_hm_hi * lo6, y: pp, z: exp2_hm_lo);
538 r += exp2_hm_hi;
539
540 return r * scale;
541}
542
543} // namespace math
544} // namespace LIBC_NAMESPACE_DECL
545
546#endif // LLVM_LIBC_SRC___SUPPORT_MATH_POW_H
547