| 1 | //===-- Utilities for double-double data type. ------------------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_DOUBLE_DOUBLE_H |
| 10 | #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_DOUBLE_DOUBLE_H |
| 11 | |
| 12 | #include "multiply_add.h" |
| 13 | #include "src/__support/common.h" |
| 14 | #include "src/__support/macros/config.h" |
| 15 | #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA |
| 16 | #include "src/__support/number_pair.h" |
| 17 | |
| 18 | namespace LIBC_NAMESPACE_DECL { |
| 19 | namespace fputil { |
| 20 | |
| 21 | template <typename T> struct DefaultSplit; |
| 22 | template <> struct DefaultSplit<float> { |
| 23 | static constexpr size_t VALUE = 12; |
| 24 | }; |
| 25 | template <> struct DefaultSplit<double> { |
| 26 | static constexpr size_t VALUE = 27; |
| 27 | }; |
| 28 | |
| 29 | using DoubleDouble = NumberPair<double>; |
| 30 | using FloatFloat = NumberPair<float>; |
| 31 | |
| 32 | // Dekker's FastTwoSum: |
| 33 | // r_hi + r_lo ~ a + b |
| 34 | // In precision `p`, it's exact if `lsb(a) >= ulp(b)` and one of the following |
| 35 | // conditions satisfies: |
| 36 | // (i) rounding mode = RN |
| 37 | // (ii) e_a - e_b <= p |
| 38 | // (iii) rounding mode = RD and b >= 0 |
| 39 | // (iv) rounding mode = RU and b <= 0 |
| 40 | // (v) rounding mode = RZ and ab >= 0 |
| 41 | // Otherwise, the errors err = (a + b) - (r_hi + r_lo) is bounded by: |
| 42 | // |err| <= 2^(-2p + 1) * ufp(a + b) <= 2^(-2p + 1) * ufp(r_hi) |
| 43 | // when `lsb(a) >= ulp(b)`. |
| 44 | // In particular, |
| 45 | // |err| <= 2^-47 * ufp(r_hi) for single precision, |
| 46 | // <= 2^-105 * ufp(r_hi) for double precision. |
| 47 | // If the condition `lsb(a) >= ulp(b)` does NOT satisfy, then: |
| 48 | // |err| < 3 * 2^(-p) * |r_hi|. |
| 49 | // Reference: |
| 50 | // Jeannerod, C.-P. and Zimmermann, P., "FastTwoSum revisited", ARITH 2025, |
| 51 | // <https://www.arith2025.org/proceedings/215900a141.pdf>. |
| 52 | template <bool FAST2SUM = true, typename T = double> |
| 53 | LIBC_INLINE constexpr NumberPair<T> exact_add(T a, T b) { |
| 54 | NumberPair<T> r{0.0, 0.0}; |
| 55 | if constexpr (FAST2SUM) { |
| 56 | r.hi = a + b; |
| 57 | T t = r.hi - a; |
| 58 | r.lo = b - t; |
| 59 | } else { |
| 60 | r.hi = a + b; |
| 61 | T t1 = r.hi - a; |
| 62 | T t2 = r.hi - t1; |
| 63 | T t3 = b - t1; |
| 64 | T t4 = a - t2; |
| 65 | r.lo = t3 + t4; |
| 66 | } |
| 67 | return r; |
| 68 | } |
| 69 | |
| 70 | // Following the above analysis of FastTwoSum, |
| 71 | // If `lsb(a.hi) >= ulp(b.hi)`, then the errors: |
| 72 | // err = (a.hi + a.lo + b.hi + b.lo) - (r.hi - r.lo) is bounded by: |
| 73 | // |err| < 2^(-2p) * ufp(r.hi) + 2^(-p + 2) * ufp(a.lo + b.lo). |
| 74 | template <typename T> |
| 75 | LIBC_INLINE constexpr NumberPair<T> add(const NumberPair<T> &a, |
| 76 | const NumberPair<T> &b) { |
| 77 | NumberPair<T> r = exact_add(a.hi, b.hi); |
| 78 | T lo = a.lo + b.lo; |
| 79 | return exact_add(r.hi, r.lo + lo); |
| 80 | } |
| 81 | |
| 82 | // Assumption: |a.hi| >= |b| |
| 83 | template <typename T> |
| 84 | LIBC_INLINE constexpr NumberPair<T> add(const NumberPair<T> &a, T b) { |
| 85 | NumberPair<T> r = exact_add<false>(a.hi, b); |
| 86 | return exact_add(r.hi, r.lo + a.lo); |
| 87 | } |
| 88 | |
| 89 | // Veltkamp's Splitting for double precision. |
| 90 | // Note: This is proved to be correct for all rounding modes: |
| 91 | // Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed |
| 92 | // Roundings," https://inria.hal.science/hal-04480440. |
| 93 | // Default splitting constant = 2^ceil(prec(double)/2) + 1 = 2^27 + 1. |
| 94 | template <typename T = double, size_t N = DefaultSplit<T>::VALUE> |
| 95 | LIBC_INLINE constexpr NumberPair<T> split(T a) { |
| 96 | NumberPair<T> r{0.0, 0.0}; |
| 97 | // CN = 2^N. |
| 98 | constexpr T CN = static_cast<T>(1 << N); |
| 99 | constexpr T C = CN + T(1); |
| 100 | T t1 = C * a; |
| 101 | T t2 = a - t1; |
| 102 | r.hi = t1 + t2; |
| 103 | r.lo = a - r.hi; |
| 104 | return r; |
| 105 | } |
| 106 | |
| 107 | // Helper for non-fma exact mult where the first number is already split. |
| 108 | template <typename T = double, size_t SPLIT_B = DefaultSplit<T>::VALUE> |
| 109 | LIBC_INLINE constexpr NumberPair<T> exact_mult(const NumberPair<T> &as, T a, |
| 110 | T b) { |
| 111 | NumberPair<T> bs = split<T, SPLIT_B>(b); |
| 112 | NumberPair<T> r{0.0, 0.0}; |
| 113 | |
| 114 | r.hi = a * b; |
| 115 | T t1 = as.hi * bs.hi - r.hi; |
| 116 | T t2 = as.hi * bs.lo + t1; |
| 117 | T t3 = as.lo * bs.hi + t2; |
| 118 | r.lo = as.lo * bs.lo + t3; |
| 119 | |
| 120 | return r; |
| 121 | } |
| 122 | |
| 123 | // The templated exact multiplication needs template version of |
| 124 | // LIBC_TARGET_CPU_HAS_FMA_* macro to correctly select the implementation. |
| 125 | // These can be moved to "src/__support/macros/properties/cpu_features.h" if |
| 126 | // other part of libc needed. |
| 127 | template <typename T> struct TargetHasFmaInstruction { |
| 128 | static constexpr bool VALUE = false; |
| 129 | }; |
| 130 | |
| 131 | #ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT |
| 132 | template <> struct TargetHasFmaInstruction<float> { |
| 133 | static constexpr bool VALUE = true; |
| 134 | }; |
| 135 | #endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT |
| 136 | |
| 137 | #ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE |
| 138 | template <> struct TargetHasFmaInstruction<double> { |
| 139 | static constexpr bool VALUE = true; |
| 140 | }; |
| 141 | #endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE |
| 142 | |
| 143 | // Note: When FMA instruction is not available, the `exact_mult` function is |
| 144 | // only correct for round-to-nearest mode. See: |
| 145 | // Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed |
| 146 | // Roundings," https://inria.hal.science/hal-04480440. |
| 147 | // Using Theorem 1 in the paper above, without FMA instruction, if we restrict |
| 148 | // the generated constants to precision <= 51, and splitting it by 2^28 + 1, |
| 149 | // then a * b = r.hi + r.lo is exact for all rounding modes. |
| 150 | template <typename T = double, size_t SPLIT_B = DefaultSplit<T>::VALUE> |
| 151 | LIBC_INLINE LIBC_CONSTEXPR NumberPair<T> exact_mult(T a, T b) { |
| 152 | NumberPair<T> r{0.0, 0.0}; |
| 153 | |
| 154 | if constexpr (TargetHasFmaInstruction<T>::VALUE) { |
| 155 | r.hi = a * b; |
| 156 | r.lo = fputil::multiply_add(a, b, -r.hi); |
| 157 | } else { |
| 158 | // Dekker's Product. |
| 159 | NumberPair<T> as = split(a); |
| 160 | |
| 161 | r = exact_mult<T, SPLIT_B>(as, a, b); |
| 162 | } |
| 163 | |
| 164 | return r; |
| 165 | } |
| 166 | |
| 167 | template <typename T = double> |
| 168 | LIBC_INLINE NumberPair<T> quick_mult(T a, const NumberPair<T> &b) { |
| 169 | NumberPair<T> r = exact_mult(a, b.hi); |
| 170 | r.lo = multiply_add(a, b.lo, r.lo); |
| 171 | return r; |
| 172 | } |
| 173 | |
| 174 | template <size_t SPLIT_B = 27> |
| 175 | LIBC_INLINE constexpr DoubleDouble quick_mult(const DoubleDouble &a, |
| 176 | const DoubleDouble &b) { |
| 177 | DoubleDouble r = exact_mult<double, SPLIT_B>(a.hi, b.hi); |
| 178 | double t1 = multiply_add(x: a.hi, y: b.lo, z: r.lo); |
| 179 | double t2 = multiply_add(x: a.lo, y: b.hi, z: t1); |
| 180 | r.lo = t2; |
| 181 | return r; |
| 182 | } |
| 183 | |
| 184 | // Assuming |c| >= |a * b|. |
| 185 | template <> |
| 186 | LIBC_INLINE DoubleDouble multiply_add<DoubleDouble>(const DoubleDouble &a, |
| 187 | const DoubleDouble &b, |
| 188 | const DoubleDouble &c) { |
| 189 | return add(a: c, b: quick_mult(a, b)); |
| 190 | } |
| 191 | |
| 192 | // Accurate double-double division, following Karp-Markstein's trick for |
| 193 | // division, implemented in the CORE-MATH project at: |
| 194 | // https://gitlab.inria.fr/core-math/core-math/-/blob/master/src/binary64/tan/tan.c#L1855 |
| 195 | // |
| 196 | // Error bounds: |
| 197 | // Let a = ah + al, b = bh + bl. |
| 198 | // Let r = rh + rl be the approximation of (ah + al) / (bh + bl). |
| 199 | // Then: |
| 200 | // (ah + al) / (bh + bl) - rh = |
| 201 | // = ((ah - bh * rh) + (al - bl * rh)) / (bh + bl) |
| 202 | // = (1 + O(bl/bh)) * ((ah - bh * rh) + (al - bl * rh)) / bh |
| 203 | // Let q = round(1/bh), then the above expressions are approximately: |
| 204 | // = (1 + O(bl / bh)) * (1 + O(2^-52)) * q * ((ah - bh * rh) + (al - bl * rh)) |
| 205 | // So we can compute: |
| 206 | // rl = q * (ah - bh * rh) + q * (al - bl * rh) |
| 207 | // as accurate as possible, then the error is bounded by: |
| 208 | // |(ah + al) / (bh + bl) - (rh + rl)| < O(bl/bh) * (2^-52 + al/ah + bl/bh) |
| 209 | template <typename T> |
| 210 | LIBC_INLINE NumberPair<T> div(const NumberPair<T> &a, const NumberPair<T> &b) { |
| 211 | NumberPair<T> r; |
| 212 | T q = T(1) / b.hi; |
| 213 | r.hi = a.hi * q; |
| 214 | |
| 215 | #ifdef LIBC_TARGET_CPU_HAS_FMA |
| 216 | T e_hi = fputil::multiply_add(b.hi, -r.hi, a.hi); |
| 217 | T e_lo = fputil::multiply_add(b.lo, -r.hi, a.lo); |
| 218 | #else |
| 219 | NumberPair<T> b_hi_r_hi = fputil::exact_mult(b.hi, -r.hi); |
| 220 | NumberPair<T> b_lo_r_hi = fputil::exact_mult(b.lo, -r.hi); |
| 221 | T e_hi = (a.hi + b_hi_r_hi.hi) + b_hi_r_hi.lo; |
| 222 | T e_lo = (a.lo + b_lo_r_hi.hi) + b_lo_r_hi.lo; |
| 223 | #endif // LIBC_TARGET_CPU_HAS_FMA |
| 224 | |
| 225 | r.lo = q * (e_hi + e_lo); |
| 226 | return r; |
| 227 | } |
| 228 | |
| 229 | } // namespace fputil |
| 230 | } // namespace LIBC_NAMESPACE_DECL |
| 231 | |
| 232 | #endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_DOUBLE_DOUBLE_H |
| 233 | |