//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Copyright 2018 Ulf Adams
// Copyright (c) Microsoft Corporation. All rights reserved.

// Boost Software License - Version 1.0 - August 17th, 2003

// Permission is hereby granted, free of charge, to any person or organization
// obtaining a copy of the software and accompanying documentation covered by
// this license (the "Software") to use, reproduce, display, distribute,
// execute, and transmit the Software, and to prepare derivative works of the
// Software, and to permit third-parties to whom the Software is furnished to
// do so, all subject to the following:

// The copyright notices in the Software and this entire statement, including
// the above license grant, this restriction and the following disclaimer,
// must be included in all copies of the Software, in whole or in part, and
// all derivative works of the Software, unless such copies or derivative
// works are solely in the form of machine-executable object code generated by
// a source language processor.

// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.

#ifndef _LIBCPP_SRC_INCLUDE_RYU_DS2_INTRINSICS_H
#define _LIBCPP_SRC_INCLUDE_RYU_DS2_INTRINSICS_H

// Avoid formatting to keep the changes with the original code minimal.
// clang-format off

#include <__assert>
#include <__config>

#include "include/ryu/ryu.h"

_LIBCPP_BEGIN_NAMESPACE_STD

#if defined(_M_X64) && defined(_MSC_VER)
#define _LIBCPP_INTRINSIC128 1
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __ryu_umul128(const uint64_t __a, const uint64_t __b, uint64_t* const __productHi) {
  return _umul128(__a, __b, __productHi);
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __ryu_shiftright128(const uint64_t __lo, const uint64_t __hi, const uint32_t __dist) {
  // For the __shiftright128 intrinsic, the shift value is always
  // modulo 64.
  // In the current implementation of the double-precision version
  // of Ryu, the shift value is always < 64.
  // (The shift value is in the range [49, 58].)
  // Check this here in case a future change requires larger shift
  // values. In this case this function needs to be adjusted.
  _LIBCPP_ASSERT_INTERNAL(__dist < 64, "");
  return __shiftright128(__lo, __hi, static_cast<unsigned char>(__dist));
}

// ^^^ intrinsics available ^^^ / vvv __int128 available vvv
#elif defined(__SIZEOF_INT128__) && ( \
    (defined(__clang__) && !defined(_MSC_VER)) || \
    (defined(__GNUC__) && !defined(__clang__) && !defined(__CUDACC__)))
#define _LIBCPP_INTRINSIC128 1
// We have unsigned __int128 support in Clang and GCC.
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __ryu_umul128(const uint64_t __a, const uint64_t __b, uint64_t* const __productHi) {
  auto __temp = __a * (unsigned __int128)__b;
  *__productHi = __temp >> 64;
  return static_cast<uint64_t>(__temp);
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __ryu_shiftright128(const uint64_t __lo, const uint64_t __hi, const uint32_t __dist) {
  // In the current implementation of the double-precision version
  // of Ryu, the shift value is always < 64.
  // (The shift value is in the range [49, 58].)
  // Check this here in case a future change requires larger shift
  // values. In this case this function needs to be adjusted.
  _LIBCPP_ASSERT_INTERNAL(__dist < 64, "");
  auto __temp = __lo | ((unsigned __int128)__hi << 64);
  // For x64 128-bit shifts using the `shrd` instruction and two 64-bit
  // registers, the shift value is modulo 64. Thus the `& 63` is free.
  return static_cast<uint64_t>(__temp >> (__dist & 63));
}
#else // ^^^ __int128 available ^^^ / vvv intrinsics unavailable vvv

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline _LIBCPP_ALWAYS_INLINE uint64_t __ryu_umul128(const uint64_t __a, const uint64_t __b, uint64_t* const __productHi) {
  // TRANSITION, VSO-634761
  // The casts here help MSVC to avoid calls to the __allmul library function.
  const uint32_t __aLo = static_cast<uint32_t>(__a);
  const uint32_t __aHi = static_cast<uint32_t>(__a >> 32);
  const uint32_t __bLo = static_cast<uint32_t>(__b);
  const uint32_t __bHi = static_cast<uint32_t>(__b >> 32);

  const uint64_t __b00 = static_cast<uint64_t>(__aLo) * __bLo;
  const uint64_t __b01 = static_cast<uint64_t>(__aLo) * __bHi;
  const uint64_t __b10 = static_cast<uint64_t>(__aHi) * __bLo;
  const uint64_t __b11 = static_cast<uint64_t>(__aHi) * __bHi;
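
  // Combine the partial products: with __a == (__aHi << 32) | __aLo and
  // __b == (__bHi << 32) | __bLo,
  //   __a * __b == (__b11 << 64) + ((__b01 + __b10) << 32) + __b00.
  // __mid1 and __mid2 accumulate the two middle terms plus the upper 32 bits
  // of __b00; their upper halves (__mid1Hi, __mid2Hi) are the carries that
  // propagate into the high 64 bits of the product.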
  const uint32_t __b00Lo = static_cast<uint32_t>(__b00);
  const uint32_t __b00Hi = static_cast<uint32_t>(__b00 >> 32);

  const uint64_t __mid1 = __b10 + __b00Hi;
  const uint32_t __mid1Lo = static_cast<uint32_t>(__mid1);
  const uint32_t __mid1Hi = static_cast<uint32_t>(__mid1 >> 32);

  const uint64_t __mid2 = __b01 + __mid1Lo;
  const uint32_t __mid2Lo = static_cast<uint32_t>(__mid2);
  const uint32_t __mid2Hi = static_cast<uint32_t>(__mid2 >> 32);

  const uint64_t __pHi = __b11 + __mid1Hi + __mid2Hi;
  const uint64_t __pLo = (static_cast<uint64_t>(__mid2Lo) << 32) | __b00Lo;

  *__productHi = __pHi;
  return __pLo;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __ryu_shiftright128(const uint64_t __lo, const uint64_t __hi, const uint32_t __dist) {
  // We don't need to handle the case __dist >= 64 here (see above).
  _LIBCPP_ASSERT_INTERNAL(__dist < 64, "");
#ifdef _LIBCPP_64_BIT
  _LIBCPP_ASSERT_INTERNAL(__dist > 0, "");
  return (__hi << (64 - __dist)) | (__lo >> __dist);
#else // ^^^ 64-bit ^^^ / vvv 32-bit vvv
  // Avoid a 64-bit shift by taking advantage of the range of shift values.
  _LIBCPP_ASSERT_INTERNAL(__dist >= 32, "");
  return (__hi << (64 - __dist)) | (static_cast<uint32_t>(__lo >> 32) >> (__dist - 32));
#endif // ^^^ 32-bit ^^^
}

#endif // ^^^ intrinsics unavailable ^^^

#ifndef _LIBCPP_64_BIT

// Returns the high 64 bits of the 128-bit product of __a and __b.
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __umulh(const uint64_t __a, const uint64_t __b) {
  // Reuse the __ryu_umul128 implementation.
  // Optimizers will likely eliminate the instructions used to compute the
  // low part of the product.
  uint64_t __hi;
  (void) __ryu_umul128(__a, __b, &__hi);
  return __hi;
}

// On 32-bit platforms, compilers typically generate calls to library
// functions for 64-bit divisions, even if the divisor is a constant.
//
// TRANSITION, LLVM-37932
//
// The functions here perform division-by-constant using multiplications
// in the same way as 64-bit compilers would do.
//
// NB:
// The multipliers and shift values are the ones generated by clang x64
// for expressions like x/5, x/10, etc.
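//
// For example, 0xCCCCCCCCCCCCCCCD == ceil(2^66 / 5), and for every uint64_t __x
//   __umulh(__x, 0xCCCCCCCCCCCCCCCDu) >> 2 == floor(__x * ceil(2^66 / 5) / 2^66) == __x / 5;
// shifting by 3 instead of 2 divides by 10. __div100 and __div1e9 first shift
// out the powers of two of their divisors (100 == 4 * 25, 10^9 == 512 * 1953125)
// before applying the same multiply-and-shift scheme.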

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div5(const uint64_t __x) {
  return __umulh(__x, 0xCCCCCCCCCCCCCCCDu) >> 2;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div10(const uint64_t __x) {
  return __umulh(__x, 0xCCCCCCCCCCCCCCCDu) >> 3;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div100(const uint64_t __x) {
  return __umulh(__x >> 2, 0x28F5C28F5C28F5C3u) >> 2;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div1e8(const uint64_t __x) {
  return __umulh(__x, 0xABCC77118461CEFDu) >> 26;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div1e9(const uint64_t __x) {
  return __umulh(__x >> 9, 0x44B82FA09B5A53u) >> 11;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint32_t __mod1e9(const uint64_t __x) {
  // Avoid 64-bit math as much as possible.
  // Returning static_cast<uint32_t>(__x - 1000000000 * __div1e9(__x)) would
  // perform a 32x64-bit multiplication and a 64-bit subtraction.
  // __x and 1000000000 * __div1e9(__x) are guaranteed to differ by
  // less than 10^9, so the difference fits in 32 bits and the subtraction
  // can be done modulo 2^32; we can therefore truncate both operands to
  // uint32_t before subtracting.
  // We can also simplify static_cast<uint32_t>(1000000000 * __div1e9(__x)):
  // we can truncate before multiplying instead of after, as multiplying
  // the highest 32 bits of __div1e9(__x) can't affect the lowest 32 bits.
  return static_cast<uint32_t>(__x) - 1000000000 * static_cast<uint32_t>(__div1e9(__x));
}

#else // ^^^ 32-bit ^^^ / vvv 64-bit vvv

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div5(const uint64_t __x) {
  return __x / 5;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div10(const uint64_t __x) {
  return __x / 10;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div100(const uint64_t __x) {
  return __x / 100;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div1e8(const uint64_t __x) {
  return __x / 100000000;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div1e9(const uint64_t __x) {
  return __x / 1000000000;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint32_t __mod1e9(const uint64_t __x) {
  return static_cast<uint32_t>(__x - 1000000000 * __div1e9(__x));
}

#endif // ^^^ 64-bit ^^^
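
// Returns the number of times __value is evenly divisible by 5
// (i.e. the exponent of 5 in its factorization). __value must not be 0.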
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint32_t __pow5Factor(uint64_t __value) {
  uint32_t __count = 0;
  for (;;) {
    _LIBCPP_ASSERT_INTERNAL(__value != 0, "");
    const uint64_t __q = __div5(__value);
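    // __value - 5 * __q is in [0, 4], so truncating both operands to 32 bits
    // before subtracting still yields the exact remainder and keeps the
    // subtraction in 32-bit arithmetic.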
    const uint32_t __r = static_cast<uint32_t>(__value) - 5 * static_cast<uint32_t>(__q);
    if (__r != 0) {
      break;
    }
    __value = __q;
    ++__count;
  }
  return __count;
}

// Returns true if __value is divisible by 5^__p.
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline bool __multipleOfPowerOf5(const uint64_t __value, const uint32_t __p) {
  // I tried a case distinction on __p, but there was no performance difference.
  return __pow5Factor(__value) >= __p;
}

// Returns true if __value is divisible by 2^__p.
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline bool __multipleOfPowerOf2(const uint64_t __value, const uint32_t __p) {
  _LIBCPP_ASSERT_INTERNAL(__value != 0, "");
  _LIBCPP_ASSERT_INTERNAL(__p < 64, "");
  // __builtin_ctzll doesn't appear to be faster here.
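  // __value is a multiple of 2^__p if and only if its low __p bits are all
  // zero; the mask (1ull << __p) - 1 selects exactly those bits.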
  return (__value & ((1ull << __p) - 1)) == 0;
}

_LIBCPP_END_NAMESPACE_STD

// clang-format on

#endif // _LIBCPP_SRC_INCLUDE_RYU_DS2_INTRINSICS_H