| 1 | // Optimizations for random number functions, x86 version -*- C++ -*- |
| 2 | |
| 3 | // Copyright (C) 2012-2024 Free Software Foundation, Inc. |
| 4 | // |
| 5 | // This file is part of the GNU ISO C++ Library. This library is free |
| 6 | // software; you can redistribute it and/or modify it under the |
| 7 | // terms of the GNU General Public License as published by the |
| 8 | // Free Software Foundation; either version 3, or (at your option) |
| 9 | // any later version. |
| 10 | |
| 11 | // This library is distributed in the hope that it will be useful, |
| 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 14 | // GNU General Public License for more details. |
| 15 | |
| 16 | // Under Section 7 of GPL version 3, you are granted additional |
| 17 | // permissions described in the GCC Runtime Library Exception, version |
| 18 | // 3.1, as published by the Free Software Foundation. |
| 19 | |
| 20 | // You should have received a copy of the GNU General Public License and |
| 21 | // a copy of the GCC Runtime Library Exception along with this program; |
| 22 | // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
| 23 | // <http://www.gnu.org/licenses/>. |
| 24 | |
| 25 | /** @file bits/opt_random.h |
| 26 | * This is an internal header file, included by other library headers. |
| 27 | * Do not attempt to use it directly. @headername{random} |
| 28 | */ |
| 29 | |
| 30 | #ifndef _BITS_OPT_RANDOM_H |
| 31 | #define _BITS_OPT_RANDOM_H 1 |
| 32 | |
| 33 | #ifdef __SSE3__ |
| 34 | #include <pmmintrin.h> |
| 35 | #endif |
| 36 | |
| 37 | |
| 38 | #pragma GCC system_header |
| 39 | |
| 40 | |
| 41 | namespace std _GLIBCXX_VISIBILITY(default) |
| 42 | { |
| 43 | _GLIBCXX_BEGIN_NAMESPACE_VERSION |
| 44 | |
| 45 | #ifdef __SSE3__ |
| 46 | template<> |
| 47 | template<typename _UniformRandomNumberGenerator> |
| 48 | void |
| 49 | normal_distribution<double>:: |
| 50 | __generate(typename normal_distribution<double>::result_type* __f, |
| 51 | typename normal_distribution<double>::result_type* __t, |
| 52 | _UniformRandomNumberGenerator& __urng, |
| 53 | const param_type& __param) |
| 54 | { |
| 55 | typedef uint64_t __uctype; |
| 56 | |
| 57 | if (__f == __t) |
| 58 | return; |
| 59 | |
| 60 | if (_M_saved_available) |
| 61 | { |
| 62 | _M_saved_available = false; |
| 63 | *__f++ = _M_saved * __param.stddev() + __param.mean(); |
| 64 | |
| 65 | if (__f == __t) |
| 66 | return; |
| 67 | } |
| 68 | |
| 69 | constexpr uint64_t __maskval = 0xfffffffffffffull; |
| 70 | static const __m128i __mask = _mm_set1_epi64x(__maskval); |
| 71 | static const __m128i __two = _mm_set1_epi64x(0x4000000000000000ull); |
| 72 | static const __m128d __three = _mm_set1_pd(3.0); |
| 73 | const __m128d __av = _mm_set1_pd(__param.mean()); |
| 74 | |
| 75 | const __uctype __urngmin = __urng.min(); |
| 76 | const __uctype __urngmax = __urng.max(); |
| 77 | const __uctype __urngrange = __urngmax - __urngmin; |
| 78 | const __uctype __uerngrange = __urngrange + 1; |
| 79 | |
| 80 | while (__f + 1 < __t) |
| 81 | { |
| 82 | double __le; |
| 83 | __m128d __x; |
| 84 | do |
| 85 | { |
| 86 | union |
| 87 | { |
| 88 | __m128i __i; |
| 89 | __m128d __d; |
| 90 | } __v; |
| 91 | |
| 92 | if (__urngrange > __maskval) |
| 93 | { |
| 94 | if (__detail::_Power_of_2(__uerngrange)) |
| 95 | __v.__i = _mm_and_si128(_mm_set_epi64x(__urng(), |
| 96 | __urng()), |
| 97 | __mask); |
| 98 | else |
| 99 | { |
| 100 | const __uctype __uerange = __maskval + 1; |
| 101 | const __uctype __scaling = __urngrange / __uerange; |
| 102 | const __uctype __past = __uerange * __scaling; |
| 103 | uint64_t __v1; |
| 104 | do |
| 105 | __v1 = __uctype(__urng()) - __urngmin; |
| 106 | while (__v1 >= __past); |
| 107 | __v1 /= __scaling; |
| 108 | uint64_t __v2; |
| 109 | do |
| 110 | __v2 = __uctype(__urng()) - __urngmin; |
| 111 | while (__v2 >= __past); |
| 112 | __v2 /= __scaling; |
| 113 | |
| 114 | __v.__i = _mm_set_epi64x(__v1, __v2); |
| 115 | } |
| 116 | } |
| 117 | else if (__urngrange == __maskval) |
| 118 | __v.__i = _mm_set_epi64x(__urng(), __urng()); |
| 119 | else if ((__urngrange + 2) * __urngrange >= __maskval |
| 120 | && __detail::_Power_of_2(__uerngrange)) |
| 121 | { |
| 122 | uint64_t __v1 = __urng() * __uerngrange + __urng(); |
| 123 | uint64_t __v2 = __urng() * __uerngrange + __urng(); |
| 124 | |
| 125 | __v.__i = _mm_and_si128(_mm_set_epi64x(__v1, __v2), |
| 126 | __mask); |
| 127 | } |
| 128 | else |
| 129 | { |
| 130 | size_t __nrng = 2; |
| 131 | __uctype __high = __maskval / __uerngrange / __uerngrange; |
| 132 | while (__high > __uerngrange) |
| 133 | { |
| 134 | ++__nrng; |
| 135 | __high /= __uerngrange; |
| 136 | } |
| 137 | const __uctype __highrange = __high + 1; |
| 138 | const __uctype __scaling = __urngrange / __highrange; |
| 139 | const __uctype __past = __highrange * __scaling; |
| 140 | __uctype __tmp; |
| 141 | |
| 142 | uint64_t __v1; |
| 143 | do |
| 144 | { |
| 145 | do |
| 146 | __tmp = __uctype(__urng()) - __urngmin; |
| 147 | while (__tmp >= __past); |
| 148 | __v1 = __tmp / __scaling; |
| 149 | for (size_t __cnt = 0; __cnt < __nrng; ++__cnt) |
| 150 | { |
| 151 | __tmp = __v1; |
| 152 | __v1 *= __uerngrange; |
| 153 | __v1 += __uctype(__urng()) - __urngmin; |
| 154 | } |
| 155 | } |
| 156 | while (__v1 > __maskval || __v1 < __tmp); |
| 157 | |
| 158 | uint64_t __v2; |
| 159 | do |
| 160 | { |
| 161 | do |
| 162 | __tmp = __uctype(__urng()) - __urngmin; |
| 163 | while (__tmp >= __past); |
| 164 | __v2 = __tmp / __scaling; |
| 165 | for (size_t __cnt = 0; __cnt < __nrng; ++__cnt) |
| 166 | { |
| 167 | __tmp = __v2; |
| 168 | __v2 *= __uerngrange; |
| 169 | __v2 += __uctype(__urng()) - __urngmin; |
| 170 | } |
| 171 | } |
| 172 | while (__v2 > __maskval || __v2 < __tmp); |
| 173 | |
| 174 | __v.__i = _mm_set_epi64x(__v1, __v2); |
| 175 | } |
| 176 | |
| 177 | __v.__i = _mm_or_si128(__v.__i, __two); |
| 178 | __x = _mm_sub_pd(__v.__d, __three); |
| 179 | __m128d __m = _mm_mul_pd(__x, __x); |
| 180 | __le = _mm_cvtsd_f64(_mm_hadd_pd (__m, __m)); |
| 181 | } |
| 182 | while (__le == 0.0 || __le >= 1.0); |
| 183 | |
| 184 | double __mult = (std::sqrt(-2.0 * std::log(__le) / __le) |
| 185 | * __param.stddev()); |
| 186 | |
| 187 | __x = _mm_add_pd(_mm_mul_pd(__x, _mm_set1_pd(__mult)), __av); |
| 188 | |
| 189 | _mm_storeu_pd(__f, __x); |
| 190 | __f += 2; |
| 191 | } |
| 192 | |
| 193 | if (__f != __t) |
| 194 | { |
| 195 | result_type __x, __y, __r2; |
| 196 | |
| 197 | __detail::_Adaptor<_UniformRandomNumberGenerator, result_type> |
| 198 | __aurng(__urng); |
| 199 | |
| 200 | do |
| 201 | { |
| 202 | __x = result_type(2.0) * __aurng() - 1.0; |
| 203 | __y = result_type(2.0) * __aurng() - 1.0; |
| 204 | __r2 = __x * __x + __y * __y; |
| 205 | } |
| 206 | while (__r2 > 1.0 || __r2 == 0.0); |
| 207 | |
| 208 | const result_type __mult = std::sqrt(-2 * std::log(__r2) / __r2); |
| 209 | _M_saved = __x * __mult; |
| 210 | _M_saved_available = true; |
| 211 | *__f = __y * __mult * __param.stddev() + __param.mean(); |
| 212 | } |
| 213 | } |
| 214 | #endif |
| 215 | |
| 216 | |
| 217 | _GLIBCXX_END_NAMESPACE_VERSION |
| 218 | } // namespace |
| 219 | |
| 220 | |
| 221 | #endif // _BITS_OPT_RANDOM_H |
| 222 | |