| 1 | #include <stdbool.h> |
| 2 | #include <stddef.h> |
| 3 | #include <stdint.h> |
| 4 | |
| 5 | #include "blake3_impl.h" |
| 6 | |
| 7 | #if defined(_MSC_VER) |
| 8 | #include <Windows.h> |
| 9 | #endif |
| 10 | |
| 11 | #if defined(IS_X86) |
| 12 | #if defined(_MSC_VER) |
| 13 | #include <intrin.h> |
| 14 | #elif defined(__GNUC__) |
| 15 | #include <immintrin.h> |
| 16 | #else |
| 17 | #undef IS_X86 /* Unimplemented! */ |
| 18 | #endif |
| 19 | #endif |
| 20 | |
/*
 * Atomics configuration.
 *
 * BLAKE3_ATOMICS may be predefined by the build. Otherwise it is set to 1
 * when the compiler offers __has_include, <stdatomic.h> is available and the
 * compiler is not MSVC, and to 0 in every other case.
 */
#if !defined(BLAKE3_ATOMICS)
#if defined(__has_include)
#if __has_include(<stdatomic.h>) && !defined(_MSC_VER)
#define BLAKE3_ATOMICS 1
#else
#define BLAKE3_ATOMICS 0
#endif /* __has_include(<stdatomic.h>) && !defined(_MSC_VER) */
#else
#define BLAKE3_ATOMICS 0
#endif /* defined(__has_include) */
#endif /* BLAKE3_ATOMICS */

/*
 * ATOMIC_INT         integer type used for values shared through the macros.
 * ATOMIC_LOAD(x)     reads x.
 * ATOMIC_STORE(x, y) writes y into x.
 *
 * Arguments and expansions are fully parenthesized so the macros can be used
 * inside larger expressions without precedence surprises. Treat ATOMIC_STORE
 * as a statement: the Interlocked variant and the assignment variants do not
 * produce the same expression value (InterlockedExchange yields the previous
 * contents, an assignment yields the stored value).
 */
#if BLAKE3_ATOMICS
/* C11 _Atomic object: plain reads and assignments are atomic accesses. */
#define ATOMIC_INT _Atomic int
#define ATOMIC_LOAD(x) (x)
#define ATOMIC_STORE(x, y) ((x) = (y))
#elif defined(_MSC_VER)
/* MSVC: go through the Win32 Interlocked API. */
#define ATOMIC_INT LONG
#define ATOMIC_LOAD(x) InterlockedOr(&(x), 0)
#define ATOMIC_STORE(x, y) InterlockedExchange(&(x), (y))
#else
/* No atomics available: ordinary int accesses. */
#define ATOMIC_INT int
#define ATOMIC_LOAD(x) (x)
#define ATOMIC_STORE(x, y) ((x) = (y))
#endif

/* Evaluates x and discards the result; silences unused-variable warnings. */
#define MAYBE_UNUSED(x) ((void)(x))
| 48 | |
| 49 | #if defined(IS_X86) |
// Executes XGETBV with ECX = 0 and returns the 64-bit result (EDX:EAX).
// get_cpu_features() uses the value to check which register states the OS
// has enabled before reporting AVX / AVX-512 support.
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t lo = 0;
  uint32_t hi = 0;
  __asm__ __volatile__("xgetbv\n" : "=a"(lo), "=d"(hi) : "c"(0));
  const uint64_t upper = (uint64_t)hi << 32;
  return upper | lo;
#endif
}
| 59 | |
// Runs CPUID for leaf `id` and writes EAX, EBX, ECX, EDX into out[0..3],
// in that order.
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#else
  uint32_t a = 0, b = 0, c = 0, d = 0;
#if defined(__i386__) || defined(_M_IX86)
  // 32-bit x86: EBX is saved into a scratch register before CPUID and
  // swapped back afterwards, so the EBX result ends up in that scratch
  // register and EBX itself is left unchanged.
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(a), "=r"(b), "=c"(c), "=d"(d)
                       : "a"(id));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                       : "a"(id));
#endif
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
#endif
}
| 75 | |
// Runs CPUID for leaf `id`, sub-leaf `sid`, and writes EAX, EBX, ECX, EDX
// into out[0..3], in that order.
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#else
  uint32_t a = 0, b = 0, c = 0, d = 0;
#if defined(__i386__) || defined(_M_IX86)
  // 32-bit x86: EBX is saved into a scratch register before CPUID and
  // swapped back afterwards, so the EBX result ends up in that scratch
  // register and EBX itself is left unchanged.
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(a), "=r"(b), "=c"(c), "=d"(d)
                       : "a"(id), "c"(sid));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                       : "a"(id), "c"(sid));
#endif
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
#endif
}
| 91 | |
| 92 | #endif |
| 93 | |
// Bit flags describing the instruction-set extensions detected at runtime.
// Flags are OR-ed together into a single feature word.
enum cpu_feature {
  SSE2      = (1 << 0),
  SSSE3     = (1 << 1),
  SSE41     = (1 << 2),
  AVX       = (1 << 3),
  AVX2      = (1 << 4),
  AVX512F   = (1 << 5),
  AVX512VL  = (1 << 6),
  /* ... */
  // Sentinel: initial value of the feature cache, meaning "not detected yet".
  UNDEFINED = (1 << 30)
};
| 105 | |
| 106 | #if !defined(BLAKE3_TESTING) |
| 107 | static /* Allow the variable to be controlled manually for testing */ |
| 108 | #endif |
| 109 | ATOMIC_INT g_cpu_features = UNDEFINED; |
| 110 | |
| 111 | LLVM_ATTRIBUTE_USED |
| 112 | #if !defined(BLAKE3_TESTING) |
| 113 | static |
| 114 | #endif |
| 115 | enum cpu_feature |
| 116 | get_cpu_features(void) { |
| 117 | |
| 118 | /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */ |
| 119 | enum cpu_feature features = ATOMIC_LOAD(g_cpu_features); |
| 120 | if (features != UNDEFINED) { |
| 121 | return features; |
| 122 | } else { |
| 123 | #if defined(IS_X86) |
| 124 | uint32_t regs[4] = {0}; |
| 125 | uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3]; |
| 126 | (void)edx; |
| 127 | features = 0; |
| 128 | cpuid(out: regs, id: 0); |
| 129 | const int max_id = *eax; |
| 130 | cpuid(out: regs, id: 1); |
| 131 | #if defined(__amd64__) || defined(_M_X64) |
| 132 | features |= SSE2; |
| 133 | #else |
| 134 | if (*edx & (1UL << 26)) |
| 135 | features |= SSE2; |
| 136 | #endif |
| 137 | if (*ecx & (1UL << 9)) |
| 138 | features |= SSSE3; |
| 139 | if (*ecx & (1UL << 19)) |
| 140 | features |= SSE41; |
| 141 | |
| 142 | if (*ecx & (1UL << 27)) { // OSXSAVE |
| 143 | const uint64_t mask = xgetbv(); |
| 144 | if ((mask & 6) == 6) { // SSE and AVX states |
| 145 | if (*ecx & (1UL << 28)) |
| 146 | features |= AVX; |
| 147 | if (max_id >= 7) { |
| 148 | cpuidex(out: regs, id: 7, sid: 0); |
| 149 | if (*ebx & (1UL << 5)) |
| 150 | features |= AVX2; |
| 151 | if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm |
| 152 | if (*ebx & (1UL << 31)) |
| 153 | features |= AVX512VL; |
| 154 | if (*ebx & (1UL << 16)) |
| 155 | features |= AVX512F; |
| 156 | } |
| 157 | } |
| 158 | } |
| 159 | } |
| 160 | ATOMIC_STORE(g_cpu_features, features); |
| 161 | return features; |
| 162 | #else |
| 163 | /* How to detect NEON? */ |
| 164 | return 0; |
| 165 | #endif |
| 166 | } |
| 167 | } |
| 168 | |
| 169 | void blake3_compress_in_place(uint32_t cv[8], |
| 170 | const uint8_t block[BLAKE3_BLOCK_LEN], |
| 171 | uint8_t block_len, uint64_t counter, |
| 172 | uint8_t flags) { |
| 173 | #if defined(IS_X86) |
| 174 | const enum cpu_feature features = get_cpu_features(); |
| 175 | MAYBE_UNUSED(features); |
| 176 | #if !defined(BLAKE3_NO_AVX512) |
| 177 | if (features & AVX512VL) { |
| 178 | blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); |
| 179 | return; |
| 180 | } |
| 181 | #endif |
| 182 | #if !defined(BLAKE3_NO_SSE41) |
| 183 | if (features & SSE41) { |
| 184 | blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); |
| 185 | return; |
| 186 | } |
| 187 | #endif |
| 188 | #if !defined(BLAKE3_NO_SSE2) |
| 189 | if (features & SSE2) { |
| 190 | blake3_compress_in_place_sse2(cv, block, block_len, counter, flags); |
| 191 | return; |
| 192 | } |
| 193 | #endif |
| 194 | #endif |
| 195 | blake3_compress_in_place_portable(cv, block, block_len, counter, flags); |
| 196 | } |
| 197 | |
| 198 | void blake3_compress_xof(const uint32_t cv[8], |
| 199 | const uint8_t block[BLAKE3_BLOCK_LEN], |
| 200 | uint8_t block_len, uint64_t counter, uint8_t flags, |
| 201 | uint8_t out[64]) { |
| 202 | #if defined(IS_X86) |
| 203 | const enum cpu_feature features = get_cpu_features(); |
| 204 | MAYBE_UNUSED(features); |
| 205 | #if !defined(BLAKE3_NO_AVX512) |
| 206 | if (features & AVX512VL) { |
| 207 | blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); |
| 208 | return; |
| 209 | } |
| 210 | #endif |
| 211 | #if !defined(BLAKE3_NO_SSE41) |
| 212 | if (features & SSE41) { |
| 213 | blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); |
| 214 | return; |
| 215 | } |
| 216 | #endif |
| 217 | #if !defined(BLAKE3_NO_SSE2) |
| 218 | if (features & SSE2) { |
| 219 | blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out); |
| 220 | return; |
| 221 | } |
| 222 | #endif |
| 223 | #endif |
| 224 | blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); |
| 225 | } |
| 226 | |
| 227 | |
| 228 | void blake3_xof_many(const uint32_t cv[8], |
| 229 | const uint8_t block[BLAKE3_BLOCK_LEN], |
| 230 | uint8_t block_len, uint64_t counter, uint8_t flags, |
| 231 | uint8_t out[64], size_t outblocks) { |
| 232 | if (outblocks == 0) { |
| 233 | // The current assembly implementation always outputs at least 1 block. |
| 234 | return; |
| 235 | } |
| 236 | #if defined(IS_X86) |
| 237 | const enum cpu_feature features = get_cpu_features(); |
| 238 | MAYBE_UNUSED(features); |
| 239 | #if !defined(_WIN32) && !defined(__CYGWIN__) && !defined(BLAKE3_NO_AVX512) |
| 240 | if (features & AVX512VL) { |
| 241 | blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks); |
| 242 | return; |
| 243 | } |
| 244 | #endif |
| 245 | #endif |
| 246 | for(size_t i = 0; i < outblocks; ++i) { |
| 247 | blake3_compress_xof(cv, block, block_len, counter: counter + i, flags, out: out + 64*i); |
| 248 | } |
| 249 | } |
| 250 | |
// Runtime dispatcher for the multi-input hashing routine.
//
// On x86 the arguments are forwarded unchanged to the first implementation
// that is both compiled in (not disabled via BLAKE3_NO_*) and reported by
// get_cpu_features(), tried in the order AVX-512 (requires both AVX512F and
// AVX512VL), AVX2, SSE4.1, SSE2. If no x86 path is taken, the NEON
// implementation is used when BLAKE3_USE_NEON == 1, and the portable
// implementation otherwise.
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);
#if !defined(BLAKE3_NO_AVX512)
  const int avx512_needed = AVX512F | AVX512VL;
  if ((cpu & avx512_needed) == avx512_needed) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if ((cpu & AVX2) != 0) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#endif

#if BLAKE3_USE_NEON == 1
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
| 302 | |
// The dynamically detected SIMD degree of the current platform.
//
// On x86: 16 when both AVX512F and AVX512VL are reported, 8 for AVX2,
// 4 for SSE4.1 or SSE2 (each subject to the matching BLAKE3_NO_* switch).
// If no x86 path applies: 4 when BLAKE3_USE_NEON == 1, otherwise 1.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);
#if !defined(BLAKE3_NO_AVX512)
  const int avx512_needed = AVX512F | AVX512VL;
  if ((cpu & avx512_needed) == avx512_needed) {
    return 16;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if ((cpu & AVX2) != 0) {
    return 8;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    return 4;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    return 4;
  }
#endif
#endif

#if BLAKE3_USE_NEON == 1
  return 4;
#else
  return 1;
#endif
}
| 334 | |