| 1 | #ifndef BLAKE3_IMPL_H |
| 2 | #define BLAKE3_IMPL_H |
| 3 | |
| 4 | #include <assert.h> |
| 5 | #include <stdbool.h> |
| 6 | #include <stddef.h> |
| 7 | #include <stdint.h> |
| 8 | #include <string.h> |
| 9 | |
| 10 | #include "llvm-c/blake3.h" |
| 11 | // For \p LLVM_LIBRARY_VISIBILITY |
| 12 | #include "llvm/Support/Compiler.h" |
| 13 | |
| 14 | #include "llvm_blake3_prefix.h" |
| 15 | |
| 16 | #define BLAKE3_PRIVATE |
| 17 | |
// Internal flags. Every constant occupies a distinct bit, so several flags
// can be combined with bitwise OR and still fit in a single byte.
enum blake3_flags {
  CHUNK_START = 0x01,
  CHUNK_END = 0x02,
  PARENT = 0x04,
  ROOT = 0x08,
  KEYED_HASH = 0x10,
  DERIVE_KEY_CONTEXT = 0x20,
  DERIVE_KEY_MATERIAL = 0x40,
};
| 28 | |
// This C implementation tries to support recent versions of GCC, Clang, and
// MSVC.
//
// INLINE marks a helper as file-local (static) and asks the compiler to
// inline it unconditionally: the MSVC keyword is used when _MSC_VER is
// defined, the GNU-style always_inline attribute otherwise.
#if defined(_MSC_VER)
#define INLINE static __forceinline
#else
#define INLINE static inline __attribute__((always_inline))
#endif
| 36 | |
// Target architecture detection. Later sections of this header test IS_X86,
// IS_X86_64, IS_X86_32 and IS_AARCH64 instead of raw compiler macros.

// 64-bit x86. The !defined(_M_ARM64EC) guard keeps ARM64EC builds out of the
// x86-64 path; they are classified as AArch64 below.
#if (defined(__x86_64__) || defined(_M_X64)) && !defined(_M_ARM64EC)
#define IS_X86
#define IS_X86_64
#endif

// 32-bit x86.
#if defined(__i386__) || defined(_M_IX86)
#define IS_X86
#define IS_X86_32
#endif

// 64-bit Arm, including ARM64EC.
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
#define IS_AARCH64
#endif

// Intrinsic headers are only pulled in for x86 targets; <intrin.h> is
// additionally included under MSVC.
#if defined(IS_X86)
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#include <immintrin.h>
#endif
| 57 | |
// BLAKE3_USE_NEON is always defined after this block. A value supplied by the
// build is left untouched; otherwise it becomes 1 on little-endian AArch64
// and 0 everywhere else (big-endian AArch64 and all non-AArch64 targets).
#if !defined(BLAKE3_USE_NEON)
  // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
  #if defined(IS_AARCH64)
    #if defined(__ARM_BIG_ENDIAN)
      #define BLAKE3_USE_NEON 0
    #else
      #define BLAKE3_USE_NEON 1
    #endif
  #else
    #define BLAKE3_USE_NEON 0
  #endif
#endif
| 70 | |
// Compile-time upper bound on the SIMD degree: 16 on x86, 4 when NEON is
// enabled, and 1 for every other configuration.
#if defined(IS_X86)
#define MAX_SIMD_DEGREE 16
#elif BLAKE3_USE_NEON == 1
#define MAX_SIMD_DEGREE 4
#else
#define MAX_SIMD_DEGREE 1
#endif

// There are some places where we want a static size that's equal to the
// MAX_SIMD_DEGREE, but also at least 2.
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
| 82 | |
// The eight 32-bit initialization words.
static const uint32_t IV[8] = {
    0x6a09e667u,
    0xbb67ae85u,
    0x3c6ef372u,
    0xa54ff53au,
    0x510e527fu,
    0x9b05688cu,
    0x1f83d9abu,
    0x5be0cd19u,
};
| 86 | |
// Seven rows of sixteen indices. Every row is a permutation of 0..15, and
// row 0 is the identity order.
// NOTE(review): presumably row r gives the message-word order used in round r
// of the compression function — confirm against the code that indexes this.
static const uint8_t MSG_SCHEDULE[7][16] = {
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
    {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
    {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
    {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
    {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
    {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
    {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
};
| 96 | |
/* Return the zero-based position of the most significant set bit of x. */
/* The caller must pass a nonzero x. */
static unsigned int highest_one(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
  /* For nonzero x the count is in 0..63, so XOR with 63 equals 63 - count. */
  const unsigned int leading_zeros = (unsigned int)__builtin_clzll(x);
  return 63 ^ leading_zeros;
#elif defined(_MSC_VER) && defined(IS_X86_64)
  unsigned long bit;
  _BitScanReverse64(&bit, x);
  return bit;
#elif defined(_MSC_VER) && defined(IS_X86_32)
  /* Only the 32-bit scan is used here: try the upper half first and fall
     back to the lower half when the upper half is empty. */
  const unsigned long upper = (unsigned long)(x >> 32);
  unsigned long bit;
  if (upper != 0) {
    _BitScanReverse(&bit, upper);
    return 32 + bit;
  }
  _BitScanReverse(&bit, (unsigned long)x);
  return bit;
#else
  /* Portable binary search: halve the window (32, 16, 8, 4, 2, 1) and move
     into the upper part whenever it still holds a set bit. */
  unsigned int position = 0;
  for (unsigned int width = 32; width != 0; width >>= 1) {
    if ((x >> width) != 0) {
      x >>= width;
      position += width;
    }
  }
  return position;
#endif
}
| 127 | |
| 128 | // Count the number of 1 bits. |
| 129 | INLINE unsigned int popcnt(uint64_t x) { |
| 130 | #if defined(__GNUC__) || defined(__clang__) |
| 131 | return (unsigned int)__builtin_popcountll(x); |
| 132 | #else |
| 133 | unsigned int count = 0; |
| 134 | while (x != 0) { |
| 135 | count += 1; |
| 136 | x &= x - 1; |
| 137 | } |
| 138 | return count; |
| 139 | #endif |
| 140 | } |
| 141 | |
| 142 | // Largest power of two less than or equal to x. As a special case, returns 1 |
| 143 | // when x is 0. |
| 144 | INLINE uint64_t round_down_to_power_of_2(uint64_t x) { |
| 145 | return 1ULL << highest_one(x: x | 1); |
| 146 | } |
| 147 | |
| 148 | INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } |
| 149 | |
| 150 | INLINE uint32_t counter_high(uint64_t counter) { |
| 151 | return (uint32_t)(counter >> 32); |
| 152 | } |
| 153 | |
| 154 | INLINE uint32_t load32(const void *src) { |
| 155 | const uint8_t *p = (const uint8_t *)src; |
| 156 | return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | |
| 157 | ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); |
| 158 | } |
| 159 | |
| 160 | INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], |
| 161 | uint32_t key_words[8]) { |
| 162 | key_words[0] = load32(src: &key[0 * 4]); |
| 163 | key_words[1] = load32(src: &key[1 * 4]); |
| 164 | key_words[2] = load32(src: &key[2 * 4]); |
| 165 | key_words[3] = load32(src: &key[3 * 4]); |
| 166 | key_words[4] = load32(src: &key[4 * 4]); |
| 167 | key_words[5] = load32(src: &key[5 * 4]); |
| 168 | key_words[6] = load32(src: &key[6 * 4]); |
| 169 | key_words[7] = load32(src: &key[7 * 4]); |
| 170 | } |
| 171 | |
| 172 | INLINE void load_block_words(const uint8_t block[BLAKE3_BLOCK_LEN], |
| 173 | uint32_t block_words[16]) { |
| 174 | for (size_t i = 0; i < 16; i++) { |
| 175 | block_words[i] = load32(src: &block[i * 4]); |
| 176 | } |
| 177 | } |
| 178 | |
| 179 | INLINE void store32(void *dst, uint32_t w) { |
| 180 | uint8_t *p = (uint8_t *)dst; |
| 181 | p[0] = (uint8_t)(w >> 0); |
| 182 | p[1] = (uint8_t)(w >> 8); |
| 183 | p[2] = (uint8_t)(w >> 16); |
| 184 | p[3] = (uint8_t)(w >> 24); |
| 185 | } |
| 186 | |
| 187 | INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { |
| 188 | store32(dst: &bytes_out[0 * 4], w: cv_words[0]); |
| 189 | store32(dst: &bytes_out[1 * 4], w: cv_words[1]); |
| 190 | store32(dst: &bytes_out[2 * 4], w: cv_words[2]); |
| 191 | store32(dst: &bytes_out[3 * 4], w: cv_words[3]); |
| 192 | store32(dst: &bytes_out[4 * 4], w: cv_words[4]); |
| 193 | store32(dst: &bytes_out[5 * 4], w: cv_words[5]); |
| 194 | store32(dst: &bytes_out[6 * 4], w: cv_words[6]); |
| 195 | store32(dst: &bytes_out[7 * 4], w: cv_words[7]); |
| 196 | } |
| 197 | |
// Generic entry points. Only the declarations are visible in this header.
// NOTE(review): presumably each one forwards to one of the
// implementation-specific variants declared further down — confirm against
// the definitions.

// Takes cv as a non-const array, unlike blake3_compress_xof below.
// NOTE(review): presumably the result is written back into cv — confirm.
LLVM_LIBRARY_VISIBILITY
void blake3_compress_in_place(uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags);

// Same inputs as above but cv is const and a separate 64-byte `out` buffer is
// supplied.
LLVM_LIBRARY_VISIBILITY
void blake3_compress_xof(const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags,
                         uint8_t out[64]);

// NOTE(review): `out` is spelled out[64], yet the function also takes an
// `outblocks` count, and the AVX-512 variant below declares the same
// parameter as a plain pointer. An array bound on a parameter is not
// enforced; presumably the buffer must hold 64 * outblocks bytes — confirm.
LLVM_LIBRARY_VISIBILITY
void blake3_xof_many(const uint32_t cv[8],
                     const uint8_t block[BLAKE3_BLOCK_LEN],
                     uint8_t block_len, uint64_t counter, uint8_t flags,
                     uint8_t out[64], size_t outblocks);

// NOTE(review): presumably processes `num_inputs` inputs of `blocks` blocks
// each and writes the results consecutively to `out` — confirm.
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out);

// NOTE(review): presumably a run-time value bounded by MAX_SIMD_DEGREE —
// confirm.
LLVM_LIBRARY_VISIBILITY
size_t blake3_simd_degree(void);
| 224 | |
// Declaration only; the definition is not part of this header.
// NOTE(review): presumably the size_t result is the number of chaining values
// written to `out` — confirm against the definition.
BLAKE3_PRIVATE size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len,
                                                   const uint32_t key[8],
                                                   uint64_t chunk_counter, uint8_t flags,
                                                   uint8_t *out, bool use_tbb);
| 229 | |
// Only declared when the build defines BLAKE3_USE_TBB. The parameters are the
// shared settings followed by one (input, length, chunk counter, output,
// count) group for each of the left and right halves.
// NOTE(review): NOEXCEPT is not defined anywhere in the visible part of this
// header; it must come from another header or the build, otherwise this
// declaration will not compile when BLAKE3_USE_TBB is set — verify.
#if defined(BLAKE3_USE_TBB)
BLAKE3_PRIVATE void blake3_compress_subtree_wide_join_tbb(
    // shared params
    const uint32_t key[8], uint8_t flags, bool use_tbb,
    // left-hand side params
    const uint8_t *l_input, size_t l_input_len, uint64_t l_chunk_counter,
    uint8_t *l_cvs, size_t *l_n,
    // right-hand side params
    const uint8_t *r_input, size_t r_input_len, uint64_t r_chunk_counter,
    uint8_t *r_cvs, size_t *r_n) NOEXCEPT;
#endif
| 241 | |
// Declarations for implementation-specific functions.

// Portable variants. They are declared unconditionally and take the same
// parameter lists as the generic blake3_compress_in_place,
// blake3_compress_xof and blake3_hash_many declared earlier in this header.
LLVM_LIBRARY_VISIBILITY
void blake3_compress_in_place_portable(uint32_t cv[8],
                                       const uint8_t block[BLAKE3_BLOCK_LEN],
                                       uint8_t block_len, uint64_t counter,
                                       uint8_t flags);

LLVM_LIBRARY_VISIBILITY
void blake3_compress_xof_portable(const uint32_t cv[8],
                                  const uint8_t block[BLAKE3_BLOCK_LEN],
                                  uint8_t block_len, uint64_t counter,
                                  uint8_t flags, uint8_t out[64]);

LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
                               size_t blocks, const uint32_t key[8],
                               uint64_t counter, bool increment_counter,
                               uint8_t flags, uint8_t flags_start,
                               uint8_t flags_end, uint8_t *out);
| 261 | |
// x86-only variants. Each instruction-set family can be removed from the
// build by defining the matching BLAKE3_NO_* macro.
#if defined(IS_X86)
// SSE2: compress_in_place, compress_xof and hash_many.
#if !defined(BLAKE3_NO_SSE2)
LLVM_LIBRARY_VISIBILITY
void blake3_compress_in_place_sse2(uint32_t cv[8],
                                   const uint8_t block[BLAKE3_BLOCK_LEN],
                                   uint8_t block_len, uint64_t counter,
                                   uint8_t flags);
LLVM_LIBRARY_VISIBILITY
void blake3_compress_xof_sse2(const uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags, uint8_t out[64]);
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out);
#endif
// SSE4.1: compress_in_place, compress_xof and hash_many.
#if !defined(BLAKE3_NO_SSE41)
LLVM_LIBRARY_VISIBILITY
void blake3_compress_in_place_sse41(uint32_t cv[8],
                                    const uint8_t block[BLAKE3_BLOCK_LEN],
                                    uint8_t block_len, uint64_t counter,
                                    uint8_t flags);
LLVM_LIBRARY_VISIBILITY
void blake3_compress_xof_sse41(const uint32_t cv[8],
                               const uint8_t block[BLAKE3_BLOCK_LEN],
                               uint8_t block_len, uint64_t counter,
                               uint8_t flags, uint8_t out[64]);
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
                            size_t blocks, const uint32_t key[8],
                            uint64_t counter, bool increment_counter,
                            uint8_t flags, uint8_t flags_start,
                            uint8_t flags_end, uint8_t *out);
#endif
// AVX2: only hash_many is declared for this family.
#if !defined(BLAKE3_NO_AVX2)
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out);
#endif
// AVX-512: compress_in_place, compress_xof, hash_many, plus xof_many on
// non-Windows targets.
#if !defined(BLAKE3_NO_AVX512)
LLVM_LIBRARY_VISIBILITY
void blake3_compress_in_place_avx512(uint32_t cv[8],
                                     const uint8_t block[BLAKE3_BLOCK_LEN],
                                     uint8_t block_len, uint64_t counter,
                                     uint8_t flags);

LLVM_LIBRARY_VISIBILITY
void blake3_compress_xof_avx512(const uint32_t cv[8],
                                const uint8_t block[BLAKE3_BLOCK_LEN],
                                uint8_t block_len, uint64_t counter,
                                uint8_t flags, uint8_t out[64]);

LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
                             size_t blocks, const uint32_t key[8],
                             uint64_t counter, bool increment_counter,
                             uint8_t flags, uint8_t flags_start,
                             uint8_t flags_end, uint8_t *out);

// The xof_many variant is not declared when _WIN32 or __CYGWIN__ is defined.
// Here `out` is a plain pointer rather than an out[64] array parameter.
#if !defined(_WIN32) && !defined(__CYGWIN__)
LLVM_LIBRARY_VISIBILITY
void blake3_xof_many_avx512(const uint32_t cv[8],
                            const uint8_t block[BLAKE3_BLOCK_LEN],
                            uint8_t block_len, uint64_t counter, uint8_t flags,
                            uint8_t* out, size_t outblocks);
#endif
#endif
#endif
| 336 | |
// NEON variant of hash_many, declared only when BLAKE3_USE_NEON is 1. No
// NEON-specific compress functions are declared in this header.
#if BLAKE3_USE_NEON == 1
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out);
#endif
| 345 | |
| 346 | |
| 347 | #endif /* BLAKE3_IMPL_H */ |
| 348 | |