| 1 | #ifndef BLAKE3_IMPL_H | 
|---|
| 2 | #define BLAKE3_IMPL_H | 
|---|
| 3 |  | 
|---|
| 4 | #include <assert.h> | 
|---|
| 5 | #include <stdbool.h> | 
|---|
| 6 | #include <stddef.h> | 
|---|
| 7 | #include <stdint.h> | 
|---|
| 8 | #include <string.h> | 
|---|
| 9 |  | 
|---|
| 10 | #include "llvm-c/blake3.h" | 
|---|
| 11 | // For \p LLVM_LIBRARY_VISIBILITY | 
|---|
| 12 | #include "llvm/Support/Compiler.h" | 
|---|
| 13 |  | 
|---|
| 14 | #include "llvm_blake3_prefix.h" | 
|---|
| 15 |  | 
|---|
// Internal flags. Every enumerator is a distinct single bit (bits 0..6), so
// several of them can be combined with bitwise OR.
enum blake3_flags {
  CHUNK_START         = 1 << 0,
  CHUNK_END           = 1 << 1,
  PARENT              = 1 << 2,
  ROOT                = 1 << 3,
  KEYED_HASH          = 1 << 4,
  DERIVE_KEY_CONTEXT  = 1 << 5,
  DERIVE_KEY_MATERIAL = 1 << 6,
};

// This C implementation tries to support recent versions of GCC, Clang, and
// MSVC.
//
// INLINE expands to the storage class and forced-inlining spelling for the
// current compiler: __forceinline on MSVC, the always_inline attribute
// everywhere else. Both variants make the function `static`.
#if defined(_MSC_VER)
#define INLINE static __forceinline
#else
#define INLINE static inline __attribute__((always_inline))
#endif

// Target-architecture detection from compiler-predefined macros.
// IS_X86 is defined for both 32-bit and 64-bit x86; IS_X86_64 and IS_X86_32
// tell the two apart.
#if defined(__x86_64__) || defined(_M_X64)
#define IS_X86
#define IS_X86_64
#endif

#if defined(__i386__) || defined(_M_IX86)
#define IS_X86
#define IS_X86_32
#endif

#if defined(__aarch64__) || defined(_M_ARM64)
#define IS_AARCH64
#endif

// On x86 the intrinsics header is included; MSVC additionally includes
// <intrin.h>.
#if defined(IS_X86)
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#include <immintrin.h>
#endif

// BLAKE3_USE_NEON may be set by the build. When it is not, it defaults to 1
// on little-endian AArch64 (IS_AARCH64 defined and __ARM_BIG_ENDIAN not
// defined) and to 0 everywhere else.
#if !defined(BLAKE3_USE_NEON)
  // If BLAKE3_USE_NEON not manually set, autodetect based on
  // AArch64ness and endianness.
  #if defined(IS_AARCH64) && !defined(__ARM_BIG_ENDIAN)
    #define BLAKE3_USE_NEON 1
  #else
    #define BLAKE3_USE_NEON 0
  #endif
#endif

// Compile-time upper bound on the SIMD degree: 16 on x86, 4 when NEON is
// enabled, 1 otherwise.
#if defined(IS_X86)
#define MAX_SIMD_DEGREE 16
#elif BLAKE3_USE_NEON == 1
#define MAX_SIMD_DEGREE 4
#else
#define MAX_SIMD_DEGREE 1
#endif

// There are some places where we want a static size that's equal to the
// MAX_SIMD_DEGREE, but also at least 2.
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)

// Eight constant 32-bit words.
// NOTE(review): presumably the BLAKE3 initialization vector (the name
// suggests it) -- confirm against the specification.
static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL,
                               0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL,
                               0x1F83D9ABUL, 0x5BE0CD19UL};

// Seven rows of sixteen indices. Row 0 is the identity order and every row is
// a permutation of 0..15.
// NOTE(review): presumably one row per compression round, selecting the
// message-word order -- confirm against the compression function.
static const uint8_t MSG_SCHEDULE[7][16] = {
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
    {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
    {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
    {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
    {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
    {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
    {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
};

/* Find index of the highest set bit */
/* x is assumed to be nonzero.       */
/* Returns a value in 0..63. One of four implementations is selected at   */
/* compile time: a GCC/Clang builtin, MSVC bit-scan intrinsics (64-bit or */
/* 32-bit x86), or a portable binary search.                              */
static unsigned int highest_one(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
  /* For a clz result in 0..63, XOR with 63 equals 63 - clz. */
  return 63 ^ __builtin_clzll(x);
#elif defined(_MSC_VER) && defined(IS_X86_64)
  unsigned long index;
  _BitScanReverse64(&index, x);
  return index;
#elif defined(_MSC_VER) && defined(IS_X86_32)
  /* Only the 32-bit scan is used here: look at the upper half first. */
  if(x >> 32) {
    unsigned long index;
    _BitScanReverse(&index, (unsigned long)(x >> 32));
    return 32 + index;
  } else {
    unsigned long index;
    _BitScanReverse(&index, (unsigned long)x);
    return index;
  }
#else
  /* Binary search: halve the candidate window at each step. */
  unsigned int c = 0;
  if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; }
  if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; }
  if(x & 0x000000000000ff00ULL) { x >>=  8; c +=  8; }
  if(x & 0x00000000000000f0ULL) { x >>=  4; c +=  4; }
  if(x & 0x000000000000000cULL) { x >>=  2; c +=  2; }
  if(x & 0x0000000000000002ULL) {           c +=  1; }
  return c;
#endif
}

| 123 | // Count the number of 1 bits. | 
|---|
| 124 | INLINE unsigned int popcnt(uint64_t x) { | 
|---|
| 125 | #if defined(__GNUC__) || defined(__clang__) | 
|---|
| 126 | return __builtin_popcountll(x); | 
|---|
| 127 | #else | 
|---|
| 128 | unsigned int count = 0; | 
|---|
| 129 | while (x != 0) { | 
|---|
| 130 | count += 1; | 
|---|
| 131 | x &= x - 1; | 
|---|
| 132 | } | 
|---|
| 133 | return count; | 
|---|
| 134 | #endif | 
|---|
| 135 | } | 
|---|
| 136 |  | 
|---|
| 137 | // Largest power of two less than or equal to x. As a special case, returns 1 | 
|---|
| 138 | // when x is 0. | 
|---|
| 139 | INLINE uint64_t round_down_to_power_of_2(uint64_t x) { | 
|---|
| 140 | return 1ULL << highest_one(x: x | 1); | 
|---|
| 141 | } | 
|---|
| 142 |  | 
|---|
| 143 | INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } | 
|---|
| 144 |  | 
|---|
| 145 | INLINE uint32_t counter_high(uint64_t counter) { | 
|---|
| 146 | return (uint32_t)(counter >> 32); | 
|---|
| 147 | } | 
|---|
| 148 |  | 
|---|
| 149 | INLINE uint32_t load32(const void *src) { | 
|---|
| 150 | const uint8_t *p = (const uint8_t *)src; | 
|---|
| 151 | return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | | 
|---|
| 152 | ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); | 
|---|
| 153 | } | 
|---|
| 154 |  | 
|---|
| 155 | INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], | 
|---|
| 156 | uint32_t key_words[8]) { | 
|---|
| 157 | key_words[0] = load32(src: &key[0 * 4]); | 
|---|
| 158 | key_words[1] = load32(src: &key[1 * 4]); | 
|---|
| 159 | key_words[2] = load32(src: &key[2 * 4]); | 
|---|
| 160 | key_words[3] = load32(src: &key[3 * 4]); | 
|---|
| 161 | key_words[4] = load32(src: &key[4 * 4]); | 
|---|
| 162 | key_words[5] = load32(src: &key[5 * 4]); | 
|---|
| 163 | key_words[6] = load32(src: &key[6 * 4]); | 
|---|
| 164 | key_words[7] = load32(src: &key[7 * 4]); | 
|---|
| 165 | } | 
|---|
| 166 |  | 
|---|
| 167 | INLINE void store32(void *dst, uint32_t w) { | 
|---|
| 168 | uint8_t *p = (uint8_t *)dst; | 
|---|
| 169 | p[0] = (uint8_t)(w >> 0); | 
|---|
| 170 | p[1] = (uint8_t)(w >> 8); | 
|---|
| 171 | p[2] = (uint8_t)(w >> 16); | 
|---|
| 172 | p[3] = (uint8_t)(w >> 24); | 
|---|
| 173 | } | 
|---|
| 174 |  | 
|---|
| 175 | INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { | 
|---|
| 176 | store32(dst: &bytes_out[0 * 4], w: cv_words[0]); | 
|---|
| 177 | store32(dst: &bytes_out[1 * 4], w: cv_words[1]); | 
|---|
| 178 | store32(dst: &bytes_out[2 * 4], w: cv_words[2]); | 
|---|
| 179 | store32(dst: &bytes_out[3 * 4], w: cv_words[3]); | 
|---|
| 180 | store32(dst: &bytes_out[4 * 4], w: cv_words[4]); | 
|---|
| 181 | store32(dst: &bytes_out[5 * 4], w: cv_words[5]); | 
|---|
| 182 | store32(dst: &bytes_out[6 * 4], w: cv_words[6]); | 
|---|
| 183 | store32(dst: &bytes_out[7 * 4], w: cv_words[7]); | 
|---|
| 184 | } | 
|---|
| 185 |  | 
|---|
| 186 | LLVM_LIBRARY_VISIBILITY | 
|---|
| 187 | void blake3_compress_in_place(uint32_t cv[8], | 
|---|
| 188 | const uint8_t block[BLAKE3_BLOCK_LEN], | 
|---|
| 189 | uint8_t block_len, uint64_t counter, | 
|---|
| 190 | uint8_t flags); | 
|---|
| 191 |  | 
|---|
| 192 | LLVM_LIBRARY_VISIBILITY | 
|---|
| 193 | void blake3_compress_xof(const uint32_t cv[8], | 
|---|
| 194 | const uint8_t block[BLAKE3_BLOCK_LEN], | 
|---|
| 195 | uint8_t block_len, uint64_t counter, uint8_t flags, | 
|---|
| 196 | uint8_t out[64]); | 
|---|
| 197 |  | 
|---|
| 198 | LLVM_LIBRARY_VISIBILITY | 
|---|
| 199 | void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, | 
|---|
| 200 | size_t blocks, const uint32_t key[8], uint64_t counter, | 
|---|
| 201 | bool increment_counter, uint8_t flags, | 
|---|
| 202 | uint8_t flags_start, uint8_t flags_end, uint8_t *out); | 
|---|
| 203 |  | 
|---|
| 204 | LLVM_LIBRARY_VISIBILITY | 
|---|
| 205 | size_t blake3_simd_degree(void); | 
|---|
| 206 |  | 
|---|
| 207 |  | 
|---|
| 208 | // Declarations for implementation-specific functions. | 
|---|
| 209 | LLVM_LIBRARY_VISIBILITY | 
|---|
| 210 | void blake3_compress_in_place_portable(uint32_t cv[8], | 
|---|
| 211 | const uint8_t block[BLAKE3_BLOCK_LEN], | 
|---|
| 212 | uint8_t block_len, uint64_t counter, | 
|---|
| 213 | uint8_t flags); | 
|---|
| 214 |  | 
|---|
| 215 | LLVM_LIBRARY_VISIBILITY | 
|---|
| 216 | void blake3_compress_xof_portable(const uint32_t cv[8], | 
|---|
| 217 | const uint8_t block[BLAKE3_BLOCK_LEN], | 
|---|
| 218 | uint8_t block_len, uint64_t counter, | 
|---|
| 219 | uint8_t flags, uint8_t out[64]); | 
|---|
| 220 |  | 
|---|
| 221 | LLVM_LIBRARY_VISIBILITY | 
|---|
| 222 | void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, | 
|---|
| 223 | size_t blocks, const uint32_t key[8], | 
|---|
| 224 | uint64_t counter, bool increment_counter, | 
|---|
| 225 | uint8_t flags, uint8_t flags_start, | 
|---|
| 226 | uint8_t flags_end, uint8_t *out); | 
|---|
| 227 |  | 
|---|
// x86-only declarations. Each instruction-set family is declared unless the
// matching BLAKE3_NO_SSE2 / BLAKE3_NO_SSE41 / BLAKE3_NO_AVX2 /
// BLAKE3_NO_AVX512 macro is defined. AVX2 declares only a hash_many variant;
// the other families also declare compress_in_place and compress_xof.
#if defined(IS_X86)
#if !defined(BLAKE3_NO_SSE2)
LLVM_LIBRARY_VISIBILITY
void blake3_compress_in_place_sse2(uint32_t cv[8],
                                   const uint8_t block[BLAKE3_BLOCK_LEN],
                                   uint8_t block_len, uint64_t counter,
                                   uint8_t flags);
LLVM_LIBRARY_VISIBILITY
void blake3_compress_xof_sse2(const uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags, uint8_t out[64]);
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_SSE41)
LLVM_LIBRARY_VISIBILITY
void blake3_compress_in_place_sse41(uint32_t cv[8],
                                    const uint8_t block[BLAKE3_BLOCK_LEN],
                                    uint8_t block_len, uint64_t counter,
                                    uint8_t flags);
LLVM_LIBRARY_VISIBILITY
void blake3_compress_xof_sse41(const uint32_t cv[8],
                               const uint8_t block[BLAKE3_BLOCK_LEN],
                               uint8_t block_len, uint64_t counter,
                               uint8_t flags, uint8_t out[64]);
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
                            size_t blocks, const uint32_t key[8],
                            uint64_t counter, bool increment_counter,
                            uint8_t flags, uint8_t flags_start,
                            uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_AVX2)
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_AVX512)
LLVM_LIBRARY_VISIBILITY
void blake3_compress_in_place_avx512(uint32_t cv[8],
                                     const uint8_t block[BLAKE3_BLOCK_LEN],
                                     uint8_t block_len, uint64_t counter,
                                     uint8_t flags);

LLVM_LIBRARY_VISIBILITY
void blake3_compress_xof_avx512(const uint32_t cv[8],
                                const uint8_t block[BLAKE3_BLOCK_LEN],
                                uint8_t block_len, uint64_t counter,
                                uint8_t flags, uint8_t out[64]);

LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
                             size_t blocks, const uint32_t key[8],
                             uint64_t counter, bool increment_counter,
                             uint8_t flags, uint8_t flags_start,
                             uint8_t flags_end, uint8_t *out);
#endif
#endif

// NEON variant of hash_many, declared only when BLAKE3_USE_NEON is 1. The
// definition is not in this header.
#if BLAKE3_USE_NEON == 1
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out);
#endif


| 305 | #endif /* BLAKE3_IMPL_H */ | 
|---|
| 306 |  | 
|---|