| 1 | #ifndef BLAKE3_IMPL_H |
| 2 | #define BLAKE3_IMPL_H |
| 3 | |
| 4 | #include <assert.h> |
| 5 | #include <stdbool.h> |
| 6 | #include <stddef.h> |
| 7 | #include <stdint.h> |
| 8 | #include <string.h> |
| 9 | |
| 10 | #include "llvm-c/blake3.h" |
| 11 | // For \p LLVM_LIBRARY_VISIBILITY |
| 12 | #include "llvm/Support/Compiler.h" |
| 13 | |
| 14 | #include "llvm_blake3_prefix.h" |
| 15 | |
// Internal flags.
//
// Each constant occupies its own bit (bits 0 through 6), so any combination
// can be OR-ed together and the result still fits in 8 bits.
enum blake3_flags {
  CHUNK_START = 1 << 0,
  CHUNK_END = 1 << 1,
  PARENT = 1 << 2,
  ROOT = 1 << 3,
  KEYED_HASH = 1 << 4,
  DERIVE_KEY_CONTEXT = 1 << 5,
  DERIVE_KEY_MATERIAL = 1 << 6,
};
| 26 | |
// This C implementation tries to support recent versions of GCC, Clang, and
// MSVC.
//
// INLINE expands to `static` plus the compiler-specific "always inline"
// spelling: __forceinline under MSVC, and `inline` with the always_inline
// attribute for every other compiler.
#if defined(_MSC_VER)
#define INLINE static __forceinline
#else
#define INLINE static inline __attribute__((always_inline))
#endif
| 34 | |
// Target architecture detection. Both the GCC/Clang-style (__x86_64__,
// __i386__, __aarch64__) and the MSVC-style (_M_X64, _M_IX86, _M_ARM64)
// predefined macros are checked.
//
//   IS_X86     - defined for either 32-bit or 64-bit x86
//   IS_X86_64  - defined for 64-bit x86 only
//   IS_X86_32  - defined for 32-bit x86 only
//   IS_AARCH64 - defined for 64-bit ARM
#if defined(__x86_64__) || defined(_M_X64)
#define IS_X86
#define IS_X86_64
#endif

#if defined(__i386__) || defined(_M_IX86)
#define IS_X86
#define IS_X86_32
#endif

#if defined(__aarch64__) || defined(_M_ARM64)
#define IS_AARCH64
#endif

// On x86, pull in the intrinsics headers; MSVC additionally gets <intrin.h>.
#if defined(IS_X86)
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#include <immintrin.h>
#endif
| 55 | |
// BLAKE3_USE_NEON may be set by the build (any pre-existing definition is
// left untouched). If it is not manually set, autodetect based on AArch64ness
// and endianness: it becomes 1 only on little-endian AArch64, and 0
// everywhere else.
#if !defined(BLAKE3_USE_NEON)
#if defined(IS_AARCH64) && !defined(__ARM_BIG_ENDIAN)
#define BLAKE3_USE_NEON 1
#else
#define BLAKE3_USE_NEON 0
#endif
#endif
| 65 | |
// MAX_SIMD_DEGREE is a compile-time constant chosen per target: 16 on x86,
// 4 when NEON is enabled, and 1 otherwise.
#if defined(IS_X86)
#define MAX_SIMD_DEGREE 16
#elif BLAKE3_USE_NEON == 1
#define MAX_SIMD_DEGREE 4
#else
#define MAX_SIMD_DEGREE 1
#endif

// There are some places where we want a static size that's equal to the
// MAX_SIMD_DEGREE, but also at least 2.
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
| 77 | |
// Eight 32-bit initialization words (the BLAKE3 IV; numerically the same
// constants as the SHA-256 initial hash values). Declared `static const`, so
// every translation unit including this header gets its own private copy.
static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL,
                               0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL,
                               0x1F83D9ABUL, 0x5BE0CD19UL};
| 81 | |
// Message word ordering table: 7 rows of 16 indices, each row a permutation
// of 0..15. Row 0 is the identity, and every later row is the previous row
// re-indexed through row 1, i.e.
//   MSG_SCHEDULE[r][i] == MSG_SCHEDULE[r - 1][MSG_SCHEDULE[1][i]].
// NOTE(review): presumably one row per compression round, as in the BLAKE3
// specification — the consumers are not visible from this header.
static const uint8_t MSG_SCHEDULE[7][16] = {
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
    {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
    {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
    {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
    {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
    {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
    {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
};
| 91 | |
/*
 * Find the index of the highest set bit of x (0 for the least significant
 * bit, 63 for the most significant).
 *
 * x is assumed to be nonzero. The GCC/Clang builtin used below has an
 * undefined result for 0, so callers must not rely on any particular value
 * in that case.
 */
static unsigned int highest_one(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
  /* The leading-zero count is in 0..63, where 63 ^ n equals 63 - n. */
  return (unsigned int)(63 ^ __builtin_clzll(x));
#elif defined(_MSC_VER) && defined(IS_X86_64)
  unsigned long index;
  _BitScanReverse64(&index, x);
  return (unsigned int)index;
#elif defined(_MSC_VER) && defined(IS_X86_32)
  /* No 64-bit scan on 32-bit x86: scan the high half first, then the low. */
  if (x >> 32) {
    unsigned long index;
    _BitScanReverse(&index, (unsigned long)(x >> 32));
    return (unsigned int)(32 + index);
  } else {
    unsigned long index;
    _BitScanReverse(&index, (unsigned long)x);
    return (unsigned int)index;
  }
#else
  /* Portable fallback: binary search, halving the window at every step. */
  unsigned int c = 0;
  if (x & 0xffffffff00000000ULL) {
    x >>= 32;
    c += 32;
  }
  if (x & 0x00000000ffff0000ULL) {
    x >>= 16;
    c += 16;
  }
  if (x & 0x000000000000ff00ULL) {
    x >>= 8;
    c += 8;
  }
  if (x & 0x00000000000000f0ULL) {
    x >>= 4;
    c += 4;
  }
  if (x & 0x000000000000000cULL) {
    x >>= 2;
    c += 2;
  }
  if (x & 0x0000000000000002ULL) {
    c += 1;
  }
  return c;
#endif
}
| 122 | |
| 123 | // Count the number of 1 bits. |
| 124 | INLINE unsigned int popcnt(uint64_t x) { |
| 125 | #if defined(__GNUC__) || defined(__clang__) |
| 126 | return __builtin_popcountll(x); |
| 127 | #else |
| 128 | unsigned int count = 0; |
| 129 | while (x != 0) { |
| 130 | count += 1; |
| 131 | x &= x - 1; |
| 132 | } |
| 133 | return count; |
| 134 | #endif |
| 135 | } |
| 136 | |
| 137 | // Largest power of two less than or equal to x. As a special case, returns 1 |
| 138 | // when x is 0. |
| 139 | INLINE uint64_t round_down_to_power_of_2(uint64_t x) { |
| 140 | return 1ULL << highest_one(x: x | 1); |
| 141 | } |
| 142 | |
| 143 | INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } |
| 144 | |
| 145 | INLINE uint32_t counter_high(uint64_t counter) { |
| 146 | return (uint32_t)(counter >> 32); |
| 147 | } |
| 148 | |
| 149 | INLINE uint32_t load32(const void *src) { |
| 150 | const uint8_t *p = (const uint8_t *)src; |
| 151 | return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | |
| 152 | ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); |
| 153 | } |
| 154 | |
| 155 | INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], |
| 156 | uint32_t key_words[8]) { |
| 157 | key_words[0] = load32(src: &key[0 * 4]); |
| 158 | key_words[1] = load32(src: &key[1 * 4]); |
| 159 | key_words[2] = load32(src: &key[2 * 4]); |
| 160 | key_words[3] = load32(src: &key[3 * 4]); |
| 161 | key_words[4] = load32(src: &key[4 * 4]); |
| 162 | key_words[5] = load32(src: &key[5 * 4]); |
| 163 | key_words[6] = load32(src: &key[6 * 4]); |
| 164 | key_words[7] = load32(src: &key[7 * 4]); |
| 165 | } |
| 166 | |
| 167 | INLINE void store32(void *dst, uint32_t w) { |
| 168 | uint8_t *p = (uint8_t *)dst; |
| 169 | p[0] = (uint8_t)(w >> 0); |
| 170 | p[1] = (uint8_t)(w >> 8); |
| 171 | p[2] = (uint8_t)(w >> 16); |
| 172 | p[3] = (uint8_t)(w >> 24); |
| 173 | } |
| 174 | |
| 175 | INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { |
| 176 | store32(dst: &bytes_out[0 * 4], w: cv_words[0]); |
| 177 | store32(dst: &bytes_out[1 * 4], w: cv_words[1]); |
| 178 | store32(dst: &bytes_out[2 * 4], w: cv_words[2]); |
| 179 | store32(dst: &bytes_out[3 * 4], w: cv_words[3]); |
| 180 | store32(dst: &bytes_out[4 * 4], w: cv_words[4]); |
| 181 | store32(dst: &bytes_out[5 * 4], w: cv_words[5]); |
| 182 | store32(dst: &bytes_out[6 * 4], w: cv_words[6]); |
| 183 | store32(dst: &bytes_out[7 * 4], w: cv_words[7]); |
| 184 | } |
| 185 | |
| 186 | LLVM_LIBRARY_VISIBILITY |
| 187 | void blake3_compress_in_place(uint32_t cv[8], |
| 188 | const uint8_t block[BLAKE3_BLOCK_LEN], |
| 189 | uint8_t block_len, uint64_t counter, |
| 190 | uint8_t flags); |
| 191 | |
| 192 | LLVM_LIBRARY_VISIBILITY |
| 193 | void blake3_compress_xof(const uint32_t cv[8], |
| 194 | const uint8_t block[BLAKE3_BLOCK_LEN], |
| 195 | uint8_t block_len, uint64_t counter, uint8_t flags, |
| 196 | uint8_t out[64]); |
| 197 | |
| 198 | LLVM_LIBRARY_VISIBILITY |
| 199 | void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, |
| 200 | size_t blocks, const uint32_t key[8], uint64_t counter, |
| 201 | bool increment_counter, uint8_t flags, |
| 202 | uint8_t flags_start, uint8_t flags_end, uint8_t *out); |
| 203 | |
| 204 | LLVM_LIBRARY_VISIBILITY |
| 205 | size_t blake3_simd_degree(void); |
| 206 | |
| 207 | |
| 208 | // Declarations for implementation-specific functions. |
| 209 | LLVM_LIBRARY_VISIBILITY |
| 210 | void blake3_compress_in_place_portable(uint32_t cv[8], |
| 211 | const uint8_t block[BLAKE3_BLOCK_LEN], |
| 212 | uint8_t block_len, uint64_t counter, |
| 213 | uint8_t flags); |
| 214 | |
| 215 | LLVM_LIBRARY_VISIBILITY |
| 216 | void blake3_compress_xof_portable(const uint32_t cv[8], |
| 217 | const uint8_t block[BLAKE3_BLOCK_LEN], |
| 218 | uint8_t block_len, uint64_t counter, |
| 219 | uint8_t flags, uint8_t out[64]); |
| 220 | |
| 221 | LLVM_LIBRARY_VISIBILITY |
| 222 | void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, |
| 223 | size_t blocks, const uint32_t key[8], |
| 224 | uint64_t counter, bool increment_counter, |
| 225 | uint8_t flags, uint8_t flags_start, |
| 226 | uint8_t flags_end, uint8_t *out); |
| 227 | |
// x86-only variants. The whole group is compiled out unless IS_X86 is
// defined, and each instruction-set family can be removed individually by
// defining BLAKE3_NO_SSE2, BLAKE3_NO_SSE41, BLAKE3_NO_AVX2 or
// BLAKE3_NO_AVX512. Parameter lists match the generic entry points.
#if defined(IS_X86)
#if !defined(BLAKE3_NO_SSE2)
// SSE2: compress_in_place, compress_xof and hash_many.
LLVM_LIBRARY_VISIBILITY
void blake3_compress_in_place_sse2(uint32_t cv[8],
                                   const uint8_t block[BLAKE3_BLOCK_LEN],
                                   uint8_t block_len, uint64_t counter,
                                   uint8_t flags);
LLVM_LIBRARY_VISIBILITY
void blake3_compress_xof_sse2(const uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags, uint8_t out[64]);
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_SSE41)
// SSE4.1: compress_in_place, compress_xof and hash_many.
LLVM_LIBRARY_VISIBILITY
void blake3_compress_in_place_sse41(uint32_t cv[8],
                                    const uint8_t block[BLAKE3_BLOCK_LEN],
                                    uint8_t block_len, uint64_t counter,
                                    uint8_t flags);
LLVM_LIBRARY_VISIBILITY
void blake3_compress_xof_sse41(const uint32_t cv[8],
                               const uint8_t block[BLAKE3_BLOCK_LEN],
                               uint8_t block_len, uint64_t counter,
                               uint8_t flags, uint8_t out[64]);
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
                            size_t blocks, const uint32_t key[8],
                            uint64_t counter, bool increment_counter,
                            uint8_t flags, uint8_t flags_start,
                            uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_AVX2)
// AVX2: only a hash_many variant is declared (no single-block compress).
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_AVX512)
// AVX-512: compress_in_place, compress_xof and hash_many.
LLVM_LIBRARY_VISIBILITY
void blake3_compress_in_place_avx512(uint32_t cv[8],
                                     const uint8_t block[BLAKE3_BLOCK_LEN],
                                     uint8_t block_len, uint64_t counter,
                                     uint8_t flags);

LLVM_LIBRARY_VISIBILITY
void blake3_compress_xof_avx512(const uint32_t cv[8],
                                const uint8_t block[BLAKE3_BLOCK_LEN],
                                uint8_t block_len, uint64_t counter,
                                uint8_t flags, uint8_t out[64]);

LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
                             size_t blocks, const uint32_t key[8],
                             uint64_t counter, bool increment_counter,
                             uint8_t flags, uint8_t flags_start,
                             uint8_t flags_end, uint8_t *out);
#endif
#endif
| 294 | |
// NEON variant, declared only when BLAKE3_USE_NEON is 1. Only a hash_many
// variant is declared (no single-block compress); its parameter list matches
// blake3_hash_many.
#if BLAKE3_USE_NEON == 1
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out);
#endif
| 303 | |
| 304 | |
| 305 | #endif /* BLAKE3_IMPL_H */ |
| 306 | |