1 | #ifndef BLAKE3_IMPL_H |
2 | #define BLAKE3_IMPL_H |
3 | |
4 | #include <assert.h> |
5 | #include <stdbool.h> |
6 | #include <stddef.h> |
7 | #include <stdint.h> |
8 | #include <string.h> |
9 | |
10 | #include "llvm-c/blake3.h" |
11 | // For \p LLVM_LIBRARY_VISIBILITY |
12 | #include "llvm/Support/Compiler.h" |
13 | |
14 | #include "llvm_blake3_prefix.h" |
15 | |
// Internal flags.
//
// Every enumerator is a distinct single bit, so values can be combined with
// bitwise OR. The largest one is 1 << 6, which means any combination fits in
// a uint8_t (the type of the `flags` parameters declared later in this
// header).
enum blake3_flags {
  CHUNK_START         = 1 << 0,
  CHUNK_END           = 1 << 1,
  PARENT              = 1 << 2,
  ROOT                = 1 << 3,
  KEYED_HASH          = 1 << 4,
  DERIVE_KEY_CONTEXT  = 1 << 5,
  DERIVE_KEY_MATERIAL = 1 << 6,
};
26 | |
// This C implementation tries to support recent versions of GCC, Clang, and
// MSVC.
//
// INLINE marks a helper as file-local (`static`) and force-inlined, using the
// spelling the compiler accepts: __forceinline on MSVC, and the GNU
// always_inline attribute on everything else.
#if defined(_MSC_VER)
#define INLINE static __forceinline
#else
#define INLINE static inline __attribute__((always_inline))
#endif
34 | |
// Target-architecture detection. For each architecture both the GCC/Clang
// predefined macro and the MSVC one are checked.

// 64-bit x86: defines IS_X86 and IS_X86_64.
#if defined(__x86_64__) || defined(_M_X64)
#define IS_X86
#define IS_X86_64
#endif

// 32-bit x86: defines IS_X86 and IS_X86_32.
#if defined(__i386__) || defined(_M_IX86)
#define IS_X86
#define IS_X86_32
#endif

// 64-bit ARM: defines IS_AARCH64.
#if defined(__aarch64__) || defined(_M_ARM64)
#define IS_AARCH64
#endif
48 | |
49 | #if defined(IS_X86) |
50 | #if defined(_MSC_VER) |
51 | #include <intrin.h> |
52 | #endif |
53 | #include <immintrin.h> |
54 | #endif |
55 | |
// BLAKE3_USE_NEON may be set by the build. When it is not, it is autodetected:
// 1 on little-endian AArch64 (IS_AARCH64 defined and __ARM_BIG_ENDIAN not
// defined), 0 everywhere else.
#if !defined(BLAKE3_USE_NEON)
#if defined(IS_AARCH64) && !defined(__ARM_BIG_ENDIAN)
#define BLAKE3_USE_NEON 1
#else
#define BLAKE3_USE_NEON 0
#endif
#endif
65 | |
// MAX_SIMD_DEGREE is fixed at compile time: 16 on x86, 4 when NEON is enabled,
// and 1 otherwise.
#if defined(IS_X86)
#define MAX_SIMD_DEGREE 16
#elif BLAKE3_USE_NEON == 1
#define MAX_SIMD_DEGREE 4
#else
#define MAX_SIMD_DEGREE 1
#endif

// There are some places where we want a static size that's equal to the
// MAX_SIMD_DEGREE, but also at least 2.
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
77 | |
// Table of eight 32-bit constants named IV. Being `static const`, each
// translation unit that includes this header gets its own private copy.
static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL,
                               0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL,
                               0x1F83D9ABUL, 0x5BE0CD19UL};
81 | |
// Seven rows of sixteen indices. Every row is a permutation of 0..15, and
// row 0 is the identity permutation.
static const uint8_t MSG_SCHEDULE[7][16] = {
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
    {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
    {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
    {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
    {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
    {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
    {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
};
91 | |
/*
 * Find the zero-based index of the highest set bit of x
 * (0 for x == 1, 63 for x == 1ULL << 63).
 *
 * x is assumed to be nonzero: the compiler intrinsics used below leave their
 * result undefined for a zero argument. (The portable fallback returns 0 in
 * that case, but callers must not rely on it.)
 */
static unsigned int highest_one(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
  /* The leading-zero count is in 0..63, so XOR with 63 equals 63 - count. */
  return 63 ^ (unsigned int)__builtin_clzll(x);
#elif defined(_MSC_VER) && defined(IS_X86_64)
  unsigned long index;
  _BitScanReverse64(&index, x);
  return (unsigned int)index;
#elif defined(_MSC_VER) && defined(IS_X86_32)
  /* Only the 32-bit scan is used here: look at the upper half first and fall
   * back to the lower half when the upper half is empty. */
  unsigned long index;
  if (x >> 32) {
    _BitScanReverse(&index, (unsigned long)(x >> 32));
    return 32 + (unsigned int)index;
  }
  _BitScanReverse(&index, (unsigned long)x);
  return (unsigned int)index;
#else
  /* Binary search: at each step, if the upper half of the remaining window
   * holds a set bit, shift it down and add the half-width to the index. */
  unsigned int c = 0;
  if (x & 0xffffffff00000000ULL) { x >>= 32; c += 32; }
  if (x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; }
  if (x & 0x000000000000ff00ULL) { x >>= 8;  c += 8; }
  if (x & 0x00000000000000f0ULL) { x >>= 4;  c += 4; }
  if (x & 0x000000000000000cULL) { x >>= 2;  c += 2; }
  if (x & 0x0000000000000002ULL) {           c += 1; }
  return c;
#endif
}
122 | |
123 | // Count the number of 1 bits. |
124 | INLINE unsigned int popcnt(uint64_t x) { |
125 | #if defined(__GNUC__) || defined(__clang__) |
126 | return __builtin_popcountll(x); |
127 | #else |
128 | unsigned int count = 0; |
129 | while (x != 0) { |
130 | count += 1; |
131 | x &= x - 1; |
132 | } |
133 | return count; |
134 | #endif |
135 | } |
136 | |
137 | // Largest power of two less than or equal to x. As a special case, returns 1 |
138 | // when x is 0. |
139 | INLINE uint64_t round_down_to_power_of_2(uint64_t x) { |
140 | return 1ULL << highest_one(x: x | 1); |
141 | } |
142 | |
143 | INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } |
144 | |
145 | INLINE uint32_t counter_high(uint64_t counter) { |
146 | return (uint32_t)(counter >> 32); |
147 | } |
148 | |
149 | INLINE uint32_t load32(const void *src) { |
150 | const uint8_t *p = (const uint8_t *)src; |
151 | return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | |
152 | ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); |
153 | } |
154 | |
155 | INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], |
156 | uint32_t key_words[8]) { |
157 | key_words[0] = load32(src: &key[0 * 4]); |
158 | key_words[1] = load32(src: &key[1 * 4]); |
159 | key_words[2] = load32(src: &key[2 * 4]); |
160 | key_words[3] = load32(src: &key[3 * 4]); |
161 | key_words[4] = load32(src: &key[4 * 4]); |
162 | key_words[5] = load32(src: &key[5 * 4]); |
163 | key_words[6] = load32(src: &key[6 * 4]); |
164 | key_words[7] = load32(src: &key[7 * 4]); |
165 | } |
166 | |
167 | INLINE void store32(void *dst, uint32_t w) { |
168 | uint8_t *p = (uint8_t *)dst; |
169 | p[0] = (uint8_t)(w >> 0); |
170 | p[1] = (uint8_t)(w >> 8); |
171 | p[2] = (uint8_t)(w >> 16); |
172 | p[3] = (uint8_t)(w >> 24); |
173 | } |
174 | |
175 | INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { |
176 | store32(dst: &bytes_out[0 * 4], w: cv_words[0]); |
177 | store32(dst: &bytes_out[1 * 4], w: cv_words[1]); |
178 | store32(dst: &bytes_out[2 * 4], w: cv_words[2]); |
179 | store32(dst: &bytes_out[3 * 4], w: cv_words[3]); |
180 | store32(dst: &bytes_out[4 * 4], w: cv_words[4]); |
181 | store32(dst: &bytes_out[5 * 4], w: cv_words[5]); |
182 | store32(dst: &bytes_out[6 * 4], w: cv_words[6]); |
183 | store32(dst: &bytes_out[7 * 4], w: cv_words[7]); |
184 | } |
185 | |
186 | LLVM_LIBRARY_VISIBILITY |
187 | void blake3_compress_in_place(uint32_t cv[8], |
188 | const uint8_t block[BLAKE3_BLOCK_LEN], |
189 | uint8_t block_len, uint64_t counter, |
190 | uint8_t flags); |
191 | |
192 | LLVM_LIBRARY_VISIBILITY |
193 | void blake3_compress_xof(const uint32_t cv[8], |
194 | const uint8_t block[BLAKE3_BLOCK_LEN], |
195 | uint8_t block_len, uint64_t counter, uint8_t flags, |
196 | uint8_t out[64]); |
197 | |
198 | LLVM_LIBRARY_VISIBILITY |
199 | void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, |
200 | size_t blocks, const uint32_t key[8], uint64_t counter, |
201 | bool increment_counter, uint8_t flags, |
202 | uint8_t flags_start, uint8_t flags_end, uint8_t *out); |
203 | |
204 | LLVM_LIBRARY_VISIBILITY |
205 | size_t blake3_simd_degree(void); |
206 | |
207 | |
208 | // Declarations for implementation-specific functions. |
209 | LLVM_LIBRARY_VISIBILITY |
210 | void blake3_compress_in_place_portable(uint32_t cv[8], |
211 | const uint8_t block[BLAKE3_BLOCK_LEN], |
212 | uint8_t block_len, uint64_t counter, |
213 | uint8_t flags); |
214 | |
215 | LLVM_LIBRARY_VISIBILITY |
216 | void blake3_compress_xof_portable(const uint32_t cv[8], |
217 | const uint8_t block[BLAKE3_BLOCK_LEN], |
218 | uint8_t block_len, uint64_t counter, |
219 | uint8_t flags, uint8_t out[64]); |
220 | |
221 | LLVM_LIBRARY_VISIBILITY |
222 | void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, |
223 | size_t blocks, const uint32_t key[8], |
224 | uint64_t counter, bool increment_counter, |
225 | uint8_t flags, uint8_t flags_start, |
226 | uint8_t flags_end, uint8_t *out); |
227 | |
// x86 variants. The whole group is declared only when IS_X86 is defined, and
// each instruction-set family can be dropped individually by defining the
// matching BLAKE3_NO_* macro. SSE2, SSE4.1 and AVX-512 each declare compress
// in-place, compress XOF and hash-many; AVX2 declares hash-many only.
#if defined(IS_X86)
#if !defined(BLAKE3_NO_SSE2)
LLVM_LIBRARY_VISIBILITY
void blake3_compress_in_place_sse2(uint32_t cv[8],
                                   const uint8_t block[BLAKE3_BLOCK_LEN],
                                   uint8_t block_len, uint64_t counter,
                                   uint8_t flags);
LLVM_LIBRARY_VISIBILITY
void blake3_compress_xof_sse2(const uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags, uint8_t out[64]);
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_SSE41)
LLVM_LIBRARY_VISIBILITY
void blake3_compress_in_place_sse41(uint32_t cv[8],
                                    const uint8_t block[BLAKE3_BLOCK_LEN],
                                    uint8_t block_len, uint64_t counter,
                                    uint8_t flags);
LLVM_LIBRARY_VISIBILITY
void blake3_compress_xof_sse41(const uint32_t cv[8],
                               const uint8_t block[BLAKE3_BLOCK_LEN],
                               uint8_t block_len, uint64_t counter,
                               uint8_t flags, uint8_t out[64]);
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
                            size_t blocks, const uint32_t key[8],
                            uint64_t counter, bool increment_counter,
                            uint8_t flags, uint8_t flags_start,
                            uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_AVX2)
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_AVX512)
LLVM_LIBRARY_VISIBILITY
void blake3_compress_in_place_avx512(uint32_t cv[8],
                                     const uint8_t block[BLAKE3_BLOCK_LEN],
                                     uint8_t block_len, uint64_t counter,
                                     uint8_t flags);

LLVM_LIBRARY_VISIBILITY
void blake3_compress_xof_avx512(const uint32_t cv[8],
                                const uint8_t block[BLAKE3_BLOCK_LEN],
                                uint8_t block_len, uint64_t counter,
                                uint8_t flags, uint8_t out[64]);

LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
                             size_t blocks, const uint32_t key[8],
                             uint64_t counter, bool increment_counter,
                             uint8_t flags, uint8_t flags_start,
                             uint8_t flags_end, uint8_t *out);
#endif
#endif
294 | |
// NEON variant, declared only when BLAKE3_USE_NEON is 1. Only hash-many has a
// NEON declaration in this header.
#if BLAKE3_USE_NEON == 1
LLVM_LIBRARY_VISIBILITY
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out);
#endif
303 | |
304 | |
305 | #endif /* BLAKE3_IMPL_H */ |
306 | |