/*===-- blake3.c - BLAKE3 C Implementation ------------------------*- C -*-===*\
|*                                                                            *|
|* Released into the public domain with CC0 1.0                              *|
|* See 'llvm/lib/Support/BLAKE3/LICENSE' for info.                            *|
|* SPDX-License-Identifier: CC0-1.0                                           *|
|*                                                                            *|
\*===----------------------------------------------------------------------===*/

#include <assert.h>
#include <stdbool.h>
#include <string.h>

#include "blake3_impl.h"

const char *llvm_blake3_version(void) { return BLAKE3_VERSION_STRING; }

INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8],
                             uint8_t flags) {
  memcpy(self->cv, key, BLAKE3_KEY_LEN);
  self->chunk_counter = 0;
  memset(self->buf, 0, BLAKE3_BLOCK_LEN);
  self->buf_len = 0;
  self->blocks_compressed = 0;
  self->flags = flags;
}

INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8],
                              uint64_t chunk_counter) {
  memcpy(self->cv, key, BLAKE3_KEY_LEN);
  self->chunk_counter = chunk_counter;
  self->blocks_compressed = 0;
  memset(self->buf, 0, BLAKE3_BLOCK_LEN);
  self->buf_len = 0;
}

INLINE size_t chunk_state_len(const blake3_chunk_state *self) {
  return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) +
         ((size_t)self->buf_len);
}

INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self,
                                   const uint8_t *input, size_t input_len) {
  size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len);
  if (take > input_len) {
    take = input_len;
  }
  uint8_t *dest = self->buf + ((size_t)self->buf_len);
  memcpy(dest, input, take);
  self->buf_len += (uint8_t)take;
  return take;
}

INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) {
  if (self->blocks_compressed == 0) {
    return CHUNK_START;
  } else {
    return 0;
  }
}

typedef struct {
  uint32_t input_cv[8];
  uint64_t counter;
  uint8_t block[BLAKE3_BLOCK_LEN];
  uint8_t block_len;
  uint8_t flags;
} output_t;

INLINE output_t make_output(const uint32_t input_cv[8],
                            const uint8_t block[BLAKE3_BLOCK_LEN],
                            uint8_t block_len, uint64_t counter,
                            uint8_t flags) {
  output_t ret;
  memcpy(ret.input_cv, input_cv, 32);
  memcpy(ret.block, block, BLAKE3_BLOCK_LEN);
  ret.block_len = block_len;
  ret.counter = counter;
  ret.flags = flags;
  return ret;
}

// Chaining values within a given chunk (specifically the compress_in_place
// interface) are represented as words. This avoids unnecessary bytes<->words
// conversion overhead in the portable implementation. However, the hash_many
// interface handles both user input and parent node blocks, so it accepts
// bytes. For that reason, chaining values in the CV stack are represented as
// bytes.
INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {
  uint32_t cv_words[8];
  memcpy(cv_words, self->input_cv, 32);
  blake3_compress_in_place(cv_words, self->block, self->block_len,
                           self->counter, self->flags);
  store_cv_words(cv, cv_words);
}

INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
                              size_t out_len) {
  uint64_t output_block_counter = seek / 64;
  size_t offset_within_block = seek % 64;
  uint8_t wide_buf[64];
  while (out_len > 0) {
    blake3_compress_xof(self->input_cv, self->block, self->block_len,
                        output_block_counter, self->flags | ROOT, wide_buf);
    size_t available_bytes = 64 - offset_within_block;
    size_t memcpy_len;
    if (out_len > available_bytes) {
      memcpy_len = available_bytes;
    } else {
      memcpy_len = out_len;
    }
    memcpy(out, wide_buf + offset_within_block, memcpy_len);
    out += memcpy_len;
    out_len -= memcpy_len;
    output_block_counter += 1;
    offset_within_block = 0;
  }
}

INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input,
                               size_t input_len) {
  if (self->buf_len > 0) {
    size_t take = chunk_state_fill_buf(self, input, input_len);
    input += take;
    input_len -= take;
    if (input_len > 0) {
      blake3_compress_in_place(
          self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter,
          self->flags | chunk_state_maybe_start_flag(self));
      self->blocks_compressed += 1;
      self->buf_len = 0;
      memset(self->buf, 0, BLAKE3_BLOCK_LEN);
    }
  }

  while (input_len > BLAKE3_BLOCK_LEN) {
    blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN,
                             self->chunk_counter,
                             self->flags | chunk_state_maybe_start_flag(self));
    self->blocks_compressed += 1;
    input += BLAKE3_BLOCK_LEN;
    input_len -= BLAKE3_BLOCK_LEN;
  }

  chunk_state_fill_buf(self, input, input_len);
}

INLINE output_t chunk_state_output(const blake3_chunk_state *self) {
  uint8_t block_flags =
      self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END;
  return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter,
                     block_flags);
}

INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN],
                              const uint32_t key[8], uint8_t flags) {
  return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT);
}

// Given some input larger than one chunk, return the number of bytes that
// should go in the left subtree. This is the largest power-of-2 number of
// chunks that leaves at least 1 byte for the right subtree.
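//
// For example, with BLAKE3_CHUNK_LEN = 1024: a 2049-byte input has
// full_chunks = (2049 - 1) / 1024 = 2, already a power of 2, so the left
// subtree gets 2048 bytes and the right subtree gets the remaining byte. A
// 5000-byte input has full_chunks = 4, so the left gets 4096 bytes and the
// right gets 904.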
INLINE size_t left_len(size_t content_len) {
  // Subtract 1 to reserve at least one byte for the right side. content_len
  // should always be greater than BLAKE3_CHUNK_LEN.
  size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
  return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN;
}

// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time
// on a single thread. Write out the chunk chaining values and return the
// number of chunks hashed. These chunks are never the root and never empty;
// those cases use a different codepath.
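//
// For illustration: given three and a half chunks of input, the loop below
// collects the 3 complete chunks, blake3_hash_many compresses them in
// parallel, the remaining half chunk is hashed through a temporary
// chunk_state, and the function returns 4.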
INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len,
                                       const uint32_t key[8],
                                       uint64_t chunk_counter, uint8_t flags,
                                       uint8_t *out) {
#if defined(BLAKE3_TESTING)
  assert(0 < input_len);
  assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN);
#endif

  const uint8_t *chunks_array[MAX_SIMD_DEGREE];
  size_t input_position = 0;
  size_t chunks_array_len = 0;
  while (input_len - input_position >= BLAKE3_CHUNK_LEN) {
    chunks_array[chunks_array_len] = &input[input_position];
    input_position += BLAKE3_CHUNK_LEN;
    chunks_array_len += 1;
  }

  blake3_hash_many(chunks_array, chunks_array_len,
                   BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter,
                   true, flags, CHUNK_START, CHUNK_END, out);

  // Hash the remaining partial chunk, if there is one. Note that the empty
  // chunk (meaning the empty message) is a different codepath.
  if (input_len > input_position) {
    uint64_t counter = chunk_counter + (uint64_t)chunks_array_len;
    blake3_chunk_state chunk_state;
    chunk_state_init(&chunk_state, key, flags);
    chunk_state.chunk_counter = counter;
    chunk_state_update(&chunk_state, &input[input_position],
                       input_len - input_position);
    output_t output = chunk_state_output(&chunk_state);
    output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]);
    return chunks_array_len + 1;
  } else {
    return chunks_array_len;
  }
}

// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time
// on a single thread. Write out the parent chaining values and return the
// number of parents hashed. (If there's an odd input chaining value left over,
// return it as an additional output.) These parents are never the root and
// never empty; those cases use a different codepath.
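//
// For illustration: given 5 child chaining values, children 0-1 and 2-3 form
// two parent blocks that blake3_hash_many compresses in parallel, child 4 is
// copied through unchanged as an extra output, and the function returns 3.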
INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
                                        size_t num_chaining_values,
                                        const uint32_t key[8], uint8_t flags,
                                        uint8_t *out) {
#if defined(BLAKE3_TESTING)
  assert(2 <= num_chaining_values);
  assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2);
#endif

  const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2];
  size_t parents_array_len = 0;
  while (num_chaining_values - (2 * parents_array_len) >= 2) {
    parents_array[parents_array_len] =
        &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN];
    parents_array_len += 1;
  }

  blake3_hash_many(parents_array, parents_array_len, 1, key,
                   0, // Parents always use counter 0.
                   false, flags | PARENT,
                   0, // Parents have no start flags.
                   0, // Parents have no end flags.
                   out);

  // If there's an odd child left over, it becomes an output.
  if (num_chaining_values > 2 * parents_array_len) {
    memcpy(&out[parents_array_len * BLAKE3_OUT_LEN],
           &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN],
           BLAKE3_OUT_LEN);
    return parents_array_len + 1;
  } else {
    return parents_array_len;
  }
}

// The wide helper function returns (writes out) an array of chaining values
// and returns the length of that array. The number of chaining values returned
// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
// if the input is shorter than that many chunks. The reason for maintaining a
// wide array of chaining values going back up the tree is to allow the
// implementation to hash as many parents in parallel as possible.
//
// As a special case when the SIMD degree is 1, this function will still return
// at least 2 outputs. This guarantees that this function doesn't perform the
// root compression. (If it did, it would use the wrong flags, and also we
// wouldn't be able to implement extendable output.) Note that this function is
// not used when the whole input is only 1 chunk long; that's a different
// codepath.
//
// Why not just have the caller split the input on the first update(), instead
// of implementing this special rule? Because we don't want to limit SIMD or
// multi-threading parallelism for that update().
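//
// For illustration, with a SIMD degree of 4 and a 6-chunk input: the left
// subtree gets 4 chunks and the right gets 2, each is hashed by
// compress_chunks_parallel, and the resulting 6 chunk CVs are condensed by
// one layer of parent compression, so the function writes out 3 chaining
// values.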
static size_t blake3_compress_subtree_wide(const uint8_t *input,
                                           size_t input_len,
                                           const uint32_t key[8],
                                           uint64_t chunk_counter,
                                           uint8_t flags, uint8_t *out) {
  // Note that the single chunk case does *not* bump the SIMD degree up to 2
  // when it is 1. If this implementation adds multi-threading in the future,
  // this gives us the option of multi-threading even the 2-chunk case, which
  // can help performance on smaller platforms.
  if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) {
    return compress_chunks_parallel(input, input_len, key, chunk_counter, flags,
                                    out);
  }

  // With more than simd_degree chunks, we need to recurse. Start by dividing
  // the input into left and right subtrees. (Note that this is only optimal
  // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
  // of 3 or something, we'll need a more complicated strategy.)
  size_t left_input_len = left_len(input_len);
  size_t right_input_len = input_len - left_input_len;
  const uint8_t *right_input = &input[left_input_len];
  uint64_t right_chunk_counter =
      chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN);

  // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to
  // account for the special case of returning 2 outputs when the SIMD degree
  // is 1.
  uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
  size_t degree = blake3_simd_degree();
  if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) {
    // The special case: We always use a degree of at least two, to make
    // sure there are two outputs. Except, as noted above, at the chunk
    // level, where we allow degree=1. (Note that the 1-chunk-input case is
    // a different codepath.)
    degree = 2;
  }
  uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];

  // Recurse! If this implementation adds multi-threading support in the
  // future, this is where it will go.
  size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key,
                                               chunk_counter, flags, cv_array);
  size_t right_n = blake3_compress_subtree_wide(
      right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);

  // The special case again. If simd_degree=1, then we'll have left_n=1 and
  // right_n=1. Rather than compressing them into a single output, return
  // them directly, to make sure we always have at least two outputs.
  if (left_n == 1) {
    memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
    return 2;
  }

  // Otherwise, do one layer of parent node compression.
  size_t num_chaining_values = left_n + right_n;
  return compress_parents_parallel(cv_array, num_chaining_values, key, flags,
                                   out);
}

// Hash a subtree with compress_subtree_wide(), and then condense the resulting
// list of chaining values down to a single parent node. Don't compress that
// last parent node, however. Instead, return its message bytes (the
// concatenated chaining values of its children). This is necessary when the
// first call to update() supplies a complete subtree, because the topmost
// parent node of that subtree could end up being the root. It's also necessary
// for extended output in the general case.
//
// As with compress_subtree_wide(), this function is not used on inputs of 1
// chunk or less. That's a different codepath.
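//
// For illustration, with a SIMD degree of 4 and an 8-chunk input:
// compress_subtree_wide() returns 4 chaining values, the loop below condenses
// them into 2, and those 2 CVs (64 bytes) become the message of the topmost
// parent node, which is left uncompressed for the caller.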
INLINE void compress_subtree_to_parent_node(
    const uint8_t *input, size_t input_len, const uint32_t key[8],
    uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) {
#if defined(BLAKE3_TESTING)
  assert(input_len > BLAKE3_CHUNK_LEN);
#endif

  uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
  size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
                                                chunk_counter, flags, cv_array);
  assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);

  // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
  // compress_subtree_wide() returns more than 2 chaining values. Condense
  // them into 2 by forming parent nodes repeatedly.
  uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
  // The second half of this loop condition is always true, and we just
  // asserted it above. But GCC can't tell that it's always true, and if NDEBUG
  // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
  // warnings here. GCC 8.5 is particularly sensitive, so if you're changing
  // this code, test it against that version.
  while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
    num_cvs =
        compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
    memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
  }
  memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
}

INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8],
                             uint8_t flags) {
  memcpy(self->key, key, BLAKE3_KEY_LEN);
  chunk_state_init(&self->chunk, key, flags);
  self->cv_stack_len = 0;
}

void llvm_blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); }

void llvm_blake3_hasher_init_keyed(blake3_hasher *self,
                                   const uint8_t key[BLAKE3_KEY_LEN]) {
  uint32_t key_words[8];
  load_key_words(key, key_words);
  hasher_init_base(self, key_words, KEYED_HASH);
}

void llvm_blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
                                            size_t context_len) {
  blake3_hasher context_hasher;
  hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT);
  llvm_blake3_hasher_update(&context_hasher, context, context_len);
  uint8_t context_key[BLAKE3_KEY_LEN];
  llvm_blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN);
  uint32_t context_key_words[8];
  load_key_words(context_key, context_key_words);
  hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL);
}

void llvm_blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
  llvm_blake3_hasher_init_derive_key_raw(self, context, strlen(context));
}

// As described in hasher_push_cv() below, we do "lazy merging", delaying
// merges until right before the next CV is about to be added. This is
// different from the reference implementation. Another difference is that we
// aren't always merging 1 chunk at a time. Instead, each CV might represent
// any power-of-two number of chunks, as long as the smaller-above-larger stack
// order is maintained. Instead of the "count the trailing 0-bits" algorithm
// described in the spec, we use a "count the total number of 1-bits" variant
// that doesn't require us to retain the subtree size of the CV on top of the
// stack. The principle is the same: each CV that should remain in the stack is
// represented by a 1-bit in the total number of chunks (or bytes) so far.
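//
// For illustration: when the CVs in the stack cover 6 chunks in total (binary
// 110, two 1-bits), the loop below merges until only 2 entries remain, one CV
// for a 4-chunk subtree and one for a 2-chunk subtree.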
INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) {
  size_t post_merge_stack_len = (size_t)popcnt(total_len);
  while (self->cv_stack_len > post_merge_stack_len) {
    uint8_t *parent_node =
        &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN];
    output_t output = parent_output(parent_node, self->key, self->chunk.flags);
    output_chaining_value(&output, parent_node);
    self->cv_stack_len -= 1;
  }
}

// In reference_impl.rs, we merge the new CV with existing CVs from the stack
// before pushing it. We can do that because we know more input is coming, so
// we know none of the merges are root.
//
// This setting is different. We want to feed as much input as possible to
// compress_subtree_wide(), without setting aside anything for the chunk_state.
// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once
// as a single subtree, if at all possible.
//
// This leads to two problems:
// 1) This 64 KiB input might be the only call that ever gets made to update.
//    In this case, the root node of the 64 KiB subtree would be the root node
//    of the whole tree, and it would need to be ROOT finalized. We can't
//    compress it until we know.
// 2) This 64 KiB input might complete a larger tree, whose root node is
//    similarly going to be the root of the whole tree. For example, maybe
//    we have 192 KiB (that is, 128 + 64) hashed so far. We can't compress the
//    node at the root of the 256 KiB subtree until we know how to finalize it.
//
// The second problem is solved with "lazy merging". That is, when we're about
// to add a CV to the stack, we don't merge it with anything first, as the
// reference impl does. Instead we do merges using the *previous* CV that was
// added, which is sitting on top of the stack, and we put the new CV
// (unmerged) on top of the stack afterwards. This guarantees that we never
// merge the root node until finalize().
//
// Solving the first problem requires an additional tool,
// compress_subtree_to_parent_node(). That function always returns the top
// *two* chaining values of the subtree it's compressing. We then do lazy
// merging with each of them separately, so that the second CV will always
// remain unmerged. (That also helps us support extendable output when we're
// hashing an input all-at-once.)
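//
// For illustration: when the CV of chunk 2 is pushed (chunk_counter = 2,
// popcount = 1), the merge loop first combines the CVs of chunks 0 and 1 into
// a parent CV, and only then is chunk 2's CV placed, unmerged, on top of the
// stack.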
INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
                           uint64_t chunk_counter) {
  hasher_merge_cv_stack(self, chunk_counter);
  memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv,
         BLAKE3_OUT_LEN);
  self->cv_stack_len += 1;
}

void llvm_blake3_hasher_update(blake3_hasher *self, const void *input,
                               size_t input_len) {
  // Explicitly checking for zero avoids causing UB by passing a null pointer
  // to memcpy. This comes up in practice with things like:
  //   std::vector<uint8_t> v;
  //   blake3_hasher_update(&hasher, v.data(), v.size());
  if (input_len == 0) {
    return;
  }

  const uint8_t *input_bytes = (const uint8_t *)input;

  // If we have some partial chunk bytes in the internal chunk_state, we need
  // to finish that chunk first.
  if (chunk_state_len(&self->chunk) > 0) {
    size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk);
    if (take > input_len) {
      take = input_len;
    }
    chunk_state_update(&self->chunk, input_bytes, take);
    input_bytes += take;
    input_len -= take;
    // If we've filled the current chunk and there's more coming, finalize this
    // chunk and proceed. In this case we know it's not the root.
    if (input_len > 0) {
      output_t output = chunk_state_output(&self->chunk);
      uint8_t chunk_cv[32];
      output_chaining_value(&output, chunk_cv);
      hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter);
      chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1);
    } else {
      return;
    }
  }

  // Now the chunk_state is clear, and we have more input. If there's more than
  // a single chunk (so, definitely not the root chunk), hash the largest whole
  // subtree we can, with the full benefits of SIMD (and maybe in the future,
  // multi-threading) parallelism. Two restrictions:
  // - The subtree has to be a power-of-2 number of chunks. Only subtrees along
  //   the right edge can be incomplete, and we don't know where the right edge
  //   is going to be until we get to finalize().
  // - The subtree must evenly divide the total number of chunks up until this
  //   point (if total is not 0). If the current incomplete subtree is only
  //   waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have
  //   to complete the current subtree first.
  // Because we might need to break up the input to form powers of 2, or to
  // evenly divide what we already have, this part runs in a loop.
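  //
  // For illustration: if one chunk (1024 bytes) has been hashed so far and the
  // caller now supplies 4096 bytes, subtree_len starts at 4096 but the
  // shrinking loop below reduces it to 1024, so the next chunk is hashed on
  // its own; larger subtrees only become possible once the running total is
  // again a multiple of a larger power of 2.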
  while (input_len > BLAKE3_CHUNK_LEN) {
    size_t subtree_len = round_down_to_power_of_2(input_len);
    uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
    // Shrink the subtree_len until it evenly divides the count so far. We know
    // that subtree_len itself is a power of 2, so we can use a bitmasking
    // trick instead of an actual remainder operation. (Note that if the caller
    // consistently passes power-of-2 inputs of the same size, as is hopefully
    // typical, this loop condition will always fail, and subtree_len will
    // always be the full length of the input.)
    //
    // An aside: We don't have to shrink subtree_len quite this much. For
    // example, if count_so_far is 1, we could pass 2 chunks to
    // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still
    // get the right answer in the end, and we might get to use 2-way SIMD
    // parallelism. The problem with this optimization is that it gets us
    // stuck always hashing 2 chunks. The total number of chunks will remain
    // odd, and we'll never graduate to higher degrees of parallelism. See
    // https://github.com/BLAKE3-team/BLAKE3/issues/69.
    while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) {
      subtree_len /= 2;
    }
    // The shrunken subtree_len might now be 1 chunk long. If so, hash that one
    // chunk by itself. Otherwise, compress the subtree into a pair of CVs.
    uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
    if (subtree_len <= BLAKE3_CHUNK_LEN) {
      blake3_chunk_state chunk_state;
      chunk_state_init(&chunk_state, self->key, self->chunk.flags);
      chunk_state.chunk_counter = self->chunk.chunk_counter;
      chunk_state_update(&chunk_state, input_bytes, subtree_len);
      output_t output = chunk_state_output(&chunk_state);
      uint8_t cv[BLAKE3_OUT_LEN];
      output_chaining_value(&output, cv);
      hasher_push_cv(self, cv, chunk_state.chunk_counter);
    } else {
      // This is the high-performance happy path, though getting here depends
      // on the caller giving us a long enough input.
      uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
      compress_subtree_to_parent_node(input_bytes, subtree_len, self->key,
                                      self->chunk.chunk_counter,
                                      self->chunk.flags, cv_pair);
      hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
      hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN],
                     self->chunk.chunk_counter + (subtree_chunks / 2));
    }
    self->chunk.chunk_counter += subtree_chunks;
    input_bytes += subtree_len;
    input_len -= subtree_len;
  }

  // If there's any remaining input less than a full chunk, add it to the chunk
  // state. In that case, also do a final merge loop to make sure the subtree
  // stack doesn't contain any unmerged pairs. The remaining input means we
  // know these merges are non-root. This merge loop isn't strictly necessary
  // here, because hasher_push_cv already does its own merge loop, but it
  // simplifies blake3_hasher_finalize below.
  if (input_len > 0) {
    chunk_state_update(&self->chunk, input_bytes, input_len);
    hasher_merge_cv_stack(self, self->chunk.chunk_counter);
  }
}

void llvm_blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
                                 size_t out_len) {
  llvm_blake3_hasher_finalize_seek(self, 0, out, out_len);
#if LLVM_MEMORY_SANITIZER_BUILD
  // Avoid false positives due to uninstrumented assembly code.
  __msan_unpoison(out, out_len);
#endif
}

void llvm_blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
                                      uint8_t *out, size_t out_len) {
  // Explicitly checking for zero avoids causing UB by passing a null pointer
  // to memcpy. This comes up in practice with things like:
  //   std::vector<uint8_t> v;
  //   blake3_hasher_finalize(&hasher, v.data(), v.size());
  if (out_len == 0) {
    return;
  }

  // If the subtree stack is empty, then the current chunk is the root.
  if (self->cv_stack_len == 0) {
    output_t output = chunk_state_output(&self->chunk);
    output_root_bytes(&output, seek, out, out_len);
    return;
  }
  // If there are any bytes in the chunk state, finalize that chunk and do a
  // roll-up merge between that chunk hash and every subtree in the stack. In
  // this case, the extra merge loop at the end of blake3_hasher_update
  // guarantees that none of the subtrees in the stack need to be merged with
  // each other first. Otherwise, if there are no bytes in the chunk state,
  // then the top of the stack is a chunk hash, and we start the merge from
  // that.
  output_t output;
  size_t cvs_remaining;
  if (chunk_state_len(&self->chunk) > 0) {
    cvs_remaining = self->cv_stack_len;
    output = chunk_state_output(&self->chunk);
  } else {
    // There are always at least 2 CVs in the stack in this case.
    cvs_remaining = self->cv_stack_len - 2;
    output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key,
                           self->chunk.flags);
  }
  while (cvs_remaining > 0) {
    cvs_remaining -= 1;
    uint8_t parent_block[BLAKE3_BLOCK_LEN];
    memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32);
    output_chaining_value(&output, &parent_block[32]);
    output = parent_output(parent_block, self->key, self->chunk.flags);
  }
  output_root_bytes(&output, seek, out, out_len);
}

void llvm_blake3_hasher_reset(blake3_hasher *self) {
  chunk_state_reset(&self->chunk, self->key, 0);
  self->cv_stack_len = 0;
}