1#include "blake3_impl.h"
2#include <string.h>
3
// Rotate the 32-bit word `w` right by `c` bit positions.
//
// Both shift counts are reduced modulo 32, so c == 0 (or any multiple of 32)
// returns `w` unchanged instead of shifting by the full width of the type,
// which is undefined behavior in C. For 0 < c < 32 the result is identical
// to the plain (w >> c) | (w << (32 - c)) formulation.
INLINE uint32_t rotr32(uint32_t w, uint32_t c) {
  const uint32_t right = c & 31u;
  const uint32_t left = (32u - c) & 31u;
  return (w >> right) | (w << left);
}
7
8INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d,
9 uint32_t x, uint32_t y) {
10 state[a] = state[a] + state[b] + x;
11 state[d] = rotr32(w: state[d] ^ state[a], c: 16);
12 state[c] = state[c] + state[d];
13 state[b] = rotr32(w: state[b] ^ state[c], c: 12);
14 state[a] = state[a] + state[b] + y;
15 state[d] = rotr32(w: state[d] ^ state[a], c: 8);
16 state[c] = state[c] + state[d];
17 state[b] = rotr32(w: state[b] ^ state[c], c: 7);
18}
19
20INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) {
21 // Select the message schedule based on the round.
22 const uint8_t *schedule = MSG_SCHEDULE[round];
23
24 // Mix the columns.
25 g(state, a: 0, b: 4, c: 8, d: 12, x: msg[schedule[0]], y: msg[schedule[1]]);
26 g(state, a: 1, b: 5, c: 9, d: 13, x: msg[schedule[2]], y: msg[schedule[3]]);
27 g(state, a: 2, b: 6, c: 10, d: 14, x: msg[schedule[4]], y: msg[schedule[5]]);
28 g(state, a: 3, b: 7, c: 11, d: 15, x: msg[schedule[6]], y: msg[schedule[7]]);
29
30 // Mix the rows.
31 g(state, a: 0, b: 5, c: 10, d: 15, x: msg[schedule[8]], y: msg[schedule[9]]);
32 g(state, a: 1, b: 6, c: 11, d: 12, x: msg[schedule[10]], y: msg[schedule[11]]);
33 g(state, a: 2, b: 7, c: 8, d: 13, x: msg[schedule[12]], y: msg[schedule[13]]);
34 g(state, a: 3, b: 4, c: 9, d: 14, x: msg[schedule[14]], y: msg[schedule[15]]);
35}
36
37INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8],
38 const uint8_t block[BLAKE3_BLOCK_LEN],
39 uint8_t block_len, uint64_t counter, uint8_t flags) {
40 uint32_t block_words[16];
41 block_words[0] = load32(src: block + 4 * 0);
42 block_words[1] = load32(src: block + 4 * 1);
43 block_words[2] = load32(src: block + 4 * 2);
44 block_words[3] = load32(src: block + 4 * 3);
45 block_words[4] = load32(src: block + 4 * 4);
46 block_words[5] = load32(src: block + 4 * 5);
47 block_words[6] = load32(src: block + 4 * 6);
48 block_words[7] = load32(src: block + 4 * 7);
49 block_words[8] = load32(src: block + 4 * 8);
50 block_words[9] = load32(src: block + 4 * 9);
51 block_words[10] = load32(src: block + 4 * 10);
52 block_words[11] = load32(src: block + 4 * 11);
53 block_words[12] = load32(src: block + 4 * 12);
54 block_words[13] = load32(src: block + 4 * 13);
55 block_words[14] = load32(src: block + 4 * 14);
56 block_words[15] = load32(src: block + 4 * 15);
57
58 state[0] = cv[0];
59 state[1] = cv[1];
60 state[2] = cv[2];
61 state[3] = cv[3];
62 state[4] = cv[4];
63 state[5] = cv[5];
64 state[6] = cv[6];
65 state[7] = cv[7];
66 state[8] = IV[0];
67 state[9] = IV[1];
68 state[10] = IV[2];
69 state[11] = IV[3];
70 state[12] = counter_low(counter);
71 state[13] = counter_high(counter);
72 state[14] = (uint32_t)block_len;
73 state[15] = (uint32_t)flags;
74
75 round_fn(state, msg: &block_words[0], round: 0);
76 round_fn(state, msg: &block_words[0], round: 1);
77 round_fn(state, msg: &block_words[0], round: 2);
78 round_fn(state, msg: &block_words[0], round: 3);
79 round_fn(state, msg: &block_words[0], round: 4);
80 round_fn(state, msg: &block_words[0], round: 5);
81 round_fn(state, msg: &block_words[0], round: 6);
82}
83
84void blake3_compress_in_place_portable(uint32_t cv[8],
85 const uint8_t block[BLAKE3_BLOCK_LEN],
86 uint8_t block_len, uint64_t counter,
87 uint8_t flags) {
88 uint32_t state[16];
89 compress_pre(state, cv, block, block_len, counter, flags);
90 cv[0] = state[0] ^ state[8];
91 cv[1] = state[1] ^ state[9];
92 cv[2] = state[2] ^ state[10];
93 cv[3] = state[3] ^ state[11];
94 cv[4] = state[4] ^ state[12];
95 cv[5] = state[5] ^ state[13];
96 cv[6] = state[6] ^ state[14];
97 cv[7] = state[7] ^ state[15];
98}
99
100void blake3_compress_xof_portable(const uint32_t cv[8],
101 const uint8_t block[BLAKE3_BLOCK_LEN],
102 uint8_t block_len, uint64_t counter,
103 uint8_t flags, uint8_t out[64]) {
104 uint32_t state[16];
105 compress_pre(state, cv, block, block_len, counter, flags);
106
107 store32(dst: &out[0 * 4], w: state[0] ^ state[8]);
108 store32(dst: &out[1 * 4], w: state[1] ^ state[9]);
109 store32(dst: &out[2 * 4], w: state[2] ^ state[10]);
110 store32(dst: &out[3 * 4], w: state[3] ^ state[11]);
111 store32(dst: &out[4 * 4], w: state[4] ^ state[12]);
112 store32(dst: &out[5 * 4], w: state[5] ^ state[13]);
113 store32(dst: &out[6 * 4], w: state[6] ^ state[14]);
114 store32(dst: &out[7 * 4], w: state[7] ^ state[15]);
115 store32(dst: &out[8 * 4], w: state[8] ^ cv[0]);
116 store32(dst: &out[9 * 4], w: state[9] ^ cv[1]);
117 store32(dst: &out[10 * 4], w: state[10] ^ cv[2]);
118 store32(dst: &out[11 * 4], w: state[11] ^ cv[3]);
119 store32(dst: &out[12 * 4], w: state[12] ^ cv[4]);
120 store32(dst: &out[13 * 4], w: state[13] ^ cv[5]);
121 store32(dst: &out[14 * 4], w: state[14] ^ cv[6]);
122 store32(dst: &out[15 * 4], w: state[15] ^ cv[7]);
123}
124
125INLINE void hash_one_portable(const uint8_t *input, size_t blocks,
126 const uint32_t key[8], uint64_t counter,
127 uint8_t flags, uint8_t flags_start,
128 uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
129 uint32_t cv[8];
130 memcpy(dest: cv, src: key, BLAKE3_KEY_LEN);
131 uint8_t block_flags = flags | flags_start;
132 while (blocks > 0) {
133 if (blocks == 1) {
134 block_flags |= flags_end;
135 }
136 blake3_compress_in_place_portable(cv, block: input, BLAKE3_BLOCK_LEN, counter,
137 flags: block_flags);
138 input = &input[BLAKE3_BLOCK_LEN];
139 blocks -= 1;
140 block_flags = flags;
141 }
142 store_cv_words(bytes_out: out, cv_words: cv);
143}
144
145void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
146 size_t blocks, const uint32_t key[8],
147 uint64_t counter, bool increment_counter,
148 uint8_t flags, uint8_t flags_start,
149 uint8_t flags_end, uint8_t *out) {
150 while (num_inputs > 0) {
151 hash_one_portable(input: inputs[0], blocks, key, counter, flags, flags_start,
152 flags_end, out);
153 if (increment_counter) {
154 counter += 1;
155 }
156 inputs += 1;
157 num_inputs -= 1;
158 out = &out[BLAKE3_OUT_LEN];
159 }
160}
161