1#include <stdbool.h>
2#include <stddef.h>
3#include <stdint.h>
4
5#include "blake3_impl.h"
6
7#if defined(IS_X86)
8#if defined(_MSC_VER)
9#include <intrin.h>
10#elif defined(__GNUC__)
11#include <immintrin.h>
12#else
13#error "Unimplemented!"
14#endif
15#endif
16
/* Evaluates `x` and discards the result, so that a variable which is only
 * referenced under some preprocessor configurations does not trigger an
 * "unused" warning in the others. */
#define MAYBE_UNUSED(x) ((void)(x))
18
19#if defined(IS_X86)
/* Executes XGETBV with register index 0 and returns the 64-bit result. */
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  /* The index is passed in ECX; the value comes back split across
   * EDX (upper 32 bits) and EAX (lower 32 bits). */
  uint32_t low_half = 0;
  uint32_t high_half = 0;
  __asm__ __volatile__("xgetbv\n" : "=a"(low_half), "=d"(high_half) : "c"(0));
  const uint64_t upper = (uint64_t)high_half << 32;
  return upper | low_half;
#endif
}
29
/* Executes CPUID for leaf `id` and stores the resulting EAX, EBX, ECX and EDX
 * values into out[0], out[1], out[2] and out[3] respectively. */
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#else
  uint32_t reg_a, reg_b, reg_c, reg_d;
#if defined(__i386__) || defined(_M_IX86)
  /* On 32-bit x86 the asm does not name EBX as an output: it saves EBX into
   * a scratch register, runs CPUID, then swaps so the scratch register holds
   * the CPUID result and EBX holds its previous value again. */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(reg_a), "=r"(reg_b), "=c"(reg_c), "=d"(reg_d)
                       : "a"(id));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(reg_a), "=b"(reg_b), "=c"(reg_c), "=d"(reg_d)
                       : "a"(id));
#endif
  out[0] = reg_a;
  out[1] = reg_b;
  out[2] = reg_c;
  out[3] = reg_d;
#endif
}
45
/* Executes CPUID for leaf `id` with sub-leaf `sid` (passed in ECX) and stores
 * the resulting EAX, EBX, ECX and EDX values into out[0..3] in that order. */
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#else
  uint32_t reg_a, reg_b, reg_c, reg_d;
#if defined(__i386__) || defined(_M_IX86)
  /* On 32-bit x86 the asm does not name EBX as an output: it saves EBX into
   * a scratch register, runs CPUID, then swaps so the scratch register holds
   * the CPUID result and EBX holds its previous value again. */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(reg_a), "=r"(reg_b), "=c"(reg_c), "=d"(reg_d)
                       : "a"(id), "c"(sid));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(reg_a), "=b"(reg_b), "=c"(reg_c), "=d"(reg_d)
                       : "a"(id), "c"(sid));
#endif
  out[0] = reg_a;
  out[1] = reg_b;
  out[2] = reg_c;
  out[3] = reg_d;
#endif
}
61
62#endif
63
/* Bit flags for the x86 SIMD extensions this file can detect. Each flag is a
 * distinct power of two so that several can be OR-ed into one value.
 * UNDEFINED is the initial value of the feature cache and marks it as not
 * yet populated. */
enum cpu_feature {
  SSE2 = 0x01,
  SSSE3 = 0x02,
  SSE41 = 0x04,
  AVX = 0x08,
  AVX2 = 0x10,
  AVX512F = 0x20,
  AVX512VL = 0x40,
  /* ... */
  UNDEFINED = 0x40000000
};
75
76#if !defined(BLAKE3_TESTING)
77static /* Allow the variable to be controlled manually for testing */
78#endif
79 enum cpu_feature g_cpu_features = UNDEFINED;
80
81LLVM_ATTRIBUTE_USED
82#if !defined(BLAKE3_TESTING)
83static
84#endif
85 enum cpu_feature
86 get_cpu_features(void) {
87
88 if (g_cpu_features != UNDEFINED) {
89 return g_cpu_features;
90 } else {
91#if defined(IS_X86)
92 uint32_t regs[4] = {0};
93 uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
94 (void)edx;
95 enum cpu_feature features = 0;
96 cpuid(out: regs, id: 0);
97 const int max_id = *eax;
98 cpuid(out: regs, id: 1);
99#if defined(__amd64__) || defined(_M_X64)
100 features |= SSE2;
101#else
102 if (*edx & (1UL << 26))
103 features |= SSE2;
104#endif
105 if (*ecx & (1UL << 0))
106 features |= SSSE3;
107 if (*ecx & (1UL << 19))
108 features |= SSE41;
109
110 if (*ecx & (1UL << 27)) { // OSXSAVE
111 const uint64_t mask = xgetbv();
112 if ((mask & 6) == 6) { // SSE and AVX states
113 if (*ecx & (1UL << 28))
114 features |= AVX;
115 if (max_id >= 7) {
116 cpuidex(out: regs, id: 7, sid: 0);
117 if (*ebx & (1UL << 5))
118 features |= AVX2;
119 if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
120 if (*ebx & (1UL << 31))
121 features |= AVX512VL;
122 if (*ebx & (1UL << 16))
123 features |= AVX512F;
124 }
125 }
126 }
127 }
128 g_cpu_features = features;
129 return features;
130#else
131 /* How to detect NEON? */
132 return 0;
133#endif
134 }
135}
136
137void blake3_compress_in_place(uint32_t cv[8],
138 const uint8_t block[BLAKE3_BLOCK_LEN],
139 uint8_t block_len, uint64_t counter,
140 uint8_t flags) {
141#if defined(IS_X86)
142 const enum cpu_feature features = get_cpu_features();
143 MAYBE_UNUSED(features);
144#if !defined(BLAKE3_NO_AVX512)
145 if (features & AVX512VL) {
146 blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
147 return;
148 }
149#endif
150#if !defined(BLAKE3_NO_SSE41)
151 if (features & SSE41) {
152 blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
153 return;
154 }
155#endif
156#if !defined(BLAKE3_NO_SSE2)
157 if (features & SSE2) {
158 blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
159 return;
160 }
161#endif
162#endif
163 blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
164}
165
166void blake3_compress_xof(const uint32_t cv[8],
167 const uint8_t block[BLAKE3_BLOCK_LEN],
168 uint8_t block_len, uint64_t counter, uint8_t flags,
169 uint8_t out[64]) {
170#if defined(IS_X86)
171 const enum cpu_feature features = get_cpu_features();
172 MAYBE_UNUSED(features);
173#if !defined(BLAKE3_NO_AVX512)
174 if (features & AVX512VL) {
175 blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
176 return;
177 }
178#endif
179#if !defined(BLAKE3_NO_SSE41)
180 if (features & SSE41) {
181 blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
182 return;
183 }
184#endif
185#if !defined(BLAKE3_NO_SSE2)
186 if (features & SSE2) {
187 blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
188 return;
189 }
190#endif
191#endif
192 blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
193}
194
/*
 * Hashes several inputs by forwarding all arguments unchanged to one backend.
 *
 * On x86 builds the first backend that is both compiled in (its
 * BLAKE3_NO_* macro is not defined) and reported by get_cpu_features() is
 * used, tried in the order AVX-512 (needs both AVX512F and AVX512VL), AVX2,
 * SSE4.1, SSE2. If none is taken, the NEON backend is called when
 * BLAKE3_USE_NEON == 1, and the portable implementation otherwise.
 */
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature caps = get_cpu_features();
  MAYBE_UNUSED(caps);

#ifndef BLAKE3_NO_AVX512
  if ((caps & AVX512F) && (caps & AVX512VL)) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif

#ifndef BLAKE3_NO_AVX2
  if ((caps & AVX2) != 0) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif

#ifndef BLAKE3_NO_SSE41
  if ((caps & SSE41) != 0) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif

#ifndef BLAKE3_NO_SSE2
  if ((caps & SSE2) != 0) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#endif /* IS_X86 */

#if BLAKE3_USE_NEON == 1
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
246
// The dynamically detected SIMD degree of the current platform.
//
// On x86 builds the value follows the first backend that is both compiled in
// and reported by get_cpu_features(): 16 for AVX-512 (AVX512F and AVX512VL
// together), 8 for AVX2, 4 for SSE4.1 or SSE2. Otherwise the result is 4 when
// BLAKE3_USE_NEON == 1 and 1 in all remaining cases.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature caps = get_cpu_features();
  MAYBE_UNUSED(caps);

#ifndef BLAKE3_NO_AVX512
  if ((caps & AVX512F) && (caps & AVX512VL)) {
    return 16;
  }
#endif

#ifndef BLAKE3_NO_AVX2
  if ((caps & AVX2) != 0) {
    return 8;
  }
#endif

#ifndef BLAKE3_NO_SSE41
  if ((caps & SSE41) != 0) {
    return 4;
  }
#endif

#ifndef BLAKE3_NO_SSE2
  if ((caps & SSE2) != 0) {
    return 4;
  }
#endif
#endif /* IS_X86 */

#if BLAKE3_USE_NEON == 1
  return 4;
#else
  return 1;
#endif
}
278